diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,66533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9994739610731194, + "eval_steps": 500, + "global_step": 9500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010520778537611783, + "grad_norm": 1.57374906539917, + "learning_rate": 6.993006993006994e-07, + "loss": 2.7842, + "step": 1 + }, + { + "epoch": 0.00021041557075223566, + "grad_norm": 1.4931204319000244, + "learning_rate": 1.3986013986013987e-06, + "loss": 2.1824, + "step": 2 + }, + { + "epoch": 0.0003156233561283535, + "grad_norm": 1.372463583946228, + "learning_rate": 2.0979020979020983e-06, + "loss": 2.2936, + "step": 3 + }, + { + "epoch": 0.0004208311415044713, + "grad_norm": 1.475909948348999, + "learning_rate": 2.7972027972027974e-06, + "loss": 2.4173, + "step": 4 + }, + { + "epoch": 0.0005260389268805891, + "grad_norm": 2.1777091026306152, + "learning_rate": 3.496503496503497e-06, + "loss": 2.1757, + "step": 5 + }, + { + "epoch": 0.000631246712256707, + "grad_norm": 1.7631592750549316, + "learning_rate": 4.195804195804197e-06, + "loss": 2.4156, + "step": 6 + }, + { + "epoch": 0.0007364544976328248, + "grad_norm": 1.8002638816833496, + "learning_rate": 4.895104895104895e-06, + "loss": 2.3946, + "step": 7 + }, + { + "epoch": 0.0008416622830089426, + "grad_norm": 1.8707481622695923, + "learning_rate": 5.594405594405595e-06, + "loss": 2.1748, + "step": 8 + }, + { + "epoch": 0.0009468700683850605, + "grad_norm": 2.459454298019409, + "learning_rate": 6.2937062937062944e-06, + "loss": 2.4425, + "step": 9 + }, + { + "epoch": 0.0010520778537611783, + "grad_norm": 2.8795626163482666, + "learning_rate": 6.993006993006994e-06, + "loss": 2.0042, + "step": 10 + }, + { + "epoch": 0.0011572856391372961, + "grad_norm": 1.7612422704696655, + "learning_rate": 7.692307692307694e-06, + "loss": 2.2756, + "step": 11 + }, + { + "epoch": 0.001262493424513414, + "grad_norm": 1.7078272104263306, + "learning_rate": 8.391608391608393e-06, + "loss": 2.327, + "step": 12 + }, + { + "epoch": 0.0013677012098895318, + "grad_norm": 1.3450385332107544, + "learning_rate": 9.090909090909091e-06, + "loss": 2.1059, + "step": 13 + }, + { + "epoch": 0.0014729089952656496, + "grad_norm": 1.0579757690429688, + "learning_rate": 9.79020979020979e-06, + "loss": 2.3028, + "step": 14 + }, + { + "epoch": 0.0015781167806417674, + "grad_norm": 1.1774177551269531, + "learning_rate": 1.048951048951049e-05, + "loss": 2.1585, + "step": 15 + }, + { + "epoch": 0.0016833245660178853, + "grad_norm": 1.7530457973480225, + "learning_rate": 1.118881118881119e-05, + "loss": 2.4828, + "step": 16 + }, + { + "epoch": 0.001788532351394003, + "grad_norm": 1.65647554397583, + "learning_rate": 1.188811188811189e-05, + "loss": 1.976, + "step": 17 + }, + { + "epoch": 0.001893740136770121, + "grad_norm": 1.5993101596832275, + "learning_rate": 1.2587412587412589e-05, + "loss": 1.7929, + "step": 18 + }, + { + "epoch": 0.0019989479221462388, + "grad_norm": 1.1430275440216064, + "learning_rate": 1.3286713286713287e-05, + "loss": 2.0432, + "step": 19 + }, + { + "epoch": 0.0021041557075223566, + "grad_norm": 1.8768856525421143, + "learning_rate": 1.3986013986013988e-05, + "loss": 1.8982, + "step": 20 + }, + { + "epoch": 0.0022093634928984744, + "grad_norm": 1.7143903970718384, + "learning_rate": 1.4685314685314686e-05, + "loss": 2.0638, + "step": 21 + }, + { + "epoch": 0.0023145712782745922, + "grad_norm": 1.5235050916671753, + "learning_rate": 1.5384615384615387e-05, + "loss": 2.2032, + "step": 22 + }, + { + "epoch": 0.00241977906365071, + "grad_norm": 2.3741185665130615, + "learning_rate": 1.6083916083916083e-05, + "loss": 2.146, + "step": 23 + }, + { + "epoch": 0.002524986849026828, + "grad_norm": 1.3590655326843262, + "learning_rate": 1.6783216783216786e-05, + "loss": 2.2378, + "step": 24 + }, + { + "epoch": 0.0026301946344029457, + "grad_norm": 1.331407904624939, + "learning_rate": 1.7482517482517483e-05, + "loss": 1.82, + "step": 25 + }, + { + "epoch": 0.0027354024197790636, + "grad_norm": 1.4533194303512573, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.9261, + "step": 26 + }, + { + "epoch": 0.0028406102051551814, + "grad_norm": 1.5130401849746704, + "learning_rate": 1.888111888111888e-05, + "loss": 2.1417, + "step": 27 + }, + { + "epoch": 0.0029458179905312992, + "grad_norm": 1.1877672672271729, + "learning_rate": 1.958041958041958e-05, + "loss": 2.1062, + "step": 28 + }, + { + "epoch": 0.003051025775907417, + "grad_norm": 0.9648356437683105, + "learning_rate": 2.027972027972028e-05, + "loss": 2.0972, + "step": 29 + }, + { + "epoch": 0.003156233561283535, + "grad_norm": 1.1514739990234375, + "learning_rate": 2.097902097902098e-05, + "loss": 2.4502, + "step": 30 + }, + { + "epoch": 0.0032614413466596527, + "grad_norm": 0.9614094495773315, + "learning_rate": 2.1678321678321677e-05, + "loss": 1.953, + "step": 31 + }, + { + "epoch": 0.0033666491320357705, + "grad_norm": 0.970487117767334, + "learning_rate": 2.237762237762238e-05, + "loss": 1.9179, + "step": 32 + }, + { + "epoch": 0.0034718569174118884, + "grad_norm": 1.6860216856002808, + "learning_rate": 2.307692307692308e-05, + "loss": 2.0499, + "step": 33 + }, + { + "epoch": 0.003577064702788006, + "grad_norm": 1.1684486865997314, + "learning_rate": 2.377622377622378e-05, + "loss": 2.1496, + "step": 34 + }, + { + "epoch": 0.003682272488164124, + "grad_norm": 1.1669265031814575, + "learning_rate": 2.4475524475524478e-05, + "loss": 2.3943, + "step": 35 + }, + { + "epoch": 0.003787480273540242, + "grad_norm": 2.067823886871338, + "learning_rate": 2.5174825174825178e-05, + "loss": 2.2909, + "step": 36 + }, + { + "epoch": 0.0038926880589163597, + "grad_norm": 0.8208512663841248, + "learning_rate": 2.5874125874125877e-05, + "loss": 1.9907, + "step": 37 + }, + { + "epoch": 0.0039978958442924775, + "grad_norm": 1.7650972604751587, + "learning_rate": 2.6573426573426574e-05, + "loss": 2.1261, + "step": 38 + }, + { + "epoch": 0.004103103629668595, + "grad_norm": 1.0491235256195068, + "learning_rate": 2.7272727272727273e-05, + "loss": 2.1774, + "step": 39 + }, + { + "epoch": 0.004208311415044713, + "grad_norm": 1.6899977922439575, + "learning_rate": 2.7972027972027976e-05, + "loss": 2.5479, + "step": 40 + }, + { + "epoch": 0.004313519200420831, + "grad_norm": 0.9535022974014282, + "learning_rate": 2.8671328671328672e-05, + "loss": 1.9773, + "step": 41 + }, + { + "epoch": 0.004418726985796949, + "grad_norm": 1.9684417247772217, + "learning_rate": 2.9370629370629372e-05, + "loss": 1.7069, + "step": 42 + }, + { + "epoch": 0.004523934771173067, + "grad_norm": 1.4697853326797485, + "learning_rate": 3.0069930069930068e-05, + "loss": 2.225, + "step": 43 + }, + { + "epoch": 0.0046291425565491845, + "grad_norm": 1.3747695684432983, + "learning_rate": 3.0769230769230774e-05, + "loss": 1.8592, + "step": 44 + }, + { + "epoch": 0.004734350341925302, + "grad_norm": 1.0535547733306885, + "learning_rate": 3.146853146853147e-05, + "loss": 1.8008, + "step": 45 + }, + { + "epoch": 0.00483955812730142, + "grad_norm": 0.7596752643585205, + "learning_rate": 3.216783216783217e-05, + "loss": 1.9742, + "step": 46 + }, + { + "epoch": 0.004944765912677538, + "grad_norm": 1.3590867519378662, + "learning_rate": 3.2867132867132866e-05, + "loss": 1.8842, + "step": 47 + }, + { + "epoch": 0.005049973698053656, + "grad_norm": 1.1350642442703247, + "learning_rate": 3.356643356643357e-05, + "loss": 1.6666, + "step": 48 + }, + { + "epoch": 0.005155181483429774, + "grad_norm": 1.2734442949295044, + "learning_rate": 3.4265734265734265e-05, + "loss": 2.0847, + "step": 49 + }, + { + "epoch": 0.0052603892688058915, + "grad_norm": 2.199073314666748, + "learning_rate": 3.4965034965034965e-05, + "loss": 2.0992, + "step": 50 + }, + { + "epoch": 0.005365597054182009, + "grad_norm": 1.4339218139648438, + "learning_rate": 3.566433566433567e-05, + "loss": 2.2127, + "step": 51 + }, + { + "epoch": 0.005470804839558127, + "grad_norm": 1.0405915975570679, + "learning_rate": 3.6363636363636364e-05, + "loss": 2.1656, + "step": 52 + }, + { + "epoch": 0.005576012624934245, + "grad_norm": 1.6510354280471802, + "learning_rate": 3.7062937062937064e-05, + "loss": 1.8267, + "step": 53 + }, + { + "epoch": 0.005681220410310363, + "grad_norm": 1.8749672174453735, + "learning_rate": 3.776223776223776e-05, + "loss": 2.3715, + "step": 54 + }, + { + "epoch": 0.005786428195686481, + "grad_norm": 1.5232490301132202, + "learning_rate": 3.846153846153846e-05, + "loss": 1.9894, + "step": 55 + }, + { + "epoch": 0.0058916359810625984, + "grad_norm": 1.09939706325531, + "learning_rate": 3.916083916083916e-05, + "loss": 2.1504, + "step": 56 + }, + { + "epoch": 0.005996843766438716, + "grad_norm": 0.9933468699455261, + "learning_rate": 3.986013986013986e-05, + "loss": 2.1873, + "step": 57 + }, + { + "epoch": 0.006102051551814834, + "grad_norm": 1.2622774839401245, + "learning_rate": 4.055944055944056e-05, + "loss": 1.9191, + "step": 58 + }, + { + "epoch": 0.006207259337190952, + "grad_norm": 1.2729672193527222, + "learning_rate": 4.125874125874126e-05, + "loss": 2.1369, + "step": 59 + }, + { + "epoch": 0.00631246712256707, + "grad_norm": 1.32735276222229, + "learning_rate": 4.195804195804196e-05, + "loss": 1.8028, + "step": 60 + }, + { + "epoch": 0.006417674907943188, + "grad_norm": 1.1334835290908813, + "learning_rate": 4.265734265734266e-05, + "loss": 2.1593, + "step": 61 + }, + { + "epoch": 0.006522882693319305, + "grad_norm": 1.2618657350540161, + "learning_rate": 4.335664335664335e-05, + "loss": 1.9589, + "step": 62 + }, + { + "epoch": 0.006628090478695423, + "grad_norm": 1.6038740873336792, + "learning_rate": 4.405594405594406e-05, + "loss": 2.2707, + "step": 63 + }, + { + "epoch": 0.006733298264071541, + "grad_norm": 1.2185710668563843, + "learning_rate": 4.475524475524476e-05, + "loss": 2.0879, + "step": 64 + }, + { + "epoch": 0.006838506049447659, + "grad_norm": 1.9370228052139282, + "learning_rate": 4.545454545454546e-05, + "loss": 2.0566, + "step": 65 + }, + { + "epoch": 0.006943713834823777, + "grad_norm": 1.3582186698913574, + "learning_rate": 4.615384615384616e-05, + "loss": 2.1007, + "step": 66 + }, + { + "epoch": 0.007048921620199895, + "grad_norm": 0.7574198246002197, + "learning_rate": 4.685314685314686e-05, + "loss": 2.0049, + "step": 67 + }, + { + "epoch": 0.007154129405576012, + "grad_norm": 1.2563366889953613, + "learning_rate": 4.755244755244756e-05, + "loss": 2.0965, + "step": 68 + }, + { + "epoch": 0.00725933719095213, + "grad_norm": 1.1161051988601685, + "learning_rate": 4.825174825174825e-05, + "loss": 1.5938, + "step": 69 + }, + { + "epoch": 0.007364544976328248, + "grad_norm": 2.441533088684082, + "learning_rate": 4.8951048951048956e-05, + "loss": 2.4897, + "step": 70 + }, + { + "epoch": 0.007469752761704366, + "grad_norm": 1.1693131923675537, + "learning_rate": 4.9650349650349656e-05, + "loss": 1.9202, + "step": 71 + }, + { + "epoch": 0.007574960547080484, + "grad_norm": 1.2114942073822021, + "learning_rate": 5.0349650349650356e-05, + "loss": 1.996, + "step": 72 + }, + { + "epoch": 0.0076801683324566015, + "grad_norm": 0.6417075991630554, + "learning_rate": 5.1048951048951055e-05, + "loss": 2.1455, + "step": 73 + }, + { + "epoch": 0.007785376117832719, + "grad_norm": 1.2000133991241455, + "learning_rate": 5.1748251748251755e-05, + "loss": 2.2027, + "step": 74 + }, + { + "epoch": 0.007890583903208837, + "grad_norm": 1.086300015449524, + "learning_rate": 5.244755244755245e-05, + "loss": 1.9952, + "step": 75 + }, + { + "epoch": 0.007995791688584955, + "grad_norm": 1.217038869857788, + "learning_rate": 5.314685314685315e-05, + "loss": 1.8331, + "step": 76 + }, + { + "epoch": 0.008100999473961073, + "grad_norm": 2.1391165256500244, + "learning_rate": 5.384615384615385e-05, + "loss": 1.7158, + "step": 77 + }, + { + "epoch": 0.00820620725933719, + "grad_norm": 1.2688722610473633, + "learning_rate": 5.4545454545454546e-05, + "loss": 1.8603, + "step": 78 + }, + { + "epoch": 0.008311415044713309, + "grad_norm": 1.1678074598312378, + "learning_rate": 5.524475524475524e-05, + "loss": 1.8435, + "step": 79 + }, + { + "epoch": 0.008416622830089426, + "grad_norm": 0.6559339165687561, + "learning_rate": 5.594405594405595e-05, + "loss": 2.1147, + "step": 80 + }, + { + "epoch": 0.008521830615465544, + "grad_norm": 1.0474909543991089, + "learning_rate": 5.664335664335665e-05, + "loss": 2.3672, + "step": 81 + }, + { + "epoch": 0.008627038400841662, + "grad_norm": 0.8283208012580872, + "learning_rate": 5.7342657342657345e-05, + "loss": 2.0395, + "step": 82 + }, + { + "epoch": 0.00873224618621778, + "grad_norm": 0.7440145015716553, + "learning_rate": 5.8041958041958044e-05, + "loss": 2.5889, + "step": 83 + }, + { + "epoch": 0.008837453971593898, + "grad_norm": 1.0744500160217285, + "learning_rate": 5.8741258741258744e-05, + "loss": 1.9353, + "step": 84 + }, + { + "epoch": 0.008942661756970016, + "grad_norm": 1.0884795188903809, + "learning_rate": 5.944055944055944e-05, + "loss": 1.8183, + "step": 85 + }, + { + "epoch": 0.009047869542346133, + "grad_norm": 1.3748575448989868, + "learning_rate": 6.0139860139860136e-05, + "loss": 2.0691, + "step": 86 + }, + { + "epoch": 0.009153077327722251, + "grad_norm": 0.7459085583686829, + "learning_rate": 6.083916083916085e-05, + "loss": 2.2508, + "step": 87 + }, + { + "epoch": 0.009258285113098369, + "grad_norm": 1.4073225259780884, + "learning_rate": 6.153846153846155e-05, + "loss": 1.9281, + "step": 88 + }, + { + "epoch": 0.009363492898474487, + "grad_norm": 0.988071620464325, + "learning_rate": 6.223776223776224e-05, + "loss": 1.7869, + "step": 89 + }, + { + "epoch": 0.009468700683850605, + "grad_norm": 1.1808884143829346, + "learning_rate": 6.293706293706293e-05, + "loss": 1.5635, + "step": 90 + }, + { + "epoch": 0.009573908469226722, + "grad_norm": 1.0313398838043213, + "learning_rate": 6.363636363636364e-05, + "loss": 2.0061, + "step": 91 + }, + { + "epoch": 0.00967911625460284, + "grad_norm": 0.7924686074256897, + "learning_rate": 6.433566433566433e-05, + "loss": 2.0563, + "step": 92 + }, + { + "epoch": 0.009784324039978958, + "grad_norm": 1.0491949319839478, + "learning_rate": 6.503496503496504e-05, + "loss": 1.8572, + "step": 93 + }, + { + "epoch": 0.009889531825355076, + "grad_norm": 0.8003563284873962, + "learning_rate": 6.573426573426573e-05, + "loss": 2.1663, + "step": 94 + }, + { + "epoch": 0.009994739610731194, + "grad_norm": 0.8011645078659058, + "learning_rate": 6.643356643356644e-05, + "loss": 2.1442, + "step": 95 + }, + { + "epoch": 0.010099947396107312, + "grad_norm": 1.4755507707595825, + "learning_rate": 6.713286713286715e-05, + "loss": 1.5279, + "step": 96 + }, + { + "epoch": 0.01020515518148343, + "grad_norm": 1.2490993738174438, + "learning_rate": 6.783216783216784e-05, + "loss": 1.7732, + "step": 97 + }, + { + "epoch": 0.010310362966859547, + "grad_norm": 0.8913796544075012, + "learning_rate": 6.853146853146853e-05, + "loss": 1.6675, + "step": 98 + }, + { + "epoch": 0.010415570752235665, + "grad_norm": 2.243081569671631, + "learning_rate": 6.923076923076924e-05, + "loss": 1.9422, + "step": 99 + }, + { + "epoch": 0.010520778537611783, + "grad_norm": 1.7832255363464355, + "learning_rate": 6.993006993006993e-05, + "loss": 1.9094, + "step": 100 + }, + { + "epoch": 0.0106259863229879, + "grad_norm": 1.1693270206451416, + "learning_rate": 7.062937062937062e-05, + "loss": 2.2014, + "step": 101 + }, + { + "epoch": 0.010731194108364019, + "grad_norm": 0.9510181546211243, + "learning_rate": 7.132867132867134e-05, + "loss": 2.0675, + "step": 102 + }, + { + "epoch": 0.010836401893740136, + "grad_norm": 1.1362980604171753, + "learning_rate": 7.202797202797204e-05, + "loss": 1.8922, + "step": 103 + }, + { + "epoch": 0.010941609679116254, + "grad_norm": 1.42923903465271, + "learning_rate": 7.272727272727273e-05, + "loss": 1.5876, + "step": 104 + }, + { + "epoch": 0.011046817464492372, + "grad_norm": 0.838302493095398, + "learning_rate": 7.342657342657343e-05, + "loss": 2.0269, + "step": 105 + }, + { + "epoch": 0.01115202524986849, + "grad_norm": 0.8059056401252747, + "learning_rate": 7.412587412587413e-05, + "loss": 2.2676, + "step": 106 + }, + { + "epoch": 0.011257233035244608, + "grad_norm": 1.8598095178604126, + "learning_rate": 7.482517482517482e-05, + "loss": 2.2518, + "step": 107 + }, + { + "epoch": 0.011362440820620726, + "grad_norm": 1.1147247552871704, + "learning_rate": 7.552447552447553e-05, + "loss": 1.8006, + "step": 108 + }, + { + "epoch": 0.011467648605996843, + "grad_norm": 2.8933708667755127, + "learning_rate": 7.622377622377622e-05, + "loss": 1.4782, + "step": 109 + }, + { + "epoch": 0.011572856391372961, + "grad_norm": 1.3423585891723633, + "learning_rate": 7.692307692307693e-05, + "loss": 1.6204, + "step": 110 + }, + { + "epoch": 0.011678064176749079, + "grad_norm": 1.0002747774124146, + "learning_rate": 7.762237762237763e-05, + "loss": 1.5048, + "step": 111 + }, + { + "epoch": 0.011783271962125197, + "grad_norm": 1.511420488357544, + "learning_rate": 7.832167832167832e-05, + "loss": 2.4107, + "step": 112 + }, + { + "epoch": 0.011888479747501315, + "grad_norm": 0.7754486799240112, + "learning_rate": 7.902097902097903e-05, + "loss": 2.1876, + "step": 113 + }, + { + "epoch": 0.011993687532877433, + "grad_norm": 0.6452352404594421, + "learning_rate": 7.972027972027972e-05, + "loss": 2.3313, + "step": 114 + }, + { + "epoch": 0.01209889531825355, + "grad_norm": 0.9890616536140442, + "learning_rate": 8.041958041958042e-05, + "loss": 2.172, + "step": 115 + }, + { + "epoch": 0.012204103103629668, + "grad_norm": 1.8413726091384888, + "learning_rate": 8.111888111888112e-05, + "loss": 1.969, + "step": 116 + }, + { + "epoch": 0.012309310889005786, + "grad_norm": 0.800186812877655, + "learning_rate": 8.181818181818183e-05, + "loss": 2.268, + "step": 117 + }, + { + "epoch": 0.012414518674381904, + "grad_norm": 1.1103674173355103, + "learning_rate": 8.251748251748252e-05, + "loss": 1.9077, + "step": 118 + }, + { + "epoch": 0.012519726459758022, + "grad_norm": 4.220036029815674, + "learning_rate": 8.321678321678323e-05, + "loss": 2.0661, + "step": 119 + }, + { + "epoch": 0.01262493424513414, + "grad_norm": 1.084676742553711, + "learning_rate": 8.391608391608392e-05, + "loss": 2.1746, + "step": 120 + }, + { + "epoch": 0.012730142030510257, + "grad_norm": 1.1023154258728027, + "learning_rate": 8.461538461538461e-05, + "loss": 1.8782, + "step": 121 + }, + { + "epoch": 0.012835349815886375, + "grad_norm": 0.8550626635551453, + "learning_rate": 8.531468531468532e-05, + "loss": 2.0319, + "step": 122 + }, + { + "epoch": 0.012940557601262493, + "grad_norm": 0.5834708213806152, + "learning_rate": 8.601398601398601e-05, + "loss": 2.1768, + "step": 123 + }, + { + "epoch": 0.01304576538663861, + "grad_norm": 0.9890002012252808, + "learning_rate": 8.67132867132867e-05, + "loss": 2.1604, + "step": 124 + }, + { + "epoch": 0.013150973172014729, + "grad_norm": 1.2125439643859863, + "learning_rate": 8.741258741258743e-05, + "loss": 2.0555, + "step": 125 + }, + { + "epoch": 0.013256180957390847, + "grad_norm": 0.8572925329208374, + "learning_rate": 8.811188811188812e-05, + "loss": 2.1043, + "step": 126 + }, + { + "epoch": 0.013361388742766964, + "grad_norm": 0.8613000512123108, + "learning_rate": 8.881118881118881e-05, + "loss": 2.1363, + "step": 127 + }, + { + "epoch": 0.013466596528143082, + "grad_norm": 1.0686365365982056, + "learning_rate": 8.951048951048952e-05, + "loss": 2.1241, + "step": 128 + }, + { + "epoch": 0.0135718043135192, + "grad_norm": 0.6561310291290283, + "learning_rate": 9.020979020979021e-05, + "loss": 2.037, + "step": 129 + }, + { + "epoch": 0.013677012098895318, + "grad_norm": 1.03804612159729, + "learning_rate": 9.090909090909092e-05, + "loss": 2.0004, + "step": 130 + }, + { + "epoch": 0.013782219884271436, + "grad_norm": 0.842249870300293, + "learning_rate": 9.160839160839161e-05, + "loss": 2.1774, + "step": 131 + }, + { + "epoch": 0.013887427669647553, + "grad_norm": 1.4844613075256348, + "learning_rate": 9.230769230769232e-05, + "loss": 2.0346, + "step": 132 + }, + { + "epoch": 0.013992635455023671, + "grad_norm": 0.7268441319465637, + "learning_rate": 9.300699300699301e-05, + "loss": 2.3364, + "step": 133 + }, + { + "epoch": 0.01409784324039979, + "grad_norm": 0.8013659715652466, + "learning_rate": 9.370629370629372e-05, + "loss": 2.2674, + "step": 134 + }, + { + "epoch": 0.014203051025775907, + "grad_norm": 0.6665470004081726, + "learning_rate": 9.440559440559441e-05, + "loss": 2.0605, + "step": 135 + }, + { + "epoch": 0.014308258811152025, + "grad_norm": 1.0592727661132812, + "learning_rate": 9.510489510489511e-05, + "loss": 1.8648, + "step": 136 + }, + { + "epoch": 0.014413466596528143, + "grad_norm": 0.9042508602142334, + "learning_rate": 9.580419580419581e-05, + "loss": 1.9756, + "step": 137 + }, + { + "epoch": 0.01451867438190426, + "grad_norm": 0.7357807755470276, + "learning_rate": 9.65034965034965e-05, + "loss": 1.9707, + "step": 138 + }, + { + "epoch": 0.014623882167280378, + "grad_norm": 0.8247336149215698, + "learning_rate": 9.72027972027972e-05, + "loss": 2.1052, + "step": 139 + }, + { + "epoch": 0.014729089952656496, + "grad_norm": 1.1244010925292969, + "learning_rate": 9.790209790209791e-05, + "loss": 1.8881, + "step": 140 + }, + { + "epoch": 0.014834297738032614, + "grad_norm": 0.7070969343185425, + "learning_rate": 9.86013986013986e-05, + "loss": 1.9978, + "step": 141 + }, + { + "epoch": 0.014939505523408732, + "grad_norm": 1.1827497482299805, + "learning_rate": 9.930069930069931e-05, + "loss": 1.708, + "step": 142 + }, + { + "epoch": 0.01504471330878485, + "grad_norm": 1.9365792274475098, + "learning_rate": 0.0001, + "loss": 1.8381, + "step": 143 + }, + { + "epoch": 0.015149921094160967, + "grad_norm": 0.8472030758857727, + "learning_rate": 0.00010069930069930071, + "loss": 2.1147, + "step": 144 + }, + { + "epoch": 0.015255128879537085, + "grad_norm": 0.7535983920097351, + "learning_rate": 0.0001013986013986014, + "loss": 2.3344, + "step": 145 + }, + { + "epoch": 0.015360336664913203, + "grad_norm": 1.1090881824493408, + "learning_rate": 0.00010209790209790211, + "loss": 1.7615, + "step": 146 + }, + { + "epoch": 0.015465544450289321, + "grad_norm": 0.7076790928840637, + "learning_rate": 0.00010279720279720279, + "loss": 2.0014, + "step": 147 + }, + { + "epoch": 0.015570752235665439, + "grad_norm": 0.7309038043022156, + "learning_rate": 0.00010349650349650351, + "loss": 2.5436, + "step": 148 + }, + { + "epoch": 0.015675960021041557, + "grad_norm": 0.9821389317512512, + "learning_rate": 0.00010419580419580419, + "loss": 2.0839, + "step": 149 + }, + { + "epoch": 0.015781167806417674, + "grad_norm": 0.5570242404937744, + "learning_rate": 0.0001048951048951049, + "loss": 2.1916, + "step": 150 + }, + { + "epoch": 0.015886375591793792, + "grad_norm": 1.2793196439743042, + "learning_rate": 0.00010559440559440561, + "loss": 2.1197, + "step": 151 + }, + { + "epoch": 0.01599158337716991, + "grad_norm": 1.9450187683105469, + "learning_rate": 0.0001062937062937063, + "loss": 2.5529, + "step": 152 + }, + { + "epoch": 0.016096791162546028, + "grad_norm": 2.063152313232422, + "learning_rate": 0.000106993006993007, + "loss": 1.9606, + "step": 153 + }, + { + "epoch": 0.016201998947922146, + "grad_norm": 1.4163166284561157, + "learning_rate": 0.0001076923076923077, + "loss": 2.002, + "step": 154 + }, + { + "epoch": 0.016307206733298264, + "grad_norm": 0.6336003541946411, + "learning_rate": 0.0001083916083916084, + "loss": 2.3557, + "step": 155 + }, + { + "epoch": 0.01641241451867438, + "grad_norm": 1.3749457597732544, + "learning_rate": 0.00010909090909090909, + "loss": 2.3967, + "step": 156 + }, + { + "epoch": 0.0165176223040505, + "grad_norm": 0.8000378608703613, + "learning_rate": 0.0001097902097902098, + "loss": 2.0812, + "step": 157 + }, + { + "epoch": 0.016622830089426617, + "grad_norm": 0.8916708827018738, + "learning_rate": 0.00011048951048951048, + "loss": 1.8927, + "step": 158 + }, + { + "epoch": 0.016728037874802735, + "grad_norm": 0.8637097477912903, + "learning_rate": 0.0001111888111888112, + "loss": 1.8131, + "step": 159 + }, + { + "epoch": 0.016833245660178853, + "grad_norm": 0.6609115600585938, + "learning_rate": 0.0001118881118881119, + "loss": 1.9201, + "step": 160 + }, + { + "epoch": 0.01693845344555497, + "grad_norm": 1.2824020385742188, + "learning_rate": 0.00011258741258741258, + "loss": 2.3487, + "step": 161 + }, + { + "epoch": 0.01704366123093109, + "grad_norm": 1.6525263786315918, + "learning_rate": 0.0001132867132867133, + "loss": 1.9996, + "step": 162 + }, + { + "epoch": 0.017148869016307206, + "grad_norm": 1.3814963102340698, + "learning_rate": 0.00011398601398601398, + "loss": 1.7609, + "step": 163 + }, + { + "epoch": 0.017254076801683324, + "grad_norm": 0.6455362439155579, + "learning_rate": 0.00011468531468531469, + "loss": 2.1223, + "step": 164 + }, + { + "epoch": 0.017359284587059442, + "grad_norm": 0.7798193097114563, + "learning_rate": 0.00011538461538461538, + "loss": 2.0314, + "step": 165 + }, + { + "epoch": 0.01746449237243556, + "grad_norm": 0.7223178744316101, + "learning_rate": 0.00011608391608391609, + "loss": 2.0679, + "step": 166 + }, + { + "epoch": 0.017569700157811678, + "grad_norm": 0.8387543559074402, + "learning_rate": 0.0001167832167832168, + "loss": 2.2392, + "step": 167 + }, + { + "epoch": 0.017674907943187795, + "grad_norm": 1.1753877401351929, + "learning_rate": 0.00011748251748251749, + "loss": 2.2198, + "step": 168 + }, + { + "epoch": 0.017780115728563913, + "grad_norm": 0.6246500015258789, + "learning_rate": 0.0001181818181818182, + "loss": 1.9576, + "step": 169 + }, + { + "epoch": 0.01788532351394003, + "grad_norm": 0.7104965448379517, + "learning_rate": 0.00011888111888111889, + "loss": 1.8494, + "step": 170 + }, + { + "epoch": 0.01799053129931615, + "grad_norm": 0.6617504954338074, + "learning_rate": 0.00011958041958041959, + "loss": 2.1685, + "step": 171 + }, + { + "epoch": 0.018095739084692267, + "grad_norm": 0.5817732810974121, + "learning_rate": 0.00012027972027972027, + "loss": 2.4582, + "step": 172 + }, + { + "epoch": 0.018200946870068384, + "grad_norm": 1.5956623554229736, + "learning_rate": 0.00012097902097902098, + "loss": 1.9319, + "step": 173 + }, + { + "epoch": 0.018306154655444502, + "grad_norm": 0.6000881195068359, + "learning_rate": 0.0001216783216783217, + "loss": 2.2725, + "step": 174 + }, + { + "epoch": 0.01841136244082062, + "grad_norm": 1.0373468399047852, + "learning_rate": 0.00012237762237762238, + "loss": 2.3838, + "step": 175 + }, + { + "epoch": 0.018516570226196738, + "grad_norm": 0.6030766367912292, + "learning_rate": 0.0001230769230769231, + "loss": 2.0372, + "step": 176 + }, + { + "epoch": 0.018621778011572856, + "grad_norm": 0.7469233274459839, + "learning_rate": 0.00012377622377622376, + "loss": 2.1564, + "step": 177 + }, + { + "epoch": 0.018726985796948974, + "grad_norm": 0.9262884259223938, + "learning_rate": 0.00012447552447552448, + "loss": 1.4632, + "step": 178 + }, + { + "epoch": 0.01883219358232509, + "grad_norm": 0.8348685503005981, + "learning_rate": 0.00012517482517482518, + "loss": 2.193, + "step": 179 + }, + { + "epoch": 0.01893740136770121, + "grad_norm": 0.6114562153816223, + "learning_rate": 0.00012587412587412587, + "loss": 2.0964, + "step": 180 + }, + { + "epoch": 0.019042609153077327, + "grad_norm": 1.039941668510437, + "learning_rate": 0.0001265734265734266, + "loss": 1.9306, + "step": 181 + }, + { + "epoch": 0.019147816938453445, + "grad_norm": 0.730789303779602, + "learning_rate": 0.00012727272727272728, + "loss": 1.7754, + "step": 182 + }, + { + "epoch": 0.019253024723829563, + "grad_norm": 0.76559978723526, + "learning_rate": 0.00012797202797202797, + "loss": 2.1377, + "step": 183 + }, + { + "epoch": 0.01935823250920568, + "grad_norm": 0.7957161068916321, + "learning_rate": 0.00012867132867132867, + "loss": 2.2704, + "step": 184 + }, + { + "epoch": 0.0194634402945818, + "grad_norm": 0.9248109459877014, + "learning_rate": 0.0001293706293706294, + "loss": 2.0038, + "step": 185 + }, + { + "epoch": 0.019568648079957916, + "grad_norm": 0.7987109422683716, + "learning_rate": 0.00013006993006993008, + "loss": 2.2301, + "step": 186 + }, + { + "epoch": 0.019673855865334034, + "grad_norm": 0.5288488864898682, + "learning_rate": 0.00013076923076923077, + "loss": 1.9312, + "step": 187 + }, + { + "epoch": 0.019779063650710152, + "grad_norm": 0.6533221006393433, + "learning_rate": 0.00013146853146853147, + "loss": 2.0679, + "step": 188 + }, + { + "epoch": 0.01988427143608627, + "grad_norm": 1.0460737943649292, + "learning_rate": 0.00013216783216783219, + "loss": 1.9375, + "step": 189 + }, + { + "epoch": 0.019989479221462388, + "grad_norm": 1.1394834518432617, + "learning_rate": 0.00013286713286713288, + "loss": 2.248, + "step": 190 + }, + { + "epoch": 0.020094687006838505, + "grad_norm": 0.6399918794631958, + "learning_rate": 0.00013356643356643357, + "loss": 2.1755, + "step": 191 + }, + { + "epoch": 0.020199894792214623, + "grad_norm": 1.1859235763549805, + "learning_rate": 0.0001342657342657343, + "loss": 1.8647, + "step": 192 + }, + { + "epoch": 0.02030510257759074, + "grad_norm": 0.7891494631767273, + "learning_rate": 0.00013496503496503496, + "loss": 2.1801, + "step": 193 + }, + { + "epoch": 0.02041031036296686, + "grad_norm": 0.8865561485290527, + "learning_rate": 0.00013566433566433568, + "loss": 1.9941, + "step": 194 + }, + { + "epoch": 0.020515518148342977, + "grad_norm": 0.6699333786964417, + "learning_rate": 0.00013636363636363637, + "loss": 1.9786, + "step": 195 + }, + { + "epoch": 0.020620725933719095, + "grad_norm": 0.6600997447967529, + "learning_rate": 0.00013706293706293706, + "loss": 2.2511, + "step": 196 + }, + { + "epoch": 0.020725933719095212, + "grad_norm": 0.6689944863319397, + "learning_rate": 0.00013776223776223778, + "loss": 2.0213, + "step": 197 + }, + { + "epoch": 0.02083114150447133, + "grad_norm": 0.8018683791160583, + "learning_rate": 0.00013846153846153847, + "loss": 2.0495, + "step": 198 + }, + { + "epoch": 0.020936349289847448, + "grad_norm": 1.0803660154342651, + "learning_rate": 0.00013916083916083917, + "loss": 1.1935, + "step": 199 + }, + { + "epoch": 0.021041557075223566, + "grad_norm": 2.967259168624878, + "learning_rate": 0.00013986013986013986, + "loss": 1.5776, + "step": 200 + }, + { + "epoch": 0.021146764860599684, + "grad_norm": 0.9621496200561523, + "learning_rate": 0.00014055944055944058, + "loss": 1.9666, + "step": 201 + }, + { + "epoch": 0.0212519726459758, + "grad_norm": 0.8522931933403015, + "learning_rate": 0.00014125874125874125, + "loss": 1.8651, + "step": 202 + }, + { + "epoch": 0.02135718043135192, + "grad_norm": 0.942253589630127, + "learning_rate": 0.00014195804195804197, + "loss": 2.1782, + "step": 203 + }, + { + "epoch": 0.021462388216728037, + "grad_norm": 0.6087969541549683, + "learning_rate": 0.00014265734265734269, + "loss": 2.165, + "step": 204 + }, + { + "epoch": 0.021567596002104155, + "grad_norm": 0.6847811937332153, + "learning_rate": 0.00014335664335664335, + "loss": 2.1116, + "step": 205 + }, + { + "epoch": 0.021672803787480273, + "grad_norm": 0.9084582328796387, + "learning_rate": 0.00014405594405594407, + "loss": 1.9343, + "step": 206 + }, + { + "epoch": 0.02177801157285639, + "grad_norm": 1.0526587963104248, + "learning_rate": 0.00014475524475524476, + "loss": 2.1309, + "step": 207 + }, + { + "epoch": 0.02188321935823251, + "grad_norm": 0.8651286363601685, + "learning_rate": 0.00014545454545454546, + "loss": 2.1103, + "step": 208 + }, + { + "epoch": 0.021988427143608626, + "grad_norm": 0.6131342053413391, + "learning_rate": 0.00014615384615384615, + "loss": 1.9468, + "step": 209 + }, + { + "epoch": 0.022093634928984744, + "grad_norm": 0.784864068031311, + "learning_rate": 0.00014685314685314687, + "loss": 2.1167, + "step": 210 + }, + { + "epoch": 0.022198842714360862, + "grad_norm": 1.1468371152877808, + "learning_rate": 0.00014755244755244756, + "loss": 1.8572, + "step": 211 + }, + { + "epoch": 0.02230405049973698, + "grad_norm": 0.9270665049552917, + "learning_rate": 0.00014825174825174825, + "loss": 1.9398, + "step": 212 + }, + { + "epoch": 0.022409258285113098, + "grad_norm": 0.7162615060806274, + "learning_rate": 0.00014895104895104897, + "loss": 2.0499, + "step": 213 + }, + { + "epoch": 0.022514466070489215, + "grad_norm": 0.6274422407150269, + "learning_rate": 0.00014965034965034964, + "loss": 1.8765, + "step": 214 + }, + { + "epoch": 0.022619673855865333, + "grad_norm": 1.4804967641830444, + "learning_rate": 0.00015034965034965036, + "loss": 1.7015, + "step": 215 + }, + { + "epoch": 0.02272488164124145, + "grad_norm": 0.5013079047203064, + "learning_rate": 0.00015104895104895105, + "loss": 2.0067, + "step": 216 + }, + { + "epoch": 0.02283008942661757, + "grad_norm": 0.7269556522369385, + "learning_rate": 0.00015174825174825175, + "loss": 1.934, + "step": 217 + }, + { + "epoch": 0.022935297211993687, + "grad_norm": 0.9243491291999817, + "learning_rate": 0.00015244755244755244, + "loss": 2.2308, + "step": 218 + }, + { + "epoch": 0.023040504997369805, + "grad_norm": 0.7595291137695312, + "learning_rate": 0.00015314685314685316, + "loss": 2.0564, + "step": 219 + }, + { + "epoch": 0.023145712782745922, + "grad_norm": 0.8647655248641968, + "learning_rate": 0.00015384615384615385, + "loss": 2.0716, + "step": 220 + }, + { + "epoch": 0.02325092056812204, + "grad_norm": 1.0030921697616577, + "learning_rate": 0.00015454545454545454, + "loss": 1.9771, + "step": 221 + }, + { + "epoch": 0.023356128353498158, + "grad_norm": 0.588725745677948, + "learning_rate": 0.00015524475524475526, + "loss": 2.2316, + "step": 222 + }, + { + "epoch": 0.023461336138874276, + "grad_norm": 1.029255747795105, + "learning_rate": 0.00015594405594405596, + "loss": 2.2892, + "step": 223 + }, + { + "epoch": 0.023566543924250394, + "grad_norm": 0.9255016446113586, + "learning_rate": 0.00015664335664335665, + "loss": 2.2594, + "step": 224 + }, + { + "epoch": 0.02367175170962651, + "grad_norm": 0.84700608253479, + "learning_rate": 0.00015734265734265734, + "loss": 1.8739, + "step": 225 + }, + { + "epoch": 0.02377695949500263, + "grad_norm": 0.8125012516975403, + "learning_rate": 0.00015804195804195806, + "loss": 2.0795, + "step": 226 + }, + { + "epoch": 0.023882167280378747, + "grad_norm": 0.8614683151245117, + "learning_rate": 0.00015874125874125876, + "loss": 2.4142, + "step": 227 + }, + { + "epoch": 0.023987375065754865, + "grad_norm": 1.1380670070648193, + "learning_rate": 0.00015944055944055945, + "loss": 2.1002, + "step": 228 + }, + { + "epoch": 0.024092582851130983, + "grad_norm": 0.4754476547241211, + "learning_rate": 0.00016013986013986014, + "loss": 2.0722, + "step": 229 + }, + { + "epoch": 0.0241977906365071, + "grad_norm": 0.861049473285675, + "learning_rate": 0.00016083916083916083, + "loss": 1.9022, + "step": 230 + }, + { + "epoch": 0.02430299842188322, + "grad_norm": 1.181152582168579, + "learning_rate": 0.00016153846153846155, + "loss": 2.0356, + "step": 231 + }, + { + "epoch": 0.024408206207259336, + "grad_norm": 0.8931273818016052, + "learning_rate": 0.00016223776223776225, + "loss": 1.7346, + "step": 232 + }, + { + "epoch": 0.024513413992635454, + "grad_norm": 0.7377164959907532, + "learning_rate": 0.00016293706293706294, + "loss": 2.0764, + "step": 233 + }, + { + "epoch": 0.024618621778011572, + "grad_norm": 1.085389494895935, + "learning_rate": 0.00016363636363636366, + "loss": 1.9141, + "step": 234 + }, + { + "epoch": 0.02472382956338769, + "grad_norm": 1.0622587203979492, + "learning_rate": 0.00016433566433566435, + "loss": 1.8455, + "step": 235 + }, + { + "epoch": 0.024829037348763808, + "grad_norm": 1.0064948797225952, + "learning_rate": 0.00016503496503496504, + "loss": 1.9076, + "step": 236 + }, + { + "epoch": 0.024934245134139926, + "grad_norm": 0.5665073394775391, + "learning_rate": 0.00016573426573426574, + "loss": 2.0581, + "step": 237 + }, + { + "epoch": 0.025039452919516043, + "grad_norm": 1.042952299118042, + "learning_rate": 0.00016643356643356646, + "loss": 1.9164, + "step": 238 + }, + { + "epoch": 0.02514466070489216, + "grad_norm": 1.4019465446472168, + "learning_rate": 0.00016713286713286712, + "loss": 1.8818, + "step": 239 + }, + { + "epoch": 0.02524986849026828, + "grad_norm": 1.0074125528335571, + "learning_rate": 0.00016783216783216784, + "loss": 1.971, + "step": 240 + }, + { + "epoch": 0.025355076275644397, + "grad_norm": 0.9089799523353577, + "learning_rate": 0.00016853146853146856, + "loss": 2.087, + "step": 241 + }, + { + "epoch": 0.025460284061020515, + "grad_norm": 0.7242212891578674, + "learning_rate": 0.00016923076923076923, + "loss": 2.1149, + "step": 242 + }, + { + "epoch": 0.025565491846396633, + "grad_norm": 1.1103755235671997, + "learning_rate": 0.00016993006993006995, + "loss": 2.1568, + "step": 243 + }, + { + "epoch": 0.02567069963177275, + "grad_norm": 1.03432297706604, + "learning_rate": 0.00017062937062937064, + "loss": 2.1965, + "step": 244 + }, + { + "epoch": 0.025775907417148868, + "grad_norm": 0.8843188881874084, + "learning_rate": 0.00017132867132867133, + "loss": 1.7656, + "step": 245 + }, + { + "epoch": 0.025881115202524986, + "grad_norm": 1.1456265449523926, + "learning_rate": 0.00017202797202797203, + "loss": 2.2859, + "step": 246 + }, + { + "epoch": 0.025986322987901104, + "grad_norm": 1.0545116662979126, + "learning_rate": 0.00017272727272727275, + "loss": 2.0337, + "step": 247 + }, + { + "epoch": 0.02609153077327722, + "grad_norm": 1.3290053606033325, + "learning_rate": 0.0001734265734265734, + "loss": 2.1952, + "step": 248 + }, + { + "epoch": 0.02619673855865334, + "grad_norm": 1.2462905645370483, + "learning_rate": 0.00017412587412587413, + "loss": 2.1667, + "step": 249 + }, + { + "epoch": 0.026301946344029457, + "grad_norm": 1.0230878591537476, + "learning_rate": 0.00017482517482517485, + "loss": 2.0687, + "step": 250 + }, + { + "epoch": 0.026407154129405575, + "grad_norm": 0.6971817016601562, + "learning_rate": 0.00017552447552447552, + "loss": 1.947, + "step": 251 + }, + { + "epoch": 0.026512361914781693, + "grad_norm": 1.1189419031143188, + "learning_rate": 0.00017622377622377624, + "loss": 1.9158, + "step": 252 + }, + { + "epoch": 0.02661756970015781, + "grad_norm": 0.9112401604652405, + "learning_rate": 0.00017692307692307693, + "loss": 2.2175, + "step": 253 + }, + { + "epoch": 0.02672277748553393, + "grad_norm": 0.6773673892021179, + "learning_rate": 0.00017762237762237762, + "loss": 2.0269, + "step": 254 + }, + { + "epoch": 0.026827985270910047, + "grad_norm": 1.021903157234192, + "learning_rate": 0.00017832167832167832, + "loss": 1.4599, + "step": 255 + }, + { + "epoch": 0.026933193056286164, + "grad_norm": 1.0119444131851196, + "learning_rate": 0.00017902097902097904, + "loss": 2.5356, + "step": 256 + }, + { + "epoch": 0.027038400841662282, + "grad_norm": 0.8079578280448914, + "learning_rate": 0.00017972027972027973, + "loss": 1.7094, + "step": 257 + }, + { + "epoch": 0.0271436086270384, + "grad_norm": 0.8587137460708618, + "learning_rate": 0.00018041958041958042, + "loss": 1.9044, + "step": 258 + }, + { + "epoch": 0.027248816412414518, + "grad_norm": 0.4971306324005127, + "learning_rate": 0.00018111888111888114, + "loss": 2.7055, + "step": 259 + }, + { + "epoch": 0.027354024197790636, + "grad_norm": 0.6644048094749451, + "learning_rate": 0.00018181818181818183, + "loss": 2.22, + "step": 260 + }, + { + "epoch": 0.027459231983166753, + "grad_norm": 0.8848311901092529, + "learning_rate": 0.00018251748251748253, + "loss": 2.1772, + "step": 261 + }, + { + "epoch": 0.02756443976854287, + "grad_norm": 0.8665701746940613, + "learning_rate": 0.00018321678321678322, + "loss": 2.0003, + "step": 262 + }, + { + "epoch": 0.02766964755391899, + "grad_norm": 0.9104865193367004, + "learning_rate": 0.0001839160839160839, + "loss": 2.2117, + "step": 263 + }, + { + "epoch": 0.027774855339295107, + "grad_norm": 0.7475347518920898, + "learning_rate": 0.00018461538461538463, + "loss": 2.0686, + "step": 264 + }, + { + "epoch": 0.027880063124671225, + "grad_norm": 0.9693028926849365, + "learning_rate": 0.00018531468531468533, + "loss": 2.1383, + "step": 265 + }, + { + "epoch": 0.027985270910047343, + "grad_norm": 0.6588386297225952, + "learning_rate": 0.00018601398601398602, + "loss": 1.8683, + "step": 266 + }, + { + "epoch": 0.02809047869542346, + "grad_norm": 1.076177716255188, + "learning_rate": 0.0001867132867132867, + "loss": 1.5809, + "step": 267 + }, + { + "epoch": 0.02819568648079958, + "grad_norm": 0.6180933713912964, + "learning_rate": 0.00018741258741258743, + "loss": 1.9758, + "step": 268 + }, + { + "epoch": 0.028300894266175696, + "grad_norm": 1.6872791051864624, + "learning_rate": 0.00018811188811188812, + "loss": 2.3591, + "step": 269 + }, + { + "epoch": 0.028406102051551814, + "grad_norm": 1.2484400272369385, + "learning_rate": 0.00018881118881118882, + "loss": 1.7551, + "step": 270 + }, + { + "epoch": 0.028511309836927932, + "grad_norm": 0.5355095863342285, + "learning_rate": 0.00018951048951048954, + "loss": 2.1445, + "step": 271 + }, + { + "epoch": 0.02861651762230405, + "grad_norm": 0.8856534361839294, + "learning_rate": 0.00019020979020979023, + "loss": 2.0174, + "step": 272 + }, + { + "epoch": 0.028721725407680167, + "grad_norm": 1.0029089450836182, + "learning_rate": 0.00019090909090909092, + "loss": 2.0354, + "step": 273 + }, + { + "epoch": 0.028826933193056285, + "grad_norm": 0.859724760055542, + "learning_rate": 0.00019160839160839161, + "loss": 2.0125, + "step": 274 + }, + { + "epoch": 0.028932140978432403, + "grad_norm": 1.281600832939148, + "learning_rate": 0.00019230769230769233, + "loss": 1.8134, + "step": 275 + }, + { + "epoch": 0.02903734876380852, + "grad_norm": 1.7904019355773926, + "learning_rate": 0.000193006993006993, + "loss": 1.7277, + "step": 276 + }, + { + "epoch": 0.02914255654918464, + "grad_norm": 0.7003609538078308, + "learning_rate": 0.00019370629370629372, + "loss": 2.0308, + "step": 277 + }, + { + "epoch": 0.029247764334560757, + "grad_norm": 0.8506051898002625, + "learning_rate": 0.0001944055944055944, + "loss": 1.8183, + "step": 278 + }, + { + "epoch": 0.029352972119936874, + "grad_norm": 0.7373519539833069, + "learning_rate": 0.0001951048951048951, + "loss": 2.0156, + "step": 279 + }, + { + "epoch": 0.029458179905312992, + "grad_norm": 0.8916333913803101, + "learning_rate": 0.00019580419580419583, + "loss": 1.9928, + "step": 280 + }, + { + "epoch": 0.02956338769068911, + "grad_norm": 0.7887770533561707, + "learning_rate": 0.00019650349650349652, + "loss": 2.0476, + "step": 281 + }, + { + "epoch": 0.029668595476065228, + "grad_norm": 1.0839449167251587, + "learning_rate": 0.0001972027972027972, + "loss": 2.457, + "step": 282 + }, + { + "epoch": 0.029773803261441346, + "grad_norm": 0.7722465395927429, + "learning_rate": 0.0001979020979020979, + "loss": 2.1979, + "step": 283 + }, + { + "epoch": 0.029879011046817464, + "grad_norm": 0.6390840411186218, + "learning_rate": 0.00019860139860139862, + "loss": 1.9645, + "step": 284 + }, + { + "epoch": 0.02998421883219358, + "grad_norm": 0.6858240365982056, + "learning_rate": 0.0001993006993006993, + "loss": 2.0411, + "step": 285 + }, + { + "epoch": 0.0300894266175697, + "grad_norm": 0.831282377243042, + "learning_rate": 0.0002, + "loss": 2.4271, + "step": 286 + }, + { + "epoch": 0.030194634402945817, + "grad_norm": 1.3080503940582275, + "learning_rate": 0.00019999999419366464, + "loss": 2.4988, + "step": 287 + }, + { + "epoch": 0.030299842188321935, + "grad_norm": 0.6604061126708984, + "learning_rate": 0.00019999997677465928, + "loss": 2.3058, + "step": 288 + }, + { + "epoch": 0.030405049973698053, + "grad_norm": 0.682809591293335, + "learning_rate": 0.00019999994774298586, + "loss": 2.3447, + "step": 289 + }, + { + "epoch": 0.03051025775907417, + "grad_norm": 0.7935196161270142, + "learning_rate": 0.00019999990709864784, + "loss": 1.9286, + "step": 290 + }, + { + "epoch": 0.03061546554445029, + "grad_norm": 0.7283986806869507, + "learning_rate": 0.00019999985484164988, + "loss": 2.1939, + "step": 291 + }, + { + "epoch": 0.030720673329826406, + "grad_norm": 0.6452785134315491, + "learning_rate": 0.00019999979097199807, + "loss": 2.186, + "step": 292 + }, + { + "epoch": 0.030825881115202524, + "grad_norm": 0.7383095622062683, + "learning_rate": 0.00019999971548969982, + "loss": 2.4815, + "step": 293 + }, + { + "epoch": 0.030931088900578642, + "grad_norm": 0.8694736361503601, + "learning_rate": 0.00019999962839476393, + "loss": 1.8029, + "step": 294 + }, + { + "epoch": 0.03103629668595476, + "grad_norm": 1.1409188508987427, + "learning_rate": 0.00019999952968720045, + "loss": 1.8402, + "step": 295 + }, + { + "epoch": 0.031141504471330878, + "grad_norm": 1.1897854804992676, + "learning_rate": 0.0001999994193670209, + "loss": 2.2032, + "step": 296 + }, + { + "epoch": 0.031246712256706995, + "grad_norm": 1.0531314611434937, + "learning_rate": 0.00019999929743423804, + "loss": 2.0809, + "step": 297 + }, + { + "epoch": 0.03135192004208311, + "grad_norm": 0.9149481654167175, + "learning_rate": 0.0001999991638888661, + "loss": 1.9994, + "step": 298 + }, + { + "epoch": 0.031457127827459234, + "grad_norm": 0.9323375821113586, + "learning_rate": 0.00019999901873092054, + "loss": 2.1517, + "step": 299 + }, + { + "epoch": 0.03156233561283535, + "grad_norm": 2.169660806655884, + "learning_rate": 0.0001999988619604182, + "loss": 1.935, + "step": 300 + }, + { + "epoch": 0.03166754339821147, + "grad_norm": 1.145294189453125, + "learning_rate": 0.0001999986935773773, + "loss": 2.2346, + "step": 301 + }, + { + "epoch": 0.031772751183587584, + "grad_norm": 0.9163552522659302, + "learning_rate": 0.00019999851358181746, + "loss": 1.757, + "step": 302 + }, + { + "epoch": 0.031877958968963706, + "grad_norm": 0.9327316284179688, + "learning_rate": 0.00019999832197375948, + "loss": 2.0945, + "step": 303 + }, + { + "epoch": 0.03198316675433982, + "grad_norm": 1.3414212465286255, + "learning_rate": 0.00019999811875322566, + "loss": 1.8407, + "step": 304 + }, + { + "epoch": 0.03208837453971594, + "grad_norm": 0.7236807346343994, + "learning_rate": 0.0001999979039202396, + "loss": 2.1481, + "step": 305 + }, + { + "epoch": 0.032193582325092056, + "grad_norm": 0.7519909143447876, + "learning_rate": 0.00019999767747482623, + "loss": 2.156, + "step": 306 + }, + { + "epoch": 0.03229879011046818, + "grad_norm": 1.3652244806289673, + "learning_rate": 0.00019999743941701188, + "loss": 1.8686, + "step": 307 + }, + { + "epoch": 0.03240399789584429, + "grad_norm": 0.5761735439300537, + "learning_rate": 0.00019999718974682417, + "loss": 2.0619, + "step": 308 + }, + { + "epoch": 0.03250920568122041, + "grad_norm": 1.1688545942306519, + "learning_rate": 0.0001999969284642921, + "loss": 2.0442, + "step": 309 + }, + { + "epoch": 0.03261441346659653, + "grad_norm": 0.6734681129455566, + "learning_rate": 0.000199996655569446, + "loss": 2.2581, + "step": 310 + }, + { + "epoch": 0.03271962125197265, + "grad_norm": 0.9942630529403687, + "learning_rate": 0.00019999637106231756, + "loss": 2.0594, + "step": 311 + }, + { + "epoch": 0.03282482903734876, + "grad_norm": 0.9288811087608337, + "learning_rate": 0.00019999607494293985, + "loss": 1.8957, + "step": 312 + }, + { + "epoch": 0.032930036822724884, + "grad_norm": 2.1428439617156982, + "learning_rate": 0.00019999576721134723, + "loss": 2.2481, + "step": 313 + }, + { + "epoch": 0.033035244608101, + "grad_norm": 1.1253790855407715, + "learning_rate": 0.00019999544786757545, + "loss": 2.0877, + "step": 314 + }, + { + "epoch": 0.03314045239347712, + "grad_norm": 0.9545801281929016, + "learning_rate": 0.00019999511691166157, + "loss": 2.2495, + "step": 315 + }, + { + "epoch": 0.033245660178853234, + "grad_norm": 0.9224940538406372, + "learning_rate": 0.00019999477434364405, + "loss": 2.0281, + "step": 316 + }, + { + "epoch": 0.033350867964229355, + "grad_norm": 0.9393844604492188, + "learning_rate": 0.00019999442016356266, + "loss": 1.7388, + "step": 317 + }, + { + "epoch": 0.03345607574960547, + "grad_norm": 0.8138694763183594, + "learning_rate": 0.00019999405437145856, + "loss": 1.8485, + "step": 318 + }, + { + "epoch": 0.03356128353498159, + "grad_norm": 0.9134123921394348, + "learning_rate": 0.00019999367696737415, + "loss": 1.9404, + "step": 319 + }, + { + "epoch": 0.033666491320357705, + "grad_norm": 0.7344478368759155, + "learning_rate": 0.0001999932879513533, + "loss": 2.4859, + "step": 320 + }, + { + "epoch": 0.03377169910573383, + "grad_norm": 1.5100687742233276, + "learning_rate": 0.00019999288732344122, + "loss": 1.9299, + "step": 321 + }, + { + "epoch": 0.03387690689110994, + "grad_norm": 0.7897876501083374, + "learning_rate": 0.0001999924750836844, + "loss": 2.0341, + "step": 322 + }, + { + "epoch": 0.03398211467648606, + "grad_norm": 0.8573527932167053, + "learning_rate": 0.00019999205123213073, + "loss": 1.8272, + "step": 323 + }, + { + "epoch": 0.03408732246186218, + "grad_norm": 0.6045930981636047, + "learning_rate": 0.00019999161576882938, + "loss": 2.3906, + "step": 324 + }, + { + "epoch": 0.0341925302472383, + "grad_norm": 1.1515878438949585, + "learning_rate": 0.000199991168693831, + "loss": 1.8901, + "step": 325 + }, + { + "epoch": 0.03429773803261441, + "grad_norm": 0.7033329606056213, + "learning_rate": 0.00019999071000718742, + "loss": 2.0389, + "step": 326 + }, + { + "epoch": 0.034402945817990534, + "grad_norm": 0.7773725986480713, + "learning_rate": 0.00019999023970895198, + "loss": 1.7427, + "step": 327 + }, + { + "epoch": 0.03450815360336665, + "grad_norm": 0.6639922261238098, + "learning_rate": 0.0001999897577991792, + "loss": 2.4179, + "step": 328 + }, + { + "epoch": 0.03461336138874277, + "grad_norm": 0.8306198716163635, + "learning_rate": 0.00019998926427792517, + "loss": 1.8829, + "step": 329 + }, + { + "epoch": 0.034718569174118884, + "grad_norm": 0.9085239768028259, + "learning_rate": 0.00019998875914524714, + "loss": 2.3608, + "step": 330 + }, + { + "epoch": 0.034823776959495005, + "grad_norm": 1.2059913873672485, + "learning_rate": 0.00019998824240120372, + "loss": 2.1859, + "step": 331 + }, + { + "epoch": 0.03492898474487112, + "grad_norm": 0.8465059399604797, + "learning_rate": 0.000199987714045855, + "loss": 1.9598, + "step": 332 + }, + { + "epoch": 0.03503419253024724, + "grad_norm": 1.1591929197311401, + "learning_rate": 0.00019998717407926228, + "loss": 1.7713, + "step": 333 + }, + { + "epoch": 0.035139400315623355, + "grad_norm": 1.2473613023757935, + "learning_rate": 0.0001999866225014883, + "loss": 2.0015, + "step": 334 + }, + { + "epoch": 0.035244608100999476, + "grad_norm": 1.0094128847122192, + "learning_rate": 0.0001999860593125971, + "loss": 2.3931, + "step": 335 + }, + { + "epoch": 0.03534981588637559, + "grad_norm": 0.587139368057251, + "learning_rate": 0.00019998548451265405, + "loss": 2.1993, + "step": 336 + }, + { + "epoch": 0.03545502367175171, + "grad_norm": 1.4602278470993042, + "learning_rate": 0.00019998489810172596, + "loss": 2.1095, + "step": 337 + }, + { + "epoch": 0.035560231457127826, + "grad_norm": 0.9542207717895508, + "learning_rate": 0.00019998430007988087, + "loss": 2.0584, + "step": 338 + }, + { + "epoch": 0.03566543924250395, + "grad_norm": 0.9044630527496338, + "learning_rate": 0.00019998369044718826, + "loss": 2.2235, + "step": 339 + }, + { + "epoch": 0.03577064702788006, + "grad_norm": 0.8812581300735474, + "learning_rate": 0.0001999830692037189, + "loss": 2.0134, + "step": 340 + }, + { + "epoch": 0.03587585481325618, + "grad_norm": 0.8419604301452637, + "learning_rate": 0.000199982436349545, + "loss": 2.0388, + "step": 341 + }, + { + "epoch": 0.0359810625986323, + "grad_norm": 0.7698401808738708, + "learning_rate": 0.00019998179188473997, + "loss": 1.9938, + "step": 342 + }, + { + "epoch": 0.03608627038400842, + "grad_norm": 0.808614194393158, + "learning_rate": 0.0001999811358093787, + "loss": 2.1887, + "step": 343 + }, + { + "epoch": 0.03619147816938453, + "grad_norm": 0.756734311580658, + "learning_rate": 0.00019998046812353732, + "loss": 2.1403, + "step": 344 + }, + { + "epoch": 0.036296685954760655, + "grad_norm": 0.8613669276237488, + "learning_rate": 0.00019997978882729345, + "loss": 1.4033, + "step": 345 + }, + { + "epoch": 0.03640189374013677, + "grad_norm": 0.8686724305152893, + "learning_rate": 0.0001999790979207259, + "loss": 2.2557, + "step": 346 + }, + { + "epoch": 0.03650710152551289, + "grad_norm": 1.0263909101486206, + "learning_rate": 0.00019997839540391495, + "loss": 2.1586, + "step": 347 + }, + { + "epoch": 0.036612309310889005, + "grad_norm": 1.0269930362701416, + "learning_rate": 0.00019997768127694214, + "loss": 2.3377, + "step": 348 + }, + { + "epoch": 0.036717517096265126, + "grad_norm": 1.0349308252334595, + "learning_rate": 0.00019997695553989042, + "loss": 1.8215, + "step": 349 + }, + { + "epoch": 0.03682272488164124, + "grad_norm": 0.8196139931678772, + "learning_rate": 0.0001999762181928441, + "loss": 2.0538, + "step": 350 + }, + { + "epoch": 0.03692793266701736, + "grad_norm": 0.9920759797096252, + "learning_rate": 0.00019997546923588875, + "loss": 2.2415, + "step": 351 + }, + { + "epoch": 0.037033140452393476, + "grad_norm": 1.101965069770813, + "learning_rate": 0.00019997470866911136, + "loss": 1.9107, + "step": 352 + }, + { + "epoch": 0.0371383482377696, + "grad_norm": 1.4456695318222046, + "learning_rate": 0.00019997393649260028, + "loss": 2.3002, + "step": 353 + }, + { + "epoch": 0.03724355602314571, + "grad_norm": 1.1259775161743164, + "learning_rate": 0.00019997315270644514, + "loss": 1.8325, + "step": 354 + }, + { + "epoch": 0.03734876380852183, + "grad_norm": 0.7673949003219604, + "learning_rate": 0.000199972357310737, + "loss": 2.3046, + "step": 355 + }, + { + "epoch": 0.03745397159389795, + "grad_norm": 0.8659745454788208, + "learning_rate": 0.00019997155030556822, + "loss": 2.1083, + "step": 356 + }, + { + "epoch": 0.03755917937927407, + "grad_norm": 0.6444668769836426, + "learning_rate": 0.0001999707316910325, + "loss": 2.1361, + "step": 357 + }, + { + "epoch": 0.03766438716465018, + "grad_norm": 0.978226900100708, + "learning_rate": 0.0001999699014672249, + "loss": 2.1801, + "step": 358 + }, + { + "epoch": 0.037769594950026304, + "grad_norm": 1.1167306900024414, + "learning_rate": 0.0001999690596342418, + "loss": 2.3531, + "step": 359 + }, + { + "epoch": 0.03787480273540242, + "grad_norm": 0.9561347365379333, + "learning_rate": 0.00019996820619218105, + "loss": 2.2767, + "step": 360 + }, + { + "epoch": 0.03798001052077854, + "grad_norm": 0.6484118700027466, + "learning_rate": 0.00019996734114114165, + "loss": 2.2618, + "step": 361 + }, + { + "epoch": 0.038085218306154654, + "grad_norm": 1.1894108057022095, + "learning_rate": 0.00019996646448122414, + "loss": 2.169, + "step": 362 + }, + { + "epoch": 0.038190426091530776, + "grad_norm": 0.7844385504722595, + "learning_rate": 0.00019996557621253027, + "loss": 1.8752, + "step": 363 + }, + { + "epoch": 0.03829563387690689, + "grad_norm": 0.7246622443199158, + "learning_rate": 0.00019996467633516326, + "loss": 2.1476, + "step": 364 + }, + { + "epoch": 0.03840084166228301, + "grad_norm": 0.7702452540397644, + "learning_rate": 0.0001999637648492275, + "loss": 2.038, + "step": 365 + }, + { + "epoch": 0.038506049447659126, + "grad_norm": 0.7791807055473328, + "learning_rate": 0.00019996284175482893, + "loss": 1.9363, + "step": 366 + }, + { + "epoch": 0.03861125723303525, + "grad_norm": 0.650177538394928, + "learning_rate": 0.00019996190705207475, + "loss": 2.0905, + "step": 367 + }, + { + "epoch": 0.03871646501841136, + "grad_norm": 1.108355164527893, + "learning_rate": 0.00019996096074107342, + "loss": 2.0743, + "step": 368 + }, + { + "epoch": 0.03882167280378748, + "grad_norm": 1.025335431098938, + "learning_rate": 0.0001999600028219349, + "loss": 1.895, + "step": 369 + }, + { + "epoch": 0.0389268805891636, + "grad_norm": 1.1372300386428833, + "learning_rate": 0.0001999590332947704, + "loss": 2.1421, + "step": 370 + }, + { + "epoch": 0.03903208837453972, + "grad_norm": 1.4822285175323486, + "learning_rate": 0.00019995805215969258, + "loss": 2.3544, + "step": 371 + }, + { + "epoch": 0.03913729615991583, + "grad_norm": 1.2210115194320679, + "learning_rate": 0.00019995705941681523, + "loss": 2.0829, + "step": 372 + }, + { + "epoch": 0.039242503945291954, + "grad_norm": 0.9157518148422241, + "learning_rate": 0.00019995605506625377, + "loss": 1.9328, + "step": 373 + }, + { + "epoch": 0.03934771173066807, + "grad_norm": 0.8865572214126587, + "learning_rate": 0.00019995503910812478, + "loss": 2.202, + "step": 374 + }, + { + "epoch": 0.03945291951604419, + "grad_norm": 1.0392677783966064, + "learning_rate": 0.00019995401154254626, + "loss": 2.1567, + "step": 375 + }, + { + "epoch": 0.039558127301420304, + "grad_norm": 1.0013548135757446, + "learning_rate": 0.00019995297236963749, + "loss": 1.9707, + "step": 376 + }, + { + "epoch": 0.039663335086796425, + "grad_norm": 0.6896701455116272, + "learning_rate": 0.00019995192158951919, + "loss": 1.8789, + "step": 377 + }, + { + "epoch": 0.03976854287217254, + "grad_norm": 0.6109434366226196, + "learning_rate": 0.00019995085920231336, + "loss": 2.2985, + "step": 378 + }, + { + "epoch": 0.03987375065754866, + "grad_norm": 0.7272301912307739, + "learning_rate": 0.00019994978520814337, + "loss": 1.8993, + "step": 379 + }, + { + "epoch": 0.039978958442924775, + "grad_norm": 1.2493723630905151, + "learning_rate": 0.00019994869960713397, + "loss": 2.2194, + "step": 380 + }, + { + "epoch": 0.040084166228300896, + "grad_norm": 0.7804626822471619, + "learning_rate": 0.0001999476023994112, + "loss": 2.0154, + "step": 381 + }, + { + "epoch": 0.04018937401367701, + "grad_norm": 0.863120973110199, + "learning_rate": 0.00019994649358510248, + "loss": 1.7918, + "step": 382 + }, + { + "epoch": 0.04029458179905313, + "grad_norm": 1.1610263586044312, + "learning_rate": 0.0001999453731643366, + "loss": 2.3425, + "step": 383 + }, + { + "epoch": 0.040399789584429247, + "grad_norm": 0.9565117359161377, + "learning_rate": 0.00019994424113724363, + "loss": 1.8774, + "step": 384 + }, + { + "epoch": 0.04050499736980537, + "grad_norm": 0.676775336265564, + "learning_rate": 0.00019994309750395506, + "loss": 2.066, + "step": 385 + }, + { + "epoch": 0.04061020515518148, + "grad_norm": 0.6846379637718201, + "learning_rate": 0.00019994194226460367, + "loss": 1.9599, + "step": 386 + }, + { + "epoch": 0.0407154129405576, + "grad_norm": 0.5663594603538513, + "learning_rate": 0.0001999407754193236, + "loss": 1.9791, + "step": 387 + }, + { + "epoch": 0.04082062072593372, + "grad_norm": 0.967096745967865, + "learning_rate": 0.0001999395969682504, + "loss": 2.1146, + "step": 388 + }, + { + "epoch": 0.04092582851130984, + "grad_norm": 0.7784907817840576, + "learning_rate": 0.00019993840691152093, + "loss": 2.0466, + "step": 389 + }, + { + "epoch": 0.041031036296685953, + "grad_norm": 0.7650101780891418, + "learning_rate": 0.0001999372052492733, + "loss": 2.2071, + "step": 390 + }, + { + "epoch": 0.041136244082062075, + "grad_norm": 0.6302483677864075, + "learning_rate": 0.00019993599198164715, + "loss": 2.0403, + "step": 391 + }, + { + "epoch": 0.04124145186743819, + "grad_norm": 0.6898879408836365, + "learning_rate": 0.00019993476710878332, + "loss": 1.7578, + "step": 392 + }, + { + "epoch": 0.04134665965281431, + "grad_norm": 1.1919569969177246, + "learning_rate": 0.00019993353063082404, + "loss": 1.9792, + "step": 393 + }, + { + "epoch": 0.041451867438190425, + "grad_norm": 0.6219474077224731, + "learning_rate": 0.00019993228254791293, + "loss": 2.4463, + "step": 394 + }, + { + "epoch": 0.041557075223566546, + "grad_norm": 0.9380045533180237, + "learning_rate": 0.00019993102286019495, + "loss": 1.8507, + "step": 395 + }, + { + "epoch": 0.04166228300894266, + "grad_norm": 1.5710420608520508, + "learning_rate": 0.0001999297515678163, + "loss": 2.1511, + "step": 396 + }, + { + "epoch": 0.04176749079431878, + "grad_norm": 0.6227583289146423, + "learning_rate": 0.00019992846867092473, + "loss": 2.0332, + "step": 397 + }, + { + "epoch": 0.041872698579694896, + "grad_norm": 0.7754364609718323, + "learning_rate": 0.0001999271741696691, + "loss": 1.6828, + "step": 398 + }, + { + "epoch": 0.04197790636507102, + "grad_norm": 1.0617812871932983, + "learning_rate": 0.0001999258680641998, + "loss": 2.0997, + "step": 399 + }, + { + "epoch": 0.04208311415044713, + "grad_norm": 0.8257030844688416, + "learning_rate": 0.00019992455035466847, + "loss": 1.948, + "step": 400 + }, + { + "epoch": 0.04218832193582325, + "grad_norm": 0.9446254968643188, + "learning_rate": 0.00019992322104122817, + "loss": 1.7266, + "step": 401 + }, + { + "epoch": 0.04229352972119937, + "grad_norm": 0.52854323387146, + "learning_rate": 0.00019992188012403324, + "loss": 2.2328, + "step": 402 + }, + { + "epoch": 0.04239873750657549, + "grad_norm": 0.9252466559410095, + "learning_rate": 0.00019992052760323941, + "loss": 2.1785, + "step": 403 + }, + { + "epoch": 0.0425039452919516, + "grad_norm": 0.7319321632385254, + "learning_rate": 0.00019991916347900378, + "loss": 2.2174, + "step": 404 + }, + { + "epoch": 0.042609153077327724, + "grad_norm": 0.5902657508850098, + "learning_rate": 0.00019991778775148465, + "loss": 2.1312, + "step": 405 + }, + { + "epoch": 0.04271436086270384, + "grad_norm": 0.8190617561340332, + "learning_rate": 0.0001999164004208419, + "loss": 1.658, + "step": 406 + }, + { + "epoch": 0.04281956864807996, + "grad_norm": 0.9919306635856628, + "learning_rate": 0.00019991500148723658, + "loss": 2.158, + "step": 407 + }, + { + "epoch": 0.042924776433456074, + "grad_norm": 1.2936168909072876, + "learning_rate": 0.00019991359095083112, + "loss": 1.8505, + "step": 408 + }, + { + "epoch": 0.043029984218832196, + "grad_norm": 0.947219729423523, + "learning_rate": 0.00019991216881178937, + "loss": 1.8942, + "step": 409 + }, + { + "epoch": 0.04313519200420831, + "grad_norm": 1.0280535221099854, + "learning_rate": 0.00019991073507027646, + "loss": 1.7948, + "step": 410 + }, + { + "epoch": 0.04324039978958443, + "grad_norm": 0.795024573802948, + "learning_rate": 0.00019990928972645887, + "loss": 1.9645, + "step": 411 + }, + { + "epoch": 0.043345607574960546, + "grad_norm": 0.9444951415061951, + "learning_rate": 0.00019990783278050448, + "loss": 2.0397, + "step": 412 + }, + { + "epoch": 0.04345081536033667, + "grad_norm": 0.7790425419807434, + "learning_rate": 0.00019990636423258246, + "loss": 2.0172, + "step": 413 + }, + { + "epoch": 0.04355602314571278, + "grad_norm": 0.8171620965003967, + "learning_rate": 0.00019990488408286333, + "loss": 1.9963, + "step": 414 + }, + { + "epoch": 0.0436612309310889, + "grad_norm": 1.3680206537246704, + "learning_rate": 0.000199903392331519, + "loss": 1.5671, + "step": 415 + }, + { + "epoch": 0.04376643871646502, + "grad_norm": 1.41587495803833, + "learning_rate": 0.00019990188897872266, + "loss": 2.2752, + "step": 416 + }, + { + "epoch": 0.04387164650184114, + "grad_norm": 0.888954222202301, + "learning_rate": 0.00019990037402464896, + "loss": 2.1677, + "step": 417 + }, + { + "epoch": 0.04397685428721725, + "grad_norm": 0.8410660028457642, + "learning_rate": 0.00019989884746947378, + "loss": 2.1372, + "step": 418 + }, + { + "epoch": 0.044082062072593374, + "grad_norm": 0.8259899616241455, + "learning_rate": 0.0001998973093133744, + "loss": 1.7659, + "step": 419 + }, + { + "epoch": 0.04418726985796949, + "grad_norm": 1.2934560775756836, + "learning_rate": 0.00019989575955652944, + "loss": 1.8465, + "step": 420 + }, + { + "epoch": 0.04429247764334561, + "grad_norm": 0.7665582299232483, + "learning_rate": 0.00019989419819911887, + "loss": 2.2067, + "step": 421 + }, + { + "epoch": 0.044397685428721724, + "grad_norm": 0.676923394203186, + "learning_rate": 0.000199892625241324, + "loss": 2.3273, + "step": 422 + }, + { + "epoch": 0.044502893214097845, + "grad_norm": 1.0820274353027344, + "learning_rate": 0.00019989104068332756, + "loss": 1.8123, + "step": 423 + }, + { + "epoch": 0.04460810099947396, + "grad_norm": 1.1282938718795776, + "learning_rate": 0.00019988944452531345, + "loss": 1.9473, + "step": 424 + }, + { + "epoch": 0.04471330878485008, + "grad_norm": 1.5150783061981201, + "learning_rate": 0.00019988783676746708, + "loss": 1.6027, + "step": 425 + }, + { + "epoch": 0.044818516570226195, + "grad_norm": 0.991240918636322, + "learning_rate": 0.00019988621740997512, + "loss": 1.9637, + "step": 426 + }, + { + "epoch": 0.04492372435560232, + "grad_norm": 0.8624051809310913, + "learning_rate": 0.00019988458645302568, + "loss": 1.9826, + "step": 427 + }, + { + "epoch": 0.04502893214097843, + "grad_norm": 0.9856376647949219, + "learning_rate": 0.00019988294389680812, + "loss": 2.3543, + "step": 428 + }, + { + "epoch": 0.04513413992635455, + "grad_norm": 0.5573300719261169, + "learning_rate": 0.0001998812897415132, + "loss": 1.955, + "step": 429 + }, + { + "epoch": 0.04523934771173067, + "grad_norm": 0.8506868481636047, + "learning_rate": 0.000199879623987333, + "loss": 2.0627, + "step": 430 + }, + { + "epoch": 0.04534455549710679, + "grad_norm": 1.4136837720870972, + "learning_rate": 0.00019987794663446095, + "loss": 2.0503, + "step": 431 + }, + { + "epoch": 0.0454497632824829, + "grad_norm": 0.9526495337486267, + "learning_rate": 0.0001998762576830919, + "loss": 1.7729, + "step": 432 + }, + { + "epoch": 0.045554971067859024, + "grad_norm": 0.8205630779266357, + "learning_rate": 0.00019987455713342187, + "loss": 2.3268, + "step": 433 + }, + { + "epoch": 0.04566017885323514, + "grad_norm": 0.9681246876716614, + "learning_rate": 0.0001998728449856484, + "loss": 1.862, + "step": 434 + }, + { + "epoch": 0.04576538663861126, + "grad_norm": 0.7441048622131348, + "learning_rate": 0.00019987112123997033, + "loss": 2.1119, + "step": 435 + }, + { + "epoch": 0.045870594423987374, + "grad_norm": 1.2735470533370972, + "learning_rate": 0.00019986938589658783, + "loss": 1.7813, + "step": 436 + }, + { + "epoch": 0.045975802209363495, + "grad_norm": 1.4802747964859009, + "learning_rate": 0.00019986763895570242, + "loss": 2.1473, + "step": 437 + }, + { + "epoch": 0.04608100999473961, + "grad_norm": 1.388631820678711, + "learning_rate": 0.0001998658804175169, + "loss": 1.8662, + "step": 438 + }, + { + "epoch": 0.04618621778011573, + "grad_norm": 0.5619102716445923, + "learning_rate": 0.00019986411028223558, + "loss": 2.0551, + "step": 439 + }, + { + "epoch": 0.046291425565491845, + "grad_norm": 0.7170664668083191, + "learning_rate": 0.000199862328550064, + "loss": 2.0751, + "step": 440 + }, + { + "epoch": 0.046396633350867966, + "grad_norm": 0.7204136252403259, + "learning_rate": 0.000199860535221209, + "loss": 2.0988, + "step": 441 + }, + { + "epoch": 0.04650184113624408, + "grad_norm": 1.1917775869369507, + "learning_rate": 0.0001998587302958789, + "loss": 1.8188, + "step": 442 + }, + { + "epoch": 0.0466070489216202, + "grad_norm": 0.861599862575531, + "learning_rate": 0.00019985691377428326, + "loss": 2.3283, + "step": 443 + }, + { + "epoch": 0.046712256706996316, + "grad_norm": 0.7343453168869019, + "learning_rate": 0.00019985508565663305, + "loss": 2.1423, + "step": 444 + }, + { + "epoch": 0.04681746449237244, + "grad_norm": 1.0733033418655396, + "learning_rate": 0.00019985324594314055, + "loss": 1.814, + "step": 445 + }, + { + "epoch": 0.04692267227774855, + "grad_norm": 0.7048352956771851, + "learning_rate": 0.00019985139463401944, + "loss": 2.2652, + "step": 446 + }, + { + "epoch": 0.04702788006312467, + "grad_norm": 0.9343500733375549, + "learning_rate": 0.00019984953172948465, + "loss": 2.1747, + "step": 447 + }, + { + "epoch": 0.04713308784850079, + "grad_norm": 0.9725675582885742, + "learning_rate": 0.00019984765722975254, + "loss": 2.0506, + "step": 448 + }, + { + "epoch": 0.04723829563387691, + "grad_norm": 1.3480443954467773, + "learning_rate": 0.00019984577113504076, + "loss": 2.4285, + "step": 449 + }, + { + "epoch": 0.04734350341925302, + "grad_norm": 1.5416226387023926, + "learning_rate": 0.0001998438734455684, + "loss": 1.7112, + "step": 450 + }, + { + "epoch": 0.047448711204629145, + "grad_norm": 0.7121959328651428, + "learning_rate": 0.0001998419641615558, + "loss": 1.9964, + "step": 451 + }, + { + "epoch": 0.04755391899000526, + "grad_norm": 0.905596911907196, + "learning_rate": 0.00019984004328322464, + "loss": 2.351, + "step": 452 + }, + { + "epoch": 0.04765912677538138, + "grad_norm": 0.9009513258934021, + "learning_rate": 0.00019983811081079807, + "loss": 2.0917, + "step": 453 + }, + { + "epoch": 0.047764334560757495, + "grad_norm": 1.4585151672363281, + "learning_rate": 0.0001998361667445004, + "loss": 1.8901, + "step": 454 + }, + { + "epoch": 0.047869542346133616, + "grad_norm": 1.1657588481903076, + "learning_rate": 0.00019983421108455746, + "loss": 1.8957, + "step": 455 + }, + { + "epoch": 0.04797475013150973, + "grad_norm": 0.7370551824569702, + "learning_rate": 0.00019983224383119633, + "loss": 1.9769, + "step": 456 + }, + { + "epoch": 0.04807995791688585, + "grad_norm": 0.9683654308319092, + "learning_rate": 0.00019983026498464546, + "loss": 1.6403, + "step": 457 + }, + { + "epoch": 0.048185165702261966, + "grad_norm": 0.7799041867256165, + "learning_rate": 0.00019982827454513466, + "loss": 1.9753, + "step": 458 + }, + { + "epoch": 0.04829037348763809, + "grad_norm": 1.0562982559204102, + "learning_rate": 0.00019982627251289504, + "loss": 2.2229, + "step": 459 + }, + { + "epoch": 0.0483955812730142, + "grad_norm": 0.7870259881019592, + "learning_rate": 0.00019982425888815915, + "loss": 2.025, + "step": 460 + }, + { + "epoch": 0.04850078905839032, + "grad_norm": 1.191759467124939, + "learning_rate": 0.00019982223367116076, + "loss": 2.3058, + "step": 461 + }, + { + "epoch": 0.04860599684376644, + "grad_norm": 1.1136091947555542, + "learning_rate": 0.0001998201968621351, + "loss": 1.824, + "step": 462 + }, + { + "epoch": 0.04871120462914256, + "grad_norm": 0.8942115306854248, + "learning_rate": 0.00019981814846131867, + "loss": 2.1555, + "step": 463 + }, + { + "epoch": 0.04881641241451867, + "grad_norm": 1.0691640377044678, + "learning_rate": 0.00019981608846894933, + "loss": 1.9872, + "step": 464 + }, + { + "epoch": 0.048921620199894794, + "grad_norm": 1.0228484869003296, + "learning_rate": 0.00019981401688526636, + "loss": 2.1705, + "step": 465 + }, + { + "epoch": 0.04902682798527091, + "grad_norm": 0.9551164507865906, + "learning_rate": 0.00019981193371051026, + "loss": 1.9308, + "step": 466 + }, + { + "epoch": 0.04913203577064703, + "grad_norm": 1.2936720848083496, + "learning_rate": 0.000199809838944923, + "loss": 2.2471, + "step": 467 + }, + { + "epoch": 0.049237243556023144, + "grad_norm": 0.9659333229064941, + "learning_rate": 0.00019980773258874778, + "loss": 1.9027, + "step": 468 + }, + { + "epoch": 0.049342451341399265, + "grad_norm": 0.814227819442749, + "learning_rate": 0.00019980561464222926, + "loss": 1.8804, + "step": 469 + }, + { + "epoch": 0.04944765912677538, + "grad_norm": 1.2761330604553223, + "learning_rate": 0.00019980348510561334, + "loss": 2.2427, + "step": 470 + }, + { + "epoch": 0.0495528669121515, + "grad_norm": 1.2825102806091309, + "learning_rate": 0.00019980134397914735, + "loss": 1.4184, + "step": 471 + }, + { + "epoch": 0.049658074697527615, + "grad_norm": 1.0994774103164673, + "learning_rate": 0.00019979919126307993, + "loss": 2.3454, + "step": 472 + }, + { + "epoch": 0.04976328248290374, + "grad_norm": 1.1263169050216675, + "learning_rate": 0.00019979702695766105, + "loss": 2.1226, + "step": 473 + }, + { + "epoch": 0.04986849026827985, + "grad_norm": 1.1242156028747559, + "learning_rate": 0.00019979485106314207, + "loss": 1.7008, + "step": 474 + }, + { + "epoch": 0.04997369805365597, + "grad_norm": 0.7600993514060974, + "learning_rate": 0.00019979266357977564, + "loss": 1.9561, + "step": 475 + }, + { + "epoch": 0.05007890583903209, + "grad_norm": 1.0761032104492188, + "learning_rate": 0.00019979046450781577, + "loss": 2.1057, + "step": 476 + }, + { + "epoch": 0.05018411362440821, + "grad_norm": 0.9772080779075623, + "learning_rate": 0.00019978825384751788, + "loss": 2.1573, + "step": 477 + }, + { + "epoch": 0.05028932140978432, + "grad_norm": 0.6181446313858032, + "learning_rate": 0.0001997860315991387, + "loss": 2.1602, + "step": 478 + }, + { + "epoch": 0.050394529195160444, + "grad_norm": 1.589167594909668, + "learning_rate": 0.0001997837977629362, + "loss": 2.1132, + "step": 479 + }, + { + "epoch": 0.05049973698053656, + "grad_norm": 0.6622167229652405, + "learning_rate": 0.0001997815523391699, + "loss": 2.331, + "step": 480 + }, + { + "epoch": 0.05060494476591268, + "grad_norm": 1.0673385858535767, + "learning_rate": 0.00019977929532810046, + "loss": 2.387, + "step": 481 + }, + { + "epoch": 0.050710152551288794, + "grad_norm": 0.897954523563385, + "learning_rate": 0.00019977702672999007, + "loss": 2.0587, + "step": 482 + }, + { + "epoch": 0.050815360336664915, + "grad_norm": 1.2834991216659546, + "learning_rate": 0.00019977474654510205, + "loss": 2.2961, + "step": 483 + }, + { + "epoch": 0.05092056812204103, + "grad_norm": 0.9525724649429321, + "learning_rate": 0.0001997724547737013, + "loss": 2.1939, + "step": 484 + }, + { + "epoch": 0.05102577590741715, + "grad_norm": 1.231590747833252, + "learning_rate": 0.00019977015141605392, + "loss": 2.0266, + "step": 485 + }, + { + "epoch": 0.051130983692793265, + "grad_norm": 2.677229642868042, + "learning_rate": 0.0001997678364724274, + "loss": 1.3896, + "step": 486 + }, + { + "epoch": 0.051236191478169386, + "grad_norm": 0.6867842674255371, + "learning_rate": 0.00019976550994309054, + "loss": 1.9468, + "step": 487 + }, + { + "epoch": 0.0513413992635455, + "grad_norm": 0.6714895963668823, + "learning_rate": 0.00019976317182831356, + "loss": 1.925, + "step": 488 + }, + { + "epoch": 0.05144660704892162, + "grad_norm": 1.246391773223877, + "learning_rate": 0.00019976082212836793, + "loss": 2.1784, + "step": 489 + }, + { + "epoch": 0.051551814834297736, + "grad_norm": 1.2437677383422852, + "learning_rate": 0.00019975846084352653, + "loss": 2.0721, + "step": 490 + }, + { + "epoch": 0.05165702261967386, + "grad_norm": 0.6305761933326721, + "learning_rate": 0.00019975608797406357, + "loss": 2.1311, + "step": 491 + }, + { + "epoch": 0.05176223040504997, + "grad_norm": 1.3557363748550415, + "learning_rate": 0.0001997537035202546, + "loss": 2.0776, + "step": 492 + }, + { + "epoch": 0.05186743819042609, + "grad_norm": 1.1781286001205444, + "learning_rate": 0.00019975130748237655, + "loss": 1.7604, + "step": 493 + }, + { + "epoch": 0.05197264597580221, + "grad_norm": 1.2240660190582275, + "learning_rate": 0.0001997488998607076, + "loss": 1.9454, + "step": 494 + }, + { + "epoch": 0.05207785376117833, + "grad_norm": 0.7471686601638794, + "learning_rate": 0.00019974648065552736, + "loss": 2.2171, + "step": 495 + }, + { + "epoch": 0.05218306154655444, + "grad_norm": 0.757163405418396, + "learning_rate": 0.0001997440498671168, + "loss": 2.1052, + "step": 496 + }, + { + "epoch": 0.052288269331930565, + "grad_norm": 0.6472220420837402, + "learning_rate": 0.00019974160749575818, + "loss": 1.8683, + "step": 497 + }, + { + "epoch": 0.05239347711730668, + "grad_norm": 1.0862151384353638, + "learning_rate": 0.00019973915354173515, + "loss": 1.9728, + "step": 498 + }, + { + "epoch": 0.0524986849026828, + "grad_norm": 0.7950903177261353, + "learning_rate": 0.00019973668800533264, + "loss": 1.7395, + "step": 499 + }, + { + "epoch": 0.052603892688058915, + "grad_norm": 0.8745871782302856, + "learning_rate": 0.00019973421088683696, + "loss": 1.7054, + "step": 500 + }, + { + "epoch": 0.052709100473435036, + "grad_norm": 0.8229191899299622, + "learning_rate": 0.00019973172218653578, + "loss": 1.9457, + "step": 501 + }, + { + "epoch": 0.05281430825881115, + "grad_norm": 1.065964698791504, + "learning_rate": 0.00019972922190471812, + "loss": 2.1223, + "step": 502 + }, + { + "epoch": 0.05291951604418727, + "grad_norm": 1.3047000169754028, + "learning_rate": 0.00019972671004167433, + "loss": 2.145, + "step": 503 + }, + { + "epoch": 0.053024723829563386, + "grad_norm": 0.7635353803634644, + "learning_rate": 0.00019972418659769606, + "loss": 2.4166, + "step": 504 + }, + { + "epoch": 0.05312993161493951, + "grad_norm": 0.7105801701545715, + "learning_rate": 0.00019972165157307643, + "loss": 2.3084, + "step": 505 + }, + { + "epoch": 0.05323513940031562, + "grad_norm": 1.485958218574524, + "learning_rate": 0.00019971910496810976, + "loss": 2.1874, + "step": 506 + }, + { + "epoch": 0.05334034718569174, + "grad_norm": 1.4032565355300903, + "learning_rate": 0.0001997165467830918, + "loss": 1.6684, + "step": 507 + }, + { + "epoch": 0.05344555497106786, + "grad_norm": 0.8490357398986816, + "learning_rate": 0.00019971397701831962, + "loss": 2.2013, + "step": 508 + }, + { + "epoch": 0.05355076275644398, + "grad_norm": 1.023547649383545, + "learning_rate": 0.00019971139567409165, + "loss": 1.885, + "step": 509 + }, + { + "epoch": 0.05365597054182009, + "grad_norm": 1.5144532918930054, + "learning_rate": 0.00019970880275070762, + "loss": 1.6336, + "step": 510 + }, + { + "epoch": 0.053761178327196214, + "grad_norm": 0.5086830258369446, + "learning_rate": 0.00019970619824846866, + "loss": 2.2339, + "step": 511 + }, + { + "epoch": 0.05386638611257233, + "grad_norm": 0.7949154376983643, + "learning_rate": 0.00019970358216767723, + "loss": 1.8825, + "step": 512 + }, + { + "epoch": 0.05397159389794845, + "grad_norm": 0.9173998832702637, + "learning_rate": 0.00019970095450863714, + "loss": 1.5642, + "step": 513 + }, + { + "epoch": 0.054076801683324564, + "grad_norm": 1.7043392658233643, + "learning_rate": 0.00019969831527165348, + "loss": 1.8851, + "step": 514 + }, + { + "epoch": 0.054182009468700686, + "grad_norm": 0.7839592099189758, + "learning_rate": 0.00019969566445703278, + "loss": 2.0123, + "step": 515 + }, + { + "epoch": 0.0542872172540768, + "grad_norm": 0.8127647638320923, + "learning_rate": 0.00019969300206508286, + "loss": 2.2239, + "step": 516 + }, + { + "epoch": 0.05439242503945292, + "grad_norm": 0.8179565668106079, + "learning_rate": 0.00019969032809611287, + "loss": 2.2176, + "step": 517 + }, + { + "epoch": 0.054497632824829036, + "grad_norm": 0.7557021975517273, + "learning_rate": 0.0001996876425504334, + "loss": 2.0041, + "step": 518 + }, + { + "epoch": 0.05460284061020516, + "grad_norm": 0.9029064178466797, + "learning_rate": 0.0001996849454283562, + "loss": 2.214, + "step": 519 + }, + { + "epoch": 0.05470804839558127, + "grad_norm": 0.778174638748169, + "learning_rate": 0.0001996822367301946, + "loss": 2.3064, + "step": 520 + }, + { + "epoch": 0.05481325618095739, + "grad_norm": 0.7716636061668396, + "learning_rate": 0.00019967951645626306, + "loss": 2.2683, + "step": 521 + }, + { + "epoch": 0.05491846396633351, + "grad_norm": 0.7747607231140137, + "learning_rate": 0.00019967678460687752, + "loss": 2.257, + "step": 522 + }, + { + "epoch": 0.05502367175170963, + "grad_norm": 1.1563743352890015, + "learning_rate": 0.00019967404118235521, + "loss": 1.6323, + "step": 523 + }, + { + "epoch": 0.05512887953708574, + "grad_norm": 0.7931325435638428, + "learning_rate": 0.0001996712861830147, + "loss": 1.4886, + "step": 524 + }, + { + "epoch": 0.055234087322461864, + "grad_norm": 1.187528133392334, + "learning_rate": 0.00019966851960917596, + "loss": 2.2017, + "step": 525 + }, + { + "epoch": 0.05533929510783798, + "grad_norm": 0.762750506401062, + "learning_rate": 0.00019966574146116023, + "loss": 1.953, + "step": 526 + }, + { + "epoch": 0.0554445028932141, + "grad_norm": 0.7893527150154114, + "learning_rate": 0.00019966295173929016, + "loss": 2.2539, + "step": 527 + }, + { + "epoch": 0.055549710678590214, + "grad_norm": 1.012707233428955, + "learning_rate": 0.00019966015044388966, + "loss": 2.0272, + "step": 528 + }, + { + "epoch": 0.055654918463966335, + "grad_norm": 0.7175372838973999, + "learning_rate": 0.00019965733757528405, + "loss": 2.2922, + "step": 529 + }, + { + "epoch": 0.05576012624934245, + "grad_norm": 1.1108931303024292, + "learning_rate": 0.0001996545131338, + "loss": 2.0014, + "step": 530 + }, + { + "epoch": 0.05586533403471857, + "grad_norm": 1.002982258796692, + "learning_rate": 0.00019965167711976552, + "loss": 1.9751, + "step": 531 + }, + { + "epoch": 0.055970541820094685, + "grad_norm": 1.2420021295547485, + "learning_rate": 0.00019964882953350989, + "loss": 1.8003, + "step": 532 + }, + { + "epoch": 0.05607574960547081, + "grad_norm": 0.8945255875587463, + "learning_rate": 0.00019964597037536383, + "loss": 2.0404, + "step": 533 + }, + { + "epoch": 0.05618095739084692, + "grad_norm": 0.8845522999763489, + "learning_rate": 0.00019964309964565937, + "loss": 1.8745, + "step": 534 + }, + { + "epoch": 0.05628616517622304, + "grad_norm": 0.826899528503418, + "learning_rate": 0.00019964021734472987, + "loss": 1.8893, + "step": 535 + }, + { + "epoch": 0.05639137296159916, + "grad_norm": 0.8589649200439453, + "learning_rate": 0.00019963732347291, + "loss": 2.2759, + "step": 536 + }, + { + "epoch": 0.05649658074697528, + "grad_norm": 0.9177697896957397, + "learning_rate": 0.00019963441803053588, + "loss": 1.9493, + "step": 537 + }, + { + "epoch": 0.05660178853235139, + "grad_norm": 1.1781582832336426, + "learning_rate": 0.0001996315010179449, + "loss": 1.8959, + "step": 538 + }, + { + "epoch": 0.056706996317727514, + "grad_norm": 1.119227647781372, + "learning_rate": 0.00019962857243547574, + "loss": 2.3161, + "step": 539 + }, + { + "epoch": 0.05681220410310363, + "grad_norm": 0.8425557017326355, + "learning_rate": 0.00019962563228346857, + "loss": 2.2769, + "step": 540 + }, + { + "epoch": 0.05691741188847975, + "grad_norm": 1.2644481658935547, + "learning_rate": 0.0001996226805622648, + "loss": 1.9871, + "step": 541 + }, + { + "epoch": 0.057022619673855864, + "grad_norm": 0.9367343187332153, + "learning_rate": 0.00019961971727220715, + "loss": 2.4598, + "step": 542 + }, + { + "epoch": 0.057127827459231985, + "grad_norm": 0.9345009326934814, + "learning_rate": 0.00019961674241363974, + "loss": 2.0872, + "step": 543 + }, + { + "epoch": 0.0572330352446081, + "grad_norm": 1.1807975769042969, + "learning_rate": 0.00019961375598690813, + "loss": 1.8824, + "step": 544 + }, + { + "epoch": 0.05733824302998422, + "grad_norm": 0.9418180584907532, + "learning_rate": 0.00019961075799235903, + "loss": 2.2195, + "step": 545 + }, + { + "epoch": 0.057443450815360335, + "grad_norm": 1.003548502922058, + "learning_rate": 0.0001996077484303406, + "loss": 1.5655, + "step": 546 + }, + { + "epoch": 0.057548658600736456, + "grad_norm": 0.9430425763130188, + "learning_rate": 0.00019960472730120237, + "loss": 1.9868, + "step": 547 + }, + { + "epoch": 0.05765386638611257, + "grad_norm": 0.7052996754646301, + "learning_rate": 0.00019960169460529515, + "loss": 2.1614, + "step": 548 + }, + { + "epoch": 0.05775907417148869, + "grad_norm": 0.9624035954475403, + "learning_rate": 0.0001995986503429711, + "loss": 1.8665, + "step": 549 + }, + { + "epoch": 0.057864281956864806, + "grad_norm": 0.8051895499229431, + "learning_rate": 0.00019959559451458375, + "loss": 2.1922, + "step": 550 + }, + { + "epoch": 0.05796948974224093, + "grad_norm": 5.186445713043213, + "learning_rate": 0.000199592527120488, + "loss": 2.3402, + "step": 551 + }, + { + "epoch": 0.05807469752761704, + "grad_norm": 0.7552983164787292, + "learning_rate": 0.00019958944816104, + "loss": 1.9901, + "step": 552 + }, + { + "epoch": 0.05817990531299316, + "grad_norm": 1.3142822980880737, + "learning_rate": 0.0001995863576365973, + "loss": 2.0759, + "step": 553 + }, + { + "epoch": 0.05828511309836928, + "grad_norm": 0.6937042474746704, + "learning_rate": 0.00019958325554751886, + "loss": 2.1585, + "step": 554 + }, + { + "epoch": 0.0583903208837454, + "grad_norm": 0.8620643615722656, + "learning_rate": 0.00019958014189416489, + "loss": 2.2186, + "step": 555 + }, + { + "epoch": 0.05849552866912151, + "grad_norm": 0.8564947843551636, + "learning_rate": 0.00019957701667689691, + "loss": 2.0109, + "step": 556 + }, + { + "epoch": 0.058600736454497634, + "grad_norm": 0.6378996968269348, + "learning_rate": 0.0001995738798960779, + "loss": 2.0941, + "step": 557 + }, + { + "epoch": 0.05870594423987375, + "grad_norm": 0.6846954822540283, + "learning_rate": 0.0001995707315520721, + "loss": 2.0447, + "step": 558 + }, + { + "epoch": 0.05881115202524987, + "grad_norm": 1.0269975662231445, + "learning_rate": 0.00019956757164524516, + "loss": 1.7775, + "step": 559 + }, + { + "epoch": 0.058916359810625984, + "grad_norm": 0.8012015223503113, + "learning_rate": 0.00019956440017596393, + "loss": 2.0707, + "step": 560 + }, + { + "epoch": 0.059021567596002106, + "grad_norm": 1.592552900314331, + "learning_rate": 0.0001995612171445968, + "loss": 1.9245, + "step": 561 + }, + { + "epoch": 0.05912677538137822, + "grad_norm": 0.7692115306854248, + "learning_rate": 0.00019955802255151338, + "loss": 2.3235, + "step": 562 + }, + { + "epoch": 0.05923198316675434, + "grad_norm": 0.8409086465835571, + "learning_rate": 0.00019955481639708463, + "loss": 2.0308, + "step": 563 + }, + { + "epoch": 0.059337190952130456, + "grad_norm": 0.7190937399864197, + "learning_rate": 0.00019955159868168288, + "loss": 2.2148, + "step": 564 + }, + { + "epoch": 0.05944239873750658, + "grad_norm": 1.3858685493469238, + "learning_rate": 0.00019954836940568177, + "loss": 2.1553, + "step": 565 + }, + { + "epoch": 0.05954760652288269, + "grad_norm": 0.7564934492111206, + "learning_rate": 0.00019954512856945632, + "loss": 1.713, + "step": 566 + }, + { + "epoch": 0.05965281430825881, + "grad_norm": 1.1952269077301025, + "learning_rate": 0.00019954187617338294, + "loss": 2.1569, + "step": 567 + }, + { + "epoch": 0.05975802209363493, + "grad_norm": 1.282727837562561, + "learning_rate": 0.00019953861221783922, + "loss": 2.0503, + "step": 568 + }, + { + "epoch": 0.05986322987901105, + "grad_norm": 0.8127844333648682, + "learning_rate": 0.00019953533670320422, + "loss": 1.7488, + "step": 569 + }, + { + "epoch": 0.05996843766438716, + "grad_norm": 1.0792090892791748, + "learning_rate": 0.00019953204962985837, + "loss": 1.9068, + "step": 570 + }, + { + "epoch": 0.060073645449763284, + "grad_norm": 0.9024432897567749, + "learning_rate": 0.00019952875099818332, + "loss": 2.2592, + "step": 571 + }, + { + "epoch": 0.0601788532351394, + "grad_norm": 1.1777130365371704, + "learning_rate": 0.0001995254408085622, + "loss": 2.4742, + "step": 572 + }, + { + "epoch": 0.06028406102051552, + "grad_norm": 0.7864671945571899, + "learning_rate": 0.00019952211906137932, + "loss": 2.0437, + "step": 573 + }, + { + "epoch": 0.060389268805891634, + "grad_norm": 0.9623721837997437, + "learning_rate": 0.00019951878575702047, + "loss": 2.3201, + "step": 574 + }, + { + "epoch": 0.060494476591267755, + "grad_norm": 0.7268714904785156, + "learning_rate": 0.00019951544089587278, + "loss": 1.9149, + "step": 575 + }, + { + "epoch": 0.06059968437664387, + "grad_norm": 1.013013243675232, + "learning_rate": 0.00019951208447832461, + "loss": 1.7478, + "step": 576 + }, + { + "epoch": 0.06070489216201999, + "grad_norm": 0.6717780232429504, + "learning_rate": 0.00019950871650476577, + "loss": 2.208, + "step": 577 + }, + { + "epoch": 0.060810099947396105, + "grad_norm": 2.0669307708740234, + "learning_rate": 0.00019950533697558732, + "loss": 2.185, + "step": 578 + }, + { + "epoch": 0.06091530773277223, + "grad_norm": 1.002172827720642, + "learning_rate": 0.0001995019458911818, + "loss": 2.0923, + "step": 579 + }, + { + "epoch": 0.06102051551814834, + "grad_norm": 1.053417444229126, + "learning_rate": 0.00019949854325194294, + "loss": 2.0336, + "step": 580 + }, + { + "epoch": 0.06112572330352446, + "grad_norm": 0.8169642686843872, + "learning_rate": 0.0001994951290582659, + "loss": 2.1974, + "step": 581 + }, + { + "epoch": 0.06123093108890058, + "grad_norm": 0.8666344881057739, + "learning_rate": 0.0001994917033105471, + "loss": 2.1284, + "step": 582 + }, + { + "epoch": 0.0613361388742767, + "grad_norm": 0.8064733743667603, + "learning_rate": 0.00019948826600918443, + "loss": 2.0592, + "step": 583 + }, + { + "epoch": 0.06144134665965281, + "grad_norm": 1.0283541679382324, + "learning_rate": 0.00019948481715457707, + "loss": 2.2653, + "step": 584 + }, + { + "epoch": 0.061546554445028934, + "grad_norm": 1.1109108924865723, + "learning_rate": 0.00019948135674712546, + "loss": 2.0874, + "step": 585 + }, + { + "epoch": 0.06165176223040505, + "grad_norm": 0.8930381536483765, + "learning_rate": 0.00019947788478723153, + "loss": 1.988, + "step": 586 + }, + { + "epoch": 0.06175697001578117, + "grad_norm": 0.8919090032577515, + "learning_rate": 0.00019947440127529836, + "loss": 2.3961, + "step": 587 + }, + { + "epoch": 0.061862177801157284, + "grad_norm": 0.9803879857063293, + "learning_rate": 0.00019947090621173053, + "loss": 2.2986, + "step": 588 + }, + { + "epoch": 0.061967385586533405, + "grad_norm": 2.3181352615356445, + "learning_rate": 0.00019946739959693393, + "loss": 2.3345, + "step": 589 + }, + { + "epoch": 0.06207259337190952, + "grad_norm": 1.0249263048171997, + "learning_rate": 0.00019946388143131575, + "loss": 1.9578, + "step": 590 + }, + { + "epoch": 0.06217780115728564, + "grad_norm": 1.8943885564804077, + "learning_rate": 0.00019946035171528455, + "loss": 1.5106, + "step": 591 + }, + { + "epoch": 0.062283008942661755, + "grad_norm": 1.0123568773269653, + "learning_rate": 0.0001994568104492502, + "loss": 2.1627, + "step": 592 + }, + { + "epoch": 0.062388216728037876, + "grad_norm": 0.9259027242660522, + "learning_rate": 0.00019945325763362398, + "loss": 1.6539, + "step": 593 + }, + { + "epoch": 0.06249342451341399, + "grad_norm": 0.9884849786758423, + "learning_rate": 0.00019944969326881845, + "loss": 2.0843, + "step": 594 + }, + { + "epoch": 0.0625986322987901, + "grad_norm": 0.735097348690033, + "learning_rate": 0.0001994461173552475, + "loss": 2.4694, + "step": 595 + }, + { + "epoch": 0.06270384008416623, + "grad_norm": 1.136003017425537, + "learning_rate": 0.0001994425298933264, + "loss": 1.7615, + "step": 596 + }, + { + "epoch": 0.06280904786954235, + "grad_norm": 1.316763162612915, + "learning_rate": 0.00019943893088347178, + "loss": 2.3421, + "step": 597 + }, + { + "epoch": 0.06291425565491847, + "grad_norm": 1.165870189666748, + "learning_rate": 0.00019943532032610156, + "loss": 2.354, + "step": 598 + }, + { + "epoch": 0.06301946344029458, + "grad_norm": 12.435591697692871, + "learning_rate": 0.00019943169822163502, + "loss": 2.0802, + "step": 599 + }, + { + "epoch": 0.0631246712256707, + "grad_norm": 1.253097414970398, + "learning_rate": 0.00019942806457049278, + "loss": 2.0145, + "step": 600 + }, + { + "epoch": 0.06322987901104682, + "grad_norm": 1.299344539642334, + "learning_rate": 0.00019942441937309684, + "loss": 1.7266, + "step": 601 + }, + { + "epoch": 0.06333508679642294, + "grad_norm": 0.9620880484580994, + "learning_rate": 0.00019942076262987043, + "loss": 2.2549, + "step": 602 + }, + { + "epoch": 0.06344029458179905, + "grad_norm": 1.1274274587631226, + "learning_rate": 0.00019941709434123826, + "loss": 1.9165, + "step": 603 + }, + { + "epoch": 0.06354550236717517, + "grad_norm": 0.8177148103713989, + "learning_rate": 0.00019941341450762629, + "loss": 2.0151, + "step": 604 + }, + { + "epoch": 0.06365071015255129, + "grad_norm": 1.068963885307312, + "learning_rate": 0.00019940972312946186, + "loss": 2.1957, + "step": 605 + }, + { + "epoch": 0.06375591793792741, + "grad_norm": 0.8297029733657837, + "learning_rate": 0.00019940602020717364, + "loss": 2.2105, + "step": 606 + }, + { + "epoch": 0.06386112572330352, + "grad_norm": 0.8417388796806335, + "learning_rate": 0.00019940230574119164, + "loss": 1.9136, + "step": 607 + }, + { + "epoch": 0.06396633350867964, + "grad_norm": 0.8712098598480225, + "learning_rate": 0.00019939857973194717, + "loss": 2.4713, + "step": 608 + }, + { + "epoch": 0.06407154129405576, + "grad_norm": 1.0298516750335693, + "learning_rate": 0.000199394842179873, + "loss": 1.8507, + "step": 609 + }, + { + "epoch": 0.06417674907943188, + "grad_norm": 0.9027281999588013, + "learning_rate": 0.00019939109308540304, + "loss": 2.0152, + "step": 610 + }, + { + "epoch": 0.06428195686480799, + "grad_norm": 1.213963270187378, + "learning_rate": 0.00019938733244897274, + "loss": 1.9294, + "step": 611 + }, + { + "epoch": 0.06438716465018411, + "grad_norm": 1.2193472385406494, + "learning_rate": 0.00019938356027101884, + "loss": 1.9002, + "step": 612 + }, + { + "epoch": 0.06449237243556023, + "grad_norm": 1.7133346796035767, + "learning_rate": 0.0001993797765519793, + "loss": 2.3218, + "step": 613 + }, + { + "epoch": 0.06459758022093635, + "grad_norm": 1.3086202144622803, + "learning_rate": 0.0001993759812922936, + "loss": 2.2043, + "step": 614 + }, + { + "epoch": 0.06470278800631246, + "grad_norm": 1.2867093086242676, + "learning_rate": 0.0001993721744924024, + "loss": 2.1243, + "step": 615 + }, + { + "epoch": 0.06480799579168858, + "grad_norm": 1.0671265125274658, + "learning_rate": 0.00019936835615274782, + "loss": 2.1034, + "step": 616 + }, + { + "epoch": 0.0649132035770647, + "grad_norm": 0.6220844984054565, + "learning_rate": 0.00019936452627377323, + "loss": 2.1934, + "step": 617 + }, + { + "epoch": 0.06501841136244083, + "grad_norm": 0.9203489422798157, + "learning_rate": 0.0001993606848559234, + "loss": 2.1497, + "step": 618 + }, + { + "epoch": 0.06512361914781693, + "grad_norm": 1.5812958478927612, + "learning_rate": 0.00019935683189964447, + "loss": 2.0074, + "step": 619 + }, + { + "epoch": 0.06522882693319305, + "grad_norm": 0.9502617120742798, + "learning_rate": 0.00019935296740538377, + "loss": 2.3318, + "step": 620 + }, + { + "epoch": 0.06533403471856918, + "grad_norm": 1.632648229598999, + "learning_rate": 0.00019934909137359018, + "loss": 1.8631, + "step": 621 + }, + { + "epoch": 0.0654392425039453, + "grad_norm": 1.4153451919555664, + "learning_rate": 0.00019934520380471372, + "loss": 1.6571, + "step": 622 + }, + { + "epoch": 0.0655444502893214, + "grad_norm": 0.8237829804420471, + "learning_rate": 0.00019934130469920588, + "loss": 1.9875, + "step": 623 + }, + { + "epoch": 0.06564965807469753, + "grad_norm": 0.8410417437553406, + "learning_rate": 0.00019933739405751945, + "loss": 2.0338, + "step": 624 + }, + { + "epoch": 0.06575486586007365, + "grad_norm": 0.9069095253944397, + "learning_rate": 0.00019933347188010858, + "loss": 2.1402, + "step": 625 + }, + { + "epoch": 0.06586007364544977, + "grad_norm": 1.0030279159545898, + "learning_rate": 0.0001993295381674287, + "loss": 2.1201, + "step": 626 + }, + { + "epoch": 0.06596528143082588, + "grad_norm": 1.0119670629501343, + "learning_rate": 0.00019932559291993665, + "loss": 1.9571, + "step": 627 + }, + { + "epoch": 0.066070489216202, + "grad_norm": 1.5967551469802856, + "learning_rate": 0.00019932163613809055, + "loss": 1.943, + "step": 628 + }, + { + "epoch": 0.06617569700157812, + "grad_norm": 0.7849034070968628, + "learning_rate": 0.0001993176678223499, + "loss": 2.3237, + "step": 629 + }, + { + "epoch": 0.06628090478695424, + "grad_norm": 0.9208400249481201, + "learning_rate": 0.00019931368797317553, + "loss": 2.4865, + "step": 630 + }, + { + "epoch": 0.06638611257233035, + "grad_norm": 1.232565999031067, + "learning_rate": 0.00019930969659102962, + "loss": 2.2367, + "step": 631 + }, + { + "epoch": 0.06649132035770647, + "grad_norm": 0.7089071273803711, + "learning_rate": 0.0001993056936763757, + "loss": 2.3407, + "step": 632 + }, + { + "epoch": 0.06659652814308259, + "grad_norm": 0.9543630480766296, + "learning_rate": 0.00019930167922967853, + "loss": 1.8678, + "step": 633 + }, + { + "epoch": 0.06670173592845871, + "grad_norm": 1.19600510597229, + "learning_rate": 0.00019929765325140436, + "loss": 1.9537, + "step": 634 + }, + { + "epoch": 0.06680694371383482, + "grad_norm": 0.7090086936950684, + "learning_rate": 0.0001992936157420207, + "loss": 1.946, + "step": 635 + }, + { + "epoch": 0.06691215149921094, + "grad_norm": 55.697574615478516, + "learning_rate": 0.0001992895667019964, + "loss": 2.0818, + "step": 636 + }, + { + "epoch": 0.06701735928458706, + "grad_norm": 0.9292970299720764, + "learning_rate": 0.00019928550613180164, + "loss": 2.256, + "step": 637 + }, + { + "epoch": 0.06712256706996318, + "grad_norm": 1.2075432538986206, + "learning_rate": 0.00019928143403190806, + "loss": 1.8407, + "step": 638 + }, + { + "epoch": 0.06722777485533929, + "grad_norm": 0.8975555300712585, + "learning_rate": 0.00019927735040278842, + "loss": 2.1384, + "step": 639 + }, + { + "epoch": 0.06733298264071541, + "grad_norm": 0.9104759097099304, + "learning_rate": 0.00019927325524491703, + "loss": 2.1348, + "step": 640 + }, + { + "epoch": 0.06743819042609153, + "grad_norm": 0.8688013553619385, + "learning_rate": 0.0001992691485587694, + "loss": 1.7637, + "step": 641 + }, + { + "epoch": 0.06754339821146765, + "grad_norm": 1.2788143157958984, + "learning_rate": 0.0001992650303448224, + "loss": 1.8858, + "step": 642 + }, + { + "epoch": 0.06764860599684376, + "grad_norm": 0.7544511556625366, + "learning_rate": 0.0001992609006035543, + "loss": 1.9076, + "step": 643 + }, + { + "epoch": 0.06775381378221988, + "grad_norm": 1.3862735033035278, + "learning_rate": 0.00019925675933544473, + "loss": 2.212, + "step": 644 + }, + { + "epoch": 0.067859021567596, + "grad_norm": 1.04462730884552, + "learning_rate": 0.00019925260654097448, + "loss": 1.9386, + "step": 645 + }, + { + "epoch": 0.06796422935297212, + "grad_norm": 0.9594143033027649, + "learning_rate": 0.0001992484422206259, + "loss": 2.0205, + "step": 646 + }, + { + "epoch": 0.06806943713834823, + "grad_norm": 1.3281629085540771, + "learning_rate": 0.00019924426637488252, + "loss": 1.6588, + "step": 647 + }, + { + "epoch": 0.06817464492372435, + "grad_norm": 1.1453989744186401, + "learning_rate": 0.0001992400790042293, + "loss": 1.8736, + "step": 648 + }, + { + "epoch": 0.06827985270910047, + "grad_norm": 1.5039159059524536, + "learning_rate": 0.0001992358801091525, + "loss": 1.3861, + "step": 649 + }, + { + "epoch": 0.0683850604944766, + "grad_norm": 1.353845477104187, + "learning_rate": 0.0001992316696901397, + "loss": 1.8725, + "step": 650 + }, + { + "epoch": 0.0684902682798527, + "grad_norm": 0.9831173419952393, + "learning_rate": 0.00019922744774767987, + "loss": 2.1601, + "step": 651 + }, + { + "epoch": 0.06859547606522882, + "grad_norm": 2.004272937774658, + "learning_rate": 0.0001992232142822633, + "loss": 1.7202, + "step": 652 + }, + { + "epoch": 0.06870068385060495, + "grad_norm": 0.7142511010169983, + "learning_rate": 0.00019921896929438158, + "loss": 2.1088, + "step": 653 + }, + { + "epoch": 0.06880589163598107, + "grad_norm": 0.8999900817871094, + "learning_rate": 0.00019921471278452768, + "loss": 1.9379, + "step": 654 + }, + { + "epoch": 0.06891109942135717, + "grad_norm": 1.3912204504013062, + "learning_rate": 0.00019921044475319585, + "loss": 1.6042, + "step": 655 + }, + { + "epoch": 0.0690163072067333, + "grad_norm": 0.8359988331794739, + "learning_rate": 0.0001992061652008818, + "loss": 2.0374, + "step": 656 + }, + { + "epoch": 0.06912151499210942, + "grad_norm": 0.8871389627456665, + "learning_rate": 0.00019920187412808248, + "loss": 2.088, + "step": 657 + }, + { + "epoch": 0.06922672277748554, + "grad_norm": 1.1207047700881958, + "learning_rate": 0.00019919757153529614, + "loss": 1.8852, + "step": 658 + }, + { + "epoch": 0.06933193056286165, + "grad_norm": 1.3903989791870117, + "learning_rate": 0.0001991932574230225, + "loss": 2.1249, + "step": 659 + }, + { + "epoch": 0.06943713834823777, + "grad_norm": 0.6716268062591553, + "learning_rate": 0.00019918893179176253, + "loss": 2.1855, + "step": 660 + }, + { + "epoch": 0.06954234613361389, + "grad_norm": 1.388175368309021, + "learning_rate": 0.0001991845946420185, + "loss": 2.5185, + "step": 661 + }, + { + "epoch": 0.06964755391899001, + "grad_norm": 1.7384852170944214, + "learning_rate": 0.0001991802459742941, + "loss": 1.2171, + "step": 662 + }, + { + "epoch": 0.06975276170436612, + "grad_norm": 1.0405939817428589, + "learning_rate": 0.0001991758857890943, + "loss": 2.0206, + "step": 663 + }, + { + "epoch": 0.06985796948974224, + "grad_norm": 0.9668706655502319, + "learning_rate": 0.0001991715140869255, + "loss": 2.1, + "step": 664 + }, + { + "epoch": 0.06996317727511836, + "grad_norm": 1.021179437637329, + "learning_rate": 0.00019916713086829533, + "loss": 1.8814, + "step": 665 + }, + { + "epoch": 0.07006838506049448, + "grad_norm": 0.6269777417182922, + "learning_rate": 0.0001991627361337128, + "loss": 2.5095, + "step": 666 + }, + { + "epoch": 0.07017359284587059, + "grad_norm": 1.236629605293274, + "learning_rate": 0.00019915832988368824, + "loss": 2.1106, + "step": 667 + }, + { + "epoch": 0.07027880063124671, + "grad_norm": 1.0450339317321777, + "learning_rate": 0.0001991539121187334, + "loss": 2.1438, + "step": 668 + }, + { + "epoch": 0.07038400841662283, + "grad_norm": 0.8368414044380188, + "learning_rate": 0.00019914948283936119, + "loss": 2.1143, + "step": 669 + }, + { + "epoch": 0.07048921620199895, + "grad_norm": 1.016280174255371, + "learning_rate": 0.0001991450420460861, + "loss": 2.1759, + "step": 670 + }, + { + "epoch": 0.07059442398737506, + "grad_norm": 1.2927619218826294, + "learning_rate": 0.00019914058973942368, + "loss": 1.9304, + "step": 671 + }, + { + "epoch": 0.07069963177275118, + "grad_norm": 1.0759140253067017, + "learning_rate": 0.0001991361259198911, + "loss": 1.7529, + "step": 672 + }, + { + "epoch": 0.0708048395581273, + "grad_norm": 0.802558183670044, + "learning_rate": 0.00019913165058800663, + "loss": 2.4059, + "step": 673 + }, + { + "epoch": 0.07091004734350342, + "grad_norm": 0.8148163557052612, + "learning_rate": 0.00019912716374429, + "loss": 2.1683, + "step": 674 + }, + { + "epoch": 0.07101525512887953, + "grad_norm": 1.259080171585083, + "learning_rate": 0.00019912266538926225, + "loss": 2.1657, + "step": 675 + }, + { + "epoch": 0.07112046291425565, + "grad_norm": 1.0984203815460205, + "learning_rate": 0.00019911815552344582, + "loss": 2.0398, + "step": 676 + }, + { + "epoch": 0.07122567069963177, + "grad_norm": 1.0263009071350098, + "learning_rate": 0.00019911363414736434, + "loss": 1.7294, + "step": 677 + }, + { + "epoch": 0.0713308784850079, + "grad_norm": 1.2345585823059082, + "learning_rate": 0.00019910910126154293, + "loss": 2.333, + "step": 678 + }, + { + "epoch": 0.071436086270384, + "grad_norm": 0.7678645253181458, + "learning_rate": 0.00019910455686650793, + "loss": 2.2339, + "step": 679 + }, + { + "epoch": 0.07154129405576012, + "grad_norm": 0.8823609948158264, + "learning_rate": 0.0001991000009627871, + "loss": 2.0109, + "step": 680 + }, + { + "epoch": 0.07164650184113625, + "grad_norm": 0.9095181822776794, + "learning_rate": 0.00019909543355090946, + "loss": 2.3621, + "step": 681 + }, + { + "epoch": 0.07175170962651237, + "grad_norm": 1.3118935823440552, + "learning_rate": 0.00019909085463140546, + "loss": 1.9576, + "step": 682 + }, + { + "epoch": 0.07185691741188847, + "grad_norm": 1.0895490646362305, + "learning_rate": 0.0001990862642048068, + "loss": 1.8436, + "step": 683 + }, + { + "epoch": 0.0719621251972646, + "grad_norm": 1.1863242387771606, + "learning_rate": 0.00019908166227164655, + "loss": 1.8009, + "step": 684 + }, + { + "epoch": 0.07206733298264072, + "grad_norm": 1.0298961400985718, + "learning_rate": 0.00019907704883245916, + "loss": 1.9716, + "step": 685 + }, + { + "epoch": 0.07217254076801684, + "grad_norm": 0.7246216535568237, + "learning_rate": 0.00019907242388778033, + "loss": 2.119, + "step": 686 + }, + { + "epoch": 0.07227774855339295, + "grad_norm": 1.172180414199829, + "learning_rate": 0.00019906778743814711, + "loss": 2.3032, + "step": 687 + }, + { + "epoch": 0.07238295633876907, + "grad_norm": 1.1342841386795044, + "learning_rate": 0.000199063139484098, + "loss": 2.4119, + "step": 688 + }, + { + "epoch": 0.07248816412414519, + "grad_norm": 1.160383701324463, + "learning_rate": 0.0001990584800261727, + "loss": 1.9328, + "step": 689 + }, + { + "epoch": 0.07259337190952131, + "grad_norm": 0.991884171962738, + "learning_rate": 0.00019905380906491232, + "loss": 2.1967, + "step": 690 + }, + { + "epoch": 0.07269857969489742, + "grad_norm": 1.0760473012924194, + "learning_rate": 0.00019904912660085927, + "loss": 1.9094, + "step": 691 + }, + { + "epoch": 0.07280378748027354, + "grad_norm": 1.6143467426300049, + "learning_rate": 0.00019904443263455728, + "loss": 1.8885, + "step": 692 + }, + { + "epoch": 0.07290899526564966, + "grad_norm": 1.200732707977295, + "learning_rate": 0.00019903972716655148, + "loss": 1.8561, + "step": 693 + }, + { + "epoch": 0.07301420305102578, + "grad_norm": 0.9255091547966003, + "learning_rate": 0.0001990350101973883, + "loss": 1.9438, + "step": 694 + }, + { + "epoch": 0.07311941083640189, + "grad_norm": 0.8052567839622498, + "learning_rate": 0.00019903028172761552, + "loss": 1.8901, + "step": 695 + }, + { + "epoch": 0.07322461862177801, + "grad_norm": 0.843561589717865, + "learning_rate": 0.00019902554175778222, + "loss": 2.1764, + "step": 696 + }, + { + "epoch": 0.07332982640715413, + "grad_norm": 1.0128999948501587, + "learning_rate": 0.0001990207902884388, + "loss": 1.6193, + "step": 697 + }, + { + "epoch": 0.07343503419253025, + "grad_norm": 1.1728532314300537, + "learning_rate": 0.00019901602732013709, + "loss": 1.8212, + "step": 698 + }, + { + "epoch": 0.07354024197790636, + "grad_norm": 0.6884058117866516, + "learning_rate": 0.00019901125285343022, + "loss": 2.1301, + "step": 699 + }, + { + "epoch": 0.07364544976328248, + "grad_norm": 0.8318617343902588, + "learning_rate": 0.00019900646688887253, + "loss": 2.3534, + "step": 700 + }, + { + "epoch": 0.0737506575486586, + "grad_norm": 0.7841008901596069, + "learning_rate": 0.0001990016694270199, + "loss": 1.9417, + "step": 701 + }, + { + "epoch": 0.07385586533403472, + "grad_norm": 0.7915363907814026, + "learning_rate": 0.0001989968604684294, + "loss": 2.337, + "step": 702 + }, + { + "epoch": 0.07396107311941083, + "grad_norm": 0.9618809819221497, + "learning_rate": 0.00019899204001365948, + "loss": 2.1485, + "step": 703 + }, + { + "epoch": 0.07406628090478695, + "grad_norm": 1.1301730871200562, + "learning_rate": 0.00019898720806326993, + "loss": 2.3046, + "step": 704 + }, + { + "epoch": 0.07417148869016307, + "grad_norm": 0.922090470790863, + "learning_rate": 0.00019898236461782186, + "loss": 1.9871, + "step": 705 + }, + { + "epoch": 0.0742766964755392, + "grad_norm": 0.9652605056762695, + "learning_rate": 0.0001989775096778777, + "loss": 2.1545, + "step": 706 + }, + { + "epoch": 0.0743819042609153, + "grad_norm": 0.8259909749031067, + "learning_rate": 0.00019897264324400128, + "loss": 1.9013, + "step": 707 + }, + { + "epoch": 0.07448711204629142, + "grad_norm": 0.7845616340637207, + "learning_rate": 0.00019896776531675773, + "loss": 2.0664, + "step": 708 + }, + { + "epoch": 0.07459231983166754, + "grad_norm": 0.9520933032035828, + "learning_rate": 0.0001989628758967135, + "loss": 1.898, + "step": 709 + }, + { + "epoch": 0.07469752761704367, + "grad_norm": 1.1913421154022217, + "learning_rate": 0.00019895797498443633, + "loss": 1.794, + "step": 710 + }, + { + "epoch": 0.07480273540241977, + "grad_norm": 0.8856778144836426, + "learning_rate": 0.00019895306258049542, + "loss": 2.2846, + "step": 711 + }, + { + "epoch": 0.0749079431877959, + "grad_norm": 1.1165918111801147, + "learning_rate": 0.00019894813868546115, + "loss": 1.8066, + "step": 712 + }, + { + "epoch": 0.07501315097317202, + "grad_norm": 1.1976250410079956, + "learning_rate": 0.0001989432032999054, + "loss": 1.9725, + "step": 713 + }, + { + "epoch": 0.07511835875854814, + "grad_norm": 1.1325089931488037, + "learning_rate": 0.00019893825642440128, + "loss": 2.4161, + "step": 714 + }, + { + "epoch": 0.07522356654392424, + "grad_norm": 1.7667649984359741, + "learning_rate": 0.0001989332980595232, + "loss": 1.7692, + "step": 715 + }, + { + "epoch": 0.07532877432930037, + "grad_norm": 0.7589969038963318, + "learning_rate": 0.00019892832820584704, + "loss": 1.9779, + "step": 716 + }, + { + "epoch": 0.07543398211467649, + "grad_norm": 1.1662501096725464, + "learning_rate": 0.00019892334686394985, + "loss": 2.1976, + "step": 717 + }, + { + "epoch": 0.07553918990005261, + "grad_norm": 0.9506317973136902, + "learning_rate": 0.00019891835403441013, + "loss": 2.2166, + "step": 718 + }, + { + "epoch": 0.07564439768542872, + "grad_norm": 0.8529815077781677, + "learning_rate": 0.00019891334971780772, + "loss": 2.1537, + "step": 719 + }, + { + "epoch": 0.07574960547080484, + "grad_norm": 0.7838415503501892, + "learning_rate": 0.0001989083339147237, + "loss": 2.0705, + "step": 720 + }, + { + "epoch": 0.07585481325618096, + "grad_norm": 0.8060374855995178, + "learning_rate": 0.00019890330662574056, + "loss": 2.0404, + "step": 721 + }, + { + "epoch": 0.07596002104155708, + "grad_norm": 1.2424758672714233, + "learning_rate": 0.0001988982678514421, + "loss": 2.1626, + "step": 722 + }, + { + "epoch": 0.07606522882693319, + "grad_norm": 0.752369225025177, + "learning_rate": 0.00019889321759241347, + "loss": 2.0623, + "step": 723 + }, + { + "epoch": 0.07617043661230931, + "grad_norm": 1.5461053848266602, + "learning_rate": 0.00019888815584924113, + "loss": 2.5155, + "step": 724 + }, + { + "epoch": 0.07627564439768543, + "grad_norm": 0.9477027654647827, + "learning_rate": 0.00019888308262251285, + "loss": 2.526, + "step": 725 + }, + { + "epoch": 0.07638085218306155, + "grad_norm": 1.75247061252594, + "learning_rate": 0.00019887799791281784, + "loss": 1.9596, + "step": 726 + }, + { + "epoch": 0.07648605996843766, + "grad_norm": 1.1626888513565063, + "learning_rate": 0.0001988729017207465, + "loss": 2.2181, + "step": 727 + }, + { + "epoch": 0.07659126775381378, + "grad_norm": 5.1132049560546875, + "learning_rate": 0.0001988677940468907, + "loss": 2.1096, + "step": 728 + }, + { + "epoch": 0.0766964755391899, + "grad_norm": 1.210626482963562, + "learning_rate": 0.0001988626748918435, + "loss": 1.8661, + "step": 729 + }, + { + "epoch": 0.07680168332456602, + "grad_norm": 1.0915313959121704, + "learning_rate": 0.00019885754425619945, + "loss": 1.9508, + "step": 730 + }, + { + "epoch": 0.07690689110994213, + "grad_norm": 0.9805365204811096, + "learning_rate": 0.0001988524021405543, + "loss": 1.8635, + "step": 731 + }, + { + "epoch": 0.07701209889531825, + "grad_norm": 1.2141715288162231, + "learning_rate": 0.0001988472485455052, + "loss": 2.0866, + "step": 732 + }, + { + "epoch": 0.07711730668069437, + "grad_norm": 2.095106601715088, + "learning_rate": 0.00019884208347165062, + "loss": 1.658, + "step": 733 + }, + { + "epoch": 0.0772225144660705, + "grad_norm": 1.0189520120620728, + "learning_rate": 0.00019883690691959035, + "loss": 2.0882, + "step": 734 + }, + { + "epoch": 0.0773277222514466, + "grad_norm": 1.0242335796356201, + "learning_rate": 0.00019883171888992557, + "loss": 1.9676, + "step": 735 + }, + { + "epoch": 0.07743293003682272, + "grad_norm": 1.7329810857772827, + "learning_rate": 0.00019882651938325872, + "loss": 2.0722, + "step": 736 + }, + { + "epoch": 0.07753813782219884, + "grad_norm": 0.8744789958000183, + "learning_rate": 0.00019882130840019358, + "loss": 2.3554, + "step": 737 + }, + { + "epoch": 0.07764334560757497, + "grad_norm": 1.0732308626174927, + "learning_rate": 0.0001988160859413353, + "loss": 2.057, + "step": 738 + }, + { + "epoch": 0.07774855339295107, + "grad_norm": 0.9206953048706055, + "learning_rate": 0.0001988108520072904, + "loss": 2.1232, + "step": 739 + }, + { + "epoch": 0.0778537611783272, + "grad_norm": 0.7946454286575317, + "learning_rate": 0.0001988056065986666, + "loss": 2.1398, + "step": 740 + }, + { + "epoch": 0.07795896896370332, + "grad_norm": 1.073172926902771, + "learning_rate": 0.00019880034971607308, + "loss": 2.0701, + "step": 741 + }, + { + "epoch": 0.07806417674907944, + "grad_norm": 0.7679953575134277, + "learning_rate": 0.00019879508136012026, + "loss": 2.3293, + "step": 742 + }, + { + "epoch": 0.07816938453445554, + "grad_norm": 1.1941078901290894, + "learning_rate": 0.00019878980153141998, + "loss": 2.2848, + "step": 743 + }, + { + "epoch": 0.07827459231983167, + "grad_norm": 0.7265371680259705, + "learning_rate": 0.00019878451023058537, + "loss": 2.1909, + "step": 744 + }, + { + "epoch": 0.07837980010520779, + "grad_norm": 1.4622846841812134, + "learning_rate": 0.00019877920745823085, + "loss": 2.1973, + "step": 745 + }, + { + "epoch": 0.07848500789058391, + "grad_norm": 1.132213830947876, + "learning_rate": 0.00019877389321497227, + "loss": 1.757, + "step": 746 + }, + { + "epoch": 0.07859021567596002, + "grad_norm": 1.0680114030838013, + "learning_rate": 0.00019876856750142673, + "loss": 1.8053, + "step": 747 + }, + { + "epoch": 0.07869542346133614, + "grad_norm": 1.1053400039672852, + "learning_rate": 0.00019876323031821266, + "loss": 2.0033, + "step": 748 + }, + { + "epoch": 0.07880063124671226, + "grad_norm": 1.0279300212860107, + "learning_rate": 0.0001987578816659499, + "loss": 1.9289, + "step": 749 + }, + { + "epoch": 0.07890583903208838, + "grad_norm": 0.8577696084976196, + "learning_rate": 0.00019875252154525952, + "loss": 2.3191, + "step": 750 + }, + { + "epoch": 0.07901104681746449, + "grad_norm": 2.2503817081451416, + "learning_rate": 0.000198747149956764, + "loss": 1.8925, + "step": 751 + }, + { + "epoch": 0.07911625460284061, + "grad_norm": 0.7883946299552917, + "learning_rate": 0.0001987417669010871, + "loss": 2.1394, + "step": 752 + }, + { + "epoch": 0.07922146238821673, + "grad_norm": 0.9540475010871887, + "learning_rate": 0.00019873637237885402, + "loss": 2.1549, + "step": 753 + }, + { + "epoch": 0.07932667017359285, + "grad_norm": 1.1005622148513794, + "learning_rate": 0.0001987309663906911, + "loss": 1.8121, + "step": 754 + }, + { + "epoch": 0.07943187795896896, + "grad_norm": 1.2828789949417114, + "learning_rate": 0.00019872554893722618, + "loss": 1.9085, + "step": 755 + }, + { + "epoch": 0.07953708574434508, + "grad_norm": 0.9298080205917358, + "learning_rate": 0.00019872012001908833, + "loss": 2.301, + "step": 756 + }, + { + "epoch": 0.0796422935297212, + "grad_norm": 1.2716705799102783, + "learning_rate": 0.00019871467963690807, + "loss": 2.0119, + "step": 757 + }, + { + "epoch": 0.07974750131509732, + "grad_norm": 1.2654720544815063, + "learning_rate": 0.0001987092277913171, + "loss": 1.7698, + "step": 758 + }, + { + "epoch": 0.07985270910047343, + "grad_norm": 1.111207127571106, + "learning_rate": 0.00019870376448294851, + "loss": 1.6179, + "step": 759 + }, + { + "epoch": 0.07995791688584955, + "grad_norm": 1.3444252014160156, + "learning_rate": 0.00019869828971243682, + "loss": 1.7896, + "step": 760 + }, + { + "epoch": 0.08006312467122567, + "grad_norm": 0.7708830237388611, + "learning_rate": 0.00019869280348041774, + "loss": 2.241, + "step": 761 + }, + { + "epoch": 0.08016833245660179, + "grad_norm": 1.6333880424499512, + "learning_rate": 0.0001986873057875284, + "loss": 1.9599, + "step": 762 + }, + { + "epoch": 0.0802735402419779, + "grad_norm": 0.9313520789146423, + "learning_rate": 0.00019868179663440718, + "loss": 2.3505, + "step": 763 + }, + { + "epoch": 0.08037874802735402, + "grad_norm": 1.2057533264160156, + "learning_rate": 0.00019867627602169387, + "loss": 2.1487, + "step": 764 + }, + { + "epoch": 0.08048395581273014, + "grad_norm": 1.0157853364944458, + "learning_rate": 0.00019867074395002958, + "loss": 2.0869, + "step": 765 + }, + { + "epoch": 0.08058916359810626, + "grad_norm": 0.8782801628112793, + "learning_rate": 0.00019866520042005669, + "loss": 2.1859, + "step": 766 + }, + { + "epoch": 0.08069437138348237, + "grad_norm": 0.8281430006027222, + "learning_rate": 0.00019865964543241897, + "loss": 2.1563, + "step": 767 + }, + { + "epoch": 0.08079957916885849, + "grad_norm": 0.9678866863250732, + "learning_rate": 0.00019865407898776152, + "loss": 2.1626, + "step": 768 + }, + { + "epoch": 0.08090478695423461, + "grad_norm": 0.9700407981872559, + "learning_rate": 0.00019864850108673073, + "loss": 2.2104, + "step": 769 + }, + { + "epoch": 0.08100999473961074, + "grad_norm": 1.1797422170639038, + "learning_rate": 0.00019864291172997435, + "loss": 1.8951, + "step": 770 + }, + { + "epoch": 0.08111520252498684, + "grad_norm": 1.1290271282196045, + "learning_rate": 0.00019863731091814146, + "loss": 2.2929, + "step": 771 + }, + { + "epoch": 0.08122041031036296, + "grad_norm": 1.6742466688156128, + "learning_rate": 0.00019863169865188244, + "loss": 1.3898, + "step": 772 + }, + { + "epoch": 0.08132561809573909, + "grad_norm": 1.3946906328201294, + "learning_rate": 0.00019862607493184906, + "loss": 1.6763, + "step": 773 + }, + { + "epoch": 0.0814308258811152, + "grad_norm": 1.0987610816955566, + "learning_rate": 0.00019862043975869438, + "loss": 1.9443, + "step": 774 + }, + { + "epoch": 0.08153603366649131, + "grad_norm": 1.0788226127624512, + "learning_rate": 0.00019861479313307273, + "loss": 1.8061, + "step": 775 + }, + { + "epoch": 0.08164124145186744, + "grad_norm": 0.9351698160171509, + "learning_rate": 0.0001986091350556399, + "loss": 2.159, + "step": 776 + }, + { + "epoch": 0.08174644923724356, + "grad_norm": 1.1127021312713623, + "learning_rate": 0.000198603465527053, + "loss": 2.0271, + "step": 777 + }, + { + "epoch": 0.08185165702261968, + "grad_norm": 0.7860623598098755, + "learning_rate": 0.00019859778454797027, + "loss": 1.8645, + "step": 778 + }, + { + "epoch": 0.08195686480799579, + "grad_norm": 1.112596869468689, + "learning_rate": 0.00019859209211905152, + "loss": 1.7839, + "step": 779 + }, + { + "epoch": 0.08206207259337191, + "grad_norm": 1.2952525615692139, + "learning_rate": 0.00019858638824095775, + "loss": 2.0274, + "step": 780 + }, + { + "epoch": 0.08216728037874803, + "grad_norm": 1.0121921300888062, + "learning_rate": 0.00019858067291435137, + "loss": 2.2605, + "step": 781 + }, + { + "epoch": 0.08227248816412415, + "grad_norm": 1.0447254180908203, + "learning_rate": 0.00019857494613989606, + "loss": 2.1212, + "step": 782 + }, + { + "epoch": 0.08237769594950026, + "grad_norm": 1.3232123851776123, + "learning_rate": 0.00019856920791825683, + "loss": 1.7452, + "step": 783 + }, + { + "epoch": 0.08248290373487638, + "grad_norm": 1.3245103359222412, + "learning_rate": 0.0001985634582501001, + "loss": 1.6841, + "step": 784 + }, + { + "epoch": 0.0825881115202525, + "grad_norm": 1.0271555185317993, + "learning_rate": 0.00019855769713609348, + "loss": 1.9158, + "step": 785 + }, + { + "epoch": 0.08269331930562862, + "grad_norm": 1.0767377614974976, + "learning_rate": 0.00019855192457690607, + "loss": 1.7579, + "step": 786 + }, + { + "epoch": 0.08279852709100473, + "grad_norm": 0.7473664283752441, + "learning_rate": 0.00019854614057320818, + "loss": 2.3149, + "step": 787 + }, + { + "epoch": 0.08290373487638085, + "grad_norm": 1.0371477603912354, + "learning_rate": 0.0001985403451256715, + "loss": 1.9446, + "step": 788 + }, + { + "epoch": 0.08300894266175697, + "grad_norm": 1.0556628704071045, + "learning_rate": 0.00019853453823496898, + "loss": 2.0536, + "step": 789 + }, + { + "epoch": 0.08311415044713309, + "grad_norm": 1.5194404125213623, + "learning_rate": 0.00019852871990177503, + "loss": 2.1898, + "step": 790 + }, + { + "epoch": 0.0832193582325092, + "grad_norm": 1.2382123470306396, + "learning_rate": 0.0001985228901267653, + "loss": 2.096, + "step": 791 + }, + { + "epoch": 0.08332456601788532, + "grad_norm": 0.805229902267456, + "learning_rate": 0.00019851704891061676, + "loss": 2.2309, + "step": 792 + }, + { + "epoch": 0.08342977380326144, + "grad_norm": 1.50439453125, + "learning_rate": 0.00019851119625400774, + "loss": 1.664, + "step": 793 + }, + { + "epoch": 0.08353498158863756, + "grad_norm": 1.2170758247375488, + "learning_rate": 0.0001985053321576179, + "loss": 2.2134, + "step": 794 + }, + { + "epoch": 0.08364018937401367, + "grad_norm": 0.898504376411438, + "learning_rate": 0.0001984994566221282, + "loss": 1.7387, + "step": 795 + }, + { + "epoch": 0.08374539715938979, + "grad_norm": 1.0071955919265747, + "learning_rate": 0.00019849356964822093, + "loss": 2.255, + "step": 796 + }, + { + "epoch": 0.08385060494476591, + "grad_norm": 1.0701338052749634, + "learning_rate": 0.00019848767123657976, + "loss": 2.1611, + "step": 797 + }, + { + "epoch": 0.08395581273014203, + "grad_norm": 0.9649009704589844, + "learning_rate": 0.00019848176138788964, + "loss": 2.2191, + "step": 798 + }, + { + "epoch": 0.08406102051551814, + "grad_norm": 2.6447019577026367, + "learning_rate": 0.00019847584010283686, + "loss": 1.9287, + "step": 799 + }, + { + "epoch": 0.08416622830089426, + "grad_norm": 0.8168442845344543, + "learning_rate": 0.00019846990738210907, + "loss": 1.7487, + "step": 800 + }, + { + "epoch": 0.08427143608627038, + "grad_norm": 1.1686725616455078, + "learning_rate": 0.00019846396322639514, + "loss": 1.7051, + "step": 801 + }, + { + "epoch": 0.0843766438716465, + "grad_norm": 0.8122353553771973, + "learning_rate": 0.00019845800763638544, + "loss": 1.7214, + "step": 802 + }, + { + "epoch": 0.08448185165702261, + "grad_norm": 1.4718796014785767, + "learning_rate": 0.0001984520406127715, + "loss": 1.8946, + "step": 803 + }, + { + "epoch": 0.08458705944239873, + "grad_norm": 1.2869932651519775, + "learning_rate": 0.0001984460621562463, + "loss": 1.6935, + "step": 804 + }, + { + "epoch": 0.08469226722777486, + "grad_norm": 0.8310649991035461, + "learning_rate": 0.00019844007226750408, + "loss": 1.7826, + "step": 805 + }, + { + "epoch": 0.08479747501315098, + "grad_norm": 1.2225552797317505, + "learning_rate": 0.0001984340709472404, + "loss": 1.9194, + "step": 806 + }, + { + "epoch": 0.08490268279852708, + "grad_norm": 0.819175660610199, + "learning_rate": 0.00019842805819615222, + "loss": 2.0277, + "step": 807 + }, + { + "epoch": 0.0850078905839032, + "grad_norm": 0.8296215534210205, + "learning_rate": 0.00019842203401493772, + "loss": 2.1235, + "step": 808 + }, + { + "epoch": 0.08511309836927933, + "grad_norm": 1.2839940786361694, + "learning_rate": 0.00019841599840429654, + "loss": 1.6553, + "step": 809 + }, + { + "epoch": 0.08521830615465545, + "grad_norm": 1.0420074462890625, + "learning_rate": 0.00019840995136492955, + "loss": 1.9328, + "step": 810 + }, + { + "epoch": 0.08532351394003156, + "grad_norm": 0.8936055898666382, + "learning_rate": 0.00019840389289753896, + "loss": 2.176, + "step": 811 + }, + { + "epoch": 0.08542872172540768, + "grad_norm": 0.811882495880127, + "learning_rate": 0.0001983978230028283, + "loss": 2.2401, + "step": 812 + }, + { + "epoch": 0.0855339295107838, + "grad_norm": 0.9509261846542358, + "learning_rate": 0.00019839174168150247, + "loss": 2.1676, + "step": 813 + }, + { + "epoch": 0.08563913729615992, + "grad_norm": 0.921415388584137, + "learning_rate": 0.0001983856489342677, + "loss": 2.3311, + "step": 814 + }, + { + "epoch": 0.08574434508153603, + "grad_norm": 0.9342227578163147, + "learning_rate": 0.00019837954476183148, + "loss": 2.2151, + "step": 815 + }, + { + "epoch": 0.08584955286691215, + "grad_norm": 0.9104680418968201, + "learning_rate": 0.00019837342916490268, + "loss": 2.3867, + "step": 816 + }, + { + "epoch": 0.08595476065228827, + "grad_norm": 1.001815915107727, + "learning_rate": 0.0001983673021441915, + "loss": 1.998, + "step": 817 + }, + { + "epoch": 0.08605996843766439, + "grad_norm": 1.704974889755249, + "learning_rate": 0.00019836116370040944, + "loss": 1.8599, + "step": 818 + }, + { + "epoch": 0.0861651762230405, + "grad_norm": 0.82003253698349, + "learning_rate": 0.0001983550138342693, + "loss": 2.3351, + "step": 819 + }, + { + "epoch": 0.08627038400841662, + "grad_norm": 0.7901598215103149, + "learning_rate": 0.00019834885254648533, + "loss": 2.0966, + "step": 820 + }, + { + "epoch": 0.08637559179379274, + "grad_norm": 1.0715858936309814, + "learning_rate": 0.00019834267983777292, + "loss": 1.8696, + "step": 821 + }, + { + "epoch": 0.08648079957916886, + "grad_norm": 1.0539333820343018, + "learning_rate": 0.000198336495708849, + "loss": 1.7904, + "step": 822 + }, + { + "epoch": 0.08658600736454497, + "grad_norm": 1.2801223993301392, + "learning_rate": 0.00019833030016043156, + "loss": 2.1986, + "step": 823 + }, + { + "epoch": 0.08669121514992109, + "grad_norm": 1.5818783044815063, + "learning_rate": 0.00019832409319324023, + "loss": 1.8651, + "step": 824 + }, + { + "epoch": 0.08679642293529721, + "grad_norm": 0.8712964057922363, + "learning_rate": 0.00019831787480799568, + "loss": 2.2397, + "step": 825 + }, + { + "epoch": 0.08690163072067333, + "grad_norm": 1.518312931060791, + "learning_rate": 0.00019831164500542012, + "loss": 2.1451, + "step": 826 + }, + { + "epoch": 0.08700683850604944, + "grad_norm": 1.9714057445526123, + "learning_rate": 0.00019830540378623694, + "loss": 2.4544, + "step": 827 + }, + { + "epoch": 0.08711204629142556, + "grad_norm": 1.1616429090499878, + "learning_rate": 0.00019829915115117093, + "loss": 1.8353, + "step": 828 + }, + { + "epoch": 0.08721725407680168, + "grad_norm": 12.633820533752441, + "learning_rate": 0.0001982928871009482, + "loss": 2.3892, + "step": 829 + }, + { + "epoch": 0.0873224618621778, + "grad_norm": 0.8267893195152283, + "learning_rate": 0.00019828661163629615, + "loss": 2.1383, + "step": 830 + }, + { + "epoch": 0.08742766964755391, + "grad_norm": 0.9138221144676208, + "learning_rate": 0.00019828032475794352, + "loss": 2.3029, + "step": 831 + }, + { + "epoch": 0.08753287743293003, + "grad_norm": 1.1330816745758057, + "learning_rate": 0.00019827402646662047, + "loss": 2.2558, + "step": 832 + }, + { + "epoch": 0.08763808521830616, + "grad_norm": 0.8642826676368713, + "learning_rate": 0.0001982677167630583, + "loss": 1.9996, + "step": 833 + }, + { + "epoch": 0.08774329300368228, + "grad_norm": 0.8680064678192139, + "learning_rate": 0.00019826139564798974, + "loss": 2.3248, + "step": 834 + }, + { + "epoch": 0.08784850078905838, + "grad_norm": 1.49010169506073, + "learning_rate": 0.0001982550631221489, + "loss": 1.9746, + "step": 835 + }, + { + "epoch": 0.0879537085744345, + "grad_norm": 2.9857089519500732, + "learning_rate": 0.00019824871918627115, + "loss": 2.3267, + "step": 836 + }, + { + "epoch": 0.08805891635981063, + "grad_norm": 1.274704933166504, + "learning_rate": 0.0001982423638410931, + "loss": 2.3621, + "step": 837 + }, + { + "epoch": 0.08816412414518675, + "grad_norm": 0.9977430105209351, + "learning_rate": 0.0001982359970873529, + "loss": 2.0158, + "step": 838 + }, + { + "epoch": 0.08826933193056286, + "grad_norm": 0.9175136685371399, + "learning_rate": 0.0001982296189257898, + "loss": 1.8731, + "step": 839 + }, + { + "epoch": 0.08837453971593898, + "grad_norm": 0.993022620677948, + "learning_rate": 0.00019822322935714458, + "loss": 1.9218, + "step": 840 + }, + { + "epoch": 0.0884797475013151, + "grad_norm": 1.6498595476150513, + "learning_rate": 0.00019821682838215915, + "loss": 2.0277, + "step": 841 + }, + { + "epoch": 0.08858495528669122, + "grad_norm": 0.8532339334487915, + "learning_rate": 0.00019821041600157682, + "loss": 1.7503, + "step": 842 + }, + { + "epoch": 0.08869016307206733, + "grad_norm": 0.8794986605644226, + "learning_rate": 0.0001982039922161423, + "loss": 2.2771, + "step": 843 + }, + { + "epoch": 0.08879537085744345, + "grad_norm": 1.4324920177459717, + "learning_rate": 0.00019819755702660155, + "loss": 2.0883, + "step": 844 + }, + { + "epoch": 0.08890057864281957, + "grad_norm": 1.0365421772003174, + "learning_rate": 0.00019819111043370186, + "loss": 2.0581, + "step": 845 + }, + { + "epoch": 0.08900578642819569, + "grad_norm": 0.9392815232276917, + "learning_rate": 0.00019818465243819184, + "loss": 2.3084, + "step": 846 + }, + { + "epoch": 0.0891109942135718, + "grad_norm": 0.9906817674636841, + "learning_rate": 0.00019817818304082146, + "loss": 2.3203, + "step": 847 + }, + { + "epoch": 0.08921620199894792, + "grad_norm": 1.1406745910644531, + "learning_rate": 0.000198171702242342, + "loss": 1.9426, + "step": 848 + }, + { + "epoch": 0.08932140978432404, + "grad_norm": 0.843298077583313, + "learning_rate": 0.00019816521004350596, + "loss": 2.2718, + "step": 849 + }, + { + "epoch": 0.08942661756970016, + "grad_norm": 0.8017824292182922, + "learning_rate": 0.00019815870644506738, + "loss": 1.9611, + "step": 850 + }, + { + "epoch": 0.08953182535507627, + "grad_norm": 1.6382418870925903, + "learning_rate": 0.00019815219144778143, + "loss": 1.5162, + "step": 851 + }, + { + "epoch": 0.08963703314045239, + "grad_norm": 0.6821427941322327, + "learning_rate": 0.00019814566505240472, + "loss": 1.8852, + "step": 852 + }, + { + "epoch": 0.08974224092582851, + "grad_norm": 0.967110276222229, + "learning_rate": 0.00019813912725969509, + "loss": 2.2542, + "step": 853 + }, + { + "epoch": 0.08984744871120463, + "grad_norm": 1.212904691696167, + "learning_rate": 0.00019813257807041178, + "loss": 2.1593, + "step": 854 + }, + { + "epoch": 0.08995265649658074, + "grad_norm": 0.7041783928871155, + "learning_rate": 0.00019812601748531533, + "loss": 2.0618, + "step": 855 + }, + { + "epoch": 0.09005786428195686, + "grad_norm": 1.1121630668640137, + "learning_rate": 0.00019811944550516758, + "loss": 1.94, + "step": 856 + }, + { + "epoch": 0.09016307206733298, + "grad_norm": 1.1690653562545776, + "learning_rate": 0.00019811286213073173, + "loss": 2.095, + "step": 857 + }, + { + "epoch": 0.0902682798527091, + "grad_norm": 1.0316269397735596, + "learning_rate": 0.00019810626736277228, + "loss": 1.8736, + "step": 858 + }, + { + "epoch": 0.09037348763808521, + "grad_norm": 0.9707139134407043, + "learning_rate": 0.00019809966120205505, + "loss": 1.9554, + "step": 859 + }, + { + "epoch": 0.09047869542346133, + "grad_norm": 1.1598957777023315, + "learning_rate": 0.0001980930436493472, + "loss": 2.1387, + "step": 860 + }, + { + "epoch": 0.09058390320883745, + "grad_norm": 0.881175696849823, + "learning_rate": 0.0001980864147054172, + "loss": 1.9518, + "step": 861 + }, + { + "epoch": 0.09068911099421358, + "grad_norm": 1.648833990097046, + "learning_rate": 0.0001980797743710349, + "loss": 1.8736, + "step": 862 + }, + { + "epoch": 0.09079431877958968, + "grad_norm": 1.2492873668670654, + "learning_rate": 0.0001980731226469713, + "loss": 2.2163, + "step": 863 + }, + { + "epoch": 0.0908995265649658, + "grad_norm": 0.6980217099189758, + "learning_rate": 0.00019806645953399893, + "loss": 1.8288, + "step": 864 + }, + { + "epoch": 0.09100473435034193, + "grad_norm": 1.0036383867263794, + "learning_rate": 0.00019805978503289158, + "loss": 2.3498, + "step": 865 + }, + { + "epoch": 0.09110994213571805, + "grad_norm": 0.788852870464325, + "learning_rate": 0.00019805309914442426, + "loss": 1.8863, + "step": 866 + }, + { + "epoch": 0.09121514992109415, + "grad_norm": 1.2762038707733154, + "learning_rate": 0.00019804640186937343, + "loss": 2.1017, + "step": 867 + }, + { + "epoch": 0.09132035770647028, + "grad_norm": 0.878587543964386, + "learning_rate": 0.0001980396932085168, + "loss": 2.1149, + "step": 868 + }, + { + "epoch": 0.0914255654918464, + "grad_norm": 1.13942551612854, + "learning_rate": 0.00019803297316263346, + "loss": 1.8322, + "step": 869 + }, + { + "epoch": 0.09153077327722252, + "grad_norm": 0.9839184284210205, + "learning_rate": 0.00019802624173250374, + "loss": 2.1218, + "step": 870 + }, + { + "epoch": 0.09163598106259863, + "grad_norm": 1.22666335105896, + "learning_rate": 0.00019801949891890938, + "loss": 1.9397, + "step": 871 + }, + { + "epoch": 0.09174118884797475, + "grad_norm": 1.162661075592041, + "learning_rate": 0.00019801274472263335, + "loss": 1.6768, + "step": 872 + }, + { + "epoch": 0.09184639663335087, + "grad_norm": 1.2919373512268066, + "learning_rate": 0.00019800597914446005, + "loss": 2.1166, + "step": 873 + }, + { + "epoch": 0.09195160441872699, + "grad_norm": 1.0327990055084229, + "learning_rate": 0.0001979992021851751, + "loss": 2.3549, + "step": 874 + }, + { + "epoch": 0.0920568122041031, + "grad_norm": 0.6984542012214661, + "learning_rate": 0.0001979924138455655, + "loss": 1.9739, + "step": 875 + }, + { + "epoch": 0.09216201998947922, + "grad_norm": 1.3677195310592651, + "learning_rate": 0.00019798561412641958, + "loss": 1.7295, + "step": 876 + }, + { + "epoch": 0.09226722777485534, + "grad_norm": 1.9641640186309814, + "learning_rate": 0.00019797880302852697, + "loss": 2.3405, + "step": 877 + }, + { + "epoch": 0.09237243556023146, + "grad_norm": 0.9281172752380371, + "learning_rate": 0.00019797198055267857, + "loss": 1.2817, + "step": 878 + }, + { + "epoch": 0.09247764334560757, + "grad_norm": 0.967189371585846, + "learning_rate": 0.0001979651466996667, + "loss": 2.2958, + "step": 879 + }, + { + "epoch": 0.09258285113098369, + "grad_norm": 1.9948618412017822, + "learning_rate": 0.0001979583014702849, + "loss": 1.6858, + "step": 880 + }, + { + "epoch": 0.09268805891635981, + "grad_norm": 0.7749442458152771, + "learning_rate": 0.00019795144486532814, + "loss": 2.4653, + "step": 881 + }, + { + "epoch": 0.09279326670173593, + "grad_norm": 1.0032241344451904, + "learning_rate": 0.00019794457688559265, + "loss": 2.2595, + "step": 882 + }, + { + "epoch": 0.09289847448711204, + "grad_norm": 0.8965802788734436, + "learning_rate": 0.00019793769753187595, + "loss": 1.9857, + "step": 883 + }, + { + "epoch": 0.09300368227248816, + "grad_norm": 0.8196418285369873, + "learning_rate": 0.00019793080680497696, + "loss": 2.0913, + "step": 884 + }, + { + "epoch": 0.09310889005786428, + "grad_norm": 0.9274260401725769, + "learning_rate": 0.00019792390470569583, + "loss": 2.0106, + "step": 885 + }, + { + "epoch": 0.0932140978432404, + "grad_norm": 0.8402416110038757, + "learning_rate": 0.00019791699123483412, + "loss": 2.5971, + "step": 886 + }, + { + "epoch": 0.09331930562861651, + "grad_norm": 1.1058905124664307, + "learning_rate": 0.00019791006639319463, + "loss": 2.134, + "step": 887 + }, + { + "epoch": 0.09342451341399263, + "grad_norm": 0.6589333415031433, + "learning_rate": 0.00019790313018158156, + "loss": 2.375, + "step": 888 + }, + { + "epoch": 0.09352972119936875, + "grad_norm": 1.1351195573806763, + "learning_rate": 0.00019789618260080034, + "loss": 1.8826, + "step": 889 + }, + { + "epoch": 0.09363492898474488, + "grad_norm": 1.2128400802612305, + "learning_rate": 0.00019788922365165785, + "loss": 2.023, + "step": 890 + }, + { + "epoch": 0.09374013677012098, + "grad_norm": 0.9158788919448853, + "learning_rate": 0.0001978822533349621, + "loss": 2.2709, + "step": 891 + }, + { + "epoch": 0.0938453445554971, + "grad_norm": 1.525220274925232, + "learning_rate": 0.00019787527165152265, + "loss": 2.0153, + "step": 892 + }, + { + "epoch": 0.09395055234087323, + "grad_norm": 0.9528924226760864, + "learning_rate": 0.00019786827860215014, + "loss": 2.1791, + "step": 893 + }, + { + "epoch": 0.09405576012624935, + "grad_norm": 1.2895303964614868, + "learning_rate": 0.00019786127418765673, + "loss": 1.9553, + "step": 894 + }, + { + "epoch": 0.09416096791162545, + "grad_norm": 0.9034793972969055, + "learning_rate": 0.0001978542584088558, + "loss": 2.3079, + "step": 895 + }, + { + "epoch": 0.09426617569700158, + "grad_norm": 1.507163643836975, + "learning_rate": 0.0001978472312665621, + "loss": 1.5934, + "step": 896 + }, + { + "epoch": 0.0943713834823777, + "grad_norm": 0.9403305649757385, + "learning_rate": 0.0001978401927615916, + "loss": 1.7338, + "step": 897 + }, + { + "epoch": 0.09447659126775382, + "grad_norm": 0.8427889347076416, + "learning_rate": 0.00019783314289476168, + "loss": 2.0062, + "step": 898 + }, + { + "epoch": 0.09458179905312993, + "grad_norm": 0.8502089977264404, + "learning_rate": 0.0001978260816668911, + "loss": 2.1841, + "step": 899 + }, + { + "epoch": 0.09468700683850605, + "grad_norm": 0.8778148889541626, + "learning_rate": 0.00019781900907879974, + "loss": 2.0756, + "step": 900 + }, + { + "epoch": 0.09479221462388217, + "grad_norm": 0.8359899520874023, + "learning_rate": 0.00019781192513130896, + "loss": 1.9334, + "step": 901 + }, + { + "epoch": 0.09489742240925829, + "grad_norm": 1.1402784585952759, + "learning_rate": 0.00019780482982524142, + "loss": 1.7855, + "step": 902 + }, + { + "epoch": 0.0950026301946344, + "grad_norm": 0.7871297597885132, + "learning_rate": 0.00019779772316142104, + "loss": 2.3578, + "step": 903 + }, + { + "epoch": 0.09510783798001052, + "grad_norm": 0.9127509593963623, + "learning_rate": 0.0001977906051406731, + "loss": 2.2987, + "step": 904 + }, + { + "epoch": 0.09521304576538664, + "grad_norm": 1.1920058727264404, + "learning_rate": 0.00019778347576382424, + "loss": 2.1563, + "step": 905 + }, + { + "epoch": 0.09531825355076276, + "grad_norm": 1.1233481168746948, + "learning_rate": 0.0001977763350317023, + "loss": 2.1715, + "step": 906 + }, + { + "epoch": 0.09542346133613887, + "grad_norm": 0.7940162420272827, + "learning_rate": 0.00019776918294513656, + "loss": 2.3458, + "step": 907 + }, + { + "epoch": 0.09552866912151499, + "grad_norm": 1.0063912868499756, + "learning_rate": 0.00019776201950495755, + "loss": 2.1605, + "step": 908 + }, + { + "epoch": 0.09563387690689111, + "grad_norm": 1.514668583869934, + "learning_rate": 0.00019775484471199715, + "loss": 1.5532, + "step": 909 + }, + { + "epoch": 0.09573908469226723, + "grad_norm": 0.8086494207382202, + "learning_rate": 0.0001977476585670885, + "loss": 1.8652, + "step": 910 + }, + { + "epoch": 0.09584429247764334, + "grad_norm": 0.925151526927948, + "learning_rate": 0.00019774046107106616, + "loss": 2.1997, + "step": 911 + }, + { + "epoch": 0.09594950026301946, + "grad_norm": 0.7572203278541565, + "learning_rate": 0.0001977332522247659, + "loss": 2.116, + "step": 912 + }, + { + "epoch": 0.09605470804839558, + "grad_norm": 1.877502202987671, + "learning_rate": 0.00019772603202902492, + "loss": 1.468, + "step": 913 + }, + { + "epoch": 0.0961599158337717, + "grad_norm": 1.2712554931640625, + "learning_rate": 0.00019771880048468163, + "loss": 2.0208, + "step": 914 + }, + { + "epoch": 0.09626512361914781, + "grad_norm": 0.7878434062004089, + "learning_rate": 0.00019771155759257584, + "loss": 2.0554, + "step": 915 + }, + { + "epoch": 0.09637033140452393, + "grad_norm": 0.7482603788375854, + "learning_rate": 0.0001977043033535486, + "loss": 1.954, + "step": 916 + }, + { + "epoch": 0.09647553918990005, + "grad_norm": 0.8648151755332947, + "learning_rate": 0.00019769703776844236, + "loss": 1.9668, + "step": 917 + }, + { + "epoch": 0.09658074697527617, + "grad_norm": 1.0494996309280396, + "learning_rate": 0.0001976897608381008, + "loss": 2.1018, + "step": 918 + }, + { + "epoch": 0.09668595476065228, + "grad_norm": 1.2748123407363892, + "learning_rate": 0.00019768247256336902, + "loss": 1.8108, + "step": 919 + }, + { + "epoch": 0.0967911625460284, + "grad_norm": 0.7436427474021912, + "learning_rate": 0.00019767517294509338, + "loss": 2.0819, + "step": 920 + }, + { + "epoch": 0.09689637033140452, + "grad_norm": 0.9573638439178467, + "learning_rate": 0.00019766786198412154, + "loss": 1.9145, + "step": 921 + }, + { + "epoch": 0.09700157811678065, + "grad_norm": 1.3364827632904053, + "learning_rate": 0.00019766053968130247, + "loss": 2.1283, + "step": 922 + }, + { + "epoch": 0.09710678590215675, + "grad_norm": 0.971378743648529, + "learning_rate": 0.00019765320603748655, + "loss": 2.0985, + "step": 923 + }, + { + "epoch": 0.09721199368753287, + "grad_norm": 0.8163911700248718, + "learning_rate": 0.00019764586105352534, + "loss": 2.3535, + "step": 924 + }, + { + "epoch": 0.097317201472909, + "grad_norm": 0.8818692564964294, + "learning_rate": 0.00019763850473027183, + "loss": 2.0812, + "step": 925 + }, + { + "epoch": 0.09742240925828512, + "grad_norm": 0.984257698059082, + "learning_rate": 0.00019763113706858031, + "loss": 1.9479, + "step": 926 + }, + { + "epoch": 0.09752761704366122, + "grad_norm": 0.9912437200546265, + "learning_rate": 0.00019762375806930632, + "loss": 1.6832, + "step": 927 + }, + { + "epoch": 0.09763282482903735, + "grad_norm": 0.8805968761444092, + "learning_rate": 0.0001976163677333068, + "loss": 2.3275, + "step": 928 + }, + { + "epoch": 0.09773803261441347, + "grad_norm": 1.2597976922988892, + "learning_rate": 0.00019760896606143988, + "loss": 1.8149, + "step": 929 + }, + { + "epoch": 0.09784324039978959, + "grad_norm": 0.709547758102417, + "learning_rate": 0.0001976015530545652, + "loss": 2.4011, + "step": 930 + }, + { + "epoch": 0.0979484481851657, + "grad_norm": 1.3702518939971924, + "learning_rate": 0.00019759412871354353, + "loss": 1.4587, + "step": 931 + }, + { + "epoch": 0.09805365597054182, + "grad_norm": 1.471561312675476, + "learning_rate": 0.00019758669303923706, + "loss": 1.9036, + "step": 932 + }, + { + "epoch": 0.09815886375591794, + "grad_norm": 0.7236111164093018, + "learning_rate": 0.0001975792460325093, + "loss": 2.8316, + "step": 933 + }, + { + "epoch": 0.09826407154129406, + "grad_norm": 0.9491783380508423, + "learning_rate": 0.000197571787694225, + "loss": 2.0152, + "step": 934 + }, + { + "epoch": 0.09836927932667017, + "grad_norm": 1.1442753076553345, + "learning_rate": 0.0001975643180252503, + "loss": 1.9371, + "step": 935 + }, + { + "epoch": 0.09847448711204629, + "grad_norm": 1.107806921005249, + "learning_rate": 0.00019755683702645262, + "loss": 2.0472, + "step": 936 + }, + { + "epoch": 0.09857969489742241, + "grad_norm": 1.2097374200820923, + "learning_rate": 0.0001975493446987007, + "loss": 2.0863, + "step": 937 + }, + { + "epoch": 0.09868490268279853, + "grad_norm": 0.8795415759086609, + "learning_rate": 0.0001975418410428646, + "loss": 2.0176, + "step": 938 + }, + { + "epoch": 0.09879011046817464, + "grad_norm": 0.8208872079849243, + "learning_rate": 0.0001975343260598157, + "loss": 2.0678, + "step": 939 + }, + { + "epoch": 0.09889531825355076, + "grad_norm": 1.147039532661438, + "learning_rate": 0.0001975267997504267, + "loss": 1.71, + "step": 940 + }, + { + "epoch": 0.09900052603892688, + "grad_norm": 1.0926604270935059, + "learning_rate": 0.00019751926211557157, + "loss": 2.2779, + "step": 941 + }, + { + "epoch": 0.099105733824303, + "grad_norm": 1.0695338249206543, + "learning_rate": 0.00019751171315612567, + "loss": 2.0222, + "step": 942 + }, + { + "epoch": 0.09921094160967911, + "grad_norm": 1.0999269485473633, + "learning_rate": 0.00019750415287296563, + "loss": 2.1906, + "step": 943 + }, + { + "epoch": 0.09931614939505523, + "grad_norm": 1.2324326038360596, + "learning_rate": 0.00019749658126696934, + "loss": 2.123, + "step": 944 + }, + { + "epoch": 0.09942135718043135, + "grad_norm": 0.9449137449264526, + "learning_rate": 0.00019748899833901614, + "loss": 1.8318, + "step": 945 + }, + { + "epoch": 0.09952656496580747, + "grad_norm": 1.1237130165100098, + "learning_rate": 0.0001974814040899866, + "loss": 1.9734, + "step": 946 + }, + { + "epoch": 0.09963177275118358, + "grad_norm": 0.7438077926635742, + "learning_rate": 0.00019747379852076263, + "loss": 2.406, + "step": 947 + }, + { + "epoch": 0.0997369805365597, + "grad_norm": 0.7513858675956726, + "learning_rate": 0.00019746618163222736, + "loss": 1.8843, + "step": 948 + }, + { + "epoch": 0.09984218832193582, + "grad_norm": 0.7331326007843018, + "learning_rate": 0.0001974585534252654, + "loss": 2.3763, + "step": 949 + }, + { + "epoch": 0.09994739610731194, + "grad_norm": 1.0843267440795898, + "learning_rate": 0.00019745091390076252, + "loss": 1.962, + "step": 950 + }, + { + "epoch": 0.10005260389268805, + "grad_norm": 1.1343190670013428, + "learning_rate": 0.00019744326305960595, + "loss": 2.0071, + "step": 951 + }, + { + "epoch": 0.10015781167806417, + "grad_norm": 1.4789917469024658, + "learning_rate": 0.0001974356009026841, + "loss": 1.715, + "step": 952 + }, + { + "epoch": 0.1002630194634403, + "grad_norm": 1.1908003091812134, + "learning_rate": 0.00019742792743088675, + "loss": 1.8924, + "step": 953 + }, + { + "epoch": 0.10036822724881642, + "grad_norm": 0.7355921864509583, + "learning_rate": 0.000197420242645105, + "loss": 2.3287, + "step": 954 + }, + { + "epoch": 0.10047343503419252, + "grad_norm": 0.7553874254226685, + "learning_rate": 0.0001974125465462313, + "loss": 1.7495, + "step": 955 + }, + { + "epoch": 0.10057864281956864, + "grad_norm": 1.1832842826843262, + "learning_rate": 0.00019740483913515932, + "loss": 2.0124, + "step": 956 + }, + { + "epoch": 0.10068385060494477, + "grad_norm": 1.039902925491333, + "learning_rate": 0.0001973971204127841, + "loss": 2.0525, + "step": 957 + }, + { + "epoch": 0.10078905839032089, + "grad_norm": 0.8891255855560303, + "learning_rate": 0.00019738939038000205, + "loss": 2.1702, + "step": 958 + }, + { + "epoch": 0.100894266175697, + "grad_norm": 1.692345142364502, + "learning_rate": 0.00019738164903771078, + "loss": 1.9213, + "step": 959 + }, + { + "epoch": 0.10099947396107312, + "grad_norm": 0.9816641807556152, + "learning_rate": 0.00019737389638680924, + "loss": 1.9241, + "step": 960 + }, + { + "epoch": 0.10110468174644924, + "grad_norm": 0.90645432472229, + "learning_rate": 0.0001973661324281978, + "loss": 1.6803, + "step": 961 + }, + { + "epoch": 0.10120988953182536, + "grad_norm": 1.1190398931503296, + "learning_rate": 0.00019735835716277802, + "loss": 2.0615, + "step": 962 + }, + { + "epoch": 0.10131509731720147, + "grad_norm": 0.9607227444648743, + "learning_rate": 0.0001973505705914528, + "loss": 1.9322, + "step": 963 + }, + { + "epoch": 0.10142030510257759, + "grad_norm": 0.7287774682044983, + "learning_rate": 0.00019734277271512638, + "loss": 2.3326, + "step": 964 + }, + { + "epoch": 0.10152551288795371, + "grad_norm": 1.8522443771362305, + "learning_rate": 0.00019733496353470433, + "loss": 2.1133, + "step": 965 + }, + { + "epoch": 0.10163072067332983, + "grad_norm": 0.899685800075531, + "learning_rate": 0.00019732714305109345, + "loss": 2.0211, + "step": 966 + }, + { + "epoch": 0.10173592845870594, + "grad_norm": 1.8534700870513916, + "learning_rate": 0.00019731931126520195, + "loss": 1.6331, + "step": 967 + }, + { + "epoch": 0.10184113624408206, + "grad_norm": 0.8898396492004395, + "learning_rate": 0.00019731146817793932, + "loss": 2.2339, + "step": 968 + }, + { + "epoch": 0.10194634402945818, + "grad_norm": 1.305379867553711, + "learning_rate": 0.00019730361379021632, + "loss": 1.9012, + "step": 969 + }, + { + "epoch": 0.1020515518148343, + "grad_norm": 1.1972163915634155, + "learning_rate": 0.00019729574810294507, + "loss": 1.6715, + "step": 970 + }, + { + "epoch": 0.10215675960021041, + "grad_norm": 1.1665350198745728, + "learning_rate": 0.00019728787111703895, + "loss": 2.3843, + "step": 971 + }, + { + "epoch": 0.10226196738558653, + "grad_norm": 0.8711062073707581, + "learning_rate": 0.00019727998283341274, + "loss": 2.2372, + "step": 972 + }, + { + "epoch": 0.10236717517096265, + "grad_norm": 0.9705300331115723, + "learning_rate": 0.00019727208325298246, + "loss": 2.2943, + "step": 973 + }, + { + "epoch": 0.10247238295633877, + "grad_norm": 1.7931549549102783, + "learning_rate": 0.00019726417237666546, + "loss": 2.1751, + "step": 974 + }, + { + "epoch": 0.10257759074171488, + "grad_norm": 1.630979061126709, + "learning_rate": 0.00019725625020538038, + "loss": 1.9252, + "step": 975 + }, + { + "epoch": 0.102682798527091, + "grad_norm": 1.412156105041504, + "learning_rate": 0.00019724831674004724, + "loss": 1.9631, + "step": 976 + }, + { + "epoch": 0.10278800631246712, + "grad_norm": 0.7268504500389099, + "learning_rate": 0.00019724037198158733, + "loss": 1.8943, + "step": 977 + }, + { + "epoch": 0.10289321409784324, + "grad_norm": 1.439155101776123, + "learning_rate": 0.00019723241593092318, + "loss": 1.9662, + "step": 978 + }, + { + "epoch": 0.10299842188321935, + "grad_norm": 1.4355570077896118, + "learning_rate": 0.00019722444858897878, + "loss": 2.2064, + "step": 979 + }, + { + "epoch": 0.10310362966859547, + "grad_norm": 0.9133340716362, + "learning_rate": 0.00019721646995667932, + "loss": 2.6105, + "step": 980 + }, + { + "epoch": 0.1032088374539716, + "grad_norm": 0.9703789949417114, + "learning_rate": 0.0001972084800349513, + "loss": 2.2393, + "step": 981 + }, + { + "epoch": 0.10331404523934772, + "grad_norm": 1.9817613363265991, + "learning_rate": 0.00019720047882472262, + "loss": 1.6778, + "step": 982 + }, + { + "epoch": 0.10341925302472382, + "grad_norm": 0.9752820134162903, + "learning_rate": 0.00019719246632692242, + "loss": 2.002, + "step": 983 + }, + { + "epoch": 0.10352446081009994, + "grad_norm": 1.4282121658325195, + "learning_rate": 0.00019718444254248114, + "loss": 1.8544, + "step": 984 + }, + { + "epoch": 0.10362966859547607, + "grad_norm": 0.9460940957069397, + "learning_rate": 0.00019717640747233056, + "loss": 2.1763, + "step": 985 + }, + { + "epoch": 0.10373487638085219, + "grad_norm": 1.4641571044921875, + "learning_rate": 0.00019716836111740378, + "loss": 2.2195, + "step": 986 + }, + { + "epoch": 0.1038400841662283, + "grad_norm": 0.9056519865989685, + "learning_rate": 0.00019716030347863517, + "loss": 2.0533, + "step": 987 + }, + { + "epoch": 0.10394529195160442, + "grad_norm": 1.7422082424163818, + "learning_rate": 0.00019715223455696047, + "loss": 2.1961, + "step": 988 + }, + { + "epoch": 0.10405049973698054, + "grad_norm": 0.7487703561782837, + "learning_rate": 0.0001971441543533167, + "loss": 2.3884, + "step": 989 + }, + { + "epoch": 0.10415570752235666, + "grad_norm": 1.123172402381897, + "learning_rate": 0.0001971360628686422, + "loss": 2.3124, + "step": 990 + }, + { + "epoch": 0.10426091530773277, + "grad_norm": 0.9539334177970886, + "learning_rate": 0.00019712796010387654, + "loss": 2.1406, + "step": 991 + }, + { + "epoch": 0.10436612309310889, + "grad_norm": 1.5190744400024414, + "learning_rate": 0.0001971198460599607, + "loss": 2.081, + "step": 992 + }, + { + "epoch": 0.10447133087848501, + "grad_norm": 0.7697191834449768, + "learning_rate": 0.00019711172073783696, + "loss": 2.2589, + "step": 993 + }, + { + "epoch": 0.10457653866386113, + "grad_norm": 1.2157846689224243, + "learning_rate": 0.0001971035841384489, + "loss": 1.9017, + "step": 994 + }, + { + "epoch": 0.10468174644923724, + "grad_norm": 0.8707440495491028, + "learning_rate": 0.00019709543626274131, + "loss": 2.2788, + "step": 995 + }, + { + "epoch": 0.10478695423461336, + "grad_norm": 1.0644099712371826, + "learning_rate": 0.00019708727711166047, + "loss": 1.8108, + "step": 996 + }, + { + "epoch": 0.10489216201998948, + "grad_norm": 1.3928571939468384, + "learning_rate": 0.00019707910668615382, + "loss": 1.811, + "step": 997 + }, + { + "epoch": 0.1049973698053656, + "grad_norm": 1.6720000505447388, + "learning_rate": 0.00019707092498717023, + "loss": 1.6834, + "step": 998 + }, + { + "epoch": 0.10510257759074171, + "grad_norm": 1.0016800165176392, + "learning_rate": 0.00019706273201565972, + "loss": 2.008, + "step": 999 + }, + { + "epoch": 0.10520778537611783, + "grad_norm": 0.7917373180389404, + "learning_rate": 0.00019705452777257377, + "loss": 2.1877, + "step": 1000 + }, + { + "epoch": 0.10531299316149395, + "grad_norm": 1.1289230585098267, + "learning_rate": 0.00019704631225886515, + "loss": 2.135, + "step": 1001 + }, + { + "epoch": 0.10541820094687007, + "grad_norm": 1.262541651725769, + "learning_rate": 0.00019703808547548782, + "loss": 2.1767, + "step": 1002 + }, + { + "epoch": 0.10552340873224618, + "grad_norm": 0.9078851938247681, + "learning_rate": 0.00019702984742339715, + "loss": 2.0435, + "step": 1003 + }, + { + "epoch": 0.1056286165176223, + "grad_norm": 1.3077322244644165, + "learning_rate": 0.00019702159810354978, + "loss": 1.8528, + "step": 1004 + }, + { + "epoch": 0.10573382430299842, + "grad_norm": 1.0650526285171509, + "learning_rate": 0.00019701333751690378, + "loss": 2.2322, + "step": 1005 + }, + { + "epoch": 0.10583903208837454, + "grad_norm": 1.7082046270370483, + "learning_rate": 0.0001970050656644183, + "loss": 2.0592, + "step": 1006 + }, + { + "epoch": 0.10594423987375065, + "grad_norm": 0.9092219471931458, + "learning_rate": 0.000196996782547054, + "loss": 1.929, + "step": 1007 + }, + { + "epoch": 0.10604944765912677, + "grad_norm": 1.0924969911575317, + "learning_rate": 0.00019698848816577274, + "loss": 2.0751, + "step": 1008 + }, + { + "epoch": 0.1061546554445029, + "grad_norm": 0.8354095220565796, + "learning_rate": 0.0001969801825215377, + "loss": 2.3727, + "step": 1009 + }, + { + "epoch": 0.10625986322987901, + "grad_norm": 0.8786736726760864, + "learning_rate": 0.00019697186561531345, + "loss": 2.0541, + "step": 1010 + }, + { + "epoch": 0.10636507101525512, + "grad_norm": 1.0986034870147705, + "learning_rate": 0.00019696353744806574, + "loss": 2.3069, + "step": 1011 + }, + { + "epoch": 0.10647027880063124, + "grad_norm": 0.7767875790596008, + "learning_rate": 0.00019695519802076175, + "loss": 2.2888, + "step": 1012 + }, + { + "epoch": 0.10657548658600736, + "grad_norm": 1.035365104675293, + "learning_rate": 0.00019694684733436986, + "loss": 1.9233, + "step": 1013 + }, + { + "epoch": 0.10668069437138349, + "grad_norm": 1.2048399448394775, + "learning_rate": 0.00019693848538985983, + "loss": 2.3406, + "step": 1014 + }, + { + "epoch": 0.1067859021567596, + "grad_norm": 0.9446375370025635, + "learning_rate": 0.0001969301121882027, + "loss": 1.9142, + "step": 1015 + }, + { + "epoch": 0.10689110994213571, + "grad_norm": 1.7339941263198853, + "learning_rate": 0.0001969217277303708, + "loss": 1.6881, + "step": 1016 + }, + { + "epoch": 0.10699631772751184, + "grad_norm": 1.0482121706008911, + "learning_rate": 0.00019691333201733786, + "loss": 2.0258, + "step": 1017 + }, + { + "epoch": 0.10710152551288796, + "grad_norm": 1.113286018371582, + "learning_rate": 0.00019690492505007877, + "loss": 1.7817, + "step": 1018 + }, + { + "epoch": 0.10720673329826406, + "grad_norm": 1.4576612710952759, + "learning_rate": 0.00019689650682956986, + "loss": 2.085, + "step": 1019 + }, + { + "epoch": 0.10731194108364019, + "grad_norm": 1.79568612575531, + "learning_rate": 0.00019688807735678866, + "loss": 2.0266, + "step": 1020 + }, + { + "epoch": 0.10741714886901631, + "grad_norm": 1.656327486038208, + "learning_rate": 0.00019687963663271409, + "loss": 2.1882, + "step": 1021 + }, + { + "epoch": 0.10752235665439243, + "grad_norm": 1.273909091949463, + "learning_rate": 0.00019687118465832636, + "loss": 1.9803, + "step": 1022 + }, + { + "epoch": 0.10762756443976854, + "grad_norm": 1.4731061458587646, + "learning_rate": 0.00019686272143460692, + "loss": 1.8951, + "step": 1023 + }, + { + "epoch": 0.10773277222514466, + "grad_norm": 0.7379525899887085, + "learning_rate": 0.00019685424696253858, + "loss": 2.0547, + "step": 1024 + }, + { + "epoch": 0.10783798001052078, + "grad_norm": 0.7799714803695679, + "learning_rate": 0.0001968457612431055, + "loss": 1.9213, + "step": 1025 + }, + { + "epoch": 0.1079431877958969, + "grad_norm": 2.3484795093536377, + "learning_rate": 0.00019683726427729306, + "loss": 2.5578, + "step": 1026 + }, + { + "epoch": 0.10804839558127301, + "grad_norm": 1.0442508459091187, + "learning_rate": 0.000196828756066088, + "loss": 2.1757, + "step": 1027 + }, + { + "epoch": 0.10815360336664913, + "grad_norm": 1.0097154378890991, + "learning_rate": 0.00019682023661047836, + "loss": 1.8894, + "step": 1028 + }, + { + "epoch": 0.10825881115202525, + "grad_norm": 1.412798523902893, + "learning_rate": 0.00019681170591145345, + "loss": 2.2574, + "step": 1029 + }, + { + "epoch": 0.10836401893740137, + "grad_norm": 0.9488776922225952, + "learning_rate": 0.00019680316397000395, + "loss": 1.6457, + "step": 1030 + }, + { + "epoch": 0.10846922672277748, + "grad_norm": 0.8774062395095825, + "learning_rate": 0.00019679461078712178, + "loss": 2.0935, + "step": 1031 + }, + { + "epoch": 0.1085744345081536, + "grad_norm": 1.0559821128845215, + "learning_rate": 0.00019678604636380018, + "loss": 1.9539, + "step": 1032 + }, + { + "epoch": 0.10867964229352972, + "grad_norm": 1.1021692752838135, + "learning_rate": 0.0001967774707010337, + "loss": 2.1199, + "step": 1033 + }, + { + "epoch": 0.10878485007890584, + "grad_norm": 1.441303014755249, + "learning_rate": 0.0001967688837998183, + "loss": 1.7238, + "step": 1034 + }, + { + "epoch": 0.10889005786428195, + "grad_norm": 1.2004631757736206, + "learning_rate": 0.00019676028566115102, + "loss": 2.3156, + "step": 1035 + }, + { + "epoch": 0.10899526564965807, + "grad_norm": 1.1571768522262573, + "learning_rate": 0.0001967516762860304, + "loss": 2.2756, + "step": 1036 + }, + { + "epoch": 0.10910047343503419, + "grad_norm": 0.7429097890853882, + "learning_rate": 0.0001967430556754562, + "loss": 1.4574, + "step": 1037 + }, + { + "epoch": 0.10920568122041031, + "grad_norm": 1.2119436264038086, + "learning_rate": 0.00019673442383042952, + "loss": 2.3933, + "step": 1038 + }, + { + "epoch": 0.10931088900578642, + "grad_norm": 1.4365086555480957, + "learning_rate": 0.00019672578075195272, + "loss": 1.9312, + "step": 1039 + }, + { + "epoch": 0.10941609679116254, + "grad_norm": 1.0024199485778809, + "learning_rate": 0.00019671712644102956, + "loss": 1.9121, + "step": 1040 + }, + { + "epoch": 0.10952130457653866, + "grad_norm": 0.6678622961044312, + "learning_rate": 0.00019670846089866496, + "loss": 1.9272, + "step": 1041 + }, + { + "epoch": 0.10962651236191479, + "grad_norm": 1.1651445627212524, + "learning_rate": 0.00019669978412586528, + "loss": 2.1199, + "step": 1042 + }, + { + "epoch": 0.10973172014729089, + "grad_norm": 1.0801807641983032, + "learning_rate": 0.00019669109612363803, + "loss": 1.9533, + "step": 1043 + }, + { + "epoch": 0.10983692793266701, + "grad_norm": 1.6487115621566772, + "learning_rate": 0.00019668239689299224, + "loss": 1.4197, + "step": 1044 + }, + { + "epoch": 0.10994213571804314, + "grad_norm": 1.0398926734924316, + "learning_rate": 0.00019667368643493804, + "loss": 1.8756, + "step": 1045 + }, + { + "epoch": 0.11004734350341926, + "grad_norm": 1.2380412817001343, + "learning_rate": 0.00019666496475048698, + "loss": 1.731, + "step": 1046 + }, + { + "epoch": 0.11015255128879536, + "grad_norm": 1.0695997476577759, + "learning_rate": 0.00019665623184065187, + "loss": 1.816, + "step": 1047 + }, + { + "epoch": 0.11025775907417149, + "grad_norm": 1.0123080015182495, + "learning_rate": 0.00019664748770644686, + "loss": 2.0089, + "step": 1048 + }, + { + "epoch": 0.1103629668595476, + "grad_norm": 0.980810821056366, + "learning_rate": 0.00019663873234888733, + "loss": 2.0396, + "step": 1049 + }, + { + "epoch": 0.11046817464492373, + "grad_norm": 1.2700474262237549, + "learning_rate": 0.00019662996576899004, + "loss": 2.3745, + "step": 1050 + }, + { + "epoch": 0.11057338243029984, + "grad_norm": 1.1388194561004639, + "learning_rate": 0.00019662118796777303, + "loss": 1.8479, + "step": 1051 + }, + { + "epoch": 0.11067859021567596, + "grad_norm": 1.463036298751831, + "learning_rate": 0.0001966123989462556, + "loss": 1.9751, + "step": 1052 + }, + { + "epoch": 0.11078379800105208, + "grad_norm": 1.2596359252929688, + "learning_rate": 0.00019660359870545845, + "loss": 2.0322, + "step": 1053 + }, + { + "epoch": 0.1108890057864282, + "grad_norm": 0.8281905055046082, + "learning_rate": 0.00019659478724640348, + "loss": 1.9301, + "step": 1054 + }, + { + "epoch": 0.1109942135718043, + "grad_norm": 1.469774603843689, + "learning_rate": 0.00019658596457011393, + "loss": 1.5288, + "step": 1055 + }, + { + "epoch": 0.11109942135718043, + "grad_norm": 0.8355114459991455, + "learning_rate": 0.0001965771306776144, + "loss": 2.1748, + "step": 1056 + }, + { + "epoch": 0.11120462914255655, + "grad_norm": 0.8642022013664246, + "learning_rate": 0.00019656828556993068, + "loss": 1.6075, + "step": 1057 + }, + { + "epoch": 0.11130983692793267, + "grad_norm": 0.6303163170814514, + "learning_rate": 0.00019655942924808994, + "loss": 2.3366, + "step": 1058 + }, + { + "epoch": 0.11141504471330879, + "grad_norm": 0.9480810761451721, + "learning_rate": 0.00019655056171312069, + "loss": 1.8519, + "step": 1059 + }, + { + "epoch": 0.1115202524986849, + "grad_norm": 0.8360817432403564, + "learning_rate": 0.0001965416829660526, + "loss": 2.1247, + "step": 1060 + }, + { + "epoch": 0.11162546028406102, + "grad_norm": 0.9040228724479675, + "learning_rate": 0.0001965327930079168, + "loss": 2.2492, + "step": 1061 + }, + { + "epoch": 0.11173066806943714, + "grad_norm": 0.8252432942390442, + "learning_rate": 0.00019652389183974557, + "loss": 1.7289, + "step": 1062 + }, + { + "epoch": 0.11183587585481326, + "grad_norm": 0.783162534236908, + "learning_rate": 0.00019651497946257266, + "loss": 2.3029, + "step": 1063 + }, + { + "epoch": 0.11194108364018937, + "grad_norm": 0.8001965284347534, + "learning_rate": 0.00019650605587743302, + "loss": 1.6801, + "step": 1064 + }, + { + "epoch": 0.11204629142556549, + "grad_norm": 1.3523527383804321, + "learning_rate": 0.00019649712108536286, + "loss": 2.1644, + "step": 1065 + }, + { + "epoch": 0.11215149921094161, + "grad_norm": 1.2581042051315308, + "learning_rate": 0.00019648817508739983, + "loss": 2.1609, + "step": 1066 + }, + { + "epoch": 0.11225670699631773, + "grad_norm": 1.0684157609939575, + "learning_rate": 0.00019647921788458272, + "loss": 2.4841, + "step": 1067 + }, + { + "epoch": 0.11236191478169384, + "grad_norm": 0.9492899775505066, + "learning_rate": 0.00019647024947795175, + "loss": 1.8177, + "step": 1068 + }, + { + "epoch": 0.11246712256706996, + "grad_norm": 1.1650155782699585, + "learning_rate": 0.00019646126986854837, + "loss": 1.6481, + "step": 1069 + }, + { + "epoch": 0.11257233035244608, + "grad_norm": 1.2191792726516724, + "learning_rate": 0.00019645227905741534, + "loss": 1.7948, + "step": 1070 + }, + { + "epoch": 0.1126775381378222, + "grad_norm": 1.074036717414856, + "learning_rate": 0.0001964432770455968, + "loss": 1.764, + "step": 1071 + }, + { + "epoch": 0.11278274592319831, + "grad_norm": 1.0380913019180298, + "learning_rate": 0.00019643426383413805, + "loss": 1.526, + "step": 1072 + }, + { + "epoch": 0.11288795370857443, + "grad_norm": 1.4645812511444092, + "learning_rate": 0.0001964252394240858, + "loss": 2.016, + "step": 1073 + }, + { + "epoch": 0.11299316149395056, + "grad_norm": 0.8805195093154907, + "learning_rate": 0.000196416203816488, + "loss": 2.0084, + "step": 1074 + }, + { + "epoch": 0.11309836927932668, + "grad_norm": 1.593756079673767, + "learning_rate": 0.00019640715701239395, + "loss": 2.1586, + "step": 1075 + }, + { + "epoch": 0.11320357706470278, + "grad_norm": 0.9434992671012878, + "learning_rate": 0.00019639809901285423, + "loss": 1.8971, + "step": 1076 + }, + { + "epoch": 0.1133087848500789, + "grad_norm": 1.0768349170684814, + "learning_rate": 0.00019638902981892068, + "loss": 2.289, + "step": 1077 + }, + { + "epoch": 0.11341399263545503, + "grad_norm": 1.1029759645462036, + "learning_rate": 0.0001963799494316465, + "loss": 2.2014, + "step": 1078 + }, + { + "epoch": 0.11351920042083115, + "grad_norm": 1.1551514863967896, + "learning_rate": 0.0001963708578520862, + "loss": 2.3707, + "step": 1079 + }, + { + "epoch": 0.11362440820620726, + "grad_norm": 1.2115519046783447, + "learning_rate": 0.00019636175508129552, + "loss": 1.4232, + "step": 1080 + }, + { + "epoch": 0.11372961599158338, + "grad_norm": 1.1642786264419556, + "learning_rate": 0.0001963526411203315, + "loss": 2.1658, + "step": 1081 + }, + { + "epoch": 0.1138348237769595, + "grad_norm": 1.1630796194076538, + "learning_rate": 0.00019634351597025255, + "loss": 2.0171, + "step": 1082 + }, + { + "epoch": 0.11394003156233562, + "grad_norm": 1.058016061782837, + "learning_rate": 0.00019633437963211832, + "loss": 1.5272, + "step": 1083 + }, + { + "epoch": 0.11404523934771173, + "grad_norm": 1.1244498491287231, + "learning_rate": 0.00019632523210698987, + "loss": 2.4017, + "step": 1084 + }, + { + "epoch": 0.11415044713308785, + "grad_norm": 1.0353667736053467, + "learning_rate": 0.00019631607339592937, + "loss": 1.9151, + "step": 1085 + }, + { + "epoch": 0.11425565491846397, + "grad_norm": 0.735931396484375, + "learning_rate": 0.00019630690350000042, + "loss": 2.3843, + "step": 1086 + }, + { + "epoch": 0.11436086270384009, + "grad_norm": 0.9920666217803955, + "learning_rate": 0.00019629772242026793, + "loss": 2.1867, + "step": 1087 + }, + { + "epoch": 0.1144660704892162, + "grad_norm": 0.9778599739074707, + "learning_rate": 0.000196288530157798, + "loss": 2.3535, + "step": 1088 + }, + { + "epoch": 0.11457127827459232, + "grad_norm": 1.1692768335342407, + "learning_rate": 0.00019627932671365813, + "loss": 1.9833, + "step": 1089 + }, + { + "epoch": 0.11467648605996844, + "grad_norm": 1.3122508525848389, + "learning_rate": 0.00019627011208891713, + "loss": 1.8988, + "step": 1090 + }, + { + "epoch": 0.11478169384534456, + "grad_norm": 1.044777512550354, + "learning_rate": 0.00019626088628464498, + "loss": 1.92, + "step": 1091 + }, + { + "epoch": 0.11488690163072067, + "grad_norm": 0.8848743438720703, + "learning_rate": 0.00019625164930191311, + "loss": 2.4497, + "step": 1092 + }, + { + "epoch": 0.11499210941609679, + "grad_norm": 0.9451887607574463, + "learning_rate": 0.00019624240114179416, + "loss": 1.7285, + "step": 1093 + }, + { + "epoch": 0.11509731720147291, + "grad_norm": 0.9410362243652344, + "learning_rate": 0.00019623314180536205, + "loss": 1.9463, + "step": 1094 + }, + { + "epoch": 0.11520252498684903, + "grad_norm": 1.023637294769287, + "learning_rate": 0.00019622387129369212, + "loss": 1.8648, + "step": 1095 + }, + { + "epoch": 0.11530773277222514, + "grad_norm": 1.2272717952728271, + "learning_rate": 0.00019621458960786083, + "loss": 2.1191, + "step": 1096 + }, + { + "epoch": 0.11541294055760126, + "grad_norm": 1.150048851966858, + "learning_rate": 0.0001962052967489461, + "loss": 1.7046, + "step": 1097 + }, + { + "epoch": 0.11551814834297738, + "grad_norm": 0.7519487738609314, + "learning_rate": 0.00019619599271802706, + "loss": 2.2551, + "step": 1098 + }, + { + "epoch": 0.1156233561283535, + "grad_norm": 1.0038561820983887, + "learning_rate": 0.00019618667751618416, + "loss": 2.2381, + "step": 1099 + }, + { + "epoch": 0.11572856391372961, + "grad_norm": 0.9891253113746643, + "learning_rate": 0.0001961773511444991, + "loss": 2.2862, + "step": 1100 + }, + { + "epoch": 0.11583377169910573, + "grad_norm": 0.9823799729347229, + "learning_rate": 0.00019616801360405499, + "loss": 1.7907, + "step": 1101 + }, + { + "epoch": 0.11593897948448185, + "grad_norm": 2.5130529403686523, + "learning_rate": 0.0001961586648959361, + "loss": 2.1288, + "step": 1102 + }, + { + "epoch": 0.11604418726985798, + "grad_norm": 0.940342366695404, + "learning_rate": 0.00019614930502122812, + "loss": 2.1486, + "step": 1103 + }, + { + "epoch": 0.11614939505523408, + "grad_norm": 1.1634947061538696, + "learning_rate": 0.00019613993398101795, + "loss": 1.3548, + "step": 1104 + }, + { + "epoch": 0.1162546028406102, + "grad_norm": 1.2739289999008179, + "learning_rate": 0.00019613055177639384, + "loss": 2.4006, + "step": 1105 + }, + { + "epoch": 0.11635981062598633, + "grad_norm": 0.7770984768867493, + "learning_rate": 0.0001961211584084453, + "loss": 2.2417, + "step": 1106 + }, + { + "epoch": 0.11646501841136245, + "grad_norm": 0.9923128485679626, + "learning_rate": 0.00019611175387826315, + "loss": 1.9447, + "step": 1107 + }, + { + "epoch": 0.11657022619673856, + "grad_norm": 1.3460139036178589, + "learning_rate": 0.00019610233818693953, + "loss": 1.8778, + "step": 1108 + }, + { + "epoch": 0.11667543398211468, + "grad_norm": 0.7878180146217346, + "learning_rate": 0.0001960929113355678, + "loss": 1.8391, + "step": 1109 + }, + { + "epoch": 0.1167806417674908, + "grad_norm": 1.1408841609954834, + "learning_rate": 0.00019608347332524272, + "loss": 1.8246, + "step": 1110 + }, + { + "epoch": 0.11688584955286692, + "grad_norm": 1.3990023136138916, + "learning_rate": 0.00019607402415706027, + "loss": 1.8562, + "step": 1111 + }, + { + "epoch": 0.11699105733824303, + "grad_norm": 1.045021414756775, + "learning_rate": 0.00019606456383211777, + "loss": 1.9724, + "step": 1112 + }, + { + "epoch": 0.11709626512361915, + "grad_norm": 0.6789584755897522, + "learning_rate": 0.0001960550923515138, + "loss": 2.149, + "step": 1113 + }, + { + "epoch": 0.11720147290899527, + "grad_norm": 0.8170668482780457, + "learning_rate": 0.00019604560971634826, + "loss": 2.0608, + "step": 1114 + }, + { + "epoch": 0.11730668069437139, + "grad_norm": 0.6909334659576416, + "learning_rate": 0.00019603611592772233, + "loss": 2.1037, + "step": 1115 + }, + { + "epoch": 0.1174118884797475, + "grad_norm": 0.8929466605186462, + "learning_rate": 0.0001960266109867385, + "loss": 1.7609, + "step": 1116 + }, + { + "epoch": 0.11751709626512362, + "grad_norm": 1.2429050207138062, + "learning_rate": 0.00019601709489450056, + "loss": 1.6607, + "step": 1117 + }, + { + "epoch": 0.11762230405049974, + "grad_norm": 0.7718959450721741, + "learning_rate": 0.00019600756765211354, + "loss": 2.273, + "step": 1118 + }, + { + "epoch": 0.11772751183587586, + "grad_norm": 1.4449671506881714, + "learning_rate": 0.00019599802926068384, + "loss": 1.9564, + "step": 1119 + }, + { + "epoch": 0.11783271962125197, + "grad_norm": 0.955116868019104, + "learning_rate": 0.00019598847972131914, + "loss": 1.8775, + "step": 1120 + }, + { + "epoch": 0.11793792740662809, + "grad_norm": 1.5084177255630493, + "learning_rate": 0.00019597891903512835, + "loss": 2.0589, + "step": 1121 + }, + { + "epoch": 0.11804313519200421, + "grad_norm": 1.039664626121521, + "learning_rate": 0.00019596934720322176, + "loss": 2.331, + "step": 1122 + }, + { + "epoch": 0.11814834297738033, + "grad_norm": 1.2235255241394043, + "learning_rate": 0.00019595976422671086, + "loss": 1.7996, + "step": 1123 + }, + { + "epoch": 0.11825355076275644, + "grad_norm": 0.9834601283073425, + "learning_rate": 0.00019595017010670858, + "loss": 2.4876, + "step": 1124 + }, + { + "epoch": 0.11835875854813256, + "grad_norm": 1.0267283916473389, + "learning_rate": 0.00019594056484432897, + "loss": 2.1381, + "step": 1125 + }, + { + "epoch": 0.11846396633350868, + "grad_norm": 1.187152624130249, + "learning_rate": 0.00019593094844068748, + "loss": 2.1336, + "step": 1126 + }, + { + "epoch": 0.1185691741188848, + "grad_norm": 0.8260311484336853, + "learning_rate": 0.00019592132089690085, + "loss": 2.0608, + "step": 1127 + }, + { + "epoch": 0.11867438190426091, + "grad_norm": 0.8362654447555542, + "learning_rate": 0.0001959116822140871, + "loss": 2.2765, + "step": 1128 + }, + { + "epoch": 0.11877958968963703, + "grad_norm": 0.9070550799369812, + "learning_rate": 0.00019590203239336552, + "loss": 2.1471, + "step": 1129 + }, + { + "epoch": 0.11888479747501315, + "grad_norm": 0.8633410334587097, + "learning_rate": 0.0001958923714358567, + "loss": 2.2632, + "step": 1130 + }, + { + "epoch": 0.11899000526038928, + "grad_norm": 1.0233980417251587, + "learning_rate": 0.00019588269934268257, + "loss": 2.0631, + "step": 1131 + }, + { + "epoch": 0.11909521304576538, + "grad_norm": 1.4428153038024902, + "learning_rate": 0.00019587301611496632, + "loss": 1.646, + "step": 1132 + }, + { + "epoch": 0.1192004208311415, + "grad_norm": 1.2302439212799072, + "learning_rate": 0.00019586332175383238, + "loss": 2.0555, + "step": 1133 + }, + { + "epoch": 0.11930562861651763, + "grad_norm": 0.8173778057098389, + "learning_rate": 0.00019585361626040654, + "loss": 1.8569, + "step": 1134 + }, + { + "epoch": 0.11941083640189375, + "grad_norm": 1.5377962589263916, + "learning_rate": 0.00019584389963581592, + "loss": 1.9263, + "step": 1135 + }, + { + "epoch": 0.11951604418726985, + "grad_norm": 0.9011675119400024, + "learning_rate": 0.00019583417188118882, + "loss": 1.7328, + "step": 1136 + }, + { + "epoch": 0.11962125197264598, + "grad_norm": 1.0108987092971802, + "learning_rate": 0.0001958244329976549, + "loss": 1.8854, + "step": 1137 + }, + { + "epoch": 0.1197264597580221, + "grad_norm": 1.1729274988174438, + "learning_rate": 0.00019581468298634515, + "loss": 2.1649, + "step": 1138 + }, + { + "epoch": 0.11983166754339822, + "grad_norm": 1.0889396667480469, + "learning_rate": 0.00019580492184839175, + "loss": 2.0792, + "step": 1139 + }, + { + "epoch": 0.11993687532877433, + "grad_norm": 1.3217434883117676, + "learning_rate": 0.00019579514958492826, + "loss": 2.1042, + "step": 1140 + }, + { + "epoch": 0.12004208311415045, + "grad_norm": 1.0219428539276123, + "learning_rate": 0.00019578536619708952, + "loss": 2.3675, + "step": 1141 + }, + { + "epoch": 0.12014729089952657, + "grad_norm": 1.771694302558899, + "learning_rate": 0.0001957755716860116, + "loss": 2.1605, + "step": 1142 + }, + { + "epoch": 0.12025249868490269, + "grad_norm": 1.005615472793579, + "learning_rate": 0.0001957657660528319, + "loss": 1.9396, + "step": 1143 + }, + { + "epoch": 0.1203577064702788, + "grad_norm": 0.8795095086097717, + "learning_rate": 0.00019575594929868918, + "loss": 2.2603, + "step": 1144 + }, + { + "epoch": 0.12046291425565492, + "grad_norm": 1.1998109817504883, + "learning_rate": 0.00019574612142472334, + "loss": 2.5464, + "step": 1145 + }, + { + "epoch": 0.12056812204103104, + "grad_norm": 0.9663223624229431, + "learning_rate": 0.00019573628243207573, + "loss": 1.8903, + "step": 1146 + }, + { + "epoch": 0.12067332982640716, + "grad_norm": 0.9603292346000671, + "learning_rate": 0.0001957264323218889, + "loss": 2.1182, + "step": 1147 + }, + { + "epoch": 0.12077853761178327, + "grad_norm": 0.8244251608848572, + "learning_rate": 0.00019571657109530667, + "loss": 2.0749, + "step": 1148 + }, + { + "epoch": 0.12088374539715939, + "grad_norm": 1.2357412576675415, + "learning_rate": 0.00019570669875347427, + "loss": 1.8142, + "step": 1149 + }, + { + "epoch": 0.12098895318253551, + "grad_norm": 1.6487301588058472, + "learning_rate": 0.00019569681529753806, + "loss": 2.057, + "step": 1150 + }, + { + "epoch": 0.12109416096791163, + "grad_norm": 1.3535126447677612, + "learning_rate": 0.00019568692072864581, + "loss": 1.8645, + "step": 1151 + }, + { + "epoch": 0.12119936875328774, + "grad_norm": 0.983331024646759, + "learning_rate": 0.0001956770150479466, + "loss": 2.2773, + "step": 1152 + }, + { + "epoch": 0.12130457653866386, + "grad_norm": 0.625877320766449, + "learning_rate": 0.00019566709825659064, + "loss": 2.2146, + "step": 1153 + }, + { + "epoch": 0.12140978432403998, + "grad_norm": 1.0601180791854858, + "learning_rate": 0.0001956571703557296, + "loss": 2.2351, + "step": 1154 + }, + { + "epoch": 0.1215149921094161, + "grad_norm": 1.0137640237808228, + "learning_rate": 0.00019564723134651634, + "loss": 2.5221, + "step": 1155 + }, + { + "epoch": 0.12162019989479221, + "grad_norm": 1.044480323791504, + "learning_rate": 0.0001956372812301051, + "loss": 2.14, + "step": 1156 + }, + { + "epoch": 0.12172540768016833, + "grad_norm": 1.44232976436615, + "learning_rate": 0.00019562732000765127, + "loss": 1.9134, + "step": 1157 + }, + { + "epoch": 0.12183061546554445, + "grad_norm": 1.2370481491088867, + "learning_rate": 0.0001956173476803117, + "loss": 1.727, + "step": 1158 + }, + { + "epoch": 0.12193582325092057, + "grad_norm": 1.006666898727417, + "learning_rate": 0.00019560736424924439, + "loss": 2.221, + "step": 1159 + }, + { + "epoch": 0.12204103103629668, + "grad_norm": 0.8660513162612915, + "learning_rate": 0.0001955973697156087, + "loss": 2.1285, + "step": 1160 + }, + { + "epoch": 0.1221462388216728, + "grad_norm": 0.7105367183685303, + "learning_rate": 0.00019558736408056525, + "loss": 2.0834, + "step": 1161 + }, + { + "epoch": 0.12225144660704892, + "grad_norm": 1.236364483833313, + "learning_rate": 0.000195577347345276, + "loss": 1.8932, + "step": 1162 + }, + { + "epoch": 0.12235665439242505, + "grad_norm": 1.5001050233840942, + "learning_rate": 0.0001955673195109041, + "loss": 2.263, + "step": 1163 + }, + { + "epoch": 0.12246186217780115, + "grad_norm": 0.8300707936286926, + "learning_rate": 0.0001955572805786141, + "loss": 2.3241, + "step": 1164 + }, + { + "epoch": 0.12256706996317727, + "grad_norm": 1.1796188354492188, + "learning_rate": 0.00019554723054957175, + "loss": 1.7304, + "step": 1165 + }, + { + "epoch": 0.1226722777485534, + "grad_norm": 1.1886162757873535, + "learning_rate": 0.00019553716942494415, + "loss": 1.8389, + "step": 1166 + }, + { + "epoch": 0.12277748553392952, + "grad_norm": 1.0573186874389648, + "learning_rate": 0.00019552709720589966, + "loss": 2.0146, + "step": 1167 + }, + { + "epoch": 0.12288269331930562, + "grad_norm": 0.8099238276481628, + "learning_rate": 0.00019551701389360795, + "loss": 1.593, + "step": 1168 + }, + { + "epoch": 0.12298790110468175, + "grad_norm": 1.0094003677368164, + "learning_rate": 0.00019550691948923992, + "loss": 2.4218, + "step": 1169 + }, + { + "epoch": 0.12309310889005787, + "grad_norm": 0.8765414953231812, + "learning_rate": 0.00019549681399396785, + "loss": 2.2399, + "step": 1170 + }, + { + "epoch": 0.12319831667543399, + "grad_norm": 1.4000588655471802, + "learning_rate": 0.00019548669740896525, + "loss": 1.7225, + "step": 1171 + }, + { + "epoch": 0.1233035244608101, + "grad_norm": 1.909455418586731, + "learning_rate": 0.0001954765697354069, + "loss": 2.0569, + "step": 1172 + }, + { + "epoch": 0.12340873224618622, + "grad_norm": 1.0092931985855103, + "learning_rate": 0.00019546643097446888, + "loss": 1.8983, + "step": 1173 + }, + { + "epoch": 0.12351394003156234, + "grad_norm": 1.216364860534668, + "learning_rate": 0.0001954562811273286, + "loss": 2.0644, + "step": 1174 + }, + { + "epoch": 0.12361914781693846, + "grad_norm": 1.245184302330017, + "learning_rate": 0.00019544612019516472, + "loss": 1.4739, + "step": 1175 + }, + { + "epoch": 0.12372435560231457, + "grad_norm": 1.5006418228149414, + "learning_rate": 0.00019543594817915722, + "loss": 2.0464, + "step": 1176 + }, + { + "epoch": 0.12382956338769069, + "grad_norm": 1.2883630990982056, + "learning_rate": 0.00019542576508048732, + "loss": 1.9773, + "step": 1177 + }, + { + "epoch": 0.12393477117306681, + "grad_norm": 1.4748666286468506, + "learning_rate": 0.00019541557090033753, + "loss": 2.0239, + "step": 1178 + }, + { + "epoch": 0.12403997895844293, + "grad_norm": 1.251774549484253, + "learning_rate": 0.0001954053656398917, + "loss": 1.8751, + "step": 1179 + }, + { + "epoch": 0.12414518674381904, + "grad_norm": 1.2670092582702637, + "learning_rate": 0.00019539514930033493, + "loss": 2.093, + "step": 1180 + }, + { + "epoch": 0.12425039452919516, + "grad_norm": 0.845373272895813, + "learning_rate": 0.00019538492188285358, + "loss": 2.2009, + "step": 1181 + }, + { + "epoch": 0.12435560231457128, + "grad_norm": 1.3684707880020142, + "learning_rate": 0.00019537468338863537, + "loss": 2.1124, + "step": 1182 + }, + { + "epoch": 0.1244608100999474, + "grad_norm": 1.0904892683029175, + "learning_rate": 0.0001953644338188692, + "loss": 2.2674, + "step": 1183 + }, + { + "epoch": 0.12456601788532351, + "grad_norm": 1.0843944549560547, + "learning_rate": 0.0001953541731747454, + "loss": 2.1427, + "step": 1184 + }, + { + "epoch": 0.12467122567069963, + "grad_norm": 1.4140195846557617, + "learning_rate": 0.00019534390145745545, + "loss": 1.9423, + "step": 1185 + }, + { + "epoch": 0.12477643345607575, + "grad_norm": 1.2071352005004883, + "learning_rate": 0.00019533361866819218, + "loss": 1.7771, + "step": 1186 + }, + { + "epoch": 0.12488164124145187, + "grad_norm": 0.9805304408073425, + "learning_rate": 0.0001953233248081497, + "loss": 2.1534, + "step": 1187 + }, + { + "epoch": 0.12498684902682798, + "grad_norm": 0.8106799721717834, + "learning_rate": 0.0001953130198785234, + "loss": 1.8775, + "step": 1188 + }, + { + "epoch": 0.12509205681220412, + "grad_norm": 1.2186012268066406, + "learning_rate": 0.00019530270388050998, + "loss": 2.0908, + "step": 1189 + }, + { + "epoch": 0.1251972645975802, + "grad_norm": 1.270003318786621, + "learning_rate": 0.00019529237681530735, + "loss": 1.9182, + "step": 1190 + }, + { + "epoch": 0.12530247238295633, + "grad_norm": 1.1836543083190918, + "learning_rate": 0.00019528203868411482, + "loss": 2.2695, + "step": 1191 + }, + { + "epoch": 0.12540768016833245, + "grad_norm": 1.4582277536392212, + "learning_rate": 0.00019527168948813288, + "loss": 1.7637, + "step": 1192 + }, + { + "epoch": 0.12551288795370857, + "grad_norm": 1.24919593334198, + "learning_rate": 0.00019526132922856334, + "loss": 1.8924, + "step": 1193 + }, + { + "epoch": 0.1256180957390847, + "grad_norm": 1.0060888528823853, + "learning_rate": 0.00019525095790660937, + "loss": 2.2829, + "step": 1194 + }, + { + "epoch": 0.12572330352446082, + "grad_norm": 1.7646416425704956, + "learning_rate": 0.00019524057552347527, + "loss": 2.0088, + "step": 1195 + }, + { + "epoch": 0.12582851130983694, + "grad_norm": 1.0595672130584717, + "learning_rate": 0.00019523018208036677, + "loss": 1.9792, + "step": 1196 + }, + { + "epoch": 0.12593371909521306, + "grad_norm": 0.9665113091468811, + "learning_rate": 0.00019521977757849083, + "loss": 1.8552, + "step": 1197 + }, + { + "epoch": 0.12603892688058915, + "grad_norm": 0.996906042098999, + "learning_rate": 0.00019520936201905566, + "loss": 1.9122, + "step": 1198 + }, + { + "epoch": 0.12614413466596527, + "grad_norm": 1.0974137783050537, + "learning_rate": 0.0001951989354032708, + "loss": 2.3548, + "step": 1199 + }, + { + "epoch": 0.1262493424513414, + "grad_norm": 1.2760165929794312, + "learning_rate": 0.00019518849773234704, + "loss": 1.9927, + "step": 1200 + }, + { + "epoch": 0.12635455023671752, + "grad_norm": 0.8448911309242249, + "learning_rate": 0.0001951780490074965, + "loss": 2.1343, + "step": 1201 + }, + { + "epoch": 0.12645975802209364, + "grad_norm": 0.8463537096977234, + "learning_rate": 0.00019516758922993256, + "loss": 2.4417, + "step": 1202 + }, + { + "epoch": 0.12656496580746976, + "grad_norm": 0.971531331539154, + "learning_rate": 0.0001951571184008698, + "loss": 2.0901, + "step": 1203 + }, + { + "epoch": 0.12667017359284588, + "grad_norm": 1.1150871515274048, + "learning_rate": 0.00019514663652152428, + "loss": 2.3631, + "step": 1204 + }, + { + "epoch": 0.126775381378222, + "grad_norm": 1.44049870967865, + "learning_rate": 0.00019513614359311315, + "loss": 1.9413, + "step": 1205 + }, + { + "epoch": 0.1268805891635981, + "grad_norm": 1.1765494346618652, + "learning_rate": 0.00019512563961685494, + "loss": 2.2888, + "step": 1206 + }, + { + "epoch": 0.12698579694897422, + "grad_norm": 1.1426244974136353, + "learning_rate": 0.00019511512459396944, + "loss": 1.7372, + "step": 1207 + }, + { + "epoch": 0.12709100473435034, + "grad_norm": 1.2937824726104736, + "learning_rate": 0.00019510459852567773, + "loss": 1.7583, + "step": 1208 + }, + { + "epoch": 0.12719621251972646, + "grad_norm": 1.0427653789520264, + "learning_rate": 0.0001950940614132022, + "loss": 2.2694, + "step": 1209 + }, + { + "epoch": 0.12730142030510258, + "grad_norm": 1.0768921375274658, + "learning_rate": 0.00019508351325776642, + "loss": 2.0166, + "step": 1210 + }, + { + "epoch": 0.1274066280904787, + "grad_norm": 1.1876052618026733, + "learning_rate": 0.00019507295406059533, + "loss": 2.2435, + "step": 1211 + }, + { + "epoch": 0.12751183587585482, + "grad_norm": 1.4047654867172241, + "learning_rate": 0.0001950623838229152, + "loss": 1.9735, + "step": 1212 + }, + { + "epoch": 0.12761704366123094, + "grad_norm": 1.5884112119674683, + "learning_rate": 0.00019505180254595343, + "loss": 1.8768, + "step": 1213 + }, + { + "epoch": 0.12772225144660704, + "grad_norm": 0.8576595187187195, + "learning_rate": 0.00019504121023093888, + "loss": 2.186, + "step": 1214 + }, + { + "epoch": 0.12782745923198316, + "grad_norm": 1.0435853004455566, + "learning_rate": 0.00019503060687910148, + "loss": 1.963, + "step": 1215 + }, + { + "epoch": 0.12793266701735928, + "grad_norm": 1.279901385307312, + "learning_rate": 0.0001950199924916727, + "loss": 1.7715, + "step": 1216 + }, + { + "epoch": 0.1280378748027354, + "grad_norm": 0.7529373168945312, + "learning_rate": 0.00019500936706988502, + "loss": 2.0821, + "step": 1217 + }, + { + "epoch": 0.12814308258811152, + "grad_norm": 1.64811372756958, + "learning_rate": 0.00019499873061497246, + "loss": 2.1265, + "step": 1218 + }, + { + "epoch": 0.12824829037348764, + "grad_norm": 1.1517812013626099, + "learning_rate": 0.00019498808312817006, + "loss": 1.9678, + "step": 1219 + }, + { + "epoch": 0.12835349815886377, + "grad_norm": 1.0397827625274658, + "learning_rate": 0.00019497742461071441, + "loss": 1.9882, + "step": 1220 + }, + { + "epoch": 0.1284587059442399, + "grad_norm": 1.1834861040115356, + "learning_rate": 0.0001949667550638432, + "loss": 2.175, + "step": 1221 + }, + { + "epoch": 0.12856391372961598, + "grad_norm": 1.0176396369934082, + "learning_rate": 0.00019495607448879546, + "loss": 1.9798, + "step": 1222 + }, + { + "epoch": 0.1286691215149921, + "grad_norm": 1.2823526859283447, + "learning_rate": 0.00019494538288681145, + "loss": 1.8454, + "step": 1223 + }, + { + "epoch": 0.12877432930036822, + "grad_norm": 1.0017826557159424, + "learning_rate": 0.00019493468025913276, + "loss": 2.1296, + "step": 1224 + }, + { + "epoch": 0.12887953708574434, + "grad_norm": 0.8220755457878113, + "learning_rate": 0.00019492396660700226, + "loss": 1.9987, + "step": 1225 + }, + { + "epoch": 0.12898474487112047, + "grad_norm": 0.9506868124008179, + "learning_rate": 0.00019491324193166408, + "loss": 2.3586, + "step": 1226 + }, + { + "epoch": 0.1290899526564966, + "grad_norm": 0.796829342842102, + "learning_rate": 0.00019490250623436367, + "loss": 1.4214, + "step": 1227 + }, + { + "epoch": 0.1291951604418727, + "grad_norm": 1.117042899131775, + "learning_rate": 0.00019489175951634775, + "loss": 1.7607, + "step": 1228 + }, + { + "epoch": 0.12930036822724883, + "grad_norm": 1.1482354402542114, + "learning_rate": 0.00019488100177886427, + "loss": 1.782, + "step": 1229 + }, + { + "epoch": 0.12940557601262492, + "grad_norm": 0.8992315530776978, + "learning_rate": 0.00019487023302316243, + "loss": 2.3632, + "step": 1230 + }, + { + "epoch": 0.12951078379800104, + "grad_norm": 0.9906787872314453, + "learning_rate": 0.00019485945325049288, + "loss": 1.8036, + "step": 1231 + }, + { + "epoch": 0.12961599158337717, + "grad_norm": 1.0407614707946777, + "learning_rate": 0.00019484866246210738, + "loss": 1.9722, + "step": 1232 + }, + { + "epoch": 0.1297211993687533, + "grad_norm": 1.3917356729507446, + "learning_rate": 0.00019483786065925904, + "loss": 1.6965, + "step": 1233 + }, + { + "epoch": 0.1298264071541294, + "grad_norm": 0.9043229818344116, + "learning_rate": 0.0001948270478432022, + "loss": 1.9409, + "step": 1234 + }, + { + "epoch": 0.12993161493950553, + "grad_norm": 1.2875796556472778, + "learning_rate": 0.0001948162240151926, + "loss": 2.0062, + "step": 1235 + }, + { + "epoch": 0.13003682272488165, + "grad_norm": 1.3274993896484375, + "learning_rate": 0.00019480538917648711, + "loss": 1.8964, + "step": 1236 + }, + { + "epoch": 0.13014203051025777, + "grad_norm": 0.974236786365509, + "learning_rate": 0.00019479454332834396, + "loss": 1.4836, + "step": 1237 + }, + { + "epoch": 0.13024723829563387, + "grad_norm": 0.9269886016845703, + "learning_rate": 0.00019478368647202264, + "loss": 2.19, + "step": 1238 + }, + { + "epoch": 0.13035244608101, + "grad_norm": 0.7717685699462891, + "learning_rate": 0.0001947728186087839, + "loss": 2.4556, + "step": 1239 + }, + { + "epoch": 0.1304576538663861, + "grad_norm": 1.7023308277130127, + "learning_rate": 0.00019476193973988988, + "loss": 2.433, + "step": 1240 + }, + { + "epoch": 0.13056286165176223, + "grad_norm": 1.3712750673294067, + "learning_rate": 0.0001947510498666038, + "loss": 2.1469, + "step": 1241 + }, + { + "epoch": 0.13066806943713835, + "grad_norm": 1.1237108707427979, + "learning_rate": 0.0001947401489901903, + "loss": 2.3985, + "step": 1242 + }, + { + "epoch": 0.13077327722251447, + "grad_norm": 1.0519534349441528, + "learning_rate": 0.0001947292371119153, + "loss": 2.3266, + "step": 1243 + }, + { + "epoch": 0.1308784850078906, + "grad_norm": 0.9474523663520813, + "learning_rate": 0.0001947183142330459, + "loss": 2.0388, + "step": 1244 + }, + { + "epoch": 0.13098369279326671, + "grad_norm": 1.21075439453125, + "learning_rate": 0.00019470738035485058, + "loss": 1.7828, + "step": 1245 + }, + { + "epoch": 0.1310889005786428, + "grad_norm": 0.9895322322845459, + "learning_rate": 0.00019469643547859904, + "loss": 2.0604, + "step": 1246 + }, + { + "epoch": 0.13119410836401893, + "grad_norm": 1.5811057090759277, + "learning_rate": 0.0001946854796055623, + "loss": 1.7441, + "step": 1247 + }, + { + "epoch": 0.13129931614939505, + "grad_norm": 1.3504817485809326, + "learning_rate": 0.00019467451273701256, + "loss": 2.4175, + "step": 1248 + }, + { + "epoch": 0.13140452393477117, + "grad_norm": 1.2100443840026855, + "learning_rate": 0.00019466353487422345, + "loss": 2.0371, + "step": 1249 + }, + { + "epoch": 0.1315097317201473, + "grad_norm": 0.8898875713348389, + "learning_rate": 0.00019465254601846974, + "loss": 2.3904, + "step": 1250 + }, + { + "epoch": 0.13161493950552342, + "grad_norm": 1.0012081861495972, + "learning_rate": 0.00019464154617102755, + "loss": 1.8023, + "step": 1251 + }, + { + "epoch": 0.13172014729089954, + "grad_norm": 1.0802556276321411, + "learning_rate": 0.00019463053533317425, + "loss": 2.0063, + "step": 1252 + }, + { + "epoch": 0.13182535507627566, + "grad_norm": 1.0085639953613281, + "learning_rate": 0.00019461951350618849, + "loss": 1.7158, + "step": 1253 + }, + { + "epoch": 0.13193056286165175, + "grad_norm": 0.7718974351882935, + "learning_rate": 0.00019460848069135017, + "loss": 2.2337, + "step": 1254 + }, + { + "epoch": 0.13203577064702787, + "grad_norm": 1.247361421585083, + "learning_rate": 0.0001945974368899406, + "loss": 2.0403, + "step": 1255 + }, + { + "epoch": 0.132140978432404, + "grad_norm": 0.9886590838432312, + "learning_rate": 0.00019458638210324212, + "loss": 2.1477, + "step": 1256 + }, + { + "epoch": 0.13224618621778012, + "grad_norm": 0.9976163506507874, + "learning_rate": 0.00019457531633253856, + "loss": 2.0871, + "step": 1257 + }, + { + "epoch": 0.13235139400315624, + "grad_norm": 0.9172899127006531, + "learning_rate": 0.00019456423957911497, + "loss": 2.0631, + "step": 1258 + }, + { + "epoch": 0.13245660178853236, + "grad_norm": 1.053161382675171, + "learning_rate": 0.0001945531518442576, + "loss": 2.0908, + "step": 1259 + }, + { + "epoch": 0.13256180957390848, + "grad_norm": 1.0828722715377808, + "learning_rate": 0.00019454205312925408, + "loss": 1.9512, + "step": 1260 + }, + { + "epoch": 0.1326670173592846, + "grad_norm": 0.9323759078979492, + "learning_rate": 0.00019453094343539325, + "loss": 2.3588, + "step": 1261 + }, + { + "epoch": 0.1327722251446607, + "grad_norm": 1.2428725957870483, + "learning_rate": 0.00019451982276396526, + "loss": 1.9449, + "step": 1262 + }, + { + "epoch": 0.13287743293003682, + "grad_norm": 1.8878873586654663, + "learning_rate": 0.00019450869111626147, + "loss": 1.4895, + "step": 1263 + }, + { + "epoch": 0.13298264071541294, + "grad_norm": 1.343825101852417, + "learning_rate": 0.0001944975484935746, + "loss": 1.713, + "step": 1264 + }, + { + "epoch": 0.13308784850078906, + "grad_norm": 0.9605780243873596, + "learning_rate": 0.0001944863948971986, + "loss": 2.0469, + "step": 1265 + }, + { + "epoch": 0.13319305628616518, + "grad_norm": 1.6616069078445435, + "learning_rate": 0.0001944752303284287, + "loss": 2.0865, + "step": 1266 + }, + { + "epoch": 0.1332982640715413, + "grad_norm": 1.17947518825531, + "learning_rate": 0.0001944640547885614, + "loss": 1.984, + "step": 1267 + }, + { + "epoch": 0.13340347185691742, + "grad_norm": 1.6048225164413452, + "learning_rate": 0.00019445286827889446, + "loss": 1.7243, + "step": 1268 + }, + { + "epoch": 0.13350867964229354, + "grad_norm": 0.9817343950271606, + "learning_rate": 0.00019444167080072698, + "loss": 1.7584, + "step": 1269 + }, + { + "epoch": 0.13361388742766964, + "grad_norm": 1.3782764673233032, + "learning_rate": 0.00019443046235535923, + "loss": 1.9988, + "step": 1270 + }, + { + "epoch": 0.13371909521304576, + "grad_norm": 1.0830388069152832, + "learning_rate": 0.00019441924294409289, + "loss": 2.0023, + "step": 1271 + }, + { + "epoch": 0.13382430299842188, + "grad_norm": 1.0523784160614014, + "learning_rate": 0.00019440801256823074, + "loss": 2.1114, + "step": 1272 + }, + { + "epoch": 0.133929510783798, + "grad_norm": 0.7290390729904175, + "learning_rate": 0.00019439677122907697, + "loss": 2.0477, + "step": 1273 + }, + { + "epoch": 0.13403471856917412, + "grad_norm": 1.1659435033798218, + "learning_rate": 0.00019438551892793701, + "loss": 2.0237, + "step": 1274 + }, + { + "epoch": 0.13413992635455024, + "grad_norm": 0.9051696062088013, + "learning_rate": 0.00019437425566611754, + "loss": 1.4642, + "step": 1275 + }, + { + "epoch": 0.13424513413992636, + "grad_norm": 0.9961209297180176, + "learning_rate": 0.0001943629814449265, + "loss": 2.1073, + "step": 1276 + }, + { + "epoch": 0.13435034192530249, + "grad_norm": 1.1717655658721924, + "learning_rate": 0.0001943516962656732, + "loss": 1.878, + "step": 1277 + }, + { + "epoch": 0.13445554971067858, + "grad_norm": 1.0735725164413452, + "learning_rate": 0.00019434040012966807, + "loss": 1.8939, + "step": 1278 + }, + { + "epoch": 0.1345607574960547, + "grad_norm": 1.1567238569259644, + "learning_rate": 0.00019432909303822296, + "loss": 2.2329, + "step": 1279 + }, + { + "epoch": 0.13466596528143082, + "grad_norm": 2.026787757873535, + "learning_rate": 0.00019431777499265087, + "loss": 1.5744, + "step": 1280 + }, + { + "epoch": 0.13477117306680694, + "grad_norm": 1.0909669399261475, + "learning_rate": 0.00019430644599426614, + "loss": 2.273, + "step": 1281 + }, + { + "epoch": 0.13487638085218306, + "grad_norm": 1.2396634817123413, + "learning_rate": 0.0001942951060443844, + "loss": 1.8932, + "step": 1282 + }, + { + "epoch": 0.13498158863755919, + "grad_norm": 1.163996696472168, + "learning_rate": 0.00019428375514432254, + "loss": 2.2442, + "step": 1283 + }, + { + "epoch": 0.1350867964229353, + "grad_norm": 0.9469679594039917, + "learning_rate": 0.0001942723932953986, + "loss": 2.4131, + "step": 1284 + }, + { + "epoch": 0.13519200420831143, + "grad_norm": 0.993421196937561, + "learning_rate": 0.00019426102049893208, + "loss": 2.3047, + "step": 1285 + }, + { + "epoch": 0.13529721199368752, + "grad_norm": 1.0937494039535522, + "learning_rate": 0.00019424963675624364, + "loss": 1.9778, + "step": 1286 + }, + { + "epoch": 0.13540241977906364, + "grad_norm": 1.0145161151885986, + "learning_rate": 0.00019423824206865527, + "loss": 2.3466, + "step": 1287 + }, + { + "epoch": 0.13550762756443976, + "grad_norm": 1.2493735551834106, + "learning_rate": 0.00019422683643749013, + "loss": 2.1595, + "step": 1288 + }, + { + "epoch": 0.13561283534981589, + "grad_norm": 0.7461625933647156, + "learning_rate": 0.00019421541986407276, + "loss": 1.9241, + "step": 1289 + }, + { + "epoch": 0.135718043135192, + "grad_norm": 1.2610676288604736, + "learning_rate": 0.00019420399234972894, + "loss": 1.9008, + "step": 1290 + }, + { + "epoch": 0.13582325092056813, + "grad_norm": 1.5543406009674072, + "learning_rate": 0.0001941925538957857, + "loss": 1.8748, + "step": 1291 + }, + { + "epoch": 0.13592845870594425, + "grad_norm": 0.6136517524719238, + "learning_rate": 0.00019418110450357135, + "loss": 1.6854, + "step": 1292 + }, + { + "epoch": 0.13603366649132037, + "grad_norm": 1.5480318069458008, + "learning_rate": 0.00019416964417441542, + "loss": 2.0514, + "step": 1293 + }, + { + "epoch": 0.13613887427669646, + "grad_norm": 0.9149646759033203, + "learning_rate": 0.00019415817290964883, + "loss": 2.1665, + "step": 1294 + }, + { + "epoch": 0.13624408206207259, + "grad_norm": 0.9369128346443176, + "learning_rate": 0.0001941466907106037, + "loss": 1.9612, + "step": 1295 + }, + { + "epoch": 0.1363492898474487, + "grad_norm": 0.8914728164672852, + "learning_rate": 0.0001941351975786134, + "loss": 1.5696, + "step": 1296 + }, + { + "epoch": 0.13645449763282483, + "grad_norm": 1.2152458429336548, + "learning_rate": 0.00019412369351501255, + "loss": 2.3433, + "step": 1297 + }, + { + "epoch": 0.13655970541820095, + "grad_norm": 0.6514436602592468, + "learning_rate": 0.0001941121785211371, + "loss": 2.3054, + "step": 1298 + }, + { + "epoch": 0.13666491320357707, + "grad_norm": 1.1356117725372314, + "learning_rate": 0.0001941006525983243, + "loss": 2.2101, + "step": 1299 + }, + { + "epoch": 0.1367701209889532, + "grad_norm": 1.195465326309204, + "learning_rate": 0.00019408911574791255, + "loss": 1.6433, + "step": 1300 + }, + { + "epoch": 0.1368753287743293, + "grad_norm": 1.0879439115524292, + "learning_rate": 0.00019407756797124164, + "loss": 2.0753, + "step": 1301 + }, + { + "epoch": 0.1369805365597054, + "grad_norm": 1.4740591049194336, + "learning_rate": 0.00019406600926965255, + "loss": 2.1329, + "step": 1302 + }, + { + "epoch": 0.13708574434508153, + "grad_norm": 1.2994344234466553, + "learning_rate": 0.00019405443964448757, + "loss": 1.9924, + "step": 1303 + }, + { + "epoch": 0.13719095213045765, + "grad_norm": 1.0827962160110474, + "learning_rate": 0.0001940428590970902, + "loss": 2.1453, + "step": 1304 + }, + { + "epoch": 0.13729615991583377, + "grad_norm": 0.9276465177536011, + "learning_rate": 0.0001940312676288053, + "loss": 2.3533, + "step": 1305 + }, + { + "epoch": 0.1374013677012099, + "grad_norm": 1.7329941987991333, + "learning_rate": 0.00019401966524097892, + "loss": 1.963, + "step": 1306 + }, + { + "epoch": 0.137506575486586, + "grad_norm": 1.348925232887268, + "learning_rate": 0.00019400805193495839, + "loss": 1.9953, + "step": 1307 + }, + { + "epoch": 0.13761178327196213, + "grad_norm": 1.512202262878418, + "learning_rate": 0.00019399642771209238, + "loss": 1.7091, + "step": 1308 + }, + { + "epoch": 0.13771699105733826, + "grad_norm": 1.0821994543075562, + "learning_rate": 0.00019398479257373073, + "loss": 2.0883, + "step": 1309 + }, + { + "epoch": 0.13782219884271435, + "grad_norm": 0.9924389719963074, + "learning_rate": 0.00019397314652122463, + "loss": 2.0062, + "step": 1310 + }, + { + "epoch": 0.13792740662809047, + "grad_norm": 1.1338258981704712, + "learning_rate": 0.00019396148955592643, + "loss": 2.1751, + "step": 1311 + }, + { + "epoch": 0.1380326144134666, + "grad_norm": 1.3012595176696777, + "learning_rate": 0.00019394982167918987, + "loss": 1.8652, + "step": 1312 + }, + { + "epoch": 0.1381378221988427, + "grad_norm": 1.109009861946106, + "learning_rate": 0.0001939381428923699, + "loss": 2.3266, + "step": 1313 + }, + { + "epoch": 0.13824302998421883, + "grad_norm": 0.9063061475753784, + "learning_rate": 0.00019392645319682273, + "loss": 1.8569, + "step": 1314 + }, + { + "epoch": 0.13834823776959496, + "grad_norm": 0.9491289854049683, + "learning_rate": 0.00019391475259390584, + "loss": 2.031, + "step": 1315 + }, + { + "epoch": 0.13845344555497108, + "grad_norm": 1.2872332334518433, + "learning_rate": 0.00019390304108497794, + "loss": 2.0533, + "step": 1316 + }, + { + "epoch": 0.1385586533403472, + "grad_norm": 1.0027546882629395, + "learning_rate": 0.00019389131867139913, + "loss": 2.1953, + "step": 1317 + }, + { + "epoch": 0.1386638611257233, + "grad_norm": 0.9697312712669373, + "learning_rate": 0.00019387958535453068, + "loss": 1.8958, + "step": 1318 + }, + { + "epoch": 0.1387690689110994, + "grad_norm": 0.9787667393684387, + "learning_rate": 0.00019386784113573508, + "loss": 2.074, + "step": 1319 + }, + { + "epoch": 0.13887427669647553, + "grad_norm": 1.1985487937927246, + "learning_rate": 0.00019385608601637624, + "loss": 2.2367, + "step": 1320 + }, + { + "epoch": 0.13897948448185166, + "grad_norm": 0.9592642188072205, + "learning_rate": 0.00019384431999781916, + "loss": 1.7508, + "step": 1321 + }, + { + "epoch": 0.13908469226722778, + "grad_norm": 2.198991060256958, + "learning_rate": 0.0001938325430814302, + "loss": 1.7532, + "step": 1322 + }, + { + "epoch": 0.1391899000526039, + "grad_norm": 1.6652568578720093, + "learning_rate": 0.00019382075526857705, + "loss": 1.5727, + "step": 1323 + }, + { + "epoch": 0.13929510783798002, + "grad_norm": 0.9190332889556885, + "learning_rate": 0.00019380895656062846, + "loss": 2.206, + "step": 1324 + }, + { + "epoch": 0.13940031562335614, + "grad_norm": 1.020340085029602, + "learning_rate": 0.00019379714695895472, + "loss": 2.0502, + "step": 1325 + }, + { + "epoch": 0.13950552340873223, + "grad_norm": 1.3590896129608154, + "learning_rate": 0.00019378532646492714, + "loss": 2.3099, + "step": 1326 + }, + { + "epoch": 0.13961073119410836, + "grad_norm": 0.7917930483818054, + "learning_rate": 0.00019377349507991842, + "loss": 2.2146, + "step": 1327 + }, + { + "epoch": 0.13971593897948448, + "grad_norm": 1.0664583444595337, + "learning_rate": 0.00019376165280530252, + "loss": 2.5567, + "step": 1328 + }, + { + "epoch": 0.1398211467648606, + "grad_norm": 0.7282881736755371, + "learning_rate": 0.00019374979964245463, + "loss": 1.9319, + "step": 1329 + }, + { + "epoch": 0.13992635455023672, + "grad_norm": 0.9543462991714478, + "learning_rate": 0.0001937379355927512, + "loss": 1.9968, + "step": 1330 + }, + { + "epoch": 0.14003156233561284, + "grad_norm": 0.8911160826683044, + "learning_rate": 0.00019372606065757003, + "loss": 1.9732, + "step": 1331 + }, + { + "epoch": 0.14013677012098896, + "grad_norm": 0.8454022407531738, + "learning_rate": 0.00019371417483829003, + "loss": 1.6059, + "step": 1332 + }, + { + "epoch": 0.14024197790636508, + "grad_norm": 0.9383593797683716, + "learning_rate": 0.00019370227813629147, + "loss": 2.2838, + "step": 1333 + }, + { + "epoch": 0.14034718569174118, + "grad_norm": 1.4767099618911743, + "learning_rate": 0.00019369037055295594, + "loss": 1.5007, + "step": 1334 + }, + { + "epoch": 0.1404523934771173, + "grad_norm": 1.4801223278045654, + "learning_rate": 0.00019367845208966618, + "loss": 2.0174, + "step": 1335 + }, + { + "epoch": 0.14055760126249342, + "grad_norm": 0.995274543762207, + "learning_rate": 0.00019366652274780628, + "loss": 2.0762, + "step": 1336 + }, + { + "epoch": 0.14066280904786954, + "grad_norm": 0.9528397917747498, + "learning_rate": 0.0001936545825287615, + "loss": 2.177, + "step": 1337 + }, + { + "epoch": 0.14076801683324566, + "grad_norm": 1.0369575023651123, + "learning_rate": 0.00019364263143391847, + "loss": 1.8945, + "step": 1338 + }, + { + "epoch": 0.14087322461862178, + "grad_norm": 0.8390545845031738, + "learning_rate": 0.00019363066946466502, + "loss": 1.9435, + "step": 1339 + }, + { + "epoch": 0.1409784324039979, + "grad_norm": 1.2658418416976929, + "learning_rate": 0.0001936186966223902, + "loss": 2.3394, + "step": 1340 + }, + { + "epoch": 0.14108364018937403, + "grad_norm": 0.9371610283851624, + "learning_rate": 0.00019360671290848447, + "loss": 2.1424, + "step": 1341 + }, + { + "epoch": 0.14118884797475012, + "grad_norm": 0.9509402513504028, + "learning_rate": 0.00019359471832433936, + "loss": 1.9365, + "step": 1342 + }, + { + "epoch": 0.14129405576012624, + "grad_norm": 1.1386808156967163, + "learning_rate": 0.00019358271287134784, + "loss": 1.7742, + "step": 1343 + }, + { + "epoch": 0.14139926354550236, + "grad_norm": 0.946628987789154, + "learning_rate": 0.00019357069655090404, + "loss": 2.1615, + "step": 1344 + }, + { + "epoch": 0.14150447133087848, + "grad_norm": 0.8672672510147095, + "learning_rate": 0.00019355866936440337, + "loss": 1.9622, + "step": 1345 + }, + { + "epoch": 0.1416096791162546, + "grad_norm": 1.2565221786499023, + "learning_rate": 0.0001935466313132425, + "loss": 1.6286, + "step": 1346 + }, + { + "epoch": 0.14171488690163073, + "grad_norm": 1.0216825008392334, + "learning_rate": 0.00019353458239881936, + "loss": 1.6907, + "step": 1347 + }, + { + "epoch": 0.14182009468700685, + "grad_norm": 1.3723381757736206, + "learning_rate": 0.00019352252262253318, + "loss": 1.64, + "step": 1348 + }, + { + "epoch": 0.14192530247238297, + "grad_norm": 1.0351169109344482, + "learning_rate": 0.00019351045198578445, + "loss": 2.2069, + "step": 1349 + }, + { + "epoch": 0.14203051025775906, + "grad_norm": 1.1500434875488281, + "learning_rate": 0.00019349837048997478, + "loss": 1.8967, + "step": 1350 + }, + { + "epoch": 0.14213571804313518, + "grad_norm": 1.0211580991744995, + "learning_rate": 0.00019348627813650727, + "loss": 1.6682, + "step": 1351 + }, + { + "epoch": 0.1422409258285113, + "grad_norm": 0.8861009478569031, + "learning_rate": 0.00019347417492678615, + "loss": 2.383, + "step": 1352 + }, + { + "epoch": 0.14234613361388743, + "grad_norm": 1.089370608329773, + "learning_rate": 0.00019346206086221686, + "loss": 2.0606, + "step": 1353 + }, + { + "epoch": 0.14245134139926355, + "grad_norm": 0.9320117235183716, + "learning_rate": 0.00019344993594420622, + "loss": 2.165, + "step": 1354 + }, + { + "epoch": 0.14255654918463967, + "grad_norm": 0.9916388988494873, + "learning_rate": 0.00019343780017416223, + "loss": 2.1013, + "step": 1355 + }, + { + "epoch": 0.1426617569700158, + "grad_norm": 0.894123375415802, + "learning_rate": 0.00019342565355349417, + "loss": 2.3444, + "step": 1356 + }, + { + "epoch": 0.1427669647553919, + "grad_norm": 0.835313081741333, + "learning_rate": 0.00019341349608361267, + "loss": 2.1591, + "step": 1357 + }, + { + "epoch": 0.142872172540768, + "grad_norm": 0.9938138723373413, + "learning_rate": 0.00019340132776592942, + "loss": 2.1381, + "step": 1358 + }, + { + "epoch": 0.14297738032614413, + "grad_norm": 1.408219337463379, + "learning_rate": 0.00019338914860185752, + "loss": 1.9765, + "step": 1359 + }, + { + "epoch": 0.14308258811152025, + "grad_norm": 1.1644525527954102, + "learning_rate": 0.00019337695859281137, + "loss": 1.788, + "step": 1360 + }, + { + "epoch": 0.14318779589689637, + "grad_norm": 0.6040117144584656, + "learning_rate": 0.00019336475774020648, + "loss": 2.1476, + "step": 1361 + }, + { + "epoch": 0.1432930036822725, + "grad_norm": 0.9746436476707458, + "learning_rate": 0.0001933525460454597, + "loss": 1.6193, + "step": 1362 + }, + { + "epoch": 0.1433982114676486, + "grad_norm": 1.6310737133026123, + "learning_rate": 0.00019334032350998919, + "loss": 2.009, + "step": 1363 + }, + { + "epoch": 0.14350341925302473, + "grad_norm": 0.9230230450630188, + "learning_rate": 0.00019332809013521428, + "loss": 2.1239, + "step": 1364 + }, + { + "epoch": 0.14360862703840085, + "grad_norm": 0.9308440685272217, + "learning_rate": 0.00019331584592255553, + "loss": 1.8617, + "step": 1365 + }, + { + "epoch": 0.14371383482377695, + "grad_norm": 1.1313167810440063, + "learning_rate": 0.0001933035908734349, + "loss": 1.4825, + "step": 1366 + }, + { + "epoch": 0.14381904260915307, + "grad_norm": 1.0187263488769531, + "learning_rate": 0.0001932913249892755, + "loss": 2.1542, + "step": 1367 + }, + { + "epoch": 0.1439242503945292, + "grad_norm": 1.1843492984771729, + "learning_rate": 0.00019327904827150176, + "loss": 1.8278, + "step": 1368 + }, + { + "epoch": 0.1440294581799053, + "grad_norm": 1.6638554334640503, + "learning_rate": 0.00019326676072153927, + "loss": 1.9298, + "step": 1369 + }, + { + "epoch": 0.14413466596528143, + "grad_norm": 0.9473015666007996, + "learning_rate": 0.00019325446234081498, + "loss": 2.2619, + "step": 1370 + }, + { + "epoch": 0.14423987375065755, + "grad_norm": 0.9866594672203064, + "learning_rate": 0.00019324215313075706, + "loss": 1.7889, + "step": 1371 + }, + { + "epoch": 0.14434508153603368, + "grad_norm": 1.3924474716186523, + "learning_rate": 0.00019322983309279495, + "loss": 1.8161, + "step": 1372 + }, + { + "epoch": 0.1444502893214098, + "grad_norm": 1.1654231548309326, + "learning_rate": 0.00019321750222835933, + "loss": 1.8105, + "step": 1373 + }, + { + "epoch": 0.1445554971067859, + "grad_norm": 0.7712641358375549, + "learning_rate": 0.0001932051605388821, + "loss": 1.7791, + "step": 1374 + }, + { + "epoch": 0.144660704892162, + "grad_norm": 0.7534984350204468, + "learning_rate": 0.00019319280802579654, + "loss": 1.9788, + "step": 1375 + }, + { + "epoch": 0.14476591267753813, + "grad_norm": 0.9810335040092468, + "learning_rate": 0.00019318044469053702, + "loss": 2.0382, + "step": 1376 + }, + { + "epoch": 0.14487112046291425, + "grad_norm": 1.1605186462402344, + "learning_rate": 0.0001931680705345393, + "loss": 2.2038, + "step": 1377 + }, + { + "epoch": 0.14497632824829038, + "grad_norm": 1.2045458555221558, + "learning_rate": 0.00019315568555924035, + "loss": 2.3353, + "step": 1378 + }, + { + "epoch": 0.1450815360336665, + "grad_norm": 1.1022682189941406, + "learning_rate": 0.0001931432897660784, + "loss": 1.8617, + "step": 1379 + }, + { + "epoch": 0.14518674381904262, + "grad_norm": 1.5427874326705933, + "learning_rate": 0.0001931308831564929, + "loss": 2.1449, + "step": 1380 + }, + { + "epoch": 0.14529195160441874, + "grad_norm": 1.1985043287277222, + "learning_rate": 0.00019311846573192461, + "loss": 2.0909, + "step": 1381 + }, + { + "epoch": 0.14539715938979483, + "grad_norm": 1.052314043045044, + "learning_rate": 0.00019310603749381558, + "loss": 1.9035, + "step": 1382 + }, + { + "epoch": 0.14550236717517095, + "grad_norm": 1.17412269115448, + "learning_rate": 0.00019309359844360893, + "loss": 1.6764, + "step": 1383 + }, + { + "epoch": 0.14560757496054708, + "grad_norm": 1.25741446018219, + "learning_rate": 0.00019308114858274932, + "loss": 2.1154, + "step": 1384 + }, + { + "epoch": 0.1457127827459232, + "grad_norm": 1.7540018558502197, + "learning_rate": 0.0001930686879126824, + "loss": 1.1782, + "step": 1385 + }, + { + "epoch": 0.14581799053129932, + "grad_norm": 1.423569917678833, + "learning_rate": 0.00019305621643485522, + "loss": 2.217, + "step": 1386 + }, + { + "epoch": 0.14592319831667544, + "grad_norm": 1.670623540878296, + "learning_rate": 0.00019304373415071605, + "loss": 2.1301, + "step": 1387 + }, + { + "epoch": 0.14602840610205156, + "grad_norm": 1.0802117586135864, + "learning_rate": 0.00019303124106171443, + "loss": 2.0065, + "step": 1388 + }, + { + "epoch": 0.14613361388742768, + "grad_norm": 0.7987265586853027, + "learning_rate": 0.00019301873716930107, + "loss": 2.3087, + "step": 1389 + }, + { + "epoch": 0.14623882167280378, + "grad_norm": 1.3220068216323853, + "learning_rate": 0.00019300622247492814, + "loss": 2.1964, + "step": 1390 + }, + { + "epoch": 0.1463440294581799, + "grad_norm": 1.0792020559310913, + "learning_rate": 0.00019299369698004884, + "loss": 2.0078, + "step": 1391 + }, + { + "epoch": 0.14644923724355602, + "grad_norm": 2.057413101196289, + "learning_rate": 0.0001929811606861177, + "loss": 1.783, + "step": 1392 + }, + { + "epoch": 0.14655444502893214, + "grad_norm": 1.4398462772369385, + "learning_rate": 0.0001929686135945906, + "loss": 1.3909, + "step": 1393 + }, + { + "epoch": 0.14665965281430826, + "grad_norm": 1.0750278234481812, + "learning_rate": 0.0001929560557069245, + "loss": 1.8938, + "step": 1394 + }, + { + "epoch": 0.14676486059968438, + "grad_norm": 0.8204072713851929, + "learning_rate": 0.00019294348702457773, + "loss": 2.4199, + "step": 1395 + }, + { + "epoch": 0.1468700683850605, + "grad_norm": 0.9489021897315979, + "learning_rate": 0.0001929309075490099, + "loss": 2.0696, + "step": 1396 + }, + { + "epoch": 0.14697527617043663, + "grad_norm": 0.8120673298835754, + "learning_rate": 0.00019291831728168182, + "loss": 2.0151, + "step": 1397 + }, + { + "epoch": 0.14708048395581272, + "grad_norm": 1.4392826557159424, + "learning_rate": 0.00019290571622405548, + "loss": 1.8832, + "step": 1398 + }, + { + "epoch": 0.14718569174118884, + "grad_norm": 1.1055705547332764, + "learning_rate": 0.00019289310437759427, + "loss": 1.829, + "step": 1399 + }, + { + "epoch": 0.14729089952656496, + "grad_norm": 0.5811572074890137, + "learning_rate": 0.00019288048174376273, + "loss": 2.2987, + "step": 1400 + }, + { + "epoch": 0.14739610731194108, + "grad_norm": 1.0770584344863892, + "learning_rate": 0.0001928678483240267, + "loss": 1.9941, + "step": 1401 + }, + { + "epoch": 0.1475013150973172, + "grad_norm": 1.227790117263794, + "learning_rate": 0.00019285520411985326, + "loss": 2.0154, + "step": 1402 + }, + { + "epoch": 0.14760652288269333, + "grad_norm": 0.6818186640739441, + "learning_rate": 0.0001928425491327107, + "loss": 2.2323, + "step": 1403 + }, + { + "epoch": 0.14771173066806945, + "grad_norm": 1.290170431137085, + "learning_rate": 0.00019282988336406865, + "loss": 2.1058, + "step": 1404 + }, + { + "epoch": 0.14781693845344557, + "grad_norm": 0.9615659713745117, + "learning_rate": 0.0001928172068153979, + "loss": 2.0933, + "step": 1405 + }, + { + "epoch": 0.14792214623882166, + "grad_norm": 0.9037755131721497, + "learning_rate": 0.00019280451948817059, + "loss": 2.0112, + "step": 1406 + }, + { + "epoch": 0.14802735402419778, + "grad_norm": 1.2614983320236206, + "learning_rate": 0.00019279182138386003, + "loss": 1.8352, + "step": 1407 + }, + { + "epoch": 0.1481325618095739, + "grad_norm": 1.1680060625076294, + "learning_rate": 0.0001927791125039408, + "loss": 2.0655, + "step": 1408 + }, + { + "epoch": 0.14823776959495003, + "grad_norm": 1.1708638668060303, + "learning_rate": 0.00019276639284988875, + "loss": 1.9568, + "step": 1409 + }, + { + "epoch": 0.14834297738032615, + "grad_norm": 1.2957854270935059, + "learning_rate": 0.00019275366242318097, + "loss": 1.9057, + "step": 1410 + }, + { + "epoch": 0.14844818516570227, + "grad_norm": 1.3663761615753174, + "learning_rate": 0.00019274092122529584, + "loss": 1.9964, + "step": 1411 + }, + { + "epoch": 0.1485533929510784, + "grad_norm": 1.0850753784179688, + "learning_rate": 0.00019272816925771288, + "loss": 1.8218, + "step": 1412 + }, + { + "epoch": 0.1486586007364545, + "grad_norm": 0.7503268718719482, + "learning_rate": 0.00019271540652191296, + "loss": 2.2849, + "step": 1413 + }, + { + "epoch": 0.1487638085218306, + "grad_norm": 1.1095421314239502, + "learning_rate": 0.0001927026330193782, + "loss": 2.0907, + "step": 1414 + }, + { + "epoch": 0.14886901630720673, + "grad_norm": 1.947831630706787, + "learning_rate": 0.00019268984875159191, + "loss": 2.3621, + "step": 1415 + }, + { + "epoch": 0.14897422409258285, + "grad_norm": 1.045089602470398, + "learning_rate": 0.00019267705372003876, + "loss": 1.8236, + "step": 1416 + }, + { + "epoch": 0.14907943187795897, + "grad_norm": 1.137406826019287, + "learning_rate": 0.0001926642479262045, + "loss": 2.1149, + "step": 1417 + }, + { + "epoch": 0.1491846396633351, + "grad_norm": 1.469742774963379, + "learning_rate": 0.00019265143137157627, + "loss": 1.8987, + "step": 1418 + }, + { + "epoch": 0.1492898474487112, + "grad_norm": 1.202427864074707, + "learning_rate": 0.00019263860405764241, + "loss": 1.8768, + "step": 1419 + }, + { + "epoch": 0.14939505523408733, + "grad_norm": 0.9847029447555542, + "learning_rate": 0.0001926257659858925, + "loss": 1.9719, + "step": 1420 + }, + { + "epoch": 0.14950026301946345, + "grad_norm": 1.1331363916397095, + "learning_rate": 0.00019261291715781743, + "loss": 2.2542, + "step": 1421 + }, + { + "epoch": 0.14960547080483955, + "grad_norm": 1.19657564163208, + "learning_rate": 0.00019260005757490922, + "loss": 2.2169, + "step": 1422 + }, + { + "epoch": 0.14971067859021567, + "grad_norm": 1.2917791604995728, + "learning_rate": 0.00019258718723866127, + "loss": 2.0383, + "step": 1423 + }, + { + "epoch": 0.1498158863755918, + "grad_norm": 1.163900375366211, + "learning_rate": 0.00019257430615056816, + "loss": 1.9061, + "step": 1424 + }, + { + "epoch": 0.1499210941609679, + "grad_norm": 1.1655508279800415, + "learning_rate": 0.00019256141431212568, + "loss": 2.2014, + "step": 1425 + }, + { + "epoch": 0.15002630194634403, + "grad_norm": 0.8750263452529907, + "learning_rate": 0.00019254851172483098, + "loss": 2.0095, + "step": 1426 + }, + { + "epoch": 0.15013150973172015, + "grad_norm": 1.1399056911468506, + "learning_rate": 0.00019253559839018235, + "loss": 1.8426, + "step": 1427 + }, + { + "epoch": 0.15023671751709627, + "grad_norm": 1.3897678852081299, + "learning_rate": 0.00019252267430967942, + "loss": 1.3708, + "step": 1428 + }, + { + "epoch": 0.1503419253024724, + "grad_norm": 0.97458416223526, + "learning_rate": 0.00019250973948482298, + "loss": 1.6044, + "step": 1429 + }, + { + "epoch": 0.1504471330878485, + "grad_norm": 1.1727815866470337, + "learning_rate": 0.0001924967939171151, + "loss": 2.0115, + "step": 1430 + }, + { + "epoch": 0.1505523408732246, + "grad_norm": 1.3556091785430908, + "learning_rate": 0.00019248383760805916, + "loss": 1.7871, + "step": 1431 + }, + { + "epoch": 0.15065754865860073, + "grad_norm": 0.9026738405227661, + "learning_rate": 0.00019247087055915968, + "loss": 1.9852, + "step": 1432 + }, + { + "epoch": 0.15076275644397685, + "grad_norm": 1.1884734630584717, + "learning_rate": 0.0001924578927719225, + "loss": 2.3177, + "step": 1433 + }, + { + "epoch": 0.15086796422935297, + "grad_norm": 1.2027161121368408, + "learning_rate": 0.00019244490424785468, + "loss": 1.6248, + "step": 1434 + }, + { + "epoch": 0.1509731720147291, + "grad_norm": 1.37662935256958, + "learning_rate": 0.00019243190498846458, + "loss": 2.0508, + "step": 1435 + }, + { + "epoch": 0.15107837980010522, + "grad_norm": 1.3299387693405151, + "learning_rate": 0.00019241889499526169, + "loss": 1.862, + "step": 1436 + }, + { + "epoch": 0.15118358758548134, + "grad_norm": 1.6885327100753784, + "learning_rate": 0.00019240587426975686, + "loss": 2.6639, + "step": 1437 + }, + { + "epoch": 0.15128879537085743, + "grad_norm": 1.0421466827392578, + "learning_rate": 0.00019239284281346214, + "loss": 1.8263, + "step": 1438 + }, + { + "epoch": 0.15139400315623355, + "grad_norm": 1.2554149627685547, + "learning_rate": 0.00019237980062789082, + "loss": 2.4153, + "step": 1439 + }, + { + "epoch": 0.15149921094160967, + "grad_norm": 0.9526044130325317, + "learning_rate": 0.00019236674771455747, + "loss": 2.236, + "step": 1440 + }, + { + "epoch": 0.1516044187269858, + "grad_norm": 0.9509533643722534, + "learning_rate": 0.00019235368407497788, + "loss": 1.966, + "step": 1441 + }, + { + "epoch": 0.15170962651236192, + "grad_norm": 1.2150377035140991, + "learning_rate": 0.00019234060971066902, + "loss": 1.6399, + "step": 1442 + }, + { + "epoch": 0.15181483429773804, + "grad_norm": 1.228989601135254, + "learning_rate": 0.00019232752462314923, + "loss": 2.1176, + "step": 1443 + }, + { + "epoch": 0.15192004208311416, + "grad_norm": 1.2654527425765991, + "learning_rate": 0.0001923144288139381, + "loss": 1.5625, + "step": 1444 + }, + { + "epoch": 0.15202524986849028, + "grad_norm": 1.086130976676941, + "learning_rate": 0.00019230132228455628, + "loss": 1.8363, + "step": 1445 + }, + { + "epoch": 0.15213045765386637, + "grad_norm": 1.0267728567123413, + "learning_rate": 0.00019228820503652586, + "loss": 1.7803, + "step": 1446 + }, + { + "epoch": 0.1522356654392425, + "grad_norm": 1.3312456607818604, + "learning_rate": 0.00019227507707137006, + "loss": 1.7026, + "step": 1447 + }, + { + "epoch": 0.15234087322461862, + "grad_norm": 1.434946060180664, + "learning_rate": 0.00019226193839061347, + "loss": 1.7181, + "step": 1448 + }, + { + "epoch": 0.15244608100999474, + "grad_norm": 2.0896921157836914, + "learning_rate": 0.00019224878899578175, + "loss": 2.235, + "step": 1449 + }, + { + "epoch": 0.15255128879537086, + "grad_norm": 0.7048740386962891, + "learning_rate": 0.00019223562888840193, + "loss": 2.3047, + "step": 1450 + }, + { + "epoch": 0.15265649658074698, + "grad_norm": 1.5920569896697998, + "learning_rate": 0.00019222245807000223, + "loss": 2.2711, + "step": 1451 + }, + { + "epoch": 0.1527617043661231, + "grad_norm": 0.8454103469848633, + "learning_rate": 0.00019220927654211217, + "loss": 1.8763, + "step": 1452 + }, + { + "epoch": 0.15286691215149922, + "grad_norm": 0.9767968654632568, + "learning_rate": 0.0001921960843062625, + "loss": 2.1151, + "step": 1453 + }, + { + "epoch": 0.15297211993687532, + "grad_norm": 1.5461039543151855, + "learning_rate": 0.00019218288136398513, + "loss": 1.5243, + "step": 1454 + }, + { + "epoch": 0.15307732772225144, + "grad_norm": 1.1929031610488892, + "learning_rate": 0.0001921696677168133, + "loss": 1.914, + "step": 1455 + }, + { + "epoch": 0.15318253550762756, + "grad_norm": 1.3064931631088257, + "learning_rate": 0.00019215644336628148, + "loss": 1.9663, + "step": 1456 + }, + { + "epoch": 0.15328774329300368, + "grad_norm": 1.21585214138031, + "learning_rate": 0.0001921432083139253, + "loss": 2.0572, + "step": 1457 + }, + { + "epoch": 0.1533929510783798, + "grad_norm": 1.0481001138687134, + "learning_rate": 0.00019212996256128182, + "loss": 2.2602, + "step": 1458 + }, + { + "epoch": 0.15349815886375592, + "grad_norm": 1.2363125085830688, + "learning_rate": 0.00019211670610988913, + "loss": 1.9804, + "step": 1459 + }, + { + "epoch": 0.15360336664913204, + "grad_norm": 0.9035821557044983, + "learning_rate": 0.0001921034389612867, + "loss": 2.1159, + "step": 1460 + }, + { + "epoch": 0.15370857443450817, + "grad_norm": 1.1142661571502686, + "learning_rate": 0.00019209016111701522, + "loss": 2.3541, + "step": 1461 + }, + { + "epoch": 0.15381378221988426, + "grad_norm": 0.9691978096961975, + "learning_rate": 0.00019207687257861655, + "loss": 2.0898, + "step": 1462 + }, + { + "epoch": 0.15391899000526038, + "grad_norm": 1.0944209098815918, + "learning_rate": 0.00019206357334763388, + "loss": 1.7832, + "step": 1463 + }, + { + "epoch": 0.1540241977906365, + "grad_norm": 1.0283271074295044, + "learning_rate": 0.00019205026342561157, + "loss": 2.3259, + "step": 1464 + }, + { + "epoch": 0.15412940557601262, + "grad_norm": 0.9159666299819946, + "learning_rate": 0.0001920369428140953, + "loss": 2.1943, + "step": 1465 + }, + { + "epoch": 0.15423461336138874, + "grad_norm": 1.10454261302948, + "learning_rate": 0.00019202361151463194, + "loss": 1.7206, + "step": 1466 + }, + { + "epoch": 0.15433982114676487, + "grad_norm": 1.2136410474777222, + "learning_rate": 0.00019201026952876958, + "loss": 2.1148, + "step": 1467 + }, + { + "epoch": 0.154445028932141, + "grad_norm": 1.4272185564041138, + "learning_rate": 0.00019199691685805763, + "loss": 2.401, + "step": 1468 + }, + { + "epoch": 0.1545502367175171, + "grad_norm": 1.2169204950332642, + "learning_rate": 0.00019198355350404667, + "loss": 2.039, + "step": 1469 + }, + { + "epoch": 0.1546554445028932, + "grad_norm": 0.9736878275871277, + "learning_rate": 0.0001919701794682885, + "loss": 2.3821, + "step": 1470 + }, + { + "epoch": 0.15476065228826932, + "grad_norm": 1.0728427171707153, + "learning_rate": 0.00019195679475233625, + "loss": 2.079, + "step": 1471 + }, + { + "epoch": 0.15486586007364544, + "grad_norm": 0.9223616719245911, + "learning_rate": 0.00019194339935774422, + "loss": 1.9857, + "step": 1472 + }, + { + "epoch": 0.15497106785902157, + "grad_norm": 0.7518747448921204, + "learning_rate": 0.00019192999328606803, + "loss": 2.2304, + "step": 1473 + }, + { + "epoch": 0.1550762756443977, + "grad_norm": 1.1213191747665405, + "learning_rate": 0.0001919165765388644, + "loss": 2.1738, + "step": 1474 + }, + { + "epoch": 0.1551814834297738, + "grad_norm": 1.642525553703308, + "learning_rate": 0.00019190314911769142, + "loss": 1.8865, + "step": 1475 + }, + { + "epoch": 0.15528669121514993, + "grad_norm": 0.9955556392669678, + "learning_rate": 0.00019188971102410837, + "loss": 2.1058, + "step": 1476 + }, + { + "epoch": 0.15539189900052605, + "grad_norm": 0.9576829075813293, + "learning_rate": 0.00019187626225967576, + "loss": 1.9758, + "step": 1477 + }, + { + "epoch": 0.15549710678590214, + "grad_norm": 1.4288785457611084, + "learning_rate": 0.00019186280282595535, + "loss": 1.7091, + "step": 1478 + }, + { + "epoch": 0.15560231457127827, + "grad_norm": 1.115845799446106, + "learning_rate": 0.00019184933272451015, + "loss": 2.1405, + "step": 1479 + }, + { + "epoch": 0.1557075223566544, + "grad_norm": 1.4371943473815918, + "learning_rate": 0.0001918358519569044, + "loss": 2.0125, + "step": 1480 + }, + { + "epoch": 0.1558127301420305, + "grad_norm": 1.4342418909072876, + "learning_rate": 0.00019182236052470354, + "loss": 2.0421, + "step": 1481 + }, + { + "epoch": 0.15591793792740663, + "grad_norm": 1.1712485551834106, + "learning_rate": 0.00019180885842947436, + "loss": 2.1814, + "step": 1482 + }, + { + "epoch": 0.15602314571278275, + "grad_norm": 0.816021740436554, + "learning_rate": 0.00019179534567278475, + "loss": 2.2745, + "step": 1483 + }, + { + "epoch": 0.15612835349815887, + "grad_norm": 1.6961162090301514, + "learning_rate": 0.0001917818222562039, + "loss": 1.8956, + "step": 1484 + }, + { + "epoch": 0.156233561283535, + "grad_norm": 1.0247284173965454, + "learning_rate": 0.0001917682881813023, + "loss": 1.9351, + "step": 1485 + }, + { + "epoch": 0.1563387690689111, + "grad_norm": 0.9849500060081482, + "learning_rate": 0.00019175474344965157, + "loss": 1.6681, + "step": 1486 + }, + { + "epoch": 0.1564439768542872, + "grad_norm": 1.3697773218154907, + "learning_rate": 0.00019174118806282458, + "loss": 1.9333, + "step": 1487 + }, + { + "epoch": 0.15654918463966333, + "grad_norm": 1.421255111694336, + "learning_rate": 0.00019172762202239558, + "loss": 1.9458, + "step": 1488 + }, + { + "epoch": 0.15665439242503945, + "grad_norm": 1.1675089597702026, + "learning_rate": 0.00019171404532993986, + "loss": 2.0317, + "step": 1489 + }, + { + "epoch": 0.15675960021041557, + "grad_norm": 0.8886706233024597, + "learning_rate": 0.00019170045798703406, + "loss": 2.0854, + "step": 1490 + }, + { + "epoch": 0.1568648079957917, + "grad_norm": 1.510884404182434, + "learning_rate": 0.00019168685999525607, + "loss": 2.1653, + "step": 1491 + }, + { + "epoch": 0.15697001578116782, + "grad_norm": 1.1406757831573486, + "learning_rate": 0.00019167325135618487, + "loss": 2.0393, + "step": 1492 + }, + { + "epoch": 0.15707522356654394, + "grad_norm": 0.9392661452293396, + "learning_rate": 0.00019165963207140095, + "loss": 2.1535, + "step": 1493 + }, + { + "epoch": 0.15718043135192003, + "grad_norm": 0.9734488725662231, + "learning_rate": 0.00019164600214248575, + "loss": 1.9538, + "step": 1494 + }, + { + "epoch": 0.15728563913729615, + "grad_norm": 1.0362365245819092, + "learning_rate": 0.0001916323615710221, + "loss": 1.6926, + "step": 1495 + }, + { + "epoch": 0.15739084692267227, + "grad_norm": 1.099932074546814, + "learning_rate": 0.00019161871035859403, + "loss": 1.9106, + "step": 1496 + }, + { + "epoch": 0.1574960547080484, + "grad_norm": 1.4597340822219849, + "learning_rate": 0.0001916050485067868, + "loss": 2.1146, + "step": 1497 + }, + { + "epoch": 0.15760126249342452, + "grad_norm": 1.3075284957885742, + "learning_rate": 0.00019159137601718697, + "loss": 2.6296, + "step": 1498 + }, + { + "epoch": 0.15770647027880064, + "grad_norm": 1.0450359582901, + "learning_rate": 0.00019157769289138225, + "loss": 1.7802, + "step": 1499 + }, + { + "epoch": 0.15781167806417676, + "grad_norm": 1.9684633016586304, + "learning_rate": 0.0001915639991309616, + "loss": 2.1394, + "step": 1500 + }, + { + "epoch": 0.15791688584955288, + "grad_norm": 1.4822932481765747, + "learning_rate": 0.00019155029473751526, + "loss": 1.9256, + "step": 1501 + }, + { + "epoch": 0.15802209363492897, + "grad_norm": 0.9977061748504639, + "learning_rate": 0.00019153657971263463, + "loss": 1.7428, + "step": 1502 + }, + { + "epoch": 0.1581273014203051, + "grad_norm": 1.2301387786865234, + "learning_rate": 0.00019152285405791243, + "loss": 1.8108, + "step": 1503 + }, + { + "epoch": 0.15823250920568122, + "grad_norm": 0.9384319186210632, + "learning_rate": 0.00019150911777494258, + "loss": 2.55, + "step": 1504 + }, + { + "epoch": 0.15833771699105734, + "grad_norm": 0.9354838728904724, + "learning_rate": 0.00019149537086532022, + "loss": 2.1337, + "step": 1505 + }, + { + "epoch": 0.15844292477643346, + "grad_norm": 1.6690521240234375, + "learning_rate": 0.0001914816133306417, + "loss": 2.1056, + "step": 1506 + }, + { + "epoch": 0.15854813256180958, + "grad_norm": 1.3110581636428833, + "learning_rate": 0.0001914678451725047, + "loss": 2.1933, + "step": 1507 + }, + { + "epoch": 0.1586533403471857, + "grad_norm": 1.4883490800857544, + "learning_rate": 0.000191454066392508, + "loss": 1.751, + "step": 1508 + }, + { + "epoch": 0.15875854813256182, + "grad_norm": 1.056282639503479, + "learning_rate": 0.00019144027699225172, + "loss": 1.4755, + "step": 1509 + }, + { + "epoch": 0.15886375591793792, + "grad_norm": 1.4437083005905151, + "learning_rate": 0.00019142647697333723, + "loss": 2.0506, + "step": 1510 + }, + { + "epoch": 0.15896896370331404, + "grad_norm": 1.2706918716430664, + "learning_rate": 0.00019141266633736697, + "loss": 2.2959, + "step": 1511 + }, + { + "epoch": 0.15907417148869016, + "grad_norm": 0.7875407934188843, + "learning_rate": 0.00019139884508594484, + "loss": 1.9302, + "step": 1512 + }, + { + "epoch": 0.15917937927406628, + "grad_norm": 1.6068692207336426, + "learning_rate": 0.00019138501322067577, + "loss": 2.0472, + "step": 1513 + }, + { + "epoch": 0.1592845870594424, + "grad_norm": 1.250916600227356, + "learning_rate": 0.00019137117074316602, + "loss": 2.3526, + "step": 1514 + }, + { + "epoch": 0.15938979484481852, + "grad_norm": 0.9924522638320923, + "learning_rate": 0.00019135731765502313, + "loss": 1.804, + "step": 1515 + }, + { + "epoch": 0.15949500263019464, + "grad_norm": 0.9752918481826782, + "learning_rate": 0.00019134345395785572, + "loss": 1.527, + "step": 1516 + }, + { + "epoch": 0.15960021041557076, + "grad_norm": 0.8332353830337524, + "learning_rate": 0.00019132957965327382, + "loss": 2.2386, + "step": 1517 + }, + { + "epoch": 0.15970541820094686, + "grad_norm": 1.0230636596679688, + "learning_rate": 0.0001913156947428886, + "loss": 1.935, + "step": 1518 + }, + { + "epoch": 0.15981062598632298, + "grad_norm": 1.3288902044296265, + "learning_rate": 0.00019130179922831241, + "loss": 2.178, + "step": 1519 + }, + { + "epoch": 0.1599158337716991, + "grad_norm": 1.1439303159713745, + "learning_rate": 0.00019128789311115892, + "loss": 2.2228, + "step": 1520 + }, + { + "epoch": 0.16002104155707522, + "grad_norm": 1.0876315832138062, + "learning_rate": 0.00019127397639304305, + "loss": 2.0773, + "step": 1521 + }, + { + "epoch": 0.16012624934245134, + "grad_norm": 1.0651696920394897, + "learning_rate": 0.00019126004907558085, + "loss": 2.0508, + "step": 1522 + }, + { + "epoch": 0.16023145712782746, + "grad_norm": 1.309206485748291, + "learning_rate": 0.00019124611116038963, + "loss": 2.4417, + "step": 1523 + }, + { + "epoch": 0.16033666491320359, + "grad_norm": 1.2913589477539062, + "learning_rate": 0.00019123216264908802, + "loss": 2.1339, + "step": 1524 + }, + { + "epoch": 0.1604418726985797, + "grad_norm": 1.7865016460418701, + "learning_rate": 0.00019121820354329577, + "loss": 2.0545, + "step": 1525 + }, + { + "epoch": 0.1605470804839558, + "grad_norm": 1.2007263898849487, + "learning_rate": 0.00019120423384463392, + "loss": 1.865, + "step": 1526 + }, + { + "epoch": 0.16065228826933192, + "grad_norm": 1.5868077278137207, + "learning_rate": 0.0001911902535547247, + "loss": 2.2881, + "step": 1527 + }, + { + "epoch": 0.16075749605470804, + "grad_norm": 0.9651376605033875, + "learning_rate": 0.00019117626267519162, + "loss": 1.9428, + "step": 1528 + }, + { + "epoch": 0.16086270384008416, + "grad_norm": 1.0492637157440186, + "learning_rate": 0.0001911622612076594, + "loss": 1.8641, + "step": 1529 + }, + { + "epoch": 0.16096791162546029, + "grad_norm": 1.4848203659057617, + "learning_rate": 0.000191148249153754, + "loss": 2.256, + "step": 1530 + }, + { + "epoch": 0.1610731194108364, + "grad_norm": 1.3082940578460693, + "learning_rate": 0.00019113422651510255, + "loss": 2.2692, + "step": 1531 + }, + { + "epoch": 0.16117832719621253, + "grad_norm": 0.7588707804679871, + "learning_rate": 0.00019112019329333346, + "loss": 2.3249, + "step": 1532 + }, + { + "epoch": 0.16128353498158865, + "grad_norm": 0.7588993906974792, + "learning_rate": 0.0001911061494900764, + "loss": 2.1544, + "step": 1533 + }, + { + "epoch": 0.16138874276696474, + "grad_norm": 1.9041491746902466, + "learning_rate": 0.00019109209510696217, + "loss": 1.9289, + "step": 1534 + }, + { + "epoch": 0.16149395055234086, + "grad_norm": 1.3282427787780762, + "learning_rate": 0.00019107803014562294, + "loss": 1.6547, + "step": 1535 + }, + { + "epoch": 0.16159915833771699, + "grad_norm": 1.084847092628479, + "learning_rate": 0.00019106395460769196, + "loss": 1.9799, + "step": 1536 + }, + { + "epoch": 0.1617043661230931, + "grad_norm": 1.2308772802352905, + "learning_rate": 0.0001910498684948038, + "loss": 1.7734, + "step": 1537 + }, + { + "epoch": 0.16180957390846923, + "grad_norm": 1.1046706438064575, + "learning_rate": 0.0001910357718085942, + "loss": 1.8926, + "step": 1538 + }, + { + "epoch": 0.16191478169384535, + "grad_norm": 0.9326602816581726, + "learning_rate": 0.00019102166455070024, + "loss": 1.679, + "step": 1539 + }, + { + "epoch": 0.16201998947922147, + "grad_norm": 2.735374927520752, + "learning_rate": 0.0001910075467227601, + "loss": 2.2685, + "step": 1540 + }, + { + "epoch": 0.1621251972645976, + "grad_norm": 0.7285892367362976, + "learning_rate": 0.00019099341832641323, + "loss": 2.1306, + "step": 1541 + }, + { + "epoch": 0.16223040504997369, + "grad_norm": 1.3543635606765747, + "learning_rate": 0.0001909792793633003, + "loss": 1.7381, + "step": 1542 + }, + { + "epoch": 0.1623356128353498, + "grad_norm": 1.4372515678405762, + "learning_rate": 0.00019096512983506327, + "loss": 1.3866, + "step": 1543 + }, + { + "epoch": 0.16244082062072593, + "grad_norm": 1.202860951423645, + "learning_rate": 0.00019095096974334523, + "loss": 1.9439, + "step": 1544 + }, + { + "epoch": 0.16254602840610205, + "grad_norm": 1.5187021493911743, + "learning_rate": 0.0001909367990897906, + "loss": 1.6743, + "step": 1545 + }, + { + "epoch": 0.16265123619147817, + "grad_norm": 0.9257557392120361, + "learning_rate": 0.00019092261787604492, + "loss": 2.1086, + "step": 1546 + }, + { + "epoch": 0.1627564439768543, + "grad_norm": 1.202397108078003, + "learning_rate": 0.00019090842610375503, + "loss": 2.1001, + "step": 1547 + }, + { + "epoch": 0.1628616517622304, + "grad_norm": 1.3095595836639404, + "learning_rate": 0.000190894223774569, + "loss": 2.525, + "step": 1548 + }, + { + "epoch": 0.16296685954760654, + "grad_norm": 1.2810163497924805, + "learning_rate": 0.00019088001089013603, + "loss": 2.0124, + "step": 1549 + }, + { + "epoch": 0.16307206733298263, + "grad_norm": 2.0133657455444336, + "learning_rate": 0.00019086578745210666, + "loss": 1.9361, + "step": 1550 + }, + { + "epoch": 0.16317727511835875, + "grad_norm": 1.1870371103286743, + "learning_rate": 0.00019085155346213264, + "loss": 1.7296, + "step": 1551 + }, + { + "epoch": 0.16328248290373487, + "grad_norm": 1.0325980186462402, + "learning_rate": 0.00019083730892186686, + "loss": 2.4486, + "step": 1552 + }, + { + "epoch": 0.163387690689111, + "grad_norm": 1.313650131225586, + "learning_rate": 0.00019082305383296352, + "loss": 2.4732, + "step": 1553 + }, + { + "epoch": 0.16349289847448711, + "grad_norm": 0.8540735840797424, + "learning_rate": 0.00019080878819707802, + "loss": 1.8071, + "step": 1554 + }, + { + "epoch": 0.16359810625986324, + "grad_norm": 1.3443700075149536, + "learning_rate": 0.00019079451201586695, + "loss": 2.3741, + "step": 1555 + }, + { + "epoch": 0.16370331404523936, + "grad_norm": 1.2170543670654297, + "learning_rate": 0.0001907802252909882, + "loss": 2.0448, + "step": 1556 + }, + { + "epoch": 0.16380852183061548, + "grad_norm": 0.8038926720619202, + "learning_rate": 0.0001907659280241008, + "loss": 2.1864, + "step": 1557 + }, + { + "epoch": 0.16391372961599157, + "grad_norm": 1.2641410827636719, + "learning_rate": 0.00019075162021686505, + "loss": 1.7906, + "step": 1558 + }, + { + "epoch": 0.1640189374013677, + "grad_norm": 0.9859477877616882, + "learning_rate": 0.0001907373018709425, + "loss": 2.3066, + "step": 1559 + }, + { + "epoch": 0.16412414518674381, + "grad_norm": 1.0549753904342651, + "learning_rate": 0.00019072297298799589, + "loss": 2.0271, + "step": 1560 + }, + { + "epoch": 0.16422935297211994, + "grad_norm": 1.015430212020874, + "learning_rate": 0.0001907086335696892, + "loss": 2.1745, + "step": 1561 + }, + { + "epoch": 0.16433456075749606, + "grad_norm": 0.9478970766067505, + "learning_rate": 0.00019069428361768754, + "loss": 2.1743, + "step": 1562 + }, + { + "epoch": 0.16443976854287218, + "grad_norm": 0.8643477559089661, + "learning_rate": 0.00019067992313365735, + "loss": 2.4163, + "step": 1563 + }, + { + "epoch": 0.1645449763282483, + "grad_norm": 1.1437067985534668, + "learning_rate": 0.00019066555211926634, + "loss": 2.0313, + "step": 1564 + }, + { + "epoch": 0.16465018411362442, + "grad_norm": 1.8715989589691162, + "learning_rate": 0.00019065117057618332, + "loss": 2.3086, + "step": 1565 + }, + { + "epoch": 0.16475539189900051, + "grad_norm": 1.1657463312149048, + "learning_rate": 0.00019063677850607834, + "loss": 2.4182, + "step": 1566 + }, + { + "epoch": 0.16486059968437664, + "grad_norm": 1.0688217878341675, + "learning_rate": 0.00019062237591062272, + "loss": 2.3177, + "step": 1567 + }, + { + "epoch": 0.16496580746975276, + "grad_norm": 1.5269672870635986, + "learning_rate": 0.000190607962791489, + "loss": 1.4833, + "step": 1568 + }, + { + "epoch": 0.16507101525512888, + "grad_norm": 0.9268459677696228, + "learning_rate": 0.00019059353915035096, + "loss": 2.1505, + "step": 1569 + }, + { + "epoch": 0.165176223040505, + "grad_norm": 1.0593661069869995, + "learning_rate": 0.00019057910498888352, + "loss": 2.1309, + "step": 1570 + }, + { + "epoch": 0.16528143082588112, + "grad_norm": 1.552679181098938, + "learning_rate": 0.00019056466030876288, + "loss": 1.8734, + "step": 1571 + }, + { + "epoch": 0.16538663861125724, + "grad_norm": 1.9197827577590942, + "learning_rate": 0.00019055020511166647, + "loss": 1.7033, + "step": 1572 + }, + { + "epoch": 0.16549184639663336, + "grad_norm": 1.1575535535812378, + "learning_rate": 0.0001905357393992729, + "loss": 2.3521, + "step": 1573 + }, + { + "epoch": 0.16559705418200946, + "grad_norm": 1.0774550437927246, + "learning_rate": 0.00019052126317326207, + "loss": 2.1065, + "step": 1574 + }, + { + "epoch": 0.16570226196738558, + "grad_norm": 1.0978038311004639, + "learning_rate": 0.000190506776435315, + "loss": 1.7549, + "step": 1575 + }, + { + "epoch": 0.1658074697527617, + "grad_norm": 0.8305821418762207, + "learning_rate": 0.00019049227918711402, + "loss": 2.5738, + "step": 1576 + }, + { + "epoch": 0.16591267753813782, + "grad_norm": 1.5406334400177002, + "learning_rate": 0.00019047777143034266, + "loss": 1.5855, + "step": 1577 + }, + { + "epoch": 0.16601788532351394, + "grad_norm": 1.2726386785507202, + "learning_rate": 0.00019046325316668562, + "loss": 1.9179, + "step": 1578 + }, + { + "epoch": 0.16612309310889006, + "grad_norm": 0.9605535864830017, + "learning_rate": 0.0001904487243978289, + "loss": 2.2189, + "step": 1579 + }, + { + "epoch": 0.16622830089426618, + "grad_norm": 1.0368754863739014, + "learning_rate": 0.00019043418512545963, + "loss": 2.1578, + "step": 1580 + }, + { + "epoch": 0.1663335086796423, + "grad_norm": 0.9062163829803467, + "learning_rate": 0.00019041963535126625, + "loss": 2.2097, + "step": 1581 + }, + { + "epoch": 0.1664387164650184, + "grad_norm": 1.6533317565917969, + "learning_rate": 0.00019040507507693836, + "loss": 2.1455, + "step": 1582 + }, + { + "epoch": 0.16654392425039452, + "grad_norm": 0.9693316221237183, + "learning_rate": 0.0001903905043041668, + "loss": 2.1971, + "step": 1583 + }, + { + "epoch": 0.16664913203577064, + "grad_norm": 0.8169092535972595, + "learning_rate": 0.00019037592303464362, + "loss": 2.4642, + "step": 1584 + }, + { + "epoch": 0.16675433982114676, + "grad_norm": 1.0367541313171387, + "learning_rate": 0.0001903613312700621, + "loss": 2.2354, + "step": 1585 + }, + { + "epoch": 0.16685954760652288, + "grad_norm": 1.3058645725250244, + "learning_rate": 0.00019034672901211672, + "loss": 1.8324, + "step": 1586 + }, + { + "epoch": 0.166964755391899, + "grad_norm": 1.1962265968322754, + "learning_rate": 0.0001903321162625032, + "loss": 2.0343, + "step": 1587 + }, + { + "epoch": 0.16706996317727513, + "grad_norm": 1.3322851657867432, + "learning_rate": 0.0001903174930229185, + "loss": 2.0793, + "step": 1588 + }, + { + "epoch": 0.16717517096265125, + "grad_norm": 0.9945839643478394, + "learning_rate": 0.00019030285929506075, + "loss": 1.8876, + "step": 1589 + }, + { + "epoch": 0.16728037874802734, + "grad_norm": 1.0389971733093262, + "learning_rate": 0.0001902882150806293, + "loss": 2.1234, + "step": 1590 + }, + { + "epoch": 0.16738558653340346, + "grad_norm": 1.1944520473480225, + "learning_rate": 0.00019027356038132473, + "loss": 2.3191, + "step": 1591 + }, + { + "epoch": 0.16749079431877958, + "grad_norm": 1.4879711866378784, + "learning_rate": 0.00019025889519884887, + "loss": 1.7512, + "step": 1592 + }, + { + "epoch": 0.1675960021041557, + "grad_norm": 0.7883063554763794, + "learning_rate": 0.00019024421953490472, + "loss": 2.2303, + "step": 1593 + }, + { + "epoch": 0.16770120988953183, + "grad_norm": 1.214263677597046, + "learning_rate": 0.00019022953339119654, + "loss": 1.8914, + "step": 1594 + }, + { + "epoch": 0.16780641767490795, + "grad_norm": 1.0355852842330933, + "learning_rate": 0.00019021483676942973, + "loss": 2.1548, + "step": 1595 + }, + { + "epoch": 0.16791162546028407, + "grad_norm": 1.011046290397644, + "learning_rate": 0.00019020012967131106, + "loss": 1.8183, + "step": 1596 + }, + { + "epoch": 0.1680168332456602, + "grad_norm": 1.062743067741394, + "learning_rate": 0.0001901854120985483, + "loss": 1.9263, + "step": 1597 + }, + { + "epoch": 0.16812204103103628, + "grad_norm": 0.8611815571784973, + "learning_rate": 0.00019017068405285058, + "loss": 2.2722, + "step": 1598 + }, + { + "epoch": 0.1682272488164124, + "grad_norm": 1.0582350492477417, + "learning_rate": 0.0001901559455359283, + "loss": 2.1025, + "step": 1599 + }, + { + "epoch": 0.16833245660178853, + "grad_norm": 0.7138919234275818, + "learning_rate": 0.00019014119654949294, + "loss": 1.8953, + "step": 1600 + }, + { + "epoch": 0.16843766438716465, + "grad_norm": 1.2566789388656616, + "learning_rate": 0.00019012643709525722, + "loss": 2.494, + "step": 1601 + }, + { + "epoch": 0.16854287217254077, + "grad_norm": 2.009523391723633, + "learning_rate": 0.00019011166717493517, + "loss": 1.0732, + "step": 1602 + }, + { + "epoch": 0.1686480799579169, + "grad_norm": 1.5301228761672974, + "learning_rate": 0.0001900968867902419, + "loss": 1.6838, + "step": 1603 + }, + { + "epoch": 0.168753287743293, + "grad_norm": 1.3879108428955078, + "learning_rate": 0.0001900820959428939, + "loss": 2.1531, + "step": 1604 + }, + { + "epoch": 0.16885849552866913, + "grad_norm": 1.2042590379714966, + "learning_rate": 0.0001900672946346087, + "loss": 2.2596, + "step": 1605 + }, + { + "epoch": 0.16896370331404523, + "grad_norm": 1.7592346668243408, + "learning_rate": 0.00019005248286710518, + "loss": 2.145, + "step": 1606 + }, + { + "epoch": 0.16906891109942135, + "grad_norm": 0.7835527658462524, + "learning_rate": 0.00019003766064210336, + "loss": 2.4961, + "step": 1607 + }, + { + "epoch": 0.16917411888479747, + "grad_norm": 1.1461858749389648, + "learning_rate": 0.00019002282796132448, + "loss": 1.8269, + "step": 1608 + }, + { + "epoch": 0.1692793266701736, + "grad_norm": 1.6687688827514648, + "learning_rate": 0.000190007984826491, + "loss": 1.8541, + "step": 1609 + }, + { + "epoch": 0.1693845344555497, + "grad_norm": 0.8094751238822937, + "learning_rate": 0.0001899931312393267, + "loss": 2.0899, + "step": 1610 + }, + { + "epoch": 0.16948974224092583, + "grad_norm": 1.5134220123291016, + "learning_rate": 0.00018997826720155636, + "loss": 2.0814, + "step": 1611 + }, + { + "epoch": 0.16959495002630195, + "grad_norm": 1.034388542175293, + "learning_rate": 0.00018996339271490616, + "loss": 1.7283, + "step": 1612 + }, + { + "epoch": 0.16970015781167808, + "grad_norm": 0.7677897214889526, + "learning_rate": 0.0001899485077811034, + "loss": 2.0904, + "step": 1613 + }, + { + "epoch": 0.16980536559705417, + "grad_norm": 1.4724828004837036, + "learning_rate": 0.00018993361240187665, + "loss": 1.9291, + "step": 1614 + }, + { + "epoch": 0.1699105733824303, + "grad_norm": 1.181376338005066, + "learning_rate": 0.00018991870657895558, + "loss": 1.8633, + "step": 1615 + }, + { + "epoch": 0.1700157811678064, + "grad_norm": 1.2235981225967407, + "learning_rate": 0.00018990379031407124, + "loss": 1.8902, + "step": 1616 + }, + { + "epoch": 0.17012098895318253, + "grad_norm": 1.796578288078308, + "learning_rate": 0.0001898888636089558, + "loss": 1.7133, + "step": 1617 + }, + { + "epoch": 0.17022619673855865, + "grad_norm": 0.8562111854553223, + "learning_rate": 0.00018987392646534258, + "loss": 1.918, + "step": 1618 + }, + { + "epoch": 0.17033140452393478, + "grad_norm": 1.1729540824890137, + "learning_rate": 0.00018985897888496627, + "loss": 2.0491, + "step": 1619 + }, + { + "epoch": 0.1704366123093109, + "grad_norm": 1.2273286581039429, + "learning_rate": 0.0001898440208695626, + "loss": 1.9927, + "step": 1620 + }, + { + "epoch": 0.17054182009468702, + "grad_norm": 1.4244452714920044, + "learning_rate": 0.00018982905242086867, + "loss": 1.7184, + "step": 1621 + }, + { + "epoch": 0.1706470278800631, + "grad_norm": 1.2891523838043213, + "learning_rate": 0.00018981407354062268, + "loss": 2.1326, + "step": 1622 + }, + { + "epoch": 0.17075223566543923, + "grad_norm": 1.7193819284439087, + "learning_rate": 0.00018979908423056408, + "loss": 2.0726, + "step": 1623 + }, + { + "epoch": 0.17085744345081536, + "grad_norm": 1.0913747549057007, + "learning_rate": 0.00018978408449243353, + "loss": 2.1798, + "step": 1624 + }, + { + "epoch": 0.17096265123619148, + "grad_norm": 1.2384060621261597, + "learning_rate": 0.00018976907432797287, + "loss": 2.2418, + "step": 1625 + }, + { + "epoch": 0.1710678590215676, + "grad_norm": 1.3474977016448975, + "learning_rate": 0.00018975405373892524, + "loss": 2.2404, + "step": 1626 + }, + { + "epoch": 0.17117306680694372, + "grad_norm": 1.227752685546875, + "learning_rate": 0.0001897390227270349, + "loss": 2.2127, + "step": 1627 + }, + { + "epoch": 0.17127827459231984, + "grad_norm": 1.1790754795074463, + "learning_rate": 0.00018972398129404736, + "loss": 2.3705, + "step": 1628 + }, + { + "epoch": 0.17138348237769596, + "grad_norm": 1.323519229888916, + "learning_rate": 0.00018970892944170933, + "loss": 1.9858, + "step": 1629 + }, + { + "epoch": 0.17148869016307206, + "grad_norm": 0.9499973058700562, + "learning_rate": 0.0001896938671717687, + "loss": 1.9775, + "step": 1630 + }, + { + "epoch": 0.17159389794844818, + "grad_norm": 0.8820815086364746, + "learning_rate": 0.00018967879448597463, + "loss": 2.2519, + "step": 1631 + }, + { + "epoch": 0.1716991057338243, + "grad_norm": 1.1254478693008423, + "learning_rate": 0.00018966371138607748, + "loss": 2.1188, + "step": 1632 + }, + { + "epoch": 0.17180431351920042, + "grad_norm": 0.9858778715133667, + "learning_rate": 0.00018964861787382876, + "loss": 2.0559, + "step": 1633 + }, + { + "epoch": 0.17190952130457654, + "grad_norm": 0.9913214445114136, + "learning_rate": 0.0001896335139509813, + "loss": 2.1155, + "step": 1634 + }, + { + "epoch": 0.17201472908995266, + "grad_norm": 0.8039067983627319, + "learning_rate": 0.00018961839961928898, + "loss": 1.7209, + "step": 1635 + }, + { + "epoch": 0.17211993687532878, + "grad_norm": 1.1919358968734741, + "learning_rate": 0.00018960327488050705, + "loss": 1.7773, + "step": 1636 + }, + { + "epoch": 0.1722251446607049, + "grad_norm": 1.3590924739837646, + "learning_rate": 0.00018958813973639184, + "loss": 1.134, + "step": 1637 + }, + { + "epoch": 0.172330352446081, + "grad_norm": 0.8363476991653442, + "learning_rate": 0.00018957299418870095, + "loss": 2.165, + "step": 1638 + }, + { + "epoch": 0.17243556023145712, + "grad_norm": 0.8952641487121582, + "learning_rate": 0.00018955783823919325, + "loss": 2.4521, + "step": 1639 + }, + { + "epoch": 0.17254076801683324, + "grad_norm": 1.4345828294754028, + "learning_rate": 0.0001895426718896287, + "loss": 1.983, + "step": 1640 + }, + { + "epoch": 0.17264597580220936, + "grad_norm": 1.2068872451782227, + "learning_rate": 0.00018952749514176848, + "loss": 2.0875, + "step": 1641 + }, + { + "epoch": 0.17275118358758548, + "grad_norm": 1.1335731744766235, + "learning_rate": 0.00018951230799737508, + "loss": 2.2086, + "step": 1642 + }, + { + "epoch": 0.1728563913729616, + "grad_norm": 1.1118179559707642, + "learning_rate": 0.0001894971104582121, + "loss": 2.2505, + "step": 1643 + }, + { + "epoch": 0.17296159915833773, + "grad_norm": 1.528136968612671, + "learning_rate": 0.0001894819025260444, + "loss": 1.9484, + "step": 1644 + }, + { + "epoch": 0.17306680694371385, + "grad_norm": 1.4697853326797485, + "learning_rate": 0.00018946668420263802, + "loss": 2.1765, + "step": 1645 + }, + { + "epoch": 0.17317201472908994, + "grad_norm": 1.0284318923950195, + "learning_rate": 0.0001894514554897602, + "loss": 2.173, + "step": 1646 + }, + { + "epoch": 0.17327722251446606, + "grad_norm": 0.9980498552322388, + "learning_rate": 0.0001894362163891794, + "loss": 2.1735, + "step": 1647 + }, + { + "epoch": 0.17338243029984218, + "grad_norm": 1.4780910015106201, + "learning_rate": 0.00018942096690266534, + "loss": 1.8087, + "step": 1648 + }, + { + "epoch": 0.1734876380852183, + "grad_norm": 0.8312754034996033, + "learning_rate": 0.00018940570703198878, + "loss": 2.2845, + "step": 1649 + }, + { + "epoch": 0.17359284587059443, + "grad_norm": 0.8467996120452881, + "learning_rate": 0.00018939043677892192, + "loss": 2.1111, + "step": 1650 + }, + { + "epoch": 0.17369805365597055, + "grad_norm": 1.1280200481414795, + "learning_rate": 0.00018937515614523797, + "loss": 1.7534, + "step": 1651 + }, + { + "epoch": 0.17380326144134667, + "grad_norm": 1.277297019958496, + "learning_rate": 0.00018935986513271146, + "loss": 2.1373, + "step": 1652 + }, + { + "epoch": 0.1739084692267228, + "grad_norm": 0.9894921779632568, + "learning_rate": 0.00018934456374311806, + "loss": 1.9866, + "step": 1653 + }, + { + "epoch": 0.17401367701209888, + "grad_norm": 1.4973275661468506, + "learning_rate": 0.00018932925197823468, + "loss": 1.9207, + "step": 1654 + }, + { + "epoch": 0.174118884797475, + "grad_norm": 0.9163431525230408, + "learning_rate": 0.0001893139298398394, + "loss": 2.214, + "step": 1655 + }, + { + "epoch": 0.17422409258285113, + "grad_norm": 1.1179120540618896, + "learning_rate": 0.00018929859732971162, + "loss": 2.3422, + "step": 1656 + }, + { + "epoch": 0.17432930036822725, + "grad_norm": 1.4268755912780762, + "learning_rate": 0.00018928325444963172, + "loss": 1.8902, + "step": 1657 + }, + { + "epoch": 0.17443450815360337, + "grad_norm": 0.9193028211593628, + "learning_rate": 0.0001892679012013815, + "loss": 2.2039, + "step": 1658 + }, + { + "epoch": 0.1745397159389795, + "grad_norm": 0.7367107272148132, + "learning_rate": 0.00018925253758674386, + "loss": 2.1194, + "step": 1659 + }, + { + "epoch": 0.1746449237243556, + "grad_norm": 1.8018954992294312, + "learning_rate": 0.00018923716360750293, + "loss": 2.2415, + "step": 1660 + }, + { + "epoch": 0.17475013150973173, + "grad_norm": 1.6338098049163818, + "learning_rate": 0.00018922177926544405, + "loss": 1.9905, + "step": 1661 + }, + { + "epoch": 0.17485533929510783, + "grad_norm": 1.4636485576629639, + "learning_rate": 0.00018920638456235375, + "loss": 2.3444, + "step": 1662 + }, + { + "epoch": 0.17496054708048395, + "grad_norm": 0.698484480381012, + "learning_rate": 0.00018919097950001977, + "loss": 1.9483, + "step": 1663 + }, + { + "epoch": 0.17506575486586007, + "grad_norm": 0.85833340883255, + "learning_rate": 0.00018917556408023102, + "loss": 2.0118, + "step": 1664 + }, + { + "epoch": 0.1751709626512362, + "grad_norm": 1.2028058767318726, + "learning_rate": 0.00018916013830477766, + "loss": 2.0637, + "step": 1665 + }, + { + "epoch": 0.1752761704366123, + "grad_norm": 0.8081074357032776, + "learning_rate": 0.00018914470217545103, + "loss": 1.7637, + "step": 1666 + }, + { + "epoch": 0.17538137822198843, + "grad_norm": 0.9753775000572205, + "learning_rate": 0.0001891292556940437, + "loss": 2.4205, + "step": 1667 + }, + { + "epoch": 0.17548658600736455, + "grad_norm": 0.8399510383605957, + "learning_rate": 0.00018911379886234938, + "loss": 1.6638, + "step": 1668 + }, + { + "epoch": 0.17559179379274067, + "grad_norm": 1.3433152437210083, + "learning_rate": 0.00018909833168216306, + "loss": 2.049, + "step": 1669 + }, + { + "epoch": 0.17569700157811677, + "grad_norm": 0.8703423142433167, + "learning_rate": 0.00018908285415528088, + "loss": 1.973, + "step": 1670 + }, + { + "epoch": 0.1758022093634929, + "grad_norm": 1.1213691234588623, + "learning_rate": 0.0001890673662835002, + "loss": 2.2099, + "step": 1671 + }, + { + "epoch": 0.175907417148869, + "grad_norm": 1.0827136039733887, + "learning_rate": 0.00018905186806861957, + "loss": 2.2668, + "step": 1672 + }, + { + "epoch": 0.17601262493424513, + "grad_norm": 1.356737494468689, + "learning_rate": 0.0001890363595124387, + "loss": 2.206, + "step": 1673 + }, + { + "epoch": 0.17611783271962125, + "grad_norm": 1.1863815784454346, + "learning_rate": 0.00018902084061675863, + "loss": 1.8536, + "step": 1674 + }, + { + "epoch": 0.17622304050499737, + "grad_norm": 0.8727356791496277, + "learning_rate": 0.00018900531138338144, + "loss": 2.0044, + "step": 1675 + }, + { + "epoch": 0.1763282482903735, + "grad_norm": 1.1327565908432007, + "learning_rate": 0.00018898977181411054, + "loss": 1.8945, + "step": 1676 + }, + { + "epoch": 0.17643345607574962, + "grad_norm": 0.8317870497703552, + "learning_rate": 0.0001889742219107505, + "loss": 2.3596, + "step": 1677 + }, + { + "epoch": 0.1765386638611257, + "grad_norm": 1.6865041255950928, + "learning_rate": 0.00018895866167510704, + "loss": 2.0753, + "step": 1678 + }, + { + "epoch": 0.17664387164650183, + "grad_norm": 1.1657840013504028, + "learning_rate": 0.00018894309110898712, + "loss": 2.2913, + "step": 1679 + }, + { + "epoch": 0.17674907943187795, + "grad_norm": 1.2765939235687256, + "learning_rate": 0.0001889275102141989, + "loss": 1.9399, + "step": 1680 + }, + { + "epoch": 0.17685428721725407, + "grad_norm": 1.0933256149291992, + "learning_rate": 0.0001889119189925518, + "loss": 2.0644, + "step": 1681 + }, + { + "epoch": 0.1769594950026302, + "grad_norm": 1.196715235710144, + "learning_rate": 0.0001888963174458563, + "loss": 2.2077, + "step": 1682 + }, + { + "epoch": 0.17706470278800632, + "grad_norm": 0.8405833840370178, + "learning_rate": 0.00018888070557592418, + "loss": 2.35, + "step": 1683 + }, + { + "epoch": 0.17716991057338244, + "grad_norm": 0.6497626304626465, + "learning_rate": 0.0001888650833845684, + "loss": 2.0005, + "step": 1684 + }, + { + "epoch": 0.17727511835875856, + "grad_norm": 1.1812434196472168, + "learning_rate": 0.00018884945087360312, + "loss": 1.9776, + "step": 1685 + }, + { + "epoch": 0.17738032614413465, + "grad_norm": 1.2201387882232666, + "learning_rate": 0.00018883380804484367, + "loss": 1.9608, + "step": 1686 + }, + { + "epoch": 0.17748553392951077, + "grad_norm": 1.2153204679489136, + "learning_rate": 0.00018881815490010662, + "loss": 2.1224, + "step": 1687 + }, + { + "epoch": 0.1775907417148869, + "grad_norm": 0.8220624327659607, + "learning_rate": 0.00018880249144120973, + "loss": 2.0308, + "step": 1688 + }, + { + "epoch": 0.17769594950026302, + "grad_norm": 1.218924880027771, + "learning_rate": 0.0001887868176699719, + "loss": 2.0924, + "step": 1689 + }, + { + "epoch": 0.17780115728563914, + "grad_norm": 1.4680471420288086, + "learning_rate": 0.0001887711335882133, + "loss": 1.4879, + "step": 1690 + }, + { + "epoch": 0.17790636507101526, + "grad_norm": 0.9292452335357666, + "learning_rate": 0.00018875543919775534, + "loss": 2.3925, + "step": 1691 + }, + { + "epoch": 0.17801157285639138, + "grad_norm": 0.935463011264801, + "learning_rate": 0.00018873973450042044, + "loss": 1.7671, + "step": 1692 + }, + { + "epoch": 0.1781167806417675, + "grad_norm": 1.0145299434661865, + "learning_rate": 0.00018872401949803237, + "loss": 1.9564, + "step": 1693 + }, + { + "epoch": 0.1782219884271436, + "grad_norm": 1.4179667234420776, + "learning_rate": 0.00018870829419241608, + "loss": 1.9507, + "step": 1694 + }, + { + "epoch": 0.17832719621251972, + "grad_norm": 1.5500625371932983, + "learning_rate": 0.0001886925585853977, + "loss": 1.6376, + "step": 1695 + }, + { + "epoch": 0.17843240399789584, + "grad_norm": 1.2018243074417114, + "learning_rate": 0.0001886768126788046, + "loss": 1.9299, + "step": 1696 + }, + { + "epoch": 0.17853761178327196, + "grad_norm": 1.6733994483947754, + "learning_rate": 0.0001886610564744652, + "loss": 2.1827, + "step": 1697 + }, + { + "epoch": 0.17864281956864808, + "grad_norm": 1.0895193815231323, + "learning_rate": 0.00018864528997420928, + "loss": 2.2253, + "step": 1698 + }, + { + "epoch": 0.1787480273540242, + "grad_norm": 1.5276850461959839, + "learning_rate": 0.0001886295131798677, + "loss": 2.1097, + "step": 1699 + }, + { + "epoch": 0.17885323513940032, + "grad_norm": 1.0547844171524048, + "learning_rate": 0.00018861372609327263, + "loss": 1.8359, + "step": 1700 + }, + { + "epoch": 0.17895844292477645, + "grad_norm": 1.3544683456420898, + "learning_rate": 0.00018859792871625736, + "loss": 1.8435, + "step": 1701 + }, + { + "epoch": 0.17906365071015254, + "grad_norm": 1.3501803874969482, + "learning_rate": 0.0001885821210506564, + "loss": 1.7588, + "step": 1702 + }, + { + "epoch": 0.17916885849552866, + "grad_norm": 1.4099911451339722, + "learning_rate": 0.00018856630309830536, + "loss": 2.0634, + "step": 1703 + }, + { + "epoch": 0.17927406628090478, + "grad_norm": 1.3355512619018555, + "learning_rate": 0.0001885504748610412, + "loss": 1.6948, + "step": 1704 + }, + { + "epoch": 0.1793792740662809, + "grad_norm": 1.5482978820800781, + "learning_rate": 0.000188534636340702, + "loss": 1.7464, + "step": 1705 + }, + { + "epoch": 0.17948448185165702, + "grad_norm": 1.2930607795715332, + "learning_rate": 0.000188518787539127, + "loss": 2.3852, + "step": 1706 + }, + { + "epoch": 0.17958968963703315, + "grad_norm": 1.4282091856002808, + "learning_rate": 0.00018850292845815672, + "loss": 1.8695, + "step": 1707 + }, + { + "epoch": 0.17969489742240927, + "grad_norm": 1.1130648851394653, + "learning_rate": 0.00018848705909963275, + "loss": 2.0868, + "step": 1708 + }, + { + "epoch": 0.1798001052077854, + "grad_norm": 1.1909211874008179, + "learning_rate": 0.000188471179465398, + "loss": 1.8749, + "step": 1709 + }, + { + "epoch": 0.17990531299316148, + "grad_norm": 0.7360309958457947, + "learning_rate": 0.00018845528955729654, + "loss": 1.9737, + "step": 1710 + }, + { + "epoch": 0.1800105207785376, + "grad_norm": 1.4310061931610107, + "learning_rate": 0.00018843938937717356, + "loss": 2.1225, + "step": 1711 + }, + { + "epoch": 0.18011572856391372, + "grad_norm": 1.2631195783615112, + "learning_rate": 0.00018842347892687552, + "loss": 1.7833, + "step": 1712 + }, + { + "epoch": 0.18022093634928985, + "grad_norm": 1.025079607963562, + "learning_rate": 0.00018840755820825002, + "loss": 2.2132, + "step": 1713 + }, + { + "epoch": 0.18032614413466597, + "grad_norm": 0.9161180257797241, + "learning_rate": 0.0001883916272231459, + "loss": 2.1401, + "step": 1714 + }, + { + "epoch": 0.1804313519200421, + "grad_norm": 0.9029863476753235, + "learning_rate": 0.0001883756859734132, + "loss": 1.8179, + "step": 1715 + }, + { + "epoch": 0.1805365597054182, + "grad_norm": 1.060342788696289, + "learning_rate": 0.00018835973446090312, + "loss": 1.8488, + "step": 1716 + }, + { + "epoch": 0.18064176749079433, + "grad_norm": 1.106149435043335, + "learning_rate": 0.000188343772687468, + "loss": 2.1516, + "step": 1717 + }, + { + "epoch": 0.18074697527617042, + "grad_norm": 1.5603445768356323, + "learning_rate": 0.0001883278006549615, + "loss": 1.7992, + "step": 1718 + }, + { + "epoch": 0.18085218306154655, + "grad_norm": 1.4736990928649902, + "learning_rate": 0.00018831181836523832, + "loss": 1.5836, + "step": 1719 + }, + { + "epoch": 0.18095739084692267, + "grad_norm": 1.0077097415924072, + "learning_rate": 0.00018829582582015453, + "loss": 1.9346, + "step": 1720 + }, + { + "epoch": 0.1810625986322988, + "grad_norm": 1.0348749160766602, + "learning_rate": 0.0001882798230215672, + "loss": 1.9961, + "step": 1721 + }, + { + "epoch": 0.1811678064176749, + "grad_norm": 1.658117413520813, + "learning_rate": 0.00018826380997133475, + "loss": 1.9501, + "step": 1722 + }, + { + "epoch": 0.18127301420305103, + "grad_norm": 0.9015898704528809, + "learning_rate": 0.00018824778667131669, + "loss": 2.412, + "step": 1723 + }, + { + "epoch": 0.18137822198842715, + "grad_norm": 1.5265978574752808, + "learning_rate": 0.00018823175312337374, + "loss": 1.9471, + "step": 1724 + }, + { + "epoch": 0.18148342977380327, + "grad_norm": 1.099857211112976, + "learning_rate": 0.00018821570932936785, + "loss": 2.1251, + "step": 1725 + }, + { + "epoch": 0.18158863755917937, + "grad_norm": 0.8147949576377869, + "learning_rate": 0.0001881996552911621, + "loss": 1.7347, + "step": 1726 + }, + { + "epoch": 0.1816938453445555, + "grad_norm": 1.1035587787628174, + "learning_rate": 0.00018818359101062087, + "loss": 2.0451, + "step": 1727 + }, + { + "epoch": 0.1817990531299316, + "grad_norm": 1.2525732517242432, + "learning_rate": 0.00018816751648960956, + "loss": 2.0103, + "step": 1728 + }, + { + "epoch": 0.18190426091530773, + "grad_norm": 1.0820599794387817, + "learning_rate": 0.0001881514317299949, + "loss": 2.0033, + "step": 1729 + }, + { + "epoch": 0.18200946870068385, + "grad_norm": 1.1944822072982788, + "learning_rate": 0.00018813533673364474, + "loss": 1.5899, + "step": 1730 + }, + { + "epoch": 0.18211467648605997, + "grad_norm": 0.9098942875862122, + "learning_rate": 0.00018811923150242814, + "loss": 1.6848, + "step": 1731 + }, + { + "epoch": 0.1822198842714361, + "grad_norm": 1.7374067306518555, + "learning_rate": 0.00018810311603821534, + "loss": 1.9592, + "step": 1732 + }, + { + "epoch": 0.18232509205681222, + "grad_norm": 1.2925838232040405, + "learning_rate": 0.00018808699034287784, + "loss": 2.1075, + "step": 1733 + }, + { + "epoch": 0.1824302998421883, + "grad_norm": 1.4516898393630981, + "learning_rate": 0.0001880708544182882, + "loss": 1.8392, + "step": 1734 + }, + { + "epoch": 0.18253550762756443, + "grad_norm": 1.116715908050537, + "learning_rate": 0.00018805470826632024, + "loss": 2.0185, + "step": 1735 + }, + { + "epoch": 0.18264071541294055, + "grad_norm": 0.9944667816162109, + "learning_rate": 0.00018803855188884896, + "loss": 2.3831, + "step": 1736 + }, + { + "epoch": 0.18274592319831667, + "grad_norm": 0.8294582962989807, + "learning_rate": 0.00018802238528775055, + "loss": 2.1609, + "step": 1737 + }, + { + "epoch": 0.1828511309836928, + "grad_norm": 1.0243922472000122, + "learning_rate": 0.0001880062084649024, + "loss": 2.3259, + "step": 1738 + }, + { + "epoch": 0.18295633876906892, + "grad_norm": 1.0900909900665283, + "learning_rate": 0.00018799002142218306, + "loss": 2.0693, + "step": 1739 + }, + { + "epoch": 0.18306154655444504, + "grad_norm": 1.310567855834961, + "learning_rate": 0.00018797382416147227, + "loss": 2.1476, + "step": 1740 + }, + { + "epoch": 0.18316675433982116, + "grad_norm": 0.9449117183685303, + "learning_rate": 0.00018795761668465098, + "loss": 2.0165, + "step": 1741 + }, + { + "epoch": 0.18327196212519725, + "grad_norm": 1.4861035346984863, + "learning_rate": 0.0001879413989936013, + "loss": 1.583, + "step": 1742 + }, + { + "epoch": 0.18337716991057337, + "grad_norm": 1.134390950202942, + "learning_rate": 0.00018792517109020654, + "loss": 1.9263, + "step": 1743 + }, + { + "epoch": 0.1834823776959495, + "grad_norm": 1.9425288438796997, + "learning_rate": 0.00018790893297635118, + "loss": 2.2563, + "step": 1744 + }, + { + "epoch": 0.18358758548132562, + "grad_norm": 1.343895673751831, + "learning_rate": 0.0001878926846539209, + "loss": 1.8078, + "step": 1745 + }, + { + "epoch": 0.18369279326670174, + "grad_norm": 1.0244340896606445, + "learning_rate": 0.00018787642612480261, + "loss": 2.0321, + "step": 1746 + }, + { + "epoch": 0.18379800105207786, + "grad_norm": 1.5598095655441284, + "learning_rate": 0.0001878601573908843, + "loss": 2.149, + "step": 1747 + }, + { + "epoch": 0.18390320883745398, + "grad_norm": 1.1912977695465088, + "learning_rate": 0.00018784387845405525, + "loss": 2.3885, + "step": 1748 + }, + { + "epoch": 0.1840084166228301, + "grad_norm": 1.210438847541809, + "learning_rate": 0.00018782758931620584, + "loss": 1.8181, + "step": 1749 + }, + { + "epoch": 0.1841136244082062, + "grad_norm": 1.1713554859161377, + "learning_rate": 0.0001878112899792277, + "loss": 1.9201, + "step": 1750 + }, + { + "epoch": 0.18421883219358232, + "grad_norm": 1.1458523273468018, + "learning_rate": 0.0001877949804450136, + "loss": 1.8281, + "step": 1751 + }, + { + "epoch": 0.18432403997895844, + "grad_norm": 0.9206444621086121, + "learning_rate": 0.00018777866071545751, + "loss": 2.0485, + "step": 1752 + }, + { + "epoch": 0.18442924776433456, + "grad_norm": 1.196629285812378, + "learning_rate": 0.0001877623307924546, + "loss": 2.2331, + "step": 1753 + }, + { + "epoch": 0.18453445554971068, + "grad_norm": 1.2622110843658447, + "learning_rate": 0.00018774599067790127, + "loss": 1.6533, + "step": 1754 + }, + { + "epoch": 0.1846396633350868, + "grad_norm": 1.3255887031555176, + "learning_rate": 0.0001877296403736949, + "loss": 1.9875, + "step": 1755 + }, + { + "epoch": 0.18474487112046292, + "grad_norm": 0.9285155534744263, + "learning_rate": 0.00018771327988173435, + "loss": 2.0829, + "step": 1756 + }, + { + "epoch": 0.18485007890583904, + "grad_norm": 1.3429813385009766, + "learning_rate": 0.0001876969092039194, + "loss": 2.3519, + "step": 1757 + }, + { + "epoch": 0.18495528669121514, + "grad_norm": 0.8573184013366699, + "learning_rate": 0.0001876805283421512, + "loss": 2.2899, + "step": 1758 + }, + { + "epoch": 0.18506049447659126, + "grad_norm": 1.3685758113861084, + "learning_rate": 0.00018766413729833192, + "loss": 2.1059, + "step": 1759 + }, + { + "epoch": 0.18516570226196738, + "grad_norm": 1.2804449796676636, + "learning_rate": 0.0001876477360743651, + "loss": 2.1972, + "step": 1760 + }, + { + "epoch": 0.1852709100473435, + "grad_norm": 1.6442826986312866, + "learning_rate": 0.00018763132467215527, + "loss": 1.6334, + "step": 1761 + }, + { + "epoch": 0.18537611783271962, + "grad_norm": 1.3029580116271973, + "learning_rate": 0.00018761490309360826, + "loss": 1.8276, + "step": 1762 + }, + { + "epoch": 0.18548132561809574, + "grad_norm": 1.166380524635315, + "learning_rate": 0.00018759847134063108, + "loss": 1.6849, + "step": 1763 + }, + { + "epoch": 0.18558653340347187, + "grad_norm": 1.6139475107192993, + "learning_rate": 0.0001875820294151319, + "loss": 1.8161, + "step": 1764 + }, + { + "epoch": 0.185691741188848, + "grad_norm": 1.228589653968811, + "learning_rate": 0.00018756557731902, + "loss": 1.7924, + "step": 1765 + }, + { + "epoch": 0.18579694897422408, + "grad_norm": 1.2991068363189697, + "learning_rate": 0.00018754911505420598, + "loss": 1.9833, + "step": 1766 + }, + { + "epoch": 0.1859021567596002, + "grad_norm": 1.323332667350769, + "learning_rate": 0.00018753264262260153, + "loss": 2.072, + "step": 1767 + }, + { + "epoch": 0.18600736454497632, + "grad_norm": 1.4269095659255981, + "learning_rate": 0.0001875161600261195, + "loss": 2.2458, + "step": 1768 + }, + { + "epoch": 0.18611257233035244, + "grad_norm": 1.6868364810943604, + "learning_rate": 0.000187499667266674, + "loss": 1.4101, + "step": 1769 + }, + { + "epoch": 0.18621778011572857, + "grad_norm": 1.9576934576034546, + "learning_rate": 0.0001874831643461803, + "loss": 2.193, + "step": 1770 + }, + { + "epoch": 0.1863229879011047, + "grad_norm": 1.4027034044265747, + "learning_rate": 0.00018746665126655477, + "loss": 2.0634, + "step": 1771 + }, + { + "epoch": 0.1864281956864808, + "grad_norm": 1.0985709428787231, + "learning_rate": 0.00018745012802971503, + "loss": 2.1046, + "step": 1772 + }, + { + "epoch": 0.18653340347185693, + "grad_norm": 0.9427407383918762, + "learning_rate": 0.00018743359463757996, + "loss": 2.001, + "step": 1773 + }, + { + "epoch": 0.18663861125723302, + "grad_norm": 1.0140053033828735, + "learning_rate": 0.0001874170510920694, + "loss": 2.0172, + "step": 1774 + }, + { + "epoch": 0.18674381904260914, + "grad_norm": 1.3929756879806519, + "learning_rate": 0.00018740049739510454, + "loss": 1.9716, + "step": 1775 + }, + { + "epoch": 0.18684902682798527, + "grad_norm": 0.9349243640899658, + "learning_rate": 0.00018738393354860775, + "loss": 2.0728, + "step": 1776 + }, + { + "epoch": 0.1869542346133614, + "grad_norm": 0.9166633486747742, + "learning_rate": 0.00018736735955450251, + "loss": 1.9776, + "step": 1777 + }, + { + "epoch": 0.1870594423987375, + "grad_norm": 1.3984131813049316, + "learning_rate": 0.0001873507754147135, + "loss": 2.0968, + "step": 1778 + }, + { + "epoch": 0.18716465018411363, + "grad_norm": 1.1432921886444092, + "learning_rate": 0.0001873341811311666, + "loss": 1.687, + "step": 1779 + }, + { + "epoch": 0.18726985796948975, + "grad_norm": 1.310202956199646, + "learning_rate": 0.00018731757670578878, + "loss": 1.7605, + "step": 1780 + }, + { + "epoch": 0.18737506575486587, + "grad_norm": 1.1294410228729248, + "learning_rate": 0.00018730096214050832, + "loss": 1.805, + "step": 1781 + }, + { + "epoch": 0.18748027354024197, + "grad_norm": 0.7705867886543274, + "learning_rate": 0.0001872843374372546, + "loss": 2.1706, + "step": 1782 + }, + { + "epoch": 0.1875854813256181, + "grad_norm": 1.1728864908218384, + "learning_rate": 0.00018726770259795821, + "loss": 2.1194, + "step": 1783 + }, + { + "epoch": 0.1876906891109942, + "grad_norm": 1.0230637788772583, + "learning_rate": 0.0001872510576245509, + "loss": 2.2653, + "step": 1784 + }, + { + "epoch": 0.18779589689637033, + "grad_norm": 1.1115864515304565, + "learning_rate": 0.00018723440251896552, + "loss": 2.1956, + "step": 1785 + }, + { + "epoch": 0.18790110468174645, + "grad_norm": 1.0122426748275757, + "learning_rate": 0.00018721773728313628, + "loss": 1.6465, + "step": 1786 + }, + { + "epoch": 0.18800631246712257, + "grad_norm": 0.9406682848930359, + "learning_rate": 0.0001872010619189984, + "loss": 1.7737, + "step": 1787 + }, + { + "epoch": 0.1881115202524987, + "grad_norm": 1.0271477699279785, + "learning_rate": 0.00018718437642848833, + "loss": 2.3859, + "step": 1788 + }, + { + "epoch": 0.18821672803787481, + "grad_norm": 1.2405545711517334, + "learning_rate": 0.00018716768081354374, + "loss": 1.8296, + "step": 1789 + }, + { + "epoch": 0.1883219358232509, + "grad_norm": 1.5526014566421509, + "learning_rate": 0.0001871509750761034, + "loss": 2.0319, + "step": 1790 + }, + { + "epoch": 0.18842714360862703, + "grad_norm": 1.2660319805145264, + "learning_rate": 0.00018713425921810733, + "loss": 2.2965, + "step": 1791 + }, + { + "epoch": 0.18853235139400315, + "grad_norm": 0.9081699848175049, + "learning_rate": 0.00018711753324149663, + "loss": 2.2645, + "step": 1792 + }, + { + "epoch": 0.18863755917937927, + "grad_norm": 0.7778252363204956, + "learning_rate": 0.00018710079714821367, + "loss": 2.1503, + "step": 1793 + }, + { + "epoch": 0.1887427669647554, + "grad_norm": 0.903505802154541, + "learning_rate": 0.00018708405094020197, + "loss": 2.1276, + "step": 1794 + }, + { + "epoch": 0.18884797475013151, + "grad_norm": 0.9733725190162659, + "learning_rate": 0.00018706729461940617, + "loss": 2.1536, + "step": 1795 + }, + { + "epoch": 0.18895318253550764, + "grad_norm": 1.1625339984893799, + "learning_rate": 0.00018705052818777219, + "loss": 1.7845, + "step": 1796 + }, + { + "epoch": 0.18905839032088376, + "grad_norm": 0.9139503836631775, + "learning_rate": 0.000187033751647247, + "loss": 2.1074, + "step": 1797 + }, + { + "epoch": 0.18916359810625985, + "grad_norm": 1.0095267295837402, + "learning_rate": 0.00018701696499977884, + "loss": 1.7807, + "step": 1798 + }, + { + "epoch": 0.18926880589163597, + "grad_norm": 0.9188694357872009, + "learning_rate": 0.00018700016824731706, + "loss": 1.9302, + "step": 1799 + }, + { + "epoch": 0.1893740136770121, + "grad_norm": 0.9856252074241638, + "learning_rate": 0.0001869833613918122, + "loss": 2.1511, + "step": 1800 + }, + { + "epoch": 0.18947922146238821, + "grad_norm": 1.3189128637313843, + "learning_rate": 0.00018696654443521607, + "loss": 1.8588, + "step": 1801 + }, + { + "epoch": 0.18958442924776434, + "grad_norm": 1.3261507749557495, + "learning_rate": 0.00018694971737948145, + "loss": 1.9224, + "step": 1802 + }, + { + "epoch": 0.18968963703314046, + "grad_norm": 1.4043636322021484, + "learning_rate": 0.00018693288022656252, + "loss": 1.8017, + "step": 1803 + }, + { + "epoch": 0.18979484481851658, + "grad_norm": 1.6043760776519775, + "learning_rate": 0.00018691603297841446, + "loss": 1.9915, + "step": 1804 + }, + { + "epoch": 0.1899000526038927, + "grad_norm": 0.9396890997886658, + "learning_rate": 0.0001868991756369937, + "loss": 1.8151, + "step": 1805 + }, + { + "epoch": 0.1900052603892688, + "grad_norm": 1.0986272096633911, + "learning_rate": 0.0001868823082042578, + "loss": 2.1091, + "step": 1806 + }, + { + "epoch": 0.19011046817464491, + "grad_norm": 0.7267233729362488, + "learning_rate": 0.00018686543068216556, + "loss": 2.2471, + "step": 1807 + }, + { + "epoch": 0.19021567596002104, + "grad_norm": 0.7171627283096313, + "learning_rate": 0.0001868485430726769, + "loss": 1.785, + "step": 1808 + }, + { + "epoch": 0.19032088374539716, + "grad_norm": 1.197615146636963, + "learning_rate": 0.0001868316453777529, + "loss": 2.0766, + "step": 1809 + }, + { + "epoch": 0.19042609153077328, + "grad_norm": 1.0680128335952759, + "learning_rate": 0.00018681473759935585, + "loss": 2.1709, + "step": 1810 + }, + { + "epoch": 0.1905312993161494, + "grad_norm": 1.3420791625976562, + "learning_rate": 0.00018679781973944922, + "loss": 2.1537, + "step": 1811 + }, + { + "epoch": 0.19063650710152552, + "grad_norm": 0.9118467569351196, + "learning_rate": 0.00018678089179999762, + "loss": 1.8085, + "step": 1812 + }, + { + "epoch": 0.19074171488690164, + "grad_norm": 0.9484670162200928, + "learning_rate": 0.00018676395378296678, + "loss": 2.1784, + "step": 1813 + }, + { + "epoch": 0.19084692267227774, + "grad_norm": 0.7800343632698059, + "learning_rate": 0.0001867470056903237, + "loss": 2.0045, + "step": 1814 + }, + { + "epoch": 0.19095213045765386, + "grad_norm": 1.0197774171829224, + "learning_rate": 0.00018673004752403651, + "loss": 2.1594, + "step": 1815 + }, + { + "epoch": 0.19105733824302998, + "grad_norm": 1.9036952257156372, + "learning_rate": 0.0001867130792860745, + "loss": 2.3911, + "step": 1816 + }, + { + "epoch": 0.1911625460284061, + "grad_norm": 1.0456516742706299, + "learning_rate": 0.00018669610097840812, + "loss": 2.0251, + "step": 1817 + }, + { + "epoch": 0.19126775381378222, + "grad_norm": 1.23896324634552, + "learning_rate": 0.00018667911260300904, + "loss": 2.091, + "step": 1818 + }, + { + "epoch": 0.19137296159915834, + "grad_norm": 1.0979390144348145, + "learning_rate": 0.00018666211416184999, + "loss": 1.8643, + "step": 1819 + }, + { + "epoch": 0.19147816938453446, + "grad_norm": 1.4400193691253662, + "learning_rate": 0.00018664510565690506, + "loss": 1.9773, + "step": 1820 + }, + { + "epoch": 0.19158337716991058, + "grad_norm": 1.1655758619308472, + "learning_rate": 0.0001866280870901493, + "loss": 2.0926, + "step": 1821 + }, + { + "epoch": 0.19168858495528668, + "grad_norm": 1.0224357843399048, + "learning_rate": 0.00018661105846355902, + "loss": 1.7359, + "step": 1822 + }, + { + "epoch": 0.1917937927406628, + "grad_norm": 1.4255973100662231, + "learning_rate": 0.00018659401977911175, + "loss": 2.5401, + "step": 1823 + }, + { + "epoch": 0.19189900052603892, + "grad_norm": 1.0111758708953857, + "learning_rate": 0.0001865769710387861, + "loss": 2.319, + "step": 1824 + }, + { + "epoch": 0.19200420831141504, + "grad_norm": 1.0198891162872314, + "learning_rate": 0.00018655991224456191, + "loss": 1.7324, + "step": 1825 + }, + { + "epoch": 0.19210941609679116, + "grad_norm": 1.0950661897659302, + "learning_rate": 0.00018654284339842013, + "loss": 1.8189, + "step": 1826 + }, + { + "epoch": 0.19221462388216728, + "grad_norm": 1.0327266454696655, + "learning_rate": 0.0001865257645023429, + "loss": 1.9624, + "step": 1827 + }, + { + "epoch": 0.1923198316675434, + "grad_norm": 1.1767691373825073, + "learning_rate": 0.0001865086755583136, + "loss": 2.2126, + "step": 1828 + }, + { + "epoch": 0.19242503945291953, + "grad_norm": 0.9736087918281555, + "learning_rate": 0.0001864915765683167, + "loss": 2.2341, + "step": 1829 + }, + { + "epoch": 0.19253024723829562, + "grad_norm": 1.7082476615905762, + "learning_rate": 0.00018647446753433777, + "loss": 1.7682, + "step": 1830 + }, + { + "epoch": 0.19263545502367174, + "grad_norm": 1.0709311962127686, + "learning_rate": 0.00018645734845836368, + "loss": 1.7948, + "step": 1831 + }, + { + "epoch": 0.19274066280904786, + "grad_norm": 1.171373724937439, + "learning_rate": 0.00018644021934238243, + "loss": 2.1489, + "step": 1832 + }, + { + "epoch": 0.19284587059442398, + "grad_norm": 1.0802509784698486, + "learning_rate": 0.00018642308018838316, + "loss": 2.4032, + "step": 1833 + }, + { + "epoch": 0.1929510783798001, + "grad_norm": 1.047210693359375, + "learning_rate": 0.00018640593099835618, + "loss": 2.2197, + "step": 1834 + }, + { + "epoch": 0.19305628616517623, + "grad_norm": 0.7914498448371887, + "learning_rate": 0.00018638877177429292, + "loss": 1.5993, + "step": 1835 + }, + { + "epoch": 0.19316149395055235, + "grad_norm": 0.8980052471160889, + "learning_rate": 0.0001863716025181861, + "loss": 2.2428, + "step": 1836 + }, + { + "epoch": 0.19326670173592847, + "grad_norm": 1.1372369527816772, + "learning_rate": 0.00018635442323202946, + "loss": 2.2588, + "step": 1837 + }, + { + "epoch": 0.19337190952130456, + "grad_norm": 0.995132327079773, + "learning_rate": 0.00018633723391781802, + "loss": 2.2008, + "step": 1838 + }, + { + "epoch": 0.19347711730668068, + "grad_norm": 1.5439562797546387, + "learning_rate": 0.00018632003457754793, + "loss": 2.0667, + "step": 1839 + }, + { + "epoch": 0.1935823250920568, + "grad_norm": 1.945412039756775, + "learning_rate": 0.00018630282521321645, + "loss": 2.2433, + "step": 1840 + }, + { + "epoch": 0.19368753287743293, + "grad_norm": 0.7718631029129028, + "learning_rate": 0.00018628560582682207, + "loss": 2.3626, + "step": 1841 + }, + { + "epoch": 0.19379274066280905, + "grad_norm": 1.3353549242019653, + "learning_rate": 0.0001862683764203644, + "loss": 1.9693, + "step": 1842 + }, + { + "epoch": 0.19389794844818517, + "grad_norm": 1.200984239578247, + "learning_rate": 0.00018625113699584426, + "loss": 2.0508, + "step": 1843 + }, + { + "epoch": 0.1940031562335613, + "grad_norm": 1.5393805503845215, + "learning_rate": 0.00018623388755526364, + "loss": 1.7895, + "step": 1844 + }, + { + "epoch": 0.1941083640189374, + "grad_norm": 1.4993560314178467, + "learning_rate": 0.00018621662810062558, + "loss": 1.9832, + "step": 1845 + }, + { + "epoch": 0.1942135718043135, + "grad_norm": 1.0732812881469727, + "learning_rate": 0.00018619935863393444, + "loss": 1.6443, + "step": 1846 + }, + { + "epoch": 0.19431877958968963, + "grad_norm": 1.4361530542373657, + "learning_rate": 0.0001861820791571956, + "loss": 2.2628, + "step": 1847 + }, + { + "epoch": 0.19442398737506575, + "grad_norm": 0.9760112166404724, + "learning_rate": 0.00018616478967241568, + "loss": 2.043, + "step": 1848 + }, + { + "epoch": 0.19452919516044187, + "grad_norm": 1.2888370752334595, + "learning_rate": 0.00018614749018160248, + "loss": 2.0368, + "step": 1849 + }, + { + "epoch": 0.194634402945818, + "grad_norm": 1.2557207345962524, + "learning_rate": 0.00018613018068676493, + "loss": 2.0163, + "step": 1850 + }, + { + "epoch": 0.1947396107311941, + "grad_norm": 1.2219709157943726, + "learning_rate": 0.00018611286118991313, + "loss": 1.4421, + "step": 1851 + }, + { + "epoch": 0.19484481851657023, + "grad_norm": 0.9364914894104004, + "learning_rate": 0.0001860955316930583, + "loss": 1.4738, + "step": 1852 + }, + { + "epoch": 0.19495002630194636, + "grad_norm": 1.3384712934494019, + "learning_rate": 0.0001860781921982129, + "loss": 2.2564, + "step": 1853 + }, + { + "epoch": 0.19505523408732245, + "grad_norm": 2.036452054977417, + "learning_rate": 0.00018606084270739049, + "loss": 2.0251, + "step": 1854 + }, + { + "epoch": 0.19516044187269857, + "grad_norm": 1.2831714153289795, + "learning_rate": 0.00018604348322260578, + "loss": 1.8063, + "step": 1855 + }, + { + "epoch": 0.1952656496580747, + "grad_norm": 0.8491331934928894, + "learning_rate": 0.0001860261137458747, + "loss": 1.6988, + "step": 1856 + }, + { + "epoch": 0.1953708574434508, + "grad_norm": 1.3487755060195923, + "learning_rate": 0.00018600873427921435, + "loss": 1.9603, + "step": 1857 + }, + { + "epoch": 0.19547606522882693, + "grad_norm": 1.7233006954193115, + "learning_rate": 0.00018599134482464287, + "loss": 1.8172, + "step": 1858 + }, + { + "epoch": 0.19558127301420306, + "grad_norm": 1.1606876850128174, + "learning_rate": 0.0001859739453841797, + "loss": 1.8372, + "step": 1859 + }, + { + "epoch": 0.19568648079957918, + "grad_norm": 1.3234946727752686, + "learning_rate": 0.00018595653595984536, + "loss": 2.0716, + "step": 1860 + }, + { + "epoch": 0.1957916885849553, + "grad_norm": 1.0269123315811157, + "learning_rate": 0.0001859391165536615, + "loss": 1.8958, + "step": 1861 + }, + { + "epoch": 0.1958968963703314, + "grad_norm": 1.1604347229003906, + "learning_rate": 0.0001859216871676511, + "loss": 2.1997, + "step": 1862 + }, + { + "epoch": 0.1960021041557075, + "grad_norm": 0.8779052495956421, + "learning_rate": 0.00018590424780383805, + "loss": 1.8132, + "step": 1863 + }, + { + "epoch": 0.19610731194108363, + "grad_norm": 1.0170305967330933, + "learning_rate": 0.0001858867984642476, + "loss": 2.1858, + "step": 1864 + }, + { + "epoch": 0.19621251972645976, + "grad_norm": 0.9716585874557495, + "learning_rate": 0.00018586933915090605, + "loss": 2.1944, + "step": 1865 + }, + { + "epoch": 0.19631772751183588, + "grad_norm": 1.326249361038208, + "learning_rate": 0.0001858518698658409, + "loss": 1.8595, + "step": 1866 + }, + { + "epoch": 0.196422935297212, + "grad_norm": 0.9899535179138184, + "learning_rate": 0.00018583439061108084, + "loss": 2.2828, + "step": 1867 + }, + { + "epoch": 0.19652814308258812, + "grad_norm": 0.9519728422164917, + "learning_rate": 0.0001858169013886556, + "loss": 2.2894, + "step": 1868 + }, + { + "epoch": 0.19663335086796424, + "grad_norm": 1.11469304561615, + "learning_rate": 0.0001857994022005962, + "loss": 2.0972, + "step": 1869 + }, + { + "epoch": 0.19673855865334033, + "grad_norm": 0.7735962271690369, + "learning_rate": 0.00018578189304893479, + "loss": 1.8865, + "step": 1870 + }, + { + "epoch": 0.19684376643871646, + "grad_norm": 1.504098653793335, + "learning_rate": 0.00018576437393570458, + "loss": 2.2802, + "step": 1871 + }, + { + "epoch": 0.19694897422409258, + "grad_norm": 0.8838121891021729, + "learning_rate": 0.00018574684486294006, + "loss": 2.0275, + "step": 1872 + }, + { + "epoch": 0.1970541820094687, + "grad_norm": 0.868427574634552, + "learning_rate": 0.0001857293058326768, + "loss": 2.2436, + "step": 1873 + }, + { + "epoch": 0.19715938979484482, + "grad_norm": 1.472787857055664, + "learning_rate": 0.00018571175684695154, + "loss": 2.3729, + "step": 1874 + }, + { + "epoch": 0.19726459758022094, + "grad_norm": 0.9417343139648438, + "learning_rate": 0.00018569419790780218, + "loss": 2.0141, + "step": 1875 + }, + { + "epoch": 0.19736980536559706, + "grad_norm": 1.3230361938476562, + "learning_rate": 0.00018567662901726784, + "loss": 2.1847, + "step": 1876 + }, + { + "epoch": 0.19747501315097318, + "grad_norm": 0.8992961049079895, + "learning_rate": 0.00018565905017738868, + "loss": 2.3444, + "step": 1877 + }, + { + "epoch": 0.19758022093634928, + "grad_norm": 1.455939769744873, + "learning_rate": 0.00018564146139020608, + "loss": 2.0235, + "step": 1878 + }, + { + "epoch": 0.1976854287217254, + "grad_norm": 1.1618740558624268, + "learning_rate": 0.00018562386265776263, + "loss": 2.627, + "step": 1879 + }, + { + "epoch": 0.19779063650710152, + "grad_norm": 0.8262205719947815, + "learning_rate": 0.00018560625398210192, + "loss": 2.0536, + "step": 1880 + }, + { + "epoch": 0.19789584429247764, + "grad_norm": 0.9469594955444336, + "learning_rate": 0.00018558863536526885, + "loss": 2.3183, + "step": 1881 + }, + { + "epoch": 0.19800105207785376, + "grad_norm": 1.1301559209823608, + "learning_rate": 0.00018557100680930937, + "loss": 2.3325, + "step": 1882 + }, + { + "epoch": 0.19810625986322988, + "grad_norm": 0.8819060921669006, + "learning_rate": 0.00018555336831627063, + "loss": 1.6358, + "step": 1883 + }, + { + "epoch": 0.198211467648606, + "grad_norm": 1.2020803689956665, + "learning_rate": 0.000185535719888201, + "loss": 2.0548, + "step": 1884 + }, + { + "epoch": 0.19831667543398213, + "grad_norm": 1.0018656253814697, + "learning_rate": 0.00018551806152714985, + "loss": 2.1452, + "step": 1885 + }, + { + "epoch": 0.19842188321935822, + "grad_norm": 1.1717722415924072, + "learning_rate": 0.00018550039323516783, + "loss": 2.0638, + "step": 1886 + }, + { + "epoch": 0.19852709100473434, + "grad_norm": 1.4248872995376587, + "learning_rate": 0.00018548271501430668, + "loss": 1.6008, + "step": 1887 + }, + { + "epoch": 0.19863229879011046, + "grad_norm": 1.5258474349975586, + "learning_rate": 0.00018546502686661934, + "loss": 1.6953, + "step": 1888 + }, + { + "epoch": 0.19873750657548658, + "grad_norm": 1.2465327978134155, + "learning_rate": 0.00018544732879415986, + "loss": 2.2588, + "step": 1889 + }, + { + "epoch": 0.1988427143608627, + "grad_norm": 1.0645866394042969, + "learning_rate": 0.00018542962079898346, + "loss": 2.2034, + "step": 1890 + }, + { + "epoch": 0.19894792214623883, + "grad_norm": 1.7710703611373901, + "learning_rate": 0.00018541190288314647, + "loss": 2.1372, + "step": 1891 + }, + { + "epoch": 0.19905312993161495, + "grad_norm": 1.4344483613967896, + "learning_rate": 0.00018539417504870648, + "loss": 1.7988, + "step": 1892 + }, + { + "epoch": 0.19915833771699107, + "grad_norm": 1.4004452228546143, + "learning_rate": 0.00018537643729772216, + "loss": 1.9769, + "step": 1893 + }, + { + "epoch": 0.19926354550236716, + "grad_norm": 0.8972628116607666, + "learning_rate": 0.00018535868963225326, + "loss": 2.2142, + "step": 1894 + }, + { + "epoch": 0.19936875328774328, + "grad_norm": 0.7937827706336975, + "learning_rate": 0.00018534093205436087, + "loss": 2.0791, + "step": 1895 + }, + { + "epoch": 0.1994739610731194, + "grad_norm": 1.6337438821792603, + "learning_rate": 0.00018532316456610704, + "loss": 1.486, + "step": 1896 + }, + { + "epoch": 0.19957916885849553, + "grad_norm": 2.0083436965942383, + "learning_rate": 0.00018530538716955504, + "loss": 1.8192, + "step": 1897 + }, + { + "epoch": 0.19968437664387165, + "grad_norm": 1.693180799484253, + "learning_rate": 0.0001852875998667694, + "loss": 1.4399, + "step": 1898 + }, + { + "epoch": 0.19978958442924777, + "grad_norm": 0.8119178414344788, + "learning_rate": 0.0001852698026598156, + "loss": 2.0997, + "step": 1899 + }, + { + "epoch": 0.1998947922146239, + "grad_norm": 1.1830841302871704, + "learning_rate": 0.0001852519955507604, + "loss": 1.7776, + "step": 1900 + }, + { + "epoch": 0.2, + "grad_norm": 0.7960675358772278, + "learning_rate": 0.00018523417854167168, + "loss": 2.2009, + "step": 1901 + }, + { + "epoch": 0.2001052077853761, + "grad_norm": 0.9493720531463623, + "learning_rate": 0.00018521635163461846, + "loss": 1.9135, + "step": 1902 + }, + { + "epoch": 0.20021041557075223, + "grad_norm": 1.2081838846206665, + "learning_rate": 0.00018519851483167097, + "loss": 1.9611, + "step": 1903 + }, + { + "epoch": 0.20031562335612835, + "grad_norm": 1.2762458324432373, + "learning_rate": 0.00018518066813490047, + "loss": 2.1839, + "step": 1904 + }, + { + "epoch": 0.20042083114150447, + "grad_norm": 1.249342679977417, + "learning_rate": 0.0001851628115463795, + "loss": 1.6887, + "step": 1905 + }, + { + "epoch": 0.2005260389268806, + "grad_norm": 1.3049440383911133, + "learning_rate": 0.00018514494506818166, + "loss": 2.3789, + "step": 1906 + }, + { + "epoch": 0.2006312467122567, + "grad_norm": 1.0053954124450684, + "learning_rate": 0.0001851270687023817, + "loss": 2.2105, + "step": 1907 + }, + { + "epoch": 0.20073645449763283, + "grad_norm": 1.1582366228103638, + "learning_rate": 0.0001851091824510556, + "loss": 1.9553, + "step": 1908 + }, + { + "epoch": 0.20084166228300895, + "grad_norm": 1.2567706108093262, + "learning_rate": 0.00018509128631628036, + "loss": 2.3391, + "step": 1909 + }, + { + "epoch": 0.20094687006838505, + "grad_norm": 0.823357105255127, + "learning_rate": 0.00018507338030013427, + "loss": 2.1023, + "step": 1910 + }, + { + "epoch": 0.20105207785376117, + "grad_norm": 0.7399005889892578, + "learning_rate": 0.0001850554644046967, + "loss": 1.947, + "step": 1911 + }, + { + "epoch": 0.2011572856391373, + "grad_norm": 1.4578897953033447, + "learning_rate": 0.00018503753863204807, + "loss": 1.2573, + "step": 1912 + }, + { + "epoch": 0.2012624934245134, + "grad_norm": 0.9766344428062439, + "learning_rate": 0.00018501960298427013, + "loss": 1.9681, + "step": 1913 + }, + { + "epoch": 0.20136770120988953, + "grad_norm": 1.1950548887252808, + "learning_rate": 0.00018500165746344562, + "loss": 2.1528, + "step": 1914 + }, + { + "epoch": 0.20147290899526565, + "grad_norm": 1.4169249534606934, + "learning_rate": 0.0001849837020716586, + "loss": 2.0959, + "step": 1915 + }, + { + "epoch": 0.20157811678064178, + "grad_norm": 1.3415583372116089, + "learning_rate": 0.00018496573681099401, + "loss": 1.7363, + "step": 1916 + }, + { + "epoch": 0.2016833245660179, + "grad_norm": 1.3929237127304077, + "learning_rate": 0.00018494776168353827, + "loss": 1.8159, + "step": 1917 + }, + { + "epoch": 0.201788532351394, + "grad_norm": 1.296832799911499, + "learning_rate": 0.00018492977669137868, + "loss": 2.2187, + "step": 1918 + }, + { + "epoch": 0.2018937401367701, + "grad_norm": 0.8477041125297546, + "learning_rate": 0.00018491178183660376, + "loss": 2.0901, + "step": 1919 + }, + { + "epoch": 0.20199894792214623, + "grad_norm": 1.353830099105835, + "learning_rate": 0.00018489377712130326, + "loss": 2.3666, + "step": 1920 + }, + { + "epoch": 0.20210415570752235, + "grad_norm": 1.109371304512024, + "learning_rate": 0.00018487576254756793, + "loss": 2.1329, + "step": 1921 + }, + { + "epoch": 0.20220936349289848, + "grad_norm": 1.1594985723495483, + "learning_rate": 0.0001848577381174898, + "loss": 2.059, + "step": 1922 + }, + { + "epoch": 0.2023145712782746, + "grad_norm": 1.2802788019180298, + "learning_rate": 0.00018483970383316198, + "loss": 2.0002, + "step": 1923 + }, + { + "epoch": 0.20241977906365072, + "grad_norm": 1.8601975440979004, + "learning_rate": 0.00018482165969667874, + "loss": 2.3373, + "step": 1924 + }, + { + "epoch": 0.20252498684902684, + "grad_norm": 1.242659330368042, + "learning_rate": 0.00018480360571013544, + "loss": 1.755, + "step": 1925 + }, + { + "epoch": 0.20263019463440293, + "grad_norm": 1.3230918645858765, + "learning_rate": 0.00018478554187562868, + "loss": 2.2089, + "step": 1926 + }, + { + "epoch": 0.20273540241977905, + "grad_norm": 1.1816704273223877, + "learning_rate": 0.00018476746819525613, + "loss": 1.6534, + "step": 1927 + }, + { + "epoch": 0.20284061020515518, + "grad_norm": 1.1363966464996338, + "learning_rate": 0.00018474938467111663, + "loss": 2.1807, + "step": 1928 + }, + { + "epoch": 0.2029458179905313, + "grad_norm": 1.0613765716552734, + "learning_rate": 0.00018473129130531016, + "loss": 1.87, + "step": 1929 + }, + { + "epoch": 0.20305102577590742, + "grad_norm": 0.7839847207069397, + "learning_rate": 0.00018471318809993784, + "loss": 1.6391, + "step": 1930 + }, + { + "epoch": 0.20315623356128354, + "grad_norm": 1.5794448852539062, + "learning_rate": 0.00018469507505710194, + "loss": 2.2534, + "step": 1931 + }, + { + "epoch": 0.20326144134665966, + "grad_norm": 1.1429455280303955, + "learning_rate": 0.0001846769521789059, + "loss": 1.6465, + "step": 1932 + }, + { + "epoch": 0.20336664913203578, + "grad_norm": 1.0982143878936768, + "learning_rate": 0.0001846588194674542, + "loss": 1.8907, + "step": 1933 + }, + { + "epoch": 0.20347185691741188, + "grad_norm": 1.2898775339126587, + "learning_rate": 0.00018464067692485254, + "loss": 1.6681, + "step": 1934 + }, + { + "epoch": 0.203577064702788, + "grad_norm": 1.1818253993988037, + "learning_rate": 0.00018462252455320785, + "loss": 2.2638, + "step": 1935 + }, + { + "epoch": 0.20368227248816412, + "grad_norm": 1.7338184118270874, + "learning_rate": 0.000184604362354628, + "loss": 2.2689, + "step": 1936 + }, + { + "epoch": 0.20378748027354024, + "grad_norm": 1.2104718685150146, + "learning_rate": 0.00018458619033122218, + "loss": 1.9735, + "step": 1937 + }, + { + "epoch": 0.20389268805891636, + "grad_norm": 0.9602882266044617, + "learning_rate": 0.00018456800848510056, + "loss": 2.2646, + "step": 1938 + }, + { + "epoch": 0.20399789584429248, + "grad_norm": 0.9840019941329956, + "learning_rate": 0.00018454981681837463, + "loss": 1.8798, + "step": 1939 + }, + { + "epoch": 0.2041031036296686, + "grad_norm": 0.9403349161148071, + "learning_rate": 0.0001845316153331569, + "loss": 2.063, + "step": 1940 + }, + { + "epoch": 0.20420831141504472, + "grad_norm": 1.7658482789993286, + "learning_rate": 0.000184513404031561, + "loss": 2.1487, + "step": 1941 + }, + { + "epoch": 0.20431351920042082, + "grad_norm": 1.6682535409927368, + "learning_rate": 0.00018449518291570183, + "loss": 2.1085, + "step": 1942 + }, + { + "epoch": 0.20441872698579694, + "grad_norm": 1.355338215827942, + "learning_rate": 0.00018447695198769526, + "loss": 2.2586, + "step": 1943 + }, + { + "epoch": 0.20452393477117306, + "grad_norm": 1.2086683511734009, + "learning_rate": 0.00018445871124965843, + "loss": 2.0692, + "step": 1944 + }, + { + "epoch": 0.20462914255654918, + "grad_norm": 1.9813165664672852, + "learning_rate": 0.00018444046070370963, + "loss": 1.7036, + "step": 1945 + }, + { + "epoch": 0.2047343503419253, + "grad_norm": 1.0692365169525146, + "learning_rate": 0.00018442220035196812, + "loss": 2.2982, + "step": 1946 + }, + { + "epoch": 0.20483955812730142, + "grad_norm": 1.4883077144622803, + "learning_rate": 0.00018440393019655452, + "loss": 2.304, + "step": 1947 + }, + { + "epoch": 0.20494476591267755, + "grad_norm": 2.0655646324157715, + "learning_rate": 0.00018438565023959043, + "loss": 2.0391, + "step": 1948 + }, + { + "epoch": 0.20504997369805367, + "grad_norm": 1.2688896656036377, + "learning_rate": 0.00018436736048319866, + "loss": 1.6661, + "step": 1949 + }, + { + "epoch": 0.20515518148342976, + "grad_norm": 1.0065981149673462, + "learning_rate": 0.00018434906092950313, + "loss": 2.1238, + "step": 1950 + }, + { + "epoch": 0.20526038926880588, + "grad_norm": 1.2444024085998535, + "learning_rate": 0.00018433075158062891, + "loss": 2.0663, + "step": 1951 + }, + { + "epoch": 0.205365597054182, + "grad_norm": 1.5449178218841553, + "learning_rate": 0.00018431243243870223, + "loss": 2.1242, + "step": 1952 + }, + { + "epoch": 0.20547080483955812, + "grad_norm": 0.7938444018363953, + "learning_rate": 0.00018429410350585034, + "loss": 2.2292, + "step": 1953 + }, + { + "epoch": 0.20557601262493425, + "grad_norm": 1.2025595903396606, + "learning_rate": 0.00018427576478420186, + "loss": 2.057, + "step": 1954 + }, + { + "epoch": 0.20568122041031037, + "grad_norm": 1.0300257205963135, + "learning_rate": 0.00018425741627588627, + "loss": 1.9549, + "step": 1955 + }, + { + "epoch": 0.2057864281956865, + "grad_norm": 1.1756985187530518, + "learning_rate": 0.0001842390579830344, + "loss": 1.7581, + "step": 1956 + }, + { + "epoch": 0.2058916359810626, + "grad_norm": 1.201172947883606, + "learning_rate": 0.00018422068990777812, + "loss": 1.9362, + "step": 1957 + }, + { + "epoch": 0.2059968437664387, + "grad_norm": 1.3182251453399658, + "learning_rate": 0.00018420231205225048, + "loss": 2.057, + "step": 1958 + }, + { + "epoch": 0.20610205155181482, + "grad_norm": 1.0256531238555908, + "learning_rate": 0.00018418392441858555, + "loss": 1.3062, + "step": 1959 + }, + { + "epoch": 0.20620725933719095, + "grad_norm": 1.0511935949325562, + "learning_rate": 0.00018416552700891873, + "loss": 2.1073, + "step": 1960 + }, + { + "epoch": 0.20631246712256707, + "grad_norm": 1.2152209281921387, + "learning_rate": 0.0001841471198253864, + "loss": 2.1933, + "step": 1961 + }, + { + "epoch": 0.2064176749079432, + "grad_norm": 1.1236765384674072, + "learning_rate": 0.00018412870287012612, + "loss": 1.9926, + "step": 1962 + }, + { + "epoch": 0.2065228826933193, + "grad_norm": 1.0651775598526, + "learning_rate": 0.00018411027614527665, + "loss": 2.0337, + "step": 1963 + }, + { + "epoch": 0.20662809047869543, + "grad_norm": 1.3447325229644775, + "learning_rate": 0.00018409183965297776, + "loss": 2.0808, + "step": 1964 + }, + { + "epoch": 0.20673329826407155, + "grad_norm": 1.0653175115585327, + "learning_rate": 0.0001840733933953704, + "loss": 2.1391, + "step": 1965 + }, + { + "epoch": 0.20683850604944765, + "grad_norm": 0.9823555946350098, + "learning_rate": 0.0001840549373745968, + "loss": 2.2805, + "step": 1966 + }, + { + "epoch": 0.20694371383482377, + "grad_norm": 1.1034152507781982, + "learning_rate": 0.00018403647159280002, + "loss": 2.1311, + "step": 1967 + }, + { + "epoch": 0.2070489216201999, + "grad_norm": 1.0444360971450806, + "learning_rate": 0.00018401799605212457, + "loss": 2.3337, + "step": 1968 + }, + { + "epoch": 0.207154129405576, + "grad_norm": 0.9108467102050781, + "learning_rate": 0.00018399951075471588, + "loss": 1.8849, + "step": 1969 + }, + { + "epoch": 0.20725933719095213, + "grad_norm": 1.4065741300582886, + "learning_rate": 0.0001839810157027206, + "loss": 1.8689, + "step": 1970 + }, + { + "epoch": 0.20736454497632825, + "grad_norm": 1.062463641166687, + "learning_rate": 0.00018396251089828654, + "loss": 1.9281, + "step": 1971 + }, + { + "epoch": 0.20746975276170437, + "grad_norm": 0.9371905326843262, + "learning_rate": 0.00018394399634356256, + "loss": 2.3794, + "step": 1972 + }, + { + "epoch": 0.2075749605470805, + "grad_norm": 1.0700284242630005, + "learning_rate": 0.0001839254720406987, + "loss": 2.0316, + "step": 1973 + }, + { + "epoch": 0.2076801683324566, + "grad_norm": 1.718322992324829, + "learning_rate": 0.00018390693799184613, + "loss": 1.9874, + "step": 1974 + }, + { + "epoch": 0.2077853761178327, + "grad_norm": 1.9158811569213867, + "learning_rate": 0.00018388839419915715, + "loss": 1.595, + "step": 1975 + }, + { + "epoch": 0.20789058390320883, + "grad_norm": 0.9104257225990295, + "learning_rate": 0.00018386984066478518, + "loss": 1.8649, + "step": 1976 + }, + { + "epoch": 0.20799579168858495, + "grad_norm": 1.0303380489349365, + "learning_rate": 0.00018385127739088482, + "loss": 1.9061, + "step": 1977 + }, + { + "epoch": 0.20810099947396107, + "grad_norm": 0.9922066926956177, + "learning_rate": 0.0001838327043796117, + "loss": 1.966, + "step": 1978 + }, + { + "epoch": 0.2082062072593372, + "grad_norm": 1.0381168127059937, + "learning_rate": 0.0001838141216331227, + "loss": 2.3589, + "step": 1979 + }, + { + "epoch": 0.20831141504471332, + "grad_norm": 1.5640273094177246, + "learning_rate": 0.00018379552915357575, + "loss": 1.9377, + "step": 1980 + }, + { + "epoch": 0.20841662283008944, + "grad_norm": 1.299121618270874, + "learning_rate": 0.00018377692694312994, + "loss": 2.0013, + "step": 1981 + }, + { + "epoch": 0.20852183061546553, + "grad_norm": 1.0334281921386719, + "learning_rate": 0.0001837583150039454, + "loss": 1.9686, + "step": 1982 + }, + { + "epoch": 0.20862703840084165, + "grad_norm": 1.1700999736785889, + "learning_rate": 0.00018373969333818364, + "loss": 1.6355, + "step": 1983 + }, + { + "epoch": 0.20873224618621777, + "grad_norm": 1.4064104557037354, + "learning_rate": 0.00018372106194800703, + "loss": 2.3571, + "step": 1984 + }, + { + "epoch": 0.2088374539715939, + "grad_norm": 1.18000328540802, + "learning_rate": 0.00018370242083557914, + "loss": 2.0538, + "step": 1985 + }, + { + "epoch": 0.20894266175697002, + "grad_norm": 0.7155065536499023, + "learning_rate": 0.00018368377000306475, + "loss": 2.2469, + "step": 1986 + }, + { + "epoch": 0.20904786954234614, + "grad_norm": 0.7191174626350403, + "learning_rate": 0.00018366510945262972, + "loss": 2.0709, + "step": 1987 + }, + { + "epoch": 0.20915307732772226, + "grad_norm": 1.1349201202392578, + "learning_rate": 0.00018364643918644108, + "loss": 1.3495, + "step": 1988 + }, + { + "epoch": 0.20925828511309838, + "grad_norm": 0.8776866793632507, + "learning_rate": 0.00018362775920666684, + "loss": 2.28, + "step": 1989 + }, + { + "epoch": 0.20936349289847447, + "grad_norm": 1.0915156602859497, + "learning_rate": 0.00018360906951547633, + "loss": 2.3553, + "step": 1990 + }, + { + "epoch": 0.2094687006838506, + "grad_norm": 1.0722743272781372, + "learning_rate": 0.00018359037011503988, + "loss": 2.0794, + "step": 1991 + }, + { + "epoch": 0.20957390846922672, + "grad_norm": 1.764479398727417, + "learning_rate": 0.000183571661007529, + "loss": 2.4184, + "step": 1992 + }, + { + "epoch": 0.20967911625460284, + "grad_norm": 0.9234581589698792, + "learning_rate": 0.00018355294219511633, + "loss": 2.3174, + "step": 1993 + }, + { + "epoch": 0.20978432403997896, + "grad_norm": 1.334053635597229, + "learning_rate": 0.00018353421367997563, + "loss": 1.8774, + "step": 1994 + }, + { + "epoch": 0.20988953182535508, + "grad_norm": 0.8430811166763306, + "learning_rate": 0.00018351547546428175, + "loss": 2.094, + "step": 1995 + }, + { + "epoch": 0.2099947396107312, + "grad_norm": 1.3585329055786133, + "learning_rate": 0.00018349672755021073, + "loss": 1.9657, + "step": 1996 + }, + { + "epoch": 0.21009994739610732, + "grad_norm": 1.2807389497756958, + "learning_rate": 0.00018347796993993968, + "loss": 1.1998, + "step": 1997 + }, + { + "epoch": 0.21020515518148342, + "grad_norm": 1.249908685684204, + "learning_rate": 0.00018345920263564683, + "loss": 2.0678, + "step": 1998 + }, + { + "epoch": 0.21031036296685954, + "grad_norm": 1.2682554721832275, + "learning_rate": 0.00018344042563951167, + "loss": 2.2967, + "step": 1999 + }, + { + "epoch": 0.21041557075223566, + "grad_norm": 1.0497610569000244, + "learning_rate": 0.0001834216389537146, + "loss": 2.2477, + "step": 2000 + }, + { + "epoch": 0.21052077853761178, + "grad_norm": 1.0887739658355713, + "learning_rate": 0.00018340284258043732, + "loss": 1.9668, + "step": 2001 + }, + { + "epoch": 0.2106259863229879, + "grad_norm": 1.3399494886398315, + "learning_rate": 0.00018338403652186255, + "loss": 1.4247, + "step": 2002 + }, + { + "epoch": 0.21073119410836402, + "grad_norm": 1.225252389907837, + "learning_rate": 0.0001833652207801742, + "loss": 1.7742, + "step": 2003 + }, + { + "epoch": 0.21083640189374014, + "grad_norm": 0.9872320294380188, + "learning_rate": 0.0001833463953575573, + "loss": 2.131, + "step": 2004 + }, + { + "epoch": 0.21094160967911627, + "grad_norm": 1.161528468132019, + "learning_rate": 0.00018332756025619796, + "loss": 2.1038, + "step": 2005 + }, + { + "epoch": 0.21104681746449236, + "grad_norm": 0.8207192420959473, + "learning_rate": 0.00018330871547828342, + "loss": 1.9778, + "step": 2006 + }, + { + "epoch": 0.21115202524986848, + "grad_norm": 1.2393897771835327, + "learning_rate": 0.00018328986102600207, + "loss": 1.8538, + "step": 2007 + }, + { + "epoch": 0.2112572330352446, + "grad_norm": 1.052492380142212, + "learning_rate": 0.00018327099690154344, + "loss": 2.0008, + "step": 2008 + }, + { + "epoch": 0.21136244082062072, + "grad_norm": 1.1104390621185303, + "learning_rate": 0.00018325212310709815, + "loss": 2.2228, + "step": 2009 + }, + { + "epoch": 0.21146764860599684, + "grad_norm": 1.1710113286972046, + "learning_rate": 0.00018323323964485795, + "loss": 2.3255, + "step": 2010 + }, + { + "epoch": 0.21157285639137297, + "grad_norm": 1.5669982433319092, + "learning_rate": 0.00018321434651701567, + "loss": 2.0485, + "step": 2011 + }, + { + "epoch": 0.2116780641767491, + "grad_norm": 1.486722707748413, + "learning_rate": 0.00018319544372576537, + "loss": 1.396, + "step": 2012 + }, + { + "epoch": 0.2117832719621252, + "grad_norm": 0.9690135717391968, + "learning_rate": 0.00018317653127330216, + "loss": 1.9004, + "step": 2013 + }, + { + "epoch": 0.2118884797475013, + "grad_norm": 1.1157475709915161, + "learning_rate": 0.00018315760916182228, + "loss": 1.9181, + "step": 2014 + }, + { + "epoch": 0.21199368753287742, + "grad_norm": 1.8870946168899536, + "learning_rate": 0.00018313867739352304, + "loss": 1.8504, + "step": 2015 + }, + { + "epoch": 0.21209889531825354, + "grad_norm": 1.5294593572616577, + "learning_rate": 0.000183119735970603, + "loss": 2.0109, + "step": 2016 + }, + { + "epoch": 0.21220410310362967, + "grad_norm": 1.3320099115371704, + "learning_rate": 0.00018310078489526172, + "loss": 2.149, + "step": 2017 + }, + { + "epoch": 0.2123093108890058, + "grad_norm": 1.6566550731658936, + "learning_rate": 0.0001830818241696999, + "loss": 1.8748, + "step": 2018 + }, + { + "epoch": 0.2124145186743819, + "grad_norm": 1.4480758905410767, + "learning_rate": 0.00018306285379611947, + "loss": 1.9223, + "step": 2019 + }, + { + "epoch": 0.21251972645975803, + "grad_norm": 1.1992216110229492, + "learning_rate": 0.00018304387377672331, + "loss": 2.334, + "step": 2020 + }, + { + "epoch": 0.21262493424513415, + "grad_norm": 1.6246322393417358, + "learning_rate": 0.00018302488411371556, + "loss": 1.818, + "step": 2021 + }, + { + "epoch": 0.21273014203051024, + "grad_norm": 1.3721061944961548, + "learning_rate": 0.00018300588480930143, + "loss": 2.0362, + "step": 2022 + }, + { + "epoch": 0.21283534981588637, + "grad_norm": 1.2342585325241089, + "learning_rate": 0.00018298687586568721, + "loss": 1.7764, + "step": 2023 + }, + { + "epoch": 0.2129405576012625, + "grad_norm": 1.8841065168380737, + "learning_rate": 0.00018296785728508038, + "loss": 1.5991, + "step": 2024 + }, + { + "epoch": 0.2130457653866386, + "grad_norm": 1.1739152669906616, + "learning_rate": 0.00018294882906968947, + "loss": 2.1558, + "step": 2025 + }, + { + "epoch": 0.21315097317201473, + "grad_norm": 0.8841888904571533, + "learning_rate": 0.00018292979122172418, + "loss": 1.7489, + "step": 2026 + }, + { + "epoch": 0.21325618095739085, + "grad_norm": 1.0950676202774048, + "learning_rate": 0.00018291074374339534, + "loss": 2.2481, + "step": 2027 + }, + { + "epoch": 0.21336138874276697, + "grad_norm": 1.693406581878662, + "learning_rate": 0.00018289168663691486, + "loss": 2.3203, + "step": 2028 + }, + { + "epoch": 0.2134665965281431, + "grad_norm": 0.8693836331367493, + "learning_rate": 0.0001828726199044957, + "loss": 2.1194, + "step": 2029 + }, + { + "epoch": 0.2135718043135192, + "grad_norm": 0.9883258938789368, + "learning_rate": 0.00018285354354835215, + "loss": 2.0936, + "step": 2030 + }, + { + "epoch": 0.2136770120988953, + "grad_norm": 1.0930025577545166, + "learning_rate": 0.00018283445757069942, + "loss": 1.9489, + "step": 2031 + }, + { + "epoch": 0.21378221988427143, + "grad_norm": 1.8851250410079956, + "learning_rate": 0.00018281536197375386, + "loss": 1.5551, + "step": 2032 + }, + { + "epoch": 0.21388742766964755, + "grad_norm": 0.933934211730957, + "learning_rate": 0.00018279625675973304, + "loss": 1.7964, + "step": 2033 + }, + { + "epoch": 0.21399263545502367, + "grad_norm": 0.9329856038093567, + "learning_rate": 0.00018277714193085554, + "loss": 1.8394, + "step": 2034 + }, + { + "epoch": 0.2140978432403998, + "grad_norm": 2.0386810302734375, + "learning_rate": 0.00018275801748934115, + "loss": 2.0189, + "step": 2035 + }, + { + "epoch": 0.21420305102577591, + "grad_norm": 1.8662831783294678, + "learning_rate": 0.0001827388834374107, + "loss": 1.7599, + "step": 2036 + }, + { + "epoch": 0.21430825881115204, + "grad_norm": 1.1685835123062134, + "learning_rate": 0.0001827197397772862, + "loss": 1.8818, + "step": 2037 + }, + { + "epoch": 0.21441346659652813, + "grad_norm": 0.8799912333488464, + "learning_rate": 0.00018270058651119063, + "loss": 2.1813, + "step": 2038 + }, + { + "epoch": 0.21451867438190425, + "grad_norm": 0.8218079805374146, + "learning_rate": 0.00018268142364134834, + "loss": 2.3432, + "step": 2039 + }, + { + "epoch": 0.21462388216728037, + "grad_norm": 1.1055721044540405, + "learning_rate": 0.00018266225116998457, + "loss": 1.585, + "step": 2040 + }, + { + "epoch": 0.2147290899526565, + "grad_norm": 1.1287375688552856, + "learning_rate": 0.00018264306909932575, + "loss": 2.0216, + "step": 2041 + }, + { + "epoch": 0.21483429773803261, + "grad_norm": 1.085099220275879, + "learning_rate": 0.0001826238774315995, + "loss": 1.9095, + "step": 2042 + }, + { + "epoch": 0.21493950552340874, + "grad_norm": 1.0446321964263916, + "learning_rate": 0.0001826046761690344, + "loss": 2.0777, + "step": 2043 + }, + { + "epoch": 0.21504471330878486, + "grad_norm": 1.3021361827850342, + "learning_rate": 0.0001825854653138603, + "loss": 2.0682, + "step": 2044 + }, + { + "epoch": 0.21514992109416098, + "grad_norm": 0.9500402808189392, + "learning_rate": 0.00018256624486830803, + "loss": 2.3274, + "step": 2045 + }, + { + "epoch": 0.21525512887953707, + "grad_norm": 1.077355980873108, + "learning_rate": 0.00018254701483460964, + "loss": 1.9925, + "step": 2046 + }, + { + "epoch": 0.2153603366649132, + "grad_norm": 1.6127686500549316, + "learning_rate": 0.00018252777521499821, + "loss": 1.9386, + "step": 2047 + }, + { + "epoch": 0.21546554445028931, + "grad_norm": 0.7945027351379395, + "learning_rate": 0.00018250852601170805, + "loss": 2.1476, + "step": 2048 + }, + { + "epoch": 0.21557075223566544, + "grad_norm": 1.248733401298523, + "learning_rate": 0.00018248926722697444, + "loss": 2.1239, + "step": 2049 + }, + { + "epoch": 0.21567596002104156, + "grad_norm": 1.2039722204208374, + "learning_rate": 0.00018246999886303383, + "loss": 2.132, + "step": 2050 + }, + { + "epoch": 0.21578116780641768, + "grad_norm": 1.0198581218719482, + "learning_rate": 0.00018245072092212388, + "loss": 1.758, + "step": 2051 + }, + { + "epoch": 0.2158863755917938, + "grad_norm": 1.2120723724365234, + "learning_rate": 0.00018243143340648316, + "loss": 2.1045, + "step": 2052 + }, + { + "epoch": 0.21599158337716992, + "grad_norm": 0.8687598705291748, + "learning_rate": 0.00018241213631835153, + "loss": 1.568, + "step": 2053 + }, + { + "epoch": 0.21609679116254601, + "grad_norm": 1.0547373294830322, + "learning_rate": 0.0001823928296599699, + "loss": 2.3057, + "step": 2054 + }, + { + "epoch": 0.21620199894792214, + "grad_norm": 0.9490458369255066, + "learning_rate": 0.00018237351343358026, + "loss": 2.2151, + "step": 2055 + }, + { + "epoch": 0.21630720673329826, + "grad_norm": 1.1601332426071167, + "learning_rate": 0.00018235418764142575, + "loss": 2.0495, + "step": 2056 + }, + { + "epoch": 0.21641241451867438, + "grad_norm": 1.8616313934326172, + "learning_rate": 0.00018233485228575063, + "loss": 1.877, + "step": 2057 + }, + { + "epoch": 0.2165176223040505, + "grad_norm": 1.3617507219314575, + "learning_rate": 0.00018231550736880024, + "loss": 2.1049, + "step": 2058 + }, + { + "epoch": 0.21662283008942662, + "grad_norm": 1.3656481504440308, + "learning_rate": 0.00018229615289282102, + "loss": 2.0189, + "step": 2059 + }, + { + "epoch": 0.21672803787480274, + "grad_norm": 1.2620902061462402, + "learning_rate": 0.0001822767888600606, + "loss": 2.1029, + "step": 2060 + }, + { + "epoch": 0.21683324566017886, + "grad_norm": 0.9363958239555359, + "learning_rate": 0.00018225741527276755, + "loss": 1.9721, + "step": 2061 + }, + { + "epoch": 0.21693845344555496, + "grad_norm": 0.7752951979637146, + "learning_rate": 0.0001822380321331918, + "loss": 1.9241, + "step": 2062 + }, + { + "epoch": 0.21704366123093108, + "grad_norm": 1.2256505489349365, + "learning_rate": 0.00018221863944358412, + "loss": 2.0667, + "step": 2063 + }, + { + "epoch": 0.2171488690163072, + "grad_norm": 1.6273034811019897, + "learning_rate": 0.00018219923720619663, + "loss": 1.9696, + "step": 2064 + }, + { + "epoch": 0.21725407680168332, + "grad_norm": 1.1030242443084717, + "learning_rate": 0.00018217982542328238, + "loss": 2.0116, + "step": 2065 + }, + { + "epoch": 0.21735928458705944, + "grad_norm": 0.9453311562538147, + "learning_rate": 0.00018216040409709563, + "loss": 1.7222, + "step": 2066 + }, + { + "epoch": 0.21746449237243556, + "grad_norm": 1.2243266105651855, + "learning_rate": 0.00018214097322989168, + "loss": 2.2173, + "step": 2067 + }, + { + "epoch": 0.21756970015781169, + "grad_norm": 2.4034857749938965, + "learning_rate": 0.000182121532823927, + "loss": 1.7776, + "step": 2068 + }, + { + "epoch": 0.2176749079431878, + "grad_norm": 1.4039760828018188, + "learning_rate": 0.00018210208288145914, + "loss": 2.3769, + "step": 2069 + }, + { + "epoch": 0.2177801157285639, + "grad_norm": 1.2816754579544067, + "learning_rate": 0.00018208262340474677, + "loss": 1.9379, + "step": 2070 + }, + { + "epoch": 0.21788532351394002, + "grad_norm": 1.6516449451446533, + "learning_rate": 0.0001820631543960496, + "loss": 1.7285, + "step": 2071 + }, + { + "epoch": 0.21799053129931614, + "grad_norm": 1.015907645225525, + "learning_rate": 0.00018204367585762855, + "loss": 1.749, + "step": 2072 + }, + { + "epoch": 0.21809573908469226, + "grad_norm": 1.1783592700958252, + "learning_rate": 0.00018202418779174556, + "loss": 2.1641, + "step": 2073 + }, + { + "epoch": 0.21820094687006839, + "grad_norm": 0.8992342352867126, + "learning_rate": 0.00018200469020066378, + "loss": 1.8566, + "step": 2074 + }, + { + "epoch": 0.2183061546554445, + "grad_norm": 1.1242965459823608, + "learning_rate": 0.00018198518308664734, + "loss": 1.4988, + "step": 2075 + }, + { + "epoch": 0.21841136244082063, + "grad_norm": 1.6040006875991821, + "learning_rate": 0.00018196566645196157, + "loss": 2.4172, + "step": 2076 + }, + { + "epoch": 0.21851657022619675, + "grad_norm": 1.6206622123718262, + "learning_rate": 0.00018194614029887286, + "loss": 1.8929, + "step": 2077 + }, + { + "epoch": 0.21862177801157284, + "grad_norm": 1.8726282119750977, + "learning_rate": 0.0001819266046296487, + "loss": 1.8977, + "step": 2078 + }, + { + "epoch": 0.21872698579694896, + "grad_norm": 0.9685164093971252, + "learning_rate": 0.00018190705944655776, + "loss": 2.2144, + "step": 2079 + }, + { + "epoch": 0.21883219358232509, + "grad_norm": 1.0125783681869507, + "learning_rate": 0.00018188750475186968, + "loss": 1.9572, + "step": 2080 + }, + { + "epoch": 0.2189374013677012, + "grad_norm": 1.1588927507400513, + "learning_rate": 0.00018186794054785534, + "loss": 1.7875, + "step": 2081 + }, + { + "epoch": 0.21904260915307733, + "grad_norm": 0.96497642993927, + "learning_rate": 0.00018184836683678667, + "loss": 2.2885, + "step": 2082 + }, + { + "epoch": 0.21914781693845345, + "grad_norm": 1.0914496183395386, + "learning_rate": 0.00018182878362093665, + "loss": 1.9495, + "step": 2083 + }, + { + "epoch": 0.21925302472382957, + "grad_norm": 1.0270262956619263, + "learning_rate": 0.00018180919090257945, + "loss": 1.7052, + "step": 2084 + }, + { + "epoch": 0.2193582325092057, + "grad_norm": 1.0222289562225342, + "learning_rate": 0.00018178958868399033, + "loss": 2.1131, + "step": 2085 + }, + { + "epoch": 0.21946344029458179, + "grad_norm": 2.0089921951293945, + "learning_rate": 0.00018176997696744556, + "loss": 2.0778, + "step": 2086 + }, + { + "epoch": 0.2195686480799579, + "grad_norm": 1.4937543869018555, + "learning_rate": 0.00018175035575522264, + "loss": 1.8517, + "step": 2087 + }, + { + "epoch": 0.21967385586533403, + "grad_norm": 1.0364367961883545, + "learning_rate": 0.00018173072504960012, + "loss": 1.9049, + "step": 2088 + }, + { + "epoch": 0.21977906365071015, + "grad_norm": 1.225338339805603, + "learning_rate": 0.00018171108485285763, + "loss": 2.0639, + "step": 2089 + }, + { + "epoch": 0.21988427143608627, + "grad_norm": 1.0059758424758911, + "learning_rate": 0.0001816914351672759, + "loss": 1.9738, + "step": 2090 + }, + { + "epoch": 0.2199894792214624, + "grad_norm": 1.3843426704406738, + "learning_rate": 0.00018167177599513683, + "loss": 2.1056, + "step": 2091 + }, + { + "epoch": 0.2200946870068385, + "grad_norm": 1.4146124124526978, + "learning_rate": 0.00018165210733872336, + "loss": 2.014, + "step": 2092 + }, + { + "epoch": 0.22019989479221463, + "grad_norm": 0.9480724334716797, + "learning_rate": 0.00018163242920031953, + "loss": 2.1525, + "step": 2093 + }, + { + "epoch": 0.22030510257759073, + "grad_norm": 1.4912493228912354, + "learning_rate": 0.00018161274158221048, + "loss": 2.697, + "step": 2094 + }, + { + "epoch": 0.22041031036296685, + "grad_norm": 1.2136460542678833, + "learning_rate": 0.00018159304448668253, + "loss": 1.5769, + "step": 2095 + }, + { + "epoch": 0.22051551814834297, + "grad_norm": 0.991580069065094, + "learning_rate": 0.00018157333791602297, + "loss": 2.2276, + "step": 2096 + }, + { + "epoch": 0.2206207259337191, + "grad_norm": 0.8912844061851501, + "learning_rate": 0.00018155362187252032, + "loss": 2.0501, + "step": 2097 + }, + { + "epoch": 0.2207259337190952, + "grad_norm": 1.1324025392532349, + "learning_rate": 0.00018153389635846412, + "loss": 2.2222, + "step": 2098 + }, + { + "epoch": 0.22083114150447133, + "grad_norm": 0.7306355237960815, + "learning_rate": 0.000181514161376145, + "loss": 2.0178, + "step": 2099 + }, + { + "epoch": 0.22093634928984746, + "grad_norm": 1.2878000736236572, + "learning_rate": 0.00018149441692785474, + "loss": 2.1164, + "step": 2100 + }, + { + "epoch": 0.22104155707522358, + "grad_norm": 1.2069727182388306, + "learning_rate": 0.00018147466301588622, + "loss": 1.9003, + "step": 2101 + }, + { + "epoch": 0.22114676486059967, + "grad_norm": 0.8253376483917236, + "learning_rate": 0.00018145489964253332, + "loss": 1.7612, + "step": 2102 + }, + { + "epoch": 0.2212519726459758, + "grad_norm": 1.2956628799438477, + "learning_rate": 0.0001814351268100912, + "loss": 2.1136, + "step": 2103 + }, + { + "epoch": 0.2213571804313519, + "grad_norm": 0.9152336120605469, + "learning_rate": 0.00018141534452085595, + "loss": 1.8014, + "step": 2104 + }, + { + "epoch": 0.22146238821672803, + "grad_norm": 1.4479109048843384, + "learning_rate": 0.00018139555277712482, + "loss": 1.951, + "step": 2105 + }, + { + "epoch": 0.22156759600210416, + "grad_norm": 1.5760619640350342, + "learning_rate": 0.0001813757515811962, + "loss": 1.9071, + "step": 2106 + }, + { + "epoch": 0.22167280378748028, + "grad_norm": 1.1229628324508667, + "learning_rate": 0.0001813559409353695, + "loss": 1.8727, + "step": 2107 + }, + { + "epoch": 0.2217780115728564, + "grad_norm": 0.8988067507743835, + "learning_rate": 0.0001813361208419453, + "loss": 2.2383, + "step": 2108 + }, + { + "epoch": 0.22188321935823252, + "grad_norm": 1.3481158018112183, + "learning_rate": 0.0001813162913032252, + "loss": 2.0623, + "step": 2109 + }, + { + "epoch": 0.2219884271436086, + "grad_norm": 1.297059416770935, + "learning_rate": 0.00018129645232151193, + "loss": 2.4296, + "step": 2110 + }, + { + "epoch": 0.22209363492898473, + "grad_norm": 1.9435932636260986, + "learning_rate": 0.0001812766038991094, + "loss": 2.1651, + "step": 2111 + }, + { + "epoch": 0.22219884271436086, + "grad_norm": 1.3611021041870117, + "learning_rate": 0.00018125674603832248, + "loss": 1.9258, + "step": 2112 + }, + { + "epoch": 0.22230405049973698, + "grad_norm": 1.4265265464782715, + "learning_rate": 0.00018123687874145721, + "loss": 1.6352, + "step": 2113 + }, + { + "epoch": 0.2224092582851131, + "grad_norm": 0.9899458289146423, + "learning_rate": 0.00018121700201082072, + "loss": 2.3061, + "step": 2114 + }, + { + "epoch": 0.22251446607048922, + "grad_norm": 0.9266303777694702, + "learning_rate": 0.00018119711584872123, + "loss": 2.1004, + "step": 2115 + }, + { + "epoch": 0.22261967385586534, + "grad_norm": 1.2452328205108643, + "learning_rate": 0.00018117722025746806, + "loss": 2.2262, + "step": 2116 + }, + { + "epoch": 0.22272488164124146, + "grad_norm": 1.2869268655776978, + "learning_rate": 0.0001811573152393716, + "loss": 2.3989, + "step": 2117 + }, + { + "epoch": 0.22283008942661758, + "grad_norm": 0.9200719594955444, + "learning_rate": 0.00018113740079674337, + "loss": 1.9065, + "step": 2118 + }, + { + "epoch": 0.22293529721199368, + "grad_norm": 0.869301974773407, + "learning_rate": 0.00018111747693189595, + "loss": 1.7604, + "step": 2119 + }, + { + "epoch": 0.2230405049973698, + "grad_norm": 1.119777798652649, + "learning_rate": 0.00018109754364714305, + "loss": 2.128, + "step": 2120 + }, + { + "epoch": 0.22314571278274592, + "grad_norm": 1.4669591188430786, + "learning_rate": 0.00018107760094479948, + "loss": 2.174, + "step": 2121 + }, + { + "epoch": 0.22325092056812204, + "grad_norm": 0.9276453256607056, + "learning_rate": 0.0001810576488271811, + "loss": 2.1165, + "step": 2122 + }, + { + "epoch": 0.22335612835349816, + "grad_norm": 0.9469357132911682, + "learning_rate": 0.00018103768729660485, + "loss": 2.2046, + "step": 2123 + }, + { + "epoch": 0.22346133613887428, + "grad_norm": 1.525495171546936, + "learning_rate": 0.00018101771635538883, + "loss": 1.4605, + "step": 2124 + }, + { + "epoch": 0.2235665439242504, + "grad_norm": 1.022232174873352, + "learning_rate": 0.00018099773600585223, + "loss": 2.0554, + "step": 2125 + }, + { + "epoch": 0.22367175170962653, + "grad_norm": 1.015611171722412, + "learning_rate": 0.00018097774625031523, + "loss": 2.064, + "step": 2126 + }, + { + "epoch": 0.22377695949500262, + "grad_norm": 1.5566353797912598, + "learning_rate": 0.0001809577470910992, + "loss": 1.9989, + "step": 2127 + }, + { + "epoch": 0.22388216728037874, + "grad_norm": 1.2023249864578247, + "learning_rate": 0.0001809377385305266, + "loss": 1.9456, + "step": 2128 + }, + { + "epoch": 0.22398737506575486, + "grad_norm": 1.1629719734191895, + "learning_rate": 0.00018091772057092097, + "loss": 2.3635, + "step": 2129 + }, + { + "epoch": 0.22409258285113098, + "grad_norm": 1.1971515417099, + "learning_rate": 0.00018089769321460688, + "loss": 2.101, + "step": 2130 + }, + { + "epoch": 0.2241977906365071, + "grad_norm": 0.9968559741973877, + "learning_rate": 0.00018087765646391008, + "loss": 1.94, + "step": 2131 + }, + { + "epoch": 0.22430299842188323, + "grad_norm": 1.0076571702957153, + "learning_rate": 0.00018085761032115736, + "loss": 2.1199, + "step": 2132 + }, + { + "epoch": 0.22440820620725935, + "grad_norm": 1.0761382579803467, + "learning_rate": 0.00018083755478867658, + "loss": 1.8256, + "step": 2133 + }, + { + "epoch": 0.22451341399263547, + "grad_norm": 1.2399243116378784, + "learning_rate": 0.00018081748986879679, + "loss": 1.6247, + "step": 2134 + }, + { + "epoch": 0.22461862177801156, + "grad_norm": 0.6975569725036621, + "learning_rate": 0.00018079741556384802, + "loss": 2.0154, + "step": 2135 + }, + { + "epoch": 0.22472382956338768, + "grad_norm": 0.8129298686981201, + "learning_rate": 0.00018077733187616142, + "loss": 1.9586, + "step": 2136 + }, + { + "epoch": 0.2248290373487638, + "grad_norm": 1.2736451625823975, + "learning_rate": 0.0001807572388080693, + "loss": 2.3598, + "step": 2137 + }, + { + "epoch": 0.22493424513413993, + "grad_norm": 1.5190147161483765, + "learning_rate": 0.00018073713636190494, + "loss": 2.1019, + "step": 2138 + }, + { + "epoch": 0.22503945291951605, + "grad_norm": 1.2464752197265625, + "learning_rate": 0.0001807170245400028, + "loss": 2.0247, + "step": 2139 + }, + { + "epoch": 0.22514466070489217, + "grad_norm": 1.5845669507980347, + "learning_rate": 0.0001806969033446984, + "loss": 1.6235, + "step": 2140 + }, + { + "epoch": 0.2252498684902683, + "grad_norm": 1.4465845823287964, + "learning_rate": 0.00018067677277832834, + "loss": 1.8367, + "step": 2141 + }, + { + "epoch": 0.2253550762756444, + "grad_norm": 1.5385208129882812, + "learning_rate": 0.00018065663284323028, + "loss": 1.9726, + "step": 2142 + }, + { + "epoch": 0.2254602840610205, + "grad_norm": 1.2862590551376343, + "learning_rate": 0.0001806364835417431, + "loss": 2.216, + "step": 2143 + }, + { + "epoch": 0.22556549184639663, + "grad_norm": 1.4813519716262817, + "learning_rate": 0.0001806163248762066, + "loss": 2.0377, + "step": 2144 + }, + { + "epoch": 0.22567069963177275, + "grad_norm": 3.828671455383301, + "learning_rate": 0.00018059615684896176, + "loss": 2.1483, + "step": 2145 + }, + { + "epoch": 0.22577590741714887, + "grad_norm": 1.3726669549942017, + "learning_rate": 0.00018057597946235062, + "loss": 1.8516, + "step": 2146 + }, + { + "epoch": 0.225881115202525, + "grad_norm": 1.2943693399429321, + "learning_rate": 0.0001805557927187163, + "loss": 1.9399, + "step": 2147 + }, + { + "epoch": 0.2259863229879011, + "grad_norm": 1.1112357378005981, + "learning_rate": 0.00018053559662040302, + "loss": 2.064, + "step": 2148 + }, + { + "epoch": 0.22609153077327723, + "grad_norm": 1.11170494556427, + "learning_rate": 0.00018051539116975613, + "loss": 1.9364, + "step": 2149 + }, + { + "epoch": 0.22619673855865335, + "grad_norm": 0.9152960181236267, + "learning_rate": 0.000180495176369122, + "loss": 2.0502, + "step": 2150 + }, + { + "epoch": 0.22630194634402945, + "grad_norm": 1.5197174549102783, + "learning_rate": 0.00018047495222084812, + "loss": 2.0064, + "step": 2151 + }, + { + "epoch": 0.22640715412940557, + "grad_norm": 1.3327728509902954, + "learning_rate": 0.000180454718727283, + "loss": 1.2452, + "step": 2152 + }, + { + "epoch": 0.2265123619147817, + "grad_norm": 1.113082766532898, + "learning_rate": 0.00018043447589077634, + "loss": 1.7021, + "step": 2153 + }, + { + "epoch": 0.2266175697001578, + "grad_norm": 1.596432089805603, + "learning_rate": 0.00018041422371367885, + "loss": 1.9555, + "step": 2154 + }, + { + "epoch": 0.22672277748553393, + "grad_norm": 1.9314936399459839, + "learning_rate": 0.00018039396219834237, + "loss": 1.774, + "step": 2155 + }, + { + "epoch": 0.22682798527091005, + "grad_norm": 1.492937445640564, + "learning_rate": 0.00018037369134711977, + "loss": 2.2366, + "step": 2156 + }, + { + "epoch": 0.22693319305628618, + "grad_norm": 1.4675370454788208, + "learning_rate": 0.00018035341116236507, + "loss": 1.9737, + "step": 2157 + }, + { + "epoch": 0.2270384008416623, + "grad_norm": 1.595471978187561, + "learning_rate": 0.00018033312164643332, + "loss": 2.141, + "step": 2158 + }, + { + "epoch": 0.2271436086270384, + "grad_norm": 1.6596059799194336, + "learning_rate": 0.0001803128228016807, + "loss": 1.5989, + "step": 2159 + }, + { + "epoch": 0.2272488164124145, + "grad_norm": 1.2517389059066772, + "learning_rate": 0.00018029251463046444, + "loss": 1.805, + "step": 2160 + }, + { + "epoch": 0.22735402419779063, + "grad_norm": 1.7200475931167603, + "learning_rate": 0.00018027219713514283, + "loss": 0.9951, + "step": 2161 + }, + { + "epoch": 0.22745923198316675, + "grad_norm": 1.616018295288086, + "learning_rate": 0.00018025187031807532, + "loss": 1.9761, + "step": 2162 + }, + { + "epoch": 0.22756443976854288, + "grad_norm": 1.3265496492385864, + "learning_rate": 0.00018023153418162235, + "loss": 1.8146, + "step": 2163 + }, + { + "epoch": 0.227669647553919, + "grad_norm": 1.3496745824813843, + "learning_rate": 0.0001802111887281455, + "loss": 1.7593, + "step": 2164 + }, + { + "epoch": 0.22777485533929512, + "grad_norm": 1.171179175376892, + "learning_rate": 0.0001801908339600075, + "loss": 1.7885, + "step": 2165 + }, + { + "epoch": 0.22788006312467124, + "grad_norm": 1.5108046531677246, + "learning_rate": 0.00018017046987957197, + "loss": 1.9684, + "step": 2166 + }, + { + "epoch": 0.22798527091004733, + "grad_norm": 1.351288914680481, + "learning_rate": 0.00018015009648920374, + "loss": 1.671, + "step": 2167 + }, + { + "epoch": 0.22809047869542345, + "grad_norm": 1.2217696905136108, + "learning_rate": 0.00018012971379126875, + "loss": 1.9302, + "step": 2168 + }, + { + "epoch": 0.22819568648079958, + "grad_norm": 1.644564151763916, + "learning_rate": 0.00018010932178813397, + "loss": 1.7754, + "step": 2169 + }, + { + "epoch": 0.2283008942661757, + "grad_norm": 1.167407751083374, + "learning_rate": 0.00018008892048216744, + "loss": 1.9996, + "step": 2170 + }, + { + "epoch": 0.22840610205155182, + "grad_norm": 1.467685341835022, + "learning_rate": 0.00018006850987573834, + "loss": 1.9082, + "step": 2171 + }, + { + "epoch": 0.22851130983692794, + "grad_norm": 1.2934365272521973, + "learning_rate": 0.0001800480899712168, + "loss": 2.4512, + "step": 2172 + }, + { + "epoch": 0.22861651762230406, + "grad_norm": 1.2714877128601074, + "learning_rate": 0.00018002766077097415, + "loss": 2.1337, + "step": 2173 + }, + { + "epoch": 0.22872172540768018, + "grad_norm": 1.2429519891738892, + "learning_rate": 0.0001800072222773828, + "loss": 1.7869, + "step": 2174 + }, + { + "epoch": 0.22882693319305628, + "grad_norm": 1.2201809883117676, + "learning_rate": 0.00017998677449281621, + "loss": 1.6531, + "step": 2175 + }, + { + "epoch": 0.2289321409784324, + "grad_norm": 1.0419690608978271, + "learning_rate": 0.00017996631741964888, + "loss": 2.3172, + "step": 2176 + }, + { + "epoch": 0.22903734876380852, + "grad_norm": 1.217642903327942, + "learning_rate": 0.0001799458510602564, + "loss": 1.8395, + "step": 2177 + }, + { + "epoch": 0.22914255654918464, + "grad_norm": 1.229682445526123, + "learning_rate": 0.0001799253754170155, + "loss": 2.1528, + "step": 2178 + }, + { + "epoch": 0.22924776433456076, + "grad_norm": 0.8546053767204285, + "learning_rate": 0.00017990489049230396, + "loss": 2.0306, + "step": 2179 + }, + { + "epoch": 0.22935297211993688, + "grad_norm": 1.6027251482009888, + "learning_rate": 0.0001798843962885006, + "loss": 2.03, + "step": 2180 + }, + { + "epoch": 0.229458179905313, + "grad_norm": 1.3929436206817627, + "learning_rate": 0.00017986389280798533, + "loss": 2.0689, + "step": 2181 + }, + { + "epoch": 0.22956338769068912, + "grad_norm": 1.221983790397644, + "learning_rate": 0.00017984338005313922, + "loss": 2.0643, + "step": 2182 + }, + { + "epoch": 0.22966859547606522, + "grad_norm": 1.0649733543395996, + "learning_rate": 0.00017982285802634426, + "loss": 1.6149, + "step": 2183 + }, + { + "epoch": 0.22977380326144134, + "grad_norm": 1.2897831201553345, + "learning_rate": 0.00017980232672998368, + "loss": 2.0364, + "step": 2184 + }, + { + "epoch": 0.22987901104681746, + "grad_norm": 0.9529754519462585, + "learning_rate": 0.00017978178616644166, + "loss": 1.6717, + "step": 2185 + }, + { + "epoch": 0.22998421883219358, + "grad_norm": 1.1082221269607544, + "learning_rate": 0.00017976123633810354, + "loss": 2.2986, + "step": 2186 + }, + { + "epoch": 0.2300894266175697, + "grad_norm": 1.5675081014633179, + "learning_rate": 0.00017974067724735567, + "loss": 1.7103, + "step": 2187 + }, + { + "epoch": 0.23019463440294582, + "grad_norm": 1.7995283603668213, + "learning_rate": 0.00017972010889658554, + "loss": 2.0048, + "step": 2188 + }, + { + "epoch": 0.23029984218832195, + "grad_norm": 1.0870211124420166, + "learning_rate": 0.00017969953128818168, + "loss": 2.0764, + "step": 2189 + }, + { + "epoch": 0.23040504997369807, + "grad_norm": 1.2951061725616455, + "learning_rate": 0.0001796789444245337, + "loss": 2.4356, + "step": 2190 + }, + { + "epoch": 0.23051025775907416, + "grad_norm": 1.5553895235061646, + "learning_rate": 0.00017965834830803228, + "loss": 2.0726, + "step": 2191 + }, + { + "epoch": 0.23061546554445028, + "grad_norm": 1.519358515739441, + "learning_rate": 0.0001796377429410692, + "loss": 2.0531, + "step": 2192 + }, + { + "epoch": 0.2307206733298264, + "grad_norm": 1.6568381786346436, + "learning_rate": 0.00017961712832603724, + "loss": 1.7599, + "step": 2193 + }, + { + "epoch": 0.23082588111520252, + "grad_norm": 0.9388352036476135, + "learning_rate": 0.00017959650446533037, + "loss": 2.2012, + "step": 2194 + }, + { + "epoch": 0.23093108890057865, + "grad_norm": 1.2496514320373535, + "learning_rate": 0.0001795758713613435, + "loss": 1.9036, + "step": 2195 + }, + { + "epoch": 0.23103629668595477, + "grad_norm": 1.169584035873413, + "learning_rate": 0.00017955522901647275, + "loss": 2.3784, + "step": 2196 + }, + { + "epoch": 0.2311415044713309, + "grad_norm": 0.9535863399505615, + "learning_rate": 0.00017953457743311523, + "loss": 1.9531, + "step": 2197 + }, + { + "epoch": 0.231246712256707, + "grad_norm": 0.8409315347671509, + "learning_rate": 0.00017951391661366912, + "loss": 1.9737, + "step": 2198 + }, + { + "epoch": 0.2313519200420831, + "grad_norm": 1.8287293910980225, + "learning_rate": 0.00017949324656053373, + "loss": 1.6721, + "step": 2199 + }, + { + "epoch": 0.23145712782745922, + "grad_norm": 1.6631537675857544, + "learning_rate": 0.00017947256727610935, + "loss": 1.6647, + "step": 2200 + }, + { + "epoch": 0.23156233561283535, + "grad_norm": 2.273388147354126, + "learning_rate": 0.00017945187876279746, + "loss": 1.5766, + "step": 2201 + }, + { + "epoch": 0.23166754339821147, + "grad_norm": 1.3821104764938354, + "learning_rate": 0.0001794311810230005, + "loss": 2.6111, + "step": 2202 + }, + { + "epoch": 0.2317727511835876, + "grad_norm": 0.8510130643844604, + "learning_rate": 0.00017941047405912203, + "loss": 2.1294, + "step": 2203 + }, + { + "epoch": 0.2318779589689637, + "grad_norm": 1.8961708545684814, + "learning_rate": 0.00017938975787356673, + "loss": 2.1342, + "step": 2204 + }, + { + "epoch": 0.23198316675433983, + "grad_norm": 1.116129994392395, + "learning_rate": 0.00017936903246874026, + "loss": 1.8121, + "step": 2205 + }, + { + "epoch": 0.23208837453971595, + "grad_norm": 1.1960958242416382, + "learning_rate": 0.0001793482978470494, + "loss": 1.9618, + "step": 2206 + }, + { + "epoch": 0.23219358232509205, + "grad_norm": 1.3476707935333252, + "learning_rate": 0.00017932755401090203, + "loss": 2.1292, + "step": 2207 + }, + { + "epoch": 0.23229879011046817, + "grad_norm": 1.285470962524414, + "learning_rate": 0.00017930680096270697, + "loss": 2.2751, + "step": 2208 + }, + { + "epoch": 0.2324039978958443, + "grad_norm": 1.4089267253875732, + "learning_rate": 0.00017928603870487434, + "loss": 1.8112, + "step": 2209 + }, + { + "epoch": 0.2325092056812204, + "grad_norm": 0.8862564563751221, + "learning_rate": 0.00017926526723981506, + "loss": 2.086, + "step": 2210 + }, + { + "epoch": 0.23261441346659653, + "grad_norm": 1.1878046989440918, + "learning_rate": 0.00017924448656994133, + "loss": 1.8758, + "step": 2211 + }, + { + "epoch": 0.23271962125197265, + "grad_norm": 1.1390782594680786, + "learning_rate": 0.00017922369669766633, + "loss": 1.7815, + "step": 2212 + }, + { + "epoch": 0.23282482903734877, + "grad_norm": 1.1724015474319458, + "learning_rate": 0.0001792028976254043, + "loss": 1.8632, + "step": 2213 + }, + { + "epoch": 0.2329300368227249, + "grad_norm": 1.2103734016418457, + "learning_rate": 0.00017918208935557058, + "loss": 1.739, + "step": 2214 + }, + { + "epoch": 0.233035244608101, + "grad_norm": 1.0550634860992432, + "learning_rate": 0.00017916127189058158, + "loss": 2.301, + "step": 2215 + }, + { + "epoch": 0.2331404523934771, + "grad_norm": 1.6345337629318237, + "learning_rate": 0.0001791404452328547, + "loss": 1.9167, + "step": 2216 + }, + { + "epoch": 0.23324566017885323, + "grad_norm": 0.9604158997535706, + "learning_rate": 0.00017911960938480858, + "loss": 1.8264, + "step": 2217 + }, + { + "epoch": 0.23335086796422935, + "grad_norm": 1.6715139150619507, + "learning_rate": 0.00017909876434886273, + "loss": 1.826, + "step": 2218 + }, + { + "epoch": 0.23345607574960547, + "grad_norm": 1.1755411624908447, + "learning_rate": 0.00017907791012743783, + "loss": 1.5755, + "step": 2219 + }, + { + "epoch": 0.2335612835349816, + "grad_norm": 1.8296947479248047, + "learning_rate": 0.00017905704672295563, + "loss": 1.5263, + "step": 2220 + }, + { + "epoch": 0.23366649132035772, + "grad_norm": 1.162333607673645, + "learning_rate": 0.00017903617413783893, + "loss": 2.2662, + "step": 2221 + }, + { + "epoch": 0.23377169910573384, + "grad_norm": 1.608921766281128, + "learning_rate": 0.0001790152923745116, + "loss": 2.2386, + "step": 2222 + }, + { + "epoch": 0.23387690689110993, + "grad_norm": 0.9484199285507202, + "learning_rate": 0.00017899440143539854, + "loss": 1.5288, + "step": 2223 + }, + { + "epoch": 0.23398211467648605, + "grad_norm": 0.7746570110321045, + "learning_rate": 0.00017897350132292577, + "loss": 1.7404, + "step": 2224 + }, + { + "epoch": 0.23408732246186217, + "grad_norm": 1.0333335399627686, + "learning_rate": 0.00017895259203952032, + "loss": 2.0346, + "step": 2225 + }, + { + "epoch": 0.2341925302472383, + "grad_norm": 1.308410882949829, + "learning_rate": 0.00017893167358761037, + "loss": 2.0028, + "step": 2226 + }, + { + "epoch": 0.23429773803261442, + "grad_norm": 1.7827560901641846, + "learning_rate": 0.00017891074596962508, + "loss": 1.665, + "step": 2227 + }, + { + "epoch": 0.23440294581799054, + "grad_norm": 2.48459792137146, + "learning_rate": 0.0001788898091879947, + "loss": 1.9477, + "step": 2228 + }, + { + "epoch": 0.23450815360336666, + "grad_norm": 1.5465545654296875, + "learning_rate": 0.00017886886324515054, + "loss": 2.1428, + "step": 2229 + }, + { + "epoch": 0.23461336138874278, + "grad_norm": 1.2667497396469116, + "learning_rate": 0.00017884790814352502, + "loss": 1.6537, + "step": 2230 + }, + { + "epoch": 0.23471856917411887, + "grad_norm": 1.3898303508758545, + "learning_rate": 0.00017882694388555157, + "loss": 2.175, + "step": 2231 + }, + { + "epoch": 0.234823776959495, + "grad_norm": 1.1822909116744995, + "learning_rate": 0.0001788059704736647, + "loss": 1.6444, + "step": 2232 + }, + { + "epoch": 0.23492898474487112, + "grad_norm": 1.4326550960540771, + "learning_rate": 0.00017878498791029998, + "loss": 2.4223, + "step": 2233 + }, + { + "epoch": 0.23503419253024724, + "grad_norm": 1.0603430271148682, + "learning_rate": 0.00017876399619789406, + "loss": 1.8874, + "step": 2234 + }, + { + "epoch": 0.23513940031562336, + "grad_norm": 0.9951735138893127, + "learning_rate": 0.0001787429953388846, + "loss": 1.9595, + "step": 2235 + }, + { + "epoch": 0.23524460810099948, + "grad_norm": 0.889345645904541, + "learning_rate": 0.0001787219853357104, + "loss": 1.5804, + "step": 2236 + }, + { + "epoch": 0.2353498158863756, + "grad_norm": 1.0618045330047607, + "learning_rate": 0.00017870096619081123, + "loss": 1.8707, + "step": 2237 + }, + { + "epoch": 0.23545502367175172, + "grad_norm": 0.9510637521743774, + "learning_rate": 0.00017867993790662804, + "loss": 2.1408, + "step": 2238 + }, + { + "epoch": 0.23556023145712782, + "grad_norm": 0.9027166366577148, + "learning_rate": 0.00017865890048560277, + "loss": 2.3268, + "step": 2239 + }, + { + "epoch": 0.23566543924250394, + "grad_norm": 2.181145191192627, + "learning_rate": 0.00017863785393017838, + "loss": 2.1089, + "step": 2240 + }, + { + "epoch": 0.23577064702788006, + "grad_norm": 0.9953977465629578, + "learning_rate": 0.00017861679824279897, + "loss": 2.0335, + "step": 2241 + }, + { + "epoch": 0.23587585481325618, + "grad_norm": 1.1575350761413574, + "learning_rate": 0.00017859573342590964, + "loss": 1.7528, + "step": 2242 + }, + { + "epoch": 0.2359810625986323, + "grad_norm": 1.475678563117981, + "learning_rate": 0.00017857465948195662, + "loss": 2.1524, + "step": 2243 + }, + { + "epoch": 0.23608627038400842, + "grad_norm": 1.2343095541000366, + "learning_rate": 0.00017855357641338712, + "loss": 2.0534, + "step": 2244 + }, + { + "epoch": 0.23619147816938454, + "grad_norm": 1.7605794668197632, + "learning_rate": 0.0001785324842226495, + "loss": 2.1463, + "step": 2245 + }, + { + "epoch": 0.23629668595476067, + "grad_norm": 1.5780607461929321, + "learning_rate": 0.00017851138291219301, + "loss": 1.9925, + "step": 2246 + }, + { + "epoch": 0.23640189374013676, + "grad_norm": 1.1927117109298706, + "learning_rate": 0.00017849027248446824, + "loss": 1.7036, + "step": 2247 + }, + { + "epoch": 0.23650710152551288, + "grad_norm": 0.9393693208694458, + "learning_rate": 0.00017846915294192654, + "loss": 1.765, + "step": 2248 + }, + { + "epoch": 0.236612309310889, + "grad_norm": 2.274392604827881, + "learning_rate": 0.00017844802428702052, + "loss": 1.7931, + "step": 2249 + }, + { + "epoch": 0.23671751709626512, + "grad_norm": 1.2075339555740356, + "learning_rate": 0.00017842688652220377, + "loss": 2.3247, + "step": 2250 + }, + { + "epoch": 0.23682272488164124, + "grad_norm": 1.0093438625335693, + "learning_rate": 0.00017840573964993093, + "loss": 2.0814, + "step": 2251 + }, + { + "epoch": 0.23692793266701737, + "grad_norm": 1.147470474243164, + "learning_rate": 0.00017838458367265772, + "loss": 1.8764, + "step": 2252 + }, + { + "epoch": 0.2370331404523935, + "grad_norm": 1.6974701881408691, + "learning_rate": 0.00017836341859284093, + "loss": 1.4435, + "step": 2253 + }, + { + "epoch": 0.2371383482377696, + "grad_norm": 1.7535107135772705, + "learning_rate": 0.00017834224441293836, + "loss": 1.4787, + "step": 2254 + }, + { + "epoch": 0.2372435560231457, + "grad_norm": 1.3474122285842896, + "learning_rate": 0.00017832106113540897, + "loss": 1.3433, + "step": 2255 + }, + { + "epoch": 0.23734876380852182, + "grad_norm": 1.8467276096343994, + "learning_rate": 0.0001782998687627126, + "loss": 2.1831, + "step": 2256 + }, + { + "epoch": 0.23745397159389794, + "grad_norm": 1.08902907371521, + "learning_rate": 0.00017827866729731035, + "loss": 2.0054, + "step": 2257 + }, + { + "epoch": 0.23755917937927407, + "grad_norm": 0.9274120926856995, + "learning_rate": 0.0001782574567416642, + "loss": 2.1129, + "step": 2258 + }, + { + "epoch": 0.2376643871646502, + "grad_norm": 0.9804845452308655, + "learning_rate": 0.00017823623709823733, + "loss": 1.9947, + "step": 2259 + }, + { + "epoch": 0.2377695949500263, + "grad_norm": 2.3834564685821533, + "learning_rate": 0.00017821500836949386, + "loss": 1.8823, + "step": 2260 + }, + { + "epoch": 0.23787480273540243, + "grad_norm": 1.2635396718978882, + "learning_rate": 0.000178193770557899, + "loss": 2.2756, + "step": 2261 + }, + { + "epoch": 0.23798001052077855, + "grad_norm": 0.8009362816810608, + "learning_rate": 0.00017817252366591907, + "loss": 2.1824, + "step": 2262 + }, + { + "epoch": 0.23808521830615464, + "grad_norm": 1.8110861778259277, + "learning_rate": 0.0001781512676960214, + "loss": 1.3582, + "step": 2263 + }, + { + "epoch": 0.23819042609153077, + "grad_norm": 1.50228750705719, + "learning_rate": 0.00017813000265067433, + "loss": 1.5663, + "step": 2264 + }, + { + "epoch": 0.2382956338769069, + "grad_norm": 1.1287262439727783, + "learning_rate": 0.00017810872853234733, + "loss": 1.9131, + "step": 2265 + }, + { + "epoch": 0.238400841662283, + "grad_norm": 1.1263632774353027, + "learning_rate": 0.0001780874453435109, + "loss": 1.8182, + "step": 2266 + }, + { + "epoch": 0.23850604944765913, + "grad_norm": 1.2594873905181885, + "learning_rate": 0.0001780661530866366, + "loss": 1.8225, + "step": 2267 + }, + { + "epoch": 0.23861125723303525, + "grad_norm": 1.1252297163009644, + "learning_rate": 0.00017804485176419697, + "loss": 2.0544, + "step": 2268 + }, + { + "epoch": 0.23871646501841137, + "grad_norm": 1.2109135389328003, + "learning_rate": 0.00017802354137866572, + "loss": 2.2275, + "step": 2269 + }, + { + "epoch": 0.2388216728037875, + "grad_norm": 1.0338249206542969, + "learning_rate": 0.00017800222193251752, + "loss": 2.1604, + "step": 2270 + }, + { + "epoch": 0.2389268805891636, + "grad_norm": 1.1437997817993164, + "learning_rate": 0.00017798089342822816, + "loss": 2.406, + "step": 2271 + }, + { + "epoch": 0.2390320883745397, + "grad_norm": 1.0176033973693848, + "learning_rate": 0.00017795955586827442, + "loss": 2.3426, + "step": 2272 + }, + { + "epoch": 0.23913729615991583, + "grad_norm": 1.5589518547058105, + "learning_rate": 0.00017793820925513418, + "loss": 2.1739, + "step": 2273 + }, + { + "epoch": 0.23924250394529195, + "grad_norm": 0.8495704531669617, + "learning_rate": 0.00017791685359128633, + "loss": 2.1634, + "step": 2274 + }, + { + "epoch": 0.23934771173066807, + "grad_norm": 1.0862325429916382, + "learning_rate": 0.00017789548887921087, + "loss": 2.2329, + "step": 2275 + }, + { + "epoch": 0.2394529195160442, + "grad_norm": 1.8624638319015503, + "learning_rate": 0.00017787411512138875, + "loss": 1.7344, + "step": 2276 + }, + { + "epoch": 0.23955812730142031, + "grad_norm": 1.0375189781188965, + "learning_rate": 0.0001778527323203021, + "loss": 1.7976, + "step": 2277 + }, + { + "epoch": 0.23966333508679644, + "grad_norm": 0.92099928855896, + "learning_rate": 0.000177831340478434, + "loss": 2.173, + "step": 2278 + }, + { + "epoch": 0.23976854287217253, + "grad_norm": 2.462179183959961, + "learning_rate": 0.00017780993959826865, + "loss": 1.6677, + "step": 2279 + }, + { + "epoch": 0.23987375065754865, + "grad_norm": 1.415610432624817, + "learning_rate": 0.00017778852968229123, + "loss": 2.1977, + "step": 2280 + }, + { + "epoch": 0.23997895844292477, + "grad_norm": 1.6684101819992065, + "learning_rate": 0.000177767110732988, + "loss": 1.7925, + "step": 2281 + }, + { + "epoch": 0.2400841662283009, + "grad_norm": 0.9390462040901184, + "learning_rate": 0.00017774568275284627, + "loss": 1.7441, + "step": 2282 + }, + { + "epoch": 0.24018937401367702, + "grad_norm": 1.377502679824829, + "learning_rate": 0.00017772424574435443, + "loss": 1.847, + "step": 2283 + }, + { + "epoch": 0.24029458179905314, + "grad_norm": 1.1089184284210205, + "learning_rate": 0.00017770279971000185, + "loss": 2.3168, + "step": 2284 + }, + { + "epoch": 0.24039978958442926, + "grad_norm": 1.1749531030654907, + "learning_rate": 0.00017768134465227903, + "loss": 2.303, + "step": 2285 + }, + { + "epoch": 0.24050499736980538, + "grad_norm": 1.2190381288528442, + "learning_rate": 0.00017765988057367747, + "loss": 1.7057, + "step": 2286 + }, + { + "epoch": 0.24061020515518147, + "grad_norm": 0.8646054267883301, + "learning_rate": 0.00017763840747668966, + "loss": 1.7069, + "step": 2287 + }, + { + "epoch": 0.2407154129405576, + "grad_norm": 0.8684479594230652, + "learning_rate": 0.00017761692536380928, + "loss": 1.9386, + "step": 2288 + }, + { + "epoch": 0.24082062072593372, + "grad_norm": 1.0403144359588623, + "learning_rate": 0.00017759543423753093, + "loss": 1.9768, + "step": 2289 + }, + { + "epoch": 0.24092582851130984, + "grad_norm": 1.298659086227417, + "learning_rate": 0.00017757393410035033, + "loss": 1.9757, + "step": 2290 + }, + { + "epoch": 0.24103103629668596, + "grad_norm": 1.4886418581008911, + "learning_rate": 0.00017755242495476418, + "loss": 1.6223, + "step": 2291 + }, + { + "epoch": 0.24113624408206208, + "grad_norm": 1.2547330856323242, + "learning_rate": 0.00017753090680327032, + "loss": 2.0024, + "step": 2292 + }, + { + "epoch": 0.2412414518674382, + "grad_norm": 1.3677291870117188, + "learning_rate": 0.00017750937964836755, + "loss": 1.797, + "step": 2293 + }, + { + "epoch": 0.24134665965281432, + "grad_norm": 1.4770119190216064, + "learning_rate": 0.00017748784349255577, + "loss": 2.2053, + "step": 2294 + }, + { + "epoch": 0.24145186743819042, + "grad_norm": 0.817295491695404, + "learning_rate": 0.00017746629833833585, + "loss": 1.9076, + "step": 2295 + }, + { + "epoch": 0.24155707522356654, + "grad_norm": 1.4393826723098755, + "learning_rate": 0.00017744474418820985, + "loss": 1.6945, + "step": 2296 + }, + { + "epoch": 0.24166228300894266, + "grad_norm": 1.3872381448745728, + "learning_rate": 0.00017742318104468067, + "loss": 1.8003, + "step": 2297 + }, + { + "epoch": 0.24176749079431878, + "grad_norm": 1.1846232414245605, + "learning_rate": 0.00017740160891025245, + "loss": 1.9204, + "step": 2298 + }, + { + "epoch": 0.2418726985796949, + "grad_norm": 0.8881604075431824, + "learning_rate": 0.00017738002778743027, + "loss": 2.1186, + "step": 2299 + }, + { + "epoch": 0.24197790636507102, + "grad_norm": 1.211893081665039, + "learning_rate": 0.00017735843767872024, + "loss": 2.0164, + "step": 2300 + }, + { + "epoch": 0.24208311415044714, + "grad_norm": 1.456984281539917, + "learning_rate": 0.0001773368385866296, + "loss": 1.7958, + "step": 2301 + }, + { + "epoch": 0.24218832193582326, + "grad_norm": 1.3485262393951416, + "learning_rate": 0.00017731523051366658, + "loss": 1.935, + "step": 2302 + }, + { + "epoch": 0.24229352972119936, + "grad_norm": 1.3391015529632568, + "learning_rate": 0.0001772936134623404, + "loss": 1.9303, + "step": 2303 + }, + { + "epoch": 0.24239873750657548, + "grad_norm": 1.4360216856002808, + "learning_rate": 0.00017727198743516142, + "loss": 1.9268, + "step": 2304 + }, + { + "epoch": 0.2425039452919516, + "grad_norm": 1.5892516374588013, + "learning_rate": 0.00017725035243464099, + "loss": 1.6656, + "step": 2305 + }, + { + "epoch": 0.24260915307732772, + "grad_norm": 1.308066964149475, + "learning_rate": 0.0001772287084632915, + "loss": 1.7519, + "step": 2306 + }, + { + "epoch": 0.24271436086270384, + "grad_norm": 1.079184651374817, + "learning_rate": 0.00017720705552362644, + "loss": 2.3011, + "step": 2307 + }, + { + "epoch": 0.24281956864807996, + "grad_norm": 1.0954896211624146, + "learning_rate": 0.00017718539361816023, + "loss": 2.275, + "step": 2308 + }, + { + "epoch": 0.24292477643345609, + "grad_norm": 1.0315781831741333, + "learning_rate": 0.00017716372274940843, + "loss": 1.8872, + "step": 2309 + }, + { + "epoch": 0.2430299842188322, + "grad_norm": 0.9103153944015503, + "learning_rate": 0.00017714204291988762, + "loss": 2.0967, + "step": 2310 + }, + { + "epoch": 0.2431351920042083, + "grad_norm": 1.182989239692688, + "learning_rate": 0.00017712035413211535, + "loss": 1.6962, + "step": 2311 + }, + { + "epoch": 0.24324039978958442, + "grad_norm": 0.8709691166877747, + "learning_rate": 0.00017709865638861034, + "loss": 1.6754, + "step": 2312 + }, + { + "epoch": 0.24334560757496054, + "grad_norm": 1.3362677097320557, + "learning_rate": 0.0001770769496918922, + "loss": 1.6432, + "step": 2313 + }, + { + "epoch": 0.24345081536033666, + "grad_norm": 1.2893030643463135, + "learning_rate": 0.00017705523404448176, + "loss": 2.0481, + "step": 2314 + }, + { + "epoch": 0.24355602314571279, + "grad_norm": 0.913367748260498, + "learning_rate": 0.00017703350944890068, + "loss": 2.1723, + "step": 2315 + }, + { + "epoch": 0.2436612309310889, + "grad_norm": 0.9429225325584412, + "learning_rate": 0.00017701177590767183, + "loss": 2.47, + "step": 2316 + }, + { + "epoch": 0.24376643871646503, + "grad_norm": 0.9314096570014954, + "learning_rate": 0.00017699003342331904, + "loss": 2.2355, + "step": 2317 + }, + { + "epoch": 0.24387164650184115, + "grad_norm": 1.0147193670272827, + "learning_rate": 0.0001769682819983672, + "loss": 2.2912, + "step": 2318 + }, + { + "epoch": 0.24397685428721724, + "grad_norm": 0.9757153987884521, + "learning_rate": 0.00017694652163534222, + "loss": 1.817, + "step": 2319 + }, + { + "epoch": 0.24408206207259336, + "grad_norm": 1.4220627546310425, + "learning_rate": 0.00017692475233677105, + "loss": 2.0166, + "step": 2320 + }, + { + "epoch": 0.24418726985796949, + "grad_norm": 1.103240728378296, + "learning_rate": 0.0001769029741051817, + "loss": 1.9228, + "step": 2321 + }, + { + "epoch": 0.2442924776433456, + "grad_norm": 1.1101772785186768, + "learning_rate": 0.0001768811869431032, + "loss": 2.1224, + "step": 2322 + }, + { + "epoch": 0.24439768542872173, + "grad_norm": 1.5481605529785156, + "learning_rate": 0.00017685939085306562, + "loss": 2.0526, + "step": 2323 + }, + { + "epoch": 0.24450289321409785, + "grad_norm": 1.3062520027160645, + "learning_rate": 0.00017683758583760008, + "loss": 1.8925, + "step": 2324 + }, + { + "epoch": 0.24460810099947397, + "grad_norm": 1.5073399543762207, + "learning_rate": 0.00017681577189923873, + "loss": 2.0869, + "step": 2325 + }, + { + "epoch": 0.2447133087848501, + "grad_norm": 0.8444550037384033, + "learning_rate": 0.00017679394904051473, + "loss": 2.1741, + "step": 2326 + }, + { + "epoch": 0.24481851657022619, + "grad_norm": 0.9408750534057617, + "learning_rate": 0.0001767721172639623, + "loss": 1.9257, + "step": 2327 + }, + { + "epoch": 0.2449237243556023, + "grad_norm": 1.572348713874817, + "learning_rate": 0.0001767502765721167, + "loss": 2.2418, + "step": 2328 + }, + { + "epoch": 0.24502893214097843, + "grad_norm": 1.0508718490600586, + "learning_rate": 0.0001767284269675142, + "loss": 2.1225, + "step": 2329 + }, + { + "epoch": 0.24513413992635455, + "grad_norm": 1.1722843647003174, + "learning_rate": 0.00017670656845269214, + "loss": 2.103, + "step": 2330 + }, + { + "epoch": 0.24523934771173067, + "grad_norm": 1.4209791421890259, + "learning_rate": 0.00017668470103018887, + "loss": 1.5394, + "step": 2331 + }, + { + "epoch": 0.2453445554971068, + "grad_norm": 1.2421069145202637, + "learning_rate": 0.00017666282470254381, + "loss": 2.0012, + "step": 2332 + }, + { + "epoch": 0.2454497632824829, + "grad_norm": 0.9292833209037781, + "learning_rate": 0.00017664093947229736, + "loss": 2.2945, + "step": 2333 + }, + { + "epoch": 0.24555497106785903, + "grad_norm": 1.0548170804977417, + "learning_rate": 0.00017661904534199097, + "loss": 2.0605, + "step": 2334 + }, + { + "epoch": 0.24566017885323513, + "grad_norm": 0.8535258769989014, + "learning_rate": 0.00017659714231416714, + "loss": 2.3665, + "step": 2335 + }, + { + "epoch": 0.24576538663861125, + "grad_norm": 1.2200136184692383, + "learning_rate": 0.00017657523039136942, + "loss": 2.0179, + "step": 2336 + }, + { + "epoch": 0.24587059442398737, + "grad_norm": 1.0717980861663818, + "learning_rate": 0.00017655330957614234, + "loss": 2.1129, + "step": 2337 + }, + { + "epoch": 0.2459758022093635, + "grad_norm": 0.9226016402244568, + "learning_rate": 0.0001765313798710315, + "loss": 2.2021, + "step": 2338 + }, + { + "epoch": 0.2460810099947396, + "grad_norm": 1.2788020372390747, + "learning_rate": 0.00017650944127858354, + "loss": 1.7879, + "step": 2339 + }, + { + "epoch": 0.24618621778011573, + "grad_norm": 1.342400074005127, + "learning_rate": 0.00017648749380134608, + "loss": 1.8949, + "step": 2340 + }, + { + "epoch": 0.24629142556549186, + "grad_norm": 0.979184627532959, + "learning_rate": 0.00017646553744186784, + "loss": 1.773, + "step": 2341 + }, + { + "epoch": 0.24639663335086798, + "grad_norm": 1.4060213565826416, + "learning_rate": 0.00017644357220269856, + "loss": 2.2494, + "step": 2342 + }, + { + "epoch": 0.24650184113624407, + "grad_norm": 1.8778382539749146, + "learning_rate": 0.0001764215980863889, + "loss": 2.1433, + "step": 2343 + }, + { + "epoch": 0.2466070489216202, + "grad_norm": 1.214531421661377, + "learning_rate": 0.00017639961509549078, + "loss": 1.936, + "step": 2344 + }, + { + "epoch": 0.2467122567069963, + "grad_norm": 1.4594982862472534, + "learning_rate": 0.0001763776232325569, + "loss": 1.9461, + "step": 2345 + }, + { + "epoch": 0.24681746449237243, + "grad_norm": 1.5665651559829712, + "learning_rate": 0.00017635562250014112, + "loss": 2.5673, + "step": 2346 + }, + { + "epoch": 0.24692267227774856, + "grad_norm": 2.044776439666748, + "learning_rate": 0.00017633361290079837, + "loss": 2.6654, + "step": 2347 + }, + { + "epoch": 0.24702788006312468, + "grad_norm": 1.7862943410873413, + "learning_rate": 0.0001763115944370845, + "loss": 2.3661, + "step": 2348 + }, + { + "epoch": 0.2471330878485008, + "grad_norm": 1.3187406063079834, + "learning_rate": 0.00017628956711155644, + "loss": 1.5839, + "step": 2349 + }, + { + "epoch": 0.24723829563387692, + "grad_norm": 1.1463435888290405, + "learning_rate": 0.0001762675309267722, + "loss": 2.1677, + "step": 2350 + }, + { + "epoch": 0.247343503419253, + "grad_norm": 1.8317863941192627, + "learning_rate": 0.00017624548588529072, + "loss": 2.1135, + "step": 2351 + }, + { + "epoch": 0.24744871120462913, + "grad_norm": 1.2014631032943726, + "learning_rate": 0.00017622343198967202, + "loss": 1.9363, + "step": 2352 + }, + { + "epoch": 0.24755391899000526, + "grad_norm": 1.3410941362380981, + "learning_rate": 0.00017620136924247719, + "loss": 2.149, + "step": 2353 + }, + { + "epoch": 0.24765912677538138, + "grad_norm": 1.2935651540756226, + "learning_rate": 0.00017617929764626825, + "loss": 2.3697, + "step": 2354 + }, + { + "epoch": 0.2477643345607575, + "grad_norm": 1.6687452793121338, + "learning_rate": 0.00017615721720360834, + "loss": 1.979, + "step": 2355 + }, + { + "epoch": 0.24786954234613362, + "grad_norm": 1.249712347984314, + "learning_rate": 0.00017613512791706155, + "loss": 1.6578, + "step": 2356 + }, + { + "epoch": 0.24797475013150974, + "grad_norm": 1.1926326751708984, + "learning_rate": 0.0001761130297891931, + "loss": 2.3114, + "step": 2357 + }, + { + "epoch": 0.24807995791688586, + "grad_norm": 1.159462332725525, + "learning_rate": 0.00017609092282256912, + "loss": 1.8345, + "step": 2358 + }, + { + "epoch": 0.24818516570226196, + "grad_norm": 1.2362000942230225, + "learning_rate": 0.00017606880701975683, + "loss": 2.6279, + "step": 2359 + }, + { + "epoch": 0.24829037348763808, + "grad_norm": 1.116899013519287, + "learning_rate": 0.00017604668238332448, + "loss": 2.3413, + "step": 2360 + }, + { + "epoch": 0.2483955812730142, + "grad_norm": 1.3379223346710205, + "learning_rate": 0.0001760245489158413, + "loss": 1.7864, + "step": 2361 + }, + { + "epoch": 0.24850078905839032, + "grad_norm": 1.2308326959609985, + "learning_rate": 0.00017600240661987763, + "loss": 2.0573, + "step": 2362 + }, + { + "epoch": 0.24860599684376644, + "grad_norm": 1.1983873844146729, + "learning_rate": 0.00017598025549800473, + "loss": 1.7869, + "step": 2363 + }, + { + "epoch": 0.24871120462914256, + "grad_norm": 1.1547279357910156, + "learning_rate": 0.00017595809555279494, + "loss": 1.9424, + "step": 2364 + }, + { + "epoch": 0.24881641241451868, + "grad_norm": 1.0015462636947632, + "learning_rate": 0.00017593592678682166, + "loss": 2.0275, + "step": 2365 + }, + { + "epoch": 0.2489216201998948, + "grad_norm": 0.9650968313217163, + "learning_rate": 0.00017591374920265923, + "loss": 1.815, + "step": 2366 + }, + { + "epoch": 0.2490268279852709, + "grad_norm": 1.2822868824005127, + "learning_rate": 0.00017589156280288311, + "loss": 2.1693, + "step": 2367 + }, + { + "epoch": 0.24913203577064702, + "grad_norm": 1.4724416732788086, + "learning_rate": 0.00017586936759006968, + "loss": 2.15, + "step": 2368 + }, + { + "epoch": 0.24923724355602314, + "grad_norm": 1.5108137130737305, + "learning_rate": 0.00017584716356679647, + "loss": 1.6316, + "step": 2369 + }, + { + "epoch": 0.24934245134139926, + "grad_norm": 1.047283411026001, + "learning_rate": 0.0001758249507356419, + "loss": 1.9975, + "step": 2370 + }, + { + "epoch": 0.24944765912677538, + "grad_norm": 1.0887430906295776, + "learning_rate": 0.00017580272909918545, + "loss": 1.6365, + "step": 2371 + }, + { + "epoch": 0.2495528669121515, + "grad_norm": 0.8896718621253967, + "learning_rate": 0.0001757804986600077, + "loss": 1.9713, + "step": 2372 + }, + { + "epoch": 0.24965807469752763, + "grad_norm": 0.9527761340141296, + "learning_rate": 0.00017575825942069018, + "loss": 1.6953, + "step": 2373 + }, + { + "epoch": 0.24976328248290375, + "grad_norm": 1.2873328924179077, + "learning_rate": 0.00017573601138381548, + "loss": 2.5118, + "step": 2374 + }, + { + "epoch": 0.24986849026827984, + "grad_norm": 0.9107836484909058, + "learning_rate": 0.00017571375455196714, + "loss": 1.8383, + "step": 2375 + }, + { + "epoch": 0.24997369805365596, + "grad_norm": 1.5500174760818481, + "learning_rate": 0.00017569148892772983, + "loss": 1.7582, + "step": 2376 + }, + { + "epoch": 0.2500789058390321, + "grad_norm": 1.8706609010696411, + "learning_rate": 0.0001756692145136891, + "loss": 1.4764, + "step": 2377 + }, + { + "epoch": 0.25018411362440823, + "grad_norm": 1.1527718305587769, + "learning_rate": 0.00017564693131243172, + "loss": 2.3996, + "step": 2378 + }, + { + "epoch": 0.2502893214097843, + "grad_norm": 1.112915277481079, + "learning_rate": 0.0001756246393265453, + "loss": 2.4006, + "step": 2379 + }, + { + "epoch": 0.2503945291951604, + "grad_norm": 1.4158530235290527, + "learning_rate": 0.00017560233855861855, + "loss": 2.4866, + "step": 2380 + }, + { + "epoch": 0.25049973698053657, + "grad_norm": 1.486147403717041, + "learning_rate": 0.00017558002901124113, + "loss": 1.7218, + "step": 2381 + }, + { + "epoch": 0.25060494476591266, + "grad_norm": 1.7639609575271606, + "learning_rate": 0.00017555771068700386, + "loss": 1.7781, + "step": 2382 + }, + { + "epoch": 0.2507101525512888, + "grad_norm": 1.4476866722106934, + "learning_rate": 0.00017553538358849844, + "loss": 1.8637, + "step": 2383 + }, + { + "epoch": 0.2508153603366649, + "grad_norm": 1.6569194793701172, + "learning_rate": 0.00017551304771831766, + "loss": 1.8196, + "step": 2384 + }, + { + "epoch": 0.25092056812204105, + "grad_norm": 1.3412871360778809, + "learning_rate": 0.0001754907030790553, + "loss": 1.9298, + "step": 2385 + }, + { + "epoch": 0.25102577590741715, + "grad_norm": 1.0702909231185913, + "learning_rate": 0.00017546834967330617, + "loss": 2.0842, + "step": 2386 + }, + { + "epoch": 0.25113098369279324, + "grad_norm": 0.887639045715332, + "learning_rate": 0.00017544598750366614, + "loss": 2.1446, + "step": 2387 + }, + { + "epoch": 0.2512361914781694, + "grad_norm": 0.9364414811134338, + "learning_rate": 0.000175423616572732, + "loss": 2.218, + "step": 2388 + }, + { + "epoch": 0.2513413992635455, + "grad_norm": 1.431705355644226, + "learning_rate": 0.00017540123688310162, + "loss": 1.7036, + "step": 2389 + }, + { + "epoch": 0.25144660704892163, + "grad_norm": 1.2119871377944946, + "learning_rate": 0.00017537884843737392, + "loss": 2.1576, + "step": 2390 + }, + { + "epoch": 0.2515518148342977, + "grad_norm": 1.0442527532577515, + "learning_rate": 0.00017535645123814873, + "loss": 2.1174, + "step": 2391 + }, + { + "epoch": 0.2516570226196739, + "grad_norm": 1.7288084030151367, + "learning_rate": 0.000175334045288027, + "loss": 1.7428, + "step": 2392 + }, + { + "epoch": 0.25176223040504997, + "grad_norm": 0.9551234245300293, + "learning_rate": 0.00017531163058961066, + "loss": 1.8753, + "step": 2393 + }, + { + "epoch": 0.2518674381904261, + "grad_norm": 1.1774919033050537, + "learning_rate": 0.0001752892071455027, + "loss": 2.1542, + "step": 2394 + }, + { + "epoch": 0.2519726459758022, + "grad_norm": 1.0245354175567627, + "learning_rate": 0.000175266774958307, + "loss": 1.8211, + "step": 2395 + }, + { + "epoch": 0.2520778537611783, + "grad_norm": 1.1799890995025635, + "learning_rate": 0.0001752443340306286, + "loss": 1.9991, + "step": 2396 + }, + { + "epoch": 0.25218306154655445, + "grad_norm": 2.0197417736053467, + "learning_rate": 0.00017522188436507342, + "loss": 1.7232, + "step": 2397 + }, + { + "epoch": 0.25228826933193055, + "grad_norm": 1.235841155052185, + "learning_rate": 0.0001751994259642485, + "loss": 2.0587, + "step": 2398 + }, + { + "epoch": 0.2523934771173067, + "grad_norm": 1.3500566482543945, + "learning_rate": 0.00017517695883076192, + "loss": 1.611, + "step": 2399 + }, + { + "epoch": 0.2524986849026828, + "grad_norm": 1.6199384927749634, + "learning_rate": 0.00017515448296722262, + "loss": 1.7509, + "step": 2400 + }, + { + "epoch": 0.25260389268805894, + "grad_norm": 1.55356764793396, + "learning_rate": 0.00017513199837624073, + "loss": 1.9007, + "step": 2401 + }, + { + "epoch": 0.25270910047343503, + "grad_norm": 1.423980474472046, + "learning_rate": 0.00017510950506042727, + "loss": 1.6977, + "step": 2402 + }, + { + "epoch": 0.2528143082588111, + "grad_norm": 1.0989725589752197, + "learning_rate": 0.00017508700302239428, + "loss": 1.8371, + "step": 2403 + }, + { + "epoch": 0.2529195160441873, + "grad_norm": 1.6548444032669067, + "learning_rate": 0.00017506449226475492, + "loss": 1.9493, + "step": 2404 + }, + { + "epoch": 0.25302472382956337, + "grad_norm": 1.6886351108551025, + "learning_rate": 0.00017504197279012321, + "loss": 2.4116, + "step": 2405 + }, + { + "epoch": 0.2531299316149395, + "grad_norm": 0.9853400588035583, + "learning_rate": 0.00017501944460111436, + "loss": 2.1326, + "step": 2406 + }, + { + "epoch": 0.2532351394003156, + "grad_norm": 0.8209208250045776, + "learning_rate": 0.00017499690770034443, + "loss": 2.1675, + "step": 2407 + }, + { + "epoch": 0.25334034718569176, + "grad_norm": 1.097050666809082, + "learning_rate": 0.00017497436209043055, + "loss": 2.0179, + "step": 2408 + }, + { + "epoch": 0.25344555497106785, + "grad_norm": 1.007564663887024, + "learning_rate": 0.00017495180777399088, + "loss": 2.0509, + "step": 2409 + }, + { + "epoch": 0.253550762756444, + "grad_norm": 2.713827133178711, + "learning_rate": 0.00017492924475364462, + "loss": 1.6709, + "step": 2410 + }, + { + "epoch": 0.2536559705418201, + "grad_norm": 1.4967379570007324, + "learning_rate": 0.00017490667303201186, + "loss": 1.4405, + "step": 2411 + }, + { + "epoch": 0.2537611783271962, + "grad_norm": 1.0364857912063599, + "learning_rate": 0.00017488409261171386, + "loss": 2.3166, + "step": 2412 + }, + { + "epoch": 0.25386638611257234, + "grad_norm": 1.656065583229065, + "learning_rate": 0.00017486150349537276, + "loss": 1.8276, + "step": 2413 + }, + { + "epoch": 0.25397159389794843, + "grad_norm": 1.0365979671478271, + "learning_rate": 0.00017483890568561173, + "loss": 2.1835, + "step": 2414 + }, + { + "epoch": 0.2540768016833246, + "grad_norm": 0.7417734861373901, + "learning_rate": 0.0001748162991850551, + "loss": 2.2011, + "step": 2415 + }, + { + "epoch": 0.2541820094687007, + "grad_norm": 1.164849042892456, + "learning_rate": 0.00017479368399632797, + "loss": 2.0921, + "step": 2416 + }, + { + "epoch": 0.2542872172540768, + "grad_norm": 1.243613362312317, + "learning_rate": 0.0001747710601220566, + "loss": 1.3508, + "step": 2417 + }, + { + "epoch": 0.2543924250394529, + "grad_norm": 1.5702705383300781, + "learning_rate": 0.0001747484275648682, + "loss": 2.0768, + "step": 2418 + }, + { + "epoch": 0.254497632824829, + "grad_norm": 1.5284066200256348, + "learning_rate": 0.0001747257863273911, + "loss": 2.144, + "step": 2419 + }, + { + "epoch": 0.25460284061020516, + "grad_norm": 1.222301721572876, + "learning_rate": 0.0001747031364122545, + "loss": 1.833, + "step": 2420 + }, + { + "epoch": 0.25470804839558125, + "grad_norm": 1.341837763786316, + "learning_rate": 0.00017468047782208865, + "loss": 1.9542, + "step": 2421 + }, + { + "epoch": 0.2548132561809574, + "grad_norm": 1.10076904296875, + "learning_rate": 0.00017465781055952482, + "loss": 1.6218, + "step": 2422 + }, + { + "epoch": 0.2549184639663335, + "grad_norm": 1.2603121995925903, + "learning_rate": 0.0001746351346271953, + "loss": 2.0813, + "step": 2423 + }, + { + "epoch": 0.25502367175170965, + "grad_norm": 1.3991905450820923, + "learning_rate": 0.00017461245002773336, + "loss": 2.2184, + "step": 2424 + }, + { + "epoch": 0.25512887953708574, + "grad_norm": 0.874963641166687, + "learning_rate": 0.00017458975676377326, + "loss": 2.2639, + "step": 2425 + }, + { + "epoch": 0.2552340873224619, + "grad_norm": 1.275423288345337, + "learning_rate": 0.00017456705483795038, + "loss": 1.8021, + "step": 2426 + }, + { + "epoch": 0.255339295107838, + "grad_norm": 1.325136423110962, + "learning_rate": 0.00017454434425290093, + "loss": 2.2287, + "step": 2427 + }, + { + "epoch": 0.2554445028932141, + "grad_norm": 1.783543348312378, + "learning_rate": 0.00017452162501126227, + "loss": 1.6924, + "step": 2428 + }, + { + "epoch": 0.2555497106785902, + "grad_norm": 1.0781643390655518, + "learning_rate": 0.0001744988971156727, + "loss": 1.8557, + "step": 2429 + }, + { + "epoch": 0.2556549184639663, + "grad_norm": 1.3726019859313965, + "learning_rate": 0.00017447616056877148, + "loss": 1.7486, + "step": 2430 + }, + { + "epoch": 0.25576012624934247, + "grad_norm": 1.8802924156188965, + "learning_rate": 0.000174453415373199, + "loss": 1.8817, + "step": 2431 + }, + { + "epoch": 0.25586533403471856, + "grad_norm": 1.2403502464294434, + "learning_rate": 0.00017443066153159656, + "loss": 2.513, + "step": 2432 + }, + { + "epoch": 0.2559705418200947, + "grad_norm": 0.8978909850120544, + "learning_rate": 0.00017440789904660652, + "loss": 2.163, + "step": 2433 + }, + { + "epoch": 0.2560757496054708, + "grad_norm": 1.4527589082717896, + "learning_rate": 0.00017438512792087218, + "loss": 2.2242, + "step": 2434 + }, + { + "epoch": 0.2561809573908469, + "grad_norm": 1.3555766344070435, + "learning_rate": 0.00017436234815703788, + "loss": 2.0565, + "step": 2435 + }, + { + "epoch": 0.25628616517622305, + "grad_norm": 1.2842960357666016, + "learning_rate": 0.0001743395597577489, + "loss": 2.1045, + "step": 2436 + }, + { + "epoch": 0.25639137296159914, + "grad_norm": 0.9545568823814392, + "learning_rate": 0.0001743167627256517, + "loss": 1.7778, + "step": 2437 + }, + { + "epoch": 0.2564965807469753, + "grad_norm": 1.2293113470077515, + "learning_rate": 0.00017429395706339355, + "loss": 2.1365, + "step": 2438 + }, + { + "epoch": 0.2566017885323514, + "grad_norm": 1.2413405179977417, + "learning_rate": 0.0001742711427736228, + "loss": 2.3555, + "step": 2439 + }, + { + "epoch": 0.25670699631772753, + "grad_norm": 0.9717143774032593, + "learning_rate": 0.00017424831985898883, + "loss": 1.5027, + "step": 2440 + }, + { + "epoch": 0.2568122041031036, + "grad_norm": 1.797363042831421, + "learning_rate": 0.0001742254883221419, + "loss": 1.6288, + "step": 2441 + }, + { + "epoch": 0.2569174118884798, + "grad_norm": 1.5854530334472656, + "learning_rate": 0.0001742026481657335, + "loss": 2.0474, + "step": 2442 + }, + { + "epoch": 0.25702261967385587, + "grad_norm": 1.1225696802139282, + "learning_rate": 0.0001741797993924159, + "loss": 2.2607, + "step": 2443 + }, + { + "epoch": 0.25712782745923196, + "grad_norm": 1.4167970418930054, + "learning_rate": 0.00017415694200484247, + "loss": 1.8007, + "step": 2444 + }, + { + "epoch": 0.2572330352446081, + "grad_norm": 1.275691032409668, + "learning_rate": 0.00017413407600566755, + "loss": 1.6061, + "step": 2445 + }, + { + "epoch": 0.2573382430299842, + "grad_norm": 1.4618197679519653, + "learning_rate": 0.00017411120139754652, + "loss": 2.1667, + "step": 2446 + }, + { + "epoch": 0.25744345081536035, + "grad_norm": 1.4069091081619263, + "learning_rate": 0.00017408831818313566, + "loss": 1.4578, + "step": 2447 + }, + { + "epoch": 0.25754865860073645, + "grad_norm": 1.2108229398727417, + "learning_rate": 0.0001740654263650924, + "loss": 2.1124, + "step": 2448 + }, + { + "epoch": 0.2576538663861126, + "grad_norm": 1.4401681423187256, + "learning_rate": 0.0001740425259460751, + "loss": 1.7578, + "step": 2449 + }, + { + "epoch": 0.2577590741714887, + "grad_norm": 1.814821481704712, + "learning_rate": 0.00017401961692874304, + "loss": 2.2632, + "step": 2450 + }, + { + "epoch": 0.2578642819568648, + "grad_norm": 1.439861536026001, + "learning_rate": 0.00017399669931575663, + "loss": 2.0677, + "step": 2451 + }, + { + "epoch": 0.25796948974224093, + "grad_norm": 1.9982361793518066, + "learning_rate": 0.0001739737731097772, + "loss": 2.1271, + "step": 2452 + }, + { + "epoch": 0.258074697527617, + "grad_norm": 1.41429603099823, + "learning_rate": 0.00017395083831346707, + "loss": 1.7051, + "step": 2453 + }, + { + "epoch": 0.2581799053129932, + "grad_norm": 1.6925151348114014, + "learning_rate": 0.0001739278949294896, + "loss": 2.1179, + "step": 2454 + }, + { + "epoch": 0.25828511309836927, + "grad_norm": 1.9045064449310303, + "learning_rate": 0.0001739049429605091, + "loss": 1.7847, + "step": 2455 + }, + { + "epoch": 0.2583903208837454, + "grad_norm": 1.0189872980117798, + "learning_rate": 0.00017388198240919102, + "loss": 2.0259, + "step": 2456 + }, + { + "epoch": 0.2584955286691215, + "grad_norm": 1.117735505104065, + "learning_rate": 0.00017385901327820157, + "loss": 2.3478, + "step": 2457 + }, + { + "epoch": 0.25860073645449766, + "grad_norm": 1.060653567314148, + "learning_rate": 0.0001738360355702081, + "loss": 2.1214, + "step": 2458 + }, + { + "epoch": 0.25870594423987375, + "grad_norm": 1.3654861450195312, + "learning_rate": 0.00017381304928787897, + "loss": 1.7643, + "step": 2459 + }, + { + "epoch": 0.25881115202524985, + "grad_norm": 0.8313367366790771, + "learning_rate": 0.00017379005443388348, + "loss": 1.7377, + "step": 2460 + }, + { + "epoch": 0.258916359810626, + "grad_norm": 0.9707303643226624, + "learning_rate": 0.00017376705101089198, + "loss": 2.323, + "step": 2461 + }, + { + "epoch": 0.2590215675960021, + "grad_norm": 1.2245289087295532, + "learning_rate": 0.0001737440390215757, + "loss": 1.6078, + "step": 2462 + }, + { + "epoch": 0.25912677538137824, + "grad_norm": 1.0803886651992798, + "learning_rate": 0.00017372101846860707, + "loss": 1.9575, + "step": 2463 + }, + { + "epoch": 0.25923198316675433, + "grad_norm": 0.9316054582595825, + "learning_rate": 0.00017369798935465926, + "loss": 1.5473, + "step": 2464 + }, + { + "epoch": 0.2593371909521305, + "grad_norm": 1.0661040544509888, + "learning_rate": 0.00017367495168240667, + "loss": 2.1103, + "step": 2465 + }, + { + "epoch": 0.2594423987375066, + "grad_norm": 0.8651044368743896, + "learning_rate": 0.00017365190545452452, + "loss": 2.0468, + "step": 2466 + }, + { + "epoch": 0.25954760652288267, + "grad_norm": 1.3294917345046997, + "learning_rate": 0.00017362885067368915, + "loss": 1.7681, + "step": 2467 + }, + { + "epoch": 0.2596528143082588, + "grad_norm": 1.514784574508667, + "learning_rate": 0.0001736057873425778, + "loss": 2.0482, + "step": 2468 + }, + { + "epoch": 0.2597580220936349, + "grad_norm": 1.4999936819076538, + "learning_rate": 0.00017358271546386874, + "loss": 1.8877, + "step": 2469 + }, + { + "epoch": 0.25986322987901106, + "grad_norm": 0.9134921431541443, + "learning_rate": 0.00017355963504024123, + "loss": 2.1311, + "step": 2470 + }, + { + "epoch": 0.25996843766438715, + "grad_norm": 1.9929500818252563, + "learning_rate": 0.0001735365460743755, + "loss": 1.5913, + "step": 2471 + }, + { + "epoch": 0.2600736454497633, + "grad_norm": 1.1672104597091675, + "learning_rate": 0.00017351344856895287, + "loss": 1.9764, + "step": 2472 + }, + { + "epoch": 0.2601788532351394, + "grad_norm": 1.5550178289413452, + "learning_rate": 0.0001734903425266555, + "loss": 2.003, + "step": 2473 + }, + { + "epoch": 0.26028406102051554, + "grad_norm": 0.8885960578918457, + "learning_rate": 0.00017346722795016665, + "loss": 1.7482, + "step": 2474 + }, + { + "epoch": 0.26038926880589164, + "grad_norm": 0.9775580763816833, + "learning_rate": 0.00017344410484217056, + "loss": 2.274, + "step": 2475 + }, + { + "epoch": 0.26049447659126773, + "grad_norm": 1.9509919881820679, + "learning_rate": 0.00017342097320535244, + "loss": 2.0511, + "step": 2476 + }, + { + "epoch": 0.2605996843766439, + "grad_norm": 1.423757553100586, + "learning_rate": 0.00017339783304239843, + "loss": 1.6038, + "step": 2477 + }, + { + "epoch": 0.26070489216202, + "grad_norm": 1.0820887088775635, + "learning_rate": 0.0001733746843559958, + "loss": 1.811, + "step": 2478 + }, + { + "epoch": 0.2608100999473961, + "grad_norm": 1.143119215965271, + "learning_rate": 0.00017335152714883267, + "loss": 1.6442, + "step": 2479 + }, + { + "epoch": 0.2609153077327722, + "grad_norm": 0.8639788627624512, + "learning_rate": 0.00017332836142359823, + "loss": 2.1835, + "step": 2480 + }, + { + "epoch": 0.26102051551814837, + "grad_norm": 1.1601319313049316, + "learning_rate": 0.00017330518718298264, + "loss": 1.8785, + "step": 2481 + }, + { + "epoch": 0.26112572330352446, + "grad_norm": 1.2111576795578003, + "learning_rate": 0.00017328200442967706, + "loss": 2.1807, + "step": 2482 + }, + { + "epoch": 0.26123093108890055, + "grad_norm": 1.271608591079712, + "learning_rate": 0.00017325881316637362, + "loss": 2.0741, + "step": 2483 + }, + { + "epoch": 0.2613361388742767, + "grad_norm": 1.105197548866272, + "learning_rate": 0.00017323561339576543, + "loss": 1.9965, + "step": 2484 + }, + { + "epoch": 0.2614413466596528, + "grad_norm": 1.9587740898132324, + "learning_rate": 0.00017321240512054663, + "loss": 1.8361, + "step": 2485 + }, + { + "epoch": 0.26154655444502894, + "grad_norm": 1.2962583303451538, + "learning_rate": 0.0001731891883434123, + "loss": 2.3446, + "step": 2486 + }, + { + "epoch": 0.26165176223040504, + "grad_norm": 1.0470434427261353, + "learning_rate": 0.00017316596306705853, + "loss": 2.0182, + "step": 2487 + }, + { + "epoch": 0.2617569700157812, + "grad_norm": 2.4733150005340576, + "learning_rate": 0.0001731427292941824, + "loss": 2.066, + "step": 2488 + }, + { + "epoch": 0.2618621778011573, + "grad_norm": 1.580335259437561, + "learning_rate": 0.00017311948702748196, + "loss": 2.2136, + "step": 2489 + }, + { + "epoch": 0.26196738558653343, + "grad_norm": 1.2528669834136963, + "learning_rate": 0.0001730962362696563, + "loss": 2.0774, + "step": 2490 + }, + { + "epoch": 0.2620725933719095, + "grad_norm": 1.2130522727966309, + "learning_rate": 0.0001730729770234054, + "loss": 1.7999, + "step": 2491 + }, + { + "epoch": 0.2621778011572856, + "grad_norm": 1.3857465982437134, + "learning_rate": 0.00017304970929143032, + "loss": 1.8678, + "step": 2492 + }, + { + "epoch": 0.26228300894266177, + "grad_norm": 1.4128077030181885, + "learning_rate": 0.00017302643307643304, + "loss": 2.1356, + "step": 2493 + }, + { + "epoch": 0.26238821672803786, + "grad_norm": 1.1475266218185425, + "learning_rate": 0.00017300314838111653, + "loss": 1.8912, + "step": 2494 + }, + { + "epoch": 0.262493424513414, + "grad_norm": 0.7525075674057007, + "learning_rate": 0.0001729798552081848, + "loss": 1.9335, + "step": 2495 + }, + { + "epoch": 0.2625986322987901, + "grad_norm": 1.9416919946670532, + "learning_rate": 0.00017295655356034284, + "loss": 2.0356, + "step": 2496 + }, + { + "epoch": 0.26270384008416625, + "grad_norm": 1.7642756700515747, + "learning_rate": 0.00017293324344029652, + "loss": 1.6941, + "step": 2497 + }, + { + "epoch": 0.26280904786954234, + "grad_norm": 1.2156851291656494, + "learning_rate": 0.00017290992485075282, + "loss": 1.9932, + "step": 2498 + }, + { + "epoch": 0.26291425565491844, + "grad_norm": 1.551274061203003, + "learning_rate": 0.00017288659779441962, + "loss": 2.309, + "step": 2499 + }, + { + "epoch": 0.2630194634402946, + "grad_norm": 1.3318612575531006, + "learning_rate": 0.00017286326227400583, + "loss": 1.9814, + "step": 2500 + }, + { + "epoch": 0.2631246712256707, + "grad_norm": 0.949734628200531, + "learning_rate": 0.00017283991829222133, + "loss": 1.8459, + "step": 2501 + }, + { + "epoch": 0.26322987901104683, + "grad_norm": 1.3344646692276, + "learning_rate": 0.00017281656585177698, + "loss": 1.2661, + "step": 2502 + }, + { + "epoch": 0.2633350867964229, + "grad_norm": 1.1102912425994873, + "learning_rate": 0.0001727932049553846, + "loss": 1.8715, + "step": 2503 + }, + { + "epoch": 0.2634402945817991, + "grad_norm": 1.1450645923614502, + "learning_rate": 0.00017276983560575703, + "loss": 1.9675, + "step": 2504 + }, + { + "epoch": 0.26354550236717517, + "grad_norm": 2.4335386753082275, + "learning_rate": 0.0001727464578056081, + "loss": 1.688, + "step": 2505 + }, + { + "epoch": 0.2636507101525513, + "grad_norm": 1.290065884590149, + "learning_rate": 0.00017272307155765258, + "loss": 2.4197, + "step": 2506 + }, + { + "epoch": 0.2637559179379274, + "grad_norm": 1.097562313079834, + "learning_rate": 0.00017269967686460617, + "loss": 1.8908, + "step": 2507 + }, + { + "epoch": 0.2638611257233035, + "grad_norm": 1.137505292892456, + "learning_rate": 0.00017267627372918575, + "loss": 1.9866, + "step": 2508 + }, + { + "epoch": 0.26396633350867965, + "grad_norm": 1.3950042724609375, + "learning_rate": 0.00017265286215410893, + "loss": 2.0389, + "step": 2509 + }, + { + "epoch": 0.26407154129405574, + "grad_norm": 1.7362642288208008, + "learning_rate": 0.00017262944214209452, + "loss": 1.6798, + "step": 2510 + }, + { + "epoch": 0.2641767490794319, + "grad_norm": 1.443983793258667, + "learning_rate": 0.0001726060136958621, + "loss": 2.0108, + "step": 2511 + }, + { + "epoch": 0.264281956864808, + "grad_norm": 1.4797279834747314, + "learning_rate": 0.00017258257681813244, + "loss": 2.1318, + "step": 2512 + }, + { + "epoch": 0.26438716465018414, + "grad_norm": 3.060091257095337, + "learning_rate": 0.00017255913151162714, + "loss": 2.1173, + "step": 2513 + }, + { + "epoch": 0.26449237243556023, + "grad_norm": 1.2308971881866455, + "learning_rate": 0.00017253567777906882, + "loss": 1.9013, + "step": 2514 + }, + { + "epoch": 0.2645975802209363, + "grad_norm": 1.0173864364624023, + "learning_rate": 0.00017251221562318108, + "loss": 2.111, + "step": 2515 + }, + { + "epoch": 0.2647027880063125, + "grad_norm": 1.6821645498275757, + "learning_rate": 0.0001724887450466885, + "loss": 2.4451, + "step": 2516 + }, + { + "epoch": 0.26480799579168857, + "grad_norm": 1.388883352279663, + "learning_rate": 0.0001724652660523167, + "loss": 2.4442, + "step": 2517 + }, + { + "epoch": 0.2649132035770647, + "grad_norm": 1.0989224910736084, + "learning_rate": 0.00017244177864279215, + "loss": 1.6966, + "step": 2518 + }, + { + "epoch": 0.2650184113624408, + "grad_norm": 1.3560047149658203, + "learning_rate": 0.0001724182828208424, + "loss": 1.9355, + "step": 2519 + }, + { + "epoch": 0.26512361914781696, + "grad_norm": 0.8736411333084106, + "learning_rate": 0.00017239477858919594, + "loss": 1.9303, + "step": 2520 + }, + { + "epoch": 0.26522882693319305, + "grad_norm": 0.9820002317428589, + "learning_rate": 0.00017237126595058224, + "loss": 2.123, + "step": 2521 + }, + { + "epoch": 0.2653340347185692, + "grad_norm": 1.382918357849121, + "learning_rate": 0.0001723477449077317, + "loss": 2.0202, + "step": 2522 + }, + { + "epoch": 0.2654392425039453, + "grad_norm": 1.1429475545883179, + "learning_rate": 0.00017232421546337583, + "loss": 1.7081, + "step": 2523 + }, + { + "epoch": 0.2655444502893214, + "grad_norm": 1.5039023160934448, + "learning_rate": 0.00017230067762024693, + "loss": 1.82, + "step": 2524 + }, + { + "epoch": 0.26564965807469754, + "grad_norm": 1.7104127407073975, + "learning_rate": 0.00017227713138107844, + "loss": 2.1606, + "step": 2525 + }, + { + "epoch": 0.26575486586007363, + "grad_norm": 1.3519093990325928, + "learning_rate": 0.0001722535767486047, + "loss": 1.8595, + "step": 2526 + }, + { + "epoch": 0.2658600736454498, + "grad_norm": 1.0122334957122803, + "learning_rate": 0.000172230013725561, + "loss": 2.1709, + "step": 2527 + }, + { + "epoch": 0.2659652814308259, + "grad_norm": 1.7543777227401733, + "learning_rate": 0.00017220644231468366, + "loss": 2.0135, + "step": 2528 + }, + { + "epoch": 0.266070489216202, + "grad_norm": 0.9146006107330322, + "learning_rate": 0.00017218286251870994, + "loss": 1.9128, + "step": 2529 + }, + { + "epoch": 0.2661756970015781, + "grad_norm": 1.8088133335113525, + "learning_rate": 0.0001721592743403781, + "loss": 1.7378, + "step": 2530 + }, + { + "epoch": 0.2662809047869542, + "grad_norm": 0.819983720779419, + "learning_rate": 0.00017213567778242731, + "loss": 2.0829, + "step": 2531 + }, + { + "epoch": 0.26638611257233036, + "grad_norm": 0.8384549021720886, + "learning_rate": 0.00017211207284759784, + "loss": 1.9466, + "step": 2532 + }, + { + "epoch": 0.26649132035770645, + "grad_norm": 2.0535356998443604, + "learning_rate": 0.00017208845953863076, + "loss": 2.1554, + "step": 2533 + }, + { + "epoch": 0.2665965281430826, + "grad_norm": 0.8103688955307007, + "learning_rate": 0.00017206483785826832, + "loss": 2.2735, + "step": 2534 + }, + { + "epoch": 0.2667017359284587, + "grad_norm": 1.2302720546722412, + "learning_rate": 0.00017204120780925353, + "loss": 1.7633, + "step": 2535 + }, + { + "epoch": 0.26680694371383484, + "grad_norm": 1.417521595954895, + "learning_rate": 0.0001720175693943305, + "loss": 1.8596, + "step": 2536 + }, + { + "epoch": 0.26691215149921094, + "grad_norm": 1.3307616710662842, + "learning_rate": 0.00017199392261624429, + "loss": 1.6087, + "step": 2537 + }, + { + "epoch": 0.2670173592845871, + "grad_norm": 1.0965129137039185, + "learning_rate": 0.0001719702674777409, + "loss": 2.2497, + "step": 2538 + }, + { + "epoch": 0.2671225670699632, + "grad_norm": 1.9108279943466187, + "learning_rate": 0.00017194660398156737, + "loss": 1.6981, + "step": 2539 + }, + { + "epoch": 0.2672277748553393, + "grad_norm": 1.4486432075500488, + "learning_rate": 0.0001719229321304716, + "loss": 1.6847, + "step": 2540 + }, + { + "epoch": 0.2673329826407154, + "grad_norm": 1.0305055379867554, + "learning_rate": 0.00017189925192720258, + "loss": 2.2104, + "step": 2541 + }, + { + "epoch": 0.2674381904260915, + "grad_norm": 1.1246416568756104, + "learning_rate": 0.0001718755633745102, + "loss": 1.5966, + "step": 2542 + }, + { + "epoch": 0.26754339821146766, + "grad_norm": 1.09429931640625, + "learning_rate": 0.00017185186647514531, + "loss": 2.6052, + "step": 2543 + }, + { + "epoch": 0.26764860599684376, + "grad_norm": 1.0281459093093872, + "learning_rate": 0.0001718281612318598, + "loss": 1.998, + "step": 2544 + }, + { + "epoch": 0.2677538137822199, + "grad_norm": 1.2437368631362915, + "learning_rate": 0.0001718044476474064, + "loss": 2.2582, + "step": 2545 + }, + { + "epoch": 0.267859021567596, + "grad_norm": 1.0594416856765747, + "learning_rate": 0.00017178072572453896, + "loss": 1.4396, + "step": 2546 + }, + { + "epoch": 0.2679642293529721, + "grad_norm": 1.3116780519485474, + "learning_rate": 0.00017175699546601223, + "loss": 2.0078, + "step": 2547 + }, + { + "epoch": 0.26806943713834824, + "grad_norm": 1.8310174942016602, + "learning_rate": 0.00017173325687458188, + "loss": 1.6103, + "step": 2548 + }, + { + "epoch": 0.26817464492372434, + "grad_norm": 1.762455940246582, + "learning_rate": 0.00017170950995300466, + "loss": 1.9907, + "step": 2549 + }, + { + "epoch": 0.2682798527091005, + "grad_norm": 1.2596118450164795, + "learning_rate": 0.00017168575470403815, + "loss": 1.6975, + "step": 2550 + }, + { + "epoch": 0.2683850604944766, + "grad_norm": 1.0478299856185913, + "learning_rate": 0.000171661991130441, + "loss": 1.7204, + "step": 2551 + }, + { + "epoch": 0.26849026827985273, + "grad_norm": 2.9845941066741943, + "learning_rate": 0.0001716382192349728, + "loss": 1.4462, + "step": 2552 + }, + { + "epoch": 0.2685954760652288, + "grad_norm": 1.6229699850082397, + "learning_rate": 0.00017161443902039412, + "loss": 1.9217, + "step": 2553 + }, + { + "epoch": 0.26870068385060497, + "grad_norm": 1.0252301692962646, + "learning_rate": 0.00017159065048946644, + "loss": 1.9798, + "step": 2554 + }, + { + "epoch": 0.26880589163598106, + "grad_norm": 1.1335787773132324, + "learning_rate": 0.00017156685364495226, + "loss": 2.2337, + "step": 2555 + }, + { + "epoch": 0.26891109942135716, + "grad_norm": 1.192686676979065, + "learning_rate": 0.00017154304848961504, + "loss": 2.0498, + "step": 2556 + }, + { + "epoch": 0.2690163072067333, + "grad_norm": 1.2302987575531006, + "learning_rate": 0.00017151923502621918, + "loss": 2.0112, + "step": 2557 + }, + { + "epoch": 0.2691215149921094, + "grad_norm": 1.11729097366333, + "learning_rate": 0.00017149541325753008, + "loss": 1.5503, + "step": 2558 + }, + { + "epoch": 0.26922672277748555, + "grad_norm": 1.276808261871338, + "learning_rate": 0.00017147158318631402, + "loss": 1.7576, + "step": 2559 + }, + { + "epoch": 0.26933193056286164, + "grad_norm": 1.1111189126968384, + "learning_rate": 0.0001714477448153384, + "loss": 1.7231, + "step": 2560 + }, + { + "epoch": 0.2694371383482378, + "grad_norm": 1.3912397623062134, + "learning_rate": 0.00017142389814737142, + "loss": 1.9727, + "step": 2561 + }, + { + "epoch": 0.2695423461336139, + "grad_norm": 1.2298855781555176, + "learning_rate": 0.00017140004318518236, + "loss": 1.8264, + "step": 2562 + }, + { + "epoch": 0.26964755391899, + "grad_norm": 0.9467676877975464, + "learning_rate": 0.0001713761799315414, + "loss": 2.2141, + "step": 2563 + }, + { + "epoch": 0.26975276170436613, + "grad_norm": 1.3311164379119873, + "learning_rate": 0.00017135230838921967, + "loss": 1.7519, + "step": 2564 + }, + { + "epoch": 0.2698579694897422, + "grad_norm": 1.4334806203842163, + "learning_rate": 0.00017132842856098937, + "loss": 2.1874, + "step": 2565 + }, + { + "epoch": 0.26996317727511837, + "grad_norm": 1.0612467527389526, + "learning_rate": 0.0001713045404496235, + "loss": 2.026, + "step": 2566 + }, + { + "epoch": 0.27006838506049446, + "grad_norm": 2.4904496669769287, + "learning_rate": 0.00017128064405789618, + "loss": 1.9635, + "step": 2567 + }, + { + "epoch": 0.2701735928458706, + "grad_norm": 1.3276349306106567, + "learning_rate": 0.00017125673938858237, + "loss": 1.9939, + "step": 2568 + }, + { + "epoch": 0.2702788006312467, + "grad_norm": 1.099576711654663, + "learning_rate": 0.0001712328264444581, + "loss": 2.0464, + "step": 2569 + }, + { + "epoch": 0.27038400841662286, + "grad_norm": 1.4065220355987549, + "learning_rate": 0.00017120890522830017, + "loss": 2.1369, + "step": 2570 + }, + { + "epoch": 0.27048921620199895, + "grad_norm": 1.4357450008392334, + "learning_rate": 0.00017118497574288664, + "loss": 1.6478, + "step": 2571 + }, + { + "epoch": 0.27059442398737504, + "grad_norm": 1.6182183027267456, + "learning_rate": 0.00017116103799099625, + "loss": 1.3441, + "step": 2572 + }, + { + "epoch": 0.2706996317727512, + "grad_norm": 1.831085443496704, + "learning_rate": 0.00017113709197540887, + "loss": 1.8925, + "step": 2573 + }, + { + "epoch": 0.2708048395581273, + "grad_norm": 1.2130355834960938, + "learning_rate": 0.0001711131376989052, + "loss": 1.8865, + "step": 2574 + }, + { + "epoch": 0.27091004734350344, + "grad_norm": 0.9673578143119812, + "learning_rate": 0.00017108917516426704, + "loss": 1.6841, + "step": 2575 + }, + { + "epoch": 0.27101525512887953, + "grad_norm": 1.0448040962219238, + "learning_rate": 0.00017106520437427708, + "loss": 2.3308, + "step": 2576 + }, + { + "epoch": 0.2711204629142557, + "grad_norm": 0.8160276412963867, + "learning_rate": 0.00017104122533171895, + "loss": 2.3082, + "step": 2577 + }, + { + "epoch": 0.27122567069963177, + "grad_norm": 1.134219765663147, + "learning_rate": 0.00017101723803937722, + "loss": 2.3576, + "step": 2578 + }, + { + "epoch": 0.27133087848500786, + "grad_norm": 1.3649998903274536, + "learning_rate": 0.00017099324250003753, + "loss": 2.0624, + "step": 2579 + }, + { + "epoch": 0.271436086270384, + "grad_norm": 1.4927576780319214, + "learning_rate": 0.00017096923871648634, + "loss": 1.884, + "step": 2580 + }, + { + "epoch": 0.2715412940557601, + "grad_norm": 1.5744551420211792, + "learning_rate": 0.00017094522669151117, + "loss": 1.2819, + "step": 2581 + }, + { + "epoch": 0.27164650184113626, + "grad_norm": 1.319411277770996, + "learning_rate": 0.00017092120642790042, + "loss": 1.7153, + "step": 2582 + }, + { + "epoch": 0.27175170962651235, + "grad_norm": 1.3168647289276123, + "learning_rate": 0.00017089717792844353, + "loss": 1.0528, + "step": 2583 + }, + { + "epoch": 0.2718569174118885, + "grad_norm": 1.3358632326126099, + "learning_rate": 0.00017087314119593078, + "loss": 2.0915, + "step": 2584 + }, + { + "epoch": 0.2719621251972646, + "grad_norm": 1.1743086576461792, + "learning_rate": 0.00017084909623315357, + "loss": 1.875, + "step": 2585 + }, + { + "epoch": 0.27206733298264074, + "grad_norm": 1.6610081195831299, + "learning_rate": 0.00017082504304290408, + "loss": 2.0099, + "step": 2586 + }, + { + "epoch": 0.27217254076801684, + "grad_norm": 1.2851457595825195, + "learning_rate": 0.0001708009816279756, + "loss": 1.8666, + "step": 2587 + }, + { + "epoch": 0.27227774855339293, + "grad_norm": 0.9746414422988892, + "learning_rate": 0.00017077691199116223, + "loss": 2.0146, + "step": 2588 + }, + { + "epoch": 0.2723829563387691, + "grad_norm": 0.9980962872505188, + "learning_rate": 0.00017075283413525916, + "loss": 2.1908, + "step": 2589 + }, + { + "epoch": 0.27248816412414517, + "grad_norm": 1.0672366619110107, + "learning_rate": 0.00017072874806306246, + "loss": 2.4187, + "step": 2590 + }, + { + "epoch": 0.2725933719095213, + "grad_norm": 1.1528050899505615, + "learning_rate": 0.00017070465377736914, + "loss": 1.5965, + "step": 2591 + }, + { + "epoch": 0.2726985796948974, + "grad_norm": 1.3461138010025024, + "learning_rate": 0.00017068055128097718, + "loss": 2.3055, + "step": 2592 + }, + { + "epoch": 0.27280378748027356, + "grad_norm": 0.876352071762085, + "learning_rate": 0.00017065644057668555, + "loss": 1.8871, + "step": 2593 + }, + { + "epoch": 0.27290899526564966, + "grad_norm": 1.1798096895217896, + "learning_rate": 0.00017063232166729413, + "loss": 1.7881, + "step": 2594 + }, + { + "epoch": 0.27301420305102575, + "grad_norm": 0.9815201163291931, + "learning_rate": 0.00017060819455560382, + "loss": 1.9022, + "step": 2595 + }, + { + "epoch": 0.2731194108364019, + "grad_norm": 1.1576168537139893, + "learning_rate": 0.00017058405924441636, + "loss": 1.8402, + "step": 2596 + }, + { + "epoch": 0.273224618621778, + "grad_norm": 1.1924322843551636, + "learning_rate": 0.00017055991573653454, + "loss": 1.7346, + "step": 2597 + }, + { + "epoch": 0.27332982640715414, + "grad_norm": 1.4050421714782715, + "learning_rate": 0.00017053576403476206, + "loss": 1.8869, + "step": 2598 + }, + { + "epoch": 0.27343503419253024, + "grad_norm": 1.221558928489685, + "learning_rate": 0.00017051160414190353, + "loss": 2.056, + "step": 2599 + }, + { + "epoch": 0.2735402419779064, + "grad_norm": 1.141769528388977, + "learning_rate": 0.00017048743606076463, + "loss": 2.0981, + "step": 2600 + }, + { + "epoch": 0.2736454497632825, + "grad_norm": 1.7314319610595703, + "learning_rate": 0.0001704632597941519, + "loss": 1.6496, + "step": 2601 + }, + { + "epoch": 0.2737506575486586, + "grad_norm": 1.8792369365692139, + "learning_rate": 0.0001704390753448728, + "loss": 2.0167, + "step": 2602 + }, + { + "epoch": 0.2738558653340347, + "grad_norm": 1.1898305416107178, + "learning_rate": 0.00017041488271573587, + "loss": 2.1775, + "step": 2603 + }, + { + "epoch": 0.2739610731194108, + "grad_norm": 1.3105398416519165, + "learning_rate": 0.00017039068190955047, + "loss": 2.2253, + "step": 2604 + }, + { + "epoch": 0.27406628090478696, + "grad_norm": 1.4331053495407104, + "learning_rate": 0.00017036647292912696, + "loss": 1.6136, + "step": 2605 + }, + { + "epoch": 0.27417148869016306, + "grad_norm": 1.059443712234497, + "learning_rate": 0.00017034225577727667, + "loss": 1.6369, + "step": 2606 + }, + { + "epoch": 0.2742766964755392, + "grad_norm": 1.0023260116577148, + "learning_rate": 0.00017031803045681188, + "loss": 2.3698, + "step": 2607 + }, + { + "epoch": 0.2743819042609153, + "grad_norm": 2.0491862297058105, + "learning_rate": 0.00017029379697054573, + "loss": 1.6626, + "step": 2608 + }, + { + "epoch": 0.27448711204629145, + "grad_norm": 1.3866617679595947, + "learning_rate": 0.0001702695553212924, + "loss": 1.8494, + "step": 2609 + }, + { + "epoch": 0.27459231983166754, + "grad_norm": 1.2196578979492188, + "learning_rate": 0.00017024530551186702, + "loss": 1.9425, + "step": 2610 + }, + { + "epoch": 0.27469752761704364, + "grad_norm": 0.9433722496032715, + "learning_rate": 0.00017022104754508562, + "loss": 2.2177, + "step": 2611 + }, + { + "epoch": 0.2748027354024198, + "grad_norm": 1.2304083108901978, + "learning_rate": 0.0001701967814237652, + "loss": 1.6013, + "step": 2612 + }, + { + "epoch": 0.2749079431877959, + "grad_norm": 1.4996156692504883, + "learning_rate": 0.0001701725071507237, + "loss": 1.9624, + "step": 2613 + }, + { + "epoch": 0.275013150973172, + "grad_norm": 2.1100172996520996, + "learning_rate": 0.00017014822472878, + "loss": 1.4203, + "step": 2614 + }, + { + "epoch": 0.2751183587585481, + "grad_norm": 1.0386134386062622, + "learning_rate": 0.00017012393416075398, + "loss": 2.0622, + "step": 2615 + }, + { + "epoch": 0.27522356654392427, + "grad_norm": 2.1451127529144287, + "learning_rate": 0.0001700996354494664, + "loss": 1.6151, + "step": 2616 + }, + { + "epoch": 0.27532877432930036, + "grad_norm": 1.6166155338287354, + "learning_rate": 0.000170075328597739, + "loss": 1.8475, + "step": 2617 + }, + { + "epoch": 0.2754339821146765, + "grad_norm": 1.1394798755645752, + "learning_rate": 0.00017005101360839442, + "loss": 1.9892, + "step": 2618 + }, + { + "epoch": 0.2755391899000526, + "grad_norm": 1.3837336301803589, + "learning_rate": 0.00017002669048425632, + "loss": 2.2187, + "step": 2619 + }, + { + "epoch": 0.2756443976854287, + "grad_norm": 1.8999110460281372, + "learning_rate": 0.00017000235922814922, + "loss": 2.209, + "step": 2620 + }, + { + "epoch": 0.27574960547080485, + "grad_norm": 1.2286471128463745, + "learning_rate": 0.00016997801984289866, + "loss": 1.8712, + "step": 2621 + }, + { + "epoch": 0.27585481325618094, + "grad_norm": 1.0605058670043945, + "learning_rate": 0.00016995367233133113, + "loss": 2.3471, + "step": 2622 + }, + { + "epoch": 0.2759600210415571, + "grad_norm": 1.838855266571045, + "learning_rate": 0.00016992931669627392, + "loss": 1.7718, + "step": 2623 + }, + { + "epoch": 0.2760652288269332, + "grad_norm": 1.4109746217727661, + "learning_rate": 0.00016990495294055548, + "loss": 1.8921, + "step": 2624 + }, + { + "epoch": 0.27617043661230933, + "grad_norm": 1.4461547136306763, + "learning_rate": 0.00016988058106700505, + "loss": 2.1139, + "step": 2625 + }, + { + "epoch": 0.2762756443976854, + "grad_norm": 1.5509105920791626, + "learning_rate": 0.00016985620107845282, + "loss": 1.7556, + "step": 2626 + }, + { + "epoch": 0.2763808521830615, + "grad_norm": 0.8982523679733276, + "learning_rate": 0.00016983181297773, + "loss": 1.9906, + "step": 2627 + }, + { + "epoch": 0.27648605996843767, + "grad_norm": 1.2106822729110718, + "learning_rate": 0.0001698074167676687, + "loss": 1.9958, + "step": 2628 + }, + { + "epoch": 0.27659126775381376, + "grad_norm": 1.0443248748779297, + "learning_rate": 0.00016978301245110195, + "loss": 1.5866, + "step": 2629 + }, + { + "epoch": 0.2766964755391899, + "grad_norm": 0.8765720129013062, + "learning_rate": 0.00016975860003086378, + "loss": 2.1551, + "step": 2630 + }, + { + "epoch": 0.276801683324566, + "grad_norm": 1.5277172327041626, + "learning_rate": 0.00016973417950978906, + "loss": 1.5894, + "step": 2631 + }, + { + "epoch": 0.27690689110994215, + "grad_norm": 1.0646064281463623, + "learning_rate": 0.00016970975089071371, + "loss": 2.0302, + "step": 2632 + }, + { + "epoch": 0.27701209889531825, + "grad_norm": 0.9117512702941895, + "learning_rate": 0.00016968531417647456, + "loss": 2.1135, + "step": 2633 + }, + { + "epoch": 0.2771173066806944, + "grad_norm": 1.429713487625122, + "learning_rate": 0.0001696608693699093, + "loss": 1.8757, + "step": 2634 + }, + { + "epoch": 0.2772225144660705, + "grad_norm": 1.1439270973205566, + "learning_rate": 0.00016963641647385673, + "loss": 1.9884, + "step": 2635 + }, + { + "epoch": 0.2773277222514466, + "grad_norm": 0.8106045126914978, + "learning_rate": 0.00016961195549115637, + "loss": 2.0197, + "step": 2636 + }, + { + "epoch": 0.27743293003682273, + "grad_norm": 1.2242646217346191, + "learning_rate": 0.00016958748642464887, + "loss": 2.0144, + "step": 2637 + }, + { + "epoch": 0.2775381378221988, + "grad_norm": 1.4185434579849243, + "learning_rate": 0.00016956300927717575, + "loss": 2.1412, + "step": 2638 + }, + { + "epoch": 0.277643345607575, + "grad_norm": 1.560724139213562, + "learning_rate": 0.0001695385240515794, + "loss": 2.1001, + "step": 2639 + }, + { + "epoch": 0.27774855339295107, + "grad_norm": 1.4623364210128784, + "learning_rate": 0.00016951403075070323, + "loss": 1.9271, + "step": 2640 + }, + { + "epoch": 0.2778537611783272, + "grad_norm": 0.8837911486625671, + "learning_rate": 0.00016948952937739155, + "loss": 1.621, + "step": 2641 + }, + { + "epoch": 0.2779589689637033, + "grad_norm": 1.3567240238189697, + "learning_rate": 0.00016946501993448968, + "loss": 1.9028, + "step": 2642 + }, + { + "epoch": 0.2780641767490794, + "grad_norm": 1.2848210334777832, + "learning_rate": 0.00016944050242484378, + "loss": 1.6725, + "step": 2643 + }, + { + "epoch": 0.27816938453445555, + "grad_norm": 1.0903668403625488, + "learning_rate": 0.00016941597685130098, + "loss": 1.9121, + "step": 2644 + }, + { + "epoch": 0.27827459231983165, + "grad_norm": 1.519652247428894, + "learning_rate": 0.0001693914432167094, + "loss": 2.1452, + "step": 2645 + }, + { + "epoch": 0.2783798001052078, + "grad_norm": 1.244167447090149, + "learning_rate": 0.000169366901523918, + "loss": 2.3742, + "step": 2646 + }, + { + "epoch": 0.2784850078905839, + "grad_norm": 1.5084514617919922, + "learning_rate": 0.00016934235177577673, + "loss": 1.4103, + "step": 2647 + }, + { + "epoch": 0.27859021567596004, + "grad_norm": 1.1031715869903564, + "learning_rate": 0.00016931779397513652, + "loss": 1.1998, + "step": 2648 + }, + { + "epoch": 0.27869542346133613, + "grad_norm": 1.4771610498428345, + "learning_rate": 0.0001692932281248491, + "loss": 1.7789, + "step": 2649 + }, + { + "epoch": 0.2788006312467123, + "grad_norm": 1.3364812135696411, + "learning_rate": 0.00016926865422776737, + "loss": 1.576, + "step": 2650 + }, + { + "epoch": 0.2789058390320884, + "grad_norm": 1.3395233154296875, + "learning_rate": 0.00016924407228674485, + "loss": 2.1849, + "step": 2651 + }, + { + "epoch": 0.27901104681746447, + "grad_norm": 1.4787706136703491, + "learning_rate": 0.00016921948230463625, + "loss": 1.7253, + "step": 2652 + }, + { + "epoch": 0.2791162546028406, + "grad_norm": 1.3256549835205078, + "learning_rate": 0.0001691948842842971, + "loss": 1.7977, + "step": 2653 + }, + { + "epoch": 0.2792214623882167, + "grad_norm": 1.2367287874221802, + "learning_rate": 0.0001691702782285839, + "loss": 2.3646, + "step": 2654 + }, + { + "epoch": 0.27932667017359286, + "grad_norm": 1.422200083732605, + "learning_rate": 0.00016914566414035403, + "loss": 1.379, + "step": 2655 + }, + { + "epoch": 0.27943187795896895, + "grad_norm": 1.134897232055664, + "learning_rate": 0.0001691210420224659, + "loss": 2.3472, + "step": 2656 + }, + { + "epoch": 0.2795370857443451, + "grad_norm": 1.5080149173736572, + "learning_rate": 0.00016909641187777877, + "loss": 1.8669, + "step": 2657 + }, + { + "epoch": 0.2796422935297212, + "grad_norm": 0.9561126828193665, + "learning_rate": 0.00016907177370915287, + "loss": 2.1019, + "step": 2658 + }, + { + "epoch": 0.2797475013150973, + "grad_norm": 1.668290138244629, + "learning_rate": 0.00016904712751944931, + "loss": 2.1505, + "step": 2659 + }, + { + "epoch": 0.27985270910047344, + "grad_norm": 1.1527786254882812, + "learning_rate": 0.0001690224733115302, + "loss": 2.1848, + "step": 2660 + }, + { + "epoch": 0.27995791688584953, + "grad_norm": 1.4785900115966797, + "learning_rate": 0.0001689978110882586, + "loss": 2.2729, + "step": 2661 + }, + { + "epoch": 0.2800631246712257, + "grad_norm": 1.4332002401351929, + "learning_rate": 0.00016897314085249834, + "loss": 2.0227, + "step": 2662 + }, + { + "epoch": 0.2801683324566018, + "grad_norm": 1.4204810857772827, + "learning_rate": 0.00016894846260711438, + "loss": 2.3247, + "step": 2663 + }, + { + "epoch": 0.2802735402419779, + "grad_norm": 1.2798484563827515, + "learning_rate": 0.00016892377635497252, + "loss": 1.9749, + "step": 2664 + }, + { + "epoch": 0.280378748027354, + "grad_norm": 1.1675047874450684, + "learning_rate": 0.00016889908209893943, + "loss": 2.3451, + "step": 2665 + }, + { + "epoch": 0.28048395581273017, + "grad_norm": 0.8004907369613647, + "learning_rate": 0.00016887437984188286, + "loss": 2.2131, + "step": 2666 + }, + { + "epoch": 0.28058916359810626, + "grad_norm": 1.09357750415802, + "learning_rate": 0.00016884966958667132, + "loss": 2.1521, + "step": 2667 + }, + { + "epoch": 0.28069437138348236, + "grad_norm": 0.926487147808075, + "learning_rate": 0.00016882495133617437, + "loss": 2.0104, + "step": 2668 + }, + { + "epoch": 0.2807995791688585, + "grad_norm": 1.332603096961975, + "learning_rate": 0.0001688002250932625, + "loss": 1.4487, + "step": 2669 + }, + { + "epoch": 0.2809047869542346, + "grad_norm": 0.9102864265441895, + "learning_rate": 0.000168775490860807, + "loss": 2.0708, + "step": 2670 + }, + { + "epoch": 0.28100999473961075, + "grad_norm": 1.8600473403930664, + "learning_rate": 0.0001687507486416802, + "loss": 1.901, + "step": 2671 + }, + { + "epoch": 0.28111520252498684, + "grad_norm": 1.7245584726333618, + "learning_rate": 0.00016872599843875544, + "loss": 1.7357, + "step": 2672 + }, + { + "epoch": 0.281220410310363, + "grad_norm": 0.940122663974762, + "learning_rate": 0.00016870124025490673, + "loss": 1.7142, + "step": 2673 + }, + { + "epoch": 0.2813256180957391, + "grad_norm": 1.2285487651824951, + "learning_rate": 0.0001686764740930092, + "loss": 2.1235, + "step": 2674 + }, + { + "epoch": 0.2814308258811152, + "grad_norm": 1.5378491878509521, + "learning_rate": 0.0001686516999559389, + "loss": 1.9053, + "step": 2675 + }, + { + "epoch": 0.2815360336664913, + "grad_norm": 1.1821976900100708, + "learning_rate": 0.00016862691784657273, + "loss": 2.2413, + "step": 2676 + }, + { + "epoch": 0.2816412414518674, + "grad_norm": 0.931596040725708, + "learning_rate": 0.0001686021277677886, + "loss": 1.9538, + "step": 2677 + }, + { + "epoch": 0.28174644923724357, + "grad_norm": 2.422799825668335, + "learning_rate": 0.00016857732972246528, + "loss": 2.2234, + "step": 2678 + }, + { + "epoch": 0.28185165702261966, + "grad_norm": 1.1872382164001465, + "learning_rate": 0.00016855252371348245, + "loss": 2.0544, + "step": 2679 + }, + { + "epoch": 0.2819568648079958, + "grad_norm": 0.8804065585136414, + "learning_rate": 0.0001685277097437208, + "loss": 1.9305, + "step": 2680 + }, + { + "epoch": 0.2820620725933719, + "grad_norm": 0.8961731195449829, + "learning_rate": 0.00016850288781606186, + "loss": 1.8034, + "step": 2681 + }, + { + "epoch": 0.28216728037874805, + "grad_norm": 1.362924575805664, + "learning_rate": 0.00016847805793338818, + "loss": 2.5519, + "step": 2682 + }, + { + "epoch": 0.28227248816412415, + "grad_norm": 1.105214238166809, + "learning_rate": 0.00016845322009858307, + "loss": 2.212, + "step": 2683 + }, + { + "epoch": 0.28237769594950024, + "grad_norm": 1.1936612129211426, + "learning_rate": 0.00016842837431453093, + "loss": 2.4178, + "step": 2684 + }, + { + "epoch": 0.2824829037348764, + "grad_norm": 1.873302936553955, + "learning_rate": 0.000168403520584117, + "loss": 1.7252, + "step": 2685 + }, + { + "epoch": 0.2825881115202525, + "grad_norm": 1.440673589706421, + "learning_rate": 0.0001683786589102275, + "loss": 2.1752, + "step": 2686 + }, + { + "epoch": 0.28269331930562863, + "grad_norm": 1.075239658355713, + "learning_rate": 0.0001683537892957495, + "loss": 1.8977, + "step": 2687 + }, + { + "epoch": 0.2827985270910047, + "grad_norm": 1.338554859161377, + "learning_rate": 0.00016832891174357103, + "loss": 1.5485, + "step": 2688 + }, + { + "epoch": 0.2829037348763809, + "grad_norm": 1.2033013105392456, + "learning_rate": 0.00016830402625658104, + "loss": 2.182, + "step": 2689 + }, + { + "epoch": 0.28300894266175697, + "grad_norm": 1.282525658607483, + "learning_rate": 0.00016827913283766938, + "loss": 1.8707, + "step": 2690 + }, + { + "epoch": 0.28311415044713306, + "grad_norm": 1.3480403423309326, + "learning_rate": 0.0001682542314897269, + "loss": 1.5163, + "step": 2691 + }, + { + "epoch": 0.2832193582325092, + "grad_norm": 1.0361692905426025, + "learning_rate": 0.00016822932221564524, + "loss": 2.2251, + "step": 2692 + }, + { + "epoch": 0.2833245660178853, + "grad_norm": 1.486932635307312, + "learning_rate": 0.0001682044050183171, + "loss": 1.9494, + "step": 2693 + }, + { + "epoch": 0.28342977380326145, + "grad_norm": 1.127858281135559, + "learning_rate": 0.00016817947990063598, + "loss": 2.0038, + "step": 2694 + }, + { + "epoch": 0.28353498158863755, + "grad_norm": 1.3391180038452148, + "learning_rate": 0.00016815454686549636, + "loss": 1.6913, + "step": 2695 + }, + { + "epoch": 0.2836401893740137, + "grad_norm": 1.6128203868865967, + "learning_rate": 0.00016812960591579366, + "loss": 1.7916, + "step": 2696 + }, + { + "epoch": 0.2837453971593898, + "grad_norm": 1.50484037399292, + "learning_rate": 0.00016810465705442416, + "loss": 1.7511, + "step": 2697 + }, + { + "epoch": 0.28385060494476594, + "grad_norm": 1.4077752828598022, + "learning_rate": 0.00016807970028428508, + "loss": 2.2318, + "step": 2698 + }, + { + "epoch": 0.28395581273014203, + "grad_norm": 1.3436038494110107, + "learning_rate": 0.0001680547356082746, + "loss": 2.0339, + "step": 2699 + }, + { + "epoch": 0.2840610205155181, + "grad_norm": 1.6516331434249878, + "learning_rate": 0.00016802976302929178, + "loss": 1.5578, + "step": 2700 + }, + { + "epoch": 0.2841662283008943, + "grad_norm": 2.065028429031372, + "learning_rate": 0.0001680047825502366, + "loss": 1.8351, + "step": 2701 + }, + { + "epoch": 0.28427143608627037, + "grad_norm": 1.5128923654556274, + "learning_rate": 0.00016797979417400996, + "loss": 1.9321, + "step": 2702 + }, + { + "epoch": 0.2843766438716465, + "grad_norm": 2.0304954051971436, + "learning_rate": 0.00016795479790351366, + "loss": 1.3774, + "step": 2703 + }, + { + "epoch": 0.2844818516570226, + "grad_norm": 1.0998564958572388, + "learning_rate": 0.00016792979374165046, + "loss": 2.1468, + "step": 2704 + }, + { + "epoch": 0.28458705944239876, + "grad_norm": 2.0375499725341797, + "learning_rate": 0.00016790478169132397, + "loss": 2.3572, + "step": 2705 + }, + { + "epoch": 0.28469226722777485, + "grad_norm": 1.1634581089019775, + "learning_rate": 0.00016787976175543882, + "loss": 2.1935, + "step": 2706 + }, + { + "epoch": 0.28479747501315095, + "grad_norm": 1.2639390230178833, + "learning_rate": 0.00016785473393690045, + "loss": 1.9523, + "step": 2707 + }, + { + "epoch": 0.2849026827985271, + "grad_norm": 1.566070795059204, + "learning_rate": 0.00016782969823861526, + "loss": 2.3277, + "step": 2708 + }, + { + "epoch": 0.2850078905839032, + "grad_norm": 1.1275237798690796, + "learning_rate": 0.0001678046546634906, + "loss": 1.8413, + "step": 2709 + }, + { + "epoch": 0.28511309836927934, + "grad_norm": 1.370336890220642, + "learning_rate": 0.00016777960321443463, + "loss": 2.0708, + "step": 2710 + }, + { + "epoch": 0.28521830615465543, + "grad_norm": 1.3864731788635254, + "learning_rate": 0.00016775454389435655, + "loss": 1.8423, + "step": 2711 + }, + { + "epoch": 0.2853235139400316, + "grad_norm": 1.9456918239593506, + "learning_rate": 0.0001677294767061664, + "loss": 2.1698, + "step": 2712 + }, + { + "epoch": 0.2854287217254077, + "grad_norm": 1.2052172422409058, + "learning_rate": 0.00016770440165277516, + "loss": 2.0182, + "step": 2713 + }, + { + "epoch": 0.2855339295107838, + "grad_norm": 1.085540533065796, + "learning_rate": 0.0001676793187370947, + "loss": 2.1485, + "step": 2714 + }, + { + "epoch": 0.2856391372961599, + "grad_norm": 1.0382028818130493, + "learning_rate": 0.0001676542279620378, + "loss": 1.9553, + "step": 2715 + }, + { + "epoch": 0.285744345081536, + "grad_norm": 1.1799119710922241, + "learning_rate": 0.0001676291293305182, + "loss": 1.8491, + "step": 2716 + }, + { + "epoch": 0.28584955286691216, + "grad_norm": 1.027492880821228, + "learning_rate": 0.00016760402284545048, + "loss": 2.1497, + "step": 2717 + }, + { + "epoch": 0.28595476065228825, + "grad_norm": 1.3454300165176392, + "learning_rate": 0.00016757890850975025, + "loss": 2.153, + "step": 2718 + }, + { + "epoch": 0.2860599684376644, + "grad_norm": 1.4455000162124634, + "learning_rate": 0.00016755378632633388, + "loss": 1.9732, + "step": 2719 + }, + { + "epoch": 0.2861651762230405, + "grad_norm": 1.3513553142547607, + "learning_rate": 0.00016752865629811873, + "loss": 1.5569, + "step": 2720 + }, + { + "epoch": 0.28627038400841665, + "grad_norm": 0.8183251619338989, + "learning_rate": 0.00016750351842802314, + "loss": 1.4716, + "step": 2721 + }, + { + "epoch": 0.28637559179379274, + "grad_norm": 2.2286715507507324, + "learning_rate": 0.00016747837271896622, + "loss": 1.947, + "step": 2722 + }, + { + "epoch": 0.28648079957916883, + "grad_norm": 0.9977225065231323, + "learning_rate": 0.00016745321917386804, + "loss": 2.348, + "step": 2723 + }, + { + "epoch": 0.286586007364545, + "grad_norm": 0.8259665369987488, + "learning_rate": 0.00016742805779564968, + "loss": 2.4167, + "step": 2724 + }, + { + "epoch": 0.2866912151499211, + "grad_norm": 0.9765976667404175, + "learning_rate": 0.00016740288858723302, + "loss": 1.8912, + "step": 2725 + }, + { + "epoch": 0.2867964229352972, + "grad_norm": 1.2133702039718628, + "learning_rate": 0.0001673777115515408, + "loss": 2.5914, + "step": 2726 + }, + { + "epoch": 0.2869016307206733, + "grad_norm": 1.3183568716049194, + "learning_rate": 0.00016735252669149685, + "loss": 1.7136, + "step": 2727 + }, + { + "epoch": 0.28700683850604947, + "grad_norm": 1.4817814826965332, + "learning_rate": 0.00016732733401002574, + "loss": 1.8334, + "step": 2728 + }, + { + "epoch": 0.28711204629142556, + "grad_norm": 1.6449335813522339, + "learning_rate": 0.00016730213351005303, + "loss": 2.3261, + "step": 2729 + }, + { + "epoch": 0.2872172540768017, + "grad_norm": 1.0374553203582764, + "learning_rate": 0.0001672769251945052, + "loss": 2.4651, + "step": 2730 + }, + { + "epoch": 0.2873224618621778, + "grad_norm": 1.436585783958435, + "learning_rate": 0.0001672517090663096, + "loss": 1.9923, + "step": 2731 + }, + { + "epoch": 0.2874276696475539, + "grad_norm": 1.1871074438095093, + "learning_rate": 0.00016722648512839446, + "loss": 2.0888, + "step": 2732 + }, + { + "epoch": 0.28753287743293005, + "grad_norm": 1.0876706838607788, + "learning_rate": 0.00016720125338368894, + "loss": 1.427, + "step": 2733 + }, + { + "epoch": 0.28763808521830614, + "grad_norm": 0.8011285662651062, + "learning_rate": 0.00016717601383512318, + "loss": 1.8155, + "step": 2734 + }, + { + "epoch": 0.2877432930036823, + "grad_norm": 1.4743174314498901, + "learning_rate": 0.00016715076648562814, + "loss": 2.1159, + "step": 2735 + }, + { + "epoch": 0.2878485007890584, + "grad_norm": 2.063098907470703, + "learning_rate": 0.00016712551133813572, + "loss": 2.3086, + "step": 2736 + }, + { + "epoch": 0.28795370857443453, + "grad_norm": 1.3487850427627563, + "learning_rate": 0.0001671002483955787, + "loss": 2.1455, + "step": 2737 + }, + { + "epoch": 0.2880589163598106, + "grad_norm": 1.5087181329727173, + "learning_rate": 0.00016707497766089082, + "loss": 1.8298, + "step": 2738 + }, + { + "epoch": 0.2881641241451867, + "grad_norm": 1.2308396100997925, + "learning_rate": 0.00016704969913700662, + "loss": 2.4025, + "step": 2739 + }, + { + "epoch": 0.28826933193056287, + "grad_norm": 1.2315316200256348, + "learning_rate": 0.00016702441282686166, + "loss": 1.8982, + "step": 2740 + }, + { + "epoch": 0.28837453971593896, + "grad_norm": 2.0835769176483154, + "learning_rate": 0.00016699911873339232, + "loss": 1.9045, + "step": 2741 + }, + { + "epoch": 0.2884797475013151, + "grad_norm": 0.9750843048095703, + "learning_rate": 0.00016697381685953596, + "loss": 1.8595, + "step": 2742 + }, + { + "epoch": 0.2885849552866912, + "grad_norm": 1.373468279838562, + "learning_rate": 0.0001669485072082308, + "loss": 2.2024, + "step": 2743 + }, + { + "epoch": 0.28869016307206735, + "grad_norm": 1.492432951927185, + "learning_rate": 0.00016692318978241594, + "loss": 2.0474, + "step": 2744 + }, + { + "epoch": 0.28879537085744345, + "grad_norm": 2.0323355197906494, + "learning_rate": 0.00016689786458503141, + "loss": 2.0644, + "step": 2745 + }, + { + "epoch": 0.2889005786428196, + "grad_norm": 1.1842961311340332, + "learning_rate": 0.0001668725316190182, + "loss": 2.1992, + "step": 2746 + }, + { + "epoch": 0.2890057864281957, + "grad_norm": 1.259732961654663, + "learning_rate": 0.00016684719088731807, + "loss": 1.7769, + "step": 2747 + }, + { + "epoch": 0.2891109942135718, + "grad_norm": 0.8478556275367737, + "learning_rate": 0.0001668218423928738, + "loss": 1.7708, + "step": 2748 + }, + { + "epoch": 0.28921620199894793, + "grad_norm": 0.826978325843811, + "learning_rate": 0.000166796486138629, + "loss": 1.8427, + "step": 2749 + }, + { + "epoch": 0.289321409784324, + "grad_norm": 0.9389429688453674, + "learning_rate": 0.00016677112212752824, + "loss": 1.7696, + "step": 2750 + }, + { + "epoch": 0.2894266175697002, + "grad_norm": 1.0777794122695923, + "learning_rate": 0.0001667457503625169, + "loss": 1.8971, + "step": 2751 + }, + { + "epoch": 0.28953182535507627, + "grad_norm": 1.1226226091384888, + "learning_rate": 0.00016672037084654139, + "loss": 1.8703, + "step": 2752 + }, + { + "epoch": 0.2896370331404524, + "grad_norm": 1.2667369842529297, + "learning_rate": 0.0001666949835825489, + "loss": 1.8995, + "step": 2753 + }, + { + "epoch": 0.2897422409258285, + "grad_norm": 1.5252774953842163, + "learning_rate": 0.0001666695885734876, + "loss": 2.2547, + "step": 2754 + }, + { + "epoch": 0.2898474487112046, + "grad_norm": 1.3169254064559937, + "learning_rate": 0.0001666441858223065, + "loss": 1.7265, + "step": 2755 + }, + { + "epoch": 0.28995265649658075, + "grad_norm": 1.177160382270813, + "learning_rate": 0.00016661877533195556, + "loss": 1.8653, + "step": 2756 + }, + { + "epoch": 0.29005786428195685, + "grad_norm": 1.028063416481018, + "learning_rate": 0.00016659335710538564, + "loss": 2.0327, + "step": 2757 + }, + { + "epoch": 0.290163072067333, + "grad_norm": 1.1455798149108887, + "learning_rate": 0.00016656793114554842, + "loss": 2.0376, + "step": 2758 + }, + { + "epoch": 0.2902682798527091, + "grad_norm": 1.1332205533981323, + "learning_rate": 0.00016654249745539656, + "loss": 1.9765, + "step": 2759 + }, + { + "epoch": 0.29037348763808524, + "grad_norm": 2.2703967094421387, + "learning_rate": 0.00016651705603788362, + "loss": 2.2095, + "step": 2760 + }, + { + "epoch": 0.29047869542346133, + "grad_norm": 1.228402853012085, + "learning_rate": 0.00016649160689596396, + "loss": 2.0485, + "step": 2761 + }, + { + "epoch": 0.2905839032088375, + "grad_norm": 0.901681125164032, + "learning_rate": 0.00016646615003259295, + "loss": 1.8973, + "step": 2762 + }, + { + "epoch": 0.2906891109942136, + "grad_norm": 1.9370111227035522, + "learning_rate": 0.00016644068545072682, + "loss": 2.3251, + "step": 2763 + }, + { + "epoch": 0.29079431877958967, + "grad_norm": 1.6907079219818115, + "learning_rate": 0.00016641521315332265, + "loss": 1.8123, + "step": 2764 + }, + { + "epoch": 0.2908995265649658, + "grad_norm": 0.9530484080314636, + "learning_rate": 0.0001663897331433385, + "loss": 1.4562, + "step": 2765 + }, + { + "epoch": 0.2910047343503419, + "grad_norm": 0.8451758623123169, + "learning_rate": 0.00016636424542373324, + "loss": 1.7969, + "step": 2766 + }, + { + "epoch": 0.29110994213571806, + "grad_norm": 0.9732938408851624, + "learning_rate": 0.00016633874999746667, + "loss": 1.7659, + "step": 2767 + }, + { + "epoch": 0.29121514992109415, + "grad_norm": 1.020836591720581, + "learning_rate": 0.00016631324686749958, + "loss": 2.0531, + "step": 2768 + }, + { + "epoch": 0.2913203577064703, + "grad_norm": 1.1106858253479004, + "learning_rate": 0.0001662877360367934, + "loss": 1.9912, + "step": 2769 + }, + { + "epoch": 0.2914255654918464, + "grad_norm": 0.9008655548095703, + "learning_rate": 0.0001662622175083108, + "loss": 1.8145, + "step": 2770 + }, + { + "epoch": 0.2915307732772225, + "grad_norm": 0.9763224124908447, + "learning_rate": 0.00016623669128501504, + "loss": 2.3397, + "step": 2771 + }, + { + "epoch": 0.29163598106259864, + "grad_norm": 1.1344367265701294, + "learning_rate": 0.00016621115736987046, + "loss": 1.7173, + "step": 2772 + }, + { + "epoch": 0.29174118884797473, + "grad_norm": 1.7770837545394897, + "learning_rate": 0.00016618561576584216, + "loss": 1.9058, + "step": 2773 + }, + { + "epoch": 0.2918463966333509, + "grad_norm": 0.8264276385307312, + "learning_rate": 0.00016616006647589626, + "loss": 2.0614, + "step": 2774 + }, + { + "epoch": 0.291951604418727, + "grad_norm": 1.0680514574050903, + "learning_rate": 0.0001661345095029997, + "loss": 1.7038, + "step": 2775 + }, + { + "epoch": 0.2920568122041031, + "grad_norm": 1.3225088119506836, + "learning_rate": 0.00016610894485012033, + "loss": 1.9587, + "step": 2776 + }, + { + "epoch": 0.2921620199894792, + "grad_norm": 2.3858277797698975, + "learning_rate": 0.0001660833725202269, + "loss": 2.4027, + "step": 2777 + }, + { + "epoch": 0.29226722777485536, + "grad_norm": 1.472761869430542, + "learning_rate": 0.00016605779251628903, + "loss": 1.394, + "step": 2778 + }, + { + "epoch": 0.29237243556023146, + "grad_norm": 1.8934521675109863, + "learning_rate": 0.00016603220484127723, + "loss": 1.8053, + "step": 2779 + }, + { + "epoch": 0.29247764334560755, + "grad_norm": 1.2446554899215698, + "learning_rate": 0.00016600660949816291, + "loss": 2.0428, + "step": 2780 + }, + { + "epoch": 0.2925828511309837, + "grad_norm": 1.7331035137176514, + "learning_rate": 0.00016598100648991838, + "loss": 1.9776, + "step": 2781 + }, + { + "epoch": 0.2926880589163598, + "grad_norm": 1.300978422164917, + "learning_rate": 0.00016595539581951686, + "loss": 1.6838, + "step": 2782 + }, + { + "epoch": 0.29279326670173594, + "grad_norm": 1.042041540145874, + "learning_rate": 0.0001659297774899324, + "loss": 1.783, + "step": 2783 + }, + { + "epoch": 0.29289847448711204, + "grad_norm": 1.229884147644043, + "learning_rate": 0.00016590415150413997, + "loss": 1.9093, + "step": 2784 + }, + { + "epoch": 0.2930036822724882, + "grad_norm": 1.859123706817627, + "learning_rate": 0.00016587851786511543, + "loss": 2.0027, + "step": 2785 + }, + { + "epoch": 0.2931088900578643, + "grad_norm": 1.4690340757369995, + "learning_rate": 0.00016585287657583557, + "loss": 1.4002, + "step": 2786 + }, + { + "epoch": 0.2932140978432404, + "grad_norm": 1.5459439754486084, + "learning_rate": 0.00016582722763927802, + "loss": 1.4447, + "step": 2787 + }, + { + "epoch": 0.2933193056286165, + "grad_norm": 1.0940881967544556, + "learning_rate": 0.00016580157105842123, + "loss": 1.9658, + "step": 2788 + }, + { + "epoch": 0.2934245134139926, + "grad_norm": 1.102705478668213, + "learning_rate": 0.00016577590683624472, + "loss": 2.0337, + "step": 2789 + }, + { + "epoch": 0.29352972119936876, + "grad_norm": 1.6445786952972412, + "learning_rate": 0.00016575023497572872, + "loss": 1.674, + "step": 2790 + }, + { + "epoch": 0.29363492898474486, + "grad_norm": 1.018031120300293, + "learning_rate": 0.00016572455547985446, + "loss": 2.1392, + "step": 2791 + }, + { + "epoch": 0.293740136770121, + "grad_norm": 1.7538018226623535, + "learning_rate": 0.00016569886835160399, + "loss": 1.8097, + "step": 2792 + }, + { + "epoch": 0.2938453445554971, + "grad_norm": 1.397821307182312, + "learning_rate": 0.00016567317359396028, + "loss": 2.0166, + "step": 2793 + }, + { + "epoch": 0.29395055234087325, + "grad_norm": 1.5573742389678955, + "learning_rate": 0.0001656474712099072, + "loss": 1.9029, + "step": 2794 + }, + { + "epoch": 0.29405576012624934, + "grad_norm": 0.9295175671577454, + "learning_rate": 0.0001656217612024294, + "loss": 2.1548, + "step": 2795 + }, + { + "epoch": 0.29416096791162544, + "grad_norm": 0.8610682487487793, + "learning_rate": 0.00016559604357451263, + "loss": 2.2194, + "step": 2796 + }, + { + "epoch": 0.2942661756970016, + "grad_norm": 1.385067343711853, + "learning_rate": 0.00016557031832914327, + "loss": 1.9511, + "step": 2797 + }, + { + "epoch": 0.2943713834823777, + "grad_norm": 0.9952557682991028, + "learning_rate": 0.00016554458546930878, + "loss": 2.6092, + "step": 2798 + }, + { + "epoch": 0.29447659126775383, + "grad_norm": 1.1171859502792358, + "learning_rate": 0.0001655188449979974, + "loss": 1.2882, + "step": 2799 + }, + { + "epoch": 0.2945817990531299, + "grad_norm": 1.6443709135055542, + "learning_rate": 0.00016549309691819833, + "loss": 1.6552, + "step": 2800 + }, + { + "epoch": 0.29468700683850607, + "grad_norm": 0.9413725137710571, + "learning_rate": 0.00016546734123290156, + "loss": 2.4733, + "step": 2801 + }, + { + "epoch": 0.29479221462388217, + "grad_norm": 1.1774075031280518, + "learning_rate": 0.000165441577945098, + "loss": 1.8616, + "step": 2802 + }, + { + "epoch": 0.29489742240925826, + "grad_norm": 0.9130268692970276, + "learning_rate": 0.00016541580705777955, + "loss": 2.1065, + "step": 2803 + }, + { + "epoch": 0.2950026301946344, + "grad_norm": 1.0994932651519775, + "learning_rate": 0.0001653900285739388, + "loss": 2.1053, + "step": 2804 + }, + { + "epoch": 0.2951078379800105, + "grad_norm": 0.8119322657585144, + "learning_rate": 0.00016536424249656933, + "loss": 2.3161, + "step": 2805 + }, + { + "epoch": 0.29521304576538665, + "grad_norm": 1.0407122373580933, + "learning_rate": 0.00016533844882866568, + "loss": 2.0664, + "step": 2806 + }, + { + "epoch": 0.29531825355076274, + "grad_norm": 1.3417972326278687, + "learning_rate": 0.00016531264757322308, + "loss": 1.924, + "step": 2807 + }, + { + "epoch": 0.2954234613361389, + "grad_norm": 1.2302452325820923, + "learning_rate": 0.0001652868387332378, + "loss": 1.8267, + "step": 2808 + }, + { + "epoch": 0.295528669121515, + "grad_norm": 1.2221964597702026, + "learning_rate": 0.00016526102231170691, + "loss": 2.0054, + "step": 2809 + }, + { + "epoch": 0.29563387690689114, + "grad_norm": 1.5506309270858765, + "learning_rate": 0.0001652351983116284, + "loss": 1.9843, + "step": 2810 + }, + { + "epoch": 0.29573908469226723, + "grad_norm": 1.3232911825180054, + "learning_rate": 0.00016520936673600117, + "loss": 2.2001, + "step": 2811 + }, + { + "epoch": 0.2958442924776433, + "grad_norm": 1.1889708042144775, + "learning_rate": 0.00016518352758782486, + "loss": 1.3271, + "step": 2812 + }, + { + "epoch": 0.29594950026301947, + "grad_norm": 1.4930849075317383, + "learning_rate": 0.00016515768087010013, + "loss": 2.121, + "step": 2813 + }, + { + "epoch": 0.29605470804839557, + "grad_norm": 1.3592177629470825, + "learning_rate": 0.0001651318265858285, + "loss": 2.0658, + "step": 2814 + }, + { + "epoch": 0.2961599158337717, + "grad_norm": 1.2253750562667847, + "learning_rate": 0.00016510596473801232, + "loss": 1.8405, + "step": 2815 + }, + { + "epoch": 0.2962651236191478, + "grad_norm": 1.1184381246566772, + "learning_rate": 0.00016508009532965485, + "loss": 1.7584, + "step": 2816 + }, + { + "epoch": 0.29637033140452396, + "grad_norm": 1.332506537437439, + "learning_rate": 0.0001650542183637602, + "loss": 1.8449, + "step": 2817 + }, + { + "epoch": 0.29647553918990005, + "grad_norm": 1.4818626642227173, + "learning_rate": 0.00016502833384333338, + "loss": 2.0897, + "step": 2818 + }, + { + "epoch": 0.29658074697527614, + "grad_norm": 0.7966058254241943, + "learning_rate": 0.0001650024417713803, + "loss": 1.9058, + "step": 2819 + }, + { + "epoch": 0.2966859547606523, + "grad_norm": 1.4730877876281738, + "learning_rate": 0.00016497654215090772, + "loss": 1.8389, + "step": 2820 + }, + { + "epoch": 0.2967911625460284, + "grad_norm": 1.3068904876708984, + "learning_rate": 0.00016495063498492326, + "loss": 2.4661, + "step": 2821 + }, + { + "epoch": 0.29689637033140454, + "grad_norm": 1.5194685459136963, + "learning_rate": 0.00016492472027643541, + "loss": 1.685, + "step": 2822 + }, + { + "epoch": 0.29700157811678063, + "grad_norm": 0.9527503848075867, + "learning_rate": 0.00016489879802845361, + "loss": 2.2541, + "step": 2823 + }, + { + "epoch": 0.2971067859021568, + "grad_norm": 1.2973871231079102, + "learning_rate": 0.0001648728682439881, + "loss": 2.3992, + "step": 2824 + }, + { + "epoch": 0.29721199368753287, + "grad_norm": 1.154152750968933, + "learning_rate": 0.00016484693092605002, + "loss": 1.7631, + "step": 2825 + }, + { + "epoch": 0.297317201472909, + "grad_norm": 1.4371882677078247, + "learning_rate": 0.00016482098607765137, + "loss": 1.8837, + "step": 2826 + }, + { + "epoch": 0.2974224092582851, + "grad_norm": 1.1324549913406372, + "learning_rate": 0.00016479503370180507, + "loss": 2.2255, + "step": 2827 + }, + { + "epoch": 0.2975276170436612, + "grad_norm": 0.9485172629356384, + "learning_rate": 0.0001647690738015249, + "loss": 2.1955, + "step": 2828 + }, + { + "epoch": 0.29763282482903736, + "grad_norm": 1.0813753604888916, + "learning_rate": 0.0001647431063798254, + "loss": 1.8017, + "step": 2829 + }, + { + "epoch": 0.29773803261441345, + "grad_norm": 1.0293214321136475, + "learning_rate": 0.0001647171314397222, + "loss": 1.9523, + "step": 2830 + }, + { + "epoch": 0.2978432403997896, + "grad_norm": 1.2349849939346313, + "learning_rate": 0.00016469114898423165, + "loss": 1.8325, + "step": 2831 + }, + { + "epoch": 0.2979484481851657, + "grad_norm": 1.0925050973892212, + "learning_rate": 0.00016466515901637096, + "loss": 2.1776, + "step": 2832 + }, + { + "epoch": 0.29805365597054184, + "grad_norm": 1.1909685134887695, + "learning_rate": 0.0001646391615391583, + "loss": 1.9048, + "step": 2833 + }, + { + "epoch": 0.29815886375591794, + "grad_norm": 1.5420279502868652, + "learning_rate": 0.00016461315655561263, + "loss": 1.8361, + "step": 2834 + }, + { + "epoch": 0.29826407154129403, + "grad_norm": 1.3840932846069336, + "learning_rate": 0.00016458714406875392, + "loss": 1.9673, + "step": 2835 + }, + { + "epoch": 0.2983692793266702, + "grad_norm": 1.4501628875732422, + "learning_rate": 0.0001645611240816028, + "loss": 1.3108, + "step": 2836 + }, + { + "epoch": 0.29847448711204627, + "grad_norm": 1.304292917251587, + "learning_rate": 0.00016453509659718093, + "loss": 1.4608, + "step": 2837 + }, + { + "epoch": 0.2985796948974224, + "grad_norm": 1.1828423738479614, + "learning_rate": 0.00016450906161851082, + "loss": 1.8168, + "step": 2838 + }, + { + "epoch": 0.2986849026827985, + "grad_norm": 1.2528148889541626, + "learning_rate": 0.00016448301914861584, + "loss": 1.5209, + "step": 2839 + }, + { + "epoch": 0.29879011046817466, + "grad_norm": 0.9708895683288574, + "learning_rate": 0.00016445696919052013, + "loss": 2.2244, + "step": 2840 + }, + { + "epoch": 0.29889531825355076, + "grad_norm": 1.3018192052841187, + "learning_rate": 0.00016443091174724885, + "loss": 1.6512, + "step": 2841 + }, + { + "epoch": 0.2990005260389269, + "grad_norm": 1.1458721160888672, + "learning_rate": 0.00016440484682182799, + "loss": 2.0022, + "step": 2842 + }, + { + "epoch": 0.299105733824303, + "grad_norm": 2.0912718772888184, + "learning_rate": 0.00016437877441728433, + "loss": 1.9703, + "step": 2843 + }, + { + "epoch": 0.2992109416096791, + "grad_norm": 1.1203174591064453, + "learning_rate": 0.00016435269453664558, + "loss": 1.9248, + "step": 2844 + }, + { + "epoch": 0.29931614939505524, + "grad_norm": 1.6757680177688599, + "learning_rate": 0.00016432660718294033, + "loss": 1.8856, + "step": 2845 + }, + { + "epoch": 0.29942135718043134, + "grad_norm": 1.9626332521438599, + "learning_rate": 0.00016430051235919802, + "loss": 1.3917, + "step": 2846 + }, + { + "epoch": 0.2995265649658075, + "grad_norm": 1.4652724266052246, + "learning_rate": 0.00016427441006844893, + "loss": 2.1621, + "step": 2847 + }, + { + "epoch": 0.2996317727511836, + "grad_norm": 1.40691339969635, + "learning_rate": 0.00016424830031372425, + "loss": 1.8587, + "step": 2848 + }, + { + "epoch": 0.2997369805365597, + "grad_norm": 1.054113745689392, + "learning_rate": 0.000164222183098056, + "loss": 1.9425, + "step": 2849 + }, + { + "epoch": 0.2998421883219358, + "grad_norm": 1.6056944131851196, + "learning_rate": 0.00016419605842447714, + "loss": 1.5958, + "step": 2850 + }, + { + "epoch": 0.2999473961073119, + "grad_norm": 1.3629722595214844, + "learning_rate": 0.00016416992629602142, + "loss": 1.524, + "step": 2851 + }, + { + "epoch": 0.30005260389268806, + "grad_norm": 1.3340497016906738, + "learning_rate": 0.00016414378671572344, + "loss": 1.2919, + "step": 2852 + }, + { + "epoch": 0.30015781167806416, + "grad_norm": 1.4080469608306885, + "learning_rate": 0.00016411763968661873, + "loss": 2.1183, + "step": 2853 + }, + { + "epoch": 0.3002630194634403, + "grad_norm": 1.1511781215667725, + "learning_rate": 0.00016409148521174367, + "loss": 1.4441, + "step": 2854 + }, + { + "epoch": 0.3003682272488164, + "grad_norm": 1.404482126235962, + "learning_rate": 0.00016406532329413546, + "loss": 1.9554, + "step": 2855 + }, + { + "epoch": 0.30047343503419255, + "grad_norm": 1.4892315864562988, + "learning_rate": 0.00016403915393683221, + "loss": 1.9983, + "step": 2856 + }, + { + "epoch": 0.30057864281956864, + "grad_norm": 1.159440040588379, + "learning_rate": 0.00016401297714287294, + "loss": 2.1476, + "step": 2857 + }, + { + "epoch": 0.3006838506049448, + "grad_norm": 1.5921545028686523, + "learning_rate": 0.00016398679291529738, + "loss": 2.1005, + "step": 2858 + }, + { + "epoch": 0.3007890583903209, + "grad_norm": 0.9857234358787537, + "learning_rate": 0.00016396060125714628, + "loss": 1.659, + "step": 2859 + }, + { + "epoch": 0.300894266175697, + "grad_norm": 1.3065168857574463, + "learning_rate": 0.00016393440217146114, + "loss": 2.1434, + "step": 2860 + }, + { + "epoch": 0.3009994739610731, + "grad_norm": 1.689706563949585, + "learning_rate": 0.00016390819566128445, + "loss": 1.8146, + "step": 2861 + }, + { + "epoch": 0.3011046817464492, + "grad_norm": 2.270273208618164, + "learning_rate": 0.00016388198172965942, + "loss": 2.1251, + "step": 2862 + }, + { + "epoch": 0.30120988953182537, + "grad_norm": 1.3508756160736084, + "learning_rate": 0.00016385576037963021, + "loss": 1.9374, + "step": 2863 + }, + { + "epoch": 0.30131509731720146, + "grad_norm": 1.7013899087905884, + "learning_rate": 0.00016382953161424185, + "loss": 1.9598, + "step": 2864 + }, + { + "epoch": 0.3014203051025776, + "grad_norm": 1.457298755645752, + "learning_rate": 0.00016380329543654013, + "loss": 2.2662, + "step": 2865 + }, + { + "epoch": 0.3015255128879537, + "grad_norm": 1.280228853225708, + "learning_rate": 0.00016377705184957185, + "loss": 1.9162, + "step": 2866 + }, + { + "epoch": 0.3016307206733298, + "grad_norm": 1.6916749477386475, + "learning_rate": 0.00016375080085638451, + "loss": 2.1827, + "step": 2867 + }, + { + "epoch": 0.30173592845870595, + "grad_norm": 1.3950283527374268, + "learning_rate": 0.00016372454246002663, + "loss": 1.6382, + "step": 2868 + }, + { + "epoch": 0.30184113624408204, + "grad_norm": 1.4823285341262817, + "learning_rate": 0.00016369827666354745, + "loss": 1.5104, + "step": 2869 + }, + { + "epoch": 0.3019463440294582, + "grad_norm": 0.9238750338554382, + "learning_rate": 0.00016367200346999714, + "loss": 2.1355, + "step": 2870 + }, + { + "epoch": 0.3020515518148343, + "grad_norm": 1.290266990661621, + "learning_rate": 0.00016364572288242677, + "loss": 1.9956, + "step": 2871 + }, + { + "epoch": 0.30215675960021043, + "grad_norm": 0.9198346138000488, + "learning_rate": 0.00016361943490388815, + "loss": 2.0828, + "step": 2872 + }, + { + "epoch": 0.3022619673855865, + "grad_norm": 1.6661968231201172, + "learning_rate": 0.00016359313953743406, + "loss": 1.7579, + "step": 2873 + }, + { + "epoch": 0.3023671751709627, + "grad_norm": 1.031909465789795, + "learning_rate": 0.00016356683678611807, + "loss": 2.0152, + "step": 2874 + }, + { + "epoch": 0.30247238295633877, + "grad_norm": 1.662083387374878, + "learning_rate": 0.00016354052665299468, + "loss": 2.1388, + "step": 2875 + }, + { + "epoch": 0.30257759074171486, + "grad_norm": 1.226048231124878, + "learning_rate": 0.00016351420914111916, + "loss": 2.4793, + "step": 2876 + }, + { + "epoch": 0.302682798527091, + "grad_norm": 1.6051714420318604, + "learning_rate": 0.00016348788425354766, + "loss": 1.7133, + "step": 2877 + }, + { + "epoch": 0.3027880063124671, + "grad_norm": 0.972946047782898, + "learning_rate": 0.00016346155199333721, + "loss": 1.8606, + "step": 2878 + }, + { + "epoch": 0.30289321409784326, + "grad_norm": 1.1665260791778564, + "learning_rate": 0.00016343521236354574, + "loss": 2.1413, + "step": 2879 + }, + { + "epoch": 0.30299842188321935, + "grad_norm": 1.250375509262085, + "learning_rate": 0.00016340886536723192, + "loss": 1.6386, + "step": 2880 + }, + { + "epoch": 0.3031036296685955, + "grad_norm": 1.2102625370025635, + "learning_rate": 0.00016338251100745537, + "loss": 1.9187, + "step": 2881 + }, + { + "epoch": 0.3032088374539716, + "grad_norm": 2.1711556911468506, + "learning_rate": 0.00016335614928727652, + "loss": 1.7001, + "step": 2882 + }, + { + "epoch": 0.3033140452393477, + "grad_norm": 2.243464231491089, + "learning_rate": 0.0001633297802097567, + "loss": 1.7878, + "step": 2883 + }, + { + "epoch": 0.30341925302472383, + "grad_norm": 1.4908397197723389, + "learning_rate": 0.00016330340377795804, + "loss": 1.846, + "step": 2884 + }, + { + "epoch": 0.3035244608100999, + "grad_norm": 1.6034961938858032, + "learning_rate": 0.00016327701999494353, + "loss": 1.6955, + "step": 2885 + }, + { + "epoch": 0.3036296685954761, + "grad_norm": 1.2690043449401855, + "learning_rate": 0.0001632506288637771, + "loss": 2.009, + "step": 2886 + }, + { + "epoch": 0.30373487638085217, + "grad_norm": 1.138432502746582, + "learning_rate": 0.00016322423038752336, + "loss": 1.8181, + "step": 2887 + }, + { + "epoch": 0.3038400841662283, + "grad_norm": 0.7713850140571594, + "learning_rate": 0.000163197824569248, + "loss": 2.1781, + "step": 2888 + }, + { + "epoch": 0.3039452919516044, + "grad_norm": 1.5833183526992798, + "learning_rate": 0.00016317141141201731, + "loss": 2.0445, + "step": 2889 + }, + { + "epoch": 0.30405049973698056, + "grad_norm": 1.196471095085144, + "learning_rate": 0.0001631449909188987, + "loss": 1.6292, + "step": 2890 + }, + { + "epoch": 0.30415570752235666, + "grad_norm": 1.7439603805541992, + "learning_rate": 0.0001631185630929602, + "loss": 2.1841, + "step": 2891 + }, + { + "epoch": 0.30426091530773275, + "grad_norm": 1.4162222146987915, + "learning_rate": 0.00016309212793727077, + "loss": 1.9838, + "step": 2892 + }, + { + "epoch": 0.3043661230931089, + "grad_norm": 1.5100573301315308, + "learning_rate": 0.00016306568545490033, + "loss": 2.1413, + "step": 2893 + }, + { + "epoch": 0.304471330878485, + "grad_norm": 1.3554414510726929, + "learning_rate": 0.00016303923564891948, + "loss": 2.3482, + "step": 2894 + }, + { + "epoch": 0.30457653866386114, + "grad_norm": 1.4444999694824219, + "learning_rate": 0.0001630127785223998, + "loss": 2.3344, + "step": 2895 + }, + { + "epoch": 0.30468174644923723, + "grad_norm": 1.267393708229065, + "learning_rate": 0.00016298631407841361, + "loss": 1.4887, + "step": 2896 + }, + { + "epoch": 0.3047869542346134, + "grad_norm": 1.479801893234253, + "learning_rate": 0.00016295984232003426, + "loss": 2.2112, + "step": 2897 + }, + { + "epoch": 0.3048921620199895, + "grad_norm": 1.5580319166183472, + "learning_rate": 0.0001629333632503357, + "loss": 1.7964, + "step": 2898 + }, + { + "epoch": 0.30499736980536557, + "grad_norm": 1.1315181255340576, + "learning_rate": 0.00016290687687239283, + "loss": 1.6472, + "step": 2899 + }, + { + "epoch": 0.3051025775907417, + "grad_norm": 1.8991228342056274, + "learning_rate": 0.00016288038318928156, + "loss": 1.6517, + "step": 2900 + }, + { + "epoch": 0.3052077853761178, + "grad_norm": 1.7622857093811035, + "learning_rate": 0.00016285388220407847, + "loss": 1.8343, + "step": 2901 + }, + { + "epoch": 0.30531299316149396, + "grad_norm": 1.0036766529083252, + "learning_rate": 0.00016282737391986097, + "loss": 2.2478, + "step": 2902 + }, + { + "epoch": 0.30541820094687006, + "grad_norm": 1.0161508321762085, + "learning_rate": 0.00016280085833970744, + "loss": 2.0217, + "step": 2903 + }, + { + "epoch": 0.3055234087322462, + "grad_norm": 1.6816000938415527, + "learning_rate": 0.00016277433546669703, + "loss": 1.6079, + "step": 2904 + }, + { + "epoch": 0.3056286165176223, + "grad_norm": 1.2823631763458252, + "learning_rate": 0.00016274780530390977, + "loss": 1.6762, + "step": 2905 + }, + { + "epoch": 0.30573382430299845, + "grad_norm": 1.2397310733795166, + "learning_rate": 0.00016272126785442644, + "loss": 1.9937, + "step": 2906 + }, + { + "epoch": 0.30583903208837454, + "grad_norm": 1.5026395320892334, + "learning_rate": 0.0001626947231213289, + "loss": 1.6613, + "step": 2907 + }, + { + "epoch": 0.30594423987375063, + "grad_norm": 1.2583715915679932, + "learning_rate": 0.00016266817110769955, + "loss": 2.0048, + "step": 2908 + }, + { + "epoch": 0.3060494476591268, + "grad_norm": 0.9579928517341614, + "learning_rate": 0.00016264161181662188, + "loss": 2.0814, + "step": 2909 + }, + { + "epoch": 0.3061546554445029, + "grad_norm": 1.003151297569275, + "learning_rate": 0.0001626150452511801, + "loss": 2.2672, + "step": 2910 + }, + { + "epoch": 0.306259863229879, + "grad_norm": 1.4802247285842896, + "learning_rate": 0.00016258847141445928, + "loss": 2.5276, + "step": 2911 + }, + { + "epoch": 0.3063650710152551, + "grad_norm": 1.5277745723724365, + "learning_rate": 0.00016256189030954538, + "loss": 2.1005, + "step": 2912 + }, + { + "epoch": 0.30647027880063127, + "grad_norm": 1.1211442947387695, + "learning_rate": 0.00016253530193952517, + "loss": 1.8205, + "step": 2913 + }, + { + "epoch": 0.30657548658600736, + "grad_norm": 1.083178997039795, + "learning_rate": 0.0001625087063074863, + "loss": 2.128, + "step": 2914 + }, + { + "epoch": 0.30668069437138346, + "grad_norm": 2.118847370147705, + "learning_rate": 0.00016248210341651716, + "loss": 1.9704, + "step": 2915 + }, + { + "epoch": 0.3067859021567596, + "grad_norm": 1.371398687362671, + "learning_rate": 0.00016245549326970713, + "loss": 1.5593, + "step": 2916 + }, + { + "epoch": 0.3068911099421357, + "grad_norm": 1.180167317390442, + "learning_rate": 0.0001624288758701463, + "loss": 1.7496, + "step": 2917 + }, + { + "epoch": 0.30699631772751185, + "grad_norm": 1.3497270345687866, + "learning_rate": 0.00016240225122092573, + "loss": 2.0314, + "step": 2918 + }, + { + "epoch": 0.30710152551288794, + "grad_norm": 1.6975984573364258, + "learning_rate": 0.00016237561932513718, + "loss": 1.77, + "step": 2919 + }, + { + "epoch": 0.3072067332982641, + "grad_norm": 1.6393413543701172, + "learning_rate": 0.00016234898018587337, + "loss": 1.93, + "step": 2920 + }, + { + "epoch": 0.3073119410836402, + "grad_norm": 1.7652052640914917, + "learning_rate": 0.00016232233380622779, + "loss": 1.8103, + "step": 2921 + }, + { + "epoch": 0.30741714886901633, + "grad_norm": 1.489980936050415, + "learning_rate": 0.00016229568018929483, + "loss": 1.8286, + "step": 2922 + }, + { + "epoch": 0.3075223566543924, + "grad_norm": 1.4050182104110718, + "learning_rate": 0.00016226901933816962, + "loss": 1.5679, + "step": 2923 + }, + { + "epoch": 0.3076275644397685, + "grad_norm": 1.0518805980682373, + "learning_rate": 0.0001622423512559483, + "loss": 1.6626, + "step": 2924 + }, + { + "epoch": 0.30773277222514467, + "grad_norm": 1.4546114206314087, + "learning_rate": 0.00016221567594572762, + "loss": 1.9034, + "step": 2925 + }, + { + "epoch": 0.30783798001052076, + "grad_norm": 0.9748585820198059, + "learning_rate": 0.00016218899341060542, + "loss": 2.2178, + "step": 2926 + }, + { + "epoch": 0.3079431877958969, + "grad_norm": 1.3530354499816895, + "learning_rate": 0.00016216230365368017, + "loss": 2.2276, + "step": 2927 + }, + { + "epoch": 0.308048395581273, + "grad_norm": 0.9455937743186951, + "learning_rate": 0.00016213560667805127, + "loss": 1.9019, + "step": 2928 + }, + { + "epoch": 0.30815360336664915, + "grad_norm": 1.0966365337371826, + "learning_rate": 0.00016210890248681906, + "loss": 1.5131, + "step": 2929 + }, + { + "epoch": 0.30825881115202525, + "grad_norm": 0.9960981607437134, + "learning_rate": 0.00016208219108308444, + "loss": 1.9477, + "step": 2930 + }, + { + "epoch": 0.30836401893740134, + "grad_norm": 1.0510518550872803, + "learning_rate": 0.00016205547246994945, + "loss": 1.7093, + "step": 2931 + }, + { + "epoch": 0.3084692267227775, + "grad_norm": 1.185834527015686, + "learning_rate": 0.00016202874665051674, + "loss": 2.012, + "step": 2932 + }, + { + "epoch": 0.3085744345081536, + "grad_norm": 0.9046087861061096, + "learning_rate": 0.00016200201362788995, + "loss": 1.9285, + "step": 2933 + }, + { + "epoch": 0.30867964229352973, + "grad_norm": 1.357476830482483, + "learning_rate": 0.00016197527340517352, + "loss": 1.8586, + "step": 2934 + }, + { + "epoch": 0.3087848500789058, + "grad_norm": 0.9783453941345215, + "learning_rate": 0.00016194852598547263, + "loss": 2.1739, + "step": 2935 + }, + { + "epoch": 0.308890057864282, + "grad_norm": 1.2122621536254883, + "learning_rate": 0.00016192177137189345, + "loss": 1.7506, + "step": 2936 + }, + { + "epoch": 0.30899526564965807, + "grad_norm": 1.8045505285263062, + "learning_rate": 0.00016189500956754284, + "loss": 1.656, + "step": 2937 + }, + { + "epoch": 0.3091004734350342, + "grad_norm": 1.205743670463562, + "learning_rate": 0.00016186824057552856, + "loss": 1.9285, + "step": 2938 + }, + { + "epoch": 0.3092056812204103, + "grad_norm": 1.1010884046554565, + "learning_rate": 0.00016184146439895928, + "loss": 2.1431, + "step": 2939 + }, + { + "epoch": 0.3093108890057864, + "grad_norm": 0.7133980393409729, + "learning_rate": 0.00016181468104094435, + "loss": 2.1475, + "step": 2940 + }, + { + "epoch": 0.30941609679116255, + "grad_norm": 1.0357547998428345, + "learning_rate": 0.00016178789050459407, + "loss": 1.8142, + "step": 2941 + }, + { + "epoch": 0.30952130457653865, + "grad_norm": 1.1060856580734253, + "learning_rate": 0.0001617610927930195, + "loss": 1.4338, + "step": 2942 + }, + { + "epoch": 0.3096265123619148, + "grad_norm": 2.330672264099121, + "learning_rate": 0.00016173428790933265, + "loss": 1.5461, + "step": 2943 + }, + { + "epoch": 0.3097317201472909, + "grad_norm": 1.4529541730880737, + "learning_rate": 0.0001617074758566462, + "loss": 1.8743, + "step": 2944 + }, + { + "epoch": 0.30983692793266704, + "grad_norm": 0.984163224697113, + "learning_rate": 0.00016168065663807376, + "loss": 2.2996, + "step": 2945 + }, + { + "epoch": 0.30994213571804313, + "grad_norm": 1.3428571224212646, + "learning_rate": 0.00016165383025672981, + "loss": 2.0059, + "step": 2946 + }, + { + "epoch": 0.3100473435034192, + "grad_norm": 0.8479394316673279, + "learning_rate": 0.00016162699671572956, + "loss": 2.0439, + "step": 2947 + }, + { + "epoch": 0.3101525512887954, + "grad_norm": 0.9885584712028503, + "learning_rate": 0.0001616001560181891, + "loss": 2.2262, + "step": 2948 + }, + { + "epoch": 0.31025775907417147, + "grad_norm": 1.3757953643798828, + "learning_rate": 0.0001615733081672254, + "loss": 1.9734, + "step": 2949 + }, + { + "epoch": 0.3103629668595476, + "grad_norm": 1.0954573154449463, + "learning_rate": 0.00016154645316595616, + "loss": 1.7084, + "step": 2950 + }, + { + "epoch": 0.3104681746449237, + "grad_norm": 1.236785888671875, + "learning_rate": 0.00016151959101749996, + "loss": 1.4454, + "step": 2951 + }, + { + "epoch": 0.31057338243029986, + "grad_norm": 1.077486515045166, + "learning_rate": 0.00016149272172497626, + "loss": 1.6408, + "step": 2952 + }, + { + "epoch": 0.31067859021567595, + "grad_norm": 0.9998572468757629, + "learning_rate": 0.00016146584529150526, + "loss": 2.4455, + "step": 2953 + }, + { + "epoch": 0.3107837980010521, + "grad_norm": 1.189342737197876, + "learning_rate": 0.00016143896172020808, + "loss": 1.8845, + "step": 2954 + }, + { + "epoch": 0.3108890057864282, + "grad_norm": 1.0967530012130737, + "learning_rate": 0.00016141207101420655, + "loss": 2.104, + "step": 2955 + }, + { + "epoch": 0.3109942135718043, + "grad_norm": 1.4149389266967773, + "learning_rate": 0.00016138517317662346, + "loss": 1.6627, + "step": 2956 + }, + { + "epoch": 0.31109942135718044, + "grad_norm": 0.9122399687767029, + "learning_rate": 0.00016135826821058233, + "loss": 1.7756, + "step": 2957 + }, + { + "epoch": 0.31120462914255653, + "grad_norm": 1.4171504974365234, + "learning_rate": 0.00016133135611920757, + "loss": 1.9604, + "step": 2958 + }, + { + "epoch": 0.3113098369279327, + "grad_norm": 1.4796384572982788, + "learning_rate": 0.0001613044369056244, + "loss": 1.6938, + "step": 2959 + }, + { + "epoch": 0.3114150447133088, + "grad_norm": 1.3851723670959473, + "learning_rate": 0.0001612775105729588, + "loss": 1.7427, + "step": 2960 + }, + { + "epoch": 0.3115202524986849, + "grad_norm": 1.3645011186599731, + "learning_rate": 0.00016125057712433773, + "loss": 1.7735, + "step": 2961 + }, + { + "epoch": 0.311625460284061, + "grad_norm": 1.73271906375885, + "learning_rate": 0.00016122363656288882, + "loss": 2.1165, + "step": 2962 + }, + { + "epoch": 0.3117306680694371, + "grad_norm": 0.9199891686439514, + "learning_rate": 0.0001611966888917406, + "loss": 1.6816, + "step": 2963 + }, + { + "epoch": 0.31183587585481326, + "grad_norm": 1.2702879905700684, + "learning_rate": 0.00016116973411402238, + "loss": 2.1454, + "step": 2964 + }, + { + "epoch": 0.31194108364018935, + "grad_norm": 1.4072431325912476, + "learning_rate": 0.0001611427722328644, + "loss": 1.9006, + "step": 2965 + }, + { + "epoch": 0.3120462914255655, + "grad_norm": 0.7870166301727295, + "learning_rate": 0.0001611158032513976, + "loss": 1.8407, + "step": 2966 + }, + { + "epoch": 0.3121514992109416, + "grad_norm": 1.3497145175933838, + "learning_rate": 0.00016108882717275384, + "loss": 1.6683, + "step": 2967 + }, + { + "epoch": 0.31225670699631775, + "grad_norm": 1.090868592262268, + "learning_rate": 0.00016106184400006569, + "loss": 1.9826, + "step": 2968 + }, + { + "epoch": 0.31236191478169384, + "grad_norm": 1.025602102279663, + "learning_rate": 0.00016103485373646672, + "loss": 2.011, + "step": 2969 + }, + { + "epoch": 0.31246712256707, + "grad_norm": 1.3139764070510864, + "learning_rate": 0.00016100785638509114, + "loss": 2.148, + "step": 2970 + }, + { + "epoch": 0.3125723303524461, + "grad_norm": 1.5432015657424927, + "learning_rate": 0.00016098085194907413, + "loss": 1.8339, + "step": 2971 + }, + { + "epoch": 0.3126775381378222, + "grad_norm": 1.3246941566467285, + "learning_rate": 0.00016095384043155156, + "loss": 1.5852, + "step": 2972 + }, + { + "epoch": 0.3127827459231983, + "grad_norm": 1.229443907737732, + "learning_rate": 0.00016092682183566025, + "loss": 1.4615, + "step": 2973 + }, + { + "epoch": 0.3128879537085744, + "grad_norm": 1.3812941312789917, + "learning_rate": 0.0001608997961645377, + "loss": 2.0711, + "step": 2974 + }, + { + "epoch": 0.31299316149395057, + "grad_norm": 1.3320645093917847, + "learning_rate": 0.0001608727634213224, + "loss": 2.0261, + "step": 2975 + }, + { + "epoch": 0.31309836927932666, + "grad_norm": 1.8210208415985107, + "learning_rate": 0.00016084572360915348, + "loss": 1.314, + "step": 2976 + }, + { + "epoch": 0.3132035770647028, + "grad_norm": 0.9022918939590454, + "learning_rate": 0.00016081867673117106, + "loss": 1.8564, + "step": 2977 + }, + { + "epoch": 0.3133087848500789, + "grad_norm": 1.5104812383651733, + "learning_rate": 0.00016079162279051602, + "loss": 1.8883, + "step": 2978 + }, + { + "epoch": 0.313413992635455, + "grad_norm": 1.4188772439956665, + "learning_rate": 0.00016076456179032998, + "loss": 2.1038, + "step": 2979 + }, + { + "epoch": 0.31351920042083115, + "grad_norm": 1.3051005601882935, + "learning_rate": 0.00016073749373375545, + "loss": 2.2416, + "step": 2980 + }, + { + "epoch": 0.31362440820620724, + "grad_norm": 1.0661115646362305, + "learning_rate": 0.00016071041862393578, + "loss": 1.8527, + "step": 2981 + }, + { + "epoch": 0.3137296159915834, + "grad_norm": 1.1394139528274536, + "learning_rate": 0.00016068333646401516, + "loss": 1.468, + "step": 2982 + }, + { + "epoch": 0.3138348237769595, + "grad_norm": 1.1041537523269653, + "learning_rate": 0.00016065624725713847, + "loss": 1.4634, + "step": 2983 + }, + { + "epoch": 0.31394003156233563, + "grad_norm": 1.4715754985809326, + "learning_rate": 0.00016062915100645153, + "loss": 1.8815, + "step": 2984 + }, + { + "epoch": 0.3140452393477117, + "grad_norm": 1.0146689414978027, + "learning_rate": 0.0001606020477151009, + "loss": 1.9733, + "step": 2985 + }, + { + "epoch": 0.3141504471330879, + "grad_norm": 1.4208067655563354, + "learning_rate": 0.00016057493738623406, + "loss": 1.6583, + "step": 2986 + }, + { + "epoch": 0.31425565491846397, + "grad_norm": 0.9717585444450378, + "learning_rate": 0.0001605478200229992, + "loss": 1.8575, + "step": 2987 + }, + { + "epoch": 0.31436086270384006, + "grad_norm": 2.195236921310425, + "learning_rate": 0.0001605206956285454, + "loss": 1.4584, + "step": 2988 + }, + { + "epoch": 0.3144660704892162, + "grad_norm": 1.941888451576233, + "learning_rate": 0.00016049356420602247, + "loss": 1.9882, + "step": 2989 + }, + { + "epoch": 0.3145712782745923, + "grad_norm": 1.0348442792892456, + "learning_rate": 0.00016046642575858115, + "loss": 2.0475, + "step": 2990 + }, + { + "epoch": 0.31467648605996845, + "grad_norm": 1.9281662702560425, + "learning_rate": 0.00016043928028937292, + "loss": 2.1587, + "step": 2991 + }, + { + "epoch": 0.31478169384534455, + "grad_norm": 1.6395374536514282, + "learning_rate": 0.00016041212780155007, + "loss": 1.9435, + "step": 2992 + }, + { + "epoch": 0.3148869016307207, + "grad_norm": 1.1363707780838013, + "learning_rate": 0.0001603849682982658, + "loss": 1.6754, + "step": 2993 + }, + { + "epoch": 0.3149921094160968, + "grad_norm": 1.3249703645706177, + "learning_rate": 0.00016035780178267394, + "loss": 2.2371, + "step": 2994 + }, + { + "epoch": 0.3150973172014729, + "grad_norm": 1.0637754201889038, + "learning_rate": 0.00016033062825792935, + "loss": 2.1387, + "step": 2995 + }, + { + "epoch": 0.31520252498684903, + "grad_norm": 0.9320810437202454, + "learning_rate": 0.00016030344772718756, + "loss": 2.0091, + "step": 2996 + }, + { + "epoch": 0.3153077327722251, + "grad_norm": 1.375025987625122, + "learning_rate": 0.00016027626019360496, + "loss": 2.2975, + "step": 2997 + }, + { + "epoch": 0.3154129405576013, + "grad_norm": 1.532082200050354, + "learning_rate": 0.00016024906566033874, + "loss": 2.0983, + "step": 2998 + }, + { + "epoch": 0.31551814834297737, + "grad_norm": 0.982284426689148, + "learning_rate": 0.00016022186413054693, + "loss": 1.4862, + "step": 2999 + }, + { + "epoch": 0.3156233561283535, + "grad_norm": 1.5432162284851074, + "learning_rate": 0.00016019465560738834, + "loss": 2.0209, + "step": 3000 + }, + { + "epoch": 0.3157285639137296, + "grad_norm": 1.4942352771759033, + "learning_rate": 0.0001601674400940226, + "loss": 2.1858, + "step": 3001 + }, + { + "epoch": 0.31583377169910576, + "grad_norm": 1.9148730039596558, + "learning_rate": 0.0001601402175936102, + "loss": 1.2212, + "step": 3002 + }, + { + "epoch": 0.31593897948448185, + "grad_norm": 1.283167839050293, + "learning_rate": 0.00016011298810931232, + "loss": 1.617, + "step": 3003 + }, + { + "epoch": 0.31604418726985795, + "grad_norm": 1.2399694919586182, + "learning_rate": 0.00016008575164429113, + "loss": 1.8931, + "step": 3004 + }, + { + "epoch": 0.3161493950552341, + "grad_norm": 1.6097055673599243, + "learning_rate": 0.00016005850820170943, + "loss": 1.6942, + "step": 3005 + }, + { + "epoch": 0.3162546028406102, + "grad_norm": 0.8781225085258484, + "learning_rate": 0.00016003125778473097, + "loss": 1.8621, + "step": 3006 + }, + { + "epoch": 0.31635981062598634, + "grad_norm": 1.7202116250991821, + "learning_rate": 0.0001600040003965202, + "loss": 2.01, + "step": 3007 + }, + { + "epoch": 0.31646501841136243, + "grad_norm": 1.6193592548370361, + "learning_rate": 0.00015997673604024244, + "loss": 1.5022, + "step": 3008 + }, + { + "epoch": 0.3165702261967386, + "grad_norm": 1.6493405103683472, + "learning_rate": 0.00015994946471906382, + "loss": 1.9556, + "step": 3009 + }, + { + "epoch": 0.3166754339821147, + "grad_norm": 1.438834309577942, + "learning_rate": 0.0001599221864361513, + "loss": 2.3363, + "step": 3010 + }, + { + "epoch": 0.31678064176749077, + "grad_norm": 1.1828105449676514, + "learning_rate": 0.00015989490119467257, + "loss": 2.0282, + "step": 3011 + }, + { + "epoch": 0.3168858495528669, + "grad_norm": 1.8960435390472412, + "learning_rate": 0.00015986760899779618, + "loss": 1.9431, + "step": 3012 + }, + { + "epoch": 0.316991057338243, + "grad_norm": 1.4587379693984985, + "learning_rate": 0.0001598403098486915, + "loss": 1.9177, + "step": 3013 + }, + { + "epoch": 0.31709626512361916, + "grad_norm": 2.3861501216888428, + "learning_rate": 0.00015981300375052872, + "loss": 1.7569, + "step": 3014 + }, + { + "epoch": 0.31720147290899525, + "grad_norm": 1.6264148950576782, + "learning_rate": 0.00015978569070647876, + "loss": 1.835, + "step": 3015 + }, + { + "epoch": 0.3173066806943714, + "grad_norm": 1.7277299165725708, + "learning_rate": 0.0001597583707197134, + "loss": 2.2641, + "step": 3016 + }, + { + "epoch": 0.3174118884797475, + "grad_norm": 1.0360989570617676, + "learning_rate": 0.00015973104379340524, + "loss": 1.9667, + "step": 3017 + }, + { + "epoch": 0.31751709626512364, + "grad_norm": 0.7866742014884949, + "learning_rate": 0.00015970370993072762, + "loss": 1.7332, + "step": 3018 + }, + { + "epoch": 0.31762230405049974, + "grad_norm": 1.069047212600708, + "learning_rate": 0.0001596763691348548, + "loss": 2.3551, + "step": 3019 + }, + { + "epoch": 0.31772751183587583, + "grad_norm": 1.3254644870758057, + "learning_rate": 0.00015964902140896175, + "loss": 2.3056, + "step": 3020 + }, + { + "epoch": 0.317832719621252, + "grad_norm": 0.88875812292099, + "learning_rate": 0.00015962166675622424, + "loss": 2.1039, + "step": 3021 + }, + { + "epoch": 0.3179379274066281, + "grad_norm": 1.263662338256836, + "learning_rate": 0.0001595943051798189, + "loss": 2.3112, + "step": 3022 + }, + { + "epoch": 0.3180431351920042, + "grad_norm": 0.8133673667907715, + "learning_rate": 0.00015956693668292313, + "loss": 2.2302, + "step": 3023 + }, + { + "epoch": 0.3181483429773803, + "grad_norm": 3.3079943656921387, + "learning_rate": 0.00015953956126871517, + "loss": 2.2323, + "step": 3024 + }, + { + "epoch": 0.31825355076275647, + "grad_norm": 1.0143603086471558, + "learning_rate": 0.00015951217894037402, + "loss": 2.016, + "step": 3025 + }, + { + "epoch": 0.31835875854813256, + "grad_norm": 1.2601126432418823, + "learning_rate": 0.0001594847897010795, + "loss": 2.0657, + "step": 3026 + }, + { + "epoch": 0.31846396633350865, + "grad_norm": 0.7915671467781067, + "learning_rate": 0.00015945739355401222, + "loss": 1.8548, + "step": 3027 + }, + { + "epoch": 0.3185691741188848, + "grad_norm": 1.0004899501800537, + "learning_rate": 0.0001594299905023536, + "loss": 2.4322, + "step": 3028 + }, + { + "epoch": 0.3186743819042609, + "grad_norm": 0.8855611681938171, + "learning_rate": 0.0001594025805492859, + "loss": 2.1558, + "step": 3029 + }, + { + "epoch": 0.31877958968963704, + "grad_norm": 1.2269375324249268, + "learning_rate": 0.00015937516369799216, + "loss": 2.2797, + "step": 3030 + }, + { + "epoch": 0.31888479747501314, + "grad_norm": 1.0635052919387817, + "learning_rate": 0.00015934773995165613, + "loss": 2.2713, + "step": 3031 + }, + { + "epoch": 0.3189900052603893, + "grad_norm": 1.1982238292694092, + "learning_rate": 0.0001593203093134625, + "loss": 1.9763, + "step": 3032 + }, + { + "epoch": 0.3190952130457654, + "grad_norm": 1.1229660511016846, + "learning_rate": 0.0001592928717865967, + "loss": 2.0421, + "step": 3033 + }, + { + "epoch": 0.31920042083114153, + "grad_norm": 0.9558283686637878, + "learning_rate": 0.00015926542737424492, + "loss": 2.0596, + "step": 3034 + }, + { + "epoch": 0.3193056286165176, + "grad_norm": 1.1449071168899536, + "learning_rate": 0.00015923797607959422, + "loss": 2.0033, + "step": 3035 + }, + { + "epoch": 0.3194108364018937, + "grad_norm": 2.168914794921875, + "learning_rate": 0.00015921051790583247, + "loss": 1.6018, + "step": 3036 + }, + { + "epoch": 0.31951604418726987, + "grad_norm": 0.9856159090995789, + "learning_rate": 0.00015918305285614822, + "loss": 1.8371, + "step": 3037 + }, + { + "epoch": 0.31962125197264596, + "grad_norm": 1.085970163345337, + "learning_rate": 0.0001591555809337309, + "loss": 2.1444, + "step": 3038 + }, + { + "epoch": 0.3197264597580221, + "grad_norm": 1.2033791542053223, + "learning_rate": 0.0001591281021417708, + "loss": 1.6321, + "step": 3039 + }, + { + "epoch": 0.3198316675433982, + "grad_norm": 1.5555825233459473, + "learning_rate": 0.0001591006164834589, + "loss": 1.6976, + "step": 3040 + }, + { + "epoch": 0.31993687532877435, + "grad_norm": 1.6176249980926514, + "learning_rate": 0.00015907312396198697, + "loss": 2.028, + "step": 3041 + }, + { + "epoch": 0.32004208311415044, + "grad_norm": 1.1690542697906494, + "learning_rate": 0.00015904562458054773, + "loss": 1.9713, + "step": 3042 + }, + { + "epoch": 0.32014729089952654, + "grad_norm": 1.2883795499801636, + "learning_rate": 0.00015901811834233452, + "loss": 2.1284, + "step": 3043 + }, + { + "epoch": 0.3202524986849027, + "grad_norm": 1.3291345834732056, + "learning_rate": 0.00015899060525054157, + "loss": 1.5719, + "step": 3044 + }, + { + "epoch": 0.3203577064702788, + "grad_norm": 1.5232844352722168, + "learning_rate": 0.0001589630853083639, + "loss": 1.8812, + "step": 3045 + }, + { + "epoch": 0.32046291425565493, + "grad_norm": 0.951919436454773, + "learning_rate": 0.0001589355585189973, + "loss": 2.0791, + "step": 3046 + }, + { + "epoch": 0.320568122041031, + "grad_norm": 1.3650517463684082, + "learning_rate": 0.0001589080248856383, + "loss": 1.834, + "step": 3047 + }, + { + "epoch": 0.32067332982640717, + "grad_norm": 1.4583851099014282, + "learning_rate": 0.00015888048441148442, + "loss": 2.0812, + "step": 3048 + }, + { + "epoch": 0.32077853761178327, + "grad_norm": 1.3375087976455688, + "learning_rate": 0.00015885293709973374, + "loss": 1.7964, + "step": 3049 + }, + { + "epoch": 0.3208837453971594, + "grad_norm": 1.1333143711090088, + "learning_rate": 0.0001588253829535853, + "loss": 2.2698, + "step": 3050 + }, + { + "epoch": 0.3209889531825355, + "grad_norm": 0.9990382790565491, + "learning_rate": 0.0001587978219762388, + "loss": 1.4186, + "step": 3051 + }, + { + "epoch": 0.3210941609679116, + "grad_norm": 1.230421543121338, + "learning_rate": 0.0001587702541708949, + "loss": 1.8636, + "step": 3052 + }, + { + "epoch": 0.32119936875328775, + "grad_norm": 1.4861817359924316, + "learning_rate": 0.00015874267954075485, + "loss": 2.0363, + "step": 3053 + }, + { + "epoch": 0.32130457653866384, + "grad_norm": 1.4042917490005493, + "learning_rate": 0.0001587150980890209, + "loss": 1.6804, + "step": 3054 + }, + { + "epoch": 0.32140978432404, + "grad_norm": 1.5827716588974, + "learning_rate": 0.00015868750981889594, + "loss": 1.7993, + "step": 3055 + }, + { + "epoch": 0.3215149921094161, + "grad_norm": 1.274047613143921, + "learning_rate": 0.00015865991473358373, + "loss": 1.8368, + "step": 3056 + }, + { + "epoch": 0.32162019989479224, + "grad_norm": 1.2829877138137817, + "learning_rate": 0.00015863231283628877, + "loss": 1.9838, + "step": 3057 + }, + { + "epoch": 0.32172540768016833, + "grad_norm": 0.8687334060668945, + "learning_rate": 0.00015860470413021642, + "loss": 1.5084, + "step": 3058 + }, + { + "epoch": 0.3218306154655444, + "grad_norm": 1.9248266220092773, + "learning_rate": 0.00015857708861857274, + "loss": 1.6113, + "step": 3059 + }, + { + "epoch": 0.32193582325092057, + "grad_norm": 1.0948201417922974, + "learning_rate": 0.00015854946630456467, + "loss": 2.1865, + "step": 3060 + }, + { + "epoch": 0.32204103103629667, + "grad_norm": 1.5295681953430176, + "learning_rate": 0.00015852183719139985, + "loss": 2.0163, + "step": 3061 + }, + { + "epoch": 0.3221462388216728, + "grad_norm": 2.258373737335205, + "learning_rate": 0.00015849420128228678, + "loss": 2.2256, + "step": 3062 + }, + { + "epoch": 0.3222514466070489, + "grad_norm": 1.5048192739486694, + "learning_rate": 0.00015846655858043477, + "loss": 1.7284, + "step": 3063 + }, + { + "epoch": 0.32235665439242506, + "grad_norm": 1.728192925453186, + "learning_rate": 0.0001584389090890538, + "loss": 2.0966, + "step": 3064 + }, + { + "epoch": 0.32246186217780115, + "grad_norm": 1.150688886642456, + "learning_rate": 0.00015841125281135473, + "loss": 1.9364, + "step": 3065 + }, + { + "epoch": 0.3225670699631773, + "grad_norm": 1.404353141784668, + "learning_rate": 0.0001583835897505493, + "loss": 1.6709, + "step": 3066 + }, + { + "epoch": 0.3226722777485534, + "grad_norm": 1.3692048788070679, + "learning_rate": 0.00015835591990984974, + "loss": 1.8023, + "step": 3067 + }, + { + "epoch": 0.3227774855339295, + "grad_norm": 1.486565351486206, + "learning_rate": 0.00015832824329246946, + "loss": 1.8788, + "step": 3068 + }, + { + "epoch": 0.32288269331930564, + "grad_norm": 0.9628434777259827, + "learning_rate": 0.0001583005599016223, + "loss": 2.3548, + "step": 3069 + }, + { + "epoch": 0.32298790110468173, + "grad_norm": 0.9888852834701538, + "learning_rate": 0.0001582728697405231, + "loss": 2.1615, + "step": 3070 + }, + { + "epoch": 0.3230931088900579, + "grad_norm": 1.8097474575042725, + "learning_rate": 0.00015824517281238745, + "loss": 1.4427, + "step": 3071 + }, + { + "epoch": 0.32319831667543397, + "grad_norm": 1.5706349611282349, + "learning_rate": 0.00015821746912043165, + "loss": 1.9168, + "step": 3072 + }, + { + "epoch": 0.3233035244608101, + "grad_norm": 3.1145801544189453, + "learning_rate": 0.0001581897586678729, + "loss": 1.4371, + "step": 3073 + }, + { + "epoch": 0.3234087322461862, + "grad_norm": 1.5123764276504517, + "learning_rate": 0.00015816204145792904, + "loss": 1.9067, + "step": 3074 + }, + { + "epoch": 0.3235139400315623, + "grad_norm": 0.8363044261932373, + "learning_rate": 0.00015813431749381887, + "loss": 2.0816, + "step": 3075 + }, + { + "epoch": 0.32361914781693846, + "grad_norm": 1.4123753309249878, + "learning_rate": 0.00015810658677876184, + "loss": 1.9094, + "step": 3076 + }, + { + "epoch": 0.32372435560231455, + "grad_norm": 1.268897533416748, + "learning_rate": 0.0001580788493159782, + "loss": 1.6229, + "step": 3077 + }, + { + "epoch": 0.3238295633876907, + "grad_norm": 1.2455761432647705, + "learning_rate": 0.00015805110510868907, + "loss": 1.9262, + "step": 3078 + }, + { + "epoch": 0.3239347711730668, + "grad_norm": 1.2342606782913208, + "learning_rate": 0.00015802335416011625, + "loss": 1.8135, + "step": 3079 + }, + { + "epoch": 0.32403997895844294, + "grad_norm": 1.7797157764434814, + "learning_rate": 0.00015799559647348236, + "loss": 1.5395, + "step": 3080 + }, + { + "epoch": 0.32414518674381904, + "grad_norm": 1.31904935836792, + "learning_rate": 0.00015796783205201086, + "loss": 1.5797, + "step": 3081 + }, + { + "epoch": 0.3242503945291952, + "grad_norm": 1.5645530223846436, + "learning_rate": 0.00015794006089892587, + "loss": 1.6131, + "step": 3082 + }, + { + "epoch": 0.3243556023145713, + "grad_norm": 1.483992099761963, + "learning_rate": 0.00015791228301745245, + "loss": 2.2207, + "step": 3083 + }, + { + "epoch": 0.32446081009994737, + "grad_norm": 1.0606383085250854, + "learning_rate": 0.00015788449841081626, + "loss": 1.5808, + "step": 3084 + }, + { + "epoch": 0.3245660178853235, + "grad_norm": 1.3656562566757202, + "learning_rate": 0.00015785670708224389, + "loss": 2.1471, + "step": 3085 + }, + { + "epoch": 0.3246712256706996, + "grad_norm": 1.2391986846923828, + "learning_rate": 0.00015782890903496264, + "loss": 2.1729, + "step": 3086 + }, + { + "epoch": 0.32477643345607576, + "grad_norm": 1.3538752794265747, + "learning_rate": 0.0001578011042722006, + "loss": 2.518, + "step": 3087 + }, + { + "epoch": 0.32488164124145186, + "grad_norm": 1.5246939659118652, + "learning_rate": 0.0001577732927971867, + "loss": 2.2928, + "step": 3088 + }, + { + "epoch": 0.324986849026828, + "grad_norm": 1.872592806816101, + "learning_rate": 0.0001577454746131505, + "loss": 2.0892, + "step": 3089 + }, + { + "epoch": 0.3250920568122041, + "grad_norm": 1.573846697807312, + "learning_rate": 0.00015771764972332254, + "loss": 2.2432, + "step": 3090 + }, + { + "epoch": 0.3251972645975802, + "grad_norm": 0.8727262020111084, + "learning_rate": 0.00015768981813093393, + "loss": 1.7948, + "step": 3091 + }, + { + "epoch": 0.32530247238295634, + "grad_norm": 1.4603215456008911, + "learning_rate": 0.00015766197983921673, + "loss": 2.5792, + "step": 3092 + }, + { + "epoch": 0.32540768016833244, + "grad_norm": 1.198878526687622, + "learning_rate": 0.00015763413485140365, + "loss": 2.1937, + "step": 3093 + }, + { + "epoch": 0.3255128879537086, + "grad_norm": 1.553514003753662, + "learning_rate": 0.00015760628317072834, + "loss": 2.3651, + "step": 3094 + }, + { + "epoch": 0.3256180957390847, + "grad_norm": 1.511500358581543, + "learning_rate": 0.00015757842480042502, + "loss": 2.1293, + "step": 3095 + }, + { + "epoch": 0.3257233035244608, + "grad_norm": 1.5197930335998535, + "learning_rate": 0.00015755055974372883, + "loss": 2.1377, + "step": 3096 + }, + { + "epoch": 0.3258285113098369, + "grad_norm": 1.308167815208435, + "learning_rate": 0.00015752268800387563, + "loss": 1.8966, + "step": 3097 + }, + { + "epoch": 0.32593371909521307, + "grad_norm": 0.7427685260772705, + "learning_rate": 0.0001574948095841021, + "loss": 2.4438, + "step": 3098 + }, + { + "epoch": 0.32603892688058916, + "grad_norm": 1.212828278541565, + "learning_rate": 0.00015746692448764568, + "loss": 1.8894, + "step": 3099 + }, + { + "epoch": 0.32614413466596526, + "grad_norm": 1.2902162075042725, + "learning_rate": 0.00015743903271774455, + "loss": 1.7276, + "step": 3100 + }, + { + "epoch": 0.3262493424513414, + "grad_norm": 1.072805404663086, + "learning_rate": 0.0001574111342776377, + "loss": 1.969, + "step": 3101 + }, + { + "epoch": 0.3263545502367175, + "grad_norm": 3.502418279647827, + "learning_rate": 0.00015738322917056486, + "loss": 2.6105, + "step": 3102 + }, + { + "epoch": 0.32645975802209365, + "grad_norm": 1.2981481552124023, + "learning_rate": 0.00015735531739976657, + "loss": 2.0216, + "step": 3103 + }, + { + "epoch": 0.32656496580746974, + "grad_norm": 0.8937673568725586, + "learning_rate": 0.00015732739896848414, + "loss": 1.7309, + "step": 3104 + }, + { + "epoch": 0.3266701735928459, + "grad_norm": 1.0238783359527588, + "learning_rate": 0.00015729947387995962, + "loss": 1.7886, + "step": 3105 + }, + { + "epoch": 0.326775381378222, + "grad_norm": 1.0271786451339722, + "learning_rate": 0.00015727154213743592, + "loss": 1.7114, + "step": 3106 + }, + { + "epoch": 0.3268805891635981, + "grad_norm": 1.1980253458023071, + "learning_rate": 0.0001572436037441566, + "loss": 2.0706, + "step": 3107 + }, + { + "epoch": 0.32698579694897423, + "grad_norm": 1.797404408454895, + "learning_rate": 0.0001572156587033661, + "loss": 2.1992, + "step": 3108 + }, + { + "epoch": 0.3270910047343503, + "grad_norm": 1.8935017585754395, + "learning_rate": 0.00015718770701830955, + "loss": 2.225, + "step": 3109 + }, + { + "epoch": 0.32719621251972647, + "grad_norm": 1.513788104057312, + "learning_rate": 0.0001571597486922329, + "loss": 2.0227, + "step": 3110 + }, + { + "epoch": 0.32730142030510256, + "grad_norm": 0.831935703754425, + "learning_rate": 0.00015713178372838286, + "loss": 1.5778, + "step": 3111 + }, + { + "epoch": 0.3274066280904787, + "grad_norm": 1.2550475597381592, + "learning_rate": 0.0001571038121300069, + "loss": 1.7091, + "step": 3112 + }, + { + "epoch": 0.3275118358758548, + "grad_norm": 1.2048956155776978, + "learning_rate": 0.00015707583390035327, + "loss": 2.2076, + "step": 3113 + }, + { + "epoch": 0.32761704366123096, + "grad_norm": 1.4993027448654175, + "learning_rate": 0.00015704784904267097, + "loss": 2.2555, + "step": 3114 + }, + { + "epoch": 0.32772225144660705, + "grad_norm": 1.5597890615463257, + "learning_rate": 0.00015701985756020985, + "loss": 1.8569, + "step": 3115 + }, + { + "epoch": 0.32782745923198314, + "grad_norm": 1.346541166305542, + "learning_rate": 0.00015699185945622043, + "loss": 2.031, + "step": 3116 + }, + { + "epoch": 0.3279326670173593, + "grad_norm": 1.1803152561187744, + "learning_rate": 0.00015696385473395403, + "loss": 1.9566, + "step": 3117 + }, + { + "epoch": 0.3280378748027354, + "grad_norm": 1.5781463384628296, + "learning_rate": 0.00015693584339666279, + "loss": 2.0671, + "step": 3118 + }, + { + "epoch": 0.32814308258811153, + "grad_norm": 1.163499355316162, + "learning_rate": 0.0001569078254475995, + "loss": 2.3857, + "step": 3119 + }, + { + "epoch": 0.32824829037348763, + "grad_norm": 1.9654390811920166, + "learning_rate": 0.00015687980089001787, + "loss": 2.0257, + "step": 3120 + }, + { + "epoch": 0.3283534981588638, + "grad_norm": 1.0074301958084106, + "learning_rate": 0.00015685176972717223, + "loss": 2.3659, + "step": 3121 + }, + { + "epoch": 0.32845870594423987, + "grad_norm": 1.2155638933181763, + "learning_rate": 0.00015682373196231782, + "loss": 2.0024, + "step": 3122 + }, + { + "epoch": 0.32856391372961596, + "grad_norm": 1.3712031841278076, + "learning_rate": 0.0001567956875987105, + "loss": 1.7298, + "step": 3123 + }, + { + "epoch": 0.3286691215149921, + "grad_norm": 1.682850956916809, + "learning_rate": 0.000156767636639607, + "loss": 2.3542, + "step": 3124 + }, + { + "epoch": 0.3287743293003682, + "grad_norm": 1.4702867269515991, + "learning_rate": 0.00015673957908826479, + "loss": 2.0367, + "step": 3125 + }, + { + "epoch": 0.32887953708574436, + "grad_norm": 1.9041625261306763, + "learning_rate": 0.00015671151494794211, + "loss": 2.015, + "step": 3126 + }, + { + "epoch": 0.32898474487112045, + "grad_norm": 1.1154800653457642, + "learning_rate": 0.00015668344422189794, + "loss": 1.9378, + "step": 3127 + }, + { + "epoch": 0.3290899526564966, + "grad_norm": 1.1576582193374634, + "learning_rate": 0.00015665536691339207, + "loss": 2.1949, + "step": 3128 + }, + { + "epoch": 0.3291951604418727, + "grad_norm": 0.9152946472167969, + "learning_rate": 0.00015662728302568498, + "loss": 1.8082, + "step": 3129 + }, + { + "epoch": 0.32930036822724884, + "grad_norm": 1.0784518718719482, + "learning_rate": 0.00015659919256203795, + "loss": 2.1528, + "step": 3130 + }, + { + "epoch": 0.32940557601262493, + "grad_norm": 1.0181763172149658, + "learning_rate": 0.00015657109552571312, + "loss": 2.1109, + "step": 3131 + }, + { + "epoch": 0.32951078379800103, + "grad_norm": 1.47239351272583, + "learning_rate": 0.00015654299191997324, + "loss": 2.1156, + "step": 3132 + }, + { + "epoch": 0.3296159915833772, + "grad_norm": 1.092498540878296, + "learning_rate": 0.0001565148817480819, + "loss": 1.7449, + "step": 3133 + }, + { + "epoch": 0.32972119936875327, + "grad_norm": 0.9805752635002136, + "learning_rate": 0.00015648676501330342, + "loss": 1.804, + "step": 3134 + }, + { + "epoch": 0.3298264071541294, + "grad_norm": 1.1172640323638916, + "learning_rate": 0.00015645864171890295, + "loss": 1.6379, + "step": 3135 + }, + { + "epoch": 0.3299316149395055, + "grad_norm": 1.313032627105713, + "learning_rate": 0.0001564305118681463, + "loss": 2.2116, + "step": 3136 + }, + { + "epoch": 0.33003682272488166, + "grad_norm": 1.0228915214538574, + "learning_rate": 0.00015640237546430018, + "loss": 2.1348, + "step": 3137 + }, + { + "epoch": 0.33014203051025776, + "grad_norm": 1.2504109144210815, + "learning_rate": 0.00015637423251063185, + "loss": 1.7011, + "step": 3138 + }, + { + "epoch": 0.33024723829563385, + "grad_norm": 1.2419320344924927, + "learning_rate": 0.00015634608301040958, + "loss": 1.5222, + "step": 3139 + }, + { + "epoch": 0.33035244608101, + "grad_norm": 1.6612623929977417, + "learning_rate": 0.00015631792696690225, + "loss": 1.9523, + "step": 3140 + }, + { + "epoch": 0.3304576538663861, + "grad_norm": 1.404956340789795, + "learning_rate": 0.00015628976438337948, + "loss": 1.6778, + "step": 3141 + }, + { + "epoch": 0.33056286165176224, + "grad_norm": 2.2217705249786377, + "learning_rate": 0.00015626159526311174, + "loss": 1.7111, + "step": 3142 + }, + { + "epoch": 0.33066806943713833, + "grad_norm": 1.682693362236023, + "learning_rate": 0.0001562334196093702, + "loss": 1.9422, + "step": 3143 + }, + { + "epoch": 0.3307732772225145, + "grad_norm": 2.735732078552246, + "learning_rate": 0.00015620523742542687, + "loss": 1.8769, + "step": 3144 + }, + { + "epoch": 0.3308784850078906, + "grad_norm": 1.4678375720977783, + "learning_rate": 0.00015617704871455433, + "loss": 2.3066, + "step": 3145 + }, + { + "epoch": 0.3309836927932667, + "grad_norm": 1.5375454425811768, + "learning_rate": 0.00015614885348002612, + "loss": 1.7219, + "step": 3146 + }, + { + "epoch": 0.3310889005786428, + "grad_norm": 1.09225332736969, + "learning_rate": 0.00015612065172511646, + "loss": 1.9515, + "step": 3147 + }, + { + "epoch": 0.3311941083640189, + "grad_norm": 1.3868629932403564, + "learning_rate": 0.0001560924434531003, + "loss": 2.0467, + "step": 3148 + }, + { + "epoch": 0.33129931614939506, + "grad_norm": 1.3525406122207642, + "learning_rate": 0.00015606422866725343, + "loss": 1.7597, + "step": 3149 + }, + { + "epoch": 0.33140452393477116, + "grad_norm": 1.2624315023422241, + "learning_rate": 0.00015603600737085227, + "loss": 1.9116, + "step": 3150 + }, + { + "epoch": 0.3315097317201473, + "grad_norm": 1.287229299545288, + "learning_rate": 0.00015600777956717408, + "loss": 1.8875, + "step": 3151 + }, + { + "epoch": 0.3316149395055234, + "grad_norm": 1.0664699077606201, + "learning_rate": 0.0001559795452594969, + "loss": 1.8313, + "step": 3152 + }, + { + "epoch": 0.33172014729089955, + "grad_norm": 1.5198394060134888, + "learning_rate": 0.00015595130445109946, + "loss": 1.6447, + "step": 3153 + }, + { + "epoch": 0.33182535507627564, + "grad_norm": 1.22080659866333, + "learning_rate": 0.0001559230571452613, + "loss": 2.5198, + "step": 3154 + }, + { + "epoch": 0.33193056286165173, + "grad_norm": 1.2474714517593384, + "learning_rate": 0.00015589480334526266, + "loss": 1.736, + "step": 3155 + }, + { + "epoch": 0.3320357706470279, + "grad_norm": 1.2309380769729614, + "learning_rate": 0.00015586654305438456, + "loss": 1.5404, + "step": 3156 + }, + { + "epoch": 0.332140978432404, + "grad_norm": 1.3968786001205444, + "learning_rate": 0.00015583827627590875, + "loss": 1.7834, + "step": 3157 + }, + { + "epoch": 0.3322461862177801, + "grad_norm": 1.3345340490341187, + "learning_rate": 0.00015581000301311782, + "loss": 1.7573, + "step": 3158 + }, + { + "epoch": 0.3323513940031562, + "grad_norm": 1.5625921487808228, + "learning_rate": 0.00015578172326929498, + "loss": 1.8578, + "step": 3159 + }, + { + "epoch": 0.33245660178853237, + "grad_norm": 0.9418458342552185, + "learning_rate": 0.0001557534370477243, + "loss": 2.0889, + "step": 3160 + }, + { + "epoch": 0.33256180957390846, + "grad_norm": 1.5350128412246704, + "learning_rate": 0.00015572514435169063, + "loss": 1.1002, + "step": 3161 + }, + { + "epoch": 0.3326670173592846, + "grad_norm": 1.865135908126831, + "learning_rate": 0.0001556968451844794, + "loss": 1.707, + "step": 3162 + }, + { + "epoch": 0.3327722251446607, + "grad_norm": 1.427790641784668, + "learning_rate": 0.00015566853954937694, + "loss": 1.8425, + "step": 3163 + }, + { + "epoch": 0.3328774329300368, + "grad_norm": 1.1270205974578857, + "learning_rate": 0.0001556402274496703, + "loss": 1.5166, + "step": 3164 + }, + { + "epoch": 0.33298264071541295, + "grad_norm": 1.270928978919983, + "learning_rate": 0.0001556119088886473, + "loss": 1.4399, + "step": 3165 + }, + { + "epoch": 0.33308784850078904, + "grad_norm": 1.3536688089370728, + "learning_rate": 0.0001555835838695964, + "loss": 1.5239, + "step": 3166 + }, + { + "epoch": 0.3331930562861652, + "grad_norm": 1.0648913383483887, + "learning_rate": 0.00015555525239580698, + "loss": 2.1152, + "step": 3167 + }, + { + "epoch": 0.3332982640715413, + "grad_norm": 1.2379913330078125, + "learning_rate": 0.00015552691447056903, + "loss": 2.2395, + "step": 3168 + }, + { + "epoch": 0.33340347185691743, + "grad_norm": 1.243102788925171, + "learning_rate": 0.0001554985700971733, + "loss": 1.8046, + "step": 3169 + }, + { + "epoch": 0.3335086796422935, + "grad_norm": 1.4841232299804688, + "learning_rate": 0.00015547021927891144, + "loss": 1.4817, + "step": 3170 + }, + { + "epoch": 0.3336138874276697, + "grad_norm": 1.3383890390396118, + "learning_rate": 0.00015544186201907562, + "loss": 1.7118, + "step": 3171 + }, + { + "epoch": 0.33371909521304577, + "grad_norm": 1.7296276092529297, + "learning_rate": 0.00015541349832095896, + "loss": 1.689, + "step": 3172 + }, + { + "epoch": 0.33382430299842186, + "grad_norm": 1.2010234594345093, + "learning_rate": 0.0001553851281878552, + "loss": 2.0383, + "step": 3173 + }, + { + "epoch": 0.333929510783798, + "grad_norm": 1.05833899974823, + "learning_rate": 0.00015535675162305887, + "loss": 1.6341, + "step": 3174 + }, + { + "epoch": 0.3340347185691741, + "grad_norm": 1.8304420709609985, + "learning_rate": 0.0001553283686298653, + "loss": 2.2299, + "step": 3175 + }, + { + "epoch": 0.33413992635455025, + "grad_norm": 1.1802849769592285, + "learning_rate": 0.00015529997921157044, + "loss": 1.7604, + "step": 3176 + }, + { + "epoch": 0.33424513413992635, + "grad_norm": 1.0604053735733032, + "learning_rate": 0.00015527158337147112, + "loss": 2.3878, + "step": 3177 + }, + { + "epoch": 0.3343503419253025, + "grad_norm": 1.17803955078125, + "learning_rate": 0.0001552431811128648, + "loss": 2.2497, + "step": 3178 + }, + { + "epoch": 0.3344555497106786, + "grad_norm": 1.691798210144043, + "learning_rate": 0.0001552147724390498, + "loss": 2.2167, + "step": 3179 + }, + { + "epoch": 0.3345607574960547, + "grad_norm": 1.0652263164520264, + "learning_rate": 0.00015518635735332507, + "loss": 1.471, + "step": 3180 + }, + { + "epoch": 0.33466596528143083, + "grad_norm": 1.2501590251922607, + "learning_rate": 0.00015515793585899038, + "loss": 1.8941, + "step": 3181 + }, + { + "epoch": 0.3347711730668069, + "grad_norm": 0.7410596013069153, + "learning_rate": 0.00015512950795934627, + "loss": 1.897, + "step": 3182 + }, + { + "epoch": 0.3348763808521831, + "grad_norm": 1.589612603187561, + "learning_rate": 0.0001551010736576939, + "loss": 2.0134, + "step": 3183 + }, + { + "epoch": 0.33498158863755917, + "grad_norm": 1.5273739099502563, + "learning_rate": 0.00015507263295733528, + "loss": 2.0521, + "step": 3184 + }, + { + "epoch": 0.3350867964229353, + "grad_norm": 1.4499183893203735, + "learning_rate": 0.00015504418586157316, + "loss": 2.2479, + "step": 3185 + }, + { + "epoch": 0.3351920042083114, + "grad_norm": 1.040443778038025, + "learning_rate": 0.000155015732373711, + "loss": 2.3157, + "step": 3186 + }, + { + "epoch": 0.33529721199368756, + "grad_norm": 1.6593208312988281, + "learning_rate": 0.000154987272497053, + "loss": 2.2596, + "step": 3187 + }, + { + "epoch": 0.33540241977906365, + "grad_norm": 2.389070510864258, + "learning_rate": 0.0001549588062349041, + "loss": 1.6402, + "step": 3188 + }, + { + "epoch": 0.33550762756443975, + "grad_norm": 1.5094181299209595, + "learning_rate": 0.00015493033359057003, + "loss": 1.8529, + "step": 3189 + }, + { + "epoch": 0.3356128353498159, + "grad_norm": 0.9902091026306152, + "learning_rate": 0.00015490185456735719, + "loss": 2.0056, + "step": 3190 + }, + { + "epoch": 0.335718043135192, + "grad_norm": 1.285557508468628, + "learning_rate": 0.00015487336916857278, + "loss": 2.1874, + "step": 3191 + }, + { + "epoch": 0.33582325092056814, + "grad_norm": 1.2860695123672485, + "learning_rate": 0.00015484487739752468, + "loss": 2.1247, + "step": 3192 + }, + { + "epoch": 0.33592845870594423, + "grad_norm": 1.0790120363235474, + "learning_rate": 0.00015481637925752155, + "loss": 2.0198, + "step": 3193 + }, + { + "epoch": 0.3360336664913204, + "grad_norm": 1.1655452251434326, + "learning_rate": 0.00015478787475187283, + "loss": 1.522, + "step": 3194 + }, + { + "epoch": 0.3361388742766965, + "grad_norm": 1.2023252248764038, + "learning_rate": 0.00015475936388388862, + "loss": 1.7807, + "step": 3195 + }, + { + "epoch": 0.33624408206207257, + "grad_norm": 1.0464221239089966, + "learning_rate": 0.00015473084665687984, + "loss": 1.6904, + "step": 3196 + }, + { + "epoch": 0.3363492898474487, + "grad_norm": 0.9979501366615295, + "learning_rate": 0.00015470232307415803, + "loss": 1.8852, + "step": 3197 + }, + { + "epoch": 0.3364544976328248, + "grad_norm": 1.2959654331207275, + "learning_rate": 0.00015467379313903557, + "loss": 2.162, + "step": 3198 + }, + { + "epoch": 0.33655970541820096, + "grad_norm": 1.7765225172042847, + "learning_rate": 0.00015464525685482557, + "loss": 1.5029, + "step": 3199 + }, + { + "epoch": 0.33666491320357705, + "grad_norm": 1.5161840915679932, + "learning_rate": 0.00015461671422484178, + "loss": 1.7194, + "step": 3200 + }, + { + "epoch": 0.3367701209889532, + "grad_norm": 1.1318210363388062, + "learning_rate": 0.00015458816525239886, + "loss": 1.8918, + "step": 3201 + }, + { + "epoch": 0.3368753287743293, + "grad_norm": 1.2241750955581665, + "learning_rate": 0.00015455960994081205, + "loss": 1.732, + "step": 3202 + }, + { + "epoch": 0.33698053655970545, + "grad_norm": 1.930464267730713, + "learning_rate": 0.0001545310482933974, + "loss": 1.8916, + "step": 3203 + }, + { + "epoch": 0.33708574434508154, + "grad_norm": 1.4464246034622192, + "learning_rate": 0.0001545024803134717, + "loss": 1.728, + "step": 3204 + }, + { + "epoch": 0.33719095213045763, + "grad_norm": 1.4596894979476929, + "learning_rate": 0.00015447390600435238, + "loss": 1.5601, + "step": 3205 + }, + { + "epoch": 0.3372961599158338, + "grad_norm": 1.1431268453598022, + "learning_rate": 0.00015444532536935777, + "loss": 2.2355, + "step": 3206 + }, + { + "epoch": 0.3374013677012099, + "grad_norm": 1.3519384860992432, + "learning_rate": 0.0001544167384118068, + "loss": 1.5459, + "step": 3207 + }, + { + "epoch": 0.337506575486586, + "grad_norm": 1.1788008213043213, + "learning_rate": 0.00015438814513501922, + "loss": 2.1492, + "step": 3208 + }, + { + "epoch": 0.3376117832719621, + "grad_norm": 1.527076244354248, + "learning_rate": 0.00015435954554231541, + "loss": 1.882, + "step": 3209 + }, + { + "epoch": 0.33771699105733827, + "grad_norm": 1.9754518270492554, + "learning_rate": 0.0001543309396370166, + "loss": 1.8531, + "step": 3210 + }, + { + "epoch": 0.33782219884271436, + "grad_norm": 1.0372203588485718, + "learning_rate": 0.00015430232742244467, + "loss": 2.1218, + "step": 3211 + }, + { + "epoch": 0.33792740662809045, + "grad_norm": 1.702439785003662, + "learning_rate": 0.00015427370890192224, + "loss": 1.7968, + "step": 3212 + }, + { + "epoch": 0.3380326144134666, + "grad_norm": 1.84778892993927, + "learning_rate": 0.0001542450840787727, + "loss": 1.9588, + "step": 3213 + }, + { + "epoch": 0.3381378221988427, + "grad_norm": 1.402566909790039, + "learning_rate": 0.00015421645295632023, + "loss": 2.245, + "step": 3214 + }, + { + "epoch": 0.33824302998421885, + "grad_norm": 1.550502061843872, + "learning_rate": 0.0001541878155378896, + "loss": 1.999, + "step": 3215 + }, + { + "epoch": 0.33834823776959494, + "grad_norm": 2.043599843978882, + "learning_rate": 0.00015415917182680638, + "loss": 1.7732, + "step": 3216 + }, + { + "epoch": 0.3384534455549711, + "grad_norm": 1.1463595628738403, + "learning_rate": 0.00015413052182639683, + "loss": 1.9726, + "step": 3217 + }, + { + "epoch": 0.3385586533403472, + "grad_norm": 1.1954436302185059, + "learning_rate": 0.0001541018655399881, + "loss": 2.0425, + "step": 3218 + }, + { + "epoch": 0.33866386112572333, + "grad_norm": 1.8237320184707642, + "learning_rate": 0.00015407320297090786, + "loss": 2.3845, + "step": 3219 + }, + { + "epoch": 0.3387690689110994, + "grad_norm": 1.442068338394165, + "learning_rate": 0.0001540445341224846, + "loss": 1.988, + "step": 3220 + }, + { + "epoch": 0.3388742766964755, + "grad_norm": 1.3725106716156006, + "learning_rate": 0.00015401585899804755, + "loss": 2.2294, + "step": 3221 + }, + { + "epoch": 0.33897948448185167, + "grad_norm": 1.1935747861862183, + "learning_rate": 0.00015398717760092666, + "loss": 2.0404, + "step": 3222 + }, + { + "epoch": 0.33908469226722776, + "grad_norm": 1.4062366485595703, + "learning_rate": 0.00015395848993445265, + "loss": 1.838, + "step": 3223 + }, + { + "epoch": 0.3391899000526039, + "grad_norm": 1.1401127576828003, + "learning_rate": 0.00015392979600195684, + "loss": 1.7744, + "step": 3224 + }, + { + "epoch": 0.33929510783798, + "grad_norm": 1.2197401523590088, + "learning_rate": 0.00015390109580677144, + "loss": 1.9057, + "step": 3225 + }, + { + "epoch": 0.33940031562335615, + "grad_norm": 1.2865253686904907, + "learning_rate": 0.00015387238935222927, + "loss": 1.8107, + "step": 3226 + }, + { + "epoch": 0.33950552340873225, + "grad_norm": 1.3906419277191162, + "learning_rate": 0.0001538436766416639, + "loss": 1.5141, + "step": 3227 + }, + { + "epoch": 0.33961073119410834, + "grad_norm": 1.4416093826293945, + "learning_rate": 0.00015381495767840967, + "loss": 1.9024, + "step": 3228 + }, + { + "epoch": 0.3397159389794845, + "grad_norm": 1.0017902851104736, + "learning_rate": 0.00015378623246580165, + "loss": 1.5883, + "step": 3229 + }, + { + "epoch": 0.3398211467648606, + "grad_norm": 1.0945497751235962, + "learning_rate": 0.00015375750100717555, + "loss": 2.3554, + "step": 3230 + }, + { + "epoch": 0.33992635455023673, + "grad_norm": 1.2821848392486572, + "learning_rate": 0.00015372876330586784, + "loss": 2.1862, + "step": 3231 + }, + { + "epoch": 0.3400315623356128, + "grad_norm": 0.8067452311515808, + "learning_rate": 0.00015370001936521583, + "loss": 2.2071, + "step": 3232 + }, + { + "epoch": 0.340136770120989, + "grad_norm": 1.01970374584198, + "learning_rate": 0.00015367126918855738, + "loss": 1.802, + "step": 3233 + }, + { + "epoch": 0.34024197790636507, + "grad_norm": 0.9139577746391296, + "learning_rate": 0.00015364251277923114, + "loss": 1.951, + "step": 3234 + }, + { + "epoch": 0.3403471856917412, + "grad_norm": 1.34104323387146, + "learning_rate": 0.00015361375014057656, + "loss": 1.7029, + "step": 3235 + }, + { + "epoch": 0.3404523934771173, + "grad_norm": 1.2936007976531982, + "learning_rate": 0.00015358498127593376, + "loss": 2.0298, + "step": 3236 + }, + { + "epoch": 0.3405576012624934, + "grad_norm": 1.3565287590026855, + "learning_rate": 0.00015355620618864348, + "loss": 1.7207, + "step": 3237 + }, + { + "epoch": 0.34066280904786955, + "grad_norm": 1.0391281843185425, + "learning_rate": 0.00015352742488204733, + "loss": 1.8902, + "step": 3238 + }, + { + "epoch": 0.34076801683324565, + "grad_norm": 1.0751245021820068, + "learning_rate": 0.0001534986373594876, + "loss": 1.6464, + "step": 3239 + }, + { + "epoch": 0.3408732246186218, + "grad_norm": 1.167717456817627, + "learning_rate": 0.0001534698436243073, + "loss": 1.8188, + "step": 3240 + }, + { + "epoch": 0.3409784324039979, + "grad_norm": 1.1624265909194946, + "learning_rate": 0.00015344104367985014, + "loss": 2.048, + "step": 3241 + }, + { + "epoch": 0.34108364018937404, + "grad_norm": 1.119253158569336, + "learning_rate": 0.00015341223752946052, + "loss": 2.1498, + "step": 3242 + }, + { + "epoch": 0.34118884797475013, + "grad_norm": 2.0956761837005615, + "learning_rate": 0.00015338342517648367, + "loss": 2.0089, + "step": 3243 + }, + { + "epoch": 0.3412940557601262, + "grad_norm": 2.5349323749542236, + "learning_rate": 0.0001533546066242654, + "loss": 1.4671, + "step": 3244 + }, + { + "epoch": 0.3413992635455024, + "grad_norm": 1.1138498783111572, + "learning_rate": 0.0001533257818761524, + "loss": 2.0825, + "step": 3245 + }, + { + "epoch": 0.34150447133087847, + "grad_norm": 1.0680429935455322, + "learning_rate": 0.00015329695093549192, + "loss": 1.474, + "step": 3246 + }, + { + "epoch": 0.3416096791162546, + "grad_norm": 1.8456655740737915, + "learning_rate": 0.00015326811380563204, + "loss": 2.3713, + "step": 3247 + }, + { + "epoch": 0.3417148869016307, + "grad_norm": 1.1764317750930786, + "learning_rate": 0.0001532392704899215, + "loss": 2.155, + "step": 3248 + }, + { + "epoch": 0.34182009468700686, + "grad_norm": 1.6202481985092163, + "learning_rate": 0.0001532104209917098, + "loss": 1.6939, + "step": 3249 + }, + { + "epoch": 0.34192530247238295, + "grad_norm": 0.7920756340026855, + "learning_rate": 0.00015318156531434713, + "loss": 1.8199, + "step": 3250 + }, + { + "epoch": 0.3420305102577591, + "grad_norm": 1.4184741973876953, + "learning_rate": 0.00015315270346118442, + "loss": 1.917, + "step": 3251 + }, + { + "epoch": 0.3421357180431352, + "grad_norm": 1.0654038190841675, + "learning_rate": 0.00015312383543557328, + "loss": 1.8263, + "step": 3252 + }, + { + "epoch": 0.3422409258285113, + "grad_norm": 0.845812976360321, + "learning_rate": 0.00015309496124086603, + "loss": 1.9213, + "step": 3253 + }, + { + "epoch": 0.34234613361388744, + "grad_norm": 1.5473440885543823, + "learning_rate": 0.0001530660808804158, + "loss": 1.8193, + "step": 3254 + }, + { + "epoch": 0.34245134139926353, + "grad_norm": 2.408548593521118, + "learning_rate": 0.00015303719435757633, + "loss": 1.9995, + "step": 3255 + }, + { + "epoch": 0.3425565491846397, + "grad_norm": 1.4737664461135864, + "learning_rate": 0.0001530083016757021, + "loss": 1.8523, + "step": 3256 + }, + { + "epoch": 0.3426617569700158, + "grad_norm": 1.6030080318450928, + "learning_rate": 0.0001529794028381484, + "loss": 1.8994, + "step": 3257 + }, + { + "epoch": 0.3427669647553919, + "grad_norm": 1.5346298217773438, + "learning_rate": 0.00015295049784827108, + "loss": 1.7178, + "step": 3258 + }, + { + "epoch": 0.342872172540768, + "grad_norm": 1.2144919633865356, + "learning_rate": 0.0001529215867094268, + "loss": 2.0721, + "step": 3259 + }, + { + "epoch": 0.3429773803261441, + "grad_norm": 1.6161445379257202, + "learning_rate": 0.00015289266942497293, + "loss": 2.2569, + "step": 3260 + }, + { + "epoch": 0.34308258811152026, + "grad_norm": 1.4736202955245972, + "learning_rate": 0.00015286374599826754, + "loss": 1.9167, + "step": 3261 + }, + { + "epoch": 0.34318779589689635, + "grad_norm": 1.4252387285232544, + "learning_rate": 0.0001528348164326694, + "loss": 2.426, + "step": 3262 + }, + { + "epoch": 0.3432930036822725, + "grad_norm": 0.9563824534416199, + "learning_rate": 0.000152805880731538, + "loss": 2.0023, + "step": 3263 + }, + { + "epoch": 0.3433982114676486, + "grad_norm": 1.182991862297058, + "learning_rate": 0.00015277693889823355, + "loss": 1.8019, + "step": 3264 + }, + { + "epoch": 0.34350341925302474, + "grad_norm": 0.8522358536720276, + "learning_rate": 0.000152747990936117, + "loss": 1.7264, + "step": 3265 + }, + { + "epoch": 0.34360862703840084, + "grad_norm": 1.2024897336959839, + "learning_rate": 0.0001527190368485499, + "loss": 1.8793, + "step": 3266 + }, + { + "epoch": 0.343713834823777, + "grad_norm": 1.762102484703064, + "learning_rate": 0.0001526900766388947, + "loss": 1.6688, + "step": 3267 + }, + { + "epoch": 0.3438190426091531, + "grad_norm": 1.8461966514587402, + "learning_rate": 0.00015266111031051442, + "loss": 1.385, + "step": 3268 + }, + { + "epoch": 0.3439242503945292, + "grad_norm": 1.0345498323440552, + "learning_rate": 0.00015263213786677278, + "loss": 1.694, + "step": 3269 + }, + { + "epoch": 0.3440294581799053, + "grad_norm": 1.057798981666565, + "learning_rate": 0.00015260315931103427, + "loss": 1.4546, + "step": 3270 + }, + { + "epoch": 0.3441346659652814, + "grad_norm": 1.6364519596099854, + "learning_rate": 0.00015257417464666412, + "loss": 1.8252, + "step": 3271 + }, + { + "epoch": 0.34423987375065757, + "grad_norm": 0.9418036341667175, + "learning_rate": 0.0001525451838770282, + "loss": 2.3333, + "step": 3272 + }, + { + "epoch": 0.34434508153603366, + "grad_norm": 1.4215340614318848, + "learning_rate": 0.00015251618700549307, + "loss": 1.7159, + "step": 3273 + }, + { + "epoch": 0.3444502893214098, + "grad_norm": 0.8506739139556885, + "learning_rate": 0.0001524871840354261, + "loss": 2.1433, + "step": 3274 + }, + { + "epoch": 0.3445554971067859, + "grad_norm": 1.7179830074310303, + "learning_rate": 0.00015245817497019524, + "loss": 2.0006, + "step": 3275 + }, + { + "epoch": 0.344660704892162, + "grad_norm": 1.3353736400604248, + "learning_rate": 0.0001524291598131693, + "loss": 2.0854, + "step": 3276 + }, + { + "epoch": 0.34476591267753814, + "grad_norm": 1.2626408338546753, + "learning_rate": 0.00015240013856771768, + "loss": 1.9261, + "step": 3277 + }, + { + "epoch": 0.34487112046291424, + "grad_norm": 1.2768040895462036, + "learning_rate": 0.00015237111123721052, + "loss": 2.1543, + "step": 3278 + }, + { + "epoch": 0.3449763282482904, + "grad_norm": 1.1452258825302124, + "learning_rate": 0.00015234207782501865, + "loss": 1.9403, + "step": 3279 + }, + { + "epoch": 0.3450815360336665, + "grad_norm": 1.7478617429733276, + "learning_rate": 0.0001523130383345136, + "loss": 1.8556, + "step": 3280 + }, + { + "epoch": 0.34518674381904263, + "grad_norm": 1.261497974395752, + "learning_rate": 0.00015228399276906774, + "loss": 2.3515, + "step": 3281 + }, + { + "epoch": 0.3452919516044187, + "grad_norm": 2.1745786666870117, + "learning_rate": 0.00015225494113205393, + "loss": 1.8923, + "step": 3282 + }, + { + "epoch": 0.34539715938979487, + "grad_norm": 1.2414960861206055, + "learning_rate": 0.0001522258834268459, + "loss": 1.8377, + "step": 3283 + }, + { + "epoch": 0.34550236717517097, + "grad_norm": 1.666604995727539, + "learning_rate": 0.00015219681965681798, + "loss": 2.1966, + "step": 3284 + }, + { + "epoch": 0.34560757496054706, + "grad_norm": 2.886913776397705, + "learning_rate": 0.0001521677498253453, + "loss": 1.9478, + "step": 3285 + }, + { + "epoch": 0.3457127827459232, + "grad_norm": 1.258230447769165, + "learning_rate": 0.00015213867393580358, + "loss": 2.0871, + "step": 3286 + }, + { + "epoch": 0.3458179905312993, + "grad_norm": 1.2245984077453613, + "learning_rate": 0.0001521095919915694, + "loss": 1.9927, + "step": 3287 + }, + { + "epoch": 0.34592319831667545, + "grad_norm": 1.6182457208633423, + "learning_rate": 0.00015208050399601985, + "loss": 1.6089, + "step": 3288 + }, + { + "epoch": 0.34602840610205154, + "grad_norm": 1.1847516298294067, + "learning_rate": 0.00015205140995253283, + "loss": 2.0237, + "step": 3289 + }, + { + "epoch": 0.3461336138874277, + "grad_norm": 1.0616167783737183, + "learning_rate": 0.00015202230986448704, + "loss": 1.7928, + "step": 3290 + }, + { + "epoch": 0.3462388216728038, + "grad_norm": 1.7937065362930298, + "learning_rate": 0.0001519932037352617, + "loss": 2.0342, + "step": 3291 + }, + { + "epoch": 0.3463440294581799, + "grad_norm": 1.3131473064422607, + "learning_rate": 0.0001519640915682368, + "loss": 1.8552, + "step": 3292 + }, + { + "epoch": 0.34644923724355603, + "grad_norm": 1.2970378398895264, + "learning_rate": 0.0001519349733667931, + "loss": 1.7052, + "step": 3293 + }, + { + "epoch": 0.3465544450289321, + "grad_norm": 1.1303088665008545, + "learning_rate": 0.00015190584913431194, + "loss": 1.8543, + "step": 3294 + }, + { + "epoch": 0.3466596528143083, + "grad_norm": 1.530392050743103, + "learning_rate": 0.00015187671887417542, + "loss": 2.0367, + "step": 3295 + }, + { + "epoch": 0.34676486059968437, + "grad_norm": 1.7276886701583862, + "learning_rate": 0.00015184758258976637, + "loss": 2.6418, + "step": 3296 + }, + { + "epoch": 0.3468700683850605, + "grad_norm": 1.323357343673706, + "learning_rate": 0.0001518184402844683, + "loss": 2.1965, + "step": 3297 + }, + { + "epoch": 0.3469752761704366, + "grad_norm": 4.387045383453369, + "learning_rate": 0.00015178929196166537, + "loss": 1.9068, + "step": 3298 + }, + { + "epoch": 0.34708048395581276, + "grad_norm": 1.7642402648925781, + "learning_rate": 0.00015176013762474252, + "loss": 1.7479, + "step": 3299 + }, + { + "epoch": 0.34718569174118885, + "grad_norm": 1.1075654029846191, + "learning_rate": 0.00015173097727708533, + "loss": 1.7473, + "step": 3300 + }, + { + "epoch": 0.34729089952656494, + "grad_norm": 1.3891255855560303, + "learning_rate": 0.0001517018109220801, + "loss": 1.863, + "step": 3301 + }, + { + "epoch": 0.3473961073119411, + "grad_norm": 0.9241618514060974, + "learning_rate": 0.0001516726385631138, + "loss": 2.2975, + "step": 3302 + }, + { + "epoch": 0.3475013150973172, + "grad_norm": 2.6608288288116455, + "learning_rate": 0.00015164346020357417, + "loss": 2.5467, + "step": 3303 + }, + { + "epoch": 0.34760652288269334, + "grad_norm": 1.2339940071105957, + "learning_rate": 0.00015161427584684954, + "loss": 1.4252, + "step": 3304 + }, + { + "epoch": 0.34771173066806943, + "grad_norm": 1.0386180877685547, + "learning_rate": 0.00015158508549632902, + "loss": 1.5439, + "step": 3305 + }, + { + "epoch": 0.3478169384534456, + "grad_norm": 1.0006986856460571, + "learning_rate": 0.0001515558891554024, + "loss": 2.3416, + "step": 3306 + }, + { + "epoch": 0.3479221462388217, + "grad_norm": 1.0212254524230957, + "learning_rate": 0.0001515266868274601, + "loss": 1.6541, + "step": 3307 + }, + { + "epoch": 0.34802735402419777, + "grad_norm": 1.2508071660995483, + "learning_rate": 0.0001514974785158934, + "loss": 1.5524, + "step": 3308 + }, + { + "epoch": 0.3481325618095739, + "grad_norm": 1.5572774410247803, + "learning_rate": 0.00015146826422409405, + "loss": 2.1225, + "step": 3309 + }, + { + "epoch": 0.34823776959495, + "grad_norm": 1.8511630296707153, + "learning_rate": 0.00015143904395545466, + "loss": 1.6325, + "step": 3310 + }, + { + "epoch": 0.34834297738032616, + "grad_norm": 1.0825985670089722, + "learning_rate": 0.00015140981771336848, + "loss": 1.9844, + "step": 3311 + }, + { + "epoch": 0.34844818516570225, + "grad_norm": 1.2613874673843384, + "learning_rate": 0.00015138058550122945, + "loss": 1.5684, + "step": 3312 + }, + { + "epoch": 0.3485533929510784, + "grad_norm": 1.5934251546859741, + "learning_rate": 0.00015135134732243227, + "loss": 2.6311, + "step": 3313 + }, + { + "epoch": 0.3486586007364545, + "grad_norm": 1.325727939605713, + "learning_rate": 0.00015132210318037214, + "loss": 1.2294, + "step": 3314 + }, + { + "epoch": 0.34876380852183064, + "grad_norm": 1.7962899208068848, + "learning_rate": 0.00015129285307844523, + "loss": 2.3393, + "step": 3315 + }, + { + "epoch": 0.34886901630720674, + "grad_norm": 1.5050058364868164, + "learning_rate": 0.00015126359702004818, + "loss": 1.8036, + "step": 3316 + }, + { + "epoch": 0.34897422409258283, + "grad_norm": 1.342183232307434, + "learning_rate": 0.0001512343350085784, + "loss": 1.9512, + "step": 3317 + }, + { + "epoch": 0.349079431877959, + "grad_norm": 1.6918140649795532, + "learning_rate": 0.00015120506704743402, + "loss": 2.1275, + "step": 3318 + }, + { + "epoch": 0.3491846396633351, + "grad_norm": 1.7969458103179932, + "learning_rate": 0.00015117579314001382, + "loss": 1.9085, + "step": 3319 + }, + { + "epoch": 0.3492898474487112, + "grad_norm": 1.1267280578613281, + "learning_rate": 0.00015114651328971727, + "loss": 1.8183, + "step": 3320 + }, + { + "epoch": 0.3493950552340873, + "grad_norm": 1.4564090967178345, + "learning_rate": 0.00015111722749994457, + "loss": 1.8921, + "step": 3321 + }, + { + "epoch": 0.34950026301946346, + "grad_norm": 1.4575985670089722, + "learning_rate": 0.00015108793577409656, + "loss": 1.7664, + "step": 3322 + }, + { + "epoch": 0.34960547080483956, + "grad_norm": 1.3683162927627563, + "learning_rate": 0.0001510586381155748, + "loss": 1.585, + "step": 3323 + }, + { + "epoch": 0.34971067859021565, + "grad_norm": 2.316188097000122, + "learning_rate": 0.0001510293345277815, + "loss": 1.8608, + "step": 3324 + }, + { + "epoch": 0.3498158863755918, + "grad_norm": 0.8995792269706726, + "learning_rate": 0.0001510000250141196, + "loss": 1.475, + "step": 3325 + }, + { + "epoch": 0.3499210941609679, + "grad_norm": 1.4614365100860596, + "learning_rate": 0.0001509707095779928, + "loss": 2.1418, + "step": 3326 + }, + { + "epoch": 0.35002630194634404, + "grad_norm": 2.0038840770721436, + "learning_rate": 0.00015094138822280533, + "loss": 2.1766, + "step": 3327 + }, + { + "epoch": 0.35013150973172014, + "grad_norm": 2.1047608852386475, + "learning_rate": 0.00015091206095196215, + "loss": 1.7852, + "step": 3328 + }, + { + "epoch": 0.3502367175170963, + "grad_norm": 1.098832607269287, + "learning_rate": 0.000150882727768869, + "loss": 1.973, + "step": 3329 + }, + { + "epoch": 0.3503419253024724, + "grad_norm": 1.2280389070510864, + "learning_rate": 0.00015085338867693225, + "loss": 1.7759, + "step": 3330 + }, + { + "epoch": 0.35044713308784853, + "grad_norm": 1.6479296684265137, + "learning_rate": 0.0001508240436795589, + "loss": 1.9789, + "step": 3331 + }, + { + "epoch": 0.3505523408732246, + "grad_norm": 1.37543785572052, + "learning_rate": 0.00015079469278015672, + "loss": 2.2909, + "step": 3332 + }, + { + "epoch": 0.3506575486586007, + "grad_norm": 0.9722198247909546, + "learning_rate": 0.00015076533598213415, + "loss": 1.6292, + "step": 3333 + }, + { + "epoch": 0.35076275644397686, + "grad_norm": 1.5459274053573608, + "learning_rate": 0.00015073597328890025, + "loss": 1.8489, + "step": 3334 + }, + { + "epoch": 0.35086796422935296, + "grad_norm": 1.3140374422073364, + "learning_rate": 0.00015070660470386485, + "loss": 2.0101, + "step": 3335 + }, + { + "epoch": 0.3509731720147291, + "grad_norm": 1.515724778175354, + "learning_rate": 0.00015067723023043844, + "loss": 1.9427, + "step": 3336 + }, + { + "epoch": 0.3510783798001052, + "grad_norm": 1.626247763633728, + "learning_rate": 0.00015064784987203216, + "loss": 1.574, + "step": 3337 + }, + { + "epoch": 0.35118358758548135, + "grad_norm": 1.1140034198760986, + "learning_rate": 0.00015061846363205784, + "loss": 2.173, + "step": 3338 + }, + { + "epoch": 0.35128879537085744, + "grad_norm": 1.1657525300979614, + "learning_rate": 0.000150589071513928, + "loss": 2.1967, + "step": 3339 + }, + { + "epoch": 0.35139400315623354, + "grad_norm": 0.87020343542099, + "learning_rate": 0.00015055967352105588, + "loss": 1.7997, + "step": 3340 + }, + { + "epoch": 0.3514992109416097, + "grad_norm": 1.1044055223464966, + "learning_rate": 0.0001505302696568554, + "loss": 2.051, + "step": 3341 + }, + { + "epoch": 0.3516044187269858, + "grad_norm": 2.5715765953063965, + "learning_rate": 0.00015050085992474106, + "loss": 2.1982, + "step": 3342 + }, + { + "epoch": 0.35170962651236193, + "grad_norm": 1.7368378639221191, + "learning_rate": 0.00015047144432812814, + "loss": 1.2449, + "step": 3343 + }, + { + "epoch": 0.351814834297738, + "grad_norm": 1.0947346687316895, + "learning_rate": 0.00015044202287043263, + "loss": 1.4047, + "step": 3344 + }, + { + "epoch": 0.35192004208311417, + "grad_norm": 1.4075251817703247, + "learning_rate": 0.00015041259555507108, + "loss": 1.8252, + "step": 3345 + }, + { + "epoch": 0.35202524986849026, + "grad_norm": 1.3330639600753784, + "learning_rate": 0.00015038316238546082, + "loss": 1.9027, + "step": 3346 + }, + { + "epoch": 0.3521304576538664, + "grad_norm": 1.0962144136428833, + "learning_rate": 0.00015035372336501984, + "loss": 1.3782, + "step": 3347 + }, + { + "epoch": 0.3522356654392425, + "grad_norm": 1.612261414527893, + "learning_rate": 0.00015032427849716675, + "loss": 2.1975, + "step": 3348 + }, + { + "epoch": 0.3523408732246186, + "grad_norm": 1.682795763015747, + "learning_rate": 0.0001502948277853209, + "loss": 1.8072, + "step": 3349 + }, + { + "epoch": 0.35244608100999475, + "grad_norm": 1.4576448202133179, + "learning_rate": 0.00015026537123290234, + "loss": 1.7883, + "step": 3350 + }, + { + "epoch": 0.35255128879537084, + "grad_norm": 1.385787844657898, + "learning_rate": 0.00015023590884333173, + "loss": 2.0371, + "step": 3351 + }, + { + "epoch": 0.352656496580747, + "grad_norm": 1.0585753917694092, + "learning_rate": 0.00015020644062003046, + "loss": 1.912, + "step": 3352 + }, + { + "epoch": 0.3527617043661231, + "grad_norm": 1.1872546672821045, + "learning_rate": 0.00015017696656642056, + "loss": 2.1347, + "step": 3353 + }, + { + "epoch": 0.35286691215149923, + "grad_norm": 1.5553888082504272, + "learning_rate": 0.00015014748668592477, + "loss": 1.8306, + "step": 3354 + }, + { + "epoch": 0.35297211993687533, + "grad_norm": 1.696855902671814, + "learning_rate": 0.00015011800098196646, + "loss": 1.5191, + "step": 3355 + }, + { + "epoch": 0.3530773277222514, + "grad_norm": 1.31337571144104, + "learning_rate": 0.00015008850945796975, + "loss": 2.1424, + "step": 3356 + }, + { + "epoch": 0.35318253550762757, + "grad_norm": 1.327221393585205, + "learning_rate": 0.00015005901211735938, + "loss": 1.8816, + "step": 3357 + }, + { + "epoch": 0.35328774329300366, + "grad_norm": 1.1838815212249756, + "learning_rate": 0.0001500295089635608, + "loss": 2.271, + "step": 3358 + }, + { + "epoch": 0.3533929510783798, + "grad_norm": 1.5247220993041992, + "learning_rate": 0.00015000000000000001, + "loss": 1.7908, + "step": 3359 + }, + { + "epoch": 0.3534981588637559, + "grad_norm": 1.6162418127059937, + "learning_rate": 0.0001499704852301039, + "loss": 2.1344, + "step": 3360 + }, + { + "epoch": 0.35360336664913206, + "grad_norm": 1.533591628074646, + "learning_rate": 0.0001499409646572999, + "loss": 2.2785, + "step": 3361 + }, + { + "epoch": 0.35370857443450815, + "grad_norm": 1.3864167928695679, + "learning_rate": 0.00014991143828501613, + "loss": 2.2172, + "step": 3362 + }, + { + "epoch": 0.3538137822198843, + "grad_norm": 1.580679178237915, + "learning_rate": 0.00014988190611668135, + "loss": 1.6399, + "step": 3363 + }, + { + "epoch": 0.3539189900052604, + "grad_norm": 1.053759217262268, + "learning_rate": 0.00014985236815572513, + "loss": 1.3566, + "step": 3364 + }, + { + "epoch": 0.3540241977906365, + "grad_norm": 1.2287757396697998, + "learning_rate": 0.0001498228244055775, + "loss": 2.0034, + "step": 3365 + }, + { + "epoch": 0.35412940557601263, + "grad_norm": 1.3084381818771362, + "learning_rate": 0.00014979327486966938, + "loss": 1.9535, + "step": 3366 + }, + { + "epoch": 0.35423461336138873, + "grad_norm": 1.1982232332229614, + "learning_rate": 0.0001497637195514322, + "loss": 2.0076, + "step": 3367 + }, + { + "epoch": 0.3543398211467649, + "grad_norm": 1.4315484762191772, + "learning_rate": 0.00014973415845429813, + "loss": 1.9953, + "step": 3368 + }, + { + "epoch": 0.35444502893214097, + "grad_norm": 1.0541669130325317, + "learning_rate": 0.0001497045915817, + "loss": 1.6665, + "step": 3369 + }, + { + "epoch": 0.3545502367175171, + "grad_norm": 1.2444473505020142, + "learning_rate": 0.00014967501893707133, + "loss": 2.4482, + "step": 3370 + }, + { + "epoch": 0.3546554445028932, + "grad_norm": 1.091001033782959, + "learning_rate": 0.00014964544052384628, + "loss": 2.1292, + "step": 3371 + }, + { + "epoch": 0.3547606522882693, + "grad_norm": 1.2684603929519653, + "learning_rate": 0.0001496158563454597, + "loss": 1.6962, + "step": 3372 + }, + { + "epoch": 0.35486586007364546, + "grad_norm": 1.5170433521270752, + "learning_rate": 0.0001495862664053471, + "loss": 1.6657, + "step": 3373 + }, + { + "epoch": 0.35497106785902155, + "grad_norm": 1.2288258075714111, + "learning_rate": 0.0001495566707069447, + "loss": 1.9427, + "step": 3374 + }, + { + "epoch": 0.3550762756443977, + "grad_norm": 1.205127239227295, + "learning_rate": 0.0001495270692536893, + "loss": 1.958, + "step": 3375 + }, + { + "epoch": 0.3551814834297738, + "grad_norm": 1.3871060609817505, + "learning_rate": 0.0001494974620490184, + "loss": 1.9658, + "step": 3376 + }, + { + "epoch": 0.35528669121514994, + "grad_norm": 1.6164618730545044, + "learning_rate": 0.00014946784909637028, + "loss": 1.354, + "step": 3377 + }, + { + "epoch": 0.35539189900052603, + "grad_norm": 1.082649827003479, + "learning_rate": 0.00014943823039918373, + "loss": 1.5321, + "step": 3378 + }, + { + "epoch": 0.3554971067859022, + "grad_norm": 1.9146784543991089, + "learning_rate": 0.00014940860596089828, + "loss": 1.5985, + "step": 3379 + }, + { + "epoch": 0.3556023145712783, + "grad_norm": 1.5512475967407227, + "learning_rate": 0.0001493789757849541, + "loss": 1.7136, + "step": 3380 + }, + { + "epoch": 0.35570752235665437, + "grad_norm": 1.2535208463668823, + "learning_rate": 0.00014934933987479206, + "loss": 1.5282, + "step": 3381 + }, + { + "epoch": 0.3558127301420305, + "grad_norm": 1.6112395524978638, + "learning_rate": 0.0001493196982338537, + "loss": 2.1735, + "step": 3382 + }, + { + "epoch": 0.3559179379274066, + "grad_norm": 1.6096539497375488, + "learning_rate": 0.00014929005086558117, + "loss": 2.2356, + "step": 3383 + }, + { + "epoch": 0.35602314571278276, + "grad_norm": 1.7732410430908203, + "learning_rate": 0.00014926039777341733, + "loss": 1.8119, + "step": 3384 + }, + { + "epoch": 0.35612835349815886, + "grad_norm": 2.6006152629852295, + "learning_rate": 0.00014923073896080575, + "loss": 1.6363, + "step": 3385 + }, + { + "epoch": 0.356233561283535, + "grad_norm": 1.6109579801559448, + "learning_rate": 0.00014920107443119052, + "loss": 1.8525, + "step": 3386 + }, + { + "epoch": 0.3563387690689111, + "grad_norm": 1.49036705493927, + "learning_rate": 0.00014917140418801655, + "loss": 1.6045, + "step": 3387 + }, + { + "epoch": 0.3564439768542872, + "grad_norm": 1.5587002038955688, + "learning_rate": 0.00014914172823472934, + "loss": 2.4678, + "step": 3388 + }, + { + "epoch": 0.35654918463966334, + "grad_norm": 1.2528430223464966, + "learning_rate": 0.00014911204657477506, + "loss": 2.4837, + "step": 3389 + }, + { + "epoch": 0.35665439242503943, + "grad_norm": 1.163336157798767, + "learning_rate": 0.00014908235921160055, + "loss": 2.1102, + "step": 3390 + }, + { + "epoch": 0.3567596002104156, + "grad_norm": 1.2272528409957886, + "learning_rate": 0.00014905266614865324, + "loss": 2.1122, + "step": 3391 + }, + { + "epoch": 0.3568648079957917, + "grad_norm": 1.4555405378341675, + "learning_rate": 0.00014902296738938134, + "loss": 1.264, + "step": 3392 + }, + { + "epoch": 0.3569700157811678, + "grad_norm": 2.172147274017334, + "learning_rate": 0.00014899326293723371, + "loss": 1.9308, + "step": 3393 + }, + { + "epoch": 0.3570752235665439, + "grad_norm": 1.3015145063400269, + "learning_rate": 0.00014896355279565976, + "loss": 2.0421, + "step": 3394 + }, + { + "epoch": 0.35718043135192007, + "grad_norm": 1.9815869331359863, + "learning_rate": 0.00014893383696810964, + "loss": 1.5845, + "step": 3395 + }, + { + "epoch": 0.35728563913729616, + "grad_norm": 0.8126930594444275, + "learning_rate": 0.0001489041154580342, + "loss": 2.2649, + "step": 3396 + }, + { + "epoch": 0.35739084692267226, + "grad_norm": 1.3474737405776978, + "learning_rate": 0.00014887438826888483, + "loss": 1.7113, + "step": 3397 + }, + { + "epoch": 0.3574960547080484, + "grad_norm": 1.247473120689392, + "learning_rate": 0.00014884465540411368, + "loss": 2.136, + "step": 3398 + }, + { + "epoch": 0.3576012624934245, + "grad_norm": 1.0129485130310059, + "learning_rate": 0.00014881491686717362, + "loss": 2.1247, + "step": 3399 + }, + { + "epoch": 0.35770647027880065, + "grad_norm": 1.0661226511001587, + "learning_rate": 0.00014878517266151794, + "loss": 1.9584, + "step": 3400 + }, + { + "epoch": 0.35781167806417674, + "grad_norm": 1.4464398622512817, + "learning_rate": 0.00014875542279060085, + "loss": 1.9324, + "step": 3401 + }, + { + "epoch": 0.3579168858495529, + "grad_norm": 1.864518165588379, + "learning_rate": 0.00014872566725787701, + "loss": 1.1379, + "step": 3402 + }, + { + "epoch": 0.358022093634929, + "grad_norm": 1.319411277770996, + "learning_rate": 0.00014869590606680192, + "loss": 1.796, + "step": 3403 + }, + { + "epoch": 0.3581273014203051, + "grad_norm": 0.8547179698944092, + "learning_rate": 0.0001486661392208316, + "loss": 1.9455, + "step": 3404 + }, + { + "epoch": 0.3582325092056812, + "grad_norm": 1.2236049175262451, + "learning_rate": 0.00014863636672342277, + "loss": 1.4572, + "step": 3405 + }, + { + "epoch": 0.3583377169910573, + "grad_norm": 1.5455988645553589, + "learning_rate": 0.00014860658857803285, + "loss": 1.9523, + "step": 3406 + }, + { + "epoch": 0.35844292477643347, + "grad_norm": 1.231605052947998, + "learning_rate": 0.00014857680478811984, + "loss": 2.3316, + "step": 3407 + }, + { + "epoch": 0.35854813256180956, + "grad_norm": 1.5083273649215698, + "learning_rate": 0.00014854701535714244, + "loss": 2.0371, + "step": 3408 + }, + { + "epoch": 0.3586533403471857, + "grad_norm": 1.7702112197875977, + "learning_rate": 0.00014851722028856005, + "loss": 2.159, + "step": 3409 + }, + { + "epoch": 0.3587585481325618, + "grad_norm": 1.20704984664917, + "learning_rate": 0.0001484874195858326, + "loss": 1.1983, + "step": 3410 + }, + { + "epoch": 0.35886375591793795, + "grad_norm": 1.5127192735671997, + "learning_rate": 0.00014845761325242077, + "loss": 1.638, + "step": 3411 + }, + { + "epoch": 0.35896896370331405, + "grad_norm": 1.1735661029815674, + "learning_rate": 0.0001484278012917859, + "loss": 1.6514, + "step": 3412 + }, + { + "epoch": 0.35907417148869014, + "grad_norm": 1.142444133758545, + "learning_rate": 0.00014839798370738994, + "loss": 1.6814, + "step": 3413 + }, + { + "epoch": 0.3591793792740663, + "grad_norm": 2.166907787322998, + "learning_rate": 0.00014836816050269548, + "loss": 2.0189, + "step": 3414 + }, + { + "epoch": 0.3592845870594424, + "grad_norm": 0.9603028893470764, + "learning_rate": 0.00014833833168116582, + "loss": 2.2928, + "step": 3415 + }, + { + "epoch": 0.35938979484481853, + "grad_norm": 1.3047958612442017, + "learning_rate": 0.00014830849724626488, + "loss": 2.1418, + "step": 3416 + }, + { + "epoch": 0.3594950026301946, + "grad_norm": 2.171341896057129, + "learning_rate": 0.00014827865720145724, + "loss": 1.9426, + "step": 3417 + }, + { + "epoch": 0.3596002104155708, + "grad_norm": 1.029051661491394, + "learning_rate": 0.0001482488115502081, + "loss": 2.0982, + "step": 3418 + }, + { + "epoch": 0.35970541820094687, + "grad_norm": 1.3975342512130737, + "learning_rate": 0.00014821896029598337, + "loss": 2.189, + "step": 3419 + }, + { + "epoch": 0.35981062598632296, + "grad_norm": 1.4755433797836304, + "learning_rate": 0.00014818910344224957, + "loss": 1.6457, + "step": 3420 + }, + { + "epoch": 0.3599158337716991, + "grad_norm": 1.5850578546524048, + "learning_rate": 0.00014815924099247384, + "loss": 2.2274, + "step": 3421 + }, + { + "epoch": 0.3600210415570752, + "grad_norm": 1.3168679475784302, + "learning_rate": 0.00014812937295012406, + "loss": 2.4136, + "step": 3422 + }, + { + "epoch": 0.36012624934245135, + "grad_norm": 1.5056153535842896, + "learning_rate": 0.00014809949931866867, + "loss": 1.8313, + "step": 3423 + }, + { + "epoch": 0.36023145712782745, + "grad_norm": 3.0403895378112793, + "learning_rate": 0.00014806962010157683, + "loss": 2.1145, + "step": 3424 + }, + { + "epoch": 0.3603366649132036, + "grad_norm": 0.8465742468833923, + "learning_rate": 0.00014803973530231828, + "loss": 2.0201, + "step": 3425 + }, + { + "epoch": 0.3604418726985797, + "grad_norm": 1.618240237236023, + "learning_rate": 0.00014800984492436346, + "loss": 2.0447, + "step": 3426 + }, + { + "epoch": 0.36054708048395584, + "grad_norm": 1.5372968912124634, + "learning_rate": 0.00014797994897118347, + "loss": 1.9388, + "step": 3427 + }, + { + "epoch": 0.36065228826933193, + "grad_norm": 1.471268892288208, + "learning_rate": 0.00014795004744625, + "loss": 2.3546, + "step": 3428 + }, + { + "epoch": 0.360757496054708, + "grad_norm": 1.4779199361801147, + "learning_rate": 0.00014792014035303535, + "loss": 1.9209, + "step": 3429 + }, + { + "epoch": 0.3608627038400842, + "grad_norm": 0.9379653930664062, + "learning_rate": 0.0001478902276950127, + "loss": 1.705, + "step": 3430 + }, + { + "epoch": 0.36096791162546027, + "grad_norm": 1.2077665328979492, + "learning_rate": 0.00014786030947565554, + "loss": 1.7868, + "step": 3431 + }, + { + "epoch": 0.3610731194108364, + "grad_norm": 1.1456810235977173, + "learning_rate": 0.00014783038569843822, + "loss": 2.3369, + "step": 3432 + }, + { + "epoch": 0.3611783271962125, + "grad_norm": 1.7450131177902222, + "learning_rate": 0.00014780045636683578, + "loss": 1.9921, + "step": 3433 + }, + { + "epoch": 0.36128353498158866, + "grad_norm": 1.1912232637405396, + "learning_rate": 0.00014777052148432372, + "loss": 1.9747, + "step": 3434 + }, + { + "epoch": 0.36138874276696475, + "grad_norm": 1.3234916925430298, + "learning_rate": 0.00014774058105437827, + "loss": 1.6403, + "step": 3435 + }, + { + "epoch": 0.36149395055234085, + "grad_norm": 1.4043394327163696, + "learning_rate": 0.00014771063508047636, + "loss": 1.8832, + "step": 3436 + }, + { + "epoch": 0.361599158337717, + "grad_norm": 1.1788146495819092, + "learning_rate": 0.00014768068356609554, + "loss": 1.997, + "step": 3437 + }, + { + "epoch": 0.3617043661230931, + "grad_norm": 1.0332565307617188, + "learning_rate": 0.00014765072651471393, + "loss": 1.7426, + "step": 3438 + }, + { + "epoch": 0.36180957390846924, + "grad_norm": 1.34288489818573, + "learning_rate": 0.00014762076392981033, + "loss": 2.0445, + "step": 3439 + }, + { + "epoch": 0.36191478169384533, + "grad_norm": 1.6269721984863281, + "learning_rate": 0.00014759079581486424, + "loss": 1.7493, + "step": 3440 + }, + { + "epoch": 0.3620199894792215, + "grad_norm": 1.204362392425537, + "learning_rate": 0.00014756082217335577, + "loss": 2.179, + "step": 3441 + }, + { + "epoch": 0.3621251972645976, + "grad_norm": 1.8871209621429443, + "learning_rate": 0.0001475308430087656, + "loss": 1.5849, + "step": 3442 + }, + { + "epoch": 0.3622304050499737, + "grad_norm": 1.0620596408843994, + "learning_rate": 0.00014750085832457519, + "loss": 2.2185, + "step": 3443 + }, + { + "epoch": 0.3623356128353498, + "grad_norm": 1.24690842628479, + "learning_rate": 0.00014747086812426648, + "loss": 1.5221, + "step": 3444 + }, + { + "epoch": 0.3624408206207259, + "grad_norm": 1.5834826231002808, + "learning_rate": 0.0001474408724113222, + "loss": 1.4759, + "step": 3445 + }, + { + "epoch": 0.36254602840610206, + "grad_norm": 1.1942899227142334, + "learning_rate": 0.0001474108711892256, + "loss": 2.2882, + "step": 3446 + }, + { + "epoch": 0.36265123619147815, + "grad_norm": 1.3137141466140747, + "learning_rate": 0.00014738086446146065, + "loss": 1.6804, + "step": 3447 + }, + { + "epoch": 0.3627564439768543, + "grad_norm": 1.0239195823669434, + "learning_rate": 0.00014735085223151198, + "loss": 2.2197, + "step": 3448 + }, + { + "epoch": 0.3628616517622304, + "grad_norm": 1.18763267993927, + "learning_rate": 0.00014732083450286472, + "loss": 1.7646, + "step": 3449 + }, + { + "epoch": 0.36296685954760655, + "grad_norm": 1.4239435195922852, + "learning_rate": 0.00014729081127900476, + "loss": 1.8619, + "step": 3450 + }, + { + "epoch": 0.36307206733298264, + "grad_norm": 1.079707384109497, + "learning_rate": 0.00014726078256341863, + "loss": 2.0, + "step": 3451 + }, + { + "epoch": 0.36317727511835873, + "grad_norm": 1.7194312810897827, + "learning_rate": 0.00014723074835959346, + "loss": 1.6595, + "step": 3452 + }, + { + "epoch": 0.3632824829037349, + "grad_norm": 1.498183250427246, + "learning_rate": 0.000147200708671017, + "loss": 1.6809, + "step": 3453 + }, + { + "epoch": 0.363387690689111, + "grad_norm": 1.7695226669311523, + "learning_rate": 0.00014717066350117768, + "loss": 1.6466, + "step": 3454 + }, + { + "epoch": 0.3634928984744871, + "grad_norm": 1.1211014986038208, + "learning_rate": 0.00014714061285356453, + "loss": 1.772, + "step": 3455 + }, + { + "epoch": 0.3635981062598632, + "grad_norm": 1.286805510520935, + "learning_rate": 0.00014711055673166724, + "loss": 2.2176, + "step": 3456 + }, + { + "epoch": 0.36370331404523937, + "grad_norm": 2.3629977703094482, + "learning_rate": 0.0001470804951389761, + "loss": 1.4311, + "step": 3457 + }, + { + "epoch": 0.36380852183061546, + "grad_norm": 1.3376028537750244, + "learning_rate": 0.00014705042807898214, + "loss": 1.7872, + "step": 3458 + }, + { + "epoch": 0.3639137296159916, + "grad_norm": 2.238438129425049, + "learning_rate": 0.0001470203555551769, + "loss": 1.8577, + "step": 3459 + }, + { + "epoch": 0.3640189374013677, + "grad_norm": 1.5337350368499756, + "learning_rate": 0.00014699027757105254, + "loss": 1.7483, + "step": 3460 + }, + { + "epoch": 0.3641241451867438, + "grad_norm": 2.3909807205200195, + "learning_rate": 0.00014696019413010204, + "loss": 2.0391, + "step": 3461 + }, + { + "epoch": 0.36422935297211995, + "grad_norm": 1.3781287670135498, + "learning_rate": 0.00014693010523581882, + "loss": 1.8577, + "step": 3462 + }, + { + "epoch": 0.36433456075749604, + "grad_norm": 1.3029046058654785, + "learning_rate": 0.00014690001089169702, + "loss": 1.7383, + "step": 3463 + }, + { + "epoch": 0.3644397685428722, + "grad_norm": 1.002429485321045, + "learning_rate": 0.00014686991110123135, + "loss": 1.7997, + "step": 3464 + }, + { + "epoch": 0.3645449763282483, + "grad_norm": 1.2660531997680664, + "learning_rate": 0.0001468398058679173, + "loss": 1.7173, + "step": 3465 + }, + { + "epoch": 0.36465018411362443, + "grad_norm": 1.3863353729248047, + "learning_rate": 0.0001468096951952508, + "loss": 1.7758, + "step": 3466 + }, + { + "epoch": 0.3647553918990005, + "grad_norm": 1.7203501462936401, + "learning_rate": 0.00014677957908672856, + "loss": 1.7279, + "step": 3467 + }, + { + "epoch": 0.3648605996843766, + "grad_norm": 1.1552826166152954, + "learning_rate": 0.0001467494575458478, + "loss": 1.8039, + "step": 3468 + }, + { + "epoch": 0.36496580746975277, + "grad_norm": 0.9511840343475342, + "learning_rate": 0.00014671933057610654, + "loss": 1.8241, + "step": 3469 + }, + { + "epoch": 0.36507101525512886, + "grad_norm": 1.4740869998931885, + "learning_rate": 0.00014668919818100322, + "loss": 2.1223, + "step": 3470 + }, + { + "epoch": 0.365176223040505, + "grad_norm": 2.0013620853424072, + "learning_rate": 0.00014665906036403706, + "loss": 2.3534, + "step": 3471 + }, + { + "epoch": 0.3652814308258811, + "grad_norm": 1.2783856391906738, + "learning_rate": 0.0001466289171287079, + "loss": 1.7754, + "step": 3472 + }, + { + "epoch": 0.36538663861125725, + "grad_norm": 1.5455641746520996, + "learning_rate": 0.00014659876847851607, + "loss": 2.1852, + "step": 3473 + }, + { + "epoch": 0.36549184639663335, + "grad_norm": 1.3689314126968384, + "learning_rate": 0.00014656861441696278, + "loss": 1.7998, + "step": 3474 + }, + { + "epoch": 0.3655970541820095, + "grad_norm": 1.5268868207931519, + "learning_rate": 0.00014653845494754962, + "loss": 1.4686, + "step": 3475 + }, + { + "epoch": 0.3657022619673856, + "grad_norm": 1.6166832447052002, + "learning_rate": 0.00014650829007377894, + "loss": 1.9012, + "step": 3476 + }, + { + "epoch": 0.3658074697527617, + "grad_norm": 1.4161622524261475, + "learning_rate": 0.00014647811979915366, + "loss": 2.1802, + "step": 3477 + }, + { + "epoch": 0.36591267753813783, + "grad_norm": 1.2434589862823486, + "learning_rate": 0.00014644794412717736, + "loss": 1.9538, + "step": 3478 + }, + { + "epoch": 0.3660178853235139, + "grad_norm": 1.0335787534713745, + "learning_rate": 0.00014641776306135431, + "loss": 1.8217, + "step": 3479 + }, + { + "epoch": 0.3661230931088901, + "grad_norm": 1.653498649597168, + "learning_rate": 0.00014638757660518923, + "loss": 1.8545, + "step": 3480 + }, + { + "epoch": 0.36622830089426617, + "grad_norm": 0.8305730223655701, + "learning_rate": 0.00014635738476218767, + "loss": 1.4702, + "step": 3481 + }, + { + "epoch": 0.3663335086796423, + "grad_norm": 1.3280541896820068, + "learning_rate": 0.00014632718753585566, + "loss": 1.7956, + "step": 3482 + }, + { + "epoch": 0.3664387164650184, + "grad_norm": 1.6454240083694458, + "learning_rate": 0.0001462969849296999, + "loss": 1.5739, + "step": 3483 + }, + { + "epoch": 0.3665439242503945, + "grad_norm": 1.0778430700302124, + "learning_rate": 0.00014626677694722773, + "loss": 2.2551, + "step": 3484 + }, + { + "epoch": 0.36664913203577065, + "grad_norm": 1.3582223653793335, + "learning_rate": 0.00014623656359194712, + "loss": 1.9954, + "step": 3485 + }, + { + "epoch": 0.36675433982114675, + "grad_norm": 1.7454923391342163, + "learning_rate": 0.00014620634486736667, + "loss": 2.0445, + "step": 3486 + }, + { + "epoch": 0.3668595476065229, + "grad_norm": 1.1220197677612305, + "learning_rate": 0.00014617612077699548, + "loss": 1.8598, + "step": 3487 + }, + { + "epoch": 0.366964755391899, + "grad_norm": 1.2056738138198853, + "learning_rate": 0.00014614589132434347, + "loss": 2.0579, + "step": 3488 + }, + { + "epoch": 0.36706996317727514, + "grad_norm": 1.1674832105636597, + "learning_rate": 0.00014611565651292106, + "loss": 1.7296, + "step": 3489 + }, + { + "epoch": 0.36717517096265123, + "grad_norm": 1.2299489974975586, + "learning_rate": 0.00014608541634623929, + "loss": 1.6829, + "step": 3490 + }, + { + "epoch": 0.3672803787480274, + "grad_norm": 1.580704689025879, + "learning_rate": 0.00014605517082780988, + "loss": 2.2437, + "step": 3491 + }, + { + "epoch": 0.3673855865334035, + "grad_norm": 1.4246400594711304, + "learning_rate": 0.00014602491996114516, + "loss": 1.8395, + "step": 3492 + }, + { + "epoch": 0.36749079431877957, + "grad_norm": 1.4935017824172974, + "learning_rate": 0.00014599466374975802, + "loss": 2.118, + "step": 3493 + }, + { + "epoch": 0.3675960021041557, + "grad_norm": 1.4232902526855469, + "learning_rate": 0.00014596440219716205, + "loss": 1.8154, + "step": 3494 + }, + { + "epoch": 0.3677012098895318, + "grad_norm": 1.707642912864685, + "learning_rate": 0.00014593413530687138, + "loss": 1.7055, + "step": 3495 + }, + { + "epoch": 0.36780641767490796, + "grad_norm": 1.4270106554031372, + "learning_rate": 0.0001459038630824009, + "loss": 2.0895, + "step": 3496 + }, + { + "epoch": 0.36791162546028405, + "grad_norm": 1.169714331626892, + "learning_rate": 0.00014587358552726592, + "loss": 1.8576, + "step": 3497 + }, + { + "epoch": 0.3680168332456602, + "grad_norm": 1.1575000286102295, + "learning_rate": 0.0001458433026449825, + "loss": 1.8767, + "step": 3498 + }, + { + "epoch": 0.3681220410310363, + "grad_norm": 1.5656603574752808, + "learning_rate": 0.0001458130144390673, + "loss": 1.8864, + "step": 3499 + }, + { + "epoch": 0.3682272488164124, + "grad_norm": 0.9591088891029358, + "learning_rate": 0.0001457827209130376, + "loss": 1.8874, + "step": 3500 + }, + { + "epoch": 0.36833245660178854, + "grad_norm": 1.8865973949432373, + "learning_rate": 0.00014575242207041128, + "loss": 1.846, + "step": 3501 + }, + { + "epoch": 0.36843766438716463, + "grad_norm": 1.3061480522155762, + "learning_rate": 0.00014572211791470685, + "loss": 2.1671, + "step": 3502 + }, + { + "epoch": 0.3685428721725408, + "grad_norm": 1.3023380041122437, + "learning_rate": 0.00014569180844944344, + "loss": 1.8949, + "step": 3503 + }, + { + "epoch": 0.3686480799579169, + "grad_norm": 1.3070828914642334, + "learning_rate": 0.00014566149367814074, + "loss": 1.9491, + "step": 3504 + }, + { + "epoch": 0.368753287743293, + "grad_norm": 1.1962894201278687, + "learning_rate": 0.00014563117360431914, + "loss": 1.5836, + "step": 3505 + }, + { + "epoch": 0.3688584955286691, + "grad_norm": 1.190346121788025, + "learning_rate": 0.00014560084823149965, + "loss": 2.0028, + "step": 3506 + }, + { + "epoch": 0.36896370331404527, + "grad_norm": 1.0240740776062012, + "learning_rate": 0.00014557051756320378, + "loss": 1.9108, + "step": 3507 + }, + { + "epoch": 0.36906891109942136, + "grad_norm": 1.517669916152954, + "learning_rate": 0.0001455401816029538, + "loss": 1.7805, + "step": 3508 + }, + { + "epoch": 0.36917411888479745, + "grad_norm": 1.1098333597183228, + "learning_rate": 0.00014550984035427243, + "loss": 1.9193, + "step": 3509 + }, + { + "epoch": 0.3692793266701736, + "grad_norm": 1.9058518409729004, + "learning_rate": 0.00014547949382068322, + "loss": 2.471, + "step": 3510 + }, + { + "epoch": 0.3693845344555497, + "grad_norm": 0.9477731585502625, + "learning_rate": 0.0001454491420057101, + "loss": 1.2372, + "step": 3511 + }, + { + "epoch": 0.36948974224092584, + "grad_norm": 2.694037437438965, + "learning_rate": 0.00014541878491287783, + "loss": 2.3677, + "step": 3512 + }, + { + "epoch": 0.36959495002630194, + "grad_norm": 0.9963229298591614, + "learning_rate": 0.0001453884225457116, + "loss": 1.8101, + "step": 3513 + }, + { + "epoch": 0.3697001578116781, + "grad_norm": 1.3422759771347046, + "learning_rate": 0.00014535805490773732, + "loss": 1.9981, + "step": 3514 + }, + { + "epoch": 0.3698053655970542, + "grad_norm": 0.9793868660926819, + "learning_rate": 0.0001453276820024815, + "loss": 2.0641, + "step": 3515 + }, + { + "epoch": 0.3699105733824303, + "grad_norm": 1.3728710412979126, + "learning_rate": 0.0001452973038334712, + "loss": 2.2349, + "step": 3516 + }, + { + "epoch": 0.3700157811678064, + "grad_norm": 1.3043874502182007, + "learning_rate": 0.0001452669204042342, + "loss": 2.091, + "step": 3517 + }, + { + "epoch": 0.3701209889531825, + "grad_norm": 1.4674620628356934, + "learning_rate": 0.0001452365317182988, + "loss": 1.8651, + "step": 3518 + }, + { + "epoch": 0.37022619673855867, + "grad_norm": 1.3150261640548706, + "learning_rate": 0.00014520613777919392, + "loss": 1.8597, + "step": 3519 + }, + { + "epoch": 0.37033140452393476, + "grad_norm": 1.2791160345077515, + "learning_rate": 0.00014517573859044907, + "loss": 1.9082, + "step": 3520 + }, + { + "epoch": 0.3704366123093109, + "grad_norm": 1.371948480606079, + "learning_rate": 0.00014514533415559453, + "loss": 1.6619, + "step": 3521 + }, + { + "epoch": 0.370541820094687, + "grad_norm": 1.731662392616272, + "learning_rate": 0.00014511492447816097, + "loss": 2.1011, + "step": 3522 + }, + { + "epoch": 0.37064702788006315, + "grad_norm": 1.2484428882598877, + "learning_rate": 0.0001450845095616798, + "loss": 1.4152, + "step": 3523 + }, + { + "epoch": 0.37075223566543924, + "grad_norm": 1.5884274244308472, + "learning_rate": 0.00014505408940968296, + "loss": 2.0706, + "step": 3524 + }, + { + "epoch": 0.37085744345081534, + "grad_norm": 1.000995397567749, + "learning_rate": 0.00014502366402570309, + "loss": 2.1573, + "step": 3525 + }, + { + "epoch": 0.3709626512361915, + "grad_norm": 0.8821168541908264, + "learning_rate": 0.00014499323341327338, + "loss": 1.902, + "step": 3526 + }, + { + "epoch": 0.3710678590215676, + "grad_norm": 1.4428273439407349, + "learning_rate": 0.00014496279757592766, + "loss": 2.1331, + "step": 3527 + }, + { + "epoch": 0.37117306680694373, + "grad_norm": 1.5744878053665161, + "learning_rate": 0.00014493235651720027, + "loss": 1.7205, + "step": 3528 + }, + { + "epoch": 0.3712782745923198, + "grad_norm": 1.5465056896209717, + "learning_rate": 0.00014490191024062632, + "loss": 1.5546, + "step": 3529 + }, + { + "epoch": 0.371383482377696, + "grad_norm": 1.1197417974472046, + "learning_rate": 0.00014487145874974135, + "loss": 1.3374, + "step": 3530 + }, + { + "epoch": 0.37148869016307207, + "grad_norm": 0.9378413558006287, + "learning_rate": 0.00014484100204808167, + "loss": 2.3889, + "step": 3531 + }, + { + "epoch": 0.37159389794844816, + "grad_norm": 1.0312168598175049, + "learning_rate": 0.00014481054013918408, + "loss": 2.095, + "step": 3532 + }, + { + "epoch": 0.3716991057338243, + "grad_norm": 1.6550025939941406, + "learning_rate": 0.00014478007302658598, + "loss": 1.6941, + "step": 3533 + }, + { + "epoch": 0.3718043135192004, + "grad_norm": 1.5724120140075684, + "learning_rate": 0.0001447496007138255, + "loss": 2.3644, + "step": 3534 + }, + { + "epoch": 0.37190952130457655, + "grad_norm": 1.3543592691421509, + "learning_rate": 0.00014471912320444122, + "loss": 2.1931, + "step": 3535 + }, + { + "epoch": 0.37201472908995264, + "grad_norm": 1.7770755290985107, + "learning_rate": 0.00014468864050197242, + "loss": 1.5458, + "step": 3536 + }, + { + "epoch": 0.3721199368753288, + "grad_norm": 1.3519712686538696, + "learning_rate": 0.00014465815260995894, + "loss": 2.119, + "step": 3537 + }, + { + "epoch": 0.3722251446607049, + "grad_norm": 2.1993541717529297, + "learning_rate": 0.0001446276595319413, + "loss": 1.3163, + "step": 3538 + }, + { + "epoch": 0.37233035244608104, + "grad_norm": 1.6572602987289429, + "learning_rate": 0.00014459716127146049, + "loss": 1.7625, + "step": 3539 + }, + { + "epoch": 0.37243556023145713, + "grad_norm": 1.320532202720642, + "learning_rate": 0.0001445666578320582, + "loss": 1.377, + "step": 3540 + }, + { + "epoch": 0.3725407680168332, + "grad_norm": 1.6189240217208862, + "learning_rate": 0.00014453614921727668, + "loss": 1.7391, + "step": 3541 + }, + { + "epoch": 0.3726459758022094, + "grad_norm": 1.0008089542388916, + "learning_rate": 0.00014450563543065881, + "loss": 1.8331, + "step": 3542 + }, + { + "epoch": 0.37275118358758547, + "grad_norm": 1.0552862882614136, + "learning_rate": 0.00014447511647574805, + "loss": 2.0291, + "step": 3543 + }, + { + "epoch": 0.3728563913729616, + "grad_norm": 1.1746324300765991, + "learning_rate": 0.00014444459235608847, + "loss": 2.281, + "step": 3544 + }, + { + "epoch": 0.3729615991583377, + "grad_norm": 1.2236127853393555, + "learning_rate": 0.00014441406307522475, + "loss": 1.8409, + "step": 3545 + }, + { + "epoch": 0.37306680694371386, + "grad_norm": 1.948947548866272, + "learning_rate": 0.0001443835286367021, + "loss": 1.3529, + "step": 3546 + }, + { + "epoch": 0.37317201472908995, + "grad_norm": 1.3793987035751343, + "learning_rate": 0.00014435298904406642, + "loss": 1.852, + "step": 3547 + }, + { + "epoch": 0.37327722251446604, + "grad_norm": 0.9412884712219238, + "learning_rate": 0.00014432244430086423, + "loss": 2.2095, + "step": 3548 + }, + { + "epoch": 0.3733824302998422, + "grad_norm": 1.2927563190460205, + "learning_rate": 0.00014429189441064248, + "loss": 1.9944, + "step": 3549 + }, + { + "epoch": 0.3734876380852183, + "grad_norm": 1.234215497970581, + "learning_rate": 0.00014426133937694887, + "loss": 1.7411, + "step": 3550 + }, + { + "epoch": 0.37359284587059444, + "grad_norm": 1.1995365619659424, + "learning_rate": 0.00014423077920333173, + "loss": 2.0875, + "step": 3551 + }, + { + "epoch": 0.37369805365597053, + "grad_norm": 1.4042108058929443, + "learning_rate": 0.00014420021389333982, + "loss": 2.1245, + "step": 3552 + }, + { + "epoch": 0.3738032614413467, + "grad_norm": 1.579077959060669, + "learning_rate": 0.0001441696434505226, + "loss": 1.2663, + "step": 3553 + }, + { + "epoch": 0.3739084692267228, + "grad_norm": 1.5505826473236084, + "learning_rate": 0.00014413906787843014, + "loss": 1.8943, + "step": 3554 + }, + { + "epoch": 0.3740136770120989, + "grad_norm": 1.1203827857971191, + "learning_rate": 0.00014410848718061312, + "loss": 1.9703, + "step": 3555 + }, + { + "epoch": 0.374118884797475, + "grad_norm": 1.4529035091400146, + "learning_rate": 0.00014407790136062267, + "loss": 2.0986, + "step": 3556 + }, + { + "epoch": 0.3742240925828511, + "grad_norm": 2.0236105918884277, + "learning_rate": 0.0001440473104220107, + "loss": 1.363, + "step": 3557 + }, + { + "epoch": 0.37432930036822726, + "grad_norm": 1.0123807191848755, + "learning_rate": 0.0001440167143683296, + "loss": 1.7753, + "step": 3558 + }, + { + "epoch": 0.37443450815360335, + "grad_norm": 0.9799262285232544, + "learning_rate": 0.00014398611320313244, + "loss": 1.4174, + "step": 3559 + }, + { + "epoch": 0.3745397159389795, + "grad_norm": 1.738737940788269, + "learning_rate": 0.00014395550692997277, + "loss": 1.9, + "step": 3560 + }, + { + "epoch": 0.3746449237243556, + "grad_norm": 1.3359789848327637, + "learning_rate": 0.00014392489555240486, + "loss": 1.7223, + "step": 3561 + }, + { + "epoch": 0.37475013150973174, + "grad_norm": 2.408348798751831, + "learning_rate": 0.00014389427907398342, + "loss": 1.6607, + "step": 3562 + }, + { + "epoch": 0.37485533929510784, + "grad_norm": 2.228571891784668, + "learning_rate": 0.0001438636574982639, + "loss": 1.2856, + "step": 3563 + }, + { + "epoch": 0.37496054708048393, + "grad_norm": 1.1137131452560425, + "learning_rate": 0.0001438330308288023, + "loss": 2.1641, + "step": 3564 + }, + { + "epoch": 0.3750657548658601, + "grad_norm": 1.3347491025924683, + "learning_rate": 0.00014380239906915514, + "loss": 2.0624, + "step": 3565 + }, + { + "epoch": 0.3751709626512362, + "grad_norm": 1.4927902221679688, + "learning_rate": 0.00014377176222287965, + "loss": 1.6468, + "step": 3566 + }, + { + "epoch": 0.3752761704366123, + "grad_norm": 1.1425906419754028, + "learning_rate": 0.0001437411202935335, + "loss": 2.0824, + "step": 3567 + }, + { + "epoch": 0.3753813782219884, + "grad_norm": 1.4617263078689575, + "learning_rate": 0.00014371047328467511, + "loss": 2.1863, + "step": 3568 + }, + { + "epoch": 0.37548658600736456, + "grad_norm": 1.414243459701538, + "learning_rate": 0.00014367982119986342, + "loss": 1.8408, + "step": 3569 + }, + { + "epoch": 0.37559179379274066, + "grad_norm": 1.367104172706604, + "learning_rate": 0.00014364916404265788, + "loss": 1.6474, + "step": 3570 + }, + { + "epoch": 0.3756970015781168, + "grad_norm": 2.073486089706421, + "learning_rate": 0.0001436185018166187, + "loss": 1.8869, + "step": 3571 + }, + { + "epoch": 0.3758022093634929, + "grad_norm": 0.7273818850517273, + "learning_rate": 0.0001435878345253065, + "loss": 1.4154, + "step": 3572 + }, + { + "epoch": 0.375907417148869, + "grad_norm": 1.1615116596221924, + "learning_rate": 0.00014355716217228265, + "loss": 2.0725, + "step": 3573 + }, + { + "epoch": 0.37601262493424514, + "grad_norm": 1.9539501667022705, + "learning_rate": 0.00014352648476110896, + "loss": 1.8466, + "step": 3574 + }, + { + "epoch": 0.37611783271962124, + "grad_norm": 1.2255717515945435, + "learning_rate": 0.0001434958022953479, + "loss": 2.1784, + "step": 3575 + }, + { + "epoch": 0.3762230405049974, + "grad_norm": 1.502297282218933, + "learning_rate": 0.00014346511477856259, + "loss": 1.8617, + "step": 3576 + }, + { + "epoch": 0.3763282482903735, + "grad_norm": 1.4873766899108887, + "learning_rate": 0.0001434344222143166, + "loss": 2.0098, + "step": 3577 + }, + { + "epoch": 0.37643345607574963, + "grad_norm": 1.7163630723953247, + "learning_rate": 0.0001434037246061742, + "loss": 2.5738, + "step": 3578 + }, + { + "epoch": 0.3765386638611257, + "grad_norm": 1.179367184638977, + "learning_rate": 0.0001433730219577002, + "loss": 1.5599, + "step": 3579 + }, + { + "epoch": 0.3766438716465018, + "grad_norm": 1.8585902452468872, + "learning_rate": 0.00014334231427245994, + "loss": 1.9643, + "step": 3580 + }, + { + "epoch": 0.37674907943187796, + "grad_norm": 1.168847918510437, + "learning_rate": 0.00014331160155401948, + "loss": 2.2399, + "step": 3581 + }, + { + "epoch": 0.37685428721725406, + "grad_norm": 1.2563929557800293, + "learning_rate": 0.00014328088380594534, + "loss": 2.0344, + "step": 3582 + }, + { + "epoch": 0.3769594950026302, + "grad_norm": 1.3818440437316895, + "learning_rate": 0.0001432501610318047, + "loss": 1.6415, + "step": 3583 + }, + { + "epoch": 0.3770647027880063, + "grad_norm": 1.4234402179718018, + "learning_rate": 0.00014321943323516526, + "loss": 1.7941, + "step": 3584 + }, + { + "epoch": 0.37716991057338245, + "grad_norm": 1.5692628622055054, + "learning_rate": 0.00014318870041959538, + "loss": 1.653, + "step": 3585 + }, + { + "epoch": 0.37727511835875854, + "grad_norm": 1.3026686906814575, + "learning_rate": 0.00014315796258866393, + "loss": 2.1109, + "step": 3586 + }, + { + "epoch": 0.3773803261441347, + "grad_norm": 1.237284541130066, + "learning_rate": 0.00014312721974594038, + "loss": 1.7561, + "step": 3587 + }, + { + "epoch": 0.3774855339295108, + "grad_norm": 1.4320054054260254, + "learning_rate": 0.00014309647189499481, + "loss": 2.1673, + "step": 3588 + }, + { + "epoch": 0.3775907417148869, + "grad_norm": 1.15114164352417, + "learning_rate": 0.0001430657190393979, + "loss": 1.7654, + "step": 3589 + }, + { + "epoch": 0.37769594950026303, + "grad_norm": 1.3742176294326782, + "learning_rate": 0.00014303496118272084, + "loss": 1.8697, + "step": 3590 + }, + { + "epoch": 0.3778011572856391, + "grad_norm": 1.233818531036377, + "learning_rate": 0.00014300419832853544, + "loss": 1.5884, + "step": 3591 + }, + { + "epoch": 0.37790636507101527, + "grad_norm": 1.7550405263900757, + "learning_rate": 0.0001429734304804141, + "loss": 1.99, + "step": 3592 + }, + { + "epoch": 0.37801157285639136, + "grad_norm": 1.5846307277679443, + "learning_rate": 0.0001429426576419298, + "loss": 1.879, + "step": 3593 + }, + { + "epoch": 0.3781167806417675, + "grad_norm": 1.696427822113037, + "learning_rate": 0.00014291187981665607, + "loss": 1.921, + "step": 3594 + }, + { + "epoch": 0.3782219884271436, + "grad_norm": 1.756914496421814, + "learning_rate": 0.00014288109700816705, + "loss": 1.7842, + "step": 3595 + }, + { + "epoch": 0.3783271962125197, + "grad_norm": 1.9030567407608032, + "learning_rate": 0.0001428503092200374, + "loss": 1.8851, + "step": 3596 + }, + { + "epoch": 0.37843240399789585, + "grad_norm": 0.8810452818870544, + "learning_rate": 0.0001428195164558425, + "loss": 1.7901, + "step": 3597 + }, + { + "epoch": 0.37853761178327194, + "grad_norm": 1.4262439012527466, + "learning_rate": 0.00014278871871915814, + "loss": 1.9453, + "step": 3598 + }, + { + "epoch": 0.3786428195686481, + "grad_norm": 1.6858327388763428, + "learning_rate": 0.00014275791601356074, + "loss": 1.8671, + "step": 3599 + }, + { + "epoch": 0.3787480273540242, + "grad_norm": 0.9039170742034912, + "learning_rate": 0.0001427271083426274, + "loss": 2.199, + "step": 3600 + }, + { + "epoch": 0.37885323513940034, + "grad_norm": 1.2409204244613647, + "learning_rate": 0.00014269629570993564, + "loss": 1.6864, + "step": 3601 + }, + { + "epoch": 0.37895844292477643, + "grad_norm": 1.6588237285614014, + "learning_rate": 0.00014266547811906364, + "loss": 2.1573, + "step": 3602 + }, + { + "epoch": 0.3790636507101526, + "grad_norm": 1.5845474004745483, + "learning_rate": 0.00014263465557359017, + "loss": 1.7634, + "step": 3603 + }, + { + "epoch": 0.37916885849552867, + "grad_norm": 1.337897539138794, + "learning_rate": 0.00014260382807709457, + "loss": 2.0049, + "step": 3604 + }, + { + "epoch": 0.37927406628090476, + "grad_norm": 1.3715331554412842, + "learning_rate": 0.00014257299563315667, + "loss": 1.8703, + "step": 3605 + }, + { + "epoch": 0.3793792740662809, + "grad_norm": 0.8518646955490112, + "learning_rate": 0.00014254215824535698, + "loss": 2.0015, + "step": 3606 + }, + { + "epoch": 0.379484481851657, + "grad_norm": 0.998729407787323, + "learning_rate": 0.00014251131591727656, + "loss": 1.7552, + "step": 3607 + }, + { + "epoch": 0.37958968963703316, + "grad_norm": 1.5885865688323975, + "learning_rate": 0.00014248046865249697, + "loss": 1.7364, + "step": 3608 + }, + { + "epoch": 0.37969489742240925, + "grad_norm": 1.1192876100540161, + "learning_rate": 0.00014244961645460048, + "loss": 1.9686, + "step": 3609 + }, + { + "epoch": 0.3798001052077854, + "grad_norm": 1.3762092590332031, + "learning_rate": 0.0001424187593271698, + "loss": 1.668, + "step": 3610 + }, + { + "epoch": 0.3799053129931615, + "grad_norm": 1.3695660829544067, + "learning_rate": 0.0001423878972737883, + "loss": 1.939, + "step": 3611 + }, + { + "epoch": 0.3800105207785376, + "grad_norm": 1.5928248167037964, + "learning_rate": 0.00014235703029803984, + "loss": 1.6483, + "step": 3612 + }, + { + "epoch": 0.38011572856391374, + "grad_norm": 1.027282953262329, + "learning_rate": 0.00014232615840350894, + "loss": 1.3274, + "step": 3613 + }, + { + "epoch": 0.38022093634928983, + "grad_norm": 1.2726476192474365, + "learning_rate": 0.00014229528159378065, + "loss": 2.1561, + "step": 3614 + }, + { + "epoch": 0.380326144134666, + "grad_norm": 1.3544089794158936, + "learning_rate": 0.00014226439987244057, + "loss": 1.5668, + "step": 3615 + }, + { + "epoch": 0.38043135192004207, + "grad_norm": 1.9423713684082031, + "learning_rate": 0.00014223351324307493, + "loss": 1.7549, + "step": 3616 + }, + { + "epoch": 0.3805365597054182, + "grad_norm": 0.9007306694984436, + "learning_rate": 0.00014220262170927046, + "loss": 2.1485, + "step": 3617 + }, + { + "epoch": 0.3806417674907943, + "grad_norm": 1.2513463497161865, + "learning_rate": 0.0001421717252746145, + "loss": 1.9919, + "step": 3618 + }, + { + "epoch": 0.38074697527617046, + "grad_norm": 1.4759798049926758, + "learning_rate": 0.00014214082394269493, + "loss": 2.2475, + "step": 3619 + }, + { + "epoch": 0.38085218306154656, + "grad_norm": 1.4133992195129395, + "learning_rate": 0.00014210991771710025, + "loss": 2.229, + "step": 3620 + }, + { + "epoch": 0.38095739084692265, + "grad_norm": 1.1135759353637695, + "learning_rate": 0.0001420790066014195, + "loss": 1.7897, + "step": 3621 + }, + { + "epoch": 0.3810625986322988, + "grad_norm": 1.0141302347183228, + "learning_rate": 0.00014204809059924228, + "loss": 1.5876, + "step": 3622 + }, + { + "epoch": 0.3811678064176749, + "grad_norm": 1.2272731065750122, + "learning_rate": 0.00014201716971415875, + "loss": 1.9002, + "step": 3623 + }, + { + "epoch": 0.38127301420305104, + "grad_norm": 1.4292423725128174, + "learning_rate": 0.00014198624394975968, + "loss": 1.5974, + "step": 3624 + }, + { + "epoch": 0.38137822198842714, + "grad_norm": 1.052093744277954, + "learning_rate": 0.00014195531330963635, + "loss": 1.9424, + "step": 3625 + }, + { + "epoch": 0.3814834297738033, + "grad_norm": 1.377421498298645, + "learning_rate": 0.00014192437779738062, + "loss": 2.4788, + "step": 3626 + }, + { + "epoch": 0.3815886375591794, + "grad_norm": 1.4875088930130005, + "learning_rate": 0.00014189343741658497, + "loss": 1.8751, + "step": 3627 + }, + { + "epoch": 0.38169384534455547, + "grad_norm": 1.2840195894241333, + "learning_rate": 0.0001418624921708424, + "loss": 1.8626, + "step": 3628 + }, + { + "epoch": 0.3817990531299316, + "grad_norm": 1.7244127988815308, + "learning_rate": 0.00014183154206374643, + "loss": 1.7683, + "step": 3629 + }, + { + "epoch": 0.3819042609153077, + "grad_norm": 1.5196006298065186, + "learning_rate": 0.0001418005870988912, + "loss": 2.0749, + "step": 3630 + }, + { + "epoch": 0.38200946870068386, + "grad_norm": 1.3021456003189087, + "learning_rate": 0.0001417696272798715, + "loss": 2.3025, + "step": 3631 + }, + { + "epoch": 0.38211467648605996, + "grad_norm": 0.9717100262641907, + "learning_rate": 0.0001417386626102825, + "loss": 2.2728, + "step": 3632 + }, + { + "epoch": 0.3822198842714361, + "grad_norm": 1.2531250715255737, + "learning_rate": 0.00014170769309372006, + "loss": 2.11, + "step": 3633 + }, + { + "epoch": 0.3823250920568122, + "grad_norm": 3.0939295291900635, + "learning_rate": 0.00014167671873378056, + "loss": 2.119, + "step": 3634 + }, + { + "epoch": 0.38243029984218835, + "grad_norm": 1.3304295539855957, + "learning_rate": 0.00014164573953406095, + "loss": 1.6821, + "step": 3635 + }, + { + "epoch": 0.38253550762756444, + "grad_norm": 1.434133768081665, + "learning_rate": 0.00014161475549815877, + "loss": 2.2729, + "step": 3636 + }, + { + "epoch": 0.38264071541294054, + "grad_norm": 1.4510753154754639, + "learning_rate": 0.00014158376662967202, + "loss": 2.2107, + "step": 3637 + }, + { + "epoch": 0.3827459231983167, + "grad_norm": 1.247135877609253, + "learning_rate": 0.0001415527729321994, + "loss": 1.846, + "step": 3638 + }, + { + "epoch": 0.3828511309836928, + "grad_norm": 0.9109544157981873, + "learning_rate": 0.00014152177440934012, + "loss": 1.9013, + "step": 3639 + }, + { + "epoch": 0.3829563387690689, + "grad_norm": 0.9982622265815735, + "learning_rate": 0.00014149077106469387, + "loss": 2.0598, + "step": 3640 + }, + { + "epoch": 0.383061546554445, + "grad_norm": 1.1758394241333008, + "learning_rate": 0.00014145976290186102, + "loss": 2.3168, + "step": 3641 + }, + { + "epoch": 0.38316675433982117, + "grad_norm": 0.8653580546379089, + "learning_rate": 0.00014142874992444243, + "loss": 1.5041, + "step": 3642 + }, + { + "epoch": 0.38327196212519726, + "grad_norm": 1.015748143196106, + "learning_rate": 0.0001413977321360395, + "loss": 1.9884, + "step": 3643 + }, + { + "epoch": 0.38337716991057336, + "grad_norm": 1.737280249595642, + "learning_rate": 0.00014136670954025427, + "loss": 1.7972, + "step": 3644 + }, + { + "epoch": 0.3834823776959495, + "grad_norm": 2.5558969974517822, + "learning_rate": 0.0001413356821406893, + "loss": 2.1549, + "step": 3645 + }, + { + "epoch": 0.3835875854813256, + "grad_norm": 1.4405326843261719, + "learning_rate": 0.0001413046499409477, + "loss": 1.9342, + "step": 3646 + }, + { + "epoch": 0.38369279326670175, + "grad_norm": 1.235929012298584, + "learning_rate": 0.0001412736129446331, + "loss": 2.0326, + "step": 3647 + }, + { + "epoch": 0.38379800105207784, + "grad_norm": 1.8266171216964722, + "learning_rate": 0.0001412425711553497, + "loss": 1.9242, + "step": 3648 + }, + { + "epoch": 0.383903208837454, + "grad_norm": 1.1843401193618774, + "learning_rate": 0.00014121152457670234, + "loss": 2.4682, + "step": 3649 + }, + { + "epoch": 0.3840084166228301, + "grad_norm": 1.2029701471328735, + "learning_rate": 0.00014118047321229633, + "loss": 1.5255, + "step": 3650 + }, + { + "epoch": 0.38411362440820623, + "grad_norm": 1.264588713645935, + "learning_rate": 0.00014114941706573758, + "loss": 1.5459, + "step": 3651 + }, + { + "epoch": 0.3842188321935823, + "grad_norm": 1.7081525325775146, + "learning_rate": 0.00014111835614063253, + "loss": 2.1296, + "step": 3652 + }, + { + "epoch": 0.3843240399789584, + "grad_norm": 0.9113522171974182, + "learning_rate": 0.0001410872904405882, + "loss": 1.5649, + "step": 3653 + }, + { + "epoch": 0.38442924776433457, + "grad_norm": 1.2940986156463623, + "learning_rate": 0.0001410562199692121, + "loss": 1.6233, + "step": 3654 + }, + { + "epoch": 0.38453445554971066, + "grad_norm": 1.201011300086975, + "learning_rate": 0.00014102514473011233, + "loss": 1.9928, + "step": 3655 + }, + { + "epoch": 0.3846396633350868, + "grad_norm": 1.310477614402771, + "learning_rate": 0.0001409940647268977, + "loss": 1.7539, + "step": 3656 + }, + { + "epoch": 0.3847448711204629, + "grad_norm": 0.9373623132705688, + "learning_rate": 0.00014096297996317724, + "loss": 1.9289, + "step": 3657 + }, + { + "epoch": 0.38485007890583905, + "grad_norm": 1.0480597019195557, + "learning_rate": 0.00014093189044256084, + "loss": 2.0449, + "step": 3658 + }, + { + "epoch": 0.38495528669121515, + "grad_norm": 1.2501201629638672, + "learning_rate": 0.00014090079616865882, + "loss": 1.7853, + "step": 3659 + }, + { + "epoch": 0.38506049447659124, + "grad_norm": 1.2125840187072754, + "learning_rate": 0.00014086969714508196, + "loss": 1.3741, + "step": 3660 + }, + { + "epoch": 0.3851657022619674, + "grad_norm": 0.8791213631629944, + "learning_rate": 0.00014083859337544175, + "loss": 1.7084, + "step": 3661 + }, + { + "epoch": 0.3852709100473435, + "grad_norm": 1.8772777318954468, + "learning_rate": 0.00014080748486335022, + "loss": 1.6383, + "step": 3662 + }, + { + "epoch": 0.38537611783271963, + "grad_norm": 1.3970454931259155, + "learning_rate": 0.0001407763716124198, + "loss": 1.5493, + "step": 3663 + }, + { + "epoch": 0.3854813256180957, + "grad_norm": 1.5599162578582764, + "learning_rate": 0.00014074525362626366, + "loss": 2.0028, + "step": 3664 + }, + { + "epoch": 0.3855865334034719, + "grad_norm": 1.0183049440383911, + "learning_rate": 0.00014071413090849534, + "loss": 2.151, + "step": 3665 + }, + { + "epoch": 0.38569174118884797, + "grad_norm": 1.4794666767120361, + "learning_rate": 0.0001406830034627291, + "loss": 1.8065, + "step": 3666 + }, + { + "epoch": 0.3857969489742241, + "grad_norm": 1.77559232711792, + "learning_rate": 0.00014065187129257964, + "loss": 2.3033, + "step": 3667 + }, + { + "epoch": 0.3859021567596002, + "grad_norm": 2.2555582523345947, + "learning_rate": 0.00014062073440166222, + "loss": 1.48, + "step": 3668 + }, + { + "epoch": 0.3860073645449763, + "grad_norm": 0.9742986559867859, + "learning_rate": 0.00014058959279359266, + "loss": 1.6655, + "step": 3669 + }, + { + "epoch": 0.38611257233035245, + "grad_norm": 1.42844557762146, + "learning_rate": 0.00014055844647198738, + "loss": 2.1812, + "step": 3670 + }, + { + "epoch": 0.38621778011572855, + "grad_norm": 1.0865979194641113, + "learning_rate": 0.00014052729544046326, + "loss": 1.8439, + "step": 3671 + }, + { + "epoch": 0.3863229879011047, + "grad_norm": 1.070984125137329, + "learning_rate": 0.00014049613970263774, + "loss": 1.7071, + "step": 3672 + }, + { + "epoch": 0.3864281956864808, + "grad_norm": 1.4193851947784424, + "learning_rate": 0.0001404649792621289, + "loss": 2.1123, + "step": 3673 + }, + { + "epoch": 0.38653340347185694, + "grad_norm": 1.3338897228240967, + "learning_rate": 0.00014043381412255526, + "loss": 2.0661, + "step": 3674 + }, + { + "epoch": 0.38663861125723303, + "grad_norm": 1.063714861869812, + "learning_rate": 0.00014040264428753592, + "loss": 1.5893, + "step": 3675 + }, + { + "epoch": 0.3867438190426091, + "grad_norm": 1.06549870967865, + "learning_rate": 0.00014037146976069055, + "loss": 1.7869, + "step": 3676 + }, + { + "epoch": 0.3868490268279853, + "grad_norm": 1.382040023803711, + "learning_rate": 0.00014034029054563933, + "loss": 1.5798, + "step": 3677 + }, + { + "epoch": 0.38695423461336137, + "grad_norm": 1.4620635509490967, + "learning_rate": 0.000140309106646003, + "loss": 1.7959, + "step": 3678 + }, + { + "epoch": 0.3870594423987375, + "grad_norm": 1.2155455350875854, + "learning_rate": 0.0001402779180654029, + "loss": 1.5766, + "step": 3679 + }, + { + "epoch": 0.3871646501841136, + "grad_norm": 1.3501040935516357, + "learning_rate": 0.00014024672480746078, + "loss": 1.7963, + "step": 3680 + }, + { + "epoch": 0.38726985796948976, + "grad_norm": 1.1991915702819824, + "learning_rate": 0.00014021552687579902, + "loss": 1.9882, + "step": 3681 + }, + { + "epoch": 0.38737506575486585, + "grad_norm": 2.334041118621826, + "learning_rate": 0.00014018432427404055, + "loss": 1.3908, + "step": 3682 + }, + { + "epoch": 0.387480273540242, + "grad_norm": 1.5623788833618164, + "learning_rate": 0.0001401531170058088, + "loss": 1.2739, + "step": 3683 + }, + { + "epoch": 0.3875854813256181, + "grad_norm": 1.153784155845642, + "learning_rate": 0.00014012190507472783, + "loss": 1.6715, + "step": 3684 + }, + { + "epoch": 0.3876906891109942, + "grad_norm": 0.9832100868225098, + "learning_rate": 0.00014009068848442214, + "loss": 2.1983, + "step": 3685 + }, + { + "epoch": 0.38779589689637034, + "grad_norm": 1.071271538734436, + "learning_rate": 0.0001400594672385168, + "loss": 2.1699, + "step": 3686 + }, + { + "epoch": 0.38790110468174643, + "grad_norm": 1.600494146347046, + "learning_rate": 0.00014002824134063747, + "loss": 1.9091, + "step": 3687 + }, + { + "epoch": 0.3880063124671226, + "grad_norm": 1.5296038389205933, + "learning_rate": 0.00013999701079441028, + "loss": 2.0367, + "step": 3688 + }, + { + "epoch": 0.3881115202524987, + "grad_norm": 1.072365403175354, + "learning_rate": 0.0001399657756034619, + "loss": 2.0738, + "step": 3689 + }, + { + "epoch": 0.3882167280378748, + "grad_norm": 1.1414501667022705, + "learning_rate": 0.00013993453577141964, + "loss": 1.5334, + "step": 3690 + }, + { + "epoch": 0.3883219358232509, + "grad_norm": 1.220701813697815, + "learning_rate": 0.00013990329130191123, + "loss": 1.9406, + "step": 3691 + }, + { + "epoch": 0.388427143608627, + "grad_norm": 1.4676032066345215, + "learning_rate": 0.000139872042198565, + "loss": 1.6478, + "step": 3692 + }, + { + "epoch": 0.38853235139400316, + "grad_norm": 2.2082393169403076, + "learning_rate": 0.0001398407884650098, + "loss": 2.5011, + "step": 3693 + }, + { + "epoch": 0.38863755917937925, + "grad_norm": 1.4031996726989746, + "learning_rate": 0.000139809530104875, + "loss": 1.8676, + "step": 3694 + }, + { + "epoch": 0.3887427669647554, + "grad_norm": 1.0090667009353638, + "learning_rate": 0.00013977826712179058, + "loss": 1.7266, + "step": 3695 + }, + { + "epoch": 0.3888479747501315, + "grad_norm": 1.4006050825119019, + "learning_rate": 0.000139746999519387, + "loss": 1.8296, + "step": 3696 + }, + { + "epoch": 0.38895318253550765, + "grad_norm": 1.0253705978393555, + "learning_rate": 0.00013971572730129525, + "loss": 1.7789, + "step": 3697 + }, + { + "epoch": 0.38905839032088374, + "grad_norm": 1.1264349222183228, + "learning_rate": 0.00013968445047114685, + "loss": 2.0904, + "step": 3698 + }, + { + "epoch": 0.3891635981062599, + "grad_norm": 1.5611605644226074, + "learning_rate": 0.0001396531690325739, + "loss": 2.101, + "step": 3699 + }, + { + "epoch": 0.389268805891636, + "grad_norm": 1.7648766040802002, + "learning_rate": 0.00013962188298920902, + "loss": 2.1288, + "step": 3700 + }, + { + "epoch": 0.3893740136770121, + "grad_norm": 2.0247011184692383, + "learning_rate": 0.00013959059234468536, + "loss": 1.7906, + "step": 3701 + }, + { + "epoch": 0.3894792214623882, + "grad_norm": 1.0505448579788208, + "learning_rate": 0.00013955929710263653, + "loss": 1.995, + "step": 3702 + }, + { + "epoch": 0.3895844292477643, + "grad_norm": 1.6498749256134033, + "learning_rate": 0.00013952799726669682, + "loss": 1.8126, + "step": 3703 + }, + { + "epoch": 0.38968963703314047, + "grad_norm": 1.861400842666626, + "learning_rate": 0.00013949669284050092, + "loss": 2.3192, + "step": 3704 + }, + { + "epoch": 0.38979484481851656, + "grad_norm": 1.1413242816925049, + "learning_rate": 0.00013946538382768418, + "loss": 1.4746, + "step": 3705 + }, + { + "epoch": 0.3899000526038927, + "grad_norm": 1.281286597251892, + "learning_rate": 0.00013943407023188234, + "loss": 2.3798, + "step": 3706 + }, + { + "epoch": 0.3900052603892688, + "grad_norm": 1.3880997896194458, + "learning_rate": 0.00013940275205673178, + "loss": 2.0053, + "step": 3707 + }, + { + "epoch": 0.3901104681746449, + "grad_norm": 1.1959868669509888, + "learning_rate": 0.0001393714293058694, + "loss": 2.0179, + "step": 3708 + }, + { + "epoch": 0.39021567596002105, + "grad_norm": 1.6288052797317505, + "learning_rate": 0.00013934010198293257, + "loss": 2.1802, + "step": 3709 + }, + { + "epoch": 0.39032088374539714, + "grad_norm": 1.229329228401184, + "learning_rate": 0.00013930877009155922, + "loss": 1.8899, + "step": 3710 + }, + { + "epoch": 0.3904260915307733, + "grad_norm": 1.2303109169006348, + "learning_rate": 0.00013927743363538787, + "loss": 1.6863, + "step": 3711 + }, + { + "epoch": 0.3905312993161494, + "grad_norm": 1.653576374053955, + "learning_rate": 0.0001392460926180575, + "loss": 1.765, + "step": 3712 + }, + { + "epoch": 0.39063650710152553, + "grad_norm": 1.6338762044906616, + "learning_rate": 0.0001392147470432076, + "loss": 2.0503, + "step": 3713 + }, + { + "epoch": 0.3907417148869016, + "grad_norm": 1.4691399335861206, + "learning_rate": 0.00013918339691447825, + "loss": 2.1288, + "step": 3714 + }, + { + "epoch": 0.3908469226722778, + "grad_norm": 1.0673540830612183, + "learning_rate": 0.0001391520422355101, + "loss": 1.8172, + "step": 3715 + }, + { + "epoch": 0.39095213045765387, + "grad_norm": 0.9647770524024963, + "learning_rate": 0.00013912068300994413, + "loss": 1.7418, + "step": 3716 + }, + { + "epoch": 0.39105733824302996, + "grad_norm": 1.7503970861434937, + "learning_rate": 0.0001390893192414221, + "loss": 1.78, + "step": 3717 + }, + { + "epoch": 0.3911625460284061, + "grad_norm": 0.7777706980705261, + "learning_rate": 0.00013905795093358615, + "loss": 1.9819, + "step": 3718 + }, + { + "epoch": 0.3912677538137822, + "grad_norm": 1.5159608125686646, + "learning_rate": 0.00013902657809007897, + "loss": 1.7175, + "step": 3719 + }, + { + "epoch": 0.39137296159915835, + "grad_norm": 1.442254662513733, + "learning_rate": 0.00013899520071454377, + "loss": 2.1266, + "step": 3720 + }, + { + "epoch": 0.39147816938453445, + "grad_norm": 1.2013967037200928, + "learning_rate": 0.00013896381881062437, + "loss": 1.9828, + "step": 3721 + }, + { + "epoch": 0.3915833771699106, + "grad_norm": 2.616184949874878, + "learning_rate": 0.00013893243238196495, + "loss": 1.7142, + "step": 3722 + }, + { + "epoch": 0.3916885849552867, + "grad_norm": 1.076119065284729, + "learning_rate": 0.0001389010414322104, + "loss": 2.2307, + "step": 3723 + }, + { + "epoch": 0.3917937927406628, + "grad_norm": 1.1124643087387085, + "learning_rate": 0.00013886964596500595, + "loss": 1.7956, + "step": 3724 + }, + { + "epoch": 0.39189900052603893, + "grad_norm": 1.8706750869750977, + "learning_rate": 0.00013883824598399756, + "loss": 2.3762, + "step": 3725 + }, + { + "epoch": 0.392004208311415, + "grad_norm": 1.2666409015655518, + "learning_rate": 0.00013880684149283152, + "loss": 1.5874, + "step": 3726 + }, + { + "epoch": 0.3921094160967912, + "grad_norm": 1.3697994947433472, + "learning_rate": 0.00013877543249515476, + "loss": 2.4001, + "step": 3727 + }, + { + "epoch": 0.39221462388216727, + "grad_norm": 1.538824200630188, + "learning_rate": 0.00013874401899461474, + "loss": 1.8197, + "step": 3728 + }, + { + "epoch": 0.3923198316675434, + "grad_norm": 1.4980809688568115, + "learning_rate": 0.00013871260099485936, + "loss": 1.434, + "step": 3729 + }, + { + "epoch": 0.3924250394529195, + "grad_norm": 1.348250150680542, + "learning_rate": 0.0001386811784995371, + "loss": 1.6379, + "step": 3730 + }, + { + "epoch": 0.39253024723829566, + "grad_norm": 2.0274887084960938, + "learning_rate": 0.00013864975151229697, + "loss": 1.7713, + "step": 3731 + }, + { + "epoch": 0.39263545502367175, + "grad_norm": 1.4459730386734009, + "learning_rate": 0.00013861832003678846, + "loss": 1.68, + "step": 3732 + }, + { + "epoch": 0.39274066280904785, + "grad_norm": 1.5965417623519897, + "learning_rate": 0.00013858688407666163, + "loss": 1.962, + "step": 3733 + }, + { + "epoch": 0.392845870594424, + "grad_norm": 1.5508756637573242, + "learning_rate": 0.00013855544363556698, + "loss": 2.1783, + "step": 3734 + }, + { + "epoch": 0.3929510783798001, + "grad_norm": 1.3883273601531982, + "learning_rate": 0.00013852399871715562, + "loss": 2.3433, + "step": 3735 + }, + { + "epoch": 0.39305628616517624, + "grad_norm": 1.741829752922058, + "learning_rate": 0.00013849254932507917, + "loss": 2.3758, + "step": 3736 + }, + { + "epoch": 0.39316149395055233, + "grad_norm": 1.35794198513031, + "learning_rate": 0.00013846109546298971, + "loss": 2.0261, + "step": 3737 + }, + { + "epoch": 0.3932667017359285, + "grad_norm": 1.0668870210647583, + "learning_rate": 0.00013842963713453987, + "loss": 2.2452, + "step": 3738 + }, + { + "epoch": 0.3933719095213046, + "grad_norm": 1.3212822675704956, + "learning_rate": 0.00013839817434338286, + "loss": 1.9696, + "step": 3739 + }, + { + "epoch": 0.39347711730668067, + "grad_norm": 0.7865563035011292, + "learning_rate": 0.00013836670709317225, + "loss": 1.6841, + "step": 3740 + }, + { + "epoch": 0.3935823250920568, + "grad_norm": 1.023016095161438, + "learning_rate": 0.0001383352353875623, + "loss": 2.1734, + "step": 3741 + }, + { + "epoch": 0.3936875328774329, + "grad_norm": 0.966575562953949, + "learning_rate": 0.00013830375923020772, + "loss": 1.735, + "step": 3742 + }, + { + "epoch": 0.39379274066280906, + "grad_norm": 1.255327582359314, + "learning_rate": 0.00013827227862476372, + "loss": 2.0901, + "step": 3743 + }, + { + "epoch": 0.39389794844818515, + "grad_norm": 1.4241955280303955, + "learning_rate": 0.00013824079357488598, + "loss": 1.5569, + "step": 3744 + }, + { + "epoch": 0.3940031562335613, + "grad_norm": 0.807941734790802, + "learning_rate": 0.00013820930408423086, + "loss": 1.9292, + "step": 3745 + }, + { + "epoch": 0.3941083640189374, + "grad_norm": 1.034824013710022, + "learning_rate": 0.00013817781015645507, + "loss": 2.1646, + "step": 3746 + }, + { + "epoch": 0.39421357180431355, + "grad_norm": 1.143803358078003, + "learning_rate": 0.00013814631179521588, + "loss": 2.2347, + "step": 3747 + }, + { + "epoch": 0.39431877958968964, + "grad_norm": 1.7627426385879517, + "learning_rate": 0.0001381148090041711, + "loss": 1.8837, + "step": 3748 + }, + { + "epoch": 0.39442398737506573, + "grad_norm": 1.153597354888916, + "learning_rate": 0.0001380833017869791, + "loss": 1.989, + "step": 3749 + }, + { + "epoch": 0.3945291951604419, + "grad_norm": 1.3962596654891968, + "learning_rate": 0.00013805179014729865, + "loss": 2.4745, + "step": 3750 + }, + { + "epoch": 0.394634402945818, + "grad_norm": 1.1619459390640259, + "learning_rate": 0.0001380202740887891, + "loss": 2.1719, + "step": 3751 + }, + { + "epoch": 0.3947396107311941, + "grad_norm": 1.4817026853561401, + "learning_rate": 0.00013798875361511033, + "loss": 2.6195, + "step": 3752 + }, + { + "epoch": 0.3948448185165702, + "grad_norm": 1.251517415046692, + "learning_rate": 0.00013795722872992272, + "loss": 1.9008, + "step": 3753 + }, + { + "epoch": 0.39495002630194637, + "grad_norm": 1.6343916654586792, + "learning_rate": 0.0001379256994368871, + "loss": 1.8265, + "step": 3754 + }, + { + "epoch": 0.39505523408732246, + "grad_norm": 1.4928406476974487, + "learning_rate": 0.0001378941657396649, + "loss": 2.0495, + "step": 3755 + }, + { + "epoch": 0.39516044187269855, + "grad_norm": 0.9346429109573364, + "learning_rate": 0.00013786262764191803, + "loss": 2.0691, + "step": 3756 + }, + { + "epoch": 0.3952656496580747, + "grad_norm": 1.5070823431015015, + "learning_rate": 0.00013783108514730884, + "loss": 1.6465, + "step": 3757 + }, + { + "epoch": 0.3953708574434508, + "grad_norm": 1.225148320198059, + "learning_rate": 0.00013779953825950034, + "loss": 1.3778, + "step": 3758 + }, + { + "epoch": 0.39547606522882695, + "grad_norm": 1.4228816032409668, + "learning_rate": 0.00013776798698215593, + "loss": 1.559, + "step": 3759 + }, + { + "epoch": 0.39558127301420304, + "grad_norm": 1.1593899726867676, + "learning_rate": 0.00013773643131893956, + "loss": 2.1666, + "step": 3760 + }, + { + "epoch": 0.3956864807995792, + "grad_norm": 1.2001755237579346, + "learning_rate": 0.00013770487127351568, + "loss": 1.5535, + "step": 3761 + }, + { + "epoch": 0.3957916885849553, + "grad_norm": 1.1403510570526123, + "learning_rate": 0.00013767330684954926, + "loss": 1.765, + "step": 3762 + }, + { + "epoch": 0.39589689637033143, + "grad_norm": 1.4081346988677979, + "learning_rate": 0.00013764173805070576, + "loss": 2.1388, + "step": 3763 + }, + { + "epoch": 0.3960021041557075, + "grad_norm": 0.9673276543617249, + "learning_rate": 0.00013761016488065118, + "loss": 2.1821, + "step": 3764 + }, + { + "epoch": 0.3961073119410836, + "grad_norm": 1.567577838897705, + "learning_rate": 0.00013757858734305203, + "loss": 2.0253, + "step": 3765 + }, + { + "epoch": 0.39621251972645977, + "grad_norm": 1.2986990213394165, + "learning_rate": 0.00013754700544157524, + "loss": 2.1731, + "step": 3766 + }, + { + "epoch": 0.39631772751183586, + "grad_norm": 1.9788020849227905, + "learning_rate": 0.00013751541917988836, + "loss": 1.8981, + "step": 3767 + }, + { + "epoch": 0.396422935297212, + "grad_norm": 1.5377720594406128, + "learning_rate": 0.0001374838285616594, + "loss": 2.0393, + "step": 3768 + }, + { + "epoch": 0.3965281430825881, + "grad_norm": 0.8659994006156921, + "learning_rate": 0.00013745223359055682, + "loss": 1.7419, + "step": 3769 + }, + { + "epoch": 0.39663335086796425, + "grad_norm": 1.1457507610321045, + "learning_rate": 0.0001374206342702497, + "loss": 2.1031, + "step": 3770 + }, + { + "epoch": 0.39673855865334035, + "grad_norm": 0.8939380645751953, + "learning_rate": 0.00013738903060440757, + "loss": 2.0608, + "step": 3771 + }, + { + "epoch": 0.39684376643871644, + "grad_norm": 1.0141644477844238, + "learning_rate": 0.0001373574225967004, + "loss": 1.9432, + "step": 3772 + }, + { + "epoch": 0.3969489742240926, + "grad_norm": 1.7639262676239014, + "learning_rate": 0.0001373258102507988, + "loss": 1.7746, + "step": 3773 + }, + { + "epoch": 0.3970541820094687, + "grad_norm": 1.8275938034057617, + "learning_rate": 0.00013729419357037372, + "loss": 1.9925, + "step": 3774 + }, + { + "epoch": 0.39715938979484483, + "grad_norm": 1.8225183486938477, + "learning_rate": 0.00013726257255909676, + "loss": 2.0511, + "step": 3775 + }, + { + "epoch": 0.3972645975802209, + "grad_norm": 1.1750774383544922, + "learning_rate": 0.00013723094722063996, + "loss": 2.0788, + "step": 3776 + }, + { + "epoch": 0.3973698053655971, + "grad_norm": 1.4258540868759155, + "learning_rate": 0.00013719931755867587, + "loss": 2.0758, + "step": 3777 + }, + { + "epoch": 0.39747501315097317, + "grad_norm": 0.9708757400512695, + "learning_rate": 0.0001371676835768775, + "loss": 1.8594, + "step": 3778 + }, + { + "epoch": 0.3975802209363493, + "grad_norm": 1.1266876459121704, + "learning_rate": 0.00013713604527891844, + "loss": 1.6579, + "step": 3779 + }, + { + "epoch": 0.3976854287217254, + "grad_norm": 2.256415605545044, + "learning_rate": 0.00013710440266847274, + "loss": 2.2314, + "step": 3780 + }, + { + "epoch": 0.3977906365071015, + "grad_norm": 1.4716403484344482, + "learning_rate": 0.0001370727557492149, + "loss": 1.7153, + "step": 3781 + }, + { + "epoch": 0.39789584429247765, + "grad_norm": 1.2496949434280396, + "learning_rate": 0.00013704110452482005, + "loss": 2.0778, + "step": 3782 + }, + { + "epoch": 0.39800105207785375, + "grad_norm": 0.9806689620018005, + "learning_rate": 0.0001370094489989637, + "loss": 2.1003, + "step": 3783 + }, + { + "epoch": 0.3981062598632299, + "grad_norm": 1.0199190378189087, + "learning_rate": 0.00013697778917532192, + "loss": 2.0489, + "step": 3784 + }, + { + "epoch": 0.398211467648606, + "grad_norm": 0.8735719919204712, + "learning_rate": 0.00013694612505757122, + "loss": 1.2784, + "step": 3785 + }, + { + "epoch": 0.39831667543398214, + "grad_norm": 1.2320698499679565, + "learning_rate": 0.00013691445664938866, + "loss": 2.0641, + "step": 3786 + }, + { + "epoch": 0.39842188321935823, + "grad_norm": 1.306445598602295, + "learning_rate": 0.00013688278395445185, + "loss": 1.8181, + "step": 3787 + }, + { + "epoch": 0.3985270910047343, + "grad_norm": 1.5891810655593872, + "learning_rate": 0.00013685110697643878, + "loss": 1.7734, + "step": 3788 + }, + { + "epoch": 0.3986322987901105, + "grad_norm": 1.2579609155654907, + "learning_rate": 0.00013681942571902803, + "loss": 1.9562, + "step": 3789 + }, + { + "epoch": 0.39873750657548657, + "grad_norm": 1.3183737993240356, + "learning_rate": 0.00013678774018589855, + "loss": 1.824, + "step": 3790 + }, + { + "epoch": 0.3988427143608627, + "grad_norm": 1.2794685363769531, + "learning_rate": 0.00013675605038072997, + "loss": 1.4231, + "step": 3791 + }, + { + "epoch": 0.3989479221462388, + "grad_norm": 1.0361131429672241, + "learning_rate": 0.00013672435630720232, + "loss": 1.7537, + "step": 3792 + }, + { + "epoch": 0.39905312993161496, + "grad_norm": 1.2648097276687622, + "learning_rate": 0.00013669265796899607, + "loss": 2.534, + "step": 3793 + }, + { + "epoch": 0.39915833771699105, + "grad_norm": 1.1882283687591553, + "learning_rate": 0.00013666095536979232, + "loss": 1.7441, + "step": 3794 + }, + { + "epoch": 0.3992635455023672, + "grad_norm": 1.5789291858673096, + "learning_rate": 0.00013662924851327247, + "loss": 1.7312, + "step": 3795 + }, + { + "epoch": 0.3993687532877433, + "grad_norm": 1.9894529581069946, + "learning_rate": 0.00013659753740311866, + "loss": 1.8041, + "step": 3796 + }, + { + "epoch": 0.3994739610731194, + "grad_norm": 1.1728602647781372, + "learning_rate": 0.00013656582204301334, + "loss": 2.1131, + "step": 3797 + }, + { + "epoch": 0.39957916885849554, + "grad_norm": 2.1701881885528564, + "learning_rate": 0.00013653410243663952, + "loss": 2.1482, + "step": 3798 + }, + { + "epoch": 0.39968437664387163, + "grad_norm": 1.0504519939422607, + "learning_rate": 0.00013650237858768067, + "loss": 2.0582, + "step": 3799 + }, + { + "epoch": 0.3997895844292478, + "grad_norm": 2.3325071334838867, + "learning_rate": 0.00013647065049982078, + "loss": 1.565, + "step": 3800 + }, + { + "epoch": 0.3998947922146239, + "grad_norm": 1.386509895324707, + "learning_rate": 0.0001364389181767444, + "loss": 1.8805, + "step": 3801 + }, + { + "epoch": 0.4, + "grad_norm": 1.0210331678390503, + "learning_rate": 0.0001364071816221364, + "loss": 2.0216, + "step": 3802 + }, + { + "epoch": 0.4001052077853761, + "grad_norm": 1.7572848796844482, + "learning_rate": 0.00013637544083968227, + "loss": 2.1344, + "step": 3803 + }, + { + "epoch": 0.4002104155707522, + "grad_norm": 1.721763014793396, + "learning_rate": 0.00013634369583306798, + "loss": 1.8024, + "step": 3804 + }, + { + "epoch": 0.40031562335612836, + "grad_norm": 1.5794200897216797, + "learning_rate": 0.00013631194660598, + "loss": 1.9257, + "step": 3805 + }, + { + "epoch": 0.40042083114150445, + "grad_norm": 1.026024341583252, + "learning_rate": 0.00013628019316210522, + "loss": 2.4013, + "step": 3806 + }, + { + "epoch": 0.4005260389268806, + "grad_norm": 4.058107852935791, + "learning_rate": 0.0001362484355051311, + "loss": 1.554, + "step": 3807 + }, + { + "epoch": 0.4006312467122567, + "grad_norm": 1.9049569368362427, + "learning_rate": 0.00013621667363874552, + "loss": 2.2071, + "step": 3808 + }, + { + "epoch": 0.40073645449763284, + "grad_norm": 1.2772477865219116, + "learning_rate": 0.00013618490756663686, + "loss": 1.6058, + "step": 3809 + }, + { + "epoch": 0.40084166228300894, + "grad_norm": 1.0035552978515625, + "learning_rate": 0.00013615313729249405, + "loss": 1.8947, + "step": 3810 + }, + { + "epoch": 0.4009468700683851, + "grad_norm": 0.8478926420211792, + "learning_rate": 0.00013612136282000644, + "loss": 1.8861, + "step": 3811 + }, + { + "epoch": 0.4010520778537612, + "grad_norm": 1.2318350076675415, + "learning_rate": 0.00013608958415286396, + "loss": 1.8174, + "step": 3812 + }, + { + "epoch": 0.4011572856391373, + "grad_norm": 1.1952483654022217, + "learning_rate": 0.00013605780129475687, + "loss": 1.8322, + "step": 3813 + }, + { + "epoch": 0.4012624934245134, + "grad_norm": 1.1804511547088623, + "learning_rate": 0.00013602601424937604, + "loss": 1.7156, + "step": 3814 + }, + { + "epoch": 0.4013677012098895, + "grad_norm": 1.3360835313796997, + "learning_rate": 0.00013599422302041286, + "loss": 1.7492, + "step": 3815 + }, + { + "epoch": 0.40147290899526566, + "grad_norm": 1.1277227401733398, + "learning_rate": 0.00013596242761155903, + "loss": 1.4064, + "step": 3816 + }, + { + "epoch": 0.40157811678064176, + "grad_norm": 1.0985256433486938, + "learning_rate": 0.00013593062802650692, + "loss": 1.8555, + "step": 3817 + }, + { + "epoch": 0.4016833245660179, + "grad_norm": 1.0841742753982544, + "learning_rate": 0.0001358988242689493, + "loss": 1.4974, + "step": 3818 + }, + { + "epoch": 0.401788532351394, + "grad_norm": 1.3948936462402344, + "learning_rate": 0.0001358670163425795, + "loss": 1.8494, + "step": 3819 + }, + { + "epoch": 0.4018937401367701, + "grad_norm": 2.593919515609741, + "learning_rate": 0.0001358352042510911, + "loss": 1.9506, + "step": 3820 + }, + { + "epoch": 0.40199894792214624, + "grad_norm": 1.0042921304702759, + "learning_rate": 0.00013580338799817844, + "loss": 2.05, + "step": 3821 + }, + { + "epoch": 0.40210415570752234, + "grad_norm": 1.3841592073440552, + "learning_rate": 0.00013577156758753627, + "loss": 1.8187, + "step": 3822 + }, + { + "epoch": 0.4022093634928985, + "grad_norm": 2.3145501613616943, + "learning_rate": 0.00013573974302285972, + "loss": 2.1793, + "step": 3823 + }, + { + "epoch": 0.4023145712782746, + "grad_norm": 1.6691625118255615, + "learning_rate": 0.00013570791430784452, + "loss": 1.8489, + "step": 3824 + }, + { + "epoch": 0.40241977906365073, + "grad_norm": 1.7601234912872314, + "learning_rate": 0.0001356760814461868, + "loss": 1.055, + "step": 3825 + }, + { + "epoch": 0.4025249868490268, + "grad_norm": 1.5484458208084106, + "learning_rate": 0.00013564424444158324, + "loss": 1.8766, + "step": 3826 + }, + { + "epoch": 0.40263019463440297, + "grad_norm": 1.6130404472351074, + "learning_rate": 0.00013561240329773092, + "loss": 1.8005, + "step": 3827 + }, + { + "epoch": 0.40273540241977906, + "grad_norm": 1.7206354141235352, + "learning_rate": 0.00013558055801832748, + "loss": 1.5215, + "step": 3828 + }, + { + "epoch": 0.40284061020515516, + "grad_norm": 1.2879303693771362, + "learning_rate": 0.00013554870860707106, + "loss": 2.0524, + "step": 3829 + }, + { + "epoch": 0.4029458179905313, + "grad_norm": 0.978996217250824, + "learning_rate": 0.0001355168550676601, + "loss": 1.7266, + "step": 3830 + }, + { + "epoch": 0.4030510257759074, + "grad_norm": 1.726261019706726, + "learning_rate": 0.00013548499740379373, + "loss": 2.3547, + "step": 3831 + }, + { + "epoch": 0.40315623356128355, + "grad_norm": 1.3626611232757568, + "learning_rate": 0.00013545313561917144, + "loss": 1.5038, + "step": 3832 + }, + { + "epoch": 0.40326144134665964, + "grad_norm": 1.0368597507476807, + "learning_rate": 0.00013542126971749328, + "loss": 1.8802, + "step": 3833 + }, + { + "epoch": 0.4033666491320358, + "grad_norm": 1.394249439239502, + "learning_rate": 0.00013538939970245972, + "loss": 1.7687, + "step": 3834 + }, + { + "epoch": 0.4034718569174119, + "grad_norm": 1.244553565979004, + "learning_rate": 0.0001353575255777717, + "loss": 1.9525, + "step": 3835 + }, + { + "epoch": 0.403577064702788, + "grad_norm": 1.686198353767395, + "learning_rate": 0.00013532564734713068, + "loss": 1.6175, + "step": 3836 + }, + { + "epoch": 0.40368227248816413, + "grad_norm": 2.307166814804077, + "learning_rate": 0.00013529376501423852, + "loss": 2.3844, + "step": 3837 + }, + { + "epoch": 0.4037874802735402, + "grad_norm": 1.9087625741958618, + "learning_rate": 0.00013526187858279765, + "loss": 1.272, + "step": 3838 + }, + { + "epoch": 0.40389268805891637, + "grad_norm": 1.4016311168670654, + "learning_rate": 0.00013522998805651096, + "loss": 1.9822, + "step": 3839 + }, + { + "epoch": 0.40399789584429247, + "grad_norm": 1.2222285270690918, + "learning_rate": 0.00013519809343908178, + "loss": 2.1131, + "step": 3840 + }, + { + "epoch": 0.4041031036296686, + "grad_norm": 1.7945427894592285, + "learning_rate": 0.00013516619473421387, + "loss": 1.8994, + "step": 3841 + }, + { + "epoch": 0.4042083114150447, + "grad_norm": 1.665122628211975, + "learning_rate": 0.0001351342919456116, + "loss": 1.7643, + "step": 3842 + }, + { + "epoch": 0.40431351920042086, + "grad_norm": 1.1571388244628906, + "learning_rate": 0.00013510238507697967, + "loss": 2.1814, + "step": 3843 + }, + { + "epoch": 0.40441872698579695, + "grad_norm": 1.2247772216796875, + "learning_rate": 0.00013507047413202335, + "loss": 1.9968, + "step": 3844 + }, + { + "epoch": 0.40452393477117304, + "grad_norm": 1.482480525970459, + "learning_rate": 0.00013503855911444837, + "loss": 1.9141, + "step": 3845 + }, + { + "epoch": 0.4046291425565492, + "grad_norm": 1.5328335762023926, + "learning_rate": 0.00013500664002796093, + "loss": 1.8682, + "step": 3846 + }, + { + "epoch": 0.4047343503419253, + "grad_norm": 1.9800300598144531, + "learning_rate": 0.0001349747168762676, + "loss": 1.7248, + "step": 3847 + }, + { + "epoch": 0.40483955812730144, + "grad_norm": 1.3006877899169922, + "learning_rate": 0.0001349427896630756, + "loss": 1.5546, + "step": 3848 + }, + { + "epoch": 0.40494476591267753, + "grad_norm": 1.683077096939087, + "learning_rate": 0.0001349108583920925, + "loss": 2.1227, + "step": 3849 + }, + { + "epoch": 0.4050499736980537, + "grad_norm": 1.1028867959976196, + "learning_rate": 0.00013487892306702638, + "loss": 1.8122, + "step": 3850 + }, + { + "epoch": 0.40515518148342977, + "grad_norm": 1.3588521480560303, + "learning_rate": 0.00013484698369158578, + "loss": 1.7322, + "step": 3851 + }, + { + "epoch": 0.40526038926880587, + "grad_norm": 1.1769453287124634, + "learning_rate": 0.0001348150402694797, + "loss": 2.0872, + "step": 3852 + }, + { + "epoch": 0.405365597054182, + "grad_norm": 1.4604272842407227, + "learning_rate": 0.00013478309280441763, + "loss": 2.044, + "step": 3853 + }, + { + "epoch": 0.4054708048395581, + "grad_norm": 1.201572299003601, + "learning_rate": 0.00013475114130010954, + "loss": 2.048, + "step": 3854 + }, + { + "epoch": 0.40557601262493426, + "grad_norm": 2.324699878692627, + "learning_rate": 0.00013471918576026583, + "loss": 2.0551, + "step": 3855 + }, + { + "epoch": 0.40568122041031035, + "grad_norm": 1.0302995443344116, + "learning_rate": 0.00013468722618859743, + "loss": 1.7291, + "step": 3856 + }, + { + "epoch": 0.4057864281956865, + "grad_norm": 1.6787930727005005, + "learning_rate": 0.00013465526258881565, + "loss": 2.4362, + "step": 3857 + }, + { + "epoch": 0.4058916359810626, + "grad_norm": 1.2186408042907715, + "learning_rate": 0.00013462329496463236, + "loss": 1.8517, + "step": 3858 + }, + { + "epoch": 0.40599684376643874, + "grad_norm": 0.8594940900802612, + "learning_rate": 0.0001345913233197598, + "loss": 1.8623, + "step": 3859 + }, + { + "epoch": 0.40610205155181484, + "grad_norm": 1.7866066694259644, + "learning_rate": 0.00013455934765791084, + "loss": 1.7403, + "step": 3860 + }, + { + "epoch": 0.40620725933719093, + "grad_norm": 1.9410916566848755, + "learning_rate": 0.00013452736798279856, + "loss": 1.7438, + "step": 3861 + }, + { + "epoch": 0.4063124671225671, + "grad_norm": 0.9494317173957825, + "learning_rate": 0.0001344953842981368, + "loss": 1.8397, + "step": 3862 + }, + { + "epoch": 0.40641767490794317, + "grad_norm": 1.590301752090454, + "learning_rate": 0.0001344633966076396, + "loss": 2.0589, + "step": 3863 + }, + { + "epoch": 0.4065228826933193, + "grad_norm": 1.0571048259735107, + "learning_rate": 0.0001344314049150217, + "loss": 2.1632, + "step": 3864 + }, + { + "epoch": 0.4066280904786954, + "grad_norm": 1.2266767024993896, + "learning_rate": 0.00013439940922399806, + "loss": 2.1405, + "step": 3865 + }, + { + "epoch": 0.40673329826407156, + "grad_norm": 1.6102620363235474, + "learning_rate": 0.00013436740953828432, + "loss": 2.0555, + "step": 3866 + }, + { + "epoch": 0.40683850604944766, + "grad_norm": 1.640577793121338, + "learning_rate": 0.0001343354058615965, + "loss": 1.3221, + "step": 3867 + }, + { + "epoch": 0.40694371383482375, + "grad_norm": 1.5055091381072998, + "learning_rate": 0.00013430339819765105, + "loss": 1.9195, + "step": 3868 + }, + { + "epoch": 0.4070489216201999, + "grad_norm": 1.0952521562576294, + "learning_rate": 0.0001342713865501649, + "loss": 1.8598, + "step": 3869 + }, + { + "epoch": 0.407154129405576, + "grad_norm": 0.8317450881004333, + "learning_rate": 0.00013423937092285555, + "loss": 1.9036, + "step": 3870 + }, + { + "epoch": 0.40725933719095214, + "grad_norm": 1.3598711490631104, + "learning_rate": 0.00013420735131944073, + "loss": 1.9391, + "step": 3871 + }, + { + "epoch": 0.40736454497632824, + "grad_norm": 1.854884147644043, + "learning_rate": 0.0001341753277436389, + "loss": 1.5506, + "step": 3872 + }, + { + "epoch": 0.4074697527617044, + "grad_norm": 1.2758795022964478, + "learning_rate": 0.00013414330019916875, + "loss": 1.6345, + "step": 3873 + }, + { + "epoch": 0.4075749605470805, + "grad_norm": 1.4134248495101929, + "learning_rate": 0.0001341112686897496, + "loss": 1.3446, + "step": 3874 + }, + { + "epoch": 0.4076801683324566, + "grad_norm": 1.3641563653945923, + "learning_rate": 0.00013407923321910115, + "loss": 2.0881, + "step": 3875 + }, + { + "epoch": 0.4077853761178327, + "grad_norm": 1.287705659866333, + "learning_rate": 0.00013404719379094354, + "loss": 2.2842, + "step": 3876 + }, + { + "epoch": 0.4078905839032088, + "grad_norm": 1.201897382736206, + "learning_rate": 0.00013401515040899746, + "loss": 1.7941, + "step": 3877 + }, + { + "epoch": 0.40799579168858496, + "grad_norm": 1.092402696609497, + "learning_rate": 0.00013398310307698397, + "loss": 1.8657, + "step": 3878 + }, + { + "epoch": 0.40810099947396106, + "grad_norm": 1.349609136581421, + "learning_rate": 0.0001339510517986246, + "loss": 1.6198, + "step": 3879 + }, + { + "epoch": 0.4082062072593372, + "grad_norm": 1.9795316457748413, + "learning_rate": 0.0001339189965776414, + "loss": 1.7579, + "step": 3880 + }, + { + "epoch": 0.4083114150447133, + "grad_norm": 1.3774399757385254, + "learning_rate": 0.0001338869374177568, + "loss": 1.903, + "step": 3881 + }, + { + "epoch": 0.40841662283008945, + "grad_norm": 1.0936564207077026, + "learning_rate": 0.00013385487432269376, + "loss": 2.2156, + "step": 3882 + }, + { + "epoch": 0.40852183061546554, + "grad_norm": 1.4462928771972656, + "learning_rate": 0.00013382280729617568, + "loss": 2.0977, + "step": 3883 + }, + { + "epoch": 0.40862703840084164, + "grad_norm": 2.339925527572632, + "learning_rate": 0.00013379073634192632, + "loss": 1.6663, + "step": 3884 + }, + { + "epoch": 0.4087322461862178, + "grad_norm": 1.1641333103179932, + "learning_rate": 0.00013375866146367, + "loss": 2.5582, + "step": 3885 + }, + { + "epoch": 0.4088374539715939, + "grad_norm": 0.9723519086837769, + "learning_rate": 0.00013372658266513153, + "loss": 1.5445, + "step": 3886 + }, + { + "epoch": 0.40894266175697, + "grad_norm": 0.9060115218162537, + "learning_rate": 0.00013369449995003608, + "loss": 2.0697, + "step": 3887 + }, + { + "epoch": 0.4090478695423461, + "grad_norm": 1.7128041982650757, + "learning_rate": 0.00013366241332210928, + "loss": 1.4517, + "step": 3888 + }, + { + "epoch": 0.40915307732772227, + "grad_norm": 2.7365641593933105, + "learning_rate": 0.00013363032278507726, + "loss": 1.2883, + "step": 3889 + }, + { + "epoch": 0.40925828511309836, + "grad_norm": 1.3242186307907104, + "learning_rate": 0.00013359822834266662, + "loss": 2.3144, + "step": 3890 + }, + { + "epoch": 0.4093634928984745, + "grad_norm": 1.483139991760254, + "learning_rate": 0.00013356612999860436, + "loss": 1.9398, + "step": 3891 + }, + { + "epoch": 0.4094687006838506, + "grad_norm": 1.365196943283081, + "learning_rate": 0.00013353402775661795, + "loss": 1.5895, + "step": 3892 + }, + { + "epoch": 0.4095739084692267, + "grad_norm": 1.608028769493103, + "learning_rate": 0.0001335019216204353, + "loss": 1.9566, + "step": 3893 + }, + { + "epoch": 0.40967911625460285, + "grad_norm": 1.6372560262680054, + "learning_rate": 0.00013346981159378485, + "loss": 2.0881, + "step": 3894 + }, + { + "epoch": 0.40978432403997894, + "grad_norm": 1.121978521347046, + "learning_rate": 0.00013343769768039537, + "loss": 1.9097, + "step": 3895 + }, + { + "epoch": 0.4098895318253551, + "grad_norm": 1.4127432107925415, + "learning_rate": 0.00013340557988399617, + "loss": 2.1085, + "step": 3896 + }, + { + "epoch": 0.4099947396107312, + "grad_norm": 1.146700143814087, + "learning_rate": 0.00013337345820831696, + "loss": 1.546, + "step": 3897 + }, + { + "epoch": 0.41009994739610733, + "grad_norm": 1.1912237405776978, + "learning_rate": 0.000133341332657088, + "loss": 1.4891, + "step": 3898 + }, + { + "epoch": 0.4102051551814834, + "grad_norm": 1.4994091987609863, + "learning_rate": 0.0001333092032340398, + "loss": 2.1801, + "step": 3899 + }, + { + "epoch": 0.4103103629668595, + "grad_norm": 1.059524655342102, + "learning_rate": 0.00013327706994290355, + "loss": 1.9326, + "step": 3900 + }, + { + "epoch": 0.41041557075223567, + "grad_norm": 1.365159034729004, + "learning_rate": 0.00013324493278741073, + "loss": 1.833, + "step": 3901 + }, + { + "epoch": 0.41052077853761176, + "grad_norm": 2.979818344116211, + "learning_rate": 0.00013321279177129337, + "loss": 2.8629, + "step": 3902 + }, + { + "epoch": 0.4106259863229879, + "grad_norm": 2.018228530883789, + "learning_rate": 0.00013318064689828385, + "loss": 1.9213, + "step": 3903 + }, + { + "epoch": 0.410731194108364, + "grad_norm": 1.5214287042617798, + "learning_rate": 0.00013314849817211508, + "loss": 1.7545, + "step": 3904 + }, + { + "epoch": 0.41083640189374016, + "grad_norm": 1.0850343704223633, + "learning_rate": 0.00013311634559652036, + "loss": 1.5041, + "step": 3905 + }, + { + "epoch": 0.41094160967911625, + "grad_norm": 1.1685420274734497, + "learning_rate": 0.00013308418917523348, + "loss": 1.9563, + "step": 3906 + }, + { + "epoch": 0.4110468174644924, + "grad_norm": 1.6198590993881226, + "learning_rate": 0.00013305202891198862, + "loss": 1.7929, + "step": 3907 + }, + { + "epoch": 0.4111520252498685, + "grad_norm": 2.0444579124450684, + "learning_rate": 0.0001330198648105205, + "loss": 2.1797, + "step": 3908 + }, + { + "epoch": 0.4112572330352446, + "grad_norm": 1.8956327438354492, + "learning_rate": 0.00013298769687456426, + "loss": 2.1496, + "step": 3909 + }, + { + "epoch": 0.41136244082062073, + "grad_norm": 1.3414783477783203, + "learning_rate": 0.00013295552510785534, + "loss": 1.4914, + "step": 3910 + }, + { + "epoch": 0.4114676486059968, + "grad_norm": 1.368857502937317, + "learning_rate": 0.00013292334951412984, + "loss": 1.7118, + "step": 3911 + }, + { + "epoch": 0.411572856391373, + "grad_norm": 1.7612696886062622, + "learning_rate": 0.00013289117009712418, + "loss": 2.0468, + "step": 3912 + }, + { + "epoch": 0.41167806417674907, + "grad_norm": 1.1317222118377686, + "learning_rate": 0.00013285898686057524, + "loss": 1.9907, + "step": 3913 + }, + { + "epoch": 0.4117832719621252, + "grad_norm": 1.8835352659225464, + "learning_rate": 0.00013282679980822034, + "loss": 1.4493, + "step": 3914 + }, + { + "epoch": 0.4118884797475013, + "grad_norm": 1.1284127235412598, + "learning_rate": 0.00013279460894379729, + "loss": 1.5342, + "step": 3915 + }, + { + "epoch": 0.4119936875328774, + "grad_norm": 1.2283440828323364, + "learning_rate": 0.0001327624142710443, + "loss": 2.1995, + "step": 3916 + }, + { + "epoch": 0.41209889531825356, + "grad_norm": 1.2161046266555786, + "learning_rate": 0.00013273021579370003, + "loss": 2.1285, + "step": 3917 + }, + { + "epoch": 0.41220410310362965, + "grad_norm": 1.6340278387069702, + "learning_rate": 0.00013269801351550354, + "loss": 1.4938, + "step": 3918 + }, + { + "epoch": 0.4123093108890058, + "grad_norm": 1.4137933254241943, + "learning_rate": 0.00013266580744019445, + "loss": 1.6995, + "step": 3919 + }, + { + "epoch": 0.4124145186743819, + "grad_norm": 1.0340012311935425, + "learning_rate": 0.0001326335975715127, + "loss": 1.9991, + "step": 3920 + }, + { + "epoch": 0.41251972645975804, + "grad_norm": 1.2276008129119873, + "learning_rate": 0.00013260138391319872, + "loss": 1.3765, + "step": 3921 + }, + { + "epoch": 0.41262493424513413, + "grad_norm": 1.1833198070526123, + "learning_rate": 0.00013256916646899337, + "loss": 2.0268, + "step": 3922 + }, + { + "epoch": 0.4127301420305103, + "grad_norm": 2.14572811126709, + "learning_rate": 0.00013253694524263798, + "loss": 2.0048, + "step": 3923 + }, + { + "epoch": 0.4128353498158864, + "grad_norm": 1.3830515146255493, + "learning_rate": 0.00013250472023787425, + "loss": 2.113, + "step": 3924 + }, + { + "epoch": 0.41294055760126247, + "grad_norm": 1.290831208229065, + "learning_rate": 0.00013247249145844443, + "loss": 1.6246, + "step": 3925 + }, + { + "epoch": 0.4130457653866386, + "grad_norm": 1.4509975910186768, + "learning_rate": 0.00013244025890809112, + "loss": 1.8658, + "step": 3926 + }, + { + "epoch": 0.4131509731720147, + "grad_norm": 1.2265394926071167, + "learning_rate": 0.00013240802259055734, + "loss": 1.763, + "step": 3927 + }, + { + "epoch": 0.41325618095739086, + "grad_norm": 1.2114838361740112, + "learning_rate": 0.0001323757825095866, + "loss": 1.7796, + "step": 3928 + }, + { + "epoch": 0.41336138874276696, + "grad_norm": 1.1204650402069092, + "learning_rate": 0.00013234353866892285, + "loss": 1.5069, + "step": 3929 + }, + { + "epoch": 0.4134665965281431, + "grad_norm": 1.245553970336914, + "learning_rate": 0.00013231129107231052, + "loss": 1.9431, + "step": 3930 + }, + { + "epoch": 0.4135718043135192, + "grad_norm": 1.2581690549850464, + "learning_rate": 0.00013227903972349428, + "loss": 1.7897, + "step": 3931 + }, + { + "epoch": 0.4136770120988953, + "grad_norm": 1.647939920425415, + "learning_rate": 0.00013224678462621947, + "loss": 2.0146, + "step": 3932 + }, + { + "epoch": 0.41378221988427144, + "grad_norm": 1.1378495693206787, + "learning_rate": 0.00013221452578423176, + "loss": 1.7066, + "step": 3933 + }, + { + "epoch": 0.41388742766964753, + "grad_norm": 1.322014570236206, + "learning_rate": 0.00013218226320127724, + "loss": 1.4162, + "step": 3934 + }, + { + "epoch": 0.4139926354550237, + "grad_norm": 0.965604841709137, + "learning_rate": 0.00013214999688110249, + "loss": 2.0709, + "step": 3935 + }, + { + "epoch": 0.4140978432403998, + "grad_norm": 1.4124212265014648, + "learning_rate": 0.00013211772682745446, + "loss": 2.03, + "step": 3936 + }, + { + "epoch": 0.4142030510257759, + "grad_norm": 1.284655213356018, + "learning_rate": 0.00013208545304408057, + "loss": 1.9079, + "step": 3937 + }, + { + "epoch": 0.414308258811152, + "grad_norm": 1.4193320274353027, + "learning_rate": 0.00013205317553472868, + "loss": 1.6901, + "step": 3938 + }, + { + "epoch": 0.41441346659652817, + "grad_norm": 1.1792535781860352, + "learning_rate": 0.00013202089430314705, + "loss": 1.5996, + "step": 3939 + }, + { + "epoch": 0.41451867438190426, + "grad_norm": 1.242659568786621, + "learning_rate": 0.00013198860935308444, + "loss": 2.35, + "step": 3940 + }, + { + "epoch": 0.41462388216728036, + "grad_norm": 1.4708364009857178, + "learning_rate": 0.0001319563206882899, + "loss": 1.83, + "step": 3941 + }, + { + "epoch": 0.4147290899526565, + "grad_norm": 1.332457184791565, + "learning_rate": 0.00013192402831251312, + "loss": 2.0127, + "step": 3942 + }, + { + "epoch": 0.4148342977380326, + "grad_norm": 1.8387818336486816, + "learning_rate": 0.00013189173222950403, + "loss": 2.4401, + "step": 3943 + }, + { + "epoch": 0.41493950552340875, + "grad_norm": 1.1250078678131104, + "learning_rate": 0.0001318594324430131, + "loss": 2.0481, + "step": 3944 + }, + { + "epoch": 0.41504471330878484, + "grad_norm": 1.0061681270599365, + "learning_rate": 0.00013182712895679118, + "loss": 1.6706, + "step": 3945 + }, + { + "epoch": 0.415149921094161, + "grad_norm": 1.2926052808761597, + "learning_rate": 0.0001317948217745896, + "loss": 1.596, + "step": 3946 + }, + { + "epoch": 0.4152551288795371, + "grad_norm": 1.2874207496643066, + "learning_rate": 0.00013176251090016007, + "loss": 1.7986, + "step": 3947 + }, + { + "epoch": 0.4153603366649132, + "grad_norm": 1.1514983177185059, + "learning_rate": 0.00013173019633725474, + "loss": 2.1873, + "step": 3948 + }, + { + "epoch": 0.4154655444502893, + "grad_norm": 1.636364459991455, + "learning_rate": 0.00013169787808962617, + "loss": 1.9381, + "step": 3949 + }, + { + "epoch": 0.4155707522356654, + "grad_norm": 1.7293845415115356, + "learning_rate": 0.00013166555616102744, + "loss": 1.8849, + "step": 3950 + }, + { + "epoch": 0.41567596002104157, + "grad_norm": 1.3633328676223755, + "learning_rate": 0.0001316332305552119, + "loss": 1.2241, + "step": 3951 + }, + { + "epoch": 0.41578116780641766, + "grad_norm": 0.8640151023864746, + "learning_rate": 0.00013160090127593344, + "loss": 1.9409, + "step": 3952 + }, + { + "epoch": 0.4158863755917938, + "grad_norm": 1.580431580543518, + "learning_rate": 0.00013156856832694642, + "loss": 1.7308, + "step": 3953 + }, + { + "epoch": 0.4159915833771699, + "grad_norm": 0.9848331212997437, + "learning_rate": 0.0001315362317120055, + "loss": 1.8526, + "step": 3954 + }, + { + "epoch": 0.41609679116254605, + "grad_norm": 1.7985378503799438, + "learning_rate": 0.00013150389143486586, + "loss": 1.7818, + "step": 3955 + }, + { + "epoch": 0.41620199894792215, + "grad_norm": 1.3943655490875244, + "learning_rate": 0.000131471547499283, + "loss": 1.8059, + "step": 3956 + }, + { + "epoch": 0.41630720673329824, + "grad_norm": 1.2380815744400024, + "learning_rate": 0.00013143919990901302, + "loss": 1.652, + "step": 3957 + }, + { + "epoch": 0.4164124145186744, + "grad_norm": 1.415740966796875, + "learning_rate": 0.00013140684866781225, + "loss": 2.4166, + "step": 3958 + }, + { + "epoch": 0.4165176223040505, + "grad_norm": 1.2207032442092896, + "learning_rate": 0.00013137449377943755, + "loss": 1.7993, + "step": 3959 + }, + { + "epoch": 0.41662283008942663, + "grad_norm": 1.1489773988723755, + "learning_rate": 0.00013134213524764623, + "loss": 1.7678, + "step": 3960 + }, + { + "epoch": 0.4167280378748027, + "grad_norm": 1.381227970123291, + "learning_rate": 0.00013130977307619594, + "loss": 2.0441, + "step": 3961 + }, + { + "epoch": 0.4168332456601789, + "grad_norm": 1.6043063402175903, + "learning_rate": 0.0001312774072688448, + "loss": 1.9146, + "step": 3962 + }, + { + "epoch": 0.41693845344555497, + "grad_norm": 1.3174347877502441, + "learning_rate": 0.00013124503782935133, + "loss": 1.7681, + "step": 3963 + }, + { + "epoch": 0.41704366123093106, + "grad_norm": 1.854496955871582, + "learning_rate": 0.00013121266476147454, + "loss": 2.158, + "step": 3964 + }, + { + "epoch": 0.4171488690163072, + "grad_norm": 1.0530141592025757, + "learning_rate": 0.00013118028806897373, + "loss": 1.8426, + "step": 3965 + }, + { + "epoch": 0.4172540768016833, + "grad_norm": 1.5964198112487793, + "learning_rate": 0.00013114790775560877, + "loss": 1.3975, + "step": 3966 + }, + { + "epoch": 0.41735928458705945, + "grad_norm": 1.1978175640106201, + "learning_rate": 0.00013111552382513985, + "loss": 2.0528, + "step": 3967 + }, + { + "epoch": 0.41746449237243555, + "grad_norm": 1.6343151330947876, + "learning_rate": 0.0001310831362813276, + "loss": 1.9448, + "step": 3968 + }, + { + "epoch": 0.4175697001578117, + "grad_norm": 1.598139762878418, + "learning_rate": 0.0001310507451279331, + "loss": 1.6996, + "step": 3969 + }, + { + "epoch": 0.4176749079431878, + "grad_norm": 0.9592291712760925, + "learning_rate": 0.00013101835036871781, + "loss": 1.1519, + "step": 3970 + }, + { + "epoch": 0.41778011572856394, + "grad_norm": 2.241469621658325, + "learning_rate": 0.00013098595200744366, + "loss": 1.9488, + "step": 3971 + }, + { + "epoch": 0.41788532351394003, + "grad_norm": 2.4150736331939697, + "learning_rate": 0.0001309535500478729, + "loss": 2.2178, + "step": 3972 + }, + { + "epoch": 0.4179905312993161, + "grad_norm": 1.3556127548217773, + "learning_rate": 0.00013092114449376828, + "loss": 1.5269, + "step": 3973 + }, + { + "epoch": 0.4180957390846923, + "grad_norm": 1.4668810367584229, + "learning_rate": 0.00013088873534889304, + "loss": 2.0874, + "step": 3974 + }, + { + "epoch": 0.41820094687006837, + "grad_norm": 1.5009702444076538, + "learning_rate": 0.00013085632261701063, + "loss": 1.7267, + "step": 3975 + }, + { + "epoch": 0.4183061546554445, + "grad_norm": 1.6611928939819336, + "learning_rate": 0.0001308239063018851, + "loss": 1.5624, + "step": 3976 + }, + { + "epoch": 0.4184113624408206, + "grad_norm": 1.1202343702316284, + "learning_rate": 0.00013079148640728077, + "loss": 1.78, + "step": 3977 + }, + { + "epoch": 0.41851657022619676, + "grad_norm": 1.4417529106140137, + "learning_rate": 0.0001307590629369626, + "loss": 2.0658, + "step": 3978 + }, + { + "epoch": 0.41862177801157285, + "grad_norm": 1.2309139966964722, + "learning_rate": 0.0001307266358946957, + "loss": 1.5864, + "step": 3979 + }, + { + "epoch": 0.41872698579694895, + "grad_norm": 1.4191454648971558, + "learning_rate": 0.00013069420528424579, + "loss": 1.9842, + "step": 3980 + }, + { + "epoch": 0.4188321935823251, + "grad_norm": 1.4356913566589355, + "learning_rate": 0.00013066177110937884, + "loss": 1.7109, + "step": 3981 + }, + { + "epoch": 0.4189374013677012, + "grad_norm": 2.5019426345825195, + "learning_rate": 0.00013062933337386142, + "loss": 1.8636, + "step": 3982 + }, + { + "epoch": 0.41904260915307734, + "grad_norm": 1.5507819652557373, + "learning_rate": 0.00013059689208146035, + "loss": 1.5129, + "step": 3983 + }, + { + "epoch": 0.41914781693845343, + "grad_norm": 1.7946397066116333, + "learning_rate": 0.00013056444723594297, + "loss": 1.8957, + "step": 3984 + }, + { + "epoch": 0.4192530247238296, + "grad_norm": 1.1665949821472168, + "learning_rate": 0.000130531998841077, + "loss": 1.995, + "step": 3985 + }, + { + "epoch": 0.4193582325092057, + "grad_norm": 1.3788167238235474, + "learning_rate": 0.00013049954690063048, + "loss": 1.5433, + "step": 3986 + }, + { + "epoch": 0.4194634402945818, + "grad_norm": 1.3582801818847656, + "learning_rate": 0.00013046709141837205, + "loss": 1.6558, + "step": 3987 + }, + { + "epoch": 0.4195686480799579, + "grad_norm": 1.2572566270828247, + "learning_rate": 0.00013043463239807064, + "loss": 1.9563, + "step": 3988 + }, + { + "epoch": 0.419673855865334, + "grad_norm": 1.366934061050415, + "learning_rate": 0.00013040216984349555, + "loss": 1.987, + "step": 3989 + }, + { + "epoch": 0.41977906365071016, + "grad_norm": 1.2102018594741821, + "learning_rate": 0.0001303697037584166, + "loss": 2.09, + "step": 3990 + }, + { + "epoch": 0.41988427143608625, + "grad_norm": 1.2199288606643677, + "learning_rate": 0.000130337234146604, + "loss": 2.0518, + "step": 3991 + }, + { + "epoch": 0.4199894792214624, + "grad_norm": 1.3983739614486694, + "learning_rate": 0.00013030476101182824, + "loss": 1.9292, + "step": 3992 + }, + { + "epoch": 0.4200946870068385, + "grad_norm": 1.5765870809555054, + "learning_rate": 0.0001302722843578604, + "loss": 1.7341, + "step": 3993 + }, + { + "epoch": 0.42019989479221465, + "grad_norm": 1.6028943061828613, + "learning_rate": 0.00013023980418847185, + "loss": 2.1453, + "step": 3994 + }, + { + "epoch": 0.42030510257759074, + "grad_norm": 1.2651803493499756, + "learning_rate": 0.00013020732050743442, + "loss": 1.869, + "step": 3995 + }, + { + "epoch": 0.42041031036296683, + "grad_norm": 1.994928240776062, + "learning_rate": 0.00013017483331852035, + "loss": 2.1251, + "step": 3996 + }, + { + "epoch": 0.420515518148343, + "grad_norm": 1.914730191230774, + "learning_rate": 0.00013014234262550222, + "loss": 1.464, + "step": 3997 + }, + { + "epoch": 0.4206207259337191, + "grad_norm": 1.306941032409668, + "learning_rate": 0.00013010984843215312, + "loss": 1.7878, + "step": 3998 + }, + { + "epoch": 0.4207259337190952, + "grad_norm": 2.770078659057617, + "learning_rate": 0.00013007735074224645, + "loss": 1.7626, + "step": 3999 + }, + { + "epoch": 0.4208311415044713, + "grad_norm": 1.6361559629440308, + "learning_rate": 0.0001300448495595561, + "loss": 1.6572, + "step": 4000 + }, + { + "epoch": 0.42093634928984747, + "grad_norm": 1.2924569845199585, + "learning_rate": 0.0001300123448878563, + "loss": 1.9668, + "step": 4001 + }, + { + "epoch": 0.42104155707522356, + "grad_norm": 1.4006869792938232, + "learning_rate": 0.00012997983673092173, + "loss": 1.384, + "step": 4002 + }, + { + "epoch": 0.4211467648605997, + "grad_norm": 1.6290597915649414, + "learning_rate": 0.00012994732509252744, + "loss": 2.2109, + "step": 4003 + }, + { + "epoch": 0.4212519726459758, + "grad_norm": 1.2416553497314453, + "learning_rate": 0.00012991480997644886, + "loss": 2.3437, + "step": 4004 + }, + { + "epoch": 0.4213571804313519, + "grad_norm": 1.4514647722244263, + "learning_rate": 0.00012988229138646192, + "loss": 1.9429, + "step": 4005 + }, + { + "epoch": 0.42146238821672805, + "grad_norm": 1.3162492513656616, + "learning_rate": 0.00012984976932634292, + "loss": 1.8274, + "step": 4006 + }, + { + "epoch": 0.42156759600210414, + "grad_norm": 1.3386199474334717, + "learning_rate": 0.00012981724379986846, + "loss": 1.7178, + "step": 4007 + }, + { + "epoch": 0.4216728037874803, + "grad_norm": 1.1760342121124268, + "learning_rate": 0.00012978471481081566, + "loss": 1.4665, + "step": 4008 + }, + { + "epoch": 0.4217780115728564, + "grad_norm": 1.1634572744369507, + "learning_rate": 0.00012975218236296204, + "loss": 1.805, + "step": 4009 + }, + { + "epoch": 0.42188321935823253, + "grad_norm": 1.3325061798095703, + "learning_rate": 0.00012971964646008542, + "loss": 2.1894, + "step": 4010 + }, + { + "epoch": 0.4219884271436086, + "grad_norm": 1.4473451375961304, + "learning_rate": 0.00012968710710596417, + "loss": 2.0688, + "step": 4011 + }, + { + "epoch": 0.4220936349289847, + "grad_norm": 1.3676494359970093, + "learning_rate": 0.0001296545643043769, + "loss": 2.0859, + "step": 4012 + }, + { + "epoch": 0.42219884271436087, + "grad_norm": 1.168908953666687, + "learning_rate": 0.00012962201805910274, + "loss": 1.6922, + "step": 4013 + }, + { + "epoch": 0.42230405049973696, + "grad_norm": 1.1409196853637695, + "learning_rate": 0.00012958946837392113, + "loss": 1.9286, + "step": 4014 + }, + { + "epoch": 0.4224092582851131, + "grad_norm": 1.3931876420974731, + "learning_rate": 0.00012955691525261203, + "loss": 1.5437, + "step": 4015 + }, + { + "epoch": 0.4225144660704892, + "grad_norm": 1.6057072877883911, + "learning_rate": 0.00012952435869895569, + "loss": 2.0498, + "step": 4016 + }, + { + "epoch": 0.42261967385586535, + "grad_norm": 2.05791974067688, + "learning_rate": 0.00012949179871673278, + "loss": 1.7363, + "step": 4017 + }, + { + "epoch": 0.42272488164124145, + "grad_norm": 1.1081055402755737, + "learning_rate": 0.00012945923530972438, + "loss": 2.24, + "step": 4018 + }, + { + "epoch": 0.4228300894266176, + "grad_norm": 1.165257215499878, + "learning_rate": 0.00012942666848171202, + "loss": 2.1699, + "step": 4019 + }, + { + "epoch": 0.4229352972119937, + "grad_norm": 2.037813901901245, + "learning_rate": 0.00012939409823647753, + "loss": 1.3196, + "step": 4020 + }, + { + "epoch": 0.4230405049973698, + "grad_norm": 1.1846753358840942, + "learning_rate": 0.00012936152457780322, + "loss": 1.6817, + "step": 4021 + }, + { + "epoch": 0.42314571278274593, + "grad_norm": 0.9720628261566162, + "learning_rate": 0.00012932894750947177, + "loss": 1.6067, + "step": 4022 + }, + { + "epoch": 0.423250920568122, + "grad_norm": 1.4518640041351318, + "learning_rate": 0.00012929636703526618, + "loss": 1.9264, + "step": 4023 + }, + { + "epoch": 0.4233561283534982, + "grad_norm": 1.9571900367736816, + "learning_rate": 0.00012926378315896998, + "loss": 2.0874, + "step": 4024 + }, + { + "epoch": 0.42346133613887427, + "grad_norm": 1.8690723180770874, + "learning_rate": 0.00012923119588436702, + "loss": 1.3981, + "step": 4025 + }, + { + "epoch": 0.4235665439242504, + "grad_norm": 0.9987134337425232, + "learning_rate": 0.0001291986052152415, + "loss": 2.2083, + "step": 4026 + }, + { + "epoch": 0.4236717517096265, + "grad_norm": 1.6420297622680664, + "learning_rate": 0.0001291660111553781, + "loss": 2.006, + "step": 4027 + }, + { + "epoch": 0.4237769594950026, + "grad_norm": 1.2390166521072388, + "learning_rate": 0.0001291334137085619, + "loss": 1.4468, + "step": 4028 + }, + { + "epoch": 0.42388216728037875, + "grad_norm": 1.2712844610214233, + "learning_rate": 0.00012910081287857827, + "loss": 2.3897, + "step": 4029 + }, + { + "epoch": 0.42398737506575485, + "grad_norm": 1.1292376518249512, + "learning_rate": 0.0001290682086692131, + "loss": 2.0741, + "step": 4030 + }, + { + "epoch": 0.424092582851131, + "grad_norm": 1.479219913482666, + "learning_rate": 0.00012903560108425258, + "loss": 1.9733, + "step": 4031 + }, + { + "epoch": 0.4241977906365071, + "grad_norm": 0.8995711207389832, + "learning_rate": 0.00012900299012748328, + "loss": 1.9121, + "step": 4032 + }, + { + "epoch": 0.42430299842188324, + "grad_norm": 1.6135600805282593, + "learning_rate": 0.00012897037580269225, + "loss": 1.7079, + "step": 4033 + }, + { + "epoch": 0.42440820620725933, + "grad_norm": 1.3254718780517578, + "learning_rate": 0.0001289377581136669, + "loss": 1.5347, + "step": 4034 + }, + { + "epoch": 0.4245134139926355, + "grad_norm": 1.225527286529541, + "learning_rate": 0.00012890513706419497, + "loss": 0.9884, + "step": 4035 + }, + { + "epoch": 0.4246186217780116, + "grad_norm": 1.1481863260269165, + "learning_rate": 0.00012887251265806466, + "loss": 1.9885, + "step": 4036 + }, + { + "epoch": 0.42472382956338767, + "grad_norm": 1.3110417127609253, + "learning_rate": 0.00012883988489906454, + "loss": 1.7185, + "step": 4037 + }, + { + "epoch": 0.4248290373487638, + "grad_norm": 1.7494990825653076, + "learning_rate": 0.00012880725379098352, + "loss": 1.8137, + "step": 4038 + }, + { + "epoch": 0.4249342451341399, + "grad_norm": 1.0107097625732422, + "learning_rate": 0.00012877461933761102, + "loss": 1.7371, + "step": 4039 + }, + { + "epoch": 0.42503945291951606, + "grad_norm": 1.2290560007095337, + "learning_rate": 0.00012874198154273672, + "loss": 2.0912, + "step": 4040 + }, + { + "epoch": 0.42514466070489215, + "grad_norm": 1.1773860454559326, + "learning_rate": 0.00012870934041015071, + "loss": 1.9689, + "step": 4041 + }, + { + "epoch": 0.4252498684902683, + "grad_norm": 1.1857486963272095, + "learning_rate": 0.00012867669594364357, + "loss": 1.6265, + "step": 4042 + }, + { + "epoch": 0.4253550762756444, + "grad_norm": 1.5043977499008179, + "learning_rate": 0.00012864404814700618, + "loss": 1.9106, + "step": 4043 + }, + { + "epoch": 0.4254602840610205, + "grad_norm": 1.6733112335205078, + "learning_rate": 0.00012861139702402977, + "loss": 1.4501, + "step": 4044 + }, + { + "epoch": 0.42556549184639664, + "grad_norm": 1.2191860675811768, + "learning_rate": 0.00012857874257850605, + "loss": 1.5926, + "step": 4045 + }, + { + "epoch": 0.42567069963177273, + "grad_norm": 1.3937748670578003, + "learning_rate": 0.00012854608481422707, + "loss": 1.2054, + "step": 4046 + }, + { + "epoch": 0.4257759074171489, + "grad_norm": 1.3559739589691162, + "learning_rate": 0.00012851342373498525, + "loss": 1.5141, + "step": 4047 + }, + { + "epoch": 0.425881115202525, + "grad_norm": 1.3075133562088013, + "learning_rate": 0.0001284807593445734, + "loss": 1.3719, + "step": 4048 + }, + { + "epoch": 0.4259863229879011, + "grad_norm": 1.1074446439743042, + "learning_rate": 0.00012844809164678478, + "loss": 1.8366, + "step": 4049 + }, + { + "epoch": 0.4260915307732772, + "grad_norm": 2.5052926540374756, + "learning_rate": 0.00012841542064541292, + "loss": 1.7895, + "step": 4050 + }, + { + "epoch": 0.42619673855865337, + "grad_norm": 1.4112364053726196, + "learning_rate": 0.00012838274634425188, + "loss": 1.6224, + "step": 4051 + }, + { + "epoch": 0.42630194634402946, + "grad_norm": 1.2035661935806274, + "learning_rate": 0.00012835006874709594, + "loss": 1.7851, + "step": 4052 + }, + { + "epoch": 0.42640715412940555, + "grad_norm": 1.3116462230682373, + "learning_rate": 0.00012831738785773985, + "loss": 1.5662, + "step": 4053 + }, + { + "epoch": 0.4265123619147817, + "grad_norm": 1.2714296579360962, + "learning_rate": 0.00012828470367997884, + "loss": 1.3814, + "step": 4054 + }, + { + "epoch": 0.4266175697001578, + "grad_norm": 2.2298810482025146, + "learning_rate": 0.00012825201621760826, + "loss": 1.8722, + "step": 4055 + }, + { + "epoch": 0.42672277748553394, + "grad_norm": 1.7081495523452759, + "learning_rate": 0.00012821932547442408, + "loss": 2.1527, + "step": 4056 + }, + { + "epoch": 0.42682798527091004, + "grad_norm": 1.4093936681747437, + "learning_rate": 0.00012818663145422256, + "loss": 2.1676, + "step": 4057 + }, + { + "epoch": 0.4269331930562862, + "grad_norm": 1.707614541053772, + "learning_rate": 0.00012815393416080035, + "loss": 1.9155, + "step": 4058 + }, + { + "epoch": 0.4270384008416623, + "grad_norm": 1.1860383749008179, + "learning_rate": 0.00012812123359795446, + "loss": 1.788, + "step": 4059 + }, + { + "epoch": 0.4271436086270384, + "grad_norm": 1.6916757822036743, + "learning_rate": 0.00012808852976948232, + "loss": 1.3056, + "step": 4060 + }, + { + "epoch": 0.4272488164124145, + "grad_norm": 1.7355988025665283, + "learning_rate": 0.00012805582267918172, + "loss": 2.3532, + "step": 4061 + }, + { + "epoch": 0.4273540241977906, + "grad_norm": 0.883721113204956, + "learning_rate": 0.00012802311233085082, + "loss": 2.0607, + "step": 4062 + }, + { + "epoch": 0.42745923198316677, + "grad_norm": 1.3145248889923096, + "learning_rate": 0.00012799039872828812, + "loss": 2.2087, + "step": 4063 + }, + { + "epoch": 0.42756443976854286, + "grad_norm": 1.7425910234451294, + "learning_rate": 0.00012795768187529263, + "loss": 0.9703, + "step": 4064 + }, + { + "epoch": 0.427669647553919, + "grad_norm": 0.8028357028961182, + "learning_rate": 0.00012792496177566363, + "loss": 1.613, + "step": 4065 + }, + { + "epoch": 0.4277748553392951, + "grad_norm": 1.5005557537078857, + "learning_rate": 0.00012789223843320073, + "loss": 1.8242, + "step": 4066 + }, + { + "epoch": 0.42788006312467125, + "grad_norm": 1.2664399147033691, + "learning_rate": 0.00012785951185170403, + "loss": 2.2433, + "step": 4067 + }, + { + "epoch": 0.42798527091004734, + "grad_norm": 1.4799624681472778, + "learning_rate": 0.000127826782034974, + "loss": 2.0403, + "step": 4068 + }, + { + "epoch": 0.42809047869542344, + "grad_norm": 1.3635642528533936, + "learning_rate": 0.00012779404898681136, + "loss": 2.2521, + "step": 4069 + }, + { + "epoch": 0.4281956864807996, + "grad_norm": 1.3099942207336426, + "learning_rate": 0.00012776131271101732, + "loss": 2.0154, + "step": 4070 + }, + { + "epoch": 0.4283008942661757, + "grad_norm": 1.4328498840332031, + "learning_rate": 0.00012772857321139352, + "loss": 1.9576, + "step": 4071 + }, + { + "epoch": 0.42840610205155183, + "grad_norm": 1.504550576210022, + "learning_rate": 0.00012769583049174177, + "loss": 2.4232, + "step": 4072 + }, + { + "epoch": 0.4285113098369279, + "grad_norm": 1.3973898887634277, + "learning_rate": 0.0001276630845558644, + "loss": 1.5293, + "step": 4073 + }, + { + "epoch": 0.42861651762230407, + "grad_norm": 1.2882184982299805, + "learning_rate": 0.00012763033540756416, + "loss": 2.3094, + "step": 4074 + }, + { + "epoch": 0.42872172540768017, + "grad_norm": 1.2002280950546265, + "learning_rate": 0.00012759758305064405, + "loss": 1.8278, + "step": 4075 + }, + { + "epoch": 0.42882693319305626, + "grad_norm": 1.0488744974136353, + "learning_rate": 0.0001275648274889075, + "loss": 1.8504, + "step": 4076 + }, + { + "epoch": 0.4289321409784324, + "grad_norm": 1.2440074682235718, + "learning_rate": 0.00012753206872615825, + "loss": 1.8653, + "step": 4077 + }, + { + "epoch": 0.4290373487638085, + "grad_norm": 0.9582875967025757, + "learning_rate": 0.00012749930676620057, + "loss": 2.0311, + "step": 4078 + }, + { + "epoch": 0.42914255654918465, + "grad_norm": 1.1008398532867432, + "learning_rate": 0.00012746654161283896, + "loss": 1.6135, + "step": 4079 + }, + { + "epoch": 0.42924776433456074, + "grad_norm": 1.095392107963562, + "learning_rate": 0.00012743377326987826, + "loss": 1.7668, + "step": 4080 + }, + { + "epoch": 0.4293529721199369, + "grad_norm": 1.0410737991333008, + "learning_rate": 0.00012740100174112384, + "loss": 1.607, + "step": 4081 + }, + { + "epoch": 0.429458179905313, + "grad_norm": 0.979989767074585, + "learning_rate": 0.00012736822703038133, + "loss": 1.4283, + "step": 4082 + }, + { + "epoch": 0.42956338769068914, + "grad_norm": 1.1817355155944824, + "learning_rate": 0.00012733544914145673, + "loss": 1.7454, + "step": 4083 + }, + { + "epoch": 0.42966859547606523, + "grad_norm": 1.5766743421554565, + "learning_rate": 0.00012730266807815642, + "loss": 2.0112, + "step": 4084 + }, + { + "epoch": 0.4297738032614413, + "grad_norm": 1.0003412961959839, + "learning_rate": 0.0001272698838442872, + "loss": 1.9937, + "step": 4085 + }, + { + "epoch": 0.42987901104681747, + "grad_norm": 2.305222272872925, + "learning_rate": 0.00012723709644365614, + "loss": 2.0902, + "step": 4086 + }, + { + "epoch": 0.42998421883219357, + "grad_norm": 1.1228468418121338, + "learning_rate": 0.00012720430588007077, + "loss": 2.1054, + "step": 4087 + }, + { + "epoch": 0.4300894266175697, + "grad_norm": 1.6916412115097046, + "learning_rate": 0.00012717151215733892, + "loss": 1.3402, + "step": 4088 + }, + { + "epoch": 0.4301946344029458, + "grad_norm": 1.376791000366211, + "learning_rate": 0.0001271387152792689, + "loss": 2.2196, + "step": 4089 + }, + { + "epoch": 0.43029984218832196, + "grad_norm": 1.307098150253296, + "learning_rate": 0.00012710591524966918, + "loss": 1.9181, + "step": 4090 + }, + { + "epoch": 0.43040504997369805, + "grad_norm": 0.9531205892562866, + "learning_rate": 0.00012707311207234878, + "loss": 1.4014, + "step": 4091 + }, + { + "epoch": 0.43051025775907414, + "grad_norm": 1.9267292022705078, + "learning_rate": 0.00012704030575111705, + "loss": 1.4603, + "step": 4092 + }, + { + "epoch": 0.4306154655444503, + "grad_norm": 1.3264367580413818, + "learning_rate": 0.00012700749628978363, + "loss": 2.0973, + "step": 4093 + }, + { + "epoch": 0.4307206733298264, + "grad_norm": 1.430148720741272, + "learning_rate": 0.00012697468369215863, + "loss": 1.5874, + "step": 4094 + }, + { + "epoch": 0.43082588111520254, + "grad_norm": 1.3423150777816772, + "learning_rate": 0.00012694186796205243, + "loss": 1.6053, + "step": 4095 + }, + { + "epoch": 0.43093108890057863, + "grad_norm": 1.2820467948913574, + "learning_rate": 0.00012690904910327578, + "loss": 1.8371, + "step": 4096 + }, + { + "epoch": 0.4310362966859548, + "grad_norm": 1.965772271156311, + "learning_rate": 0.00012687622711963993, + "loss": 1.9467, + "step": 4097 + }, + { + "epoch": 0.43114150447133087, + "grad_norm": 1.4754441976547241, + "learning_rate": 0.0001268434020149563, + "loss": 1.8387, + "step": 4098 + }, + { + "epoch": 0.431246712256707, + "grad_norm": 1.0966287851333618, + "learning_rate": 0.00012681057379303678, + "loss": 2.0815, + "step": 4099 + }, + { + "epoch": 0.4313519200420831, + "grad_norm": 1.2483052015304565, + "learning_rate": 0.00012677774245769362, + "loss": 2.1525, + "step": 4100 + }, + { + "epoch": 0.4314571278274592, + "grad_norm": 1.1769499778747559, + "learning_rate": 0.00012674490801273938, + "loss": 2.0793, + "step": 4101 + }, + { + "epoch": 0.43156233561283536, + "grad_norm": 1.8558199405670166, + "learning_rate": 0.00012671207046198706, + "loss": 2.3923, + "step": 4102 + }, + { + "epoch": 0.43166754339821145, + "grad_norm": 1.0593931674957275, + "learning_rate": 0.00012667922980924998, + "loss": 2.0428, + "step": 4103 + }, + { + "epoch": 0.4317727511835876, + "grad_norm": 0.8765531182289124, + "learning_rate": 0.00012664638605834177, + "loss": 1.7572, + "step": 4104 + }, + { + "epoch": 0.4318779589689637, + "grad_norm": 0.9581341743469238, + "learning_rate": 0.00012661353921307648, + "loss": 2.2846, + "step": 4105 + }, + { + "epoch": 0.43198316675433984, + "grad_norm": 1.0098820924758911, + "learning_rate": 0.00012658068927726853, + "loss": 1.7325, + "step": 4106 + }, + { + "epoch": 0.43208837453971594, + "grad_norm": 1.224402666091919, + "learning_rate": 0.00012654783625473266, + "loss": 1.6651, + "step": 4107 + }, + { + "epoch": 0.43219358232509203, + "grad_norm": 0.9913700222969055, + "learning_rate": 0.00012651498014928402, + "loss": 2.4144, + "step": 4108 + }, + { + "epoch": 0.4322987901104682, + "grad_norm": 1.0635404586791992, + "learning_rate": 0.00012648212096473798, + "loss": 1.4415, + "step": 4109 + }, + { + "epoch": 0.43240399789584427, + "grad_norm": 0.8817548751831055, + "learning_rate": 0.00012644925870491052, + "loss": 1.8636, + "step": 4110 + }, + { + "epoch": 0.4325092056812204, + "grad_norm": 1.284338116645813, + "learning_rate": 0.0001264163933736177, + "loss": 1.6044, + "step": 4111 + }, + { + "epoch": 0.4326144134665965, + "grad_norm": 0.9684668779373169, + "learning_rate": 0.00012638352497467608, + "loss": 2.1041, + "step": 4112 + }, + { + "epoch": 0.43271962125197266, + "grad_norm": 1.4305078983306885, + "learning_rate": 0.00012635065351190261, + "loss": 1.9746, + "step": 4113 + }, + { + "epoch": 0.43282482903734876, + "grad_norm": 1.0623332262039185, + "learning_rate": 0.0001263177789891145, + "loss": 1.7623, + "step": 4114 + }, + { + "epoch": 0.4329300368227249, + "grad_norm": 1.431246042251587, + "learning_rate": 0.00012628490141012937, + "loss": 2.0701, + "step": 4115 + }, + { + "epoch": 0.433035244608101, + "grad_norm": 1.5637847185134888, + "learning_rate": 0.00012625202077876525, + "loss": 1.6522, + "step": 4116 + }, + { + "epoch": 0.4331404523934771, + "grad_norm": 1.6422128677368164, + "learning_rate": 0.00012621913709884037, + "loss": 1.8042, + "step": 4117 + }, + { + "epoch": 0.43324566017885324, + "grad_norm": 1.0828219652175903, + "learning_rate": 0.0001261862503741734, + "loss": 1.9911, + "step": 4118 + }, + { + "epoch": 0.43335086796422934, + "grad_norm": 1.0087640285491943, + "learning_rate": 0.00012615336060858344, + "loss": 2.199, + "step": 4119 + }, + { + "epoch": 0.4334560757496055, + "grad_norm": 1.5731931924819946, + "learning_rate": 0.00012612046780588986, + "loss": 1.9389, + "step": 4120 + }, + { + "epoch": 0.4335612835349816, + "grad_norm": 1.271106481552124, + "learning_rate": 0.00012608757196991234, + "loss": 1.8302, + "step": 4121 + }, + { + "epoch": 0.4336664913203577, + "grad_norm": 1.1410576105117798, + "learning_rate": 0.000126054673104471, + "loss": 1.6422, + "step": 4122 + }, + { + "epoch": 0.4337716991057338, + "grad_norm": 1.52702796459198, + "learning_rate": 0.00012602177121338626, + "loss": 2.0576, + "step": 4123 + }, + { + "epoch": 0.4338769068911099, + "grad_norm": 1.1503021717071533, + "learning_rate": 0.0001259888663004789, + "loss": 1.6653, + "step": 4124 + }, + { + "epoch": 0.43398211467648606, + "grad_norm": 1.6308045387268066, + "learning_rate": 0.00012595595836957006, + "loss": 1.885, + "step": 4125 + }, + { + "epoch": 0.43408732246186216, + "grad_norm": 1.5235882997512817, + "learning_rate": 0.0001259230474244813, + "loss": 1.9792, + "step": 4126 + }, + { + "epoch": 0.4341925302472383, + "grad_norm": 1.6996004581451416, + "learning_rate": 0.00012589013346903438, + "loss": 1.9094, + "step": 4127 + }, + { + "epoch": 0.4342977380326144, + "grad_norm": 1.2587865591049194, + "learning_rate": 0.0001258572165070515, + "loss": 1.8968, + "step": 4128 + }, + { + "epoch": 0.43440294581799055, + "grad_norm": 1.0128796100616455, + "learning_rate": 0.00012582429654235523, + "loss": 1.1083, + "step": 4129 + }, + { + "epoch": 0.43450815360336664, + "grad_norm": 1.4834532737731934, + "learning_rate": 0.00012579137357876844, + "loss": 2.2179, + "step": 4130 + }, + { + "epoch": 0.4346133613887428, + "grad_norm": 1.3126288652420044, + "learning_rate": 0.00012575844762011438, + "loss": 2.4677, + "step": 4131 + }, + { + "epoch": 0.4347185691741189, + "grad_norm": 0.8772363066673279, + "learning_rate": 0.0001257255186702166, + "loss": 1.4076, + "step": 4132 + }, + { + "epoch": 0.434823776959495, + "grad_norm": 1.3988395929336548, + "learning_rate": 0.00012569258673289903, + "loss": 1.8661, + "step": 4133 + }, + { + "epoch": 0.4349289847448711, + "grad_norm": 1.469456434249878, + "learning_rate": 0.000125659651811986, + "loss": 1.8328, + "step": 4134 + }, + { + "epoch": 0.4350341925302472, + "grad_norm": 1.3229504823684692, + "learning_rate": 0.00012562671391130208, + "loss": 1.7977, + "step": 4135 + }, + { + "epoch": 0.43513940031562337, + "grad_norm": 1.2461540699005127, + "learning_rate": 0.00012559377303467226, + "loss": 1.7205, + "step": 4136 + }, + { + "epoch": 0.43524460810099946, + "grad_norm": 1.4295867681503296, + "learning_rate": 0.00012556082918592187, + "loss": 2.3218, + "step": 4137 + }, + { + "epoch": 0.4353498158863756, + "grad_norm": 1.1670809984207153, + "learning_rate": 0.00012552788236887654, + "loss": 1.7761, + "step": 4138 + }, + { + "epoch": 0.4354550236717517, + "grad_norm": 2.9805972576141357, + "learning_rate": 0.0001254949325873623, + "loss": 1.9536, + "step": 4139 + }, + { + "epoch": 0.4355602314571278, + "grad_norm": 1.0447720289230347, + "learning_rate": 0.0001254619798452055, + "loss": 1.6888, + "step": 4140 + }, + { + "epoch": 0.43566543924250395, + "grad_norm": 1.6240369081497192, + "learning_rate": 0.00012542902414623282, + "loss": 2.3843, + "step": 4141 + }, + { + "epoch": 0.43577064702788004, + "grad_norm": 1.2419333457946777, + "learning_rate": 0.0001253960654942713, + "loss": 2.0027, + "step": 4142 + }, + { + "epoch": 0.4358758548132562, + "grad_norm": 1.9167548418045044, + "learning_rate": 0.00012536310389314832, + "loss": 1.8499, + "step": 4143 + }, + { + "epoch": 0.4359810625986323, + "grad_norm": 0.9558454155921936, + "learning_rate": 0.0001253301393466916, + "loss": 2.0347, + "step": 4144 + }, + { + "epoch": 0.43608627038400843, + "grad_norm": 1.5771024227142334, + "learning_rate": 0.0001252971718587292, + "loss": 2.0212, + "step": 4145 + }, + { + "epoch": 0.43619147816938453, + "grad_norm": 1.3093889951705933, + "learning_rate": 0.00012526420143308954, + "loss": 1.4546, + "step": 4146 + }, + { + "epoch": 0.4362966859547607, + "grad_norm": 1.1525970697402954, + "learning_rate": 0.00012523122807360138, + "loss": 1.8394, + "step": 4147 + }, + { + "epoch": 0.43640189374013677, + "grad_norm": 1.2308920621871948, + "learning_rate": 0.00012519825178409377, + "loss": 1.8985, + "step": 4148 + }, + { + "epoch": 0.43650710152551286, + "grad_norm": 1.5874372720718384, + "learning_rate": 0.00012516527256839616, + "loss": 1.814, + "step": 4149 + }, + { + "epoch": 0.436612309310889, + "grad_norm": 1.3634819984436035, + "learning_rate": 0.0001251322904303383, + "loss": 1.764, + "step": 4150 + }, + { + "epoch": 0.4367175170962651, + "grad_norm": 1.272567868232727, + "learning_rate": 0.00012509930537375036, + "loss": 2.2667, + "step": 4151 + }, + { + "epoch": 0.43682272488164126, + "grad_norm": 2.495673418045044, + "learning_rate": 0.0001250663174024627, + "loss": 1.88, + "step": 4152 + }, + { + "epoch": 0.43692793266701735, + "grad_norm": 1.3136742115020752, + "learning_rate": 0.00012503332652030613, + "loss": 1.7169, + "step": 4153 + }, + { + "epoch": 0.4370331404523935, + "grad_norm": 1.5158318281173706, + "learning_rate": 0.0001250003327311118, + "loss": 1.7009, + "step": 4154 + }, + { + "epoch": 0.4371383482377696, + "grad_norm": 1.6757663488388062, + "learning_rate": 0.00012496733603871115, + "loss": 2.4002, + "step": 4155 + }, + { + "epoch": 0.4372435560231457, + "grad_norm": 1.6522952318191528, + "learning_rate": 0.000124934336446936, + "loss": 1.8635, + "step": 4156 + }, + { + "epoch": 0.43734876380852183, + "grad_norm": 1.3090764284133911, + "learning_rate": 0.00012490133395961844, + "loss": 1.7477, + "step": 4157 + }, + { + "epoch": 0.43745397159389793, + "grad_norm": 1.1116405725479126, + "learning_rate": 0.000124868328580591, + "loss": 1.7589, + "step": 4158 + }, + { + "epoch": 0.4375591793792741, + "grad_norm": 1.0621029138565063, + "learning_rate": 0.0001248353203136864, + "loss": 1.7158, + "step": 4159 + }, + { + "epoch": 0.43766438716465017, + "grad_norm": 1.3473455905914307, + "learning_rate": 0.00012480230916273784, + "loss": 1.8552, + "step": 4160 + }, + { + "epoch": 0.4377695949500263, + "grad_norm": 1.4094268083572388, + "learning_rate": 0.00012476929513157881, + "loss": 1.7809, + "step": 4161 + }, + { + "epoch": 0.4378748027354024, + "grad_norm": 1.2711224555969238, + "learning_rate": 0.00012473627822404314, + "loss": 1.9564, + "step": 4162 + }, + { + "epoch": 0.43798001052077856, + "grad_norm": 1.670932650566101, + "learning_rate": 0.00012470325844396487, + "loss": 1.7578, + "step": 4163 + }, + { + "epoch": 0.43808521830615466, + "grad_norm": 1.6926169395446777, + "learning_rate": 0.00012467023579517856, + "loss": 1.9468, + "step": 4164 + }, + { + "epoch": 0.43819042609153075, + "grad_norm": 1.197359323501587, + "learning_rate": 0.000124637210281519, + "loss": 1.4122, + "step": 4165 + }, + { + "epoch": 0.4382956338769069, + "grad_norm": 1.7274643182754517, + "learning_rate": 0.00012460418190682134, + "loss": 2.1852, + "step": 4166 + }, + { + "epoch": 0.438400841662283, + "grad_norm": 1.5755963325500488, + "learning_rate": 0.00012457115067492108, + "loss": 2.3088, + "step": 4167 + }, + { + "epoch": 0.43850604944765914, + "grad_norm": 1.3150062561035156, + "learning_rate": 0.000124538116589654, + "loss": 1.9563, + "step": 4168 + }, + { + "epoch": 0.43861125723303523, + "grad_norm": 1.4447141885757446, + "learning_rate": 0.0001245050796548562, + "loss": 1.6322, + "step": 4169 + }, + { + "epoch": 0.4387164650184114, + "grad_norm": 1.5568017959594727, + "learning_rate": 0.0001244720398743642, + "loss": 1.272, + "step": 4170 + }, + { + "epoch": 0.4388216728037875, + "grad_norm": 1.4406249523162842, + "learning_rate": 0.00012443899725201482, + "loss": 1.4789, + "step": 4171 + }, + { + "epoch": 0.43892688058916357, + "grad_norm": 2.007683753967285, + "learning_rate": 0.0001244059517916452, + "loss": 2.0801, + "step": 4172 + }, + { + "epoch": 0.4390320883745397, + "grad_norm": 2.023632287979126, + "learning_rate": 0.00012437290349709271, + "loss": 2.739, + "step": 4173 + }, + { + "epoch": 0.4391372961599158, + "grad_norm": 1.7203633785247803, + "learning_rate": 0.0001243398523721952, + "loss": 2.2986, + "step": 4174 + }, + { + "epoch": 0.43924250394529196, + "grad_norm": 1.1265063285827637, + "learning_rate": 0.0001243067984207908, + "loss": 1.9861, + "step": 4175 + }, + { + "epoch": 0.43934771173066806, + "grad_norm": 1.3561371564865112, + "learning_rate": 0.00012427374164671794, + "loss": 2.1845, + "step": 4176 + }, + { + "epoch": 0.4394529195160442, + "grad_norm": 1.0997103452682495, + "learning_rate": 0.00012424068205381538, + "loss": 2.1557, + "step": 4177 + }, + { + "epoch": 0.4395581273014203, + "grad_norm": 1.7254860401153564, + "learning_rate": 0.00012420761964592223, + "loss": 1.6301, + "step": 4178 + }, + { + "epoch": 0.43966333508679645, + "grad_norm": 1.4888262748718262, + "learning_rate": 0.00012417455442687795, + "loss": 1.9758, + "step": 4179 + }, + { + "epoch": 0.43976854287217254, + "grad_norm": 1.2371491193771362, + "learning_rate": 0.00012414148640052227, + "loss": 2.0836, + "step": 4180 + }, + { + "epoch": 0.43987375065754863, + "grad_norm": 1.2289551496505737, + "learning_rate": 0.00012410841557069523, + "loss": 2.0425, + "step": 4181 + }, + { + "epoch": 0.4399789584429248, + "grad_norm": 1.2869768142700195, + "learning_rate": 0.0001240753419412373, + "loss": 1.9797, + "step": 4182 + }, + { + "epoch": 0.4400841662283009, + "grad_norm": 1.227531909942627, + "learning_rate": 0.00012404226551598923, + "loss": 2.0121, + "step": 4183 + }, + { + "epoch": 0.440189374013677, + "grad_norm": 1.2489982843399048, + "learning_rate": 0.000124009186298792, + "loss": 1.5442, + "step": 4184 + }, + { + "epoch": 0.4402945817990531, + "grad_norm": 1.3487237691879272, + "learning_rate": 0.000123976104293487, + "loss": 1.7614, + "step": 4185 + }, + { + "epoch": 0.44039978958442927, + "grad_norm": 1.567232370376587, + "learning_rate": 0.000123943019503916, + "loss": 1.7655, + "step": 4186 + }, + { + "epoch": 0.44050499736980536, + "grad_norm": 3.1679434776306152, + "learning_rate": 0.00012390993193392097, + "loss": 1.0603, + "step": 4187 + }, + { + "epoch": 0.44061020515518146, + "grad_norm": 1.1254740953445435, + "learning_rate": 0.00012387684158734425, + "loss": 1.781, + "step": 4188 + }, + { + "epoch": 0.4407154129405576, + "grad_norm": 1.0286729335784912, + "learning_rate": 0.0001238437484680286, + "loss": 1.839, + "step": 4189 + }, + { + "epoch": 0.4408206207259337, + "grad_norm": 1.7165943384170532, + "learning_rate": 0.0001238106525798169, + "loss": 1.6264, + "step": 4190 + }, + { + "epoch": 0.44092582851130985, + "grad_norm": 1.339667797088623, + "learning_rate": 0.00012377755392655254, + "loss": 1.8619, + "step": 4191 + }, + { + "epoch": 0.44103103629668594, + "grad_norm": 1.1098575592041016, + "learning_rate": 0.00012374445251207914, + "loss": 1.8949, + "step": 4192 + }, + { + "epoch": 0.4411362440820621, + "grad_norm": 1.4389253854751587, + "learning_rate": 0.00012371134834024067, + "loss": 1.6791, + "step": 4193 + }, + { + "epoch": 0.4412414518674382, + "grad_norm": 1.6168919801712036, + "learning_rate": 0.00012367824141488142, + "loss": 1.251, + "step": 4194 + }, + { + "epoch": 0.44134665965281433, + "grad_norm": 1.0156736373901367, + "learning_rate": 0.00012364513173984592, + "loss": 2.281, + "step": 4195 + }, + { + "epoch": 0.4414518674381904, + "grad_norm": 1.645992636680603, + "learning_rate": 0.00012361201931897916, + "loss": 1.8607, + "step": 4196 + }, + { + "epoch": 0.4415570752235665, + "grad_norm": 1.0895994901657104, + "learning_rate": 0.00012357890415612635, + "loss": 2.0752, + "step": 4197 + }, + { + "epoch": 0.44166228300894267, + "grad_norm": 1.4120984077453613, + "learning_rate": 0.00012354578625513302, + "loss": 1.6247, + "step": 4198 + }, + { + "epoch": 0.44176749079431876, + "grad_norm": 1.073986291885376, + "learning_rate": 0.00012351266561984507, + "loss": 1.4361, + "step": 4199 + }, + { + "epoch": 0.4418726985796949, + "grad_norm": 1.2587485313415527, + "learning_rate": 0.0001234795422541087, + "loss": 2.4348, + "step": 4200 + }, + { + "epoch": 0.441977906365071, + "grad_norm": 1.7424877882003784, + "learning_rate": 0.00012344641616177042, + "loss": 1.1056, + "step": 4201 + }, + { + "epoch": 0.44208311415044715, + "grad_norm": 1.4434423446655273, + "learning_rate": 0.00012341328734667698, + "loss": 2.2263, + "step": 4202 + }, + { + "epoch": 0.44218832193582325, + "grad_norm": 1.3455250263214111, + "learning_rate": 0.00012338015581267567, + "loss": 1.872, + "step": 4203 + }, + { + "epoch": 0.44229352972119934, + "grad_norm": 1.5791774988174438, + "learning_rate": 0.00012334702156361377, + "loss": 2.61, + "step": 4204 + }, + { + "epoch": 0.4423987375065755, + "grad_norm": 1.0933024883270264, + "learning_rate": 0.0001233138846033392, + "loss": 2.0891, + "step": 4205 + }, + { + "epoch": 0.4425039452919516, + "grad_norm": 1.2376352548599243, + "learning_rate": 0.00012328074493569993, + "loss": 1.4209, + "step": 4206 + }, + { + "epoch": 0.44260915307732773, + "grad_norm": 1.549952507019043, + "learning_rate": 0.00012324760256454445, + "loss": 1.6895, + "step": 4207 + }, + { + "epoch": 0.4427143608627038, + "grad_norm": 1.5059970617294312, + "learning_rate": 0.0001232144574937214, + "loss": 1.8844, + "step": 4208 + }, + { + "epoch": 0.44281956864808, + "grad_norm": 1.6260056495666504, + "learning_rate": 0.00012318130972707985, + "loss": 2.5282, + "step": 4209 + }, + { + "epoch": 0.44292477643345607, + "grad_norm": 1.3028151988983154, + "learning_rate": 0.0001231481592684692, + "loss": 2.0735, + "step": 4210 + }, + { + "epoch": 0.4430299842188322, + "grad_norm": 0.9203398823738098, + "learning_rate": 0.00012311500612173897, + "loss": 2.0104, + "step": 4211 + }, + { + "epoch": 0.4431351920042083, + "grad_norm": 1.2784510850906372, + "learning_rate": 0.0001230818502907392, + "loss": 1.7065, + "step": 4212 + }, + { + "epoch": 0.4432403997895844, + "grad_norm": 1.1374523639678955, + "learning_rate": 0.0001230486917793202, + "loss": 1.569, + "step": 4213 + }, + { + "epoch": 0.44334560757496055, + "grad_norm": 1.1995996236801147, + "learning_rate": 0.00012301553059133248, + "loss": 1.6635, + "step": 4214 + }, + { + "epoch": 0.44345081536033665, + "grad_norm": 1.210629940032959, + "learning_rate": 0.000122982366730627, + "loss": 1.8382, + "step": 4215 + }, + { + "epoch": 0.4435560231457128, + "grad_norm": 1.6628928184509277, + "learning_rate": 0.00012294920020105497, + "loss": 1.4067, + "step": 4216 + }, + { + "epoch": 0.4436612309310889, + "grad_norm": 1.001809000968933, + "learning_rate": 0.00012291603100646786, + "loss": 1.9476, + "step": 4217 + }, + { + "epoch": 0.44376643871646504, + "grad_norm": 2.0265283584594727, + "learning_rate": 0.00012288285915071752, + "loss": 1.9624, + "step": 4218 + }, + { + "epoch": 0.44387164650184113, + "grad_norm": 1.6050817966461182, + "learning_rate": 0.00012284968463765613, + "loss": 1.7624, + "step": 4219 + }, + { + "epoch": 0.4439768542872172, + "grad_norm": 1.2535775899887085, + "learning_rate": 0.00012281650747113612, + "loss": 2.0082, + "step": 4220 + }, + { + "epoch": 0.4440820620725934, + "grad_norm": 1.6964945793151855, + "learning_rate": 0.00012278332765501017, + "loss": 2.1988, + "step": 4221 + }, + { + "epoch": 0.44418726985796947, + "grad_norm": 1.662829041481018, + "learning_rate": 0.00012275014519313145, + "loss": 2.4153, + "step": 4222 + }, + { + "epoch": 0.4442924776433456, + "grad_norm": 1.6172958612442017, + "learning_rate": 0.00012271696008935324, + "loss": 1.6942, + "step": 4223 + }, + { + "epoch": 0.4443976854287217, + "grad_norm": 1.380727767944336, + "learning_rate": 0.0001226837723475293, + "loss": 1.9723, + "step": 4224 + }, + { + "epoch": 0.44450289321409786, + "grad_norm": 1.6496597528457642, + "learning_rate": 0.00012265058197151357, + "loss": 1.5928, + "step": 4225 + }, + { + "epoch": 0.44460810099947395, + "grad_norm": 1.51160728931427, + "learning_rate": 0.00012261738896516034, + "loss": 2.1494, + "step": 4226 + }, + { + "epoch": 0.4447133087848501, + "grad_norm": 1.5989267826080322, + "learning_rate": 0.0001225841933323242, + "loss": 1.6661, + "step": 4227 + }, + { + "epoch": 0.4448185165702262, + "grad_norm": 1.2797974348068237, + "learning_rate": 0.00012255099507686007, + "loss": 2.1762, + "step": 4228 + }, + { + "epoch": 0.4449237243556023, + "grad_norm": 0.9348226189613342, + "learning_rate": 0.00012251779420262312, + "loss": 1.6779, + "step": 4229 + }, + { + "epoch": 0.44502893214097844, + "grad_norm": 1.528698205947876, + "learning_rate": 0.0001224845907134689, + "loss": 1.4624, + "step": 4230 + }, + { + "epoch": 0.44513413992635453, + "grad_norm": 2.389446973800659, + "learning_rate": 0.00012245138461325318, + "loss": 1.4935, + "step": 4231 + }, + { + "epoch": 0.4452393477117307, + "grad_norm": 1.2253481149673462, + "learning_rate": 0.0001224181759058321, + "loss": 2.4449, + "step": 4232 + }, + { + "epoch": 0.4453445554971068, + "grad_norm": 1.3102329969406128, + "learning_rate": 0.00012238496459506207, + "loss": 1.9277, + "step": 4233 + }, + { + "epoch": 0.4454497632824829, + "grad_norm": 1.1610690355300903, + "learning_rate": 0.00012235175068479984, + "loss": 2.1319, + "step": 4234 + }, + { + "epoch": 0.445554971067859, + "grad_norm": 2.348677635192871, + "learning_rate": 0.00012231853417890237, + "loss": 2.2125, + "step": 4235 + }, + { + "epoch": 0.44566017885323517, + "grad_norm": 1.5758837461471558, + "learning_rate": 0.00012228531508122703, + "loss": 1.6423, + "step": 4236 + }, + { + "epoch": 0.44576538663861126, + "grad_norm": 1.0791726112365723, + "learning_rate": 0.00012225209339563145, + "loss": 1.7604, + "step": 4237 + }, + { + "epoch": 0.44587059442398735, + "grad_norm": 1.3658674955368042, + "learning_rate": 0.00012221886912597353, + "loss": 1.8358, + "step": 4238 + }, + { + "epoch": 0.4459758022093635, + "grad_norm": 2.2098424434661865, + "learning_rate": 0.00012218564227611152, + "loss": 2.1306, + "step": 4239 + }, + { + "epoch": 0.4460810099947396, + "grad_norm": 1.6169191598892212, + "learning_rate": 0.0001221524128499039, + "loss": 2.0847, + "step": 4240 + }, + { + "epoch": 0.44618621778011575, + "grad_norm": 1.3867640495300293, + "learning_rate": 0.00012211918085120954, + "loss": 1.8926, + "step": 4241 + }, + { + "epoch": 0.44629142556549184, + "grad_norm": 1.7240827083587646, + "learning_rate": 0.00012208594628388753, + "loss": 2.1408, + "step": 4242 + }, + { + "epoch": 0.446396633350868, + "grad_norm": 1.132155179977417, + "learning_rate": 0.00012205270915179729, + "loss": 2.1424, + "step": 4243 + }, + { + "epoch": 0.4465018411362441, + "grad_norm": 1.2491706609725952, + "learning_rate": 0.00012201946945879856, + "loss": 2.039, + "step": 4244 + }, + { + "epoch": 0.4466070489216202, + "grad_norm": 1.3799735307693481, + "learning_rate": 0.00012198622720875139, + "loss": 1.7383, + "step": 4245 + }, + { + "epoch": 0.4467122567069963, + "grad_norm": 1.3152798414230347, + "learning_rate": 0.000121952982405516, + "loss": 2.1134, + "step": 4246 + }, + { + "epoch": 0.4468174644923724, + "grad_norm": 1.6262292861938477, + "learning_rate": 0.00012191973505295311, + "loss": 1.8854, + "step": 4247 + }, + { + "epoch": 0.44692267227774857, + "grad_norm": 1.3993144035339355, + "learning_rate": 0.00012188648515492355, + "loss": 1.8349, + "step": 4248 + }, + { + "epoch": 0.44702788006312466, + "grad_norm": 1.2754366397857666, + "learning_rate": 0.00012185323271528853, + "loss": 1.2813, + "step": 4249 + }, + { + "epoch": 0.4471330878485008, + "grad_norm": 1.437742829322815, + "learning_rate": 0.00012181997773790954, + "loss": 1.79, + "step": 4250 + }, + { + "epoch": 0.4472382956338769, + "grad_norm": 1.1852113008499146, + "learning_rate": 0.00012178672022664838, + "loss": 2.2712, + "step": 4251 + }, + { + "epoch": 0.44734350341925305, + "grad_norm": 1.1398671865463257, + "learning_rate": 0.00012175346018536717, + "loss": 2.3377, + "step": 4252 + }, + { + "epoch": 0.44744871120462915, + "grad_norm": 0.8755077123641968, + "learning_rate": 0.00012172019761792825, + "loss": 2.2223, + "step": 4253 + }, + { + "epoch": 0.44755391899000524, + "grad_norm": 0.9048967957496643, + "learning_rate": 0.00012168693252819433, + "loss": 1.8152, + "step": 4254 + }, + { + "epoch": 0.4476591267753814, + "grad_norm": 1.133655071258545, + "learning_rate": 0.00012165366492002832, + "loss": 1.8637, + "step": 4255 + }, + { + "epoch": 0.4477643345607575, + "grad_norm": 2.401837110519409, + "learning_rate": 0.00012162039479729351, + "loss": 2.07, + "step": 4256 + }, + { + "epoch": 0.44786954234613363, + "grad_norm": 1.3808231353759766, + "learning_rate": 0.00012158712216385344, + "loss": 1.9546, + "step": 4257 + }, + { + "epoch": 0.4479747501315097, + "grad_norm": 1.5541157722473145, + "learning_rate": 0.00012155384702357198, + "loss": 1.6161, + "step": 4258 + }, + { + "epoch": 0.4480799579168859, + "grad_norm": 1.5308620929718018, + "learning_rate": 0.00012152056938031324, + "loss": 2.0226, + "step": 4259 + }, + { + "epoch": 0.44818516570226197, + "grad_norm": 1.0472149848937988, + "learning_rate": 0.00012148728923794162, + "loss": 2.0012, + "step": 4260 + }, + { + "epoch": 0.44829037348763806, + "grad_norm": 1.0326931476593018, + "learning_rate": 0.00012145400660032187, + "loss": 1.6771, + "step": 4261 + }, + { + "epoch": 0.4483955812730142, + "grad_norm": 1.4375073909759521, + "learning_rate": 0.00012142072147131898, + "loss": 2.0649, + "step": 4262 + }, + { + "epoch": 0.4485007890583903, + "grad_norm": 0.880775511264801, + "learning_rate": 0.00012138743385479823, + "loss": 1.7757, + "step": 4263 + }, + { + "epoch": 0.44860599684376645, + "grad_norm": 1.205504298210144, + "learning_rate": 0.00012135414375462522, + "loss": 1.7152, + "step": 4264 + }, + { + "epoch": 0.44871120462914255, + "grad_norm": 0.8612405061721802, + "learning_rate": 0.00012132085117466582, + "loss": 1.7546, + "step": 4265 + }, + { + "epoch": 0.4488164124145187, + "grad_norm": 1.592641830444336, + "learning_rate": 0.00012128755611878617, + "loss": 1.8245, + "step": 4266 + }, + { + "epoch": 0.4489216201998948, + "grad_norm": 1.2067207098007202, + "learning_rate": 0.00012125425859085273, + "loss": 1.6449, + "step": 4267 + }, + { + "epoch": 0.44902682798527094, + "grad_norm": 1.0257954597473145, + "learning_rate": 0.00012122095859473223, + "loss": 1.13, + "step": 4268 + }, + { + "epoch": 0.44913203577064703, + "grad_norm": 2.4131839275360107, + "learning_rate": 0.00012118765613429173, + "loss": 1.3638, + "step": 4269 + }, + { + "epoch": 0.4492372435560231, + "grad_norm": 1.0637006759643555, + "learning_rate": 0.00012115435121339844, + "loss": 1.5358, + "step": 4270 + }, + { + "epoch": 0.4493424513413993, + "grad_norm": 1.3625959157943726, + "learning_rate": 0.00012112104383592, + "loss": 1.4193, + "step": 4271 + }, + { + "epoch": 0.44944765912677537, + "grad_norm": 1.6006038188934326, + "learning_rate": 0.00012108773400572431, + "loss": 1.8992, + "step": 4272 + }, + { + "epoch": 0.4495528669121515, + "grad_norm": 1.9256750345230103, + "learning_rate": 0.00012105442172667951, + "loss": 2.2142, + "step": 4273 + }, + { + "epoch": 0.4496580746975276, + "grad_norm": 1.241640567779541, + "learning_rate": 0.00012102110700265403, + "loss": 1.4468, + "step": 4274 + }, + { + "epoch": 0.44976328248290376, + "grad_norm": 1.4494163990020752, + "learning_rate": 0.00012098778983751662, + "loss": 1.8657, + "step": 4275 + }, + { + "epoch": 0.44986849026827985, + "grad_norm": 0.9596696496009827, + "learning_rate": 0.0001209544702351363, + "loss": 1.9698, + "step": 4276 + }, + { + "epoch": 0.44997369805365595, + "grad_norm": 0.9092001914978027, + "learning_rate": 0.00012092114819938233, + "loss": 1.3448, + "step": 4277 + }, + { + "epoch": 0.4500789058390321, + "grad_norm": 1.8261799812316895, + "learning_rate": 0.00012088782373412432, + "loss": 2.2352, + "step": 4278 + }, + { + "epoch": 0.4501841136244082, + "grad_norm": 1.3973294496536255, + "learning_rate": 0.00012085449684323216, + "loss": 1.4725, + "step": 4279 + }, + { + "epoch": 0.45028932140978434, + "grad_norm": 1.5682103633880615, + "learning_rate": 0.00012082116753057593, + "loss": 1.8422, + "step": 4280 + }, + { + "epoch": 0.45039452919516043, + "grad_norm": 1.5455161333084106, + "learning_rate": 0.00012078783580002607, + "loss": 1.604, + "step": 4281 + }, + { + "epoch": 0.4504997369805366, + "grad_norm": 1.6469101905822754, + "learning_rate": 0.00012075450165545328, + "loss": 2.1398, + "step": 4282 + }, + { + "epoch": 0.4506049447659127, + "grad_norm": 1.5728659629821777, + "learning_rate": 0.00012072116510072858, + "loss": 1.9734, + "step": 4283 + }, + { + "epoch": 0.4507101525512888, + "grad_norm": 1.4864883422851562, + "learning_rate": 0.00012068782613972318, + "loss": 1.7709, + "step": 4284 + }, + { + "epoch": 0.4508153603366649, + "grad_norm": 0.9983920454978943, + "learning_rate": 0.00012065448477630867, + "loss": 1.5441, + "step": 4285 + }, + { + "epoch": 0.450920568122041, + "grad_norm": 1.9383924007415771, + "learning_rate": 0.00012062114101435686, + "loss": 1.9526, + "step": 4286 + }, + { + "epoch": 0.45102577590741716, + "grad_norm": 1.959188461303711, + "learning_rate": 0.00012058779485773985, + "loss": 1.9704, + "step": 4287 + }, + { + "epoch": 0.45113098369279325, + "grad_norm": 1.8748353719711304, + "learning_rate": 0.00012055444631033, + "loss": 1.5454, + "step": 4288 + }, + { + "epoch": 0.4512361914781694, + "grad_norm": 1.4259321689605713, + "learning_rate": 0.000120521095376, + "loss": 1.6818, + "step": 4289 + }, + { + "epoch": 0.4513413992635455, + "grad_norm": 1.2353706359863281, + "learning_rate": 0.00012048774205862279, + "loss": 2.2079, + "step": 4290 + }, + { + "epoch": 0.45144660704892164, + "grad_norm": 1.5690447092056274, + "learning_rate": 0.00012045438636207151, + "loss": 1.7757, + "step": 4291 + }, + { + "epoch": 0.45155181483429774, + "grad_norm": 1.6075197458267212, + "learning_rate": 0.00012042102829021973, + "loss": 2.0568, + "step": 4292 + }, + { + "epoch": 0.45165702261967383, + "grad_norm": 1.3543256521224976, + "learning_rate": 0.00012038766784694117, + "loss": 1.6417, + "step": 4293 + }, + { + "epoch": 0.45176223040505, + "grad_norm": 1.3820946216583252, + "learning_rate": 0.00012035430503610988, + "loss": 2.0945, + "step": 4294 + }, + { + "epoch": 0.4518674381904261, + "grad_norm": 1.327938437461853, + "learning_rate": 0.00012032093986160015, + "loss": 1.7868, + "step": 4295 + }, + { + "epoch": 0.4519726459758022, + "grad_norm": 1.0907479524612427, + "learning_rate": 0.0001202875723272866, + "loss": 1.7495, + "step": 4296 + }, + { + "epoch": 0.4520778537611783, + "grad_norm": 1.2229701280593872, + "learning_rate": 0.0001202542024370441, + "loss": 2.1307, + "step": 4297 + }, + { + "epoch": 0.45218306154655447, + "grad_norm": 0.9999223947525024, + "learning_rate": 0.00012022083019474774, + "loss": 2.0101, + "step": 4298 + }, + { + "epoch": 0.45228826933193056, + "grad_norm": 1.5254671573638916, + "learning_rate": 0.00012018745560427298, + "loss": 1.6449, + "step": 4299 + }, + { + "epoch": 0.4523934771173067, + "grad_norm": 1.047942042350769, + "learning_rate": 0.00012015407866949548, + "loss": 1.5853, + "step": 4300 + }, + { + "epoch": 0.4524986849026828, + "grad_norm": 1.6619633436203003, + "learning_rate": 0.0001201206993942912, + "loss": 2.1214, + "step": 4301 + }, + { + "epoch": 0.4526038926880589, + "grad_norm": 1.228927731513977, + "learning_rate": 0.00012008731778253632, + "loss": 1.7196, + "step": 4302 + }, + { + "epoch": 0.45270910047343504, + "grad_norm": 1.4575189352035522, + "learning_rate": 0.00012005393383810737, + "loss": 2.0006, + "step": 4303 + }, + { + "epoch": 0.45281430825881114, + "grad_norm": 1.3251969814300537, + "learning_rate": 0.00012002054756488115, + "loss": 1.7704, + "step": 4304 + }, + { + "epoch": 0.4529195160441873, + "grad_norm": 1.6667139530181885, + "learning_rate": 0.00011998715896673465, + "loss": 1.2439, + "step": 4305 + }, + { + "epoch": 0.4530247238295634, + "grad_norm": 1.0954786539077759, + "learning_rate": 0.0001199537680475452, + "loss": 1.827, + "step": 4306 + }, + { + "epoch": 0.45312993161493953, + "grad_norm": 1.4709742069244385, + "learning_rate": 0.00011992037481119036, + "loss": 2.3623, + "step": 4307 + }, + { + "epoch": 0.4532351394003156, + "grad_norm": 1.467758059501648, + "learning_rate": 0.00011988697926154799, + "loss": 1.6894, + "step": 4308 + }, + { + "epoch": 0.4533403471856917, + "grad_norm": 1.011841058731079, + "learning_rate": 0.00011985358140249621, + "loss": 1.4649, + "step": 4309 + }, + { + "epoch": 0.45344555497106787, + "grad_norm": 1.7795685529708862, + "learning_rate": 0.0001198201812379134, + "loss": 1.4143, + "step": 4310 + }, + { + "epoch": 0.45355076275644396, + "grad_norm": 1.545654058456421, + "learning_rate": 0.00011978677877167822, + "loss": 2.1281, + "step": 4311 + }, + { + "epoch": 0.4536559705418201, + "grad_norm": 1.2907682657241821, + "learning_rate": 0.00011975337400766958, + "loss": 1.4049, + "step": 4312 + }, + { + "epoch": 0.4537611783271962, + "grad_norm": 1.0812395811080933, + "learning_rate": 0.00011971996694976663, + "loss": 2.2, + "step": 4313 + }, + { + "epoch": 0.45386638611257235, + "grad_norm": 1.4511226415634155, + "learning_rate": 0.00011968655760184891, + "loss": 2.0119, + "step": 4314 + }, + { + "epoch": 0.45397159389794844, + "grad_norm": 1.6760962009429932, + "learning_rate": 0.00011965314596779604, + "loss": 1.6423, + "step": 4315 + }, + { + "epoch": 0.4540768016833246, + "grad_norm": 1.0780059099197388, + "learning_rate": 0.00011961973205148804, + "loss": 1.5716, + "step": 4316 + }, + { + "epoch": 0.4541820094687007, + "grad_norm": 1.6850388050079346, + "learning_rate": 0.00011958631585680518, + "loss": 1.7895, + "step": 4317 + }, + { + "epoch": 0.4542872172540768, + "grad_norm": 1.061856746673584, + "learning_rate": 0.00011955289738762796, + "loss": 1.8926, + "step": 4318 + }, + { + "epoch": 0.45439242503945293, + "grad_norm": 1.7766098976135254, + "learning_rate": 0.00011951947664783713, + "loss": 1.3322, + "step": 4319 + }, + { + "epoch": 0.454497632824829, + "grad_norm": 1.695619821548462, + "learning_rate": 0.00011948605364131375, + "loss": 1.6927, + "step": 4320 + }, + { + "epoch": 0.45460284061020517, + "grad_norm": 1.510094404220581, + "learning_rate": 0.00011945262837193915, + "loss": 1.9738, + "step": 4321 + }, + { + "epoch": 0.45470804839558127, + "grad_norm": 1.4329527616500854, + "learning_rate": 0.0001194192008435949, + "loss": 1.2781, + "step": 4322 + }, + { + "epoch": 0.4548132561809574, + "grad_norm": 1.3772903680801392, + "learning_rate": 0.00011938577106016275, + "loss": 2.2099, + "step": 4323 + }, + { + "epoch": 0.4549184639663335, + "grad_norm": 1.3040528297424316, + "learning_rate": 0.00011935233902552485, + "loss": 1.9477, + "step": 4324 + }, + { + "epoch": 0.4550236717517096, + "grad_norm": 1.0545731782913208, + "learning_rate": 0.00011931890474356358, + "loss": 1.987, + "step": 4325 + }, + { + "epoch": 0.45512887953708575, + "grad_norm": 1.2383321523666382, + "learning_rate": 0.00011928546821816149, + "loss": 1.0446, + "step": 4326 + }, + { + "epoch": 0.45523408732246184, + "grad_norm": 0.9607561230659485, + "learning_rate": 0.00011925202945320146, + "loss": 1.446, + "step": 4327 + }, + { + "epoch": 0.455339295107838, + "grad_norm": 0.9800947308540344, + "learning_rate": 0.00011921858845256669, + "loss": 1.6101, + "step": 4328 + }, + { + "epoch": 0.4554445028932141, + "grad_norm": 1.430310845375061, + "learning_rate": 0.00011918514522014051, + "loss": 1.9691, + "step": 4329 + }, + { + "epoch": 0.45554971067859024, + "grad_norm": 1.495063304901123, + "learning_rate": 0.00011915169975980658, + "loss": 1.7227, + "step": 4330 + }, + { + "epoch": 0.45565491846396633, + "grad_norm": 1.0809447765350342, + "learning_rate": 0.00011911825207544885, + "loss": 2.2602, + "step": 4331 + }, + { + "epoch": 0.4557601262493425, + "grad_norm": 1.0745564699172974, + "learning_rate": 0.00011908480217095141, + "loss": 1.8813, + "step": 4332 + }, + { + "epoch": 0.4558653340347186, + "grad_norm": 1.3847942352294922, + "learning_rate": 0.00011905135005019881, + "loss": 2.1904, + "step": 4333 + }, + { + "epoch": 0.45597054182009467, + "grad_norm": 1.2953932285308838, + "learning_rate": 0.0001190178957170756, + "loss": 1.7487, + "step": 4334 + }, + { + "epoch": 0.4560757496054708, + "grad_norm": 1.1227591037750244, + "learning_rate": 0.00011898443917546682, + "loss": 1.7218, + "step": 4335 + }, + { + "epoch": 0.4561809573908469, + "grad_norm": 2.098883867263794, + "learning_rate": 0.00011895098042925763, + "loss": 2.1559, + "step": 4336 + }, + { + "epoch": 0.45628616517622306, + "grad_norm": 1.4620628356933594, + "learning_rate": 0.00011891751948233348, + "loss": 1.692, + "step": 4337 + }, + { + "epoch": 0.45639137296159915, + "grad_norm": 2.7015371322631836, + "learning_rate": 0.00011888405633858009, + "loss": 2.0226, + "step": 4338 + }, + { + "epoch": 0.4564965807469753, + "grad_norm": 1.333720326423645, + "learning_rate": 0.00011885059100188341, + "loss": 2.1821, + "step": 4339 + }, + { + "epoch": 0.4566017885323514, + "grad_norm": 1.3061823844909668, + "learning_rate": 0.00011881712347612965, + "loss": 1.7872, + "step": 4340 + }, + { + "epoch": 0.4567069963177275, + "grad_norm": 2.075654983520508, + "learning_rate": 0.00011878365376520535, + "loss": 1.7089, + "step": 4341 + }, + { + "epoch": 0.45681220410310364, + "grad_norm": 1.908001184463501, + "learning_rate": 0.00011875018187299719, + "loss": 1.7904, + "step": 4342 + }, + { + "epoch": 0.45691741188847973, + "grad_norm": 1.1886532306671143, + "learning_rate": 0.0001187167078033921, + "loss": 1.8649, + "step": 4343 + }, + { + "epoch": 0.4570226196738559, + "grad_norm": 1.1382215023040771, + "learning_rate": 0.00011868323156027742, + "loss": 1.6879, + "step": 4344 + }, + { + "epoch": 0.457127827459232, + "grad_norm": 1.0881881713867188, + "learning_rate": 0.00011864975314754058, + "loss": 1.816, + "step": 4345 + }, + { + "epoch": 0.4572330352446081, + "grad_norm": 1.6298319101333618, + "learning_rate": 0.00011861627256906929, + "loss": 1.1575, + "step": 4346 + }, + { + "epoch": 0.4573382430299842, + "grad_norm": 1.241731882095337, + "learning_rate": 0.00011858278982875157, + "loss": 1.7019, + "step": 4347 + }, + { + "epoch": 0.45744345081536036, + "grad_norm": 0.9705410599708557, + "learning_rate": 0.00011854930493047566, + "loss": 1.9562, + "step": 4348 + }, + { + "epoch": 0.45754865860073646, + "grad_norm": 1.3326573371887207, + "learning_rate": 0.00011851581787813006, + "loss": 2.3098, + "step": 4349 + }, + { + "epoch": 0.45765386638611255, + "grad_norm": 1.1833465099334717, + "learning_rate": 0.00011848232867560352, + "loss": 2.0325, + "step": 4350 + }, + { + "epoch": 0.4577590741714887, + "grad_norm": 1.501226544380188, + "learning_rate": 0.00011844883732678495, + "loss": 1.9941, + "step": 4351 + }, + { + "epoch": 0.4578642819568648, + "grad_norm": 1.8644059896469116, + "learning_rate": 0.00011841534383556372, + "loss": 1.6826, + "step": 4352 + }, + { + "epoch": 0.45796948974224094, + "grad_norm": 1.2248753309249878, + "learning_rate": 0.00011838184820582923, + "loss": 1.382, + "step": 4353 + }, + { + "epoch": 0.45807469752761704, + "grad_norm": 2.008018970489502, + "learning_rate": 0.00011834835044147121, + "loss": 1.5946, + "step": 4354 + }, + { + "epoch": 0.4581799053129932, + "grad_norm": 1.5521501302719116, + "learning_rate": 0.00011831485054637973, + "loss": 1.5836, + "step": 4355 + }, + { + "epoch": 0.4582851130983693, + "grad_norm": 1.707040548324585, + "learning_rate": 0.00011828134852444493, + "loss": 1.7551, + "step": 4356 + }, + { + "epoch": 0.4583903208837454, + "grad_norm": 1.7729437351226807, + "learning_rate": 0.00011824784437955732, + "loss": 2.3166, + "step": 4357 + }, + { + "epoch": 0.4584955286691215, + "grad_norm": 1.4290671348571777, + "learning_rate": 0.0001182143381156076, + "loss": 2.0124, + "step": 4358 + }, + { + "epoch": 0.4586007364544976, + "grad_norm": 2.462273359298706, + "learning_rate": 0.00011818082973648683, + "loss": 2.1217, + "step": 4359 + }, + { + "epoch": 0.45870594423987376, + "grad_norm": 1.7780591249465942, + "learning_rate": 0.00011814731924608616, + "loss": 1.4172, + "step": 4360 + }, + { + "epoch": 0.45881115202524986, + "grad_norm": 1.9604672193527222, + "learning_rate": 0.00011811380664829703, + "loss": 1.3744, + "step": 4361 + }, + { + "epoch": 0.458916359810626, + "grad_norm": 1.0941351652145386, + "learning_rate": 0.00011808029194701122, + "loss": 1.9376, + "step": 4362 + }, + { + "epoch": 0.4590215675960021, + "grad_norm": 1.587456226348877, + "learning_rate": 0.00011804677514612062, + "loss": 1.9933, + "step": 4363 + }, + { + "epoch": 0.45912677538137825, + "grad_norm": 1.38690984249115, + "learning_rate": 0.00011801325624951745, + "loss": 2.1653, + "step": 4364 + }, + { + "epoch": 0.45923198316675434, + "grad_norm": 1.6729885339736938, + "learning_rate": 0.00011797973526109416, + "loss": 1.7463, + "step": 4365 + }, + { + "epoch": 0.45933719095213044, + "grad_norm": 1.2510261535644531, + "learning_rate": 0.00011794621218474345, + "loss": 1.6988, + "step": 4366 + }, + { + "epoch": 0.4594423987375066, + "grad_norm": 1.0695215463638306, + "learning_rate": 0.00011791268702435816, + "loss": 2.1912, + "step": 4367 + }, + { + "epoch": 0.4595476065228827, + "grad_norm": 1.849915862083435, + "learning_rate": 0.00011787915978383151, + "loss": 1.7784, + "step": 4368 + }, + { + "epoch": 0.45965281430825883, + "grad_norm": 1.144481897354126, + "learning_rate": 0.00011784563046705695, + "loss": 1.928, + "step": 4369 + }, + { + "epoch": 0.4597580220936349, + "grad_norm": 1.9032231569290161, + "learning_rate": 0.00011781209907792805, + "loss": 1.3965, + "step": 4370 + }, + { + "epoch": 0.45986322987901107, + "grad_norm": 1.352990746498108, + "learning_rate": 0.00011777856562033876, + "loss": 1.987, + "step": 4371 + }, + { + "epoch": 0.45996843766438716, + "grad_norm": 1.394627571105957, + "learning_rate": 0.00011774503009818316, + "loss": 2.3461, + "step": 4372 + }, + { + "epoch": 0.46007364544976326, + "grad_norm": 0.8705309629440308, + "learning_rate": 0.00011771149251535569, + "loss": 2.7158, + "step": 4373 + }, + { + "epoch": 0.4601788532351394, + "grad_norm": 1.0310938358306885, + "learning_rate": 0.0001176779528757509, + "loss": 1.9135, + "step": 4374 + }, + { + "epoch": 0.4602840610205155, + "grad_norm": 1.4547808170318604, + "learning_rate": 0.00011764441118326364, + "loss": 1.8487, + "step": 4375 + }, + { + "epoch": 0.46038926880589165, + "grad_norm": 2.0553081035614014, + "learning_rate": 0.00011761086744178902, + "loss": 1.698, + "step": 4376 + }, + { + "epoch": 0.46049447659126774, + "grad_norm": 1.7115883827209473, + "learning_rate": 0.00011757732165522237, + "loss": 2.4097, + "step": 4377 + }, + { + "epoch": 0.4605996843766439, + "grad_norm": 1.6055662631988525, + "learning_rate": 0.00011754377382745922, + "loss": 1.6749, + "step": 4378 + }, + { + "epoch": 0.46070489216202, + "grad_norm": 1.6005985736846924, + "learning_rate": 0.00011751022396239539, + "loss": 2.0828, + "step": 4379 + }, + { + "epoch": 0.46081009994739613, + "grad_norm": 1.4807254076004028, + "learning_rate": 0.00011747667206392691, + "loss": 1.7367, + "step": 4380 + }, + { + "epoch": 0.46091530773277223, + "grad_norm": 1.8019040822982788, + "learning_rate": 0.00011744311813595006, + "loss": 2.1629, + "step": 4381 + }, + { + "epoch": 0.4610205155181483, + "grad_norm": 1.1676995754241943, + "learning_rate": 0.00011740956218236132, + "loss": 1.6825, + "step": 4382 + }, + { + "epoch": 0.46112572330352447, + "grad_norm": 1.9756308794021606, + "learning_rate": 0.00011737600420705748, + "loss": 2.0505, + "step": 4383 + }, + { + "epoch": 0.46123093108890056, + "grad_norm": 1.3423806428909302, + "learning_rate": 0.00011734244421393548, + "loss": 2.0603, + "step": 4384 + }, + { + "epoch": 0.4613361388742767, + "grad_norm": 1.3737515211105347, + "learning_rate": 0.00011730888220689251, + "loss": 1.5806, + "step": 4385 + }, + { + "epoch": 0.4614413466596528, + "grad_norm": 1.299750804901123, + "learning_rate": 0.0001172753181898261, + "loss": 1.4868, + "step": 4386 + }, + { + "epoch": 0.46154655444502896, + "grad_norm": 1.3489303588867188, + "learning_rate": 0.00011724175216663384, + "loss": 1.6957, + "step": 4387 + }, + { + "epoch": 0.46165176223040505, + "grad_norm": 1.3841158151626587, + "learning_rate": 0.00011720818414121368, + "loss": 1.5918, + "step": 4388 + }, + { + "epoch": 0.46175697001578114, + "grad_norm": 1.4252556562423706, + "learning_rate": 0.00011717461411746378, + "loss": 1.7677, + "step": 4389 + }, + { + "epoch": 0.4618621778011573, + "grad_norm": 1.7873703241348267, + "learning_rate": 0.0001171410420992825, + "loss": 1.6604, + "step": 4390 + }, + { + "epoch": 0.4619673855865334, + "grad_norm": 1.825914978981018, + "learning_rate": 0.00011710746809056841, + "loss": 1.7022, + "step": 4391 + }, + { + "epoch": 0.46207259337190953, + "grad_norm": 1.4188059568405151, + "learning_rate": 0.00011707389209522039, + "loss": 2.1184, + "step": 4392 + }, + { + "epoch": 0.46217780115728563, + "grad_norm": 1.4496338367462158, + "learning_rate": 0.0001170403141171375, + "loss": 2.0205, + "step": 4393 + }, + { + "epoch": 0.4622830089426618, + "grad_norm": 2.31512713432312, + "learning_rate": 0.00011700673416021908, + "loss": 1.822, + "step": 4394 + }, + { + "epoch": 0.46238821672803787, + "grad_norm": 1.1979174613952637, + "learning_rate": 0.00011697315222836458, + "loss": 2.3128, + "step": 4395 + }, + { + "epoch": 0.462493424513414, + "grad_norm": 1.3453646898269653, + "learning_rate": 0.00011693956832547384, + "loss": 2.0374, + "step": 4396 + }, + { + "epoch": 0.4625986322987901, + "grad_norm": 1.1278789043426514, + "learning_rate": 0.0001169059824554468, + "loss": 1.5538, + "step": 4397 + }, + { + "epoch": 0.4627038400841662, + "grad_norm": 1.6684142351150513, + "learning_rate": 0.00011687239462218369, + "loss": 2.0152, + "step": 4398 + }, + { + "epoch": 0.46280904786954236, + "grad_norm": 1.2076631784439087, + "learning_rate": 0.00011683880482958493, + "loss": 1.9577, + "step": 4399 + }, + { + "epoch": 0.46291425565491845, + "grad_norm": 1.3028810024261475, + "learning_rate": 0.00011680521308155124, + "loss": 1.413, + "step": 4400 + }, + { + "epoch": 0.4630194634402946, + "grad_norm": 1.2423094511032104, + "learning_rate": 0.00011677161938198348, + "loss": 1.7865, + "step": 4401 + }, + { + "epoch": 0.4631246712256707, + "grad_norm": 1.2606873512268066, + "learning_rate": 0.0001167380237347828, + "loss": 1.9457, + "step": 4402 + }, + { + "epoch": 0.46322987901104684, + "grad_norm": 1.7899268865585327, + "learning_rate": 0.00011670442614385053, + "loss": 2.2358, + "step": 4403 + }, + { + "epoch": 0.46333508679642293, + "grad_norm": 1.2076514959335327, + "learning_rate": 0.00011667082661308826, + "loss": 1.7803, + "step": 4404 + }, + { + "epoch": 0.46344029458179903, + "grad_norm": 1.6006786823272705, + "learning_rate": 0.00011663722514639778, + "loss": 1.696, + "step": 4405 + }, + { + "epoch": 0.4635455023671752, + "grad_norm": 1.0841476917266846, + "learning_rate": 0.00011660362174768114, + "loss": 1.8837, + "step": 4406 + }, + { + "epoch": 0.46365071015255127, + "grad_norm": 0.9759506583213806, + "learning_rate": 0.0001165700164208406, + "loss": 1.4346, + "step": 4407 + }, + { + "epoch": 0.4637559179379274, + "grad_norm": 1.5411173105239868, + "learning_rate": 0.00011653640916977861, + "loss": 1.925, + "step": 4408 + }, + { + "epoch": 0.4638611257233035, + "grad_norm": 1.1252624988555908, + "learning_rate": 0.00011650279999839787, + "loss": 1.8466, + "step": 4409 + }, + { + "epoch": 0.46396633350867966, + "grad_norm": 1.3989907503128052, + "learning_rate": 0.00011646918891060127, + "loss": 1.8891, + "step": 4410 + }, + { + "epoch": 0.46407154129405576, + "grad_norm": 1.7185852527618408, + "learning_rate": 0.00011643557591029206, + "loss": 1.7989, + "step": 4411 + }, + { + "epoch": 0.4641767490794319, + "grad_norm": 1.037192702293396, + "learning_rate": 0.0001164019610013735, + "loss": 1.3982, + "step": 4412 + }, + { + "epoch": 0.464281956864808, + "grad_norm": 1.2058227062225342, + "learning_rate": 0.00011636834418774922, + "loss": 1.605, + "step": 4413 + }, + { + "epoch": 0.4643871646501841, + "grad_norm": 1.8014076948165894, + "learning_rate": 0.00011633472547332305, + "loss": 1.3505, + "step": 4414 + }, + { + "epoch": 0.46449237243556024, + "grad_norm": 1.513741135597229, + "learning_rate": 0.00011630110486199899, + "loss": 1.6722, + "step": 4415 + }, + { + "epoch": 0.46459758022093633, + "grad_norm": 1.3467214107513428, + "learning_rate": 0.00011626748235768128, + "loss": 1.9053, + "step": 4416 + }, + { + "epoch": 0.4647027880063125, + "grad_norm": 1.5441854000091553, + "learning_rate": 0.00011623385796427442, + "loss": 1.7715, + "step": 4417 + }, + { + "epoch": 0.4648079957916886, + "grad_norm": 1.012117624282837, + "learning_rate": 0.00011620023168568311, + "loss": 1.9828, + "step": 4418 + }, + { + "epoch": 0.4649132035770647, + "grad_norm": 1.2613452672958374, + "learning_rate": 0.00011616660352581225, + "loss": 1.996, + "step": 4419 + }, + { + "epoch": 0.4650184113624408, + "grad_norm": 2.198415756225586, + "learning_rate": 0.00011613297348856693, + "loss": 2.229, + "step": 4420 + }, + { + "epoch": 0.4651236191478169, + "grad_norm": 1.369099736213684, + "learning_rate": 0.00011609934157785251, + "loss": 1.7323, + "step": 4421 + }, + { + "epoch": 0.46522882693319306, + "grad_norm": 1.4710428714752197, + "learning_rate": 0.00011606570779757461, + "loss": 1.7945, + "step": 4422 + }, + { + "epoch": 0.46533403471856916, + "grad_norm": 2.027179479598999, + "learning_rate": 0.00011603207215163894, + "loss": 1.0333, + "step": 4423 + }, + { + "epoch": 0.4654392425039453, + "grad_norm": 1.1320077180862427, + "learning_rate": 0.00011599843464395151, + "loss": 1.8657, + "step": 4424 + }, + { + "epoch": 0.4655444502893214, + "grad_norm": 1.3532216548919678, + "learning_rate": 0.00011596479527841859, + "loss": 2.1035, + "step": 4425 + }, + { + "epoch": 0.46564965807469755, + "grad_norm": 1.3773186206817627, + "learning_rate": 0.00011593115405894652, + "loss": 1.9023, + "step": 4426 + }, + { + "epoch": 0.46575486586007364, + "grad_norm": 1.1166491508483887, + "learning_rate": 0.00011589751098944202, + "loss": 1.5557, + "step": 4427 + }, + { + "epoch": 0.4658600736454498, + "grad_norm": 1.6609245538711548, + "learning_rate": 0.0001158638660738119, + "loss": 2.003, + "step": 4428 + }, + { + "epoch": 0.4659652814308259, + "grad_norm": 1.4807888269424438, + "learning_rate": 0.00011583021931596325, + "loss": 2.0677, + "step": 4429 + }, + { + "epoch": 0.466070489216202, + "grad_norm": 1.3660595417022705, + "learning_rate": 0.0001157965707198034, + "loss": 1.838, + "step": 4430 + }, + { + "epoch": 0.4661756970015781, + "grad_norm": 1.6003875732421875, + "learning_rate": 0.00011576292028923976, + "loss": 1.789, + "step": 4431 + }, + { + "epoch": 0.4662809047869542, + "grad_norm": 1.518566370010376, + "learning_rate": 0.00011572926802818011, + "loss": 2.4484, + "step": 4432 + }, + { + "epoch": 0.46638611257233037, + "grad_norm": 2.0211031436920166, + "learning_rate": 0.00011569561394053236, + "loss": 1.7331, + "step": 4433 + }, + { + "epoch": 0.46649132035770646, + "grad_norm": 1.0467982292175293, + "learning_rate": 0.00011566195803020464, + "loss": 1.5526, + "step": 4434 + }, + { + "epoch": 0.4665965281430826, + "grad_norm": 1.9077860116958618, + "learning_rate": 0.00011562830030110532, + "loss": 2.1115, + "step": 4435 + }, + { + "epoch": 0.4667017359284587, + "grad_norm": 1.1763733625411987, + "learning_rate": 0.00011559464075714292, + "loss": 1.5898, + "step": 4436 + }, + { + "epoch": 0.4668069437138348, + "grad_norm": 1.084130048751831, + "learning_rate": 0.00011556097940222628, + "loss": 2.0153, + "step": 4437 + }, + { + "epoch": 0.46691215149921095, + "grad_norm": 0.8493435978889465, + "learning_rate": 0.00011552731624026432, + "loss": 1.9469, + "step": 4438 + }, + { + "epoch": 0.46701735928458704, + "grad_norm": 2.182161331176758, + "learning_rate": 0.00011549365127516627, + "loss": 2.1621, + "step": 4439 + }, + { + "epoch": 0.4671225670699632, + "grad_norm": 1.1532615423202515, + "learning_rate": 0.0001154599845108415, + "loss": 2.1096, + "step": 4440 + }, + { + "epoch": 0.4672277748553393, + "grad_norm": 0.9221954345703125, + "learning_rate": 0.00011542631595119965, + "loss": 1.8867, + "step": 4441 + }, + { + "epoch": 0.46733298264071543, + "grad_norm": 1.4042004346847534, + "learning_rate": 0.00011539264560015052, + "loss": 1.7898, + "step": 4442 + }, + { + "epoch": 0.4674381904260915, + "grad_norm": 1.2298986911773682, + "learning_rate": 0.00011535897346160416, + "loss": 1.8017, + "step": 4443 + }, + { + "epoch": 0.4675433982114677, + "grad_norm": 0.9590116739273071, + "learning_rate": 0.00011532529953947075, + "loss": 1.823, + "step": 4444 + }, + { + "epoch": 0.46764860599684377, + "grad_norm": 1.2512177228927612, + "learning_rate": 0.00011529162383766079, + "loss": 1.6104, + "step": 4445 + }, + { + "epoch": 0.46775381378221986, + "grad_norm": 1.2496157884597778, + "learning_rate": 0.00011525794636008491, + "loss": 1.4588, + "step": 4446 + }, + { + "epoch": 0.467859021567596, + "grad_norm": 1.6144243478775024, + "learning_rate": 0.00011522426711065397, + "loss": 1.9968, + "step": 4447 + }, + { + "epoch": 0.4679642293529721, + "grad_norm": 1.1202630996704102, + "learning_rate": 0.000115190586093279, + "loss": 2.2607, + "step": 4448 + }, + { + "epoch": 0.46806943713834825, + "grad_norm": 1.414974570274353, + "learning_rate": 0.00011515690331187133, + "loss": 1.7647, + "step": 4449 + }, + { + "epoch": 0.46817464492372435, + "grad_norm": 1.3937184810638428, + "learning_rate": 0.00011512321877034234, + "loss": 1.5259, + "step": 4450 + }, + { + "epoch": 0.4682798527091005, + "grad_norm": 1.7348077297210693, + "learning_rate": 0.00011508953247260379, + "loss": 1.8668, + "step": 4451 + }, + { + "epoch": 0.4683850604944766, + "grad_norm": 1.4032397270202637, + "learning_rate": 0.00011505584442256752, + "loss": 1.1793, + "step": 4452 + }, + { + "epoch": 0.4684902682798527, + "grad_norm": 1.652096152305603, + "learning_rate": 0.00011502215462414561, + "loss": 1.5654, + "step": 4453 + }, + { + "epoch": 0.46859547606522883, + "grad_norm": 1.547971487045288, + "learning_rate": 0.00011498846308125033, + "loss": 1.9391, + "step": 4454 + }, + { + "epoch": 0.4687006838506049, + "grad_norm": 1.1444284915924072, + "learning_rate": 0.00011495476979779418, + "loss": 1.7784, + "step": 4455 + }, + { + "epoch": 0.4688058916359811, + "grad_norm": 1.4032714366912842, + "learning_rate": 0.00011492107477768992, + "loss": 1.7778, + "step": 4456 + }, + { + "epoch": 0.46891109942135717, + "grad_norm": 1.3546231985092163, + "learning_rate": 0.00011488737802485033, + "loss": 1.4267, + "step": 4457 + }, + { + "epoch": 0.4690163072067333, + "grad_norm": 1.4766918420791626, + "learning_rate": 0.00011485367954318856, + "loss": 1.6944, + "step": 4458 + }, + { + "epoch": 0.4691215149921094, + "grad_norm": 1.9072582721710205, + "learning_rate": 0.0001148199793366179, + "loss": 2.0898, + "step": 4459 + }, + { + "epoch": 0.46922672277748556, + "grad_norm": 1.6005427837371826, + "learning_rate": 0.00011478627740905183, + "loss": 1.5538, + "step": 4460 + }, + { + "epoch": 0.46933193056286165, + "grad_norm": 1.1679853200912476, + "learning_rate": 0.00011475257376440405, + "loss": 1.6107, + "step": 4461 + }, + { + "epoch": 0.46943713834823775, + "grad_norm": 1.3431743383407593, + "learning_rate": 0.0001147188684065885, + "loss": 1.99, + "step": 4462 + }, + { + "epoch": 0.4695423461336139, + "grad_norm": 1.37478768825531, + "learning_rate": 0.00011468516133951921, + "loss": 1.9783, + "step": 4463 + }, + { + "epoch": 0.46964755391899, + "grad_norm": 1.1095014810562134, + "learning_rate": 0.00011465145256711048, + "loss": 1.9507, + "step": 4464 + }, + { + "epoch": 0.46975276170436614, + "grad_norm": 1.8217538595199585, + "learning_rate": 0.0001146177420932768, + "loss": 2.4746, + "step": 4465 + }, + { + "epoch": 0.46985796948974223, + "grad_norm": 1.3353768587112427, + "learning_rate": 0.00011458402992193289, + "loss": 1.7805, + "step": 4466 + }, + { + "epoch": 0.4699631772751184, + "grad_norm": 1.624212384223938, + "learning_rate": 0.0001145503160569936, + "loss": 1.928, + "step": 4467 + }, + { + "epoch": 0.4700683850604945, + "grad_norm": 1.4001796245574951, + "learning_rate": 0.00011451660050237401, + "loss": 1.7054, + "step": 4468 + }, + { + "epoch": 0.47017359284587057, + "grad_norm": 1.3089070320129395, + "learning_rate": 0.00011448288326198939, + "loss": 2.2208, + "step": 4469 + }, + { + "epoch": 0.4702788006312467, + "grad_norm": 1.7245391607284546, + "learning_rate": 0.00011444916433975528, + "loss": 1.6884, + "step": 4470 + }, + { + "epoch": 0.4703840084166228, + "grad_norm": 1.450230360031128, + "learning_rate": 0.00011441544373958725, + "loss": 1.5062, + "step": 4471 + }, + { + "epoch": 0.47048921620199896, + "grad_norm": 1.8866242170333862, + "learning_rate": 0.00011438172146540123, + "loss": 1.6767, + "step": 4472 + }, + { + "epoch": 0.47059442398737505, + "grad_norm": 1.5962532758712769, + "learning_rate": 0.00011434799752111324, + "loss": 1.8297, + "step": 4473 + }, + { + "epoch": 0.4706996317727512, + "grad_norm": 1.6113159656524658, + "learning_rate": 0.00011431427191063957, + "loss": 1.84, + "step": 4474 + }, + { + "epoch": 0.4708048395581273, + "grad_norm": 1.374487042427063, + "learning_rate": 0.00011428054463789661, + "loss": 1.865, + "step": 4475 + }, + { + "epoch": 0.47091004734350345, + "grad_norm": 2.844097137451172, + "learning_rate": 0.00011424681570680105, + "loss": 1.3199, + "step": 4476 + }, + { + "epoch": 0.47101525512887954, + "grad_norm": 1.1103769540786743, + "learning_rate": 0.00011421308512126969, + "loss": 2.3692, + "step": 4477 + }, + { + "epoch": 0.47112046291425563, + "grad_norm": 1.4509694576263428, + "learning_rate": 0.00011417935288521955, + "loss": 2.0409, + "step": 4478 + }, + { + "epoch": 0.4712256706996318, + "grad_norm": 1.3323218822479248, + "learning_rate": 0.00011414561900256784, + "loss": 2.2169, + "step": 4479 + }, + { + "epoch": 0.4713308784850079, + "grad_norm": 1.0157517194747925, + "learning_rate": 0.00011411188347723198, + "loss": 2.1209, + "step": 4480 + }, + { + "epoch": 0.471436086270384, + "grad_norm": 1.2026599645614624, + "learning_rate": 0.00011407814631312957, + "loss": 1.934, + "step": 4481 + }, + { + "epoch": 0.4715412940557601, + "grad_norm": 1.1034396886825562, + "learning_rate": 0.00011404440751417838, + "loss": 1.7275, + "step": 4482 + }, + { + "epoch": 0.47164650184113627, + "grad_norm": 0.9263768196105957, + "learning_rate": 0.00011401066708429641, + "loss": 2.0278, + "step": 4483 + }, + { + "epoch": 0.47175170962651236, + "grad_norm": 1.4000307321548462, + "learning_rate": 0.0001139769250274018, + "loss": 2.0002, + "step": 4484 + }, + { + "epoch": 0.47185691741188845, + "grad_norm": 1.6272485256195068, + "learning_rate": 0.0001139431813474129, + "loss": 1.8365, + "step": 4485 + }, + { + "epoch": 0.4719621251972646, + "grad_norm": 1.3752802610397339, + "learning_rate": 0.00011390943604824826, + "loss": 2.1863, + "step": 4486 + }, + { + "epoch": 0.4720673329826407, + "grad_norm": 1.1118308305740356, + "learning_rate": 0.00011387568913382664, + "loss": 2.1644, + "step": 4487 + }, + { + "epoch": 0.47217254076801685, + "grad_norm": 1.0211619138717651, + "learning_rate": 0.00011384194060806692, + "loss": 1.9979, + "step": 4488 + }, + { + "epoch": 0.47227774855339294, + "grad_norm": 1.477390170097351, + "learning_rate": 0.0001138081904748882, + "loss": 1.9993, + "step": 4489 + }, + { + "epoch": 0.4723829563387691, + "grad_norm": 0.9481652975082397, + "learning_rate": 0.00011377443873820981, + "loss": 1.886, + "step": 4490 + }, + { + "epoch": 0.4724881641241452, + "grad_norm": 1.2719368934631348, + "learning_rate": 0.00011374068540195122, + "loss": 1.6373, + "step": 4491 + }, + { + "epoch": 0.47259337190952133, + "grad_norm": 1.15910005569458, + "learning_rate": 0.00011370693047003205, + "loss": 2.0742, + "step": 4492 + }, + { + "epoch": 0.4726985796948974, + "grad_norm": 1.1939606666564941, + "learning_rate": 0.00011367317394637218, + "loss": 1.4249, + "step": 4493 + }, + { + "epoch": 0.4728037874802735, + "grad_norm": 1.500286340713501, + "learning_rate": 0.00011363941583489171, + "loss": 2.1489, + "step": 4494 + }, + { + "epoch": 0.47290899526564967, + "grad_norm": 1.2122026681900024, + "learning_rate": 0.00011360565613951073, + "loss": 2.0753, + "step": 4495 + }, + { + "epoch": 0.47301420305102576, + "grad_norm": 1.3212809562683105, + "learning_rate": 0.0001135718948641497, + "loss": 1.4172, + "step": 4496 + }, + { + "epoch": 0.4731194108364019, + "grad_norm": 1.0483636856079102, + "learning_rate": 0.00011353813201272921, + "loss": 1.5167, + "step": 4497 + }, + { + "epoch": 0.473224618621778, + "grad_norm": 0.8720149993896484, + "learning_rate": 0.00011350436758917007, + "loss": 1.6892, + "step": 4498 + }, + { + "epoch": 0.47332982640715415, + "grad_norm": 1.857450008392334, + "learning_rate": 0.00011347060159739315, + "loss": 2.0008, + "step": 4499 + }, + { + "epoch": 0.47343503419253025, + "grad_norm": 1.0890053510665894, + "learning_rate": 0.00011343683404131964, + "loss": 1.5298, + "step": 4500 + }, + { + "epoch": 0.47354024197790634, + "grad_norm": 1.181207299232483, + "learning_rate": 0.00011340306492487084, + "loss": 1.6269, + "step": 4501 + }, + { + "epoch": 0.4736454497632825, + "grad_norm": 1.423409104347229, + "learning_rate": 0.00011336929425196826, + "loss": 2.2304, + "step": 4502 + }, + { + "epoch": 0.4737506575486586, + "grad_norm": 1.1393976211547852, + "learning_rate": 0.00011333552202653353, + "loss": 1.6851, + "step": 4503 + }, + { + "epoch": 0.47385586533403473, + "grad_norm": 1.761922001838684, + "learning_rate": 0.00011330174825248857, + "loss": 1.5886, + "step": 4504 + }, + { + "epoch": 0.4739610731194108, + "grad_norm": 2.4872066974639893, + "learning_rate": 0.0001132679729337554, + "loss": 1.8363, + "step": 4505 + }, + { + "epoch": 0.474066280904787, + "grad_norm": 2.0420877933502197, + "learning_rate": 0.00011323419607425618, + "loss": 1.7054, + "step": 4506 + }, + { + "epoch": 0.47417148869016307, + "grad_norm": 1.310332179069519, + "learning_rate": 0.00011320041767791336, + "loss": 2.296, + "step": 4507 + }, + { + "epoch": 0.4742766964755392, + "grad_norm": 1.570314884185791, + "learning_rate": 0.00011316663774864951, + "loss": 1.3577, + "step": 4508 + }, + { + "epoch": 0.4743819042609153, + "grad_norm": 1.4115073680877686, + "learning_rate": 0.00011313285629038737, + "loss": 1.744, + "step": 4509 + }, + { + "epoch": 0.4744871120462914, + "grad_norm": 1.366669774055481, + "learning_rate": 0.00011309907330704988, + "loss": 1.6847, + "step": 4510 + }, + { + "epoch": 0.47459231983166755, + "grad_norm": 1.2731913328170776, + "learning_rate": 0.00011306528880256016, + "loss": 2.1791, + "step": 4511 + }, + { + "epoch": 0.47469752761704365, + "grad_norm": 0.9359650015830994, + "learning_rate": 0.00011303150278084145, + "loss": 1.6904, + "step": 4512 + }, + { + "epoch": 0.4748027354024198, + "grad_norm": 1.9053103923797607, + "learning_rate": 0.00011299771524581722, + "loss": 1.1146, + "step": 4513 + }, + { + "epoch": 0.4749079431877959, + "grad_norm": 1.60822331905365, + "learning_rate": 0.00011296392620141114, + "loss": 1.47, + "step": 4514 + }, + { + "epoch": 0.47501315097317204, + "grad_norm": 1.3728163242340088, + "learning_rate": 0.00011293013565154702, + "loss": 2.0799, + "step": 4515 + }, + { + "epoch": 0.47511835875854813, + "grad_norm": 1.5891927480697632, + "learning_rate": 0.0001128963436001488, + "loss": 2.0376, + "step": 4516 + }, + { + "epoch": 0.4752235665439242, + "grad_norm": 1.4155621528625488, + "learning_rate": 0.00011286255005114065, + "loss": 1.6403, + "step": 4517 + }, + { + "epoch": 0.4753287743293004, + "grad_norm": 1.1898889541625977, + "learning_rate": 0.00011282875500844694, + "loss": 2.0779, + "step": 4518 + }, + { + "epoch": 0.47543398211467647, + "grad_norm": 1.205228328704834, + "learning_rate": 0.00011279495847599216, + "loss": 2.0405, + "step": 4519 + }, + { + "epoch": 0.4755391899000526, + "grad_norm": 1.6402561664581299, + "learning_rate": 0.00011276116045770096, + "loss": 1.8382, + "step": 4520 + }, + { + "epoch": 0.4756443976854287, + "grad_norm": 1.9118852615356445, + "learning_rate": 0.00011272736095749823, + "loss": 2.0503, + "step": 4521 + }, + { + "epoch": 0.47574960547080486, + "grad_norm": 1.6586589813232422, + "learning_rate": 0.00011269355997930899, + "loss": 1.7323, + "step": 4522 + }, + { + "epoch": 0.47585481325618095, + "grad_norm": 1.5802953243255615, + "learning_rate": 0.00011265975752705842, + "loss": 2.3248, + "step": 4523 + }, + { + "epoch": 0.4759600210415571, + "grad_norm": 1.256369709968567, + "learning_rate": 0.0001126259536046719, + "loss": 1.5487, + "step": 4524 + }, + { + "epoch": 0.4760652288269332, + "grad_norm": 1.2171026468276978, + "learning_rate": 0.00011259214821607496, + "loss": 1.3623, + "step": 4525 + }, + { + "epoch": 0.4761704366123093, + "grad_norm": 1.427194595336914, + "learning_rate": 0.00011255834136519334, + "loss": 1.1506, + "step": 4526 + }, + { + "epoch": 0.47627564439768544, + "grad_norm": 1.2046290636062622, + "learning_rate": 0.00011252453305595285, + "loss": 1.3518, + "step": 4527 + }, + { + "epoch": 0.47638085218306153, + "grad_norm": 1.498401403427124, + "learning_rate": 0.00011249072329227959, + "loss": 2.0285, + "step": 4528 + }, + { + "epoch": 0.4764860599684377, + "grad_norm": 1.4994467496871948, + "learning_rate": 0.00011245691207809978, + "loss": 1.9723, + "step": 4529 + }, + { + "epoch": 0.4765912677538138, + "grad_norm": 0.8767755627632141, + "learning_rate": 0.00011242309941733978, + "loss": 1.7982, + "step": 4530 + }, + { + "epoch": 0.4766964755391899, + "grad_norm": 1.7396360635757446, + "learning_rate": 0.00011238928531392614, + "loss": 2.3044, + "step": 4531 + }, + { + "epoch": 0.476801683324566, + "grad_norm": 1.3658801317214966, + "learning_rate": 0.00011235546977178562, + "loss": 1.4995, + "step": 4532 + }, + { + "epoch": 0.4769068911099421, + "grad_norm": 1.1424037218093872, + "learning_rate": 0.00011232165279484506, + "loss": 1.6863, + "step": 4533 + }, + { + "epoch": 0.47701209889531826, + "grad_norm": 1.8249849081039429, + "learning_rate": 0.00011228783438703154, + "loss": 1.6042, + "step": 4534 + }, + { + "epoch": 0.47711730668069435, + "grad_norm": 1.236474871635437, + "learning_rate": 0.0001122540145522723, + "loss": 2.4466, + "step": 4535 + }, + { + "epoch": 0.4772225144660705, + "grad_norm": 1.7508653402328491, + "learning_rate": 0.00011222019329449467, + "loss": 1.9367, + "step": 4536 + }, + { + "epoch": 0.4773277222514466, + "grad_norm": 1.29315185546875, + "learning_rate": 0.00011218637061762624, + "loss": 1.9788, + "step": 4537 + }, + { + "epoch": 0.47743293003682274, + "grad_norm": 1.5131714344024658, + "learning_rate": 0.00011215254652559472, + "loss": 2.1485, + "step": 4538 + }, + { + "epoch": 0.47753813782219884, + "grad_norm": 1.8211942911148071, + "learning_rate": 0.00011211872102232801, + "loss": 1.3127, + "step": 4539 + }, + { + "epoch": 0.477643345607575, + "grad_norm": 1.5046416521072388, + "learning_rate": 0.0001120848941117541, + "loss": 1.6588, + "step": 4540 + }, + { + "epoch": 0.4777485533929511, + "grad_norm": 2.2012786865234375, + "learning_rate": 0.00011205106579780125, + "loss": 2.1727, + "step": 4541 + }, + { + "epoch": 0.4778537611783272, + "grad_norm": 2.173950433731079, + "learning_rate": 0.00011201723608439778, + "loss": 2.1504, + "step": 4542 + }, + { + "epoch": 0.4779589689637033, + "grad_norm": 1.459008812904358, + "learning_rate": 0.00011198340497547231, + "loss": 2.1339, + "step": 4543 + }, + { + "epoch": 0.4780641767490794, + "grad_norm": 1.1652424335479736, + "learning_rate": 0.00011194957247495344, + "loss": 1.6663, + "step": 4544 + }, + { + "epoch": 0.47816938453445557, + "grad_norm": 1.2521672248840332, + "learning_rate": 0.00011191573858677007, + "loss": 2.2594, + "step": 4545 + }, + { + "epoch": 0.47827459231983166, + "grad_norm": 1.4206370115280151, + "learning_rate": 0.00011188190331485125, + "loss": 2.0058, + "step": 4546 + }, + { + "epoch": 0.4783798001052078, + "grad_norm": 1.4350491762161255, + "learning_rate": 0.00011184806666312609, + "loss": 1.8621, + "step": 4547 + }, + { + "epoch": 0.4784850078905839, + "grad_norm": 0.9594536423683167, + "learning_rate": 0.00011181422863552398, + "loss": 2.0049, + "step": 4548 + }, + { + "epoch": 0.47859021567596, + "grad_norm": 1.3745486736297607, + "learning_rate": 0.0001117803892359744, + "loss": 2.1858, + "step": 4549 + }, + { + "epoch": 0.47869542346133614, + "grad_norm": 1.600081443786621, + "learning_rate": 0.00011174654846840701, + "loss": 1.9469, + "step": 4550 + }, + { + "epoch": 0.47880063124671224, + "grad_norm": 0.9910126328468323, + "learning_rate": 0.00011171270633675161, + "loss": 2.1031, + "step": 4551 + }, + { + "epoch": 0.4789058390320884, + "grad_norm": 1.027575135231018, + "learning_rate": 0.00011167886284493821, + "loss": 1.5686, + "step": 4552 + }, + { + "epoch": 0.4790110468174645, + "grad_norm": 1.3736275434494019, + "learning_rate": 0.00011164501799689693, + "loss": 1.8597, + "step": 4553 + }, + { + "epoch": 0.47911625460284063, + "grad_norm": 1.4835340976715088, + "learning_rate": 0.00011161117179655804, + "loss": 2.0892, + "step": 4554 + }, + { + "epoch": 0.4792214623882167, + "grad_norm": 1.1146955490112305, + "learning_rate": 0.00011157732424785202, + "loss": 1.7941, + "step": 4555 + }, + { + "epoch": 0.4793266701735929, + "grad_norm": 1.7215523719787598, + "learning_rate": 0.00011154347535470947, + "loss": 1.6763, + "step": 4556 + }, + { + "epoch": 0.47943187795896897, + "grad_norm": 1.5364283323287964, + "learning_rate": 0.0001115096251210611, + "loss": 2.0692, + "step": 4557 + }, + { + "epoch": 0.47953708574434506, + "grad_norm": 1.7501965761184692, + "learning_rate": 0.00011147577355083789, + "loss": 2.2162, + "step": 4558 + }, + { + "epoch": 0.4796422935297212, + "grad_norm": 1.8486822843551636, + "learning_rate": 0.00011144192064797088, + "loss": 1.9551, + "step": 4559 + }, + { + "epoch": 0.4797475013150973, + "grad_norm": 2.2193543910980225, + "learning_rate": 0.0001114080664163913, + "loss": 1.7281, + "step": 4560 + }, + { + "epoch": 0.47985270910047345, + "grad_norm": 1.7399812936782837, + "learning_rate": 0.00011137421086003052, + "loss": 1.8969, + "step": 4561 + }, + { + "epoch": 0.47995791688584954, + "grad_norm": 1.6608930826187134, + "learning_rate": 0.0001113403539828201, + "loss": 1.8734, + "step": 4562 + }, + { + "epoch": 0.4800631246712257, + "grad_norm": 1.4710301160812378, + "learning_rate": 0.00011130649578869173, + "loss": 1.8415, + "step": 4563 + }, + { + "epoch": 0.4801683324566018, + "grad_norm": 1.3207552433013916, + "learning_rate": 0.00011127263628157722, + "loss": 1.8769, + "step": 4564 + }, + { + "epoch": 0.4802735402419779, + "grad_norm": 1.6613874435424805, + "learning_rate": 0.00011123877546540857, + "loss": 1.778, + "step": 4565 + }, + { + "epoch": 0.48037874802735403, + "grad_norm": 1.1604993343353271, + "learning_rate": 0.00011120491334411793, + "loss": 1.5905, + "step": 4566 + }, + { + "epoch": 0.4804839558127301, + "grad_norm": 1.4732471704483032, + "learning_rate": 0.00011117104992163762, + "loss": 1.515, + "step": 4567 + }, + { + "epoch": 0.4805891635981063, + "grad_norm": 1.1420209407806396, + "learning_rate": 0.00011113718520190006, + "loss": 1.6139, + "step": 4568 + }, + { + "epoch": 0.48069437138348237, + "grad_norm": 1.2326006889343262, + "learning_rate": 0.00011110331918883787, + "loss": 1.5793, + "step": 4569 + }, + { + "epoch": 0.4807995791688585, + "grad_norm": 1.0035511255264282, + "learning_rate": 0.00011106945188638378, + "loss": 2.121, + "step": 4570 + }, + { + "epoch": 0.4809047869542346, + "grad_norm": 0.9064120054244995, + "learning_rate": 0.0001110355832984707, + "loss": 1.6258, + "step": 4571 + }, + { + "epoch": 0.48100999473961076, + "grad_norm": 0.8680402040481567, + "learning_rate": 0.00011100171342903165, + "loss": 2.326, + "step": 4572 + }, + { + "epoch": 0.48111520252498685, + "grad_norm": 1.4351173639297485, + "learning_rate": 0.00011096784228199985, + "loss": 1.8333, + "step": 4573 + }, + { + "epoch": 0.48122041031036294, + "grad_norm": 1.4196946620941162, + "learning_rate": 0.00011093396986130866, + "loss": 1.5755, + "step": 4574 + }, + { + "epoch": 0.4813256180957391, + "grad_norm": 1.547824501991272, + "learning_rate": 0.00011090009617089155, + "loss": 2.2177, + "step": 4575 + }, + { + "epoch": 0.4814308258811152, + "grad_norm": 1.1497628688812256, + "learning_rate": 0.00011086622121468213, + "loss": 1.8107, + "step": 4576 + }, + { + "epoch": 0.48153603366649134, + "grad_norm": 0.9802242517471313, + "learning_rate": 0.00011083234499661426, + "loss": 1.9389, + "step": 4577 + }, + { + "epoch": 0.48164124145186743, + "grad_norm": 1.283870816230774, + "learning_rate": 0.00011079846752062182, + "loss": 2.4584, + "step": 4578 + }, + { + "epoch": 0.4817464492372436, + "grad_norm": 1.0405322313308716, + "learning_rate": 0.00011076458879063891, + "loss": 1.7302, + "step": 4579 + }, + { + "epoch": 0.4818516570226197, + "grad_norm": 1.3652944564819336, + "learning_rate": 0.00011073070881059977, + "loss": 1.8197, + "step": 4580 + }, + { + "epoch": 0.48195686480799577, + "grad_norm": 0.9112363457679749, + "learning_rate": 0.00011069682758443873, + "loss": 2.3815, + "step": 4581 + }, + { + "epoch": 0.4820620725933719, + "grad_norm": 2.2471213340759277, + "learning_rate": 0.00011066294511609032, + "loss": 1.918, + "step": 4582 + }, + { + "epoch": 0.482167280378748, + "grad_norm": 1.5038715600967407, + "learning_rate": 0.00011062906140948922, + "loss": 1.571, + "step": 4583 + }, + { + "epoch": 0.48227248816412416, + "grad_norm": 1.7240707874298096, + "learning_rate": 0.00011059517646857023, + "loss": 1.7805, + "step": 4584 + }, + { + "epoch": 0.48237769594950025, + "grad_norm": 1.71571946144104, + "learning_rate": 0.00011056129029726825, + "loss": 2.1035, + "step": 4585 + }, + { + "epoch": 0.4824829037348764, + "grad_norm": 1.455073356628418, + "learning_rate": 0.00011052740289951842, + "loss": 1.5254, + "step": 4586 + }, + { + "epoch": 0.4825881115202525, + "grad_norm": 1.1514016389846802, + "learning_rate": 0.00011049351427925598, + "loss": 1.5474, + "step": 4587 + }, + { + "epoch": 0.48269331930562864, + "grad_norm": 1.1613560914993286, + "learning_rate": 0.00011045962444041624, + "loss": 2.2161, + "step": 4588 + }, + { + "epoch": 0.48279852709100474, + "grad_norm": 1.7991211414337158, + "learning_rate": 0.00011042573338693479, + "loss": 1.9893, + "step": 4589 + }, + { + "epoch": 0.48290373487638083, + "grad_norm": 1.4048147201538086, + "learning_rate": 0.00011039184112274725, + "loss": 1.771, + "step": 4590 + }, + { + "epoch": 0.483008942661757, + "grad_norm": 1.2304316759109497, + "learning_rate": 0.00011035794765178941, + "loss": 2.0752, + "step": 4591 + }, + { + "epoch": 0.4831141504471331, + "grad_norm": 1.0293245315551758, + "learning_rate": 0.00011032405297799722, + "loss": 2.0021, + "step": 4592 + }, + { + "epoch": 0.4832193582325092, + "grad_norm": 1.4348002672195435, + "learning_rate": 0.00011029015710530674, + "loss": 1.5027, + "step": 4593 + }, + { + "epoch": 0.4833245660178853, + "grad_norm": 1.2140568494796753, + "learning_rate": 0.0001102562600376542, + "loss": 1.864, + "step": 4594 + }, + { + "epoch": 0.48342977380326146, + "grad_norm": 1.5038398504257202, + "learning_rate": 0.000110222361778976, + "loss": 1.567, + "step": 4595 + }, + { + "epoch": 0.48353498158863756, + "grad_norm": 2.2957375049591064, + "learning_rate": 0.00011018846233320854, + "loss": 1.1497, + "step": 4596 + }, + { + "epoch": 0.48364018937401365, + "grad_norm": 1.7357734441757202, + "learning_rate": 0.0001101545617042885, + "loss": 2.1494, + "step": 4597 + }, + { + "epoch": 0.4837453971593898, + "grad_norm": 1.3374382257461548, + "learning_rate": 0.0001101206598961527, + "loss": 2.0164, + "step": 4598 + }, + { + "epoch": 0.4838506049447659, + "grad_norm": 1.0593690872192383, + "learning_rate": 0.00011008675691273793, + "loss": 1.7716, + "step": 4599 + }, + { + "epoch": 0.48395581273014204, + "grad_norm": 1.28423273563385, + "learning_rate": 0.00011005285275798132, + "loss": 1.9312, + "step": 4600 + }, + { + "epoch": 0.48406102051551814, + "grad_norm": 1.176078200340271, + "learning_rate": 0.00011001894743582004, + "loss": 2.0926, + "step": 4601 + }, + { + "epoch": 0.4841662283008943, + "grad_norm": 1.3551251888275146, + "learning_rate": 0.00010998504095019137, + "loss": 1.9196, + "step": 4602 + }, + { + "epoch": 0.4842714360862704, + "grad_norm": 1.554956316947937, + "learning_rate": 0.00010995113330503278, + "loss": 1.5871, + "step": 4603 + }, + { + "epoch": 0.48437664387164653, + "grad_norm": 0.9341940879821777, + "learning_rate": 0.00010991722450428184, + "loss": 1.5597, + "step": 4604 + }, + { + "epoch": 0.4844818516570226, + "grad_norm": 1.026902437210083, + "learning_rate": 0.00010988331455187628, + "loss": 2.1465, + "step": 4605 + }, + { + "epoch": 0.4845870594423987, + "grad_norm": 1.683455228805542, + "learning_rate": 0.00010984940345175392, + "loss": 1.9747, + "step": 4606 + }, + { + "epoch": 0.48469226722777486, + "grad_norm": 1.6722251176834106, + "learning_rate": 0.0001098154912078528, + "loss": 1.8076, + "step": 4607 + }, + { + "epoch": 0.48479747501315096, + "grad_norm": 1.0394576787948608, + "learning_rate": 0.000109781577824111, + "loss": 1.9039, + "step": 4608 + }, + { + "epoch": 0.4849026827985271, + "grad_norm": 1.5216197967529297, + "learning_rate": 0.00010974766330446678, + "loss": 2.1068, + "step": 4609 + }, + { + "epoch": 0.4850078905839032, + "grad_norm": 2.0184504985809326, + "learning_rate": 0.00010971374765285851, + "loss": 1.9356, + "step": 4610 + }, + { + "epoch": 0.48511309836927935, + "grad_norm": 2.1446685791015625, + "learning_rate": 0.0001096798308732247, + "loss": 1.0265, + "step": 4611 + }, + { + "epoch": 0.48521830615465544, + "grad_norm": 1.6989480257034302, + "learning_rate": 0.00010964591296950406, + "loss": 1.8608, + "step": 4612 + }, + { + "epoch": 0.48532351394003154, + "grad_norm": 1.0963361263275146, + "learning_rate": 0.00010961199394563526, + "loss": 1.5642, + "step": 4613 + }, + { + "epoch": 0.4854287217254077, + "grad_norm": 1.4704118967056274, + "learning_rate": 0.00010957807380555727, + "loss": 1.5999, + "step": 4614 + }, + { + "epoch": 0.4855339295107838, + "grad_norm": 1.1268386840820312, + "learning_rate": 0.00010954415255320909, + "loss": 1.4149, + "step": 4615 + }, + { + "epoch": 0.48563913729615993, + "grad_norm": 1.2833198308944702, + "learning_rate": 0.00010951023019252993, + "loss": 1.3482, + "step": 4616 + }, + { + "epoch": 0.485744345081536, + "grad_norm": 1.0503816604614258, + "learning_rate": 0.00010947630672745906, + "loss": 1.5722, + "step": 4617 + }, + { + "epoch": 0.48584955286691217, + "grad_norm": 2.151472330093384, + "learning_rate": 0.00010944238216193586, + "loss": 2.0229, + "step": 4618 + }, + { + "epoch": 0.48595476065228826, + "grad_norm": 1.2306005954742432, + "learning_rate": 0.00010940845649989994, + "loss": 1.4716, + "step": 4619 + }, + { + "epoch": 0.4860599684376644, + "grad_norm": 1.4052913188934326, + "learning_rate": 0.00010937452974529093, + "loss": 1.6263, + "step": 4620 + }, + { + "epoch": 0.4861651762230405, + "grad_norm": 1.0920840501785278, + "learning_rate": 0.00010934060190204865, + "loss": 2.1744, + "step": 4621 + }, + { + "epoch": 0.4862703840084166, + "grad_norm": 2.252732276916504, + "learning_rate": 0.00010930667297411305, + "loss": 1.9662, + "step": 4622 + }, + { + "epoch": 0.48637559179379275, + "grad_norm": 1.5878801345825195, + "learning_rate": 0.00010927274296542416, + "loss": 1.7275, + "step": 4623 + }, + { + "epoch": 0.48648079957916884, + "grad_norm": 0.946306586265564, + "learning_rate": 0.00010923881187992215, + "loss": 1.5401, + "step": 4624 + }, + { + "epoch": 0.486586007364545, + "grad_norm": 1.2825038433074951, + "learning_rate": 0.00010920487972154734, + "loss": 2.1301, + "step": 4625 + }, + { + "epoch": 0.4866912151499211, + "grad_norm": 2.0224785804748535, + "learning_rate": 0.00010917094649424018, + "loss": 1.6339, + "step": 4626 + }, + { + "epoch": 0.48679642293529723, + "grad_norm": 1.4419629573822021, + "learning_rate": 0.00010913701220194117, + "loss": 2.2219, + "step": 4627 + }, + { + "epoch": 0.48690163072067333, + "grad_norm": 1.4137790203094482, + "learning_rate": 0.00010910307684859102, + "loss": 2.2587, + "step": 4628 + }, + { + "epoch": 0.4870068385060494, + "grad_norm": 1.2425460815429688, + "learning_rate": 0.00010906914043813056, + "loss": 1.6606, + "step": 4629 + }, + { + "epoch": 0.48711204629142557, + "grad_norm": 1.4972299337387085, + "learning_rate": 0.00010903520297450067, + "loss": 1.3636, + "step": 4630 + }, + { + "epoch": 0.48721725407680166, + "grad_norm": 1.6092376708984375, + "learning_rate": 0.0001090012644616424, + "loss": 2.0573, + "step": 4631 + }, + { + "epoch": 0.4873224618621778, + "grad_norm": 1.6330995559692383, + "learning_rate": 0.00010896732490349697, + "loss": 1.7161, + "step": 4632 + }, + { + "epoch": 0.4874276696475539, + "grad_norm": 2.3415868282318115, + "learning_rate": 0.00010893338430400562, + "loss": 1.524, + "step": 4633 + }, + { + "epoch": 0.48753287743293006, + "grad_norm": 1.2462005615234375, + "learning_rate": 0.00010889944266710972, + "loss": 1.5716, + "step": 4634 + }, + { + "epoch": 0.48763808521830615, + "grad_norm": 1.564387321472168, + "learning_rate": 0.00010886549999675088, + "loss": 1.6856, + "step": 4635 + }, + { + "epoch": 0.4877432930036823, + "grad_norm": 0.9912921786308289, + "learning_rate": 0.00010883155629687071, + "loss": 1.878, + "step": 4636 + }, + { + "epoch": 0.4878485007890584, + "grad_norm": 1.348905086517334, + "learning_rate": 0.000108797611571411, + "loss": 2.4173, + "step": 4637 + }, + { + "epoch": 0.4879537085744345, + "grad_norm": 1.316444754600525, + "learning_rate": 0.00010876366582431361, + "loss": 2.1035, + "step": 4638 + }, + { + "epoch": 0.48805891635981064, + "grad_norm": 1.0654300451278687, + "learning_rate": 0.00010872971905952057, + "loss": 2.1269, + "step": 4639 + }, + { + "epoch": 0.48816412414518673, + "grad_norm": 1.8568501472473145, + "learning_rate": 0.00010869577128097404, + "loss": 1.2797, + "step": 4640 + }, + { + "epoch": 0.4882693319305629, + "grad_norm": 1.3833286762237549, + "learning_rate": 0.00010866182249261617, + "loss": 1.9619, + "step": 4641 + }, + { + "epoch": 0.48837453971593897, + "grad_norm": 0.8749449253082275, + "learning_rate": 0.00010862787269838939, + "loss": 1.6768, + "step": 4642 + }, + { + "epoch": 0.4884797475013151, + "grad_norm": 1.499631643295288, + "learning_rate": 0.00010859392190223619, + "loss": 1.9632, + "step": 4643 + }, + { + "epoch": 0.4885849552866912, + "grad_norm": 1.0441616773605347, + "learning_rate": 0.00010855997010809915, + "loss": 2.0146, + "step": 4644 + }, + { + "epoch": 0.4886901630720673, + "grad_norm": 1.8089901208877563, + "learning_rate": 0.00010852601731992094, + "loss": 1.8159, + "step": 4645 + }, + { + "epoch": 0.48879537085744346, + "grad_norm": 1.6164319515228271, + "learning_rate": 0.00010849206354164439, + "loss": 1.3445, + "step": 4646 + }, + { + "epoch": 0.48890057864281955, + "grad_norm": 1.5210686922073364, + "learning_rate": 0.00010845810877721252, + "loss": 1.6565, + "step": 4647 + }, + { + "epoch": 0.4890057864281957, + "grad_norm": 1.2863743305206299, + "learning_rate": 0.00010842415303056827, + "loss": 1.5714, + "step": 4648 + }, + { + "epoch": 0.4891109942135718, + "grad_norm": 1.657315731048584, + "learning_rate": 0.0001083901963056549, + "loss": 1.7163, + "step": 4649 + }, + { + "epoch": 0.48921620199894794, + "grad_norm": 1.0181341171264648, + "learning_rate": 0.00010835623860641569, + "loss": 1.8581, + "step": 4650 + }, + { + "epoch": 0.48932140978432404, + "grad_norm": 0.8646203279495239, + "learning_rate": 0.00010832227993679396, + "loss": 1.8136, + "step": 4651 + }, + { + "epoch": 0.4894266175697002, + "grad_norm": 1.2895039319992065, + "learning_rate": 0.00010828832030073329, + "loss": 1.9504, + "step": 4652 + }, + { + "epoch": 0.4895318253550763, + "grad_norm": 1.2667862176895142, + "learning_rate": 0.00010825435970217728, + "loss": 1.9774, + "step": 4653 + }, + { + "epoch": 0.48963703314045237, + "grad_norm": 1.3460382223129272, + "learning_rate": 0.00010822039814506964, + "loss": 2.0969, + "step": 4654 + }, + { + "epoch": 0.4897422409258285, + "grad_norm": 1.2763968706130981, + "learning_rate": 0.00010818643563335424, + "loss": 2.2074, + "step": 4655 + }, + { + "epoch": 0.4898474487112046, + "grad_norm": 0.9694689512252808, + "learning_rate": 0.00010815247217097504, + "loss": 1.7114, + "step": 4656 + }, + { + "epoch": 0.48995265649658076, + "grad_norm": 2.234217882156372, + "learning_rate": 0.00010811850776187608, + "loss": 2.196, + "step": 4657 + }, + { + "epoch": 0.49005786428195686, + "grad_norm": 1.10568106174469, + "learning_rate": 0.00010808454241000155, + "loss": 1.66, + "step": 4658 + }, + { + "epoch": 0.490163072067333, + "grad_norm": 1.4655194282531738, + "learning_rate": 0.00010805057611929573, + "loss": 1.9986, + "step": 4659 + }, + { + "epoch": 0.4902682798527091, + "grad_norm": 1.9132064580917358, + "learning_rate": 0.00010801660889370301, + "loss": 1.7011, + "step": 4660 + }, + { + "epoch": 0.4903734876380852, + "grad_norm": 1.7220205068588257, + "learning_rate": 0.00010798264073716791, + "loss": 1.6763, + "step": 4661 + }, + { + "epoch": 0.49047869542346134, + "grad_norm": 0.8977935910224915, + "learning_rate": 0.000107948671653635, + "loss": 1.8618, + "step": 4662 + }, + { + "epoch": 0.49058390320883744, + "grad_norm": 1.6346113681793213, + "learning_rate": 0.00010791470164704904, + "loss": 1.6551, + "step": 4663 + }, + { + "epoch": 0.4906891109942136, + "grad_norm": 1.7571468353271484, + "learning_rate": 0.00010788073072135485, + "loss": 1.6048, + "step": 4664 + }, + { + "epoch": 0.4907943187795897, + "grad_norm": 1.8179447650909424, + "learning_rate": 0.00010784675888049735, + "loss": 2.1345, + "step": 4665 + }, + { + "epoch": 0.4908995265649658, + "grad_norm": 1.7753139734268188, + "learning_rate": 0.00010781278612842159, + "loss": 1.8888, + "step": 4666 + }, + { + "epoch": 0.4910047343503419, + "grad_norm": 1.0005837678909302, + "learning_rate": 0.00010777881246907269, + "loss": 1.645, + "step": 4667 + }, + { + "epoch": 0.49110994213571807, + "grad_norm": 1.0366028547286987, + "learning_rate": 0.00010774483790639591, + "loss": 1.5939, + "step": 4668 + }, + { + "epoch": 0.49121514992109416, + "grad_norm": 0.8974822163581848, + "learning_rate": 0.00010771086244433662, + "loss": 1.9674, + "step": 4669 + }, + { + "epoch": 0.49132035770647026, + "grad_norm": 1.6691381931304932, + "learning_rate": 0.00010767688608684023, + "loss": 1.32, + "step": 4670 + }, + { + "epoch": 0.4914255654918464, + "grad_norm": 1.127601981163025, + "learning_rate": 0.00010764290883785237, + "loss": 1.8129, + "step": 4671 + }, + { + "epoch": 0.4915307732772225, + "grad_norm": 1.6219996213912964, + "learning_rate": 0.00010760893070131868, + "loss": 1.8319, + "step": 4672 + }, + { + "epoch": 0.49163598106259865, + "grad_norm": 1.6142001152038574, + "learning_rate": 0.0001075749516811849, + "loss": 1.3877, + "step": 4673 + }, + { + "epoch": 0.49174118884797474, + "grad_norm": 1.2514292001724243, + "learning_rate": 0.00010754097178139695, + "loss": 1.4618, + "step": 4674 + }, + { + "epoch": 0.4918463966333509, + "grad_norm": 2.3644227981567383, + "learning_rate": 0.00010750699100590076, + "loss": 1.4959, + "step": 4675 + }, + { + "epoch": 0.491951604418727, + "grad_norm": 0.9489824175834656, + "learning_rate": 0.00010747300935864243, + "loss": 1.775, + "step": 4676 + }, + { + "epoch": 0.4920568122041031, + "grad_norm": 1.1649144887924194, + "learning_rate": 0.00010743902684356815, + "loss": 1.8669, + "step": 4677 + }, + { + "epoch": 0.4921620199894792, + "grad_norm": 1.477669596672058, + "learning_rate": 0.00010740504346462417, + "loss": 1.9316, + "step": 4678 + }, + { + "epoch": 0.4922672277748553, + "grad_norm": 1.8862009048461914, + "learning_rate": 0.00010737105922575685, + "loss": 2.0482, + "step": 4679 + }, + { + "epoch": 0.49237243556023147, + "grad_norm": 1.249657154083252, + "learning_rate": 0.00010733707413091269, + "loss": 1.5986, + "step": 4680 + }, + { + "epoch": 0.49247764334560756, + "grad_norm": 1.274321436882019, + "learning_rate": 0.00010730308818403832, + "loss": 2.0053, + "step": 4681 + }, + { + "epoch": 0.4925828511309837, + "grad_norm": 1.5534741878509521, + "learning_rate": 0.00010726910138908032, + "loss": 1.7589, + "step": 4682 + }, + { + "epoch": 0.4926880589163598, + "grad_norm": 1.196625828742981, + "learning_rate": 0.00010723511374998554, + "loss": 1.6722, + "step": 4683 + }, + { + "epoch": 0.49279326670173595, + "grad_norm": 1.5351232290267944, + "learning_rate": 0.00010720112527070083, + "loss": 1.7944, + "step": 4684 + }, + { + "epoch": 0.49289847448711205, + "grad_norm": 1.5849000215530396, + "learning_rate": 0.00010716713595517313, + "loss": 1.7102, + "step": 4685 + }, + { + "epoch": 0.49300368227248814, + "grad_norm": 1.8203864097595215, + "learning_rate": 0.00010713314580734954, + "loss": 2.2117, + "step": 4686 + }, + { + "epoch": 0.4931088900578643, + "grad_norm": 1.308393955230713, + "learning_rate": 0.00010709915483117723, + "loss": 1.8919, + "step": 4687 + }, + { + "epoch": 0.4932140978432404, + "grad_norm": 1.2589519023895264, + "learning_rate": 0.00010706516303060345, + "loss": 1.3221, + "step": 4688 + }, + { + "epoch": 0.49331930562861653, + "grad_norm": 1.2415590286254883, + "learning_rate": 0.00010703117040957553, + "loss": 1.8339, + "step": 4689 + }, + { + "epoch": 0.4934245134139926, + "grad_norm": 1.551783800125122, + "learning_rate": 0.00010699717697204095, + "loss": 1.8219, + "step": 4690 + }, + { + "epoch": 0.4935297211993688, + "grad_norm": 1.280253291130066, + "learning_rate": 0.00010696318272194726, + "loss": 1.7465, + "step": 4691 + }, + { + "epoch": 0.49363492898474487, + "grad_norm": 1.4209909439086914, + "learning_rate": 0.00010692918766324209, + "loss": 1.7862, + "step": 4692 + }, + { + "epoch": 0.49374013677012096, + "grad_norm": 1.7612025737762451, + "learning_rate": 0.00010689519179987316, + "loss": 1.8797, + "step": 4693 + }, + { + "epoch": 0.4938453445554971, + "grad_norm": 1.5453522205352783, + "learning_rate": 0.00010686119513578831, + "loss": 2.0076, + "step": 4694 + }, + { + "epoch": 0.4939505523408732, + "grad_norm": 1.7672468423843384, + "learning_rate": 0.00010682719767493547, + "loss": 1.8416, + "step": 4695 + }, + { + "epoch": 0.49405576012624935, + "grad_norm": 1.4348251819610596, + "learning_rate": 0.00010679319942126264, + "loss": 1.872, + "step": 4696 + }, + { + "epoch": 0.49416096791162545, + "grad_norm": 1.9852848052978516, + "learning_rate": 0.00010675920037871794, + "loss": 1.921, + "step": 4697 + }, + { + "epoch": 0.4942661756970016, + "grad_norm": 1.4216840267181396, + "learning_rate": 0.00010672520055124958, + "loss": 1.5686, + "step": 4698 + }, + { + "epoch": 0.4943713834823777, + "grad_norm": 1.5400251150131226, + "learning_rate": 0.00010669119994280581, + "loss": 2.021, + "step": 4699 + }, + { + "epoch": 0.49447659126775384, + "grad_norm": 1.3077402114868164, + "learning_rate": 0.00010665719855733501, + "loss": 2.1264, + "step": 4700 + }, + { + "epoch": 0.49458179905312993, + "grad_norm": 1.2699223756790161, + "learning_rate": 0.00010662319639878565, + "loss": 1.7802, + "step": 4701 + }, + { + "epoch": 0.494687006838506, + "grad_norm": 1.2943834066390991, + "learning_rate": 0.00010658919347110634, + "loss": 1.8573, + "step": 4702 + }, + { + "epoch": 0.4947922146238822, + "grad_norm": 1.5511493682861328, + "learning_rate": 0.00010655518977824566, + "loss": 2.2023, + "step": 4703 + }, + { + "epoch": 0.49489742240925827, + "grad_norm": 1.4400460720062256, + "learning_rate": 0.00010652118532415236, + "loss": 1.9261, + "step": 4704 + }, + { + "epoch": 0.4950026301946344, + "grad_norm": 1.2507339715957642, + "learning_rate": 0.00010648718011277535, + "loss": 1.6863, + "step": 4705 + }, + { + "epoch": 0.4951078379800105, + "grad_norm": 1.3244736194610596, + "learning_rate": 0.00010645317414806342, + "loss": 1.9085, + "step": 4706 + }, + { + "epoch": 0.49521304576538666, + "grad_norm": 1.1200934648513794, + "learning_rate": 0.00010641916743396563, + "loss": 1.5047, + "step": 4707 + }, + { + "epoch": 0.49531825355076275, + "grad_norm": 1.6609597206115723, + "learning_rate": 0.00010638515997443109, + "loss": 1.1343, + "step": 4708 + }, + { + "epoch": 0.49542346133613885, + "grad_norm": 1.0850027799606323, + "learning_rate": 0.00010635115177340893, + "loss": 1.6411, + "step": 4709 + }, + { + "epoch": 0.495528669121515, + "grad_norm": 1.537176489830017, + "learning_rate": 0.00010631714283484842, + "loss": 1.524, + "step": 4710 + }, + { + "epoch": 0.4956338769068911, + "grad_norm": 2.0602495670318604, + "learning_rate": 0.00010628313316269891, + "loss": 1.9078, + "step": 4711 + }, + { + "epoch": 0.49573908469226724, + "grad_norm": 1.759886622428894, + "learning_rate": 0.00010624912276090988, + "loss": 1.9751, + "step": 4712 + }, + { + "epoch": 0.49584429247764333, + "grad_norm": 1.4797873497009277, + "learning_rate": 0.00010621511163343077, + "loss": 2.0188, + "step": 4713 + }, + { + "epoch": 0.4959495002630195, + "grad_norm": 1.672033429145813, + "learning_rate": 0.00010618109978421119, + "loss": 1.5586, + "step": 4714 + }, + { + "epoch": 0.4960547080483956, + "grad_norm": 1.9004497528076172, + "learning_rate": 0.00010614708721720085, + "loss": 1.4778, + "step": 4715 + }, + { + "epoch": 0.4961599158337717, + "grad_norm": 1.600061058998108, + "learning_rate": 0.00010611307393634955, + "loss": 1.4879, + "step": 4716 + }, + { + "epoch": 0.4962651236191478, + "grad_norm": 1.7574427127838135, + "learning_rate": 0.0001060790599456071, + "loss": 1.8051, + "step": 4717 + }, + { + "epoch": 0.4963703314045239, + "grad_norm": 1.1964126825332642, + "learning_rate": 0.0001060450452489234, + "loss": 1.7907, + "step": 4718 + }, + { + "epoch": 0.49647553918990006, + "grad_norm": 1.7190804481506348, + "learning_rate": 0.00010601102985024853, + "loss": 2.033, + "step": 4719 + }, + { + "epoch": 0.49658074697527615, + "grad_norm": 1.3082791566848755, + "learning_rate": 0.00010597701375353257, + "loss": 1.7543, + "step": 4720 + }, + { + "epoch": 0.4966859547606523, + "grad_norm": 1.5606935024261475, + "learning_rate": 0.00010594299696272565, + "loss": 1.6072, + "step": 4721 + }, + { + "epoch": 0.4967911625460284, + "grad_norm": 1.4001544713974, + "learning_rate": 0.00010590897948177806, + "loss": 2.1012, + "step": 4722 + }, + { + "epoch": 0.49689637033140455, + "grad_norm": 1.4540914297103882, + "learning_rate": 0.00010587496131464019, + "loss": 1.3696, + "step": 4723 + }, + { + "epoch": 0.49700157811678064, + "grad_norm": 1.2410134077072144, + "learning_rate": 0.00010584094246526237, + "loss": 1.4659, + "step": 4724 + }, + { + "epoch": 0.49710678590215673, + "grad_norm": 1.1496258974075317, + "learning_rate": 0.00010580692293759513, + "loss": 1.7615, + "step": 4725 + }, + { + "epoch": 0.4972119936875329, + "grad_norm": 1.969294548034668, + "learning_rate": 0.00010577290273558908, + "loss": 1.6735, + "step": 4726 + }, + { + "epoch": 0.497317201472909, + "grad_norm": 1.9359304904937744, + "learning_rate": 0.00010573888186319482, + "loss": 1.7351, + "step": 4727 + }, + { + "epoch": 0.4974224092582851, + "grad_norm": 1.6216297149658203, + "learning_rate": 0.0001057048603243631, + "loss": 1.7732, + "step": 4728 + }, + { + "epoch": 0.4975276170436612, + "grad_norm": 1.4571208953857422, + "learning_rate": 0.00010567083812304477, + "loss": 1.5308, + "step": 4729 + }, + { + "epoch": 0.49763282482903737, + "grad_norm": 1.5610977411270142, + "learning_rate": 0.00010563681526319069, + "loss": 1.8377, + "step": 4730 + }, + { + "epoch": 0.49773803261441346, + "grad_norm": 1.9585388898849487, + "learning_rate": 0.00010560279174875179, + "loss": 2.1389, + "step": 4731 + }, + { + "epoch": 0.4978432403997896, + "grad_norm": 1.929347038269043, + "learning_rate": 0.0001055687675836791, + "loss": 0.9178, + "step": 4732 + }, + { + "epoch": 0.4979484481851657, + "grad_norm": 1.9738050699234009, + "learning_rate": 0.00010553474277192381, + "loss": 1.9461, + "step": 4733 + }, + { + "epoch": 0.4980536559705418, + "grad_norm": 1.227402925491333, + "learning_rate": 0.00010550071731743707, + "loss": 2.15, + "step": 4734 + }, + { + "epoch": 0.49815886375591795, + "grad_norm": 1.7934569120407104, + "learning_rate": 0.00010546669122417013, + "loss": 1.7138, + "step": 4735 + }, + { + "epoch": 0.49826407154129404, + "grad_norm": 1.155175805091858, + "learning_rate": 0.00010543266449607432, + "loss": 1.708, + "step": 4736 + }, + { + "epoch": 0.4983692793266702, + "grad_norm": 1.2957451343536377, + "learning_rate": 0.0001053986371371011, + "loss": 1.8071, + "step": 4737 + }, + { + "epoch": 0.4984744871120463, + "grad_norm": 1.2430212497711182, + "learning_rate": 0.0001053646091512019, + "loss": 1.568, + "step": 4738 + }, + { + "epoch": 0.49857969489742243, + "grad_norm": 0.9808127880096436, + "learning_rate": 0.00010533058054232832, + "loss": 1.6683, + "step": 4739 + }, + { + "epoch": 0.4986849026827985, + "grad_norm": 1.0529319047927856, + "learning_rate": 0.00010529655131443199, + "loss": 1.5132, + "step": 4740 + }, + { + "epoch": 0.4987901104681746, + "grad_norm": 1.2187601327896118, + "learning_rate": 0.0001052625214714646, + "loss": 1.7863, + "step": 4741 + }, + { + "epoch": 0.49889531825355077, + "grad_norm": 1.5013796091079712, + "learning_rate": 0.00010522849101737788, + "loss": 1.8348, + "step": 4742 + }, + { + "epoch": 0.49900052603892686, + "grad_norm": 1.5242013931274414, + "learning_rate": 0.00010519445995612374, + "loss": 2.0386, + "step": 4743 + }, + { + "epoch": 0.499105733824303, + "grad_norm": 1.6240057945251465, + "learning_rate": 0.00010516042829165408, + "loss": 1.6951, + "step": 4744 + }, + { + "epoch": 0.4992109416096791, + "grad_norm": 1.4725539684295654, + "learning_rate": 0.00010512639602792088, + "loss": 1.9254, + "step": 4745 + }, + { + "epoch": 0.49931614939505525, + "grad_norm": 1.0966986417770386, + "learning_rate": 0.00010509236316887615, + "loss": 1.7749, + "step": 4746 + }, + { + "epoch": 0.49942135718043135, + "grad_norm": 1.0299161672592163, + "learning_rate": 0.0001050583297184721, + "loss": 1.4928, + "step": 4747 + }, + { + "epoch": 0.4995265649658075, + "grad_norm": 1.2240405082702637, + "learning_rate": 0.00010502429568066084, + "loss": 1.4958, + "step": 4748 + }, + { + "epoch": 0.4996317727511836, + "grad_norm": 1.3169293403625488, + "learning_rate": 0.00010499026105939467, + "loss": 1.3972, + "step": 4749 + }, + { + "epoch": 0.4997369805365597, + "grad_norm": 1.836531400680542, + "learning_rate": 0.00010495622585862594, + "loss": 2.0759, + "step": 4750 + }, + { + "epoch": 0.49984218832193583, + "grad_norm": 1.1490098237991333, + "learning_rate": 0.00010492219008230704, + "loss": 1.9707, + "step": 4751 + }, + { + "epoch": 0.4999473961073119, + "grad_norm": 1.809786319732666, + "learning_rate": 0.00010488815373439036, + "loss": 2.212, + "step": 4752 + }, + { + "epoch": 0.500052603892688, + "grad_norm": 1.3090404272079468, + "learning_rate": 0.0001048541168188285, + "loss": 1.7952, + "step": 4753 + }, + { + "epoch": 0.5001578116780642, + "grad_norm": 1.8462897539138794, + "learning_rate": 0.00010482007933957407, + "loss": 1.837, + "step": 4754 + }, + { + "epoch": 0.5002630194634403, + "grad_norm": 1.5334430932998657, + "learning_rate": 0.00010478604130057965, + "loss": 1.7316, + "step": 4755 + }, + { + "epoch": 0.5003682272488165, + "grad_norm": 1.2265576124191284, + "learning_rate": 0.00010475200270579803, + "loss": 1.6373, + "step": 4756 + }, + { + "epoch": 0.5004734350341925, + "grad_norm": 1.6965841054916382, + "learning_rate": 0.00010471796355918202, + "loss": 2.1889, + "step": 4757 + }, + { + "epoch": 0.5005786428195687, + "grad_norm": 1.6729693412780762, + "learning_rate": 0.0001046839238646844, + "loss": 1.4878, + "step": 4758 + }, + { + "epoch": 0.5006838506049448, + "grad_norm": 1.4285995960235596, + "learning_rate": 0.00010464988362625812, + "loss": 2.3533, + "step": 4759 + }, + { + "epoch": 0.5007890583903208, + "grad_norm": 0.9867104291915894, + "learning_rate": 0.00010461584284785617, + "loss": 2.2433, + "step": 4760 + }, + { + "epoch": 0.500894266175697, + "grad_norm": 1.198486566543579, + "learning_rate": 0.00010458180153343162, + "loss": 1.491, + "step": 4761 + }, + { + "epoch": 0.5009994739610731, + "grad_norm": 2.1748099327087402, + "learning_rate": 0.00010454775968693753, + "loss": 1.4489, + "step": 4762 + }, + { + "epoch": 0.5011046817464493, + "grad_norm": 1.7211601734161377, + "learning_rate": 0.00010451371731232708, + "loss": 1.4753, + "step": 4763 + }, + { + "epoch": 0.5012098895318253, + "grad_norm": 1.0159982442855835, + "learning_rate": 0.00010447967441355349, + "loss": 1.9713, + "step": 4764 + }, + { + "epoch": 0.5013150973172015, + "grad_norm": 1.20917546749115, + "learning_rate": 0.00010444563099457008, + "loss": 1.5139, + "step": 4765 + }, + { + "epoch": 0.5014203051025776, + "grad_norm": 1.7025082111358643, + "learning_rate": 0.00010441158705933016, + "loss": 1.5271, + "step": 4766 + }, + { + "epoch": 0.5015255128879537, + "grad_norm": 1.3307257890701294, + "learning_rate": 0.00010437754261178719, + "loss": 1.9943, + "step": 4767 + }, + { + "epoch": 0.5016307206733298, + "grad_norm": 2.893484592437744, + "learning_rate": 0.00010434349765589459, + "loss": 1.4271, + "step": 4768 + }, + { + "epoch": 0.501735928458706, + "grad_norm": 1.664506196975708, + "learning_rate": 0.0001043094521956059, + "loss": 1.5161, + "step": 4769 + }, + { + "epoch": 0.5018411362440821, + "grad_norm": 1.219966173171997, + "learning_rate": 0.00010427540623487475, + "loss": 1.3841, + "step": 4770 + }, + { + "epoch": 0.5019463440294581, + "grad_norm": 1.3663967847824097, + "learning_rate": 0.00010424135977765475, + "loss": 2.0504, + "step": 4771 + }, + { + "epoch": 0.5020515518148343, + "grad_norm": 1.2188875675201416, + "learning_rate": 0.00010420731282789957, + "loss": 1.7724, + "step": 4772 + }, + { + "epoch": 0.5021567596002104, + "grad_norm": 1.4278043508529663, + "learning_rate": 0.00010417326538956305, + "loss": 1.5938, + "step": 4773 + }, + { + "epoch": 0.5022619673855865, + "grad_norm": 1.2825640439987183, + "learning_rate": 0.00010413921746659894, + "loss": 2.0405, + "step": 4774 + }, + { + "epoch": 0.5023671751709626, + "grad_norm": 1.455748200416565, + "learning_rate": 0.00010410516906296115, + "loss": 1.3787, + "step": 4775 + }, + { + "epoch": 0.5024723829563388, + "grad_norm": 0.9623004198074341, + "learning_rate": 0.00010407112018260356, + "loss": 1.4602, + "step": 4776 + }, + { + "epoch": 0.5025775907417149, + "grad_norm": 1.078614592552185, + "learning_rate": 0.0001040370708294802, + "loss": 1.9882, + "step": 4777 + }, + { + "epoch": 0.502682798527091, + "grad_norm": 1.2258964776992798, + "learning_rate": 0.00010400302100754514, + "loss": 1.5276, + "step": 4778 + }, + { + "epoch": 0.5027880063124671, + "grad_norm": 1.2255494594573975, + "learning_rate": 0.00010396897072075237, + "loss": 1.2363, + "step": 4779 + }, + { + "epoch": 0.5028932140978433, + "grad_norm": 1.2698360681533813, + "learning_rate": 0.00010393491997305613, + "loss": 1.7825, + "step": 4780 + }, + { + "epoch": 0.5029984218832193, + "grad_norm": 1.997664451599121, + "learning_rate": 0.00010390086876841061, + "loss": 1.9237, + "step": 4781 + }, + { + "epoch": 0.5031036296685955, + "grad_norm": 1.0741022825241089, + "learning_rate": 0.00010386681711077002, + "loss": 1.8118, + "step": 4782 + }, + { + "epoch": 0.5032088374539716, + "grad_norm": 1.1076297760009766, + "learning_rate": 0.0001038327650040887, + "loss": 1.4896, + "step": 4783 + }, + { + "epoch": 0.5033140452393478, + "grad_norm": 2.157230854034424, + "learning_rate": 0.000103798712452321, + "loss": 1.6452, + "step": 4784 + }, + { + "epoch": 0.5034192530247238, + "grad_norm": 1.4255646467208862, + "learning_rate": 0.00010376465945942133, + "loss": 2.2098, + "step": 4785 + }, + { + "epoch": 0.5035244608100999, + "grad_norm": 1.7662725448608398, + "learning_rate": 0.00010373060602934415, + "loss": 2.1151, + "step": 4786 + }, + { + "epoch": 0.5036296685954761, + "grad_norm": 1.1463786363601685, + "learning_rate": 0.00010369655216604397, + "loss": 1.8305, + "step": 4787 + }, + { + "epoch": 0.5037348763808522, + "grad_norm": 1.4723010063171387, + "learning_rate": 0.00010366249787347537, + "loss": 2.3292, + "step": 4788 + }, + { + "epoch": 0.5038400841662283, + "grad_norm": 0.9463380575180054, + "learning_rate": 0.00010362844315559297, + "loss": 1.6957, + "step": 4789 + }, + { + "epoch": 0.5039452919516044, + "grad_norm": 1.907961368560791, + "learning_rate": 0.0001035943880163514, + "loss": 2.1228, + "step": 4790 + }, + { + "epoch": 0.5040504997369806, + "grad_norm": 1.6900379657745361, + "learning_rate": 0.00010356033245970536, + "loss": 2.1128, + "step": 4791 + }, + { + "epoch": 0.5041557075223566, + "grad_norm": 1.4977108240127563, + "learning_rate": 0.00010352627648960966, + "loss": 1.584, + "step": 4792 + }, + { + "epoch": 0.5042609153077328, + "grad_norm": 1.1327449083328247, + "learning_rate": 0.00010349222011001908, + "loss": 1.4022, + "step": 4793 + }, + { + "epoch": 0.5043661230931089, + "grad_norm": 1.6916382312774658, + "learning_rate": 0.0001034581633248885, + "loss": 1.8529, + "step": 4794 + }, + { + "epoch": 0.5044713308784851, + "grad_norm": 1.5584489107131958, + "learning_rate": 0.00010342410613817277, + "loss": 1.1992, + "step": 4795 + }, + { + "epoch": 0.5045765386638611, + "grad_norm": 1.77381432056427, + "learning_rate": 0.0001033900485538269, + "loss": 2.2351, + "step": 4796 + }, + { + "epoch": 0.5046817464492372, + "grad_norm": 1.6010788679122925, + "learning_rate": 0.00010335599057580583, + "loss": 1.8224, + "step": 4797 + }, + { + "epoch": 0.5047869542346134, + "grad_norm": 1.5694942474365234, + "learning_rate": 0.0001033219322080646, + "loss": 2.1278, + "step": 4798 + }, + { + "epoch": 0.5048921620199894, + "grad_norm": 1.6243647336959839, + "learning_rate": 0.00010328787345455837, + "loss": 1.913, + "step": 4799 + }, + { + "epoch": 0.5049973698053656, + "grad_norm": 2.164302110671997, + "learning_rate": 0.00010325381431924221, + "loss": 1.8029, + "step": 4800 + }, + { + "epoch": 0.5051025775907417, + "grad_norm": 0.8400641083717346, + "learning_rate": 0.00010321975480607129, + "loss": 1.3965, + "step": 4801 + }, + { + "epoch": 0.5052077853761179, + "grad_norm": 1.4535051584243774, + "learning_rate": 0.00010318569491900088, + "loss": 2.4847, + "step": 4802 + }, + { + "epoch": 0.5053129931614939, + "grad_norm": 1.4781876802444458, + "learning_rate": 0.00010315163466198616, + "loss": 1.9232, + "step": 4803 + }, + { + "epoch": 0.5054182009468701, + "grad_norm": 1.4223634004592896, + "learning_rate": 0.00010311757403898252, + "loss": 1.3876, + "step": 4804 + }, + { + "epoch": 0.5055234087322462, + "grad_norm": 2.039214849472046, + "learning_rate": 0.00010308351305394528, + "loss": 2.0856, + "step": 4805 + }, + { + "epoch": 0.5056286165176223, + "grad_norm": 1.5365289449691772, + "learning_rate": 0.0001030494517108298, + "loss": 1.9099, + "step": 4806 + }, + { + "epoch": 0.5057338243029984, + "grad_norm": 1.1576287746429443, + "learning_rate": 0.00010301539001359155, + "loss": 1.7402, + "step": 4807 + }, + { + "epoch": 0.5058390320883746, + "grad_norm": 1.7206387519836426, + "learning_rate": 0.00010298132796618596, + "loss": 1.9806, + "step": 4808 + }, + { + "epoch": 0.5059442398737507, + "grad_norm": 1.5616505146026611, + "learning_rate": 0.00010294726557256862, + "loss": 1.7111, + "step": 4809 + }, + { + "epoch": 0.5060494476591267, + "grad_norm": 1.6971865892410278, + "learning_rate": 0.00010291320283669499, + "loss": 2.0612, + "step": 4810 + }, + { + "epoch": 0.5061546554445029, + "grad_norm": 0.908016562461853, + "learning_rate": 0.0001028791397625207, + "loss": 2.0077, + "step": 4811 + }, + { + "epoch": 0.506259863229879, + "grad_norm": 0.8206803798675537, + "learning_rate": 0.00010284507635400142, + "loss": 1.7445, + "step": 4812 + }, + { + "epoch": 0.5063650710152551, + "grad_norm": 1.4862329959869385, + "learning_rate": 0.00010281101261509278, + "loss": 1.562, + "step": 4813 + }, + { + "epoch": 0.5064702788006312, + "grad_norm": 1.458016276359558, + "learning_rate": 0.00010277694854975051, + "loss": 1.0668, + "step": 4814 + }, + { + "epoch": 0.5065754865860074, + "grad_norm": 1.4686590433120728, + "learning_rate": 0.00010274288416193034, + "loss": 1.8496, + "step": 4815 + }, + { + "epoch": 0.5066806943713835, + "grad_norm": 1.2068148851394653, + "learning_rate": 0.00010270881945558808, + "loss": 2.0571, + "step": 4816 + }, + { + "epoch": 0.5067859021567596, + "grad_norm": 2.0828161239624023, + "learning_rate": 0.00010267475443467954, + "loss": 1.4989, + "step": 4817 + }, + { + "epoch": 0.5068911099421357, + "grad_norm": 1.2611733675003052, + "learning_rate": 0.00010264068910316055, + "loss": 1.7526, + "step": 4818 + }, + { + "epoch": 0.5069963177275119, + "grad_norm": 1.14738929271698, + "learning_rate": 0.00010260662346498703, + "loss": 2.0188, + "step": 4819 + }, + { + "epoch": 0.507101525512888, + "grad_norm": 1.0972294807434082, + "learning_rate": 0.00010257255752411495, + "loss": 1.8969, + "step": 4820 + }, + { + "epoch": 0.507206733298264, + "grad_norm": 1.5468823909759521, + "learning_rate": 0.0001025384912845002, + "loss": 1.5493, + "step": 4821 + }, + { + "epoch": 0.5073119410836402, + "grad_norm": 1.338287353515625, + "learning_rate": 0.0001025044247500988, + "loss": 1.9397, + "step": 4822 + }, + { + "epoch": 0.5074171488690163, + "grad_norm": 1.2431987524032593, + "learning_rate": 0.00010247035792486683, + "loss": 1.6526, + "step": 4823 + }, + { + "epoch": 0.5075223566543924, + "grad_norm": 3.7089953422546387, + "learning_rate": 0.00010243629081276031, + "loss": 2.1808, + "step": 4824 + }, + { + "epoch": 0.5076275644397685, + "grad_norm": 1.4418970346450806, + "learning_rate": 0.00010240222341773538, + "loss": 1.5014, + "step": 4825 + }, + { + "epoch": 0.5077327722251447, + "grad_norm": 1.5097178220748901, + "learning_rate": 0.00010236815574374816, + "loss": 1.9034, + "step": 4826 + }, + { + "epoch": 0.5078379800105208, + "grad_norm": 1.105833888053894, + "learning_rate": 0.00010233408779475482, + "loss": 1.6967, + "step": 4827 + }, + { + "epoch": 0.5079431877958969, + "grad_norm": 1.0300766229629517, + "learning_rate": 0.00010230001957471151, + "loss": 1.9713, + "step": 4828 + }, + { + "epoch": 0.508048395581273, + "grad_norm": 1.495307445526123, + "learning_rate": 0.00010226595108757451, + "loss": 1.7107, + "step": 4829 + }, + { + "epoch": 0.5081536033666492, + "grad_norm": 1.6375658512115479, + "learning_rate": 0.0001022318823373001, + "loss": 1.7321, + "step": 4830 + }, + { + "epoch": 0.5082588111520252, + "grad_norm": 1.7210208177566528, + "learning_rate": 0.00010219781332784451, + "loss": 1.7835, + "step": 4831 + }, + { + "epoch": 0.5083640189374014, + "grad_norm": 1.3353241682052612, + "learning_rate": 0.00010216374406316411, + "loss": 1.8193, + "step": 4832 + }, + { + "epoch": 0.5084692267227775, + "grad_norm": 1.2672992944717407, + "learning_rate": 0.00010212967454721523, + "loss": 2.0596, + "step": 4833 + }, + { + "epoch": 0.5085744345081536, + "grad_norm": 1.5790518522262573, + "learning_rate": 0.00010209560478395428, + "loss": 1.3901, + "step": 4834 + }, + { + "epoch": 0.5086796422935297, + "grad_norm": 1.039208173751831, + "learning_rate": 0.00010206153477733762, + "loss": 2.0043, + "step": 4835 + }, + { + "epoch": 0.5087848500789058, + "grad_norm": 1.4386025667190552, + "learning_rate": 0.00010202746453132172, + "loss": 1.807, + "step": 4836 + }, + { + "epoch": 0.508890057864282, + "grad_norm": 1.9724053144454956, + "learning_rate": 0.00010199339404986308, + "loss": 1.9082, + "step": 4837 + }, + { + "epoch": 0.508995265649658, + "grad_norm": 1.3470215797424316, + "learning_rate": 0.00010195932333691812, + "loss": 2.0544, + "step": 4838 + }, + { + "epoch": 0.5091004734350342, + "grad_norm": 2.0379178524017334, + "learning_rate": 0.0001019252523964434, + "loss": 1.5562, + "step": 4839 + }, + { + "epoch": 0.5092056812204103, + "grad_norm": 1.6759377717971802, + "learning_rate": 0.00010189118123239543, + "loss": 2.1569, + "step": 4840 + }, + { + "epoch": 0.5093108890057865, + "grad_norm": 1.4470930099487305, + "learning_rate": 0.00010185710984873084, + "loss": 2.3249, + "step": 4841 + }, + { + "epoch": 0.5094160967911625, + "grad_norm": 1.781419038772583, + "learning_rate": 0.0001018230382494062, + "loss": 2.0459, + "step": 4842 + }, + { + "epoch": 0.5095213045765387, + "grad_norm": 1.2743864059448242, + "learning_rate": 0.00010178896643837809, + "loss": 1.7856, + "step": 4843 + }, + { + "epoch": 0.5096265123619148, + "grad_norm": 1.4296742677688599, + "learning_rate": 0.00010175489441960327, + "loss": 1.943, + "step": 4844 + }, + { + "epoch": 0.5097317201472908, + "grad_norm": 2.150285005569458, + "learning_rate": 0.00010172082219703829, + "loss": 1.6565, + "step": 4845 + }, + { + "epoch": 0.509836927932667, + "grad_norm": 1.225191593170166, + "learning_rate": 0.0001016867497746399, + "loss": 1.8036, + "step": 4846 + }, + { + "epoch": 0.5099421357180431, + "grad_norm": 1.5737195014953613, + "learning_rate": 0.00010165267715636482, + "loss": 1.6792, + "step": 4847 + }, + { + "epoch": 0.5100473435034193, + "grad_norm": 1.7750451564788818, + "learning_rate": 0.00010161860434616982, + "loss": 1.3811, + "step": 4848 + }, + { + "epoch": 0.5101525512887953, + "grad_norm": 1.4361519813537598, + "learning_rate": 0.00010158453134801155, + "loss": 1.4103, + "step": 4849 + }, + { + "epoch": 0.5102577590741715, + "grad_norm": 1.3288068771362305, + "learning_rate": 0.00010155045816584691, + "loss": 2.1853, + "step": 4850 + }, + { + "epoch": 0.5103629668595476, + "grad_norm": 1.5870311260223389, + "learning_rate": 0.0001015163848036327, + "loss": 1.683, + "step": 4851 + }, + { + "epoch": 0.5104681746449238, + "grad_norm": 1.2659305334091187, + "learning_rate": 0.00010148231126532568, + "loss": 1.7193, + "step": 4852 + }, + { + "epoch": 0.5105733824302998, + "grad_norm": 1.333278775215149, + "learning_rate": 0.00010144823755488273, + "loss": 1.5757, + "step": 4853 + }, + { + "epoch": 0.510678590215676, + "grad_norm": 0.9039688110351562, + "learning_rate": 0.00010141416367626075, + "loss": 1.8356, + "step": 4854 + }, + { + "epoch": 0.5107837980010521, + "grad_norm": 0.9868873357772827, + "learning_rate": 0.00010138008963341657, + "loss": 1.6002, + "step": 4855 + }, + { + "epoch": 0.5108890057864282, + "grad_norm": 1.448551893234253, + "learning_rate": 0.00010134601543030713, + "loss": 2.0055, + "step": 4856 + }, + { + "epoch": 0.5109942135718043, + "grad_norm": 0.9456648230552673, + "learning_rate": 0.00010131194107088935, + "loss": 1.847, + "step": 4857 + }, + { + "epoch": 0.5110994213571805, + "grad_norm": 1.293251633644104, + "learning_rate": 0.00010127786655912021, + "loss": 1.235, + "step": 4858 + }, + { + "epoch": 0.5112046291425566, + "grad_norm": 1.0912578105926514, + "learning_rate": 0.00010124379189895661, + "loss": 2.0099, + "step": 4859 + }, + { + "epoch": 0.5113098369279326, + "grad_norm": 1.4057592153549194, + "learning_rate": 0.00010120971709435553, + "loss": 2.2696, + "step": 4860 + }, + { + "epoch": 0.5114150447133088, + "grad_norm": 0.9786197543144226, + "learning_rate": 0.000101175642149274, + "loss": 1.313, + "step": 4861 + }, + { + "epoch": 0.5115202524986849, + "grad_norm": 1.2093125581741333, + "learning_rate": 0.00010114156706766904, + "loss": 1.5894, + "step": 4862 + }, + { + "epoch": 0.511625460284061, + "grad_norm": 1.1187880039215088, + "learning_rate": 0.00010110749185349763, + "loss": 1.8596, + "step": 4863 + }, + { + "epoch": 0.5117306680694371, + "grad_norm": 1.3426660299301147, + "learning_rate": 0.00010107341651071684, + "loss": 1.6574, + "step": 4864 + }, + { + "epoch": 0.5118358758548133, + "grad_norm": 1.5116214752197266, + "learning_rate": 0.00010103934104328375, + "loss": 1.551, + "step": 4865 + }, + { + "epoch": 0.5119410836401894, + "grad_norm": 1.235568642616272, + "learning_rate": 0.00010100526545515539, + "loss": 1.8606, + "step": 4866 + }, + { + "epoch": 0.5120462914255655, + "grad_norm": 1.1877378225326538, + "learning_rate": 0.00010097118975028885, + "loss": 1.8721, + "step": 4867 + }, + { + "epoch": 0.5121514992109416, + "grad_norm": 1.012769103050232, + "learning_rate": 0.00010093711393264127, + "loss": 1.5946, + "step": 4868 + }, + { + "epoch": 0.5122567069963178, + "grad_norm": 1.3246477842330933, + "learning_rate": 0.00010090303800616974, + "loss": 1.3306, + "step": 4869 + }, + { + "epoch": 0.5123619147816938, + "grad_norm": 1.2476806640625, + "learning_rate": 0.00010086896197483136, + "loss": 1.8805, + "step": 4870 + }, + { + "epoch": 0.5124671225670699, + "grad_norm": 1.3036119937896729, + "learning_rate": 0.00010083488584258326, + "loss": 1.6369, + "step": 4871 + }, + { + "epoch": 0.5125723303524461, + "grad_norm": 1.2664161920547485, + "learning_rate": 0.00010080080961338265, + "loss": 1.5136, + "step": 4872 + }, + { + "epoch": 0.5126775381378222, + "grad_norm": 1.7038179636001587, + "learning_rate": 0.00010076673329118665, + "loss": 2.3886, + "step": 4873 + }, + { + "epoch": 0.5127827459231983, + "grad_norm": 1.4440287351608276, + "learning_rate": 0.00010073265687995243, + "loss": 1.9345, + "step": 4874 + }, + { + "epoch": 0.5128879537085744, + "grad_norm": 2.526855707168579, + "learning_rate": 0.0001006985803836372, + "loss": 1.3719, + "step": 4875 + }, + { + "epoch": 0.5129931614939506, + "grad_norm": 1.2801002264022827, + "learning_rate": 0.00010066450380619812, + "loss": 2.1389, + "step": 4876 + }, + { + "epoch": 0.5130983692793266, + "grad_norm": 1.681620478630066, + "learning_rate": 0.0001006304271515924, + "loss": 1.7829, + "step": 4877 + }, + { + "epoch": 0.5132035770647028, + "grad_norm": 1.5121160745620728, + "learning_rate": 0.00010059635042377725, + "loss": 2.1326, + "step": 4878 + }, + { + "epoch": 0.5133087848500789, + "grad_norm": 1.5406997203826904, + "learning_rate": 0.00010056227362670989, + "loss": 2.1427, + "step": 4879 + }, + { + "epoch": 0.5134139926354551, + "grad_norm": 1.1765246391296387, + "learning_rate": 0.00010052819676434754, + "loss": 1.8298, + "step": 4880 + }, + { + "epoch": 0.5135192004208311, + "grad_norm": 1.1417006254196167, + "learning_rate": 0.00010049411984064745, + "loss": 1.2332, + "step": 4881 + }, + { + "epoch": 0.5136244082062073, + "grad_norm": 0.9688271880149841, + "learning_rate": 0.00010046004285956684, + "loss": 1.8887, + "step": 4882 + }, + { + "epoch": 0.5137296159915834, + "grad_norm": 1.804365634918213, + "learning_rate": 0.00010042596582506298, + "loss": 1.6797, + "step": 4883 + }, + { + "epoch": 0.5138348237769595, + "grad_norm": 1.8476359844207764, + "learning_rate": 0.00010039188874109308, + "loss": 1.5778, + "step": 4884 + }, + { + "epoch": 0.5139400315623356, + "grad_norm": 1.6497973203659058, + "learning_rate": 0.00010035781161161446, + "loss": 1.8296, + "step": 4885 + }, + { + "epoch": 0.5140452393477117, + "grad_norm": 1.3590152263641357, + "learning_rate": 0.00010032373444058437, + "loss": 2.2775, + "step": 4886 + }, + { + "epoch": 0.5141504471330879, + "grad_norm": 1.5917162895202637, + "learning_rate": 0.00010028965723196002, + "loss": 2.0477, + "step": 4887 + }, + { + "epoch": 0.5142556549184639, + "grad_norm": 2.074033737182617, + "learning_rate": 0.00010025557998969875, + "loss": 1.708, + "step": 4888 + }, + { + "epoch": 0.5143608627038401, + "grad_norm": 1.4734264612197876, + "learning_rate": 0.00010022150271775783, + "loss": 1.8895, + "step": 4889 + }, + { + "epoch": 0.5144660704892162, + "grad_norm": 0.9418952465057373, + "learning_rate": 0.00010018742542009452, + "loss": 1.4237, + "step": 4890 + }, + { + "epoch": 0.5145712782745924, + "grad_norm": 1.5608463287353516, + "learning_rate": 0.00010015334810066612, + "loss": 1.7448, + "step": 4891 + }, + { + "epoch": 0.5146764860599684, + "grad_norm": 1.5609275102615356, + "learning_rate": 0.0001001192707634299, + "loss": 1.8851, + "step": 4892 + }, + { + "epoch": 0.5147816938453446, + "grad_norm": 1.515660047531128, + "learning_rate": 0.00010008519341234318, + "loss": 2.3635, + "step": 4893 + }, + { + "epoch": 0.5148869016307207, + "grad_norm": 1.6261802911758423, + "learning_rate": 0.00010005111605136319, + "loss": 1.8002, + "step": 4894 + }, + { + "epoch": 0.5149921094160967, + "grad_norm": 1.6840312480926514, + "learning_rate": 0.00010001703868444728, + "loss": 1.8234, + "step": 4895 + }, + { + "epoch": 0.5150973172014729, + "grad_norm": 1.0442866086959839, + "learning_rate": 9.998296131555273e-05, + "loss": 1.7835, + "step": 4896 + }, + { + "epoch": 0.515202524986849, + "grad_norm": 1.02167546749115, + "learning_rate": 9.994888394863683e-05, + "loss": 1.5165, + "step": 4897 + }, + { + "epoch": 0.5153077327722252, + "grad_norm": 1.9780964851379395, + "learning_rate": 9.991480658765685e-05, + "loss": 1.7533, + "step": 4898 + }, + { + "epoch": 0.5154129405576012, + "grad_norm": 2.685133695602417, + "learning_rate": 9.988072923657012e-05, + "loss": 2.181, + "step": 4899 + }, + { + "epoch": 0.5155181483429774, + "grad_norm": 1.0838717222213745, + "learning_rate": 9.98466518993339e-05, + "loss": 1.7872, + "step": 4900 + }, + { + "epoch": 0.5156233561283535, + "grad_norm": 2.0336694717407227, + "learning_rate": 9.981257457990548e-05, + "loss": 1.6419, + "step": 4901 + }, + { + "epoch": 0.5157285639137296, + "grad_norm": 1.6287788152694702, + "learning_rate": 9.977849728224219e-05, + "loss": 2.189, + "step": 4902 + }, + { + "epoch": 0.5158337716991057, + "grad_norm": 1.2882606983184814, + "learning_rate": 9.974442001030125e-05, + "loss": 1.7009, + "step": 4903 + }, + { + "epoch": 0.5159389794844819, + "grad_norm": 1.8696643114089966, + "learning_rate": 9.971034276803998e-05, + "loss": 2.0965, + "step": 4904 + }, + { + "epoch": 0.516044187269858, + "grad_norm": 1.5908252000808716, + "learning_rate": 9.967626555941564e-05, + "loss": 1.727, + "step": 4905 + }, + { + "epoch": 0.516149395055234, + "grad_norm": 1.1613487005233765, + "learning_rate": 9.964218838838554e-05, + "loss": 2.1126, + "step": 4906 + }, + { + "epoch": 0.5162546028406102, + "grad_norm": 1.0229045152664185, + "learning_rate": 9.960811125890695e-05, + "loss": 2.3455, + "step": 4907 + }, + { + "epoch": 0.5163598106259863, + "grad_norm": 1.273767113685608, + "learning_rate": 9.957403417493707e-05, + "loss": 1.6472, + "step": 4908 + }, + { + "epoch": 0.5164650184113624, + "grad_norm": 1.6049546003341675, + "learning_rate": 9.953995714043319e-05, + "loss": 1.6844, + "step": 4909 + }, + { + "epoch": 0.5165702261967385, + "grad_norm": 1.1211811304092407, + "learning_rate": 9.95058801593526e-05, + "loss": 2.0708, + "step": 4910 + }, + { + "epoch": 0.5166754339821147, + "grad_norm": 1.2839888334274292, + "learning_rate": 9.94718032356525e-05, + "loss": 1.8021, + "step": 4911 + }, + { + "epoch": 0.5167806417674908, + "grad_norm": 1.3963242769241333, + "learning_rate": 9.943772637329015e-05, + "loss": 2.2216, + "step": 4912 + }, + { + "epoch": 0.5168858495528669, + "grad_norm": 1.2890313863754272, + "learning_rate": 9.940364957622276e-05, + "loss": 1.7626, + "step": 4913 + }, + { + "epoch": 0.516991057338243, + "grad_norm": 1.017500400543213, + "learning_rate": 9.936957284840763e-05, + "loss": 1.8122, + "step": 4914 + }, + { + "epoch": 0.5170962651236192, + "grad_norm": 1.0217024087905884, + "learning_rate": 9.93354961938019e-05, + "loss": 2.2297, + "step": 4915 + }, + { + "epoch": 0.5172014729089953, + "grad_norm": 1.5468822717666626, + "learning_rate": 9.93014196163628e-05, + "loss": 2.0883, + "step": 4916 + }, + { + "epoch": 0.5173066806943714, + "grad_norm": 1.8743644952774048, + "learning_rate": 9.926734312004759e-05, + "loss": 1.6563, + "step": 4917 + }, + { + "epoch": 0.5174118884797475, + "grad_norm": 1.372307538986206, + "learning_rate": 9.923326670881336e-05, + "loss": 1.7315, + "step": 4918 + }, + { + "epoch": 0.5175170962651237, + "grad_norm": 1.082401156425476, + "learning_rate": 9.919919038661736e-05, + "loss": 1.8861, + "step": 4919 + }, + { + "epoch": 0.5176223040504997, + "grad_norm": 1.2021406888961792, + "learning_rate": 9.916511415741676e-05, + "loss": 2.2311, + "step": 4920 + }, + { + "epoch": 0.5177275118358758, + "grad_norm": 1.4951965808868408, + "learning_rate": 9.913103802516868e-05, + "loss": 2.1998, + "step": 4921 + }, + { + "epoch": 0.517832719621252, + "grad_norm": 1.098451852798462, + "learning_rate": 9.90969619938303e-05, + "loss": 2.01, + "step": 4922 + }, + { + "epoch": 0.5179379274066281, + "grad_norm": 1.2300734519958496, + "learning_rate": 9.906288606735875e-05, + "loss": 1.8119, + "step": 4923 + }, + { + "epoch": 0.5180431351920042, + "grad_norm": 1.2791273593902588, + "learning_rate": 9.902881024971116e-05, + "loss": 1.8411, + "step": 4924 + }, + { + "epoch": 0.5181483429773803, + "grad_norm": 1.747589111328125, + "learning_rate": 9.899473454484461e-05, + "loss": 1.8983, + "step": 4925 + }, + { + "epoch": 0.5182535507627565, + "grad_norm": 1.192456841468811, + "learning_rate": 9.896065895671625e-05, + "loss": 1.4597, + "step": 4926 + }, + { + "epoch": 0.5183587585481325, + "grad_norm": 1.400451898574829, + "learning_rate": 9.892658348928316e-05, + "loss": 2.15, + "step": 4927 + }, + { + "epoch": 0.5184639663335087, + "grad_norm": 1.4197369813919067, + "learning_rate": 9.88925081465024e-05, + "loss": 1.6032, + "step": 4928 + }, + { + "epoch": 0.5185691741188848, + "grad_norm": 1.509582757949829, + "learning_rate": 9.8858432932331e-05, + "loss": 1.9569, + "step": 4929 + }, + { + "epoch": 0.518674381904261, + "grad_norm": 0.9736523628234863, + "learning_rate": 9.882435785072601e-05, + "loss": 1.6721, + "step": 4930 + }, + { + "epoch": 0.518779589689637, + "grad_norm": 1.5283832550048828, + "learning_rate": 9.87902829056445e-05, + "loss": 2.0695, + "step": 4931 + }, + { + "epoch": 0.5188847974750131, + "grad_norm": 1.4057093858718872, + "learning_rate": 9.875620810104344e-05, + "loss": 1.4839, + "step": 4932 + }, + { + "epoch": 0.5189900052603893, + "grad_norm": 1.614888310432434, + "learning_rate": 9.872213344087983e-05, + "loss": 1.7947, + "step": 4933 + }, + { + "epoch": 0.5190952130457653, + "grad_norm": 1.5216361284255981, + "learning_rate": 9.868805892911067e-05, + "loss": 1.3341, + "step": 4934 + }, + { + "epoch": 0.5192004208311415, + "grad_norm": 1.41942298412323, + "learning_rate": 9.86539845696929e-05, + "loss": 1.8061, + "step": 4935 + }, + { + "epoch": 0.5193056286165176, + "grad_norm": 1.3126589059829712, + "learning_rate": 9.861991036658345e-05, + "loss": 2.092, + "step": 4936 + }, + { + "epoch": 0.5194108364018938, + "grad_norm": 2.1999170780181885, + "learning_rate": 9.858583632373927e-05, + "loss": 1.9388, + "step": 4937 + }, + { + "epoch": 0.5195160441872698, + "grad_norm": 1.6807349920272827, + "learning_rate": 9.85517624451173e-05, + "loss": 1.3968, + "step": 4938 + }, + { + "epoch": 0.519621251972646, + "grad_norm": 1.121988296508789, + "learning_rate": 9.851768873467435e-05, + "loss": 1.8882, + "step": 4939 + }, + { + "epoch": 0.5197264597580221, + "grad_norm": 1.7207212448120117, + "learning_rate": 9.848361519636733e-05, + "loss": 1.5875, + "step": 4940 + }, + { + "epoch": 0.5198316675433982, + "grad_norm": 1.8505074977874756, + "learning_rate": 9.84495418341531e-05, + "loss": 2.0285, + "step": 4941 + }, + { + "epoch": 0.5199368753287743, + "grad_norm": 1.3527370691299438, + "learning_rate": 9.841546865198846e-05, + "loss": 1.8778, + "step": 4942 + }, + { + "epoch": 0.5200420831141505, + "grad_norm": 1.1984349489212036, + "learning_rate": 9.838139565383022e-05, + "loss": 1.9808, + "step": 4943 + }, + { + "epoch": 0.5201472908995266, + "grad_norm": 1.9260704517364502, + "learning_rate": 9.834732284363519e-05, + "loss": 1.8141, + "step": 4944 + }, + { + "epoch": 0.5202524986849026, + "grad_norm": 1.8520371913909912, + "learning_rate": 9.83132502253601e-05, + "loss": 2.1056, + "step": 4945 + }, + { + "epoch": 0.5203577064702788, + "grad_norm": 1.291551113128662, + "learning_rate": 9.827917780296172e-05, + "loss": 1.7853, + "step": 4946 + }, + { + "epoch": 0.5204629142556549, + "grad_norm": 1.5085182189941406, + "learning_rate": 9.824510558039675e-05, + "loss": 1.8278, + "step": 4947 + }, + { + "epoch": 0.5205681220410311, + "grad_norm": 1.3979451656341553, + "learning_rate": 9.821103356162189e-05, + "loss": 1.2452, + "step": 4948 + }, + { + "epoch": 0.5206733298264071, + "grad_norm": 1.249040961265564, + "learning_rate": 9.817696175059381e-05, + "loss": 1.2894, + "step": 4949 + }, + { + "epoch": 0.5207785376117833, + "grad_norm": 1.443249225616455, + "learning_rate": 9.814289015126919e-05, + "loss": 1.8148, + "step": 4950 + }, + { + "epoch": 0.5208837453971594, + "grad_norm": 1.2137818336486816, + "learning_rate": 9.81088187676046e-05, + "loss": 2.0707, + "step": 4951 + }, + { + "epoch": 0.5209889531825355, + "grad_norm": 1.0801608562469482, + "learning_rate": 9.807474760355665e-05, + "loss": 1.5743, + "step": 4952 + }, + { + "epoch": 0.5210941609679116, + "grad_norm": 1.206287145614624, + "learning_rate": 9.804067666308192e-05, + "loss": 2.013, + "step": 4953 + }, + { + "epoch": 0.5211993687532878, + "grad_norm": 1.502138376235962, + "learning_rate": 9.800660595013696e-05, + "loss": 1.8673, + "step": 4954 + }, + { + "epoch": 0.5213045765386639, + "grad_norm": 1.6293638944625854, + "learning_rate": 9.797253546867831e-05, + "loss": 2.0488, + "step": 4955 + }, + { + "epoch": 0.52140978432404, + "grad_norm": 1.6563540697097778, + "learning_rate": 9.79384652226624e-05, + "loss": 1.6133, + "step": 4956 + }, + { + "epoch": 0.5215149921094161, + "grad_norm": 1.4221042394638062, + "learning_rate": 9.790439521604574e-05, + "loss": 1.7739, + "step": 4957 + }, + { + "epoch": 0.5216201998947922, + "grad_norm": 1.3993518352508545, + "learning_rate": 9.78703254527848e-05, + "loss": 1.2957, + "step": 4958 + }, + { + "epoch": 0.5217254076801683, + "grad_norm": 1.2816994190216064, + "learning_rate": 9.783625593683592e-05, + "loss": 1.9906, + "step": 4959 + }, + { + "epoch": 0.5218306154655444, + "grad_norm": 1.2781740427017212, + "learning_rate": 9.78021866721555e-05, + "loss": 1.5746, + "step": 4960 + }, + { + "epoch": 0.5219358232509206, + "grad_norm": 1.2438249588012695, + "learning_rate": 9.776811766269993e-05, + "loss": 1.978, + "step": 4961 + }, + { + "epoch": 0.5220410310362967, + "grad_norm": 2.31378436088562, + "learning_rate": 9.773404891242551e-05, + "loss": 1.8975, + "step": 4962 + }, + { + "epoch": 0.5221462388216728, + "grad_norm": 1.96122407913208, + "learning_rate": 9.769998042528852e-05, + "loss": 2.2517, + "step": 4963 + }, + { + "epoch": 0.5222514466070489, + "grad_norm": 1.7919749021530151, + "learning_rate": 9.766591220524521e-05, + "loss": 1.7129, + "step": 4964 + }, + { + "epoch": 0.5223566543924251, + "grad_norm": 1.509630799293518, + "learning_rate": 9.763184425625186e-05, + "loss": 1.589, + "step": 4965 + }, + { + "epoch": 0.5224618621778011, + "grad_norm": 1.3050771951675415, + "learning_rate": 9.759777658226462e-05, + "loss": 1.8745, + "step": 4966 + }, + { + "epoch": 0.5225670699631773, + "grad_norm": 2.0204238891601562, + "learning_rate": 9.756370918723968e-05, + "loss": 2.0122, + "step": 4967 + }, + { + "epoch": 0.5226722777485534, + "grad_norm": 1.1685175895690918, + "learning_rate": 9.752964207513318e-05, + "loss": 1.5934, + "step": 4968 + }, + { + "epoch": 0.5227774855339296, + "grad_norm": 2.0005977153778076, + "learning_rate": 9.749557524990121e-05, + "loss": 1.8383, + "step": 4969 + }, + { + "epoch": 0.5228826933193056, + "grad_norm": 1.3754639625549316, + "learning_rate": 9.746150871549981e-05, + "loss": 2.2025, + "step": 4970 + }, + { + "epoch": 0.5229879011046817, + "grad_norm": 1.4466034173965454, + "learning_rate": 9.742744247588512e-05, + "loss": 1.8922, + "step": 4971 + }, + { + "epoch": 0.5230931088900579, + "grad_norm": 1.596903920173645, + "learning_rate": 9.739337653501299e-05, + "loss": 1.8912, + "step": 4972 + }, + { + "epoch": 0.5231983166754339, + "grad_norm": 1.1891504526138306, + "learning_rate": 9.73593108968395e-05, + "loss": 1.5827, + "step": 4973 + }, + { + "epoch": 0.5233035244608101, + "grad_norm": 1.6135132312774658, + "learning_rate": 9.732524556532051e-05, + "loss": 2.2273, + "step": 4974 + }, + { + "epoch": 0.5234087322461862, + "grad_norm": 2.420681953430176, + "learning_rate": 9.729118054441194e-05, + "loss": 1.9405, + "step": 4975 + }, + { + "epoch": 0.5235139400315624, + "grad_norm": 1.0372477769851685, + "learning_rate": 9.72571158380697e-05, + "loss": 2.0549, + "step": 4976 + }, + { + "epoch": 0.5236191478169384, + "grad_norm": 1.2813150882720947, + "learning_rate": 9.722305145024951e-05, + "loss": 1.3903, + "step": 4977 + }, + { + "epoch": 0.5237243556023146, + "grad_norm": 1.1346882581710815, + "learning_rate": 9.718898738490723e-05, + "loss": 2.2729, + "step": 4978 + }, + { + "epoch": 0.5238295633876907, + "grad_norm": 1.6237995624542236, + "learning_rate": 9.71549236459986e-05, + "loss": 1.7977, + "step": 4979 + }, + { + "epoch": 0.5239347711730669, + "grad_norm": 1.280985713005066, + "learning_rate": 9.71208602374793e-05, + "loss": 1.543, + "step": 4980 + }, + { + "epoch": 0.5240399789584429, + "grad_norm": 1.655838131904602, + "learning_rate": 9.708679716330504e-05, + "loss": 2.5013, + "step": 4981 + }, + { + "epoch": 0.524145186743819, + "grad_norm": 1.0946263074874878, + "learning_rate": 9.705273442743142e-05, + "loss": 1.923, + "step": 4982 + }, + { + "epoch": 0.5242503945291952, + "grad_norm": 1.1894264221191406, + "learning_rate": 9.701867203381405e-05, + "loss": 1.7739, + "step": 4983 + }, + { + "epoch": 0.5243556023145712, + "grad_norm": 1.1565569639205933, + "learning_rate": 9.698460998640848e-05, + "loss": 1.5141, + "step": 4984 + }, + { + "epoch": 0.5244608100999474, + "grad_norm": 1.2816983461380005, + "learning_rate": 9.695054828917021e-05, + "loss": 1.6513, + "step": 4985 + }, + { + "epoch": 0.5245660178853235, + "grad_norm": 1.1288788318634033, + "learning_rate": 9.691648694605475e-05, + "loss": 1.9599, + "step": 4986 + }, + { + "epoch": 0.5246712256706997, + "grad_norm": 2.1675658226013184, + "learning_rate": 9.688242596101749e-05, + "loss": 1.8789, + "step": 4987 + }, + { + "epoch": 0.5247764334560757, + "grad_norm": 1.5547009706497192, + "learning_rate": 9.684836533801383e-05, + "loss": 1.6302, + "step": 4988 + }, + { + "epoch": 0.5248816412414519, + "grad_norm": 1.4101799726486206, + "learning_rate": 9.681430508099916e-05, + "loss": 1.6675, + "step": 4989 + }, + { + "epoch": 0.524986849026828, + "grad_norm": 1.8517366647720337, + "learning_rate": 9.678024519392871e-05, + "loss": 1.555, + "step": 4990 + }, + { + "epoch": 0.525092056812204, + "grad_norm": 1.7405954599380493, + "learning_rate": 9.67461856807578e-05, + "loss": 2.156, + "step": 4991 + }, + { + "epoch": 0.5251972645975802, + "grad_norm": 1.4130207300186157, + "learning_rate": 9.671212654544167e-05, + "loss": 1.2812, + "step": 4992 + }, + { + "epoch": 0.5253024723829564, + "grad_norm": 0.9680867791175842, + "learning_rate": 9.667806779193541e-05, + "loss": 1.6768, + "step": 4993 + }, + { + "epoch": 0.5254076801683325, + "grad_norm": 1.184627890586853, + "learning_rate": 9.664400942419423e-05, + "loss": 1.4777, + "step": 4994 + }, + { + "epoch": 0.5255128879537085, + "grad_norm": 1.430696964263916, + "learning_rate": 9.660995144617316e-05, + "loss": 1.8958, + "step": 4995 + }, + { + "epoch": 0.5256180957390847, + "grad_norm": 1.450882911682129, + "learning_rate": 9.657589386182725e-05, + "loss": 2.0249, + "step": 4996 + }, + { + "epoch": 0.5257233035244608, + "grad_norm": 1.3977291584014893, + "learning_rate": 9.654183667511154e-05, + "loss": 1.873, + "step": 4997 + }, + { + "epoch": 0.5258285113098369, + "grad_norm": 1.272571325302124, + "learning_rate": 9.650777988998093e-05, + "loss": 2.1763, + "step": 4998 + }, + { + "epoch": 0.525933719095213, + "grad_norm": 1.565454125404358, + "learning_rate": 9.647372351039035e-05, + "loss": 2.0786, + "step": 4999 + }, + { + "epoch": 0.5260389268805892, + "grad_norm": 1.4186714887619019, + "learning_rate": 9.643966754029466e-05, + "loss": 1.9929, + "step": 5000 + }, + { + "epoch": 0.5261441346659653, + "grad_norm": 1.9095489978790283, + "learning_rate": 9.640561198364864e-05, + "loss": 1.466, + "step": 5001 + }, + { + "epoch": 0.5262493424513414, + "grad_norm": 1.268289566040039, + "learning_rate": 9.637155684440705e-05, + "loss": 1.3982, + "step": 5002 + }, + { + "epoch": 0.5263545502367175, + "grad_norm": 1.090356469154358, + "learning_rate": 9.633750212652465e-05, + "loss": 2.1568, + "step": 5003 + }, + { + "epoch": 0.5264597580220937, + "grad_norm": 2.0192060470581055, + "learning_rate": 9.630344783395604e-05, + "loss": 1.8322, + "step": 5004 + }, + { + "epoch": 0.5265649658074697, + "grad_norm": 0.9348209500312805, + "learning_rate": 9.626939397065586e-05, + "loss": 1.8744, + "step": 5005 + }, + { + "epoch": 0.5266701735928458, + "grad_norm": 1.5496996641159058, + "learning_rate": 9.623534054057868e-05, + "loss": 1.703, + "step": 5006 + }, + { + "epoch": 0.526775381378222, + "grad_norm": 1.0677669048309326, + "learning_rate": 9.620128754767904e-05, + "loss": 1.8331, + "step": 5007 + }, + { + "epoch": 0.5268805891635981, + "grad_norm": 1.6514785289764404, + "learning_rate": 9.616723499591131e-05, + "loss": 1.8713, + "step": 5008 + }, + { + "epoch": 0.5269857969489742, + "grad_norm": 1.1393558979034424, + "learning_rate": 9.613318288922999e-05, + "loss": 1.6698, + "step": 5009 + }, + { + "epoch": 0.5270910047343503, + "grad_norm": 1.6217390298843384, + "learning_rate": 9.609913123158941e-05, + "loss": 1.7692, + "step": 5010 + }, + { + "epoch": 0.5271962125197265, + "grad_norm": 1.725000023841858, + "learning_rate": 9.606508002694386e-05, + "loss": 1.7788, + "step": 5011 + }, + { + "epoch": 0.5273014203051026, + "grad_norm": 1.360186219215393, + "learning_rate": 9.603102927924762e-05, + "loss": 1.9891, + "step": 5012 + }, + { + "epoch": 0.5274066280904787, + "grad_norm": 1.6628353595733643, + "learning_rate": 9.59969789924549e-05, + "loss": 1.7478, + "step": 5013 + }, + { + "epoch": 0.5275118358758548, + "grad_norm": 1.2454009056091309, + "learning_rate": 9.596292917051985e-05, + "loss": 1.744, + "step": 5014 + }, + { + "epoch": 0.527617043661231, + "grad_norm": 1.4857524633407593, + "learning_rate": 9.592887981739648e-05, + "loss": 1.5022, + "step": 5015 + }, + { + "epoch": 0.527722251446607, + "grad_norm": 1.3809852600097656, + "learning_rate": 9.58948309370389e-05, + "loss": 1.8917, + "step": 5016 + }, + { + "epoch": 0.5278274592319832, + "grad_norm": 0.873028039932251, + "learning_rate": 9.58607825334011e-05, + "loss": 1.7126, + "step": 5017 + }, + { + "epoch": 0.5279326670173593, + "grad_norm": 1.1375359296798706, + "learning_rate": 9.5826734610437e-05, + "loss": 2.3039, + "step": 5018 + }, + { + "epoch": 0.5280378748027355, + "grad_norm": 1.4480020999908447, + "learning_rate": 9.579268717210045e-05, + "loss": 1.6948, + "step": 5019 + }, + { + "epoch": 0.5281430825881115, + "grad_norm": 1.870962381362915, + "learning_rate": 9.575864022234527e-05, + "loss": 1.8897, + "step": 5020 + }, + { + "epoch": 0.5282482903734876, + "grad_norm": 1.3602070808410645, + "learning_rate": 9.572459376512528e-05, + "loss": 2.2099, + "step": 5021 + }, + { + "epoch": 0.5283534981588638, + "grad_norm": 1.2275303602218628, + "learning_rate": 9.56905478043941e-05, + "loss": 1.9035, + "step": 5022 + }, + { + "epoch": 0.5284587059442398, + "grad_norm": 1.2830156087875366, + "learning_rate": 9.565650234410542e-05, + "loss": 1.6825, + "step": 5023 + }, + { + "epoch": 0.528563913729616, + "grad_norm": 1.4303338527679443, + "learning_rate": 9.562245738821285e-05, + "loss": 2.0779, + "step": 5024 + }, + { + "epoch": 0.5286691215149921, + "grad_norm": 1.9997981786727905, + "learning_rate": 9.558841294066985e-05, + "loss": 1.9707, + "step": 5025 + }, + { + "epoch": 0.5287743293003683, + "grad_norm": 1.8480234146118164, + "learning_rate": 9.555436900542993e-05, + "loss": 1.656, + "step": 5026 + }, + { + "epoch": 0.5288795370857443, + "grad_norm": 1.244927167892456, + "learning_rate": 9.552032558644654e-05, + "loss": 2.1726, + "step": 5027 + }, + { + "epoch": 0.5289847448711205, + "grad_norm": 1.005563497543335, + "learning_rate": 9.548628268767294e-05, + "loss": 2.0138, + "step": 5028 + }, + { + "epoch": 0.5290899526564966, + "grad_norm": 1.155867576599121, + "learning_rate": 9.545224031306249e-05, + "loss": 1.6304, + "step": 5029 + }, + { + "epoch": 0.5291951604418726, + "grad_norm": 1.5326203107833862, + "learning_rate": 9.541819846656839e-05, + "loss": 1.7073, + "step": 5030 + }, + { + "epoch": 0.5293003682272488, + "grad_norm": 1.6995837688446045, + "learning_rate": 9.538415715214383e-05, + "loss": 1.8962, + "step": 5031 + }, + { + "epoch": 0.529405576012625, + "grad_norm": 1.8225377798080444, + "learning_rate": 9.535011637374189e-05, + "loss": 2.0861, + "step": 5032 + }, + { + "epoch": 0.5295107837980011, + "grad_norm": 1.634724736213684, + "learning_rate": 9.53160761353156e-05, + "loss": 1.4762, + "step": 5033 + }, + { + "epoch": 0.5296159915833771, + "grad_norm": 0.9659777879714966, + "learning_rate": 9.528203644081801e-05, + "loss": 1.0181, + "step": 5034 + }, + { + "epoch": 0.5297211993687533, + "grad_norm": 1.0323597192764282, + "learning_rate": 9.5247997294202e-05, + "loss": 1.7988, + "step": 5035 + }, + { + "epoch": 0.5298264071541294, + "grad_norm": 1.1700800657272339, + "learning_rate": 9.521395869942039e-05, + "loss": 1.8157, + "step": 5036 + }, + { + "epoch": 0.5299316149395055, + "grad_norm": 2.2417099475860596, + "learning_rate": 9.517992066042598e-05, + "loss": 2.1183, + "step": 5037 + }, + { + "epoch": 0.5300368227248816, + "grad_norm": 1.6578564643859863, + "learning_rate": 9.514588318117152e-05, + "loss": 1.5281, + "step": 5038 + }, + { + "epoch": 0.5301420305102578, + "grad_norm": 1.3541558980941772, + "learning_rate": 9.511184626560968e-05, + "loss": 2.1024, + "step": 5039 + }, + { + "epoch": 0.5302472382956339, + "grad_norm": 1.1925020217895508, + "learning_rate": 9.507780991769302e-05, + "loss": 1.6266, + "step": 5040 + }, + { + "epoch": 0.53035244608101, + "grad_norm": 2.338503360748291, + "learning_rate": 9.504377414137407e-05, + "loss": 2.1032, + "step": 5041 + }, + { + "epoch": 0.5304576538663861, + "grad_norm": 1.6318100690841675, + "learning_rate": 9.500973894060534e-05, + "loss": 1.9588, + "step": 5042 + }, + { + "epoch": 0.5305628616517623, + "grad_norm": 1.629852533340454, + "learning_rate": 9.497570431933917e-05, + "loss": 1.8528, + "step": 5043 + }, + { + "epoch": 0.5306680694371384, + "grad_norm": 1.3625423908233643, + "learning_rate": 9.494167028152792e-05, + "loss": 2.5084, + "step": 5044 + }, + { + "epoch": 0.5307732772225144, + "grad_norm": 1.6486890316009521, + "learning_rate": 9.490763683112386e-05, + "loss": 1.3986, + "step": 5045 + }, + { + "epoch": 0.5308784850078906, + "grad_norm": 1.887699007987976, + "learning_rate": 9.487360397207916e-05, + "loss": 1.9863, + "step": 5046 + }, + { + "epoch": 0.5309836927932667, + "grad_norm": 1.5402462482452393, + "learning_rate": 9.483957170834593e-05, + "loss": 2.2311, + "step": 5047 + }, + { + "epoch": 0.5310889005786428, + "grad_norm": 1.120913028717041, + "learning_rate": 9.480554004387627e-05, + "loss": 1.6302, + "step": 5048 + }, + { + "epoch": 0.5311941083640189, + "grad_norm": 1.8108731508255005, + "learning_rate": 9.477150898262213e-05, + "loss": 1.8564, + "step": 5049 + }, + { + "epoch": 0.5312993161493951, + "grad_norm": 0.9820263385772705, + "learning_rate": 9.473747852853543e-05, + "loss": 2.0829, + "step": 5050 + }, + { + "epoch": 0.5314045239347712, + "grad_norm": 2.3030552864074707, + "learning_rate": 9.4703448685568e-05, + "loss": 1.5117, + "step": 5051 + }, + { + "epoch": 0.5315097317201473, + "grad_norm": 1.2382352352142334, + "learning_rate": 9.466941945767168e-05, + "loss": 1.4252, + "step": 5052 + }, + { + "epoch": 0.5316149395055234, + "grad_norm": 1.1163996458053589, + "learning_rate": 9.463539084879809e-05, + "loss": 1.8226, + "step": 5053 + }, + { + "epoch": 0.5317201472908996, + "grad_norm": 1.8982148170471191, + "learning_rate": 9.46013628628989e-05, + "loss": 1.6211, + "step": 5054 + }, + { + "epoch": 0.5318253550762756, + "grad_norm": 1.5627444982528687, + "learning_rate": 9.456733550392568e-05, + "loss": 1.846, + "step": 5055 + }, + { + "epoch": 0.5319305628616517, + "grad_norm": 3.1989519596099854, + "learning_rate": 9.453330877582988e-05, + "loss": 1.5141, + "step": 5056 + }, + { + "epoch": 0.5320357706470279, + "grad_norm": 1.2844103574752808, + "learning_rate": 9.449928268256299e-05, + "loss": 1.4024, + "step": 5057 + }, + { + "epoch": 0.532140978432404, + "grad_norm": 2.4570162296295166, + "learning_rate": 9.446525722807623e-05, + "loss": 1.7243, + "step": 5058 + }, + { + "epoch": 0.5322461862177801, + "grad_norm": 1.4320532083511353, + "learning_rate": 9.443123241632093e-05, + "loss": 1.5919, + "step": 5059 + }, + { + "epoch": 0.5323513940031562, + "grad_norm": 1.297221302986145, + "learning_rate": 9.439720825124827e-05, + "loss": 1.6516, + "step": 5060 + }, + { + "epoch": 0.5324566017885324, + "grad_norm": 1.2610849142074585, + "learning_rate": 9.436318473680936e-05, + "loss": 1.7887, + "step": 5061 + }, + { + "epoch": 0.5325618095739084, + "grad_norm": 1.0165259838104248, + "learning_rate": 9.432916187695525e-05, + "loss": 1.8535, + "step": 5062 + }, + { + "epoch": 0.5326670173592846, + "grad_norm": 1.0816254615783691, + "learning_rate": 9.42951396756369e-05, + "loss": 1.93, + "step": 5063 + }, + { + "epoch": 0.5327722251446607, + "grad_norm": 1.453770637512207, + "learning_rate": 9.42611181368052e-05, + "loss": 1.601, + "step": 5064 + }, + { + "epoch": 0.5328774329300369, + "grad_norm": 1.5351864099502563, + "learning_rate": 9.422709726441094e-05, + "loss": 2.3019, + "step": 5065 + }, + { + "epoch": 0.5329826407154129, + "grad_norm": 1.867706537246704, + "learning_rate": 9.419307706240489e-05, + "loss": 2.1457, + "step": 5066 + }, + { + "epoch": 0.533087848500789, + "grad_norm": 2.1947357654571533, + "learning_rate": 9.415905753473765e-05, + "loss": 1.4276, + "step": 5067 + }, + { + "epoch": 0.5331930562861652, + "grad_norm": 1.1833242177963257, + "learning_rate": 9.412503868535983e-05, + "loss": 1.9144, + "step": 5068 + }, + { + "epoch": 0.5332982640715412, + "grad_norm": 1.284825325012207, + "learning_rate": 9.409102051822195e-05, + "loss": 2.1141, + "step": 5069 + }, + { + "epoch": 0.5334034718569174, + "grad_norm": 1.165414571762085, + "learning_rate": 9.405700303727435e-05, + "loss": 1.8646, + "step": 5070 + }, + { + "epoch": 0.5335086796422935, + "grad_norm": 1.4519579410552979, + "learning_rate": 9.402298624646744e-05, + "loss": 2.0232, + "step": 5071 + }, + { + "epoch": 0.5336138874276697, + "grad_norm": 1.0977115631103516, + "learning_rate": 9.398897014975149e-05, + "loss": 1.4565, + "step": 5072 + }, + { + "epoch": 0.5337190952130457, + "grad_norm": 1.6109386682510376, + "learning_rate": 9.39549547510766e-05, + "loss": 2.0598, + "step": 5073 + }, + { + "epoch": 0.5338243029984219, + "grad_norm": 1.9979568719863892, + "learning_rate": 9.392094005439291e-05, + "loss": 2.0561, + "step": 5074 + }, + { + "epoch": 0.533929510783798, + "grad_norm": 1.9197112321853638, + "learning_rate": 9.388692606365043e-05, + "loss": 1.81, + "step": 5075 + }, + { + "epoch": 0.5340347185691742, + "grad_norm": 1.5256714820861816, + "learning_rate": 9.385291278279914e-05, + "loss": 1.4039, + "step": 5076 + }, + { + "epoch": 0.5341399263545502, + "grad_norm": 1.7796401977539062, + "learning_rate": 9.381890021578881e-05, + "loss": 2.1833, + "step": 5077 + }, + { + "epoch": 0.5342451341399264, + "grad_norm": 1.145056962966919, + "learning_rate": 9.37848883665693e-05, + "loss": 1.7855, + "step": 5078 + }, + { + "epoch": 0.5343503419253025, + "grad_norm": 2.008789300918579, + "learning_rate": 9.375087723909017e-05, + "loss": 1.6486, + "step": 5079 + }, + { + "epoch": 0.5344555497106785, + "grad_norm": 1.488021731376648, + "learning_rate": 9.371686683730113e-05, + "loss": 1.9833, + "step": 5080 + }, + { + "epoch": 0.5345607574960547, + "grad_norm": 1.6001816987991333, + "learning_rate": 9.368285716515162e-05, + "loss": 1.4451, + "step": 5081 + }, + { + "epoch": 0.5346659652814308, + "grad_norm": 1.6554683446884155, + "learning_rate": 9.36488482265911e-05, + "loss": 2.2062, + "step": 5082 + }, + { + "epoch": 0.534771173066807, + "grad_norm": 1.8193860054016113, + "learning_rate": 9.361484002556898e-05, + "loss": 1.4615, + "step": 5083 + }, + { + "epoch": 0.534876380852183, + "grad_norm": 1.3823652267456055, + "learning_rate": 9.35808325660344e-05, + "loss": 1.9892, + "step": 5084 + }, + { + "epoch": 0.5349815886375592, + "grad_norm": 1.5087515115737915, + "learning_rate": 9.354682585193662e-05, + "loss": 1.9344, + "step": 5085 + }, + { + "epoch": 0.5350867964229353, + "grad_norm": 1.761411190032959, + "learning_rate": 9.351281988722469e-05, + "loss": 1.7704, + "step": 5086 + }, + { + "epoch": 0.5351920042083114, + "grad_norm": 1.332608938217163, + "learning_rate": 9.347881467584764e-05, + "loss": 1.6883, + "step": 5087 + }, + { + "epoch": 0.5352972119936875, + "grad_norm": 1.6421012878417969, + "learning_rate": 9.344481022175436e-05, + "loss": 1.832, + "step": 5088 + }, + { + "epoch": 0.5354024197790637, + "grad_norm": 1.594934344291687, + "learning_rate": 9.34108065288937e-05, + "loss": 1.6753, + "step": 5089 + }, + { + "epoch": 0.5355076275644398, + "grad_norm": 1.367395043373108, + "learning_rate": 9.337680360121436e-05, + "loss": 1.1994, + "step": 5090 + }, + { + "epoch": 0.5356128353498159, + "grad_norm": 1.6289167404174805, + "learning_rate": 9.334280144266501e-05, + "loss": 1.8281, + "step": 5091 + }, + { + "epoch": 0.535718043135192, + "grad_norm": 2.1067452430725098, + "learning_rate": 9.330880005719422e-05, + "loss": 1.4705, + "step": 5092 + }, + { + "epoch": 0.5358232509205682, + "grad_norm": 1.5518765449523926, + "learning_rate": 9.327479944875045e-05, + "loss": 2.2224, + "step": 5093 + }, + { + "epoch": 0.5359284587059442, + "grad_norm": 1.0815091133117676, + "learning_rate": 9.324079962128207e-05, + "loss": 1.8141, + "step": 5094 + }, + { + "epoch": 0.5360336664913203, + "grad_norm": 1.461266279220581, + "learning_rate": 9.320680057873735e-05, + "loss": 2.0772, + "step": 5095 + }, + { + "epoch": 0.5361388742766965, + "grad_norm": 1.2924128770828247, + "learning_rate": 9.317280232506454e-05, + "loss": 1.8573, + "step": 5096 + }, + { + "epoch": 0.5362440820620726, + "grad_norm": 1.1167140007019043, + "learning_rate": 9.31388048642117e-05, + "loss": 2.0632, + "step": 5097 + }, + { + "epoch": 0.5363492898474487, + "grad_norm": 1.1252926588058472, + "learning_rate": 9.310480820012684e-05, + "loss": 2.7284, + "step": 5098 + }, + { + "epoch": 0.5364544976328248, + "grad_norm": 1.0790119171142578, + "learning_rate": 9.307081233675791e-05, + "loss": 1.9891, + "step": 5099 + }, + { + "epoch": 0.536559705418201, + "grad_norm": 1.325523018836975, + "learning_rate": 9.303681727805276e-05, + "loss": 2.269, + "step": 5100 + }, + { + "epoch": 0.536664913203577, + "grad_norm": 1.8128808736801147, + "learning_rate": 9.300282302795909e-05, + "loss": 1.2854, + "step": 5101 + }, + { + "epoch": 0.5367701209889532, + "grad_norm": 2.0237619876861572, + "learning_rate": 9.29688295904245e-05, + "loss": 1.7467, + "step": 5102 + }, + { + "epoch": 0.5368753287743293, + "grad_norm": 1.2668648958206177, + "learning_rate": 9.293483696939658e-05, + "loss": 1.72, + "step": 5103 + }, + { + "epoch": 0.5369805365597055, + "grad_norm": 1.171208143234253, + "learning_rate": 9.290084516882281e-05, + "loss": 1.9828, + "step": 5104 + }, + { + "epoch": 0.5370857443450815, + "grad_norm": 1.4353476762771606, + "learning_rate": 9.286685419265048e-05, + "loss": 1.9993, + "step": 5105 + }, + { + "epoch": 0.5371909521304576, + "grad_norm": 1.5309464931488037, + "learning_rate": 9.283286404482688e-05, + "loss": 2.002, + "step": 5106 + }, + { + "epoch": 0.5372961599158338, + "grad_norm": 1.001417636871338, + "learning_rate": 9.27988747292992e-05, + "loss": 2.2841, + "step": 5107 + }, + { + "epoch": 0.5374013677012099, + "grad_norm": 1.408814549446106, + "learning_rate": 9.276488625001448e-05, + "loss": 1.5659, + "step": 5108 + }, + { + "epoch": 0.537506575486586, + "grad_norm": 1.94085693359375, + "learning_rate": 9.273089861091969e-05, + "loss": 1.3672, + "step": 5109 + }, + { + "epoch": 0.5376117832719621, + "grad_norm": 1.7884807586669922, + "learning_rate": 9.269691181596169e-05, + "loss": 1.5813, + "step": 5110 + }, + { + "epoch": 0.5377169910573383, + "grad_norm": 1.3529471158981323, + "learning_rate": 9.266292586908732e-05, + "loss": 2.0667, + "step": 5111 + }, + { + "epoch": 0.5378221988427143, + "grad_norm": 0.8547163009643555, + "learning_rate": 9.262894077424317e-05, + "loss": 1.8774, + "step": 5112 + }, + { + "epoch": 0.5379274066280905, + "grad_norm": 1.3882497549057007, + "learning_rate": 9.259495653537586e-05, + "loss": 1.9437, + "step": 5113 + }, + { + "epoch": 0.5380326144134666, + "grad_norm": 1.9616116285324097, + "learning_rate": 9.256097315643188e-05, + "loss": 1.5366, + "step": 5114 + }, + { + "epoch": 0.5381378221988428, + "grad_norm": 0.89047771692276, + "learning_rate": 9.252699064135758e-05, + "loss": 2.124, + "step": 5115 + }, + { + "epoch": 0.5382430299842188, + "grad_norm": 1.3593010902404785, + "learning_rate": 9.249300899409924e-05, + "loss": 1.9311, + "step": 5116 + }, + { + "epoch": 0.538348237769595, + "grad_norm": 1.1994686126708984, + "learning_rate": 9.245902821860308e-05, + "loss": 1.8233, + "step": 5117 + }, + { + "epoch": 0.5384534455549711, + "grad_norm": 1.4323614835739136, + "learning_rate": 9.24250483188151e-05, + "loss": 1.5137, + "step": 5118 + }, + { + "epoch": 0.5385586533403471, + "grad_norm": 1.0706384181976318, + "learning_rate": 9.239106929868133e-05, + "loss": 1.4401, + "step": 5119 + }, + { + "epoch": 0.5386638611257233, + "grad_norm": 1.6682357788085938, + "learning_rate": 9.235709116214764e-05, + "loss": 1.8276, + "step": 5120 + }, + { + "epoch": 0.5387690689110994, + "grad_norm": 1.3352079391479492, + "learning_rate": 9.232311391315979e-05, + "loss": 2.0025, + "step": 5121 + }, + { + "epoch": 0.5388742766964756, + "grad_norm": 1.127171516418457, + "learning_rate": 9.228913755566344e-05, + "loss": 1.959, + "step": 5122 + }, + { + "epoch": 0.5389794844818516, + "grad_norm": 0.859086275100708, + "learning_rate": 9.225516209360413e-05, + "loss": 1.5351, + "step": 5123 + }, + { + "epoch": 0.5390846922672278, + "grad_norm": 1.3062644004821777, + "learning_rate": 9.222118753092735e-05, + "loss": 1.8726, + "step": 5124 + }, + { + "epoch": 0.5391899000526039, + "grad_norm": 1.8840057849884033, + "learning_rate": 9.218721387157846e-05, + "loss": 1.8806, + "step": 5125 + }, + { + "epoch": 0.53929510783798, + "grad_norm": 1.518869400024414, + "learning_rate": 9.215324111950267e-05, + "loss": 1.5398, + "step": 5126 + }, + { + "epoch": 0.5394003156233561, + "grad_norm": 1.1188104152679443, + "learning_rate": 9.211926927864518e-05, + "loss": 1.5614, + "step": 5127 + }, + { + "epoch": 0.5395055234087323, + "grad_norm": 1.0320559740066528, + "learning_rate": 9.208529835295098e-05, + "loss": 1.6083, + "step": 5128 + }, + { + "epoch": 0.5396107311941084, + "grad_norm": 2.2655322551727295, + "learning_rate": 9.205132834636502e-05, + "loss": 1.5238, + "step": 5129 + }, + { + "epoch": 0.5397159389794844, + "grad_norm": 1.1405690908432007, + "learning_rate": 9.201735926283213e-05, + "loss": 2.1654, + "step": 5130 + }, + { + "epoch": 0.5398211467648606, + "grad_norm": 1.9452557563781738, + "learning_rate": 9.198339110629701e-05, + "loss": 1.5046, + "step": 5131 + }, + { + "epoch": 0.5399263545502367, + "grad_norm": 1.15932035446167, + "learning_rate": 9.194942388070431e-05, + "loss": 1.6429, + "step": 5132 + }, + { + "epoch": 0.5400315623356128, + "grad_norm": 1.0874872207641602, + "learning_rate": 9.191545758999848e-05, + "loss": 1.3977, + "step": 5133 + }, + { + "epoch": 0.5401367701209889, + "grad_norm": 1.428829550743103, + "learning_rate": 9.188149223812393e-05, + "loss": 1.8418, + "step": 5134 + }, + { + "epoch": 0.5402419779063651, + "grad_norm": 1.535487174987793, + "learning_rate": 9.1847527829025e-05, + "loss": 1.6133, + "step": 5135 + }, + { + "epoch": 0.5403471856917412, + "grad_norm": 1.295906662940979, + "learning_rate": 9.181356436664578e-05, + "loss": 1.9259, + "step": 5136 + }, + { + "epoch": 0.5404523934771173, + "grad_norm": 0.8215249180793762, + "learning_rate": 9.177960185493036e-05, + "loss": 1.8883, + "step": 5137 + }, + { + "epoch": 0.5405576012624934, + "grad_norm": 1.4424512386322021, + "learning_rate": 9.174564029782275e-05, + "loss": 1.9409, + "step": 5138 + }, + { + "epoch": 0.5406628090478696, + "grad_norm": 1.4258966445922852, + "learning_rate": 9.171167969926672e-05, + "loss": 1.5655, + "step": 5139 + }, + { + "epoch": 0.5407680168332457, + "grad_norm": 1.681448221206665, + "learning_rate": 9.167772006320604e-05, + "loss": 1.6488, + "step": 5140 + }, + { + "epoch": 0.5408732246186218, + "grad_norm": 1.6626986265182495, + "learning_rate": 9.164376139358433e-05, + "loss": 1.7347, + "step": 5141 + }, + { + "epoch": 0.5409784324039979, + "grad_norm": 1.1256407499313354, + "learning_rate": 9.16098036943451e-05, + "loss": 2.2423, + "step": 5142 + }, + { + "epoch": 0.541083640189374, + "grad_norm": 2.0072197914123535, + "learning_rate": 9.157584696943175e-05, + "loss": 1.2106, + "step": 5143 + }, + { + "epoch": 0.5411888479747501, + "grad_norm": 1.7380682229995728, + "learning_rate": 9.154189122278754e-05, + "loss": 1.436, + "step": 5144 + }, + { + "epoch": 0.5412940557601262, + "grad_norm": 1.4622247219085693, + "learning_rate": 9.150793645835562e-05, + "loss": 1.0282, + "step": 5145 + }, + { + "epoch": 0.5413992635455024, + "grad_norm": 1.5467033386230469, + "learning_rate": 9.147398268007912e-05, + "loss": 1.4557, + "step": 5146 + }, + { + "epoch": 0.5415044713308785, + "grad_norm": 1.8523279428482056, + "learning_rate": 9.14400298919009e-05, + "loss": 1.7551, + "step": 5147 + }, + { + "epoch": 0.5416096791162546, + "grad_norm": 2.7987358570098877, + "learning_rate": 9.140607809776382e-05, + "loss": 1.4688, + "step": 5148 + }, + { + "epoch": 0.5417148869016307, + "grad_norm": 1.0487982034683228, + "learning_rate": 9.137212730161062e-05, + "loss": 2.1586, + "step": 5149 + }, + { + "epoch": 0.5418200946870069, + "grad_norm": 1.263087272644043, + "learning_rate": 9.133817750738384e-05, + "loss": 1.4165, + "step": 5150 + }, + { + "epoch": 0.5419253024723829, + "grad_norm": 1.5688707828521729, + "learning_rate": 9.1304228719026e-05, + "loss": 1.8595, + "step": 5151 + }, + { + "epoch": 0.5420305102577591, + "grad_norm": 1.5955549478530884, + "learning_rate": 9.127028094047944e-05, + "loss": 1.722, + "step": 5152 + }, + { + "epoch": 0.5421357180431352, + "grad_norm": 1.5457823276519775, + "learning_rate": 9.123633417568641e-05, + "loss": 1.8036, + "step": 5153 + }, + { + "epoch": 0.5422409258285114, + "grad_norm": 2.240226984024048, + "learning_rate": 9.120238842858903e-05, + "loss": 1.8054, + "step": 5154 + }, + { + "epoch": 0.5423461336138874, + "grad_norm": 1.7893513441085815, + "learning_rate": 9.11684437031293e-05, + "loss": 1.8167, + "step": 5155 + }, + { + "epoch": 0.5424513413992635, + "grad_norm": 1.6553772687911987, + "learning_rate": 9.113450000324914e-05, + "loss": 1.9113, + "step": 5156 + }, + { + "epoch": 0.5425565491846397, + "grad_norm": 1.1440198421478271, + "learning_rate": 9.110055733289029e-05, + "loss": 1.7322, + "step": 5157 + }, + { + "epoch": 0.5426617569700157, + "grad_norm": 1.0650477409362793, + "learning_rate": 9.106661569599442e-05, + "loss": 1.9446, + "step": 5158 + }, + { + "epoch": 0.5427669647553919, + "grad_norm": 1.3577461242675781, + "learning_rate": 9.103267509650305e-05, + "loss": 2.0171, + "step": 5159 + }, + { + "epoch": 0.542872172540768, + "grad_norm": 1.0990657806396484, + "learning_rate": 9.099873553835758e-05, + "loss": 1.5223, + "step": 5160 + }, + { + "epoch": 0.5429773803261442, + "grad_norm": 1.6959264278411865, + "learning_rate": 9.096479702549933e-05, + "loss": 2.3626, + "step": 5161 + }, + { + "epoch": 0.5430825881115202, + "grad_norm": 1.6704100370407104, + "learning_rate": 9.093085956186945e-05, + "loss": 2.2676, + "step": 5162 + }, + { + "epoch": 0.5431877958968964, + "grad_norm": 2.997234582901001, + "learning_rate": 9.089692315140896e-05, + "loss": 1.617, + "step": 5163 + }, + { + "epoch": 0.5432930036822725, + "grad_norm": 1.61349356174469, + "learning_rate": 9.086298779805887e-05, + "loss": 1.7279, + "step": 5164 + }, + { + "epoch": 0.5433982114676486, + "grad_norm": 1.6692132949829102, + "learning_rate": 9.082905350575986e-05, + "loss": 1.9036, + "step": 5165 + }, + { + "epoch": 0.5435034192530247, + "grad_norm": 1.7874563932418823, + "learning_rate": 9.079512027845268e-05, + "loss": 2.4346, + "step": 5166 + }, + { + "epoch": 0.5436086270384008, + "grad_norm": 1.3825801610946655, + "learning_rate": 9.076118812007789e-05, + "loss": 1.3331, + "step": 5167 + }, + { + "epoch": 0.543713834823777, + "grad_norm": 1.6000703573226929, + "learning_rate": 9.072725703457587e-05, + "loss": 1.7898, + "step": 5168 + }, + { + "epoch": 0.543819042609153, + "grad_norm": 1.600172996520996, + "learning_rate": 9.069332702588698e-05, + "loss": 1.4787, + "step": 5169 + }, + { + "epoch": 0.5439242503945292, + "grad_norm": 1.7228039503097534, + "learning_rate": 9.065939809795137e-05, + "loss": 2.2927, + "step": 5170 + }, + { + "epoch": 0.5440294581799053, + "grad_norm": 1.3640235662460327, + "learning_rate": 9.062547025470908e-05, + "loss": 1.8261, + "step": 5171 + }, + { + "epoch": 0.5441346659652815, + "grad_norm": 1.3940327167510986, + "learning_rate": 9.059154350010008e-05, + "loss": 1.997, + "step": 5172 + }, + { + "epoch": 0.5442398737506575, + "grad_norm": 1.7132809162139893, + "learning_rate": 9.055761783806416e-05, + "loss": 2.0858, + "step": 5173 + }, + { + "epoch": 0.5443450815360337, + "grad_norm": 1.438615322113037, + "learning_rate": 9.052369327254098e-05, + "loss": 1.7468, + "step": 5174 + }, + { + "epoch": 0.5444502893214098, + "grad_norm": 1.4938368797302246, + "learning_rate": 9.048976980747008e-05, + "loss": 1.7274, + "step": 5175 + }, + { + "epoch": 0.5445554971067859, + "grad_norm": 1.2716962099075317, + "learning_rate": 9.045584744679092e-05, + "loss": 1.5497, + "step": 5176 + }, + { + "epoch": 0.544660704892162, + "grad_norm": 1.7363189458847046, + "learning_rate": 9.042192619444275e-05, + "loss": 1.6291, + "step": 5177 + }, + { + "epoch": 0.5447659126775382, + "grad_norm": 1.5554457902908325, + "learning_rate": 9.038800605436475e-05, + "loss": 1.8769, + "step": 5178 + }, + { + "epoch": 0.5448711204629143, + "grad_norm": 1.222988247871399, + "learning_rate": 9.035408703049596e-05, + "loss": 1.9806, + "step": 5179 + }, + { + "epoch": 0.5449763282482903, + "grad_norm": 1.5716575384140015, + "learning_rate": 9.03201691267753e-05, + "loss": 2.0254, + "step": 5180 + }, + { + "epoch": 0.5450815360336665, + "grad_norm": 1.1745754480361938, + "learning_rate": 9.02862523471415e-05, + "loss": 2.1139, + "step": 5181 + }, + { + "epoch": 0.5451867438190426, + "grad_norm": 0.8382359147071838, + "learning_rate": 9.025233669553322e-05, + "loss": 1.5266, + "step": 5182 + }, + { + "epoch": 0.5452919516044187, + "grad_norm": 1.7974534034729004, + "learning_rate": 9.021842217588901e-05, + "loss": 1.7337, + "step": 5183 + }, + { + "epoch": 0.5453971593897948, + "grad_norm": 1.5042651891708374, + "learning_rate": 9.018450879214721e-05, + "loss": 2.0298, + "step": 5184 + }, + { + "epoch": 0.545502367175171, + "grad_norm": 1.2993311882019043, + "learning_rate": 9.015059654824611e-05, + "loss": 1.4338, + "step": 5185 + }, + { + "epoch": 0.5456075749605471, + "grad_norm": 1.5049631595611572, + "learning_rate": 9.011668544812377e-05, + "loss": 1.6596, + "step": 5186 + }, + { + "epoch": 0.5457127827459232, + "grad_norm": 1.4581938982009888, + "learning_rate": 9.00827754957182e-05, + "loss": 1.8009, + "step": 5187 + }, + { + "epoch": 0.5458179905312993, + "grad_norm": 1.5323725938796997, + "learning_rate": 9.004886669496728e-05, + "loss": 1.6577, + "step": 5188 + }, + { + "epoch": 0.5459231983166755, + "grad_norm": 1.0720787048339844, + "learning_rate": 9.001495904980867e-05, + "loss": 1.7775, + "step": 5189 + }, + { + "epoch": 0.5460284061020515, + "grad_norm": 1.4327290058135986, + "learning_rate": 8.998105256418e-05, + "loss": 1.7773, + "step": 5190 + }, + { + "epoch": 0.5461336138874276, + "grad_norm": 1.6901955604553223, + "learning_rate": 8.99471472420187e-05, + "loss": 1.8843, + "step": 5191 + }, + { + "epoch": 0.5462388216728038, + "grad_norm": 2.1214852333068848, + "learning_rate": 8.991324308726209e-05, + "loss": 2.1451, + "step": 5192 + }, + { + "epoch": 0.54634402945818, + "grad_norm": 1.875383734703064, + "learning_rate": 8.987934010384733e-05, + "loss": 1.6287, + "step": 5193 + }, + { + "epoch": 0.546449237243556, + "grad_norm": 1.438472867012024, + "learning_rate": 8.984543829571151e-05, + "loss": 1.7141, + "step": 5194 + }, + { + "epoch": 0.5465544450289321, + "grad_norm": 1.697603464126587, + "learning_rate": 8.981153766679149e-05, + "loss": 1.9385, + "step": 5195 + }, + { + "epoch": 0.5466596528143083, + "grad_norm": 1.5435723066329956, + "learning_rate": 8.977763822102404e-05, + "loss": 1.7889, + "step": 5196 + }, + { + "epoch": 0.5467648605996843, + "grad_norm": 1.3071941137313843, + "learning_rate": 8.97437399623458e-05, + "loss": 1.8384, + "step": 5197 + }, + { + "epoch": 0.5468700683850605, + "grad_norm": 1.1430269479751587, + "learning_rate": 8.970984289469327e-05, + "loss": 1.4976, + "step": 5198 + }, + { + "epoch": 0.5469752761704366, + "grad_norm": 2.691263198852539, + "learning_rate": 8.96759470220028e-05, + "loss": 1.39, + "step": 5199 + }, + { + "epoch": 0.5470804839558128, + "grad_norm": 1.5296295881271362, + "learning_rate": 8.96420523482106e-05, + "loss": 1.8981, + "step": 5200 + }, + { + "epoch": 0.5471856917411888, + "grad_norm": 2.23455810546875, + "learning_rate": 8.960815887725278e-05, + "loss": 1.573, + "step": 5201 + }, + { + "epoch": 0.547290899526565, + "grad_norm": 2.9234347343444824, + "learning_rate": 8.957426661306522e-05, + "loss": 1.4707, + "step": 5202 + }, + { + "epoch": 0.5473961073119411, + "grad_norm": 1.3259090185165405, + "learning_rate": 8.954037555958376e-05, + "loss": 1.2053, + "step": 5203 + }, + { + "epoch": 0.5475013150973173, + "grad_norm": 1.4656827449798584, + "learning_rate": 8.950648572074405e-05, + "loss": 1.3973, + "step": 5204 + }, + { + "epoch": 0.5476065228826933, + "grad_norm": 1.52262282371521, + "learning_rate": 8.947259710048158e-05, + "loss": 2.1761, + "step": 5205 + }, + { + "epoch": 0.5477117306680694, + "grad_norm": 1.1430667638778687, + "learning_rate": 8.943870970273174e-05, + "loss": 1.4505, + "step": 5206 + }, + { + "epoch": 0.5478169384534456, + "grad_norm": 1.6592637300491333, + "learning_rate": 8.940482353142983e-05, + "loss": 1.9043, + "step": 5207 + }, + { + "epoch": 0.5479221462388216, + "grad_norm": 1.875982642173767, + "learning_rate": 8.937093859051083e-05, + "loss": 1.9452, + "step": 5208 + }, + { + "epoch": 0.5480273540241978, + "grad_norm": 1.242350697517395, + "learning_rate": 8.933705488390972e-05, + "loss": 1.7839, + "step": 5209 + }, + { + "epoch": 0.5481325618095739, + "grad_norm": 1.8692058324813843, + "learning_rate": 8.930317241556132e-05, + "loss": 1.6776, + "step": 5210 + }, + { + "epoch": 0.5482377695949501, + "grad_norm": 1.910408854484558, + "learning_rate": 8.926929118940026e-05, + "loss": 1.588, + "step": 5211 + }, + { + "epoch": 0.5483429773803261, + "grad_norm": 1.608375906944275, + "learning_rate": 8.923541120936111e-05, + "loss": 1.525, + "step": 5212 + }, + { + "epoch": 0.5484481851657023, + "grad_norm": 1.786906123161316, + "learning_rate": 8.92015324793782e-05, + "loss": 1.5707, + "step": 5213 + }, + { + "epoch": 0.5485533929510784, + "grad_norm": 1.4942151308059692, + "learning_rate": 8.916765500338575e-05, + "loss": 2.0173, + "step": 5214 + }, + { + "epoch": 0.5486586007364544, + "grad_norm": 1.761673092842102, + "learning_rate": 8.913377878531789e-05, + "loss": 1.9173, + "step": 5215 + }, + { + "epoch": 0.5487638085218306, + "grad_norm": 1.1698923110961914, + "learning_rate": 8.909990382910849e-05, + "loss": 1.9928, + "step": 5216 + }, + { + "epoch": 0.5488690163072067, + "grad_norm": 1.3080923557281494, + "learning_rate": 8.906603013869136e-05, + "loss": 2.0938, + "step": 5217 + }, + { + "epoch": 0.5489742240925829, + "grad_norm": 1.443345546722412, + "learning_rate": 8.903215771800017e-05, + "loss": 2.204, + "step": 5218 + }, + { + "epoch": 0.5490794318779589, + "grad_norm": 1.6220980882644653, + "learning_rate": 8.899828657096838e-05, + "loss": 1.4677, + "step": 5219 + }, + { + "epoch": 0.5491846396633351, + "grad_norm": 2.169625759124756, + "learning_rate": 8.896441670152932e-05, + "loss": 1.6339, + "step": 5220 + }, + { + "epoch": 0.5492898474487112, + "grad_norm": 0.9750475287437439, + "learning_rate": 8.893054811361624e-05, + "loss": 1.76, + "step": 5221 + }, + { + "epoch": 0.5493950552340873, + "grad_norm": 1.8625184297561646, + "learning_rate": 8.889668081116214e-05, + "loss": 1.6533, + "step": 5222 + }, + { + "epoch": 0.5495002630194634, + "grad_norm": 1.3964723348617554, + "learning_rate": 8.886281479809993e-05, + "loss": 2.0727, + "step": 5223 + }, + { + "epoch": 0.5496054708048396, + "grad_norm": 1.3396962881088257, + "learning_rate": 8.882895007836236e-05, + "loss": 1.5633, + "step": 5224 + }, + { + "epoch": 0.5497106785902157, + "grad_norm": 1.373002290725708, + "learning_rate": 8.879508665588206e-05, + "loss": 1.6469, + "step": 5225 + }, + { + "epoch": 0.5498158863755918, + "grad_norm": 0.9774206876754761, + "learning_rate": 8.876122453459143e-05, + "loss": 1.7567, + "step": 5226 + }, + { + "epoch": 0.5499210941609679, + "grad_norm": 1.5650739669799805, + "learning_rate": 8.872736371842279e-05, + "loss": 1.8958, + "step": 5227 + }, + { + "epoch": 0.550026301946344, + "grad_norm": 1.5855236053466797, + "learning_rate": 8.869350421130831e-05, + "loss": 2.1006, + "step": 5228 + }, + { + "epoch": 0.5501315097317201, + "grad_norm": 1.8107531070709229, + "learning_rate": 8.865964601717994e-05, + "loss": 1.9863, + "step": 5229 + }, + { + "epoch": 0.5502367175170962, + "grad_norm": 1.2894846200942993, + "learning_rate": 8.862578913996952e-05, + "loss": 1.749, + "step": 5230 + }, + { + "epoch": 0.5503419253024724, + "grad_norm": 1.1580836772918701, + "learning_rate": 8.859193358360874e-05, + "loss": 1.3304, + "step": 5231 + }, + { + "epoch": 0.5504471330878485, + "grad_norm": 1.6148262023925781, + "learning_rate": 8.855807935202915e-05, + "loss": 1.5428, + "step": 5232 + }, + { + "epoch": 0.5505523408732246, + "grad_norm": 1.7478631734848022, + "learning_rate": 8.852422644916216e-05, + "loss": 1.6134, + "step": 5233 + }, + { + "epoch": 0.5506575486586007, + "grad_norm": 1.809343934059143, + "learning_rate": 8.849037487893893e-05, + "loss": 2.3462, + "step": 5234 + }, + { + "epoch": 0.5507627564439769, + "grad_norm": 1.5731574296951294, + "learning_rate": 8.845652464529057e-05, + "loss": 1.7138, + "step": 5235 + }, + { + "epoch": 0.550867964229353, + "grad_norm": 1.3270047903060913, + "learning_rate": 8.842267575214802e-05, + "loss": 2.0084, + "step": 5236 + }, + { + "epoch": 0.5509731720147291, + "grad_norm": 1.761651873588562, + "learning_rate": 8.838882820344198e-05, + "loss": 1.4156, + "step": 5237 + }, + { + "epoch": 0.5510783798001052, + "grad_norm": 1.6372829675674438, + "learning_rate": 8.835498200310309e-05, + "loss": 2.1276, + "step": 5238 + }, + { + "epoch": 0.5511835875854814, + "grad_norm": 1.144127368927002, + "learning_rate": 8.832113715506181e-05, + "loss": 1.574, + "step": 5239 + }, + { + "epoch": 0.5512887953708574, + "grad_norm": 1.8449068069458008, + "learning_rate": 8.82872936632484e-05, + "loss": 1.67, + "step": 5240 + }, + { + "epoch": 0.5513940031562335, + "grad_norm": 1.1163779497146606, + "learning_rate": 8.825345153159301e-05, + "loss": 1.6902, + "step": 5241 + }, + { + "epoch": 0.5514992109416097, + "grad_norm": 1.2700437307357788, + "learning_rate": 8.821961076402563e-05, + "loss": 2.3529, + "step": 5242 + }, + { + "epoch": 0.5516044187269858, + "grad_norm": 1.2813799381256104, + "learning_rate": 8.818577136447603e-05, + "loss": 1.6812, + "step": 5243 + }, + { + "epoch": 0.5517096265123619, + "grad_norm": 1.321010947227478, + "learning_rate": 8.815193333687391e-05, + "loss": 1.7545, + "step": 5244 + }, + { + "epoch": 0.551814834297738, + "grad_norm": 1.3672521114349365, + "learning_rate": 8.811809668514878e-05, + "loss": 1.5022, + "step": 5245 + }, + { + "epoch": 0.5519200420831142, + "grad_norm": 1.245273470878601, + "learning_rate": 8.808426141322994e-05, + "loss": 1.6858, + "step": 5246 + }, + { + "epoch": 0.5520252498684902, + "grad_norm": 2.0327329635620117, + "learning_rate": 8.805042752504656e-05, + "loss": 1.8182, + "step": 5247 + }, + { + "epoch": 0.5521304576538664, + "grad_norm": 1.9399306774139404, + "learning_rate": 8.801659502452769e-05, + "loss": 1.8135, + "step": 5248 + }, + { + "epoch": 0.5522356654392425, + "grad_norm": 1.4941593408584595, + "learning_rate": 8.79827639156022e-05, + "loss": 2.0688, + "step": 5249 + }, + { + "epoch": 0.5523408732246187, + "grad_norm": 1.5331664085388184, + "learning_rate": 8.794893420219881e-05, + "loss": 2.1115, + "step": 5250 + }, + { + "epoch": 0.5524460810099947, + "grad_norm": 2.205839157104492, + "learning_rate": 8.791510588824594e-05, + "loss": 1.3747, + "step": 5251 + }, + { + "epoch": 0.5525512887953709, + "grad_norm": 1.1780728101730347, + "learning_rate": 8.788127897767204e-05, + "loss": 1.7812, + "step": 5252 + }, + { + "epoch": 0.552656496580747, + "grad_norm": 1.1612539291381836, + "learning_rate": 8.784745347440533e-05, + "loss": 2.0561, + "step": 5253 + }, + { + "epoch": 0.552761704366123, + "grad_norm": 1.275850534439087, + "learning_rate": 8.78136293823738e-05, + "loss": 1.6603, + "step": 5254 + }, + { + "epoch": 0.5528669121514992, + "grad_norm": 1.7113317251205444, + "learning_rate": 8.777980670550536e-05, + "loss": 1.4027, + "step": 5255 + }, + { + "epoch": 0.5529721199368753, + "grad_norm": 1.0461790561676025, + "learning_rate": 8.774598544772774e-05, + "loss": 2.2595, + "step": 5256 + }, + { + "epoch": 0.5530773277222515, + "grad_norm": 1.1092756986618042, + "learning_rate": 8.771216561296849e-05, + "loss": 1.6448, + "step": 5257 + }, + { + "epoch": 0.5531825355076275, + "grad_norm": 1.1810952425003052, + "learning_rate": 8.767834720515496e-05, + "loss": 1.8668, + "step": 5258 + }, + { + "epoch": 0.5532877432930037, + "grad_norm": 1.1021080017089844, + "learning_rate": 8.76445302282144e-05, + "loss": 2.0023, + "step": 5259 + }, + { + "epoch": 0.5533929510783798, + "grad_norm": 1.4798521995544434, + "learning_rate": 8.761071468607388e-05, + "loss": 1.8638, + "step": 5260 + }, + { + "epoch": 0.5534981588637559, + "grad_norm": 1.3366169929504395, + "learning_rate": 8.757690058266025e-05, + "loss": 1.458, + "step": 5261 + }, + { + "epoch": 0.553603366649132, + "grad_norm": 1.5138647556304932, + "learning_rate": 8.754308792190024e-05, + "loss": 1.6667, + "step": 5262 + }, + { + "epoch": 0.5537085744345082, + "grad_norm": 1.3789781332015991, + "learning_rate": 8.750927670772044e-05, + "loss": 2.1057, + "step": 5263 + }, + { + "epoch": 0.5538137822198843, + "grad_norm": 1.4035643339157104, + "learning_rate": 8.747546694404717e-05, + "loss": 1.6416, + "step": 5264 + }, + { + "epoch": 0.5539189900052603, + "grad_norm": 1.30898916721344, + "learning_rate": 8.744165863480669e-05, + "loss": 1.9435, + "step": 5265 + }, + { + "epoch": 0.5540241977906365, + "grad_norm": 1.4918098449707031, + "learning_rate": 8.740785178392505e-05, + "loss": 1.9183, + "step": 5266 + }, + { + "epoch": 0.5541294055760126, + "grad_norm": 1.285561203956604, + "learning_rate": 8.737404639532811e-05, + "loss": 1.82, + "step": 5267 + }, + { + "epoch": 0.5542346133613888, + "grad_norm": 1.183773159980774, + "learning_rate": 8.734024247294157e-05, + "loss": 1.9191, + "step": 5268 + }, + { + "epoch": 0.5543398211467648, + "grad_norm": 1.6294718980789185, + "learning_rate": 8.7306440020691e-05, + "loss": 1.9698, + "step": 5269 + }, + { + "epoch": 0.554445028932141, + "grad_norm": 1.5693155527114868, + "learning_rate": 8.727263904250178e-05, + "loss": 1.8887, + "step": 5270 + }, + { + "epoch": 0.5545502367175171, + "grad_norm": 1.6532758474349976, + "learning_rate": 8.723883954229908e-05, + "loss": 1.8751, + "step": 5271 + }, + { + "epoch": 0.5546554445028932, + "grad_norm": 1.02011239528656, + "learning_rate": 8.72050415240079e-05, + "loss": 2.3712, + "step": 5272 + }, + { + "epoch": 0.5547606522882693, + "grad_norm": 1.6323237419128418, + "learning_rate": 8.71712449915531e-05, + "loss": 1.7668, + "step": 5273 + }, + { + "epoch": 0.5548658600736455, + "grad_norm": 1.436021327972412, + "learning_rate": 8.713744994885938e-05, + "loss": 1.8613, + "step": 5274 + }, + { + "epoch": 0.5549710678590216, + "grad_norm": 1.5757768154144287, + "learning_rate": 8.710365639985126e-05, + "loss": 2.0038, + "step": 5275 + }, + { + "epoch": 0.5550762756443977, + "grad_norm": 2.4132494926452637, + "learning_rate": 8.706986434845302e-05, + "loss": 1.5683, + "step": 5276 + }, + { + "epoch": 0.5551814834297738, + "grad_norm": 1.5619103908538818, + "learning_rate": 8.703607379858889e-05, + "loss": 1.2968, + "step": 5277 + }, + { + "epoch": 0.55528669121515, + "grad_norm": 1.2575358152389526, + "learning_rate": 8.70022847541828e-05, + "loss": 1.5361, + "step": 5278 + }, + { + "epoch": 0.555391899000526, + "grad_norm": 0.924577534198761, + "learning_rate": 8.696849721915859e-05, + "loss": 1.6288, + "step": 5279 + }, + { + "epoch": 0.5554971067859021, + "grad_norm": 1.1372270584106445, + "learning_rate": 8.693471119743987e-05, + "loss": 2.1464, + "step": 5280 + }, + { + "epoch": 0.5556023145712783, + "grad_norm": 1.3003607988357544, + "learning_rate": 8.690092669295014e-05, + "loss": 2.0537, + "step": 5281 + }, + { + "epoch": 0.5557075223566544, + "grad_norm": 1.147006869316101, + "learning_rate": 8.686714370961264e-05, + "loss": 2.0348, + "step": 5282 + }, + { + "epoch": 0.5558127301420305, + "grad_norm": 1.6022502183914185, + "learning_rate": 8.68333622513505e-05, + "loss": 1.6282, + "step": 5283 + }, + { + "epoch": 0.5559179379274066, + "grad_norm": 1.5122016668319702, + "learning_rate": 8.679958232208668e-05, + "loss": 1.2036, + "step": 5284 + }, + { + "epoch": 0.5560231457127828, + "grad_norm": 1.583200216293335, + "learning_rate": 8.676580392574385e-05, + "loss": 1.5517, + "step": 5285 + }, + { + "epoch": 0.5561283534981588, + "grad_norm": 1.0503642559051514, + "learning_rate": 8.673202706624464e-05, + "loss": 1.9161, + "step": 5286 + }, + { + "epoch": 0.556233561283535, + "grad_norm": 1.8765935897827148, + "learning_rate": 8.669825174751144e-05, + "loss": 2.1381, + "step": 5287 + }, + { + "epoch": 0.5563387690689111, + "grad_norm": 0.8225523233413696, + "learning_rate": 8.666447797346648e-05, + "loss": 1.781, + "step": 5288 + }, + { + "epoch": 0.5564439768542873, + "grad_norm": 1.6632425785064697, + "learning_rate": 8.663070574803175e-05, + "loss": 1.636, + "step": 5289 + }, + { + "epoch": 0.5565491846396633, + "grad_norm": 1.561035394668579, + "learning_rate": 8.659693507512917e-05, + "loss": 1.7061, + "step": 5290 + }, + { + "epoch": 0.5566543924250394, + "grad_norm": 1.1021442413330078, + "learning_rate": 8.656316595868037e-05, + "loss": 1.7368, + "step": 5291 + }, + { + "epoch": 0.5567596002104156, + "grad_norm": 1.2675588130950928, + "learning_rate": 8.652939840260686e-05, + "loss": 1.6662, + "step": 5292 + }, + { + "epoch": 0.5568648079957917, + "grad_norm": 1.4895219802856445, + "learning_rate": 8.649563241082998e-05, + "loss": 1.3631, + "step": 5293 + }, + { + "epoch": 0.5569700157811678, + "grad_norm": 1.1941972970962524, + "learning_rate": 8.64618679872708e-05, + "loss": 1.2894, + "step": 5294 + }, + { + "epoch": 0.5570752235665439, + "grad_norm": 1.6961616277694702, + "learning_rate": 8.642810513585035e-05, + "loss": 2.1377, + "step": 5295 + }, + { + "epoch": 0.5571804313519201, + "grad_norm": 1.7371389865875244, + "learning_rate": 8.639434386048932e-05, + "loss": 2.2303, + "step": 5296 + }, + { + "epoch": 0.5572856391372961, + "grad_norm": 1.3116867542266846, + "learning_rate": 8.636058416510836e-05, + "loss": 1.7173, + "step": 5297 + }, + { + "epoch": 0.5573908469226723, + "grad_norm": 1.2672263383865356, + "learning_rate": 8.632682605362784e-05, + "loss": 1.8597, + "step": 5298 + }, + { + "epoch": 0.5574960547080484, + "grad_norm": 1.0156499147415161, + "learning_rate": 8.629306952996797e-05, + "loss": 2.0679, + "step": 5299 + }, + { + "epoch": 0.5576012624934246, + "grad_norm": 1.6344144344329834, + "learning_rate": 8.625931459804881e-05, + "loss": 1.8525, + "step": 5300 + }, + { + "epoch": 0.5577064702788006, + "grad_norm": 1.6262001991271973, + "learning_rate": 8.622556126179023e-05, + "loss": 1.9396, + "step": 5301 + }, + { + "epoch": 0.5578116780641768, + "grad_norm": 1.4433337450027466, + "learning_rate": 8.619180952511181e-05, + "loss": 2.3541, + "step": 5302 + }, + { + "epoch": 0.5579168858495529, + "grad_norm": 1.4532362222671509, + "learning_rate": 8.61580593919331e-05, + "loss": 1.6864, + "step": 5303 + }, + { + "epoch": 0.5580220936349289, + "grad_norm": 1.5361088514328003, + "learning_rate": 8.612431086617337e-05, + "loss": 1.7575, + "step": 5304 + }, + { + "epoch": 0.5581273014203051, + "grad_norm": 1.3419499397277832, + "learning_rate": 8.609056395175175e-05, + "loss": 1.6097, + "step": 5305 + }, + { + "epoch": 0.5582325092056812, + "grad_norm": 1.3803333044052124, + "learning_rate": 8.605681865258712e-05, + "loss": 1.5305, + "step": 5306 + }, + { + "epoch": 0.5583377169910574, + "grad_norm": 1.5263311862945557, + "learning_rate": 8.602307497259821e-05, + "loss": 1.6276, + "step": 5307 + }, + { + "epoch": 0.5584429247764334, + "grad_norm": 1.4065600633621216, + "learning_rate": 8.598933291570361e-05, + "loss": 1.5332, + "step": 5308 + }, + { + "epoch": 0.5585481325618096, + "grad_norm": 1.0995397567749023, + "learning_rate": 8.595559248582161e-05, + "loss": 2.0225, + "step": 5309 + }, + { + "epoch": 0.5586533403471857, + "grad_norm": 1.0431246757507324, + "learning_rate": 8.592185368687043e-05, + "loss": 1.662, + "step": 5310 + }, + { + "epoch": 0.5587585481325618, + "grad_norm": 1.5991910696029663, + "learning_rate": 8.588811652276803e-05, + "loss": 1.2915, + "step": 5311 + }, + { + "epoch": 0.5588637559179379, + "grad_norm": 1.2698297500610352, + "learning_rate": 8.585438099743217e-05, + "loss": 2.1424, + "step": 5312 + }, + { + "epoch": 0.5589689637033141, + "grad_norm": 1.9178346395492554, + "learning_rate": 8.582064711478046e-05, + "loss": 1.7453, + "step": 5313 + }, + { + "epoch": 0.5590741714886902, + "grad_norm": 1.5896426439285278, + "learning_rate": 8.578691487873036e-05, + "loss": 1.0764, + "step": 5314 + }, + { + "epoch": 0.5591793792740662, + "grad_norm": 1.0997956991195679, + "learning_rate": 8.575318429319899e-05, + "loss": 1.588, + "step": 5315 + }, + { + "epoch": 0.5592845870594424, + "grad_norm": 1.4343537092208862, + "learning_rate": 8.571945536210342e-05, + "loss": 1.578, + "step": 5316 + }, + { + "epoch": 0.5593897948448185, + "grad_norm": 1.9470630884170532, + "learning_rate": 8.568572808936047e-05, + "loss": 1.905, + "step": 5317 + }, + { + "epoch": 0.5594950026301946, + "grad_norm": 1.999913215637207, + "learning_rate": 8.565200247888678e-05, + "loss": 2.0373, + "step": 5318 + }, + { + "epoch": 0.5596002104155707, + "grad_norm": 1.2631433010101318, + "learning_rate": 8.56182785345988e-05, + "loss": 2.4557, + "step": 5319 + }, + { + "epoch": 0.5597054182009469, + "grad_norm": 0.9738912582397461, + "learning_rate": 8.558455626041277e-05, + "loss": 1.4726, + "step": 5320 + }, + { + "epoch": 0.559810625986323, + "grad_norm": 1.5372016429901123, + "learning_rate": 8.555083566024474e-05, + "loss": 2.1864, + "step": 5321 + }, + { + "epoch": 0.5599158337716991, + "grad_norm": 1.7038294076919556, + "learning_rate": 8.551711673801062e-05, + "loss": 1.8453, + "step": 5322 + }, + { + "epoch": 0.5600210415570752, + "grad_norm": 1.2219566106796265, + "learning_rate": 8.548339949762601e-05, + "loss": 1.3113, + "step": 5323 + }, + { + "epoch": 0.5601262493424514, + "grad_norm": 2.9071662425994873, + "learning_rate": 8.544968394300642e-05, + "loss": 1.8082, + "step": 5324 + }, + { + "epoch": 0.5602314571278275, + "grad_norm": 2.0915303230285645, + "learning_rate": 8.541597007806712e-05, + "loss": 1.5558, + "step": 5325 + }, + { + "epoch": 0.5603366649132036, + "grad_norm": 1.6373728513717651, + "learning_rate": 8.538225790672322e-05, + "loss": 1.5953, + "step": 5326 + }, + { + "epoch": 0.5604418726985797, + "grad_norm": 1.2306228876113892, + "learning_rate": 8.534854743288954e-05, + "loss": 1.5297, + "step": 5327 + }, + { + "epoch": 0.5605470804839559, + "grad_norm": 1.990144968032837, + "learning_rate": 8.531483866048081e-05, + "loss": 1.6063, + "step": 5328 + }, + { + "epoch": 0.5606522882693319, + "grad_norm": 1.5562758445739746, + "learning_rate": 8.528113159341153e-05, + "loss": 1.6421, + "step": 5329 + }, + { + "epoch": 0.560757496054708, + "grad_norm": 1.1921128034591675, + "learning_rate": 8.524742623559594e-05, + "loss": 1.4861, + "step": 5330 + }, + { + "epoch": 0.5608627038400842, + "grad_norm": 1.379715085029602, + "learning_rate": 8.521372259094818e-05, + "loss": 1.4598, + "step": 5331 + }, + { + "epoch": 0.5609679116254603, + "grad_norm": 2.179091453552246, + "learning_rate": 8.518002066338212e-05, + "loss": 1.9094, + "step": 5332 + }, + { + "epoch": 0.5610731194108364, + "grad_norm": 1.716064214706421, + "learning_rate": 8.514632045681145e-05, + "loss": 1.6363, + "step": 5333 + }, + { + "epoch": 0.5611783271962125, + "grad_norm": 1.1381585597991943, + "learning_rate": 8.511262197514968e-05, + "loss": 1.5435, + "step": 5334 + }, + { + "epoch": 0.5612835349815887, + "grad_norm": 2.24033522605896, + "learning_rate": 8.507892522231012e-05, + "loss": 1.8934, + "step": 5335 + }, + { + "epoch": 0.5613887427669647, + "grad_norm": 1.3724555969238281, + "learning_rate": 8.504523020220583e-05, + "loss": 2.2629, + "step": 5336 + }, + { + "epoch": 0.5614939505523409, + "grad_norm": 1.862369418144226, + "learning_rate": 8.501153691874971e-05, + "loss": 2.0515, + "step": 5337 + }, + { + "epoch": 0.561599158337717, + "grad_norm": 2.476644992828369, + "learning_rate": 8.497784537585444e-05, + "loss": 1.9697, + "step": 5338 + }, + { + "epoch": 0.5617043661230932, + "grad_norm": 1.3707612752914429, + "learning_rate": 8.494415557743252e-05, + "loss": 2.1986, + "step": 5339 + }, + { + "epoch": 0.5618095739084692, + "grad_norm": 2.085458993911743, + "learning_rate": 8.491046752739624e-05, + "loss": 1.9783, + "step": 5340 + }, + { + "epoch": 0.5619147816938453, + "grad_norm": 1.2586299180984497, + "learning_rate": 8.487678122965767e-05, + "loss": 2.0315, + "step": 5341 + }, + { + "epoch": 0.5620199894792215, + "grad_norm": 1.0489764213562012, + "learning_rate": 8.48430966881287e-05, + "loss": 1.7353, + "step": 5342 + }, + { + "epoch": 0.5621251972645975, + "grad_norm": 1.1171035766601562, + "learning_rate": 8.480941390672101e-05, + "loss": 1.8519, + "step": 5343 + }, + { + "epoch": 0.5622304050499737, + "grad_norm": 1.3247672319412231, + "learning_rate": 8.477573288934605e-05, + "loss": 1.7206, + "step": 5344 + }, + { + "epoch": 0.5623356128353498, + "grad_norm": 2.1845767498016357, + "learning_rate": 8.47420536399151e-05, + "loss": 2.2207, + "step": 5345 + }, + { + "epoch": 0.562440820620726, + "grad_norm": 1.5519604682922363, + "learning_rate": 8.470837616233924e-05, + "loss": 1.7987, + "step": 5346 + }, + { + "epoch": 0.562546028406102, + "grad_norm": 1.6114767789840698, + "learning_rate": 8.467470046052927e-05, + "loss": 1.608, + "step": 5347 + }, + { + "epoch": 0.5626512361914782, + "grad_norm": 1.703809142112732, + "learning_rate": 8.464102653839588e-05, + "loss": 1.2808, + "step": 5348 + }, + { + "epoch": 0.5627564439768543, + "grad_norm": 1.431298851966858, + "learning_rate": 8.460735439984949e-05, + "loss": 2.1355, + "step": 5349 + }, + { + "epoch": 0.5628616517622304, + "grad_norm": 1.7516570091247559, + "learning_rate": 8.457368404880037e-05, + "loss": 2.1989, + "step": 5350 + }, + { + "epoch": 0.5629668595476065, + "grad_norm": 1.4657411575317383, + "learning_rate": 8.454001548915851e-05, + "loss": 1.5914, + "step": 5351 + }, + { + "epoch": 0.5630720673329827, + "grad_norm": 1.8436520099639893, + "learning_rate": 8.450634872483374e-05, + "loss": 2.0691, + "step": 5352 + }, + { + "epoch": 0.5631772751183588, + "grad_norm": 1.4249285459518433, + "learning_rate": 8.44726837597357e-05, + "loss": 1.4602, + "step": 5353 + }, + { + "epoch": 0.5632824829037348, + "grad_norm": 1.0019652843475342, + "learning_rate": 8.443902059777373e-05, + "loss": 2.0361, + "step": 5354 + }, + { + "epoch": 0.563387690689111, + "grad_norm": 2.051858901977539, + "learning_rate": 8.440535924285706e-05, + "loss": 1.5908, + "step": 5355 + }, + { + "epoch": 0.5634928984744871, + "grad_norm": 1.3849743604660034, + "learning_rate": 8.43716996988947e-05, + "loss": 1.8964, + "step": 5356 + }, + { + "epoch": 0.5635981062598633, + "grad_norm": 1.2312546968460083, + "learning_rate": 8.433804196979541e-05, + "loss": 1.8502, + "step": 5357 + }, + { + "epoch": 0.5637033140452393, + "grad_norm": 1.2379924058914185, + "learning_rate": 8.430438605946769e-05, + "loss": 2.0632, + "step": 5358 + }, + { + "epoch": 0.5638085218306155, + "grad_norm": 0.9644449353218079, + "learning_rate": 8.427073197181993e-05, + "loss": 1.6485, + "step": 5359 + }, + { + "epoch": 0.5639137296159916, + "grad_norm": 1.0192735195159912, + "learning_rate": 8.423707971076026e-05, + "loss": 1.4893, + "step": 5360 + }, + { + "epoch": 0.5640189374013677, + "grad_norm": 1.3611501455307007, + "learning_rate": 8.420342928019666e-05, + "loss": 1.7624, + "step": 5361 + }, + { + "epoch": 0.5641241451867438, + "grad_norm": 1.648772120475769, + "learning_rate": 8.416978068403676e-05, + "loss": 1.7394, + "step": 5362 + }, + { + "epoch": 0.56422935297212, + "grad_norm": 1.2542641162872314, + "learning_rate": 8.413613392618811e-05, + "loss": 2.0594, + "step": 5363 + }, + { + "epoch": 0.5643345607574961, + "grad_norm": 1.6775392293930054, + "learning_rate": 8.410248901055801e-05, + "loss": 2.0371, + "step": 5364 + }, + { + "epoch": 0.5644397685428721, + "grad_norm": 1.1697038412094116, + "learning_rate": 8.40688459410535e-05, + "loss": 1.8887, + "step": 5365 + }, + { + "epoch": 0.5645449763282483, + "grad_norm": 1.5925973653793335, + "learning_rate": 8.403520472158143e-05, + "loss": 1.8341, + "step": 5366 + }, + { + "epoch": 0.5646501841136244, + "grad_norm": 1.3041836023330688, + "learning_rate": 8.40015653560485e-05, + "loss": 1.5598, + "step": 5367 + }, + { + "epoch": 0.5647553918990005, + "grad_norm": 1.0811697244644165, + "learning_rate": 8.396792784836108e-05, + "loss": 1.3874, + "step": 5368 + }, + { + "epoch": 0.5648605996843766, + "grad_norm": 1.3267872333526611, + "learning_rate": 8.393429220242541e-05, + "loss": 2.1926, + "step": 5369 + }, + { + "epoch": 0.5649658074697528, + "grad_norm": 1.133404016494751, + "learning_rate": 8.39006584221475e-05, + "loss": 1.7818, + "step": 5370 + }, + { + "epoch": 0.5650710152551289, + "grad_norm": 2.1951334476470947, + "learning_rate": 8.38670265114331e-05, + "loss": 2.0216, + "step": 5371 + }, + { + "epoch": 0.565176223040505, + "grad_norm": 1.3310232162475586, + "learning_rate": 8.383339647418777e-05, + "loss": 1.5981, + "step": 5372 + }, + { + "epoch": 0.5652814308258811, + "grad_norm": 0.9504427313804626, + "learning_rate": 8.379976831431689e-05, + "loss": 1.7203, + "step": 5373 + }, + { + "epoch": 0.5653866386112573, + "grad_norm": 1.273130178451538, + "learning_rate": 8.376614203572559e-05, + "loss": 1.5829, + "step": 5374 + }, + { + "epoch": 0.5654918463966333, + "grad_norm": 1.1506158113479614, + "learning_rate": 8.373251764231872e-05, + "loss": 1.3818, + "step": 5375 + }, + { + "epoch": 0.5655970541820095, + "grad_norm": 1.0576092004776, + "learning_rate": 8.369889513800102e-05, + "loss": 1.9378, + "step": 5376 + }, + { + "epoch": 0.5657022619673856, + "grad_norm": 1.4205738306045532, + "learning_rate": 8.366527452667698e-05, + "loss": 1.9037, + "step": 5377 + }, + { + "epoch": 0.5658074697527617, + "grad_norm": 1.1215096712112427, + "learning_rate": 8.363165581225083e-05, + "loss": 1.6023, + "step": 5378 + }, + { + "epoch": 0.5659126775381378, + "grad_norm": 1.6068058013916016, + "learning_rate": 8.359803899862655e-05, + "loss": 2.4349, + "step": 5379 + }, + { + "epoch": 0.5660178853235139, + "grad_norm": 1.7272415161132812, + "learning_rate": 8.356442408970799e-05, + "loss": 1.3773, + "step": 5380 + }, + { + "epoch": 0.5661230931088901, + "grad_norm": 1.9546716213226318, + "learning_rate": 8.353081108939874e-05, + "loss": 1.6571, + "step": 5381 + }, + { + "epoch": 0.5662283008942661, + "grad_norm": 1.531908392906189, + "learning_rate": 8.349720000160218e-05, + "loss": 1.9016, + "step": 5382 + }, + { + "epoch": 0.5663335086796423, + "grad_norm": 1.5750274658203125, + "learning_rate": 8.346359083022143e-05, + "loss": 2.044, + "step": 5383 + }, + { + "epoch": 0.5664387164650184, + "grad_norm": 1.3168236017227173, + "learning_rate": 8.342998357915942e-05, + "loss": 1.6516, + "step": 5384 + }, + { + "epoch": 0.5665439242503946, + "grad_norm": 1.2597466707229614, + "learning_rate": 8.339637825231887e-05, + "loss": 1.4705, + "step": 5385 + }, + { + "epoch": 0.5666491320357706, + "grad_norm": 1.7140848636627197, + "learning_rate": 8.336277485360223e-05, + "loss": 1.9122, + "step": 5386 + }, + { + "epoch": 0.5667543398211468, + "grad_norm": 1.403868556022644, + "learning_rate": 8.332917338691175e-05, + "loss": 1.7833, + "step": 5387 + }, + { + "epoch": 0.5668595476065229, + "grad_norm": 1.871900200843811, + "learning_rate": 8.32955738561495e-05, + "loss": 1.7798, + "step": 5388 + }, + { + "epoch": 0.5669647553918991, + "grad_norm": 1.4764777421951294, + "learning_rate": 8.326197626521723e-05, + "loss": 1.9589, + "step": 5389 + }, + { + "epoch": 0.5670699631772751, + "grad_norm": 1.7492798566818237, + "learning_rate": 8.322838061801653e-05, + "loss": 2.1874, + "step": 5390 + }, + { + "epoch": 0.5671751709626512, + "grad_norm": 1.9016979932785034, + "learning_rate": 8.319478691844878e-05, + "loss": 1.4618, + "step": 5391 + }, + { + "epoch": 0.5672803787480274, + "grad_norm": 1.8420443534851074, + "learning_rate": 8.316119517041508e-05, + "loss": 1.305, + "step": 5392 + }, + { + "epoch": 0.5673855865334034, + "grad_norm": 1.3935943841934204, + "learning_rate": 8.312760537781632e-05, + "loss": 1.9563, + "step": 5393 + }, + { + "epoch": 0.5674907943187796, + "grad_norm": 1.9522449970245361, + "learning_rate": 8.30940175445532e-05, + "loss": 1.5606, + "step": 5394 + }, + { + "epoch": 0.5675960021041557, + "grad_norm": 1.159475326538086, + "learning_rate": 8.306043167452617e-05, + "loss": 1.5782, + "step": 5395 + }, + { + "epoch": 0.5677012098895319, + "grad_norm": 1.9530138969421387, + "learning_rate": 8.30268477716354e-05, + "loss": 1.8486, + "step": 5396 + }, + { + "epoch": 0.5678064176749079, + "grad_norm": 1.253212571144104, + "learning_rate": 8.299326583978092e-05, + "loss": 1.7982, + "step": 5397 + }, + { + "epoch": 0.5679116254602841, + "grad_norm": 1.5111079216003418, + "learning_rate": 8.29596858828625e-05, + "loss": 1.5008, + "step": 5398 + }, + { + "epoch": 0.5680168332456602, + "grad_norm": 1.4527966976165771, + "learning_rate": 8.292610790477962e-05, + "loss": 2.2226, + "step": 5399 + }, + { + "epoch": 0.5681220410310363, + "grad_norm": 1.5884594917297363, + "learning_rate": 8.289253190943164e-05, + "loss": 1.9317, + "step": 5400 + }, + { + "epoch": 0.5682272488164124, + "grad_norm": 1.4073702096939087, + "learning_rate": 8.285895790071757e-05, + "loss": 1.4504, + "step": 5401 + }, + { + "epoch": 0.5683324566017885, + "grad_norm": 1.079156756401062, + "learning_rate": 8.282538588253627e-05, + "loss": 1.2555, + "step": 5402 + }, + { + "epoch": 0.5684376643871647, + "grad_norm": 1.4192665815353394, + "learning_rate": 8.279181585878635e-05, + "loss": 1.947, + "step": 5403 + }, + { + "epoch": 0.5685428721725407, + "grad_norm": 1.4388915300369263, + "learning_rate": 8.275824783336618e-05, + "loss": 1.5275, + "step": 5404 + }, + { + "epoch": 0.5686480799579169, + "grad_norm": 0.9725925326347351, + "learning_rate": 8.272468181017391e-05, + "loss": 1.8777, + "step": 5405 + }, + { + "epoch": 0.568753287743293, + "grad_norm": 2.05253005027771, + "learning_rate": 8.26911177931075e-05, + "loss": 1.1397, + "step": 5406 + }, + { + "epoch": 0.5688584955286691, + "grad_norm": 1.8297544717788696, + "learning_rate": 8.265755578606456e-05, + "loss": 1.6646, + "step": 5407 + }, + { + "epoch": 0.5689637033140452, + "grad_norm": 1.0543756484985352, + "learning_rate": 8.262399579294253e-05, + "loss": 2.0218, + "step": 5408 + }, + { + "epoch": 0.5690689110994214, + "grad_norm": 1.6449626684188843, + "learning_rate": 8.259043781763869e-05, + "loss": 1.4181, + "step": 5409 + }, + { + "epoch": 0.5691741188847975, + "grad_norm": 1.7541320323944092, + "learning_rate": 8.255688186404996e-05, + "loss": 1.8855, + "step": 5410 + }, + { + "epoch": 0.5692793266701736, + "grad_norm": 2.0391685962677, + "learning_rate": 8.25233279360731e-05, + "loss": 1.7674, + "step": 5411 + }, + { + "epoch": 0.5693845344555497, + "grad_norm": 1.9726836681365967, + "learning_rate": 8.248977603760464e-05, + "loss": 1.7674, + "step": 5412 + }, + { + "epoch": 0.5694897422409259, + "grad_norm": 1.3250395059585571, + "learning_rate": 8.245622617254079e-05, + "loss": 1.7336, + "step": 5413 + }, + { + "epoch": 0.5695949500263019, + "grad_norm": 1.848076581954956, + "learning_rate": 8.242267834477764e-05, + "loss": 1.9363, + "step": 5414 + }, + { + "epoch": 0.569700157811678, + "grad_norm": 1.8783268928527832, + "learning_rate": 8.238913255821099e-05, + "loss": 1.3876, + "step": 5415 + }, + { + "epoch": 0.5698053655970542, + "grad_norm": 1.4165843725204468, + "learning_rate": 8.235558881673637e-05, + "loss": 2.0615, + "step": 5416 + }, + { + "epoch": 0.5699105733824303, + "grad_norm": 1.4009727239608765, + "learning_rate": 8.232204712424911e-05, + "loss": 1.9336, + "step": 5417 + }, + { + "epoch": 0.5700157811678064, + "grad_norm": 1.3084132671356201, + "learning_rate": 8.228850748464431e-05, + "loss": 1.7258, + "step": 5418 + }, + { + "epoch": 0.5701209889531825, + "grad_norm": 1.5547186136245728, + "learning_rate": 8.225496990181684e-05, + "loss": 1.4471, + "step": 5419 + }, + { + "epoch": 0.5702261967385587, + "grad_norm": 1.5895652770996094, + "learning_rate": 8.222143437966124e-05, + "loss": 1.815, + "step": 5420 + }, + { + "epoch": 0.5703314045239348, + "grad_norm": 1.7180938720703125, + "learning_rate": 8.218790092207199e-05, + "loss": 1.4393, + "step": 5421 + }, + { + "epoch": 0.5704366123093109, + "grad_norm": 1.1043965816497803, + "learning_rate": 8.21543695329431e-05, + "loss": 1.5099, + "step": 5422 + }, + { + "epoch": 0.570541820094687, + "grad_norm": 1.113439917564392, + "learning_rate": 8.212084021616852e-05, + "loss": 1.9908, + "step": 5423 + }, + { + "epoch": 0.5706470278800632, + "grad_norm": 1.640676736831665, + "learning_rate": 8.208731297564189e-05, + "loss": 1.6397, + "step": 5424 + }, + { + "epoch": 0.5707522356654392, + "grad_norm": 1.0427812337875366, + "learning_rate": 8.205378781525662e-05, + "loss": 1.6538, + "step": 5425 + }, + { + "epoch": 0.5708574434508153, + "grad_norm": 1.614121437072754, + "learning_rate": 8.202026473890588e-05, + "loss": 1.6651, + "step": 5426 + }, + { + "epoch": 0.5709626512361915, + "grad_norm": 1.6778579950332642, + "learning_rate": 8.198674375048257e-05, + "loss": 1.6189, + "step": 5427 + }, + { + "epoch": 0.5710678590215676, + "grad_norm": 1.0318078994750977, + "learning_rate": 8.195322485387939e-05, + "loss": 1.7629, + "step": 5428 + }, + { + "epoch": 0.5711730668069437, + "grad_norm": 1.1207058429718018, + "learning_rate": 8.191970805298881e-05, + "loss": 1.6471, + "step": 5429 + }, + { + "epoch": 0.5712782745923198, + "grad_norm": 1.0314679145812988, + "learning_rate": 8.188619335170298e-05, + "loss": 1.5387, + "step": 5430 + }, + { + "epoch": 0.571383482377696, + "grad_norm": 1.1926995515823364, + "learning_rate": 8.185268075391388e-05, + "loss": 1.5988, + "step": 5431 + }, + { + "epoch": 0.571488690163072, + "grad_norm": 1.2411372661590576, + "learning_rate": 8.181917026351318e-05, + "loss": 1.2326, + "step": 5432 + }, + { + "epoch": 0.5715938979484482, + "grad_norm": 1.1884357929229736, + "learning_rate": 8.17856618843924e-05, + "loss": 1.7397, + "step": 5433 + }, + { + "epoch": 0.5716991057338243, + "grad_norm": 1.4349546432495117, + "learning_rate": 8.175215562044272e-05, + "loss": 1.6069, + "step": 5434 + }, + { + "epoch": 0.5718043135192005, + "grad_norm": 2.4111011028289795, + "learning_rate": 8.17186514755551e-05, + "loss": 1.669, + "step": 5435 + }, + { + "epoch": 0.5719095213045765, + "grad_norm": 1.5937304496765137, + "learning_rate": 8.168514945362031e-05, + "loss": 1.7242, + "step": 5436 + }, + { + "epoch": 0.5720147290899527, + "grad_norm": 1.468326210975647, + "learning_rate": 8.165164955852879e-05, + "loss": 1.5671, + "step": 5437 + }, + { + "epoch": 0.5721199368753288, + "grad_norm": 1.6039079427719116, + "learning_rate": 8.161815179417078e-05, + "loss": 1.2596, + "step": 5438 + }, + { + "epoch": 0.5722251446607048, + "grad_norm": 1.7768357992172241, + "learning_rate": 8.15846561644363e-05, + "loss": 2.1289, + "step": 5439 + }, + { + "epoch": 0.572330352446081, + "grad_norm": 1.709168553352356, + "learning_rate": 8.155116267321503e-05, + "loss": 1.5969, + "step": 5440 + }, + { + "epoch": 0.5724355602314571, + "grad_norm": 2.2013728618621826, + "learning_rate": 8.151767132439649e-05, + "loss": 2.0476, + "step": 5441 + }, + { + "epoch": 0.5725407680168333, + "grad_norm": 1.7853853702545166, + "learning_rate": 8.148418212186992e-05, + "loss": 1.9721, + "step": 5442 + }, + { + "epoch": 0.5726459758022093, + "grad_norm": 1.9058358669281006, + "learning_rate": 8.145069506952436e-05, + "loss": 2.2435, + "step": 5443 + }, + { + "epoch": 0.5727511835875855, + "grad_norm": 2.1775949001312256, + "learning_rate": 8.141721017124847e-05, + "loss": 1.8347, + "step": 5444 + }, + { + "epoch": 0.5728563913729616, + "grad_norm": 1.0717917680740356, + "learning_rate": 8.138372743093076e-05, + "loss": 1.486, + "step": 5445 + }, + { + "epoch": 0.5729615991583377, + "grad_norm": 1.1905382871627808, + "learning_rate": 8.135024685245947e-05, + "loss": 1.5313, + "step": 5446 + }, + { + "epoch": 0.5730668069437138, + "grad_norm": 1.8441627025604248, + "learning_rate": 8.131676843972263e-05, + "loss": 1.5609, + "step": 5447 + }, + { + "epoch": 0.57317201472909, + "grad_norm": 1.70012366771698, + "learning_rate": 8.128329219660791e-05, + "loss": 1.4996, + "step": 5448 + }, + { + "epoch": 0.5732772225144661, + "grad_norm": 2.055849313735962, + "learning_rate": 8.124981812700285e-05, + "loss": 1.6694, + "step": 5449 + }, + { + "epoch": 0.5733824302998421, + "grad_norm": 1.5213593244552612, + "learning_rate": 8.121634623479466e-05, + "loss": 1.7859, + "step": 5450 + }, + { + "epoch": 0.5734876380852183, + "grad_norm": 1.240997314453125, + "learning_rate": 8.118287652387035e-05, + "loss": 1.9417, + "step": 5451 + }, + { + "epoch": 0.5735928458705944, + "grad_norm": 1.187052607536316, + "learning_rate": 8.114940899811662e-05, + "loss": 1.7307, + "step": 5452 + }, + { + "epoch": 0.5736980536559706, + "grad_norm": 2.3511910438537598, + "learning_rate": 8.111594366141993e-05, + "loss": 2.0432, + "step": 5453 + }, + { + "epoch": 0.5738032614413466, + "grad_norm": 1.187315583229065, + "learning_rate": 8.108248051766656e-05, + "loss": 1.6997, + "step": 5454 + }, + { + "epoch": 0.5739084692267228, + "grad_norm": 1.4285547733306885, + "learning_rate": 8.10490195707424e-05, + "loss": 2.0346, + "step": 5455 + }, + { + "epoch": 0.5740136770120989, + "grad_norm": 1.6566582918167114, + "learning_rate": 8.101556082453319e-05, + "loss": 2.0706, + "step": 5456 + }, + { + "epoch": 0.574118884797475, + "grad_norm": 1.2293633222579956, + "learning_rate": 8.098210428292441e-05, + "loss": 2.2512, + "step": 5457 + }, + { + "epoch": 0.5742240925828511, + "grad_norm": 1.1286085844039917, + "learning_rate": 8.094864994980123e-05, + "loss": 1.6408, + "step": 5458 + }, + { + "epoch": 0.5743293003682273, + "grad_norm": 1.2557892799377441, + "learning_rate": 8.091519782904857e-05, + "loss": 1.9494, + "step": 5459 + }, + { + "epoch": 0.5744345081536034, + "grad_norm": 1.3432778120040894, + "learning_rate": 8.088174792455119e-05, + "loss": 2.1635, + "step": 5460 + }, + { + "epoch": 0.5745397159389795, + "grad_norm": 2.0232627391815186, + "learning_rate": 8.084830024019343e-05, + "loss": 1.565, + "step": 5461 + }, + { + "epoch": 0.5746449237243556, + "grad_norm": 1.3621368408203125, + "learning_rate": 8.08148547798595e-05, + "loss": 1.755, + "step": 5462 + }, + { + "epoch": 0.5747501315097318, + "grad_norm": 1.1986950635910034, + "learning_rate": 8.078141154743332e-05, + "loss": 1.8158, + "step": 5463 + }, + { + "epoch": 0.5748553392951078, + "grad_norm": 1.457479476928711, + "learning_rate": 8.074797054679855e-05, + "loss": 1.6491, + "step": 5464 + }, + { + "epoch": 0.5749605470804839, + "grad_norm": 1.3137367963790894, + "learning_rate": 8.071453178183856e-05, + "loss": 1.7712, + "step": 5465 + }, + { + "epoch": 0.5750657548658601, + "grad_norm": 1.0634950399398804, + "learning_rate": 8.068109525643647e-05, + "loss": 1.9505, + "step": 5466 + }, + { + "epoch": 0.5751709626512362, + "grad_norm": 0.9608885645866394, + "learning_rate": 8.064766097447516e-05, + "loss": 1.57, + "step": 5467 + }, + { + "epoch": 0.5752761704366123, + "grad_norm": 1.5873912572860718, + "learning_rate": 8.061422893983729e-05, + "loss": 1.6664, + "step": 5468 + }, + { + "epoch": 0.5753813782219884, + "grad_norm": 1.4330425262451172, + "learning_rate": 8.058079915640515e-05, + "loss": 1.8861, + "step": 5469 + }, + { + "epoch": 0.5754865860073646, + "grad_norm": 1.3749375343322754, + "learning_rate": 8.054737162806086e-05, + "loss": 1.9692, + "step": 5470 + }, + { + "epoch": 0.5755917937927406, + "grad_norm": 1.9150141477584839, + "learning_rate": 8.051394635868626e-05, + "loss": 1.6297, + "step": 5471 + }, + { + "epoch": 0.5756970015781168, + "grad_norm": 1.4533182382583618, + "learning_rate": 8.048052335216289e-05, + "loss": 1.2963, + "step": 5472 + }, + { + "epoch": 0.5758022093634929, + "grad_norm": 2.1088831424713135, + "learning_rate": 8.044710261237207e-05, + "loss": 1.4709, + "step": 5473 + }, + { + "epoch": 0.5759074171488691, + "grad_norm": 1.277236819267273, + "learning_rate": 8.041368414319483e-05, + "loss": 1.8557, + "step": 5474 + }, + { + "epoch": 0.5760126249342451, + "grad_norm": 1.2016223669052124, + "learning_rate": 8.038026794851198e-05, + "loss": 1.6482, + "step": 5475 + }, + { + "epoch": 0.5761178327196212, + "grad_norm": 1.476108431816101, + "learning_rate": 8.034685403220398e-05, + "loss": 1.7097, + "step": 5476 + }, + { + "epoch": 0.5762230405049974, + "grad_norm": 2.2213351726531982, + "learning_rate": 8.031344239815111e-05, + "loss": 1.9822, + "step": 5477 + }, + { + "epoch": 0.5763282482903734, + "grad_norm": 1.5243347883224487, + "learning_rate": 8.028003305023338e-05, + "loss": 2.1552, + "step": 5478 + }, + { + "epoch": 0.5764334560757496, + "grad_norm": 2.157576322555542, + "learning_rate": 8.024662599233043e-05, + "loss": 2.0237, + "step": 5479 + }, + { + "epoch": 0.5765386638611257, + "grad_norm": 1.6475622653961182, + "learning_rate": 8.021322122832178e-05, + "loss": 2.1017, + "step": 5480 + }, + { + "epoch": 0.5766438716465019, + "grad_norm": 1.4928076267242432, + "learning_rate": 8.01798187620866e-05, + "loss": 1.6096, + "step": 5481 + }, + { + "epoch": 0.5767490794318779, + "grad_norm": 1.6426701545715332, + "learning_rate": 8.014641859750379e-05, + "loss": 1.8119, + "step": 5482 + }, + { + "epoch": 0.5768542872172541, + "grad_norm": 1.8036248683929443, + "learning_rate": 8.011302073845201e-05, + "loss": 1.9572, + "step": 5483 + }, + { + "epoch": 0.5769594950026302, + "grad_norm": 1.9108166694641113, + "learning_rate": 8.007962518880966e-05, + "loss": 2.202, + "step": 5484 + }, + { + "epoch": 0.5770647027880064, + "grad_norm": 1.3631569147109985, + "learning_rate": 8.004623195245481e-05, + "loss": 1.9852, + "step": 5485 + }, + { + "epoch": 0.5771699105733824, + "grad_norm": 1.4796253442764282, + "learning_rate": 8.001284103326539e-05, + "loss": 1.9356, + "step": 5486 + }, + { + "epoch": 0.5772751183587586, + "grad_norm": 1.3997398614883423, + "learning_rate": 7.99794524351189e-05, + "loss": 1.9229, + "step": 5487 + }, + { + "epoch": 0.5773803261441347, + "grad_norm": 0.9555890560150146, + "learning_rate": 7.994606616189264e-05, + "loss": 1.6695, + "step": 5488 + }, + { + "epoch": 0.5774855339295107, + "grad_norm": 3.1957201957702637, + "learning_rate": 7.991268221746373e-05, + "loss": 2.0273, + "step": 5489 + }, + { + "epoch": 0.5775907417148869, + "grad_norm": 1.504439115524292, + "learning_rate": 7.987930060570885e-05, + "loss": 2.1869, + "step": 5490 + }, + { + "epoch": 0.577695949500263, + "grad_norm": 1.781453251838684, + "learning_rate": 7.984592133050454e-05, + "loss": 1.8198, + "step": 5491 + }, + { + "epoch": 0.5778011572856392, + "grad_norm": 1.4022020101547241, + "learning_rate": 7.981254439572704e-05, + "loss": 1.6377, + "step": 5492 + }, + { + "epoch": 0.5779063650710152, + "grad_norm": 2.5264246463775635, + "learning_rate": 7.977916980525227e-05, + "loss": 1.8286, + "step": 5493 + }, + { + "epoch": 0.5780115728563914, + "grad_norm": 1.4550338983535767, + "learning_rate": 7.974579756295591e-05, + "loss": 1.9897, + "step": 5494 + }, + { + "epoch": 0.5781167806417675, + "grad_norm": 1.6223548650741577, + "learning_rate": 7.97124276727134e-05, + "loss": 1.7706, + "step": 5495 + }, + { + "epoch": 0.5782219884271436, + "grad_norm": 1.2290290594100952, + "learning_rate": 7.967906013839987e-05, + "loss": 1.8209, + "step": 5496 + }, + { + "epoch": 0.5783271962125197, + "grad_norm": 1.5976051092147827, + "learning_rate": 7.964569496389013e-05, + "loss": 2.1965, + "step": 5497 + }, + { + "epoch": 0.5784324039978959, + "grad_norm": 1.4536248445510864, + "learning_rate": 7.961233215305884e-05, + "loss": 1.8925, + "step": 5498 + }, + { + "epoch": 0.578537611783272, + "grad_norm": 0.8368136286735535, + "learning_rate": 7.957897170978031e-05, + "loss": 1.5314, + "step": 5499 + }, + { + "epoch": 0.578642819568648, + "grad_norm": 1.7957181930541992, + "learning_rate": 7.95456136379285e-05, + "loss": 2.1585, + "step": 5500 + }, + { + "epoch": 0.5787480273540242, + "grad_norm": 1.1656877994537354, + "learning_rate": 7.951225794137724e-05, + "loss": 1.6489, + "step": 5501 + }, + { + "epoch": 0.5788532351394003, + "grad_norm": 1.2200168371200562, + "learning_rate": 7.947890462400002e-05, + "loss": 1.6195, + "step": 5502 + }, + { + "epoch": 0.5789584429247764, + "grad_norm": 1.2942944765090942, + "learning_rate": 7.944555368967001e-05, + "loss": 1.8944, + "step": 5503 + }, + { + "epoch": 0.5790636507101525, + "grad_norm": 1.5925309658050537, + "learning_rate": 7.941220514226016e-05, + "loss": 1.5092, + "step": 5504 + }, + { + "epoch": 0.5791688584955287, + "grad_norm": 1.2316579818725586, + "learning_rate": 7.937885898564315e-05, + "loss": 1.7135, + "step": 5505 + }, + { + "epoch": 0.5792740662809048, + "grad_norm": 1.9800273180007935, + "learning_rate": 7.934551522369134e-05, + "loss": 1.6139, + "step": 5506 + }, + { + "epoch": 0.5793792740662809, + "grad_norm": 1.2461237907409668, + "learning_rate": 7.931217386027686e-05, + "loss": 1.6989, + "step": 5507 + }, + { + "epoch": 0.579484481851657, + "grad_norm": 1.102742075920105, + "learning_rate": 7.927883489927147e-05, + "loss": 1.9475, + "step": 5508 + }, + { + "epoch": 0.5795896896370332, + "grad_norm": 1.5722696781158447, + "learning_rate": 7.924549834454674e-05, + "loss": 1.6444, + "step": 5509 + }, + { + "epoch": 0.5796948974224092, + "grad_norm": 1.331183671951294, + "learning_rate": 7.921216419997398e-05, + "loss": 1.3104, + "step": 5510 + }, + { + "epoch": 0.5798001052077854, + "grad_norm": 1.825430989265442, + "learning_rate": 7.917883246942412e-05, + "loss": 2.1806, + "step": 5511 + }, + { + "epoch": 0.5799053129931615, + "grad_norm": 1.2514722347259521, + "learning_rate": 7.914550315676787e-05, + "loss": 1.7759, + "step": 5512 + }, + { + "epoch": 0.5800105207785377, + "grad_norm": 1.1857120990753174, + "learning_rate": 7.91121762658757e-05, + "loss": 1.59, + "step": 5513 + }, + { + "epoch": 0.5801157285639137, + "grad_norm": 1.2514647245407104, + "learning_rate": 7.907885180061767e-05, + "loss": 2.1173, + "step": 5514 + }, + { + "epoch": 0.5802209363492898, + "grad_norm": 2.3011891841888428, + "learning_rate": 7.904552976486372e-05, + "loss": 1.8243, + "step": 5515 + }, + { + "epoch": 0.580326144134666, + "grad_norm": 1.686474084854126, + "learning_rate": 7.90122101624834e-05, + "loss": 1.6543, + "step": 5516 + }, + { + "epoch": 0.5804313519200421, + "grad_norm": 1.496381163597107, + "learning_rate": 7.897889299734599e-05, + "loss": 1.6579, + "step": 5517 + }, + { + "epoch": 0.5805365597054182, + "grad_norm": 2.0382790565490723, + "learning_rate": 7.894557827332052e-05, + "loss": 1.0179, + "step": 5518 + }, + { + "epoch": 0.5806417674907943, + "grad_norm": 1.186845064163208, + "learning_rate": 7.891226599427572e-05, + "loss": 1.7989, + "step": 5519 + }, + { + "epoch": 0.5807469752761705, + "grad_norm": 1.6111037731170654, + "learning_rate": 7.887895616408001e-05, + "loss": 1.6084, + "step": 5520 + }, + { + "epoch": 0.5808521830615465, + "grad_norm": 1.4457639455795288, + "learning_rate": 7.884564878660159e-05, + "loss": 1.784, + "step": 5521 + }, + { + "epoch": 0.5809573908469227, + "grad_norm": 1.4394114017486572, + "learning_rate": 7.88123438657083e-05, + "loss": 1.555, + "step": 5522 + }, + { + "epoch": 0.5810625986322988, + "grad_norm": 2.1034014225006104, + "learning_rate": 7.877904140526778e-05, + "loss": 1.7858, + "step": 5523 + }, + { + "epoch": 0.581167806417675, + "grad_norm": 1.5533802509307861, + "learning_rate": 7.874574140914727e-05, + "loss": 1.9843, + "step": 5524 + }, + { + "epoch": 0.581273014203051, + "grad_norm": 1.3675698041915894, + "learning_rate": 7.871244388121381e-05, + "loss": 2.3865, + "step": 5525 + }, + { + "epoch": 0.5813782219884271, + "grad_norm": 1.3307299613952637, + "learning_rate": 7.867914882533419e-05, + "loss": 1.7025, + "step": 5526 + }, + { + "epoch": 0.5814834297738033, + "grad_norm": 1.7351833581924438, + "learning_rate": 7.864585624537478e-05, + "loss": 1.679, + "step": 5527 + }, + { + "epoch": 0.5815886375591793, + "grad_norm": 1.8544718027114868, + "learning_rate": 7.86125661452018e-05, + "loss": 2.0512, + "step": 5528 + }, + { + "epoch": 0.5816938453445555, + "grad_norm": 1.8268216848373413, + "learning_rate": 7.857927852868107e-05, + "loss": 1.7498, + "step": 5529 + }, + { + "epoch": 0.5817990531299316, + "grad_norm": 1.4137629270553589, + "learning_rate": 7.854599339967817e-05, + "loss": 1.8924, + "step": 5530 + }, + { + "epoch": 0.5819042609153078, + "grad_norm": 1.431031584739685, + "learning_rate": 7.851271076205843e-05, + "loss": 1.6172, + "step": 5531 + }, + { + "epoch": 0.5820094687006838, + "grad_norm": 1.0963670015335083, + "learning_rate": 7.84794306196868e-05, + "loss": 1.6189, + "step": 5532 + }, + { + "epoch": 0.58211467648606, + "grad_norm": 1.4139877557754517, + "learning_rate": 7.844615297642805e-05, + "loss": 1.8776, + "step": 5533 + }, + { + "epoch": 0.5822198842714361, + "grad_norm": 1.4871132373809814, + "learning_rate": 7.84128778361466e-05, + "loss": 1.6445, + "step": 5534 + }, + { + "epoch": 0.5823250920568122, + "grad_norm": 1.2888953685760498, + "learning_rate": 7.837960520270652e-05, + "loss": 1.4948, + "step": 5535 + }, + { + "epoch": 0.5824302998421883, + "grad_norm": 1.376495599746704, + "learning_rate": 7.83463350799717e-05, + "loss": 1.8704, + "step": 5536 + }, + { + "epoch": 0.5825355076275645, + "grad_norm": 1.1731458902359009, + "learning_rate": 7.831306747180571e-05, + "loss": 1.5219, + "step": 5537 + }, + { + "epoch": 0.5826407154129406, + "grad_norm": 1.5646111965179443, + "learning_rate": 7.827980238207177e-05, + "loss": 1.3992, + "step": 5538 + }, + { + "epoch": 0.5827459231983166, + "grad_norm": 1.2269337177276611, + "learning_rate": 7.824653981463284e-05, + "loss": 2.2809, + "step": 5539 + }, + { + "epoch": 0.5828511309836928, + "grad_norm": 1.4412834644317627, + "learning_rate": 7.821327977335164e-05, + "loss": 1.2724, + "step": 5540 + }, + { + "epoch": 0.5829563387690689, + "grad_norm": 2.3292593955993652, + "learning_rate": 7.818002226209049e-05, + "loss": 1.4849, + "step": 5541 + }, + { + "epoch": 0.583061546554445, + "grad_norm": 1.5618584156036377, + "learning_rate": 7.814676728471151e-05, + "loss": 2.0205, + "step": 5542 + }, + { + "epoch": 0.5831667543398211, + "grad_norm": 1.4830820560455322, + "learning_rate": 7.811351484507647e-05, + "loss": 1.7858, + "step": 5543 + }, + { + "epoch": 0.5832719621251973, + "grad_norm": 1.1995006799697876, + "learning_rate": 7.808026494704692e-05, + "loss": 2.016, + "step": 5544 + }, + { + "epoch": 0.5833771699105734, + "grad_norm": 1.5571500062942505, + "learning_rate": 7.804701759448398e-05, + "loss": 1.7179, + "step": 5545 + }, + { + "epoch": 0.5834823776959495, + "grad_norm": 1.301533579826355, + "learning_rate": 7.801377279124862e-05, + "loss": 2.0771, + "step": 5546 + }, + { + "epoch": 0.5835875854813256, + "grad_norm": 1.841912031173706, + "learning_rate": 7.798053054120143e-05, + "loss": 1.9201, + "step": 5547 + }, + { + "epoch": 0.5836927932667018, + "grad_norm": 2.079071044921875, + "learning_rate": 7.794729084820272e-05, + "loss": 2.0332, + "step": 5548 + }, + { + "epoch": 0.5837980010520779, + "grad_norm": 1.3038408756256104, + "learning_rate": 7.791405371611249e-05, + "loss": 1.6136, + "step": 5549 + }, + { + "epoch": 0.583903208837454, + "grad_norm": 1.194056510925293, + "learning_rate": 7.788081914879051e-05, + "loss": 1.6015, + "step": 5550 + }, + { + "epoch": 0.5840084166228301, + "grad_norm": 1.3977482318878174, + "learning_rate": 7.784758715009616e-05, + "loss": 1.8922, + "step": 5551 + }, + { + "epoch": 0.5841136244082062, + "grad_norm": 1.124822735786438, + "learning_rate": 7.781435772388854e-05, + "loss": 1.4993, + "step": 5552 + }, + { + "epoch": 0.5842188321935823, + "grad_norm": 1.9948351383209229, + "learning_rate": 7.778113087402649e-05, + "loss": 1.7896, + "step": 5553 + }, + { + "epoch": 0.5843240399789584, + "grad_norm": 1.693495273590088, + "learning_rate": 7.774790660436858e-05, + "loss": 1.8484, + "step": 5554 + }, + { + "epoch": 0.5844292477643346, + "grad_norm": 1.079588532447815, + "learning_rate": 7.771468491877299e-05, + "loss": 2.1711, + "step": 5555 + }, + { + "epoch": 0.5845344555497107, + "grad_norm": 2.974010467529297, + "learning_rate": 7.768146582109765e-05, + "loss": 1.8077, + "step": 5556 + }, + { + "epoch": 0.5846396633350868, + "grad_norm": 2.0303964614868164, + "learning_rate": 7.764824931520018e-05, + "loss": 1.8446, + "step": 5557 + }, + { + "epoch": 0.5847448711204629, + "grad_norm": 1.5578415393829346, + "learning_rate": 7.761503540493795e-05, + "loss": 1.1523, + "step": 5558 + }, + { + "epoch": 0.5848500789058391, + "grad_norm": 1.480180025100708, + "learning_rate": 7.758182409416792e-05, + "loss": 1.5651, + "step": 5559 + }, + { + "epoch": 0.5849552866912151, + "grad_norm": 1.6015815734863281, + "learning_rate": 7.754861538674683e-05, + "loss": 1.8731, + "step": 5560 + }, + { + "epoch": 0.5850604944765913, + "grad_norm": 2.9025611877441406, + "learning_rate": 7.751540928653113e-05, + "loss": 0.9335, + "step": 5561 + }, + { + "epoch": 0.5851657022619674, + "grad_norm": 1.393959403038025, + "learning_rate": 7.748220579737689e-05, + "loss": 2.0774, + "step": 5562 + }, + { + "epoch": 0.5852709100473436, + "grad_norm": 1.700785756111145, + "learning_rate": 7.744900492313995e-05, + "loss": 1.7171, + "step": 5563 + }, + { + "epoch": 0.5853761178327196, + "grad_norm": 1.6016006469726562, + "learning_rate": 7.741580666767583e-05, + "loss": 1.7215, + "step": 5564 + }, + { + "epoch": 0.5854813256180957, + "grad_norm": 1.1057853698730469, + "learning_rate": 7.738261103483968e-05, + "loss": 1.1364, + "step": 5565 + }, + { + "epoch": 0.5855865334034719, + "grad_norm": 1.9537837505340576, + "learning_rate": 7.734941802848643e-05, + "loss": 1.5892, + "step": 5566 + }, + { + "epoch": 0.5856917411888479, + "grad_norm": 1.664334774017334, + "learning_rate": 7.731622765247069e-05, + "loss": 1.7183, + "step": 5567 + }, + { + "epoch": 0.5857969489742241, + "grad_norm": 1.9867991209030151, + "learning_rate": 7.728303991064675e-05, + "loss": 1.5369, + "step": 5568 + }, + { + "epoch": 0.5859021567596002, + "grad_norm": 1.713571310043335, + "learning_rate": 7.724985480686856e-05, + "loss": 2.0291, + "step": 5569 + }, + { + "epoch": 0.5860073645449764, + "grad_norm": 1.1955255270004272, + "learning_rate": 7.721667234498982e-05, + "loss": 2.378, + "step": 5570 + }, + { + "epoch": 0.5861125723303524, + "grad_norm": 1.5413669347763062, + "learning_rate": 7.718349252886395e-05, + "loss": 2.0389, + "step": 5571 + }, + { + "epoch": 0.5862177801157286, + "grad_norm": 1.7773537635803223, + "learning_rate": 7.715031536234392e-05, + "loss": 1.7247, + "step": 5572 + }, + { + "epoch": 0.5863229879011047, + "grad_norm": 1.7139322757720947, + "learning_rate": 7.711714084928251e-05, + "loss": 1.7779, + "step": 5573 + }, + { + "epoch": 0.5864281956864807, + "grad_norm": 1.3501460552215576, + "learning_rate": 7.708396899353219e-05, + "loss": 1.6001, + "step": 5574 + }, + { + "epoch": 0.5865334034718569, + "grad_norm": 3.4829583168029785, + "learning_rate": 7.705079979894509e-05, + "loss": 1.5646, + "step": 5575 + }, + { + "epoch": 0.586638611257233, + "grad_norm": 1.7532110214233398, + "learning_rate": 7.701763326937304e-05, + "loss": 1.5209, + "step": 5576 + }, + { + "epoch": 0.5867438190426092, + "grad_norm": 1.371377944946289, + "learning_rate": 7.698446940866754e-05, + "loss": 1.5853, + "step": 5577 + }, + { + "epoch": 0.5868490268279852, + "grad_norm": 1.3144980669021606, + "learning_rate": 7.695130822067984e-05, + "loss": 1.5741, + "step": 5578 + }, + { + "epoch": 0.5869542346133614, + "grad_norm": 1.3463817834854126, + "learning_rate": 7.691814970926083e-05, + "loss": 1.7503, + "step": 5579 + }, + { + "epoch": 0.5870594423987375, + "grad_norm": 1.7925662994384766, + "learning_rate": 7.688499387826107e-05, + "loss": 1.3115, + "step": 5580 + }, + { + "epoch": 0.5871646501841137, + "grad_norm": 4.266043186187744, + "learning_rate": 7.685184073153085e-05, + "loss": 1.97, + "step": 5581 + }, + { + "epoch": 0.5872698579694897, + "grad_norm": 1.344207525253296, + "learning_rate": 7.681869027292016e-05, + "loss": 1.6598, + "step": 5582 + }, + { + "epoch": 0.5873750657548659, + "grad_norm": 1.2654930353164673, + "learning_rate": 7.67855425062786e-05, + "loss": 1.6116, + "step": 5583 + }, + { + "epoch": 0.587480273540242, + "grad_norm": 1.6905845403671265, + "learning_rate": 7.675239743545557e-05, + "loss": 2.0407, + "step": 5584 + }, + { + "epoch": 0.587585481325618, + "grad_norm": 1.8982789516448975, + "learning_rate": 7.67192550643001e-05, + "loss": 1.9079, + "step": 5585 + }, + { + "epoch": 0.5876906891109942, + "grad_norm": 1.407412052154541, + "learning_rate": 7.668611539666085e-05, + "loss": 1.8989, + "step": 5586 + }, + { + "epoch": 0.5877958968963704, + "grad_norm": 1.6381629705429077, + "learning_rate": 7.665297843638623e-05, + "loss": 1.3814, + "step": 5587 + }, + { + "epoch": 0.5879011046817465, + "grad_norm": 2.0129973888397217, + "learning_rate": 7.661984418732438e-05, + "loss": 2.0981, + "step": 5588 + }, + { + "epoch": 0.5880063124671225, + "grad_norm": 1.516152024269104, + "learning_rate": 7.6586712653323e-05, + "loss": 1.6191, + "step": 5589 + }, + { + "epoch": 0.5881115202524987, + "grad_norm": 1.2309309244155884, + "learning_rate": 7.655358383822959e-05, + "loss": 1.4041, + "step": 5590 + }, + { + "epoch": 0.5882167280378748, + "grad_norm": 1.7488607168197632, + "learning_rate": 7.652045774589129e-05, + "loss": 2.1543, + "step": 5591 + }, + { + "epoch": 0.5883219358232509, + "grad_norm": 1.7684071063995361, + "learning_rate": 7.648733438015493e-05, + "loss": 2.056, + "step": 5592 + }, + { + "epoch": 0.588427143608627, + "grad_norm": 1.56985342502594, + "learning_rate": 7.645421374486702e-05, + "loss": 1.8957, + "step": 5593 + }, + { + "epoch": 0.5885323513940032, + "grad_norm": 2.020207166671753, + "learning_rate": 7.64210958438737e-05, + "loss": 1.7548, + "step": 5594 + }, + { + "epoch": 0.5886375591793793, + "grad_norm": 1.3547688722610474, + "learning_rate": 7.638798068102086e-05, + "loss": 1.8783, + "step": 5595 + }, + { + "epoch": 0.5887427669647554, + "grad_norm": 1.2740882635116577, + "learning_rate": 7.635486826015412e-05, + "loss": 2.1481, + "step": 5596 + }, + { + "epoch": 0.5888479747501315, + "grad_norm": 1.9236841201782227, + "learning_rate": 7.632175858511863e-05, + "loss": 1.3462, + "step": 5597 + }, + { + "epoch": 0.5889531825355077, + "grad_norm": 1.3053388595581055, + "learning_rate": 7.628865165975934e-05, + "loss": 1.6169, + "step": 5598 + }, + { + "epoch": 0.5890583903208837, + "grad_norm": 1.8421201705932617, + "learning_rate": 7.625554748792085e-05, + "loss": 1.6345, + "step": 5599 + }, + { + "epoch": 0.5891635981062598, + "grad_norm": 1.4617395401000977, + "learning_rate": 7.622244607344748e-05, + "loss": 1.9279, + "step": 5600 + }, + { + "epoch": 0.589268805891636, + "grad_norm": 1.3376179933547974, + "learning_rate": 7.618934742018312e-05, + "loss": 1.5823, + "step": 5601 + }, + { + "epoch": 0.5893740136770121, + "grad_norm": 1.5780961513519287, + "learning_rate": 7.615625153197143e-05, + "loss": 2.1328, + "step": 5602 + }, + { + "epoch": 0.5894792214623882, + "grad_norm": 1.1965007781982422, + "learning_rate": 7.612315841265577e-05, + "loss": 1.136, + "step": 5603 + }, + { + "epoch": 0.5895844292477643, + "grad_norm": 1.290488600730896, + "learning_rate": 7.609006806607907e-05, + "loss": 1.5277, + "step": 5604 + }, + { + "epoch": 0.5896896370331405, + "grad_norm": 1.6050409078598022, + "learning_rate": 7.605698049608403e-05, + "loss": 1.8864, + "step": 5605 + }, + { + "epoch": 0.5897948448185165, + "grad_norm": 1.6614772081375122, + "learning_rate": 7.602389570651303e-05, + "loss": 1.9447, + "step": 5606 + }, + { + "epoch": 0.5899000526038927, + "grad_norm": 1.3600127696990967, + "learning_rate": 7.599081370120804e-05, + "loss": 1.6589, + "step": 5607 + }, + { + "epoch": 0.5900052603892688, + "grad_norm": 1.4860687255859375, + "learning_rate": 7.595773448401081e-05, + "loss": 1.5742, + "step": 5608 + }, + { + "epoch": 0.590110468174645, + "grad_norm": 1.154513955116272, + "learning_rate": 7.59246580587627e-05, + "loss": 2.1251, + "step": 5609 + }, + { + "epoch": 0.590215675960021, + "grad_norm": 2.0505974292755127, + "learning_rate": 7.589158442930478e-05, + "loss": 1.799, + "step": 5610 + }, + { + "epoch": 0.5903208837453972, + "grad_norm": 2.0056207180023193, + "learning_rate": 7.585851359947776e-05, + "loss": 1.6373, + "step": 5611 + }, + { + "epoch": 0.5904260915307733, + "grad_norm": 1.5547747611999512, + "learning_rate": 7.582544557312205e-05, + "loss": 1.4256, + "step": 5612 + }, + { + "epoch": 0.5905312993161494, + "grad_norm": 1.8376978635787964, + "learning_rate": 7.579238035407776e-05, + "loss": 1.3878, + "step": 5613 + }, + { + "epoch": 0.5906365071015255, + "grad_norm": 1.0613032579421997, + "learning_rate": 7.575931794618466e-05, + "loss": 1.9392, + "step": 5614 + }, + { + "epoch": 0.5907417148869016, + "grad_norm": 1.1583870649337769, + "learning_rate": 7.572625835328211e-05, + "loss": 1.7329, + "step": 5615 + }, + { + "epoch": 0.5908469226722778, + "grad_norm": 1.783522367477417, + "learning_rate": 7.569320157920923e-05, + "loss": 1.6231, + "step": 5616 + }, + { + "epoch": 0.5909521304576538, + "grad_norm": 1.3971893787384033, + "learning_rate": 7.566014762780483e-05, + "loss": 1.6672, + "step": 5617 + }, + { + "epoch": 0.59105733824303, + "grad_norm": 1.681029200553894, + "learning_rate": 7.562709650290732e-05, + "loss": 1.5265, + "step": 5618 + }, + { + "epoch": 0.5911625460284061, + "grad_norm": 1.8332031965255737, + "learning_rate": 7.559404820835484e-05, + "loss": 1.695, + "step": 5619 + }, + { + "epoch": 0.5912677538137823, + "grad_norm": 1.792855143547058, + "learning_rate": 7.556100274798519e-05, + "loss": 1.7872, + "step": 5620 + }, + { + "epoch": 0.5913729615991583, + "grad_norm": 1.1729624271392822, + "learning_rate": 7.55279601256358e-05, + "loss": 2.0353, + "step": 5621 + }, + { + "epoch": 0.5914781693845345, + "grad_norm": 1.0647164583206177, + "learning_rate": 7.549492034514381e-05, + "loss": 1.7371, + "step": 5622 + }, + { + "epoch": 0.5915833771699106, + "grad_norm": 1.6151602268218994, + "learning_rate": 7.546188341034603e-05, + "loss": 1.0778, + "step": 5623 + }, + { + "epoch": 0.5916885849552866, + "grad_norm": 1.2735862731933594, + "learning_rate": 7.542884932507896e-05, + "loss": 1.7771, + "step": 5624 + }, + { + "epoch": 0.5917937927406628, + "grad_norm": 1.3248671293258667, + "learning_rate": 7.539581809317866e-05, + "loss": 1.3799, + "step": 5625 + }, + { + "epoch": 0.5918990005260389, + "grad_norm": 1.618085265159607, + "learning_rate": 7.536278971848101e-05, + "loss": 1.9688, + "step": 5626 + }, + { + "epoch": 0.5920042083114151, + "grad_norm": 1.132401704788208, + "learning_rate": 7.532976420482146e-05, + "loss": 1.8808, + "step": 5627 + }, + { + "epoch": 0.5921094160967911, + "grad_norm": 1.2111188173294067, + "learning_rate": 7.529674155603516e-05, + "loss": 1.8875, + "step": 5628 + }, + { + "epoch": 0.5922146238821673, + "grad_norm": 1.4924447536468506, + "learning_rate": 7.52637217759569e-05, + "loss": 1.5021, + "step": 5629 + }, + { + "epoch": 0.5923198316675434, + "grad_norm": 1.672135353088379, + "learning_rate": 7.52307048684212e-05, + "loss": 1.4673, + "step": 5630 + }, + { + "epoch": 0.5924250394529195, + "grad_norm": 1.491942048072815, + "learning_rate": 7.519769083726216e-05, + "loss": 1.5303, + "step": 5631 + }, + { + "epoch": 0.5925302472382956, + "grad_norm": 1.7676005363464355, + "learning_rate": 7.51646796863136e-05, + "loss": 1.7333, + "step": 5632 + }, + { + "epoch": 0.5926354550236718, + "grad_norm": 0.917346179485321, + "learning_rate": 7.513167141940904e-05, + "loss": 1.8271, + "step": 5633 + }, + { + "epoch": 0.5927406628090479, + "grad_norm": 1.5192989110946655, + "learning_rate": 7.509866604038157e-05, + "loss": 1.8536, + "step": 5634 + }, + { + "epoch": 0.592845870594424, + "grad_norm": 0.7993506789207458, + "learning_rate": 7.506566355306402e-05, + "loss": 1.6131, + "step": 5635 + }, + { + "epoch": 0.5929510783798001, + "grad_norm": 1.2340017557144165, + "learning_rate": 7.503266396128887e-05, + "loss": 1.6488, + "step": 5636 + }, + { + "epoch": 0.5930562861651762, + "grad_norm": 1.8110322952270508, + "learning_rate": 7.499966726888823e-05, + "loss": 2.1135, + "step": 5637 + }, + { + "epoch": 0.5931614939505523, + "grad_norm": 1.2862292528152466, + "learning_rate": 7.49666734796939e-05, + "loss": 2.2786, + "step": 5638 + }, + { + "epoch": 0.5932667017359284, + "grad_norm": 2.9395089149475098, + "learning_rate": 7.493368259753734e-05, + "loss": 1.8967, + "step": 5639 + }, + { + "epoch": 0.5933719095213046, + "grad_norm": 2.132434844970703, + "learning_rate": 7.490069462624967e-05, + "loss": 1.3319, + "step": 5640 + }, + { + "epoch": 0.5934771173066807, + "grad_norm": 1.9779353141784668, + "learning_rate": 7.486770956966171e-05, + "loss": 2.0619, + "step": 5641 + }, + { + "epoch": 0.5935823250920568, + "grad_norm": 1.540423035621643, + "learning_rate": 7.483472743160387e-05, + "loss": 1.6819, + "step": 5642 + }, + { + "epoch": 0.5936875328774329, + "grad_norm": 1.2363083362579346, + "learning_rate": 7.480174821590624e-05, + "loss": 1.7712, + "step": 5643 + }, + { + "epoch": 0.5937927406628091, + "grad_norm": 1.7649072408676147, + "learning_rate": 7.476877192639866e-05, + "loss": 1.4488, + "step": 5644 + }, + { + "epoch": 0.5938979484481852, + "grad_norm": 1.634068489074707, + "learning_rate": 7.473579856691047e-05, + "loss": 2.044, + "step": 5645 + }, + { + "epoch": 0.5940031562335613, + "grad_norm": 1.0379782915115356, + "learning_rate": 7.470282814127081e-05, + "loss": 1.8553, + "step": 5646 + }, + { + "epoch": 0.5941083640189374, + "grad_norm": 1.697240948677063, + "learning_rate": 7.466986065330841e-05, + "loss": 1.4889, + "step": 5647 + }, + { + "epoch": 0.5942135718043136, + "grad_norm": 1.2661585807800293, + "learning_rate": 7.463689610685171e-05, + "loss": 2.0126, + "step": 5648 + }, + { + "epoch": 0.5943187795896896, + "grad_norm": 1.558271884918213, + "learning_rate": 7.460393450572872e-05, + "loss": 1.5544, + "step": 5649 + }, + { + "epoch": 0.5944239873750657, + "grad_norm": 1.657724142074585, + "learning_rate": 7.457097585376719e-05, + "loss": 2.0922, + "step": 5650 + }, + { + "epoch": 0.5945291951604419, + "grad_norm": 1.5575658082962036, + "learning_rate": 7.453802015479452e-05, + "loss": 2.0446, + "step": 5651 + }, + { + "epoch": 0.594634402945818, + "grad_norm": 1.7816355228424072, + "learning_rate": 7.45050674126377e-05, + "loss": 1.5467, + "step": 5652 + }, + { + "epoch": 0.5947396107311941, + "grad_norm": 1.14240562915802, + "learning_rate": 7.447211763112346e-05, + "loss": 1.4401, + "step": 5653 + }, + { + "epoch": 0.5948448185165702, + "grad_norm": 1.691174030303955, + "learning_rate": 7.443917081407816e-05, + "loss": 1.5377, + "step": 5654 + }, + { + "epoch": 0.5949500263019464, + "grad_norm": 2.1762633323669434, + "learning_rate": 7.440622696532775e-05, + "loss": 1.6951, + "step": 5655 + }, + { + "epoch": 0.5950552340873224, + "grad_norm": 1.880930781364441, + "learning_rate": 7.437328608869793e-05, + "loss": 1.3951, + "step": 5656 + }, + { + "epoch": 0.5951604418726986, + "grad_norm": 1.4270281791687012, + "learning_rate": 7.434034818801405e-05, + "loss": 2.372, + "step": 5657 + }, + { + "epoch": 0.5952656496580747, + "grad_norm": 1.9598197937011719, + "learning_rate": 7.4307413267101e-05, + "loss": 1.3984, + "step": 5658 + }, + { + "epoch": 0.5953708574434509, + "grad_norm": 1.6525239944458008, + "learning_rate": 7.427448132978346e-05, + "loss": 1.699, + "step": 5659 + }, + { + "epoch": 0.5954760652288269, + "grad_norm": 1.7269477844238281, + "learning_rate": 7.424155237988567e-05, + "loss": 1.6827, + "step": 5660 + }, + { + "epoch": 0.595581273014203, + "grad_norm": 1.3311235904693604, + "learning_rate": 7.420862642123158e-05, + "loss": 1.6645, + "step": 5661 + }, + { + "epoch": 0.5956864807995792, + "grad_norm": 1.446498990058899, + "learning_rate": 7.417570345764481e-05, + "loss": 1.3863, + "step": 5662 + }, + { + "epoch": 0.5957916885849552, + "grad_norm": 1.93930184841156, + "learning_rate": 7.414278349294852e-05, + "loss": 1.7474, + "step": 5663 + }, + { + "epoch": 0.5958968963703314, + "grad_norm": 1.7791824340820312, + "learning_rate": 7.410986653096565e-05, + "loss": 1.564, + "step": 5664 + }, + { + "epoch": 0.5960021041557075, + "grad_norm": 1.487729549407959, + "learning_rate": 7.407695257551875e-05, + "loss": 1.54, + "step": 5665 + }, + { + "epoch": 0.5961073119410837, + "grad_norm": 1.072798490524292, + "learning_rate": 7.404404163042995e-05, + "loss": 1.4969, + "step": 5666 + }, + { + "epoch": 0.5962125197264597, + "grad_norm": 1.5234678983688354, + "learning_rate": 7.401113369952113e-05, + "loss": 2.2265, + "step": 5667 + }, + { + "epoch": 0.5963177275118359, + "grad_norm": 1.3278653621673584, + "learning_rate": 7.397822878661377e-05, + "loss": 2.0208, + "step": 5668 + }, + { + "epoch": 0.596422935297212, + "grad_norm": 1.424271583557129, + "learning_rate": 7.394532689552905e-05, + "loss": 2.034, + "step": 5669 + }, + { + "epoch": 0.5965281430825881, + "grad_norm": 1.7164738178253174, + "learning_rate": 7.391242803008768e-05, + "loss": 2.2689, + "step": 5670 + }, + { + "epoch": 0.5966333508679642, + "grad_norm": 3.884221315383911, + "learning_rate": 7.387953219411015e-05, + "loss": 1.8426, + "step": 5671 + }, + { + "epoch": 0.5967385586533404, + "grad_norm": 1.339316964149475, + "learning_rate": 7.384663939141656e-05, + "loss": 1.9429, + "step": 5672 + }, + { + "epoch": 0.5968437664387165, + "grad_norm": 1.4739105701446533, + "learning_rate": 7.381374962582659e-05, + "loss": 1.3312, + "step": 5673 + }, + { + "epoch": 0.5969489742240925, + "grad_norm": 1.0755393505096436, + "learning_rate": 7.378086290115964e-05, + "loss": 1.7607, + "step": 5674 + }, + { + "epoch": 0.5970541820094687, + "grad_norm": 1.6911028623580933, + "learning_rate": 7.374797922123478e-05, + "loss": 1.4523, + "step": 5675 + }, + { + "epoch": 0.5971593897948448, + "grad_norm": 1.7561362981796265, + "learning_rate": 7.371509858987061e-05, + "loss": 2.0099, + "step": 5676 + }, + { + "epoch": 0.597264597580221, + "grad_norm": 1.560784935951233, + "learning_rate": 7.368222101088549e-05, + "loss": 1.5806, + "step": 5677 + }, + { + "epoch": 0.597369805365597, + "grad_norm": 1.1424559354782104, + "learning_rate": 7.364934648809741e-05, + "loss": 2.1778, + "step": 5678 + }, + { + "epoch": 0.5974750131509732, + "grad_norm": 1.393723726272583, + "learning_rate": 7.361647502532395e-05, + "loss": 2.0167, + "step": 5679 + }, + { + "epoch": 0.5975802209363493, + "grad_norm": 0.8226059079170227, + "learning_rate": 7.358360662638236e-05, + "loss": 1.9121, + "step": 5680 + }, + { + "epoch": 0.5976854287217254, + "grad_norm": 1.6506311893463135, + "learning_rate": 7.355074129508953e-05, + "loss": 2.2151, + "step": 5681 + }, + { + "epoch": 0.5977906365071015, + "grad_norm": 1.2521584033966064, + "learning_rate": 7.351787903526201e-05, + "loss": 2.056, + "step": 5682 + }, + { + "epoch": 0.5978958442924777, + "grad_norm": 1.4110209941864014, + "learning_rate": 7.348501985071603e-05, + "loss": 1.7132, + "step": 5683 + }, + { + "epoch": 0.5980010520778538, + "grad_norm": 1.7280937433242798, + "learning_rate": 7.345216374526736e-05, + "loss": 1.8772, + "step": 5684 + }, + { + "epoch": 0.5981062598632298, + "grad_norm": 1.448601484298706, + "learning_rate": 7.341931072273148e-05, + "loss": 1.9043, + "step": 5685 + }, + { + "epoch": 0.598211467648606, + "grad_norm": 1.5108412504196167, + "learning_rate": 7.338646078692356e-05, + "loss": 1.0912, + "step": 5686 + }, + { + "epoch": 0.5983166754339821, + "grad_norm": 1.2896003723144531, + "learning_rate": 7.335361394165825e-05, + "loss": 1.8589, + "step": 5687 + }, + { + "epoch": 0.5984218832193582, + "grad_norm": 1.537742257118225, + "learning_rate": 7.332077019075005e-05, + "loss": 1.4406, + "step": 5688 + }, + { + "epoch": 0.5985270910047343, + "grad_norm": 1.5651648044586182, + "learning_rate": 7.328792953801296e-05, + "loss": 1.7907, + "step": 5689 + }, + { + "epoch": 0.5986322987901105, + "grad_norm": 1.8168197870254517, + "learning_rate": 7.325509198726064e-05, + "loss": 2.1356, + "step": 5690 + }, + { + "epoch": 0.5987375065754866, + "grad_norm": 1.6306514739990234, + "learning_rate": 7.322225754230641e-05, + "loss": 2.0181, + "step": 5691 + }, + { + "epoch": 0.5988427143608627, + "grad_norm": 1.733379602432251, + "learning_rate": 7.318942620696323e-05, + "loss": 1.3749, + "step": 5692 + }, + { + "epoch": 0.5989479221462388, + "grad_norm": 2.751979351043701, + "learning_rate": 7.315659798504373e-05, + "loss": 1.2602, + "step": 5693 + }, + { + "epoch": 0.599053129931615, + "grad_norm": 1.8571940660476685, + "learning_rate": 7.312377288036009e-05, + "loss": 1.5986, + "step": 5694 + }, + { + "epoch": 0.599158337716991, + "grad_norm": 2.700819253921509, + "learning_rate": 7.30909508967242e-05, + "loss": 2.2613, + "step": 5695 + }, + { + "epoch": 0.5992635455023672, + "grad_norm": 1.9650945663452148, + "learning_rate": 7.30581320379476e-05, + "loss": 1.4637, + "step": 5696 + }, + { + "epoch": 0.5993687532877433, + "grad_norm": 2.2203495502471924, + "learning_rate": 7.302531630784137e-05, + "loss": 1.7075, + "step": 5697 + }, + { + "epoch": 0.5994739610731195, + "grad_norm": 1.5563139915466309, + "learning_rate": 7.299250371021635e-05, + "loss": 1.9756, + "step": 5698 + }, + { + "epoch": 0.5995791688584955, + "grad_norm": 1.332543969154358, + "learning_rate": 7.295969424888295e-05, + "loss": 1.4028, + "step": 5699 + }, + { + "epoch": 0.5996843766438716, + "grad_norm": 1.5485048294067383, + "learning_rate": 7.292688792765126e-05, + "loss": 1.5521, + "step": 5700 + }, + { + "epoch": 0.5997895844292478, + "grad_norm": 1.584691047668457, + "learning_rate": 7.289408475033086e-05, + "loss": 1.5764, + "step": 5701 + }, + { + "epoch": 0.5998947922146238, + "grad_norm": 1.7013684511184692, + "learning_rate": 7.286128472073114e-05, + "loss": 2.1275, + "step": 5702 + }, + { + "epoch": 0.6, + "grad_norm": 1.437490701675415, + "learning_rate": 7.282848784266107e-05, + "loss": 2.214, + "step": 5703 + }, + { + "epoch": 0.6001052077853761, + "grad_norm": 1.080392837524414, + "learning_rate": 7.279569411992926e-05, + "loss": 1.4222, + "step": 5704 + }, + { + "epoch": 0.6002104155707523, + "grad_norm": 1.6214826107025146, + "learning_rate": 7.276290355634387e-05, + "loss": 2.1349, + "step": 5705 + }, + { + "epoch": 0.6003156233561283, + "grad_norm": 1.7013198137283325, + "learning_rate": 7.273011615571282e-05, + "loss": 1.5759, + "step": 5706 + }, + { + "epoch": 0.6004208311415045, + "grad_norm": 1.5219919681549072, + "learning_rate": 7.26973319218436e-05, + "loss": 2.1324, + "step": 5707 + }, + { + "epoch": 0.6005260389268806, + "grad_norm": 1.9878424406051636, + "learning_rate": 7.266455085854329e-05, + "loss": 2.4534, + "step": 5708 + }, + { + "epoch": 0.6006312467122568, + "grad_norm": 1.9398351907730103, + "learning_rate": 7.263177296961867e-05, + "loss": 1.2152, + "step": 5709 + }, + { + "epoch": 0.6007364544976328, + "grad_norm": 1.528769850730896, + "learning_rate": 7.259899825887617e-05, + "loss": 2.2585, + "step": 5710 + }, + { + "epoch": 0.600841662283009, + "grad_norm": 1.6075571775436401, + "learning_rate": 7.256622673012175e-05, + "loss": 2.076, + "step": 5711 + }, + { + "epoch": 0.6009468700683851, + "grad_norm": 1.6357545852661133, + "learning_rate": 7.253345838716108e-05, + "loss": 1.9542, + "step": 5712 + }, + { + "epoch": 0.6010520778537611, + "grad_norm": 1.8700578212738037, + "learning_rate": 7.250069323379945e-05, + "loss": 1.5798, + "step": 5713 + }, + { + "epoch": 0.6011572856391373, + "grad_norm": 2.4654650688171387, + "learning_rate": 7.246793127384174e-05, + "loss": 1.8467, + "step": 5714 + }, + { + "epoch": 0.6012624934245134, + "grad_norm": 2.0539462566375732, + "learning_rate": 7.243517251109254e-05, + "loss": 1.6161, + "step": 5715 + }, + { + "epoch": 0.6013677012098896, + "grad_norm": 2.258213520050049, + "learning_rate": 7.240241694935597e-05, + "loss": 1.9685, + "step": 5716 + }, + { + "epoch": 0.6014729089952656, + "grad_norm": 1.7232213020324707, + "learning_rate": 7.236966459243586e-05, + "loss": 2.4748, + "step": 5717 + }, + { + "epoch": 0.6015781167806418, + "grad_norm": 1.12027907371521, + "learning_rate": 7.233691544413558e-05, + "loss": 1.8557, + "step": 5718 + }, + { + "epoch": 0.6016833245660179, + "grad_norm": 1.5948566198349, + "learning_rate": 7.230416950825825e-05, + "loss": 1.2433, + "step": 5719 + }, + { + "epoch": 0.601788532351394, + "grad_norm": 1.1951110363006592, + "learning_rate": 7.227142678860652e-05, + "loss": 1.9291, + "step": 5720 + }, + { + "epoch": 0.6018937401367701, + "grad_norm": 1.3234333992004395, + "learning_rate": 7.22386872889827e-05, + "loss": 2.0288, + "step": 5721 + }, + { + "epoch": 0.6019989479221463, + "grad_norm": 1.8448857069015503, + "learning_rate": 7.22059510131887e-05, + "loss": 2.0855, + "step": 5722 + }, + { + "epoch": 0.6021041557075224, + "grad_norm": 1.6373887062072754, + "learning_rate": 7.217321796502605e-05, + "loss": 1.0645, + "step": 5723 + }, + { + "epoch": 0.6022093634928984, + "grad_norm": 2.3060243129730225, + "learning_rate": 7.214048814829598e-05, + "loss": 1.3767, + "step": 5724 + }, + { + "epoch": 0.6023145712782746, + "grad_norm": 1.55854332447052, + "learning_rate": 7.210776156679931e-05, + "loss": 1.5182, + "step": 5725 + }, + { + "epoch": 0.6024197790636507, + "grad_norm": 1.64441978931427, + "learning_rate": 7.20750382243364e-05, + "loss": 2.3997, + "step": 5726 + }, + { + "epoch": 0.6025249868490268, + "grad_norm": 0.9315603375434875, + "learning_rate": 7.204231812470736e-05, + "loss": 2.0199, + "step": 5727 + }, + { + "epoch": 0.6026301946344029, + "grad_norm": 1.946433663368225, + "learning_rate": 7.200960127171188e-05, + "loss": 2.2558, + "step": 5728 + }, + { + "epoch": 0.6027354024197791, + "grad_norm": 1.7605088949203491, + "learning_rate": 7.197688766914921e-05, + "loss": 1.8627, + "step": 5729 + }, + { + "epoch": 0.6028406102051552, + "grad_norm": 1.548949956893921, + "learning_rate": 7.19441773208183e-05, + "loss": 1.4795, + "step": 5730 + }, + { + "epoch": 0.6029458179905313, + "grad_norm": 2.702446222305298, + "learning_rate": 7.19114702305177e-05, + "loss": 1.6094, + "step": 5731 + }, + { + "epoch": 0.6030510257759074, + "grad_norm": 2.4522550106048584, + "learning_rate": 7.187876640204556e-05, + "loss": 1.7925, + "step": 5732 + }, + { + "epoch": 0.6031562335612836, + "grad_norm": 1.2998313903808594, + "learning_rate": 7.184606583919966e-05, + "loss": 1.4222, + "step": 5733 + }, + { + "epoch": 0.6032614413466596, + "grad_norm": 2.228344678878784, + "learning_rate": 7.181336854577747e-05, + "loss": 1.7094, + "step": 5734 + }, + { + "epoch": 0.6033666491320357, + "grad_norm": 1.8566731214523315, + "learning_rate": 7.178067452557595e-05, + "loss": 1.943, + "step": 5735 + }, + { + "epoch": 0.6034718569174119, + "grad_norm": 1.346657395362854, + "learning_rate": 7.174798378239176e-05, + "loss": 1.7142, + "step": 5736 + }, + { + "epoch": 0.603577064702788, + "grad_norm": 1.3058844804763794, + "learning_rate": 7.171529632002121e-05, + "loss": 1.9819, + "step": 5737 + }, + { + "epoch": 0.6036822724881641, + "grad_norm": 1.9217864274978638, + "learning_rate": 7.168261214226014e-05, + "loss": 1.6599, + "step": 5738 + }, + { + "epoch": 0.6037874802735402, + "grad_norm": 1.5166659355163574, + "learning_rate": 7.164993125290407e-05, + "loss": 1.9654, + "step": 5739 + }, + { + "epoch": 0.6038926880589164, + "grad_norm": 1.7919169664382935, + "learning_rate": 7.161725365574811e-05, + "loss": 1.7845, + "step": 5740 + }, + { + "epoch": 0.6039978958442925, + "grad_norm": 1.7721195220947266, + "learning_rate": 7.158457935458706e-05, + "loss": 1.7366, + "step": 5741 + }, + { + "epoch": 0.6041031036296686, + "grad_norm": 3.324281930923462, + "learning_rate": 7.155190835321523e-05, + "loss": 1.8447, + "step": 5742 + }, + { + "epoch": 0.6042083114150447, + "grad_norm": 1.3101855516433716, + "learning_rate": 7.151924065542665e-05, + "loss": 1.9632, + "step": 5743 + }, + { + "epoch": 0.6043135192004209, + "grad_norm": 1.5195070505142212, + "learning_rate": 7.14865762650148e-05, + "loss": 1.4127, + "step": 5744 + }, + { + "epoch": 0.6044187269857969, + "grad_norm": 1.4836463928222656, + "learning_rate": 7.1453915185773e-05, + "loss": 2.0167, + "step": 5745 + }, + { + "epoch": 0.604523934771173, + "grad_norm": 2.11446475982666, + "learning_rate": 7.1421257421494e-05, + "loss": 1.4259, + "step": 5746 + }, + { + "epoch": 0.6046291425565492, + "grad_norm": 1.1224156618118286, + "learning_rate": 7.138860297597026e-05, + "loss": 1.6661, + "step": 5747 + }, + { + "epoch": 0.6047343503419254, + "grad_norm": 0.8229581713676453, + "learning_rate": 7.135595185299386e-05, + "loss": 1.5664, + "step": 5748 + }, + { + "epoch": 0.6048395581273014, + "grad_norm": 1.6141940355300903, + "learning_rate": 7.132330405635645e-05, + "loss": 1.6177, + "step": 5749 + }, + { + "epoch": 0.6049447659126775, + "grad_norm": 1.415408968925476, + "learning_rate": 7.12906595898493e-05, + "loss": 1.1493, + "step": 5750 + }, + { + "epoch": 0.6050499736980537, + "grad_norm": 2.1405954360961914, + "learning_rate": 7.12580184572633e-05, + "loss": 2.1279, + "step": 5751 + }, + { + "epoch": 0.6051551814834297, + "grad_norm": 1.3183344602584839, + "learning_rate": 7.122538066238902e-05, + "loss": 1.8046, + "step": 5752 + }, + { + "epoch": 0.6052603892688059, + "grad_norm": 2.0879404544830322, + "learning_rate": 7.119274620901649e-05, + "loss": 1.4804, + "step": 5753 + }, + { + "epoch": 0.605365597054182, + "grad_norm": 1.4626708030700684, + "learning_rate": 7.116011510093547e-05, + "loss": 1.9303, + "step": 5754 + }, + { + "epoch": 0.6054708048395582, + "grad_norm": 1.3531354665756226, + "learning_rate": 7.112748734193537e-05, + "loss": 1.9332, + "step": 5755 + }, + { + "epoch": 0.6055760126249342, + "grad_norm": 2.5419464111328125, + "learning_rate": 7.109486293580505e-05, + "loss": 2.2981, + "step": 5756 + }, + { + "epoch": 0.6056812204103104, + "grad_norm": 1.8105266094207764, + "learning_rate": 7.106224188633311e-05, + "loss": 2.1267, + "step": 5757 + }, + { + "epoch": 0.6057864281956865, + "grad_norm": 1.5312819480895996, + "learning_rate": 7.102962419730776e-05, + "loss": 1.9916, + "step": 5758 + }, + { + "epoch": 0.6058916359810625, + "grad_norm": 2.3092429637908936, + "learning_rate": 7.099700987251674e-05, + "loss": 2.2213, + "step": 5759 + }, + { + "epoch": 0.6059968437664387, + "grad_norm": 1.4174106121063232, + "learning_rate": 7.096439891574745e-05, + "loss": 1.9643, + "step": 5760 + }, + { + "epoch": 0.6061020515518148, + "grad_norm": 2.2073097229003906, + "learning_rate": 7.09317913307869e-05, + "loss": 2.1425, + "step": 5761 + }, + { + "epoch": 0.606207259337191, + "grad_norm": 1.472976803779602, + "learning_rate": 7.089918712142172e-05, + "loss": 1.3008, + "step": 5762 + }, + { + "epoch": 0.606312467122567, + "grad_norm": 27.837261199951172, + "learning_rate": 7.086658629143811e-05, + "loss": 1.9546, + "step": 5763 + }, + { + "epoch": 0.6064176749079432, + "grad_norm": 1.4600732326507568, + "learning_rate": 7.083398884462194e-05, + "loss": 1.7937, + "step": 5764 + }, + { + "epoch": 0.6065228826933193, + "grad_norm": 1.1610373258590698, + "learning_rate": 7.080139478475853e-05, + "loss": 2.1027, + "step": 5765 + }, + { + "epoch": 0.6066280904786954, + "grad_norm": 1.7224698066711426, + "learning_rate": 7.076880411563305e-05, + "loss": 1.8358, + "step": 5766 + }, + { + "epoch": 0.6067332982640715, + "grad_norm": 1.7691140174865723, + "learning_rate": 7.073621684103007e-05, + "loss": 1.7801, + "step": 5767 + }, + { + "epoch": 0.6068385060494477, + "grad_norm": 1.6140860319137573, + "learning_rate": 7.070363296473384e-05, + "loss": 1.1727, + "step": 5768 + }, + { + "epoch": 0.6069437138348238, + "grad_norm": 1.1361531019210815, + "learning_rate": 7.067105249052828e-05, + "loss": 1.7667, + "step": 5769 + }, + { + "epoch": 0.6070489216201999, + "grad_norm": 1.355162501335144, + "learning_rate": 7.063847542219679e-05, + "loss": 1.0353, + "step": 5770 + }, + { + "epoch": 0.607154129405576, + "grad_norm": 0.9227995872497559, + "learning_rate": 7.060590176352248e-05, + "loss": 1.9074, + "step": 5771 + }, + { + "epoch": 0.6072593371909522, + "grad_norm": 1.033094882965088, + "learning_rate": 7.057333151828799e-05, + "loss": 2.163, + "step": 5772 + }, + { + "epoch": 0.6073645449763283, + "grad_norm": 1.4324654340744019, + "learning_rate": 7.054076469027565e-05, + "loss": 1.7172, + "step": 5773 + }, + { + "epoch": 0.6074697527617043, + "grad_norm": 2.334664821624756, + "learning_rate": 7.050820128326724e-05, + "loss": 2.092, + "step": 5774 + }, + { + "epoch": 0.6075749605470805, + "grad_norm": 2.054091453552246, + "learning_rate": 7.047564130104434e-05, + "loss": 1.0949, + "step": 5775 + }, + { + "epoch": 0.6076801683324566, + "grad_norm": 2.1884820461273193, + "learning_rate": 7.044308474738798e-05, + "loss": 1.7727, + "step": 5776 + }, + { + "epoch": 0.6077853761178327, + "grad_norm": 1.3433359861373901, + "learning_rate": 7.041053162607886e-05, + "loss": 1.8029, + "step": 5777 + }, + { + "epoch": 0.6078905839032088, + "grad_norm": 1.5086860656738281, + "learning_rate": 7.037798194089728e-05, + "loss": 1.7065, + "step": 5778 + }, + { + "epoch": 0.607995791688585, + "grad_norm": 1.6031910181045532, + "learning_rate": 7.034543569562313e-05, + "loss": 1.7895, + "step": 5779 + }, + { + "epoch": 0.6081009994739611, + "grad_norm": 1.8059049844741821, + "learning_rate": 7.031289289403584e-05, + "loss": 1.7179, + "step": 5780 + }, + { + "epoch": 0.6082062072593372, + "grad_norm": 1.4660775661468506, + "learning_rate": 7.028035353991456e-05, + "loss": 2.0142, + "step": 5781 + }, + { + "epoch": 0.6083114150447133, + "grad_norm": 1.5748193264007568, + "learning_rate": 7.024781763703797e-05, + "loss": 1.9199, + "step": 5782 + }, + { + "epoch": 0.6084166228300895, + "grad_norm": 1.2944796085357666, + "learning_rate": 7.021528518918433e-05, + "loss": 1.1247, + "step": 5783 + }, + { + "epoch": 0.6085218306154655, + "grad_norm": 1.12498140335083, + "learning_rate": 7.018275620013154e-05, + "loss": 1.3863, + "step": 5784 + }, + { + "epoch": 0.6086270384008416, + "grad_norm": 1.4992774724960327, + "learning_rate": 7.01502306736571e-05, + "loss": 1.9758, + "step": 5785 + }, + { + "epoch": 0.6087322461862178, + "grad_norm": 1.8976898193359375, + "learning_rate": 7.01177086135381e-05, + "loss": 1.6387, + "step": 5786 + }, + { + "epoch": 0.608837453971594, + "grad_norm": 1.2787110805511475, + "learning_rate": 7.008519002355118e-05, + "loss": 1.9901, + "step": 5787 + }, + { + "epoch": 0.60894266175697, + "grad_norm": 1.5045870542526245, + "learning_rate": 7.005267490747263e-05, + "loss": 1.5003, + "step": 5788 + }, + { + "epoch": 0.6090478695423461, + "grad_norm": 1.5272982120513916, + "learning_rate": 7.002016326907831e-05, + "loss": 2.1544, + "step": 5789 + }, + { + "epoch": 0.6091530773277223, + "grad_norm": 1.7053319215774536, + "learning_rate": 6.998765511214374e-05, + "loss": 1.8607, + "step": 5790 + }, + { + "epoch": 0.6092582851130983, + "grad_norm": 1.7263697385787964, + "learning_rate": 6.995515044044393e-05, + "loss": 2.009, + "step": 5791 + }, + { + "epoch": 0.6093634928984745, + "grad_norm": 1.4831238985061646, + "learning_rate": 6.992264925775356e-05, + "loss": 1.7212, + "step": 5792 + }, + { + "epoch": 0.6094687006838506, + "grad_norm": 1.557859182357788, + "learning_rate": 6.989015156784689e-05, + "loss": 1.6485, + "step": 5793 + }, + { + "epoch": 0.6095739084692268, + "grad_norm": 1.1065948009490967, + "learning_rate": 6.98576573744978e-05, + "loss": 1.7752, + "step": 5794 + }, + { + "epoch": 0.6096791162546028, + "grad_norm": 1.3376137018203735, + "learning_rate": 6.982516668147967e-05, + "loss": 1.8011, + "step": 5795 + }, + { + "epoch": 0.609784324039979, + "grad_norm": 1.223147988319397, + "learning_rate": 6.979267949256558e-05, + "loss": 1.2879, + "step": 5796 + }, + { + "epoch": 0.6098895318253551, + "grad_norm": 1.0986629724502563, + "learning_rate": 6.976019581152818e-05, + "loss": 2.0554, + "step": 5797 + }, + { + "epoch": 0.6099947396107311, + "grad_norm": 1.3327361345291138, + "learning_rate": 6.972771564213963e-05, + "loss": 1.5186, + "step": 5798 + }, + { + "epoch": 0.6100999473961073, + "grad_norm": 1.3183317184448242, + "learning_rate": 6.969523898817176e-05, + "loss": 1.9726, + "step": 5799 + }, + { + "epoch": 0.6102051551814834, + "grad_norm": 1.7785512208938599, + "learning_rate": 6.966276585339604e-05, + "loss": 2.1342, + "step": 5800 + }, + { + "epoch": 0.6103103629668596, + "grad_norm": 1.0888605117797852, + "learning_rate": 6.96302962415834e-05, + "loss": 1.4301, + "step": 5801 + }, + { + "epoch": 0.6104155707522356, + "grad_norm": 1.8897411823272705, + "learning_rate": 6.959783015650446e-05, + "loss": 2.3269, + "step": 5802 + }, + { + "epoch": 0.6105207785376118, + "grad_norm": 1.019864559173584, + "learning_rate": 6.956536760192938e-05, + "loss": 1.5989, + "step": 5803 + }, + { + "epoch": 0.6106259863229879, + "grad_norm": 1.5215952396392822, + "learning_rate": 6.953290858162794e-05, + "loss": 2.14, + "step": 5804 + }, + { + "epoch": 0.6107311941083641, + "grad_norm": 1.6711841821670532, + "learning_rate": 6.95004530993695e-05, + "loss": 2.1474, + "step": 5805 + }, + { + "epoch": 0.6108364018937401, + "grad_norm": 1.50629460811615, + "learning_rate": 6.946800115892305e-05, + "loss": 2.319, + "step": 5806 + }, + { + "epoch": 0.6109416096791163, + "grad_norm": 1.2559032440185547, + "learning_rate": 6.943555276405705e-05, + "loss": 1.4975, + "step": 5807 + }, + { + "epoch": 0.6110468174644924, + "grad_norm": 0.9776951670646667, + "learning_rate": 6.940310791853968e-05, + "loss": 2.0983, + "step": 5808 + }, + { + "epoch": 0.6111520252498684, + "grad_norm": 1.521141767501831, + "learning_rate": 6.937066662613863e-05, + "loss": 1.5946, + "step": 5809 + }, + { + "epoch": 0.6112572330352446, + "grad_norm": 1.377365231513977, + "learning_rate": 6.933822889062118e-05, + "loss": 1.4309, + "step": 5810 + }, + { + "epoch": 0.6113624408206207, + "grad_norm": 1.1052137613296509, + "learning_rate": 6.930579471575427e-05, + "loss": 2.1511, + "step": 5811 + }, + { + "epoch": 0.6114676486059969, + "grad_norm": 1.3557707071304321, + "learning_rate": 6.927336410530432e-05, + "loss": 1.6768, + "step": 5812 + }, + { + "epoch": 0.6115728563913729, + "grad_norm": 2.0096118450164795, + "learning_rate": 6.924093706303743e-05, + "loss": 1.9034, + "step": 5813 + }, + { + "epoch": 0.6116780641767491, + "grad_norm": 2.108721971511841, + "learning_rate": 6.920851359271922e-05, + "loss": 1.6447, + "step": 5814 + }, + { + "epoch": 0.6117832719621252, + "grad_norm": 1.1239184141159058, + "learning_rate": 6.917609369811496e-05, + "loss": 1.9551, + "step": 5815 + }, + { + "epoch": 0.6118884797475013, + "grad_norm": 1.2308555841445923, + "learning_rate": 6.914367738298941e-05, + "loss": 2.1679, + "step": 5816 + }, + { + "epoch": 0.6119936875328774, + "grad_norm": 1.2711541652679443, + "learning_rate": 6.9111264651107e-05, + "loss": 1.9665, + "step": 5817 + }, + { + "epoch": 0.6120988953182536, + "grad_norm": 1.186928629875183, + "learning_rate": 6.907885550623172e-05, + "loss": 1.644, + "step": 5818 + }, + { + "epoch": 0.6122041031036297, + "grad_norm": 1.484810709953308, + "learning_rate": 6.904644995212713e-05, + "loss": 1.3018, + "step": 5819 + }, + { + "epoch": 0.6123093108890058, + "grad_norm": 1.1081911325454712, + "learning_rate": 6.901404799255638e-05, + "loss": 1.6068, + "step": 5820 + }, + { + "epoch": 0.6124145186743819, + "grad_norm": 1.3067530393600464, + "learning_rate": 6.898164963128221e-05, + "loss": 1.6391, + "step": 5821 + }, + { + "epoch": 0.612519726459758, + "grad_norm": 0.9942488670349121, + "learning_rate": 6.894925487206691e-05, + "loss": 1.9811, + "step": 5822 + }, + { + "epoch": 0.6126249342451341, + "grad_norm": 1.2306791543960571, + "learning_rate": 6.891686371867239e-05, + "loss": 1.9109, + "step": 5823 + }, + { + "epoch": 0.6127301420305102, + "grad_norm": 1.1397932767868042, + "learning_rate": 6.888447617486016e-05, + "loss": 1.5098, + "step": 5824 + }, + { + "epoch": 0.6128353498158864, + "grad_norm": 1.0217232704162598, + "learning_rate": 6.885209224439123e-05, + "loss": 1.4489, + "step": 5825 + }, + { + "epoch": 0.6129405576012625, + "grad_norm": 2.21636700630188, + "learning_rate": 6.881971193102625e-05, + "loss": 1.5184, + "step": 5826 + }, + { + "epoch": 0.6130457653866386, + "grad_norm": 1.0205903053283691, + "learning_rate": 6.878733523852549e-05, + "loss": 1.8485, + "step": 5827 + }, + { + "epoch": 0.6131509731720147, + "grad_norm": 1.5595358610153198, + "learning_rate": 6.875496217064867e-05, + "loss": 1.5117, + "step": 5828 + }, + { + "epoch": 0.6132561809573909, + "grad_norm": 1.1299363374710083, + "learning_rate": 6.872259273115525e-05, + "loss": 1.9574, + "step": 5829 + }, + { + "epoch": 0.6133613887427669, + "grad_norm": 1.109512448310852, + "learning_rate": 6.869022692380411e-05, + "loss": 2.0785, + "step": 5830 + }, + { + "epoch": 0.6134665965281431, + "grad_norm": 1.951183795928955, + "learning_rate": 6.865786475235381e-05, + "loss": 2.1442, + "step": 5831 + }, + { + "epoch": 0.6135718043135192, + "grad_norm": 1.3576364517211914, + "learning_rate": 6.862550622056249e-05, + "loss": 1.6519, + "step": 5832 + }, + { + "epoch": 0.6136770120988954, + "grad_norm": 1.2096575498580933, + "learning_rate": 6.85931513321878e-05, + "loss": 1.4433, + "step": 5833 + }, + { + "epoch": 0.6137822198842714, + "grad_norm": 2.158343553543091, + "learning_rate": 6.856080009098701e-05, + "loss": 1.4268, + "step": 5834 + }, + { + "epoch": 0.6138874276696475, + "grad_norm": 1.6646260023117065, + "learning_rate": 6.852845250071702e-05, + "loss": 1.9002, + "step": 5835 + }, + { + "epoch": 0.6139926354550237, + "grad_norm": 1.3010480403900146, + "learning_rate": 6.849610856513418e-05, + "loss": 1.8029, + "step": 5836 + }, + { + "epoch": 0.6140978432403998, + "grad_norm": 1.888411283493042, + "learning_rate": 6.846376828799451e-05, + "loss": 1.7934, + "step": 5837 + }, + { + "epoch": 0.6142030510257759, + "grad_norm": 1.5725120306015015, + "learning_rate": 6.843143167305361e-05, + "loss": 1.2792, + "step": 5838 + }, + { + "epoch": 0.614308258811152, + "grad_norm": 1.0562975406646729, + "learning_rate": 6.839909872406657e-05, + "loss": 1.8781, + "step": 5839 + }, + { + "epoch": 0.6144134665965282, + "grad_norm": 1.287243127822876, + "learning_rate": 6.836676944478812e-05, + "loss": 1.6942, + "step": 5840 + }, + { + "epoch": 0.6145186743819042, + "grad_norm": 1.386404275894165, + "learning_rate": 6.83344438389726e-05, + "loss": 1.4059, + "step": 5841 + }, + { + "epoch": 0.6146238821672804, + "grad_norm": 1.312274694442749, + "learning_rate": 6.830212191037386e-05, + "loss": 1.6319, + "step": 5842 + }, + { + "epoch": 0.6147290899526565, + "grad_norm": 1.408747673034668, + "learning_rate": 6.826980366274529e-05, + "loss": 1.0995, + "step": 5843 + }, + { + "epoch": 0.6148342977380327, + "grad_norm": 1.132150650024414, + "learning_rate": 6.823748909983994e-05, + "loss": 1.7678, + "step": 5844 + }, + { + "epoch": 0.6149395055234087, + "grad_norm": 1.0861999988555908, + "learning_rate": 6.820517822541041e-05, + "loss": 1.2332, + "step": 5845 + }, + { + "epoch": 0.6150447133087849, + "grad_norm": 1.2827601432800293, + "learning_rate": 6.81728710432088e-05, + "loss": 2.0137, + "step": 5846 + }, + { + "epoch": 0.615149921094161, + "grad_norm": 1.5106278657913208, + "learning_rate": 6.81405675569869e-05, + "loss": 1.7959, + "step": 5847 + }, + { + "epoch": 0.615255128879537, + "grad_norm": 1.3239543437957764, + "learning_rate": 6.810826777049597e-05, + "loss": 1.6723, + "step": 5848 + }, + { + "epoch": 0.6153603366649132, + "grad_norm": 1.4004448652267456, + "learning_rate": 6.807597168748689e-05, + "loss": 1.9686, + "step": 5849 + }, + { + "epoch": 0.6154655444502893, + "grad_norm": 0.9085174798965454, + "learning_rate": 6.804367931171013e-05, + "loss": 1.5498, + "step": 5850 + }, + { + "epoch": 0.6155707522356655, + "grad_norm": 1.3527324199676514, + "learning_rate": 6.801139064691562e-05, + "loss": 1.4251, + "step": 5851 + }, + { + "epoch": 0.6156759600210415, + "grad_norm": 1.9584040641784668, + "learning_rate": 6.797910569685297e-05, + "loss": 1.6893, + "step": 5852 + }, + { + "epoch": 0.6157811678064177, + "grad_norm": 1.093604326248169, + "learning_rate": 6.794682446527137e-05, + "loss": 1.369, + "step": 5853 + }, + { + "epoch": 0.6158863755917938, + "grad_norm": 1.4752413034439087, + "learning_rate": 6.791454695591945e-05, + "loss": 1.6492, + "step": 5854 + }, + { + "epoch": 0.6159915833771699, + "grad_norm": 0.9944615960121155, + "learning_rate": 6.788227317254556e-05, + "loss": 1.9943, + "step": 5855 + }, + { + "epoch": 0.616096791162546, + "grad_norm": 1.6758373975753784, + "learning_rate": 6.785000311889754e-05, + "loss": 1.9175, + "step": 5856 + }, + { + "epoch": 0.6162019989479222, + "grad_norm": 2.0718863010406494, + "learning_rate": 6.781773679872276e-05, + "loss": 1.7659, + "step": 5857 + }, + { + "epoch": 0.6163072067332983, + "grad_norm": 1.6973742246627808, + "learning_rate": 6.778547421576825e-05, + "loss": 1.8969, + "step": 5858 + }, + { + "epoch": 0.6164124145186743, + "grad_norm": 1.7583503723144531, + "learning_rate": 6.775321537378054e-05, + "loss": 1.8226, + "step": 5859 + }, + { + "epoch": 0.6165176223040505, + "grad_norm": 1.5600526332855225, + "learning_rate": 6.772096027650574e-05, + "loss": 1.6122, + "step": 5860 + }, + { + "epoch": 0.6166228300894266, + "grad_norm": 1.8818323612213135, + "learning_rate": 6.768870892768952e-05, + "loss": 1.4151, + "step": 5861 + }, + { + "epoch": 0.6167280378748027, + "grad_norm": 1.4384057521820068, + "learning_rate": 6.765646133107714e-05, + "loss": 2.2372, + "step": 5862 + }, + { + "epoch": 0.6168332456601788, + "grad_norm": 1.1038570404052734, + "learning_rate": 6.762421749041342e-05, + "loss": 2.0336, + "step": 5863 + }, + { + "epoch": 0.616938453445555, + "grad_norm": 1.2065256834030151, + "learning_rate": 6.759197740944267e-05, + "loss": 1.1006, + "step": 5864 + }, + { + "epoch": 0.6170436612309311, + "grad_norm": 2.1591498851776123, + "learning_rate": 6.75597410919089e-05, + "loss": 1.3848, + "step": 5865 + }, + { + "epoch": 0.6171488690163072, + "grad_norm": 1.131312608718872, + "learning_rate": 6.752750854155558e-05, + "loss": 1.5338, + "step": 5866 + }, + { + "epoch": 0.6172540768016833, + "grad_norm": 2.397049903869629, + "learning_rate": 6.749527976212573e-05, + "loss": 1.3482, + "step": 5867 + }, + { + "epoch": 0.6173592845870595, + "grad_norm": 1.6215959787368774, + "learning_rate": 6.746305475736202e-05, + "loss": 1.6116, + "step": 5868 + }, + { + "epoch": 0.6174644923724356, + "grad_norm": 1.2333688735961914, + "learning_rate": 6.743083353100664e-05, + "loss": 1.7272, + "step": 5869 + }, + { + "epoch": 0.6175697001578117, + "grad_norm": 2.3157310485839844, + "learning_rate": 6.739861608680129e-05, + "loss": 1.7432, + "step": 5870 + }, + { + "epoch": 0.6176749079431878, + "grad_norm": 1.532241940498352, + "learning_rate": 6.736640242848735e-05, + "loss": 1.1707, + "step": 5871 + }, + { + "epoch": 0.617780115728564, + "grad_norm": 1.3878227472305298, + "learning_rate": 6.733419255980559e-05, + "loss": 2.2097, + "step": 5872 + }, + { + "epoch": 0.61788532351394, + "grad_norm": 1.6810948848724365, + "learning_rate": 6.730198648449648e-05, + "loss": 1.7966, + "step": 5873 + }, + { + "epoch": 0.6179905312993161, + "grad_norm": 1.3429570198059082, + "learning_rate": 6.726978420630002e-05, + "loss": 1.7373, + "step": 5874 + }, + { + "epoch": 0.6180957390846923, + "grad_norm": 1.960030436515808, + "learning_rate": 6.723758572895573e-05, + "loss": 2.0929, + "step": 5875 + }, + { + "epoch": 0.6182009468700684, + "grad_norm": 2.1462693214416504, + "learning_rate": 6.720539105620272e-05, + "loss": 1.7261, + "step": 5876 + }, + { + "epoch": 0.6183061546554445, + "grad_norm": 1.1600507497787476, + "learning_rate": 6.717320019177969e-05, + "loss": 1.5239, + "step": 5877 + }, + { + "epoch": 0.6184113624408206, + "grad_norm": 1.3913511037826538, + "learning_rate": 6.71410131394248e-05, + "loss": 1.8831, + "step": 5878 + }, + { + "epoch": 0.6185165702261968, + "grad_norm": 1.2141426801681519, + "learning_rate": 6.710882990287585e-05, + "loss": 2.0194, + "step": 5879 + }, + { + "epoch": 0.6186217780115728, + "grad_norm": 1.3260263204574585, + "learning_rate": 6.70766504858702e-05, + "loss": 1.4936, + "step": 5880 + }, + { + "epoch": 0.618726985796949, + "grad_norm": 1.1059815883636475, + "learning_rate": 6.704447489214468e-05, + "loss": 2.0024, + "step": 5881 + }, + { + "epoch": 0.6188321935823251, + "grad_norm": 1.362945795059204, + "learning_rate": 6.701230312543578e-05, + "loss": 1.6022, + "step": 5882 + }, + { + "epoch": 0.6189374013677013, + "grad_norm": 1.131805658340454, + "learning_rate": 6.698013518947952e-05, + "loss": 2.0098, + "step": 5883 + }, + { + "epoch": 0.6190426091530773, + "grad_norm": 1.8835530281066895, + "learning_rate": 6.69479710880114e-05, + "loss": 1.5957, + "step": 5884 + }, + { + "epoch": 0.6191478169384534, + "grad_norm": 1.4850059747695923, + "learning_rate": 6.691581082476656e-05, + "loss": 1.617, + "step": 5885 + }, + { + "epoch": 0.6192530247238296, + "grad_norm": 1.5727633237838745, + "learning_rate": 6.688365440347965e-05, + "loss": 1.9237, + "step": 5886 + }, + { + "epoch": 0.6193582325092056, + "grad_norm": 1.0962116718292236, + "learning_rate": 6.685150182788495e-05, + "loss": 1.6268, + "step": 5887 + }, + { + "epoch": 0.6194634402945818, + "grad_norm": 2.0313258171081543, + "learning_rate": 6.681935310171616e-05, + "loss": 1.8827, + "step": 5888 + }, + { + "epoch": 0.6195686480799579, + "grad_norm": 1.543372631072998, + "learning_rate": 6.678720822870663e-05, + "loss": 1.8296, + "step": 5889 + }, + { + "epoch": 0.6196738558653341, + "grad_norm": 1.0897983312606812, + "learning_rate": 6.675506721258926e-05, + "loss": 1.9225, + "step": 5890 + }, + { + "epoch": 0.6197790636507101, + "grad_norm": 1.2541539669036865, + "learning_rate": 6.672293005709644e-05, + "loss": 1.6706, + "step": 5891 + }, + { + "epoch": 0.6198842714360863, + "grad_norm": 1.780050277709961, + "learning_rate": 6.669079676596019e-05, + "loss": 1.6821, + "step": 5892 + }, + { + "epoch": 0.6199894792214624, + "grad_norm": 1.5210779905319214, + "learning_rate": 6.665866734291205e-05, + "loss": 1.7069, + "step": 5893 + }, + { + "epoch": 0.6200946870068385, + "grad_norm": 1.2912250757217407, + "learning_rate": 6.662654179168306e-05, + "loss": 1.7669, + "step": 5894 + }, + { + "epoch": 0.6201998947922146, + "grad_norm": 1.736703634262085, + "learning_rate": 6.659442011600387e-05, + "loss": 1.457, + "step": 5895 + }, + { + "epoch": 0.6203051025775908, + "grad_norm": 1.1300891637802124, + "learning_rate": 6.656230231960466e-05, + "loss": 1.8526, + "step": 5896 + }, + { + "epoch": 0.6204103103629669, + "grad_norm": 1.8092447519302368, + "learning_rate": 6.653018840621516e-05, + "loss": 2.3459, + "step": 5897 + }, + { + "epoch": 0.6205155181483429, + "grad_norm": 1.1894848346710205, + "learning_rate": 6.649807837956472e-05, + "loss": 1.5572, + "step": 5898 + }, + { + "epoch": 0.6206207259337191, + "grad_norm": 1.3011435270309448, + "learning_rate": 6.646597224338207e-05, + "loss": 1.6887, + "step": 5899 + }, + { + "epoch": 0.6207259337190952, + "grad_norm": 2.1688480377197266, + "learning_rate": 6.643387000139565e-05, + "loss": 1.3811, + "step": 5900 + }, + { + "epoch": 0.6208311415044714, + "grad_norm": 1.4247039556503296, + "learning_rate": 6.640177165733339e-05, + "loss": 1.7982, + "step": 5901 + }, + { + "epoch": 0.6209363492898474, + "grad_norm": 0.9942802786827087, + "learning_rate": 6.636967721492274e-05, + "loss": 1.3522, + "step": 5902 + }, + { + "epoch": 0.6210415570752236, + "grad_norm": 1.3744851350784302, + "learning_rate": 6.633758667789074e-05, + "loss": 1.7667, + "step": 5903 + }, + { + "epoch": 0.6211467648605997, + "grad_norm": 1.2810285091400146, + "learning_rate": 6.630550004996396e-05, + "loss": 2.0339, + "step": 5904 + }, + { + "epoch": 0.6212519726459758, + "grad_norm": 1.1830532550811768, + "learning_rate": 6.627341733486847e-05, + "loss": 1.8695, + "step": 5905 + }, + { + "epoch": 0.6213571804313519, + "grad_norm": 1.8294991254806519, + "learning_rate": 6.624133853632998e-05, + "loss": 1.95, + "step": 5906 + }, + { + "epoch": 0.6214623882167281, + "grad_norm": 2.3274435997009277, + "learning_rate": 6.620926365807372e-05, + "loss": 1.9036, + "step": 5907 + }, + { + "epoch": 0.6215675960021042, + "grad_norm": 1.6169378757476807, + "learning_rate": 6.617719270382436e-05, + "loss": 2.0314, + "step": 5908 + }, + { + "epoch": 0.6216728037874802, + "grad_norm": 1.2674856185913086, + "learning_rate": 6.614512567730625e-05, + "loss": 1.1626, + "step": 5909 + }, + { + "epoch": 0.6217780115728564, + "grad_norm": 1.3875672817230225, + "learning_rate": 6.611306258224319e-05, + "loss": 1.5651, + "step": 5910 + }, + { + "epoch": 0.6218832193582325, + "grad_norm": 2.156421422958374, + "learning_rate": 6.608100342235861e-05, + "loss": 2.0087, + "step": 5911 + }, + { + "epoch": 0.6219884271436086, + "grad_norm": 1.7543668746948242, + "learning_rate": 6.604894820137541e-05, + "loss": 1.9096, + "step": 5912 + }, + { + "epoch": 0.6220936349289847, + "grad_norm": 1.749904990196228, + "learning_rate": 6.601689692301604e-05, + "loss": 1.5697, + "step": 5913 + }, + { + "epoch": 0.6221988427143609, + "grad_norm": 1.3653181791305542, + "learning_rate": 6.598484959100257e-05, + "loss": 1.8171, + "step": 5914 + }, + { + "epoch": 0.622304050499737, + "grad_norm": 1.5228540897369385, + "learning_rate": 6.59528062090565e-05, + "loss": 1.6228, + "step": 5915 + }, + { + "epoch": 0.6224092582851131, + "grad_norm": 1.0161175727844238, + "learning_rate": 6.592076678089889e-05, + "loss": 1.3673, + "step": 5916 + }, + { + "epoch": 0.6225144660704892, + "grad_norm": 1.3234752416610718, + "learning_rate": 6.588873131025042e-05, + "loss": 1.1581, + "step": 5917 + }, + { + "epoch": 0.6226196738558654, + "grad_norm": 1.3997480869293213, + "learning_rate": 6.585669980083128e-05, + "loss": 1.6416, + "step": 5918 + }, + { + "epoch": 0.6227248816412414, + "grad_norm": 2.785766124725342, + "learning_rate": 6.582467225636116e-05, + "loss": 1.6409, + "step": 5919 + }, + { + "epoch": 0.6228300894266176, + "grad_norm": 1.0819607973098755, + "learning_rate": 6.579264868055928e-05, + "loss": 1.4652, + "step": 5920 + }, + { + "epoch": 0.6229352972119937, + "grad_norm": 1.7267218828201294, + "learning_rate": 6.576062907714448e-05, + "loss": 2.1395, + "step": 5921 + }, + { + "epoch": 0.6230405049973698, + "grad_norm": 1.7600226402282715, + "learning_rate": 6.572861344983511e-05, + "loss": 2.0258, + "step": 5922 + }, + { + "epoch": 0.6231457127827459, + "grad_norm": 1.3963185548782349, + "learning_rate": 6.569660180234898e-05, + "loss": 1.7034, + "step": 5923 + }, + { + "epoch": 0.623250920568122, + "grad_norm": 1.8364591598510742, + "learning_rate": 6.566459413840351e-05, + "loss": 1.9706, + "step": 5924 + }, + { + "epoch": 0.6233561283534982, + "grad_norm": 1.4244076013565063, + "learning_rate": 6.56325904617157e-05, + "loss": 2.1374, + "step": 5925 + }, + { + "epoch": 0.6234613361388742, + "grad_norm": 1.3648873567581177, + "learning_rate": 6.560059077600195e-05, + "loss": 1.6418, + "step": 5926 + }, + { + "epoch": 0.6235665439242504, + "grad_norm": 1.695998191833496, + "learning_rate": 6.556859508497834e-05, + "loss": 1.712, + "step": 5927 + }, + { + "epoch": 0.6236717517096265, + "grad_norm": 2.2389862537384033, + "learning_rate": 6.553660339236041e-05, + "loss": 1.6126, + "step": 5928 + }, + { + "epoch": 0.6237769594950027, + "grad_norm": 1.46173095703125, + "learning_rate": 6.550461570186322e-05, + "loss": 2.1347, + "step": 5929 + }, + { + "epoch": 0.6238821672803787, + "grad_norm": 1.7502105236053467, + "learning_rate": 6.547263201720143e-05, + "loss": 1.4319, + "step": 5930 + }, + { + "epoch": 0.6239873750657549, + "grad_norm": 1.4961512088775635, + "learning_rate": 6.54406523420892e-05, + "loss": 2.2034, + "step": 5931 + }, + { + "epoch": 0.624092582851131, + "grad_norm": 1.5784069299697876, + "learning_rate": 6.54086766802402e-05, + "loss": 2.0528, + "step": 5932 + }, + { + "epoch": 0.6241977906365072, + "grad_norm": 1.485357642173767, + "learning_rate": 6.537670503536766e-05, + "loss": 2.0438, + "step": 5933 + }, + { + "epoch": 0.6243029984218832, + "grad_norm": 1.615233063697815, + "learning_rate": 6.534473741118434e-05, + "loss": 1.5522, + "step": 5934 + }, + { + "epoch": 0.6244082062072593, + "grad_norm": 1.7099108695983887, + "learning_rate": 6.53127738114026e-05, + "loss": 1.8988, + "step": 5935 + }, + { + "epoch": 0.6245134139926355, + "grad_norm": 1.4433890581130981, + "learning_rate": 6.528081423973422e-05, + "loss": 2.0022, + "step": 5936 + }, + { + "epoch": 0.6246186217780115, + "grad_norm": 1.8532782793045044, + "learning_rate": 6.52488586998905e-05, + "loss": 1.748, + "step": 5937 + }, + { + "epoch": 0.6247238295633877, + "grad_norm": 1.721971869468689, + "learning_rate": 6.52169071955824e-05, + "loss": 1.4468, + "step": 5938 + }, + { + "epoch": 0.6248290373487638, + "grad_norm": 1.340277910232544, + "learning_rate": 6.518495973052036e-05, + "loss": 1.4, + "step": 5939 + }, + { + "epoch": 0.62493424513414, + "grad_norm": 1.9042940139770508, + "learning_rate": 6.515301630841426e-05, + "loss": 1.6993, + "step": 5940 + }, + { + "epoch": 0.625039452919516, + "grad_norm": 2.6405863761901855, + "learning_rate": 6.512107693297365e-05, + "loss": 1.4162, + "step": 5941 + }, + { + "epoch": 0.6251446607048922, + "grad_norm": 1.6686121225357056, + "learning_rate": 6.508914160790752e-05, + "loss": 1.3212, + "step": 5942 + }, + { + "epoch": 0.6252498684902683, + "grad_norm": 1.4782112836837769, + "learning_rate": 6.505721033692443e-05, + "loss": 1.7841, + "step": 5943 + }, + { + "epoch": 0.6253550762756444, + "grad_norm": 1.8376623392105103, + "learning_rate": 6.502528312373241e-05, + "loss": 1.5778, + "step": 5944 + }, + { + "epoch": 0.6254602840610205, + "grad_norm": 1.424533724784851, + "learning_rate": 6.49933599720391e-05, + "loss": 1.4852, + "step": 5945 + }, + { + "epoch": 0.6255654918463966, + "grad_norm": 1.5104315280914307, + "learning_rate": 6.496144088555162e-05, + "loss": 1.6104, + "step": 5946 + }, + { + "epoch": 0.6256706996317728, + "grad_norm": 1.5233991146087646, + "learning_rate": 6.492952586797665e-05, + "loss": 1.2861, + "step": 5947 + }, + { + "epoch": 0.6257759074171488, + "grad_norm": 1.3382761478424072, + "learning_rate": 6.489761492302034e-05, + "loss": 2.0656, + "step": 5948 + }, + { + "epoch": 0.625881115202525, + "grad_norm": 1.0290378332138062, + "learning_rate": 6.486570805438843e-05, + "loss": 1.7404, + "step": 5949 + }, + { + "epoch": 0.6259863229879011, + "grad_norm": 1.8992608785629272, + "learning_rate": 6.483380526578615e-05, + "loss": 1.3986, + "step": 5950 + }, + { + "epoch": 0.6260915307732772, + "grad_norm": 1.5879778861999512, + "learning_rate": 6.480190656091825e-05, + "loss": 1.7387, + "step": 5951 + }, + { + "epoch": 0.6261967385586533, + "grad_norm": 1.1449265480041504, + "learning_rate": 6.477001194348906e-05, + "loss": 1.3209, + "step": 5952 + }, + { + "epoch": 0.6263019463440295, + "grad_norm": 1.9107697010040283, + "learning_rate": 6.473812141720234e-05, + "loss": 1.6479, + "step": 5953 + }, + { + "epoch": 0.6264071541294056, + "grad_norm": 2.4911386966705322, + "learning_rate": 6.47062349857615e-05, + "loss": 1.9478, + "step": 5954 + }, + { + "epoch": 0.6265123619147817, + "grad_norm": 1.8238067626953125, + "learning_rate": 6.467435265286935e-05, + "loss": 1.6787, + "step": 5955 + }, + { + "epoch": 0.6266175697001578, + "grad_norm": 1.4120367765426636, + "learning_rate": 6.46424744222283e-05, + "loss": 1.6102, + "step": 5956 + }, + { + "epoch": 0.626722777485534, + "grad_norm": 0.9109852910041809, + "learning_rate": 6.461060029754031e-05, + "loss": 1.982, + "step": 5957 + }, + { + "epoch": 0.62682798527091, + "grad_norm": 1.119532823562622, + "learning_rate": 6.457873028250674e-05, + "loss": 1.7647, + "step": 5958 + }, + { + "epoch": 0.6269331930562861, + "grad_norm": 1.667096734046936, + "learning_rate": 6.454686438082858e-05, + "loss": 1.2471, + "step": 5959 + }, + { + "epoch": 0.6270384008416623, + "grad_norm": 1.9557980298995972, + "learning_rate": 6.45150025962063e-05, + "loss": 1.553, + "step": 5960 + }, + { + "epoch": 0.6271436086270384, + "grad_norm": 1.4863619804382324, + "learning_rate": 6.448314493233995e-05, + "loss": 1.706, + "step": 5961 + }, + { + "epoch": 0.6272488164124145, + "grad_norm": 1.5268634557724, + "learning_rate": 6.445129139292899e-05, + "loss": 1.7509, + "step": 5962 + }, + { + "epoch": 0.6273540241977906, + "grad_norm": 1.5882983207702637, + "learning_rate": 6.441944198167253e-05, + "loss": 1.9265, + "step": 5963 + }, + { + "epoch": 0.6274592319831668, + "grad_norm": 1.2085886001586914, + "learning_rate": 6.43875967022691e-05, + "loss": 1.6107, + "step": 5964 + }, + { + "epoch": 0.6275644397685429, + "grad_norm": 2.4494829177856445, + "learning_rate": 6.435575555841679e-05, + "loss": 1.9172, + "step": 5965 + }, + { + "epoch": 0.627669647553919, + "grad_norm": 0.8796269297599792, + "learning_rate": 6.432391855381321e-05, + "loss": 0.9647, + "step": 5966 + }, + { + "epoch": 0.6277748553392951, + "grad_norm": 1.2112059593200684, + "learning_rate": 6.42920856921555e-05, + "loss": 1.734, + "step": 5967 + }, + { + "epoch": 0.6278800631246713, + "grad_norm": 1.265119194984436, + "learning_rate": 6.426025697714029e-05, + "loss": 1.9329, + "step": 5968 + }, + { + "epoch": 0.6279852709100473, + "grad_norm": 1.5609668493270874, + "learning_rate": 6.422843241246374e-05, + "loss": 1.5995, + "step": 5969 + }, + { + "epoch": 0.6280904786954234, + "grad_norm": 1.1384879350662231, + "learning_rate": 6.419661200182158e-05, + "loss": 2.1466, + "step": 5970 + }, + { + "epoch": 0.6281956864807996, + "grad_norm": 1.4794455766677856, + "learning_rate": 6.416479574890894e-05, + "loss": 2.2411, + "step": 5971 + }, + { + "epoch": 0.6283008942661757, + "grad_norm": 1.0125900506973267, + "learning_rate": 6.413298365742055e-05, + "loss": 1.6002, + "step": 5972 + }, + { + "epoch": 0.6284061020515518, + "grad_norm": 1.3207452297210693, + "learning_rate": 6.41011757310507e-05, + "loss": 1.6828, + "step": 5973 + }, + { + "epoch": 0.6285113098369279, + "grad_norm": 1.2414960861206055, + "learning_rate": 6.406937197349308e-05, + "loss": 1.6645, + "step": 5974 + }, + { + "epoch": 0.6286165176223041, + "grad_norm": 1.6514127254486084, + "learning_rate": 6.403757238844096e-05, + "loss": 1.9715, + "step": 5975 + }, + { + "epoch": 0.6287217254076801, + "grad_norm": 1.0575364828109741, + "learning_rate": 6.400577697958718e-05, + "loss": 2.2627, + "step": 5976 + }, + { + "epoch": 0.6288269331930563, + "grad_norm": 1.608067512512207, + "learning_rate": 6.397398575062396e-05, + "loss": 1.871, + "step": 5977 + }, + { + "epoch": 0.6289321409784324, + "grad_norm": 1.4367784261703491, + "learning_rate": 6.394219870524314e-05, + "loss": 1.8095, + "step": 5978 + }, + { + "epoch": 0.6290373487638086, + "grad_norm": 1.3168582916259766, + "learning_rate": 6.391041584713608e-05, + "loss": 1.6412, + "step": 5979 + }, + { + "epoch": 0.6291425565491846, + "grad_norm": 1.5024646520614624, + "learning_rate": 6.387863717999357e-05, + "loss": 2.0073, + "step": 5980 + }, + { + "epoch": 0.6292477643345608, + "grad_norm": 1.1878137588500977, + "learning_rate": 6.384686270750599e-05, + "loss": 1.0912, + "step": 5981 + }, + { + "epoch": 0.6293529721199369, + "grad_norm": 1.134263515472412, + "learning_rate": 6.381509243336318e-05, + "loss": 1.9806, + "step": 5982 + }, + { + "epoch": 0.6294581799053129, + "grad_norm": 1.093222737312317, + "learning_rate": 6.378332636125453e-05, + "loss": 1.8243, + "step": 5983 + }, + { + "epoch": 0.6295633876906891, + "grad_norm": 1.2039942741394043, + "learning_rate": 6.375156449486895e-05, + "loss": 1.6477, + "step": 5984 + }, + { + "epoch": 0.6296685954760652, + "grad_norm": 1.28606116771698, + "learning_rate": 6.371980683789479e-05, + "loss": 1.6914, + "step": 5985 + }, + { + "epoch": 0.6297738032614414, + "grad_norm": 1.0567982196807861, + "learning_rate": 6.368805339402e-05, + "loss": 1.9269, + "step": 5986 + }, + { + "epoch": 0.6298790110468174, + "grad_norm": 1.3032525777816772, + "learning_rate": 6.365630416693203e-05, + "loss": 1.9851, + "step": 5987 + }, + { + "epoch": 0.6299842188321936, + "grad_norm": 2.6468424797058105, + "learning_rate": 6.362455916031774e-05, + "loss": 1.8291, + "step": 5988 + }, + { + "epoch": 0.6300894266175697, + "grad_norm": 1.2577786445617676, + "learning_rate": 6.359281837786363e-05, + "loss": 1.6674, + "step": 5989 + }, + { + "epoch": 0.6301946344029458, + "grad_norm": 1.3022758960723877, + "learning_rate": 6.356108182325562e-05, + "loss": 1.4832, + "step": 5990 + }, + { + "epoch": 0.6302998421883219, + "grad_norm": 1.515007495880127, + "learning_rate": 6.352934950017921e-05, + "loss": 2.0965, + "step": 5991 + }, + { + "epoch": 0.6304050499736981, + "grad_norm": 1.462588906288147, + "learning_rate": 6.349762141231934e-05, + "loss": 1.5093, + "step": 5992 + }, + { + "epoch": 0.6305102577590742, + "grad_norm": 1.503005862236023, + "learning_rate": 6.34658975633605e-05, + "loss": 1.4638, + "step": 5993 + }, + { + "epoch": 0.6306154655444502, + "grad_norm": 1.4400442838668823, + "learning_rate": 6.343417795698667e-05, + "loss": 1.2181, + "step": 5994 + }, + { + "epoch": 0.6307206733298264, + "grad_norm": 1.6780505180358887, + "learning_rate": 6.340246259688133e-05, + "loss": 1.6997, + "step": 5995 + }, + { + "epoch": 0.6308258811152025, + "grad_norm": 1.3209447860717773, + "learning_rate": 6.337075148672751e-05, + "loss": 1.5113, + "step": 5996 + }, + { + "epoch": 0.6309310889005787, + "grad_norm": 1.7085509300231934, + "learning_rate": 6.333904463020772e-05, + "loss": 1.8627, + "step": 5997 + }, + { + "epoch": 0.6310362966859547, + "grad_norm": 1.7101259231567383, + "learning_rate": 6.330734203100394e-05, + "loss": 2.1645, + "step": 5998 + }, + { + "epoch": 0.6311415044713309, + "grad_norm": 1.1601276397705078, + "learning_rate": 6.327564369279768e-05, + "loss": 2.0211, + "step": 5999 + }, + { + "epoch": 0.631246712256707, + "grad_norm": 1.344951868057251, + "learning_rate": 6.324394961927005e-05, + "loss": 1.6263, + "step": 6000 + }, + { + "epoch": 0.6313519200420831, + "grad_norm": 1.1966255903244019, + "learning_rate": 6.321225981410147e-05, + "loss": 1.8129, + "step": 6001 + }, + { + "epoch": 0.6314571278274592, + "grad_norm": 1.8970527648925781, + "learning_rate": 6.318057428097203e-05, + "loss": 1.8969, + "step": 6002 + }, + { + "epoch": 0.6315623356128354, + "grad_norm": 1.4465181827545166, + "learning_rate": 6.314889302356125e-05, + "loss": 2.0013, + "step": 6003 + }, + { + "epoch": 0.6316675433982115, + "grad_norm": 1.2381423711776733, + "learning_rate": 6.311721604554816e-05, + "loss": 1.6398, + "step": 6004 + }, + { + "epoch": 0.6317727511835876, + "grad_norm": 1.656591773033142, + "learning_rate": 6.308554335061135e-05, + "loss": 1.757, + "step": 6005 + }, + { + "epoch": 0.6318779589689637, + "grad_norm": 1.8384569883346558, + "learning_rate": 6.305387494242882e-05, + "loss": 1.7847, + "step": 6006 + }, + { + "epoch": 0.6319831667543399, + "grad_norm": 1.5828437805175781, + "learning_rate": 6.302221082467812e-05, + "loss": 1.5665, + "step": 6007 + }, + { + "epoch": 0.6320883745397159, + "grad_norm": 1.4291951656341553, + "learning_rate": 6.299055100103632e-05, + "loss": 1.9382, + "step": 6008 + }, + { + "epoch": 0.632193582325092, + "grad_norm": 1.4509108066558838, + "learning_rate": 6.295889547517997e-05, + "loss": 2.2804, + "step": 6009 + }, + { + "epoch": 0.6322987901104682, + "grad_norm": 1.5382047891616821, + "learning_rate": 6.29272442507851e-05, + "loss": 0.9914, + "step": 6010 + }, + { + "epoch": 0.6324039978958443, + "grad_norm": 0.95433109998703, + "learning_rate": 6.289559733152727e-05, + "loss": 1.648, + "step": 6011 + }, + { + "epoch": 0.6325092056812204, + "grad_norm": 0.9161847829818726, + "learning_rate": 6.286395472108158e-05, + "loss": 1.5227, + "step": 6012 + }, + { + "epoch": 0.6326144134665965, + "grad_norm": 2.2820706367492676, + "learning_rate": 6.283231642312251e-05, + "loss": 1.4385, + "step": 6013 + }, + { + "epoch": 0.6327196212519727, + "grad_norm": 1.1651874780654907, + "learning_rate": 6.280068244132415e-05, + "loss": 1.3021, + "step": 6014 + }, + { + "epoch": 0.6328248290373487, + "grad_norm": 1.7624272108078003, + "learning_rate": 6.276905277936005e-05, + "loss": 1.7141, + "step": 6015 + }, + { + "epoch": 0.6329300368227249, + "grad_norm": 1.4675081968307495, + "learning_rate": 6.273742744090325e-05, + "loss": 1.8335, + "step": 6016 + }, + { + "epoch": 0.633035244608101, + "grad_norm": 1.489524245262146, + "learning_rate": 6.270580642962629e-05, + "loss": 1.8555, + "step": 6017 + }, + { + "epoch": 0.6331404523934772, + "grad_norm": 1.2529215812683105, + "learning_rate": 6.267418974920125e-05, + "loss": 1.7023, + "step": 6018 + }, + { + "epoch": 0.6332456601788532, + "grad_norm": 1.7118120193481445, + "learning_rate": 6.26425774032996e-05, + "loss": 1.7771, + "step": 6019 + }, + { + "epoch": 0.6333508679642293, + "grad_norm": 1.1630779504776, + "learning_rate": 6.261096939559243e-05, + "loss": 1.6504, + "step": 6020 + }, + { + "epoch": 0.6334560757496055, + "grad_norm": 1.5538249015808105, + "learning_rate": 6.257936572975029e-05, + "loss": 1.967, + "step": 6021 + }, + { + "epoch": 0.6335612835349815, + "grad_norm": 1.3640360832214355, + "learning_rate": 6.25477664094432e-05, + "loss": 1.583, + "step": 6022 + }, + { + "epoch": 0.6336664913203577, + "grad_norm": 1.6162277460098267, + "learning_rate": 6.251617143834065e-05, + "loss": 1.8734, + "step": 6023 + }, + { + "epoch": 0.6337716991057338, + "grad_norm": 1.6816718578338623, + "learning_rate": 6.248458082011167e-05, + "loss": 1.6501, + "step": 6024 + }, + { + "epoch": 0.63387690689111, + "grad_norm": 1.3452551364898682, + "learning_rate": 6.245299455842477e-05, + "loss": 1.609, + "step": 6025 + }, + { + "epoch": 0.633982114676486, + "grad_norm": 1.8827195167541504, + "learning_rate": 6.2421412656948e-05, + "loss": 1.2681, + "step": 6026 + }, + { + "epoch": 0.6340873224618622, + "grad_norm": 1.4761601686477661, + "learning_rate": 6.238983511934883e-05, + "loss": 1.4392, + "step": 6027 + }, + { + "epoch": 0.6341925302472383, + "grad_norm": 1.6614230871200562, + "learning_rate": 6.235826194929423e-05, + "loss": 1.8507, + "step": 6028 + }, + { + "epoch": 0.6342977380326145, + "grad_norm": 1.2700610160827637, + "learning_rate": 6.232669315045076e-05, + "loss": 1.9591, + "step": 6029 + }, + { + "epoch": 0.6344029458179905, + "grad_norm": 1.129776120185852, + "learning_rate": 6.229512872648435e-05, + "loss": 1.9298, + "step": 6030 + }, + { + "epoch": 0.6345081536033667, + "grad_norm": 1.5436091423034668, + "learning_rate": 6.226356868106046e-05, + "loss": 1.6272, + "step": 6031 + }, + { + "epoch": 0.6346133613887428, + "grad_norm": 1.54899001121521, + "learning_rate": 6.22320130178441e-05, + "loss": 1.6734, + "step": 6032 + }, + { + "epoch": 0.6347185691741188, + "grad_norm": 2.6774120330810547, + "learning_rate": 6.220046174049968e-05, + "loss": 1.5251, + "step": 6033 + }, + { + "epoch": 0.634823776959495, + "grad_norm": 1.2315189838409424, + "learning_rate": 6.216891485269118e-05, + "loss": 1.4497, + "step": 6034 + }, + { + "epoch": 0.6349289847448711, + "grad_norm": 0.9964236617088318, + "learning_rate": 6.213737235808201e-05, + "loss": 1.695, + "step": 6035 + }, + { + "epoch": 0.6350341925302473, + "grad_norm": 1.0956590175628662, + "learning_rate": 6.210583426033513e-05, + "loss": 2.1008, + "step": 6036 + }, + { + "epoch": 0.6351394003156233, + "grad_norm": 1.8140305280685425, + "learning_rate": 6.207430056311292e-05, + "loss": 1.6233, + "step": 6037 + }, + { + "epoch": 0.6352446081009995, + "grad_norm": 2.13716721534729, + "learning_rate": 6.204277127007729e-05, + "loss": 2.0819, + "step": 6038 + }, + { + "epoch": 0.6353498158863756, + "grad_norm": 1.4235343933105469, + "learning_rate": 6.201124638488968e-05, + "loss": 1.3355, + "step": 6039 + }, + { + "epoch": 0.6354550236717517, + "grad_norm": 1.6728320121765137, + "learning_rate": 6.19797259112109e-05, + "loss": 1.7395, + "step": 6040 + }, + { + "epoch": 0.6355602314571278, + "grad_norm": 1.455208659172058, + "learning_rate": 6.194820985270136e-05, + "loss": 1.9213, + "step": 6041 + }, + { + "epoch": 0.635665439242504, + "grad_norm": 1.577768325805664, + "learning_rate": 6.191669821302091e-05, + "loss": 1.9931, + "step": 6042 + }, + { + "epoch": 0.6357706470278801, + "grad_norm": 1.7181400060653687, + "learning_rate": 6.188519099582893e-05, + "loss": 1.69, + "step": 6043 + }, + { + "epoch": 0.6358758548132561, + "grad_norm": 1.1263937950134277, + "learning_rate": 6.185368820478417e-05, + "loss": 1.3001, + "step": 6044 + }, + { + "epoch": 0.6359810625986323, + "grad_norm": 1.8166402578353882, + "learning_rate": 6.182218984354497e-05, + "loss": 1.9147, + "step": 6045 + }, + { + "epoch": 0.6360862703840084, + "grad_norm": 1.7598438262939453, + "learning_rate": 6.179069591576916e-05, + "loss": 1.9621, + "step": 6046 + }, + { + "epoch": 0.6361914781693845, + "grad_norm": 1.1624400615692139, + "learning_rate": 6.175920642511404e-05, + "loss": 2.1653, + "step": 6047 + }, + { + "epoch": 0.6362966859547606, + "grad_norm": 2.103053092956543, + "learning_rate": 6.172772137523632e-05, + "loss": 1.9937, + "step": 6048 + }, + { + "epoch": 0.6364018937401368, + "grad_norm": 0.8931379914283752, + "learning_rate": 6.169624076979229e-05, + "loss": 1.8209, + "step": 6049 + }, + { + "epoch": 0.6365071015255129, + "grad_norm": 1.4275034666061401, + "learning_rate": 6.166476461243771e-05, + "loss": 1.3759, + "step": 6050 + }, + { + "epoch": 0.636612309310889, + "grad_norm": 1.7524127960205078, + "learning_rate": 6.163329290682775e-05, + "loss": 1.8575, + "step": 6051 + }, + { + "epoch": 0.6367175170962651, + "grad_norm": 1.2306782007217407, + "learning_rate": 6.160182565661717e-05, + "loss": 1.3555, + "step": 6052 + }, + { + "epoch": 0.6368227248816413, + "grad_norm": 1.199044942855835, + "learning_rate": 6.157036286546014e-05, + "loss": 1.6082, + "step": 6053 + }, + { + "epoch": 0.6369279326670173, + "grad_norm": 0.8733108043670654, + "learning_rate": 6.153890453701031e-05, + "loss": 1.8242, + "step": 6054 + }, + { + "epoch": 0.6370331404523935, + "grad_norm": 1.531132698059082, + "learning_rate": 6.150745067492085e-05, + "loss": 1.5927, + "step": 6055 + }, + { + "epoch": 0.6371383482377696, + "grad_norm": 2.1788547039031982, + "learning_rate": 6.14760012828444e-05, + "loss": 2.3346, + "step": 6056 + }, + { + "epoch": 0.6372435560231458, + "grad_norm": 1.90433931350708, + "learning_rate": 6.144455636443304e-05, + "loss": 1.5207, + "step": 6057 + }, + { + "epoch": 0.6373487638085218, + "grad_norm": 2.0594825744628906, + "learning_rate": 6.141311592333841e-05, + "loss": 1.23, + "step": 6058 + }, + { + "epoch": 0.6374539715938979, + "grad_norm": 1.3014582395553589, + "learning_rate": 6.138167996321155e-05, + "loss": 1.8238, + "step": 6059 + }, + { + "epoch": 0.6375591793792741, + "grad_norm": 1.347284197807312, + "learning_rate": 6.135024848770306e-05, + "loss": 1.1295, + "step": 6060 + }, + { + "epoch": 0.6376643871646502, + "grad_norm": 1.683262825012207, + "learning_rate": 6.131882150046291e-05, + "loss": 1.449, + "step": 6061 + }, + { + "epoch": 0.6377695949500263, + "grad_norm": 1.515728235244751, + "learning_rate": 6.128739900514064e-05, + "loss": 1.3518, + "step": 6062 + }, + { + "epoch": 0.6378748027354024, + "grad_norm": 1.24699068069458, + "learning_rate": 6.125598100538528e-05, + "loss": 1.9356, + "step": 6063 + }, + { + "epoch": 0.6379800105207786, + "grad_norm": 2.0184340476989746, + "learning_rate": 6.122456750484528e-05, + "loss": 1.4496, + "step": 6064 + }, + { + "epoch": 0.6380852183061546, + "grad_norm": 1.378085970878601, + "learning_rate": 6.119315850716853e-05, + "loss": 1.6465, + "step": 6065 + }, + { + "epoch": 0.6381904260915308, + "grad_norm": 1.6989691257476807, + "learning_rate": 6.116175401600249e-05, + "loss": 1.8344, + "step": 6066 + }, + { + "epoch": 0.6382956338769069, + "grad_norm": 1.1261394023895264, + "learning_rate": 6.113035403499408e-05, + "loss": 1.4539, + "step": 6067 + }, + { + "epoch": 0.6384008416622831, + "grad_norm": 1.6350476741790771, + "learning_rate": 6.109895856778967e-05, + "loss": 1.5789, + "step": 6068 + }, + { + "epoch": 0.6385060494476591, + "grad_norm": 1.9975515604019165, + "learning_rate": 6.106756761803507e-05, + "loss": 1.6606, + "step": 6069 + }, + { + "epoch": 0.6386112572330352, + "grad_norm": 1.355108618736267, + "learning_rate": 6.103618118937567e-05, + "loss": 1.4639, + "step": 6070 + }, + { + "epoch": 0.6387164650184114, + "grad_norm": 1.5188654661178589, + "learning_rate": 6.1004799285456235e-05, + "loss": 2.2806, + "step": 6071 + }, + { + "epoch": 0.6388216728037874, + "grad_norm": 1.9493741989135742, + "learning_rate": 6.097342190992105e-05, + "loss": 2.7309, + "step": 6072 + }, + { + "epoch": 0.6389268805891636, + "grad_norm": 1.889501690864563, + "learning_rate": 6.0942049066413855e-05, + "loss": 2.1665, + "step": 6073 + }, + { + "epoch": 0.6390320883745397, + "grad_norm": 1.2038894891738892, + "learning_rate": 6.091068075857791e-05, + "loss": 1.5338, + "step": 6074 + }, + { + "epoch": 0.6391372961599159, + "grad_norm": 1.072657823562622, + "learning_rate": 6.087931699005588e-05, + "loss": 1.9299, + "step": 6075 + }, + { + "epoch": 0.6392425039452919, + "grad_norm": 1.8095741271972656, + "learning_rate": 6.084795776448995e-05, + "loss": 1.8311, + "step": 6076 + }, + { + "epoch": 0.6393477117306681, + "grad_norm": 1.5067007541656494, + "learning_rate": 6.0816603085521764e-05, + "loss": 1.3399, + "step": 6077 + }, + { + "epoch": 0.6394529195160442, + "grad_norm": 1.420461654663086, + "learning_rate": 6.078525295679243e-05, + "loss": 2.0262, + "step": 6078 + }, + { + "epoch": 0.6395581273014203, + "grad_norm": 1.3096047639846802, + "learning_rate": 6.075390738194252e-05, + "loss": 1.7605, + "step": 6079 + }, + { + "epoch": 0.6396633350867964, + "grad_norm": 1.2972394227981567, + "learning_rate": 6.072256636461214e-05, + "loss": 1.6696, + "step": 6080 + }, + { + "epoch": 0.6397685428721726, + "grad_norm": 1.2217929363250732, + "learning_rate": 6.0691229908440775e-05, + "loss": 2.151, + "step": 6081 + }, + { + "epoch": 0.6398737506575487, + "grad_norm": 1.5966821908950806, + "learning_rate": 6.065989801706744e-05, + "loss": 1.2176, + "step": 6082 + }, + { + "epoch": 0.6399789584429247, + "grad_norm": 1.4299871921539307, + "learning_rate": 6.0628570694130594e-05, + "loss": 1.6411, + "step": 6083 + }, + { + "epoch": 0.6400841662283009, + "grad_norm": 1.8349710702896118, + "learning_rate": 6.059724794326822e-05, + "loss": 1.697, + "step": 6084 + }, + { + "epoch": 0.640189374013677, + "grad_norm": 0.9039126038551331, + "learning_rate": 6.056592976811766e-05, + "loss": 1.509, + "step": 6085 + }, + { + "epoch": 0.6402945817990531, + "grad_norm": 1.0073765516281128, + "learning_rate": 6.053461617231586e-05, + "loss": 1.7619, + "step": 6086 + }, + { + "epoch": 0.6403997895844292, + "grad_norm": 1.246187448501587, + "learning_rate": 6.05033071594991e-05, + "loss": 2.2623, + "step": 6087 + }, + { + "epoch": 0.6405049973698054, + "grad_norm": 1.0114855766296387, + "learning_rate": 6.047200273330325e-05, + "loss": 1.8475, + "step": 6088 + }, + { + "epoch": 0.6406102051551815, + "grad_norm": 1.7710341215133667, + "learning_rate": 6.044070289736352e-05, + "loss": 1.2952, + "step": 6089 + }, + { + "epoch": 0.6407154129405576, + "grad_norm": 1.3931844234466553, + "learning_rate": 6.04094076553147e-05, + "loss": 1.7707, + "step": 6090 + }, + { + "epoch": 0.6408206207259337, + "grad_norm": 1.1191129684448242, + "learning_rate": 6.0378117010791e-05, + "loss": 1.9957, + "step": 6091 + }, + { + "epoch": 0.6409258285113099, + "grad_norm": 1.1291834115982056, + "learning_rate": 6.034683096742613e-05, + "loss": 1.8528, + "step": 6092 + }, + { + "epoch": 0.641031036296686, + "grad_norm": 1.390347957611084, + "learning_rate": 6.031554952885317e-05, + "loss": 1.6936, + "step": 6093 + }, + { + "epoch": 0.641136244082062, + "grad_norm": 1.2321089506149292, + "learning_rate": 6.028427269870478e-05, + "loss": 1.4666, + "step": 6094 + }, + { + "epoch": 0.6412414518674382, + "grad_norm": 1.7802302837371826, + "learning_rate": 6.025300048061302e-05, + "loss": 2.4148, + "step": 6095 + }, + { + "epoch": 0.6413466596528143, + "grad_norm": 1.3233754634857178, + "learning_rate": 6.0221732878209425e-05, + "loss": 1.2631, + "step": 6096 + }, + { + "epoch": 0.6414518674381904, + "grad_norm": 1.9571051597595215, + "learning_rate": 6.0190469895125e-05, + "loss": 1.8066, + "step": 6097 + }, + { + "epoch": 0.6415570752235665, + "grad_norm": 1.236015796661377, + "learning_rate": 6.0159211534990246e-05, + "loss": 1.7285, + "step": 6098 + }, + { + "epoch": 0.6416622830089427, + "grad_norm": 2.017487049102783, + "learning_rate": 6.012795780143503e-05, + "loss": 2.0849, + "step": 6099 + }, + { + "epoch": 0.6417674907943188, + "grad_norm": 1.846824049949646, + "learning_rate": 6.009670869808879e-05, + "loss": 1.7874, + "step": 6100 + }, + { + "epoch": 0.6418726985796949, + "grad_norm": 2.1392126083374023, + "learning_rate": 6.006546422858039e-05, + "loss": 1.5903, + "step": 6101 + }, + { + "epoch": 0.641977906365071, + "grad_norm": 2.1033949851989746, + "learning_rate": 6.003422439653811e-05, + "loss": 1.7481, + "step": 6102 + }, + { + "epoch": 0.6420831141504472, + "grad_norm": 1.5869107246398926, + "learning_rate": 6.0002989205589734e-05, + "loss": 1.3625, + "step": 6103 + }, + { + "epoch": 0.6421883219358232, + "grad_norm": 1.291629433631897, + "learning_rate": 5.997175865936253e-05, + "loss": 1.7495, + "step": 6104 + }, + { + "epoch": 0.6422935297211994, + "grad_norm": 1.8969913721084595, + "learning_rate": 5.994053276148319e-05, + "loss": 1.7404, + "step": 6105 + }, + { + "epoch": 0.6423987375065755, + "grad_norm": 1.4604777097702026, + "learning_rate": 5.990931151557786e-05, + "loss": 1.749, + "step": 6106 + }, + { + "epoch": 0.6425039452919517, + "grad_norm": 1.6946632862091064, + "learning_rate": 5.987809492527219e-05, + "loss": 1.9089, + "step": 6107 + }, + { + "epoch": 0.6426091530773277, + "grad_norm": 1.3629951477050781, + "learning_rate": 5.984688299419121e-05, + "loss": 1.7441, + "step": 6108 + }, + { + "epoch": 0.6427143608627038, + "grad_norm": 1.3938630819320679, + "learning_rate": 5.981567572595951e-05, + "loss": 1.412, + "step": 6109 + }, + { + "epoch": 0.64281956864808, + "grad_norm": 1.3842976093292236, + "learning_rate": 5.978447312420103e-05, + "loss": 1.7809, + "step": 6110 + }, + { + "epoch": 0.642924776433456, + "grad_norm": 1.5348026752471924, + "learning_rate": 5.9753275192539284e-05, + "loss": 1.398, + "step": 6111 + }, + { + "epoch": 0.6430299842188322, + "grad_norm": 1.968120813369751, + "learning_rate": 5.9722081934597164e-05, + "loss": 1.6602, + "step": 6112 + }, + { + "epoch": 0.6431351920042083, + "grad_norm": 1.070837378501892, + "learning_rate": 5.9690893353997e-05, + "loss": 1.6361, + "step": 6113 + }, + { + "epoch": 0.6432403997895845, + "grad_norm": 2.2223477363586426, + "learning_rate": 5.965970945436068e-05, + "loss": 1.5202, + "step": 6114 + }, + { + "epoch": 0.6433456075749605, + "grad_norm": 1.0596355199813843, + "learning_rate": 5.9628530239309456e-05, + "loss": 1.4806, + "step": 6115 + }, + { + "epoch": 0.6434508153603367, + "grad_norm": 1.7896836996078491, + "learning_rate": 5.95973557124641e-05, + "loss": 1.3566, + "step": 6116 + }, + { + "epoch": 0.6435560231457128, + "grad_norm": 1.657974362373352, + "learning_rate": 5.9566185877444755e-05, + "loss": 1.7571, + "step": 6117 + }, + { + "epoch": 0.6436612309310888, + "grad_norm": 1.8864045143127441, + "learning_rate": 5.9535020737871115e-05, + "loss": 1.9383, + "step": 6118 + }, + { + "epoch": 0.643766438716465, + "grad_norm": 1.6928189992904663, + "learning_rate": 5.950386029736228e-05, + "loss": 1.394, + "step": 6119 + }, + { + "epoch": 0.6438716465018411, + "grad_norm": 1.4859063625335693, + "learning_rate": 5.947270455953677e-05, + "loss": 1.7629, + "step": 6120 + }, + { + "epoch": 0.6439768542872173, + "grad_norm": 2.7389304637908936, + "learning_rate": 5.9441553528012626e-05, + "loss": 1.9255, + "step": 6121 + }, + { + "epoch": 0.6440820620725933, + "grad_norm": 2.5364770889282227, + "learning_rate": 5.9410407206407335e-05, + "loss": 1.8894, + "step": 6122 + }, + { + "epoch": 0.6441872698579695, + "grad_norm": 1.21177077293396, + "learning_rate": 5.9379265598337786e-05, + "loss": 1.7663, + "step": 6123 + }, + { + "epoch": 0.6442924776433456, + "grad_norm": 1.6818538904190063, + "learning_rate": 5.934812870742036e-05, + "loss": 2.2091, + "step": 6124 + }, + { + "epoch": 0.6443976854287218, + "grad_norm": 1.6835134029388428, + "learning_rate": 5.9316996537270895e-05, + "loss": 1.6155, + "step": 6125 + }, + { + "epoch": 0.6445028932140978, + "grad_norm": 1.7322865724563599, + "learning_rate": 5.928586909150464e-05, + "loss": 1.8218, + "step": 6126 + }, + { + "epoch": 0.644608100999474, + "grad_norm": 2.4374451637268066, + "learning_rate": 5.925474637373635e-05, + "loss": 1.2015, + "step": 6127 + }, + { + "epoch": 0.6447133087848501, + "grad_norm": 1.3041414022445679, + "learning_rate": 5.9223628387580176e-05, + "loss": 2.0628, + "step": 6128 + }, + { + "epoch": 0.6448185165702262, + "grad_norm": 1.4549946784973145, + "learning_rate": 5.919251513664982e-05, + "loss": 1.3928, + "step": 6129 + }, + { + "epoch": 0.6449237243556023, + "grad_norm": 2.7458503246307373, + "learning_rate": 5.916140662455828e-05, + "loss": 1.7509, + "step": 6130 + }, + { + "epoch": 0.6450289321409785, + "grad_norm": 1.7431880235671997, + "learning_rate": 5.913030285491808e-05, + "loss": 1.6354, + "step": 6131 + }, + { + "epoch": 0.6451341399263546, + "grad_norm": 1.597713828086853, + "learning_rate": 5.909920383134124e-05, + "loss": 1.6892, + "step": 6132 + }, + { + "epoch": 0.6452393477117306, + "grad_norm": 1.7393560409545898, + "learning_rate": 5.90681095574392e-05, + "loss": 1.5215, + "step": 6133 + }, + { + "epoch": 0.6453445554971068, + "grad_norm": 1.863715648651123, + "learning_rate": 5.903702003682278e-05, + "loss": 2.0865, + "step": 6134 + }, + { + "epoch": 0.6454497632824829, + "grad_norm": 1.6684576272964478, + "learning_rate": 5.900593527310234e-05, + "loss": 2.2619, + "step": 6135 + }, + { + "epoch": 0.645554971067859, + "grad_norm": 1.9676207304000854, + "learning_rate": 5.897485526988766e-05, + "loss": 1.9868, + "step": 6136 + }, + { + "epoch": 0.6456601788532351, + "grad_norm": 2.0112955570220947, + "learning_rate": 5.8943780030787935e-05, + "loss": 1.1787, + "step": 6137 + }, + { + "epoch": 0.6457653866386113, + "grad_norm": 1.3841115236282349, + "learning_rate": 5.891270955941184e-05, + "loss": 1.5988, + "step": 6138 + }, + { + "epoch": 0.6458705944239874, + "grad_norm": 2.0346131324768066, + "learning_rate": 5.8881643859367475e-05, + "loss": 1.7077, + "step": 6139 + }, + { + "epoch": 0.6459758022093635, + "grad_norm": 1.9194620847702026, + "learning_rate": 5.8850582934262446e-05, + "loss": 1.7822, + "step": 6140 + }, + { + "epoch": 0.6460810099947396, + "grad_norm": 1.3288987874984741, + "learning_rate": 5.881952678770368e-05, + "loss": 1.7559, + "step": 6141 + }, + { + "epoch": 0.6461862177801158, + "grad_norm": 1.0419418811798096, + "learning_rate": 5.8788475423297674e-05, + "loss": 2.037, + "step": 6142 + }, + { + "epoch": 0.6462914255654918, + "grad_norm": 1.2322132587432861, + "learning_rate": 5.875742884465033e-05, + "loss": 2.2489, + "step": 6143 + }, + { + "epoch": 0.6463966333508679, + "grad_norm": 1.2339824438095093, + "learning_rate": 5.872638705536695e-05, + "loss": 1.3035, + "step": 6144 + }, + { + "epoch": 0.6465018411362441, + "grad_norm": 1.2765202522277832, + "learning_rate": 5.869535005905232e-05, + "loss": 1.6457, + "step": 6145 + }, + { + "epoch": 0.6466070489216202, + "grad_norm": 1.7985990047454834, + "learning_rate": 5.86643178593107e-05, + "loss": 1.2573, + "step": 6146 + }, + { + "epoch": 0.6467122567069963, + "grad_norm": 1.2002838850021362, + "learning_rate": 5.863329045974572e-05, + "loss": 1.7499, + "step": 6147 + }, + { + "epoch": 0.6468174644923724, + "grad_norm": 1.2847400903701782, + "learning_rate": 5.860226786396049e-05, + "loss": 2.0853, + "step": 6148 + }, + { + "epoch": 0.6469226722777486, + "grad_norm": 1.2351857423782349, + "learning_rate": 5.857125007555759e-05, + "loss": 1.9727, + "step": 6149 + }, + { + "epoch": 0.6470278800631246, + "grad_norm": 1.335922360420227, + "learning_rate": 5.8540237098139006e-05, + "loss": 1.3194, + "step": 6150 + }, + { + "epoch": 0.6471330878485008, + "grad_norm": 2.39359188079834, + "learning_rate": 5.850922893530617e-05, + "loss": 1.9385, + "step": 6151 + }, + { + "epoch": 0.6472382956338769, + "grad_norm": 1.5656342506408691, + "learning_rate": 5.847822559065992e-05, + "loss": 2.1779, + "step": 6152 + }, + { + "epoch": 0.6473435034192531, + "grad_norm": 1.4513521194458008, + "learning_rate": 5.8447227067800616e-05, + "loss": 1.644, + "step": 6153 + }, + { + "epoch": 0.6474487112046291, + "grad_norm": 1.374786138534546, + "learning_rate": 5.841623337032801e-05, + "loss": 1.4539, + "step": 6154 + }, + { + "epoch": 0.6475539189900053, + "grad_norm": 1.5855339765548706, + "learning_rate": 5.838524450184126e-05, + "loss": 1.6174, + "step": 6155 + }, + { + "epoch": 0.6476591267753814, + "grad_norm": 1.3137321472167969, + "learning_rate": 5.835426046593906e-05, + "loss": 1.5084, + "step": 6156 + }, + { + "epoch": 0.6477643345607575, + "grad_norm": 1.2792092561721802, + "learning_rate": 5.8323281266219466e-05, + "loss": 1.8137, + "step": 6157 + }, + { + "epoch": 0.6478695423461336, + "grad_norm": 1.172930121421814, + "learning_rate": 5.8292306906279935e-05, + "loss": 1.7933, + "step": 6158 + }, + { + "epoch": 0.6479747501315097, + "grad_norm": 1.8277498483657837, + "learning_rate": 5.8261337389717506e-05, + "loss": 2.0978, + "step": 6159 + }, + { + "epoch": 0.6480799579168859, + "grad_norm": 1.2689887285232544, + "learning_rate": 5.823037272012852e-05, + "loss": 1.3535, + "step": 6160 + }, + { + "epoch": 0.6481851657022619, + "grad_norm": 1.2342352867126465, + "learning_rate": 5.8199412901108774e-05, + "loss": 2.1196, + "step": 6161 + }, + { + "epoch": 0.6482903734876381, + "grad_norm": 1.4118258953094482, + "learning_rate": 5.8168457936253604e-05, + "loss": 2.2767, + "step": 6162 + }, + { + "epoch": 0.6483955812730142, + "grad_norm": 2.2059452533721924, + "learning_rate": 5.8137507829157655e-05, + "loss": 1.887, + "step": 6163 + }, + { + "epoch": 0.6485007890583904, + "grad_norm": 1.9310505390167236, + "learning_rate": 5.8106562583415037e-05, + "loss": 1.9649, + "step": 6164 + }, + { + "epoch": 0.6486059968437664, + "grad_norm": 2.263485908508301, + "learning_rate": 5.807562220261939e-05, + "loss": 2.1794, + "step": 6165 + }, + { + "epoch": 0.6487112046291426, + "grad_norm": 1.5354760885238647, + "learning_rate": 5.804468669036369e-05, + "loss": 1.8551, + "step": 6166 + }, + { + "epoch": 0.6488164124145187, + "grad_norm": 2.3956634998321533, + "learning_rate": 5.801375605024033e-05, + "loss": 1.8773, + "step": 6167 + }, + { + "epoch": 0.6489216201998947, + "grad_norm": 1.603357195854187, + "learning_rate": 5.798283028584126e-05, + "loss": 1.9813, + "step": 6168 + }, + { + "epoch": 0.6490268279852709, + "grad_norm": 1.1227657794952393, + "learning_rate": 5.795190940075774e-05, + "loss": 1.279, + "step": 6169 + }, + { + "epoch": 0.649132035770647, + "grad_norm": 1.2054638862609863, + "learning_rate": 5.792099339858048e-05, + "loss": 1.933, + "step": 6170 + }, + { + "epoch": 0.6492372435560232, + "grad_norm": 1.7264693975448608, + "learning_rate": 5.789008228289975e-05, + "loss": 1.4221, + "step": 6171 + }, + { + "epoch": 0.6493424513413992, + "grad_norm": 1.8171310424804688, + "learning_rate": 5.785917605730509e-05, + "loss": 2.2143, + "step": 6172 + }, + { + "epoch": 0.6494476591267754, + "grad_norm": 1.2820940017700195, + "learning_rate": 5.7828274725385544e-05, + "loss": 1.1761, + "step": 6173 + }, + { + "epoch": 0.6495528669121515, + "grad_norm": 2.7752187252044678, + "learning_rate": 5.7797378290729595e-05, + "loss": 1.6048, + "step": 6174 + }, + { + "epoch": 0.6496580746975276, + "grad_norm": 2.0099592208862305, + "learning_rate": 5.7766486756925086e-05, + "loss": 2.1283, + "step": 6175 + }, + { + "epoch": 0.6497632824829037, + "grad_norm": 1.9316635131835938, + "learning_rate": 5.773560012755945e-05, + "loss": 2.0693, + "step": 6176 + }, + { + "epoch": 0.6498684902682799, + "grad_norm": 1.1823091506958008, + "learning_rate": 5.770471840621938e-05, + "loss": 2.015, + "step": 6177 + }, + { + "epoch": 0.649973698053656, + "grad_norm": 1.3414077758789062, + "learning_rate": 5.767384159649107e-05, + "loss": 1.5643, + "step": 6178 + }, + { + "epoch": 0.650078905839032, + "grad_norm": 1.3535269498825073, + "learning_rate": 5.764296970196018e-05, + "loss": 1.6854, + "step": 6179 + }, + { + "epoch": 0.6501841136244082, + "grad_norm": 1.28633451461792, + "learning_rate": 5.761210272621175e-05, + "loss": 1.9889, + "step": 6180 + }, + { + "epoch": 0.6502893214097843, + "grad_norm": 1.7541005611419678, + "learning_rate": 5.7581240672830206e-05, + "loss": 2.1746, + "step": 6181 + }, + { + "epoch": 0.6503945291951604, + "grad_norm": 1.0183143615722656, + "learning_rate": 5.7550383545399545e-05, + "loss": 1.1908, + "step": 6182 + }, + { + "epoch": 0.6504997369805365, + "grad_norm": 1.4759997129440308, + "learning_rate": 5.7519531347503054e-05, + "loss": 1.4782, + "step": 6183 + }, + { + "epoch": 0.6506049447659127, + "grad_norm": 1.615997076034546, + "learning_rate": 5.7488684082723454e-05, + "loss": 1.966, + "step": 6184 + }, + { + "epoch": 0.6507101525512888, + "grad_norm": 1.790147304534912, + "learning_rate": 5.745784175464304e-05, + "loss": 1.5509, + "step": 6185 + }, + { + "epoch": 0.6508153603366649, + "grad_norm": 1.5848060846328735, + "learning_rate": 5.742700436684337e-05, + "loss": 1.7556, + "step": 6186 + }, + { + "epoch": 0.650920568122041, + "grad_norm": 1.6461782455444336, + "learning_rate": 5.739617192290545e-05, + "loss": 1.6253, + "step": 6187 + }, + { + "epoch": 0.6510257759074172, + "grad_norm": 1.7758451700210571, + "learning_rate": 5.736534442640984e-05, + "loss": 1.7024, + "step": 6188 + }, + { + "epoch": 0.6511309836927933, + "grad_norm": 1.5912493467330933, + "learning_rate": 5.73345218809364e-05, + "loss": 1.5275, + "step": 6189 + }, + { + "epoch": 0.6512361914781694, + "grad_norm": 0.911089301109314, + "learning_rate": 5.7303704290064375e-05, + "loss": 1.5513, + "step": 6190 + }, + { + "epoch": 0.6513413992635455, + "grad_norm": 1.3374494314193726, + "learning_rate": 5.727289165737263e-05, + "loss": 1.6507, + "step": 6191 + }, + { + "epoch": 0.6514466070489217, + "grad_norm": 1.4440089464187622, + "learning_rate": 5.724208398643924e-05, + "loss": 1.9977, + "step": 6192 + }, + { + "epoch": 0.6515518148342977, + "grad_norm": 2.0308356285095215, + "learning_rate": 5.721128128084191e-05, + "loss": 1.7098, + "step": 6193 + }, + { + "epoch": 0.6516570226196738, + "grad_norm": 1.5039621591567993, + "learning_rate": 5.7180483544157546e-05, + "loss": 1.539, + "step": 6194 + }, + { + "epoch": 0.65176223040505, + "grad_norm": 1.9608867168426514, + "learning_rate": 5.7149690779962594e-05, + "loss": 1.5212, + "step": 6195 + }, + { + "epoch": 0.6518674381904261, + "grad_norm": 2.1692678928375244, + "learning_rate": 5.711890299183298e-05, + "loss": 1.9479, + "step": 6196 + }, + { + "epoch": 0.6519726459758022, + "grad_norm": 1.2889890670776367, + "learning_rate": 5.7088120183343976e-05, + "loss": 1.9426, + "step": 6197 + }, + { + "epoch": 0.6520778537611783, + "grad_norm": 1.587933897972107, + "learning_rate": 5.705734235807021e-05, + "loss": 1.6135, + "step": 6198 + }, + { + "epoch": 0.6521830615465545, + "grad_norm": 2.34407639503479, + "learning_rate": 5.7026569519585916e-05, + "loss": 1.769, + "step": 6199 + }, + { + "epoch": 0.6522882693319305, + "grad_norm": 1.3627784252166748, + "learning_rate": 5.6995801671464556e-05, + "loss": 1.8221, + "step": 6200 + }, + { + "epoch": 0.6523934771173067, + "grad_norm": 1.99224054813385, + "learning_rate": 5.696503881727917e-05, + "loss": 1.7675, + "step": 6201 + }, + { + "epoch": 0.6524986849026828, + "grad_norm": 1.4435157775878906, + "learning_rate": 5.693428096060213e-05, + "loss": 1.2552, + "step": 6202 + }, + { + "epoch": 0.652603892688059, + "grad_norm": 1.6768929958343506, + "learning_rate": 5.6903528105005175e-05, + "loss": 2.1129, + "step": 6203 + }, + { + "epoch": 0.652709100473435, + "grad_norm": 1.7496428489685059, + "learning_rate": 5.6872780254059646e-05, + "loss": 1.8461, + "step": 6204 + }, + { + "epoch": 0.6528143082588111, + "grad_norm": 2.0133605003356934, + "learning_rate": 5.6842037411336116e-05, + "loss": 1.4858, + "step": 6205 + }, + { + "epoch": 0.6529195160441873, + "grad_norm": 1.2797411680221558, + "learning_rate": 5.6811299580404634e-05, + "loss": 1.4251, + "step": 6206 + }, + { + "epoch": 0.6530247238295633, + "grad_norm": 1.3031213283538818, + "learning_rate": 5.678056676483474e-05, + "loss": 1.9861, + "step": 6207 + }, + { + "epoch": 0.6531299316149395, + "grad_norm": 0.9973400831222534, + "learning_rate": 5.6749838968195326e-05, + "loss": 1.543, + "step": 6208 + }, + { + "epoch": 0.6532351394003156, + "grad_norm": 1.331733226776123, + "learning_rate": 5.671911619405465e-05, + "loss": 1.9739, + "step": 6209 + }, + { + "epoch": 0.6533403471856918, + "grad_norm": 2.5411765575408936, + "learning_rate": 5.668839844598053e-05, + "loss": 2.0505, + "step": 6210 + }, + { + "epoch": 0.6534455549710678, + "grad_norm": 1.2405493259429932, + "learning_rate": 5.665768572754007e-05, + "loss": 1.9774, + "step": 6211 + }, + { + "epoch": 0.653550762756444, + "grad_norm": 1.3992949724197388, + "learning_rate": 5.6626978042299814e-05, + "loss": 1.735, + "step": 6212 + }, + { + "epoch": 0.6536559705418201, + "grad_norm": 1.305866003036499, + "learning_rate": 5.6596275393825804e-05, + "loss": 2.2097, + "step": 6213 + }, + { + "epoch": 0.6537611783271962, + "grad_norm": 1.8051230907440186, + "learning_rate": 5.656557778568341e-05, + "loss": 0.9921, + "step": 6214 + }, + { + "epoch": 0.6538663861125723, + "grad_norm": 1.1146427392959595, + "learning_rate": 5.653488522143744e-05, + "loss": 1.6378, + "step": 6215 + }, + { + "epoch": 0.6539715938979485, + "grad_norm": 1.5699180364608765, + "learning_rate": 5.650419770465213e-05, + "loss": 1.5805, + "step": 6216 + }, + { + "epoch": 0.6540768016833246, + "grad_norm": 1.5119853019714355, + "learning_rate": 5.647351523889106e-05, + "loss": 1.5158, + "step": 6217 + }, + { + "epoch": 0.6541820094687006, + "grad_norm": 1.7245757579803467, + "learning_rate": 5.6442837827717386e-05, + "loss": 1.2592, + "step": 6218 + }, + { + "epoch": 0.6542872172540768, + "grad_norm": 1.8998960256576538, + "learning_rate": 5.6412165474693526e-05, + "loss": 1.7007, + "step": 6219 + }, + { + "epoch": 0.6543924250394529, + "grad_norm": 1.3357900381088257, + "learning_rate": 5.638149818338131e-05, + "loss": 1.5266, + "step": 6220 + }, + { + "epoch": 0.6544976328248291, + "grad_norm": 1.37607741355896, + "learning_rate": 5.635083595734212e-05, + "loss": 1.6224, + "step": 6221 + }, + { + "epoch": 0.6546028406102051, + "grad_norm": 1.2907686233520508, + "learning_rate": 5.6320178800136626e-05, + "loss": 1.5942, + "step": 6222 + }, + { + "epoch": 0.6547080483955813, + "grad_norm": 1.0506342649459839, + "learning_rate": 5.628952671532488e-05, + "loss": 1.6633, + "step": 6223 + }, + { + "epoch": 0.6548132561809574, + "grad_norm": 1.3889976739883423, + "learning_rate": 5.625887970646651e-05, + "loss": 1.5176, + "step": 6224 + }, + { + "epoch": 0.6549184639663335, + "grad_norm": 1.7190347909927368, + "learning_rate": 5.6228237777120406e-05, + "loss": 1.4025, + "step": 6225 + }, + { + "epoch": 0.6550236717517096, + "grad_norm": 2.0763463973999023, + "learning_rate": 5.6197600930844864e-05, + "loss": 1.6678, + "step": 6226 + }, + { + "epoch": 0.6551288795370858, + "grad_norm": 1.7063326835632324, + "learning_rate": 5.616696917119773e-05, + "loss": 1.6968, + "step": 6227 + }, + { + "epoch": 0.6552340873224619, + "grad_norm": 1.3571560382843018, + "learning_rate": 5.6136342501736126e-05, + "loss": 1.7551, + "step": 6228 + }, + { + "epoch": 0.655339295107838, + "grad_norm": 1.795301079750061, + "learning_rate": 5.610572092601659e-05, + "loss": 1.7408, + "step": 6229 + }, + { + "epoch": 0.6554445028932141, + "grad_norm": 1.3519048690795898, + "learning_rate": 5.6075104447595186e-05, + "loss": 1.8361, + "step": 6230 + }, + { + "epoch": 0.6555497106785902, + "grad_norm": 1.5597832202911377, + "learning_rate": 5.604449307002726e-05, + "loss": 2.2093, + "step": 6231 + }, + { + "epoch": 0.6556549184639663, + "grad_norm": 1.250503420829773, + "learning_rate": 5.601388679686757e-05, + "loss": 1.7934, + "step": 6232 + }, + { + "epoch": 0.6557601262493424, + "grad_norm": 1.4893306493759155, + "learning_rate": 5.598328563167039e-05, + "loss": 1.7416, + "step": 6233 + }, + { + "epoch": 0.6558653340347186, + "grad_norm": 2.179457902908325, + "learning_rate": 5.5952689577989324e-05, + "loss": 1.2715, + "step": 6234 + }, + { + "epoch": 0.6559705418200947, + "grad_norm": 2.1621286869049072, + "learning_rate": 5.592209863937733e-05, + "loss": 1.8506, + "step": 6235 + }, + { + "epoch": 0.6560757496054708, + "grad_norm": 1.5253031253814697, + "learning_rate": 5.589151281938695e-05, + "loss": 1.5244, + "step": 6236 + }, + { + "epoch": 0.6561809573908469, + "grad_norm": 2.098914384841919, + "learning_rate": 5.586093212156986e-05, + "loss": 1.7779, + "step": 6237 + }, + { + "epoch": 0.6562861651762231, + "grad_norm": 1.6869080066680908, + "learning_rate": 5.583035654947743e-05, + "loss": 1.5032, + "step": 6238 + }, + { + "epoch": 0.6563913729615991, + "grad_norm": 1.3635993003845215, + "learning_rate": 5.5799786106660234e-05, + "loss": 1.3241, + "step": 6239 + }, + { + "epoch": 0.6564965807469753, + "grad_norm": 1.3757116794586182, + "learning_rate": 5.576922079666829e-05, + "loss": 1.5449, + "step": 6240 + }, + { + "epoch": 0.6566017885323514, + "grad_norm": 1.2479023933410645, + "learning_rate": 5.573866062305113e-05, + "loss": 1.9185, + "step": 6241 + }, + { + "epoch": 0.6567069963177276, + "grad_norm": 1.2713433504104614, + "learning_rate": 5.570810558935756e-05, + "loss": 1.955, + "step": 6242 + }, + { + "epoch": 0.6568122041031036, + "grad_norm": 1.468234658241272, + "learning_rate": 5.56775556991358e-05, + "loss": 1.5705, + "step": 6243 + }, + { + "epoch": 0.6569174118884797, + "grad_norm": 1.4933536052703857, + "learning_rate": 5.5647010955933586e-05, + "loss": 1.5719, + "step": 6244 + }, + { + "epoch": 0.6570226196738559, + "grad_norm": 1.548898458480835, + "learning_rate": 5.561647136329789e-05, + "loss": 1.3829, + "step": 6245 + }, + { + "epoch": 0.6571278274592319, + "grad_norm": 1.346425175666809, + "learning_rate": 5.5585936924775275e-05, + "loss": 2.2525, + "step": 6246 + }, + { + "epoch": 0.6572330352446081, + "grad_norm": 1.4224555492401123, + "learning_rate": 5.555540764391156e-05, + "loss": 1.5195, + "step": 6247 + }, + { + "epoch": 0.6573382430299842, + "grad_norm": 1.1593234539031982, + "learning_rate": 5.552488352425195e-05, + "loss": 1.8726, + "step": 6248 + }, + { + "epoch": 0.6574434508153604, + "grad_norm": 1.3007352352142334, + "learning_rate": 5.549436456934121e-05, + "loss": 1.6618, + "step": 6249 + }, + { + "epoch": 0.6575486586007364, + "grad_norm": 1.5232993364334106, + "learning_rate": 5.5463850782723346e-05, + "loss": 1.1512, + "step": 6250 + }, + { + "epoch": 0.6576538663861126, + "grad_norm": 1.0250799655914307, + "learning_rate": 5.5433342167941803e-05, + "loss": 1.8019, + "step": 6251 + }, + { + "epoch": 0.6577590741714887, + "grad_norm": 1.5869766473770142, + "learning_rate": 5.540283872853953e-05, + "loss": 1.0935, + "step": 6252 + }, + { + "epoch": 0.6578642819568649, + "grad_norm": 1.3457857370376587, + "learning_rate": 5.5372340468058726e-05, + "loss": 1.8767, + "step": 6253 + }, + { + "epoch": 0.6579694897422409, + "grad_norm": 1.3926914930343628, + "learning_rate": 5.5341847390041035e-05, + "loss": 1.7272, + "step": 6254 + }, + { + "epoch": 0.658074697527617, + "grad_norm": 1.3577390909194946, + "learning_rate": 5.531135949802759e-05, + "loss": 1.9582, + "step": 6255 + }, + { + "epoch": 0.6581799053129932, + "grad_norm": 1.4779307842254639, + "learning_rate": 5.52808767955588e-05, + "loss": 1.578, + "step": 6256 + }, + { + "epoch": 0.6582851130983692, + "grad_norm": 1.3357998132705688, + "learning_rate": 5.5250399286174546e-05, + "loss": 1.6586, + "step": 6257 + }, + { + "epoch": 0.6583903208837454, + "grad_norm": 1.462868094444275, + "learning_rate": 5.521992697341407e-05, + "loss": 1.3614, + "step": 6258 + }, + { + "epoch": 0.6584955286691215, + "grad_norm": 1.4444942474365234, + "learning_rate": 5.518945986081596e-05, + "loss": 1.1518, + "step": 6259 + }, + { + "epoch": 0.6586007364544977, + "grad_norm": 1.828895092010498, + "learning_rate": 5.515899795191837e-05, + "loss": 1.2489, + "step": 6260 + }, + { + "epoch": 0.6587059442398737, + "grad_norm": 1.7447843551635742, + "learning_rate": 5.512854125025868e-05, + "loss": 1.7433, + "step": 6261 + }, + { + "epoch": 0.6588111520252499, + "grad_norm": 1.3825078010559082, + "learning_rate": 5.5098089759373714e-05, + "loss": 2.2669, + "step": 6262 + }, + { + "epoch": 0.658916359810626, + "grad_norm": 1.5063486099243164, + "learning_rate": 5.5067643482799746e-05, + "loss": 1.814, + "step": 6263 + }, + { + "epoch": 0.6590215675960021, + "grad_norm": 1.3307785987854004, + "learning_rate": 5.50372024240724e-05, + "loss": 1.5017, + "step": 6264 + }, + { + "epoch": 0.6591267753813782, + "grad_norm": 1.2832993268966675, + "learning_rate": 5.500676658672662e-05, + "loss": 1.5099, + "step": 6265 + }, + { + "epoch": 0.6592319831667544, + "grad_norm": 1.9960840940475464, + "learning_rate": 5.4976335974296923e-05, + "loss": 1.546, + "step": 6266 + }, + { + "epoch": 0.6593371909521305, + "grad_norm": 2.2886388301849365, + "learning_rate": 5.4945910590317074e-05, + "loss": 1.4721, + "step": 6267 + }, + { + "epoch": 0.6594423987375065, + "grad_norm": 1.4624122381210327, + "learning_rate": 5.491549043832023e-05, + "loss": 1.8469, + "step": 6268 + }, + { + "epoch": 0.6595476065228827, + "grad_norm": 1.8550081253051758, + "learning_rate": 5.488507552183906e-05, + "loss": 1.6482, + "step": 6269 + }, + { + "epoch": 0.6596528143082588, + "grad_norm": 1.4133846759796143, + "learning_rate": 5.4854665844405505e-05, + "loss": 1.7455, + "step": 6270 + }, + { + "epoch": 0.6597580220936349, + "grad_norm": 1.177406668663025, + "learning_rate": 5.48242614095509e-05, + "loss": 1.6862, + "step": 6271 + }, + { + "epoch": 0.659863229879011, + "grad_norm": 1.473006010055542, + "learning_rate": 5.4793862220806114e-05, + "loss": 1.888, + "step": 6272 + }, + { + "epoch": 0.6599684376643872, + "grad_norm": 1.623795986175537, + "learning_rate": 5.4763468281701235e-05, + "loss": 1.6365, + "step": 6273 + }, + { + "epoch": 0.6600736454497633, + "grad_norm": 1.2633938789367676, + "learning_rate": 5.473307959576579e-05, + "loss": 1.6566, + "step": 6274 + }, + { + "epoch": 0.6601788532351394, + "grad_norm": 1.473612904548645, + "learning_rate": 5.470269616652879e-05, + "loss": 2.0853, + "step": 6275 + }, + { + "epoch": 0.6602840610205155, + "grad_norm": 1.1714433431625366, + "learning_rate": 5.467231799751853e-05, + "loss": 1.3471, + "step": 6276 + }, + { + "epoch": 0.6603892688058917, + "grad_norm": 1.7451331615447998, + "learning_rate": 5.464194509226267e-05, + "loss": 1.5852, + "step": 6277 + }, + { + "epoch": 0.6604944765912677, + "grad_norm": 1.2476874589920044, + "learning_rate": 5.461157745428841e-05, + "loss": 2.072, + "step": 6278 + }, + { + "epoch": 0.6605996843766438, + "grad_norm": 1.997950553894043, + "learning_rate": 5.45812150871222e-05, + "loss": 1.5509, + "step": 6279 + }, + { + "epoch": 0.66070489216202, + "grad_norm": 1.408400058746338, + "learning_rate": 5.455085799428992e-05, + "loss": 1.9504, + "step": 6280 + }, + { + "epoch": 0.6608100999473961, + "grad_norm": 1.8198645114898682, + "learning_rate": 5.452050617931683e-05, + "loss": 2.1562, + "step": 6281 + }, + { + "epoch": 0.6609153077327722, + "grad_norm": 1.5449212789535522, + "learning_rate": 5.449015964572758e-05, + "loss": 1.5864, + "step": 6282 + }, + { + "epoch": 0.6610205155181483, + "grad_norm": 1.971395492553711, + "learning_rate": 5.445981839704626e-05, + "loss": 1.6987, + "step": 6283 + }, + { + "epoch": 0.6611257233035245, + "grad_norm": 1.50814950466156, + "learning_rate": 5.4429482436796265e-05, + "loss": 1.385, + "step": 6284 + }, + { + "epoch": 0.6612309310889006, + "grad_norm": 0.9933773279190063, + "learning_rate": 5.439915176850037e-05, + "loss": 1.5161, + "step": 6285 + }, + { + "epoch": 0.6613361388742767, + "grad_norm": 1.3976936340332031, + "learning_rate": 5.4368826395680875e-05, + "loss": 1.037, + "step": 6286 + }, + { + "epoch": 0.6614413466596528, + "grad_norm": 1.625756859779358, + "learning_rate": 5.4338506321859304e-05, + "loss": 1.6669, + "step": 6287 + }, + { + "epoch": 0.661546554445029, + "grad_norm": 1.8389314413070679, + "learning_rate": 5.430819155055659e-05, + "loss": 2.227, + "step": 6288 + }, + { + "epoch": 0.661651762230405, + "grad_norm": 1.3766262531280518, + "learning_rate": 5.427788208529318e-05, + "loss": 2.0434, + "step": 6289 + }, + { + "epoch": 0.6617569700157812, + "grad_norm": 1.752624750137329, + "learning_rate": 5.4247577929588745e-05, + "loss": 1.7892, + "step": 6290 + }, + { + "epoch": 0.6618621778011573, + "grad_norm": 1.2732161283493042, + "learning_rate": 5.4217279086962416e-05, + "loss": 1.6846, + "step": 6291 + }, + { + "epoch": 0.6619673855865335, + "grad_norm": 1.762704849243164, + "learning_rate": 5.418698556093271e-05, + "loss": 1.3161, + "step": 6292 + }, + { + "epoch": 0.6620725933719095, + "grad_norm": 1.4395411014556885, + "learning_rate": 5.41566973550175e-05, + "loss": 2.1022, + "step": 6293 + }, + { + "epoch": 0.6621778011572856, + "grad_norm": 1.3859562873840332, + "learning_rate": 5.41264144727341e-05, + "loss": 1.3643, + "step": 6294 + }, + { + "epoch": 0.6622830089426618, + "grad_norm": 1.1399462223052979, + "learning_rate": 5.409613691759914e-05, + "loss": 2.0001, + "step": 6295 + }, + { + "epoch": 0.6623882167280378, + "grad_norm": 2.5576746463775635, + "learning_rate": 5.406586469312859e-05, + "loss": 1.2268, + "step": 6296 + }, + { + "epoch": 0.662493424513414, + "grad_norm": 1.959189534187317, + "learning_rate": 5.403559780283795e-05, + "loss": 0.9835, + "step": 6297 + }, + { + "epoch": 0.6625986322987901, + "grad_norm": 1.6175181865692139, + "learning_rate": 5.400533625024199e-05, + "loss": 1.8782, + "step": 6298 + }, + { + "epoch": 0.6627038400841663, + "grad_norm": 1.2457187175750732, + "learning_rate": 5.397508003885483e-05, + "loss": 1.7866, + "step": 6299 + }, + { + "epoch": 0.6628090478695423, + "grad_norm": 1.5392271280288696, + "learning_rate": 5.394482917219015e-05, + "loss": 1.664, + "step": 6300 + }, + { + "epoch": 0.6629142556549185, + "grad_norm": 2.1098384857177734, + "learning_rate": 5.391458365376072e-05, + "loss": 2.0587, + "step": 6301 + }, + { + "epoch": 0.6630194634402946, + "grad_norm": 1.1697083711624146, + "learning_rate": 5.3884343487078984e-05, + "loss": 2.0644, + "step": 6302 + }, + { + "epoch": 0.6631246712256706, + "grad_norm": 1.4275423288345337, + "learning_rate": 5.385410867565658e-05, + "loss": 2.0324, + "step": 6303 + }, + { + "epoch": 0.6632298790110468, + "grad_norm": 1.6790199279785156, + "learning_rate": 5.382387922300454e-05, + "loss": 1.5068, + "step": 6304 + }, + { + "epoch": 0.663335086796423, + "grad_norm": 1.8416249752044678, + "learning_rate": 5.379365513263338e-05, + "loss": 1.9226, + "step": 6305 + }, + { + "epoch": 0.6634402945817991, + "grad_norm": 1.2681195735931396, + "learning_rate": 5.3763436408052904e-05, + "loss": 2.0517, + "step": 6306 + }, + { + "epoch": 0.6635455023671751, + "grad_norm": 1.1384334564208984, + "learning_rate": 5.3733223052772265e-05, + "loss": 1.3946, + "step": 6307 + }, + { + "epoch": 0.6636507101525513, + "grad_norm": 0.976108968257904, + "learning_rate": 5.370301507030012e-05, + "loss": 1.823, + "step": 6308 + }, + { + "epoch": 0.6637559179379274, + "grad_norm": 1.8622645139694214, + "learning_rate": 5.367281246414439e-05, + "loss": 1.3448, + "step": 6309 + }, + { + "epoch": 0.6638611257233035, + "grad_norm": 1.617946982383728, + "learning_rate": 5.364261523781234e-05, + "loss": 1.6499, + "step": 6310 + }, + { + "epoch": 0.6639663335086796, + "grad_norm": 1.8407984972000122, + "learning_rate": 5.361242339481078e-05, + "loss": 2.258, + "step": 6311 + }, + { + "epoch": 0.6640715412940558, + "grad_norm": 1.4248191118240356, + "learning_rate": 5.358223693864575e-05, + "loss": 1.4035, + "step": 6312 + }, + { + "epoch": 0.6641767490794319, + "grad_norm": 1.5487806797027588, + "learning_rate": 5.3552055872822636e-05, + "loss": 1.6018, + "step": 6313 + }, + { + "epoch": 0.664281956864808, + "grad_norm": 1.2339686155319214, + "learning_rate": 5.352188020084638e-05, + "loss": 2.0545, + "step": 6314 + }, + { + "epoch": 0.6643871646501841, + "grad_norm": 1.4714442491531372, + "learning_rate": 5.349170992622112e-05, + "loss": 1.9142, + "step": 6315 + }, + { + "epoch": 0.6644923724355603, + "grad_norm": 1.6540426015853882, + "learning_rate": 5.34615450524504e-05, + "loss": 1.6512, + "step": 6316 + }, + { + "epoch": 0.6645975802209364, + "grad_norm": 1.5326392650604248, + "learning_rate": 5.3431385583037244e-05, + "loss": 1.5426, + "step": 6317 + }, + { + "epoch": 0.6647027880063124, + "grad_norm": 1.2688010931015015, + "learning_rate": 5.340123152148393e-05, + "loss": 1.5912, + "step": 6318 + }, + { + "epoch": 0.6648079957916886, + "grad_norm": 1.2774553298950195, + "learning_rate": 5.337108287129211e-05, + "loss": 1.7242, + "step": 6319 + }, + { + "epoch": 0.6649132035770647, + "grad_norm": 1.8071551322937012, + "learning_rate": 5.334093963596294e-05, + "loss": 1.8998, + "step": 6320 + }, + { + "epoch": 0.6650184113624408, + "grad_norm": 1.0663738250732422, + "learning_rate": 5.33108018189968e-05, + "loss": 1.7188, + "step": 6321 + }, + { + "epoch": 0.6651236191478169, + "grad_norm": 1.3195592164993286, + "learning_rate": 5.328066942389351e-05, + "loss": 1.8181, + "step": 6322 + }, + { + "epoch": 0.6652288269331931, + "grad_norm": 1.5998344421386719, + "learning_rate": 5.325054245415223e-05, + "loss": 1.454, + "step": 6323 + }, + { + "epoch": 0.6653340347185692, + "grad_norm": 1.0443423986434937, + "learning_rate": 5.322042091327148e-05, + "loss": 1.8442, + "step": 6324 + }, + { + "epoch": 0.6654392425039453, + "grad_norm": 1.2177835702896118, + "learning_rate": 5.319030480474923e-05, + "loss": 1.6729, + "step": 6325 + }, + { + "epoch": 0.6655444502893214, + "grad_norm": 2.082824945449829, + "learning_rate": 5.316019413208275e-05, + "loss": 1.8512, + "step": 6326 + }, + { + "epoch": 0.6656496580746976, + "grad_norm": 1.6515694856643677, + "learning_rate": 5.313008889876865e-05, + "loss": 1.9645, + "step": 6327 + }, + { + "epoch": 0.6657548658600736, + "grad_norm": 1.6280758380889893, + "learning_rate": 5.309998910830303e-05, + "loss": 2.4568, + "step": 6328 + }, + { + "epoch": 0.6658600736454497, + "grad_norm": 1.4808907508850098, + "learning_rate": 5.306989476418123e-05, + "loss": 1.864, + "step": 6329 + }, + { + "epoch": 0.6659652814308259, + "grad_norm": 1.735153079032898, + "learning_rate": 5.3039805869897985e-05, + "loss": 2.0455, + "step": 6330 + }, + { + "epoch": 0.666070489216202, + "grad_norm": 1.4877818822860718, + "learning_rate": 5.3009722428947475e-05, + "loss": 1.5956, + "step": 6331 + }, + { + "epoch": 0.6661756970015781, + "grad_norm": 1.3403220176696777, + "learning_rate": 5.297964444482317e-05, + "loss": 1.806, + "step": 6332 + }, + { + "epoch": 0.6662809047869542, + "grad_norm": 1.4084241390228271, + "learning_rate": 5.294957192101788e-05, + "loss": 1.4097, + "step": 6333 + }, + { + "epoch": 0.6663861125723304, + "grad_norm": 1.148952603340149, + "learning_rate": 5.2919504861023903e-05, + "loss": 2.0219, + "step": 6334 + }, + { + "epoch": 0.6664913203577064, + "grad_norm": 1.5325878858566284, + "learning_rate": 5.288944326833281e-05, + "loss": 2.2054, + "step": 6335 + }, + { + "epoch": 0.6665965281430826, + "grad_norm": 1.375962495803833, + "learning_rate": 5.285938714643548e-05, + "loss": 1.1267, + "step": 6336 + }, + { + "epoch": 0.6667017359284587, + "grad_norm": 1.8014800548553467, + "learning_rate": 5.2829336498822335e-05, + "loss": 1.6149, + "step": 6337 + }, + { + "epoch": 0.6668069437138349, + "grad_norm": 1.3949754238128662, + "learning_rate": 5.279929132898298e-05, + "loss": 1.2513, + "step": 6338 + }, + { + "epoch": 0.6669121514992109, + "grad_norm": 1.4356800317764282, + "learning_rate": 5.276925164040653e-05, + "loss": 1.763, + "step": 6339 + }, + { + "epoch": 0.667017359284587, + "grad_norm": 1.1746344566345215, + "learning_rate": 5.2739217436581365e-05, + "loss": 1.503, + "step": 6340 + }, + { + "epoch": 0.6671225670699632, + "grad_norm": 2.2012033462524414, + "learning_rate": 5.270918872099522e-05, + "loss": 2.013, + "step": 6341 + }, + { + "epoch": 0.6672277748553394, + "grad_norm": 2.013820171356201, + "learning_rate": 5.2679165497135285e-05, + "loss": 1.8092, + "step": 6342 + }, + { + "epoch": 0.6673329826407154, + "grad_norm": 1.2771358489990234, + "learning_rate": 5.264914776848808e-05, + "loss": 1.6848, + "step": 6343 + }, + { + "epoch": 0.6674381904260915, + "grad_norm": 1.3562238216400146, + "learning_rate": 5.2619135538539355e-05, + "loss": 1.7494, + "step": 6344 + }, + { + "epoch": 0.6675433982114677, + "grad_norm": 2.059318780899048, + "learning_rate": 5.2589128810774426e-05, + "loss": 1.9948, + "step": 6345 + }, + { + "epoch": 0.6676486059968437, + "grad_norm": 1.2015169858932495, + "learning_rate": 5.2559127588677846e-05, + "loss": 1.5834, + "step": 6346 + }, + { + "epoch": 0.6677538137822199, + "grad_norm": 1.4178433418273926, + "learning_rate": 5.252913187573354e-05, + "loss": 1.76, + "step": 6347 + }, + { + "epoch": 0.667859021567596, + "grad_norm": 1.4683200120925903, + "learning_rate": 5.249914167542486e-05, + "loss": 1.7822, + "step": 6348 + }, + { + "epoch": 0.6679642293529722, + "grad_norm": 1.260953664779663, + "learning_rate": 5.246915699123439e-05, + "loss": 0.8526, + "step": 6349 + }, + { + "epoch": 0.6680694371383482, + "grad_norm": 1.269636869430542, + "learning_rate": 5.243917782664425e-05, + "loss": 1.7709, + "step": 6350 + }, + { + "epoch": 0.6681746449237244, + "grad_norm": 1.585425615310669, + "learning_rate": 5.240920418513577e-05, + "loss": 1.542, + "step": 6351 + }, + { + "epoch": 0.6682798527091005, + "grad_norm": 2.510784149169922, + "learning_rate": 5.2379236070189677e-05, + "loss": 1.4086, + "step": 6352 + }, + { + "epoch": 0.6683850604944765, + "grad_norm": 1.2472009658813477, + "learning_rate": 5.234927348528611e-05, + "loss": 1.7158, + "step": 6353 + }, + { + "epoch": 0.6684902682798527, + "grad_norm": 1.3211387395858765, + "learning_rate": 5.231931643390451e-05, + "loss": 1.8004, + "step": 6354 + }, + { + "epoch": 0.6685954760652288, + "grad_norm": 1.3025023937225342, + "learning_rate": 5.228936491952363e-05, + "loss": 2.2033, + "step": 6355 + }, + { + "epoch": 0.668700683850605, + "grad_norm": 2.7773120403289795, + "learning_rate": 5.2259418945621754e-05, + "loss": 2.0115, + "step": 6356 + }, + { + "epoch": 0.668805891635981, + "grad_norm": 1.2266188859939575, + "learning_rate": 5.222947851567633e-05, + "loss": 1.3392, + "step": 6357 + }, + { + "epoch": 0.6689110994213572, + "grad_norm": 1.2715981006622314, + "learning_rate": 5.219954363316424e-05, + "loss": 1.6663, + "step": 6358 + }, + { + "epoch": 0.6690163072067333, + "grad_norm": 2.1951401233673096, + "learning_rate": 5.2169614301561775e-05, + "loss": 1.6716, + "step": 6359 + }, + { + "epoch": 0.6691215149921094, + "grad_norm": 1.2387498617172241, + "learning_rate": 5.2139690524344495e-05, + "loss": 1.6217, + "step": 6360 + }, + { + "epoch": 0.6692267227774855, + "grad_norm": 1.4151759147644043, + "learning_rate": 5.210977230498733e-05, + "loss": 2.2641, + "step": 6361 + }, + { + "epoch": 0.6693319305628617, + "grad_norm": 1.9566744565963745, + "learning_rate": 5.207985964696462e-05, + "loss": 1.018, + "step": 6362 + }, + { + "epoch": 0.6694371383482378, + "grad_norm": 1.473006010055542, + "learning_rate": 5.2049952553750046e-05, + "loss": 1.7412, + "step": 6363 + }, + { + "epoch": 0.6695423461336139, + "grad_norm": 1.857047438621521, + "learning_rate": 5.202005102881653e-05, + "loss": 1.3064, + "step": 6364 + }, + { + "epoch": 0.66964755391899, + "grad_norm": 1.2658127546310425, + "learning_rate": 5.199015507563656e-05, + "loss": 1.7702, + "step": 6365 + }, + { + "epoch": 0.6697527617043662, + "grad_norm": 1.1785234212875366, + "learning_rate": 5.1960264697681726e-05, + "loss": 2.3375, + "step": 6366 + }, + { + "epoch": 0.6698579694897422, + "grad_norm": 1.1640081405639648, + "learning_rate": 5.19303798984232e-05, + "loss": 2.1538, + "step": 6367 + }, + { + "epoch": 0.6699631772751183, + "grad_norm": 1.3098970651626587, + "learning_rate": 5.1900500681331363e-05, + "loss": 2.0698, + "step": 6368 + }, + { + "epoch": 0.6700683850604945, + "grad_norm": 1.9889769554138184, + "learning_rate": 5.1870627049875954e-05, + "loss": 1.8563, + "step": 6369 + }, + { + "epoch": 0.6701735928458706, + "grad_norm": 1.559300184249878, + "learning_rate": 5.184075900752619e-05, + "loss": 1.4302, + "step": 6370 + }, + { + "epoch": 0.6702788006312467, + "grad_norm": 1.8356385231018066, + "learning_rate": 5.1810896557750485e-05, + "loss": 1.4985, + "step": 6371 + }, + { + "epoch": 0.6703840084166228, + "grad_norm": 1.3510042428970337, + "learning_rate": 5.178103970401664e-05, + "loss": 1.2977, + "step": 6372 + }, + { + "epoch": 0.670489216201999, + "grad_norm": 1.291181206703186, + "learning_rate": 5.1751188449791924e-05, + "loss": 1.4883, + "step": 6373 + }, + { + "epoch": 0.6705944239873751, + "grad_norm": 1.7430511713027954, + "learning_rate": 5.1721342798542795e-05, + "loss": 1.609, + "step": 6374 + }, + { + "epoch": 0.6706996317727512, + "grad_norm": 1.0107218027114868, + "learning_rate": 5.169150275373513e-05, + "loss": 1.5741, + "step": 6375 + }, + { + "epoch": 0.6708048395581273, + "grad_norm": 1.1406487226486206, + "learning_rate": 5.16616683188342e-05, + "loss": 1.1794, + "step": 6376 + }, + { + "epoch": 0.6709100473435035, + "grad_norm": 1.4461215734481812, + "learning_rate": 5.163183949730456e-05, + "loss": 1.4193, + "step": 6377 + }, + { + "epoch": 0.6710152551288795, + "grad_norm": 1.2850546836853027, + "learning_rate": 5.1602016292610075e-05, + "loss": 1.5627, + "step": 6378 + }, + { + "epoch": 0.6711204629142556, + "grad_norm": 1.6368341445922852, + "learning_rate": 5.157219870821413e-05, + "loss": 1.6965, + "step": 6379 + }, + { + "epoch": 0.6712256706996318, + "grad_norm": 1.221516489982605, + "learning_rate": 5.154238674757925e-05, + "loss": 1.3989, + "step": 6380 + }, + { + "epoch": 0.6713308784850079, + "grad_norm": 1.8983935117721558, + "learning_rate": 5.151258041416742e-05, + "loss": 2.2616, + "step": 6381 + }, + { + "epoch": 0.671436086270384, + "grad_norm": 1.487594485282898, + "learning_rate": 5.148277971143998e-05, + "loss": 2.0648, + "step": 6382 + }, + { + "epoch": 0.6715412940557601, + "grad_norm": 1.1469671726226807, + "learning_rate": 5.145298464285757e-05, + "loss": 1.7304, + "step": 6383 + }, + { + "epoch": 0.6716465018411363, + "grad_norm": 1.3301132917404175, + "learning_rate": 5.142319521188017e-05, + "loss": 1.673, + "step": 6384 + }, + { + "epoch": 0.6717517096265123, + "grad_norm": 1.2145934104919434, + "learning_rate": 5.1393411421967174e-05, + "loss": 1.6685, + "step": 6385 + }, + { + "epoch": 0.6718569174118885, + "grad_norm": 1.669686198234558, + "learning_rate": 5.136363327657725e-05, + "loss": 1.9549, + "step": 6386 + }, + { + "epoch": 0.6719621251972646, + "grad_norm": 1.2095863819122314, + "learning_rate": 5.1333860779168455e-05, + "loss": 1.6674, + "step": 6387 + }, + { + "epoch": 0.6720673329826408, + "grad_norm": 1.208296537399292, + "learning_rate": 5.1304093933198136e-05, + "loss": 1.8812, + "step": 6388 + }, + { + "epoch": 0.6721725407680168, + "grad_norm": 1.2008256912231445, + "learning_rate": 5.127433274212301e-05, + "loss": 1.8069, + "step": 6389 + }, + { + "epoch": 0.672277748553393, + "grad_norm": 1.6719691753387451, + "learning_rate": 5.12445772093992e-05, + "loss": 1.8409, + "step": 6390 + }, + { + "epoch": 0.6723829563387691, + "grad_norm": 1.1335160732269287, + "learning_rate": 5.1214827338482094e-05, + "loss": 1.9089, + "step": 6391 + }, + { + "epoch": 0.6724881641241451, + "grad_norm": 1.1994752883911133, + "learning_rate": 5.1185083132826414e-05, + "loss": 1.6759, + "step": 6392 + }, + { + "epoch": 0.6725933719095213, + "grad_norm": 1.675121784210205, + "learning_rate": 5.115534459588631e-05, + "loss": 1.1881, + "step": 6393 + }, + { + "epoch": 0.6726985796948974, + "grad_norm": 2.1219699382781982, + "learning_rate": 5.1125611731115174e-05, + "loss": 1.4658, + "step": 6394 + }, + { + "epoch": 0.6728037874802736, + "grad_norm": 1.6067357063293457, + "learning_rate": 5.1095884541965835e-05, + "loss": 1.4778, + "step": 6395 + }, + { + "epoch": 0.6729089952656496, + "grad_norm": 1.3906060457229614, + "learning_rate": 5.106616303189039e-05, + "loss": 1.727, + "step": 6396 + }, + { + "epoch": 0.6730142030510258, + "grad_norm": 1.3164790868759155, + "learning_rate": 5.103644720434027e-05, + "loss": 1.8831, + "step": 6397 + }, + { + "epoch": 0.6731194108364019, + "grad_norm": 2.2263622283935547, + "learning_rate": 5.100673706276633e-05, + "loss": 1.7309, + "step": 6398 + }, + { + "epoch": 0.673224618621778, + "grad_norm": 2.4222521781921387, + "learning_rate": 5.097703261061868e-05, + "loss": 1.2371, + "step": 6399 + }, + { + "epoch": 0.6733298264071541, + "grad_norm": 1.8905748128890991, + "learning_rate": 5.094733385134677e-05, + "loss": 1.5675, + "step": 6400 + }, + { + "epoch": 0.6734350341925303, + "grad_norm": 1.5610456466674805, + "learning_rate": 5.091764078839949e-05, + "loss": 1.4919, + "step": 6401 + }, + { + "epoch": 0.6735402419779064, + "grad_norm": 1.896572232246399, + "learning_rate": 5.088795342522497e-05, + "loss": 1.4104, + "step": 6402 + }, + { + "epoch": 0.6736454497632824, + "grad_norm": 1.2327032089233398, + "learning_rate": 5.085827176527064e-05, + "loss": 1.3777, + "step": 6403 + }, + { + "epoch": 0.6737506575486586, + "grad_norm": 1.7423434257507324, + "learning_rate": 5.082859581198344e-05, + "loss": 1.6206, + "step": 6404 + }, + { + "epoch": 0.6738558653340347, + "grad_norm": 1.2722043991088867, + "learning_rate": 5.0798925568809486e-05, + "loss": 1.4621, + "step": 6405 + }, + { + "epoch": 0.6739610731194109, + "grad_norm": 1.320483922958374, + "learning_rate": 5.076926103919426e-05, + "loss": 2.0733, + "step": 6406 + }, + { + "epoch": 0.6740662809047869, + "grad_norm": 1.2347058057785034, + "learning_rate": 5.0739602226582706e-05, + "loss": 1.6421, + "step": 6407 + }, + { + "epoch": 0.6741714886901631, + "grad_norm": 1.6619998216629028, + "learning_rate": 5.0709949134418865e-05, + "loss": 2.2148, + "step": 6408 + }, + { + "epoch": 0.6742766964755392, + "grad_norm": 2.012146234512329, + "learning_rate": 5.0680301766146355e-05, + "loss": 1.6082, + "step": 6409 + }, + { + "epoch": 0.6743819042609153, + "grad_norm": 1.2811518907546997, + "learning_rate": 5.0650660125207994e-05, + "loss": 1.6369, + "step": 6410 + }, + { + "epoch": 0.6744871120462914, + "grad_norm": 1.504457712173462, + "learning_rate": 5.062102421504593e-05, + "loss": 1.9906, + "step": 6411 + }, + { + "epoch": 0.6745923198316676, + "grad_norm": 1.718910574913025, + "learning_rate": 5.059139403910177e-05, + "loss": 1.5476, + "step": 6412 + }, + { + "epoch": 0.6746975276170437, + "grad_norm": 1.8171217441558838, + "learning_rate": 5.056176960081631e-05, + "loss": 1.605, + "step": 6413 + }, + { + "epoch": 0.6748027354024198, + "grad_norm": 1.2599389553070068, + "learning_rate": 5.0532150903629724e-05, + "loss": 1.892, + "step": 6414 + }, + { + "epoch": 0.6749079431877959, + "grad_norm": 1.823708415031433, + "learning_rate": 5.050253795098159e-05, + "loss": 1.8375, + "step": 6415 + }, + { + "epoch": 0.675013150973172, + "grad_norm": 1.2546007633209229, + "learning_rate": 5.047293074631074e-05, + "loss": 1.4461, + "step": 6416 + }, + { + "epoch": 0.6751183587585481, + "grad_norm": 1.1789088249206543, + "learning_rate": 5.0443329293055305e-05, + "loss": 1.7132, + "step": 6417 + }, + { + "epoch": 0.6752235665439242, + "grad_norm": 1.3993732929229736, + "learning_rate": 5.041373359465289e-05, + "loss": 1.5287, + "step": 6418 + }, + { + "epoch": 0.6753287743293004, + "grad_norm": 1.6638224124908447, + "learning_rate": 5.0384143654540314e-05, + "loss": 1.1025, + "step": 6419 + }, + { + "epoch": 0.6754339821146765, + "grad_norm": 1.2212295532226562, + "learning_rate": 5.035455947615373e-05, + "loss": 2.0747, + "step": 6420 + }, + { + "epoch": 0.6755391899000526, + "grad_norm": 1.5106666088104248, + "learning_rate": 5.032498106292869e-05, + "loss": 1.3987, + "step": 6421 + }, + { + "epoch": 0.6756443976854287, + "grad_norm": 1.0953783988952637, + "learning_rate": 5.029540841830004e-05, + "loss": 1.4668, + "step": 6422 + }, + { + "epoch": 0.6757496054708049, + "grad_norm": 1.4428025484085083, + "learning_rate": 5.0265841545701886e-05, + "loss": 1.8213, + "step": 6423 + }, + { + "epoch": 0.6758548132561809, + "grad_norm": 1.1565446853637695, + "learning_rate": 5.023628044856783e-05, + "loss": 1.9793, + "step": 6424 + }, + { + "epoch": 0.6759600210415571, + "grad_norm": 1.158007025718689, + "learning_rate": 5.020672513033066e-05, + "loss": 1.5613, + "step": 6425 + }, + { + "epoch": 0.6760652288269332, + "grad_norm": 1.8131704330444336, + "learning_rate": 5.017717559442249e-05, + "loss": 2.1706, + "step": 6426 + }, + { + "epoch": 0.6761704366123094, + "grad_norm": 1.257173776626587, + "learning_rate": 5.014763184427489e-05, + "loss": 1.7876, + "step": 6427 + }, + { + "epoch": 0.6762756443976854, + "grad_norm": 1.4548490047454834, + "learning_rate": 5.011809388331865e-05, + "loss": 2.1786, + "step": 6428 + }, + { + "epoch": 0.6763808521830615, + "grad_norm": 1.3961397409439087, + "learning_rate": 5.0088561714983906e-05, + "loss": 1.6834, + "step": 6429 + }, + { + "epoch": 0.6764860599684377, + "grad_norm": 1.5321639776229858, + "learning_rate": 5.0059035342700144e-05, + "loss": 1.0357, + "step": 6430 + }, + { + "epoch": 0.6765912677538137, + "grad_norm": 1.6330370903015137, + "learning_rate": 5.0029514769896114e-05, + "loss": 1.6928, + "step": 6431 + }, + { + "epoch": 0.6766964755391899, + "grad_norm": 1.063062310218811, + "learning_rate": 5.000000000000002e-05, + "loss": 1.5404, + "step": 6432 + }, + { + "epoch": 0.676801683324566, + "grad_norm": 0.9866712093353271, + "learning_rate": 4.9970491036439284e-05, + "loss": 1.7373, + "step": 6433 + }, + { + "epoch": 0.6769068911099422, + "grad_norm": 1.9362293481826782, + "learning_rate": 4.9940987882640647e-05, + "loss": 2.12, + "step": 6434 + }, + { + "epoch": 0.6770120988953182, + "grad_norm": 1.8418405055999756, + "learning_rate": 4.991149054203027e-05, + "loss": 1.6442, + "step": 6435 + }, + { + "epoch": 0.6771173066806944, + "grad_norm": 1.7248798608779907, + "learning_rate": 4.988199901803357e-05, + "loss": 1.6048, + "step": 6436 + }, + { + "epoch": 0.6772225144660705, + "grad_norm": 1.2888562679290771, + "learning_rate": 4.985251331407524e-05, + "loss": 1.7667, + "step": 6437 + }, + { + "epoch": 0.6773277222514467, + "grad_norm": 1.4221709966659546, + "learning_rate": 4.982303343357946e-05, + "loss": 1.7982, + "step": 6438 + }, + { + "epoch": 0.6774329300368227, + "grad_norm": 1.8176785707473755, + "learning_rate": 4.9793559379969566e-05, + "loss": 1.9093, + "step": 6439 + }, + { + "epoch": 0.6775381378221988, + "grad_norm": 1.803604245185852, + "learning_rate": 4.9764091156668266e-05, + "loss": 1.9377, + "step": 6440 + }, + { + "epoch": 0.677643345607575, + "grad_norm": 1.8046791553497314, + "learning_rate": 4.973462876709767e-05, + "loss": 1.357, + "step": 6441 + }, + { + "epoch": 0.677748553392951, + "grad_norm": 2.0363752841949463, + "learning_rate": 4.970517221467909e-05, + "loss": 1.4152, + "step": 6442 + }, + { + "epoch": 0.6778537611783272, + "grad_norm": 1.5641330480575562, + "learning_rate": 4.967572150283326e-05, + "loss": 1.8567, + "step": 6443 + }, + { + "epoch": 0.6779589689637033, + "grad_norm": 1.910548448562622, + "learning_rate": 4.9646276634980194e-05, + "loss": 2.0569, + "step": 6444 + }, + { + "epoch": 0.6780641767490795, + "grad_norm": 1.1518720388412476, + "learning_rate": 4.961683761453917e-05, + "loss": 1.8144, + "step": 6445 + }, + { + "epoch": 0.6781693845344555, + "grad_norm": 2.0624051094055176, + "learning_rate": 4.958740444492892e-05, + "loss": 1.5392, + "step": 6446 + }, + { + "epoch": 0.6782745923198317, + "grad_norm": 1.4468212127685547, + "learning_rate": 4.955797712956739e-05, + "loss": 1.4592, + "step": 6447 + }, + { + "epoch": 0.6783798001052078, + "grad_norm": 1.4319233894348145, + "learning_rate": 4.9528555671871835e-05, + "loss": 1.6202, + "step": 6448 + }, + { + "epoch": 0.6784850078905839, + "grad_norm": 2.070284605026245, + "learning_rate": 4.9499140075258957e-05, + "loss": 1.5695, + "step": 6449 + }, + { + "epoch": 0.67859021567596, + "grad_norm": 1.7108412981033325, + "learning_rate": 4.9469730343144635e-05, + "loss": 1.3484, + "step": 6450 + }, + { + "epoch": 0.6786954234613362, + "grad_norm": 1.4341391324996948, + "learning_rate": 4.944032647894414e-05, + "loss": 1.2341, + "step": 6451 + }, + { + "epoch": 0.6788006312467123, + "grad_norm": 1.3790180683135986, + "learning_rate": 4.941092848607204e-05, + "loss": 1.4654, + "step": 6452 + }, + { + "epoch": 0.6789058390320883, + "grad_norm": 2.0244486331939697, + "learning_rate": 4.9381536367942195e-05, + "loss": 1.7548, + "step": 6453 + }, + { + "epoch": 0.6790110468174645, + "grad_norm": 1.3763808012008667, + "learning_rate": 4.935215012796789e-05, + "loss": 1.2737, + "step": 6454 + }, + { + "epoch": 0.6791162546028406, + "grad_norm": 1.4411334991455078, + "learning_rate": 4.93227697695616e-05, + "loss": 1.1812, + "step": 6455 + }, + { + "epoch": 0.6792214623882167, + "grad_norm": 1.3906333446502686, + "learning_rate": 4.929339529613515e-05, + "loss": 1.8517, + "step": 6456 + }, + { + "epoch": 0.6793266701735928, + "grad_norm": 1.6477974653244019, + "learning_rate": 4.9264026711099764e-05, + "loss": 1.6151, + "step": 6457 + }, + { + "epoch": 0.679431877958969, + "grad_norm": 1.5431394577026367, + "learning_rate": 4.9234664017865896e-05, + "loss": 2.044, + "step": 6458 + }, + { + "epoch": 0.6795370857443451, + "grad_norm": 1.9720886945724487, + "learning_rate": 4.920530721984329e-05, + "loss": 1.6605, + "step": 6459 + }, + { + "epoch": 0.6796422935297212, + "grad_norm": 1.2280595302581787, + "learning_rate": 4.917595632044113e-05, + "loss": 1.6364, + "step": 6460 + }, + { + "epoch": 0.6797475013150973, + "grad_norm": 1.3215405941009521, + "learning_rate": 4.914661132306779e-05, + "loss": 1.275, + "step": 6461 + }, + { + "epoch": 0.6798527091004735, + "grad_norm": 1.446702241897583, + "learning_rate": 4.911727223113099e-05, + "loss": 2.1152, + "step": 6462 + }, + { + "epoch": 0.6799579168858495, + "grad_norm": 1.3523727655410767, + "learning_rate": 4.908793904803787e-05, + "loss": 1.2915, + "step": 6463 + }, + { + "epoch": 0.6800631246712256, + "grad_norm": 1.8233401775360107, + "learning_rate": 4.9058611777194716e-05, + "loss": 1.0423, + "step": 6464 + }, + { + "epoch": 0.6801683324566018, + "grad_norm": 1.9023778438568115, + "learning_rate": 4.9029290422007204e-05, + "loss": 1.5998, + "step": 6465 + }, + { + "epoch": 0.680273540241978, + "grad_norm": 1.1877436637878418, + "learning_rate": 4.8999974985880384e-05, + "loss": 1.9294, + "step": 6466 + }, + { + "epoch": 0.680378748027354, + "grad_norm": 1.5044838190078735, + "learning_rate": 4.8970665472218537e-05, + "loss": 1.5298, + "step": 6467 + }, + { + "epoch": 0.6804839558127301, + "grad_norm": 1.9050568342208862, + "learning_rate": 4.8941361884425215e-05, + "loss": 1.5252, + "step": 6468 + }, + { + "epoch": 0.6805891635981063, + "grad_norm": 1.1263233423233032, + "learning_rate": 4.891206422590347e-05, + "loss": 1.6088, + "step": 6469 + }, + { + "epoch": 0.6806943713834824, + "grad_norm": 1.9254624843597412, + "learning_rate": 4.8882772500055464e-05, + "loss": 2.1593, + "step": 6470 + }, + { + "epoch": 0.6807995791688585, + "grad_norm": 2.4042844772338867, + "learning_rate": 4.885348671028273e-05, + "loss": 1.6122, + "step": 6471 + }, + { + "epoch": 0.6809047869542346, + "grad_norm": 1.4427109956741333, + "learning_rate": 4.882420685998623e-05, + "loss": 1.304, + "step": 6472 + }, + { + "epoch": 0.6810099947396108, + "grad_norm": 1.020768404006958, + "learning_rate": 4.8794932952566e-05, + "loss": 1.9123, + "step": 6473 + }, + { + "epoch": 0.6811152025249868, + "grad_norm": 1.6786848306655884, + "learning_rate": 4.8765664991421634e-05, + "loss": 1.9933, + "step": 6474 + }, + { + "epoch": 0.681220410310363, + "grad_norm": 1.6853234767913818, + "learning_rate": 4.8736402979951867e-05, + "loss": 1.9755, + "step": 6475 + }, + { + "epoch": 0.6813256180957391, + "grad_norm": 1.9556273221969604, + "learning_rate": 4.870714692155479e-05, + "loss": 1.5554, + "step": 6476 + }, + { + "epoch": 0.6814308258811153, + "grad_norm": 1.1733824014663696, + "learning_rate": 4.867789681962788e-05, + "loss": 1.7569, + "step": 6477 + }, + { + "epoch": 0.6815360336664913, + "grad_norm": 1.0935428142547607, + "learning_rate": 4.864865267756779e-05, + "loss": 1.993, + "step": 6478 + }, + { + "epoch": 0.6816412414518674, + "grad_norm": 1.0769548416137695, + "learning_rate": 4.8619414498770556e-05, + "loss": 1.8029, + "step": 6479 + }, + { + "epoch": 0.6817464492372436, + "grad_norm": 1.2517523765563965, + "learning_rate": 4.859018228663155e-05, + "loss": 2.1288, + "step": 6480 + }, + { + "epoch": 0.6818516570226196, + "grad_norm": 1.2191598415374756, + "learning_rate": 4.856095604454539e-05, + "loss": 1.2368, + "step": 6481 + }, + { + "epoch": 0.6819568648079958, + "grad_norm": 1.3996628522872925, + "learning_rate": 4.8531735775905975e-05, + "loss": 1.7399, + "step": 6482 + }, + { + "epoch": 0.6820620725933719, + "grad_norm": 1.9159706830978394, + "learning_rate": 4.850252148410665e-05, + "loss": 1.5246, + "step": 6483 + }, + { + "epoch": 0.6821672803787481, + "grad_norm": 1.7578186988830566, + "learning_rate": 4.8473313172539925e-05, + "loss": 1.8513, + "step": 6484 + }, + { + "epoch": 0.6822724881641241, + "grad_norm": 1.3070460557937622, + "learning_rate": 4.8444110844597626e-05, + "loss": 1.6536, + "step": 6485 + }, + { + "epoch": 0.6823776959495003, + "grad_norm": 2.009951591491699, + "learning_rate": 4.8414914503671006e-05, + "loss": 1.1448, + "step": 6486 + }, + { + "epoch": 0.6824829037348764, + "grad_norm": 2.186450719833374, + "learning_rate": 4.838572415315046e-05, + "loss": 1.8157, + "step": 6487 + }, + { + "epoch": 0.6825881115202524, + "grad_norm": 1.4713935852050781, + "learning_rate": 4.835653979642585e-05, + "loss": 1.0556, + "step": 6488 + }, + { + "epoch": 0.6826933193056286, + "grad_norm": 1.4540444612503052, + "learning_rate": 4.832736143688621e-05, + "loss": 0.9736, + "step": 6489 + }, + { + "epoch": 0.6827985270910047, + "grad_norm": 1.8345993757247925, + "learning_rate": 4.829818907791988e-05, + "loss": 1.4606, + "step": 6490 + }, + { + "epoch": 0.6829037348763809, + "grad_norm": 1.7492965459823608, + "learning_rate": 4.826902272291467e-05, + "loss": 2.0597, + "step": 6491 + }, + { + "epoch": 0.6830089426617569, + "grad_norm": 1.3600223064422607, + "learning_rate": 4.8239862375257484e-05, + "loss": 1.4011, + "step": 6492 + }, + { + "epoch": 0.6831141504471331, + "grad_norm": 1.2309643030166626, + "learning_rate": 4.821070803833464e-05, + "loss": 1.3888, + "step": 6493 + }, + { + "epoch": 0.6832193582325092, + "grad_norm": 1.445009708404541, + "learning_rate": 4.818155971553174e-05, + "loss": 2.1089, + "step": 6494 + }, + { + "epoch": 0.6833245660178853, + "grad_norm": 1.5966815948486328, + "learning_rate": 4.815241741023367e-05, + "loss": 1.3903, + "step": 6495 + }, + { + "epoch": 0.6834297738032614, + "grad_norm": 2.34873104095459, + "learning_rate": 4.8123281125824605e-05, + "loss": 1.5625, + "step": 6496 + }, + { + "epoch": 0.6835349815886376, + "grad_norm": 1.5460422039031982, + "learning_rate": 4.809415086568812e-05, + "loss": 1.5584, + "step": 6497 + }, + { + "epoch": 0.6836401893740137, + "grad_norm": 1.2038516998291016, + "learning_rate": 4.806502663320692e-05, + "loss": 1.5281, + "step": 6498 + }, + { + "epoch": 0.6837453971593898, + "grad_norm": 1.3815281391143799, + "learning_rate": 4.803590843176321e-05, + "loss": 1.5034, + "step": 6499 + }, + { + "epoch": 0.6838506049447659, + "grad_norm": 0.981395959854126, + "learning_rate": 4.800679626473833e-05, + "loss": 1.9422, + "step": 6500 + }, + { + "epoch": 0.683955812730142, + "grad_norm": 1.7010056972503662, + "learning_rate": 4.797769013551295e-05, + "loss": 1.8847, + "step": 6501 + }, + { + "epoch": 0.6840610205155182, + "grad_norm": 1.381854772567749, + "learning_rate": 4.7948590047467153e-05, + "loss": 1.5431, + "step": 6502 + }, + { + "epoch": 0.6841662283008942, + "grad_norm": 1.5627236366271973, + "learning_rate": 4.7919496003980204e-05, + "loss": 2.1837, + "step": 6503 + }, + { + "epoch": 0.6842714360862704, + "grad_norm": 1.6263660192489624, + "learning_rate": 4.7890408008430634e-05, + "loss": 1.7972, + "step": 6504 + }, + { + "epoch": 0.6843766438716465, + "grad_norm": 1.3480515480041504, + "learning_rate": 4.786132606419643e-05, + "loss": 2.0057, + "step": 6505 + }, + { + "epoch": 0.6844818516570226, + "grad_norm": 1.6764869689941406, + "learning_rate": 4.783225017465475e-05, + "loss": 1.7601, + "step": 6506 + }, + { + "epoch": 0.6845870594423987, + "grad_norm": 1.2646028995513916, + "learning_rate": 4.780318034318202e-05, + "loss": 1.5116, + "step": 6507 + }, + { + "epoch": 0.6846922672277749, + "grad_norm": 1.2866673469543457, + "learning_rate": 4.7774116573154125e-05, + "loss": 1.5891, + "step": 6508 + }, + { + "epoch": 0.684797475013151, + "grad_norm": 1.5979012250900269, + "learning_rate": 4.774505886794609e-05, + "loss": 1.6371, + "step": 6509 + }, + { + "epoch": 0.6849026827985271, + "grad_norm": 1.9245673418045044, + "learning_rate": 4.771600723093227e-05, + "loss": 1.6979, + "step": 6510 + }, + { + "epoch": 0.6850078905839032, + "grad_norm": 1.8134851455688477, + "learning_rate": 4.7686961665486396e-05, + "loss": 1.9395, + "step": 6511 + }, + { + "epoch": 0.6851130983692794, + "grad_norm": 1.7789303064346313, + "learning_rate": 4.76579221749814e-05, + "loss": 1.915, + "step": 6512 + }, + { + "epoch": 0.6852183061546554, + "grad_norm": 1.5152759552001953, + "learning_rate": 4.7628888762789504e-05, + "loss": 1.4219, + "step": 6513 + }, + { + "epoch": 0.6853235139400315, + "grad_norm": 1.3812377452850342, + "learning_rate": 4.7599861432282334e-05, + "loss": 1.9226, + "step": 6514 + }, + { + "epoch": 0.6854287217254077, + "grad_norm": 0.8945170044898987, + "learning_rate": 4.757084018683071e-05, + "loss": 1.481, + "step": 6515 + }, + { + "epoch": 0.6855339295107838, + "grad_norm": 1.3005067110061646, + "learning_rate": 4.754182502980477e-05, + "loss": 1.8098, + "step": 6516 + }, + { + "epoch": 0.6856391372961599, + "grad_norm": 2.368222951889038, + "learning_rate": 4.7512815964573966e-05, + "loss": 1.1709, + "step": 6517 + }, + { + "epoch": 0.685744345081536, + "grad_norm": 1.7931416034698486, + "learning_rate": 4.748381299450695e-05, + "loss": 1.9524, + "step": 6518 + }, + { + "epoch": 0.6858495528669122, + "grad_norm": 1.7992273569107056, + "learning_rate": 4.7454816122971846e-05, + "loss": 0.7749, + "step": 6519 + }, + { + "epoch": 0.6859547606522882, + "grad_norm": 1.1509588956832886, + "learning_rate": 4.7425825353335915e-05, + "loss": 1.5281, + "step": 6520 + }, + { + "epoch": 0.6860599684376644, + "grad_norm": 2.1700100898742676, + "learning_rate": 4.7396840688965726e-05, + "loss": 1.7834, + "step": 6521 + }, + { + "epoch": 0.6861651762230405, + "grad_norm": 1.983696699142456, + "learning_rate": 4.7367862133227244e-05, + "loss": 1.9279, + "step": 6522 + }, + { + "epoch": 0.6862703840084167, + "grad_norm": 1.1107069253921509, + "learning_rate": 4.7338889689485624e-05, + "loss": 1.4591, + "step": 6523 + }, + { + "epoch": 0.6863755917937927, + "grad_norm": 1.1690186262130737, + "learning_rate": 4.730992336110529e-05, + "loss": 1.857, + "step": 6524 + }, + { + "epoch": 0.6864807995791689, + "grad_norm": 1.3441321849822998, + "learning_rate": 4.7280963151450096e-05, + "loss": 1.4621, + "step": 6525 + }, + { + "epoch": 0.686586007364545, + "grad_norm": 2.181993246078491, + "learning_rate": 4.7252009063883054e-05, + "loss": 1.483, + "step": 6526 + }, + { + "epoch": 0.686691215149921, + "grad_norm": 1.914829134941101, + "learning_rate": 4.722306110176647e-05, + "loss": 0.7554, + "step": 6527 + }, + { + "epoch": 0.6867964229352972, + "grad_norm": 2.0463366508483887, + "learning_rate": 4.719411926846203e-05, + "loss": 1.722, + "step": 6528 + }, + { + "epoch": 0.6869016307206733, + "grad_norm": 1.2413219213485718, + "learning_rate": 4.716518356733064e-05, + "loss": 1.53, + "step": 6529 + }, + { + "epoch": 0.6870068385060495, + "grad_norm": 1.5359559059143066, + "learning_rate": 4.713625400173247e-05, + "loss": 1.1939, + "step": 6530 + }, + { + "epoch": 0.6871120462914255, + "grad_norm": 1.2304325103759766, + "learning_rate": 4.7107330575027084e-05, + "loss": 1.6209, + "step": 6531 + }, + { + "epoch": 0.6872172540768017, + "grad_norm": 1.448864221572876, + "learning_rate": 4.707841329057322e-05, + "loss": 1.6243, + "step": 6532 + }, + { + "epoch": 0.6873224618621778, + "grad_norm": 1.8410067558288574, + "learning_rate": 4.7049502151728933e-05, + "loss": 1.5567, + "step": 6533 + }, + { + "epoch": 0.687427669647554, + "grad_norm": 2.130892753601074, + "learning_rate": 4.702059716185162e-05, + "loss": 1.9438, + "step": 6534 + }, + { + "epoch": 0.68753287743293, + "grad_norm": 1.4536875486373901, + "learning_rate": 4.6991698324297874e-05, + "loss": 1.7863, + "step": 6535 + }, + { + "epoch": 0.6876380852183062, + "grad_norm": 1.5897191762924194, + "learning_rate": 4.696280564242371e-05, + "loss": 0.9989, + "step": 6536 + }, + { + "epoch": 0.6877432930036823, + "grad_norm": 1.4431312084197998, + "learning_rate": 4.693391911958426e-05, + "loss": 1.7681, + "step": 6537 + }, + { + "epoch": 0.6878485007890583, + "grad_norm": 1.8787363767623901, + "learning_rate": 4.690503875913399e-05, + "loss": 1.8113, + "step": 6538 + }, + { + "epoch": 0.6879537085744345, + "grad_norm": 1.407496452331543, + "learning_rate": 4.687616456442677e-05, + "loss": 1.7591, + "step": 6539 + }, + { + "epoch": 0.6880589163598106, + "grad_norm": 1.6579171419143677, + "learning_rate": 4.684729653881563e-05, + "loss": 1.686, + "step": 6540 + }, + { + "epoch": 0.6881641241451868, + "grad_norm": 2.2351560592651367, + "learning_rate": 4.681843468565288e-05, + "loss": 1.7476, + "step": 6541 + }, + { + "epoch": 0.6882693319305628, + "grad_norm": 2.1286520957946777, + "learning_rate": 4.6789579008290216e-05, + "loss": 1.9967, + "step": 6542 + }, + { + "epoch": 0.688374539715939, + "grad_norm": 1.540056586265564, + "learning_rate": 4.676072951007849e-05, + "loss": 1.0093, + "step": 6543 + }, + { + "epoch": 0.6884797475013151, + "grad_norm": 1.2352454662322998, + "learning_rate": 4.673188619436798e-05, + "loss": 1.7239, + "step": 6544 + }, + { + "epoch": 0.6885849552866912, + "grad_norm": 1.0865057706832886, + "learning_rate": 4.670304906450811e-05, + "loss": 2.0791, + "step": 6545 + }, + { + "epoch": 0.6886901630720673, + "grad_norm": 1.6150873899459839, + "learning_rate": 4.667421812384761e-05, + "loss": 1.813, + "step": 6546 + }, + { + "epoch": 0.6887953708574435, + "grad_norm": 1.4798706769943237, + "learning_rate": 4.66453933757346e-05, + "loss": 1.634, + "step": 6547 + }, + { + "epoch": 0.6889005786428196, + "grad_norm": 1.3703545331954956, + "learning_rate": 4.661657482351637e-05, + "loss": 1.7388, + "step": 6548 + }, + { + "epoch": 0.6890057864281957, + "grad_norm": 1.6316715478897095, + "learning_rate": 4.658776247053948e-05, + "loss": 2.3265, + "step": 6549 + }, + { + "epoch": 0.6891109942135718, + "grad_norm": 1.0946226119995117, + "learning_rate": 4.6558956320149884e-05, + "loss": 1.2866, + "step": 6550 + }, + { + "epoch": 0.689216201998948, + "grad_norm": 1.6693997383117676, + "learning_rate": 4.6530156375692726e-05, + "loss": 1.583, + "step": 6551 + }, + { + "epoch": 0.689321409784324, + "grad_norm": 1.315307378768921, + "learning_rate": 4.650136264051238e-05, + "loss": 2.1853, + "step": 6552 + }, + { + "epoch": 0.6894266175697001, + "grad_norm": 1.1383665800094604, + "learning_rate": 4.6472575117952676e-05, + "loss": 1.7014, + "step": 6553 + }, + { + "epoch": 0.6895318253550763, + "grad_norm": 1.3326300382614136, + "learning_rate": 4.644379381135655e-05, + "loss": 1.7168, + "step": 6554 + }, + { + "epoch": 0.6896370331404524, + "grad_norm": 1.85318922996521, + "learning_rate": 4.641501872406626e-05, + "loss": 1.7368, + "step": 6555 + }, + { + "epoch": 0.6897422409258285, + "grad_norm": 1.3762325048446655, + "learning_rate": 4.6386249859423434e-05, + "loss": 2.0301, + "step": 6556 + }, + { + "epoch": 0.6898474487112046, + "grad_norm": 1.573012351989746, + "learning_rate": 4.635748722076887e-05, + "loss": 2.1703, + "step": 6557 + }, + { + "epoch": 0.6899526564965808, + "grad_norm": 1.4833385944366455, + "learning_rate": 4.632873081144267e-05, + "loss": 1.4837, + "step": 6558 + }, + { + "epoch": 0.6900578642819568, + "grad_norm": 1.669286847114563, + "learning_rate": 4.629998063478422e-05, + "loss": 1.6238, + "step": 6559 + }, + { + "epoch": 0.690163072067333, + "grad_norm": 1.670414924621582, + "learning_rate": 4.627123669413216e-05, + "loss": 1.5925, + "step": 6560 + }, + { + "epoch": 0.6902682798527091, + "grad_norm": 1.836533784866333, + "learning_rate": 4.624249899282449e-05, + "loss": 1.5332, + "step": 6561 + }, + { + "epoch": 0.6903734876380853, + "grad_norm": 1.0955008268356323, + "learning_rate": 4.6213767534198395e-05, + "loss": 1.9474, + "step": 6562 + }, + { + "epoch": 0.6904786954234613, + "grad_norm": 1.3523670434951782, + "learning_rate": 4.618504232159032e-05, + "loss": 1.3039, + "step": 6563 + }, + { + "epoch": 0.6905839032088374, + "grad_norm": 1.6386244297027588, + "learning_rate": 4.6156323358336116e-05, + "loss": 1.7856, + "step": 6564 + }, + { + "epoch": 0.6906891109942136, + "grad_norm": 1.7706154584884644, + "learning_rate": 4.6127610647770767e-05, + "loss": 2.0213, + "step": 6565 + }, + { + "epoch": 0.6907943187795897, + "grad_norm": 1.2387391328811646, + "learning_rate": 4.6098904193228576e-05, + "loss": 1.5836, + "step": 6566 + }, + { + "epoch": 0.6908995265649658, + "grad_norm": 1.4333043098449707, + "learning_rate": 4.6070203998043173e-05, + "loss": 1.6886, + "step": 6567 + }, + { + "epoch": 0.6910047343503419, + "grad_norm": 1.220526933670044, + "learning_rate": 4.60415100655474e-05, + "loss": 1.6656, + "step": 6568 + }, + { + "epoch": 0.6911099421357181, + "grad_norm": 1.2803022861480713, + "learning_rate": 4.601282239907334e-05, + "loss": 1.6475, + "step": 6569 + }, + { + "epoch": 0.6912151499210941, + "grad_norm": 1.9146497249603271, + "learning_rate": 4.5984141001952477e-05, + "loss": 1.638, + "step": 6570 + }, + { + "epoch": 0.6913203577064703, + "grad_norm": 0.9470534324645996, + "learning_rate": 4.595546587751545e-05, + "loss": 1.6093, + "step": 6571 + }, + { + "epoch": 0.6914255654918464, + "grad_norm": 1.471522569656372, + "learning_rate": 4.592679702909216e-05, + "loss": 1.1011, + "step": 6572 + }, + { + "epoch": 0.6915307732772226, + "grad_norm": 1.4972786903381348, + "learning_rate": 4.589813446001192e-05, + "loss": 1.4328, + "step": 6573 + }, + { + "epoch": 0.6916359810625986, + "grad_norm": 1.2175990343093872, + "learning_rate": 4.5869478173603175e-05, + "loss": 1.8741, + "step": 6574 + }, + { + "epoch": 0.6917411888479748, + "grad_norm": 1.4953633546829224, + "learning_rate": 4.584082817319364e-05, + "loss": 1.8677, + "step": 6575 + }, + { + "epoch": 0.6918463966333509, + "grad_norm": 1.126177191734314, + "learning_rate": 4.581218446211043e-05, + "loss": 1.9968, + "step": 6576 + }, + { + "epoch": 0.6919516044187269, + "grad_norm": 1.063721776008606, + "learning_rate": 4.578354704367978e-05, + "loss": 1.6822, + "step": 6577 + }, + { + "epoch": 0.6920568122041031, + "grad_norm": 1.747952938079834, + "learning_rate": 4.575491592122727e-05, + "loss": 1.8211, + "step": 6578 + }, + { + "epoch": 0.6921620199894792, + "grad_norm": 1.776321291923523, + "learning_rate": 4.572629109807782e-05, + "loss": 2.0429, + "step": 6579 + }, + { + "epoch": 0.6922672277748554, + "grad_norm": 1.6665074825286865, + "learning_rate": 4.569767257755538e-05, + "loss": 1.7637, + "step": 6580 + }, + { + "epoch": 0.6923724355602314, + "grad_norm": 1.534345030784607, + "learning_rate": 4.566906036298345e-05, + "loss": 1.4153, + "step": 6581 + }, + { + "epoch": 0.6924776433456076, + "grad_norm": 1.4633084535598755, + "learning_rate": 4.564045445768464e-05, + "loss": 1.7596, + "step": 6582 + }, + { + "epoch": 0.6925828511309837, + "grad_norm": 1.559431552886963, + "learning_rate": 4.561185486498081e-05, + "loss": 1.2046, + "step": 6583 + }, + { + "epoch": 0.6926880589163598, + "grad_norm": 1.8445146083831787, + "learning_rate": 4.558326158819322e-05, + "loss": 1.4508, + "step": 6584 + }, + { + "epoch": 0.6927932667017359, + "grad_norm": 1.142551064491272, + "learning_rate": 4.555467463064227e-05, + "loss": 1.0419, + "step": 6585 + }, + { + "epoch": 0.6928984744871121, + "grad_norm": 2.6292953491210938, + "learning_rate": 4.552609399564762e-05, + "loss": 1.4199, + "step": 6586 + }, + { + "epoch": 0.6930036822724882, + "grad_norm": 0.9221459627151489, + "learning_rate": 4.549751968652836e-05, + "loss": 1.4626, + "step": 6587 + }, + { + "epoch": 0.6931088900578642, + "grad_norm": 1.3929451704025269, + "learning_rate": 4.5468951706602644e-05, + "loss": 2.0308, + "step": 6588 + }, + { + "epoch": 0.6932140978432404, + "grad_norm": 1.388808012008667, + "learning_rate": 4.5440390059187964e-05, + "loss": 1.8868, + "step": 6589 + }, + { + "epoch": 0.6933193056286165, + "grad_norm": 1.4524317979812622, + "learning_rate": 4.541183474760118e-05, + "loss": 1.747, + "step": 6590 + }, + { + "epoch": 0.6934245134139926, + "grad_norm": 1.3403692245483398, + "learning_rate": 4.538328577515821e-05, + "loss": 1.5627, + "step": 6591 + }, + { + "epoch": 0.6935297211993687, + "grad_norm": 1.3830885887145996, + "learning_rate": 4.535474314517447e-05, + "loss": 1.6002, + "step": 6592 + }, + { + "epoch": 0.6936349289847449, + "grad_norm": 2.1278774738311768, + "learning_rate": 4.532620686096446e-05, + "loss": 1.9424, + "step": 6593 + }, + { + "epoch": 0.693740136770121, + "grad_norm": 1.8062392473220825, + "learning_rate": 4.529767692584198e-05, + "loss": 2.012, + "step": 6594 + }, + { + "epoch": 0.6938453445554971, + "grad_norm": 1.8589589595794678, + "learning_rate": 4.5269153343120174e-05, + "loss": 1.4928, + "step": 6595 + }, + { + "epoch": 0.6939505523408732, + "grad_norm": 1.7464429140090942, + "learning_rate": 4.524063611611138e-05, + "loss": 1.1396, + "step": 6596 + }, + { + "epoch": 0.6940557601262494, + "grad_norm": 1.3345447778701782, + "learning_rate": 4.5212125248127143e-05, + "loss": 1.5639, + "step": 6597 + }, + { + "epoch": 0.6941609679116255, + "grad_norm": 1.2210328578948975, + "learning_rate": 4.518362074247844e-05, + "loss": 1.8796, + "step": 6598 + }, + { + "epoch": 0.6942661756970016, + "grad_norm": 1.5386043787002563, + "learning_rate": 4.515512260247534e-05, + "loss": 1.1067, + "step": 6599 + }, + { + "epoch": 0.6943713834823777, + "grad_norm": 1.575567364692688, + "learning_rate": 4.5126630831427264e-05, + "loss": 1.0896, + "step": 6600 + }, + { + "epoch": 0.6944765912677539, + "grad_norm": 1.171311855316162, + "learning_rate": 4.5098145432642845e-05, + "loss": 1.6938, + "step": 6601 + }, + { + "epoch": 0.6945817990531299, + "grad_norm": 1.322606086730957, + "learning_rate": 4.506966640942999e-05, + "loss": 1.9014, + "step": 6602 + }, + { + "epoch": 0.694687006838506, + "grad_norm": 1.8430259227752686, + "learning_rate": 4.504119376509591e-05, + "loss": 1.916, + "step": 6603 + }, + { + "epoch": 0.6947922146238822, + "grad_norm": 1.57040536403656, + "learning_rate": 4.501272750294704e-05, + "loss": 1.1643, + "step": 6604 + }, + { + "epoch": 0.6948974224092583, + "grad_norm": 1.510570764541626, + "learning_rate": 4.4984267626289e-05, + "loss": 1.4288, + "step": 6605 + }, + { + "epoch": 0.6950026301946344, + "grad_norm": 1.7676854133605957, + "learning_rate": 4.495581413842685e-05, + "loss": 1.0889, + "step": 6606 + }, + { + "epoch": 0.6951078379800105, + "grad_norm": 1.2651560306549072, + "learning_rate": 4.492736704266475e-05, + "loss": 1.6327, + "step": 6607 + }, + { + "epoch": 0.6952130457653867, + "grad_norm": 1.7821506261825562, + "learning_rate": 4.4898926342306115e-05, + "loss": 1.7645, + "step": 6608 + }, + { + "epoch": 0.6953182535507627, + "grad_norm": 1.2521933317184448, + "learning_rate": 4.487049204065377e-05, + "loss": 2.0037, + "step": 6609 + }, + { + "epoch": 0.6954234613361389, + "grad_norm": 1.3124496936798096, + "learning_rate": 4.4842064141009644e-05, + "loss": 1.5326, + "step": 6610 + }, + { + "epoch": 0.695528669121515, + "grad_norm": 1.9574371576309204, + "learning_rate": 4.4813642646674936e-05, + "loss": 1.8915, + "step": 6611 + }, + { + "epoch": 0.6956338769068912, + "grad_norm": 1.5282095670700073, + "learning_rate": 4.4785227560950226e-05, + "loss": 1.9569, + "step": 6612 + }, + { + "epoch": 0.6957390846922672, + "grad_norm": 1.1745871305465698, + "learning_rate": 4.475681888713522e-05, + "loss": 1.8232, + "step": 6613 + }, + { + "epoch": 0.6958442924776433, + "grad_norm": 1.0485895872116089, + "learning_rate": 4.472841662852888e-05, + "loss": 1.814, + "step": 6614 + }, + { + "epoch": 0.6959495002630195, + "grad_norm": 1.5237230062484741, + "learning_rate": 4.470002078842957e-05, + "loss": 1.4868, + "step": 6615 + }, + { + "epoch": 0.6960547080483955, + "grad_norm": 1.6121946573257446, + "learning_rate": 4.467163137013473e-05, + "loss": 1.727, + "step": 6616 + }, + { + "epoch": 0.6961599158337717, + "grad_norm": 1.475019097328186, + "learning_rate": 4.4643248376941104e-05, + "loss": 1.5386, + "step": 6617 + }, + { + "epoch": 0.6962651236191478, + "grad_norm": 1.4994443655014038, + "learning_rate": 4.461487181214481e-05, + "loss": 1.6624, + "step": 6618 + }, + { + "epoch": 0.696370331404524, + "grad_norm": 2.619147300720215, + "learning_rate": 4.458650167904106e-05, + "loss": 1.5808, + "step": 6619 + }, + { + "epoch": 0.6964755391899, + "grad_norm": 1.311523675918579, + "learning_rate": 4.455813798092438e-05, + "loss": 1.7372, + "step": 6620 + }, + { + "epoch": 0.6965807469752762, + "grad_norm": 1.382138967514038, + "learning_rate": 4.452978072108859e-05, + "loss": 1.5582, + "step": 6621 + }, + { + "epoch": 0.6966859547606523, + "grad_norm": 1.6938810348510742, + "learning_rate": 4.450142990282671e-05, + "loss": 1.7523, + "step": 6622 + }, + { + "epoch": 0.6967911625460284, + "grad_norm": 1.1859630346298218, + "learning_rate": 4.4473085529431024e-05, + "loss": 1.7407, + "step": 6623 + }, + { + "epoch": 0.6968963703314045, + "grad_norm": 1.2967774868011475, + "learning_rate": 4.4444747604193074e-05, + "loss": 1.9172, + "step": 6624 + }, + { + "epoch": 0.6970015781167807, + "grad_norm": 1.2925198078155518, + "learning_rate": 4.44164161304036e-05, + "loss": 1.9331, + "step": 6625 + }, + { + "epoch": 0.6971067859021568, + "grad_norm": 1.6732769012451172, + "learning_rate": 4.438809111135274e-05, + "loss": 1.8172, + "step": 6626 + }, + { + "epoch": 0.6972119936875328, + "grad_norm": 1.5457383394241333, + "learning_rate": 4.435977255032971e-05, + "loss": 1.7216, + "step": 6627 + }, + { + "epoch": 0.697317201472909, + "grad_norm": 1.5493268966674805, + "learning_rate": 4.4331460450623064e-05, + "loss": 1.2244, + "step": 6628 + }, + { + "epoch": 0.6974224092582851, + "grad_norm": 1.3660433292388916, + "learning_rate": 4.430315481552063e-05, + "loss": 1.4947, + "step": 6629 + }, + { + "epoch": 0.6975276170436613, + "grad_norm": 1.6390910148620605, + "learning_rate": 4.427485564830942e-05, + "loss": 1.8469, + "step": 6630 + }, + { + "epoch": 0.6976328248290373, + "grad_norm": 2.015472173690796, + "learning_rate": 4.424656295227568e-05, + "loss": 1.7447, + "step": 6631 + }, + { + "epoch": 0.6977380326144135, + "grad_norm": 1.0427132844924927, + "learning_rate": 4.4218276730705045e-05, + "loss": 1.5705, + "step": 6632 + }, + { + "epoch": 0.6978432403997896, + "grad_norm": 1.1284888982772827, + "learning_rate": 4.4189996986882245e-05, + "loss": 1.869, + "step": 6633 + }, + { + "epoch": 0.6979484481851657, + "grad_norm": 1.4589309692382812, + "learning_rate": 4.4161723724091276e-05, + "loss": 1.6884, + "step": 6634 + }, + { + "epoch": 0.6980536559705418, + "grad_norm": 0.9497972726821899, + "learning_rate": 4.413345694561549e-05, + "loss": 1.9814, + "step": 6635 + }, + { + "epoch": 0.698158863755918, + "grad_norm": 1.5925368070602417, + "learning_rate": 4.410519665473736e-05, + "loss": 1.5224, + "step": 6636 + }, + { + "epoch": 0.6982640715412941, + "grad_norm": 2.054982900619507, + "learning_rate": 4.407694285473871e-05, + "loss": 1.4793, + "step": 6637 + }, + { + "epoch": 0.6983692793266701, + "grad_norm": 1.2141071557998657, + "learning_rate": 4.404869554890054e-05, + "loss": 1.2192, + "step": 6638 + }, + { + "epoch": 0.6984744871120463, + "grad_norm": 1.3335728645324707, + "learning_rate": 4.402045474050308e-05, + "loss": 1.9024, + "step": 6639 + }, + { + "epoch": 0.6985796948974224, + "grad_norm": 1.548563838005066, + "learning_rate": 4.399222043282591e-05, + "loss": 2.0828, + "step": 6640 + }, + { + "epoch": 0.6986849026827985, + "grad_norm": 2.5695066452026367, + "learning_rate": 4.3963992629147755e-05, + "loss": 1.768, + "step": 6641 + }, + { + "epoch": 0.6987901104681746, + "grad_norm": 1.7723878622055054, + "learning_rate": 4.393577133274658e-05, + "loss": 1.7614, + "step": 6642 + }, + { + "epoch": 0.6988953182535508, + "grad_norm": 1.082208275794983, + "learning_rate": 4.390755654689973e-05, + "loss": 2.1651, + "step": 6643 + }, + { + "epoch": 0.6990005260389269, + "grad_norm": 1.8237231969833374, + "learning_rate": 4.3879348274883594e-05, + "loss": 1.8408, + "step": 6644 + }, + { + "epoch": 0.699105733824303, + "grad_norm": 1.6238867044448853, + "learning_rate": 4.3851146519973906e-05, + "loss": 1.2499, + "step": 6645 + }, + { + "epoch": 0.6992109416096791, + "grad_norm": 1.6902471780776978, + "learning_rate": 4.382295128544572e-05, + "loss": 1.916, + "step": 6646 + }, + { + "epoch": 0.6993161493950553, + "grad_norm": 1.5541309118270874, + "learning_rate": 4.379476257457318e-05, + "loss": 1.6904, + "step": 6647 + }, + { + "epoch": 0.6994213571804313, + "grad_norm": 1.7713851928710938, + "learning_rate": 4.376658039062981e-05, + "loss": 1.7081, + "step": 6648 + }, + { + "epoch": 0.6995265649658075, + "grad_norm": 1.4880828857421875, + "learning_rate": 4.373840473688829e-05, + "loss": 1.865, + "step": 6649 + }, + { + "epoch": 0.6996317727511836, + "grad_norm": 1.3452609777450562, + "learning_rate": 4.371023561662052e-05, + "loss": 2.0241, + "step": 6650 + }, + { + "epoch": 0.6997369805365597, + "grad_norm": 1.048627495765686, + "learning_rate": 4.3682073033097785e-05, + "loss": 1.8618, + "step": 6651 + }, + { + "epoch": 0.6998421883219358, + "grad_norm": 1.0086387395858765, + "learning_rate": 4.365391698959044e-05, + "loss": 1.8462, + "step": 6652 + }, + { + "epoch": 0.6999473961073119, + "grad_norm": 1.9956762790679932, + "learning_rate": 4.3625767489368143e-05, + "loss": 1.7446, + "step": 6653 + }, + { + "epoch": 0.7000526038926881, + "grad_norm": 1.3309284448623657, + "learning_rate": 4.3597624535699865e-05, + "loss": 1.8599, + "step": 6654 + }, + { + "epoch": 0.7001578116780641, + "grad_norm": 1.1695083379745483, + "learning_rate": 4.3569488131853733e-05, + "loss": 1.3716, + "step": 6655 + }, + { + "epoch": 0.7002630194634403, + "grad_norm": 1.7402067184448242, + "learning_rate": 4.354135828109707e-05, + "loss": 1.3841, + "step": 6656 + }, + { + "epoch": 0.7003682272488164, + "grad_norm": 1.5015475749969482, + "learning_rate": 4.351323498669659e-05, + "loss": 1.2452, + "step": 6657 + }, + { + "epoch": 0.7004734350341926, + "grad_norm": 1.4218254089355469, + "learning_rate": 4.3485118251918146e-05, + "loss": 1.9119, + "step": 6658 + }, + { + "epoch": 0.7005786428195686, + "grad_norm": 1.1995227336883545, + "learning_rate": 4.345700808002676e-05, + "loss": 1.6662, + "step": 6659 + }, + { + "epoch": 0.7006838506049448, + "grad_norm": 1.214152455329895, + "learning_rate": 4.3428904474286894e-05, + "loss": 1.6176, + "step": 6660 + }, + { + "epoch": 0.7007890583903209, + "grad_norm": 1.6113765239715576, + "learning_rate": 4.340080743796204e-05, + "loss": 1.8252, + "step": 6661 + }, + { + "epoch": 0.7008942661756971, + "grad_norm": 1.3512358665466309, + "learning_rate": 4.337271697431503e-05, + "loss": 1.5226, + "step": 6662 + }, + { + "epoch": 0.7009994739610731, + "grad_norm": 1.1863796710968018, + "learning_rate": 4.3344633086607955e-05, + "loss": 1.6714, + "step": 6663 + }, + { + "epoch": 0.7011046817464492, + "grad_norm": 1.354353904724121, + "learning_rate": 4.331655577810207e-05, + "loss": 1.7832, + "step": 6664 + }, + { + "epoch": 0.7012098895318254, + "grad_norm": 1.0943442583084106, + "learning_rate": 4.328848505205792e-05, + "loss": 1.9705, + "step": 6665 + }, + { + "epoch": 0.7013150973172014, + "grad_norm": 1.4298934936523438, + "learning_rate": 4.326042091173526e-05, + "loss": 1.2593, + "step": 6666 + }, + { + "epoch": 0.7014203051025776, + "grad_norm": 2.2146830558776855, + "learning_rate": 4.3232363360393026e-05, + "loss": 1.7405, + "step": 6667 + }, + { + "epoch": 0.7015255128879537, + "grad_norm": 1.1296732425689697, + "learning_rate": 4.320431240128955e-05, + "loss": 1.8832, + "step": 6668 + }, + { + "epoch": 0.7016307206733299, + "grad_norm": 1.771040439605713, + "learning_rate": 4.317626803768224e-05, + "loss": 1.9983, + "step": 6669 + }, + { + "epoch": 0.7017359284587059, + "grad_norm": 1.8517831563949585, + "learning_rate": 4.3148230272827784e-05, + "loss": 1.7378, + "step": 6670 + }, + { + "epoch": 0.7018411362440821, + "grad_norm": 1.6702775955200195, + "learning_rate": 4.312019910998217e-05, + "loss": 1.7574, + "step": 6671 + }, + { + "epoch": 0.7019463440294582, + "grad_norm": 1.3865050077438354, + "learning_rate": 4.3092174552400535e-05, + "loss": 1.5659, + "step": 6672 + }, + { + "epoch": 0.7020515518148343, + "grad_norm": 1.095326542854309, + "learning_rate": 4.306415660333724e-05, + "loss": 1.0576, + "step": 6673 + }, + { + "epoch": 0.7021567596002104, + "grad_norm": 1.3745871782302856, + "learning_rate": 4.303614526604598e-05, + "loss": 1.9348, + "step": 6674 + }, + { + "epoch": 0.7022619673855865, + "grad_norm": 1.3735612630844116, + "learning_rate": 4.300814054377961e-05, + "loss": 1.3382, + "step": 6675 + }, + { + "epoch": 0.7023671751709627, + "grad_norm": 1.8741841316223145, + "learning_rate": 4.298014243979016e-05, + "loss": 1.7592, + "step": 6676 + }, + { + "epoch": 0.7024723829563387, + "grad_norm": 1.2442336082458496, + "learning_rate": 4.295215095732904e-05, + "loss": 1.4593, + "step": 6677 + }, + { + "epoch": 0.7025775907417149, + "grad_norm": 1.6263591051101685, + "learning_rate": 4.292416609964678e-05, + "loss": 1.3543, + "step": 6678 + }, + { + "epoch": 0.702682798527091, + "grad_norm": 1.3969215154647827, + "learning_rate": 4.289618786999313e-05, + "loss": 1.6856, + "step": 6679 + }, + { + "epoch": 0.7027880063124671, + "grad_norm": 1.4358292818069458, + "learning_rate": 4.2868216271617175e-05, + "loss": 1.7041, + "step": 6680 + }, + { + "epoch": 0.7028932140978432, + "grad_norm": 1.739772081375122, + "learning_rate": 4.284025130776711e-05, + "loss": 2.2992, + "step": 6681 + }, + { + "epoch": 0.7029984218832194, + "grad_norm": 1.508034586906433, + "learning_rate": 4.281229298169046e-05, + "loss": 1.5951, + "step": 6682 + }, + { + "epoch": 0.7031036296685955, + "grad_norm": 1.9585610628128052, + "learning_rate": 4.278434129663392e-05, + "loss": 1.8921, + "step": 6683 + }, + { + "epoch": 0.7032088374539716, + "grad_norm": 1.0624831914901733, + "learning_rate": 4.275639625584338e-05, + "loss": 1.7377, + "step": 6684 + }, + { + "epoch": 0.7033140452393477, + "grad_norm": 1.728935956954956, + "learning_rate": 4.2728457862564074e-05, + "loss": 1.6178, + "step": 6685 + }, + { + "epoch": 0.7034192530247239, + "grad_norm": 1.3254528045654297, + "learning_rate": 4.2700526120040405e-05, + "loss": 1.6511, + "step": 6686 + }, + { + "epoch": 0.7035244608100999, + "grad_norm": 1.6863447427749634, + "learning_rate": 4.267260103151589e-05, + "loss": 1.4184, + "step": 6687 + }, + { + "epoch": 0.703629668595476, + "grad_norm": 1.829534888267517, + "learning_rate": 4.264468260023348e-05, + "loss": 1.9914, + "step": 6688 + }, + { + "epoch": 0.7037348763808522, + "grad_norm": 1.3819561004638672, + "learning_rate": 4.261677082943521e-05, + "loss": 1.7343, + "step": 6689 + }, + { + "epoch": 0.7038400841662283, + "grad_norm": 1.2711637020111084, + "learning_rate": 4.2588865722362334e-05, + "loss": 1.9204, + "step": 6690 + }, + { + "epoch": 0.7039452919516044, + "grad_norm": 1.641611099243164, + "learning_rate": 4.256096728225548e-05, + "loss": 1.5314, + "step": 6691 + }, + { + "epoch": 0.7040504997369805, + "grad_norm": 1.4409161806106567, + "learning_rate": 4.253307551235431e-05, + "loss": 1.745, + "step": 6692 + }, + { + "epoch": 0.7041557075223567, + "grad_norm": 1.646813988685608, + "learning_rate": 4.25051904158979e-05, + "loss": 1.9297, + "step": 6693 + }, + { + "epoch": 0.7042609153077328, + "grad_norm": 1.5589001178741455, + "learning_rate": 4.247731199612439e-05, + "loss": 1.5343, + "step": 6694 + }, + { + "epoch": 0.7043661230931089, + "grad_norm": 2.051630973815918, + "learning_rate": 4.244944025627118e-05, + "loss": 1.4001, + "step": 6695 + }, + { + "epoch": 0.704471330878485, + "grad_norm": 1.5140016078948975, + "learning_rate": 4.2421575199575e-05, + "loss": 1.8798, + "step": 6696 + }, + { + "epoch": 0.7045765386638612, + "grad_norm": 1.471285104751587, + "learning_rate": 4.23937168292717e-05, + "loss": 1.5356, + "step": 6697 + }, + { + "epoch": 0.7046817464492372, + "grad_norm": 1.4824402332305908, + "learning_rate": 4.236586514859633e-05, + "loss": 1.5193, + "step": 6698 + }, + { + "epoch": 0.7047869542346133, + "grad_norm": 1.7141013145446777, + "learning_rate": 4.233802016078329e-05, + "loss": 1.4694, + "step": 6699 + }, + { + "epoch": 0.7048921620199895, + "grad_norm": 1.1465423107147217, + "learning_rate": 4.23101818690661e-05, + "loss": 1.6485, + "step": 6700 + }, + { + "epoch": 0.7049973698053656, + "grad_norm": 1.4803214073181152, + "learning_rate": 4.2282350276677475e-05, + "loss": 1.6299, + "step": 6701 + }, + { + "epoch": 0.7051025775907417, + "grad_norm": 1.4447064399719238, + "learning_rate": 4.2254525386849497e-05, + "loss": 1.7181, + "step": 6702 + }, + { + "epoch": 0.7052077853761178, + "grad_norm": 2.032977819442749, + "learning_rate": 4.222670720281333e-05, + "loss": 0.969, + "step": 6703 + }, + { + "epoch": 0.705312993161494, + "grad_norm": 1.6613085269927979, + "learning_rate": 4.219889572779937e-05, + "loss": 1.4189, + "step": 6704 + }, + { + "epoch": 0.70541820094687, + "grad_norm": 1.3568087816238403, + "learning_rate": 4.217109096503736e-05, + "loss": 1.8463, + "step": 6705 + }, + { + "epoch": 0.7055234087322462, + "grad_norm": 1.6580621004104614, + "learning_rate": 4.214329291775613e-05, + "loss": 1.6142, + "step": 6706 + }, + { + "epoch": 0.7056286165176223, + "grad_norm": 1.5139204263687134, + "learning_rate": 4.2115501589183734e-05, + "loss": 1.7448, + "step": 6707 + }, + { + "epoch": 0.7057338243029985, + "grad_norm": 2.026707649230957, + "learning_rate": 4.208771698254761e-05, + "loss": 1.5281, + "step": 6708 + }, + { + "epoch": 0.7058390320883745, + "grad_norm": 1.6222214698791504, + "learning_rate": 4.205993910107413e-05, + "loss": 1.3807, + "step": 6709 + }, + { + "epoch": 0.7059442398737507, + "grad_norm": 1.521702766418457, + "learning_rate": 4.2032167947989175e-05, + "loss": 1.706, + "step": 6710 + }, + { + "epoch": 0.7060494476591268, + "grad_norm": 1.6336759328842163, + "learning_rate": 4.200440352651767e-05, + "loss": 1.8752, + "step": 6711 + }, + { + "epoch": 0.7061546554445028, + "grad_norm": 1.2385070323944092, + "learning_rate": 4.197664583988376e-05, + "loss": 1.8389, + "step": 6712 + }, + { + "epoch": 0.706259863229879, + "grad_norm": 1.7074710130691528, + "learning_rate": 4.1948894891310955e-05, + "loss": 2.3842, + "step": 6713 + }, + { + "epoch": 0.7063650710152551, + "grad_norm": 1.0736308097839355, + "learning_rate": 4.192115068402183e-05, + "loss": 1.5353, + "step": 6714 + }, + { + "epoch": 0.7064702788006313, + "grad_norm": 2.215698719024658, + "learning_rate": 4.189341322123818e-05, + "loss": 1.0607, + "step": 6715 + }, + { + "epoch": 0.7065754865860073, + "grad_norm": 2.0693204402923584, + "learning_rate": 4.186568250618115e-05, + "loss": 1.8865, + "step": 6716 + }, + { + "epoch": 0.7066806943713835, + "grad_norm": 2.8470065593719482, + "learning_rate": 4.183795854207098e-05, + "loss": 1.8147, + "step": 6717 + }, + { + "epoch": 0.7067859021567596, + "grad_norm": 1.833847999572754, + "learning_rate": 4.181024133212713e-05, + "loss": 1.4404, + "step": 6718 + }, + { + "epoch": 0.7068911099421357, + "grad_norm": 1.5717699527740479, + "learning_rate": 4.1782530879568374e-05, + "loss": 1.5635, + "step": 6719 + }, + { + "epoch": 0.7069963177275118, + "grad_norm": 1.44779372215271, + "learning_rate": 4.175482718761259e-05, + "loss": 1.9344, + "step": 6720 + }, + { + "epoch": 0.707101525512888, + "grad_norm": 1.8596739768981934, + "learning_rate": 4.172713025947691e-05, + "loss": 2.044, + "step": 6721 + }, + { + "epoch": 0.7072067332982641, + "grad_norm": 1.5004318952560425, + "learning_rate": 4.169944009837773e-05, + "loss": 1.8553, + "step": 6722 + }, + { + "epoch": 0.7073119410836401, + "grad_norm": 1.9273593425750732, + "learning_rate": 4.1671756707530594e-05, + "loss": 1.2872, + "step": 6723 + }, + { + "epoch": 0.7074171488690163, + "grad_norm": 1.7082390785217285, + "learning_rate": 4.164408009015024e-05, + "loss": 1.8351, + "step": 6724 + }, + { + "epoch": 0.7075223566543924, + "grad_norm": 2.25231671333313, + "learning_rate": 4.1616410249450746e-05, + "loss": 1.7099, + "step": 6725 + }, + { + "epoch": 0.7076275644397686, + "grad_norm": 1.2057231664657593, + "learning_rate": 4.1588747188645275e-05, + "loss": 2.0397, + "step": 6726 + }, + { + "epoch": 0.7077327722251446, + "grad_norm": 1.857649564743042, + "learning_rate": 4.156109091094622e-05, + "loss": 1.8514, + "step": 6727 + }, + { + "epoch": 0.7078379800105208, + "grad_norm": 1.4656842947006226, + "learning_rate": 4.1533441419565265e-05, + "loss": 1.2047, + "step": 6728 + }, + { + "epoch": 0.7079431877958969, + "grad_norm": 1.4205302000045776, + "learning_rate": 4.150579871771324e-05, + "loss": 1.2295, + "step": 6729 + }, + { + "epoch": 0.708048395581273, + "grad_norm": 1.5245219469070435, + "learning_rate": 4.14781628086002e-05, + "loss": 1.7168, + "step": 6730 + }, + { + "epoch": 0.7081536033666491, + "grad_norm": 1.4015170335769653, + "learning_rate": 4.145053369543539e-05, + "loss": 1.6294, + "step": 6731 + }, + { + "epoch": 0.7082588111520253, + "grad_norm": 1.7922075986862183, + "learning_rate": 4.1422911381427274e-05, + "loss": 1.4777, + "step": 6732 + }, + { + "epoch": 0.7083640189374014, + "grad_norm": 1.0388702154159546, + "learning_rate": 4.1395295869783615e-05, + "loss": 1.8166, + "step": 6733 + }, + { + "epoch": 0.7084692267227775, + "grad_norm": 1.3340606689453125, + "learning_rate": 4.136768716371125e-05, + "loss": 1.4049, + "step": 6734 + }, + { + "epoch": 0.7085744345081536, + "grad_norm": 1.485906720161438, + "learning_rate": 4.134008526641628e-05, + "loss": 1.9171, + "step": 6735 + }, + { + "epoch": 0.7086796422935298, + "grad_norm": 1.4040229320526123, + "learning_rate": 4.131249018110408e-05, + "loss": 1.3881, + "step": 6736 + }, + { + "epoch": 0.7087848500789058, + "grad_norm": 2.063403844833374, + "learning_rate": 4.12849019109791e-05, + "loss": 1.8729, + "step": 6737 + }, + { + "epoch": 0.7088900578642819, + "grad_norm": 1.6362403631210327, + "learning_rate": 4.1257320459245154e-05, + "loss": 1.7883, + "step": 6738 + }, + { + "epoch": 0.7089952656496581, + "grad_norm": 1.6882222890853882, + "learning_rate": 4.122974582910515e-05, + "loss": 1.3546, + "step": 6739 + }, + { + "epoch": 0.7091004734350342, + "grad_norm": 1.3080660104751587, + "learning_rate": 4.1202178023761195e-05, + "loss": 1.4403, + "step": 6740 + }, + { + "epoch": 0.7092056812204103, + "grad_norm": 2.508234739303589, + "learning_rate": 4.117461704641473e-05, + "loss": 1.5945, + "step": 6741 + }, + { + "epoch": 0.7093108890057864, + "grad_norm": 1.9061312675476074, + "learning_rate": 4.1147062900266285e-05, + "loss": 1.9146, + "step": 6742 + }, + { + "epoch": 0.7094160967911626, + "grad_norm": 1.108188271522522, + "learning_rate": 4.111951558851559e-05, + "loss": 1.9639, + "step": 6743 + }, + { + "epoch": 0.7095213045765386, + "grad_norm": 1.4938721656799316, + "learning_rate": 4.109197511436169e-05, + "loss": 2.0542, + "step": 6744 + }, + { + "epoch": 0.7096265123619148, + "grad_norm": 1.5593281984329224, + "learning_rate": 4.106444148100275e-05, + "loss": 1.6947, + "step": 6745 + }, + { + "epoch": 0.7097317201472909, + "grad_norm": 1.297189712524414, + "learning_rate": 4.10369146916361e-05, + "loss": 1.4036, + "step": 6746 + }, + { + "epoch": 0.7098369279326671, + "grad_norm": 0.986399233341217, + "learning_rate": 4.100939474945843e-05, + "loss": 1.5753, + "step": 6747 + }, + { + "epoch": 0.7099421357180431, + "grad_norm": 1.9157464504241943, + "learning_rate": 4.09818816576655e-05, + "loss": 1.4842, + "step": 6748 + }, + { + "epoch": 0.7100473435034192, + "grad_norm": 1.0592901706695557, + "learning_rate": 4.095437541945226e-05, + "loss": 2.2621, + "step": 6749 + }, + { + "epoch": 0.7101525512887954, + "grad_norm": 1.2856818437576294, + "learning_rate": 4.0926876038013026e-05, + "loss": 1.7017, + "step": 6750 + }, + { + "epoch": 0.7102577590741714, + "grad_norm": 1.17574942111969, + "learning_rate": 4.0899383516541146e-05, + "loss": 1.4187, + "step": 6751 + }, + { + "epoch": 0.7103629668595476, + "grad_norm": 3.1928186416625977, + "learning_rate": 4.087189785822925e-05, + "loss": 1.4191, + "step": 6752 + }, + { + "epoch": 0.7104681746449237, + "grad_norm": 1.1456485986709595, + "learning_rate": 4.084441906626914e-05, + "loss": 1.5175, + "step": 6753 + }, + { + "epoch": 0.7105733824302999, + "grad_norm": 1.2433396577835083, + "learning_rate": 4.0816947143851816e-05, + "loss": 1.2899, + "step": 6754 + }, + { + "epoch": 0.7106785902156759, + "grad_norm": 1.4701285362243652, + "learning_rate": 4.078948209416758e-05, + "loss": 1.6928, + "step": 6755 + }, + { + "epoch": 0.7107837980010521, + "grad_norm": 1.0511400699615479, + "learning_rate": 4.0762023920405804e-05, + "loss": 1.7949, + "step": 6756 + }, + { + "epoch": 0.7108890057864282, + "grad_norm": 1.6696053743362427, + "learning_rate": 4.073457262575509e-05, + "loss": 1.991, + "step": 6757 + }, + { + "epoch": 0.7109942135718044, + "grad_norm": 1.6411384344100952, + "learning_rate": 4.0707128213403336e-05, + "loss": 1.3082, + "step": 6758 + }, + { + "epoch": 0.7110994213571804, + "grad_norm": 2.3022947311401367, + "learning_rate": 4.067969068653754e-05, + "loss": 1.8167, + "step": 6759 + }, + { + "epoch": 0.7112046291425566, + "grad_norm": 1.5651785135269165, + "learning_rate": 4.0652260048343885e-05, + "loss": 1.7786, + "step": 6760 + }, + { + "epoch": 0.7113098369279327, + "grad_norm": 1.68247389793396, + "learning_rate": 4.0624836302007886e-05, + "loss": 2.5257, + "step": 6761 + }, + { + "epoch": 0.7114150447133087, + "grad_norm": 1.1986908912658691, + "learning_rate": 4.059741945071412e-05, + "loss": 1.7782, + "step": 6762 + }, + { + "epoch": 0.7115202524986849, + "grad_norm": 1.7932888269424438, + "learning_rate": 4.05700094976464e-05, + "loss": 1.3013, + "step": 6763 + }, + { + "epoch": 0.711625460284061, + "grad_norm": 1.4314543008804321, + "learning_rate": 4.054260644598781e-05, + "loss": 1.3228, + "step": 6764 + }, + { + "epoch": 0.7117306680694372, + "grad_norm": 1.5555866956710815, + "learning_rate": 4.0515210298920545e-05, + "loss": 1.5795, + "step": 6765 + }, + { + "epoch": 0.7118358758548132, + "grad_norm": 1.2650176286697388, + "learning_rate": 4.048782105962598e-05, + "loss": 2.0545, + "step": 6766 + }, + { + "epoch": 0.7119410836401894, + "grad_norm": 1.1688004732131958, + "learning_rate": 4.0460438731284845e-05, + "loss": 1.5544, + "step": 6767 + }, + { + "epoch": 0.7120462914255655, + "grad_norm": 1.4285098314285278, + "learning_rate": 4.0433063317076893e-05, + "loss": 1.4884, + "step": 6768 + }, + { + "epoch": 0.7121514992109416, + "grad_norm": 2.4605870246887207, + "learning_rate": 4.0405694820181104e-05, + "loss": 1.0857, + "step": 6769 + }, + { + "epoch": 0.7122567069963177, + "grad_norm": 1.3824515342712402, + "learning_rate": 4.037833324377578e-05, + "loss": 1.6449, + "step": 6770 + }, + { + "epoch": 0.7123619147816939, + "grad_norm": 2.0455679893493652, + "learning_rate": 4.035097859103829e-05, + "loss": 1.7399, + "step": 6771 + }, + { + "epoch": 0.71246712256707, + "grad_norm": 1.9680933952331543, + "learning_rate": 4.032363086514523e-05, + "loss": 2.0078, + "step": 6772 + }, + { + "epoch": 0.712572330352446, + "grad_norm": 1.2317272424697876, + "learning_rate": 4.0296290069272416e-05, + "loss": 1.2191, + "step": 6773 + }, + { + "epoch": 0.7126775381378222, + "grad_norm": 1.2015796899795532, + "learning_rate": 4.026895620659479e-05, + "loss": 1.8345, + "step": 6774 + }, + { + "epoch": 0.7127827459231983, + "grad_norm": 2.4510412216186523, + "learning_rate": 4.024162928028663e-05, + "loss": 1.554, + "step": 6775 + }, + { + "epoch": 0.7128879537085744, + "grad_norm": 1.147552251815796, + "learning_rate": 4.021430929352128e-05, + "loss": 1.9372, + "step": 6776 + }, + { + "epoch": 0.7129931614939505, + "grad_norm": 1.3203489780426025, + "learning_rate": 4.01869962494713e-05, + "loss": 1.7651, + "step": 6777 + }, + { + "epoch": 0.7130983692793267, + "grad_norm": 2.268237829208374, + "learning_rate": 4.0159690151308504e-05, + "loss": 1.2124, + "step": 6778 + }, + { + "epoch": 0.7132035770647028, + "grad_norm": 1.9930332899093628, + "learning_rate": 4.013239100220385e-05, + "loss": 1.3614, + "step": 6779 + }, + { + "epoch": 0.7133087848500789, + "grad_norm": 1.6461526155471802, + "learning_rate": 4.010509880532745e-05, + "loss": 1.8164, + "step": 6780 + }, + { + "epoch": 0.713413992635455, + "grad_norm": 1.5002760887145996, + "learning_rate": 4.007781356384873e-05, + "loss": 2.0341, + "step": 6781 + }, + { + "epoch": 0.7135192004208312, + "grad_norm": 2.1459338665008545, + "learning_rate": 4.0050535280936205e-05, + "loss": 1.5856, + "step": 6782 + }, + { + "epoch": 0.7136244082062072, + "grad_norm": 1.3347117900848389, + "learning_rate": 4.002326395975758e-05, + "loss": 1.6578, + "step": 6783 + }, + { + "epoch": 0.7137296159915834, + "grad_norm": 2.3518266677856445, + "learning_rate": 3.9995999603479836e-05, + "loss": 1.6109, + "step": 6784 + }, + { + "epoch": 0.7138348237769595, + "grad_norm": 1.601401686668396, + "learning_rate": 3.996874221526905e-05, + "loss": 1.5894, + "step": 6785 + }, + { + "epoch": 0.7139400315623357, + "grad_norm": 2.32871150970459, + "learning_rate": 3.994149179829058e-05, + "loss": 1.2623, + "step": 6786 + }, + { + "epoch": 0.7140452393477117, + "grad_norm": 1.1024394035339355, + "learning_rate": 3.9914248355708894e-05, + "loss": 2.0739, + "step": 6787 + }, + { + "epoch": 0.7141504471330878, + "grad_norm": 1.1708279848098755, + "learning_rate": 3.988701189068765e-05, + "loss": 1.6601, + "step": 6788 + }, + { + "epoch": 0.714255654918464, + "grad_norm": 1.7237240076065063, + "learning_rate": 3.985978240638981e-05, + "loss": 1.7179, + "step": 6789 + }, + { + "epoch": 0.7143608627038401, + "grad_norm": 1.493378758430481, + "learning_rate": 3.9832559905977404e-05, + "loss": 1.97, + "step": 6790 + }, + { + "epoch": 0.7144660704892162, + "grad_norm": 1.8581169843673706, + "learning_rate": 3.9805344392611653e-05, + "loss": 1.7689, + "step": 6791 + }, + { + "epoch": 0.7145712782745923, + "grad_norm": 1.4340283870697021, + "learning_rate": 3.977813586945308e-05, + "loss": 1.2957, + "step": 6792 + }, + { + "epoch": 0.7146764860599685, + "grad_norm": 1.2262341976165771, + "learning_rate": 3.9750934339661275e-05, + "loss": 1.7558, + "step": 6793 + }, + { + "epoch": 0.7147816938453445, + "grad_norm": 2.0445914268493652, + "learning_rate": 3.972373980639508e-05, + "loss": 2.3494, + "step": 6794 + }, + { + "epoch": 0.7148869016307207, + "grad_norm": 2.4502785205841064, + "learning_rate": 3.9696552272812484e-05, + "loss": 2.2171, + "step": 6795 + }, + { + "epoch": 0.7149921094160968, + "grad_norm": 1.6572961807250977, + "learning_rate": 3.966937174207066e-05, + "loss": 1.8123, + "step": 6796 + }, + { + "epoch": 0.715097317201473, + "grad_norm": 2.4192516803741455, + "learning_rate": 3.9642198217326075e-05, + "loss": 1.5431, + "step": 6797 + }, + { + "epoch": 0.715202524986849, + "grad_norm": 2.080559253692627, + "learning_rate": 3.961503170173426e-05, + "loss": 1.6864, + "step": 6798 + }, + { + "epoch": 0.7153077327722251, + "grad_norm": 2.1489667892456055, + "learning_rate": 3.958787219844994e-05, + "loss": 1.4183, + "step": 6799 + }, + { + "epoch": 0.7154129405576013, + "grad_norm": 0.9947004914283752, + "learning_rate": 3.9560719710627115e-05, + "loss": 1.6476, + "step": 6800 + }, + { + "epoch": 0.7155181483429773, + "grad_norm": 1.2469807863235474, + "learning_rate": 3.9533574241418884e-05, + "loss": 1.5677, + "step": 6801 + }, + { + "epoch": 0.7156233561283535, + "grad_norm": 1.3769633769989014, + "learning_rate": 3.9506435793977535e-05, + "loss": 1.7477, + "step": 6802 + }, + { + "epoch": 0.7157285639137296, + "grad_norm": 1.638057827949524, + "learning_rate": 3.947930437145464e-05, + "loss": 1.2805, + "step": 6803 + }, + { + "epoch": 0.7158337716991058, + "grad_norm": 1.301336646080017, + "learning_rate": 3.9452179977000826e-05, + "loss": 1.4646, + "step": 6804 + }, + { + "epoch": 0.7159389794844818, + "grad_norm": 1.5804367065429688, + "learning_rate": 3.942506261376594e-05, + "loss": 2.0871, + "step": 6805 + }, + { + "epoch": 0.716044187269858, + "grad_norm": 1.6908665895462036, + "learning_rate": 3.93979522848991e-05, + "loss": 1.9848, + "step": 6806 + }, + { + "epoch": 0.7161493950552341, + "grad_norm": 1.653495192527771, + "learning_rate": 3.93708489935485e-05, + "loss": 2.2619, + "step": 6807 + }, + { + "epoch": 0.7162546028406102, + "grad_norm": 1.4094468355178833, + "learning_rate": 3.934375274286154e-05, + "loss": 1.3118, + "step": 6808 + }, + { + "epoch": 0.7163598106259863, + "grad_norm": 1.1481539011001587, + "learning_rate": 3.931666353598485e-05, + "loss": 1.5423, + "step": 6809 + }, + { + "epoch": 0.7164650184113625, + "grad_norm": 1.4447335004806519, + "learning_rate": 3.928958137606421e-05, + "loss": 1.7084, + "step": 6810 + }, + { + "epoch": 0.7165702261967386, + "grad_norm": 1.6430801153182983, + "learning_rate": 3.9262506266244534e-05, + "loss": 1.5448, + "step": 6811 + }, + { + "epoch": 0.7166754339821146, + "grad_norm": 1.7120585441589355, + "learning_rate": 3.923543820967004e-05, + "loss": 1.8249, + "step": 6812 + }, + { + "epoch": 0.7167806417674908, + "grad_norm": 1.6759033203125, + "learning_rate": 3.9208377209484014e-05, + "loss": 1.4926, + "step": 6813 + }, + { + "epoch": 0.7168858495528669, + "grad_norm": 1.6250011920928955, + "learning_rate": 3.918132326882892e-05, + "loss": 1.7885, + "step": 6814 + }, + { + "epoch": 0.716991057338243, + "grad_norm": 1.3283613920211792, + "learning_rate": 3.9154276390846555e-05, + "loss": 1.2566, + "step": 6815 + }, + { + "epoch": 0.7170962651236191, + "grad_norm": 1.6472797393798828, + "learning_rate": 3.912723657867764e-05, + "loss": 1.2617, + "step": 6816 + }, + { + "epoch": 0.7172014729089953, + "grad_norm": 1.686281442642212, + "learning_rate": 3.910020383546233e-05, + "loss": 1.7821, + "step": 6817 + }, + { + "epoch": 0.7173066806943714, + "grad_norm": 1.3786574602127075, + "learning_rate": 3.907317816433982e-05, + "loss": 1.6652, + "step": 6818 + }, + { + "epoch": 0.7174118884797475, + "grad_norm": 1.1315668821334839, + "learning_rate": 3.9046159568448446e-05, + "loss": 1.6919, + "step": 6819 + }, + { + "epoch": 0.7175170962651236, + "grad_norm": 1.1525264978408813, + "learning_rate": 3.90191480509259e-05, + "loss": 2.0687, + "step": 6820 + }, + { + "epoch": 0.7176223040504998, + "grad_norm": 2.229443311691284, + "learning_rate": 3.8992143614908874e-05, + "loss": 1.6294, + "step": 6821 + }, + { + "epoch": 0.7177275118358759, + "grad_norm": 1.3578799962997437, + "learning_rate": 3.896514626353328e-05, + "loss": 1.0833, + "step": 6822 + }, + { + "epoch": 0.717832719621252, + "grad_norm": 1.2835088968276978, + "learning_rate": 3.893815599993431e-05, + "loss": 2.0935, + "step": 6823 + }, + { + "epoch": 0.7179379274066281, + "grad_norm": 1.247178077697754, + "learning_rate": 3.8911172827246215e-05, + "loss": 1.4496, + "step": 6824 + }, + { + "epoch": 0.7180431351920042, + "grad_norm": 1.1701315641403198, + "learning_rate": 3.888419674860241e-05, + "loss": 1.8054, + "step": 6825 + }, + { + "epoch": 0.7181483429773803, + "grad_norm": 1.4860901832580566, + "learning_rate": 3.885722776713563e-05, + "loss": 1.6664, + "step": 6826 + }, + { + "epoch": 0.7182535507627564, + "grad_norm": 2.2401113510131836, + "learning_rate": 3.8830265885977656e-05, + "loss": 1.5906, + "step": 6827 + }, + { + "epoch": 0.7183587585481326, + "grad_norm": 1.4506216049194336, + "learning_rate": 3.8803311108259435e-05, + "loss": 1.4774, + "step": 6828 + }, + { + "epoch": 0.7184639663335087, + "grad_norm": 1.3430507183074951, + "learning_rate": 3.877636343711122e-05, + "loss": 1.604, + "step": 6829 + }, + { + "epoch": 0.7185691741188848, + "grad_norm": 1.241802453994751, + "learning_rate": 3.874942287566227e-05, + "loss": 1.391, + "step": 6830 + }, + { + "epoch": 0.7186743819042609, + "grad_norm": 1.9131029844284058, + "learning_rate": 3.8722489427041185e-05, + "loss": 2.0294, + "step": 6831 + }, + { + "epoch": 0.7187795896896371, + "grad_norm": 1.5295307636260986, + "learning_rate": 3.869556309437563e-05, + "loss": 1.2301, + "step": 6832 + }, + { + "epoch": 0.7188847974750131, + "grad_norm": 1.5379595756530762, + "learning_rate": 3.866864388079242e-05, + "loss": 1.8451, + "step": 6833 + }, + { + "epoch": 0.7189900052603893, + "grad_norm": 1.5962355136871338, + "learning_rate": 3.864173178941767e-05, + "loss": 1.1769, + "step": 6834 + }, + { + "epoch": 0.7190952130457654, + "grad_norm": 1.6853702068328857, + "learning_rate": 3.8614826823376557e-05, + "loss": 1.1957, + "step": 6835 + }, + { + "epoch": 0.7192004208311416, + "grad_norm": 1.4030197858810425, + "learning_rate": 3.858792898579348e-05, + "loss": 1.0337, + "step": 6836 + }, + { + "epoch": 0.7193056286165176, + "grad_norm": 1.4564900398254395, + "learning_rate": 3.8561038279791974e-05, + "loss": 1.8006, + "step": 6837 + }, + { + "epoch": 0.7194108364018937, + "grad_norm": 1.0711987018585205, + "learning_rate": 3.853415470849479e-05, + "loss": 1.8206, + "step": 6838 + }, + { + "epoch": 0.7195160441872699, + "grad_norm": 1.8588398694992065, + "learning_rate": 3.8507278275023774e-05, + "loss": 1.295, + "step": 6839 + }, + { + "epoch": 0.7196212519726459, + "grad_norm": 2.0701515674591064, + "learning_rate": 3.848040898250007e-05, + "loss": 1.3578, + "step": 6840 + }, + { + "epoch": 0.7197264597580221, + "grad_norm": 1.5314396619796753, + "learning_rate": 3.8453546834043866e-05, + "loss": 1.7377, + "step": 6841 + }, + { + "epoch": 0.7198316675433982, + "grad_norm": 1.5056813955307007, + "learning_rate": 3.842669183277463e-05, + "loss": 1.7067, + "step": 6842 + }, + { + "epoch": 0.7199368753287744, + "grad_norm": 2.3147194385528564, + "learning_rate": 3.839984398181092e-05, + "loss": 1.9101, + "step": 6843 + }, + { + "epoch": 0.7200420831141504, + "grad_norm": 2.0493955612182617, + "learning_rate": 3.8373003284270445e-05, + "loss": 1.6542, + "step": 6844 + }, + { + "epoch": 0.7201472908995266, + "grad_norm": 1.2979793548583984, + "learning_rate": 3.834616974327021e-05, + "loss": 1.5933, + "step": 6845 + }, + { + "epoch": 0.7202524986849027, + "grad_norm": 1.7987275123596191, + "learning_rate": 3.831934336192625e-05, + "loss": 1.5552, + "step": 6846 + }, + { + "epoch": 0.7203577064702787, + "grad_norm": 1.7454094886779785, + "learning_rate": 3.829252414335381e-05, + "loss": 1.7823, + "step": 6847 + }, + { + "epoch": 0.7204629142556549, + "grad_norm": 1.5051946640014648, + "learning_rate": 3.826571209066737e-05, + "loss": 1.4311, + "step": 6848 + }, + { + "epoch": 0.720568122041031, + "grad_norm": 1.2123810052871704, + "learning_rate": 3.8238907206980513e-05, + "loss": 1.8447, + "step": 6849 + }, + { + "epoch": 0.7206733298264072, + "grad_norm": 1.1441395282745361, + "learning_rate": 3.821210949540593e-05, + "loss": 1.6574, + "step": 6850 + }, + { + "epoch": 0.7207785376117832, + "grad_norm": 1.625401258468628, + "learning_rate": 3.818531895905566e-05, + "loss": 1.9151, + "step": 6851 + }, + { + "epoch": 0.7208837453971594, + "grad_norm": 2.0465898513793945, + "learning_rate": 3.815853560104075e-05, + "loss": 1.8502, + "step": 6852 + }, + { + "epoch": 0.7209889531825355, + "grad_norm": 1.7587473392486572, + "learning_rate": 3.813175942447141e-05, + "loss": 1.6247, + "step": 6853 + }, + { + "epoch": 0.7210941609679117, + "grad_norm": 3.3340325355529785, + "learning_rate": 3.810499043245718e-05, + "loss": 1.1276, + "step": 6854 + }, + { + "epoch": 0.7211993687532877, + "grad_norm": 1.4002957344055176, + "learning_rate": 3.807822862810657e-05, + "loss": 1.452, + "step": 6855 + }, + { + "epoch": 0.7213045765386639, + "grad_norm": 1.8713266849517822, + "learning_rate": 3.805147401452734e-05, + "loss": 1.9372, + "step": 6856 + }, + { + "epoch": 0.72140978432404, + "grad_norm": 2.1480062007904053, + "learning_rate": 3.802472659482649e-05, + "loss": 1.6592, + "step": 6857 + }, + { + "epoch": 0.721514992109416, + "grad_norm": 0.9366277456283569, + "learning_rate": 3.799798637211005e-05, + "loss": 1.5559, + "step": 6858 + }, + { + "epoch": 0.7216201998947922, + "grad_norm": 1.6962757110595703, + "learning_rate": 3.7971253349483285e-05, + "loss": 1.6838, + "step": 6859 + }, + { + "epoch": 0.7217254076801684, + "grad_norm": 1.3592772483825684, + "learning_rate": 3.794452753005061e-05, + "loss": 1.8806, + "step": 6860 + }, + { + "epoch": 0.7218306154655445, + "grad_norm": 2.0401268005371094, + "learning_rate": 3.791780891691558e-05, + "loss": 2.4508, + "step": 6861 + }, + { + "epoch": 0.7219358232509205, + "grad_norm": 1.0858590602874756, + "learning_rate": 3.7891097513180995e-05, + "loss": 2.0971, + "step": 6862 + }, + { + "epoch": 0.7220410310362967, + "grad_norm": 1.913306713104248, + "learning_rate": 3.786439332194874e-05, + "loss": 1.1957, + "step": 6863 + }, + { + "epoch": 0.7221462388216728, + "grad_norm": 2.756697416305542, + "learning_rate": 3.783769634631985e-05, + "loss": 1.5992, + "step": 6864 + }, + { + "epoch": 0.7222514466070489, + "grad_norm": 1.9628227949142456, + "learning_rate": 3.781100658939461e-05, + "loss": 2.0914, + "step": 6865 + }, + { + "epoch": 0.722356654392425, + "grad_norm": 1.16515052318573, + "learning_rate": 3.7784324054272405e-05, + "loss": 1.7869, + "step": 6866 + }, + { + "epoch": 0.7224618621778012, + "grad_norm": 1.3066214323043823, + "learning_rate": 3.7757648744051736e-05, + "loss": 2.3816, + "step": 6867 + }, + { + "epoch": 0.7225670699631773, + "grad_norm": 1.6055272817611694, + "learning_rate": 3.773098066183039e-05, + "loss": 1.3618, + "step": 6868 + }, + { + "epoch": 0.7226722777485534, + "grad_norm": 1.2972073554992676, + "learning_rate": 3.7704319810705225e-05, + "loss": 1.9931, + "step": 6869 + }, + { + "epoch": 0.7227774855339295, + "grad_norm": 1.3764903545379639, + "learning_rate": 3.767766619377222e-05, + "loss": 2.0662, + "step": 6870 + }, + { + "epoch": 0.7228826933193057, + "grad_norm": 1.6971714496612549, + "learning_rate": 3.7651019814126654e-05, + "loss": 1.4372, + "step": 6871 + }, + { + "epoch": 0.7229879011046817, + "grad_norm": 2.1044673919677734, + "learning_rate": 3.7624380674862845e-05, + "loss": 2.0204, + "step": 6872 + }, + { + "epoch": 0.7230931088900578, + "grad_norm": 1.2413538694381714, + "learning_rate": 3.759774877907428e-05, + "loss": 1.8452, + "step": 6873 + }, + { + "epoch": 0.723198316675434, + "grad_norm": 1.1550841331481934, + "learning_rate": 3.75711241298537e-05, + "loss": 1.655, + "step": 6874 + }, + { + "epoch": 0.7233035244608101, + "grad_norm": 1.314095139503479, + "learning_rate": 3.75445067302929e-05, + "loss": 1.648, + "step": 6875 + }, + { + "epoch": 0.7234087322461862, + "grad_norm": 1.6076027154922485, + "learning_rate": 3.751789658348284e-05, + "loss": 1.8427, + "step": 6876 + }, + { + "epoch": 0.7235139400315623, + "grad_norm": 1.512660264968872, + "learning_rate": 3.749129369251372e-05, + "loss": 2.0951, + "step": 6877 + }, + { + "epoch": 0.7236191478169385, + "grad_norm": 1.5200482606887817, + "learning_rate": 3.7464698060474814e-05, + "loss": 1.3188, + "step": 6878 + }, + { + "epoch": 0.7237243556023145, + "grad_norm": 1.276044249534607, + "learning_rate": 3.7438109690454646e-05, + "loss": 1.5648, + "step": 6879 + }, + { + "epoch": 0.7238295633876907, + "grad_norm": 1.4830344915390015, + "learning_rate": 3.741152858554077e-05, + "loss": 1.3373, + "step": 6880 + }, + { + "epoch": 0.7239347711730668, + "grad_norm": 1.0072591304779053, + "learning_rate": 3.7384954748819934e-05, + "loss": 1.9321, + "step": 6881 + }, + { + "epoch": 0.724039978958443, + "grad_norm": 1.2842603921890259, + "learning_rate": 3.7358388183378166e-05, + "loss": 1.791, + "step": 6882 + }, + { + "epoch": 0.724145186743819, + "grad_norm": 2.4097936153411865, + "learning_rate": 3.733182889230049e-05, + "loss": 1.6026, + "step": 6883 + }, + { + "epoch": 0.7242503945291952, + "grad_norm": 1.9151344299316406, + "learning_rate": 3.730527687867114e-05, + "loss": 1.7359, + "step": 6884 + }, + { + "epoch": 0.7243556023145713, + "grad_norm": 1.9665045738220215, + "learning_rate": 3.727873214557357e-05, + "loss": 2.2739, + "step": 6885 + }, + { + "epoch": 0.7244608100999474, + "grad_norm": 1.395699143409729, + "learning_rate": 3.725219469609026e-05, + "loss": 1.382, + "step": 6886 + }, + { + "epoch": 0.7245660178853235, + "grad_norm": 2.1957099437713623, + "learning_rate": 3.722566453330298e-05, + "loss": 1.5297, + "step": 6887 + }, + { + "epoch": 0.7246712256706996, + "grad_norm": 2.5647594928741455, + "learning_rate": 3.7199141660292594e-05, + "loss": 1.7129, + "step": 6888 + }, + { + "epoch": 0.7247764334560758, + "grad_norm": 1.1315737962722778, + "learning_rate": 3.717262608013903e-05, + "loss": 1.2686, + "step": 6889 + }, + { + "epoch": 0.7248816412414518, + "grad_norm": 2.1699776649475098, + "learning_rate": 3.714611779592156e-05, + "loss": 2.1215, + "step": 6890 + }, + { + "epoch": 0.724986849026828, + "grad_norm": 1.1801518201828003, + "learning_rate": 3.711961681071845e-05, + "loss": 2.093, + "step": 6891 + }, + { + "epoch": 0.7250920568122041, + "grad_norm": 1.2889292240142822, + "learning_rate": 3.7093123127607155e-05, + "loss": 1.5338, + "step": 6892 + }, + { + "epoch": 0.7251972645975803, + "grad_norm": 1.0799230337142944, + "learning_rate": 3.706663674966435e-05, + "loss": 1.2338, + "step": 6893 + }, + { + "epoch": 0.7253024723829563, + "grad_norm": 0.9696769714355469, + "learning_rate": 3.7040157679965796e-05, + "loss": 1.7135, + "step": 6894 + }, + { + "epoch": 0.7254076801683325, + "grad_norm": 1.161821961402893, + "learning_rate": 3.701368592158636e-05, + "loss": 1.9596, + "step": 6895 + }, + { + "epoch": 0.7255128879537086, + "grad_norm": 1.847954511642456, + "learning_rate": 3.6987221477600206e-05, + "loss": 1.9575, + "step": 6896 + }, + { + "epoch": 0.7256180957390846, + "grad_norm": 1.5871635675430298, + "learning_rate": 3.6960764351080535e-05, + "loss": 1.4081, + "step": 6897 + }, + { + "epoch": 0.7257233035244608, + "grad_norm": 2.1948516368865967, + "learning_rate": 3.6934314545099666e-05, + "loss": 1.8729, + "step": 6898 + }, + { + "epoch": 0.7258285113098369, + "grad_norm": 1.237642765045166, + "learning_rate": 3.690787206272923e-05, + "loss": 1.4971, + "step": 6899 + }, + { + "epoch": 0.7259337190952131, + "grad_norm": 2.1544601917266846, + "learning_rate": 3.6881436907039845e-05, + "loss": 1.6227, + "step": 6900 + }, + { + "epoch": 0.7260389268805891, + "grad_norm": 1.725161075592041, + "learning_rate": 3.6855009081101355e-05, + "loss": 2.0619, + "step": 6901 + }, + { + "epoch": 0.7261441346659653, + "grad_norm": 2.4666850566864014, + "learning_rate": 3.6828588587982715e-05, + "loss": 1.9632, + "step": 6902 + }, + { + "epoch": 0.7262493424513414, + "grad_norm": 1.0997231006622314, + "learning_rate": 3.680217543075204e-05, + "loss": 1.6217, + "step": 6903 + }, + { + "epoch": 0.7263545502367175, + "grad_norm": 1.5529903173446655, + "learning_rate": 3.6775769612476666e-05, + "loss": 1.8783, + "step": 6904 + }, + { + "epoch": 0.7264597580220936, + "grad_norm": 1.4422789812088013, + "learning_rate": 3.674937113622297e-05, + "loss": 1.4074, + "step": 6905 + }, + { + "epoch": 0.7265649658074698, + "grad_norm": 1.7190214395523071, + "learning_rate": 3.6722980005056474e-05, + "loss": 1.3818, + "step": 6906 + }, + { + "epoch": 0.7266701735928459, + "grad_norm": 1.7955530881881714, + "learning_rate": 3.669659622204199e-05, + "loss": 2.1845, + "step": 6907 + }, + { + "epoch": 0.726775381378222, + "grad_norm": 2.1686935424804688, + "learning_rate": 3.6670219790243344e-05, + "loss": 1.2339, + "step": 6908 + }, + { + "epoch": 0.7268805891635981, + "grad_norm": 1.332140326499939, + "learning_rate": 3.664385071272348e-05, + "loss": 1.6447, + "step": 6909 + }, + { + "epoch": 0.7269857969489742, + "grad_norm": 2.21612548828125, + "learning_rate": 3.6617488992544656e-05, + "loss": 1.5974, + "step": 6910 + }, + { + "epoch": 0.7270910047343503, + "grad_norm": 1.5317012071609497, + "learning_rate": 3.659113463276812e-05, + "loss": 1.6033, + "step": 6911 + }, + { + "epoch": 0.7271962125197264, + "grad_norm": 1.3678877353668213, + "learning_rate": 3.656478763645428e-05, + "loss": 1.4386, + "step": 6912 + }, + { + "epoch": 0.7273014203051026, + "grad_norm": 1.0950294733047485, + "learning_rate": 3.6538448006662795e-05, + "loss": 1.7116, + "step": 6913 + }, + { + "epoch": 0.7274066280904787, + "grad_norm": 1.2202073335647583, + "learning_rate": 3.6512115746452366e-05, + "loss": 1.0636, + "step": 6914 + }, + { + "epoch": 0.7275118358758548, + "grad_norm": 1.1715128421783447, + "learning_rate": 3.648579085888085e-05, + "loss": 1.5944, + "step": 6915 + }, + { + "epoch": 0.7276170436612309, + "grad_norm": 0.9897493720054626, + "learning_rate": 3.6459473347005334e-05, + "loss": 1.6186, + "step": 6916 + }, + { + "epoch": 0.7277222514466071, + "grad_norm": 1.9510959386825562, + "learning_rate": 3.643316321388194e-05, + "loss": 1.1786, + "step": 6917 + }, + { + "epoch": 0.7278274592319832, + "grad_norm": 1.5720961093902588, + "learning_rate": 3.640686046256594e-05, + "loss": 1.6419, + "step": 6918 + }, + { + "epoch": 0.7279326670173593, + "grad_norm": 1.484616994857788, + "learning_rate": 3.6380565096111866e-05, + "loss": 1.5632, + "step": 6919 + }, + { + "epoch": 0.7280378748027354, + "grad_norm": 1.5460129976272583, + "learning_rate": 3.6354277117573264e-05, + "loss": 1.5821, + "step": 6920 + }, + { + "epoch": 0.7281430825881116, + "grad_norm": 1.5257236957550049, + "learning_rate": 3.632799653000286e-05, + "loss": 1.3601, + "step": 6921 + }, + { + "epoch": 0.7282482903734876, + "grad_norm": 1.349097490310669, + "learning_rate": 3.630172333645261e-05, + "loss": 1.8427, + "step": 6922 + }, + { + "epoch": 0.7283534981588637, + "grad_norm": 1.3090531826019287, + "learning_rate": 3.627545753997341e-05, + "loss": 1.4297, + "step": 6923 + }, + { + "epoch": 0.7284587059442399, + "grad_norm": 1.86859130859375, + "learning_rate": 3.624919914361552e-05, + "loss": 1.3909, + "step": 6924 + }, + { + "epoch": 0.728563913729616, + "grad_norm": 2.1052424907684326, + "learning_rate": 3.622294815042821e-05, + "loss": 1.4364, + "step": 6925 + }, + { + "epoch": 0.7286691215149921, + "grad_norm": 1.1871145963668823, + "learning_rate": 3.6196704563459885e-05, + "loss": 1.5061, + "step": 6926 + }, + { + "epoch": 0.7287743293003682, + "grad_norm": 2.0205655097961426, + "learning_rate": 3.617046838575819e-05, + "loss": 1.5433, + "step": 6927 + }, + { + "epoch": 0.7288795370857444, + "grad_norm": 1.3786308765411377, + "learning_rate": 3.6144239620369816e-05, + "loss": 1.7252, + "step": 6928 + }, + { + "epoch": 0.7289847448711204, + "grad_norm": 1.8365287780761719, + "learning_rate": 3.611801827034059e-05, + "loss": 1.3477, + "step": 6929 + }, + { + "epoch": 0.7290899526564966, + "grad_norm": 2.479865789413452, + "learning_rate": 3.609180433871558e-05, + "loss": 1.362, + "step": 6930 + }, + { + "epoch": 0.7291951604418727, + "grad_norm": 1.4172632694244385, + "learning_rate": 3.606559782853889e-05, + "loss": 1.4992, + "step": 6931 + }, + { + "epoch": 0.7293003682272489, + "grad_norm": 1.7480148077011108, + "learning_rate": 3.603939874285375e-05, + "loss": 1.7928, + "step": 6932 + }, + { + "epoch": 0.7294055760126249, + "grad_norm": 2.404942512512207, + "learning_rate": 3.6013207084702646e-05, + "loss": 1.1866, + "step": 6933 + }, + { + "epoch": 0.729510783798001, + "grad_norm": 1.3613299131393433, + "learning_rate": 3.5987022857127086e-05, + "loss": 1.1066, + "step": 6934 + }, + { + "epoch": 0.7296159915833772, + "grad_norm": 2.4102582931518555, + "learning_rate": 3.596084606316778e-05, + "loss": 1.5663, + "step": 6935 + }, + { + "epoch": 0.7297211993687532, + "grad_norm": 1.294023036956787, + "learning_rate": 3.593467670586457e-05, + "loss": 1.8498, + "step": 6936 + }, + { + "epoch": 0.7298264071541294, + "grad_norm": 1.4353657960891724, + "learning_rate": 3.5908514788256344e-05, + "loss": 2.0498, + "step": 6937 + }, + { + "epoch": 0.7299316149395055, + "grad_norm": 2.242220401763916, + "learning_rate": 3.588236031338129e-05, + "loss": 1.8686, + "step": 6938 + }, + { + "epoch": 0.7300368227248817, + "grad_norm": 1.6816960573196411, + "learning_rate": 3.585621328427658e-05, + "loss": 1.8161, + "step": 6939 + }, + { + "epoch": 0.7301420305102577, + "grad_norm": 1.5004388093948364, + "learning_rate": 3.58300737039786e-05, + "loss": 1.7814, + "step": 6940 + }, + { + "epoch": 0.7302472382956339, + "grad_norm": 2.273651599884033, + "learning_rate": 3.580394157552286e-05, + "loss": 1.9734, + "step": 6941 + }, + { + "epoch": 0.73035244608101, + "grad_norm": 1.1475566625595093, + "learning_rate": 3.577781690194399e-05, + "loss": 1.4193, + "step": 6942 + }, + { + "epoch": 0.7304576538663861, + "grad_norm": 1.948401927947998, + "learning_rate": 3.5751699686275786e-05, + "loss": 1.8889, + "step": 6943 + }, + { + "epoch": 0.7305628616517622, + "grad_norm": 2.96512770652771, + "learning_rate": 3.572558993155112e-05, + "loss": 2.0799, + "step": 6944 + }, + { + "epoch": 0.7306680694371384, + "grad_norm": 1.6030654907226562, + "learning_rate": 3.569948764080201e-05, + "loss": 1.497, + "step": 6945 + }, + { + "epoch": 0.7307732772225145, + "grad_norm": 1.6341030597686768, + "learning_rate": 3.5673392817059705e-05, + "loss": 1.5135, + "step": 6946 + }, + { + "epoch": 0.7308784850078905, + "grad_norm": 1.9366779327392578, + "learning_rate": 3.5647305463354466e-05, + "loss": 1.916, + "step": 6947 + }, + { + "epoch": 0.7309836927932667, + "grad_norm": 1.9645856618881226, + "learning_rate": 3.562122558271569e-05, + "loss": 1.4412, + "step": 6948 + }, + { + "epoch": 0.7310889005786428, + "grad_norm": 1.8454338312149048, + "learning_rate": 3.559515317817204e-05, + "loss": 1.4741, + "step": 6949 + }, + { + "epoch": 0.731194108364019, + "grad_norm": 1.1774706840515137, + "learning_rate": 3.556908825275117e-05, + "loss": 1.5745, + "step": 6950 + }, + { + "epoch": 0.731299316149395, + "grad_norm": 1.3022416830062866, + "learning_rate": 3.5543030809479874e-05, + "loss": 1.7731, + "step": 6951 + }, + { + "epoch": 0.7314045239347712, + "grad_norm": 0.9204453229904175, + "learning_rate": 3.5516980851384194e-05, + "loss": 1.5551, + "step": 6952 + }, + { + "epoch": 0.7315097317201473, + "grad_norm": 1.7485677003860474, + "learning_rate": 3.549093838148919e-05, + "loss": 1.6053, + "step": 6953 + }, + { + "epoch": 0.7316149395055234, + "grad_norm": 1.1012275218963623, + "learning_rate": 3.546490340281906e-05, + "loss": 1.6423, + "step": 6954 + }, + { + "epoch": 0.7317201472908995, + "grad_norm": 1.050255537033081, + "learning_rate": 3.543887591839723e-05, + "loss": 1.4111, + "step": 6955 + }, + { + "epoch": 0.7318253550762757, + "grad_norm": 1.184462070465088, + "learning_rate": 3.5412855931246126e-05, + "loss": 1.9403, + "step": 6956 + }, + { + "epoch": 0.7319305628616518, + "grad_norm": 1.2454978227615356, + "learning_rate": 3.538684344438736e-05, + "loss": 1.5388, + "step": 6957 + }, + { + "epoch": 0.7320357706470279, + "grad_norm": 1.4453421831130981, + "learning_rate": 3.5360838460841725e-05, + "loss": 1.9107, + "step": 6958 + }, + { + "epoch": 0.732140978432404, + "grad_norm": 1.2171581983566284, + "learning_rate": 3.533484098362908e-05, + "loss": 1.5818, + "step": 6959 + }, + { + "epoch": 0.7322461862177801, + "grad_norm": 2.109985113143921, + "learning_rate": 3.5308851015768375e-05, + "loss": 2.0371, + "step": 6960 + }, + { + "epoch": 0.7323513940031562, + "grad_norm": 2.1604323387145996, + "learning_rate": 3.52828685602778e-05, + "loss": 1.6088, + "step": 6961 + }, + { + "epoch": 0.7324566017885323, + "grad_norm": 1.6006240844726562, + "learning_rate": 3.525689362017461e-05, + "loss": 1.353, + "step": 6962 + }, + { + "epoch": 0.7325618095739085, + "grad_norm": 3.5379319190979004, + "learning_rate": 3.523092619847512e-05, + "loss": 1.9756, + "step": 6963 + }, + { + "epoch": 0.7326670173592846, + "grad_norm": 1.7658113241195679, + "learning_rate": 3.520496629819494e-05, + "loss": 1.8514, + "step": 6964 + }, + { + "epoch": 0.7327722251446607, + "grad_norm": 2.1337103843688965, + "learning_rate": 3.517901392234865e-05, + "loss": 1.785, + "step": 6965 + }, + { + "epoch": 0.7328774329300368, + "grad_norm": 1.1942282915115356, + "learning_rate": 3.515306907395002e-05, + "loss": 1.2969, + "step": 6966 + }, + { + "epoch": 0.732982640715413, + "grad_norm": 1.3888230323791504, + "learning_rate": 3.512713175601194e-05, + "loss": 1.5518, + "step": 6967 + }, + { + "epoch": 0.733087848500789, + "grad_norm": 1.5503004789352417, + "learning_rate": 3.51012019715464e-05, + "loss": 1.4631, + "step": 6968 + }, + { + "epoch": 0.7331930562861652, + "grad_norm": 2.311354160308838, + "learning_rate": 3.5075279723564616e-05, + "loss": 1.0551, + "step": 6969 + }, + { + "epoch": 0.7332982640715413, + "grad_norm": 2.0432231426239014, + "learning_rate": 3.504936501507679e-05, + "loss": 2.2122, + "step": 6970 + }, + { + "epoch": 0.7334034718569175, + "grad_norm": 1.2789908647537231, + "learning_rate": 3.502345784909229e-05, + "loss": 1.6112, + "step": 6971 + }, + { + "epoch": 0.7335086796422935, + "grad_norm": 1.3882858753204346, + "learning_rate": 3.499755822861971e-05, + "loss": 1.2767, + "step": 6972 + }, + { + "epoch": 0.7336138874276696, + "grad_norm": 1.920723795890808, + "learning_rate": 3.497166615666664e-05, + "loss": 1.8132, + "step": 6973 + }, + { + "epoch": 0.7337190952130458, + "grad_norm": 1.692369818687439, + "learning_rate": 3.4945781636239814e-05, + "loss": 1.7823, + "step": 6974 + }, + { + "epoch": 0.7338243029984218, + "grad_norm": 1.6629142761230469, + "learning_rate": 3.491990467034518e-05, + "loss": 1.7026, + "step": 6975 + }, + { + "epoch": 0.733929510783798, + "grad_norm": 1.6108320951461792, + "learning_rate": 3.4894035261987715e-05, + "loss": 1.2472, + "step": 6976 + }, + { + "epoch": 0.7340347185691741, + "grad_norm": 2.1136841773986816, + "learning_rate": 3.486817341417151e-05, + "loss": 1.0959, + "step": 6977 + }, + { + "epoch": 0.7341399263545503, + "grad_norm": 2.271759510040283, + "learning_rate": 3.484231912989989e-05, + "loss": 1.7992, + "step": 6978 + }, + { + "epoch": 0.7342451341399263, + "grad_norm": 1.7375454902648926, + "learning_rate": 3.481647241217516e-05, + "loss": 1.3712, + "step": 6979 + }, + { + "epoch": 0.7343503419253025, + "grad_norm": 1.6927552223205566, + "learning_rate": 3.4790633263998874e-05, + "loss": 1.3709, + "step": 6980 + }, + { + "epoch": 0.7344555497106786, + "grad_norm": 1.4837682247161865, + "learning_rate": 3.476480168837161e-05, + "loss": 1.4844, + "step": 6981 + }, + { + "epoch": 0.7345607574960548, + "grad_norm": 1.7805767059326172, + "learning_rate": 3.473897768829308e-05, + "loss": 1.9579, + "step": 6982 + }, + { + "epoch": 0.7346659652814308, + "grad_norm": 1.7141696214675903, + "learning_rate": 3.471316126676222e-05, + "loss": 1.1072, + "step": 6983 + }, + { + "epoch": 0.734771173066807, + "grad_norm": 2.4495108127593994, + "learning_rate": 3.4687352426776945e-05, + "loss": 1.4545, + "step": 6984 + }, + { + "epoch": 0.7348763808521831, + "grad_norm": 1.993420124053955, + "learning_rate": 3.466155117133433e-05, + "loss": 2.0231, + "step": 6985 + }, + { + "epoch": 0.7349815886375591, + "grad_norm": 1.47242271900177, + "learning_rate": 3.4635757503430685e-05, + "loss": 1.5673, + "step": 6986 + }, + { + "epoch": 0.7350867964229353, + "grad_norm": 2.614037036895752, + "learning_rate": 3.460997142606126e-05, + "loss": 2.0294, + "step": 6987 + }, + { + "epoch": 0.7351920042083114, + "grad_norm": 1.551087737083435, + "learning_rate": 3.4584192942220495e-05, + "loss": 1.889, + "step": 6988 + }, + { + "epoch": 0.7352972119936876, + "grad_norm": 1.1275653839111328, + "learning_rate": 3.455842205490202e-05, + "loss": 1.9836, + "step": 6989 + }, + { + "epoch": 0.7354024197790636, + "grad_norm": 2.598032236099243, + "learning_rate": 3.453265876709847e-05, + "loss": 1.7154, + "step": 6990 + }, + { + "epoch": 0.7355076275644398, + "grad_norm": 2.10746431350708, + "learning_rate": 3.45069030818017e-05, + "loss": 1.7135, + "step": 6991 + }, + { + "epoch": 0.7356128353498159, + "grad_norm": 1.3205444812774658, + "learning_rate": 3.448115500200263e-05, + "loss": 1.7395, + "step": 6992 + }, + { + "epoch": 0.735718043135192, + "grad_norm": 2.8115272521972656, + "learning_rate": 3.4455414530691234e-05, + "loss": 1.9317, + "step": 6993 + }, + { + "epoch": 0.7358232509205681, + "grad_norm": 2.6412436962127686, + "learning_rate": 3.442968167085675e-05, + "loss": 1.5553, + "step": 6994 + }, + { + "epoch": 0.7359284587059443, + "grad_norm": 1.441660761833191, + "learning_rate": 3.440395642548743e-05, + "loss": 1.4242, + "step": 6995 + }, + { + "epoch": 0.7360336664913204, + "grad_norm": 1.848536729812622, + "learning_rate": 3.437823879757059e-05, + "loss": 1.2416, + "step": 6996 + }, + { + "epoch": 0.7361388742766964, + "grad_norm": 1.5345301628112793, + "learning_rate": 3.435252879009284e-05, + "loss": 2.0876, + "step": 6997 + }, + { + "epoch": 0.7362440820620726, + "grad_norm": 1.2060550451278687, + "learning_rate": 3.432682640603975e-05, + "loss": 2.075, + "step": 6998 + }, + { + "epoch": 0.7363492898474487, + "grad_norm": 1.2402889728546143, + "learning_rate": 3.430113164839601e-05, + "loss": 2.3051, + "step": 6999 + }, + { + "epoch": 0.7364544976328248, + "grad_norm": 1.3935623168945312, + "learning_rate": 3.427544452014556e-05, + "loss": 1.045, + "step": 7000 + }, + { + "epoch": 0.7365597054182009, + "grad_norm": 1.647644281387329, + "learning_rate": 3.42497650242713e-05, + "loss": 1.5662, + "step": 7001 + }, + { + "epoch": 0.7366649132035771, + "grad_norm": 1.3244524002075195, + "learning_rate": 3.422409316375529e-05, + "loss": 1.6658, + "step": 7002 + }, + { + "epoch": 0.7367701209889532, + "grad_norm": 2.1039204597473145, + "learning_rate": 3.4198428941578776e-05, + "loss": 1.6034, + "step": 7003 + }, + { + "epoch": 0.7368753287743293, + "grad_norm": 1.919314980506897, + "learning_rate": 3.417277236072203e-05, + "loss": 1.7035, + "step": 7004 + }, + { + "epoch": 0.7369805365597054, + "grad_norm": 1.365127682685852, + "learning_rate": 3.414712342416443e-05, + "loss": 1.7206, + "step": 7005 + }, + { + "epoch": 0.7370857443450816, + "grad_norm": 1.5440795421600342, + "learning_rate": 3.4121482134884575e-05, + "loss": 0.6072, + "step": 7006 + }, + { + "epoch": 0.7371909521304576, + "grad_norm": 1.7925177812576294, + "learning_rate": 3.409584849586006e-05, + "loss": 1.7965, + "step": 7007 + }, + { + "epoch": 0.7372961599158337, + "grad_norm": 1.45937979221344, + "learning_rate": 3.4070222510067653e-05, + "loss": 1.5833, + "step": 7008 + }, + { + "epoch": 0.7374013677012099, + "grad_norm": 1.4615490436553955, + "learning_rate": 3.40446041804832e-05, + "loss": 1.5594, + "step": 7009 + }, + { + "epoch": 0.737506575486586, + "grad_norm": 2.050783395767212, + "learning_rate": 3.401899351008163e-05, + "loss": 1.8929, + "step": 7010 + }, + { + "epoch": 0.7376117832719621, + "grad_norm": 1.2625833749771118, + "learning_rate": 3.3993390501837116e-05, + "loss": 1.7766, + "step": 7011 + }, + { + "epoch": 0.7377169910573382, + "grad_norm": 1.5997296571731567, + "learning_rate": 3.396779515872282e-05, + "loss": 2.3762, + "step": 7012 + }, + { + "epoch": 0.7378221988427144, + "grad_norm": 1.210913062095642, + "learning_rate": 3.3942207483710986e-05, + "loss": 1.93, + "step": 7013 + }, + { + "epoch": 0.7379274066280905, + "grad_norm": 1.6027042865753174, + "learning_rate": 3.391662747977312e-05, + "loss": 2.1033, + "step": 7014 + }, + { + "epoch": 0.7380326144134666, + "grad_norm": 1.183409571647644, + "learning_rate": 3.389105514987969e-05, + "loss": 2.1993, + "step": 7015 + }, + { + "epoch": 0.7381378221988427, + "grad_norm": 1.665959358215332, + "learning_rate": 3.38654904970003e-05, + "loss": 1.7956, + "step": 7016 + }, + { + "epoch": 0.7382430299842189, + "grad_norm": 1.5837026834487915, + "learning_rate": 3.383993352410375e-05, + "loss": 1.4741, + "step": 7017 + }, + { + "epoch": 0.7383482377695949, + "grad_norm": 1.7137260437011719, + "learning_rate": 3.381438423415787e-05, + "loss": 1.6929, + "step": 7018 + }, + { + "epoch": 0.738453445554971, + "grad_norm": 1.120949625968933, + "learning_rate": 3.378884263012957e-05, + "loss": 1.3851, + "step": 7019 + }, + { + "epoch": 0.7385586533403472, + "grad_norm": 1.4287333488464355, + "learning_rate": 3.3763308714984974e-05, + "loss": 2.1058, + "step": 7020 + }, + { + "epoch": 0.7386638611257234, + "grad_norm": 1.9645508527755737, + "learning_rate": 3.373778249168923e-05, + "loss": 1.747, + "step": 7021 + }, + { + "epoch": 0.7387690689110994, + "grad_norm": 1.1778647899627686, + "learning_rate": 3.3712263963206583e-05, + "loss": 2.0621, + "step": 7022 + }, + { + "epoch": 0.7388742766964755, + "grad_norm": 2.0654361248016357, + "learning_rate": 3.368675313250046e-05, + "loss": 1.6014, + "step": 7023 + }, + { + "epoch": 0.7389794844818517, + "grad_norm": 1.6172906160354614, + "learning_rate": 3.3661250002533305e-05, + "loss": 1.6385, + "step": 7024 + }, + { + "epoch": 0.7390846922672277, + "grad_norm": 1.6722594499588013, + "learning_rate": 3.363575457626678e-05, + "loss": 1.7566, + "step": 7025 + }, + { + "epoch": 0.7391899000526039, + "grad_norm": 1.1120942831039429, + "learning_rate": 3.361026685666152e-05, + "loss": 1.7478, + "step": 7026 + }, + { + "epoch": 0.73929510783798, + "grad_norm": 1.4074763059616089, + "learning_rate": 3.358478684667734e-05, + "loss": 1.9311, + "step": 7027 + }, + { + "epoch": 0.7394003156233562, + "grad_norm": 1.5094165802001953, + "learning_rate": 3.355931454927319e-05, + "loss": 1.4962, + "step": 7028 + }, + { + "epoch": 0.7395055234087322, + "grad_norm": 1.2964214086532593, + "learning_rate": 3.353384996740709e-05, + "loss": 2.0931, + "step": 7029 + }, + { + "epoch": 0.7396107311941084, + "grad_norm": 1.637769103050232, + "learning_rate": 3.3508393104036076e-05, + "loss": 1.556, + "step": 7030 + }, + { + "epoch": 0.7397159389794845, + "grad_norm": 1.4458996057510376, + "learning_rate": 3.348294396211643e-05, + "loss": 1.7264, + "step": 7031 + }, + { + "epoch": 0.7398211467648605, + "grad_norm": 1.4599863290786743, + "learning_rate": 3.345750254460348e-05, + "loss": 1.3761, + "step": 7032 + }, + { + "epoch": 0.7399263545502367, + "grad_norm": 1.3754396438598633, + "learning_rate": 3.3432068854451594e-05, + "loss": 1.8374, + "step": 7033 + }, + { + "epoch": 0.7400315623356128, + "grad_norm": 1.6223207712173462, + "learning_rate": 3.3406642894614394e-05, + "loss": 1.5258, + "step": 7034 + }, + { + "epoch": 0.740136770120989, + "grad_norm": 2.438016414642334, + "learning_rate": 3.3381224668044434e-05, + "loss": 1.9796, + "step": 7035 + }, + { + "epoch": 0.740241977906365, + "grad_norm": 1.7482492923736572, + "learning_rate": 3.3355814177693514e-05, + "loss": 2.0305, + "step": 7036 + }, + { + "epoch": 0.7403471856917412, + "grad_norm": 2.278324842453003, + "learning_rate": 3.3330411426512435e-05, + "loss": 1.6287, + "step": 7037 + }, + { + "epoch": 0.7404523934771173, + "grad_norm": 1.471503734588623, + "learning_rate": 3.3305016417451105e-05, + "loss": 1.3077, + "step": 7038 + }, + { + "epoch": 0.7405576012624934, + "grad_norm": 1.7559709548950195, + "learning_rate": 3.327962915345864e-05, + "loss": 1.2328, + "step": 7039 + }, + { + "epoch": 0.7406628090478695, + "grad_norm": 1.2783993482589722, + "learning_rate": 3.325424963748313e-05, + "loss": 1.6599, + "step": 7040 + }, + { + "epoch": 0.7407680168332457, + "grad_norm": 1.633362054824829, + "learning_rate": 3.3228877872471786e-05, + "loss": 1.8694, + "step": 7041 + }, + { + "epoch": 0.7408732246186218, + "grad_norm": 1.3471500873565674, + "learning_rate": 3.3203513861371017e-05, + "loss": 1.4373, + "step": 7042 + }, + { + "epoch": 0.7409784324039979, + "grad_norm": 1.6833635568618774, + "learning_rate": 3.317815760712622e-05, + "loss": 1.2696, + "step": 7043 + }, + { + "epoch": 0.741083640189374, + "grad_norm": 1.1694504022598267, + "learning_rate": 3.315280911268193e-05, + "loss": 1.5157, + "step": 7044 + }, + { + "epoch": 0.7411888479747502, + "grad_norm": 1.1271198987960815, + "learning_rate": 3.312746838098181e-05, + "loss": 1.4756, + "step": 7045 + }, + { + "epoch": 0.7412940557601263, + "grad_norm": 1.275770664215088, + "learning_rate": 3.310213541496858e-05, + "loss": 1.4592, + "step": 7046 + }, + { + "epoch": 0.7413992635455023, + "grad_norm": 1.2125309705734253, + "learning_rate": 3.307681021758405e-05, + "loss": 1.9089, + "step": 7047 + }, + { + "epoch": 0.7415044713308785, + "grad_norm": 1.1902837753295898, + "learning_rate": 3.305149279176921e-05, + "loss": 2.156, + "step": 7048 + }, + { + "epoch": 0.7416096791162546, + "grad_norm": 1.931227684020996, + "learning_rate": 3.302618314046405e-05, + "loss": 1.6646, + "step": 7049 + }, + { + "epoch": 0.7417148869016307, + "grad_norm": 1.360236644744873, + "learning_rate": 3.300088126660768e-05, + "loss": 0.9917, + "step": 7050 + }, + { + "epoch": 0.7418200946870068, + "grad_norm": 1.1433465480804443, + "learning_rate": 3.297558717313839e-05, + "loss": 1.7004, + "step": 7051 + }, + { + "epoch": 0.741925302472383, + "grad_norm": 1.5731385946273804, + "learning_rate": 3.295030086299341e-05, + "loss": 1.8221, + "step": 7052 + }, + { + "epoch": 0.7420305102577591, + "grad_norm": 1.5698689222335815, + "learning_rate": 3.292502233910922e-05, + "loss": 2.1785, + "step": 7053 + }, + { + "epoch": 0.7421357180431352, + "grad_norm": 2.299558162689209, + "learning_rate": 3.2899751604421324e-05, + "loss": 1.2728, + "step": 7054 + }, + { + "epoch": 0.7422409258285113, + "grad_norm": 1.727777123451233, + "learning_rate": 3.287448866186428e-05, + "loss": 1.5016, + "step": 7055 + }, + { + "epoch": 0.7423461336138875, + "grad_norm": 1.1530907154083252, + "learning_rate": 3.284923351437187e-05, + "loss": 1.8296, + "step": 7056 + }, + { + "epoch": 0.7424513413992635, + "grad_norm": 1.4847744703292847, + "learning_rate": 3.282398616487684e-05, + "loss": 1.5375, + "step": 7057 + }, + { + "epoch": 0.7425565491846396, + "grad_norm": 1.4450937509536743, + "learning_rate": 3.279874661631106e-05, + "loss": 2.0141, + "step": 7058 + }, + { + "epoch": 0.7426617569700158, + "grad_norm": 1.7158372402191162, + "learning_rate": 3.277351487160558e-05, + "loss": 1.7055, + "step": 7059 + }, + { + "epoch": 0.742766964755392, + "grad_norm": 1.9537938833236694, + "learning_rate": 3.2748290933690454e-05, + "loss": 1.7621, + "step": 7060 + }, + { + "epoch": 0.742872172540768, + "grad_norm": 1.7717899084091187, + "learning_rate": 3.2723074805494805e-05, + "loss": 1.357, + "step": 7061 + }, + { + "epoch": 0.7429773803261441, + "grad_norm": 1.749004602432251, + "learning_rate": 3.269786648994697e-05, + "loss": 1.998, + "step": 7062 + }, + { + "epoch": 0.7430825881115203, + "grad_norm": 1.6080665588378906, + "learning_rate": 3.267266598997429e-05, + "loss": 1.6851, + "step": 7063 + }, + { + "epoch": 0.7431877958968963, + "grad_norm": 1.6543947458267212, + "learning_rate": 3.2647473308503164e-05, + "loss": 1.483, + "step": 7064 + }, + { + "epoch": 0.7432930036822725, + "grad_norm": 1.3643039464950562, + "learning_rate": 3.262228844845922e-05, + "loss": 1.7009, + "step": 7065 + }, + { + "epoch": 0.7433982114676486, + "grad_norm": 1.9166144132614136, + "learning_rate": 3.259711141276703e-05, + "loss": 2.1545, + "step": 7066 + }, + { + "epoch": 0.7435034192530248, + "grad_norm": 1.5741544961929321, + "learning_rate": 3.2571942204350324e-05, + "loss": 1.9652, + "step": 7067 + }, + { + "epoch": 0.7436086270384008, + "grad_norm": 1.287895679473877, + "learning_rate": 3.254678082613196e-05, + "loss": 1.753, + "step": 7068 + }, + { + "epoch": 0.743713834823777, + "grad_norm": 1.658474326133728, + "learning_rate": 3.252162728103382e-05, + "loss": 1.4791, + "step": 7069 + }, + { + "epoch": 0.7438190426091531, + "grad_norm": 1.8779799938201904, + "learning_rate": 3.249648157197688e-05, + "loss": 1.3374, + "step": 7070 + }, + { + "epoch": 0.7439242503945291, + "grad_norm": 1.7601832151412964, + "learning_rate": 3.2471343701881275e-05, + "loss": 1.4406, + "step": 7071 + }, + { + "epoch": 0.7440294581799053, + "grad_norm": 1.6893101930618286, + "learning_rate": 3.244621367366616e-05, + "loss": 1.0462, + "step": 7072 + }, + { + "epoch": 0.7441346659652814, + "grad_norm": 1.2970998287200928, + "learning_rate": 3.24210914902498e-05, + "loss": 1.7947, + "step": 7073 + }, + { + "epoch": 0.7442398737506576, + "grad_norm": 1.8401261568069458, + "learning_rate": 3.2395977154549554e-05, + "loss": 2.1458, + "step": 7074 + }, + { + "epoch": 0.7443450815360336, + "grad_norm": 2.419576406478882, + "learning_rate": 3.2370870669481834e-05, + "loss": 2.044, + "step": 7075 + }, + { + "epoch": 0.7444502893214098, + "grad_norm": 1.1183511018753052, + "learning_rate": 3.234577203796223e-05, + "loss": 1.7889, + "step": 7076 + }, + { + "epoch": 0.7445554971067859, + "grad_norm": 1.3616372346878052, + "learning_rate": 3.232068126290535e-05, + "loss": 1.8218, + "step": 7077 + }, + { + "epoch": 0.7446607048921621, + "grad_norm": 1.127997875213623, + "learning_rate": 3.229559834722485e-05, + "loss": 2.2213, + "step": 7078 + }, + { + "epoch": 0.7447659126775381, + "grad_norm": 1.5394973754882812, + "learning_rate": 3.227052329383362e-05, + "loss": 1.6531, + "step": 7079 + }, + { + "epoch": 0.7448711204629143, + "grad_norm": 1.4563318490982056, + "learning_rate": 3.224545610564345e-05, + "loss": 2.0104, + "step": 7080 + }, + { + "epoch": 0.7449763282482904, + "grad_norm": 1.76889169216156, + "learning_rate": 3.2220396785565375e-05, + "loss": 1.372, + "step": 7081 + }, + { + "epoch": 0.7450815360336664, + "grad_norm": 1.5505903959274292, + "learning_rate": 3.219534533650944e-05, + "loss": 2.0011, + "step": 7082 + }, + { + "epoch": 0.7451867438190426, + "grad_norm": 1.7687225341796875, + "learning_rate": 3.217030176138474e-05, + "loss": 1.8728, + "step": 7083 + }, + { + "epoch": 0.7452919516044187, + "grad_norm": 1.5347532033920288, + "learning_rate": 3.214526606309957e-05, + "loss": 1.6496, + "step": 7084 + }, + { + "epoch": 0.7453971593897949, + "grad_norm": 1.3686238527297974, + "learning_rate": 3.212023824456121e-05, + "loss": 1.7251, + "step": 7085 + }, + { + "epoch": 0.7455023671751709, + "grad_norm": 1.5368924140930176, + "learning_rate": 3.2095218308676024e-05, + "loss": 1.6474, + "step": 7086 + }, + { + "epoch": 0.7456075749605471, + "grad_norm": 1.4504789113998413, + "learning_rate": 3.207020625834957e-05, + "loss": 1.838, + "step": 7087 + }, + { + "epoch": 0.7457127827459232, + "grad_norm": 1.296593427658081, + "learning_rate": 3.204520209648637e-05, + "loss": 1.4902, + "step": 7088 + }, + { + "epoch": 0.7458179905312993, + "grad_norm": 1.3226828575134277, + "learning_rate": 3.2020205825990056e-05, + "loss": 1.504, + "step": 7089 + }, + { + "epoch": 0.7459231983166754, + "grad_norm": 1.6971758604049683, + "learning_rate": 3.199521744976342e-05, + "loss": 1.4026, + "step": 7090 + }, + { + "epoch": 0.7460284061020516, + "grad_norm": 2.515711545944214, + "learning_rate": 3.1970236970708234e-05, + "loss": 1.9895, + "step": 7091 + }, + { + "epoch": 0.7461336138874277, + "grad_norm": 2.220277786254883, + "learning_rate": 3.194526439172539e-05, + "loss": 1.8306, + "step": 7092 + }, + { + "epoch": 0.7462388216728038, + "grad_norm": 1.5911275148391724, + "learning_rate": 3.192029971571492e-05, + "loss": 2.0089, + "step": 7093 + }, + { + "epoch": 0.7463440294581799, + "grad_norm": 1.8149131536483765, + "learning_rate": 3.189534294557587e-05, + "loss": 1.6243, + "step": 7094 + }, + { + "epoch": 0.746449237243556, + "grad_norm": 1.3475899696350098, + "learning_rate": 3.187039408420638e-05, + "loss": 1.5206, + "step": 7095 + }, + { + "epoch": 0.7465544450289321, + "grad_norm": 1.1558003425598145, + "learning_rate": 3.184545313450368e-05, + "loss": 2.1106, + "step": 7096 + }, + { + "epoch": 0.7466596528143082, + "grad_norm": 1.5029397010803223, + "learning_rate": 3.182052009936404e-05, + "loss": 1.6606, + "step": 7097 + }, + { + "epoch": 0.7467648605996844, + "grad_norm": 1.187970519065857, + "learning_rate": 3.1795594981682917e-05, + "loss": 1.9256, + "step": 7098 + }, + { + "epoch": 0.7468700683850605, + "grad_norm": 1.2517000436782837, + "learning_rate": 3.1770677784354773e-05, + "loss": 1.4797, + "step": 7099 + }, + { + "epoch": 0.7469752761704366, + "grad_norm": 1.3138651847839355, + "learning_rate": 3.174576851027311e-05, + "loss": 2.2821, + "step": 7100 + }, + { + "epoch": 0.7470804839558127, + "grad_norm": 1.6332142353057861, + "learning_rate": 3.172086716233061e-05, + "loss": 1.5739, + "step": 7101 + }, + { + "epoch": 0.7471856917411889, + "grad_norm": 1.3998867273330688, + "learning_rate": 3.1695973743418994e-05, + "loss": 1.7157, + "step": 7102 + }, + { + "epoch": 0.7472908995265649, + "grad_norm": 1.38740873336792, + "learning_rate": 3.167108825642897e-05, + "loss": 1.3737, + "step": 7103 + }, + { + "epoch": 0.7473961073119411, + "grad_norm": 1.5622433423995972, + "learning_rate": 3.164621070425051e-05, + "loss": 1.449, + "step": 7104 + }, + { + "epoch": 0.7475013150973172, + "grad_norm": 1.275417685508728, + "learning_rate": 3.162134108977253e-05, + "loss": 1.6909, + "step": 7105 + }, + { + "epoch": 0.7476065228826934, + "grad_norm": 1.0392786264419556, + "learning_rate": 3.159647941588298e-05, + "loss": 1.8211, + "step": 7106 + }, + { + "epoch": 0.7477117306680694, + "grad_norm": 1.3279063701629639, + "learning_rate": 3.1571625685469086e-05, + "loss": 1.6451, + "step": 7107 + }, + { + "epoch": 0.7478169384534455, + "grad_norm": 1.5140197277069092, + "learning_rate": 3.1546779901416965e-05, + "loss": 2.0696, + "step": 7108 + }, + { + "epoch": 0.7479221462388217, + "grad_norm": 2.04419207572937, + "learning_rate": 3.152194206661185e-05, + "loss": 1.963, + "step": 7109 + }, + { + "epoch": 0.7480273540241978, + "grad_norm": 1.3939889669418335, + "learning_rate": 3.149711218393814e-05, + "loss": 1.2607, + "step": 7110 + }, + { + "epoch": 0.7481325618095739, + "grad_norm": 1.5365245342254639, + "learning_rate": 3.147229025627922e-05, + "loss": 1.5934, + "step": 7111 + }, + { + "epoch": 0.74823776959495, + "grad_norm": 1.3564833402633667, + "learning_rate": 3.144747628651754e-05, + "loss": 1.3875, + "step": 7112 + }, + { + "epoch": 0.7483429773803262, + "grad_norm": 1.9279536008834839, + "learning_rate": 3.142267027753474e-05, + "loss": 2.0533, + "step": 7113 + }, + { + "epoch": 0.7484481851657022, + "grad_norm": 1.5397069454193115, + "learning_rate": 3.139787223221141e-05, + "loss": 1.8882, + "step": 7114 + }, + { + "epoch": 0.7485533929510784, + "grad_norm": 2.6162831783294678, + "learning_rate": 3.137308215342729e-05, + "loss": 2.1646, + "step": 7115 + }, + { + "epoch": 0.7486586007364545, + "grad_norm": 1.33000910282135, + "learning_rate": 3.134830004406114e-05, + "loss": 1.243, + "step": 7116 + }, + { + "epoch": 0.7487638085218307, + "grad_norm": 1.1963053941726685, + "learning_rate": 3.1323525906990826e-05, + "loss": 1.856, + "step": 7117 + }, + { + "epoch": 0.7488690163072067, + "grad_norm": 1.3302133083343506, + "learning_rate": 3.129875974509332e-05, + "loss": 1.9321, + "step": 7118 + }, + { + "epoch": 0.7489742240925829, + "grad_norm": 1.8992441892623901, + "learning_rate": 3.127400156124463e-05, + "loss": 1.5662, + "step": 7119 + }, + { + "epoch": 0.749079431877959, + "grad_norm": 1.2559592723846436, + "learning_rate": 3.1249251358319784e-05, + "loss": 1.9587, + "step": 7120 + }, + { + "epoch": 0.749184639663335, + "grad_norm": 1.8781805038452148, + "learning_rate": 3.122450913919302e-05, + "loss": 1.5198, + "step": 7121 + }, + { + "epoch": 0.7492898474487112, + "grad_norm": 1.254428505897522, + "learning_rate": 3.1199774906737557e-05, + "loss": 2.0107, + "step": 7122 + }, + { + "epoch": 0.7493950552340873, + "grad_norm": 1.7342922687530518, + "learning_rate": 3.1175048663825626e-05, + "loss": 1.8242, + "step": 7123 + }, + { + "epoch": 0.7495002630194635, + "grad_norm": 1.5174115896224976, + "learning_rate": 3.11503304133287e-05, + "loss": 1.2619, + "step": 7124 + }, + { + "epoch": 0.7496054708048395, + "grad_norm": 1.4857186079025269, + "learning_rate": 3.1125620158117186e-05, + "loss": 1.4888, + "step": 7125 + }, + { + "epoch": 0.7497106785902157, + "grad_norm": 1.7400273084640503, + "learning_rate": 3.110091790106057e-05, + "loss": 1.5356, + "step": 7126 + }, + { + "epoch": 0.7498158863755918, + "grad_norm": 1.744299292564392, + "learning_rate": 3.107622364502751e-05, + "loss": 2.2264, + "step": 7127 + }, + { + "epoch": 0.7499210941609679, + "grad_norm": 2.1220436096191406, + "learning_rate": 3.105153739288561e-05, + "loss": 1.645, + "step": 7128 + }, + { + "epoch": 0.750026301946344, + "grad_norm": 1.1671090126037598, + "learning_rate": 3.102685914750166e-05, + "loss": 1.389, + "step": 7129 + }, + { + "epoch": 0.7501315097317202, + "grad_norm": 1.858191728591919, + "learning_rate": 3.100218891174144e-05, + "loss": 1.56, + "step": 7130 + }, + { + "epoch": 0.7502367175170963, + "grad_norm": 1.2349094152450562, + "learning_rate": 3.097752668846977e-05, + "loss": 1.6298, + "step": 7131 + }, + { + "epoch": 0.7503419253024723, + "grad_norm": 1.369010329246521, + "learning_rate": 3.095287248055069e-05, + "loss": 1.7109, + "step": 7132 + }, + { + "epoch": 0.7504471330878485, + "grad_norm": 1.2926762104034424, + "learning_rate": 3.092822629084715e-05, + "loss": 1.5841, + "step": 7133 + }, + { + "epoch": 0.7505523408732246, + "grad_norm": 1.9117960929870605, + "learning_rate": 3.090358812222122e-05, + "loss": 2.0065, + "step": 7134 + }, + { + "epoch": 0.7506575486586007, + "grad_norm": 1.1057976484298706, + "learning_rate": 3.0878957977534096e-05, + "loss": 1.5541, + "step": 7135 + }, + { + "epoch": 0.7507627564439768, + "grad_norm": 1.4562286138534546, + "learning_rate": 3.0854335859645975e-05, + "loss": 1.7463, + "step": 7136 + }, + { + "epoch": 0.750867964229353, + "grad_norm": 1.2036335468292236, + "learning_rate": 3.0829721771416146e-05, + "loss": 1.3571, + "step": 7137 + }, + { + "epoch": 0.7509731720147291, + "grad_norm": 1.2491194009780884, + "learning_rate": 3.080511571570294e-05, + "loss": 1.834, + "step": 7138 + }, + { + "epoch": 0.7510783798001052, + "grad_norm": 1.3567789793014526, + "learning_rate": 3.078051769536378e-05, + "loss": 1.8808, + "step": 7139 + }, + { + "epoch": 0.7511835875854813, + "grad_norm": 1.4406784772872925, + "learning_rate": 3.0755927713255174e-05, + "loss": 1.8301, + "step": 7140 + }, + { + "epoch": 0.7512887953708575, + "grad_norm": 1.7407746315002441, + "learning_rate": 3.073134577223268e-05, + "loss": 1.3971, + "step": 7141 + }, + { + "epoch": 0.7513940031562336, + "grad_norm": 1.3878988027572632, + "learning_rate": 3.070677187515087e-05, + "loss": 1.412, + "step": 7142 + }, + { + "epoch": 0.7514992109416097, + "grad_norm": 2.31683611869812, + "learning_rate": 3.06822060248635e-05, + "loss": 2.0428, + "step": 7143 + }, + { + "epoch": 0.7516044187269858, + "grad_norm": 1.463760495185852, + "learning_rate": 3.065764822422329e-05, + "loss": 1.7479, + "step": 7144 + }, + { + "epoch": 0.751709626512362, + "grad_norm": 1.3902920484542847, + "learning_rate": 3.063309847608202e-05, + "loss": 1.5929, + "step": 7145 + }, + { + "epoch": 0.751814834297738, + "grad_norm": 1.9475911855697632, + "learning_rate": 3.060855678329063e-05, + "loss": 1.6469, + "step": 7146 + }, + { + "epoch": 0.7519200420831141, + "grad_norm": 1.611803650856018, + "learning_rate": 3.0584023148699046e-05, + "loss": 1.775, + "step": 7147 + }, + { + "epoch": 0.7520252498684903, + "grad_norm": 1.7494837045669556, + "learning_rate": 3.055949757515624e-05, + "loss": 1.7225, + "step": 7148 + }, + { + "epoch": 0.7521304576538664, + "grad_norm": 1.4257961511611938, + "learning_rate": 3.0534980065510345e-05, + "loss": 1.7894, + "step": 7149 + }, + { + "epoch": 0.7522356654392425, + "grad_norm": 1.4211279153823853, + "learning_rate": 3.0510470622608478e-05, + "loss": 1.7914, + "step": 7150 + }, + { + "epoch": 0.7523408732246186, + "grad_norm": 1.7004531621932983, + "learning_rate": 3.0485969249296797e-05, + "loss": 2.1648, + "step": 7151 + }, + { + "epoch": 0.7524460810099948, + "grad_norm": 1.0059231519699097, + "learning_rate": 3.0461475948420637e-05, + "loss": 1.7455, + "step": 7152 + }, + { + "epoch": 0.7525512887953708, + "grad_norm": 1.09248685836792, + "learning_rate": 3.043699072282429e-05, + "loss": 1.839, + "step": 7153 + }, + { + "epoch": 0.752656496580747, + "grad_norm": 2.1308209896087646, + "learning_rate": 3.041251357535111e-05, + "loss": 1.5851, + "step": 7154 + }, + { + "epoch": 0.7527617043661231, + "grad_norm": 1.225701928138733, + "learning_rate": 3.0388044508843616e-05, + "loss": 2.0519, + "step": 7155 + }, + { + "epoch": 0.7528669121514993, + "grad_norm": 1.9547263383865356, + "learning_rate": 3.0363583526143292e-05, + "loss": 1.7462, + "step": 7156 + }, + { + "epoch": 0.7529721199368753, + "grad_norm": 1.4787694215774536, + "learning_rate": 3.0339130630090673e-05, + "loss": 1.0761, + "step": 7157 + }, + { + "epoch": 0.7530773277222514, + "grad_norm": 1.6599515676498413, + "learning_rate": 3.031468582352548e-05, + "loss": 1.8568, + "step": 7158 + }, + { + "epoch": 0.7531825355076276, + "grad_norm": 2.056112766265869, + "learning_rate": 3.0290249109286296e-05, + "loss": 1.7289, + "step": 7159 + }, + { + "epoch": 0.7532877432930036, + "grad_norm": 1.7584683895111084, + "learning_rate": 3.0265820490210973e-05, + "loss": 1.9411, + "step": 7160 + }, + { + "epoch": 0.7533929510783798, + "grad_norm": 1.8417630195617676, + "learning_rate": 3.0241399969136276e-05, + "loss": 1.5764, + "step": 7161 + }, + { + "epoch": 0.7534981588637559, + "grad_norm": 1.1658049821853638, + "learning_rate": 3.0216987548898068e-05, + "loss": 1.7206, + "step": 7162 + }, + { + "epoch": 0.7536033666491321, + "grad_norm": 1.9770032167434692, + "learning_rate": 3.019258323233133e-05, + "loss": 1.5526, + "step": 7163 + }, + { + "epoch": 0.7537085744345081, + "grad_norm": 1.6163029670715332, + "learning_rate": 3.0168187022270032e-05, + "loss": 1.8265, + "step": 7164 + }, + { + "epoch": 0.7538137822198843, + "grad_norm": 1.530423641204834, + "learning_rate": 3.0143798921547193e-05, + "loss": 1.5206, + "step": 7165 + }, + { + "epoch": 0.7539189900052604, + "grad_norm": 1.2398508787155151, + "learning_rate": 3.011941893299499e-05, + "loss": 1.5016, + "step": 7166 + }, + { + "epoch": 0.7540241977906365, + "grad_norm": 1.3902400732040405, + "learning_rate": 3.0095047059444546e-05, + "loss": 1.2807, + "step": 7167 + }, + { + "epoch": 0.7541294055760126, + "grad_norm": 1.967103362083435, + "learning_rate": 3.0070683303726076e-05, + "loss": 1.3802, + "step": 7168 + }, + { + "epoch": 0.7542346133613888, + "grad_norm": 1.6876822710037231, + "learning_rate": 3.0046327668668904e-05, + "loss": 1.5044, + "step": 7169 + }, + { + "epoch": 0.7543398211467649, + "grad_norm": 1.5691266059875488, + "learning_rate": 3.002198015710136e-05, + "loss": 1.4267, + "step": 7170 + }, + { + "epoch": 0.7544450289321409, + "grad_norm": 1.312497615814209, + "learning_rate": 2.999764077185079e-05, + "loss": 1.4682, + "step": 7171 + }, + { + "epoch": 0.7545502367175171, + "grad_norm": 1.056235432624817, + "learning_rate": 2.997330951574371e-05, + "loss": 1.8569, + "step": 7172 + }, + { + "epoch": 0.7546554445028932, + "grad_norm": 1.192795753479004, + "learning_rate": 2.9948986391605584e-05, + "loss": 1.3137, + "step": 7173 + }, + { + "epoch": 0.7547606522882694, + "grad_norm": 2.208214282989502, + "learning_rate": 2.9924671402261018e-05, + "loss": 1.7247, + "step": 7174 + }, + { + "epoch": 0.7548658600736454, + "grad_norm": 1.3944358825683594, + "learning_rate": 2.9900364550533612e-05, + "loss": 1.6949, + "step": 7175 + }, + { + "epoch": 0.7549710678590216, + "grad_norm": 1.4793987274169922, + "learning_rate": 2.9876065839246005e-05, + "loss": 1.8035, + "step": 7176 + }, + { + "epoch": 0.7550762756443977, + "grad_norm": 1.936575174331665, + "learning_rate": 2.9851775271219996e-05, + "loss": 1.279, + "step": 7177 + }, + { + "epoch": 0.7551814834297738, + "grad_norm": 1.4418457746505737, + "learning_rate": 2.9827492849276317e-05, + "loss": 1.6786, + "step": 7178 + }, + { + "epoch": 0.7552866912151499, + "grad_norm": 1.314530849456787, + "learning_rate": 2.9803218576234836e-05, + "loss": 1.5827, + "step": 7179 + }, + { + "epoch": 0.7553918990005261, + "grad_norm": 1.114014983177185, + "learning_rate": 2.9778952454914422e-05, + "loss": 1.484, + "step": 7180 + }, + { + "epoch": 0.7554971067859022, + "grad_norm": 1.7452201843261719, + "learning_rate": 2.9754694488133038e-05, + "loss": 2.1129, + "step": 7181 + }, + { + "epoch": 0.7556023145712782, + "grad_norm": 1.7121224403381348, + "learning_rate": 2.9730444678707625e-05, + "loss": 1.7277, + "step": 7182 + }, + { + "epoch": 0.7557075223566544, + "grad_norm": 1.2562663555145264, + "learning_rate": 2.9706203029454316e-05, + "loss": 1.6911, + "step": 7183 + }, + { + "epoch": 0.7558127301420305, + "grad_norm": 1.4814766645431519, + "learning_rate": 2.9681969543188148e-05, + "loss": 1.5525, + "step": 7184 + }, + { + "epoch": 0.7559179379274066, + "grad_norm": 0.9775087833404541, + "learning_rate": 2.965774422272334e-05, + "loss": 1.9295, + "step": 7185 + }, + { + "epoch": 0.7560231457127827, + "grad_norm": 1.847643494606018, + "learning_rate": 2.9633527070873057e-05, + "loss": 1.5732, + "step": 7186 + }, + { + "epoch": 0.7561283534981589, + "grad_norm": 1.5069630146026611, + "learning_rate": 2.9609318090449533e-05, + "loss": 1.3961, + "step": 7187 + }, + { + "epoch": 0.756233561283535, + "grad_norm": 1.6327673196792603, + "learning_rate": 2.958511728426414e-05, + "loss": 1.0892, + "step": 7188 + }, + { + "epoch": 0.7563387690689111, + "grad_norm": 1.8135273456573486, + "learning_rate": 2.956092465512721e-05, + "loss": 1.354, + "step": 7189 + }, + { + "epoch": 0.7564439768542872, + "grad_norm": 2.080944538116455, + "learning_rate": 2.9536740205848113e-05, + "loss": 1.5985, + "step": 7190 + }, + { + "epoch": 0.7565491846396634, + "grad_norm": 1.4439144134521484, + "learning_rate": 2.9512563939235382e-05, + "loss": 1.5032, + "step": 7191 + }, + { + "epoch": 0.7566543924250394, + "grad_norm": 1.6564826965332031, + "learning_rate": 2.9488395858096485e-05, + "loss": 1.9067, + "step": 7192 + }, + { + "epoch": 0.7567596002104156, + "grad_norm": 1.779802918434143, + "learning_rate": 2.9464235965237964e-05, + "loss": 1.8031, + "step": 7193 + }, + { + "epoch": 0.7568648079957917, + "grad_norm": 1.4289071559906006, + "learning_rate": 2.9440084263465484e-05, + "loss": 1.616, + "step": 7194 + }, + { + "epoch": 0.7569700157811678, + "grad_norm": 1.3593642711639404, + "learning_rate": 2.941594075558366e-05, + "loss": 0.9734, + "step": 7195 + }, + { + "epoch": 0.7570752235665439, + "grad_norm": 2.13067889213562, + "learning_rate": 2.939180544439618e-05, + "loss": 1.4978, + "step": 7196 + }, + { + "epoch": 0.75718043135192, + "grad_norm": 1.6457139253616333, + "learning_rate": 2.936767833270586e-05, + "loss": 1.8645, + "step": 7197 + }, + { + "epoch": 0.7572856391372962, + "grad_norm": 1.727137565612793, + "learning_rate": 2.934355942331447e-05, + "loss": 1.7537, + "step": 7198 + }, + { + "epoch": 0.7573908469226722, + "grad_norm": 1.2516162395477295, + "learning_rate": 2.9319448719022824e-05, + "loss": 1.9984, + "step": 7199 + }, + { + "epoch": 0.7574960547080484, + "grad_norm": 1.7397236824035645, + "learning_rate": 2.929534622263088e-05, + "loss": 1.6422, + "step": 7200 + }, + { + "epoch": 0.7576012624934245, + "grad_norm": 1.722465991973877, + "learning_rate": 2.927125193693756e-05, + "loss": 1.745, + "step": 7201 + }, + { + "epoch": 0.7577064702788007, + "grad_norm": 1.5950039625167847, + "learning_rate": 2.9247165864740856e-05, + "loss": 1.6426, + "step": 7202 + }, + { + "epoch": 0.7578116780641767, + "grad_norm": 1.0130795240402222, + "learning_rate": 2.9223088008837785e-05, + "loss": 1.5676, + "step": 7203 + }, + { + "epoch": 0.7579168858495529, + "grad_norm": 2.0876479148864746, + "learning_rate": 2.919901837202441e-05, + "loss": 1.7984, + "step": 7204 + }, + { + "epoch": 0.758022093634929, + "grad_norm": 2.0102639198303223, + "learning_rate": 2.917495695709592e-05, + "loss": 1.2913, + "step": 7205 + }, + { + "epoch": 0.7581273014203052, + "grad_norm": 1.934104561805725, + "learning_rate": 2.915090376684646e-05, + "loss": 0.9719, + "step": 7206 + }, + { + "epoch": 0.7582325092056812, + "grad_norm": 1.983300805091858, + "learning_rate": 2.9126858804069223e-05, + "loss": 1.83, + "step": 7207 + }, + { + "epoch": 0.7583377169910573, + "grad_norm": 2.1567113399505615, + "learning_rate": 2.9102822071556512e-05, + "loss": 1.7204, + "step": 7208 + }, + { + "epoch": 0.7584429247764335, + "grad_norm": 1.1860876083374023, + "learning_rate": 2.9078793572099616e-05, + "loss": 1.9178, + "step": 7209 + }, + { + "epoch": 0.7585481325618095, + "grad_norm": 2.058955669403076, + "learning_rate": 2.905477330848886e-05, + "loss": 1.7382, + "step": 7210 + }, + { + "epoch": 0.7586533403471857, + "grad_norm": 1.8354625701904297, + "learning_rate": 2.9030761283513684e-05, + "loss": 2.3162, + "step": 7211 + }, + { + "epoch": 0.7587585481325618, + "grad_norm": 1.469107747077942, + "learning_rate": 2.9006757499962512e-05, + "loss": 1.5963, + "step": 7212 + }, + { + "epoch": 0.758863755917938, + "grad_norm": 1.9937857389450073, + "learning_rate": 2.8982761960622782e-05, + "loss": 1.22, + "step": 7213 + }, + { + "epoch": 0.758968963703314, + "grad_norm": 1.7028518915176392, + "learning_rate": 2.8958774668281084e-05, + "loss": 1.4993, + "step": 7214 + }, + { + "epoch": 0.7590741714886902, + "grad_norm": 1.4997313022613525, + "learning_rate": 2.8934795625722943e-05, + "loss": 1.2415, + "step": 7215 + }, + { + "epoch": 0.7591793792740663, + "grad_norm": 2.3115732669830322, + "learning_rate": 2.8910824835732952e-05, + "loss": 1.4934, + "step": 7216 + }, + { + "epoch": 0.7592845870594424, + "grad_norm": 1.127637267112732, + "learning_rate": 2.8886862301094807e-05, + "loss": 1.6622, + "step": 7217 + }, + { + "epoch": 0.7593897948448185, + "grad_norm": 1.6619749069213867, + "learning_rate": 2.8862908024591186e-05, + "loss": 1.9157, + "step": 7218 + }, + { + "epoch": 0.7594950026301946, + "grad_norm": 1.5014777183532715, + "learning_rate": 2.8838962009003756e-05, + "loss": 1.8232, + "step": 7219 + }, + { + "epoch": 0.7596002104155708, + "grad_norm": 1.5407613515853882, + "learning_rate": 2.881502425711339e-05, + "loss": 1.6237, + "step": 7220 + }, + { + "epoch": 0.7597054182009468, + "grad_norm": 1.672674298286438, + "learning_rate": 2.8791094771699802e-05, + "loss": 1.9443, + "step": 7221 + }, + { + "epoch": 0.759810625986323, + "grad_norm": 1.8555692434310913, + "learning_rate": 2.8767173555541972e-05, + "loss": 1.3203, + "step": 7222 + }, + { + "epoch": 0.7599158337716991, + "grad_norm": 1.1525888442993164, + "learning_rate": 2.8743260611417665e-05, + "loss": 1.4575, + "step": 7223 + }, + { + "epoch": 0.7600210415570752, + "grad_norm": 1.2628978490829468, + "learning_rate": 2.8719355942103842e-05, + "loss": 1.5078, + "step": 7224 + }, + { + "epoch": 0.7601262493424513, + "grad_norm": 1.7043256759643555, + "learning_rate": 2.8695459550376515e-05, + "loss": 1.2751, + "step": 7225 + }, + { + "epoch": 0.7602314571278275, + "grad_norm": 2.068817615509033, + "learning_rate": 2.867157143901067e-05, + "loss": 1.9544, + "step": 7226 + }, + { + "epoch": 0.7603366649132036, + "grad_norm": 1.3836297988891602, + "learning_rate": 2.8647691610780326e-05, + "loss": 1.5782, + "step": 7227 + }, + { + "epoch": 0.7604418726985797, + "grad_norm": 1.6043787002563477, + "learning_rate": 2.862382006845863e-05, + "loss": 1.122, + "step": 7228 + }, + { + "epoch": 0.7605470804839558, + "grad_norm": 1.506122350692749, + "learning_rate": 2.8599956814817642e-05, + "loss": 1.8322, + "step": 7229 + }, + { + "epoch": 0.760652288269332, + "grad_norm": 1.124856948852539, + "learning_rate": 2.857610185262859e-05, + "loss": 1.8696, + "step": 7230 + }, + { + "epoch": 0.760757496054708, + "grad_norm": 1.269283652305603, + "learning_rate": 2.8552255184661624e-05, + "loss": 1.8721, + "step": 7231 + }, + { + "epoch": 0.7608627038400841, + "grad_norm": 1.4874763488769531, + "learning_rate": 2.8528416813685975e-05, + "loss": 1.6838, + "step": 7232 + }, + { + "epoch": 0.7609679116254603, + "grad_norm": 1.3948214054107666, + "learning_rate": 2.8504586742469952e-05, + "loss": 1.6012, + "step": 7233 + }, + { + "epoch": 0.7610731194108364, + "grad_norm": 1.520309329032898, + "learning_rate": 2.848076497378085e-05, + "loss": 2.2024, + "step": 7234 + }, + { + "epoch": 0.7611783271962125, + "grad_norm": 1.4063392877578735, + "learning_rate": 2.8456951510384966e-05, + "loss": 1.7189, + "step": 7235 + }, + { + "epoch": 0.7612835349815886, + "grad_norm": 1.1339725255966187, + "learning_rate": 2.8433146355047748e-05, + "loss": 1.6356, + "step": 7236 + }, + { + "epoch": 0.7613887427669648, + "grad_norm": 1.4083565473556519, + "learning_rate": 2.8409349510533578e-05, + "loss": 1.9028, + "step": 7237 + }, + { + "epoch": 0.7614939505523409, + "grad_norm": 1.4033015966415405, + "learning_rate": 2.8385560979605884e-05, + "loss": 1.6745, + "step": 7238 + }, + { + "epoch": 0.761599158337717, + "grad_norm": 1.8718355894088745, + "learning_rate": 2.83617807650272e-05, + "loss": 1.8888, + "step": 7239 + }, + { + "epoch": 0.7617043661230931, + "grad_norm": 1.4770317077636719, + "learning_rate": 2.833800886955902e-05, + "loss": 1.5846, + "step": 7240 + }, + { + "epoch": 0.7618095739084693, + "grad_norm": 1.7958834171295166, + "learning_rate": 2.831424529596185e-05, + "loss": 1.2971, + "step": 7241 + }, + { + "epoch": 0.7619147816938453, + "grad_norm": 1.5714819431304932, + "learning_rate": 2.8290490046995365e-05, + "loss": 1.4623, + "step": 7242 + }, + { + "epoch": 0.7620199894792214, + "grad_norm": 1.6446837186813354, + "learning_rate": 2.8266743125418128e-05, + "loss": 1.7678, + "step": 7243 + }, + { + "epoch": 0.7621251972645976, + "grad_norm": 1.509402871131897, + "learning_rate": 2.8243004533987793e-05, + "loss": 1.8722, + "step": 7244 + }, + { + "epoch": 0.7622304050499737, + "grad_norm": 1.4639089107513428, + "learning_rate": 2.8219274275461062e-05, + "loss": 1.5998, + "step": 7245 + }, + { + "epoch": 0.7623356128353498, + "grad_norm": 1.3890831470489502, + "learning_rate": 2.8195552352593612e-05, + "loss": 1.7413, + "step": 7246 + }, + { + "epoch": 0.7624408206207259, + "grad_norm": 1.8161264657974243, + "learning_rate": 2.8171838768140245e-05, + "loss": 1.8833, + "step": 7247 + }, + { + "epoch": 0.7625460284061021, + "grad_norm": 1.8306972980499268, + "learning_rate": 2.8148133524854716e-05, + "loss": 1.9024, + "step": 7248 + }, + { + "epoch": 0.7626512361914781, + "grad_norm": 1.03922438621521, + "learning_rate": 2.8124436625489813e-05, + "loss": 1.7287, + "step": 7249 + }, + { + "epoch": 0.7627564439768543, + "grad_norm": 2.0580270290374756, + "learning_rate": 2.8100748072797435e-05, + "loss": 1.5174, + "step": 7250 + }, + { + "epoch": 0.7628616517622304, + "grad_norm": 1.4314181804656982, + "learning_rate": 2.8077067869528417e-05, + "loss": 1.6338, + "step": 7251 + }, + { + "epoch": 0.7629668595476066, + "grad_norm": 1.2152514457702637, + "learning_rate": 2.8053396018432644e-05, + "loss": 1.6196, + "step": 7252 + }, + { + "epoch": 0.7630720673329826, + "grad_norm": 2.0151803493499756, + "learning_rate": 2.802973252225911e-05, + "loss": 1.5984, + "step": 7253 + }, + { + "epoch": 0.7631772751183588, + "grad_norm": 1.595259428024292, + "learning_rate": 2.8006077383755747e-05, + "loss": 1.4817, + "step": 7254 + }, + { + "epoch": 0.7632824829037349, + "grad_norm": 0.9341708421707153, + "learning_rate": 2.7982430605669517e-05, + "loss": 1.7419, + "step": 7255 + }, + { + "epoch": 0.7633876906891109, + "grad_norm": 1.5944820642471313, + "learning_rate": 2.7958792190746496e-05, + "loss": 1.6313, + "step": 7256 + }, + { + "epoch": 0.7634928984744871, + "grad_norm": 1.1736741065979004, + "learning_rate": 2.7935162141731718e-05, + "loss": 1.6742, + "step": 7257 + }, + { + "epoch": 0.7635981062598632, + "grad_norm": 1.133220911026001, + "learning_rate": 2.7911540461369222e-05, + "loss": 1.443, + "step": 7258 + }, + { + "epoch": 0.7637033140452394, + "grad_norm": 1.820003867149353, + "learning_rate": 2.788792715240218e-05, + "loss": 1.9558, + "step": 7259 + }, + { + "epoch": 0.7638085218306154, + "grad_norm": 1.6289899349212646, + "learning_rate": 2.78643222175727e-05, + "loss": 1.863, + "step": 7260 + }, + { + "epoch": 0.7639137296159916, + "grad_norm": 1.1900887489318848, + "learning_rate": 2.7840725659621915e-05, + "loss": 2.0473, + "step": 7261 + }, + { + "epoch": 0.7640189374013677, + "grad_norm": 2.775047779083252, + "learning_rate": 2.781713748129008e-05, + "loss": 1.7868, + "step": 7262 + }, + { + "epoch": 0.7641241451867438, + "grad_norm": 1.3283895254135132, + "learning_rate": 2.7793557685316363e-05, + "loss": 1.4167, + "step": 7263 + }, + { + "epoch": 0.7642293529721199, + "grad_norm": 1.6619638204574585, + "learning_rate": 2.7769986274439e-05, + "loss": 1.7392, + "step": 7264 + }, + { + "epoch": 0.7643345607574961, + "grad_norm": 1.9690237045288086, + "learning_rate": 2.774642325139535e-05, + "loss": 1.3192, + "step": 7265 + }, + { + "epoch": 0.7644397685428722, + "grad_norm": 1.4869492053985596, + "learning_rate": 2.772286861892157e-05, + "loss": 1.8371, + "step": 7266 + }, + { + "epoch": 0.7645449763282482, + "grad_norm": 2.642908811569214, + "learning_rate": 2.769932237975309e-05, + "loss": 1.3455, + "step": 7267 + }, + { + "epoch": 0.7646501841136244, + "grad_norm": 2.0182862281799316, + "learning_rate": 2.7675784536624215e-05, + "loss": 1.8982, + "step": 7268 + }, + { + "epoch": 0.7647553918990005, + "grad_norm": 1.7238906621932983, + "learning_rate": 2.7652255092268298e-05, + "loss": 1.21, + "step": 7269 + }, + { + "epoch": 0.7648605996843767, + "grad_norm": 1.1936345100402832, + "learning_rate": 2.762873404941779e-05, + "loss": 2.1362, + "step": 7270 + }, + { + "epoch": 0.7649658074697527, + "grad_norm": 1.5153876543045044, + "learning_rate": 2.7605221410804093e-05, + "loss": 1.9925, + "step": 7271 + }, + { + "epoch": 0.7650710152551289, + "grad_norm": 1.9088298082351685, + "learning_rate": 2.7581717179157606e-05, + "loss": 1.6018, + "step": 7272 + }, + { + "epoch": 0.765176223040505, + "grad_norm": 1.2692924737930298, + "learning_rate": 2.755822135720787e-05, + "loss": 1.8706, + "step": 7273 + }, + { + "epoch": 0.7652814308258811, + "grad_norm": 1.2598018646240234, + "learning_rate": 2.7534733947683334e-05, + "loss": 1.7554, + "step": 7274 + }, + { + "epoch": 0.7653866386112572, + "grad_norm": 2.6628575325012207, + "learning_rate": 2.7511254953311495e-05, + "loss": 1.4819, + "step": 7275 + }, + { + "epoch": 0.7654918463966334, + "grad_norm": 1.1039395332336426, + "learning_rate": 2.748778437681895e-05, + "loss": 1.0194, + "step": 7276 + }, + { + "epoch": 0.7655970541820095, + "grad_norm": 1.6342849731445312, + "learning_rate": 2.7464322220931205e-05, + "loss": 1.455, + "step": 7277 + }, + { + "epoch": 0.7657022619673856, + "grad_norm": 1.5950639247894287, + "learning_rate": 2.7440868488372884e-05, + "loss": 1.3383, + "step": 7278 + }, + { + "epoch": 0.7658074697527617, + "grad_norm": 1.440382719039917, + "learning_rate": 2.7417423181867585e-05, + "loss": 1.8153, + "step": 7279 + }, + { + "epoch": 0.7659126775381379, + "grad_norm": 2.1018497943878174, + "learning_rate": 2.7393986304137887e-05, + "loss": 1.6111, + "step": 7280 + }, + { + "epoch": 0.7660178853235139, + "grad_norm": 1.6211340427398682, + "learning_rate": 2.73705578579055e-05, + "loss": 1.7447, + "step": 7281 + }, + { + "epoch": 0.76612309310889, + "grad_norm": 1.3533055782318115, + "learning_rate": 2.7347137845891068e-05, + "loss": 1.5142, + "step": 7282 + }, + { + "epoch": 0.7662283008942662, + "grad_norm": 1.5132229328155518, + "learning_rate": 2.7323726270814253e-05, + "loss": 1.6848, + "step": 7283 + }, + { + "epoch": 0.7663335086796423, + "grad_norm": 1.4339923858642578, + "learning_rate": 2.7300323135393812e-05, + "loss": 1.6556, + "step": 7284 + }, + { + "epoch": 0.7664387164650184, + "grad_norm": 2.367436170578003, + "learning_rate": 2.727692844234746e-05, + "loss": 1.9757, + "step": 7285 + }, + { + "epoch": 0.7665439242503945, + "grad_norm": 1.1163835525512695, + "learning_rate": 2.72535421943919e-05, + "loss": 1.7622, + "step": 7286 + }, + { + "epoch": 0.7666491320357707, + "grad_norm": 1.5662150382995605, + "learning_rate": 2.7230164394242995e-05, + "loss": 1.6492, + "step": 7287 + }, + { + "epoch": 0.7667543398211467, + "grad_norm": 1.4997937679290771, + "learning_rate": 2.720679504461542e-05, + "loss": 1.3393, + "step": 7288 + }, + { + "epoch": 0.7668595476065229, + "grad_norm": 2.0090813636779785, + "learning_rate": 2.7183434148223052e-05, + "loss": 1.1286, + "step": 7289 + }, + { + "epoch": 0.766964755391899, + "grad_norm": 1.4088441133499146, + "learning_rate": 2.7160081707778707e-05, + "loss": 1.2337, + "step": 7290 + }, + { + "epoch": 0.7670699631772752, + "grad_norm": 1.4657981395721436, + "learning_rate": 2.7136737725994187e-05, + "loss": 1.166, + "step": 7291 + }, + { + "epoch": 0.7671751709626512, + "grad_norm": 2.0377955436706543, + "learning_rate": 2.7113402205580408e-05, + "loss": 1.8475, + "step": 7292 + }, + { + "epoch": 0.7672803787480273, + "grad_norm": 1.972943663597107, + "learning_rate": 2.7090075149247217e-05, + "loss": 1.7478, + "step": 7293 + }, + { + "epoch": 0.7673855865334035, + "grad_norm": 1.5420266389846802, + "learning_rate": 2.7066756559703498e-05, + "loss": 1.8444, + "step": 7294 + }, + { + "epoch": 0.7674907943187795, + "grad_norm": 1.34401535987854, + "learning_rate": 2.7043446439657193e-05, + "loss": 1.4609, + "step": 7295 + }, + { + "epoch": 0.7675960021041557, + "grad_norm": 1.3176796436309814, + "learning_rate": 2.7020144791815218e-05, + "loss": 1.3681, + "step": 7296 + }, + { + "epoch": 0.7677012098895318, + "grad_norm": 1.1592514514923096, + "learning_rate": 2.699685161888348e-05, + "loss": 1.6415, + "step": 7297 + }, + { + "epoch": 0.767806417674908, + "grad_norm": 1.5379077196121216, + "learning_rate": 2.6973566923566994e-05, + "loss": 1.543, + "step": 7298 + }, + { + "epoch": 0.767911625460284, + "grad_norm": 2.4584033489227295, + "learning_rate": 2.6950290708569716e-05, + "loss": 1.558, + "step": 7299 + }, + { + "epoch": 0.7680168332456602, + "grad_norm": 1.853628396987915, + "learning_rate": 2.6927022976594607e-05, + "loss": 2.2037, + "step": 7300 + }, + { + "epoch": 0.7681220410310363, + "grad_norm": 1.1601117849349976, + "learning_rate": 2.6903763730343713e-05, + "loss": 1.8498, + "step": 7301 + }, + { + "epoch": 0.7682272488164125, + "grad_norm": 1.5906758308410645, + "learning_rate": 2.6880512972518047e-05, + "loss": 2.0151, + "step": 7302 + }, + { + "epoch": 0.7683324566017885, + "grad_norm": 1.081691026687622, + "learning_rate": 2.6857270705817595e-05, + "loss": 1.7113, + "step": 7303 + }, + { + "epoch": 0.7684376643871647, + "grad_norm": 1.651273250579834, + "learning_rate": 2.6834036932941474e-05, + "loss": 1.6251, + "step": 7304 + }, + { + "epoch": 0.7685428721725408, + "grad_norm": 1.15084969997406, + "learning_rate": 2.681081165658772e-05, + "loss": 1.5665, + "step": 7305 + }, + { + "epoch": 0.7686480799579168, + "grad_norm": 1.6445162296295166, + "learning_rate": 2.678759487945337e-05, + "loss": 1.7392, + "step": 7306 + }, + { + "epoch": 0.768753287743293, + "grad_norm": 1.2424595355987549, + "learning_rate": 2.676438660423457e-05, + "loss": 2.0125, + "step": 7307 + }, + { + "epoch": 0.7688584955286691, + "grad_norm": 2.284137487411499, + "learning_rate": 2.6741186833626407e-05, + "loss": 1.8287, + "step": 7308 + }, + { + "epoch": 0.7689637033140453, + "grad_norm": 2.1580452919006348, + "learning_rate": 2.6717995570322964e-05, + "loss": 1.9503, + "step": 7309 + }, + { + "epoch": 0.7690689110994213, + "grad_norm": 1.483157753944397, + "learning_rate": 2.669481281701739e-05, + "loss": 1.3867, + "step": 7310 + }, + { + "epoch": 0.7691741188847975, + "grad_norm": 1.2321768999099731, + "learning_rate": 2.667163857640179e-05, + "loss": 1.7758, + "step": 7311 + }, + { + "epoch": 0.7692793266701736, + "grad_norm": 1.4711391925811768, + "learning_rate": 2.664847285116736e-05, + "loss": 1.371, + "step": 7312 + }, + { + "epoch": 0.7693845344555497, + "grad_norm": 1.1832042932510376, + "learning_rate": 2.6625315644004244e-05, + "loss": 1.6168, + "step": 7313 + }, + { + "epoch": 0.7694897422409258, + "grad_norm": 1.5034308433532715, + "learning_rate": 2.660216695760157e-05, + "loss": 1.6241, + "step": 7314 + }, + { + "epoch": 0.769594950026302, + "grad_norm": 1.575340747833252, + "learning_rate": 2.6579026794647587e-05, + "loss": 1.6202, + "step": 7315 + }, + { + "epoch": 0.7697001578116781, + "grad_norm": 1.1293988227844238, + "learning_rate": 2.655589515782946e-05, + "loss": 1.7913, + "step": 7316 + }, + { + "epoch": 0.7698053655970541, + "grad_norm": 2.4325718879699707, + "learning_rate": 2.653277204983334e-05, + "loss": 1.902, + "step": 7317 + }, + { + "epoch": 0.7699105733824303, + "grad_norm": 1.1691190004348755, + "learning_rate": 2.650965747334452e-05, + "loss": 1.5993, + "step": 7318 + }, + { + "epoch": 0.7700157811678064, + "grad_norm": 1.888068675994873, + "learning_rate": 2.648655143104717e-05, + "loss": 1.406, + "step": 7319 + }, + { + "epoch": 0.7701209889531825, + "grad_norm": 1.8982930183410645, + "learning_rate": 2.6463453925624503e-05, + "loss": 1.584, + "step": 7320 + }, + { + "epoch": 0.7702261967385586, + "grad_norm": 1.547112226486206, + "learning_rate": 2.6440364959758813e-05, + "loss": 1.9128, + "step": 7321 + }, + { + "epoch": 0.7703314045239348, + "grad_norm": 2.7643849849700928, + "learning_rate": 2.641728453613127e-05, + "loss": 2.1104, + "step": 7322 + }, + { + "epoch": 0.7704366123093109, + "grad_norm": 1.3012899160385132, + "learning_rate": 2.6394212657422225e-05, + "loss": 1.4724, + "step": 7323 + }, + { + "epoch": 0.770541820094687, + "grad_norm": 1.8508931398391724, + "learning_rate": 2.6371149326310874e-05, + "loss": 1.8537, + "step": 7324 + }, + { + "epoch": 0.7706470278800631, + "grad_norm": 2.359945774078369, + "learning_rate": 2.6348094545475465e-05, + "loss": 1.7603, + "step": 7325 + }, + { + "epoch": 0.7707522356654393, + "grad_norm": 1.6346330642700195, + "learning_rate": 2.6325048317593337e-05, + "loss": 1.9917, + "step": 7326 + }, + { + "epoch": 0.7708574434508153, + "grad_norm": 1.718336820602417, + "learning_rate": 2.6302010645340746e-05, + "loss": 1.9148, + "step": 7327 + }, + { + "epoch": 0.7709626512361915, + "grad_norm": 2.0984418392181396, + "learning_rate": 2.6278981531392945e-05, + "loss": 1.852, + "step": 7328 + }, + { + "epoch": 0.7710678590215676, + "grad_norm": 1.1272097826004028, + "learning_rate": 2.625596097842432e-05, + "loss": 1.2315, + "step": 7329 + }, + { + "epoch": 0.7711730668069438, + "grad_norm": 1.9899998903274536, + "learning_rate": 2.6232948989108086e-05, + "loss": 1.5241, + "step": 7330 + }, + { + "epoch": 0.7712782745923198, + "grad_norm": 1.6918717622756958, + "learning_rate": 2.6209945566116545e-05, + "loss": 1.5811, + "step": 7331 + }, + { + "epoch": 0.7713834823776959, + "grad_norm": 2.1108553409576416, + "learning_rate": 2.618695071212107e-05, + "loss": 1.7367, + "step": 7332 + }, + { + "epoch": 0.7714886901630721, + "grad_norm": 2.049133062362671, + "learning_rate": 2.616396442979192e-05, + "loss": 1.7588, + "step": 7333 + }, + { + "epoch": 0.7715938979484482, + "grad_norm": 2.400456428527832, + "learning_rate": 2.6140986721798466e-05, + "loss": 1.5582, + "step": 7334 + }, + { + "epoch": 0.7716991057338243, + "grad_norm": 1.7933460474014282, + "learning_rate": 2.6118017590809017e-05, + "loss": 2.0447, + "step": 7335 + }, + { + "epoch": 0.7718043135192004, + "grad_norm": 1.4793000221252441, + "learning_rate": 2.6095057039490878e-05, + "loss": 1.9915, + "step": 7336 + }, + { + "epoch": 0.7719095213045766, + "grad_norm": 1.532442569732666, + "learning_rate": 2.6072105070510422e-05, + "loss": 1.7986, + "step": 7337 + }, + { + "epoch": 0.7720147290899526, + "grad_norm": 1.4576950073242188, + "learning_rate": 2.6049161686532965e-05, + "loss": 1.3017, + "step": 7338 + }, + { + "epoch": 0.7721199368753288, + "grad_norm": 1.2867484092712402, + "learning_rate": 2.6026226890222814e-05, + "loss": 1.3474, + "step": 7339 + }, + { + "epoch": 0.7722251446607049, + "grad_norm": 1.4471523761749268, + "learning_rate": 2.600330068424338e-05, + "loss": 1.8549, + "step": 7340 + }, + { + "epoch": 0.7723303524460811, + "grad_norm": 1.353736400604248, + "learning_rate": 2.5980383071256975e-05, + "loss": 1.4423, + "step": 7341 + }, + { + "epoch": 0.7724355602314571, + "grad_norm": 1.9009381532669067, + "learning_rate": 2.595747405392491e-05, + "loss": 1.3345, + "step": 7342 + }, + { + "epoch": 0.7725407680168332, + "grad_norm": 1.2530128955841064, + "learning_rate": 2.59345736349076e-05, + "loss": 1.5174, + "step": 7343 + }, + { + "epoch": 0.7726459758022094, + "grad_norm": 1.770195722579956, + "learning_rate": 2.5911681816864354e-05, + "loss": 1.6167, + "step": 7344 + }, + { + "epoch": 0.7727511835875854, + "grad_norm": 2.0301904678344727, + "learning_rate": 2.588879860245351e-05, + "loss": 1.6029, + "step": 7345 + }, + { + "epoch": 0.7728563913729616, + "grad_norm": 2.069531202316284, + "learning_rate": 2.5865923994332463e-05, + "loss": 1.8537, + "step": 7346 + }, + { + "epoch": 0.7729615991583377, + "grad_norm": 1.2178399562835693, + "learning_rate": 2.5843057995157548e-05, + "loss": 1.4361, + "step": 7347 + }, + { + "epoch": 0.7730668069437139, + "grad_norm": 1.5581814050674438, + "learning_rate": 2.582020060758409e-05, + "loss": 1.5575, + "step": 7348 + }, + { + "epoch": 0.7731720147290899, + "grad_norm": 1.6995668411254883, + "learning_rate": 2.579735183426649e-05, + "loss": 1.515, + "step": 7349 + }, + { + "epoch": 0.7732772225144661, + "grad_norm": 1.9909636974334717, + "learning_rate": 2.577451167785808e-05, + "loss": 2.097, + "step": 7350 + }, + { + "epoch": 0.7733824302998422, + "grad_norm": 1.49967622756958, + "learning_rate": 2.5751680141011214e-05, + "loss": 1.6389, + "step": 7351 + }, + { + "epoch": 0.7734876380852183, + "grad_norm": 1.4037280082702637, + "learning_rate": 2.5728857226377246e-05, + "loss": 1.6968, + "step": 7352 + }, + { + "epoch": 0.7735928458705944, + "grad_norm": 1.8199878931045532, + "learning_rate": 2.5706042936606477e-05, + "loss": 1.9098, + "step": 7353 + }, + { + "epoch": 0.7736980536559706, + "grad_norm": 1.3631402254104614, + "learning_rate": 2.5683237274348327e-05, + "loss": 1.8551, + "step": 7354 + }, + { + "epoch": 0.7738032614413467, + "grad_norm": 1.259263038635254, + "learning_rate": 2.5660440242251117e-05, + "loss": 1.6699, + "step": 7355 + }, + { + "epoch": 0.7739084692267227, + "grad_norm": 1.1155054569244385, + "learning_rate": 2.5637651842962164e-05, + "loss": 1.5838, + "step": 7356 + }, + { + "epoch": 0.7740136770120989, + "grad_norm": 1.8859355449676514, + "learning_rate": 2.561487207912785e-05, + "loss": 1.7185, + "step": 7357 + }, + { + "epoch": 0.774118884797475, + "grad_norm": 1.8426570892333984, + "learning_rate": 2.5592100953393504e-05, + "loss": 1.9184, + "step": 7358 + }, + { + "epoch": 0.7742240925828511, + "grad_norm": 1.5915162563323975, + "learning_rate": 2.5569338468403426e-05, + "loss": 1.8337, + "step": 7359 + }, + { + "epoch": 0.7743293003682272, + "grad_norm": 1.3176912069320679, + "learning_rate": 2.5546584626801006e-05, + "loss": 1.5611, + "step": 7360 + }, + { + "epoch": 0.7744345081536034, + "grad_norm": 1.1959328651428223, + "learning_rate": 2.5523839431228537e-05, + "loss": 2.022, + "step": 7361 + }, + { + "epoch": 0.7745397159389795, + "grad_norm": 1.117728590965271, + "learning_rate": 2.550110288432733e-05, + "loss": 2.0816, + "step": 7362 + }, + { + "epoch": 0.7746449237243556, + "grad_norm": 1.091355800628662, + "learning_rate": 2.5478374988737753e-05, + "loss": 1.4844, + "step": 7363 + }, + { + "epoch": 0.7747501315097317, + "grad_norm": 1.6453313827514648, + "learning_rate": 2.5455655747099093e-05, + "loss": 1.8608, + "step": 7364 + }, + { + "epoch": 0.7748553392951079, + "grad_norm": 1.3026262521743774, + "learning_rate": 2.543294516204964e-05, + "loss": 1.9639, + "step": 7365 + }, + { + "epoch": 0.774960547080484, + "grad_norm": 1.868897795677185, + "learning_rate": 2.541024323622674e-05, + "loss": 1.6702, + "step": 7366 + }, + { + "epoch": 0.77506575486586, + "grad_norm": 2.3049232959747314, + "learning_rate": 2.5387549972266678e-05, + "loss": 1.1612, + "step": 7367 + }, + { + "epoch": 0.7751709626512362, + "grad_norm": 1.9099220037460327, + "learning_rate": 2.5364865372804712e-05, + "loss": 1.3566, + "step": 7368 + }, + { + "epoch": 0.7752761704366123, + "grad_norm": 1.5426534414291382, + "learning_rate": 2.5342189440475204e-05, + "loss": 1.7221, + "step": 7369 + }, + { + "epoch": 0.7753813782219884, + "grad_norm": 1.1355007886886597, + "learning_rate": 2.531952217791136e-05, + "loss": 1.6406, + "step": 7370 + }, + { + "epoch": 0.7754865860073645, + "grad_norm": 1.2040497064590454, + "learning_rate": 2.529686358774551e-05, + "loss": 1.8762, + "step": 7371 + }, + { + "epoch": 0.7755917937927407, + "grad_norm": 1.0842825174331665, + "learning_rate": 2.5274213672608936e-05, + "loss": 1.9909, + "step": 7372 + }, + { + "epoch": 0.7756970015781168, + "grad_norm": 2.0430164337158203, + "learning_rate": 2.52515724351318e-05, + "loss": 1.8172, + "step": 7373 + }, + { + "epoch": 0.7758022093634929, + "grad_norm": 1.6244868040084839, + "learning_rate": 2.5228939877943448e-05, + "loss": 1.9493, + "step": 7374 + }, + { + "epoch": 0.775907417148869, + "grad_norm": 2.7612531185150146, + "learning_rate": 2.520631600367209e-05, + "loss": 2.1905, + "step": 7375 + }, + { + "epoch": 0.7760126249342452, + "grad_norm": 1.8194602727890015, + "learning_rate": 2.5183700814944945e-05, + "loss": 1.7323, + "step": 7376 + }, + { + "epoch": 0.7761178327196212, + "grad_norm": 1.4295552968978882, + "learning_rate": 2.5161094314388278e-05, + "loss": 1.9286, + "step": 7377 + }, + { + "epoch": 0.7762230405049974, + "grad_norm": 1.6510827541351318, + "learning_rate": 2.5138496504627263e-05, + "loss": 1.4062, + "step": 7378 + }, + { + "epoch": 0.7763282482903735, + "grad_norm": 1.3370580673217773, + "learning_rate": 2.5115907388286165e-05, + "loss": 0.8727, + "step": 7379 + }, + { + "epoch": 0.7764334560757497, + "grad_norm": 1.845016598701477, + "learning_rate": 2.509332696798816e-05, + "loss": 2.2291, + "step": 7380 + }, + { + "epoch": 0.7765386638611257, + "grad_norm": 2.652451515197754, + "learning_rate": 2.5070755246355393e-05, + "loss": 1.9059, + "step": 7381 + }, + { + "epoch": 0.7766438716465018, + "grad_norm": 1.796697974205017, + "learning_rate": 2.5048192226009126e-05, + "loss": 0.9683, + "step": 7382 + }, + { + "epoch": 0.776749079431878, + "grad_norm": 2.484699010848999, + "learning_rate": 2.5025637909569475e-05, + "loss": 1.9713, + "step": 7383 + }, + { + "epoch": 0.776854287217254, + "grad_norm": 1.428265929222107, + "learning_rate": 2.5003092299655584e-05, + "loss": 1.8807, + "step": 7384 + }, + { + "epoch": 0.7769594950026302, + "grad_norm": 1.6806254386901855, + "learning_rate": 2.4980555398885653e-05, + "loss": 2.3045, + "step": 7385 + }, + { + "epoch": 0.7770647027880063, + "grad_norm": 1.6273754835128784, + "learning_rate": 2.4958027209876788e-05, + "loss": 1.5729, + "step": 7386 + }, + { + "epoch": 0.7771699105733825, + "grad_norm": 1.8025354146957397, + "learning_rate": 2.493550773524509e-05, + "loss": 1.7046, + "step": 7387 + }, + { + "epoch": 0.7772751183587585, + "grad_norm": 1.5753192901611328, + "learning_rate": 2.4912996977605718e-05, + "loss": 1.2373, + "step": 7388 + }, + { + "epoch": 0.7773803261441347, + "grad_norm": 1.7229233980178833, + "learning_rate": 2.4890494939572762e-05, + "loss": 1.7167, + "step": 7389 + }, + { + "epoch": 0.7774855339295108, + "grad_norm": 2.0432682037353516, + "learning_rate": 2.4868001623759263e-05, + "loss": 1.456, + "step": 7390 + }, + { + "epoch": 0.7775907417148868, + "grad_norm": 1.1946852207183838, + "learning_rate": 2.4845517032777364e-05, + "loss": 1.5981, + "step": 7391 + }, + { + "epoch": 0.777695949500263, + "grad_norm": 1.7232786417007446, + "learning_rate": 2.4823041169238092e-05, + "loss": 1.8003, + "step": 7392 + }, + { + "epoch": 0.7778011572856391, + "grad_norm": 2.1846988201141357, + "learning_rate": 2.480057403575148e-05, + "loss": 1.6615, + "step": 7393 + }, + { + "epoch": 0.7779063650710153, + "grad_norm": 1.523494839668274, + "learning_rate": 2.4778115634926624e-05, + "loss": 1.9023, + "step": 7394 + }, + { + "epoch": 0.7780115728563913, + "grad_norm": 1.5766888856887817, + "learning_rate": 2.4755665969371446e-05, + "loss": 1.6448, + "step": 7395 + }, + { + "epoch": 0.7781167806417675, + "grad_norm": 1.718214750289917, + "learning_rate": 2.4733225041693033e-05, + "loss": 1.8259, + "step": 7396 + }, + { + "epoch": 0.7782219884271436, + "grad_norm": 1.3229570388793945, + "learning_rate": 2.4710792854497346e-05, + "loss": 1.775, + "step": 7397 + }, + { + "epoch": 0.7783271962125198, + "grad_norm": 1.1155831813812256, + "learning_rate": 2.4688369410389334e-05, + "loss": 1.6853, + "step": 7398 + }, + { + "epoch": 0.7784324039978958, + "grad_norm": 1.7406342029571533, + "learning_rate": 2.4665954711973017e-05, + "loss": 1.4841, + "step": 7399 + }, + { + "epoch": 0.778537611783272, + "grad_norm": 1.489790439605713, + "learning_rate": 2.464354876185131e-05, + "loss": 1.4955, + "step": 7400 + }, + { + "epoch": 0.7786428195686481, + "grad_norm": 1.8641880750656128, + "learning_rate": 2.462115156262612e-05, + "loss": 1.6168, + "step": 7401 + }, + { + "epoch": 0.7787480273540242, + "grad_norm": 1.3012237548828125, + "learning_rate": 2.4598763116898405e-05, + "loss": 2.0688, + "step": 7402 + }, + { + "epoch": 0.7788532351394003, + "grad_norm": 2.4502408504486084, + "learning_rate": 2.4576383427268034e-05, + "loss": 1.3047, + "step": 7403 + }, + { + "epoch": 0.7789584429247765, + "grad_norm": 1.7557679414749146, + "learning_rate": 2.455401249633387e-05, + "loss": 1.3024, + "step": 7404 + }, + { + "epoch": 0.7790636507101526, + "grad_norm": 2.018825054168701, + "learning_rate": 2.4531650326693822e-05, + "loss": 2.3045, + "step": 7405 + }, + { + "epoch": 0.7791688584955286, + "grad_norm": 1.1796764135360718, + "learning_rate": 2.4509296920944712e-05, + "loss": 1.772, + "step": 7406 + }, + { + "epoch": 0.7792740662809048, + "grad_norm": 1.2721928358078003, + "learning_rate": 2.448695228168234e-05, + "loss": 1.2919, + "step": 7407 + }, + { + "epoch": 0.7793792740662809, + "grad_norm": 1.6079884767532349, + "learning_rate": 2.4464616411501572e-05, + "loss": 1.4524, + "step": 7408 + }, + { + "epoch": 0.779484481851657, + "grad_norm": 1.7307249307632446, + "learning_rate": 2.4442289312996158e-05, + "loss": 1.9304, + "step": 7409 + }, + { + "epoch": 0.7795896896370331, + "grad_norm": 1.4798263311386108, + "learning_rate": 2.4419970988758857e-05, + "loss": 1.9729, + "step": 7410 + }, + { + "epoch": 0.7796948974224093, + "grad_norm": 1.2387443780899048, + "learning_rate": 2.439766144138148e-05, + "loss": 1.5655, + "step": 7411 + }, + { + "epoch": 0.7798001052077854, + "grad_norm": 1.4330637454986572, + "learning_rate": 2.4375360673454718e-05, + "loss": 1.3443, + "step": 7412 + }, + { + "epoch": 0.7799053129931615, + "grad_norm": 1.1398248672485352, + "learning_rate": 2.435306868756827e-05, + "loss": 1.8242, + "step": 7413 + }, + { + "epoch": 0.7800105207785376, + "grad_norm": 2.0635628700256348, + "learning_rate": 2.433078548631088e-05, + "loss": 1.3481, + "step": 7414 + }, + { + "epoch": 0.7801157285639138, + "grad_norm": 1.374840259552002, + "learning_rate": 2.4308511072270202e-05, + "loss": 1.5717, + "step": 7415 + }, + { + "epoch": 0.7802209363492898, + "grad_norm": 1.5022417306900024, + "learning_rate": 2.4286245448032895e-05, + "loss": 1.6766, + "step": 7416 + }, + { + "epoch": 0.7803261441346659, + "grad_norm": 1.2573763132095337, + "learning_rate": 2.4263988616184574e-05, + "loss": 1.5103, + "step": 7417 + }, + { + "epoch": 0.7804313519200421, + "grad_norm": 1.6818538904190063, + "learning_rate": 2.4241740579309836e-05, + "loss": 0.8336, + "step": 7418 + }, + { + "epoch": 0.7805365597054182, + "grad_norm": 1.3387900590896606, + "learning_rate": 2.4219501339992334e-05, + "loss": 1.8443, + "step": 7419 + }, + { + "epoch": 0.7806417674907943, + "grad_norm": 1.4211339950561523, + "learning_rate": 2.4197270900814594e-05, + "loss": 1.7912, + "step": 7420 + }, + { + "epoch": 0.7807469752761704, + "grad_norm": 1.4029207229614258, + "learning_rate": 2.417504926435814e-05, + "loss": 1.2693, + "step": 7421 + }, + { + "epoch": 0.7808521830615466, + "grad_norm": 1.4126625061035156, + "learning_rate": 2.415283643320356e-05, + "loss": 1.4289, + "step": 7422 + }, + { + "epoch": 0.7809573908469227, + "grad_norm": 1.4173007011413574, + "learning_rate": 2.413063240993031e-05, + "loss": 1.8065, + "step": 7423 + }, + { + "epoch": 0.7810625986322988, + "grad_norm": 1.166293740272522, + "learning_rate": 2.4108437197116905e-05, + "loss": 1.9179, + "step": 7424 + }, + { + "epoch": 0.7811678064176749, + "grad_norm": 1.7653440237045288, + "learning_rate": 2.408625079734078e-05, + "loss": 1.5022, + "step": 7425 + }, + { + "epoch": 0.7812730142030511, + "grad_norm": 1.2314149141311646, + "learning_rate": 2.406407321317835e-05, + "loss": 1.5374, + "step": 7426 + }, + { + "epoch": 0.7813782219884271, + "grad_norm": 1.553765058517456, + "learning_rate": 2.4041904447205067e-05, + "loss": 1.6092, + "step": 7427 + }, + { + "epoch": 0.7814834297738033, + "grad_norm": 1.543643593788147, + "learning_rate": 2.40197445019953e-05, + "loss": 1.2104, + "step": 7428 + }, + { + "epoch": 0.7815886375591794, + "grad_norm": 1.2574193477630615, + "learning_rate": 2.3997593380122386e-05, + "loss": 1.6868, + "step": 7429 + }, + { + "epoch": 0.7816938453445555, + "grad_norm": 1.3462152481079102, + "learning_rate": 2.3975451084158707e-05, + "loss": 1.5889, + "step": 7430 + }, + { + "epoch": 0.7817990531299316, + "grad_norm": 1.5161739587783813, + "learning_rate": 2.395331761667554e-05, + "loss": 1.805, + "step": 7431 + }, + { + "epoch": 0.7819042609153077, + "grad_norm": 1.5476092100143433, + "learning_rate": 2.3931192980243166e-05, + "loss": 1.4936, + "step": 7432 + }, + { + "epoch": 0.7820094687006839, + "grad_norm": 1.412503957748413, + "learning_rate": 2.3909077177430893e-05, + "loss": 1.5105, + "step": 7433 + }, + { + "epoch": 0.7821146764860599, + "grad_norm": 1.7502893209457397, + "learning_rate": 2.3886970210806915e-05, + "loss": 1.1703, + "step": 7434 + }, + { + "epoch": 0.7822198842714361, + "grad_norm": 1.8714232444763184, + "learning_rate": 2.3864872082938426e-05, + "loss": 1.2081, + "step": 7435 + }, + { + "epoch": 0.7823250920568122, + "grad_norm": 1.5723828077316284, + "learning_rate": 2.3842782796391672e-05, + "loss": 1.4935, + "step": 7436 + }, + { + "epoch": 0.7824302998421884, + "grad_norm": 1.6067105531692505, + "learning_rate": 2.3820702353731773e-05, + "loss": 1.9455, + "step": 7437 + }, + { + "epoch": 0.7825355076275644, + "grad_norm": 1.2671598196029663, + "learning_rate": 2.3798630757522844e-05, + "loss": 1.6182, + "step": 7438 + }, + { + "epoch": 0.7826407154129406, + "grad_norm": 1.860741138458252, + "learning_rate": 2.3776568010328003e-05, + "loss": 1.8387, + "step": 7439 + }, + { + "epoch": 0.7827459231983167, + "grad_norm": 1.9345104694366455, + "learning_rate": 2.3754514114709304e-05, + "loss": 1.4232, + "step": 7440 + }, + { + "epoch": 0.7828511309836927, + "grad_norm": 2.1556923389434814, + "learning_rate": 2.3732469073227827e-05, + "loss": 1.588, + "step": 7441 + }, + { + "epoch": 0.7829563387690689, + "grad_norm": 1.9754849672317505, + "learning_rate": 2.371043288844358e-05, + "loss": 1.6852, + "step": 7442 + }, + { + "epoch": 0.783061546554445, + "grad_norm": 1.2634505033493042, + "learning_rate": 2.3688405562915517e-05, + "loss": 1.8126, + "step": 7443 + }, + { + "epoch": 0.7831667543398212, + "grad_norm": 2.078622579574585, + "learning_rate": 2.3666387099201648e-05, + "loss": 0.9941, + "step": 7444 + }, + { + "epoch": 0.7832719621251972, + "grad_norm": 1.6752347946166992, + "learning_rate": 2.3644377499858893e-05, + "loss": 1.8038, + "step": 7445 + }, + { + "epoch": 0.7833771699105734, + "grad_norm": 1.0644302368164062, + "learning_rate": 2.3622376767443123e-05, + "loss": 1.8299, + "step": 7446 + }, + { + "epoch": 0.7834823776959495, + "grad_norm": 1.3895219564437866, + "learning_rate": 2.3600384904509254e-05, + "loss": 1.8465, + "step": 7447 + }, + { + "epoch": 0.7835875854813256, + "grad_norm": 1.346755862236023, + "learning_rate": 2.3578401913611103e-05, + "loss": 1.5028, + "step": 7448 + }, + { + "epoch": 0.7836927932667017, + "grad_norm": 1.92898428440094, + "learning_rate": 2.3556427797301462e-05, + "loss": 1.1488, + "step": 7449 + }, + { + "epoch": 0.7837980010520779, + "grad_norm": 1.4644855260849, + "learning_rate": 2.3534462558132177e-05, + "loss": 1.8202, + "step": 7450 + }, + { + "epoch": 0.783903208837454, + "grad_norm": 1.326379656791687, + "learning_rate": 2.3512506198653948e-05, + "loss": 1.3529, + "step": 7451 + }, + { + "epoch": 0.78400841662283, + "grad_norm": 1.445070505142212, + "learning_rate": 2.3490558721416477e-05, + "loss": 1.7824, + "step": 7452 + }, + { + "epoch": 0.7841136244082062, + "grad_norm": 1.5502876043319702, + "learning_rate": 2.346862012896852e-05, + "loss": 1.8986, + "step": 7453 + }, + { + "epoch": 0.7842188321935823, + "grad_norm": 1.3673009872436523, + "learning_rate": 2.3446690423857685e-05, + "loss": 1.4538, + "step": 7454 + }, + { + "epoch": 0.7843240399789585, + "grad_norm": 2.0649657249450684, + "learning_rate": 2.3424769608630593e-05, + "loss": 1.7845, + "step": 7455 + }, + { + "epoch": 0.7844292477643345, + "grad_norm": 1.5386171340942383, + "learning_rate": 2.340285768583287e-05, + "loss": 2.1823, + "step": 7456 + }, + { + "epoch": 0.7845344555497107, + "grad_norm": 2.3255977630615234, + "learning_rate": 2.3380954658009057e-05, + "loss": 1.5615, + "step": 7457 + }, + { + "epoch": 0.7846396633350868, + "grad_norm": 2.3962433338165283, + "learning_rate": 2.335906052770267e-05, + "loss": 1.4474, + "step": 7458 + }, + { + "epoch": 0.7847448711204629, + "grad_norm": 1.4439377784729004, + "learning_rate": 2.3337175297456225e-05, + "loss": 2.055, + "step": 7459 + }, + { + "epoch": 0.784850078905839, + "grad_norm": 1.2580662965774536, + "learning_rate": 2.3315298969811127e-05, + "loss": 1.9116, + "step": 7460 + }, + { + "epoch": 0.7849552866912152, + "grad_norm": 2.165815830230713, + "learning_rate": 2.3293431547307887e-05, + "loss": 1.5773, + "step": 7461 + }, + { + "epoch": 0.7850604944765913, + "grad_norm": 1.3377245664596558, + "learning_rate": 2.327157303248584e-05, + "loss": 1.23, + "step": 7462 + }, + { + "epoch": 0.7851657022619674, + "grad_norm": 1.4234360456466675, + "learning_rate": 2.324972342788333e-05, + "loss": 1.5741, + "step": 7463 + }, + { + "epoch": 0.7852709100473435, + "grad_norm": 1.4981180429458618, + "learning_rate": 2.3227882736037732e-05, + "loss": 1.6655, + "step": 7464 + }, + { + "epoch": 0.7853761178327197, + "grad_norm": 1.7403837442398071, + "learning_rate": 2.3206050959485314e-05, + "loss": 1.9563, + "step": 7465 + }, + { + "epoch": 0.7854813256180957, + "grad_norm": 1.0999480485916138, + "learning_rate": 2.3184228100761285e-05, + "loss": 1.9103, + "step": 7466 + }, + { + "epoch": 0.7855865334034718, + "grad_norm": 1.1709569692611694, + "learning_rate": 2.316241416239994e-05, + "loss": 1.8448, + "step": 7467 + }, + { + "epoch": 0.785691741188848, + "grad_norm": 2.044626235961914, + "learning_rate": 2.31406091469344e-05, + "loss": 1.7845, + "step": 7468 + }, + { + "epoch": 0.7857969489742241, + "grad_norm": 1.8200260400772095, + "learning_rate": 2.3118813056896814e-05, + "loss": 1.8616, + "step": 7469 + }, + { + "epoch": 0.7859021567596002, + "grad_norm": 1.0556297302246094, + "learning_rate": 2.3097025894818326e-05, + "loss": 1.9248, + "step": 7470 + }, + { + "epoch": 0.7860073645449763, + "grad_norm": 1.9120993614196777, + "learning_rate": 2.307524766322896e-05, + "loss": 1.1505, + "step": 7471 + }, + { + "epoch": 0.7861125723303525, + "grad_norm": 1.3910388946533203, + "learning_rate": 2.30534783646578e-05, + "loss": 2.0402, + "step": 7472 + }, + { + "epoch": 0.7862177801157285, + "grad_norm": 2.19036602973938, + "learning_rate": 2.303171800163282e-05, + "loss": 1.7415, + "step": 7473 + }, + { + "epoch": 0.7863229879011047, + "grad_norm": 1.8024297952651978, + "learning_rate": 2.300996657668095e-05, + "loss": 1.6391, + "step": 7474 + }, + { + "epoch": 0.7864281956864808, + "grad_norm": 1.367393136024475, + "learning_rate": 2.298822409232817e-05, + "loss": 1.9299, + "step": 7475 + }, + { + "epoch": 0.786533403471857, + "grad_norm": 1.068220615386963, + "learning_rate": 2.2966490551099328e-05, + "loss": 1.2364, + "step": 7476 + }, + { + "epoch": 0.786638611257233, + "grad_norm": 1.452365756034851, + "learning_rate": 2.2944765955518242e-05, + "loss": 1.0474, + "step": 7477 + }, + { + "epoch": 0.7867438190426091, + "grad_norm": 1.1206425428390503, + "learning_rate": 2.2923050308107785e-05, + "loss": 1.9693, + "step": 7478 + }, + { + "epoch": 0.7868490268279853, + "grad_norm": 1.8541259765625, + "learning_rate": 2.290134361138968e-05, + "loss": 1.7089, + "step": 7479 + }, + { + "epoch": 0.7869542346133613, + "grad_norm": 1.5118783712387085, + "learning_rate": 2.287964586788467e-05, + "loss": 1.7571, + "step": 7480 + }, + { + "epoch": 0.7870594423987375, + "grad_norm": 1.5137947797775269, + "learning_rate": 2.2857957080112423e-05, + "loss": 1.0681, + "step": 7481 + }, + { + "epoch": 0.7871646501841136, + "grad_norm": 1.3139784336090088, + "learning_rate": 2.2836277250591574e-05, + "loss": 1.2861, + "step": 7482 + }, + { + "epoch": 0.7872698579694898, + "grad_norm": 2.15798020362854, + "learning_rate": 2.2814606381839786e-05, + "loss": 1.4572, + "step": 7483 + }, + { + "epoch": 0.7873750657548658, + "grad_norm": 1.4466586112976074, + "learning_rate": 2.279294447637359e-05, + "loss": 1.554, + "step": 7484 + }, + { + "epoch": 0.787480273540242, + "grad_norm": 2.4875476360321045, + "learning_rate": 2.2771291536708494e-05, + "loss": 2.1975, + "step": 7485 + }, + { + "epoch": 0.7875854813256181, + "grad_norm": 1.745013952255249, + "learning_rate": 2.2749647565359024e-05, + "loss": 1.6658, + "step": 7486 + }, + { + "epoch": 0.7876906891109943, + "grad_norm": 0.8909004926681519, + "learning_rate": 2.2728012564838608e-05, + "loss": 1.9497, + "step": 7487 + }, + { + "epoch": 0.7877958968963703, + "grad_norm": 1.4795335531234741, + "learning_rate": 2.2706386537659606e-05, + "loss": 1.4597, + "step": 7488 + }, + { + "epoch": 0.7879011046817465, + "grad_norm": 1.7913676500320435, + "learning_rate": 2.2684769486333445e-05, + "loss": 1.4416, + "step": 7489 + }, + { + "epoch": 0.7880063124671226, + "grad_norm": 1.4431122541427612, + "learning_rate": 2.2663161413370415e-05, + "loss": 1.8649, + "step": 7490 + }, + { + "epoch": 0.7881115202524986, + "grad_norm": 1.1279875040054321, + "learning_rate": 2.2641562321279752e-05, + "loss": 1.5093, + "step": 7491 + }, + { + "epoch": 0.7882167280378748, + "grad_norm": 1.534596562385559, + "learning_rate": 2.2619972212569752e-05, + "loss": 1.663, + "step": 7492 + }, + { + "epoch": 0.7883219358232509, + "grad_norm": 1.2349904775619507, + "learning_rate": 2.259839108974757e-05, + "loss": 1.8006, + "step": 7493 + }, + { + "epoch": 0.7884271436086271, + "grad_norm": 1.3401429653167725, + "learning_rate": 2.2576818955319333e-05, + "loss": 1.2232, + "step": 7494 + }, + { + "epoch": 0.7885323513940031, + "grad_norm": 1.3276652097702026, + "learning_rate": 2.2555255811790177e-05, + "loss": 1.3863, + "step": 7495 + }, + { + "epoch": 0.7886375591793793, + "grad_norm": 2.36476731300354, + "learning_rate": 2.2533701661664154e-05, + "loss": 1.9646, + "step": 7496 + }, + { + "epoch": 0.7887427669647554, + "grad_norm": 1.2848833799362183, + "learning_rate": 2.251215650744424e-05, + "loss": 1.5951, + "step": 7497 + }, + { + "epoch": 0.7888479747501315, + "grad_norm": 1.5567632913589478, + "learning_rate": 2.2490620351632452e-05, + "loss": 1.1698, + "step": 7498 + }, + { + "epoch": 0.7889531825355076, + "grad_norm": 2.238565444946289, + "learning_rate": 2.2469093196729696e-05, + "loss": 1.9644, + "step": 7499 + }, + { + "epoch": 0.7890583903208838, + "grad_norm": 1.1401231288909912, + "learning_rate": 2.24475750452358e-05, + "loss": 1.5368, + "step": 7500 + }, + { + "epoch": 0.7891635981062599, + "grad_norm": 2.0586628913879395, + "learning_rate": 2.242606589964972e-05, + "loss": 1.9507, + "step": 7501 + }, + { + "epoch": 0.789268805891636, + "grad_norm": 1.760940670967102, + "learning_rate": 2.2404565762469088e-05, + "loss": 1.8004, + "step": 7502 + }, + { + "epoch": 0.7893740136770121, + "grad_norm": 1.8332325220108032, + "learning_rate": 2.2383074636190748e-05, + "loss": 1.1507, + "step": 7503 + }, + { + "epoch": 0.7894792214623882, + "grad_norm": 1.1777528524398804, + "learning_rate": 2.236159252331037e-05, + "loss": 1.8222, + "step": 7504 + }, + { + "epoch": 0.7895844292477643, + "grad_norm": 2.514356851577759, + "learning_rate": 2.234011942632257e-05, + "loss": 1.0901, + "step": 7505 + }, + { + "epoch": 0.7896896370331404, + "grad_norm": 1.4754233360290527, + "learning_rate": 2.2318655347720995e-05, + "loss": 1.3529, + "step": 7506 + }, + { + "epoch": 0.7897948448185166, + "grad_norm": 1.38016939163208, + "learning_rate": 2.2297200289998176e-05, + "loss": 1.4848, + "step": 7507 + }, + { + "epoch": 0.7899000526038927, + "grad_norm": 2.304391384124756, + "learning_rate": 2.2275754255645587e-05, + "loss": 1.2804, + "step": 7508 + }, + { + "epoch": 0.7900052603892688, + "grad_norm": 2.2973196506500244, + "learning_rate": 2.2254317247153746e-05, + "loss": 1.3045, + "step": 7509 + }, + { + "epoch": 0.7901104681746449, + "grad_norm": 1.2090044021606445, + "learning_rate": 2.2232889267012038e-05, + "loss": 2.2979, + "step": 7510 + }, + { + "epoch": 0.7902156759600211, + "grad_norm": 1.6775548458099365, + "learning_rate": 2.221147031770878e-05, + "loss": 1.7299, + "step": 7511 + }, + { + "epoch": 0.7903208837453971, + "grad_norm": 1.9138872623443604, + "learning_rate": 2.2190060401731362e-05, + "loss": 1.9317, + "step": 7512 + }, + { + "epoch": 0.7904260915307733, + "grad_norm": 1.4541093111038208, + "learning_rate": 2.2168659521566004e-05, + "loss": 1.3817, + "step": 7513 + }, + { + "epoch": 0.7905312993161494, + "grad_norm": 2.349430799484253, + "learning_rate": 2.2147267679697892e-05, + "loss": 1.1925, + "step": 7514 + }, + { + "epoch": 0.7906365071015256, + "grad_norm": 2.0313339233398438, + "learning_rate": 2.2125884878611258e-05, + "loss": 1.643, + "step": 7515 + }, + { + "epoch": 0.7907417148869016, + "grad_norm": 1.2877901792526245, + "learning_rate": 2.210451112078914e-05, + "loss": 1.2234, + "step": 7516 + }, + { + "epoch": 0.7908469226722777, + "grad_norm": 1.4288432598114014, + "learning_rate": 2.2083146408713673e-05, + "loss": 1.4857, + "step": 7517 + }, + { + "epoch": 0.7909521304576539, + "grad_norm": 1.7954027652740479, + "learning_rate": 2.206179074486584e-05, + "loss": 1.359, + "step": 7518 + }, + { + "epoch": 0.79105733824303, + "grad_norm": 1.427760124206543, + "learning_rate": 2.204044413172558e-05, + "loss": 2.6701, + "step": 7519 + }, + { + "epoch": 0.7911625460284061, + "grad_norm": 1.8676557540893555, + "learning_rate": 2.201910657177185e-05, + "loss": 1.5542, + "step": 7520 + }, + { + "epoch": 0.7912677538137822, + "grad_norm": 2.1520488262176514, + "learning_rate": 2.19977780674825e-05, + "loss": 1.6475, + "step": 7521 + }, + { + "epoch": 0.7913729615991584, + "grad_norm": 2.3536930084228516, + "learning_rate": 2.1976458621334317e-05, + "loss": 1.2516, + "step": 7522 + }, + { + "epoch": 0.7914781693845344, + "grad_norm": 1.8606661558151245, + "learning_rate": 2.195514823580307e-05, + "loss": 1.6912, + "step": 7523 + }, + { + "epoch": 0.7915833771699106, + "grad_norm": 1.4110456705093384, + "learning_rate": 2.1933846913363466e-05, + "loss": 1.6379, + "step": 7524 + }, + { + "epoch": 0.7916885849552867, + "grad_norm": 1.5939291715621948, + "learning_rate": 2.1912554656489127e-05, + "loss": 1.5101, + "step": 7525 + }, + { + "epoch": 0.7917937927406629, + "grad_norm": 1.7965341806411743, + "learning_rate": 2.1891271467652696e-05, + "loss": 1.6246, + "step": 7526 + }, + { + "epoch": 0.7918990005260389, + "grad_norm": 1.8288224935531616, + "learning_rate": 2.186999734932569e-05, + "loss": 1.5909, + "step": 7527 + }, + { + "epoch": 0.792004208311415, + "grad_norm": 1.5028138160705566, + "learning_rate": 2.1848732303978638e-05, + "loss": 1.6557, + "step": 7528 + }, + { + "epoch": 0.7921094160967912, + "grad_norm": 1.5027766227722168, + "learning_rate": 2.1827476334080953e-05, + "loss": 1.3482, + "step": 7529 + }, + { + "epoch": 0.7922146238821672, + "grad_norm": 1.7562695741653442, + "learning_rate": 2.1806229442101e-05, + "loss": 2.0568, + "step": 7530 + }, + { + "epoch": 0.7923198316675434, + "grad_norm": 1.675010085105896, + "learning_rate": 2.178499163050617e-05, + "loss": 1.6856, + "step": 7531 + }, + { + "epoch": 0.7924250394529195, + "grad_norm": 1.9306681156158447, + "learning_rate": 2.1763762901762696e-05, + "loss": 1.4367, + "step": 7532 + }, + { + "epoch": 0.7925302472382957, + "grad_norm": 1.7994093894958496, + "learning_rate": 2.174254325833579e-05, + "loss": 1.6346, + "step": 7533 + }, + { + "epoch": 0.7926354550236717, + "grad_norm": 1.6323491334915161, + "learning_rate": 2.172133270268967e-05, + "loss": 1.7997, + "step": 7534 + }, + { + "epoch": 0.7927406628090479, + "grad_norm": 1.3499654531478882, + "learning_rate": 2.1700131237287414e-05, + "loss": 1.4139, + "step": 7535 + }, + { + "epoch": 0.792845870594424, + "grad_norm": 1.7880961894989014, + "learning_rate": 2.1678938864591046e-05, + "loss": 1.2011, + "step": 7536 + }, + { + "epoch": 0.7929510783798001, + "grad_norm": 1.5255318880081177, + "learning_rate": 2.1657755587061644e-05, + "loss": 1.9614, + "step": 7537 + }, + { + "epoch": 0.7930562861651762, + "grad_norm": 1.1118429899215698, + "learning_rate": 2.1636581407159105e-05, + "loss": 1.2734, + "step": 7538 + }, + { + "epoch": 0.7931614939505524, + "grad_norm": 1.6367865800857544, + "learning_rate": 2.1615416327342296e-05, + "loss": 2.0369, + "step": 7539 + }, + { + "epoch": 0.7932667017359285, + "grad_norm": 1.7008229494094849, + "learning_rate": 2.1594260350069096e-05, + "loss": 1.5876, + "step": 7540 + }, + { + "epoch": 0.7933719095213045, + "grad_norm": 1.5725849866867065, + "learning_rate": 2.157311347779626e-05, + "loss": 1.5209, + "step": 7541 + }, + { + "epoch": 0.7934771173066807, + "grad_norm": 1.7557533979415894, + "learning_rate": 2.1551975712979478e-05, + "loss": 1.2324, + "step": 7542 + }, + { + "epoch": 0.7935823250920568, + "grad_norm": 1.1576135158538818, + "learning_rate": 2.1530847058073466e-05, + "loss": 2.2651, + "step": 7543 + }, + { + "epoch": 0.7936875328774329, + "grad_norm": 1.6503269672393799, + "learning_rate": 2.1509727515531786e-05, + "loss": 1.9303, + "step": 7544 + }, + { + "epoch": 0.793792740662809, + "grad_norm": 2.0574920177459717, + "learning_rate": 2.1488617087806982e-05, + "loss": 1.6262, + "step": 7545 + }, + { + "epoch": 0.7938979484481852, + "grad_norm": 1.3746944665908813, + "learning_rate": 2.1467515777350544e-05, + "loss": 1.2937, + "step": 7546 + }, + { + "epoch": 0.7940031562335613, + "grad_norm": 1.4370603561401367, + "learning_rate": 2.1446423586612886e-05, + "loss": 1.3888, + "step": 7547 + }, + { + "epoch": 0.7941083640189374, + "grad_norm": 1.5636769533157349, + "learning_rate": 2.142534051804339e-05, + "loss": 1.312, + "step": 7548 + }, + { + "epoch": 0.7942135718043135, + "grad_norm": 1.7676070928573608, + "learning_rate": 2.140426657409038e-05, + "loss": 1.97, + "step": 7549 + }, + { + "epoch": 0.7943187795896897, + "grad_norm": 1.89109468460083, + "learning_rate": 2.1383201757201042e-05, + "loss": 1.271, + "step": 7550 + }, + { + "epoch": 0.7944239873750658, + "grad_norm": 0.966954231262207, + "learning_rate": 2.1362146069821643e-05, + "loss": 1.6096, + "step": 7551 + }, + { + "epoch": 0.7945291951604418, + "grad_norm": 1.17153799533844, + "learning_rate": 2.1341099514397266e-05, + "loss": 1.5011, + "step": 7552 + }, + { + "epoch": 0.794634402945818, + "grad_norm": 1.4526724815368652, + "learning_rate": 2.132006209337195e-05, + "loss": 1.4198, + "step": 7553 + }, + { + "epoch": 0.7947396107311941, + "grad_norm": 2.1010799407958984, + "learning_rate": 2.1299033809188773e-05, + "loss": 1.6424, + "step": 7554 + }, + { + "epoch": 0.7948448185165702, + "grad_norm": 1.633360505104065, + "learning_rate": 2.1278014664289648e-05, + "loss": 1.9107, + "step": 7555 + }, + { + "epoch": 0.7949500263019463, + "grad_norm": 1.9874669313430786, + "learning_rate": 2.125700466111542e-05, + "loss": 1.8695, + "step": 7556 + }, + { + "epoch": 0.7950552340873225, + "grad_norm": 1.591950535774231, + "learning_rate": 2.1236003802105974e-05, + "loss": 1.7396, + "step": 7557 + }, + { + "epoch": 0.7951604418726986, + "grad_norm": 1.1268935203552246, + "learning_rate": 2.121501208970005e-05, + "loss": 1.3255, + "step": 7558 + }, + { + "epoch": 0.7952656496580747, + "grad_norm": 1.344058871269226, + "learning_rate": 2.1194029526335303e-05, + "loss": 1.7558, + "step": 7559 + }, + { + "epoch": 0.7953708574434508, + "grad_norm": 1.8388925790786743, + "learning_rate": 2.117305611444843e-05, + "loss": 1.4927, + "step": 7560 + }, + { + "epoch": 0.795476065228827, + "grad_norm": 0.9249711632728577, + "learning_rate": 2.115209185647499e-05, + "loss": 1.4039, + "step": 7561 + }, + { + "epoch": 0.795581273014203, + "grad_norm": 1.0573887825012207, + "learning_rate": 2.1131136754849447e-05, + "loss": 1.5272, + "step": 7562 + }, + { + "epoch": 0.7956864807995792, + "grad_norm": 1.5813140869140625, + "learning_rate": 2.1110190812005315e-05, + "loss": 1.2803, + "step": 7563 + }, + { + "epoch": 0.7957916885849553, + "grad_norm": 2.2133073806762695, + "learning_rate": 2.1089254030374916e-05, + "loss": 2.0426, + "step": 7564 + }, + { + "epoch": 0.7958968963703315, + "grad_norm": 1.3776949644088745, + "learning_rate": 2.106832641238966e-05, + "loss": 1.5064, + "step": 7565 + }, + { + "epoch": 0.7960021041557075, + "grad_norm": 2.1420438289642334, + "learning_rate": 2.1047407960479702e-05, + "loss": 1.888, + "step": 7566 + }, + { + "epoch": 0.7961073119410836, + "grad_norm": 0.984514594078064, + "learning_rate": 2.102649867707426e-05, + "loss": 1.5134, + "step": 7567 + }, + { + "epoch": 0.7962125197264598, + "grad_norm": 1.1638802289962769, + "learning_rate": 2.1005598564601492e-05, + "loss": 1.907, + "step": 7568 + }, + { + "epoch": 0.7963177275118358, + "grad_norm": 1.849143385887146, + "learning_rate": 2.0984707625488442e-05, + "loss": 2.197, + "step": 7569 + }, + { + "epoch": 0.796422935297212, + "grad_norm": 1.8121823072433472, + "learning_rate": 2.096382586216108e-05, + "loss": 1.6732, + "step": 7570 + }, + { + "epoch": 0.7965281430825881, + "grad_norm": 1.8981144428253174, + "learning_rate": 2.0942953277044386e-05, + "loss": 1.9267, + "step": 7571 + }, + { + "epoch": 0.7966333508679643, + "grad_norm": 1.1137375831604004, + "learning_rate": 2.092208987256217e-05, + "loss": 1.5532, + "step": 7572 + }, + { + "epoch": 0.7967385586533403, + "grad_norm": 1.7999489307403564, + "learning_rate": 2.0901235651137284e-05, + "loss": 2.235, + "step": 7573 + }, + { + "epoch": 0.7968437664387165, + "grad_norm": 1.321373462677002, + "learning_rate": 2.0880390615191448e-05, + "loss": 1.6331, + "step": 7574 + }, + { + "epoch": 0.7969489742240926, + "grad_norm": 2.581979274749756, + "learning_rate": 2.0859554767145272e-05, + "loss": 1.2734, + "step": 7575 + }, + { + "epoch": 0.7970541820094686, + "grad_norm": 1.2444729804992676, + "learning_rate": 2.0838728109418436e-05, + "loss": 1.7502, + "step": 7576 + }, + { + "epoch": 0.7971593897948448, + "grad_norm": 1.8155887126922607, + "learning_rate": 2.081791064442943e-05, + "loss": 1.1768, + "step": 7577 + }, + { + "epoch": 0.797264597580221, + "grad_norm": 1.7820537090301514, + "learning_rate": 2.079710237459569e-05, + "loss": 1.7911, + "step": 7578 + }, + { + "epoch": 0.7973698053655971, + "grad_norm": 1.5820388793945312, + "learning_rate": 2.0776303302333677e-05, + "loss": 1.8153, + "step": 7579 + }, + { + "epoch": 0.7974750131509731, + "grad_norm": 2.486478090286255, + "learning_rate": 2.0755513430058672e-05, + "loss": 0.9953, + "step": 7580 + }, + { + "epoch": 0.7975802209363493, + "grad_norm": 1.5562423467636108, + "learning_rate": 2.073473276018493e-05, + "loss": 1.8432, + "step": 7581 + }, + { + "epoch": 0.7976854287217254, + "grad_norm": 1.674886703491211, + "learning_rate": 2.0713961295125685e-05, + "loss": 0.5957, + "step": 7582 + }, + { + "epoch": 0.7977906365071016, + "grad_norm": 1.56822669506073, + "learning_rate": 2.0693199037293022e-05, + "loss": 1.8226, + "step": 7583 + }, + { + "epoch": 0.7978958442924776, + "grad_norm": 1.492409110069275, + "learning_rate": 2.067244598909799e-05, + "loss": 1.4119, + "step": 7584 + }, + { + "epoch": 0.7980010520778538, + "grad_norm": 1.627830147743225, + "learning_rate": 2.0651702152950602e-05, + "loss": 1.837, + "step": 7585 + }, + { + "epoch": 0.7981062598632299, + "grad_norm": 1.6998107433319092, + "learning_rate": 2.0630967531259758e-05, + "loss": 1.6075, + "step": 7586 + }, + { + "epoch": 0.798211467648606, + "grad_norm": 1.3308358192443848, + "learning_rate": 2.0610242126433297e-05, + "loss": 1.7592, + "step": 7587 + }, + { + "epoch": 0.7983166754339821, + "grad_norm": 1.7519946098327637, + "learning_rate": 2.0589525940877996e-05, + "loss": 1.5667, + "step": 7588 + }, + { + "epoch": 0.7984218832193583, + "grad_norm": 2.321702003479004, + "learning_rate": 2.0568818976999526e-05, + "loss": 1.3998, + "step": 7589 + }, + { + "epoch": 0.7985270910047344, + "grad_norm": 1.7129671573638916, + "learning_rate": 2.0548121237202576e-05, + "loss": 1.7467, + "step": 7590 + }, + { + "epoch": 0.7986322987901104, + "grad_norm": 1.7081111669540405, + "learning_rate": 2.0527432723890684e-05, + "loss": 1.6383, + "step": 7591 + }, + { + "epoch": 0.7987375065754866, + "grad_norm": 1.5592128038406372, + "learning_rate": 2.0506753439466297e-05, + "loss": 1.7673, + "step": 7592 + }, + { + "epoch": 0.7988427143608627, + "grad_norm": 2.919283390045166, + "learning_rate": 2.04860833863309e-05, + "loss": 1.7805, + "step": 7593 + }, + { + "epoch": 0.7989479221462388, + "grad_norm": 2.3130650520324707, + "learning_rate": 2.0465422566884805e-05, + "loss": 1.2299, + "step": 7594 + }, + { + "epoch": 0.7990531299316149, + "grad_norm": 1.2341796159744263, + "learning_rate": 2.044477098352726e-05, + "loss": 1.4645, + "step": 7595 + }, + { + "epoch": 0.7991583377169911, + "grad_norm": 2.093064069747925, + "learning_rate": 2.0424128638656513e-05, + "loss": 1.6587, + "step": 7596 + }, + { + "epoch": 0.7992635455023672, + "grad_norm": 2.029285430908203, + "learning_rate": 2.040349553466967e-05, + "loss": 1.9854, + "step": 7597 + }, + { + "epoch": 0.7993687532877433, + "grad_norm": 1.8652676343917847, + "learning_rate": 2.0382871673962766e-05, + "loss": 1.6049, + "step": 7598 + }, + { + "epoch": 0.7994739610731194, + "grad_norm": 1.369429588317871, + "learning_rate": 2.0362257058930822e-05, + "loss": 1.6057, + "step": 7599 + }, + { + "epoch": 0.7995791688584956, + "grad_norm": 1.3833380937576294, + "learning_rate": 2.0341651691967735e-05, + "loss": 2.1065, + "step": 7600 + }, + { + "epoch": 0.7996843766438716, + "grad_norm": 2.4091103076934814, + "learning_rate": 2.0321055575466284e-05, + "loss": 2.0793, + "step": 7601 + }, + { + "epoch": 0.7997895844292477, + "grad_norm": 1.5020534992218018, + "learning_rate": 2.0300468711818322e-05, + "loss": 1.3944, + "step": 7602 + }, + { + "epoch": 0.7998947922146239, + "grad_norm": 1.9810582399368286, + "learning_rate": 2.027989110341446e-05, + "loss": 2.023, + "step": 7603 + }, + { + "epoch": 0.8, + "grad_norm": 1.6195448637008667, + "learning_rate": 2.0259322752644327e-05, + "loss": 1.1899, + "step": 7604 + }, + { + "epoch": 0.8001052077853761, + "grad_norm": 1.4536365270614624, + "learning_rate": 2.0238763661896477e-05, + "loss": 1.6324, + "step": 7605 + }, + { + "epoch": 0.8002104155707522, + "grad_norm": 1.5313684940338135, + "learning_rate": 2.0218213833558352e-05, + "loss": 1.6158, + "step": 7606 + }, + { + "epoch": 0.8003156233561284, + "grad_norm": 1.2193766832351685, + "learning_rate": 2.0197673270016327e-05, + "loss": 1.7051, + "step": 7607 + }, + { + "epoch": 0.8004208311415044, + "grad_norm": 1.6945223808288574, + "learning_rate": 2.0177141973655766e-05, + "loss": 1.9545, + "step": 7608 + }, + { + "epoch": 0.8005260389268806, + "grad_norm": 1.5775741338729858, + "learning_rate": 2.01566199468608e-05, + "loss": 1.5485, + "step": 7609 + }, + { + "epoch": 0.8006312467122567, + "grad_norm": 2.2319235801696777, + "learning_rate": 2.0136107192014676e-05, + "loss": 1.1478, + "step": 7610 + }, + { + "epoch": 0.8007364544976329, + "grad_norm": 2.3744618892669678, + "learning_rate": 2.011560371149943e-05, + "loss": 1.3589, + "step": 7611 + }, + { + "epoch": 0.8008416622830089, + "grad_norm": 1.1278767585754395, + "learning_rate": 2.0095109507696053e-05, + "loss": 1.7969, + "step": 7612 + }, + { + "epoch": 0.800946870068385, + "grad_norm": 2.358863353729248, + "learning_rate": 2.0074624582984512e-05, + "loss": 1.5369, + "step": 7613 + }, + { + "epoch": 0.8010520778537612, + "grad_norm": 1.1637340784072876, + "learning_rate": 2.0054148939743634e-05, + "loss": 1.6914, + "step": 7614 + }, + { + "epoch": 0.8011572856391374, + "grad_norm": 1.5420373678207397, + "learning_rate": 2.0033682580351144e-05, + "loss": 1.3169, + "step": 7615 + }, + { + "epoch": 0.8012624934245134, + "grad_norm": 2.321443557739258, + "learning_rate": 2.001322550718382e-05, + "loss": 1.8365, + "step": 7616 + }, + { + "epoch": 0.8013677012098895, + "grad_norm": 1.631739616394043, + "learning_rate": 1.9992777722617207e-05, + "loss": 1.0929, + "step": 7617 + }, + { + "epoch": 0.8014729089952657, + "grad_norm": 1.5631780624389648, + "learning_rate": 1.997233922902585e-05, + "loss": 1.6305, + "step": 7618 + }, + { + "epoch": 0.8015781167806417, + "grad_norm": 1.8600497245788574, + "learning_rate": 1.995191002878323e-05, + "loss": 1.323, + "step": 7619 + }, + { + "epoch": 0.8016833245660179, + "grad_norm": 1.6327234506607056, + "learning_rate": 1.9931490124261688e-05, + "loss": 1.7434, + "step": 7620 + }, + { + "epoch": 0.801788532351394, + "grad_norm": 1.5391840934753418, + "learning_rate": 1.9911079517832555e-05, + "loss": 1.564, + "step": 7621 + }, + { + "epoch": 0.8018937401367702, + "grad_norm": 1.508278727531433, + "learning_rate": 1.9890678211866033e-05, + "loss": 1.7802, + "step": 7622 + }, + { + "epoch": 0.8019989479221462, + "grad_norm": 1.6780965328216553, + "learning_rate": 1.9870286208731236e-05, + "loss": 1.6782, + "step": 7623 + }, + { + "epoch": 0.8021041557075224, + "grad_norm": 1.9837194681167603, + "learning_rate": 1.9849903510796262e-05, + "loss": 1.9724, + "step": 7624 + }, + { + "epoch": 0.8022093634928985, + "grad_norm": 1.8240439891815186, + "learning_rate": 1.9829530120428064e-05, + "loss": 1.7938, + "step": 7625 + }, + { + "epoch": 0.8023145712782745, + "grad_norm": 1.649254560470581, + "learning_rate": 1.9809166039992522e-05, + "loss": 1.2282, + "step": 7626 + }, + { + "epoch": 0.8024197790636507, + "grad_norm": 2.882291793823242, + "learning_rate": 1.978881127185448e-05, + "loss": 1.2825, + "step": 7627 + }, + { + "epoch": 0.8025249868490268, + "grad_norm": 1.6689751148223877, + "learning_rate": 1.9768465818377656e-05, + "loss": 1.7849, + "step": 7628 + }, + { + "epoch": 0.802630194634403, + "grad_norm": 1.3756548166275024, + "learning_rate": 1.9748129681924675e-05, + "loss": 1.9856, + "step": 7629 + }, + { + "epoch": 0.802735402419779, + "grad_norm": 1.1730574369430542, + "learning_rate": 1.9727802864857194e-05, + "loss": 1.7476, + "step": 7630 + }, + { + "epoch": 0.8028406102051552, + "grad_norm": 1.5033677816390991, + "learning_rate": 1.970748536953557e-05, + "loss": 1.54, + "step": 7631 + }, + { + "epoch": 0.8029458179905313, + "grad_norm": 2.147925615310669, + "learning_rate": 1.9687177198319308e-05, + "loss": 1.521, + "step": 7632 + }, + { + "epoch": 0.8030510257759074, + "grad_norm": 1.849800944328308, + "learning_rate": 1.9666878353566697e-05, + "loss": 1.9367, + "step": 7633 + }, + { + "epoch": 0.8031562335612835, + "grad_norm": 1.3101515769958496, + "learning_rate": 1.9646588837634937e-05, + "loss": 1.9278, + "step": 7634 + }, + { + "epoch": 0.8032614413466597, + "grad_norm": 2.0604212284088135, + "learning_rate": 1.9626308652880243e-05, + "loss": 1.7948, + "step": 7635 + }, + { + "epoch": 0.8033666491320358, + "grad_norm": 1.3913450241088867, + "learning_rate": 1.9606037801657673e-05, + "loss": 1.5491, + "step": 7636 + }, + { + "epoch": 0.8034718569174119, + "grad_norm": 1.931066870689392, + "learning_rate": 1.9585776286321167e-05, + "loss": 1.4223, + "step": 7637 + }, + { + "epoch": 0.803577064702788, + "grad_norm": 2.459156036376953, + "learning_rate": 1.956552410922369e-05, + "loss": 1.1812, + "step": 7638 + }, + { + "epoch": 0.8036822724881642, + "grad_norm": 2.2327475547790527, + "learning_rate": 1.9545281272717032e-05, + "loss": 1.8111, + "step": 7639 + }, + { + "epoch": 0.8037874802735402, + "grad_norm": 1.7723302841186523, + "learning_rate": 1.9525047779151905e-05, + "loss": 2.2684, + "step": 7640 + }, + { + "epoch": 0.8038926880589163, + "grad_norm": 1.0957763195037842, + "learning_rate": 1.950482363087801e-05, + "loss": 1.3338, + "step": 7641 + }, + { + "epoch": 0.8039978958442925, + "grad_norm": 1.4228318929672241, + "learning_rate": 1.948460883024388e-05, + "loss": 1.2643, + "step": 7642 + }, + { + "epoch": 0.8041031036296686, + "grad_norm": 2.340993642807007, + "learning_rate": 1.9464403379596963e-05, + "loss": 2.2087, + "step": 7643 + }, + { + "epoch": 0.8042083114150447, + "grad_norm": 1.3735971450805664, + "learning_rate": 1.9444207281283723e-05, + "loss": 1.388, + "step": 7644 + }, + { + "epoch": 0.8043135192004208, + "grad_norm": 1.704942226409912, + "learning_rate": 1.9424020537649414e-05, + "loss": 1.6904, + "step": 7645 + }, + { + "epoch": 0.804418726985797, + "grad_norm": 1.3440220355987549, + "learning_rate": 1.940384315103825e-05, + "loss": 2.1261, + "step": 7646 + }, + { + "epoch": 0.8045239347711731, + "grad_norm": 1.9415428638458252, + "learning_rate": 1.938367512379341e-05, + "loss": 2.098, + "step": 7647 + }, + { + "epoch": 0.8046291425565492, + "grad_norm": 1.430979609489441, + "learning_rate": 1.9363516458256916e-05, + "loss": 1.1349, + "step": 7648 + }, + { + "epoch": 0.8047343503419253, + "grad_norm": 1.5660107135772705, + "learning_rate": 1.93433671567697e-05, + "loss": 2.0174, + "step": 7649 + }, + { + "epoch": 0.8048395581273015, + "grad_norm": 1.210718035697937, + "learning_rate": 1.932322722167168e-05, + "loss": 1.4806, + "step": 7650 + }, + { + "epoch": 0.8049447659126775, + "grad_norm": 1.4528659582138062, + "learning_rate": 1.9303096655301633e-05, + "loss": 1.9193, + "step": 7651 + }, + { + "epoch": 0.8050499736980536, + "grad_norm": 1.8522127866744995, + "learning_rate": 1.9282975459997234e-05, + "loss": 1.4749, + "step": 7652 + }, + { + "epoch": 0.8051551814834298, + "grad_norm": 1.7069746255874634, + "learning_rate": 1.9262863638095097e-05, + "loss": 1.7067, + "step": 7653 + }, + { + "epoch": 0.8052603892688059, + "grad_norm": 1.1644920110702515, + "learning_rate": 1.9242761191930725e-05, + "loss": 1.9264, + "step": 7654 + }, + { + "epoch": 0.805365597054182, + "grad_norm": 1.9184256792068481, + "learning_rate": 1.9222668123838593e-05, + "loss": 2.0388, + "step": 7655 + }, + { + "epoch": 0.8054708048395581, + "grad_norm": 2.2873263359069824, + "learning_rate": 1.9202584436152014e-05, + "loss": 1.719, + "step": 7656 + }, + { + "epoch": 0.8055760126249343, + "grad_norm": 1.712296724319458, + "learning_rate": 1.9182510131203224e-05, + "loss": 1.6544, + "step": 7657 + }, + { + "epoch": 0.8056812204103103, + "grad_norm": 2.636054515838623, + "learning_rate": 1.9162445211323432e-05, + "loss": 1.9039, + "step": 7658 + }, + { + "epoch": 0.8057864281956865, + "grad_norm": 1.4774726629257202, + "learning_rate": 1.914238967884269e-05, + "loss": 1.6511, + "step": 7659 + }, + { + "epoch": 0.8058916359810626, + "grad_norm": 1.71102774143219, + "learning_rate": 1.9122343536089937e-05, + "loss": 1.2127, + "step": 7660 + }, + { + "epoch": 0.8059968437664388, + "grad_norm": 1.1164214611053467, + "learning_rate": 1.910230678539314e-05, + "loss": 1.5642, + "step": 7661 + }, + { + "epoch": 0.8061020515518148, + "grad_norm": 1.0777497291564941, + "learning_rate": 1.9082279429079065e-05, + "loss": 1.6877, + "step": 7662 + }, + { + "epoch": 0.806207259337191, + "grad_norm": 1.3562448024749756, + "learning_rate": 1.9062261469473397e-05, + "loss": 1.9616, + "step": 7663 + }, + { + "epoch": 0.8063124671225671, + "grad_norm": 1.527605652809143, + "learning_rate": 1.9042252908900814e-05, + "loss": 1.4711, + "step": 7664 + }, + { + "epoch": 0.8064176749079431, + "grad_norm": 1.3711210489273071, + "learning_rate": 1.9022253749684783e-05, + "loss": 1.6572, + "step": 7665 + }, + { + "epoch": 0.8065228826933193, + "grad_norm": 2.562347888946533, + "learning_rate": 1.9002263994147796e-05, + "loss": 1.9137, + "step": 7666 + }, + { + "epoch": 0.8066280904786954, + "grad_norm": 1.8726879358291626, + "learning_rate": 1.8982283644611175e-05, + "loss": 1.1054, + "step": 7667 + }, + { + "epoch": 0.8067332982640716, + "grad_norm": 2.17808198928833, + "learning_rate": 1.8962312703395146e-05, + "loss": 1.6859, + "step": 7668 + }, + { + "epoch": 0.8068385060494476, + "grad_norm": 1.516634225845337, + "learning_rate": 1.8942351172818905e-05, + "loss": 1.5421, + "step": 7669 + }, + { + "epoch": 0.8069437138348238, + "grad_norm": 1.740494966506958, + "learning_rate": 1.8922399055200525e-05, + "loss": 1.5834, + "step": 7670 + }, + { + "epoch": 0.8070489216201999, + "grad_norm": 1.4491758346557617, + "learning_rate": 1.8902456352856925e-05, + "loss": 1.4972, + "step": 7671 + }, + { + "epoch": 0.807154129405576, + "grad_norm": 1.3840901851654053, + "learning_rate": 1.888252306810406e-05, + "loss": 1.7152, + "step": 7672 + }, + { + "epoch": 0.8072593371909521, + "grad_norm": 1.5224378108978271, + "learning_rate": 1.886259920325667e-05, + "loss": 1.2755, + "step": 7673 + }, + { + "epoch": 0.8073645449763283, + "grad_norm": 1.3467313051223755, + "learning_rate": 1.8842684760628425e-05, + "loss": 1.9975, + "step": 7674 + }, + { + "epoch": 0.8074697527617044, + "grad_norm": 1.9911725521087646, + "learning_rate": 1.882277974253197e-05, + "loss": 2.0025, + "step": 7675 + }, + { + "epoch": 0.8075749605470804, + "grad_norm": 1.4871760606765747, + "learning_rate": 1.8802884151278775e-05, + "loss": 1.4795, + "step": 7676 + }, + { + "epoch": 0.8076801683324566, + "grad_norm": 1.2734004259109497, + "learning_rate": 1.8782997989179297e-05, + "loss": 1.6471, + "step": 7677 + }, + { + "epoch": 0.8077853761178327, + "grad_norm": 1.3544495105743408, + "learning_rate": 1.8763121258542815e-05, + "loss": 1.744, + "step": 7678 + }, + { + "epoch": 0.8078905839032089, + "grad_norm": 1.0602861642837524, + "learning_rate": 1.874325396167753e-05, + "loss": 2.062, + "step": 7679 + }, + { + "epoch": 0.8079957916885849, + "grad_norm": 1.5346481800079346, + "learning_rate": 1.8723396100890623e-05, + "loss": 0.9999, + "step": 7680 + }, + { + "epoch": 0.8081009994739611, + "grad_norm": 1.847928762435913, + "learning_rate": 1.870354767848809e-05, + "loss": 1.5345, + "step": 7681 + }, + { + "epoch": 0.8082062072593372, + "grad_norm": 1.852360725402832, + "learning_rate": 1.8683708696774826e-05, + "loss": 1.8659, + "step": 7682 + }, + { + "epoch": 0.8083114150447133, + "grad_norm": 1.2460297346115112, + "learning_rate": 1.866387915805473e-05, + "loss": 1.8196, + "step": 7683 + }, + { + "epoch": 0.8084166228300894, + "grad_norm": 1.7130165100097656, + "learning_rate": 1.8644059064630516e-05, + "loss": 1.652, + "step": 7684 + }, + { + "epoch": 0.8085218306154656, + "grad_norm": 1.4033414125442505, + "learning_rate": 1.86242484188038e-05, + "loss": 1.5075, + "step": 7685 + }, + { + "epoch": 0.8086270384008417, + "grad_norm": 0.9263932108879089, + "learning_rate": 1.8604447222875178e-05, + "loss": 1.8016, + "step": 7686 + }, + { + "epoch": 0.8087322461862178, + "grad_norm": 1.5973109006881714, + "learning_rate": 1.8584655479144063e-05, + "loss": 1.5324, + "step": 7687 + }, + { + "epoch": 0.8088374539715939, + "grad_norm": 1.074812889099121, + "learning_rate": 1.8564873189908793e-05, + "loss": 1.6024, + "step": 7688 + }, + { + "epoch": 0.80894266175697, + "grad_norm": 1.3290114402770996, + "learning_rate": 1.854510035746667e-05, + "loss": 1.786, + "step": 7689 + }, + { + "epoch": 0.8090478695423461, + "grad_norm": 2.054743766784668, + "learning_rate": 1.8525336984113807e-05, + "loss": 1.2776, + "step": 7690 + }, + { + "epoch": 0.8091530773277222, + "grad_norm": 1.4225199222564697, + "learning_rate": 1.850558307214525e-05, + "loss": 1.4513, + "step": 7691 + }, + { + "epoch": 0.8092582851130984, + "grad_norm": 1.4592626094818115, + "learning_rate": 1.848583862385501e-05, + "loss": 1.4772, + "step": 7692 + }, + { + "epoch": 0.8093634928984745, + "grad_norm": 1.3948107957839966, + "learning_rate": 1.8466103641535904e-05, + "loss": 1.6123, + "step": 7693 + }, + { + "epoch": 0.8094687006838506, + "grad_norm": 1.5830365419387817, + "learning_rate": 1.8446378127479692e-05, + "loss": 1.663, + "step": 7694 + }, + { + "epoch": 0.8095739084692267, + "grad_norm": 1.350455403327942, + "learning_rate": 1.8426662083977042e-05, + "loss": 1.6457, + "step": 7695 + }, + { + "epoch": 0.8096791162546029, + "grad_norm": 1.3711130619049072, + "learning_rate": 1.8406955513317482e-05, + "loss": 1.8406, + "step": 7696 + }, + { + "epoch": 0.8097843240399789, + "grad_norm": 2.136914014816284, + "learning_rate": 1.8387258417789532e-05, + "loss": 1.7469, + "step": 7697 + }, + { + "epoch": 0.8098895318253551, + "grad_norm": 2.448425531387329, + "learning_rate": 1.836757079968051e-05, + "loss": 1.3565, + "step": 7698 + }, + { + "epoch": 0.8099947396107312, + "grad_norm": 1.5618599653244019, + "learning_rate": 1.8347892661276656e-05, + "loss": 1.587, + "step": 7699 + }, + { + "epoch": 0.8100999473961074, + "grad_norm": 1.5216078758239746, + "learning_rate": 1.8328224004863183e-05, + "loss": 2.1681, + "step": 7700 + }, + { + "epoch": 0.8102051551814834, + "grad_norm": 1.2957028150558472, + "learning_rate": 1.830856483272412e-05, + "loss": 1.0984, + "step": 7701 + }, + { + "epoch": 0.8103103629668595, + "grad_norm": 1.9072808027267456, + "learning_rate": 1.8288915147142384e-05, + "loss": 1.4575, + "step": 7702 + }, + { + "epoch": 0.8104155707522357, + "grad_norm": 1.6128511428833008, + "learning_rate": 1.8269274950399895e-05, + "loss": 1.3623, + "step": 7703 + }, + { + "epoch": 0.8105207785376117, + "grad_norm": 1.4067853689193726, + "learning_rate": 1.8249644244777376e-05, + "loss": 1.5455, + "step": 7704 + }, + { + "epoch": 0.8106259863229879, + "grad_norm": 1.5415167808532715, + "learning_rate": 1.8230023032554444e-05, + "loss": 1.7658, + "step": 7705 + }, + { + "epoch": 0.810731194108364, + "grad_norm": 1.5726876258850098, + "learning_rate": 1.82104113160097e-05, + "loss": 1.7449, + "step": 7706 + }, + { + "epoch": 0.8108364018937402, + "grad_norm": 1.307512879371643, + "learning_rate": 1.819080909742057e-05, + "loss": 1.7211, + "step": 7707 + }, + { + "epoch": 0.8109416096791162, + "grad_norm": 1.4828598499298096, + "learning_rate": 1.8171216379063348e-05, + "loss": 1.6675, + "step": 7708 + }, + { + "epoch": 0.8110468174644924, + "grad_norm": 1.5087251663208008, + "learning_rate": 1.8151633163213357e-05, + "loss": 2.0347, + "step": 7709 + }, + { + "epoch": 0.8111520252498685, + "grad_norm": 1.217799186706543, + "learning_rate": 1.8132059452144666e-05, + "loss": 1.4303, + "step": 7710 + }, + { + "epoch": 0.8112572330352447, + "grad_norm": 1.011033058166504, + "learning_rate": 1.8112495248130312e-05, + "loss": 1.6298, + "step": 7711 + }, + { + "epoch": 0.8113624408206207, + "grad_norm": 1.5476887226104736, + "learning_rate": 1.809294055344226e-05, + "loss": 2.5314, + "step": 7712 + }, + { + "epoch": 0.8114676486059968, + "grad_norm": 1.962207317352295, + "learning_rate": 1.8073395370351287e-05, + "loss": 1.6565, + "step": 7713 + }, + { + "epoch": 0.811572856391373, + "grad_norm": 1.5745935440063477, + "learning_rate": 1.8053859701127153e-05, + "loss": 1.4709, + "step": 7714 + }, + { + "epoch": 0.811678064176749, + "grad_norm": 1.5660057067871094, + "learning_rate": 1.8034333548038474e-05, + "loss": 2.3473, + "step": 7715 + }, + { + "epoch": 0.8117832719621252, + "grad_norm": 1.5668786764144897, + "learning_rate": 1.8014816913352682e-05, + "loss": 1.6102, + "step": 7716 + }, + { + "epoch": 0.8118884797475013, + "grad_norm": 1.458255648612976, + "learning_rate": 1.7995309799336256e-05, + "loss": 1.5805, + "step": 7717 + }, + { + "epoch": 0.8119936875328775, + "grad_norm": 1.485870599746704, + "learning_rate": 1.7975812208254473e-05, + "loss": 1.7534, + "step": 7718 + }, + { + "epoch": 0.8120988953182535, + "grad_norm": 1.127456784248352, + "learning_rate": 1.7956324142371485e-05, + "loss": 1.7223, + "step": 7719 + }, + { + "epoch": 0.8122041031036297, + "grad_norm": 1.6504329442977905, + "learning_rate": 1.7936845603950447e-05, + "loss": 1.1134, + "step": 7720 + }, + { + "epoch": 0.8123093108890058, + "grad_norm": 2.026771068572998, + "learning_rate": 1.7917376595253264e-05, + "loss": 1.236, + "step": 7721 + }, + { + "epoch": 0.8124145186743819, + "grad_norm": 1.5272623300552368, + "learning_rate": 1.7897917118540875e-05, + "loss": 1.6421, + "step": 7722 + }, + { + "epoch": 0.812519726459758, + "grad_norm": 1.7590751647949219, + "learning_rate": 1.787846717607302e-05, + "loss": 1.2447, + "step": 7723 + }, + { + "epoch": 0.8126249342451342, + "grad_norm": 1.131596326828003, + "learning_rate": 1.7859026770108323e-05, + "loss": 1.3865, + "step": 7724 + }, + { + "epoch": 0.8127301420305103, + "grad_norm": 1.4612594842910767, + "learning_rate": 1.7839595902904393e-05, + "loss": 1.7474, + "step": 7725 + }, + { + "epoch": 0.8128353498158863, + "grad_norm": 1.4456712007522583, + "learning_rate": 1.782017457671764e-05, + "loss": 1.4284, + "step": 7726 + }, + { + "epoch": 0.8129405576012625, + "grad_norm": 2.546482801437378, + "learning_rate": 1.780076279380337e-05, + "loss": 1.9752, + "step": 7727 + }, + { + "epoch": 0.8130457653866386, + "grad_norm": 2.082400321960449, + "learning_rate": 1.778136055641587e-05, + "loss": 1.666, + "step": 7728 + }, + { + "epoch": 0.8131509731720147, + "grad_norm": 1.367645025253296, + "learning_rate": 1.776196786680824e-05, + "loss": 1.4718, + "step": 7729 + }, + { + "epoch": 0.8132561809573908, + "grad_norm": 2.418299436569214, + "learning_rate": 1.7742584727232437e-05, + "loss": 1.7791, + "step": 7730 + }, + { + "epoch": 0.813361388742767, + "grad_norm": 1.4075639247894287, + "learning_rate": 1.7723211139939445e-05, + "loss": 1.9541, + "step": 7731 + }, + { + "epoch": 0.8134665965281431, + "grad_norm": 1.8531979322433472, + "learning_rate": 1.7703847107178996e-05, + "loss": 1.3266, + "step": 7732 + }, + { + "epoch": 0.8135718043135192, + "grad_norm": 1.2879350185394287, + "learning_rate": 1.768449263119977e-05, + "loss": 1.3593, + "step": 7733 + }, + { + "epoch": 0.8136770120988953, + "grad_norm": 1.4324150085449219, + "learning_rate": 1.7665147714249376e-05, + "loss": 1.1906, + "step": 7734 + }, + { + "epoch": 0.8137822198842715, + "grad_norm": 1.2460278272628784, + "learning_rate": 1.7645812358574264e-05, + "loss": 1.7535, + "step": 7735 + }, + { + "epoch": 0.8138874276696475, + "grad_norm": 1.501090407371521, + "learning_rate": 1.762648656641974e-05, + "loss": 1.507, + "step": 7736 + }, + { + "epoch": 0.8139926354550236, + "grad_norm": 2.3010196685791016, + "learning_rate": 1.7607170340030143e-05, + "loss": 1.1804, + "step": 7737 + }, + { + "epoch": 0.8140978432403998, + "grad_norm": 1.1160740852355957, + "learning_rate": 1.7587863681648487e-05, + "loss": 1.7082, + "step": 7738 + }, + { + "epoch": 0.814203051025776, + "grad_norm": 1.3494609594345093, + "learning_rate": 1.7568566593516867e-05, + "loss": 1.5496, + "step": 7739 + }, + { + "epoch": 0.814308258811152, + "grad_norm": 1.8527060747146606, + "learning_rate": 1.754927907787617e-05, + "loss": 1.4893, + "step": 7740 + }, + { + "epoch": 0.8144134665965281, + "grad_norm": 2.120997428894043, + "learning_rate": 1.753000113696617e-05, + "loss": 1.9963, + "step": 7741 + }, + { + "epoch": 0.8145186743819043, + "grad_norm": 2.1121816635131836, + "learning_rate": 1.7510732773025584e-05, + "loss": 2.1124, + "step": 7742 + }, + { + "epoch": 0.8146238821672804, + "grad_norm": 2.398660659790039, + "learning_rate": 1.7491473988291984e-05, + "loss": 1.6849, + "step": 7743 + }, + { + "epoch": 0.8147290899526565, + "grad_norm": 1.6509634256362915, + "learning_rate": 1.7472224785001778e-05, + "loss": 1.4914, + "step": 7744 + }, + { + "epoch": 0.8148342977380326, + "grad_norm": 1.1640690565109253, + "learning_rate": 1.745298516539039e-05, + "loss": 1.7646, + "step": 7745 + }, + { + "epoch": 0.8149395055234088, + "grad_norm": 0.9993892312049866, + "learning_rate": 1.7433755131692e-05, + "loss": 1.5413, + "step": 7746 + }, + { + "epoch": 0.8150447133087848, + "grad_norm": 2.1200225353240967, + "learning_rate": 1.7414534686139717e-05, + "loss": 1.3748, + "step": 7747 + }, + { + "epoch": 0.815149921094161, + "grad_norm": 1.2785836458206177, + "learning_rate": 1.7395323830965605e-05, + "loss": 1.9115, + "step": 7748 + }, + { + "epoch": 0.8152551288795371, + "grad_norm": 1.8023689985275269, + "learning_rate": 1.7376122568400532e-05, + "loss": 1.432, + "step": 7749 + }, + { + "epoch": 0.8153603366649133, + "grad_norm": 1.6197617053985596, + "learning_rate": 1.7356930900674228e-05, + "loss": 1.2918, + "step": 7750 + }, + { + "epoch": 0.8154655444502893, + "grad_norm": 1.9692368507385254, + "learning_rate": 1.7337748830015442e-05, + "loss": 1.4366, + "step": 7751 + }, + { + "epoch": 0.8155707522356654, + "grad_norm": 1.7193232774734497, + "learning_rate": 1.7318576358651685e-05, + "loss": 1.9742, + "step": 7752 + }, + { + "epoch": 0.8156759600210416, + "grad_norm": 1.3902307748794556, + "learning_rate": 1.7299413488809356e-05, + "loss": 1.6264, + "step": 7753 + }, + { + "epoch": 0.8157811678064176, + "grad_norm": 1.7483313083648682, + "learning_rate": 1.728026022271384e-05, + "loss": 1.9915, + "step": 7754 + }, + { + "epoch": 0.8158863755917938, + "grad_norm": 1.3935078382492065, + "learning_rate": 1.726111656258932e-05, + "loss": 1.5677, + "step": 7755 + }, + { + "epoch": 0.8159915833771699, + "grad_norm": 1.7369499206542969, + "learning_rate": 1.724198251065885e-05, + "loss": 1.5543, + "step": 7756 + }, + { + "epoch": 0.8160967911625461, + "grad_norm": 1.9357125759124756, + "learning_rate": 1.7222858069144464e-05, + "loss": 1.8799, + "step": 7757 + }, + { + "epoch": 0.8162019989479221, + "grad_norm": 1.534250020980835, + "learning_rate": 1.720374324026699e-05, + "loss": 1.5193, + "step": 7758 + }, + { + "epoch": 0.8163072067332983, + "grad_norm": 1.6834253072738647, + "learning_rate": 1.718463802624617e-05, + "loss": 1.3028, + "step": 7759 + }, + { + "epoch": 0.8164124145186744, + "grad_norm": 1.60200834274292, + "learning_rate": 1.716554242930063e-05, + "loss": 1.5029, + "step": 7760 + }, + { + "epoch": 0.8165176223040504, + "grad_norm": 1.758754849433899, + "learning_rate": 1.7146456451647863e-05, + "loss": 1.7036, + "step": 7761 + }, + { + "epoch": 0.8166228300894266, + "grad_norm": 1.4392300844192505, + "learning_rate": 1.7127380095504296e-05, + "loss": 1.312, + "step": 7762 + }, + { + "epoch": 0.8167280378748027, + "grad_norm": 1.1349120140075684, + "learning_rate": 1.710831336308519e-05, + "loss": 1.618, + "step": 7763 + }, + { + "epoch": 0.8168332456601789, + "grad_norm": 2.290132999420166, + "learning_rate": 1.708925625660467e-05, + "loss": 1.3436, + "step": 7764 + }, + { + "epoch": 0.8169384534455549, + "grad_norm": 1.6770775318145752, + "learning_rate": 1.707020877827583e-05, + "loss": 1.3586, + "step": 7765 + }, + { + "epoch": 0.8170436612309311, + "grad_norm": 1.3523870706558228, + "learning_rate": 1.7051170930310555e-05, + "loss": 1.717, + "step": 7766 + }, + { + "epoch": 0.8171488690163072, + "grad_norm": 1.853025197982788, + "learning_rate": 1.703214271491964e-05, + "loss": 1.5571, + "step": 7767 + }, + { + "epoch": 0.8172540768016833, + "grad_norm": 1.83201003074646, + "learning_rate": 1.701312413431281e-05, + "loss": 1.9148, + "step": 7768 + }, + { + "epoch": 0.8173592845870594, + "grad_norm": 1.2569760084152222, + "learning_rate": 1.699411519069858e-05, + "loss": 2.3183, + "step": 7769 + }, + { + "epoch": 0.8174644923724356, + "grad_norm": 2.7303202152252197, + "learning_rate": 1.697511588628443e-05, + "loss": 1.6214, + "step": 7770 + }, + { + "epoch": 0.8175697001578117, + "grad_norm": 2.05470609664917, + "learning_rate": 1.6956126223276692e-05, + "loss": 1.3433, + "step": 7771 + }, + { + "epoch": 0.8176749079431878, + "grad_norm": 1.919611930847168, + "learning_rate": 1.693714620388054e-05, + "loss": 1.4729, + "step": 7772 + }, + { + "epoch": 0.8177801157285639, + "grad_norm": 1.4189374446868896, + "learning_rate": 1.6918175830300088e-05, + "loss": 1.756, + "step": 7773 + }, + { + "epoch": 0.81788532351394, + "grad_norm": 1.5107403993606567, + "learning_rate": 1.6899215104738307e-05, + "loss": 1.9558, + "step": 7774 + }, + { + "epoch": 0.8179905312993162, + "grad_norm": 1.0100902318954468, + "learning_rate": 1.6880264029396997e-05, + "loss": 1.6087, + "step": 7775 + }, + { + "epoch": 0.8180957390846922, + "grad_norm": 1.7361371517181396, + "learning_rate": 1.686132260647696e-05, + "loss": 1.3897, + "step": 7776 + }, + { + "epoch": 0.8182009468700684, + "grad_norm": 1.6154359579086304, + "learning_rate": 1.6842390838177746e-05, + "loss": 1.8557, + "step": 7777 + }, + { + "epoch": 0.8183061546554445, + "grad_norm": 1.9448010921478271, + "learning_rate": 1.682346872669782e-05, + "loss": 1.7147, + "step": 7778 + }, + { + "epoch": 0.8184113624408206, + "grad_norm": 1.653695821762085, + "learning_rate": 1.6804556274234616e-05, + "loss": 1.7196, + "step": 7779 + }, + { + "epoch": 0.8185165702261967, + "grad_norm": 1.5844122171401978, + "learning_rate": 1.678565348298433e-05, + "loss": 1.4943, + "step": 7780 + }, + { + "epoch": 0.8186217780115729, + "grad_norm": 2.1291873455047607, + "learning_rate": 1.6766760355142098e-05, + "loss": 1.0847, + "step": 7781 + }, + { + "epoch": 0.818726985796949, + "grad_norm": 1.2485053539276123, + "learning_rate": 1.6747876892901893e-05, + "loss": 1.5608, + "step": 7782 + }, + { + "epoch": 0.8188321935823251, + "grad_norm": 2.34297776222229, + "learning_rate": 1.6729003098456576e-05, + "loss": 1.5364, + "step": 7783 + }, + { + "epoch": 0.8189374013677012, + "grad_norm": 1.2006014585494995, + "learning_rate": 1.6710138973997957e-05, + "loss": 1.8377, + "step": 7784 + }, + { + "epoch": 0.8190426091530774, + "grad_norm": 2.0813822746276855, + "learning_rate": 1.6691284521716622e-05, + "loss": 2.0049, + "step": 7785 + }, + { + "epoch": 0.8191478169384534, + "grad_norm": 1.6687533855438232, + "learning_rate": 1.667243974380207e-05, + "loss": 1.6737, + "step": 7786 + }, + { + "epoch": 0.8192530247238295, + "grad_norm": 1.5384453535079956, + "learning_rate": 1.665360464244272e-05, + "loss": 1.6079, + "step": 7787 + }, + { + "epoch": 0.8193582325092057, + "grad_norm": 1.5753425359725952, + "learning_rate": 1.6634779219825814e-05, + "loss": 1.8783, + "step": 7788 + }, + { + "epoch": 0.8194634402945818, + "grad_norm": 1.79296875, + "learning_rate": 1.6615963478137454e-05, + "loss": 1.8779, + "step": 7789 + }, + { + "epoch": 0.8195686480799579, + "grad_norm": 2.0262389183044434, + "learning_rate": 1.6597157419562703e-05, + "loss": 1.7647, + "step": 7790 + }, + { + "epoch": 0.819673855865334, + "grad_norm": 1.363146185874939, + "learning_rate": 1.6578361046285418e-05, + "loss": 1.7111, + "step": 7791 + }, + { + "epoch": 0.8197790636507102, + "grad_norm": 2.63222599029541, + "learning_rate": 1.655957436048834e-05, + "loss": 1.0101, + "step": 7792 + }, + { + "epoch": 0.8198842714360862, + "grad_norm": 1.6749835014343262, + "learning_rate": 1.6540797364353155e-05, + "loss": 1.5263, + "step": 7793 + }, + { + "epoch": 0.8199894792214624, + "grad_norm": 1.5727925300598145, + "learning_rate": 1.6522030060060356e-05, + "loss": 1.6823, + "step": 7794 + }, + { + "epoch": 0.8200946870068385, + "grad_norm": 1.6061309576034546, + "learning_rate": 1.6503272449789286e-05, + "loss": 1.4122, + "step": 7795 + }, + { + "epoch": 0.8201998947922147, + "grad_norm": 2.4078192710876465, + "learning_rate": 1.648452453571826e-05, + "loss": 1.9501, + "step": 7796 + }, + { + "epoch": 0.8203051025775907, + "grad_norm": 1.4480916261672974, + "learning_rate": 1.646578632002439e-05, + "loss": 1.7646, + "step": 7797 + }, + { + "epoch": 0.8204103103629669, + "grad_norm": 2.407717227935791, + "learning_rate": 1.644705780488367e-05, + "loss": 1.4102, + "step": 7798 + }, + { + "epoch": 0.820515518148343, + "grad_norm": 1.7151941061019897, + "learning_rate": 1.6428338992471005e-05, + "loss": 1.57, + "step": 7799 + }, + { + "epoch": 0.820620725933719, + "grad_norm": 1.3426469564437866, + "learning_rate": 1.6409629884960144e-05, + "loss": 1.7375, + "step": 7800 + }, + { + "epoch": 0.8207259337190952, + "grad_norm": 1.4567279815673828, + "learning_rate": 1.6390930484523704e-05, + "loss": 1.5916, + "step": 7801 + }, + { + "epoch": 0.8208311415044713, + "grad_norm": 2.2961177825927734, + "learning_rate": 1.6372240793333195e-05, + "loss": 1.5445, + "step": 7802 + }, + { + "epoch": 0.8209363492898475, + "grad_norm": 1.0729035139083862, + "learning_rate": 1.6353560813558953e-05, + "loss": 1.5018, + "step": 7803 + }, + { + "epoch": 0.8210415570752235, + "grad_norm": 1.8320657014846802, + "learning_rate": 1.6334890547370286e-05, + "loss": 1.4641, + "step": 7804 + }, + { + "epoch": 0.8211467648605997, + "grad_norm": 1.700055480003357, + "learning_rate": 1.6316229996935262e-05, + "loss": 2.0822, + "step": 7805 + }, + { + "epoch": 0.8212519726459758, + "grad_norm": 1.5183660984039307, + "learning_rate": 1.6297579164420873e-05, + "loss": 1.8721, + "step": 7806 + }, + { + "epoch": 0.821357180431352, + "grad_norm": 1.26556396484375, + "learning_rate": 1.6278938051993008e-05, + "loss": 1.7277, + "step": 7807 + }, + { + "epoch": 0.821462388216728, + "grad_norm": 2.1678805351257324, + "learning_rate": 1.6260306661816383e-05, + "loss": 1.5226, + "step": 7808 + }, + { + "epoch": 0.8215675960021042, + "grad_norm": 1.2085503339767456, + "learning_rate": 1.6241684996054574e-05, + "loss": 1.6556, + "step": 7809 + }, + { + "epoch": 0.8216728037874803, + "grad_norm": 1.5834039449691772, + "learning_rate": 1.6223073056870097e-05, + "loss": 1.7377, + "step": 7810 + }, + { + "epoch": 0.8217780115728563, + "grad_norm": 1.1570148468017578, + "learning_rate": 1.6204470846424268e-05, + "loss": 1.5493, + "step": 7811 + }, + { + "epoch": 0.8218832193582325, + "grad_norm": 2.1178195476531982, + "learning_rate": 1.6185878366877295e-05, + "loss": 1.6665, + "step": 7812 + }, + { + "epoch": 0.8219884271436086, + "grad_norm": 2.094390392303467, + "learning_rate": 1.61672956203883e-05, + "loss": 1.0295, + "step": 7813 + }, + { + "epoch": 0.8220936349289848, + "grad_norm": 1.8944170475006104, + "learning_rate": 1.6148722609115176e-05, + "loss": 1.8045, + "step": 7814 + }, + { + "epoch": 0.8221988427143608, + "grad_norm": 1.5587190389633179, + "learning_rate": 1.613015933521481e-05, + "loss": 1.6959, + "step": 7815 + }, + { + "epoch": 0.822304050499737, + "grad_norm": 1.878710150718689, + "learning_rate": 1.6111605800842865e-05, + "loss": 1.62, + "step": 7816 + }, + { + "epoch": 0.8224092582851131, + "grad_norm": 1.67520272731781, + "learning_rate": 1.609306200815387e-05, + "loss": 1.3253, + "step": 7817 + }, + { + "epoch": 0.8225144660704892, + "grad_norm": 1.488664984703064, + "learning_rate": 1.607452795930131e-05, + "loss": 1.8211, + "step": 7818 + }, + { + "epoch": 0.8226196738558653, + "grad_norm": 1.5285340547561646, + "learning_rate": 1.6056003656437458e-05, + "loss": 1.2902, + "step": 7819 + }, + { + "epoch": 0.8227248816412415, + "grad_norm": 1.4336509704589844, + "learning_rate": 1.6037489101713465e-05, + "loss": 1.866, + "step": 7820 + }, + { + "epoch": 0.8228300894266176, + "grad_norm": 1.3473023176193237, + "learning_rate": 1.6018984297279393e-05, + "loss": 1.4556, + "step": 7821 + }, + { + "epoch": 0.8229352972119937, + "grad_norm": 1.2973744869232178, + "learning_rate": 1.6000489245284133e-05, + "loss": 1.9248, + "step": 7822 + }, + { + "epoch": 0.8230405049973698, + "grad_norm": 1.9646800756454468, + "learning_rate": 1.5982003947875467e-05, + "loss": 1.7111, + "step": 7823 + }, + { + "epoch": 0.823145712782746, + "grad_norm": 1.3596656322479248, + "learning_rate": 1.59635284072e-05, + "loss": 1.5034, + "step": 7824 + }, + { + "epoch": 0.823250920568122, + "grad_norm": 2.065723180770874, + "learning_rate": 1.594506262540324e-05, + "loss": 1.876, + "step": 7825 + }, + { + "epoch": 0.8233561283534981, + "grad_norm": 1.7020695209503174, + "learning_rate": 1.592660660462959e-05, + "loss": 1.237, + "step": 7826 + }, + { + "epoch": 0.8234613361388743, + "grad_norm": 1.4010860919952393, + "learning_rate": 1.5908160347022272e-05, + "loss": 1.9249, + "step": 7827 + }, + { + "epoch": 0.8235665439242504, + "grad_norm": 1.8709789514541626, + "learning_rate": 1.588972385472336e-05, + "loss": 2.0006, + "step": 7828 + }, + { + "epoch": 0.8236717517096265, + "grad_norm": 1.6557965278625488, + "learning_rate": 1.5871297129873864e-05, + "loss": 1.9393, + "step": 7829 + }, + { + "epoch": 0.8237769594950026, + "grad_norm": 1.690056562423706, + "learning_rate": 1.5852880174613617e-05, + "loss": 1.6349, + "step": 7830 + }, + { + "epoch": 0.8238821672803788, + "grad_norm": 1.9580641984939575, + "learning_rate": 1.5834472991081273e-05, + "loss": 1.3244, + "step": 7831 + }, + { + "epoch": 0.8239873750657548, + "grad_norm": 1.4145424365997314, + "learning_rate": 1.5816075581414458e-05, + "loss": 1.5102, + "step": 7832 + }, + { + "epoch": 0.824092582851131, + "grad_norm": 1.6314736604690552, + "learning_rate": 1.5797687947749563e-05, + "loss": 1.424, + "step": 7833 + }, + { + "epoch": 0.8241977906365071, + "grad_norm": 1.2429150342941284, + "learning_rate": 1.577931009222189e-05, + "loss": 1.2783, + "step": 7834 + }, + { + "epoch": 0.8243029984218833, + "grad_norm": 1.750223994255066, + "learning_rate": 1.5760942016965608e-05, + "loss": 1.892, + "step": 7835 + }, + { + "epoch": 0.8244082062072593, + "grad_norm": 1.6230374574661255, + "learning_rate": 1.5742583724113746e-05, + "loss": 1.8513, + "step": 7836 + }, + { + "epoch": 0.8245134139926354, + "grad_norm": 1.2478396892547607, + "learning_rate": 1.5724235215798167e-05, + "loss": 1.8733, + "step": 7837 + }, + { + "epoch": 0.8246186217780116, + "grad_norm": 1.6397819519042969, + "learning_rate": 1.5705896494149654e-05, + "loss": 1.4, + "step": 7838 + }, + { + "epoch": 0.8247238295633877, + "grad_norm": 1.4806641340255737, + "learning_rate": 1.56875675612978e-05, + "loss": 1.4183, + "step": 7839 + }, + { + "epoch": 0.8248290373487638, + "grad_norm": 1.4073078632354736, + "learning_rate": 1.5669248419371085e-05, + "loss": 1.7526, + "step": 7840 + }, + { + "epoch": 0.8249342451341399, + "grad_norm": 1.6545143127441406, + "learning_rate": 1.565093907049686e-05, + "loss": 1.8446, + "step": 7841 + }, + { + "epoch": 0.8250394529195161, + "grad_norm": 1.461369276046753, + "learning_rate": 1.563263951680134e-05, + "loss": 1.4983, + "step": 7842 + }, + { + "epoch": 0.8251446607048921, + "grad_norm": 1.4910365343093872, + "learning_rate": 1.5614349760409552e-05, + "loss": 1.63, + "step": 7843 + }, + { + "epoch": 0.8252498684902683, + "grad_norm": 1.4851601123809814, + "learning_rate": 1.5596069803445502e-05, + "loss": 1.3496, + "step": 7844 + }, + { + "epoch": 0.8253550762756444, + "grad_norm": 1.3788039684295654, + "learning_rate": 1.5577799648031876e-05, + "loss": 1.8372, + "step": 7845 + }, + { + "epoch": 0.8254602840610206, + "grad_norm": 1.6159765720367432, + "learning_rate": 1.5559539296290403e-05, + "loss": 1.5477, + "step": 7846 + }, + { + "epoch": 0.8255654918463966, + "grad_norm": 2.3057737350463867, + "learning_rate": 1.5541288750341575e-05, + "loss": 1.866, + "step": 7847 + }, + { + "epoch": 0.8256706996317728, + "grad_norm": 2.4619557857513428, + "learning_rate": 1.5523048012304754e-05, + "loss": 1.6384, + "step": 7848 + }, + { + "epoch": 0.8257759074171489, + "grad_norm": 1.4026074409484863, + "learning_rate": 1.5504817084298207e-05, + "loss": 1.2915, + "step": 7849 + }, + { + "epoch": 0.8258811152025249, + "grad_norm": 1.2069450616836548, + "learning_rate": 1.548659596843902e-05, + "loss": 1.6054, + "step": 7850 + }, + { + "epoch": 0.8259863229879011, + "grad_norm": 1.7536677122116089, + "learning_rate": 1.5468384666843115e-05, + "loss": 1.7181, + "step": 7851 + }, + { + "epoch": 0.8260915307732772, + "grad_norm": 1.5129624605178833, + "learning_rate": 1.545018318162538e-05, + "loss": 2.1859, + "step": 7852 + }, + { + "epoch": 0.8261967385586534, + "grad_norm": 1.5461325645446777, + "learning_rate": 1.5431991514899446e-05, + "loss": 2.0414, + "step": 7853 + }, + { + "epoch": 0.8263019463440294, + "grad_norm": 1.6245604753494263, + "learning_rate": 1.5413809668777844e-05, + "loss": 1.2136, + "step": 7854 + }, + { + "epoch": 0.8264071541294056, + "grad_norm": 1.2918306589126587, + "learning_rate": 1.5395637645372007e-05, + "loss": 1.3098, + "step": 7855 + }, + { + "epoch": 0.8265123619147817, + "grad_norm": 1.5559884309768677, + "learning_rate": 1.5377475446792178e-05, + "loss": 1.5964, + "step": 7856 + }, + { + "epoch": 0.8266175697001578, + "grad_norm": 1.8957018852233887, + "learning_rate": 1.535932307514745e-05, + "loss": 1.8956, + "step": 7857 + }, + { + "epoch": 0.8267227774855339, + "grad_norm": 2.0008089542388916, + "learning_rate": 1.534118053254584e-05, + "loss": 1.6321, + "step": 7858 + }, + { + "epoch": 0.8268279852709101, + "grad_norm": 1.7587536573410034, + "learning_rate": 1.5323047821094126e-05, + "loss": 1.8533, + "step": 7859 + }, + { + "epoch": 0.8269331930562862, + "grad_norm": 2.5158214569091797, + "learning_rate": 1.5304924942898068e-05, + "loss": 1.1862, + "step": 7860 + }, + { + "epoch": 0.8270384008416622, + "grad_norm": 1.6874955892562866, + "learning_rate": 1.528681190006218e-05, + "loss": 1.5662, + "step": 7861 + }, + { + "epoch": 0.8271436086270384, + "grad_norm": 2.3075296878814697, + "learning_rate": 1.5268708694689847e-05, + "loss": 1.449, + "step": 7862 + }, + { + "epoch": 0.8272488164124145, + "grad_norm": 1.3436137437820435, + "learning_rate": 1.5250615328883388e-05, + "loss": 1.3811, + "step": 7863 + }, + { + "epoch": 0.8273540241977906, + "grad_norm": 2.0536916255950928, + "learning_rate": 1.5232531804743899e-05, + "loss": 1.6885, + "step": 7864 + }, + { + "epoch": 0.8274592319831667, + "grad_norm": 1.4442811012268066, + "learning_rate": 1.5214458124371345e-05, + "loss": 1.5104, + "step": 7865 + }, + { + "epoch": 0.8275644397685429, + "grad_norm": 2.0475945472717285, + "learning_rate": 1.5196394289864591e-05, + "loss": 1.7676, + "step": 7866 + }, + { + "epoch": 0.827669647553919, + "grad_norm": 1.2523587942123413, + "learning_rate": 1.5178340303321314e-05, + "loss": 1.5507, + "step": 7867 + }, + { + "epoch": 0.8277748553392951, + "grad_norm": 2.2663662433624268, + "learning_rate": 1.516029616683804e-05, + "loss": 1.4212, + "step": 7868 + }, + { + "epoch": 0.8278800631246712, + "grad_norm": 1.719728946685791, + "learning_rate": 1.5142261882510223e-05, + "loss": 2.0605, + "step": 7869 + }, + { + "epoch": 0.8279852709100474, + "grad_norm": 1.38359534740448, + "learning_rate": 1.5124237452432077e-05, + "loss": 1.6727, + "step": 7870 + }, + { + "epoch": 0.8280904786954235, + "grad_norm": 2.143702507019043, + "learning_rate": 1.5106222878696775e-05, + "loss": 1.3658, + "step": 7871 + }, + { + "epoch": 0.8281956864807996, + "grad_norm": 1.9734598398208618, + "learning_rate": 1.5088218163396262e-05, + "loss": 1.5329, + "step": 7872 + }, + { + "epoch": 0.8283008942661757, + "grad_norm": 1.5374348163604736, + "learning_rate": 1.5070223308621345e-05, + "loss": 1.5764, + "step": 7873 + }, + { + "epoch": 0.8284061020515519, + "grad_norm": 1.664512038230896, + "learning_rate": 1.5052238316461753e-05, + "loss": 1.4934, + "step": 7874 + }, + { + "epoch": 0.8285113098369279, + "grad_norm": 1.5587176084518433, + "learning_rate": 1.5034263189005993e-05, + "loss": 1.8861, + "step": 7875 + }, + { + "epoch": 0.828616517622304, + "grad_norm": 1.4705950021743774, + "learning_rate": 1.5016297928341438e-05, + "loss": 1.7146, + "step": 7876 + }, + { + "epoch": 0.8287217254076802, + "grad_norm": 1.6136786937713623, + "learning_rate": 1.499834253655439e-05, + "loss": 1.3576, + "step": 7877 + }, + { + "epoch": 0.8288269331930563, + "grad_norm": 1.1220991611480713, + "learning_rate": 1.4980397015729908e-05, + "loss": 1.9862, + "step": 7878 + }, + { + "epoch": 0.8289321409784324, + "grad_norm": 1.931833267211914, + "learning_rate": 1.496246136795194e-05, + "loss": 1.7605, + "step": 7879 + }, + { + "epoch": 0.8290373487638085, + "grad_norm": 3.0278358459472656, + "learning_rate": 1.4944535595303344e-05, + "loss": 1.4795, + "step": 7880 + }, + { + "epoch": 0.8291425565491847, + "grad_norm": 2.9382994174957275, + "learning_rate": 1.492661969986574e-05, + "loss": 1.4424, + "step": 7881 + }, + { + "epoch": 0.8292477643345607, + "grad_norm": 1.7495644092559814, + "learning_rate": 1.4908713683719632e-05, + "loss": 1.7025, + "step": 7882 + }, + { + "epoch": 0.8293529721199369, + "grad_norm": 1.588752269744873, + "learning_rate": 1.4890817548944424e-05, + "loss": 0.9935, + "step": 7883 + }, + { + "epoch": 0.829458179905313, + "grad_norm": 1.3464096784591675, + "learning_rate": 1.4872931297618308e-05, + "loss": 1.6566, + "step": 7884 + }, + { + "epoch": 0.8295633876906892, + "grad_norm": 1.621984601020813, + "learning_rate": 1.4855054931818357e-05, + "loss": 1.4483, + "step": 7885 + }, + { + "epoch": 0.8296685954760652, + "grad_norm": 1.6145195960998535, + "learning_rate": 1.483718845362051e-05, + "loss": 1.6128, + "step": 7886 + }, + { + "epoch": 0.8297738032614413, + "grad_norm": 1.5069514513015747, + "learning_rate": 1.4819331865099539e-05, + "loss": 1.5058, + "step": 7887 + }, + { + "epoch": 0.8298790110468175, + "grad_norm": 2.006563425064087, + "learning_rate": 1.4801485168329066e-05, + "loss": 1.7359, + "step": 7888 + }, + { + "epoch": 0.8299842188321935, + "grad_norm": 1.1885156631469727, + "learning_rate": 1.4783648365381563e-05, + "loss": 1.1542, + "step": 7889 + }, + { + "epoch": 0.8300894266175697, + "grad_norm": 2.4285728931427, + "learning_rate": 1.4765821458328355e-05, + "loss": 1.7459, + "step": 7890 + }, + { + "epoch": 0.8301946344029458, + "grad_norm": 1.307138442993164, + "learning_rate": 1.4748004449239639e-05, + "loss": 1.1976, + "step": 7891 + }, + { + "epoch": 0.830299842188322, + "grad_norm": 2.049842357635498, + "learning_rate": 1.473019734018445e-05, + "loss": 1.3167, + "step": 7892 + }, + { + "epoch": 0.830405049973698, + "grad_norm": 2.1878762245178223, + "learning_rate": 1.471240013323063e-05, + "loss": 1.5895, + "step": 7893 + }, + { + "epoch": 0.8305102577590742, + "grad_norm": 1.3817623853683472, + "learning_rate": 1.4694612830444953e-05, + "loss": 1.8225, + "step": 7894 + }, + { + "epoch": 0.8306154655444503, + "grad_norm": 1.5477038621902466, + "learning_rate": 1.4676835433892989e-05, + "loss": 1.5813, + "step": 7895 + }, + { + "epoch": 0.8307206733298264, + "grad_norm": 2.0993480682373047, + "learning_rate": 1.4659067945639137e-05, + "loss": 2.0678, + "step": 7896 + }, + { + "epoch": 0.8308258811152025, + "grad_norm": 1.446555733680725, + "learning_rate": 1.464131036774674e-05, + "loss": 1.5909, + "step": 7897 + }, + { + "epoch": 0.8309310889005787, + "grad_norm": 1.8362712860107422, + "learning_rate": 1.4623562702277882e-05, + "loss": 1.1406, + "step": 7898 + }, + { + "epoch": 0.8310362966859548, + "grad_norm": 1.4514873027801514, + "learning_rate": 1.4605824951293524e-05, + "loss": 1.7013, + "step": 7899 + }, + { + "epoch": 0.8311415044713308, + "grad_norm": 1.1335458755493164, + "learning_rate": 1.4588097116853538e-05, + "loss": 1.2316, + "step": 7900 + }, + { + "epoch": 0.831246712256707, + "grad_norm": 1.7612231969833374, + "learning_rate": 1.4570379201016581e-05, + "loss": 1.456, + "step": 7901 + }, + { + "epoch": 0.8313519200420831, + "grad_norm": 2.277941942214966, + "learning_rate": 1.4552671205840163e-05, + "loss": 2.1247, + "step": 7902 + }, + { + "epoch": 0.8314571278274593, + "grad_norm": 1.3662701845169067, + "learning_rate": 1.4534973133380669e-05, + "loss": 1.7872, + "step": 7903 + }, + { + "epoch": 0.8315623356128353, + "grad_norm": 1.494207501411438, + "learning_rate": 1.4517284985693335e-05, + "loss": 1.7331, + "step": 7904 + }, + { + "epoch": 0.8316675433982115, + "grad_norm": 1.8795514106750488, + "learning_rate": 1.4499606764832175e-05, + "loss": 1.3668, + "step": 7905 + }, + { + "epoch": 0.8317727511835876, + "grad_norm": 1.7564284801483154, + "learning_rate": 1.4481938472850154e-05, + "loss": 1.5335, + "step": 7906 + }, + { + "epoch": 0.8318779589689637, + "grad_norm": 2.4675612449645996, + "learning_rate": 1.4464280111799e-05, + "loss": 1.1034, + "step": 7907 + }, + { + "epoch": 0.8319831667543398, + "grad_norm": 1.5577418804168701, + "learning_rate": 1.4446631683729372e-05, + "loss": 2.0499, + "step": 7908 + }, + { + "epoch": 0.832088374539716, + "grad_norm": 2.425018787384033, + "learning_rate": 1.4428993190690677e-05, + "loss": 1.6102, + "step": 7909 + }, + { + "epoch": 0.8321935823250921, + "grad_norm": 1.142414927482605, + "learning_rate": 1.4411364634731183e-05, + "loss": 1.52, + "step": 7910 + }, + { + "epoch": 0.8322987901104681, + "grad_norm": 1.444345474243164, + "learning_rate": 1.4393746017898113e-05, + "loss": 1.5863, + "step": 7911 + }, + { + "epoch": 0.8324039978958443, + "grad_norm": 1.6111942529678345, + "learning_rate": 1.4376137342237417e-05, + "loss": 1.4745, + "step": 7912 + }, + { + "epoch": 0.8325092056812204, + "grad_norm": 2.1231207847595215, + "learning_rate": 1.4358538609793915e-05, + "loss": 1.719, + "step": 7913 + }, + { + "epoch": 0.8326144134665965, + "grad_norm": 1.4172590970993042, + "learning_rate": 1.4340949822611338e-05, + "loss": 1.4065, + "step": 7914 + }, + { + "epoch": 0.8327196212519726, + "grad_norm": 1.674170970916748, + "learning_rate": 1.4323370982732176e-05, + "loss": 1.8991, + "step": 7915 + }, + { + "epoch": 0.8328248290373488, + "grad_norm": 1.4009698629379272, + "learning_rate": 1.4305802092197829e-05, + "loss": 1.5887, + "step": 7916 + }, + { + "epoch": 0.8329300368227249, + "grad_norm": 1.580376148223877, + "learning_rate": 1.4288243153048497e-05, + "loss": 2.2614, + "step": 7917 + }, + { + "epoch": 0.833035244608101, + "grad_norm": 2.047874689102173, + "learning_rate": 1.4270694167323228e-05, + "loss": 1.3469, + "step": 7918 + }, + { + "epoch": 0.8331404523934771, + "grad_norm": 2.807647705078125, + "learning_rate": 1.425315513705997e-05, + "loss": 1.8353, + "step": 7919 + }, + { + "epoch": 0.8332456601788533, + "grad_norm": 1.343658208847046, + "learning_rate": 1.4235626064295438e-05, + "loss": 1.6872, + "step": 7920 + }, + { + "epoch": 0.8333508679642293, + "grad_norm": 1.3211522102355957, + "learning_rate": 1.4218106951065224e-05, + "loss": 1.6872, + "step": 7921 + }, + { + "epoch": 0.8334560757496055, + "grad_norm": 2.710646867752075, + "learning_rate": 1.4200597799403793e-05, + "loss": 1.703, + "step": 7922 + }, + { + "epoch": 0.8335612835349816, + "grad_norm": 1.5261638164520264, + "learning_rate": 1.4183098611344415e-05, + "loss": 1.5745, + "step": 7923 + }, + { + "epoch": 0.8336664913203577, + "grad_norm": 1.4623990058898926, + "learning_rate": 1.4165609388919176e-05, + "loss": 1.606, + "step": 7924 + }, + { + "epoch": 0.8337716991057338, + "grad_norm": 1.5466994047164917, + "learning_rate": 1.414813013415911e-05, + "loss": 1.7387, + "step": 7925 + }, + { + "epoch": 0.8338769068911099, + "grad_norm": 1.6613030433654785, + "learning_rate": 1.4130660849093969e-05, + "loss": 1.8445, + "step": 7926 + }, + { + "epoch": 0.8339821146764861, + "grad_norm": 1.4952675104141235, + "learning_rate": 1.4113201535752407e-05, + "loss": 1.7024, + "step": 7927 + }, + { + "epoch": 0.8340873224618621, + "grad_norm": 1.191802740097046, + "learning_rate": 1.409575219616196e-05, + "loss": 1.56, + "step": 7928 + }, + { + "epoch": 0.8341925302472383, + "grad_norm": 1.860492467880249, + "learning_rate": 1.4078312832348938e-05, + "loss": 0.9361, + "step": 7929 + }, + { + "epoch": 0.8342977380326144, + "grad_norm": 1.4754762649536133, + "learning_rate": 1.4060883446338502e-05, + "loss": 1.5502, + "step": 7930 + }, + { + "epoch": 0.8344029458179906, + "grad_norm": 1.2694263458251953, + "learning_rate": 1.4043464040154686e-05, + "loss": 1.761, + "step": 7931 + }, + { + "epoch": 0.8345081536033666, + "grad_norm": 2.5978357791900635, + "learning_rate": 1.4026054615820317e-05, + "loss": 0.6868, + "step": 7932 + }, + { + "epoch": 0.8346133613887428, + "grad_norm": 1.792586326599121, + "learning_rate": 1.4008655175357144e-05, + "loss": 1.808, + "step": 7933 + }, + { + "epoch": 0.8347185691741189, + "grad_norm": 1.2222882509231567, + "learning_rate": 1.3991265720785685e-05, + "loss": 1.7221, + "step": 7934 + }, + { + "epoch": 0.8348237769594951, + "grad_norm": 1.4182674884796143, + "learning_rate": 1.397388625412529e-05, + "loss": 1.4725, + "step": 7935 + }, + { + "epoch": 0.8349289847448711, + "grad_norm": 1.2701834440231323, + "learning_rate": 1.3956516777394235e-05, + "loss": 1.3707, + "step": 7936 + }, + { + "epoch": 0.8350341925302472, + "grad_norm": 1.180488109588623, + "learning_rate": 1.393915729260955e-05, + "loss": 1.5807, + "step": 7937 + }, + { + "epoch": 0.8351394003156234, + "grad_norm": 2.2820448875427246, + "learning_rate": 1.3921807801787112e-05, + "loss": 1.4147, + "step": 7938 + }, + { + "epoch": 0.8352446081009994, + "grad_norm": 2.877859354019165, + "learning_rate": 1.390446830694172e-05, + "loss": 1.2865, + "step": 7939 + }, + { + "epoch": 0.8353498158863756, + "grad_norm": 2.0427865982055664, + "learning_rate": 1.3887138810086908e-05, + "loss": 1.4107, + "step": 7940 + }, + { + "epoch": 0.8354550236717517, + "grad_norm": 1.6468256711959839, + "learning_rate": 1.3869819313235077e-05, + "loss": 1.4342, + "step": 7941 + }, + { + "epoch": 0.8355602314571279, + "grad_norm": 2.2093889713287354, + "learning_rate": 1.385250981839753e-05, + "loss": 1.8815, + "step": 7942 + }, + { + "epoch": 0.8356654392425039, + "grad_norm": 1.257178783416748, + "learning_rate": 1.3835210327584348e-05, + "loss": 1.3339, + "step": 7943 + }, + { + "epoch": 0.8357706470278801, + "grad_norm": 1.405913233757019, + "learning_rate": 1.3817920842804433e-05, + "loss": 1.3401, + "step": 7944 + }, + { + "epoch": 0.8358758548132562, + "grad_norm": 2.065035820007324, + "learning_rate": 1.3800641366065604e-05, + "loss": 1.1262, + "step": 7945 + }, + { + "epoch": 0.8359810625986323, + "grad_norm": 1.2551019191741943, + "learning_rate": 1.3783371899374442e-05, + "loss": 1.651, + "step": 7946 + }, + { + "epoch": 0.8360862703840084, + "grad_norm": 1.498842477798462, + "learning_rate": 1.3766112444736368e-05, + "loss": 1.2671, + "step": 7947 + }, + { + "epoch": 0.8361914781693845, + "grad_norm": 1.7398850917816162, + "learning_rate": 1.3748863004155732e-05, + "loss": 1.9418, + "step": 7948 + }, + { + "epoch": 0.8362966859547607, + "grad_norm": 1.3350509405136108, + "learning_rate": 1.37316235796356e-05, + "loss": 1.7554, + "step": 7949 + }, + { + "epoch": 0.8364018937401367, + "grad_norm": 1.206315279006958, + "learning_rate": 1.3714394173177936e-05, + "loss": 1.4759, + "step": 7950 + }, + { + "epoch": 0.8365071015255129, + "grad_norm": 2.4317026138305664, + "learning_rate": 1.3697174786783584e-05, + "loss": 1.921, + "step": 7951 + }, + { + "epoch": 0.836612309310889, + "grad_norm": 1.54344642162323, + "learning_rate": 1.3679965422452101e-05, + "loss": 2.177, + "step": 7952 + }, + { + "epoch": 0.8367175170962651, + "grad_norm": 1.8043365478515625, + "learning_rate": 1.3662766082181999e-05, + "loss": 1.2102, + "step": 7953 + }, + { + "epoch": 0.8368227248816412, + "grad_norm": 1.4540766477584839, + "learning_rate": 1.364557676797057e-05, + "loss": 0.7462, + "step": 7954 + }, + { + "epoch": 0.8369279326670174, + "grad_norm": 1.8703304529190063, + "learning_rate": 1.3628397481813936e-05, + "loss": 1.6058, + "step": 7955 + }, + { + "epoch": 0.8370331404523935, + "grad_norm": 1.9202097654342651, + "learning_rate": 1.3611228225707107e-05, + "loss": 1.4126, + "step": 7956 + }, + { + "epoch": 0.8371383482377696, + "grad_norm": 6.3809990882873535, + "learning_rate": 1.3594069001643872e-05, + "loss": 1.2723, + "step": 7957 + }, + { + "epoch": 0.8372435560231457, + "grad_norm": 1.363297462463379, + "learning_rate": 1.3576919811616862e-05, + "loss": 1.6663, + "step": 7958 + }, + { + "epoch": 0.8373487638085219, + "grad_norm": 4.482295513153076, + "learning_rate": 1.3559780657617582e-05, + "loss": 1.5827, + "step": 7959 + }, + { + "epoch": 0.8374539715938979, + "grad_norm": 2.7441794872283936, + "learning_rate": 1.354265154163633e-05, + "loss": 1.9887, + "step": 7960 + }, + { + "epoch": 0.837559179379274, + "grad_norm": 1.571427583694458, + "learning_rate": 1.352553246566225e-05, + "loss": 1.7604, + "step": 7961 + }, + { + "epoch": 0.8376643871646502, + "grad_norm": 1.7176398038864136, + "learning_rate": 1.3508423431683337e-05, + "loss": 1.6065, + "step": 7962 + }, + { + "epoch": 0.8377695949500263, + "grad_norm": 1.1875994205474854, + "learning_rate": 1.3491324441686392e-05, + "loss": 1.7916, + "step": 7963 + }, + { + "epoch": 0.8378748027354024, + "grad_norm": 1.7343966960906982, + "learning_rate": 1.3474235497657084e-05, + "loss": 1.7253, + "step": 7964 + }, + { + "epoch": 0.8379800105207785, + "grad_norm": 2.453002452850342, + "learning_rate": 1.345715660157989e-05, + "loss": 2.2406, + "step": 7965 + }, + { + "epoch": 0.8380852183061547, + "grad_norm": 2.0220048427581787, + "learning_rate": 1.3440087755438102e-05, + "loss": 1.5343, + "step": 7966 + }, + { + "epoch": 0.8381904260915308, + "grad_norm": 2.1328377723693848, + "learning_rate": 1.3423028961213912e-05, + "loss": 2.0797, + "step": 7967 + }, + { + "epoch": 0.8382956338769069, + "grad_norm": 2.1333796977996826, + "learning_rate": 1.340598022088827e-05, + "loss": 1.8357, + "step": 7968 + }, + { + "epoch": 0.838400841662283, + "grad_norm": 1.2837255001068115, + "learning_rate": 1.338894153644098e-05, + "loss": 1.6283, + "step": 7969 + }, + { + "epoch": 0.8385060494476592, + "grad_norm": 1.9028080701828003, + "learning_rate": 1.3371912909850726e-05, + "loss": 1.2375, + "step": 7970 + }, + { + "epoch": 0.8386112572330352, + "grad_norm": 1.260032296180725, + "learning_rate": 1.335489434309496e-05, + "loss": 1.5313, + "step": 7971 + }, + { + "epoch": 0.8387164650184114, + "grad_norm": 1.5635043382644653, + "learning_rate": 1.3337885838149988e-05, + "loss": 1.5254, + "step": 7972 + }, + { + "epoch": 0.8388216728037875, + "grad_norm": 1.1617438793182373, + "learning_rate": 1.3320887396990999e-05, + "loss": 1.3332, + "step": 7973 + }, + { + "epoch": 0.8389268805891636, + "grad_norm": 1.6330403089523315, + "learning_rate": 1.3303899021591882e-05, + "loss": 1.3151, + "step": 7974 + }, + { + "epoch": 0.8390320883745397, + "grad_norm": 1.6695667505264282, + "learning_rate": 1.328692071392552e-05, + "loss": 1.3915, + "step": 7975 + }, + { + "epoch": 0.8391372961599158, + "grad_norm": 1.507834553718567, + "learning_rate": 1.3269952475963509e-05, + "loss": 1.7375, + "step": 7976 + }, + { + "epoch": 0.839242503945292, + "grad_norm": 2.011200428009033, + "learning_rate": 1.3252994309676303e-05, + "loss": 1.5429, + "step": 7977 + }, + { + "epoch": 0.839347711730668, + "grad_norm": 1.5089296102523804, + "learning_rate": 1.3236046217033237e-05, + "loss": 1.9026, + "step": 7978 + }, + { + "epoch": 0.8394529195160442, + "grad_norm": 2.053021192550659, + "learning_rate": 1.3219108200002418e-05, + "loss": 1.2597, + "step": 7979 + }, + { + "epoch": 0.8395581273014203, + "grad_norm": 2.294508457183838, + "learning_rate": 1.3202180260550778e-05, + "loss": 2.0704, + "step": 7980 + }, + { + "epoch": 0.8396633350867965, + "grad_norm": 1.4443711042404175, + "learning_rate": 1.318526240064415e-05, + "loss": 1.6057, + "step": 7981 + }, + { + "epoch": 0.8397685428721725, + "grad_norm": 1.4994503259658813, + "learning_rate": 1.3168354622247115e-05, + "loss": 1.7615, + "step": 7982 + }, + { + "epoch": 0.8398737506575487, + "grad_norm": 2.1308159828186035, + "learning_rate": 1.3151456927323113e-05, + "loss": 1.8955, + "step": 7983 + }, + { + "epoch": 0.8399789584429248, + "grad_norm": 1.4624441862106323, + "learning_rate": 1.3134569317834454e-05, + "loss": 1.7515, + "step": 7984 + }, + { + "epoch": 0.8400841662283008, + "grad_norm": 1.2271381616592407, + "learning_rate": 1.3117691795742226e-05, + "loss": 1.494, + "step": 7985 + }, + { + "epoch": 0.840189374013677, + "grad_norm": 1.1306962966918945, + "learning_rate": 1.3100824363006326e-05, + "loss": 1.977, + "step": 7986 + }, + { + "epoch": 0.8402945817990531, + "grad_norm": 1.7996954917907715, + "learning_rate": 1.3083967021585564e-05, + "loss": 1.9938, + "step": 7987 + }, + { + "epoch": 0.8403997895844293, + "grad_norm": 1.3190232515335083, + "learning_rate": 1.3067119773437498e-05, + "loss": 1.3396, + "step": 7988 + }, + { + "epoch": 0.8405049973698053, + "grad_norm": 1.5940443277359009, + "learning_rate": 1.3050282620518528e-05, + "loss": 1.1903, + "step": 7989 + }, + { + "epoch": 0.8406102051551815, + "grad_norm": 1.4561878442764282, + "learning_rate": 1.3033455564783948e-05, + "loss": 1.3508, + "step": 7990 + }, + { + "epoch": 0.8407154129405576, + "grad_norm": 1.4680160284042358, + "learning_rate": 1.3016638608187792e-05, + "loss": 1.8019, + "step": 7991 + }, + { + "epoch": 0.8408206207259337, + "grad_norm": 1.9364701509475708, + "learning_rate": 1.2999831752682955e-05, + "loss": 1.4724, + "step": 7992 + }, + { + "epoch": 0.8409258285113098, + "grad_norm": 2.371671199798584, + "learning_rate": 1.2983035000221177e-05, + "loss": 2.2839, + "step": 7993 + }, + { + "epoch": 0.841031036296686, + "grad_norm": 1.6771042346954346, + "learning_rate": 1.2966248352753018e-05, + "loss": 1.2167, + "step": 7994 + }, + { + "epoch": 0.8411362440820621, + "grad_norm": 2.0259828567504883, + "learning_rate": 1.294947181222783e-05, + "loss": 1.7223, + "step": 7995 + }, + { + "epoch": 0.8412414518674382, + "grad_norm": 1.4655014276504517, + "learning_rate": 1.2932705380593846e-05, + "loss": 1.8478, + "step": 7996 + }, + { + "epoch": 0.8413466596528143, + "grad_norm": 1.9223905801773071, + "learning_rate": 1.2915949059798038e-05, + "loss": 1.5038, + "step": 7997 + }, + { + "epoch": 0.8414518674381904, + "grad_norm": 1.4577714204788208, + "learning_rate": 1.2899202851786341e-05, + "loss": 1.5354, + "step": 7998 + }, + { + "epoch": 0.8415570752235666, + "grad_norm": 1.3747444152832031, + "learning_rate": 1.2882466758503397e-05, + "loss": 2.3442, + "step": 7999 + }, + { + "epoch": 0.8416622830089426, + "grad_norm": 1.4338715076446533, + "learning_rate": 1.2865740781892699e-05, + "loss": 1.7268, + "step": 8000 + }, + { + "epoch": 0.8417674907943188, + "grad_norm": 1.9332977533340454, + "learning_rate": 1.2849024923896612e-05, + "loss": 2.0012, + "step": 8001 + }, + { + "epoch": 0.8418726985796949, + "grad_norm": 2.2032244205474854, + "learning_rate": 1.2832319186456288e-05, + "loss": 1.7438, + "step": 8002 + }, + { + "epoch": 0.841977906365071, + "grad_norm": 2.1048672199249268, + "learning_rate": 1.281562357151167e-05, + "loss": 0.9579, + "step": 8003 + }, + { + "epoch": 0.8420831141504471, + "grad_norm": 1.7806977033615112, + "learning_rate": 1.2798938081001621e-05, + "loss": 1.4379, + "step": 8004 + }, + { + "epoch": 0.8421883219358233, + "grad_norm": 2.0925915241241455, + "learning_rate": 1.2782262716863747e-05, + "loss": 2.3933, + "step": 8005 + }, + { + "epoch": 0.8422935297211994, + "grad_norm": 2.210217237472534, + "learning_rate": 1.2765597481034475e-05, + "loss": 1.8541, + "step": 8006 + }, + { + "epoch": 0.8423987375065755, + "grad_norm": 1.8628958463668823, + "learning_rate": 1.2748942375449135e-05, + "loss": 0.738, + "step": 8007 + }, + { + "epoch": 0.8425039452919516, + "grad_norm": 1.774337649345398, + "learning_rate": 1.2732297402041793e-05, + "loss": 1.3924, + "step": 8008 + }, + { + "epoch": 0.8426091530773278, + "grad_norm": 1.3823806047439575, + "learning_rate": 1.27156625627454e-05, + "loss": 1.7851, + "step": 8009 + }, + { + "epoch": 0.8427143608627038, + "grad_norm": 1.3597822189331055, + "learning_rate": 1.2699037859491702e-05, + "loss": 1.7565, + "step": 8010 + }, + { + "epoch": 0.8428195686480799, + "grad_norm": 1.7457836866378784, + "learning_rate": 1.2682423294211231e-05, + "loss": 2.0374, + "step": 8011 + }, + { + "epoch": 0.8429247764334561, + "grad_norm": 1.9241029024124146, + "learning_rate": 1.2665818868833445e-05, + "loss": 2.1648, + "step": 8012 + }, + { + "epoch": 0.8430299842188322, + "grad_norm": 1.4577206373214722, + "learning_rate": 1.2649224585286524e-05, + "loss": 1.5166, + "step": 8013 + }, + { + "epoch": 0.8431351920042083, + "grad_norm": 1.7863467931747437, + "learning_rate": 1.263264044549748e-05, + "loss": 1.5639, + "step": 8014 + }, + { + "epoch": 0.8432403997895844, + "grad_norm": 1.8786143064498901, + "learning_rate": 1.2616066451392262e-05, + "loss": 2.2911, + "step": 8015 + }, + { + "epoch": 0.8433456075749606, + "grad_norm": 1.9232258796691895, + "learning_rate": 1.2599502604895475e-05, + "loss": 1.5803, + "step": 8016 + }, + { + "epoch": 0.8434508153603366, + "grad_norm": 1.5343316793441772, + "learning_rate": 1.2582948907930626e-05, + "loss": 1.6762, + "step": 8017 + }, + { + "epoch": 0.8435560231457128, + "grad_norm": 1.7372281551361084, + "learning_rate": 1.2566405362420086e-05, + "loss": 1.2532, + "step": 8018 + }, + { + "epoch": 0.8436612309310889, + "grad_norm": 1.6992888450622559, + "learning_rate": 1.2549871970284954e-05, + "loss": 2.2251, + "step": 8019 + }, + { + "epoch": 0.8437664387164651, + "grad_norm": 1.0523444414138794, + "learning_rate": 1.253334873344525e-05, + "loss": 1.7233, + "step": 8020 + }, + { + "epoch": 0.8438716465018411, + "grad_norm": 2.2978196144104004, + "learning_rate": 1.2516835653819725e-05, + "loss": 1.1533, + "step": 8021 + }, + { + "epoch": 0.8439768542872172, + "grad_norm": 1.5797593593597412, + "learning_rate": 1.2500332733325993e-05, + "loss": 1.2688, + "step": 8022 + }, + { + "epoch": 0.8440820620725934, + "grad_norm": 2.2443854808807373, + "learning_rate": 1.2483839973880508e-05, + "loss": 1.2121, + "step": 8023 + }, + { + "epoch": 0.8441872698579694, + "grad_norm": 1.3470818996429443, + "learning_rate": 1.2467357377398504e-05, + "loss": 1.6369, + "step": 8024 + }, + { + "epoch": 0.8442924776433456, + "grad_norm": 1.4854938983917236, + "learning_rate": 1.2450884945794017e-05, + "loss": 1.9805, + "step": 8025 + }, + { + "epoch": 0.8443976854287217, + "grad_norm": 1.5202157497406006, + "learning_rate": 1.2434422680980006e-05, + "loss": 1.9186, + "step": 8026 + }, + { + "epoch": 0.8445028932140979, + "grad_norm": 1.5458056926727295, + "learning_rate": 1.2417970584868132e-05, + "loss": 1.3289, + "step": 8027 + }, + { + "epoch": 0.8446081009994739, + "grad_norm": 1.4900844097137451, + "learning_rate": 1.2401528659368911e-05, + "loss": 1.3999, + "step": 8028 + }, + { + "epoch": 0.8447133087848501, + "grad_norm": 1.6544053554534912, + "learning_rate": 1.2385096906391746e-05, + "loss": 1.7014, + "step": 8029 + }, + { + "epoch": 0.8448185165702262, + "grad_norm": 1.7936450242996216, + "learning_rate": 1.2368675327844758e-05, + "loss": 1.373, + "step": 8030 + }, + { + "epoch": 0.8449237243556024, + "grad_norm": 1.2108367681503296, + "learning_rate": 1.2352263925634922e-05, + "loss": 1.4582, + "step": 8031 + }, + { + "epoch": 0.8450289321409784, + "grad_norm": 1.679121732711792, + "learning_rate": 1.2335862701668078e-05, + "loss": 1.4045, + "step": 8032 + }, + { + "epoch": 0.8451341399263546, + "grad_norm": 1.2412477731704712, + "learning_rate": 1.2319471657848825e-05, + "loss": 1.9746, + "step": 8033 + }, + { + "epoch": 0.8452393477117307, + "grad_norm": 1.6706655025482178, + "learning_rate": 1.2303090796080585e-05, + "loss": 1.6583, + "step": 8034 + }, + { + "epoch": 0.8453445554971067, + "grad_norm": 2.606658935546875, + "learning_rate": 1.2286720118265659e-05, + "loss": 0.7691, + "step": 8035 + }, + { + "epoch": 0.8454497632824829, + "grad_norm": 1.2284669876098633, + "learning_rate": 1.2270359626305084e-05, + "loss": 1.4472, + "step": 8036 + }, + { + "epoch": 0.845554971067859, + "grad_norm": 1.9849002361297607, + "learning_rate": 1.2254009322098759e-05, + "loss": 1.8249, + "step": 8037 + }, + { + "epoch": 0.8456601788532352, + "grad_norm": 2.283884286880493, + "learning_rate": 1.22376692075454e-05, + "loss": 1.3119, + "step": 8038 + }, + { + "epoch": 0.8457653866386112, + "grad_norm": 1.697540283203125, + "learning_rate": 1.2221339284542488e-05, + "loss": 1.223, + "step": 8039 + }, + { + "epoch": 0.8458705944239874, + "grad_norm": 2.4393091201782227, + "learning_rate": 1.220501955498643e-05, + "loss": 1.8292, + "step": 8040 + }, + { + "epoch": 0.8459758022093635, + "grad_norm": 1.4089748859405518, + "learning_rate": 1.2188710020772343e-05, + "loss": 1.5042, + "step": 8041 + }, + { + "epoch": 0.8460810099947396, + "grad_norm": 2.5239205360412598, + "learning_rate": 1.2172410683794177e-05, + "loss": 1.4657, + "step": 8042 + }, + { + "epoch": 0.8461862177801157, + "grad_norm": 2.2481822967529297, + "learning_rate": 1.2156121545944776e-05, + "loss": 1.74, + "step": 8043 + }, + { + "epoch": 0.8462914255654919, + "grad_norm": 1.7899489402770996, + "learning_rate": 1.2139842609115726e-05, + "loss": 1.798, + "step": 8044 + }, + { + "epoch": 0.846396633350868, + "grad_norm": 2.1522939205169678, + "learning_rate": 1.2123573875197402e-05, + "loss": 2.2505, + "step": 8045 + }, + { + "epoch": 0.846501841136244, + "grad_norm": 1.7058902978897095, + "learning_rate": 1.2107315346079107e-05, + "loss": 1.4551, + "step": 8046 + }, + { + "epoch": 0.8466070489216202, + "grad_norm": 1.6210920810699463, + "learning_rate": 1.209106702364885e-05, + "loss": 1.5557, + "step": 8047 + }, + { + "epoch": 0.8467122567069963, + "grad_norm": 2.2149577140808105, + "learning_rate": 1.2074828909793479e-05, + "loss": 1.0683, + "step": 8048 + }, + { + "epoch": 0.8468174644923724, + "grad_norm": 1.5407854318618774, + "learning_rate": 1.2058601006398718e-05, + "loss": 1.813, + "step": 8049 + }, + { + "epoch": 0.8469226722777485, + "grad_norm": 1.5162616968154907, + "learning_rate": 1.2042383315349037e-05, + "loss": 1.1297, + "step": 8050 + }, + { + "epoch": 0.8470278800631247, + "grad_norm": 1.622755527496338, + "learning_rate": 1.2026175838527732e-05, + "loss": 1.7975, + "step": 8051 + }, + { + "epoch": 0.8471330878485008, + "grad_norm": 1.260178565979004, + "learning_rate": 1.2009978577816949e-05, + "loss": 1.3295, + "step": 8052 + }, + { + "epoch": 0.8472382956338769, + "grad_norm": 2.4802284240722656, + "learning_rate": 1.199379153509761e-05, + "loss": 1.7802, + "step": 8053 + }, + { + "epoch": 0.847343503419253, + "grad_norm": 1.4476691484451294, + "learning_rate": 1.1977614712249441e-05, + "loss": 1.2737, + "step": 8054 + }, + { + "epoch": 0.8474487112046292, + "grad_norm": 1.3926537036895752, + "learning_rate": 1.1961448111151053e-05, + "loss": 1.7071, + "step": 8055 + }, + { + "epoch": 0.8475539189900052, + "grad_norm": 1.6615149974822998, + "learning_rate": 1.1945291733679764e-05, + "loss": 1.4381, + "step": 8056 + }, + { + "epoch": 0.8476591267753814, + "grad_norm": 1.7258626222610474, + "learning_rate": 1.192914558171181e-05, + "loss": 1.9516, + "step": 8057 + }, + { + "epoch": 0.8477643345607575, + "grad_norm": 2.5182111263275146, + "learning_rate": 1.1913009657122188e-05, + "loss": 1.4435, + "step": 8058 + }, + { + "epoch": 0.8478695423461337, + "grad_norm": 1.2161062955856323, + "learning_rate": 1.1896883961784656e-05, + "loss": 1.7092, + "step": 8059 + }, + { + "epoch": 0.8479747501315097, + "grad_norm": 1.7733458280563354, + "learning_rate": 1.1880768497571882e-05, + "loss": 1.5346, + "step": 8060 + }, + { + "epoch": 0.8480799579168858, + "grad_norm": 1.790793776512146, + "learning_rate": 1.1864663266355303e-05, + "loss": 1.3009, + "step": 8061 + }, + { + "epoch": 0.848185165702262, + "grad_norm": 1.6531035900115967, + "learning_rate": 1.1848568270005135e-05, + "loss": 1.4765, + "step": 8062 + }, + { + "epoch": 0.8482903734876381, + "grad_norm": 1.2282698154449463, + "learning_rate": 1.1832483510390469e-05, + "loss": 1.9069, + "step": 8063 + }, + { + "epoch": 0.8483955812730142, + "grad_norm": 1.9797649383544922, + "learning_rate": 1.1816408989379158e-05, + "loss": 1.4846, + "step": 8064 + }, + { + "epoch": 0.8485007890583903, + "grad_norm": 1.0080591440200806, + "learning_rate": 1.1800344708837895e-05, + "loss": 1.6405, + "step": 8065 + }, + { + "epoch": 0.8486059968437665, + "grad_norm": 1.6383482217788696, + "learning_rate": 1.178429067063217e-05, + "loss": 1.3591, + "step": 8066 + }, + { + "epoch": 0.8487112046291425, + "grad_norm": 1.3764389753341675, + "learning_rate": 1.1768246876626265e-05, + "loss": 1.0397, + "step": 8067 + }, + { + "epoch": 0.8488164124145187, + "grad_norm": 1.7834566831588745, + "learning_rate": 1.1752213328683337e-05, + "loss": 0.6507, + "step": 8068 + }, + { + "epoch": 0.8489216201998948, + "grad_norm": 1.8441131114959717, + "learning_rate": 1.1736190028665273e-05, + "loss": 1.811, + "step": 8069 + }, + { + "epoch": 0.849026827985271, + "grad_norm": 1.3411214351654053, + "learning_rate": 1.1720176978432795e-05, + "loss": 1.6706, + "step": 8070 + }, + { + "epoch": 0.849132035770647, + "grad_norm": 1.3901194334030151, + "learning_rate": 1.1704174179845496e-05, + "loss": 1.4316, + "step": 8071 + }, + { + "epoch": 0.8492372435560231, + "grad_norm": 1.992873191833496, + "learning_rate": 1.1688181634761685e-05, + "loss": 2.0151, + "step": 8072 + }, + { + "epoch": 0.8493424513413993, + "grad_norm": 1.5536729097366333, + "learning_rate": 1.1672199345038526e-05, + "loss": 1.6268, + "step": 8073 + }, + { + "epoch": 0.8494476591267753, + "grad_norm": 1.7125334739685059, + "learning_rate": 1.1656227312532009e-05, + "loss": 1.4057, + "step": 8074 + }, + { + "epoch": 0.8495528669121515, + "grad_norm": 1.3038568496704102, + "learning_rate": 1.1640265539096918e-05, + "loss": 1.768, + "step": 8075 + }, + { + "epoch": 0.8496580746975276, + "grad_norm": 1.536347508430481, + "learning_rate": 1.1624314026586802e-05, + "loss": 1.634, + "step": 8076 + }, + { + "epoch": 0.8497632824829038, + "grad_norm": 1.6387101411819458, + "learning_rate": 1.1608372776854103e-05, + "loss": 1.7797, + "step": 8077 + }, + { + "epoch": 0.8498684902682798, + "grad_norm": 1.4296340942382812, + "learning_rate": 1.1592441791750009e-05, + "loss": 1.8383, + "step": 8078 + }, + { + "epoch": 0.849973698053656, + "grad_norm": 1.9562045335769653, + "learning_rate": 1.1576521073124513e-05, + "loss": 1.4364, + "step": 8079 + }, + { + "epoch": 0.8500789058390321, + "grad_norm": 2.0499372482299805, + "learning_rate": 1.1560610622826484e-05, + "loss": 1.2246, + "step": 8080 + }, + { + "epoch": 0.8501841136244082, + "grad_norm": 2.626941442489624, + "learning_rate": 1.1544710442703488e-05, + "loss": 1.1103, + "step": 8081 + }, + { + "epoch": 0.8502893214097843, + "grad_norm": 1.4844361543655396, + "learning_rate": 1.1528820534602002e-05, + "loss": 1.3648, + "step": 8082 + }, + { + "epoch": 0.8503945291951605, + "grad_norm": 1.9580860137939453, + "learning_rate": 1.1512940900367275e-05, + "loss": 1.21, + "step": 8083 + }, + { + "epoch": 0.8504997369805366, + "grad_norm": 1.9782524108886719, + "learning_rate": 1.1497071541843306e-05, + "loss": 1.9595, + "step": 8084 + }, + { + "epoch": 0.8506049447659126, + "grad_norm": 1.1815834045410156, + "learning_rate": 1.1481212460873014e-05, + "loss": 1.5414, + "step": 8085 + }, + { + "epoch": 0.8507101525512888, + "grad_norm": 1.4871257543563843, + "learning_rate": 1.1465363659298023e-05, + "loss": 1.876, + "step": 8086 + }, + { + "epoch": 0.8508153603366649, + "grad_norm": 1.5685290098190308, + "learning_rate": 1.1449525138958805e-05, + "loss": 1.6721, + "step": 8087 + }, + { + "epoch": 0.850920568122041, + "grad_norm": 1.5825947523117065, + "learning_rate": 1.1433696901694658e-05, + "loss": 1.4306, + "step": 8088 + }, + { + "epoch": 0.8510257759074171, + "grad_norm": 1.341077208518982, + "learning_rate": 1.141787894934364e-05, + "loss": 1.7303, + "step": 8089 + }, + { + "epoch": 0.8511309836927933, + "grad_norm": 1.389611840248108, + "learning_rate": 1.1402071283742632e-05, + "loss": 1.4725, + "step": 8090 + }, + { + "epoch": 0.8512361914781694, + "grad_norm": 2.9490091800689697, + "learning_rate": 1.1386273906727363e-05, + "loss": 1.549, + "step": 8091 + }, + { + "epoch": 0.8513413992635455, + "grad_norm": 1.6228629350662231, + "learning_rate": 1.1370486820132308e-05, + "loss": 1.2005, + "step": 8092 + }, + { + "epoch": 0.8514466070489216, + "grad_norm": 1.9842947721481323, + "learning_rate": 1.1354710025790738e-05, + "loss": 1.4727, + "step": 8093 + }, + { + "epoch": 0.8515518148342978, + "grad_norm": 2.101422071456909, + "learning_rate": 1.1338943525534818e-05, + "loss": 1.7552, + "step": 8094 + }, + { + "epoch": 0.8516570226196739, + "grad_norm": 1.4584681987762451, + "learning_rate": 1.1323187321195439e-05, + "loss": 1.9696, + "step": 8095 + }, + { + "epoch": 0.85176223040505, + "grad_norm": 2.264514446258545, + "learning_rate": 1.1307441414602282e-05, + "loss": 1.3978, + "step": 8096 + }, + { + "epoch": 0.8518674381904261, + "grad_norm": 1.4211277961730957, + "learning_rate": 1.1291705807583918e-05, + "loss": 1.4239, + "step": 8097 + }, + { + "epoch": 0.8519726459758022, + "grad_norm": 2.931905508041382, + "learning_rate": 1.1275980501967642e-05, + "loss": 1.3678, + "step": 8098 + }, + { + "epoch": 0.8520778537611783, + "grad_norm": 1.608646273612976, + "learning_rate": 1.126026549957958e-05, + "loss": 1.7463, + "step": 8099 + }, + { + "epoch": 0.8521830615465544, + "grad_norm": 2.0848610401153564, + "learning_rate": 1.1244560802244686e-05, + "loss": 1.5804, + "step": 8100 + }, + { + "epoch": 0.8522882693319306, + "grad_norm": 1.426628589630127, + "learning_rate": 1.1228866411786687e-05, + "loss": 1.608, + "step": 8101 + }, + { + "epoch": 0.8523934771173067, + "grad_norm": 1.9477870464324951, + "learning_rate": 1.1213182330028104e-05, + "loss": 1.4227, + "step": 8102 + }, + { + "epoch": 0.8524986849026828, + "grad_norm": 1.8772066831588745, + "learning_rate": 1.1197508558790304e-05, + "loss": 1.3724, + "step": 8103 + }, + { + "epoch": 0.8526038926880589, + "grad_norm": 1.8329577445983887, + "learning_rate": 1.1181845099893384e-05, + "loss": 1.495, + "step": 8104 + }, + { + "epoch": 0.8527091004734351, + "grad_norm": 1.718479871749878, + "learning_rate": 1.1166191955156346e-05, + "loss": 1.5246, + "step": 8105 + }, + { + "epoch": 0.8528143082588111, + "grad_norm": 2.2497758865356445, + "learning_rate": 1.1150549126396914e-05, + "loss": 1.773, + "step": 8106 + }, + { + "epoch": 0.8529195160441873, + "grad_norm": 1.177225947380066, + "learning_rate": 1.1134916615431611e-05, + "loss": 1.7879, + "step": 8107 + }, + { + "epoch": 0.8530247238295634, + "grad_norm": 1.9921637773513794, + "learning_rate": 1.1119294424075843e-05, + "loss": 1.4187, + "step": 8108 + }, + { + "epoch": 0.8531299316149396, + "grad_norm": 2.8252499103546143, + "learning_rate": 1.1103682554143736e-05, + "loss": 1.5446, + "step": 8109 + }, + { + "epoch": 0.8532351394003156, + "grad_norm": 1.615925669670105, + "learning_rate": 1.1088081007448214e-05, + "loss": 1.9703, + "step": 8110 + }, + { + "epoch": 0.8533403471856917, + "grad_norm": 1.6888537406921387, + "learning_rate": 1.10724897858011e-05, + "loss": 1.6726, + "step": 8111 + }, + { + "epoch": 0.8534455549710679, + "grad_norm": 1.7525259256362915, + "learning_rate": 1.1056908891012884e-05, + "loss": 1.2526, + "step": 8112 + }, + { + "epoch": 0.8535507627564439, + "grad_norm": 1.5043063163757324, + "learning_rate": 1.104133832489298e-05, + "loss": 1.8318, + "step": 8113 + }, + { + "epoch": 0.8536559705418201, + "grad_norm": 1.4781932830810547, + "learning_rate": 1.1025778089249527e-05, + "loss": 1.7238, + "step": 8114 + }, + { + "epoch": 0.8537611783271962, + "grad_norm": 1.9893333911895752, + "learning_rate": 1.1010228185889449e-05, + "loss": 1.5819, + "step": 8115 + }, + { + "epoch": 0.8538663861125724, + "grad_norm": 2.0442187786102295, + "learning_rate": 1.0994688616618565e-05, + "loss": 1.9624, + "step": 8116 + }, + { + "epoch": 0.8539715938979484, + "grad_norm": 1.369695782661438, + "learning_rate": 1.09791593832414e-05, + "loss": 2.0449, + "step": 8117 + }, + { + "epoch": 0.8540768016833246, + "grad_norm": 2.589287042617798, + "learning_rate": 1.09636404875613e-05, + "loss": 1.9639, + "step": 8118 + }, + { + "epoch": 0.8541820094687007, + "grad_norm": 1.5649428367614746, + "learning_rate": 1.0948131931380457e-05, + "loss": 1.2722, + "step": 8119 + }, + { + "epoch": 0.8542872172540767, + "grad_norm": 1.4404014348983765, + "learning_rate": 1.0932633716499818e-05, + "loss": 1.7971, + "step": 8120 + }, + { + "epoch": 0.8543924250394529, + "grad_norm": 1.283900260925293, + "learning_rate": 1.0917145844719101e-05, + "loss": 1.5276, + "step": 8121 + }, + { + "epoch": 0.854497632824829, + "grad_norm": 1.41376531124115, + "learning_rate": 1.0901668317836933e-05, + "loss": 1.4866, + "step": 8122 + }, + { + "epoch": 0.8546028406102052, + "grad_norm": 1.4160473346710205, + "learning_rate": 1.088620113765061e-05, + "loss": 1.8107, + "step": 8123 + }, + { + "epoch": 0.8547080483955812, + "grad_norm": 1.5706056356430054, + "learning_rate": 1.0870744305956315e-05, + "loss": 1.4278, + "step": 8124 + }, + { + "epoch": 0.8548132561809574, + "grad_norm": 1.7656512260437012, + "learning_rate": 1.0855297824548982e-05, + "loss": 1.6278, + "step": 8125 + }, + { + "epoch": 0.8549184639663335, + "grad_norm": 1.3173682689666748, + "learning_rate": 1.0839861695222354e-05, + "loss": 1.8172, + "step": 8126 + }, + { + "epoch": 0.8550236717517097, + "grad_norm": 1.5881686210632324, + "learning_rate": 1.0824435919769005e-05, + "loss": 2.1094, + "step": 8127 + }, + { + "epoch": 0.8551288795370857, + "grad_norm": 2.1221399307250977, + "learning_rate": 1.080902049998026e-05, + "loss": 1.5415, + "step": 8128 + }, + { + "epoch": 0.8552340873224619, + "grad_norm": 2.114959955215454, + "learning_rate": 1.0793615437646254e-05, + "loss": 0.6612, + "step": 8129 + }, + { + "epoch": 0.855339295107838, + "grad_norm": 1.5936521291732788, + "learning_rate": 1.0778220734555955e-05, + "loss": 1.7225, + "step": 8130 + }, + { + "epoch": 0.855444502893214, + "grad_norm": 1.4308537244796753, + "learning_rate": 1.0762836392497078e-05, + "loss": 1.5506, + "step": 8131 + }, + { + "epoch": 0.8555497106785902, + "grad_norm": 1.9329140186309814, + "learning_rate": 1.0747462413256148e-05, + "loss": 1.2109, + "step": 8132 + }, + { + "epoch": 0.8556549184639664, + "grad_norm": 1.7347162961959839, + "learning_rate": 1.0732098798618517e-05, + "loss": 1.8518, + "step": 8133 + }, + { + "epoch": 0.8557601262493425, + "grad_norm": 1.4052585363388062, + "learning_rate": 1.07167455503683e-05, + "loss": 1.7604, + "step": 8134 + }, + { + "epoch": 0.8558653340347185, + "grad_norm": 2.470710039138794, + "learning_rate": 1.0701402670288407e-05, + "loss": 1.5943, + "step": 8135 + }, + { + "epoch": 0.8559705418200947, + "grad_norm": 1.489478349685669, + "learning_rate": 1.0686070160160588e-05, + "loss": 1.3159, + "step": 8136 + }, + { + "epoch": 0.8560757496054708, + "grad_norm": 1.2157397270202637, + "learning_rate": 1.067074802176533e-05, + "loss": 1.3678, + "step": 8137 + }, + { + "epoch": 0.8561809573908469, + "grad_norm": 1.8080435991287231, + "learning_rate": 1.0655436256881935e-05, + "loss": 1.366, + "step": 8138 + }, + { + "epoch": 0.856286165176223, + "grad_norm": 1.8651374578475952, + "learning_rate": 1.0640134867288542e-05, + "loss": 1.5744, + "step": 8139 + }, + { + "epoch": 0.8563913729615992, + "grad_norm": 1.4366014003753662, + "learning_rate": 1.0624843854762034e-05, + "loss": 1.7384, + "step": 8140 + }, + { + "epoch": 0.8564965807469753, + "grad_norm": 1.5148162841796875, + "learning_rate": 1.0609563221078079e-05, + "loss": 1.2506, + "step": 8141 + }, + { + "epoch": 0.8566017885323514, + "grad_norm": 1.3511996269226074, + "learning_rate": 1.059429296801121e-05, + "loss": 1.7989, + "step": 8142 + }, + { + "epoch": 0.8567069963177275, + "grad_norm": 1.3725030422210693, + "learning_rate": 1.05790330973347e-05, + "loss": 1.2092, + "step": 8143 + }, + { + "epoch": 0.8568122041031037, + "grad_norm": 1.480331301689148, + "learning_rate": 1.056378361082062e-05, + "loss": 1.7247, + "step": 8144 + }, + { + "epoch": 0.8569174118884797, + "grad_norm": 1.1043062210083008, + "learning_rate": 1.0548544510239833e-05, + "loss": 2.1902, + "step": 8145 + }, + { + "epoch": 0.8570226196738558, + "grad_norm": 1.8570284843444824, + "learning_rate": 1.053331579736201e-05, + "loss": 1.2777, + "step": 8146 + }, + { + "epoch": 0.857127827459232, + "grad_norm": 1.4823728799819946, + "learning_rate": 1.0518097473955624e-05, + "loss": 1.4131, + "step": 8147 + }, + { + "epoch": 0.8572330352446081, + "grad_norm": 1.5833871364593506, + "learning_rate": 1.0502889541787918e-05, + "loss": 1.4025, + "step": 8148 + }, + { + "epoch": 0.8573382430299842, + "grad_norm": 1.5487840175628662, + "learning_rate": 1.0487692002624937e-05, + "loss": 2.1141, + "step": 8149 + }, + { + "epoch": 0.8574434508153603, + "grad_norm": 2.5426361560821533, + "learning_rate": 1.0472504858231535e-05, + "loss": 1.4438, + "step": 8150 + }, + { + "epoch": 0.8575486586007365, + "grad_norm": 1.742750644683838, + "learning_rate": 1.0457328110371345e-05, + "loss": 1.3274, + "step": 8151 + }, + { + "epoch": 0.8576538663861125, + "grad_norm": 1.529963731765747, + "learning_rate": 1.0442161760806756e-05, + "loss": 1.545, + "step": 8152 + }, + { + "epoch": 0.8577590741714887, + "grad_norm": 1.5266475677490234, + "learning_rate": 1.042700581129904e-05, + "loss": 1.4656, + "step": 8153 + }, + { + "epoch": 0.8578642819568648, + "grad_norm": 2.2451913356781006, + "learning_rate": 1.0411860263608186e-05, + "loss": 1.5722, + "step": 8154 + }, + { + "epoch": 0.857969489742241, + "grad_norm": 1.2465153932571411, + "learning_rate": 1.0396725119492967e-05, + "loss": 1.718, + "step": 8155 + }, + { + "epoch": 0.858074697527617, + "grad_norm": 1.9138121604919434, + "learning_rate": 1.038160038071102e-05, + "loss": 1.7302, + "step": 8156 + }, + { + "epoch": 0.8581799053129932, + "grad_norm": 1.7178139686584473, + "learning_rate": 1.036648604901871e-05, + "loss": 2.008, + "step": 8157 + }, + { + "epoch": 0.8582851130983693, + "grad_norm": 1.064705729484558, + "learning_rate": 1.0351382126171227e-05, + "loss": 1.4776, + "step": 8158 + }, + { + "epoch": 0.8583903208837455, + "grad_norm": 2.823317766189575, + "learning_rate": 1.033628861392253e-05, + "loss": 1.6744, + "step": 8159 + }, + { + "epoch": 0.8584955286691215, + "grad_norm": 1.7388535737991333, + "learning_rate": 1.0321205514025357e-05, + "loss": 1.3214, + "step": 8160 + }, + { + "epoch": 0.8586007364544976, + "grad_norm": 2.498295783996582, + "learning_rate": 1.0306132828231318e-05, + "loss": 1.5151, + "step": 8161 + }, + { + "epoch": 0.8587059442398738, + "grad_norm": 2.1892495155334473, + "learning_rate": 1.0291070558290705e-05, + "loss": 1.1706, + "step": 8162 + }, + { + "epoch": 0.8588111520252498, + "grad_norm": 1.274142861366272, + "learning_rate": 1.027601870595265e-05, + "loss": 1.5027, + "step": 8163 + }, + { + "epoch": 0.858916359810626, + "grad_norm": 1.8496899604797363, + "learning_rate": 1.02609772729651e-05, + "loss": 1.3466, + "step": 8164 + }, + { + "epoch": 0.8590215675960021, + "grad_norm": 1.2990412712097168, + "learning_rate": 1.0245946261074769e-05, + "loss": 1.7513, + "step": 8165 + }, + { + "epoch": 0.8591267753813783, + "grad_norm": 1.4909148216247559, + "learning_rate": 1.0230925672027137e-05, + "loss": 1.5437, + "step": 8166 + }, + { + "epoch": 0.8592319831667543, + "grad_norm": 1.508446455001831, + "learning_rate": 1.0215915507566499e-05, + "loss": 1.4974, + "step": 8167 + }, + { + "epoch": 0.8593371909521305, + "grad_norm": 1.729293942451477, + "learning_rate": 1.0200915769435937e-05, + "loss": 1.8774, + "step": 8168 + }, + { + "epoch": 0.8594423987375066, + "grad_norm": 2.2102980613708496, + "learning_rate": 1.0185926459377326e-05, + "loss": 0.6269, + "step": 8169 + }, + { + "epoch": 0.8595476065228826, + "grad_norm": 1.7405325174331665, + "learning_rate": 1.017094757913134e-05, + "loss": 1.7546, + "step": 8170 + }, + { + "epoch": 0.8596528143082588, + "grad_norm": 1.6175463199615479, + "learning_rate": 1.0155979130437387e-05, + "loss": 2.0148, + "step": 8171 + }, + { + "epoch": 0.8597580220936349, + "grad_norm": 1.628763198852539, + "learning_rate": 1.0141021115033745e-05, + "loss": 1.0579, + "step": 8172 + }, + { + "epoch": 0.8598632298790111, + "grad_norm": 1.998671293258667, + "learning_rate": 1.012607353465742e-05, + "loss": 1.5527, + "step": 8173 + }, + { + "epoch": 0.8599684376643871, + "grad_norm": 2.1311984062194824, + "learning_rate": 1.0111136391044218e-05, + "loss": 1.4062, + "step": 8174 + }, + { + "epoch": 0.8600736454497633, + "grad_norm": 2.120063304901123, + "learning_rate": 1.009620968592876e-05, + "loss": 1.3496, + "step": 8175 + }, + { + "epoch": 0.8601788532351394, + "grad_norm": 1.719994306564331, + "learning_rate": 1.0081293421044435e-05, + "loss": 1.2293, + "step": 8176 + }, + { + "epoch": 0.8602840610205155, + "grad_norm": 1.8668345212936401, + "learning_rate": 1.0066387598123383e-05, + "loss": 1.7347, + "step": 8177 + }, + { + "epoch": 0.8603892688058916, + "grad_norm": 1.9415462017059326, + "learning_rate": 1.0051492218896619e-05, + "loss": 1.2794, + "step": 8178 + }, + { + "epoch": 0.8604944765912678, + "grad_norm": 2.177868604660034, + "learning_rate": 1.0036607285093857e-05, + "loss": 1.6339, + "step": 8179 + }, + { + "epoch": 0.8605996843766439, + "grad_norm": 1.627882719039917, + "learning_rate": 1.002173279844364e-05, + "loss": 1.5512, + "step": 8180 + }, + { + "epoch": 0.86070489216202, + "grad_norm": 1.8268787860870361, + "learning_rate": 1.0006868760673327e-05, + "loss": 2.4057, + "step": 8181 + }, + { + "epoch": 0.8608100999473961, + "grad_norm": 1.1629137992858887, + "learning_rate": 9.992015173508995e-06, + "loss": 1.5108, + "step": 8182 + }, + { + "epoch": 0.8609153077327723, + "grad_norm": 1.567643404006958, + "learning_rate": 9.977172038675531e-06, + "loss": 1.5915, + "step": 8183 + }, + { + "epoch": 0.8610205155181483, + "grad_norm": 1.6565831899642944, + "learning_rate": 9.962339357896666e-06, + "loss": 1.6012, + "step": 8184 + }, + { + "epoch": 0.8611257233035244, + "grad_norm": 1.5608383417129517, + "learning_rate": 9.947517132894835e-06, + "loss": 1.5782, + "step": 8185 + }, + { + "epoch": 0.8612309310889006, + "grad_norm": 1.3910160064697266, + "learning_rate": 9.932705365391293e-06, + "loss": 1.7121, + "step": 8186 + }, + { + "epoch": 0.8613361388742767, + "grad_norm": 1.9219480752944946, + "learning_rate": 9.91790405710613e-06, + "loss": 1.6089, + "step": 8187 + }, + { + "epoch": 0.8614413466596528, + "grad_norm": 1.804298996925354, + "learning_rate": 9.903113209758096e-06, + "loss": 1.7155, + "step": 8188 + }, + { + "epoch": 0.8615465544450289, + "grad_norm": 0.9984854459762573, + "learning_rate": 9.88833282506486e-06, + "loss": 1.7646, + "step": 8189 + }, + { + "epoch": 0.8616517622304051, + "grad_norm": 1.9518897533416748, + "learning_rate": 9.873562904742805e-06, + "loss": 1.1884, + "step": 8190 + }, + { + "epoch": 0.8617569700157812, + "grad_norm": 1.4349753856658936, + "learning_rate": 9.858803450507081e-06, + "loss": 1.8053, + "step": 8191 + }, + { + "epoch": 0.8618621778011573, + "grad_norm": 1.8369276523590088, + "learning_rate": 9.844054464071717e-06, + "loss": 1.9263, + "step": 8192 + }, + { + "epoch": 0.8619673855865334, + "grad_norm": 1.3132811784744263, + "learning_rate": 9.829315947149431e-06, + "loss": 1.5865, + "step": 8193 + }, + { + "epoch": 0.8620725933719096, + "grad_norm": 2.2313907146453857, + "learning_rate": 9.814587901451733e-06, + "loss": 0.9993, + "step": 8194 + }, + { + "epoch": 0.8621778011572856, + "grad_norm": 1.455952525138855, + "learning_rate": 9.799870328688988e-06, + "loss": 1.4218, + "step": 8195 + }, + { + "epoch": 0.8622830089426617, + "grad_norm": 1.646437644958496, + "learning_rate": 9.785163230570282e-06, + "loss": 1.7, + "step": 8196 + }, + { + "epoch": 0.8623882167280379, + "grad_norm": 1.9158176183700562, + "learning_rate": 9.770466608803475e-06, + "loss": 2.0093, + "step": 8197 + }, + { + "epoch": 0.862493424513414, + "grad_norm": 1.696004033088684, + "learning_rate": 9.755780465095287e-06, + "loss": 1.4588, + "step": 8198 + }, + { + "epoch": 0.8625986322987901, + "grad_norm": 1.7793548107147217, + "learning_rate": 9.741104801151146e-06, + "loss": 1.8082, + "step": 8199 + }, + { + "epoch": 0.8627038400841662, + "grad_norm": 2.2487435340881348, + "learning_rate": 9.726439618675276e-06, + "loss": 1.5888, + "step": 8200 + }, + { + "epoch": 0.8628090478695424, + "grad_norm": 1.4217240810394287, + "learning_rate": 9.711784919370715e-06, + "loss": 1.4924, + "step": 8201 + }, + { + "epoch": 0.8629142556549184, + "grad_norm": 1.571770191192627, + "learning_rate": 9.697140704939245e-06, + "loss": 1.2066, + "step": 8202 + }, + { + "epoch": 0.8630194634402946, + "grad_norm": 1.2527103424072266, + "learning_rate": 9.682506977081496e-06, + "loss": 1.5845, + "step": 8203 + }, + { + "epoch": 0.8631246712256707, + "grad_norm": 1.499109148979187, + "learning_rate": 9.667883737496786e-06, + "loss": 0.9014, + "step": 8204 + }, + { + "epoch": 0.8632298790110469, + "grad_norm": 2.3821563720703125, + "learning_rate": 9.653270987883267e-06, + "loss": 1.6248, + "step": 8205 + }, + { + "epoch": 0.8633350867964229, + "grad_norm": 1.5363044738769531, + "learning_rate": 9.638668729937905e-06, + "loss": 1.9727, + "step": 8206 + }, + { + "epoch": 0.863440294581799, + "grad_norm": 1.1496707201004028, + "learning_rate": 9.624076965356388e-06, + "loss": 1.8011, + "step": 8207 + }, + { + "epoch": 0.8635455023671752, + "grad_norm": 1.7943934202194214, + "learning_rate": 9.609495695833216e-06, + "loss": 1.4859, + "step": 8208 + }, + { + "epoch": 0.8636507101525512, + "grad_norm": 2.865455389022827, + "learning_rate": 9.594924923061655e-06, + "loss": 1.6245, + "step": 8209 + }, + { + "epoch": 0.8637559179379274, + "grad_norm": 1.4639524221420288, + "learning_rate": 9.580364648733775e-06, + "loss": 1.8505, + "step": 8210 + }, + { + "epoch": 0.8638611257233035, + "grad_norm": 1.4049338102340698, + "learning_rate": 9.56581487454038e-06, + "loss": 1.5261, + "step": 8211 + }, + { + "epoch": 0.8639663335086797, + "grad_norm": 1.3107516765594482, + "learning_rate": 9.551275602171127e-06, + "loss": 1.493, + "step": 8212 + }, + { + "epoch": 0.8640715412940557, + "grad_norm": 1.295406460762024, + "learning_rate": 9.53674683331438e-06, + "loss": 1.3942, + "step": 8213 + }, + { + "epoch": 0.8641767490794319, + "grad_norm": 1.5618852376937866, + "learning_rate": 9.522228569657343e-06, + "loss": 2.2107, + "step": 8214 + }, + { + "epoch": 0.864281956864808, + "grad_norm": 1.705565094947815, + "learning_rate": 9.507720812885978e-06, + "loss": 1.8339, + "step": 8215 + }, + { + "epoch": 0.8643871646501841, + "grad_norm": 2.110238552093506, + "learning_rate": 9.493223564684994e-06, + "loss": 1.6849, + "step": 8216 + }, + { + "epoch": 0.8644923724355602, + "grad_norm": 1.721220850944519, + "learning_rate": 9.478736826737944e-06, + "loss": 1.3985, + "step": 8217 + }, + { + "epoch": 0.8645975802209364, + "grad_norm": 1.481402039527893, + "learning_rate": 9.464260600727104e-06, + "loss": 1.7841, + "step": 8218 + }, + { + "epoch": 0.8647027880063125, + "grad_norm": 2.818570852279663, + "learning_rate": 9.44979488833353e-06, + "loss": 1.3507, + "step": 8219 + }, + { + "epoch": 0.8648079957916885, + "grad_norm": 1.7059146165847778, + "learning_rate": 9.435339691237121e-06, + "loss": 1.4943, + "step": 8220 + }, + { + "epoch": 0.8649132035770647, + "grad_norm": 1.5640789270401, + "learning_rate": 9.420895011116492e-06, + "loss": 1.7849, + "step": 8221 + }, + { + "epoch": 0.8650184113624408, + "grad_norm": 2.26628041267395, + "learning_rate": 9.406460849649045e-06, + "loss": 1.6047, + "step": 8222 + }, + { + "epoch": 0.865123619147817, + "grad_norm": 1.201790452003479, + "learning_rate": 9.392037208510996e-06, + "loss": 1.8968, + "step": 8223 + }, + { + "epoch": 0.865228826933193, + "grad_norm": 1.8288311958312988, + "learning_rate": 9.37762408937729e-06, + "loss": 0.9009, + "step": 8224 + }, + { + "epoch": 0.8653340347185692, + "grad_norm": 1.5502636432647705, + "learning_rate": 9.36322149392168e-06, + "loss": 1.1525, + "step": 8225 + }, + { + "epoch": 0.8654392425039453, + "grad_norm": 1.8618223667144775, + "learning_rate": 9.348829423816718e-06, + "loss": 1.6285, + "step": 8226 + }, + { + "epoch": 0.8655444502893214, + "grad_norm": 1.6962296962738037, + "learning_rate": 9.334447880733676e-06, + "loss": 2.1117, + "step": 8227 + }, + { + "epoch": 0.8656496580746975, + "grad_norm": 1.1905288696289062, + "learning_rate": 9.320076866342642e-06, + "loss": 1.6631, + "step": 8228 + }, + { + "epoch": 0.8657548658600737, + "grad_norm": 1.2927476167678833, + "learning_rate": 9.30571638231249e-06, + "loss": 2.1606, + "step": 8229 + }, + { + "epoch": 0.8658600736454498, + "grad_norm": 1.4459666013717651, + "learning_rate": 9.291366430310844e-06, + "loss": 1.3559, + "step": 8230 + }, + { + "epoch": 0.8659652814308259, + "grad_norm": 1.827365517616272, + "learning_rate": 9.277027012004125e-06, + "loss": 1.6095, + "step": 8231 + }, + { + "epoch": 0.866070489216202, + "grad_norm": 1.9253352880477905, + "learning_rate": 9.262698129057512e-06, + "loss": 1.2613, + "step": 8232 + }, + { + "epoch": 0.8661756970015781, + "grad_norm": 1.1855809688568115, + "learning_rate": 9.248379783134952e-06, + "loss": 1.8954, + "step": 8233 + }, + { + "epoch": 0.8662809047869542, + "grad_norm": 2.3653783798217773, + "learning_rate": 9.234071975899228e-06, + "loss": 1.2972, + "step": 8234 + }, + { + "epoch": 0.8663861125723303, + "grad_norm": 1.508872389793396, + "learning_rate": 9.21977470901184e-06, + "loss": 1.6728, + "step": 8235 + }, + { + "epoch": 0.8664913203577065, + "grad_norm": 1.531275749206543, + "learning_rate": 9.205487984133076e-06, + "loss": 1.2766, + "step": 8236 + }, + { + "epoch": 0.8665965281430826, + "grad_norm": 1.4290529489517212, + "learning_rate": 9.191211802922017e-06, + "loss": 1.7471, + "step": 8237 + }, + { + "epoch": 0.8667017359284587, + "grad_norm": 1.9806660413742065, + "learning_rate": 9.176946167036516e-06, + "loss": 1.0977, + "step": 8238 + }, + { + "epoch": 0.8668069437138348, + "grad_norm": 1.909472107887268, + "learning_rate": 9.162691078133157e-06, + "loss": 1.8081, + "step": 8239 + }, + { + "epoch": 0.866912151499211, + "grad_norm": 2.2242584228515625, + "learning_rate": 9.148446537867383e-06, + "loss": 1.6033, + "step": 8240 + }, + { + "epoch": 0.867017359284587, + "grad_norm": 1.3341119289398193, + "learning_rate": 9.134212547893351e-06, + "loss": 1.9168, + "step": 8241 + }, + { + "epoch": 0.8671225670699632, + "grad_norm": 1.836234211921692, + "learning_rate": 9.11998910986398e-06, + "loss": 2.2142, + "step": 8242 + }, + { + "epoch": 0.8672277748553393, + "grad_norm": 1.667237401008606, + "learning_rate": 9.105776225431029e-06, + "loss": 1.3457, + "step": 8243 + }, + { + "epoch": 0.8673329826407155, + "grad_norm": 1.6611175537109375, + "learning_rate": 9.091573896244976e-06, + "loss": 1.3769, + "step": 8244 + }, + { + "epoch": 0.8674381904260915, + "grad_norm": 1.894445776939392, + "learning_rate": 9.07738212395508e-06, + "loss": 1.6441, + "step": 8245 + }, + { + "epoch": 0.8675433982114676, + "grad_norm": 1.9377069473266602, + "learning_rate": 9.063200910209413e-06, + "loss": 1.4411, + "step": 8246 + }, + { + "epoch": 0.8676486059968438, + "grad_norm": 1.5607261657714844, + "learning_rate": 9.049030256654777e-06, + "loss": 1.5803, + "step": 8247 + }, + { + "epoch": 0.8677538137822198, + "grad_norm": 1.2471551895141602, + "learning_rate": 9.034870164936737e-06, + "loss": 1.4342, + "step": 8248 + }, + { + "epoch": 0.867859021567596, + "grad_norm": 1.3888238668441772, + "learning_rate": 9.020720636699709e-06, + "loss": 1.5969, + "step": 8249 + }, + { + "epoch": 0.8679642293529721, + "grad_norm": 3.419867753982544, + "learning_rate": 9.006581673586789e-06, + "loss": 1.5199, + "step": 8250 + }, + { + "epoch": 0.8680694371383483, + "grad_norm": 1.7983578443527222, + "learning_rate": 8.992453277239942e-06, + "loss": 1.5565, + "step": 8251 + }, + { + "epoch": 0.8681746449237243, + "grad_norm": 1.5577338933944702, + "learning_rate": 8.978335449299791e-06, + "loss": 1.5991, + "step": 8252 + }, + { + "epoch": 0.8682798527091005, + "grad_norm": 2.433664321899414, + "learning_rate": 8.9642281914058e-06, + "loss": 1.3975, + "step": 8253 + }, + { + "epoch": 0.8683850604944766, + "grad_norm": 1.6767748594284058, + "learning_rate": 8.950131505196236e-06, + "loss": 1.9497, + "step": 8254 + }, + { + "epoch": 0.8684902682798528, + "grad_norm": 1.431541919708252, + "learning_rate": 8.936045392308079e-06, + "loss": 2.1185, + "step": 8255 + }, + { + "epoch": 0.8685954760652288, + "grad_norm": 2.1313490867614746, + "learning_rate": 8.921969854377088e-06, + "loss": 1.7141, + "step": 8256 + }, + { + "epoch": 0.868700683850605, + "grad_norm": 1.8773490190505981, + "learning_rate": 8.907904893037833e-06, + "loss": 1.592, + "step": 8257 + }, + { + "epoch": 0.8688058916359811, + "grad_norm": 1.4465880393981934, + "learning_rate": 8.893850509923619e-06, + "loss": 1.3952, + "step": 8258 + }, + { + "epoch": 0.8689110994213571, + "grad_norm": 1.6788699626922607, + "learning_rate": 8.87980670666655e-06, + "loss": 1.2683, + "step": 8259 + }, + { + "epoch": 0.8690163072067333, + "grad_norm": 1.4873356819152832, + "learning_rate": 8.865773484897477e-06, + "loss": 2.0945, + "step": 8260 + }, + { + "epoch": 0.8691215149921094, + "grad_norm": 1.6656405925750732, + "learning_rate": 8.85175084624602e-06, + "loss": 1.4234, + "step": 8261 + }, + { + "epoch": 0.8692267227774856, + "grad_norm": 1.3971835374832153, + "learning_rate": 8.8377387923406e-06, + "loss": 1.4474, + "step": 8262 + }, + { + "epoch": 0.8693319305628616, + "grad_norm": 1.494502305984497, + "learning_rate": 8.82373732480839e-06, + "loss": 1.8342, + "step": 8263 + }, + { + "epoch": 0.8694371383482378, + "grad_norm": 1.7074618339538574, + "learning_rate": 8.809746445275312e-06, + "loss": 1.4596, + "step": 8264 + }, + { + "epoch": 0.8695423461336139, + "grad_norm": 2.2928879261016846, + "learning_rate": 8.795766155366114e-06, + "loss": 1.4602, + "step": 8265 + }, + { + "epoch": 0.86964755391899, + "grad_norm": 1.632634162902832, + "learning_rate": 8.781796456704262e-06, + "loss": 1.6407, + "step": 8266 + }, + { + "epoch": 0.8697527617043661, + "grad_norm": 1.7646962404251099, + "learning_rate": 8.767837350912e-06, + "loss": 1.6187, + "step": 8267 + }, + { + "epoch": 0.8698579694897423, + "grad_norm": 1.2554585933685303, + "learning_rate": 8.75388883961038e-06, + "loss": 1.3565, + "step": 8268 + }, + { + "epoch": 0.8699631772751184, + "grad_norm": 2.188851833343506, + "learning_rate": 8.739950924419183e-06, + "loss": 1.4143, + "step": 8269 + }, + { + "epoch": 0.8700683850604944, + "grad_norm": 1.2379424571990967, + "learning_rate": 8.726023606956956e-06, + "loss": 2.0142, + "step": 8270 + }, + { + "epoch": 0.8701735928458706, + "grad_norm": 1.5064557790756226, + "learning_rate": 8.712106888841064e-06, + "loss": 1.8644, + "step": 8271 + }, + { + "epoch": 0.8702788006312467, + "grad_norm": 2.4556884765625, + "learning_rate": 8.698200771687592e-06, + "loss": 1.1037, + "step": 8272 + }, + { + "epoch": 0.8703840084166228, + "grad_norm": 0.9739543199539185, + "learning_rate": 8.684305257111425e-06, + "loss": 1.5688, + "step": 8273 + }, + { + "epoch": 0.8704892162019989, + "grad_norm": 1.589955449104309, + "learning_rate": 8.670420346726182e-06, + "loss": 1.9209, + "step": 8274 + }, + { + "epoch": 0.8705944239873751, + "grad_norm": 1.278948187828064, + "learning_rate": 8.656546042144275e-06, + "loss": 1.406, + "step": 8275 + }, + { + "epoch": 0.8706996317727512, + "grad_norm": 1.640868067741394, + "learning_rate": 8.642682344976904e-06, + "loss": 1.4179, + "step": 8276 + }, + { + "epoch": 0.8708048395581273, + "grad_norm": 1.3409028053283691, + "learning_rate": 8.628829256833992e-06, + "loss": 1.4, + "step": 8277 + }, + { + "epoch": 0.8709100473435034, + "grad_norm": 1.4325164556503296, + "learning_rate": 8.614986779324252e-06, + "loss": 1.6548, + "step": 8278 + }, + { + "epoch": 0.8710152551288796, + "grad_norm": 2.271646022796631, + "learning_rate": 8.601154914055187e-06, + "loss": 1.3482, + "step": 8279 + }, + { + "epoch": 0.8711204629142556, + "grad_norm": 1.5409026145935059, + "learning_rate": 8.587333662633035e-06, + "loss": 1.5707, + "step": 8280 + }, + { + "epoch": 0.8712256706996317, + "grad_norm": 1.1222127676010132, + "learning_rate": 8.57352302666279e-06, + "loss": 1.4839, + "step": 8281 + }, + { + "epoch": 0.8713308784850079, + "grad_norm": 1.5568034648895264, + "learning_rate": 8.559723007748278e-06, + "loss": 1.5636, + "step": 8282 + }, + { + "epoch": 0.871436086270384, + "grad_norm": 1.717877984046936, + "learning_rate": 8.545933607492019e-06, + "loss": 1.3919, + "step": 8283 + }, + { + "epoch": 0.8715412940557601, + "grad_norm": 1.7419726848602295, + "learning_rate": 8.53215482749532e-06, + "loss": 1.164, + "step": 8284 + }, + { + "epoch": 0.8716465018411362, + "grad_norm": 1.5329818725585938, + "learning_rate": 8.518386669358313e-06, + "loss": 1.5617, + "step": 8285 + }, + { + "epoch": 0.8717517096265124, + "grad_norm": 1.6898797750473022, + "learning_rate": 8.50462913467982e-06, + "loss": 1.2344, + "step": 8286 + }, + { + "epoch": 0.8718569174118885, + "grad_norm": 2.2260582447052, + "learning_rate": 8.490882225057428e-06, + "loss": 1.6568, + "step": 8287 + }, + { + "epoch": 0.8719621251972646, + "grad_norm": 1.553295612335205, + "learning_rate": 8.477145942087583e-06, + "loss": 1.0574, + "step": 8288 + }, + { + "epoch": 0.8720673329826407, + "grad_norm": 1.8379864692687988, + "learning_rate": 8.463420287365386e-06, + "loss": 1.7292, + "step": 8289 + }, + { + "epoch": 0.8721725407680169, + "grad_norm": 2.359562873840332, + "learning_rate": 8.449705262484763e-06, + "loss": 1.2984, + "step": 8290 + }, + { + "epoch": 0.8722777485533929, + "grad_norm": 1.2469983100891113, + "learning_rate": 8.436000869038418e-06, + "loss": 1.3601, + "step": 8291 + }, + { + "epoch": 0.8723829563387691, + "grad_norm": 1.5775099992752075, + "learning_rate": 8.422307108617777e-06, + "loss": 1.3026, + "step": 8292 + }, + { + "epoch": 0.8724881641241452, + "grad_norm": 1.6676642894744873, + "learning_rate": 8.408623982813036e-06, + "loss": 1.1921, + "step": 8293 + }, + { + "epoch": 0.8725933719095214, + "grad_norm": 1.9857598543167114, + "learning_rate": 8.39495149321322e-06, + "loss": 1.7544, + "step": 8294 + }, + { + "epoch": 0.8726985796948974, + "grad_norm": 2.608306407928467, + "learning_rate": 8.381289641405998e-06, + "loss": 1.2208, + "step": 8295 + }, + { + "epoch": 0.8728037874802735, + "grad_norm": 1.3763883113861084, + "learning_rate": 8.367638428977942e-06, + "loss": 1.5372, + "step": 8296 + }, + { + "epoch": 0.8729089952656497, + "grad_norm": 1.8553149700164795, + "learning_rate": 8.353997857514296e-06, + "loss": 1.6343, + "step": 8297 + }, + { + "epoch": 0.8730142030510257, + "grad_norm": 2.202342987060547, + "learning_rate": 8.34036792859908e-06, + "loss": 1.5843, + "step": 8298 + }, + { + "epoch": 0.8731194108364019, + "grad_norm": 2.3540780544281006, + "learning_rate": 8.32674864381513e-06, + "loss": 1.5165, + "step": 8299 + }, + { + "epoch": 0.873224618621778, + "grad_norm": 2.0896689891815186, + "learning_rate": 8.31314000474398e-06, + "loss": 1.613, + "step": 8300 + }, + { + "epoch": 0.8733298264071542, + "grad_norm": 1.7080516815185547, + "learning_rate": 8.299542012965944e-06, + "loss": 1.3788, + "step": 8301 + }, + { + "epoch": 0.8734350341925302, + "grad_norm": 1.593510627746582, + "learning_rate": 8.285954670060159e-06, + "loss": 1.6948, + "step": 8302 + }, + { + "epoch": 0.8735402419779064, + "grad_norm": 2.1248574256896973, + "learning_rate": 8.272377977604439e-06, + "loss": 1.6842, + "step": 8303 + }, + { + "epoch": 0.8736454497632825, + "grad_norm": 1.1334164142608643, + "learning_rate": 8.258811937175403e-06, + "loss": 2.0723, + "step": 8304 + }, + { + "epoch": 0.8737506575486585, + "grad_norm": 1.6127103567123413, + "learning_rate": 8.245256550348456e-06, + "loss": 1.4466, + "step": 8305 + }, + { + "epoch": 0.8738558653340347, + "grad_norm": 1.4904117584228516, + "learning_rate": 8.231711818697708e-06, + "loss": 1.4292, + "step": 8306 + }, + { + "epoch": 0.8739610731194108, + "grad_norm": 1.7392334938049316, + "learning_rate": 8.218177743796096e-06, + "loss": 2.2474, + "step": 8307 + }, + { + "epoch": 0.874066280904787, + "grad_norm": 2.0337204933166504, + "learning_rate": 8.204654327215267e-06, + "loss": 1.369, + "step": 8308 + }, + { + "epoch": 0.874171488690163, + "grad_norm": 1.8506286144256592, + "learning_rate": 8.19114157052564e-06, + "loss": 1.587, + "step": 8309 + }, + { + "epoch": 0.8742766964755392, + "grad_norm": 1.861567497253418, + "learning_rate": 8.177639475296451e-06, + "loss": 0.9129, + "step": 8310 + }, + { + "epoch": 0.8743819042609153, + "grad_norm": 1.4691027402877808, + "learning_rate": 8.16414804309562e-06, + "loss": 1.5196, + "step": 8311 + }, + { + "epoch": 0.8744871120462914, + "grad_norm": 1.7774051427841187, + "learning_rate": 8.150667275489842e-06, + "loss": 2.4965, + "step": 8312 + }, + { + "epoch": 0.8745923198316675, + "grad_norm": 1.6832690238952637, + "learning_rate": 8.137197174044653e-06, + "loss": 1.4504, + "step": 8313 + }, + { + "epoch": 0.8746975276170437, + "grad_norm": 1.254486083984375, + "learning_rate": 8.123737740324256e-06, + "loss": 1.5565, + "step": 8314 + }, + { + "epoch": 0.8748027354024198, + "grad_norm": 2.1630659103393555, + "learning_rate": 8.110288975891634e-06, + "loss": 1.232, + "step": 8315 + }, + { + "epoch": 0.8749079431877959, + "grad_norm": 1.3778032064437866, + "learning_rate": 8.096850882308593e-06, + "loss": 1.517, + "step": 8316 + }, + { + "epoch": 0.875013150973172, + "grad_norm": 2.6462600231170654, + "learning_rate": 8.083423461135608e-06, + "loss": 1.6253, + "step": 8317 + }, + { + "epoch": 0.8751183587585482, + "grad_norm": 2.1237175464630127, + "learning_rate": 8.070006713931988e-06, + "loss": 1.6068, + "step": 8318 + }, + { + "epoch": 0.8752235665439243, + "grad_norm": 1.620760202407837, + "learning_rate": 8.056600642255773e-06, + "loss": 1.4239, + "step": 8319 + }, + { + "epoch": 0.8753287743293003, + "grad_norm": 1.3363070487976074, + "learning_rate": 8.043205247663755e-06, + "loss": 1.7418, + "step": 8320 + }, + { + "epoch": 0.8754339821146765, + "grad_norm": 1.1547250747680664, + "learning_rate": 8.029820531711518e-06, + "loss": 1.7182, + "step": 8321 + }, + { + "epoch": 0.8755391899000526, + "grad_norm": 1.5865908861160278, + "learning_rate": 8.016446495953367e-06, + "loss": 2.1783, + "step": 8322 + }, + { + "epoch": 0.8756443976854287, + "grad_norm": 1.7956304550170898, + "learning_rate": 8.00308314194238e-06, + "loss": 1.5709, + "step": 8323 + }, + { + "epoch": 0.8757496054708048, + "grad_norm": 1.6148009300231934, + "learning_rate": 7.989730471230417e-06, + "loss": 1.8065, + "step": 8324 + }, + { + "epoch": 0.875854813256181, + "grad_norm": 1.6082184314727783, + "learning_rate": 7.97638848536808e-06, + "loss": 1.824, + "step": 8325 + }, + { + "epoch": 0.8759600210415571, + "grad_norm": 1.5092358589172363, + "learning_rate": 7.963057185904698e-06, + "loss": 1.1529, + "step": 8326 + }, + { + "epoch": 0.8760652288269332, + "grad_norm": 1.4924912452697754, + "learning_rate": 7.949736574388433e-06, + "loss": 1.3965, + "step": 8327 + }, + { + "epoch": 0.8761704366123093, + "grad_norm": 1.484432339668274, + "learning_rate": 7.936426652366147e-06, + "loss": 1.5392, + "step": 8328 + }, + { + "epoch": 0.8762756443976855, + "grad_norm": 1.8063507080078125, + "learning_rate": 7.923127421383458e-06, + "loss": 1.7397, + "step": 8329 + }, + { + "epoch": 0.8763808521830615, + "grad_norm": 1.7138848304748535, + "learning_rate": 7.909838882984799e-06, + "loss": 1.9147, + "step": 8330 + }, + { + "epoch": 0.8764860599684376, + "grad_norm": 1.6054270267486572, + "learning_rate": 7.896561038713302e-06, + "loss": 1.7124, + "step": 8331 + }, + { + "epoch": 0.8765912677538138, + "grad_norm": 1.4371368885040283, + "learning_rate": 7.883293890110865e-06, + "loss": 1.3358, + "step": 8332 + }, + { + "epoch": 0.87669647553919, + "grad_norm": 1.3768408298492432, + "learning_rate": 7.870037438718191e-06, + "loss": 1.4805, + "step": 8333 + }, + { + "epoch": 0.876801683324566, + "grad_norm": 1.5090663433074951, + "learning_rate": 7.856791686074694e-06, + "loss": 1.819, + "step": 8334 + }, + { + "epoch": 0.8769068911099421, + "grad_norm": 1.7968672513961792, + "learning_rate": 7.84355663371854e-06, + "loss": 1.5492, + "step": 8335 + }, + { + "epoch": 0.8770120988953183, + "grad_norm": 1.5424824953079224, + "learning_rate": 7.830332283186714e-06, + "loss": 1.8034, + "step": 8336 + }, + { + "epoch": 0.8771173066806943, + "grad_norm": 1.5214895009994507, + "learning_rate": 7.817118636014886e-06, + "loss": 1.3005, + "step": 8337 + }, + { + "epoch": 0.8772225144660705, + "grad_norm": 1.3257561922073364, + "learning_rate": 7.803915693737518e-06, + "loss": 1.3967, + "step": 8338 + }, + { + "epoch": 0.8773277222514466, + "grad_norm": 2.690847635269165, + "learning_rate": 7.790723457887828e-06, + "loss": 1.3393, + "step": 8339 + }, + { + "epoch": 0.8774329300368228, + "grad_norm": 1.336316466331482, + "learning_rate": 7.777541929997766e-06, + "loss": 1.6733, + "step": 8340 + }, + { + "epoch": 0.8775381378221988, + "grad_norm": 1.7110776901245117, + "learning_rate": 7.7643711115981e-06, + "loss": 1.7544, + "step": 8341 + }, + { + "epoch": 0.877643345607575, + "grad_norm": 1.3267966508865356, + "learning_rate": 7.751211004218295e-06, + "loss": 1.7771, + "step": 8342 + }, + { + "epoch": 0.8777485533929511, + "grad_norm": 1.5423023700714111, + "learning_rate": 7.73806160938656e-06, + "loss": 1.3971, + "step": 8343 + }, + { + "epoch": 0.8778537611783271, + "grad_norm": 1.562389850616455, + "learning_rate": 7.724922928629941e-06, + "loss": 1.2311, + "step": 8344 + }, + { + "epoch": 0.8779589689637033, + "grad_norm": 1.6092841625213623, + "learning_rate": 7.711794963474173e-06, + "loss": 2.0053, + "step": 8345 + }, + { + "epoch": 0.8780641767490794, + "grad_norm": 1.8974779844284058, + "learning_rate": 7.698677715443736e-06, + "loss": 1.2428, + "step": 8346 + }, + { + "epoch": 0.8781693845344556, + "grad_norm": 1.8776745796203613, + "learning_rate": 7.685571186061934e-06, + "loss": 1.9842, + "step": 8347 + }, + { + "epoch": 0.8782745923198316, + "grad_norm": 1.4398527145385742, + "learning_rate": 7.672475376850764e-06, + "loss": 1.878, + "step": 8348 + }, + { + "epoch": 0.8783798001052078, + "grad_norm": 1.1596499681472778, + "learning_rate": 7.65939028933098e-06, + "loss": 1.6491, + "step": 8349 + }, + { + "epoch": 0.8784850078905839, + "grad_norm": 2.233144760131836, + "learning_rate": 7.646315925022152e-06, + "loss": 1.5677, + "step": 8350 + }, + { + "epoch": 0.8785902156759601, + "grad_norm": 2.088862419128418, + "learning_rate": 7.633252285442526e-06, + "loss": 1.6353, + "step": 8351 + }, + { + "epoch": 0.8786954234613361, + "grad_norm": 1.5038179159164429, + "learning_rate": 7.620199372109172e-06, + "loss": 2.0619, + "step": 8352 + }, + { + "epoch": 0.8788006312467123, + "grad_norm": 1.6128473281860352, + "learning_rate": 7.607157186537872e-06, + "loss": 1.9283, + "step": 8353 + }, + { + "epoch": 0.8789058390320884, + "grad_norm": 1.4330729246139526, + "learning_rate": 7.59412573024314e-06, + "loss": 1.757, + "step": 8354 + }, + { + "epoch": 0.8790110468174644, + "grad_norm": 1.7641589641571045, + "learning_rate": 7.581105004738321e-06, + "loss": 1.8615, + "step": 8355 + }, + { + "epoch": 0.8791162546028406, + "grad_norm": 1.5351738929748535, + "learning_rate": 7.568095011535448e-06, + "loss": 1.5014, + "step": 8356 + }, + { + "epoch": 0.8792214623882167, + "grad_norm": 1.7756725549697876, + "learning_rate": 7.555095752145313e-06, + "loss": 1.4319, + "step": 8357 + }, + { + "epoch": 0.8793266701735929, + "grad_norm": 1.6346051692962646, + "learning_rate": 7.542107228077533e-06, + "loss": 1.2158, + "step": 8358 + }, + { + "epoch": 0.8794318779589689, + "grad_norm": 2.2152860164642334, + "learning_rate": 7.529129440840355e-06, + "loss": 1.389, + "step": 8359 + }, + { + "epoch": 0.8795370857443451, + "grad_norm": 1.4864938259124756, + "learning_rate": 7.516162391940873e-06, + "loss": 0.8488, + "step": 8360 + }, + { + "epoch": 0.8796422935297212, + "grad_norm": 2.238067626953125, + "learning_rate": 7.503206082884917e-06, + "loss": 1.8396, + "step": 8361 + }, + { + "epoch": 0.8797475013150973, + "grad_norm": 1.6795562505722046, + "learning_rate": 7.4902605151770385e-06, + "loss": 1.9979, + "step": 8362 + }, + { + "epoch": 0.8798527091004734, + "grad_norm": 2.297811269760132, + "learning_rate": 7.477325690320602e-06, + "loss": 1.9526, + "step": 8363 + }, + { + "epoch": 0.8799579168858496, + "grad_norm": 2.567331075668335, + "learning_rate": 7.4644016098176615e-06, + "loss": 1.5925, + "step": 8364 + }, + { + "epoch": 0.8800631246712257, + "grad_norm": 1.6319959163665771, + "learning_rate": 7.451488275169028e-06, + "loss": 1.9061, + "step": 8365 + }, + { + "epoch": 0.8801683324566018, + "grad_norm": 2.1503756046295166, + "learning_rate": 7.438585687874333e-06, + "loss": 1.4737, + "step": 8366 + }, + { + "epoch": 0.8802735402419779, + "grad_norm": 1.6706979274749756, + "learning_rate": 7.42569384943187e-06, + "loss": 1.822, + "step": 8367 + }, + { + "epoch": 0.880378748027354, + "grad_norm": 1.9475339651107788, + "learning_rate": 7.412812761338739e-06, + "loss": 1.8185, + "step": 8368 + }, + { + "epoch": 0.8804839558127301, + "grad_norm": 1.8608582019805908, + "learning_rate": 7.3999424250907775e-06, + "loss": 2.2257, + "step": 8369 + }, + { + "epoch": 0.8805891635981062, + "grad_norm": 1.6136201620101929, + "learning_rate": 7.387082842182591e-06, + "loss": 1.6696, + "step": 8370 + }, + { + "epoch": 0.8806943713834824, + "grad_norm": 1.2842328548431396, + "learning_rate": 7.374234014107484e-06, + "loss": 1.8681, + "step": 8371 + }, + { + "epoch": 0.8807995791688585, + "grad_norm": 2.344787120819092, + "learning_rate": 7.361395942357596e-06, + "loss": 1.4332, + "step": 8372 + }, + { + "epoch": 0.8809047869542346, + "grad_norm": 3.1230101585388184, + "learning_rate": 7.348568628423746e-06, + "loss": 1.5007, + "step": 8373 + }, + { + "epoch": 0.8810099947396107, + "grad_norm": 1.2945367097854614, + "learning_rate": 7.335752073795499e-06, + "loss": 1.7119, + "step": 8374 + }, + { + "epoch": 0.8811152025249869, + "grad_norm": 1.189368486404419, + "learning_rate": 7.322946279961252e-06, + "loss": 1.8333, + "step": 8375 + }, + { + "epoch": 0.8812204103103629, + "grad_norm": 1.435043454170227, + "learning_rate": 7.31015124840807e-06, + "loss": 1.6932, + "step": 8376 + }, + { + "epoch": 0.8813256180957391, + "grad_norm": 2.0978446006774902, + "learning_rate": 7.297366980621789e-06, + "loss": 1.2705, + "step": 8377 + }, + { + "epoch": 0.8814308258811152, + "grad_norm": 1.0726866722106934, + "learning_rate": 7.284593478087043e-06, + "loss": 1.846, + "step": 8378 + }, + { + "epoch": 0.8815360336664914, + "grad_norm": 1.2944754362106323, + "learning_rate": 7.2718307422871445e-06, + "loss": 1.2121, + "step": 8379 + }, + { + "epoch": 0.8816412414518674, + "grad_norm": 1.729756474494934, + "learning_rate": 7.259078774704198e-06, + "loss": 2.0314, + "step": 8380 + }, + { + "epoch": 0.8817464492372435, + "grad_norm": 1.872869610786438, + "learning_rate": 7.24633757681904e-06, + "loss": 1.9162, + "step": 8381 + }, + { + "epoch": 0.8818516570226197, + "grad_norm": 1.577900767326355, + "learning_rate": 7.233607150111255e-06, + "loss": 1.697, + "step": 8382 + }, + { + "epoch": 0.8819568648079958, + "grad_norm": 1.8250850439071655, + "learning_rate": 7.2208874960592145e-06, + "loss": 1.1092, + "step": 8383 + }, + { + "epoch": 0.8820620725933719, + "grad_norm": 1.5504481792449951, + "learning_rate": 7.208178616139994e-06, + "loss": 1.265, + "step": 8384 + }, + { + "epoch": 0.882167280378748, + "grad_norm": 1.681287407875061, + "learning_rate": 7.195480511829411e-06, + "loss": 1.366, + "step": 8385 + }, + { + "epoch": 0.8822724881641242, + "grad_norm": 1.5954519510269165, + "learning_rate": 7.18279318460211e-06, + "loss": 1.4569, + "step": 8386 + }, + { + "epoch": 0.8823776959495002, + "grad_norm": 1.6172798871994019, + "learning_rate": 7.1701166359313894e-06, + "loss": 1.6745, + "step": 8387 + }, + { + "epoch": 0.8824829037348764, + "grad_norm": 1.5901143550872803, + "learning_rate": 7.157450867289317e-06, + "loss": 1.6646, + "step": 8388 + }, + { + "epoch": 0.8825881115202525, + "grad_norm": 1.5728827714920044, + "learning_rate": 7.1447958801467816e-06, + "loss": 1.5367, + "step": 8389 + }, + { + "epoch": 0.8826933193056287, + "grad_norm": 1.7115237712860107, + "learning_rate": 7.132151675973331e-06, + "loss": 1.3662, + "step": 8390 + }, + { + "epoch": 0.8827985270910047, + "grad_norm": 1.2988559007644653, + "learning_rate": 7.119518256237279e-06, + "loss": 1.5863, + "step": 8391 + }, + { + "epoch": 0.8829037348763809, + "grad_norm": 1.656714916229248, + "learning_rate": 7.106895622405752e-06, + "loss": 1.6779, + "step": 8392 + }, + { + "epoch": 0.883008942661757, + "grad_norm": 1.3429776430130005, + "learning_rate": 7.0942837759445325e-06, + "loss": 1.6569, + "step": 8393 + }, + { + "epoch": 0.883114150447133, + "grad_norm": 1.6043792963027954, + "learning_rate": 7.081682718318194e-06, + "loss": 1.4003, + "step": 8394 + }, + { + "epoch": 0.8832193582325092, + "grad_norm": 1.8559693098068237, + "learning_rate": 7.069092450990089e-06, + "loss": 1.6701, + "step": 8395 + }, + { + "epoch": 0.8833245660178853, + "grad_norm": 1.5783679485321045, + "learning_rate": 7.056512975422269e-06, + "loss": 1.5224, + "step": 8396 + }, + { + "epoch": 0.8834297738032615, + "grad_norm": 1.5466448068618774, + "learning_rate": 7.0439442930755105e-06, + "loss": 1.5427, + "step": 8397 + }, + { + "epoch": 0.8835349815886375, + "grad_norm": 1.4492648839950562, + "learning_rate": 7.031386405409434e-06, + "loss": 1.3369, + "step": 8398 + }, + { + "epoch": 0.8836401893740137, + "grad_norm": 1.4293040037155151, + "learning_rate": 7.018839313882286e-06, + "loss": 1.3867, + "step": 8399 + }, + { + "epoch": 0.8837453971593898, + "grad_norm": 2.3548078536987305, + "learning_rate": 7.006303019951177e-06, + "loss": 1.9804, + "step": 8400 + }, + { + "epoch": 0.8838506049447659, + "grad_norm": 1.3848503828048706, + "learning_rate": 6.993777525071887e-06, + "loss": 1.1785, + "step": 8401 + }, + { + "epoch": 0.883955812730142, + "grad_norm": 1.6691620349884033, + "learning_rate": 6.98126283069892e-06, + "loss": 1.5292, + "step": 8402 + }, + { + "epoch": 0.8840610205155182, + "grad_norm": 1.5758638381958008, + "learning_rate": 6.968758938285614e-06, + "loss": 1.3405, + "step": 8403 + }, + { + "epoch": 0.8841662283008943, + "grad_norm": 1.6990686655044556, + "learning_rate": 6.956265849283994e-06, + "loss": 1.3229, + "step": 8404 + }, + { + "epoch": 0.8842714360862703, + "grad_norm": 2.8560450077056885, + "learning_rate": 6.943783565144812e-06, + "loss": 1.6618, + "step": 8405 + }, + { + "epoch": 0.8843766438716465, + "grad_norm": 1.6072392463684082, + "learning_rate": 6.931312087317632e-06, + "loss": 1.7917, + "step": 8406 + }, + { + "epoch": 0.8844818516570226, + "grad_norm": 1.5466362237930298, + "learning_rate": 6.918851417250693e-06, + "loss": 1.321, + "step": 8407 + }, + { + "epoch": 0.8845870594423987, + "grad_norm": 1.8458850383758545, + "learning_rate": 6.906401556391051e-06, + "loss": 1.6448, + "step": 8408 + }, + { + "epoch": 0.8846922672277748, + "grad_norm": 2.208996295928955, + "learning_rate": 6.893962506184448e-06, + "loss": 1.8648, + "step": 8409 + }, + { + "epoch": 0.884797475013151, + "grad_norm": 1.3810334205627441, + "learning_rate": 6.8815342680753735e-06, + "loss": 1.7885, + "step": 8410 + }, + { + "epoch": 0.8849026827985271, + "grad_norm": 1.5404844284057617, + "learning_rate": 6.869116843507106e-06, + "loss": 1.5237, + "step": 8411 + }, + { + "epoch": 0.8850078905839032, + "grad_norm": 1.3252919912338257, + "learning_rate": 6.856710233921626e-06, + "loss": 1.9188, + "step": 8412 + }, + { + "epoch": 0.8851130983692793, + "grad_norm": 1.3163280487060547, + "learning_rate": 6.844314440759647e-06, + "loss": 1.8506, + "step": 8413 + }, + { + "epoch": 0.8852183061546555, + "grad_norm": 1.7833784818649292, + "learning_rate": 6.8319294654607065e-06, + "loss": 1.2899, + "step": 8414 + }, + { + "epoch": 0.8853235139400316, + "grad_norm": 2.266003370285034, + "learning_rate": 6.8195553094629995e-06, + "loss": 1.1743, + "step": 8415 + }, + { + "epoch": 0.8854287217254077, + "grad_norm": 2.029226303100586, + "learning_rate": 6.807191974203486e-06, + "loss": 1.04, + "step": 8416 + }, + { + "epoch": 0.8855339295107838, + "grad_norm": 1.7757220268249512, + "learning_rate": 6.7948394611178964e-06, + "loss": 1.2583, + "step": 8417 + }, + { + "epoch": 0.88563913729616, + "grad_norm": 2.46942138671875, + "learning_rate": 6.782497771640694e-06, + "loss": 1.5153, + "step": 8418 + }, + { + "epoch": 0.885744345081536, + "grad_norm": 2.080439805984497, + "learning_rate": 6.770166907205044e-06, + "loss": 0.9513, + "step": 8419 + }, + { + "epoch": 0.8858495528669121, + "grad_norm": 1.3978705406188965, + "learning_rate": 6.7578468692429345e-06, + "loss": 1.2028, + "step": 8420 + }, + { + "epoch": 0.8859547606522883, + "grad_norm": 1.2934406995773315, + "learning_rate": 6.7455376591850195e-06, + "loss": 1.6844, + "step": 8421 + }, + { + "epoch": 0.8860599684376644, + "grad_norm": 1.9341928958892822, + "learning_rate": 6.733239278460735e-06, + "loss": 1.5859, + "step": 8422 + }, + { + "epoch": 0.8861651762230405, + "grad_norm": 1.2040367126464844, + "learning_rate": 6.7209517284982704e-06, + "loss": 1.3829, + "step": 8423 + }, + { + "epoch": 0.8862703840084166, + "grad_norm": 1.6010814905166626, + "learning_rate": 6.7086750107244965e-06, + "loss": 1.3713, + "step": 8424 + }, + { + "epoch": 0.8863755917937928, + "grad_norm": 2.1789660453796387, + "learning_rate": 6.696409126565107e-06, + "loss": 1.9039, + "step": 8425 + }, + { + "epoch": 0.8864807995791688, + "grad_norm": 1.1743876934051514, + "learning_rate": 6.684154077444482e-06, + "loss": 1.6147, + "step": 8426 + }, + { + "epoch": 0.886586007364545, + "grad_norm": 1.855284571647644, + "learning_rate": 6.6719098647857525e-06, + "loss": 1.5138, + "step": 8427 + }, + { + "epoch": 0.8866912151499211, + "grad_norm": 1.862816333770752, + "learning_rate": 6.659676490010824e-06, + "loss": 1.5119, + "step": 8428 + }, + { + "epoch": 0.8867964229352973, + "grad_norm": 1.4365988969802856, + "learning_rate": 6.647453954540295e-06, + "loss": 1.6655, + "step": 8429 + }, + { + "epoch": 0.8869016307206733, + "grad_norm": 1.137057900428772, + "learning_rate": 6.635242259793528e-06, + "loss": 1.6778, + "step": 8430 + }, + { + "epoch": 0.8870068385060494, + "grad_norm": 1.6340605020523071, + "learning_rate": 6.623041407188646e-06, + "loss": 1.7023, + "step": 8431 + }, + { + "epoch": 0.8871120462914256, + "grad_norm": 1.682706356048584, + "learning_rate": 6.610851398142482e-06, + "loss": 1.6385, + "step": 8432 + }, + { + "epoch": 0.8872172540768016, + "grad_norm": 2.859862804412842, + "learning_rate": 6.598672234070602e-06, + "loss": 1.4929, + "step": 8433 + }, + { + "epoch": 0.8873224618621778, + "grad_norm": 1.687212347984314, + "learning_rate": 6.586503916387366e-06, + "loss": 1.3471, + "step": 8434 + }, + { + "epoch": 0.8874276696475539, + "grad_norm": 1.9600337743759155, + "learning_rate": 6.574346446505841e-06, + "loss": 1.8195, + "step": 8435 + }, + { + "epoch": 0.8875328774329301, + "grad_norm": 1.6050618886947632, + "learning_rate": 6.562199825837789e-06, + "loss": 1.4023, + "step": 8436 + }, + { + "epoch": 0.8876380852183061, + "grad_norm": 1.2669776678085327, + "learning_rate": 6.550064055793815e-06, + "loss": 1.6314, + "step": 8437 + }, + { + "epoch": 0.8877432930036823, + "grad_norm": 1.4774951934814453, + "learning_rate": 6.537939137783166e-06, + "loss": 1.5599, + "step": 8438 + }, + { + "epoch": 0.8878485007890584, + "grad_norm": 2.499821424484253, + "learning_rate": 6.525825073213876e-06, + "loss": 1.9315, + "step": 8439 + }, + { + "epoch": 0.8879537085744345, + "grad_norm": 1.3363654613494873, + "learning_rate": 6.513721863492739e-06, + "loss": 1.8184, + "step": 8440 + }, + { + "epoch": 0.8880589163598106, + "grad_norm": 1.872019648551941, + "learning_rate": 6.501629510025231e-06, + "loss": 1.8917, + "step": 8441 + }, + { + "epoch": 0.8881641241451868, + "grad_norm": 1.3895902633666992, + "learning_rate": 6.489548014215585e-06, + "loss": 1.296, + "step": 8442 + }, + { + "epoch": 0.8882693319305629, + "grad_norm": 1.1768591403961182, + "learning_rate": 6.4774773774668225e-06, + "loss": 1.3739, + "step": 8443 + }, + { + "epoch": 0.8883745397159389, + "grad_norm": 1.620846152305603, + "learning_rate": 6.465417601180657e-06, + "loss": 1.3514, + "step": 8444 + }, + { + "epoch": 0.8884797475013151, + "grad_norm": 1.3851511478424072, + "learning_rate": 6.453368686757533e-06, + "loss": 1.2824, + "step": 8445 + }, + { + "epoch": 0.8885849552866912, + "grad_norm": 1.9092847108840942, + "learning_rate": 6.441330635596665e-06, + "loss": 1.6623, + "step": 8446 + }, + { + "epoch": 0.8886901630720674, + "grad_norm": 1.3836404085159302, + "learning_rate": 6.42930344909598e-06, + "loss": 1.932, + "step": 8447 + }, + { + "epoch": 0.8887953708574434, + "grad_norm": 1.2607841491699219, + "learning_rate": 6.417287128652172e-06, + "loss": 1.5857, + "step": 8448 + }, + { + "epoch": 0.8889005786428196, + "grad_norm": 1.6962370872497559, + "learning_rate": 6.405281675660657e-06, + "loss": 1.7428, + "step": 8449 + }, + { + "epoch": 0.8890057864281957, + "grad_norm": 1.8756303787231445, + "learning_rate": 6.393287091515565e-06, + "loss": 1.3188, + "step": 8450 + }, + { + "epoch": 0.8891109942135718, + "grad_norm": 1.370875358581543, + "learning_rate": 6.3813033776098045e-06, + "loss": 1.9644, + "step": 8451 + }, + { + "epoch": 0.8892162019989479, + "grad_norm": 1.3783890008926392, + "learning_rate": 6.369330535335016e-06, + "loss": 1.3712, + "step": 8452 + }, + { + "epoch": 0.8893214097843241, + "grad_norm": 1.7575373649597168, + "learning_rate": 6.357368566081534e-06, + "loss": 1.7293, + "step": 8453 + }, + { + "epoch": 0.8894266175697002, + "grad_norm": 1.3849029541015625, + "learning_rate": 6.345417471238501e-06, + "loss": 1.6498, + "step": 8454 + }, + { + "epoch": 0.8895318253550762, + "grad_norm": 2.384744167327881, + "learning_rate": 6.333477252193731e-06, + "loss": 2.3091, + "step": 8455 + }, + { + "epoch": 0.8896370331404524, + "grad_norm": 1.8546522855758667, + "learning_rate": 6.321547910333814e-06, + "loss": 1.5911, + "step": 8456 + }, + { + "epoch": 0.8897422409258285, + "grad_norm": 1.3433914184570312, + "learning_rate": 6.309629447044074e-06, + "loss": 2.1943, + "step": 8457 + }, + { + "epoch": 0.8898474487112046, + "grad_norm": 1.997424840927124, + "learning_rate": 6.297721863708528e-06, + "loss": 1.8741, + "step": 8458 + }, + { + "epoch": 0.8899526564965807, + "grad_norm": 1.5434318780899048, + "learning_rate": 6.285825161710002e-06, + "loss": 0.8767, + "step": 8459 + }, + { + "epoch": 0.8900578642819569, + "grad_norm": 1.7368338108062744, + "learning_rate": 6.273939342430013e-06, + "loss": 1.6961, + "step": 8460 + }, + { + "epoch": 0.890163072067333, + "grad_norm": 1.936927318572998, + "learning_rate": 6.262064407248791e-06, + "loss": 0.8594, + "step": 8461 + }, + { + "epoch": 0.8902682798527091, + "grad_norm": 1.7954018115997314, + "learning_rate": 6.250200357545377e-06, + "loss": 1.6956, + "step": 8462 + }, + { + "epoch": 0.8903734876380852, + "grad_norm": 1.5254756212234497, + "learning_rate": 6.238347194697492e-06, + "loss": 1.6629, + "step": 8463 + }, + { + "epoch": 0.8904786954234614, + "grad_norm": 1.8098357915878296, + "learning_rate": 6.226504920081566e-06, + "loss": 1.17, + "step": 8464 + }, + { + "epoch": 0.8905839032088374, + "grad_norm": 1.3831403255462646, + "learning_rate": 6.214673535072868e-06, + "loss": 1.6587, + "step": 8465 + }, + { + "epoch": 0.8906891109942136, + "grad_norm": 1.8217341899871826, + "learning_rate": 6.202853041045298e-06, + "loss": 1.7295, + "step": 8466 + }, + { + "epoch": 0.8907943187795897, + "grad_norm": 0.8784952759742737, + "learning_rate": 6.191043439371535e-06, + "loss": 2.3105, + "step": 8467 + }, + { + "epoch": 0.8908995265649658, + "grad_norm": 1.1799579858779907, + "learning_rate": 6.179244731422984e-06, + "loss": 1.1108, + "step": 8468 + }, + { + "epoch": 0.8910047343503419, + "grad_norm": 1.5560115575790405, + "learning_rate": 6.167456918569792e-06, + "loss": 1.9163, + "step": 8469 + }, + { + "epoch": 0.891109942135718, + "grad_norm": 1.6790612936019897, + "learning_rate": 6.155680002180864e-06, + "loss": 1.5781, + "step": 8470 + }, + { + "epoch": 0.8912151499210942, + "grad_norm": 2.0735700130462646, + "learning_rate": 6.143913983623795e-06, + "loss": 1.601, + "step": 8471 + }, + { + "epoch": 0.8913203577064703, + "grad_norm": 1.3503835201263428, + "learning_rate": 6.132158864264914e-06, + "loss": 1.8698, + "step": 8472 + }, + { + "epoch": 0.8914255654918464, + "grad_norm": 1.1508896350860596, + "learning_rate": 6.120414645469341e-06, + "loss": 1.8827, + "step": 8473 + }, + { + "epoch": 0.8915307732772225, + "grad_norm": 2.075125217437744, + "learning_rate": 6.108681328600874e-06, + "loss": 1.8445, + "step": 8474 + }, + { + "epoch": 0.8916359810625987, + "grad_norm": 1.5181657075881958, + "learning_rate": 6.0969589150220554e-06, + "loss": 1.8365, + "step": 8475 + }, + { + "epoch": 0.8917411888479747, + "grad_norm": 1.7302663326263428, + "learning_rate": 6.085247406094197e-06, + "loss": 1.0664, + "step": 8476 + }, + { + "epoch": 0.8918463966333509, + "grad_norm": 1.5871552228927612, + "learning_rate": 6.073546803177299e-06, + "loss": 1.7855, + "step": 8477 + }, + { + "epoch": 0.891951604418727, + "grad_norm": 1.1431766748428345, + "learning_rate": 6.0618571076301085e-06, + "loss": 2.0511, + "step": 8478 + }, + { + "epoch": 0.8920568122041032, + "grad_norm": 1.5150169134140015, + "learning_rate": 6.050178320810141e-06, + "loss": 1.4898, + "step": 8479 + }, + { + "epoch": 0.8921620199894792, + "grad_norm": 2.173424005508423, + "learning_rate": 6.038510444073586e-06, + "loss": 1.1078, + "step": 8480 + }, + { + "epoch": 0.8922672277748553, + "grad_norm": 2.229217529296875, + "learning_rate": 6.026853478775396e-06, + "loss": 1.9185, + "step": 8481 + }, + { + "epoch": 0.8923724355602315, + "grad_norm": 2.0256011486053467, + "learning_rate": 6.015207426269276e-06, + "loss": 2.1693, + "step": 8482 + }, + { + "epoch": 0.8924776433456075, + "grad_norm": 1.8043358325958252, + "learning_rate": 6.003572287907633e-06, + "loss": 1.7443, + "step": 8483 + }, + { + "epoch": 0.8925828511309837, + "grad_norm": 1.3562211990356445, + "learning_rate": 5.991948065041608e-06, + "loss": 1.4683, + "step": 8484 + }, + { + "epoch": 0.8926880589163598, + "grad_norm": 1.540953278541565, + "learning_rate": 5.9803347590211e-06, + "loss": 1.6193, + "step": 8485 + }, + { + "epoch": 0.892793266701736, + "grad_norm": 1.7050999402999878, + "learning_rate": 5.968732371194729e-06, + "loss": 1.4715, + "step": 8486 + }, + { + "epoch": 0.892898474487112, + "grad_norm": 2.0793585777282715, + "learning_rate": 5.957140902909819e-06, + "loss": 1.22, + "step": 8487 + }, + { + "epoch": 0.8930036822724882, + "grad_norm": 2.1180810928344727, + "learning_rate": 5.945560355512458e-06, + "loss": 1.2444, + "step": 8488 + }, + { + "epoch": 0.8931088900578643, + "grad_norm": 1.1381313800811768, + "learning_rate": 5.93399073034745e-06, + "loss": 1.5193, + "step": 8489 + }, + { + "epoch": 0.8932140978432404, + "grad_norm": 1.6418750286102295, + "learning_rate": 5.922432028758362e-06, + "loss": 1.7449, + "step": 8490 + }, + { + "epoch": 0.8933193056286165, + "grad_norm": 1.5433701276779175, + "learning_rate": 5.910884252087457e-06, + "loss": 1.4825, + "step": 8491 + }, + { + "epoch": 0.8934245134139926, + "grad_norm": 1.8106061220169067, + "learning_rate": 5.8993474016757145e-06, + "loss": 1.542, + "step": 8492 + }, + { + "epoch": 0.8935297211993688, + "grad_norm": 1.619828701019287, + "learning_rate": 5.88782147886291e-06, + "loss": 1.8641, + "step": 8493 + }, + { + "epoch": 0.8936349289847448, + "grad_norm": 1.3758325576782227, + "learning_rate": 5.876306484987481e-06, + "loss": 1.3756, + "step": 8494 + }, + { + "epoch": 0.893740136770121, + "grad_norm": 1.4138890504837036, + "learning_rate": 5.8648024213866396e-06, + "loss": 1.3945, + "step": 8495 + }, + { + "epoch": 0.8938453445554971, + "grad_norm": 2.485138416290283, + "learning_rate": 5.853309289396314e-06, + "loss": 1.6306, + "step": 8496 + }, + { + "epoch": 0.8939505523408732, + "grad_norm": 1.9388067722320557, + "learning_rate": 5.841827090351171e-06, + "loss": 1.4469, + "step": 8497 + }, + { + "epoch": 0.8940557601262493, + "grad_norm": 1.8217087984085083, + "learning_rate": 5.830355825584577e-06, + "loss": 1.3221, + "step": 8498 + }, + { + "epoch": 0.8941609679116255, + "grad_norm": 1.5511425733566284, + "learning_rate": 5.818895496428689e-06, + "loss": 1.3081, + "step": 8499 + }, + { + "epoch": 0.8942661756970016, + "grad_norm": 2.5423057079315186, + "learning_rate": 5.8074461042143095e-06, + "loss": 1.1463, + "step": 8500 + }, + { + "epoch": 0.8943713834823777, + "grad_norm": 1.830324411392212, + "learning_rate": 5.796007650271063e-06, + "loss": 1.4945, + "step": 8501 + }, + { + "epoch": 0.8944765912677538, + "grad_norm": 2.4133145809173584, + "learning_rate": 5.784580135927242e-06, + "loss": 1.5212, + "step": 8502 + }, + { + "epoch": 0.89458179905313, + "grad_norm": 1.9608807563781738, + "learning_rate": 5.7731635625098755e-06, + "loss": 1.3602, + "step": 8503 + }, + { + "epoch": 0.8946870068385061, + "grad_norm": 2.1056807041168213, + "learning_rate": 5.761757931344758e-06, + "loss": 1.5038, + "step": 8504 + }, + { + "epoch": 0.8947922146238821, + "grad_norm": 2.061222553253174, + "learning_rate": 5.750363243756363e-06, + "loss": 1.9814, + "step": 8505 + }, + { + "epoch": 0.8948974224092583, + "grad_norm": 2.296455144882202, + "learning_rate": 5.738979501067921e-06, + "loss": 2.0531, + "step": 8506 + }, + { + "epoch": 0.8950026301946344, + "grad_norm": 2.311011552810669, + "learning_rate": 5.727606704601407e-06, + "loss": 1.719, + "step": 8507 + }, + { + "epoch": 0.8951078379800105, + "grad_norm": 1.3462337255477905, + "learning_rate": 5.7162448556774995e-06, + "loss": 1.7112, + "step": 8508 + }, + { + "epoch": 0.8952130457653866, + "grad_norm": 1.3177099227905273, + "learning_rate": 5.704893955615598e-06, + "loss": 1.8481, + "step": 8509 + }, + { + "epoch": 0.8953182535507628, + "grad_norm": 1.1072567701339722, + "learning_rate": 5.693554005733859e-06, + "loss": 1.6795, + "step": 8510 + }, + { + "epoch": 0.8954234613361389, + "grad_norm": 1.5659089088439941, + "learning_rate": 5.68222500734914e-06, + "loss": 1.4823, + "step": 8511 + }, + { + "epoch": 0.895528669121515, + "grad_norm": 1.4302914142608643, + "learning_rate": 5.6709069617770675e-06, + "loss": 1.2352, + "step": 8512 + }, + { + "epoch": 0.8956338769068911, + "grad_norm": 1.4420124292373657, + "learning_rate": 5.659599870331944e-06, + "loss": 1.5293, + "step": 8513 + }, + { + "epoch": 0.8957390846922673, + "grad_norm": 2.497387170791626, + "learning_rate": 5.64830373432681e-06, + "loss": 1.2493, + "step": 8514 + }, + { + "epoch": 0.8958442924776433, + "grad_norm": 1.808634877204895, + "learning_rate": 5.637018555073492e-06, + "loss": 1.8987, + "step": 8515 + }, + { + "epoch": 0.8959495002630194, + "grad_norm": 2.0679233074188232, + "learning_rate": 5.625744333882488e-06, + "loss": 1.3279, + "step": 8516 + }, + { + "epoch": 0.8960547080483956, + "grad_norm": 1.3654800653457642, + "learning_rate": 5.614481072063005e-06, + "loss": 1.6085, + "step": 8517 + }, + { + "epoch": 0.8961599158337717, + "grad_norm": 1.6575655937194824, + "learning_rate": 5.603228770923041e-06, + "loss": 1.1494, + "step": 8518 + }, + { + "epoch": 0.8962651236191478, + "grad_norm": 1.3457351922988892, + "learning_rate": 5.591987431769285e-06, + "loss": 1.7025, + "step": 8519 + }, + { + "epoch": 0.8963703314045239, + "grad_norm": 1.5918161869049072, + "learning_rate": 5.580757055907137e-06, + "loss": 1.3449, + "step": 8520 + }, + { + "epoch": 0.8964755391899001, + "grad_norm": 1.6226881742477417, + "learning_rate": 5.5695376446407656e-06, + "loss": 1.6743, + "step": 8521 + }, + { + "epoch": 0.8965807469752761, + "grad_norm": 1.7749323844909668, + "learning_rate": 5.558329199273038e-06, + "loss": 1.8004, + "step": 8522 + }, + { + "epoch": 0.8966859547606523, + "grad_norm": 1.319456696510315, + "learning_rate": 5.547131721105536e-06, + "loss": 1.5449, + "step": 8523 + }, + { + "epoch": 0.8967911625460284, + "grad_norm": 1.598297119140625, + "learning_rate": 5.53594521143862e-06, + "loss": 1.7227, + "step": 8524 + }, + { + "epoch": 0.8968963703314046, + "grad_norm": 2.359658718109131, + "learning_rate": 5.524769671571317e-06, + "loss": 1.5778, + "step": 8525 + }, + { + "epoch": 0.8970015781167806, + "grad_norm": 1.2139389514923096, + "learning_rate": 5.5136051028014e-06, + "loss": 2.0547, + "step": 8526 + }, + { + "epoch": 0.8971067859021568, + "grad_norm": 1.9385275840759277, + "learning_rate": 5.50245150642541e-06, + "loss": 1.374, + "step": 8527 + }, + { + "epoch": 0.8972119936875329, + "grad_norm": 1.8721158504486084, + "learning_rate": 5.491308883738544e-06, + "loss": 1.3719, + "step": 8528 + }, + { + "epoch": 0.8973172014729089, + "grad_norm": 1.8050134181976318, + "learning_rate": 5.480177236034756e-06, + "loss": 1.4891, + "step": 8529 + }, + { + "epoch": 0.8974224092582851, + "grad_norm": 1.385988712310791, + "learning_rate": 5.469056564606767e-06, + "loss": 1.7196, + "step": 8530 + }, + { + "epoch": 0.8975276170436612, + "grad_norm": 1.7569900751113892, + "learning_rate": 5.4579468707459225e-06, + "loss": 1.7562, + "step": 8531 + }, + { + "epoch": 0.8976328248290374, + "grad_norm": 1.610825538635254, + "learning_rate": 5.446848155742401e-06, + "loss": 1.6719, + "step": 8532 + }, + { + "epoch": 0.8977380326144134, + "grad_norm": 2.0842642784118652, + "learning_rate": 5.435760420885061e-06, + "loss": 1.8032, + "step": 8533 + }, + { + "epoch": 0.8978432403997896, + "grad_norm": 1.5236550569534302, + "learning_rate": 5.42468366746145e-06, + "loss": 1.4211, + "step": 8534 + }, + { + "epoch": 0.8979484481851657, + "grad_norm": 1.6460191011428833, + "learning_rate": 5.4136178967579054e-06, + "loss": 1.5113, + "step": 8535 + }, + { + "epoch": 0.8980536559705419, + "grad_norm": 2.305039405822754, + "learning_rate": 5.402563110059456e-06, + "loss": 1.5592, + "step": 8536 + }, + { + "epoch": 0.8981588637559179, + "grad_norm": 1.2543623447418213, + "learning_rate": 5.3915193086498286e-06, + "loss": 1.4207, + "step": 8537 + }, + { + "epoch": 0.8982640715412941, + "grad_norm": 1.8181365728378296, + "learning_rate": 5.380486493811543e-06, + "loss": 1.5938, + "step": 8538 + }, + { + "epoch": 0.8983692793266702, + "grad_norm": 1.4921931028366089, + "learning_rate": 5.3694646668257855e-06, + "loss": 1.7251, + "step": 8539 + }, + { + "epoch": 0.8984744871120462, + "grad_norm": 1.6796791553497314, + "learning_rate": 5.358453828972465e-06, + "loss": 1.7007, + "step": 8540 + }, + { + "epoch": 0.8985796948974224, + "grad_norm": 1.4309808015823364, + "learning_rate": 5.3474539815302815e-06, + "loss": 1.6625, + "step": 8541 + }, + { + "epoch": 0.8986849026827985, + "grad_norm": 1.6457716226577759, + "learning_rate": 5.336465125776579e-06, + "loss": 1.4362, + "step": 8542 + }, + { + "epoch": 0.8987901104681747, + "grad_norm": 1.1779471635818481, + "learning_rate": 5.325487262987439e-06, + "loss": 1.5655, + "step": 8543 + }, + { + "epoch": 0.8988953182535507, + "grad_norm": 1.411882996559143, + "learning_rate": 5.314520394437728e-06, + "loss": 1.5253, + "step": 8544 + }, + { + "epoch": 0.8990005260389269, + "grad_norm": 1.2842921018600464, + "learning_rate": 5.303564521400961e-06, + "loss": 1.5975, + "step": 8545 + }, + { + "epoch": 0.899105733824303, + "grad_norm": 2.102381467819214, + "learning_rate": 5.292619645149433e-06, + "loss": 2.1535, + "step": 8546 + }, + { + "epoch": 0.8992109416096791, + "grad_norm": 2.0416359901428223, + "learning_rate": 5.281685766954114e-06, + "loss": 1.4009, + "step": 8547 + }, + { + "epoch": 0.8993161493950552, + "grad_norm": 1.6468279361724854, + "learning_rate": 5.270762888084712e-06, + "loss": 1.8449, + "step": 8548 + }, + { + "epoch": 0.8994213571804314, + "grad_norm": 1.4324162006378174, + "learning_rate": 5.259851009809702e-06, + "loss": 1.5525, + "step": 8549 + }, + { + "epoch": 0.8995265649658075, + "grad_norm": 1.8984599113464355, + "learning_rate": 5.2489501333962135e-06, + "loss": 2.262, + "step": 8550 + }, + { + "epoch": 0.8996317727511836, + "grad_norm": 1.3665709495544434, + "learning_rate": 5.238060260110145e-06, + "loss": 1.2597, + "step": 8551 + }, + { + "epoch": 0.8997369805365597, + "grad_norm": 1.834430456161499, + "learning_rate": 5.227181391216096e-06, + "loss": 1.4875, + "step": 8552 + }, + { + "epoch": 0.8998421883219359, + "grad_norm": 1.4777787923812866, + "learning_rate": 5.2163135279773904e-06, + "loss": 1.5871, + "step": 8553 + }, + { + "epoch": 0.8999473961073119, + "grad_norm": 1.73262619972229, + "learning_rate": 5.205456671656061e-06, + "loss": 1.4334, + "step": 8554 + }, + { + "epoch": 0.900052603892688, + "grad_norm": 2.0025711059570312, + "learning_rate": 5.194610823512913e-06, + "loss": 1.5515, + "step": 8555 + }, + { + "epoch": 0.9001578116780642, + "grad_norm": 2.1046454906463623, + "learning_rate": 5.183775984807415e-06, + "loss": 1.7606, + "step": 8556 + }, + { + "epoch": 0.9002630194634403, + "grad_norm": 2.093031167984009, + "learning_rate": 5.172952156797795e-06, + "loss": 1.0529, + "step": 8557 + }, + { + "epoch": 0.9003682272488164, + "grad_norm": 3.1299211978912354, + "learning_rate": 5.1621393407409904e-06, + "loss": 1.3046, + "step": 8558 + }, + { + "epoch": 0.9004734350341925, + "grad_norm": 1.444140076637268, + "learning_rate": 5.151337537892631e-06, + "loss": 1.6833, + "step": 8559 + }, + { + "epoch": 0.9005786428195687, + "grad_norm": 1.5545951128005981, + "learning_rate": 5.140546749507136e-06, + "loss": 1.5637, + "step": 8560 + }, + { + "epoch": 0.9006838506049447, + "grad_norm": 1.6997040510177612, + "learning_rate": 5.129766976837569e-06, + "loss": 1.3593, + "step": 8561 + }, + { + "epoch": 0.9007890583903209, + "grad_norm": 2.1797568798065186, + "learning_rate": 5.118998221135762e-06, + "loss": 1.5224, + "step": 8562 + }, + { + "epoch": 0.900894266175697, + "grad_norm": 1.9926127195358276, + "learning_rate": 5.10824048365226e-06, + "loss": 1.8785, + "step": 8563 + }, + { + "epoch": 0.9009994739610732, + "grad_norm": 1.50703763961792, + "learning_rate": 5.097493765636318e-06, + "loss": 1.1829, + "step": 8564 + }, + { + "epoch": 0.9011046817464492, + "grad_norm": 2.1419105529785156, + "learning_rate": 5.086758068335917e-06, + "loss": 1.2841, + "step": 8565 + }, + { + "epoch": 0.9012098895318253, + "grad_norm": 2.1330740451812744, + "learning_rate": 5.076033392997758e-06, + "loss": 1.5291, + "step": 8566 + }, + { + "epoch": 0.9013150973172015, + "grad_norm": 1.655969262123108, + "learning_rate": 5.06531974086728e-06, + "loss": 0.8968, + "step": 8567 + }, + { + "epoch": 0.9014203051025776, + "grad_norm": 1.8075084686279297, + "learning_rate": 5.054617113188586e-06, + "loss": 2.1496, + "step": 8568 + }, + { + "epoch": 0.9015255128879537, + "grad_norm": 1.51304292678833, + "learning_rate": 5.043925511204573e-06, + "loss": 1.8161, + "step": 8569 + }, + { + "epoch": 0.9016307206733298, + "grad_norm": 1.4924466609954834, + "learning_rate": 5.0332449361568e-06, + "loss": 1.731, + "step": 8570 + }, + { + "epoch": 0.901735928458706, + "grad_norm": 1.9998409748077393, + "learning_rate": 5.0225753892855776e-06, + "loss": 1.6692, + "step": 8571 + }, + { + "epoch": 0.901841136244082, + "grad_norm": 1.3246253728866577, + "learning_rate": 5.011916871829925e-06, + "loss": 1.6233, + "step": 8572 + }, + { + "epoch": 0.9019463440294582, + "grad_norm": 1.8206862211227417, + "learning_rate": 5.0012693850275736e-06, + "loss": 1.7423, + "step": 8573 + }, + { + "epoch": 0.9020515518148343, + "grad_norm": 2.139857292175293, + "learning_rate": 4.9906329301149914e-06, + "loss": 1.4421, + "step": 8574 + }, + { + "epoch": 0.9021567596002105, + "grad_norm": 2.2966580390930176, + "learning_rate": 4.980007508327345e-06, + "loss": 1.6362, + "step": 8575 + }, + { + "epoch": 0.9022619673855865, + "grad_norm": 1.746172308921814, + "learning_rate": 4.969393120898525e-06, + "loss": 1.2432, + "step": 8576 + }, + { + "epoch": 0.9023671751709627, + "grad_norm": 1.2494480609893799, + "learning_rate": 4.958789769061156e-06, + "loss": 1.7419, + "step": 8577 + }, + { + "epoch": 0.9024723829563388, + "grad_norm": 1.3508919477462769, + "learning_rate": 4.948197454046577e-06, + "loss": 1.7533, + "step": 8578 + }, + { + "epoch": 0.9025775907417148, + "grad_norm": 1.8284884691238403, + "learning_rate": 4.937616177084814e-06, + "loss": 1.5497, + "step": 8579 + }, + { + "epoch": 0.902682798527091, + "grad_norm": 1.5557414293289185, + "learning_rate": 4.927045939404673e-06, + "loss": 1.8222, + "step": 8580 + }, + { + "epoch": 0.9027880063124671, + "grad_norm": 1.6787934303283691, + "learning_rate": 4.916486742233606e-06, + "loss": 1.385, + "step": 8581 + }, + { + "epoch": 0.9028932140978433, + "grad_norm": 1.3682160377502441, + "learning_rate": 4.90593858679782e-06, + "loss": 1.768, + "step": 8582 + }, + { + "epoch": 0.9029984218832193, + "grad_norm": 2.39587140083313, + "learning_rate": 4.89540147432227e-06, + "loss": 1.8781, + "step": 8583 + }, + { + "epoch": 0.9031036296685955, + "grad_norm": 1.224987268447876, + "learning_rate": 4.884875406030565e-06, + "loss": 1.6377, + "step": 8584 + }, + { + "epoch": 0.9032088374539716, + "grad_norm": 1.3309333324432373, + "learning_rate": 4.874360383145072e-06, + "loss": 1.7443, + "step": 8585 + }, + { + "epoch": 0.9033140452393477, + "grad_norm": 3.1084442138671875, + "learning_rate": 4.863856406886869e-06, + "loss": 2.1872, + "step": 8586 + }, + { + "epoch": 0.9034192530247238, + "grad_norm": 1.980631709098816, + "learning_rate": 4.853363478475748e-06, + "loss": 1.273, + "step": 8587 + }, + { + "epoch": 0.9035244608101, + "grad_norm": 1.4033839702606201, + "learning_rate": 4.8428815991302005e-06, + "loss": 1.3123, + "step": 8588 + }, + { + "epoch": 0.9036296685954761, + "grad_norm": 1.6415311098098755, + "learning_rate": 4.832410770067486e-06, + "loss": 1.5425, + "step": 8589 + }, + { + "epoch": 0.9037348763808521, + "grad_norm": 1.605977177619934, + "learning_rate": 4.821950992503521e-06, + "loss": 1.6757, + "step": 8590 + }, + { + "epoch": 0.9038400841662283, + "grad_norm": 2.0766685009002686, + "learning_rate": 4.811502267652968e-06, + "loss": 1.7121, + "step": 8591 + }, + { + "epoch": 0.9039452919516044, + "grad_norm": 1.8514540195465088, + "learning_rate": 4.801064596729221e-06, + "loss": 1.3121, + "step": 8592 + }, + { + "epoch": 0.9040504997369805, + "grad_norm": 1.8307629823684692, + "learning_rate": 4.790637980944346e-06, + "loss": 1.871, + "step": 8593 + }, + { + "epoch": 0.9041557075223566, + "grad_norm": 1.437143325805664, + "learning_rate": 4.780222421509184e-06, + "loss": 1.8685, + "step": 8594 + }, + { + "epoch": 0.9042609153077328, + "grad_norm": 1.956531047821045, + "learning_rate": 4.769817919633235e-06, + "loss": 1.5374, + "step": 8595 + }, + { + "epoch": 0.9043661230931089, + "grad_norm": 1.2559046745300293, + "learning_rate": 4.759424476524732e-06, + "loss": 2.0446, + "step": 8596 + }, + { + "epoch": 0.904471330878485, + "grad_norm": 1.7142302989959717, + "learning_rate": 4.749042093390654e-06, + "loss": 1.7011, + "step": 8597 + }, + { + "epoch": 0.9045765386638611, + "grad_norm": 2.1067309379577637, + "learning_rate": 4.738670771436671e-06, + "loss": 1.5651, + "step": 8598 + }, + { + "epoch": 0.9046817464492373, + "grad_norm": 1.2750693559646606, + "learning_rate": 4.72831051186714e-06, + "loss": 1.9008, + "step": 8599 + }, + { + "epoch": 0.9047869542346134, + "grad_norm": 1.5802183151245117, + "learning_rate": 4.717961315885211e-06, + "loss": 2.3383, + "step": 8600 + }, + { + "epoch": 0.9048921620199895, + "grad_norm": 1.7453382015228271, + "learning_rate": 4.707623184692655e-06, + "loss": 1.7165, + "step": 8601 + }, + { + "epoch": 0.9049973698053656, + "grad_norm": 1.6323179006576538, + "learning_rate": 4.697296119490047e-06, + "loss": 1.3792, + "step": 8602 + }, + { + "epoch": 0.9051025775907418, + "grad_norm": 1.3392667770385742, + "learning_rate": 4.686980121476614e-06, + "loss": 1.369, + "step": 8603 + }, + { + "epoch": 0.9052077853761178, + "grad_norm": 2.2435269355773926, + "learning_rate": 4.67667519185031e-06, + "loss": 1.8554, + "step": 8604 + }, + { + "epoch": 0.9053129931614939, + "grad_norm": 1.6639388799667358, + "learning_rate": 4.666381331807834e-06, + "loss": 1.2958, + "step": 8605 + }, + { + "epoch": 0.9054182009468701, + "grad_norm": 1.4905084371566772, + "learning_rate": 4.656098542544574e-06, + "loss": 1.7928, + "step": 8606 + }, + { + "epoch": 0.9055234087322462, + "grad_norm": 1.625221848487854, + "learning_rate": 4.645826825254607e-06, + "loss": 1.9308, + "step": 8607 + }, + { + "epoch": 0.9056286165176223, + "grad_norm": 1.763044834136963, + "learning_rate": 4.6355661811308015e-06, + "loss": 1.7369, + "step": 8608 + }, + { + "epoch": 0.9057338243029984, + "grad_norm": 3.1937339305877686, + "learning_rate": 4.625316611364661e-06, + "loss": 1.1817, + "step": 8609 + }, + { + "epoch": 0.9058390320883746, + "grad_norm": 1.484457015991211, + "learning_rate": 4.615078117146421e-06, + "loss": 1.6794, + "step": 8610 + }, + { + "epoch": 0.9059442398737506, + "grad_norm": 1.7743538618087769, + "learning_rate": 4.604850699665087e-06, + "loss": 2.1803, + "step": 8611 + }, + { + "epoch": 0.9060494476591268, + "grad_norm": 2.6126396656036377, + "learning_rate": 4.594634360108319e-06, + "loss": 1.6458, + "step": 8612 + }, + { + "epoch": 0.9061546554445029, + "grad_norm": 2.2468442916870117, + "learning_rate": 4.584429099662468e-06, + "loss": 1.8922, + "step": 8613 + }, + { + "epoch": 0.9062598632298791, + "grad_norm": 2.1800897121429443, + "learning_rate": 4.574234919512698e-06, + "loss": 1.5657, + "step": 8614 + }, + { + "epoch": 0.9063650710152551, + "grad_norm": 2.317814826965332, + "learning_rate": 4.564051820842796e-06, + "loss": 1.7934, + "step": 8615 + }, + { + "epoch": 0.9064702788006312, + "grad_norm": 1.8590099811553955, + "learning_rate": 4.553879804835282e-06, + "loss": 2.0423, + "step": 8616 + }, + { + "epoch": 0.9065754865860074, + "grad_norm": 1.0570123195648193, + "learning_rate": 4.543718872671421e-06, + "loss": 2.114, + "step": 8617 + }, + { + "epoch": 0.9066806943713834, + "grad_norm": 1.6926145553588867, + "learning_rate": 4.533569025531137e-06, + "loss": 1.5145, + "step": 8618 + }, + { + "epoch": 0.9067859021567596, + "grad_norm": 2.080049514770508, + "learning_rate": 4.523430264593132e-06, + "loss": 2.0468, + "step": 8619 + }, + { + "epoch": 0.9068911099421357, + "grad_norm": 1.7230383157730103, + "learning_rate": 4.5133025910347845e-06, + "loss": 1.407, + "step": 8620 + }, + { + "epoch": 0.9069963177275119, + "grad_norm": 1.7170302867889404, + "learning_rate": 4.5031860060321455e-06, + "loss": 1.0797, + "step": 8621 + }, + { + "epoch": 0.9071015255128879, + "grad_norm": 1.9875949621200562, + "learning_rate": 4.493080510760083e-06, + "loss": 1.2423, + "step": 8622 + }, + { + "epoch": 0.9072067332982641, + "grad_norm": 2.1834206581115723, + "learning_rate": 4.482986106392073e-06, + "loss": 1.2699, + "step": 8623 + }, + { + "epoch": 0.9073119410836402, + "grad_norm": 1.1872564554214478, + "learning_rate": 4.472902794100342e-06, + "loss": 1.7113, + "step": 8624 + }, + { + "epoch": 0.9074171488690163, + "grad_norm": 1.889709711074829, + "learning_rate": 4.4628305750558656e-06, + "loss": 1.9109, + "step": 8625 + }, + { + "epoch": 0.9075223566543924, + "grad_norm": 1.6247581243515015, + "learning_rate": 4.452769450428273e-06, + "loss": 1.1514, + "step": 8626 + }, + { + "epoch": 0.9076275644397686, + "grad_norm": 1.985897183418274, + "learning_rate": 4.442719421385922e-06, + "loss": 1.7738, + "step": 8627 + }, + { + "epoch": 0.9077327722251447, + "grad_norm": 1.4728162288665771, + "learning_rate": 4.4326804890959195e-06, + "loss": 1.8999, + "step": 8628 + }, + { + "epoch": 0.9078379800105207, + "grad_norm": 1.9778163433074951, + "learning_rate": 4.422652654724036e-06, + "loss": 2.0382, + "step": 8629 + }, + { + "epoch": 0.9079431877958969, + "grad_norm": 1.3184362649917603, + "learning_rate": 4.412635919434749e-06, + "loss": 1.6149, + "step": 8630 + }, + { + "epoch": 0.908048395581273, + "grad_norm": 1.9387370347976685, + "learning_rate": 4.402630284391318e-06, + "loss": 1.4208, + "step": 8631 + }, + { + "epoch": 0.9081536033666492, + "grad_norm": 1.576529860496521, + "learning_rate": 4.392635750755625e-06, + "loss": 1.4622, + "step": 8632 + }, + { + "epoch": 0.9082588111520252, + "grad_norm": 2.166128396987915, + "learning_rate": 4.382652319688307e-06, + "loss": 2.017, + "step": 8633 + }, + { + "epoch": 0.9083640189374014, + "grad_norm": 1.8880326747894287, + "learning_rate": 4.372679992348727e-06, + "loss": 1.7115, + "step": 8634 + }, + { + "epoch": 0.9084692267227775, + "grad_norm": 1.9160791635513306, + "learning_rate": 4.362718769894925e-06, + "loss": 2.233, + "step": 8635 + }, + { + "epoch": 0.9085744345081536, + "grad_norm": 1.0861622095108032, + "learning_rate": 4.352768653483652e-06, + "loss": 1.4918, + "step": 8636 + }, + { + "epoch": 0.9086796422935297, + "grad_norm": 1.6084988117218018, + "learning_rate": 4.342829644270429e-06, + "loss": 1.69, + "step": 8637 + }, + { + "epoch": 0.9087848500789059, + "grad_norm": 1.7415450811386108, + "learning_rate": 4.332901743409379e-06, + "loss": 1.4335, + "step": 8638 + }, + { + "epoch": 0.908890057864282, + "grad_norm": 1.4303922653198242, + "learning_rate": 4.322984952053433e-06, + "loss": 1.3406, + "step": 8639 + }, + { + "epoch": 0.908995265649658, + "grad_norm": 1.2648975849151611, + "learning_rate": 4.313079271354192e-06, + "loss": 1.3649, + "step": 8640 + }, + { + "epoch": 0.9091004734350342, + "grad_norm": 1.7955864667892456, + "learning_rate": 4.303184702461948e-06, + "loss": 1.5743, + "step": 8641 + }, + { + "epoch": 0.9092056812204103, + "grad_norm": 1.23116934299469, + "learning_rate": 4.293301246525761e-06, + "loss": 1.4245, + "step": 8642 + }, + { + "epoch": 0.9093108890057864, + "grad_norm": 1.7114944458007812, + "learning_rate": 4.283428904693343e-06, + "loss": 1.8552, + "step": 8643 + }, + { + "epoch": 0.9094160967911625, + "grad_norm": 1.141519546508789, + "learning_rate": 4.273567678111123e-06, + "loss": 1.773, + "step": 8644 + }, + { + "epoch": 0.9095213045765387, + "grad_norm": 1.2643916606903076, + "learning_rate": 4.263717567924286e-06, + "loss": 1.8676, + "step": 8645 + }, + { + "epoch": 0.9096265123619148, + "grad_norm": 1.395230770111084, + "learning_rate": 4.2538785752766816e-06, + "loss": 1.591, + "step": 8646 + }, + { + "epoch": 0.9097317201472909, + "grad_norm": 1.6044540405273438, + "learning_rate": 4.244050701310853e-06, + "loss": 2.1957, + "step": 8647 + }, + { + "epoch": 0.909836927932667, + "grad_norm": 1.558889389038086, + "learning_rate": 4.234233947168109e-06, + "loss": 1.3307, + "step": 8648 + }, + { + "epoch": 0.9099421357180432, + "grad_norm": 2.091890811920166, + "learning_rate": 4.224428313988416e-06, + "loss": 1.092, + "step": 8649 + }, + { + "epoch": 0.9100473435034192, + "grad_norm": 2.1650590896606445, + "learning_rate": 4.214633802910506e-06, + "loss": 1.9047, + "step": 8650 + }, + { + "epoch": 0.9101525512887954, + "grad_norm": 1.981006145477295, + "learning_rate": 4.204850415071748e-06, + "loss": 1.1051, + "step": 8651 + }, + { + "epoch": 0.9102577590741715, + "grad_norm": 1.77347993850708, + "learning_rate": 4.195078151608256e-06, + "loss": 1.6043, + "step": 8652 + }, + { + "epoch": 0.9103629668595477, + "grad_norm": 3.0252108573913574, + "learning_rate": 4.185317013654866e-06, + "loss": 1.4648, + "step": 8653 + }, + { + "epoch": 0.9104681746449237, + "grad_norm": 1.7558192014694214, + "learning_rate": 4.175567002345104e-06, + "loss": 1.2865, + "step": 8654 + }, + { + "epoch": 0.9105733824302998, + "grad_norm": 1.6132521629333496, + "learning_rate": 4.165828118811199e-06, + "loss": 1.5495, + "step": 8655 + }, + { + "epoch": 0.910678590215676, + "grad_norm": 1.244947910308838, + "learning_rate": 4.156100364184101e-06, + "loss": 2.0034, + "step": 8656 + }, + { + "epoch": 0.910783798001052, + "grad_norm": 1.8241870403289795, + "learning_rate": 4.146383739593474e-06, + "loss": 1.6338, + "step": 8657 + }, + { + "epoch": 0.9108890057864282, + "grad_norm": 0.9370825290679932, + "learning_rate": 4.136678246167636e-06, + "loss": 1.6983, + "step": 8658 + }, + { + "epoch": 0.9109942135718043, + "grad_norm": 1.7532540559768677, + "learning_rate": 4.12698388503372e-06, + "loss": 1.5103, + "step": 8659 + }, + { + "epoch": 0.9110994213571805, + "grad_norm": 1.93152916431427, + "learning_rate": 4.1173006573174354e-06, + "loss": 1.9619, + "step": 8660 + }, + { + "epoch": 0.9112046291425565, + "grad_norm": 1.5335841178894043, + "learning_rate": 4.107628564143306e-06, + "loss": 1.519, + "step": 8661 + }, + { + "epoch": 0.9113098369279327, + "grad_norm": 1.8598703145980835, + "learning_rate": 4.0979676066345005e-06, + "loss": 1.3632, + "step": 8662 + }, + { + "epoch": 0.9114150447133088, + "grad_norm": 1.7030982971191406, + "learning_rate": 4.08831778591291e-06, + "loss": 1.0882, + "step": 8663 + }, + { + "epoch": 0.911520252498685, + "grad_norm": 1.5589040517807007, + "learning_rate": 4.078679103099159e-06, + "loss": 2.0111, + "step": 8664 + }, + { + "epoch": 0.911625460284061, + "grad_norm": 1.4710445404052734, + "learning_rate": 4.069051559312531e-06, + "loss": 1.8036, + "step": 8665 + }, + { + "epoch": 0.9117306680694371, + "grad_norm": 2.0354957580566406, + "learning_rate": 4.0594351556710544e-06, + "loss": 1.6073, + "step": 8666 + }, + { + "epoch": 0.9118358758548133, + "grad_norm": 1.0362532138824463, + "learning_rate": 4.049829893291457e-06, + "loss": 1.689, + "step": 8667 + }, + { + "epoch": 0.9119410836401893, + "grad_norm": 1.7926151752471924, + "learning_rate": 4.040235773289147e-06, + "loss": 1.7301, + "step": 8668 + }, + { + "epoch": 0.9120462914255655, + "grad_norm": 1.6435024738311768, + "learning_rate": 4.030652796778267e-06, + "loss": 1.5168, + "step": 8669 + }, + { + "epoch": 0.9121514992109416, + "grad_norm": 1.7422984838485718, + "learning_rate": 4.021080964871671e-06, + "loss": 1.7809, + "step": 8670 + }, + { + "epoch": 0.9122567069963178, + "grad_norm": 2.355043411254883, + "learning_rate": 4.011520278680891e-06, + "loss": 1.9162, + "step": 8671 + }, + { + "epoch": 0.9123619147816938, + "grad_norm": 1.7827218770980835, + "learning_rate": 4.001970739316163e-06, + "loss": 1.322, + "step": 8672 + }, + { + "epoch": 0.91246712256707, + "grad_norm": 1.2466920614242554, + "learning_rate": 3.9924323478864655e-06, + "loss": 1.2959, + "step": 8673 + }, + { + "epoch": 0.9125723303524461, + "grad_norm": 1.3089375495910645, + "learning_rate": 3.982905105499468e-06, + "loss": 1.3969, + "step": 8674 + }, + { + "epoch": 0.9126775381378222, + "grad_norm": 1.8237321376800537, + "learning_rate": 3.973389013261497e-06, + "loss": 1.2769, + "step": 8675 + }, + { + "epoch": 0.9127827459231983, + "grad_norm": 1.5145193338394165, + "learning_rate": 3.9638840722776685e-06, + "loss": 1.4357, + "step": 8676 + }, + { + "epoch": 0.9128879537085745, + "grad_norm": 1.9698090553283691, + "learning_rate": 3.954390283651754e-06, + "loss": 1.4994, + "step": 8677 + }, + { + "epoch": 0.9129931614939506, + "grad_norm": 1.725354790687561, + "learning_rate": 3.944907648486196e-06, + "loss": 1.4207, + "step": 8678 + }, + { + "epoch": 0.9130983692793266, + "grad_norm": 2.0058798789978027, + "learning_rate": 3.935436167882234e-06, + "loss": 1.1381, + "step": 8679 + }, + { + "epoch": 0.9132035770647028, + "grad_norm": 1.658754587173462, + "learning_rate": 3.925975842939733e-06, + "loss": 1.7292, + "step": 8680 + }, + { + "epoch": 0.9133087848500789, + "grad_norm": 1.5891391038894653, + "learning_rate": 3.916526674757293e-06, + "loss": 1.0397, + "step": 8681 + }, + { + "epoch": 0.913413992635455, + "grad_norm": 1.2456856966018677, + "learning_rate": 3.907088664432224e-06, + "loss": 2.1817, + "step": 8682 + }, + { + "epoch": 0.9135192004208311, + "grad_norm": 1.409684658050537, + "learning_rate": 3.897661813060494e-06, + "loss": 1.3057, + "step": 8683 + }, + { + "epoch": 0.9136244082062073, + "grad_norm": 1.887300968170166, + "learning_rate": 3.8882461217368604e-06, + "loss": 1.7419, + "step": 8684 + }, + { + "epoch": 0.9137296159915834, + "grad_norm": 1.338424801826477, + "learning_rate": 3.878841591554716e-06, + "loss": 1.6628, + "step": 8685 + }, + { + "epoch": 0.9138348237769595, + "grad_norm": 1.1566272974014282, + "learning_rate": 3.869448223606165e-06, + "loss": 1.7944, + "step": 8686 + }, + { + "epoch": 0.9139400315623356, + "grad_norm": 1.7602925300598145, + "learning_rate": 3.860066018982056e-06, + "loss": 1.2639, + "step": 8687 + }, + { + "epoch": 0.9140452393477118, + "grad_norm": 1.099798560142517, + "learning_rate": 3.850694978771896e-06, + "loss": 1.5982, + "step": 8688 + }, + { + "epoch": 0.9141504471330878, + "grad_norm": 1.6427594423294067, + "learning_rate": 3.841335104063904e-06, + "loss": 2.0918, + "step": 8689 + }, + { + "epoch": 0.914255654918464, + "grad_norm": 1.4541171789169312, + "learning_rate": 3.831986395945042e-06, + "loss": 1.8839, + "step": 8690 + }, + { + "epoch": 0.9143608627038401, + "grad_norm": 1.9176745414733887, + "learning_rate": 3.822648855500921e-06, + "loss": 1.5762, + "step": 8691 + }, + { + "epoch": 0.9144660704892162, + "grad_norm": 1.3941978216171265, + "learning_rate": 3.813322483815862e-06, + "loss": 1.6778, + "step": 8692 + }, + { + "epoch": 0.9145712782745923, + "grad_norm": 2.07476544380188, + "learning_rate": 3.8040072819729545e-06, + "loss": 1.2133, + "step": 8693 + }, + { + "epoch": 0.9146764860599684, + "grad_norm": 1.7748063802719116, + "learning_rate": 3.794703251053899e-06, + "loss": 1.7154, + "step": 8694 + }, + { + "epoch": 0.9147816938453446, + "grad_norm": 1.5125572681427002, + "learning_rate": 3.785410392139166e-06, + "loss": 1.6641, + "step": 8695 + }, + { + "epoch": 0.9148869016307207, + "grad_norm": 2.81387996673584, + "learning_rate": 3.776128706307902e-06, + "loss": 1.3597, + "step": 8696 + }, + { + "epoch": 0.9149921094160968, + "grad_norm": 2.081625461578369, + "learning_rate": 3.7668581946379345e-06, + "loss": 1.3875, + "step": 8697 + }, + { + "epoch": 0.9150973172014729, + "grad_norm": 1.5099921226501465, + "learning_rate": 3.7575988582058575e-06, + "loss": 2.1729, + "step": 8698 + }, + { + "epoch": 0.9152025249868491, + "grad_norm": 1.3036530017852783, + "learning_rate": 3.7483506980868997e-06, + "loss": 0.8875, + "step": 8699 + }, + { + "epoch": 0.9153077327722251, + "grad_norm": 1.566401720046997, + "learning_rate": 3.7391137153550137e-06, + "loss": 1.5267, + "step": 8700 + }, + { + "epoch": 0.9154129405576013, + "grad_norm": 1.6144717931747437, + "learning_rate": 3.7298879110828965e-06, + "loss": 1.6758, + "step": 8701 + }, + { + "epoch": 0.9155181483429774, + "grad_norm": 1.2582496404647827, + "learning_rate": 3.7206732863418804e-06, + "loss": 1.6363, + "step": 8702 + }, + { + "epoch": 0.9156233561283535, + "grad_norm": 1.8955252170562744, + "learning_rate": 3.71146984220202e-06, + "loss": 1.6015, + "step": 8703 + }, + { + "epoch": 0.9157285639137296, + "grad_norm": 1.6370620727539062, + "learning_rate": 3.702277579732116e-06, + "loss": 1.4619, + "step": 8704 + }, + { + "epoch": 0.9158337716991057, + "grad_norm": 2.216202974319458, + "learning_rate": 3.6930964999995933e-06, + "loss": 1.4897, + "step": 8705 + }, + { + "epoch": 0.9159389794844819, + "grad_norm": 1.8498296737670898, + "learning_rate": 3.683926604070653e-06, + "loss": 1.7555, + "step": 8706 + }, + { + "epoch": 0.9160441872698579, + "grad_norm": 2.3383429050445557, + "learning_rate": 3.6747678930101558e-06, + "loss": 1.4773, + "step": 8707 + }, + { + "epoch": 0.9161493950552341, + "grad_norm": 1.3877893686294556, + "learning_rate": 3.6656203678816723e-06, + "loss": 1.9327, + "step": 8708 + }, + { + "epoch": 0.9162546028406102, + "grad_norm": 1.5240907669067383, + "learning_rate": 3.6564840297474757e-06, + "loss": 2.1459, + "step": 8709 + }, + { + "epoch": 0.9163598106259864, + "grad_norm": 1.7055188417434692, + "learning_rate": 3.6473588796685386e-06, + "loss": 1.5657, + "step": 8710 + }, + { + "epoch": 0.9164650184113624, + "grad_norm": 2.1579809188842773, + "learning_rate": 3.6382449187045144e-06, + "loss": 1.5808, + "step": 8711 + }, + { + "epoch": 0.9165702261967386, + "grad_norm": 1.276623010635376, + "learning_rate": 3.629142147913811e-06, + "loss": 1.9783, + "step": 8712 + }, + { + "epoch": 0.9166754339821147, + "grad_norm": 2.286100149154663, + "learning_rate": 3.6200505683534945e-06, + "loss": 1.714, + "step": 8713 + }, + { + "epoch": 0.9167806417674907, + "grad_norm": 2.115018367767334, + "learning_rate": 3.6109701810793208e-06, + "loss": 1.7412, + "step": 8714 + }, + { + "epoch": 0.9168858495528669, + "grad_norm": 1.6776834726333618, + "learning_rate": 3.6019009871457897e-06, + "loss": 1.715, + "step": 8715 + }, + { + "epoch": 0.916991057338243, + "grad_norm": 1.3437702655792236, + "learning_rate": 3.59284298760606e-06, + "loss": 1.1101, + "step": 8716 + }, + { + "epoch": 0.9170962651236192, + "grad_norm": 3.0596518516540527, + "learning_rate": 3.5837961835120006e-06, + "loss": 1.4793, + "step": 8717 + }, + { + "epoch": 0.9172014729089952, + "grad_norm": 1.5906575918197632, + "learning_rate": 3.5747605759142157e-06, + "loss": 1.6282, + "step": 8718 + }, + { + "epoch": 0.9173066806943714, + "grad_norm": 2.017958402633667, + "learning_rate": 3.565736165861966e-06, + "loss": 1.9276, + "step": 8719 + }, + { + "epoch": 0.9174118884797475, + "grad_norm": 2.3949334621429443, + "learning_rate": 3.5567229544032133e-06, + "loss": 0.857, + "step": 8720 + }, + { + "epoch": 0.9175170962651236, + "grad_norm": 3.835064649581909, + "learning_rate": 3.5477209425846538e-06, + "loss": 1.3035, + "step": 8721 + }, + { + "epoch": 0.9176223040504997, + "grad_norm": 1.541480302810669, + "learning_rate": 3.538730131451651e-06, + "loss": 1.6198, + "step": 8722 + }, + { + "epoch": 0.9177275118358759, + "grad_norm": 1.4783751964569092, + "learning_rate": 3.529750522048281e-06, + "loss": 1.5319, + "step": 8723 + }, + { + "epoch": 0.917832719621252, + "grad_norm": 1.9219422340393066, + "learning_rate": 3.5207821154173093e-06, + "loss": 1.6938, + "step": 8724 + }, + { + "epoch": 0.917937927406628, + "grad_norm": 1.5939526557922363, + "learning_rate": 3.5118249126002035e-06, + "loss": 1.4352, + "step": 8725 + }, + { + "epoch": 0.9180431351920042, + "grad_norm": 1.3447009325027466, + "learning_rate": 3.5028789146371533e-06, + "loss": 1.7133, + "step": 8726 + }, + { + "epoch": 0.9181483429773803, + "grad_norm": 1.859089970588684, + "learning_rate": 3.4939441225670054e-06, + "loss": 1.8013, + "step": 8727 + }, + { + "epoch": 0.9182535507627565, + "grad_norm": 1.8480535745620728, + "learning_rate": 3.4850205374273416e-06, + "loss": 1.5742, + "step": 8728 + }, + { + "epoch": 0.9183587585481325, + "grad_norm": 2.163987159729004, + "learning_rate": 3.476108160254443e-06, + "loss": 1.2029, + "step": 8729 + }, + { + "epoch": 0.9184639663335087, + "grad_norm": 1.8570356369018555, + "learning_rate": 3.4672069920832493e-06, + "loss": 1.558, + "step": 8730 + }, + { + "epoch": 0.9185691741188848, + "grad_norm": 1.9687753915786743, + "learning_rate": 3.4583170339474223e-06, + "loss": 1.6714, + "step": 8731 + }, + { + "epoch": 0.9186743819042609, + "grad_norm": 1.7975976467132568, + "learning_rate": 3.4494382868793474e-06, + "loss": 1.623, + "step": 8732 + }, + { + "epoch": 0.918779589689637, + "grad_norm": 1.992436170578003, + "learning_rate": 3.440570751910066e-06, + "loss": 1.2232, + "step": 8733 + }, + { + "epoch": 0.9188847974750132, + "grad_norm": 1.5416083335876465, + "learning_rate": 3.4317144300693328e-06, + "loss": 1.3667, + "step": 8734 + }, + { + "epoch": 0.9189900052603893, + "grad_norm": 2.0780837535858154, + "learning_rate": 3.4228693223856136e-06, + "loss": 1.6205, + "step": 8735 + }, + { + "epoch": 0.9190952130457654, + "grad_norm": 2.3178086280822754, + "learning_rate": 3.4140354298860756e-06, + "loss": 1.6387, + "step": 8736 + }, + { + "epoch": 0.9192004208311415, + "grad_norm": 2.03950834274292, + "learning_rate": 3.405212753596532e-06, + "loss": 1.2235, + "step": 8737 + }, + { + "epoch": 0.9193056286165177, + "grad_norm": 1.5267329216003418, + "learning_rate": 3.3964012945415624e-06, + "loss": 1.9435, + "step": 8738 + }, + { + "epoch": 0.9194108364018937, + "grad_norm": 1.4271742105484009, + "learning_rate": 3.3876010537444046e-06, + "loss": 1.3572, + "step": 8739 + }, + { + "epoch": 0.9195160441872698, + "grad_norm": 1.5185612440109253, + "learning_rate": 3.3788120322269855e-06, + "loss": 1.1357, + "step": 8740 + }, + { + "epoch": 0.919621251972646, + "grad_norm": 2.3991973400115967, + "learning_rate": 3.3700342310099773e-06, + "loss": 2.0348, + "step": 8741 + }, + { + "epoch": 0.9197264597580221, + "grad_norm": 1.1119378805160522, + "learning_rate": 3.361267651112676e-06, + "loss": 1.4776, + "step": 8742 + }, + { + "epoch": 0.9198316675433982, + "grad_norm": 2.3870489597320557, + "learning_rate": 3.3525122935531562e-06, + "loss": 1.8226, + "step": 8743 + }, + { + "epoch": 0.9199368753287743, + "grad_norm": 1.734883189201355, + "learning_rate": 3.3437681593481486e-06, + "loss": 1.6325, + "step": 8744 + }, + { + "epoch": 0.9200420831141505, + "grad_norm": 1.6427862644195557, + "learning_rate": 3.3350352495130298e-06, + "loss": 1.6529, + "step": 8745 + }, + { + "epoch": 0.9201472908995265, + "grad_norm": 1.838613748550415, + "learning_rate": 3.326313565061978e-06, + "loss": 1.3113, + "step": 8746 + }, + { + "epoch": 0.9202524986849027, + "grad_norm": 2.063891887664795, + "learning_rate": 3.3176031070077827e-06, + "loss": 2.1088, + "step": 8747 + }, + { + "epoch": 0.9203577064702788, + "grad_norm": 1.299898624420166, + "learning_rate": 3.3089038763619684e-06, + "loss": 1.7546, + "step": 8748 + }, + { + "epoch": 0.920462914255655, + "grad_norm": 1.4041413068771362, + "learning_rate": 3.30021587413476e-06, + "loss": 2.0231, + "step": 8749 + }, + { + "epoch": 0.920568122041031, + "grad_norm": 2.1937994956970215, + "learning_rate": 3.2915391013350392e-06, + "loss": 2.0115, + "step": 8750 + }, + { + "epoch": 0.9206733298264071, + "grad_norm": 1.2597063779830933, + "learning_rate": 3.282873558970445e-06, + "loss": 1.7548, + "step": 8751 + }, + { + "epoch": 0.9207785376117833, + "grad_norm": 1.6283094882965088, + "learning_rate": 3.2742192480472724e-06, + "loss": 1.5641, + "step": 8752 + }, + { + "epoch": 0.9208837453971593, + "grad_norm": 1.535847783088684, + "learning_rate": 3.2655761695704834e-06, + "loss": 1.769, + "step": 8753 + }, + { + "epoch": 0.9209889531825355, + "grad_norm": 1.572363018989563, + "learning_rate": 3.256944324543809e-06, + "loss": 1.5744, + "step": 8754 + }, + { + "epoch": 0.9210941609679116, + "grad_norm": 1.8032740354537964, + "learning_rate": 3.2483237139696255e-06, + "loss": 1.1064, + "step": 8755 + }, + { + "epoch": 0.9211993687532878, + "grad_norm": 1.5819141864776611, + "learning_rate": 3.2397143388489983e-06, + "loss": 1.3802, + "step": 8756 + }, + { + "epoch": 0.9213045765386638, + "grad_norm": 1.1894673109054565, + "learning_rate": 3.2311162001817387e-06, + "loss": 1.4237, + "step": 8757 + }, + { + "epoch": 0.92140978432404, + "grad_norm": 1.9379583597183228, + "learning_rate": 3.2225292989662925e-06, + "loss": 1.9112, + "step": 8758 + }, + { + "epoch": 0.9215149921094161, + "grad_norm": 1.32359778881073, + "learning_rate": 3.21395363619984e-06, + "loss": 1.3942, + "step": 8759 + }, + { + "epoch": 0.9216201998947923, + "grad_norm": 1.8300716876983643, + "learning_rate": 3.2053892128782403e-06, + "loss": 1.7214, + "step": 8760 + }, + { + "epoch": 0.9217254076801683, + "grad_norm": 1.1215351819992065, + "learning_rate": 3.1968360299960643e-06, + "loss": 1.3853, + "step": 8761 + }, + { + "epoch": 0.9218306154655445, + "grad_norm": 1.482314944267273, + "learning_rate": 3.1882940885465397e-06, + "loss": 1.6937, + "step": 8762 + }, + { + "epoch": 0.9219358232509206, + "grad_norm": 1.5101746320724487, + "learning_rate": 3.1797633895216394e-06, + "loss": 1.7438, + "step": 8763 + }, + { + "epoch": 0.9220410310362966, + "grad_norm": 1.2713983058929443, + "learning_rate": 3.171243933911994e-06, + "loss": 1.5315, + "step": 8764 + }, + { + "epoch": 0.9221462388216728, + "grad_norm": 2.0249693393707275, + "learning_rate": 3.1627357227069333e-06, + "loss": 1.4189, + "step": 8765 + }, + { + "epoch": 0.9222514466070489, + "grad_norm": 1.0391638278961182, + "learning_rate": 3.154238756894512e-06, + "loss": 1.8894, + "step": 8766 + }, + { + "epoch": 0.9223566543924251, + "grad_norm": 1.596531629562378, + "learning_rate": 3.1457530374614295e-06, + "loss": 2.0511, + "step": 8767 + }, + { + "epoch": 0.9224618621778011, + "grad_norm": 1.645261526107788, + "learning_rate": 3.1372785653931093e-06, + "loss": 1.7624, + "step": 8768 + }, + { + "epoch": 0.9225670699631773, + "grad_norm": 1.3146398067474365, + "learning_rate": 3.128815341673674e-06, + "loss": 1.9661, + "step": 8769 + }, + { + "epoch": 0.9226722777485534, + "grad_norm": 1.578999400138855, + "learning_rate": 3.120363367285917e-06, + "loss": 1.3778, + "step": 8770 + }, + { + "epoch": 0.9227774855339295, + "grad_norm": 2.222402811050415, + "learning_rate": 3.111922643211351e-06, + "loss": 2.074, + "step": 8771 + }, + { + "epoch": 0.9228826933193056, + "grad_norm": 1.712213158607483, + "learning_rate": 3.1034931704301606e-06, + "loss": 1.6251, + "step": 8772 + }, + { + "epoch": 0.9229879011046818, + "grad_norm": 1.4497140645980835, + "learning_rate": 3.0950749499212283e-06, + "loss": 1.5102, + "step": 8773 + }, + { + "epoch": 0.9230931088900579, + "grad_norm": 1.7401906251907349, + "learning_rate": 3.0866679826621504e-06, + "loss": 1.5702, + "step": 8774 + }, + { + "epoch": 0.923198316675434, + "grad_norm": 1.3884131908416748, + "learning_rate": 3.078272269629201e-06, + "loss": 1.6859, + "step": 8775 + }, + { + "epoch": 0.9233035244608101, + "grad_norm": 1.8712023496627808, + "learning_rate": 3.0698878117973117e-06, + "loss": 1.7209, + "step": 8776 + }, + { + "epoch": 0.9234087322461862, + "grad_norm": 1.4813895225524902, + "learning_rate": 3.0615146101401925e-06, + "loss": 1.5664, + "step": 8777 + }, + { + "epoch": 0.9235139400315623, + "grad_norm": 1.8341212272644043, + "learning_rate": 3.053152665630166e-06, + "loss": 1.4758, + "step": 8778 + }, + { + "epoch": 0.9236191478169384, + "grad_norm": 2.59242582321167, + "learning_rate": 3.0448019792382654e-06, + "loss": 1.4894, + "step": 8779 + }, + { + "epoch": 0.9237243556023146, + "grad_norm": 1.6962684392929077, + "learning_rate": 3.0364625519342603e-06, + "loss": 1.1454, + "step": 8780 + }, + { + "epoch": 0.9238295633876907, + "grad_norm": 1.346834421157837, + "learning_rate": 3.028134384686565e-06, + "loss": 1.3534, + "step": 8781 + }, + { + "epoch": 0.9239347711730668, + "grad_norm": 1.6630899906158447, + "learning_rate": 3.0198174784622944e-06, + "loss": 1.7483, + "step": 8782 + }, + { + "epoch": 0.9240399789584429, + "grad_norm": 1.2288480997085571, + "learning_rate": 3.0115118342272765e-06, + "loss": 1.9129, + "step": 8783 + }, + { + "epoch": 0.9241451867438191, + "grad_norm": 1.761152982711792, + "learning_rate": 3.0032174529460165e-06, + "loss": 1.4155, + "step": 8784 + }, + { + "epoch": 0.9242503945291951, + "grad_norm": 1.8571745157241821, + "learning_rate": 2.9949343355817003e-06, + "loss": 1.5797, + "step": 8785 + }, + { + "epoch": 0.9243556023145713, + "grad_norm": 1.4947131872177124, + "learning_rate": 2.9866624830962366e-06, + "loss": 1.5442, + "step": 8786 + }, + { + "epoch": 0.9244608100999474, + "grad_norm": 1.9050381183624268, + "learning_rate": 2.9784018964502114e-06, + "loss": 1.0048, + "step": 8787 + }, + { + "epoch": 0.9245660178853236, + "grad_norm": 1.943846583366394, + "learning_rate": 2.9701525766028802e-06, + "loss": 1.6949, + "step": 8788 + }, + { + "epoch": 0.9246712256706996, + "grad_norm": 2.4148600101470947, + "learning_rate": 2.9619145245122217e-06, + "loss": 0.8479, + "step": 8789 + }, + { + "epoch": 0.9247764334560757, + "grad_norm": 1.4801805019378662, + "learning_rate": 2.9536877411348808e-06, + "loss": 2.1227, + "step": 8790 + }, + { + "epoch": 0.9248816412414519, + "grad_norm": 1.3251457214355469, + "learning_rate": 2.945472227426227e-06, + "loss": 1.5081, + "step": 8791 + }, + { + "epoch": 0.924986849026828, + "grad_norm": 2.5353012084960938, + "learning_rate": 2.9372679843402863e-06, + "loss": 1.4429, + "step": 8792 + }, + { + "epoch": 0.9250920568122041, + "grad_norm": 1.5721760988235474, + "learning_rate": 2.9290750128297963e-06, + "loss": 1.9756, + "step": 8793 + }, + { + "epoch": 0.9251972645975802, + "grad_norm": 1.1288405656814575, + "learning_rate": 2.9208933138461737e-06, + "loss": 2.0431, + "step": 8794 + }, + { + "epoch": 0.9253024723829564, + "grad_norm": 1.4272089004516602, + "learning_rate": 2.9127228883395472e-06, + "loss": 1.9728, + "step": 8795 + }, + { + "epoch": 0.9254076801683324, + "grad_norm": 1.9231061935424805, + "learning_rate": 2.904563737258692e-06, + "loss": 2.0053, + "step": 8796 + }, + { + "epoch": 0.9255128879537086, + "grad_norm": 1.3104902505874634, + "learning_rate": 2.8964158615511383e-06, + "loss": 1.5842, + "step": 8797 + }, + { + "epoch": 0.9256180957390847, + "grad_norm": 1.5530825853347778, + "learning_rate": 2.8882792621630406e-06, + "loss": 1.1644, + "step": 8798 + }, + { + "epoch": 0.9257233035244609, + "grad_norm": 1.2885637283325195, + "learning_rate": 2.8801539400393097e-06, + "loss": 1.7493, + "step": 8799 + }, + { + "epoch": 0.9258285113098369, + "grad_norm": 1.3112657070159912, + "learning_rate": 2.8720398961234907e-06, + "loss": 1.2708, + "step": 8800 + }, + { + "epoch": 0.925933719095213, + "grad_norm": 1.7676241397857666, + "learning_rate": 2.86393713135783e-06, + "loss": 1.7696, + "step": 8801 + }, + { + "epoch": 0.9260389268805892, + "grad_norm": 1.611799955368042, + "learning_rate": 2.8558456466832973e-06, + "loss": 1.4262, + "step": 8802 + }, + { + "epoch": 0.9261441346659652, + "grad_norm": 1.5839171409606934, + "learning_rate": 2.8477654430395185e-06, + "loss": 1.9976, + "step": 8803 + }, + { + "epoch": 0.9262493424513414, + "grad_norm": 1.8299293518066406, + "learning_rate": 2.839696521364821e-06, + "loss": 1.5226, + "step": 8804 + }, + { + "epoch": 0.9263545502367175, + "grad_norm": 2.451498508453369, + "learning_rate": 2.8316388825962324e-06, + "loss": 1.4629, + "step": 8805 + }, + { + "epoch": 0.9264597580220937, + "grad_norm": 1.6395529508590698, + "learning_rate": 2.823592527669461e-06, + "loss": 1.8826, + "step": 8806 + }, + { + "epoch": 0.9265649658074697, + "grad_norm": 1.9740159511566162, + "learning_rate": 2.8155574575188694e-06, + "loss": 1.2348, + "step": 8807 + }, + { + "epoch": 0.9266701735928459, + "grad_norm": 2.034790515899658, + "learning_rate": 2.8075336730775894e-06, + "loss": 1.6714, + "step": 8808 + }, + { + "epoch": 0.926775381378222, + "grad_norm": 1.232695460319519, + "learning_rate": 2.7995211752773752e-06, + "loss": 1.2372, + "step": 8809 + }, + { + "epoch": 0.9268805891635981, + "grad_norm": 1.4230536222457886, + "learning_rate": 2.7915199650486944e-06, + "loss": 1.8896, + "step": 8810 + }, + { + "epoch": 0.9269857969489742, + "grad_norm": 1.6737197637557983, + "learning_rate": 2.7835300433207035e-06, + "loss": 1.6089, + "step": 8811 + }, + { + "epoch": 0.9270910047343504, + "grad_norm": 2.7729249000549316, + "learning_rate": 2.7755514110212264e-06, + "loss": 1.311, + "step": 8812 + }, + { + "epoch": 0.9271962125197265, + "grad_norm": 1.7959167957305908, + "learning_rate": 2.767584069076823e-06, + "loss": 1.3979, + "step": 8813 + }, + { + "epoch": 0.9273014203051025, + "grad_norm": 0.992779016494751, + "learning_rate": 2.7596280184126965e-06, + "loss": 1.7532, + "step": 8814 + }, + { + "epoch": 0.9274066280904787, + "grad_norm": 1.6769089698791504, + "learning_rate": 2.751683259952764e-06, + "loss": 1.6832, + "step": 8815 + }, + { + "epoch": 0.9275118358758548, + "grad_norm": 1.5358198881149292, + "learning_rate": 2.7437497946196322e-06, + "loss": 1.1027, + "step": 8816 + }, + { + "epoch": 0.9276170436612309, + "grad_norm": 1.683468222618103, + "learning_rate": 2.7358276233345747e-06, + "loss": 1.014, + "step": 8817 + }, + { + "epoch": 0.927722251446607, + "grad_norm": 1.826439619064331, + "learning_rate": 2.727916747017556e-06, + "loss": 1.3374, + "step": 8818 + }, + { + "epoch": 0.9278274592319832, + "grad_norm": 1.3325717449188232, + "learning_rate": 2.7200171665872742e-06, + "loss": 1.8112, + "step": 8819 + }, + { + "epoch": 0.9279326670173593, + "grad_norm": 1.434065818786621, + "learning_rate": 2.7121288829610624e-06, + "loss": 1.5424, + "step": 8820 + }, + { + "epoch": 0.9280378748027354, + "grad_norm": 1.613349437713623, + "learning_rate": 2.7042518970549546e-06, + "loss": 1.3637, + "step": 8821 + }, + { + "epoch": 0.9281430825881115, + "grad_norm": 1.9332239627838135, + "learning_rate": 2.696386209783697e-06, + "loss": 1.5076, + "step": 8822 + }, + { + "epoch": 0.9282482903734877, + "grad_norm": 1.393325686454773, + "learning_rate": 2.6885318220606914e-06, + "loss": 1.6009, + "step": 8823 + }, + { + "epoch": 0.9283534981588638, + "grad_norm": 1.6824408769607544, + "learning_rate": 2.6806887347980427e-06, + "loss": 1.6287, + "step": 8824 + }, + { + "epoch": 0.9284587059442398, + "grad_norm": 1.7882195711135864, + "learning_rate": 2.6728569489065437e-06, + "loss": 2.0023, + "step": 8825 + }, + { + "epoch": 0.928563913729616, + "grad_norm": 1.559647560119629, + "learning_rate": 2.6650364652956894e-06, + "loss": 1.3792, + "step": 8826 + }, + { + "epoch": 0.9286691215149921, + "grad_norm": 1.2669860124588013, + "learning_rate": 2.657227284873609e-06, + "loss": 1.6591, + "step": 8827 + }, + { + "epoch": 0.9287743293003682, + "grad_norm": 1.9442473649978638, + "learning_rate": 2.6494294085472103e-06, + "loss": 1.7604, + "step": 8828 + }, + { + "epoch": 0.9288795370857443, + "grad_norm": 1.3520489931106567, + "learning_rate": 2.6416428372219914e-06, + "loss": 1.8279, + "step": 8829 + }, + { + "epoch": 0.9289847448711205, + "grad_norm": 1.2436840534210205, + "learning_rate": 2.6338675718022064e-06, + "loss": 1.9452, + "step": 8830 + }, + { + "epoch": 0.9290899526564966, + "grad_norm": 1.687325119972229, + "learning_rate": 2.6261036131907557e-06, + "loss": 1.1885, + "step": 8831 + }, + { + "epoch": 0.9291951604418727, + "grad_norm": 1.489946722984314, + "learning_rate": 2.61835096228924e-06, + "loss": 1.9571, + "step": 8832 + }, + { + "epoch": 0.9293003682272488, + "grad_norm": 1.1309078931808472, + "learning_rate": 2.6106096199979614e-06, + "loss": 1.4592, + "step": 8833 + }, + { + "epoch": 0.929405576012625, + "grad_norm": 1.816552758216858, + "learning_rate": 2.6028795872159005e-06, + "loss": 1.5374, + "step": 8834 + }, + { + "epoch": 0.929510783798001, + "grad_norm": 1.753360390663147, + "learning_rate": 2.5951608648406955e-06, + "loss": 1.5404, + "step": 8835 + }, + { + "epoch": 0.9296159915833772, + "grad_norm": 1.432955265045166, + "learning_rate": 2.587453453768729e-06, + "loss": 1.7795, + "step": 8836 + }, + { + "epoch": 0.9297211993687533, + "grad_norm": 2.5248990058898926, + "learning_rate": 2.579757354895018e-06, + "loss": 1.5791, + "step": 8837 + }, + { + "epoch": 0.9298264071541295, + "grad_norm": 1.3202687501907349, + "learning_rate": 2.5720725691132706e-06, + "loss": 1.4432, + "step": 8838 + }, + { + "epoch": 0.9299316149395055, + "grad_norm": 1.9619760513305664, + "learning_rate": 2.564399097315928e-06, + "loss": 1.7022, + "step": 8839 + }, + { + "epoch": 0.9300368227248816, + "grad_norm": 1.2252382040023804, + "learning_rate": 2.5567369403940776e-06, + "loss": 1.579, + "step": 8840 + }, + { + "epoch": 0.9301420305102578, + "grad_norm": 1.8557909727096558, + "learning_rate": 2.5490860992374745e-06, + "loss": 1.6796, + "step": 8841 + }, + { + "epoch": 0.9302472382956338, + "grad_norm": 1.5600543022155762, + "learning_rate": 2.5414465747346182e-06, + "loss": 1.5379, + "step": 8842 + }, + { + "epoch": 0.93035244608101, + "grad_norm": 1.3198964595794678, + "learning_rate": 2.5338183677726334e-06, + "loss": 1.8882, + "step": 8843 + }, + { + "epoch": 0.9304576538663861, + "grad_norm": 1.934952974319458, + "learning_rate": 2.526201479237389e-06, + "loss": 0.9782, + "step": 8844 + }, + { + "epoch": 0.9305628616517623, + "grad_norm": 1.6024317741394043, + "learning_rate": 2.5185959100133883e-06, + "loss": 1.7564, + "step": 8845 + }, + { + "epoch": 0.9306680694371383, + "grad_norm": 1.7014966011047363, + "learning_rate": 2.5110016609838473e-06, + "loss": 1.5487, + "step": 8846 + }, + { + "epoch": 0.9307732772225145, + "grad_norm": 1.7204549312591553, + "learning_rate": 2.50341873303066e-06, + "loss": 1.7045, + "step": 8847 + }, + { + "epoch": 0.9308784850078906, + "grad_norm": 1.1743104457855225, + "learning_rate": 2.495847127034401e-06, + "loss": 1.7379, + "step": 8848 + }, + { + "epoch": 0.9309836927932666, + "grad_norm": 1.450788140296936, + "learning_rate": 2.4882868438743436e-06, + "loss": 1.2717, + "step": 8849 + }, + { + "epoch": 0.9310889005786428, + "grad_norm": 1.5399311780929565, + "learning_rate": 2.480737884428441e-06, + "loss": 1.3663, + "step": 8850 + }, + { + "epoch": 0.931194108364019, + "grad_norm": 1.6990025043487549, + "learning_rate": 2.4732002495733154e-06, + "loss": 1.8612, + "step": 8851 + }, + { + "epoch": 0.9312993161493951, + "grad_norm": 1.3516472578048706, + "learning_rate": 2.46567394018431e-06, + "loss": 1.5048, + "step": 8852 + }, + { + "epoch": 0.9314045239347711, + "grad_norm": 2.059854030609131, + "learning_rate": 2.458158957135415e-06, + "loss": 1.6879, + "step": 8853 + }, + { + "epoch": 0.9315097317201473, + "grad_norm": 1.1983468532562256, + "learning_rate": 2.4506553012993093e-06, + "loss": 1.7273, + "step": 8854 + }, + { + "epoch": 0.9316149395055234, + "grad_norm": 1.6070380210876465, + "learning_rate": 2.443162973547386e-06, + "loss": 1.3611, + "step": 8855 + }, + { + "epoch": 0.9317201472908996, + "grad_norm": 1.6409918069839478, + "learning_rate": 2.435681974749704e-06, + "loss": 1.5396, + "step": 8856 + }, + { + "epoch": 0.9318253550762756, + "grad_norm": 2.1190476417541504, + "learning_rate": 2.4282123057750016e-06, + "loss": 1.9413, + "step": 8857 + }, + { + "epoch": 0.9319305628616518, + "grad_norm": 1.454859733581543, + "learning_rate": 2.4207539674907075e-06, + "loss": 1.6396, + "step": 8858 + }, + { + "epoch": 0.9320357706470279, + "grad_norm": 1.9500346183776855, + "learning_rate": 2.413306960762929e-06, + "loss": 1.0076, + "step": 8859 + }, + { + "epoch": 0.932140978432404, + "grad_norm": 2.006927013397217, + "learning_rate": 2.4058712864564736e-06, + "loss": 1.8154, + "step": 8860 + }, + { + "epoch": 0.9322461862177801, + "grad_norm": 1.5914230346679688, + "learning_rate": 2.398446945434818e-06, + "loss": 1.5964, + "step": 8861 + }, + { + "epoch": 0.9323513940031563, + "grad_norm": 1.940948724746704, + "learning_rate": 2.3910339385601168e-06, + "loss": 1.8728, + "step": 8862 + }, + { + "epoch": 0.9324566017885324, + "grad_norm": 1.6678147315979004, + "learning_rate": 2.383632266693225e-06, + "loss": 2.0333, + "step": 8863 + }, + { + "epoch": 0.9325618095739084, + "grad_norm": 1.5024465322494507, + "learning_rate": 2.376241930693679e-06, + "loss": 1.7904, + "step": 8864 + }, + { + "epoch": 0.9326670173592846, + "grad_norm": 1.273901343345642, + "learning_rate": 2.368862931419702e-06, + "loss": 1.3617, + "step": 8865 + }, + { + "epoch": 0.9327722251446607, + "grad_norm": 1.9819680452346802, + "learning_rate": 2.3614952697281534e-06, + "loss": 1.5756, + "step": 8866 + }, + { + "epoch": 0.9328774329300368, + "grad_norm": 1.4044946432113647, + "learning_rate": 2.354138946474671e-06, + "loss": 1.7253, + "step": 8867 + }, + { + "epoch": 0.9329826407154129, + "grad_norm": 1.6213405132293701, + "learning_rate": 2.346793962513483e-06, + "loss": 1.9362, + "step": 8868 + }, + { + "epoch": 0.9330878485007891, + "grad_norm": 2.6082890033721924, + "learning_rate": 2.3394603186975393e-06, + "loss": 1.5571, + "step": 8869 + }, + { + "epoch": 0.9331930562861652, + "grad_norm": 1.7648690938949585, + "learning_rate": 2.332138015878482e-06, + "loss": 1.5838, + "step": 8870 + }, + { + "epoch": 0.9332982640715413, + "grad_norm": 1.2517858743667603, + "learning_rate": 2.324827054906631e-06, + "loss": 1.4428, + "step": 8871 + }, + { + "epoch": 0.9334034718569174, + "grad_norm": 1.9067564010620117, + "learning_rate": 2.317527436630973e-06, + "loss": 2.0886, + "step": 8872 + }, + { + "epoch": 0.9335086796422936, + "grad_norm": 1.4557033777236938, + "learning_rate": 2.310239161899208e-06, + "loss": 1.4884, + "step": 8873 + }, + { + "epoch": 0.9336138874276696, + "grad_norm": 1.6103370189666748, + "learning_rate": 2.3029622315576595e-06, + "loss": 1.6499, + "step": 8874 + }, + { + "epoch": 0.9337190952130457, + "grad_norm": 1.6522938013076782, + "learning_rate": 2.2956966464514175e-06, + "loss": 1.7791, + "step": 8875 + }, + { + "epoch": 0.9338243029984219, + "grad_norm": 1.3017587661743164, + "learning_rate": 2.288442407424185e-06, + "loss": 1.9411, + "step": 8876 + }, + { + "epoch": 0.933929510783798, + "grad_norm": 1.478482961654663, + "learning_rate": 2.2811995153183776e-06, + "loss": 1.6778, + "step": 8877 + }, + { + "epoch": 0.9340347185691741, + "grad_norm": 2.009734630584717, + "learning_rate": 2.2739679709750885e-06, + "loss": 1.4429, + "step": 8878 + }, + { + "epoch": 0.9341399263545502, + "grad_norm": 1.2722523212432861, + "learning_rate": 2.2667477752341017e-06, + "loss": 1.4058, + "step": 8879 + }, + { + "epoch": 0.9342451341399264, + "grad_norm": 1.124986171722412, + "learning_rate": 2.2595389289338575e-06, + "loss": 1.3739, + "step": 8880 + }, + { + "epoch": 0.9343503419253024, + "grad_norm": 1.039442777633667, + "learning_rate": 2.252341432911509e-06, + "loss": 1.7472, + "step": 8881 + }, + { + "epoch": 0.9344555497106786, + "grad_norm": 2.12754487991333, + "learning_rate": 2.245155288002876e-06, + "loss": 1.2261, + "step": 8882 + }, + { + "epoch": 0.9345607574960547, + "grad_norm": 1.5434430837631226, + "learning_rate": 2.2379804950424576e-06, + "loss": 1.5896, + "step": 8883 + }, + { + "epoch": 0.9346659652814309, + "grad_norm": 1.6226022243499756, + "learning_rate": 2.2308170548634435e-06, + "loss": 1.3338, + "step": 8884 + }, + { + "epoch": 0.9347711730668069, + "grad_norm": 1.7707509994506836, + "learning_rate": 2.2236649682977117e-06, + "loss": 1.332, + "step": 8885 + }, + { + "epoch": 0.934876380852183, + "grad_norm": 1.5284020900726318, + "learning_rate": 2.2165242361757764e-06, + "loss": 1.669, + "step": 8886 + }, + { + "epoch": 0.9349815886375592, + "grad_norm": 1.250614881515503, + "learning_rate": 2.2093948593268963e-06, + "loss": 1.7689, + "step": 8887 + }, + { + "epoch": 0.9350867964229354, + "grad_norm": 1.4621397256851196, + "learning_rate": 2.202276838578976e-06, + "loss": 1.5322, + "step": 8888 + }, + { + "epoch": 0.9351920042083114, + "grad_norm": 1.8375461101531982, + "learning_rate": 2.1951701747585982e-06, + "loss": 1.6119, + "step": 8889 + }, + { + "epoch": 0.9352972119936875, + "grad_norm": 1.5318689346313477, + "learning_rate": 2.188074868691059e-06, + "loss": 1.2107, + "step": 8890 + }, + { + "epoch": 0.9354024197790637, + "grad_norm": 2.4939355850219727, + "learning_rate": 2.1809909212002767e-06, + "loss": 1.7044, + "step": 8891 + }, + { + "epoch": 0.9355076275644397, + "grad_norm": 1.93202543258667, + "learning_rate": 2.1739183331089263e-06, + "loss": 1.3337, + "step": 8892 + }, + { + "epoch": 0.9356128353498159, + "grad_norm": 1.680690884590149, + "learning_rate": 2.166857105238307e-06, + "loss": 1.9268, + "step": 8893 + }, + { + "epoch": 0.935718043135192, + "grad_norm": 1.772756576538086, + "learning_rate": 2.159807238408418e-06, + "loss": 1.7988, + "step": 8894 + }, + { + "epoch": 0.9358232509205682, + "grad_norm": 1.440395474433899, + "learning_rate": 2.1527687334379266e-06, + "loss": 1.2087, + "step": 8895 + }, + { + "epoch": 0.9359284587059442, + "grad_norm": 1.6496853828430176, + "learning_rate": 2.1457415911442013e-06, + "loss": 1.5179, + "step": 8896 + }, + { + "epoch": 0.9360336664913204, + "grad_norm": 1.3402870893478394, + "learning_rate": 2.1387258123432673e-06, + "loss": 1.7395, + "step": 8897 + }, + { + "epoch": 0.9361388742766965, + "grad_norm": 1.9559837579727173, + "learning_rate": 2.131721397849862e-06, + "loss": 1.5187, + "step": 8898 + }, + { + "epoch": 0.9362440820620725, + "grad_norm": 2.084958076477051, + "learning_rate": 2.1247283484773785e-06, + "loss": 1.8087, + "step": 8899 + }, + { + "epoch": 0.9363492898474487, + "grad_norm": 2.734379291534424, + "learning_rate": 2.1177466650379007e-06, + "loss": 1.286, + "step": 8900 + }, + { + "epoch": 0.9364544976328248, + "grad_norm": 1.1719292402267456, + "learning_rate": 2.1107763483421805e-06, + "loss": 1.8817, + "step": 8901 + }, + { + "epoch": 0.936559705418201, + "grad_norm": 1.631633996963501, + "learning_rate": 2.103817399199659e-06, + "loss": 1.5544, + "step": 8902 + }, + { + "epoch": 0.936664913203577, + "grad_norm": 2.258840799331665, + "learning_rate": 2.0968698184184565e-06, + "loss": 1.4009, + "step": 8903 + }, + { + "epoch": 0.9367701209889532, + "grad_norm": 1.6519112586975098, + "learning_rate": 2.0899336068053833e-06, + "loss": 1.5555, + "step": 8904 + }, + { + "epoch": 0.9368753287743293, + "grad_norm": 1.7025299072265625, + "learning_rate": 2.0830087651658945e-06, + "loss": 2.2402, + "step": 8905 + }, + { + "epoch": 0.9369805365597054, + "grad_norm": 1.8736423254013062, + "learning_rate": 2.076095294304181e-06, + "loss": 1.5652, + "step": 8906 + }, + { + "epoch": 0.9370857443450815, + "grad_norm": 1.4368400573730469, + "learning_rate": 2.069193195023067e-06, + "loss": 2.0581, + "step": 8907 + }, + { + "epoch": 0.9371909521304577, + "grad_norm": 1.2674177885055542, + "learning_rate": 2.0623024681240554e-06, + "loss": 1.8723, + "step": 8908 + }, + { + "epoch": 0.9372961599158338, + "grad_norm": 1.2145195007324219, + "learning_rate": 2.0554231144073623e-06, + "loss": 1.8348, + "step": 8909 + }, + { + "epoch": 0.9374013677012099, + "grad_norm": 2.1379621028900146, + "learning_rate": 2.04855513467187e-06, + "loss": 1.5549, + "step": 8910 + }, + { + "epoch": 0.937506575486586, + "grad_norm": 2.2935433387756348, + "learning_rate": 2.041698529715097e-06, + "loss": 0.869, + "step": 8911 + }, + { + "epoch": 0.9376117832719622, + "grad_norm": 2.0212948322296143, + "learning_rate": 2.034853300333328e-06, + "loss": 2.2311, + "step": 8912 + }, + { + "epoch": 0.9377169910573382, + "grad_norm": 1.7877557277679443, + "learning_rate": 2.0280194473214497e-06, + "loss": 1.214, + "step": 8913 + }, + { + "epoch": 0.9378221988427143, + "grad_norm": 1.538549780845642, + "learning_rate": 2.0211969714730496e-06, + "loss": 1.2042, + "step": 8914 + }, + { + "epoch": 0.9379274066280905, + "grad_norm": 1.2701212167739868, + "learning_rate": 2.0143858735804154e-06, + "loss": 1.6479, + "step": 8915 + }, + { + "epoch": 0.9380326144134666, + "grad_norm": 2.006523609161377, + "learning_rate": 2.007586154434493e-06, + "loss": 1.9177, + "step": 8916 + }, + { + "epoch": 0.9381378221988427, + "grad_norm": 1.5271292924880981, + "learning_rate": 2.000797814824906e-06, + "loss": 1.2413, + "step": 8917 + }, + { + "epoch": 0.9382430299842188, + "grad_norm": 1.2923787832260132, + "learning_rate": 1.9940208555399685e-06, + "loss": 1.626, + "step": 8918 + }, + { + "epoch": 0.938348237769595, + "grad_norm": 2.304880380630493, + "learning_rate": 1.987255277366662e-06, + "loss": 1.7644, + "step": 8919 + }, + { + "epoch": 0.9384534455549711, + "grad_norm": 1.4698824882507324, + "learning_rate": 1.9805010810906464e-06, + "loss": 1.9222, + "step": 8920 + }, + { + "epoch": 0.9385586533403472, + "grad_norm": 1.2957773208618164, + "learning_rate": 1.9737582674962728e-06, + "loss": 1.5484, + "step": 8921 + }, + { + "epoch": 0.9386638611257233, + "grad_norm": 1.2710483074188232, + "learning_rate": 1.967026837366559e-06, + "loss": 1.4719, + "step": 8922 + }, + { + "epoch": 0.9387690689110995, + "grad_norm": 1.4735010862350464, + "learning_rate": 1.9603067914832017e-06, + "loss": 1.6447, + "step": 8923 + }, + { + "epoch": 0.9388742766964755, + "grad_norm": 1.5090333223342896, + "learning_rate": 1.9535981306265884e-06, + "loss": 1.9183, + "step": 8924 + }, + { + "epoch": 0.9389794844818516, + "grad_norm": 1.4228880405426025, + "learning_rate": 1.9469008555757505e-06, + "loss": 1.8077, + "step": 8925 + }, + { + "epoch": 0.9390846922672278, + "grad_norm": 1.4772889614105225, + "learning_rate": 1.9402149671084446e-06, + "loss": 1.4166, + "step": 8926 + }, + { + "epoch": 0.9391899000526039, + "grad_norm": 2.280817985534668, + "learning_rate": 1.9335404660010713e-06, + "loss": 1.458, + "step": 8927 + }, + { + "epoch": 0.93929510783798, + "grad_norm": 2.147027015686035, + "learning_rate": 1.926877353028711e-06, + "loss": 0.6933, + "step": 8928 + }, + { + "epoch": 0.9394003156233561, + "grad_norm": 1.2121834754943848, + "learning_rate": 1.9202256289651446e-06, + "loss": 1.8354, + "step": 8929 + }, + { + "epoch": 0.9395055234087323, + "grad_norm": 1.4258288145065308, + "learning_rate": 1.913585294582798e-06, + "loss": 1.588, + "step": 8930 + }, + { + "epoch": 0.9396107311941083, + "grad_norm": 1.710619330406189, + "learning_rate": 1.9069563506527998e-06, + "loss": 1.7881, + "step": 8931 + }, + { + "epoch": 0.9397159389794845, + "grad_norm": 1.781956434249878, + "learning_rate": 1.9003387979449562e-06, + "loss": 1.416, + "step": 8932 + }, + { + "epoch": 0.9398211467648606, + "grad_norm": 2.1186389923095703, + "learning_rate": 1.8937326372277408e-06, + "loss": 1.2819, + "step": 8933 + }, + { + "epoch": 0.9399263545502368, + "grad_norm": 1.4919960498809814, + "learning_rate": 1.8871378692682851e-06, + "loss": 1.6042, + "step": 8934 + }, + { + "epoch": 0.9400315623356128, + "grad_norm": 1.996641755104065, + "learning_rate": 1.8805544948324317e-06, + "loss": 1.4017, + "step": 8935 + }, + { + "epoch": 0.940136770120989, + "grad_norm": 1.87164306640625, + "learning_rate": 1.87398251468468e-06, + "loss": 1.7661, + "step": 8936 + }, + { + "epoch": 0.9402419779063651, + "grad_norm": 1.9063903093338013, + "learning_rate": 1.867421929588231e-06, + "loss": 1.7494, + "step": 8937 + }, + { + "epoch": 0.9403471856917411, + "grad_norm": 3.1897265911102295, + "learning_rate": 1.8608727403049309e-06, + "loss": 1.7943, + "step": 8938 + }, + { + "epoch": 0.9404523934771173, + "grad_norm": 6.021173000335693, + "learning_rate": 1.8543349475953043e-06, + "loss": 1.4585, + "step": 8939 + }, + { + "epoch": 0.9405576012624934, + "grad_norm": 2.0118443965911865, + "learning_rate": 1.8478085522185773e-06, + "loss": 1.4911, + "step": 8940 + }, + { + "epoch": 0.9406628090478696, + "grad_norm": 1.1734709739685059, + "learning_rate": 1.8412935549326438e-06, + "loss": 1.9156, + "step": 8941 + }, + { + "epoch": 0.9407680168332456, + "grad_norm": 1.911544919013977, + "learning_rate": 1.834789956494043e-06, + "loss": 1.7905, + "step": 8942 + }, + { + "epoch": 0.9408732246186218, + "grad_norm": 1.821887731552124, + "learning_rate": 1.8282977576580484e-06, + "loss": 1.6208, + "step": 8943 + }, + { + "epoch": 0.9409784324039979, + "grad_norm": 1.2936660051345825, + "learning_rate": 1.8218169591785572e-06, + "loss": 1.9121, + "step": 8944 + }, + { + "epoch": 0.941083640189374, + "grad_norm": 1.6752891540527344, + "learning_rate": 1.8153475618081673e-06, + "loss": 1.3991, + "step": 8945 + }, + { + "epoch": 0.9411888479747501, + "grad_norm": 1.817868709564209, + "learning_rate": 1.8088895662981554e-06, + "loss": 1.518, + "step": 8946 + }, + { + "epoch": 0.9412940557601263, + "grad_norm": 1.5891584157943726, + "learning_rate": 1.802442973398455e-06, + "loss": 1.763, + "step": 8947 + }, + { + "epoch": 0.9413992635455024, + "grad_norm": 1.1031978130340576, + "learning_rate": 1.7960077838577117e-06, + "loss": 1.5801, + "step": 8948 + }, + { + "epoch": 0.9415044713308784, + "grad_norm": 1.214778184890747, + "learning_rate": 1.7895839984231943e-06, + "loss": 1.9143, + "step": 8949 + }, + { + "epoch": 0.9416096791162546, + "grad_norm": 1.8901784420013428, + "learning_rate": 1.7831716178408841e-06, + "loss": 1.397, + "step": 8950 + }, + { + "epoch": 0.9417148869016307, + "grad_norm": 1.900726079940796, + "learning_rate": 1.776770642855441e-06, + "loss": 1.8984, + "step": 8951 + }, + { + "epoch": 0.9418200946870069, + "grad_norm": 1.5284584760665894, + "learning_rate": 1.7703810742101813e-06, + "loss": 1.8202, + "step": 8952 + }, + { + "epoch": 0.9419253024723829, + "grad_norm": 1.6591954231262207, + "learning_rate": 1.7640029126471003e-06, + "loss": 1.2281, + "step": 8953 + }, + { + "epoch": 0.9420305102577591, + "grad_norm": 1.8267433643341064, + "learning_rate": 1.7576361589068834e-06, + "loss": 1.3939, + "step": 8954 + }, + { + "epoch": 0.9421357180431352, + "grad_norm": 1.8574069738388062, + "learning_rate": 1.7512808137288727e-06, + "loss": 1.6839, + "step": 8955 + }, + { + "epoch": 0.9422409258285113, + "grad_norm": 2.1914479732513428, + "learning_rate": 1.7449368778511e-06, + "loss": 1.6874, + "step": 8956 + }, + { + "epoch": 0.9423461336138874, + "grad_norm": 1.5957982540130615, + "learning_rate": 1.7386043520102534e-06, + "loss": 1.8577, + "step": 8957 + }, + { + "epoch": 0.9424513413992636, + "grad_norm": 1.8510699272155762, + "learning_rate": 1.732283236941723e-06, + "loss": 1.8177, + "step": 8958 + }, + { + "epoch": 0.9425565491846397, + "grad_norm": 1.7489219903945923, + "learning_rate": 1.7259735333795545e-06, + "loss": 1.9759, + "step": 8959 + }, + { + "epoch": 0.9426617569700158, + "grad_norm": 1.5425418615341187, + "learning_rate": 1.7196752420564622e-06, + "loss": 1.5018, + "step": 8960 + }, + { + "epoch": 0.9427669647553919, + "grad_norm": 1.9875203371047974, + "learning_rate": 1.7133883637038494e-06, + "loss": 1.3086, + "step": 8961 + }, + { + "epoch": 0.942872172540768, + "grad_norm": 1.7433385848999023, + "learning_rate": 1.7071128990518103e-06, + "loss": 1.4756, + "step": 8962 + }, + { + "epoch": 0.9429773803261441, + "grad_norm": 1.6027960777282715, + "learning_rate": 1.700848848829073e-06, + "loss": 2.1611, + "step": 8963 + }, + { + "epoch": 0.9430825881115202, + "grad_norm": 1.8459538221359253, + "learning_rate": 1.6945962137630668e-06, + "loss": 1.7465, + "step": 8964 + }, + { + "epoch": 0.9431877958968964, + "grad_norm": 1.179093599319458, + "learning_rate": 1.6883549945798883e-06, + "loss": 1.8516, + "step": 8965 + }, + { + "epoch": 0.9432930036822725, + "grad_norm": 1.7608375549316406, + "learning_rate": 1.6821251920043246e-06, + "loss": 1.5069, + "step": 8966 + }, + { + "epoch": 0.9433982114676486, + "grad_norm": 1.7923959493637085, + "learning_rate": 1.675906806759786e-06, + "loss": 1.3353, + "step": 8967 + }, + { + "epoch": 0.9435034192530247, + "grad_norm": 1.7323315143585205, + "learning_rate": 1.6696998395684394e-06, + "loss": 1.7977, + "step": 8968 + }, + { + "epoch": 0.9436086270384009, + "grad_norm": 1.0940277576446533, + "learning_rate": 1.6635042911510413e-06, + "loss": 1.8435, + "step": 8969 + }, + { + "epoch": 0.9437138348237769, + "grad_norm": 1.5782438516616821, + "learning_rate": 1.6573201622270828e-06, + "loss": 1.428, + "step": 8970 + }, + { + "epoch": 0.9438190426091531, + "grad_norm": 1.616591453552246, + "learning_rate": 1.6511474535146899e-06, + "loss": 1.3093, + "step": 8971 + }, + { + "epoch": 0.9439242503945292, + "grad_norm": 1.872626781463623, + "learning_rate": 1.6449861657306998e-06, + "loss": 1.1612, + "step": 8972 + }, + { + "epoch": 0.9440294581799054, + "grad_norm": 2.045409679412842, + "learning_rate": 1.6388362995905848e-06, + "loss": 1.7662, + "step": 8973 + }, + { + "epoch": 0.9441346659652814, + "grad_norm": 1.8982219696044922, + "learning_rate": 1.6326978558085182e-06, + "loss": 1.5276, + "step": 8974 + }, + { + "epoch": 0.9442398737506575, + "grad_norm": 2.2144486904144287, + "learning_rate": 1.6265708350973296e-06, + "loss": 1.3016, + "step": 8975 + }, + { + "epoch": 0.9443450815360337, + "grad_norm": 1.7731724977493286, + "learning_rate": 1.6204552381685278e-06, + "loss": 1.884, + "step": 8976 + }, + { + "epoch": 0.9444502893214097, + "grad_norm": 1.628603219985962, + "learning_rate": 1.6143510657323112e-06, + "loss": 1.5011, + "step": 8977 + }, + { + "epoch": 0.9445554971067859, + "grad_norm": 1.250891089439392, + "learning_rate": 1.6082583184975352e-06, + "loss": 1.5904, + "step": 8978 + }, + { + "epoch": 0.944660704892162, + "grad_norm": 1.6449693441390991, + "learning_rate": 1.6021769971717116e-06, + "loss": 1.9314, + "step": 8979 + }, + { + "epoch": 0.9447659126775382, + "grad_norm": 1.9107930660247803, + "learning_rate": 1.5961071024610752e-06, + "loss": 1.5378, + "step": 8980 + }, + { + "epoch": 0.9448711204629142, + "grad_norm": 1.551930546760559, + "learning_rate": 1.5900486350704625e-06, + "loss": 1.4893, + "step": 8981 + }, + { + "epoch": 0.9449763282482904, + "grad_norm": 1.4413615465164185, + "learning_rate": 1.584001595703466e-06, + "loss": 1.8078, + "step": 8982 + }, + { + "epoch": 0.9450815360336665, + "grad_norm": 1.8441150188446045, + "learning_rate": 1.5779659850622797e-06, + "loss": 1.6129, + "step": 8983 + }, + { + "epoch": 0.9451867438190427, + "grad_norm": 1.5965230464935303, + "learning_rate": 1.5719418038477985e-06, + "loss": 2.0439, + "step": 8984 + }, + { + "epoch": 0.9452919516044187, + "grad_norm": 2.328061819076538, + "learning_rate": 1.5659290527596071e-06, + "loss": 1.4441, + "step": 8985 + }, + { + "epoch": 0.9453971593897948, + "grad_norm": 2.004664182662964, + "learning_rate": 1.559927732495936e-06, + "loss": 1.7041, + "step": 8986 + }, + { + "epoch": 0.945502367175171, + "grad_norm": 1.5392452478408813, + "learning_rate": 1.5539378437536944e-06, + "loss": 1.8988, + "step": 8987 + }, + { + "epoch": 0.945607574960547, + "grad_norm": 1.6413873434066772, + "learning_rate": 1.5479593872284926e-06, + "loss": 1.6044, + "step": 8988 + }, + { + "epoch": 0.9457127827459232, + "grad_norm": 2.1332480907440186, + "learning_rate": 1.5419923636145639e-06, + "loss": 1.8659, + "step": 8989 + }, + { + "epoch": 0.9458179905312993, + "grad_norm": 1.1487113237380981, + "learning_rate": 1.536036773604843e-06, + "loss": 1.6138, + "step": 8990 + }, + { + "epoch": 0.9459231983166755, + "grad_norm": 1.1979764699935913, + "learning_rate": 1.5300926178909435e-06, + "loss": 1.684, + "step": 8991 + }, + { + "epoch": 0.9460284061020515, + "grad_norm": 1.9243789911270142, + "learning_rate": 1.5241598971631354e-06, + "loss": 1.5863, + "step": 8992 + }, + { + "epoch": 0.9461336138874277, + "grad_norm": 1.8303254842758179, + "learning_rate": 1.5182386121103676e-06, + "loss": 1.7, + "step": 8993 + }, + { + "epoch": 0.9462388216728038, + "grad_norm": 1.7681361436843872, + "learning_rate": 1.5123287634202454e-06, + "loss": 1.521, + "step": 8994 + }, + { + "epoch": 0.9463440294581799, + "grad_norm": 1.3203057050704956, + "learning_rate": 1.5064303517790757e-06, + "loss": 1.8113, + "step": 8995 + }, + { + "epoch": 0.946449237243556, + "grad_norm": 1.5649878978729248, + "learning_rate": 1.5005433778718213e-06, + "loss": 1.4567, + "step": 8996 + }, + { + "epoch": 0.9465544450289322, + "grad_norm": 1.9078657627105713, + "learning_rate": 1.4946678423821248e-06, + "loss": 1.6237, + "step": 8997 + }, + { + "epoch": 0.9466596528143083, + "grad_norm": 1.2230523824691772, + "learning_rate": 1.4888037459922622e-06, + "loss": 1.6624, + "step": 8998 + }, + { + "epoch": 0.9467648605996843, + "grad_norm": 2.302837371826172, + "learning_rate": 1.4829510893832332e-06, + "loss": 1.7364, + "step": 8999 + }, + { + "epoch": 0.9468700683850605, + "grad_norm": 1.3794721364974976, + "learning_rate": 1.4771098732346943e-06, + "loss": 1.9772, + "step": 9000 + }, + { + "epoch": 0.9469752761704366, + "grad_norm": 1.796860694885254, + "learning_rate": 1.4712800982249474e-06, + "loss": 1.4391, + "step": 9001 + }, + { + "epoch": 0.9470804839558127, + "grad_norm": 1.5692015886306763, + "learning_rate": 1.4654617650310176e-06, + "loss": 1.5924, + "step": 9002 + }, + { + "epoch": 0.9471856917411888, + "grad_norm": 1.5295088291168213, + "learning_rate": 1.4596548743285198e-06, + "loss": 1.4613, + "step": 9003 + }, + { + "epoch": 0.947290899526565, + "grad_norm": 2.021333932876587, + "learning_rate": 1.453859426791826e-06, + "loss": 1.9719, + "step": 9004 + }, + { + "epoch": 0.9473961073119411, + "grad_norm": 1.1852548122406006, + "learning_rate": 1.448075423093942e-06, + "loss": 1.3462, + "step": 9005 + }, + { + "epoch": 0.9475013150973172, + "grad_norm": 1.628966212272644, + "learning_rate": 1.4423028639065195e-06, + "loss": 2.1885, + "step": 9006 + }, + { + "epoch": 0.9476065228826933, + "grad_norm": 1.4825149774551392, + "learning_rate": 1.4365417498999334e-06, + "loss": 1.4696, + "step": 9007 + }, + { + "epoch": 0.9477117306680695, + "grad_norm": 1.7557880878448486, + "learning_rate": 1.430792081743182e-06, + "loss": 1.3468, + "step": 9008 + }, + { + "epoch": 0.9478169384534455, + "grad_norm": 1.8458571434020996, + "learning_rate": 1.4250538601039642e-06, + "loss": 1.804, + "step": 9009 + }, + { + "epoch": 0.9479221462388217, + "grad_norm": 1.6594163179397583, + "learning_rate": 1.4193270856486585e-06, + "loss": 1.3729, + "step": 9010 + }, + { + "epoch": 0.9480273540241978, + "grad_norm": 1.11782968044281, + "learning_rate": 1.413611759042266e-06, + "loss": 1.8055, + "step": 9011 + }, + { + "epoch": 0.948132561809574, + "grad_norm": 1.8020600080490112, + "learning_rate": 1.4079078809485002e-06, + "loss": 1.6278, + "step": 9012 + }, + { + "epoch": 0.94823776959495, + "grad_norm": 1.9331527948379517, + "learning_rate": 1.4022154520297425e-06, + "loss": 1.819, + "step": 9013 + }, + { + "epoch": 0.9483429773803261, + "grad_norm": 1.2520698308944702, + "learning_rate": 1.396534472947031e-06, + "loss": 1.757, + "step": 9014 + }, + { + "epoch": 0.9484481851657023, + "grad_norm": 1.4876186847686768, + "learning_rate": 1.3908649443600707e-06, + "loss": 1.5025, + "step": 9015 + }, + { + "epoch": 0.9485533929510784, + "grad_norm": 1.5193793773651123, + "learning_rate": 1.3852068669272688e-06, + "loss": 1.8764, + "step": 9016 + }, + { + "epoch": 0.9486586007364545, + "grad_norm": 1.3971233367919922, + "learning_rate": 1.3795602413056442e-06, + "loss": 1.3161, + "step": 9017 + }, + { + "epoch": 0.9487638085218306, + "grad_norm": 1.4068975448608398, + "learning_rate": 1.3739250681509497e-06, + "loss": 1.2116, + "step": 9018 + }, + { + "epoch": 0.9488690163072068, + "grad_norm": 1.3958295583724976, + "learning_rate": 1.3683013481175621e-06, + "loss": 1.4589, + "step": 9019 + }, + { + "epoch": 0.9489742240925828, + "grad_norm": 1.2469528913497925, + "learning_rate": 1.3626890818585591e-06, + "loss": 1.808, + "step": 9020 + }, + { + "epoch": 0.949079431877959, + "grad_norm": 1.113645076751709, + "learning_rate": 1.3570882700256637e-06, + "loss": 1.4229, + "step": 9021 + }, + { + "epoch": 0.9491846396633351, + "grad_norm": 1.3229107856750488, + "learning_rate": 1.351498913269289e-06, + "loss": 1.8561, + "step": 9022 + }, + { + "epoch": 0.9492898474487113, + "grad_norm": 1.7333238124847412, + "learning_rate": 1.3459210122384934e-06, + "loss": 1.3316, + "step": 9023 + }, + { + "epoch": 0.9493950552340873, + "grad_norm": 1.8732205629348755, + "learning_rate": 1.340354567581037e-06, + "loss": 1.7108, + "step": 9024 + }, + { + "epoch": 0.9495002630194634, + "grad_norm": 1.6077425479888916, + "learning_rate": 1.3347995799433355e-06, + "loss": 1.6701, + "step": 9025 + }, + { + "epoch": 0.9496054708048396, + "grad_norm": 2.317784070968628, + "learning_rate": 1.3292560499704398e-06, + "loss": 1.5069, + "step": 9026 + }, + { + "epoch": 0.9497106785902156, + "grad_norm": 2.149060010910034, + "learning_rate": 1.3237239783061462e-06, + "loss": 0.9591, + "step": 9027 + }, + { + "epoch": 0.9498158863755918, + "grad_norm": 1.2673544883728027, + "learning_rate": 1.3182033655928404e-06, + "loss": 1.7899, + "step": 9028 + }, + { + "epoch": 0.9499210941609679, + "grad_norm": 1.8225202560424805, + "learning_rate": 1.3126942124716213e-06, + "loss": 1.5151, + "step": 9029 + }, + { + "epoch": 0.9500263019463441, + "grad_norm": 1.9195647239685059, + "learning_rate": 1.3071965195822656e-06, + "loss": 1.5733, + "step": 9030 + }, + { + "epoch": 0.9501315097317201, + "grad_norm": 2.377620220184326, + "learning_rate": 1.301710287563196e-06, + "loss": 1.1683, + "step": 9031 + }, + { + "epoch": 0.9502367175170963, + "grad_norm": 1.6789700984954834, + "learning_rate": 1.2962355170514917e-06, + "loss": 1.5605, + "step": 9032 + }, + { + "epoch": 0.9503419253024724, + "grad_norm": 1.4375206232070923, + "learning_rate": 1.2907722086829332e-06, + "loss": 1.3032, + "step": 9033 + }, + { + "epoch": 0.9504471330878485, + "grad_norm": 1.3653327226638794, + "learning_rate": 1.2853203630919686e-06, + "loss": 1.4516, + "step": 9034 + }, + { + "epoch": 0.9505523408732246, + "grad_norm": 1.7745347023010254, + "learning_rate": 1.279879980911669e-06, + "loss": 1.5384, + "step": 9035 + }, + { + "epoch": 0.9506575486586007, + "grad_norm": 1.7795803546905518, + "learning_rate": 1.2744510627738516e-06, + "loss": 1.8052, + "step": 9036 + }, + { + "epoch": 0.9507627564439769, + "grad_norm": 1.5467740297317505, + "learning_rate": 1.2690336093089228e-06, + "loss": 1.7491, + "step": 9037 + }, + { + "epoch": 0.9508679642293529, + "grad_norm": 1.2983478307724, + "learning_rate": 1.263627621146013e-06, + "loss": 1.6505, + "step": 9038 + }, + { + "epoch": 0.9509731720147291, + "grad_norm": 2.1444427967071533, + "learning_rate": 1.2582330989128977e-06, + "loss": 1.3857, + "step": 9039 + }, + { + "epoch": 0.9510783798001052, + "grad_norm": 1.9414992332458496, + "learning_rate": 1.252850043236009e-06, + "loss": 1.2304, + "step": 9040 + }, + { + "epoch": 0.9511835875854813, + "grad_norm": 1.6080853939056396, + "learning_rate": 1.2474784547404916e-06, + "loss": 1.8033, + "step": 9041 + }, + { + "epoch": 0.9512887953708574, + "grad_norm": 1.4745537042617798, + "learning_rate": 1.2421183340501242e-06, + "loss": 2.1338, + "step": 9042 + }, + { + "epoch": 0.9513940031562336, + "grad_norm": 1.2472604513168335, + "learning_rate": 1.2367696817873419e-06, + "loss": 2.3056, + "step": 9043 + }, + { + "epoch": 0.9514992109416097, + "grad_norm": 1.7281334400177002, + "learning_rate": 1.2314324985732818e-06, + "loss": 1.5415, + "step": 9044 + }, + { + "epoch": 0.9516044187269858, + "grad_norm": 1.563148856163025, + "learning_rate": 1.2261067850277252e-06, + "loss": 1.6814, + "step": 9045 + }, + { + "epoch": 0.9517096265123619, + "grad_norm": 1.6207623481750488, + "learning_rate": 1.2207925417691334e-06, + "loss": 1.566, + "step": 9046 + }, + { + "epoch": 0.951814834297738, + "grad_norm": 1.622622013092041, + "learning_rate": 1.215489769414635e-06, + "loss": 1.4526, + "step": 9047 + }, + { + "epoch": 0.9519200420831142, + "grad_norm": 1.5105351209640503, + "learning_rate": 1.210198468580015e-06, + "loss": 1.8531, + "step": 9048 + }, + { + "epoch": 0.9520252498684902, + "grad_norm": 1.3942266702651978, + "learning_rate": 1.2049186398797374e-06, + "loss": 1.8842, + "step": 9049 + }, + { + "epoch": 0.9521304576538664, + "grad_norm": 1.670791745185852, + "learning_rate": 1.1996502839269453e-06, + "loss": 1.9948, + "step": 9050 + }, + { + "epoch": 0.9522356654392425, + "grad_norm": 1.3902188539505005, + "learning_rate": 1.1943934013334047e-06, + "loss": 1.091, + "step": 9051 + }, + { + "epoch": 0.9523408732246186, + "grad_norm": 1.8161249160766602, + "learning_rate": 1.189147992709616e-06, + "loss": 2.1526, + "step": 9052 + }, + { + "epoch": 0.9524460810099947, + "grad_norm": 1.8096421957015991, + "learning_rate": 1.1839140586646923e-06, + "loss": 0.9295, + "step": 9053 + }, + { + "epoch": 0.9525512887953709, + "grad_norm": 1.7646183967590332, + "learning_rate": 1.178691599806425e-06, + "loss": 1.7518, + "step": 9054 + }, + { + "epoch": 0.952656496580747, + "grad_norm": 1.164070963859558, + "learning_rate": 1.173480616741296e-06, + "loss": 1.5824, + "step": 9055 + }, + { + "epoch": 0.9527617043661231, + "grad_norm": 1.876624584197998, + "learning_rate": 1.168281110074443e-06, + "loss": 1.4504, + "step": 9056 + }, + { + "epoch": 0.9528669121514992, + "grad_norm": 1.5865232944488525, + "learning_rate": 1.1630930804096495e-06, + "loss": 1.5255, + "step": 9057 + }, + { + "epoch": 0.9529721199368754, + "grad_norm": 1.653921127319336, + "learning_rate": 1.1579165283494009e-06, + "loss": 1.5343, + "step": 9058 + }, + { + "epoch": 0.9530773277222514, + "grad_norm": 3.066737651824951, + "learning_rate": 1.152751454494827e-06, + "loss": 1.3524, + "step": 9059 + }, + { + "epoch": 0.9531825355076275, + "grad_norm": 1.9732846021652222, + "learning_rate": 1.1475978594457149e-06, + "loss": 1.6233, + "step": 9060 + }, + { + "epoch": 0.9532877432930037, + "grad_norm": 1.4492088556289673, + "learning_rate": 1.1424557438005634e-06, + "loss": 1.6444, + "step": 9061 + }, + { + "epoch": 0.9533929510783798, + "grad_norm": 1.6626065969467163, + "learning_rate": 1.1373251081565062e-06, + "loss": 1.6292, + "step": 9062 + }, + { + "epoch": 0.9534981588637559, + "grad_norm": 1.2074167728424072, + "learning_rate": 1.132205953109311e-06, + "loss": 1.9072, + "step": 9063 + }, + { + "epoch": 0.953603366649132, + "grad_norm": 1.8209060430526733, + "learning_rate": 1.127098279253491e-06, + "loss": 1.4265, + "step": 9064 + }, + { + "epoch": 0.9537085744345082, + "grad_norm": 1.6529898643493652, + "learning_rate": 1.1220020871821723e-06, + "loss": 1.3912, + "step": 9065 + }, + { + "epoch": 0.9538137822198842, + "grad_norm": 1.3081694841384888, + "learning_rate": 1.1169173774871478e-06, + "loss": 1.8764, + "step": 9066 + }, + { + "epoch": 0.9539189900052604, + "grad_norm": 1.5082989931106567, + "learning_rate": 1.1118441507589006e-06, + "loss": 1.6502, + "step": 9067 + }, + { + "epoch": 0.9540241977906365, + "grad_norm": 1.7318092584609985, + "learning_rate": 1.1067824075865485e-06, + "loss": 1.3874, + "step": 9068 + }, + { + "epoch": 0.9541294055760127, + "grad_norm": 1.9931410551071167, + "learning_rate": 1.1017321485579102e-06, + "loss": 2.344, + "step": 9069 + }, + { + "epoch": 0.9542346133613887, + "grad_norm": 1.397017240524292, + "learning_rate": 1.096693374259461e-06, + "loss": 1.9131, + "step": 9070 + }, + { + "epoch": 0.9543398211467649, + "grad_norm": 0.9994755983352661, + "learning_rate": 1.0916660852763216e-06, + "loss": 1.4483, + "step": 9071 + }, + { + "epoch": 0.954445028932141, + "grad_norm": 1.567875623703003, + "learning_rate": 1.0866502821923031e-06, + "loss": 1.4536, + "step": 9072 + }, + { + "epoch": 0.954550236717517, + "grad_norm": 1.6587287187576294, + "learning_rate": 1.0816459655898726e-06, + "loss": 1.8396, + "step": 9073 + }, + { + "epoch": 0.9546554445028932, + "grad_norm": 1.5894458293914795, + "learning_rate": 1.0766531360501654e-06, + "loss": 1.3923, + "step": 9074 + }, + { + "epoch": 0.9547606522882693, + "grad_norm": 1.1351405382156372, + "learning_rate": 1.0716717941529841e-06, + "loss": 1.543, + "step": 9075 + }, + { + "epoch": 0.9548658600736455, + "grad_norm": 1.4005601406097412, + "learning_rate": 1.0667019404767996e-06, + "loss": 1.7229, + "step": 9076 + }, + { + "epoch": 0.9549710678590215, + "grad_norm": 2.1269547939300537, + "learning_rate": 1.0617435755987281e-06, + "loss": 1.5129, + "step": 9077 + }, + { + "epoch": 0.9550762756443977, + "grad_norm": 1.9350544214248657, + "learning_rate": 1.0567967000945866e-06, + "loss": 1.4607, + "step": 9078 + }, + { + "epoch": 0.9551814834297738, + "grad_norm": 1.4004015922546387, + "learning_rate": 1.0518613145388378e-06, + "loss": 1.907, + "step": 9079 + }, + { + "epoch": 0.95528669121515, + "grad_norm": 1.2579517364501953, + "learning_rate": 1.04693741950459e-06, + "loss": 1.4664, + "step": 9080 + }, + { + "epoch": 0.955391899000526, + "grad_norm": 1.7487457990646362, + "learning_rate": 1.0420250155636745e-06, + "loss": 1.6682, + "step": 9081 + }, + { + "epoch": 0.9554971067859022, + "grad_norm": 2.7791035175323486, + "learning_rate": 1.0371241032865242e-06, + "loss": 1.2912, + "step": 9082 + }, + { + "epoch": 0.9556023145712783, + "grad_norm": 1.7350082397460938, + "learning_rate": 1.0322346832422613e-06, + "loss": 1.7033, + "step": 9083 + }, + { + "epoch": 0.9557075223566543, + "grad_norm": 2.800816535949707, + "learning_rate": 1.0273567559987097e-06, + "loss": 1.7198, + "step": 9084 + }, + { + "epoch": 0.9558127301420305, + "grad_norm": 1.4789823293685913, + "learning_rate": 1.0224903221222938e-06, + "loss": 1.9354, + "step": 9085 + }, + { + "epoch": 0.9559179379274066, + "grad_norm": 2.029716968536377, + "learning_rate": 1.0176353821781616e-06, + "loss": 1.2883, + "step": 9086 + }, + { + "epoch": 0.9560231457127828, + "grad_norm": 1.2375006675720215, + "learning_rate": 1.0127919367301064e-06, + "loss": 1.7844, + "step": 9087 + }, + { + "epoch": 0.9561283534981588, + "grad_norm": 1.4403471946716309, + "learning_rate": 1.0079599863405454e-06, + "loss": 1.4304, + "step": 9088 + }, + { + "epoch": 0.956233561283535, + "grad_norm": 1.9305444955825806, + "learning_rate": 1.0031395315706183e-06, + "loss": 1.041, + "step": 9089 + }, + { + "epoch": 0.9563387690689111, + "grad_norm": 1.9020962715148926, + "learning_rate": 9.983305729801107e-07, + "loss": 1.609, + "step": 9090 + }, + { + "epoch": 0.9564439768542872, + "grad_norm": 2.1255598068237305, + "learning_rate": 9.93533111127476e-07, + "loss": 1.4914, + "step": 9091 + }, + { + "epoch": 0.9565491846396633, + "grad_norm": 2.1392078399658203, + "learning_rate": 9.88747146569813e-07, + "loss": 1.6591, + "step": 9092 + }, + { + "epoch": 0.9566543924250395, + "grad_norm": 1.4976539611816406, + "learning_rate": 9.8397267986291e-07, + "loss": 1.5315, + "step": 9093 + }, + { + "epoch": 0.9567596002104156, + "grad_norm": 1.4068844318389893, + "learning_rate": 9.79209711561202e-07, + "loss": 1.2471, + "step": 9094 + }, + { + "epoch": 0.9568648079957917, + "grad_norm": 1.8085061311721802, + "learning_rate": 9.744582422178127e-07, + "loss": 1.3131, + "step": 9095 + }, + { + "epoch": 0.9569700157811678, + "grad_norm": 1.4883860349655151, + "learning_rate": 9.6971827238449e-07, + "loss": 1.8711, + "step": 9096 + }, + { + "epoch": 0.957075223566544, + "grad_norm": 1.6276978254318237, + "learning_rate": 9.649898026117043e-07, + "loss": 1.5425, + "step": 9097 + }, + { + "epoch": 0.95718043135192, + "grad_norm": 1.1478794813156128, + "learning_rate": 9.602728334485278e-07, + "loss": 1.746, + "step": 9098 + }, + { + "epoch": 0.9572856391372961, + "grad_norm": 1.3659400939941406, + "learning_rate": 9.555673654427332e-07, + "loss": 1.3815, + "step": 9099 + }, + { + "epoch": 0.9573908469226723, + "grad_norm": 1.6303960084915161, + "learning_rate": 9.508733991407615e-07, + "loss": 1.6709, + "step": 9100 + }, + { + "epoch": 0.9574960547080484, + "grad_norm": 1.6804249286651611, + "learning_rate": 9.461909350876985e-07, + "loss": 1.5344, + "step": 9101 + }, + { + "epoch": 0.9576012624934245, + "grad_norm": 1.8790122270584106, + "learning_rate": 9.415199738272984e-07, + "loss": 1.4744, + "step": 9102 + }, + { + "epoch": 0.9577064702788006, + "grad_norm": 1.4778565168380737, + "learning_rate": 9.36860515902005e-07, + "loss": 1.7999, + "step": 9103 + }, + { + "epoch": 0.9578116780641768, + "grad_norm": 1.2392534017562866, + "learning_rate": 9.322125618528854e-07, + "loss": 1.7232, + "step": 9104 + }, + { + "epoch": 0.9579168858495528, + "grad_norm": 1.5124746561050415, + "learning_rate": 9.275761122196968e-07, + "loss": 2.0347, + "step": 9105 + }, + { + "epoch": 0.958022093634929, + "grad_norm": 1.9507633447647095, + "learning_rate": 9.229511675408642e-07, + "loss": 1.6567, + "step": 9106 + }, + { + "epoch": 0.9581273014203051, + "grad_norm": 1.7960712909698486, + "learning_rate": 9.183377283534578e-07, + "loss": 1.7705, + "step": 9107 + }, + { + "epoch": 0.9582325092056813, + "grad_norm": 2.1413114070892334, + "learning_rate": 9.137357951932157e-07, + "loss": 1.7895, + "step": 9108 + }, + { + "epoch": 0.9583377169910573, + "grad_norm": 1.709755301475525, + "learning_rate": 9.091453685945661e-07, + "loss": 1.7887, + "step": 9109 + }, + { + "epoch": 0.9584429247764334, + "grad_norm": 1.2714898586273193, + "learning_rate": 9.045664490905492e-07, + "loss": 1.8393, + "step": 9110 + }, + { + "epoch": 0.9585481325618096, + "grad_norm": 1.3688026666641235, + "learning_rate": 8.999990372129286e-07, + "loss": 1.6004, + "step": 9111 + }, + { + "epoch": 0.9586533403471857, + "grad_norm": 1.3194514513015747, + "learning_rate": 8.95443133492091e-07, + "loss": 1.1129, + "step": 9112 + }, + { + "epoch": 0.9587585481325618, + "grad_norm": 1.4183218479156494, + "learning_rate": 8.90898738457091e-07, + "loss": 1.8257, + "step": 9113 + }, + { + "epoch": 0.9588637559179379, + "grad_norm": 1.4666954278945923, + "learning_rate": 8.863658526356622e-07, + "loss": 1.6854, + "step": 9114 + }, + { + "epoch": 0.9589689637033141, + "grad_norm": 1.620832085609436, + "learning_rate": 8.818444765541944e-07, + "loss": 1.9613, + "step": 9115 + }, + { + "epoch": 0.9590741714886901, + "grad_norm": 1.858279824256897, + "learning_rate": 8.773346107377456e-07, + "loss": 1.7767, + "step": 9116 + }, + { + "epoch": 0.9591793792740663, + "grad_norm": 1.720293402671814, + "learning_rate": 8.72836255710019e-07, + "loss": 1.8888, + "step": 9117 + }, + { + "epoch": 0.9592845870594424, + "grad_norm": 1.7517911195755005, + "learning_rate": 8.683494119934076e-07, + "loss": 1.449, + "step": 9118 + }, + { + "epoch": 0.9593897948448186, + "grad_norm": 0.9157357215881348, + "learning_rate": 8.638740801089396e-07, + "loss": 1.5477, + "step": 9119 + }, + { + "epoch": 0.9594950026301946, + "grad_norm": 1.0953036546707153, + "learning_rate": 8.59410260576321e-07, + "loss": 1.6595, + "step": 9120 + }, + { + "epoch": 0.9596002104155708, + "grad_norm": 1.7858799695968628, + "learning_rate": 8.549579539139374e-07, + "loss": 1.6835, + "step": 9121 + }, + { + "epoch": 0.9597054182009469, + "grad_norm": 1.9369888305664062, + "learning_rate": 8.505171606388085e-07, + "loss": 2.1841, + "step": 9122 + }, + { + "epoch": 0.9598106259863229, + "grad_norm": 1.8354744911193848, + "learning_rate": 8.460878812666217e-07, + "loss": 1.823, + "step": 9123 + }, + { + "epoch": 0.9599158337716991, + "grad_norm": 1.8804636001586914, + "learning_rate": 8.416701163117546e-07, + "loss": 2.0077, + "step": 9124 + }, + { + "epoch": 0.9600210415570752, + "grad_norm": 1.981581449508667, + "learning_rate": 8.372638662872079e-07, + "loss": 1.5237, + "step": 9125 + }, + { + "epoch": 0.9601262493424514, + "grad_norm": 1.4924275875091553, + "learning_rate": 8.328691317046722e-07, + "loss": 1.5502, + "step": 9126 + }, + { + "epoch": 0.9602314571278274, + "grad_norm": 1.8991860151290894, + "learning_rate": 8.28485913074506e-07, + "loss": 1.5965, + "step": 9127 + }, + { + "epoch": 0.9603366649132036, + "grad_norm": 0.9699298739433289, + "learning_rate": 8.24114210905691e-07, + "loss": 2.2511, + "step": 9128 + }, + { + "epoch": 0.9604418726985797, + "grad_norm": 1.3279988765716553, + "learning_rate": 8.197540257059321e-07, + "loss": 1.6017, + "step": 9129 + }, + { + "epoch": 0.9605470804839558, + "grad_norm": 1.4322397708892822, + "learning_rate": 8.154053579815358e-07, + "loss": 1.5948, + "step": 9130 + }, + { + "epoch": 0.9606522882693319, + "grad_norm": 1.4159713983535767, + "learning_rate": 8.110682082375087e-07, + "loss": 1.4411, + "step": 9131 + }, + { + "epoch": 0.9607574960547081, + "grad_norm": 1.7567311525344849, + "learning_rate": 8.067425769775039e-07, + "loss": 1.3948, + "step": 9132 + }, + { + "epoch": 0.9608627038400842, + "grad_norm": 2.1508755683898926, + "learning_rate": 8.024284647038527e-07, + "loss": 1.8868, + "step": 9133 + }, + { + "epoch": 0.9609679116254602, + "grad_norm": 2.318276882171631, + "learning_rate": 7.981258719175322e-07, + "loss": 1.4337, + "step": 9134 + }, + { + "epoch": 0.9610731194108364, + "grad_norm": 1.1780331134796143, + "learning_rate": 7.938347991181982e-07, + "loss": 1.6706, + "step": 9135 + }, + { + "epoch": 0.9611783271962125, + "grad_norm": 2.116865634918213, + "learning_rate": 7.895552468041412e-07, + "loss": 1.8336, + "step": 9136 + }, + { + "epoch": 0.9612835349815886, + "grad_norm": 1.3692445755004883, + "learning_rate": 7.852872154723412e-07, + "loss": 1.9935, + "step": 9137 + }, + { + "epoch": 0.9613887427669647, + "grad_norm": 1.171465277671814, + "learning_rate": 7.810307056184352e-07, + "loss": 1.6881, + "step": 9138 + }, + { + "epoch": 0.9614939505523409, + "grad_norm": 1.1161651611328125, + "learning_rate": 7.767857177367055e-07, + "loss": 1.6728, + "step": 9139 + }, + { + "epoch": 0.961599158337717, + "grad_norm": 1.869456171989441, + "learning_rate": 7.725522523201245e-07, + "loss": 1.1257, + "step": 9140 + }, + { + "epoch": 0.9617043661230931, + "grad_norm": 1.2645540237426758, + "learning_rate": 7.683303098602989e-07, + "loss": 1.4703, + "step": 9141 + }, + { + "epoch": 0.9618095739084692, + "grad_norm": 1.4299430847167969, + "learning_rate": 7.641198908475144e-07, + "loss": 1.6185, + "step": 9142 + }, + { + "epoch": 0.9619147816938454, + "grad_norm": 2.149198532104492, + "learning_rate": 7.599209957707021e-07, + "loss": 1.4881, + "step": 9143 + }, + { + "epoch": 0.9620199894792215, + "grad_norm": 1.9210573434829712, + "learning_rate": 7.55733625117483e-07, + "loss": 2.0232, + "step": 9144 + }, + { + "epoch": 0.9621251972645976, + "grad_norm": 2.284515380859375, + "learning_rate": 7.515577793741124e-07, + "loss": 1.4036, + "step": 9145 + }, + { + "epoch": 0.9622304050499737, + "grad_norm": 1.6469871997833252, + "learning_rate": 7.473934590255249e-07, + "loss": 1.4179, + "step": 9146 + }, + { + "epoch": 0.9623356128353499, + "grad_norm": 1.3524980545043945, + "learning_rate": 7.432406645552891e-07, + "loss": 1.4155, + "step": 9147 + }, + { + "epoch": 0.9624408206207259, + "grad_norm": 1.9270532131195068, + "learning_rate": 7.39099396445686e-07, + "loss": 1.7492, + "step": 9148 + }, + { + "epoch": 0.962546028406102, + "grad_norm": 1.6175739765167236, + "learning_rate": 7.349696551776086e-07, + "loss": 2.1136, + "step": 9149 + }, + { + "epoch": 0.9626512361914782, + "grad_norm": 1.5902010202407837, + "learning_rate": 7.308514412306289e-07, + "loss": 1.8893, + "step": 9150 + }, + { + "epoch": 0.9627564439768543, + "grad_norm": 1.2558472156524658, + "learning_rate": 7.267447550829865e-07, + "loss": 1.5318, + "step": 9151 + }, + { + "epoch": 0.9628616517622304, + "grad_norm": 1.7991538047790527, + "learning_rate": 7.226495972115776e-07, + "loss": 1.2432, + "step": 9152 + }, + { + "epoch": 0.9629668595476065, + "grad_norm": 1.6260236501693726, + "learning_rate": 7.185659680919554e-07, + "loss": 1.7901, + "step": 9153 + }, + { + "epoch": 0.9630720673329827, + "grad_norm": 1.319873571395874, + "learning_rate": 7.144938681983515e-07, + "loss": 1.4871, + "step": 9154 + }, + { + "epoch": 0.9631772751183587, + "grad_norm": 1.6436864137649536, + "learning_rate": 7.104332980036211e-07, + "loss": 1.2697, + "step": 9155 + }, + { + "epoch": 0.9632824829037349, + "grad_norm": 1.0608686208724976, + "learning_rate": 7.063842579793311e-07, + "loss": 1.8403, + "step": 9156 + }, + { + "epoch": 0.963387690689111, + "grad_norm": 1.5618516206741333, + "learning_rate": 7.023467485956614e-07, + "loss": 1.4696, + "step": 9157 + }, + { + "epoch": 0.9634928984744872, + "grad_norm": 1.3680403232574463, + "learning_rate": 6.983207703214811e-07, + "loss": 1.6647, + "step": 9158 + }, + { + "epoch": 0.9635981062598632, + "grad_norm": 2.8899571895599365, + "learning_rate": 6.943063236243275e-07, + "loss": 1.7808, + "step": 9159 + }, + { + "epoch": 0.9637033140452393, + "grad_norm": 1.5044270753860474, + "learning_rate": 6.903034089703719e-07, + "loss": 1.8185, + "step": 9160 + }, + { + "epoch": 0.9638085218306155, + "grad_norm": 1.38004469871521, + "learning_rate": 6.863120268244649e-07, + "loss": 1.6207, + "step": 9161 + }, + { + "epoch": 0.9639137296159915, + "grad_norm": 2.003220558166504, + "learning_rate": 6.823321776501024e-07, + "loss": 1.5458, + "step": 9162 + }, + { + "epoch": 0.9640189374013677, + "grad_norm": 1.4244567155838013, + "learning_rate": 6.783638619094701e-07, + "loss": 1.2926, + "step": 9163 + }, + { + "epoch": 0.9641241451867438, + "grad_norm": 1.428270697593689, + "learning_rate": 6.744070800633661e-07, + "loss": 1.5582, + "step": 9164 + }, + { + "epoch": 0.96422935297212, + "grad_norm": 1.64469313621521, + "learning_rate": 6.704618325713119e-07, + "loss": 1.7808, + "step": 9165 + }, + { + "epoch": 0.964334560757496, + "grad_norm": 1.6442152261734009, + "learning_rate": 6.665281198914408e-07, + "loss": 1.4003, + "step": 9166 + }, + { + "epoch": 0.9644397685428722, + "grad_norm": 1.406569480895996, + "learning_rate": 6.626059424805542e-07, + "loss": 1.4821, + "step": 9167 + }, + { + "epoch": 0.9645449763282483, + "grad_norm": 1.100145936012268, + "learning_rate": 6.586953007941321e-07, + "loss": 1.6582, + "step": 9168 + }, + { + "epoch": 0.9646501841136244, + "grad_norm": 1.651857614517212, + "learning_rate": 6.547961952863002e-07, + "loss": 1.7471, + "step": 9169 + }, + { + "epoch": 0.9647553918990005, + "grad_norm": 1.3238495588302612, + "learning_rate": 6.509086264098407e-07, + "loss": 1.7772, + "step": 9170 + }, + { + "epoch": 0.9648605996843767, + "grad_norm": 2.2787668704986572, + "learning_rate": 6.470325946162259e-07, + "loss": 1.3509, + "step": 9171 + }, + { + "epoch": 0.9649658074697528, + "grad_norm": 1.7403054237365723, + "learning_rate": 6.431681003555622e-07, + "loss": 1.4681, + "step": 9172 + }, + { + "epoch": 0.9650710152551288, + "grad_norm": 1.6274385452270508, + "learning_rate": 6.393151440766021e-07, + "loss": 1.6136, + "step": 9173 + }, + { + "epoch": 0.965176223040505, + "grad_norm": 1.5703790187835693, + "learning_rate": 6.354737262267873e-07, + "loss": 1.4191, + "step": 9174 + }, + { + "epoch": 0.9652814308258811, + "grad_norm": 1.5442698001861572, + "learning_rate": 6.316438472522057e-07, + "loss": 1.1351, + "step": 9175 + }, + { + "epoch": 0.9653866386112573, + "grad_norm": 1.7261892557144165, + "learning_rate": 6.278255075976125e-07, + "loss": 1.1987, + "step": 9176 + }, + { + "epoch": 0.9654918463966333, + "grad_norm": 1.5503300428390503, + "learning_rate": 6.240187077064307e-07, + "loss": 1.8204, + "step": 9177 + }, + { + "epoch": 0.9655970541820095, + "grad_norm": 1.6254150867462158, + "learning_rate": 6.202234480207069e-07, + "loss": 1.6718, + "step": 9178 + }, + { + "epoch": 0.9657022619673856, + "grad_norm": 1.0417579412460327, + "learning_rate": 6.164397289811885e-07, + "loss": 1.3387, + "step": 9179 + }, + { + "epoch": 0.9658074697527617, + "grad_norm": 1.2805875539779663, + "learning_rate": 6.126675510272572e-07, + "loss": 1.2551, + "step": 9180 + }, + { + "epoch": 0.9659126775381378, + "grad_norm": 1.3211545944213867, + "learning_rate": 6.089069145969739e-07, + "loss": 1.65, + "step": 9181 + }, + { + "epoch": 0.966017885323514, + "grad_norm": 4.055902481079102, + "learning_rate": 6.051578201270336e-07, + "loss": 1.776, + "step": 9182 + }, + { + "epoch": 0.9661230931088901, + "grad_norm": 1.3434686660766602, + "learning_rate": 6.014202680528324e-07, + "loss": 1.4995, + "step": 9183 + }, + { + "epoch": 0.9662283008942661, + "grad_norm": 1.8617429733276367, + "learning_rate": 5.976942588083678e-07, + "loss": 1.7132, + "step": 9184 + }, + { + "epoch": 0.9663335086796423, + "grad_norm": 1.6335538625717163, + "learning_rate": 5.939797928263602e-07, + "loss": 1.5556, + "step": 9185 + }, + { + "epoch": 0.9664387164650184, + "grad_norm": 1.5544825792312622, + "learning_rate": 5.902768705381312e-07, + "loss": 1.736, + "step": 9186 + }, + { + "epoch": 0.9665439242503945, + "grad_norm": 1.460477590560913, + "learning_rate": 5.865854923737035e-07, + "loss": 1.4154, + "step": 9187 + }, + { + "epoch": 0.9666491320357706, + "grad_norm": 1.5231237411499023, + "learning_rate": 5.829056587617455e-07, + "loss": 1.3271, + "step": 9188 + }, + { + "epoch": 0.9667543398211468, + "grad_norm": 1.2489863634109497, + "learning_rate": 5.792373701295706e-07, + "loss": 1.9272, + "step": 9189 + }, + { + "epoch": 0.9668595476065229, + "grad_norm": 1.2746580839157104, + "learning_rate": 5.755806269031827e-07, + "loss": 1.4012, + "step": 9190 + }, + { + "epoch": 0.966964755391899, + "grad_norm": 1.310562252998352, + "learning_rate": 5.719354295072199e-07, + "loss": 1.9932, + "step": 9191 + }, + { + "epoch": 0.9670699631772751, + "grad_norm": 1.565401554107666, + "learning_rate": 5.68301778364988e-07, + "loss": 1.5477, + "step": 9192 + }, + { + "epoch": 0.9671751709626513, + "grad_norm": 1.3643187284469604, + "learning_rate": 5.646796738984495e-07, + "loss": 1.7313, + "step": 9193 + }, + { + "epoch": 0.9672803787480273, + "grad_norm": 2.277108907699585, + "learning_rate": 5.610691165282234e-07, + "loss": 1.4127, + "step": 9194 + }, + { + "epoch": 0.9673855865334035, + "grad_norm": 1.1328575611114502, + "learning_rate": 5.574701066735965e-07, + "loss": 2.0227, + "step": 9195 + }, + { + "epoch": 0.9674907943187796, + "grad_norm": 1.7198408842086792, + "learning_rate": 5.538826447525125e-07, + "loss": 1.7175, + "step": 9196 + }, + { + "epoch": 0.9675960021041558, + "grad_norm": 1.514518141746521, + "learning_rate": 5.503067311815713e-07, + "loss": 1.512, + "step": 9197 + }, + { + "epoch": 0.9677012098895318, + "grad_norm": 1.6654568910598755, + "learning_rate": 5.467423663760296e-07, + "loss": 1.337, + "step": 9198 + }, + { + "epoch": 0.9678064176749079, + "grad_norm": 1.9775021076202393, + "learning_rate": 5.431895507498008e-07, + "loss": 1.1514, + "step": 9199 + }, + { + "epoch": 0.9679116254602841, + "grad_norm": 1.0919164419174194, + "learning_rate": 5.39648284715466e-07, + "loss": 1.7102, + "step": 9200 + }, + { + "epoch": 0.9680168332456601, + "grad_norm": 1.3616853952407837, + "learning_rate": 5.361185686842629e-07, + "loss": 1.7125, + "step": 9201 + }, + { + "epoch": 0.9681220410310363, + "grad_norm": 1.843559741973877, + "learning_rate": 5.326004030660747e-07, + "loss": 1.1107, + "step": 9202 + }, + { + "epoch": 0.9682272488164124, + "grad_norm": 1.5905053615570068, + "learning_rate": 5.290937882694746e-07, + "loss": 1.4774, + "step": 9203 + }, + { + "epoch": 0.9683324566017886, + "grad_norm": 1.699774146080017, + "learning_rate": 5.255987247016591e-07, + "loss": 1.6351, + "step": 9204 + }, + { + "epoch": 0.9684376643871646, + "grad_norm": 2.0290257930755615, + "learning_rate": 5.221152127685036e-07, + "loss": 2.2553, + "step": 9205 + }, + { + "epoch": 0.9685428721725408, + "grad_norm": 2.343132972717285, + "learning_rate": 5.186432528745289e-07, + "loss": 0.9011, + "step": 9206 + }, + { + "epoch": 0.9686480799579169, + "grad_norm": 2.2507107257843018, + "learning_rate": 5.151828454229346e-07, + "loss": 1.7659, + "step": 9207 + }, + { + "epoch": 0.9687532877432931, + "grad_norm": 1.9188868999481201, + "learning_rate": 5.117339908155549e-07, + "loss": 1.9411, + "step": 9208 + }, + { + "epoch": 0.9688584955286691, + "grad_norm": 1.9085512161254883, + "learning_rate": 5.082966894529028e-07, + "loss": 1.3046, + "step": 9209 + }, + { + "epoch": 0.9689637033140452, + "grad_norm": 1.6628177165985107, + "learning_rate": 5.04870941734148e-07, + "loss": 1.3953, + "step": 9210 + }, + { + "epoch": 0.9690689110994214, + "grad_norm": 1.5158159732818604, + "learning_rate": 5.014567480570831e-07, + "loss": 1.9355, + "step": 9211 + }, + { + "epoch": 0.9691741188847974, + "grad_norm": 1.3943663835525513, + "learning_rate": 4.980541088182133e-07, + "loss": 1.5488, + "step": 9212 + }, + { + "epoch": 0.9692793266701736, + "grad_norm": 1.2974984645843506, + "learning_rate": 4.946630244126782e-07, + "loss": 1.4926, + "step": 9213 + }, + { + "epoch": 0.9693845344555497, + "grad_norm": 1.5148580074310303, + "learning_rate": 4.912834952342515e-07, + "loss": 1.7226, + "step": 9214 + }, + { + "epoch": 0.9694897422409259, + "grad_norm": 1.8382201194763184, + "learning_rate": 4.879155216753972e-07, + "loss": 1.6377, + "step": 9215 + }, + { + "epoch": 0.9695949500263019, + "grad_norm": 1.510947823524475, + "learning_rate": 4.845591041272358e-07, + "loss": 2.0674, + "step": 9216 + }, + { + "epoch": 0.9697001578116781, + "grad_norm": 2.992677927017212, + "learning_rate": 4.812142429795219e-07, + "loss": 1.6575, + "step": 9217 + }, + { + "epoch": 0.9698053655970542, + "grad_norm": 2.957728147506714, + "learning_rate": 4.778809386206895e-07, + "loss": 1.6637, + "step": 9218 + }, + { + "epoch": 0.9699105733824303, + "grad_norm": 1.4280604124069214, + "learning_rate": 4.745591914378289e-07, + "loss": 1.9542, + "step": 9219 + }, + { + "epoch": 0.9700157811678064, + "grad_norm": 2.1100828647613525, + "learning_rate": 4.71249001816676e-07, + "loss": 1.6198, + "step": 9220 + }, + { + "epoch": 0.9701209889531826, + "grad_norm": 1.218092918395996, + "learning_rate": 4.6795037014163436e-07, + "loss": 1.2455, + "step": 9221 + }, + { + "epoch": 0.9702261967385587, + "grad_norm": 1.3231292963027954, + "learning_rate": 4.6466329679577536e-07, + "loss": 1.4456, + "step": 9222 + }, + { + "epoch": 0.9703314045239347, + "grad_norm": 1.482961893081665, + "learning_rate": 4.613877821607937e-07, + "loss": 2.1301, + "step": 9223 + }, + { + "epoch": 0.9704366123093109, + "grad_norm": 1.3625333309173584, + "learning_rate": 4.581238266170851e-07, + "loss": 1.631, + "step": 9224 + }, + { + "epoch": 0.970541820094687, + "grad_norm": 1.9850952625274658, + "learning_rate": 4.548714305436685e-07, + "loss": 1.8183, + "step": 9225 + }, + { + "epoch": 0.9706470278800631, + "grad_norm": 2.0296475887298584, + "learning_rate": 4.5163059431824194e-07, + "loss": 1.5821, + "step": 9226 + }, + { + "epoch": 0.9707522356654392, + "grad_norm": 1.2871662378311157, + "learning_rate": 4.484013183171376e-07, + "loss": 1.5999, + "step": 9227 + }, + { + "epoch": 0.9708574434508154, + "grad_norm": 1.5916684865951538, + "learning_rate": 4.4518360291538883e-07, + "loss": 1.8337, + "step": 9228 + }, + { + "epoch": 0.9709626512361915, + "grad_norm": 1.2991007566452026, + "learning_rate": 4.4197744848663017e-07, + "loss": 1.2721, + "step": 9229 + }, + { + "epoch": 0.9710678590215676, + "grad_norm": 1.4421882629394531, + "learning_rate": 4.3878285540319694e-07, + "loss": 1.684, + "step": 9230 + }, + { + "epoch": 0.9711730668069437, + "grad_norm": 1.5107342004776, + "learning_rate": 4.355998240360704e-07, + "loss": 1.1753, + "step": 9231 + }, + { + "epoch": 0.9712782745923199, + "grad_norm": 1.8420056104660034, + "learning_rate": 4.324283547548658e-07, + "loss": 1.5455, + "step": 9232 + }, + { + "epoch": 0.9713834823776959, + "grad_norm": 2.1014416217803955, + "learning_rate": 4.2926844792789967e-07, + "loss": 1.3949, + "step": 9233 + }, + { + "epoch": 0.971488690163072, + "grad_norm": 1.452222466468811, + "learning_rate": 4.261201039221008e-07, + "loss": 1.5272, + "step": 9234 + }, + { + "epoch": 0.9715938979484482, + "grad_norm": 1.7641346454620361, + "learning_rate": 4.2298332310308775e-07, + "loss": 1.3401, + "step": 9235 + }, + { + "epoch": 0.9716991057338243, + "grad_norm": 1.8564509153366089, + "learning_rate": 4.1985810583512473e-07, + "loss": 1.4316, + "step": 9236 + }, + { + "epoch": 0.9718043135192004, + "grad_norm": 1.6556718349456787, + "learning_rate": 4.167444524811215e-07, + "loss": 1.5263, + "step": 9237 + }, + { + "epoch": 0.9719095213045765, + "grad_norm": 1.864583969116211, + "learning_rate": 4.136423634026776e-07, + "loss": 1.4365, + "step": 9238 + }, + { + "epoch": 0.9720147290899527, + "grad_norm": 1.7581238746643066, + "learning_rate": 4.1055183896001606e-07, + "loss": 1.5028, + "step": 9239 + }, + { + "epoch": 0.9721199368753288, + "grad_norm": 1.2646266222000122, + "learning_rate": 4.074728795120275e-07, + "loss": 1.6811, + "step": 9240 + }, + { + "epoch": 0.9722251446607049, + "grad_norm": 1.6444590091705322, + "learning_rate": 4.0440548541625935e-07, + "loss": 1.6224, + "step": 9241 + }, + { + "epoch": 0.972330352446081, + "grad_norm": 1.8099961280822754, + "learning_rate": 4.013496570289155e-07, + "loss": 1.5547, + "step": 9242 + }, + { + "epoch": 0.9724355602314572, + "grad_norm": 2.0473053455352783, + "learning_rate": 3.983053947048676e-07, + "loss": 1.4913, + "step": 9243 + }, + { + "epoch": 0.9725407680168332, + "grad_norm": 1.557085394859314, + "learning_rate": 3.9527269879764406e-07, + "loss": 1.415, + "step": 9244 + }, + { + "epoch": 0.9726459758022094, + "grad_norm": 1.8768304586410522, + "learning_rate": 3.9225156965939647e-07, + "loss": 1.5624, + "step": 9245 + }, + { + "epoch": 0.9727511835875855, + "grad_norm": 1.8402485847473145, + "learning_rate": 3.892420076409886e-07, + "loss": 1.712, + "step": 9246 + }, + { + "epoch": 0.9728563913729616, + "grad_norm": 1.5501086711883545, + "learning_rate": 3.862440130918854e-07, + "loss": 1.5368, + "step": 9247 + }, + { + "epoch": 0.9729615991583377, + "grad_norm": 2.000622272491455, + "learning_rate": 3.832575863602528e-07, + "loss": 2.0911, + "step": 9248 + }, + { + "epoch": 0.9730668069437138, + "grad_norm": 1.5791133642196655, + "learning_rate": 3.8028272779287997e-07, + "loss": 1.7847, + "step": 9249 + }, + { + "epoch": 0.97317201472909, + "grad_norm": 1.8884774446487427, + "learning_rate": 3.7731943773523515e-07, + "loss": 1.5126, + "step": 9250 + }, + { + "epoch": 0.973277222514466, + "grad_norm": 1.8291395902633667, + "learning_rate": 3.7436771653143187e-07, + "loss": 1.6039, + "step": 9251 + }, + { + "epoch": 0.9733824302998422, + "grad_norm": 1.446852684020996, + "learning_rate": 3.7142756452425155e-07, + "loss": 1.3811, + "step": 9252 + }, + { + "epoch": 0.9734876380852183, + "grad_norm": 1.450920820236206, + "learning_rate": 3.68498982055121e-07, + "loss": 1.5193, + "step": 9253 + }, + { + "epoch": 0.9735928458705945, + "grad_norm": 1.6216645240783691, + "learning_rate": 3.655819694641127e-07, + "loss": 1.7471, + "step": 9254 + }, + { + "epoch": 0.9736980536559705, + "grad_norm": 1.7787151336669922, + "learning_rate": 3.62676527089989e-07, + "loss": 1.8712, + "step": 9255 + }, + { + "epoch": 0.9738032614413467, + "grad_norm": 1.63424813747406, + "learning_rate": 3.5978265527014666e-07, + "loss": 1.8227, + "step": 9256 + }, + { + "epoch": 0.9739084692267228, + "grad_norm": 2.0026700496673584, + "learning_rate": 3.56900354340628e-07, + "loss": 1.7547, + "step": 9257 + }, + { + "epoch": 0.9740136770120988, + "grad_norm": 1.6562840938568115, + "learning_rate": 3.5402962463616517e-07, + "loss": 2.0525, + "step": 9258 + }, + { + "epoch": 0.974118884797475, + "grad_norm": 2.0857789516448975, + "learning_rate": 3.511704664901139e-07, + "loss": 1.4605, + "step": 9259 + }, + { + "epoch": 0.9742240925828511, + "grad_norm": 1.9922007322311401, + "learning_rate": 3.483228802344973e-07, + "loss": 1.4324, + "step": 9260 + }, + { + "epoch": 0.9743293003682273, + "grad_norm": 2.007927417755127, + "learning_rate": 3.454868661999955e-07, + "loss": 1.6704, + "step": 9261 + }, + { + "epoch": 0.9744345081536033, + "grad_norm": 1.3496975898742676, + "learning_rate": 3.42662424715956e-07, + "loss": 1.6075, + "step": 9262 + }, + { + "epoch": 0.9745397159389795, + "grad_norm": 1.6474722623825073, + "learning_rate": 3.39849556110361e-07, + "loss": 1.8737, + "step": 9263 + }, + { + "epoch": 0.9746449237243556, + "grad_norm": 1.7069952487945557, + "learning_rate": 3.370482607098602e-07, + "loss": 1.2898, + "step": 9264 + }, + { + "epoch": 0.9747501315097317, + "grad_norm": 1.7524888515472412, + "learning_rate": 3.342585388397712e-07, + "loss": 1.5437, + "step": 9265 + }, + { + "epoch": 0.9748553392951078, + "grad_norm": 1.8465501070022583, + "learning_rate": 3.3148039082404605e-07, + "loss": 1.4052, + "step": 9266 + }, + { + "epoch": 0.974960547080484, + "grad_norm": 1.5890626907348633, + "learning_rate": 3.2871381698529324e-07, + "loss": 1.616, + "step": 9267 + }, + { + "epoch": 0.9750657548658601, + "grad_norm": 1.5656459331512451, + "learning_rate": 3.259588176448003e-07, + "loss": 1.3757, + "step": 9268 + }, + { + "epoch": 0.9751709626512362, + "grad_norm": 1.868996262550354, + "learning_rate": 3.2321539312248903e-07, + "loss": 1.3586, + "step": 9269 + }, + { + "epoch": 0.9752761704366123, + "grad_norm": 1.7891117334365845, + "learning_rate": 3.204835437369491e-07, + "loss": 1.2176, + "step": 9270 + }, + { + "epoch": 0.9753813782219884, + "grad_norm": 1.7648265361785889, + "learning_rate": 3.177632698054156e-07, + "loss": 1.7038, + "step": 9271 + }, + { + "epoch": 0.9754865860073646, + "grad_norm": 1.6850988864898682, + "learning_rate": 3.150545716437914e-07, + "loss": 1.4181, + "step": 9272 + }, + { + "epoch": 0.9755917937927406, + "grad_norm": 1.7368006706237793, + "learning_rate": 3.1235744956662484e-07, + "loss": 1.3408, + "step": 9273 + }, + { + "epoch": 0.9756970015781168, + "grad_norm": 2.081385374069214, + "learning_rate": 3.0967190388712097e-07, + "loss": 1.3886, + "step": 9274 + }, + { + "epoch": 0.9758022093634929, + "grad_norm": 2.454695224761963, + "learning_rate": 3.0699793491715256e-07, + "loss": 1.1868, + "step": 9275 + }, + { + "epoch": 0.975907417148869, + "grad_norm": 1.7129137516021729, + "learning_rate": 3.043355429672268e-07, + "loss": 1.7567, + "step": 9276 + }, + { + "epoch": 0.9760126249342451, + "grad_norm": 1.3494802713394165, + "learning_rate": 3.016847283465185e-07, + "loss": 1.4914, + "step": 9277 + }, + { + "epoch": 0.9761178327196213, + "grad_norm": 1.3890875577926636, + "learning_rate": 2.990454913628704e-07, + "loss": 1.3622, + "step": 9278 + }, + { + "epoch": 0.9762230405049974, + "grad_norm": 2.182384967803955, + "learning_rate": 2.9641783232275955e-07, + "loss": 1.4551, + "step": 9279 + }, + { + "epoch": 0.9763282482903735, + "grad_norm": 1.537185788154602, + "learning_rate": 2.938017515313418e-07, + "loss": 1.7531, + "step": 9280 + }, + { + "epoch": 0.9764334560757496, + "grad_norm": 1.466887354850769, + "learning_rate": 2.9119724929239645e-07, + "loss": 1.567, + "step": 9281 + }, + { + "epoch": 0.9765386638611258, + "grad_norm": 1.78278648853302, + "learning_rate": 2.886043259083704e-07, + "loss": 1.3838, + "step": 9282 + }, + { + "epoch": 0.9766438716465018, + "grad_norm": 1.6056292057037354, + "learning_rate": 2.860229816803894e-07, + "loss": 1.1808, + "step": 9283 + }, + { + "epoch": 0.9767490794318779, + "grad_norm": 1.9775320291519165, + "learning_rate": 2.834532169082138e-07, + "loss": 1.6403, + "step": 9284 + }, + { + "epoch": 0.9768542872172541, + "grad_norm": 1.4951421022415161, + "learning_rate": 2.8089503189024926e-07, + "loss": 1.6209, + "step": 9285 + }, + { + "epoch": 0.9769594950026302, + "grad_norm": 2.1086246967315674, + "learning_rate": 2.7834842692358033e-07, + "loss": 1.6395, + "step": 9286 + }, + { + "epoch": 0.9770647027880063, + "grad_norm": 1.4690982103347778, + "learning_rate": 2.7581340230393717e-07, + "loss": 1.6837, + "step": 9287 + }, + { + "epoch": 0.9771699105733824, + "grad_norm": 1.447407603263855, + "learning_rate": 2.7328995832568426e-07, + "loss": 1.6867, + "step": 9288 + }, + { + "epoch": 0.9772751183587586, + "grad_norm": 1.7926383018493652, + "learning_rate": 2.7077809528188724e-07, + "loss": 1.4406, + "step": 9289 + }, + { + "epoch": 0.9773803261441346, + "grad_norm": 2.195378541946411, + "learning_rate": 2.6827781346423496e-07, + "loss": 1.808, + "step": 9290 + }, + { + "epoch": 0.9774855339295108, + "grad_norm": 1.9407440423965454, + "learning_rate": 2.657891131630619e-07, + "loss": 1.362, + "step": 9291 + }, + { + "epoch": 0.9775907417148869, + "grad_norm": 1.7981421947479248, + "learning_rate": 2.633119946673923e-07, + "loss": 1.4832, + "step": 9292 + }, + { + "epoch": 0.9776959495002631, + "grad_norm": 1.5545809268951416, + "learning_rate": 2.608464582648629e-07, + "loss": 1.8864, + "step": 9293 + }, + { + "epoch": 0.9778011572856391, + "grad_norm": 1.306902289390564, + "learning_rate": 2.583925042418112e-07, + "loss": 1.5875, + "step": 9294 + }, + { + "epoch": 0.9779063650710152, + "grad_norm": 2.3127174377441406, + "learning_rate": 2.5595013288318703e-07, + "loss": 1.6263, + "step": 9295 + }, + { + "epoch": 0.9780115728563914, + "grad_norm": 2.087738275527954, + "learning_rate": 2.5351934447263026e-07, + "loss": 1.7355, + "step": 9296 + }, + { + "epoch": 0.9781167806417674, + "grad_norm": 1.412264108657837, + "learning_rate": 2.5110013929241504e-07, + "loss": 1.5116, + "step": 9297 + }, + { + "epoch": 0.9782219884271436, + "grad_norm": 1.165013313293457, + "learning_rate": 2.4869251762348333e-07, + "loss": 1.421, + "step": 9298 + }, + { + "epoch": 0.9783271962125197, + "grad_norm": 2.194392442703247, + "learning_rate": 2.462964797454004e-07, + "loss": 1.9825, + "step": 9299 + }, + { + "epoch": 0.9784324039978959, + "grad_norm": 1.6436448097229004, + "learning_rate": 2.4391202593643246e-07, + "loss": 1.6435, + "step": 9300 + }, + { + "epoch": 0.9785376117832719, + "grad_norm": 0.9663065075874329, + "learning_rate": 2.4153915647348034e-07, + "loss": 1.5033, + "step": 9301 + }, + { + "epoch": 0.9786428195686481, + "grad_norm": 1.650208592414856, + "learning_rate": 2.391778716320792e-07, + "loss": 1.4037, + "step": 9302 + }, + { + "epoch": 0.9787480273540242, + "grad_norm": 1.5926425457000732, + "learning_rate": 2.3682817168644288e-07, + "loss": 1.9914, + "step": 9303 + }, + { + "epoch": 0.9788532351394004, + "grad_norm": 1.1928669214248657, + "learning_rate": 2.3449005690945324e-07, + "loss": 1.5487, + "step": 9304 + }, + { + "epoch": 0.9789584429247764, + "grad_norm": 1.331669569015503, + "learning_rate": 2.321635275726042e-07, + "loss": 1.9395, + "step": 9305 + }, + { + "epoch": 0.9790636507101526, + "grad_norm": 1.4218162298202515, + "learning_rate": 2.2984858394607956e-07, + "loss": 1.4188, + "step": 9306 + }, + { + "epoch": 0.9791688584955287, + "grad_norm": 1.6478614807128906, + "learning_rate": 2.275452262986977e-07, + "loss": 1.7231, + "step": 9307 + }, + { + "epoch": 0.9792740662809047, + "grad_norm": 1.5847270488739014, + "learning_rate": 2.252534548979446e-07, + "loss": 1.3288, + "step": 9308 + }, + { + "epoch": 0.9793792740662809, + "grad_norm": 1.5312392711639404, + "learning_rate": 2.2297327000996293e-07, + "loss": 1.4311, + "step": 9309 + }, + { + "epoch": 0.979484481851657, + "grad_norm": 1.6331710815429688, + "learning_rate": 2.207046718995409e-07, + "loss": 1.8195, + "step": 9310 + }, + { + "epoch": 0.9795896896370332, + "grad_norm": 1.5755873918533325, + "learning_rate": 2.1844766083011226e-07, + "loss": 1.5262, + "step": 9311 + }, + { + "epoch": 0.9796948974224092, + "grad_norm": 1.929260492324829, + "learning_rate": 2.162022370637895e-07, + "loss": 1.5236, + "step": 9312 + }, + { + "epoch": 0.9798001052077854, + "grad_norm": 1.6517144441604614, + "learning_rate": 2.1396840086131964e-07, + "loss": 1.9677, + "step": 9313 + }, + { + "epoch": 0.9799053129931615, + "grad_norm": 1.5350128412246704, + "learning_rate": 2.1174615248210626e-07, + "loss": 1.2628, + "step": 9314 + }, + { + "epoch": 0.9800105207785376, + "grad_norm": 1.6820600032806396, + "learning_rate": 2.0953549218423185e-07, + "loss": 2.0707, + "step": 9315 + }, + { + "epoch": 0.9801157285639137, + "grad_norm": 2.348896026611328, + "learning_rate": 2.0733642022437994e-07, + "loss": 1.3693, + "step": 9316 + }, + { + "epoch": 0.9802209363492899, + "grad_norm": 3.21044659614563, + "learning_rate": 2.0514893685795733e-07, + "loss": 1.4473, + "step": 9317 + }, + { + "epoch": 0.980326144134666, + "grad_norm": 1.5892785787582397, + "learning_rate": 2.0297304233896087e-07, + "loss": 1.5056, + "step": 9318 + }, + { + "epoch": 0.980431351920042, + "grad_norm": 1.6441622972488403, + "learning_rate": 2.008087369200773e-07, + "loss": 1.6748, + "step": 9319 + }, + { + "epoch": 0.9805365597054182, + "grad_norm": 1.501565933227539, + "learning_rate": 1.9865602085265002e-07, + "loss": 1.3629, + "step": 9320 + }, + { + "epoch": 0.9806417674907943, + "grad_norm": 1.4179154634475708, + "learning_rate": 1.9651489438666792e-07, + "loss": 1.7201, + "step": 9321 + }, + { + "epoch": 0.9807469752761704, + "grad_norm": 1.6609681844711304, + "learning_rate": 1.943853577707544e-07, + "loss": 1.7947, + "step": 9322 + }, + { + "epoch": 0.9808521830615465, + "grad_norm": 1.9472432136535645, + "learning_rate": 1.922674112522227e-07, + "loss": 1.6767, + "step": 9323 + }, + { + "epoch": 0.9809573908469227, + "grad_norm": 1.3472323417663574, + "learning_rate": 1.9016105507702054e-07, + "loss": 1.2528, + "step": 9324 + }, + { + "epoch": 0.9810625986322988, + "grad_norm": 1.9179092645645142, + "learning_rate": 1.8806628948974114e-07, + "loss": 1.5153, + "step": 9325 + }, + { + "epoch": 0.9811678064176749, + "grad_norm": 2.2002909183502197, + "learning_rate": 1.859831147336566e-07, + "loss": 1.584, + "step": 9326 + }, + { + "epoch": 0.981273014203051, + "grad_norm": 2.532457113265991, + "learning_rate": 1.8391153105067338e-07, + "loss": 2.4041, + "step": 9327 + }, + { + "epoch": 0.9813782219884272, + "grad_norm": 1.7120616436004639, + "learning_rate": 1.8185153868135462e-07, + "loss": 1.6406, + "step": 9328 + }, + { + "epoch": 0.9814834297738032, + "grad_norm": 1.6035572290420532, + "learning_rate": 1.798031378649201e-07, + "loss": 1.5754, + "step": 9329 + }, + { + "epoch": 0.9815886375591794, + "grad_norm": 1.2862604856491089, + "learning_rate": 1.7776632883924615e-07, + "loss": 1.7776, + "step": 9330 + }, + { + "epoch": 0.9816938453445555, + "grad_norm": 2.551787853240967, + "learning_rate": 1.7574111184086582e-07, + "loss": 1.962, + "step": 9331 + }, + { + "epoch": 0.9817990531299317, + "grad_norm": 1.6095662117004395, + "learning_rate": 1.7372748710495768e-07, + "loss": 1.6107, + "step": 9332 + }, + { + "epoch": 0.9819042609153077, + "grad_norm": 2.01579213142395, + "learning_rate": 1.7172545486535685e-07, + "loss": 1.1388, + "step": 9333 + }, + { + "epoch": 0.9820094687006838, + "grad_norm": 2.388930559158325, + "learning_rate": 1.6973501535455516e-07, + "loss": 1.7735, + "step": 9334 + }, + { + "epoch": 0.98211467648606, + "grad_norm": 1.5731823444366455, + "learning_rate": 1.6775616880368994e-07, + "loss": 1.653, + "step": 9335 + }, + { + "epoch": 0.9822198842714361, + "grad_norm": 1.1225214004516602, + "learning_rate": 1.6578891544255514e-07, + "loss": 1.8642, + "step": 9336 + }, + { + "epoch": 0.9823250920568122, + "grad_norm": 1.4525314569473267, + "learning_rate": 1.638332554996125e-07, + "loss": 1.2603, + "step": 9337 + }, + { + "epoch": 0.9824302998421883, + "grad_norm": 2.0482780933380127, + "learning_rate": 1.6188918920195806e-07, + "loss": 1.4663, + "step": 9338 + }, + { + "epoch": 0.9825355076275645, + "grad_norm": 1.067059874534607, + "learning_rate": 1.5995671677535573e-07, + "loss": 1.371, + "step": 9339 + }, + { + "epoch": 0.9826407154129405, + "grad_norm": 2.4665608406066895, + "learning_rate": 1.5803583844421488e-07, + "loss": 1.533, + "step": 9340 + }, + { + "epoch": 0.9827459231983167, + "grad_norm": 1.2421523332595825, + "learning_rate": 1.561265544316015e-07, + "loss": 1.3759, + "step": 9341 + }, + { + "epoch": 0.9828511309836928, + "grad_norm": 1.3852927684783936, + "learning_rate": 1.5422886495922718e-07, + "loss": 1.5905, + "step": 9342 + }, + { + "epoch": 0.982956338769069, + "grad_norm": 1.9068608283996582, + "learning_rate": 1.5234277024747112e-07, + "loss": 1.7971, + "step": 9343 + }, + { + "epoch": 0.983061546554445, + "grad_norm": 1.5108956098556519, + "learning_rate": 1.5046827051536928e-07, + "loss": 1.8727, + "step": 9344 + }, + { + "epoch": 0.9831667543398211, + "grad_norm": 1.5862867832183838, + "learning_rate": 1.4860536598058085e-07, + "loss": 1.9341, + "step": 9345 + }, + { + "epoch": 0.9832719621251973, + "grad_norm": 1.2081376314163208, + "learning_rate": 1.4675405685944387e-07, + "loss": 1.7027, + "step": 9346 + }, + { + "epoch": 0.9833771699105733, + "grad_norm": 1.2375844717025757, + "learning_rate": 1.4491434336696418e-07, + "loss": 1.5949, + "step": 9347 + }, + { + "epoch": 0.9834823776959495, + "grad_norm": 1.2393487691879272, + "learning_rate": 1.430862257167598e-07, + "loss": 1.7839, + "step": 9348 + }, + { + "epoch": 0.9835875854813256, + "grad_norm": 2.2204091548919678, + "learning_rate": 1.412697041211275e-07, + "loss": 1.5249, + "step": 9349 + }, + { + "epoch": 0.9836927932667018, + "grad_norm": 1.9110403060913086, + "learning_rate": 1.3946477879102083e-07, + "loss": 1.1106, + "step": 9350 + }, + { + "epoch": 0.9837980010520778, + "grad_norm": 1.352942705154419, + "learning_rate": 1.3767144993602766e-07, + "loss": 1.3338, + "step": 9351 + }, + { + "epoch": 0.983903208837454, + "grad_norm": 1.511385440826416, + "learning_rate": 1.3588971776441472e-07, + "loss": 1.8067, + "step": 9352 + }, + { + "epoch": 0.9840084166228301, + "grad_norm": 1.3924455642700195, + "learning_rate": 1.3411958248309431e-07, + "loss": 1.8532, + "step": 9353 + }, + { + "epoch": 0.9841136244082062, + "grad_norm": 1.5775933265686035, + "learning_rate": 1.3236104429760199e-07, + "loss": 1.602, + "step": 9354 + }, + { + "epoch": 0.9842188321935823, + "grad_norm": 1.5030940771102905, + "learning_rate": 1.306141034121744e-07, + "loss": 1.8972, + "step": 9355 + }, + { + "epoch": 0.9843240399789585, + "grad_norm": 2.3042783737182617, + "learning_rate": 1.2887876002967149e-07, + "loss": 1.3577, + "step": 9356 + }, + { + "epoch": 0.9844292477643346, + "grad_norm": 2.0904388427734375, + "learning_rate": 1.2715501435159872e-07, + "loss": 1.3343, + "step": 9357 + }, + { + "epoch": 0.9845344555497106, + "grad_norm": 1.2769339084625244, + "learning_rate": 1.254428665781515e-07, + "loss": 1.8264, + "step": 9358 + }, + { + "epoch": 0.9846396633350868, + "grad_norm": 1.4355113506317139, + "learning_rate": 1.2374231690813754e-07, + "loss": 1.8146, + "step": 9359 + }, + { + "epoch": 0.9847448711204629, + "grad_norm": 1.9717060327529907, + "learning_rate": 1.2205336553904323e-07, + "loss": 1.3479, + "step": 9360 + }, + { + "epoch": 0.984850078905839, + "grad_norm": 1.9314119815826416, + "learning_rate": 1.203760126670117e-07, + "loss": 1.4207, + "step": 9361 + }, + { + "epoch": 0.9849552866912151, + "grad_norm": 1.9035725593566895, + "learning_rate": 1.1871025848680939e-07, + "loss": 1.2061, + "step": 9362 + }, + { + "epoch": 0.9850604944765913, + "grad_norm": 1.5623804330825806, + "learning_rate": 1.1705610319188154e-07, + "loss": 1.5218, + "step": 9363 + }, + { + "epoch": 0.9851657022619674, + "grad_norm": 1.5503050088882446, + "learning_rate": 1.1541354697431894e-07, + "loss": 1.8319, + "step": 9364 + }, + { + "epoch": 0.9852709100473435, + "grad_norm": 1.8744232654571533, + "learning_rate": 1.1378259002488013e-07, + "loss": 1.2055, + "step": 9365 + }, + { + "epoch": 0.9853761178327196, + "grad_norm": 1.7430399656295776, + "learning_rate": 1.1216323253294691e-07, + "loss": 1.44, + "step": 9366 + }, + { + "epoch": 0.9854813256180958, + "grad_norm": 1.298393964767456, + "learning_rate": 1.1055547468658001e-07, + "loss": 1.7648, + "step": 9367 + }, + { + "epoch": 0.9855865334034719, + "grad_norm": 1.0370997190475464, + "learning_rate": 1.089593166724634e-07, + "loss": 1.4384, + "step": 9368 + }, + { + "epoch": 0.985691741188848, + "grad_norm": 1.9467689990997314, + "learning_rate": 1.0737475867598212e-07, + "loss": 1.8683, + "step": 9369 + }, + { + "epoch": 0.9857969489742241, + "grad_norm": 1.4504022598266602, + "learning_rate": 1.0580180088112234e-07, + "loss": 1.2465, + "step": 9370 + }, + { + "epoch": 0.9859021567596002, + "grad_norm": 1.4652175903320312, + "learning_rate": 1.0424044347056017e-07, + "loss": 1.6443, + "step": 9371 + }, + { + "epoch": 0.9860073645449763, + "grad_norm": 1.7554495334625244, + "learning_rate": 1.0269068662560611e-07, + "loss": 1.0831, + "step": 9372 + }, + { + "epoch": 0.9861125723303524, + "grad_norm": 1.1378939151763916, + "learning_rate": 1.0115253052622731e-07, + "loss": 1.7837, + "step": 9373 + }, + { + "epoch": 0.9862177801157286, + "grad_norm": 1.8769643306732178, + "learning_rate": 9.962597535104756e-08, + "loss": 1.515, + "step": 9374 + }, + { + "epoch": 0.9863229879011047, + "grad_norm": 1.4734489917755127, + "learning_rate": 9.811102127733618e-08, + "loss": 2.1066, + "step": 9375 + }, + { + "epoch": 0.9864281956864808, + "grad_norm": 1.2402914762496948, + "learning_rate": 9.660766848101909e-08, + "loss": 1.6076, + "step": 9376 + }, + { + "epoch": 0.9865334034718569, + "grad_norm": 1.601291537284851, + "learning_rate": 9.511591713668999e-08, + "loss": 1.5006, + "step": 9377 + }, + { + "epoch": 0.9866386112572331, + "grad_norm": 1.4172643423080444, + "learning_rate": 9.363576741755476e-08, + "loss": 1.2291, + "step": 9378 + }, + { + "epoch": 0.9867438190426091, + "grad_norm": 1.5718390941619873, + "learning_rate": 9.216721949553142e-08, + "loss": 1.8789, + "step": 9379 + }, + { + "epoch": 0.9868490268279853, + "grad_norm": 1.7157665491104126, + "learning_rate": 9.071027354112804e-08, + "loss": 1.6226, + "step": 9380 + }, + { + "epoch": 0.9869542346133614, + "grad_norm": 1.4585630893707275, + "learning_rate": 8.926492972355371e-08, + "loss": 1.5842, + "step": 9381 + }, + { + "epoch": 0.9870594423987376, + "grad_norm": 1.299195647239685, + "learning_rate": 8.783118821064085e-08, + "loss": 1.4625, + "step": 9382 + }, + { + "epoch": 0.9871646501841136, + "grad_norm": 1.305501937866211, + "learning_rate": 8.640904916888959e-08, + "loss": 1.4309, + "step": 9383 + }, + { + "epoch": 0.9872698579694897, + "grad_norm": 1.4798643589019775, + "learning_rate": 8.499851276344561e-08, + "loss": 1.4667, + "step": 9384 + }, + { + "epoch": 0.9873750657548659, + "grad_norm": 1.9000688791275024, + "learning_rate": 8.359957915812233e-08, + "loss": 1.4393, + "step": 9385 + }, + { + "epoch": 0.9874802735402419, + "grad_norm": 1.4523059129714966, + "learning_rate": 8.221224851535647e-08, + "loss": 1.5859, + "step": 9386 + }, + { + "epoch": 0.9875854813256181, + "grad_norm": 1.9257595539093018, + "learning_rate": 8.08365209962525e-08, + "loss": 1.7144, + "step": 9387 + }, + { + "epoch": 0.9876906891109942, + "grad_norm": 1.2338892221450806, + "learning_rate": 7.94723967605937e-08, + "loss": 1.5707, + "step": 9388 + }, + { + "epoch": 0.9877958968963704, + "grad_norm": 1.9463906288146973, + "learning_rate": 7.81198759667645e-08, + "loss": 1.7662, + "step": 9389 + }, + { + "epoch": 0.9879011046817464, + "grad_norm": 1.2963135242462158, + "learning_rate": 7.677895877183927e-08, + "loss": 1.6959, + "step": 9390 + }, + { + "epoch": 0.9880063124671226, + "grad_norm": 1.3970755338668823, + "learning_rate": 7.544964533153787e-08, + "loss": 1.334, + "step": 9391 + }, + { + "epoch": 0.9881115202524987, + "grad_norm": 1.601676344871521, + "learning_rate": 7.41319358002146e-08, + "loss": 1.6125, + "step": 9392 + }, + { + "epoch": 0.9882167280378747, + "grad_norm": 1.3263901472091675, + "learning_rate": 7.282583033091372e-08, + "loss": 1.9106, + "step": 9393 + }, + { + "epoch": 0.9883219358232509, + "grad_norm": 1.8151953220367432, + "learning_rate": 7.15313290752917e-08, + "loss": 1.8621, + "step": 9394 + }, + { + "epoch": 0.988427143608627, + "grad_norm": 1.4019142389297485, + "learning_rate": 7.024843218368382e-08, + "loss": 1.565, + "step": 9395 + }, + { + "epoch": 0.9885323513940032, + "grad_norm": 2.185393810272217, + "learning_rate": 6.897713980505982e-08, + "loss": 1.4799, + "step": 9396 + }, + { + "epoch": 0.9886375591793792, + "grad_norm": 1.357749104499817, + "learning_rate": 6.771745208705715e-08, + "loss": 1.5501, + "step": 9397 + }, + { + "epoch": 0.9887427669647554, + "grad_norm": 1.7001276016235352, + "learning_rate": 6.646936917595881e-08, + "loss": 1.5436, + "step": 9398 + }, + { + "epoch": 0.9888479747501315, + "grad_norm": 2.9914774894714355, + "learning_rate": 6.52328912167044e-08, + "loss": 1.9068, + "step": 9399 + }, + { + "epoch": 0.9889531825355077, + "grad_norm": 1.8778984546661377, + "learning_rate": 6.400801835286796e-08, + "loss": 1.6909, + "step": 9400 + }, + { + "epoch": 0.9890583903208837, + "grad_norm": 1.316900372505188, + "learning_rate": 6.279475072670238e-08, + "loss": 1.4041, + "step": 9401 + }, + { + "epoch": 0.9891635981062599, + "grad_norm": 2.04650616645813, + "learning_rate": 6.159308847909495e-08, + "loss": 1.7704, + "step": 9402 + }, + { + "epoch": 0.989268805891636, + "grad_norm": 2.6405978202819824, + "learning_rate": 6.040303174958961e-08, + "loss": 1.8183, + "step": 9403 + }, + { + "epoch": 0.989374013677012, + "grad_norm": 1.2661974430084229, + "learning_rate": 5.922458067639802e-08, + "loss": 2.0856, + "step": 9404 + }, + { + "epoch": 0.9894792214623882, + "grad_norm": 1.692745566368103, + "learning_rate": 5.805773539634407e-08, + "loss": 1.5816, + "step": 9405 + }, + { + "epoch": 0.9895844292477644, + "grad_norm": 1.9099050760269165, + "learning_rate": 5.690249604495268e-08, + "loss": 1.4334, + "step": 9406 + }, + { + "epoch": 0.9896896370331405, + "grad_norm": 1.8465721607208252, + "learning_rate": 5.57588627563721e-08, + "loss": 1.7839, + "step": 9407 + }, + { + "epoch": 0.9897948448185165, + "grad_norm": 1.444919228553772, + "learning_rate": 5.46268356634072e-08, + "loss": 1.5675, + "step": 9408 + }, + { + "epoch": 0.9899000526038927, + "grad_norm": 1.517018437385559, + "learning_rate": 5.3506414897508404e-08, + "loss": 1.4653, + "step": 9409 + }, + { + "epoch": 0.9900052603892688, + "grad_norm": 1.6585253477096558, + "learning_rate": 5.239760058879384e-08, + "loss": 2.2469, + "step": 9410 + }, + { + "epoch": 0.9901104681746449, + "grad_norm": 1.9332027435302734, + "learning_rate": 5.130039286602717e-08, + "loss": 1.6433, + "step": 9411 + }, + { + "epoch": 0.990215675960021, + "grad_norm": 1.3716708421707153, + "learning_rate": 5.02147918566287e-08, + "loss": 1.4928, + "step": 9412 + }, + { + "epoch": 0.9903208837453972, + "grad_norm": 2.6388068199157715, + "learning_rate": 4.9140797686653136e-08, + "loss": 1.4978, + "step": 9413 + }, + { + "epoch": 0.9904260915307733, + "grad_norm": 1.895566463470459, + "learning_rate": 4.807841048082296e-08, + "loss": 1.6034, + "step": 9414 + }, + { + "epoch": 0.9905312993161494, + "grad_norm": 1.3241360187530518, + "learning_rate": 4.702763036252833e-08, + "loss": 1.7537, + "step": 9415 + }, + { + "epoch": 0.9906365071015255, + "grad_norm": 2.1230876445770264, + "learning_rate": 4.598845745376057e-08, + "loss": 1.9085, + "step": 9416 + }, + { + "epoch": 0.9907417148869017, + "grad_norm": 2.1953372955322266, + "learning_rate": 4.496089187522312e-08, + "loss": 1.5051, + "step": 9417 + }, + { + "epoch": 0.9908469226722777, + "grad_norm": 1.0922819375991821, + "learning_rate": 4.3944933746231655e-08, + "loss": 1.938, + "step": 9418 + }, + { + "epoch": 0.9909521304576538, + "grad_norm": 1.2908408641815186, + "learning_rate": 4.294058318475846e-08, + "loss": 1.4622, + "step": 9419 + }, + { + "epoch": 0.99105733824303, + "grad_norm": 1.8100134134292603, + "learning_rate": 4.194784030745469e-08, + "loss": 1.8774, + "step": 9420 + }, + { + "epoch": 0.9911625460284061, + "grad_norm": 1.8238705396652222, + "learning_rate": 4.096670522959478e-08, + "loss": 1.5957, + "step": 9421 + }, + { + "epoch": 0.9912677538137822, + "grad_norm": 1.4968173503875732, + "learning_rate": 3.999717806510983e-08, + "loss": 1.6067, + "step": 9422 + }, + { + "epoch": 0.9913729615991583, + "grad_norm": 1.1250231266021729, + "learning_rate": 3.903925892658755e-08, + "loss": 1.5794, + "step": 9423 + }, + { + "epoch": 0.9914781693845345, + "grad_norm": 1.5623326301574707, + "learning_rate": 3.809294792527229e-08, + "loss": 1.7024, + "step": 9424 + }, + { + "epoch": 0.9915833771699105, + "grad_norm": 2.0509960651397705, + "learning_rate": 3.715824517106503e-08, + "loss": 1.489, + "step": 9425 + }, + { + "epoch": 0.9916885849552867, + "grad_norm": 1.0217515230178833, + "learning_rate": 3.623515077250117e-08, + "loss": 1.5209, + "step": 9426 + }, + { + "epoch": 0.9917937927406628, + "grad_norm": 1.395032525062561, + "learning_rate": 3.532366483677274e-08, + "loss": 1.8385, + "step": 9427 + }, + { + "epoch": 0.991899000526039, + "grad_norm": 1.5118807554244995, + "learning_rate": 3.442378746972841e-08, + "loss": 1.3888, + "step": 9428 + }, + { + "epoch": 0.992004208311415, + "grad_norm": 1.96005117893219, + "learning_rate": 3.3535518775873466e-08, + "loss": 1.5879, + "step": 9429 + }, + { + "epoch": 0.9921094160967912, + "grad_norm": 1.319014072418213, + "learning_rate": 3.265885885835873e-08, + "loss": 1.8016, + "step": 9430 + }, + { + "epoch": 0.9922146238821673, + "grad_norm": 1.2182525396347046, + "learning_rate": 3.179380781898056e-08, + "loss": 1.9213, + "step": 9431 + }, + { + "epoch": 0.9923198316675435, + "grad_norm": 1.6044673919677734, + "learning_rate": 3.0940365758203025e-08, + "loss": 1.421, + "step": 9432 + }, + { + "epoch": 0.9924250394529195, + "grad_norm": 1.7028839588165283, + "learning_rate": 3.009853277512464e-08, + "loss": 1.4184, + "step": 9433 + }, + { + "epoch": 0.9925302472382956, + "grad_norm": 1.4499717950820923, + "learning_rate": 2.9268308967522752e-08, + "loss": 1.8255, + "step": 9434 + }, + { + "epoch": 0.9926354550236718, + "grad_norm": 2.1826236248016357, + "learning_rate": 2.844969443178691e-08, + "loss": 2.1099, + "step": 9435 + }, + { + "epoch": 0.9927406628090478, + "grad_norm": 1.6331896781921387, + "learning_rate": 2.7642689262996625e-08, + "loss": 2.0769, + "step": 9436 + }, + { + "epoch": 0.992845870594424, + "grad_norm": 1.0556201934814453, + "learning_rate": 2.6847293554854712e-08, + "loss": 1.2877, + "step": 9437 + }, + { + "epoch": 0.9929510783798001, + "grad_norm": 1.582120656967163, + "learning_rate": 2.6063507399731735e-08, + "loss": 1.6723, + "step": 9438 + }, + { + "epoch": 0.9930562861651763, + "grad_norm": 1.865082859992981, + "learning_rate": 2.5291330888643772e-08, + "loss": 1.612, + "step": 9439 + }, + { + "epoch": 0.9931614939505523, + "grad_norm": 1.324381709098816, + "learning_rate": 2.453076411127464e-08, + "loss": 2.0149, + "step": 9440 + }, + { + "epoch": 0.9932667017359285, + "grad_norm": 1.3037350177764893, + "learning_rate": 2.378180715593148e-08, + "loss": 1.6584, + "step": 9441 + }, + { + "epoch": 0.9933719095213046, + "grad_norm": 2.0570385456085205, + "learning_rate": 2.304446010958916e-08, + "loss": 1.0094, + "step": 9442 + }, + { + "epoch": 0.9934771173066806, + "grad_norm": 2.5256316661834717, + "learning_rate": 2.2318723057879188e-08, + "loss": 1.3119, + "step": 9443 + }, + { + "epoch": 0.9935823250920568, + "grad_norm": 1.4097765684127808, + "learning_rate": 2.1604596085078587e-08, + "loss": 1.5828, + "step": 9444 + }, + { + "epoch": 0.9936875328774329, + "grad_norm": 1.6299117803573608, + "learning_rate": 2.0902079274121024e-08, + "loss": 1.523, + "step": 9445 + }, + { + "epoch": 0.9937927406628091, + "grad_norm": 1.7055869102478027, + "learning_rate": 2.0211172706574576e-08, + "loss": 1.7484, + "step": 9446 + }, + { + "epoch": 0.9938979484481851, + "grad_norm": 1.4784151315689087, + "learning_rate": 1.953187646268617e-08, + "loss": 1.4377, + "step": 9447 + }, + { + "epoch": 0.9940031562335613, + "grad_norm": 1.5295408964157104, + "learning_rate": 1.886419062132605e-08, + "loss": 1.6043, + "step": 9448 + }, + { + "epoch": 0.9941083640189374, + "grad_norm": 1.7209928035736084, + "learning_rate": 1.8208115260032187e-08, + "loss": 1.236, + "step": 9449 + }, + { + "epoch": 0.9942135718043135, + "grad_norm": 1.6919333934783936, + "learning_rate": 1.7563650455010295e-08, + "loss": 1.4099, + "step": 9450 + }, + { + "epoch": 0.9943187795896896, + "grad_norm": 1.644214391708374, + "learning_rate": 1.6930796281078297e-08, + "loss": 1.8462, + "step": 9451 + }, + { + "epoch": 0.9944239873750658, + "grad_norm": 1.6857322454452515, + "learning_rate": 1.6309552811744067e-08, + "loss": 1.568, + "step": 9452 + }, + { + "epoch": 0.9945291951604419, + "grad_norm": 1.1969326734542847, + "learning_rate": 1.569992011913879e-08, + "loss": 2.0799, + "step": 9453 + }, + { + "epoch": 0.994634402945818, + "grad_norm": 1.4197190999984741, + "learning_rate": 1.5101898274050286e-08, + "loss": 1.6347, + "step": 9454 + }, + { + "epoch": 0.9947396107311941, + "grad_norm": 1.7226569652557373, + "learning_rate": 1.4515487345956313e-08, + "loss": 1.5217, + "step": 9455 + }, + { + "epoch": 0.9948448185165703, + "grad_norm": 1.4008227586746216, + "learning_rate": 1.3940687402924646e-08, + "loss": 1.7268, + "step": 9456 + }, + { + "epoch": 0.9949500263019463, + "grad_norm": 1.268113613128662, + "learning_rate": 1.3377498511712993e-08, + "loss": 1.6932, + "step": 9457 + }, + { + "epoch": 0.9950552340873224, + "grad_norm": 1.930999994277954, + "learning_rate": 1.2825920737724596e-08, + "loss": 1.7915, + "step": 9458 + }, + { + "epoch": 0.9951604418726986, + "grad_norm": 2.0582447052001953, + "learning_rate": 1.2285954145008216e-08, + "loss": 1.3104, + "step": 9459 + }, + { + "epoch": 0.9952656496580747, + "grad_norm": 1.6370129585266113, + "learning_rate": 1.1757598796280355e-08, + "loss": 1.6174, + "step": 9460 + }, + { + "epoch": 0.9953708574434508, + "grad_norm": 1.1519914865493774, + "learning_rate": 1.1240854752880836e-08, + "loss": 1.6631, + "step": 9461 + }, + { + "epoch": 0.9954760652288269, + "grad_norm": 1.6220288276672363, + "learning_rate": 1.0735722074828313e-08, + "loss": 2.0331, + "step": 9462 + }, + { + "epoch": 0.9955812730142031, + "grad_norm": 1.75868558883667, + "learning_rate": 1.0242200820786974e-08, + "loss": 1.8015, + "step": 9463 + }, + { + "epoch": 0.9956864807995792, + "grad_norm": 1.8576023578643799, + "learning_rate": 9.760291048055426e-09, + "loss": 1.8266, + "step": 9464 + }, + { + "epoch": 0.9957916885849553, + "grad_norm": 1.2487952709197998, + "learning_rate": 9.289992812600012e-09, + "loss": 2.035, + "step": 9465 + }, + { + "epoch": 0.9958968963703314, + "grad_norm": 2.236783027648926, + "learning_rate": 8.831306169032604e-09, + "loss": 1.9039, + "step": 9466 + }, + { + "epoch": 0.9960021041557076, + "grad_norm": 1.4378468990325928, + "learning_rate": 8.384231170632805e-09, + "loss": 1.9437, + "step": 9467 + }, + { + "epoch": 0.9961073119410836, + "grad_norm": 1.5383528470993042, + "learning_rate": 7.94876786929244e-09, + "loss": 1.2174, + "step": 9468 + }, + { + "epoch": 0.9962125197264597, + "grad_norm": 3.4968624114990234, + "learning_rate": 7.52491631560437e-09, + "loss": 1.4205, + "step": 9469 + }, + { + "epoch": 0.9963177275118359, + "grad_norm": 1.9415160417556763, + "learning_rate": 7.112676558784781e-09, + "loss": 1.4909, + "step": 9470 + }, + { + "epoch": 0.996422935297212, + "grad_norm": 1.2908811569213867, + "learning_rate": 6.71204864669539e-09, + "loss": 1.5163, + "step": 9471 + }, + { + "epoch": 0.9965281430825881, + "grad_norm": 1.1630277633666992, + "learning_rate": 6.3230326258656435e-09, + "loss": 1.5793, + "step": 9472 + }, + { + "epoch": 0.9966333508679642, + "grad_norm": 1.6427892446517944, + "learning_rate": 5.945628541481619e-09, + "loss": 1.774, + "step": 9473 + }, + { + "epoch": 0.9967385586533404, + "grad_norm": 1.8046869039535522, + "learning_rate": 5.579836437341612e-09, + "loss": 1.6256, + "step": 9474 + }, + { + "epoch": 0.9968437664387164, + "grad_norm": 2.09666109085083, + "learning_rate": 5.225656355956066e-09, + "loss": 1.5055, + "step": 9475 + }, + { + "epoch": 0.9969489742240926, + "grad_norm": 1.8133103847503662, + "learning_rate": 4.883088338425434e-09, + "loss": 1.7931, + "step": 9476 + }, + { + "epoch": 0.9970541820094687, + "grad_norm": 1.8678925037384033, + "learning_rate": 4.552132424562317e-09, + "loss": 1.9548, + "step": 9477 + }, + { + "epoch": 0.9971593897948449, + "grad_norm": 1.6927310228347778, + "learning_rate": 4.2327886527693265e-09, + "loss": 1.4532, + "step": 9478 + }, + { + "epoch": 0.9972645975802209, + "grad_norm": 1.3339251279830933, + "learning_rate": 3.925057060150117e-09, + "loss": 1.4882, + "step": 9479 + }, + { + "epoch": 0.997369805365597, + "grad_norm": 2.199472188949585, + "learning_rate": 3.628937682431666e-09, + "loss": 1.782, + "step": 9480 + }, + { + "epoch": 0.9974750131509732, + "grad_norm": 1.5956065654754639, + "learning_rate": 3.344430554008682e-09, + "loss": 1.7815, + "step": 9481 + }, + { + "epoch": 0.9975802209363492, + "grad_norm": 1.884323000907898, + "learning_rate": 3.0715357079103003e-09, + "loss": 1.3084, + "step": 9482 + }, + { + "epoch": 0.9976854287217254, + "grad_norm": 1.6075438261032104, + "learning_rate": 2.8102531758333883e-09, + "loss": 1.4152, + "step": 9483 + }, + { + "epoch": 0.9977906365071015, + "grad_norm": 1.7321268320083618, + "learning_rate": 2.5605829881203414e-09, + "loss": 1.776, + "step": 9484 + }, + { + "epoch": 0.9978958442924777, + "grad_norm": 1.7372652292251587, + "learning_rate": 2.3225251737701846e-09, + "loss": 1.2503, + "step": 9485 + }, + { + "epoch": 0.9980010520778537, + "grad_norm": 1.7862646579742432, + "learning_rate": 2.0960797604052675e-09, + "loss": 1.4679, + "step": 9486 + }, + { + "epoch": 0.9981062598632299, + "grad_norm": 1.7311826944351196, + "learning_rate": 1.881246774348977e-09, + "loss": 0.6319, + "step": 9487 + }, + { + "epoch": 0.998211467648606, + "grad_norm": 2.2221319675445557, + "learning_rate": 1.678026240536923e-09, + "loss": 1.4301, + "step": 9488 + }, + { + "epoch": 0.9983166754339821, + "grad_norm": 1.4736319780349731, + "learning_rate": 1.4864181825613444e-09, + "loss": 1.6957, + "step": 9489 + }, + { + "epoch": 0.9984218832193582, + "grad_norm": 1.5542229413986206, + "learning_rate": 1.3064226226933152e-09, + "loss": 1.4633, + "step": 9490 + }, + { + "epoch": 0.9985270910047344, + "grad_norm": 1.928504467010498, + "learning_rate": 1.1380395818050282e-09, + "loss": 1.9097, + "step": 9491 + }, + { + "epoch": 0.9986322987901105, + "grad_norm": 1.4703706502914429, + "learning_rate": 9.812690794808177e-10, + "loss": 1.5353, + "step": 9492 + }, + { + "epoch": 0.9987375065754865, + "grad_norm": 1.7381463050842285, + "learning_rate": 8.361111339061367e-10, + "loss": 1.562, + "step": 9493 + }, + { + "epoch": 0.9988427143608627, + "grad_norm": 2.3469841480255127, + "learning_rate": 7.025657619563752e-10, + "loss": 1.756, + "step": 9494 + }, + { + "epoch": 0.9989479221462388, + "grad_norm": 1.3420002460479736, + "learning_rate": 5.806329791191445e-10, + "loss": 1.4611, + "step": 9495 + }, + { + "epoch": 0.999053129931615, + "grad_norm": 1.709895372390747, + "learning_rate": 4.703127995608902e-10, + "loss": 1.4044, + "step": 9496 + }, + { + "epoch": 0.999158337716991, + "grad_norm": 1.3718494176864624, + "learning_rate": 3.716052360935862e-10, + "loss": 1.7666, + "step": 9497 + }, + { + "epoch": 0.9992635455023672, + "grad_norm": 1.1952706575393677, + "learning_rate": 2.8451030018583623e-10, + "loss": 1.8885, + "step": 9498 + }, + { + "epoch": 0.9993687532877433, + "grad_norm": 1.6405409574508667, + "learning_rate": 2.0902800194066985e-10, + "loss": 1.5163, + "step": 9499 + }, + { + "epoch": 0.9994739610731194, + "grad_norm": 1.4296813011169434, + "learning_rate": 1.4515835012884893e-10, + "loss": 1.4754, + "step": 9500 + } + ], + "logging_steps": 1.0, + "max_steps": 9505, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.969403554197504e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}