diff --git "a/VideoXL_weight_8/trainer_state.json" "b/VideoXL_weight_8/trainer_state.json" new file mode 100644--- /dev/null +++ "b/VideoXL_weight_8/trainer_state.json" @@ -0,0 +1,105021 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.218298848707588, + "eval_steps": 500, + "global_step": 15000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 63.09210744064006, + "learning_rate": 2.702702702702703e-08, + "loss": 2.0907, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 41.448049465342834, + "learning_rate": 5.405405405405406e-08, + "loss": 2.0215, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 47.026785887695155, + "learning_rate": 8.108108108108109e-08, + "loss": 2.0995, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 52.83417573592187, + "learning_rate": 1.0810810810810812e-07, + "loss": 2.1134, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 29.752645757077097, + "learning_rate": 1.3513513513513515e-07, + "loss": 2.1541, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 41.74351662988793, + "learning_rate": 1.6216216216216218e-07, + "loss": 2.2341, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 47.13605325623752, + "learning_rate": 1.8918918918918921e-07, + "loss": 2.3537, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 40.05344148605918, + "learning_rate": 2.1621621621621625e-07, + "loss": 2.1526, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 41.606123543309764, + "learning_rate": 2.4324324324324326e-07, + "loss": 2.2158, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 47.3919369207522, + "learning_rate": 2.702702702702703e-07, + "loss": 2.2508, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 44.71142856608848, + "learning_rate": 2.972972972972973e-07, + "loss": 2.1635, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 42.27079352967162, + "learning_rate": 3.2432432432432436e-07, + "loss": 2.2709, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 35.78531807299802, + "learning_rate": 3.513513513513514e-07, + "loss": 2.1149, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 49.284697480404766, + "learning_rate": 3.7837837837837843e-07, + "loss": 2.2079, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 40.71527633576261, + "learning_rate": 4.0540540540540546e-07, + "loss": 1.853, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 46.29481642359868, + "learning_rate": 4.324324324324325e-07, + "loss": 1.9446, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 38.30865061994851, + "learning_rate": 4.5945945945945953e-07, + "loss": 2.0898, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 43.88696790640592, + "learning_rate": 4.864864864864865e-07, + "loss": 1.8325, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 25.462836661475112, + "learning_rate": 5.135135135135135e-07, + "loss": 1.6462, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 25.210403543119014, + "learning_rate": 5.405405405405406e-07, + "loss": 1.6375, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 27.033049559916677, + "learning_rate": 5.675675675675676e-07, + "loss": 1.6669, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 23.7804483699658, + "learning_rate": 5.945945945945947e-07, + "loss": 1.6073, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 41.65583695037341, + "learning_rate": 6.216216216216217e-07, + "loss": 1.5485, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 19.50002166627634, + "learning_rate": 6.486486486486487e-07, + "loss": 1.4954, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 15.840159645176156, + "learning_rate": 6.756756756756758e-07, + "loss": 1.463, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 14.39522714576107, + "learning_rate": 7.027027027027028e-07, + "loss": 1.1524, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 10.525545979505534, + "learning_rate": 7.297297297297298e-07, + "loss": 1.2525, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 21.623559694339768, + "learning_rate": 7.567567567567569e-07, + "loss": 1.2244, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 12.438738315604772, + "learning_rate": 7.837837837837839e-07, + "loss": 1.242, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 8.791264076001971, + "learning_rate": 8.108108108108109e-07, + "loss": 1.123, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 8.444522217696221, + "learning_rate": 8.37837837837838e-07, + "loss": 1.128, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 11.80937189736569, + "learning_rate": 8.64864864864865e-07, + "loss": 1.1132, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 12.134902430214023, + "learning_rate": 8.91891891891892e-07, + "loss": 1.1739, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 24.27912192427012, + "learning_rate": 9.189189189189191e-07, + "loss": 1.0642, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 13.074055779236055, + "learning_rate": 9.459459459459461e-07, + "loss": 1.5063, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 8.454507776358321, + "learning_rate": 9.72972972972973e-07, + "loss": 0.9876, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 9.362154008331844, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.3223, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 11.847660888509978, + "learning_rate": 1.027027027027027e-06, + "loss": 1.0352, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 6.8948029511869615, + "learning_rate": 1.0540540540540542e-06, + "loss": 1.115, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 5.656382557200861, + "learning_rate": 1.0810810810810812e-06, + "loss": 1.3187, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 6.842034747268024, + "learning_rate": 1.1081081081081083e-06, + "loss": 1.1373, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 5.9197907315298846, + "learning_rate": 1.1351351351351352e-06, + "loss": 0.9229, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 4.946532136719213, + "learning_rate": 1.1621621621621624e-06, + "loss": 1.2383, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 5.933609927910655, + "learning_rate": 1.1891891891891893e-06, + "loss": 1.0346, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 6.49703904887925, + "learning_rate": 1.2162162162162164e-06, + "loss": 1.0387, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 4.762039043147149, + "learning_rate": 1.2432432432432434e-06, + "loss": 0.9998, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 3.92883755344509, + "learning_rate": 1.2702702702702705e-06, + "loss": 1.0296, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 7.005043590598583, + "learning_rate": 1.2972972972972974e-06, + "loss": 1.1131, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 5.354632966884938, + "learning_rate": 1.3243243243243246e-06, + "loss": 1.0134, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 4.556566511271708, + "learning_rate": 1.3513513513513515e-06, + "loss": 0.859, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 5.632328269570113, + "learning_rate": 1.3783783783783786e-06, + "loss": 1.0472, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 4.9486532482865435, + "learning_rate": 1.4054054054054056e-06, + "loss": 0.9433, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 4.865192625631578, + "learning_rate": 1.4324324324324327e-06, + "loss": 1.1251, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 4.357689894765142, + "learning_rate": 1.4594594594594596e-06, + "loss": 1.0328, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 3.718722198461544, + "learning_rate": 1.4864864864864868e-06, + "loss": 1.0607, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 5.324459427328635, + "learning_rate": 1.5135135135135137e-06, + "loss": 0.9641, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 4.866079854989161, + "learning_rate": 1.5405405405405409e-06, + "loss": 1.1034, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 4.98776615712582, + "learning_rate": 1.5675675675675678e-06, + "loss": 1.0333, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 4.711843051671795, + "learning_rate": 1.5945945945945947e-06, + "loss": 1.1368, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 3.96747806209191, + "learning_rate": 1.6216216216216219e-06, + "loss": 1.0649, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 4.807553599493247, + "learning_rate": 1.6486486486486488e-06, + "loss": 0.8842, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 3.2736315334006445, + "learning_rate": 1.675675675675676e-06, + "loss": 0.9168, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 4.182243485448351, + "learning_rate": 1.7027027027027028e-06, + "loss": 0.9442, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 4.4850171249159985, + "learning_rate": 1.72972972972973e-06, + "loss": 0.8695, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 4.44246074518427, + "learning_rate": 1.756756756756757e-06, + "loss": 0.9205, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 4.2871223675437555, + "learning_rate": 1.783783783783784e-06, + "loss": 0.9512, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 20.313595128982193, + "learning_rate": 1.810810810810811e-06, + "loss": 0.9122, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 3.4560526712826176, + "learning_rate": 1.8378378378378381e-06, + "loss": 0.915, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 3.7990307229254463, + "learning_rate": 1.864864864864865e-06, + "loss": 0.9674, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 3.920458916035328, + "learning_rate": 1.8918918918918922e-06, + "loss": 0.9847, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 4.222083142609564, + "learning_rate": 1.918918918918919e-06, + "loss": 1.0758, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 3.548309752947073, + "learning_rate": 1.945945945945946e-06, + "loss": 0.9791, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 3.622733873280609, + "learning_rate": 1.9729729729729734e-06, + "loss": 0.9241, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 3.173226704615171, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8568, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 3.319861917917085, + "learning_rate": 2.0270270270270273e-06, + "loss": 0.93, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 3.340663374122506, + "learning_rate": 2.054054054054054e-06, + "loss": 1.0522, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 3.6689457635724394, + "learning_rate": 2.0810810810810815e-06, + "loss": 0.8734, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 3.9546214054278646, + "learning_rate": 2.1081081081081085e-06, + "loss": 1.309, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 4.1361774814504955, + "learning_rate": 2.1351351351351354e-06, + "loss": 1.0352, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 3.933341932972953, + "learning_rate": 2.1621621621621623e-06, + "loss": 1.1228, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 4.352479533301301, + "learning_rate": 2.1891891891891897e-06, + "loss": 0.9103, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 4.620591320930049, + "learning_rate": 2.2162162162162166e-06, + "loss": 1.0265, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.8703432705528513, + "learning_rate": 2.2432432432432435e-06, + "loss": 0.9905, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 4.393478414302074, + "learning_rate": 2.2702702702702705e-06, + "loss": 1.0622, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 6.265928855922091, + "learning_rate": 2.297297297297298e-06, + "loss": 1.1032, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 4.107673309909188, + "learning_rate": 2.3243243243243247e-06, + "loss": 0.8631, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 3.835601844883204, + "learning_rate": 2.3513513513513517e-06, + "loss": 1.0654, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 3.7545219775073124, + "learning_rate": 2.3783783783783786e-06, + "loss": 1.0408, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 4.924451663306775, + "learning_rate": 2.4054054054054055e-06, + "loss": 0.9569, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.8906953580773687, + "learning_rate": 2.432432432432433e-06, + "loss": 0.9616, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 3.575749328563311, + "learning_rate": 2.45945945945946e-06, + "loss": 1.0187, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 8.469633477745303, + "learning_rate": 2.4864864864864867e-06, + "loss": 1.1732, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 3.3897385105814117, + "learning_rate": 2.5135135135135137e-06, + "loss": 0.8862, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 3.7948253205051694, + "learning_rate": 2.540540540540541e-06, + "loss": 1.0327, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 5.371187255522291, + "learning_rate": 2.5675675675675675e-06, + "loss": 1.034, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 4.560170089873212, + "learning_rate": 2.594594594594595e-06, + "loss": 0.8899, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 4.671898125348859, + "learning_rate": 2.621621621621622e-06, + "loss": 0.791, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 3.76292106021511, + "learning_rate": 2.648648648648649e-06, + "loss": 0.8861, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 5.3526080641590505, + "learning_rate": 2.6756756756756757e-06, + "loss": 0.9757, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 6.325620004738459, + "learning_rate": 2.702702702702703e-06, + "loss": 1.0721, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 3.7766975500508364, + "learning_rate": 2.72972972972973e-06, + "loss": 0.9764, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 3.833312727347566, + "learning_rate": 2.7567567567567573e-06, + "loss": 1.0064, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 4.274198012440263, + "learning_rate": 2.783783783783784e-06, + "loss": 0.9874, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 4.292666283682956, + "learning_rate": 2.810810810810811e-06, + "loss": 0.9572, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 8.044630585777997, + "learning_rate": 2.837837837837838e-06, + "loss": 0.9615, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 10.377028267012342, + "learning_rate": 2.8648648648648654e-06, + "loss": 0.7981, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 6.230962434779337, + "learning_rate": 2.891891891891892e-06, + "loss": 0.9214, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 5.078803174877479, + "learning_rate": 2.9189189189189193e-06, + "loss": 0.9328, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 9.178026998368148, + "learning_rate": 2.9459459459459462e-06, + "loss": 0.8246, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 3.632601979523958, + "learning_rate": 2.9729729729729736e-06, + "loss": 1.0224, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 6.064716985739635, + "learning_rate": 3e-06, + "loss": 1.0143, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 5.987949350631371, + "learning_rate": 3.0270270270270274e-06, + "loss": 0.9865, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 4.197057468104055, + "learning_rate": 3.0540540540540544e-06, + "loss": 0.967, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 5.023760757753928, + "learning_rate": 3.0810810810810817e-06, + "loss": 0.9915, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 5.949813120525793, + "learning_rate": 3.1081081081081082e-06, + "loss": 0.9101, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 4.733118568805419, + "learning_rate": 3.1351351351351356e-06, + "loss": 0.9872, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 4.920966341519552, + "learning_rate": 3.1621621621621625e-06, + "loss": 0.8534, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 5.071200176079295, + "learning_rate": 3.1891891891891894e-06, + "loss": 1.1142, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 4.889251040672398, + "learning_rate": 3.2162162162162164e-06, + "loss": 0.9795, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 5.9299095643721875, + "learning_rate": 3.2432432432432437e-06, + "loss": 0.8322, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 10.5772155570809, + "learning_rate": 3.2702702702702706e-06, + "loss": 0.937, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 5.747279093538681, + "learning_rate": 3.2972972972972976e-06, + "loss": 0.9593, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 11.953456917374227, + "learning_rate": 3.3243243243243245e-06, + "loss": 0.9224, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 4.896396900943141, + "learning_rate": 3.351351351351352e-06, + "loss": 0.8117, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 4.8692754198154, + "learning_rate": 3.3783783783783788e-06, + "loss": 0.9104, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 6.34690544735298, + "learning_rate": 3.4054054054054057e-06, + "loss": 1.0861, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 5.362948551124936, + "learning_rate": 3.4324324324324326e-06, + "loss": 0.8018, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 4.653801965753808, + "learning_rate": 3.45945945945946e-06, + "loss": 0.7235, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 5.489050527532971, + "learning_rate": 3.4864864864864865e-06, + "loss": 0.8185, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 5.124224436914261, + "learning_rate": 3.513513513513514e-06, + "loss": 0.7926, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 10.999712639206107, + "learning_rate": 3.5405405405405408e-06, + "loss": 1.0187, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 5.719248554836371, + "learning_rate": 3.567567567567568e-06, + "loss": 0.9715, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 6.850255386958801, + "learning_rate": 3.5945945945945946e-06, + "loss": 0.9046, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 5.275246259471631, + "learning_rate": 3.621621621621622e-06, + "loss": 1.0054, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 9.780966148279665, + "learning_rate": 3.648648648648649e-06, + "loss": 0.9502, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 5.857996828125333, + "learning_rate": 3.6756756756756763e-06, + "loss": 0.8745, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 18.804929852954515, + "learning_rate": 3.7027027027027028e-06, + "loss": 0.8358, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 6.666580613271323, + "learning_rate": 3.72972972972973e-06, + "loss": 0.9765, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 16.59079768970637, + "learning_rate": 3.756756756756757e-06, + "loss": 0.9116, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 10.078376920826488, + "learning_rate": 3.7837837837837844e-06, + "loss": 0.947, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 5.230742828123224, + "learning_rate": 3.810810810810811e-06, + "loss": 0.7936, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 6.251875425533412, + "learning_rate": 3.837837837837838e-06, + "loss": 0.7847, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 21.089064452089342, + "learning_rate": 3.864864864864865e-06, + "loss": 0.805, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 6.813479314855189, + "learning_rate": 3.891891891891892e-06, + "loss": 0.8976, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 8.473649775180455, + "learning_rate": 3.918918918918919e-06, + "loss": 0.7821, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 5.811233637559505, + "learning_rate": 3.945945945945947e-06, + "loss": 0.9501, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 4.985364552629455, + "learning_rate": 3.972972972972973e-06, + "loss": 0.9514, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 7.431266694054168, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7551, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 4.753559866561872, + "learning_rate": 4.027027027027028e-06, + "loss": 1.1062, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 9.58835384775318, + "learning_rate": 4.0540540540540545e-06, + "loss": 0.7539, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 7.457588006162735, + "learning_rate": 4.0810810810810815e-06, + "loss": 0.9726, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 6.596394144876574, + "learning_rate": 4.108108108108108e-06, + "loss": 0.8226, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 6.733853503611332, + "learning_rate": 4.135135135135135e-06, + "loss": 0.87, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 5.71577603498809, + "learning_rate": 4.162162162162163e-06, + "loss": 0.7438, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 5.658958532440653, + "learning_rate": 4.189189189189189e-06, + "loss": 0.7735, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 8.494352208903642, + "learning_rate": 4.216216216216217e-06, + "loss": 0.8795, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 7.513746715130291, + "learning_rate": 4.243243243243244e-06, + "loss": 0.8419, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 14.129489357210856, + "learning_rate": 4.270270270270271e-06, + "loss": 0.8887, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 6.485982506631931, + "learning_rate": 4.297297297297298e-06, + "loss": 0.7415, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 7.495307495484502, + "learning_rate": 4.324324324324325e-06, + "loss": 0.8468, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 11.894580487680233, + "learning_rate": 4.351351351351352e-06, + "loss": 0.8536, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 28.270425570727863, + "learning_rate": 4.378378378378379e-06, + "loss": 0.8292, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 17.981107840077165, + "learning_rate": 4.4054054054054054e-06, + "loss": 0.8553, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 7.740632578049493, + "learning_rate": 4.432432432432433e-06, + "loss": 0.9331, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 6.083395560536381, + "learning_rate": 4.45945945945946e-06, + "loss": 0.9074, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 4.160025202563131, + "learning_rate": 4.486486486486487e-06, + "loss": 0.8046, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 8.556373648188933, + "learning_rate": 4.513513513513514e-06, + "loss": 0.7178, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 8.174998528877031, + "learning_rate": 4.540540540540541e-06, + "loss": 1.0875, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 6.6416170532041425, + "learning_rate": 4.567567567567568e-06, + "loss": 0.8532, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 7.585986727506687, + "learning_rate": 4.594594594594596e-06, + "loss": 0.8369, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 6.119169963884442, + "learning_rate": 4.621621621621622e-06, + "loss": 0.7298, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 18.974694130198262, + "learning_rate": 4.6486486486486495e-06, + "loss": 0.9494, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 13.577484804958662, + "learning_rate": 4.675675675675676e-06, + "loss": 0.7836, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 6.473307435218165, + "learning_rate": 4.702702702702703e-06, + "loss": 0.8785, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 12.309467131845393, + "learning_rate": 4.72972972972973e-06, + "loss": 0.9952, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 167.12808754379284, + "learning_rate": 4.756756756756757e-06, + "loss": 0.9495, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 9.989622733894004, + "learning_rate": 4.783783783783784e-06, + "loss": 0.7514, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 13.067919078628053, + "learning_rate": 4.810810810810811e-06, + "loss": 0.8249, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 7.935641593095554, + "learning_rate": 4.837837837837838e-06, + "loss": 0.6688, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 8.893171488568813, + "learning_rate": 4.864864864864866e-06, + "loss": 0.8727, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 7.3781143637466995, + "learning_rate": 4.891891891891893e-06, + "loss": 0.7877, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 4.358170942522939, + "learning_rate": 4.91891891891892e-06, + "loss": 0.8175, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 13.741369902994137, + "learning_rate": 4.9459459459459466e-06, + "loss": 0.7942, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 5.918663857808989, + "learning_rate": 4.9729729729729735e-06, + "loss": 0.9443, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 8.353429675245378, + "learning_rate": 5e-06, + "loss": 0.7864, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 6.7497784790141315, + "learning_rate": 5.027027027027027e-06, + "loss": 0.795, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 15.351375545207794, + "learning_rate": 5.054054054054054e-06, + "loss": 0.8987, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 14.258534725657086, + "learning_rate": 5.081081081081082e-06, + "loss": 0.9285, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 6.817064617260092, + "learning_rate": 5.108108108108108e-06, + "loss": 0.973, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 17.3062496560883, + "learning_rate": 5.135135135135135e-06, + "loss": 0.708, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 7.203174443499325, + "learning_rate": 5.162162162162162e-06, + "loss": 0.9388, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 40.67582300539913, + "learning_rate": 5.18918918918919e-06, + "loss": 1.0517, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 6.624778758554718, + "learning_rate": 5.216216216216217e-06, + "loss": 1.0026, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 14.287986400957072, + "learning_rate": 5.243243243243244e-06, + "loss": 0.906, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 26.677020243165824, + "learning_rate": 5.2702702702702705e-06, + "loss": 0.77, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 10.717079814807025, + "learning_rate": 5.297297297297298e-06, + "loss": 0.8379, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 11.19930819211915, + "learning_rate": 5.324324324324324e-06, + "loss": 0.9831, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 43.4313433637369, + "learning_rate": 5.351351351351351e-06, + "loss": 0.8114, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 16.408372849316656, + "learning_rate": 5.378378378378378e-06, + "loss": 0.7613, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 23.725845507583394, + "learning_rate": 5.405405405405406e-06, + "loss": 0.8821, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 28.163039556065826, + "learning_rate": 5.432432432432433e-06, + "loss": 0.8628, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 16.159000089346666, + "learning_rate": 5.45945945945946e-06, + "loss": 0.7834, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 10.275740415764174, + "learning_rate": 5.486486486486487e-06, + "loss": 0.9121, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 7.390622591115062, + "learning_rate": 5.513513513513515e-06, + "loss": 0.8141, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 13.232428200498525, + "learning_rate": 5.540540540540541e-06, + "loss": 0.8234, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 62.07516521308197, + "learning_rate": 5.567567567567568e-06, + "loss": 0.7933, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 11.175694736971199, + "learning_rate": 5.5945945945945945e-06, + "loss": 0.8196, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 6.0444088873022, + "learning_rate": 5.621621621621622e-06, + "loss": 0.7836, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 39.72425191380748, + "learning_rate": 5.648648648648649e-06, + "loss": 0.7978, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 18.83072139201907, + "learning_rate": 5.675675675675676e-06, + "loss": 0.7496, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 19.39567354263128, + "learning_rate": 5.702702702702702e-06, + "loss": 0.8802, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 11.085087085604622, + "learning_rate": 5.729729729729731e-06, + "loss": 1.029, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 12.909762265401948, + "learning_rate": 5.756756756756757e-06, + "loss": 0.7876, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 16.11046008150788, + "learning_rate": 5.783783783783784e-06, + "loss": 0.8135, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 44.14717703238208, + "learning_rate": 5.810810810810811e-06, + "loss": 0.835, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 18.177978861554003, + "learning_rate": 5.837837837837839e-06, + "loss": 0.8438, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 66.71809569625383, + "learning_rate": 5.8648648648648655e-06, + "loss": 0.7977, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 14.840364705847634, + "learning_rate": 5.8918918918918924e-06, + "loss": 0.8885, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 9.681458196947158, + "learning_rate": 5.9189189189189185e-06, + "loss": 0.8049, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 9.887191247030259, + "learning_rate": 5.945945945945947e-06, + "loss": 0.9249, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 14.750787757895132, + "learning_rate": 5.972972972972973e-06, + "loss": 0.8357, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 18.087394846992474, + "learning_rate": 6e-06, + "loss": 0.8042, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 10.240412502278401, + "learning_rate": 6.027027027027027e-06, + "loss": 0.9406, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 12.269878789685151, + "learning_rate": 6.054054054054055e-06, + "loss": 0.848, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 21.600613059620084, + "learning_rate": 6.081081081081082e-06, + "loss": 0.7376, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 12.963309761319014, + "learning_rate": 6.108108108108109e-06, + "loss": 0.8527, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 10.669914035420659, + "learning_rate": 6.135135135135135e-06, + "loss": 0.7558, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 53.12530809986306, + "learning_rate": 6.162162162162163e-06, + "loss": 0.8773, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 36.05059540713405, + "learning_rate": 6.1891891891891895e-06, + "loss": 0.8124, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 8.365441593192777, + "learning_rate": 6.2162162162162164e-06, + "loss": 0.9012, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 21.84171449880579, + "learning_rate": 6.243243243243243e-06, + "loss": 1.0252, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 11.051045322010468, + "learning_rate": 6.270270270270271e-06, + "loss": 0.8, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 38.374965406477585, + "learning_rate": 6.297297297297298e-06, + "loss": 0.6923, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 8.842298304296136, + "learning_rate": 6.324324324324325e-06, + "loss": 0.6765, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 293.9493770531771, + "learning_rate": 6.351351351351351e-06, + "loss": 0.8428, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 12.708204678434027, + "learning_rate": 6.378378378378379e-06, + "loss": 0.763, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 11.827661198740815, + "learning_rate": 6.405405405405406e-06, + "loss": 0.9776, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 34.965119172103456, + "learning_rate": 6.432432432432433e-06, + "loss": 1.1438, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 12.538066150617878, + "learning_rate": 6.45945945945946e-06, + "loss": 0.771, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 11.680474102043787, + "learning_rate": 6.486486486486487e-06, + "loss": 0.9046, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 13.05422018950879, + "learning_rate": 6.513513513513514e-06, + "loss": 0.8716, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 42.99120943286823, + "learning_rate": 6.540540540540541e-06, + "loss": 1.0477, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 29.171658361373236, + "learning_rate": 6.567567567567567e-06, + "loss": 0.8403, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 19.917751544861098, + "learning_rate": 6.594594594594595e-06, + "loss": 0.7563, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 76.86140496135718, + "learning_rate": 6.621621621621622e-06, + "loss": 0.7932, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 8.143892348910658, + "learning_rate": 6.648648648648649e-06, + "loss": 0.6029, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 10.470600723252575, + "learning_rate": 6.675675675675676e-06, + "loss": 0.766, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 7.93404184031104, + "learning_rate": 6.702702702702704e-06, + "loss": 0.8127, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 9.90086838041159, + "learning_rate": 6.729729729729731e-06, + "loss": 0.8232, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 19.44330816468144, + "learning_rate": 6.7567567567567575e-06, + "loss": 0.7112, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 10.947870884598686, + "learning_rate": 6.783783783783784e-06, + "loss": 0.9063, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 12.326745122540027, + "learning_rate": 6.810810810810811e-06, + "loss": 0.9685, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 8.246205071186225, + "learning_rate": 6.837837837837838e-06, + "loss": 0.9441, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 35.1358639145026, + "learning_rate": 6.864864864864865e-06, + "loss": 1.0168, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 37.984856338464056, + "learning_rate": 6.891891891891892e-06, + "loss": 0.7486, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 13.464001338586746, + "learning_rate": 6.91891891891892e-06, + "loss": 0.8285, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 10.01327371335838, + "learning_rate": 6.945945945945947e-06, + "loss": 0.9119, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 9.943870994340054, + "learning_rate": 6.972972972972973e-06, + "loss": 0.9181, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 10.739836047039292, + "learning_rate": 7e-06, + "loss": 0.9713, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 42.29798024529886, + "learning_rate": 7.027027027027028e-06, + "loss": 0.667, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 22.42866752760827, + "learning_rate": 7.054054054054055e-06, + "loss": 0.7318, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 13.08598647990953, + "learning_rate": 7.0810810810810815e-06, + "loss": 0.8075, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 13.965215731114823, + "learning_rate": 7.1081081081081085e-06, + "loss": 1.0707, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 9.03690171729719, + "learning_rate": 7.135135135135136e-06, + "loss": 0.8516, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 13.126195171696416, + "learning_rate": 7.162162162162163e-06, + "loss": 0.8807, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 48.83167530366758, + "learning_rate": 7.189189189189189e-06, + "loss": 0.7823, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 11.281746359886013, + "learning_rate": 7.216216216216216e-06, + "loss": 0.8177, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 18.678521837489985, + "learning_rate": 7.243243243243244e-06, + "loss": 0.8439, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 48.95499846632492, + "learning_rate": 7.270270270270271e-06, + "loss": 0.8774, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 21.23848843609404, + "learning_rate": 7.297297297297298e-06, + "loss": 0.7706, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 7.103337137246995, + "learning_rate": 7.324324324324325e-06, + "loss": 0.7704, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 13.435096376321923, + "learning_rate": 7.3513513513513525e-06, + "loss": 0.9393, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 31.604525793900656, + "learning_rate": 7.3783783783783794e-06, + "loss": 0.7038, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 11.1820896481639, + "learning_rate": 7.4054054054054055e-06, + "loss": 0.8782, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 29.861537718787407, + "learning_rate": 7.4324324324324324e-06, + "loss": 0.9541, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 9.413675337859308, + "learning_rate": 7.45945945945946e-06, + "loss": 0.8048, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 14.658207912177437, + "learning_rate": 7.486486486486487e-06, + "loss": 0.8344, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 24.12011249754865, + "learning_rate": 7.513513513513514e-06, + "loss": 0.8484, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 13.683050242309397, + "learning_rate": 7.540540540540541e-06, + "loss": 0.8401, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 30.951710006586342, + "learning_rate": 7.567567567567569e-06, + "loss": 0.8466, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 11.206756155627268, + "learning_rate": 7.594594594594596e-06, + "loss": 0.8336, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 11.33322173314846, + "learning_rate": 7.621621621621622e-06, + "loss": 0.9492, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 17.580655502858523, + "learning_rate": 7.648648648648649e-06, + "loss": 0.7291, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 54.61659065403924, + "learning_rate": 7.675675675675676e-06, + "loss": 0.7802, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 9.870581737775675, + "learning_rate": 7.702702702702704e-06, + "loss": 0.908, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 43.24613941366979, + "learning_rate": 7.72972972972973e-06, + "loss": 0.8057, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 9.05910901731955, + "learning_rate": 7.756756756756756e-06, + "loss": 1.025, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 8.752340650843097, + "learning_rate": 7.783783783783784e-06, + "loss": 0.7396, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 9.68669964854762, + "learning_rate": 7.810810810810812e-06, + "loss": 0.7391, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 12.654321468851618, + "learning_rate": 7.837837837837838e-06, + "loss": 1.0119, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 47.915191525751524, + "learning_rate": 7.864864864864866e-06, + "loss": 0.9611, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 18.333907389232806, + "learning_rate": 7.891891891891894e-06, + "loss": 0.9687, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 8.1082761360877, + "learning_rate": 7.91891891891892e-06, + "loss": 0.7831, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 23.48602561232429, + "learning_rate": 7.945945945945946e-06, + "loss": 0.8965, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 6.460926653242866, + "learning_rate": 7.972972972972974e-06, + "loss": 0.9003, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 11.915890253917796, + "learning_rate": 8.000000000000001e-06, + "loss": 0.8482, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 21.819683719088506, + "learning_rate": 8.027027027027027e-06, + "loss": 0.7953, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 19.7783148322034, + "learning_rate": 8.054054054054055e-06, + "loss": 0.8208, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 13.65257668344217, + "learning_rate": 8.081081081081081e-06, + "loss": 0.7397, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 5.508168222898289, + "learning_rate": 8.108108108108109e-06, + "loss": 0.9242, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 14.12216937768534, + "learning_rate": 8.135135135135137e-06, + "loss": 0.7812, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 58.2610147214343, + "learning_rate": 8.162162162162163e-06, + "loss": 0.8097, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 67.13676650261509, + "learning_rate": 8.189189189189189e-06, + "loss": 0.7253, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 11.723304970513432, + "learning_rate": 8.216216216216217e-06, + "loss": 0.9157, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 15.014324067992717, + "learning_rate": 8.243243243243245e-06, + "loss": 0.9872, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 15.576979719397047, + "learning_rate": 8.27027027027027e-06, + "loss": 0.812, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 8.576466090425274, + "learning_rate": 8.297297297297298e-06, + "loss": 0.8314, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 5.02320474319744, + "learning_rate": 8.324324324324326e-06, + "loss": 0.9842, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 37.93003233763656, + "learning_rate": 8.351351351351352e-06, + "loss": 0.8876, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 21.295280345764002, + "learning_rate": 8.378378378378378e-06, + "loss": 0.8483, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 14.22497315408456, + "learning_rate": 8.405405405405406e-06, + "loss": 0.8202, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 53.193557108033076, + "learning_rate": 8.432432432432434e-06, + "loss": 0.9068, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 32.07621818371433, + "learning_rate": 8.45945945945946e-06, + "loss": 0.921, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 10.069641389080939, + "learning_rate": 8.486486486486488e-06, + "loss": 0.8778, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 18.053523835812715, + "learning_rate": 8.513513513513514e-06, + "loss": 0.9505, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 30.015659405729796, + "learning_rate": 8.540540540540542e-06, + "loss": 0.8973, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 7.319370360672046, + "learning_rate": 8.567567567567568e-06, + "loss": 0.9075, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 32.16047427964124, + "learning_rate": 8.594594594594595e-06, + "loss": 0.8375, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 15.609338453897303, + "learning_rate": 8.621621621621622e-06, + "loss": 0.8815, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 5.704130449974582, + "learning_rate": 8.64864864864865e-06, + "loss": 0.9223, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 23.296175132616224, + "learning_rate": 8.675675675675677e-06, + "loss": 0.8277, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 7.552859261589776, + "learning_rate": 8.702702702702703e-06, + "loss": 0.9559, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 15.665438869109835, + "learning_rate": 8.72972972972973e-06, + "loss": 0.9702, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 10.197205514663084, + "learning_rate": 8.756756756756759e-06, + "loss": 0.9542, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 9.659667664276105, + "learning_rate": 8.783783783783785e-06, + "loss": 1.0003, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 10.982299903736404, + "learning_rate": 8.810810810810811e-06, + "loss": 0.8914, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 11.340952539457385, + "learning_rate": 8.837837837837839e-06, + "loss": 0.882, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 10.00825153269294, + "learning_rate": 8.864864864864866e-06, + "loss": 0.7987, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 23.540477775816996, + "learning_rate": 8.891891891891893e-06, + "loss": 0.8074, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 5.037275719179406, + "learning_rate": 8.91891891891892e-06, + "loss": 0.8387, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 7.968204358297256, + "learning_rate": 8.945945945945946e-06, + "loss": 0.695, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 7.318856178170313, + "learning_rate": 8.972972972972974e-06, + "loss": 0.9031, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 10.935686502657077, + "learning_rate": 9e-06, + "loss": 0.9114, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 12.193951959889082, + "learning_rate": 9.027027027027028e-06, + "loss": 0.9356, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 9.002218206553692, + "learning_rate": 9.054054054054054e-06, + "loss": 0.7767, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 13.997837968068213, + "learning_rate": 9.081081081081082e-06, + "loss": 0.8262, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 9.435639627514435, + "learning_rate": 9.10810810810811e-06, + "loss": 0.7897, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 7.673515641143361, + "learning_rate": 9.135135135135136e-06, + "loss": 0.8418, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 24.025442039162453, + "learning_rate": 9.162162162162162e-06, + "loss": 0.8225, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 5.376223169399579, + "learning_rate": 9.189189189189191e-06, + "loss": 0.8157, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 7.43802341579848, + "learning_rate": 9.216216216216217e-06, + "loss": 0.9067, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 31.498113437103736, + "learning_rate": 9.243243243243243e-06, + "loss": 0.8863, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 4.208395070776476, + "learning_rate": 9.270270270270271e-06, + "loss": 0.7518, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 11.349808483251481, + "learning_rate": 9.297297297297299e-06, + "loss": 0.7932, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 10.33455696368958, + "learning_rate": 9.324324324324325e-06, + "loss": 0.8835, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 10.756883745264732, + "learning_rate": 9.351351351351353e-06, + "loss": 0.9388, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 7.126584689121563, + "learning_rate": 9.378378378378379e-06, + "loss": 0.9549, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 7.771750612149306, + "learning_rate": 9.405405405405407e-06, + "loss": 0.8982, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 16.16719526827546, + "learning_rate": 9.432432432432433e-06, + "loss": 0.7405, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 12.004866216631472, + "learning_rate": 9.45945945945946e-06, + "loss": 0.8116, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 22.29157327442273, + "learning_rate": 9.486486486486487e-06, + "loss": 0.8311, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 7.442624737259189, + "learning_rate": 9.513513513513514e-06, + "loss": 0.8761, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 6.386628643473507, + "learning_rate": 9.540540540540542e-06, + "loss": 0.9021, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 19.339016076084857, + "learning_rate": 9.567567567567568e-06, + "loss": 0.9448, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 15.423321124464712, + "learning_rate": 9.594594594594594e-06, + "loss": 0.8387, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 44.03518591651131, + "learning_rate": 9.621621621621622e-06, + "loss": 0.9625, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 10.677430139662313, + "learning_rate": 9.64864864864865e-06, + "loss": 0.7932, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 41.77893396768942, + "learning_rate": 9.675675675675676e-06, + "loss": 0.9091, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 14.501349416940233, + "learning_rate": 9.702702702702704e-06, + "loss": 0.8284, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 9.592105906329472, + "learning_rate": 9.729729729729732e-06, + "loss": 0.7937, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 12.98542648937007, + "learning_rate": 9.756756756756758e-06, + "loss": 1.0166, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 18.600162256938955, + "learning_rate": 9.783783783783785e-06, + "loss": 0.8389, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 11.395296324329149, + "learning_rate": 9.810810810810811e-06, + "loss": 0.8358, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 20.771734506081142, + "learning_rate": 9.83783783783784e-06, + "loss": 0.8873, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 15.65993859593466, + "learning_rate": 9.864864864864865e-06, + "loss": 0.9912, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 11.50319106780396, + "learning_rate": 9.891891891891893e-06, + "loss": 0.8361, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 32.95975633319058, + "learning_rate": 9.91891891891892e-06, + "loss": 0.8634, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 13.692061678208075, + "learning_rate": 9.945945945945947e-06, + "loss": 0.8092, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 12.716953515371314, + "learning_rate": 9.972972972972975e-06, + "loss": 0.942, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 82.65871288690137, + "learning_rate": 1e-05, + "loss": 0.8142, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 14.984308807124416, + "learning_rate": 9.99999982698426e-06, + "loss": 0.7474, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 19.07609430392833, + "learning_rate": 9.999999307937047e-06, + "loss": 0.8128, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 9.479775709543809, + "learning_rate": 9.9999984428584e-06, + "loss": 0.7832, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 18.387923540354905, + "learning_rate": 9.99999723174838e-06, + "loss": 0.7065, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 18.984544852756734, + "learning_rate": 9.999995674607067e-06, + "loss": 1.03, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 9.703472141415524, + "learning_rate": 9.99999377143457e-06, + "loss": 0.9147, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 21.100188675244212, + "learning_rate": 9.999991522231024e-06, + "loss": 0.8972, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 21.197882737234895, + "learning_rate": 9.99998892699658e-06, + "loss": 0.8198, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 47.95622701048999, + "learning_rate": 9.999985985731423e-06, + "loss": 0.7839, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 11.682867285214934, + "learning_rate": 9.999982698435748e-06, + "loss": 0.8002, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 16.06406263555143, + "learning_rate": 9.999979065109791e-06, + "loss": 0.8017, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 21.273465938072796, + "learning_rate": 9.999975085753801e-06, + "loss": 0.9241, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 59.854815074167185, + "learning_rate": 9.99997076036805e-06, + "loss": 0.7414, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 15.653054979719489, + "learning_rate": 9.999966088952842e-06, + "loss": 0.7665, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 23.33735783957125, + "learning_rate": 9.999961071508497e-06, + "loss": 0.813, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 38.95163376222572, + "learning_rate": 9.999955708035365e-06, + "loss": 0.9086, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 18.9349861876493, + "learning_rate": 9.999949998533815e-06, + "loss": 0.8665, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 18.711072587977014, + "learning_rate": 9.999943943004242e-06, + "loss": 0.9797, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 21.361431170756777, + "learning_rate": 9.999937541447067e-06, + "loss": 0.9983, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 15.582938979737273, + "learning_rate": 9.999930793862732e-06, + "loss": 0.9204, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 27.94977648249872, + "learning_rate": 9.999923700251704e-06, + "loss": 0.9338, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 14.347217431070954, + "learning_rate": 9.999916260614471e-06, + "loss": 0.7673, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 25.11071957357206, + "learning_rate": 9.999908474951554e-06, + "loss": 0.9327, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 52.04473252984409, + "learning_rate": 9.999900343263487e-06, + "loss": 0.7869, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 19.819982323886766, + "learning_rate": 9.999891865550835e-06, + "loss": 0.7551, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 23.45317773095086, + "learning_rate": 9.999883041814184e-06, + "loss": 1.009, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 9.49863677886503, + "learning_rate": 9.999873872054145e-06, + "loss": 0.9752, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 14.319079242729478, + "learning_rate": 9.99986435627135e-06, + "loss": 0.9145, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 34.90754528895307, + "learning_rate": 9.99985449446646e-06, + "loss": 0.8761, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 26.24096701095773, + "learning_rate": 9.99984428664016e-06, + "loss": 0.8054, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 14.045655627785289, + "learning_rate": 9.999833732793154e-06, + "loss": 0.6386, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 30.77148153147053, + "learning_rate": 9.99982283292617e-06, + "loss": 0.8872, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 99.80768094405511, + "learning_rate": 9.999811587039964e-06, + "loss": 0.8507, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 50.15541055042202, + "learning_rate": 9.999799995135316e-06, + "loss": 0.882, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 9.799442026624805, + "learning_rate": 9.999788057213026e-06, + "loss": 0.9453, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 8.310578500393895, + "learning_rate": 9.999775773273922e-06, + "loss": 0.9552, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 27.55216918044366, + "learning_rate": 9.999763143318853e-06, + "loss": 0.9458, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 15.658885945918538, + "learning_rate": 9.999750167348694e-06, + "loss": 0.709, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 21.74888396097429, + "learning_rate": 9.999736845364342e-06, + "loss": 0.9854, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 21.960870027184015, + "learning_rate": 9.999723177366719e-06, + "loss": 0.8204, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 14.416621672095202, + "learning_rate": 9.999709163356772e-06, + "loss": 0.8273, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 9.039269424969744, + "learning_rate": 9.999694803335468e-06, + "loss": 1.0574, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 11.429652592891859, + "learning_rate": 9.999680097303805e-06, + "loss": 0.8025, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 20.34028441419009, + "learning_rate": 9.999665045262799e-06, + "loss": 0.8172, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 54.22029061670841, + "learning_rate": 9.999649647213491e-06, + "loss": 0.8617, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 14.818505791880616, + "learning_rate": 9.999633903156947e-06, + "loss": 0.7233, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 16.380377983332654, + "learning_rate": 9.999617813094256e-06, + "loss": 0.7646, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 67.89249470883054, + "learning_rate": 9.999601377026533e-06, + "loss": 0.8145, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 33.99404947542081, + "learning_rate": 9.999584594954913e-06, + "loss": 1.0665, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 11.974198270070763, + "learning_rate": 9.99956746688056e-06, + "loss": 0.9434, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 21.085216202470985, + "learning_rate": 9.99954999280466e-06, + "loss": 0.8267, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 8.519989197695768, + "learning_rate": 9.99953217272842e-06, + "loss": 0.694, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 72.0450639432991, + "learning_rate": 9.99951400665307e-06, + "loss": 0.8817, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 15.686591475421789, + "learning_rate": 9.999495494579876e-06, + "loss": 0.9289, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 15.321139988054547, + "learning_rate": 9.999476636510112e-06, + "loss": 0.8548, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 73.31438876403607, + "learning_rate": 9.999457432445087e-06, + "loss": 0.8143, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 20.009931132840833, + "learning_rate": 9.999437882386128e-06, + "loss": 0.9922, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 16.554001506590943, + "learning_rate": 9.999417986334587e-06, + "loss": 0.9504, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 11.152764050819433, + "learning_rate": 9.999397744291845e-06, + "loss": 0.8524, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 73.60849741763013, + "learning_rate": 9.999377156259298e-06, + "loss": 1.0088, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 12.460274613470789, + "learning_rate": 9.999356222238375e-06, + "loss": 0.7579, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 9.876954911654076, + "learning_rate": 9.99933494223052e-06, + "loss": 0.8904, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 11.250303463849706, + "learning_rate": 9.999313316237211e-06, + "loss": 0.7619, + "step": 433 + }, + { + "epoch": 0.04, + "grad_norm": 17.229309353493207, + "learning_rate": 9.999291344259943e-06, + "loss": 0.6296, + "step": 434 + }, + { + "epoch": 0.04, + "grad_norm": 27.16986762458146, + "learning_rate": 9.999269026300234e-06, + "loss": 0.8607, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 18.356589295904822, + "learning_rate": 9.999246362359631e-06, + "loss": 0.8597, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 12.183166560627248, + "learning_rate": 9.999223352439701e-06, + "loss": 0.8629, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 8.995941766241017, + "learning_rate": 9.999199996542038e-06, + "loss": 0.936, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 70.1160321604293, + "learning_rate": 9.999176294668258e-06, + "loss": 0.8156, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 8.437858254059023, + "learning_rate": 9.999152246820001e-06, + "loss": 0.8791, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 13.629524415961784, + "learning_rate": 9.999127852998932e-06, + "loss": 0.9075, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 11.658206569666648, + "learning_rate": 9.999103113206736e-06, + "loss": 0.9753, + "step": 442 + }, + { + "epoch": 0.04, + "grad_norm": 6.233027530226944, + "learning_rate": 9.99907802744513e-06, + "loss": 0.8657, + "step": 443 + }, + { + "epoch": 0.04, + "grad_norm": 12.489952303435613, + "learning_rate": 9.999052595715845e-06, + "loss": 0.8385, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 8.85308572898137, + "learning_rate": 9.999026818020647e-06, + "loss": 0.7528, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 14.363125086991815, + "learning_rate": 9.999000694361315e-06, + "loss": 0.8781, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 7.8147922724491155, + "learning_rate": 9.99897422473966e-06, + "loss": 0.8988, + "step": 447 + }, + { + "epoch": 0.04, + "grad_norm": 9.759788144245777, + "learning_rate": 9.99894740915751e-06, + "loss": 0.8792, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 10.930411502289326, + "learning_rate": 9.998920247616724e-06, + "loss": 0.7594, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 15.948737319156722, + "learning_rate": 9.998892740119183e-06, + "loss": 0.7225, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 8.245018743731332, + "learning_rate": 9.998864886666788e-06, + "loss": 0.8376, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 8.927031973432706, + "learning_rate": 9.998836687261466e-06, + "loss": 0.917, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 68.44457516666587, + "learning_rate": 9.998808141905171e-06, + "loss": 0.6569, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 11.778712971332425, + "learning_rate": 9.998779250599877e-06, + "loss": 0.6889, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 10.145913860704326, + "learning_rate": 9.998750013347584e-06, + "loss": 0.8116, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 14.055120216565927, + "learning_rate": 9.998720430150316e-06, + "loss": 0.9681, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 12.036685065237625, + "learning_rate": 9.99869050101012e-06, + "loss": 0.7294, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 8.070374183474446, + "learning_rate": 9.998660225929066e-06, + "loss": 0.7193, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 15.193301442760763, + "learning_rate": 9.99862960490925e-06, + "loss": 0.8729, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 24.32121791679853, + "learning_rate": 9.998598637952792e-06, + "loss": 0.8121, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 13.715253203453173, + "learning_rate": 9.998567325061834e-06, + "loss": 0.8142, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 10.391314046194852, + "learning_rate": 9.998535666238545e-06, + "loss": 0.847, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 4.817008131711015, + "learning_rate": 9.998503661485112e-06, + "loss": 0.8997, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 8.98268603874573, + "learning_rate": 9.998471310803754e-06, + "loss": 0.9557, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 10.22966883613815, + "learning_rate": 9.998438614196709e-06, + "loss": 0.8092, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 5.71949311627198, + "learning_rate": 9.998405571666237e-06, + "loss": 0.79, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 8.299204246951271, + "learning_rate": 9.998372183214628e-06, + "loss": 0.9451, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 6.352553299280233, + "learning_rate": 9.998338448844193e-06, + "loss": 0.8229, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 12.701480493487278, + "learning_rate": 9.998304368557264e-06, + "loss": 0.8424, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 29.9010218910551, + "learning_rate": 9.9982699423562e-06, + "loss": 0.6158, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 18.898379582824525, + "learning_rate": 9.998235170243384e-06, + "loss": 0.8722, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 7.534229213257622, + "learning_rate": 9.998200052221225e-06, + "loss": 0.6099, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 5.617559645478937, + "learning_rate": 9.99816458829215e-06, + "loss": 0.8279, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 11.528433696022832, + "learning_rate": 9.998128778458613e-06, + "loss": 0.7423, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 19.452763832906626, + "learning_rate": 9.998092622723095e-06, + "loss": 0.8853, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 11.607687984152511, + "learning_rate": 9.998056121088098e-06, + "loss": 0.894, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 11.798486981877323, + "learning_rate": 9.998019273556145e-06, + "loss": 0.7839, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 10.129697512795723, + "learning_rate": 9.997982080129788e-06, + "loss": 0.8359, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 16.619690469837995, + "learning_rate": 9.997944540811604e-06, + "loss": 0.8599, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 12.384986377355515, + "learning_rate": 9.997906655604187e-06, + "loss": 0.8412, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 4.446378744474751, + "learning_rate": 9.997868424510157e-06, + "loss": 0.7223, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 13.531849510333954, + "learning_rate": 9.997829847532165e-06, + "loss": 0.7298, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 6.389313888303728, + "learning_rate": 9.99779092467288e-06, + "loss": 0.8489, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 26.70632484403452, + "learning_rate": 9.997751655934993e-06, + "loss": 0.7851, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 9.439671918202823, + "learning_rate": 9.997712041321224e-06, + "loss": 0.7441, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 6.037847172015665, + "learning_rate": 9.997672080834312e-06, + "loss": 0.8028, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 9.319705177930302, + "learning_rate": 9.997631774477025e-06, + "loss": 0.7406, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 8.780172227792313, + "learning_rate": 9.997591122252151e-06, + "loss": 0.6629, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 12.765849491442697, + "learning_rate": 9.997550124162505e-06, + "loss": 0.8551, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 42.76686905336114, + "learning_rate": 9.99750878021092e-06, + "loss": 0.8103, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 8.473647422158049, + "learning_rate": 9.997467090400264e-06, + "loss": 0.5554, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 8.012366043371337, + "learning_rate": 9.997425054733418e-06, + "loss": 0.8308, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 7.661421852178396, + "learning_rate": 9.997382673213292e-06, + "loss": 0.8219, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 19.47089309477259, + "learning_rate": 9.997339945842817e-06, + "loss": 0.8561, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 41.049332291403395, + "learning_rate": 9.997296872624952e-06, + "loss": 0.8416, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 5.878836183985898, + "learning_rate": 9.99725345356268e-06, + "loss": 0.6601, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 6.394440800021863, + "learning_rate": 9.997209688659004e-06, + "loss": 0.8918, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 6.972600615103139, + "learning_rate": 9.99716557791695e-06, + "loss": 0.8948, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 6.592321556402771, + "learning_rate": 9.997121121339574e-06, + "loss": 0.7166, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 8.772451028326715, + "learning_rate": 9.997076318929952e-06, + "loss": 0.6199, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 14.866626166936214, + "learning_rate": 9.997031170691185e-06, + "loss": 0.8597, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 12.672064495051528, + "learning_rate": 9.996985676626398e-06, + "loss": 0.7912, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 7.608075763625257, + "learning_rate": 9.996939836738736e-06, + "loss": 0.8034, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 15.512777950005521, + "learning_rate": 9.996893651031377e-06, + "loss": 0.8629, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 9.90129001736726, + "learning_rate": 9.996847119507513e-06, + "loss": 0.879, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 12.11366108997228, + "learning_rate": 9.996800242170366e-06, + "loss": 0.8153, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 13.447484190971544, + "learning_rate": 9.996753019023178e-06, + "loss": 0.8408, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 7.109671040543963, + "learning_rate": 9.996705450069219e-06, + "loss": 1.0441, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 10.140216221545709, + "learning_rate": 9.996657535311783e-06, + "loss": 0.6856, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 10.469463780075216, + "learning_rate": 9.996609274754183e-06, + "loss": 0.7259, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 7.96189644553197, + "learning_rate": 9.99656066839976e-06, + "loss": 0.9266, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 13.949353734455318, + "learning_rate": 9.996511716251878e-06, + "loss": 0.8243, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 7.637588078075644, + "learning_rate": 9.996462418313925e-06, + "loss": 0.9055, + "step": 513 + }, + { + "epoch": 0.04, + "grad_norm": 17.414766143190207, + "learning_rate": 9.996412774589312e-06, + "loss": 0.8257, + "step": 514 + }, + { + "epoch": 0.04, + "grad_norm": 105.3535998268008, + "learning_rate": 9.996362785081475e-06, + "loss": 0.8769, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 7.399070067689621, + "learning_rate": 9.996312449793872e-06, + "loss": 0.7435, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 10.605966365232831, + "learning_rate": 9.99626176872999e-06, + "loss": 0.9566, + "step": 517 + }, + { + "epoch": 0.04, + "grad_norm": 11.739482851618803, + "learning_rate": 9.996210741893334e-06, + "loss": 0.8259, + "step": 518 + }, + { + "epoch": 0.04, + "grad_norm": 26.58860849380813, + "learning_rate": 9.996159369287436e-06, + "loss": 0.8887, + "step": 519 + }, + { + "epoch": 0.04, + "grad_norm": 28.865443231952558, + "learning_rate": 9.996107650915851e-06, + "loss": 0.8697, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 13.273543874253734, + "learning_rate": 9.996055586782158e-06, + "loss": 0.8519, + "step": 521 + }, + { + "epoch": 0.04, + "grad_norm": 11.655026322563018, + "learning_rate": 9.996003176889962e-06, + "loss": 0.7761, + "step": 522 + }, + { + "epoch": 0.04, + "grad_norm": 12.34750613332825, + "learning_rate": 9.995950421242887e-06, + "loss": 0.8807, + "step": 523 + }, + { + "epoch": 0.04, + "grad_norm": 6.820187379622272, + "learning_rate": 9.995897319844588e-06, + "loss": 0.7704, + "step": 524 + }, + { + "epoch": 0.04, + "grad_norm": 9.56767507898871, + "learning_rate": 9.995843872698734e-06, + "loss": 0.6508, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 13.342475881792621, + "learning_rate": 9.995790079809031e-06, + "loss": 0.621, + "step": 526 + }, + { + "epoch": 0.04, + "grad_norm": 44.066648368484785, + "learning_rate": 9.995735941179198e-06, + "loss": 0.7811, + "step": 527 + }, + { + "epoch": 0.04, + "grad_norm": 8.378457307459202, + "learning_rate": 9.995681456812981e-06, + "loss": 0.7337, + "step": 528 + }, + { + "epoch": 0.04, + "grad_norm": 11.631531361554062, + "learning_rate": 9.995626626714152e-06, + "loss": 0.8762, + "step": 529 + }, + { + "epoch": 0.04, + "grad_norm": 13.86011073227737, + "learning_rate": 9.995571450886506e-06, + "loss": 0.7705, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 28.580487567109287, + "learning_rate": 9.99551592933386e-06, + "loss": 0.7632, + "step": 531 + }, + { + "epoch": 0.04, + "grad_norm": 43.33520935228094, + "learning_rate": 9.995460062060058e-06, + "loss": 0.9407, + "step": 532 + }, + { + "epoch": 0.04, + "grad_norm": 11.909652801913152, + "learning_rate": 9.995403849068965e-06, + "loss": 0.8536, + "step": 533 + }, + { + "epoch": 0.04, + "grad_norm": 9.402057446232243, + "learning_rate": 9.995347290364472e-06, + "loss": 0.8637, + "step": 534 + }, + { + "epoch": 0.04, + "grad_norm": 7.89031775768683, + "learning_rate": 9.995290385950493e-06, + "loss": 0.9203, + "step": 535 + }, + { + "epoch": 0.04, + "grad_norm": 8.271432064165714, + "learning_rate": 9.995233135830968e-06, + "loss": 0.9156, + "step": 536 + }, + { + "epoch": 0.04, + "grad_norm": 9.093368906953206, + "learning_rate": 9.995175540009855e-06, + "loss": 0.9131, + "step": 537 + }, + { + "epoch": 0.04, + "grad_norm": 9.66103466481114, + "learning_rate": 9.995117598491146e-06, + "loss": 0.8857, + "step": 538 + }, + { + "epoch": 0.04, + "grad_norm": 6.1237346833238115, + "learning_rate": 9.995059311278845e-06, + "loss": 0.826, + "step": 539 + }, + { + "epoch": 0.04, + "grad_norm": 5.634030117867691, + "learning_rate": 9.995000678376987e-06, + "loss": 1.0652, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 24.541992102496202, + "learning_rate": 9.994941699789632e-06, + "loss": 0.8759, + "step": 541 + }, + { + "epoch": 0.04, + "grad_norm": 7.385627874316691, + "learning_rate": 9.994882375520862e-06, + "loss": 0.7202, + "step": 542 + }, + { + "epoch": 0.04, + "grad_norm": 9.62743297829404, + "learning_rate": 9.99482270557478e-06, + "loss": 0.8987, + "step": 543 + }, + { + "epoch": 0.04, + "grad_norm": 7.415142458764166, + "learning_rate": 9.994762689955518e-06, + "loss": 0.8352, + "step": 544 + }, + { + "epoch": 0.04, + "grad_norm": 13.443510555984039, + "learning_rate": 9.994702328667225e-06, + "loss": 0.7744, + "step": 545 + }, + { + "epoch": 0.04, + "grad_norm": 9.037861971705254, + "learning_rate": 9.994641621714085e-06, + "loss": 0.8173, + "step": 546 + }, + { + "epoch": 0.04, + "grad_norm": 11.120780780804768, + "learning_rate": 9.994580569100295e-06, + "loss": 0.7303, + "step": 547 + }, + { + "epoch": 0.04, + "grad_norm": 16.32836065555792, + "learning_rate": 9.99451917083008e-06, + "loss": 0.8896, + "step": 548 + }, + { + "epoch": 0.04, + "grad_norm": 19.86637565436871, + "learning_rate": 9.994457426907692e-06, + "loss": 0.8949, + "step": 549 + }, + { + "epoch": 0.04, + "grad_norm": 8.047045756352302, + "learning_rate": 9.9943953373374e-06, + "loss": 0.846, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 8.179799801211328, + "learning_rate": 9.994332902123505e-06, + "loss": 0.5653, + "step": 551 + }, + { + "epoch": 0.04, + "grad_norm": 10.708603057981128, + "learning_rate": 9.994270121270327e-06, + "loss": 0.8086, + "step": 552 + }, + { + "epoch": 0.04, + "grad_norm": 8.102467932129697, + "learning_rate": 9.994206994782207e-06, + "loss": 0.8118, + "step": 553 + }, + { + "epoch": 0.04, + "grad_norm": 38.096051297213926, + "learning_rate": 9.994143522663519e-06, + "loss": 0.8278, + "step": 554 + }, + { + "epoch": 0.05, + "grad_norm": 21.453507615082344, + "learning_rate": 9.994079704918654e-06, + "loss": 0.821, + "step": 555 + }, + { + "epoch": 0.05, + "grad_norm": 15.606877303262893, + "learning_rate": 9.994015541552028e-06, + "loss": 0.8356, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 7.952697114146562, + "learning_rate": 9.993951032568082e-06, + "loss": 0.9863, + "step": 557 + }, + { + "epoch": 0.05, + "grad_norm": 15.638502879234304, + "learning_rate": 9.993886177971278e-06, + "loss": 0.8728, + "step": 558 + }, + { + "epoch": 0.05, + "grad_norm": 9.508113486056013, + "learning_rate": 9.993820977766108e-06, + "loss": 0.6744, + "step": 559 + }, + { + "epoch": 0.05, + "grad_norm": 15.021278085558164, + "learning_rate": 9.993755431957082e-06, + "loss": 0.7812, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 5.6549632416044, + "learning_rate": 9.993689540548736e-06, + "loss": 0.8249, + "step": 561 + }, + { + "epoch": 0.05, + "grad_norm": 7.85201852254656, + "learning_rate": 9.993623303545632e-06, + "loss": 0.7181, + "step": 562 + }, + { + "epoch": 0.05, + "grad_norm": 6.582155391119535, + "learning_rate": 9.993556720952354e-06, + "loss": 0.9711, + "step": 563 + }, + { + "epoch": 0.05, + "grad_norm": 15.396503116993552, + "learning_rate": 9.993489792773507e-06, + "loss": 0.8035, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 7.7702839069916205, + "learning_rate": 9.993422519013726e-06, + "loss": 0.7612, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 18.015171235239254, + "learning_rate": 9.993354899677665e-06, + "loss": 0.8618, + "step": 566 + }, + { + "epoch": 0.05, + "grad_norm": 18.01915151771511, + "learning_rate": 9.993286934770004e-06, + "loss": 0.8659, + "step": 567 + }, + { + "epoch": 0.05, + "grad_norm": 7.346995314339446, + "learning_rate": 9.993218624295446e-06, + "loss": 0.8125, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 5.706666932661295, + "learning_rate": 9.99314996825872e-06, + "loss": 0.8467, + "step": 569 + }, + { + "epoch": 0.05, + "grad_norm": 9.44517150285393, + "learning_rate": 9.993080966664579e-06, + "loss": 0.8866, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 9.70613542007537, + "learning_rate": 9.993011619517793e-06, + "loss": 0.9496, + "step": 571 + }, + { + "epoch": 0.05, + "grad_norm": 8.317169078764937, + "learning_rate": 9.992941926823166e-06, + "loss": 0.7998, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 9.107898931077647, + "learning_rate": 9.992871888585518e-06, + "loss": 0.7518, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 52.801750389259034, + "learning_rate": 9.992801504809698e-06, + "loss": 0.8034, + "step": 574 + }, + { + "epoch": 0.05, + "grad_norm": 46.3118984132503, + "learning_rate": 9.992730775500578e-06, + "loss": 0.8421, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 8.819460983792244, + "learning_rate": 9.99265970066305e-06, + "loss": 0.7137, + "step": 576 + }, + { + "epoch": 0.05, + "grad_norm": 8.52295056695198, + "learning_rate": 9.992588280302034e-06, + "loss": 0.9252, + "step": 577 + }, + { + "epoch": 0.05, + "grad_norm": 9.29962370533259, + "learning_rate": 9.992516514422474e-06, + "loss": 0.7638, + "step": 578 + }, + { + "epoch": 0.05, + "grad_norm": 9.114961360948504, + "learning_rate": 9.992444403029335e-06, + "loss": 0.7391, + "step": 579 + }, + { + "epoch": 0.05, + "grad_norm": 18.57087743130155, + "learning_rate": 9.99237194612761e-06, + "loss": 0.7999, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 15.113295270410948, + "learning_rate": 9.99229914372231e-06, + "loss": 0.7913, + "step": 581 + }, + { + "epoch": 0.05, + "grad_norm": 8.48388464750635, + "learning_rate": 9.992225995818476e-06, + "loss": 0.7446, + "step": 582 + }, + { + "epoch": 0.05, + "grad_norm": 9.584669962239762, + "learning_rate": 9.99215250242117e-06, + "loss": 0.7364, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 5.79968652253179, + "learning_rate": 9.992078663535475e-06, + "loss": 0.7238, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 7.113171104738922, + "learning_rate": 9.992004479166507e-06, + "loss": 0.7656, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 7.847450193474083, + "learning_rate": 9.991929949319397e-06, + "loss": 0.8573, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 8.359203786968319, + "learning_rate": 9.991855073999299e-06, + "loss": 0.8307, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 5.441631950180476, + "learning_rate": 9.991779853211401e-06, + "loss": 0.8836, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 23.24922130707377, + "learning_rate": 9.991704286960906e-06, + "loss": 0.8915, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 8.206025850462575, + "learning_rate": 9.991628375253044e-06, + "loss": 0.9392, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 8.145059409157405, + "learning_rate": 9.991552118093069e-06, + "loss": 0.6938, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 10.315105346460362, + "learning_rate": 9.991475515486258e-06, + "loss": 0.7187, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 5.962641471520498, + "learning_rate": 9.99139856743791e-06, + "loss": 0.7906, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 7.458265783871584, + "learning_rate": 9.991321273953357e-06, + "loss": 0.9382, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 7.945797071600797, + "learning_rate": 9.991243635037942e-06, + "loss": 0.7283, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 6.866928978058393, + "learning_rate": 9.991165650697039e-06, + "loss": 0.9184, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 6.511885796757297, + "learning_rate": 9.991087320936046e-06, + "loss": 0.8652, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 9.231785413957425, + "learning_rate": 9.991008645760385e-06, + "loss": 0.9465, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 9.372565656750247, + "learning_rate": 9.990929625175498e-06, + "loss": 0.7801, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 8.094511258311677, + "learning_rate": 9.990850259186857e-06, + "loss": 0.6775, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 10.555078383463702, + "learning_rate": 9.990770547799953e-06, + "loss": 0.592, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 18.95371020735239, + "learning_rate": 9.990690491020304e-06, + "loss": 0.882, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 58.73856983645397, + "learning_rate": 9.990610088853446e-06, + "loss": 0.898, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 8.214048073862799, + "learning_rate": 9.990529341304946e-06, + "loss": 0.7934, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 7.511290764183826, + "learning_rate": 9.990448248380396e-06, + "loss": 0.6018, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 64.51133102591518, + "learning_rate": 9.990366810085403e-06, + "loss": 0.7759, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 71.88669653549478, + "learning_rate": 9.990285026425604e-06, + "loss": 0.8268, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 13.866712918824344, + "learning_rate": 9.99020289740666e-06, + "loss": 0.8188, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 11.501494526769124, + "learning_rate": 9.990120423034257e-06, + "loss": 0.9362, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 6.885597538297738, + "learning_rate": 9.990037603314098e-06, + "loss": 0.7901, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 13.85336232473651, + "learning_rate": 9.989954438251916e-06, + "loss": 0.7303, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 20.060591287148707, + "learning_rate": 9.98987092785347e-06, + "loss": 0.9798, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 30.633730971156545, + "learning_rate": 9.989787072124535e-06, + "loss": 0.7626, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 8.793244591626761, + "learning_rate": 9.989702871070918e-06, + "loss": 0.7987, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 9.875060327355442, + "learning_rate": 9.989618324698445e-06, + "loss": 0.682, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 72.97580919120949, + "learning_rate": 9.989533433012965e-06, + "loss": 0.9387, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 14.408292072146214, + "learning_rate": 9.989448196020355e-06, + "loss": 0.9296, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 7.130777903033677, + "learning_rate": 9.989362613726515e-06, + "loss": 0.6926, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 15.925510702672547, + "learning_rate": 9.989276686137364e-06, + "loss": 0.7189, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 11.511840518712551, + "learning_rate": 9.989190413258854e-06, + "loss": 0.8567, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 9.17592349998759, + "learning_rate": 9.98910379509695e-06, + "loss": 0.8127, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 5.84215044694441, + "learning_rate": 9.989016831657652e-06, + "loss": 0.8363, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 5.944119202717555, + "learning_rate": 9.988929522946976e-06, + "loss": 0.7899, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 19.26785296376063, + "learning_rate": 9.988841868970962e-06, + "loss": 0.8757, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 10.636390260512607, + "learning_rate": 9.98875386973568e-06, + "loss": 0.7518, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 21.91253000806537, + "learning_rate": 9.988665525247217e-06, + "loss": 0.7686, + "step": 626 + }, + { + "epoch": 0.05, + "grad_norm": 7.139432620257901, + "learning_rate": 9.988576835511687e-06, + "loss": 0.7362, + "step": 627 + }, + { + "epoch": 0.05, + "grad_norm": 9.27924269716798, + "learning_rate": 9.988487800535233e-06, + "loss": 0.8678, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 9.24001209333957, + "learning_rate": 9.98839842032401e-06, + "loss": 0.9575, + "step": 629 + }, + { + "epoch": 0.05, + "grad_norm": 7.188691399855119, + "learning_rate": 9.98830869488421e-06, + "loss": 0.6572, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 11.12289938087904, + "learning_rate": 9.988218624222036e-06, + "loss": 0.7697, + "step": 631 + }, + { + "epoch": 0.05, + "grad_norm": 9.330598690482764, + "learning_rate": 9.988128208343727e-06, + "loss": 0.8043, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 45.96512746314069, + "learning_rate": 9.988037447255537e-06, + "loss": 0.7814, + "step": 633 + }, + { + "epoch": 0.05, + "grad_norm": 22.111735762299233, + "learning_rate": 9.987946340963749e-06, + "loss": 0.7844, + "step": 634 + }, + { + "epoch": 0.05, + "grad_norm": 10.93177817508431, + "learning_rate": 9.987854889474667e-06, + "loss": 0.8582, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 11.006109742208354, + "learning_rate": 9.987763092794621e-06, + "loss": 0.7843, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 12.534174103061337, + "learning_rate": 9.987670950929963e-06, + "loss": 0.8191, + "step": 637 + }, + { + "epoch": 0.05, + "grad_norm": 29.15393915044455, + "learning_rate": 9.98757846388707e-06, + "loss": 0.7001, + "step": 638 + }, + { + "epoch": 0.05, + "grad_norm": 15.198062054677523, + "learning_rate": 9.987485631672345e-06, + "loss": 0.8103, + "step": 639 + }, + { + "epoch": 0.05, + "grad_norm": 23.488705504357615, + "learning_rate": 9.987392454292208e-06, + "loss": 0.7999, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 10.33662065802245, + "learning_rate": 9.987298931753111e-06, + "loss": 0.778, + "step": 641 + }, + { + "epoch": 0.05, + "grad_norm": 6.4768738924870535, + "learning_rate": 9.987205064061526e-06, + "loss": 1.093, + "step": 642 + }, + { + "epoch": 0.05, + "grad_norm": 17.003363279686255, + "learning_rate": 9.987110851223946e-06, + "loss": 0.733, + "step": 643 + }, + { + "epoch": 0.05, + "grad_norm": 11.123138576352284, + "learning_rate": 9.987016293246896e-06, + "loss": 0.9307, + "step": 644 + }, + { + "epoch": 0.05, + "grad_norm": 5.779564996135507, + "learning_rate": 9.986921390136916e-06, + "loss": 0.9614, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 6.224334648193334, + "learning_rate": 9.986826141900577e-06, + "loss": 0.8988, + "step": 646 + }, + { + "epoch": 0.05, + "grad_norm": 24.475577569836908, + "learning_rate": 9.986730548544468e-06, + "loss": 0.724, + "step": 647 + }, + { + "epoch": 0.05, + "grad_norm": 7.280460449672077, + "learning_rate": 9.986634610075207e-06, + "loss": 0.796, + "step": 648 + }, + { + "epoch": 0.05, + "grad_norm": 7.542748537533427, + "learning_rate": 9.986538326499433e-06, + "loss": 0.7362, + "step": 649 + }, + { + "epoch": 0.05, + "grad_norm": 11.064940928844694, + "learning_rate": 9.986441697823808e-06, + "loss": 0.9358, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 116.55399093608999, + "learning_rate": 9.986344724055022e-06, + "loss": 0.7826, + "step": 651 + }, + { + "epoch": 0.05, + "grad_norm": 6.78303370496228, + "learning_rate": 9.986247405199782e-06, + "loss": 0.6952, + "step": 652 + }, + { + "epoch": 0.05, + "grad_norm": 17.642355639075667, + "learning_rate": 9.986149741264827e-06, + "loss": 0.7374, + "step": 653 + }, + { + "epoch": 0.05, + "grad_norm": 10.017056289977392, + "learning_rate": 9.986051732256913e-06, + "loss": 0.9157, + "step": 654 + }, + { + "epoch": 0.05, + "grad_norm": 10.064148719695446, + "learning_rate": 9.985953378182827e-06, + "loss": 0.8006, + "step": 655 + }, + { + "epoch": 0.05, + "grad_norm": 5.47457956272967, + "learning_rate": 9.985854679049371e-06, + "loss": 0.7697, + "step": 656 + }, + { + "epoch": 0.05, + "grad_norm": 21.651631018896396, + "learning_rate": 9.985755634863378e-06, + "loss": 0.8534, + "step": 657 + }, + { + "epoch": 0.05, + "grad_norm": 6.719366299840288, + "learning_rate": 9.985656245631702e-06, + "loss": 0.7404, + "step": 658 + }, + { + "epoch": 0.05, + "grad_norm": 7.698109554492035, + "learning_rate": 9.985556511361221e-06, + "loss": 0.8373, + "step": 659 + }, + { + "epoch": 0.05, + "grad_norm": 5.361117337465909, + "learning_rate": 9.985456432058839e-06, + "loss": 0.9731, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 6.763210772346195, + "learning_rate": 9.985356007731482e-06, + "loss": 0.8492, + "step": 661 + }, + { + "epoch": 0.05, + "grad_norm": 30.9162879260446, + "learning_rate": 9.985255238386097e-06, + "loss": 0.8259, + "step": 662 + }, + { + "epoch": 0.05, + "grad_norm": 7.5957623732265205, + "learning_rate": 9.985154124029659e-06, + "loss": 0.7056, + "step": 663 + }, + { + "epoch": 0.05, + "grad_norm": 7.890736808979091, + "learning_rate": 9.985052664669168e-06, + "loss": 0.6557, + "step": 664 + }, + { + "epoch": 0.05, + "grad_norm": 12.27263139518476, + "learning_rate": 9.984950860311644e-06, + "loss": 0.8214, + "step": 665 + }, + { + "epoch": 0.05, + "grad_norm": 9.402801393874103, + "learning_rate": 9.984848710964132e-06, + "loss": 0.7401, + "step": 666 + }, + { + "epoch": 0.05, + "grad_norm": 23.394724952251263, + "learning_rate": 9.984746216633703e-06, + "loss": 0.5883, + "step": 667 + }, + { + "epoch": 0.05, + "grad_norm": 7.021658919636003, + "learning_rate": 9.984643377327447e-06, + "loss": 0.9109, + "step": 668 + }, + { + "epoch": 0.05, + "grad_norm": 4.130236454929482, + "learning_rate": 9.984540193052485e-06, + "loss": 0.8465, + "step": 669 + }, + { + "epoch": 0.05, + "grad_norm": 5.769902858828049, + "learning_rate": 9.984436663815957e-06, + "loss": 0.8428, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 6.114982039936177, + "learning_rate": 9.984332789625026e-06, + "loss": 0.7195, + "step": 671 + }, + { + "epoch": 0.05, + "grad_norm": 10.889498593203678, + "learning_rate": 9.984228570486885e-06, + "loss": 0.8967, + "step": 672 + }, + { + "epoch": 0.05, + "grad_norm": 5.932489028692374, + "learning_rate": 9.98412400640874e-06, + "loss": 0.7486, + "step": 673 + }, + { + "epoch": 0.05, + "grad_norm": 6.098005309857313, + "learning_rate": 9.984019097397832e-06, + "loss": 0.7823, + "step": 674 + }, + { + "epoch": 0.05, + "grad_norm": 45.02380744832399, + "learning_rate": 9.983913843461421e-06, + "loss": 0.947, + "step": 675 + }, + { + "epoch": 0.05, + "grad_norm": 6.546751936447832, + "learning_rate": 9.98380824460679e-06, + "loss": 0.8759, + "step": 676 + }, + { + "epoch": 0.05, + "grad_norm": 7.2243920500059335, + "learning_rate": 9.983702300841249e-06, + "loss": 0.767, + "step": 677 + }, + { + "epoch": 0.06, + "grad_norm": 4.046857789064689, + "learning_rate": 9.983596012172127e-06, + "loss": 0.7718, + "step": 678 + }, + { + "epoch": 0.06, + "grad_norm": 5.504942305925027, + "learning_rate": 9.983489378606785e-06, + "loss": 0.7371, + "step": 679 + }, + { + "epoch": 0.06, + "grad_norm": 6.879825384686828, + "learning_rate": 9.983382400152597e-06, + "loss": 0.6592, + "step": 680 + }, + { + "epoch": 0.06, + "grad_norm": 10.16447589071718, + "learning_rate": 9.983275076816969e-06, + "loss": 0.8063, + "step": 681 + }, + { + "epoch": 0.06, + "grad_norm": 6.238786324911869, + "learning_rate": 9.983167408607328e-06, + "loss": 0.7522, + "step": 682 + }, + { + "epoch": 0.06, + "grad_norm": 10.215890960915836, + "learning_rate": 9.983059395531126e-06, + "loss": 0.9019, + "step": 683 + }, + { + "epoch": 0.06, + "grad_norm": 6.033651462159386, + "learning_rate": 9.982951037595839e-06, + "loss": 0.7745, + "step": 684 + }, + { + "epoch": 0.06, + "grad_norm": 30.753639761740306, + "learning_rate": 9.982842334808965e-06, + "loss": 0.6734, + "step": 685 + }, + { + "epoch": 0.06, + "grad_norm": 7.483458909889676, + "learning_rate": 9.982733287178024e-06, + "loss": 0.7812, + "step": 686 + }, + { + "epoch": 0.06, + "grad_norm": 6.177365800238773, + "learning_rate": 9.982623894710568e-06, + "loss": 0.5936, + "step": 687 + }, + { + "epoch": 0.06, + "grad_norm": 15.630098730739194, + "learning_rate": 9.982514157414165e-06, + "loss": 0.6302, + "step": 688 + }, + { + "epoch": 0.06, + "grad_norm": 4.984736565477625, + "learning_rate": 9.98240407529641e-06, + "loss": 0.7691, + "step": 689 + }, + { + "epoch": 0.06, + "grad_norm": 6.432693743912565, + "learning_rate": 9.98229364836492e-06, + "loss": 0.6902, + "step": 690 + }, + { + "epoch": 0.06, + "grad_norm": 6.165986107848322, + "learning_rate": 9.98218287662734e-06, + "loss": 0.7465, + "step": 691 + }, + { + "epoch": 0.06, + "grad_norm": 10.432503462266839, + "learning_rate": 9.982071760091334e-06, + "loss": 0.7174, + "step": 692 + }, + { + "epoch": 0.06, + "grad_norm": 4.484394514975136, + "learning_rate": 9.981960298764591e-06, + "loss": 0.8959, + "step": 693 + }, + { + "epoch": 0.06, + "grad_norm": 75.90186566843316, + "learning_rate": 9.98184849265483e-06, + "loss": 0.8356, + "step": 694 + }, + { + "epoch": 0.06, + "grad_norm": 7.2608229172175, + "learning_rate": 9.981736341769781e-06, + "loss": 0.8358, + "step": 695 + }, + { + "epoch": 0.06, + "grad_norm": 4.995034634233328, + "learning_rate": 9.98162384611721e-06, + "loss": 0.8041, + "step": 696 + }, + { + "epoch": 0.06, + "grad_norm": 6.720473969590034, + "learning_rate": 9.981511005704905e-06, + "loss": 0.8333, + "step": 697 + }, + { + "epoch": 0.06, + "grad_norm": 9.76444910976019, + "learning_rate": 9.98139782054067e-06, + "loss": 0.7411, + "step": 698 + }, + { + "epoch": 0.06, + "grad_norm": 17.596370679277303, + "learning_rate": 9.98128429063234e-06, + "loss": 0.8383, + "step": 699 + }, + { + "epoch": 0.06, + "grad_norm": 4.711920513879975, + "learning_rate": 9.981170415987774e-06, + "loss": 0.7657, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 8.744580599469144, + "learning_rate": 9.98105619661485e-06, + "loss": 0.8097, + "step": 701 + }, + { + "epoch": 0.06, + "grad_norm": 7.07065657363302, + "learning_rate": 9.980941632521472e-06, + "loss": 0.783, + "step": 702 + }, + { + "epoch": 0.06, + "grad_norm": 6.295848545653682, + "learning_rate": 9.980826723715572e-06, + "loss": 0.8645, + "step": 703 + }, + { + "epoch": 0.06, + "grad_norm": 8.358935616768742, + "learning_rate": 9.980711470205102e-06, + "loss": 0.7963, + "step": 704 + }, + { + "epoch": 0.06, + "grad_norm": 9.241298999677191, + "learning_rate": 9.980595871998037e-06, + "loss": 0.7444, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 8.18634156502095, + "learning_rate": 9.980479929102377e-06, + "loss": 0.8947, + "step": 706 + }, + { + "epoch": 0.06, + "grad_norm": 47.982377304650846, + "learning_rate": 9.980363641526145e-06, + "loss": 0.7308, + "step": 707 + }, + { + "epoch": 0.06, + "grad_norm": 10.28182854752954, + "learning_rate": 9.980247009277391e-06, + "loss": 0.7749, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 7.875265862503272, + "learning_rate": 9.980130032364185e-06, + "loss": 0.6975, + "step": 709 + }, + { + "epoch": 0.06, + "grad_norm": 5.7454490609534625, + "learning_rate": 9.980012710794624e-06, + "loss": 0.8281, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 4.854895941374371, + "learning_rate": 9.979895044576829e-06, + "loss": 0.8886, + "step": 711 + }, + { + "epoch": 0.06, + "grad_norm": 4.812770872264873, + "learning_rate": 9.979777033718938e-06, + "loss": 0.7602, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 13.114650721568749, + "learning_rate": 9.97965867822912e-06, + "loss": 0.8744, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 6.11235671654343, + "learning_rate": 9.979539978115568e-06, + "loss": 0.6982, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 7.261228859091739, + "learning_rate": 9.979420933386497e-06, + "loss": 0.8365, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 10.343688127492557, + "learning_rate": 9.979301544050143e-06, + "loss": 0.7786, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 9.053383160153807, + "learning_rate": 9.979181810114771e-06, + "loss": 0.6904, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 6.804278320561714, + "learning_rate": 9.979061731588666e-06, + "loss": 0.8131, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 24.677229568245547, + "learning_rate": 9.978941308480137e-06, + "loss": 0.9041, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 5.969549266616758, + "learning_rate": 9.978820540797521e-06, + "loss": 0.8384, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 22.663565775540853, + "learning_rate": 9.978699428549175e-06, + "loss": 1.0064, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 4.362272357297812, + "learning_rate": 9.978577971743477e-06, + "loss": 0.8144, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 9.040528256887317, + "learning_rate": 9.978456170388838e-06, + "loss": 0.7186, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 11.804019951391107, + "learning_rate": 9.978334024493686e-06, + "loss": 0.7353, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 8.812013060668532, + "learning_rate": 9.978211534066471e-06, + "loss": 0.7705, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 6.740598429924063, + "learning_rate": 9.978088699115673e-06, + "loss": 0.835, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 6.897707916902952, + "learning_rate": 9.977965519649793e-06, + "loss": 0.7805, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 7.7187957644071155, + "learning_rate": 9.977841995677355e-06, + "loss": 0.6987, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 31.282590316712046, + "learning_rate": 9.977718127206909e-06, + "loss": 0.7077, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 9.352277370686064, + "learning_rate": 9.977593914247024e-06, + "loss": 0.6778, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 48.19954764014883, + "learning_rate": 9.977469356806299e-06, + "loss": 0.8746, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 18.704782543573447, + "learning_rate": 9.977344454893354e-06, + "loss": 0.844, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 15.174353611194654, + "learning_rate": 9.977219208516833e-06, + "loss": 0.8113, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 5.492339289354649, + "learning_rate": 9.977093617685404e-06, + "loss": 0.7634, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 7.338302644714976, + "learning_rate": 9.976967682407758e-06, + "loss": 0.8516, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 7.983333758170923, + "learning_rate": 9.97684140269261e-06, + "loss": 0.9512, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 7.777921188900074, + "learning_rate": 9.976714778548701e-06, + "loss": 0.7497, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 8.916373984163911, + "learning_rate": 9.976587809984795e-06, + "loss": 0.916, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 6.787036602947435, + "learning_rate": 9.976460497009674e-06, + "loss": 0.9375, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 5.364770410436953, + "learning_rate": 9.976332839632155e-06, + "loss": 0.6312, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 4.322247544796824, + "learning_rate": 9.976204837861068e-06, + "loss": 0.8753, + "step": 741 + }, + { + "epoch": 0.06, + "grad_norm": 5.443957648071532, + "learning_rate": 9.976076491705276e-06, + "loss": 0.7943, + "step": 742 + }, + { + "epoch": 0.06, + "grad_norm": 12.419575551869404, + "learning_rate": 9.975947801173656e-06, + "loss": 0.8131, + "step": 743 + }, + { + "epoch": 0.06, + "grad_norm": 5.777255877496506, + "learning_rate": 9.975818766275118e-06, + "loss": 0.9184, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 11.940859893489135, + "learning_rate": 9.975689387018591e-06, + "loss": 0.6416, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 13.008816765202388, + "learning_rate": 9.975559663413029e-06, + "loss": 0.7062, + "step": 746 + }, + { + "epoch": 0.06, + "grad_norm": 7.167797892402563, + "learning_rate": 9.97542959546741e-06, + "loss": 0.5999, + "step": 747 + }, + { + "epoch": 0.06, + "grad_norm": 4.353738233885998, + "learning_rate": 9.975299183190734e-06, + "loss": 0.7373, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 5.559853128623595, + "learning_rate": 9.975168426592028e-06, + "loss": 0.7956, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 22.636498283945155, + "learning_rate": 9.975037325680341e-06, + "loss": 0.8175, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 9.22874708470722, + "learning_rate": 9.974905880464745e-06, + "loss": 0.7904, + "step": 751 + }, + { + "epoch": 0.06, + "grad_norm": 8.822797181919888, + "learning_rate": 9.974774090954339e-06, + "loss": 0.9434, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 7.860772888999804, + "learning_rate": 9.974641957158242e-06, + "loss": 0.852, + "step": 753 + }, + { + "epoch": 0.06, + "grad_norm": 9.19956956070986, + "learning_rate": 9.974509479085596e-06, + "loss": 0.887, + "step": 754 + }, + { + "epoch": 0.06, + "grad_norm": 8.69151889213691, + "learning_rate": 9.974376656745574e-06, + "loss": 0.8229, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 9.46372560684122, + "learning_rate": 9.974243490147366e-06, + "loss": 0.801, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 6.312659465417973, + "learning_rate": 9.974109979300187e-06, + "loss": 0.7765, + "step": 757 + }, + { + "epoch": 0.06, + "grad_norm": 4.3638867032447415, + "learning_rate": 9.973976124213278e-06, + "loss": 0.8664, + "step": 758 + }, + { + "epoch": 0.06, + "grad_norm": 13.756599665248617, + "learning_rate": 9.973841924895904e-06, + "loss": 0.8044, + "step": 759 + }, + { + "epoch": 0.06, + "grad_norm": 9.308361457370733, + "learning_rate": 9.97370738135735e-06, + "loss": 0.8209, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 13.670262940598521, + "learning_rate": 9.973572493606928e-06, + "loss": 0.9134, + "step": 761 + }, + { + "epoch": 0.06, + "grad_norm": 18.096126530856427, + "learning_rate": 9.973437261653973e-06, + "loss": 0.538, + "step": 762 + }, + { + "epoch": 0.06, + "grad_norm": 8.327060634411392, + "learning_rate": 9.973301685507844e-06, + "loss": 0.9501, + "step": 763 + }, + { + "epoch": 0.06, + "grad_norm": 24.55584070533181, + "learning_rate": 9.973165765177925e-06, + "loss": 0.7773, + "step": 764 + }, + { + "epoch": 0.06, + "grad_norm": 9.108620867789838, + "learning_rate": 9.973029500673622e-06, + "loss": 0.763, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 7.978183697469351, + "learning_rate": 9.972892892004363e-06, + "loss": 0.6603, + "step": 766 + }, + { + "epoch": 0.06, + "grad_norm": 8.088448138160917, + "learning_rate": 9.972755939179604e-06, + "loss": 0.7674, + "step": 767 + }, + { + "epoch": 0.06, + "grad_norm": 8.872525630493133, + "learning_rate": 9.972618642208823e-06, + "loss": 0.7546, + "step": 768 + }, + { + "epoch": 0.06, + "grad_norm": 10.049844894137848, + "learning_rate": 9.972481001101523e-06, + "loss": 0.7202, + "step": 769 + }, + { + "epoch": 0.06, + "grad_norm": 8.019992840517824, + "learning_rate": 9.972343015867228e-06, + "loss": 0.7685, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 13.369252931835081, + "learning_rate": 9.972204686515486e-06, + "loss": 0.7988, + "step": 771 + }, + { + "epoch": 0.06, + "grad_norm": 17.282561723904053, + "learning_rate": 9.972066013055874e-06, + "loss": 0.8278, + "step": 772 + }, + { + "epoch": 0.06, + "grad_norm": 10.26836660016062, + "learning_rate": 9.971926995497987e-06, + "loss": 0.7436, + "step": 773 + }, + { + "epoch": 0.06, + "grad_norm": 12.25179735568449, + "learning_rate": 9.971787633851447e-06, + "loss": 0.8776, + "step": 774 + }, + { + "epoch": 0.06, + "grad_norm": 24.541224534268473, + "learning_rate": 9.971647928125894e-06, + "loss": 0.7755, + "step": 775 + }, + { + "epoch": 0.06, + "grad_norm": 4.905244768329663, + "learning_rate": 9.971507878331005e-06, + "loss": 0.7453, + "step": 776 + }, + { + "epoch": 0.06, + "grad_norm": 7.05090508101291, + "learning_rate": 9.971367484476465e-06, + "loss": 0.856, + "step": 777 + }, + { + "epoch": 0.06, + "grad_norm": 8.817778912742655, + "learning_rate": 9.971226746571992e-06, + "loss": 0.769, + "step": 778 + }, + { + "epoch": 0.06, + "grad_norm": 5.3409807647908565, + "learning_rate": 9.971085664627328e-06, + "loss": 0.7754, + "step": 779 + }, + { + "epoch": 0.06, + "grad_norm": 42.16575317865683, + "learning_rate": 9.970944238652236e-06, + "loss": 0.8689, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 6.120024385474541, + "learning_rate": 9.970802468656503e-06, + "loss": 0.7669, + "step": 781 + }, + { + "epoch": 0.06, + "grad_norm": 8.189114538713227, + "learning_rate": 9.970660354649939e-06, + "loss": 0.8429, + "step": 782 + }, + { + "epoch": 0.06, + "grad_norm": 5.698893152267697, + "learning_rate": 9.970517896642382e-06, + "loss": 1.0017, + "step": 783 + }, + { + "epoch": 0.06, + "grad_norm": 21.042069550275208, + "learning_rate": 9.970375094643689e-06, + "loss": 0.872, + "step": 784 + }, + { + "epoch": 0.06, + "grad_norm": 13.878057295932173, + "learning_rate": 9.970231948663743e-06, + "loss": 0.8266, + "step": 785 + }, + { + "epoch": 0.06, + "grad_norm": 6.336916372172675, + "learning_rate": 9.970088458712451e-06, + "loss": 0.8635, + "step": 786 + }, + { + "epoch": 0.06, + "grad_norm": 7.0449806932194265, + "learning_rate": 9.969944624799745e-06, + "loss": 0.7286, + "step": 787 + }, + { + "epoch": 0.06, + "grad_norm": 8.54847444967724, + "learning_rate": 9.969800446935577e-06, + "loss": 0.808, + "step": 788 + }, + { + "epoch": 0.06, + "grad_norm": 4.276295039491074, + "learning_rate": 9.969655925129924e-06, + "loss": 0.8296, + "step": 789 + }, + { + "epoch": 0.06, + "grad_norm": 5.854457082157973, + "learning_rate": 9.96951105939279e-06, + "loss": 0.6399, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 17.09779184862654, + "learning_rate": 9.9693658497342e-06, + "loss": 0.7, + "step": 791 + }, + { + "epoch": 0.06, + "grad_norm": 6.424050990095431, + "learning_rate": 9.969220296164205e-06, + "loss": 0.7608, + "step": 792 + }, + { + "epoch": 0.06, + "grad_norm": 8.160276097768419, + "learning_rate": 9.969074398692875e-06, + "loss": 0.7454, + "step": 793 + }, + { + "epoch": 0.06, + "grad_norm": 7.5014742118887705, + "learning_rate": 9.96892815733031e-06, + "loss": 0.7727, + "step": 794 + }, + { + "epoch": 0.06, + "grad_norm": 8.284246371359577, + "learning_rate": 9.968781572086628e-06, + "loss": 0.8904, + "step": 795 + }, + { + "epoch": 0.06, + "grad_norm": 5.881953539663061, + "learning_rate": 9.968634642971978e-06, + "loss": 0.7347, + "step": 796 + }, + { + "epoch": 0.06, + "grad_norm": 7.722557182043191, + "learning_rate": 9.968487369996523e-06, + "loss": 0.6517, + "step": 797 + }, + { + "epoch": 0.06, + "grad_norm": 5.794780158131567, + "learning_rate": 9.968339753170459e-06, + "loss": 0.7945, + "step": 798 + }, + { + "epoch": 0.06, + "grad_norm": 4.871365261617554, + "learning_rate": 9.968191792504001e-06, + "loss": 0.6972, + "step": 799 + }, + { + "epoch": 0.06, + "grad_norm": 6.930616893287979, + "learning_rate": 9.968043488007386e-06, + "loss": 0.8232, + "step": 800 + }, + { + "epoch": 0.07, + "grad_norm": 4.28903403125063, + "learning_rate": 9.967894839690884e-06, + "loss": 0.9223, + "step": 801 + }, + { + "epoch": 0.07, + "grad_norm": 5.325421378008341, + "learning_rate": 9.967745847564776e-06, + "loss": 0.8776, + "step": 802 + }, + { + "epoch": 0.07, + "grad_norm": 7.932127702313135, + "learning_rate": 9.967596511639378e-06, + "loss": 0.909, + "step": 803 + }, + { + "epoch": 0.07, + "grad_norm": 7.1841323377343524, + "learning_rate": 9.96744683192502e-06, + "loss": 0.7199, + "step": 804 + }, + { + "epoch": 0.07, + "grad_norm": 6.642553610954152, + "learning_rate": 9.967296808432066e-06, + "loss": 0.7498, + "step": 805 + }, + { + "epoch": 0.07, + "grad_norm": 5.2268716616633, + "learning_rate": 9.967146441170896e-06, + "loss": 0.6228, + "step": 806 + }, + { + "epoch": 0.07, + "grad_norm": 4.121699384903832, + "learning_rate": 9.966995730151915e-06, + "loss": 0.8428, + "step": 807 + }, + { + "epoch": 0.07, + "grad_norm": 15.56106843908596, + "learning_rate": 9.966844675385555e-06, + "loss": 0.7743, + "step": 808 + }, + { + "epoch": 0.07, + "grad_norm": 5.948918031370312, + "learning_rate": 9.966693276882272e-06, + "loss": 0.8727, + "step": 809 + }, + { + "epoch": 0.07, + "grad_norm": 4.664897988238388, + "learning_rate": 9.966541534652538e-06, + "loss": 0.6481, + "step": 810 + }, + { + "epoch": 0.07, + "grad_norm": 6.177603280928483, + "learning_rate": 9.966389448706859e-06, + "loss": 0.78, + "step": 811 + }, + { + "epoch": 0.07, + "grad_norm": 13.162756931173213, + "learning_rate": 9.96623701905576e-06, + "loss": 0.7799, + "step": 812 + }, + { + "epoch": 0.07, + "grad_norm": 17.018743069941877, + "learning_rate": 9.966084245709788e-06, + "loss": 0.727, + "step": 813 + }, + { + "epoch": 0.07, + "grad_norm": 7.666835410148128, + "learning_rate": 9.96593112867952e-06, + "loss": 0.7523, + "step": 814 + }, + { + "epoch": 0.07, + "grad_norm": 5.540559464419787, + "learning_rate": 9.965777667975546e-06, + "loss": 0.8156, + "step": 815 + }, + { + "epoch": 0.07, + "grad_norm": 4.951176468811312, + "learning_rate": 9.965623863608494e-06, + "loss": 0.7693, + "step": 816 + }, + { + "epoch": 0.07, + "grad_norm": 3.7206632323204025, + "learning_rate": 9.965469715589002e-06, + "loss": 0.8613, + "step": 817 + }, + { + "epoch": 0.07, + "grad_norm": 6.7556748849564325, + "learning_rate": 9.96531522392774e-06, + "loss": 0.9728, + "step": 818 + }, + { + "epoch": 0.07, + "grad_norm": 5.8668072394888995, + "learning_rate": 9.965160388635402e-06, + "loss": 0.6932, + "step": 819 + }, + { + "epoch": 0.07, + "grad_norm": 26.194320057358315, + "learning_rate": 9.9650052097227e-06, + "loss": 0.7615, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 5.333708559215378, + "learning_rate": 9.964849687200377e-06, + "loss": 0.8371, + "step": 821 + }, + { + "epoch": 0.07, + "grad_norm": 5.1594197263264965, + "learning_rate": 9.964693821079194e-06, + "loss": 0.7396, + "step": 822 + }, + { + "epoch": 0.07, + "grad_norm": 5.1273121776194355, + "learning_rate": 9.964537611369938e-06, + "loss": 0.8317, + "step": 823 + }, + { + "epoch": 0.07, + "grad_norm": 4.653307624111108, + "learning_rate": 9.964381058083421e-06, + "loss": 0.6836, + "step": 824 + }, + { + "epoch": 0.07, + "grad_norm": 9.604982551891355, + "learning_rate": 9.964224161230476e-06, + "loss": 0.7292, + "step": 825 + }, + { + "epoch": 0.07, + "grad_norm": 4.7403449517953735, + "learning_rate": 9.96406692082196e-06, + "loss": 0.7123, + "step": 826 + }, + { + "epoch": 0.07, + "grad_norm": 8.227697079100423, + "learning_rate": 9.963909336868758e-06, + "loss": 0.8756, + "step": 827 + }, + { + "epoch": 0.07, + "grad_norm": 6.115060769086787, + "learning_rate": 9.963751409381774e-06, + "loss": 0.7654, + "step": 828 + }, + { + "epoch": 0.07, + "grad_norm": 16.90809934504262, + "learning_rate": 9.963593138371939e-06, + "loss": 0.779, + "step": 829 + }, + { + "epoch": 0.07, + "grad_norm": 6.359338702571578, + "learning_rate": 9.963434523850206e-06, + "loss": 0.7796, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 4.411438184565806, + "learning_rate": 9.96327556582755e-06, + "loss": 0.7237, + "step": 831 + }, + { + "epoch": 0.07, + "grad_norm": 44.76047610019357, + "learning_rate": 9.963116264314974e-06, + "loss": 0.7859, + "step": 832 + }, + { + "epoch": 0.07, + "grad_norm": 9.619867305900573, + "learning_rate": 9.962956619323504e-06, + "loss": 0.8518, + "step": 833 + }, + { + "epoch": 0.07, + "grad_norm": 11.188991808541468, + "learning_rate": 9.962796630864184e-06, + "loss": 0.7654, + "step": 834 + }, + { + "epoch": 0.07, + "grad_norm": 6.004332394358668, + "learning_rate": 9.96263629894809e-06, + "loss": 0.7891, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 5.588585308557958, + "learning_rate": 9.962475623586316e-06, + "loss": 0.6648, + "step": 836 + }, + { + "epoch": 0.07, + "grad_norm": 9.148501115648214, + "learning_rate": 9.962314604789982e-06, + "loss": 0.762, + "step": 837 + }, + { + "epoch": 0.07, + "grad_norm": 14.722793520390981, + "learning_rate": 9.962153242570233e-06, + "loss": 0.7699, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 9.859290348194369, + "learning_rate": 9.961991536938237e-06, + "loss": 0.6424, + "step": 839 + }, + { + "epoch": 0.07, + "grad_norm": 3.8594444931415635, + "learning_rate": 9.961829487905182e-06, + "loss": 0.4996, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 7.623088174752541, + "learning_rate": 9.961667095482283e-06, + "loss": 0.8039, + "step": 841 + }, + { + "epoch": 0.07, + "grad_norm": 4.677367894065911, + "learning_rate": 9.96150435968078e-06, + "loss": 0.7172, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 14.383615352481165, + "learning_rate": 9.961341280511936e-06, + "loss": 0.7313, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 4.08138449110562, + "learning_rate": 9.961177857987037e-06, + "loss": 0.8424, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 10.94874355130115, + "learning_rate": 9.96101409211739e-06, + "loss": 0.8608, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 6.230957946137558, + "learning_rate": 9.960849982914332e-06, + "loss": 0.9696, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 4.761848121321998, + "learning_rate": 9.960685530389218e-06, + "loss": 0.8284, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 5.987747041413682, + "learning_rate": 9.960520734553432e-06, + "loss": 0.6766, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 6.126573632799808, + "learning_rate": 9.960355595418375e-06, + "loss": 0.6997, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 8.144664170303466, + "learning_rate": 9.960190112995479e-06, + "loss": 0.6508, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 8.37940730156233, + "learning_rate": 9.960024287296195e-06, + "loss": 0.8425, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 6.394458375642536, + "learning_rate": 9.959858118332e-06, + "loss": 0.7271, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 9.177863306437578, + "learning_rate": 9.959691606114393e-06, + "loss": 0.7561, + "step": 853 + }, + { + "epoch": 0.07, + "grad_norm": 8.302236980695053, + "learning_rate": 9.959524750654898e-06, + "loss": 0.8669, + "step": 854 + }, + { + "epoch": 0.07, + "grad_norm": 10.544754941989344, + "learning_rate": 9.959357551965063e-06, + "loss": 0.7252, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 4.99750362486209, + "learning_rate": 9.959190010056458e-06, + "loss": 0.7404, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 36.414827290278545, + "learning_rate": 9.959022124940678e-06, + "loss": 0.7767, + "step": 857 + }, + { + "epoch": 0.07, + "grad_norm": 14.503261023860437, + "learning_rate": 9.958853896629344e-06, + "loss": 0.9081, + "step": 858 + }, + { + "epoch": 0.07, + "grad_norm": 4.67930378717921, + "learning_rate": 9.958685325134097e-06, + "loss": 0.7509, + "step": 859 + }, + { + "epoch": 0.07, + "grad_norm": 4.865634714529107, + "learning_rate": 9.958516410466601e-06, + "loss": 0.8449, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 7.771632641240565, + "learning_rate": 9.95834715263855e-06, + "loss": 0.7373, + "step": 861 + }, + { + "epoch": 0.07, + "grad_norm": 6.04839485681136, + "learning_rate": 9.958177551661655e-06, + "loss": 0.8477, + "step": 862 + }, + { + "epoch": 0.07, + "grad_norm": 19.242405087909226, + "learning_rate": 9.958007607547652e-06, + "loss": 0.8284, + "step": 863 + }, + { + "epoch": 0.07, + "grad_norm": 5.213087523048071, + "learning_rate": 9.957837320308309e-06, + "loss": 0.7162, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 5.418000678399474, + "learning_rate": 9.957666689955403e-06, + "loss": 0.7309, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 7.648405072934473, + "learning_rate": 9.957495716500747e-06, + "loss": 0.9614, + "step": 866 + }, + { + "epoch": 0.07, + "grad_norm": 3.8831454782837795, + "learning_rate": 9.957324399956172e-06, + "loss": 0.737, + "step": 867 + }, + { + "epoch": 0.07, + "grad_norm": 8.28695972505524, + "learning_rate": 9.957152740333534e-06, + "loss": 0.8296, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 5.43908349412972, + "learning_rate": 9.956980737644715e-06, + "loss": 0.9532, + "step": 869 + }, + { + "epoch": 0.07, + "grad_norm": 6.23835385602191, + "learning_rate": 9.956808391901615e-06, + "loss": 0.7306, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 4.725563833646563, + "learning_rate": 9.956635703116166e-06, + "loss": 0.696, + "step": 871 + }, + { + "epoch": 0.07, + "grad_norm": 3.512834238982999, + "learning_rate": 9.956462671300317e-06, + "loss": 0.829, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 3.0551970397151087, + "learning_rate": 9.956289296466041e-06, + "loss": 0.8191, + "step": 873 + }, + { + "epoch": 0.07, + "grad_norm": 4.320361362167167, + "learning_rate": 9.956115578625339e-06, + "loss": 0.7991, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 6.967663473238619, + "learning_rate": 9.955941517790232e-06, + "loss": 0.676, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 5.394789910865541, + "learning_rate": 9.955767113972767e-06, + "loss": 0.761, + "step": 876 + }, + { + "epoch": 0.07, + "grad_norm": 9.614141369274806, + "learning_rate": 9.955592367185015e-06, + "loss": 0.7956, + "step": 877 + }, + { + "epoch": 0.07, + "grad_norm": 4.658583378466397, + "learning_rate": 9.955417277439068e-06, + "loss": 0.8907, + "step": 878 + }, + { + "epoch": 0.07, + "grad_norm": 4.897367126235742, + "learning_rate": 9.955241844747042e-06, + "loss": 0.8763, + "step": 879 + }, + { + "epoch": 0.07, + "grad_norm": 8.290562520715572, + "learning_rate": 9.95506606912108e-06, + "loss": 0.9094, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 15.439530966584819, + "learning_rate": 9.954889950573347e-06, + "loss": 0.783, + "step": 881 + }, + { + "epoch": 0.07, + "grad_norm": 37.29525276950261, + "learning_rate": 9.95471348911603e-06, + "loss": 0.9104, + "step": 882 + }, + { + "epoch": 0.07, + "grad_norm": 5.187187775729593, + "learning_rate": 9.954536684761343e-06, + "loss": 0.9567, + "step": 883 + }, + { + "epoch": 0.07, + "grad_norm": 6.512834614492173, + "learning_rate": 9.95435953752152e-06, + "loss": 0.8454, + "step": 884 + }, + { + "epoch": 0.07, + "grad_norm": 5.405657187936197, + "learning_rate": 9.954182047408823e-06, + "loss": 0.7677, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 6.9931936765485085, + "learning_rate": 9.954004214435533e-06, + "loss": 0.8084, + "step": 886 + }, + { + "epoch": 0.07, + "grad_norm": 5.667047853835735, + "learning_rate": 9.953826038613961e-06, + "loss": 0.7438, + "step": 887 + }, + { + "epoch": 0.07, + "grad_norm": 20.8710846896458, + "learning_rate": 9.953647519956432e-06, + "loss": 0.5489, + "step": 888 + }, + { + "epoch": 0.07, + "grad_norm": 4.589623792937581, + "learning_rate": 9.953468658475305e-06, + "loss": 0.6361, + "step": 889 + }, + { + "epoch": 0.07, + "grad_norm": 5.322500375505189, + "learning_rate": 9.953289454182958e-06, + "loss": 0.7671, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 3.8203693202343194, + "learning_rate": 9.953109907091792e-06, + "loss": 0.7516, + "step": 891 + }, + { + "epoch": 0.07, + "grad_norm": 6.990557318329104, + "learning_rate": 9.952930017214233e-06, + "loss": 0.7722, + "step": 892 + }, + { + "epoch": 0.07, + "grad_norm": 6.7532650792567575, + "learning_rate": 9.95274978456273e-06, + "loss": 0.8809, + "step": 893 + }, + { + "epoch": 0.07, + "grad_norm": 7.508917201796005, + "learning_rate": 9.952569209149757e-06, + "loss": 0.8374, + "step": 894 + }, + { + "epoch": 0.07, + "grad_norm": 5.992657387810211, + "learning_rate": 9.952388290987812e-06, + "loss": 0.7263, + "step": 895 + }, + { + "epoch": 0.07, + "grad_norm": 7.581471653942725, + "learning_rate": 9.95220703008941e-06, + "loss": 0.6452, + "step": 896 + }, + { + "epoch": 0.07, + "grad_norm": 25.857956670490243, + "learning_rate": 9.952025426467105e-06, + "loss": 0.8136, + "step": 897 + }, + { + "epoch": 0.07, + "grad_norm": 5.113196391952603, + "learning_rate": 9.951843480133458e-06, + "loss": 0.8126, + "step": 898 + }, + { + "epoch": 0.07, + "grad_norm": 18.49369519994825, + "learning_rate": 9.951661191101063e-06, + "loss": 0.8243, + "step": 899 + }, + { + "epoch": 0.07, + "grad_norm": 23.873132265642464, + "learning_rate": 9.951478559382536e-06, + "loss": 0.751, + "step": 900 + }, + { + "epoch": 0.07, + "grad_norm": 4.6761732460968295, + "learning_rate": 9.951295584990515e-06, + "loss": 0.7602, + "step": 901 + }, + { + "epoch": 0.07, + "grad_norm": 7.609129030298003, + "learning_rate": 9.951112267937663e-06, + "loss": 0.6253, + "step": 902 + }, + { + "epoch": 0.07, + "grad_norm": 6.786478944356428, + "learning_rate": 9.950928608236668e-06, + "loss": 0.8975, + "step": 903 + }, + { + "epoch": 0.07, + "grad_norm": 5.455662032494263, + "learning_rate": 9.95074460590024e-06, + "loss": 0.8192, + "step": 904 + }, + { + "epoch": 0.07, + "grad_norm": 26.577339739078592, + "learning_rate": 9.950560260941112e-06, + "loss": 0.8423, + "step": 905 + }, + { + "epoch": 0.07, + "grad_norm": 4.321835415456136, + "learning_rate": 9.950375573372042e-06, + "loss": 0.7966, + "step": 906 + }, + { + "epoch": 0.07, + "grad_norm": 5.004862640712375, + "learning_rate": 9.950190543205813e-06, + "loss": 0.6711, + "step": 907 + }, + { + "epoch": 0.07, + "grad_norm": 3.9501277130340715, + "learning_rate": 9.95000517045523e-06, + "loss": 0.7701, + "step": 908 + }, + { + "epoch": 0.07, + "grad_norm": 13.828369329201337, + "learning_rate": 9.949819455133121e-06, + "loss": 0.7923, + "step": 909 + }, + { + "epoch": 0.07, + "grad_norm": 4.881289854963828, + "learning_rate": 9.949633397252339e-06, + "loss": 0.7466, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 4.0183721039686215, + "learning_rate": 9.94944699682576e-06, + "loss": 0.7513, + "step": 911 + }, + { + "epoch": 0.07, + "grad_norm": 4.567117089736312, + "learning_rate": 9.949260253866286e-06, + "loss": 0.8003, + "step": 912 + }, + { + "epoch": 0.07, + "grad_norm": 4.449917123996754, + "learning_rate": 9.949073168386838e-06, + "loss": 0.8539, + "step": 913 + }, + { + "epoch": 0.07, + "grad_norm": 7.388867799156344, + "learning_rate": 9.948885740400365e-06, + "loss": 0.7033, + "step": 914 + }, + { + "epoch": 0.07, + "grad_norm": 20.231811817769845, + "learning_rate": 9.948697969919839e-06, + "loss": 0.7475, + "step": 915 + }, + { + "epoch": 0.07, + "grad_norm": 4.602413427599765, + "learning_rate": 9.948509856958253e-06, + "loss": 0.8719, + "step": 916 + }, + { + "epoch": 0.07, + "grad_norm": 6.117071530642284, + "learning_rate": 9.948321401528625e-06, + "loss": 0.9034, + "step": 917 + }, + { + "epoch": 0.07, + "grad_norm": 19.01094701332023, + "learning_rate": 9.948132603644001e-06, + "loss": 0.8231, + "step": 918 + }, + { + "epoch": 0.07, + "grad_norm": 5.7879819054520105, + "learning_rate": 9.947943463317445e-06, + "loss": 0.6423, + "step": 919 + }, + { + "epoch": 0.07, + "grad_norm": 5.155867668448361, + "learning_rate": 9.947753980562045e-06, + "loss": 0.7194, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 23.22171030764841, + "learning_rate": 9.947564155390916e-06, + "loss": 0.7074, + "step": 921 + }, + { + "epoch": 0.07, + "grad_norm": 6.393907034813833, + "learning_rate": 9.947373987817194e-06, + "loss": 0.7863, + "step": 922 + }, + { + "epoch": 0.07, + "grad_norm": 5.475109094971649, + "learning_rate": 9.947183477854042e-06, + "loss": 0.9534, + "step": 923 + }, + { + "epoch": 0.08, + "grad_norm": 4.53901485891668, + "learning_rate": 9.946992625514646e-06, + "loss": 0.7146, + "step": 924 + }, + { + "epoch": 0.08, + "grad_norm": 13.622381875614638, + "learning_rate": 9.946801430812208e-06, + "loss": 0.8143, + "step": 925 + }, + { + "epoch": 0.08, + "grad_norm": 9.549300713937553, + "learning_rate": 9.946609893759966e-06, + "loss": 0.749, + "step": 926 + }, + { + "epoch": 0.08, + "grad_norm": 6.481220949010384, + "learning_rate": 9.94641801437117e-06, + "loss": 0.763, + "step": 927 + }, + { + "epoch": 0.08, + "grad_norm": 5.56160845553395, + "learning_rate": 9.946225792659104e-06, + "loss": 0.7637, + "step": 928 + }, + { + "epoch": 0.08, + "grad_norm": 6.161101929715245, + "learning_rate": 9.946033228637069e-06, + "loss": 0.9231, + "step": 929 + }, + { + "epoch": 0.08, + "grad_norm": 5.096161531859888, + "learning_rate": 9.945840322318391e-06, + "loss": 0.7357, + "step": 930 + }, + { + "epoch": 0.08, + "grad_norm": 6.544050522382474, + "learning_rate": 9.945647073716422e-06, + "loss": 0.7041, + "step": 931 + }, + { + "epoch": 0.08, + "grad_norm": 7.850436824436384, + "learning_rate": 9.945453482844535e-06, + "loss": 0.8433, + "step": 932 + }, + { + "epoch": 0.08, + "grad_norm": 6.440581462075318, + "learning_rate": 9.945259549716127e-06, + "loss": 0.6464, + "step": 933 + }, + { + "epoch": 0.08, + "grad_norm": 5.340891125007818, + "learning_rate": 9.94506527434462e-06, + "loss": 0.7887, + "step": 934 + }, + { + "epoch": 0.08, + "grad_norm": 7.592498978195759, + "learning_rate": 9.944870656743462e-06, + "loss": 0.8702, + "step": 935 + }, + { + "epoch": 0.08, + "grad_norm": 9.84904867711743, + "learning_rate": 9.944675696926117e-06, + "loss": 0.7181, + "step": 936 + }, + { + "epoch": 0.08, + "grad_norm": 21.662536540182213, + "learning_rate": 9.944480394906079e-06, + "loss": 0.7432, + "step": 937 + }, + { + "epoch": 0.08, + "grad_norm": 7.27272270876898, + "learning_rate": 9.944284750696865e-06, + "loss": 0.7084, + "step": 938 + }, + { + "epoch": 0.08, + "grad_norm": 6.765101245689383, + "learning_rate": 9.944088764312014e-06, + "loss": 0.6646, + "step": 939 + }, + { + "epoch": 0.08, + "grad_norm": 5.518373504631924, + "learning_rate": 9.943892435765093e-06, + "loss": 0.8554, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 3.940540018156056, + "learning_rate": 9.943695765069683e-06, + "loss": 0.8284, + "step": 941 + }, + { + "epoch": 0.08, + "grad_norm": 5.63113714704677, + "learning_rate": 9.943498752239398e-06, + "loss": 0.7775, + "step": 942 + }, + { + "epoch": 0.08, + "grad_norm": 4.503367299437282, + "learning_rate": 9.943301397287874e-06, + "loss": 0.7488, + "step": 943 + }, + { + "epoch": 0.08, + "grad_norm": 6.246638434274801, + "learning_rate": 9.943103700228768e-06, + "loss": 0.7664, + "step": 944 + }, + { + "epoch": 0.08, + "grad_norm": 11.15387398581592, + "learning_rate": 9.942905661075759e-06, + "loss": 0.6623, + "step": 945 + }, + { + "epoch": 0.08, + "grad_norm": 3.8034695725627046, + "learning_rate": 9.942707279842557e-06, + "loss": 0.763, + "step": 946 + }, + { + "epoch": 0.08, + "grad_norm": 5.608412165544444, + "learning_rate": 9.94250855654289e-06, + "loss": 0.7495, + "step": 947 + }, + { + "epoch": 0.08, + "grad_norm": 4.523342986017144, + "learning_rate": 9.942309491190509e-06, + "loss": 0.8043, + "step": 948 + }, + { + "epoch": 0.08, + "grad_norm": 6.493377748582832, + "learning_rate": 9.942110083799192e-06, + "loss": 0.9902, + "step": 949 + }, + { + "epoch": 0.08, + "grad_norm": 10.937879370172757, + "learning_rate": 9.94191033438274e-06, + "loss": 0.7315, + "step": 950 + }, + { + "epoch": 0.08, + "grad_norm": 5.216161601391438, + "learning_rate": 9.941710242954976e-06, + "loss": 0.5672, + "step": 951 + }, + { + "epoch": 0.08, + "grad_norm": 4.703809997189642, + "learning_rate": 9.941509809529746e-06, + "loss": 0.9608, + "step": 952 + }, + { + "epoch": 0.08, + "grad_norm": 5.620467549065916, + "learning_rate": 9.941309034120925e-06, + "loss": 0.7088, + "step": 953 + }, + { + "epoch": 0.08, + "grad_norm": 4.461534523951197, + "learning_rate": 9.941107916742405e-06, + "loss": 0.8243, + "step": 954 + }, + { + "epoch": 0.08, + "grad_norm": 22.560882535644645, + "learning_rate": 9.940906457408103e-06, + "loss": 0.7528, + "step": 955 + }, + { + "epoch": 0.08, + "grad_norm": 4.27339083312216, + "learning_rate": 9.940704656131967e-06, + "loss": 0.7824, + "step": 956 + }, + { + "epoch": 0.08, + "grad_norm": 6.427607352352942, + "learning_rate": 9.940502512927958e-06, + "loss": 0.8037, + "step": 957 + }, + { + "epoch": 0.08, + "grad_norm": 21.486530270408817, + "learning_rate": 9.940300027810067e-06, + "loss": 0.6702, + "step": 958 + }, + { + "epoch": 0.08, + "grad_norm": 3.6136062975255667, + "learning_rate": 9.94009720079231e-06, + "loss": 0.8245, + "step": 959 + }, + { + "epoch": 0.08, + "grad_norm": 5.82086673808323, + "learning_rate": 9.939894031888717e-06, + "loss": 0.7954, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 4.979728493186614, + "learning_rate": 9.939690521113355e-06, + "loss": 0.7076, + "step": 961 + }, + { + "epoch": 0.08, + "grad_norm": 4.12188946976356, + "learning_rate": 9.939486668480306e-06, + "loss": 0.7562, + "step": 962 + }, + { + "epoch": 0.08, + "grad_norm": 4.299141519077421, + "learning_rate": 9.939282474003678e-06, + "loss": 0.8121, + "step": 963 + }, + { + "epoch": 0.08, + "grad_norm": 6.822823660416211, + "learning_rate": 9.939077937697604e-06, + "loss": 0.7428, + "step": 964 + }, + { + "epoch": 0.08, + "grad_norm": 11.154851287993086, + "learning_rate": 9.938873059576235e-06, + "loss": 0.645, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 4.516010444982705, + "learning_rate": 9.938667839653752e-06, + "loss": 0.7897, + "step": 966 + }, + { + "epoch": 0.08, + "grad_norm": 4.313693456095772, + "learning_rate": 9.93846227794436e-06, + "loss": 0.7475, + "step": 967 + }, + { + "epoch": 0.08, + "grad_norm": 7.846145724945153, + "learning_rate": 9.938256374462286e-06, + "loss": 0.716, + "step": 968 + }, + { + "epoch": 0.08, + "grad_norm": 4.385874599495664, + "learning_rate": 9.938050129221773e-06, + "loss": 0.8602, + "step": 969 + }, + { + "epoch": 0.08, + "grad_norm": 3.2789458776551155, + "learning_rate": 9.937843542237099e-06, + "loss": 0.602, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 5.819009004471656, + "learning_rate": 9.937636613522562e-06, + "loss": 0.7621, + "step": 971 + }, + { + "epoch": 0.08, + "grad_norm": 3.9582290627576424, + "learning_rate": 9.93742934309248e-06, + "loss": 0.837, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 7.6323447591746705, + "learning_rate": 9.9372217309612e-06, + "loss": 0.7551, + "step": 973 + }, + { + "epoch": 0.08, + "grad_norm": 3.631259920871801, + "learning_rate": 9.937013777143087e-06, + "loss": 0.6543, + "step": 974 + }, + { + "epoch": 0.08, + "grad_norm": 6.509976918945766, + "learning_rate": 9.936805481652536e-06, + "loss": 0.8063, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 8.516313771294374, + "learning_rate": 9.936596844503962e-06, + "loss": 0.8063, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 11.591748132257733, + "learning_rate": 9.936387865711802e-06, + "loss": 0.8324, + "step": 977 + }, + { + "epoch": 0.08, + "grad_norm": 4.603496334600352, + "learning_rate": 9.936178545290519e-06, + "loss": 0.938, + "step": 978 + }, + { + "epoch": 0.08, + "grad_norm": 4.349682373081811, + "learning_rate": 9.9359688832546e-06, + "loss": 0.6962, + "step": 979 + }, + { + "epoch": 0.08, + "grad_norm": 3.5776018820043127, + "learning_rate": 9.935758879618556e-06, + "loss": 0.7722, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 4.072291376069065, + "learning_rate": 9.93554853439692e-06, + "loss": 0.6594, + "step": 981 + }, + { + "epoch": 0.08, + "grad_norm": 4.592067075313992, + "learning_rate": 9.935337847604246e-06, + "loss": 0.6964, + "step": 982 + }, + { + "epoch": 0.08, + "grad_norm": 2.9351920534778757, + "learning_rate": 9.935126819255119e-06, + "loss": 0.8057, + "step": 983 + }, + { + "epoch": 0.08, + "grad_norm": 3.7762648145261517, + "learning_rate": 9.934915449364141e-06, + "loss": 0.7966, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 6.6685467898552755, + "learning_rate": 9.934703737945944e-06, + "loss": 0.7689, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 3.760959075165353, + "learning_rate": 9.934491685015173e-06, + "loss": 0.7678, + "step": 986 + }, + { + "epoch": 0.08, + "grad_norm": 3.4864339606064476, + "learning_rate": 9.934279290586511e-06, + "loss": 0.618, + "step": 987 + }, + { + "epoch": 0.08, + "grad_norm": 5.228770735384011, + "learning_rate": 9.93406655467465e-06, + "loss": 0.7949, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 3.147651548750773, + "learning_rate": 9.933853477294317e-06, + "loss": 0.7729, + "step": 989 + }, + { + "epoch": 0.08, + "grad_norm": 7.406575738382119, + "learning_rate": 9.93364005846026e-06, + "loss": 0.7157, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 3.6224987419974766, + "learning_rate": 9.933426298187243e-06, + "loss": 0.7651, + "step": 991 + }, + { + "epoch": 0.08, + "grad_norm": 4.119228522175504, + "learning_rate": 9.933212196490063e-06, + "loss": 0.5336, + "step": 992 + }, + { + "epoch": 0.08, + "grad_norm": 8.300633494688958, + "learning_rate": 9.932997753383538e-06, + "loss": 0.8058, + "step": 993 + }, + { + "epoch": 0.08, + "grad_norm": 4.380093926346218, + "learning_rate": 9.932782968882506e-06, + "loss": 0.7643, + "step": 994 + }, + { + "epoch": 0.08, + "grad_norm": 5.201132628181408, + "learning_rate": 9.932567843001835e-06, + "loss": 0.926, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 5.950779583930611, + "learning_rate": 9.932352375756411e-06, + "loss": 0.7841, + "step": 996 + }, + { + "epoch": 0.08, + "grad_norm": 21.05161336474183, + "learning_rate": 9.932136567161145e-06, + "loss": 0.6382, + "step": 997 + }, + { + "epoch": 0.08, + "grad_norm": 9.056181659713767, + "learning_rate": 9.931920417230974e-06, + "loss": 0.729, + "step": 998 + }, + { + "epoch": 0.08, + "grad_norm": 23.988667050010946, + "learning_rate": 9.931703925980856e-06, + "loss": 0.8483, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 9.214668855527453, + "learning_rate": 9.931487093425775e-06, + "loss": 0.6327, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 5.746109229520153, + "learning_rate": 9.931269919580734e-06, + "loss": 0.8844, + "step": 1001 + }, + { + "epoch": 0.08, + "grad_norm": 5.299552832418721, + "learning_rate": 9.931052404460766e-06, + "loss": 0.7639, + "step": 1002 + }, + { + "epoch": 0.08, + "grad_norm": 4.160625810606795, + "learning_rate": 9.930834548080922e-06, + "loss": 0.6231, + "step": 1003 + }, + { + "epoch": 0.08, + "grad_norm": 3.5668914498558024, + "learning_rate": 9.930616350456282e-06, + "loss": 0.6595, + "step": 1004 + }, + { + "epoch": 0.08, + "grad_norm": 4.63681778605169, + "learning_rate": 9.930397811601943e-06, + "loss": 0.801, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 5.496515292026558, + "learning_rate": 9.930178931533032e-06, + "loss": 0.6517, + "step": 1006 + }, + { + "epoch": 0.08, + "grad_norm": 3.5020793942161, + "learning_rate": 9.929959710264695e-06, + "loss": 0.7167, + "step": 1007 + }, + { + "epoch": 0.08, + "grad_norm": 17.1445253482021, + "learning_rate": 9.929740147812106e-06, + "loss": 0.816, + "step": 1008 + }, + { + "epoch": 0.08, + "grad_norm": 11.27795117529141, + "learning_rate": 9.929520244190458e-06, + "loss": 0.7646, + "step": 1009 + }, + { + "epoch": 0.08, + "grad_norm": 5.49549461415629, + "learning_rate": 9.92929999941497e-06, + "loss": 0.6922, + "step": 1010 + }, + { + "epoch": 0.08, + "grad_norm": 4.993225096537147, + "learning_rate": 9.929079413500884e-06, + "loss": 0.9263, + "step": 1011 + }, + { + "epoch": 0.08, + "grad_norm": 3.7330883391512684, + "learning_rate": 9.928858486463467e-06, + "loss": 0.8416, + "step": 1012 + }, + { + "epoch": 0.08, + "grad_norm": 5.40128996639886, + "learning_rate": 9.928637218318009e-06, + "loss": 0.6479, + "step": 1013 + }, + { + "epoch": 0.08, + "grad_norm": 4.0160429141944185, + "learning_rate": 9.928415609079821e-06, + "loss": 0.8747, + "step": 1014 + }, + { + "epoch": 0.08, + "grad_norm": 3.945935170920747, + "learning_rate": 9.92819365876424e-06, + "loss": 0.8002, + "step": 1015 + }, + { + "epoch": 0.08, + "grad_norm": 5.196704350368994, + "learning_rate": 9.927971367386629e-06, + "loss": 0.6667, + "step": 1016 + }, + { + "epoch": 0.08, + "grad_norm": 7.304828327588545, + "learning_rate": 9.92774873496237e-06, + "loss": 0.6251, + "step": 1017 + }, + { + "epoch": 0.08, + "grad_norm": 4.25473957858187, + "learning_rate": 9.927525761506871e-06, + "loss": 0.7353, + "step": 1018 + }, + { + "epoch": 0.08, + "grad_norm": 4.640389992903682, + "learning_rate": 9.927302447035563e-06, + "loss": 0.833, + "step": 1019 + }, + { + "epoch": 0.08, + "grad_norm": 5.760963916229781, + "learning_rate": 9.9270787915639e-06, + "loss": 0.7687, + "step": 1020 + }, + { + "epoch": 0.08, + "grad_norm": 3.842918585443859, + "learning_rate": 9.926854795107363e-06, + "loss": 0.5339, + "step": 1021 + }, + { + "epoch": 0.08, + "grad_norm": 10.666138604444795, + "learning_rate": 9.92663045768145e-06, + "loss": 0.756, + "step": 1022 + }, + { + "epoch": 0.08, + "grad_norm": 3.906078679440604, + "learning_rate": 9.926405779301691e-06, + "loss": 0.6946, + "step": 1023 + }, + { + "epoch": 0.08, + "grad_norm": 3.8490032747580383, + "learning_rate": 9.92618075998363e-06, + "loss": 0.8035, + "step": 1024 + }, + { + "epoch": 0.08, + "grad_norm": 3.083633094743296, + "learning_rate": 9.925955399742845e-06, + "loss": 0.7367, + "step": 1025 + }, + { + "epoch": 0.08, + "grad_norm": 3.593264625777261, + "learning_rate": 9.925729698594931e-06, + "loss": 0.7698, + "step": 1026 + }, + { + "epoch": 0.08, + "grad_norm": 3.8398926563000746, + "learning_rate": 9.925503656555503e-06, + "loss": 0.6662, + "step": 1027 + }, + { + "epoch": 0.08, + "grad_norm": 8.489739430039288, + "learning_rate": 9.925277273640211e-06, + "loss": 0.9429, + "step": 1028 + }, + { + "epoch": 0.08, + "grad_norm": 5.856188681022169, + "learning_rate": 9.925050549864718e-06, + "loss": 0.6923, + "step": 1029 + }, + { + "epoch": 0.08, + "grad_norm": 3.4962359611927623, + "learning_rate": 9.92482348524472e-06, + "loss": 0.8971, + "step": 1030 + }, + { + "epoch": 0.08, + "grad_norm": 4.430716110068405, + "learning_rate": 9.924596079795923e-06, + "loss": 0.7463, + "step": 1031 + }, + { + "epoch": 0.08, + "grad_norm": 3.1610562957822372, + "learning_rate": 9.924368333534072e-06, + "loss": 0.8483, + "step": 1032 + }, + { + "epoch": 0.08, + "grad_norm": 3.8900016458608695, + "learning_rate": 9.924140246474926e-06, + "loss": 0.7393, + "step": 1033 + }, + { + "epoch": 0.08, + "grad_norm": 3.671228713876813, + "learning_rate": 9.923911818634269e-06, + "loss": 0.9098, + "step": 1034 + }, + { + "epoch": 0.08, + "grad_norm": 7.769093234314537, + "learning_rate": 9.92368305002791e-06, + "loss": 0.7433, + "step": 1035 + }, + { + "epoch": 0.08, + "grad_norm": 5.43434876369951, + "learning_rate": 9.923453940671683e-06, + "loss": 0.664, + "step": 1036 + }, + { + "epoch": 0.08, + "grad_norm": 5.72929780125456, + "learning_rate": 9.923224490581443e-06, + "loss": 0.5399, + "step": 1037 + }, + { + "epoch": 0.08, + "grad_norm": 9.94036833098379, + "learning_rate": 9.922994699773068e-06, + "loss": 0.8391, + "step": 1038 + }, + { + "epoch": 0.08, + "grad_norm": 5.117092099920273, + "learning_rate": 9.922764568262464e-06, + "loss": 0.8107, + "step": 1039 + }, + { + "epoch": 0.08, + "grad_norm": 5.613431087025792, + "learning_rate": 9.922534096065552e-06, + "loss": 0.8647, + "step": 1040 + }, + { + "epoch": 0.08, + "grad_norm": 4.64098920179695, + "learning_rate": 9.92230328319829e-06, + "loss": 0.7277, + "step": 1041 + }, + { + "epoch": 0.08, + "grad_norm": 3.0310641486331624, + "learning_rate": 9.922072129676644e-06, + "loss": 0.6895, + "step": 1042 + }, + { + "epoch": 0.08, + "grad_norm": 4.291735516284705, + "learning_rate": 9.921840635516616e-06, + "loss": 0.6512, + "step": 1043 + }, + { + "epoch": 0.08, + "grad_norm": 4.495560680326641, + "learning_rate": 9.921608800734227e-06, + "loss": 0.7719, + "step": 1044 + }, + { + "epoch": 0.08, + "grad_norm": 3.161303808316317, + "learning_rate": 9.921376625345518e-06, + "loss": 0.7885, + "step": 1045 + }, + { + "epoch": 0.08, + "grad_norm": 10.851499882243445, + "learning_rate": 9.921144109366559e-06, + "loss": 0.9069, + "step": 1046 + }, + { + "epoch": 0.09, + "grad_norm": 3.608379955270225, + "learning_rate": 9.920911252813443e-06, + "loss": 0.8589, + "step": 1047 + }, + { + "epoch": 0.09, + "grad_norm": 14.927786675515918, + "learning_rate": 9.920678055702282e-06, + "loss": 0.6949, + "step": 1048 + }, + { + "epoch": 0.09, + "grad_norm": 4.5890084066537655, + "learning_rate": 9.920444518049218e-06, + "loss": 0.6165, + "step": 1049 + }, + { + "epoch": 0.09, + "grad_norm": 4.451038148414518, + "learning_rate": 9.920210639870409e-06, + "loss": 0.8226, + "step": 1050 + }, + { + "epoch": 0.09, + "grad_norm": 2.937558557650849, + "learning_rate": 9.919976421182047e-06, + "loss": 0.7284, + "step": 1051 + }, + { + "epoch": 0.09, + "grad_norm": 4.064327801853339, + "learning_rate": 9.919741862000334e-06, + "loss": 0.7984, + "step": 1052 + }, + { + "epoch": 0.09, + "grad_norm": 8.973358208837215, + "learning_rate": 9.91950696234151e-06, + "loss": 0.6713, + "step": 1053 + }, + { + "epoch": 0.09, + "grad_norm": 7.636967905058214, + "learning_rate": 9.919271722221828e-06, + "loss": 0.8619, + "step": 1054 + }, + { + "epoch": 0.09, + "grad_norm": 5.402826456928093, + "learning_rate": 9.919036141657568e-06, + "loss": 0.829, + "step": 1055 + }, + { + "epoch": 0.09, + "grad_norm": 6.993785916137985, + "learning_rate": 9.918800220665035e-06, + "loss": 0.7523, + "step": 1056 + }, + { + "epoch": 0.09, + "grad_norm": 3.8553929408426555, + "learning_rate": 9.918563959260555e-06, + "loss": 0.7235, + "step": 1057 + }, + { + "epoch": 0.09, + "grad_norm": 6.596063254727656, + "learning_rate": 9.918327357460477e-06, + "loss": 0.7393, + "step": 1058 + }, + { + "epoch": 0.09, + "grad_norm": 4.259353763831047, + "learning_rate": 9.91809041528118e-06, + "loss": 0.7052, + "step": 1059 + }, + { + "epoch": 0.09, + "grad_norm": 3.649189238817127, + "learning_rate": 9.917853132739058e-06, + "loss": 0.7828, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 4.093778551435277, + "learning_rate": 9.917615509850536e-06, + "loss": 0.6905, + "step": 1061 + }, + { + "epoch": 0.09, + "grad_norm": 7.57228672830486, + "learning_rate": 9.917377546632055e-06, + "loss": 0.704, + "step": 1062 + }, + { + "epoch": 0.09, + "grad_norm": 4.889695775191646, + "learning_rate": 9.917139243100088e-06, + "loss": 0.8628, + "step": 1063 + }, + { + "epoch": 0.09, + "grad_norm": 3.3953735733294175, + "learning_rate": 9.91690059927112e-06, + "loss": 0.6576, + "step": 1064 + }, + { + "epoch": 0.09, + "grad_norm": 8.329555407868353, + "learning_rate": 9.916661615161674e-06, + "loss": 0.8183, + "step": 1065 + }, + { + "epoch": 0.09, + "grad_norm": 8.237534015676456, + "learning_rate": 9.916422290788285e-06, + "loss": 0.7328, + "step": 1066 + }, + { + "epoch": 0.09, + "grad_norm": 5.745277757879374, + "learning_rate": 9.916182626167518e-06, + "loss": 0.7609, + "step": 1067 + }, + { + "epoch": 0.09, + "grad_norm": 3.6367273981050947, + "learning_rate": 9.915942621315959e-06, + "loss": 0.5816, + "step": 1068 + }, + { + "epoch": 0.09, + "grad_norm": 4.710848061986129, + "learning_rate": 9.915702276250217e-06, + "loss": 0.7976, + "step": 1069 + }, + { + "epoch": 0.09, + "grad_norm": 3.9552797363841035, + "learning_rate": 9.915461590986926e-06, + "loss": 0.5878, + "step": 1070 + }, + { + "epoch": 0.09, + "grad_norm": 3.90018679865148, + "learning_rate": 9.915220565542743e-06, + "loss": 0.7667, + "step": 1071 + }, + { + "epoch": 0.09, + "grad_norm": 3.915986148623406, + "learning_rate": 9.914979199934346e-06, + "loss": 0.923, + "step": 1072 + }, + { + "epoch": 0.09, + "grad_norm": 3.3974519555701472, + "learning_rate": 9.914737494178442e-06, + "loss": 0.5939, + "step": 1073 + }, + { + "epoch": 0.09, + "grad_norm": 5.059956409317443, + "learning_rate": 9.914495448291758e-06, + "loss": 0.7642, + "step": 1074 + }, + { + "epoch": 0.09, + "grad_norm": 6.082387550847085, + "learning_rate": 9.914253062291044e-06, + "loss": 0.7543, + "step": 1075 + }, + { + "epoch": 0.09, + "grad_norm": 25.154978832787183, + "learning_rate": 9.914010336193077e-06, + "loss": 0.8236, + "step": 1076 + }, + { + "epoch": 0.09, + "grad_norm": 3.9672640135861936, + "learning_rate": 9.913767270014652e-06, + "loss": 0.771, + "step": 1077 + }, + { + "epoch": 0.09, + "grad_norm": 7.638904124693897, + "learning_rate": 9.913523863772592e-06, + "loss": 0.6481, + "step": 1078 + }, + { + "epoch": 0.09, + "grad_norm": 4.028721901556489, + "learning_rate": 9.913280117483745e-06, + "loss": 0.6476, + "step": 1079 + }, + { + "epoch": 0.09, + "grad_norm": 4.375989313113419, + "learning_rate": 9.913036031164975e-06, + "loss": 0.7894, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 4.220239002106115, + "learning_rate": 9.912791604833178e-06, + "loss": 0.8664, + "step": 1081 + }, + { + "epoch": 0.09, + "grad_norm": 4.666463661160009, + "learning_rate": 9.912546838505266e-06, + "loss": 0.6611, + "step": 1082 + }, + { + "epoch": 0.09, + "grad_norm": 6.9154326364342475, + "learning_rate": 9.912301732198184e-06, + "loss": 0.9785, + "step": 1083 + }, + { + "epoch": 0.09, + "grad_norm": 4.277318960057017, + "learning_rate": 9.912056285928891e-06, + "loss": 0.8013, + "step": 1084 + }, + { + "epoch": 0.09, + "grad_norm": 3.079100562094572, + "learning_rate": 9.911810499714373e-06, + "loss": 0.913, + "step": 1085 + }, + { + "epoch": 0.09, + "grad_norm": 4.438400510465831, + "learning_rate": 9.91156437357164e-06, + "loss": 0.7312, + "step": 1086 + }, + { + "epoch": 0.09, + "grad_norm": 4.14487571794865, + "learning_rate": 9.91131790751773e-06, + "loss": 0.9048, + "step": 1087 + }, + { + "epoch": 0.09, + "grad_norm": 4.547231442082386, + "learning_rate": 9.911071101569694e-06, + "loss": 0.8275, + "step": 1088 + }, + { + "epoch": 0.09, + "grad_norm": 5.7787973842987554, + "learning_rate": 9.910823955744615e-06, + "loss": 0.6561, + "step": 1089 + }, + { + "epoch": 0.09, + "grad_norm": 3.4428992464361463, + "learning_rate": 9.910576470059598e-06, + "loss": 0.6894, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 6.076962340195229, + "learning_rate": 9.91032864453177e-06, + "loss": 0.808, + "step": 1091 + }, + { + "epoch": 0.09, + "grad_norm": 5.985556415986422, + "learning_rate": 9.910080479178282e-06, + "loss": 0.7723, + "step": 1092 + }, + { + "epoch": 0.09, + "grad_norm": 5.3835831629937285, + "learning_rate": 9.90983197401631e-06, + "loss": 0.9289, + "step": 1093 + }, + { + "epoch": 0.09, + "grad_norm": 5.5393749699238315, + "learning_rate": 9.909583129063046e-06, + "loss": 0.9609, + "step": 1094 + }, + { + "epoch": 0.09, + "grad_norm": 4.323564617243667, + "learning_rate": 9.90933394433572e-06, + "loss": 0.7551, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 4.5267962554846095, + "learning_rate": 9.909084419851571e-06, + "loss": 0.7603, + "step": 1096 + }, + { + "epoch": 0.09, + "grad_norm": 3.9995133224500856, + "learning_rate": 9.90883455562787e-06, + "loss": 0.8373, + "step": 1097 + }, + { + "epoch": 0.09, + "grad_norm": 9.165032274693864, + "learning_rate": 9.908584351681911e-06, + "loss": 0.8055, + "step": 1098 + }, + { + "epoch": 0.09, + "grad_norm": 3.7727125927230594, + "learning_rate": 9.908333808031007e-06, + "loss": 0.703, + "step": 1099 + }, + { + "epoch": 0.09, + "grad_norm": 5.820870400258755, + "learning_rate": 9.908082924692499e-06, + "loss": 0.7268, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 7.165965543667079, + "learning_rate": 9.907831701683747e-06, + "loss": 0.7373, + "step": 1101 + }, + { + "epoch": 0.09, + "grad_norm": 7.652492053681831, + "learning_rate": 9.907580139022139e-06, + "loss": 0.8142, + "step": 1102 + }, + { + "epoch": 0.09, + "grad_norm": 3.149190540920497, + "learning_rate": 9.907328236725086e-06, + "loss": 0.803, + "step": 1103 + }, + { + "epoch": 0.09, + "grad_norm": 3.483704993006792, + "learning_rate": 9.90707599481002e-06, + "loss": 0.7613, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 5.172405368788908, + "learning_rate": 9.906823413294398e-06, + "loss": 0.8217, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 3.5411551906184147, + "learning_rate": 9.906570492195698e-06, + "loss": 0.9043, + "step": 1106 + }, + { + "epoch": 0.09, + "grad_norm": 5.486225917593387, + "learning_rate": 9.906317231531427e-06, + "loss": 0.6923, + "step": 1107 + }, + { + "epoch": 0.09, + "grad_norm": 15.853957582324012, + "learning_rate": 9.906063631319111e-06, + "loss": 0.7069, + "step": 1108 + }, + { + "epoch": 0.09, + "grad_norm": 3.6877238672400647, + "learning_rate": 9.9058096915763e-06, + "loss": 0.8677, + "step": 1109 + }, + { + "epoch": 0.09, + "grad_norm": 4.9036420214887295, + "learning_rate": 9.905555412320569e-06, + "loss": 0.6888, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 6.166212071831924, + "learning_rate": 9.905300793569515e-06, + "loss": 0.8216, + "step": 1111 + }, + { + "epoch": 0.09, + "grad_norm": 4.8565221167143955, + "learning_rate": 9.90504583534076e-06, + "loss": 0.7228, + "step": 1112 + }, + { + "epoch": 0.09, + "grad_norm": 5.336056210710972, + "learning_rate": 9.904790537651949e-06, + "loss": 0.612, + "step": 1113 + }, + { + "epoch": 0.09, + "grad_norm": 2.871625813259238, + "learning_rate": 9.904534900520748e-06, + "loss": 0.7724, + "step": 1114 + }, + { + "epoch": 0.09, + "grad_norm": 8.531092644906606, + "learning_rate": 9.904278923964851e-06, + "loss": 0.7963, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 3.737463118053625, + "learning_rate": 9.904022608001975e-06, + "loss": 0.8101, + "step": 1116 + }, + { + "epoch": 0.09, + "grad_norm": 3.2928231451485708, + "learning_rate": 9.903765952649854e-06, + "loss": 0.6575, + "step": 1117 + }, + { + "epoch": 0.09, + "grad_norm": 4.302129382605941, + "learning_rate": 9.903508957926253e-06, + "loss": 0.89, + "step": 1118 + }, + { + "epoch": 0.09, + "grad_norm": 3.38020190482699, + "learning_rate": 9.903251623848957e-06, + "loss": 0.7938, + "step": 1119 + }, + { + "epoch": 0.09, + "grad_norm": 4.7240983048709975, + "learning_rate": 9.902993950435776e-06, + "loss": 0.5785, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 3.4218352497309037, + "learning_rate": 9.902735937704541e-06, + "loss": 0.7044, + "step": 1121 + }, + { + "epoch": 0.09, + "grad_norm": 4.475443915494931, + "learning_rate": 9.902477585673109e-06, + "loss": 0.9424, + "step": 1122 + }, + { + "epoch": 0.09, + "grad_norm": 3.6710225942063683, + "learning_rate": 9.902218894359359e-06, + "loss": 0.7194, + "step": 1123 + }, + { + "epoch": 0.09, + "grad_norm": 3.546319129901945, + "learning_rate": 9.901959863781195e-06, + "loss": 0.8076, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 3.3664768878646094, + "learning_rate": 9.901700493956544e-06, + "loss": 0.7384, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 2.806530483700508, + "learning_rate": 9.901440784903354e-06, + "loss": 0.6184, + "step": 1126 + }, + { + "epoch": 0.09, + "grad_norm": 3.5366917484214175, + "learning_rate": 9.9011807366396e-06, + "loss": 0.8521, + "step": 1127 + }, + { + "epoch": 0.09, + "grad_norm": 7.141202850953822, + "learning_rate": 9.900920349183278e-06, + "loss": 0.8012, + "step": 1128 + }, + { + "epoch": 0.09, + "grad_norm": 3.265130767977999, + "learning_rate": 9.90065962255241e-06, + "loss": 0.8052, + "step": 1129 + }, + { + "epoch": 0.09, + "grad_norm": 4.680723499730134, + "learning_rate": 9.900398556765038e-06, + "loss": 0.7557, + "step": 1130 + }, + { + "epoch": 0.09, + "grad_norm": 11.458425188798179, + "learning_rate": 9.900137151839233e-06, + "loss": 0.59, + "step": 1131 + }, + { + "epoch": 0.09, + "grad_norm": 6.148770459949628, + "learning_rate": 9.89987540779308e-06, + "loss": 0.8973, + "step": 1132 + }, + { + "epoch": 0.09, + "grad_norm": 2.497490234921751, + "learning_rate": 9.8996133246447e-06, + "loss": 0.6229, + "step": 1133 + }, + { + "epoch": 0.09, + "grad_norm": 3.7581806057091853, + "learning_rate": 9.899350902412224e-06, + "loss": 0.785, + "step": 1134 + }, + { + "epoch": 0.09, + "grad_norm": 3.632708287602974, + "learning_rate": 9.899088141113819e-06, + "loss": 0.7011, + "step": 1135 + }, + { + "epoch": 0.09, + "grad_norm": 2.703635349152616, + "learning_rate": 9.898825040767666e-06, + "loss": 0.7454, + "step": 1136 + }, + { + "epoch": 0.09, + "grad_norm": 3.6772863618608467, + "learning_rate": 9.898561601391977e-06, + "loss": 0.859, + "step": 1137 + }, + { + "epoch": 0.09, + "grad_norm": 16.065177253473, + "learning_rate": 9.898297823004979e-06, + "loss": 0.679, + "step": 1138 + }, + { + "epoch": 0.09, + "grad_norm": 3.1070934550836493, + "learning_rate": 9.898033705624928e-06, + "loss": 0.7288, + "step": 1139 + }, + { + "epoch": 0.09, + "grad_norm": 3.7684569781107493, + "learning_rate": 9.897769249270106e-06, + "loss": 0.7278, + "step": 1140 + }, + { + "epoch": 0.09, + "grad_norm": 4.587958546895321, + "learning_rate": 9.897504453958815e-06, + "loss": 0.6837, + "step": 1141 + }, + { + "epoch": 0.09, + "grad_norm": 3.0571552864798477, + "learning_rate": 9.897239319709375e-06, + "loss": 0.8673, + "step": 1142 + }, + { + "epoch": 0.09, + "grad_norm": 3.403443207430614, + "learning_rate": 9.896973846540142e-06, + "loss": 0.6961, + "step": 1143 + }, + { + "epoch": 0.09, + "grad_norm": 6.893497510576452, + "learning_rate": 9.896708034469482e-06, + "loss": 0.6644, + "step": 1144 + }, + { + "epoch": 0.09, + "grad_norm": 4.622753770607722, + "learning_rate": 9.896441883515794e-06, + "loss": 0.6017, + "step": 1145 + }, + { + "epoch": 0.09, + "grad_norm": 3.512599040118152, + "learning_rate": 9.896175393697499e-06, + "loss": 0.7315, + "step": 1146 + }, + { + "epoch": 0.09, + "grad_norm": 8.982560874807408, + "learning_rate": 9.895908565033036e-06, + "loss": 0.7207, + "step": 1147 + }, + { + "epoch": 0.09, + "grad_norm": 9.634916335681975, + "learning_rate": 9.895641397540874e-06, + "loss": 0.8745, + "step": 1148 + }, + { + "epoch": 0.09, + "grad_norm": 3.113997486510584, + "learning_rate": 9.895373891239502e-06, + "loss": 0.8938, + "step": 1149 + }, + { + "epoch": 0.09, + "grad_norm": 8.092991688453978, + "learning_rate": 9.895106046147432e-06, + "loss": 0.8294, + "step": 1150 + }, + { + "epoch": 0.09, + "grad_norm": 5.173684016647621, + "learning_rate": 9.894837862283201e-06, + "loss": 0.9007, + "step": 1151 + }, + { + "epoch": 0.09, + "grad_norm": 2.6763507861653935, + "learning_rate": 9.894569339665372e-06, + "loss": 0.785, + "step": 1152 + }, + { + "epoch": 0.09, + "grad_norm": 3.6301244824476955, + "learning_rate": 9.894300478312524e-06, + "loss": 0.9116, + "step": 1153 + }, + { + "epoch": 0.09, + "grad_norm": 4.32987022976241, + "learning_rate": 9.894031278243266e-06, + "loss": 0.7559, + "step": 1154 + }, + { + "epoch": 0.09, + "grad_norm": 9.034278671980887, + "learning_rate": 9.89376173947623e-06, + "loss": 0.6825, + "step": 1155 + }, + { + "epoch": 0.09, + "grad_norm": 5.249049503838335, + "learning_rate": 9.893491862030065e-06, + "loss": 0.671, + "step": 1156 + }, + { + "epoch": 0.09, + "grad_norm": 4.120151287262343, + "learning_rate": 9.893221645923452e-06, + "loss": 0.5626, + "step": 1157 + }, + { + "epoch": 0.09, + "grad_norm": 5.066088791253685, + "learning_rate": 9.892951091175093e-06, + "loss": 0.8943, + "step": 1158 + }, + { + "epoch": 0.09, + "grad_norm": 8.783782213292914, + "learning_rate": 9.892680197803707e-06, + "loss": 0.8003, + "step": 1159 + }, + { + "epoch": 0.09, + "grad_norm": 3.201175712678574, + "learning_rate": 9.892408965828046e-06, + "loss": 0.5925, + "step": 1160 + }, + { + "epoch": 0.09, + "grad_norm": 4.661309716128301, + "learning_rate": 9.89213739526688e-06, + "loss": 0.7266, + "step": 1161 + }, + { + "epoch": 0.09, + "grad_norm": 3.0405354335454304, + "learning_rate": 9.891865486139002e-06, + "loss": 0.704, + "step": 1162 + }, + { + "epoch": 0.09, + "grad_norm": 4.838473687401868, + "learning_rate": 9.89159323846323e-06, + "loss": 0.7643, + "step": 1163 + }, + { + "epoch": 0.09, + "grad_norm": 3.8642295887465643, + "learning_rate": 9.891320652258406e-06, + "loss": 0.8438, + "step": 1164 + }, + { + "epoch": 0.09, + "grad_norm": 2.6984076540239452, + "learning_rate": 9.891047727543398e-06, + "loss": 0.7953, + "step": 1165 + }, + { + "epoch": 0.09, + "grad_norm": 3.7914722923979927, + "learning_rate": 9.890774464337086e-06, + "loss": 0.6631, + "step": 1166 + }, + { + "epoch": 0.09, + "grad_norm": 5.682970295545688, + "learning_rate": 9.890500862658387e-06, + "loss": 0.5272, + "step": 1167 + }, + { + "epoch": 0.09, + "grad_norm": 4.661819920465713, + "learning_rate": 9.890226922526238e-06, + "loss": 0.7997, + "step": 1168 + }, + { + "epoch": 0.09, + "grad_norm": 9.056706508755106, + "learning_rate": 9.889952643959592e-06, + "loss": 0.605, + "step": 1169 + }, + { + "epoch": 0.1, + "grad_norm": 5.08371783545947, + "learning_rate": 9.889678026977435e-06, + "loss": 0.8474, + "step": 1170 + }, + { + "epoch": 0.1, + "grad_norm": 2.88028529351079, + "learning_rate": 9.889403071598769e-06, + "loss": 0.7543, + "step": 1171 + }, + { + "epoch": 0.1, + "grad_norm": 3.1051354067412933, + "learning_rate": 9.889127777842624e-06, + "loss": 0.7966, + "step": 1172 + }, + { + "epoch": 0.1, + "grad_norm": 4.529648349875851, + "learning_rate": 9.888852145728054e-06, + "loss": 0.8149, + "step": 1173 + }, + { + "epoch": 0.1, + "grad_norm": 5.820747980845195, + "learning_rate": 9.888576175274132e-06, + "loss": 0.7026, + "step": 1174 + }, + { + "epoch": 0.1, + "grad_norm": 2.4140725505470724, + "learning_rate": 9.888299866499957e-06, + "loss": 0.7341, + "step": 1175 + }, + { + "epoch": 0.1, + "grad_norm": 8.811083069309303, + "learning_rate": 9.888023219424653e-06, + "loss": 0.6041, + "step": 1176 + }, + { + "epoch": 0.1, + "grad_norm": 3.4939161375008956, + "learning_rate": 9.887746234067363e-06, + "loss": 0.8145, + "step": 1177 + }, + { + "epoch": 0.1, + "grad_norm": 3.645271050372457, + "learning_rate": 9.88746891044726e-06, + "loss": 0.7544, + "step": 1178 + }, + { + "epoch": 0.1, + "grad_norm": 3.837622379800246, + "learning_rate": 9.887191248583532e-06, + "loss": 0.7722, + "step": 1179 + }, + { + "epoch": 0.1, + "grad_norm": 3.500485781262857, + "learning_rate": 9.8869132484954e-06, + "loss": 0.7321, + "step": 1180 + }, + { + "epoch": 0.1, + "grad_norm": 2.989167239484751, + "learning_rate": 9.8866349102021e-06, + "loss": 0.7465, + "step": 1181 + }, + { + "epoch": 0.1, + "grad_norm": 3.2981052822821355, + "learning_rate": 9.886356233722894e-06, + "loss": 0.7189, + "step": 1182 + }, + { + "epoch": 0.1, + "grad_norm": 3.1438841463435474, + "learning_rate": 9.886077219077071e-06, + "loss": 0.7959, + "step": 1183 + }, + { + "epoch": 0.1, + "grad_norm": 23.913583349958333, + "learning_rate": 9.885797866283937e-06, + "loss": 0.7115, + "step": 1184 + }, + { + "epoch": 0.1, + "grad_norm": 42.709794356624315, + "learning_rate": 9.88551817536283e-06, + "loss": 0.7295, + "step": 1185 + }, + { + "epoch": 0.1, + "grad_norm": 5.256986148891334, + "learning_rate": 9.8852381463331e-06, + "loss": 0.702, + "step": 1186 + }, + { + "epoch": 0.1, + "grad_norm": 3.946855501441445, + "learning_rate": 9.884957779214133e-06, + "loss": 0.8018, + "step": 1187 + }, + { + "epoch": 0.1, + "grad_norm": 3.898297967568548, + "learning_rate": 9.884677074025329e-06, + "loss": 0.8004, + "step": 1188 + }, + { + "epoch": 0.1, + "grad_norm": 5.380857753666877, + "learning_rate": 9.884396030786116e-06, + "loss": 0.6774, + "step": 1189 + }, + { + "epoch": 0.1, + "grad_norm": 3.126318798216784, + "learning_rate": 9.88411464951594e-06, + "loss": 0.7404, + "step": 1190 + }, + { + "epoch": 0.1, + "grad_norm": 4.154249770978452, + "learning_rate": 9.88383293023428e-06, + "loss": 0.8671, + "step": 1191 + }, + { + "epoch": 0.1, + "grad_norm": 4.422385190539039, + "learning_rate": 9.883550872960629e-06, + "loss": 0.7488, + "step": 1192 + }, + { + "epoch": 0.1, + "grad_norm": 3.677109055124081, + "learning_rate": 9.883268477714508e-06, + "loss": 0.7291, + "step": 1193 + }, + { + "epoch": 0.1, + "grad_norm": 4.22588489269259, + "learning_rate": 9.882985744515461e-06, + "loss": 0.7681, + "step": 1194 + }, + { + "epoch": 0.1, + "grad_norm": 11.721689097909122, + "learning_rate": 9.882702673383056e-06, + "loss": 0.8409, + "step": 1195 + }, + { + "epoch": 0.1, + "grad_norm": 7.628247655467624, + "learning_rate": 9.88241926433688e-06, + "loss": 0.6905, + "step": 1196 + }, + { + "epoch": 0.1, + "grad_norm": 3.489912960548631, + "learning_rate": 9.88213551739655e-06, + "loss": 0.7578, + "step": 1197 + }, + { + "epoch": 0.1, + "grad_norm": 4.736191386382483, + "learning_rate": 9.8818514325817e-06, + "loss": 0.7734, + "step": 1198 + }, + { + "epoch": 0.1, + "grad_norm": 3.3029953110072117, + "learning_rate": 9.881567009911995e-06, + "loss": 0.7993, + "step": 1199 + }, + { + "epoch": 0.1, + "grad_norm": 3.37304964960025, + "learning_rate": 9.881282249407114e-06, + "loss": 0.7551, + "step": 1200 + }, + { + "epoch": 0.1, + "grad_norm": 2.847014608114322, + "learning_rate": 9.880997151086767e-06, + "loss": 0.6851, + "step": 1201 + }, + { + "epoch": 0.1, + "grad_norm": 7.602963107318313, + "learning_rate": 9.880711714970682e-06, + "loss": 0.8794, + "step": 1202 + }, + { + "epoch": 0.1, + "grad_norm": 18.143295739591395, + "learning_rate": 9.880425941078617e-06, + "loss": 0.7016, + "step": 1203 + }, + { + "epoch": 0.1, + "grad_norm": 2.9908623486806176, + "learning_rate": 9.880139829430346e-06, + "loss": 0.7635, + "step": 1204 + }, + { + "epoch": 0.1, + "grad_norm": 3.7187187436499762, + "learning_rate": 9.879853380045672e-06, + "loss": 0.885, + "step": 1205 + }, + { + "epoch": 0.1, + "grad_norm": 2.987013273660116, + "learning_rate": 9.879566592944417e-06, + "loss": 0.8207, + "step": 1206 + }, + { + "epoch": 0.1, + "grad_norm": 3.034792493197225, + "learning_rate": 9.87927946814643e-06, + "loss": 0.7653, + "step": 1207 + }, + { + "epoch": 0.1, + "grad_norm": 8.445968639494074, + "learning_rate": 9.878992005671581e-06, + "loss": 0.6941, + "step": 1208 + }, + { + "epoch": 0.1, + "grad_norm": 2.8062090327018088, + "learning_rate": 9.878704205539765e-06, + "loss": 0.7038, + "step": 1209 + }, + { + "epoch": 0.1, + "grad_norm": 10.61830280204193, + "learning_rate": 9.878416067770898e-06, + "loss": 0.7423, + "step": 1210 + }, + { + "epoch": 0.1, + "grad_norm": 5.829245003002502, + "learning_rate": 9.878127592384923e-06, + "loss": 0.7727, + "step": 1211 + }, + { + "epoch": 0.1, + "grad_norm": 4.393883352997549, + "learning_rate": 9.877838779401803e-06, + "loss": 0.6695, + "step": 1212 + }, + { + "epoch": 0.1, + "grad_norm": 3.2174716737054845, + "learning_rate": 9.877549628841528e-06, + "loss": 0.712, + "step": 1213 + }, + { + "epoch": 0.1, + "grad_norm": 4.90407546886971, + "learning_rate": 9.877260140724104e-06, + "loss": 0.6899, + "step": 1214 + }, + { + "epoch": 0.1, + "grad_norm": 3.2478865697461625, + "learning_rate": 9.87697031506957e-06, + "loss": 0.7064, + "step": 1215 + }, + { + "epoch": 0.1, + "grad_norm": 3.9199303918916093, + "learning_rate": 9.876680151897981e-06, + "loss": 0.8831, + "step": 1216 + }, + { + "epoch": 0.1, + "grad_norm": 6.258564186550168, + "learning_rate": 9.87638965122942e-06, + "loss": 0.6719, + "step": 1217 + }, + { + "epoch": 0.1, + "grad_norm": 3.7664582403507896, + "learning_rate": 9.876098813083993e-06, + "loss": 0.6672, + "step": 1218 + }, + { + "epoch": 0.1, + "grad_norm": 3.6093551415355383, + "learning_rate": 9.875807637481825e-06, + "loss": 0.7742, + "step": 1219 + }, + { + "epoch": 0.1, + "grad_norm": 7.480689390255201, + "learning_rate": 9.875516124443064e-06, + "loss": 0.7473, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 2.825691151118583, + "learning_rate": 9.875224273987893e-06, + "loss": 0.69, + "step": 1221 + }, + { + "epoch": 0.1, + "grad_norm": 6.473230991173139, + "learning_rate": 9.874932086136503e-06, + "loss": 0.7992, + "step": 1222 + }, + { + "epoch": 0.1, + "grad_norm": 11.567424305188858, + "learning_rate": 9.874639560909118e-06, + "loss": 0.6756, + "step": 1223 + }, + { + "epoch": 0.1, + "grad_norm": 6.1197661778074774, + "learning_rate": 9.874346698325983e-06, + "loss": 0.9457, + "step": 1224 + }, + { + "epoch": 0.1, + "grad_norm": 5.401258383044548, + "learning_rate": 9.874053498407365e-06, + "loss": 0.6093, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 3.068493519169048, + "learning_rate": 9.873759961173554e-06, + "loss": 0.7869, + "step": 1226 + }, + { + "epoch": 0.1, + "grad_norm": 7.119172691670688, + "learning_rate": 9.873466086644867e-06, + "loss": 0.7752, + "step": 1227 + }, + { + "epoch": 0.1, + "grad_norm": 6.845706370348848, + "learning_rate": 9.87317187484164e-06, + "loss": 0.6347, + "step": 1228 + }, + { + "epoch": 0.1, + "grad_norm": 5.18166262147119, + "learning_rate": 9.872877325784235e-06, + "loss": 0.737, + "step": 1229 + }, + { + "epoch": 0.1, + "grad_norm": 5.23514363104195, + "learning_rate": 9.87258243949304e-06, + "loss": 0.8877, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 5.930707484437629, + "learning_rate": 9.872287215988456e-06, + "loss": 0.5708, + "step": 1231 + }, + { + "epoch": 0.1, + "grad_norm": 3.629782001284931, + "learning_rate": 9.87199165529092e-06, + "loss": 0.7978, + "step": 1232 + }, + { + "epoch": 0.1, + "grad_norm": 6.204077121038583, + "learning_rate": 9.871695757420885e-06, + "loss": 0.7749, + "step": 1233 + }, + { + "epoch": 0.1, + "grad_norm": 4.368812882976357, + "learning_rate": 9.871399522398828e-06, + "loss": 0.71, + "step": 1234 + }, + { + "epoch": 0.1, + "grad_norm": 2.9552147719298585, + "learning_rate": 9.87110295024525e-06, + "loss": 0.6845, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 4.222280327890182, + "learning_rate": 9.870806040980679e-06, + "loss": 0.8664, + "step": 1236 + }, + { + "epoch": 0.1, + "grad_norm": 3.921781730860754, + "learning_rate": 9.870508794625662e-06, + "loss": 0.7239, + "step": 1237 + }, + { + "epoch": 0.1, + "grad_norm": 5.521359178989577, + "learning_rate": 9.870211211200766e-06, + "loss": 0.7154, + "step": 1238 + }, + { + "epoch": 0.1, + "grad_norm": 5.086981897236873, + "learning_rate": 9.86991329072659e-06, + "loss": 0.6208, + "step": 1239 + }, + { + "epoch": 0.1, + "grad_norm": 5.122179985145022, + "learning_rate": 9.869615033223752e-06, + "loss": 0.767, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 4.688880231626787, + "learning_rate": 9.869316438712891e-06, + "loss": 0.7438, + "step": 1241 + }, + { + "epoch": 0.1, + "grad_norm": 4.281464908071756, + "learning_rate": 9.869017507214672e-06, + "loss": 0.9089, + "step": 1242 + }, + { + "epoch": 0.1, + "grad_norm": 3.4748085789350203, + "learning_rate": 9.868718238749785e-06, + "loss": 0.6636, + "step": 1243 + }, + { + "epoch": 0.1, + "grad_norm": 4.1330389738677065, + "learning_rate": 9.868418633338938e-06, + "loss": 0.6336, + "step": 1244 + }, + { + "epoch": 0.1, + "grad_norm": 4.071640208100227, + "learning_rate": 9.86811869100287e-06, + "loss": 0.7533, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 4.431312299211099, + "learning_rate": 9.867818411762336e-06, + "loss": 0.6853, + "step": 1246 + }, + { + "epoch": 0.1, + "grad_norm": 3.433496208969935, + "learning_rate": 9.867517795638115e-06, + "loss": 0.6625, + "step": 1247 + }, + { + "epoch": 0.1, + "grad_norm": 6.380602274024577, + "learning_rate": 9.867216842651017e-06, + "loss": 0.784, + "step": 1248 + }, + { + "epoch": 0.1, + "grad_norm": 16.182759959644432, + "learning_rate": 9.866915552821865e-06, + "loss": 0.6352, + "step": 1249 + }, + { + "epoch": 0.1, + "grad_norm": 5.035815541348587, + "learning_rate": 9.866613926171514e-06, + "loss": 0.8515, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 5.090006178253494, + "learning_rate": 9.866311962720835e-06, + "loss": 0.9131, + "step": 1251 + }, + { + "epoch": 0.1, + "grad_norm": 6.816810378484307, + "learning_rate": 9.866009662490727e-06, + "loss": 0.7506, + "step": 1252 + }, + { + "epoch": 0.1, + "grad_norm": 3.743464506369369, + "learning_rate": 9.865707025502112e-06, + "loss": 0.6838, + "step": 1253 + }, + { + "epoch": 0.1, + "grad_norm": 17.615484855741332, + "learning_rate": 9.865404051775936e-06, + "loss": 0.7172, + "step": 1254 + }, + { + "epoch": 0.1, + "grad_norm": 4.504644141995805, + "learning_rate": 9.86510074133316e-06, + "loss": 0.6782, + "step": 1255 + }, + { + "epoch": 0.1, + "grad_norm": 4.026086493993496, + "learning_rate": 9.864797094194783e-06, + "loss": 0.639, + "step": 1256 + }, + { + "epoch": 0.1, + "grad_norm": 5.273730124505048, + "learning_rate": 9.864493110381816e-06, + "loss": 0.8838, + "step": 1257 + }, + { + "epoch": 0.1, + "grad_norm": 3.9377573821834795, + "learning_rate": 9.864188789915295e-06, + "loss": 0.7895, + "step": 1258 + }, + { + "epoch": 0.1, + "grad_norm": 4.5487719317790285, + "learning_rate": 9.86388413281628e-06, + "loss": 0.8105, + "step": 1259 + }, + { + "epoch": 0.1, + "grad_norm": 3.1291290137230505, + "learning_rate": 9.863579139105862e-06, + "loss": 0.7212, + "step": 1260 + }, + { + "epoch": 0.1, + "grad_norm": 4.790756925984459, + "learning_rate": 9.863273808805141e-06, + "loss": 0.6061, + "step": 1261 + }, + { + "epoch": 0.1, + "grad_norm": 7.465308176691548, + "learning_rate": 9.862968141935252e-06, + "loss": 0.8746, + "step": 1262 + }, + { + "epoch": 0.1, + "grad_norm": 3.6525539309667825, + "learning_rate": 9.862662138517347e-06, + "loss": 0.8401, + "step": 1263 + }, + { + "epoch": 0.1, + "grad_norm": 4.739852255292235, + "learning_rate": 9.862355798572604e-06, + "loss": 0.8153, + "step": 1264 + }, + { + "epoch": 0.1, + "grad_norm": 6.3575762048790105, + "learning_rate": 9.862049122122226e-06, + "loss": 0.8266, + "step": 1265 + }, + { + "epoch": 0.1, + "grad_norm": 3.987603304843584, + "learning_rate": 9.861742109187433e-06, + "loss": 0.7441, + "step": 1266 + }, + { + "epoch": 0.1, + "grad_norm": 12.003326557188275, + "learning_rate": 9.861434759789475e-06, + "loss": 0.7768, + "step": 1267 + }, + { + "epoch": 0.1, + "grad_norm": 3.16378972002077, + "learning_rate": 9.86112707394962e-06, + "loss": 0.8174, + "step": 1268 + }, + { + "epoch": 0.1, + "grad_norm": 4.749024088971492, + "learning_rate": 9.860819051689163e-06, + "loss": 0.6776, + "step": 1269 + }, + { + "epoch": 0.1, + "grad_norm": 3.9307918192554983, + "learning_rate": 9.860510693029424e-06, + "loss": 0.6796, + "step": 1270 + }, + { + "epoch": 0.1, + "grad_norm": 6.731818846528893, + "learning_rate": 9.860201997991739e-06, + "loss": 0.686, + "step": 1271 + }, + { + "epoch": 0.1, + "grad_norm": 4.370986106627797, + "learning_rate": 9.859892966597474e-06, + "loss": 0.6092, + "step": 1272 + }, + { + "epoch": 0.1, + "grad_norm": 4.647759419860096, + "learning_rate": 9.859583598868013e-06, + "loss": 0.7857, + "step": 1273 + }, + { + "epoch": 0.1, + "grad_norm": 2.962857668988205, + "learning_rate": 9.85927389482477e-06, + "loss": 0.6617, + "step": 1274 + }, + { + "epoch": 0.1, + "grad_norm": 6.423050561608751, + "learning_rate": 9.858963854489179e-06, + "loss": 0.6811, + "step": 1275 + }, + { + "epoch": 0.1, + "grad_norm": 3.6952153575177755, + "learning_rate": 9.858653477882691e-06, + "loss": 0.6863, + "step": 1276 + }, + { + "epoch": 0.1, + "grad_norm": 4.0237581720151425, + "learning_rate": 9.858342765026793e-06, + "loss": 0.6368, + "step": 1277 + }, + { + "epoch": 0.1, + "grad_norm": 7.446626041979622, + "learning_rate": 9.858031715942983e-06, + "loss": 0.5983, + "step": 1278 + }, + { + "epoch": 0.1, + "grad_norm": 4.324469182781904, + "learning_rate": 9.857720330652791e-06, + "loss": 0.7926, + "step": 1279 + }, + { + "epoch": 0.1, + "grad_norm": 5.447489173862893, + "learning_rate": 9.857408609177763e-06, + "loss": 0.7575, + "step": 1280 + }, + { + "epoch": 0.1, + "grad_norm": 4.780062264599041, + "learning_rate": 9.857096551539476e-06, + "loss": 0.6093, + "step": 1281 + }, + { + "epoch": 0.1, + "grad_norm": 3.8045104779810845, + "learning_rate": 9.856784157759525e-06, + "loss": 0.7633, + "step": 1282 + }, + { + "epoch": 0.1, + "grad_norm": 4.084174277425293, + "learning_rate": 9.85647142785953e-06, + "loss": 0.8884, + "step": 1283 + }, + { + "epoch": 0.1, + "grad_norm": 17.856027150545184, + "learning_rate": 9.856158361861132e-06, + "loss": 0.6974, + "step": 1284 + }, + { + "epoch": 0.1, + "grad_norm": 3.0808533240744116, + "learning_rate": 9.855844959786e-06, + "loss": 0.6732, + "step": 1285 + }, + { + "epoch": 0.1, + "grad_norm": 3.130692814480986, + "learning_rate": 9.85553122165582e-06, + "loss": 0.7204, + "step": 1286 + }, + { + "epoch": 0.1, + "grad_norm": 3.622360332815775, + "learning_rate": 9.855217147492309e-06, + "loss": 0.6865, + "step": 1287 + }, + { + "epoch": 0.1, + "grad_norm": 4.327705891381432, + "learning_rate": 9.854902737317198e-06, + "loss": 0.7003, + "step": 1288 + }, + { + "epoch": 0.1, + "grad_norm": 2.682869651435079, + "learning_rate": 9.854587991152249e-06, + "loss": 0.5716, + "step": 1289 + }, + { + "epoch": 0.1, + "grad_norm": 3.423295045112559, + "learning_rate": 9.854272909019245e-06, + "loss": 0.9297, + "step": 1290 + }, + { + "epoch": 0.1, + "grad_norm": 4.7457811764820175, + "learning_rate": 9.85395749093999e-06, + "loss": 0.8619, + "step": 1291 + }, + { + "epoch": 0.1, + "grad_norm": 3.5570624935048167, + "learning_rate": 9.853641736936315e-06, + "loss": 0.8548, + "step": 1292 + }, + { + "epoch": 0.11, + "grad_norm": 6.892499974946268, + "learning_rate": 9.853325647030067e-06, + "loss": 0.7379, + "step": 1293 + }, + { + "epoch": 0.11, + "grad_norm": 3.3796158757922488, + "learning_rate": 9.853009221243129e-06, + "loss": 0.779, + "step": 1294 + }, + { + "epoch": 0.11, + "grad_norm": 2.47836428277722, + "learning_rate": 9.852692459597395e-06, + "loss": 0.7652, + "step": 1295 + }, + { + "epoch": 0.11, + "grad_norm": 2.893773105757726, + "learning_rate": 9.852375362114787e-06, + "loss": 0.808, + "step": 1296 + }, + { + "epoch": 0.11, + "grad_norm": 4.927216898149595, + "learning_rate": 9.852057928817252e-06, + "loss": 0.8106, + "step": 1297 + }, + { + "epoch": 0.11, + "grad_norm": 3.890251321374567, + "learning_rate": 9.851740159726755e-06, + "loss": 0.6629, + "step": 1298 + }, + { + "epoch": 0.11, + "grad_norm": 5.677072589303417, + "learning_rate": 9.851422054865292e-06, + "loss": 0.8141, + "step": 1299 + }, + { + "epoch": 0.11, + "grad_norm": 3.0892354294650444, + "learning_rate": 9.851103614254874e-06, + "loss": 0.6687, + "step": 1300 + }, + { + "epoch": 0.11, + "grad_norm": 7.01215476816205, + "learning_rate": 9.850784837917541e-06, + "loss": 0.7888, + "step": 1301 + }, + { + "epoch": 0.11, + "grad_norm": 3.10074205471101, + "learning_rate": 9.850465725875356e-06, + "loss": 0.7357, + "step": 1302 + }, + { + "epoch": 0.11, + "grad_norm": 8.474427065852185, + "learning_rate": 9.8501462781504e-06, + "loss": 0.6398, + "step": 1303 + }, + { + "epoch": 0.11, + "grad_norm": 3.8797306602969823, + "learning_rate": 9.849826494764783e-06, + "loss": 0.682, + "step": 1304 + }, + { + "epoch": 0.11, + "grad_norm": 3.1228451636841297, + "learning_rate": 9.849506375740637e-06, + "loss": 0.6532, + "step": 1305 + }, + { + "epoch": 0.11, + "grad_norm": 5.058043253431571, + "learning_rate": 9.849185921100111e-06, + "loss": 0.7395, + "step": 1306 + }, + { + "epoch": 0.11, + "grad_norm": 3.884543817228299, + "learning_rate": 9.84886513086539e-06, + "loss": 0.7521, + "step": 1307 + }, + { + "epoch": 0.11, + "grad_norm": 3.7404597676394116, + "learning_rate": 9.848544005058668e-06, + "loss": 0.6452, + "step": 1308 + }, + { + "epoch": 0.11, + "grad_norm": 4.03529209310363, + "learning_rate": 9.848222543702175e-06, + "loss": 0.8362, + "step": 1309 + }, + { + "epoch": 0.11, + "grad_norm": 6.319576305206275, + "learning_rate": 9.847900746818153e-06, + "loss": 0.8138, + "step": 1310 + }, + { + "epoch": 0.11, + "grad_norm": 8.301016651406, + "learning_rate": 9.847578614428874e-06, + "loss": 0.7712, + "step": 1311 + }, + { + "epoch": 0.11, + "grad_norm": 4.557192462840488, + "learning_rate": 9.847256146556633e-06, + "loss": 0.9223, + "step": 1312 + }, + { + "epoch": 0.11, + "grad_norm": 3.7930994281395516, + "learning_rate": 9.846933343223746e-06, + "loss": 0.636, + "step": 1313 + }, + { + "epoch": 0.11, + "grad_norm": 2.859412704473073, + "learning_rate": 9.846610204452553e-06, + "loss": 0.6062, + "step": 1314 + }, + { + "epoch": 0.11, + "grad_norm": 5.643984249036258, + "learning_rate": 9.846286730265418e-06, + "loss": 0.8242, + "step": 1315 + }, + { + "epoch": 0.11, + "grad_norm": 2.987864541937237, + "learning_rate": 9.845962920684723e-06, + "loss": 0.766, + "step": 1316 + }, + { + "epoch": 0.11, + "grad_norm": 2.9479138581079813, + "learning_rate": 9.845638775732883e-06, + "loss": 0.9124, + "step": 1317 + }, + { + "epoch": 0.11, + "grad_norm": 3.045048700796494, + "learning_rate": 9.845314295432331e-06, + "loss": 0.6485, + "step": 1318 + }, + { + "epoch": 0.11, + "grad_norm": 5.320703422437716, + "learning_rate": 9.844989479805521e-06, + "loss": 0.7124, + "step": 1319 + }, + { + "epoch": 0.11, + "grad_norm": 4.019287745659873, + "learning_rate": 9.844664328874928e-06, + "loss": 0.7201, + "step": 1320 + }, + { + "epoch": 0.11, + "grad_norm": 3.225188193685589, + "learning_rate": 9.844338842663064e-06, + "loss": 0.6698, + "step": 1321 + }, + { + "epoch": 0.11, + "grad_norm": 5.893974502105138, + "learning_rate": 9.844013021192447e-06, + "loss": 0.5956, + "step": 1322 + }, + { + "epoch": 0.11, + "grad_norm": 3.0719570522862183, + "learning_rate": 9.84368686448563e-06, + "loss": 0.8989, + "step": 1323 + }, + { + "epoch": 0.11, + "grad_norm": 2.9436413276156683, + "learning_rate": 9.84336037256518e-06, + "loss": 0.5827, + "step": 1324 + }, + { + "epoch": 0.11, + "grad_norm": 4.851365162956393, + "learning_rate": 9.8430335454537e-06, + "loss": 0.7802, + "step": 1325 + }, + { + "epoch": 0.11, + "grad_norm": 4.1196957284703695, + "learning_rate": 9.842706383173803e-06, + "loss": 0.9247, + "step": 1326 + }, + { + "epoch": 0.11, + "grad_norm": 5.032915178173537, + "learning_rate": 9.842378885748132e-06, + "loss": 0.6944, + "step": 1327 + }, + { + "epoch": 0.11, + "grad_norm": 3.393978549846444, + "learning_rate": 9.842051053199352e-06, + "loss": 0.6831, + "step": 1328 + }, + { + "epoch": 0.11, + "grad_norm": 3.7570707697960333, + "learning_rate": 9.84172288555015e-06, + "loss": 0.7007, + "step": 1329 + }, + { + "epoch": 0.11, + "grad_norm": 3.0620712391739846, + "learning_rate": 9.84139438282324e-06, + "loss": 0.7323, + "step": 1330 + }, + { + "epoch": 0.11, + "grad_norm": 4.2511517165266115, + "learning_rate": 9.841065545041353e-06, + "loss": 0.6887, + "step": 1331 + }, + { + "epoch": 0.11, + "grad_norm": 4.79888414711986, + "learning_rate": 9.84073637222725e-06, + "loss": 0.8734, + "step": 1332 + }, + { + "epoch": 0.11, + "grad_norm": 4.1563644388943235, + "learning_rate": 9.84040686440371e-06, + "loss": 0.7673, + "step": 1333 + }, + { + "epoch": 0.11, + "grad_norm": 4.282574467026865, + "learning_rate": 9.840077021593538e-06, + "loss": 0.7938, + "step": 1334 + }, + { + "epoch": 0.11, + "grad_norm": 4.164066486128604, + "learning_rate": 9.83974684381956e-06, + "loss": 0.7682, + "step": 1335 + }, + { + "epoch": 0.11, + "grad_norm": 3.592944111014293, + "learning_rate": 9.839416331104625e-06, + "loss": 0.7271, + "step": 1336 + }, + { + "epoch": 0.11, + "grad_norm": 3.9796280426402593, + "learning_rate": 9.83908548347161e-06, + "loss": 0.5406, + "step": 1337 + }, + { + "epoch": 0.11, + "grad_norm": 4.656920266592827, + "learning_rate": 9.838754300943409e-06, + "loss": 0.6151, + "step": 1338 + }, + { + "epoch": 0.11, + "grad_norm": 4.074963637615123, + "learning_rate": 9.838422783542945e-06, + "loss": 0.8296, + "step": 1339 + }, + { + "epoch": 0.11, + "grad_norm": 18.23943022448206, + "learning_rate": 9.838090931293158e-06, + "loss": 0.7592, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 3.2824548550695245, + "learning_rate": 9.837758744217016e-06, + "loss": 0.6474, + "step": 1341 + }, + { + "epoch": 0.11, + "grad_norm": 3.0149532039872264, + "learning_rate": 9.837426222337507e-06, + "loss": 0.7854, + "step": 1342 + }, + { + "epoch": 0.11, + "grad_norm": 5.185941002706444, + "learning_rate": 9.837093365677644e-06, + "loss": 0.7233, + "step": 1343 + }, + { + "epoch": 0.11, + "grad_norm": 3.4361284649067456, + "learning_rate": 9.836760174260465e-06, + "loss": 0.4904, + "step": 1344 + }, + { + "epoch": 0.11, + "grad_norm": 3.2142201547203877, + "learning_rate": 9.836426648109025e-06, + "loss": 0.8148, + "step": 1345 + }, + { + "epoch": 0.11, + "grad_norm": 3.2303286304956362, + "learning_rate": 9.83609278724641e-06, + "loss": 0.6964, + "step": 1346 + }, + { + "epoch": 0.11, + "grad_norm": 4.265667483490254, + "learning_rate": 9.835758591695723e-06, + "loss": 0.9437, + "step": 1347 + }, + { + "epoch": 0.11, + "grad_norm": 2.802017275522471, + "learning_rate": 9.835424061480094e-06, + "loss": 0.9152, + "step": 1348 + }, + { + "epoch": 0.11, + "grad_norm": 4.469253067501435, + "learning_rate": 9.835089196622671e-06, + "loss": 0.8302, + "step": 1349 + }, + { + "epoch": 0.11, + "grad_norm": 2.3663676695982256, + "learning_rate": 9.834753997146633e-06, + "loss": 0.7885, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 4.681566637271748, + "learning_rate": 9.834418463075177e-06, + "loss": 0.7215, + "step": 1351 + }, + { + "epoch": 0.11, + "grad_norm": 2.587681371757967, + "learning_rate": 9.834082594431522e-06, + "loss": 0.8074, + "step": 1352 + }, + { + "epoch": 0.11, + "grad_norm": 4.609762766154074, + "learning_rate": 9.833746391238916e-06, + "loss": 0.7177, + "step": 1353 + }, + { + "epoch": 0.11, + "grad_norm": 5.864333074469486, + "learning_rate": 9.833409853520621e-06, + "loss": 0.7291, + "step": 1354 + }, + { + "epoch": 0.11, + "grad_norm": 4.222122466097027, + "learning_rate": 9.833072981299932e-06, + "loss": 0.6248, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 4.914777826483462, + "learning_rate": 9.83273577460016e-06, + "loss": 0.8035, + "step": 1356 + }, + { + "epoch": 0.11, + "grad_norm": 4.21583162734347, + "learning_rate": 9.832398233444644e-06, + "loss": 0.6882, + "step": 1357 + }, + { + "epoch": 0.11, + "grad_norm": 5.892542672832988, + "learning_rate": 9.832060357856744e-06, + "loss": 0.7175, + "step": 1358 + }, + { + "epoch": 0.11, + "grad_norm": 6.650533048362267, + "learning_rate": 9.83172214785984e-06, + "loss": 0.7258, + "step": 1359 + }, + { + "epoch": 0.11, + "grad_norm": 6.035461930987028, + "learning_rate": 9.83138360347734e-06, + "loss": 0.8521, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 3.8104720262286644, + "learning_rate": 9.831044724732675e-06, + "loss": 0.7937, + "step": 1361 + }, + { + "epoch": 0.11, + "grad_norm": 3.9715382675032522, + "learning_rate": 9.830705511649297e-06, + "loss": 0.7865, + "step": 1362 + }, + { + "epoch": 0.11, + "grad_norm": 12.693449097841741, + "learning_rate": 9.83036596425068e-06, + "loss": 0.91, + "step": 1363 + }, + { + "epoch": 0.11, + "grad_norm": 10.523822309578007, + "learning_rate": 9.830026082560324e-06, + "loss": 0.774, + "step": 1364 + }, + { + "epoch": 0.11, + "grad_norm": 5.1513051221823885, + "learning_rate": 9.82968586660175e-06, + "loss": 0.8242, + "step": 1365 + }, + { + "epoch": 0.11, + "grad_norm": 5.508823435580966, + "learning_rate": 9.829345316398504e-06, + "loss": 0.7704, + "step": 1366 + }, + { + "epoch": 0.11, + "grad_norm": 7.2015623358486245, + "learning_rate": 9.829004431974155e-06, + "loss": 0.8457, + "step": 1367 + }, + { + "epoch": 0.11, + "grad_norm": 8.020546139595686, + "learning_rate": 9.828663213352294e-06, + "loss": 0.7546, + "step": 1368 + }, + { + "epoch": 0.11, + "grad_norm": 3.912217681032362, + "learning_rate": 9.828321660556533e-06, + "loss": 0.8607, + "step": 1369 + }, + { + "epoch": 0.11, + "grad_norm": 6.139422916838143, + "learning_rate": 9.827979773610513e-06, + "loss": 0.6751, + "step": 1370 + }, + { + "epoch": 0.11, + "grad_norm": 3.9158330917625546, + "learning_rate": 9.827637552537893e-06, + "loss": 0.774, + "step": 1371 + }, + { + "epoch": 0.11, + "grad_norm": 4.447669477789021, + "learning_rate": 9.827294997362354e-06, + "loss": 0.6145, + "step": 1372 + }, + { + "epoch": 0.11, + "grad_norm": 3.6150675739777958, + "learning_rate": 9.82695210810761e-06, + "loss": 0.7509, + "step": 1373 + }, + { + "epoch": 0.11, + "grad_norm": 2.9606610833748537, + "learning_rate": 9.826608884797385e-06, + "loss": 0.7393, + "step": 1374 + }, + { + "epoch": 0.11, + "grad_norm": 3.1563997092642953, + "learning_rate": 9.826265327455435e-06, + "loss": 0.825, + "step": 1375 + }, + { + "epoch": 0.11, + "grad_norm": 2.8612181489399884, + "learning_rate": 9.825921436105534e-06, + "loss": 0.6644, + "step": 1376 + }, + { + "epoch": 0.11, + "grad_norm": 4.815457879604574, + "learning_rate": 9.825577210771486e-06, + "loss": 0.755, + "step": 1377 + }, + { + "epoch": 0.11, + "grad_norm": 3.6836468187364555, + "learning_rate": 9.825232651477109e-06, + "loss": 0.7721, + "step": 1378 + }, + { + "epoch": 0.11, + "grad_norm": 3.726206008686008, + "learning_rate": 9.824887758246252e-06, + "loss": 0.6431, + "step": 1379 + }, + { + "epoch": 0.11, + "grad_norm": 4.978849678161101, + "learning_rate": 9.824542531102779e-06, + "loss": 0.7141, + "step": 1380 + }, + { + "epoch": 0.11, + "grad_norm": 5.277986474909162, + "learning_rate": 9.824196970070587e-06, + "loss": 0.7952, + "step": 1381 + }, + { + "epoch": 0.11, + "grad_norm": 4.211976656945047, + "learning_rate": 9.82385107517359e-06, + "loss": 0.7929, + "step": 1382 + }, + { + "epoch": 0.11, + "grad_norm": 3.6968624569747814, + "learning_rate": 9.823504846435722e-06, + "loss": 0.7881, + "step": 1383 + }, + { + "epoch": 0.11, + "grad_norm": 8.60074925097063, + "learning_rate": 9.823158283880949e-06, + "loss": 0.9359, + "step": 1384 + }, + { + "epoch": 0.11, + "grad_norm": 3.504318647545033, + "learning_rate": 9.822811387533256e-06, + "loss": 0.882, + "step": 1385 + }, + { + "epoch": 0.11, + "grad_norm": 2.756782688733998, + "learning_rate": 9.822464157416644e-06, + "loss": 0.5845, + "step": 1386 + }, + { + "epoch": 0.11, + "grad_norm": 2.8280030843040076, + "learning_rate": 9.82211659355515e-06, + "loss": 0.6769, + "step": 1387 + }, + { + "epoch": 0.11, + "grad_norm": 4.460742776352826, + "learning_rate": 9.821768695972824e-06, + "loss": 0.866, + "step": 1388 + }, + { + "epoch": 0.11, + "grad_norm": 6.930601287950963, + "learning_rate": 9.821420464693746e-06, + "loss": 0.6968, + "step": 1389 + }, + { + "epoch": 0.11, + "grad_norm": 4.359791930361724, + "learning_rate": 9.821071899742012e-06, + "loss": 0.7572, + "step": 1390 + }, + { + "epoch": 0.11, + "grad_norm": 5.2867918000178085, + "learning_rate": 9.820723001141746e-06, + "loss": 0.7458, + "step": 1391 + }, + { + "epoch": 0.11, + "grad_norm": 3.1276764961494656, + "learning_rate": 9.820373768917095e-06, + "loss": 0.8062, + "step": 1392 + }, + { + "epoch": 0.11, + "grad_norm": 2.7633204537476272, + "learning_rate": 9.820024203092229e-06, + "loss": 0.6961, + "step": 1393 + }, + { + "epoch": 0.11, + "grad_norm": 4.7491626303353955, + "learning_rate": 9.819674303691338e-06, + "loss": 0.9546, + "step": 1394 + }, + { + "epoch": 0.11, + "grad_norm": 3.8720420922962315, + "learning_rate": 9.819324070738637e-06, + "loss": 0.6164, + "step": 1395 + }, + { + "epoch": 0.11, + "grad_norm": 5.216852710886374, + "learning_rate": 9.818973504258366e-06, + "loss": 0.8797, + "step": 1396 + }, + { + "epoch": 0.11, + "grad_norm": 3.3214315322637327, + "learning_rate": 9.818622604274785e-06, + "loss": 0.8203, + "step": 1397 + }, + { + "epoch": 0.11, + "grad_norm": 4.427877516698093, + "learning_rate": 9.81827137081218e-06, + "loss": 0.7682, + "step": 1398 + }, + { + "epoch": 0.11, + "grad_norm": 2.36370016904225, + "learning_rate": 9.817919803894857e-06, + "loss": 0.6713, + "step": 1399 + }, + { + "epoch": 0.11, + "grad_norm": 4.600179590731129, + "learning_rate": 9.81756790354715e-06, + "loss": 0.7642, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 2.814208092833744, + "learning_rate": 9.817215669793408e-06, + "loss": 0.7536, + "step": 1401 + }, + { + "epoch": 0.11, + "grad_norm": 6.274969998529941, + "learning_rate": 9.81686310265801e-06, + "loss": 0.5873, + "step": 1402 + }, + { + "epoch": 0.11, + "grad_norm": 5.933085071967237, + "learning_rate": 9.816510202165357e-06, + "loss": 0.8715, + "step": 1403 + }, + { + "epoch": 0.11, + "grad_norm": 2.9930877783172627, + "learning_rate": 9.81615696833987e-06, + "loss": 0.8511, + "step": 1404 + }, + { + "epoch": 0.11, + "grad_norm": 10.18151334316766, + "learning_rate": 9.815803401205995e-06, + "loss": 0.7355, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 4.907339405641265, + "learning_rate": 9.815449500788203e-06, + "loss": 0.7398, + "step": 1406 + }, + { + "epoch": 0.11, + "grad_norm": 2.2582056261583925, + "learning_rate": 9.815095267110983e-06, + "loss": 0.5298, + "step": 1407 + }, + { + "epoch": 0.11, + "grad_norm": 2.873666475256101, + "learning_rate": 9.814740700198855e-06, + "loss": 0.7106, + "step": 1408 + }, + { + "epoch": 0.11, + "grad_norm": 9.515688970151894, + "learning_rate": 9.814385800076352e-06, + "loss": 0.7679, + "step": 1409 + }, + { + "epoch": 0.11, + "grad_norm": 2.645885120179381, + "learning_rate": 9.814030566768041e-06, + "loss": 0.6623, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 4.704194459795749, + "learning_rate": 9.8136750002985e-06, + "loss": 0.6011, + "step": 1411 + }, + { + "epoch": 0.11, + "grad_norm": 3.2039376023315893, + "learning_rate": 9.81331910069234e-06, + "loss": 0.5915, + "step": 1412 + }, + { + "epoch": 0.11, + "grad_norm": 6.431963012416284, + "learning_rate": 9.812962867974192e-06, + "loss": 0.6776, + "step": 1413 + }, + { + "epoch": 0.11, + "grad_norm": 4.136702986877205, + "learning_rate": 9.812606302168709e-06, + "loss": 0.6939, + "step": 1414 + }, + { + "epoch": 0.11, + "grad_norm": 4.9833394954813, + "learning_rate": 9.812249403300565e-06, + "loss": 0.8666, + "step": 1415 + }, + { + "epoch": 0.12, + "grad_norm": 4.354514489605485, + "learning_rate": 9.811892171394464e-06, + "loss": 0.7489, + "step": 1416 + }, + { + "epoch": 0.12, + "grad_norm": 2.8001839948301606, + "learning_rate": 9.811534606475127e-06, + "loss": 0.8282, + "step": 1417 + }, + { + "epoch": 0.12, + "grad_norm": 7.215447239892779, + "learning_rate": 9.811176708567295e-06, + "loss": 0.6568, + "step": 1418 + }, + { + "epoch": 0.12, + "grad_norm": 4.437516725787354, + "learning_rate": 9.810818477695745e-06, + "loss": 0.7617, + "step": 1419 + }, + { + "epoch": 0.12, + "grad_norm": 3.1178580634854196, + "learning_rate": 9.810459913885265e-06, + "loss": 0.8128, + "step": 1420 + }, + { + "epoch": 0.12, + "grad_norm": 2.398401994492875, + "learning_rate": 9.81010101716067e-06, + "loss": 0.6911, + "step": 1421 + }, + { + "epoch": 0.12, + "grad_norm": 6.21656955629377, + "learning_rate": 9.809741787546797e-06, + "loss": 0.6232, + "step": 1422 + }, + { + "epoch": 0.12, + "grad_norm": 3.3665971922059055, + "learning_rate": 9.809382225068506e-06, + "loss": 0.7555, + "step": 1423 + }, + { + "epoch": 0.12, + "grad_norm": 4.180964991051591, + "learning_rate": 9.809022329750684e-06, + "loss": 0.7165, + "step": 1424 + }, + { + "epoch": 0.12, + "grad_norm": 5.513670951307397, + "learning_rate": 9.808662101618237e-06, + "loss": 0.7134, + "step": 1425 + }, + { + "epoch": 0.12, + "grad_norm": 5.059842872677863, + "learning_rate": 9.808301540696094e-06, + "loss": 0.7651, + "step": 1426 + }, + { + "epoch": 0.12, + "grad_norm": 4.817648716935748, + "learning_rate": 9.80794064700921e-06, + "loss": 0.815, + "step": 1427 + }, + { + "epoch": 0.12, + "grad_norm": 2.7239279947800816, + "learning_rate": 9.807579420582558e-06, + "loss": 0.7762, + "step": 1428 + }, + { + "epoch": 0.12, + "grad_norm": 3.13670492941162, + "learning_rate": 9.80721786144114e-06, + "loss": 0.6804, + "step": 1429 + }, + { + "epoch": 0.12, + "grad_norm": 7.517447696207459, + "learning_rate": 9.806855969609978e-06, + "loss": 0.8434, + "step": 1430 + }, + { + "epoch": 0.12, + "grad_norm": 2.8457699893758757, + "learning_rate": 9.806493745114117e-06, + "loss": 0.7085, + "step": 1431 + }, + { + "epoch": 0.12, + "grad_norm": 7.218995933348008, + "learning_rate": 9.806131187978623e-06, + "loss": 0.7737, + "step": 1432 + }, + { + "epoch": 0.12, + "grad_norm": 4.243621549976317, + "learning_rate": 9.805768298228589e-06, + "loss": 0.6403, + "step": 1433 + }, + { + "epoch": 0.12, + "grad_norm": 3.887382517738277, + "learning_rate": 9.805405075889129e-06, + "loss": 0.903, + "step": 1434 + }, + { + "epoch": 0.12, + "grad_norm": 3.3395693112018314, + "learning_rate": 9.805041520985382e-06, + "loss": 0.7278, + "step": 1435 + }, + { + "epoch": 0.12, + "grad_norm": 5.903451235150011, + "learning_rate": 9.804677633542506e-06, + "loss": 0.626, + "step": 1436 + }, + { + "epoch": 0.12, + "grad_norm": 3.818325992960267, + "learning_rate": 9.804313413585684e-06, + "loss": 0.6567, + "step": 1437 + }, + { + "epoch": 0.12, + "grad_norm": 4.724132711115311, + "learning_rate": 9.803948861140124e-06, + "loss": 0.6915, + "step": 1438 + }, + { + "epoch": 0.12, + "grad_norm": 3.7630648843591423, + "learning_rate": 9.803583976231054e-06, + "loss": 0.7323, + "step": 1439 + }, + { + "epoch": 0.12, + "grad_norm": 4.97228214912387, + "learning_rate": 9.80321875888373e-06, + "loss": 0.7567, + "step": 1440 + }, + { + "epoch": 0.12, + "grad_norm": 3.9839792879030806, + "learning_rate": 9.802853209123421e-06, + "loss": 0.771, + "step": 1441 + }, + { + "epoch": 0.12, + "grad_norm": 3.647333773792597, + "learning_rate": 9.80248732697543e-06, + "loss": 0.6099, + "step": 1442 + }, + { + "epoch": 0.12, + "grad_norm": 4.183808026716674, + "learning_rate": 9.802121112465075e-06, + "loss": 0.6127, + "step": 1443 + }, + { + "epoch": 0.12, + "grad_norm": 3.346608070926879, + "learning_rate": 9.801754565617705e-06, + "loss": 0.9237, + "step": 1444 + }, + { + "epoch": 0.12, + "grad_norm": 3.2752442217734035, + "learning_rate": 9.801387686458684e-06, + "loss": 0.7318, + "step": 1445 + }, + { + "epoch": 0.12, + "grad_norm": 3.7808130574377232, + "learning_rate": 9.801020475013403e-06, + "loss": 0.7514, + "step": 1446 + }, + { + "epoch": 0.12, + "grad_norm": 4.443561104607242, + "learning_rate": 9.800652931307275e-06, + "loss": 0.7709, + "step": 1447 + }, + { + "epoch": 0.12, + "grad_norm": 4.498737791381994, + "learning_rate": 9.800285055365737e-06, + "loss": 0.7431, + "step": 1448 + }, + { + "epoch": 0.12, + "grad_norm": 5.20600439260958, + "learning_rate": 9.799916847214247e-06, + "loss": 0.7441, + "step": 1449 + }, + { + "epoch": 0.12, + "grad_norm": 3.634669244965309, + "learning_rate": 9.79954830687829e-06, + "loss": 0.8426, + "step": 1450 + }, + { + "epoch": 0.12, + "grad_norm": 5.8541970597774355, + "learning_rate": 9.79917943438337e-06, + "loss": 0.6051, + "step": 1451 + }, + { + "epoch": 0.12, + "grad_norm": 5.608197206055959, + "learning_rate": 9.798810229755013e-06, + "loss": 0.6145, + "step": 1452 + }, + { + "epoch": 0.12, + "grad_norm": 3.6834657894994485, + "learning_rate": 9.798440693018773e-06, + "loss": 0.6379, + "step": 1453 + }, + { + "epoch": 0.12, + "grad_norm": 5.896879105166612, + "learning_rate": 9.798070824200225e-06, + "loss": 0.5685, + "step": 1454 + }, + { + "epoch": 0.12, + "grad_norm": 4.336016558477347, + "learning_rate": 9.797700623324964e-06, + "loss": 0.9523, + "step": 1455 + }, + { + "epoch": 0.12, + "grad_norm": 5.517274002876056, + "learning_rate": 9.797330090418611e-06, + "loss": 0.7818, + "step": 1456 + }, + { + "epoch": 0.12, + "grad_norm": 4.823655865529944, + "learning_rate": 9.796959225506809e-06, + "loss": 0.8967, + "step": 1457 + }, + { + "epoch": 0.12, + "grad_norm": 3.083259607958461, + "learning_rate": 9.796588028615225e-06, + "loss": 0.7802, + "step": 1458 + }, + { + "epoch": 0.12, + "grad_norm": 4.985306766786993, + "learning_rate": 9.796216499769546e-06, + "loss": 0.8343, + "step": 1459 + }, + { + "epoch": 0.12, + "grad_norm": 3.6340494882707577, + "learning_rate": 9.795844638995488e-06, + "loss": 0.8408, + "step": 1460 + }, + { + "epoch": 0.12, + "grad_norm": 3.676162084189054, + "learning_rate": 9.795472446318783e-06, + "loss": 0.8181, + "step": 1461 + }, + { + "epoch": 0.12, + "grad_norm": 3.039025258973637, + "learning_rate": 9.79509992176519e-06, + "loss": 0.6969, + "step": 1462 + }, + { + "epoch": 0.12, + "grad_norm": 3.1368968951337015, + "learning_rate": 9.79472706536049e-06, + "loss": 0.709, + "step": 1463 + }, + { + "epoch": 0.12, + "grad_norm": 5.107027935878554, + "learning_rate": 9.794353877130486e-06, + "loss": 0.8323, + "step": 1464 + }, + { + "epoch": 0.12, + "grad_norm": 2.6985355905232855, + "learning_rate": 9.793980357101007e-06, + "loss": 0.6948, + "step": 1465 + }, + { + "epoch": 0.12, + "grad_norm": 3.520592408347667, + "learning_rate": 9.793606505297901e-06, + "loss": 0.8019, + "step": 1466 + }, + { + "epoch": 0.12, + "grad_norm": 6.025873076477271, + "learning_rate": 9.793232321747041e-06, + "loss": 0.7294, + "step": 1467 + }, + { + "epoch": 0.12, + "grad_norm": 8.228704305364857, + "learning_rate": 9.792857806474326e-06, + "loss": 0.839, + "step": 1468 + }, + { + "epoch": 0.12, + "grad_norm": 5.283289796361565, + "learning_rate": 9.79248295950567e-06, + "loss": 0.7583, + "step": 1469 + }, + { + "epoch": 0.12, + "grad_norm": 3.2365232592925026, + "learning_rate": 9.79210778086702e-06, + "loss": 0.7232, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 3.5610705930577806, + "learning_rate": 9.791732270584337e-06, + "loss": 0.7624, + "step": 1471 + }, + { + "epoch": 0.12, + "grad_norm": 71.87344968782362, + "learning_rate": 9.791356428683609e-06, + "loss": 0.6685, + "step": 1472 + }, + { + "epoch": 0.12, + "grad_norm": 2.6023200462718195, + "learning_rate": 9.790980255190848e-06, + "loss": 0.6391, + "step": 1473 + }, + { + "epoch": 0.12, + "grad_norm": 6.123095231800686, + "learning_rate": 9.790603750132086e-06, + "loss": 0.7494, + "step": 1474 + }, + { + "epoch": 0.12, + "grad_norm": 2.7949851830378085, + "learning_rate": 9.790226913533381e-06, + "loss": 0.861, + "step": 1475 + }, + { + "epoch": 0.12, + "grad_norm": 3.8635255046440697, + "learning_rate": 9.789849745420811e-06, + "loss": 0.7404, + "step": 1476 + }, + { + "epoch": 0.12, + "grad_norm": 2.7249795024375283, + "learning_rate": 9.78947224582048e-06, + "loss": 0.7155, + "step": 1477 + }, + { + "epoch": 0.12, + "grad_norm": 4.319728234195847, + "learning_rate": 9.789094414758512e-06, + "loss": 0.7326, + "step": 1478 + }, + { + "epoch": 0.12, + "grad_norm": 3.9541784743255035, + "learning_rate": 9.788716252261057e-06, + "loss": 0.9948, + "step": 1479 + }, + { + "epoch": 0.12, + "grad_norm": 3.6481944948746183, + "learning_rate": 9.788337758354283e-06, + "loss": 0.6686, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 4.33823988503373, + "learning_rate": 9.787958933064388e-06, + "loss": 0.7917, + "step": 1481 + }, + { + "epoch": 0.12, + "grad_norm": 3.659800176796344, + "learning_rate": 9.787579776417588e-06, + "loss": 0.946, + "step": 1482 + }, + { + "epoch": 0.12, + "grad_norm": 12.986213911061727, + "learning_rate": 9.78720028844012e-06, + "loss": 0.7908, + "step": 1483 + }, + { + "epoch": 0.12, + "grad_norm": 6.70633643922088, + "learning_rate": 9.786820469158252e-06, + "loss": 0.7295, + "step": 1484 + }, + { + "epoch": 0.12, + "grad_norm": 3.3918464495622973, + "learning_rate": 9.786440318598264e-06, + "loss": 0.8559, + "step": 1485 + }, + { + "epoch": 0.12, + "grad_norm": 6.207525068754861, + "learning_rate": 9.78605983678647e-06, + "loss": 0.6354, + "step": 1486 + }, + { + "epoch": 0.12, + "grad_norm": 3.550985950678446, + "learning_rate": 9.7856790237492e-06, + "loss": 0.7395, + "step": 1487 + }, + { + "epoch": 0.12, + "grad_norm": 4.255351552300113, + "learning_rate": 9.785297879512808e-06, + "loss": 0.6068, + "step": 1488 + }, + { + "epoch": 0.12, + "grad_norm": 3.828040458636102, + "learning_rate": 9.784916404103673e-06, + "loss": 0.7176, + "step": 1489 + }, + { + "epoch": 0.12, + "grad_norm": 4.162791265980011, + "learning_rate": 9.784534597548194e-06, + "loss": 0.6831, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 3.5414114522811593, + "learning_rate": 9.784152459872794e-06, + "loss": 0.8602, + "step": 1491 + }, + { + "epoch": 0.12, + "grad_norm": 3.6330627544278062, + "learning_rate": 9.78376999110392e-06, + "loss": 0.7169, + "step": 1492 + }, + { + "epoch": 0.12, + "grad_norm": 3.4462182761143665, + "learning_rate": 9.783387191268044e-06, + "loss": 0.6611, + "step": 1493 + }, + { + "epoch": 0.12, + "grad_norm": 4.12205422268827, + "learning_rate": 9.783004060391652e-06, + "loss": 0.8308, + "step": 1494 + }, + { + "epoch": 0.12, + "grad_norm": 6.544013277773115, + "learning_rate": 9.782620598501264e-06, + "loss": 0.9487, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 5.2842059634444905, + "learning_rate": 9.782236805623418e-06, + "loss": 0.8053, + "step": 1496 + }, + { + "epoch": 0.12, + "grad_norm": 3.012914193091309, + "learning_rate": 9.781852681784674e-06, + "loss": 0.7071, + "step": 1497 + }, + { + "epoch": 0.12, + "grad_norm": 2.9169832886546394, + "learning_rate": 9.781468227011615e-06, + "loss": 0.8165, + "step": 1498 + }, + { + "epoch": 0.12, + "grad_norm": 19.654348234412605, + "learning_rate": 9.781083441330846e-06, + "loss": 0.8418, + "step": 1499 + }, + { + "epoch": 0.12, + "grad_norm": 3.0636544844751388, + "learning_rate": 9.780698324769e-06, + "loss": 0.7762, + "step": 1500 + }, + { + "epoch": 0.12, + "grad_norm": 2.803171298191634, + "learning_rate": 9.780312877352728e-06, + "loss": 0.9643, + "step": 1501 + }, + { + "epoch": 0.12, + "grad_norm": 4.879320141032356, + "learning_rate": 9.779927099108708e-06, + "loss": 0.6837, + "step": 1502 + }, + { + "epoch": 0.12, + "grad_norm": 3.498759066490165, + "learning_rate": 9.779540990063632e-06, + "loss": 0.6386, + "step": 1503 + }, + { + "epoch": 0.12, + "grad_norm": 3.484009818687112, + "learning_rate": 9.779154550244228e-06, + "loss": 0.8281, + "step": 1504 + }, + { + "epoch": 0.12, + "grad_norm": 4.125103255513952, + "learning_rate": 9.778767779677235e-06, + "loss": 0.5755, + "step": 1505 + }, + { + "epoch": 0.12, + "grad_norm": 13.7832085656859, + "learning_rate": 9.778380678389422e-06, + "loss": 0.7407, + "step": 1506 + }, + { + "epoch": 0.12, + "grad_norm": 7.906356438572428, + "learning_rate": 9.77799324640758e-06, + "loss": 0.8034, + "step": 1507 + }, + { + "epoch": 0.12, + "grad_norm": 4.80487227646703, + "learning_rate": 9.77760548375852e-06, + "loss": 0.7786, + "step": 1508 + }, + { + "epoch": 0.12, + "grad_norm": 6.504099533589149, + "learning_rate": 9.77721739046908e-06, + "loss": 0.8263, + "step": 1509 + }, + { + "epoch": 0.12, + "grad_norm": 7.899753141912911, + "learning_rate": 9.776828966566114e-06, + "loss": 0.6274, + "step": 1510 + }, + { + "epoch": 0.12, + "grad_norm": 3.1207034813087717, + "learning_rate": 9.776440212076507e-06, + "loss": 0.7159, + "step": 1511 + }, + { + "epoch": 0.12, + "grad_norm": 7.212287823348495, + "learning_rate": 9.776051127027165e-06, + "loss": 0.6522, + "step": 1512 + }, + { + "epoch": 0.12, + "grad_norm": 3.7861539770247647, + "learning_rate": 9.775661711445009e-06, + "loss": 0.6826, + "step": 1513 + }, + { + "epoch": 0.12, + "grad_norm": 3.5804504967248585, + "learning_rate": 9.775271965356994e-06, + "loss": 0.7533, + "step": 1514 + }, + { + "epoch": 0.12, + "grad_norm": 3.7781668816987017, + "learning_rate": 9.774881888790091e-06, + "loss": 0.7047, + "step": 1515 + }, + { + "epoch": 0.12, + "grad_norm": 3.255518383570825, + "learning_rate": 9.774491481771296e-06, + "loss": 0.7052, + "step": 1516 + }, + { + "epoch": 0.12, + "grad_norm": 3.5353756843084536, + "learning_rate": 9.774100744327628e-06, + "loss": 0.8397, + "step": 1517 + }, + { + "epoch": 0.12, + "grad_norm": 3.3009173636756506, + "learning_rate": 9.77370967648613e-06, + "loss": 0.7117, + "step": 1518 + }, + { + "epoch": 0.12, + "grad_norm": 6.357232559699015, + "learning_rate": 9.773318278273862e-06, + "loss": 0.615, + "step": 1519 + }, + { + "epoch": 0.12, + "grad_norm": 3.811067317360187, + "learning_rate": 9.772926549717915e-06, + "loss": 0.8086, + "step": 1520 + }, + { + "epoch": 0.12, + "grad_norm": 2.9257194337598103, + "learning_rate": 9.772534490845398e-06, + "loss": 0.8217, + "step": 1521 + }, + { + "epoch": 0.12, + "grad_norm": 4.239608551099964, + "learning_rate": 9.772142101683443e-06, + "loss": 0.6287, + "step": 1522 + }, + { + "epoch": 0.12, + "grad_norm": 4.414276247472141, + "learning_rate": 9.771749382259209e-06, + "loss": 0.6379, + "step": 1523 + }, + { + "epoch": 0.12, + "grad_norm": 4.410868529754491, + "learning_rate": 9.771356332599868e-06, + "loss": 0.8062, + "step": 1524 + }, + { + "epoch": 0.12, + "grad_norm": 5.476402461146055, + "learning_rate": 9.77096295273263e-06, + "loss": 0.7956, + "step": 1525 + }, + { + "epoch": 0.12, + "grad_norm": 3.1821749729809983, + "learning_rate": 9.770569242684714e-06, + "loss": 0.7438, + "step": 1526 + }, + { + "epoch": 0.12, + "grad_norm": 2.626853742899924, + "learning_rate": 9.770175202483367e-06, + "loss": 0.6873, + "step": 1527 + }, + { + "epoch": 0.12, + "grad_norm": 2.9209283050335193, + "learning_rate": 9.769780832155862e-06, + "loss": 0.6846, + "step": 1528 + }, + { + "epoch": 0.12, + "grad_norm": 6.925387010633106, + "learning_rate": 9.76938613172949e-06, + "loss": 0.7209, + "step": 1529 + }, + { + "epoch": 0.12, + "grad_norm": 4.953243123104708, + "learning_rate": 9.768991101231567e-06, + "loss": 0.7806, + "step": 1530 + }, + { + "epoch": 0.12, + "grad_norm": 3.7479085895488358, + "learning_rate": 9.768595740689432e-06, + "loss": 0.7354, + "step": 1531 + }, + { + "epoch": 0.12, + "grad_norm": 4.029064129851027, + "learning_rate": 9.768200050130446e-06, + "loss": 0.5923, + "step": 1532 + }, + { + "epoch": 0.12, + "grad_norm": 3.345727472277948, + "learning_rate": 9.767804029581993e-06, + "loss": 0.6241, + "step": 1533 + }, + { + "epoch": 0.12, + "grad_norm": 8.19472531105154, + "learning_rate": 9.767407679071482e-06, + "loss": 0.7253, + "step": 1534 + }, + { + "epoch": 0.12, + "grad_norm": 3.4116277608646617, + "learning_rate": 9.767010998626341e-06, + "loss": 0.6667, + "step": 1535 + }, + { + "epoch": 0.12, + "grad_norm": 6.4904954053061035, + "learning_rate": 9.766613988274024e-06, + "loss": 0.6583, + "step": 1536 + }, + { + "epoch": 0.12, + "grad_norm": 3.301838046534502, + "learning_rate": 9.766216648042004e-06, + "loss": 0.7839, + "step": 1537 + }, + { + "epoch": 0.12, + "grad_norm": 4.210564392011646, + "learning_rate": 9.765818977957781e-06, + "loss": 0.7943, + "step": 1538 + }, + { + "epoch": 0.12, + "grad_norm": 3.478621495963454, + "learning_rate": 9.765420978048879e-06, + "loss": 0.9659, + "step": 1539 + }, + { + "epoch": 0.13, + "grad_norm": 3.4065037649780527, + "learning_rate": 9.765022648342839e-06, + "loss": 0.7844, + "step": 1540 + }, + { + "epoch": 0.13, + "grad_norm": 3.357295104102806, + "learning_rate": 9.764623988867228e-06, + "loss": 0.6693, + "step": 1541 + }, + { + "epoch": 0.13, + "grad_norm": 3.6415067741208635, + "learning_rate": 9.764224999649636e-06, + "loss": 0.6968, + "step": 1542 + }, + { + "epoch": 0.13, + "grad_norm": 4.769340923934249, + "learning_rate": 9.763825680717679e-06, + "loss": 0.7311, + "step": 1543 + }, + { + "epoch": 0.13, + "grad_norm": 3.2399213836180696, + "learning_rate": 9.763426032098986e-06, + "loss": 0.8573, + "step": 1544 + }, + { + "epoch": 0.13, + "grad_norm": 24.764452079320318, + "learning_rate": 9.763026053821218e-06, + "loss": 0.6742, + "step": 1545 + }, + { + "epoch": 0.13, + "grad_norm": 3.5012147070447197, + "learning_rate": 9.76262574591206e-06, + "loss": 0.7019, + "step": 1546 + }, + { + "epoch": 0.13, + "grad_norm": 7.7074823896995195, + "learning_rate": 9.76222510839921e-06, + "loss": 0.6455, + "step": 1547 + }, + { + "epoch": 0.13, + "grad_norm": 3.794748635038636, + "learning_rate": 9.761824141310397e-06, + "loss": 0.767, + "step": 1548 + }, + { + "epoch": 0.13, + "grad_norm": 5.048637103372197, + "learning_rate": 9.761422844673372e-06, + "loss": 0.6651, + "step": 1549 + }, + { + "epoch": 0.13, + "grad_norm": 4.295827773749372, + "learning_rate": 9.761021218515904e-06, + "loss": 0.5697, + "step": 1550 + }, + { + "epoch": 0.13, + "grad_norm": 4.503935681519453, + "learning_rate": 9.760619262865792e-06, + "loss": 0.7316, + "step": 1551 + }, + { + "epoch": 0.13, + "grad_norm": 5.0531287369333855, + "learning_rate": 9.76021697775085e-06, + "loss": 0.7947, + "step": 1552 + }, + { + "epoch": 0.13, + "grad_norm": 7.187633323599077, + "learning_rate": 9.759814363198921e-06, + "loss": 0.8353, + "step": 1553 + }, + { + "epoch": 0.13, + "grad_norm": 3.089252124795491, + "learning_rate": 9.759411419237868e-06, + "loss": 0.7553, + "step": 1554 + }, + { + "epoch": 0.13, + "grad_norm": 2.622250229174503, + "learning_rate": 9.759008145895577e-06, + "loss": 0.6976, + "step": 1555 + }, + { + "epoch": 0.13, + "grad_norm": 2.3298340353415616, + "learning_rate": 9.758604543199957e-06, + "loss": 0.607, + "step": 1556 + }, + { + "epoch": 0.13, + "grad_norm": 2.8074810879103573, + "learning_rate": 9.758200611178938e-06, + "loss": 0.8519, + "step": 1557 + }, + { + "epoch": 0.13, + "grad_norm": 2.065575274393397, + "learning_rate": 9.757796349860478e-06, + "loss": 0.8482, + "step": 1558 + }, + { + "epoch": 0.13, + "grad_norm": 4.928625489656893, + "learning_rate": 9.757391759272554e-06, + "loss": 0.7214, + "step": 1559 + }, + { + "epoch": 0.13, + "grad_norm": 3.32932390498336, + "learning_rate": 9.756986839443166e-06, + "loss": 0.6417, + "step": 1560 + }, + { + "epoch": 0.13, + "grad_norm": 4.878711826750918, + "learning_rate": 9.756581590400333e-06, + "loss": 0.748, + "step": 1561 + }, + { + "epoch": 0.13, + "grad_norm": 3.4381105533696634, + "learning_rate": 9.756176012172107e-06, + "loss": 0.7327, + "step": 1562 + }, + { + "epoch": 0.13, + "grad_norm": 3.260757765571373, + "learning_rate": 9.755770104786553e-06, + "loss": 0.6864, + "step": 1563 + }, + { + "epoch": 0.13, + "grad_norm": 3.530872038852194, + "learning_rate": 9.755363868271762e-06, + "loss": 0.7138, + "step": 1564 + }, + { + "epoch": 0.13, + "grad_norm": 3.6159314004526673, + "learning_rate": 9.75495730265585e-06, + "loss": 0.7214, + "step": 1565 + }, + { + "epoch": 0.13, + "grad_norm": 3.248587499761892, + "learning_rate": 9.754550407966952e-06, + "loss": 0.721, + "step": 1566 + }, + { + "epoch": 0.13, + "grad_norm": 4.698375023078237, + "learning_rate": 9.754143184233228e-06, + "loss": 0.8135, + "step": 1567 + }, + { + "epoch": 0.13, + "grad_norm": 3.5734272848569355, + "learning_rate": 9.753735631482864e-06, + "loss": 0.6116, + "step": 1568 + }, + { + "epoch": 0.13, + "grad_norm": 2.93473976656805, + "learning_rate": 9.75332774974406e-06, + "loss": 0.7677, + "step": 1569 + }, + { + "epoch": 0.13, + "grad_norm": 3.681297471107408, + "learning_rate": 9.752919539045045e-06, + "loss": 0.7641, + "step": 1570 + }, + { + "epoch": 0.13, + "grad_norm": 3.397010514729807, + "learning_rate": 9.752510999414074e-06, + "loss": 0.7181, + "step": 1571 + }, + { + "epoch": 0.13, + "grad_norm": 4.271453857530892, + "learning_rate": 9.752102130879416e-06, + "loss": 0.694, + "step": 1572 + }, + { + "epoch": 0.13, + "grad_norm": 6.312327619980903, + "learning_rate": 9.75169293346937e-06, + "loss": 0.7841, + "step": 1573 + }, + { + "epoch": 0.13, + "grad_norm": 4.016671233218695, + "learning_rate": 9.751283407212253e-06, + "loss": 0.7293, + "step": 1574 + }, + { + "epoch": 0.13, + "grad_norm": 3.6028262358301344, + "learning_rate": 9.750873552136407e-06, + "loss": 0.8454, + "step": 1575 + }, + { + "epoch": 0.13, + "grad_norm": 5.9556768900797215, + "learning_rate": 9.750463368270198e-06, + "loss": 0.7693, + "step": 1576 + }, + { + "epoch": 0.13, + "grad_norm": 4.865050985543065, + "learning_rate": 9.750052855642013e-06, + "loss": 0.8567, + "step": 1577 + }, + { + "epoch": 0.13, + "grad_norm": 2.953356898139655, + "learning_rate": 9.749642014280261e-06, + "loss": 0.5878, + "step": 1578 + }, + { + "epoch": 0.13, + "grad_norm": 3.267259520432038, + "learning_rate": 9.749230844213375e-06, + "loss": 0.7799, + "step": 1579 + }, + { + "epoch": 0.13, + "grad_norm": 4.099288373331892, + "learning_rate": 9.748819345469812e-06, + "loss": 0.8095, + "step": 1580 + }, + { + "epoch": 0.13, + "grad_norm": 5.361025667084002, + "learning_rate": 9.748407518078048e-06, + "loss": 0.791, + "step": 1581 + }, + { + "epoch": 0.13, + "grad_norm": 4.136759686537642, + "learning_rate": 9.747995362066587e-06, + "loss": 0.7257, + "step": 1582 + }, + { + "epoch": 0.13, + "grad_norm": 3.749714291511869, + "learning_rate": 9.74758287746395e-06, + "loss": 0.7365, + "step": 1583 + }, + { + "epoch": 0.13, + "grad_norm": 4.1602821158679495, + "learning_rate": 9.747170064298684e-06, + "loss": 0.7499, + "step": 1584 + }, + { + "epoch": 0.13, + "grad_norm": 4.312976124125651, + "learning_rate": 9.74675692259936e-06, + "loss": 0.8296, + "step": 1585 + }, + { + "epoch": 0.13, + "grad_norm": 3.583681748239942, + "learning_rate": 9.746343452394569e-06, + "loss": 0.6602, + "step": 1586 + }, + { + "epoch": 0.13, + "grad_norm": 4.890247791264872, + "learning_rate": 9.745929653712924e-06, + "loss": 0.8061, + "step": 1587 + }, + { + "epoch": 0.13, + "grad_norm": 3.0714448128221137, + "learning_rate": 9.745515526583066e-06, + "loss": 0.7941, + "step": 1588 + }, + { + "epoch": 0.13, + "grad_norm": 4.939323175831899, + "learning_rate": 9.745101071033652e-06, + "loss": 0.7877, + "step": 1589 + }, + { + "epoch": 0.13, + "grad_norm": 3.3550098838732514, + "learning_rate": 9.744686287093368e-06, + "loss": 0.6597, + "step": 1590 + }, + { + "epoch": 0.13, + "grad_norm": 2.472382571921731, + "learning_rate": 9.744271174790915e-06, + "loss": 0.8399, + "step": 1591 + }, + { + "epoch": 0.13, + "grad_norm": 3.1502334028698082, + "learning_rate": 9.743855734155028e-06, + "loss": 0.6903, + "step": 1592 + }, + { + "epoch": 0.13, + "grad_norm": 3.623558369747105, + "learning_rate": 9.743439965214452e-06, + "loss": 0.7642, + "step": 1593 + }, + { + "epoch": 0.13, + "grad_norm": 5.995890866801721, + "learning_rate": 9.743023867997964e-06, + "loss": 0.7963, + "step": 1594 + }, + { + "epoch": 0.13, + "grad_norm": 6.017176668193397, + "learning_rate": 9.74260744253436e-06, + "loss": 0.6918, + "step": 1595 + }, + { + "epoch": 0.13, + "grad_norm": 3.0397103388014988, + "learning_rate": 9.742190688852457e-06, + "loss": 0.617, + "step": 1596 + }, + { + "epoch": 0.13, + "grad_norm": 2.8398968635548134, + "learning_rate": 9.741773606981101e-06, + "loss": 0.6697, + "step": 1597 + }, + { + "epoch": 0.13, + "grad_norm": 3.8420418288266456, + "learning_rate": 9.741356196949154e-06, + "loss": 0.6895, + "step": 1598 + }, + { + "epoch": 0.13, + "grad_norm": 16.80330079068573, + "learning_rate": 9.740938458785505e-06, + "loss": 0.777, + "step": 1599 + }, + { + "epoch": 0.13, + "grad_norm": 3.4688192895732644, + "learning_rate": 9.740520392519063e-06, + "loss": 0.8204, + "step": 1600 + }, + { + "epoch": 0.13, + "grad_norm": 4.316045543354153, + "learning_rate": 9.74010199817876e-06, + "loss": 0.7809, + "step": 1601 + }, + { + "epoch": 0.13, + "grad_norm": 4.262150862751194, + "learning_rate": 9.739683275793554e-06, + "loss": 0.7865, + "step": 1602 + }, + { + "epoch": 0.13, + "grad_norm": 7.976718925923404, + "learning_rate": 9.739264225392421e-06, + "loss": 0.7256, + "step": 1603 + }, + { + "epoch": 0.13, + "grad_norm": 2.6253850331117072, + "learning_rate": 9.738844847004363e-06, + "loss": 0.7406, + "step": 1604 + }, + { + "epoch": 0.13, + "grad_norm": 4.931278221817675, + "learning_rate": 9.738425140658403e-06, + "loss": 0.8762, + "step": 1605 + }, + { + "epoch": 0.13, + "grad_norm": 3.4771144454922815, + "learning_rate": 9.738005106383588e-06, + "loss": 0.7741, + "step": 1606 + }, + { + "epoch": 0.13, + "grad_norm": 3.3991810278507995, + "learning_rate": 9.737584744208986e-06, + "loss": 0.7351, + "step": 1607 + }, + { + "epoch": 0.13, + "grad_norm": 7.215732942096885, + "learning_rate": 9.73716405416369e-06, + "loss": 0.6021, + "step": 1608 + }, + { + "epoch": 0.13, + "grad_norm": 6.971042511230237, + "learning_rate": 9.736743036276814e-06, + "loss": 0.7296, + "step": 1609 + }, + { + "epoch": 0.13, + "grad_norm": 4.516028298366066, + "learning_rate": 9.736321690577494e-06, + "loss": 0.7452, + "step": 1610 + }, + { + "epoch": 0.13, + "grad_norm": 3.4586796419005155, + "learning_rate": 9.735900017094893e-06, + "loss": 0.6768, + "step": 1611 + }, + { + "epoch": 0.13, + "grad_norm": 6.562280809624755, + "learning_rate": 9.735478015858188e-06, + "loss": 0.6827, + "step": 1612 + }, + { + "epoch": 0.13, + "grad_norm": 5.152785425889034, + "learning_rate": 9.73505568689659e-06, + "loss": 0.7027, + "step": 1613 + }, + { + "epoch": 0.13, + "grad_norm": 3.7540001728442887, + "learning_rate": 9.734633030239322e-06, + "loss": 0.825, + "step": 1614 + }, + { + "epoch": 0.13, + "grad_norm": 5.7794735133142785, + "learning_rate": 9.734210045915638e-06, + "loss": 0.7935, + "step": 1615 + }, + { + "epoch": 0.13, + "grad_norm": 22.75812624211879, + "learning_rate": 9.73378673395481e-06, + "loss": 0.7182, + "step": 1616 + }, + { + "epoch": 0.13, + "grad_norm": 3.81560985174846, + "learning_rate": 9.733363094386133e-06, + "loss": 0.783, + "step": 1617 + }, + { + "epoch": 0.13, + "grad_norm": 4.258476658082455, + "learning_rate": 9.732939127238926e-06, + "loss": 0.6002, + "step": 1618 + }, + { + "epoch": 0.13, + "grad_norm": 3.8999253106124563, + "learning_rate": 9.73251483254253e-06, + "loss": 0.6897, + "step": 1619 + }, + { + "epoch": 0.13, + "grad_norm": 4.912416800591878, + "learning_rate": 9.732090210326308e-06, + "loss": 0.8058, + "step": 1620 + }, + { + "epoch": 0.13, + "grad_norm": 8.827230766672944, + "learning_rate": 9.731665260619649e-06, + "loss": 0.6443, + "step": 1621 + }, + { + "epoch": 0.13, + "grad_norm": 4.83679943374413, + "learning_rate": 9.731239983451962e-06, + "loss": 0.8323, + "step": 1622 + }, + { + "epoch": 0.13, + "grad_norm": 3.473150037431046, + "learning_rate": 9.730814378852677e-06, + "loss": 0.5805, + "step": 1623 + }, + { + "epoch": 0.13, + "grad_norm": 6.158690394884845, + "learning_rate": 9.730388446851248e-06, + "loss": 0.7607, + "step": 1624 + }, + { + "epoch": 0.13, + "grad_norm": 7.0326071661230625, + "learning_rate": 9.729962187477156e-06, + "loss": 0.7769, + "step": 1625 + }, + { + "epoch": 0.13, + "grad_norm": 3.5495280896586814, + "learning_rate": 9.729535600759898e-06, + "loss": 0.61, + "step": 1626 + }, + { + "epoch": 0.13, + "grad_norm": 3.838332207588205, + "learning_rate": 9.729108686728996e-06, + "loss": 0.7304, + "step": 1627 + }, + { + "epoch": 0.13, + "grad_norm": 4.160931677255557, + "learning_rate": 9.728681445413995e-06, + "loss": 0.635, + "step": 1628 + }, + { + "epoch": 0.13, + "grad_norm": 6.7148478403544996, + "learning_rate": 9.728253876844464e-06, + "loss": 0.7475, + "step": 1629 + }, + { + "epoch": 0.13, + "grad_norm": 2.9286470509589106, + "learning_rate": 9.727825981049994e-06, + "loss": 0.6701, + "step": 1630 + }, + { + "epoch": 0.13, + "grad_norm": 3.476303799251314, + "learning_rate": 9.727397758060198e-06, + "loss": 0.6478, + "step": 1631 + }, + { + "epoch": 0.13, + "grad_norm": 3.4558827574908944, + "learning_rate": 9.72696920790471e-06, + "loss": 0.6674, + "step": 1632 + }, + { + "epoch": 0.13, + "grad_norm": 3.5000527582279424, + "learning_rate": 9.72654033061319e-06, + "loss": 0.8207, + "step": 1633 + }, + { + "epoch": 0.13, + "grad_norm": 3.806506387468297, + "learning_rate": 9.726111126215316e-06, + "loss": 0.6835, + "step": 1634 + }, + { + "epoch": 0.13, + "grad_norm": 4.105878418541739, + "learning_rate": 9.725681594740796e-06, + "loss": 0.8573, + "step": 1635 + }, + { + "epoch": 0.13, + "grad_norm": 3.694683178377096, + "learning_rate": 9.725251736219355e-06, + "loss": 0.7239, + "step": 1636 + }, + { + "epoch": 0.13, + "grad_norm": 3.509286681837293, + "learning_rate": 9.72482155068074e-06, + "loss": 0.8, + "step": 1637 + }, + { + "epoch": 0.13, + "grad_norm": 5.755639336900618, + "learning_rate": 9.724391038154723e-06, + "loss": 0.8501, + "step": 1638 + }, + { + "epoch": 0.13, + "grad_norm": 3.6837007945942197, + "learning_rate": 9.723960198671101e-06, + "loss": 0.7626, + "step": 1639 + }, + { + "epoch": 0.13, + "grad_norm": 5.164929000928166, + "learning_rate": 9.723529032259689e-06, + "loss": 0.7144, + "step": 1640 + }, + { + "epoch": 0.13, + "grad_norm": 3.3622909819007583, + "learning_rate": 9.723097538950324e-06, + "loss": 0.6487, + "step": 1641 + }, + { + "epoch": 0.13, + "grad_norm": 3.775566972616635, + "learning_rate": 9.72266571877287e-06, + "loss": 0.7558, + "step": 1642 + }, + { + "epoch": 0.13, + "grad_norm": 4.464648193463835, + "learning_rate": 9.722233571757214e-06, + "loss": 0.8902, + "step": 1643 + }, + { + "epoch": 0.13, + "grad_norm": 4.249818869896293, + "learning_rate": 9.72180109793326e-06, + "loss": 0.7797, + "step": 1644 + }, + { + "epoch": 0.13, + "grad_norm": 3.649693492349038, + "learning_rate": 9.72136829733094e-06, + "loss": 0.7602, + "step": 1645 + }, + { + "epoch": 0.13, + "grad_norm": 3.128506318982704, + "learning_rate": 9.720935169980205e-06, + "loss": 0.6496, + "step": 1646 + }, + { + "epoch": 0.13, + "grad_norm": 4.364456287479509, + "learning_rate": 9.72050171591103e-06, + "loss": 0.7596, + "step": 1647 + }, + { + "epoch": 0.13, + "grad_norm": 4.707521581585037, + "learning_rate": 9.720067935153415e-06, + "loss": 0.7027, + "step": 1648 + }, + { + "epoch": 0.13, + "grad_norm": 3.7451575476002277, + "learning_rate": 9.719633827737379e-06, + "loss": 0.6735, + "step": 1649 + }, + { + "epoch": 0.13, + "grad_norm": 3.4153221993125813, + "learning_rate": 9.719199393692963e-06, + "loss": 0.8063, + "step": 1650 + }, + { + "epoch": 0.13, + "grad_norm": 9.499821851026198, + "learning_rate": 9.718764633050235e-06, + "loss": 0.747, + "step": 1651 + }, + { + "epoch": 0.13, + "grad_norm": 3.684862169262496, + "learning_rate": 9.718329545839282e-06, + "loss": 0.7243, + "step": 1652 + }, + { + "epoch": 0.13, + "grad_norm": 3.9077420046052005, + "learning_rate": 9.717894132090218e-06, + "loss": 0.7649, + "step": 1653 + }, + { + "epoch": 0.13, + "grad_norm": 4.438028296673117, + "learning_rate": 9.71745839183317e-06, + "loss": 0.8714, + "step": 1654 + }, + { + "epoch": 0.13, + "grad_norm": 2.6841104629812276, + "learning_rate": 9.717022325098301e-06, + "loss": 0.6432, + "step": 1655 + }, + { + "epoch": 0.13, + "grad_norm": 5.059769876649569, + "learning_rate": 9.716585931915786e-06, + "loss": 0.521, + "step": 1656 + }, + { + "epoch": 0.13, + "grad_norm": 9.756685481067823, + "learning_rate": 9.716149212315824e-06, + "loss": 0.7772, + "step": 1657 + }, + { + "epoch": 0.13, + "grad_norm": 3.1213390266795322, + "learning_rate": 9.715712166328643e-06, + "loss": 0.6246, + "step": 1658 + }, + { + "epoch": 0.13, + "grad_norm": 3.1112938180535887, + "learning_rate": 9.715274793984489e-06, + "loss": 0.8226, + "step": 1659 + }, + { + "epoch": 0.13, + "grad_norm": 3.9629629456608013, + "learning_rate": 9.714837095313626e-06, + "loss": 0.8292, + "step": 1660 + }, + { + "epoch": 0.13, + "grad_norm": 5.082119336013578, + "learning_rate": 9.71439907034635e-06, + "loss": 0.651, + "step": 1661 + }, + { + "epoch": 0.13, + "grad_norm": 3.559760814378013, + "learning_rate": 9.713960719112976e-06, + "loss": 0.8007, + "step": 1662 + }, + { + "epoch": 0.14, + "grad_norm": 3.456020348047644, + "learning_rate": 9.713522041643837e-06, + "loss": 0.6624, + "step": 1663 + }, + { + "epoch": 0.14, + "grad_norm": 6.340988530016686, + "learning_rate": 9.713083037969292e-06, + "loss": 0.7331, + "step": 1664 + }, + { + "epoch": 0.14, + "grad_norm": 3.1034382008641166, + "learning_rate": 9.712643708119729e-06, + "loss": 0.7857, + "step": 1665 + }, + { + "epoch": 0.14, + "grad_norm": 4.8534432361182, + "learning_rate": 9.712204052125546e-06, + "loss": 0.6796, + "step": 1666 + }, + { + "epoch": 0.14, + "grad_norm": 4.362221883946943, + "learning_rate": 9.711764070017172e-06, + "loss": 0.7872, + "step": 1667 + }, + { + "epoch": 0.14, + "grad_norm": 3.5273449809380115, + "learning_rate": 9.711323761825057e-06, + "loss": 0.7047, + "step": 1668 + }, + { + "epoch": 0.14, + "grad_norm": 6.218440607655703, + "learning_rate": 9.710883127579673e-06, + "loss": 0.7077, + "step": 1669 + }, + { + "epoch": 0.14, + "grad_norm": 3.809729051839772, + "learning_rate": 9.710442167311514e-06, + "loss": 0.8732, + "step": 1670 + }, + { + "epoch": 0.14, + "grad_norm": 5.639587441069127, + "learning_rate": 9.710000881051097e-06, + "loss": 0.5721, + "step": 1671 + }, + { + "epoch": 0.14, + "grad_norm": 4.916564432609439, + "learning_rate": 9.709559268828963e-06, + "loss": 0.5893, + "step": 1672 + }, + { + "epoch": 0.14, + "grad_norm": 3.517988348365595, + "learning_rate": 9.709117330675676e-06, + "loss": 0.7944, + "step": 1673 + }, + { + "epoch": 0.14, + "grad_norm": 4.081701792650104, + "learning_rate": 9.708675066621814e-06, + "loss": 0.9646, + "step": 1674 + }, + { + "epoch": 0.14, + "grad_norm": 3.77230917704101, + "learning_rate": 9.708232476697992e-06, + "loss": 0.5697, + "step": 1675 + }, + { + "epoch": 0.14, + "grad_norm": 3.1655052789769083, + "learning_rate": 9.707789560934837e-06, + "loss": 0.6759, + "step": 1676 + }, + { + "epoch": 0.14, + "grad_norm": 3.469268732521162, + "learning_rate": 9.707346319363002e-06, + "loss": 0.775, + "step": 1677 + }, + { + "epoch": 0.14, + "grad_norm": 3.3039809631673167, + "learning_rate": 9.706902752013161e-06, + "loss": 0.9036, + "step": 1678 + }, + { + "epoch": 0.14, + "grad_norm": 4.301390790437207, + "learning_rate": 9.706458858916013e-06, + "loss": 0.7811, + "step": 1679 + }, + { + "epoch": 0.14, + "grad_norm": 3.684122889489982, + "learning_rate": 9.706014640102276e-06, + "loss": 0.6429, + "step": 1680 + }, + { + "epoch": 0.14, + "grad_norm": 4.579633319155013, + "learning_rate": 9.705570095602696e-06, + "loss": 0.6272, + "step": 1681 + }, + { + "epoch": 0.14, + "grad_norm": 5.041456818256814, + "learning_rate": 9.705125225448036e-06, + "loss": 0.8406, + "step": 1682 + }, + { + "epoch": 0.14, + "grad_norm": 5.3077083679831825, + "learning_rate": 9.704680029669085e-06, + "loss": 0.7385, + "step": 1683 + }, + { + "epoch": 0.14, + "grad_norm": 6.416998630785171, + "learning_rate": 9.704234508296653e-06, + "loss": 0.593, + "step": 1684 + }, + { + "epoch": 0.14, + "grad_norm": 3.884262700895454, + "learning_rate": 9.703788661361573e-06, + "loss": 0.7537, + "step": 1685 + }, + { + "epoch": 0.14, + "grad_norm": 4.7039844730202836, + "learning_rate": 9.703342488894699e-06, + "loss": 0.7329, + "step": 1686 + }, + { + "epoch": 0.14, + "grad_norm": 7.075261630793293, + "learning_rate": 9.70289599092691e-06, + "loss": 0.8081, + "step": 1687 + }, + { + "epoch": 0.14, + "grad_norm": 3.236227572092828, + "learning_rate": 9.702449167489108e-06, + "loss": 0.653, + "step": 1688 + }, + { + "epoch": 0.14, + "grad_norm": 5.77749139079335, + "learning_rate": 9.702002018612212e-06, + "loss": 0.7132, + "step": 1689 + }, + { + "epoch": 0.14, + "grad_norm": 3.1191345565918405, + "learning_rate": 9.701554544327171e-06, + "loss": 0.7941, + "step": 1690 + }, + { + "epoch": 0.14, + "grad_norm": 3.1856564607312383, + "learning_rate": 9.701106744664954e-06, + "loss": 0.7089, + "step": 1691 + }, + { + "epoch": 0.14, + "grad_norm": 3.7536402174148438, + "learning_rate": 9.70065861965655e-06, + "loss": 0.6437, + "step": 1692 + }, + { + "epoch": 0.14, + "grad_norm": 6.697915829945942, + "learning_rate": 9.700210169332968e-06, + "loss": 0.8034, + "step": 1693 + }, + { + "epoch": 0.14, + "grad_norm": 3.897304716912581, + "learning_rate": 9.69976139372525e-06, + "loss": 0.8438, + "step": 1694 + }, + { + "epoch": 0.14, + "grad_norm": 4.858907689189309, + "learning_rate": 9.699312292864452e-06, + "loss": 0.6158, + "step": 1695 + }, + { + "epoch": 0.14, + "grad_norm": 3.7408560410532457, + "learning_rate": 9.698862866781653e-06, + "loss": 0.675, + "step": 1696 + }, + { + "epoch": 0.14, + "grad_norm": 4.066613783840214, + "learning_rate": 9.698413115507956e-06, + "loss": 0.5913, + "step": 1697 + }, + { + "epoch": 0.14, + "grad_norm": 2.943715922588739, + "learning_rate": 9.69796303907449e-06, + "loss": 0.7509, + "step": 1698 + }, + { + "epoch": 0.14, + "grad_norm": 3.1752119270299954, + "learning_rate": 9.697512637512398e-06, + "loss": 0.897, + "step": 1699 + }, + { + "epoch": 0.14, + "grad_norm": 5.332048346244037, + "learning_rate": 9.697061910852857e-06, + "loss": 0.7403, + "step": 1700 + }, + { + "epoch": 0.14, + "grad_norm": 3.000810669673395, + "learning_rate": 9.696610859127053e-06, + "loss": 0.7969, + "step": 1701 + }, + { + "epoch": 0.14, + "grad_norm": 4.5628617251272665, + "learning_rate": 9.696159482366207e-06, + "loss": 0.6227, + "step": 1702 + }, + { + "epoch": 0.14, + "grad_norm": 5.810673287403003, + "learning_rate": 9.695707780601556e-06, + "loss": 0.7164, + "step": 1703 + }, + { + "epoch": 0.14, + "grad_norm": 3.933997524287036, + "learning_rate": 9.69525575386436e-06, + "loss": 0.683, + "step": 1704 + }, + { + "epoch": 0.14, + "grad_norm": 3.808249114411638, + "learning_rate": 9.694803402185901e-06, + "loss": 0.651, + "step": 1705 + }, + { + "epoch": 0.14, + "grad_norm": 5.2463119126658, + "learning_rate": 9.694350725597487e-06, + "loss": 0.9314, + "step": 1706 + }, + { + "epoch": 0.14, + "grad_norm": 3.3827877952299095, + "learning_rate": 9.693897724130442e-06, + "loss": 0.6986, + "step": 1707 + }, + { + "epoch": 0.14, + "grad_norm": 5.257882224395762, + "learning_rate": 9.693444397816123e-06, + "loss": 0.6333, + "step": 1708 + }, + { + "epoch": 0.14, + "grad_norm": 5.669050899034887, + "learning_rate": 9.692990746685897e-06, + "loss": 0.8155, + "step": 1709 + }, + { + "epoch": 0.14, + "grad_norm": 8.978030928106222, + "learning_rate": 9.692536770771162e-06, + "loss": 0.9794, + "step": 1710 + }, + { + "epoch": 0.14, + "grad_norm": 5.690780774152081, + "learning_rate": 9.692082470103337e-06, + "loss": 0.6688, + "step": 1711 + }, + { + "epoch": 0.14, + "grad_norm": 8.713956379906906, + "learning_rate": 9.69162784471386e-06, + "loss": 0.6405, + "step": 1712 + }, + { + "epoch": 0.14, + "grad_norm": 4.661951758377616, + "learning_rate": 9.691172894634196e-06, + "loss": 0.7027, + "step": 1713 + }, + { + "epoch": 0.14, + "grad_norm": 5.149150609877356, + "learning_rate": 9.690717619895828e-06, + "loss": 0.7267, + "step": 1714 + }, + { + "epoch": 0.14, + "grad_norm": 4.82640177701345, + "learning_rate": 9.690262020530266e-06, + "loss": 0.7559, + "step": 1715 + }, + { + "epoch": 0.14, + "grad_norm": 4.24779206602431, + "learning_rate": 9.689806096569042e-06, + "loss": 0.8767, + "step": 1716 + }, + { + "epoch": 0.14, + "grad_norm": 3.7460626768945606, + "learning_rate": 9.689349848043704e-06, + "loss": 0.7105, + "step": 1717 + }, + { + "epoch": 0.14, + "grad_norm": 2.6161676148212667, + "learning_rate": 9.688893274985832e-06, + "loss": 0.7263, + "step": 1718 + }, + { + "epoch": 0.14, + "grad_norm": 5.9366964310925425, + "learning_rate": 9.68843637742702e-06, + "loss": 0.6256, + "step": 1719 + }, + { + "epoch": 0.14, + "grad_norm": 6.232294317812228, + "learning_rate": 9.68797915539889e-06, + "loss": 0.7877, + "step": 1720 + }, + { + "epoch": 0.14, + "grad_norm": 4.269863659089573, + "learning_rate": 9.687521608933086e-06, + "loss": 0.7911, + "step": 1721 + }, + { + "epoch": 0.14, + "grad_norm": 4.586358459197461, + "learning_rate": 9.68706373806127e-06, + "loss": 0.723, + "step": 1722 + }, + { + "epoch": 0.14, + "grad_norm": 5.572545518875326, + "learning_rate": 9.686605542815132e-06, + "loss": 0.854, + "step": 1723 + }, + { + "epoch": 0.14, + "grad_norm": 7.808445752083356, + "learning_rate": 9.686147023226381e-06, + "loss": 0.8211, + "step": 1724 + }, + { + "epoch": 0.14, + "grad_norm": 4.572732887179025, + "learning_rate": 9.68568817932675e-06, + "loss": 0.6699, + "step": 1725 + }, + { + "epoch": 0.14, + "grad_norm": 4.528070016874158, + "learning_rate": 9.685229011147991e-06, + "loss": 0.7123, + "step": 1726 + }, + { + "epoch": 0.14, + "grad_norm": 4.733358412030933, + "learning_rate": 9.684769518721887e-06, + "loss": 0.7278, + "step": 1727 + }, + { + "epoch": 0.14, + "grad_norm": 25.415134789600803, + "learning_rate": 9.684309702080234e-06, + "loss": 0.8275, + "step": 1728 + }, + { + "epoch": 0.14, + "grad_norm": 5.826567339095941, + "learning_rate": 9.683849561254854e-06, + "loss": 0.7114, + "step": 1729 + }, + { + "epoch": 0.14, + "grad_norm": 4.443699228473538, + "learning_rate": 9.683389096277591e-06, + "loss": 0.6388, + "step": 1730 + }, + { + "epoch": 0.14, + "grad_norm": 4.174932894829508, + "learning_rate": 9.682928307180317e-06, + "loss": 0.6331, + "step": 1731 + }, + { + "epoch": 0.14, + "grad_norm": 3.975173601550326, + "learning_rate": 9.682467193994915e-06, + "loss": 0.6824, + "step": 1732 + }, + { + "epoch": 0.14, + "grad_norm": 3.051988374164017, + "learning_rate": 9.682005756753301e-06, + "loss": 0.6847, + "step": 1733 + }, + { + "epoch": 0.14, + "grad_norm": 3.684899367072688, + "learning_rate": 9.681543995487407e-06, + "loss": 0.6022, + "step": 1734 + }, + { + "epoch": 0.14, + "grad_norm": 3.868213850010096, + "learning_rate": 9.681081910229194e-06, + "loss": 0.7808, + "step": 1735 + }, + { + "epoch": 0.14, + "grad_norm": 3.7400153113373045, + "learning_rate": 9.680619501010636e-06, + "loss": 0.726, + "step": 1736 + }, + { + "epoch": 0.14, + "grad_norm": 2.7790315917630988, + "learning_rate": 9.680156767863736e-06, + "loss": 0.6927, + "step": 1737 + }, + { + "epoch": 0.14, + "grad_norm": 4.170456886364669, + "learning_rate": 9.679693710820521e-06, + "loss": 0.8767, + "step": 1738 + }, + { + "epoch": 0.14, + "grad_norm": 3.669372878457488, + "learning_rate": 9.679230329913034e-06, + "loss": 0.7986, + "step": 1739 + }, + { + "epoch": 0.14, + "grad_norm": 4.311506097873196, + "learning_rate": 9.678766625173348e-06, + "loss": 0.7221, + "step": 1740 + }, + { + "epoch": 0.14, + "grad_norm": 5.9754152293851135, + "learning_rate": 9.678302596633549e-06, + "loss": 0.8068, + "step": 1741 + }, + { + "epoch": 0.14, + "grad_norm": 2.8927795881229543, + "learning_rate": 9.677838244325754e-06, + "loss": 0.6958, + "step": 1742 + }, + { + "epoch": 0.14, + "grad_norm": 5.111544802536774, + "learning_rate": 9.677373568282098e-06, + "loss": 0.7129, + "step": 1743 + }, + { + "epoch": 0.14, + "grad_norm": 4.69299567871582, + "learning_rate": 9.676908568534739e-06, + "loss": 0.6893, + "step": 1744 + }, + { + "epoch": 0.14, + "grad_norm": 4.809374203726032, + "learning_rate": 9.67644324511586e-06, + "loss": 0.7542, + "step": 1745 + }, + { + "epoch": 0.14, + "grad_norm": 3.7598658794809148, + "learning_rate": 9.675977598057664e-06, + "loss": 0.6899, + "step": 1746 + }, + { + "epoch": 0.14, + "grad_norm": 3.179260241309964, + "learning_rate": 9.675511627392375e-06, + "loss": 0.6792, + "step": 1747 + }, + { + "epoch": 0.14, + "grad_norm": 2.740633301983333, + "learning_rate": 9.675045333152242e-06, + "loss": 0.6809, + "step": 1748 + }, + { + "epoch": 0.14, + "grad_norm": 3.582531376185183, + "learning_rate": 9.674578715369536e-06, + "loss": 0.694, + "step": 1749 + }, + { + "epoch": 0.14, + "grad_norm": 3.599352365182395, + "learning_rate": 9.674111774076549e-06, + "loss": 0.6444, + "step": 1750 + }, + { + "epoch": 0.14, + "grad_norm": 3.5431050033351172, + "learning_rate": 9.673644509305596e-06, + "loss": 0.7184, + "step": 1751 + }, + { + "epoch": 0.14, + "grad_norm": 3.6986730858156793, + "learning_rate": 9.673176921089016e-06, + "loss": 0.7416, + "step": 1752 + }, + { + "epoch": 0.14, + "grad_norm": 21.586212402551016, + "learning_rate": 9.672709009459167e-06, + "loss": 0.9409, + "step": 1753 + }, + { + "epoch": 0.14, + "grad_norm": 4.365684397860148, + "learning_rate": 9.672240774448434e-06, + "loss": 0.8122, + "step": 1754 + }, + { + "epoch": 0.14, + "grad_norm": 6.169262957732291, + "learning_rate": 9.671772216089219e-06, + "loss": 0.8823, + "step": 1755 + }, + { + "epoch": 0.14, + "grad_norm": 4.264796285739442, + "learning_rate": 9.671303334413952e-06, + "loss": 0.6417, + "step": 1756 + }, + { + "epoch": 0.14, + "grad_norm": 3.4620593306834664, + "learning_rate": 9.670834129455083e-06, + "loss": 0.6823, + "step": 1757 + }, + { + "epoch": 0.14, + "grad_norm": 3.273104747839789, + "learning_rate": 9.670364601245078e-06, + "loss": 0.7805, + "step": 1758 + }, + { + "epoch": 0.14, + "grad_norm": 3.3824265010799683, + "learning_rate": 9.66989474981644e-06, + "loss": 0.8015, + "step": 1759 + }, + { + "epoch": 0.14, + "grad_norm": 2.929243097614128, + "learning_rate": 9.669424575201679e-06, + "loss": 0.6834, + "step": 1760 + }, + { + "epoch": 0.14, + "grad_norm": 3.6156161624234584, + "learning_rate": 9.668954077433336e-06, + "loss": 0.7689, + "step": 1761 + }, + { + "epoch": 0.14, + "grad_norm": 5.546147555348058, + "learning_rate": 9.668483256543973e-06, + "loss": 0.6405, + "step": 1762 + }, + { + "epoch": 0.14, + "grad_norm": 3.411230876685932, + "learning_rate": 9.668012112566175e-06, + "loss": 0.7831, + "step": 1763 + }, + { + "epoch": 0.14, + "grad_norm": 4.956644317993248, + "learning_rate": 9.667540645532543e-06, + "loss": 0.7594, + "step": 1764 + }, + { + "epoch": 0.14, + "grad_norm": 8.876866042672079, + "learning_rate": 9.667068855475713e-06, + "loss": 0.8745, + "step": 1765 + }, + { + "epoch": 0.14, + "grad_norm": 4.863997392704245, + "learning_rate": 9.66659674242833e-06, + "loss": 0.7023, + "step": 1766 + }, + { + "epoch": 0.14, + "grad_norm": 2.640770332580646, + "learning_rate": 9.666124306423069e-06, + "loss": 0.8052, + "step": 1767 + }, + { + "epoch": 0.14, + "grad_norm": 4.2726813397915056, + "learning_rate": 9.665651547492624e-06, + "loss": 0.765, + "step": 1768 + }, + { + "epoch": 0.14, + "grad_norm": 4.267810710443771, + "learning_rate": 9.665178465669717e-06, + "loss": 0.759, + "step": 1769 + }, + { + "epoch": 0.14, + "grad_norm": 2.7138884087296002, + "learning_rate": 9.664705060987085e-06, + "loss": 0.5243, + "step": 1770 + }, + { + "epoch": 0.14, + "grad_norm": 5.452283881485013, + "learning_rate": 9.664231333477493e-06, + "loss": 0.7354, + "step": 1771 + }, + { + "epoch": 0.14, + "grad_norm": 4.723021780442232, + "learning_rate": 9.663757283173722e-06, + "loss": 0.6879, + "step": 1772 + }, + { + "epoch": 0.14, + "grad_norm": 3.456598824191064, + "learning_rate": 9.663282910108582e-06, + "loss": 0.919, + "step": 1773 + }, + { + "epoch": 0.14, + "grad_norm": 3.7254655470463534, + "learning_rate": 9.662808214314903e-06, + "loss": 0.8854, + "step": 1774 + }, + { + "epoch": 0.14, + "grad_norm": 6.629445656200479, + "learning_rate": 9.662333195825534e-06, + "loss": 0.8656, + "step": 1775 + }, + { + "epoch": 0.14, + "grad_norm": 12.562796230998803, + "learning_rate": 9.661857854673354e-06, + "loss": 0.6733, + "step": 1776 + }, + { + "epoch": 0.14, + "grad_norm": 11.915009040285069, + "learning_rate": 9.661382190891256e-06, + "loss": 0.8731, + "step": 1777 + }, + { + "epoch": 0.14, + "grad_norm": 2.9094443813064395, + "learning_rate": 9.66090620451216e-06, + "loss": 0.7486, + "step": 1778 + }, + { + "epoch": 0.14, + "grad_norm": 3.515241499222879, + "learning_rate": 9.660429895569008e-06, + "loss": 0.7267, + "step": 1779 + }, + { + "epoch": 0.14, + "grad_norm": 3.858544465573257, + "learning_rate": 9.659953264094762e-06, + "loss": 0.8004, + "step": 1780 + }, + { + "epoch": 0.14, + "grad_norm": 4.756422341648063, + "learning_rate": 9.659476310122408e-06, + "loss": 0.7356, + "step": 1781 + }, + { + "epoch": 0.14, + "grad_norm": 3.7892863640391106, + "learning_rate": 9.658999033684954e-06, + "loss": 0.7049, + "step": 1782 + }, + { + "epoch": 0.14, + "grad_norm": 3.104125614579526, + "learning_rate": 9.658521434815434e-06, + "loss": 0.8571, + "step": 1783 + }, + { + "epoch": 0.14, + "grad_norm": 5.3104807382038, + "learning_rate": 9.658043513546898e-06, + "loss": 0.6898, + "step": 1784 + }, + { + "epoch": 0.14, + "grad_norm": 2.912492451282596, + "learning_rate": 9.657565269912419e-06, + "loss": 0.7234, + "step": 1785 + }, + { + "epoch": 0.15, + "grad_norm": 3.8341955253957387, + "learning_rate": 9.657086703945097e-06, + "loss": 0.7386, + "step": 1786 + }, + { + "epoch": 0.15, + "grad_norm": 4.185507625532871, + "learning_rate": 9.656607815678053e-06, + "loss": 0.8705, + "step": 1787 + }, + { + "epoch": 0.15, + "grad_norm": 3.89318246332328, + "learning_rate": 9.656128605144428e-06, + "loss": 0.6939, + "step": 1788 + }, + { + "epoch": 0.15, + "grad_norm": 3.7465657167093154, + "learning_rate": 9.655649072377387e-06, + "loss": 0.7771, + "step": 1789 + }, + { + "epoch": 0.15, + "grad_norm": 3.9439154072796496, + "learning_rate": 9.655169217410114e-06, + "loss": 0.7217, + "step": 1790 + }, + { + "epoch": 0.15, + "grad_norm": 3.1702162148882467, + "learning_rate": 9.65468904027582e-06, + "loss": 0.7735, + "step": 1791 + }, + { + "epoch": 0.15, + "grad_norm": 4.1311088520496, + "learning_rate": 9.654208541007736e-06, + "loss": 0.7909, + "step": 1792 + }, + { + "epoch": 0.15, + "grad_norm": 5.868429103302607, + "learning_rate": 9.653727719639117e-06, + "loss": 0.6055, + "step": 1793 + }, + { + "epoch": 0.15, + "grad_norm": 3.0001341093158387, + "learning_rate": 9.653246576203236e-06, + "loss": 0.7969, + "step": 1794 + }, + { + "epoch": 0.15, + "grad_norm": 3.745962226406492, + "learning_rate": 9.652765110733392e-06, + "loss": 0.6836, + "step": 1795 + }, + { + "epoch": 0.15, + "grad_norm": 4.870664572201797, + "learning_rate": 9.652283323262907e-06, + "loss": 0.8017, + "step": 1796 + }, + { + "epoch": 0.15, + "grad_norm": 10.765902878112202, + "learning_rate": 9.651801213825125e-06, + "loss": 0.7454, + "step": 1797 + }, + { + "epoch": 0.15, + "grad_norm": 5.38238967901382, + "learning_rate": 9.651318782453407e-06, + "loss": 0.6853, + "step": 1798 + }, + { + "epoch": 0.15, + "grad_norm": 7.015738270677254, + "learning_rate": 9.650836029181142e-06, + "loss": 0.6958, + "step": 1799 + }, + { + "epoch": 0.15, + "grad_norm": 4.6035277786668996, + "learning_rate": 9.65035295404174e-06, + "loss": 0.6783, + "step": 1800 + }, + { + "epoch": 0.15, + "grad_norm": 6.62280486904139, + "learning_rate": 9.649869557068632e-06, + "loss": 0.7725, + "step": 1801 + }, + { + "epoch": 0.15, + "grad_norm": 4.724076696970895, + "learning_rate": 9.649385838295274e-06, + "loss": 0.7587, + "step": 1802 + }, + { + "epoch": 0.15, + "grad_norm": 34.01121497096036, + "learning_rate": 9.64890179775514e-06, + "loss": 0.7934, + "step": 1803 + }, + { + "epoch": 0.15, + "grad_norm": 4.039469766190597, + "learning_rate": 9.648417435481728e-06, + "loss": 0.8182, + "step": 1804 + }, + { + "epoch": 0.15, + "grad_norm": 16.172027999273965, + "learning_rate": 9.647932751508561e-06, + "loss": 0.5744, + "step": 1805 + }, + { + "epoch": 0.15, + "grad_norm": 2.6978817473841286, + "learning_rate": 9.647447745869185e-06, + "loss": 0.6485, + "step": 1806 + }, + { + "epoch": 0.15, + "grad_norm": 4.404682338788708, + "learning_rate": 9.64696241859716e-06, + "loss": 0.704, + "step": 1807 + }, + { + "epoch": 0.15, + "grad_norm": 5.531883256365931, + "learning_rate": 9.646476769726076e-06, + "loss": 0.6829, + "step": 1808 + }, + { + "epoch": 0.15, + "grad_norm": 3.0739698498616104, + "learning_rate": 9.645990799289544e-06, + "loss": 0.7043, + "step": 1809 + }, + { + "epoch": 0.15, + "grad_norm": 10.458369186395796, + "learning_rate": 9.645504507321192e-06, + "loss": 0.6906, + "step": 1810 + }, + { + "epoch": 0.15, + "grad_norm": 4.264114854895885, + "learning_rate": 9.645017893854682e-06, + "loss": 0.7697, + "step": 1811 + }, + { + "epoch": 0.15, + "grad_norm": 4.64115015554052, + "learning_rate": 9.644530958923683e-06, + "loss": 0.7407, + "step": 1812 + }, + { + "epoch": 0.15, + "grad_norm": 6.57708205817582, + "learning_rate": 9.644043702561899e-06, + "loss": 0.5949, + "step": 1813 + }, + { + "epoch": 0.15, + "grad_norm": 2.974082388564952, + "learning_rate": 9.643556124803049e-06, + "loss": 0.8784, + "step": 1814 + }, + { + "epoch": 0.15, + "grad_norm": 2.6420921537346542, + "learning_rate": 9.643068225680877e-06, + "loss": 0.8026, + "step": 1815 + }, + { + "epoch": 0.15, + "grad_norm": 3.35434422701114, + "learning_rate": 9.642580005229148e-06, + "loss": 0.7062, + "step": 1816 + }, + { + "epoch": 0.15, + "grad_norm": 10.526516538261736, + "learning_rate": 9.64209146348165e-06, + "loss": 0.8309, + "step": 1817 + }, + { + "epoch": 0.15, + "grad_norm": 3.2630227711576154, + "learning_rate": 9.641602600472195e-06, + "loss": 0.7267, + "step": 1818 + }, + { + "epoch": 0.15, + "grad_norm": 3.158520496304099, + "learning_rate": 9.641113416234615e-06, + "loss": 0.7555, + "step": 1819 + }, + { + "epoch": 0.15, + "grad_norm": 6.2817029368610475, + "learning_rate": 9.640623910802763e-06, + "loss": 0.7808, + "step": 1820 + }, + { + "epoch": 0.15, + "grad_norm": 5.778041260608554, + "learning_rate": 9.640134084210515e-06, + "loss": 0.8656, + "step": 1821 + }, + { + "epoch": 0.15, + "grad_norm": 3.338031539263942, + "learning_rate": 9.639643936491772e-06, + "loss": 0.6372, + "step": 1822 + }, + { + "epoch": 0.15, + "grad_norm": 3.0522752405586506, + "learning_rate": 9.639153467680455e-06, + "loss": 0.8437, + "step": 1823 + }, + { + "epoch": 0.15, + "grad_norm": 3.6172029956296217, + "learning_rate": 9.638662677810509e-06, + "loss": 0.8244, + "step": 1824 + }, + { + "epoch": 0.15, + "grad_norm": 3.234354990918008, + "learning_rate": 9.638171566915897e-06, + "loss": 0.7631, + "step": 1825 + }, + { + "epoch": 0.15, + "grad_norm": 4.248374596154979, + "learning_rate": 9.637680135030609e-06, + "loss": 0.6777, + "step": 1826 + }, + { + "epoch": 0.15, + "grad_norm": 3.889120425005195, + "learning_rate": 9.637188382188654e-06, + "loss": 0.7615, + "step": 1827 + }, + { + "epoch": 0.15, + "grad_norm": 3.7514652989888835, + "learning_rate": 9.636696308424066e-06, + "loss": 0.7635, + "step": 1828 + }, + { + "epoch": 0.15, + "grad_norm": 4.61654240479573, + "learning_rate": 9.636203913770896e-06, + "loss": 0.7345, + "step": 1829 + }, + { + "epoch": 0.15, + "grad_norm": 18.458235311642586, + "learning_rate": 9.635711198263225e-06, + "loss": 0.7597, + "step": 1830 + }, + { + "epoch": 0.15, + "grad_norm": 4.1459296580252545, + "learning_rate": 9.63521816193515e-06, + "loss": 0.8751, + "step": 1831 + }, + { + "epoch": 0.15, + "grad_norm": 23.770557693442274, + "learning_rate": 9.634724804820793e-06, + "loss": 0.7278, + "step": 1832 + }, + { + "epoch": 0.15, + "grad_norm": 3.1179063965618545, + "learning_rate": 9.634231126954296e-06, + "loss": 0.7737, + "step": 1833 + }, + { + "epoch": 0.15, + "grad_norm": 2.80327418176487, + "learning_rate": 9.633737128369824e-06, + "loss": 0.5436, + "step": 1834 + }, + { + "epoch": 0.15, + "grad_norm": 4.212624824775542, + "learning_rate": 9.633242809101568e-06, + "loss": 0.8598, + "step": 1835 + }, + { + "epoch": 0.15, + "grad_norm": 17.212815482701565, + "learning_rate": 9.632748169183737e-06, + "loss": 0.7054, + "step": 1836 + }, + { + "epoch": 0.15, + "grad_norm": 3.7657281770707995, + "learning_rate": 9.632253208650562e-06, + "loss": 0.8222, + "step": 1837 + }, + { + "epoch": 0.15, + "grad_norm": 3.1747568899720426, + "learning_rate": 9.631757927536297e-06, + "loss": 0.7882, + "step": 1838 + }, + { + "epoch": 0.15, + "grad_norm": 3.6537778128673626, + "learning_rate": 9.63126232587522e-06, + "loss": 0.84, + "step": 1839 + }, + { + "epoch": 0.15, + "grad_norm": 3.8424096742359164, + "learning_rate": 9.63076640370163e-06, + "loss": 0.7681, + "step": 1840 + }, + { + "epoch": 0.15, + "grad_norm": 3.5514062459548876, + "learning_rate": 9.630270161049847e-06, + "loss": 0.6818, + "step": 1841 + }, + { + "epoch": 0.15, + "grad_norm": 2.9608597355195188, + "learning_rate": 9.629773597954213e-06, + "loss": 0.7155, + "step": 1842 + }, + { + "epoch": 0.15, + "grad_norm": 9.645003243504389, + "learning_rate": 9.629276714449095e-06, + "loss": 0.6655, + "step": 1843 + }, + { + "epoch": 0.15, + "grad_norm": 3.5498534043129286, + "learning_rate": 9.62877951056888e-06, + "loss": 0.7792, + "step": 1844 + }, + { + "epoch": 0.15, + "grad_norm": 3.229474133748755, + "learning_rate": 9.628281986347978e-06, + "loss": 0.9283, + "step": 1845 + }, + { + "epoch": 0.15, + "grad_norm": 4.264767916182582, + "learning_rate": 9.62778414182082e-06, + "loss": 0.687, + "step": 1846 + }, + { + "epoch": 0.15, + "grad_norm": 2.8152615536340764, + "learning_rate": 9.627285977021861e-06, + "loss": 0.605, + "step": 1847 + }, + { + "epoch": 0.15, + "grad_norm": 85.75608395244225, + "learning_rate": 9.626787491985576e-06, + "loss": 0.7308, + "step": 1848 + }, + { + "epoch": 0.15, + "grad_norm": 4.733506267481033, + "learning_rate": 9.626288686746465e-06, + "loss": 0.8634, + "step": 1849 + }, + { + "epoch": 0.15, + "grad_norm": 9.764897727206725, + "learning_rate": 9.625789561339046e-06, + "loss": 0.7068, + "step": 1850 + }, + { + "epoch": 0.15, + "grad_norm": 3.8616128670841383, + "learning_rate": 9.625290115797864e-06, + "loss": 0.8541, + "step": 1851 + }, + { + "epoch": 0.15, + "grad_norm": 2.961195298602995, + "learning_rate": 9.624790350157482e-06, + "loss": 0.6664, + "step": 1852 + }, + { + "epoch": 0.15, + "grad_norm": 5.553594883581051, + "learning_rate": 9.624290264452488e-06, + "loss": 0.577, + "step": 1853 + }, + { + "epoch": 0.15, + "grad_norm": 6.810953982965657, + "learning_rate": 9.623789858717491e-06, + "loss": 0.7871, + "step": 1854 + }, + { + "epoch": 0.15, + "grad_norm": 3.525060023018861, + "learning_rate": 9.623289132987122e-06, + "loss": 0.6726, + "step": 1855 + }, + { + "epoch": 0.15, + "grad_norm": 4.175663554760576, + "learning_rate": 9.622788087296033e-06, + "loss": 0.7703, + "step": 1856 + }, + { + "epoch": 0.15, + "grad_norm": 3.9650322979692305, + "learning_rate": 9.622286721678903e-06, + "loss": 0.7014, + "step": 1857 + }, + { + "epoch": 0.15, + "grad_norm": 5.314001414751438, + "learning_rate": 9.621785036170425e-06, + "loss": 0.7249, + "step": 1858 + }, + { + "epoch": 0.15, + "grad_norm": 3.6232587886025094, + "learning_rate": 9.621283030805324e-06, + "loss": 0.8366, + "step": 1859 + }, + { + "epoch": 0.15, + "grad_norm": 5.4504591504261155, + "learning_rate": 9.620780705618338e-06, + "loss": 0.6685, + "step": 1860 + }, + { + "epoch": 0.15, + "grad_norm": 6.171991140368373, + "learning_rate": 9.620278060644232e-06, + "loss": 0.7909, + "step": 1861 + }, + { + "epoch": 0.15, + "grad_norm": 4.3592939498546786, + "learning_rate": 9.619775095917793e-06, + "loss": 0.8664, + "step": 1862 + }, + { + "epoch": 0.15, + "grad_norm": 3.38970622071405, + "learning_rate": 9.61927181147383e-06, + "loss": 0.6201, + "step": 1863 + }, + { + "epoch": 0.15, + "grad_norm": 4.95411428406291, + "learning_rate": 9.618768207347171e-06, + "loss": 0.823, + "step": 1864 + }, + { + "epoch": 0.15, + "grad_norm": 4.524357263767203, + "learning_rate": 9.61826428357267e-06, + "loss": 0.6175, + "step": 1865 + }, + { + "epoch": 0.15, + "grad_norm": 5.098024400666344, + "learning_rate": 9.617760040185202e-06, + "loss": 0.6533, + "step": 1866 + }, + { + "epoch": 0.15, + "grad_norm": 3.7909623854257997, + "learning_rate": 9.617255477219662e-06, + "loss": 0.7238, + "step": 1867 + }, + { + "epoch": 0.15, + "grad_norm": 3.838778584529365, + "learning_rate": 9.616750594710972e-06, + "loss": 0.7322, + "step": 1868 + }, + { + "epoch": 0.15, + "grad_norm": 3.784931088387499, + "learning_rate": 9.61624539269407e-06, + "loss": 0.7113, + "step": 1869 + }, + { + "epoch": 0.15, + "grad_norm": 3.5834781467863133, + "learning_rate": 9.615739871203922e-06, + "loss": 0.7007, + "step": 1870 + }, + { + "epoch": 0.15, + "grad_norm": 4.983176662639147, + "learning_rate": 9.615234030275511e-06, + "loss": 0.8822, + "step": 1871 + }, + { + "epoch": 0.15, + "grad_norm": 4.8234030026158985, + "learning_rate": 9.614727869943845e-06, + "loss": 0.6518, + "step": 1872 + }, + { + "epoch": 0.15, + "grad_norm": 4.107915054639396, + "learning_rate": 9.614221390243955e-06, + "loss": 0.7061, + "step": 1873 + }, + { + "epoch": 0.15, + "grad_norm": 3.5722162038368026, + "learning_rate": 9.61371459121089e-06, + "loss": 0.8054, + "step": 1874 + }, + { + "epoch": 0.15, + "grad_norm": 5.180211867570742, + "learning_rate": 9.613207472879725e-06, + "loss": 0.7597, + "step": 1875 + }, + { + "epoch": 0.15, + "grad_norm": 4.895786217891239, + "learning_rate": 9.612700035285557e-06, + "loss": 0.7773, + "step": 1876 + }, + { + "epoch": 0.15, + "grad_norm": 10.543365189270618, + "learning_rate": 9.612192278463502e-06, + "loss": 0.7339, + "step": 1877 + }, + { + "epoch": 0.15, + "grad_norm": 2.9303148430854273, + "learning_rate": 9.611684202448699e-06, + "loss": 0.7264, + "step": 1878 + }, + { + "epoch": 0.15, + "grad_norm": 3.2328542319356472, + "learning_rate": 9.611175807276311e-06, + "loss": 0.8334, + "step": 1879 + }, + { + "epoch": 0.15, + "grad_norm": 3.350563349574019, + "learning_rate": 9.610667092981526e-06, + "loss": 0.6904, + "step": 1880 + }, + { + "epoch": 0.15, + "grad_norm": 2.6365398544093166, + "learning_rate": 9.610158059599546e-06, + "loss": 0.5767, + "step": 1881 + }, + { + "epoch": 0.15, + "grad_norm": 2.7046603502510562, + "learning_rate": 9.6096487071656e-06, + "loss": 0.7838, + "step": 1882 + }, + { + "epoch": 0.15, + "grad_norm": 2.9165358776349994, + "learning_rate": 9.609139035714938e-06, + "loss": 0.812, + "step": 1883 + }, + { + "epoch": 0.15, + "grad_norm": 3.4585004951500746, + "learning_rate": 9.608629045282833e-06, + "loss": 0.8616, + "step": 1884 + }, + { + "epoch": 0.15, + "grad_norm": 3.1190589043565997, + "learning_rate": 9.60811873590458e-06, + "loss": 0.6763, + "step": 1885 + }, + { + "epoch": 0.15, + "grad_norm": 3.7294620548658166, + "learning_rate": 9.607608107615496e-06, + "loss": 0.8731, + "step": 1886 + }, + { + "epoch": 0.15, + "grad_norm": 4.328997582138383, + "learning_rate": 9.60709716045092e-06, + "loss": 0.7513, + "step": 1887 + }, + { + "epoch": 0.15, + "grad_norm": 2.7809222946994168, + "learning_rate": 9.60658589444621e-06, + "loss": 0.8948, + "step": 1888 + }, + { + "epoch": 0.15, + "grad_norm": 3.7609806153607837, + "learning_rate": 9.606074309636751e-06, + "loss": 0.7104, + "step": 1889 + }, + { + "epoch": 0.15, + "grad_norm": 5.442058931950496, + "learning_rate": 9.605562406057948e-06, + "loss": 0.72, + "step": 1890 + }, + { + "epoch": 0.15, + "grad_norm": 4.628884486758903, + "learning_rate": 9.605050183745228e-06, + "loss": 0.6918, + "step": 1891 + }, + { + "epoch": 0.15, + "grad_norm": 15.338926483134891, + "learning_rate": 9.604537642734039e-06, + "loss": 0.685, + "step": 1892 + }, + { + "epoch": 0.15, + "grad_norm": 4.331612460959655, + "learning_rate": 9.604024783059851e-06, + "loss": 0.8155, + "step": 1893 + }, + { + "epoch": 0.15, + "grad_norm": 3.858972245844192, + "learning_rate": 9.60351160475816e-06, + "loss": 0.6348, + "step": 1894 + }, + { + "epoch": 0.15, + "grad_norm": 4.1501786353724395, + "learning_rate": 9.602998107864481e-06, + "loss": 0.7534, + "step": 1895 + }, + { + "epoch": 0.15, + "grad_norm": 6.364780975234508, + "learning_rate": 9.602484292414348e-06, + "loss": 0.8168, + "step": 1896 + }, + { + "epoch": 0.15, + "grad_norm": 3.6894908704816025, + "learning_rate": 9.601970158443324e-06, + "loss": 0.7513, + "step": 1897 + }, + { + "epoch": 0.15, + "grad_norm": 3.4982378535850573, + "learning_rate": 9.601455705986989e-06, + "loss": 0.5723, + "step": 1898 + }, + { + "epoch": 0.15, + "grad_norm": 4.570514638212711, + "learning_rate": 9.600940935080944e-06, + "loss": 0.6902, + "step": 1899 + }, + { + "epoch": 0.15, + "grad_norm": 2.828916395650367, + "learning_rate": 9.600425845760816e-06, + "loss": 0.7753, + "step": 1900 + }, + { + "epoch": 0.15, + "grad_norm": 7.067852792139686, + "learning_rate": 9.599910438062255e-06, + "loss": 0.755, + "step": 1901 + }, + { + "epoch": 0.15, + "grad_norm": 3.649984593118544, + "learning_rate": 9.599394712020927e-06, + "loss": 0.797, + "step": 1902 + }, + { + "epoch": 0.15, + "grad_norm": 3.652218346686841, + "learning_rate": 9.598878667672525e-06, + "loss": 0.8341, + "step": 1903 + }, + { + "epoch": 0.15, + "grad_norm": 3.169274276842476, + "learning_rate": 9.598362305052764e-06, + "loss": 0.7978, + "step": 1904 + }, + { + "epoch": 0.15, + "grad_norm": 3.3453049081747848, + "learning_rate": 9.597845624197376e-06, + "loss": 0.8288, + "step": 1905 + }, + { + "epoch": 0.15, + "grad_norm": 5.293662057668835, + "learning_rate": 9.59732862514212e-06, + "loss": 0.6776, + "step": 1906 + }, + { + "epoch": 0.15, + "grad_norm": 3.7790879214603135, + "learning_rate": 9.596811307922776e-06, + "loss": 0.5941, + "step": 1907 + }, + { + "epoch": 0.15, + "grad_norm": 3.983357639424124, + "learning_rate": 9.596293672575147e-06, + "loss": 0.8004, + "step": 1908 + }, + { + "epoch": 0.16, + "grad_norm": 3.7850724701619787, + "learning_rate": 9.595775719135054e-06, + "loss": 0.6015, + "step": 1909 + }, + { + "epoch": 0.16, + "grad_norm": 6.209393942261223, + "learning_rate": 9.595257447638344e-06, + "loss": 0.8922, + "step": 1910 + }, + { + "epoch": 0.16, + "grad_norm": 4.840525113305607, + "learning_rate": 9.594738858120885e-06, + "loss": 0.6557, + "step": 1911 + }, + { + "epoch": 0.16, + "grad_norm": 5.780534332511471, + "learning_rate": 9.594219950618565e-06, + "loss": 0.8158, + "step": 1912 + }, + { + "epoch": 0.16, + "grad_norm": 3.564062676339331, + "learning_rate": 9.593700725167298e-06, + "loss": 0.7096, + "step": 1913 + }, + { + "epoch": 0.16, + "grad_norm": 4.866051578170717, + "learning_rate": 9.593181181803014e-06, + "loss": 0.6888, + "step": 1914 + }, + { + "epoch": 0.16, + "grad_norm": 2.6515255514994003, + "learning_rate": 9.592661320561676e-06, + "loss": 0.832, + "step": 1915 + }, + { + "epoch": 0.16, + "grad_norm": 2.7834864921790334, + "learning_rate": 9.592141141479254e-06, + "loss": 0.7611, + "step": 1916 + }, + { + "epoch": 0.16, + "grad_norm": 4.9199684368681105, + "learning_rate": 9.59162064459175e-06, + "loss": 0.7701, + "step": 1917 + }, + { + "epoch": 0.16, + "grad_norm": 5.432288169454921, + "learning_rate": 9.591099829935187e-06, + "loss": 0.6937, + "step": 1918 + }, + { + "epoch": 0.16, + "grad_norm": 3.6986372082979204, + "learning_rate": 9.590578697545607e-06, + "loss": 0.8122, + "step": 1919 + }, + { + "epoch": 0.16, + "grad_norm": 4.075509683323473, + "learning_rate": 9.590057247459077e-06, + "loss": 0.7345, + "step": 1920 + }, + { + "epoch": 0.16, + "grad_norm": 9.368961983218787, + "learning_rate": 9.589535479711685e-06, + "loss": 0.8236, + "step": 1921 + }, + { + "epoch": 0.16, + "grad_norm": 4.275887276675151, + "learning_rate": 9.589013394339537e-06, + "loss": 0.6647, + "step": 1922 + }, + { + "epoch": 0.16, + "grad_norm": 23.46387668240611, + "learning_rate": 9.58849099137877e-06, + "loss": 0.6335, + "step": 1923 + }, + { + "epoch": 0.16, + "grad_norm": 11.820622139434557, + "learning_rate": 9.587968270865534e-06, + "loss": 0.5967, + "step": 1924 + }, + { + "epoch": 0.16, + "grad_norm": 3.0046436101014744, + "learning_rate": 9.587445232836005e-06, + "loss": 0.731, + "step": 1925 + }, + { + "epoch": 0.16, + "grad_norm": 7.857594921186901, + "learning_rate": 9.586921877326381e-06, + "loss": 0.7659, + "step": 1926 + }, + { + "epoch": 0.16, + "grad_norm": 5.955937226745144, + "learning_rate": 9.586398204372882e-06, + "loss": 0.7501, + "step": 1927 + }, + { + "epoch": 0.16, + "grad_norm": 3.871114498134791, + "learning_rate": 9.585874214011749e-06, + "loss": 0.7319, + "step": 1928 + }, + { + "epoch": 0.16, + "grad_norm": 6.576319978287988, + "learning_rate": 9.585349906279245e-06, + "loss": 0.7733, + "step": 1929 + }, + { + "epoch": 0.16, + "grad_norm": 5.532723547252205, + "learning_rate": 9.584825281211656e-06, + "loss": 0.8911, + "step": 1930 + }, + { + "epoch": 0.16, + "grad_norm": 4.86223905744695, + "learning_rate": 9.584300338845289e-06, + "loss": 0.6837, + "step": 1931 + }, + { + "epoch": 0.16, + "grad_norm": 4.322644156473204, + "learning_rate": 9.583775079216472e-06, + "loss": 0.9413, + "step": 1932 + }, + { + "epoch": 0.16, + "grad_norm": 5.722805925680523, + "learning_rate": 9.58324950236156e-06, + "loss": 0.7049, + "step": 1933 + }, + { + "epoch": 0.16, + "grad_norm": 4.879090590846604, + "learning_rate": 9.582723608316921e-06, + "loss": 0.7505, + "step": 1934 + }, + { + "epoch": 0.16, + "grad_norm": 5.550854453836339, + "learning_rate": 9.582197397118956e-06, + "loss": 0.8024, + "step": 1935 + }, + { + "epoch": 0.16, + "grad_norm": 7.736103598975232, + "learning_rate": 9.581670868804079e-06, + "loss": 0.7483, + "step": 1936 + }, + { + "epoch": 0.16, + "grad_norm": 5.767619235708984, + "learning_rate": 9.581144023408729e-06, + "loss": 0.7122, + "step": 1937 + }, + { + "epoch": 0.16, + "grad_norm": 3.1983253808885648, + "learning_rate": 9.580616860969365e-06, + "loss": 0.7379, + "step": 1938 + }, + { + "epoch": 0.16, + "grad_norm": 23.541184725652943, + "learning_rate": 9.580089381522476e-06, + "loss": 0.6798, + "step": 1939 + }, + { + "epoch": 0.16, + "grad_norm": 3.878537419811449, + "learning_rate": 9.57956158510456e-06, + "loss": 0.8329, + "step": 1940 + }, + { + "epoch": 0.16, + "grad_norm": 6.6122434806498, + "learning_rate": 9.579033471752148e-06, + "loss": 0.7559, + "step": 1941 + }, + { + "epoch": 0.16, + "grad_norm": 5.133554861086589, + "learning_rate": 9.578505041501787e-06, + "loss": 0.8041, + "step": 1942 + }, + { + "epoch": 0.16, + "grad_norm": 12.392117446144335, + "learning_rate": 9.57797629439005e-06, + "loss": 0.7294, + "step": 1943 + }, + { + "epoch": 0.16, + "grad_norm": 8.945281611934789, + "learning_rate": 9.577447230453529e-06, + "loss": 0.7469, + "step": 1944 + }, + { + "epoch": 0.16, + "grad_norm": 4.1683942397239795, + "learning_rate": 9.576917849728836e-06, + "loss": 0.8529, + "step": 1945 + }, + { + "epoch": 0.16, + "grad_norm": 6.520692594885672, + "learning_rate": 9.57638815225261e-06, + "loss": 0.7877, + "step": 1946 + }, + { + "epoch": 0.16, + "grad_norm": 4.589191675738484, + "learning_rate": 9.575858138061506e-06, + "loss": 0.661, + "step": 1947 + }, + { + "epoch": 0.16, + "grad_norm": 3.820931375483734, + "learning_rate": 9.575327807192209e-06, + "loss": 0.7366, + "step": 1948 + }, + { + "epoch": 0.16, + "grad_norm": 3.4661618180292106, + "learning_rate": 9.57479715968142e-06, + "loss": 0.6171, + "step": 1949 + }, + { + "epoch": 0.16, + "grad_norm": 3.9086771753899145, + "learning_rate": 9.57426619556586e-06, + "loss": 0.7694, + "step": 1950 + }, + { + "epoch": 0.16, + "grad_norm": 6.650492966210364, + "learning_rate": 9.57373491488228e-06, + "loss": 0.861, + "step": 1951 + }, + { + "epoch": 0.16, + "grad_norm": 6.2456134810704595, + "learning_rate": 9.573203317667442e-06, + "loss": 0.707, + "step": 1952 + }, + { + "epoch": 0.16, + "grad_norm": 13.692815390595538, + "learning_rate": 9.572671403958142e-06, + "loss": 0.7654, + "step": 1953 + }, + { + "epoch": 0.16, + "grad_norm": 5.992182603247182, + "learning_rate": 9.572139173791185e-06, + "loss": 0.8073, + "step": 1954 + }, + { + "epoch": 0.16, + "grad_norm": 7.513363643468951, + "learning_rate": 9.571606627203413e-06, + "loss": 0.8222, + "step": 1955 + }, + { + "epoch": 0.16, + "grad_norm": 4.741155559243881, + "learning_rate": 9.571073764231675e-06, + "loss": 0.8639, + "step": 1956 + }, + { + "epoch": 0.16, + "grad_norm": 9.502065882743967, + "learning_rate": 9.570540584912852e-06, + "loss": 0.7418, + "step": 1957 + }, + { + "epoch": 0.16, + "grad_norm": 5.462441810993187, + "learning_rate": 9.570007089283841e-06, + "loss": 0.7486, + "step": 1958 + }, + { + "epoch": 0.16, + "grad_norm": 8.661879460531605, + "learning_rate": 9.569473277381565e-06, + "loss": 0.5803, + "step": 1959 + }, + { + "epoch": 0.16, + "grad_norm": 5.580014944970623, + "learning_rate": 9.568939149242966e-06, + "loss": 0.7293, + "step": 1960 + }, + { + "epoch": 0.16, + "grad_norm": 3.629131804674316, + "learning_rate": 9.56840470490501e-06, + "loss": 0.6889, + "step": 1961 + }, + { + "epoch": 0.16, + "grad_norm": 5.291300986477962, + "learning_rate": 9.567869944404682e-06, + "loss": 0.8523, + "step": 1962 + }, + { + "epoch": 0.16, + "grad_norm": 4.748288580761305, + "learning_rate": 9.567334867778992e-06, + "loss": 0.7286, + "step": 1963 + }, + { + "epoch": 0.16, + "grad_norm": 4.837748808239457, + "learning_rate": 9.566799475064973e-06, + "loss": 0.7461, + "step": 1964 + }, + { + "epoch": 0.16, + "grad_norm": 6.040507984847288, + "learning_rate": 9.566263766299675e-06, + "loss": 0.7607, + "step": 1965 + }, + { + "epoch": 0.16, + "grad_norm": 2.69920525999466, + "learning_rate": 9.56572774152017e-06, + "loss": 0.6709, + "step": 1966 + }, + { + "epoch": 0.16, + "grad_norm": 5.430829077976158, + "learning_rate": 9.565191400763561e-06, + "loss": 0.8636, + "step": 1967 + }, + { + "epoch": 0.16, + "grad_norm": 2.666879830836213, + "learning_rate": 9.564654744066959e-06, + "loss": 0.7372, + "step": 1968 + }, + { + "epoch": 0.16, + "grad_norm": 9.706743045295168, + "learning_rate": 9.564117771467509e-06, + "loss": 0.7752, + "step": 1969 + }, + { + "epoch": 0.16, + "grad_norm": 4.226346527490758, + "learning_rate": 9.56358048300237e-06, + "loss": 0.8174, + "step": 1970 + }, + { + "epoch": 0.16, + "grad_norm": 7.9886786268770065, + "learning_rate": 9.563042878708728e-06, + "loss": 0.5509, + "step": 1971 + }, + { + "epoch": 0.16, + "grad_norm": 3.2923447172436435, + "learning_rate": 9.562504958623788e-06, + "loss": 0.7268, + "step": 1972 + }, + { + "epoch": 0.16, + "grad_norm": 8.632310394426218, + "learning_rate": 9.561966722784774e-06, + "loss": 0.868, + "step": 1973 + }, + { + "epoch": 0.16, + "grad_norm": 4.958841960391876, + "learning_rate": 9.561428171228941e-06, + "loss": 0.7845, + "step": 1974 + }, + { + "epoch": 0.16, + "grad_norm": 4.729717093725785, + "learning_rate": 9.560889303993557e-06, + "loss": 0.7959, + "step": 1975 + }, + { + "epoch": 0.16, + "grad_norm": 4.102901402558135, + "learning_rate": 9.560350121115915e-06, + "loss": 0.7684, + "step": 1976 + }, + { + "epoch": 0.16, + "grad_norm": 4.075061004810444, + "learning_rate": 9.559810622633332e-06, + "loss": 0.732, + "step": 1977 + }, + { + "epoch": 0.16, + "grad_norm": 4.90634097311846, + "learning_rate": 9.559270808583142e-06, + "loss": 0.5855, + "step": 1978 + }, + { + "epoch": 0.16, + "grad_norm": 3.0505078913101116, + "learning_rate": 9.558730679002703e-06, + "loss": 0.7735, + "step": 1979 + }, + { + "epoch": 0.16, + "grad_norm": 2.9272693231438844, + "learning_rate": 9.558190233929396e-06, + "loss": 0.6365, + "step": 1980 + }, + { + "epoch": 0.16, + "grad_norm": 5.347141482559581, + "learning_rate": 9.557649473400628e-06, + "loss": 0.7674, + "step": 1981 + }, + { + "epoch": 0.16, + "grad_norm": 3.310478891248533, + "learning_rate": 9.557108397453816e-06, + "loss": 0.6082, + "step": 1982 + }, + { + "epoch": 0.16, + "grad_norm": 5.423444601805491, + "learning_rate": 9.556567006126409e-06, + "loss": 0.7377, + "step": 1983 + }, + { + "epoch": 0.16, + "grad_norm": 4.746374117258417, + "learning_rate": 9.556025299455876e-06, + "loss": 0.6249, + "step": 1984 + }, + { + "epoch": 0.16, + "grad_norm": 13.264254345867498, + "learning_rate": 9.555483277479705e-06, + "loss": 0.6606, + "step": 1985 + }, + { + "epoch": 0.16, + "grad_norm": 3.7513680932794684, + "learning_rate": 9.554940940235406e-06, + "loss": 0.8212, + "step": 1986 + }, + { + "epoch": 0.16, + "grad_norm": 3.044470117574727, + "learning_rate": 9.554398287760515e-06, + "loss": 0.8485, + "step": 1987 + }, + { + "epoch": 0.16, + "grad_norm": 4.173017285118528, + "learning_rate": 9.553855320092587e-06, + "loss": 0.9029, + "step": 1988 + }, + { + "epoch": 0.16, + "grad_norm": 6.720472294713218, + "learning_rate": 9.553312037269196e-06, + "loss": 0.7308, + "step": 1989 + }, + { + "epoch": 0.16, + "grad_norm": 3.1250736699510395, + "learning_rate": 9.552768439327941e-06, + "loss": 0.7176, + "step": 1990 + }, + { + "epoch": 0.16, + "grad_norm": 4.047436353972096, + "learning_rate": 9.552224526306445e-06, + "loss": 0.937, + "step": 1991 + }, + { + "epoch": 0.16, + "grad_norm": 2.8621236169750195, + "learning_rate": 9.551680298242348e-06, + "loss": 0.7641, + "step": 1992 + }, + { + "epoch": 0.16, + "grad_norm": 5.392121781727244, + "learning_rate": 9.551135755173315e-06, + "loss": 0.6102, + "step": 1993 + }, + { + "epoch": 0.16, + "grad_norm": 4.014438324106472, + "learning_rate": 9.55059089713703e-06, + "loss": 0.7779, + "step": 1994 + }, + { + "epoch": 0.16, + "grad_norm": 4.043296082866594, + "learning_rate": 9.550045724171204e-06, + "loss": 0.8215, + "step": 1995 + }, + { + "epoch": 0.16, + "grad_norm": 4.648604972711596, + "learning_rate": 9.549500236313562e-06, + "loss": 0.5957, + "step": 1996 + }, + { + "epoch": 0.16, + "grad_norm": 3.3469182083357625, + "learning_rate": 9.54895443360186e-06, + "loss": 0.8371, + "step": 1997 + }, + { + "epoch": 0.16, + "grad_norm": 4.313193016165895, + "learning_rate": 9.548408316073868e-06, + "loss": 0.6786, + "step": 1998 + }, + { + "epoch": 0.16, + "grad_norm": 3.0685193150962844, + "learning_rate": 9.547861883767383e-06, + "loss": 0.7183, + "step": 1999 + }, + { + "epoch": 0.16, + "grad_norm": 4.551754914135308, + "learning_rate": 9.547315136720217e-06, + "loss": 0.606, + "step": 2000 + }, + { + "epoch": 0.16, + "grad_norm": 4.388912798266088, + "learning_rate": 9.546768074970213e-06, + "loss": 0.7162, + "step": 2001 + }, + { + "epoch": 0.16, + "grad_norm": 11.62340631243312, + "learning_rate": 9.546220698555227e-06, + "loss": 0.5885, + "step": 2002 + }, + { + "epoch": 0.16, + "grad_norm": 7.283462508030961, + "learning_rate": 9.545673007513145e-06, + "loss": 0.545, + "step": 2003 + }, + { + "epoch": 0.16, + "grad_norm": 4.312574679222201, + "learning_rate": 9.54512500188187e-06, + "loss": 0.5053, + "step": 2004 + }, + { + "epoch": 0.16, + "grad_norm": 5.754950103029749, + "learning_rate": 9.544576681699325e-06, + "loss": 0.801, + "step": 2005 + }, + { + "epoch": 0.16, + "grad_norm": 4.936679378804686, + "learning_rate": 9.544028047003458e-06, + "loss": 0.6033, + "step": 2006 + }, + { + "epoch": 0.16, + "grad_norm": 7.202703373623657, + "learning_rate": 9.54347909783224e-06, + "loss": 0.7605, + "step": 2007 + }, + { + "epoch": 0.16, + "grad_norm": 2.749700340286971, + "learning_rate": 9.54292983422366e-06, + "loss": 0.6187, + "step": 2008 + }, + { + "epoch": 0.16, + "grad_norm": 3.6780006488104973, + "learning_rate": 9.54238025621573e-06, + "loss": 0.884, + "step": 2009 + }, + { + "epoch": 0.16, + "grad_norm": 8.457104952900698, + "learning_rate": 9.541830363846487e-06, + "loss": 0.6231, + "step": 2010 + }, + { + "epoch": 0.16, + "grad_norm": 5.015180909674552, + "learning_rate": 9.541280157153983e-06, + "loss": 0.7633, + "step": 2011 + }, + { + "epoch": 0.16, + "grad_norm": 3.513798459864995, + "learning_rate": 9.540729636176298e-06, + "loss": 0.7383, + "step": 2012 + }, + { + "epoch": 0.16, + "grad_norm": 4.955117948871653, + "learning_rate": 9.540178800951533e-06, + "loss": 0.7131, + "step": 2013 + }, + { + "epoch": 0.16, + "grad_norm": 5.592603386141614, + "learning_rate": 9.539627651517807e-06, + "loss": 0.6427, + "step": 2014 + }, + { + "epoch": 0.16, + "grad_norm": 4.088672247221793, + "learning_rate": 9.539076187913262e-06, + "loss": 1.0259, + "step": 2015 + }, + { + "epoch": 0.16, + "grad_norm": 5.03446194313494, + "learning_rate": 9.538524410176066e-06, + "loss": 0.7748, + "step": 2016 + }, + { + "epoch": 0.16, + "grad_norm": 4.069479300323137, + "learning_rate": 9.537972318344403e-06, + "loss": 0.8233, + "step": 2017 + }, + { + "epoch": 0.16, + "grad_norm": 5.6678608365502505, + "learning_rate": 9.537419912456484e-06, + "loss": 0.8731, + "step": 2018 + }, + { + "epoch": 0.16, + "grad_norm": 7.395120297258934, + "learning_rate": 9.536867192550536e-06, + "loss": 0.7527, + "step": 2019 + }, + { + "epoch": 0.16, + "grad_norm": 3.3407798256646806, + "learning_rate": 9.536314158664813e-06, + "loss": 0.6686, + "step": 2020 + }, + { + "epoch": 0.16, + "grad_norm": 5.156708901906526, + "learning_rate": 9.535760810837584e-06, + "loss": 0.8188, + "step": 2021 + }, + { + "epoch": 0.16, + "grad_norm": 3.6540335836117044, + "learning_rate": 9.53520714910715e-06, + "loss": 0.6477, + "step": 2022 + }, + { + "epoch": 0.16, + "grad_norm": 7.591064639731454, + "learning_rate": 9.534653173511825e-06, + "loss": 0.8695, + "step": 2023 + }, + { + "epoch": 0.16, + "grad_norm": 3.983265518122732, + "learning_rate": 9.534098884089948e-06, + "loss": 0.8926, + "step": 2024 + }, + { + "epoch": 0.16, + "grad_norm": 4.774210134866984, + "learning_rate": 9.53354428087988e-06, + "loss": 0.8349, + "step": 2025 + }, + { + "epoch": 0.16, + "grad_norm": 5.1143741792919775, + "learning_rate": 9.53298936392e-06, + "loss": 0.7237, + "step": 2026 + }, + { + "epoch": 0.16, + "grad_norm": 4.240390179722257, + "learning_rate": 9.532434133248713e-06, + "loss": 0.743, + "step": 2027 + }, + { + "epoch": 0.16, + "grad_norm": 2.6729041427744438, + "learning_rate": 9.531878588904448e-06, + "loss": 0.7273, + "step": 2028 + }, + { + "epoch": 0.16, + "grad_norm": 2.8738962510521997, + "learning_rate": 9.531322730925648e-06, + "loss": 0.7683, + "step": 2029 + }, + { + "epoch": 0.16, + "grad_norm": 3.9668327773181953, + "learning_rate": 9.530766559350784e-06, + "loss": 0.7224, + "step": 2030 + }, + { + "epoch": 0.16, + "grad_norm": 3.452322122144783, + "learning_rate": 9.530210074218346e-06, + "loss": 0.807, + "step": 2031 + }, + { + "epoch": 0.17, + "grad_norm": 5.92006307280626, + "learning_rate": 9.529653275566848e-06, + "loss": 0.6063, + "step": 2032 + }, + { + "epoch": 0.17, + "grad_norm": 2.662203907162793, + "learning_rate": 9.529096163434822e-06, + "loss": 0.8439, + "step": 2033 + }, + { + "epoch": 0.17, + "grad_norm": 3.7089146074680674, + "learning_rate": 9.528538737860822e-06, + "loss": 0.7786, + "step": 2034 + }, + { + "epoch": 0.17, + "grad_norm": 9.334824574146266, + "learning_rate": 9.527980998883428e-06, + "loss": 0.6236, + "step": 2035 + }, + { + "epoch": 0.17, + "grad_norm": 3.4433397520672435, + "learning_rate": 9.527422946541238e-06, + "loss": 0.7464, + "step": 2036 + }, + { + "epoch": 0.17, + "grad_norm": 3.7348087981084404, + "learning_rate": 9.526864580872874e-06, + "loss": 0.6583, + "step": 2037 + }, + { + "epoch": 0.17, + "grad_norm": 4.156000313740864, + "learning_rate": 9.526305901916977e-06, + "loss": 0.759, + "step": 2038 + }, + { + "epoch": 0.17, + "grad_norm": 4.847661734969043, + "learning_rate": 9.525746909712211e-06, + "loss": 0.769, + "step": 2039 + }, + { + "epoch": 0.17, + "grad_norm": 4.725551787401016, + "learning_rate": 9.525187604297263e-06, + "loss": 0.6909, + "step": 2040 + }, + { + "epoch": 0.17, + "grad_norm": 3.63523204318454, + "learning_rate": 9.52462798571084e-06, + "loss": 0.7335, + "step": 2041 + }, + { + "epoch": 0.17, + "grad_norm": 4.570478317623154, + "learning_rate": 9.52406805399167e-06, + "loss": 0.7998, + "step": 2042 + }, + { + "epoch": 0.17, + "grad_norm": 3.150966690651914, + "learning_rate": 9.523507809178506e-06, + "loss": 0.8675, + "step": 2043 + }, + { + "epoch": 0.17, + "grad_norm": 3.106597272183591, + "learning_rate": 9.52294725131012e-06, + "loss": 0.747, + "step": 2044 + }, + { + "epoch": 0.17, + "grad_norm": 4.94705803016184, + "learning_rate": 9.522386380425304e-06, + "loss": 0.8825, + "step": 2045 + }, + { + "epoch": 0.17, + "grad_norm": 3.1018437421997573, + "learning_rate": 9.521825196562875e-06, + "loss": 0.7366, + "step": 2046 + }, + { + "epoch": 0.17, + "grad_norm": 5.317760526975097, + "learning_rate": 9.521263699761672e-06, + "loss": 0.759, + "step": 2047 + }, + { + "epoch": 0.17, + "grad_norm": 4.759913705581531, + "learning_rate": 9.52070189006055e-06, + "loss": 0.7566, + "step": 2048 + }, + { + "epoch": 0.17, + "grad_norm": 4.868405365200601, + "learning_rate": 9.520139767498396e-06, + "loss": 0.6385, + "step": 2049 + }, + { + "epoch": 0.17, + "grad_norm": 5.315602251041462, + "learning_rate": 9.519577332114107e-06, + "loss": 0.8282, + "step": 2050 + }, + { + "epoch": 0.17, + "grad_norm": 5.184938433524998, + "learning_rate": 9.51901458394661e-06, + "loss": 0.5954, + "step": 2051 + }, + { + "epoch": 0.17, + "grad_norm": 9.59952242453887, + "learning_rate": 9.518451523034849e-06, + "loss": 0.7167, + "step": 2052 + }, + { + "epoch": 0.17, + "grad_norm": 4.544466954040468, + "learning_rate": 9.51788814941779e-06, + "loss": 0.6876, + "step": 2053 + }, + { + "epoch": 0.17, + "grad_norm": 6.584001426821807, + "learning_rate": 9.517324463134427e-06, + "loss": 0.9588, + "step": 2054 + }, + { + "epoch": 0.17, + "grad_norm": 7.014282958950346, + "learning_rate": 9.516760464223768e-06, + "loss": 0.7257, + "step": 2055 + }, + { + "epoch": 0.17, + "grad_norm": 19.68556681860114, + "learning_rate": 9.516196152724844e-06, + "loss": 0.7014, + "step": 2056 + }, + { + "epoch": 0.17, + "grad_norm": 3.639990438257149, + "learning_rate": 9.515631528676709e-06, + "loss": 0.6899, + "step": 2057 + }, + { + "epoch": 0.17, + "grad_norm": 4.845043218929715, + "learning_rate": 9.515066592118441e-06, + "loss": 0.8476, + "step": 2058 + }, + { + "epoch": 0.17, + "grad_norm": 5.235041824788746, + "learning_rate": 9.514501343089135e-06, + "loss": 0.6884, + "step": 2059 + }, + { + "epoch": 0.17, + "grad_norm": 4.661354298247606, + "learning_rate": 9.51393578162791e-06, + "loss": 0.7492, + "step": 2060 + }, + { + "epoch": 0.17, + "grad_norm": 11.40175527577864, + "learning_rate": 9.513369907773907e-06, + "loss": 0.8302, + "step": 2061 + }, + { + "epoch": 0.17, + "grad_norm": 6.225136140618681, + "learning_rate": 9.512803721566288e-06, + "loss": 0.8657, + "step": 2062 + }, + { + "epoch": 0.17, + "grad_norm": 3.438067838384648, + "learning_rate": 9.512237223044236e-06, + "loss": 0.6311, + "step": 2063 + }, + { + "epoch": 0.17, + "grad_norm": 6.009569773614633, + "learning_rate": 9.511670412246956e-06, + "loss": 0.6985, + "step": 2064 + }, + { + "epoch": 0.17, + "grad_norm": 7.165404543758577, + "learning_rate": 9.511103289213678e-06, + "loss": 0.6983, + "step": 2065 + }, + { + "epoch": 0.17, + "grad_norm": 3.5925137624816283, + "learning_rate": 9.510535853983646e-06, + "loss": 0.7729, + "step": 2066 + }, + { + "epoch": 0.17, + "grad_norm": 3.830215572789879, + "learning_rate": 9.509968106596135e-06, + "loss": 0.9328, + "step": 2067 + }, + { + "epoch": 0.17, + "grad_norm": 3.7681736109660107, + "learning_rate": 9.509400047090432e-06, + "loss": 0.8825, + "step": 2068 + }, + { + "epoch": 0.17, + "grad_norm": 4.029848217398536, + "learning_rate": 9.508831675505852e-06, + "loss": 0.6479, + "step": 2069 + }, + { + "epoch": 0.17, + "grad_norm": 4.617958175930304, + "learning_rate": 9.508262991881732e-06, + "loss": 0.8034, + "step": 2070 + }, + { + "epoch": 0.17, + "grad_norm": 28.21407865701392, + "learning_rate": 9.507693996257423e-06, + "loss": 0.5949, + "step": 2071 + }, + { + "epoch": 0.17, + "grad_norm": 5.5739910955395375, + "learning_rate": 9.50712468867231e-06, + "loss": 0.701, + "step": 2072 + }, + { + "epoch": 0.17, + "grad_norm": 4.565537743131155, + "learning_rate": 9.506555069165788e-06, + "loss": 0.6443, + "step": 2073 + }, + { + "epoch": 0.17, + "grad_norm": 5.208104204780439, + "learning_rate": 9.505985137777279e-06, + "loss": 0.7324, + "step": 2074 + }, + { + "epoch": 0.17, + "grad_norm": 4.2543928650833385, + "learning_rate": 9.505414894546228e-06, + "loss": 0.8729, + "step": 2075 + }, + { + "epoch": 0.17, + "grad_norm": 12.149072373841449, + "learning_rate": 9.504844339512096e-06, + "loss": 0.5921, + "step": 2076 + }, + { + "epoch": 0.17, + "grad_norm": 7.1978628038926145, + "learning_rate": 9.50427347271437e-06, + "loss": 0.7645, + "step": 2077 + }, + { + "epoch": 0.17, + "grad_norm": 7.181758722730161, + "learning_rate": 9.503702294192563e-06, + "loss": 0.8186, + "step": 2078 + }, + { + "epoch": 0.17, + "grad_norm": 2.8504907787196663, + "learning_rate": 9.503130803986195e-06, + "loss": 0.7339, + "step": 2079 + }, + { + "epoch": 0.17, + "grad_norm": 5.938044664849103, + "learning_rate": 9.502559002134825e-06, + "loss": 0.7385, + "step": 2080 + }, + { + "epoch": 0.17, + "grad_norm": 6.923340086153708, + "learning_rate": 9.501986888678018e-06, + "loss": 0.9369, + "step": 2081 + }, + { + "epoch": 0.17, + "grad_norm": 3.0951155645712025, + "learning_rate": 9.501414463655375e-06, + "loss": 0.7347, + "step": 2082 + }, + { + "epoch": 0.17, + "grad_norm": 6.190582623889132, + "learning_rate": 9.500841727106505e-06, + "loss": 0.7804, + "step": 2083 + }, + { + "epoch": 0.17, + "grad_norm": 4.209644942656819, + "learning_rate": 9.500268679071049e-06, + "loss": 0.6868, + "step": 2084 + }, + { + "epoch": 0.17, + "grad_norm": 5.367701846607473, + "learning_rate": 9.499695319588665e-06, + "loss": 0.8, + "step": 2085 + }, + { + "epoch": 0.17, + "grad_norm": 3.0898186882174556, + "learning_rate": 9.499121648699032e-06, + "loss": 0.6248, + "step": 2086 + }, + { + "epoch": 0.17, + "grad_norm": 4.262526272615166, + "learning_rate": 9.498547666441851e-06, + "loss": 0.7654, + "step": 2087 + }, + { + "epoch": 0.17, + "grad_norm": 6.91330994481406, + "learning_rate": 9.497973372856848e-06, + "loss": 0.633, + "step": 2088 + }, + { + "epoch": 0.17, + "grad_norm": 3.9779799090856396, + "learning_rate": 9.497398767983765e-06, + "loss": 0.8404, + "step": 2089 + }, + { + "epoch": 0.17, + "grad_norm": 6.934955577626417, + "learning_rate": 9.49682385186237e-06, + "loss": 0.8079, + "step": 2090 + }, + { + "epoch": 0.17, + "grad_norm": 10.4209913773888, + "learning_rate": 9.49624862453245e-06, + "loss": 0.7093, + "step": 2091 + }, + { + "epoch": 0.17, + "grad_norm": 3.2256437758995236, + "learning_rate": 9.495673086033813e-06, + "loss": 0.6766, + "step": 2092 + }, + { + "epoch": 0.17, + "grad_norm": 2.904518263986565, + "learning_rate": 9.495097236406293e-06, + "loss": 0.6487, + "step": 2093 + }, + { + "epoch": 0.17, + "grad_norm": 3.9394011083893625, + "learning_rate": 9.49452107568974e-06, + "loss": 0.8508, + "step": 2094 + }, + { + "epoch": 0.17, + "grad_norm": 20.7042875945951, + "learning_rate": 9.493944603924028e-06, + "loss": 0.7621, + "step": 2095 + }, + { + "epoch": 0.17, + "grad_norm": 7.076726490097566, + "learning_rate": 9.493367821149055e-06, + "loss": 0.6712, + "step": 2096 + }, + { + "epoch": 0.17, + "grad_norm": 4.889614597467827, + "learning_rate": 9.492790727404735e-06, + "loss": 0.8938, + "step": 2097 + }, + { + "epoch": 0.17, + "grad_norm": 16.48751405685699, + "learning_rate": 9.492213322731007e-06, + "loss": 0.7715, + "step": 2098 + }, + { + "epoch": 0.17, + "grad_norm": 4.029371087791076, + "learning_rate": 9.491635607167833e-06, + "loss": 0.8416, + "step": 2099 + }, + { + "epoch": 0.17, + "grad_norm": 2.7653381349070747, + "learning_rate": 9.491057580755195e-06, + "loss": 0.68, + "step": 2100 + }, + { + "epoch": 0.17, + "grad_norm": 3.722529293694443, + "learning_rate": 9.490479243533091e-06, + "loss": 0.6533, + "step": 2101 + }, + { + "epoch": 0.17, + "grad_norm": 4.286052108979617, + "learning_rate": 9.48990059554155e-06, + "loss": 0.6244, + "step": 2102 + }, + { + "epoch": 0.17, + "grad_norm": 9.147750980824322, + "learning_rate": 9.489321636820618e-06, + "loss": 0.7, + "step": 2103 + }, + { + "epoch": 0.17, + "grad_norm": 4.37475200060871, + "learning_rate": 9.48874236741036e-06, + "loss": 0.6361, + "step": 2104 + }, + { + "epoch": 0.17, + "grad_norm": 2.4498326683227556, + "learning_rate": 9.488162787350868e-06, + "loss": 0.7527, + "step": 2105 + }, + { + "epoch": 0.17, + "grad_norm": 8.887814751834652, + "learning_rate": 9.487582896682252e-06, + "loss": 0.7385, + "step": 2106 + }, + { + "epoch": 0.17, + "grad_norm": 4.504219828683532, + "learning_rate": 9.487002695444642e-06, + "loss": 0.7546, + "step": 2107 + }, + { + "epoch": 0.17, + "grad_norm": 12.295969044189745, + "learning_rate": 9.486422183678193e-06, + "loss": 0.6661, + "step": 2108 + }, + { + "epoch": 0.17, + "grad_norm": 6.7786179183041275, + "learning_rate": 9.48584136142308e-06, + "loss": 0.6965, + "step": 2109 + }, + { + "epoch": 0.17, + "grad_norm": 5.284237269569144, + "learning_rate": 9.485260228719502e-06, + "loss": 0.5682, + "step": 2110 + }, + { + "epoch": 0.17, + "grad_norm": 4.980833282126374, + "learning_rate": 9.484678785607672e-06, + "loss": 0.7451, + "step": 2111 + }, + { + "epoch": 0.17, + "grad_norm": 5.92210072304862, + "learning_rate": 9.484097032127832e-06, + "loss": 0.6947, + "step": 2112 + }, + { + "epoch": 0.17, + "grad_norm": 3.334226237755275, + "learning_rate": 9.483514968320244e-06, + "loss": 0.812, + "step": 2113 + }, + { + "epoch": 0.17, + "grad_norm": 3.8236768840805073, + "learning_rate": 9.482932594225191e-06, + "loss": 0.8132, + "step": 2114 + }, + { + "epoch": 0.17, + "grad_norm": 3.1970556987782914, + "learning_rate": 9.482349909882973e-06, + "loss": 0.7479, + "step": 2115 + }, + { + "epoch": 0.17, + "grad_norm": 3.221272605889116, + "learning_rate": 9.48176691533392e-06, + "loss": 0.7033, + "step": 2116 + }, + { + "epoch": 0.17, + "grad_norm": 3.603480952052131, + "learning_rate": 9.481183610618376e-06, + "loss": 0.6808, + "step": 2117 + }, + { + "epoch": 0.17, + "grad_norm": 9.582406893560186, + "learning_rate": 9.480599995776711e-06, + "loss": 0.8008, + "step": 2118 + }, + { + "epoch": 0.17, + "grad_norm": 3.2382574803857005, + "learning_rate": 9.480016070849313e-06, + "loss": 0.6857, + "step": 2119 + }, + { + "epoch": 0.17, + "grad_norm": 5.564920211494573, + "learning_rate": 9.479431835876596e-06, + "loss": 0.6747, + "step": 2120 + }, + { + "epoch": 0.17, + "grad_norm": 3.4924521573959746, + "learning_rate": 9.47884729089899e-06, + "loss": 0.7785, + "step": 2121 + }, + { + "epoch": 0.17, + "grad_norm": 15.528146713415923, + "learning_rate": 9.47826243595695e-06, + "loss": 0.8467, + "step": 2122 + }, + { + "epoch": 0.17, + "grad_norm": 5.41285363032105, + "learning_rate": 9.477677271090953e-06, + "loss": 0.5509, + "step": 2123 + }, + { + "epoch": 0.17, + "grad_norm": 3.039445064345485, + "learning_rate": 9.477091796341493e-06, + "loss": 0.6809, + "step": 2124 + }, + { + "epoch": 0.17, + "grad_norm": 4.1881250448313025, + "learning_rate": 9.476506011749092e-06, + "loss": 0.8855, + "step": 2125 + }, + { + "epoch": 0.17, + "grad_norm": 3.2351841021100927, + "learning_rate": 9.475919917354289e-06, + "loss": 0.7689, + "step": 2126 + }, + { + "epoch": 0.17, + "grad_norm": 5.544810584894087, + "learning_rate": 9.475333513197645e-06, + "loss": 0.9212, + "step": 2127 + }, + { + "epoch": 0.17, + "grad_norm": 3.885776783984978, + "learning_rate": 9.474746799319742e-06, + "loss": 0.8256, + "step": 2128 + }, + { + "epoch": 0.17, + "grad_norm": 6.88149219177898, + "learning_rate": 9.474159775761187e-06, + "loss": 0.5833, + "step": 2129 + }, + { + "epoch": 0.17, + "grad_norm": 4.908739721336877, + "learning_rate": 9.473572442562603e-06, + "loss": 0.8242, + "step": 2130 + }, + { + "epoch": 0.17, + "grad_norm": 3.413556592269906, + "learning_rate": 9.472984799764636e-06, + "loss": 0.8259, + "step": 2131 + }, + { + "epoch": 0.17, + "grad_norm": 3.371195729587371, + "learning_rate": 9.47239684740796e-06, + "loss": 0.7847, + "step": 2132 + }, + { + "epoch": 0.17, + "grad_norm": 7.3310849215939875, + "learning_rate": 9.471808585533258e-06, + "loss": 0.7993, + "step": 2133 + }, + { + "epoch": 0.17, + "grad_norm": 6.936647099613488, + "learning_rate": 9.471220014181247e-06, + "loss": 0.7231, + "step": 2134 + }, + { + "epoch": 0.17, + "grad_norm": 5.746035773959225, + "learning_rate": 9.470631133392658e-06, + "loss": 0.6573, + "step": 2135 + }, + { + "epoch": 0.17, + "grad_norm": 4.2937572447606005, + "learning_rate": 9.470041943208244e-06, + "loss": 0.7179, + "step": 2136 + }, + { + "epoch": 0.17, + "grad_norm": 5.616107754082006, + "learning_rate": 9.469452443668783e-06, + "loss": 0.7301, + "step": 2137 + }, + { + "epoch": 0.17, + "grad_norm": 3.1741497862748744, + "learning_rate": 9.468862634815071e-06, + "loss": 0.7679, + "step": 2138 + }, + { + "epoch": 0.17, + "grad_norm": 4.6015770803456935, + "learning_rate": 9.468272516687927e-06, + "loss": 0.9215, + "step": 2139 + }, + { + "epoch": 0.17, + "grad_norm": 4.248050816852012, + "learning_rate": 9.467682089328188e-06, + "loss": 0.652, + "step": 2140 + }, + { + "epoch": 0.17, + "grad_norm": 3.667877088209888, + "learning_rate": 9.467091352776719e-06, + "loss": 0.6745, + "step": 2141 + }, + { + "epoch": 0.17, + "grad_norm": 5.7982642605322505, + "learning_rate": 9.4665003070744e-06, + "loss": 0.5877, + "step": 2142 + }, + { + "epoch": 0.17, + "grad_norm": 6.232698648392658, + "learning_rate": 9.465908952262138e-06, + "loss": 0.6455, + "step": 2143 + }, + { + "epoch": 0.17, + "grad_norm": 3.0300997784156403, + "learning_rate": 9.465317288380856e-06, + "loss": 0.664, + "step": 2144 + }, + { + "epoch": 0.17, + "grad_norm": 4.661573369932758, + "learning_rate": 9.464725315471503e-06, + "loss": 0.7229, + "step": 2145 + }, + { + "epoch": 0.17, + "grad_norm": 3.1932061298592256, + "learning_rate": 9.464133033575044e-06, + "loss": 0.7656, + "step": 2146 + }, + { + "epoch": 0.17, + "grad_norm": 3.2102250362492124, + "learning_rate": 9.463540442732471e-06, + "loss": 0.8558, + "step": 2147 + }, + { + "epoch": 0.17, + "grad_norm": 19.91230012279406, + "learning_rate": 9.462947542984795e-06, + "loss": 0.548, + "step": 2148 + }, + { + "epoch": 0.17, + "grad_norm": 4.421373048694864, + "learning_rate": 9.46235433437305e-06, + "loss": 0.7712, + "step": 2149 + }, + { + "epoch": 0.17, + "grad_norm": 22.704662749315894, + "learning_rate": 9.461760816938284e-06, + "loss": 0.7436, + "step": 2150 + }, + { + "epoch": 0.17, + "grad_norm": 4.5205088420416795, + "learning_rate": 9.461166990721577e-06, + "loss": 0.7562, + "step": 2151 + }, + { + "epoch": 0.17, + "grad_norm": 6.494619105681334, + "learning_rate": 9.460572855764026e-06, + "loss": 0.7307, + "step": 2152 + }, + { + "epoch": 0.17, + "grad_norm": 7.0948556508911205, + "learning_rate": 9.459978412106747e-06, + "loss": 0.7919, + "step": 2153 + }, + { + "epoch": 0.17, + "grad_norm": 3.971358603425492, + "learning_rate": 9.459383659790878e-06, + "loss": 0.9116, + "step": 2154 + }, + { + "epoch": 0.18, + "grad_norm": 3.770241627934109, + "learning_rate": 9.458788598857583e-06, + "loss": 0.7459, + "step": 2155 + }, + { + "epoch": 0.18, + "grad_norm": 3.9916453650688513, + "learning_rate": 9.458193229348041e-06, + "loss": 0.7347, + "step": 2156 + }, + { + "epoch": 0.18, + "grad_norm": 4.685289420583578, + "learning_rate": 9.457597551303456e-06, + "loss": 0.7292, + "step": 2157 + }, + { + "epoch": 0.18, + "grad_norm": 9.550061305922657, + "learning_rate": 9.457001564765054e-06, + "loss": 0.8577, + "step": 2158 + }, + { + "epoch": 0.18, + "grad_norm": 4.658212701419767, + "learning_rate": 9.45640526977408e-06, + "loss": 0.6219, + "step": 2159 + }, + { + "epoch": 0.18, + "grad_norm": 4.328712174049183, + "learning_rate": 9.455808666371801e-06, + "loss": 0.8334, + "step": 2160 + }, + { + "epoch": 0.18, + "grad_norm": 31.283901558930665, + "learning_rate": 9.455211754599507e-06, + "loss": 0.8123, + "step": 2161 + }, + { + "epoch": 0.18, + "grad_norm": 6.2149093110170845, + "learning_rate": 9.454614534498506e-06, + "loss": 0.6287, + "step": 2162 + }, + { + "epoch": 0.18, + "grad_norm": 3.499307005845178, + "learning_rate": 9.454017006110131e-06, + "loss": 0.7581, + "step": 2163 + }, + { + "epoch": 0.18, + "grad_norm": 4.901657518956858, + "learning_rate": 9.453419169475735e-06, + "loss": 0.7202, + "step": 2164 + }, + { + "epoch": 0.18, + "grad_norm": 4.592944564405269, + "learning_rate": 9.452821024636691e-06, + "loss": 0.8367, + "step": 2165 + }, + { + "epoch": 0.18, + "grad_norm": 5.226155793398849, + "learning_rate": 9.452222571634395e-06, + "loss": 0.6827, + "step": 2166 + }, + { + "epoch": 0.18, + "grad_norm": 4.891219402143376, + "learning_rate": 9.451623810510265e-06, + "loss": 0.925, + "step": 2167 + }, + { + "epoch": 0.18, + "grad_norm": 3.6963689247433322, + "learning_rate": 9.451024741305735e-06, + "loss": 0.741, + "step": 2168 + }, + { + "epoch": 0.18, + "grad_norm": 3.2262127277331176, + "learning_rate": 9.450425364062267e-06, + "loss": 0.7828, + "step": 2169 + }, + { + "epoch": 0.18, + "grad_norm": 4.250505801213511, + "learning_rate": 9.449825678821342e-06, + "loss": 0.7198, + "step": 2170 + }, + { + "epoch": 0.18, + "grad_norm": 5.424240575005041, + "learning_rate": 9.449225685624464e-06, + "loss": 0.8776, + "step": 2171 + }, + { + "epoch": 0.18, + "grad_norm": 3.2262148617883866, + "learning_rate": 9.448625384513152e-06, + "loss": 0.7382, + "step": 2172 + }, + { + "epoch": 0.18, + "grad_norm": 3.277875025867993, + "learning_rate": 9.448024775528952e-06, + "loss": 0.6762, + "step": 2173 + }, + { + "epoch": 0.18, + "grad_norm": 20.74502568538336, + "learning_rate": 9.447423858713432e-06, + "loss": 0.7334, + "step": 2174 + }, + { + "epoch": 0.18, + "grad_norm": 29.118910404589663, + "learning_rate": 9.446822634108176e-06, + "loss": 0.7457, + "step": 2175 + }, + { + "epoch": 0.18, + "grad_norm": 58.80773566882819, + "learning_rate": 9.446221101754795e-06, + "loss": 0.9126, + "step": 2176 + }, + { + "epoch": 0.18, + "grad_norm": 19.832332368923616, + "learning_rate": 9.445619261694919e-06, + "loss": 0.7298, + "step": 2177 + }, + { + "epoch": 0.18, + "grad_norm": 8.565317443396655, + "learning_rate": 9.445017113970196e-06, + "loss": 0.7495, + "step": 2178 + }, + { + "epoch": 0.18, + "grad_norm": 7.2153398268895605, + "learning_rate": 9.444414658622303e-06, + "loss": 0.7608, + "step": 2179 + }, + { + "epoch": 0.18, + "grad_norm": 5.836142163920102, + "learning_rate": 9.44381189569293e-06, + "loss": 0.7979, + "step": 2180 + }, + { + "epoch": 0.18, + "grad_norm": 6.857945916804737, + "learning_rate": 9.443208825223794e-06, + "loss": 0.7325, + "step": 2181 + }, + { + "epoch": 0.18, + "grad_norm": 3.7139954606241856, + "learning_rate": 9.442605447256629e-06, + "loss": 0.7275, + "step": 2182 + }, + { + "epoch": 0.18, + "grad_norm": 3.2927347969230767, + "learning_rate": 9.442001761833194e-06, + "loss": 0.696, + "step": 2183 + }, + { + "epoch": 0.18, + "grad_norm": 4.0152490524861895, + "learning_rate": 9.441397768995269e-06, + "loss": 0.8456, + "step": 2184 + }, + { + "epoch": 0.18, + "grad_norm": 4.562548877766768, + "learning_rate": 9.440793468784652e-06, + "loss": 0.7004, + "step": 2185 + }, + { + "epoch": 0.18, + "grad_norm": 3.2214597213621525, + "learning_rate": 9.440188861243167e-06, + "loss": 0.7021, + "step": 2186 + }, + { + "epoch": 0.18, + "grad_norm": 4.992153901709443, + "learning_rate": 9.439583946412655e-06, + "loss": 0.6799, + "step": 2187 + }, + { + "epoch": 0.18, + "grad_norm": 5.52581188192942, + "learning_rate": 9.438978724334979e-06, + "loss": 0.8105, + "step": 2188 + }, + { + "epoch": 0.18, + "grad_norm": 6.3332410062828615, + "learning_rate": 9.438373195052027e-06, + "loss": 0.7983, + "step": 2189 + }, + { + "epoch": 0.18, + "grad_norm": 4.543033824275824, + "learning_rate": 9.4377673586057e-06, + "loss": 0.785, + "step": 2190 + }, + { + "epoch": 0.18, + "grad_norm": 3.4754594955103317, + "learning_rate": 9.437161215037931e-06, + "loss": 0.8684, + "step": 2191 + }, + { + "epoch": 0.18, + "grad_norm": 2.581726752132544, + "learning_rate": 9.436554764390668e-06, + "loss": 0.6639, + "step": 2192 + }, + { + "epoch": 0.18, + "grad_norm": 5.159742703622623, + "learning_rate": 9.435948006705882e-06, + "loss": 0.7564, + "step": 2193 + }, + { + "epoch": 0.18, + "grad_norm": 4.920817169686874, + "learning_rate": 9.43534094202556e-06, + "loss": 0.7033, + "step": 2194 + }, + { + "epoch": 0.18, + "grad_norm": 8.283248881171149, + "learning_rate": 9.434733570391719e-06, + "loss": 0.6106, + "step": 2195 + }, + { + "epoch": 0.18, + "grad_norm": 3.7483595265567726, + "learning_rate": 9.434125891846391e-06, + "loss": 0.7352, + "step": 2196 + }, + { + "epoch": 0.18, + "grad_norm": 8.247800003799725, + "learning_rate": 9.433517906431631e-06, + "loss": 0.6542, + "step": 2197 + }, + { + "epoch": 0.18, + "grad_norm": 6.2287248294909014, + "learning_rate": 9.432909614189518e-06, + "loss": 0.8759, + "step": 2198 + }, + { + "epoch": 0.18, + "grad_norm": 4.826646694421205, + "learning_rate": 9.432301015162146e-06, + "loss": 0.643, + "step": 2199 + }, + { + "epoch": 0.18, + "grad_norm": 9.164982356453415, + "learning_rate": 9.431692109391637e-06, + "loss": 0.8407, + "step": 2200 + }, + { + "epoch": 0.18, + "grad_norm": 4.94094157276911, + "learning_rate": 9.43108289692013e-06, + "loss": 0.9695, + "step": 2201 + }, + { + "epoch": 0.18, + "grad_norm": 6.620832287217032, + "learning_rate": 9.430473377789785e-06, + "loss": 0.8258, + "step": 2202 + }, + { + "epoch": 0.18, + "grad_norm": 4.160582982679015, + "learning_rate": 9.429863552042786e-06, + "loss": 0.8213, + "step": 2203 + }, + { + "epoch": 0.18, + "grad_norm": 3.866757732410586, + "learning_rate": 9.429253419721335e-06, + "loss": 0.8619, + "step": 2204 + }, + { + "epoch": 0.18, + "grad_norm": 3.6665511360027305, + "learning_rate": 9.428642980867661e-06, + "loss": 0.7055, + "step": 2205 + }, + { + "epoch": 0.18, + "grad_norm": 25.09850099463008, + "learning_rate": 9.428032235524007e-06, + "loss": 0.9461, + "step": 2206 + }, + { + "epoch": 0.18, + "grad_norm": 5.442019759611695, + "learning_rate": 9.427421183732642e-06, + "loss": 0.6679, + "step": 2207 + }, + { + "epoch": 0.18, + "grad_norm": 8.97515284051912, + "learning_rate": 9.426809825535851e-06, + "loss": 0.6993, + "step": 2208 + }, + { + "epoch": 0.18, + "grad_norm": 10.144782682181232, + "learning_rate": 9.426198160975948e-06, + "loss": 0.6951, + "step": 2209 + }, + { + "epoch": 0.18, + "grad_norm": 4.949928000474687, + "learning_rate": 9.425586190095263e-06, + "loss": 0.8416, + "step": 2210 + }, + { + "epoch": 0.18, + "grad_norm": 4.019983078496993, + "learning_rate": 9.424973912936147e-06, + "loss": 0.733, + "step": 2211 + }, + { + "epoch": 0.18, + "grad_norm": 4.9531397698021244, + "learning_rate": 9.424361329540976e-06, + "loss": 0.7383, + "step": 2212 + }, + { + "epoch": 0.18, + "grad_norm": 13.257236245716877, + "learning_rate": 9.42374843995214e-06, + "loss": 0.7672, + "step": 2213 + }, + { + "epoch": 0.18, + "grad_norm": 9.82778335009299, + "learning_rate": 9.42313524421206e-06, + "loss": 0.5727, + "step": 2214 + }, + { + "epoch": 0.18, + "grad_norm": 3.9354030172853096, + "learning_rate": 9.42252174236317e-06, + "loss": 0.7967, + "step": 2215 + }, + { + "epoch": 0.18, + "grad_norm": 5.029573106170355, + "learning_rate": 9.42190793444793e-06, + "loss": 0.7446, + "step": 2216 + }, + { + "epoch": 0.18, + "grad_norm": 3.6301152354638875, + "learning_rate": 9.421293820508817e-06, + "loss": 0.6335, + "step": 2217 + }, + { + "epoch": 0.18, + "grad_norm": 11.412441280826037, + "learning_rate": 9.420679400588334e-06, + "loss": 0.8085, + "step": 2218 + }, + { + "epoch": 0.18, + "grad_norm": 5.591515482115656, + "learning_rate": 9.420064674729002e-06, + "loss": 0.6424, + "step": 2219 + }, + { + "epoch": 0.18, + "grad_norm": 4.97081935293266, + "learning_rate": 9.419449642973361e-06, + "loss": 0.8159, + "step": 2220 + }, + { + "epoch": 0.18, + "grad_norm": 4.254327215298947, + "learning_rate": 9.41883430536398e-06, + "loss": 0.7236, + "step": 2221 + }, + { + "epoch": 0.18, + "grad_norm": 4.3356343693922055, + "learning_rate": 9.41821866194344e-06, + "loss": 0.7456, + "step": 2222 + }, + { + "epoch": 0.18, + "grad_norm": 5.555589072833969, + "learning_rate": 9.41760271275435e-06, + "loss": 0.6993, + "step": 2223 + }, + { + "epoch": 0.18, + "grad_norm": 4.34791617128502, + "learning_rate": 9.416986457839336e-06, + "loss": 0.6812, + "step": 2224 + }, + { + "epoch": 0.18, + "grad_norm": 5.164799230470531, + "learning_rate": 9.41636989724105e-06, + "loss": 0.6288, + "step": 2225 + }, + { + "epoch": 0.18, + "grad_norm": 5.775550342921115, + "learning_rate": 9.415753031002157e-06, + "loss": 0.4857, + "step": 2226 + }, + { + "epoch": 0.18, + "grad_norm": 5.5456612743322875, + "learning_rate": 9.415135859165349e-06, + "loss": 0.8171, + "step": 2227 + }, + { + "epoch": 0.18, + "grad_norm": 3.8019097332372764, + "learning_rate": 9.414518381773342e-06, + "loss": 0.6382, + "step": 2228 + }, + { + "epoch": 0.18, + "grad_norm": 8.500468757993067, + "learning_rate": 9.413900598868867e-06, + "loss": 0.9072, + "step": 2229 + }, + { + "epoch": 0.18, + "grad_norm": 4.636342966302793, + "learning_rate": 9.413282510494676e-06, + "loss": 0.8352, + "step": 2230 + }, + { + "epoch": 0.18, + "grad_norm": 3.7141200791250024, + "learning_rate": 9.41266411669355e-06, + "loss": 0.6337, + "step": 2231 + }, + { + "epoch": 0.18, + "grad_norm": 4.316702394877184, + "learning_rate": 9.412045417508281e-06, + "loss": 0.8037, + "step": 2232 + }, + { + "epoch": 0.18, + "grad_norm": 7.034867282733218, + "learning_rate": 9.411426412981688e-06, + "loss": 0.6919, + "step": 2233 + }, + { + "epoch": 0.18, + "grad_norm": 3.4363398924341304, + "learning_rate": 9.410807103156611e-06, + "loss": 0.5706, + "step": 2234 + }, + { + "epoch": 0.18, + "grad_norm": 3.989275170502083, + "learning_rate": 9.410187488075912e-06, + "loss": 0.5811, + "step": 2235 + }, + { + "epoch": 0.18, + "grad_norm": 5.685331390481626, + "learning_rate": 9.409567567782466e-06, + "loss": 0.743, + "step": 2236 + }, + { + "epoch": 0.18, + "grad_norm": 3.9990264377885496, + "learning_rate": 9.408947342319183e-06, + "loss": 0.7371, + "step": 2237 + }, + { + "epoch": 0.18, + "grad_norm": 5.526359854449125, + "learning_rate": 9.408326811728982e-06, + "loss": 0.8689, + "step": 2238 + }, + { + "epoch": 0.18, + "grad_norm": 2.993134258706843, + "learning_rate": 9.407705976054808e-06, + "loss": 0.6858, + "step": 2239 + }, + { + "epoch": 0.18, + "grad_norm": 6.724796790304231, + "learning_rate": 9.407084835339627e-06, + "loss": 0.8478, + "step": 2240 + }, + { + "epoch": 0.18, + "grad_norm": 4.4900506903132635, + "learning_rate": 9.406463389626425e-06, + "loss": 0.8297, + "step": 2241 + }, + { + "epoch": 0.18, + "grad_norm": 3.280449139976349, + "learning_rate": 9.405841638958212e-06, + "loss": 0.8071, + "step": 2242 + }, + { + "epoch": 0.18, + "grad_norm": 12.834778059369317, + "learning_rate": 9.405219583378018e-06, + "loss": 0.8655, + "step": 2243 + }, + { + "epoch": 0.18, + "grad_norm": 4.373236267679056, + "learning_rate": 9.40459722292889e-06, + "loss": 0.6861, + "step": 2244 + }, + { + "epoch": 0.18, + "grad_norm": 4.239570273055634, + "learning_rate": 9.4039745576539e-06, + "loss": 0.5999, + "step": 2245 + }, + { + "epoch": 0.18, + "grad_norm": 8.546177899064674, + "learning_rate": 9.40335158759614e-06, + "loss": 0.7862, + "step": 2246 + }, + { + "epoch": 0.18, + "grad_norm": 2.779135081755068, + "learning_rate": 9.402728312798726e-06, + "loss": 0.8104, + "step": 2247 + }, + { + "epoch": 0.18, + "grad_norm": 4.803584755754255, + "learning_rate": 9.402104733304792e-06, + "loss": 0.6223, + "step": 2248 + }, + { + "epoch": 0.18, + "grad_norm": 4.413725450716245, + "learning_rate": 9.401480849157489e-06, + "loss": 0.7495, + "step": 2249 + }, + { + "epoch": 0.18, + "grad_norm": 5.3182307049865765, + "learning_rate": 9.4008566604e-06, + "loss": 0.5752, + "step": 2250 + }, + { + "epoch": 0.18, + "grad_norm": 10.231812398070327, + "learning_rate": 9.400232167075519e-06, + "loss": 0.7582, + "step": 2251 + }, + { + "epoch": 0.18, + "grad_norm": 16.15212034972755, + "learning_rate": 9.399607369227265e-06, + "loss": 0.9528, + "step": 2252 + }, + { + "epoch": 0.18, + "grad_norm": 3.5790077520491015, + "learning_rate": 9.398982266898481e-06, + "loss": 0.7943, + "step": 2253 + }, + { + "epoch": 0.18, + "grad_norm": 3.9149732075112587, + "learning_rate": 9.398356860132425e-06, + "loss": 0.7267, + "step": 2254 + }, + { + "epoch": 0.18, + "grad_norm": 4.076918350740101, + "learning_rate": 9.39773114897238e-06, + "loss": 0.753, + "step": 2255 + }, + { + "epoch": 0.18, + "grad_norm": 4.7194783380100125, + "learning_rate": 9.397105133461647e-06, + "loss": 0.5195, + "step": 2256 + }, + { + "epoch": 0.18, + "grad_norm": 13.315535602071407, + "learning_rate": 9.396478813643554e-06, + "loss": 0.9935, + "step": 2257 + }, + { + "epoch": 0.18, + "grad_norm": 10.264093427957008, + "learning_rate": 9.395852189561445e-06, + "loss": 0.7321, + "step": 2258 + }, + { + "epoch": 0.18, + "grad_norm": 4.966416020563409, + "learning_rate": 9.395225261258686e-06, + "loss": 0.7473, + "step": 2259 + }, + { + "epoch": 0.18, + "grad_norm": 7.9751275833382085, + "learning_rate": 9.394598028778664e-06, + "loss": 0.7364, + "step": 2260 + }, + { + "epoch": 0.18, + "grad_norm": 6.8490424967201005, + "learning_rate": 9.393970492164787e-06, + "loss": 0.8506, + "step": 2261 + }, + { + "epoch": 0.18, + "grad_norm": 7.222531318099842, + "learning_rate": 9.393342651460487e-06, + "loss": 0.5739, + "step": 2262 + }, + { + "epoch": 0.18, + "grad_norm": 5.5657210446969625, + "learning_rate": 9.392714506709211e-06, + "loss": 0.7827, + "step": 2263 + }, + { + "epoch": 0.18, + "grad_norm": 3.836570039104997, + "learning_rate": 9.392086057954432e-06, + "loss": 0.7109, + "step": 2264 + }, + { + "epoch": 0.18, + "grad_norm": 5.6814510069416, + "learning_rate": 9.391457305239644e-06, + "loss": 0.8507, + "step": 2265 + }, + { + "epoch": 0.18, + "grad_norm": 8.854628491932893, + "learning_rate": 9.39082824860836e-06, + "loss": 0.6068, + "step": 2266 + }, + { + "epoch": 0.18, + "grad_norm": 10.040644252419083, + "learning_rate": 9.390198888104113e-06, + "loss": 0.8004, + "step": 2267 + }, + { + "epoch": 0.18, + "grad_norm": 27.865214705251393, + "learning_rate": 9.389569223770461e-06, + "loss": 0.8255, + "step": 2268 + }, + { + "epoch": 0.18, + "grad_norm": 3.579127353507633, + "learning_rate": 9.388939255650978e-06, + "loss": 0.8223, + "step": 2269 + }, + { + "epoch": 0.18, + "grad_norm": 3.6277253920021946, + "learning_rate": 9.388308983789264e-06, + "loss": 0.7211, + "step": 2270 + }, + { + "epoch": 0.18, + "grad_norm": 6.205500060959045, + "learning_rate": 9.38767840822894e-06, + "loss": 0.6732, + "step": 2271 + }, + { + "epoch": 0.18, + "grad_norm": 7.000987755892481, + "learning_rate": 9.38704752901364e-06, + "loss": 0.6675, + "step": 2272 + }, + { + "epoch": 0.18, + "grad_norm": 4.250483458646069, + "learning_rate": 9.38641634618703e-06, + "loss": 0.7621, + "step": 2273 + }, + { + "epoch": 0.18, + "grad_norm": 6.938613553879345, + "learning_rate": 9.385784859792787e-06, + "loss": 0.931, + "step": 2274 + }, + { + "epoch": 0.18, + "grad_norm": 18.515364405854417, + "learning_rate": 9.38515306987462e-06, + "loss": 0.4976, + "step": 2275 + }, + { + "epoch": 0.18, + "grad_norm": 4.723782511167325, + "learning_rate": 9.384520976476246e-06, + "loss": 0.7536, + "step": 2276 + }, + { + "epoch": 0.18, + "grad_norm": 4.953722635888628, + "learning_rate": 9.383888579641414e-06, + "loss": 0.7423, + "step": 2277 + }, + { + "epoch": 0.19, + "grad_norm": 4.231940019795666, + "learning_rate": 9.383255879413891e-06, + "loss": 0.7268, + "step": 2278 + }, + { + "epoch": 0.19, + "grad_norm": 4.276197776391912, + "learning_rate": 9.382622875837459e-06, + "loss": 1.0088, + "step": 2279 + }, + { + "epoch": 0.19, + "grad_norm": 6.063738292759658, + "learning_rate": 9.381989568955931e-06, + "loss": 0.8127, + "step": 2280 + }, + { + "epoch": 0.19, + "grad_norm": 6.145934266294392, + "learning_rate": 9.381355958813132e-06, + "loss": 0.6489, + "step": 2281 + }, + { + "epoch": 0.19, + "grad_norm": 4.153770353330247, + "learning_rate": 9.380722045452915e-06, + "loss": 0.6746, + "step": 2282 + }, + { + "epoch": 0.19, + "grad_norm": 6.36240727890106, + "learning_rate": 9.380087828919149e-06, + "loss": 0.7459, + "step": 2283 + }, + { + "epoch": 0.19, + "grad_norm": 6.791107575161874, + "learning_rate": 9.379453309255726e-06, + "loss": 0.7729, + "step": 2284 + }, + { + "epoch": 0.19, + "grad_norm": 2.8117804704605573, + "learning_rate": 9.378818486506556e-06, + "loss": 0.7784, + "step": 2285 + }, + { + "epoch": 0.19, + "grad_norm": 8.461903675312902, + "learning_rate": 9.378183360715579e-06, + "loss": 0.7211, + "step": 2286 + }, + { + "epoch": 0.19, + "grad_norm": 4.238414116723706, + "learning_rate": 9.377547931926743e-06, + "loss": 0.677, + "step": 2287 + }, + { + "epoch": 0.19, + "grad_norm": 8.326222112098273, + "learning_rate": 9.376912200184029e-06, + "loss": 0.7605, + "step": 2288 + }, + { + "epoch": 0.19, + "grad_norm": 4.178287786114565, + "learning_rate": 9.37627616553143e-06, + "loss": 0.6586, + "step": 2289 + }, + { + "epoch": 0.19, + "grad_norm": 5.932124692152528, + "learning_rate": 9.375639828012965e-06, + "loss": 0.693, + "step": 2290 + }, + { + "epoch": 0.19, + "grad_norm": 3.622012361601318, + "learning_rate": 9.375003187672674e-06, + "loss": 0.6803, + "step": 2291 + }, + { + "epoch": 0.19, + "grad_norm": 6.322313501361187, + "learning_rate": 9.374366244554614e-06, + "loss": 0.8546, + "step": 2292 + }, + { + "epoch": 0.19, + "grad_norm": 3.822640361895152, + "learning_rate": 9.373728998702868e-06, + "loss": 0.9114, + "step": 2293 + }, + { + "epoch": 0.19, + "grad_norm": 4.264606666000039, + "learning_rate": 9.373091450161534e-06, + "loss": 0.7643, + "step": 2294 + }, + { + "epoch": 0.19, + "grad_norm": 7.147994751195317, + "learning_rate": 9.372453598974738e-06, + "loss": 0.7313, + "step": 2295 + }, + { + "epoch": 0.19, + "grad_norm": 12.536771734088504, + "learning_rate": 9.371815445186622e-06, + "loss": 0.8124, + "step": 2296 + }, + { + "epoch": 0.19, + "grad_norm": 3.852887065233964, + "learning_rate": 9.371176988841349e-06, + "loss": 0.8395, + "step": 2297 + }, + { + "epoch": 0.19, + "grad_norm": 4.606432324759747, + "learning_rate": 9.370538229983105e-06, + "loss": 0.6717, + "step": 2298 + }, + { + "epoch": 0.19, + "grad_norm": 7.120679921792463, + "learning_rate": 9.369899168656095e-06, + "loss": 0.708, + "step": 2299 + }, + { + "epoch": 0.19, + "grad_norm": 7.993921079656909, + "learning_rate": 9.36925980490455e-06, + "loss": 0.6993, + "step": 2300 + }, + { + "epoch": 0.19, + "grad_norm": 4.196791159078374, + "learning_rate": 9.368620138772715e-06, + "loss": 0.668, + "step": 2301 + }, + { + "epoch": 0.19, + "grad_norm": 3.929597684313037, + "learning_rate": 9.367980170304857e-06, + "loss": 0.7922, + "step": 2302 + }, + { + "epoch": 0.19, + "grad_norm": 3.4611831599650142, + "learning_rate": 9.36733989954527e-06, + "loss": 0.6946, + "step": 2303 + }, + { + "epoch": 0.19, + "grad_norm": 3.5608415267233555, + "learning_rate": 9.366699326538264e-06, + "loss": 0.6488, + "step": 2304 + }, + { + "epoch": 0.19, + "grad_norm": 5.109551002011297, + "learning_rate": 9.366058451328169e-06, + "loss": 0.7842, + "step": 2305 + }, + { + "epoch": 0.19, + "grad_norm": 5.191770420081686, + "learning_rate": 9.365417273959336e-06, + "loss": 0.5759, + "step": 2306 + }, + { + "epoch": 0.19, + "grad_norm": 5.441496337417363, + "learning_rate": 9.364775794476142e-06, + "loss": 0.7642, + "step": 2307 + }, + { + "epoch": 0.19, + "grad_norm": 4.844667617725312, + "learning_rate": 9.36413401292298e-06, + "loss": 0.8297, + "step": 2308 + }, + { + "epoch": 0.19, + "grad_norm": 21.017412086286843, + "learning_rate": 9.363491929344266e-06, + "loss": 0.6978, + "step": 2309 + }, + { + "epoch": 0.19, + "grad_norm": 4.383074441942804, + "learning_rate": 9.362849543784436e-06, + "loss": 0.6984, + "step": 2310 + }, + { + "epoch": 0.19, + "grad_norm": 3.8425171936933276, + "learning_rate": 9.362206856287946e-06, + "loss": 0.9348, + "step": 2311 + }, + { + "epoch": 0.19, + "grad_norm": 4.422808583565183, + "learning_rate": 9.361563866899274e-06, + "loss": 0.6869, + "step": 2312 + }, + { + "epoch": 0.19, + "grad_norm": 3.6981474985411316, + "learning_rate": 9.360920575662922e-06, + "loss": 0.6265, + "step": 2313 + }, + { + "epoch": 0.19, + "grad_norm": 3.7457272927885703, + "learning_rate": 9.360276982623405e-06, + "loss": 0.5414, + "step": 2314 + }, + { + "epoch": 0.19, + "grad_norm": 12.327323483355492, + "learning_rate": 9.359633087825268e-06, + "loss": 0.903, + "step": 2315 + }, + { + "epoch": 0.19, + "grad_norm": 3.6750288103496485, + "learning_rate": 9.35898889131307e-06, + "loss": 0.7297, + "step": 2316 + }, + { + "epoch": 0.19, + "grad_norm": 6.510314404097722, + "learning_rate": 9.358344393131395e-06, + "loss": 0.7476, + "step": 2317 + }, + { + "epoch": 0.19, + "grad_norm": 14.031610718159982, + "learning_rate": 9.357699593324846e-06, + "loss": 0.748, + "step": 2318 + }, + { + "epoch": 0.19, + "grad_norm": 5.193450483632661, + "learning_rate": 9.357054491938045e-06, + "loss": 0.6919, + "step": 2319 + }, + { + "epoch": 0.19, + "grad_norm": 4.972271873679375, + "learning_rate": 9.35640908901564e-06, + "loss": 0.7184, + "step": 2320 + }, + { + "epoch": 0.19, + "grad_norm": 4.126551476664808, + "learning_rate": 9.355763384602294e-06, + "loss": 0.7888, + "step": 2321 + }, + { + "epoch": 0.19, + "grad_norm": 3.2265443374899627, + "learning_rate": 9.355117378742698e-06, + "loss": 1.0356, + "step": 2322 + }, + { + "epoch": 0.19, + "grad_norm": 3.8368630705620177, + "learning_rate": 9.354471071481557e-06, + "loss": 0.8026, + "step": 2323 + }, + { + "epoch": 0.19, + "grad_norm": 5.01786405246856, + "learning_rate": 9.3538244628636e-06, + "loss": 0.7618, + "step": 2324 + }, + { + "epoch": 0.19, + "grad_norm": 14.700103199170401, + "learning_rate": 9.353177552933575e-06, + "loss": 0.7757, + "step": 2325 + }, + { + "epoch": 0.19, + "grad_norm": 4.044818974028292, + "learning_rate": 9.352530341736255e-06, + "loss": 0.6542, + "step": 2326 + }, + { + "epoch": 0.19, + "grad_norm": 3.9466598141175013, + "learning_rate": 9.351882829316428e-06, + "loss": 0.7744, + "step": 2327 + }, + { + "epoch": 0.19, + "grad_norm": 3.7063760438442594, + "learning_rate": 9.351235015718907e-06, + "loss": 0.6686, + "step": 2328 + }, + { + "epoch": 0.19, + "grad_norm": 11.267700730580987, + "learning_rate": 9.350586900988527e-06, + "loss": 0.6818, + "step": 2329 + }, + { + "epoch": 0.19, + "grad_norm": 4.402227070589877, + "learning_rate": 9.349938485170139e-06, + "loss": 0.7174, + "step": 2330 + }, + { + "epoch": 0.19, + "grad_norm": 6.198450140006214, + "learning_rate": 9.34928976830862e-06, + "loss": 0.6164, + "step": 2331 + }, + { + "epoch": 0.19, + "grad_norm": 4.527063590552783, + "learning_rate": 9.34864075044886e-06, + "loss": 0.8365, + "step": 2332 + }, + { + "epoch": 0.19, + "grad_norm": 17.43452740570329, + "learning_rate": 9.347991431635782e-06, + "loss": 0.6985, + "step": 2333 + }, + { + "epoch": 0.19, + "grad_norm": 3.3467749681425665, + "learning_rate": 9.347341811914319e-06, + "loss": 0.7493, + "step": 2334 + }, + { + "epoch": 0.19, + "grad_norm": 7.50519663713635, + "learning_rate": 9.34669189132943e-06, + "loss": 0.7604, + "step": 2335 + }, + { + "epoch": 0.19, + "grad_norm": 6.398172472173961, + "learning_rate": 9.346041669926092e-06, + "loss": 0.6765, + "step": 2336 + }, + { + "epoch": 0.19, + "grad_norm": 3.2056553025713703, + "learning_rate": 9.345391147749305e-06, + "loss": 0.8591, + "step": 2337 + }, + { + "epoch": 0.19, + "grad_norm": 2.9162958907759684, + "learning_rate": 9.344740324844091e-06, + "loss": 0.7318, + "step": 2338 + }, + { + "epoch": 0.19, + "grad_norm": 4.29811290595597, + "learning_rate": 9.344089201255488e-06, + "loss": 0.7979, + "step": 2339 + }, + { + "epoch": 0.19, + "grad_norm": 3.833766431633332, + "learning_rate": 9.343437777028561e-06, + "loss": 0.7323, + "step": 2340 + }, + { + "epoch": 0.19, + "grad_norm": 3.213013395758147, + "learning_rate": 9.342786052208392e-06, + "loss": 0.7295, + "step": 2341 + }, + { + "epoch": 0.19, + "grad_norm": 4.853398291706577, + "learning_rate": 9.342134026840083e-06, + "loss": 0.6861, + "step": 2342 + }, + { + "epoch": 0.19, + "grad_norm": 6.7240291158695555, + "learning_rate": 9.34148170096876e-06, + "loss": 0.6154, + "step": 2343 + }, + { + "epoch": 0.19, + "grad_norm": 2.9431677635216937, + "learning_rate": 9.340829074639566e-06, + "loss": 0.7909, + "step": 2344 + }, + { + "epoch": 0.19, + "grad_norm": 3.3251200775984358, + "learning_rate": 9.340176147897669e-06, + "loss": 0.9101, + "step": 2345 + }, + { + "epoch": 0.19, + "grad_norm": 5.8973364914007345, + "learning_rate": 9.339522920788252e-06, + "loss": 0.8606, + "step": 2346 + }, + { + "epoch": 0.19, + "grad_norm": 3.3615007147016294, + "learning_rate": 9.338869393356527e-06, + "loss": 0.727, + "step": 2347 + }, + { + "epoch": 0.19, + "grad_norm": 3.058557056916983, + "learning_rate": 9.338215565647719e-06, + "loss": 0.8776, + "step": 2348 + }, + { + "epoch": 0.19, + "grad_norm": 3.5062387986331482, + "learning_rate": 9.33756143770708e-06, + "loss": 0.6682, + "step": 2349 + }, + { + "epoch": 0.19, + "grad_norm": 6.045118009924259, + "learning_rate": 9.336907009579876e-06, + "loss": 0.7358, + "step": 2350 + }, + { + "epoch": 0.19, + "grad_norm": 3.6700071214039913, + "learning_rate": 9.336252281311401e-06, + "loss": 0.7535, + "step": 2351 + }, + { + "epoch": 0.19, + "grad_norm": 4.6999920801688235, + "learning_rate": 9.335597252946965e-06, + "loss": 0.7672, + "step": 2352 + }, + { + "epoch": 0.19, + "grad_norm": 3.64335633573572, + "learning_rate": 9.334941924531898e-06, + "loss": 0.8333, + "step": 2353 + }, + { + "epoch": 0.19, + "grad_norm": 3.573641049751428, + "learning_rate": 9.334286296111556e-06, + "loss": 0.6451, + "step": 2354 + }, + { + "epoch": 0.19, + "grad_norm": 4.409540914342568, + "learning_rate": 9.333630367731311e-06, + "loss": 0.7662, + "step": 2355 + }, + { + "epoch": 0.19, + "grad_norm": 4.967924836661743, + "learning_rate": 9.332974139436559e-06, + "loss": 0.8419, + "step": 2356 + }, + { + "epoch": 0.19, + "grad_norm": 5.096478222514124, + "learning_rate": 9.332317611272712e-06, + "loss": 0.8238, + "step": 2357 + }, + { + "epoch": 0.19, + "grad_norm": 5.857698443164866, + "learning_rate": 9.331660783285208e-06, + "loss": 0.8513, + "step": 2358 + }, + { + "epoch": 0.19, + "grad_norm": 3.2551301611971284, + "learning_rate": 9.331003655519507e-06, + "loss": 0.7557, + "step": 2359 + }, + { + "epoch": 0.19, + "grad_norm": 3.66896932854, + "learning_rate": 9.330346228021078e-06, + "loss": 0.6739, + "step": 2360 + }, + { + "epoch": 0.19, + "grad_norm": 3.690054102407496, + "learning_rate": 9.329688500835425e-06, + "loss": 0.6769, + "step": 2361 + }, + { + "epoch": 0.19, + "grad_norm": 3.6143976288606896, + "learning_rate": 9.329030474008067e-06, + "loss": 0.5682, + "step": 2362 + }, + { + "epoch": 0.19, + "grad_norm": 5.11143936678274, + "learning_rate": 9.328372147584543e-06, + "loss": 0.8142, + "step": 2363 + }, + { + "epoch": 0.19, + "grad_norm": 4.61915082587634, + "learning_rate": 9.327713521610412e-06, + "loss": 0.8337, + "step": 2364 + }, + { + "epoch": 0.19, + "grad_norm": 5.804813193443495, + "learning_rate": 9.327054596131255e-06, + "loss": 0.8011, + "step": 2365 + }, + { + "epoch": 0.19, + "grad_norm": 5.970154371428778, + "learning_rate": 9.326395371192674e-06, + "loss": 0.7136, + "step": 2366 + }, + { + "epoch": 0.19, + "grad_norm": 3.50649400309234, + "learning_rate": 9.325735846840293e-06, + "loss": 0.6342, + "step": 2367 + }, + { + "epoch": 0.19, + "grad_norm": 3.6864505087668307, + "learning_rate": 9.325076023119755e-06, + "loss": 0.811, + "step": 2368 + }, + { + "epoch": 0.19, + "grad_norm": 4.848541929943327, + "learning_rate": 9.324415900076723e-06, + "loss": 0.6728, + "step": 2369 + }, + { + "epoch": 0.19, + "grad_norm": 4.955746611523773, + "learning_rate": 9.323755477756881e-06, + "loss": 0.8638, + "step": 2370 + }, + { + "epoch": 0.19, + "grad_norm": 3.525791337165016, + "learning_rate": 9.323094756205937e-06, + "loss": 0.8577, + "step": 2371 + }, + { + "epoch": 0.19, + "grad_norm": 4.419750128267288, + "learning_rate": 9.322433735469614e-06, + "loss": 0.7429, + "step": 2372 + }, + { + "epoch": 0.19, + "grad_norm": 4.943105026606118, + "learning_rate": 9.32177241559366e-06, + "loss": 0.7331, + "step": 2373 + }, + { + "epoch": 0.19, + "grad_norm": 5.743421660517981, + "learning_rate": 9.321110796623845e-06, + "loss": 0.7454, + "step": 2374 + }, + { + "epoch": 0.19, + "grad_norm": 2.887159635521565, + "learning_rate": 9.320448878605952e-06, + "loss": 0.7013, + "step": 2375 + }, + { + "epoch": 0.19, + "grad_norm": 4.87707740256212, + "learning_rate": 9.319786661585795e-06, + "loss": 0.8355, + "step": 2376 + }, + { + "epoch": 0.19, + "grad_norm": 26.319113658153096, + "learning_rate": 9.3191241456092e-06, + "loss": 0.7375, + "step": 2377 + }, + { + "epoch": 0.19, + "grad_norm": 6.250961548257255, + "learning_rate": 9.318461330722018e-06, + "loss": 0.8943, + "step": 2378 + }, + { + "epoch": 0.19, + "grad_norm": 4.328251845432506, + "learning_rate": 9.317798216970122e-06, + "loss": 0.6079, + "step": 2379 + }, + { + "epoch": 0.19, + "grad_norm": 11.980780426581918, + "learning_rate": 9.317134804399401e-06, + "loss": 0.7604, + "step": 2380 + }, + { + "epoch": 0.19, + "grad_norm": 4.0360965946343725, + "learning_rate": 9.31647109305577e-06, + "loss": 0.7548, + "step": 2381 + }, + { + "epoch": 0.19, + "grad_norm": 7.416253015904599, + "learning_rate": 9.31580708298516e-06, + "loss": 0.6372, + "step": 2382 + }, + { + "epoch": 0.19, + "grad_norm": 5.934474168732798, + "learning_rate": 9.315142774233526e-06, + "loss": 0.5775, + "step": 2383 + }, + { + "epoch": 0.19, + "grad_norm": 9.900341195987588, + "learning_rate": 9.31447816684684e-06, + "loss": 0.7505, + "step": 2384 + }, + { + "epoch": 0.19, + "grad_norm": 5.888032546218741, + "learning_rate": 9.3138132608711e-06, + "loss": 0.8276, + "step": 2385 + }, + { + "epoch": 0.19, + "grad_norm": 4.728322042089633, + "learning_rate": 9.313148056352321e-06, + "loss": 0.606, + "step": 2386 + }, + { + "epoch": 0.19, + "grad_norm": 7.680449818304649, + "learning_rate": 9.312482553336538e-06, + "loss": 0.6513, + "step": 2387 + }, + { + "epoch": 0.19, + "grad_norm": 8.3042396282667, + "learning_rate": 9.311816751869809e-06, + "loss": 0.7596, + "step": 2388 + }, + { + "epoch": 0.19, + "grad_norm": 11.012809700729955, + "learning_rate": 9.31115065199821e-06, + "loss": 0.6874, + "step": 2389 + }, + { + "epoch": 0.19, + "grad_norm": 3.82375398087193, + "learning_rate": 9.310484253767842e-06, + "loss": 0.843, + "step": 2390 + }, + { + "epoch": 0.19, + "grad_norm": 10.035565717110465, + "learning_rate": 9.309817557224822e-06, + "loss": 0.7894, + "step": 2391 + }, + { + "epoch": 0.19, + "grad_norm": 2.1176819332320185, + "learning_rate": 9.30915056241529e-06, + "loss": 0.7811, + "step": 2392 + }, + { + "epoch": 0.19, + "grad_norm": 6.8857717448143765, + "learning_rate": 9.308483269385406e-06, + "loss": 0.6032, + "step": 2393 + }, + { + "epoch": 0.19, + "grad_norm": 2.874008031592198, + "learning_rate": 9.307815678181353e-06, + "loss": 0.6924, + "step": 2394 + }, + { + "epoch": 0.19, + "grad_norm": 10.463015575706057, + "learning_rate": 9.307147788849329e-06, + "loss": 0.5474, + "step": 2395 + }, + { + "epoch": 0.19, + "grad_norm": 4.687493708477168, + "learning_rate": 9.306479601435559e-06, + "loss": 0.6662, + "step": 2396 + }, + { + "epoch": 0.19, + "grad_norm": 4.51276910928311, + "learning_rate": 9.305811115986285e-06, + "loss": 0.6058, + "step": 2397 + }, + { + "epoch": 0.19, + "grad_norm": 5.418141843764589, + "learning_rate": 9.30514233254777e-06, + "loss": 0.8429, + "step": 2398 + }, + { + "epoch": 0.19, + "grad_norm": 8.089715281313294, + "learning_rate": 9.304473251166297e-06, + "loss": 0.713, + "step": 2399 + }, + { + "epoch": 0.19, + "grad_norm": 9.17760105151901, + "learning_rate": 9.303803871888172e-06, + "loss": 0.7263, + "step": 2400 + }, + { + "epoch": 0.2, + "grad_norm": 3.069840671597759, + "learning_rate": 9.303134194759723e-06, + "loss": 0.6533, + "step": 2401 + }, + { + "epoch": 0.2, + "grad_norm": 14.548532164484056, + "learning_rate": 9.302464219827289e-06, + "loss": 0.7322, + "step": 2402 + }, + { + "epoch": 0.2, + "grad_norm": 4.66309000674165, + "learning_rate": 9.301793947137241e-06, + "loss": 0.7569, + "step": 2403 + }, + { + "epoch": 0.2, + "grad_norm": 9.068422879052775, + "learning_rate": 9.301123376735968e-06, + "loss": 0.7603, + "step": 2404 + }, + { + "epoch": 0.2, + "grad_norm": 5.316122393629453, + "learning_rate": 9.300452508669872e-06, + "loss": 0.5841, + "step": 2405 + }, + { + "epoch": 0.2, + "grad_norm": 5.608212652754311, + "learning_rate": 9.299781342985387e-06, + "loss": 0.6299, + "step": 2406 + }, + { + "epoch": 0.2, + "grad_norm": 12.592680835420376, + "learning_rate": 9.29910987972896e-06, + "loss": 0.7323, + "step": 2407 + }, + { + "epoch": 0.2, + "grad_norm": 8.70180704022712, + "learning_rate": 9.298438118947058e-06, + "loss": 0.8467, + "step": 2408 + }, + { + "epoch": 0.2, + "grad_norm": 4.271549885425811, + "learning_rate": 9.297766060686173e-06, + "loss": 0.6919, + "step": 2409 + }, + { + "epoch": 0.2, + "grad_norm": 3.394635097810448, + "learning_rate": 9.297093704992817e-06, + "loss": 0.6244, + "step": 2410 + }, + { + "epoch": 0.2, + "grad_norm": 8.205242624225207, + "learning_rate": 9.296421051913518e-06, + "loss": 0.7436, + "step": 2411 + }, + { + "epoch": 0.2, + "grad_norm": 11.830884593444653, + "learning_rate": 9.295748101494831e-06, + "loss": 0.6126, + "step": 2412 + }, + { + "epoch": 0.2, + "grad_norm": 4.678889404361456, + "learning_rate": 9.295074853783328e-06, + "loss": 0.7838, + "step": 2413 + }, + { + "epoch": 0.2, + "grad_norm": 5.215608165467666, + "learning_rate": 9.2944013088256e-06, + "loss": 0.7382, + "step": 2414 + }, + { + "epoch": 0.2, + "grad_norm": 22.05641581643632, + "learning_rate": 9.293727466668262e-06, + "loss": 0.7399, + "step": 2415 + }, + { + "epoch": 0.2, + "grad_norm": 3.2455972046859527, + "learning_rate": 9.293053327357947e-06, + "loss": 0.6739, + "step": 2416 + }, + { + "epoch": 0.2, + "grad_norm": 3.482274017575485, + "learning_rate": 9.29237889094131e-06, + "loss": 0.7395, + "step": 2417 + }, + { + "epoch": 0.2, + "grad_norm": 6.62447564070319, + "learning_rate": 9.291704157465026e-06, + "loss": 0.6913, + "step": 2418 + }, + { + "epoch": 0.2, + "grad_norm": 5.754532711577562, + "learning_rate": 9.291029126975794e-06, + "loss": 0.8103, + "step": 2419 + }, + { + "epoch": 0.2, + "grad_norm": 3.6977649972780298, + "learning_rate": 9.290353799520328e-06, + "loss": 0.7778, + "step": 2420 + }, + { + "epoch": 0.2, + "grad_norm": 5.445637231729394, + "learning_rate": 9.289678175145363e-06, + "loss": 0.6751, + "step": 2421 + }, + { + "epoch": 0.2, + "grad_norm": 11.750413892119365, + "learning_rate": 9.28900225389766e-06, + "loss": 0.7406, + "step": 2422 + }, + { + "epoch": 0.2, + "grad_norm": 4.44653728905703, + "learning_rate": 9.288326035823993e-06, + "loss": 0.7168, + "step": 2423 + }, + { + "epoch": 0.2, + "grad_norm": 4.215463719017191, + "learning_rate": 9.287649520971165e-06, + "loss": 0.7707, + "step": 2424 + }, + { + "epoch": 0.2, + "grad_norm": 5.330572591632841, + "learning_rate": 9.286972709385991e-06, + "loss": 0.6941, + "step": 2425 + }, + { + "epoch": 0.2, + "grad_norm": 8.29775919968905, + "learning_rate": 9.286295601115314e-06, + "loss": 0.6932, + "step": 2426 + }, + { + "epoch": 0.2, + "grad_norm": 4.046667143945288, + "learning_rate": 9.285618196205993e-06, + "loss": 0.9281, + "step": 2427 + }, + { + "epoch": 0.2, + "grad_norm": 4.663080138856968, + "learning_rate": 9.284940494704906e-06, + "loss": 0.6811, + "step": 2428 + }, + { + "epoch": 0.2, + "grad_norm": 5.721644979362868, + "learning_rate": 9.284262496658957e-06, + "loss": 0.8363, + "step": 2429 + }, + { + "epoch": 0.2, + "grad_norm": 2.9731674669583628, + "learning_rate": 9.283584202115068e-06, + "loss": 0.649, + "step": 2430 + }, + { + "epoch": 0.2, + "grad_norm": 4.899695102469641, + "learning_rate": 9.282905611120181e-06, + "loss": 0.7552, + "step": 2431 + }, + { + "epoch": 0.2, + "grad_norm": 4.593115129967294, + "learning_rate": 9.282226723721259e-06, + "loss": 0.7794, + "step": 2432 + }, + { + "epoch": 0.2, + "grad_norm": 3.3414789061934025, + "learning_rate": 9.281547539965284e-06, + "loss": 0.5234, + "step": 2433 + }, + { + "epoch": 0.2, + "grad_norm": 6.518797846484355, + "learning_rate": 9.28086805989926e-06, + "loss": 0.7435, + "step": 2434 + }, + { + "epoch": 0.2, + "grad_norm": 5.703012747917296, + "learning_rate": 9.28018828357021e-06, + "loss": 0.6855, + "step": 2435 + }, + { + "epoch": 0.2, + "grad_norm": 14.777940053968829, + "learning_rate": 9.279508211025182e-06, + "loss": 0.6449, + "step": 2436 + }, + { + "epoch": 0.2, + "grad_norm": 3.3756434709678023, + "learning_rate": 9.27882784231124e-06, + "loss": 0.776, + "step": 2437 + }, + { + "epoch": 0.2, + "grad_norm": 3.145525975191203, + "learning_rate": 9.27814717747547e-06, + "loss": 0.7516, + "step": 2438 + }, + { + "epoch": 0.2, + "grad_norm": 2.8201143451161568, + "learning_rate": 9.277466216564977e-06, + "loss": 0.7306, + "step": 2439 + }, + { + "epoch": 0.2, + "grad_norm": 4.63624619681657, + "learning_rate": 9.276784959626889e-06, + "loss": 0.7089, + "step": 2440 + }, + { + "epoch": 0.2, + "grad_norm": 5.969777072079392, + "learning_rate": 9.276103406708354e-06, + "loss": 0.804, + "step": 2441 + }, + { + "epoch": 0.2, + "grad_norm": 5.0156245023661805, + "learning_rate": 9.275421557856536e-06, + "loss": 0.7404, + "step": 2442 + }, + { + "epoch": 0.2, + "grad_norm": 3.9233748296140334, + "learning_rate": 9.274739413118629e-06, + "loss": 0.7455, + "step": 2443 + }, + { + "epoch": 0.2, + "grad_norm": 6.286216557644056, + "learning_rate": 9.274056972541837e-06, + "loss": 0.6616, + "step": 2444 + }, + { + "epoch": 0.2, + "grad_norm": 5.201183671289719, + "learning_rate": 9.273374236173391e-06, + "loss": 0.9272, + "step": 2445 + }, + { + "epoch": 0.2, + "grad_norm": 10.094561464137378, + "learning_rate": 9.27269120406054e-06, + "loss": 0.6327, + "step": 2446 + }, + { + "epoch": 0.2, + "grad_norm": 3.5616417022652325, + "learning_rate": 9.272007876250555e-06, + "loss": 0.7214, + "step": 2447 + }, + { + "epoch": 0.2, + "grad_norm": 5.097648215835642, + "learning_rate": 9.271324252790725e-06, + "loss": 0.6461, + "step": 2448 + }, + { + "epoch": 0.2, + "grad_norm": 3.754695935863431, + "learning_rate": 9.270640333728364e-06, + "loss": 0.7626, + "step": 2449 + }, + { + "epoch": 0.2, + "grad_norm": 7.320374059189283, + "learning_rate": 9.269956119110802e-06, + "loss": 0.6931, + "step": 2450 + }, + { + "epoch": 0.2, + "grad_norm": 2.7245871458377335, + "learning_rate": 9.269271608985391e-06, + "loss": 0.7529, + "step": 2451 + }, + { + "epoch": 0.2, + "grad_norm": 7.652491304119991, + "learning_rate": 9.268586803399502e-06, + "loss": 0.835, + "step": 2452 + }, + { + "epoch": 0.2, + "grad_norm": 3.5167850286565376, + "learning_rate": 9.267901702400527e-06, + "loss": 0.5729, + "step": 2453 + }, + { + "epoch": 0.2, + "grad_norm": 4.81871743740869, + "learning_rate": 9.267216306035884e-06, + "loss": 0.6267, + "step": 2454 + }, + { + "epoch": 0.2, + "grad_norm": 25.41575022264047, + "learning_rate": 9.266530614353004e-06, + "loss": 0.6244, + "step": 2455 + }, + { + "epoch": 0.2, + "grad_norm": 5.882065999052178, + "learning_rate": 9.26584462739934e-06, + "loss": 0.7517, + "step": 2456 + }, + { + "epoch": 0.2, + "grad_norm": 2.7537123263085883, + "learning_rate": 9.265158345222368e-06, + "loss": 0.677, + "step": 2457 + }, + { + "epoch": 0.2, + "grad_norm": 3.704220380670085, + "learning_rate": 9.264471767869583e-06, + "loss": 0.7541, + "step": 2458 + }, + { + "epoch": 0.2, + "grad_norm": 4.882102013109601, + "learning_rate": 9.263784895388502e-06, + "loss": 0.6696, + "step": 2459 + }, + { + "epoch": 0.2, + "grad_norm": 6.053350155401451, + "learning_rate": 9.263097727826656e-06, + "loss": 0.7681, + "step": 2460 + }, + { + "epoch": 0.2, + "grad_norm": 4.158498158700847, + "learning_rate": 9.262410265231607e-06, + "loss": 0.8663, + "step": 2461 + }, + { + "epoch": 0.2, + "grad_norm": 6.428038778547417, + "learning_rate": 9.261722507650928e-06, + "loss": 0.7338, + "step": 2462 + }, + { + "epoch": 0.2, + "grad_norm": 6.203353385480414, + "learning_rate": 9.261034455132217e-06, + "loss": 0.6559, + "step": 2463 + }, + { + "epoch": 0.2, + "grad_norm": 5.0245193673378665, + "learning_rate": 9.260346107723093e-06, + "loss": 0.9303, + "step": 2464 + }, + { + "epoch": 0.2, + "grad_norm": 7.517011180786985, + "learning_rate": 9.259657465471194e-06, + "loss": 0.8187, + "step": 2465 + }, + { + "epoch": 0.2, + "grad_norm": 4.8162816710073395, + "learning_rate": 9.258968528424175e-06, + "loss": 0.6355, + "step": 2466 + }, + { + "epoch": 0.2, + "grad_norm": 4.848319058960849, + "learning_rate": 9.25827929662972e-06, + "loss": 0.8751, + "step": 2467 + }, + { + "epoch": 0.2, + "grad_norm": 5.925785039864059, + "learning_rate": 9.257589770135523e-06, + "loss": 0.5809, + "step": 2468 + }, + { + "epoch": 0.2, + "grad_norm": 5.167450948106605, + "learning_rate": 9.256899948989307e-06, + "loss": 0.7539, + "step": 2469 + }, + { + "epoch": 0.2, + "grad_norm": 10.592110715898121, + "learning_rate": 9.25620983323881e-06, + "loss": 0.6947, + "step": 2470 + }, + { + "epoch": 0.2, + "grad_norm": 8.771202071923351, + "learning_rate": 9.255519422931794e-06, + "loss": 0.7642, + "step": 2471 + }, + { + "epoch": 0.2, + "grad_norm": 4.579273850723875, + "learning_rate": 9.254828718116039e-06, + "loss": 0.8416, + "step": 2472 + }, + { + "epoch": 0.2, + "grad_norm": 7.206878900209931, + "learning_rate": 9.254137718839345e-06, + "loss": 0.6051, + "step": 2473 + }, + { + "epoch": 0.2, + "grad_norm": 8.490069815113932, + "learning_rate": 9.253446425149536e-06, + "loss": 0.7622, + "step": 2474 + }, + { + "epoch": 0.2, + "grad_norm": 4.593174587959045, + "learning_rate": 9.252754837094452e-06, + "loss": 0.7506, + "step": 2475 + }, + { + "epoch": 0.2, + "grad_norm": 3.6670161066786338, + "learning_rate": 9.252062954721955e-06, + "loss": 0.6759, + "step": 2476 + }, + { + "epoch": 0.2, + "grad_norm": 5.109094261245794, + "learning_rate": 9.251370778079929e-06, + "loss": 0.647, + "step": 2477 + }, + { + "epoch": 0.2, + "grad_norm": 5.667414142519284, + "learning_rate": 9.250678307216276e-06, + "loss": 0.7136, + "step": 2478 + }, + { + "epoch": 0.2, + "grad_norm": 8.468549003494472, + "learning_rate": 9.24998554217892e-06, + "loss": 0.7176, + "step": 2479 + }, + { + "epoch": 0.2, + "grad_norm": 6.332944214897854, + "learning_rate": 9.249292483015804e-06, + "loss": 0.8866, + "step": 2480 + }, + { + "epoch": 0.2, + "grad_norm": 5.194460579570262, + "learning_rate": 9.248599129774894e-06, + "loss": 0.6979, + "step": 2481 + }, + { + "epoch": 0.2, + "grad_norm": 3.5378491140522508, + "learning_rate": 9.247905482504172e-06, + "loss": 0.7276, + "step": 2482 + }, + { + "epoch": 0.2, + "grad_norm": 4.953606223020982, + "learning_rate": 9.247211541251641e-06, + "loss": 0.6265, + "step": 2483 + }, + { + "epoch": 0.2, + "grad_norm": 4.752546980929678, + "learning_rate": 9.246517306065332e-06, + "loss": 0.7611, + "step": 2484 + }, + { + "epoch": 0.2, + "grad_norm": 3.5243821889165905, + "learning_rate": 9.245822776993286e-06, + "loss": 0.6616, + "step": 2485 + }, + { + "epoch": 0.2, + "grad_norm": 5.381215631396902, + "learning_rate": 9.245127954083571e-06, + "loss": 0.648, + "step": 2486 + }, + { + "epoch": 0.2, + "grad_norm": 13.793619643426357, + "learning_rate": 9.24443283738427e-06, + "loss": 0.714, + "step": 2487 + }, + { + "epoch": 0.2, + "grad_norm": 7.876424720391902, + "learning_rate": 9.243737426943492e-06, + "loss": 0.7141, + "step": 2488 + }, + { + "epoch": 0.2, + "grad_norm": 3.5833129321416726, + "learning_rate": 9.243041722809363e-06, + "loss": 0.677, + "step": 2489 + }, + { + "epoch": 0.2, + "grad_norm": 3.7272067742654147, + "learning_rate": 9.242345725030033e-06, + "loss": 0.7157, + "step": 2490 + }, + { + "epoch": 0.2, + "grad_norm": 4.028858578064078, + "learning_rate": 9.241649433653663e-06, + "loss": 0.6102, + "step": 2491 + }, + { + "epoch": 0.2, + "grad_norm": 4.166557613872918, + "learning_rate": 9.240952848728447e-06, + "loss": 0.6071, + "step": 2492 + }, + { + "epoch": 0.2, + "grad_norm": 4.578049889394283, + "learning_rate": 9.24025597030259e-06, + "loss": 0.9905, + "step": 2493 + }, + { + "epoch": 0.2, + "grad_norm": 5.4750457881306716, + "learning_rate": 9.239558798424322e-06, + "loss": 0.5054, + "step": 2494 + }, + { + "epoch": 0.2, + "grad_norm": 8.503202954229025, + "learning_rate": 9.238861333141889e-06, + "loss": 0.6401, + "step": 2495 + }, + { + "epoch": 0.2, + "grad_norm": 4.0566150011204725, + "learning_rate": 9.238163574503562e-06, + "loss": 0.7722, + "step": 2496 + }, + { + "epoch": 0.2, + "grad_norm": 3.94304659018854, + "learning_rate": 9.23746552255763e-06, + "loss": 0.8436, + "step": 2497 + }, + { + "epoch": 0.2, + "grad_norm": 3.22190650411511, + "learning_rate": 9.236767177352403e-06, + "loss": 0.7536, + "step": 2498 + }, + { + "epoch": 0.2, + "grad_norm": 3.560225109749607, + "learning_rate": 9.23606853893621e-06, + "loss": 0.8405, + "step": 2499 + }, + { + "epoch": 0.2, + "grad_norm": 3.6273205491560927, + "learning_rate": 9.235369607357402e-06, + "loss": 0.7262, + "step": 2500 + }, + { + "epoch": 0.2, + "grad_norm": 3.7894401395286414, + "learning_rate": 9.23467038266435e-06, + "loss": 0.6985, + "step": 2501 + }, + { + "epoch": 0.2, + "grad_norm": 4.018240683004062, + "learning_rate": 9.233970864905444e-06, + "loss": 0.6976, + "step": 2502 + }, + { + "epoch": 0.2, + "grad_norm": 6.070353867016748, + "learning_rate": 9.233271054129092e-06, + "loss": 0.7613, + "step": 2503 + }, + { + "epoch": 0.2, + "grad_norm": 4.267060739887431, + "learning_rate": 9.23257095038373e-06, + "loss": 0.6894, + "step": 2504 + }, + { + "epoch": 0.2, + "grad_norm": 5.434883757957719, + "learning_rate": 9.231870553717808e-06, + "loss": 0.904, + "step": 2505 + }, + { + "epoch": 0.2, + "grad_norm": 2.941550034736005, + "learning_rate": 9.231169864179797e-06, + "loss": 0.6429, + "step": 2506 + }, + { + "epoch": 0.2, + "grad_norm": 18.317090624838876, + "learning_rate": 9.230468881818192e-06, + "loss": 0.6335, + "step": 2507 + }, + { + "epoch": 0.2, + "grad_norm": 14.634152352091174, + "learning_rate": 9.2297676066815e-06, + "loss": 0.7564, + "step": 2508 + }, + { + "epoch": 0.2, + "grad_norm": 4.262918747511421, + "learning_rate": 9.229066038818258e-06, + "loss": 0.7803, + "step": 2509 + }, + { + "epoch": 0.2, + "grad_norm": 3.511425031951057, + "learning_rate": 9.228364178277018e-06, + "loss": 0.6934, + "step": 2510 + }, + { + "epoch": 0.2, + "grad_norm": 4.287734182747469, + "learning_rate": 9.227662025106352e-06, + "loss": 0.7392, + "step": 2511 + }, + { + "epoch": 0.2, + "grad_norm": 7.901303849645927, + "learning_rate": 9.226959579354855e-06, + "loss": 0.9464, + "step": 2512 + }, + { + "epoch": 0.2, + "grad_norm": 11.554448610093775, + "learning_rate": 9.22625684107114e-06, + "loss": 0.7764, + "step": 2513 + }, + { + "epoch": 0.2, + "grad_norm": 3.4052494621608873, + "learning_rate": 9.22555381030384e-06, + "loss": 0.6834, + "step": 2514 + }, + { + "epoch": 0.2, + "grad_norm": 8.933844726491499, + "learning_rate": 9.224850487101611e-06, + "loss": 0.6246, + "step": 2515 + }, + { + "epoch": 0.2, + "grad_norm": 4.514531331201028, + "learning_rate": 9.224146871513127e-06, + "loss": 0.7275, + "step": 2516 + }, + { + "epoch": 0.2, + "grad_norm": 2.7734713986543102, + "learning_rate": 9.223442963587082e-06, + "loss": 0.6056, + "step": 2517 + }, + { + "epoch": 0.2, + "grad_norm": 10.545427676606385, + "learning_rate": 9.222738763372189e-06, + "loss": 0.6675, + "step": 2518 + }, + { + "epoch": 0.2, + "grad_norm": 19.33672342398612, + "learning_rate": 9.222034270917187e-06, + "loss": 0.8545, + "step": 2519 + }, + { + "epoch": 0.2, + "grad_norm": 3.936441722365738, + "learning_rate": 9.221329486270827e-06, + "loss": 0.7063, + "step": 2520 + }, + { + "epoch": 0.2, + "grad_norm": 5.152952337943601, + "learning_rate": 9.220624409481888e-06, + "loss": 0.7511, + "step": 2521 + }, + { + "epoch": 0.2, + "grad_norm": 4.109248980137321, + "learning_rate": 9.219919040599165e-06, + "loss": 0.5497, + "step": 2522 + }, + { + "epoch": 0.2, + "grad_norm": 9.062705042077418, + "learning_rate": 9.219213379671474e-06, + "loss": 0.7234, + "step": 2523 + }, + { + "epoch": 0.2, + "grad_norm": 4.331730186367032, + "learning_rate": 9.218507426747651e-06, + "loss": 0.7561, + "step": 2524 + }, + { + "epoch": 0.21, + "grad_norm": 3.19338022044433, + "learning_rate": 9.21780118187655e-06, + "loss": 0.7714, + "step": 2525 + }, + { + "epoch": 0.21, + "grad_norm": 5.816456915568769, + "learning_rate": 9.217094645107052e-06, + "loss": 0.6624, + "step": 2526 + }, + { + "epoch": 0.21, + "grad_norm": 3.1932817427796785, + "learning_rate": 9.216387816488051e-06, + "loss": 0.7515, + "step": 2527 + }, + { + "epoch": 0.21, + "grad_norm": 5.767854446240626, + "learning_rate": 9.215680696068465e-06, + "loss": 0.7591, + "step": 2528 + }, + { + "epoch": 0.21, + "grad_norm": 4.268644369034617, + "learning_rate": 9.214973283897231e-06, + "loss": 0.7581, + "step": 2529 + }, + { + "epoch": 0.21, + "grad_norm": 4.1786040800356545, + "learning_rate": 9.214265580023305e-06, + "loss": 0.8489, + "step": 2530 + }, + { + "epoch": 0.21, + "grad_norm": 3.0289662252248117, + "learning_rate": 9.213557584495665e-06, + "loss": 0.9291, + "step": 2531 + }, + { + "epoch": 0.21, + "grad_norm": 3.24024098194718, + "learning_rate": 9.212849297363312e-06, + "loss": 0.7309, + "step": 2532 + }, + { + "epoch": 0.21, + "grad_norm": 2.7551136024973046, + "learning_rate": 9.212140718675257e-06, + "loss": 0.5896, + "step": 2533 + }, + { + "epoch": 0.21, + "grad_norm": 9.465515236290557, + "learning_rate": 9.211431848480545e-06, + "loss": 0.5831, + "step": 2534 + }, + { + "epoch": 0.21, + "grad_norm": 3.1350217664061297, + "learning_rate": 9.210722686828232e-06, + "loss": 0.8176, + "step": 2535 + }, + { + "epoch": 0.21, + "grad_norm": 3.060282934708333, + "learning_rate": 9.210013233767396e-06, + "loss": 0.6891, + "step": 2536 + }, + { + "epoch": 0.21, + "grad_norm": 3.190790993536294, + "learning_rate": 9.209303489347136e-06, + "loss": 0.9744, + "step": 2537 + }, + { + "epoch": 0.21, + "grad_norm": 3.9908004651289977, + "learning_rate": 9.20859345361657e-06, + "loss": 0.8672, + "step": 2538 + }, + { + "epoch": 0.21, + "grad_norm": 4.145993801746851, + "learning_rate": 9.207883126624838e-06, + "loss": 0.7271, + "step": 2539 + }, + { + "epoch": 0.21, + "grad_norm": 3.8188967255938335, + "learning_rate": 9.207172508421099e-06, + "loss": 0.6967, + "step": 2540 + }, + { + "epoch": 0.21, + "grad_norm": 5.295485117994193, + "learning_rate": 9.20646159905453e-06, + "loss": 0.7497, + "step": 2541 + }, + { + "epoch": 0.21, + "grad_norm": 8.12223447423027, + "learning_rate": 9.205750398574334e-06, + "loss": 0.8167, + "step": 2542 + }, + { + "epoch": 0.21, + "grad_norm": 4.441117416734898, + "learning_rate": 9.205038907029729e-06, + "loss": 0.7028, + "step": 2543 + }, + { + "epoch": 0.21, + "grad_norm": 43.745123805205154, + "learning_rate": 9.204327124469953e-06, + "loss": 0.7531, + "step": 2544 + }, + { + "epoch": 0.21, + "grad_norm": 3.6208033610516672, + "learning_rate": 9.203615050944269e-06, + "loss": 0.8874, + "step": 2545 + }, + { + "epoch": 0.21, + "grad_norm": 3.352843162960466, + "learning_rate": 9.202902686501954e-06, + "loss": 0.7101, + "step": 2546 + }, + { + "epoch": 0.21, + "grad_norm": 5.048818767546314, + "learning_rate": 9.20219003119231e-06, + "loss": 0.7227, + "step": 2547 + }, + { + "epoch": 0.21, + "grad_norm": 3.5273439437975993, + "learning_rate": 9.201477085064656e-06, + "loss": 0.4769, + "step": 2548 + }, + { + "epoch": 0.21, + "grad_norm": 4.2639328301832915, + "learning_rate": 9.200763848168334e-06, + "loss": 0.7139, + "step": 2549 + }, + { + "epoch": 0.21, + "grad_norm": 3.479648858302999, + "learning_rate": 9.200050320552702e-06, + "loss": 0.7772, + "step": 2550 + }, + { + "epoch": 0.21, + "grad_norm": 4.898347724267929, + "learning_rate": 9.199336502267145e-06, + "loss": 0.8678, + "step": 2551 + }, + { + "epoch": 0.21, + "grad_norm": 9.41603516254404, + "learning_rate": 9.19862239336106e-06, + "loss": 0.745, + "step": 2552 + }, + { + "epoch": 0.21, + "grad_norm": 3.545931265898865, + "learning_rate": 9.197907993883865e-06, + "loss": 0.8986, + "step": 2553 + }, + { + "epoch": 0.21, + "grad_norm": 3.301868952189717, + "learning_rate": 9.197193303885008e-06, + "loss": 0.64, + "step": 2554 + }, + { + "epoch": 0.21, + "grad_norm": 4.345528438571801, + "learning_rate": 9.196478323413946e-06, + "loss": 0.8305, + "step": 2555 + }, + { + "epoch": 0.21, + "grad_norm": 8.106496876694042, + "learning_rate": 9.19576305252016e-06, + "loss": 0.8963, + "step": 2556 + }, + { + "epoch": 0.21, + "grad_norm": 3.361375352777354, + "learning_rate": 9.195047491253154e-06, + "loss": 0.823, + "step": 2557 + }, + { + "epoch": 0.21, + "grad_norm": 4.923056400330417, + "learning_rate": 9.194331639662445e-06, + "loss": 0.6374, + "step": 2558 + }, + { + "epoch": 0.21, + "grad_norm": 4.321522171284863, + "learning_rate": 9.193615497797579e-06, + "loss": 0.7804, + "step": 2559 + }, + { + "epoch": 0.21, + "grad_norm": 9.091655304786851, + "learning_rate": 9.192899065708115e-06, + "loss": 0.6688, + "step": 2560 + }, + { + "epoch": 0.21, + "grad_norm": 3.3904821534983802, + "learning_rate": 9.192182343443634e-06, + "loss": 0.7634, + "step": 2561 + }, + { + "epoch": 0.21, + "grad_norm": 5.887533608463917, + "learning_rate": 9.19146533105374e-06, + "loss": 0.7445, + "step": 2562 + }, + { + "epoch": 0.21, + "grad_norm": 4.683333555125534, + "learning_rate": 9.190748028588053e-06, + "loss": 0.7642, + "step": 2563 + }, + { + "epoch": 0.21, + "grad_norm": 3.6142932153919536, + "learning_rate": 9.190030436096213e-06, + "loss": 0.7218, + "step": 2564 + }, + { + "epoch": 0.21, + "grad_norm": 6.490733019316102, + "learning_rate": 9.189312553627886e-06, + "loss": 0.7971, + "step": 2565 + }, + { + "epoch": 0.21, + "grad_norm": 5.115652874886379, + "learning_rate": 9.188594381232754e-06, + "loss": 0.8786, + "step": 2566 + }, + { + "epoch": 0.21, + "grad_norm": 3.1046024833077643, + "learning_rate": 9.187875918960516e-06, + "loss": 0.662, + "step": 2567 + }, + { + "epoch": 0.21, + "grad_norm": 4.108103830208113, + "learning_rate": 9.187157166860894e-06, + "loss": 0.626, + "step": 2568 + }, + { + "epoch": 0.21, + "grad_norm": 3.327610341189339, + "learning_rate": 9.186438124983633e-06, + "loss": 0.7444, + "step": 2569 + }, + { + "epoch": 0.21, + "grad_norm": 2.714346878290693, + "learning_rate": 9.185718793378492e-06, + "loss": 0.614, + "step": 2570 + }, + { + "epoch": 0.21, + "grad_norm": 7.829820932368684, + "learning_rate": 9.184999172095257e-06, + "loss": 0.6303, + "step": 2571 + }, + { + "epoch": 0.21, + "grad_norm": 7.117774846367013, + "learning_rate": 9.184279261183728e-06, + "loss": 0.6896, + "step": 2572 + }, + { + "epoch": 0.21, + "grad_norm": 4.381983065807821, + "learning_rate": 9.183559060693728e-06, + "loss": 0.7862, + "step": 2573 + }, + { + "epoch": 0.21, + "grad_norm": 12.820989595388447, + "learning_rate": 9.182838570675097e-06, + "loss": 0.7306, + "step": 2574 + }, + { + "epoch": 0.21, + "grad_norm": 4.43502747538333, + "learning_rate": 9.182117791177702e-06, + "loss": 0.7599, + "step": 2575 + }, + { + "epoch": 0.21, + "grad_norm": 3.281867666805685, + "learning_rate": 9.181396722251422e-06, + "loss": 0.738, + "step": 2576 + }, + { + "epoch": 0.21, + "grad_norm": 5.779989177399466, + "learning_rate": 9.18067536394616e-06, + "loss": 0.6623, + "step": 2577 + }, + { + "epoch": 0.21, + "grad_norm": 28.064252006983025, + "learning_rate": 9.17995371631184e-06, + "loss": 0.6323, + "step": 2578 + }, + { + "epoch": 0.21, + "grad_norm": 3.758641172429831, + "learning_rate": 9.179231779398403e-06, + "loss": 0.7176, + "step": 2579 + }, + { + "epoch": 0.21, + "grad_norm": 2.8678666455948183, + "learning_rate": 9.178509553255812e-06, + "loss": 0.8448, + "step": 2580 + }, + { + "epoch": 0.21, + "grad_norm": 3.3331168817316525, + "learning_rate": 9.177787037934052e-06, + "loss": 0.6274, + "step": 2581 + }, + { + "epoch": 0.21, + "grad_norm": 2.3517500160629163, + "learning_rate": 9.177064233483121e-06, + "loss": 0.6993, + "step": 2582 + }, + { + "epoch": 0.21, + "grad_norm": 5.668439438071556, + "learning_rate": 9.176341139953046e-06, + "loss": 0.7189, + "step": 2583 + }, + { + "epoch": 0.21, + "grad_norm": 8.236309432328966, + "learning_rate": 9.175617757393867e-06, + "loss": 0.5444, + "step": 2584 + }, + { + "epoch": 0.21, + "grad_norm": 5.0573208614919185, + "learning_rate": 9.174894085855645e-06, + "loss": 0.7263, + "step": 2585 + }, + { + "epoch": 0.21, + "grad_norm": 3.651801211304331, + "learning_rate": 9.174170125388468e-06, + "loss": 0.715, + "step": 2586 + }, + { + "epoch": 0.21, + "grad_norm": 3.9889525021493264, + "learning_rate": 9.173445876042436e-06, + "loss": 0.9097, + "step": 2587 + }, + { + "epoch": 0.21, + "grad_norm": 4.780204533400854, + "learning_rate": 9.17272133786767e-06, + "loss": 0.7696, + "step": 2588 + }, + { + "epoch": 0.21, + "grad_norm": 31.55899433611728, + "learning_rate": 9.171996510914311e-06, + "loss": 0.7628, + "step": 2589 + }, + { + "epoch": 0.21, + "grad_norm": 4.108568143082509, + "learning_rate": 9.171271395232528e-06, + "loss": 0.6953, + "step": 2590 + }, + { + "epoch": 0.21, + "grad_norm": 4.051114625637082, + "learning_rate": 9.170545990872499e-06, + "loss": 0.8119, + "step": 2591 + }, + { + "epoch": 0.21, + "grad_norm": 13.255343175481997, + "learning_rate": 9.169820297884428e-06, + "loss": 0.9631, + "step": 2592 + }, + { + "epoch": 0.21, + "grad_norm": 4.1977369801329925, + "learning_rate": 9.169094316318537e-06, + "loss": 0.7845, + "step": 2593 + }, + { + "epoch": 0.21, + "grad_norm": 4.909765594058128, + "learning_rate": 9.168368046225067e-06, + "loss": 0.7788, + "step": 2594 + }, + { + "epoch": 0.21, + "grad_norm": 4.08491745696979, + "learning_rate": 9.167641487654283e-06, + "loss": 0.7316, + "step": 2595 + }, + { + "epoch": 0.21, + "grad_norm": 5.228780810271977, + "learning_rate": 9.166914640656467e-06, + "loss": 0.5747, + "step": 2596 + }, + { + "epoch": 0.21, + "grad_norm": 3.3880724253382017, + "learning_rate": 9.166187505281919e-06, + "loss": 0.7476, + "step": 2597 + }, + { + "epoch": 0.21, + "grad_norm": 5.592187134524757, + "learning_rate": 9.165460081580965e-06, + "loss": 0.7696, + "step": 2598 + }, + { + "epoch": 0.21, + "grad_norm": 3.866632645471413, + "learning_rate": 9.164732369603944e-06, + "loss": 0.6975, + "step": 2599 + }, + { + "epoch": 0.21, + "grad_norm": 2.9567097617618865, + "learning_rate": 9.16400436940122e-06, + "loss": 0.8094, + "step": 2600 + }, + { + "epoch": 0.21, + "grad_norm": 4.440064003884874, + "learning_rate": 9.163276081023177e-06, + "loss": 0.8229, + "step": 2601 + }, + { + "epoch": 0.21, + "grad_norm": 3.576412861252549, + "learning_rate": 9.162547504520214e-06, + "loss": 0.6277, + "step": 2602 + }, + { + "epoch": 0.21, + "grad_norm": 11.04488692956889, + "learning_rate": 9.161818639942752e-06, + "loss": 0.6133, + "step": 2603 + }, + { + "epoch": 0.21, + "grad_norm": 3.102596020115686, + "learning_rate": 9.161089487341237e-06, + "loss": 0.7928, + "step": 2604 + }, + { + "epoch": 0.21, + "grad_norm": 8.996732415763038, + "learning_rate": 9.160360046766129e-06, + "loss": 0.6554, + "step": 2605 + }, + { + "epoch": 0.21, + "grad_norm": 3.3563536131865703, + "learning_rate": 9.159630318267908e-06, + "loss": 0.7639, + "step": 2606 + }, + { + "epoch": 0.21, + "grad_norm": 10.741720068471897, + "learning_rate": 9.15890030189708e-06, + "loss": 0.7563, + "step": 2607 + }, + { + "epoch": 0.21, + "grad_norm": 10.079084306533423, + "learning_rate": 9.158169997704166e-06, + "loss": 0.5617, + "step": 2608 + }, + { + "epoch": 0.21, + "grad_norm": 5.189167989992016, + "learning_rate": 9.157439405739703e-06, + "loss": 0.8172, + "step": 2609 + }, + { + "epoch": 0.21, + "grad_norm": 4.775254099694386, + "learning_rate": 9.156708526054257e-06, + "loss": 0.6895, + "step": 2610 + }, + { + "epoch": 0.21, + "grad_norm": 3.0002308214770297, + "learning_rate": 9.15597735869841e-06, + "loss": 0.7644, + "step": 2611 + }, + { + "epoch": 0.21, + "grad_norm": 4.004904774854671, + "learning_rate": 9.155245903722758e-06, + "loss": 0.7785, + "step": 2612 + }, + { + "epoch": 0.21, + "grad_norm": 6.354417392143378, + "learning_rate": 9.154514161177927e-06, + "loss": 0.7169, + "step": 2613 + }, + { + "epoch": 0.21, + "grad_norm": 3.4366143474901256, + "learning_rate": 9.153782131114559e-06, + "loss": 0.6287, + "step": 2614 + }, + { + "epoch": 0.21, + "grad_norm": 7.383108649041177, + "learning_rate": 9.15304981358331e-06, + "loss": 0.7268, + "step": 2615 + }, + { + "epoch": 0.21, + "grad_norm": 19.76298133232945, + "learning_rate": 9.152317208634866e-06, + "loss": 0.7053, + "step": 2616 + }, + { + "epoch": 0.21, + "grad_norm": 3.1786785748148367, + "learning_rate": 9.151584316319928e-06, + "loss": 0.6673, + "step": 2617 + }, + { + "epoch": 0.21, + "grad_norm": 2.968299364251915, + "learning_rate": 9.150851136689212e-06, + "loss": 0.7316, + "step": 2618 + }, + { + "epoch": 0.21, + "grad_norm": 15.79613254809934, + "learning_rate": 9.150117669793462e-06, + "loss": 0.7724, + "step": 2619 + }, + { + "epoch": 0.21, + "grad_norm": 3.0563021303323317, + "learning_rate": 9.149383915683439e-06, + "loss": 0.6686, + "step": 2620 + }, + { + "epoch": 0.21, + "grad_norm": 4.841994801750214, + "learning_rate": 9.148649874409921e-06, + "loss": 0.7466, + "step": 2621 + }, + { + "epoch": 0.21, + "grad_norm": 4.5679022173226445, + "learning_rate": 9.14791554602371e-06, + "loss": 0.6087, + "step": 2622 + }, + { + "epoch": 0.21, + "grad_norm": 30.473889457845605, + "learning_rate": 9.147180930575625e-06, + "loss": 0.7049, + "step": 2623 + }, + { + "epoch": 0.21, + "grad_norm": 3.5776867590333645, + "learning_rate": 9.146446028116508e-06, + "loss": 0.717, + "step": 2624 + }, + { + "epoch": 0.21, + "grad_norm": 4.546865600071528, + "learning_rate": 9.145710838697217e-06, + "loss": 0.5362, + "step": 2625 + }, + { + "epoch": 0.21, + "grad_norm": 4.420614653982784, + "learning_rate": 9.144975362368633e-06, + "loss": 0.8378, + "step": 2626 + }, + { + "epoch": 0.21, + "grad_norm": 3.481214764733894, + "learning_rate": 9.144239599181655e-06, + "loss": 0.7211, + "step": 2627 + }, + { + "epoch": 0.21, + "grad_norm": 4.690752450673927, + "learning_rate": 9.143503549187203e-06, + "loss": 0.8571, + "step": 2628 + }, + { + "epoch": 0.21, + "grad_norm": 4.8232974925696555, + "learning_rate": 9.142767212436214e-06, + "loss": 0.8012, + "step": 2629 + }, + { + "epoch": 0.21, + "grad_norm": 5.715245998342851, + "learning_rate": 9.142030588979649e-06, + "loss": 0.7481, + "step": 2630 + }, + { + "epoch": 0.21, + "grad_norm": 3.3430597167109304, + "learning_rate": 9.141293678868488e-06, + "loss": 0.6595, + "step": 2631 + }, + { + "epoch": 0.21, + "grad_norm": 4.667305769535707, + "learning_rate": 9.140556482153729e-06, + "loss": 0.7439, + "step": 2632 + }, + { + "epoch": 0.21, + "grad_norm": 5.896872062295749, + "learning_rate": 9.13981899888639e-06, + "loss": 0.7425, + "step": 2633 + }, + { + "epoch": 0.21, + "grad_norm": 5.652694425829886, + "learning_rate": 9.139081229117508e-06, + "loss": 0.7239, + "step": 2634 + }, + { + "epoch": 0.21, + "grad_norm": 3.6717784615264386, + "learning_rate": 9.138343172898145e-06, + "loss": 0.6198, + "step": 2635 + }, + { + "epoch": 0.21, + "grad_norm": 4.635901425925286, + "learning_rate": 9.137604830279377e-06, + "loss": 0.7164, + "step": 2636 + }, + { + "epoch": 0.21, + "grad_norm": 4.420739061260594, + "learning_rate": 9.136866201312302e-06, + "loss": 0.7577, + "step": 2637 + }, + { + "epoch": 0.21, + "grad_norm": 3.238242204210583, + "learning_rate": 9.136127286048038e-06, + "loss": 0.7427, + "step": 2638 + }, + { + "epoch": 0.21, + "grad_norm": 3.1140559155848666, + "learning_rate": 9.135388084537725e-06, + "loss": 0.7473, + "step": 2639 + }, + { + "epoch": 0.21, + "grad_norm": 4.579194673033722, + "learning_rate": 9.134648596832513e-06, + "loss": 0.8568, + "step": 2640 + }, + { + "epoch": 0.21, + "grad_norm": 4.394945091205819, + "learning_rate": 9.133908822983589e-06, + "loss": 0.9303, + "step": 2641 + }, + { + "epoch": 0.21, + "grad_norm": 7.516818800844026, + "learning_rate": 9.133168763042141e-06, + "loss": 0.6363, + "step": 2642 + }, + { + "epoch": 0.21, + "grad_norm": 3.563621335239375, + "learning_rate": 9.132428417059393e-06, + "loss": 0.934, + "step": 2643 + }, + { + "epoch": 0.21, + "grad_norm": 5.610273032555965, + "learning_rate": 9.131687785086579e-06, + "loss": 0.7349, + "step": 2644 + }, + { + "epoch": 0.21, + "grad_norm": 6.595061349046355, + "learning_rate": 9.130946867174952e-06, + "loss": 0.7058, + "step": 2645 + }, + { + "epoch": 0.21, + "grad_norm": 4.9931245800779855, + "learning_rate": 9.130205663375792e-06, + "loss": 0.7296, + "step": 2646 + }, + { + "epoch": 0.21, + "grad_norm": 9.04287059619439, + "learning_rate": 9.129464173740397e-06, + "loss": 0.7229, + "step": 2647 + }, + { + "epoch": 0.22, + "grad_norm": 10.186035226012091, + "learning_rate": 9.128722398320077e-06, + "loss": 0.7902, + "step": 2648 + }, + { + "epoch": 0.22, + "grad_norm": 14.042674649080956, + "learning_rate": 9.127980337166172e-06, + "loss": 0.829, + "step": 2649 + }, + { + "epoch": 0.22, + "grad_norm": 4.567737943991186, + "learning_rate": 9.127237990330035e-06, + "loss": 0.7444, + "step": 2650 + }, + { + "epoch": 0.22, + "grad_norm": 8.205808130214953, + "learning_rate": 9.126495357863042e-06, + "loss": 0.6834, + "step": 2651 + }, + { + "epoch": 0.22, + "grad_norm": 4.325680194904642, + "learning_rate": 9.125752439816588e-06, + "loss": 0.7545, + "step": 2652 + }, + { + "epoch": 0.22, + "grad_norm": 4.853116556720304, + "learning_rate": 9.125009236242088e-06, + "loss": 0.6094, + "step": 2653 + }, + { + "epoch": 0.22, + "grad_norm": 5.731713979445744, + "learning_rate": 9.124265747190974e-06, + "loss": 0.7126, + "step": 2654 + }, + { + "epoch": 0.22, + "grad_norm": 7.764051409387372, + "learning_rate": 9.123521972714702e-06, + "loss": 0.7216, + "step": 2655 + }, + { + "epoch": 0.22, + "grad_norm": 5.647308351505891, + "learning_rate": 9.122777912864747e-06, + "loss": 0.6114, + "step": 2656 + }, + { + "epoch": 0.22, + "grad_norm": 7.104482992303934, + "learning_rate": 9.122033567692601e-06, + "loss": 0.7737, + "step": 2657 + }, + { + "epoch": 0.22, + "grad_norm": 6.480850805948082, + "learning_rate": 9.121288937249777e-06, + "loss": 0.6796, + "step": 2658 + }, + { + "epoch": 0.22, + "grad_norm": 4.427958378154737, + "learning_rate": 9.120544021587807e-06, + "loss": 0.7595, + "step": 2659 + }, + { + "epoch": 0.22, + "grad_norm": 5.712765690668253, + "learning_rate": 9.11979882075825e-06, + "loss": 0.8159, + "step": 2660 + }, + { + "epoch": 0.22, + "grad_norm": 29.89773928911269, + "learning_rate": 9.119053334812671e-06, + "loss": 0.7007, + "step": 2661 + }, + { + "epoch": 0.22, + "grad_norm": 4.550242400499846, + "learning_rate": 9.118307563802665e-06, + "loss": 0.6697, + "step": 2662 + }, + { + "epoch": 0.22, + "grad_norm": 6.08703493114258, + "learning_rate": 9.117561507779847e-06, + "loss": 0.7202, + "step": 2663 + }, + { + "epoch": 0.22, + "grad_norm": 7.546835579384152, + "learning_rate": 9.116815166795844e-06, + "loss": 0.652, + "step": 2664 + }, + { + "epoch": 0.22, + "grad_norm": 4.295192199334068, + "learning_rate": 9.116068540902313e-06, + "loss": 0.7643, + "step": 2665 + }, + { + "epoch": 0.22, + "grad_norm": 2.7338442793599462, + "learning_rate": 9.115321630150918e-06, + "loss": 0.5044, + "step": 2666 + }, + { + "epoch": 0.22, + "grad_norm": 5.547642337921033, + "learning_rate": 9.114574434593357e-06, + "loss": 0.776, + "step": 2667 + }, + { + "epoch": 0.22, + "grad_norm": 7.284702566417422, + "learning_rate": 9.113826954281335e-06, + "loss": 0.7563, + "step": 2668 + }, + { + "epoch": 0.22, + "grad_norm": 5.76573369240654, + "learning_rate": 9.113079189266587e-06, + "loss": 0.8402, + "step": 2669 + }, + { + "epoch": 0.22, + "grad_norm": 5.230268456930929, + "learning_rate": 9.112331139600861e-06, + "loss": 0.7483, + "step": 2670 + }, + { + "epoch": 0.22, + "grad_norm": 5.5528895761977894, + "learning_rate": 9.111582805335926e-06, + "loss": 0.7327, + "step": 2671 + }, + { + "epoch": 0.22, + "grad_norm": 3.8461377163237023, + "learning_rate": 9.110834186523572e-06, + "loss": 0.6512, + "step": 2672 + }, + { + "epoch": 0.22, + "grad_norm": 5.1447165679573885, + "learning_rate": 9.11008528321561e-06, + "loss": 0.6981, + "step": 2673 + }, + { + "epoch": 0.22, + "grad_norm": 4.557117022653451, + "learning_rate": 9.109336095463865e-06, + "loss": 0.939, + "step": 2674 + }, + { + "epoch": 0.22, + "grad_norm": 13.751336320407118, + "learning_rate": 9.10858662332019e-06, + "loss": 0.7143, + "step": 2675 + }, + { + "epoch": 0.22, + "grad_norm": 4.680425328380966, + "learning_rate": 9.107836866836448e-06, + "loss": 0.8348, + "step": 2676 + }, + { + "epoch": 0.22, + "grad_norm": 9.953345091881308, + "learning_rate": 9.107086826064533e-06, + "loss": 0.7134, + "step": 2677 + }, + { + "epoch": 0.22, + "grad_norm": 14.32229750054991, + "learning_rate": 9.106336501056348e-06, + "loss": 0.7114, + "step": 2678 + }, + { + "epoch": 0.22, + "grad_norm": 8.102795209142728, + "learning_rate": 9.10558589186382e-06, + "loss": 0.9128, + "step": 2679 + }, + { + "epoch": 0.22, + "grad_norm": 13.912212284352316, + "learning_rate": 9.104834998538899e-06, + "loss": 0.6523, + "step": 2680 + }, + { + "epoch": 0.22, + "grad_norm": 16.282086926318353, + "learning_rate": 9.10408382113355e-06, + "loss": 0.802, + "step": 2681 + }, + { + "epoch": 0.22, + "grad_norm": 39.375009345723356, + "learning_rate": 9.103332359699757e-06, + "loss": 0.6199, + "step": 2682 + }, + { + "epoch": 0.22, + "grad_norm": 8.912235763225437, + "learning_rate": 9.102580614289532e-06, + "loss": 0.6851, + "step": 2683 + }, + { + "epoch": 0.22, + "grad_norm": 50.155658157814074, + "learning_rate": 9.101828584954893e-06, + "loss": 0.8759, + "step": 2684 + }, + { + "epoch": 0.22, + "grad_norm": 9.107254244590859, + "learning_rate": 9.101076271747888e-06, + "loss": 0.929, + "step": 2685 + }, + { + "epoch": 0.22, + "grad_norm": 5.209268275606788, + "learning_rate": 9.100323674720585e-06, + "loss": 0.7997, + "step": 2686 + }, + { + "epoch": 0.22, + "grad_norm": 5.78347972967204, + "learning_rate": 9.099570793925065e-06, + "loss": 0.6723, + "step": 2687 + }, + { + "epoch": 0.22, + "grad_norm": 3.273140075954397, + "learning_rate": 9.098817629413434e-06, + "loss": 0.6191, + "step": 2688 + }, + { + "epoch": 0.22, + "grad_norm": 5.036411978263137, + "learning_rate": 9.098064181237814e-06, + "loss": 0.6556, + "step": 2689 + }, + { + "epoch": 0.22, + "grad_norm": 8.382277251866585, + "learning_rate": 9.097310449450348e-06, + "loss": 0.7097, + "step": 2690 + }, + { + "epoch": 0.22, + "grad_norm": 5.318812685537838, + "learning_rate": 9.096556434103201e-06, + "loss": 0.8173, + "step": 2691 + }, + { + "epoch": 0.22, + "grad_norm": 7.913724205519675, + "learning_rate": 9.095802135248557e-06, + "loss": 0.7485, + "step": 2692 + }, + { + "epoch": 0.22, + "grad_norm": 6.204978940727581, + "learning_rate": 9.095047552938612e-06, + "loss": 0.7458, + "step": 2693 + }, + { + "epoch": 0.22, + "grad_norm": 5.438141731507851, + "learning_rate": 9.094292687225594e-06, + "loss": 0.763, + "step": 2694 + }, + { + "epoch": 0.22, + "grad_norm": 3.9326766404857745, + "learning_rate": 9.093537538161742e-06, + "loss": 0.7283, + "step": 2695 + }, + { + "epoch": 0.22, + "grad_norm": 4.767800338634533, + "learning_rate": 9.092782105799317e-06, + "loss": 0.8797, + "step": 2696 + }, + { + "epoch": 0.22, + "grad_norm": 6.570497039625524, + "learning_rate": 9.0920263901906e-06, + "loss": 0.7888, + "step": 2697 + }, + { + "epoch": 0.22, + "grad_norm": 6.932513129150175, + "learning_rate": 9.091270391387892e-06, + "loss": 0.7179, + "step": 2698 + }, + { + "epoch": 0.22, + "grad_norm": 9.875900860904347, + "learning_rate": 9.090514109443511e-06, + "loss": 0.6616, + "step": 2699 + }, + { + "epoch": 0.22, + "grad_norm": 5.317775666968505, + "learning_rate": 9.089757544409798e-06, + "loss": 0.7709, + "step": 2700 + }, + { + "epoch": 0.22, + "grad_norm": 6.961696962761908, + "learning_rate": 9.089000696339112e-06, + "loss": 0.5837, + "step": 2701 + }, + { + "epoch": 0.22, + "grad_norm": 2.97624468295653, + "learning_rate": 9.088243565283832e-06, + "loss": 0.7805, + "step": 2702 + }, + { + "epoch": 0.22, + "grad_norm": 7.419510373643693, + "learning_rate": 9.087486151296355e-06, + "loss": 0.7519, + "step": 2703 + }, + { + "epoch": 0.22, + "grad_norm": 4.581844965387088, + "learning_rate": 9.086728454429099e-06, + "loss": 0.7128, + "step": 2704 + }, + { + "epoch": 0.22, + "grad_norm": 5.83024760691626, + "learning_rate": 9.085970474734501e-06, + "loss": 0.771, + "step": 2705 + }, + { + "epoch": 0.22, + "grad_norm": 6.522818646515112, + "learning_rate": 9.08521221226502e-06, + "loss": 0.8641, + "step": 2706 + }, + { + "epoch": 0.22, + "grad_norm": 6.734050701435857, + "learning_rate": 9.084453667073131e-06, + "loss": 0.8186, + "step": 2707 + }, + { + "epoch": 0.22, + "grad_norm": 4.004804390681544, + "learning_rate": 9.08369483921133e-06, + "loss": 0.7255, + "step": 2708 + }, + { + "epoch": 0.22, + "grad_norm": 4.437988292791883, + "learning_rate": 9.082935728732135e-06, + "loss": 0.7883, + "step": 2709 + }, + { + "epoch": 0.22, + "grad_norm": 23.604595825957652, + "learning_rate": 9.082176335688076e-06, + "loss": 0.792, + "step": 2710 + }, + { + "epoch": 0.22, + "grad_norm": 2.4938608384537417, + "learning_rate": 9.081416660131713e-06, + "loss": 0.5597, + "step": 2711 + }, + { + "epoch": 0.22, + "grad_norm": 3.9557799062829435, + "learning_rate": 9.080656702115619e-06, + "loss": 0.752, + "step": 2712 + }, + { + "epoch": 0.22, + "grad_norm": 10.548714184582717, + "learning_rate": 9.079896461692386e-06, + "loss": 0.7945, + "step": 2713 + }, + { + "epoch": 0.22, + "grad_norm": 6.892085288408094, + "learning_rate": 9.07913593891463e-06, + "loss": 0.5684, + "step": 2714 + }, + { + "epoch": 0.22, + "grad_norm": 5.879424259898536, + "learning_rate": 9.078375133834981e-06, + "loss": 0.6846, + "step": 2715 + }, + { + "epoch": 0.22, + "grad_norm": 5.303886354130676, + "learning_rate": 9.077614046506094e-06, + "loss": 0.689, + "step": 2716 + }, + { + "epoch": 0.22, + "grad_norm": 2.891782486024536, + "learning_rate": 9.07685267698064e-06, + "loss": 0.7651, + "step": 2717 + }, + { + "epoch": 0.22, + "grad_norm": 3.1691665416430355, + "learning_rate": 9.076091025311311e-06, + "loss": 0.6953, + "step": 2718 + }, + { + "epoch": 0.22, + "grad_norm": 5.866002378618986, + "learning_rate": 9.075329091550818e-06, + "loss": 0.8198, + "step": 2719 + }, + { + "epoch": 0.22, + "grad_norm": 9.05782311958584, + "learning_rate": 9.07456687575189e-06, + "loss": 0.7667, + "step": 2720 + }, + { + "epoch": 0.22, + "grad_norm": 4.138826918516352, + "learning_rate": 9.07380437796728e-06, + "loss": 0.8756, + "step": 2721 + }, + { + "epoch": 0.22, + "grad_norm": 3.633036725881948, + "learning_rate": 9.073041598249757e-06, + "loss": 0.7408, + "step": 2722 + }, + { + "epoch": 0.22, + "grad_norm": 9.987181530244671, + "learning_rate": 9.072278536652107e-06, + "loss": 0.7306, + "step": 2723 + }, + { + "epoch": 0.22, + "grad_norm": 3.8200064178265314, + "learning_rate": 9.071515193227145e-06, + "loss": 0.6324, + "step": 2724 + }, + { + "epoch": 0.22, + "grad_norm": 4.895201551282844, + "learning_rate": 9.070751568027691e-06, + "loss": 0.7071, + "step": 2725 + }, + { + "epoch": 0.22, + "grad_norm": 15.408219559885941, + "learning_rate": 9.0699876611066e-06, + "loss": 0.7833, + "step": 2726 + }, + { + "epoch": 0.22, + "grad_norm": 8.003758363197623, + "learning_rate": 9.069223472516736e-06, + "loss": 0.5875, + "step": 2727 + }, + { + "epoch": 0.22, + "grad_norm": 3.7300051283717215, + "learning_rate": 9.068459002310983e-06, + "loss": 0.6757, + "step": 2728 + }, + { + "epoch": 0.22, + "grad_norm": 4.99193212662554, + "learning_rate": 9.067694250542252e-06, + "loss": 0.6082, + "step": 2729 + }, + { + "epoch": 0.22, + "grad_norm": 11.404216329577935, + "learning_rate": 9.066929217263465e-06, + "loss": 0.8323, + "step": 2730 + }, + { + "epoch": 0.22, + "grad_norm": 5.948284416130249, + "learning_rate": 9.066163902527571e-06, + "loss": 0.815, + "step": 2731 + }, + { + "epoch": 0.22, + "grad_norm": 5.550798934221118, + "learning_rate": 9.065398306387532e-06, + "loss": 0.7345, + "step": 2732 + }, + { + "epoch": 0.22, + "grad_norm": 5.943286957990646, + "learning_rate": 9.064632428896331e-06, + "loss": 0.757, + "step": 2733 + }, + { + "epoch": 0.22, + "grad_norm": 9.29412293612629, + "learning_rate": 9.063866270106972e-06, + "loss": 0.8429, + "step": 2734 + }, + { + "epoch": 0.22, + "grad_norm": 4.3469635273621146, + "learning_rate": 9.063099830072482e-06, + "loss": 0.6731, + "step": 2735 + }, + { + "epoch": 0.22, + "grad_norm": 12.67535621903259, + "learning_rate": 9.062333108845897e-06, + "loss": 0.8227, + "step": 2736 + }, + { + "epoch": 0.22, + "grad_norm": 4.946492570529726, + "learning_rate": 9.061566106480283e-06, + "loss": 0.8305, + "step": 2737 + }, + { + "epoch": 0.22, + "grad_norm": 7.3608034825731705, + "learning_rate": 9.060798823028722e-06, + "loss": 0.7179, + "step": 2738 + }, + { + "epoch": 0.22, + "grad_norm": 3.8846830053711408, + "learning_rate": 9.060031258544313e-06, + "loss": 0.6044, + "step": 2739 + }, + { + "epoch": 0.22, + "grad_norm": 3.090659456074845, + "learning_rate": 9.059263413080178e-06, + "loss": 0.7603, + "step": 2740 + }, + { + "epoch": 0.22, + "grad_norm": 11.908399498640355, + "learning_rate": 9.058495286689454e-06, + "loss": 0.7644, + "step": 2741 + }, + { + "epoch": 0.22, + "grad_norm": 15.029772778090752, + "learning_rate": 9.057726879425302e-06, + "loss": 0.758, + "step": 2742 + }, + { + "epoch": 0.22, + "grad_norm": 4.891524469156882, + "learning_rate": 9.0569581913409e-06, + "loss": 0.8149, + "step": 2743 + }, + { + "epoch": 0.22, + "grad_norm": 12.510455052677278, + "learning_rate": 9.056189222489448e-06, + "loss": 0.8281, + "step": 2744 + }, + { + "epoch": 0.22, + "grad_norm": 2.848452257463674, + "learning_rate": 9.055419972924161e-06, + "loss": 0.7077, + "step": 2745 + }, + { + "epoch": 0.22, + "grad_norm": 7.5548896297754355, + "learning_rate": 9.054650442698276e-06, + "loss": 0.5451, + "step": 2746 + }, + { + "epoch": 0.22, + "grad_norm": 4.354326541771681, + "learning_rate": 9.05388063186505e-06, + "loss": 0.7335, + "step": 2747 + }, + { + "epoch": 0.22, + "grad_norm": 4.195021040256123, + "learning_rate": 9.053110540477762e-06, + "loss": 0.7195, + "step": 2748 + }, + { + "epoch": 0.22, + "grad_norm": 3.6155431524507655, + "learning_rate": 9.052340168589702e-06, + "loss": 0.8022, + "step": 2749 + }, + { + "epoch": 0.22, + "grad_norm": 7.4616120963422, + "learning_rate": 9.051569516254186e-06, + "loss": 0.7934, + "step": 2750 + }, + { + "epoch": 0.22, + "grad_norm": 3.589843109233797, + "learning_rate": 9.050798583524549e-06, + "loss": 0.7515, + "step": 2751 + }, + { + "epoch": 0.22, + "grad_norm": 3.6711177946861135, + "learning_rate": 9.050027370454146e-06, + "loss": 0.7092, + "step": 2752 + }, + { + "epoch": 0.22, + "grad_norm": 5.392293003814468, + "learning_rate": 9.049255877096346e-06, + "loss": 0.7708, + "step": 2753 + }, + { + "epoch": 0.22, + "grad_norm": 2.708608722137364, + "learning_rate": 9.048484103504542e-06, + "loss": 0.7861, + "step": 2754 + }, + { + "epoch": 0.22, + "grad_norm": 3.083993388669979, + "learning_rate": 9.04771204973215e-06, + "loss": 0.6698, + "step": 2755 + }, + { + "epoch": 0.22, + "grad_norm": 3.326937002207549, + "learning_rate": 9.046939715832595e-06, + "loss": 0.7664, + "step": 2756 + }, + { + "epoch": 0.22, + "grad_norm": 3.3260271832751815, + "learning_rate": 9.046167101859332e-06, + "loss": 0.6076, + "step": 2757 + }, + { + "epoch": 0.22, + "grad_norm": 7.1350910382500174, + "learning_rate": 9.045394207865826e-06, + "loss": 0.5636, + "step": 2758 + }, + { + "epoch": 0.22, + "grad_norm": 7.346422187879488, + "learning_rate": 9.04462103390557e-06, + "loss": 0.7002, + "step": 2759 + }, + { + "epoch": 0.22, + "grad_norm": 6.21289492987418, + "learning_rate": 9.043847580032072e-06, + "loss": 0.7792, + "step": 2760 + }, + { + "epoch": 0.22, + "grad_norm": 4.108704503870191, + "learning_rate": 9.04307384629886e-06, + "loss": 0.6693, + "step": 2761 + }, + { + "epoch": 0.22, + "grad_norm": 4.064972310908145, + "learning_rate": 9.04229983275948e-06, + "loss": 0.8571, + "step": 2762 + }, + { + "epoch": 0.22, + "grad_norm": 3.3714328539508003, + "learning_rate": 9.041525539467498e-06, + "loss": 0.6904, + "step": 2763 + }, + { + "epoch": 0.22, + "grad_norm": 3.7886605911104545, + "learning_rate": 9.040750966476502e-06, + "loss": 0.8888, + "step": 2764 + }, + { + "epoch": 0.22, + "grad_norm": 4.970085572119584, + "learning_rate": 9.039976113840097e-06, + "loss": 0.8186, + "step": 2765 + }, + { + "epoch": 0.22, + "grad_norm": 5.123548129494378, + "learning_rate": 9.039200981611907e-06, + "loss": 0.8157, + "step": 2766 + }, + { + "epoch": 0.22, + "grad_norm": 5.515641156549808, + "learning_rate": 9.038425569845574e-06, + "loss": 0.8627, + "step": 2767 + }, + { + "epoch": 0.22, + "grad_norm": 3.951729168884959, + "learning_rate": 9.037649878594766e-06, + "loss": 0.7646, + "step": 2768 + }, + { + "epoch": 0.22, + "grad_norm": 5.459338968085195, + "learning_rate": 9.036873907913163e-06, + "loss": 0.5484, + "step": 2769 + }, + { + "epoch": 0.22, + "grad_norm": 6.414953471945675, + "learning_rate": 9.036097657854467e-06, + "loss": 0.7407, + "step": 2770 + }, + { + "epoch": 0.23, + "grad_norm": 3.3980258179149323, + "learning_rate": 9.035321128472398e-06, + "loss": 0.674, + "step": 2771 + }, + { + "epoch": 0.23, + "grad_norm": 5.128790761681696, + "learning_rate": 9.034544319820701e-06, + "loss": 0.7561, + "step": 2772 + }, + { + "epoch": 0.23, + "grad_norm": 3.0122225874327375, + "learning_rate": 9.033767231953131e-06, + "loss": 0.6936, + "step": 2773 + }, + { + "epoch": 0.23, + "grad_norm": 7.521065240423371, + "learning_rate": 9.032989864923474e-06, + "loss": 0.6843, + "step": 2774 + }, + { + "epoch": 0.23, + "grad_norm": 5.552363230323675, + "learning_rate": 9.032212218785521e-06, + "loss": 0.7114, + "step": 2775 + }, + { + "epoch": 0.23, + "grad_norm": 5.524191250555285, + "learning_rate": 9.031434293593094e-06, + "loss": 0.8634, + "step": 2776 + }, + { + "epoch": 0.23, + "grad_norm": 25.952765763226378, + "learning_rate": 9.03065608940003e-06, + "loss": 0.7736, + "step": 2777 + }, + { + "epoch": 0.23, + "grad_norm": 8.601359586721653, + "learning_rate": 9.029877606260187e-06, + "loss": 0.6508, + "step": 2778 + }, + { + "epoch": 0.23, + "grad_norm": 4.631633213230639, + "learning_rate": 9.029098844227438e-06, + "loss": 0.6534, + "step": 2779 + }, + { + "epoch": 0.23, + "grad_norm": 4.900497168771613, + "learning_rate": 9.02831980335568e-06, + "loss": 0.7626, + "step": 2780 + }, + { + "epoch": 0.23, + "grad_norm": 3.542578512079863, + "learning_rate": 9.027540483698828e-06, + "loss": 0.8199, + "step": 2781 + }, + { + "epoch": 0.23, + "grad_norm": 3.636061515855482, + "learning_rate": 9.026760885310812e-06, + "loss": 0.7583, + "step": 2782 + }, + { + "epoch": 0.23, + "grad_norm": 6.484983563447593, + "learning_rate": 9.02598100824559e-06, + "loss": 0.6814, + "step": 2783 + }, + { + "epoch": 0.23, + "grad_norm": 5.0532201747978505, + "learning_rate": 9.025200852557135e-06, + "loss": 0.9847, + "step": 2784 + }, + { + "epoch": 0.23, + "grad_norm": 5.90432258345458, + "learning_rate": 9.024420418299433e-06, + "loss": 0.7339, + "step": 2785 + }, + { + "epoch": 0.23, + "grad_norm": 3.20909135354148, + "learning_rate": 9.0236397055265e-06, + "loss": 0.9107, + "step": 2786 + }, + { + "epoch": 0.23, + "grad_norm": 5.245169752474064, + "learning_rate": 9.022858714292362e-06, + "loss": 0.855, + "step": 2787 + }, + { + "epoch": 0.23, + "grad_norm": 7.195303770794165, + "learning_rate": 9.022077444651074e-06, + "loss": 0.703, + "step": 2788 + }, + { + "epoch": 0.23, + "grad_norm": 6.208064501030853, + "learning_rate": 9.0212958966567e-06, + "loss": 0.5855, + "step": 2789 + }, + { + "epoch": 0.23, + "grad_norm": 6.6778823552188165, + "learning_rate": 9.020514070363331e-06, + "loss": 0.9221, + "step": 2790 + }, + { + "epoch": 0.23, + "grad_norm": 3.917314724362079, + "learning_rate": 9.019731965825072e-06, + "loss": 0.6728, + "step": 2791 + }, + { + "epoch": 0.23, + "grad_norm": 7.046068880236291, + "learning_rate": 9.018949583096051e-06, + "loss": 0.7425, + "step": 2792 + }, + { + "epoch": 0.23, + "grad_norm": 17.78106361519418, + "learning_rate": 9.018166922230413e-06, + "loss": 0.6993, + "step": 2793 + }, + { + "epoch": 0.23, + "grad_norm": 5.411868695428226, + "learning_rate": 9.017383983282325e-06, + "loss": 0.8871, + "step": 2794 + }, + { + "epoch": 0.23, + "grad_norm": 5.482486583475973, + "learning_rate": 9.016600766305967e-06, + "loss": 0.6458, + "step": 2795 + }, + { + "epoch": 0.23, + "grad_norm": 6.48750953029335, + "learning_rate": 9.015817271355549e-06, + "loss": 0.776, + "step": 2796 + }, + { + "epoch": 0.23, + "grad_norm": 6.9090477126764, + "learning_rate": 9.015033498485287e-06, + "loss": 0.6723, + "step": 2797 + }, + { + "epoch": 0.23, + "grad_norm": 10.674092993281306, + "learning_rate": 9.014249447749429e-06, + "loss": 0.8224, + "step": 2798 + }, + { + "epoch": 0.23, + "grad_norm": 11.969592780783005, + "learning_rate": 9.01346511920223e-06, + "loss": 0.7704, + "step": 2799 + }, + { + "epoch": 0.23, + "grad_norm": 10.157812541375764, + "learning_rate": 9.012680512897975e-06, + "loss": 0.686, + "step": 2800 + }, + { + "epoch": 0.23, + "grad_norm": 6.468509377620594, + "learning_rate": 9.011895628890964e-06, + "loss": 0.7035, + "step": 2801 + }, + { + "epoch": 0.23, + "grad_norm": 7.7116699413186405, + "learning_rate": 9.011110467235515e-06, + "loss": 0.679, + "step": 2802 + }, + { + "epoch": 0.23, + "grad_norm": 8.19715850556492, + "learning_rate": 9.010325027985964e-06, + "loss": 0.5679, + "step": 2803 + }, + { + "epoch": 0.23, + "grad_norm": 159.1343689802648, + "learning_rate": 9.00953931119667e-06, + "loss": 0.6765, + "step": 2804 + }, + { + "epoch": 0.23, + "grad_norm": 4.8607421455164115, + "learning_rate": 9.00875331692201e-06, + "loss": 0.8127, + "step": 2805 + }, + { + "epoch": 0.23, + "grad_norm": 6.117314737617798, + "learning_rate": 9.00796704521638e-06, + "loss": 0.7689, + "step": 2806 + }, + { + "epoch": 0.23, + "grad_norm": 6.54933432155176, + "learning_rate": 9.007180496134193e-06, + "loss": 0.8096, + "step": 2807 + }, + { + "epoch": 0.23, + "grad_norm": 2.870586844463507, + "learning_rate": 9.006393669729885e-06, + "loss": 0.7061, + "step": 2808 + }, + { + "epoch": 0.23, + "grad_norm": 9.807046624109839, + "learning_rate": 9.005606566057908e-06, + "loss": 0.8141, + "step": 2809 + }, + { + "epoch": 0.23, + "grad_norm": 3.5340386115084863, + "learning_rate": 9.004819185172735e-06, + "loss": 0.8112, + "step": 2810 + }, + { + "epoch": 0.23, + "grad_norm": 4.063250975591392, + "learning_rate": 9.00403152712886e-06, + "loss": 0.83, + "step": 2811 + }, + { + "epoch": 0.23, + "grad_norm": 5.7301654516006355, + "learning_rate": 9.003243591980791e-06, + "loss": 0.6636, + "step": 2812 + }, + { + "epoch": 0.23, + "grad_norm": 10.976593614344091, + "learning_rate": 9.002455379783057e-06, + "loss": 0.782, + "step": 2813 + }, + { + "epoch": 0.23, + "grad_norm": 4.576247375781676, + "learning_rate": 9.00166689059021e-06, + "loss": 0.8233, + "step": 2814 + }, + { + "epoch": 0.23, + "grad_norm": 3.031119383054468, + "learning_rate": 9.00087812445682e-06, + "loss": 0.677, + "step": 2815 + }, + { + "epoch": 0.23, + "grad_norm": 5.283066300402596, + "learning_rate": 9.00008908143747e-06, + "loss": 0.6836, + "step": 2816 + }, + { + "epoch": 0.23, + "grad_norm": 5.876115062606732, + "learning_rate": 8.999299761586768e-06, + "loss": 0.744, + "step": 2817 + }, + { + "epoch": 0.23, + "grad_norm": 4.627007787847743, + "learning_rate": 8.998510164959344e-06, + "loss": 0.6885, + "step": 2818 + }, + { + "epoch": 0.23, + "grad_norm": 6.103590004582193, + "learning_rate": 8.997720291609837e-06, + "loss": 0.7772, + "step": 2819 + }, + { + "epoch": 0.23, + "grad_norm": 7.5357072138337475, + "learning_rate": 8.996930141592915e-06, + "loss": 0.6992, + "step": 2820 + }, + { + "epoch": 0.23, + "grad_norm": 6.1759140276645, + "learning_rate": 8.996139714963262e-06, + "loss": 0.682, + "step": 2821 + }, + { + "epoch": 0.23, + "grad_norm": 4.082556187949097, + "learning_rate": 8.995349011775577e-06, + "loss": 0.6829, + "step": 2822 + }, + { + "epoch": 0.23, + "grad_norm": 5.076183239269856, + "learning_rate": 8.994558032084583e-06, + "loss": 0.705, + "step": 2823 + }, + { + "epoch": 0.23, + "grad_norm": 7.688959602021805, + "learning_rate": 8.993766775945023e-06, + "loss": 0.6444, + "step": 2824 + }, + { + "epoch": 0.23, + "grad_norm": 4.582962583972632, + "learning_rate": 8.992975243411655e-06, + "loss": 0.8809, + "step": 2825 + }, + { + "epoch": 0.23, + "grad_norm": 5.716913247496087, + "learning_rate": 8.992183434539257e-06, + "loss": 0.7502, + "step": 2826 + }, + { + "epoch": 0.23, + "grad_norm": 9.799116045158838, + "learning_rate": 8.99139134938263e-06, + "loss": 0.81, + "step": 2827 + }, + { + "epoch": 0.23, + "grad_norm": 5.775335489654277, + "learning_rate": 8.99059898799659e-06, + "loss": 0.8494, + "step": 2828 + }, + { + "epoch": 0.23, + "grad_norm": 4.664086288782057, + "learning_rate": 8.989806350435972e-06, + "loss": 0.6815, + "step": 2829 + }, + { + "epoch": 0.23, + "grad_norm": 7.566274278442036, + "learning_rate": 8.989013436755633e-06, + "loss": 0.6916, + "step": 2830 + }, + { + "epoch": 0.23, + "grad_norm": 8.41029249038631, + "learning_rate": 8.988220247010448e-06, + "loss": 0.7804, + "step": 2831 + }, + { + "epoch": 0.23, + "grad_norm": 13.63422625877445, + "learning_rate": 8.987426781255309e-06, + "loss": 0.7555, + "step": 2832 + }, + { + "epoch": 0.23, + "grad_norm": 3.968314691032537, + "learning_rate": 8.98663303954513e-06, + "loss": 0.6407, + "step": 2833 + }, + { + "epoch": 0.23, + "grad_norm": 4.576776974928141, + "learning_rate": 8.985839021934843e-06, + "loss": 0.5905, + "step": 2834 + }, + { + "epoch": 0.23, + "grad_norm": 10.79745550370275, + "learning_rate": 8.9850447284794e-06, + "loss": 0.7459, + "step": 2835 + }, + { + "epoch": 0.23, + "grad_norm": 4.140438408403593, + "learning_rate": 8.984250159233767e-06, + "loss": 0.7428, + "step": 2836 + }, + { + "epoch": 0.23, + "grad_norm": 10.106899792737101, + "learning_rate": 8.983455314252938e-06, + "loss": 0.685, + "step": 2837 + }, + { + "epoch": 0.23, + "grad_norm": 3.2763532351510856, + "learning_rate": 8.98266019359192e-06, + "loss": 0.7423, + "step": 2838 + }, + { + "epoch": 0.23, + "grad_norm": 3.7270712753211215, + "learning_rate": 8.981864797305738e-06, + "loss": 0.8173, + "step": 2839 + }, + { + "epoch": 0.23, + "grad_norm": 5.818057949924994, + "learning_rate": 8.981069125449442e-06, + "loss": 0.6716, + "step": 2840 + }, + { + "epoch": 0.23, + "grad_norm": 10.871715122164675, + "learning_rate": 8.980273178078093e-06, + "loss": 0.6722, + "step": 2841 + }, + { + "epoch": 0.23, + "grad_norm": 4.17038667688079, + "learning_rate": 8.97947695524678e-06, + "loss": 0.8945, + "step": 2842 + }, + { + "epoch": 0.23, + "grad_norm": 3.0193063113500447, + "learning_rate": 8.978680457010604e-06, + "loss": 0.6195, + "step": 2843 + }, + { + "epoch": 0.23, + "grad_norm": 5.12050083063855, + "learning_rate": 8.977883683424689e-06, + "loss": 0.7538, + "step": 2844 + }, + { + "epoch": 0.23, + "grad_norm": 10.17477056381511, + "learning_rate": 8.977086634544176e-06, + "loss": 0.607, + "step": 2845 + }, + { + "epoch": 0.23, + "grad_norm": 4.778498388808307, + "learning_rate": 8.976289310424227e-06, + "loss": 0.8404, + "step": 2846 + }, + { + "epoch": 0.23, + "grad_norm": 10.725442519310553, + "learning_rate": 8.97549171112002e-06, + "loss": 0.6572, + "step": 2847 + }, + { + "epoch": 0.23, + "grad_norm": 5.798472030578463, + "learning_rate": 8.974693836686755e-06, + "loss": 0.7007, + "step": 2848 + }, + { + "epoch": 0.23, + "grad_norm": 3.3678798962129974, + "learning_rate": 8.97389568717965e-06, + "loss": 0.6143, + "step": 2849 + }, + { + "epoch": 0.23, + "grad_norm": 5.005002754914163, + "learning_rate": 8.973097262653942e-06, + "loss": 0.7606, + "step": 2850 + }, + { + "epoch": 0.23, + "grad_norm": 6.6915528397723545, + "learning_rate": 8.972298563164886e-06, + "loss": 0.7101, + "step": 2851 + }, + { + "epoch": 0.23, + "grad_norm": 2.7458693390509934, + "learning_rate": 8.971499588767758e-06, + "loss": 0.7373, + "step": 2852 + }, + { + "epoch": 0.23, + "grad_norm": 3.2718898789834117, + "learning_rate": 8.970700339517853e-06, + "loss": 0.7791, + "step": 2853 + }, + { + "epoch": 0.23, + "grad_norm": 8.39732457615963, + "learning_rate": 8.96990081547048e-06, + "loss": 0.7041, + "step": 2854 + }, + { + "epoch": 0.23, + "grad_norm": 3.1767190404039245, + "learning_rate": 8.969101016680977e-06, + "loss": 0.6039, + "step": 2855 + }, + { + "epoch": 0.23, + "grad_norm": 3.7462376424204877, + "learning_rate": 8.96830094320469e-06, + "loss": 0.8686, + "step": 2856 + }, + { + "epoch": 0.23, + "grad_norm": 5.759937543816016, + "learning_rate": 8.967500595096994e-06, + "loss": 0.8381, + "step": 2857 + }, + { + "epoch": 0.23, + "grad_norm": 4.950413113827954, + "learning_rate": 8.966699972413274e-06, + "loss": 0.6799, + "step": 2858 + }, + { + "epoch": 0.23, + "grad_norm": 4.7334578789563, + "learning_rate": 8.965899075208939e-06, + "loss": 0.6635, + "step": 2859 + }, + { + "epoch": 0.23, + "grad_norm": 7.772266338772254, + "learning_rate": 8.965097903539416e-06, + "loss": 0.6693, + "step": 2860 + }, + { + "epoch": 0.23, + "grad_norm": 3.256297534265736, + "learning_rate": 8.964296457460152e-06, + "loss": 0.8322, + "step": 2861 + }, + { + "epoch": 0.23, + "grad_norm": 3.5487616498954755, + "learning_rate": 8.963494737026612e-06, + "loss": 0.7519, + "step": 2862 + }, + { + "epoch": 0.23, + "grad_norm": 8.344681829474355, + "learning_rate": 8.96269274229428e-06, + "loss": 0.7004, + "step": 2863 + }, + { + "epoch": 0.23, + "grad_norm": 8.468238635543933, + "learning_rate": 8.96189047331866e-06, + "loss": 0.8048, + "step": 2864 + }, + { + "epoch": 0.23, + "grad_norm": 4.781805468608929, + "learning_rate": 8.961087930155273e-06, + "loss": 0.8814, + "step": 2865 + }, + { + "epoch": 0.23, + "grad_norm": 7.215505993559402, + "learning_rate": 8.96028511285966e-06, + "loss": 0.7634, + "step": 2866 + }, + { + "epoch": 0.23, + "grad_norm": 4.2756190959970235, + "learning_rate": 8.95948202148738e-06, + "loss": 0.7938, + "step": 2867 + }, + { + "epoch": 0.23, + "grad_norm": 4.094251268260469, + "learning_rate": 8.958678656094016e-06, + "loss": 0.8088, + "step": 2868 + }, + { + "epoch": 0.23, + "grad_norm": 6.077152939904661, + "learning_rate": 8.95787501673516e-06, + "loss": 0.7624, + "step": 2869 + }, + { + "epoch": 0.23, + "grad_norm": 3.7708214250193173, + "learning_rate": 8.957071103466433e-06, + "loss": 0.898, + "step": 2870 + }, + { + "epoch": 0.23, + "grad_norm": 5.349001255416674, + "learning_rate": 8.95626691634347e-06, + "loss": 0.6955, + "step": 2871 + }, + { + "epoch": 0.23, + "grad_norm": 12.180066712533977, + "learning_rate": 8.955462455421927e-06, + "loss": 0.815, + "step": 2872 + }, + { + "epoch": 0.23, + "grad_norm": 4.115698496278884, + "learning_rate": 8.954657720757474e-06, + "loss": 0.7697, + "step": 2873 + }, + { + "epoch": 0.23, + "grad_norm": 2.9431212575479817, + "learning_rate": 8.953852712405808e-06, + "loss": 0.6371, + "step": 2874 + }, + { + "epoch": 0.23, + "grad_norm": 41.665402554757456, + "learning_rate": 8.953047430422637e-06, + "loss": 0.6509, + "step": 2875 + }, + { + "epoch": 0.23, + "grad_norm": 7.26980610478533, + "learning_rate": 8.952241874863695e-06, + "loss": 0.7843, + "step": 2876 + }, + { + "epoch": 0.23, + "grad_norm": 11.2514832443559, + "learning_rate": 8.95143604578473e-06, + "loss": 0.6886, + "step": 2877 + }, + { + "epoch": 0.23, + "grad_norm": 5.650493882294282, + "learning_rate": 8.950629943241509e-06, + "loss": 0.7846, + "step": 2878 + }, + { + "epoch": 0.23, + "grad_norm": 3.2060699417501985, + "learning_rate": 8.94982356728982e-06, + "loss": 0.6502, + "step": 2879 + }, + { + "epoch": 0.23, + "grad_norm": 7.744698136827544, + "learning_rate": 8.94901691798547e-06, + "loss": 0.6769, + "step": 2880 + }, + { + "epoch": 0.23, + "grad_norm": 3.3738188460740184, + "learning_rate": 8.948209995384288e-06, + "loss": 0.743, + "step": 2881 + }, + { + "epoch": 0.23, + "grad_norm": 3.208656472987335, + "learning_rate": 8.947402799542111e-06, + "loss": 0.71, + "step": 2882 + }, + { + "epoch": 0.23, + "grad_norm": 4.9103193498062545, + "learning_rate": 8.946595330514807e-06, + "loss": 0.7137, + "step": 2883 + }, + { + "epoch": 0.23, + "grad_norm": 2.8310233675017726, + "learning_rate": 8.945787588358255e-06, + "loss": 0.6973, + "step": 2884 + }, + { + "epoch": 0.23, + "grad_norm": 4.795397985986916, + "learning_rate": 8.944979573128358e-06, + "loss": 0.5901, + "step": 2885 + }, + { + "epoch": 0.23, + "grad_norm": 3.787550220829312, + "learning_rate": 8.944171284881035e-06, + "loss": 0.8325, + "step": 2886 + }, + { + "epoch": 0.23, + "grad_norm": 2.8658509970749644, + "learning_rate": 8.943362723672225e-06, + "loss": 0.7268, + "step": 2887 + }, + { + "epoch": 0.23, + "grad_norm": 4.1679513736456935, + "learning_rate": 8.942553889557883e-06, + "loss": 0.7348, + "step": 2888 + }, + { + "epoch": 0.23, + "grad_norm": 4.023436653729692, + "learning_rate": 8.941744782593989e-06, + "loss": 0.6086, + "step": 2889 + }, + { + "epoch": 0.23, + "grad_norm": 3.1176196767647446, + "learning_rate": 8.940935402836535e-06, + "loss": 0.7208, + "step": 2890 + }, + { + "epoch": 0.23, + "grad_norm": 3.877078572220708, + "learning_rate": 8.940125750341539e-06, + "loss": 0.6787, + "step": 2891 + }, + { + "epoch": 0.23, + "grad_norm": 4.580160887932237, + "learning_rate": 8.939315825165032e-06, + "loss": 0.746, + "step": 2892 + }, + { + "epoch": 0.23, + "grad_norm": 3.9396391316635353, + "learning_rate": 8.938505627363065e-06, + "loss": 0.6851, + "step": 2893 + }, + { + "epoch": 0.24, + "grad_norm": 3.2973651654376344, + "learning_rate": 8.937695156991711e-06, + "loss": 0.7109, + "step": 2894 + }, + { + "epoch": 0.24, + "grad_norm": 4.751102448978402, + "learning_rate": 8.936884414107056e-06, + "loss": 0.7315, + "step": 2895 + }, + { + "epoch": 0.24, + "grad_norm": 3.631403945039312, + "learning_rate": 8.936073398765212e-06, + "loss": 0.6349, + "step": 2896 + }, + { + "epoch": 0.24, + "grad_norm": 4.536280341361928, + "learning_rate": 8.935262111022306e-06, + "loss": 0.8574, + "step": 2897 + }, + { + "epoch": 0.24, + "grad_norm": 3.6673633731076545, + "learning_rate": 8.934450550934483e-06, + "loss": 0.6901, + "step": 2898 + }, + { + "epoch": 0.24, + "grad_norm": 3.7937151047372013, + "learning_rate": 8.933638718557908e-06, + "loss": 0.6662, + "step": 2899 + }, + { + "epoch": 0.24, + "grad_norm": 3.102159245358778, + "learning_rate": 8.932826613948767e-06, + "loss": 0.6885, + "step": 2900 + }, + { + "epoch": 0.24, + "grad_norm": 3.9685981120214713, + "learning_rate": 8.932014237163259e-06, + "loss": 0.7894, + "step": 2901 + }, + { + "epoch": 0.24, + "grad_norm": 3.3082226075129015, + "learning_rate": 8.931201588257609e-06, + "loss": 0.9473, + "step": 2902 + }, + { + "epoch": 0.24, + "grad_norm": 3.7680416084117185, + "learning_rate": 8.930388667288055e-06, + "loss": 0.5511, + "step": 2903 + }, + { + "epoch": 0.24, + "grad_norm": 3.0563907102177894, + "learning_rate": 8.92957547431086e-06, + "loss": 0.7402, + "step": 2904 + }, + { + "epoch": 0.24, + "grad_norm": 4.761138966287189, + "learning_rate": 8.928762009382297e-06, + "loss": 0.5399, + "step": 2905 + }, + { + "epoch": 0.24, + "grad_norm": 5.285451266079011, + "learning_rate": 8.927948272558666e-06, + "loss": 0.7228, + "step": 2906 + }, + { + "epoch": 0.24, + "grad_norm": 4.714409275034327, + "learning_rate": 8.927134263896284e-06, + "loss": 0.7647, + "step": 2907 + }, + { + "epoch": 0.24, + "grad_norm": 4.049724201601192, + "learning_rate": 8.926319983451481e-06, + "loss": 0.8375, + "step": 2908 + }, + { + "epoch": 0.24, + "grad_norm": 4.3866914038557345, + "learning_rate": 8.925505431280615e-06, + "loss": 0.8092, + "step": 2909 + }, + { + "epoch": 0.24, + "grad_norm": 3.2641281404544316, + "learning_rate": 8.924690607440055e-06, + "loss": 0.6129, + "step": 2910 + }, + { + "epoch": 0.24, + "grad_norm": 4.199554759709983, + "learning_rate": 8.923875511986193e-06, + "loss": 0.6647, + "step": 2911 + }, + { + "epoch": 0.24, + "grad_norm": 5.216177254280003, + "learning_rate": 8.92306014497544e-06, + "loss": 0.6786, + "step": 2912 + }, + { + "epoch": 0.24, + "grad_norm": 4.739850240580817, + "learning_rate": 8.92224450646422e-06, + "loss": 0.7789, + "step": 2913 + }, + { + "epoch": 0.24, + "grad_norm": 3.751931414325418, + "learning_rate": 8.92142859650899e-06, + "loss": 0.7805, + "step": 2914 + }, + { + "epoch": 0.24, + "grad_norm": 3.418777866813581, + "learning_rate": 8.920612415166206e-06, + "loss": 0.7217, + "step": 2915 + }, + { + "epoch": 0.24, + "grad_norm": 4.049103202621135, + "learning_rate": 8.919795962492354e-06, + "loss": 0.7773, + "step": 2916 + }, + { + "epoch": 0.24, + "grad_norm": 25.230412440481512, + "learning_rate": 8.918979238543944e-06, + "loss": 0.6934, + "step": 2917 + }, + { + "epoch": 0.24, + "grad_norm": 4.160417945895964, + "learning_rate": 8.918162243377494e-06, + "loss": 0.8345, + "step": 2918 + }, + { + "epoch": 0.24, + "grad_norm": 4.818048401033922, + "learning_rate": 8.917344977049546e-06, + "loss": 0.8726, + "step": 2919 + }, + { + "epoch": 0.24, + "grad_norm": 8.433136039723975, + "learning_rate": 8.91652743961666e-06, + "loss": 0.5962, + "step": 2920 + }, + { + "epoch": 0.24, + "grad_norm": 2.9957261984543777, + "learning_rate": 8.915709631135414e-06, + "loss": 0.8819, + "step": 2921 + }, + { + "epoch": 0.24, + "grad_norm": 5.653362698214063, + "learning_rate": 8.914891551662406e-06, + "loss": 0.695, + "step": 2922 + }, + { + "epoch": 0.24, + "grad_norm": 8.198581345468392, + "learning_rate": 8.914073201254253e-06, + "loss": 0.8013, + "step": 2923 + }, + { + "epoch": 0.24, + "grad_norm": 7.2519737607953605, + "learning_rate": 8.91325457996759e-06, + "loss": 0.8766, + "step": 2924 + }, + { + "epoch": 0.24, + "grad_norm": 4.284678448884324, + "learning_rate": 8.912435687859068e-06, + "loss": 0.8102, + "step": 2925 + }, + { + "epoch": 0.24, + "grad_norm": 4.972797496955421, + "learning_rate": 8.911616524985364e-06, + "loss": 0.7795, + "step": 2926 + }, + { + "epoch": 0.24, + "grad_norm": 4.320373873162612, + "learning_rate": 8.910797091403166e-06, + "loss": 0.6979, + "step": 2927 + }, + { + "epoch": 0.24, + "grad_norm": 3.4464603496627686, + "learning_rate": 8.909977387169185e-06, + "loss": 0.6842, + "step": 2928 + }, + { + "epoch": 0.24, + "grad_norm": 6.110748688570022, + "learning_rate": 8.90915741234015e-06, + "loss": 0.6581, + "step": 2929 + }, + { + "epoch": 0.24, + "grad_norm": 8.03880441712925, + "learning_rate": 8.908337166972807e-06, + "loss": 0.7596, + "step": 2930 + }, + { + "epoch": 0.24, + "grad_norm": 3.2612986682443843, + "learning_rate": 8.907516651123925e-06, + "loss": 0.7736, + "step": 2931 + }, + { + "epoch": 0.24, + "grad_norm": 3.9220094671982237, + "learning_rate": 8.906695864850284e-06, + "loss": 0.8371, + "step": 2932 + }, + { + "epoch": 0.24, + "grad_norm": 8.216047649079009, + "learning_rate": 8.905874808208692e-06, + "loss": 0.6946, + "step": 2933 + }, + { + "epoch": 0.24, + "grad_norm": 4.477608413845603, + "learning_rate": 8.90505348125597e-06, + "loss": 0.6315, + "step": 2934 + }, + { + "epoch": 0.24, + "grad_norm": 3.1300250509081033, + "learning_rate": 8.90423188404896e-06, + "loss": 0.7536, + "step": 2935 + }, + { + "epoch": 0.24, + "grad_norm": 2.7445509532532855, + "learning_rate": 8.903410016644518e-06, + "loss": 0.7202, + "step": 2936 + }, + { + "epoch": 0.24, + "grad_norm": 6.176002590916825, + "learning_rate": 8.902587879099527e-06, + "loss": 0.8109, + "step": 2937 + }, + { + "epoch": 0.24, + "grad_norm": 2.7247398817132495, + "learning_rate": 8.901765471470882e-06, + "loss": 0.4841, + "step": 2938 + }, + { + "epoch": 0.24, + "grad_norm": 4.09224987020506, + "learning_rate": 8.900942793815498e-06, + "loss": 0.7496, + "step": 2939 + }, + { + "epoch": 0.24, + "grad_norm": 4.7155817858854885, + "learning_rate": 8.90011984619031e-06, + "loss": 0.7336, + "step": 2940 + }, + { + "epoch": 0.24, + "grad_norm": 2.7569497434953756, + "learning_rate": 8.899296628652272e-06, + "loss": 0.5279, + "step": 2941 + }, + { + "epoch": 0.24, + "grad_norm": 14.99718403013933, + "learning_rate": 8.898473141258356e-06, + "loss": 0.7909, + "step": 2942 + }, + { + "epoch": 0.24, + "grad_norm": 7.3153697290077995, + "learning_rate": 8.897649384065552e-06, + "loss": 0.713, + "step": 2943 + }, + { + "epoch": 0.24, + "grad_norm": 4.577070504939822, + "learning_rate": 8.896825357130867e-06, + "loss": 0.4662, + "step": 2944 + }, + { + "epoch": 0.24, + "grad_norm": 6.32892176040599, + "learning_rate": 8.896001060511333e-06, + "loss": 0.8103, + "step": 2945 + }, + { + "epoch": 0.24, + "grad_norm": 6.723451276861912, + "learning_rate": 8.895176494263993e-06, + "loss": 0.745, + "step": 2946 + }, + { + "epoch": 0.24, + "grad_norm": 4.524435486440204, + "learning_rate": 8.894351658445913e-06, + "loss": 0.8632, + "step": 2947 + }, + { + "epoch": 0.24, + "grad_norm": 5.012111949529782, + "learning_rate": 8.893526553114178e-06, + "loss": 0.7762, + "step": 2948 + }, + { + "epoch": 0.24, + "grad_norm": 5.983253921214527, + "learning_rate": 8.89270117832589e-06, + "loss": 0.8027, + "step": 2949 + }, + { + "epoch": 0.24, + "grad_norm": 3.6157778015799646, + "learning_rate": 8.89187553413817e-06, + "loss": 0.7658, + "step": 2950 + }, + { + "epoch": 0.24, + "grad_norm": 7.014843217231772, + "learning_rate": 8.891049620608158e-06, + "loss": 0.7264, + "step": 2951 + }, + { + "epoch": 0.24, + "grad_norm": 3.916985475158266, + "learning_rate": 8.890223437793012e-06, + "loss": 0.7381, + "step": 2952 + }, + { + "epoch": 0.24, + "grad_norm": 5.101418337055632, + "learning_rate": 8.889396985749909e-06, + "loss": 0.8047, + "step": 2953 + }, + { + "epoch": 0.24, + "grad_norm": 30.985538830399083, + "learning_rate": 8.888570264536046e-06, + "loss": 0.7548, + "step": 2954 + }, + { + "epoch": 0.24, + "grad_norm": 4.99969946714301, + "learning_rate": 8.887743274208635e-06, + "loss": 0.7697, + "step": 2955 + }, + { + "epoch": 0.24, + "grad_norm": 3.6278033608690445, + "learning_rate": 8.886916014824911e-06, + "loss": 0.7013, + "step": 2956 + }, + { + "epoch": 0.24, + "grad_norm": 4.221829947321943, + "learning_rate": 8.886088486442124e-06, + "loss": 0.8106, + "step": 2957 + }, + { + "epoch": 0.24, + "grad_norm": 5.855650165755511, + "learning_rate": 8.885260689117546e-06, + "loss": 0.8342, + "step": 2958 + }, + { + "epoch": 0.24, + "grad_norm": 4.8751913851337845, + "learning_rate": 8.884432622908463e-06, + "loss": 0.6958, + "step": 2959 + }, + { + "epoch": 0.24, + "grad_norm": 26.064569563907508, + "learning_rate": 8.883604287872186e-06, + "loss": 0.6542, + "step": 2960 + }, + { + "epoch": 0.24, + "grad_norm": 4.164135301125337, + "learning_rate": 8.882775684066037e-06, + "loss": 0.6284, + "step": 2961 + }, + { + "epoch": 0.24, + "grad_norm": 4.322623650020969, + "learning_rate": 8.881946811547364e-06, + "loss": 0.742, + "step": 2962 + }, + { + "epoch": 0.24, + "grad_norm": 4.151084763430534, + "learning_rate": 8.881117670373528e-06, + "loss": 0.8275, + "step": 2963 + }, + { + "epoch": 0.24, + "grad_norm": 12.26965996564063, + "learning_rate": 8.880288260601913e-06, + "loss": 0.6818, + "step": 2964 + }, + { + "epoch": 0.24, + "grad_norm": 3.5077286971251516, + "learning_rate": 8.879458582289917e-06, + "loss": 0.6596, + "step": 2965 + }, + { + "epoch": 0.24, + "grad_norm": 4.730617930444589, + "learning_rate": 8.878628635494961e-06, + "loss": 0.8271, + "step": 2966 + }, + { + "epoch": 0.24, + "grad_norm": 5.679268827529147, + "learning_rate": 8.87779842027448e-06, + "loss": 0.7606, + "step": 2967 + }, + { + "epoch": 0.24, + "grad_norm": 3.663158013441438, + "learning_rate": 8.876967936685933e-06, + "loss": 0.7634, + "step": 2968 + }, + { + "epoch": 0.24, + "grad_norm": 4.017154311013711, + "learning_rate": 8.876137184786793e-06, + "loss": 0.6235, + "step": 2969 + }, + { + "epoch": 0.24, + "grad_norm": 6.613746315417937, + "learning_rate": 8.875306164634554e-06, + "loss": 0.8739, + "step": 2970 + }, + { + "epoch": 0.24, + "grad_norm": 12.061884813730193, + "learning_rate": 8.874474876286728e-06, + "loss": 0.7142, + "step": 2971 + }, + { + "epoch": 0.24, + "grad_norm": 3.1380666794402345, + "learning_rate": 8.873643319800842e-06, + "loss": 0.667, + "step": 2972 + }, + { + "epoch": 0.24, + "grad_norm": 4.061328281557785, + "learning_rate": 8.872811495234451e-06, + "loss": 0.6998, + "step": 2973 + }, + { + "epoch": 0.24, + "grad_norm": 3.7872838144424077, + "learning_rate": 8.871979402645116e-06, + "loss": 0.7295, + "step": 2974 + }, + { + "epoch": 0.24, + "grad_norm": 5.767980903179702, + "learning_rate": 8.871147042090428e-06, + "loss": 0.7298, + "step": 2975 + }, + { + "epoch": 0.24, + "grad_norm": 3.471594681483766, + "learning_rate": 8.870314413627991e-06, + "loss": 0.5385, + "step": 2976 + }, + { + "epoch": 0.24, + "grad_norm": 4.0531376937887655, + "learning_rate": 8.869481517315427e-06, + "loss": 0.6623, + "step": 2977 + }, + { + "epoch": 0.24, + "grad_norm": 19.57774768012508, + "learning_rate": 8.868648353210377e-06, + "loss": 0.7447, + "step": 2978 + }, + { + "epoch": 0.24, + "grad_norm": 2.809809088960798, + "learning_rate": 8.867814921370502e-06, + "loss": 0.783, + "step": 2979 + }, + { + "epoch": 0.24, + "grad_norm": 6.083747142893606, + "learning_rate": 8.866981221853482e-06, + "loss": 0.7387, + "step": 2980 + }, + { + "epoch": 0.24, + "grad_norm": 4.296696723523334, + "learning_rate": 8.86614725471701e-06, + "loss": 0.8645, + "step": 2981 + }, + { + "epoch": 0.24, + "grad_norm": 6.718845781139819, + "learning_rate": 8.865313020018806e-06, + "loss": 0.7093, + "step": 2982 + }, + { + "epoch": 0.24, + "grad_norm": 3.94522223345727, + "learning_rate": 8.864478517816604e-06, + "loss": 0.7054, + "step": 2983 + }, + { + "epoch": 0.24, + "grad_norm": 3.743995145549366, + "learning_rate": 8.863643748168156e-06, + "loss": 0.6699, + "step": 2984 + }, + { + "epoch": 0.24, + "grad_norm": 4.678538963630396, + "learning_rate": 8.862808711131232e-06, + "loss": 0.7078, + "step": 2985 + }, + { + "epoch": 0.24, + "grad_norm": 4.512166415645682, + "learning_rate": 8.861973406763623e-06, + "loss": 0.7259, + "step": 2986 + }, + { + "epoch": 0.24, + "grad_norm": 3.308639076101463, + "learning_rate": 8.861137835123137e-06, + "loss": 0.8633, + "step": 2987 + }, + { + "epoch": 0.24, + "grad_norm": 3.1698731790083743, + "learning_rate": 8.860301996267601e-06, + "loss": 0.7326, + "step": 2988 + }, + { + "epoch": 0.24, + "grad_norm": 5.7846675006469885, + "learning_rate": 8.859465890254861e-06, + "loss": 0.8694, + "step": 2989 + }, + { + "epoch": 0.24, + "grad_norm": 3.878481918281038, + "learning_rate": 8.85862951714278e-06, + "loss": 0.7486, + "step": 2990 + }, + { + "epoch": 0.24, + "grad_norm": 6.233066220053629, + "learning_rate": 8.857792876989241e-06, + "loss": 0.9758, + "step": 2991 + }, + { + "epoch": 0.24, + "grad_norm": 8.267107087713777, + "learning_rate": 8.856955969852144e-06, + "loss": 0.6349, + "step": 2992 + }, + { + "epoch": 0.24, + "grad_norm": 4.16452610056301, + "learning_rate": 8.856118795789408e-06, + "loss": 0.6864, + "step": 2993 + }, + { + "epoch": 0.24, + "grad_norm": 4.079853891757035, + "learning_rate": 8.85528135485897e-06, + "loss": 0.8241, + "step": 2994 + }, + { + "epoch": 0.24, + "grad_norm": 2.452062790799971, + "learning_rate": 8.85444364711879e-06, + "loss": 0.648, + "step": 2995 + }, + { + "epoch": 0.24, + "grad_norm": 5.035225754085019, + "learning_rate": 8.853605672626839e-06, + "loss": 0.6989, + "step": 2996 + }, + { + "epoch": 0.24, + "grad_norm": 2.9659680279723117, + "learning_rate": 8.852767431441111e-06, + "loss": 0.6898, + "step": 2997 + }, + { + "epoch": 0.24, + "grad_norm": 3.1360219037988175, + "learning_rate": 8.851928923619617e-06, + "loss": 0.801, + "step": 2998 + }, + { + "epoch": 0.24, + "grad_norm": 6.939176580779341, + "learning_rate": 8.85109014922039e-06, + "loss": 0.7802, + "step": 2999 + }, + { + "epoch": 0.24, + "grad_norm": 8.694564045091749, + "learning_rate": 8.850251108301473e-06, + "loss": 0.6329, + "step": 3000 + }, + { + "epoch": 0.24, + "grad_norm": 6.227734515933598, + "learning_rate": 8.849411800920938e-06, + "loss": 0.5915, + "step": 3001 + }, + { + "epoch": 0.24, + "grad_norm": 3.841397543367415, + "learning_rate": 8.848572227136869e-06, + "loss": 0.4682, + "step": 3002 + }, + { + "epoch": 0.24, + "grad_norm": 2.902602930789638, + "learning_rate": 8.847732387007369e-06, + "loss": 0.6879, + "step": 3003 + }, + { + "epoch": 0.24, + "grad_norm": 7.734585232478385, + "learning_rate": 8.84689228059056e-06, + "loss": 0.8703, + "step": 3004 + }, + { + "epoch": 0.24, + "grad_norm": 3.2350269726057577, + "learning_rate": 8.846051907944582e-06, + "loss": 0.7505, + "step": 3005 + }, + { + "epoch": 0.24, + "grad_norm": 3.9041982976320346, + "learning_rate": 8.845211269127597e-06, + "loss": 0.7281, + "step": 3006 + }, + { + "epoch": 0.24, + "grad_norm": 7.260643312028547, + "learning_rate": 8.844370364197781e-06, + "loss": 0.7002, + "step": 3007 + }, + { + "epoch": 0.24, + "grad_norm": 5.850802894543463, + "learning_rate": 8.843529193213327e-06, + "loss": 0.7696, + "step": 3008 + }, + { + "epoch": 0.24, + "grad_norm": 3.5835452071859226, + "learning_rate": 8.842687756232454e-06, + "loss": 0.8905, + "step": 3009 + }, + { + "epoch": 0.24, + "grad_norm": 5.366428063749266, + "learning_rate": 8.841846053313392e-06, + "loss": 0.846, + "step": 3010 + }, + { + "epoch": 0.24, + "grad_norm": 4.557842788932391, + "learning_rate": 8.841004084514394e-06, + "loss": 0.6322, + "step": 3011 + }, + { + "epoch": 0.24, + "grad_norm": 3.611465150609721, + "learning_rate": 8.840161849893729e-06, + "loss": 0.8319, + "step": 3012 + }, + { + "epoch": 0.24, + "grad_norm": 3.948823286750815, + "learning_rate": 8.839319349509683e-06, + "loss": 0.6801, + "step": 3013 + }, + { + "epoch": 0.24, + "grad_norm": 3.4502409186427605, + "learning_rate": 8.838476583420562e-06, + "loss": 0.68, + "step": 3014 + }, + { + "epoch": 0.24, + "grad_norm": 13.732684267483572, + "learning_rate": 8.837633551684695e-06, + "loss": 0.8244, + "step": 3015 + }, + { + "epoch": 0.24, + "grad_norm": 4.5115760396705475, + "learning_rate": 8.83679025436042e-06, + "loss": 0.762, + "step": 3016 + }, + { + "epoch": 0.25, + "grad_norm": 4.407557344691762, + "learning_rate": 8.835946691506103e-06, + "loss": 0.805, + "step": 3017 + }, + { + "epoch": 0.25, + "grad_norm": 6.606794455494425, + "learning_rate": 8.835102863180123e-06, + "loss": 0.7429, + "step": 3018 + }, + { + "epoch": 0.25, + "grad_norm": 5.271796741961873, + "learning_rate": 8.834258769440875e-06, + "loss": 0.5733, + "step": 3019 + }, + { + "epoch": 0.25, + "grad_norm": 5.009153662451588, + "learning_rate": 8.833414410346777e-06, + "loss": 0.7196, + "step": 3020 + }, + { + "epoch": 0.25, + "grad_norm": 3.5986463336224723, + "learning_rate": 8.832569785956267e-06, + "loss": 0.7625, + "step": 3021 + }, + { + "epoch": 0.25, + "grad_norm": 5.645621611539132, + "learning_rate": 8.831724896327794e-06, + "loss": 0.7989, + "step": 3022 + }, + { + "epoch": 0.25, + "grad_norm": 7.344755117388516, + "learning_rate": 8.830879741519831e-06, + "loss": 0.7602, + "step": 3023 + }, + { + "epoch": 0.25, + "grad_norm": 3.009304169840582, + "learning_rate": 8.830034321590871e-06, + "loss": 0.7909, + "step": 3024 + }, + { + "epoch": 0.25, + "grad_norm": 3.839214190877482, + "learning_rate": 8.82918863659942e-06, + "loss": 0.7278, + "step": 3025 + }, + { + "epoch": 0.25, + "grad_norm": 4.697837699981018, + "learning_rate": 8.828342686604004e-06, + "loss": 0.7607, + "step": 3026 + }, + { + "epoch": 0.25, + "grad_norm": 18.52590714108115, + "learning_rate": 8.827496471663169e-06, + "loss": 0.799, + "step": 3027 + }, + { + "epoch": 0.25, + "grad_norm": 3.5046475953453884, + "learning_rate": 8.826649991835476e-06, + "loss": 0.8405, + "step": 3028 + }, + { + "epoch": 0.25, + "grad_norm": 3.3402376933912796, + "learning_rate": 8.825803247179512e-06, + "loss": 0.6462, + "step": 3029 + }, + { + "epoch": 0.25, + "grad_norm": 3.794294837183767, + "learning_rate": 8.824956237753872e-06, + "loss": 0.7486, + "step": 3030 + }, + { + "epoch": 0.25, + "grad_norm": 4.738082352840565, + "learning_rate": 8.824108963617177e-06, + "loss": 0.5948, + "step": 3031 + }, + { + "epoch": 0.25, + "grad_norm": 9.682229478473296, + "learning_rate": 8.823261424828064e-06, + "loss": 0.7001, + "step": 3032 + }, + { + "epoch": 0.25, + "grad_norm": 2.9608653386945707, + "learning_rate": 8.822413621445188e-06, + "loss": 0.8597, + "step": 3033 + }, + { + "epoch": 0.25, + "grad_norm": 5.439436375748, + "learning_rate": 8.821565553527218e-06, + "loss": 0.7269, + "step": 3034 + }, + { + "epoch": 0.25, + "grad_norm": 5.9023024152372505, + "learning_rate": 8.820717221132854e-06, + "loss": 0.8707, + "step": 3035 + }, + { + "epoch": 0.25, + "grad_norm": 10.05615509544385, + "learning_rate": 8.819868624320797e-06, + "loss": 0.7602, + "step": 3036 + }, + { + "epoch": 0.25, + "grad_norm": 2.785187791692384, + "learning_rate": 8.81901976314978e-06, + "loss": 0.745, + "step": 3037 + }, + { + "epoch": 0.25, + "grad_norm": 2.870028343890995, + "learning_rate": 8.818170637678549e-06, + "loss": 0.6221, + "step": 3038 + }, + { + "epoch": 0.25, + "grad_norm": 3.532454302973348, + "learning_rate": 8.817321247965872e-06, + "loss": 0.6828, + "step": 3039 + }, + { + "epoch": 0.25, + "grad_norm": 6.307726204717456, + "learning_rate": 8.816471594070523e-06, + "loss": 0.6429, + "step": 3040 + }, + { + "epoch": 0.25, + "grad_norm": 3.1197549955973014, + "learning_rate": 8.815621676051313e-06, + "loss": 0.6803, + "step": 3041 + }, + { + "epoch": 0.25, + "grad_norm": 5.556611672109788, + "learning_rate": 8.814771493967058e-06, + "loss": 0.8512, + "step": 3042 + }, + { + "epoch": 0.25, + "grad_norm": 3.583272860994243, + "learning_rate": 8.813921047876595e-06, + "loss": 0.5224, + "step": 3043 + }, + { + "epoch": 0.25, + "grad_norm": 3.086527891586726, + "learning_rate": 8.813070337838781e-06, + "loss": 0.6818, + "step": 3044 + }, + { + "epoch": 0.25, + "grad_norm": 2.92353885695445, + "learning_rate": 8.81221936391249e-06, + "loss": 0.7864, + "step": 3045 + }, + { + "epoch": 0.25, + "grad_norm": 2.9551810477045874, + "learning_rate": 8.811368126156615e-06, + "loss": 0.8103, + "step": 3046 + }, + { + "epoch": 0.25, + "grad_norm": 3.9291030248161487, + "learning_rate": 8.81051662463007e-06, + "loss": 0.6173, + "step": 3047 + }, + { + "epoch": 0.25, + "grad_norm": 5.970094486217373, + "learning_rate": 8.809664859391778e-06, + "loss": 0.6372, + "step": 3048 + }, + { + "epoch": 0.25, + "grad_norm": 6.7523921063696655, + "learning_rate": 8.808812830500693e-06, + "loss": 0.6764, + "step": 3049 + }, + { + "epoch": 0.25, + "grad_norm": 3.2139730372308084, + "learning_rate": 8.807960538015777e-06, + "loss": 0.856, + "step": 3050 + }, + { + "epoch": 0.25, + "grad_norm": 11.041542864660634, + "learning_rate": 8.807107981996014e-06, + "loss": 0.6652, + "step": 3051 + }, + { + "epoch": 0.25, + "grad_norm": 3.1807474015485133, + "learning_rate": 8.806255162500407e-06, + "loss": 0.748, + "step": 3052 + }, + { + "epoch": 0.25, + "grad_norm": 5.8150175097872685, + "learning_rate": 8.805402079587977e-06, + "loss": 0.6305, + "step": 3053 + }, + { + "epoch": 0.25, + "grad_norm": 3.6815465550837905, + "learning_rate": 8.804548733317764e-06, + "loss": 0.7747, + "step": 3054 + }, + { + "epoch": 0.25, + "grad_norm": 4.547709117122394, + "learning_rate": 8.803695123748821e-06, + "loss": 0.8176, + "step": 3055 + }, + { + "epoch": 0.25, + "grad_norm": 2.3666443745613903, + "learning_rate": 8.802841250940226e-06, + "loss": 0.7812, + "step": 3056 + }, + { + "epoch": 0.25, + "grad_norm": 3.114873178015105, + "learning_rate": 8.80198711495107e-06, + "loss": 0.6454, + "step": 3057 + }, + { + "epoch": 0.25, + "grad_norm": 3.7267090030878736, + "learning_rate": 8.80113271584047e-06, + "loss": 0.7653, + "step": 3058 + }, + { + "epoch": 0.25, + "grad_norm": 5.7336090204671235, + "learning_rate": 8.800278053667551e-06, + "loss": 0.8391, + "step": 3059 + }, + { + "epoch": 0.25, + "grad_norm": 4.146656996565322, + "learning_rate": 8.799423128491463e-06, + "loss": 0.7512, + "step": 3060 + }, + { + "epoch": 0.25, + "grad_norm": 8.978857674046594, + "learning_rate": 8.798567940371367e-06, + "loss": 0.7533, + "step": 3061 + }, + { + "epoch": 0.25, + "grad_norm": 8.32912900018707, + "learning_rate": 8.797712489366456e-06, + "loss": 0.6606, + "step": 3062 + }, + { + "epoch": 0.25, + "grad_norm": 5.424232967561503, + "learning_rate": 8.796856775535926e-06, + "loss": 0.7162, + "step": 3063 + }, + { + "epoch": 0.25, + "grad_norm": 2.8718379914775785, + "learning_rate": 8.796000798939001e-06, + "loss": 0.6549, + "step": 3064 + }, + { + "epoch": 0.25, + "grad_norm": 4.387055998905367, + "learning_rate": 8.795144559634921e-06, + "loss": 0.7572, + "step": 3065 + }, + { + "epoch": 0.25, + "grad_norm": 16.522730471824318, + "learning_rate": 8.794288057682939e-06, + "loss": 0.6019, + "step": 3066 + }, + { + "epoch": 0.25, + "grad_norm": 2.968471855541699, + "learning_rate": 8.793431293142334e-06, + "loss": 0.6142, + "step": 3067 + }, + { + "epoch": 0.25, + "grad_norm": 8.859673006858515, + "learning_rate": 8.792574266072397e-06, + "loss": 0.8155, + "step": 3068 + }, + { + "epoch": 0.25, + "grad_norm": 11.576330758995297, + "learning_rate": 8.791716976532441e-06, + "loss": 0.5978, + "step": 3069 + }, + { + "epoch": 0.25, + "grad_norm": 4.4253756691029, + "learning_rate": 8.790859424581796e-06, + "loss": 0.6893, + "step": 3070 + }, + { + "epoch": 0.25, + "grad_norm": 4.015026491366955, + "learning_rate": 8.79000161027981e-06, + "loss": 0.789, + "step": 3071 + }, + { + "epoch": 0.25, + "grad_norm": 3.137490299438896, + "learning_rate": 8.789143533685847e-06, + "loss": 0.6068, + "step": 3072 + }, + { + "epoch": 0.25, + "grad_norm": 10.06986050159233, + "learning_rate": 8.788285194859293e-06, + "loss": 0.8089, + "step": 3073 + }, + { + "epoch": 0.25, + "grad_norm": 5.8039239794254565, + "learning_rate": 8.787426593859552e-06, + "loss": 0.7599, + "step": 3074 + }, + { + "epoch": 0.25, + "grad_norm": 3.812170928524484, + "learning_rate": 8.786567730746043e-06, + "loss": 0.6906, + "step": 3075 + }, + { + "epoch": 0.25, + "grad_norm": 4.334727792341875, + "learning_rate": 8.785708605578204e-06, + "loss": 0.6469, + "step": 3076 + }, + { + "epoch": 0.25, + "grad_norm": 4.100441618871479, + "learning_rate": 8.784849218415494e-06, + "loss": 0.7728, + "step": 3077 + }, + { + "epoch": 0.25, + "grad_norm": 10.776677988372978, + "learning_rate": 8.783989569317386e-06, + "loss": 0.665, + "step": 3078 + }, + { + "epoch": 0.25, + "grad_norm": 8.065157601787455, + "learning_rate": 8.783129658343375e-06, + "loss": 0.7224, + "step": 3079 + }, + { + "epoch": 0.25, + "grad_norm": 9.754760023461866, + "learning_rate": 8.78226948555297e-06, + "loss": 0.6977, + "step": 3080 + }, + { + "epoch": 0.25, + "grad_norm": 2.91142717873937, + "learning_rate": 8.7814090510057e-06, + "loss": 0.7452, + "step": 3081 + }, + { + "epoch": 0.25, + "grad_norm": 5.358599442144094, + "learning_rate": 8.780548354761117e-06, + "loss": 0.7386, + "step": 3082 + }, + { + "epoch": 0.25, + "grad_norm": 6.1770719932293385, + "learning_rate": 8.77968739687878e-06, + "loss": 0.6611, + "step": 3083 + }, + { + "epoch": 0.25, + "grad_norm": 5.094304578414262, + "learning_rate": 8.778826177418279e-06, + "loss": 0.6689, + "step": 3084 + }, + { + "epoch": 0.25, + "grad_norm": 4.0009570657251805, + "learning_rate": 8.777964696439211e-06, + "loss": 0.8095, + "step": 3085 + }, + { + "epoch": 0.25, + "grad_norm": 3.538530084619795, + "learning_rate": 8.777102954001199e-06, + "loss": 0.7265, + "step": 3086 + }, + { + "epoch": 0.25, + "grad_norm": 3.935273935837516, + "learning_rate": 8.776240950163881e-06, + "loss": 0.7395, + "step": 3087 + }, + { + "epoch": 0.25, + "grad_norm": 4.920882725364082, + "learning_rate": 8.77537868498691e-06, + "loss": 0.73, + "step": 3088 + }, + { + "epoch": 0.25, + "grad_norm": 4.464398557139395, + "learning_rate": 8.774516158529964e-06, + "loss": 0.7222, + "step": 3089 + }, + { + "epoch": 0.25, + "grad_norm": 3.519622405821405, + "learning_rate": 8.773653370852732e-06, + "loss": 0.6436, + "step": 3090 + }, + { + "epoch": 0.25, + "grad_norm": 2.6481939261108702, + "learning_rate": 8.772790322014928e-06, + "loss": 0.6496, + "step": 3091 + }, + { + "epoch": 0.25, + "grad_norm": 6.903381244721313, + "learning_rate": 8.771927012076276e-06, + "loss": 0.7779, + "step": 3092 + }, + { + "epoch": 0.25, + "grad_norm": 6.087918109238199, + "learning_rate": 8.771063441096527e-06, + "loss": 0.6783, + "step": 3093 + }, + { + "epoch": 0.25, + "grad_norm": 2.957922211199016, + "learning_rate": 8.770199609135441e-06, + "loss": 0.6523, + "step": 3094 + }, + { + "epoch": 0.25, + "grad_norm": 4.841301250019361, + "learning_rate": 8.769335516252803e-06, + "loss": 0.5435, + "step": 3095 + }, + { + "epoch": 0.25, + "grad_norm": 21.38121476483069, + "learning_rate": 8.768471162508416e-06, + "loss": 0.6877, + "step": 3096 + }, + { + "epoch": 0.25, + "grad_norm": 2.888489436387813, + "learning_rate": 8.767606547962095e-06, + "loss": 0.6186, + "step": 3097 + }, + { + "epoch": 0.25, + "grad_norm": 3.271647791303926, + "learning_rate": 8.766741672673677e-06, + "loss": 0.8017, + "step": 3098 + }, + { + "epoch": 0.25, + "grad_norm": 3.585655721670914, + "learning_rate": 8.76587653670302e-06, + "loss": 0.7705, + "step": 3099 + }, + { + "epoch": 0.25, + "grad_norm": 11.770367549933098, + "learning_rate": 8.765011140109993e-06, + "loss": 0.8647, + "step": 3100 + }, + { + "epoch": 0.25, + "grad_norm": 4.691469114463657, + "learning_rate": 8.76414548295449e-06, + "loss": 0.7185, + "step": 3101 + }, + { + "epoch": 0.25, + "grad_norm": 3.425361225456822, + "learning_rate": 8.763279565296417e-06, + "loss": 0.6477, + "step": 3102 + }, + { + "epoch": 0.25, + "grad_norm": 3.0483154802755688, + "learning_rate": 8.762413387195702e-06, + "loss": 0.6128, + "step": 3103 + }, + { + "epoch": 0.25, + "grad_norm": 14.842809738947048, + "learning_rate": 8.761546948712293e-06, + "loss": 0.7448, + "step": 3104 + }, + { + "epoch": 0.25, + "grad_norm": 6.164015345671067, + "learning_rate": 8.760680249906149e-06, + "loss": 0.7513, + "step": 3105 + }, + { + "epoch": 0.25, + "grad_norm": 4.358286757291583, + "learning_rate": 8.759813290837254e-06, + "loss": 0.8066, + "step": 3106 + }, + { + "epoch": 0.25, + "grad_norm": 4.630388700537639, + "learning_rate": 8.758946071565605e-06, + "loss": 0.8617, + "step": 3107 + }, + { + "epoch": 0.25, + "grad_norm": 5.078490895055653, + "learning_rate": 8.758078592151218e-06, + "loss": 0.8909, + "step": 3108 + }, + { + "epoch": 0.25, + "grad_norm": 2.9463029330233406, + "learning_rate": 8.75721085265413e-06, + "loss": 0.7876, + "step": 3109 + }, + { + "epoch": 0.25, + "grad_norm": 6.116187620171812, + "learning_rate": 8.756342853134394e-06, + "loss": 0.7866, + "step": 3110 + }, + { + "epoch": 0.25, + "grad_norm": 7.0949835345468335, + "learning_rate": 8.75547459365208e-06, + "loss": 0.6554, + "step": 3111 + }, + { + "epoch": 0.25, + "grad_norm": 7.547951850207582, + "learning_rate": 8.75460607426728e-06, + "loss": 0.6369, + "step": 3112 + }, + { + "epoch": 0.25, + "grad_norm": 10.377164443560627, + "learning_rate": 8.753737295040097e-06, + "loss": 0.7811, + "step": 3113 + }, + { + "epoch": 0.25, + "grad_norm": 13.518420405245568, + "learning_rate": 8.752868256030658e-06, + "loss": 0.7348, + "step": 3114 + }, + { + "epoch": 0.25, + "grad_norm": 4.069145433334808, + "learning_rate": 8.751998957299105e-06, + "loss": 0.745, + "step": 3115 + }, + { + "epoch": 0.25, + "grad_norm": 3.7879224799977327, + "learning_rate": 8.7511293989056e-06, + "loss": 0.7569, + "step": 3116 + }, + { + "epoch": 0.25, + "grad_norm": 3.6759781094322803, + "learning_rate": 8.750259580910323e-06, + "loss": 0.7986, + "step": 3117 + }, + { + "epoch": 0.25, + "grad_norm": 6.698303676964798, + "learning_rate": 8.749389503373467e-06, + "loss": 0.6889, + "step": 3118 + }, + { + "epoch": 0.25, + "grad_norm": 3.906326166244853, + "learning_rate": 8.748519166355251e-06, + "loss": 0.6908, + "step": 3119 + }, + { + "epoch": 0.25, + "grad_norm": 18.650861876775295, + "learning_rate": 8.747648569915905e-06, + "loss": 0.5615, + "step": 3120 + }, + { + "epoch": 0.25, + "grad_norm": 3.635344764721767, + "learning_rate": 8.746777714115681e-06, + "loss": 0.7414, + "step": 3121 + }, + { + "epoch": 0.25, + "grad_norm": 8.318153874007834, + "learning_rate": 8.745906599014848e-06, + "loss": 0.7507, + "step": 3122 + }, + { + "epoch": 0.25, + "grad_norm": 4.020214871131042, + "learning_rate": 8.745035224673693e-06, + "loss": 0.7481, + "step": 3123 + }, + { + "epoch": 0.25, + "grad_norm": 3.574527386136311, + "learning_rate": 8.744163591152517e-06, + "loss": 0.5815, + "step": 3124 + }, + { + "epoch": 0.25, + "grad_norm": 3.5863367982424403, + "learning_rate": 8.743291698511646e-06, + "loss": 0.7305, + "step": 3125 + }, + { + "epoch": 0.25, + "grad_norm": 5.451184676294389, + "learning_rate": 8.742419546811423e-06, + "loss": 0.7432, + "step": 3126 + }, + { + "epoch": 0.25, + "grad_norm": 3.062352225952689, + "learning_rate": 8.7415471361122e-06, + "loss": 0.8169, + "step": 3127 + }, + { + "epoch": 0.25, + "grad_norm": 3.5007176115921355, + "learning_rate": 8.740674466474357e-06, + "loss": 0.6944, + "step": 3128 + }, + { + "epoch": 0.25, + "grad_norm": 7.740229179001324, + "learning_rate": 8.739801537958289e-06, + "loss": 0.6355, + "step": 3129 + }, + { + "epoch": 0.25, + "grad_norm": 4.760860305494583, + "learning_rate": 8.738928350624405e-06, + "loss": 0.8089, + "step": 3130 + }, + { + "epoch": 0.25, + "grad_norm": 8.48155766612004, + "learning_rate": 8.738054904533138e-06, + "loss": 0.794, + "step": 3131 + }, + { + "epoch": 0.25, + "grad_norm": 4.647497005446932, + "learning_rate": 8.737181199744936e-06, + "loss": 0.7155, + "step": 3132 + }, + { + "epoch": 0.25, + "grad_norm": 4.099857403437535, + "learning_rate": 8.73630723632026e-06, + "loss": 0.6637, + "step": 3133 + }, + { + "epoch": 0.25, + "grad_norm": 3.6104204533974054, + "learning_rate": 8.735433014319602e-06, + "loss": 0.8782, + "step": 3134 + }, + { + "epoch": 0.25, + "grad_norm": 7.023665806477569, + "learning_rate": 8.734558533803456e-06, + "loss": 0.7411, + "step": 3135 + }, + { + "epoch": 0.25, + "grad_norm": 3.5091305472528966, + "learning_rate": 8.733683794832346e-06, + "loss": 0.8685, + "step": 3136 + }, + { + "epoch": 0.25, + "grad_norm": 2.7535990976611684, + "learning_rate": 8.732808797466808e-06, + "loss": 0.7291, + "step": 3137 + }, + { + "epoch": 0.25, + "grad_norm": 4.460848618672478, + "learning_rate": 8.731933541767396e-06, + "loss": 0.8162, + "step": 3138 + }, + { + "epoch": 0.25, + "grad_norm": 3.627034661632686, + "learning_rate": 8.731058027794688e-06, + "loss": 0.6, + "step": 3139 + }, + { + "epoch": 0.26, + "grad_norm": 3.5932830331587238, + "learning_rate": 8.73018225560927e-06, + "loss": 0.6503, + "step": 3140 + }, + { + "epoch": 0.26, + "grad_norm": 4.660642123583916, + "learning_rate": 8.729306225271752e-06, + "loss": 0.6394, + "step": 3141 + }, + { + "epoch": 0.26, + "grad_norm": 5.6774384893824585, + "learning_rate": 8.728429936842762e-06, + "loss": 0.8571, + "step": 3142 + }, + { + "epoch": 0.26, + "grad_norm": 8.630971072221547, + "learning_rate": 8.727553390382946e-06, + "loss": 0.6036, + "step": 3143 + }, + { + "epoch": 0.26, + "grad_norm": 3.8080765109471972, + "learning_rate": 8.726676585952963e-06, + "loss": 0.7048, + "step": 3144 + }, + { + "epoch": 0.26, + "grad_norm": 3.054423786006399, + "learning_rate": 8.725799523613494e-06, + "loss": 0.7577, + "step": 3145 + }, + { + "epoch": 0.26, + "grad_norm": 2.9213456980334045, + "learning_rate": 8.72492220342524e-06, + "loss": 0.9067, + "step": 3146 + }, + { + "epoch": 0.26, + "grad_norm": 2.4419951437245992, + "learning_rate": 8.724044625448915e-06, + "loss": 0.6094, + "step": 3147 + }, + { + "epoch": 0.26, + "grad_norm": 4.994992269704754, + "learning_rate": 8.723166789745255e-06, + "loss": 0.749, + "step": 3148 + }, + { + "epoch": 0.26, + "grad_norm": 5.866976280194647, + "learning_rate": 8.722288696375009e-06, + "loss": 0.7469, + "step": 3149 + }, + { + "epoch": 0.26, + "grad_norm": 2.8070344942227035, + "learning_rate": 8.721410345398946e-06, + "loss": 0.8725, + "step": 3150 + }, + { + "epoch": 0.26, + "grad_norm": 4.4476406179513885, + "learning_rate": 8.720531736877858e-06, + "loss": 0.7973, + "step": 3151 + }, + { + "epoch": 0.26, + "grad_norm": 5.03799945204716, + "learning_rate": 8.719652870872546e-06, + "loss": 0.805, + "step": 3152 + }, + { + "epoch": 0.26, + "grad_norm": 5.37050191752738, + "learning_rate": 8.718773747443834e-06, + "loss": 0.6877, + "step": 3153 + }, + { + "epoch": 0.26, + "grad_norm": 3.5957418475430787, + "learning_rate": 8.717894366652564e-06, + "loss": 0.6798, + "step": 3154 + }, + { + "epoch": 0.26, + "grad_norm": 10.372394165580374, + "learning_rate": 8.717014728559594e-06, + "loss": 0.7821, + "step": 3155 + }, + { + "epoch": 0.26, + "grad_norm": 7.857694882228299, + "learning_rate": 8.716134833225803e-06, + "loss": 0.6747, + "step": 3156 + }, + { + "epoch": 0.26, + "grad_norm": 3.722154342366707, + "learning_rate": 8.715254680712079e-06, + "loss": 0.6145, + "step": 3157 + }, + { + "epoch": 0.26, + "grad_norm": 3.011208930046009, + "learning_rate": 8.714374271079339e-06, + "loss": 0.7584, + "step": 3158 + }, + { + "epoch": 0.26, + "grad_norm": 3.139584702264579, + "learning_rate": 8.713493604388513e-06, + "loss": 0.7627, + "step": 3159 + }, + { + "epoch": 0.26, + "grad_norm": 4.426293700278985, + "learning_rate": 8.712612680700545e-06, + "loss": 0.6592, + "step": 3160 + }, + { + "epoch": 0.26, + "grad_norm": 3.6095904479113576, + "learning_rate": 8.711731500076405e-06, + "loss": 0.8101, + "step": 3161 + }, + { + "epoch": 0.26, + "grad_norm": 4.897542453631338, + "learning_rate": 8.710850062577074e-06, + "loss": 0.6313, + "step": 3162 + }, + { + "epoch": 0.26, + "grad_norm": 3.380202800138096, + "learning_rate": 8.709968368263553e-06, + "loss": 0.6786, + "step": 3163 + }, + { + "epoch": 0.26, + "grad_norm": 3.9420736333133997, + "learning_rate": 8.709086417196862e-06, + "loss": 0.6966, + "step": 3164 + }, + { + "epoch": 0.26, + "grad_norm": 3.01766478559132, + "learning_rate": 8.708204209438034e-06, + "loss": 0.706, + "step": 3165 + }, + { + "epoch": 0.26, + "grad_norm": 11.739275142211664, + "learning_rate": 8.707321745048127e-06, + "loss": 0.7194, + "step": 3166 + }, + { + "epoch": 0.26, + "grad_norm": 3.6215568506631444, + "learning_rate": 8.706439024088213e-06, + "loss": 0.7671, + "step": 3167 + }, + { + "epoch": 0.26, + "grad_norm": 3.100253708566771, + "learning_rate": 8.705556046619382e-06, + "loss": 0.7356, + "step": 3168 + }, + { + "epoch": 0.26, + "grad_norm": 7.239927471366727, + "learning_rate": 8.704672812702737e-06, + "loss": 0.7065, + "step": 3169 + }, + { + "epoch": 0.26, + "grad_norm": 6.508042993680408, + "learning_rate": 8.70378932239941e-06, + "loss": 0.6799, + "step": 3170 + }, + { + "epoch": 0.26, + "grad_norm": 3.3221637334496372, + "learning_rate": 8.702905575770539e-06, + "loss": 0.8029, + "step": 3171 + }, + { + "epoch": 0.26, + "grad_norm": 3.236179506992233, + "learning_rate": 8.702021572877288e-06, + "loss": 0.6837, + "step": 3172 + }, + { + "epoch": 0.26, + "grad_norm": 4.430688642614841, + "learning_rate": 8.701137313780833e-06, + "loss": 0.7404, + "step": 3173 + }, + { + "epoch": 0.26, + "grad_norm": 7.043340689823841, + "learning_rate": 8.700252798542372e-06, + "loss": 0.8444, + "step": 3174 + }, + { + "epoch": 0.26, + "grad_norm": 3.4045707643206984, + "learning_rate": 8.699368027223118e-06, + "loss": 0.7279, + "step": 3175 + }, + { + "epoch": 0.26, + "grad_norm": 3.139954314971914, + "learning_rate": 8.698482999884304e-06, + "loss": 0.7152, + "step": 3176 + }, + { + "epoch": 0.26, + "grad_norm": 3.7429082791425596, + "learning_rate": 8.697597716587181e-06, + "loss": 0.5052, + "step": 3177 + }, + { + "epoch": 0.26, + "grad_norm": 10.686945224955448, + "learning_rate": 8.696712177393011e-06, + "loss": 0.7174, + "step": 3178 + }, + { + "epoch": 0.26, + "grad_norm": 8.892058849149779, + "learning_rate": 8.695826382363083e-06, + "loss": 0.7848, + "step": 3179 + }, + { + "epoch": 0.26, + "grad_norm": 4.269392998250927, + "learning_rate": 8.694940331558699e-06, + "loss": 0.7712, + "step": 3180 + }, + { + "epoch": 0.26, + "grad_norm": 3.9938333471267473, + "learning_rate": 8.694054025041178e-06, + "loss": 0.7543, + "step": 3181 + }, + { + "epoch": 0.26, + "grad_norm": 5.5194613206959655, + "learning_rate": 8.693167462871859e-06, + "loss": 0.5992, + "step": 3182 + }, + { + "epoch": 0.26, + "grad_norm": 3.0265730767118795, + "learning_rate": 8.692280645112097e-06, + "loss": 0.7448, + "step": 3183 + }, + { + "epoch": 0.26, + "grad_norm": 5.145453021093892, + "learning_rate": 8.691393571823266e-06, + "loss": 0.7072, + "step": 3184 + }, + { + "epoch": 0.26, + "grad_norm": 5.465871706903786, + "learning_rate": 8.690506243066757e-06, + "loss": 0.6629, + "step": 3185 + }, + { + "epoch": 0.26, + "grad_norm": 4.006342358675749, + "learning_rate": 8.68961865890398e-06, + "loss": 0.7542, + "step": 3186 + }, + { + "epoch": 0.26, + "grad_norm": 3.134299434254786, + "learning_rate": 8.688730819396358e-06, + "loss": 0.6575, + "step": 3187 + }, + { + "epoch": 0.26, + "grad_norm": 5.150631234628473, + "learning_rate": 8.687842724605338e-06, + "loss": 0.7069, + "step": 3188 + }, + { + "epoch": 0.26, + "grad_norm": 3.55472783469278, + "learning_rate": 8.686954374592382e-06, + "loss": 0.7212, + "step": 3189 + }, + { + "epoch": 0.26, + "grad_norm": 3.252418671539824, + "learning_rate": 8.686065769418967e-06, + "loss": 0.8669, + "step": 3190 + }, + { + "epoch": 0.26, + "grad_norm": 5.523631755961889, + "learning_rate": 8.68517690914659e-06, + "loss": 0.6601, + "step": 3191 + }, + { + "epoch": 0.26, + "grad_norm": 4.459577519303184, + "learning_rate": 8.68428779383677e-06, + "loss": 0.8299, + "step": 3192 + }, + { + "epoch": 0.26, + "grad_norm": 3.8596646274970308, + "learning_rate": 8.683398423551034e-06, + "loss": 0.7054, + "step": 3193 + }, + { + "epoch": 0.26, + "grad_norm": 3.336044153338888, + "learning_rate": 8.682508798350937e-06, + "loss": 0.7074, + "step": 3194 + }, + { + "epoch": 0.26, + "grad_norm": 13.8381498669211, + "learning_rate": 8.681618918298043e-06, + "loss": 0.6351, + "step": 3195 + }, + { + "epoch": 0.26, + "grad_norm": 7.032259708332252, + "learning_rate": 8.680728783453937e-06, + "loss": 0.5975, + "step": 3196 + }, + { + "epoch": 0.26, + "grad_norm": 18.368391685083438, + "learning_rate": 8.679838393880224e-06, + "loss": 0.5734, + "step": 3197 + }, + { + "epoch": 0.26, + "grad_norm": 3.98708049777821, + "learning_rate": 8.678947749638525e-06, + "loss": 0.7007, + "step": 3198 + }, + { + "epoch": 0.26, + "grad_norm": 3.3257990633485464, + "learning_rate": 8.678056850790477e-06, + "loss": 0.7348, + "step": 3199 + }, + { + "epoch": 0.26, + "grad_norm": 6.545915739914991, + "learning_rate": 8.677165697397736e-06, + "loss": 0.7186, + "step": 3200 + }, + { + "epoch": 0.26, + "grad_norm": 3.2962723461974264, + "learning_rate": 8.676274289521976e-06, + "loss": 0.8171, + "step": 3201 + }, + { + "epoch": 0.26, + "grad_norm": 4.266180606392446, + "learning_rate": 8.675382627224886e-06, + "loss": 0.6979, + "step": 3202 + }, + { + "epoch": 0.26, + "grad_norm": 5.99352455597282, + "learning_rate": 8.674490710568176e-06, + "loss": 0.8087, + "step": 3203 + }, + { + "epoch": 0.26, + "grad_norm": 3.612149124374396, + "learning_rate": 8.673598539613573e-06, + "loss": 0.7438, + "step": 3204 + }, + { + "epoch": 0.26, + "grad_norm": 3.2998574631719113, + "learning_rate": 8.67270611442282e-06, + "loss": 0.6706, + "step": 3205 + }, + { + "epoch": 0.26, + "grad_norm": 4.300327282534021, + "learning_rate": 8.671813435057678e-06, + "loss": 0.8463, + "step": 3206 + }, + { + "epoch": 0.26, + "grad_norm": 9.340763817220994, + "learning_rate": 8.670920501579928e-06, + "loss": 0.765, + "step": 3207 + }, + { + "epoch": 0.26, + "grad_norm": 3.205538070884684, + "learning_rate": 8.670027314051364e-06, + "loss": 0.8013, + "step": 3208 + }, + { + "epoch": 0.26, + "grad_norm": 3.5057415428389707, + "learning_rate": 8.669133872533804e-06, + "loss": 0.6927, + "step": 3209 + }, + { + "epoch": 0.26, + "grad_norm": 3.2638647950868855, + "learning_rate": 8.668240177089074e-06, + "loss": 0.7248, + "step": 3210 + }, + { + "epoch": 0.26, + "grad_norm": 2.866964393651608, + "learning_rate": 8.667346227779028e-06, + "loss": 0.7641, + "step": 3211 + }, + { + "epoch": 0.26, + "grad_norm": 20.584110147417896, + "learning_rate": 8.666452024665533e-06, + "loss": 0.7217, + "step": 3212 + }, + { + "epoch": 0.26, + "grad_norm": 2.6132459682668814, + "learning_rate": 8.66555756781047e-06, + "loss": 0.7023, + "step": 3213 + }, + { + "epoch": 0.26, + "grad_norm": 2.519297971317584, + "learning_rate": 8.664662857275744e-06, + "loss": 0.6993, + "step": 3214 + }, + { + "epoch": 0.26, + "grad_norm": 3.4265703667495155, + "learning_rate": 8.663767893123272e-06, + "loss": 0.6637, + "step": 3215 + }, + { + "epoch": 0.26, + "grad_norm": 8.205509648102641, + "learning_rate": 8.662872675414993e-06, + "loss": 0.6888, + "step": 3216 + }, + { + "epoch": 0.26, + "grad_norm": 3.419030151415074, + "learning_rate": 8.661977204212864e-06, + "loss": 0.665, + "step": 3217 + }, + { + "epoch": 0.26, + "grad_norm": 17.903286004744963, + "learning_rate": 8.661081479578852e-06, + "loss": 0.7801, + "step": 3218 + }, + { + "epoch": 0.26, + "grad_norm": 3.3684227355501983, + "learning_rate": 8.660185501574952e-06, + "loss": 0.7133, + "step": 3219 + }, + { + "epoch": 0.26, + "grad_norm": 3.6874838253854554, + "learning_rate": 8.659289270263167e-06, + "loss": 0.7478, + "step": 3220 + }, + { + "epoch": 0.26, + "grad_norm": 3.185170736521441, + "learning_rate": 8.658392785705525e-06, + "loss": 0.8235, + "step": 3221 + }, + { + "epoch": 0.26, + "grad_norm": 3.7015605614848512, + "learning_rate": 8.657496047964066e-06, + "loss": 0.8226, + "step": 3222 + }, + { + "epoch": 0.26, + "grad_norm": 3.6710987725206263, + "learning_rate": 8.656599057100853e-06, + "loss": 0.4764, + "step": 3223 + }, + { + "epoch": 0.26, + "grad_norm": 4.79206371788448, + "learning_rate": 8.655701813177959e-06, + "loss": 0.8447, + "step": 3224 + }, + { + "epoch": 0.26, + "grad_norm": 3.5200200680803624, + "learning_rate": 8.65480431625748e-06, + "loss": 0.8119, + "step": 3225 + }, + { + "epoch": 0.26, + "grad_norm": 3.0744180472335843, + "learning_rate": 8.653906566401533e-06, + "loss": 0.7374, + "step": 3226 + }, + { + "epoch": 0.26, + "grad_norm": 2.6018496170031113, + "learning_rate": 8.653008563672242e-06, + "loss": 0.7083, + "step": 3227 + }, + { + "epoch": 0.26, + "grad_norm": 2.698042521378998, + "learning_rate": 8.65211030813176e-06, + "loss": 0.7159, + "step": 3228 + }, + { + "epoch": 0.26, + "grad_norm": 3.7187653352667662, + "learning_rate": 8.651211799842248e-06, + "loss": 0.8173, + "step": 3229 + }, + { + "epoch": 0.26, + "grad_norm": 3.73243687165133, + "learning_rate": 8.65031303886589e-06, + "loss": 0.7924, + "step": 3230 + }, + { + "epoch": 0.26, + "grad_norm": 2.9375138583642664, + "learning_rate": 8.649414025264884e-06, + "loss": 0.6647, + "step": 3231 + }, + { + "epoch": 0.26, + "grad_norm": 6.0223365449654445, + "learning_rate": 8.64851475910145e-06, + "loss": 0.6637, + "step": 3232 + }, + { + "epoch": 0.26, + "grad_norm": 16.04673929851229, + "learning_rate": 8.647615240437821e-06, + "loss": 0.7696, + "step": 3233 + }, + { + "epoch": 0.26, + "grad_norm": 3.6864006100333637, + "learning_rate": 8.64671546933625e-06, + "loss": 0.6658, + "step": 3234 + }, + { + "epoch": 0.26, + "grad_norm": 4.075556435914477, + "learning_rate": 8.645815445859008e-06, + "loss": 0.8458, + "step": 3235 + }, + { + "epoch": 0.26, + "grad_norm": 11.6625120687269, + "learning_rate": 8.644915170068382e-06, + "loss": 0.5764, + "step": 3236 + }, + { + "epoch": 0.26, + "grad_norm": 5.421487139734052, + "learning_rate": 8.644014642026673e-06, + "loss": 0.9108, + "step": 3237 + }, + { + "epoch": 0.26, + "grad_norm": 3.328575562706458, + "learning_rate": 8.643113861796209e-06, + "loss": 0.5935, + "step": 3238 + }, + { + "epoch": 0.26, + "grad_norm": 4.387767583405524, + "learning_rate": 8.642212829439325e-06, + "loss": 0.8214, + "step": 3239 + }, + { + "epoch": 0.26, + "grad_norm": 3.680973041559468, + "learning_rate": 8.64131154501838e-06, + "loss": 0.7718, + "step": 3240 + }, + { + "epoch": 0.26, + "grad_norm": 6.230458925054092, + "learning_rate": 8.640410008595748e-06, + "loss": 0.6446, + "step": 3241 + }, + { + "epoch": 0.26, + "grad_norm": 4.240807683835683, + "learning_rate": 8.639508220233822e-06, + "loss": 0.8377, + "step": 3242 + }, + { + "epoch": 0.26, + "grad_norm": 3.451694084035272, + "learning_rate": 8.638606179995013e-06, + "loss": 0.588, + "step": 3243 + }, + { + "epoch": 0.26, + "grad_norm": 2.7660389044695433, + "learning_rate": 8.637703887941744e-06, + "loss": 0.9464, + "step": 3244 + }, + { + "epoch": 0.26, + "grad_norm": 3.47333453510192, + "learning_rate": 8.63680134413646e-06, + "loss": 0.7578, + "step": 3245 + }, + { + "epoch": 0.26, + "grad_norm": 2.5437573262789854, + "learning_rate": 8.635898548641627e-06, + "loss": 0.8159, + "step": 3246 + }, + { + "epoch": 0.26, + "grad_norm": 11.893660589589684, + "learning_rate": 8.634995501519718e-06, + "loss": 0.6712, + "step": 3247 + }, + { + "epoch": 0.26, + "grad_norm": 4.116797672031491, + "learning_rate": 8.634092202833233e-06, + "loss": 0.741, + "step": 3248 + }, + { + "epoch": 0.26, + "grad_norm": 6.167257437660994, + "learning_rate": 8.633188652644686e-06, + "loss": 0.8481, + "step": 3249 + }, + { + "epoch": 0.26, + "grad_norm": 19.25051829808035, + "learning_rate": 8.632284851016607e-06, + "loss": 0.625, + "step": 3250 + }, + { + "epoch": 0.26, + "grad_norm": 3.1216677192328364, + "learning_rate": 8.631380798011546e-06, + "loss": 0.7214, + "step": 3251 + }, + { + "epoch": 0.26, + "grad_norm": 4.089922296354801, + "learning_rate": 8.63047649369207e-06, + "loss": 0.6974, + "step": 3252 + }, + { + "epoch": 0.26, + "grad_norm": 6.084856075812232, + "learning_rate": 8.62957193812076e-06, + "loss": 0.8369, + "step": 3253 + }, + { + "epoch": 0.26, + "grad_norm": 3.159501872481622, + "learning_rate": 8.628667131360218e-06, + "loss": 0.7078, + "step": 3254 + }, + { + "epoch": 0.26, + "grad_norm": 3.514417320153134, + "learning_rate": 8.627762073473063e-06, + "loss": 0.6147, + "step": 3255 + }, + { + "epoch": 0.26, + "grad_norm": 2.8403537692686958, + "learning_rate": 8.62685676452193e-06, + "loss": 0.7494, + "step": 3256 + }, + { + "epoch": 0.26, + "grad_norm": 4.57325391759759, + "learning_rate": 8.625951204569473e-06, + "loss": 0.7323, + "step": 3257 + }, + { + "epoch": 0.26, + "grad_norm": 7.413588873273317, + "learning_rate": 8.62504539367836e-06, + "loss": 0.8129, + "step": 3258 + }, + { + "epoch": 0.26, + "grad_norm": 11.650052778296885, + "learning_rate": 8.624139331911283e-06, + "loss": 0.8058, + "step": 3259 + }, + { + "epoch": 0.26, + "grad_norm": 3.934127231152171, + "learning_rate": 8.623233019330943e-06, + "loss": 0.6543, + "step": 3260 + }, + { + "epoch": 0.26, + "grad_norm": 2.632913542481407, + "learning_rate": 8.622326456000065e-06, + "loss": 0.6536, + "step": 3261 + }, + { + "epoch": 0.26, + "grad_norm": 3.5199339880452825, + "learning_rate": 8.621419641981387e-06, + "loss": 0.7564, + "step": 3262 + }, + { + "epoch": 0.27, + "grad_norm": 2.6632097472282963, + "learning_rate": 8.620512577337668e-06, + "loss": 0.7519, + "step": 3263 + }, + { + "epoch": 0.27, + "grad_norm": 3.4141906149158503, + "learning_rate": 8.619605262131683e-06, + "loss": 0.8445, + "step": 3264 + }, + { + "epoch": 0.27, + "grad_norm": 3.711403876788153, + "learning_rate": 8.618697696426223e-06, + "loss": 0.8001, + "step": 3265 + }, + { + "epoch": 0.27, + "grad_norm": 2.6086964718147536, + "learning_rate": 8.617789880284097e-06, + "loss": 0.7267, + "step": 3266 + }, + { + "epoch": 0.27, + "grad_norm": 3.2318309937701994, + "learning_rate": 8.61688181376813e-06, + "loss": 0.7547, + "step": 3267 + }, + { + "epoch": 0.27, + "grad_norm": 9.90480681067458, + "learning_rate": 8.61597349694117e-06, + "loss": 0.86, + "step": 3268 + }, + { + "epoch": 0.27, + "grad_norm": 3.73241757591505, + "learning_rate": 8.615064929866074e-06, + "loss": 0.7248, + "step": 3269 + }, + { + "epoch": 0.27, + "grad_norm": 4.972615979290784, + "learning_rate": 8.614156112605725e-06, + "loss": 0.6817, + "step": 3270 + }, + { + "epoch": 0.27, + "grad_norm": 3.398181059119163, + "learning_rate": 8.613247045223014e-06, + "loss": 0.7055, + "step": 3271 + }, + { + "epoch": 0.27, + "grad_norm": 2.7709866941183368, + "learning_rate": 8.61233772778086e-06, + "loss": 0.6862, + "step": 3272 + }, + { + "epoch": 0.27, + "grad_norm": 3.5354902840870133, + "learning_rate": 8.611428160342185e-06, + "loss": 0.8771, + "step": 3273 + }, + { + "epoch": 0.27, + "grad_norm": 2.4969323475028062, + "learning_rate": 8.610518342969947e-06, + "loss": 0.7203, + "step": 3274 + }, + { + "epoch": 0.27, + "grad_norm": 3.6404422775039387, + "learning_rate": 8.609608275727102e-06, + "loss": 0.81, + "step": 3275 + }, + { + "epoch": 0.27, + "grad_norm": 3.7932823084202294, + "learning_rate": 8.608697958676638e-06, + "loss": 0.5665, + "step": 3276 + }, + { + "epoch": 0.27, + "grad_norm": 3.253721430568261, + "learning_rate": 8.607787391881552e-06, + "loss": 0.7267, + "step": 3277 + }, + { + "epoch": 0.27, + "grad_norm": 5.208576658088399, + "learning_rate": 8.606876575404863e-06, + "loss": 0.6558, + "step": 3278 + }, + { + "epoch": 0.27, + "grad_norm": 3.663975777075048, + "learning_rate": 8.605965509309605e-06, + "loss": 0.6548, + "step": 3279 + }, + { + "epoch": 0.27, + "grad_norm": 3.6859607417409617, + "learning_rate": 8.605054193658827e-06, + "loss": 0.7854, + "step": 3280 + }, + { + "epoch": 0.27, + "grad_norm": 3.011007018139226, + "learning_rate": 8.604142628515602e-06, + "loss": 0.7417, + "step": 3281 + }, + { + "epoch": 0.27, + "grad_norm": 3.1255284155438243, + "learning_rate": 8.60323081394301e-06, + "loss": 0.6975, + "step": 3282 + }, + { + "epoch": 0.27, + "grad_norm": 4.265618960931716, + "learning_rate": 8.60231875000416e-06, + "loss": 0.6337, + "step": 3283 + }, + { + "epoch": 0.27, + "grad_norm": 7.015580863351832, + "learning_rate": 8.60140643676217e-06, + "loss": 0.8029, + "step": 3284 + }, + { + "epoch": 0.27, + "grad_norm": 4.312401811027869, + "learning_rate": 8.600493874280179e-06, + "loss": 0.7472, + "step": 3285 + }, + { + "epoch": 0.27, + "grad_norm": 3.1523141103985455, + "learning_rate": 8.59958106262134e-06, + "loss": 0.6893, + "step": 3286 + }, + { + "epoch": 0.27, + "grad_norm": 3.0021671163905306, + "learning_rate": 8.598668001848828e-06, + "loss": 0.8166, + "step": 3287 + }, + { + "epoch": 0.27, + "grad_norm": 3.889461048343854, + "learning_rate": 8.59775469202583e-06, + "loss": 0.7855, + "step": 3288 + }, + { + "epoch": 0.27, + "grad_norm": 11.14539313668885, + "learning_rate": 8.596841133215554e-06, + "loss": 0.8144, + "step": 3289 + }, + { + "epoch": 0.27, + "grad_norm": 2.951924747837581, + "learning_rate": 8.595927325481227e-06, + "loss": 0.7788, + "step": 3290 + }, + { + "epoch": 0.27, + "grad_norm": 2.5379457137799064, + "learning_rate": 8.595013268886083e-06, + "loss": 0.8488, + "step": 3291 + }, + { + "epoch": 0.27, + "grad_norm": 3.0527164699105014, + "learning_rate": 8.594098963493387e-06, + "loss": 0.7177, + "step": 3292 + }, + { + "epoch": 0.27, + "grad_norm": 2.9382365798445327, + "learning_rate": 8.593184409366411e-06, + "loss": 0.8075, + "step": 3293 + }, + { + "epoch": 0.27, + "grad_norm": 6.308891412264307, + "learning_rate": 8.592269606568451e-06, + "loss": 0.6582, + "step": 3294 + }, + { + "epoch": 0.27, + "grad_norm": 3.114670097246518, + "learning_rate": 8.591354555162813e-06, + "loss": 0.8553, + "step": 3295 + }, + { + "epoch": 0.27, + "grad_norm": 3.5803268420882426, + "learning_rate": 8.59043925521283e-06, + "loss": 0.7822, + "step": 3296 + }, + { + "epoch": 0.27, + "grad_norm": 2.594740443503019, + "learning_rate": 8.589523706781841e-06, + "loss": 0.7845, + "step": 3297 + }, + { + "epoch": 0.27, + "grad_norm": 3.107354089805622, + "learning_rate": 8.588607909933211e-06, + "loss": 0.8989, + "step": 3298 + }, + { + "epoch": 0.27, + "grad_norm": 2.7220507819778, + "learning_rate": 8.587691864730316e-06, + "loss": 0.7899, + "step": 3299 + }, + { + "epoch": 0.27, + "grad_norm": 4.731172012254202, + "learning_rate": 8.586775571236557e-06, + "loss": 0.7715, + "step": 3300 + }, + { + "epoch": 0.27, + "grad_norm": 3.859806549335766, + "learning_rate": 8.585859029515342e-06, + "loss": 0.8083, + "step": 3301 + }, + { + "epoch": 0.27, + "grad_norm": 3.2749258568492525, + "learning_rate": 8.584942239630105e-06, + "loss": 0.687, + "step": 3302 + }, + { + "epoch": 0.27, + "grad_norm": 3.598193068305734, + "learning_rate": 8.584025201644292e-06, + "loss": 0.6268, + "step": 3303 + }, + { + "epoch": 0.27, + "grad_norm": 2.813056064809422, + "learning_rate": 8.583107915621367e-06, + "loss": 0.7308, + "step": 3304 + }, + { + "epoch": 0.27, + "grad_norm": 4.024180443138766, + "learning_rate": 8.582190381624814e-06, + "loss": 0.7338, + "step": 3305 + }, + { + "epoch": 0.27, + "grad_norm": 4.212881623334999, + "learning_rate": 8.581272599718131e-06, + "loss": 0.7103, + "step": 3306 + }, + { + "epoch": 0.27, + "grad_norm": 2.4047430453994494, + "learning_rate": 8.580354569964836e-06, + "loss": 0.7758, + "step": 3307 + }, + { + "epoch": 0.27, + "grad_norm": 3.826312790016627, + "learning_rate": 8.579436292428458e-06, + "loss": 0.6325, + "step": 3308 + }, + { + "epoch": 0.27, + "grad_norm": 2.992864527424246, + "learning_rate": 8.578517767172554e-06, + "loss": 0.7728, + "step": 3309 + }, + { + "epoch": 0.27, + "grad_norm": 6.3477405169102195, + "learning_rate": 8.577598994260687e-06, + "loss": 0.6637, + "step": 3310 + }, + { + "epoch": 0.27, + "grad_norm": 3.418703277272325, + "learning_rate": 8.576679973756443e-06, + "loss": 0.7187, + "step": 3311 + }, + { + "epoch": 0.27, + "grad_norm": 4.519412720410982, + "learning_rate": 8.575760705723424e-06, + "loss": 0.6031, + "step": 3312 + }, + { + "epoch": 0.27, + "grad_norm": 2.922918879405777, + "learning_rate": 8.57484119022525e-06, + "loss": 0.6717, + "step": 3313 + }, + { + "epoch": 0.27, + "grad_norm": 4.042210254722186, + "learning_rate": 8.573921427325556e-06, + "loss": 0.754, + "step": 3314 + }, + { + "epoch": 0.27, + "grad_norm": 3.4261580989070533, + "learning_rate": 8.573001417087997e-06, + "loss": 0.6211, + "step": 3315 + }, + { + "epoch": 0.27, + "grad_norm": 4.258452715009701, + "learning_rate": 8.57208115957624e-06, + "loss": 0.6796, + "step": 3316 + }, + { + "epoch": 0.27, + "grad_norm": 9.093959121669448, + "learning_rate": 8.571160654853976e-06, + "loss": 0.603, + "step": 3317 + }, + { + "epoch": 0.27, + "grad_norm": 4.982302494181531, + "learning_rate": 8.57023990298491e-06, + "loss": 0.6429, + "step": 3318 + }, + { + "epoch": 0.27, + "grad_norm": 2.3518782927192032, + "learning_rate": 8.569318904032763e-06, + "loss": 0.7076, + "step": 3319 + }, + { + "epoch": 0.27, + "grad_norm": 2.5223315000276467, + "learning_rate": 8.56839765806127e-06, + "loss": 0.8356, + "step": 3320 + }, + { + "epoch": 0.27, + "grad_norm": 4.02554128565961, + "learning_rate": 8.567476165134192e-06, + "loss": 0.7827, + "step": 3321 + }, + { + "epoch": 0.27, + "grad_norm": 4.877455648809495, + "learning_rate": 8.566554425315303e-06, + "loss": 0.7862, + "step": 3322 + }, + { + "epoch": 0.27, + "grad_norm": 4.977412807946987, + "learning_rate": 8.56563243866839e-06, + "loss": 0.7247, + "step": 3323 + }, + { + "epoch": 0.27, + "grad_norm": 4.287138727702408, + "learning_rate": 8.56471020525726e-06, + "loss": 0.8824, + "step": 3324 + }, + { + "epoch": 0.27, + "grad_norm": 4.209857331932112, + "learning_rate": 8.56378772514574e-06, + "loss": 0.7889, + "step": 3325 + }, + { + "epoch": 0.27, + "grad_norm": 2.9840601705402134, + "learning_rate": 8.56286499839767e-06, + "loss": 0.7115, + "step": 3326 + }, + { + "epoch": 0.27, + "grad_norm": 5.061276330275833, + "learning_rate": 8.561942025076907e-06, + "loss": 0.7493, + "step": 3327 + }, + { + "epoch": 0.27, + "grad_norm": 4.184272445539894, + "learning_rate": 8.561018805247329e-06, + "loss": 0.8091, + "step": 3328 + }, + { + "epoch": 0.27, + "grad_norm": 3.308403445544687, + "learning_rate": 8.560095338972827e-06, + "loss": 0.5852, + "step": 3329 + }, + { + "epoch": 0.27, + "grad_norm": 3.525710230827271, + "learning_rate": 8.559171626317312e-06, + "loss": 0.8984, + "step": 3330 + }, + { + "epoch": 0.27, + "grad_norm": 3.355085812506672, + "learning_rate": 8.55824766734471e-06, + "loss": 0.7264, + "step": 3331 + }, + { + "epoch": 0.27, + "grad_norm": 3.2051788796381735, + "learning_rate": 8.557323462118963e-06, + "loss": 0.6795, + "step": 3332 + }, + { + "epoch": 0.27, + "grad_norm": 3.9361538292386156, + "learning_rate": 8.556399010704036e-06, + "loss": 0.7104, + "step": 3333 + }, + { + "epoch": 0.27, + "grad_norm": 4.341182964460685, + "learning_rate": 8.555474313163903e-06, + "loss": 0.6994, + "step": 3334 + }, + { + "epoch": 0.27, + "grad_norm": 3.6435448222869016, + "learning_rate": 8.554549369562562e-06, + "loss": 0.7705, + "step": 3335 + }, + { + "epoch": 0.27, + "grad_norm": 3.749605049380575, + "learning_rate": 8.553624179964023e-06, + "loss": 0.8921, + "step": 3336 + }, + { + "epoch": 0.27, + "grad_norm": 2.496170511113874, + "learning_rate": 8.552698744432315e-06, + "loss": 0.7504, + "step": 3337 + }, + { + "epoch": 0.27, + "grad_norm": 3.879469439273665, + "learning_rate": 8.551773063031484e-06, + "loss": 0.7558, + "step": 3338 + }, + { + "epoch": 0.27, + "grad_norm": 3.40773341954371, + "learning_rate": 8.550847135825594e-06, + "loss": 0.7489, + "step": 3339 + }, + { + "epoch": 0.27, + "grad_norm": 4.816316620409646, + "learning_rate": 8.549920962878724e-06, + "loss": 0.6747, + "step": 3340 + }, + { + "epoch": 0.27, + "grad_norm": 3.6856419753685907, + "learning_rate": 8.54899454425497e-06, + "loss": 0.7621, + "step": 3341 + }, + { + "epoch": 0.27, + "grad_norm": 4.328313549215571, + "learning_rate": 8.548067880018447e-06, + "loss": 0.6951, + "step": 3342 + }, + { + "epoch": 0.27, + "grad_norm": 4.955548046693349, + "learning_rate": 8.547140970233287e-06, + "loss": 0.598, + "step": 3343 + }, + { + "epoch": 0.27, + "grad_norm": 2.824717911688605, + "learning_rate": 8.546213814963638e-06, + "loss": 0.7172, + "step": 3344 + }, + { + "epoch": 0.27, + "grad_norm": 3.263676419431877, + "learning_rate": 8.545286414273663e-06, + "loss": 0.5756, + "step": 3345 + }, + { + "epoch": 0.27, + "grad_norm": 6.478938172033021, + "learning_rate": 8.544358768227545e-06, + "loss": 0.5948, + "step": 3346 + }, + { + "epoch": 0.27, + "grad_norm": 19.71863583589898, + "learning_rate": 8.543430876889485e-06, + "loss": 0.6282, + "step": 3347 + }, + { + "epoch": 0.27, + "grad_norm": 4.425442234625287, + "learning_rate": 8.542502740323695e-06, + "loss": 0.7568, + "step": 3348 + }, + { + "epoch": 0.27, + "grad_norm": 4.1858823762381085, + "learning_rate": 8.54157435859441e-06, + "loss": 0.6382, + "step": 3349 + }, + { + "epoch": 0.27, + "grad_norm": 3.811568952025271, + "learning_rate": 8.540645731765882e-06, + "loss": 0.6938, + "step": 3350 + }, + { + "epoch": 0.27, + "grad_norm": 6.115209242386223, + "learning_rate": 8.539716859902374e-06, + "loss": 0.7653, + "step": 3351 + }, + { + "epoch": 0.27, + "grad_norm": 3.0582887733013737, + "learning_rate": 8.538787743068172e-06, + "loss": 0.7282, + "step": 3352 + }, + { + "epoch": 0.27, + "grad_norm": 5.862014359957153, + "learning_rate": 8.537858381327575e-06, + "loss": 0.6535, + "step": 3353 + }, + { + "epoch": 0.27, + "grad_norm": 2.4952699251979085, + "learning_rate": 8.536928774744904e-06, + "loss": 0.6181, + "step": 3354 + }, + { + "epoch": 0.27, + "grad_norm": 2.9748004901951672, + "learning_rate": 8.535998923384489e-06, + "loss": 0.6724, + "step": 3355 + }, + { + "epoch": 0.27, + "grad_norm": 6.311947842609172, + "learning_rate": 8.535068827310684e-06, + "loss": 0.5583, + "step": 3356 + }, + { + "epoch": 0.27, + "grad_norm": 3.721584659685497, + "learning_rate": 8.534138486587859e-06, + "loss": 0.6294, + "step": 3357 + }, + { + "epoch": 0.27, + "grad_norm": 2.874083674460974, + "learning_rate": 8.533207901280399e-06, + "loss": 0.6627, + "step": 3358 + }, + { + "epoch": 0.27, + "grad_norm": 5.468262701122169, + "learning_rate": 8.532277071452704e-06, + "loss": 0.7833, + "step": 3359 + }, + { + "epoch": 0.27, + "grad_norm": 3.118763990779353, + "learning_rate": 8.531345997169194e-06, + "loss": 0.8438, + "step": 3360 + }, + { + "epoch": 0.27, + "grad_norm": 3.6512936062832635, + "learning_rate": 8.530414678494306e-06, + "loss": 0.6003, + "step": 3361 + }, + { + "epoch": 0.27, + "grad_norm": 4.632181001735019, + "learning_rate": 8.529483115492492e-06, + "loss": 0.7535, + "step": 3362 + }, + { + "epoch": 0.27, + "grad_norm": 6.205890458070212, + "learning_rate": 8.528551308228224e-06, + "loss": 0.7304, + "step": 3363 + }, + { + "epoch": 0.27, + "grad_norm": 2.875337219739922, + "learning_rate": 8.52761925676599e-06, + "loss": 0.82, + "step": 3364 + }, + { + "epoch": 0.27, + "grad_norm": 8.374558804639715, + "learning_rate": 8.526686961170289e-06, + "loss": 0.6903, + "step": 3365 + }, + { + "epoch": 0.27, + "grad_norm": 3.852214292850994, + "learning_rate": 8.525754421505646e-06, + "loss": 0.7556, + "step": 3366 + }, + { + "epoch": 0.27, + "grad_norm": 2.327822204715053, + "learning_rate": 8.524821637836595e-06, + "loss": 0.8042, + "step": 3367 + }, + { + "epoch": 0.27, + "grad_norm": 3.011465567353725, + "learning_rate": 8.523888610227692e-06, + "loss": 0.7225, + "step": 3368 + }, + { + "epoch": 0.27, + "grad_norm": 6.976772859441323, + "learning_rate": 8.522955338743512e-06, + "loss": 0.7556, + "step": 3369 + }, + { + "epoch": 0.27, + "grad_norm": 2.760826114518901, + "learning_rate": 8.522021823448638e-06, + "loss": 0.6433, + "step": 3370 + }, + { + "epoch": 0.27, + "grad_norm": 3.124483369687337, + "learning_rate": 8.521088064407678e-06, + "loss": 0.5861, + "step": 3371 + }, + { + "epoch": 0.27, + "grad_norm": 2.736065248571767, + "learning_rate": 8.520154061685255e-06, + "loss": 0.7044, + "step": 3372 + }, + { + "epoch": 0.27, + "grad_norm": 3.8630743693346283, + "learning_rate": 8.519219815346004e-06, + "loss": 0.7131, + "step": 3373 + }, + { + "epoch": 0.27, + "grad_norm": 2.593686948829996, + "learning_rate": 8.518285325454583e-06, + "loss": 0.7322, + "step": 3374 + }, + { + "epoch": 0.27, + "grad_norm": 2.8602596559928783, + "learning_rate": 8.517350592075667e-06, + "loss": 0.597, + "step": 3375 + }, + { + "epoch": 0.27, + "grad_norm": 3.407124875356108, + "learning_rate": 8.51641561527394e-06, + "loss": 0.7004, + "step": 3376 + }, + { + "epoch": 0.27, + "grad_norm": 4.664443118333286, + "learning_rate": 8.515480395114112e-06, + "loss": 0.6819, + "step": 3377 + }, + { + "epoch": 0.27, + "grad_norm": 3.7238867556436, + "learning_rate": 8.514544931660907e-06, + "loss": 0.7568, + "step": 3378 + }, + { + "epoch": 0.27, + "grad_norm": 3.1714579659740965, + "learning_rate": 8.513609224979061e-06, + "loss": 0.6853, + "step": 3379 + }, + { + "epoch": 0.27, + "grad_norm": 2.5147249991604075, + "learning_rate": 8.512673275133334e-06, + "loss": 0.7837, + "step": 3380 + }, + { + "epoch": 0.27, + "grad_norm": 3.270292987243555, + "learning_rate": 8.5117370821885e-06, + "loss": 0.8481, + "step": 3381 + }, + { + "epoch": 0.27, + "grad_norm": 4.27839286481896, + "learning_rate": 8.510800646209347e-06, + "loss": 0.8577, + "step": 3382 + }, + { + "epoch": 0.27, + "grad_norm": 3.2198892692705776, + "learning_rate": 8.509863967260684e-06, + "loss": 0.7464, + "step": 3383 + }, + { + "epoch": 0.27, + "grad_norm": 3.236189360973214, + "learning_rate": 8.508927045407334e-06, + "loss": 0.8242, + "step": 3384 + }, + { + "epoch": 0.27, + "grad_norm": 2.9783758277604737, + "learning_rate": 8.507989880714139e-06, + "loss": 0.5955, + "step": 3385 + }, + { + "epoch": 0.28, + "grad_norm": 2.4696213306028127, + "learning_rate": 8.507052473245953e-06, + "loss": 0.6692, + "step": 3386 + }, + { + "epoch": 0.28, + "grad_norm": 6.409553334139583, + "learning_rate": 8.506114823067657e-06, + "loss": 0.861, + "step": 3387 + }, + { + "epoch": 0.28, + "grad_norm": 3.5029173349099416, + "learning_rate": 8.50517693024414e-06, + "loss": 0.6611, + "step": 3388 + }, + { + "epoch": 0.28, + "grad_norm": 2.6795803552507045, + "learning_rate": 8.504238794840305e-06, + "loss": 0.658, + "step": 3389 + }, + { + "epoch": 0.28, + "grad_norm": 3.200816002397676, + "learning_rate": 8.503300416921082e-06, + "loss": 0.6274, + "step": 3390 + }, + { + "epoch": 0.28, + "grad_norm": 4.037378088364288, + "learning_rate": 8.502361796551415e-06, + "loss": 0.746, + "step": 3391 + }, + { + "epoch": 0.28, + "grad_norm": 3.5002154410766737, + "learning_rate": 8.501422933796256e-06, + "loss": 0.7615, + "step": 3392 + }, + { + "epoch": 0.28, + "grad_norm": 3.558197577812753, + "learning_rate": 8.500483828720582e-06, + "loss": 0.6948, + "step": 3393 + }, + { + "epoch": 0.28, + "grad_norm": 2.5048463446024143, + "learning_rate": 8.49954448138939e-06, + "loss": 0.6263, + "step": 3394 + }, + { + "epoch": 0.28, + "grad_norm": 4.408952276463324, + "learning_rate": 8.498604891867683e-06, + "loss": 0.6482, + "step": 3395 + }, + { + "epoch": 0.28, + "grad_norm": 4.226213450289688, + "learning_rate": 8.497665060220488e-06, + "loss": 0.6842, + "step": 3396 + }, + { + "epoch": 0.28, + "grad_norm": 3.7265993812223486, + "learning_rate": 8.496724986512848e-06, + "loss": 0.6746, + "step": 3397 + }, + { + "epoch": 0.28, + "grad_norm": 2.9026796021396524, + "learning_rate": 8.495784670809822e-06, + "loss": 0.7646, + "step": 3398 + }, + { + "epoch": 0.28, + "grad_norm": 5.26894273274618, + "learning_rate": 8.494844113176486e-06, + "loss": 0.6401, + "step": 3399 + }, + { + "epoch": 0.28, + "grad_norm": 3.443409431752811, + "learning_rate": 8.49390331367793e-06, + "loss": 0.7328, + "step": 3400 + }, + { + "epoch": 0.28, + "grad_norm": 3.135560789299575, + "learning_rate": 8.492962272379268e-06, + "loss": 0.8426, + "step": 3401 + }, + { + "epoch": 0.28, + "grad_norm": 12.24784666684616, + "learning_rate": 8.492020989345622e-06, + "loss": 0.8092, + "step": 3402 + }, + { + "epoch": 0.28, + "grad_norm": 2.245527474555863, + "learning_rate": 8.491079464642134e-06, + "loss": 0.7113, + "step": 3403 + }, + { + "epoch": 0.28, + "grad_norm": 3.6502809284815516, + "learning_rate": 8.490137698333969e-06, + "loss": 0.6906, + "step": 3404 + }, + { + "epoch": 0.28, + "grad_norm": 6.74085520338024, + "learning_rate": 8.489195690486296e-06, + "loss": 0.7697, + "step": 3405 + }, + { + "epoch": 0.28, + "grad_norm": 2.92429672890809, + "learning_rate": 8.488253441164313e-06, + "loss": 0.7274, + "step": 3406 + }, + { + "epoch": 0.28, + "grad_norm": 2.779385686596072, + "learning_rate": 8.48731095043323e-06, + "loss": 0.6477, + "step": 3407 + }, + { + "epoch": 0.28, + "grad_norm": 2.5557130204025778, + "learning_rate": 8.486368218358268e-06, + "loss": 0.7512, + "step": 3408 + }, + { + "epoch": 0.28, + "grad_norm": 3.716731033773124, + "learning_rate": 8.485425245004675e-06, + "loss": 0.7646, + "step": 3409 + }, + { + "epoch": 0.28, + "grad_norm": 4.414773653984684, + "learning_rate": 8.484482030437708e-06, + "loss": 0.7015, + "step": 3410 + }, + { + "epoch": 0.28, + "grad_norm": 5.932430800896966, + "learning_rate": 8.483538574722648e-06, + "loss": 0.6358, + "step": 3411 + }, + { + "epoch": 0.28, + "grad_norm": 3.6415645027039543, + "learning_rate": 8.482594877924779e-06, + "loss": 0.7446, + "step": 3412 + }, + { + "epoch": 0.28, + "grad_norm": 18.0612837191058, + "learning_rate": 8.481650940109419e-06, + "loss": 0.7081, + "step": 3413 + }, + { + "epoch": 0.28, + "grad_norm": 3.162714756439013, + "learning_rate": 8.480706761341893e-06, + "loss": 0.839, + "step": 3414 + }, + { + "epoch": 0.28, + "grad_norm": 3.2378503097262574, + "learning_rate": 8.47976234168754e-06, + "loss": 0.8578, + "step": 3415 + }, + { + "epoch": 0.28, + "grad_norm": 4.702868047060205, + "learning_rate": 8.478817681211724e-06, + "loss": 0.6566, + "step": 3416 + }, + { + "epoch": 0.28, + "grad_norm": 2.928765837017126, + "learning_rate": 8.47787277997982e-06, + "loss": 0.69, + "step": 3417 + }, + { + "epoch": 0.28, + "grad_norm": 4.086795076167226, + "learning_rate": 8.476927638057221e-06, + "loss": 0.7978, + "step": 3418 + }, + { + "epoch": 0.28, + "grad_norm": 6.090646742371641, + "learning_rate": 8.475982255509336e-06, + "loss": 0.8555, + "step": 3419 + }, + { + "epoch": 0.28, + "grad_norm": 8.08169650251828, + "learning_rate": 8.475036632401594e-06, + "loss": 0.6639, + "step": 3420 + }, + { + "epoch": 0.28, + "grad_norm": 2.7400709623157193, + "learning_rate": 8.474090768799436e-06, + "loss": 0.6374, + "step": 3421 + }, + { + "epoch": 0.28, + "grad_norm": 5.882548233665905, + "learning_rate": 8.473144664768322e-06, + "loss": 0.7037, + "step": 3422 + }, + { + "epoch": 0.28, + "grad_norm": 4.758768983190327, + "learning_rate": 8.472198320373729e-06, + "loss": 0.7754, + "step": 3423 + }, + { + "epoch": 0.28, + "grad_norm": 3.5030693413732834, + "learning_rate": 8.471251735681148e-06, + "loss": 0.5913, + "step": 3424 + }, + { + "epoch": 0.28, + "grad_norm": 4.429324157522576, + "learning_rate": 8.47030491075609e-06, + "loss": 0.7187, + "step": 3425 + }, + { + "epoch": 0.28, + "grad_norm": 3.6729377630220994, + "learning_rate": 8.46935784566408e-06, + "loss": 0.7205, + "step": 3426 + }, + { + "epoch": 0.28, + "grad_norm": 3.2308402649290424, + "learning_rate": 8.468410540470666e-06, + "loss": 0.7664, + "step": 3427 + }, + { + "epoch": 0.28, + "grad_norm": 2.888006085360701, + "learning_rate": 8.467462995241403e-06, + "loss": 0.7553, + "step": 3428 + }, + { + "epoch": 0.28, + "grad_norm": 3.0257686387451046, + "learning_rate": 8.466515210041866e-06, + "loss": 0.633, + "step": 3429 + }, + { + "epoch": 0.28, + "grad_norm": 3.124613920405623, + "learning_rate": 8.46556718493765e-06, + "loss": 0.7035, + "step": 3430 + }, + { + "epoch": 0.28, + "grad_norm": 2.644456571765262, + "learning_rate": 8.464618919994364e-06, + "loss": 0.7086, + "step": 3431 + }, + { + "epoch": 0.28, + "grad_norm": 4.331826202589411, + "learning_rate": 8.463670415277634e-06, + "loss": 0.5793, + "step": 3432 + }, + { + "epoch": 0.28, + "grad_norm": 4.691061721311305, + "learning_rate": 8.462721670853101e-06, + "loss": 0.8795, + "step": 3433 + }, + { + "epoch": 0.28, + "grad_norm": 2.348335174885549, + "learning_rate": 8.461772686786427e-06, + "loss": 0.5998, + "step": 3434 + }, + { + "epoch": 0.28, + "grad_norm": 3.1957922047266103, + "learning_rate": 8.460823463143284e-06, + "loss": 0.7169, + "step": 3435 + }, + { + "epoch": 0.28, + "grad_norm": 4.56126454463297, + "learning_rate": 8.459873999989367e-06, + "loss": 0.71, + "step": 3436 + }, + { + "epoch": 0.28, + "grad_norm": 13.590360129304072, + "learning_rate": 8.458924297390385e-06, + "loss": 0.7248, + "step": 3437 + }, + { + "epoch": 0.28, + "grad_norm": 2.7250754830610573, + "learning_rate": 8.457974355412062e-06, + "loss": 0.7403, + "step": 3438 + }, + { + "epoch": 0.28, + "grad_norm": 9.239943051830677, + "learning_rate": 8.457024174120141e-06, + "loss": 0.5732, + "step": 3439 + }, + { + "epoch": 0.28, + "grad_norm": 3.4328699534374105, + "learning_rate": 8.456073753580378e-06, + "loss": 0.6238, + "step": 3440 + }, + { + "epoch": 0.28, + "grad_norm": 4.0457530104295545, + "learning_rate": 8.455123093858551e-06, + "loss": 0.9201, + "step": 3441 + }, + { + "epoch": 0.28, + "grad_norm": 3.8638679370017464, + "learning_rate": 8.454172195020452e-06, + "loss": 0.7626, + "step": 3442 + }, + { + "epoch": 0.28, + "grad_norm": 3.08677814213671, + "learning_rate": 8.453221057131886e-06, + "loss": 0.805, + "step": 3443 + }, + { + "epoch": 0.28, + "grad_norm": 5.525897117337661, + "learning_rate": 8.45226968025868e-06, + "loss": 0.5757, + "step": 3444 + }, + { + "epoch": 0.28, + "grad_norm": 2.99680735890996, + "learning_rate": 8.451318064466676e-06, + "loss": 0.7734, + "step": 3445 + }, + { + "epoch": 0.28, + "grad_norm": 3.7724471786923566, + "learning_rate": 8.450366209821728e-06, + "loss": 0.8221, + "step": 3446 + }, + { + "epoch": 0.28, + "grad_norm": 2.9787516855280822, + "learning_rate": 8.449414116389716e-06, + "loss": 0.6826, + "step": 3447 + }, + { + "epoch": 0.28, + "grad_norm": 3.0121815343253235, + "learning_rate": 8.448461784236525e-06, + "loss": 0.7126, + "step": 3448 + }, + { + "epoch": 0.28, + "grad_norm": 3.405700738764285, + "learning_rate": 8.447509213428067e-06, + "loss": 0.7307, + "step": 3449 + }, + { + "epoch": 0.28, + "grad_norm": 3.0016034731368797, + "learning_rate": 8.446556404030263e-06, + "loss": 0.7837, + "step": 3450 + }, + { + "epoch": 0.28, + "grad_norm": 3.4388507483947346, + "learning_rate": 8.445603356109057e-06, + "loss": 0.569, + "step": 3451 + }, + { + "epoch": 0.28, + "grad_norm": 4.1548362998446215, + "learning_rate": 8.4446500697304e-06, + "loss": 0.5445, + "step": 3452 + }, + { + "epoch": 0.28, + "grad_norm": 4.007684452189185, + "learning_rate": 8.443696544960272e-06, + "loss": 0.6466, + "step": 3453 + }, + { + "epoch": 0.28, + "grad_norm": 2.3653943716866133, + "learning_rate": 8.44274278186466e-06, + "loss": 0.6692, + "step": 3454 + }, + { + "epoch": 0.28, + "grad_norm": 2.3619559048975645, + "learning_rate": 8.441788780509568e-06, + "loss": 0.7444, + "step": 3455 + }, + { + "epoch": 0.28, + "grad_norm": 2.3586144759068945, + "learning_rate": 8.44083454096102e-06, + "loss": 0.6642, + "step": 3456 + }, + { + "epoch": 0.28, + "grad_norm": 3.526775691433164, + "learning_rate": 8.43988006328506e-06, + "loss": 0.6953, + "step": 3457 + }, + { + "epoch": 0.28, + "grad_norm": 3.9035646119017198, + "learning_rate": 8.438925347547737e-06, + "loss": 0.68, + "step": 3458 + }, + { + "epoch": 0.28, + "grad_norm": 2.2702725961322923, + "learning_rate": 8.437970393815129e-06, + "loss": 0.7081, + "step": 3459 + }, + { + "epoch": 0.28, + "grad_norm": 3.208156894459663, + "learning_rate": 8.437015202153322e-06, + "loss": 0.7157, + "step": 3460 + }, + { + "epoch": 0.28, + "grad_norm": 3.6148937122543683, + "learning_rate": 8.436059772628421e-06, + "loss": 0.6189, + "step": 3461 + }, + { + "epoch": 0.28, + "grad_norm": 3.774059221482722, + "learning_rate": 8.435104105306549e-06, + "loss": 0.8104, + "step": 3462 + }, + { + "epoch": 0.28, + "grad_norm": 3.346137098837187, + "learning_rate": 8.434148200253843e-06, + "loss": 0.8152, + "step": 3463 + }, + { + "epoch": 0.28, + "grad_norm": 2.764372417391963, + "learning_rate": 8.433192057536458e-06, + "loss": 0.7842, + "step": 3464 + }, + { + "epoch": 0.28, + "grad_norm": 3.165528144141994, + "learning_rate": 8.432235677220567e-06, + "loss": 0.7955, + "step": 3465 + }, + { + "epoch": 0.28, + "grad_norm": 2.7907531631475035, + "learning_rate": 8.431279059372357e-06, + "loss": 0.801, + "step": 3466 + }, + { + "epoch": 0.28, + "grad_norm": 2.512382878932326, + "learning_rate": 8.43032220405803e-06, + "loss": 0.6152, + "step": 3467 + }, + { + "epoch": 0.28, + "grad_norm": 2.9922035044268234, + "learning_rate": 8.429365111343806e-06, + "loss": 0.608, + "step": 3468 + }, + { + "epoch": 0.28, + "grad_norm": 2.35153913387305, + "learning_rate": 8.428407781295924e-06, + "loss": 0.7296, + "step": 3469 + }, + { + "epoch": 0.28, + "grad_norm": 14.417730754359736, + "learning_rate": 8.427450213980636e-06, + "loss": 0.6743, + "step": 3470 + }, + { + "epoch": 0.28, + "grad_norm": 3.2153960200237335, + "learning_rate": 8.426492409464213e-06, + "loss": 0.6143, + "step": 3471 + }, + { + "epoch": 0.28, + "grad_norm": 3.6248165627709525, + "learning_rate": 8.42553436781294e-06, + "loss": 0.5606, + "step": 3472 + }, + { + "epoch": 0.28, + "grad_norm": 2.9303729349030707, + "learning_rate": 8.42457608909312e-06, + "loss": 0.6752, + "step": 3473 + }, + { + "epoch": 0.28, + "grad_norm": 4.333787668761645, + "learning_rate": 8.423617573371073e-06, + "loss": 0.7768, + "step": 3474 + }, + { + "epoch": 0.28, + "grad_norm": 4.843328457272623, + "learning_rate": 8.422658820713131e-06, + "loss": 0.7368, + "step": 3475 + }, + { + "epoch": 0.28, + "grad_norm": 3.1918146761477892, + "learning_rate": 8.421699831185649e-06, + "loss": 0.5924, + "step": 3476 + }, + { + "epoch": 0.28, + "grad_norm": 2.9849385817252903, + "learning_rate": 8.420740604854993e-06, + "loss": 0.6306, + "step": 3477 + }, + { + "epoch": 0.28, + "grad_norm": 2.6153714177783614, + "learning_rate": 8.419781141787549e-06, + "loss": 0.6301, + "step": 3478 + }, + { + "epoch": 0.28, + "grad_norm": 2.854108276694084, + "learning_rate": 8.418821442049716e-06, + "loss": 0.6411, + "step": 3479 + }, + { + "epoch": 0.28, + "grad_norm": 2.5389709265408014, + "learning_rate": 8.417861505707914e-06, + "loss": 0.6805, + "step": 3480 + }, + { + "epoch": 0.28, + "grad_norm": 3.151605011069831, + "learning_rate": 8.416901332828574e-06, + "loss": 0.6677, + "step": 3481 + }, + { + "epoch": 0.28, + "grad_norm": 2.6457456494684215, + "learning_rate": 8.415940923478148e-06, + "loss": 0.792, + "step": 3482 + }, + { + "epoch": 0.28, + "grad_norm": 2.3914292361238343, + "learning_rate": 8.414980277723101e-06, + "loss": 0.7138, + "step": 3483 + }, + { + "epoch": 0.28, + "grad_norm": 2.4391007777872424, + "learning_rate": 8.414019395629918e-06, + "loss": 0.8203, + "step": 3484 + }, + { + "epoch": 0.28, + "grad_norm": 4.590218169483447, + "learning_rate": 8.413058277265094e-06, + "loss": 0.6714, + "step": 3485 + }, + { + "epoch": 0.28, + "grad_norm": 5.813957562117255, + "learning_rate": 8.412096922695147e-06, + "loss": 0.8835, + "step": 3486 + }, + { + "epoch": 0.28, + "grad_norm": 3.8069302518506674, + "learning_rate": 8.41113533198661e-06, + "loss": 0.6778, + "step": 3487 + }, + { + "epoch": 0.28, + "grad_norm": 3.380066090694027, + "learning_rate": 8.41017350520603e-06, + "loss": 0.7246, + "step": 3488 + }, + { + "epoch": 0.28, + "grad_norm": 3.4084201369289695, + "learning_rate": 8.40921144241997e-06, + "loss": 0.8061, + "step": 3489 + }, + { + "epoch": 0.28, + "grad_norm": 2.4492832590578675, + "learning_rate": 8.408249143695014e-06, + "loss": 0.5812, + "step": 3490 + }, + { + "epoch": 0.28, + "grad_norm": 2.1911238479461477, + "learning_rate": 8.407286609097754e-06, + "loss": 0.768, + "step": 3491 + }, + { + "epoch": 0.28, + "grad_norm": 3.8501856132564254, + "learning_rate": 8.406323838694808e-06, + "loss": 0.6398, + "step": 3492 + }, + { + "epoch": 0.28, + "grad_norm": 2.42463582869534, + "learning_rate": 8.405360832552805e-06, + "loss": 0.6164, + "step": 3493 + }, + { + "epoch": 0.28, + "grad_norm": 3.0978672820008626, + "learning_rate": 8.40439759073839e-06, + "loss": 0.7561, + "step": 3494 + }, + { + "epoch": 0.28, + "grad_norm": 2.990085745586637, + "learning_rate": 8.403434113318225e-06, + "loss": 0.5866, + "step": 3495 + }, + { + "epoch": 0.28, + "grad_norm": 3.7734092720972114, + "learning_rate": 8.40247040035899e-06, + "loss": 0.7322, + "step": 3496 + }, + { + "epoch": 0.28, + "grad_norm": 3.327528314580933, + "learning_rate": 8.401506451927382e-06, + "loss": 0.6608, + "step": 3497 + }, + { + "epoch": 0.28, + "grad_norm": 4.689073563397524, + "learning_rate": 8.400542268090106e-06, + "loss": 0.7661, + "step": 3498 + }, + { + "epoch": 0.28, + "grad_norm": 2.7070712671273958, + "learning_rate": 8.399577848913896e-06, + "loss": 0.659, + "step": 3499 + }, + { + "epoch": 0.28, + "grad_norm": 6.99560555668214, + "learning_rate": 8.398613194465492e-06, + "loss": 0.6466, + "step": 3500 + }, + { + "epoch": 0.28, + "grad_norm": 4.226474068478074, + "learning_rate": 8.397648304811657e-06, + "loss": 0.7224, + "step": 3501 + }, + { + "epoch": 0.28, + "grad_norm": 6.306901945693885, + "learning_rate": 8.396683180019166e-06, + "loss": 0.8435, + "step": 3502 + }, + { + "epoch": 0.28, + "grad_norm": 5.61944085955927, + "learning_rate": 8.39571782015481e-06, + "loss": 0.7978, + "step": 3503 + }, + { + "epoch": 0.28, + "grad_norm": 3.453655124080758, + "learning_rate": 8.3947522252854e-06, + "loss": 0.721, + "step": 3504 + }, + { + "epoch": 0.28, + "grad_norm": 8.422867185274093, + "learning_rate": 8.393786395477761e-06, + "loss": 0.7179, + "step": 3505 + }, + { + "epoch": 0.28, + "grad_norm": 5.295405306358807, + "learning_rate": 8.392820330798734e-06, + "loss": 0.6321, + "step": 3506 + }, + { + "epoch": 0.28, + "grad_norm": 3.612987800486962, + "learning_rate": 8.391854031315178e-06, + "loss": 0.7428, + "step": 3507 + }, + { + "epoch": 0.28, + "grad_norm": 10.884359010582811, + "learning_rate": 8.390887497093968e-06, + "loss": 0.8065, + "step": 3508 + }, + { + "epoch": 0.29, + "grad_norm": 5.069873249800019, + "learning_rate": 8.38992072820199e-06, + "loss": 0.7499, + "step": 3509 + }, + { + "epoch": 0.29, + "grad_norm": 6.917081130488773, + "learning_rate": 8.388953724706152e-06, + "loss": 0.6734, + "step": 3510 + }, + { + "epoch": 0.29, + "grad_norm": 4.403011702490444, + "learning_rate": 8.387986486673381e-06, + "loss": 0.7111, + "step": 3511 + }, + { + "epoch": 0.29, + "grad_norm": 3.6740319296504635, + "learning_rate": 8.38701901417061e-06, + "loss": 0.4359, + "step": 3512 + }, + { + "epoch": 0.29, + "grad_norm": 7.450811376950235, + "learning_rate": 8.386051307264798e-06, + "loss": 0.7917, + "step": 3513 + }, + { + "epoch": 0.29, + "grad_norm": 3.6074589865262108, + "learning_rate": 8.385083366022914e-06, + "loss": 0.7174, + "step": 3514 + }, + { + "epoch": 0.29, + "grad_norm": 3.1477663539696805, + "learning_rate": 8.384115190511948e-06, + "loss": 0.8458, + "step": 3515 + }, + { + "epoch": 0.29, + "grad_norm": 4.563630410783101, + "learning_rate": 8.383146780798901e-06, + "loss": 0.8753, + "step": 3516 + }, + { + "epoch": 0.29, + "grad_norm": 5.697895793470481, + "learning_rate": 8.382178136950796e-06, + "loss": 0.7696, + "step": 3517 + }, + { + "epoch": 0.29, + "grad_norm": 3.869601461331248, + "learning_rate": 8.381209259034668e-06, + "loss": 0.7951, + "step": 3518 + }, + { + "epoch": 0.29, + "grad_norm": 3.296027703116834, + "learning_rate": 8.380240147117569e-06, + "loss": 0.5266, + "step": 3519 + }, + { + "epoch": 0.29, + "grad_norm": 2.679296448401381, + "learning_rate": 8.379270801266569e-06, + "loss": 0.6947, + "step": 3520 + }, + { + "epoch": 0.29, + "grad_norm": 12.013200752045412, + "learning_rate": 8.37830122154875e-06, + "loss": 0.5685, + "step": 3521 + }, + { + "epoch": 0.29, + "grad_norm": 4.724271009762378, + "learning_rate": 8.377331408031216e-06, + "loss": 0.6585, + "step": 3522 + }, + { + "epoch": 0.29, + "grad_norm": 4.009187488614058, + "learning_rate": 8.376361360781083e-06, + "loss": 0.8104, + "step": 3523 + }, + { + "epoch": 0.29, + "grad_norm": 4.097410112926117, + "learning_rate": 8.375391079865485e-06, + "loss": 0.6784, + "step": 3524 + }, + { + "epoch": 0.29, + "grad_norm": 2.324946916563888, + "learning_rate": 8.37442056535157e-06, + "loss": 0.7041, + "step": 3525 + }, + { + "epoch": 0.29, + "grad_norm": 3.9475960587596277, + "learning_rate": 8.373449817306505e-06, + "loss": 0.7196, + "step": 3526 + }, + { + "epoch": 0.29, + "grad_norm": 3.95878259928452, + "learning_rate": 8.372478835797473e-06, + "loss": 0.874, + "step": 3527 + }, + { + "epoch": 0.29, + "grad_norm": 2.643227518871769, + "learning_rate": 8.37150762089167e-06, + "loss": 0.7626, + "step": 3528 + }, + { + "epoch": 0.29, + "grad_norm": 3.6363233498330807, + "learning_rate": 8.37053617265631e-06, + "loss": 0.8392, + "step": 3529 + }, + { + "epoch": 0.29, + "grad_norm": 4.074460069516078, + "learning_rate": 8.369564491158626e-06, + "loss": 0.5997, + "step": 3530 + }, + { + "epoch": 0.29, + "grad_norm": 3.1682130922620364, + "learning_rate": 8.368592576465861e-06, + "loss": 0.7175, + "step": 3531 + }, + { + "epoch": 0.29, + "grad_norm": 5.657803736000153, + "learning_rate": 8.367620428645281e-06, + "loss": 0.8291, + "step": 3532 + }, + { + "epoch": 0.29, + "grad_norm": 3.940670943290156, + "learning_rate": 8.366648047764161e-06, + "loss": 0.7834, + "step": 3533 + }, + { + "epoch": 0.29, + "grad_norm": 3.3124287401586177, + "learning_rate": 8.3656754338898e-06, + "loss": 0.8664, + "step": 3534 + }, + { + "epoch": 0.29, + "grad_norm": 2.353958015493078, + "learning_rate": 8.364702587089503e-06, + "loss": 0.6858, + "step": 3535 + }, + { + "epoch": 0.29, + "grad_norm": 3.6547224889261156, + "learning_rate": 8.363729507430605e-06, + "loss": 0.6812, + "step": 3536 + }, + { + "epoch": 0.29, + "grad_norm": 3.634120295835721, + "learning_rate": 8.362756194980444e-06, + "loss": 0.7321, + "step": 3537 + }, + { + "epoch": 0.29, + "grad_norm": 2.686769946884574, + "learning_rate": 8.36178264980638e-06, + "loss": 0.6026, + "step": 3538 + }, + { + "epoch": 0.29, + "grad_norm": 4.572798911309554, + "learning_rate": 8.36080887197579e-06, + "loss": 0.7351, + "step": 3539 + }, + { + "epoch": 0.29, + "grad_norm": 4.386834462797566, + "learning_rate": 8.359834861556066e-06, + "loss": 0.8774, + "step": 3540 + }, + { + "epoch": 0.29, + "grad_norm": 4.698203621612668, + "learning_rate": 8.358860618614612e-06, + "loss": 0.7425, + "step": 3541 + }, + { + "epoch": 0.29, + "grad_norm": 5.993951310822244, + "learning_rate": 8.357886143218855e-06, + "loss": 0.7102, + "step": 3542 + }, + { + "epoch": 0.29, + "grad_norm": 2.3723043035676974, + "learning_rate": 8.356911435436234e-06, + "loss": 0.8388, + "step": 3543 + }, + { + "epoch": 0.29, + "grad_norm": 7.866736365019808, + "learning_rate": 8.355936495334204e-06, + "loss": 0.7495, + "step": 3544 + }, + { + "epoch": 0.29, + "grad_norm": 4.190351196990587, + "learning_rate": 8.35496132298024e-06, + "loss": 0.781, + "step": 3545 + }, + { + "epoch": 0.29, + "grad_norm": 2.7827325543577834, + "learning_rate": 8.353985918441825e-06, + "loss": 0.9104, + "step": 3546 + }, + { + "epoch": 0.29, + "grad_norm": 2.9659921774841074, + "learning_rate": 8.353010281786467e-06, + "loss": 0.6534, + "step": 3547 + }, + { + "epoch": 0.29, + "grad_norm": 3.4660431004813876, + "learning_rate": 8.352034413081687e-06, + "loss": 0.7023, + "step": 3548 + }, + { + "epoch": 0.29, + "grad_norm": 4.3956004608456976, + "learning_rate": 8.351058312395018e-06, + "loss": 0.745, + "step": 3549 + }, + { + "epoch": 0.29, + "grad_norm": 3.2721546593834083, + "learning_rate": 8.350081979794013e-06, + "loss": 0.8692, + "step": 3550 + }, + { + "epoch": 0.29, + "grad_norm": 2.6106291170866154, + "learning_rate": 8.349105415346241e-06, + "loss": 0.8022, + "step": 3551 + }, + { + "epoch": 0.29, + "grad_norm": 3.754286301608582, + "learning_rate": 8.348128619119287e-06, + "loss": 0.6679, + "step": 3552 + }, + { + "epoch": 0.29, + "grad_norm": 2.832642617623086, + "learning_rate": 8.347151591180753e-06, + "loss": 0.5043, + "step": 3553 + }, + { + "epoch": 0.29, + "grad_norm": 5.473816912274864, + "learning_rate": 8.346174331598251e-06, + "loss": 0.7565, + "step": 3554 + }, + { + "epoch": 0.29, + "grad_norm": 6.696370863529595, + "learning_rate": 8.345196840439418e-06, + "loss": 0.9184, + "step": 3555 + }, + { + "epoch": 0.29, + "grad_norm": 5.4155927023771335, + "learning_rate": 8.344219117771899e-06, + "loss": 0.9407, + "step": 3556 + }, + { + "epoch": 0.29, + "grad_norm": 3.9019251848703753, + "learning_rate": 8.343241163663361e-06, + "loss": 0.6713, + "step": 3557 + }, + { + "epoch": 0.29, + "grad_norm": 3.685186102235513, + "learning_rate": 8.342262978181482e-06, + "loss": 0.6528, + "step": 3558 + }, + { + "epoch": 0.29, + "grad_norm": 3.9003490148262694, + "learning_rate": 8.341284561393961e-06, + "loss": 0.8707, + "step": 3559 + }, + { + "epoch": 0.29, + "grad_norm": 6.101881252429892, + "learning_rate": 8.340305913368511e-06, + "loss": 0.7126, + "step": 3560 + }, + { + "epoch": 0.29, + "grad_norm": 3.0557781741379557, + "learning_rate": 8.339327034172859e-06, + "loss": 0.6309, + "step": 3561 + }, + { + "epoch": 0.29, + "grad_norm": 2.7543765839910184, + "learning_rate": 8.33834792387475e-06, + "loss": 0.6994, + "step": 3562 + }, + { + "epoch": 0.29, + "grad_norm": 4.681327631691478, + "learning_rate": 8.337368582541944e-06, + "loss": 0.6937, + "step": 3563 + }, + { + "epoch": 0.29, + "grad_norm": 4.2397481421777945, + "learning_rate": 8.33638901024222e-06, + "loss": 0.8142, + "step": 3564 + }, + { + "epoch": 0.29, + "grad_norm": 4.177156794099068, + "learning_rate": 8.335409207043366e-06, + "loss": 0.7458, + "step": 3565 + }, + { + "epoch": 0.29, + "grad_norm": 4.515968050680153, + "learning_rate": 8.334429173013197e-06, + "loss": 0.7827, + "step": 3566 + }, + { + "epoch": 0.29, + "grad_norm": 3.3611763736210274, + "learning_rate": 8.333448908219531e-06, + "loss": 0.5784, + "step": 3567 + }, + { + "epoch": 0.29, + "grad_norm": 2.627453403537669, + "learning_rate": 8.332468412730213e-06, + "loss": 0.797, + "step": 3568 + }, + { + "epoch": 0.29, + "grad_norm": 4.92414908826998, + "learning_rate": 8.331487686613097e-06, + "loss": 0.6804, + "step": 3569 + }, + { + "epoch": 0.29, + "grad_norm": 2.487233066374833, + "learning_rate": 8.330506729936057e-06, + "loss": 0.6234, + "step": 3570 + }, + { + "epoch": 0.29, + "grad_norm": 3.354856210777375, + "learning_rate": 8.32952554276698e-06, + "loss": 0.646, + "step": 3571 + }, + { + "epoch": 0.29, + "grad_norm": 2.809882085875991, + "learning_rate": 8.328544125173772e-06, + "loss": 0.8571, + "step": 3572 + }, + { + "epoch": 0.29, + "grad_norm": 5.669261037920729, + "learning_rate": 8.327562477224352e-06, + "loss": 0.6522, + "step": 3573 + }, + { + "epoch": 0.29, + "grad_norm": 7.422766432934572, + "learning_rate": 8.326580598986656e-06, + "loss": 0.7032, + "step": 3574 + }, + { + "epoch": 0.29, + "grad_norm": 9.040526061350263, + "learning_rate": 8.325598490528636e-06, + "loss": 0.6551, + "step": 3575 + }, + { + "epoch": 0.29, + "grad_norm": 8.502540473610582, + "learning_rate": 8.324616151918263e-06, + "loss": 0.822, + "step": 3576 + }, + { + "epoch": 0.29, + "grad_norm": 4.9695111393128055, + "learning_rate": 8.323633583223516e-06, + "loss": 0.6965, + "step": 3577 + }, + { + "epoch": 0.29, + "grad_norm": 3.7476995893291107, + "learning_rate": 8.3226507845124e-06, + "loss": 0.9427, + "step": 3578 + }, + { + "epoch": 0.29, + "grad_norm": 6.3133010568439065, + "learning_rate": 8.321667755852927e-06, + "loss": 0.7375, + "step": 3579 + }, + { + "epoch": 0.29, + "grad_norm": 14.596130966979175, + "learning_rate": 8.320684497313131e-06, + "loss": 0.8515, + "step": 3580 + }, + { + "epoch": 0.29, + "grad_norm": 4.48868441617736, + "learning_rate": 8.319701008961058e-06, + "loss": 0.7092, + "step": 3581 + }, + { + "epoch": 0.29, + "grad_norm": 3.261924824907615, + "learning_rate": 8.318717290864775e-06, + "loss": 0.7307, + "step": 3582 + }, + { + "epoch": 0.29, + "grad_norm": 5.002244261221541, + "learning_rate": 8.317733343092357e-06, + "loss": 0.6581, + "step": 3583 + }, + { + "epoch": 0.29, + "grad_norm": 3.1779613490746215, + "learning_rate": 8.316749165711903e-06, + "loss": 0.749, + "step": 3584 + }, + { + "epoch": 0.29, + "grad_norm": 4.273774017189862, + "learning_rate": 8.315764758791522e-06, + "loss": 0.6508, + "step": 3585 + }, + { + "epoch": 0.29, + "grad_norm": 4.596467257361083, + "learning_rate": 8.314780122399341e-06, + "loss": 0.7284, + "step": 3586 + }, + { + "epoch": 0.29, + "grad_norm": 4.8650311340865615, + "learning_rate": 8.313795256603505e-06, + "loss": 0.6657, + "step": 3587 + }, + { + "epoch": 0.29, + "grad_norm": 58.183853366780966, + "learning_rate": 8.312810161472173e-06, + "loss": 0.6059, + "step": 3588 + }, + { + "epoch": 0.29, + "grad_norm": 4.998058633242659, + "learning_rate": 8.311824837073517e-06, + "loss": 0.597, + "step": 3589 + }, + { + "epoch": 0.29, + "grad_norm": 4.9324808993077855, + "learning_rate": 8.31083928347573e-06, + "loss": 0.778, + "step": 3590 + }, + { + "epoch": 0.29, + "grad_norm": 8.116975026108058, + "learning_rate": 8.309853500747016e-06, + "loss": 0.6889, + "step": 3591 + }, + { + "epoch": 0.29, + "grad_norm": 3.6743642447769718, + "learning_rate": 8.308867488955602e-06, + "loss": 0.7925, + "step": 3592 + }, + { + "epoch": 0.29, + "grad_norm": 3.1179402791073247, + "learning_rate": 8.307881248169722e-06, + "loss": 0.6022, + "step": 3593 + }, + { + "epoch": 0.29, + "grad_norm": 9.107621761143436, + "learning_rate": 8.306894778457631e-06, + "loss": 0.789, + "step": 3594 + }, + { + "epoch": 0.29, + "grad_norm": 6.1421653345043365, + "learning_rate": 8.3059080798876e-06, + "loss": 0.8314, + "step": 3595 + }, + { + "epoch": 0.29, + "grad_norm": 5.756707212595984, + "learning_rate": 8.304921152527915e-06, + "loss": 0.7591, + "step": 3596 + }, + { + "epoch": 0.29, + "grad_norm": 4.267457112052182, + "learning_rate": 8.303933996446876e-06, + "loss": 0.7158, + "step": 3597 + }, + { + "epoch": 0.29, + "grad_norm": 3.8107606900174447, + "learning_rate": 8.3029466117128e-06, + "loss": 0.668, + "step": 3598 + }, + { + "epoch": 0.29, + "grad_norm": 4.7362959449697355, + "learning_rate": 8.301958998394021e-06, + "loss": 0.589, + "step": 3599 + }, + { + "epoch": 0.29, + "grad_norm": 3.2649192657070283, + "learning_rate": 8.300971156558892e-06, + "loss": 0.8364, + "step": 3600 + }, + { + "epoch": 0.29, + "grad_norm": 4.720488068861202, + "learning_rate": 8.299983086275773e-06, + "loss": 0.8166, + "step": 3601 + }, + { + "epoch": 0.29, + "grad_norm": 16.892930927665617, + "learning_rate": 8.298994787613044e-06, + "loss": 0.6964, + "step": 3602 + }, + { + "epoch": 0.29, + "grad_norm": 4.861898063963509, + "learning_rate": 8.298006260639106e-06, + "loss": 0.7707, + "step": 3603 + }, + { + "epoch": 0.29, + "grad_norm": 5.178656987316199, + "learning_rate": 8.297017505422366e-06, + "loss": 0.7489, + "step": 3604 + }, + { + "epoch": 0.29, + "grad_norm": 4.4784671865672765, + "learning_rate": 8.296028522031257e-06, + "loss": 0.7794, + "step": 3605 + }, + { + "epoch": 0.29, + "grad_norm": 9.668476136119686, + "learning_rate": 8.295039310534221e-06, + "loss": 0.7147, + "step": 3606 + }, + { + "epoch": 0.29, + "grad_norm": 5.4932237648156725, + "learning_rate": 8.294049870999717e-06, + "loss": 0.7207, + "step": 3607 + }, + { + "epoch": 0.29, + "grad_norm": 4.737710658658471, + "learning_rate": 8.293060203496219e-06, + "loss": 0.6734, + "step": 3608 + }, + { + "epoch": 0.29, + "grad_norm": 5.90600443219457, + "learning_rate": 8.292070308092223e-06, + "loss": 0.7794, + "step": 3609 + }, + { + "epoch": 0.29, + "grad_norm": 10.638777054584876, + "learning_rate": 8.291080184856231e-06, + "loss": 0.6812, + "step": 3610 + }, + { + "epoch": 0.29, + "grad_norm": 14.256564123422592, + "learning_rate": 8.290089833856769e-06, + "loss": 0.834, + "step": 3611 + }, + { + "epoch": 0.29, + "grad_norm": 11.990025629425269, + "learning_rate": 8.289099255162374e-06, + "loss": 0.7073, + "step": 3612 + }, + { + "epoch": 0.29, + "grad_norm": 3.5422846125454366, + "learning_rate": 8.288108448841601e-06, + "loss": 0.678, + "step": 3613 + }, + { + "epoch": 0.29, + "grad_norm": 4.150900741587477, + "learning_rate": 8.287117414963019e-06, + "loss": 0.7254, + "step": 3614 + }, + { + "epoch": 0.29, + "grad_norm": 7.258771600795925, + "learning_rate": 8.286126153595213e-06, + "loss": 0.6764, + "step": 3615 + }, + { + "epoch": 0.29, + "grad_norm": 3.22900983739793, + "learning_rate": 8.285134664806788e-06, + "loss": 0.7984, + "step": 3616 + }, + { + "epoch": 0.29, + "grad_norm": 13.989358835704389, + "learning_rate": 8.284142948666361e-06, + "loss": 0.5343, + "step": 3617 + }, + { + "epoch": 0.29, + "grad_norm": 2.885262362128362, + "learning_rate": 8.28315100524256e-06, + "loss": 0.7728, + "step": 3618 + }, + { + "epoch": 0.29, + "grad_norm": 15.38925584885041, + "learning_rate": 8.28215883460404e-06, + "loss": 0.737, + "step": 3619 + }, + { + "epoch": 0.29, + "grad_norm": 8.323833041225676, + "learning_rate": 8.281166436819458e-06, + "loss": 0.6107, + "step": 3620 + }, + { + "epoch": 0.29, + "grad_norm": 3.342949124390851, + "learning_rate": 8.280173811957503e-06, + "loss": 0.7527, + "step": 3621 + }, + { + "epoch": 0.29, + "grad_norm": 16.728989660947818, + "learning_rate": 8.279180960086866e-06, + "loss": 0.7969, + "step": 3622 + }, + { + "epoch": 0.29, + "grad_norm": 13.372498276211944, + "learning_rate": 8.278187881276257e-06, + "loss": 0.9697, + "step": 3623 + }, + { + "epoch": 0.29, + "grad_norm": 3.4951571916636768, + "learning_rate": 8.277194575594407e-06, + "loss": 0.7804, + "step": 3624 + }, + { + "epoch": 0.29, + "grad_norm": 4.407764803027612, + "learning_rate": 8.276201043110057e-06, + "loss": 0.6956, + "step": 3625 + }, + { + "epoch": 0.29, + "grad_norm": 3.1558749371758874, + "learning_rate": 8.275207283891967e-06, + "loss": 0.7098, + "step": 3626 + }, + { + "epoch": 0.29, + "grad_norm": 4.1912222347567685, + "learning_rate": 8.274213298008908e-06, + "loss": 0.803, + "step": 3627 + }, + { + "epoch": 0.29, + "grad_norm": 2.861561582272143, + "learning_rate": 8.273219085529676e-06, + "loss": 0.7111, + "step": 3628 + }, + { + "epoch": 0.29, + "grad_norm": 5.90546243405222, + "learning_rate": 8.272224646523072e-06, + "loss": 0.7486, + "step": 3629 + }, + { + "epoch": 0.29, + "grad_norm": 28.710477893550134, + "learning_rate": 8.271229981057917e-06, + "loss": 0.7903, + "step": 3630 + }, + { + "epoch": 0.29, + "grad_norm": 9.61738656263696, + "learning_rate": 8.270235089203052e-06, + "loss": 0.7065, + "step": 3631 + }, + { + "epoch": 0.29, + "grad_norm": 4.330971256643809, + "learning_rate": 8.269239971027328e-06, + "loss": 0.7236, + "step": 3632 + }, + { + "epoch": 0.3, + "grad_norm": 10.476720229592933, + "learning_rate": 8.268244626599613e-06, + "loss": 0.6467, + "step": 3633 + }, + { + "epoch": 0.3, + "grad_norm": 7.134023231459432, + "learning_rate": 8.267249055988788e-06, + "loss": 0.6713, + "step": 3634 + }, + { + "epoch": 0.3, + "grad_norm": 4.755786462906381, + "learning_rate": 8.266253259263758e-06, + "loss": 0.6968, + "step": 3635 + }, + { + "epoch": 0.3, + "grad_norm": 4.110370260727674, + "learning_rate": 8.26525723649344e-06, + "loss": 0.6632, + "step": 3636 + }, + { + "epoch": 0.3, + "grad_norm": 6.127947200354974, + "learning_rate": 8.264260987746757e-06, + "loss": 0.9702, + "step": 3637 + }, + { + "epoch": 0.3, + "grad_norm": 3.457642874317659, + "learning_rate": 8.263264513092662e-06, + "loss": 0.5843, + "step": 3638 + }, + { + "epoch": 0.3, + "grad_norm": 3.0145808515627945, + "learning_rate": 8.262267812600116e-06, + "loss": 0.7334, + "step": 3639 + }, + { + "epoch": 0.3, + "grad_norm": 4.637743016152468, + "learning_rate": 8.261270886338095e-06, + "loss": 0.7659, + "step": 3640 + }, + { + "epoch": 0.3, + "grad_norm": 4.389122136583996, + "learning_rate": 8.260273734375594e-06, + "loss": 0.7615, + "step": 3641 + }, + { + "epoch": 0.3, + "grad_norm": 4.0054671638627735, + "learning_rate": 8.259276356781624e-06, + "loss": 0.6937, + "step": 3642 + }, + { + "epoch": 0.3, + "grad_norm": 3.85937232364193, + "learning_rate": 8.258278753625207e-06, + "loss": 0.6817, + "step": 3643 + }, + { + "epoch": 0.3, + "grad_norm": 3.6184458301138047, + "learning_rate": 8.257280924975384e-06, + "loss": 0.8854, + "step": 3644 + }, + { + "epoch": 0.3, + "grad_norm": 2.040096233880686, + "learning_rate": 8.25628287090121e-06, + "loss": 0.5259, + "step": 3645 + }, + { + "epoch": 0.3, + "grad_norm": 6.013349852507329, + "learning_rate": 8.255284591471762e-06, + "loss": 0.5649, + "step": 3646 + }, + { + "epoch": 0.3, + "grad_norm": 5.70417966570395, + "learning_rate": 8.25428608675612e-06, + "loss": 0.7666, + "step": 3647 + }, + { + "epoch": 0.3, + "grad_norm": 2.70292318410213, + "learning_rate": 8.253287356823392e-06, + "loss": 0.6846, + "step": 3648 + }, + { + "epoch": 0.3, + "grad_norm": 6.823091090790905, + "learning_rate": 8.252288401742695e-06, + "loss": 0.6188, + "step": 3649 + }, + { + "epoch": 0.3, + "grad_norm": 3.330574465843821, + "learning_rate": 8.25128922158316e-06, + "loss": 0.6335, + "step": 3650 + }, + { + "epoch": 0.3, + "grad_norm": 4.092917909725572, + "learning_rate": 8.25028981641394e-06, + "loss": 0.6873, + "step": 3651 + }, + { + "epoch": 0.3, + "grad_norm": 3.9567687453204337, + "learning_rate": 8.249290186304199e-06, + "loss": 0.6845, + "step": 3652 + }, + { + "epoch": 0.3, + "grad_norm": 5.652588686788329, + "learning_rate": 8.24829033132312e-06, + "loss": 0.7102, + "step": 3653 + }, + { + "epoch": 0.3, + "grad_norm": 16.934139524506165, + "learning_rate": 8.247290251539894e-06, + "loss": 0.843, + "step": 3654 + }, + { + "epoch": 0.3, + "grad_norm": 4.629057489008848, + "learning_rate": 8.246289947023737e-06, + "loss": 0.7287, + "step": 3655 + }, + { + "epoch": 0.3, + "grad_norm": 8.04421697646196, + "learning_rate": 8.245289417843877e-06, + "loss": 0.7892, + "step": 3656 + }, + { + "epoch": 0.3, + "grad_norm": 11.157584894799221, + "learning_rate": 8.244288664069555e-06, + "loss": 0.6364, + "step": 3657 + }, + { + "epoch": 0.3, + "grad_norm": 3.220427030017744, + "learning_rate": 8.243287685770028e-06, + "loss": 0.7461, + "step": 3658 + }, + { + "epoch": 0.3, + "grad_norm": 5.090587417819171, + "learning_rate": 8.242286483014572e-06, + "loss": 0.6714, + "step": 3659 + }, + { + "epoch": 0.3, + "grad_norm": 6.824641620491466, + "learning_rate": 8.241285055872478e-06, + "loss": 0.6592, + "step": 3660 + }, + { + "epoch": 0.3, + "grad_norm": 3.0371439600628167, + "learning_rate": 8.240283404413048e-06, + "loss": 0.5896, + "step": 3661 + }, + { + "epoch": 0.3, + "grad_norm": 7.581295379012853, + "learning_rate": 8.239281528705605e-06, + "loss": 0.7222, + "step": 3662 + }, + { + "epoch": 0.3, + "grad_norm": 4.113283987241927, + "learning_rate": 8.238279428819482e-06, + "loss": 0.8067, + "step": 3663 + }, + { + "epoch": 0.3, + "grad_norm": 4.028242828895518, + "learning_rate": 8.237277104824032e-06, + "loss": 0.7602, + "step": 3664 + }, + { + "epoch": 0.3, + "grad_norm": 2.8527376290444737, + "learning_rate": 8.236274556788626e-06, + "loss": 0.7552, + "step": 3665 + }, + { + "epoch": 0.3, + "grad_norm": 2.398252215178266, + "learning_rate": 8.235271784782642e-06, + "loss": 0.5816, + "step": 3666 + }, + { + "epoch": 0.3, + "grad_norm": 13.798165023567215, + "learning_rate": 8.23426878887548e-06, + "loss": 0.7432, + "step": 3667 + }, + { + "epoch": 0.3, + "grad_norm": 5.764702713114874, + "learning_rate": 8.233265569136552e-06, + "loss": 0.6469, + "step": 3668 + }, + { + "epoch": 0.3, + "grad_norm": 3.2083962163269004, + "learning_rate": 8.232262125635288e-06, + "loss": 0.7711, + "step": 3669 + }, + { + "epoch": 0.3, + "grad_norm": 13.405610019275006, + "learning_rate": 8.231258458441135e-06, + "loss": 0.8616, + "step": 3670 + }, + { + "epoch": 0.3, + "grad_norm": 4.003424065008685, + "learning_rate": 8.230254567623548e-06, + "loss": 0.6992, + "step": 3671 + }, + { + "epoch": 0.3, + "grad_norm": 6.808840152647502, + "learning_rate": 8.229250453252008e-06, + "loss": 0.7909, + "step": 3672 + }, + { + "epoch": 0.3, + "grad_norm": 27.671924702286308, + "learning_rate": 8.228246115396004e-06, + "loss": 0.5618, + "step": 3673 + }, + { + "epoch": 0.3, + "grad_norm": 7.469459501924244, + "learning_rate": 8.227241554125041e-06, + "loss": 0.7337, + "step": 3674 + }, + { + "epoch": 0.3, + "grad_norm": 3.8140699490300976, + "learning_rate": 8.22623676950864e-06, + "loss": 0.8273, + "step": 3675 + }, + { + "epoch": 0.3, + "grad_norm": 3.747528686031325, + "learning_rate": 8.225231761616344e-06, + "loss": 0.7381, + "step": 3676 + }, + { + "epoch": 0.3, + "grad_norm": 22.825994657233448, + "learning_rate": 8.2242265305177e-06, + "loss": 0.8461, + "step": 3677 + }, + { + "epoch": 0.3, + "grad_norm": 5.217365869318151, + "learning_rate": 8.22322107628228e-06, + "loss": 0.7227, + "step": 3678 + }, + { + "epoch": 0.3, + "grad_norm": 10.506315964182377, + "learning_rate": 8.222215398979667e-06, + "loss": 0.6625, + "step": 3679 + }, + { + "epoch": 0.3, + "grad_norm": 5.351009570919189, + "learning_rate": 8.221209498679458e-06, + "loss": 0.5191, + "step": 3680 + }, + { + "epoch": 0.3, + "grad_norm": 3.5972851340230574, + "learning_rate": 8.22020337545127e-06, + "loss": 0.7848, + "step": 3681 + }, + { + "epoch": 0.3, + "grad_norm": 3.43347290715262, + "learning_rate": 8.219197029364733e-06, + "loss": 0.7332, + "step": 3682 + }, + { + "epoch": 0.3, + "grad_norm": 6.618761929906742, + "learning_rate": 8.21819046048949e-06, + "loss": 0.8361, + "step": 3683 + }, + { + "epoch": 0.3, + "grad_norm": 3.787197114602886, + "learning_rate": 8.217183668895205e-06, + "loss": 0.64, + "step": 3684 + }, + { + "epoch": 0.3, + "grad_norm": 6.301737415393621, + "learning_rate": 8.216176654651553e-06, + "loss": 0.7579, + "step": 3685 + }, + { + "epoch": 0.3, + "grad_norm": 4.193477279991484, + "learning_rate": 8.215169417828226e-06, + "loss": 0.722, + "step": 3686 + }, + { + "epoch": 0.3, + "grad_norm": 3.155925487295511, + "learning_rate": 8.214161958494931e-06, + "loss": 0.7473, + "step": 3687 + }, + { + "epoch": 0.3, + "grad_norm": 8.361390389888049, + "learning_rate": 8.213154276721388e-06, + "loss": 0.7481, + "step": 3688 + }, + { + "epoch": 0.3, + "grad_norm": 4.862336838419223, + "learning_rate": 8.212146372577342e-06, + "loss": 0.7305, + "step": 3689 + }, + { + "epoch": 0.3, + "grad_norm": 3.4286693281464604, + "learning_rate": 8.211138246132537e-06, + "loss": 0.8738, + "step": 3690 + }, + { + "epoch": 0.3, + "grad_norm": 3.6446888336077645, + "learning_rate": 8.21012989745675e-06, + "loss": 0.8209, + "step": 3691 + }, + { + "epoch": 0.3, + "grad_norm": 4.652062931231507, + "learning_rate": 8.20912132661976e-06, + "loss": 0.8877, + "step": 3692 + }, + { + "epoch": 0.3, + "grad_norm": 4.715650488218229, + "learning_rate": 8.208112533691367e-06, + "loss": 0.7064, + "step": 3693 + }, + { + "epoch": 0.3, + "grad_norm": 4.308412838971919, + "learning_rate": 8.207103518741388e-06, + "loss": 0.801, + "step": 3694 + }, + { + "epoch": 0.3, + "grad_norm": 9.56051350880958, + "learning_rate": 8.20609428183965e-06, + "loss": 0.7208, + "step": 3695 + }, + { + "epoch": 0.3, + "grad_norm": 33.76159425314962, + "learning_rate": 8.205084823056003e-06, + "loss": 0.8033, + "step": 3696 + }, + { + "epoch": 0.3, + "grad_norm": 3.2218662127313467, + "learning_rate": 8.204075142460305e-06, + "loss": 0.8255, + "step": 3697 + }, + { + "epoch": 0.3, + "grad_norm": 3.06119888507931, + "learning_rate": 8.20306524012243e-06, + "loss": 0.6171, + "step": 3698 + }, + { + "epoch": 0.3, + "grad_norm": 3.121401973897496, + "learning_rate": 8.202055116112275e-06, + "loss": 0.7353, + "step": 3699 + }, + { + "epoch": 0.3, + "grad_norm": 4.16319907242963, + "learning_rate": 8.201044770499743e-06, + "loss": 0.6991, + "step": 3700 + }, + { + "epoch": 0.3, + "grad_norm": 3.269329925980588, + "learning_rate": 8.200034203354758e-06, + "loss": 0.771, + "step": 3701 + }, + { + "epoch": 0.3, + "grad_norm": 4.2595357695144465, + "learning_rate": 8.199023414747257e-06, + "loss": 0.6551, + "step": 3702 + }, + { + "epoch": 0.3, + "grad_norm": 16.02140458308134, + "learning_rate": 8.198012404747192e-06, + "loss": 0.5345, + "step": 3703 + }, + { + "epoch": 0.3, + "grad_norm": 3.228032581669579, + "learning_rate": 8.197001173424533e-06, + "loss": 0.6517, + "step": 3704 + }, + { + "epoch": 0.3, + "grad_norm": 3.949558875655524, + "learning_rate": 8.195989720849262e-06, + "loss": 0.8581, + "step": 3705 + }, + { + "epoch": 0.3, + "grad_norm": 5.104777399217436, + "learning_rate": 8.19497804709138e-06, + "loss": 0.6803, + "step": 3706 + }, + { + "epoch": 0.3, + "grad_norm": 11.0962878218219, + "learning_rate": 8.1939661522209e-06, + "loss": 0.6335, + "step": 3707 + }, + { + "epoch": 0.3, + "grad_norm": 3.8814262311878434, + "learning_rate": 8.192954036307849e-06, + "loss": 0.6256, + "step": 3708 + }, + { + "epoch": 0.3, + "grad_norm": 3.650576332196466, + "learning_rate": 8.191941699422276e-06, + "loss": 0.5718, + "step": 3709 + }, + { + "epoch": 0.3, + "grad_norm": 9.402264979420416, + "learning_rate": 8.19092914163424e-06, + "loss": 0.6868, + "step": 3710 + }, + { + "epoch": 0.3, + "grad_norm": 4.149492229222343, + "learning_rate": 8.189916363013815e-06, + "loss": 0.7222, + "step": 3711 + }, + { + "epoch": 0.3, + "grad_norm": 2.884462249735279, + "learning_rate": 8.188903363631092e-06, + "loss": 0.6119, + "step": 3712 + }, + { + "epoch": 0.3, + "grad_norm": 9.582224929265774, + "learning_rate": 8.187890143556178e-06, + "loss": 0.8185, + "step": 3713 + }, + { + "epoch": 0.3, + "grad_norm": 3.835798100412109, + "learning_rate": 8.186876702859192e-06, + "loss": 0.6862, + "step": 3714 + }, + { + "epoch": 0.3, + "grad_norm": 2.9582423619020948, + "learning_rate": 8.185863041610273e-06, + "loss": 0.8561, + "step": 3715 + }, + { + "epoch": 0.3, + "grad_norm": 4.441781979271604, + "learning_rate": 8.18484915987957e-06, + "loss": 0.7729, + "step": 3716 + }, + { + "epoch": 0.3, + "grad_norm": 4.968029214728309, + "learning_rate": 8.183835057737256e-06, + "loss": 0.6525, + "step": 3717 + }, + { + "epoch": 0.3, + "grad_norm": 2.5392518946270815, + "learning_rate": 8.182820735253504e-06, + "loss": 0.6627, + "step": 3718 + }, + { + "epoch": 0.3, + "grad_norm": 4.5045341141325865, + "learning_rate": 8.181806192498518e-06, + "loss": 0.8008, + "step": 3719 + }, + { + "epoch": 0.3, + "grad_norm": 3.522173499385073, + "learning_rate": 8.18079142954251e-06, + "loss": 0.7786, + "step": 3720 + }, + { + "epoch": 0.3, + "grad_norm": 4.654976094078377, + "learning_rate": 8.179776446455707e-06, + "loss": 0.658, + "step": 3721 + }, + { + "epoch": 0.3, + "grad_norm": 4.5458818280687945, + "learning_rate": 8.178761243308353e-06, + "loss": 0.7548, + "step": 3722 + }, + { + "epoch": 0.3, + "grad_norm": 3.226859149071264, + "learning_rate": 8.177745820170705e-06, + "loss": 0.6415, + "step": 3723 + }, + { + "epoch": 0.3, + "grad_norm": 4.809055215778432, + "learning_rate": 8.176730177113037e-06, + "loss": 0.6932, + "step": 3724 + }, + { + "epoch": 0.3, + "grad_norm": 4.3931875738987625, + "learning_rate": 8.175714314205639e-06, + "loss": 0.5867, + "step": 3725 + }, + { + "epoch": 0.3, + "grad_norm": 10.929005429797298, + "learning_rate": 8.174698231518813e-06, + "loss": 0.6594, + "step": 3726 + }, + { + "epoch": 0.3, + "grad_norm": 3.6819152021342068, + "learning_rate": 8.173681929122883e-06, + "loss": 0.8414, + "step": 3727 + }, + { + "epoch": 0.3, + "grad_norm": 5.620990990818821, + "learning_rate": 8.172665407088178e-06, + "loss": 0.7122, + "step": 3728 + }, + { + "epoch": 0.3, + "grad_norm": 11.723919229690786, + "learning_rate": 8.17164866548505e-06, + "loss": 0.6452, + "step": 3729 + }, + { + "epoch": 0.3, + "grad_norm": 3.666451843600515, + "learning_rate": 8.170631704383865e-06, + "loss": 0.6022, + "step": 3730 + }, + { + "epoch": 0.3, + "grad_norm": 4.915128103429705, + "learning_rate": 8.169614523855001e-06, + "loss": 0.7327, + "step": 3731 + }, + { + "epoch": 0.3, + "grad_norm": 4.190277250855935, + "learning_rate": 8.168597123968857e-06, + "loss": 0.6404, + "step": 3732 + }, + { + "epoch": 0.3, + "grad_norm": 3.551149035740589, + "learning_rate": 8.167579504795838e-06, + "loss": 0.6042, + "step": 3733 + }, + { + "epoch": 0.3, + "grad_norm": 3.7588664695071636, + "learning_rate": 8.166561666406374e-06, + "loss": 0.6908, + "step": 3734 + }, + { + "epoch": 0.3, + "grad_norm": 4.690428244975542, + "learning_rate": 8.165543608870906e-06, + "loss": 0.5072, + "step": 3735 + }, + { + "epoch": 0.3, + "grad_norm": 5.334374688163294, + "learning_rate": 8.164525332259884e-06, + "loss": 0.6226, + "step": 3736 + }, + { + "epoch": 0.3, + "grad_norm": 3.626404541421322, + "learning_rate": 8.163506836643787e-06, + "loss": 0.8602, + "step": 3737 + }, + { + "epoch": 0.3, + "grad_norm": 2.8301599530963757, + "learning_rate": 8.162488122093095e-06, + "loss": 0.6091, + "step": 3738 + }, + { + "epoch": 0.3, + "grad_norm": 4.416725221469185, + "learning_rate": 8.161469188678315e-06, + "loss": 0.5649, + "step": 3739 + }, + { + "epoch": 0.3, + "grad_norm": 2.920413777212486, + "learning_rate": 8.16045003646996e-06, + "loss": 0.6782, + "step": 3740 + }, + { + "epoch": 0.3, + "grad_norm": 14.40707580450295, + "learning_rate": 8.159430665538561e-06, + "loss": 0.9344, + "step": 3741 + }, + { + "epoch": 0.3, + "grad_norm": 4.634730500073284, + "learning_rate": 8.158411075954669e-06, + "loss": 0.7096, + "step": 3742 + }, + { + "epoch": 0.3, + "grad_norm": 5.23791572111884, + "learning_rate": 8.157391267788842e-06, + "loss": 0.7501, + "step": 3743 + }, + { + "epoch": 0.3, + "grad_norm": 3.9980562219990357, + "learning_rate": 8.15637124111166e-06, + "loss": 0.6696, + "step": 3744 + }, + { + "epoch": 0.3, + "grad_norm": 3.1546970890797126, + "learning_rate": 8.155350995993713e-06, + "loss": 0.6542, + "step": 3745 + }, + { + "epoch": 0.3, + "grad_norm": 4.397072615644264, + "learning_rate": 8.15433053250561e-06, + "loss": 0.6655, + "step": 3746 + }, + { + "epoch": 0.3, + "grad_norm": 3.309917802773555, + "learning_rate": 8.153309850717973e-06, + "loss": 0.848, + "step": 3747 + }, + { + "epoch": 0.3, + "grad_norm": 4.106108013638399, + "learning_rate": 8.152288950701437e-06, + "loss": 0.7339, + "step": 3748 + }, + { + "epoch": 0.3, + "grad_norm": 2.5119122903194873, + "learning_rate": 8.151267832526658e-06, + "loss": 0.6879, + "step": 3749 + }, + { + "epoch": 0.3, + "grad_norm": 19.681356443712104, + "learning_rate": 8.150246496264304e-06, + "loss": 0.7779, + "step": 3750 + }, + { + "epoch": 0.3, + "grad_norm": 3.1975643451556115, + "learning_rate": 8.149224941985058e-06, + "loss": 0.675, + "step": 3751 + }, + { + "epoch": 0.3, + "grad_norm": 3.8651322132765658, + "learning_rate": 8.148203169759617e-06, + "loss": 0.8486, + "step": 3752 + }, + { + "epoch": 0.3, + "grad_norm": 4.388062201438417, + "learning_rate": 8.14718117965869e-06, + "loss": 0.7546, + "step": 3753 + }, + { + "epoch": 0.3, + "grad_norm": 5.081302613724464, + "learning_rate": 8.146158971753013e-06, + "loss": 0.8501, + "step": 3754 + }, + { + "epoch": 0.3, + "grad_norm": 2.3703340241905386, + "learning_rate": 8.145136546113323e-06, + "loss": 0.5522, + "step": 3755 + }, + { + "epoch": 0.31, + "grad_norm": 2.837232088800711, + "learning_rate": 8.144113902810383e-06, + "loss": 0.73, + "step": 3756 + }, + { + "epoch": 0.31, + "grad_norm": 3.948321468695182, + "learning_rate": 8.143091041914962e-06, + "loss": 0.6876, + "step": 3757 + }, + { + "epoch": 0.31, + "grad_norm": 13.642195500446972, + "learning_rate": 8.14206796349785e-06, + "loss": 0.6109, + "step": 3758 + }, + { + "epoch": 0.31, + "grad_norm": 3.3087815498278923, + "learning_rate": 8.141044667629852e-06, + "loss": 0.5886, + "step": 3759 + }, + { + "epoch": 0.31, + "grad_norm": 3.3378795789401368, + "learning_rate": 8.140021154381786e-06, + "loss": 0.7511, + "step": 3760 + }, + { + "epoch": 0.31, + "grad_norm": 2.353272407476943, + "learning_rate": 8.138997423824483e-06, + "loss": 0.721, + "step": 3761 + }, + { + "epoch": 0.31, + "grad_norm": 2.7848418121481506, + "learning_rate": 8.137973476028795e-06, + "loss": 0.8508, + "step": 3762 + }, + { + "epoch": 0.31, + "grad_norm": 9.008549712860148, + "learning_rate": 8.136949311065583e-06, + "loss": 0.5955, + "step": 3763 + }, + { + "epoch": 0.31, + "grad_norm": 7.3202003930463135, + "learning_rate": 8.135924929005728e-06, + "loss": 0.6649, + "step": 3764 + }, + { + "epoch": 0.31, + "grad_norm": 2.779952441739666, + "learning_rate": 8.134900329920121e-06, + "loss": 0.6654, + "step": 3765 + }, + { + "epoch": 0.31, + "grad_norm": 7.615992101317586, + "learning_rate": 8.133875513879675e-06, + "loss": 0.7246, + "step": 3766 + }, + { + "epoch": 0.31, + "grad_norm": 3.1962301378940468, + "learning_rate": 8.132850480955307e-06, + "loss": 0.5659, + "step": 3767 + }, + { + "epoch": 0.31, + "grad_norm": 5.0673075297467, + "learning_rate": 8.131825231217962e-06, + "loss": 0.692, + "step": 3768 + }, + { + "epoch": 0.31, + "grad_norm": 5.093738633189581, + "learning_rate": 8.130799764738591e-06, + "loss": 0.6394, + "step": 3769 + }, + { + "epoch": 0.31, + "grad_norm": 3.546881923059605, + "learning_rate": 8.129774081588164e-06, + "loss": 0.8117, + "step": 3770 + }, + { + "epoch": 0.31, + "grad_norm": 4.468497928599216, + "learning_rate": 8.128748181837662e-06, + "loss": 0.7291, + "step": 3771 + }, + { + "epoch": 0.31, + "grad_norm": 4.426907273653148, + "learning_rate": 8.127722065558087e-06, + "loss": 0.6537, + "step": 3772 + }, + { + "epoch": 0.31, + "grad_norm": 5.037250436580407, + "learning_rate": 8.12669573282045e-06, + "loss": 0.7802, + "step": 3773 + }, + { + "epoch": 0.31, + "grad_norm": 3.3550861426249994, + "learning_rate": 8.125669183695784e-06, + "loss": 0.7178, + "step": 3774 + }, + { + "epoch": 0.31, + "grad_norm": 3.9552158515403444, + "learning_rate": 8.124642418255127e-06, + "loss": 0.7624, + "step": 3775 + }, + { + "epoch": 0.31, + "grad_norm": 4.404467342782481, + "learning_rate": 8.12361543656954e-06, + "loss": 0.7331, + "step": 3776 + }, + { + "epoch": 0.31, + "grad_norm": 3.4965353440458755, + "learning_rate": 8.122588238710098e-06, + "loss": 0.6302, + "step": 3777 + }, + { + "epoch": 0.31, + "grad_norm": 3.879138548035437, + "learning_rate": 8.121560824747889e-06, + "loss": 0.6388, + "step": 3778 + }, + { + "epoch": 0.31, + "grad_norm": 27.99905130815318, + "learning_rate": 8.120533194754015e-06, + "loss": 0.6645, + "step": 3779 + }, + { + "epoch": 0.31, + "grad_norm": 3.4101101232374416, + "learning_rate": 8.119505348799595e-06, + "loss": 0.8705, + "step": 3780 + }, + { + "epoch": 0.31, + "grad_norm": 3.2128273612029, + "learning_rate": 8.118477286955764e-06, + "loss": 0.754, + "step": 3781 + }, + { + "epoch": 0.31, + "grad_norm": 3.376333074694725, + "learning_rate": 8.117449009293668e-06, + "loss": 0.6783, + "step": 3782 + }, + { + "epoch": 0.31, + "grad_norm": 3.829956856229782, + "learning_rate": 8.116420515884473e-06, + "loss": 0.7888, + "step": 3783 + }, + { + "epoch": 0.31, + "grad_norm": 2.4042293291090195, + "learning_rate": 8.115391806799354e-06, + "loss": 0.6911, + "step": 3784 + }, + { + "epoch": 0.31, + "grad_norm": 2.9572527336676777, + "learning_rate": 8.114362882109507e-06, + "loss": 0.6015, + "step": 3785 + }, + { + "epoch": 0.31, + "grad_norm": 2.740495907573269, + "learning_rate": 8.113333741886137e-06, + "loss": 0.8441, + "step": 3786 + }, + { + "epoch": 0.31, + "grad_norm": 3.537452524642415, + "learning_rate": 8.11230438620047e-06, + "loss": 0.6159, + "step": 3787 + }, + { + "epoch": 0.31, + "grad_norm": 6.220159024018744, + "learning_rate": 8.111274815123746e-06, + "loss": 0.701, + "step": 3788 + }, + { + "epoch": 0.31, + "grad_norm": 4.220892546241309, + "learning_rate": 8.110245028727211e-06, + "loss": 0.7133, + "step": 3789 + }, + { + "epoch": 0.31, + "grad_norm": 6.438574183988354, + "learning_rate": 8.109215027082137e-06, + "loss": 0.8488, + "step": 3790 + }, + { + "epoch": 0.31, + "grad_norm": 4.320626595467487, + "learning_rate": 8.108184810259806e-06, + "loss": 0.6818, + "step": 3791 + }, + { + "epoch": 0.31, + "grad_norm": 21.409502765716667, + "learning_rate": 8.107154378331515e-06, + "loss": 0.745, + "step": 3792 + }, + { + "epoch": 0.31, + "grad_norm": 2.8557435853692428, + "learning_rate": 8.106123731368579e-06, + "loss": 0.8006, + "step": 3793 + }, + { + "epoch": 0.31, + "grad_norm": 3.9262834880074604, + "learning_rate": 8.10509286944232e-06, + "loss": 0.9271, + "step": 3794 + }, + { + "epoch": 0.31, + "grad_norm": 21.228192529370176, + "learning_rate": 8.104061792624085e-06, + "loss": 0.7205, + "step": 3795 + }, + { + "epoch": 0.31, + "grad_norm": 4.580604934591343, + "learning_rate": 8.103030500985227e-06, + "loss": 0.783, + "step": 3796 + }, + { + "epoch": 0.31, + "grad_norm": 3.712073932733611, + "learning_rate": 8.101998994597123e-06, + "loss": 0.6982, + "step": 3797 + }, + { + "epoch": 0.31, + "grad_norm": 4.201304396840697, + "learning_rate": 8.100967273531154e-06, + "loss": 0.75, + "step": 3798 + }, + { + "epoch": 0.31, + "grad_norm": 3.37374017557371, + "learning_rate": 8.099935337858726e-06, + "loss": 0.7596, + "step": 3799 + }, + { + "epoch": 0.31, + "grad_norm": 4.209357533596984, + "learning_rate": 8.098903187651252e-06, + "loss": 0.6863, + "step": 3800 + }, + { + "epoch": 0.31, + "grad_norm": 4.480290511002427, + "learning_rate": 8.097870822980166e-06, + "loss": 0.7374, + "step": 3801 + }, + { + "epoch": 0.31, + "grad_norm": 3.549953778624068, + "learning_rate": 8.096838243916916e-06, + "loss": 0.6068, + "step": 3802 + }, + { + "epoch": 0.31, + "grad_norm": 4.8415110662599625, + "learning_rate": 8.095805450532957e-06, + "loss": 0.6972, + "step": 3803 + }, + { + "epoch": 0.31, + "grad_norm": 4.551316528601511, + "learning_rate": 8.09477244289977e-06, + "loss": 0.6196, + "step": 3804 + }, + { + "epoch": 0.31, + "grad_norm": 7.3475027891572475, + "learning_rate": 8.093739221088842e-06, + "loss": 0.6489, + "step": 3805 + }, + { + "epoch": 0.31, + "grad_norm": 2.7883605417197868, + "learning_rate": 8.09270578517168e-06, + "loss": 0.8722, + "step": 3806 + }, + { + "epoch": 0.31, + "grad_norm": 2.822608041368121, + "learning_rate": 8.091672135219805e-06, + "loss": 0.7387, + "step": 3807 + }, + { + "epoch": 0.31, + "grad_norm": 3.925695166219304, + "learning_rate": 8.090638271304754e-06, + "loss": 0.6987, + "step": 3808 + }, + { + "epoch": 0.31, + "grad_norm": 3.0358414295226566, + "learning_rate": 8.08960419349807e-06, + "loss": 0.6139, + "step": 3809 + }, + { + "epoch": 0.31, + "grad_norm": 14.760050401619537, + "learning_rate": 8.088569901871325e-06, + "loss": 0.7213, + "step": 3810 + }, + { + "epoch": 0.31, + "grad_norm": 3.351132418502165, + "learning_rate": 8.087535396496093e-06, + "loss": 0.7162, + "step": 3811 + }, + { + "epoch": 0.31, + "grad_norm": 2.9944465077998217, + "learning_rate": 8.086500677443974e-06, + "loss": 0.6915, + "step": 3812 + }, + { + "epoch": 0.31, + "grad_norm": 3.17940947655128, + "learning_rate": 8.085465744786572e-06, + "loss": 0.7636, + "step": 3813 + }, + { + "epoch": 0.31, + "grad_norm": 3.0347479813539513, + "learning_rate": 8.084430598595514e-06, + "loss": 0.65, + "step": 3814 + }, + { + "epoch": 0.31, + "grad_norm": 8.387621966752047, + "learning_rate": 8.083395238942437e-06, + "loss": 0.6454, + "step": 3815 + }, + { + "epoch": 0.31, + "grad_norm": 2.5776737410641046, + "learning_rate": 8.082359665898994e-06, + "loss": 0.5806, + "step": 3816 + }, + { + "epoch": 0.31, + "grad_norm": 3.4223821327636186, + "learning_rate": 8.081323879536854e-06, + "loss": 0.7425, + "step": 3817 + }, + { + "epoch": 0.31, + "grad_norm": 3.7529775593871144, + "learning_rate": 8.0802878799277e-06, + "loss": 0.6963, + "step": 3818 + }, + { + "epoch": 0.31, + "grad_norm": 2.6405069508913654, + "learning_rate": 8.079251667143229e-06, + "loss": 0.7639, + "step": 3819 + }, + { + "epoch": 0.31, + "grad_norm": 3.637474970576809, + "learning_rate": 8.078215241255156e-06, + "loss": 0.8213, + "step": 3820 + }, + { + "epoch": 0.31, + "grad_norm": 2.4993380341655866, + "learning_rate": 8.077178602335204e-06, + "loss": 0.7573, + "step": 3821 + }, + { + "epoch": 0.31, + "grad_norm": 2.90511779672793, + "learning_rate": 8.076141750455119e-06, + "loss": 0.6998, + "step": 3822 + }, + { + "epoch": 0.31, + "grad_norm": 2.4960415586852878, + "learning_rate": 8.075104685686655e-06, + "loss": 0.6006, + "step": 3823 + }, + { + "epoch": 0.31, + "grad_norm": 4.898140379041115, + "learning_rate": 8.074067408101585e-06, + "loss": 0.7718, + "step": 3824 + }, + { + "epoch": 0.31, + "grad_norm": 4.661629721662624, + "learning_rate": 8.073029917771692e-06, + "loss": 0.7093, + "step": 3825 + }, + { + "epoch": 0.31, + "grad_norm": 2.8367074784109696, + "learning_rate": 8.071992214768783e-06, + "loss": 0.8326, + "step": 3826 + }, + { + "epoch": 0.31, + "grad_norm": 2.9893009979048797, + "learning_rate": 8.070954299164668e-06, + "loss": 0.7114, + "step": 3827 + }, + { + "epoch": 0.31, + "grad_norm": 2.7184241696259988, + "learning_rate": 8.069916171031181e-06, + "loss": 0.7541, + "step": 3828 + }, + { + "epoch": 0.31, + "grad_norm": 4.608302419770265, + "learning_rate": 8.068877830440162e-06, + "loss": 0.7252, + "step": 3829 + }, + { + "epoch": 0.31, + "grad_norm": 4.772772539385872, + "learning_rate": 8.067839277463475e-06, + "loss": 0.7514, + "step": 3830 + }, + { + "epoch": 0.31, + "grad_norm": 7.42560705038805, + "learning_rate": 8.066800512172994e-06, + "loss": 0.9019, + "step": 3831 + }, + { + "epoch": 0.31, + "grad_norm": 3.5132295504116358, + "learning_rate": 8.065761534640606e-06, + "loss": 0.7021, + "step": 3832 + }, + { + "epoch": 0.31, + "grad_norm": 7.792408664433977, + "learning_rate": 8.064722344938218e-06, + "loss": 0.5339, + "step": 3833 + }, + { + "epoch": 0.31, + "grad_norm": 3.7994299304054078, + "learning_rate": 8.063682943137745e-06, + "loss": 0.717, + "step": 3834 + }, + { + "epoch": 0.31, + "grad_norm": 3.4985436004169403, + "learning_rate": 8.062643329311123e-06, + "loss": 0.6081, + "step": 3835 + }, + { + "epoch": 0.31, + "grad_norm": 3.661006060019155, + "learning_rate": 8.061603503530298e-06, + "loss": 0.629, + "step": 3836 + }, + { + "epoch": 0.31, + "grad_norm": 4.401692786571959, + "learning_rate": 8.060563465867232e-06, + "loss": 0.6103, + "step": 3837 + }, + { + "epoch": 0.31, + "grad_norm": 3.190241383371681, + "learning_rate": 8.059523216393907e-06, + "loss": 0.5868, + "step": 3838 + }, + { + "epoch": 0.31, + "grad_norm": 7.806495671361692, + "learning_rate": 8.058482755182309e-06, + "loss": 0.6442, + "step": 3839 + }, + { + "epoch": 0.31, + "grad_norm": 4.627446392232139, + "learning_rate": 8.057442082304445e-06, + "loss": 0.6792, + "step": 3840 + }, + { + "epoch": 0.31, + "grad_norm": 3.407444953752837, + "learning_rate": 8.05640119783234e-06, + "loss": 0.7449, + "step": 3841 + }, + { + "epoch": 0.31, + "grad_norm": 2.867512216974582, + "learning_rate": 8.055360101838026e-06, + "loss": 0.657, + "step": 3842 + }, + { + "epoch": 0.31, + "grad_norm": 3.631023611547521, + "learning_rate": 8.054318794393554e-06, + "loss": 0.7608, + "step": 3843 + }, + { + "epoch": 0.31, + "grad_norm": 4.3522360411418815, + "learning_rate": 8.05327727557099e-06, + "loss": 0.6165, + "step": 3844 + }, + { + "epoch": 0.31, + "grad_norm": 6.8773670097717385, + "learning_rate": 8.052235545442416e-06, + "loss": 0.8298, + "step": 3845 + }, + { + "epoch": 0.31, + "grad_norm": 2.7943439718396124, + "learning_rate": 8.051193604079921e-06, + "loss": 0.6853, + "step": 3846 + }, + { + "epoch": 0.31, + "grad_norm": 3.2710406852298743, + "learning_rate": 8.05015145155562e-06, + "loss": 0.6216, + "step": 3847 + }, + { + "epoch": 0.31, + "grad_norm": 3.422232158520331, + "learning_rate": 8.04910908794163e-06, + "loss": 0.5977, + "step": 3848 + }, + { + "epoch": 0.31, + "grad_norm": 2.9666381291830604, + "learning_rate": 8.048066513310093e-06, + "loss": 0.6585, + "step": 3849 + }, + { + "epoch": 0.31, + "grad_norm": 2.5115687230386263, + "learning_rate": 8.047023727733162e-06, + "loss": 0.7279, + "step": 3850 + }, + { + "epoch": 0.31, + "grad_norm": 3.7143353322138837, + "learning_rate": 8.045980731283002e-06, + "loss": 0.7238, + "step": 3851 + }, + { + "epoch": 0.31, + "grad_norm": 3.2578923529505577, + "learning_rate": 8.044937524031798e-06, + "loss": 0.652, + "step": 3852 + }, + { + "epoch": 0.31, + "grad_norm": 5.020446780126338, + "learning_rate": 8.043894106051743e-06, + "loss": 0.7128, + "step": 3853 + }, + { + "epoch": 0.31, + "grad_norm": 4.1552285674975105, + "learning_rate": 8.042850477415052e-06, + "loss": 0.7723, + "step": 3854 + }, + { + "epoch": 0.31, + "grad_norm": 3.8896819488437973, + "learning_rate": 8.041806638193948e-06, + "loss": 0.8213, + "step": 3855 + }, + { + "epoch": 0.31, + "grad_norm": 2.5192806884695997, + "learning_rate": 8.04076258846067e-06, + "loss": 0.7643, + "step": 3856 + }, + { + "epoch": 0.31, + "grad_norm": 4.8767008503737435, + "learning_rate": 8.039718328287478e-06, + "loss": 0.6367, + "step": 3857 + }, + { + "epoch": 0.31, + "grad_norm": 3.7011601281168387, + "learning_rate": 8.038673857746636e-06, + "loss": 0.7832, + "step": 3858 + }, + { + "epoch": 0.31, + "grad_norm": 4.006813002241106, + "learning_rate": 8.03762917691043e-06, + "loss": 0.7053, + "step": 3859 + }, + { + "epoch": 0.31, + "grad_norm": 2.8536675590287, + "learning_rate": 8.03658428585116e-06, + "loss": 0.7103, + "step": 3860 + }, + { + "epoch": 0.31, + "grad_norm": 2.96592025655951, + "learning_rate": 8.035539184641134e-06, + "loss": 0.5904, + "step": 3861 + }, + { + "epoch": 0.31, + "grad_norm": 3.9042987888483203, + "learning_rate": 8.034493873352685e-06, + "loss": 0.6634, + "step": 3862 + }, + { + "epoch": 0.31, + "grad_norm": 2.6581721103457925, + "learning_rate": 8.033448352058155e-06, + "loss": 0.6537, + "step": 3863 + }, + { + "epoch": 0.31, + "grad_norm": 5.482733499186662, + "learning_rate": 8.032402620829895e-06, + "loss": 0.6193, + "step": 3864 + }, + { + "epoch": 0.31, + "grad_norm": 3.686880810322882, + "learning_rate": 8.031356679740283e-06, + "loss": 0.7373, + "step": 3865 + }, + { + "epoch": 0.31, + "grad_norm": 2.72885750004841, + "learning_rate": 8.030310528861703e-06, + "loss": 0.9408, + "step": 3866 + }, + { + "epoch": 0.31, + "grad_norm": 4.228982453368367, + "learning_rate": 8.02926416826655e-06, + "loss": 0.7633, + "step": 3867 + }, + { + "epoch": 0.31, + "grad_norm": 3.642871831297605, + "learning_rate": 8.028217598027247e-06, + "loss": 0.8684, + "step": 3868 + }, + { + "epoch": 0.31, + "grad_norm": 2.8614605115837417, + "learning_rate": 8.027170818216215e-06, + "loss": 0.6858, + "step": 3869 + }, + { + "epoch": 0.31, + "grad_norm": 2.4204190947417823, + "learning_rate": 8.026123828905902e-06, + "loss": 0.7924, + "step": 3870 + }, + { + "epoch": 0.31, + "grad_norm": 6.850563297247912, + "learning_rate": 8.025076630168769e-06, + "loss": 0.7901, + "step": 3871 + }, + { + "epoch": 0.31, + "grad_norm": 4.068546960885706, + "learning_rate": 8.024029222077286e-06, + "loss": 0.5381, + "step": 3872 + }, + { + "epoch": 0.31, + "grad_norm": 3.6982159833354946, + "learning_rate": 8.022981604703937e-06, + "loss": 0.7101, + "step": 3873 + }, + { + "epoch": 0.31, + "grad_norm": 4.9032663721247065, + "learning_rate": 8.021933778121227e-06, + "loss": 0.6717, + "step": 3874 + }, + { + "epoch": 0.31, + "grad_norm": 2.1326191524631852, + "learning_rate": 8.020885742401675e-06, + "loss": 0.7521, + "step": 3875 + }, + { + "epoch": 0.31, + "grad_norm": 2.922491285521148, + "learning_rate": 8.019837497617804e-06, + "loss": 0.7085, + "step": 3876 + }, + { + "epoch": 0.31, + "grad_norm": 4.339102430661286, + "learning_rate": 8.018789043842166e-06, + "loss": 0.6647, + "step": 3877 + }, + { + "epoch": 0.31, + "grad_norm": 3.7043738162032946, + "learning_rate": 8.017740381147319e-06, + "loss": 0.7106, + "step": 3878 + }, + { + "epoch": 0.32, + "grad_norm": 3.600741001391441, + "learning_rate": 8.016691509605836e-06, + "loss": 0.5931, + "step": 3879 + }, + { + "epoch": 0.32, + "grad_norm": 2.500302960993017, + "learning_rate": 8.015642429290304e-06, + "loss": 0.7905, + "step": 3880 + }, + { + "epoch": 0.32, + "grad_norm": 6.646574277323032, + "learning_rate": 8.01459314027333e-06, + "loss": 0.7212, + "step": 3881 + }, + { + "epoch": 0.32, + "grad_norm": 2.9361183762790195, + "learning_rate": 8.013543642627529e-06, + "loss": 0.7837, + "step": 3882 + }, + { + "epoch": 0.32, + "grad_norm": 2.5386458953787736, + "learning_rate": 8.012493936425532e-06, + "loss": 0.7738, + "step": 3883 + }, + { + "epoch": 0.32, + "grad_norm": 4.561581248764552, + "learning_rate": 8.011444021739986e-06, + "loss": 0.6709, + "step": 3884 + }, + { + "epoch": 0.32, + "grad_norm": 3.427316464110523, + "learning_rate": 8.010393898643555e-06, + "loss": 0.6204, + "step": 3885 + }, + { + "epoch": 0.32, + "grad_norm": 3.082382707597153, + "learning_rate": 8.009343567208909e-06, + "loss": 0.6868, + "step": 3886 + }, + { + "epoch": 0.32, + "grad_norm": 3.271984413705368, + "learning_rate": 8.00829302750874e-06, + "loss": 0.7346, + "step": 3887 + }, + { + "epoch": 0.32, + "grad_norm": 3.9674644683223046, + "learning_rate": 8.007242279615752e-06, + "loss": 0.7115, + "step": 3888 + }, + { + "epoch": 0.32, + "grad_norm": 8.355456734207701, + "learning_rate": 8.006191323602663e-06, + "loss": 0.7421, + "step": 3889 + }, + { + "epoch": 0.32, + "grad_norm": 5.644927970513903, + "learning_rate": 8.005140159542206e-06, + "loss": 0.6537, + "step": 3890 + }, + { + "epoch": 0.32, + "grad_norm": 2.740662912736704, + "learning_rate": 8.004088787507128e-06, + "loss": 0.7205, + "step": 3891 + }, + { + "epoch": 0.32, + "grad_norm": 3.245086876737402, + "learning_rate": 8.00303720757019e-06, + "loss": 0.7096, + "step": 3892 + }, + { + "epoch": 0.32, + "grad_norm": 2.6166627426331432, + "learning_rate": 8.00198541980417e-06, + "loss": 0.8382, + "step": 3893 + }, + { + "epoch": 0.32, + "grad_norm": 4.279142322343546, + "learning_rate": 8.000933424281856e-06, + "loss": 0.8966, + "step": 3894 + }, + { + "epoch": 0.32, + "grad_norm": 3.046516627699684, + "learning_rate": 7.999881221076054e-06, + "loss": 0.7695, + "step": 3895 + }, + { + "epoch": 0.32, + "grad_norm": 5.149416183217677, + "learning_rate": 7.998828810259581e-06, + "loss": 0.7099, + "step": 3896 + }, + { + "epoch": 0.32, + "grad_norm": 3.4371758680033433, + "learning_rate": 7.997776191905273e-06, + "loss": 0.6788, + "step": 3897 + }, + { + "epoch": 0.32, + "grad_norm": 3.112988680951314, + "learning_rate": 7.996723366085978e-06, + "loss": 0.7384, + "step": 3898 + }, + { + "epoch": 0.32, + "grad_norm": 2.9164236741212255, + "learning_rate": 7.995670332874556e-06, + "loss": 0.7185, + "step": 3899 + }, + { + "epoch": 0.32, + "grad_norm": 3.37605795294584, + "learning_rate": 7.994617092343885e-06, + "loss": 0.7222, + "step": 3900 + }, + { + "epoch": 0.32, + "grad_norm": 3.322943911841214, + "learning_rate": 7.993563644566856e-06, + "loss": 0.6943, + "step": 3901 + }, + { + "epoch": 0.32, + "grad_norm": 3.1120697485803515, + "learning_rate": 7.992509989616373e-06, + "loss": 0.6316, + "step": 3902 + }, + { + "epoch": 0.32, + "grad_norm": 3.4396877999425595, + "learning_rate": 7.991456127565357e-06, + "loss": 0.6001, + "step": 3903 + }, + { + "epoch": 0.32, + "grad_norm": 2.622676151672132, + "learning_rate": 7.990402058486742e-06, + "loss": 0.7524, + "step": 3904 + }, + { + "epoch": 0.32, + "grad_norm": 7.481288424645567, + "learning_rate": 7.989347782453473e-06, + "loss": 0.7567, + "step": 3905 + }, + { + "epoch": 0.32, + "grad_norm": 3.1560061965951647, + "learning_rate": 7.988293299538516e-06, + "loss": 0.6474, + "step": 3906 + }, + { + "epoch": 0.32, + "grad_norm": 3.4366162184737896, + "learning_rate": 7.987238609814848e-06, + "loss": 0.6249, + "step": 3907 + }, + { + "epoch": 0.32, + "grad_norm": 2.453829820837504, + "learning_rate": 7.986183713355458e-06, + "loss": 0.8548, + "step": 3908 + }, + { + "epoch": 0.32, + "grad_norm": 3.5821712677389512, + "learning_rate": 7.985128610233353e-06, + "loss": 0.7608, + "step": 3909 + }, + { + "epoch": 0.32, + "grad_norm": 2.454011600103875, + "learning_rate": 7.984073300521552e-06, + "loss": 0.7452, + "step": 3910 + }, + { + "epoch": 0.32, + "grad_norm": 3.600422892100495, + "learning_rate": 7.983017784293088e-06, + "loss": 0.7923, + "step": 3911 + }, + { + "epoch": 0.32, + "grad_norm": 2.6557279994683998, + "learning_rate": 7.981962061621012e-06, + "loss": 0.6576, + "step": 3912 + }, + { + "epoch": 0.32, + "grad_norm": 3.3142445035826054, + "learning_rate": 7.980906132578386e-06, + "loss": 0.6326, + "step": 3913 + }, + { + "epoch": 0.32, + "grad_norm": 5.058565457460089, + "learning_rate": 7.979849997238284e-06, + "loss": 0.6233, + "step": 3914 + }, + { + "epoch": 0.32, + "grad_norm": 3.1961231368546623, + "learning_rate": 7.978793655673803e-06, + "loss": 0.6064, + "step": 3915 + }, + { + "epoch": 0.32, + "grad_norm": 2.8958684933834777, + "learning_rate": 7.977737107958042e-06, + "loss": 0.67, + "step": 3916 + }, + { + "epoch": 0.32, + "grad_norm": 3.1444688522029964, + "learning_rate": 7.976680354164124e-06, + "loss": 0.6008, + "step": 3917 + }, + { + "epoch": 0.32, + "grad_norm": 2.6625101019817103, + "learning_rate": 7.975623394365184e-06, + "loss": 0.6148, + "step": 3918 + }, + { + "epoch": 0.32, + "grad_norm": 6.888914690748621, + "learning_rate": 7.974566228634369e-06, + "loss": 0.7404, + "step": 3919 + }, + { + "epoch": 0.32, + "grad_norm": 4.028543995166718, + "learning_rate": 7.97350885704484e-06, + "loss": 0.6674, + "step": 3920 + }, + { + "epoch": 0.32, + "grad_norm": 2.088499732839049, + "learning_rate": 7.972451279669777e-06, + "loss": 0.7586, + "step": 3921 + }, + { + "epoch": 0.32, + "grad_norm": 3.1876230287709304, + "learning_rate": 7.97139349658237e-06, + "loss": 0.6663, + "step": 3922 + }, + { + "epoch": 0.32, + "grad_norm": 5.966207897723611, + "learning_rate": 7.970335507855822e-06, + "loss": 0.7048, + "step": 3923 + }, + { + "epoch": 0.32, + "grad_norm": 3.1849210585315615, + "learning_rate": 7.969277313563354e-06, + "loss": 0.7949, + "step": 3924 + }, + { + "epoch": 0.32, + "grad_norm": 3.8554392915448648, + "learning_rate": 7.9682189137782e-06, + "loss": 0.7147, + "step": 3925 + }, + { + "epoch": 0.32, + "grad_norm": 4.814488122979218, + "learning_rate": 7.967160308573607e-06, + "loss": 0.7245, + "step": 3926 + }, + { + "epoch": 0.32, + "grad_norm": 3.2795228392151903, + "learning_rate": 7.96610149802284e-06, + "loss": 0.6609, + "step": 3927 + }, + { + "epoch": 0.32, + "grad_norm": 4.08718181667886, + "learning_rate": 7.965042482199173e-06, + "loss": 0.9151, + "step": 3928 + }, + { + "epoch": 0.32, + "grad_norm": 5.183521625048285, + "learning_rate": 7.963983261175894e-06, + "loss": 0.641, + "step": 3929 + }, + { + "epoch": 0.32, + "grad_norm": 3.4963650414362473, + "learning_rate": 7.962923835026312e-06, + "loss": 0.7218, + "step": 3930 + }, + { + "epoch": 0.32, + "grad_norm": 61.29066160922814, + "learning_rate": 7.961864203823746e-06, + "loss": 0.7766, + "step": 3931 + }, + { + "epoch": 0.32, + "grad_norm": 3.000856294908018, + "learning_rate": 7.960804367641526e-06, + "loss": 0.7444, + "step": 3932 + }, + { + "epoch": 0.32, + "grad_norm": 5.2796317153813295, + "learning_rate": 7.959744326553002e-06, + "loss": 0.7061, + "step": 3933 + }, + { + "epoch": 0.32, + "grad_norm": 2.472945957129339, + "learning_rate": 7.958684080631533e-06, + "loss": 0.6291, + "step": 3934 + }, + { + "epoch": 0.32, + "grad_norm": 3.3855962610444417, + "learning_rate": 7.957623629950498e-06, + "loss": 0.6866, + "step": 3935 + }, + { + "epoch": 0.32, + "grad_norm": 18.989269384398778, + "learning_rate": 7.956562974583284e-06, + "loss": 0.7107, + "step": 3936 + }, + { + "epoch": 0.32, + "grad_norm": 3.000022537766427, + "learning_rate": 7.955502114603296e-06, + "loss": 0.7728, + "step": 3937 + }, + { + "epoch": 0.32, + "grad_norm": 27.389408777657138, + "learning_rate": 7.954441050083954e-06, + "loss": 0.7416, + "step": 3938 + }, + { + "epoch": 0.32, + "grad_norm": 4.145808818591805, + "learning_rate": 7.953379781098686e-06, + "loss": 0.6525, + "step": 3939 + }, + { + "epoch": 0.32, + "grad_norm": 3.5067754599912964, + "learning_rate": 7.952318307720943e-06, + "loss": 0.6996, + "step": 3940 + }, + { + "epoch": 0.32, + "grad_norm": 3.0612911379596013, + "learning_rate": 7.951256630024184e-06, + "loss": 0.5761, + "step": 3941 + }, + { + "epoch": 0.32, + "grad_norm": 4.248779484097216, + "learning_rate": 7.950194748081882e-06, + "loss": 0.8271, + "step": 3942 + }, + { + "epoch": 0.32, + "grad_norm": 3.7779819140901467, + "learning_rate": 7.94913266196753e-06, + "loss": 0.8272, + "step": 3943 + }, + { + "epoch": 0.32, + "grad_norm": 7.0151021506094455, + "learning_rate": 7.948070371754626e-06, + "loss": 0.7065, + "step": 3944 + }, + { + "epoch": 0.32, + "grad_norm": 4.2031065845617865, + "learning_rate": 7.94700787751669e-06, + "loss": 0.6629, + "step": 3945 + }, + { + "epoch": 0.32, + "grad_norm": 4.450483734856019, + "learning_rate": 7.945945179327252e-06, + "loss": 0.9009, + "step": 3946 + }, + { + "epoch": 0.32, + "grad_norm": 6.079892970279501, + "learning_rate": 7.94488227725986e-06, + "loss": 0.639, + "step": 3947 + }, + { + "epoch": 0.32, + "grad_norm": 3.911611484615307, + "learning_rate": 7.943819171388073e-06, + "loss": 0.7575, + "step": 3948 + }, + { + "epoch": 0.32, + "grad_norm": 3.0008004807524165, + "learning_rate": 7.942755861785462e-06, + "loss": 0.7012, + "step": 3949 + }, + { + "epoch": 0.32, + "grad_norm": 9.559793246786608, + "learning_rate": 7.941692348525616e-06, + "loss": 0.6029, + "step": 3950 + }, + { + "epoch": 0.32, + "grad_norm": 2.9265953303421948, + "learning_rate": 7.940628631682139e-06, + "loss": 0.8689, + "step": 3951 + }, + { + "epoch": 0.32, + "grad_norm": 2.77688089050199, + "learning_rate": 7.939564711328643e-06, + "loss": 0.6822, + "step": 3952 + }, + { + "epoch": 0.32, + "grad_norm": 2.6517052552594227, + "learning_rate": 7.93850058753876e-06, + "loss": 0.7966, + "step": 3953 + }, + { + "epoch": 0.32, + "grad_norm": 10.706768629888632, + "learning_rate": 7.937436260386134e-06, + "loss": 0.7603, + "step": 3954 + }, + { + "epoch": 0.32, + "grad_norm": 2.8395172328084257, + "learning_rate": 7.936371729944423e-06, + "loss": 0.7506, + "step": 3955 + }, + { + "epoch": 0.32, + "grad_norm": 4.296087403075084, + "learning_rate": 7.935306996287301e-06, + "loss": 0.6108, + "step": 3956 + }, + { + "epoch": 0.32, + "grad_norm": 3.2670653461580605, + "learning_rate": 7.934242059488453e-06, + "loss": 0.7584, + "step": 3957 + }, + { + "epoch": 0.32, + "grad_norm": 2.7338787844543275, + "learning_rate": 7.933176919621577e-06, + "loss": 0.6956, + "step": 3958 + }, + { + "epoch": 0.32, + "grad_norm": 3.6406179770695606, + "learning_rate": 7.932111576760389e-06, + "loss": 0.7553, + "step": 3959 + }, + { + "epoch": 0.32, + "grad_norm": 5.201492720465625, + "learning_rate": 7.931046030978619e-06, + "loss": 0.8215, + "step": 3960 + }, + { + "epoch": 0.32, + "grad_norm": 4.192323551263408, + "learning_rate": 7.929980282350009e-06, + "loss": 0.6738, + "step": 3961 + }, + { + "epoch": 0.32, + "grad_norm": 2.99813192274881, + "learning_rate": 7.928914330948312e-06, + "loss": 0.7431, + "step": 3962 + }, + { + "epoch": 0.32, + "grad_norm": 4.979824801122564, + "learning_rate": 7.927848176847303e-06, + "loss": 0.8108, + "step": 3963 + }, + { + "epoch": 0.32, + "grad_norm": 5.1769395705307355, + "learning_rate": 7.926781820120765e-06, + "loss": 0.6437, + "step": 3964 + }, + { + "epoch": 0.32, + "grad_norm": 4.9488841733506925, + "learning_rate": 7.925715260842497e-06, + "loss": 0.8524, + "step": 3965 + }, + { + "epoch": 0.32, + "grad_norm": 4.815261070731859, + "learning_rate": 7.92464849908631e-06, + "loss": 0.894, + "step": 3966 + }, + { + "epoch": 0.32, + "grad_norm": 6.18374869735505, + "learning_rate": 7.923581534926034e-06, + "loss": 0.7378, + "step": 3967 + }, + { + "epoch": 0.32, + "grad_norm": 3.4300636590844404, + "learning_rate": 7.922514368435506e-06, + "loss": 0.7354, + "step": 3968 + }, + { + "epoch": 0.32, + "grad_norm": 2.3166955433573033, + "learning_rate": 7.92144699968858e-06, + "loss": 0.6804, + "step": 3969 + }, + { + "epoch": 0.32, + "grad_norm": 3.551603313376008, + "learning_rate": 7.920379428759129e-06, + "loss": 0.8918, + "step": 3970 + }, + { + "epoch": 0.32, + "grad_norm": 3.6522260731083804, + "learning_rate": 7.919311655721034e-06, + "loss": 0.7785, + "step": 3971 + }, + { + "epoch": 0.32, + "grad_norm": 22.84440412445875, + "learning_rate": 7.91824368064819e-06, + "loss": 0.7526, + "step": 3972 + }, + { + "epoch": 0.32, + "grad_norm": 2.7481284744959, + "learning_rate": 7.917175503614507e-06, + "loss": 0.7263, + "step": 3973 + }, + { + "epoch": 0.32, + "grad_norm": 2.7468459620503265, + "learning_rate": 7.916107124693912e-06, + "loss": 0.8064, + "step": 3974 + }, + { + "epoch": 0.32, + "grad_norm": 3.029343681462667, + "learning_rate": 7.915038543960342e-06, + "loss": 0.6701, + "step": 3975 + }, + { + "epoch": 0.32, + "grad_norm": 5.271869274714605, + "learning_rate": 7.913969761487752e-06, + "loss": 0.583, + "step": 3976 + }, + { + "epoch": 0.32, + "grad_norm": 4.280948884365676, + "learning_rate": 7.912900777350106e-06, + "loss": 0.7116, + "step": 3977 + }, + { + "epoch": 0.32, + "grad_norm": 2.831829235665687, + "learning_rate": 7.911831591621384e-06, + "loss": 0.6972, + "step": 3978 + }, + { + "epoch": 0.32, + "grad_norm": 24.045967917161263, + "learning_rate": 7.910762204375584e-06, + "loss": 0.886, + "step": 3979 + }, + { + "epoch": 0.32, + "grad_norm": 4.149930835612155, + "learning_rate": 7.909692615686709e-06, + "loss": 0.6526, + "step": 3980 + }, + { + "epoch": 0.32, + "grad_norm": 5.169162787830478, + "learning_rate": 7.908622825628787e-06, + "loss": 0.6912, + "step": 3981 + }, + { + "epoch": 0.32, + "grad_norm": 3.3947539979383414, + "learning_rate": 7.907552834275847e-06, + "loss": 0.7366, + "step": 3982 + }, + { + "epoch": 0.32, + "grad_norm": 4.168128472472476, + "learning_rate": 7.906482641701948e-06, + "loss": 0.7986, + "step": 3983 + }, + { + "epoch": 0.32, + "grad_norm": 2.951075973434164, + "learning_rate": 7.905412247981145e-06, + "loss": 0.8068, + "step": 3984 + }, + { + "epoch": 0.32, + "grad_norm": 2.1030447991337367, + "learning_rate": 7.904341653187525e-06, + "loss": 0.6505, + "step": 3985 + }, + { + "epoch": 0.32, + "grad_norm": 7.600191197001962, + "learning_rate": 7.903270857395171e-06, + "loss": 0.7605, + "step": 3986 + }, + { + "epoch": 0.32, + "grad_norm": 7.881893183982748, + "learning_rate": 7.902199860678197e-06, + "loss": 0.5648, + "step": 3987 + }, + { + "epoch": 0.32, + "grad_norm": 2.992401383156658, + "learning_rate": 7.901128663110716e-06, + "loss": 0.7371, + "step": 3988 + }, + { + "epoch": 0.32, + "grad_norm": 4.032999457858187, + "learning_rate": 7.900057264766865e-06, + "loss": 0.7199, + "step": 3989 + }, + { + "epoch": 0.32, + "grad_norm": 2.431758377946861, + "learning_rate": 7.898985665720792e-06, + "loss": 0.6945, + "step": 3990 + }, + { + "epoch": 0.32, + "grad_norm": 3.965754674928643, + "learning_rate": 7.897913866046658e-06, + "loss": 0.7991, + "step": 3991 + }, + { + "epoch": 0.32, + "grad_norm": 7.070921047310893, + "learning_rate": 7.896841865818636e-06, + "loss": 0.7076, + "step": 3992 + }, + { + "epoch": 0.32, + "grad_norm": 4.710028351588609, + "learning_rate": 7.895769665110918e-06, + "loss": 0.7267, + "step": 3993 + }, + { + "epoch": 0.32, + "grad_norm": 3.141045921812545, + "learning_rate": 7.894697263997706e-06, + "loss": 0.8131, + "step": 3994 + }, + { + "epoch": 0.32, + "grad_norm": 4.2493039545761295, + "learning_rate": 7.893624662553216e-06, + "loss": 0.5688, + "step": 3995 + }, + { + "epoch": 0.32, + "grad_norm": 3.6611038009331507, + "learning_rate": 7.892551860851679e-06, + "loss": 0.7407, + "step": 3996 + }, + { + "epoch": 0.32, + "grad_norm": 11.319736058964265, + "learning_rate": 7.891478858967342e-06, + "loss": 0.7448, + "step": 3997 + }, + { + "epoch": 0.32, + "grad_norm": 3.095309253359473, + "learning_rate": 7.89040565697446e-06, + "loss": 0.6961, + "step": 3998 + }, + { + "epoch": 0.32, + "grad_norm": 3.7157411447831166, + "learning_rate": 7.889332254947308e-06, + "loss": 0.8233, + "step": 3999 + }, + { + "epoch": 0.32, + "grad_norm": 2.737224342747105, + "learning_rate": 7.888258652960171e-06, + "loss": 0.6638, + "step": 4000 + }, + { + "epoch": 0.32, + "grad_norm": 3.0678091740404394, + "learning_rate": 7.88718485108735e-06, + "loss": 0.578, + "step": 4001 + }, + { + "epoch": 0.33, + "grad_norm": 2.924838946815285, + "learning_rate": 7.886110849403157e-06, + "loss": 0.7711, + "step": 4002 + }, + { + "epoch": 0.33, + "grad_norm": 5.937967326413743, + "learning_rate": 7.88503664798192e-06, + "loss": 0.7255, + "step": 4003 + }, + { + "epoch": 0.33, + "grad_norm": 10.954740968906862, + "learning_rate": 7.883962246897982e-06, + "loss": 0.6236, + "step": 4004 + }, + { + "epoch": 0.33, + "grad_norm": 5.981709783040065, + "learning_rate": 7.8828876462257e-06, + "loss": 0.6681, + "step": 4005 + }, + { + "epoch": 0.33, + "grad_norm": 5.029120277526408, + "learning_rate": 7.881812846039438e-06, + "loss": 0.707, + "step": 4006 + }, + { + "epoch": 0.33, + "grad_norm": 10.757851442583057, + "learning_rate": 7.880737846413582e-06, + "loss": 0.6998, + "step": 4007 + }, + { + "epoch": 0.33, + "grad_norm": 4.862184012969428, + "learning_rate": 7.87966264742253e-06, + "loss": 0.6842, + "step": 4008 + }, + { + "epoch": 0.33, + "grad_norm": 5.892476043664106, + "learning_rate": 7.878587249140688e-06, + "loss": 0.8522, + "step": 4009 + }, + { + "epoch": 0.33, + "grad_norm": 3.560054172882769, + "learning_rate": 7.877511651642486e-06, + "loss": 0.753, + "step": 4010 + }, + { + "epoch": 0.33, + "grad_norm": 3.7985266866211225, + "learning_rate": 7.876435855002357e-06, + "loss": 0.6187, + "step": 4011 + }, + { + "epoch": 0.33, + "grad_norm": 2.459276693031832, + "learning_rate": 7.875359859294758e-06, + "loss": 0.4691, + "step": 4012 + }, + { + "epoch": 0.33, + "grad_norm": 2.982910160543017, + "learning_rate": 7.87428366459415e-06, + "loss": 0.7514, + "step": 4013 + }, + { + "epoch": 0.33, + "grad_norm": 3.567711268346048, + "learning_rate": 7.873207270975017e-06, + "loss": 0.6869, + "step": 4014 + }, + { + "epoch": 0.33, + "grad_norm": 3.6696161888412417, + "learning_rate": 7.872130678511847e-06, + "loss": 0.7617, + "step": 4015 + }, + { + "epoch": 0.33, + "grad_norm": 3.1382651657499987, + "learning_rate": 7.87105388727915e-06, + "loss": 0.6328, + "step": 4016 + }, + { + "epoch": 0.33, + "grad_norm": 4.731992711249098, + "learning_rate": 7.869976897351446e-06, + "loss": 0.8129, + "step": 4017 + }, + { + "epoch": 0.33, + "grad_norm": 2.9595947287652535, + "learning_rate": 7.86889970880327e-06, + "loss": 0.6981, + "step": 4018 + }, + { + "epoch": 0.33, + "grad_norm": 2.3505552225578508, + "learning_rate": 7.867822321709171e-06, + "loss": 0.7741, + "step": 4019 + }, + { + "epoch": 0.33, + "grad_norm": 2.8971529726414476, + "learning_rate": 7.86674473614371e-06, + "loss": 0.7891, + "step": 4020 + }, + { + "epoch": 0.33, + "grad_norm": 4.169734958740916, + "learning_rate": 7.865666952181463e-06, + "loss": 0.757, + "step": 4021 + }, + { + "epoch": 0.33, + "grad_norm": 3.773414590397785, + "learning_rate": 7.864588969897017e-06, + "loss": 0.7726, + "step": 4022 + }, + { + "epoch": 0.33, + "grad_norm": 5.436593425986144, + "learning_rate": 7.863510789364978e-06, + "loss": 0.6835, + "step": 4023 + }, + { + "epoch": 0.33, + "grad_norm": 4.243976058788896, + "learning_rate": 7.862432410659964e-06, + "loss": 0.8677, + "step": 4024 + }, + { + "epoch": 0.33, + "grad_norm": 5.771829521723743, + "learning_rate": 7.861353833856605e-06, + "loss": 0.6527, + "step": 4025 + }, + { + "epoch": 0.33, + "grad_norm": 2.5059523116447466, + "learning_rate": 7.860275059029541e-06, + "loss": 0.6992, + "step": 4026 + }, + { + "epoch": 0.33, + "grad_norm": 3.568862388621924, + "learning_rate": 7.859196086253434e-06, + "loss": 0.6531, + "step": 4027 + }, + { + "epoch": 0.33, + "grad_norm": 3.6640053235759846, + "learning_rate": 7.858116915602955e-06, + "loss": 0.7008, + "step": 4028 + }, + { + "epoch": 0.33, + "grad_norm": 3.8830768438375487, + "learning_rate": 7.85703754715279e-06, + "loss": 0.7535, + "step": 4029 + }, + { + "epoch": 0.33, + "grad_norm": 3.7048391993201566, + "learning_rate": 7.855957980977636e-06, + "loss": 0.9496, + "step": 4030 + }, + { + "epoch": 0.33, + "grad_norm": 2.3405273078166084, + "learning_rate": 7.854878217152208e-06, + "loss": 0.626, + "step": 4031 + }, + { + "epoch": 0.33, + "grad_norm": 4.220458615251663, + "learning_rate": 7.853798255751231e-06, + "loss": 0.6996, + "step": 4032 + }, + { + "epoch": 0.33, + "grad_norm": 4.243201430839897, + "learning_rate": 7.852718096849445e-06, + "loss": 0.7394, + "step": 4033 + }, + { + "epoch": 0.33, + "grad_norm": 3.5176451314335, + "learning_rate": 7.851637740521608e-06, + "loss": 0.7925, + "step": 4034 + }, + { + "epoch": 0.33, + "grad_norm": 3.7871254192207204, + "learning_rate": 7.85055718684248e-06, + "loss": 0.6538, + "step": 4035 + }, + { + "epoch": 0.33, + "grad_norm": 2.8257614422145587, + "learning_rate": 7.849476435886847e-06, + "loss": 0.6096, + "step": 4036 + }, + { + "epoch": 0.33, + "grad_norm": 3.308609454555126, + "learning_rate": 7.848395487729505e-06, + "loss": 0.7691, + "step": 4037 + }, + { + "epoch": 0.33, + "grad_norm": 2.6383729710203117, + "learning_rate": 7.847314342445258e-06, + "loss": 0.7863, + "step": 4038 + }, + { + "epoch": 0.33, + "grad_norm": 4.635028452379696, + "learning_rate": 7.84623300010893e-06, + "loss": 0.8036, + "step": 4039 + }, + { + "epoch": 0.33, + "grad_norm": 5.427719952737282, + "learning_rate": 7.84515146079536e-06, + "loss": 0.7029, + "step": 4040 + }, + { + "epoch": 0.33, + "grad_norm": 4.2372017528608135, + "learning_rate": 7.844069724579392e-06, + "loss": 0.8418, + "step": 4041 + }, + { + "epoch": 0.33, + "grad_norm": 2.6153259822179673, + "learning_rate": 7.842987791535891e-06, + "loss": 0.6058, + "step": 4042 + }, + { + "epoch": 0.33, + "grad_norm": 3.785325191874001, + "learning_rate": 7.841905661739735e-06, + "loss": 0.698, + "step": 4043 + }, + { + "epoch": 0.33, + "grad_norm": 3.33553283571518, + "learning_rate": 7.840823335265813e-06, + "loss": 0.6399, + "step": 4044 + }, + { + "epoch": 0.33, + "grad_norm": 9.76389378766501, + "learning_rate": 7.839740812189027e-06, + "loss": 0.7678, + "step": 4045 + }, + { + "epoch": 0.33, + "grad_norm": 3.427439623166322, + "learning_rate": 7.8386580925843e-06, + "loss": 0.4043, + "step": 4046 + }, + { + "epoch": 0.33, + "grad_norm": 3.8407561538457835, + "learning_rate": 7.837575176526556e-06, + "loss": 0.6855, + "step": 4047 + }, + { + "epoch": 0.33, + "grad_norm": 3.1358971796546555, + "learning_rate": 7.836492064090745e-06, + "loss": 0.6993, + "step": 4048 + }, + { + "epoch": 0.33, + "grad_norm": 6.570332117553186, + "learning_rate": 7.83540875535182e-06, + "loss": 0.7468, + "step": 4049 + }, + { + "epoch": 0.33, + "grad_norm": 3.948865745022593, + "learning_rate": 7.83432525038476e-06, + "loss": 0.6822, + "step": 4050 + }, + { + "epoch": 0.33, + "grad_norm": 4.7488918413680965, + "learning_rate": 7.833241549264544e-06, + "loss": 0.6391, + "step": 4051 + }, + { + "epoch": 0.33, + "grad_norm": 3.0880446265928527, + "learning_rate": 7.832157652066173e-06, + "loss": 0.666, + "step": 4052 + }, + { + "epoch": 0.33, + "grad_norm": 5.217687001816275, + "learning_rate": 7.831073558864661e-06, + "loss": 0.7284, + "step": 4053 + }, + { + "epoch": 0.33, + "grad_norm": 3.049392989535443, + "learning_rate": 7.829989269735033e-06, + "loss": 0.6204, + "step": 4054 + }, + { + "epoch": 0.33, + "grad_norm": 3.984918816353251, + "learning_rate": 7.828904784752327e-06, + "loss": 0.7827, + "step": 4055 + }, + { + "epoch": 0.33, + "grad_norm": 3.728661484017217, + "learning_rate": 7.8278201039916e-06, + "loss": 0.8908, + "step": 4056 + }, + { + "epoch": 0.33, + "grad_norm": 3.0424028266399774, + "learning_rate": 7.826735227527913e-06, + "loss": 0.7756, + "step": 4057 + }, + { + "epoch": 0.33, + "grad_norm": 4.028174685880376, + "learning_rate": 7.825650155436352e-06, + "loss": 0.7433, + "step": 4058 + }, + { + "epoch": 0.33, + "grad_norm": 10.278119416074434, + "learning_rate": 7.824564887792008e-06, + "loss": 0.6336, + "step": 4059 + }, + { + "epoch": 0.33, + "grad_norm": 2.739215044632915, + "learning_rate": 7.823479424669988e-06, + "loss": 0.6444, + "step": 4060 + }, + { + "epoch": 0.33, + "grad_norm": 5.526006374892855, + "learning_rate": 7.822393766145415e-06, + "loss": 0.6217, + "step": 4061 + }, + { + "epoch": 0.33, + "grad_norm": 2.8194295174276993, + "learning_rate": 7.82130791229342e-06, + "loss": 0.7388, + "step": 4062 + }, + { + "epoch": 0.33, + "grad_norm": 3.0434223990426084, + "learning_rate": 7.820221863189156e-06, + "loss": 0.6793, + "step": 4063 + }, + { + "epoch": 0.33, + "grad_norm": 3.268143701971518, + "learning_rate": 7.819135618907781e-06, + "loss": 0.6439, + "step": 4064 + }, + { + "epoch": 0.33, + "grad_norm": 3.3650955995875766, + "learning_rate": 7.81804917952447e-06, + "loss": 0.7982, + "step": 4065 + }, + { + "epoch": 0.33, + "grad_norm": 3.46927797233056, + "learning_rate": 7.81696254511441e-06, + "loss": 0.5934, + "step": 4066 + }, + { + "epoch": 0.33, + "grad_norm": 4.054674725980978, + "learning_rate": 7.815875715752806e-06, + "loss": 0.7939, + "step": 4067 + }, + { + "epoch": 0.33, + "grad_norm": 3.4090407989165774, + "learning_rate": 7.814788691514871e-06, + "loss": 0.6676, + "step": 4068 + }, + { + "epoch": 0.33, + "grad_norm": 3.6681233465909524, + "learning_rate": 7.813701472475839e-06, + "loss": 0.7219, + "step": 4069 + }, + { + "epoch": 0.33, + "grad_norm": 3.730348746381778, + "learning_rate": 7.812614058710946e-06, + "loss": 0.5985, + "step": 4070 + }, + { + "epoch": 0.33, + "grad_norm": 2.5878949648442857, + "learning_rate": 7.81152645029545e-06, + "loss": 0.7509, + "step": 4071 + }, + { + "epoch": 0.33, + "grad_norm": 2.492655346569026, + "learning_rate": 7.810438647304621e-06, + "loss": 0.7055, + "step": 4072 + }, + { + "epoch": 0.33, + "grad_norm": 3.061167024935287, + "learning_rate": 7.809350649813743e-06, + "loss": 0.6532, + "step": 4073 + }, + { + "epoch": 0.33, + "grad_norm": 16.333585594803058, + "learning_rate": 7.80826245789811e-06, + "loss": 0.5317, + "step": 4074 + }, + { + "epoch": 0.33, + "grad_norm": 3.9319868898793273, + "learning_rate": 7.807174071633032e-06, + "loss": 0.6462, + "step": 4075 + }, + { + "epoch": 0.33, + "grad_norm": 10.158383994755884, + "learning_rate": 7.806085491093833e-06, + "loss": 0.4214, + "step": 4076 + }, + { + "epoch": 0.33, + "grad_norm": 2.5492804290846367, + "learning_rate": 7.80499671635585e-06, + "loss": 0.6198, + "step": 4077 + }, + { + "epoch": 0.33, + "grad_norm": 3.8651720526761166, + "learning_rate": 7.803907747494432e-06, + "loss": 0.7509, + "step": 4078 + }, + { + "epoch": 0.33, + "grad_norm": 8.304522655130349, + "learning_rate": 7.802818584584944e-06, + "loss": 0.6101, + "step": 4079 + }, + { + "epoch": 0.33, + "grad_norm": 3.800144844913076, + "learning_rate": 7.80172922770276e-06, + "loss": 0.7286, + "step": 4080 + }, + { + "epoch": 0.33, + "grad_norm": 4.12455386553149, + "learning_rate": 7.800639676923276e-06, + "loss": 0.8352, + "step": 4081 + }, + { + "epoch": 0.33, + "grad_norm": 8.769564802279781, + "learning_rate": 7.799549932321889e-06, + "loss": 0.6337, + "step": 4082 + }, + { + "epoch": 0.33, + "grad_norm": 4.000470968758646, + "learning_rate": 7.798459993974022e-06, + "loss": 0.6705, + "step": 4083 + }, + { + "epoch": 0.33, + "grad_norm": 4.773622868797054, + "learning_rate": 7.797369861955099e-06, + "loss": 0.7885, + "step": 4084 + }, + { + "epoch": 0.33, + "grad_norm": 3.278459471282244, + "learning_rate": 7.79627953634057e-06, + "loss": 0.7291, + "step": 4085 + }, + { + "epoch": 0.33, + "grad_norm": 4.1034281831182655, + "learning_rate": 7.795189017205888e-06, + "loss": 0.6794, + "step": 4086 + }, + { + "epoch": 0.33, + "grad_norm": 3.666641948357952, + "learning_rate": 7.79409830462653e-06, + "loss": 0.5839, + "step": 4087 + }, + { + "epoch": 0.33, + "grad_norm": 3.614734455374862, + "learning_rate": 7.793007398677973e-06, + "loss": 0.7052, + "step": 4088 + }, + { + "epoch": 0.33, + "grad_norm": 5.507318164068913, + "learning_rate": 7.79191629943572e-06, + "loss": 0.6686, + "step": 4089 + }, + { + "epoch": 0.33, + "grad_norm": 10.676401270174203, + "learning_rate": 7.790825006975279e-06, + "loss": 0.6491, + "step": 4090 + }, + { + "epoch": 0.33, + "grad_norm": 3.6209708565111534, + "learning_rate": 7.789733521372174e-06, + "loss": 0.6564, + "step": 4091 + }, + { + "epoch": 0.33, + "grad_norm": 20.198825515507583, + "learning_rate": 7.788641842701945e-06, + "loss": 0.6134, + "step": 4092 + }, + { + "epoch": 0.33, + "grad_norm": 4.872787851031557, + "learning_rate": 7.78754997104014e-06, + "loss": 0.6679, + "step": 4093 + }, + { + "epoch": 0.33, + "grad_norm": 2.879068451747883, + "learning_rate": 7.786457906462329e-06, + "loss": 0.7314, + "step": 4094 + }, + { + "epoch": 0.33, + "grad_norm": 12.101396645687867, + "learning_rate": 7.78536564904408e-06, + "loss": 0.756, + "step": 4095 + }, + { + "epoch": 0.33, + "grad_norm": 4.115924686329829, + "learning_rate": 7.784273198860995e-06, + "loss": 0.768, + "step": 4096 + }, + { + "epoch": 0.33, + "grad_norm": 3.0318172566732815, + "learning_rate": 7.783180555988671e-06, + "loss": 0.5867, + "step": 4097 + }, + { + "epoch": 0.33, + "grad_norm": 5.88818577725966, + "learning_rate": 7.78208772050273e-06, + "loss": 0.6385, + "step": 4098 + }, + { + "epoch": 0.33, + "grad_norm": 9.447203908840548, + "learning_rate": 7.780994692478798e-06, + "loss": 0.743, + "step": 4099 + }, + { + "epoch": 0.33, + "grad_norm": 30.369144915399453, + "learning_rate": 7.779901471992526e-06, + "loss": 0.7243, + "step": 4100 + }, + { + "epoch": 0.33, + "grad_norm": 7.2012427548716555, + "learning_rate": 7.778808059119567e-06, + "loss": 0.6446, + "step": 4101 + }, + { + "epoch": 0.33, + "grad_norm": 7.236087009464436, + "learning_rate": 7.777714453935594e-06, + "loss": 0.7151, + "step": 4102 + }, + { + "epoch": 0.33, + "grad_norm": 8.360358110778868, + "learning_rate": 7.77662065651629e-06, + "loss": 0.7456, + "step": 4103 + }, + { + "epoch": 0.33, + "grad_norm": 5.157020313618741, + "learning_rate": 7.775526666937354e-06, + "loss": 0.6859, + "step": 4104 + }, + { + "epoch": 0.33, + "grad_norm": 3.9790936876645144, + "learning_rate": 7.774432485274497e-06, + "loss": 0.6719, + "step": 4105 + }, + { + "epoch": 0.33, + "grad_norm": 4.29641542968105, + "learning_rate": 7.773338111603441e-06, + "loss": 0.7753, + "step": 4106 + }, + { + "epoch": 0.33, + "grad_norm": 3.109518616171552, + "learning_rate": 7.772243545999927e-06, + "loss": 0.6328, + "step": 4107 + }, + { + "epoch": 0.33, + "grad_norm": 3.905990823969171, + "learning_rate": 7.771148788539704e-06, + "loss": 0.8857, + "step": 4108 + }, + { + "epoch": 0.33, + "grad_norm": 3.1677308978861753, + "learning_rate": 7.770053839298535e-06, + "loss": 0.711, + "step": 4109 + }, + { + "epoch": 0.33, + "grad_norm": 3.1901076270752906, + "learning_rate": 7.7689586983522e-06, + "loss": 0.7131, + "step": 4110 + }, + { + "epoch": 0.33, + "grad_norm": 4.786363269034304, + "learning_rate": 7.767863365776488e-06, + "loss": 0.8328, + "step": 4111 + }, + { + "epoch": 0.33, + "grad_norm": 4.623157867135536, + "learning_rate": 7.766767841647203e-06, + "loss": 0.6248, + "step": 4112 + }, + { + "epoch": 0.33, + "grad_norm": 2.405042997179284, + "learning_rate": 7.765672126040162e-06, + "loss": 0.7472, + "step": 4113 + }, + { + "epoch": 0.33, + "grad_norm": 4.47305225801398, + "learning_rate": 7.764576219031197e-06, + "loss": 0.9045, + "step": 4114 + }, + { + "epoch": 0.33, + "grad_norm": 4.587956657843888, + "learning_rate": 7.763480120696149e-06, + "loss": 0.7675, + "step": 4115 + }, + { + "epoch": 0.33, + "grad_norm": 4.896169536520506, + "learning_rate": 7.762383831110878e-06, + "loss": 0.7246, + "step": 4116 + }, + { + "epoch": 0.33, + "grad_norm": 4.8970607335428245, + "learning_rate": 7.761287350351249e-06, + "loss": 0.6263, + "step": 4117 + }, + { + "epoch": 0.33, + "grad_norm": 3.2483340860096805, + "learning_rate": 7.760190678493152e-06, + "loss": 0.6393, + "step": 4118 + }, + { + "epoch": 0.33, + "grad_norm": 7.6971954311891055, + "learning_rate": 7.75909381561248e-06, + "loss": 0.6648, + "step": 4119 + }, + { + "epoch": 0.33, + "grad_norm": 24.353094749108823, + "learning_rate": 7.757996761785142e-06, + "loss": 0.7571, + "step": 4120 + }, + { + "epoch": 0.33, + "grad_norm": 3.9187088505766723, + "learning_rate": 7.756899517087064e-06, + "loss": 0.7357, + "step": 4121 + }, + { + "epoch": 0.33, + "grad_norm": 6.939205931395191, + "learning_rate": 7.755802081594179e-06, + "loss": 0.7559, + "step": 4122 + }, + { + "epoch": 0.33, + "grad_norm": 10.891093033491927, + "learning_rate": 7.75470445538244e-06, + "loss": 0.8352, + "step": 4123 + }, + { + "epoch": 0.33, + "grad_norm": 5.5681219779568485, + "learning_rate": 7.753606638527806e-06, + "loss": 0.6222, + "step": 4124 + }, + { + "epoch": 0.34, + "grad_norm": 2.9505204365213897, + "learning_rate": 7.752508631106254e-06, + "loss": 0.6965, + "step": 4125 + }, + { + "epoch": 0.34, + "grad_norm": 9.466021376256897, + "learning_rate": 7.751410433193775e-06, + "loss": 0.7038, + "step": 4126 + }, + { + "epoch": 0.34, + "grad_norm": 3.508643205227156, + "learning_rate": 7.75031204486637e-06, + "loss": 0.6965, + "step": 4127 + }, + { + "epoch": 0.34, + "grad_norm": 3.761493636141535, + "learning_rate": 7.749213466200052e-06, + "loss": 0.768, + "step": 4128 + }, + { + "epoch": 0.34, + "grad_norm": 9.026908527490088, + "learning_rate": 7.748114697270854e-06, + "loss": 0.7602, + "step": 4129 + }, + { + "epoch": 0.34, + "grad_norm": 4.2627735058668605, + "learning_rate": 7.747015738154814e-06, + "loss": 0.6751, + "step": 4130 + }, + { + "epoch": 0.34, + "grad_norm": 4.439934069674513, + "learning_rate": 7.745916588927988e-06, + "loss": 0.6475, + "step": 4131 + }, + { + "epoch": 0.34, + "grad_norm": 5.2383227787604705, + "learning_rate": 7.744817249666445e-06, + "loss": 0.7058, + "step": 4132 + }, + { + "epoch": 0.34, + "grad_norm": 3.4372320405668715, + "learning_rate": 7.743717720446265e-06, + "loss": 0.7557, + "step": 4133 + }, + { + "epoch": 0.34, + "grad_norm": 4.438490245659513, + "learning_rate": 7.742618001343544e-06, + "loss": 0.8321, + "step": 4134 + }, + { + "epoch": 0.34, + "grad_norm": 3.546815682773286, + "learning_rate": 7.741518092434388e-06, + "loss": 0.8111, + "step": 4135 + }, + { + "epoch": 0.34, + "grad_norm": 5.684351469153283, + "learning_rate": 7.740417993794918e-06, + "loss": 0.8111, + "step": 4136 + }, + { + "epoch": 0.34, + "grad_norm": 6.074752771301794, + "learning_rate": 7.739317705501266e-06, + "loss": 0.6977, + "step": 4137 + }, + { + "epoch": 0.34, + "grad_norm": 3.0155319121674027, + "learning_rate": 7.738217227629582e-06, + "loss": 0.5559, + "step": 4138 + }, + { + "epoch": 0.34, + "grad_norm": 3.6280985815363076, + "learning_rate": 7.737116560256024e-06, + "loss": 0.7512, + "step": 4139 + }, + { + "epoch": 0.34, + "grad_norm": 2.998859134416602, + "learning_rate": 7.736015703456768e-06, + "loss": 0.6122, + "step": 4140 + }, + { + "epoch": 0.34, + "grad_norm": 16.37306700851657, + "learning_rate": 7.734914657307995e-06, + "loss": 0.7383, + "step": 4141 + }, + { + "epoch": 0.34, + "grad_norm": 3.605277379139862, + "learning_rate": 7.733813421885907e-06, + "loss": 0.737, + "step": 4142 + }, + { + "epoch": 0.34, + "grad_norm": 18.75982192873511, + "learning_rate": 7.73271199726672e-06, + "loss": 0.6649, + "step": 4143 + }, + { + "epoch": 0.34, + "grad_norm": 6.8783168396698775, + "learning_rate": 7.731610383526654e-06, + "loss": 0.765, + "step": 4144 + }, + { + "epoch": 0.34, + "grad_norm": 3.5431948171922296, + "learning_rate": 7.73050858074195e-06, + "loss": 0.7947, + "step": 4145 + }, + { + "epoch": 0.34, + "grad_norm": 6.167641000453161, + "learning_rate": 7.72940658898886e-06, + "loss": 0.7782, + "step": 4146 + }, + { + "epoch": 0.34, + "grad_norm": 3.9945438146533467, + "learning_rate": 7.728304408343648e-06, + "loss": 0.7962, + "step": 4147 + }, + { + "epoch": 0.34, + "grad_norm": 3.0812113275096564, + "learning_rate": 7.72720203888259e-06, + "loss": 0.7787, + "step": 4148 + }, + { + "epoch": 0.34, + "grad_norm": 3.4178309141023178, + "learning_rate": 7.726099480681983e-06, + "loss": 0.7006, + "step": 4149 + }, + { + "epoch": 0.34, + "grad_norm": 3.6869408073749588, + "learning_rate": 7.724996733818124e-06, + "loss": 0.6133, + "step": 4150 + }, + { + "epoch": 0.34, + "grad_norm": 6.019079725212533, + "learning_rate": 7.723893798367335e-06, + "loss": 0.8643, + "step": 4151 + }, + { + "epoch": 0.34, + "grad_norm": 3.2551044974971997, + "learning_rate": 7.722790674405943e-06, + "loss": 0.6639, + "step": 4152 + }, + { + "epoch": 0.34, + "grad_norm": 2.903062872053778, + "learning_rate": 7.721687362010293e-06, + "loss": 0.8125, + "step": 4153 + }, + { + "epoch": 0.34, + "grad_norm": 3.8159382740878987, + "learning_rate": 7.72058386125674e-06, + "loss": 0.8421, + "step": 4154 + }, + { + "epoch": 0.34, + "grad_norm": 3.5294059463180854, + "learning_rate": 7.719480172221652e-06, + "loss": 0.7814, + "step": 4155 + }, + { + "epoch": 0.34, + "grad_norm": 2.3711255996802287, + "learning_rate": 7.718376294981416e-06, + "loss": 0.7501, + "step": 4156 + }, + { + "epoch": 0.34, + "grad_norm": 3.4514582764212527, + "learning_rate": 7.71727222961242e-06, + "loss": 0.7437, + "step": 4157 + }, + { + "epoch": 0.34, + "grad_norm": 3.7004749294885055, + "learning_rate": 7.71616797619108e-06, + "loss": 0.8478, + "step": 4158 + }, + { + "epoch": 0.34, + "grad_norm": 8.468487658189542, + "learning_rate": 7.715063534793811e-06, + "loss": 0.6774, + "step": 4159 + }, + { + "epoch": 0.34, + "grad_norm": 2.9881669309742995, + "learning_rate": 7.713958905497051e-06, + "loss": 0.7948, + "step": 4160 + }, + { + "epoch": 0.34, + "grad_norm": 20.85695866115867, + "learning_rate": 7.712854088377247e-06, + "loss": 0.7139, + "step": 4161 + }, + { + "epoch": 0.34, + "grad_norm": 5.480801745893337, + "learning_rate": 7.711749083510859e-06, + "loss": 0.8241, + "step": 4162 + }, + { + "epoch": 0.34, + "grad_norm": 47.52951888594341, + "learning_rate": 7.710643890974358e-06, + "loss": 0.8022, + "step": 4163 + }, + { + "epoch": 0.34, + "grad_norm": 5.4239647853745145, + "learning_rate": 7.709538510844234e-06, + "loss": 0.8134, + "step": 4164 + }, + { + "epoch": 0.34, + "grad_norm": 2.811308789895674, + "learning_rate": 7.708432943196982e-06, + "loss": 0.5743, + "step": 4165 + }, + { + "epoch": 0.34, + "grad_norm": 7.624334506596432, + "learning_rate": 7.70732718810912e-06, + "loss": 0.6332, + "step": 4166 + }, + { + "epoch": 0.34, + "grad_norm": 3.442416656138392, + "learning_rate": 7.706221245657168e-06, + "loss": 0.7508, + "step": 4167 + }, + { + "epoch": 0.34, + "grad_norm": 2.927889032363458, + "learning_rate": 7.705115115917665e-06, + "loss": 0.7404, + "step": 4168 + }, + { + "epoch": 0.34, + "grad_norm": 3.3005994166846095, + "learning_rate": 7.704008798967164e-06, + "loss": 0.6508, + "step": 4169 + }, + { + "epoch": 0.34, + "grad_norm": 5.356235752596568, + "learning_rate": 7.70290229488223e-06, + "loss": 0.6125, + "step": 4170 + }, + { + "epoch": 0.34, + "grad_norm": 3.5490115380220204, + "learning_rate": 7.701795603739434e-06, + "loss": 0.5048, + "step": 4171 + }, + { + "epoch": 0.34, + "grad_norm": 3.4100993042505685, + "learning_rate": 7.700688725615373e-06, + "loss": 0.7198, + "step": 4172 + }, + { + "epoch": 0.34, + "grad_norm": 2.8120108044876706, + "learning_rate": 7.699581660586648e-06, + "loss": 0.8058, + "step": 4173 + }, + { + "epoch": 0.34, + "grad_norm": 4.399572028246543, + "learning_rate": 7.698474408729872e-06, + "loss": 0.6973, + "step": 4174 + }, + { + "epoch": 0.34, + "grad_norm": 3.8053077621999054, + "learning_rate": 7.697366970121678e-06, + "loss": 0.8226, + "step": 4175 + }, + { + "epoch": 0.34, + "grad_norm": 3.1218565339521036, + "learning_rate": 7.696259344838706e-06, + "loss": 0.7116, + "step": 4176 + }, + { + "epoch": 0.34, + "grad_norm": 5.631258285929297, + "learning_rate": 7.695151532957608e-06, + "loss": 0.6867, + "step": 4177 + }, + { + "epoch": 0.34, + "grad_norm": 3.8185477601773297, + "learning_rate": 7.694043534555055e-06, + "loss": 0.6987, + "step": 4178 + }, + { + "epoch": 0.34, + "grad_norm": 3.9237029025735075, + "learning_rate": 7.692935349707726e-06, + "loss": 0.7236, + "step": 4179 + }, + { + "epoch": 0.34, + "grad_norm": 15.740143678524328, + "learning_rate": 7.691826978492316e-06, + "loss": 0.7921, + "step": 4180 + }, + { + "epoch": 0.34, + "grad_norm": 3.9487319245565122, + "learning_rate": 7.690718420985527e-06, + "loss": 0.5639, + "step": 4181 + }, + { + "epoch": 0.34, + "grad_norm": 3.1572443227536846, + "learning_rate": 7.689609677264083e-06, + "loss": 0.7445, + "step": 4182 + }, + { + "epoch": 0.34, + "grad_norm": 6.693988002017067, + "learning_rate": 7.688500747404716e-06, + "loss": 0.7799, + "step": 4183 + }, + { + "epoch": 0.34, + "grad_norm": 3.5869468706418477, + "learning_rate": 7.687391631484168e-06, + "loss": 0.5931, + "step": 4184 + }, + { + "epoch": 0.34, + "grad_norm": 10.782830887104295, + "learning_rate": 7.686282329579195e-06, + "loss": 0.7683, + "step": 4185 + }, + { + "epoch": 0.34, + "grad_norm": 3.6198637291941487, + "learning_rate": 7.685172841766573e-06, + "loss": 0.7242, + "step": 4186 + }, + { + "epoch": 0.34, + "grad_norm": 3.909840826051098, + "learning_rate": 7.684063168123082e-06, + "loss": 0.7236, + "step": 4187 + }, + { + "epoch": 0.34, + "grad_norm": 4.5002556387158865, + "learning_rate": 7.682953308725522e-06, + "loss": 0.6931, + "step": 4188 + }, + { + "epoch": 0.34, + "grad_norm": 33.390889858115614, + "learning_rate": 7.681843263650698e-06, + "loss": 0.7819, + "step": 4189 + }, + { + "epoch": 0.34, + "grad_norm": 4.236231648796214, + "learning_rate": 7.680733032975434e-06, + "loss": 0.749, + "step": 4190 + }, + { + "epoch": 0.34, + "grad_norm": 3.560344258759262, + "learning_rate": 7.679622616776565e-06, + "loss": 0.5957, + "step": 4191 + }, + { + "epoch": 0.34, + "grad_norm": 10.36076355842283, + "learning_rate": 7.678512015130936e-06, + "loss": 0.7471, + "step": 4192 + }, + { + "epoch": 0.34, + "grad_norm": 2.5496913962085896, + "learning_rate": 7.677401228115414e-06, + "loss": 0.6378, + "step": 4193 + }, + { + "epoch": 0.34, + "grad_norm": 8.25757401515829, + "learning_rate": 7.676290255806866e-06, + "loss": 0.7459, + "step": 4194 + }, + { + "epoch": 0.34, + "grad_norm": 6.002619111786537, + "learning_rate": 7.675179098282183e-06, + "loss": 0.5771, + "step": 4195 + }, + { + "epoch": 0.34, + "grad_norm": 3.701241230377751, + "learning_rate": 7.674067755618261e-06, + "loss": 0.671, + "step": 4196 + }, + { + "epoch": 0.34, + "grad_norm": 3.8880285706180135, + "learning_rate": 7.672956227892014e-06, + "loss": 0.7014, + "step": 4197 + }, + { + "epoch": 0.34, + "grad_norm": 2.8466834906214045, + "learning_rate": 7.671844515180365e-06, + "loss": 0.7784, + "step": 4198 + }, + { + "epoch": 0.34, + "grad_norm": 2.775989698557638, + "learning_rate": 7.670732617560253e-06, + "loss": 0.6754, + "step": 4199 + }, + { + "epoch": 0.34, + "grad_norm": 3.5978981404266084, + "learning_rate": 7.669620535108626e-06, + "loss": 0.6528, + "step": 4200 + }, + { + "epoch": 0.34, + "grad_norm": 4.220452673801354, + "learning_rate": 7.66850826790245e-06, + "loss": 0.7001, + "step": 4201 + }, + { + "epoch": 0.34, + "grad_norm": 3.7171663050214656, + "learning_rate": 7.667395816018699e-06, + "loss": 0.7974, + "step": 4202 + }, + { + "epoch": 0.34, + "grad_norm": 2.7313885002651697, + "learning_rate": 7.666283179534362e-06, + "loss": 0.7008, + "step": 4203 + }, + { + "epoch": 0.34, + "grad_norm": 3.9428986527451193, + "learning_rate": 7.665170358526441e-06, + "loss": 0.7038, + "step": 4204 + }, + { + "epoch": 0.34, + "grad_norm": 4.778854928045878, + "learning_rate": 7.66405735307195e-06, + "loss": 0.674, + "step": 4205 + }, + { + "epoch": 0.34, + "grad_norm": 9.0063608463653, + "learning_rate": 7.662944163247916e-06, + "loss": 0.5842, + "step": 4206 + }, + { + "epoch": 0.34, + "grad_norm": 2.785810860625398, + "learning_rate": 7.661830789131378e-06, + "loss": 0.7495, + "step": 4207 + }, + { + "epoch": 0.34, + "grad_norm": 3.170122198926483, + "learning_rate": 7.66071723079939e-06, + "loss": 0.7608, + "step": 4208 + }, + { + "epoch": 0.34, + "grad_norm": 4.6494954041382694, + "learning_rate": 7.659603488329014e-06, + "loss": 0.6163, + "step": 4209 + }, + { + "epoch": 0.34, + "grad_norm": 2.499744152038324, + "learning_rate": 7.658489561797333e-06, + "loss": 0.7113, + "step": 4210 + }, + { + "epoch": 0.34, + "grad_norm": 13.818948853593524, + "learning_rate": 7.657375451281435e-06, + "loss": 0.8048, + "step": 4211 + }, + { + "epoch": 0.34, + "grad_norm": 3.379269193921864, + "learning_rate": 7.656261156858423e-06, + "loss": 0.6937, + "step": 4212 + }, + { + "epoch": 0.34, + "grad_norm": 3.2596806987744276, + "learning_rate": 7.655146678605414e-06, + "loss": 0.7981, + "step": 4213 + }, + { + "epoch": 0.34, + "grad_norm": 7.327873204303678, + "learning_rate": 7.654032016599536e-06, + "loss": 0.5877, + "step": 4214 + }, + { + "epoch": 0.34, + "grad_norm": 3.329455137290018, + "learning_rate": 7.65291717091793e-06, + "loss": 0.6145, + "step": 4215 + }, + { + "epoch": 0.34, + "grad_norm": 2.9881259905539186, + "learning_rate": 7.651802141637753e-06, + "loss": 0.7879, + "step": 4216 + }, + { + "epoch": 0.34, + "grad_norm": 5.4802503217711775, + "learning_rate": 7.650686928836172e-06, + "loss": 0.8479, + "step": 4217 + }, + { + "epoch": 0.34, + "grad_norm": 3.1318347680815566, + "learning_rate": 7.649571532590363e-06, + "loss": 0.6803, + "step": 4218 + }, + { + "epoch": 0.34, + "grad_norm": 3.30604254142086, + "learning_rate": 7.648455952977523e-06, + "loss": 0.6684, + "step": 4219 + }, + { + "epoch": 0.34, + "grad_norm": 4.201200828071186, + "learning_rate": 7.647340190074854e-06, + "loss": 0.6677, + "step": 4220 + }, + { + "epoch": 0.34, + "grad_norm": 5.093367256882833, + "learning_rate": 7.646224243959575e-06, + "loss": 0.7021, + "step": 4221 + }, + { + "epoch": 0.34, + "grad_norm": 3.1090242822156138, + "learning_rate": 7.645108114708916e-06, + "loss": 0.6763, + "step": 4222 + }, + { + "epoch": 0.34, + "grad_norm": 3.000165483696345, + "learning_rate": 7.643991802400122e-06, + "loss": 0.7582, + "step": 4223 + }, + { + "epoch": 0.34, + "grad_norm": 3.375623946799999, + "learning_rate": 7.642875307110444e-06, + "loss": 0.8375, + "step": 4224 + }, + { + "epoch": 0.34, + "grad_norm": 4.387542910288342, + "learning_rate": 7.641758628917156e-06, + "loss": 0.8571, + "step": 4225 + }, + { + "epoch": 0.34, + "grad_norm": 2.470912717516068, + "learning_rate": 7.640641767897537e-06, + "loss": 0.8537, + "step": 4226 + }, + { + "epoch": 0.34, + "grad_norm": 14.196094651474384, + "learning_rate": 7.639524724128881e-06, + "loss": 0.8582, + "step": 4227 + }, + { + "epoch": 0.34, + "grad_norm": 2.3513481041016395, + "learning_rate": 7.638407497688493e-06, + "loss": 0.8401, + "step": 4228 + }, + { + "epoch": 0.34, + "grad_norm": 4.389900618980124, + "learning_rate": 7.637290088653695e-06, + "loss": 0.741, + "step": 4229 + }, + { + "epoch": 0.34, + "grad_norm": 3.0452021008936785, + "learning_rate": 7.636172497101817e-06, + "loss": 0.6562, + "step": 4230 + }, + { + "epoch": 0.34, + "grad_norm": 3.2185592330810473, + "learning_rate": 7.635054723110203e-06, + "loss": 0.7614, + "step": 4231 + }, + { + "epoch": 0.34, + "grad_norm": 2.7764435827683256, + "learning_rate": 7.633936766756211e-06, + "loss": 0.7416, + "step": 4232 + }, + { + "epoch": 0.34, + "grad_norm": 2.7500128874845915, + "learning_rate": 7.63281862811721e-06, + "loss": 0.6041, + "step": 4233 + }, + { + "epoch": 0.34, + "grad_norm": 3.7893848037547686, + "learning_rate": 7.63170030727058e-06, + "loss": 0.7515, + "step": 4234 + }, + { + "epoch": 0.34, + "grad_norm": 2.6436139683335282, + "learning_rate": 7.63058180429372e-06, + "loss": 0.7151, + "step": 4235 + }, + { + "epoch": 0.34, + "grad_norm": 65.65021008458264, + "learning_rate": 7.629463119264036e-06, + "loss": 0.6541, + "step": 4236 + }, + { + "epoch": 0.34, + "grad_norm": 35.95581769413149, + "learning_rate": 7.628344252258948e-06, + "loss": 0.7633, + "step": 4237 + }, + { + "epoch": 0.34, + "grad_norm": 2.76599168914248, + "learning_rate": 7.627225203355887e-06, + "loss": 0.6128, + "step": 4238 + }, + { + "epoch": 0.34, + "grad_norm": 6.0670053389041705, + "learning_rate": 7.6261059726323006e-06, + "loss": 0.7771, + "step": 4239 + }, + { + "epoch": 0.34, + "grad_norm": 3.085052386009528, + "learning_rate": 7.6249865601656434e-06, + "loss": 0.7562, + "step": 4240 + }, + { + "epoch": 0.34, + "grad_norm": 7.866399397270718, + "learning_rate": 7.623866966033391e-06, + "loss": 0.6186, + "step": 4241 + }, + { + "epoch": 0.34, + "grad_norm": 5.457272015709935, + "learning_rate": 7.622747190313022e-06, + "loss": 0.7136, + "step": 4242 + }, + { + "epoch": 0.34, + "grad_norm": 9.095566379321125, + "learning_rate": 7.621627233082033e-06, + "loss": 0.7175, + "step": 4243 + }, + { + "epoch": 0.34, + "grad_norm": 5.462027611056057, + "learning_rate": 7.620507094417933e-06, + "loss": 0.7203, + "step": 4244 + }, + { + "epoch": 0.34, + "grad_norm": 3.5910973649497864, + "learning_rate": 7.619386774398241e-06, + "loss": 0.7107, + "step": 4245 + }, + { + "epoch": 0.34, + "grad_norm": 4.0588277797716446, + "learning_rate": 7.618266273100492e-06, + "loss": 0.7534, + "step": 4246 + }, + { + "epoch": 0.34, + "grad_norm": 3.3122494566692384, + "learning_rate": 7.617145590602231e-06, + "loss": 0.6957, + "step": 4247 + }, + { + "epoch": 0.35, + "grad_norm": 2.848716313031181, + "learning_rate": 7.616024726981015e-06, + "loss": 0.7398, + "step": 4248 + }, + { + "epoch": 0.35, + "grad_norm": 2.3764844662969375, + "learning_rate": 7.614903682314419e-06, + "loss": 0.7286, + "step": 4249 + }, + { + "epoch": 0.35, + "grad_norm": 11.688971661091193, + "learning_rate": 7.613782456680019e-06, + "loss": 0.6604, + "step": 4250 + }, + { + "epoch": 0.35, + "grad_norm": 2.913327946682503, + "learning_rate": 7.612661050155418e-06, + "loss": 0.6498, + "step": 4251 + }, + { + "epoch": 0.35, + "grad_norm": 4.1708442864891895, + "learning_rate": 7.611539462818221e-06, + "loss": 0.7868, + "step": 4252 + }, + { + "epoch": 0.35, + "grad_norm": 2.7376442369297425, + "learning_rate": 7.6104176947460506e-06, + "loss": 0.7743, + "step": 4253 + }, + { + "epoch": 0.35, + "grad_norm": 6.627369714398502, + "learning_rate": 7.609295746016538e-06, + "loss": 0.8531, + "step": 4254 + }, + { + "epoch": 0.35, + "grad_norm": 6.999147695090658, + "learning_rate": 7.60817361670733e-06, + "loss": 0.7408, + "step": 4255 + }, + { + "epoch": 0.35, + "grad_norm": 3.2934167685567934, + "learning_rate": 7.607051306896087e-06, + "loss": 0.7353, + "step": 4256 + }, + { + "epoch": 0.35, + "grad_norm": 4.962497122851434, + "learning_rate": 7.605928816660477e-06, + "loss": 0.5706, + "step": 4257 + }, + { + "epoch": 0.35, + "grad_norm": 2.357074047694368, + "learning_rate": 7.604806146078185e-06, + "loss": 0.7376, + "step": 4258 + }, + { + "epoch": 0.35, + "grad_norm": 5.343497725228845, + "learning_rate": 7.603683295226907e-06, + "loss": 0.6709, + "step": 4259 + }, + { + "epoch": 0.35, + "grad_norm": 45.94924082581608, + "learning_rate": 7.602560264184349e-06, + "loss": 0.6962, + "step": 4260 + }, + { + "epoch": 0.35, + "grad_norm": 3.99734239842821, + "learning_rate": 7.601437053028235e-06, + "loss": 0.762, + "step": 4261 + }, + { + "epoch": 0.35, + "grad_norm": 14.08562740089319, + "learning_rate": 7.600313661836298e-06, + "loss": 0.8911, + "step": 4262 + }, + { + "epoch": 0.35, + "grad_norm": 8.032445371482412, + "learning_rate": 7.59919009068628e-06, + "loss": 0.6637, + "step": 4263 + }, + { + "epoch": 0.35, + "grad_norm": 2.3748303119157153, + "learning_rate": 7.598066339655943e-06, + "loss": 0.7459, + "step": 4264 + }, + { + "epoch": 0.35, + "grad_norm": 2.881278380295066, + "learning_rate": 7.596942408823057e-06, + "loss": 0.5924, + "step": 4265 + }, + { + "epoch": 0.35, + "grad_norm": 2.7172493662580743, + "learning_rate": 7.595818298265405e-06, + "loss": 0.7629, + "step": 4266 + }, + { + "epoch": 0.35, + "grad_norm": 4.474899227032276, + "learning_rate": 7.594694008060781e-06, + "loss": 0.6736, + "step": 4267 + }, + { + "epoch": 0.35, + "grad_norm": 3.011121489499902, + "learning_rate": 7.593569538286996e-06, + "loss": 0.6869, + "step": 4268 + }, + { + "epoch": 0.35, + "grad_norm": 7.966491105107606, + "learning_rate": 7.592444889021866e-06, + "loss": 0.6638, + "step": 4269 + }, + { + "epoch": 0.35, + "grad_norm": 3.680136116118259, + "learning_rate": 7.591320060343228e-06, + "loss": 0.878, + "step": 4270 + }, + { + "epoch": 0.35, + "grad_norm": 3.472905503656415, + "learning_rate": 7.590195052328923e-06, + "loss": 0.601, + "step": 4271 + }, + { + "epoch": 0.35, + "grad_norm": 3.3626481887466526, + "learning_rate": 7.589069865056815e-06, + "loss": 0.664, + "step": 4272 + }, + { + "epoch": 0.35, + "grad_norm": 5.648473049318399, + "learning_rate": 7.587944498604767e-06, + "loss": 0.5532, + "step": 4273 + }, + { + "epoch": 0.35, + "grad_norm": 9.179329166723242, + "learning_rate": 7.586818953050666e-06, + "loss": 0.8864, + "step": 4274 + }, + { + "epoch": 0.35, + "grad_norm": 3.592445722859328, + "learning_rate": 7.585693228472405e-06, + "loss": 0.7922, + "step": 4275 + }, + { + "epoch": 0.35, + "grad_norm": 3.6183777042074166, + "learning_rate": 7.584567324947893e-06, + "loss": 0.5551, + "step": 4276 + }, + { + "epoch": 0.35, + "grad_norm": 8.46317710728317, + "learning_rate": 7.5834412425550476e-06, + "loss": 0.7138, + "step": 4277 + }, + { + "epoch": 0.35, + "grad_norm": 2.229481922945613, + "learning_rate": 7.582314981371801e-06, + "loss": 0.6913, + "step": 4278 + }, + { + "epoch": 0.35, + "grad_norm": 4.920419097660212, + "learning_rate": 7.581188541476099e-06, + "loss": 0.8236, + "step": 4279 + }, + { + "epoch": 0.35, + "grad_norm": 4.971846136754737, + "learning_rate": 7.580061922945896e-06, + "loss": 0.7004, + "step": 4280 + }, + { + "epoch": 0.35, + "grad_norm": 3.790890194239284, + "learning_rate": 7.578935125859164e-06, + "loss": 0.6001, + "step": 4281 + }, + { + "epoch": 0.35, + "grad_norm": 3.739403303657867, + "learning_rate": 7.577808150293883e-06, + "loss": 0.6626, + "step": 4282 + }, + { + "epoch": 0.35, + "grad_norm": 11.138993134364888, + "learning_rate": 7.576680996328046e-06, + "loss": 0.6695, + "step": 4283 + }, + { + "epoch": 0.35, + "grad_norm": 3.561010131528322, + "learning_rate": 7.5755536640396585e-06, + "loss": 0.8184, + "step": 4284 + }, + { + "epoch": 0.35, + "grad_norm": 2.472230734999347, + "learning_rate": 7.5744261535067436e-06, + "loss": 0.6314, + "step": 4285 + }, + { + "epoch": 0.35, + "grad_norm": 3.003977553554704, + "learning_rate": 7.573298464807329e-06, + "loss": 0.5863, + "step": 4286 + }, + { + "epoch": 0.35, + "grad_norm": 3.0686257045391883, + "learning_rate": 7.572170598019455e-06, + "loss": 0.6578, + "step": 4287 + }, + { + "epoch": 0.35, + "grad_norm": 2.544806123624027, + "learning_rate": 7.5710425532211795e-06, + "loss": 0.6203, + "step": 4288 + }, + { + "epoch": 0.35, + "grad_norm": 4.576071171501176, + "learning_rate": 7.569914330490573e-06, + "loss": 0.641, + "step": 4289 + }, + { + "epoch": 0.35, + "grad_norm": 4.3167412223340405, + "learning_rate": 7.568785929905713e-06, + "loss": 0.7007, + "step": 4290 + }, + { + "epoch": 0.35, + "grad_norm": 3.309508843412498, + "learning_rate": 7.567657351544691e-06, + "loss": 0.7809, + "step": 4291 + }, + { + "epoch": 0.35, + "grad_norm": 5.412586443488965, + "learning_rate": 7.566528595485614e-06, + "loss": 0.6616, + "step": 4292 + }, + { + "epoch": 0.35, + "grad_norm": 4.025616402450841, + "learning_rate": 7.565399661806598e-06, + "loss": 0.6127, + "step": 4293 + }, + { + "epoch": 0.35, + "grad_norm": 3.1648631058745846, + "learning_rate": 7.564270550585773e-06, + "loss": 0.8687, + "step": 4294 + }, + { + "epoch": 0.35, + "grad_norm": 4.0924745924238835, + "learning_rate": 7.563141261901279e-06, + "loss": 0.6236, + "step": 4295 + }, + { + "epoch": 0.35, + "grad_norm": 3.284148698220159, + "learning_rate": 7.56201179583127e-06, + "loss": 0.6316, + "step": 4296 + }, + { + "epoch": 0.35, + "grad_norm": 4.769678694625544, + "learning_rate": 7.560882152453914e-06, + "loss": 0.7607, + "step": 4297 + }, + { + "epoch": 0.35, + "grad_norm": 5.829625574435862, + "learning_rate": 7.559752331847388e-06, + "loss": 0.7013, + "step": 4298 + }, + { + "epoch": 0.35, + "grad_norm": 2.2528921926368746, + "learning_rate": 7.558622334089884e-06, + "loss": 0.5965, + "step": 4299 + }, + { + "epoch": 0.35, + "grad_norm": 3.152801605318248, + "learning_rate": 7.557492159259603e-06, + "loss": 0.6844, + "step": 4300 + }, + { + "epoch": 0.35, + "grad_norm": 4.327779305291376, + "learning_rate": 7.556361807434762e-06, + "loss": 0.7638, + "step": 4301 + }, + { + "epoch": 0.35, + "grad_norm": 2.841464875596218, + "learning_rate": 7.5552312786935864e-06, + "loss": 0.7442, + "step": 4302 + }, + { + "epoch": 0.35, + "grad_norm": 3.9567418070259452, + "learning_rate": 7.554100573114318e-06, + "loss": 0.8092, + "step": 4303 + }, + { + "epoch": 0.35, + "grad_norm": 2.314113493638594, + "learning_rate": 7.552969690775209e-06, + "loss": 0.7011, + "step": 4304 + }, + { + "epoch": 0.35, + "grad_norm": 8.735612181970499, + "learning_rate": 7.551838631754522e-06, + "loss": 0.8591, + "step": 4305 + }, + { + "epoch": 0.35, + "grad_norm": 5.238950558333852, + "learning_rate": 7.550707396130533e-06, + "loss": 0.7353, + "step": 4306 + }, + { + "epoch": 0.35, + "grad_norm": 8.205141867471987, + "learning_rate": 7.549575983981532e-06, + "loss": 0.7643, + "step": 4307 + }, + { + "epoch": 0.35, + "grad_norm": 4.035112391994702, + "learning_rate": 7.548444395385819e-06, + "loss": 0.8964, + "step": 4308 + }, + { + "epoch": 0.35, + "grad_norm": 3.299592018189047, + "learning_rate": 7.547312630421711e-06, + "loss": 0.7828, + "step": 4309 + }, + { + "epoch": 0.35, + "grad_norm": 4.163112869232628, + "learning_rate": 7.546180689167526e-06, + "loss": 0.7216, + "step": 4310 + }, + { + "epoch": 0.35, + "grad_norm": 2.9262470424793032, + "learning_rate": 7.545048571701606e-06, + "loss": 0.6496, + "step": 4311 + }, + { + "epoch": 0.35, + "grad_norm": 4.011712214957674, + "learning_rate": 7.543916278102301e-06, + "loss": 0.6719, + "step": 4312 + }, + { + "epoch": 0.35, + "grad_norm": 5.111229060072679, + "learning_rate": 7.542783808447971e-06, + "loss": 0.5831, + "step": 4313 + }, + { + "epoch": 0.35, + "grad_norm": 3.8276495122470675, + "learning_rate": 7.541651162816989e-06, + "loss": 0.6679, + "step": 4314 + }, + { + "epoch": 0.35, + "grad_norm": 4.515969422204437, + "learning_rate": 7.540518341287746e-06, + "loss": 0.7352, + "step": 4315 + }, + { + "epoch": 0.35, + "grad_norm": 3.7718834626352558, + "learning_rate": 7.539385343938635e-06, + "loss": 0.6302, + "step": 4316 + }, + { + "epoch": 0.35, + "grad_norm": 2.910487524741352, + "learning_rate": 7.538252170848071e-06, + "loss": 0.8576, + "step": 4317 + }, + { + "epoch": 0.35, + "grad_norm": 3.3754792627870502, + "learning_rate": 7.537118822094474e-06, + "loss": 0.7643, + "step": 4318 + }, + { + "epoch": 0.35, + "grad_norm": 5.375285341172751, + "learning_rate": 7.535985297756278e-06, + "loss": 0.7353, + "step": 4319 + }, + { + "epoch": 0.35, + "grad_norm": 3.450964116605378, + "learning_rate": 7.534851597911933e-06, + "loss": 0.678, + "step": 4320 + }, + { + "epoch": 0.35, + "grad_norm": 6.802113764760465, + "learning_rate": 7.533717722639896e-06, + "loss": 0.681, + "step": 4321 + }, + { + "epoch": 0.35, + "grad_norm": 4.723671888537213, + "learning_rate": 7.5325836720186395e-06, + "loss": 0.7163, + "step": 4322 + }, + { + "epoch": 0.35, + "grad_norm": 2.844832202845109, + "learning_rate": 7.531449446126646e-06, + "loss": 0.8422, + "step": 4323 + }, + { + "epoch": 0.35, + "grad_norm": 5.405549775057817, + "learning_rate": 7.530315045042411e-06, + "loss": 0.6053, + "step": 4324 + }, + { + "epoch": 0.35, + "grad_norm": 5.794883352150976, + "learning_rate": 7.529180468844443e-06, + "loss": 0.8149, + "step": 4325 + }, + { + "epoch": 0.35, + "grad_norm": 2.763794887740198, + "learning_rate": 7.528045717611263e-06, + "loss": 0.5991, + "step": 4326 + }, + { + "epoch": 0.35, + "grad_norm": 4.21226558936452, + "learning_rate": 7.5269107914214e-06, + "loss": 0.7543, + "step": 4327 + }, + { + "epoch": 0.35, + "grad_norm": 3.2514841798366945, + "learning_rate": 7.5257756903534005e-06, + "loss": 0.5784, + "step": 4328 + }, + { + "epoch": 0.35, + "grad_norm": 4.109552823380717, + "learning_rate": 7.52464041448582e-06, + "loss": 0.8158, + "step": 4329 + }, + { + "epoch": 0.35, + "grad_norm": 17.481983166844294, + "learning_rate": 7.523504963897223e-06, + "loss": 0.7928, + "step": 4330 + }, + { + "epoch": 0.35, + "grad_norm": 77.51934481382357, + "learning_rate": 7.522369338666195e-06, + "loss": 0.626, + "step": 4331 + }, + { + "epoch": 0.35, + "grad_norm": 3.096121078174357, + "learning_rate": 7.521233538871329e-06, + "loss": 0.675, + "step": 4332 + }, + { + "epoch": 0.35, + "grad_norm": 3.0225664396930765, + "learning_rate": 7.520097564591224e-06, + "loss": 0.722, + "step": 4333 + }, + { + "epoch": 0.35, + "grad_norm": 3.841735200545695, + "learning_rate": 7.518961415904502e-06, + "loss": 0.8584, + "step": 4334 + }, + { + "epoch": 0.35, + "grad_norm": 3.9403101054438143, + "learning_rate": 7.517825092889789e-06, + "loss": 0.7618, + "step": 4335 + }, + { + "epoch": 0.35, + "grad_norm": 8.637741653292197, + "learning_rate": 7.516688595625725e-06, + "loss": 0.9029, + "step": 4336 + }, + { + "epoch": 0.35, + "grad_norm": 8.571593678613747, + "learning_rate": 7.515551924190964e-06, + "loss": 0.7626, + "step": 4337 + }, + { + "epoch": 0.35, + "grad_norm": 3.684436126968125, + "learning_rate": 7.5144150786641715e-06, + "loss": 0.7271, + "step": 4338 + }, + { + "epoch": 0.35, + "grad_norm": 2.754834524792647, + "learning_rate": 7.5132780591240216e-06, + "loss": 0.7724, + "step": 4339 + }, + { + "epoch": 0.35, + "grad_norm": 2.7306855035362596, + "learning_rate": 7.512140865649207e-06, + "loss": 0.7638, + "step": 4340 + }, + { + "epoch": 0.35, + "grad_norm": 2.848712832399641, + "learning_rate": 7.5110034983184255e-06, + "loss": 0.7882, + "step": 4341 + }, + { + "epoch": 0.35, + "grad_norm": 4.882641771002021, + "learning_rate": 7.509865957210393e-06, + "loss": 0.795, + "step": 4342 + }, + { + "epoch": 0.35, + "grad_norm": 3.2604957956172775, + "learning_rate": 7.508728242403831e-06, + "loss": 0.6628, + "step": 4343 + }, + { + "epoch": 0.35, + "grad_norm": 3.7648995043631066, + "learning_rate": 7.5075903539774785e-06, + "loss": 0.7274, + "step": 4344 + }, + { + "epoch": 0.35, + "grad_norm": 11.340782949888176, + "learning_rate": 7.506452292010085e-06, + "loss": 0.8604, + "step": 4345 + }, + { + "epoch": 0.35, + "grad_norm": 6.424270119065036, + "learning_rate": 7.505314056580411e-06, + "loss": 0.7382, + "step": 4346 + }, + { + "epoch": 0.35, + "grad_norm": 4.694682979492532, + "learning_rate": 7.504175647767229e-06, + "loss": 0.6887, + "step": 4347 + }, + { + "epoch": 0.35, + "grad_norm": 3.2455752662804667, + "learning_rate": 7.503037065649325e-06, + "loss": 0.8675, + "step": 4348 + }, + { + "epoch": 0.35, + "grad_norm": 4.0485837198328785, + "learning_rate": 7.501898310305495e-06, + "loss": 0.8403, + "step": 4349 + }, + { + "epoch": 0.35, + "grad_norm": 4.232352064815178, + "learning_rate": 7.500759381814551e-06, + "loss": 0.6316, + "step": 4350 + }, + { + "epoch": 0.35, + "grad_norm": 2.536019956477901, + "learning_rate": 7.4996202802553085e-06, + "loss": 0.739, + "step": 4351 + }, + { + "epoch": 0.35, + "grad_norm": 2.4280947287275367, + "learning_rate": 7.498481005706606e-06, + "loss": 0.7394, + "step": 4352 + }, + { + "epoch": 0.35, + "grad_norm": 5.383288921593991, + "learning_rate": 7.497341558247285e-06, + "loss": 0.7144, + "step": 4353 + }, + { + "epoch": 0.35, + "grad_norm": 10.719075400449896, + "learning_rate": 7.496201937956204e-06, + "loss": 0.6862, + "step": 4354 + }, + { + "epoch": 0.35, + "grad_norm": 3.1058513793628295, + "learning_rate": 7.495062144912232e-06, + "loss": 0.7191, + "step": 4355 + }, + { + "epoch": 0.35, + "grad_norm": 5.403956744813072, + "learning_rate": 7.493922179194249e-06, + "loss": 0.6442, + "step": 4356 + }, + { + "epoch": 0.35, + "grad_norm": 5.762749520896775, + "learning_rate": 7.492782040881148e-06, + "loss": 0.8512, + "step": 4357 + }, + { + "epoch": 0.35, + "grad_norm": 2.5824931142326437, + "learning_rate": 7.491641730051833e-06, + "loss": 0.5986, + "step": 4358 + }, + { + "epoch": 0.35, + "grad_norm": 2.3777913169090628, + "learning_rate": 7.4905012467852234e-06, + "loss": 0.7455, + "step": 4359 + }, + { + "epoch": 0.35, + "grad_norm": 4.807002957594186, + "learning_rate": 7.489360591160245e-06, + "loss": 0.7312, + "step": 4360 + }, + { + "epoch": 0.35, + "grad_norm": 4.170384616213076, + "learning_rate": 7.48821976325584e-06, + "loss": 0.7623, + "step": 4361 + }, + { + "epoch": 0.35, + "grad_norm": 3.30622894237531, + "learning_rate": 7.487078763150959e-06, + "loss": 0.7976, + "step": 4362 + }, + { + "epoch": 0.35, + "grad_norm": 4.699365350019269, + "learning_rate": 7.485937590924568e-06, + "loss": 0.7909, + "step": 4363 + }, + { + "epoch": 0.35, + "grad_norm": 4.082362812667312, + "learning_rate": 7.484796246655643e-06, + "loss": 0.7012, + "step": 4364 + }, + { + "epoch": 0.35, + "grad_norm": 2.6580126614536552, + "learning_rate": 7.483654730423173e-06, + "loss": 0.7096, + "step": 4365 + }, + { + "epoch": 0.35, + "grad_norm": 4.074900852916815, + "learning_rate": 7.482513042306158e-06, + "loss": 0.6994, + "step": 4366 + }, + { + "epoch": 0.35, + "grad_norm": 3.020106639083808, + "learning_rate": 7.481371182383608e-06, + "loss": 0.6411, + "step": 4367 + }, + { + "epoch": 0.35, + "grad_norm": 6.391860893050497, + "learning_rate": 7.480229150734548e-06, + "loss": 0.7552, + "step": 4368 + }, + { + "epoch": 0.35, + "grad_norm": 3.1866776090479867, + "learning_rate": 7.479086947438015e-06, + "loss": 0.7383, + "step": 4369 + }, + { + "epoch": 0.35, + "grad_norm": 7.938616134659222, + "learning_rate": 7.477944572573054e-06, + "loss": 0.8675, + "step": 4370 + }, + { + "epoch": 0.36, + "grad_norm": 3.729162888124308, + "learning_rate": 7.476802026218726e-06, + "loss": 0.8473, + "step": 4371 + }, + { + "epoch": 0.36, + "grad_norm": 2.648190168076544, + "learning_rate": 7.475659308454104e-06, + "loss": 0.6545, + "step": 4372 + }, + { + "epoch": 0.36, + "grad_norm": 2.5492860425201385, + "learning_rate": 7.474516419358268e-06, + "loss": 0.6718, + "step": 4373 + }, + { + "epoch": 0.36, + "grad_norm": 2.962336085043656, + "learning_rate": 7.4733733590103185e-06, + "loss": 0.6607, + "step": 4374 + }, + { + "epoch": 0.36, + "grad_norm": 4.939593749610692, + "learning_rate": 7.472230127489357e-06, + "loss": 0.6948, + "step": 4375 + }, + { + "epoch": 0.36, + "grad_norm": 4.5483901471045325, + "learning_rate": 7.471086724874503e-06, + "loss": 0.7022, + "step": 4376 + }, + { + "epoch": 0.36, + "grad_norm": 23.344568080240478, + "learning_rate": 7.46994315124489e-06, + "loss": 0.6708, + "step": 4377 + }, + { + "epoch": 0.36, + "grad_norm": 3.8962352363212074, + "learning_rate": 7.4687994066796585e-06, + "loss": 0.9484, + "step": 4378 + }, + { + "epoch": 0.36, + "grad_norm": 3.0325900516467854, + "learning_rate": 7.467655491257962e-06, + "loss": 0.5932, + "step": 4379 + }, + { + "epoch": 0.36, + "grad_norm": 7.373328253132646, + "learning_rate": 7.466511405058969e-06, + "loss": 0.7201, + "step": 4380 + }, + { + "epoch": 0.36, + "grad_norm": 3.2906479470233125, + "learning_rate": 7.4653671481618565e-06, + "loss": 0.6844, + "step": 4381 + }, + { + "epoch": 0.36, + "grad_norm": 5.354385641816617, + "learning_rate": 7.4642227206458125e-06, + "loss": 0.8214, + "step": 4382 + }, + { + "epoch": 0.36, + "grad_norm": 3.7625310585625833, + "learning_rate": 7.463078122590043e-06, + "loss": 0.7634, + "step": 4383 + }, + { + "epoch": 0.36, + "grad_norm": 3.168165182004448, + "learning_rate": 7.4619333540737556e-06, + "loss": 0.6484, + "step": 4384 + }, + { + "epoch": 0.36, + "grad_norm": 3.6448506509708443, + "learning_rate": 7.460788415176181e-06, + "loss": 0.6737, + "step": 4385 + }, + { + "epoch": 0.36, + "grad_norm": 2.8296230470965487, + "learning_rate": 7.459643305976552e-06, + "loss": 0.6259, + "step": 4386 + }, + { + "epoch": 0.36, + "grad_norm": 3.893643745956693, + "learning_rate": 7.45849802655412e-06, + "loss": 0.8519, + "step": 4387 + }, + { + "epoch": 0.36, + "grad_norm": 2.5568576546958846, + "learning_rate": 7.457352576988144e-06, + "loss": 0.7352, + "step": 4388 + }, + { + "epoch": 0.36, + "grad_norm": 2.9646037351542973, + "learning_rate": 7.456206957357896e-06, + "loss": 0.7524, + "step": 4389 + }, + { + "epoch": 0.36, + "grad_norm": 5.742027316486682, + "learning_rate": 7.4550611677426635e-06, + "loss": 0.5943, + "step": 4390 + }, + { + "epoch": 0.36, + "grad_norm": 2.473308982446472, + "learning_rate": 7.453915208221739e-06, + "loss": 0.7509, + "step": 4391 + }, + { + "epoch": 0.36, + "grad_norm": 3.265700408273659, + "learning_rate": 7.45276907887443e-06, + "loss": 0.752, + "step": 4392 + }, + { + "epoch": 0.36, + "grad_norm": 6.999515805677346, + "learning_rate": 7.451622779780057e-06, + "loss": 0.5414, + "step": 4393 + }, + { + "epoch": 0.36, + "grad_norm": 2.311703141414544, + "learning_rate": 7.450476311017951e-06, + "loss": 0.6898, + "step": 4394 + }, + { + "epoch": 0.36, + "grad_norm": 2.8718384830276156, + "learning_rate": 7.449329672667456e-06, + "loss": 0.7158, + "step": 4395 + }, + { + "epoch": 0.36, + "grad_norm": 3.7790002479755347, + "learning_rate": 7.4481828648079235e-06, + "loss": 0.6822, + "step": 4396 + }, + { + "epoch": 0.36, + "grad_norm": 3.5467110462830638, + "learning_rate": 7.447035887518722e-06, + "loss": 0.8671, + "step": 4397 + }, + { + "epoch": 0.36, + "grad_norm": 3.36794392555424, + "learning_rate": 7.44588874087923e-06, + "loss": 0.9396, + "step": 4398 + }, + { + "epoch": 0.36, + "grad_norm": 3.5598389632750376, + "learning_rate": 7.4447414249688375e-06, + "loss": 0.7146, + "step": 4399 + }, + { + "epoch": 0.36, + "grad_norm": 3.5350313382908096, + "learning_rate": 7.443593939866944e-06, + "loss": 0.6315, + "step": 4400 + }, + { + "epoch": 0.36, + "grad_norm": 3.5845399562367386, + "learning_rate": 7.442446285652964e-06, + "loss": 0.6204, + "step": 4401 + }, + { + "epoch": 0.36, + "grad_norm": 21.358207546900445, + "learning_rate": 7.441298462406321e-06, + "loss": 0.6278, + "step": 4402 + }, + { + "epoch": 0.36, + "grad_norm": 2.7893521881140986, + "learning_rate": 7.440150470206453e-06, + "loss": 0.7836, + "step": 4403 + }, + { + "epoch": 0.36, + "grad_norm": 4.3679027234008, + "learning_rate": 7.439002309132808e-06, + "loss": 0.8058, + "step": 4404 + }, + { + "epoch": 0.36, + "grad_norm": 2.462728113742895, + "learning_rate": 7.437853979264847e-06, + "loss": 0.735, + "step": 4405 + }, + { + "epoch": 0.36, + "grad_norm": 6.404155790869616, + "learning_rate": 7.43670548068204e-06, + "loss": 0.8153, + "step": 4406 + }, + { + "epoch": 0.36, + "grad_norm": 5.479978434150942, + "learning_rate": 7.435556813463871e-06, + "loss": 0.6841, + "step": 4407 + }, + { + "epoch": 0.36, + "grad_norm": 5.6454771356866, + "learning_rate": 7.434407977689837e-06, + "loss": 0.6981, + "step": 4408 + }, + { + "epoch": 0.36, + "grad_norm": 3.8221363121651963, + "learning_rate": 7.43325897343944e-06, + "loss": 0.8039, + "step": 4409 + }, + { + "epoch": 0.36, + "grad_norm": 2.5910480444473136, + "learning_rate": 7.432109800792201e-06, + "loss": 0.5592, + "step": 4410 + }, + { + "epoch": 0.36, + "grad_norm": 3.564302382351731, + "learning_rate": 7.430960459827652e-06, + "loss": 0.6607, + "step": 4411 + }, + { + "epoch": 0.36, + "grad_norm": 6.327072800179629, + "learning_rate": 7.42981095062533e-06, + "loss": 0.7135, + "step": 4412 + }, + { + "epoch": 0.36, + "grad_norm": 2.5503268761382865, + "learning_rate": 7.428661273264792e-06, + "loss": 0.7202, + "step": 4413 + }, + { + "epoch": 0.36, + "grad_norm": 2.962175249739075, + "learning_rate": 7.427511427825602e-06, + "loss": 0.6867, + "step": 4414 + }, + { + "epoch": 0.36, + "grad_norm": 5.797923456038306, + "learning_rate": 7.426361414387338e-06, + "loss": 0.6126, + "step": 4415 + }, + { + "epoch": 0.36, + "grad_norm": 2.9750245664600543, + "learning_rate": 7.4252112330295835e-06, + "loss": 0.7435, + "step": 4416 + }, + { + "epoch": 0.36, + "grad_norm": 4.357031784225165, + "learning_rate": 7.424060883831942e-06, + "loss": 0.6562, + "step": 4417 + }, + { + "epoch": 0.36, + "grad_norm": 4.689394544762236, + "learning_rate": 7.422910366874026e-06, + "loss": 0.6558, + "step": 4418 + }, + { + "epoch": 0.36, + "grad_norm": 3.362997759954026, + "learning_rate": 7.421759682235454e-06, + "loss": 0.8782, + "step": 4419 + }, + { + "epoch": 0.36, + "grad_norm": 4.076213161988718, + "learning_rate": 7.4206088299958646e-06, + "loss": 0.5963, + "step": 4420 + }, + { + "epoch": 0.36, + "grad_norm": 2.4998675953692286, + "learning_rate": 7.4194578102349025e-06, + "loss": 0.7361, + "step": 4421 + }, + { + "epoch": 0.36, + "grad_norm": 10.68025149381913, + "learning_rate": 7.418306623032227e-06, + "loss": 0.8335, + "step": 4422 + }, + { + "epoch": 0.36, + "grad_norm": 3.915984488004341, + "learning_rate": 7.417155268467505e-06, + "loss": 0.694, + "step": 4423 + }, + { + "epoch": 0.36, + "grad_norm": 4.323354267768315, + "learning_rate": 7.416003746620419e-06, + "loss": 0.8888, + "step": 4424 + }, + { + "epoch": 0.36, + "grad_norm": 3.0366748416549365, + "learning_rate": 7.414852057570661e-06, + "loss": 0.7066, + "step": 4425 + }, + { + "epoch": 0.36, + "grad_norm": 3.80221444821345, + "learning_rate": 7.413700201397936e-06, + "loss": 0.8362, + "step": 4426 + }, + { + "epoch": 0.36, + "grad_norm": 4.179382507836741, + "learning_rate": 7.4125481781819594e-06, + "loss": 0.7123, + "step": 4427 + }, + { + "epoch": 0.36, + "grad_norm": 5.264683396950939, + "learning_rate": 7.411395988002457e-06, + "loss": 0.6809, + "step": 4428 + }, + { + "epoch": 0.36, + "grad_norm": 2.5328985399594695, + "learning_rate": 7.41024363093917e-06, + "loss": 0.7276, + "step": 4429 + }, + { + "epoch": 0.36, + "grad_norm": 5.269893304673372, + "learning_rate": 7.409091107071849e-06, + "loss": 0.6323, + "step": 4430 + }, + { + "epoch": 0.36, + "grad_norm": 4.533576009059273, + "learning_rate": 7.407938416480253e-06, + "loss": 0.6308, + "step": 4431 + }, + { + "epoch": 0.36, + "grad_norm": 2.879600882191293, + "learning_rate": 7.406785559244156e-06, + "loss": 0.6572, + "step": 4432 + }, + { + "epoch": 0.36, + "grad_norm": 7.286176994035368, + "learning_rate": 7.4056325354433445e-06, + "loss": 0.7032, + "step": 4433 + }, + { + "epoch": 0.36, + "grad_norm": 2.4675438183094593, + "learning_rate": 7.404479345157613e-06, + "loss": 0.717, + "step": 4434 + }, + { + "epoch": 0.36, + "grad_norm": 3.781173474743352, + "learning_rate": 7.403325988466774e-06, + "loss": 0.5571, + "step": 4435 + }, + { + "epoch": 0.36, + "grad_norm": 2.485568872963055, + "learning_rate": 7.402172465450642e-06, + "loss": 0.5699, + "step": 4436 + }, + { + "epoch": 0.36, + "grad_norm": 3.906426371288215, + "learning_rate": 7.4010187761890504e-06, + "loss": 0.5097, + "step": 4437 + }, + { + "epoch": 0.36, + "grad_norm": 3.3024995323868644, + "learning_rate": 7.3998649207618425e-06, + "loss": 0.7408, + "step": 4438 + }, + { + "epoch": 0.36, + "grad_norm": 3.6042702688457045, + "learning_rate": 7.398710899248871e-06, + "loss": 0.7343, + "step": 4439 + }, + { + "epoch": 0.36, + "grad_norm": 4.120637871880075, + "learning_rate": 7.39755671173e-06, + "loss": 0.8833, + "step": 4440 + }, + { + "epoch": 0.36, + "grad_norm": 4.802276718670347, + "learning_rate": 7.396402358285111e-06, + "loss": 0.6656, + "step": 4441 + }, + { + "epoch": 0.36, + "grad_norm": 5.523283725885031, + "learning_rate": 7.395247838994087e-06, + "loss": 0.8113, + "step": 4442 + }, + { + "epoch": 0.36, + "grad_norm": 2.6428317951929197, + "learning_rate": 7.394093153936832e-06, + "loss": 0.7314, + "step": 4443 + }, + { + "epoch": 0.36, + "grad_norm": 2.8348351784781376, + "learning_rate": 7.392938303193257e-06, + "loss": 0.6182, + "step": 4444 + }, + { + "epoch": 0.36, + "grad_norm": 3.7823311607569168, + "learning_rate": 7.391783286843283e-06, + "loss": 0.867, + "step": 4445 + }, + { + "epoch": 0.36, + "grad_norm": 2.675564225908151, + "learning_rate": 7.390628104966846e-06, + "loss": 0.7526, + "step": 4446 + }, + { + "epoch": 0.36, + "grad_norm": 3.0716978667734183, + "learning_rate": 7.389472757643892e-06, + "loss": 0.7543, + "step": 4447 + }, + { + "epoch": 0.36, + "grad_norm": 3.5339393854202745, + "learning_rate": 7.388317244954379e-06, + "loss": 0.792, + "step": 4448 + }, + { + "epoch": 0.36, + "grad_norm": 2.4377307616407755, + "learning_rate": 7.387161566978271e-06, + "loss": 0.7818, + "step": 4449 + }, + { + "epoch": 0.36, + "grad_norm": 3.4030299838671603, + "learning_rate": 7.386005723795554e-06, + "loss": 0.7784, + "step": 4450 + }, + { + "epoch": 0.36, + "grad_norm": 5.238520446201125, + "learning_rate": 7.384849715486217e-06, + "loss": 0.6194, + "step": 4451 + }, + { + "epoch": 0.36, + "grad_norm": 3.6739702339207656, + "learning_rate": 7.383693542130265e-06, + "loss": 0.7833, + "step": 4452 + }, + { + "epoch": 0.36, + "grad_norm": 2.699240871470253, + "learning_rate": 7.382537203807709e-06, + "loss": 0.6864, + "step": 4453 + }, + { + "epoch": 0.36, + "grad_norm": 2.815667069581723, + "learning_rate": 7.381380700598577e-06, + "loss": 0.8795, + "step": 4454 + }, + { + "epoch": 0.36, + "grad_norm": 4.0636217260525305, + "learning_rate": 7.380224032582908e-06, + "loss": 0.6884, + "step": 4455 + }, + { + "epoch": 0.36, + "grad_norm": 3.6425945293438087, + "learning_rate": 7.379067199840746e-06, + "loss": 0.7579, + "step": 4456 + }, + { + "epoch": 0.36, + "grad_norm": 3.060351253226855, + "learning_rate": 7.377910202452155e-06, + "loss": 0.6402, + "step": 4457 + }, + { + "epoch": 0.36, + "grad_norm": 3.842007156371223, + "learning_rate": 7.376753040497207e-06, + "loss": 0.7784, + "step": 4458 + }, + { + "epoch": 0.36, + "grad_norm": 2.914196955308941, + "learning_rate": 7.375595714055981e-06, + "loss": 0.8269, + "step": 4459 + }, + { + "epoch": 0.36, + "grad_norm": 3.6018975751318885, + "learning_rate": 7.374438223208575e-06, + "loss": 0.6826, + "step": 4460 + }, + { + "epoch": 0.36, + "grad_norm": 3.588982859842889, + "learning_rate": 7.373280568035093e-06, + "loss": 0.7012, + "step": 4461 + }, + { + "epoch": 0.36, + "grad_norm": 3.0819515408638516, + "learning_rate": 7.372122748615651e-06, + "loss": 0.7294, + "step": 4462 + }, + { + "epoch": 0.36, + "grad_norm": 4.4930432066471555, + "learning_rate": 7.370964765030381e-06, + "loss": 0.6681, + "step": 4463 + }, + { + "epoch": 0.36, + "grad_norm": 2.5649525708235696, + "learning_rate": 7.36980661735942e-06, + "loss": 0.71, + "step": 4464 + }, + { + "epoch": 0.36, + "grad_norm": 4.523007518653461, + "learning_rate": 7.368648305682917e-06, + "loss": 0.6903, + "step": 4465 + }, + { + "epoch": 0.36, + "grad_norm": 3.420544116856962, + "learning_rate": 7.367489830081039e-06, + "loss": 0.6694, + "step": 4466 + }, + { + "epoch": 0.36, + "grad_norm": 4.097159963606105, + "learning_rate": 7.3663311906339575e-06, + "loss": 0.6577, + "step": 4467 + }, + { + "epoch": 0.36, + "grad_norm": 3.9323260735012817, + "learning_rate": 7.365172387421858e-06, + "loss": 0.6595, + "step": 4468 + }, + { + "epoch": 0.36, + "grad_norm": 3.2839904184587567, + "learning_rate": 7.364013420524937e-06, + "loss": 0.8204, + "step": 4469 + }, + { + "epoch": 0.36, + "grad_norm": 3.077568178413793, + "learning_rate": 7.362854290023402e-06, + "loss": 0.803, + "step": 4470 + }, + { + "epoch": 0.36, + "grad_norm": 2.9499308219118725, + "learning_rate": 7.361694995997473e-06, + "loss": 0.6457, + "step": 4471 + }, + { + "epoch": 0.36, + "grad_norm": 6.008733877548259, + "learning_rate": 7.3605355385273805e-06, + "loss": 0.5798, + "step": 4472 + }, + { + "epoch": 0.36, + "grad_norm": 2.1644681750580914, + "learning_rate": 7.359375917693363e-06, + "loss": 0.5895, + "step": 4473 + }, + { + "epoch": 0.36, + "grad_norm": 5.284804039572399, + "learning_rate": 7.358216133575678e-06, + "loss": 0.8053, + "step": 4474 + }, + { + "epoch": 0.36, + "grad_norm": 2.9141999765481916, + "learning_rate": 7.357056186254587e-06, + "loss": 0.7025, + "step": 4475 + }, + { + "epoch": 0.36, + "grad_norm": 2.718037820371632, + "learning_rate": 7.355896075810368e-06, + "loss": 0.7647, + "step": 4476 + }, + { + "epoch": 0.36, + "grad_norm": 4.8492144590168, + "learning_rate": 7.354735802323305e-06, + "loss": 0.7476, + "step": 4477 + }, + { + "epoch": 0.36, + "grad_norm": 3.4937446532726746, + "learning_rate": 7.3535753658737e-06, + "loss": 0.7112, + "step": 4478 + }, + { + "epoch": 0.36, + "grad_norm": 2.3530635055445606, + "learning_rate": 7.3524147665418585e-06, + "loss": 0.6617, + "step": 4479 + }, + { + "epoch": 0.36, + "grad_norm": 3.825658092249463, + "learning_rate": 7.351254004408104e-06, + "loss": 0.7787, + "step": 4480 + }, + { + "epoch": 0.36, + "grad_norm": 3.4362059331620687, + "learning_rate": 7.350093079552768e-06, + "loss": 0.6551, + "step": 4481 + }, + { + "epoch": 0.36, + "grad_norm": 2.349597271509936, + "learning_rate": 7.348931992056192e-06, + "loss": 0.6308, + "step": 4482 + }, + { + "epoch": 0.36, + "grad_norm": 6.397850722351214, + "learning_rate": 7.347770741998733e-06, + "loss": 0.7859, + "step": 4483 + }, + { + "epoch": 0.36, + "grad_norm": 2.7586586677911478, + "learning_rate": 7.346609329460757e-06, + "loss": 0.7025, + "step": 4484 + }, + { + "epoch": 0.36, + "grad_norm": 4.2912941470279655, + "learning_rate": 7.345447754522637e-06, + "loss": 0.5807, + "step": 4485 + }, + { + "epoch": 0.36, + "grad_norm": 5.2182919104816285, + "learning_rate": 7.344286017264765e-06, + "loss": 0.6608, + "step": 4486 + }, + { + "epoch": 0.36, + "grad_norm": 9.616429337382309, + "learning_rate": 7.343124117767542e-06, + "loss": 0.7459, + "step": 4487 + }, + { + "epoch": 0.36, + "grad_norm": 3.975173007872712, + "learning_rate": 7.341962056111376e-06, + "loss": 0.6199, + "step": 4488 + }, + { + "epoch": 0.36, + "grad_norm": 4.344522161427955, + "learning_rate": 7.340799832376689e-06, + "loss": 0.7651, + "step": 4489 + }, + { + "epoch": 0.36, + "grad_norm": 3.9539388157709983, + "learning_rate": 7.339637446643913e-06, + "loss": 0.7351, + "step": 4490 + }, + { + "epoch": 0.36, + "grad_norm": 5.500780946926331, + "learning_rate": 7.338474898993496e-06, + "loss": 0.625, + "step": 4491 + }, + { + "epoch": 0.36, + "grad_norm": 2.695192884904258, + "learning_rate": 7.337312189505892e-06, + "loss": 0.8551, + "step": 4492 + }, + { + "epoch": 0.36, + "grad_norm": 3.6660337234277947, + "learning_rate": 7.336149318261565e-06, + "loss": 0.6786, + "step": 4493 + }, + { + "epoch": 0.37, + "grad_norm": 3.4142356380326087, + "learning_rate": 7.3349862853409996e-06, + "loss": 0.6263, + "step": 4494 + }, + { + "epoch": 0.37, + "grad_norm": 7.343329823678661, + "learning_rate": 7.333823090824679e-06, + "loss": 0.6181, + "step": 4495 + }, + { + "epoch": 0.37, + "grad_norm": 4.352349273214737, + "learning_rate": 7.332659734793104e-06, + "loss": 0.7285, + "step": 4496 + }, + { + "epoch": 0.37, + "grad_norm": 3.755468306016667, + "learning_rate": 7.331496217326789e-06, + "loss": 0.6578, + "step": 4497 + }, + { + "epoch": 0.37, + "grad_norm": 6.070947189547664, + "learning_rate": 7.3303325385062555e-06, + "loss": 0.5937, + "step": 4498 + }, + { + "epoch": 0.37, + "grad_norm": 4.786729601064355, + "learning_rate": 7.329168698412037e-06, + "loss": 0.9753, + "step": 4499 + }, + { + "epoch": 0.37, + "grad_norm": 17.53945792672117, + "learning_rate": 7.3280046971246786e-06, + "loss": 0.6946, + "step": 4500 + }, + { + "epoch": 0.37, + "grad_norm": 3.1868990876330567, + "learning_rate": 7.326840534724738e-06, + "loss": 0.7419, + "step": 4501 + }, + { + "epoch": 0.37, + "grad_norm": 4.77679280063808, + "learning_rate": 7.3256762112927805e-06, + "loss": 0.6692, + "step": 4502 + }, + { + "epoch": 0.37, + "grad_norm": 5.282872279203256, + "learning_rate": 7.324511726909387e-06, + "loss": 0.6341, + "step": 4503 + }, + { + "epoch": 0.37, + "grad_norm": 3.056725116253631, + "learning_rate": 7.323347081655146e-06, + "loss": 0.7403, + "step": 4504 + }, + { + "epoch": 0.37, + "grad_norm": 2.7936693920790505, + "learning_rate": 7.322182275610655e-06, + "loss": 0.8717, + "step": 4505 + }, + { + "epoch": 0.37, + "grad_norm": 3.1625636100261616, + "learning_rate": 7.3210173088565294e-06, + "loss": 0.5849, + "step": 4506 + }, + { + "epoch": 0.37, + "grad_norm": 5.693060722705188, + "learning_rate": 7.319852181473393e-06, + "loss": 0.7963, + "step": 4507 + }, + { + "epoch": 0.37, + "grad_norm": 3.299505131122427, + "learning_rate": 7.318686893541879e-06, + "loss": 0.8001, + "step": 4508 + }, + { + "epoch": 0.37, + "grad_norm": 4.223171113197734, + "learning_rate": 7.317521445142631e-06, + "loss": 0.7491, + "step": 4509 + }, + { + "epoch": 0.37, + "grad_norm": 3.90549369845727, + "learning_rate": 7.3163558363563055e-06, + "loss": 0.657, + "step": 4510 + }, + { + "epoch": 0.37, + "grad_norm": 3.6712786695001736, + "learning_rate": 7.315190067263574e-06, + "loss": 0.6473, + "step": 4511 + }, + { + "epoch": 0.37, + "grad_norm": 3.0617241975372522, + "learning_rate": 7.314024137945113e-06, + "loss": 0.7854, + "step": 4512 + }, + { + "epoch": 0.37, + "grad_norm": 4.16181435349445, + "learning_rate": 7.312858048481608e-06, + "loss": 0.6128, + "step": 4513 + }, + { + "epoch": 0.37, + "grad_norm": 4.581605283772196, + "learning_rate": 7.311691798953765e-06, + "loss": 0.6351, + "step": 4514 + }, + { + "epoch": 0.37, + "grad_norm": 3.18255791428962, + "learning_rate": 7.310525389442294e-06, + "loss": 0.6911, + "step": 4515 + }, + { + "epoch": 0.37, + "grad_norm": 5.513449865630468, + "learning_rate": 7.3093588200279165e-06, + "loss": 0.601, + "step": 4516 + }, + { + "epoch": 0.37, + "grad_norm": 3.5127199862506826, + "learning_rate": 7.308192090791368e-06, + "loss": 0.5516, + "step": 4517 + }, + { + "epoch": 0.37, + "grad_norm": 3.2659025156335186, + "learning_rate": 7.307025201813394e-06, + "loss": 0.6579, + "step": 4518 + }, + { + "epoch": 0.37, + "grad_norm": 3.8030309495349726, + "learning_rate": 7.30585815317475e-06, + "loss": 0.6771, + "step": 4519 + }, + { + "epoch": 0.37, + "grad_norm": 2.7472380661656786, + "learning_rate": 7.304690944956202e-06, + "loss": 0.767, + "step": 4520 + }, + { + "epoch": 0.37, + "grad_norm": 2.323666872316359, + "learning_rate": 7.3035235772385295e-06, + "loss": 0.8885, + "step": 4521 + }, + { + "epoch": 0.37, + "grad_norm": 4.359434683976116, + "learning_rate": 7.302356050102522e-06, + "loss": 0.6723, + "step": 4522 + }, + { + "epoch": 0.37, + "grad_norm": 3.9416315942471467, + "learning_rate": 7.301188363628977e-06, + "loss": 0.6317, + "step": 4523 + }, + { + "epoch": 0.37, + "grad_norm": 3.5357673660822733, + "learning_rate": 7.30002051789871e-06, + "loss": 0.647, + "step": 4524 + }, + { + "epoch": 0.37, + "grad_norm": 3.3937182078312556, + "learning_rate": 7.298852512992539e-06, + "loss": 0.7492, + "step": 4525 + }, + { + "epoch": 0.37, + "grad_norm": 5.540364366001674, + "learning_rate": 7.2976843489913004e-06, + "loss": 0.6415, + "step": 4526 + }, + { + "epoch": 0.37, + "grad_norm": 5.3543556793176865, + "learning_rate": 7.296516025975837e-06, + "loss": 0.6691, + "step": 4527 + }, + { + "epoch": 0.37, + "grad_norm": 8.503150703278033, + "learning_rate": 7.295347544027006e-06, + "loss": 0.7484, + "step": 4528 + }, + { + "epoch": 0.37, + "grad_norm": 3.1146291847492726, + "learning_rate": 7.2941789032256705e-06, + "loss": 0.7278, + "step": 4529 + }, + { + "epoch": 0.37, + "grad_norm": 3.0630323459162354, + "learning_rate": 7.29301010365271e-06, + "loss": 0.8748, + "step": 4530 + }, + { + "epoch": 0.37, + "grad_norm": 3.5926084791304436, + "learning_rate": 7.291841145389013e-06, + "loss": 0.7058, + "step": 4531 + }, + { + "epoch": 0.37, + "grad_norm": 3.6182340081078275, + "learning_rate": 7.290672028515477e-06, + "loss": 0.5328, + "step": 4532 + }, + { + "epoch": 0.37, + "grad_norm": 2.7299228586058613, + "learning_rate": 7.289502753113015e-06, + "loss": 0.7408, + "step": 4533 + }, + { + "epoch": 0.37, + "grad_norm": 3.8451954582573413, + "learning_rate": 7.288333319262546e-06, + "loss": 0.6903, + "step": 4534 + }, + { + "epoch": 0.37, + "grad_norm": 2.53024285428926, + "learning_rate": 7.287163727045002e-06, + "loss": 0.6186, + "step": 4535 + }, + { + "epoch": 0.37, + "grad_norm": 3.024554096339014, + "learning_rate": 7.285993976541328e-06, + "loss": 0.7634, + "step": 4536 + }, + { + "epoch": 0.37, + "grad_norm": 2.8918017196047874, + "learning_rate": 7.284824067832477e-06, + "loss": 0.5123, + "step": 4537 + }, + { + "epoch": 0.37, + "grad_norm": 3.6861996334807534, + "learning_rate": 7.283654000999413e-06, + "loss": 0.7377, + "step": 4538 + }, + { + "epoch": 0.37, + "grad_norm": 3.511724204450621, + "learning_rate": 7.282483776123113e-06, + "loss": 0.6785, + "step": 4539 + }, + { + "epoch": 0.37, + "grad_norm": 6.604730870714155, + "learning_rate": 7.281313393284564e-06, + "loss": 0.645, + "step": 4540 + }, + { + "epoch": 0.37, + "grad_norm": 3.713225005858742, + "learning_rate": 7.280142852564764e-06, + "loss": 0.7845, + "step": 4541 + }, + { + "epoch": 0.37, + "grad_norm": 3.9671421588655744, + "learning_rate": 7.278972154044722e-06, + "loss": 0.6083, + "step": 4542 + }, + { + "epoch": 0.37, + "grad_norm": 2.798603295901293, + "learning_rate": 7.277801297805458e-06, + "loss": 0.5438, + "step": 4543 + }, + { + "epoch": 0.37, + "grad_norm": 2.334182366897565, + "learning_rate": 7.276630283928002e-06, + "loss": 0.7704, + "step": 4544 + }, + { + "epoch": 0.37, + "grad_norm": 6.733843349007048, + "learning_rate": 7.275459112493395e-06, + "loss": 0.8557, + "step": 4545 + }, + { + "epoch": 0.37, + "grad_norm": 4.158914812034645, + "learning_rate": 7.274287783582689e-06, + "loss": 0.8004, + "step": 4546 + }, + { + "epoch": 0.37, + "grad_norm": 4.695978509578617, + "learning_rate": 7.2731162972769484e-06, + "loss": 0.7563, + "step": 4547 + }, + { + "epoch": 0.37, + "grad_norm": 2.331290508565121, + "learning_rate": 7.271944653657248e-06, + "loss": 0.7446, + "step": 4548 + }, + { + "epoch": 0.37, + "grad_norm": 3.4394030356131027, + "learning_rate": 7.270772852804672e-06, + "loss": 0.6757, + "step": 4549 + }, + { + "epoch": 0.37, + "grad_norm": 3.0920548798272485, + "learning_rate": 7.2696008948003164e-06, + "loss": 0.6967, + "step": 4550 + }, + { + "epoch": 0.37, + "grad_norm": 3.540676870570052, + "learning_rate": 7.26842877972529e-06, + "loss": 0.608, + "step": 4551 + }, + { + "epoch": 0.37, + "grad_norm": 2.557950061716488, + "learning_rate": 7.2672565076607075e-06, + "loss": 0.5225, + "step": 4552 + }, + { + "epoch": 0.37, + "grad_norm": 3.7142264642875458, + "learning_rate": 7.266084078687698e-06, + "loss": 0.7657, + "step": 4553 + }, + { + "epoch": 0.37, + "grad_norm": 11.09317960733513, + "learning_rate": 7.264911492887403e-06, + "loss": 0.7854, + "step": 4554 + }, + { + "epoch": 0.37, + "grad_norm": 2.677222738106719, + "learning_rate": 7.26373875034097e-06, + "loss": 0.7034, + "step": 4555 + }, + { + "epoch": 0.37, + "grad_norm": 3.038416509291698, + "learning_rate": 7.2625658511295635e-06, + "loss": 0.6344, + "step": 4556 + }, + { + "epoch": 0.37, + "grad_norm": 3.526997371392452, + "learning_rate": 7.261392795334354e-06, + "loss": 0.6855, + "step": 4557 + }, + { + "epoch": 0.37, + "grad_norm": 4.839224849112458, + "learning_rate": 7.260219583036523e-06, + "loss": 0.7645, + "step": 4558 + }, + { + "epoch": 0.37, + "grad_norm": 6.486281317039733, + "learning_rate": 7.259046214317266e-06, + "loss": 0.7564, + "step": 4559 + }, + { + "epoch": 0.37, + "grad_norm": 2.6792517499256263, + "learning_rate": 7.257872689257787e-06, + "loss": 0.6917, + "step": 4560 + }, + { + "epoch": 0.37, + "grad_norm": 3.771886467674248, + "learning_rate": 7.256699007939301e-06, + "loss": 0.6858, + "step": 4561 + }, + { + "epoch": 0.37, + "grad_norm": 3.344732150887603, + "learning_rate": 7.255525170443034e-06, + "loss": 0.5819, + "step": 4562 + }, + { + "epoch": 0.37, + "grad_norm": 4.363691755572554, + "learning_rate": 7.254351176850223e-06, + "loss": 0.66, + "step": 4563 + }, + { + "epoch": 0.37, + "grad_norm": 2.615891923513796, + "learning_rate": 7.253177027242117e-06, + "loss": 0.8462, + "step": 4564 + }, + { + "epoch": 0.37, + "grad_norm": 3.1168405735950806, + "learning_rate": 7.252002721699972e-06, + "loss": 0.6546, + "step": 4565 + }, + { + "epoch": 0.37, + "grad_norm": 3.6931431043567744, + "learning_rate": 7.2508282603050595e-06, + "loss": 0.6695, + "step": 4566 + }, + { + "epoch": 0.37, + "grad_norm": 9.327837871670415, + "learning_rate": 7.24965364313866e-06, + "loss": 0.9064, + "step": 4567 + }, + { + "epoch": 0.37, + "grad_norm": 2.481337631194375, + "learning_rate": 7.248478870282063e-06, + "loss": 0.7351, + "step": 4568 + }, + { + "epoch": 0.37, + "grad_norm": 5.62741655654551, + "learning_rate": 7.24730394181657e-06, + "loss": 0.6023, + "step": 4569 + }, + { + "epoch": 0.37, + "grad_norm": 3.3939429939186585, + "learning_rate": 7.2461288578234955e-06, + "loss": 0.679, + "step": 4570 + }, + { + "epoch": 0.37, + "grad_norm": 3.573568861569186, + "learning_rate": 7.2449536183841584e-06, + "loss": 0.7867, + "step": 4571 + }, + { + "epoch": 0.37, + "grad_norm": 2.609345271924847, + "learning_rate": 7.2437782235798985e-06, + "loss": 0.6368, + "step": 4572 + }, + { + "epoch": 0.37, + "grad_norm": 2.6043982681065376, + "learning_rate": 7.242602673492054e-06, + "loss": 0.7712, + "step": 4573 + }, + { + "epoch": 0.37, + "grad_norm": 6.21425089175264, + "learning_rate": 7.241426968201988e-06, + "loss": 0.6049, + "step": 4574 + }, + { + "epoch": 0.37, + "grad_norm": 2.595482758036474, + "learning_rate": 7.24025110779106e-06, + "loss": 0.7128, + "step": 4575 + }, + { + "epoch": 0.37, + "grad_norm": 3.9185572592968274, + "learning_rate": 7.239075092340651e-06, + "loss": 0.8079, + "step": 4576 + }, + { + "epoch": 0.37, + "grad_norm": 4.906126656128381, + "learning_rate": 7.2378989219321475e-06, + "loss": 0.6028, + "step": 4577 + }, + { + "epoch": 0.37, + "grad_norm": 8.80429910323126, + "learning_rate": 7.236722596646946e-06, + "loss": 0.7737, + "step": 4578 + }, + { + "epoch": 0.37, + "grad_norm": 3.6438321064462946, + "learning_rate": 7.235546116566456e-06, + "loss": 0.6816, + "step": 4579 + }, + { + "epoch": 0.37, + "grad_norm": 3.07118777038309, + "learning_rate": 7.234369481772101e-06, + "loss": 0.7659, + "step": 4580 + }, + { + "epoch": 0.37, + "grad_norm": 2.7111779933442888, + "learning_rate": 7.233192692345309e-06, + "loss": 0.651, + "step": 4581 + }, + { + "epoch": 0.37, + "grad_norm": 4.234014950863667, + "learning_rate": 7.23201574836752e-06, + "loss": 0.6503, + "step": 4582 + }, + { + "epoch": 0.37, + "grad_norm": 4.817036907452633, + "learning_rate": 7.230838649920189e-06, + "loss": 0.8369, + "step": 4583 + }, + { + "epoch": 0.37, + "grad_norm": 2.5171222675429257, + "learning_rate": 7.229661397084775e-06, + "loss": 0.6974, + "step": 4584 + }, + { + "epoch": 0.37, + "grad_norm": 4.442788209780339, + "learning_rate": 7.228483989942756e-06, + "loss": 0.5769, + "step": 4585 + }, + { + "epoch": 0.37, + "grad_norm": 6.026905561747521, + "learning_rate": 7.227306428575611e-06, + "loss": 0.8128, + "step": 4586 + }, + { + "epoch": 0.37, + "grad_norm": 3.134403083387003, + "learning_rate": 7.2261287130648374e-06, + "loss": 0.8053, + "step": 4587 + }, + { + "epoch": 0.37, + "grad_norm": 11.268197332653804, + "learning_rate": 7.224950843491941e-06, + "loss": 0.7556, + "step": 4588 + }, + { + "epoch": 0.37, + "grad_norm": 4.587164494762261, + "learning_rate": 7.223772819938434e-06, + "loss": 0.7144, + "step": 4589 + }, + { + "epoch": 0.37, + "grad_norm": 8.765766967551963, + "learning_rate": 7.222594642485849e-06, + "loss": 0.6391, + "step": 4590 + }, + { + "epoch": 0.37, + "grad_norm": 3.6599792090456127, + "learning_rate": 7.221416311215718e-06, + "loss": 0.6075, + "step": 4591 + }, + { + "epoch": 0.37, + "grad_norm": 2.978936563983368, + "learning_rate": 7.220237826209592e-06, + "loss": 0.6746, + "step": 4592 + }, + { + "epoch": 0.37, + "grad_norm": 4.096708790035792, + "learning_rate": 7.219059187549028e-06, + "loss": 0.6987, + "step": 4593 + }, + { + "epoch": 0.37, + "grad_norm": 4.214828583214547, + "learning_rate": 7.217880395315596e-06, + "loss": 0.6459, + "step": 4594 + }, + { + "epoch": 0.37, + "grad_norm": 3.529298377320403, + "learning_rate": 7.216701449590876e-06, + "loss": 0.7842, + "step": 4595 + }, + { + "epoch": 0.37, + "grad_norm": 8.43057087299158, + "learning_rate": 7.215522350456457e-06, + "loss": 0.7392, + "step": 4596 + }, + { + "epoch": 0.37, + "grad_norm": 6.083927468428668, + "learning_rate": 7.214343097993944e-06, + "loss": 0.8105, + "step": 4597 + }, + { + "epoch": 0.37, + "grad_norm": 2.9975262418947, + "learning_rate": 7.213163692284943e-06, + "loss": 0.7509, + "step": 4598 + }, + { + "epoch": 0.37, + "grad_norm": 8.21083632358296, + "learning_rate": 7.211984133411081e-06, + "loss": 0.5472, + "step": 4599 + }, + { + "epoch": 0.37, + "grad_norm": 5.114782825420015, + "learning_rate": 7.21080442145399e-06, + "loss": 0.7532, + "step": 4600 + }, + { + "epoch": 0.37, + "grad_norm": 4.942817601309485, + "learning_rate": 7.209624556495312e-06, + "loss": 0.599, + "step": 4601 + }, + { + "epoch": 0.37, + "grad_norm": 3.3857090339517315, + "learning_rate": 7.2084445386167e-06, + "loss": 0.7096, + "step": 4602 + }, + { + "epoch": 0.37, + "grad_norm": 8.477506880956728, + "learning_rate": 7.207264367899822e-06, + "loss": 0.8509, + "step": 4603 + }, + { + "epoch": 0.37, + "grad_norm": 3.5150950399943772, + "learning_rate": 7.206084044426351e-06, + "loss": 0.9706, + "step": 4604 + }, + { + "epoch": 0.37, + "grad_norm": 3.378883747827981, + "learning_rate": 7.204903568277975e-06, + "loss": 0.6458, + "step": 4605 + }, + { + "epoch": 0.37, + "grad_norm": 2.825083210936648, + "learning_rate": 7.203722939536386e-06, + "loss": 0.7206, + "step": 4606 + }, + { + "epoch": 0.37, + "grad_norm": 3.029207355109933, + "learning_rate": 7.202542158283297e-06, + "loss": 0.8459, + "step": 4607 + }, + { + "epoch": 0.37, + "grad_norm": 4.0785525092058394, + "learning_rate": 7.20136122460042e-06, + "loss": 0.6555, + "step": 4608 + }, + { + "epoch": 0.37, + "grad_norm": 3.5358981019320828, + "learning_rate": 7.2001801385694855e-06, + "loss": 0.7707, + "step": 4609 + }, + { + "epoch": 0.37, + "grad_norm": 2.8737566113518436, + "learning_rate": 7.198998900272234e-06, + "loss": 0.7383, + "step": 4610 + }, + { + "epoch": 0.37, + "grad_norm": 4.506566893656891, + "learning_rate": 7.19781750979041e-06, + "loss": 0.7377, + "step": 4611 + }, + { + "epoch": 0.37, + "grad_norm": 3.2251487017150313, + "learning_rate": 7.196635967205776e-06, + "loss": 0.8004, + "step": 4612 + }, + { + "epoch": 0.37, + "grad_norm": 2.3122448031558185, + "learning_rate": 7.195454272600104e-06, + "loss": 0.5337, + "step": 4613 + }, + { + "epoch": 0.37, + "grad_norm": 3.430526040477239, + "learning_rate": 7.194272426055171e-06, + "loss": 0.8458, + "step": 4614 + }, + { + "epoch": 0.37, + "grad_norm": 3.1968625969383404, + "learning_rate": 7.193090427652769e-06, + "loss": 0.821, + "step": 4615 + }, + { + "epoch": 0.37, + "grad_norm": 3.208762288933861, + "learning_rate": 7.191908277474703e-06, + "loss": 0.5317, + "step": 4616 + }, + { + "epoch": 0.37, + "grad_norm": 9.598094212414262, + "learning_rate": 7.190725975602781e-06, + "loss": 0.6212, + "step": 4617 + }, + { + "epoch": 0.38, + "grad_norm": 4.823046359083563, + "learning_rate": 7.189543522118828e-06, + "loss": 0.5381, + "step": 4618 + }, + { + "epoch": 0.38, + "grad_norm": 4.981238625431476, + "learning_rate": 7.188360917104676e-06, + "loss": 0.6638, + "step": 4619 + }, + { + "epoch": 0.38, + "grad_norm": 2.786435191993864, + "learning_rate": 7.187178160642172e-06, + "loss": 0.6756, + "step": 4620 + }, + { + "epoch": 0.38, + "grad_norm": 4.503388179848868, + "learning_rate": 7.185995252813165e-06, + "loss": 0.7356, + "step": 4621 + }, + { + "epoch": 0.38, + "grad_norm": 3.497944670610621, + "learning_rate": 7.184812193699523e-06, + "loss": 0.696, + "step": 4622 + }, + { + "epoch": 0.38, + "grad_norm": 5.2878611448171675, + "learning_rate": 7.183628983383122e-06, + "loss": 0.7148, + "step": 4623 + }, + { + "epoch": 0.38, + "grad_norm": 3.4466132957046325, + "learning_rate": 7.182445621945844e-06, + "loss": 0.7535, + "step": 4624 + }, + { + "epoch": 0.38, + "grad_norm": 3.8373019503464616, + "learning_rate": 7.181262109469588e-06, + "loss": 0.8449, + "step": 4625 + }, + { + "epoch": 0.38, + "grad_norm": 3.3730537725653087, + "learning_rate": 7.180078446036259e-06, + "loss": 0.675, + "step": 4626 + }, + { + "epoch": 0.38, + "grad_norm": 3.4536043344573524, + "learning_rate": 7.178894631727776e-06, + "loss": 0.5769, + "step": 4627 + }, + { + "epoch": 0.38, + "grad_norm": 3.4422784999166303, + "learning_rate": 7.177710666626064e-06, + "loss": 0.5517, + "step": 4628 + }, + { + "epoch": 0.38, + "grad_norm": 4.342433814512476, + "learning_rate": 7.1765265508130625e-06, + "loss": 0.6372, + "step": 4629 + }, + { + "epoch": 0.38, + "grad_norm": 2.813416200574221, + "learning_rate": 7.175342284370719e-06, + "loss": 0.6313, + "step": 4630 + }, + { + "epoch": 0.38, + "grad_norm": 2.614986656758312, + "learning_rate": 7.174157867380992e-06, + "loss": 0.7583, + "step": 4631 + }, + { + "epoch": 0.38, + "grad_norm": 3.536362081041492, + "learning_rate": 7.1729732999258515e-06, + "loss": 0.6173, + "step": 4632 + }, + { + "epoch": 0.38, + "grad_norm": 2.481696936079603, + "learning_rate": 7.1717885820872766e-06, + "loss": 0.6383, + "step": 4633 + }, + { + "epoch": 0.38, + "grad_norm": 2.7623252405885337, + "learning_rate": 7.170603713947256e-06, + "loss": 0.6752, + "step": 4634 + }, + { + "epoch": 0.38, + "grad_norm": 6.635907439727863, + "learning_rate": 7.169418695587791e-06, + "loss": 0.662, + "step": 4635 + }, + { + "epoch": 0.38, + "grad_norm": 2.826614426957756, + "learning_rate": 7.168233527090893e-06, + "loss": 0.5858, + "step": 4636 + }, + { + "epoch": 0.38, + "grad_norm": 4.093919599114829, + "learning_rate": 7.167048208538584e-06, + "loss": 0.7368, + "step": 4637 + }, + { + "epoch": 0.38, + "grad_norm": 3.2721269813731015, + "learning_rate": 7.165862740012892e-06, + "loss": 0.6526, + "step": 4638 + }, + { + "epoch": 0.38, + "grad_norm": 2.841651248068161, + "learning_rate": 7.164677121595862e-06, + "loss": 0.6743, + "step": 4639 + }, + { + "epoch": 0.38, + "grad_norm": 3.9093931891702773, + "learning_rate": 7.163491353369545e-06, + "loss": 0.7495, + "step": 4640 + }, + { + "epoch": 0.38, + "grad_norm": 2.9809105998952536, + "learning_rate": 7.1623054354160045e-06, + "loss": 0.7539, + "step": 4641 + }, + { + "epoch": 0.38, + "grad_norm": 5.272578972408287, + "learning_rate": 7.161119367817313e-06, + "loss": 0.6042, + "step": 4642 + }, + { + "epoch": 0.38, + "grad_norm": 5.430665378882515, + "learning_rate": 7.1599331506555535e-06, + "loss": 0.5704, + "step": 4643 + }, + { + "epoch": 0.38, + "grad_norm": 3.738495536196233, + "learning_rate": 7.158746784012819e-06, + "loss": 0.6285, + "step": 4644 + }, + { + "epoch": 0.38, + "grad_norm": 7.62894779917687, + "learning_rate": 7.157560267971214e-06, + "loss": 0.7959, + "step": 4645 + }, + { + "epoch": 0.38, + "grad_norm": 4.675281474246245, + "learning_rate": 7.156373602612854e-06, + "loss": 0.9053, + "step": 4646 + }, + { + "epoch": 0.38, + "grad_norm": 4.775749864698209, + "learning_rate": 7.155186788019864e-06, + "loss": 0.4985, + "step": 4647 + }, + { + "epoch": 0.38, + "grad_norm": 11.954451505133795, + "learning_rate": 7.153999824274377e-06, + "loss": 0.5708, + "step": 4648 + }, + { + "epoch": 0.38, + "grad_norm": 4.29264108181392, + "learning_rate": 7.152812711458541e-06, + "loss": 0.5455, + "step": 4649 + }, + { + "epoch": 0.38, + "grad_norm": 3.4668578855760632, + "learning_rate": 7.151625449654509e-06, + "loss": 0.6797, + "step": 4650 + }, + { + "epoch": 0.38, + "grad_norm": 2.268808338807677, + "learning_rate": 7.150438038944448e-06, + "loss": 0.6572, + "step": 4651 + }, + { + "epoch": 0.38, + "grad_norm": 3.7225077219565583, + "learning_rate": 7.149250479410535e-06, + "loss": 0.6027, + "step": 4652 + }, + { + "epoch": 0.38, + "grad_norm": 3.0397708159013193, + "learning_rate": 7.148062771134956e-06, + "loss": 0.6133, + "step": 4653 + }, + { + "epoch": 0.38, + "grad_norm": 7.205837471556408, + "learning_rate": 7.146874914199906e-06, + "loss": 0.7338, + "step": 4654 + }, + { + "epoch": 0.38, + "grad_norm": 7.386212757823452, + "learning_rate": 7.1456869086875955e-06, + "loss": 0.6737, + "step": 4655 + }, + { + "epoch": 0.38, + "grad_norm": 8.441754556305513, + "learning_rate": 7.1444987546802415e-06, + "loss": 0.5709, + "step": 4656 + }, + { + "epoch": 0.38, + "grad_norm": 2.8555547065947873, + "learning_rate": 7.1433104522600705e-06, + "loss": 0.7045, + "step": 4657 + }, + { + "epoch": 0.38, + "grad_norm": 4.873391244255208, + "learning_rate": 7.1421220015093195e-06, + "loss": 0.598, + "step": 4658 + }, + { + "epoch": 0.38, + "grad_norm": 3.1599114860935478, + "learning_rate": 7.1409334025102395e-06, + "loss": 0.6517, + "step": 4659 + }, + { + "epoch": 0.38, + "grad_norm": 2.9712183550675904, + "learning_rate": 7.139744655345087e-06, + "loss": 0.583, + "step": 4660 + }, + { + "epoch": 0.38, + "grad_norm": 3.2435078418245027, + "learning_rate": 7.138555760096131e-06, + "loss": 0.7729, + "step": 4661 + }, + { + "epoch": 0.38, + "grad_norm": 2.904077071328026, + "learning_rate": 7.137366716845651e-06, + "loss": 0.661, + "step": 4662 + }, + { + "epoch": 0.38, + "grad_norm": 12.39071428593291, + "learning_rate": 7.136177525675937e-06, + "loss": 0.7207, + "step": 4663 + }, + { + "epoch": 0.38, + "grad_norm": 8.321056109513979, + "learning_rate": 7.134988186669287e-06, + "loss": 0.7683, + "step": 4664 + }, + { + "epoch": 0.38, + "grad_norm": 9.271932411096461, + "learning_rate": 7.133798699908012e-06, + "loss": 0.5346, + "step": 4665 + }, + { + "epoch": 0.38, + "grad_norm": 3.7271538233796817, + "learning_rate": 7.132609065474432e-06, + "loss": 0.7361, + "step": 4666 + }, + { + "epoch": 0.38, + "grad_norm": 5.8818397221116845, + "learning_rate": 7.131419283450875e-06, + "loss": 0.6833, + "step": 4667 + }, + { + "epoch": 0.38, + "grad_norm": 3.122850247102864, + "learning_rate": 7.130229353919685e-06, + "loss": 0.8497, + "step": 4668 + }, + { + "epoch": 0.38, + "grad_norm": 5.098573850938509, + "learning_rate": 7.129039276963209e-06, + "loss": 0.5876, + "step": 4669 + }, + { + "epoch": 0.38, + "grad_norm": 5.450692122221304, + "learning_rate": 7.12784905266381e-06, + "loss": 0.7207, + "step": 4670 + }, + { + "epoch": 0.38, + "grad_norm": 2.678522930421112, + "learning_rate": 7.126658681103858e-06, + "loss": 0.7842, + "step": 4671 + }, + { + "epoch": 0.38, + "grad_norm": 4.278532112478232, + "learning_rate": 7.125468162365736e-06, + "loss": 0.6208, + "step": 4672 + }, + { + "epoch": 0.38, + "grad_norm": 11.368575576046588, + "learning_rate": 7.124277496531834e-06, + "loss": 0.6432, + "step": 4673 + }, + { + "epoch": 0.38, + "grad_norm": 5.359932771442346, + "learning_rate": 7.123086683684554e-06, + "loss": 0.5558, + "step": 4674 + }, + { + "epoch": 0.38, + "grad_norm": 2.898097619127136, + "learning_rate": 7.121895723906306e-06, + "loss": 0.8041, + "step": 4675 + }, + { + "epoch": 0.38, + "grad_norm": 3.47237108497763, + "learning_rate": 7.1207046172795145e-06, + "loss": 0.6061, + "step": 4676 + }, + { + "epoch": 0.38, + "grad_norm": 4.320518907948175, + "learning_rate": 7.1195133638866085e-06, + "loss": 0.7361, + "step": 4677 + }, + { + "epoch": 0.38, + "grad_norm": 3.670823585036834, + "learning_rate": 7.118321963810033e-06, + "loss": 0.7705, + "step": 4678 + }, + { + "epoch": 0.38, + "grad_norm": 2.6007285772395137, + "learning_rate": 7.117130417132241e-06, + "loss": 0.6766, + "step": 4679 + }, + { + "epoch": 0.38, + "grad_norm": 2.640208804132585, + "learning_rate": 7.115938723935693e-06, + "loss": 0.7198, + "step": 4680 + }, + { + "epoch": 0.38, + "grad_norm": 2.7550458332469625, + "learning_rate": 7.114746884302862e-06, + "loss": 0.6745, + "step": 4681 + }, + { + "epoch": 0.38, + "grad_norm": 2.8387949986935954, + "learning_rate": 7.113554898316231e-06, + "loss": 0.6555, + "step": 4682 + }, + { + "epoch": 0.38, + "grad_norm": 3.6471663756970596, + "learning_rate": 7.1123627660582925e-06, + "loss": 0.654, + "step": 4683 + }, + { + "epoch": 0.38, + "grad_norm": 2.5069485078757863, + "learning_rate": 7.111170487611551e-06, + "loss": 0.7319, + "step": 4684 + }, + { + "epoch": 0.38, + "grad_norm": 3.01650507537378, + "learning_rate": 7.109978063058518e-06, + "loss": 0.7052, + "step": 4685 + }, + { + "epoch": 0.38, + "grad_norm": 11.417044643219272, + "learning_rate": 7.108785492481718e-06, + "loss": 0.5815, + "step": 4686 + }, + { + "epoch": 0.38, + "grad_norm": 6.12751161220626, + "learning_rate": 7.107592775963683e-06, + "loss": 0.5818, + "step": 4687 + }, + { + "epoch": 0.38, + "grad_norm": 3.906952307866992, + "learning_rate": 7.106399913586958e-06, + "loss": 0.7939, + "step": 4688 + }, + { + "epoch": 0.38, + "grad_norm": 4.634279269787878, + "learning_rate": 7.105206905434097e-06, + "loss": 0.8369, + "step": 4689 + }, + { + "epoch": 0.38, + "grad_norm": 2.456524174934351, + "learning_rate": 7.104013751587662e-06, + "loss": 0.7736, + "step": 4690 + }, + { + "epoch": 0.38, + "grad_norm": 3.2869926364334243, + "learning_rate": 7.1028204521302255e-06, + "loss": 0.6946, + "step": 4691 + }, + { + "epoch": 0.38, + "grad_norm": 2.8141452804517364, + "learning_rate": 7.101627007144375e-06, + "loss": 0.7007, + "step": 4692 + }, + { + "epoch": 0.38, + "grad_norm": 3.672130284730923, + "learning_rate": 7.100433416712703e-06, + "loss": 0.6419, + "step": 4693 + }, + { + "epoch": 0.38, + "grad_norm": 25.416486132647815, + "learning_rate": 7.099239680917813e-06, + "loss": 0.7434, + "step": 4694 + }, + { + "epoch": 0.38, + "grad_norm": 5.574362086676644, + "learning_rate": 7.098045799842318e-06, + "loss": 0.7044, + "step": 4695 + }, + { + "epoch": 0.38, + "grad_norm": 4.736531071837449, + "learning_rate": 7.0968517735688445e-06, + "loss": 0.6905, + "step": 4696 + }, + { + "epoch": 0.38, + "grad_norm": 3.4878564618133954, + "learning_rate": 7.095657602180025e-06, + "loss": 0.7594, + "step": 4697 + }, + { + "epoch": 0.38, + "grad_norm": 5.171514770025322, + "learning_rate": 7.094463285758505e-06, + "loss": 0.6537, + "step": 4698 + }, + { + "epoch": 0.38, + "grad_norm": 3.317578098449159, + "learning_rate": 7.093268824386936e-06, + "loss": 0.7155, + "step": 4699 + }, + { + "epoch": 0.38, + "grad_norm": 2.4636745818904853, + "learning_rate": 7.0920742181479865e-06, + "loss": 0.7538, + "step": 4700 + }, + { + "epoch": 0.38, + "grad_norm": 3.547806076694837, + "learning_rate": 7.090879467124325e-06, + "loss": 0.5509, + "step": 4701 + }, + { + "epoch": 0.38, + "grad_norm": 3.2563259121455865, + "learning_rate": 7.089684571398641e-06, + "loss": 0.5816, + "step": 4702 + }, + { + "epoch": 0.38, + "grad_norm": 3.2255120247281672, + "learning_rate": 7.0884895310536276e-06, + "loss": 0.783, + "step": 4703 + }, + { + "epoch": 0.38, + "grad_norm": 5.776721035622656, + "learning_rate": 7.087294346171987e-06, + "loss": 0.6842, + "step": 4704 + }, + { + "epoch": 0.38, + "grad_norm": 5.048516985531098, + "learning_rate": 7.086099016836436e-06, + "loss": 0.6774, + "step": 4705 + }, + { + "epoch": 0.38, + "grad_norm": 4.063302664025569, + "learning_rate": 7.084903543129699e-06, + "loss": 0.7017, + "step": 4706 + }, + { + "epoch": 0.38, + "grad_norm": 6.383645684848338, + "learning_rate": 7.083707925134507e-06, + "loss": 0.8186, + "step": 4707 + }, + { + "epoch": 0.38, + "grad_norm": 4.125286107161097, + "learning_rate": 7.082512162933606e-06, + "loss": 0.6044, + "step": 4708 + }, + { + "epoch": 0.38, + "grad_norm": 3.78441046497153, + "learning_rate": 7.081316256609752e-06, + "loss": 0.7631, + "step": 4709 + }, + { + "epoch": 0.38, + "grad_norm": 3.5065912131038317, + "learning_rate": 7.080120206245709e-06, + "loss": 0.7189, + "step": 4710 + }, + { + "epoch": 0.38, + "grad_norm": 2.701618202613849, + "learning_rate": 7.078924011924248e-06, + "loss": 0.6737, + "step": 4711 + }, + { + "epoch": 0.38, + "grad_norm": 2.292711295292416, + "learning_rate": 7.077727673728156e-06, + "loss": 0.6707, + "step": 4712 + }, + { + "epoch": 0.38, + "grad_norm": 3.534182666841607, + "learning_rate": 7.076531191740228e-06, + "loss": 0.7193, + "step": 4713 + }, + { + "epoch": 0.38, + "grad_norm": 3.1046399084071057, + "learning_rate": 7.075334566043266e-06, + "loss": 0.7456, + "step": 4714 + }, + { + "epoch": 0.38, + "grad_norm": 7.771713752398221, + "learning_rate": 7.074137796720083e-06, + "loss": 0.6358, + "step": 4715 + }, + { + "epoch": 0.38, + "grad_norm": 3.222820471856443, + "learning_rate": 7.0729408838535075e-06, + "loss": 0.705, + "step": 4716 + }, + { + "epoch": 0.38, + "grad_norm": 4.336394645030191, + "learning_rate": 7.071743827526367e-06, + "loss": 0.8549, + "step": 4717 + }, + { + "epoch": 0.38, + "grad_norm": 3.627849959579447, + "learning_rate": 7.07054662782151e-06, + "loss": 0.8724, + "step": 4718 + }, + { + "epoch": 0.38, + "grad_norm": 3.8270295955216893, + "learning_rate": 7.06934928482179e-06, + "loss": 0.8338, + "step": 4719 + }, + { + "epoch": 0.38, + "grad_norm": 3.1567612307157593, + "learning_rate": 7.06815179861007e-06, + "loss": 0.7705, + "step": 4720 + }, + { + "epoch": 0.38, + "grad_norm": 3.6584244116093805, + "learning_rate": 7.066954169269225e-06, + "loss": 0.6964, + "step": 4721 + }, + { + "epoch": 0.38, + "grad_norm": 3.6982942016067466, + "learning_rate": 7.065756396882134e-06, + "loss": 0.7037, + "step": 4722 + }, + { + "epoch": 0.38, + "grad_norm": 2.664422466104552, + "learning_rate": 7.064558481531695e-06, + "loss": 0.5922, + "step": 4723 + }, + { + "epoch": 0.38, + "grad_norm": 2.735010415736837, + "learning_rate": 7.063360423300808e-06, + "loss": 0.669, + "step": 4724 + }, + { + "epoch": 0.38, + "grad_norm": 8.837962127996956, + "learning_rate": 7.0621622222723875e-06, + "loss": 0.7479, + "step": 4725 + }, + { + "epoch": 0.38, + "grad_norm": 3.8384119996737094, + "learning_rate": 7.060963878529359e-06, + "loss": 0.6741, + "step": 4726 + }, + { + "epoch": 0.38, + "grad_norm": 3.3298489327790572, + "learning_rate": 7.059765392154651e-06, + "loss": 0.6785, + "step": 4727 + }, + { + "epoch": 0.38, + "grad_norm": 7.207616723323181, + "learning_rate": 7.058566763231209e-06, + "loss": 0.7337, + "step": 4728 + }, + { + "epoch": 0.38, + "grad_norm": 3.003171316423661, + "learning_rate": 7.0573679918419855e-06, + "loss": 0.7798, + "step": 4729 + }, + { + "epoch": 0.38, + "grad_norm": 8.500366819448388, + "learning_rate": 7.056169078069943e-06, + "loss": 0.6808, + "step": 4730 + }, + { + "epoch": 0.38, + "grad_norm": 4.359801025012354, + "learning_rate": 7.054970021998054e-06, + "loss": 0.563, + "step": 4731 + }, + { + "epoch": 0.38, + "grad_norm": 3.830337464691882, + "learning_rate": 7.0537708237092985e-06, + "loss": 0.7713, + "step": 4732 + }, + { + "epoch": 0.38, + "grad_norm": 12.434751353758072, + "learning_rate": 7.052571483286672e-06, + "loss": 0.8095, + "step": 4733 + }, + { + "epoch": 0.38, + "grad_norm": 4.176176076309689, + "learning_rate": 7.0513720008131745e-06, + "loss": 0.6965, + "step": 4734 + }, + { + "epoch": 0.38, + "grad_norm": 3.1911186745988207, + "learning_rate": 7.050172376371817e-06, + "loss": 0.957, + "step": 4735 + }, + { + "epoch": 0.38, + "grad_norm": 11.419599361262877, + "learning_rate": 7.048972610045624e-06, + "loss": 0.879, + "step": 4736 + }, + { + "epoch": 0.38, + "grad_norm": 5.785404441556161, + "learning_rate": 7.0477727019176235e-06, + "loss": 0.8006, + "step": 4737 + }, + { + "epoch": 0.38, + "grad_norm": 16.700054897768478, + "learning_rate": 7.04657265207086e-06, + "loss": 0.7379, + "step": 4738 + }, + { + "epoch": 0.38, + "grad_norm": 2.31001164582755, + "learning_rate": 7.045372460588381e-06, + "loss": 0.7344, + "step": 4739 + }, + { + "epoch": 0.38, + "grad_norm": 3.090967187510388, + "learning_rate": 7.044172127553249e-06, + "loss": 0.6778, + "step": 4740 + }, + { + "epoch": 0.39, + "grad_norm": 3.796858230944286, + "learning_rate": 7.042971653048535e-06, + "loss": 0.7779, + "step": 4741 + }, + { + "epoch": 0.39, + "grad_norm": 3.0251323675608686, + "learning_rate": 7.0417710371573185e-06, + "loss": 0.783, + "step": 4742 + }, + { + "epoch": 0.39, + "grad_norm": 3.139367508370477, + "learning_rate": 7.0405702799626905e-06, + "loss": 0.5638, + "step": 4743 + }, + { + "epoch": 0.39, + "grad_norm": 4.2397039366983975, + "learning_rate": 7.0393693815477505e-06, + "loss": 0.5706, + "step": 4744 + }, + { + "epoch": 0.39, + "grad_norm": 3.123173958836047, + "learning_rate": 7.038168341995609e-06, + "loss": 0.9192, + "step": 4745 + }, + { + "epoch": 0.39, + "grad_norm": 20.89906946577019, + "learning_rate": 7.036967161389386e-06, + "loss": 0.6656, + "step": 4746 + }, + { + "epoch": 0.39, + "grad_norm": 2.729162646857455, + "learning_rate": 7.035765839812208e-06, + "loss": 0.6235, + "step": 4747 + }, + { + "epoch": 0.39, + "grad_norm": 3.722917781428734, + "learning_rate": 7.034564377347215e-06, + "loss": 0.6638, + "step": 4748 + }, + { + "epoch": 0.39, + "grad_norm": 3.6165913437908417, + "learning_rate": 7.033362774077557e-06, + "loss": 0.658, + "step": 4749 + }, + { + "epoch": 0.39, + "grad_norm": 2.97570762293132, + "learning_rate": 7.032161030086392e-06, + "loss": 0.8415, + "step": 4750 + }, + { + "epoch": 0.39, + "grad_norm": 2.3525135934384527, + "learning_rate": 7.030959145456888e-06, + "loss": 0.7325, + "step": 4751 + }, + { + "epoch": 0.39, + "grad_norm": 6.5214968914635625, + "learning_rate": 7.029757120272222e-06, + "loss": 0.6022, + "step": 4752 + }, + { + "epoch": 0.39, + "grad_norm": 3.081810895384827, + "learning_rate": 7.028554954615585e-06, + "loss": 0.6031, + "step": 4753 + }, + { + "epoch": 0.39, + "grad_norm": 4.37171912183395, + "learning_rate": 7.027352648570173e-06, + "loss": 0.7175, + "step": 4754 + }, + { + "epoch": 0.39, + "grad_norm": 2.7692945793723376, + "learning_rate": 7.026150202219191e-06, + "loss": 0.6536, + "step": 4755 + }, + { + "epoch": 0.39, + "grad_norm": 2.962069871900144, + "learning_rate": 7.0249476156458574e-06, + "loss": 0.7136, + "step": 4756 + }, + { + "epoch": 0.39, + "grad_norm": 3.533535732389279, + "learning_rate": 7.0237448889333985e-06, + "loss": 0.623, + "step": 4757 + }, + { + "epoch": 0.39, + "grad_norm": 2.733575254379372, + "learning_rate": 7.022542022165051e-06, + "loss": 0.5744, + "step": 4758 + }, + { + "epoch": 0.39, + "grad_norm": 3.443727789975465, + "learning_rate": 7.02133901542406e-06, + "loss": 0.7227, + "step": 4759 + }, + { + "epoch": 0.39, + "grad_norm": 2.8484237733229407, + "learning_rate": 7.020135868793683e-06, + "loss": 0.7205, + "step": 4760 + }, + { + "epoch": 0.39, + "grad_norm": 8.262231973112538, + "learning_rate": 7.018932582357182e-06, + "loss": 0.7336, + "step": 4761 + }, + { + "epoch": 0.39, + "grad_norm": 2.5305398197549156, + "learning_rate": 7.017729156197836e-06, + "loss": 0.7715, + "step": 4762 + }, + { + "epoch": 0.39, + "grad_norm": 4.528294715825457, + "learning_rate": 7.0165255903989275e-06, + "loss": 0.6227, + "step": 4763 + }, + { + "epoch": 0.39, + "grad_norm": 3.962359345790988, + "learning_rate": 7.01532188504375e-06, + "loss": 0.6327, + "step": 4764 + }, + { + "epoch": 0.39, + "grad_norm": 2.9968972709091957, + "learning_rate": 7.0141180402156085e-06, + "loss": 0.5927, + "step": 4765 + }, + { + "epoch": 0.39, + "grad_norm": 3.039593348032703, + "learning_rate": 7.0129140559978184e-06, + "loss": 0.7199, + "step": 4766 + }, + { + "epoch": 0.39, + "grad_norm": 15.449739397911483, + "learning_rate": 7.011709932473699e-06, + "loss": 0.5359, + "step": 4767 + }, + { + "epoch": 0.39, + "grad_norm": 6.9289384120405915, + "learning_rate": 7.010505669726586e-06, + "loss": 0.6328, + "step": 4768 + }, + { + "epoch": 0.39, + "grad_norm": 4.679805263166353, + "learning_rate": 7.0093012678398234e-06, + "loss": 0.757, + "step": 4769 + }, + { + "epoch": 0.39, + "grad_norm": 3.590905785872564, + "learning_rate": 7.008096726896761e-06, + "loss": 0.7391, + "step": 4770 + }, + { + "epoch": 0.39, + "grad_norm": 3.5995779970118895, + "learning_rate": 7.00689204698076e-06, + "loss": 0.9249, + "step": 4771 + }, + { + "epoch": 0.39, + "grad_norm": 2.5076534012899625, + "learning_rate": 7.005687228175192e-06, + "loss": 0.7813, + "step": 4772 + }, + { + "epoch": 0.39, + "grad_norm": 6.610103732360768, + "learning_rate": 7.004482270563441e-06, + "loss": 0.6416, + "step": 4773 + }, + { + "epoch": 0.39, + "grad_norm": 3.5101774440748144, + "learning_rate": 7.0032771742288945e-06, + "loss": 0.7388, + "step": 4774 + }, + { + "epoch": 0.39, + "grad_norm": 3.8045407712194037, + "learning_rate": 7.002071939254953e-06, + "loss": 0.644, + "step": 4775 + }, + { + "epoch": 0.39, + "grad_norm": 4.611748653957224, + "learning_rate": 7.00086656572503e-06, + "loss": 0.705, + "step": 4776 + }, + { + "epoch": 0.39, + "grad_norm": 3.292408249368622, + "learning_rate": 6.99966105372254e-06, + "loss": 0.8843, + "step": 4777 + }, + { + "epoch": 0.39, + "grad_norm": 6.196076740072117, + "learning_rate": 6.998455403330915e-06, + "loss": 0.7964, + "step": 4778 + }, + { + "epoch": 0.39, + "grad_norm": 3.854180307343359, + "learning_rate": 6.997249614633592e-06, + "loss": 0.7312, + "step": 4779 + }, + { + "epoch": 0.39, + "grad_norm": 2.7662169052403867, + "learning_rate": 6.99604368771402e-06, + "loss": 0.758, + "step": 4780 + }, + { + "epoch": 0.39, + "grad_norm": 12.82854700120382, + "learning_rate": 6.994837622655657e-06, + "loss": 0.8026, + "step": 4781 + }, + { + "epoch": 0.39, + "grad_norm": 3.9924703274932507, + "learning_rate": 6.993631419541971e-06, + "loss": 0.7989, + "step": 4782 + }, + { + "epoch": 0.39, + "grad_norm": 3.394679096204586, + "learning_rate": 6.992425078456436e-06, + "loss": 0.576, + "step": 4783 + }, + { + "epoch": 0.39, + "grad_norm": 3.2052315446281074, + "learning_rate": 6.991218599482541e-06, + "loss": 0.7248, + "step": 4784 + }, + { + "epoch": 0.39, + "grad_norm": 5.829783402074966, + "learning_rate": 6.9900119827037815e-06, + "loss": 0.7393, + "step": 4785 + }, + { + "epoch": 0.39, + "grad_norm": 4.505100124682155, + "learning_rate": 6.988805228203662e-06, + "loss": 0.6921, + "step": 4786 + }, + { + "epoch": 0.39, + "grad_norm": 5.99520124462559, + "learning_rate": 6.9875983360657e-06, + "loss": 0.7645, + "step": 4787 + }, + { + "epoch": 0.39, + "grad_norm": 3.128365653463775, + "learning_rate": 6.9863913063734155e-06, + "loss": 0.6814, + "step": 4788 + }, + { + "epoch": 0.39, + "grad_norm": 4.219836684072162, + "learning_rate": 6.985184139210347e-06, + "loss": 0.6192, + "step": 4789 + }, + { + "epoch": 0.39, + "grad_norm": 5.060870354550515, + "learning_rate": 6.983976834660036e-06, + "loss": 0.8057, + "step": 4790 + }, + { + "epoch": 0.39, + "grad_norm": 4.853054135697003, + "learning_rate": 6.982769392806035e-06, + "loss": 0.6226, + "step": 4791 + }, + { + "epoch": 0.39, + "grad_norm": 5.590901122362912, + "learning_rate": 6.981561813731909e-06, + "loss": 0.7665, + "step": 4792 + }, + { + "epoch": 0.39, + "grad_norm": 3.872390955556565, + "learning_rate": 6.980354097521227e-06, + "loss": 0.6474, + "step": 4793 + }, + { + "epoch": 0.39, + "grad_norm": 5.05894404233163, + "learning_rate": 6.979146244257573e-06, + "loss": 0.8038, + "step": 4794 + }, + { + "epoch": 0.39, + "grad_norm": 3.2985392079290476, + "learning_rate": 6.977938254024537e-06, + "loss": 0.6575, + "step": 4795 + }, + { + "epoch": 0.39, + "grad_norm": 3.200526478924465, + "learning_rate": 6.9767301269057195e-06, + "loss": 0.64, + "step": 4796 + }, + { + "epoch": 0.39, + "grad_norm": 3.034106899897965, + "learning_rate": 6.975521862984731e-06, + "loss": 0.7676, + "step": 4797 + }, + { + "epoch": 0.39, + "grad_norm": 7.9877283184173535, + "learning_rate": 6.97431346234519e-06, + "loss": 0.7624, + "step": 4798 + }, + { + "epoch": 0.39, + "grad_norm": 3.4660120769015514, + "learning_rate": 6.9731049250707274e-06, + "loss": 0.7593, + "step": 4799 + }, + { + "epoch": 0.39, + "grad_norm": 3.344396575082435, + "learning_rate": 6.971896251244978e-06, + "loss": 0.7253, + "step": 4800 + }, + { + "epoch": 0.39, + "grad_norm": 4.0084637416976685, + "learning_rate": 6.9706874409515934e-06, + "loss": 0.7032, + "step": 4801 + }, + { + "epoch": 0.39, + "grad_norm": 3.772271827388688, + "learning_rate": 6.969478494274231e-06, + "loss": 0.7033, + "step": 4802 + }, + { + "epoch": 0.39, + "grad_norm": 3.2229279627551426, + "learning_rate": 6.968269411296555e-06, + "loss": 0.7639, + "step": 4803 + }, + { + "epoch": 0.39, + "grad_norm": 3.0697043020637014, + "learning_rate": 6.9670601921022405e-06, + "loss": 0.9466, + "step": 4804 + }, + { + "epoch": 0.39, + "grad_norm": 4.711419929329179, + "learning_rate": 6.965850836774976e-06, + "loss": 0.6672, + "step": 4805 + }, + { + "epoch": 0.39, + "grad_norm": 3.2544511999017, + "learning_rate": 6.9646413453984576e-06, + "loss": 0.7028, + "step": 4806 + }, + { + "epoch": 0.39, + "grad_norm": 5.231057528667463, + "learning_rate": 6.963431718056386e-06, + "loss": 0.6186, + "step": 4807 + }, + { + "epoch": 0.39, + "grad_norm": 5.072030013314798, + "learning_rate": 6.962221954832476e-06, + "loss": 0.7459, + "step": 4808 + }, + { + "epoch": 0.39, + "grad_norm": 3.6083940864712267, + "learning_rate": 6.961012055810452e-06, + "loss": 0.7894, + "step": 4809 + }, + { + "epoch": 0.39, + "grad_norm": 3.043149196602401, + "learning_rate": 6.959802021074048e-06, + "loss": 0.6362, + "step": 4810 + }, + { + "epoch": 0.39, + "grad_norm": 3.5806014620304136, + "learning_rate": 6.958591850707003e-06, + "loss": 0.8644, + "step": 4811 + }, + { + "epoch": 0.39, + "grad_norm": 4.401141092956759, + "learning_rate": 6.957381544793069e-06, + "loss": 0.7423, + "step": 4812 + }, + { + "epoch": 0.39, + "grad_norm": 5.044522120092507, + "learning_rate": 6.956171103416007e-06, + "loss": 0.759, + "step": 4813 + }, + { + "epoch": 0.39, + "grad_norm": 3.8813838781887844, + "learning_rate": 6.9549605266595884e-06, + "loss": 0.8383, + "step": 4814 + }, + { + "epoch": 0.39, + "grad_norm": 3.193952462769136, + "learning_rate": 6.9537498146075925e-06, + "loss": 0.6518, + "step": 4815 + }, + { + "epoch": 0.39, + "grad_norm": 6.295054639016323, + "learning_rate": 6.952538967343807e-06, + "loss": 0.5823, + "step": 4816 + }, + { + "epoch": 0.39, + "grad_norm": 3.353986345212676, + "learning_rate": 6.95132798495203e-06, + "loss": 0.563, + "step": 4817 + }, + { + "epoch": 0.39, + "grad_norm": 3.2383150637038103, + "learning_rate": 6.950116867516071e-06, + "loss": 0.5848, + "step": 4818 + }, + { + "epoch": 0.39, + "grad_norm": 3.529747284134479, + "learning_rate": 6.948905615119746e-06, + "loss": 0.872, + "step": 4819 + }, + { + "epoch": 0.39, + "grad_norm": 2.8181419144918824, + "learning_rate": 6.94769422784688e-06, + "loss": 0.7193, + "step": 4820 + }, + { + "epoch": 0.39, + "grad_norm": 2.8819947125477032, + "learning_rate": 6.94648270578131e-06, + "loss": 0.706, + "step": 4821 + }, + { + "epoch": 0.39, + "grad_norm": 3.3857924594068196, + "learning_rate": 6.945271049006882e-06, + "loss": 0.5259, + "step": 4822 + }, + { + "epoch": 0.39, + "grad_norm": 3.290557550668647, + "learning_rate": 6.944059257607447e-06, + "loss": 0.697, + "step": 4823 + }, + { + "epoch": 0.39, + "grad_norm": 3.18650239441529, + "learning_rate": 6.942847331666872e-06, + "loss": 0.6802, + "step": 4824 + }, + { + "epoch": 0.39, + "grad_norm": 4.452219946444507, + "learning_rate": 6.941635271269027e-06, + "loss": 0.6743, + "step": 4825 + }, + { + "epoch": 0.39, + "grad_norm": 8.170256908873805, + "learning_rate": 6.940423076497798e-06, + "loss": 0.543, + "step": 4826 + }, + { + "epoch": 0.39, + "grad_norm": 4.094811431356428, + "learning_rate": 6.939210747437073e-06, + "loss": 0.8904, + "step": 4827 + }, + { + "epoch": 0.39, + "grad_norm": 3.4304699025698406, + "learning_rate": 6.937998284170754e-06, + "loss": 0.8, + "step": 4828 + }, + { + "epoch": 0.39, + "grad_norm": 2.9774677272601506, + "learning_rate": 6.936785686782751e-06, + "loss": 0.6621, + "step": 4829 + }, + { + "epoch": 0.39, + "grad_norm": 2.488762887188442, + "learning_rate": 6.9355729553569824e-06, + "loss": 0.7363, + "step": 4830 + }, + { + "epoch": 0.39, + "grad_norm": 4.054060702281797, + "learning_rate": 6.934360089977379e-06, + "loss": 0.7053, + "step": 4831 + }, + { + "epoch": 0.39, + "grad_norm": 5.08085247803542, + "learning_rate": 6.933147090727878e-06, + "loss": 0.6227, + "step": 4832 + }, + { + "epoch": 0.39, + "grad_norm": 2.164629488068441, + "learning_rate": 6.931933957692425e-06, + "loss": 0.6465, + "step": 4833 + }, + { + "epoch": 0.39, + "grad_norm": 3.7527869687773174, + "learning_rate": 6.9307206909549795e-06, + "loss": 0.7853, + "step": 4834 + }, + { + "epoch": 0.39, + "grad_norm": 2.7310070914460285, + "learning_rate": 6.929507290599506e-06, + "loss": 0.6748, + "step": 4835 + }, + { + "epoch": 0.39, + "grad_norm": 3.0460914094795593, + "learning_rate": 6.928293756709976e-06, + "loss": 0.5652, + "step": 4836 + }, + { + "epoch": 0.39, + "grad_norm": 3.277227833770824, + "learning_rate": 6.927080089370377e-06, + "loss": 0.6367, + "step": 4837 + }, + { + "epoch": 0.39, + "grad_norm": 6.312090911288772, + "learning_rate": 6.925866288664702e-06, + "loss": 0.8895, + "step": 4838 + }, + { + "epoch": 0.39, + "grad_norm": 2.762364319267452, + "learning_rate": 6.924652354676955e-06, + "loss": 0.6174, + "step": 4839 + }, + { + "epoch": 0.39, + "grad_norm": 4.901084220276386, + "learning_rate": 6.923438287491145e-06, + "loss": 0.7807, + "step": 4840 + }, + { + "epoch": 0.39, + "grad_norm": 7.349605173306933, + "learning_rate": 6.922224087191295e-06, + "loss": 0.7794, + "step": 4841 + }, + { + "epoch": 0.39, + "grad_norm": 4.504229995389656, + "learning_rate": 6.9210097538614355e-06, + "loss": 0.6636, + "step": 4842 + }, + { + "epoch": 0.39, + "grad_norm": 3.1233364879584284, + "learning_rate": 6.9197952875856044e-06, + "loss": 0.7023, + "step": 4843 + }, + { + "epoch": 0.39, + "grad_norm": 2.234534529419142, + "learning_rate": 6.918580688447851e-06, + "loss": 0.7493, + "step": 4844 + }, + { + "epoch": 0.39, + "grad_norm": 2.9700592965797354, + "learning_rate": 6.917365956532236e-06, + "loss": 0.8958, + "step": 4845 + }, + { + "epoch": 0.39, + "grad_norm": 15.646954668236777, + "learning_rate": 6.916151091922822e-06, + "loss": 0.7039, + "step": 4846 + }, + { + "epoch": 0.39, + "grad_norm": 6.009356960421061, + "learning_rate": 6.914936094703687e-06, + "loss": 0.6982, + "step": 4847 + }, + { + "epoch": 0.39, + "grad_norm": 2.504380044199826, + "learning_rate": 6.9137209649589165e-06, + "loss": 0.6325, + "step": 4848 + }, + { + "epoch": 0.39, + "grad_norm": 3.3492555628831826, + "learning_rate": 6.912505702772608e-06, + "loss": 0.7983, + "step": 4849 + }, + { + "epoch": 0.39, + "grad_norm": 1.8157188367439279, + "learning_rate": 6.911290308228861e-06, + "loss": 0.6933, + "step": 4850 + }, + { + "epoch": 0.39, + "grad_norm": 3.7603590075297753, + "learning_rate": 6.910074781411791e-06, + "loss": 0.7869, + "step": 4851 + }, + { + "epoch": 0.39, + "grad_norm": 3.2566879865106375, + "learning_rate": 6.908859122405519e-06, + "loss": 0.6108, + "step": 4852 + }, + { + "epoch": 0.39, + "grad_norm": 2.290384485690406, + "learning_rate": 6.907643331294176e-06, + "loss": 0.6921, + "step": 4853 + }, + { + "epoch": 0.39, + "grad_norm": 4.5076507899564415, + "learning_rate": 6.906427408161902e-06, + "loss": 0.6926, + "step": 4854 + }, + { + "epoch": 0.39, + "grad_norm": 2.8981679304789374, + "learning_rate": 6.90521135309285e-06, + "loss": 0.6142, + "step": 4855 + }, + { + "epoch": 0.39, + "grad_norm": 3.0077441297538567, + "learning_rate": 6.903995166171174e-06, + "loss": 0.5751, + "step": 4856 + }, + { + "epoch": 0.39, + "grad_norm": 2.7602107885057667, + "learning_rate": 6.9027788474810455e-06, + "loss": 0.8843, + "step": 4857 + }, + { + "epoch": 0.39, + "grad_norm": 4.440473376558985, + "learning_rate": 6.901562397106639e-06, + "loss": 0.8154, + "step": 4858 + }, + { + "epoch": 0.39, + "grad_norm": 2.3164539492566516, + "learning_rate": 6.900345815132142e-06, + "loss": 0.8152, + "step": 4859 + }, + { + "epoch": 0.39, + "grad_norm": 5.9698913678772145, + "learning_rate": 6.899129101641749e-06, + "loss": 0.6033, + "step": 4860 + }, + { + "epoch": 0.39, + "grad_norm": 3.677718448085302, + "learning_rate": 6.897912256719663e-06, + "loss": 0.6525, + "step": 4861 + }, + { + "epoch": 0.39, + "grad_norm": 3.1018884482136397, + "learning_rate": 6.896695280450101e-06, + "loss": 0.6381, + "step": 4862 + }, + { + "epoch": 0.39, + "grad_norm": 6.04664164346696, + "learning_rate": 6.89547817291728e-06, + "loss": 0.7792, + "step": 4863 + }, + { + "epoch": 0.4, + "grad_norm": 2.9141833975241522, + "learning_rate": 6.894260934205437e-06, + "loss": 0.6559, + "step": 4864 + }, + { + "epoch": 0.4, + "grad_norm": 3.6985594169482856, + "learning_rate": 6.893043564398809e-06, + "loss": 0.7285, + "step": 4865 + }, + { + "epoch": 0.4, + "grad_norm": 3.1611910933365333, + "learning_rate": 6.891826063581646e-06, + "loss": 0.7749, + "step": 4866 + }, + { + "epoch": 0.4, + "grad_norm": 2.6069696519997665, + "learning_rate": 6.89060843183821e-06, + "loss": 0.6807, + "step": 4867 + }, + { + "epoch": 0.4, + "grad_norm": 4.580422093056034, + "learning_rate": 6.8893906692527635e-06, + "loss": 0.6287, + "step": 4868 + }, + { + "epoch": 0.4, + "grad_norm": 2.444635152017811, + "learning_rate": 6.888172775909588e-06, + "loss": 0.8139, + "step": 4869 + }, + { + "epoch": 0.4, + "grad_norm": 4.904345140899487, + "learning_rate": 6.886954751892966e-06, + "loss": 0.5901, + "step": 4870 + }, + { + "epoch": 0.4, + "grad_norm": 2.6470422407509746, + "learning_rate": 6.885736597287195e-06, + "loss": 0.7187, + "step": 4871 + }, + { + "epoch": 0.4, + "grad_norm": 8.215865622617345, + "learning_rate": 6.884518312176578e-06, + "loss": 0.7838, + "step": 4872 + }, + { + "epoch": 0.4, + "grad_norm": 4.751349945627463, + "learning_rate": 6.883299896645427e-06, + "loss": 0.5323, + "step": 4873 + }, + { + "epoch": 0.4, + "grad_norm": 2.423893416061932, + "learning_rate": 6.882081350778065e-06, + "loss": 0.8255, + "step": 4874 + }, + { + "epoch": 0.4, + "grad_norm": 2.967999148784259, + "learning_rate": 6.8808626746588235e-06, + "loss": 0.7699, + "step": 4875 + }, + { + "epoch": 0.4, + "grad_norm": 10.662438312947389, + "learning_rate": 6.879643868372043e-06, + "loss": 0.631, + "step": 4876 + }, + { + "epoch": 0.4, + "grad_norm": 3.2539442085637185, + "learning_rate": 6.878424932002069e-06, + "loss": 0.7352, + "step": 4877 + }, + { + "epoch": 0.4, + "grad_norm": 5.32244299076864, + "learning_rate": 6.8772058656332626e-06, + "loss": 0.6774, + "step": 4878 + }, + { + "epoch": 0.4, + "grad_norm": 7.162138210066419, + "learning_rate": 6.875986669349993e-06, + "loss": 0.6791, + "step": 4879 + }, + { + "epoch": 0.4, + "grad_norm": 3.128353701835335, + "learning_rate": 6.874767343236631e-06, + "loss": 0.7136, + "step": 4880 + }, + { + "epoch": 0.4, + "grad_norm": 2.819837127300953, + "learning_rate": 6.873547887377565e-06, + "loss": 0.7726, + "step": 4881 + }, + { + "epoch": 0.4, + "grad_norm": 2.287134181595837, + "learning_rate": 6.872328301857189e-06, + "loss": 0.7061, + "step": 4882 + }, + { + "epoch": 0.4, + "grad_norm": 3.019489563935498, + "learning_rate": 6.871108586759907e-06, + "loss": 0.6867, + "step": 4883 + }, + { + "epoch": 0.4, + "grad_norm": 4.243860201693428, + "learning_rate": 6.869888742170127e-06, + "loss": 0.8063, + "step": 4884 + }, + { + "epoch": 0.4, + "grad_norm": 2.6384940511022377, + "learning_rate": 6.868668768172273e-06, + "loss": 0.7353, + "step": 4885 + }, + { + "epoch": 0.4, + "grad_norm": 2.631504097013453, + "learning_rate": 6.8674486648507735e-06, + "loss": 0.6798, + "step": 4886 + }, + { + "epoch": 0.4, + "grad_norm": 4.973520759093243, + "learning_rate": 6.8662284322900675e-06, + "loss": 0.6342, + "step": 4887 + }, + { + "epoch": 0.4, + "grad_norm": 5.262189673632032, + "learning_rate": 6.865008070574604e-06, + "loss": 0.6115, + "step": 4888 + }, + { + "epoch": 0.4, + "grad_norm": 2.486058711455715, + "learning_rate": 6.8637875797888394e-06, + "loss": 0.6982, + "step": 4889 + }, + { + "epoch": 0.4, + "grad_norm": 5.976942703364385, + "learning_rate": 6.8625669600172386e-06, + "loss": 0.5798, + "step": 4890 + }, + { + "epoch": 0.4, + "grad_norm": 4.181174871753097, + "learning_rate": 6.861346211344277e-06, + "loss": 0.6583, + "step": 4891 + }, + { + "epoch": 0.4, + "grad_norm": 2.751577049115162, + "learning_rate": 6.860125333854437e-06, + "loss": 0.6289, + "step": 4892 + }, + { + "epoch": 0.4, + "grad_norm": 3.4840019897365724, + "learning_rate": 6.858904327632212e-06, + "loss": 0.7523, + "step": 4893 + }, + { + "epoch": 0.4, + "grad_norm": 8.756750573155582, + "learning_rate": 6.857683192762101e-06, + "loss": 0.6771, + "step": 4894 + }, + { + "epoch": 0.4, + "grad_norm": 6.7511098527241735, + "learning_rate": 6.85646192932862e-06, + "loss": 0.6944, + "step": 4895 + }, + { + "epoch": 0.4, + "grad_norm": 5.708761425865642, + "learning_rate": 6.85524053741628e-06, + "loss": 0.7667, + "step": 4896 + }, + { + "epoch": 0.4, + "grad_norm": 3.4578905548610033, + "learning_rate": 6.854019017109614e-06, + "loss": 0.7, + "step": 4897 + }, + { + "epoch": 0.4, + "grad_norm": 2.768490692522605, + "learning_rate": 6.85279736849316e-06, + "loss": 0.6715, + "step": 4898 + }, + { + "epoch": 0.4, + "grad_norm": 3.8707193731066476, + "learning_rate": 6.851575591651461e-06, + "loss": 0.6208, + "step": 4899 + }, + { + "epoch": 0.4, + "grad_norm": 2.814606485889003, + "learning_rate": 6.8503536866690735e-06, + "loss": 0.856, + "step": 4900 + }, + { + "epoch": 0.4, + "grad_norm": 3.861416581199034, + "learning_rate": 6.849131653630558e-06, + "loss": 0.7036, + "step": 4901 + }, + { + "epoch": 0.4, + "grad_norm": 2.6794671355215796, + "learning_rate": 6.8479094926204925e-06, + "loss": 0.7893, + "step": 4902 + }, + { + "epoch": 0.4, + "grad_norm": 3.46235404964707, + "learning_rate": 6.846687203723452e-06, + "loss": 0.587, + "step": 4903 + }, + { + "epoch": 0.4, + "grad_norm": 2.8876925087219494, + "learning_rate": 6.845464787024029e-06, + "loss": 0.6824, + "step": 4904 + }, + { + "epoch": 0.4, + "grad_norm": 2.9597244953680635, + "learning_rate": 6.844242242606825e-06, + "loss": 0.7209, + "step": 4905 + }, + { + "epoch": 0.4, + "grad_norm": 15.791574232592932, + "learning_rate": 6.843019570556443e-06, + "loss": 0.5625, + "step": 4906 + }, + { + "epoch": 0.4, + "grad_norm": 2.7578395422668005, + "learning_rate": 6.841796770957503e-06, + "loss": 0.6725, + "step": 4907 + }, + { + "epoch": 0.4, + "grad_norm": 3.0826355219913117, + "learning_rate": 6.840573843894631e-06, + "loss": 0.7371, + "step": 4908 + }, + { + "epoch": 0.4, + "grad_norm": 3.7095000877485846, + "learning_rate": 6.839350789452458e-06, + "loss": 0.7468, + "step": 4909 + }, + { + "epoch": 0.4, + "grad_norm": 4.628751378647502, + "learning_rate": 6.838127607715629e-06, + "loss": 0.7616, + "step": 4910 + }, + { + "epoch": 0.4, + "grad_norm": 3.851553840753211, + "learning_rate": 6.836904298768795e-06, + "loss": 0.7338, + "step": 4911 + }, + { + "epoch": 0.4, + "grad_norm": 2.271296816808612, + "learning_rate": 6.835680862696618e-06, + "loss": 0.7156, + "step": 4912 + }, + { + "epoch": 0.4, + "grad_norm": 2.55003452073316, + "learning_rate": 6.834457299583768e-06, + "loss": 0.8402, + "step": 4913 + }, + { + "epoch": 0.4, + "grad_norm": 3.846070474038525, + "learning_rate": 6.833233609514921e-06, + "loss": 0.8685, + "step": 4914 + }, + { + "epoch": 0.4, + "grad_norm": 3.6300982880081123, + "learning_rate": 6.832009792574766e-06, + "loss": 0.6318, + "step": 4915 + }, + { + "epoch": 0.4, + "grad_norm": 4.251487750984797, + "learning_rate": 6.830785848848e-06, + "loss": 0.7366, + "step": 4916 + }, + { + "epoch": 0.4, + "grad_norm": 3.3090286237907924, + "learning_rate": 6.829561778419323e-06, + "loss": 0.6907, + "step": 4917 + }, + { + "epoch": 0.4, + "grad_norm": 3.5608003879066765, + "learning_rate": 6.828337581373452e-06, + "loss": 0.8053, + "step": 4918 + }, + { + "epoch": 0.4, + "grad_norm": 2.869821317041007, + "learning_rate": 6.827113257795107e-06, + "loss": 0.7041, + "step": 4919 + }, + { + "epoch": 0.4, + "grad_norm": 5.1506786167368155, + "learning_rate": 6.82588880776902e-06, + "loss": 0.7103, + "step": 4920 + }, + { + "epoch": 0.4, + "grad_norm": 2.903738089308982, + "learning_rate": 6.824664231379932e-06, + "loss": 0.6428, + "step": 4921 + }, + { + "epoch": 0.4, + "grad_norm": 5.180286174370614, + "learning_rate": 6.82343952871259e-06, + "loss": 0.5192, + "step": 4922 + }, + { + "epoch": 0.4, + "grad_norm": 3.3407512211338766, + "learning_rate": 6.8222146998517515e-06, + "loss": 0.6543, + "step": 4923 + }, + { + "epoch": 0.4, + "grad_norm": 3.016329848326716, + "learning_rate": 6.820989744882182e-06, + "loss": 0.6059, + "step": 4924 + }, + { + "epoch": 0.4, + "grad_norm": 3.7798140504016247, + "learning_rate": 6.819764663888656e-06, + "loss": 0.7088, + "step": 4925 + }, + { + "epoch": 0.4, + "grad_norm": 9.32782874952827, + "learning_rate": 6.818539456955957e-06, + "loss": 0.6638, + "step": 4926 + }, + { + "epoch": 0.4, + "grad_norm": 3.2294859608543174, + "learning_rate": 6.817314124168877e-06, + "loss": 0.6311, + "step": 4927 + }, + { + "epoch": 0.4, + "grad_norm": 2.2677417453980624, + "learning_rate": 6.816088665612217e-06, + "loss": 0.7605, + "step": 4928 + }, + { + "epoch": 0.4, + "grad_norm": 4.007936509204852, + "learning_rate": 6.814863081370786e-06, + "loss": 0.6108, + "step": 4929 + }, + { + "epoch": 0.4, + "grad_norm": 2.5660843710383747, + "learning_rate": 6.813637371529403e-06, + "loss": 0.866, + "step": 4930 + }, + { + "epoch": 0.4, + "grad_norm": 2.9705517049068058, + "learning_rate": 6.8124115361728935e-06, + "loss": 0.6793, + "step": 4931 + }, + { + "epoch": 0.4, + "grad_norm": 2.523319423250326, + "learning_rate": 6.811185575386095e-06, + "loss": 0.6886, + "step": 4932 + }, + { + "epoch": 0.4, + "grad_norm": 25.918255448354714, + "learning_rate": 6.80995948925385e-06, + "loss": 0.6762, + "step": 4933 + }, + { + "epoch": 0.4, + "grad_norm": 3.278457881643568, + "learning_rate": 6.8087332778610116e-06, + "loss": 0.5528, + "step": 4934 + }, + { + "epoch": 0.4, + "grad_norm": 3.672443977365079, + "learning_rate": 6.8075069412924425e-06, + "loss": 0.8241, + "step": 4935 + }, + { + "epoch": 0.4, + "grad_norm": 2.6843366699481845, + "learning_rate": 6.806280479633011e-06, + "loss": 0.8567, + "step": 4936 + }, + { + "epoch": 0.4, + "grad_norm": 4.213207165335354, + "learning_rate": 6.8050538929675965e-06, + "loss": 0.7736, + "step": 4937 + }, + { + "epoch": 0.4, + "grad_norm": 3.7486391167375754, + "learning_rate": 6.803827181381089e-06, + "loss": 0.752, + "step": 4938 + }, + { + "epoch": 0.4, + "grad_norm": 3.1190565653439846, + "learning_rate": 6.802600344958381e-06, + "loss": 0.7614, + "step": 4939 + }, + { + "epoch": 0.4, + "grad_norm": 3.793317648003399, + "learning_rate": 6.80137338378438e-06, + "loss": 0.6383, + "step": 4940 + }, + { + "epoch": 0.4, + "grad_norm": 6.246085295528323, + "learning_rate": 6.800146297943998e-06, + "loss": 0.778, + "step": 4941 + }, + { + "epoch": 0.4, + "grad_norm": 4.299199697024928, + "learning_rate": 6.798919087522157e-06, + "loss": 0.703, + "step": 4942 + }, + { + "epoch": 0.4, + "grad_norm": 2.628244345392673, + "learning_rate": 6.79769175260379e-06, + "loss": 0.7291, + "step": 4943 + }, + { + "epoch": 0.4, + "grad_norm": 5.244779606912698, + "learning_rate": 6.796464293273832e-06, + "loss": 0.6751, + "step": 4944 + }, + { + "epoch": 0.4, + "grad_norm": 2.647494937081442, + "learning_rate": 6.795236709617237e-06, + "loss": 0.7623, + "step": 4945 + }, + { + "epoch": 0.4, + "grad_norm": 3.0507548862403695, + "learning_rate": 6.794009001718954e-06, + "loss": 0.6035, + "step": 4946 + }, + { + "epoch": 0.4, + "grad_norm": 4.29394347585778, + "learning_rate": 6.7927811696639554e-06, + "loss": 0.7374, + "step": 4947 + }, + { + "epoch": 0.4, + "grad_norm": 3.7936480222799127, + "learning_rate": 6.791553213537209e-06, + "loss": 0.5189, + "step": 4948 + }, + { + "epoch": 0.4, + "grad_norm": 3.324451737302168, + "learning_rate": 6.790325133423701e-06, + "loss": 0.6558, + "step": 4949 + }, + { + "epoch": 0.4, + "grad_norm": 3.3448679908728822, + "learning_rate": 6.789096929408421e-06, + "loss": 0.6626, + "step": 4950 + }, + { + "epoch": 0.4, + "grad_norm": 3.7624592233009424, + "learning_rate": 6.787868601576368e-06, + "loss": 0.8336, + "step": 4951 + }, + { + "epoch": 0.4, + "grad_norm": 5.0030750967991295, + "learning_rate": 6.78664015001255e-06, + "loss": 0.6029, + "step": 4952 + }, + { + "epoch": 0.4, + "grad_norm": 3.5888509335441756, + "learning_rate": 6.7854115748019845e-06, + "loss": 0.6706, + "step": 4953 + }, + { + "epoch": 0.4, + "grad_norm": 3.063577444692808, + "learning_rate": 6.784182876029696e-06, + "loss": 0.6527, + "step": 4954 + }, + { + "epoch": 0.4, + "grad_norm": 2.724760082821225, + "learning_rate": 6.782954053780719e-06, + "loss": 0.7408, + "step": 4955 + }, + { + "epoch": 0.4, + "grad_norm": 2.8398349947036854, + "learning_rate": 6.781725108140095e-06, + "loss": 0.6336, + "step": 4956 + }, + { + "epoch": 0.4, + "grad_norm": 3.492050262409166, + "learning_rate": 6.780496039192874e-06, + "loss": 0.7221, + "step": 4957 + }, + { + "epoch": 0.4, + "grad_norm": 2.479745341654804, + "learning_rate": 6.779266847024118e-06, + "loss": 0.5949, + "step": 4958 + }, + { + "epoch": 0.4, + "grad_norm": 2.175749955463759, + "learning_rate": 6.7780375317188904e-06, + "loss": 0.7195, + "step": 4959 + }, + { + "epoch": 0.4, + "grad_norm": 4.097047557585542, + "learning_rate": 6.776808093362271e-06, + "loss": 0.7895, + "step": 4960 + }, + { + "epoch": 0.4, + "grad_norm": 2.7493222483737947, + "learning_rate": 6.775578532039344e-06, + "loss": 0.7537, + "step": 4961 + }, + { + "epoch": 0.4, + "grad_norm": 2.451279738986141, + "learning_rate": 6.774348847835203e-06, + "loss": 0.6038, + "step": 4962 + }, + { + "epoch": 0.4, + "grad_norm": 2.528594258655667, + "learning_rate": 6.7731190408349475e-06, + "loss": 0.7321, + "step": 4963 + }, + { + "epoch": 0.4, + "grad_norm": 3.7806794514672712, + "learning_rate": 6.7718891111236925e-06, + "loss": 0.7319, + "step": 4964 + }, + { + "epoch": 0.4, + "grad_norm": 5.734898757581246, + "learning_rate": 6.770659058786555e-06, + "loss": 0.6775, + "step": 4965 + }, + { + "epoch": 0.4, + "grad_norm": 5.6660040045273545, + "learning_rate": 6.7694288839086595e-06, + "loss": 0.675, + "step": 4966 + }, + { + "epoch": 0.4, + "grad_norm": 3.170865202091634, + "learning_rate": 6.7681985865751434e-06, + "loss": 0.7601, + "step": 4967 + }, + { + "epoch": 0.4, + "grad_norm": 3.0833020650219836, + "learning_rate": 6.766968166871154e-06, + "loss": 0.5309, + "step": 4968 + }, + { + "epoch": 0.4, + "grad_norm": 4.8071087652262054, + "learning_rate": 6.76573762488184e-06, + "loss": 0.7415, + "step": 4969 + }, + { + "epoch": 0.4, + "grad_norm": 3.2614415620616857, + "learning_rate": 6.764506960692364e-06, + "loss": 0.8299, + "step": 4970 + }, + { + "epoch": 0.4, + "grad_norm": 4.6331570792770504, + "learning_rate": 6.763276174387898e-06, + "loss": 0.8818, + "step": 4971 + }, + { + "epoch": 0.4, + "grad_norm": 6.796748452132098, + "learning_rate": 6.7620452660536175e-06, + "loss": 0.8108, + "step": 4972 + }, + { + "epoch": 0.4, + "grad_norm": 4.539266007033854, + "learning_rate": 6.760814235774709e-06, + "loss": 0.6718, + "step": 4973 + }, + { + "epoch": 0.4, + "grad_norm": 3.116393385357577, + "learning_rate": 6.7595830836363684e-06, + "loss": 0.6739, + "step": 4974 + }, + { + "epoch": 0.4, + "grad_norm": 4.152408908299569, + "learning_rate": 6.7583518097238e-06, + "loss": 0.8215, + "step": 4975 + }, + { + "epoch": 0.4, + "grad_norm": 5.143771337933333, + "learning_rate": 6.757120414122214e-06, + "loss": 0.6795, + "step": 4976 + }, + { + "epoch": 0.4, + "grad_norm": 4.058050852254165, + "learning_rate": 6.755888896916831e-06, + "loss": 0.685, + "step": 4977 + }, + { + "epoch": 0.4, + "grad_norm": 25.687008959770978, + "learning_rate": 6.754657258192883e-06, + "loss": 0.618, + "step": 4978 + }, + { + "epoch": 0.4, + "grad_norm": 3.4679077080681178, + "learning_rate": 6.753425498035602e-06, + "loss": 0.6927, + "step": 4979 + }, + { + "epoch": 0.4, + "grad_norm": 5.717393355193208, + "learning_rate": 6.7521936165302384e-06, + "loss": 0.7709, + "step": 4980 + }, + { + "epoch": 0.4, + "grad_norm": 3.52521391370229, + "learning_rate": 6.750961613762042e-06, + "loss": 0.7718, + "step": 4981 + }, + { + "epoch": 0.4, + "grad_norm": 4.152022307203397, + "learning_rate": 6.749729489816277e-06, + "loss": 0.7635, + "step": 4982 + }, + { + "epoch": 0.4, + "grad_norm": 2.4207049119488993, + "learning_rate": 6.748497244778214e-06, + "loss": 0.7649, + "step": 4983 + }, + { + "epoch": 0.4, + "grad_norm": 3.0774242493309747, + "learning_rate": 6.747264878733133e-06, + "loss": 0.7109, + "step": 4984 + }, + { + "epoch": 0.4, + "grad_norm": 5.701531419707866, + "learning_rate": 6.746032391766321e-06, + "loss": 0.7855, + "step": 4985 + }, + { + "epoch": 0.4, + "grad_norm": 22.67299778658607, + "learning_rate": 6.744799783963072e-06, + "loss": 0.745, + "step": 4986 + }, + { + "epoch": 0.41, + "grad_norm": 2.4966346755969138, + "learning_rate": 6.743567055408693e-06, + "loss": 0.5558, + "step": 4987 + }, + { + "epoch": 0.41, + "grad_norm": 2.971100668042735, + "learning_rate": 6.742334206188494e-06, + "loss": 0.7532, + "step": 4988 + }, + { + "epoch": 0.41, + "grad_norm": 3.1520189641407472, + "learning_rate": 6.741101236387799e-06, + "loss": 0.7895, + "step": 4989 + }, + { + "epoch": 0.41, + "grad_norm": 2.844819841318991, + "learning_rate": 6.739868146091934e-06, + "loss": 0.6002, + "step": 4990 + }, + { + "epoch": 0.41, + "grad_norm": 2.8579954476862306, + "learning_rate": 6.7386349353862415e-06, + "loss": 0.7722, + "step": 4991 + }, + { + "epoch": 0.41, + "grad_norm": 5.254711892310361, + "learning_rate": 6.73740160435606e-06, + "loss": 0.6614, + "step": 4992 + }, + { + "epoch": 0.41, + "grad_norm": 2.4600862141161786, + "learning_rate": 6.73616815308675e-06, + "loss": 0.7076, + "step": 4993 + }, + { + "epoch": 0.41, + "grad_norm": 3.1514416380860055, + "learning_rate": 6.73493458166367e-06, + "loss": 0.7211, + "step": 4994 + }, + { + "epoch": 0.41, + "grad_norm": 4.271489919509307, + "learning_rate": 6.733700890172196e-06, + "loss": 0.8261, + "step": 4995 + }, + { + "epoch": 0.41, + "grad_norm": 2.7358240516543146, + "learning_rate": 6.732467078697703e-06, + "loss": 0.6951, + "step": 4996 + }, + { + "epoch": 0.41, + "grad_norm": 3.25147475326891, + "learning_rate": 6.731233147325578e-06, + "loss": 0.722, + "step": 4997 + }, + { + "epoch": 0.41, + "grad_norm": 5.7606544742576995, + "learning_rate": 6.729999096141221e-06, + "loss": 0.724, + "step": 4998 + }, + { + "epoch": 0.41, + "grad_norm": 9.355223991585957, + "learning_rate": 6.728764925230032e-06, + "loss": 0.6977, + "step": 4999 + }, + { + "epoch": 0.41, + "grad_norm": 5.5165558059484345, + "learning_rate": 6.727530634677425e-06, + "loss": 0.8671, + "step": 5000 + }, + { + "epoch": 0.41, + "grad_norm": 3.8632299351827935, + "learning_rate": 6.726296224568821e-06, + "loss": 0.7115, + "step": 5001 + }, + { + "epoch": 0.41, + "grad_norm": 2.4995308036675477, + "learning_rate": 6.725061694989647e-06, + "loss": 0.7201, + "step": 5002 + }, + { + "epoch": 0.41, + "grad_norm": 2.4757929721952308, + "learning_rate": 6.723827046025344e-06, + "loss": 0.745, + "step": 5003 + }, + { + "epoch": 0.41, + "grad_norm": 4.214930358802065, + "learning_rate": 6.722592277761355e-06, + "loss": 0.827, + "step": 5004 + }, + { + "epoch": 0.41, + "grad_norm": 3.4380746462034697, + "learning_rate": 6.721357390283134e-06, + "loss": 0.5538, + "step": 5005 + }, + { + "epoch": 0.41, + "grad_norm": 3.674369744138658, + "learning_rate": 6.720122383676142e-06, + "loss": 0.6961, + "step": 5006 + }, + { + "epoch": 0.41, + "grad_norm": 2.928556239665184, + "learning_rate": 6.718887258025851e-06, + "loss": 0.5996, + "step": 5007 + }, + { + "epoch": 0.41, + "grad_norm": 3.3021005883001693, + "learning_rate": 6.717652013417739e-06, + "loss": 0.6816, + "step": 5008 + }, + { + "epoch": 0.41, + "grad_norm": 2.961289019740647, + "learning_rate": 6.716416649937291e-06, + "loss": 0.6832, + "step": 5009 + }, + { + "epoch": 0.41, + "grad_norm": 4.155525662245276, + "learning_rate": 6.715181167670005e-06, + "loss": 0.6027, + "step": 5010 + }, + { + "epoch": 0.41, + "grad_norm": 2.7306408775871365, + "learning_rate": 6.713945566701383e-06, + "loss": 0.6836, + "step": 5011 + }, + { + "epoch": 0.41, + "grad_norm": 3.0934729932999487, + "learning_rate": 6.712709847116934e-06, + "loss": 0.8548, + "step": 5012 + }, + { + "epoch": 0.41, + "grad_norm": 4.2552055161435485, + "learning_rate": 6.711474009002181e-06, + "loss": 0.5983, + "step": 5013 + }, + { + "epoch": 0.41, + "grad_norm": 3.4059636103358737, + "learning_rate": 6.71023805244265e-06, + "loss": 0.78, + "step": 5014 + }, + { + "epoch": 0.41, + "grad_norm": 2.791794911046696, + "learning_rate": 6.709001977523877e-06, + "loss": 0.6411, + "step": 5015 + }, + { + "epoch": 0.41, + "grad_norm": 3.1380299358727246, + "learning_rate": 6.707765784331406e-06, + "loss": 0.7839, + "step": 5016 + }, + { + "epoch": 0.41, + "grad_norm": 3.8383987597544214, + "learning_rate": 6.706529472950789e-06, + "loss": 0.7235, + "step": 5017 + }, + { + "epoch": 0.41, + "grad_norm": 2.8690584080396015, + "learning_rate": 6.705293043467589e-06, + "loss": 0.8103, + "step": 5018 + }, + { + "epoch": 0.41, + "grad_norm": 2.7013125646738065, + "learning_rate": 6.704056495967372e-06, + "loss": 0.7716, + "step": 5019 + }, + { + "epoch": 0.41, + "grad_norm": 2.8572493551274656, + "learning_rate": 6.702819830535716e-06, + "loss": 0.7041, + "step": 5020 + }, + { + "epoch": 0.41, + "grad_norm": 2.468425537393209, + "learning_rate": 6.7015830472582065e-06, + "loss": 0.6019, + "step": 5021 + }, + { + "epoch": 0.41, + "grad_norm": 2.81278489809031, + "learning_rate": 6.700346146220436e-06, + "loss": 0.7471, + "step": 5022 + }, + { + "epoch": 0.41, + "grad_norm": 2.328934847965021, + "learning_rate": 6.699109127508004e-06, + "loss": 0.7705, + "step": 5023 + }, + { + "epoch": 0.41, + "grad_norm": 6.970253484989624, + "learning_rate": 6.697871991206524e-06, + "loss": 0.6292, + "step": 5024 + }, + { + "epoch": 0.41, + "grad_norm": 3.2510358557789996, + "learning_rate": 6.69663473740161e-06, + "loss": 0.8023, + "step": 5025 + }, + { + "epoch": 0.41, + "grad_norm": 2.735028893226962, + "learning_rate": 6.695397366178891e-06, + "loss": 0.7816, + "step": 5026 + }, + { + "epoch": 0.41, + "grad_norm": 2.712153740279217, + "learning_rate": 6.694159877623998e-06, + "loss": 0.7923, + "step": 5027 + }, + { + "epoch": 0.41, + "grad_norm": 4.280357329229467, + "learning_rate": 6.692922271822575e-06, + "loss": 0.6613, + "step": 5028 + }, + { + "epoch": 0.41, + "grad_norm": 22.32109284885269, + "learning_rate": 6.691684548860271e-06, + "loss": 0.6882, + "step": 5029 + }, + { + "epoch": 0.41, + "grad_norm": 2.6198163844461506, + "learning_rate": 6.690446708822744e-06, + "loss": 0.7059, + "step": 5030 + }, + { + "epoch": 0.41, + "grad_norm": 2.9798684719887887, + "learning_rate": 6.689208751795662e-06, + "loss": 0.6601, + "step": 5031 + }, + { + "epoch": 0.41, + "grad_norm": 2.6350320286969144, + "learning_rate": 6.687970677864696e-06, + "loss": 0.8318, + "step": 5032 + }, + { + "epoch": 0.41, + "grad_norm": 3.3680880399874855, + "learning_rate": 6.6867324871155316e-06, + "loss": 0.5916, + "step": 5033 + }, + { + "epoch": 0.41, + "grad_norm": 2.7772717113195995, + "learning_rate": 6.68549417963386e-06, + "loss": 0.7119, + "step": 5034 + }, + { + "epoch": 0.41, + "grad_norm": 2.1239408176620738, + "learning_rate": 6.6842557555053765e-06, + "loss": 0.6337, + "step": 5035 + }, + { + "epoch": 0.41, + "grad_norm": 2.9474245923833977, + "learning_rate": 6.683017214815791e-06, + "loss": 0.5968, + "step": 5036 + }, + { + "epoch": 0.41, + "grad_norm": 3.6923366673392612, + "learning_rate": 6.681778557650816e-06, + "loss": 0.7317, + "step": 5037 + }, + { + "epoch": 0.41, + "grad_norm": 2.8559656397007744, + "learning_rate": 6.680539784096177e-06, + "loss": 0.7574, + "step": 5038 + }, + { + "epoch": 0.41, + "grad_norm": 2.3631775929096053, + "learning_rate": 6.679300894237603e-06, + "loss": 0.6943, + "step": 5039 + }, + { + "epoch": 0.41, + "grad_norm": 7.389040743639903, + "learning_rate": 6.6780618881608315e-06, + "loss": 0.7443, + "step": 5040 + }, + { + "epoch": 0.41, + "grad_norm": 4.293024439847495, + "learning_rate": 6.676822765951614e-06, + "loss": 0.7487, + "step": 5041 + }, + { + "epoch": 0.41, + "grad_norm": 2.609097741737723, + "learning_rate": 6.675583527695701e-06, + "loss": 0.7211, + "step": 5042 + }, + { + "epoch": 0.41, + "grad_norm": 3.041360089405942, + "learning_rate": 6.674344173478858e-06, + "loss": 0.7158, + "step": 5043 + }, + { + "epoch": 0.41, + "grad_norm": 2.6003188369592616, + "learning_rate": 6.673104703386856e-06, + "loss": 0.5661, + "step": 5044 + }, + { + "epoch": 0.41, + "grad_norm": 3.468428455022618, + "learning_rate": 6.671865117505476e-06, + "loss": 0.8295, + "step": 5045 + }, + { + "epoch": 0.41, + "grad_norm": 3.129459100316946, + "learning_rate": 6.6706254159205e-06, + "loss": 0.6633, + "step": 5046 + }, + { + "epoch": 0.41, + "grad_norm": 3.960369845937572, + "learning_rate": 6.6693855987177254e-06, + "loss": 0.6505, + "step": 5047 + }, + { + "epoch": 0.41, + "grad_norm": 3.2408355837193, + "learning_rate": 6.668145665982959e-06, + "loss": 0.6992, + "step": 5048 + }, + { + "epoch": 0.41, + "grad_norm": 4.284071896381552, + "learning_rate": 6.666905617802006e-06, + "loss": 0.7053, + "step": 5049 + }, + { + "epoch": 0.41, + "grad_norm": 3.8541736350340186, + "learning_rate": 6.66566545426069e-06, + "loss": 0.8008, + "step": 5050 + }, + { + "epoch": 0.41, + "grad_norm": 2.9589248725208694, + "learning_rate": 6.664425175444838e-06, + "loss": 0.8337, + "step": 5051 + }, + { + "epoch": 0.41, + "grad_norm": 2.6886965292385363, + "learning_rate": 6.6631847814402815e-06, + "loss": 0.6631, + "step": 5052 + }, + { + "epoch": 0.41, + "grad_norm": 4.007024126168313, + "learning_rate": 6.661944272332867e-06, + "loss": 0.6692, + "step": 5053 + }, + { + "epoch": 0.41, + "grad_norm": 8.308365232233202, + "learning_rate": 6.660703648208446e-06, + "loss": 0.8157, + "step": 5054 + }, + { + "epoch": 0.41, + "grad_norm": 2.976952899370484, + "learning_rate": 6.659462909152873e-06, + "loss": 0.6068, + "step": 5055 + }, + { + "epoch": 0.41, + "grad_norm": 5.1632577261639225, + "learning_rate": 6.658222055252019e-06, + "loss": 0.6583, + "step": 5056 + }, + { + "epoch": 0.41, + "grad_norm": 2.650919258539903, + "learning_rate": 6.656981086591756e-06, + "loss": 0.5791, + "step": 5057 + }, + { + "epoch": 0.41, + "grad_norm": 2.924828151933549, + "learning_rate": 6.655740003257971e-06, + "loss": 0.8503, + "step": 5058 + }, + { + "epoch": 0.41, + "grad_norm": 3.2753795009843825, + "learning_rate": 6.654498805336551e-06, + "loss": 0.778, + "step": 5059 + }, + { + "epoch": 0.41, + "grad_norm": 3.255770231726, + "learning_rate": 6.653257492913398e-06, + "loss": 0.7918, + "step": 5060 + }, + { + "epoch": 0.41, + "grad_norm": 2.349428819895754, + "learning_rate": 6.652016066074416e-06, + "loss": 0.6037, + "step": 5061 + }, + { + "epoch": 0.41, + "grad_norm": 3.1597893282554437, + "learning_rate": 6.650774524905519e-06, + "loss": 0.8108, + "step": 5062 + }, + { + "epoch": 0.41, + "grad_norm": 2.714615314035783, + "learning_rate": 6.649532869492631e-06, + "loss": 0.7253, + "step": 5063 + }, + { + "epoch": 0.41, + "grad_norm": 4.89023893976048, + "learning_rate": 6.648291099921683e-06, + "loss": 0.6877, + "step": 5064 + }, + { + "epoch": 0.41, + "grad_norm": 3.9672246276933087, + "learning_rate": 6.647049216278612e-06, + "loss": 0.6675, + "step": 5065 + }, + { + "epoch": 0.41, + "grad_norm": 5.519301514153897, + "learning_rate": 6.645807218649364e-06, + "loss": 0.7745, + "step": 5066 + }, + { + "epoch": 0.41, + "grad_norm": 5.394529094650727, + "learning_rate": 6.644565107119895e-06, + "loss": 0.7197, + "step": 5067 + }, + { + "epoch": 0.41, + "grad_norm": 2.599852605743063, + "learning_rate": 6.643322881776164e-06, + "loss": 0.7601, + "step": 5068 + }, + { + "epoch": 0.41, + "grad_norm": 7.959115742962483, + "learning_rate": 6.642080542704144e-06, + "loss": 0.6973, + "step": 5069 + }, + { + "epoch": 0.41, + "grad_norm": 2.555274778284788, + "learning_rate": 6.640838089989809e-06, + "loss": 0.8815, + "step": 5070 + }, + { + "epoch": 0.41, + "grad_norm": 2.9974597963405656, + "learning_rate": 6.639595523719148e-06, + "loss": 0.7689, + "step": 5071 + }, + { + "epoch": 0.41, + "grad_norm": 3.9229524075739044, + "learning_rate": 6.638352843978153e-06, + "loss": 0.7671, + "step": 5072 + }, + { + "epoch": 0.41, + "grad_norm": 3.21939783823383, + "learning_rate": 6.637110050852824e-06, + "loss": 0.7184, + "step": 5073 + }, + { + "epoch": 0.41, + "grad_norm": 4.700668108280518, + "learning_rate": 6.6358671444291735e-06, + "loss": 0.7317, + "step": 5074 + }, + { + "epoch": 0.41, + "grad_norm": 4.378392081738789, + "learning_rate": 6.634624124793214e-06, + "loss": 0.6345, + "step": 5075 + }, + { + "epoch": 0.41, + "grad_norm": 3.0609003754767388, + "learning_rate": 6.633380992030973e-06, + "loss": 0.7689, + "step": 5076 + }, + { + "epoch": 0.41, + "grad_norm": 4.1340436387013435, + "learning_rate": 6.6321377462284845e-06, + "loss": 0.793, + "step": 5077 + }, + { + "epoch": 0.41, + "grad_norm": 3.506604164541014, + "learning_rate": 6.630894387471787e-06, + "loss": 0.755, + "step": 5078 + }, + { + "epoch": 0.41, + "grad_norm": 3.4382437530620154, + "learning_rate": 6.629650915846928e-06, + "loss": 0.586, + "step": 5079 + }, + { + "epoch": 0.41, + "grad_norm": 4.202886214794389, + "learning_rate": 6.628407331439964e-06, + "loss": 0.5438, + "step": 5080 + }, + { + "epoch": 0.41, + "grad_norm": 3.3132512225434994, + "learning_rate": 6.6271636343369606e-06, + "loss": 0.5141, + "step": 5081 + }, + { + "epoch": 0.41, + "grad_norm": 3.857983815209744, + "learning_rate": 6.6259198246239874e-06, + "loss": 0.6856, + "step": 5082 + }, + { + "epoch": 0.41, + "grad_norm": 5.2046949297431935, + "learning_rate": 6.624675902387124e-06, + "loss": 0.7413, + "step": 5083 + }, + { + "epoch": 0.41, + "grad_norm": 6.361141741674706, + "learning_rate": 6.62343186771246e-06, + "loss": 0.4927, + "step": 5084 + }, + { + "epoch": 0.41, + "grad_norm": 3.703819365479077, + "learning_rate": 6.6221877206860885e-06, + "loss": 0.7266, + "step": 5085 + }, + { + "epoch": 0.41, + "grad_norm": 4.058652973006847, + "learning_rate": 6.620943461394111e-06, + "loss": 0.844, + "step": 5086 + }, + { + "epoch": 0.41, + "grad_norm": 3.086634882234965, + "learning_rate": 6.619699089922642e-06, + "loss": 0.5647, + "step": 5087 + }, + { + "epoch": 0.41, + "grad_norm": 2.942574635794407, + "learning_rate": 6.618454606357796e-06, + "loss": 0.5957, + "step": 5088 + }, + { + "epoch": 0.41, + "grad_norm": 2.7301558501515597, + "learning_rate": 6.617210010785701e-06, + "loss": 0.6801, + "step": 5089 + }, + { + "epoch": 0.41, + "grad_norm": 2.7397019860003073, + "learning_rate": 6.61596530329249e-06, + "loss": 0.6839, + "step": 5090 + }, + { + "epoch": 0.41, + "grad_norm": 2.5547147480633634, + "learning_rate": 6.614720483964305e-06, + "loss": 0.7955, + "step": 5091 + }, + { + "epoch": 0.41, + "grad_norm": 3.507368736191685, + "learning_rate": 6.613475552887296e-06, + "loss": 0.5266, + "step": 5092 + }, + { + "epoch": 0.41, + "grad_norm": 3.131103502388818, + "learning_rate": 6.61223051014762e-06, + "loss": 0.7304, + "step": 5093 + }, + { + "epoch": 0.41, + "grad_norm": 2.0540027299447865, + "learning_rate": 6.610985355831441e-06, + "loss": 0.6823, + "step": 5094 + }, + { + "epoch": 0.41, + "grad_norm": 2.6262186858862817, + "learning_rate": 6.609740090024931e-06, + "loss": 0.7449, + "step": 5095 + }, + { + "epoch": 0.41, + "grad_norm": 3.2951921262156696, + "learning_rate": 6.60849471281427e-06, + "loss": 0.8184, + "step": 5096 + }, + { + "epoch": 0.41, + "grad_norm": 2.5005182984391636, + "learning_rate": 6.60724922428565e-06, + "loss": 0.6602, + "step": 5097 + }, + { + "epoch": 0.41, + "grad_norm": 2.442955440812688, + "learning_rate": 6.606003624525262e-06, + "loss": 0.6637, + "step": 5098 + }, + { + "epoch": 0.41, + "grad_norm": 3.109233404706816, + "learning_rate": 6.60475791361931e-06, + "loss": 0.712, + "step": 5099 + }, + { + "epoch": 0.41, + "grad_norm": 2.391663100230515, + "learning_rate": 6.603512091654007e-06, + "loss": 0.7831, + "step": 5100 + }, + { + "epoch": 0.41, + "grad_norm": 4.449960591145155, + "learning_rate": 6.60226615871557e-06, + "loss": 0.6644, + "step": 5101 + }, + { + "epoch": 0.41, + "grad_norm": 6.776939128538275, + "learning_rate": 6.601020114890227e-06, + "loss": 0.6789, + "step": 5102 + }, + { + "epoch": 0.41, + "grad_norm": 2.9631092761589493, + "learning_rate": 6.599773960264211e-06, + "loss": 0.7905, + "step": 5103 + }, + { + "epoch": 0.41, + "grad_norm": 3.3920002121434236, + "learning_rate": 6.598527694923764e-06, + "loss": 0.5453, + "step": 5104 + }, + { + "epoch": 0.41, + "grad_norm": 3.400846260479609, + "learning_rate": 6.597281318955134e-06, + "loss": 0.6576, + "step": 5105 + }, + { + "epoch": 0.41, + "grad_norm": 2.511903747847077, + "learning_rate": 6.596034832444581e-06, + "loss": 0.8276, + "step": 5106 + }, + { + "epoch": 0.41, + "grad_norm": 3.8890400429165775, + "learning_rate": 6.594788235478368e-06, + "loss": 0.5459, + "step": 5107 + }, + { + "epoch": 0.41, + "grad_norm": 2.761529334528283, + "learning_rate": 6.593541528142766e-06, + "loss": 0.5908, + "step": 5108 + }, + { + "epoch": 0.41, + "grad_norm": 2.7295796381618436, + "learning_rate": 6.5922947105240585e-06, + "loss": 0.7668, + "step": 5109 + }, + { + "epoch": 0.42, + "grad_norm": 11.355418728983004, + "learning_rate": 6.59104778270853e-06, + "loss": 0.7054, + "step": 5110 + }, + { + "epoch": 0.42, + "grad_norm": 2.467454895569117, + "learning_rate": 6.589800744782478e-06, + "loss": 0.6456, + "step": 5111 + }, + { + "epoch": 0.42, + "grad_norm": 3.530480677113284, + "learning_rate": 6.588553596832204e-06, + "loss": 0.9019, + "step": 5112 + }, + { + "epoch": 0.42, + "grad_norm": 3.4487594311354233, + "learning_rate": 6.587306338944017e-06, + "loss": 0.6821, + "step": 5113 + }, + { + "epoch": 0.42, + "grad_norm": 2.484148554071114, + "learning_rate": 6.586058971204239e-06, + "loss": 0.813, + "step": 5114 + }, + { + "epoch": 0.42, + "grad_norm": 3.1744859807399264, + "learning_rate": 6.584811493699191e-06, + "loss": 0.7771, + "step": 5115 + }, + { + "epoch": 0.42, + "grad_norm": 2.520752983370148, + "learning_rate": 6.5835639065152104e-06, + "loss": 0.7636, + "step": 5116 + }, + { + "epoch": 0.42, + "grad_norm": 3.1938231693178265, + "learning_rate": 6.582316209738638e-06, + "loss": 0.6011, + "step": 5117 + }, + { + "epoch": 0.42, + "grad_norm": 5.3442702512233575, + "learning_rate": 6.581068403455819e-06, + "loss": 0.6621, + "step": 5118 + }, + { + "epoch": 0.42, + "grad_norm": 3.429171022776963, + "learning_rate": 6.57982048775311e-06, + "loss": 0.6215, + "step": 5119 + }, + { + "epoch": 0.42, + "grad_norm": 3.3808351967077597, + "learning_rate": 6.578572462716879e-06, + "loss": 0.5334, + "step": 5120 + }, + { + "epoch": 0.42, + "grad_norm": 3.619364772181749, + "learning_rate": 6.577324328433492e-06, + "loss": 0.6761, + "step": 5121 + }, + { + "epoch": 0.42, + "grad_norm": 2.3036710815834702, + "learning_rate": 6.576076084989329e-06, + "loss": 0.8721, + "step": 5122 + }, + { + "epoch": 0.42, + "grad_norm": 5.510460953040238, + "learning_rate": 6.574827732470779e-06, + "loss": 0.7249, + "step": 5123 + }, + { + "epoch": 0.42, + "grad_norm": 2.4336381851440287, + "learning_rate": 6.573579270964233e-06, + "loss": 0.6295, + "step": 5124 + }, + { + "epoch": 0.42, + "grad_norm": 3.0377773121945295, + "learning_rate": 6.5723307005560955e-06, + "loss": 0.6313, + "step": 5125 + }, + { + "epoch": 0.42, + "grad_norm": 7.118248019100789, + "learning_rate": 6.571082021332771e-06, + "loss": 0.6986, + "step": 5126 + }, + { + "epoch": 0.42, + "grad_norm": 3.5451460556434857, + "learning_rate": 6.569833233380679e-06, + "loss": 0.7131, + "step": 5127 + }, + { + "epoch": 0.42, + "grad_norm": 2.564663367975666, + "learning_rate": 6.568584336786242e-06, + "loss": 0.637, + "step": 5128 + }, + { + "epoch": 0.42, + "grad_norm": 2.961397101498486, + "learning_rate": 6.567335331635892e-06, + "loss": 0.6969, + "step": 5129 + }, + { + "epoch": 0.42, + "grad_norm": 2.4849552668604438, + "learning_rate": 6.56608621801607e-06, + "loss": 0.6407, + "step": 5130 + }, + { + "epoch": 0.42, + "grad_norm": 3.674512913794182, + "learning_rate": 6.56483699601322e-06, + "loss": 0.6845, + "step": 5131 + }, + { + "epoch": 0.42, + "grad_norm": 4.3391434463276415, + "learning_rate": 6.563587665713796e-06, + "loss": 0.7919, + "step": 5132 + }, + { + "epoch": 0.42, + "grad_norm": 12.216543918921209, + "learning_rate": 6.5623382272042625e-06, + "loss": 0.6832, + "step": 5133 + }, + { + "epoch": 0.42, + "grad_norm": 2.4319586239511515, + "learning_rate": 6.561088680571085e-06, + "loss": 0.8278, + "step": 5134 + }, + { + "epoch": 0.42, + "grad_norm": 2.3387045496885652, + "learning_rate": 6.5598390259007415e-06, + "loss": 0.6763, + "step": 5135 + }, + { + "epoch": 0.42, + "grad_norm": 2.934483979902261, + "learning_rate": 6.558589263279716e-06, + "loss": 0.7004, + "step": 5136 + }, + { + "epoch": 0.42, + "grad_norm": 2.095079294763798, + "learning_rate": 6.5573393927945e-06, + "loss": 0.6582, + "step": 5137 + }, + { + "epoch": 0.42, + "grad_norm": 4.087866607874321, + "learning_rate": 6.55608941453159e-06, + "loss": 0.6146, + "step": 5138 + }, + { + "epoch": 0.42, + "grad_norm": 3.6322183237178773, + "learning_rate": 6.554839328577497e-06, + "loss": 0.5761, + "step": 5139 + }, + { + "epoch": 0.42, + "grad_norm": 3.766186539120334, + "learning_rate": 6.553589135018732e-06, + "loss": 0.6836, + "step": 5140 + }, + { + "epoch": 0.42, + "grad_norm": 6.9979409521366005, + "learning_rate": 6.552338833941816e-06, + "loss": 0.6977, + "step": 5141 + }, + { + "epoch": 0.42, + "grad_norm": 3.965183701058465, + "learning_rate": 6.55108842543328e-06, + "loss": 0.6959, + "step": 5142 + }, + { + "epoch": 0.42, + "grad_norm": 31.73922927426521, + "learning_rate": 6.549837909579656e-06, + "loss": 0.7231, + "step": 5143 + }, + { + "epoch": 0.42, + "grad_norm": 9.150695788568113, + "learning_rate": 6.548587286467491e-06, + "loss": 0.5688, + "step": 5144 + }, + { + "epoch": 0.42, + "grad_norm": 2.2439802547757273, + "learning_rate": 6.547336556183336e-06, + "loss": 0.7135, + "step": 5145 + }, + { + "epoch": 0.42, + "grad_norm": 2.1758341284687845, + "learning_rate": 6.546085718813747e-06, + "loss": 0.6759, + "step": 5146 + }, + { + "epoch": 0.42, + "grad_norm": 4.600316076343942, + "learning_rate": 6.544834774445293e-06, + "loss": 0.6583, + "step": 5147 + }, + { + "epoch": 0.42, + "grad_norm": 3.9783717881697234, + "learning_rate": 6.543583723164544e-06, + "loss": 0.59, + "step": 5148 + }, + { + "epoch": 0.42, + "grad_norm": 7.690640250820197, + "learning_rate": 6.542332565058084e-06, + "loss": 0.719, + "step": 5149 + }, + { + "epoch": 0.42, + "grad_norm": 4.644284559525008, + "learning_rate": 6.541081300212499e-06, + "loss": 0.7001, + "step": 5150 + }, + { + "epoch": 0.42, + "grad_norm": 5.36075757365983, + "learning_rate": 6.539829928714383e-06, + "loss": 0.6966, + "step": 5151 + }, + { + "epoch": 0.42, + "grad_norm": 4.556602367139547, + "learning_rate": 6.53857845065034e-06, + "loss": 0.7444, + "step": 5152 + }, + { + "epoch": 0.42, + "grad_norm": 5.552920175440035, + "learning_rate": 6.537326866106981e-06, + "loss": 0.767, + "step": 5153 + }, + { + "epoch": 0.42, + "grad_norm": 4.280691675546673, + "learning_rate": 6.536075175170924e-06, + "loss": 0.6648, + "step": 5154 + }, + { + "epoch": 0.42, + "grad_norm": 5.713586960112847, + "learning_rate": 6.534823377928792e-06, + "loss": 0.6264, + "step": 5155 + }, + { + "epoch": 0.42, + "grad_norm": 10.2953113987641, + "learning_rate": 6.533571474467218e-06, + "loss": 0.631, + "step": 5156 + }, + { + "epoch": 0.42, + "grad_norm": 12.942227843345615, + "learning_rate": 6.532319464872844e-06, + "loss": 0.6948, + "step": 5157 + }, + { + "epoch": 0.42, + "grad_norm": 9.429229635928682, + "learning_rate": 6.531067349232314e-06, + "loss": 0.6742, + "step": 5158 + }, + { + "epoch": 0.42, + "grad_norm": 4.796309895459906, + "learning_rate": 6.529815127632282e-06, + "loss": 0.688, + "step": 5159 + }, + { + "epoch": 0.42, + "grad_norm": 10.193115991265875, + "learning_rate": 6.52856280015941e-06, + "loss": 0.7021, + "step": 5160 + }, + { + "epoch": 0.42, + "grad_norm": 11.770248955558008, + "learning_rate": 6.527310366900369e-06, + "loss": 0.7637, + "step": 5161 + }, + { + "epoch": 0.42, + "grad_norm": 8.431529611564065, + "learning_rate": 6.5260578279418325e-06, + "loss": 0.7375, + "step": 5162 + }, + { + "epoch": 0.42, + "grad_norm": 16.99620538103692, + "learning_rate": 6.524805183370486e-06, + "loss": 0.7121, + "step": 5163 + }, + { + "epoch": 0.42, + "grad_norm": 5.196343131842406, + "learning_rate": 6.523552433273022e-06, + "loss": 0.7002, + "step": 5164 + }, + { + "epoch": 0.42, + "grad_norm": 12.332410083977896, + "learning_rate": 6.522299577736133e-06, + "loss": 0.7808, + "step": 5165 + }, + { + "epoch": 0.42, + "grad_norm": 8.264414180080209, + "learning_rate": 6.52104661684653e-06, + "loss": 0.7886, + "step": 5166 + }, + { + "epoch": 0.42, + "grad_norm": 4.892588304177848, + "learning_rate": 6.519793550690925e-06, + "loss": 0.7552, + "step": 5167 + }, + { + "epoch": 0.42, + "grad_norm": 31.26440008643601, + "learning_rate": 6.5185403793560355e-06, + "loss": 0.8189, + "step": 5168 + }, + { + "epoch": 0.42, + "grad_norm": 4.685607991350681, + "learning_rate": 6.517287102928589e-06, + "loss": 0.5534, + "step": 5169 + }, + { + "epoch": 0.42, + "grad_norm": 2.966266684294976, + "learning_rate": 6.516033721495323e-06, + "loss": 0.7885, + "step": 5170 + }, + { + "epoch": 0.42, + "grad_norm": 5.783215349797061, + "learning_rate": 6.514780235142977e-06, + "loss": 0.632, + "step": 5171 + }, + { + "epoch": 0.42, + "grad_norm": 4.247895608543652, + "learning_rate": 6.5135266439583015e-06, + "loss": 0.6885, + "step": 5172 + }, + { + "epoch": 0.42, + "grad_norm": 3.3025766230107756, + "learning_rate": 6.512272948028051e-06, + "loss": 0.724, + "step": 5173 + }, + { + "epoch": 0.42, + "grad_norm": 5.457575891686157, + "learning_rate": 6.511019147438993e-06, + "loss": 0.847, + "step": 5174 + }, + { + "epoch": 0.42, + "grad_norm": 13.301671293282698, + "learning_rate": 6.5097652422778935e-06, + "loss": 0.7902, + "step": 5175 + }, + { + "epoch": 0.42, + "grad_norm": 2.2266357158870314, + "learning_rate": 6.508511232631534e-06, + "loss": 0.6921, + "step": 5176 + }, + { + "epoch": 0.42, + "grad_norm": 2.5796610179572355, + "learning_rate": 6.507257118586698e-06, + "loss": 0.658, + "step": 5177 + }, + { + "epoch": 0.42, + "grad_norm": 4.9065930683545425, + "learning_rate": 6.5060029002301795e-06, + "loss": 0.7826, + "step": 5178 + }, + { + "epoch": 0.42, + "grad_norm": 5.428587699557702, + "learning_rate": 6.504748577648777e-06, + "loss": 0.5166, + "step": 5179 + }, + { + "epoch": 0.42, + "grad_norm": 18.02019485302552, + "learning_rate": 6.503494150929299e-06, + "loss": 0.7552, + "step": 5180 + }, + { + "epoch": 0.42, + "grad_norm": 5.6144885296774545, + "learning_rate": 6.502239620158559e-06, + "loss": 0.7687, + "step": 5181 + }, + { + "epoch": 0.42, + "grad_norm": 3.703952575926243, + "learning_rate": 6.5009849854233786e-06, + "loss": 0.7183, + "step": 5182 + }, + { + "epoch": 0.42, + "grad_norm": 6.348378046959388, + "learning_rate": 6.499730246810587e-06, + "loss": 0.6924, + "step": 5183 + }, + { + "epoch": 0.42, + "grad_norm": 7.5343143950488765, + "learning_rate": 6.498475404407018e-06, + "loss": 0.656, + "step": 5184 + }, + { + "epoch": 0.42, + "grad_norm": 4.563908370107235, + "learning_rate": 6.497220458299515e-06, + "loss": 0.7761, + "step": 5185 + }, + { + "epoch": 0.42, + "grad_norm": 5.343834255337502, + "learning_rate": 6.495965408574929e-06, + "loss": 0.7318, + "step": 5186 + }, + { + "epoch": 0.42, + "grad_norm": 3.269661660866481, + "learning_rate": 6.4947102553201195e-06, + "loss": 0.5819, + "step": 5187 + }, + { + "epoch": 0.42, + "grad_norm": 5.315334843453977, + "learning_rate": 6.493454998621946e-06, + "loss": 0.617, + "step": 5188 + }, + { + "epoch": 0.42, + "grad_norm": 3.2394630519777396, + "learning_rate": 6.492199638567285e-06, + "loss": 0.8927, + "step": 5189 + }, + { + "epoch": 0.42, + "grad_norm": 2.8214706011424995, + "learning_rate": 6.490944175243014e-06, + "loss": 0.676, + "step": 5190 + }, + { + "epoch": 0.42, + "grad_norm": 4.496179827915101, + "learning_rate": 6.4896886087360175e-06, + "loss": 0.5796, + "step": 5191 + }, + { + "epoch": 0.42, + "grad_norm": 4.434267996887535, + "learning_rate": 6.488432939133189e-06, + "loss": 0.6357, + "step": 5192 + }, + { + "epoch": 0.42, + "grad_norm": 8.005459111908596, + "learning_rate": 6.48717716652143e-06, + "loss": 0.7053, + "step": 5193 + }, + { + "epoch": 0.42, + "grad_norm": 2.6813766175192026, + "learning_rate": 6.485921290987647e-06, + "loss": 0.7689, + "step": 5194 + }, + { + "epoch": 0.42, + "grad_norm": 2.872702294269331, + "learning_rate": 6.484665312618753e-06, + "loss": 0.7115, + "step": 5195 + }, + { + "epoch": 0.42, + "grad_norm": 4.036473892756181, + "learning_rate": 6.483409231501672e-06, + "loss": 0.7021, + "step": 5196 + }, + { + "epoch": 0.42, + "grad_norm": 6.135692667271775, + "learning_rate": 6.482153047723332e-06, + "loss": 0.7552, + "step": 5197 + }, + { + "epoch": 0.42, + "grad_norm": 3.6355384421325057, + "learning_rate": 6.48089676137067e-06, + "loss": 0.7156, + "step": 5198 + }, + { + "epoch": 0.42, + "grad_norm": 3.471728382828722, + "learning_rate": 6.479640372530626e-06, + "loss": 0.5728, + "step": 5199 + }, + { + "epoch": 0.42, + "grad_norm": 7.029022578711728, + "learning_rate": 6.478383881290152e-06, + "loss": 0.835, + "step": 5200 + }, + { + "epoch": 0.42, + "grad_norm": 3.337360464801638, + "learning_rate": 6.477127287736204e-06, + "loss": 0.7169, + "step": 5201 + }, + { + "epoch": 0.42, + "grad_norm": 2.9142618980402086, + "learning_rate": 6.475870591955748e-06, + "loss": 0.692, + "step": 5202 + }, + { + "epoch": 0.42, + "grad_norm": 15.778039891864742, + "learning_rate": 6.474613794035754e-06, + "loss": 0.8646, + "step": 5203 + }, + { + "epoch": 0.42, + "grad_norm": 4.8798800276251555, + "learning_rate": 6.4733568940632e-06, + "loss": 0.6414, + "step": 5204 + }, + { + "epoch": 0.42, + "grad_norm": 2.855605537100388, + "learning_rate": 6.472099892125072e-06, + "loss": 0.5577, + "step": 5205 + }, + { + "epoch": 0.42, + "grad_norm": 6.356064153681271, + "learning_rate": 6.470842788308362e-06, + "loss": 0.7305, + "step": 5206 + }, + { + "epoch": 0.42, + "grad_norm": 3.1160279666025703, + "learning_rate": 6.469585582700072e-06, + "loss": 0.6576, + "step": 5207 + }, + { + "epoch": 0.42, + "grad_norm": 4.385586245173973, + "learning_rate": 6.468328275387205e-06, + "loss": 0.8128, + "step": 5208 + }, + { + "epoch": 0.42, + "grad_norm": 27.59206274518409, + "learning_rate": 6.467070866456775e-06, + "loss": 0.6198, + "step": 5209 + }, + { + "epoch": 0.42, + "grad_norm": 3.0298702840887834, + "learning_rate": 6.465813355995804e-06, + "loss": 0.7304, + "step": 5210 + }, + { + "epoch": 0.42, + "grad_norm": 4.155438365195864, + "learning_rate": 6.46455574409132e-06, + "loss": 0.7968, + "step": 5211 + }, + { + "epoch": 0.42, + "grad_norm": 3.5751198034119693, + "learning_rate": 6.463298030830356e-06, + "loss": 0.7073, + "step": 5212 + }, + { + "epoch": 0.42, + "grad_norm": 7.9524352485608825, + "learning_rate": 6.462040216299956e-06, + "loss": 0.5154, + "step": 5213 + }, + { + "epoch": 0.42, + "grad_norm": 8.711240861725129, + "learning_rate": 6.460782300587166e-06, + "loss": 0.7545, + "step": 5214 + }, + { + "epoch": 0.42, + "grad_norm": 4.065254070984416, + "learning_rate": 6.459524283779044e-06, + "loss": 0.5958, + "step": 5215 + }, + { + "epoch": 0.42, + "grad_norm": 5.101286995693898, + "learning_rate": 6.45826616596265e-06, + "loss": 0.8396, + "step": 5216 + }, + { + "epoch": 0.42, + "grad_norm": 3.725259205413698, + "learning_rate": 6.457007947225058e-06, + "loss": 0.7448, + "step": 5217 + }, + { + "epoch": 0.42, + "grad_norm": 3.4903826060355505, + "learning_rate": 6.455749627653339e-06, + "loss": 0.7276, + "step": 5218 + }, + { + "epoch": 0.42, + "grad_norm": 2.350746765882947, + "learning_rate": 6.454491207334581e-06, + "loss": 0.5739, + "step": 5219 + }, + { + "epoch": 0.42, + "grad_norm": 3.9980741016218415, + "learning_rate": 6.453232686355874e-06, + "loss": 0.8459, + "step": 5220 + }, + { + "epoch": 0.42, + "grad_norm": 3.0919523750894706, + "learning_rate": 6.451974064804313e-06, + "loss": 0.7157, + "step": 5221 + }, + { + "epoch": 0.42, + "grad_norm": 3.2924188436001987, + "learning_rate": 6.450715342767005e-06, + "loss": 0.5553, + "step": 5222 + }, + { + "epoch": 0.42, + "grad_norm": 4.3733312320345465, + "learning_rate": 6.449456520331063e-06, + "loss": 0.7715, + "step": 5223 + }, + { + "epoch": 0.42, + "grad_norm": 2.172647147742984, + "learning_rate": 6.448197597583601e-06, + "loss": 0.5613, + "step": 5224 + }, + { + "epoch": 0.42, + "grad_norm": 3.073730504806312, + "learning_rate": 6.446938574611746e-06, + "loss": 0.664, + "step": 5225 + }, + { + "epoch": 0.42, + "grad_norm": 3.1584711224678976, + "learning_rate": 6.445679451502634e-06, + "loss": 0.6146, + "step": 5226 + }, + { + "epoch": 0.42, + "grad_norm": 2.2321161089320563, + "learning_rate": 6.444420228343398e-06, + "loss": 0.7656, + "step": 5227 + }, + { + "epoch": 0.42, + "grad_norm": 6.053805218679485, + "learning_rate": 6.443160905221188e-06, + "loss": 0.6996, + "step": 5228 + }, + { + "epoch": 0.42, + "grad_norm": 2.4876072940662923, + "learning_rate": 6.441901482223156e-06, + "loss": 0.7718, + "step": 5229 + }, + { + "epoch": 0.42, + "grad_norm": 3.568482438754372, + "learning_rate": 6.440641959436464e-06, + "loss": 0.8145, + "step": 5230 + }, + { + "epoch": 0.42, + "grad_norm": 3.0355656464641467, + "learning_rate": 6.439382336948278e-06, + "loss": 0.7285, + "step": 5231 + }, + { + "epoch": 0.42, + "grad_norm": 10.794388631020505, + "learning_rate": 6.438122614845769e-06, + "loss": 0.7317, + "step": 5232 + }, + { + "epoch": 0.43, + "grad_norm": 3.0983815463994655, + "learning_rate": 6.436862793216121e-06, + "loss": 0.6455, + "step": 5233 + }, + { + "epoch": 0.43, + "grad_norm": 2.7557869137663573, + "learning_rate": 6.43560287214652e-06, + "loss": 0.5485, + "step": 5234 + }, + { + "epoch": 0.43, + "grad_norm": 2.3923688040249127, + "learning_rate": 6.4343428517241616e-06, + "loss": 0.7034, + "step": 5235 + }, + { + "epoch": 0.43, + "grad_norm": 4.558950485112192, + "learning_rate": 6.433082732036246e-06, + "loss": 0.717, + "step": 5236 + }, + { + "epoch": 0.43, + "grad_norm": 3.3868451320382196, + "learning_rate": 6.431822513169983e-06, + "loss": 0.6352, + "step": 5237 + }, + { + "epoch": 0.43, + "grad_norm": 2.963829420735218, + "learning_rate": 6.430562195212586e-06, + "loss": 0.731, + "step": 5238 + }, + { + "epoch": 0.43, + "grad_norm": 3.7732361484101915, + "learning_rate": 6.4293017782512764e-06, + "loss": 0.7049, + "step": 5239 + }, + { + "epoch": 0.43, + "grad_norm": 3.0114183784804824, + "learning_rate": 6.428041262373286e-06, + "loss": 0.7233, + "step": 5240 + }, + { + "epoch": 0.43, + "grad_norm": 5.50488873967489, + "learning_rate": 6.4267806476658465e-06, + "loss": 0.5451, + "step": 5241 + }, + { + "epoch": 0.43, + "grad_norm": 5.862583260380868, + "learning_rate": 6.425519934216204e-06, + "loss": 0.7373, + "step": 5242 + }, + { + "epoch": 0.43, + "grad_norm": 3.403880594270728, + "learning_rate": 6.424259122111606e-06, + "loss": 0.6205, + "step": 5243 + }, + { + "epoch": 0.43, + "grad_norm": 3.34436582589992, + "learning_rate": 6.422998211439307e-06, + "loss": 0.8311, + "step": 5244 + }, + { + "epoch": 0.43, + "grad_norm": 3.5309837569161067, + "learning_rate": 6.421737202286573e-06, + "loss": 0.8683, + "step": 5245 + }, + { + "epoch": 0.43, + "grad_norm": 4.555725876837524, + "learning_rate": 6.420476094740674e-06, + "loss": 0.7112, + "step": 5246 + }, + { + "epoch": 0.43, + "grad_norm": 5.919165322381834, + "learning_rate": 6.419214888888885e-06, + "loss": 0.786, + "step": 5247 + }, + { + "epoch": 0.43, + "grad_norm": 3.445026652851622, + "learning_rate": 6.417953584818488e-06, + "loss": 0.6499, + "step": 5248 + }, + { + "epoch": 0.43, + "grad_norm": 2.880883584163261, + "learning_rate": 6.416692182616775e-06, + "loss": 0.6731, + "step": 5249 + }, + { + "epoch": 0.43, + "grad_norm": 2.5967675235794005, + "learning_rate": 6.415430682371044e-06, + "loss": 0.7002, + "step": 5250 + }, + { + "epoch": 0.43, + "grad_norm": 2.4823750866949648, + "learning_rate": 6.414169084168596e-06, + "loss": 0.7132, + "step": 5251 + }, + { + "epoch": 0.43, + "grad_norm": 3.8252562789986375, + "learning_rate": 6.412907388096743e-06, + "loss": 0.8286, + "step": 5252 + }, + { + "epoch": 0.43, + "grad_norm": 2.8408714821656793, + "learning_rate": 6.411645594242804e-06, + "loss": 0.7406, + "step": 5253 + }, + { + "epoch": 0.43, + "grad_norm": 2.690635521919526, + "learning_rate": 6.4103837026941e-06, + "loss": 0.722, + "step": 5254 + }, + { + "epoch": 0.43, + "grad_norm": 2.71976700201395, + "learning_rate": 6.409121713537965e-06, + "loss": 0.7916, + "step": 5255 + }, + { + "epoch": 0.43, + "grad_norm": 6.066180290378848, + "learning_rate": 6.407859626861734e-06, + "loss": 0.5905, + "step": 5256 + }, + { + "epoch": 0.43, + "grad_norm": 2.977591690609914, + "learning_rate": 6.406597442752751e-06, + "loss": 0.7574, + "step": 5257 + }, + { + "epoch": 0.43, + "grad_norm": 2.7267330074179377, + "learning_rate": 6.405335161298369e-06, + "loss": 0.6836, + "step": 5258 + }, + { + "epoch": 0.43, + "grad_norm": 4.220799983394707, + "learning_rate": 6.404072782585945e-06, + "loss": 0.6847, + "step": 5259 + }, + { + "epoch": 0.43, + "grad_norm": 2.2952019183772787, + "learning_rate": 6.402810306702845e-06, + "loss": 0.8459, + "step": 5260 + }, + { + "epoch": 0.43, + "grad_norm": 5.624079882266811, + "learning_rate": 6.401547733736437e-06, + "loss": 0.6673, + "step": 5261 + }, + { + "epoch": 0.43, + "grad_norm": 4.25995636461149, + "learning_rate": 6.400285063774102e-06, + "loss": 0.8297, + "step": 5262 + }, + { + "epoch": 0.43, + "grad_norm": 2.681753142751082, + "learning_rate": 6.399022296903225e-06, + "loss": 0.8774, + "step": 5263 + }, + { + "epoch": 0.43, + "grad_norm": 2.9717594191112267, + "learning_rate": 6.397759433211194e-06, + "loss": 0.6591, + "step": 5264 + }, + { + "epoch": 0.43, + "grad_norm": 7.1341135688177495, + "learning_rate": 6.396496472785409e-06, + "loss": 0.8238, + "step": 5265 + }, + { + "epoch": 0.43, + "grad_norm": 3.8350327759210017, + "learning_rate": 6.395233415713277e-06, + "loss": 0.5831, + "step": 5266 + }, + { + "epoch": 0.43, + "grad_norm": 8.359989898228111, + "learning_rate": 6.393970262082205e-06, + "loss": 0.7875, + "step": 5267 + }, + { + "epoch": 0.43, + "grad_norm": 2.466873874646155, + "learning_rate": 6.3927070119796156e-06, + "loss": 0.6581, + "step": 5268 + }, + { + "epoch": 0.43, + "grad_norm": 9.004229595642427, + "learning_rate": 6.39144366549293e-06, + "loss": 0.7972, + "step": 5269 + }, + { + "epoch": 0.43, + "grad_norm": 2.487934171620757, + "learning_rate": 6.390180222709583e-06, + "loss": 0.6188, + "step": 5270 + }, + { + "epoch": 0.43, + "grad_norm": 2.9649533628787617, + "learning_rate": 6.388916683717011e-06, + "loss": 0.8735, + "step": 5271 + }, + { + "epoch": 0.43, + "grad_norm": 3.1661176758467966, + "learning_rate": 6.38765304860266e-06, + "loss": 0.6761, + "step": 5272 + }, + { + "epoch": 0.43, + "grad_norm": 2.802007322700124, + "learning_rate": 6.3863893174539805e-06, + "loss": 0.6599, + "step": 5273 + }, + { + "epoch": 0.43, + "grad_norm": 4.334763694730785, + "learning_rate": 6.38512549035843e-06, + "loss": 0.6104, + "step": 5274 + }, + { + "epoch": 0.43, + "grad_norm": 2.661898851882122, + "learning_rate": 6.383861567403473e-06, + "loss": 0.675, + "step": 5275 + }, + { + "epoch": 0.43, + "grad_norm": 2.66579857373232, + "learning_rate": 6.382597548676583e-06, + "loss": 0.661, + "step": 5276 + }, + { + "epoch": 0.43, + "grad_norm": 2.8762056254224917, + "learning_rate": 6.3813334342652375e-06, + "loss": 0.698, + "step": 5277 + }, + { + "epoch": 0.43, + "grad_norm": 6.9352400707030215, + "learning_rate": 6.38006922425692e-06, + "loss": 0.7164, + "step": 5278 + }, + { + "epoch": 0.43, + "grad_norm": 13.967592407698485, + "learning_rate": 6.3788049187391236e-06, + "loss": 0.8905, + "step": 5279 + }, + { + "epoch": 0.43, + "grad_norm": 2.71130406637019, + "learning_rate": 6.377540517799346e-06, + "loss": 0.7606, + "step": 5280 + }, + { + "epoch": 0.43, + "grad_norm": 2.511387701274658, + "learning_rate": 6.376276021525087e-06, + "loss": 0.6524, + "step": 5281 + }, + { + "epoch": 0.43, + "grad_norm": 2.608216246226552, + "learning_rate": 6.375011430003864e-06, + "loss": 0.7238, + "step": 5282 + }, + { + "epoch": 0.43, + "grad_norm": 2.3530022828999266, + "learning_rate": 6.373746743323193e-06, + "loss": 0.6202, + "step": 5283 + }, + { + "epoch": 0.43, + "grad_norm": 3.4515140554871566, + "learning_rate": 6.372481961570597e-06, + "loss": 0.7624, + "step": 5284 + }, + { + "epoch": 0.43, + "grad_norm": 2.544913506823473, + "learning_rate": 6.3712170848336064e-06, + "loss": 0.5893, + "step": 5285 + }, + { + "epoch": 0.43, + "grad_norm": 7.532904580049164, + "learning_rate": 6.369952113199761e-06, + "loss": 0.7289, + "step": 5286 + }, + { + "epoch": 0.43, + "grad_norm": 3.2607505012216573, + "learning_rate": 6.368687046756604e-06, + "loss": 0.6654, + "step": 5287 + }, + { + "epoch": 0.43, + "grad_norm": 2.5881251907207505, + "learning_rate": 6.367421885591684e-06, + "loss": 0.7131, + "step": 5288 + }, + { + "epoch": 0.43, + "grad_norm": 2.713919739499223, + "learning_rate": 6.3661566297925605e-06, + "loss": 0.6192, + "step": 5289 + }, + { + "epoch": 0.43, + "grad_norm": 3.71142486007151, + "learning_rate": 6.364891279446795e-06, + "loss": 0.761, + "step": 5290 + }, + { + "epoch": 0.43, + "grad_norm": 2.8764213482393646, + "learning_rate": 6.3636258346419585e-06, + "loss": 0.633, + "step": 5291 + }, + { + "epoch": 0.43, + "grad_norm": 2.8152901341654193, + "learning_rate": 6.362360295465628e-06, + "loss": 0.8202, + "step": 5292 + }, + { + "epoch": 0.43, + "grad_norm": 2.8443223461121607, + "learning_rate": 6.361094662005389e-06, + "loss": 0.4909, + "step": 5293 + }, + { + "epoch": 0.43, + "grad_norm": 3.807027557832339, + "learning_rate": 6.359828934348828e-06, + "loss": 0.6599, + "step": 5294 + }, + { + "epoch": 0.43, + "grad_norm": 2.4240101495975956, + "learning_rate": 6.3585631125835435e-06, + "loss": 0.7188, + "step": 5295 + }, + { + "epoch": 0.43, + "grad_norm": 2.6441013498808745, + "learning_rate": 6.3572971967971364e-06, + "loss": 0.707, + "step": 5296 + }, + { + "epoch": 0.43, + "grad_norm": 2.7029358130174805, + "learning_rate": 6.356031187077218e-06, + "loss": 0.687, + "step": 5297 + }, + { + "epoch": 0.43, + "grad_norm": 3.0102734735259773, + "learning_rate": 6.3547650835114014e-06, + "loss": 0.6227, + "step": 5298 + }, + { + "epoch": 0.43, + "grad_norm": 3.4421901041565186, + "learning_rate": 6.353498886187313e-06, + "loss": 0.7078, + "step": 5299 + }, + { + "epoch": 0.43, + "grad_norm": 3.457539779227769, + "learning_rate": 6.352232595192577e-06, + "loss": 0.7323, + "step": 5300 + }, + { + "epoch": 0.43, + "grad_norm": 4.4629904295009935, + "learning_rate": 6.3509662106148314e-06, + "loss": 0.7436, + "step": 5301 + }, + { + "epoch": 0.43, + "grad_norm": 2.5090402257104083, + "learning_rate": 6.349699732541719e-06, + "loss": 0.6919, + "step": 5302 + }, + { + "epoch": 0.43, + "grad_norm": 2.224146709755235, + "learning_rate": 6.348433161060886e-06, + "loss": 0.8017, + "step": 5303 + }, + { + "epoch": 0.43, + "grad_norm": 4.690659661724078, + "learning_rate": 6.347166496259989e-06, + "loss": 0.7003, + "step": 5304 + }, + { + "epoch": 0.43, + "grad_norm": 2.9010303005852855, + "learning_rate": 6.3458997382266865e-06, + "loss": 0.6783, + "step": 5305 + }, + { + "epoch": 0.43, + "grad_norm": 2.328712623138541, + "learning_rate": 6.344632887048647e-06, + "loss": 0.6279, + "step": 5306 + }, + { + "epoch": 0.43, + "grad_norm": 2.38164698694708, + "learning_rate": 6.343365942813546e-06, + "loss": 0.6396, + "step": 5307 + }, + { + "epoch": 0.43, + "grad_norm": 3.10426693429425, + "learning_rate": 6.3420989056090645e-06, + "loss": 0.6911, + "step": 5308 + }, + { + "epoch": 0.43, + "grad_norm": 2.870607952941557, + "learning_rate": 6.340831775522886e-06, + "loss": 0.5323, + "step": 5309 + }, + { + "epoch": 0.43, + "grad_norm": 3.0501657123256707, + "learning_rate": 6.339564552642708e-06, + "loss": 0.8065, + "step": 5310 + }, + { + "epoch": 0.43, + "grad_norm": 3.154110893134373, + "learning_rate": 6.338297237056228e-06, + "loss": 0.6719, + "step": 5311 + }, + { + "epoch": 0.43, + "grad_norm": 4.097632059306806, + "learning_rate": 6.337029828851151e-06, + "loss": 0.6379, + "step": 5312 + }, + { + "epoch": 0.43, + "grad_norm": 5.838025767126768, + "learning_rate": 6.335762328115194e-06, + "loss": 0.5671, + "step": 5313 + }, + { + "epoch": 0.43, + "grad_norm": 2.691955714219819, + "learning_rate": 6.334494734936071e-06, + "loss": 0.8376, + "step": 5314 + }, + { + "epoch": 0.43, + "grad_norm": 3.142764302020504, + "learning_rate": 6.333227049401509e-06, + "loss": 0.8203, + "step": 5315 + }, + { + "epoch": 0.43, + "grad_norm": 4.456752591272409, + "learning_rate": 6.331959271599243e-06, + "loss": 0.6692, + "step": 5316 + }, + { + "epoch": 0.43, + "grad_norm": 2.9933192619704756, + "learning_rate": 6.330691401617007e-06, + "loss": 0.8276, + "step": 5317 + }, + { + "epoch": 0.43, + "grad_norm": 4.115823291476445, + "learning_rate": 6.3294234395425465e-06, + "loss": 0.7502, + "step": 5318 + }, + { + "epoch": 0.43, + "grad_norm": 3.288428928249791, + "learning_rate": 6.328155385463616e-06, + "loss": 0.8581, + "step": 5319 + }, + { + "epoch": 0.43, + "grad_norm": 8.514299731306384, + "learning_rate": 6.326887239467969e-06, + "loss": 0.779, + "step": 5320 + }, + { + "epoch": 0.43, + "grad_norm": 2.7487833267156625, + "learning_rate": 6.32561900164337e-06, + "loss": 0.6337, + "step": 5321 + }, + { + "epoch": 0.43, + "grad_norm": 2.263419642375841, + "learning_rate": 6.324350672077588e-06, + "loss": 0.5782, + "step": 5322 + }, + { + "epoch": 0.43, + "grad_norm": 3.9891408672023774, + "learning_rate": 6.323082250858402e-06, + "loss": 0.8111, + "step": 5323 + }, + { + "epoch": 0.43, + "grad_norm": 3.5187577545160393, + "learning_rate": 6.3218137380735934e-06, + "loss": 0.6087, + "step": 5324 + }, + { + "epoch": 0.43, + "grad_norm": 3.1313320933112423, + "learning_rate": 6.32054513381095e-06, + "loss": 0.6584, + "step": 5325 + }, + { + "epoch": 0.43, + "grad_norm": 2.5967540577214225, + "learning_rate": 6.319276438158271e-06, + "loss": 0.7009, + "step": 5326 + }, + { + "epoch": 0.43, + "grad_norm": 5.846245139453428, + "learning_rate": 6.3180076512033525e-06, + "loss": 0.5999, + "step": 5327 + }, + { + "epoch": 0.43, + "grad_norm": 2.53987616270917, + "learning_rate": 6.316738773034009e-06, + "loss": 0.8392, + "step": 5328 + }, + { + "epoch": 0.43, + "grad_norm": 3.1076312980597627, + "learning_rate": 6.31546980373805e-06, + "loss": 0.7322, + "step": 5329 + }, + { + "epoch": 0.43, + "grad_norm": 3.33283973916105, + "learning_rate": 6.314200743403297e-06, + "loss": 0.639, + "step": 5330 + }, + { + "epoch": 0.43, + "grad_norm": 2.90753301224908, + "learning_rate": 6.312931592117578e-06, + "loss": 0.6892, + "step": 5331 + }, + { + "epoch": 0.43, + "grad_norm": 3.103386418908662, + "learning_rate": 6.311662349968726e-06, + "loss": 0.7516, + "step": 5332 + }, + { + "epoch": 0.43, + "grad_norm": 2.9494648974373683, + "learning_rate": 6.310393017044581e-06, + "loss": 0.6699, + "step": 5333 + }, + { + "epoch": 0.43, + "grad_norm": 3.3061706824576467, + "learning_rate": 6.309123593432988e-06, + "loss": 0.5765, + "step": 5334 + }, + { + "epoch": 0.43, + "grad_norm": 9.268461813364416, + "learning_rate": 6.3078540792218e-06, + "loss": 0.7125, + "step": 5335 + }, + { + "epoch": 0.43, + "grad_norm": 3.518779427584672, + "learning_rate": 6.3065844744988746e-06, + "loss": 0.5462, + "step": 5336 + }, + { + "epoch": 0.43, + "grad_norm": 3.1758946125652905, + "learning_rate": 6.305314779352076e-06, + "loss": 0.6834, + "step": 5337 + }, + { + "epoch": 0.43, + "grad_norm": 2.924179564159826, + "learning_rate": 6.304044993869276e-06, + "loss": 0.8562, + "step": 5338 + }, + { + "epoch": 0.43, + "grad_norm": 3.722573843571273, + "learning_rate": 6.302775118138352e-06, + "loss": 0.7935, + "step": 5339 + }, + { + "epoch": 0.43, + "grad_norm": 9.530991276176536, + "learning_rate": 6.301505152247185e-06, + "loss": 0.7517, + "step": 5340 + }, + { + "epoch": 0.43, + "grad_norm": 3.5409695148358957, + "learning_rate": 6.300235096283668e-06, + "loss": 0.7535, + "step": 5341 + }, + { + "epoch": 0.43, + "grad_norm": 3.2486045610096963, + "learning_rate": 6.2989649503356955e-06, + "loss": 0.8066, + "step": 5342 + }, + { + "epoch": 0.43, + "grad_norm": 5.186509901986895, + "learning_rate": 6.297694714491169e-06, + "loss": 0.6076, + "step": 5343 + }, + { + "epoch": 0.43, + "grad_norm": 2.4404382926400716, + "learning_rate": 6.296424388837998e-06, + "loss": 0.6935, + "step": 5344 + }, + { + "epoch": 0.43, + "grad_norm": 3.790387822917204, + "learning_rate": 6.295153973464095e-06, + "loss": 0.7227, + "step": 5345 + }, + { + "epoch": 0.43, + "grad_norm": 14.48362766681455, + "learning_rate": 6.293883468457383e-06, + "loss": 0.7985, + "step": 5346 + }, + { + "epoch": 0.43, + "grad_norm": 4.66139114475922, + "learning_rate": 6.2926128739057875e-06, + "loss": 0.8366, + "step": 5347 + }, + { + "epoch": 0.43, + "grad_norm": 5.048546098288123, + "learning_rate": 6.291342189897242e-06, + "loss": 0.7165, + "step": 5348 + }, + { + "epoch": 0.43, + "grad_norm": 4.7119719851595185, + "learning_rate": 6.2900714165196875e-06, + "loss": 0.7916, + "step": 5349 + }, + { + "epoch": 0.43, + "grad_norm": 2.4183444680567217, + "learning_rate": 6.288800553861068e-06, + "loss": 0.6321, + "step": 5350 + }, + { + "epoch": 0.43, + "grad_norm": 6.195559737049939, + "learning_rate": 6.287529602009334e-06, + "loss": 0.7216, + "step": 5351 + }, + { + "epoch": 0.43, + "grad_norm": 4.2480494812431235, + "learning_rate": 6.286258561052444e-06, + "loss": 0.8423, + "step": 5352 + }, + { + "epoch": 0.43, + "grad_norm": 6.3369016060079915, + "learning_rate": 6.284987431078364e-06, + "loss": 0.7436, + "step": 5353 + }, + { + "epoch": 0.43, + "grad_norm": 4.3225344423690135, + "learning_rate": 6.283716212175062e-06, + "loss": 0.6411, + "step": 5354 + }, + { + "epoch": 0.43, + "grad_norm": 6.563367259530566, + "learning_rate": 6.282444904430516e-06, + "loss": 0.6836, + "step": 5355 + }, + { + "epoch": 0.44, + "grad_norm": 5.865987301531571, + "learning_rate": 6.281173507932708e-06, + "loss": 0.6643, + "step": 5356 + }, + { + "epoch": 0.44, + "grad_norm": 3.55239417917751, + "learning_rate": 6.279902022769624e-06, + "loss": 0.6865, + "step": 5357 + }, + { + "epoch": 0.44, + "grad_norm": 3.1220185173679353, + "learning_rate": 6.278630449029263e-06, + "loss": 0.7681, + "step": 5358 + }, + { + "epoch": 0.44, + "grad_norm": 4.186195006945495, + "learning_rate": 6.277358786799623e-06, + "loss": 0.8277, + "step": 5359 + }, + { + "epoch": 0.44, + "grad_norm": 5.284091969487949, + "learning_rate": 6.2760870361687145e-06, + "loss": 0.6701, + "step": 5360 + }, + { + "epoch": 0.44, + "grad_norm": 7.3253871428292685, + "learning_rate": 6.2748151972245455e-06, + "loss": 0.719, + "step": 5361 + }, + { + "epoch": 0.44, + "grad_norm": 7.724022488010174, + "learning_rate": 6.273543270055139e-06, + "loss": 0.6994, + "step": 5362 + }, + { + "epoch": 0.44, + "grad_norm": 3.6935322576458733, + "learning_rate": 6.272271254748519e-06, + "loss": 0.7592, + "step": 5363 + }, + { + "epoch": 0.44, + "grad_norm": 3.97121808828623, + "learning_rate": 6.2709991513927156e-06, + "loss": 0.7802, + "step": 5364 + }, + { + "epoch": 0.44, + "grad_norm": 21.21890987257359, + "learning_rate": 6.26972696007577e-06, + "loss": 0.7675, + "step": 5365 + }, + { + "epoch": 0.44, + "grad_norm": 3.6353345420097516, + "learning_rate": 6.268454680885725e-06, + "loss": 0.8211, + "step": 5366 + }, + { + "epoch": 0.44, + "grad_norm": 4.523650589718551, + "learning_rate": 6.267182313910627e-06, + "loss": 0.6816, + "step": 5367 + }, + { + "epoch": 0.44, + "grad_norm": 3.021840541347513, + "learning_rate": 6.265909859238536e-06, + "loss": 0.7134, + "step": 5368 + }, + { + "epoch": 0.44, + "grad_norm": 7.5793452218181985, + "learning_rate": 6.264637316957512e-06, + "loss": 0.7602, + "step": 5369 + }, + { + "epoch": 0.44, + "grad_norm": 5.817414349145018, + "learning_rate": 6.263364687155621e-06, + "loss": 0.8693, + "step": 5370 + }, + { + "epoch": 0.44, + "grad_norm": 2.935361977472497, + "learning_rate": 6.262091969920938e-06, + "loss": 0.7554, + "step": 5371 + }, + { + "epoch": 0.44, + "grad_norm": 8.880339798788528, + "learning_rate": 6.260819165341548e-06, + "loss": 0.6667, + "step": 5372 + }, + { + "epoch": 0.44, + "grad_norm": 2.4813681257006333, + "learning_rate": 6.259546273505529e-06, + "loss": 0.7014, + "step": 5373 + }, + { + "epoch": 0.44, + "grad_norm": 3.783352149431896, + "learning_rate": 6.258273294500978e-06, + "loss": 0.6466, + "step": 5374 + }, + { + "epoch": 0.44, + "grad_norm": 3.954436769701012, + "learning_rate": 6.257000228415994e-06, + "loss": 0.8158, + "step": 5375 + }, + { + "epoch": 0.44, + "grad_norm": 2.574706782929329, + "learning_rate": 6.255727075338678e-06, + "loss": 0.7405, + "step": 5376 + }, + { + "epoch": 0.44, + "grad_norm": 7.593442259735793, + "learning_rate": 6.254453835357142e-06, + "loss": 0.497, + "step": 5377 + }, + { + "epoch": 0.44, + "grad_norm": 3.099777659845841, + "learning_rate": 6.253180508559501e-06, + "loss": 0.6707, + "step": 5378 + }, + { + "epoch": 0.44, + "grad_norm": 2.829228847341015, + "learning_rate": 6.25190709503388e-06, + "loss": 0.6808, + "step": 5379 + }, + { + "epoch": 0.44, + "grad_norm": 5.699881551059783, + "learning_rate": 6.250633594868404e-06, + "loss": 0.7624, + "step": 5380 + }, + { + "epoch": 0.44, + "grad_norm": 6.473006615324479, + "learning_rate": 6.2493600081512085e-06, + "loss": 0.9757, + "step": 5381 + }, + { + "epoch": 0.44, + "grad_norm": 5.066672490197237, + "learning_rate": 6.248086334970435e-06, + "loss": 0.8142, + "step": 5382 + }, + { + "epoch": 0.44, + "grad_norm": 9.465462596898993, + "learning_rate": 6.2468125754142275e-06, + "loss": 0.6955, + "step": 5383 + }, + { + "epoch": 0.44, + "grad_norm": 5.254627552383158, + "learning_rate": 6.24553872957074e-06, + "loss": 0.6811, + "step": 5384 + }, + { + "epoch": 0.44, + "grad_norm": 3.201992377576008, + "learning_rate": 6.244264797528129e-06, + "loss": 0.7173, + "step": 5385 + }, + { + "epoch": 0.44, + "grad_norm": 13.459291553091939, + "learning_rate": 6.24299077937456e-06, + "loss": 0.7101, + "step": 5386 + }, + { + "epoch": 0.44, + "grad_norm": 3.4576304079499156, + "learning_rate": 6.241716675198202e-06, + "loss": 0.7596, + "step": 5387 + }, + { + "epoch": 0.44, + "grad_norm": 3.350855293179054, + "learning_rate": 6.240442485087231e-06, + "loss": 0.7432, + "step": 5388 + }, + { + "epoch": 0.44, + "grad_norm": 3.50583677390844, + "learning_rate": 6.239168209129832e-06, + "loss": 0.7151, + "step": 5389 + }, + { + "epoch": 0.44, + "grad_norm": 2.352355412406717, + "learning_rate": 6.237893847414188e-06, + "loss": 0.6377, + "step": 5390 + }, + { + "epoch": 0.44, + "grad_norm": 5.8153858448204785, + "learning_rate": 6.2366194000284965e-06, + "loss": 0.7262, + "step": 5391 + }, + { + "epoch": 0.44, + "grad_norm": 3.3662347283669414, + "learning_rate": 6.235344867060956e-06, + "loss": 0.6757, + "step": 5392 + }, + { + "epoch": 0.44, + "grad_norm": 3.6684975439693503, + "learning_rate": 6.234070248599774e-06, + "loss": 0.6573, + "step": 5393 + }, + { + "epoch": 0.44, + "grad_norm": 4.02243878934282, + "learning_rate": 6.232795544733158e-06, + "loss": 0.6166, + "step": 5394 + }, + { + "epoch": 0.44, + "grad_norm": 6.544225511912298, + "learning_rate": 6.231520755549329e-06, + "loss": 0.6406, + "step": 5395 + }, + { + "epoch": 0.44, + "grad_norm": 2.9271099030260572, + "learning_rate": 6.230245881136509e-06, + "loss": 0.7559, + "step": 5396 + }, + { + "epoch": 0.44, + "grad_norm": 3.6019111498678913, + "learning_rate": 6.228970921582927e-06, + "loss": 0.7281, + "step": 5397 + }, + { + "epoch": 0.44, + "grad_norm": 2.7380268650313973, + "learning_rate": 6.22769587697682e-06, + "loss": 0.7908, + "step": 5398 + }, + { + "epoch": 0.44, + "grad_norm": 3.2135957247105775, + "learning_rate": 6.226420747406429e-06, + "loss": 0.7685, + "step": 5399 + }, + { + "epoch": 0.44, + "grad_norm": 2.5905397053270227, + "learning_rate": 6.2251455329599995e-06, + "loss": 0.7252, + "step": 5400 + }, + { + "epoch": 0.44, + "grad_norm": 6.756989970341897, + "learning_rate": 6.223870233725784e-06, + "loss": 0.6458, + "step": 5401 + }, + { + "epoch": 0.44, + "grad_norm": 3.56228681885131, + "learning_rate": 6.222594849792043e-06, + "loss": 0.7646, + "step": 5402 + }, + { + "epoch": 0.44, + "grad_norm": 3.0864387286228014, + "learning_rate": 6.22131938124704e-06, + "loss": 0.5313, + "step": 5403 + }, + { + "epoch": 0.44, + "grad_norm": 3.9830568539294364, + "learning_rate": 6.220043828179046e-06, + "loss": 0.7072, + "step": 5404 + }, + { + "epoch": 0.44, + "grad_norm": 4.583329975242541, + "learning_rate": 6.218768190676336e-06, + "loss": 0.6741, + "step": 5405 + }, + { + "epoch": 0.44, + "grad_norm": 3.48718690636573, + "learning_rate": 6.217492468827194e-06, + "loss": 0.6883, + "step": 5406 + }, + { + "epoch": 0.44, + "grad_norm": 5.319676448035411, + "learning_rate": 6.216216662719907e-06, + "loss": 0.7973, + "step": 5407 + }, + { + "epoch": 0.44, + "grad_norm": 2.941174486928553, + "learning_rate": 6.21494077244277e-06, + "loss": 0.6404, + "step": 5408 + }, + { + "epoch": 0.44, + "grad_norm": 2.5846925557150016, + "learning_rate": 6.2136647980840815e-06, + "loss": 0.6866, + "step": 5409 + }, + { + "epoch": 0.44, + "grad_norm": 2.7264782286603424, + "learning_rate": 6.2123887397321456e-06, + "loss": 0.8159, + "step": 5410 + }, + { + "epoch": 0.44, + "grad_norm": 2.5762708589251018, + "learning_rate": 6.2111125974752765e-06, + "loss": 0.5873, + "step": 5411 + }, + { + "epoch": 0.44, + "grad_norm": 3.252771569694799, + "learning_rate": 6.209836371401789e-06, + "loss": 0.7169, + "step": 5412 + }, + { + "epoch": 0.44, + "grad_norm": 4.950665161342302, + "learning_rate": 6.208560061600008e-06, + "loss": 0.64, + "step": 5413 + }, + { + "epoch": 0.44, + "grad_norm": 3.0404644272425223, + "learning_rate": 6.207283668158259e-06, + "loss": 0.5511, + "step": 5414 + }, + { + "epoch": 0.44, + "grad_norm": 6.127639734923818, + "learning_rate": 6.20600719116488e-06, + "loss": 0.6127, + "step": 5415 + }, + { + "epoch": 0.44, + "grad_norm": 3.843356708190703, + "learning_rate": 6.204730630708209e-06, + "loss": 0.5182, + "step": 5416 + }, + { + "epoch": 0.44, + "grad_norm": 2.6569152052562646, + "learning_rate": 6.203453986876594e-06, + "loss": 0.6976, + "step": 5417 + }, + { + "epoch": 0.44, + "grad_norm": 3.533354670299025, + "learning_rate": 6.202177259758384e-06, + "loss": 0.5959, + "step": 5418 + }, + { + "epoch": 0.44, + "grad_norm": 5.209694022605238, + "learning_rate": 6.20090044944194e-06, + "loss": 0.8021, + "step": 5419 + }, + { + "epoch": 0.44, + "grad_norm": 2.7257542660698593, + "learning_rate": 6.199623556015621e-06, + "loss": 0.7803, + "step": 5420 + }, + { + "epoch": 0.44, + "grad_norm": 2.367116253574616, + "learning_rate": 6.1983465795678e-06, + "loss": 0.6215, + "step": 5421 + }, + { + "epoch": 0.44, + "grad_norm": 3.1830152480743785, + "learning_rate": 6.19706952018685e-06, + "loss": 0.7885, + "step": 5422 + }, + { + "epoch": 0.44, + "grad_norm": 4.721921655102054, + "learning_rate": 6.195792377961152e-06, + "loss": 0.7182, + "step": 5423 + }, + { + "epoch": 0.44, + "grad_norm": 2.8564867760835946, + "learning_rate": 6.194515152979093e-06, + "loss": 0.6179, + "step": 5424 + }, + { + "epoch": 0.44, + "grad_norm": 3.16518348319037, + "learning_rate": 6.193237845329063e-06, + "loss": 0.576, + "step": 5425 + }, + { + "epoch": 0.44, + "grad_norm": 3.5048940898998686, + "learning_rate": 6.191960455099461e-06, + "loss": 0.7919, + "step": 5426 + }, + { + "epoch": 0.44, + "grad_norm": 2.7602764225850676, + "learning_rate": 6.19068298237869e-06, + "loss": 0.7971, + "step": 5427 + }, + { + "epoch": 0.44, + "grad_norm": 4.49977092855266, + "learning_rate": 6.189405427255158e-06, + "loss": 0.7714, + "step": 5428 + }, + { + "epoch": 0.44, + "grad_norm": 2.465674191101762, + "learning_rate": 6.188127789817284e-06, + "loss": 0.8631, + "step": 5429 + }, + { + "epoch": 0.44, + "grad_norm": 3.534092416065252, + "learning_rate": 6.186850070153484e-06, + "loss": 0.7703, + "step": 5430 + }, + { + "epoch": 0.44, + "grad_norm": 5.954974833418722, + "learning_rate": 6.1855722683521865e-06, + "loss": 0.7837, + "step": 5431 + }, + { + "epoch": 0.44, + "grad_norm": 3.186373510846532, + "learning_rate": 6.184294384501824e-06, + "loss": 0.671, + "step": 5432 + }, + { + "epoch": 0.44, + "grad_norm": 2.3291820264808507, + "learning_rate": 6.183016418690833e-06, + "loss": 0.6637, + "step": 5433 + }, + { + "epoch": 0.44, + "grad_norm": 4.491703984527265, + "learning_rate": 6.181738371007657e-06, + "loss": 0.6005, + "step": 5434 + }, + { + "epoch": 0.44, + "grad_norm": 3.4114646909893525, + "learning_rate": 6.180460241540745e-06, + "loss": 0.6914, + "step": 5435 + }, + { + "epoch": 0.44, + "grad_norm": 8.965727036413394, + "learning_rate": 6.1791820303785495e-06, + "loss": 0.7147, + "step": 5436 + }, + { + "epoch": 0.44, + "grad_norm": 13.2255432902925, + "learning_rate": 6.177903737609535e-06, + "loss": 0.6077, + "step": 5437 + }, + { + "epoch": 0.44, + "grad_norm": 2.1424648876354775, + "learning_rate": 6.176625363322164e-06, + "loss": 0.6357, + "step": 5438 + }, + { + "epoch": 0.44, + "grad_norm": 3.233952699653693, + "learning_rate": 6.17534690760491e-06, + "loss": 0.8118, + "step": 5439 + }, + { + "epoch": 0.44, + "grad_norm": 11.860065414271707, + "learning_rate": 6.17406837054625e-06, + "loss": 0.7259, + "step": 5440 + }, + { + "epoch": 0.44, + "grad_norm": 3.353191894431032, + "learning_rate": 6.172789752234665e-06, + "loss": 0.672, + "step": 5441 + }, + { + "epoch": 0.44, + "grad_norm": 2.5365073095911748, + "learning_rate": 6.171511052758645e-06, + "loss": 0.7353, + "step": 5442 + }, + { + "epoch": 0.44, + "grad_norm": 1.9867565063265187, + "learning_rate": 6.170232272206683e-06, + "loss": 0.6947, + "step": 5443 + }, + { + "epoch": 0.44, + "grad_norm": 7.802102200103752, + "learning_rate": 6.16895341066728e-06, + "loss": 0.7656, + "step": 5444 + }, + { + "epoch": 0.44, + "grad_norm": 2.4924829618151336, + "learning_rate": 6.1676744682289415e-06, + "loss": 0.7499, + "step": 5445 + }, + { + "epoch": 0.44, + "grad_norm": 2.3468149800417977, + "learning_rate": 6.1663954449801755e-06, + "loss": 0.663, + "step": 5446 + }, + { + "epoch": 0.44, + "grad_norm": 3.0585915895021416, + "learning_rate": 6.165116341009501e-06, + "loss": 0.7797, + "step": 5447 + }, + { + "epoch": 0.44, + "grad_norm": 4.057064431661013, + "learning_rate": 6.1638371564054415e-06, + "loss": 0.8421, + "step": 5448 + }, + { + "epoch": 0.44, + "grad_norm": 3.9499814769573316, + "learning_rate": 6.162557891256521e-06, + "loss": 0.5301, + "step": 5449 + }, + { + "epoch": 0.44, + "grad_norm": 5.417813111914994, + "learning_rate": 6.1612785456512745e-06, + "loss": 0.8148, + "step": 5450 + }, + { + "epoch": 0.44, + "grad_norm": 2.541330293301017, + "learning_rate": 6.159999119678241e-06, + "loss": 0.7168, + "step": 5451 + }, + { + "epoch": 0.44, + "grad_norm": 6.457983100996122, + "learning_rate": 6.158719613425964e-06, + "loss": 0.7683, + "step": 5452 + }, + { + "epoch": 0.44, + "grad_norm": 3.0991812769342886, + "learning_rate": 6.1574400269829934e-06, + "loss": 0.6132, + "step": 5453 + }, + { + "epoch": 0.44, + "grad_norm": 3.3400809026059073, + "learning_rate": 6.156160360437885e-06, + "loss": 0.6904, + "step": 5454 + }, + { + "epoch": 0.44, + "grad_norm": 2.3273916895260562, + "learning_rate": 6.154880613879202e-06, + "loss": 0.8502, + "step": 5455 + }, + { + "epoch": 0.44, + "grad_norm": 2.507437687895256, + "learning_rate": 6.153600787395506e-06, + "loss": 0.5769, + "step": 5456 + }, + { + "epoch": 0.44, + "grad_norm": 2.080519541895471, + "learning_rate": 6.152320881075374e-06, + "loss": 0.7646, + "step": 5457 + }, + { + "epoch": 0.44, + "grad_norm": 2.3478652515986926, + "learning_rate": 6.151040895007382e-06, + "loss": 0.7436, + "step": 5458 + }, + { + "epoch": 0.44, + "grad_norm": 2.4164658315290923, + "learning_rate": 6.1497608292801105e-06, + "loss": 0.6299, + "step": 5459 + }, + { + "epoch": 0.44, + "grad_norm": 5.725614000961182, + "learning_rate": 6.14848068398215e-06, + "loss": 0.7445, + "step": 5460 + }, + { + "epoch": 0.44, + "grad_norm": 2.1824596819168454, + "learning_rate": 6.147200459202095e-06, + "loss": 0.707, + "step": 5461 + }, + { + "epoch": 0.44, + "grad_norm": 5.207859413440403, + "learning_rate": 6.145920155028546e-06, + "loss": 0.762, + "step": 5462 + }, + { + "epoch": 0.44, + "grad_norm": 3.7187974262118453, + "learning_rate": 6.144639771550106e-06, + "loss": 0.7088, + "step": 5463 + }, + { + "epoch": 0.44, + "grad_norm": 3.1082905453030985, + "learning_rate": 6.143359308855388e-06, + "loss": 0.7101, + "step": 5464 + }, + { + "epoch": 0.44, + "grad_norm": 3.871269266122093, + "learning_rate": 6.142078767033006e-06, + "loss": 0.5926, + "step": 5465 + }, + { + "epoch": 0.44, + "grad_norm": 2.6660709616399156, + "learning_rate": 6.140798146171581e-06, + "loss": 0.6339, + "step": 5466 + }, + { + "epoch": 0.44, + "grad_norm": 3.0958431901142163, + "learning_rate": 6.139517446359742e-06, + "loss": 0.6837, + "step": 5467 + }, + { + "epoch": 0.44, + "grad_norm": 2.9837431784955566, + "learning_rate": 6.138236667686121e-06, + "loss": 0.7579, + "step": 5468 + }, + { + "epoch": 0.44, + "grad_norm": 3.2499933360219453, + "learning_rate": 6.136955810239356e-06, + "loss": 0.6534, + "step": 5469 + }, + { + "epoch": 0.44, + "grad_norm": 2.8064188398389214, + "learning_rate": 6.135674874108089e-06, + "loss": 0.7902, + "step": 5470 + }, + { + "epoch": 0.44, + "grad_norm": 3.2751762399093107, + "learning_rate": 6.134393859380969e-06, + "loss": 0.6461, + "step": 5471 + }, + { + "epoch": 0.44, + "grad_norm": 3.5868773927172124, + "learning_rate": 6.1331127661466525e-06, + "loss": 0.8174, + "step": 5472 + }, + { + "epoch": 0.44, + "grad_norm": 3.8423729790280716, + "learning_rate": 6.1318315944937985e-06, + "loss": 0.7063, + "step": 5473 + }, + { + "epoch": 0.44, + "grad_norm": 3.513132486641011, + "learning_rate": 6.130550344511071e-06, + "loss": 0.7053, + "step": 5474 + }, + { + "epoch": 0.44, + "grad_norm": 4.097479482107203, + "learning_rate": 6.129269016287142e-06, + "loss": 0.6955, + "step": 5475 + }, + { + "epoch": 0.44, + "grad_norm": 4.1922935058248285, + "learning_rate": 6.127987609910685e-06, + "loss": 0.6594, + "step": 5476 + }, + { + "epoch": 0.44, + "grad_norm": 2.8918171031402364, + "learning_rate": 6.126706125470383e-06, + "loss": 0.7509, + "step": 5477 + }, + { + "epoch": 0.44, + "grad_norm": 3.1915852462643763, + "learning_rate": 6.125424563054925e-06, + "loss": 0.6067, + "step": 5478 + }, + { + "epoch": 0.45, + "grad_norm": 3.696561461594429, + "learning_rate": 6.124142922752998e-06, + "loss": 0.8133, + "step": 5479 + }, + { + "epoch": 0.45, + "grad_norm": 13.331292131497543, + "learning_rate": 6.122861204653304e-06, + "loss": 0.7111, + "step": 5480 + }, + { + "epoch": 0.45, + "grad_norm": 2.9090074060075053, + "learning_rate": 6.121579408844546e-06, + "loss": 0.6356, + "step": 5481 + }, + { + "epoch": 0.45, + "grad_norm": 3.5576086257561386, + "learning_rate": 6.1202975354154296e-06, + "loss": 0.5959, + "step": 5482 + }, + { + "epoch": 0.45, + "grad_norm": 2.524695286913268, + "learning_rate": 6.1190155844546695e-06, + "loss": 0.6309, + "step": 5483 + }, + { + "epoch": 0.45, + "grad_norm": 3.5432535038546273, + "learning_rate": 6.117733556050985e-06, + "loss": 0.8023, + "step": 5484 + }, + { + "epoch": 0.45, + "grad_norm": 3.185092093595493, + "learning_rate": 6.1164514502931e-06, + "loss": 0.7325, + "step": 5485 + }, + { + "epoch": 0.45, + "grad_norm": 5.288818852742843, + "learning_rate": 6.115169267269746e-06, + "loss": 0.7447, + "step": 5486 + }, + { + "epoch": 0.45, + "grad_norm": 3.504440840954236, + "learning_rate": 6.113887007069657e-06, + "loss": 0.6257, + "step": 5487 + }, + { + "epoch": 0.45, + "grad_norm": 2.7656473141883997, + "learning_rate": 6.112604669781572e-06, + "loss": 0.6709, + "step": 5488 + }, + { + "epoch": 0.45, + "grad_norm": 3.028114962404329, + "learning_rate": 6.1113222554942405e-06, + "loss": 0.6002, + "step": 5489 + }, + { + "epoch": 0.45, + "grad_norm": 9.162837392962425, + "learning_rate": 6.1100397642964105e-06, + "loss": 0.6283, + "step": 5490 + }, + { + "epoch": 0.45, + "grad_norm": 3.468672856032037, + "learning_rate": 6.108757196276839e-06, + "loss": 0.6276, + "step": 5491 + }, + { + "epoch": 0.45, + "grad_norm": 2.943584562463514, + "learning_rate": 6.107474551524288e-06, + "loss": 0.6981, + "step": 5492 + }, + { + "epoch": 0.45, + "grad_norm": 2.423080842383334, + "learning_rate": 6.106191830127526e-06, + "loss": 0.5114, + "step": 5493 + }, + { + "epoch": 0.45, + "grad_norm": 11.155287187583347, + "learning_rate": 6.104909032175323e-06, + "loss": 0.7022, + "step": 5494 + }, + { + "epoch": 0.45, + "grad_norm": 3.8154891910531292, + "learning_rate": 6.103626157756459e-06, + "loss": 0.7656, + "step": 5495 + }, + { + "epoch": 0.45, + "grad_norm": 8.136020163417781, + "learning_rate": 6.102343206959714e-06, + "loss": 0.8002, + "step": 5496 + }, + { + "epoch": 0.45, + "grad_norm": 2.3983422652249082, + "learning_rate": 6.101060179873881e-06, + "loss": 0.6919, + "step": 5497 + }, + { + "epoch": 0.45, + "grad_norm": 2.9597901697306592, + "learning_rate": 6.099777076587749e-06, + "loss": 0.7012, + "step": 5498 + }, + { + "epoch": 0.45, + "grad_norm": 3.6252602399527274, + "learning_rate": 6.098493897190119e-06, + "loss": 0.7057, + "step": 5499 + }, + { + "epoch": 0.45, + "grad_norm": 3.3362153013550264, + "learning_rate": 6.097210641769794e-06, + "loss": 0.8149, + "step": 5500 + }, + { + "epoch": 0.45, + "grad_norm": 2.541577381763081, + "learning_rate": 6.095927310415584e-06, + "loss": 0.6942, + "step": 5501 + }, + { + "epoch": 0.45, + "grad_norm": 5.712320679657128, + "learning_rate": 6.094643903216304e-06, + "loss": 0.531, + "step": 5502 + }, + { + "epoch": 0.45, + "grad_norm": 5.377621185877719, + "learning_rate": 6.0933604202607735e-06, + "loss": 0.685, + "step": 5503 + }, + { + "epoch": 0.45, + "grad_norm": 2.7563797054336563, + "learning_rate": 6.092076861637817e-06, + "loss": 0.6915, + "step": 5504 + }, + { + "epoch": 0.45, + "grad_norm": 3.2962370508527163, + "learning_rate": 6.0907932274362655e-06, + "loss": 0.6769, + "step": 5505 + }, + { + "epoch": 0.45, + "grad_norm": 3.053444836208826, + "learning_rate": 6.089509517744956e-06, + "loss": 0.6877, + "step": 5506 + }, + { + "epoch": 0.45, + "grad_norm": 3.944120604345821, + "learning_rate": 6.088225732652726e-06, + "loss": 0.7667, + "step": 5507 + }, + { + "epoch": 0.45, + "grad_norm": 2.98845035372142, + "learning_rate": 6.086941872248424e-06, + "loss": 0.6201, + "step": 5508 + }, + { + "epoch": 0.45, + "grad_norm": 2.58968636755926, + "learning_rate": 6.0856579366209005e-06, + "loss": 0.57, + "step": 5509 + }, + { + "epoch": 0.45, + "grad_norm": 3.5525262240433158, + "learning_rate": 6.084373925859011e-06, + "loss": 0.5896, + "step": 5510 + }, + { + "epoch": 0.45, + "grad_norm": 3.5940031361575833, + "learning_rate": 6.083089840051619e-06, + "loss": 0.7888, + "step": 5511 + }, + { + "epoch": 0.45, + "grad_norm": 6.000042053568261, + "learning_rate": 6.0818056792875905e-06, + "loss": 0.6076, + "step": 5512 + }, + { + "epoch": 0.45, + "grad_norm": 3.4356303213025208, + "learning_rate": 6.080521443655797e-06, + "loss": 0.7205, + "step": 5513 + }, + { + "epoch": 0.45, + "grad_norm": 3.0369844295951616, + "learning_rate": 6.079237133245115e-06, + "loss": 0.5872, + "step": 5514 + }, + { + "epoch": 0.45, + "grad_norm": 3.5597237644164257, + "learning_rate": 6.07795274814443e-06, + "loss": 0.5408, + "step": 5515 + }, + { + "epoch": 0.45, + "grad_norm": 2.114816642177686, + "learning_rate": 6.076668288442626e-06, + "loss": 0.7376, + "step": 5516 + }, + { + "epoch": 0.45, + "grad_norm": 3.4330311341119404, + "learning_rate": 6.075383754228598e-06, + "loss": 0.6608, + "step": 5517 + }, + { + "epoch": 0.45, + "grad_norm": 10.80120312623891, + "learning_rate": 6.074099145591242e-06, + "loss": 0.5894, + "step": 5518 + }, + { + "epoch": 0.45, + "grad_norm": 2.7215246870568555, + "learning_rate": 6.072814462619463e-06, + "loss": 0.7244, + "step": 5519 + }, + { + "epoch": 0.45, + "grad_norm": 8.32048826020818, + "learning_rate": 6.071529705402167e-06, + "loss": 0.7393, + "step": 5520 + }, + { + "epoch": 0.45, + "grad_norm": 3.649122727356196, + "learning_rate": 6.0702448740282704e-06, + "loss": 0.7106, + "step": 5521 + }, + { + "epoch": 0.45, + "grad_norm": 2.9896427477115948, + "learning_rate": 6.068959968586689e-06, + "loss": 0.641, + "step": 5522 + }, + { + "epoch": 0.45, + "grad_norm": 3.9209350591946897, + "learning_rate": 6.0676749891663464e-06, + "loss": 0.4698, + "step": 5523 + }, + { + "epoch": 0.45, + "grad_norm": 2.379236859570022, + "learning_rate": 6.066389935856172e-06, + "loss": 0.8414, + "step": 5524 + }, + { + "epoch": 0.45, + "grad_norm": 3.642893200893073, + "learning_rate": 6.0651048087451e-06, + "loss": 0.7144, + "step": 5525 + }, + { + "epoch": 0.45, + "grad_norm": 3.1959707519466494, + "learning_rate": 6.063819607922068e-06, + "loss": 0.6937, + "step": 5526 + }, + { + "epoch": 0.45, + "grad_norm": 2.9945364211937493, + "learning_rate": 6.062534333476021e-06, + "loss": 0.6656, + "step": 5527 + }, + { + "epoch": 0.45, + "grad_norm": 4.856604907936098, + "learning_rate": 6.061248985495909e-06, + "loss": 0.7815, + "step": 5528 + }, + { + "epoch": 0.45, + "grad_norm": 3.6345119796498477, + "learning_rate": 6.059963564070683e-06, + "loss": 0.5658, + "step": 5529 + }, + { + "epoch": 0.45, + "grad_norm": 3.1557347854144875, + "learning_rate": 6.058678069289307e-06, + "loss": 0.7401, + "step": 5530 + }, + { + "epoch": 0.45, + "grad_norm": 3.634940497522204, + "learning_rate": 6.057392501240741e-06, + "loss": 0.7574, + "step": 5531 + }, + { + "epoch": 0.45, + "grad_norm": 5.064597336396302, + "learning_rate": 6.056106860013956e-06, + "loss": 0.721, + "step": 5532 + }, + { + "epoch": 0.45, + "grad_norm": 2.918225148022936, + "learning_rate": 6.0548211456979255e-06, + "loss": 0.7276, + "step": 5533 + }, + { + "epoch": 0.45, + "grad_norm": 2.4741388861959557, + "learning_rate": 6.053535358381632e-06, + "loss": 0.5945, + "step": 5534 + }, + { + "epoch": 0.45, + "grad_norm": 3.04529080845913, + "learning_rate": 6.052249498154057e-06, + "loss": 0.6167, + "step": 5535 + }, + { + "epoch": 0.45, + "grad_norm": 4.12801735906496, + "learning_rate": 6.050963565104191e-06, + "loss": 0.6862, + "step": 5536 + }, + { + "epoch": 0.45, + "grad_norm": 4.327844939699068, + "learning_rate": 6.04967755932103e-06, + "loss": 0.5271, + "step": 5537 + }, + { + "epoch": 0.45, + "grad_norm": 4.316307729140436, + "learning_rate": 6.0483914808935715e-06, + "loss": 0.8816, + "step": 5538 + }, + { + "epoch": 0.45, + "grad_norm": 5.170019008761969, + "learning_rate": 6.0471053299108216e-06, + "loss": 0.6825, + "step": 5539 + }, + { + "epoch": 0.45, + "grad_norm": 3.087785488458495, + "learning_rate": 6.04581910646179e-06, + "loss": 0.7986, + "step": 5540 + }, + { + "epoch": 0.45, + "grad_norm": 3.2270329131187134, + "learning_rate": 6.04453281063549e-06, + "loss": 0.7098, + "step": 5541 + }, + { + "epoch": 0.45, + "grad_norm": 3.092376325749032, + "learning_rate": 6.0432464425209445e-06, + "loss": 0.6118, + "step": 5542 + }, + { + "epoch": 0.45, + "grad_norm": 7.453302381062141, + "learning_rate": 6.041960002207174e-06, + "loss": 0.7232, + "step": 5543 + }, + { + "epoch": 0.45, + "grad_norm": 6.668788138829259, + "learning_rate": 6.040673489783212e-06, + "loss": 0.7153, + "step": 5544 + }, + { + "epoch": 0.45, + "grad_norm": 4.745626269578607, + "learning_rate": 6.039386905338093e-06, + "loss": 0.7795, + "step": 5545 + }, + { + "epoch": 0.45, + "grad_norm": 2.282937914918666, + "learning_rate": 6.0381002489608554e-06, + "loss": 0.744, + "step": 5546 + }, + { + "epoch": 0.45, + "grad_norm": 2.1049495726609586, + "learning_rate": 6.036813520740543e-06, + "loss": 0.646, + "step": 5547 + }, + { + "epoch": 0.45, + "grad_norm": 3.684562699653572, + "learning_rate": 6.035526720766207e-06, + "loss": 0.8077, + "step": 5548 + }, + { + "epoch": 0.45, + "grad_norm": 5.170062740000753, + "learning_rate": 6.034239849126901e-06, + "loss": 0.7445, + "step": 5549 + }, + { + "epoch": 0.45, + "grad_norm": 2.3936539023480927, + "learning_rate": 6.032952905911686e-06, + "loss": 0.6343, + "step": 5550 + }, + { + "epoch": 0.45, + "grad_norm": 4.0941263605514795, + "learning_rate": 6.031665891209627e-06, + "loss": 0.6527, + "step": 5551 + }, + { + "epoch": 0.45, + "grad_norm": 3.250914370966659, + "learning_rate": 6.030378805109791e-06, + "loss": 0.5753, + "step": 5552 + }, + { + "epoch": 0.45, + "grad_norm": 12.738931809841883, + "learning_rate": 6.029091647701254e-06, + "loss": 0.6138, + "step": 5553 + }, + { + "epoch": 0.45, + "grad_norm": 9.28200601067911, + "learning_rate": 6.027804419073096e-06, + "loss": 0.5888, + "step": 5554 + }, + { + "epoch": 0.45, + "grad_norm": 6.855940234919852, + "learning_rate": 6.0265171193144e-06, + "loss": 0.7445, + "step": 5555 + }, + { + "epoch": 0.45, + "grad_norm": 2.5646589553201324, + "learning_rate": 6.025229748514256e-06, + "loss": 0.6026, + "step": 5556 + }, + { + "epoch": 0.45, + "grad_norm": 3.050897821941831, + "learning_rate": 6.023942306761758e-06, + "loss": 0.7622, + "step": 5557 + }, + { + "epoch": 0.45, + "grad_norm": 2.5363723287320847, + "learning_rate": 6.022654794146006e-06, + "loss": 0.5502, + "step": 5558 + }, + { + "epoch": 0.45, + "grad_norm": 3.4094850908888037, + "learning_rate": 6.0213672107561005e-06, + "loss": 0.7411, + "step": 5559 + }, + { + "epoch": 0.45, + "grad_norm": 4.909537023299501, + "learning_rate": 6.020079556681154e-06, + "loss": 0.891, + "step": 5560 + }, + { + "epoch": 0.45, + "grad_norm": 3.709870374550227, + "learning_rate": 6.018791832010281e-06, + "loss": 0.8141, + "step": 5561 + }, + { + "epoch": 0.45, + "grad_norm": 5.678466723562747, + "learning_rate": 6.017504036832598e-06, + "loss": 0.7289, + "step": 5562 + }, + { + "epoch": 0.45, + "grad_norm": 2.3966343157395165, + "learning_rate": 6.016216171237228e-06, + "loss": 0.8388, + "step": 5563 + }, + { + "epoch": 0.45, + "grad_norm": 7.334902193859894, + "learning_rate": 6.014928235313301e-06, + "loss": 0.7347, + "step": 5564 + }, + { + "epoch": 0.45, + "grad_norm": 3.076488541849444, + "learning_rate": 6.013640229149948e-06, + "loss": 0.7614, + "step": 5565 + }, + { + "epoch": 0.45, + "grad_norm": 4.385063339589167, + "learning_rate": 6.012352152836309e-06, + "loss": 0.7464, + "step": 5566 + }, + { + "epoch": 0.45, + "grad_norm": 2.7211592676806737, + "learning_rate": 6.011064006461528e-06, + "loss": 0.6434, + "step": 5567 + }, + { + "epoch": 0.45, + "grad_norm": 7.345989756607234, + "learning_rate": 6.009775790114751e-06, + "loss": 0.5279, + "step": 5568 + }, + { + "epoch": 0.45, + "grad_norm": 3.6401390064802857, + "learning_rate": 6.008487503885132e-06, + "loss": 0.6762, + "step": 5569 + }, + { + "epoch": 0.45, + "grad_norm": 7.0361743741790095, + "learning_rate": 6.0071991478618275e-06, + "loss": 0.6972, + "step": 5570 + }, + { + "epoch": 0.45, + "grad_norm": 3.7869377196617875, + "learning_rate": 6.005910722134001e-06, + "loss": 0.6637, + "step": 5571 + }, + { + "epoch": 0.45, + "grad_norm": 3.3824748805857054, + "learning_rate": 6.004622226790816e-06, + "loss": 0.7765, + "step": 5572 + }, + { + "epoch": 0.45, + "grad_norm": 3.9419536508176183, + "learning_rate": 6.003333661921449e-06, + "loss": 0.8397, + "step": 5573 + }, + { + "epoch": 0.45, + "grad_norm": 4.581503634063919, + "learning_rate": 6.002045027615076e-06, + "loss": 0.6127, + "step": 5574 + }, + { + "epoch": 0.45, + "grad_norm": 5.025940011311868, + "learning_rate": 6.000756323960875e-06, + "loss": 0.6275, + "step": 5575 + }, + { + "epoch": 0.45, + "grad_norm": 5.212500305550624, + "learning_rate": 5.999467551048037e-06, + "loss": 0.8595, + "step": 5576 + }, + { + "epoch": 0.45, + "grad_norm": 4.6083405386571, + "learning_rate": 5.998178708965752e-06, + "loss": 0.7639, + "step": 5577 + }, + { + "epoch": 0.45, + "grad_norm": 2.523716581009918, + "learning_rate": 5.996889797803214e-06, + "loss": 0.6307, + "step": 5578 + }, + { + "epoch": 0.45, + "grad_norm": 2.480568202181107, + "learning_rate": 5.995600817649625e-06, + "loss": 0.706, + "step": 5579 + }, + { + "epoch": 0.45, + "grad_norm": 3.9920817575102427, + "learning_rate": 5.994311768594191e-06, + "loss": 0.7819, + "step": 5580 + }, + { + "epoch": 0.45, + "grad_norm": 10.18474341357066, + "learning_rate": 5.993022650726122e-06, + "loss": 0.5788, + "step": 5581 + }, + { + "epoch": 0.45, + "grad_norm": 3.670246154477628, + "learning_rate": 5.9917334641346325e-06, + "loss": 0.7016, + "step": 5582 + }, + { + "epoch": 0.45, + "grad_norm": 3.058775787977504, + "learning_rate": 5.990444208908942e-06, + "loss": 0.6849, + "step": 5583 + }, + { + "epoch": 0.45, + "grad_norm": 2.5166152428968305, + "learning_rate": 5.989154885138279e-06, + "loss": 0.6315, + "step": 5584 + }, + { + "epoch": 0.45, + "grad_norm": 3.0160515905740626, + "learning_rate": 5.987865492911866e-06, + "loss": 0.5419, + "step": 5585 + }, + { + "epoch": 0.45, + "grad_norm": 2.4451833308000075, + "learning_rate": 5.986576032318943e-06, + "loss": 0.7434, + "step": 5586 + }, + { + "epoch": 0.45, + "grad_norm": 2.867969828637662, + "learning_rate": 5.985286503448746e-06, + "loss": 0.6824, + "step": 5587 + }, + { + "epoch": 0.45, + "grad_norm": 7.427523669043565, + "learning_rate": 5.9839969063905205e-06, + "loss": 0.7957, + "step": 5588 + }, + { + "epoch": 0.45, + "grad_norm": 3.163381115665876, + "learning_rate": 5.982707241233511e-06, + "loss": 0.6994, + "step": 5589 + }, + { + "epoch": 0.45, + "grad_norm": 4.6747053778104615, + "learning_rate": 5.981417508066974e-06, + "loss": 0.6323, + "step": 5590 + }, + { + "epoch": 0.45, + "grad_norm": 3.6449175123487665, + "learning_rate": 5.980127706980165e-06, + "loss": 0.6168, + "step": 5591 + }, + { + "epoch": 0.45, + "grad_norm": 7.772738291633539, + "learning_rate": 5.978837838062348e-06, + "loss": 0.7204, + "step": 5592 + }, + { + "epoch": 0.45, + "grad_norm": 4.140518459178593, + "learning_rate": 5.9775479014027895e-06, + "loss": 0.6567, + "step": 5593 + }, + { + "epoch": 0.45, + "grad_norm": 4.528767214994769, + "learning_rate": 5.976257897090761e-06, + "loss": 0.7794, + "step": 5594 + }, + { + "epoch": 0.45, + "grad_norm": 3.451332048831778, + "learning_rate": 5.9749678252155394e-06, + "loss": 0.724, + "step": 5595 + }, + { + "epoch": 0.45, + "grad_norm": 4.9906710790072655, + "learning_rate": 5.973677685866405e-06, + "loss": 0.5742, + "step": 5596 + }, + { + "epoch": 0.45, + "grad_norm": 4.202263350691895, + "learning_rate": 5.9723874791326434e-06, + "loss": 0.6339, + "step": 5597 + }, + { + "epoch": 0.45, + "grad_norm": 2.8724974670014594, + "learning_rate": 5.971097205103547e-06, + "loss": 0.7919, + "step": 5598 + }, + { + "epoch": 0.45, + "grad_norm": 2.5909744333726885, + "learning_rate": 5.969806863868407e-06, + "loss": 0.6262, + "step": 5599 + }, + { + "epoch": 0.45, + "grad_norm": 5.699930880597009, + "learning_rate": 5.968516455516526e-06, + "loss": 0.5873, + "step": 5600 + }, + { + "epoch": 0.45, + "grad_norm": 5.235105565494222, + "learning_rate": 5.967225980137211e-06, + "loss": 0.8451, + "step": 5601 + }, + { + "epoch": 0.45, + "grad_norm": 4.22796480939009, + "learning_rate": 5.9659354378197666e-06, + "loss": 0.6706, + "step": 5602 + }, + { + "epoch": 0.46, + "grad_norm": 2.930849867227877, + "learning_rate": 5.964644828653506e-06, + "loss": 0.721, + "step": 5603 + }, + { + "epoch": 0.46, + "grad_norm": 3.2167317217587734, + "learning_rate": 5.963354152727751e-06, + "loss": 0.7929, + "step": 5604 + }, + { + "epoch": 0.46, + "grad_norm": 5.41177027895188, + "learning_rate": 5.962063410131823e-06, + "loss": 0.7472, + "step": 5605 + }, + { + "epoch": 0.46, + "grad_norm": 2.691295926289071, + "learning_rate": 5.9607726009550494e-06, + "loss": 0.8851, + "step": 5606 + }, + { + "epoch": 0.46, + "grad_norm": 3.6409175034478127, + "learning_rate": 5.959481725286761e-06, + "loss": 0.8012, + "step": 5607 + }, + { + "epoch": 0.46, + "grad_norm": 3.441324772598239, + "learning_rate": 5.958190783216297e-06, + "loss": 0.7659, + "step": 5608 + }, + { + "epoch": 0.46, + "grad_norm": 3.0572127707864913, + "learning_rate": 5.956899774832997e-06, + "loss": 0.7427, + "step": 5609 + }, + { + "epoch": 0.46, + "grad_norm": 3.9778288710614893, + "learning_rate": 5.955608700226208e-06, + "loss": 0.6225, + "step": 5610 + }, + { + "epoch": 0.46, + "grad_norm": 3.2509851252178663, + "learning_rate": 5.95431755948528e-06, + "loss": 0.745, + "step": 5611 + }, + { + "epoch": 0.46, + "grad_norm": 5.69934957624073, + "learning_rate": 5.9530263526995665e-06, + "loss": 0.7848, + "step": 5612 + }, + { + "epoch": 0.46, + "grad_norm": 3.1729169445809746, + "learning_rate": 5.9517350799584305e-06, + "loss": 0.5803, + "step": 5613 + }, + { + "epoch": 0.46, + "grad_norm": 3.543003657252106, + "learning_rate": 5.950443741351234e-06, + "loss": 0.585, + "step": 5614 + }, + { + "epoch": 0.46, + "grad_norm": 8.161416037217528, + "learning_rate": 5.949152336967345e-06, + "loss": 0.8223, + "step": 5615 + }, + { + "epoch": 0.46, + "grad_norm": 3.394931978827472, + "learning_rate": 5.9478608668961375e-06, + "loss": 0.7023, + "step": 5616 + }, + { + "epoch": 0.46, + "grad_norm": 4.416962152982418, + "learning_rate": 5.946569331226992e-06, + "loss": 0.7282, + "step": 5617 + }, + { + "epoch": 0.46, + "grad_norm": 4.0993338204237135, + "learning_rate": 5.945277730049287e-06, + "loss": 0.6273, + "step": 5618 + }, + { + "epoch": 0.46, + "grad_norm": 5.011530479873847, + "learning_rate": 5.943986063452412e-06, + "loss": 0.8692, + "step": 5619 + }, + { + "epoch": 0.46, + "grad_norm": 3.9213469938751015, + "learning_rate": 5.942694331525758e-06, + "loss": 0.6645, + "step": 5620 + }, + { + "epoch": 0.46, + "grad_norm": 3.7476993046354106, + "learning_rate": 5.94140253435872e-06, + "loss": 0.751, + "step": 5621 + }, + { + "epoch": 0.46, + "grad_norm": 4.7572062985866905, + "learning_rate": 5.940110672040699e-06, + "loss": 0.6993, + "step": 5622 + }, + { + "epoch": 0.46, + "grad_norm": 5.4628389642146, + "learning_rate": 5.938818744661099e-06, + "loss": 0.6514, + "step": 5623 + }, + { + "epoch": 0.46, + "grad_norm": 2.9679994983833153, + "learning_rate": 5.937526752309331e-06, + "loss": 0.7478, + "step": 5624 + }, + { + "epoch": 0.46, + "grad_norm": 2.447392515908094, + "learning_rate": 5.936234695074809e-06, + "loss": 0.7964, + "step": 5625 + }, + { + "epoch": 0.46, + "grad_norm": 5.188289071372854, + "learning_rate": 5.934942573046953e-06, + "loss": 0.7344, + "step": 5626 + }, + { + "epoch": 0.46, + "grad_norm": 3.7085565278313073, + "learning_rate": 5.9336503863151825e-06, + "loss": 0.8268, + "step": 5627 + }, + { + "epoch": 0.46, + "grad_norm": 2.173604035973248, + "learning_rate": 5.932358134968925e-06, + "loss": 0.5884, + "step": 5628 + }, + { + "epoch": 0.46, + "grad_norm": 2.924074296830354, + "learning_rate": 5.931065819097616e-06, + "loss": 0.674, + "step": 5629 + }, + { + "epoch": 0.46, + "grad_norm": 4.281831028199484, + "learning_rate": 5.929773438790688e-06, + "loss": 0.6652, + "step": 5630 + }, + { + "epoch": 0.46, + "grad_norm": 3.9546505967765455, + "learning_rate": 5.928480994137586e-06, + "loss": 0.5957, + "step": 5631 + }, + { + "epoch": 0.46, + "grad_norm": 2.8272042440677914, + "learning_rate": 5.9271884852277505e-06, + "loss": 0.7226, + "step": 5632 + }, + { + "epoch": 0.46, + "grad_norm": 4.435080111584851, + "learning_rate": 5.9258959121506345e-06, + "loss": 0.6117, + "step": 5633 + }, + { + "epoch": 0.46, + "grad_norm": 2.5976271870831904, + "learning_rate": 5.924603274995693e-06, + "loss": 0.5711, + "step": 5634 + }, + { + "epoch": 0.46, + "grad_norm": 4.039696681909373, + "learning_rate": 5.9233105738523835e-06, + "loss": 0.8134, + "step": 5635 + }, + { + "epoch": 0.46, + "grad_norm": 5.2321136845021, + "learning_rate": 5.9220178088101654e-06, + "loss": 0.7622, + "step": 5636 + }, + { + "epoch": 0.46, + "grad_norm": 3.3330470101560548, + "learning_rate": 5.920724979958512e-06, + "loss": 0.6186, + "step": 5637 + }, + { + "epoch": 0.46, + "grad_norm": 2.817319861523605, + "learning_rate": 5.919432087386891e-06, + "loss": 0.6629, + "step": 5638 + }, + { + "epoch": 0.46, + "grad_norm": 2.315873781066584, + "learning_rate": 5.918139131184781e-06, + "loss": 0.6226, + "step": 5639 + }, + { + "epoch": 0.46, + "grad_norm": 3.11057872721177, + "learning_rate": 5.916846111441663e-06, + "loss": 0.7952, + "step": 5640 + }, + { + "epoch": 0.46, + "grad_norm": 3.121019219880057, + "learning_rate": 5.915553028247021e-06, + "loss": 0.5495, + "step": 5641 + }, + { + "epoch": 0.46, + "grad_norm": 2.8758429756939488, + "learning_rate": 5.914259881690343e-06, + "loss": 0.6698, + "step": 5642 + }, + { + "epoch": 0.46, + "grad_norm": 26.86074133130603, + "learning_rate": 5.912966671861127e-06, + "loss": 0.622, + "step": 5643 + }, + { + "epoch": 0.46, + "grad_norm": 3.052048162915032, + "learning_rate": 5.9116733988488676e-06, + "loss": 0.7541, + "step": 5644 + }, + { + "epoch": 0.46, + "grad_norm": 6.074595451940016, + "learning_rate": 5.910380062743067e-06, + "loss": 0.587, + "step": 5645 + }, + { + "epoch": 0.46, + "grad_norm": 2.9505790955574724, + "learning_rate": 5.909086663633235e-06, + "loss": 0.6598, + "step": 5646 + }, + { + "epoch": 0.46, + "grad_norm": 4.1289877899278355, + "learning_rate": 5.9077932016088835e-06, + "loss": 0.6435, + "step": 5647 + }, + { + "epoch": 0.46, + "grad_norm": 3.8219784678777016, + "learning_rate": 5.906499676759524e-06, + "loss": 0.8832, + "step": 5648 + }, + { + "epoch": 0.46, + "grad_norm": 2.6816686369529656, + "learning_rate": 5.9052060891746796e-06, + "loss": 0.5819, + "step": 5649 + }, + { + "epoch": 0.46, + "grad_norm": 5.585264074473614, + "learning_rate": 5.903912438943875e-06, + "loss": 0.6244, + "step": 5650 + }, + { + "epoch": 0.46, + "grad_norm": 2.2870317291391986, + "learning_rate": 5.902618726156639e-06, + "loss": 0.7642, + "step": 5651 + }, + { + "epoch": 0.46, + "grad_norm": 2.7361087029961513, + "learning_rate": 5.9013249509025016e-06, + "loss": 0.6995, + "step": 5652 + }, + { + "epoch": 0.46, + "grad_norm": 3.0358767993543, + "learning_rate": 5.900031113271003e-06, + "loss": 0.6731, + "step": 5653 + }, + { + "epoch": 0.46, + "grad_norm": 6.020234028267428, + "learning_rate": 5.8987372133516865e-06, + "loss": 0.7503, + "step": 5654 + }, + { + "epoch": 0.46, + "grad_norm": 4.05824232923475, + "learning_rate": 5.897443251234093e-06, + "loss": 0.7473, + "step": 5655 + }, + { + "epoch": 0.46, + "grad_norm": 3.3459317319357025, + "learning_rate": 5.896149227007776e-06, + "loss": 0.7704, + "step": 5656 + }, + { + "epoch": 0.46, + "grad_norm": 6.00936886339648, + "learning_rate": 5.894855140762292e-06, + "loss": 0.7694, + "step": 5657 + }, + { + "epoch": 0.46, + "grad_norm": 3.557118944823089, + "learning_rate": 5.893560992587196e-06, + "loss": 0.56, + "step": 5658 + }, + { + "epoch": 0.46, + "grad_norm": 5.122025383605861, + "learning_rate": 5.892266782572053e-06, + "loss": 0.7186, + "step": 5659 + }, + { + "epoch": 0.46, + "grad_norm": 3.0711173904120903, + "learning_rate": 5.890972510806431e-06, + "loss": 0.8126, + "step": 5660 + }, + { + "epoch": 0.46, + "grad_norm": 3.656009449391839, + "learning_rate": 5.8896781773799015e-06, + "loss": 0.7556, + "step": 5661 + }, + { + "epoch": 0.46, + "grad_norm": 2.8440546554835766, + "learning_rate": 5.88838378238204e-06, + "loss": 0.8314, + "step": 5662 + }, + { + "epoch": 0.46, + "grad_norm": 3.496472087959339, + "learning_rate": 5.8870893259024264e-06, + "loss": 0.5987, + "step": 5663 + }, + { + "epoch": 0.46, + "grad_norm": 3.2173919049233346, + "learning_rate": 5.885794808030647e-06, + "loss": 0.6539, + "step": 5664 + }, + { + "epoch": 0.46, + "grad_norm": 4.140234613234118, + "learning_rate": 5.884500228856289e-06, + "loss": 0.6819, + "step": 5665 + }, + { + "epoch": 0.46, + "grad_norm": 3.3337652081867493, + "learning_rate": 5.8832055884689465e-06, + "loss": 0.6254, + "step": 5666 + }, + { + "epoch": 0.46, + "grad_norm": 3.3628557674858146, + "learning_rate": 5.881910886958214e-06, + "loss": 0.4852, + "step": 5667 + }, + { + "epoch": 0.46, + "grad_norm": 8.230373401705116, + "learning_rate": 5.880616124413698e-06, + "loss": 0.7346, + "step": 5668 + }, + { + "epoch": 0.46, + "grad_norm": 3.9566658954950733, + "learning_rate": 5.879321300924999e-06, + "loss": 0.7342, + "step": 5669 + }, + { + "epoch": 0.46, + "grad_norm": 3.6476031315904187, + "learning_rate": 5.87802641658173e-06, + "loss": 0.5692, + "step": 5670 + }, + { + "epoch": 0.46, + "grad_norm": 5.064370883783466, + "learning_rate": 5.876731471473506e-06, + "loss": 0.7567, + "step": 5671 + }, + { + "epoch": 0.46, + "grad_norm": 4.04424153330966, + "learning_rate": 5.875436465689942e-06, + "loss": 0.7693, + "step": 5672 + }, + { + "epoch": 0.46, + "grad_norm": 6.36466460642214, + "learning_rate": 5.874141399320662e-06, + "loss": 0.6407, + "step": 5673 + }, + { + "epoch": 0.46, + "grad_norm": 6.002740697211344, + "learning_rate": 5.872846272455295e-06, + "loss": 0.7776, + "step": 5674 + }, + { + "epoch": 0.46, + "grad_norm": 3.4118763626272006, + "learning_rate": 5.87155108518347e-06, + "loss": 0.6891, + "step": 5675 + }, + { + "epoch": 0.46, + "grad_norm": 3.728885451793463, + "learning_rate": 5.8702558375948206e-06, + "loss": 0.6166, + "step": 5676 + }, + { + "epoch": 0.46, + "grad_norm": 2.1139439994434004, + "learning_rate": 5.868960529778989e-06, + "loss": 0.6329, + "step": 5677 + }, + { + "epoch": 0.46, + "grad_norm": 3.940787110196132, + "learning_rate": 5.8676651618256165e-06, + "loss": 0.6884, + "step": 5678 + }, + { + "epoch": 0.46, + "grad_norm": 2.9407849870387257, + "learning_rate": 5.866369733824351e-06, + "loss": 0.6168, + "step": 5679 + }, + { + "epoch": 0.46, + "grad_norm": 3.5798745387504787, + "learning_rate": 5.865074245864846e-06, + "loss": 0.7867, + "step": 5680 + }, + { + "epoch": 0.46, + "grad_norm": 11.785772420822497, + "learning_rate": 5.863778698036755e-06, + "loss": 0.6997, + "step": 5681 + }, + { + "epoch": 0.46, + "grad_norm": 3.574881027180635, + "learning_rate": 5.862483090429739e-06, + "loss": 0.613, + "step": 5682 + }, + { + "epoch": 0.46, + "grad_norm": 4.12846554216105, + "learning_rate": 5.861187423133464e-06, + "loss": 0.6573, + "step": 5683 + }, + { + "epoch": 0.46, + "grad_norm": 4.213663798503768, + "learning_rate": 5.859891696237597e-06, + "loss": 0.739, + "step": 5684 + }, + { + "epoch": 0.46, + "grad_norm": 3.7719208030161155, + "learning_rate": 5.8585959098318105e-06, + "loss": 0.6568, + "step": 5685 + }, + { + "epoch": 0.46, + "grad_norm": 2.794899269878294, + "learning_rate": 5.8573000640057785e-06, + "loss": 0.5064, + "step": 5686 + }, + { + "epoch": 0.46, + "grad_norm": 3.7338054153210294, + "learning_rate": 5.8560041588491865e-06, + "loss": 0.7223, + "step": 5687 + }, + { + "epoch": 0.46, + "grad_norm": 3.698582594782864, + "learning_rate": 5.854708194451716e-06, + "loss": 0.5468, + "step": 5688 + }, + { + "epoch": 0.46, + "grad_norm": 3.3607318881064305, + "learning_rate": 5.853412170903055e-06, + "loss": 0.6869, + "step": 5689 + }, + { + "epoch": 0.46, + "grad_norm": 3.8200897134649816, + "learning_rate": 5.852116088292901e-06, + "loss": 0.8712, + "step": 5690 + }, + { + "epoch": 0.46, + "grad_norm": 5.952389878530989, + "learning_rate": 5.850819946710949e-06, + "loss": 0.6798, + "step": 5691 + }, + { + "epoch": 0.46, + "grad_norm": 3.712492403919604, + "learning_rate": 5.8495237462468966e-06, + "loss": 0.6828, + "step": 5692 + }, + { + "epoch": 0.46, + "grad_norm": 4.202882236676595, + "learning_rate": 5.848227486990452e-06, + "loss": 0.5487, + "step": 5693 + }, + { + "epoch": 0.46, + "grad_norm": 3.8337395248735526, + "learning_rate": 5.846931169031327e-06, + "loss": 0.5792, + "step": 5694 + }, + { + "epoch": 0.46, + "grad_norm": 10.655087579167843, + "learning_rate": 5.8456347924592295e-06, + "loss": 0.7377, + "step": 5695 + }, + { + "epoch": 0.46, + "grad_norm": 3.8491552200290187, + "learning_rate": 5.844338357363881e-06, + "loss": 0.7035, + "step": 5696 + }, + { + "epoch": 0.46, + "grad_norm": 2.586370026034477, + "learning_rate": 5.843041863835003e-06, + "loss": 0.6127, + "step": 5697 + }, + { + "epoch": 0.46, + "grad_norm": 5.799013215808532, + "learning_rate": 5.8417453119623176e-06, + "loss": 0.7516, + "step": 5698 + }, + { + "epoch": 0.46, + "grad_norm": 2.1161423026699886, + "learning_rate": 5.840448701835559e-06, + "loss": 0.5723, + "step": 5699 + }, + { + "epoch": 0.46, + "grad_norm": 3.5375681381626314, + "learning_rate": 5.839152033544457e-06, + "loss": 0.7619, + "step": 5700 + }, + { + "epoch": 0.46, + "grad_norm": 3.86569600862823, + "learning_rate": 5.8378553071787504e-06, + "loss": 0.6268, + "step": 5701 + }, + { + "epoch": 0.46, + "grad_norm": 3.522907617552013, + "learning_rate": 5.836558522828181e-06, + "loss": 0.7016, + "step": 5702 + }, + { + "epoch": 0.46, + "grad_norm": 3.16770843781035, + "learning_rate": 5.835261680582493e-06, + "loss": 0.7634, + "step": 5703 + }, + { + "epoch": 0.46, + "grad_norm": 39.16838223939189, + "learning_rate": 5.8339647805314404e-06, + "loss": 0.5516, + "step": 5704 + }, + { + "epoch": 0.46, + "grad_norm": 5.386697719590322, + "learning_rate": 5.832667822764771e-06, + "loss": 0.6797, + "step": 5705 + }, + { + "epoch": 0.46, + "grad_norm": 5.822638438817317, + "learning_rate": 5.8313708073722475e-06, + "loss": 0.6236, + "step": 5706 + }, + { + "epoch": 0.46, + "grad_norm": 3.4141997894462914, + "learning_rate": 5.8300737344436285e-06, + "loss": 0.739, + "step": 5707 + }, + { + "epoch": 0.46, + "grad_norm": 6.134334423886971, + "learning_rate": 5.828776604068682e-06, + "loss": 0.8001, + "step": 5708 + }, + { + "epoch": 0.46, + "grad_norm": 2.88565880958368, + "learning_rate": 5.827479416337174e-06, + "loss": 0.5722, + "step": 5709 + }, + { + "epoch": 0.46, + "grad_norm": 3.3064773259282565, + "learning_rate": 5.826182171338882e-06, + "loss": 0.6725, + "step": 5710 + }, + { + "epoch": 0.46, + "grad_norm": 2.962434387159482, + "learning_rate": 5.824884869163581e-06, + "loss": 0.7108, + "step": 5711 + }, + { + "epoch": 0.46, + "grad_norm": 2.504568175047801, + "learning_rate": 5.8235875099010516e-06, + "loss": 0.6987, + "step": 5712 + }, + { + "epoch": 0.46, + "grad_norm": 8.532626738523177, + "learning_rate": 5.822290093641081e-06, + "loss": 0.6987, + "step": 5713 + }, + { + "epoch": 0.46, + "grad_norm": 5.563573249454162, + "learning_rate": 5.82099262047346e-06, + "loss": 0.7139, + "step": 5714 + }, + { + "epoch": 0.46, + "grad_norm": 3.2633249747597834, + "learning_rate": 5.81969509048798e-06, + "loss": 0.7311, + "step": 5715 + }, + { + "epoch": 0.46, + "grad_norm": 3.1130594721384233, + "learning_rate": 5.818397503774438e-06, + "loss": 0.5648, + "step": 5716 + }, + { + "epoch": 0.46, + "grad_norm": 5.200537756627442, + "learning_rate": 5.817099860422637e-06, + "loss": 0.6039, + "step": 5717 + }, + { + "epoch": 0.46, + "grad_norm": 3.575690563457655, + "learning_rate": 5.815802160522379e-06, + "loss": 0.6895, + "step": 5718 + }, + { + "epoch": 0.46, + "grad_norm": 5.191206926447397, + "learning_rate": 5.814504404163474e-06, + "loss": 0.7822, + "step": 5719 + }, + { + "epoch": 0.46, + "grad_norm": 3.843853172438234, + "learning_rate": 5.813206591435739e-06, + "loss": 0.65, + "step": 5720 + }, + { + "epoch": 0.46, + "grad_norm": 3.7326265425687346, + "learning_rate": 5.8119087224289835e-06, + "loss": 0.5419, + "step": 5721 + }, + { + "epoch": 0.46, + "grad_norm": 4.738835725464466, + "learning_rate": 5.810610797233034e-06, + "loss": 0.6809, + "step": 5722 + }, + { + "epoch": 0.46, + "grad_norm": 3.6040389488624194, + "learning_rate": 5.809312815937715e-06, + "loss": 0.6499, + "step": 5723 + }, + { + "epoch": 0.46, + "grad_norm": 2.5317128008661065, + "learning_rate": 5.808014778632852e-06, + "loss": 0.7133, + "step": 5724 + }, + { + "epoch": 0.46, + "grad_norm": 2.715819480135261, + "learning_rate": 5.806716685408278e-06, + "loss": 0.7213, + "step": 5725 + }, + { + "epoch": 0.47, + "grad_norm": 5.089468218101545, + "learning_rate": 5.805418536353829e-06, + "loss": 0.7569, + "step": 5726 + }, + { + "epoch": 0.47, + "grad_norm": 3.2017915414262146, + "learning_rate": 5.804120331559349e-06, + "loss": 0.7109, + "step": 5727 + }, + { + "epoch": 0.47, + "grad_norm": 5.749780916241432, + "learning_rate": 5.802822071114676e-06, + "loss": 0.5977, + "step": 5728 + }, + { + "epoch": 0.47, + "grad_norm": 3.0991520659980245, + "learning_rate": 5.801523755109661e-06, + "loss": 0.762, + "step": 5729 + }, + { + "epoch": 0.47, + "grad_norm": 2.886788888167605, + "learning_rate": 5.8002253836341586e-06, + "loss": 0.5904, + "step": 5730 + }, + { + "epoch": 0.47, + "grad_norm": 5.08278486768797, + "learning_rate": 5.798926956778017e-06, + "loss": 0.6509, + "step": 5731 + }, + { + "epoch": 0.47, + "grad_norm": 2.667785174468762, + "learning_rate": 5.797628474631102e-06, + "loss": 0.622, + "step": 5732 + }, + { + "epoch": 0.47, + "grad_norm": 2.427148721908196, + "learning_rate": 5.796329937283274e-06, + "loss": 0.6385, + "step": 5733 + }, + { + "epoch": 0.47, + "grad_norm": 4.44670655917923, + "learning_rate": 5.795031344824399e-06, + "loss": 0.7065, + "step": 5734 + }, + { + "epoch": 0.47, + "grad_norm": 3.044007728134551, + "learning_rate": 5.79373269734435e-06, + "loss": 0.6678, + "step": 5735 + }, + { + "epoch": 0.47, + "grad_norm": 4.636121910766767, + "learning_rate": 5.792433994932999e-06, + "loss": 0.7862, + "step": 5736 + }, + { + "epoch": 0.47, + "grad_norm": 3.541543341081417, + "learning_rate": 5.791135237680228e-06, + "loss": 0.6495, + "step": 5737 + }, + { + "epoch": 0.47, + "grad_norm": 4.0246541274018, + "learning_rate": 5.7898364256759165e-06, + "loss": 0.6219, + "step": 5738 + }, + { + "epoch": 0.47, + "grad_norm": 3.1289732186735466, + "learning_rate": 5.788537559009951e-06, + "loss": 0.7909, + "step": 5739 + }, + { + "epoch": 0.47, + "grad_norm": 5.813955663416041, + "learning_rate": 5.787238637772223e-06, + "loss": 0.6082, + "step": 5740 + }, + { + "epoch": 0.47, + "grad_norm": 3.2116854166710462, + "learning_rate": 5.785939662052622e-06, + "loss": 0.7723, + "step": 5741 + }, + { + "epoch": 0.47, + "grad_norm": 2.6618735405259573, + "learning_rate": 5.784640631941048e-06, + "loss": 0.7024, + "step": 5742 + }, + { + "epoch": 0.47, + "grad_norm": 4.083561659330691, + "learning_rate": 5.783341547527403e-06, + "loss": 0.6005, + "step": 5743 + }, + { + "epoch": 0.47, + "grad_norm": 2.421357905123556, + "learning_rate": 5.782042408901589e-06, + "loss": 0.7978, + "step": 5744 + }, + { + "epoch": 0.47, + "grad_norm": 2.8161846841238853, + "learning_rate": 5.780743216153516e-06, + "loss": 0.6775, + "step": 5745 + }, + { + "epoch": 0.47, + "grad_norm": 3.904692854314484, + "learning_rate": 5.7794439693730975e-06, + "loss": 0.7986, + "step": 5746 + }, + { + "epoch": 0.47, + "grad_norm": 3.5547694305525224, + "learning_rate": 5.778144668650248e-06, + "loss": 0.5782, + "step": 5747 + }, + { + "epoch": 0.47, + "grad_norm": 3.9329544545123323, + "learning_rate": 5.776845314074889e-06, + "loss": 0.6605, + "step": 5748 + }, + { + "epoch": 0.47, + "grad_norm": 3.216768053365182, + "learning_rate": 5.775545905736942e-06, + "loss": 0.8415, + "step": 5749 + }, + { + "epoch": 0.47, + "grad_norm": 8.504597650338404, + "learning_rate": 5.774246443726336e-06, + "loss": 0.5837, + "step": 5750 + }, + { + "epoch": 0.47, + "grad_norm": 3.5881969988720246, + "learning_rate": 5.772946928133e-06, + "loss": 0.7973, + "step": 5751 + }, + { + "epoch": 0.47, + "grad_norm": 3.6856821575615926, + "learning_rate": 5.771647359046869e-06, + "loss": 0.5745, + "step": 5752 + }, + { + "epoch": 0.47, + "grad_norm": 9.574906428763217, + "learning_rate": 5.770347736557884e-06, + "loss": 0.4776, + "step": 5753 + }, + { + "epoch": 0.47, + "grad_norm": 3.7102892250448543, + "learning_rate": 5.769048060755984e-06, + "loss": 0.6283, + "step": 5754 + }, + { + "epoch": 0.47, + "grad_norm": 3.7308436260304365, + "learning_rate": 5.7677483317311164e-06, + "loss": 0.6557, + "step": 5755 + }, + { + "epoch": 0.47, + "grad_norm": 6.574500988473744, + "learning_rate": 5.766448549573229e-06, + "loss": 0.699, + "step": 5756 + }, + { + "epoch": 0.47, + "grad_norm": 5.118415093138679, + "learning_rate": 5.765148714372277e-06, + "loss": 0.6519, + "step": 5757 + }, + { + "epoch": 0.47, + "grad_norm": 2.2409897826042755, + "learning_rate": 5.7638488262182165e-06, + "loss": 0.5972, + "step": 5758 + }, + { + "epoch": 0.47, + "grad_norm": 3.6884911976600425, + "learning_rate": 5.762548885201007e-06, + "loss": 0.6959, + "step": 5759 + }, + { + "epoch": 0.47, + "grad_norm": 5.139632734738237, + "learning_rate": 5.761248891410613e-06, + "loss": 0.8107, + "step": 5760 + }, + { + "epoch": 0.47, + "grad_norm": 3.025660807293486, + "learning_rate": 5.7599488449370025e-06, + "loss": 0.5237, + "step": 5761 + }, + { + "epoch": 0.47, + "grad_norm": 3.093683102846561, + "learning_rate": 5.758648745870147e-06, + "loss": 0.5644, + "step": 5762 + }, + { + "epoch": 0.47, + "grad_norm": 6.919776825753386, + "learning_rate": 5.757348594300021e-06, + "loss": 0.6525, + "step": 5763 + }, + { + "epoch": 0.47, + "grad_norm": 3.777505815189163, + "learning_rate": 5.7560483903166065e-06, + "loss": 0.66, + "step": 5764 + }, + { + "epoch": 0.47, + "grad_norm": 2.3929340656365445, + "learning_rate": 5.75474813400988e-06, + "loss": 0.7158, + "step": 5765 + }, + { + "epoch": 0.47, + "grad_norm": 4.137709474719427, + "learning_rate": 5.75344782546983e-06, + "loss": 0.7297, + "step": 5766 + }, + { + "epoch": 0.47, + "grad_norm": 3.3866960714223273, + "learning_rate": 5.752147464786449e-06, + "loss": 0.579, + "step": 5767 + }, + { + "epoch": 0.47, + "grad_norm": 3.131827846944271, + "learning_rate": 5.750847052049725e-06, + "loss": 0.5822, + "step": 5768 + }, + { + "epoch": 0.47, + "grad_norm": 2.9317651863324223, + "learning_rate": 5.749546587349657e-06, + "loss": 0.7502, + "step": 5769 + }, + { + "epoch": 0.47, + "grad_norm": 3.092554287942367, + "learning_rate": 5.748246070776248e-06, + "loss": 0.7726, + "step": 5770 + }, + { + "epoch": 0.47, + "grad_norm": 2.758893373931066, + "learning_rate": 5.746945502419497e-06, + "loss": 0.7071, + "step": 5771 + }, + { + "epoch": 0.47, + "grad_norm": 3.0261085238691052, + "learning_rate": 5.745644882369417e-06, + "loss": 0.8018, + "step": 5772 + }, + { + "epoch": 0.47, + "grad_norm": 4.276685473006278, + "learning_rate": 5.744344210716015e-06, + "loss": 0.8957, + "step": 5773 + }, + { + "epoch": 0.47, + "grad_norm": 5.507044419375576, + "learning_rate": 5.743043487549306e-06, + "loss": 0.7051, + "step": 5774 + }, + { + "epoch": 0.47, + "grad_norm": 3.9443991174452244, + "learning_rate": 5.741742712959308e-06, + "loss": 0.7576, + "step": 5775 + }, + { + "epoch": 0.47, + "grad_norm": 2.8144654746747593, + "learning_rate": 5.740441887036046e-06, + "loss": 0.7318, + "step": 5776 + }, + { + "epoch": 0.47, + "grad_norm": 4.22064613606551, + "learning_rate": 5.7391410098695435e-06, + "loss": 0.7537, + "step": 5777 + }, + { + "epoch": 0.47, + "grad_norm": 4.186332948345009, + "learning_rate": 5.737840081549827e-06, + "loss": 0.7192, + "step": 5778 + }, + { + "epoch": 0.47, + "grad_norm": 5.5055057345079526, + "learning_rate": 5.736539102166934e-06, + "loss": 0.6411, + "step": 5779 + }, + { + "epoch": 0.47, + "grad_norm": 3.101918992158709, + "learning_rate": 5.7352380718108954e-06, + "loss": 0.7521, + "step": 5780 + }, + { + "epoch": 0.47, + "grad_norm": 4.845211240583912, + "learning_rate": 5.733936990571752e-06, + "loss": 0.57, + "step": 5781 + }, + { + "epoch": 0.47, + "grad_norm": 2.202408629073388, + "learning_rate": 5.732635858539549e-06, + "loss": 0.5978, + "step": 5782 + }, + { + "epoch": 0.47, + "grad_norm": 3.586983445724865, + "learning_rate": 5.731334675804332e-06, + "loss": 0.7098, + "step": 5783 + }, + { + "epoch": 0.47, + "grad_norm": 3.6435676348008093, + "learning_rate": 5.730033442456149e-06, + "loss": 0.6209, + "step": 5784 + }, + { + "epoch": 0.47, + "grad_norm": 52.095506540077075, + "learning_rate": 5.728732158585056e-06, + "loss": 0.5997, + "step": 5785 + }, + { + "epoch": 0.47, + "grad_norm": 2.6864175949424713, + "learning_rate": 5.7274308242811095e-06, + "loss": 0.8797, + "step": 5786 + }, + { + "epoch": 0.47, + "grad_norm": 5.523806327719223, + "learning_rate": 5.726129439634369e-06, + "loss": 0.7574, + "step": 5787 + }, + { + "epoch": 0.47, + "grad_norm": 2.3292710743903715, + "learning_rate": 5.7248280047348995e-06, + "loss": 0.7031, + "step": 5788 + }, + { + "epoch": 0.47, + "grad_norm": 3.7397484215912056, + "learning_rate": 5.7235265196727674e-06, + "loss": 0.7593, + "step": 5789 + }, + { + "epoch": 0.47, + "grad_norm": 3.1115980186534045, + "learning_rate": 5.722224984538046e-06, + "loss": 0.7853, + "step": 5790 + }, + { + "epoch": 0.47, + "grad_norm": 12.085753514511365, + "learning_rate": 5.720923399420807e-06, + "loss": 0.6605, + "step": 5791 + }, + { + "epoch": 0.47, + "grad_norm": 2.4791833487839816, + "learning_rate": 5.7196217644111295e-06, + "loss": 0.8459, + "step": 5792 + }, + { + "epoch": 0.47, + "grad_norm": 4.757496764992668, + "learning_rate": 5.718320079599096e-06, + "loss": 0.8126, + "step": 5793 + }, + { + "epoch": 0.47, + "grad_norm": 2.635265762213373, + "learning_rate": 5.717018345074788e-06, + "loss": 0.6242, + "step": 5794 + }, + { + "epoch": 0.47, + "grad_norm": 2.981756047084814, + "learning_rate": 5.715716560928297e-06, + "loss": 0.6362, + "step": 5795 + }, + { + "epoch": 0.47, + "grad_norm": 12.628437255206284, + "learning_rate": 5.714414727249714e-06, + "loss": 0.6901, + "step": 5796 + }, + { + "epoch": 0.47, + "grad_norm": 2.713401817505127, + "learning_rate": 5.713112844129133e-06, + "loss": 0.6986, + "step": 5797 + }, + { + "epoch": 0.47, + "grad_norm": 2.9097784367553396, + "learning_rate": 5.7118109116566525e-06, + "loss": 0.7297, + "step": 5798 + }, + { + "epoch": 0.47, + "grad_norm": 2.506125793971184, + "learning_rate": 5.710508929922376e-06, + "loss": 0.7954, + "step": 5799 + }, + { + "epoch": 0.47, + "grad_norm": 3.3120021207626227, + "learning_rate": 5.709206899016407e-06, + "loss": 0.7786, + "step": 5800 + }, + { + "epoch": 0.47, + "grad_norm": 3.39427128004559, + "learning_rate": 5.707904819028856e-06, + "loss": 0.5329, + "step": 5801 + }, + { + "epoch": 0.47, + "grad_norm": 9.124215767789128, + "learning_rate": 5.706602690049832e-06, + "loss": 0.8379, + "step": 5802 + }, + { + "epoch": 0.47, + "grad_norm": 2.767818652394017, + "learning_rate": 5.705300512169455e-06, + "loss": 0.6424, + "step": 5803 + }, + { + "epoch": 0.47, + "grad_norm": 7.092040605087711, + "learning_rate": 5.703998285477842e-06, + "loss": 0.6699, + "step": 5804 + }, + { + "epoch": 0.47, + "grad_norm": 5.641383501817107, + "learning_rate": 5.702696010065113e-06, + "loss": 0.7998, + "step": 5805 + }, + { + "epoch": 0.47, + "grad_norm": 3.9241281435525592, + "learning_rate": 5.701393686021397e-06, + "loss": 0.8518, + "step": 5806 + }, + { + "epoch": 0.47, + "grad_norm": 5.942381314047857, + "learning_rate": 5.70009131343682e-06, + "loss": 0.6687, + "step": 5807 + }, + { + "epoch": 0.47, + "grad_norm": 5.775864712214542, + "learning_rate": 5.698788892401517e-06, + "loss": 0.5846, + "step": 5808 + }, + { + "epoch": 0.47, + "grad_norm": 2.3743417727785117, + "learning_rate": 5.697486423005621e-06, + "loss": 0.8248, + "step": 5809 + }, + { + "epoch": 0.47, + "grad_norm": 6.809716475709886, + "learning_rate": 5.696183905339277e-06, + "loss": 0.6891, + "step": 5810 + }, + { + "epoch": 0.47, + "grad_norm": 4.489926325140084, + "learning_rate": 5.69488133949262e-06, + "loss": 0.6153, + "step": 5811 + }, + { + "epoch": 0.47, + "grad_norm": 3.7573023779335295, + "learning_rate": 5.693578725555799e-06, + "loss": 0.6711, + "step": 5812 + }, + { + "epoch": 0.47, + "grad_norm": 2.2195998452821923, + "learning_rate": 5.692276063618964e-06, + "loss": 0.8454, + "step": 5813 + }, + { + "epoch": 0.47, + "grad_norm": 2.342198622077043, + "learning_rate": 5.690973353772267e-06, + "loss": 0.7084, + "step": 5814 + }, + { + "epoch": 0.47, + "grad_norm": 4.033431825319062, + "learning_rate": 5.689670596105861e-06, + "loss": 0.6825, + "step": 5815 + }, + { + "epoch": 0.47, + "grad_norm": 3.8298968725001346, + "learning_rate": 5.688367790709909e-06, + "loss": 0.7143, + "step": 5816 + }, + { + "epoch": 0.47, + "grad_norm": 3.1328692285414887, + "learning_rate": 5.6870649376745714e-06, + "loss": 0.7916, + "step": 5817 + }, + { + "epoch": 0.47, + "grad_norm": 5.267488521284832, + "learning_rate": 5.685762037090013e-06, + "loss": 0.8928, + "step": 5818 + }, + { + "epoch": 0.47, + "grad_norm": 2.775404435961653, + "learning_rate": 5.6844590890464035e-06, + "loss": 0.838, + "step": 5819 + }, + { + "epoch": 0.47, + "grad_norm": 2.7889921218877536, + "learning_rate": 5.683156093633917e-06, + "loss": 0.5996, + "step": 5820 + }, + { + "epoch": 0.47, + "grad_norm": 2.6694801130527464, + "learning_rate": 5.681853050942727e-06, + "loss": 0.6881, + "step": 5821 + }, + { + "epoch": 0.47, + "grad_norm": 4.764167522320337, + "learning_rate": 5.680549961063011e-06, + "loss": 0.717, + "step": 5822 + }, + { + "epoch": 0.47, + "grad_norm": 4.175036039467229, + "learning_rate": 5.679246824084955e-06, + "loss": 0.8186, + "step": 5823 + }, + { + "epoch": 0.47, + "grad_norm": 3.8674061887972915, + "learning_rate": 5.67794364009874e-06, + "loss": 0.6814, + "step": 5824 + }, + { + "epoch": 0.47, + "grad_norm": 2.7367872694303803, + "learning_rate": 5.676640409194556e-06, + "loss": 0.7518, + "step": 5825 + }, + { + "epoch": 0.47, + "grad_norm": 6.3879362644659246, + "learning_rate": 5.6753371314625975e-06, + "loss": 0.8068, + "step": 5826 + }, + { + "epoch": 0.47, + "grad_norm": 7.60863794393657, + "learning_rate": 5.674033806993056e-06, + "loss": 0.7635, + "step": 5827 + }, + { + "epoch": 0.47, + "grad_norm": 3.0033696247012314, + "learning_rate": 5.6727304358761305e-06, + "loss": 0.8091, + "step": 5828 + }, + { + "epoch": 0.47, + "grad_norm": 2.844480632353817, + "learning_rate": 5.671427018202023e-06, + "loss": 0.6503, + "step": 5829 + }, + { + "epoch": 0.47, + "grad_norm": 5.593753465684358, + "learning_rate": 5.6701235540609405e-06, + "loss": 0.6583, + "step": 5830 + }, + { + "epoch": 0.47, + "grad_norm": 2.714439462757601, + "learning_rate": 5.668820043543085e-06, + "loss": 0.508, + "step": 5831 + }, + { + "epoch": 0.47, + "grad_norm": 5.760072604711219, + "learning_rate": 5.667516486738672e-06, + "loss": 0.7247, + "step": 5832 + }, + { + "epoch": 0.47, + "grad_norm": 3.1566847002301386, + "learning_rate": 5.666212883737917e-06, + "loss": 0.6605, + "step": 5833 + }, + { + "epoch": 0.47, + "grad_norm": 3.836457457320009, + "learning_rate": 5.6649092346310345e-06, + "loss": 0.5013, + "step": 5834 + }, + { + "epoch": 0.47, + "grad_norm": 3.756194917486008, + "learning_rate": 5.663605539508245e-06, + "loss": 0.6346, + "step": 5835 + }, + { + "epoch": 0.47, + "grad_norm": 3.902661297739872, + "learning_rate": 5.662301798459777e-06, + "loss": 0.5987, + "step": 5836 + }, + { + "epoch": 0.47, + "grad_norm": 3.0792162104007725, + "learning_rate": 5.660998011575853e-06, + "loss": 0.7415, + "step": 5837 + }, + { + "epoch": 0.47, + "grad_norm": 11.264493018311287, + "learning_rate": 5.659694178946704e-06, + "loss": 0.6967, + "step": 5838 + }, + { + "epoch": 0.47, + "grad_norm": 3.1837744508695485, + "learning_rate": 5.658390300662565e-06, + "loss": 0.6992, + "step": 5839 + }, + { + "epoch": 0.47, + "grad_norm": 1.9449491940769168, + "learning_rate": 5.657086376813671e-06, + "loss": 0.5513, + "step": 5840 + }, + { + "epoch": 0.47, + "grad_norm": 2.6543461166200886, + "learning_rate": 5.655782407490261e-06, + "loss": 0.6874, + "step": 5841 + }, + { + "epoch": 0.47, + "grad_norm": 6.906352367049715, + "learning_rate": 5.65447839278258e-06, + "loss": 0.7457, + "step": 5842 + }, + { + "epoch": 0.47, + "grad_norm": 2.289287569957791, + "learning_rate": 5.653174332780874e-06, + "loss": 0.6938, + "step": 5843 + }, + { + "epoch": 0.47, + "grad_norm": 3.354614906967554, + "learning_rate": 5.651870227575391e-06, + "loss": 0.6776, + "step": 5844 + }, + { + "epoch": 0.47, + "grad_norm": 4.200319846334159, + "learning_rate": 5.650566077256385e-06, + "loss": 0.6575, + "step": 5845 + }, + { + "epoch": 0.47, + "grad_norm": 2.2160457331030736, + "learning_rate": 5.64926188191411e-06, + "loss": 0.6131, + "step": 5846 + }, + { + "epoch": 0.47, + "grad_norm": 5.722072076925793, + "learning_rate": 5.647957641638823e-06, + "loss": 0.7048, + "step": 5847 + }, + { + "epoch": 0.47, + "grad_norm": 2.3342859862134913, + "learning_rate": 5.646653356520788e-06, + "loss": 0.6848, + "step": 5848 + }, + { + "epoch": 0.48, + "grad_norm": 5.186189717786647, + "learning_rate": 5.6453490266502695e-06, + "loss": 0.7614, + "step": 5849 + }, + { + "epoch": 0.48, + "grad_norm": 11.985845275655757, + "learning_rate": 5.644044652117534e-06, + "loss": 0.7533, + "step": 5850 + }, + { + "epoch": 0.48, + "grad_norm": 3.603170116008523, + "learning_rate": 5.642740233012854e-06, + "loss": 0.5888, + "step": 5851 + }, + { + "epoch": 0.48, + "grad_norm": 3.3150955015618258, + "learning_rate": 5.6414357694265035e-06, + "loss": 0.6464, + "step": 5852 + }, + { + "epoch": 0.48, + "grad_norm": 20.751550720352387, + "learning_rate": 5.640131261448758e-06, + "loss": 0.8277, + "step": 5853 + }, + { + "epoch": 0.48, + "grad_norm": 2.7856775008548214, + "learning_rate": 5.638826709169899e-06, + "loss": 0.7196, + "step": 5854 + }, + { + "epoch": 0.48, + "grad_norm": 7.893554997978029, + "learning_rate": 5.6375221126802085e-06, + "loss": 0.8021, + "step": 5855 + }, + { + "epoch": 0.48, + "grad_norm": 2.36359730894712, + "learning_rate": 5.6362174720699744e-06, + "loss": 0.6393, + "step": 5856 + }, + { + "epoch": 0.48, + "grad_norm": 3.2477486501315624, + "learning_rate": 5.6349127874294855e-06, + "loss": 0.6184, + "step": 5857 + }, + { + "epoch": 0.48, + "grad_norm": 4.95608148530001, + "learning_rate": 5.633608058849033e-06, + "loss": 0.6515, + "step": 5858 + }, + { + "epoch": 0.48, + "grad_norm": 3.517205513420182, + "learning_rate": 5.632303286418914e-06, + "loss": 0.6956, + "step": 5859 + }, + { + "epoch": 0.48, + "grad_norm": 3.1204648364844285, + "learning_rate": 5.630998470229426e-06, + "loss": 0.7374, + "step": 5860 + }, + { + "epoch": 0.48, + "grad_norm": 2.444126220884048, + "learning_rate": 5.6296936103708725e-06, + "loss": 0.8583, + "step": 5861 + }, + { + "epoch": 0.48, + "grad_norm": 5.799268369836265, + "learning_rate": 5.6283887069335545e-06, + "loss": 0.607, + "step": 5862 + }, + { + "epoch": 0.48, + "grad_norm": 5.686337377886992, + "learning_rate": 5.627083760007781e-06, + "loss": 0.6211, + "step": 5863 + }, + { + "epoch": 0.48, + "grad_norm": 9.458711487683296, + "learning_rate": 5.625778769683863e-06, + "loss": 0.7429, + "step": 5864 + }, + { + "epoch": 0.48, + "grad_norm": 2.4203032363584414, + "learning_rate": 5.624473736052114e-06, + "loss": 0.6259, + "step": 5865 + }, + { + "epoch": 0.48, + "grad_norm": 3.235019674443303, + "learning_rate": 5.623168659202851e-06, + "loss": 0.6063, + "step": 5866 + }, + { + "epoch": 0.48, + "grad_norm": 2.8412022685056186, + "learning_rate": 5.621863539226394e-06, + "loss": 0.7087, + "step": 5867 + }, + { + "epoch": 0.48, + "grad_norm": 3.7249508712676227, + "learning_rate": 5.620558376213063e-06, + "loss": 0.6629, + "step": 5868 + }, + { + "epoch": 0.48, + "grad_norm": 4.280655766093252, + "learning_rate": 5.619253170253185e-06, + "loss": 0.7991, + "step": 5869 + }, + { + "epoch": 0.48, + "grad_norm": 4.977627007259665, + "learning_rate": 5.617947921437089e-06, + "loss": 0.7147, + "step": 5870 + }, + { + "epoch": 0.48, + "grad_norm": 2.7581014240796304, + "learning_rate": 5.616642629855106e-06, + "loss": 0.7042, + "step": 5871 + }, + { + "epoch": 0.48, + "grad_norm": 4.559611928582002, + "learning_rate": 5.61533729559757e-06, + "loss": 0.7254, + "step": 5872 + }, + { + "epoch": 0.48, + "grad_norm": 5.805096582345258, + "learning_rate": 5.614031918754819e-06, + "loss": 0.709, + "step": 5873 + }, + { + "epoch": 0.48, + "grad_norm": 2.57651207241554, + "learning_rate": 5.612726499417192e-06, + "loss": 0.7506, + "step": 5874 + }, + { + "epoch": 0.48, + "grad_norm": 6.316357147709202, + "learning_rate": 5.611421037675034e-06, + "loss": 0.7437, + "step": 5875 + }, + { + "epoch": 0.48, + "grad_norm": 4.295406839311556, + "learning_rate": 5.61011553361869e-06, + "loss": 0.6881, + "step": 5876 + }, + { + "epoch": 0.48, + "grad_norm": 4.50491882343317, + "learning_rate": 5.60880998733851e-06, + "loss": 0.7941, + "step": 5877 + }, + { + "epoch": 0.48, + "grad_norm": 2.5192592881599274, + "learning_rate": 5.607504398924845e-06, + "loss": 0.6002, + "step": 5878 + }, + { + "epoch": 0.48, + "grad_norm": 4.2060583507968285, + "learning_rate": 5.6061987684680505e-06, + "loss": 0.8541, + "step": 5879 + }, + { + "epoch": 0.48, + "grad_norm": 8.094412711160727, + "learning_rate": 5.604893096058485e-06, + "loss": 0.6542, + "step": 5880 + }, + { + "epoch": 0.48, + "grad_norm": 3.082942997686426, + "learning_rate": 5.603587381786506e-06, + "loss": 0.7981, + "step": 5881 + }, + { + "epoch": 0.48, + "grad_norm": 2.9078782011293978, + "learning_rate": 5.602281625742481e-06, + "loss": 0.7507, + "step": 5882 + }, + { + "epoch": 0.48, + "grad_norm": 3.0674044553806272, + "learning_rate": 5.6009758280167766e-06, + "loss": 0.6959, + "step": 5883 + }, + { + "epoch": 0.48, + "grad_norm": 3.0767626224295093, + "learning_rate": 5.599669988699761e-06, + "loss": 0.8676, + "step": 5884 + }, + { + "epoch": 0.48, + "grad_norm": 4.979890605664801, + "learning_rate": 5.598364107881805e-06, + "loss": 0.5457, + "step": 5885 + }, + { + "epoch": 0.48, + "grad_norm": 3.0687384117851146, + "learning_rate": 5.5970581856532864e-06, + "loss": 0.6336, + "step": 5886 + }, + { + "epoch": 0.48, + "grad_norm": 4.0456114298189805, + "learning_rate": 5.59575222210458e-06, + "loss": 0.6126, + "step": 5887 + }, + { + "epoch": 0.48, + "grad_norm": 2.947721791720395, + "learning_rate": 5.594446217326069e-06, + "loss": 0.658, + "step": 5888 + }, + { + "epoch": 0.48, + "grad_norm": 5.111469998302074, + "learning_rate": 5.5931401714081394e-06, + "loss": 0.7067, + "step": 5889 + }, + { + "epoch": 0.48, + "grad_norm": 3.6139775674907115, + "learning_rate": 5.591834084441172e-06, + "loss": 0.8245, + "step": 5890 + }, + { + "epoch": 0.48, + "grad_norm": 2.258327322262662, + "learning_rate": 5.590527956515561e-06, + "loss": 0.586, + "step": 5891 + }, + { + "epoch": 0.48, + "grad_norm": 2.6216332873241366, + "learning_rate": 5.589221787721697e-06, + "loss": 0.5769, + "step": 5892 + }, + { + "epoch": 0.48, + "grad_norm": 2.4184769875681, + "learning_rate": 5.587915578149976e-06, + "loss": 0.688, + "step": 5893 + }, + { + "epoch": 0.48, + "grad_norm": 4.481081724640225, + "learning_rate": 5.586609327890794e-06, + "loss": 0.7078, + "step": 5894 + }, + { + "epoch": 0.48, + "grad_norm": 2.7904220652557927, + "learning_rate": 5.585303037034553e-06, + "loss": 0.6956, + "step": 5895 + }, + { + "epoch": 0.48, + "grad_norm": 10.232501237702206, + "learning_rate": 5.583996705671657e-06, + "loss": 0.6296, + "step": 5896 + }, + { + "epoch": 0.48, + "grad_norm": 5.4213543800496495, + "learning_rate": 5.582690333892512e-06, + "loss": 0.6377, + "step": 5897 + }, + { + "epoch": 0.48, + "grad_norm": 4.945174500045023, + "learning_rate": 5.5813839217875256e-06, + "loss": 0.8252, + "step": 5898 + }, + { + "epoch": 0.48, + "grad_norm": 3.236768986051341, + "learning_rate": 5.580077469447113e-06, + "loss": 0.7032, + "step": 5899 + }, + { + "epoch": 0.48, + "grad_norm": 4.890413970724716, + "learning_rate": 5.578770976961685e-06, + "loss": 0.6829, + "step": 5900 + }, + { + "epoch": 0.48, + "grad_norm": 2.839054836239624, + "learning_rate": 5.577464444421663e-06, + "loss": 0.6844, + "step": 5901 + }, + { + "epoch": 0.48, + "grad_norm": 13.799819444372186, + "learning_rate": 5.576157871917466e-06, + "loss": 0.7103, + "step": 5902 + }, + { + "epoch": 0.48, + "grad_norm": 3.2395469457710413, + "learning_rate": 5.574851259539514e-06, + "loss": 0.6671, + "step": 5903 + }, + { + "epoch": 0.48, + "grad_norm": 5.183754193338998, + "learning_rate": 5.5735446073782364e-06, + "loss": 0.633, + "step": 5904 + }, + { + "epoch": 0.48, + "grad_norm": 2.2693595688826447, + "learning_rate": 5.57223791552406e-06, + "loss": 0.7479, + "step": 5905 + }, + { + "epoch": 0.48, + "grad_norm": 3.809400747400824, + "learning_rate": 5.570931184067419e-06, + "loss": 0.6191, + "step": 5906 + }, + { + "epoch": 0.48, + "grad_norm": 8.718811649677951, + "learning_rate": 5.569624413098742e-06, + "loss": 0.5709, + "step": 5907 + }, + { + "epoch": 0.48, + "grad_norm": 2.823261171360906, + "learning_rate": 5.568317602708471e-06, + "loss": 0.6983, + "step": 5908 + }, + { + "epoch": 0.48, + "grad_norm": 3.0810277534129784, + "learning_rate": 5.5670107529870435e-06, + "loss": 0.6364, + "step": 5909 + }, + { + "epoch": 0.48, + "grad_norm": 3.784356676946425, + "learning_rate": 5.5657038640249015e-06, + "loss": 0.8816, + "step": 5910 + }, + { + "epoch": 0.48, + "grad_norm": 2.9357366168242316, + "learning_rate": 5.564396935912489e-06, + "loss": 0.6416, + "step": 5911 + }, + { + "epoch": 0.48, + "grad_norm": 5.59437008935768, + "learning_rate": 5.563089968740257e-06, + "loss": 0.6477, + "step": 5912 + }, + { + "epoch": 0.48, + "grad_norm": 4.05265046492426, + "learning_rate": 5.561782962598652e-06, + "loss": 0.751, + "step": 5913 + }, + { + "epoch": 0.48, + "grad_norm": 3.7789169084784286, + "learning_rate": 5.560475917578129e-06, + "loss": 0.8523, + "step": 5914 + }, + { + "epoch": 0.48, + "grad_norm": 3.472931041079459, + "learning_rate": 5.5591688337691415e-06, + "loss": 0.7577, + "step": 5915 + }, + { + "epoch": 0.48, + "grad_norm": 2.86623853096685, + "learning_rate": 5.557861711262154e-06, + "loss": 0.7382, + "step": 5916 + }, + { + "epoch": 0.48, + "grad_norm": 2.3072738925850746, + "learning_rate": 5.556554550147622e-06, + "loss": 0.6447, + "step": 5917 + }, + { + "epoch": 0.48, + "grad_norm": 3.990543035444797, + "learning_rate": 5.555247350516009e-06, + "loss": 0.6523, + "step": 5918 + }, + { + "epoch": 0.48, + "grad_norm": 3.154047938465092, + "learning_rate": 5.553940112457785e-06, + "loss": 0.7879, + "step": 5919 + }, + { + "epoch": 0.48, + "grad_norm": 4.704000288403946, + "learning_rate": 5.552632836063417e-06, + "loss": 0.6051, + "step": 5920 + }, + { + "epoch": 0.48, + "grad_norm": 4.618849387675572, + "learning_rate": 5.551325521423375e-06, + "loss": 0.5893, + "step": 5921 + }, + { + "epoch": 0.48, + "grad_norm": 3.3307892077234333, + "learning_rate": 5.5500181686281385e-06, + "loss": 0.5814, + "step": 5922 + }, + { + "epoch": 0.48, + "grad_norm": 6.580077393821343, + "learning_rate": 5.54871077776818e-06, + "loss": 0.8306, + "step": 5923 + }, + { + "epoch": 0.48, + "grad_norm": 2.5577958689738374, + "learning_rate": 5.54740334893398e-06, + "loss": 0.7353, + "step": 5924 + }, + { + "epoch": 0.48, + "grad_norm": 2.470728867004697, + "learning_rate": 5.546095882216024e-06, + "loss": 0.6548, + "step": 5925 + }, + { + "epoch": 0.48, + "grad_norm": 3.221893565064767, + "learning_rate": 5.544788377704793e-06, + "loss": 0.7324, + "step": 5926 + }, + { + "epoch": 0.48, + "grad_norm": 3.061110244897309, + "learning_rate": 5.5434808354907755e-06, + "loss": 0.7466, + "step": 5927 + }, + { + "epoch": 0.48, + "grad_norm": 2.306336398779051, + "learning_rate": 5.542173255664463e-06, + "loss": 0.6855, + "step": 5928 + }, + { + "epoch": 0.48, + "grad_norm": 4.335874514294128, + "learning_rate": 5.540865638316346e-06, + "loss": 0.6961, + "step": 5929 + }, + { + "epoch": 0.48, + "grad_norm": 3.728366268817357, + "learning_rate": 5.539557983536923e-06, + "loss": 0.7839, + "step": 5930 + }, + { + "epoch": 0.48, + "grad_norm": 13.726773639026964, + "learning_rate": 5.538250291416688e-06, + "loss": 0.5286, + "step": 5931 + }, + { + "epoch": 0.48, + "grad_norm": 2.3226998014425617, + "learning_rate": 5.536942562046146e-06, + "loss": 0.7185, + "step": 5932 + }, + { + "epoch": 0.48, + "grad_norm": 3.8459872276585205, + "learning_rate": 5.5356347955157974e-06, + "loss": 0.7207, + "step": 5933 + }, + { + "epoch": 0.48, + "grad_norm": 3.6144426264945895, + "learning_rate": 5.534326991916148e-06, + "loss": 0.7287, + "step": 5934 + }, + { + "epoch": 0.48, + "grad_norm": 4.3330663098256945, + "learning_rate": 5.533019151337706e-06, + "loss": 0.6706, + "step": 5935 + }, + { + "epoch": 0.48, + "grad_norm": 2.6126678893466706, + "learning_rate": 5.531711273870983e-06, + "loss": 0.5147, + "step": 5936 + }, + { + "epoch": 0.48, + "grad_norm": 3.2163950361360265, + "learning_rate": 5.530403359606492e-06, + "loss": 0.6117, + "step": 5937 + }, + { + "epoch": 0.48, + "grad_norm": 3.5989766690324925, + "learning_rate": 5.529095408634748e-06, + "loss": 0.6645, + "step": 5938 + }, + { + "epoch": 0.48, + "grad_norm": 13.307671522400634, + "learning_rate": 5.5277874210462715e-06, + "loss": 0.7469, + "step": 5939 + }, + { + "epoch": 0.48, + "grad_norm": 2.1912407384381645, + "learning_rate": 5.526479396931581e-06, + "loss": 0.6796, + "step": 5940 + }, + { + "epoch": 0.48, + "grad_norm": 2.6386397382828455, + "learning_rate": 5.525171336381202e-06, + "loss": 0.6136, + "step": 5941 + }, + { + "epoch": 0.48, + "grad_norm": 5.618606315080148, + "learning_rate": 5.523863239485661e-06, + "loss": 0.6268, + "step": 5942 + }, + { + "epoch": 0.48, + "grad_norm": 4.615189357811246, + "learning_rate": 5.522555106335483e-06, + "loss": 0.7704, + "step": 5943 + }, + { + "epoch": 0.48, + "grad_norm": 4.308420725968467, + "learning_rate": 5.521246937021202e-06, + "loss": 0.6504, + "step": 5944 + }, + { + "epoch": 0.48, + "grad_norm": 3.340424981487809, + "learning_rate": 5.5199387316333505e-06, + "loss": 0.6256, + "step": 5945 + }, + { + "epoch": 0.48, + "grad_norm": 2.6633058103731604, + "learning_rate": 5.518630490262467e-06, + "loss": 0.6516, + "step": 5946 + }, + { + "epoch": 0.48, + "grad_norm": 3.923711951305693, + "learning_rate": 5.517322212999086e-06, + "loss": 0.6793, + "step": 5947 + }, + { + "epoch": 0.48, + "grad_norm": 13.994057347754657, + "learning_rate": 5.516013899933751e-06, + "loss": 0.6655, + "step": 5948 + }, + { + "epoch": 0.48, + "grad_norm": 2.2254891895525177, + "learning_rate": 5.514705551157005e-06, + "loss": 0.7737, + "step": 5949 + }, + { + "epoch": 0.48, + "grad_norm": 2.6160184770571124, + "learning_rate": 5.513397166759395e-06, + "loss": 0.6703, + "step": 5950 + }, + { + "epoch": 0.48, + "grad_norm": 6.331575171926444, + "learning_rate": 5.512088746831468e-06, + "loss": 0.6071, + "step": 5951 + }, + { + "epoch": 0.48, + "grad_norm": 2.893743025128155, + "learning_rate": 5.5107802914637755e-06, + "loss": 0.7219, + "step": 5952 + }, + { + "epoch": 0.48, + "grad_norm": 4.323022199214157, + "learning_rate": 5.509471800746869e-06, + "loss": 0.7423, + "step": 5953 + }, + { + "epoch": 0.48, + "grad_norm": 4.0118236304013095, + "learning_rate": 5.508163274771308e-06, + "loss": 0.7294, + "step": 5954 + }, + { + "epoch": 0.48, + "grad_norm": 2.7014311816328047, + "learning_rate": 5.506854713627647e-06, + "loss": 0.5668, + "step": 5955 + }, + { + "epoch": 0.48, + "grad_norm": 2.759251065518076, + "learning_rate": 5.505546117406449e-06, + "loss": 0.5532, + "step": 5956 + }, + { + "epoch": 0.48, + "grad_norm": 2.928258761434138, + "learning_rate": 5.504237486198277e-06, + "loss": 0.6747, + "step": 5957 + }, + { + "epoch": 0.48, + "grad_norm": 3.4900605864103698, + "learning_rate": 5.502928820093696e-06, + "loss": 0.5736, + "step": 5958 + }, + { + "epoch": 0.48, + "grad_norm": 3.178825155160367, + "learning_rate": 5.501620119183275e-06, + "loss": 0.6252, + "step": 5959 + }, + { + "epoch": 0.48, + "grad_norm": 3.8810609414550976, + "learning_rate": 5.5003113835575814e-06, + "loss": 0.6751, + "step": 5960 + }, + { + "epoch": 0.48, + "grad_norm": 2.8704395058853334, + "learning_rate": 5.49900261330719e-06, + "loss": 0.7266, + "step": 5961 + }, + { + "epoch": 0.48, + "grad_norm": 2.6572731339543605, + "learning_rate": 5.497693808522677e-06, + "loss": 0.7251, + "step": 5962 + }, + { + "epoch": 0.48, + "grad_norm": 3.5707394175249516, + "learning_rate": 5.496384969294617e-06, + "loss": 0.6439, + "step": 5963 + }, + { + "epoch": 0.48, + "grad_norm": 3.5323863202965082, + "learning_rate": 5.4950760957135926e-06, + "loss": 0.6789, + "step": 5964 + }, + { + "epoch": 0.48, + "grad_norm": 2.718152958044197, + "learning_rate": 5.493767187870186e-06, + "loss": 0.8317, + "step": 5965 + }, + { + "epoch": 0.48, + "grad_norm": 3.3431843202261002, + "learning_rate": 5.49245824585498e-06, + "loss": 0.7107, + "step": 5966 + }, + { + "epoch": 0.48, + "grad_norm": 3.4348020570989415, + "learning_rate": 5.4911492697585635e-06, + "loss": 0.702, + "step": 5967 + }, + { + "epoch": 0.48, + "grad_norm": 2.5964222985249323, + "learning_rate": 5.489840259671523e-06, + "loss": 0.8075, + "step": 5968 + }, + { + "epoch": 0.48, + "grad_norm": 2.5796613553352405, + "learning_rate": 5.488531215684454e-06, + "loss": 0.5849, + "step": 5969 + }, + { + "epoch": 0.48, + "grad_norm": 2.300747308193322, + "learning_rate": 5.487222137887949e-06, + "loss": 0.6931, + "step": 5970 + }, + { + "epoch": 0.48, + "grad_norm": 2.2129022847530857, + "learning_rate": 5.485913026372602e-06, + "loss": 0.505, + "step": 5971 + }, + { + "epoch": 0.49, + "grad_norm": 3.4725853577981907, + "learning_rate": 5.484603881229017e-06, + "loss": 0.759, + "step": 5972 + }, + { + "epoch": 0.49, + "grad_norm": 5.162933715838699, + "learning_rate": 5.48329470254779e-06, + "loss": 0.7611, + "step": 5973 + }, + { + "epoch": 0.49, + "grad_norm": 3.120069743409115, + "learning_rate": 5.481985490419528e-06, + "loss": 0.6843, + "step": 5974 + }, + { + "epoch": 0.49, + "grad_norm": 1.9875008178158569, + "learning_rate": 5.480676244934835e-06, + "loss": 0.6296, + "step": 5975 + }, + { + "epoch": 0.49, + "grad_norm": 3.1877610294607224, + "learning_rate": 5.479366966184317e-06, + "loss": 0.555, + "step": 5976 + }, + { + "epoch": 0.49, + "grad_norm": 2.5338274543860186, + "learning_rate": 5.478057654258588e-06, + "loss": 0.7902, + "step": 5977 + }, + { + "epoch": 0.49, + "grad_norm": 3.265280260835974, + "learning_rate": 5.47674830924826e-06, + "loss": 0.6582, + "step": 5978 + }, + { + "epoch": 0.49, + "grad_norm": 3.2184041109968136, + "learning_rate": 5.475438931243947e-06, + "loss": 0.8355, + "step": 5979 + }, + { + "epoch": 0.49, + "grad_norm": 3.611096417517397, + "learning_rate": 5.4741295203362655e-06, + "loss": 0.6901, + "step": 5980 + }, + { + "epoch": 0.49, + "grad_norm": 2.924738465572044, + "learning_rate": 5.472820076615837e-06, + "loss": 0.6976, + "step": 5981 + }, + { + "epoch": 0.49, + "grad_norm": 4.518763424817466, + "learning_rate": 5.471510600173281e-06, + "loss": 0.6703, + "step": 5982 + }, + { + "epoch": 0.49, + "grad_norm": 3.0696352063465917, + "learning_rate": 5.4702010910992235e-06, + "loss": 0.6906, + "step": 5983 + }, + { + "epoch": 0.49, + "grad_norm": 3.2553276939125397, + "learning_rate": 5.4688915494842886e-06, + "loss": 0.6385, + "step": 5984 + }, + { + "epoch": 0.49, + "grad_norm": 12.13250430222909, + "learning_rate": 5.467581975419108e-06, + "loss": 0.6911, + "step": 5985 + }, + { + "epoch": 0.49, + "grad_norm": 2.2459569298495037, + "learning_rate": 5.4662723689943085e-06, + "loss": 0.6131, + "step": 5986 + }, + { + "epoch": 0.49, + "grad_norm": 3.0271084237156884, + "learning_rate": 5.464962730300526e-06, + "loss": 0.8608, + "step": 5987 + }, + { + "epoch": 0.49, + "grad_norm": 2.264105090503508, + "learning_rate": 5.4636530594283945e-06, + "loss": 0.6521, + "step": 5988 + }, + { + "epoch": 0.49, + "grad_norm": 4.837588964886058, + "learning_rate": 5.4623433564685536e-06, + "loss": 0.718, + "step": 5989 + }, + { + "epoch": 0.49, + "grad_norm": 3.823635715143104, + "learning_rate": 5.46103362151164e-06, + "loss": 0.679, + "step": 5990 + }, + { + "epoch": 0.49, + "grad_norm": 2.227895039886624, + "learning_rate": 5.459723854648297e-06, + "loss": 0.8, + "step": 5991 + }, + { + "epoch": 0.49, + "grad_norm": 2.5697961233021323, + "learning_rate": 5.458414055969169e-06, + "loss": 0.6244, + "step": 5992 + }, + { + "epoch": 0.49, + "grad_norm": 4.910819553907711, + "learning_rate": 5.457104225564901e-06, + "loss": 0.6829, + "step": 5993 + }, + { + "epoch": 0.49, + "grad_norm": 6.7440155071133105, + "learning_rate": 5.4557943635261425e-06, + "loss": 0.6464, + "step": 5994 + }, + { + "epoch": 0.49, + "grad_norm": 3.6052731745588744, + "learning_rate": 5.454484469943545e-06, + "loss": 0.7488, + "step": 5995 + }, + { + "epoch": 0.49, + "grad_norm": 2.6889017665761332, + "learning_rate": 5.45317454490776e-06, + "loss": 0.6819, + "step": 5996 + }, + { + "epoch": 0.49, + "grad_norm": 4.928846973097363, + "learning_rate": 5.451864588509442e-06, + "loss": 0.6475, + "step": 5997 + }, + { + "epoch": 0.49, + "grad_norm": 4.646626204029376, + "learning_rate": 5.450554600839251e-06, + "loss": 0.7024, + "step": 5998 + }, + { + "epoch": 0.49, + "grad_norm": 2.6337274247636655, + "learning_rate": 5.449244581987845e-06, + "loss": 0.6801, + "step": 5999 + }, + { + "epoch": 0.49, + "grad_norm": 4.703817431112147, + "learning_rate": 5.447934532045884e-06, + "loss": 0.8008, + "step": 6000 + }, + { + "epoch": 0.49, + "grad_norm": 2.7220439421192824, + "learning_rate": 5.446624451104032e-06, + "loss": 0.6387, + "step": 6001 + }, + { + "epoch": 0.49, + "grad_norm": 2.3843281273430446, + "learning_rate": 5.4453143392529586e-06, + "loss": 0.5188, + "step": 6002 + }, + { + "epoch": 0.49, + "grad_norm": 4.206465912155756, + "learning_rate": 5.4440041965833265e-06, + "loss": 0.7254, + "step": 6003 + }, + { + "epoch": 0.49, + "grad_norm": 3.7868698676639676, + "learning_rate": 5.44269402318581e-06, + "loss": 0.6986, + "step": 6004 + }, + { + "epoch": 0.49, + "grad_norm": 9.48619468044147, + "learning_rate": 5.4413838191510785e-06, + "loss": 0.621, + "step": 6005 + }, + { + "epoch": 0.49, + "grad_norm": 3.525805508684711, + "learning_rate": 5.44007358456981e-06, + "loss": 0.6929, + "step": 6006 + }, + { + "epoch": 0.49, + "grad_norm": 2.5526136284113705, + "learning_rate": 5.438763319532675e-06, + "loss": 0.6104, + "step": 6007 + }, + { + "epoch": 0.49, + "grad_norm": 2.9528098370265226, + "learning_rate": 5.437453024130358e-06, + "loss": 0.6373, + "step": 6008 + }, + { + "epoch": 0.49, + "grad_norm": 3.975851887896986, + "learning_rate": 5.436142698453536e-06, + "loss": 0.7316, + "step": 6009 + }, + { + "epoch": 0.49, + "grad_norm": 3.5086165613228366, + "learning_rate": 5.434832342592893e-06, + "loss": 0.7098, + "step": 6010 + }, + { + "epoch": 0.49, + "grad_norm": 2.0735745979950178, + "learning_rate": 5.433521956639114e-06, + "loss": 0.7015, + "step": 6011 + }, + { + "epoch": 0.49, + "grad_norm": 2.563154278863986, + "learning_rate": 5.432211540682887e-06, + "loss": 0.5834, + "step": 6012 + }, + { + "epoch": 0.49, + "grad_norm": 3.2000523631273117, + "learning_rate": 5.430901094814899e-06, + "loss": 0.5947, + "step": 6013 + }, + { + "epoch": 0.49, + "grad_norm": 2.87559365693038, + "learning_rate": 5.429590619125843e-06, + "loss": 0.6668, + "step": 6014 + }, + { + "epoch": 0.49, + "grad_norm": 2.536256148563878, + "learning_rate": 5.4282801137064114e-06, + "loss": 0.6846, + "step": 6015 + }, + { + "epoch": 0.49, + "grad_norm": 2.3317000437048585, + "learning_rate": 5.426969578647298e-06, + "loss": 0.8176, + "step": 6016 + }, + { + "epoch": 0.49, + "grad_norm": 22.65889067478474, + "learning_rate": 5.425659014039201e-06, + "loss": 0.5692, + "step": 6017 + }, + { + "epoch": 0.49, + "grad_norm": 3.9558838204967373, + "learning_rate": 5.424348419972821e-06, + "loss": 0.6334, + "step": 6018 + }, + { + "epoch": 0.49, + "grad_norm": 2.6008636739348385, + "learning_rate": 5.423037796538858e-06, + "loss": 0.6015, + "step": 6019 + }, + { + "epoch": 0.49, + "grad_norm": 2.8409810226481844, + "learning_rate": 5.421727143828016e-06, + "loss": 0.6852, + "step": 6020 + }, + { + "epoch": 0.49, + "grad_norm": 5.264403950751759, + "learning_rate": 5.4204164619309994e-06, + "loss": 0.7101, + "step": 6021 + }, + { + "epoch": 0.49, + "grad_norm": 2.8080331071570526, + "learning_rate": 5.419105750938518e-06, + "loss": 0.7143, + "step": 6022 + }, + { + "epoch": 0.49, + "grad_norm": 3.2637713424003247, + "learning_rate": 5.41779501094128e-06, + "loss": 0.6998, + "step": 6023 + }, + { + "epoch": 0.49, + "grad_norm": 6.337070143951558, + "learning_rate": 5.416484242029996e-06, + "loss": 0.824, + "step": 6024 + }, + { + "epoch": 0.49, + "grad_norm": 3.144785735172008, + "learning_rate": 5.41517344429538e-06, + "loss": 0.8502, + "step": 6025 + }, + { + "epoch": 0.49, + "grad_norm": 6.962155103824911, + "learning_rate": 5.413862617828147e-06, + "loss": 0.6204, + "step": 6026 + }, + { + "epoch": 0.49, + "grad_norm": 3.144120176320849, + "learning_rate": 5.412551762719015e-06, + "loss": 0.7989, + "step": 6027 + }, + { + "epoch": 0.49, + "grad_norm": 3.5454392967187625, + "learning_rate": 5.411240879058703e-06, + "loss": 0.5724, + "step": 6028 + }, + { + "epoch": 0.49, + "grad_norm": 2.9328952707320566, + "learning_rate": 5.409929966937933e-06, + "loss": 0.7518, + "step": 6029 + }, + { + "epoch": 0.49, + "grad_norm": 2.871331155297309, + "learning_rate": 5.40861902644743e-06, + "loss": 0.5722, + "step": 6030 + }, + { + "epoch": 0.49, + "grad_norm": 3.8319953500940356, + "learning_rate": 5.407308057677916e-06, + "loss": 0.7841, + "step": 6031 + }, + { + "epoch": 0.49, + "grad_norm": 2.4459071296170993, + "learning_rate": 5.40599706072012e-06, + "loss": 0.7552, + "step": 6032 + }, + { + "epoch": 0.49, + "grad_norm": 3.6037284275219945, + "learning_rate": 5.4046860356647705e-06, + "loss": 0.6545, + "step": 6033 + }, + { + "epoch": 0.49, + "grad_norm": 14.266364416344473, + "learning_rate": 5.4033749826025995e-06, + "loss": 0.8036, + "step": 6034 + }, + { + "epoch": 0.49, + "grad_norm": 4.881891601696901, + "learning_rate": 5.40206390162434e-06, + "loss": 0.624, + "step": 6035 + }, + { + "epoch": 0.49, + "grad_norm": 3.8355586314363546, + "learning_rate": 5.400752792820726e-06, + "loss": 0.5328, + "step": 6036 + }, + { + "epoch": 0.49, + "grad_norm": 3.8136205611839853, + "learning_rate": 5.3994416562824955e-06, + "loss": 0.6334, + "step": 6037 + }, + { + "epoch": 0.49, + "grad_norm": 6.074063491063646, + "learning_rate": 5.39813049210039e-06, + "loss": 0.7521, + "step": 6038 + }, + { + "epoch": 0.49, + "grad_norm": 2.788429614863073, + "learning_rate": 5.396819300365146e-06, + "loss": 0.6613, + "step": 6039 + }, + { + "epoch": 0.49, + "grad_norm": 3.661858296815369, + "learning_rate": 5.395508081167506e-06, + "loss": 0.6944, + "step": 6040 + }, + { + "epoch": 0.49, + "grad_norm": 3.1935204482796284, + "learning_rate": 5.394196834598218e-06, + "loss": 0.767, + "step": 6041 + }, + { + "epoch": 0.49, + "grad_norm": 4.398406727062364, + "learning_rate": 5.392885560748028e-06, + "loss": 0.7185, + "step": 6042 + }, + { + "epoch": 0.49, + "grad_norm": 2.635709776353121, + "learning_rate": 5.391574259707682e-06, + "loss": 0.7065, + "step": 6043 + }, + { + "epoch": 0.49, + "grad_norm": 2.1179927457085235, + "learning_rate": 5.3902629315679315e-06, + "loss": 0.7464, + "step": 6044 + }, + { + "epoch": 0.49, + "grad_norm": 3.685804367672943, + "learning_rate": 5.38895157641953e-06, + "loss": 0.7652, + "step": 6045 + }, + { + "epoch": 0.49, + "grad_norm": 5.057539174718798, + "learning_rate": 5.387640194353229e-06, + "loss": 0.5839, + "step": 6046 + }, + { + "epoch": 0.49, + "grad_norm": 2.4604760894511792, + "learning_rate": 5.3863287854597865e-06, + "loss": 0.6744, + "step": 6047 + }, + { + "epoch": 0.49, + "grad_norm": 4.114976780670543, + "learning_rate": 5.38501734982996e-06, + "loss": 0.5769, + "step": 6048 + }, + { + "epoch": 0.49, + "grad_norm": 3.6382902197853197, + "learning_rate": 5.383705887554508e-06, + "loss": 0.7757, + "step": 6049 + }, + { + "epoch": 0.49, + "grad_norm": 4.869439157032522, + "learning_rate": 5.3823943987241926e-06, + "loss": 0.7051, + "step": 6050 + }, + { + "epoch": 0.49, + "grad_norm": 3.3118802147779762, + "learning_rate": 5.381082883429776e-06, + "loss": 0.6706, + "step": 6051 + }, + { + "epoch": 0.49, + "grad_norm": 2.860528365453887, + "learning_rate": 5.379771341762025e-06, + "loss": 0.7592, + "step": 6052 + }, + { + "epoch": 0.49, + "grad_norm": 3.321617056150225, + "learning_rate": 5.378459773811707e-06, + "loss": 0.6632, + "step": 6053 + }, + { + "epoch": 0.49, + "grad_norm": 4.594482168342254, + "learning_rate": 5.37714817966959e-06, + "loss": 0.6308, + "step": 6054 + }, + { + "epoch": 0.49, + "grad_norm": 3.796655428904238, + "learning_rate": 5.375836559426444e-06, + "loss": 0.6478, + "step": 6055 + }, + { + "epoch": 0.49, + "grad_norm": 2.3086641598124755, + "learning_rate": 5.37452491317304e-06, + "loss": 0.7806, + "step": 6056 + }, + { + "epoch": 0.49, + "grad_norm": 3.520788507408676, + "learning_rate": 5.373213241000155e-06, + "loss": 0.676, + "step": 6057 + }, + { + "epoch": 0.49, + "grad_norm": 3.1922833532984236, + "learning_rate": 5.371901542998563e-06, + "loss": 0.816, + "step": 6058 + }, + { + "epoch": 0.49, + "grad_norm": 3.7542616334082104, + "learning_rate": 5.370589819259043e-06, + "loss": 0.7208, + "step": 6059 + }, + { + "epoch": 0.49, + "grad_norm": 2.62387100063729, + "learning_rate": 5.369278069872373e-06, + "loss": 0.7552, + "step": 6060 + }, + { + "epoch": 0.49, + "grad_norm": 4.805590783378133, + "learning_rate": 5.367966294929337e-06, + "loss": 0.7716, + "step": 6061 + }, + { + "epoch": 0.49, + "grad_norm": 2.542312555581617, + "learning_rate": 5.366654494520717e-06, + "loss": 0.8278, + "step": 6062 + }, + { + "epoch": 0.49, + "grad_norm": 15.746894458291544, + "learning_rate": 5.365342668737297e-06, + "loss": 0.617, + "step": 6063 + }, + { + "epoch": 0.49, + "grad_norm": 3.1226948088760307, + "learning_rate": 5.364030817669862e-06, + "loss": 0.7273, + "step": 6064 + }, + { + "epoch": 0.49, + "grad_norm": 2.5871510186995117, + "learning_rate": 5.362718941409204e-06, + "loss": 0.6226, + "step": 6065 + }, + { + "epoch": 0.49, + "grad_norm": 2.928282070853197, + "learning_rate": 5.36140704004611e-06, + "loss": 0.6519, + "step": 6066 + }, + { + "epoch": 0.49, + "grad_norm": 4.614472864629575, + "learning_rate": 5.3600951136713745e-06, + "loss": 0.6232, + "step": 6067 + }, + { + "epoch": 0.49, + "grad_norm": 2.9302165064439287, + "learning_rate": 5.35878316237579e-06, + "loss": 0.6108, + "step": 6068 + }, + { + "epoch": 0.49, + "grad_norm": 2.9879702004908815, + "learning_rate": 5.35747118625015e-06, + "loss": 0.7517, + "step": 6069 + }, + { + "epoch": 0.49, + "grad_norm": 4.947324066479569, + "learning_rate": 5.356159185385255e-06, + "loss": 0.8119, + "step": 6070 + }, + { + "epoch": 0.49, + "grad_norm": 2.780934345440035, + "learning_rate": 5.354847159871901e-06, + "loss": 0.6418, + "step": 6071 + }, + { + "epoch": 0.49, + "grad_norm": 2.8381875802735226, + "learning_rate": 5.353535109800891e-06, + "loss": 0.6467, + "step": 6072 + }, + { + "epoch": 0.49, + "grad_norm": 2.471893538112849, + "learning_rate": 5.352223035263022e-06, + "loss": 0.6602, + "step": 6073 + }, + { + "epoch": 0.49, + "grad_norm": 2.821842297233693, + "learning_rate": 5.350910936349102e-06, + "loss": 0.6887, + "step": 6074 + }, + { + "epoch": 0.49, + "grad_norm": 2.6668666850577933, + "learning_rate": 5.349598813149937e-06, + "loss": 0.6304, + "step": 6075 + }, + { + "epoch": 0.49, + "grad_norm": 3.0567978903041957, + "learning_rate": 5.348286665756331e-06, + "loss": 0.8358, + "step": 6076 + }, + { + "epoch": 0.49, + "grad_norm": 2.7027583097486145, + "learning_rate": 5.346974494259096e-06, + "loss": 0.9387, + "step": 6077 + }, + { + "epoch": 0.49, + "grad_norm": 3.6057449268108863, + "learning_rate": 5.345662298749043e-06, + "loss": 0.8957, + "step": 6078 + }, + { + "epoch": 0.49, + "grad_norm": 3.457156058660888, + "learning_rate": 5.344350079316981e-06, + "loss": 0.6204, + "step": 6079 + }, + { + "epoch": 0.49, + "grad_norm": 3.648010568877727, + "learning_rate": 5.343037836053724e-06, + "loss": 0.7787, + "step": 6080 + }, + { + "epoch": 0.49, + "grad_norm": 4.070781676232831, + "learning_rate": 5.341725569050091e-06, + "loss": 0.6028, + "step": 6081 + }, + { + "epoch": 0.49, + "grad_norm": 4.454688606329808, + "learning_rate": 5.340413278396896e-06, + "loss": 0.7242, + "step": 6082 + }, + { + "epoch": 0.49, + "grad_norm": 3.8815227919655015, + "learning_rate": 5.339100964184956e-06, + "loss": 0.6233, + "step": 6083 + }, + { + "epoch": 0.49, + "grad_norm": 3.1081970130406793, + "learning_rate": 5.337788626505097e-06, + "loss": 0.6219, + "step": 6084 + }, + { + "epoch": 0.49, + "grad_norm": 4.005613631025357, + "learning_rate": 5.336476265448138e-06, + "loss": 0.7065, + "step": 6085 + }, + { + "epoch": 0.49, + "grad_norm": 2.3755491919319134, + "learning_rate": 5.335163881104902e-06, + "loss": 0.6003, + "step": 6086 + }, + { + "epoch": 0.49, + "grad_norm": 4.264052114217631, + "learning_rate": 5.333851473566217e-06, + "loss": 0.7421, + "step": 6087 + }, + { + "epoch": 0.49, + "grad_norm": 3.6623169988903657, + "learning_rate": 5.332539042922908e-06, + "loss": 0.5314, + "step": 6088 + }, + { + "epoch": 0.49, + "grad_norm": 2.1096826667555733, + "learning_rate": 5.331226589265801e-06, + "loss": 0.5706, + "step": 6089 + }, + { + "epoch": 0.49, + "grad_norm": 4.642840657456279, + "learning_rate": 5.329914112685729e-06, + "loss": 0.8183, + "step": 6090 + }, + { + "epoch": 0.49, + "grad_norm": 4.761583386758941, + "learning_rate": 5.328601613273524e-06, + "loss": 0.6561, + "step": 6091 + }, + { + "epoch": 0.49, + "grad_norm": 3.1561771580049856, + "learning_rate": 5.327289091120017e-06, + "loss": 0.6014, + "step": 6092 + }, + { + "epoch": 0.49, + "grad_norm": 3.618550220016676, + "learning_rate": 5.325976546316044e-06, + "loss": 0.7986, + "step": 6093 + }, + { + "epoch": 0.49, + "grad_norm": 2.663761754693607, + "learning_rate": 5.324663978952443e-06, + "loss": 0.6761, + "step": 6094 + }, + { + "epoch": 0.5, + "grad_norm": 4.755446047808612, + "learning_rate": 5.32335138912005e-06, + "loss": 0.6614, + "step": 6095 + }, + { + "epoch": 0.5, + "grad_norm": 2.719045998729653, + "learning_rate": 5.322038776909705e-06, + "loss": 0.6595, + "step": 6096 + }, + { + "epoch": 0.5, + "grad_norm": 2.7713302586424162, + "learning_rate": 5.320726142412248e-06, + "loss": 0.5959, + "step": 6097 + }, + { + "epoch": 0.5, + "grad_norm": 2.4001853770394503, + "learning_rate": 5.3194134857185244e-06, + "loss": 0.7854, + "step": 6098 + }, + { + "epoch": 0.5, + "grad_norm": 9.02748191740966, + "learning_rate": 5.318100806919374e-06, + "loss": 0.7502, + "step": 6099 + }, + { + "epoch": 0.5, + "grad_norm": 2.3909465751838286, + "learning_rate": 5.316788106105646e-06, + "loss": 0.5897, + "step": 6100 + }, + { + "epoch": 0.5, + "grad_norm": 4.617371720817901, + "learning_rate": 5.315475383368186e-06, + "loss": 0.9065, + "step": 6101 + }, + { + "epoch": 0.5, + "grad_norm": 2.944454175874567, + "learning_rate": 5.314162638797844e-06, + "loss": 0.706, + "step": 6102 + }, + { + "epoch": 0.5, + "grad_norm": 6.77489885209827, + "learning_rate": 5.312849872485468e-06, + "loss": 0.6734, + "step": 6103 + }, + { + "epoch": 0.5, + "grad_norm": 3.9625328761098815, + "learning_rate": 5.311537084521911e-06, + "loss": 0.71, + "step": 6104 + }, + { + "epoch": 0.5, + "grad_norm": 3.572020428951802, + "learning_rate": 5.310224274998028e-06, + "loss": 0.5039, + "step": 6105 + }, + { + "epoch": 0.5, + "grad_norm": 2.8593272583980807, + "learning_rate": 5.308911444004671e-06, + "loss": 0.6846, + "step": 6106 + }, + { + "epoch": 0.5, + "grad_norm": 4.84595196509567, + "learning_rate": 5.307598591632696e-06, + "loss": 0.7806, + "step": 6107 + }, + { + "epoch": 0.5, + "grad_norm": 15.453522486993867, + "learning_rate": 5.306285717972962e-06, + "loss": 0.7916, + "step": 6108 + }, + { + "epoch": 0.5, + "grad_norm": 2.7391950587999503, + "learning_rate": 5.3049728231163275e-06, + "loss": 0.6137, + "step": 6109 + }, + { + "epoch": 0.5, + "grad_norm": 2.789118969319593, + "learning_rate": 5.303659907153654e-06, + "loss": 0.5938, + "step": 6110 + }, + { + "epoch": 0.5, + "grad_norm": 3.460812216976524, + "learning_rate": 5.302346970175803e-06, + "loss": 0.6847, + "step": 6111 + }, + { + "epoch": 0.5, + "grad_norm": 3.1467880016543357, + "learning_rate": 5.301034012273638e-06, + "loss": 0.6453, + "step": 6112 + }, + { + "epoch": 0.5, + "grad_norm": 2.586338746246817, + "learning_rate": 5.299721033538023e-06, + "loss": 0.7839, + "step": 6113 + }, + { + "epoch": 0.5, + "grad_norm": 5.161668078077261, + "learning_rate": 5.298408034059827e-06, + "loss": 0.6512, + "step": 6114 + }, + { + "epoch": 0.5, + "grad_norm": 4.253839899934579, + "learning_rate": 5.297095013929915e-06, + "loss": 0.8, + "step": 6115 + }, + { + "epoch": 0.5, + "grad_norm": 3.584780138917904, + "learning_rate": 5.295781973239157e-06, + "loss": 0.5697, + "step": 6116 + }, + { + "epoch": 0.5, + "grad_norm": 1.6547496515507776, + "learning_rate": 5.294468912078424e-06, + "loss": 0.6216, + "step": 6117 + }, + { + "epoch": 0.5, + "grad_norm": 2.6087721109250057, + "learning_rate": 5.293155830538589e-06, + "loss": 0.692, + "step": 6118 + }, + { + "epoch": 0.5, + "grad_norm": 3.4252715425847606, + "learning_rate": 5.291842728710524e-06, + "loss": 0.74, + "step": 6119 + }, + { + "epoch": 0.5, + "grad_norm": 5.279843724034523, + "learning_rate": 5.290529606685105e-06, + "loss": 0.8002, + "step": 6120 + }, + { + "epoch": 0.5, + "grad_norm": 2.2085829277476794, + "learning_rate": 5.289216464553209e-06, + "loss": 0.5669, + "step": 6121 + }, + { + "epoch": 0.5, + "grad_norm": 2.2200869478730385, + "learning_rate": 5.28790330240571e-06, + "loss": 0.6571, + "step": 6122 + }, + { + "epoch": 0.5, + "grad_norm": 3.287486802808259, + "learning_rate": 5.286590120333491e-06, + "loss": 0.6184, + "step": 6123 + }, + { + "epoch": 0.5, + "grad_norm": 4.613905549504539, + "learning_rate": 5.285276918427432e-06, + "loss": 0.67, + "step": 6124 + }, + { + "epoch": 0.5, + "grad_norm": 3.8654836942423247, + "learning_rate": 5.2839636967784124e-06, + "loss": 0.6213, + "step": 6125 + }, + { + "epoch": 0.5, + "grad_norm": 2.6161080479369296, + "learning_rate": 5.282650455477317e-06, + "loss": 0.7134, + "step": 6126 + }, + { + "epoch": 0.5, + "grad_norm": 2.103492777867684, + "learning_rate": 5.281337194615033e-06, + "loss": 0.643, + "step": 6127 + }, + { + "epoch": 0.5, + "grad_norm": 2.823750175339366, + "learning_rate": 5.280023914282442e-06, + "loss": 0.7403, + "step": 6128 + }, + { + "epoch": 0.5, + "grad_norm": 8.304131259461812, + "learning_rate": 5.278710614570432e-06, + "loss": 0.664, + "step": 6129 + }, + { + "epoch": 0.5, + "grad_norm": 8.819791891227474, + "learning_rate": 5.277397295569893e-06, + "loss": 0.7486, + "step": 6130 + }, + { + "epoch": 0.5, + "grad_norm": 3.7960270038142743, + "learning_rate": 5.276083957371716e-06, + "loss": 0.658, + "step": 6131 + }, + { + "epoch": 0.5, + "grad_norm": 2.822821918691646, + "learning_rate": 5.2747706000667885e-06, + "loss": 0.7288, + "step": 6132 + }, + { + "epoch": 0.5, + "grad_norm": 2.3757101706604633, + "learning_rate": 5.2734572237460056e-06, + "loss": 0.667, + "step": 6133 + }, + { + "epoch": 0.5, + "grad_norm": 5.101905178367649, + "learning_rate": 5.272143828500264e-06, + "loss": 0.676, + "step": 6134 + }, + { + "epoch": 0.5, + "grad_norm": 29.613867040447612, + "learning_rate": 5.270830414420453e-06, + "loss": 0.5123, + "step": 6135 + }, + { + "epoch": 0.5, + "grad_norm": 11.265496912775681, + "learning_rate": 5.269516981597473e-06, + "loss": 0.7884, + "step": 6136 + }, + { + "epoch": 0.5, + "grad_norm": 3.9268025810993037, + "learning_rate": 5.26820353012222e-06, + "loss": 0.675, + "step": 6137 + }, + { + "epoch": 0.5, + "grad_norm": 3.550837957235736, + "learning_rate": 5.2668900600855955e-06, + "loss": 0.7178, + "step": 6138 + }, + { + "epoch": 0.5, + "grad_norm": 2.6233075632644702, + "learning_rate": 5.265576571578497e-06, + "loss": 0.5249, + "step": 6139 + }, + { + "epoch": 0.5, + "grad_norm": 3.6264656835277442, + "learning_rate": 5.264263064691828e-06, + "loss": 0.6668, + "step": 6140 + }, + { + "epoch": 0.5, + "grad_norm": 4.241402491934814, + "learning_rate": 5.2629495395164905e-06, + "loss": 0.7393, + "step": 6141 + }, + { + "epoch": 0.5, + "grad_norm": 4.6265904205712385, + "learning_rate": 5.26163599614339e-06, + "loss": 0.5065, + "step": 6142 + }, + { + "epoch": 0.5, + "grad_norm": 3.6122218155642787, + "learning_rate": 5.260322434663432e-06, + "loss": 0.4714, + "step": 6143 + }, + { + "epoch": 0.5, + "grad_norm": 2.7974894337533405, + "learning_rate": 5.2590088551675215e-06, + "loss": 0.6652, + "step": 6144 + }, + { + "epoch": 0.5, + "grad_norm": 4.173934484939835, + "learning_rate": 5.257695257746567e-06, + "loss": 0.6247, + "step": 6145 + }, + { + "epoch": 0.5, + "grad_norm": 3.3365707684175345, + "learning_rate": 5.256381642491477e-06, + "loss": 0.5308, + "step": 6146 + }, + { + "epoch": 0.5, + "grad_norm": 10.385033548065424, + "learning_rate": 5.255068009493165e-06, + "loss": 0.7774, + "step": 6147 + }, + { + "epoch": 0.5, + "grad_norm": 3.4081386918040897, + "learning_rate": 5.25375435884254e-06, + "loss": 0.4786, + "step": 6148 + }, + { + "epoch": 0.5, + "grad_norm": 3.2695691346921247, + "learning_rate": 5.252440690630515e-06, + "loss": 0.7612, + "step": 6149 + }, + { + "epoch": 0.5, + "grad_norm": 2.671254105419217, + "learning_rate": 5.251127004948005e-06, + "loss": 0.7359, + "step": 6150 + }, + { + "epoch": 0.5, + "grad_norm": 3.441089603246151, + "learning_rate": 5.249813301885926e-06, + "loss": 0.574, + "step": 6151 + }, + { + "epoch": 0.5, + "grad_norm": 4.13877042750484, + "learning_rate": 5.248499581535193e-06, + "loss": 0.6604, + "step": 6152 + }, + { + "epoch": 0.5, + "grad_norm": 3.7172967133166375, + "learning_rate": 5.247185843986724e-06, + "loss": 0.672, + "step": 6153 + }, + { + "epoch": 0.5, + "grad_norm": 3.5687916575527074, + "learning_rate": 5.24587208933144e-06, + "loss": 0.6674, + "step": 6154 + }, + { + "epoch": 0.5, + "grad_norm": 3.7696331425469545, + "learning_rate": 5.244558317660256e-06, + "loss": 0.7127, + "step": 6155 + }, + { + "epoch": 0.5, + "grad_norm": 2.7881411562827823, + "learning_rate": 5.243244529064098e-06, + "loss": 0.7139, + "step": 6156 + }, + { + "epoch": 0.5, + "grad_norm": 3.7779642121175616, + "learning_rate": 5.241930723633887e-06, + "loss": 0.6709, + "step": 6157 + }, + { + "epoch": 0.5, + "grad_norm": 2.8811436976738904, + "learning_rate": 5.240616901460547e-06, + "loss": 0.7537, + "step": 6158 + }, + { + "epoch": 0.5, + "grad_norm": 2.5501142622631576, + "learning_rate": 5.239303062635001e-06, + "loss": 0.7724, + "step": 6159 + }, + { + "epoch": 0.5, + "grad_norm": 9.292940823716274, + "learning_rate": 5.237989207248179e-06, + "loss": 0.6356, + "step": 6160 + }, + { + "epoch": 0.5, + "grad_norm": 3.4123595092618726, + "learning_rate": 5.236675335391004e-06, + "loss": 0.5866, + "step": 6161 + }, + { + "epoch": 0.5, + "grad_norm": 3.353554811197336, + "learning_rate": 5.235361447154406e-06, + "loss": 0.8898, + "step": 6162 + }, + { + "epoch": 0.5, + "grad_norm": 5.942815827239726, + "learning_rate": 5.2340475426293125e-06, + "loss": 0.6854, + "step": 6163 + }, + { + "epoch": 0.5, + "grad_norm": 2.4576067856427763, + "learning_rate": 5.232733621906656e-06, + "loss": 0.5921, + "step": 6164 + }, + { + "epoch": 0.5, + "grad_norm": 3.2210207223955387, + "learning_rate": 5.231419685077367e-06, + "loss": 0.7975, + "step": 6165 + }, + { + "epoch": 0.5, + "grad_norm": 9.181922515927047, + "learning_rate": 5.2301057322323786e-06, + "loss": 0.6976, + "step": 6166 + }, + { + "epoch": 0.5, + "grad_norm": 2.5367914598114485, + "learning_rate": 5.228791763462626e-06, + "loss": 0.449, + "step": 6167 + }, + { + "epoch": 0.5, + "grad_norm": 2.9360657863154795, + "learning_rate": 5.227477778859044e-06, + "loss": 0.8882, + "step": 6168 + }, + { + "epoch": 0.5, + "grad_norm": 3.2988732664047276, + "learning_rate": 5.226163778512564e-06, + "loss": 0.6486, + "step": 6169 + }, + { + "epoch": 0.5, + "grad_norm": 5.325424064551389, + "learning_rate": 5.224849762514127e-06, + "loss": 0.7733, + "step": 6170 + }, + { + "epoch": 0.5, + "grad_norm": 3.2659001085866772, + "learning_rate": 5.223535730954673e-06, + "loss": 0.4959, + "step": 6171 + }, + { + "epoch": 0.5, + "grad_norm": 3.130300513542728, + "learning_rate": 5.222221683925138e-06, + "loss": 0.7962, + "step": 6172 + }, + { + "epoch": 0.5, + "grad_norm": 4.1538063729001005, + "learning_rate": 5.220907621516461e-06, + "loss": 0.7712, + "step": 6173 + }, + { + "epoch": 0.5, + "grad_norm": 5.5564267521277735, + "learning_rate": 5.219593543819587e-06, + "loss": 0.7416, + "step": 6174 + }, + { + "epoch": 0.5, + "grad_norm": 3.6650094529909194, + "learning_rate": 5.218279450925458e-06, + "loss": 0.7192, + "step": 6175 + }, + { + "epoch": 0.5, + "grad_norm": 3.8478887337676944, + "learning_rate": 5.216965342925017e-06, + "loss": 0.6952, + "step": 6176 + }, + { + "epoch": 0.5, + "grad_norm": 2.5226291278007675, + "learning_rate": 5.215651219909208e-06, + "loss": 0.7435, + "step": 6177 + }, + { + "epoch": 0.5, + "grad_norm": 2.413143222383419, + "learning_rate": 5.2143370819689756e-06, + "loss": 0.5118, + "step": 6178 + }, + { + "epoch": 0.5, + "grad_norm": 2.8962255050817145, + "learning_rate": 5.213022929195267e-06, + "loss": 0.7005, + "step": 6179 + }, + { + "epoch": 0.5, + "grad_norm": 3.71708012665215, + "learning_rate": 5.211708761679031e-06, + "loss": 0.673, + "step": 6180 + }, + { + "epoch": 0.5, + "grad_norm": 3.713636318009029, + "learning_rate": 5.210394579511217e-06, + "loss": 0.7289, + "step": 6181 + }, + { + "epoch": 0.5, + "grad_norm": 3.283584018638852, + "learning_rate": 5.209080382782772e-06, + "loss": 0.7526, + "step": 6182 + }, + { + "epoch": 0.5, + "grad_norm": 6.014019005529718, + "learning_rate": 5.207766171584648e-06, + "loss": 0.5562, + "step": 6183 + }, + { + "epoch": 0.5, + "grad_norm": 4.993489054799933, + "learning_rate": 5.206451946007797e-06, + "loss": 0.7209, + "step": 6184 + }, + { + "epoch": 0.5, + "grad_norm": 5.179498948287335, + "learning_rate": 5.205137706143172e-06, + "loss": 0.6716, + "step": 6185 + }, + { + "epoch": 0.5, + "grad_norm": 3.628828681521246, + "learning_rate": 5.203823452081725e-06, + "loss": 0.6323, + "step": 6186 + }, + { + "epoch": 0.5, + "grad_norm": 2.814430006658283, + "learning_rate": 5.2025091839144124e-06, + "loss": 0.6728, + "step": 6187 + }, + { + "epoch": 0.5, + "grad_norm": 3.6623186216338, + "learning_rate": 5.201194901732189e-06, + "loss": 0.835, + "step": 6188 + }, + { + "epoch": 0.5, + "grad_norm": 4.135063668002361, + "learning_rate": 5.1998806056260105e-06, + "loss": 0.6375, + "step": 6189 + }, + { + "epoch": 0.5, + "grad_norm": 2.6770162327181617, + "learning_rate": 5.198566295686837e-06, + "loss": 0.5926, + "step": 6190 + }, + { + "epoch": 0.5, + "grad_norm": 3.9819103933181212, + "learning_rate": 5.197251972005626e-06, + "loss": 0.8081, + "step": 6191 + }, + { + "epoch": 0.5, + "grad_norm": 2.6998838438191624, + "learning_rate": 5.195937634673336e-06, + "loss": 0.7073, + "step": 6192 + }, + { + "epoch": 0.5, + "grad_norm": 3.370591963206708, + "learning_rate": 5.194623283780927e-06, + "loss": 0.7051, + "step": 6193 + }, + { + "epoch": 0.5, + "grad_norm": 4.410232433475372, + "learning_rate": 5.193308919419363e-06, + "loss": 0.6879, + "step": 6194 + }, + { + "epoch": 0.5, + "grad_norm": 4.494584808257606, + "learning_rate": 5.191994541679603e-06, + "loss": 0.5908, + "step": 6195 + }, + { + "epoch": 0.5, + "grad_norm": 8.333774246184413, + "learning_rate": 5.190680150652613e-06, + "loss": 0.7648, + "step": 6196 + }, + { + "epoch": 0.5, + "grad_norm": 9.617380507683823, + "learning_rate": 5.189365746429356e-06, + "loss": 0.6442, + "step": 6197 + }, + { + "epoch": 0.5, + "grad_norm": 3.477191914952608, + "learning_rate": 5.188051329100795e-06, + "loss": 0.824, + "step": 6198 + }, + { + "epoch": 0.5, + "grad_norm": 4.631588516045174, + "learning_rate": 5.186736898757899e-06, + "loss": 0.8145, + "step": 6199 + }, + { + "epoch": 0.5, + "grad_norm": 4.130013253083948, + "learning_rate": 5.185422455491636e-06, + "loss": 0.5547, + "step": 6200 + }, + { + "epoch": 0.5, + "grad_norm": 1.9784063822333422, + "learning_rate": 5.18410799939297e-06, + "loss": 0.6744, + "step": 6201 + }, + { + "epoch": 0.5, + "grad_norm": 4.736033281213874, + "learning_rate": 5.18279353055287e-06, + "loss": 0.6468, + "step": 6202 + }, + { + "epoch": 0.5, + "grad_norm": 4.6730873520773875, + "learning_rate": 5.181479049062307e-06, + "loss": 0.6111, + "step": 6203 + }, + { + "epoch": 0.5, + "grad_norm": 5.533278576854528, + "learning_rate": 5.180164555012253e-06, + "loss": 0.5452, + "step": 6204 + }, + { + "epoch": 0.5, + "grad_norm": 3.073174700815744, + "learning_rate": 5.178850048493675e-06, + "loss": 0.6584, + "step": 6205 + }, + { + "epoch": 0.5, + "grad_norm": 4.240172084915734, + "learning_rate": 5.177535529597548e-06, + "loss": 0.8213, + "step": 6206 + }, + { + "epoch": 0.5, + "grad_norm": 5.34805098711028, + "learning_rate": 5.176220998414846e-06, + "loss": 0.7171, + "step": 6207 + }, + { + "epoch": 0.5, + "grad_norm": 3.1702814071440684, + "learning_rate": 5.1749064550365414e-06, + "loss": 0.7693, + "step": 6208 + }, + { + "epoch": 0.5, + "grad_norm": 6.146566509538084, + "learning_rate": 5.1735918995536074e-06, + "loss": 0.6876, + "step": 6209 + }, + { + "epoch": 0.5, + "grad_norm": 7.171354286996852, + "learning_rate": 5.1722773320570205e-06, + "loss": 0.6772, + "step": 6210 + }, + { + "epoch": 0.5, + "grad_norm": 2.8338016028828097, + "learning_rate": 5.1709627526377604e-06, + "loss": 0.682, + "step": 6211 + }, + { + "epoch": 0.5, + "grad_norm": 3.72196710298027, + "learning_rate": 5.1696481613867986e-06, + "loss": 0.5212, + "step": 6212 + }, + { + "epoch": 0.5, + "grad_norm": 5.309635970913145, + "learning_rate": 5.1683335583951156e-06, + "loss": 0.6851, + "step": 6213 + }, + { + "epoch": 0.5, + "grad_norm": 4.246942302599598, + "learning_rate": 5.167018943753692e-06, + "loss": 0.6922, + "step": 6214 + }, + { + "epoch": 0.5, + "grad_norm": 4.3817839419623485, + "learning_rate": 5.1657043175535045e-06, + "loss": 0.6164, + "step": 6215 + }, + { + "epoch": 0.5, + "grad_norm": 7.2854991840436645, + "learning_rate": 5.164389679885538e-06, + "loss": 0.6956, + "step": 6216 + }, + { + "epoch": 0.5, + "grad_norm": 5.958323988491736, + "learning_rate": 5.1630750308407675e-06, + "loss": 0.9094, + "step": 6217 + }, + { + "epoch": 0.51, + "grad_norm": 5.865238076798589, + "learning_rate": 5.161760370510178e-06, + "loss": 0.7996, + "step": 6218 + }, + { + "epoch": 0.51, + "grad_norm": 4.393186785392793, + "learning_rate": 5.160445698984753e-06, + "loss": 0.7125, + "step": 6219 + }, + { + "epoch": 0.51, + "grad_norm": 7.117320990383577, + "learning_rate": 5.159131016355475e-06, + "loss": 0.6213, + "step": 6220 + }, + { + "epoch": 0.51, + "grad_norm": 2.828080090939579, + "learning_rate": 5.15781632271333e-06, + "loss": 0.591, + "step": 6221 + }, + { + "epoch": 0.51, + "grad_norm": 4.423128903837038, + "learning_rate": 5.156501618149301e-06, + "loss": 0.5315, + "step": 6222 + }, + { + "epoch": 0.51, + "grad_norm": 26.064224173411056, + "learning_rate": 5.155186902754375e-06, + "loss": 0.7142, + "step": 6223 + }, + { + "epoch": 0.51, + "grad_norm": 3.795288853269341, + "learning_rate": 5.1538721766195375e-06, + "loss": 0.7093, + "step": 6224 + }, + { + "epoch": 0.51, + "grad_norm": 21.411370440548747, + "learning_rate": 5.152557439835777e-06, + "loss": 0.7201, + "step": 6225 + }, + { + "epoch": 0.51, + "grad_norm": 3.17625991121986, + "learning_rate": 5.1512426924940804e-06, + "loss": 0.7568, + "step": 6226 + }, + { + "epoch": 0.51, + "grad_norm": 3.78978191556089, + "learning_rate": 5.149927934685438e-06, + "loss": 0.6711, + "step": 6227 + }, + { + "epoch": 0.51, + "grad_norm": 4.005385430229076, + "learning_rate": 5.1486131665008386e-06, + "loss": 0.5685, + "step": 6228 + }, + { + "epoch": 0.51, + "grad_norm": 4.104950089907731, + "learning_rate": 5.147298388031271e-06, + "loss": 0.5924, + "step": 6229 + }, + { + "epoch": 0.51, + "grad_norm": 4.13134986606643, + "learning_rate": 5.145983599367729e-06, + "loss": 0.6834, + "step": 6230 + }, + { + "epoch": 0.51, + "grad_norm": 3.4496652597245774, + "learning_rate": 5.1446688006012015e-06, + "loss": 0.598, + "step": 6231 + }, + { + "epoch": 0.51, + "grad_norm": 11.41152676370373, + "learning_rate": 5.1433539918226835e-06, + "loss": 0.6215, + "step": 6232 + }, + { + "epoch": 0.51, + "grad_norm": 4.499029716593304, + "learning_rate": 5.142039173123166e-06, + "loss": 0.7062, + "step": 6233 + }, + { + "epoch": 0.51, + "grad_norm": 3.2227682760664496, + "learning_rate": 5.140724344593643e-06, + "loss": 0.7048, + "step": 6234 + }, + { + "epoch": 0.51, + "grad_norm": 4.435639172580153, + "learning_rate": 5.139409506325109e-06, + "loss": 0.641, + "step": 6235 + }, + { + "epoch": 0.51, + "grad_norm": 2.9178084630426033, + "learning_rate": 5.13809465840856e-06, + "loss": 0.5248, + "step": 6236 + }, + { + "epoch": 0.51, + "grad_norm": 3.3631548729024683, + "learning_rate": 5.1367798009349915e-06, + "loss": 0.8047, + "step": 6237 + }, + { + "epoch": 0.51, + "grad_norm": 5.136661630483342, + "learning_rate": 5.135464933995399e-06, + "loss": 0.6818, + "step": 6238 + }, + { + "epoch": 0.51, + "grad_norm": 8.300619799221906, + "learning_rate": 5.134150057680779e-06, + "loss": 0.6906, + "step": 6239 + }, + { + "epoch": 0.51, + "grad_norm": 5.771213019958129, + "learning_rate": 5.132835172082132e-06, + "loss": 0.6435, + "step": 6240 + }, + { + "epoch": 0.51, + "grad_norm": 3.7455522290771115, + "learning_rate": 5.131520277290455e-06, + "loss": 0.6725, + "step": 6241 + }, + { + "epoch": 0.51, + "grad_norm": 3.4084851599136683, + "learning_rate": 5.130205373396745e-06, + "loss": 0.6496, + "step": 6242 + }, + { + "epoch": 0.51, + "grad_norm": 3.257814757400069, + "learning_rate": 5.128890460492004e-06, + "loss": 0.5872, + "step": 6243 + }, + { + "epoch": 0.51, + "grad_norm": 3.773971269476287, + "learning_rate": 5.127575538667232e-06, + "loss": 0.7871, + "step": 6244 + }, + { + "epoch": 0.51, + "grad_norm": 4.540015770986173, + "learning_rate": 5.1262606080134295e-06, + "loss": 0.671, + "step": 6245 + }, + { + "epoch": 0.51, + "grad_norm": 4.2589624754919875, + "learning_rate": 5.124945668621597e-06, + "loss": 0.7469, + "step": 6246 + }, + { + "epoch": 0.51, + "grad_norm": 9.719587366494281, + "learning_rate": 5.123630720582738e-06, + "loss": 0.6163, + "step": 6247 + }, + { + "epoch": 0.51, + "grad_norm": 5.757925118842129, + "learning_rate": 5.122315763987855e-06, + "loss": 0.5416, + "step": 6248 + }, + { + "epoch": 0.51, + "grad_norm": 5.921969573258437, + "learning_rate": 5.121000798927951e-06, + "loss": 0.794, + "step": 6249 + }, + { + "epoch": 0.51, + "grad_norm": 48.65652883348092, + "learning_rate": 5.11968582549403e-06, + "loss": 0.6388, + "step": 6250 + }, + { + "epoch": 0.51, + "grad_norm": 4.377508895660131, + "learning_rate": 5.118370843777095e-06, + "loss": 0.747, + "step": 6251 + }, + { + "epoch": 0.51, + "grad_norm": 4.102649546117135, + "learning_rate": 5.117055853868153e-06, + "loss": 0.6836, + "step": 6252 + }, + { + "epoch": 0.51, + "grad_norm": 2.67265722083857, + "learning_rate": 5.115740855858209e-06, + "loss": 0.78, + "step": 6253 + }, + { + "epoch": 0.51, + "grad_norm": 3.5039241539403094, + "learning_rate": 5.114425849838269e-06, + "loss": 0.7522, + "step": 6254 + }, + { + "epoch": 0.51, + "grad_norm": 9.795063742297692, + "learning_rate": 5.11311083589934e-06, + "loss": 0.7253, + "step": 6255 + }, + { + "epoch": 0.51, + "grad_norm": 6.266589275738627, + "learning_rate": 5.111795814132429e-06, + "loss": 0.6696, + "step": 6256 + }, + { + "epoch": 0.51, + "grad_norm": 3.599079156741777, + "learning_rate": 5.110480784628544e-06, + "loss": 0.6816, + "step": 6257 + }, + { + "epoch": 0.51, + "grad_norm": 6.743036513672567, + "learning_rate": 5.109165747478693e-06, + "loss": 0.8133, + "step": 6258 + }, + { + "epoch": 0.51, + "grad_norm": 3.0426494728493623, + "learning_rate": 5.107850702773883e-06, + "loss": 0.6776, + "step": 6259 + }, + { + "epoch": 0.51, + "grad_norm": 4.655702174685603, + "learning_rate": 5.106535650605128e-06, + "loss": 0.7664, + "step": 6260 + }, + { + "epoch": 0.51, + "grad_norm": 3.589273000531835, + "learning_rate": 5.105220591063432e-06, + "loss": 0.6402, + "step": 6261 + }, + { + "epoch": 0.51, + "grad_norm": 3.2786743284274418, + "learning_rate": 5.103905524239811e-06, + "loss": 0.5992, + "step": 6262 + }, + { + "epoch": 0.51, + "grad_norm": 3.0283003087999196, + "learning_rate": 5.102590450225272e-06, + "loss": 0.6872, + "step": 6263 + }, + { + "epoch": 0.51, + "grad_norm": 4.256850269747036, + "learning_rate": 5.10127536911083e-06, + "loss": 0.6451, + "step": 6264 + }, + { + "epoch": 0.51, + "grad_norm": 4.061332231273521, + "learning_rate": 5.099960280987494e-06, + "loss": 0.5206, + "step": 6265 + }, + { + "epoch": 0.51, + "grad_norm": 2.4912958002412426, + "learning_rate": 5.098645185946276e-06, + "loss": 0.7174, + "step": 6266 + }, + { + "epoch": 0.51, + "grad_norm": 3.1842977055236035, + "learning_rate": 5.097330084078191e-06, + "loss": 0.7688, + "step": 6267 + }, + { + "epoch": 0.51, + "grad_norm": 3.891044302903524, + "learning_rate": 5.09601497547425e-06, + "loss": 0.6511, + "step": 6268 + }, + { + "epoch": 0.51, + "grad_norm": 3.7727471299723163, + "learning_rate": 5.09469986022547e-06, + "loss": 0.6068, + "step": 6269 + }, + { + "epoch": 0.51, + "grad_norm": 5.821934123560957, + "learning_rate": 5.093384738422863e-06, + "loss": 0.7294, + "step": 6270 + }, + { + "epoch": 0.51, + "grad_norm": 4.127401262379407, + "learning_rate": 5.092069610157443e-06, + "loss": 0.6514, + "step": 6271 + }, + { + "epoch": 0.51, + "grad_norm": 2.8825340599312703, + "learning_rate": 5.090754475520226e-06, + "loss": 0.7012, + "step": 6272 + }, + { + "epoch": 0.51, + "grad_norm": 4.643043727319916, + "learning_rate": 5.08943933460223e-06, + "loss": 0.6178, + "step": 6273 + }, + { + "epoch": 0.51, + "grad_norm": 3.564448118598244, + "learning_rate": 5.088124187494468e-06, + "loss": 0.6575, + "step": 6274 + }, + { + "epoch": 0.51, + "grad_norm": 2.9400452820553657, + "learning_rate": 5.086809034287957e-06, + "loss": 0.648, + "step": 6275 + }, + { + "epoch": 0.51, + "grad_norm": 22.39053453718653, + "learning_rate": 5.085493875073714e-06, + "loss": 0.8311, + "step": 6276 + }, + { + "epoch": 0.51, + "grad_norm": 4.249848436272152, + "learning_rate": 5.084178709942757e-06, + "loss": 0.757, + "step": 6277 + }, + { + "epoch": 0.51, + "grad_norm": 2.535573262284296, + "learning_rate": 5.082863538986103e-06, + "loss": 0.4822, + "step": 6278 + }, + { + "epoch": 0.51, + "grad_norm": 11.30432405839932, + "learning_rate": 5.0815483622947694e-06, + "loss": 0.5438, + "step": 6279 + }, + { + "epoch": 0.51, + "grad_norm": 3.2550272267903564, + "learning_rate": 5.080233179959777e-06, + "loss": 0.5133, + "step": 6280 + }, + { + "epoch": 0.51, + "grad_norm": 9.743716529476114, + "learning_rate": 5.078917992072144e-06, + "loss": 0.7112, + "step": 6281 + }, + { + "epoch": 0.51, + "grad_norm": 3.87975482317902, + "learning_rate": 5.077602798722888e-06, + "loss": 0.6653, + "step": 6282 + }, + { + "epoch": 0.51, + "grad_norm": 6.35546215040325, + "learning_rate": 5.076287600003029e-06, + "loss": 0.6723, + "step": 6283 + }, + { + "epoch": 0.51, + "grad_norm": 5.292099959035826, + "learning_rate": 5.074972396003589e-06, + "loss": 0.5966, + "step": 6284 + }, + { + "epoch": 0.51, + "grad_norm": 3.1902300120041627, + "learning_rate": 5.073657186815586e-06, + "loss": 0.7872, + "step": 6285 + }, + { + "epoch": 0.51, + "grad_norm": 3.730703839663729, + "learning_rate": 5.072341972530043e-06, + "loss": 0.6412, + "step": 6286 + }, + { + "epoch": 0.51, + "grad_norm": 24.789754365733003, + "learning_rate": 5.07102675323798e-06, + "loss": 0.6573, + "step": 6287 + }, + { + "epoch": 0.51, + "grad_norm": 5.497567759866287, + "learning_rate": 5.069711529030417e-06, + "loss": 0.7641, + "step": 6288 + }, + { + "epoch": 0.51, + "grad_norm": 4.758441457288548, + "learning_rate": 5.068396299998379e-06, + "loss": 0.5057, + "step": 6289 + }, + { + "epoch": 0.51, + "grad_norm": 4.9053592970554485, + "learning_rate": 5.0670810662328865e-06, + "loss": 0.6168, + "step": 6290 + }, + { + "epoch": 0.51, + "grad_norm": 3.2072430731205124, + "learning_rate": 5.06576582782496e-06, + "loss": 0.5961, + "step": 6291 + }, + { + "epoch": 0.51, + "grad_norm": 3.507140223952992, + "learning_rate": 5.064450584865624e-06, + "loss": 0.6724, + "step": 6292 + }, + { + "epoch": 0.51, + "grad_norm": 4.161283778576761, + "learning_rate": 5.063135337445903e-06, + "loss": 0.77, + "step": 6293 + }, + { + "epoch": 0.51, + "grad_norm": 4.679684785151928, + "learning_rate": 5.06182008565682e-06, + "loss": 0.5581, + "step": 6294 + }, + { + "epoch": 0.51, + "grad_norm": 5.334033826764989, + "learning_rate": 5.060504829589396e-06, + "loss": 0.5484, + "step": 6295 + }, + { + "epoch": 0.51, + "grad_norm": 3.447596083382297, + "learning_rate": 5.059189569334658e-06, + "loss": 0.7659, + "step": 6296 + }, + { + "epoch": 0.51, + "grad_norm": 3.3766614364355494, + "learning_rate": 5.0578743049836274e-06, + "loss": 0.6621, + "step": 6297 + }, + { + "epoch": 0.51, + "grad_norm": 3.9154768274359246, + "learning_rate": 5.056559036627333e-06, + "loss": 0.6543, + "step": 6298 + }, + { + "epoch": 0.51, + "grad_norm": 4.55916540809735, + "learning_rate": 5.055243764356795e-06, + "loss": 0.7174, + "step": 6299 + }, + { + "epoch": 0.51, + "grad_norm": 4.932127337658018, + "learning_rate": 5.053928488263043e-06, + "loss": 0.7061, + "step": 6300 + }, + { + "epoch": 0.51, + "grad_norm": 3.8712637985271288, + "learning_rate": 5.052613208437098e-06, + "loss": 0.5938, + "step": 6301 + }, + { + "epoch": 0.51, + "grad_norm": 4.222986215479166, + "learning_rate": 5.051297924969988e-06, + "loss": 0.5944, + "step": 6302 + }, + { + "epoch": 0.51, + "grad_norm": 8.873162875087925, + "learning_rate": 5.04998263795274e-06, + "loss": 0.8256, + "step": 6303 + }, + { + "epoch": 0.51, + "grad_norm": 3.3367976348226374, + "learning_rate": 5.048667347476376e-06, + "loss": 0.6353, + "step": 6304 + }, + { + "epoch": 0.51, + "grad_norm": 3.509565432265062, + "learning_rate": 5.047352053631928e-06, + "loss": 0.756, + "step": 6305 + }, + { + "epoch": 0.51, + "grad_norm": 8.346724018565215, + "learning_rate": 5.046036756510417e-06, + "loss": 0.6907, + "step": 6306 + }, + { + "epoch": 0.51, + "grad_norm": 12.57514821162449, + "learning_rate": 5.0447214562028755e-06, + "loss": 0.5992, + "step": 6307 + }, + { + "epoch": 0.51, + "grad_norm": 3.4033571896682737, + "learning_rate": 5.043406152800325e-06, + "loss": 0.6546, + "step": 6308 + }, + { + "epoch": 0.51, + "grad_norm": 2.7816531642011593, + "learning_rate": 5.042090846393797e-06, + "loss": 0.6608, + "step": 6309 + }, + { + "epoch": 0.51, + "grad_norm": 2.6417985658465364, + "learning_rate": 5.040775537074318e-06, + "loss": 0.772, + "step": 6310 + }, + { + "epoch": 0.51, + "grad_norm": 3.0714143437152384, + "learning_rate": 5.039460224932913e-06, + "loss": 0.7309, + "step": 6311 + }, + { + "epoch": 0.51, + "grad_norm": 2.467102201591127, + "learning_rate": 5.0381449100606126e-06, + "loss": 0.7047, + "step": 6312 + }, + { + "epoch": 0.51, + "grad_norm": 2.558396459846335, + "learning_rate": 5.036829592548446e-06, + "loss": 0.709, + "step": 6313 + }, + { + "epoch": 0.51, + "grad_norm": 2.9211847468439798, + "learning_rate": 5.035514272487438e-06, + "loss": 0.5507, + "step": 6314 + }, + { + "epoch": 0.51, + "grad_norm": 5.2102977468633735, + "learning_rate": 5.034198949968618e-06, + "loss": 0.6036, + "step": 6315 + }, + { + "epoch": 0.51, + "grad_norm": 9.834354421799684, + "learning_rate": 5.032883625083017e-06, + "loss": 0.7272, + "step": 6316 + }, + { + "epoch": 0.51, + "grad_norm": 2.110622541964088, + "learning_rate": 5.0315682979216615e-06, + "loss": 0.7107, + "step": 6317 + }, + { + "epoch": 0.51, + "grad_norm": 3.112445919661464, + "learning_rate": 5.0302529685755805e-06, + "loss": 0.89, + "step": 6318 + }, + { + "epoch": 0.51, + "grad_norm": 7.241950077241141, + "learning_rate": 5.028937637135804e-06, + "loss": 0.7031, + "step": 6319 + }, + { + "epoch": 0.51, + "grad_norm": 3.3302127460637414, + "learning_rate": 5.027622303693363e-06, + "loss": 0.6483, + "step": 6320 + }, + { + "epoch": 0.51, + "grad_norm": 5.674644143159602, + "learning_rate": 5.026306968339282e-06, + "loss": 0.7435, + "step": 6321 + }, + { + "epoch": 0.51, + "grad_norm": 5.2714012824227146, + "learning_rate": 5.024991631164593e-06, + "loss": 0.5843, + "step": 6322 + }, + { + "epoch": 0.51, + "grad_norm": 3.0180074359376716, + "learning_rate": 5.023676292260328e-06, + "loss": 0.6509, + "step": 6323 + }, + { + "epoch": 0.51, + "grad_norm": 5.987819418125114, + "learning_rate": 5.022360951717512e-06, + "loss": 0.6771, + "step": 6324 + }, + { + "epoch": 0.51, + "grad_norm": 3.321548488585978, + "learning_rate": 5.0210456096271775e-06, + "loss": 0.707, + "step": 6325 + }, + { + "epoch": 0.51, + "grad_norm": 4.886664745546878, + "learning_rate": 5.0197302660803545e-06, + "loss": 0.7559, + "step": 6326 + }, + { + "epoch": 0.51, + "grad_norm": 6.368649976490519, + "learning_rate": 5.018414921168075e-06, + "loss": 0.7171, + "step": 6327 + }, + { + "epoch": 0.51, + "grad_norm": 4.213867885350846, + "learning_rate": 5.017099574981366e-06, + "loss": 0.7112, + "step": 6328 + }, + { + "epoch": 0.51, + "grad_norm": 4.647568689024224, + "learning_rate": 5.015784227611258e-06, + "loss": 0.6629, + "step": 6329 + }, + { + "epoch": 0.51, + "grad_norm": 3.5510137769903243, + "learning_rate": 5.0144688791487825e-06, + "loss": 0.7343, + "step": 6330 + }, + { + "epoch": 0.51, + "grad_norm": 3.5266911442493036, + "learning_rate": 5.0131535296849684e-06, + "loss": 0.5639, + "step": 6331 + }, + { + "epoch": 0.51, + "grad_norm": 2.8045393149122053, + "learning_rate": 5.011838179310848e-06, + "loss": 0.697, + "step": 6332 + }, + { + "epoch": 0.51, + "grad_norm": 3.139989534629165, + "learning_rate": 5.010522828117452e-06, + "loss": 0.6541, + "step": 6333 + }, + { + "epoch": 0.51, + "grad_norm": 3.7592281556841236, + "learning_rate": 5.0092074761958085e-06, + "loss": 0.6587, + "step": 6334 + }, + { + "epoch": 0.51, + "grad_norm": 12.511469348877558, + "learning_rate": 5.00789212363695e-06, + "loss": 0.785, + "step": 6335 + }, + { + "epoch": 0.51, + "grad_norm": 2.352169308874297, + "learning_rate": 5.006576770531907e-06, + "loss": 0.4462, + "step": 6336 + }, + { + "epoch": 0.51, + "grad_norm": 2.3429816339815646, + "learning_rate": 5.00526141697171e-06, + "loss": 0.5871, + "step": 6337 + }, + { + "epoch": 0.51, + "grad_norm": 3.7657627080780443, + "learning_rate": 5.003946063047393e-06, + "loss": 0.76, + "step": 6338 + }, + { + "epoch": 0.51, + "grad_norm": 4.668506303265283, + "learning_rate": 5.002630708849979e-06, + "loss": 0.6603, + "step": 6339 + }, + { + "epoch": 0.51, + "grad_norm": 3.4664322265740766, + "learning_rate": 5.001315354470506e-06, + "loss": 0.7713, + "step": 6340 + }, + { + "epoch": 0.52, + "grad_norm": 3.116547238234516, + "learning_rate": 5e-06, + "loss": 0.6457, + "step": 6341 + }, + { + "epoch": 0.52, + "grad_norm": 4.46212173527785, + "learning_rate": 4.998684645529496e-06, + "loss": 0.6369, + "step": 6342 + }, + { + "epoch": 0.52, + "grad_norm": 4.526732114218656, + "learning_rate": 4.997369291150021e-06, + "loss": 0.6381, + "step": 6343 + }, + { + "epoch": 0.52, + "grad_norm": 4.4148994157716395, + "learning_rate": 4.99605393695261e-06, + "loss": 0.7904, + "step": 6344 + }, + { + "epoch": 0.52, + "grad_norm": 4.0051790405532515, + "learning_rate": 4.994738583028291e-06, + "loss": 0.6149, + "step": 6345 + }, + { + "epoch": 0.52, + "grad_norm": 6.050139412075392, + "learning_rate": 4.993423229468094e-06, + "loss": 0.7763, + "step": 6346 + }, + { + "epoch": 0.52, + "grad_norm": 3.0181206903472972, + "learning_rate": 4.992107876363051e-06, + "loss": 0.5243, + "step": 6347 + }, + { + "epoch": 0.52, + "grad_norm": 3.0108521215138953, + "learning_rate": 4.990792523804192e-06, + "loss": 0.7942, + "step": 6348 + }, + { + "epoch": 0.52, + "grad_norm": 3.103117966247781, + "learning_rate": 4.989477171882549e-06, + "loss": 0.5996, + "step": 6349 + }, + { + "epoch": 0.52, + "grad_norm": 3.270549363845947, + "learning_rate": 4.988161820689152e-06, + "loss": 0.6478, + "step": 6350 + }, + { + "epoch": 0.52, + "grad_norm": 3.6141796552617613, + "learning_rate": 4.986846470315033e-06, + "loss": 0.7283, + "step": 6351 + }, + { + "epoch": 0.52, + "grad_norm": 3.796486621535746, + "learning_rate": 4.98553112085122e-06, + "loss": 0.7291, + "step": 6352 + }, + { + "epoch": 0.52, + "grad_norm": 10.503942766991463, + "learning_rate": 4.984215772388744e-06, + "loss": 0.6979, + "step": 6353 + }, + { + "epoch": 0.52, + "grad_norm": 4.266974698845338, + "learning_rate": 4.982900425018637e-06, + "loss": 0.8384, + "step": 6354 + }, + { + "epoch": 0.52, + "grad_norm": 2.280078937194067, + "learning_rate": 4.981585078831926e-06, + "loss": 0.6545, + "step": 6355 + }, + { + "epoch": 0.52, + "grad_norm": 2.9209362279952815, + "learning_rate": 4.980269733919645e-06, + "loss": 0.666, + "step": 6356 + }, + { + "epoch": 0.52, + "grad_norm": 5.390739875520528, + "learning_rate": 4.9789543903728224e-06, + "loss": 0.6941, + "step": 6357 + }, + { + "epoch": 0.52, + "grad_norm": 3.251344286999029, + "learning_rate": 4.97763904828249e-06, + "loss": 0.6321, + "step": 6358 + }, + { + "epoch": 0.52, + "grad_norm": 3.964415277973606, + "learning_rate": 4.976323707739675e-06, + "loss": 0.8115, + "step": 6359 + }, + { + "epoch": 0.52, + "grad_norm": 2.8433745535799333, + "learning_rate": 4.975008368835408e-06, + "loss": 0.6591, + "step": 6360 + }, + { + "epoch": 0.52, + "grad_norm": 7.689166018373095, + "learning_rate": 4.973693031660719e-06, + "loss": 0.8147, + "step": 6361 + }, + { + "epoch": 0.52, + "grad_norm": 10.078192333247861, + "learning_rate": 4.972377696306639e-06, + "loss": 0.6828, + "step": 6362 + }, + { + "epoch": 0.52, + "grad_norm": 15.572383369901102, + "learning_rate": 4.971062362864196e-06, + "loss": 0.6735, + "step": 6363 + }, + { + "epoch": 0.52, + "grad_norm": 2.8575537611868427, + "learning_rate": 4.969747031424419e-06, + "loss": 0.6287, + "step": 6364 + }, + { + "epoch": 0.52, + "grad_norm": 2.806013827226555, + "learning_rate": 4.968431702078341e-06, + "loss": 0.7296, + "step": 6365 + }, + { + "epoch": 0.52, + "grad_norm": 3.6298434244722624, + "learning_rate": 4.967116374916985e-06, + "loss": 0.7312, + "step": 6366 + }, + { + "epoch": 0.52, + "grad_norm": 8.387068702084253, + "learning_rate": 4.965801050031383e-06, + "loss": 0.79, + "step": 6367 + }, + { + "epoch": 0.52, + "grad_norm": 3.6170534497596574, + "learning_rate": 4.9644857275125634e-06, + "loss": 0.655, + "step": 6368 + }, + { + "epoch": 0.52, + "grad_norm": 6.969601269120004, + "learning_rate": 4.963170407451556e-06, + "loss": 0.5889, + "step": 6369 + }, + { + "epoch": 0.52, + "grad_norm": 3.3148919557115883, + "learning_rate": 4.961855089939388e-06, + "loss": 0.7272, + "step": 6370 + }, + { + "epoch": 0.52, + "grad_norm": 3.69393805622033, + "learning_rate": 4.960539775067089e-06, + "loss": 0.6436, + "step": 6371 + }, + { + "epoch": 0.52, + "grad_norm": 3.4170437513463603, + "learning_rate": 4.959224462925685e-06, + "loss": 0.6985, + "step": 6372 + }, + { + "epoch": 0.52, + "grad_norm": 2.8674547831501527, + "learning_rate": 4.9579091536062054e-06, + "loss": 0.6502, + "step": 6373 + }, + { + "epoch": 0.52, + "grad_norm": 4.931877187777162, + "learning_rate": 4.956593847199676e-06, + "loss": 0.6063, + "step": 6374 + }, + { + "epoch": 0.52, + "grad_norm": 3.9731306731202833, + "learning_rate": 4.955278543797126e-06, + "loss": 0.6997, + "step": 6375 + }, + { + "epoch": 0.52, + "grad_norm": 4.783354846569904, + "learning_rate": 4.953963243489583e-06, + "loss": 0.7188, + "step": 6376 + }, + { + "epoch": 0.52, + "grad_norm": 16.599637167867275, + "learning_rate": 4.952647946368074e-06, + "loss": 0.7386, + "step": 6377 + }, + { + "epoch": 0.52, + "grad_norm": 4.032903713261169, + "learning_rate": 4.951332652523625e-06, + "loss": 0.7051, + "step": 6378 + }, + { + "epoch": 0.52, + "grad_norm": 3.791574009322771, + "learning_rate": 4.950017362047264e-06, + "loss": 0.6941, + "step": 6379 + }, + { + "epoch": 0.52, + "grad_norm": 5.919900037831575, + "learning_rate": 4.948702075030014e-06, + "loss": 0.7167, + "step": 6380 + }, + { + "epoch": 0.52, + "grad_norm": 3.1701979735786656, + "learning_rate": 4.947386791562904e-06, + "loss": 0.6915, + "step": 6381 + }, + { + "epoch": 0.52, + "grad_norm": 3.864771205304822, + "learning_rate": 4.946071511736959e-06, + "loss": 0.6785, + "step": 6382 + }, + { + "epoch": 0.52, + "grad_norm": 5.768537577843789, + "learning_rate": 4.944756235643205e-06, + "loss": 0.6965, + "step": 6383 + }, + { + "epoch": 0.52, + "grad_norm": 4.594881279617763, + "learning_rate": 4.943440963372668e-06, + "loss": 0.6001, + "step": 6384 + }, + { + "epoch": 0.52, + "grad_norm": 4.7638653217958105, + "learning_rate": 4.942125695016373e-06, + "loss": 0.6453, + "step": 6385 + }, + { + "epoch": 0.52, + "grad_norm": 3.828066138044884, + "learning_rate": 4.940810430665344e-06, + "loss": 0.5966, + "step": 6386 + }, + { + "epoch": 0.52, + "grad_norm": 5.59429964228511, + "learning_rate": 4.939495170410606e-06, + "loss": 0.7866, + "step": 6387 + }, + { + "epoch": 0.52, + "grad_norm": 2.3543983207238046, + "learning_rate": 4.9381799143431815e-06, + "loss": 0.8467, + "step": 6388 + }, + { + "epoch": 0.52, + "grad_norm": 5.125080758660041, + "learning_rate": 4.936864662554098e-06, + "loss": 0.6249, + "step": 6389 + }, + { + "epoch": 0.52, + "grad_norm": 3.6568281998277676, + "learning_rate": 4.935549415134376e-06, + "loss": 0.8075, + "step": 6390 + }, + { + "epoch": 0.52, + "grad_norm": 12.579054924178276, + "learning_rate": 4.934234172175043e-06, + "loss": 0.6585, + "step": 6391 + }, + { + "epoch": 0.52, + "grad_norm": 2.3209165248906003, + "learning_rate": 4.932918933767116e-06, + "loss": 0.6215, + "step": 6392 + }, + { + "epoch": 0.52, + "grad_norm": 3.676653675140861, + "learning_rate": 4.931603700001623e-06, + "loss": 0.6438, + "step": 6393 + }, + { + "epoch": 0.52, + "grad_norm": 2.73884537323697, + "learning_rate": 4.930288470969584e-06, + "loss": 0.7231, + "step": 6394 + }, + { + "epoch": 0.52, + "grad_norm": 3.9639053366386086, + "learning_rate": 4.928973246762022e-06, + "loss": 0.6294, + "step": 6395 + }, + { + "epoch": 0.52, + "grad_norm": 21.35458625668591, + "learning_rate": 4.927658027469958e-06, + "loss": 0.5958, + "step": 6396 + }, + { + "epoch": 0.52, + "grad_norm": 2.725073808178824, + "learning_rate": 4.926342813184413e-06, + "loss": 0.6966, + "step": 6397 + }, + { + "epoch": 0.52, + "grad_norm": 4.532701722618901, + "learning_rate": 4.925027603996414e-06, + "loss": 0.7644, + "step": 6398 + }, + { + "epoch": 0.52, + "grad_norm": 3.827771147301602, + "learning_rate": 4.923712399996972e-06, + "loss": 0.6024, + "step": 6399 + }, + { + "epoch": 0.52, + "grad_norm": 2.9825806708957887, + "learning_rate": 4.922397201277114e-06, + "loss": 0.5417, + "step": 6400 + }, + { + "epoch": 0.52, + "grad_norm": 2.67740885081383, + "learning_rate": 4.921082007927857e-06, + "loss": 0.7419, + "step": 6401 + }, + { + "epoch": 0.52, + "grad_norm": 3.4917724523278055, + "learning_rate": 4.919766820040224e-06, + "loss": 0.642, + "step": 6402 + }, + { + "epoch": 0.52, + "grad_norm": 2.5674161970507985, + "learning_rate": 4.9184516377052305e-06, + "loss": 0.6026, + "step": 6403 + }, + { + "epoch": 0.52, + "grad_norm": 4.725729591305165, + "learning_rate": 4.9171364610139e-06, + "loss": 0.7535, + "step": 6404 + }, + { + "epoch": 0.52, + "grad_norm": 2.948880072751482, + "learning_rate": 4.915821290057245e-06, + "loss": 0.7515, + "step": 6405 + }, + { + "epoch": 0.52, + "grad_norm": 6.568096667210566, + "learning_rate": 4.914506124926288e-06, + "loss": 0.6559, + "step": 6406 + }, + { + "epoch": 0.52, + "grad_norm": 2.6501606402728517, + "learning_rate": 4.913190965712045e-06, + "loss": 0.6338, + "step": 6407 + }, + { + "epoch": 0.52, + "grad_norm": 6.846070694646515, + "learning_rate": 4.911875812505533e-06, + "loss": 0.668, + "step": 6408 + }, + { + "epoch": 0.52, + "grad_norm": 9.501851312125194, + "learning_rate": 4.910560665397772e-06, + "loss": 0.7569, + "step": 6409 + }, + { + "epoch": 0.52, + "grad_norm": 3.5569555940736572, + "learning_rate": 4.909245524479774e-06, + "loss": 0.7844, + "step": 6410 + }, + { + "epoch": 0.52, + "grad_norm": 3.2054358423086287, + "learning_rate": 4.907930389842558e-06, + "loss": 0.6693, + "step": 6411 + }, + { + "epoch": 0.52, + "grad_norm": 8.330526161666796, + "learning_rate": 4.906615261577139e-06, + "loss": 0.6321, + "step": 6412 + }, + { + "epoch": 0.52, + "grad_norm": 2.2449889419161257, + "learning_rate": 4.905300139774532e-06, + "loss": 0.7846, + "step": 6413 + }, + { + "epoch": 0.52, + "grad_norm": 3.0590300277771174, + "learning_rate": 4.903985024525751e-06, + "loss": 0.6838, + "step": 6414 + }, + { + "epoch": 0.52, + "grad_norm": 2.9617294160582412, + "learning_rate": 4.90266991592181e-06, + "loss": 0.7916, + "step": 6415 + }, + { + "epoch": 0.52, + "grad_norm": 4.442971007173976, + "learning_rate": 4.901354814053724e-06, + "loss": 0.7019, + "step": 6416 + }, + { + "epoch": 0.52, + "grad_norm": 4.203682970791329, + "learning_rate": 4.9000397190125076e-06, + "loss": 0.6891, + "step": 6417 + }, + { + "epoch": 0.52, + "grad_norm": 3.302478574245904, + "learning_rate": 4.898724630889172e-06, + "loss": 0.5293, + "step": 6418 + }, + { + "epoch": 0.52, + "grad_norm": 2.7789041411074393, + "learning_rate": 4.897409549774729e-06, + "loss": 0.6709, + "step": 6419 + }, + { + "epoch": 0.52, + "grad_norm": 3.645925886226135, + "learning_rate": 4.896094475760191e-06, + "loss": 0.7505, + "step": 6420 + }, + { + "epoch": 0.52, + "grad_norm": 3.1945467846235687, + "learning_rate": 4.8947794089365685e-06, + "loss": 0.7228, + "step": 6421 + }, + { + "epoch": 0.52, + "grad_norm": 4.271069117326643, + "learning_rate": 4.893464349394874e-06, + "loss": 0.7548, + "step": 6422 + }, + { + "epoch": 0.52, + "grad_norm": 4.694655446506869, + "learning_rate": 4.892149297226118e-06, + "loss": 0.5366, + "step": 6423 + }, + { + "epoch": 0.52, + "grad_norm": 11.364478738264422, + "learning_rate": 4.890834252521311e-06, + "loss": 0.7314, + "step": 6424 + }, + { + "epoch": 0.52, + "grad_norm": 6.510783391621584, + "learning_rate": 4.889519215371458e-06, + "loss": 0.638, + "step": 6425 + }, + { + "epoch": 0.52, + "grad_norm": 3.2258567334754615, + "learning_rate": 4.888204185867572e-06, + "loss": 0.5635, + "step": 6426 + }, + { + "epoch": 0.52, + "grad_norm": 3.224529721452054, + "learning_rate": 4.886889164100661e-06, + "loss": 0.6794, + "step": 6427 + }, + { + "epoch": 0.52, + "grad_norm": 3.362427928638376, + "learning_rate": 4.885574150161732e-06, + "loss": 0.724, + "step": 6428 + }, + { + "epoch": 0.52, + "grad_norm": 4.7204437395199355, + "learning_rate": 4.884259144141792e-06, + "loss": 0.6561, + "step": 6429 + }, + { + "epoch": 0.52, + "grad_norm": 2.5394195204765944, + "learning_rate": 4.882944146131848e-06, + "loss": 0.7797, + "step": 6430 + }, + { + "epoch": 0.52, + "grad_norm": 6.980790558032294, + "learning_rate": 4.881629156222907e-06, + "loss": 0.556, + "step": 6431 + }, + { + "epoch": 0.52, + "grad_norm": 2.99893173919464, + "learning_rate": 4.880314174505972e-06, + "loss": 0.6949, + "step": 6432 + }, + { + "epoch": 0.52, + "grad_norm": 2.9951376458742023, + "learning_rate": 4.8789992010720505e-06, + "loss": 0.766, + "step": 6433 + }, + { + "epoch": 0.52, + "grad_norm": 2.1760280912025802, + "learning_rate": 4.877684236012147e-06, + "loss": 0.6768, + "step": 6434 + }, + { + "epoch": 0.52, + "grad_norm": 2.4190528921361367, + "learning_rate": 4.876369279417263e-06, + "loss": 0.6981, + "step": 6435 + }, + { + "epoch": 0.52, + "grad_norm": 4.195995133865271, + "learning_rate": 4.875054331378404e-06, + "loss": 0.7759, + "step": 6436 + }, + { + "epoch": 0.52, + "grad_norm": 4.798954939981184, + "learning_rate": 4.873739391986571e-06, + "loss": 0.6341, + "step": 6437 + }, + { + "epoch": 0.52, + "grad_norm": 2.822775238507823, + "learning_rate": 4.87242446133277e-06, + "loss": 0.6786, + "step": 6438 + }, + { + "epoch": 0.52, + "grad_norm": 3.200575302817755, + "learning_rate": 4.871109539507998e-06, + "loss": 0.6178, + "step": 6439 + }, + { + "epoch": 0.52, + "grad_norm": 3.0001511396021154, + "learning_rate": 4.869794626603256e-06, + "loss": 0.7569, + "step": 6440 + }, + { + "epoch": 0.52, + "grad_norm": 3.8045909348439313, + "learning_rate": 4.868479722709547e-06, + "loss": 0.6831, + "step": 6441 + }, + { + "epoch": 0.52, + "grad_norm": 2.5401181823660854, + "learning_rate": 4.86716482791787e-06, + "loss": 0.596, + "step": 6442 + }, + { + "epoch": 0.52, + "grad_norm": 6.692407526298747, + "learning_rate": 4.8658499423192215e-06, + "loss": 0.676, + "step": 6443 + }, + { + "epoch": 0.52, + "grad_norm": 8.034122487007481, + "learning_rate": 4.864535066004604e-06, + "loss": 0.723, + "step": 6444 + }, + { + "epoch": 0.52, + "grad_norm": 2.8709890589797684, + "learning_rate": 4.863220199065011e-06, + "loss": 0.8244, + "step": 6445 + }, + { + "epoch": 0.52, + "grad_norm": 3.1541197174179763, + "learning_rate": 4.861905341591442e-06, + "loss": 0.6012, + "step": 6446 + }, + { + "epoch": 0.52, + "grad_norm": 2.541868841247612, + "learning_rate": 4.860590493674892e-06, + "loss": 0.7638, + "step": 6447 + }, + { + "epoch": 0.52, + "grad_norm": 2.2875331255066396, + "learning_rate": 4.859275655406358e-06, + "loss": 0.7176, + "step": 6448 + }, + { + "epoch": 0.52, + "grad_norm": 3.389296275858009, + "learning_rate": 4.857960826876835e-06, + "loss": 0.7971, + "step": 6449 + }, + { + "epoch": 0.52, + "grad_norm": 7.608012479350219, + "learning_rate": 4.856646008177318e-06, + "loss": 0.6686, + "step": 6450 + }, + { + "epoch": 0.52, + "grad_norm": 7.845934367671523, + "learning_rate": 4.855331199398799e-06, + "loss": 0.6883, + "step": 6451 + }, + { + "epoch": 0.52, + "grad_norm": 4.324055416673929, + "learning_rate": 4.8540164006322735e-06, + "loss": 0.6225, + "step": 6452 + }, + { + "epoch": 0.52, + "grad_norm": 4.999141946174273, + "learning_rate": 4.8527016119687306e-06, + "loss": 0.76, + "step": 6453 + }, + { + "epoch": 0.52, + "grad_norm": 3.2953791511615615, + "learning_rate": 4.851386833499163e-06, + "loss": 0.6914, + "step": 6454 + }, + { + "epoch": 0.52, + "grad_norm": 5.370327795119699, + "learning_rate": 4.850072065314563e-06, + "loss": 0.8209, + "step": 6455 + }, + { + "epoch": 0.52, + "grad_norm": 2.9202756624324695, + "learning_rate": 4.8487573075059195e-06, + "loss": 0.8148, + "step": 6456 + }, + { + "epoch": 0.52, + "grad_norm": 3.019099815294026, + "learning_rate": 4.847442560164226e-06, + "loss": 0.6801, + "step": 6457 + }, + { + "epoch": 0.52, + "grad_norm": 3.139574476738395, + "learning_rate": 4.846127823380464e-06, + "loss": 0.6225, + "step": 6458 + }, + { + "epoch": 0.52, + "grad_norm": 7.569503691887198, + "learning_rate": 4.844813097245628e-06, + "loss": 0.6775, + "step": 6459 + }, + { + "epoch": 0.52, + "grad_norm": 4.819630963707262, + "learning_rate": 4.843498381850701e-06, + "loss": 0.7349, + "step": 6460 + }, + { + "epoch": 0.52, + "grad_norm": 4.241141261829069, + "learning_rate": 4.842183677286671e-06, + "loss": 0.7548, + "step": 6461 + }, + { + "epoch": 0.52, + "grad_norm": 2.456007749685355, + "learning_rate": 4.840868983644525e-06, + "loss": 0.6758, + "step": 6462 + }, + { + "epoch": 0.52, + "grad_norm": 4.538681786602927, + "learning_rate": 4.839554301015247e-06, + "loss": 0.6927, + "step": 6463 + }, + { + "epoch": 0.53, + "grad_norm": 3.4558697123770146, + "learning_rate": 4.838239629489824e-06, + "loss": 0.6596, + "step": 6464 + }, + { + "epoch": 0.53, + "grad_norm": 25.283376049394857, + "learning_rate": 4.836924969159234e-06, + "loss": 0.5828, + "step": 6465 + }, + { + "epoch": 0.53, + "grad_norm": 5.535437586532406, + "learning_rate": 4.835610320114465e-06, + "loss": 0.9513, + "step": 6466 + }, + { + "epoch": 0.53, + "grad_norm": 3.4836105477808608, + "learning_rate": 4.834295682446496e-06, + "loss": 0.7487, + "step": 6467 + }, + { + "epoch": 0.53, + "grad_norm": 14.654930251043227, + "learning_rate": 4.83298105624631e-06, + "loss": 0.5394, + "step": 6468 + }, + { + "epoch": 0.53, + "grad_norm": 2.822968386689163, + "learning_rate": 4.831666441604884e-06, + "loss": 0.7181, + "step": 6469 + }, + { + "epoch": 0.53, + "grad_norm": 2.9269271026148456, + "learning_rate": 4.830351838613202e-06, + "loss": 0.7548, + "step": 6470 + }, + { + "epoch": 0.53, + "grad_norm": 4.765919535555958, + "learning_rate": 4.829037247362243e-06, + "loss": 0.6214, + "step": 6471 + }, + { + "epoch": 0.53, + "grad_norm": 3.09388891730253, + "learning_rate": 4.82772266794298e-06, + "loss": 0.6693, + "step": 6472 + }, + { + "epoch": 0.53, + "grad_norm": 3.44704271006444, + "learning_rate": 4.826408100446393e-06, + "loss": 0.7739, + "step": 6473 + }, + { + "epoch": 0.53, + "grad_norm": 21.55790530469095, + "learning_rate": 4.82509354496346e-06, + "loss": 0.7431, + "step": 6474 + }, + { + "epoch": 0.53, + "grad_norm": 2.912877650204532, + "learning_rate": 4.823779001585155e-06, + "loss": 0.7069, + "step": 6475 + }, + { + "epoch": 0.53, + "grad_norm": 2.766901036719601, + "learning_rate": 4.822464470402452e-06, + "loss": 0.5794, + "step": 6476 + }, + { + "epoch": 0.53, + "grad_norm": 3.1699762502755635, + "learning_rate": 4.821149951506327e-06, + "loss": 0.7737, + "step": 6477 + }, + { + "epoch": 0.53, + "grad_norm": 2.714422906616694, + "learning_rate": 4.81983544498775e-06, + "loss": 0.7907, + "step": 6478 + }, + { + "epoch": 0.53, + "grad_norm": 3.2432875009933215, + "learning_rate": 4.818520950937694e-06, + "loss": 0.7728, + "step": 6479 + }, + { + "epoch": 0.53, + "grad_norm": 6.556282142958906, + "learning_rate": 4.817206469447132e-06, + "loss": 0.5339, + "step": 6480 + }, + { + "epoch": 0.53, + "grad_norm": 2.2193288163407465, + "learning_rate": 4.815892000607032e-06, + "loss": 0.7085, + "step": 6481 + }, + { + "epoch": 0.53, + "grad_norm": 4.666085027293924, + "learning_rate": 4.814577544508367e-06, + "loss": 0.7162, + "step": 6482 + }, + { + "epoch": 0.53, + "grad_norm": 8.412608825562815, + "learning_rate": 4.813263101242101e-06, + "loss": 0.6844, + "step": 6483 + }, + { + "epoch": 0.53, + "grad_norm": 3.160818637727029, + "learning_rate": 4.811948670899207e-06, + "loss": 0.6893, + "step": 6484 + }, + { + "epoch": 0.53, + "grad_norm": 2.5908341108149537, + "learning_rate": 4.810634253570647e-06, + "loss": 0.6023, + "step": 6485 + }, + { + "epoch": 0.53, + "grad_norm": 5.646059836988076, + "learning_rate": 4.8093198493473896e-06, + "loss": 0.8402, + "step": 6486 + }, + { + "epoch": 0.53, + "grad_norm": 3.01627079388889, + "learning_rate": 4.8080054583203975e-06, + "loss": 0.6908, + "step": 6487 + }, + { + "epoch": 0.53, + "grad_norm": 2.444250384858454, + "learning_rate": 4.8066910805806384e-06, + "loss": 0.6558, + "step": 6488 + }, + { + "epoch": 0.53, + "grad_norm": 3.264325368048423, + "learning_rate": 4.805376716219073e-06, + "loss": 0.6705, + "step": 6489 + }, + { + "epoch": 0.53, + "grad_norm": 8.61116645762462, + "learning_rate": 4.804062365326665e-06, + "loss": 0.6051, + "step": 6490 + }, + { + "epoch": 0.53, + "grad_norm": 3.7697763690659403, + "learning_rate": 4.802748027994376e-06, + "loss": 0.6951, + "step": 6491 + }, + { + "epoch": 0.53, + "grad_norm": 4.9782880694223595, + "learning_rate": 4.801433704313164e-06, + "loss": 0.6406, + "step": 6492 + }, + { + "epoch": 0.53, + "grad_norm": 2.9918777949675697, + "learning_rate": 4.80011939437399e-06, + "loss": 0.7993, + "step": 6493 + }, + { + "epoch": 0.53, + "grad_norm": 2.1607958375086382, + "learning_rate": 4.7988050982678125e-06, + "loss": 0.7583, + "step": 6494 + }, + { + "epoch": 0.53, + "grad_norm": 2.9358014731111868, + "learning_rate": 4.797490816085588e-06, + "loss": 0.5487, + "step": 6495 + }, + { + "epoch": 0.53, + "grad_norm": 2.6443722327049284, + "learning_rate": 4.796176547918276e-06, + "loss": 0.6268, + "step": 6496 + }, + { + "epoch": 0.53, + "grad_norm": 3.5942155918658782, + "learning_rate": 4.7948622938568305e-06, + "loss": 0.6101, + "step": 6497 + }, + { + "epoch": 0.53, + "grad_norm": 7.243264155984004, + "learning_rate": 4.793548053992205e-06, + "loss": 0.5971, + "step": 6498 + }, + { + "epoch": 0.53, + "grad_norm": 3.686805890154354, + "learning_rate": 4.792233828415353e-06, + "loss": 0.6365, + "step": 6499 + }, + { + "epoch": 0.53, + "grad_norm": 3.018733051877643, + "learning_rate": 4.79091961721723e-06, + "loss": 0.7762, + "step": 6500 + }, + { + "epoch": 0.53, + "grad_norm": 3.05475750340497, + "learning_rate": 4.789605420488785e-06, + "loss": 0.7222, + "step": 6501 + }, + { + "epoch": 0.53, + "grad_norm": 2.725972636727803, + "learning_rate": 4.78829123832097e-06, + "loss": 0.6777, + "step": 6502 + }, + { + "epoch": 0.53, + "grad_norm": 4.0479363527494066, + "learning_rate": 4.786977070804733e-06, + "loss": 0.7731, + "step": 6503 + }, + { + "epoch": 0.53, + "grad_norm": 4.378851954138839, + "learning_rate": 4.785662918031027e-06, + "loss": 0.6143, + "step": 6504 + }, + { + "epoch": 0.53, + "grad_norm": 5.478974780435422, + "learning_rate": 4.784348780090795e-06, + "loss": 0.6099, + "step": 6505 + }, + { + "epoch": 0.53, + "grad_norm": 2.836956082129708, + "learning_rate": 4.783034657074985e-06, + "loss": 0.6748, + "step": 6506 + }, + { + "epoch": 0.53, + "grad_norm": 2.1393955414564694, + "learning_rate": 4.781720549074543e-06, + "loss": 0.6672, + "step": 6507 + }, + { + "epoch": 0.53, + "grad_norm": 6.353242854951079, + "learning_rate": 4.7804064561804135e-06, + "loss": 0.6131, + "step": 6508 + }, + { + "epoch": 0.53, + "grad_norm": 4.506960355460957, + "learning_rate": 4.779092378483539e-06, + "loss": 0.8057, + "step": 6509 + }, + { + "epoch": 0.53, + "grad_norm": 2.542123021068329, + "learning_rate": 4.777778316074866e-06, + "loss": 0.5761, + "step": 6510 + }, + { + "epoch": 0.53, + "grad_norm": 2.7918381080145176, + "learning_rate": 4.77646426904533e-06, + "loss": 0.7661, + "step": 6511 + }, + { + "epoch": 0.53, + "grad_norm": 3.7901224160605795, + "learning_rate": 4.775150237485874e-06, + "loss": 0.6958, + "step": 6512 + }, + { + "epoch": 0.53, + "grad_norm": 5.636691397391061, + "learning_rate": 4.773836221487437e-06, + "loss": 0.7174, + "step": 6513 + }, + { + "epoch": 0.53, + "grad_norm": 3.9913918445078567, + "learning_rate": 4.772522221140959e-06, + "loss": 0.6308, + "step": 6514 + }, + { + "epoch": 0.53, + "grad_norm": 2.3364584673615125, + "learning_rate": 4.7712082365373755e-06, + "loss": 0.6052, + "step": 6515 + }, + { + "epoch": 0.53, + "grad_norm": 4.149080661712527, + "learning_rate": 4.769894267767621e-06, + "loss": 0.6166, + "step": 6516 + }, + { + "epoch": 0.53, + "grad_norm": 5.085614011368114, + "learning_rate": 4.768580314922635e-06, + "loss": 0.7956, + "step": 6517 + }, + { + "epoch": 0.53, + "grad_norm": 3.05738405009252, + "learning_rate": 4.767266378093346e-06, + "loss": 0.7635, + "step": 6518 + }, + { + "epoch": 0.53, + "grad_norm": 3.487911861058963, + "learning_rate": 4.765952457370689e-06, + "loss": 0.6206, + "step": 6519 + }, + { + "epoch": 0.53, + "grad_norm": 2.6211609943248395, + "learning_rate": 4.7646385528455966e-06, + "loss": 0.6759, + "step": 6520 + }, + { + "epoch": 0.53, + "grad_norm": 2.5451707265201944, + "learning_rate": 4.763324664608997e-06, + "loss": 0.8192, + "step": 6521 + }, + { + "epoch": 0.53, + "grad_norm": 3.4016162812318678, + "learning_rate": 4.762010792751823e-06, + "loss": 0.6082, + "step": 6522 + }, + { + "epoch": 0.53, + "grad_norm": 2.947345024466726, + "learning_rate": 4.760696937364999e-06, + "loss": 0.6572, + "step": 6523 + }, + { + "epoch": 0.53, + "grad_norm": 4.52877042625775, + "learning_rate": 4.759383098539454e-06, + "loss": 0.6485, + "step": 6524 + }, + { + "epoch": 0.53, + "grad_norm": 8.272560283063008, + "learning_rate": 4.758069276366115e-06, + "loss": 0.5812, + "step": 6525 + }, + { + "epoch": 0.53, + "grad_norm": 3.8984186352634573, + "learning_rate": 4.756755470935903e-06, + "loss": 0.6979, + "step": 6526 + }, + { + "epoch": 0.53, + "grad_norm": 8.797019820520363, + "learning_rate": 4.755441682339745e-06, + "loss": 0.6657, + "step": 6527 + }, + { + "epoch": 0.53, + "grad_norm": 3.242159009500934, + "learning_rate": 4.754127910668562e-06, + "loss": 0.8229, + "step": 6528 + }, + { + "epoch": 0.53, + "grad_norm": 2.198035289935515, + "learning_rate": 4.752814156013276e-06, + "loss": 0.6008, + "step": 6529 + }, + { + "epoch": 0.53, + "grad_norm": 4.970999595704256, + "learning_rate": 4.751500418464809e-06, + "loss": 0.9307, + "step": 6530 + }, + { + "epoch": 0.53, + "grad_norm": 7.027451664656361, + "learning_rate": 4.7501866981140755e-06, + "loss": 0.7864, + "step": 6531 + }, + { + "epoch": 0.53, + "grad_norm": 4.766871976303786, + "learning_rate": 4.748872995051996e-06, + "loss": 0.5974, + "step": 6532 + }, + { + "epoch": 0.53, + "grad_norm": 5.036349311241271, + "learning_rate": 4.747559309369486e-06, + "loss": 0.6499, + "step": 6533 + }, + { + "epoch": 0.53, + "grad_norm": 3.0628455602518243, + "learning_rate": 4.746245641157461e-06, + "loss": 0.7212, + "step": 6534 + }, + { + "epoch": 0.53, + "grad_norm": 3.6850060006118017, + "learning_rate": 4.744931990506836e-06, + "loss": 0.7501, + "step": 6535 + }, + { + "epoch": 0.53, + "grad_norm": 2.348277999571458, + "learning_rate": 4.743618357508522e-06, + "loss": 0.6835, + "step": 6536 + }, + { + "epoch": 0.53, + "grad_norm": 3.5383264410923703, + "learning_rate": 4.742304742253436e-06, + "loss": 0.7248, + "step": 6537 + }, + { + "epoch": 0.53, + "grad_norm": 5.0274065182545815, + "learning_rate": 4.740991144832481e-06, + "loss": 0.7053, + "step": 6538 + }, + { + "epoch": 0.53, + "grad_norm": 6.733311242022406, + "learning_rate": 4.73967756533657e-06, + "loss": 0.7506, + "step": 6539 + }, + { + "epoch": 0.53, + "grad_norm": 2.4614657840720757, + "learning_rate": 4.738364003856611e-06, + "loss": 0.7246, + "step": 6540 + }, + { + "epoch": 0.53, + "grad_norm": 3.984477315347523, + "learning_rate": 4.73705046048351e-06, + "loss": 0.7069, + "step": 6541 + }, + { + "epoch": 0.53, + "grad_norm": 9.685004703668936, + "learning_rate": 4.735736935308173e-06, + "loss": 0.6303, + "step": 6542 + }, + { + "epoch": 0.53, + "grad_norm": 4.139695765932749, + "learning_rate": 4.734423428421504e-06, + "loss": 0.7564, + "step": 6543 + }, + { + "epoch": 0.53, + "grad_norm": 3.3915270732442733, + "learning_rate": 4.733109939914407e-06, + "loss": 0.6931, + "step": 6544 + }, + { + "epoch": 0.53, + "grad_norm": 2.7367909329885274, + "learning_rate": 4.731796469877781e-06, + "loss": 0.5984, + "step": 6545 + }, + { + "epoch": 0.53, + "grad_norm": 2.4191448143982903, + "learning_rate": 4.7304830184025286e-06, + "loss": 0.7552, + "step": 6546 + }, + { + "epoch": 0.53, + "grad_norm": 3.2170464735005817, + "learning_rate": 4.729169585579549e-06, + "loss": 0.6015, + "step": 6547 + }, + { + "epoch": 0.53, + "grad_norm": 6.304314599433871, + "learning_rate": 4.727856171499738e-06, + "loss": 0.5193, + "step": 6548 + }, + { + "epoch": 0.53, + "grad_norm": 3.4858790673430864, + "learning_rate": 4.7265427762539936e-06, + "loss": 0.7098, + "step": 6549 + }, + { + "epoch": 0.53, + "grad_norm": 2.7153585677812964, + "learning_rate": 4.725229399933214e-06, + "loss": 0.7443, + "step": 6550 + }, + { + "epoch": 0.53, + "grad_norm": 2.8377986557990593, + "learning_rate": 4.723916042628287e-06, + "loss": 0.7858, + "step": 6551 + }, + { + "epoch": 0.53, + "grad_norm": 5.472524227480369, + "learning_rate": 4.722602704430108e-06, + "loss": 0.5207, + "step": 6552 + }, + { + "epoch": 0.53, + "grad_norm": 3.2512519256935635, + "learning_rate": 4.721289385429569e-06, + "loss": 0.8032, + "step": 6553 + }, + { + "epoch": 0.53, + "grad_norm": 3.151133617184783, + "learning_rate": 4.71997608571756e-06, + "loss": 0.731, + "step": 6554 + }, + { + "epoch": 0.53, + "grad_norm": 3.2898644324268327, + "learning_rate": 4.71866280538497e-06, + "loss": 0.7069, + "step": 6555 + }, + { + "epoch": 0.53, + "grad_norm": 3.6139589900965827, + "learning_rate": 4.717349544522683e-06, + "loss": 0.6137, + "step": 6556 + }, + { + "epoch": 0.53, + "grad_norm": 3.302476407478498, + "learning_rate": 4.71603630322159e-06, + "loss": 0.8684, + "step": 6557 + }, + { + "epoch": 0.53, + "grad_norm": 8.656895751033002, + "learning_rate": 4.714723081572571e-06, + "loss": 0.5347, + "step": 6558 + }, + { + "epoch": 0.53, + "grad_norm": 3.565183857421903, + "learning_rate": 4.71340987966651e-06, + "loss": 0.6073, + "step": 6559 + }, + { + "epoch": 0.53, + "grad_norm": 2.5623534762364346, + "learning_rate": 4.7120966975942905e-06, + "loss": 0.6609, + "step": 6560 + }, + { + "epoch": 0.53, + "grad_norm": 6.301720780624858, + "learning_rate": 4.710783535446793e-06, + "loss": 0.7892, + "step": 6561 + }, + { + "epoch": 0.53, + "grad_norm": 4.438664267025778, + "learning_rate": 4.709470393314896e-06, + "loss": 0.6616, + "step": 6562 + }, + { + "epoch": 0.53, + "grad_norm": 3.927502508577626, + "learning_rate": 4.708157271289477e-06, + "loss": 0.6009, + "step": 6563 + }, + { + "epoch": 0.53, + "grad_norm": 3.1289394755063817, + "learning_rate": 4.706844169461413e-06, + "loss": 0.6883, + "step": 6564 + }, + { + "epoch": 0.53, + "grad_norm": 7.028052298139913, + "learning_rate": 4.705531087921578e-06, + "loss": 0.6324, + "step": 6565 + }, + { + "epoch": 0.53, + "grad_norm": 3.745045886393011, + "learning_rate": 4.7042180267608445e-06, + "loss": 0.8369, + "step": 6566 + }, + { + "epoch": 0.53, + "grad_norm": 8.08031999769109, + "learning_rate": 4.7029049860700865e-06, + "loss": 0.6891, + "step": 6567 + }, + { + "epoch": 0.53, + "grad_norm": 13.56575456147451, + "learning_rate": 4.701591965940174e-06, + "loss": 0.6927, + "step": 6568 + }, + { + "epoch": 0.53, + "grad_norm": 2.9558069060613477, + "learning_rate": 4.700278966461977e-06, + "loss": 0.7404, + "step": 6569 + }, + { + "epoch": 0.53, + "grad_norm": 6.039738264820022, + "learning_rate": 4.6989659877263636e-06, + "loss": 0.6192, + "step": 6570 + }, + { + "epoch": 0.53, + "grad_norm": 4.63960107421065, + "learning_rate": 4.697653029824198e-06, + "loss": 0.819, + "step": 6571 + }, + { + "epoch": 0.53, + "grad_norm": 2.2811115729175806, + "learning_rate": 4.696340092846347e-06, + "loss": 0.6627, + "step": 6572 + }, + { + "epoch": 0.53, + "grad_norm": 2.3234269918797867, + "learning_rate": 4.695027176883673e-06, + "loss": 0.5731, + "step": 6573 + }, + { + "epoch": 0.53, + "grad_norm": 4.5493983203735295, + "learning_rate": 4.693714282027039e-06, + "loss": 0.7915, + "step": 6574 + }, + { + "epoch": 0.53, + "grad_norm": 3.8312371708493145, + "learning_rate": 4.692401408367305e-06, + "loss": 0.7356, + "step": 6575 + }, + { + "epoch": 0.53, + "grad_norm": 2.4848385300081026, + "learning_rate": 4.69108855599533e-06, + "loss": 0.7224, + "step": 6576 + }, + { + "epoch": 0.53, + "grad_norm": 2.7225411823837553, + "learning_rate": 4.689775725001974e-06, + "loss": 0.6242, + "step": 6577 + }, + { + "epoch": 0.53, + "grad_norm": 4.53432021661694, + "learning_rate": 4.6884629154780895e-06, + "loss": 0.7509, + "step": 6578 + }, + { + "epoch": 0.53, + "grad_norm": 3.9830787397835494, + "learning_rate": 4.6871501275145325e-06, + "loss": 0.7708, + "step": 6579 + }, + { + "epoch": 0.53, + "grad_norm": 3.4815850177678187, + "learning_rate": 4.6858373612021575e-06, + "loss": 0.6652, + "step": 6580 + }, + { + "epoch": 0.53, + "grad_norm": 4.464786657250361, + "learning_rate": 4.684524616631815e-06, + "loss": 0.7656, + "step": 6581 + }, + { + "epoch": 0.53, + "grad_norm": 4.32132457756276, + "learning_rate": 4.683211893894355e-06, + "loss": 0.7037, + "step": 6582 + }, + { + "epoch": 0.53, + "grad_norm": 3.6552625024597822, + "learning_rate": 4.681899193080628e-06, + "loss": 0.6821, + "step": 6583 + }, + { + "epoch": 0.53, + "grad_norm": 3.1795217780737413, + "learning_rate": 4.680586514281479e-06, + "loss": 0.7767, + "step": 6584 + }, + { + "epoch": 0.53, + "grad_norm": 2.6141349354942442, + "learning_rate": 4.679273857587753e-06, + "loss": 0.6681, + "step": 6585 + }, + { + "epoch": 0.53, + "grad_norm": 2.750261797300128, + "learning_rate": 4.677961223090297e-06, + "loss": 0.7594, + "step": 6586 + }, + { + "epoch": 0.53, + "grad_norm": 2.5236443826553225, + "learning_rate": 4.6766486108799505e-06, + "loss": 0.5148, + "step": 6587 + }, + { + "epoch": 0.54, + "grad_norm": 2.4834648525229737, + "learning_rate": 4.6753360210475576e-06, + "loss": 0.7451, + "step": 6588 + }, + { + "epoch": 0.54, + "grad_norm": 4.8551143754188475, + "learning_rate": 4.674023453683956e-06, + "loss": 0.6633, + "step": 6589 + }, + { + "epoch": 0.54, + "grad_norm": 2.9600284132004546, + "learning_rate": 4.672710908879985e-06, + "loss": 0.7555, + "step": 6590 + }, + { + "epoch": 0.54, + "grad_norm": 3.061178459461667, + "learning_rate": 4.671398386726479e-06, + "loss": 0.5683, + "step": 6591 + }, + { + "epoch": 0.54, + "grad_norm": 2.6049533329992016, + "learning_rate": 4.670085887314273e-06, + "loss": 0.656, + "step": 6592 + }, + { + "epoch": 0.54, + "grad_norm": 4.965331113963211, + "learning_rate": 4.6687734107342005e-06, + "loss": 0.7669, + "step": 6593 + }, + { + "epoch": 0.54, + "grad_norm": 3.8621287924335426, + "learning_rate": 4.667460957077094e-06, + "loss": 0.6622, + "step": 6594 + }, + { + "epoch": 0.54, + "grad_norm": 27.5842062742614, + "learning_rate": 4.666148526433784e-06, + "loss": 0.663, + "step": 6595 + }, + { + "epoch": 0.54, + "grad_norm": 3.199525539100665, + "learning_rate": 4.6648361188950976e-06, + "loss": 0.5194, + "step": 6596 + }, + { + "epoch": 0.54, + "grad_norm": 4.148987522027259, + "learning_rate": 4.663523734551863e-06, + "loss": 0.7305, + "step": 6597 + }, + { + "epoch": 0.54, + "grad_norm": 3.052734178415257, + "learning_rate": 4.662211373494904e-06, + "loss": 0.7156, + "step": 6598 + }, + { + "epoch": 0.54, + "grad_norm": 4.4085581095274105, + "learning_rate": 4.6608990358150444e-06, + "loss": 0.6515, + "step": 6599 + }, + { + "epoch": 0.54, + "grad_norm": 20.84002359512866, + "learning_rate": 4.659586721603107e-06, + "loss": 0.5971, + "step": 6600 + }, + { + "epoch": 0.54, + "grad_norm": 8.352703701213258, + "learning_rate": 4.658274430949911e-06, + "loss": 0.6749, + "step": 6601 + }, + { + "epoch": 0.54, + "grad_norm": 2.1533922893254513, + "learning_rate": 4.656962163946276e-06, + "loss": 0.6209, + "step": 6602 + }, + { + "epoch": 0.54, + "grad_norm": 3.661626394672807, + "learning_rate": 4.655649920683022e-06, + "loss": 0.8312, + "step": 6603 + }, + { + "epoch": 0.54, + "grad_norm": 2.788055480383149, + "learning_rate": 4.654337701250959e-06, + "loss": 0.656, + "step": 6604 + }, + { + "epoch": 0.54, + "grad_norm": 3.1361905896761866, + "learning_rate": 4.6530255057409055e-06, + "loss": 0.5552, + "step": 6605 + }, + { + "epoch": 0.54, + "grad_norm": 2.6181048594116105, + "learning_rate": 4.6517133342436695e-06, + "loss": 0.7074, + "step": 6606 + }, + { + "epoch": 0.54, + "grad_norm": 3.2612469084888334, + "learning_rate": 4.650401186850064e-06, + "loss": 0.7117, + "step": 6607 + }, + { + "epoch": 0.54, + "grad_norm": 3.015253742433555, + "learning_rate": 4.649089063650898e-06, + "loss": 0.7986, + "step": 6608 + }, + { + "epoch": 0.54, + "grad_norm": 4.439618212059267, + "learning_rate": 4.6477769647369785e-06, + "loss": 0.8132, + "step": 6609 + }, + { + "epoch": 0.54, + "grad_norm": 4.351726988856648, + "learning_rate": 4.646464890199113e-06, + "loss": 0.5885, + "step": 6610 + }, + { + "epoch": 0.54, + "grad_norm": 4.49084677788596, + "learning_rate": 4.6451528401281e-06, + "loss": 0.7586, + "step": 6611 + }, + { + "epoch": 0.54, + "grad_norm": 3.1524710157006317, + "learning_rate": 4.6438408146147455e-06, + "loss": 0.6821, + "step": 6612 + }, + { + "epoch": 0.54, + "grad_norm": 2.4091197530398873, + "learning_rate": 4.6425288137498506e-06, + "loss": 0.7515, + "step": 6613 + }, + { + "epoch": 0.54, + "grad_norm": 4.529629396258881, + "learning_rate": 4.641216837624211e-06, + "loss": 0.7693, + "step": 6614 + }, + { + "epoch": 0.54, + "grad_norm": 3.833613464529931, + "learning_rate": 4.6399048863286255e-06, + "loss": 0.7955, + "step": 6615 + }, + { + "epoch": 0.54, + "grad_norm": 9.494314889185498, + "learning_rate": 4.638592959953889e-06, + "loss": 0.6749, + "step": 6616 + }, + { + "epoch": 0.54, + "grad_norm": 3.864285215898108, + "learning_rate": 4.637281058590798e-06, + "loss": 0.6933, + "step": 6617 + }, + { + "epoch": 0.54, + "grad_norm": 2.2967734422358217, + "learning_rate": 4.635969182330139e-06, + "loss": 0.7839, + "step": 6618 + }, + { + "epoch": 0.54, + "grad_norm": 2.497374157001241, + "learning_rate": 4.634657331262705e-06, + "loss": 0.7592, + "step": 6619 + }, + { + "epoch": 0.54, + "grad_norm": 2.8675188174744712, + "learning_rate": 4.633345505479285e-06, + "loss": 0.7856, + "step": 6620 + }, + { + "epoch": 0.54, + "grad_norm": 3.3661663242949436, + "learning_rate": 4.632033705070663e-06, + "loss": 0.703, + "step": 6621 + }, + { + "epoch": 0.54, + "grad_norm": 2.90247757707111, + "learning_rate": 4.630721930127626e-06, + "loss": 0.6747, + "step": 6622 + }, + { + "epoch": 0.54, + "grad_norm": 6.258881222264688, + "learning_rate": 4.62941018074096e-06, + "loss": 0.7375, + "step": 6623 + }, + { + "epoch": 0.54, + "grad_norm": 4.161113073789652, + "learning_rate": 4.6280984570014395e-06, + "loss": 0.6651, + "step": 6624 + }, + { + "epoch": 0.54, + "grad_norm": 5.107610590258278, + "learning_rate": 4.626786758999847e-06, + "loss": 0.6868, + "step": 6625 + }, + { + "epoch": 0.54, + "grad_norm": 2.704019732371965, + "learning_rate": 4.625475086826961e-06, + "loss": 0.6422, + "step": 6626 + }, + { + "epoch": 0.54, + "grad_norm": 6.295053210236223, + "learning_rate": 4.624163440573558e-06, + "loss": 0.7998, + "step": 6627 + }, + { + "epoch": 0.54, + "grad_norm": 3.035357126194412, + "learning_rate": 4.622851820330412e-06, + "loss": 0.784, + "step": 6628 + }, + { + "epoch": 0.54, + "grad_norm": 4.039459680924603, + "learning_rate": 4.6215402261882935e-06, + "loss": 0.9122, + "step": 6629 + }, + { + "epoch": 0.54, + "grad_norm": 3.879646133764714, + "learning_rate": 4.620228658237976e-06, + "loss": 0.6336, + "step": 6630 + }, + { + "epoch": 0.54, + "grad_norm": 6.725956182067229, + "learning_rate": 4.618917116570225e-06, + "loss": 0.5537, + "step": 6631 + }, + { + "epoch": 0.54, + "grad_norm": 2.7807220355259297, + "learning_rate": 4.61760560127581e-06, + "loss": 0.7119, + "step": 6632 + }, + { + "epoch": 0.54, + "grad_norm": 2.599743473528298, + "learning_rate": 4.616294112445494e-06, + "loss": 0.6429, + "step": 6633 + }, + { + "epoch": 0.54, + "grad_norm": 2.582296741904413, + "learning_rate": 4.614982650170041e-06, + "loss": 0.4829, + "step": 6634 + }, + { + "epoch": 0.54, + "grad_norm": 2.9936761225936768, + "learning_rate": 4.613671214540214e-06, + "loss": 0.6411, + "step": 6635 + }, + { + "epoch": 0.54, + "grad_norm": 4.506044775218329, + "learning_rate": 4.612359805646773e-06, + "loss": 0.7319, + "step": 6636 + }, + { + "epoch": 0.54, + "grad_norm": 10.230227916452899, + "learning_rate": 4.611048423580472e-06, + "loss": 0.7302, + "step": 6637 + }, + { + "epoch": 0.54, + "grad_norm": 2.085715594391407, + "learning_rate": 4.609737068432071e-06, + "loss": 0.7145, + "step": 6638 + }, + { + "epoch": 0.54, + "grad_norm": 3.491468739494525, + "learning_rate": 4.60842574029232e-06, + "loss": 0.6628, + "step": 6639 + }, + { + "epoch": 0.54, + "grad_norm": 3.0069275413778223, + "learning_rate": 4.607114439251974e-06, + "loss": 0.5617, + "step": 6640 + }, + { + "epoch": 0.54, + "grad_norm": 6.664064628507378, + "learning_rate": 4.605803165401782e-06, + "loss": 0.6376, + "step": 6641 + }, + { + "epoch": 0.54, + "grad_norm": 7.163062003940158, + "learning_rate": 4.604491918832494e-06, + "loss": 0.6292, + "step": 6642 + }, + { + "epoch": 0.54, + "grad_norm": 3.4439607803063033, + "learning_rate": 4.603180699634857e-06, + "loss": 0.6623, + "step": 6643 + }, + { + "epoch": 0.54, + "grad_norm": 9.281783096459746, + "learning_rate": 4.601869507899612e-06, + "loss": 0.7172, + "step": 6644 + }, + { + "epoch": 0.54, + "grad_norm": 3.379480756170299, + "learning_rate": 4.600558343717505e-06, + "loss": 0.5549, + "step": 6645 + }, + { + "epoch": 0.54, + "grad_norm": 3.8004595518943844, + "learning_rate": 4.599247207179275e-06, + "loss": 0.7758, + "step": 6646 + }, + { + "epoch": 0.54, + "grad_norm": 3.436069888074509, + "learning_rate": 4.597936098375662e-06, + "loss": 0.7092, + "step": 6647 + }, + { + "epoch": 0.54, + "grad_norm": 33.82166900941241, + "learning_rate": 4.596625017397401e-06, + "loss": 0.689, + "step": 6648 + }, + { + "epoch": 0.54, + "grad_norm": 4.6902323951782146, + "learning_rate": 4.59531396433523e-06, + "loss": 0.6742, + "step": 6649 + }, + { + "epoch": 0.54, + "grad_norm": 3.726946138169601, + "learning_rate": 4.594002939279883e-06, + "loss": 0.5965, + "step": 6650 + }, + { + "epoch": 0.54, + "grad_norm": 3.3912624940115204, + "learning_rate": 4.592691942322086e-06, + "loss": 0.6788, + "step": 6651 + }, + { + "epoch": 0.54, + "grad_norm": 4.054687979593414, + "learning_rate": 4.591380973552571e-06, + "loss": 0.6331, + "step": 6652 + }, + { + "epoch": 0.54, + "grad_norm": 2.7210662681001323, + "learning_rate": 4.5900700330620675e-06, + "loss": 0.7772, + "step": 6653 + }, + { + "epoch": 0.54, + "grad_norm": 7.478096001945345, + "learning_rate": 4.5887591209412975e-06, + "loss": 0.6631, + "step": 6654 + }, + { + "epoch": 0.54, + "grad_norm": 3.619262110704854, + "learning_rate": 4.587448237280986e-06, + "loss": 0.5888, + "step": 6655 + }, + { + "epoch": 0.54, + "grad_norm": 2.805589150847091, + "learning_rate": 4.586137382171856e-06, + "loss": 0.8029, + "step": 6656 + }, + { + "epoch": 0.54, + "grad_norm": 5.924584755934715, + "learning_rate": 4.5848265557046226e-06, + "loss": 0.682, + "step": 6657 + }, + { + "epoch": 0.54, + "grad_norm": 4.150679422148071, + "learning_rate": 4.583515757970007e-06, + "loss": 0.769, + "step": 6658 + }, + { + "epoch": 0.54, + "grad_norm": 5.530903077880697, + "learning_rate": 4.5822049890587215e-06, + "loss": 0.6987, + "step": 6659 + }, + { + "epoch": 0.54, + "grad_norm": 7.810166958065769, + "learning_rate": 4.580894249061483e-06, + "loss": 0.6734, + "step": 6660 + }, + { + "epoch": 0.54, + "grad_norm": 3.5024212942073794, + "learning_rate": 4.5795835380690005e-06, + "loss": 0.6626, + "step": 6661 + }, + { + "epoch": 0.54, + "grad_norm": 28.989693084695062, + "learning_rate": 4.578272856171985e-06, + "loss": 0.7165, + "step": 6662 + }, + { + "epoch": 0.54, + "grad_norm": 3.5035931014937227, + "learning_rate": 4.576962203461144e-06, + "loss": 0.6568, + "step": 6663 + }, + { + "epoch": 0.54, + "grad_norm": 2.448710282148407, + "learning_rate": 4.5756515800271815e-06, + "loss": 0.5197, + "step": 6664 + }, + { + "epoch": 0.54, + "grad_norm": 3.5224921420705817, + "learning_rate": 4.574340985960801e-06, + "loss": 0.7752, + "step": 6665 + }, + { + "epoch": 0.54, + "grad_norm": 3.737454047681081, + "learning_rate": 4.573030421352704e-06, + "loss": 0.5826, + "step": 6666 + }, + { + "epoch": 0.54, + "grad_norm": 4.463797251287656, + "learning_rate": 4.571719886293591e-06, + "loss": 0.7684, + "step": 6667 + }, + { + "epoch": 0.54, + "grad_norm": 3.2941111898698856, + "learning_rate": 4.570409380874159e-06, + "loss": 0.6043, + "step": 6668 + }, + { + "epoch": 0.54, + "grad_norm": 3.631297192076074, + "learning_rate": 4.569098905185102e-06, + "loss": 0.789, + "step": 6669 + }, + { + "epoch": 0.54, + "grad_norm": 4.1851295769599695, + "learning_rate": 4.567788459317116e-06, + "loss": 0.5236, + "step": 6670 + }, + { + "epoch": 0.54, + "grad_norm": 3.503351610199908, + "learning_rate": 4.566478043360888e-06, + "loss": 0.6525, + "step": 6671 + }, + { + "epoch": 0.54, + "grad_norm": 5.5959023616599, + "learning_rate": 4.565167657407109e-06, + "loss": 0.6061, + "step": 6672 + }, + { + "epoch": 0.54, + "grad_norm": 3.2135405578783964, + "learning_rate": 4.563857301546466e-06, + "loss": 0.7545, + "step": 6673 + }, + { + "epoch": 0.54, + "grad_norm": 6.503361611501933, + "learning_rate": 4.562546975869644e-06, + "loss": 0.7783, + "step": 6674 + }, + { + "epoch": 0.54, + "grad_norm": 14.932049108871887, + "learning_rate": 4.561236680467326e-06, + "loss": 0.5992, + "step": 6675 + }, + { + "epoch": 0.54, + "grad_norm": 5.90712438256307, + "learning_rate": 4.559926415430194e-06, + "loss": 0.7046, + "step": 6676 + }, + { + "epoch": 0.54, + "grad_norm": 2.9571032593291657, + "learning_rate": 4.558616180848922e-06, + "loss": 0.6089, + "step": 6677 + }, + { + "epoch": 0.54, + "grad_norm": 2.597780598804849, + "learning_rate": 4.557305976814193e-06, + "loss": 0.6403, + "step": 6678 + }, + { + "epoch": 0.54, + "grad_norm": 5.566021419323083, + "learning_rate": 4.555995803416674e-06, + "loss": 0.7609, + "step": 6679 + }, + { + "epoch": 0.54, + "grad_norm": 3.900452754930337, + "learning_rate": 4.554685660747043e-06, + "loss": 0.7704, + "step": 6680 + }, + { + "epoch": 0.54, + "grad_norm": 14.694609341600737, + "learning_rate": 4.553375548895968e-06, + "loss": 0.7091, + "step": 6681 + }, + { + "epoch": 0.54, + "grad_norm": 4.514175327950168, + "learning_rate": 4.552065467954117e-06, + "loss": 0.6286, + "step": 6682 + }, + { + "epoch": 0.54, + "grad_norm": 6.7117302134141426, + "learning_rate": 4.550755418012158e-06, + "loss": 0.7501, + "step": 6683 + }, + { + "epoch": 0.54, + "grad_norm": 2.762116017493449, + "learning_rate": 4.54944539916075e-06, + "loss": 0.7016, + "step": 6684 + }, + { + "epoch": 0.54, + "grad_norm": 6.605277350544211, + "learning_rate": 4.5481354114905595e-06, + "loss": 0.7463, + "step": 6685 + }, + { + "epoch": 0.54, + "grad_norm": 3.3179601187548253, + "learning_rate": 4.546825455092242e-06, + "loss": 0.8276, + "step": 6686 + }, + { + "epoch": 0.54, + "grad_norm": 5.7777731531464225, + "learning_rate": 4.545515530056457e-06, + "loss": 0.8988, + "step": 6687 + }, + { + "epoch": 0.54, + "grad_norm": 3.219604761494268, + "learning_rate": 4.544205636473858e-06, + "loss": 0.6158, + "step": 6688 + }, + { + "epoch": 0.54, + "grad_norm": 4.997110199112129, + "learning_rate": 4.542895774435102e-06, + "loss": 0.6994, + "step": 6689 + }, + { + "epoch": 0.54, + "grad_norm": 2.5798419577911353, + "learning_rate": 4.541585944030833e-06, + "loss": 0.7564, + "step": 6690 + }, + { + "epoch": 0.54, + "grad_norm": 3.4405341450590785, + "learning_rate": 4.540276145351705e-06, + "loss": 0.7078, + "step": 6691 + }, + { + "epoch": 0.54, + "grad_norm": 2.3449428148115534, + "learning_rate": 4.538966378488362e-06, + "loss": 0.6487, + "step": 6692 + }, + { + "epoch": 0.54, + "grad_norm": 5.941951161476081, + "learning_rate": 4.537656643531448e-06, + "loss": 0.6811, + "step": 6693 + }, + { + "epoch": 0.54, + "grad_norm": 11.89940356260767, + "learning_rate": 4.536346940571606e-06, + "loss": 0.7666, + "step": 6694 + }, + { + "epoch": 0.54, + "grad_norm": 6.595110348685419, + "learning_rate": 4.535037269699474e-06, + "loss": 0.8103, + "step": 6695 + }, + { + "epoch": 0.54, + "grad_norm": 4.1909835862626545, + "learning_rate": 4.533727631005694e-06, + "loss": 0.5907, + "step": 6696 + }, + { + "epoch": 0.54, + "grad_norm": 3.218433680474106, + "learning_rate": 4.5324180245808945e-06, + "loss": 0.7066, + "step": 6697 + }, + { + "epoch": 0.54, + "grad_norm": 3.8656068642155383, + "learning_rate": 4.531108450515712e-06, + "loss": 0.8201, + "step": 6698 + }, + { + "epoch": 0.54, + "grad_norm": 5.630828204516917, + "learning_rate": 4.529798908900777e-06, + "loss": 0.6089, + "step": 6699 + }, + { + "epoch": 0.54, + "grad_norm": 3.505191271248207, + "learning_rate": 4.52848939982672e-06, + "loss": 0.6896, + "step": 6700 + }, + { + "epoch": 0.54, + "grad_norm": 2.6856599770051135, + "learning_rate": 4.527179923384165e-06, + "loss": 0.5842, + "step": 6701 + }, + { + "epoch": 0.54, + "grad_norm": 2.3588634222132203, + "learning_rate": 4.5258704796637345e-06, + "loss": 0.8579, + "step": 6702 + }, + { + "epoch": 0.54, + "grad_norm": 3.4218484615413804, + "learning_rate": 4.524561068756055e-06, + "loss": 0.7978, + "step": 6703 + }, + { + "epoch": 0.54, + "grad_norm": 2.809657816411793, + "learning_rate": 4.523251690751741e-06, + "loss": 0.6258, + "step": 6704 + }, + { + "epoch": 0.54, + "grad_norm": 4.261272381694099, + "learning_rate": 4.521942345741413e-06, + "loss": 0.7515, + "step": 6705 + }, + { + "epoch": 0.54, + "grad_norm": 30.014843001509995, + "learning_rate": 4.520633033815684e-06, + "loss": 0.7123, + "step": 6706 + }, + { + "epoch": 0.54, + "grad_norm": 6.790969504459516, + "learning_rate": 4.519323755065167e-06, + "loss": 0.6856, + "step": 6707 + }, + { + "epoch": 0.54, + "grad_norm": 2.878378842195477, + "learning_rate": 4.518014509580474e-06, + "loss": 0.6668, + "step": 6708 + }, + { + "epoch": 0.54, + "grad_norm": 3.010808332578446, + "learning_rate": 4.516705297452212e-06, + "loss": 0.792, + "step": 6709 + }, + { + "epoch": 0.54, + "grad_norm": 2.477459799882348, + "learning_rate": 4.515396118770986e-06, + "loss": 0.6284, + "step": 6710 + }, + { + "epoch": 0.55, + "grad_norm": 2.198079295130724, + "learning_rate": 4.514086973627399e-06, + "loss": 0.6708, + "step": 6711 + }, + { + "epoch": 0.55, + "grad_norm": 2.3893130794546984, + "learning_rate": 4.512777862112053e-06, + "loss": 0.7647, + "step": 6712 + }, + { + "epoch": 0.55, + "grad_norm": 3.1444475335326607, + "learning_rate": 4.511468784315547e-06, + "loss": 0.6161, + "step": 6713 + }, + { + "epoch": 0.55, + "grad_norm": 3.0208766390944493, + "learning_rate": 4.5101597403284765e-06, + "loss": 0.5301, + "step": 6714 + }, + { + "epoch": 0.55, + "grad_norm": 3.918797861826325, + "learning_rate": 4.508850730241437e-06, + "loss": 0.4313, + "step": 6715 + }, + { + "epoch": 0.55, + "grad_norm": 2.3654515573406623, + "learning_rate": 4.5075417541450215e-06, + "loss": 0.7215, + "step": 6716 + }, + { + "epoch": 0.55, + "grad_norm": 3.2801522030676176, + "learning_rate": 4.506232812129816e-06, + "loss": 0.7729, + "step": 6717 + }, + { + "epoch": 0.55, + "grad_norm": 3.12112248829566, + "learning_rate": 4.504923904286409e-06, + "loss": 0.7279, + "step": 6718 + }, + { + "epoch": 0.55, + "grad_norm": 7.8624182972335905, + "learning_rate": 4.503615030705384e-06, + "loss": 0.6375, + "step": 6719 + }, + { + "epoch": 0.55, + "grad_norm": 3.996929954014196, + "learning_rate": 4.5023061914773244e-06, + "loss": 0.6773, + "step": 6720 + }, + { + "epoch": 0.55, + "grad_norm": 2.376166508516253, + "learning_rate": 4.5009973866928105e-06, + "loss": 0.6622, + "step": 6721 + }, + { + "epoch": 0.55, + "grad_norm": 3.371857953089268, + "learning_rate": 4.499688616442419e-06, + "loss": 0.7857, + "step": 6722 + }, + { + "epoch": 0.55, + "grad_norm": 3.721452199819446, + "learning_rate": 4.498379880816728e-06, + "loss": 0.677, + "step": 6723 + }, + { + "epoch": 0.55, + "grad_norm": 3.669239784434637, + "learning_rate": 4.497071179906305e-06, + "loss": 0.7255, + "step": 6724 + }, + { + "epoch": 0.55, + "grad_norm": 8.878511392781272, + "learning_rate": 4.495762513801724e-06, + "loss": 0.8223, + "step": 6725 + }, + { + "epoch": 0.55, + "grad_norm": 2.7643459251663702, + "learning_rate": 4.494453882593552e-06, + "loss": 0.762, + "step": 6726 + }, + { + "epoch": 0.55, + "grad_norm": 3.546761834411261, + "learning_rate": 4.4931452863723535e-06, + "loss": 0.6948, + "step": 6727 + }, + { + "epoch": 0.55, + "grad_norm": 2.219742152088609, + "learning_rate": 4.491836725228693e-06, + "loss": 0.6289, + "step": 6728 + }, + { + "epoch": 0.55, + "grad_norm": 3.222239452993486, + "learning_rate": 4.490528199253133e-06, + "loss": 0.678, + "step": 6729 + }, + { + "epoch": 0.55, + "grad_norm": 4.039863666240687, + "learning_rate": 4.489219708536228e-06, + "loss": 0.7015, + "step": 6730 + }, + { + "epoch": 0.55, + "grad_norm": 4.002033534843258, + "learning_rate": 4.487911253168534e-06, + "loss": 0.7997, + "step": 6731 + }, + { + "epoch": 0.55, + "grad_norm": 2.7610655263986428, + "learning_rate": 4.4866028332406064e-06, + "loss": 0.7937, + "step": 6732 + }, + { + "epoch": 0.55, + "grad_norm": 3.9935545920730995, + "learning_rate": 4.485294448842996e-06, + "loss": 0.6704, + "step": 6733 + }, + { + "epoch": 0.55, + "grad_norm": 8.24533419406517, + "learning_rate": 4.4839861000662496e-06, + "loss": 0.8214, + "step": 6734 + }, + { + "epoch": 0.55, + "grad_norm": 4.667802527243373, + "learning_rate": 4.482677787000915e-06, + "loss": 0.6558, + "step": 6735 + }, + { + "epoch": 0.55, + "grad_norm": 2.916224639300997, + "learning_rate": 4.4813695097375355e-06, + "loss": 0.6094, + "step": 6736 + }, + { + "epoch": 0.55, + "grad_norm": 4.215978092389523, + "learning_rate": 4.48006126836665e-06, + "loss": 0.7036, + "step": 6737 + }, + { + "epoch": 0.55, + "grad_norm": 2.726765288253642, + "learning_rate": 4.4787530629787995e-06, + "loss": 0.6991, + "step": 6738 + }, + { + "epoch": 0.55, + "grad_norm": 4.465314361569634, + "learning_rate": 4.477444893664518e-06, + "loss": 0.6463, + "step": 6739 + }, + { + "epoch": 0.55, + "grad_norm": 5.090999309979331, + "learning_rate": 4.476136760514341e-06, + "loss": 0.7844, + "step": 6740 + }, + { + "epoch": 0.55, + "grad_norm": 4.318269304136731, + "learning_rate": 4.4748286636187985e-06, + "loss": 0.6391, + "step": 6741 + }, + { + "epoch": 0.55, + "grad_norm": 2.818514401302119, + "learning_rate": 4.473520603068421e-06, + "loss": 0.6322, + "step": 6742 + }, + { + "epoch": 0.55, + "grad_norm": 3.4071907620949085, + "learning_rate": 4.472212578953731e-06, + "loss": 0.6967, + "step": 6743 + }, + { + "epoch": 0.55, + "grad_norm": 2.654691208151812, + "learning_rate": 4.470904591365253e-06, + "loss": 0.6374, + "step": 6744 + }, + { + "epoch": 0.55, + "grad_norm": 2.218558949338727, + "learning_rate": 4.4695966403935095e-06, + "loss": 0.6717, + "step": 6745 + }, + { + "epoch": 0.55, + "grad_norm": 2.8815332717207465, + "learning_rate": 4.468288726129018e-06, + "loss": 0.7074, + "step": 6746 + }, + { + "epoch": 0.55, + "grad_norm": 2.6668805621423624, + "learning_rate": 4.466980848662295e-06, + "loss": 0.8332, + "step": 6747 + }, + { + "epoch": 0.55, + "grad_norm": 2.63204004296017, + "learning_rate": 4.4656730080838535e-06, + "loss": 0.6099, + "step": 6748 + }, + { + "epoch": 0.55, + "grad_norm": 2.4748550745061726, + "learning_rate": 4.464365204484204e-06, + "loss": 0.6179, + "step": 6749 + }, + { + "epoch": 0.55, + "grad_norm": 4.236652586850476, + "learning_rate": 4.463057437953855e-06, + "loss": 0.7659, + "step": 6750 + }, + { + "epoch": 0.55, + "grad_norm": 5.246496087227289, + "learning_rate": 4.461749708583313e-06, + "loss": 0.597, + "step": 6751 + }, + { + "epoch": 0.55, + "grad_norm": 3.1869054571016733, + "learning_rate": 4.460442016463079e-06, + "loss": 0.8225, + "step": 6752 + }, + { + "epoch": 0.55, + "grad_norm": 2.359610854596599, + "learning_rate": 4.4591343616836545e-06, + "loss": 0.4747, + "step": 6753 + }, + { + "epoch": 0.55, + "grad_norm": 3.2641550269475337, + "learning_rate": 4.457826744335538e-06, + "loss": 0.5478, + "step": 6754 + }, + { + "epoch": 0.55, + "grad_norm": 13.751901471810733, + "learning_rate": 4.4565191645092244e-06, + "loss": 0.7826, + "step": 6755 + }, + { + "epoch": 0.55, + "grad_norm": 4.139938756693409, + "learning_rate": 4.45521162229521e-06, + "loss": 0.5977, + "step": 6756 + }, + { + "epoch": 0.55, + "grad_norm": 4.2495862922110375, + "learning_rate": 4.453904117783978e-06, + "loss": 0.6785, + "step": 6757 + }, + { + "epoch": 0.55, + "grad_norm": 3.0688870582805134, + "learning_rate": 4.452596651066021e-06, + "loss": 0.6836, + "step": 6758 + }, + { + "epoch": 0.55, + "grad_norm": 3.222344557145634, + "learning_rate": 4.451289222231821e-06, + "loss": 0.7173, + "step": 6759 + }, + { + "epoch": 0.55, + "grad_norm": 2.933298701496621, + "learning_rate": 4.449981831371863e-06, + "loss": 0.5088, + "step": 6760 + }, + { + "epoch": 0.55, + "grad_norm": 3.6852505474558965, + "learning_rate": 4.448674478576625e-06, + "loss": 0.9275, + "step": 6761 + }, + { + "epoch": 0.55, + "grad_norm": 2.6298660844132016, + "learning_rate": 4.447367163936586e-06, + "loss": 0.5508, + "step": 6762 + }, + { + "epoch": 0.55, + "grad_norm": 5.471763681682436, + "learning_rate": 4.4460598875422175e-06, + "loss": 0.5929, + "step": 6763 + }, + { + "epoch": 0.55, + "grad_norm": 4.119746550532442, + "learning_rate": 4.444752649483993e-06, + "loss": 0.6637, + "step": 6764 + }, + { + "epoch": 0.55, + "grad_norm": 6.554778771125893, + "learning_rate": 4.44344544985238e-06, + "loss": 0.7469, + "step": 6765 + }, + { + "epoch": 0.55, + "grad_norm": 3.5784563875203768, + "learning_rate": 4.442138288737848e-06, + "loss": 0.7068, + "step": 6766 + }, + { + "epoch": 0.55, + "grad_norm": 4.446059255533405, + "learning_rate": 4.440831166230858e-06, + "loss": 0.7519, + "step": 6767 + }, + { + "epoch": 0.55, + "grad_norm": 2.3413452690024545, + "learning_rate": 4.439524082421872e-06, + "loss": 0.5098, + "step": 6768 + }, + { + "epoch": 0.55, + "grad_norm": 4.268941714671915, + "learning_rate": 4.438217037401351e-06, + "loss": 0.6509, + "step": 6769 + }, + { + "epoch": 0.55, + "grad_norm": 3.2972197728439703, + "learning_rate": 4.4369100312597455e-06, + "loss": 0.6241, + "step": 6770 + }, + { + "epoch": 0.55, + "grad_norm": 2.823880734997757, + "learning_rate": 4.435603064087512e-06, + "loss": 0.6973, + "step": 6771 + }, + { + "epoch": 0.55, + "grad_norm": 2.4285030669083536, + "learning_rate": 4.434296135975099e-06, + "loss": 0.6431, + "step": 6772 + }, + { + "epoch": 0.55, + "grad_norm": 2.344138302798512, + "learning_rate": 4.432989247012958e-06, + "loss": 0.7165, + "step": 6773 + }, + { + "epoch": 0.55, + "grad_norm": 3.318739936395028, + "learning_rate": 4.43168239729153e-06, + "loss": 0.72, + "step": 6774 + }, + { + "epoch": 0.55, + "grad_norm": 14.229288111268472, + "learning_rate": 4.430375586901258e-06, + "loss": 0.6236, + "step": 6775 + }, + { + "epoch": 0.55, + "grad_norm": 8.376423903309352, + "learning_rate": 4.429068815932585e-06, + "loss": 0.8278, + "step": 6776 + }, + { + "epoch": 0.55, + "grad_norm": 3.231230230888898, + "learning_rate": 4.427762084475941e-06, + "loss": 0.7404, + "step": 6777 + }, + { + "epoch": 0.55, + "grad_norm": 6.200312764738706, + "learning_rate": 4.426455392621765e-06, + "loss": 0.7774, + "step": 6778 + }, + { + "epoch": 0.55, + "grad_norm": 3.287926989827072, + "learning_rate": 4.425148740460487e-06, + "loss": 0.5863, + "step": 6779 + }, + { + "epoch": 0.55, + "grad_norm": 2.486685751121668, + "learning_rate": 4.423842128082535e-06, + "loss": 0.6078, + "step": 6780 + }, + { + "epoch": 0.55, + "grad_norm": 2.6591907274918536, + "learning_rate": 4.422535555578338e-06, + "loss": 0.7295, + "step": 6781 + }, + { + "epoch": 0.55, + "grad_norm": 4.429070825054204, + "learning_rate": 4.421229023038316e-06, + "loss": 0.8475, + "step": 6782 + }, + { + "epoch": 0.55, + "grad_norm": 4.807842088398467, + "learning_rate": 4.41992253055289e-06, + "loss": 0.6807, + "step": 6783 + }, + { + "epoch": 0.55, + "grad_norm": 4.305960130883989, + "learning_rate": 4.418616078212475e-06, + "loss": 0.6459, + "step": 6784 + }, + { + "epoch": 0.55, + "grad_norm": 2.8671408574431805, + "learning_rate": 4.4173096661074895e-06, + "loss": 0.5426, + "step": 6785 + }, + { + "epoch": 0.55, + "grad_norm": 4.655487629784463, + "learning_rate": 4.416003294328344e-06, + "loss": 0.6308, + "step": 6786 + }, + { + "epoch": 0.55, + "grad_norm": 3.1802288012490054, + "learning_rate": 4.414696962965447e-06, + "loss": 0.6717, + "step": 6787 + }, + { + "epoch": 0.55, + "grad_norm": 3.856534003553808, + "learning_rate": 4.413390672109207e-06, + "loss": 0.7129, + "step": 6788 + }, + { + "epoch": 0.55, + "grad_norm": 3.996665243099306, + "learning_rate": 4.412084421850026e-06, + "loss": 0.6742, + "step": 6789 + }, + { + "epoch": 0.55, + "grad_norm": 3.2734912986533846, + "learning_rate": 4.410778212278304e-06, + "loss": 0.7696, + "step": 6790 + }, + { + "epoch": 0.55, + "grad_norm": 5.430574509110885, + "learning_rate": 4.40947204348444e-06, + "loss": 0.7004, + "step": 6791 + }, + { + "epoch": 0.55, + "grad_norm": 4.944299302840354, + "learning_rate": 4.408165915558829e-06, + "loss": 0.565, + "step": 6792 + }, + { + "epoch": 0.55, + "grad_norm": 3.962879396044792, + "learning_rate": 4.406859828591862e-06, + "loss": 0.5595, + "step": 6793 + }, + { + "epoch": 0.55, + "grad_norm": 2.3547902145815116, + "learning_rate": 4.40555378267393e-06, + "loss": 0.7508, + "step": 6794 + }, + { + "epoch": 0.55, + "grad_norm": 2.284092742876538, + "learning_rate": 4.4042477778954215e-06, + "loss": 0.6907, + "step": 6795 + }, + { + "epoch": 0.55, + "grad_norm": 3.0642794767442205, + "learning_rate": 4.402941814346716e-06, + "loss": 0.6299, + "step": 6796 + }, + { + "epoch": 0.55, + "grad_norm": 3.6977575658930975, + "learning_rate": 4.401635892118196e-06, + "loss": 0.6803, + "step": 6797 + }, + { + "epoch": 0.55, + "grad_norm": 3.6853232169300756, + "learning_rate": 4.400330011300242e-06, + "loss": 0.623, + "step": 6798 + }, + { + "epoch": 0.55, + "grad_norm": 3.3064321738398985, + "learning_rate": 4.399024171983224e-06, + "loss": 0.7509, + "step": 6799 + }, + { + "epoch": 0.55, + "grad_norm": 2.5652442919321037, + "learning_rate": 4.3977183742575186e-06, + "loss": 0.7283, + "step": 6800 + }, + { + "epoch": 0.55, + "grad_norm": 2.903623141070364, + "learning_rate": 4.396412618213494e-06, + "loss": 0.6517, + "step": 6801 + }, + { + "epoch": 0.55, + "grad_norm": 4.2175841326536965, + "learning_rate": 4.3951069039415184e-06, + "loss": 0.7427, + "step": 6802 + }, + { + "epoch": 0.55, + "grad_norm": 3.9506541951063623, + "learning_rate": 4.393801231531952e-06, + "loss": 0.6943, + "step": 6803 + }, + { + "epoch": 0.55, + "grad_norm": 3.842806540883433, + "learning_rate": 4.392495601075157e-06, + "loss": 0.6348, + "step": 6804 + }, + { + "epoch": 0.55, + "grad_norm": 3.7986950893324125, + "learning_rate": 4.391190012661491e-06, + "loss": 0.6203, + "step": 6805 + }, + { + "epoch": 0.55, + "grad_norm": 4.168668997823342, + "learning_rate": 4.389884466381312e-06, + "loss": 0.7809, + "step": 6806 + }, + { + "epoch": 0.55, + "grad_norm": 3.4433229113020487, + "learning_rate": 4.388578962324967e-06, + "loss": 0.7327, + "step": 6807 + }, + { + "epoch": 0.55, + "grad_norm": 2.7667754655885135, + "learning_rate": 4.387273500582809e-06, + "loss": 0.5899, + "step": 6808 + }, + { + "epoch": 0.55, + "grad_norm": 3.7644107720700184, + "learning_rate": 4.3859680812451844e-06, + "loss": 0.6426, + "step": 6809 + }, + { + "epoch": 0.55, + "grad_norm": 2.9805139009576105, + "learning_rate": 4.384662704402433e-06, + "loss": 0.7077, + "step": 6810 + }, + { + "epoch": 0.55, + "grad_norm": 3.5754318770227105, + "learning_rate": 4.383357370144896e-06, + "loss": 0.5922, + "step": 6811 + }, + { + "epoch": 0.55, + "grad_norm": 3.963388652937568, + "learning_rate": 4.382052078562913e-06, + "loss": 0.6327, + "step": 6812 + }, + { + "epoch": 0.55, + "grad_norm": 2.3978615046933807, + "learning_rate": 4.380746829746817e-06, + "loss": 0.6471, + "step": 6813 + }, + { + "epoch": 0.55, + "grad_norm": 3.1407812042839693, + "learning_rate": 4.379441623786938e-06, + "loss": 0.8238, + "step": 6814 + }, + { + "epoch": 0.55, + "grad_norm": 5.0606686031518935, + "learning_rate": 4.378136460773609e-06, + "loss": 0.6591, + "step": 6815 + }, + { + "epoch": 0.55, + "grad_norm": 2.8574753075181225, + "learning_rate": 4.376831340797151e-06, + "loss": 0.8108, + "step": 6816 + }, + { + "epoch": 0.55, + "grad_norm": 3.342133698190901, + "learning_rate": 4.375526263947887e-06, + "loss": 0.5185, + "step": 6817 + }, + { + "epoch": 0.55, + "grad_norm": 24.21977990190991, + "learning_rate": 4.374221230316138e-06, + "loss": 0.5867, + "step": 6818 + }, + { + "epoch": 0.55, + "grad_norm": 3.097243863937538, + "learning_rate": 4.37291623999222e-06, + "loss": 0.7158, + "step": 6819 + }, + { + "epoch": 0.55, + "grad_norm": 2.3121114690182742, + "learning_rate": 4.371611293066446e-06, + "loss": 0.6579, + "step": 6820 + }, + { + "epoch": 0.55, + "grad_norm": 3.283416195224011, + "learning_rate": 4.37030638962913e-06, + "loss": 0.6807, + "step": 6821 + }, + { + "epoch": 0.55, + "grad_norm": 2.874680888419324, + "learning_rate": 4.3690015297705755e-06, + "loss": 0.7401, + "step": 6822 + }, + { + "epoch": 0.55, + "grad_norm": 3.4228578878989118, + "learning_rate": 4.367696713581088e-06, + "loss": 0.5715, + "step": 6823 + }, + { + "epoch": 0.55, + "grad_norm": 9.094807985216361, + "learning_rate": 4.366391941150969e-06, + "loss": 0.8104, + "step": 6824 + }, + { + "epoch": 0.55, + "grad_norm": 4.815336355126061, + "learning_rate": 4.365087212570516e-06, + "loss": 0.7584, + "step": 6825 + }, + { + "epoch": 0.55, + "grad_norm": 4.097249894248789, + "learning_rate": 4.363782527930026e-06, + "loss": 0.6317, + "step": 6826 + }, + { + "epoch": 0.55, + "grad_norm": 3.2542578602686554, + "learning_rate": 4.362477887319792e-06, + "loss": 0.7114, + "step": 6827 + }, + { + "epoch": 0.55, + "grad_norm": 4.576346748074083, + "learning_rate": 4.361173290830102e-06, + "loss": 0.7068, + "step": 6828 + }, + { + "epoch": 0.55, + "grad_norm": 4.4001467366703455, + "learning_rate": 4.359868738551244e-06, + "loss": 0.5701, + "step": 6829 + }, + { + "epoch": 0.55, + "grad_norm": 2.888778309484578, + "learning_rate": 4.358564230573498e-06, + "loss": 0.6928, + "step": 6830 + }, + { + "epoch": 0.55, + "grad_norm": 3.3897288448091265, + "learning_rate": 4.357259766987147e-06, + "loss": 0.612, + "step": 6831 + }, + { + "epoch": 0.55, + "grad_norm": 5.3837512393981095, + "learning_rate": 4.355955347882467e-06, + "loss": 0.7801, + "step": 6832 + }, + { + "epoch": 0.55, + "grad_norm": 3.660006228650256, + "learning_rate": 4.354650973349732e-06, + "loss": 0.7005, + "step": 6833 + }, + { + "epoch": 0.56, + "grad_norm": 4.5326247096127705, + "learning_rate": 4.3533466434792125e-06, + "loss": 0.7045, + "step": 6834 + }, + { + "epoch": 0.56, + "grad_norm": 2.504208698339557, + "learning_rate": 4.35204235836118e-06, + "loss": 0.6674, + "step": 6835 + }, + { + "epoch": 0.56, + "grad_norm": 3.2000150725821825, + "learning_rate": 4.350738118085893e-06, + "loss": 0.575, + "step": 6836 + }, + { + "epoch": 0.56, + "grad_norm": 7.019690815326524, + "learning_rate": 4.349433922743616e-06, + "loss": 0.603, + "step": 6837 + }, + { + "epoch": 0.56, + "grad_norm": 2.855861328453311, + "learning_rate": 4.34812977242461e-06, + "loss": 0.6177, + "step": 6838 + }, + { + "epoch": 0.56, + "grad_norm": 4.683856007469121, + "learning_rate": 4.346825667219127e-06, + "loss": 0.6589, + "step": 6839 + }, + { + "epoch": 0.56, + "grad_norm": 3.129192450229475, + "learning_rate": 4.34552160721742e-06, + "loss": 0.7202, + "step": 6840 + }, + { + "epoch": 0.56, + "grad_norm": 5.474050679960641, + "learning_rate": 4.3442175925097395e-06, + "loss": 0.7405, + "step": 6841 + }, + { + "epoch": 0.56, + "grad_norm": 3.331377930224471, + "learning_rate": 4.342913623186332e-06, + "loss": 0.7208, + "step": 6842 + }, + { + "epoch": 0.56, + "grad_norm": 21.168127492269672, + "learning_rate": 4.341609699337438e-06, + "loss": 0.7653, + "step": 6843 + }, + { + "epoch": 0.56, + "grad_norm": 2.9519802143215133, + "learning_rate": 4.3403058210532975e-06, + "loss": 0.6766, + "step": 6844 + }, + { + "epoch": 0.56, + "grad_norm": 4.283554167974886, + "learning_rate": 4.339001988424148e-06, + "loss": 0.5934, + "step": 6845 + }, + { + "epoch": 0.56, + "grad_norm": 2.6798154042577123, + "learning_rate": 4.337698201540225e-06, + "loss": 0.6729, + "step": 6846 + }, + { + "epoch": 0.56, + "grad_norm": 7.507049437169468, + "learning_rate": 4.336394460491754e-06, + "loss": 0.7491, + "step": 6847 + }, + { + "epoch": 0.56, + "grad_norm": 2.7205816971242416, + "learning_rate": 4.335090765368968e-06, + "loss": 0.7147, + "step": 6848 + }, + { + "epoch": 0.56, + "grad_norm": 5.276156317210714, + "learning_rate": 4.333787116262085e-06, + "loss": 0.822, + "step": 6849 + }, + { + "epoch": 0.56, + "grad_norm": 3.792361014771035, + "learning_rate": 4.3324835132613285e-06, + "loss": 0.7172, + "step": 6850 + }, + { + "epoch": 0.56, + "grad_norm": 3.058327451345355, + "learning_rate": 4.3311799564569165e-06, + "loss": 0.7133, + "step": 6851 + }, + { + "epoch": 0.56, + "grad_norm": 4.642867198276733, + "learning_rate": 4.329876445939062e-06, + "loss": 0.5627, + "step": 6852 + }, + { + "epoch": 0.56, + "grad_norm": 3.383166451271083, + "learning_rate": 4.3285729817979775e-06, + "loss": 0.7741, + "step": 6853 + }, + { + "epoch": 0.56, + "grad_norm": 2.7400101143718167, + "learning_rate": 4.32726956412387e-06, + "loss": 0.7178, + "step": 6854 + }, + { + "epoch": 0.56, + "grad_norm": 2.4756911441636227, + "learning_rate": 4.325966193006946e-06, + "loss": 0.7413, + "step": 6855 + }, + { + "epoch": 0.56, + "grad_norm": 3.194191717608314, + "learning_rate": 4.324662868537405e-06, + "loss": 0.6077, + "step": 6856 + }, + { + "epoch": 0.56, + "grad_norm": 3.6015080786733877, + "learning_rate": 4.323359590805445e-06, + "loss": 0.6798, + "step": 6857 + }, + { + "epoch": 0.56, + "grad_norm": 2.439863206470225, + "learning_rate": 4.322056359901262e-06, + "loss": 0.7068, + "step": 6858 + }, + { + "epoch": 0.56, + "grad_norm": 2.5042078080037626, + "learning_rate": 4.320753175915047e-06, + "loss": 0.6189, + "step": 6859 + }, + { + "epoch": 0.56, + "grad_norm": 2.9228169993683997, + "learning_rate": 4.319450038936989e-06, + "loss": 0.6765, + "step": 6860 + }, + { + "epoch": 0.56, + "grad_norm": 2.810683684170626, + "learning_rate": 4.318146949057275e-06, + "loss": 0.7373, + "step": 6861 + }, + { + "epoch": 0.56, + "grad_norm": 3.5029878062490876, + "learning_rate": 4.316843906366085e-06, + "loss": 0.678, + "step": 6862 + }, + { + "epoch": 0.56, + "grad_norm": 3.2255358236980713, + "learning_rate": 4.315540910953598e-06, + "loss": 0.747, + "step": 6863 + }, + { + "epoch": 0.56, + "grad_norm": 4.5528421390757785, + "learning_rate": 4.314237962909989e-06, + "loss": 0.6096, + "step": 6864 + }, + { + "epoch": 0.56, + "grad_norm": 3.8601774619259297, + "learning_rate": 4.312935062325431e-06, + "loss": 0.7473, + "step": 6865 + }, + { + "epoch": 0.56, + "grad_norm": 3.4966415893751863, + "learning_rate": 4.3116322092900925e-06, + "loss": 0.6065, + "step": 6866 + }, + { + "epoch": 0.56, + "grad_norm": 11.24879479757515, + "learning_rate": 4.31032940389414e-06, + "loss": 0.5538, + "step": 6867 + }, + { + "epoch": 0.56, + "grad_norm": 3.658742767608833, + "learning_rate": 4.309026646227737e-06, + "loss": 0.5204, + "step": 6868 + }, + { + "epoch": 0.56, + "grad_norm": 2.692263620159278, + "learning_rate": 4.307723936381038e-06, + "loss": 0.6503, + "step": 6869 + }, + { + "epoch": 0.56, + "grad_norm": 3.706723566596683, + "learning_rate": 4.3064212744442026e-06, + "loss": 0.5831, + "step": 6870 + }, + { + "epoch": 0.56, + "grad_norm": 2.402681804292963, + "learning_rate": 4.305118660507382e-06, + "loss": 0.5789, + "step": 6871 + }, + { + "epoch": 0.56, + "grad_norm": 3.2699878469512558, + "learning_rate": 4.303816094660726e-06, + "loss": 0.731, + "step": 6872 + }, + { + "epoch": 0.56, + "grad_norm": 3.6528611211965876, + "learning_rate": 4.3025135769943786e-06, + "loss": 0.6031, + "step": 6873 + }, + { + "epoch": 0.56, + "grad_norm": 2.295379776572049, + "learning_rate": 4.301211107598484e-06, + "loss": 0.6954, + "step": 6874 + }, + { + "epoch": 0.56, + "grad_norm": 3.2718876613953887, + "learning_rate": 4.2999086865631825e-06, + "loss": 0.6587, + "step": 6875 + }, + { + "epoch": 0.56, + "grad_norm": 2.477917529947379, + "learning_rate": 4.298606313978605e-06, + "loss": 0.7516, + "step": 6876 + }, + { + "epoch": 0.56, + "grad_norm": 2.9652452416912474, + "learning_rate": 4.297303989934888e-06, + "loss": 0.6927, + "step": 6877 + }, + { + "epoch": 0.56, + "grad_norm": 3.5725438439547745, + "learning_rate": 4.29600171452216e-06, + "loss": 0.6893, + "step": 6878 + }, + { + "epoch": 0.56, + "grad_norm": 3.2835239613039704, + "learning_rate": 4.294699487830546e-06, + "loss": 0.7377, + "step": 6879 + }, + { + "epoch": 0.56, + "grad_norm": 6.893217186711873, + "learning_rate": 4.293397309950168e-06, + "loss": 0.6117, + "step": 6880 + }, + { + "epoch": 0.56, + "grad_norm": 2.1266753939261904, + "learning_rate": 4.292095180971145e-06, + "loss": 0.6442, + "step": 6881 + }, + { + "epoch": 0.56, + "grad_norm": 4.169125447443993, + "learning_rate": 4.2907931009835954e-06, + "loss": 0.8305, + "step": 6882 + }, + { + "epoch": 0.56, + "grad_norm": 6.135002503289643, + "learning_rate": 4.289491070077626e-06, + "loss": 0.4442, + "step": 6883 + }, + { + "epoch": 0.56, + "grad_norm": 6.757603041277577, + "learning_rate": 4.288189088343348e-06, + "loss": 0.7315, + "step": 6884 + }, + { + "epoch": 0.56, + "grad_norm": 4.797176624238879, + "learning_rate": 4.286887155870868e-06, + "loss": 0.8499, + "step": 6885 + }, + { + "epoch": 0.56, + "grad_norm": 2.8593098324895605, + "learning_rate": 4.285585272750287e-06, + "loss": 0.6901, + "step": 6886 + }, + { + "epoch": 0.56, + "grad_norm": 3.869462001341872, + "learning_rate": 4.284283439071703e-06, + "loss": 0.6972, + "step": 6887 + }, + { + "epoch": 0.56, + "grad_norm": 3.6847339731251108, + "learning_rate": 4.282981654925214e-06, + "loss": 0.7295, + "step": 6888 + }, + { + "epoch": 0.56, + "grad_norm": 4.949437261496342, + "learning_rate": 4.281679920400907e-06, + "loss": 0.5941, + "step": 6889 + }, + { + "epoch": 0.56, + "grad_norm": 4.978802555034849, + "learning_rate": 4.280378235588872e-06, + "loss": 0.7034, + "step": 6890 + }, + { + "epoch": 0.56, + "grad_norm": 4.376380418818413, + "learning_rate": 4.279076600579194e-06, + "loss": 0.6988, + "step": 6891 + }, + { + "epoch": 0.56, + "grad_norm": 3.4259645241469916, + "learning_rate": 4.277775015461955e-06, + "loss": 0.8049, + "step": 6892 + }, + { + "epoch": 0.56, + "grad_norm": 3.2911235879344076, + "learning_rate": 4.2764734803272325e-06, + "loss": 0.6656, + "step": 6893 + }, + { + "epoch": 0.56, + "grad_norm": 3.127810935896433, + "learning_rate": 4.275171995265101e-06, + "loss": 0.7018, + "step": 6894 + }, + { + "epoch": 0.56, + "grad_norm": 2.831031382255254, + "learning_rate": 4.2738705603656326e-06, + "loss": 0.7153, + "step": 6895 + }, + { + "epoch": 0.56, + "grad_norm": 3.0253779529259353, + "learning_rate": 4.272569175718893e-06, + "loss": 0.731, + "step": 6896 + }, + { + "epoch": 0.56, + "grad_norm": 3.7216029049773622, + "learning_rate": 4.271267841414945e-06, + "loss": 0.558, + "step": 6897 + }, + { + "epoch": 0.56, + "grad_norm": 2.5302661351268227, + "learning_rate": 4.269966557543852e-06, + "loss": 0.7019, + "step": 6898 + }, + { + "epoch": 0.56, + "grad_norm": 3.536498346111739, + "learning_rate": 4.26866532419567e-06, + "loss": 0.5796, + "step": 6899 + }, + { + "epoch": 0.56, + "grad_norm": 3.457232323957409, + "learning_rate": 4.267364141460452e-06, + "loss": 0.6769, + "step": 6900 + }, + { + "epoch": 0.56, + "grad_norm": 2.54255783361201, + "learning_rate": 4.266063009428249e-06, + "loss": 0.693, + "step": 6901 + }, + { + "epoch": 0.56, + "grad_norm": 6.85275688174486, + "learning_rate": 4.264761928189107e-06, + "loss": 0.6411, + "step": 6902 + }, + { + "epoch": 0.56, + "grad_norm": 3.467817956697217, + "learning_rate": 4.263460897833069e-06, + "loss": 0.6038, + "step": 6903 + }, + { + "epoch": 0.56, + "grad_norm": 2.624433923144948, + "learning_rate": 4.2621599184501736e-06, + "loss": 0.6174, + "step": 6904 + }, + { + "epoch": 0.56, + "grad_norm": 3.585874732383417, + "learning_rate": 4.260858990130459e-06, + "loss": 0.785, + "step": 6905 + }, + { + "epoch": 0.56, + "grad_norm": 3.608098982953486, + "learning_rate": 4.259558112963954e-06, + "loss": 0.755, + "step": 6906 + }, + { + "epoch": 0.56, + "grad_norm": 5.541333386927576, + "learning_rate": 4.258257287040692e-06, + "loss": 0.7412, + "step": 6907 + }, + { + "epoch": 0.56, + "grad_norm": 2.5274300499259263, + "learning_rate": 4.256956512450697e-06, + "loss": 0.5357, + "step": 6908 + }, + { + "epoch": 0.56, + "grad_norm": 3.6208632605691893, + "learning_rate": 4.2556557892839875e-06, + "loss": 0.6301, + "step": 6909 + }, + { + "epoch": 0.56, + "grad_norm": 3.9467515991788997, + "learning_rate": 4.254355117630585e-06, + "loss": 0.6172, + "step": 6910 + }, + { + "epoch": 0.56, + "grad_norm": 3.0969169612864027, + "learning_rate": 4.2530544975805034e-06, + "loss": 0.6708, + "step": 6911 + }, + { + "epoch": 0.56, + "grad_norm": 5.043956915185662, + "learning_rate": 4.251753929223754e-06, + "loss": 0.7537, + "step": 6912 + }, + { + "epoch": 0.56, + "grad_norm": 3.5173455429655096, + "learning_rate": 4.250453412650343e-06, + "loss": 0.7341, + "step": 6913 + }, + { + "epoch": 0.56, + "grad_norm": 3.085061272051881, + "learning_rate": 4.249152947950276e-06, + "loss": 0.6552, + "step": 6914 + }, + { + "epoch": 0.56, + "grad_norm": 2.6985279728188476, + "learning_rate": 4.247852535213554e-06, + "loss": 0.5843, + "step": 6915 + }, + { + "epoch": 0.56, + "grad_norm": 3.3724583306023295, + "learning_rate": 4.246552174530171e-06, + "loss": 0.6629, + "step": 6916 + }, + { + "epoch": 0.56, + "grad_norm": 4.98527057985444, + "learning_rate": 4.245251865990122e-06, + "loss": 0.6779, + "step": 6917 + }, + { + "epoch": 0.56, + "grad_norm": 2.7148417949240664, + "learning_rate": 4.243951609683395e-06, + "loss": 0.7512, + "step": 6918 + }, + { + "epoch": 0.56, + "grad_norm": 2.749127735519029, + "learning_rate": 4.242651405699979e-06, + "loss": 0.6893, + "step": 6919 + }, + { + "epoch": 0.56, + "grad_norm": 3.2642926709931808, + "learning_rate": 4.241351254129854e-06, + "loss": 0.6173, + "step": 6920 + }, + { + "epoch": 0.56, + "grad_norm": 5.090029937141236, + "learning_rate": 4.240051155063e-06, + "loss": 0.7109, + "step": 6921 + }, + { + "epoch": 0.56, + "grad_norm": 2.768285255760678, + "learning_rate": 4.238751108589389e-06, + "loss": 0.6719, + "step": 6922 + }, + { + "epoch": 0.56, + "grad_norm": 2.8344256712184275, + "learning_rate": 4.237451114798995e-06, + "loss": 0.7396, + "step": 6923 + }, + { + "epoch": 0.56, + "grad_norm": 10.72485084269237, + "learning_rate": 4.236151173781785e-06, + "loss": 0.6967, + "step": 6924 + }, + { + "epoch": 0.56, + "grad_norm": 2.5303867448420116, + "learning_rate": 4.2348512856277235e-06, + "loss": 0.6694, + "step": 6925 + }, + { + "epoch": 0.56, + "grad_norm": 3.6706371518829792, + "learning_rate": 4.233551450426772e-06, + "loss": 0.8611, + "step": 6926 + }, + { + "epoch": 0.56, + "grad_norm": 2.9105442807704764, + "learning_rate": 4.232251668268884e-06, + "loss": 0.6477, + "step": 6927 + }, + { + "epoch": 0.56, + "grad_norm": 2.947705037421373, + "learning_rate": 4.2309519392440175e-06, + "loss": 0.6659, + "step": 6928 + }, + { + "epoch": 0.56, + "grad_norm": 6.402133450303937, + "learning_rate": 4.229652263442119e-06, + "loss": 0.5482, + "step": 6929 + }, + { + "epoch": 0.56, + "grad_norm": 5.661679875807692, + "learning_rate": 4.228352640953132e-06, + "loss": 0.6949, + "step": 6930 + }, + { + "epoch": 0.56, + "grad_norm": 3.6436215324016223, + "learning_rate": 4.227053071867001e-06, + "loss": 0.5583, + "step": 6931 + }, + { + "epoch": 0.56, + "grad_norm": 3.044194250327129, + "learning_rate": 4.225753556273665e-06, + "loss": 0.6226, + "step": 6932 + }, + { + "epoch": 0.56, + "grad_norm": 3.5078450917745547, + "learning_rate": 4.224454094263058e-06, + "loss": 0.7228, + "step": 6933 + }, + { + "epoch": 0.56, + "grad_norm": 3.2739072696751292, + "learning_rate": 4.223154685925112e-06, + "loss": 0.5581, + "step": 6934 + }, + { + "epoch": 0.56, + "grad_norm": 3.990188184137978, + "learning_rate": 4.221855331349753e-06, + "loss": 0.6212, + "step": 6935 + }, + { + "epoch": 0.56, + "grad_norm": 2.1403026590456626, + "learning_rate": 4.220556030626904e-06, + "loss": 0.6222, + "step": 6936 + }, + { + "epoch": 0.56, + "grad_norm": 2.412876906849242, + "learning_rate": 4.219256783846486e-06, + "loss": 0.8232, + "step": 6937 + }, + { + "epoch": 0.56, + "grad_norm": 3.6908419423726913, + "learning_rate": 4.217957591098413e-06, + "loss": 0.6956, + "step": 6938 + }, + { + "epoch": 0.56, + "grad_norm": 2.6590647870066806, + "learning_rate": 4.216658452472599e-06, + "loss": 0.7058, + "step": 6939 + }, + { + "epoch": 0.56, + "grad_norm": 3.647769449392533, + "learning_rate": 4.215359368058953e-06, + "loss": 0.6135, + "step": 6940 + }, + { + "epoch": 0.56, + "grad_norm": 2.9181381986913277, + "learning_rate": 4.214060337947381e-06, + "loss": 0.7652, + "step": 6941 + }, + { + "epoch": 0.56, + "grad_norm": 2.8977150255200117, + "learning_rate": 4.21276136222778e-06, + "loss": 0.7265, + "step": 6942 + }, + { + "epoch": 0.56, + "grad_norm": 2.7067296603603976, + "learning_rate": 4.21146244099005e-06, + "loss": 0.7343, + "step": 6943 + }, + { + "epoch": 0.56, + "grad_norm": 4.068896090182501, + "learning_rate": 4.210163574324085e-06, + "loss": 0.7433, + "step": 6944 + }, + { + "epoch": 0.56, + "grad_norm": 4.445737546903657, + "learning_rate": 4.208864762319773e-06, + "loss": 0.6623, + "step": 6945 + }, + { + "epoch": 0.56, + "grad_norm": 2.1674324217444743, + "learning_rate": 4.207566005067001e-06, + "loss": 0.6069, + "step": 6946 + }, + { + "epoch": 0.56, + "grad_norm": 3.066015263098254, + "learning_rate": 4.206267302655651e-06, + "loss": 0.6724, + "step": 6947 + }, + { + "epoch": 0.56, + "grad_norm": 4.278582462049776, + "learning_rate": 4.204968655175603e-06, + "loss": 0.6913, + "step": 6948 + }, + { + "epoch": 0.56, + "grad_norm": 2.616174743636218, + "learning_rate": 4.203670062716728e-06, + "loss": 0.7945, + "step": 6949 + }, + { + "epoch": 0.56, + "grad_norm": 4.157050326997998, + "learning_rate": 4.202371525368899e-06, + "loss": 0.7758, + "step": 6950 + }, + { + "epoch": 0.56, + "grad_norm": 3.396124135036793, + "learning_rate": 4.2010730432219845e-06, + "loss": 0.7564, + "step": 6951 + }, + { + "epoch": 0.56, + "grad_norm": 6.230260829983935, + "learning_rate": 4.199774616365844e-06, + "loss": 0.6655, + "step": 6952 + }, + { + "epoch": 0.56, + "grad_norm": 3.3868972231220447, + "learning_rate": 4.198476244890338e-06, + "loss": 0.6684, + "step": 6953 + }, + { + "epoch": 0.56, + "grad_norm": 2.602355793928937, + "learning_rate": 4.197177928885324e-06, + "loss": 0.8673, + "step": 6954 + }, + { + "epoch": 0.56, + "grad_norm": 6.951792856718155, + "learning_rate": 4.195879668440654e-06, + "loss": 0.6314, + "step": 6955 + }, + { + "epoch": 0.56, + "grad_norm": 4.2578614488574695, + "learning_rate": 4.194581463646172e-06, + "loss": 0.6605, + "step": 6956 + }, + { + "epoch": 0.57, + "grad_norm": 5.698667075326142, + "learning_rate": 4.193283314591723e-06, + "loss": 0.7334, + "step": 6957 + }, + { + "epoch": 0.57, + "grad_norm": 4.793750457034329, + "learning_rate": 4.191985221367149e-06, + "loss": 0.6277, + "step": 6958 + }, + { + "epoch": 0.57, + "grad_norm": 2.998544672944845, + "learning_rate": 4.190687184062286e-06, + "loss": 0.6871, + "step": 6959 + }, + { + "epoch": 0.57, + "grad_norm": 2.9266727867606193, + "learning_rate": 4.189389202766966e-06, + "loss": 0.489, + "step": 6960 + }, + { + "epoch": 0.57, + "grad_norm": 2.426347982121342, + "learning_rate": 4.188091277571018e-06, + "loss": 0.9593, + "step": 6961 + }, + { + "epoch": 0.57, + "grad_norm": 2.4510324698773287, + "learning_rate": 4.186793408564264e-06, + "loss": 0.6993, + "step": 6962 + }, + { + "epoch": 0.57, + "grad_norm": 2.7807890692245825, + "learning_rate": 4.1854955958365266e-06, + "loss": 0.6088, + "step": 6963 + }, + { + "epoch": 0.57, + "grad_norm": 3.750802336792746, + "learning_rate": 4.184197839477622e-06, + "loss": 0.6171, + "step": 6964 + }, + { + "epoch": 0.57, + "grad_norm": 7.762623844184858, + "learning_rate": 4.182900139577365e-06, + "loss": 0.718, + "step": 6965 + }, + { + "epoch": 0.57, + "grad_norm": 8.229150675378827, + "learning_rate": 4.181602496225562e-06, + "loss": 0.6466, + "step": 6966 + }, + { + "epoch": 0.57, + "grad_norm": 3.2659685179011646, + "learning_rate": 4.180304909512021e-06, + "loss": 0.5392, + "step": 6967 + }, + { + "epoch": 0.57, + "grad_norm": 2.74212297377412, + "learning_rate": 4.179007379526541e-06, + "loss": 0.6128, + "step": 6968 + }, + { + "epoch": 0.57, + "grad_norm": 3.6857015054727786, + "learning_rate": 4.17770990635892e-06, + "loss": 0.7609, + "step": 6969 + }, + { + "epoch": 0.57, + "grad_norm": 2.4533151660888293, + "learning_rate": 4.17641249009895e-06, + "loss": 0.6076, + "step": 6970 + }, + { + "epoch": 0.57, + "grad_norm": 8.432819590213036, + "learning_rate": 4.175115130836421e-06, + "loss": 0.6141, + "step": 6971 + }, + { + "epoch": 0.57, + "grad_norm": 3.5354045453391945, + "learning_rate": 4.17381782866112e-06, + "loss": 0.627, + "step": 6972 + }, + { + "epoch": 0.57, + "grad_norm": 4.104127409640797, + "learning_rate": 4.172520583662825e-06, + "loss": 0.691, + "step": 6973 + }, + { + "epoch": 0.57, + "grad_norm": 6.614995015345534, + "learning_rate": 4.171223395931321e-06, + "loss": 0.6441, + "step": 6974 + }, + { + "epoch": 0.57, + "grad_norm": 4.283319028455133, + "learning_rate": 4.169926265556372e-06, + "loss": 0.7528, + "step": 6975 + }, + { + "epoch": 0.57, + "grad_norm": 3.0450442093745655, + "learning_rate": 4.168629192627754e-06, + "loss": 0.8984, + "step": 6976 + }, + { + "epoch": 0.57, + "grad_norm": 2.4805258942389554, + "learning_rate": 4.1673321772352296e-06, + "loss": 0.7947, + "step": 6977 + }, + { + "epoch": 0.57, + "grad_norm": 2.8356137716526844, + "learning_rate": 4.166035219468561e-06, + "loss": 0.7014, + "step": 6978 + }, + { + "epoch": 0.57, + "grad_norm": 13.372403322537808, + "learning_rate": 4.164738319417507e-06, + "loss": 0.6765, + "step": 6979 + }, + { + "epoch": 0.57, + "grad_norm": 2.7362077028438483, + "learning_rate": 4.16344147717182e-06, + "loss": 0.5522, + "step": 6980 + }, + { + "epoch": 0.57, + "grad_norm": 2.880412823004085, + "learning_rate": 4.162144692821252e-06, + "loss": 0.5945, + "step": 6981 + }, + { + "epoch": 0.57, + "grad_norm": 3.455361614624516, + "learning_rate": 4.160847966455546e-06, + "loss": 0.7302, + "step": 6982 + }, + { + "epoch": 0.57, + "grad_norm": 8.763511073609845, + "learning_rate": 4.159551298164442e-06, + "loss": 0.6853, + "step": 6983 + }, + { + "epoch": 0.57, + "grad_norm": 3.86274625066185, + "learning_rate": 4.158254688037683e-06, + "loss": 0.5585, + "step": 6984 + }, + { + "epoch": 0.57, + "grad_norm": 5.572053454080541, + "learning_rate": 4.156958136164999e-06, + "loss": 0.7547, + "step": 6985 + }, + { + "epoch": 0.57, + "grad_norm": 3.7325630350931895, + "learning_rate": 4.1556616426361195e-06, + "loss": 0.8862, + "step": 6986 + }, + { + "epoch": 0.57, + "grad_norm": 4.010836243194204, + "learning_rate": 4.1543652075407705e-06, + "loss": 0.6712, + "step": 6987 + }, + { + "epoch": 0.57, + "grad_norm": 2.8972568444904194, + "learning_rate": 4.153068830968676e-06, + "loss": 0.6456, + "step": 6988 + }, + { + "epoch": 0.57, + "grad_norm": 4.105580395751182, + "learning_rate": 4.151772513009549e-06, + "loss": 0.6258, + "step": 6989 + }, + { + "epoch": 0.57, + "grad_norm": 3.5872361108436555, + "learning_rate": 4.150476253753105e-06, + "loss": 0.7067, + "step": 6990 + }, + { + "epoch": 0.57, + "grad_norm": 4.346122099792341, + "learning_rate": 4.149180053289054e-06, + "loss": 0.6976, + "step": 6991 + }, + { + "epoch": 0.57, + "grad_norm": 2.8041949768638776, + "learning_rate": 4.1478839117071e-06, + "loss": 0.6819, + "step": 6992 + }, + { + "epoch": 0.57, + "grad_norm": 4.803441080414738, + "learning_rate": 4.146587829096945e-06, + "loss": 0.8158, + "step": 6993 + }, + { + "epoch": 0.57, + "grad_norm": 3.8389702818027747, + "learning_rate": 4.1452918055482876e-06, + "loss": 0.633, + "step": 6994 + }, + { + "epoch": 0.57, + "grad_norm": 3.139920121476924, + "learning_rate": 4.143995841150816e-06, + "loss": 0.6422, + "step": 6995 + }, + { + "epoch": 0.57, + "grad_norm": 3.6123468082567993, + "learning_rate": 4.142699935994222e-06, + "loss": 0.7024, + "step": 6996 + }, + { + "epoch": 0.57, + "grad_norm": 10.518531232008542, + "learning_rate": 4.141404090168192e-06, + "loss": 0.6559, + "step": 6997 + }, + { + "epoch": 0.57, + "grad_norm": 4.4666480257785395, + "learning_rate": 4.140108303762404e-06, + "loss": 0.6895, + "step": 6998 + }, + { + "epoch": 0.57, + "grad_norm": 4.12669646816345, + "learning_rate": 4.138812576866537e-06, + "loss": 0.737, + "step": 6999 + }, + { + "epoch": 0.57, + "grad_norm": 2.5261102473110837, + "learning_rate": 4.137516909570261e-06, + "loss": 0.5375, + "step": 7000 + }, + { + "epoch": 0.57, + "grad_norm": 3.567310106819449, + "learning_rate": 4.136221301963247e-06, + "loss": 0.6619, + "step": 7001 + }, + { + "epoch": 0.57, + "grad_norm": 2.461835452672331, + "learning_rate": 4.134925754135157e-06, + "loss": 0.6671, + "step": 7002 + }, + { + "epoch": 0.57, + "grad_norm": 3.47197424273473, + "learning_rate": 4.133630266175651e-06, + "loss": 0.6363, + "step": 7003 + }, + { + "epoch": 0.57, + "grad_norm": 4.665543907810832, + "learning_rate": 4.132334838174385e-06, + "loss": 0.6492, + "step": 7004 + }, + { + "epoch": 0.57, + "grad_norm": 3.311331134782175, + "learning_rate": 4.131039470221013e-06, + "loss": 0.5325, + "step": 7005 + }, + { + "epoch": 0.57, + "grad_norm": 10.11120752270408, + "learning_rate": 4.12974416240518e-06, + "loss": 0.8934, + "step": 7006 + }, + { + "epoch": 0.57, + "grad_norm": 5.131770279643075, + "learning_rate": 4.128448914816532e-06, + "loss": 0.4708, + "step": 7007 + }, + { + "epoch": 0.57, + "grad_norm": 3.4207295136638884, + "learning_rate": 4.127153727544706e-06, + "loss": 0.6712, + "step": 7008 + }, + { + "epoch": 0.57, + "grad_norm": 2.820561755816374, + "learning_rate": 4.125858600679339e-06, + "loss": 0.6856, + "step": 7009 + }, + { + "epoch": 0.57, + "grad_norm": 7.089689352256433, + "learning_rate": 4.12456353431006e-06, + "loss": 0.6537, + "step": 7010 + }, + { + "epoch": 0.57, + "grad_norm": 2.955098347963419, + "learning_rate": 4.1232685285264955e-06, + "loss": 0.5982, + "step": 7011 + }, + { + "epoch": 0.57, + "grad_norm": 3.013909341519685, + "learning_rate": 4.12197358341827e-06, + "loss": 0.6128, + "step": 7012 + }, + { + "epoch": 0.57, + "grad_norm": 3.554990686335348, + "learning_rate": 4.120678699075001e-06, + "loss": 0.686, + "step": 7013 + }, + { + "epoch": 0.57, + "grad_norm": 4.071837424331226, + "learning_rate": 4.119383875586304e-06, + "loss": 0.6816, + "step": 7014 + }, + { + "epoch": 0.57, + "grad_norm": 4.047015330144255, + "learning_rate": 4.118089113041787e-06, + "loss": 0.7738, + "step": 7015 + }, + { + "epoch": 0.57, + "grad_norm": 2.7011206439987507, + "learning_rate": 4.116794411531055e-06, + "loss": 0.7857, + "step": 7016 + }, + { + "epoch": 0.57, + "grad_norm": 2.443015591627053, + "learning_rate": 4.115499771143713e-06, + "loss": 0.7378, + "step": 7017 + }, + { + "epoch": 0.57, + "grad_norm": 2.630151132061548, + "learning_rate": 4.114205191969354e-06, + "loss": 0.7302, + "step": 7018 + }, + { + "epoch": 0.57, + "grad_norm": 3.559285243907365, + "learning_rate": 4.1129106740975735e-06, + "loss": 0.6594, + "step": 7019 + }, + { + "epoch": 0.57, + "grad_norm": 2.20982742807074, + "learning_rate": 4.11161621761796e-06, + "loss": 0.5712, + "step": 7020 + }, + { + "epoch": 0.57, + "grad_norm": 2.650858541869961, + "learning_rate": 4.1103218226201e-06, + "loss": 0.683, + "step": 7021 + }, + { + "epoch": 0.57, + "grad_norm": 3.3895925274012786, + "learning_rate": 4.10902748919357e-06, + "loss": 0.7497, + "step": 7022 + }, + { + "epoch": 0.57, + "grad_norm": 3.1332893213838604, + "learning_rate": 4.1077332174279475e-06, + "loss": 0.6315, + "step": 7023 + }, + { + "epoch": 0.57, + "grad_norm": 4.906590289956032, + "learning_rate": 4.106439007412806e-06, + "loss": 0.7657, + "step": 7024 + }, + { + "epoch": 0.57, + "grad_norm": 3.697615257312409, + "learning_rate": 4.10514485923771e-06, + "loss": 0.6215, + "step": 7025 + }, + { + "epoch": 0.57, + "grad_norm": 3.5427551387827645, + "learning_rate": 4.103850772992224e-06, + "loss": 0.57, + "step": 7026 + }, + { + "epoch": 0.57, + "grad_norm": 2.965693629689701, + "learning_rate": 4.10255674876591e-06, + "loss": 0.6873, + "step": 7027 + }, + { + "epoch": 0.57, + "grad_norm": 4.218038610573853, + "learning_rate": 4.101262786648317e-06, + "loss": 0.69, + "step": 7028 + }, + { + "epoch": 0.57, + "grad_norm": 4.794665487858357, + "learning_rate": 4.099968886728998e-06, + "loss": 0.6925, + "step": 7029 + }, + { + "epoch": 0.57, + "grad_norm": 4.011174925049835, + "learning_rate": 4.098675049097499e-06, + "loss": 0.7004, + "step": 7030 + }, + { + "epoch": 0.57, + "grad_norm": 7.801034212070132, + "learning_rate": 4.097381273843363e-06, + "loss": 0.7107, + "step": 7031 + }, + { + "epoch": 0.57, + "grad_norm": 5.283237052604058, + "learning_rate": 4.096087561056126e-06, + "loss": 0.7031, + "step": 7032 + }, + { + "epoch": 0.57, + "grad_norm": 3.044230294571237, + "learning_rate": 4.09479391082532e-06, + "loss": 0.7869, + "step": 7033 + }, + { + "epoch": 0.57, + "grad_norm": 6.176605078991393, + "learning_rate": 4.093500323240479e-06, + "loss": 0.6318, + "step": 7034 + }, + { + "epoch": 0.57, + "grad_norm": 2.7351750786253635, + "learning_rate": 4.09220679839112e-06, + "loss": 0.6997, + "step": 7035 + }, + { + "epoch": 0.57, + "grad_norm": 2.885638212857847, + "learning_rate": 4.0909133363667654e-06, + "loss": 0.784, + "step": 7036 + }, + { + "epoch": 0.57, + "grad_norm": 2.5717807975805242, + "learning_rate": 4.089619937256934e-06, + "loss": 0.655, + "step": 7037 + }, + { + "epoch": 0.57, + "grad_norm": 2.431623769657669, + "learning_rate": 4.088326601151134e-06, + "loss": 0.6268, + "step": 7038 + }, + { + "epoch": 0.57, + "grad_norm": 3.0015131907465715, + "learning_rate": 4.087033328138875e-06, + "loss": 0.5873, + "step": 7039 + }, + { + "epoch": 0.57, + "grad_norm": 3.2371315591211256, + "learning_rate": 4.085740118309657e-06, + "loss": 0.7532, + "step": 7040 + }, + { + "epoch": 0.57, + "grad_norm": 4.075216512844301, + "learning_rate": 4.084446971752981e-06, + "loss": 0.8184, + "step": 7041 + }, + { + "epoch": 0.57, + "grad_norm": 3.3846155243129536, + "learning_rate": 4.0831538885583384e-06, + "loss": 0.5013, + "step": 7042 + }, + { + "epoch": 0.57, + "grad_norm": 2.444416255808669, + "learning_rate": 4.08186086881522e-06, + "loss": 0.6642, + "step": 7043 + }, + { + "epoch": 0.57, + "grad_norm": 2.85389886969537, + "learning_rate": 4.0805679126131096e-06, + "loss": 0.6088, + "step": 7044 + }, + { + "epoch": 0.57, + "grad_norm": 5.217992271804968, + "learning_rate": 4.079275020041489e-06, + "loss": 0.6256, + "step": 7045 + }, + { + "epoch": 0.57, + "grad_norm": 3.740372737200289, + "learning_rate": 4.0779821911898345e-06, + "loss": 0.6677, + "step": 7046 + }, + { + "epoch": 0.57, + "grad_norm": 2.658326584943097, + "learning_rate": 4.07668942614762e-06, + "loss": 0.6169, + "step": 7047 + }, + { + "epoch": 0.57, + "grad_norm": 2.500953519065944, + "learning_rate": 4.075396725004308e-06, + "loss": 0.7561, + "step": 7048 + }, + { + "epoch": 0.57, + "grad_norm": 4.033602099855156, + "learning_rate": 4.074104087849366e-06, + "loss": 0.7109, + "step": 7049 + }, + { + "epoch": 0.57, + "grad_norm": 4.018652416607557, + "learning_rate": 4.072811514772251e-06, + "loss": 0.6418, + "step": 7050 + }, + { + "epoch": 0.57, + "grad_norm": 3.475612196135787, + "learning_rate": 4.071519005862416e-06, + "loss": 0.6051, + "step": 7051 + }, + { + "epoch": 0.57, + "grad_norm": 2.571037654524126, + "learning_rate": 4.0702265612093125e-06, + "loss": 0.5917, + "step": 7052 + }, + { + "epoch": 0.57, + "grad_norm": 2.672389462818635, + "learning_rate": 4.068934180902385e-06, + "loss": 0.7578, + "step": 7053 + }, + { + "epoch": 0.57, + "grad_norm": 10.033104494828656, + "learning_rate": 4.067641865031076e-06, + "loss": 0.6651, + "step": 7054 + }, + { + "epoch": 0.57, + "grad_norm": 4.801455431537581, + "learning_rate": 4.06634961368482e-06, + "loss": 0.5716, + "step": 7055 + }, + { + "epoch": 0.57, + "grad_norm": 3.051278724372702, + "learning_rate": 4.065057426953049e-06, + "loss": 0.5318, + "step": 7056 + }, + { + "epoch": 0.57, + "grad_norm": 2.4216953608249656, + "learning_rate": 4.0637653049251915e-06, + "loss": 0.5733, + "step": 7057 + }, + { + "epoch": 0.57, + "grad_norm": 2.981763234104512, + "learning_rate": 4.0624732476906695e-06, + "loss": 0.5114, + "step": 7058 + }, + { + "epoch": 0.57, + "grad_norm": 3.3952158015716645, + "learning_rate": 4.061181255338902e-06, + "loss": 0.7248, + "step": 7059 + }, + { + "epoch": 0.57, + "grad_norm": 4.457571544170344, + "learning_rate": 4.059889327959302e-06, + "loss": 0.6903, + "step": 7060 + }, + { + "epoch": 0.57, + "grad_norm": 3.772096922595958, + "learning_rate": 4.058597465641283e-06, + "loss": 0.7111, + "step": 7061 + }, + { + "epoch": 0.57, + "grad_norm": 3.151545206750877, + "learning_rate": 4.057305668474244e-06, + "loss": 0.735, + "step": 7062 + }, + { + "epoch": 0.57, + "grad_norm": 4.73946926290578, + "learning_rate": 4.0560139365475885e-06, + "loss": 0.526, + "step": 7063 + }, + { + "epoch": 0.57, + "grad_norm": 2.40225347193254, + "learning_rate": 4.054722269950714e-06, + "loss": 0.6833, + "step": 7064 + }, + { + "epoch": 0.57, + "grad_norm": 2.8861880836575597, + "learning_rate": 4.053430668773009e-06, + "loss": 0.5289, + "step": 7065 + }, + { + "epoch": 0.57, + "grad_norm": 3.4858856994498515, + "learning_rate": 4.0521391331038624e-06, + "loss": 0.6765, + "step": 7066 + }, + { + "epoch": 0.57, + "grad_norm": 5.4327513934305145, + "learning_rate": 4.050847663032657e-06, + "loss": 0.5509, + "step": 7067 + }, + { + "epoch": 0.57, + "grad_norm": 4.679572625314522, + "learning_rate": 4.0495562586487685e-06, + "loss": 0.6792, + "step": 7068 + }, + { + "epoch": 0.57, + "grad_norm": 3.189841680618907, + "learning_rate": 4.048264920041571e-06, + "loss": 0.6398, + "step": 7069 + }, + { + "epoch": 0.57, + "grad_norm": 3.120392246091671, + "learning_rate": 4.046973647300434e-06, + "loss": 0.7439, + "step": 7070 + }, + { + "epoch": 0.57, + "grad_norm": 5.020875457136881, + "learning_rate": 4.045682440514721e-06, + "loss": 0.7007, + "step": 7071 + }, + { + "epoch": 0.57, + "grad_norm": 2.7445980263530205, + "learning_rate": 4.044391299773793e-06, + "loss": 0.663, + "step": 7072 + }, + { + "epoch": 0.57, + "grad_norm": 4.252267745389334, + "learning_rate": 4.043100225167004e-06, + "loss": 0.7099, + "step": 7073 + }, + { + "epoch": 0.57, + "grad_norm": 3.3732567284536623, + "learning_rate": 4.041809216783705e-06, + "loss": 0.843, + "step": 7074 + }, + { + "epoch": 0.57, + "grad_norm": 3.54904892470229, + "learning_rate": 4.04051827471324e-06, + "loss": 0.6988, + "step": 7075 + }, + { + "epoch": 0.57, + "grad_norm": 4.4150302327769575, + "learning_rate": 4.039227399044952e-06, + "loss": 0.6566, + "step": 7076 + }, + { + "epoch": 0.57, + "grad_norm": 2.196855632678299, + "learning_rate": 4.037936589868179e-06, + "loss": 0.6491, + "step": 7077 + }, + { + "epoch": 0.57, + "grad_norm": 2.1056305609931387, + "learning_rate": 4.0366458472722495e-06, + "loss": 0.7972, + "step": 7078 + }, + { + "epoch": 0.57, + "grad_norm": 2.579276927801973, + "learning_rate": 4.035355171346494e-06, + "loss": 0.7998, + "step": 7079 + }, + { + "epoch": 0.58, + "grad_norm": 6.366461185850729, + "learning_rate": 4.034064562180236e-06, + "loss": 0.7302, + "step": 7080 + }, + { + "epoch": 0.58, + "grad_norm": 2.0880156000446632, + "learning_rate": 4.032774019862791e-06, + "loss": 0.6702, + "step": 7081 + }, + { + "epoch": 0.58, + "grad_norm": 135.53772051368554, + "learning_rate": 4.0314835444834744e-06, + "loss": 0.5617, + "step": 7082 + }, + { + "epoch": 0.58, + "grad_norm": 3.383949655414227, + "learning_rate": 4.030193136131594e-06, + "loss": 0.7293, + "step": 7083 + }, + { + "epoch": 0.58, + "grad_norm": 2.4175060882339605, + "learning_rate": 4.028902794896455e-06, + "loss": 0.7137, + "step": 7084 + }, + { + "epoch": 0.58, + "grad_norm": 3.714319371619351, + "learning_rate": 4.027612520867357e-06, + "loss": 0.6962, + "step": 7085 + }, + { + "epoch": 0.58, + "grad_norm": 3.7014320202532955, + "learning_rate": 4.026322314133596e-06, + "loss": 0.7067, + "step": 7086 + }, + { + "epoch": 0.58, + "grad_norm": 2.360488574682084, + "learning_rate": 4.025032174784463e-06, + "loss": 0.7352, + "step": 7087 + }, + { + "epoch": 0.58, + "grad_norm": 10.120989490702291, + "learning_rate": 4.0237421029092405e-06, + "loss": 0.6739, + "step": 7088 + }, + { + "epoch": 0.58, + "grad_norm": 4.394345164597692, + "learning_rate": 4.022452098597212e-06, + "loss": 0.7594, + "step": 7089 + }, + { + "epoch": 0.58, + "grad_norm": 3.308053383223621, + "learning_rate": 4.021162161937653e-06, + "loss": 0.6454, + "step": 7090 + }, + { + "epoch": 0.58, + "grad_norm": 3.9880934345528547, + "learning_rate": 4.019872293019835e-06, + "loss": 0.5711, + "step": 7091 + }, + { + "epoch": 0.58, + "grad_norm": 2.6387960463333426, + "learning_rate": 4.018582491933027e-06, + "loss": 0.6446, + "step": 7092 + }, + { + "epoch": 0.58, + "grad_norm": 4.837870306150221, + "learning_rate": 4.017292758766489e-06, + "loss": 0.7222, + "step": 7093 + }, + { + "epoch": 0.58, + "grad_norm": 4.02634876717946, + "learning_rate": 4.016003093609482e-06, + "loss": 0.6411, + "step": 7094 + }, + { + "epoch": 0.58, + "grad_norm": 2.0614844495383546, + "learning_rate": 4.0147134965512555e-06, + "loss": 0.5738, + "step": 7095 + }, + { + "epoch": 0.58, + "grad_norm": 4.779380707527782, + "learning_rate": 4.0134239676810575e-06, + "loss": 0.7193, + "step": 7096 + }, + { + "epoch": 0.58, + "grad_norm": 4.471159342181864, + "learning_rate": 4.012134507088135e-06, + "loss": 0.6231, + "step": 7097 + }, + { + "epoch": 0.58, + "grad_norm": 2.7267299535950005, + "learning_rate": 4.010845114861724e-06, + "loss": 0.7063, + "step": 7098 + }, + { + "epoch": 0.58, + "grad_norm": 2.048546602296547, + "learning_rate": 4.009555791091058e-06, + "loss": 0.5358, + "step": 7099 + }, + { + "epoch": 0.58, + "grad_norm": 3.334308585353318, + "learning_rate": 4.00826653586537e-06, + "loss": 0.5832, + "step": 7100 + }, + { + "epoch": 0.58, + "grad_norm": 2.8520865479760427, + "learning_rate": 4.00697734927388e-06, + "loss": 0.7655, + "step": 7101 + }, + { + "epoch": 0.58, + "grad_norm": 2.81653992797605, + "learning_rate": 4.005688231405811e-06, + "loss": 0.714, + "step": 7102 + }, + { + "epoch": 0.58, + "grad_norm": 4.571812163577576, + "learning_rate": 4.004399182350377e-06, + "loss": 0.5714, + "step": 7103 + }, + { + "epoch": 0.58, + "grad_norm": 2.31253948320934, + "learning_rate": 4.003110202196787e-06, + "loss": 0.6416, + "step": 7104 + }, + { + "epoch": 0.58, + "grad_norm": 5.421226694768261, + "learning_rate": 4.00182129103425e-06, + "loss": 0.7174, + "step": 7105 + }, + { + "epoch": 0.58, + "grad_norm": 3.477881026038711, + "learning_rate": 4.0005324489519634e-06, + "loss": 0.7019, + "step": 7106 + }, + { + "epoch": 0.58, + "grad_norm": 2.6006236833725995, + "learning_rate": 3.999243676039127e-06, + "loss": 0.8622, + "step": 7107 + }, + { + "epoch": 0.58, + "grad_norm": 2.9735132406338485, + "learning_rate": 3.997954972384928e-06, + "loss": 0.7592, + "step": 7108 + }, + { + "epoch": 0.58, + "grad_norm": 4.028429629547646, + "learning_rate": 3.996666338078553e-06, + "loss": 0.7227, + "step": 7109 + }, + { + "epoch": 0.58, + "grad_norm": 7.430154104602893, + "learning_rate": 3.9953777732091854e-06, + "loss": 0.7075, + "step": 7110 + }, + { + "epoch": 0.58, + "grad_norm": 4.983411353107799, + "learning_rate": 3.994089277866001e-06, + "loss": 0.5159, + "step": 7111 + }, + { + "epoch": 0.58, + "grad_norm": 3.1490092625068558, + "learning_rate": 3.992800852138174e-06, + "loss": 0.6903, + "step": 7112 + }, + { + "epoch": 0.58, + "grad_norm": 2.4223680871025013, + "learning_rate": 3.991512496114869e-06, + "loss": 0.6953, + "step": 7113 + }, + { + "epoch": 0.58, + "grad_norm": 8.073684133253089, + "learning_rate": 3.990224209885251e-06, + "loss": 0.7106, + "step": 7114 + }, + { + "epoch": 0.58, + "grad_norm": 2.840984998535892, + "learning_rate": 3.988935993538474e-06, + "loss": 0.8413, + "step": 7115 + }, + { + "epoch": 0.58, + "grad_norm": 3.017826991115728, + "learning_rate": 3.987647847163692e-06, + "loss": 0.706, + "step": 7116 + }, + { + "epoch": 0.58, + "grad_norm": 3.94102754559899, + "learning_rate": 3.986359770850053e-06, + "loss": 0.6627, + "step": 7117 + }, + { + "epoch": 0.58, + "grad_norm": 5.0866405948519455, + "learning_rate": 3.985071764686701e-06, + "loss": 0.652, + "step": 7118 + }, + { + "epoch": 0.58, + "grad_norm": 3.6973490664048487, + "learning_rate": 3.983783828762773e-06, + "loss": 0.7126, + "step": 7119 + }, + { + "epoch": 0.58, + "grad_norm": 4.608764955189467, + "learning_rate": 3.9824959631674045e-06, + "loss": 0.5954, + "step": 7120 + }, + { + "epoch": 0.58, + "grad_norm": 3.9712986111403015, + "learning_rate": 3.9812081679897205e-06, + "loss": 0.7672, + "step": 7121 + }, + { + "epoch": 0.58, + "grad_norm": 3.274576886750923, + "learning_rate": 3.979920443318847e-06, + "loss": 0.6183, + "step": 7122 + }, + { + "epoch": 0.58, + "grad_norm": 8.080289535774286, + "learning_rate": 3.9786327892439e-06, + "loss": 0.7738, + "step": 7123 + }, + { + "epoch": 0.58, + "grad_norm": 3.6606731635016145, + "learning_rate": 3.977345205853996e-06, + "loss": 0.6448, + "step": 7124 + }, + { + "epoch": 0.58, + "grad_norm": 30.708902604430268, + "learning_rate": 3.976057693238243e-06, + "loss": 0.6561, + "step": 7125 + }, + { + "epoch": 0.58, + "grad_norm": 8.9008171536227, + "learning_rate": 3.974770251485745e-06, + "loss": 0.579, + "step": 7126 + }, + { + "epoch": 0.58, + "grad_norm": 3.2694346401965295, + "learning_rate": 3.973482880685603e-06, + "loss": 0.5499, + "step": 7127 + }, + { + "epoch": 0.58, + "grad_norm": 2.319533322066518, + "learning_rate": 3.972195580926906e-06, + "loss": 0.6618, + "step": 7128 + }, + { + "epoch": 0.58, + "grad_norm": 3.7157301424392157, + "learning_rate": 3.970908352298747e-06, + "loss": 0.6377, + "step": 7129 + }, + { + "epoch": 0.58, + "grad_norm": 4.11385347410864, + "learning_rate": 3.969621194890211e-06, + "loss": 0.8272, + "step": 7130 + }, + { + "epoch": 0.58, + "grad_norm": 3.5221889443121577, + "learning_rate": 3.968334108790375e-06, + "loss": 0.7417, + "step": 7131 + }, + { + "epoch": 0.58, + "grad_norm": 2.4143592836641257, + "learning_rate": 3.9670470940883144e-06, + "loss": 0.6909, + "step": 7132 + }, + { + "epoch": 0.58, + "grad_norm": 2.735378463235592, + "learning_rate": 3.965760150873101e-06, + "loss": 0.6633, + "step": 7133 + }, + { + "epoch": 0.58, + "grad_norm": 3.1453425545495413, + "learning_rate": 3.9644732792337956e-06, + "loss": 0.8668, + "step": 7134 + }, + { + "epoch": 0.58, + "grad_norm": 3.224006594471027, + "learning_rate": 3.963186479259459e-06, + "loss": 0.6877, + "step": 7135 + }, + { + "epoch": 0.58, + "grad_norm": 3.2155981706789656, + "learning_rate": 3.961899751039146e-06, + "loss": 0.655, + "step": 7136 + }, + { + "epoch": 0.58, + "grad_norm": 2.752151974816608, + "learning_rate": 3.960613094661908e-06, + "loss": 0.8427, + "step": 7137 + }, + { + "epoch": 0.58, + "grad_norm": 2.4229192556137007, + "learning_rate": 3.959326510216788e-06, + "loss": 0.811, + "step": 7138 + }, + { + "epoch": 0.58, + "grad_norm": 2.6910878124223054, + "learning_rate": 3.9580399977928256e-06, + "loss": 0.7086, + "step": 7139 + }, + { + "epoch": 0.58, + "grad_norm": 3.395493643124682, + "learning_rate": 3.956753557479058e-06, + "loss": 0.5931, + "step": 7140 + }, + { + "epoch": 0.58, + "grad_norm": 4.458235396836014, + "learning_rate": 3.955467189364511e-06, + "loss": 0.7405, + "step": 7141 + }, + { + "epoch": 0.58, + "grad_norm": 2.6195110177642635, + "learning_rate": 3.954180893538212e-06, + "loss": 0.6578, + "step": 7142 + }, + { + "epoch": 0.58, + "grad_norm": 2.6294253289058123, + "learning_rate": 3.952894670089179e-06, + "loss": 0.6806, + "step": 7143 + }, + { + "epoch": 0.58, + "grad_norm": 2.3949243170800902, + "learning_rate": 3.951608519106429e-06, + "loss": 0.6714, + "step": 7144 + }, + { + "epoch": 0.58, + "grad_norm": 4.743017411551807, + "learning_rate": 3.950322440678972e-06, + "loss": 0.5838, + "step": 7145 + }, + { + "epoch": 0.58, + "grad_norm": 6.840928874491977, + "learning_rate": 3.94903643489581e-06, + "loss": 0.8645, + "step": 7146 + }, + { + "epoch": 0.58, + "grad_norm": 5.160408115665429, + "learning_rate": 3.947750501845946e-06, + "loss": 0.6656, + "step": 7147 + }, + { + "epoch": 0.58, + "grad_norm": 4.537840446327892, + "learning_rate": 3.946464641618371e-06, + "loss": 0.6054, + "step": 7148 + }, + { + "epoch": 0.58, + "grad_norm": 4.450911099054057, + "learning_rate": 3.945178854302075e-06, + "loss": 0.6101, + "step": 7149 + }, + { + "epoch": 0.58, + "grad_norm": 2.85610640593839, + "learning_rate": 3.943893139986046e-06, + "loss": 0.5647, + "step": 7150 + }, + { + "epoch": 0.58, + "grad_norm": 5.6872553457541155, + "learning_rate": 3.942607498759261e-06, + "loss": 0.6266, + "step": 7151 + }, + { + "epoch": 0.58, + "grad_norm": 2.467327264867698, + "learning_rate": 3.941321930710695e-06, + "loss": 0.6275, + "step": 7152 + }, + { + "epoch": 0.58, + "grad_norm": 2.4375852895068206, + "learning_rate": 3.940036435929318e-06, + "loss": 0.7381, + "step": 7153 + }, + { + "epoch": 0.58, + "grad_norm": 3.0608470066806936, + "learning_rate": 3.938751014504093e-06, + "loss": 0.7376, + "step": 7154 + }, + { + "epoch": 0.58, + "grad_norm": 29.294594195116662, + "learning_rate": 3.937465666523981e-06, + "loss": 0.6674, + "step": 7155 + }, + { + "epoch": 0.58, + "grad_norm": 2.39115818843071, + "learning_rate": 3.9361803920779335e-06, + "loss": 0.6115, + "step": 7156 + }, + { + "epoch": 0.58, + "grad_norm": 2.419905863667368, + "learning_rate": 3.934895191254901e-06, + "loss": 0.7588, + "step": 7157 + }, + { + "epoch": 0.58, + "grad_norm": 3.1178064997011568, + "learning_rate": 3.933610064143829e-06, + "loss": 0.5792, + "step": 7158 + }, + { + "epoch": 0.58, + "grad_norm": 3.866681648895621, + "learning_rate": 3.932325010833654e-06, + "loss": 0.7154, + "step": 7159 + }, + { + "epoch": 0.58, + "grad_norm": 2.99812259081433, + "learning_rate": 3.931040031413313e-06, + "loss": 0.7449, + "step": 7160 + }, + { + "epoch": 0.58, + "grad_norm": 2.168260594487398, + "learning_rate": 3.929755125971731e-06, + "loss": 0.4579, + "step": 7161 + }, + { + "epoch": 0.58, + "grad_norm": 3.051557709198906, + "learning_rate": 3.928470294597834e-06, + "loss": 0.6434, + "step": 7162 + }, + { + "epoch": 0.58, + "grad_norm": 2.244744314834905, + "learning_rate": 3.927185537380539e-06, + "loss": 0.717, + "step": 7163 + }, + { + "epoch": 0.58, + "grad_norm": 4.6015150906868225, + "learning_rate": 3.925900854408759e-06, + "loss": 0.7066, + "step": 7164 + }, + { + "epoch": 0.58, + "grad_norm": 3.627472554385189, + "learning_rate": 3.924616245771403e-06, + "loss": 0.5849, + "step": 7165 + }, + { + "epoch": 0.58, + "grad_norm": 3.0429335049343633, + "learning_rate": 3.9233317115573745e-06, + "loss": 0.7272, + "step": 7166 + }, + { + "epoch": 0.58, + "grad_norm": 2.8403723486645127, + "learning_rate": 3.922047251855572e-06, + "loss": 0.6936, + "step": 7167 + }, + { + "epoch": 0.58, + "grad_norm": 2.428126481669758, + "learning_rate": 3.9207628667548855e-06, + "loss": 0.5414, + "step": 7168 + }, + { + "epoch": 0.58, + "grad_norm": 3.835722169142076, + "learning_rate": 3.919478556344205e-06, + "loss": 0.6228, + "step": 7169 + }, + { + "epoch": 0.58, + "grad_norm": 3.324640521430359, + "learning_rate": 3.918194320712412e-06, + "loss": 0.6293, + "step": 7170 + }, + { + "epoch": 0.58, + "grad_norm": 2.4802307783981914, + "learning_rate": 3.916910159948382e-06, + "loss": 0.6964, + "step": 7171 + }, + { + "epoch": 0.58, + "grad_norm": 3.153639548342947, + "learning_rate": 3.915626074140989e-06, + "loss": 0.8017, + "step": 7172 + }, + { + "epoch": 0.58, + "grad_norm": 2.272287398469343, + "learning_rate": 3.914342063379102e-06, + "loss": 0.7842, + "step": 7173 + }, + { + "epoch": 0.58, + "grad_norm": 2.8165599860236257, + "learning_rate": 3.913058127751578e-06, + "loss": 0.7548, + "step": 7174 + }, + { + "epoch": 0.58, + "grad_norm": 2.2558954578668042, + "learning_rate": 3.911774267347276e-06, + "loss": 0.7581, + "step": 7175 + }, + { + "epoch": 0.58, + "grad_norm": 2.6829229553980136, + "learning_rate": 3.910490482255046e-06, + "loss": 0.6439, + "step": 7176 + }, + { + "epoch": 0.58, + "grad_norm": 2.2670960439596812, + "learning_rate": 3.909206772563735e-06, + "loss": 0.6253, + "step": 7177 + }, + { + "epoch": 0.58, + "grad_norm": 3.0871668272852024, + "learning_rate": 3.907923138362184e-06, + "loss": 0.6802, + "step": 7178 + }, + { + "epoch": 0.58, + "grad_norm": 6.588851475813517, + "learning_rate": 3.906639579739227e-06, + "loss": 0.7277, + "step": 7179 + }, + { + "epoch": 0.58, + "grad_norm": 3.802346326024433, + "learning_rate": 3.9053560967836985e-06, + "loss": 0.608, + "step": 7180 + }, + { + "epoch": 0.58, + "grad_norm": 3.062849233313521, + "learning_rate": 3.904072689584418e-06, + "loss": 0.6987, + "step": 7181 + }, + { + "epoch": 0.58, + "grad_norm": 3.101469149764168, + "learning_rate": 3.902789358230208e-06, + "loss": 0.7503, + "step": 7182 + }, + { + "epoch": 0.58, + "grad_norm": 2.9694095345214118, + "learning_rate": 3.901506102809882e-06, + "loss": 0.7927, + "step": 7183 + }, + { + "epoch": 0.58, + "grad_norm": 2.9872987693511015, + "learning_rate": 3.9002229234122516e-06, + "loss": 0.7916, + "step": 7184 + }, + { + "epoch": 0.58, + "grad_norm": 3.519532135469556, + "learning_rate": 3.898939820126121e-06, + "loss": 0.7042, + "step": 7185 + }, + { + "epoch": 0.58, + "grad_norm": 3.067276273068609, + "learning_rate": 3.897656793040287e-06, + "loss": 0.7501, + "step": 7186 + }, + { + "epoch": 0.58, + "grad_norm": 5.803766889974801, + "learning_rate": 3.896373842243543e-06, + "loss": 0.6409, + "step": 7187 + }, + { + "epoch": 0.58, + "grad_norm": 2.2318748387184173, + "learning_rate": 3.895090967824678e-06, + "loss": 0.6115, + "step": 7188 + }, + { + "epoch": 0.58, + "grad_norm": 2.408297709139427, + "learning_rate": 3.8938081698724755e-06, + "loss": 0.6143, + "step": 7189 + }, + { + "epoch": 0.58, + "grad_norm": 2.2393862555000665, + "learning_rate": 3.892525448475713e-06, + "loss": 0.6762, + "step": 7190 + }, + { + "epoch": 0.58, + "grad_norm": 4.184636086856159, + "learning_rate": 3.891242803723162e-06, + "loss": 0.7141, + "step": 7191 + }, + { + "epoch": 0.58, + "grad_norm": 2.6274662966595788, + "learning_rate": 3.889960235703591e-06, + "loss": 0.7047, + "step": 7192 + }, + { + "epoch": 0.58, + "grad_norm": 2.2508948579955983, + "learning_rate": 3.888677744505762e-06, + "loss": 0.8252, + "step": 7193 + }, + { + "epoch": 0.58, + "grad_norm": 3.1153022288612937, + "learning_rate": 3.887395330218429e-06, + "loss": 0.6737, + "step": 7194 + }, + { + "epoch": 0.58, + "grad_norm": 2.9151043502806306, + "learning_rate": 3.886112992930345e-06, + "loss": 0.6627, + "step": 7195 + }, + { + "epoch": 0.58, + "grad_norm": 2.9304253212175793, + "learning_rate": 3.884830732730256e-06, + "loss": 0.6311, + "step": 7196 + }, + { + "epoch": 0.58, + "grad_norm": 2.5563519750572383, + "learning_rate": 3.883548549706901e-06, + "loss": 0.571, + "step": 7197 + }, + { + "epoch": 0.58, + "grad_norm": 2.621668624211604, + "learning_rate": 3.882266443949016e-06, + "loss": 0.7884, + "step": 7198 + }, + { + "epoch": 0.58, + "grad_norm": 2.8090990735473156, + "learning_rate": 3.880984415545331e-06, + "loss": 0.8881, + "step": 7199 + }, + { + "epoch": 0.58, + "grad_norm": 3.236210003920545, + "learning_rate": 3.879702464584573e-06, + "loss": 0.6562, + "step": 7200 + }, + { + "epoch": 0.58, + "grad_norm": 2.6916479094271137, + "learning_rate": 3.878420591155456e-06, + "loss": 0.6965, + "step": 7201 + }, + { + "epoch": 0.58, + "grad_norm": 4.721446012476177, + "learning_rate": 3.877138795346697e-06, + "loss": 0.721, + "step": 7202 + }, + { + "epoch": 0.59, + "grad_norm": 6.681146630832065, + "learning_rate": 3.875857077247003e-06, + "loss": 0.5663, + "step": 7203 + }, + { + "epoch": 0.59, + "grad_norm": 4.379412145480314, + "learning_rate": 3.8745754369450766e-06, + "loss": 0.785, + "step": 7204 + }, + { + "epoch": 0.59, + "grad_norm": 12.961918217050401, + "learning_rate": 3.873293874529617e-06, + "loss": 0.7024, + "step": 7205 + }, + { + "epoch": 0.59, + "grad_norm": 4.164589381791025, + "learning_rate": 3.872012390089318e-06, + "loss": 0.6477, + "step": 7206 + }, + { + "epoch": 0.59, + "grad_norm": 4.18180053343718, + "learning_rate": 3.870730983712861e-06, + "loss": 0.6177, + "step": 7207 + }, + { + "epoch": 0.59, + "grad_norm": 2.517025835710788, + "learning_rate": 3.86944965548893e-06, + "loss": 0.7079, + "step": 7208 + }, + { + "epoch": 0.59, + "grad_norm": 2.4843008449983923, + "learning_rate": 3.868168405506202e-06, + "loss": 0.6075, + "step": 7209 + }, + { + "epoch": 0.59, + "grad_norm": 2.494483836483101, + "learning_rate": 3.866887233853348e-06, + "loss": 0.477, + "step": 7210 + }, + { + "epoch": 0.59, + "grad_norm": 2.7910383364045024, + "learning_rate": 3.865606140619032e-06, + "loss": 0.6363, + "step": 7211 + }, + { + "epoch": 0.59, + "grad_norm": 3.3259199026576036, + "learning_rate": 3.864325125891912e-06, + "loss": 0.7215, + "step": 7212 + }, + { + "epoch": 0.59, + "grad_norm": 3.0871505619556356, + "learning_rate": 3.863044189760648e-06, + "loss": 0.6827, + "step": 7213 + }, + { + "epoch": 0.59, + "grad_norm": 3.9231385581149323, + "learning_rate": 3.861763332313881e-06, + "loss": 0.6288, + "step": 7214 + }, + { + "epoch": 0.59, + "grad_norm": 2.060535966473672, + "learning_rate": 3.86048255364026e-06, + "loss": 0.5824, + "step": 7215 + }, + { + "epoch": 0.59, + "grad_norm": 9.742416264242609, + "learning_rate": 3.85920185382842e-06, + "loss": 0.5813, + "step": 7216 + }, + { + "epoch": 0.59, + "grad_norm": 4.59445604628631, + "learning_rate": 3.8579212329669956e-06, + "loss": 0.8651, + "step": 7217 + }, + { + "epoch": 0.59, + "grad_norm": 2.9663063073640026, + "learning_rate": 3.856640691144614e-06, + "loss": 0.6132, + "step": 7218 + }, + { + "epoch": 0.59, + "grad_norm": 2.574833658107921, + "learning_rate": 3.8553602284498945e-06, + "loss": 0.8064, + "step": 7219 + }, + { + "epoch": 0.59, + "grad_norm": 3.1517735153701105, + "learning_rate": 3.854079844971456e-06, + "loss": 0.6178, + "step": 7220 + }, + { + "epoch": 0.59, + "grad_norm": 3.911390324305378, + "learning_rate": 3.852799540797906e-06, + "loss": 0.7517, + "step": 7221 + }, + { + "epoch": 0.59, + "grad_norm": 3.4241318381324013, + "learning_rate": 3.851519316017851e-06, + "loss": 0.6175, + "step": 7222 + }, + { + "epoch": 0.59, + "grad_norm": 3.105643683430784, + "learning_rate": 3.850239170719891e-06, + "loss": 0.6606, + "step": 7223 + }, + { + "epoch": 0.59, + "grad_norm": 2.644108443360923, + "learning_rate": 3.848959104992619e-06, + "loss": 0.6611, + "step": 7224 + }, + { + "epoch": 0.59, + "grad_norm": 6.780798108946891, + "learning_rate": 3.847679118924627e-06, + "loss": 0.7851, + "step": 7225 + }, + { + "epoch": 0.59, + "grad_norm": 4.11567693269245, + "learning_rate": 3.846399212604495e-06, + "loss": 0.6335, + "step": 7226 + }, + { + "epoch": 0.59, + "grad_norm": 2.954699984067186, + "learning_rate": 3.845119386120801e-06, + "loss": 0.5931, + "step": 7227 + }, + { + "epoch": 0.59, + "grad_norm": 3.084396318031787, + "learning_rate": 3.8438396395621155e-06, + "loss": 0.7061, + "step": 7228 + }, + { + "epoch": 0.59, + "grad_norm": 3.505458754041068, + "learning_rate": 3.842559973017007e-06, + "loss": 0.6277, + "step": 7229 + }, + { + "epoch": 0.59, + "grad_norm": 2.6835424893423943, + "learning_rate": 3.841280386574037e-06, + "loss": 0.5841, + "step": 7230 + }, + { + "epoch": 0.59, + "grad_norm": 7.0675334694293435, + "learning_rate": 3.84000088032176e-06, + "loss": 0.6593, + "step": 7231 + }, + { + "epoch": 0.59, + "grad_norm": 2.32267646422886, + "learning_rate": 3.838721454348726e-06, + "loss": 0.6421, + "step": 7232 + }, + { + "epoch": 0.59, + "grad_norm": 4.559178333668757, + "learning_rate": 3.837442108743481e-06, + "loss": 0.6663, + "step": 7233 + }, + { + "epoch": 0.59, + "grad_norm": 3.6079087860769774, + "learning_rate": 3.836162843594561e-06, + "loss": 0.4741, + "step": 7234 + }, + { + "epoch": 0.59, + "grad_norm": 2.9282600916634998, + "learning_rate": 3.8348836589905e-06, + "loss": 0.6938, + "step": 7235 + }, + { + "epoch": 0.59, + "grad_norm": 4.919553372839366, + "learning_rate": 3.833604555019826e-06, + "loss": 0.7656, + "step": 7236 + }, + { + "epoch": 0.59, + "grad_norm": 2.7995397734226843, + "learning_rate": 3.832325531771061e-06, + "loss": 0.703, + "step": 7237 + }, + { + "epoch": 0.59, + "grad_norm": 5.496908886308545, + "learning_rate": 3.831046589332721e-06, + "loss": 0.6902, + "step": 7238 + }, + { + "epoch": 0.59, + "grad_norm": 7.39964249065611, + "learning_rate": 3.82976772779332e-06, + "loss": 0.6733, + "step": 7239 + }, + { + "epoch": 0.59, + "grad_norm": 2.1525700114017003, + "learning_rate": 3.8284889472413575e-06, + "loss": 0.6014, + "step": 7240 + }, + { + "epoch": 0.59, + "grad_norm": 4.537658975154655, + "learning_rate": 3.8272102477653374e-06, + "loss": 0.6228, + "step": 7241 + }, + { + "epoch": 0.59, + "grad_norm": 3.8141789154662664, + "learning_rate": 3.825931629453752e-06, + "loss": 0.699, + "step": 7242 + }, + { + "epoch": 0.59, + "grad_norm": 3.5797493848278577, + "learning_rate": 3.824653092395091e-06, + "loss": 0.7611, + "step": 7243 + }, + { + "epoch": 0.59, + "grad_norm": 30.21051244384909, + "learning_rate": 3.823374636677837e-06, + "loss": 0.8074, + "step": 7244 + }, + { + "epoch": 0.59, + "grad_norm": 5.938529816369747, + "learning_rate": 3.822096262390466e-06, + "loss": 0.7849, + "step": 7245 + }, + { + "epoch": 0.59, + "grad_norm": 12.104901020428983, + "learning_rate": 3.820817969621452e-06, + "loss": 0.642, + "step": 7246 + }, + { + "epoch": 0.59, + "grad_norm": 6.208213911645407, + "learning_rate": 3.819539758459258e-06, + "loss": 0.7094, + "step": 7247 + }, + { + "epoch": 0.59, + "grad_norm": 2.9462731893274317, + "learning_rate": 3.8182616289923445e-06, + "loss": 0.7118, + "step": 7248 + }, + { + "epoch": 0.59, + "grad_norm": 5.769709331321691, + "learning_rate": 3.8169835813091675e-06, + "loss": 0.6705, + "step": 7249 + }, + { + "epoch": 0.59, + "grad_norm": 5.392203288692865, + "learning_rate": 3.815705615498177e-06, + "loss": 0.6834, + "step": 7250 + }, + { + "epoch": 0.59, + "grad_norm": 3.0020658023445153, + "learning_rate": 3.8144277316478135e-06, + "loss": 0.6077, + "step": 7251 + }, + { + "epoch": 0.59, + "grad_norm": 10.042213970577976, + "learning_rate": 3.813149929846516e-06, + "loss": 0.6952, + "step": 7252 + }, + { + "epoch": 0.59, + "grad_norm": 10.455287717037274, + "learning_rate": 3.8118722101827186e-06, + "loss": 0.6124, + "step": 7253 + }, + { + "epoch": 0.59, + "grad_norm": 2.5436967829007293, + "learning_rate": 3.810594572744843e-06, + "loss": 0.7633, + "step": 7254 + }, + { + "epoch": 0.59, + "grad_norm": 6.695256271579665, + "learning_rate": 3.8093170176213125e-06, + "loss": 0.5679, + "step": 7255 + }, + { + "epoch": 0.59, + "grad_norm": 2.675130500984532, + "learning_rate": 3.808039544900541e-06, + "loss": 0.6674, + "step": 7256 + }, + { + "epoch": 0.59, + "grad_norm": 3.2151662665051806, + "learning_rate": 3.806762154670938e-06, + "loss": 0.5473, + "step": 7257 + }, + { + "epoch": 0.59, + "grad_norm": 1.9915570694439937, + "learning_rate": 3.8054848470209094e-06, + "loss": 0.6101, + "step": 7258 + }, + { + "epoch": 0.59, + "grad_norm": 2.8705539405942555, + "learning_rate": 3.8042076220388494e-06, + "loss": 0.7492, + "step": 7259 + }, + { + "epoch": 0.59, + "grad_norm": 3.380885745975379, + "learning_rate": 3.8029304798131522e-06, + "loss": 0.5438, + "step": 7260 + }, + { + "epoch": 0.59, + "grad_norm": 3.148770564674383, + "learning_rate": 3.8016534204322015e-06, + "loss": 0.6225, + "step": 7261 + }, + { + "epoch": 0.59, + "grad_norm": 17.67362039582867, + "learning_rate": 3.80037644398438e-06, + "loss": 0.7057, + "step": 7262 + }, + { + "epoch": 0.59, + "grad_norm": 2.199823629081299, + "learning_rate": 3.7990995505580613e-06, + "loss": 0.7123, + "step": 7263 + }, + { + "epoch": 0.59, + "grad_norm": 2.6642799848007668, + "learning_rate": 3.7978227402416155e-06, + "loss": 0.405, + "step": 7264 + }, + { + "epoch": 0.59, + "grad_norm": 2.552145961684152, + "learning_rate": 3.796546013123407e-06, + "loss": 0.5678, + "step": 7265 + }, + { + "epoch": 0.59, + "grad_norm": 4.454298144343011, + "learning_rate": 3.795269369291792e-06, + "loss": 0.54, + "step": 7266 + }, + { + "epoch": 0.59, + "grad_norm": 3.455051219378804, + "learning_rate": 3.793992808835121e-06, + "loss": 0.7836, + "step": 7267 + }, + { + "epoch": 0.59, + "grad_norm": 3.000137984062915, + "learning_rate": 3.7927163318417426e-06, + "loss": 0.6297, + "step": 7268 + }, + { + "epoch": 0.59, + "grad_norm": 9.05652693384451, + "learning_rate": 3.791439938399994e-06, + "loss": 0.8266, + "step": 7269 + }, + { + "epoch": 0.59, + "grad_norm": 21.664017230343216, + "learning_rate": 3.790163628598212e-06, + "loss": 0.6272, + "step": 7270 + }, + { + "epoch": 0.59, + "grad_norm": 5.460426999278598, + "learning_rate": 3.7888874025247243e-06, + "loss": 0.5154, + "step": 7271 + }, + { + "epoch": 0.59, + "grad_norm": 6.362063070585342, + "learning_rate": 3.7876112602678544e-06, + "loss": 0.5735, + "step": 7272 + }, + { + "epoch": 0.59, + "grad_norm": 2.395111705112645, + "learning_rate": 3.786335201915921e-06, + "loss": 0.7249, + "step": 7273 + }, + { + "epoch": 0.59, + "grad_norm": 2.3274706668652643, + "learning_rate": 3.7850592275572316e-06, + "loss": 0.5885, + "step": 7274 + }, + { + "epoch": 0.59, + "grad_norm": 3.158507784404962, + "learning_rate": 3.783783337280094e-06, + "loss": 0.8045, + "step": 7275 + }, + { + "epoch": 0.59, + "grad_norm": 4.020353364995064, + "learning_rate": 3.782507531172807e-06, + "loss": 0.5935, + "step": 7276 + }, + { + "epoch": 0.59, + "grad_norm": 4.5596513661815194, + "learning_rate": 3.781231809323665e-06, + "loss": 0.7314, + "step": 7277 + }, + { + "epoch": 0.59, + "grad_norm": 4.102031344115807, + "learning_rate": 3.7799561718209555e-06, + "loss": 0.7554, + "step": 7278 + }, + { + "epoch": 0.59, + "grad_norm": 2.319310150103513, + "learning_rate": 3.778680618752963e-06, + "loss": 0.6662, + "step": 7279 + }, + { + "epoch": 0.59, + "grad_norm": 3.969363160057141, + "learning_rate": 3.7774051502079596e-06, + "loss": 0.5514, + "step": 7280 + }, + { + "epoch": 0.59, + "grad_norm": 2.8240704270365495, + "learning_rate": 3.776129766274218e-06, + "loss": 0.7703, + "step": 7281 + }, + { + "epoch": 0.59, + "grad_norm": 2.9126984781048404, + "learning_rate": 3.774854467040002e-06, + "loss": 0.6712, + "step": 7282 + }, + { + "epoch": 0.59, + "grad_norm": 4.7635942971269305, + "learning_rate": 3.7735792525935735e-06, + "loss": 0.6436, + "step": 7283 + }, + { + "epoch": 0.59, + "grad_norm": 5.7172518320405015, + "learning_rate": 3.7723041230231804e-06, + "loss": 0.5938, + "step": 7284 + }, + { + "epoch": 0.59, + "grad_norm": 5.188486848638251, + "learning_rate": 3.7710290784170733e-06, + "loss": 0.6635, + "step": 7285 + }, + { + "epoch": 0.59, + "grad_norm": 3.1391049735641894, + "learning_rate": 3.7697541188634934e-06, + "loss": 0.5841, + "step": 7286 + }, + { + "epoch": 0.59, + "grad_norm": 3.442731107041563, + "learning_rate": 3.7684792444506733e-06, + "loss": 0.778, + "step": 7287 + }, + { + "epoch": 0.59, + "grad_norm": 3.1025109357717615, + "learning_rate": 3.7672044552668436e-06, + "loss": 0.784, + "step": 7288 + }, + { + "epoch": 0.59, + "grad_norm": 2.4284228131984222, + "learning_rate": 3.765929751400228e-06, + "loss": 0.6904, + "step": 7289 + }, + { + "epoch": 0.59, + "grad_norm": 3.1258211256239545, + "learning_rate": 3.7646551329390445e-06, + "loss": 0.8809, + "step": 7290 + }, + { + "epoch": 0.59, + "grad_norm": 3.3436508090189148, + "learning_rate": 3.763380599971504e-06, + "loss": 0.6396, + "step": 7291 + }, + { + "epoch": 0.59, + "grad_norm": 2.8392529089421448, + "learning_rate": 3.762106152585813e-06, + "loss": 0.5705, + "step": 7292 + }, + { + "epoch": 0.59, + "grad_norm": 7.916368871920782, + "learning_rate": 3.760831790870171e-06, + "loss": 0.736, + "step": 7293 + }, + { + "epoch": 0.59, + "grad_norm": 2.7444956103580873, + "learning_rate": 3.7595575149127693e-06, + "loss": 0.6116, + "step": 7294 + }, + { + "epoch": 0.59, + "grad_norm": 3.056678484798001, + "learning_rate": 3.758283324801799e-06, + "loss": 0.7477, + "step": 7295 + }, + { + "epoch": 0.59, + "grad_norm": 5.079672154811444, + "learning_rate": 3.757009220625441e-06, + "loss": 0.7192, + "step": 7296 + }, + { + "epoch": 0.59, + "grad_norm": 4.476492311547803, + "learning_rate": 3.7557352024718718e-06, + "loss": 0.7163, + "step": 7297 + }, + { + "epoch": 0.59, + "grad_norm": 3.0904931844461085, + "learning_rate": 3.7544612704292616e-06, + "loss": 0.7768, + "step": 7298 + }, + { + "epoch": 0.59, + "grad_norm": 5.4284757805782755, + "learning_rate": 3.753187424585774e-06, + "loss": 0.6644, + "step": 7299 + }, + { + "epoch": 0.59, + "grad_norm": 3.9688884010972734, + "learning_rate": 3.7519136650295673e-06, + "loss": 0.7391, + "step": 7300 + }, + { + "epoch": 0.59, + "grad_norm": 4.02451487833557, + "learning_rate": 3.7506399918487927e-06, + "loss": 0.6577, + "step": 7301 + }, + { + "epoch": 0.59, + "grad_norm": 4.743075690605672, + "learning_rate": 3.7493664051315976e-06, + "loss": 0.6018, + "step": 7302 + }, + { + "epoch": 0.59, + "grad_norm": 4.538296913182985, + "learning_rate": 3.748092904966122e-06, + "loss": 0.7299, + "step": 7303 + }, + { + "epoch": 0.59, + "grad_norm": 6.940023903922378, + "learning_rate": 3.7468194914404986e-06, + "loss": 0.5886, + "step": 7304 + }, + { + "epoch": 0.59, + "grad_norm": 2.418059115404444, + "learning_rate": 3.745546164642859e-06, + "loss": 0.6898, + "step": 7305 + }, + { + "epoch": 0.59, + "grad_norm": 7.042844801926572, + "learning_rate": 3.7442729246613243e-06, + "loss": 0.7128, + "step": 7306 + }, + { + "epoch": 0.59, + "grad_norm": 2.6568260975861397, + "learning_rate": 3.742999771584008e-06, + "loss": 0.5964, + "step": 7307 + }, + { + "epoch": 0.59, + "grad_norm": 4.604082433565104, + "learning_rate": 3.7417267054990234e-06, + "loss": 0.7636, + "step": 7308 + }, + { + "epoch": 0.59, + "grad_norm": 2.720544242532811, + "learning_rate": 3.740453726494473e-06, + "loss": 0.6294, + "step": 7309 + }, + { + "epoch": 0.59, + "grad_norm": 4.077363071731181, + "learning_rate": 3.7391808346584545e-06, + "loss": 0.5218, + "step": 7310 + }, + { + "epoch": 0.59, + "grad_norm": 2.2079242017602843, + "learning_rate": 3.7379080300790616e-06, + "loss": 0.8184, + "step": 7311 + }, + { + "epoch": 0.59, + "grad_norm": 3.851829489873319, + "learning_rate": 3.7366353128443823e-06, + "loss": 0.6807, + "step": 7312 + }, + { + "epoch": 0.59, + "grad_norm": 1.8707867851801352, + "learning_rate": 3.7353626830424915e-06, + "loss": 0.7754, + "step": 7313 + }, + { + "epoch": 0.59, + "grad_norm": 6.262686183065627, + "learning_rate": 3.734090140761466e-06, + "loss": 0.7534, + "step": 7314 + }, + { + "epoch": 0.59, + "grad_norm": 3.222533421776613, + "learning_rate": 3.7328176860893743e-06, + "loss": 0.835, + "step": 7315 + }, + { + "epoch": 0.59, + "grad_norm": 3.349002082563296, + "learning_rate": 3.731545319114277e-06, + "loss": 0.5618, + "step": 7316 + }, + { + "epoch": 0.59, + "grad_norm": 3.23384930348398, + "learning_rate": 3.7302730399242305e-06, + "loss": 0.719, + "step": 7317 + }, + { + "epoch": 0.59, + "grad_norm": 2.740611501071095, + "learning_rate": 3.7290008486072836e-06, + "loss": 0.7509, + "step": 7318 + }, + { + "epoch": 0.59, + "grad_norm": 6.938051999744724, + "learning_rate": 3.7277287452514844e-06, + "loss": 0.7121, + "step": 7319 + }, + { + "epoch": 0.59, + "grad_norm": 1.9062652497507258, + "learning_rate": 3.726456729944864e-06, + "loss": 0.4997, + "step": 7320 + }, + { + "epoch": 0.59, + "grad_norm": 4.797029651708164, + "learning_rate": 3.7251848027754566e-06, + "loss": 0.7725, + "step": 7321 + }, + { + "epoch": 0.59, + "grad_norm": 4.308471130314857, + "learning_rate": 3.7239129638312876e-06, + "loss": 0.5171, + "step": 7322 + }, + { + "epoch": 0.59, + "grad_norm": 2.8394518292624578, + "learning_rate": 3.7226412132003775e-06, + "loss": 0.6554, + "step": 7323 + }, + { + "epoch": 0.59, + "grad_norm": 3.915612819876549, + "learning_rate": 3.7213695509707382e-06, + "loss": 0.6871, + "step": 7324 + }, + { + "epoch": 0.59, + "grad_norm": 2.8477706457359475, + "learning_rate": 3.720097977230376e-06, + "loss": 0.6479, + "step": 7325 + }, + { + "epoch": 0.6, + "grad_norm": 15.094926483075213, + "learning_rate": 3.7188264920672958e-06, + "loss": 0.6751, + "step": 7326 + }, + { + "epoch": 0.6, + "grad_norm": 3.40302766700636, + "learning_rate": 3.717555095569486e-06, + "loss": 0.5662, + "step": 7327 + }, + { + "epoch": 0.6, + "grad_norm": 3.482315615312778, + "learning_rate": 3.716283787824939e-06, + "loss": 0.6228, + "step": 7328 + }, + { + "epoch": 0.6, + "grad_norm": 3.0254614307954033, + "learning_rate": 3.7150125689216365e-06, + "loss": 0.7599, + "step": 7329 + }, + { + "epoch": 0.6, + "grad_norm": 3.715720435265195, + "learning_rate": 3.7137414389475566e-06, + "loss": 0.7637, + "step": 7330 + }, + { + "epoch": 0.6, + "grad_norm": 3.9743786472464704, + "learning_rate": 3.7124703979906674e-06, + "loss": 0.8342, + "step": 7331 + }, + { + "epoch": 0.6, + "grad_norm": 2.7100283616072436, + "learning_rate": 3.7111994461389346e-06, + "loss": 0.7956, + "step": 7332 + }, + { + "epoch": 0.6, + "grad_norm": 3.784783770059734, + "learning_rate": 3.7099285834803146e-06, + "loss": 0.6263, + "step": 7333 + }, + { + "epoch": 0.6, + "grad_norm": 2.7325968038777404, + "learning_rate": 3.708657810102759e-06, + "loss": 0.7586, + "step": 7334 + }, + { + "epoch": 0.6, + "grad_norm": 2.8368295303232087, + "learning_rate": 3.707387126094213e-06, + "loss": 0.7055, + "step": 7335 + }, + { + "epoch": 0.6, + "grad_norm": 3.77220291521277, + "learning_rate": 3.7061165315426173e-06, + "loss": 0.6782, + "step": 7336 + }, + { + "epoch": 0.6, + "grad_norm": 11.999899633740377, + "learning_rate": 3.7048460265359054e-06, + "loss": 0.7718, + "step": 7337 + }, + { + "epoch": 0.6, + "grad_norm": 2.8780581353734926, + "learning_rate": 3.7035756111620037e-06, + "loss": 0.5941, + "step": 7338 + }, + { + "epoch": 0.6, + "grad_norm": 3.501336089963944, + "learning_rate": 3.7023052855088327e-06, + "loss": 0.7938, + "step": 7339 + }, + { + "epoch": 0.6, + "grad_norm": 21.701738445421, + "learning_rate": 3.7010350496643065e-06, + "loss": 0.7309, + "step": 7340 + }, + { + "epoch": 0.6, + "grad_norm": 3.4981375924591074, + "learning_rate": 3.6997649037163336e-06, + "loss": 0.6602, + "step": 7341 + }, + { + "epoch": 0.6, + "grad_norm": 2.8239392248433193, + "learning_rate": 3.698494847752816e-06, + "loss": 0.6981, + "step": 7342 + }, + { + "epoch": 0.6, + "grad_norm": 5.686085407662347, + "learning_rate": 3.6972248818616497e-06, + "loss": 0.6483, + "step": 7343 + }, + { + "epoch": 0.6, + "grad_norm": 2.2970129393030274, + "learning_rate": 3.6959550061307246e-06, + "loss": 0.6969, + "step": 7344 + }, + { + "epoch": 0.6, + "grad_norm": 2.8577774368735596, + "learning_rate": 3.6946852206479244e-06, + "loss": 0.8203, + "step": 7345 + }, + { + "epoch": 0.6, + "grad_norm": 2.998849811732684, + "learning_rate": 3.693415525501128e-06, + "loss": 0.776, + "step": 7346 + }, + { + "epoch": 0.6, + "grad_norm": 2.9829314542238907, + "learning_rate": 3.6921459207782017e-06, + "loss": 0.5407, + "step": 7347 + }, + { + "epoch": 0.6, + "grad_norm": 3.2892755123158492, + "learning_rate": 3.6908764065670134e-06, + "loss": 0.768, + "step": 7348 + }, + { + "epoch": 0.6, + "grad_norm": 7.563693044579954, + "learning_rate": 3.6896069829554205e-06, + "loss": 0.6022, + "step": 7349 + }, + { + "epoch": 0.6, + "grad_norm": 4.066374372784264, + "learning_rate": 3.688337650031274e-06, + "loss": 0.6213, + "step": 7350 + }, + { + "epoch": 0.6, + "grad_norm": 7.35627071548663, + "learning_rate": 3.687068407882422e-06, + "loss": 0.636, + "step": 7351 + }, + { + "epoch": 0.6, + "grad_norm": 3.3694629288355444, + "learning_rate": 3.685799256596705e-06, + "loss": 0.7368, + "step": 7352 + }, + { + "epoch": 0.6, + "grad_norm": 2.81326833222612, + "learning_rate": 3.6845301962619525e-06, + "loss": 0.7889, + "step": 7353 + }, + { + "epoch": 0.6, + "grad_norm": 3.068086801096515, + "learning_rate": 3.683261226965993e-06, + "loss": 0.6114, + "step": 7354 + }, + { + "epoch": 0.6, + "grad_norm": 3.050866778184678, + "learning_rate": 3.681992348796648e-06, + "loss": 0.6424, + "step": 7355 + }, + { + "epoch": 0.6, + "grad_norm": 5.353583618659161, + "learning_rate": 3.6807235618417314e-06, + "loss": 0.7037, + "step": 7356 + }, + { + "epoch": 0.6, + "grad_norm": 3.51547153777052, + "learning_rate": 3.6794548661890506e-06, + "loss": 0.6838, + "step": 7357 + }, + { + "epoch": 0.6, + "grad_norm": 5.218758206161168, + "learning_rate": 3.6781862619264074e-06, + "loss": 0.6982, + "step": 7358 + }, + { + "epoch": 0.6, + "grad_norm": 2.810365870865711, + "learning_rate": 3.6769177491416004e-06, + "loss": 0.8776, + "step": 7359 + }, + { + "epoch": 0.6, + "grad_norm": 2.73536081318481, + "learning_rate": 3.6756493279224137e-06, + "loss": 0.5942, + "step": 7360 + }, + { + "epoch": 0.6, + "grad_norm": 2.312571897085781, + "learning_rate": 3.6743809983566324e-06, + "loss": 0.6155, + "step": 7361 + }, + { + "epoch": 0.6, + "grad_norm": 1.7280209460157567, + "learning_rate": 3.6731127605320326e-06, + "loss": 0.5422, + "step": 7362 + }, + { + "epoch": 0.6, + "grad_norm": 2.2483955327525234, + "learning_rate": 3.6718446145363857e-06, + "loss": 0.7942, + "step": 7363 + }, + { + "epoch": 0.6, + "grad_norm": 4.87319719793674, + "learning_rate": 3.6705765604574534e-06, + "loss": 0.698, + "step": 7364 + }, + { + "epoch": 0.6, + "grad_norm": 2.6660059292934863, + "learning_rate": 3.6693085983829955e-06, + "loss": 0.5085, + "step": 7365 + }, + { + "epoch": 0.6, + "grad_norm": 5.5764041209249555, + "learning_rate": 3.6680407284007595e-06, + "loss": 0.6, + "step": 7366 + }, + { + "epoch": 0.6, + "grad_norm": 2.819184106941003, + "learning_rate": 3.6667729505984916e-06, + "loss": 0.6404, + "step": 7367 + }, + { + "epoch": 0.6, + "grad_norm": 2.7016607748541444, + "learning_rate": 3.6655052650639313e-06, + "loss": 0.6461, + "step": 7368 + }, + { + "epoch": 0.6, + "grad_norm": 4.7830564835462495, + "learning_rate": 3.6642376718848076e-06, + "loss": 0.6721, + "step": 7369 + }, + { + "epoch": 0.6, + "grad_norm": 3.4867206424649875, + "learning_rate": 3.6629701711488485e-06, + "loss": 0.6357, + "step": 7370 + }, + { + "epoch": 0.6, + "grad_norm": 3.853191352058882, + "learning_rate": 3.6617027629437735e-06, + "loss": 0.7528, + "step": 7371 + }, + { + "epoch": 0.6, + "grad_norm": 2.624544850960897, + "learning_rate": 3.6604354473572934e-06, + "loss": 0.6265, + "step": 7372 + }, + { + "epoch": 0.6, + "grad_norm": 3.707890966403876, + "learning_rate": 3.6591682244771154e-06, + "loss": 0.6072, + "step": 7373 + }, + { + "epoch": 0.6, + "grad_norm": 2.316784138481489, + "learning_rate": 3.6579010943909376e-06, + "loss": 0.5535, + "step": 7374 + }, + { + "epoch": 0.6, + "grad_norm": 5.2933425411726684, + "learning_rate": 3.6566340571864544e-06, + "loss": 0.711, + "step": 7375 + }, + { + "epoch": 0.6, + "grad_norm": 12.070246655012982, + "learning_rate": 3.6553671129513534e-06, + "loss": 0.6061, + "step": 7376 + }, + { + "epoch": 0.6, + "grad_norm": 5.951990262528006, + "learning_rate": 3.6541002617733147e-06, + "loss": 0.5611, + "step": 7377 + }, + { + "epoch": 0.6, + "grad_norm": 4.698397698313504, + "learning_rate": 3.652833503740013e-06, + "loss": 0.5494, + "step": 7378 + }, + { + "epoch": 0.6, + "grad_norm": 3.036075843767054, + "learning_rate": 3.6515668389391157e-06, + "loss": 0.5877, + "step": 7379 + }, + { + "epoch": 0.6, + "grad_norm": 2.922370702053709, + "learning_rate": 3.6503002674582823e-06, + "loss": 0.587, + "step": 7380 + }, + { + "epoch": 0.6, + "grad_norm": 2.3480338085188914, + "learning_rate": 3.64903378938517e-06, + "loss": 0.6602, + "step": 7381 + }, + { + "epoch": 0.6, + "grad_norm": 5.253862737099655, + "learning_rate": 3.647767404807424e-06, + "loss": 0.627, + "step": 7382 + }, + { + "epoch": 0.6, + "grad_norm": 2.8909625112504473, + "learning_rate": 3.6465011138126894e-06, + "loss": 0.7586, + "step": 7383 + }, + { + "epoch": 0.6, + "grad_norm": 5.367757252075153, + "learning_rate": 3.645234916488599e-06, + "loss": 0.7268, + "step": 7384 + }, + { + "epoch": 0.6, + "grad_norm": 115.07498013614116, + "learning_rate": 3.6439688129227853e-06, + "loss": 0.7857, + "step": 7385 + }, + { + "epoch": 0.6, + "grad_norm": 3.0428660295695837, + "learning_rate": 3.6427028032028656e-06, + "loss": 0.6272, + "step": 7386 + }, + { + "epoch": 0.6, + "grad_norm": 2.891589588081227, + "learning_rate": 3.6414368874164586e-06, + "loss": 0.6837, + "step": 7387 + }, + { + "epoch": 0.6, + "grad_norm": 2.979096354638379, + "learning_rate": 3.6401710656511734e-06, + "loss": 0.5668, + "step": 7388 + }, + { + "epoch": 0.6, + "grad_norm": 3.71001924930594, + "learning_rate": 3.638905337994612e-06, + "loss": 0.6534, + "step": 7389 + }, + { + "epoch": 0.6, + "grad_norm": 3.7682537974195434, + "learning_rate": 3.6376397045343716e-06, + "loss": 0.6284, + "step": 7390 + }, + { + "epoch": 0.6, + "grad_norm": 3.923526506784474, + "learning_rate": 3.636374165358042e-06, + "loss": 0.7152, + "step": 7391 + }, + { + "epoch": 0.6, + "grad_norm": 4.332754426108979, + "learning_rate": 3.635108720553208e-06, + "loss": 0.6212, + "step": 7392 + }, + { + "epoch": 0.6, + "grad_norm": 2.9713923597123992, + "learning_rate": 3.633843370207443e-06, + "loss": 0.8295, + "step": 7393 + }, + { + "epoch": 0.6, + "grad_norm": 17.769906774633828, + "learning_rate": 3.632578114408318e-06, + "loss": 0.6423, + "step": 7394 + }, + { + "epoch": 0.6, + "grad_norm": 5.563480011177433, + "learning_rate": 3.6313129532433976e-06, + "loss": 0.6142, + "step": 7395 + }, + { + "epoch": 0.6, + "grad_norm": 2.7471018418412414, + "learning_rate": 3.6300478868002397e-06, + "loss": 0.7332, + "step": 7396 + }, + { + "epoch": 0.6, + "grad_norm": 2.960938746782647, + "learning_rate": 3.6287829151663935e-06, + "loss": 0.6241, + "step": 7397 + }, + { + "epoch": 0.6, + "grad_norm": 3.3244601287712663, + "learning_rate": 3.6275180384294033e-06, + "loss": 0.6261, + "step": 7398 + }, + { + "epoch": 0.6, + "grad_norm": 3.04758375350109, + "learning_rate": 3.6262532566768087e-06, + "loss": 0.6003, + "step": 7399 + }, + { + "epoch": 0.6, + "grad_norm": 4.148754541828166, + "learning_rate": 3.624988569996137e-06, + "loss": 0.6409, + "step": 7400 + }, + { + "epoch": 0.6, + "grad_norm": 5.220521118319865, + "learning_rate": 3.6237239784749132e-06, + "loss": 0.6581, + "step": 7401 + }, + { + "epoch": 0.6, + "grad_norm": 5.161334001062796, + "learning_rate": 3.6224594822006564e-06, + "loss": 0.7151, + "step": 7402 + }, + { + "epoch": 0.6, + "grad_norm": 14.677947169225197, + "learning_rate": 3.6211950812608777e-06, + "loss": 0.8255, + "step": 7403 + }, + { + "epoch": 0.6, + "grad_norm": 3.1644686056753066, + "learning_rate": 3.6199307757430806e-06, + "loss": 0.4962, + "step": 7404 + }, + { + "epoch": 0.6, + "grad_norm": 3.5720963507070422, + "learning_rate": 3.618666565734764e-06, + "loss": 0.7252, + "step": 7405 + }, + { + "epoch": 0.6, + "grad_norm": 12.785658881099442, + "learning_rate": 3.617402451323419e-06, + "loss": 0.6313, + "step": 7406 + }, + { + "epoch": 0.6, + "grad_norm": 3.636281569236962, + "learning_rate": 3.616138432596529e-06, + "loss": 0.7256, + "step": 7407 + }, + { + "epoch": 0.6, + "grad_norm": 3.482448966358987, + "learning_rate": 3.614874509641573e-06, + "loss": 0.7139, + "step": 7408 + }, + { + "epoch": 0.6, + "grad_norm": 2.913919190770498, + "learning_rate": 3.6136106825460216e-06, + "loss": 0.6947, + "step": 7409 + }, + { + "epoch": 0.6, + "grad_norm": 3.9653355441426075, + "learning_rate": 3.612346951397341e-06, + "loss": 0.6649, + "step": 7410 + }, + { + "epoch": 0.6, + "grad_norm": 3.5684972375213584, + "learning_rate": 3.6110833162829896e-06, + "loss": 0.8236, + "step": 7411 + }, + { + "epoch": 0.6, + "grad_norm": 13.866876925362012, + "learning_rate": 3.609819777290418e-06, + "loss": 0.6926, + "step": 7412 + }, + { + "epoch": 0.6, + "grad_norm": 2.9908740190233245, + "learning_rate": 3.608556334507072e-06, + "loss": 0.7981, + "step": 7413 + }, + { + "epoch": 0.6, + "grad_norm": 3.4929840280381725, + "learning_rate": 3.6072929880203865e-06, + "loss": 0.7569, + "step": 7414 + }, + { + "epoch": 0.6, + "grad_norm": 3.7318662072481046, + "learning_rate": 3.6060297379177963e-06, + "loss": 0.7681, + "step": 7415 + }, + { + "epoch": 0.6, + "grad_norm": 3.663281620344524, + "learning_rate": 3.6047665842867254e-06, + "loss": 0.7876, + "step": 7416 + }, + { + "epoch": 0.6, + "grad_norm": 2.844513174451792, + "learning_rate": 3.6035035272145912e-06, + "loss": 0.5963, + "step": 7417 + }, + { + "epoch": 0.6, + "grad_norm": 3.099551154949786, + "learning_rate": 3.6022405667888087e-06, + "loss": 0.6513, + "step": 7418 + }, + { + "epoch": 0.6, + "grad_norm": 3.5039842140014597, + "learning_rate": 3.6009777030967778e-06, + "loss": 0.7656, + "step": 7419 + }, + { + "epoch": 0.6, + "grad_norm": 2.796447368831364, + "learning_rate": 3.5997149362258986e-06, + "loss": 0.7451, + "step": 7420 + }, + { + "epoch": 0.6, + "grad_norm": 2.9008332895667834, + "learning_rate": 3.5984522662635647e-06, + "loss": 0.6005, + "step": 7421 + }, + { + "epoch": 0.6, + "grad_norm": 4.292150120438743, + "learning_rate": 3.597189693297157e-06, + "loss": 0.8122, + "step": 7422 + }, + { + "epoch": 0.6, + "grad_norm": 5.4528149633907175, + "learning_rate": 3.5959272174140556e-06, + "loss": 0.7191, + "step": 7423 + }, + { + "epoch": 0.6, + "grad_norm": 3.507289227802753, + "learning_rate": 3.5946648387016315e-06, + "loss": 0.7637, + "step": 7424 + }, + { + "epoch": 0.6, + "grad_norm": 9.9852168813264, + "learning_rate": 3.5934025572472507e-06, + "loss": 0.6046, + "step": 7425 + }, + { + "epoch": 0.6, + "grad_norm": 3.4736471397379516, + "learning_rate": 3.5921403731382685e-06, + "loss": 0.5477, + "step": 7426 + }, + { + "epoch": 0.6, + "grad_norm": 3.2894486857299805, + "learning_rate": 3.5908782864620366e-06, + "loss": 0.5862, + "step": 7427 + }, + { + "epoch": 0.6, + "grad_norm": 8.339181316810304, + "learning_rate": 3.5896162973059013e-06, + "loss": 0.6038, + "step": 7428 + }, + { + "epoch": 0.6, + "grad_norm": 3.4399392977424483, + "learning_rate": 3.5883544057571974e-06, + "loss": 0.7104, + "step": 7429 + }, + { + "epoch": 0.6, + "grad_norm": 2.799165767067909, + "learning_rate": 3.5870926119032568e-06, + "loss": 0.6365, + "step": 7430 + }, + { + "epoch": 0.6, + "grad_norm": 4.617079430169025, + "learning_rate": 3.5858309158314044e-06, + "loss": 0.7092, + "step": 7431 + }, + { + "epoch": 0.6, + "grad_norm": 4.5219995765391365, + "learning_rate": 3.5845693176289587e-06, + "loss": 0.6852, + "step": 7432 + }, + { + "epoch": 0.6, + "grad_norm": 4.817399633909096, + "learning_rate": 3.583307817383226e-06, + "loss": 0.7077, + "step": 7433 + }, + { + "epoch": 0.6, + "grad_norm": 4.76126893841402, + "learning_rate": 3.5820464151815133e-06, + "loss": 0.6744, + "step": 7434 + }, + { + "epoch": 0.6, + "grad_norm": 8.940478020531653, + "learning_rate": 3.5807851111111167e-06, + "loss": 0.6672, + "step": 7435 + }, + { + "epoch": 0.6, + "grad_norm": 3.1123666422739733, + "learning_rate": 3.579523905259327e-06, + "loss": 0.5721, + "step": 7436 + }, + { + "epoch": 0.6, + "grad_norm": 9.242911222196803, + "learning_rate": 3.5782627977134264e-06, + "loss": 0.5619, + "step": 7437 + }, + { + "epoch": 0.6, + "grad_norm": 2.633197309355743, + "learning_rate": 3.577001788560695e-06, + "loss": 0.6304, + "step": 7438 + }, + { + "epoch": 0.6, + "grad_norm": 7.032586923230544, + "learning_rate": 3.5757408778883972e-06, + "loss": 0.7834, + "step": 7439 + }, + { + "epoch": 0.6, + "grad_norm": 2.6920840802343577, + "learning_rate": 3.5744800657837984e-06, + "loss": 0.6377, + "step": 7440 + }, + { + "epoch": 0.6, + "grad_norm": 3.2239254934461985, + "learning_rate": 3.573219352334155e-06, + "loss": 0.6446, + "step": 7441 + }, + { + "epoch": 0.6, + "grad_norm": 2.4639003976234797, + "learning_rate": 3.5719587376267163e-06, + "loss": 0.7605, + "step": 7442 + }, + { + "epoch": 0.6, + "grad_norm": 3.125943728789647, + "learning_rate": 3.5706982217487252e-06, + "loss": 0.5768, + "step": 7443 + }, + { + "epoch": 0.6, + "grad_norm": 5.6578371646774634, + "learning_rate": 3.569437804787416e-06, + "loss": 0.5837, + "step": 7444 + }, + { + "epoch": 0.6, + "grad_norm": 4.491356875708425, + "learning_rate": 3.568177486830019e-06, + "loss": 0.7558, + "step": 7445 + }, + { + "epoch": 0.6, + "grad_norm": 3.9196997398654516, + "learning_rate": 3.566917267963756e-06, + "loss": 0.5761, + "step": 7446 + }, + { + "epoch": 0.6, + "grad_norm": 9.848948453498652, + "learning_rate": 3.56565714827584e-06, + "loss": 0.8136, + "step": 7447 + }, + { + "epoch": 0.6, + "grad_norm": 7.730786089156826, + "learning_rate": 3.5643971278534805e-06, + "loss": 0.8032, + "step": 7448 + }, + { + "epoch": 0.61, + "grad_norm": 6.674699654272515, + "learning_rate": 3.5631372067838798e-06, + "loss": 0.5267, + "step": 7449 + }, + { + "epoch": 0.61, + "grad_norm": 2.603941762933034, + "learning_rate": 3.561877385154231e-06, + "loss": 0.5457, + "step": 7450 + }, + { + "epoch": 0.61, + "grad_norm": 4.946074547451311, + "learning_rate": 3.560617663051724e-06, + "loss": 0.7727, + "step": 7451 + }, + { + "epoch": 0.61, + "grad_norm": 8.14094111561901, + "learning_rate": 3.5593580405635374e-06, + "loss": 0.5433, + "step": 7452 + }, + { + "epoch": 0.61, + "grad_norm": 3.033982308557601, + "learning_rate": 3.5580985177768456e-06, + "loss": 0.7063, + "step": 7453 + }, + { + "epoch": 0.61, + "grad_norm": 3.551145050639668, + "learning_rate": 3.556839094778814e-06, + "loss": 0.8079, + "step": 7454 + }, + { + "epoch": 0.61, + "grad_norm": 4.326093621977823, + "learning_rate": 3.555579771656604e-06, + "loss": 0.7129, + "step": 7455 + }, + { + "epoch": 0.61, + "grad_norm": 37.418582917242695, + "learning_rate": 3.5543205484973684e-06, + "loss": 0.6575, + "step": 7456 + }, + { + "epoch": 0.61, + "grad_norm": 5.316704214860367, + "learning_rate": 3.5530614253882546e-06, + "loss": 0.7761, + "step": 7457 + }, + { + "epoch": 0.61, + "grad_norm": 5.492745698722403, + "learning_rate": 3.5518024024164023e-06, + "loss": 0.7963, + "step": 7458 + }, + { + "epoch": 0.61, + "grad_norm": 3.4366181366781383, + "learning_rate": 3.5505434796689396e-06, + "loss": 0.7946, + "step": 7459 + }, + { + "epoch": 0.61, + "grad_norm": 5.025074032335014, + "learning_rate": 3.5492846572329952e-06, + "loss": 0.8172, + "step": 7460 + }, + { + "epoch": 0.61, + "grad_norm": 3.399991602686766, + "learning_rate": 3.5480259351956882e-06, + "loss": 0.6188, + "step": 7461 + }, + { + "epoch": 0.61, + "grad_norm": 2.556868153626207, + "learning_rate": 3.546767313644128e-06, + "loss": 0.6612, + "step": 7462 + }, + { + "epoch": 0.61, + "grad_norm": 4.451405150011575, + "learning_rate": 3.5455087926654197e-06, + "loss": 0.6265, + "step": 7463 + }, + { + "epoch": 0.61, + "grad_norm": 2.796846650522022, + "learning_rate": 3.544250372346661e-06, + "loss": 0.5943, + "step": 7464 + }, + { + "epoch": 0.61, + "grad_norm": 2.146322225123683, + "learning_rate": 3.542992052774945e-06, + "loss": 0.5916, + "step": 7465 + }, + { + "epoch": 0.61, + "grad_norm": 3.158542674518788, + "learning_rate": 3.541733834037351e-06, + "loss": 0.6333, + "step": 7466 + }, + { + "epoch": 0.61, + "grad_norm": 8.634972935042876, + "learning_rate": 3.5404757162209573e-06, + "loss": 0.7511, + "step": 7467 + }, + { + "epoch": 0.61, + "grad_norm": 3.095751474180428, + "learning_rate": 3.5392176994128357e-06, + "loss": 0.5816, + "step": 7468 + }, + { + "epoch": 0.61, + "grad_norm": 3.979623297655365, + "learning_rate": 3.537959783700046e-06, + "loss": 0.7147, + "step": 7469 + }, + { + "epoch": 0.61, + "grad_norm": 39.91399625056651, + "learning_rate": 3.536701969169644e-06, + "loss": 0.6689, + "step": 7470 + }, + { + "epoch": 0.61, + "grad_norm": 4.982232803984003, + "learning_rate": 3.5354442559086823e-06, + "loss": 0.6588, + "step": 7471 + }, + { + "epoch": 0.61, + "grad_norm": 10.61320063679203, + "learning_rate": 3.5341866440041977e-06, + "loss": 0.6637, + "step": 7472 + }, + { + "epoch": 0.61, + "grad_norm": 3.574015307026985, + "learning_rate": 3.532929133543227e-06, + "loss": 0.5225, + "step": 7473 + }, + { + "epoch": 0.61, + "grad_norm": 3.4782369508446074, + "learning_rate": 3.5316717246127973e-06, + "loss": 0.7809, + "step": 7474 + }, + { + "epoch": 0.61, + "grad_norm": 4.081763733078685, + "learning_rate": 3.5304144172999295e-06, + "loss": 0.7006, + "step": 7475 + }, + { + "epoch": 0.61, + "grad_norm": 4.707854843255855, + "learning_rate": 3.5291572116916383e-06, + "loss": 0.6645, + "step": 7476 + }, + { + "epoch": 0.61, + "grad_norm": 3.1294700332511667, + "learning_rate": 3.5279001078749285e-06, + "loss": 0.576, + "step": 7477 + }, + { + "epoch": 0.61, + "grad_norm": 10.574617868204239, + "learning_rate": 3.526643105936802e-06, + "loss": 0.6541, + "step": 7478 + }, + { + "epoch": 0.61, + "grad_norm": 3.8608913724545166, + "learning_rate": 3.5253862059642483e-06, + "loss": 0.6665, + "step": 7479 + }, + { + "epoch": 0.61, + "grad_norm": 3.104653721147114, + "learning_rate": 3.524129408044254e-06, + "loss": 0.7161, + "step": 7480 + }, + { + "epoch": 0.61, + "grad_norm": 3.649684018076883, + "learning_rate": 3.5228727122637973e-06, + "loss": 0.673, + "step": 7481 + }, + { + "epoch": 0.61, + "grad_norm": 3.247443326591562, + "learning_rate": 3.5216161187098497e-06, + "loss": 0.552, + "step": 7482 + }, + { + "epoch": 0.61, + "grad_norm": 2.8906115930719953, + "learning_rate": 3.5203596274693752e-06, + "loss": 0.5292, + "step": 7483 + }, + { + "epoch": 0.61, + "grad_norm": 4.250808900652668, + "learning_rate": 3.5191032386293315e-06, + "loss": 0.6648, + "step": 7484 + }, + { + "epoch": 0.61, + "grad_norm": 4.037753999067969, + "learning_rate": 3.517846952276669e-06, + "loss": 0.7541, + "step": 7485 + }, + { + "epoch": 0.61, + "grad_norm": 4.747744603187847, + "learning_rate": 3.5165907684983297e-06, + "loss": 0.6382, + "step": 7486 + }, + { + "epoch": 0.61, + "grad_norm": 5.429821659735972, + "learning_rate": 3.5153346873812484e-06, + "loss": 0.6565, + "step": 7487 + }, + { + "epoch": 0.61, + "grad_norm": 4.098600727721019, + "learning_rate": 3.5140787090123554e-06, + "loss": 0.6331, + "step": 7488 + }, + { + "epoch": 0.61, + "grad_norm": 3.2772646206463643, + "learning_rate": 3.512822833478571e-06, + "loss": 0.786, + "step": 7489 + }, + { + "epoch": 0.61, + "grad_norm": 3.1690652723192576, + "learning_rate": 3.5115670608668107e-06, + "loss": 0.6329, + "step": 7490 + }, + { + "epoch": 0.61, + "grad_norm": 4.901228272483044, + "learning_rate": 3.510311391263984e-06, + "loss": 0.8053, + "step": 7491 + }, + { + "epoch": 0.61, + "grad_norm": 4.521589422396018, + "learning_rate": 3.5090558247569873e-06, + "loss": 0.733, + "step": 7492 + }, + { + "epoch": 0.61, + "grad_norm": 5.383405775573046, + "learning_rate": 3.507800361432716e-06, + "loss": 0.7596, + "step": 7493 + }, + { + "epoch": 0.61, + "grad_norm": 3.510076665071607, + "learning_rate": 3.5065450013780544e-06, + "loss": 0.6236, + "step": 7494 + }, + { + "epoch": 0.61, + "grad_norm": 207.6708967768317, + "learning_rate": 3.5052897446798818e-06, + "loss": 0.5501, + "step": 7495 + }, + { + "epoch": 0.61, + "grad_norm": 5.411337803501745, + "learning_rate": 3.504034591425071e-06, + "loss": 0.6683, + "step": 7496 + }, + { + "epoch": 0.61, + "grad_norm": 2.6205665342670184, + "learning_rate": 3.502779541700485e-06, + "loss": 0.6318, + "step": 7497 + }, + { + "epoch": 0.61, + "grad_norm": 13.063560236619885, + "learning_rate": 3.501524595592985e-06, + "loss": 0.7156, + "step": 7498 + }, + { + "epoch": 0.61, + "grad_norm": 3.662077405662867, + "learning_rate": 3.5002697531894157e-06, + "loss": 0.7592, + "step": 7499 + }, + { + "epoch": 0.61, + "grad_norm": 6.1797708523514645, + "learning_rate": 3.4990150145766227e-06, + "loss": 0.6042, + "step": 7500 + }, + { + "epoch": 0.61, + "grad_norm": 5.094967996698112, + "learning_rate": 3.4977603798414427e-06, + "loss": 0.5516, + "step": 7501 + }, + { + "epoch": 0.61, + "grad_norm": 2.696605898506245, + "learning_rate": 3.4965058490707017e-06, + "loss": 0.601, + "step": 7502 + }, + { + "epoch": 0.61, + "grad_norm": 5.60726691514901, + "learning_rate": 3.4952514223512235e-06, + "loss": 0.6846, + "step": 7503 + }, + { + "epoch": 0.61, + "grad_norm": 3.582569383750347, + "learning_rate": 3.4939970997698213e-06, + "loss": 0.831, + "step": 7504 + }, + { + "epoch": 0.61, + "grad_norm": 8.124051279721181, + "learning_rate": 3.4927428814133043e-06, + "loss": 0.5354, + "step": 7505 + }, + { + "epoch": 0.61, + "grad_norm": 4.04058518958966, + "learning_rate": 3.491488767368468e-06, + "loss": 0.6631, + "step": 7506 + }, + { + "epoch": 0.61, + "grad_norm": 3.2407010948334074, + "learning_rate": 3.490234757722108e-06, + "loss": 0.8165, + "step": 7507 + }, + { + "epoch": 0.61, + "grad_norm": 4.299299475979328, + "learning_rate": 3.4889808525610085e-06, + "loss": 0.5579, + "step": 7508 + }, + { + "epoch": 0.61, + "grad_norm": 4.152458281296697, + "learning_rate": 3.4877270519719496e-06, + "loss": 0.6582, + "step": 7509 + }, + { + "epoch": 0.61, + "grad_norm": 3.2017835268883856, + "learning_rate": 3.4864733560416998e-06, + "loss": 0.7237, + "step": 7510 + }, + { + "epoch": 0.61, + "grad_norm": 2.7718636953479243, + "learning_rate": 3.485219764857025e-06, + "loss": 0.7915, + "step": 7511 + }, + { + "epoch": 0.61, + "grad_norm": 4.16678902980323, + "learning_rate": 3.483966278504679e-06, + "loss": 0.7403, + "step": 7512 + }, + { + "epoch": 0.61, + "grad_norm": 2.6574637399353565, + "learning_rate": 3.4827128970714123e-06, + "loss": 0.6491, + "step": 7513 + }, + { + "epoch": 0.61, + "grad_norm": 5.765782272221393, + "learning_rate": 3.4814596206439666e-06, + "loss": 0.6318, + "step": 7514 + }, + { + "epoch": 0.61, + "grad_norm": 3.618754662511141, + "learning_rate": 3.4802064493090765e-06, + "loss": 0.5972, + "step": 7515 + }, + { + "epoch": 0.61, + "grad_norm": 3.1053712083909537, + "learning_rate": 3.4789533831534706e-06, + "loss": 0.687, + "step": 7516 + }, + { + "epoch": 0.61, + "grad_norm": 3.593768579302839, + "learning_rate": 3.477700422263867e-06, + "loss": 0.759, + "step": 7517 + }, + { + "epoch": 0.61, + "grad_norm": 3.5757928555745937, + "learning_rate": 3.4764475667269815e-06, + "loss": 0.7198, + "step": 7518 + }, + { + "epoch": 0.61, + "grad_norm": 14.456081601410123, + "learning_rate": 3.4751948166295153e-06, + "loss": 0.7842, + "step": 7519 + }, + { + "epoch": 0.61, + "grad_norm": 3.296511440773971, + "learning_rate": 3.473942172058169e-06, + "loss": 0.671, + "step": 7520 + }, + { + "epoch": 0.61, + "grad_norm": 2.7681650615448232, + "learning_rate": 3.472689633099633e-06, + "loss": 0.7124, + "step": 7521 + }, + { + "epoch": 0.61, + "grad_norm": 3.619507762883987, + "learning_rate": 3.4714371998405903e-06, + "loss": 0.6797, + "step": 7522 + }, + { + "epoch": 0.61, + "grad_norm": 4.7647575371595785, + "learning_rate": 3.470184872367719e-06, + "loss": 0.6645, + "step": 7523 + }, + { + "epoch": 0.61, + "grad_norm": 3.465184326297442, + "learning_rate": 3.468932650767689e-06, + "loss": 0.5799, + "step": 7524 + }, + { + "epoch": 0.61, + "grad_norm": 4.5381760373826, + "learning_rate": 3.467680535127158e-06, + "loss": 0.7122, + "step": 7525 + }, + { + "epoch": 0.61, + "grad_norm": 10.240758344469585, + "learning_rate": 3.466428525532783e-06, + "loss": 0.673, + "step": 7526 + }, + { + "epoch": 0.61, + "grad_norm": 2.915393453011111, + "learning_rate": 3.465176622071209e-06, + "loss": 0.6828, + "step": 7527 + }, + { + "epoch": 0.61, + "grad_norm": 5.745266727619725, + "learning_rate": 3.463924824829077e-06, + "loss": 0.8178, + "step": 7528 + }, + { + "epoch": 0.61, + "grad_norm": 6.8201581530158775, + "learning_rate": 3.4626731338930194e-06, + "loss": 0.611, + "step": 7529 + }, + { + "epoch": 0.61, + "grad_norm": 3.457473827712386, + "learning_rate": 3.4614215493496604e-06, + "loss": 0.7095, + "step": 7530 + }, + { + "epoch": 0.61, + "grad_norm": 2.9497103153545785, + "learning_rate": 3.4601700712856202e-06, + "loss": 0.8942, + "step": 7531 + }, + { + "epoch": 0.61, + "grad_norm": 2.925772376733001, + "learning_rate": 3.458918699787504e-06, + "loss": 0.688, + "step": 7532 + }, + { + "epoch": 0.61, + "grad_norm": 5.941175101434325, + "learning_rate": 3.4576674349419178e-06, + "loss": 0.6646, + "step": 7533 + }, + { + "epoch": 0.61, + "grad_norm": 2.2634904600427452, + "learning_rate": 3.456416276835457e-06, + "loss": 0.6057, + "step": 7534 + }, + { + "epoch": 0.61, + "grad_norm": 3.2104076943050965, + "learning_rate": 3.4551652255547087e-06, + "loss": 0.4869, + "step": 7535 + }, + { + "epoch": 0.61, + "grad_norm": 3.965658145215351, + "learning_rate": 3.453914281186253e-06, + "loss": 0.7993, + "step": 7536 + }, + { + "epoch": 0.61, + "grad_norm": 3.063300630377541, + "learning_rate": 3.4526634438166643e-06, + "loss": 0.6329, + "step": 7537 + }, + { + "epoch": 0.61, + "grad_norm": 4.113619594841262, + "learning_rate": 3.4514127135325105e-06, + "loss": 0.7657, + "step": 7538 + }, + { + "epoch": 0.61, + "grad_norm": 3.0728880864938515, + "learning_rate": 3.4501620904203455e-06, + "loss": 0.6631, + "step": 7539 + }, + { + "epoch": 0.61, + "grad_norm": 3.6177796145425902, + "learning_rate": 3.448911574566722e-06, + "loss": 0.7692, + "step": 7540 + }, + { + "epoch": 0.61, + "grad_norm": 4.3696261315460045, + "learning_rate": 3.4476611660581856e-06, + "loss": 0.601, + "step": 7541 + }, + { + "epoch": 0.61, + "grad_norm": 3.469751703993662, + "learning_rate": 3.4464108649812692e-06, + "loss": 0.7128, + "step": 7542 + }, + { + "epoch": 0.61, + "grad_norm": 2.7064940915106606, + "learning_rate": 3.445160671422504e-06, + "loss": 0.8469, + "step": 7543 + }, + { + "epoch": 0.61, + "grad_norm": 3.3719725341938265, + "learning_rate": 3.4439105854684117e-06, + "loss": 0.7976, + "step": 7544 + }, + { + "epoch": 0.61, + "grad_norm": 3.050294741985281, + "learning_rate": 3.4426606072055033e-06, + "loss": 0.7369, + "step": 7545 + }, + { + "epoch": 0.61, + "grad_norm": 4.359685554400141, + "learning_rate": 3.4414107367202865e-06, + "loss": 0.5492, + "step": 7546 + }, + { + "epoch": 0.61, + "grad_norm": 2.518188602099489, + "learning_rate": 3.44016097409926e-06, + "loss": 0.657, + "step": 7547 + }, + { + "epoch": 0.61, + "grad_norm": 3.7170306957583934, + "learning_rate": 3.4389113194289158e-06, + "loss": 0.77, + "step": 7548 + }, + { + "epoch": 0.61, + "grad_norm": 3.875129275150784, + "learning_rate": 3.4376617727957396e-06, + "loss": 0.6587, + "step": 7549 + }, + { + "epoch": 0.61, + "grad_norm": 2.8938395998889006, + "learning_rate": 3.4364123342862043e-06, + "loss": 0.7543, + "step": 7550 + }, + { + "epoch": 0.61, + "grad_norm": 7.438521489977645, + "learning_rate": 3.4351630039867823e-06, + "loss": 0.7345, + "step": 7551 + }, + { + "epoch": 0.61, + "grad_norm": 4.142714207615951, + "learning_rate": 3.433913781983932e-06, + "loss": 0.6809, + "step": 7552 + }, + { + "epoch": 0.61, + "grad_norm": 4.399413891029143, + "learning_rate": 3.4326646683641085e-06, + "loss": 0.6667, + "step": 7553 + }, + { + "epoch": 0.61, + "grad_norm": 3.718180137185137, + "learning_rate": 3.43141566321376e-06, + "loss": 0.6406, + "step": 7554 + }, + { + "epoch": 0.61, + "grad_norm": 3.865513430650758, + "learning_rate": 3.4301667666193227e-06, + "loss": 0.7799, + "step": 7555 + }, + { + "epoch": 0.61, + "grad_norm": 155.36602555041543, + "learning_rate": 3.4289179786672313e-06, + "loss": 0.5487, + "step": 7556 + }, + { + "epoch": 0.61, + "grad_norm": 2.583920680705088, + "learning_rate": 3.4276692994439066e-06, + "loss": 0.6163, + "step": 7557 + }, + { + "epoch": 0.61, + "grad_norm": 2.8379677151565246, + "learning_rate": 3.4264207290357677e-06, + "loss": 0.6186, + "step": 7558 + }, + { + "epoch": 0.61, + "grad_norm": 2.8614862845848745, + "learning_rate": 3.4251722675292234e-06, + "loss": 0.645, + "step": 7559 + }, + { + "epoch": 0.61, + "grad_norm": 3.816174879108252, + "learning_rate": 3.4239239150106718e-06, + "loss": 0.6699, + "step": 7560 + }, + { + "epoch": 0.61, + "grad_norm": 5.100903637001031, + "learning_rate": 3.42267567156651e-06, + "loss": 0.7712, + "step": 7561 + }, + { + "epoch": 0.61, + "grad_norm": 5.698770048507221, + "learning_rate": 3.421427537283123e-06, + "loss": 0.6137, + "step": 7562 + }, + { + "epoch": 0.61, + "grad_norm": 3.784394996793609, + "learning_rate": 3.4201795122468895e-06, + "loss": 0.6888, + "step": 7563 + }, + { + "epoch": 0.61, + "grad_norm": 6.378711673819811, + "learning_rate": 3.4189315965441838e-06, + "loss": 0.6655, + "step": 7564 + }, + { + "epoch": 0.61, + "grad_norm": 4.907416852753616, + "learning_rate": 3.4176837902613645e-06, + "loss": 0.7325, + "step": 7565 + }, + { + "epoch": 0.61, + "grad_norm": 8.755515461970084, + "learning_rate": 3.4164360934847912e-06, + "loss": 0.6645, + "step": 7566 + }, + { + "epoch": 0.61, + "grad_norm": 5.059930427335747, + "learning_rate": 3.41518850630081e-06, + "loss": 0.6388, + "step": 7567 + }, + { + "epoch": 0.61, + "grad_norm": 4.848736507482465, + "learning_rate": 3.413941028795763e-06, + "loss": 0.6486, + "step": 7568 + }, + { + "epoch": 0.61, + "grad_norm": 5.602527706998475, + "learning_rate": 3.4126936610559835e-06, + "loss": 0.5881, + "step": 7569 + }, + { + "epoch": 0.61, + "grad_norm": 2.9329454364209995, + "learning_rate": 3.4114464031677976e-06, + "loss": 0.6854, + "step": 7570 + }, + { + "epoch": 0.61, + "grad_norm": 3.2587049665308667, + "learning_rate": 3.4101992552175243e-06, + "loss": 0.7212, + "step": 7571 + }, + { + "epoch": 0.61, + "grad_norm": 3.7887060670001023, + "learning_rate": 3.4089522172914713e-06, + "loss": 0.7532, + "step": 7572 + }, + { + "epoch": 0.62, + "grad_norm": 7.1201085347086535, + "learning_rate": 3.4077052894759423e-06, + "loss": 0.7754, + "step": 7573 + }, + { + "epoch": 0.62, + "grad_norm": 5.29265399362968, + "learning_rate": 3.4064584718572348e-06, + "loss": 0.7639, + "step": 7574 + }, + { + "epoch": 0.62, + "grad_norm": 2.7258394766481953, + "learning_rate": 3.4052117645216333e-06, + "loss": 0.7618, + "step": 7575 + }, + { + "epoch": 0.62, + "grad_norm": 6.378065528881045, + "learning_rate": 3.4039651675554197e-06, + "loss": 0.7478, + "step": 7576 + }, + { + "epoch": 0.62, + "grad_norm": 3.6832148871869688, + "learning_rate": 3.4027186810448677e-06, + "loss": 0.7685, + "step": 7577 + }, + { + "epoch": 0.62, + "grad_norm": 8.764362612449606, + "learning_rate": 3.4014723050762382e-06, + "loss": 0.6476, + "step": 7578 + }, + { + "epoch": 0.62, + "grad_norm": 2.6143854157830595, + "learning_rate": 3.4002260397357906e-06, + "loss": 0.7827, + "step": 7579 + }, + { + "epoch": 0.62, + "grad_norm": 4.134247018665249, + "learning_rate": 3.3989798851097744e-06, + "loss": 0.7283, + "step": 7580 + }, + { + "epoch": 0.62, + "grad_norm": 3.6227313138825625, + "learning_rate": 3.3977338412844315e-06, + "loss": 0.7077, + "step": 7581 + }, + { + "epoch": 0.62, + "grad_norm": 4.554280826344177, + "learning_rate": 3.3964879083459945e-06, + "loss": 0.6923, + "step": 7582 + }, + { + "epoch": 0.62, + "grad_norm": 4.295105284035616, + "learning_rate": 3.395242086380691e-06, + "loss": 0.6117, + "step": 7583 + }, + { + "epoch": 0.62, + "grad_norm": 4.647545962328437, + "learning_rate": 3.3939963754747413e-06, + "loss": 0.7276, + "step": 7584 + }, + { + "epoch": 0.62, + "grad_norm": 3.4353216906245416, + "learning_rate": 3.392750775714353e-06, + "loss": 0.772, + "step": 7585 + }, + { + "epoch": 0.62, + "grad_norm": 7.972910457391782, + "learning_rate": 3.391505287185731e-06, + "loss": 0.8188, + "step": 7586 + }, + { + "epoch": 0.62, + "grad_norm": 3.741876121869406, + "learning_rate": 3.3902599099750706e-06, + "loss": 0.5565, + "step": 7587 + }, + { + "epoch": 0.62, + "grad_norm": 3.4172216638434523, + "learning_rate": 3.3890146441685602e-06, + "loss": 0.6391, + "step": 7588 + }, + { + "epoch": 0.62, + "grad_norm": 2.661050743556769, + "learning_rate": 3.3877694898523817e-06, + "loss": 0.5918, + "step": 7589 + }, + { + "epoch": 0.62, + "grad_norm": 3.263090230973825, + "learning_rate": 3.3865244471127045e-06, + "loss": 0.7147, + "step": 7590 + }, + { + "epoch": 0.62, + "grad_norm": 8.350130700238457, + "learning_rate": 3.3852795160356968e-06, + "loss": 0.6576, + "step": 7591 + }, + { + "epoch": 0.62, + "grad_norm": 5.319606378751055, + "learning_rate": 3.384034696707512e-06, + "loss": 0.699, + "step": 7592 + }, + { + "epoch": 0.62, + "grad_norm": 2.6821540304070117, + "learning_rate": 3.3827899892143006e-06, + "loss": 0.8146, + "step": 7593 + }, + { + "epoch": 0.62, + "grad_norm": 2.9240450709303722, + "learning_rate": 3.381545393642205e-06, + "loss": 0.8117, + "step": 7594 + }, + { + "epoch": 0.62, + "grad_norm": 2.176858948563231, + "learning_rate": 3.380300910077359e-06, + "loss": 0.6749, + "step": 7595 + }, + { + "epoch": 0.62, + "grad_norm": 3.0392370141715475, + "learning_rate": 3.3790565386058882e-06, + "loss": 0.6283, + "step": 7596 + }, + { + "epoch": 0.62, + "grad_norm": 4.212753144989034, + "learning_rate": 3.3778122793139132e-06, + "loss": 0.7344, + "step": 7597 + }, + { + "epoch": 0.62, + "grad_norm": 4.2679195342232354, + "learning_rate": 3.376568132287541e-06, + "loss": 0.7003, + "step": 7598 + }, + { + "epoch": 0.62, + "grad_norm": 4.687791044418684, + "learning_rate": 3.3753240976128776e-06, + "loss": 0.5509, + "step": 7599 + }, + { + "epoch": 0.62, + "grad_norm": 3.1721425555808103, + "learning_rate": 3.3740801753760142e-06, + "loss": 0.7878, + "step": 7600 + }, + { + "epoch": 0.62, + "grad_norm": 4.6877598116318575, + "learning_rate": 3.3728363656630407e-06, + "loss": 0.7774, + "step": 7601 + }, + { + "epoch": 0.62, + "grad_norm": 6.146183988433133, + "learning_rate": 3.3715926685600363e-06, + "loss": 0.5087, + "step": 7602 + }, + { + "epoch": 0.62, + "grad_norm": 3.642230564569195, + "learning_rate": 3.3703490841530727e-06, + "loss": 0.6192, + "step": 7603 + }, + { + "epoch": 0.62, + "grad_norm": 4.47494069608227, + "learning_rate": 3.369105612528215e-06, + "loss": 0.6403, + "step": 7604 + }, + { + "epoch": 0.62, + "grad_norm": 20.733977456026008, + "learning_rate": 3.3678622537715167e-06, + "loss": 0.7043, + "step": 7605 + }, + { + "epoch": 0.62, + "grad_norm": 4.299512026883813, + "learning_rate": 3.3666190079690274e-06, + "loss": 0.7505, + "step": 7606 + }, + { + "epoch": 0.62, + "grad_norm": 3.055570205287564, + "learning_rate": 3.3653758752067873e-06, + "loss": 0.7283, + "step": 7607 + }, + { + "epoch": 0.62, + "grad_norm": 3.9612086080329707, + "learning_rate": 3.3641328555708286e-06, + "loss": 0.5763, + "step": 7608 + }, + { + "epoch": 0.62, + "grad_norm": 3.670936416006846, + "learning_rate": 3.3628899491471765e-06, + "loss": 0.6743, + "step": 7609 + }, + { + "epoch": 0.62, + "grad_norm": 3.8075709020030133, + "learning_rate": 3.3616471560218476e-06, + "loss": 0.6467, + "step": 7610 + }, + { + "epoch": 0.62, + "grad_norm": 4.274976174388547, + "learning_rate": 3.3604044762808543e-06, + "loss": 0.7192, + "step": 7611 + }, + { + "epoch": 0.62, + "grad_norm": 12.127380549450214, + "learning_rate": 3.3591619100101924e-06, + "loss": 0.6765, + "step": 7612 + }, + { + "epoch": 0.62, + "grad_norm": 3.619476348721796, + "learning_rate": 3.3579194572958583e-06, + "loss": 0.6522, + "step": 7613 + }, + { + "epoch": 0.62, + "grad_norm": 6.706480111979132, + "learning_rate": 3.356677118223838e-06, + "loss": 0.6877, + "step": 7614 + }, + { + "epoch": 0.62, + "grad_norm": 3.971162282550115, + "learning_rate": 3.355434892880107e-06, + "loss": 0.6166, + "step": 7615 + }, + { + "epoch": 0.62, + "grad_norm": 30.761581180809856, + "learning_rate": 3.354192781350637e-06, + "loss": 0.5656, + "step": 7616 + }, + { + "epoch": 0.62, + "grad_norm": 4.618268303835102, + "learning_rate": 3.3529507837213902e-06, + "loss": 0.5783, + "step": 7617 + }, + { + "epoch": 0.62, + "grad_norm": 3.542632453439617, + "learning_rate": 3.3517089000783193e-06, + "loss": 0.7226, + "step": 7618 + }, + { + "epoch": 0.62, + "grad_norm": 8.787151627397668, + "learning_rate": 3.35046713050737e-06, + "loss": 0.6324, + "step": 7619 + }, + { + "epoch": 0.62, + "grad_norm": 4.502601317352308, + "learning_rate": 3.349225475094482e-06, + "loss": 0.6605, + "step": 7620 + }, + { + "epoch": 0.62, + "grad_norm": 9.129781494319017, + "learning_rate": 3.347983933925586e-06, + "loss": 0.6254, + "step": 7621 + }, + { + "epoch": 0.62, + "grad_norm": 4.3374159657149445, + "learning_rate": 3.3467425070866034e-06, + "loss": 0.765, + "step": 7622 + }, + { + "epoch": 0.62, + "grad_norm": 2.563864121273048, + "learning_rate": 3.3455011946634486e-06, + "loss": 0.7385, + "step": 7623 + }, + { + "epoch": 0.62, + "grad_norm": 6.516016162321504, + "learning_rate": 3.344259996742031e-06, + "loss": 0.8031, + "step": 7624 + }, + { + "epoch": 0.62, + "grad_norm": 4.513862114676875, + "learning_rate": 3.343018913408245e-06, + "loss": 0.6626, + "step": 7625 + }, + { + "epoch": 0.62, + "grad_norm": 3.5021701188097376, + "learning_rate": 3.341777944747983e-06, + "loss": 0.6789, + "step": 7626 + }, + { + "epoch": 0.62, + "grad_norm": 3.4814712205112768, + "learning_rate": 3.3405370908471284e-06, + "loss": 0.6634, + "step": 7627 + }, + { + "epoch": 0.62, + "grad_norm": 5.173562625914548, + "learning_rate": 3.339296351791556e-06, + "loss": 0.8589, + "step": 7628 + }, + { + "epoch": 0.62, + "grad_norm": 4.210867153656778, + "learning_rate": 3.3380557276671345e-06, + "loss": 0.6119, + "step": 7629 + }, + { + "epoch": 0.62, + "grad_norm": 3.1572098433620903, + "learning_rate": 3.33681521855972e-06, + "loss": 0.679, + "step": 7630 + }, + { + "epoch": 0.62, + "grad_norm": 4.504057341518131, + "learning_rate": 3.335574824555165e-06, + "loss": 0.7075, + "step": 7631 + }, + { + "epoch": 0.62, + "grad_norm": 4.214673052098549, + "learning_rate": 3.334334545739311e-06, + "loss": 0.6395, + "step": 7632 + }, + { + "epoch": 0.62, + "grad_norm": 2.9608187310383824, + "learning_rate": 3.3330943821979944e-06, + "loss": 0.8362, + "step": 7633 + }, + { + "epoch": 0.62, + "grad_norm": 9.761837016919914, + "learning_rate": 3.3318543340170427e-06, + "loss": 0.8531, + "step": 7634 + }, + { + "epoch": 0.62, + "grad_norm": 3.1988797996930134, + "learning_rate": 3.3306144012822745e-06, + "loss": 0.7701, + "step": 7635 + }, + { + "epoch": 0.62, + "grad_norm": 3.961756173334681, + "learning_rate": 3.3293745840795004e-06, + "loss": 0.7678, + "step": 7636 + }, + { + "epoch": 0.62, + "grad_norm": 2.6678785403794336, + "learning_rate": 3.328134882494527e-06, + "loss": 0.6052, + "step": 7637 + }, + { + "epoch": 0.62, + "grad_norm": 9.698446651831313, + "learning_rate": 3.326895296613144e-06, + "loss": 0.5608, + "step": 7638 + }, + { + "epoch": 0.62, + "grad_norm": 2.7950826898696635, + "learning_rate": 3.325655826521143e-06, + "loss": 0.6427, + "step": 7639 + }, + { + "epoch": 0.62, + "grad_norm": 2.4300939959299215, + "learning_rate": 3.3244164723043e-06, + "loss": 0.6423, + "step": 7640 + }, + { + "epoch": 0.62, + "grad_norm": 3.174686478526541, + "learning_rate": 3.323177234048387e-06, + "loss": 0.7855, + "step": 7641 + }, + { + "epoch": 0.62, + "grad_norm": 4.072481012022744, + "learning_rate": 3.321938111839168e-06, + "loss": 0.597, + "step": 7642 + }, + { + "epoch": 0.62, + "grad_norm": 4.2582583679564525, + "learning_rate": 3.3206991057623977e-06, + "loss": 0.7655, + "step": 7643 + }, + { + "epoch": 0.62, + "grad_norm": 6.610611876438256, + "learning_rate": 3.3194602159038247e-06, + "loss": 0.7194, + "step": 7644 + }, + { + "epoch": 0.62, + "grad_norm": 7.095847825292501, + "learning_rate": 3.318221442349184e-06, + "loss": 0.6448, + "step": 7645 + }, + { + "epoch": 0.62, + "grad_norm": 2.939983950333868, + "learning_rate": 3.3169827851842096e-06, + "loss": 0.6499, + "step": 7646 + }, + { + "epoch": 0.62, + "grad_norm": 4.900593757030222, + "learning_rate": 3.3157442444946247e-06, + "loss": 0.6828, + "step": 7647 + }, + { + "epoch": 0.62, + "grad_norm": 4.473196969809137, + "learning_rate": 3.3145058203661416e-06, + "loss": 0.7372, + "step": 7648 + }, + { + "epoch": 0.62, + "grad_norm": 3.340425883696199, + "learning_rate": 3.3132675128844684e-06, + "loss": 0.7354, + "step": 7649 + }, + { + "epoch": 0.62, + "grad_norm": 4.797626039720172, + "learning_rate": 3.312029322135306e-06, + "loss": 0.6353, + "step": 7650 + }, + { + "epoch": 0.62, + "grad_norm": 5.327347955083741, + "learning_rate": 3.3107912482043413e-06, + "loss": 0.6843, + "step": 7651 + }, + { + "epoch": 0.62, + "grad_norm": 10.098680333792164, + "learning_rate": 3.309553291177258e-06, + "loss": 0.73, + "step": 7652 + }, + { + "epoch": 0.62, + "grad_norm": 5.232056353290789, + "learning_rate": 3.3083154511397308e-06, + "loss": 0.7105, + "step": 7653 + }, + { + "epoch": 0.62, + "grad_norm": 3.3190201068345804, + "learning_rate": 3.307077728177427e-06, + "loss": 0.739, + "step": 7654 + }, + { + "epoch": 0.62, + "grad_norm": 3.783498222703066, + "learning_rate": 3.305840122376003e-06, + "loss": 0.815, + "step": 7655 + }, + { + "epoch": 0.62, + "grad_norm": 2.583289707211872, + "learning_rate": 3.30460263382111e-06, + "loss": 0.5703, + "step": 7656 + }, + { + "epoch": 0.62, + "grad_norm": 4.035889472612898, + "learning_rate": 3.3033652625983915e-06, + "loss": 0.7322, + "step": 7657 + }, + { + "epoch": 0.62, + "grad_norm": 6.109958115776633, + "learning_rate": 3.302128008793478e-06, + "loss": 0.6066, + "step": 7658 + }, + { + "epoch": 0.62, + "grad_norm": 14.75017196502749, + "learning_rate": 3.300890872491997e-06, + "loss": 0.6381, + "step": 7659 + }, + { + "epoch": 0.62, + "grad_norm": 2.735863448747024, + "learning_rate": 3.2996538537795656e-06, + "loss": 0.6694, + "step": 7660 + }, + { + "epoch": 0.62, + "grad_norm": 5.437792596895274, + "learning_rate": 3.2984169527417943e-06, + "loss": 0.7091, + "step": 7661 + }, + { + "epoch": 0.62, + "grad_norm": 23.877700755266194, + "learning_rate": 3.2971801694642845e-06, + "loss": 0.5451, + "step": 7662 + }, + { + "epoch": 0.62, + "grad_norm": 3.8946372978336408, + "learning_rate": 3.295943504032629e-06, + "loss": 0.7329, + "step": 7663 + }, + { + "epoch": 0.62, + "grad_norm": 3.181871180584756, + "learning_rate": 3.2947069565324134e-06, + "loss": 0.5762, + "step": 7664 + }, + { + "epoch": 0.62, + "grad_norm": 3.8124439541833364, + "learning_rate": 3.2934705270492124e-06, + "loss": 0.5801, + "step": 7665 + }, + { + "epoch": 0.62, + "grad_norm": 3.829757867100352, + "learning_rate": 3.292234215668596e-06, + "loss": 0.5428, + "step": 7666 + }, + { + "epoch": 0.62, + "grad_norm": 4.637744724675553, + "learning_rate": 3.2909980224761246e-06, + "loss": 0.7825, + "step": 7667 + }, + { + "epoch": 0.62, + "grad_norm": 3.013602207219761, + "learning_rate": 3.289761947557351e-06, + "loss": 0.7835, + "step": 7668 + }, + { + "epoch": 0.62, + "grad_norm": 3.4680334890081848, + "learning_rate": 3.2885259909978205e-06, + "loss": 0.7759, + "step": 7669 + }, + { + "epoch": 0.62, + "grad_norm": 2.439206637462609, + "learning_rate": 3.287290152883067e-06, + "loss": 0.6723, + "step": 7670 + }, + { + "epoch": 0.62, + "grad_norm": 4.179106225527792, + "learning_rate": 3.286054433298619e-06, + "loss": 0.7072, + "step": 7671 + }, + { + "epoch": 0.62, + "grad_norm": 4.8041732960693775, + "learning_rate": 3.2848188323299964e-06, + "loss": 0.5256, + "step": 7672 + }, + { + "epoch": 0.62, + "grad_norm": 2.60557524086, + "learning_rate": 3.283583350062709e-06, + "loss": 0.5816, + "step": 7673 + }, + { + "epoch": 0.62, + "grad_norm": 4.300835489776446, + "learning_rate": 3.2823479865822616e-06, + "loss": 0.7765, + "step": 7674 + }, + { + "epoch": 0.62, + "grad_norm": 10.040977092105795, + "learning_rate": 3.2811127419741495e-06, + "loss": 0.688, + "step": 7675 + }, + { + "epoch": 0.62, + "grad_norm": 7.412265533566398, + "learning_rate": 3.279877616323858e-06, + "loss": 0.6562, + "step": 7676 + }, + { + "epoch": 0.62, + "grad_norm": 4.422554359228267, + "learning_rate": 3.278642609716868e-06, + "loss": 0.5768, + "step": 7677 + }, + { + "epoch": 0.62, + "grad_norm": 4.690113171316932, + "learning_rate": 3.2774077222386465e-06, + "loss": 0.6951, + "step": 7678 + }, + { + "epoch": 0.62, + "grad_norm": 3.7419522628757904, + "learning_rate": 3.276172953974658e-06, + "loss": 0.7497, + "step": 7679 + }, + { + "epoch": 0.62, + "grad_norm": 4.393845110521941, + "learning_rate": 3.2749383050103534e-06, + "loss": 0.7425, + "step": 7680 + }, + { + "epoch": 0.62, + "grad_norm": 3.6937774292244923, + "learning_rate": 3.2737037754311808e-06, + "loss": 0.6713, + "step": 7681 + }, + { + "epoch": 0.62, + "grad_norm": 4.410884187724273, + "learning_rate": 3.2724693653225757e-06, + "loss": 0.7277, + "step": 7682 + }, + { + "epoch": 0.62, + "grad_norm": 3.9393292391271264, + "learning_rate": 3.2712350747699704e-06, + "loss": 0.7861, + "step": 7683 + }, + { + "epoch": 0.62, + "grad_norm": 5.231265555332869, + "learning_rate": 3.2700009038587817e-06, + "loss": 0.5494, + "step": 7684 + }, + { + "epoch": 0.62, + "grad_norm": 3.348974837981795, + "learning_rate": 3.2687668526744224e-06, + "loss": 0.7663, + "step": 7685 + }, + { + "epoch": 0.62, + "grad_norm": 5.369366626832881, + "learning_rate": 3.267532921302299e-06, + "loss": 0.5978, + "step": 7686 + }, + { + "epoch": 0.62, + "grad_norm": 7.706487494107511, + "learning_rate": 3.2662991098278057e-06, + "loss": 0.5342, + "step": 7687 + }, + { + "epoch": 0.62, + "grad_norm": 3.277494553101847, + "learning_rate": 3.2650654183363297e-06, + "loss": 0.7206, + "step": 7688 + }, + { + "epoch": 0.62, + "grad_norm": 2.9952783807855683, + "learning_rate": 3.2638318469132507e-06, + "loss": 0.6935, + "step": 7689 + }, + { + "epoch": 0.62, + "grad_norm": 8.033823874131544, + "learning_rate": 3.262598395643942e-06, + "loss": 0.5771, + "step": 7690 + }, + { + "epoch": 0.62, + "grad_norm": 11.449662697936539, + "learning_rate": 3.261365064613762e-06, + "loss": 0.6382, + "step": 7691 + }, + { + "epoch": 0.62, + "grad_norm": 2.726104823017054, + "learning_rate": 3.260131853908066e-06, + "loss": 0.6494, + "step": 7692 + }, + { + "epoch": 0.62, + "grad_norm": 3.1446021922028695, + "learning_rate": 3.2588987636122016e-06, + "loss": 0.6588, + "step": 7693 + }, + { + "epoch": 0.62, + "grad_norm": 2.874213712478007, + "learning_rate": 3.2576657938115068e-06, + "loss": 0.5727, + "step": 7694 + }, + { + "epoch": 0.62, + "grad_norm": 3.49788809870037, + "learning_rate": 3.2564329445913085e-06, + "loss": 0.6762, + "step": 7695 + }, + { + "epoch": 0.63, + "grad_norm": 4.047238046135469, + "learning_rate": 3.255200216036929e-06, + "loss": 0.7261, + "step": 7696 + }, + { + "epoch": 0.63, + "grad_norm": 3.226500976776351, + "learning_rate": 3.2539676082336823e-06, + "loss": 0.7115, + "step": 7697 + }, + { + "epoch": 0.63, + "grad_norm": 4.613355220739448, + "learning_rate": 3.2527351212668688e-06, + "loss": 0.6949, + "step": 7698 + }, + { + "epoch": 0.63, + "grad_norm": 2.868387409718942, + "learning_rate": 3.251502755221787e-06, + "loss": 0.6188, + "step": 7699 + }, + { + "epoch": 0.63, + "grad_norm": 5.177311637015253, + "learning_rate": 3.250270510183724e-06, + "loss": 0.8344, + "step": 7700 + }, + { + "epoch": 0.63, + "grad_norm": 6.184161304533539, + "learning_rate": 3.2490383862379594e-06, + "loss": 0.7366, + "step": 7701 + }, + { + "epoch": 0.63, + "grad_norm": 2.757913672258659, + "learning_rate": 3.2478063834697637e-06, + "loss": 0.5776, + "step": 7702 + }, + { + "epoch": 0.63, + "grad_norm": 3.603749390775768, + "learning_rate": 3.2465745019643992e-06, + "loss": 0.8333, + "step": 7703 + }, + { + "epoch": 0.63, + "grad_norm": 6.971405006653064, + "learning_rate": 3.24534274180712e-06, + "loss": 0.7142, + "step": 7704 + }, + { + "epoch": 0.63, + "grad_norm": 2.5076719510943923, + "learning_rate": 3.2441111030831695e-06, + "loss": 0.6597, + "step": 7705 + }, + { + "epoch": 0.63, + "grad_norm": 4.19856775593818, + "learning_rate": 3.2428795858777873e-06, + "loss": 0.6919, + "step": 7706 + }, + { + "epoch": 0.63, + "grad_norm": 3.0672345442128863, + "learning_rate": 3.2416481902762015e-06, + "loss": 0.7056, + "step": 7707 + }, + { + "epoch": 0.63, + "grad_norm": 6.554291488559537, + "learning_rate": 3.2404169163636324e-06, + "loss": 0.5825, + "step": 7708 + }, + { + "epoch": 0.63, + "grad_norm": 4.094106997714975, + "learning_rate": 3.239185764225291e-06, + "loss": 0.6599, + "step": 7709 + }, + { + "epoch": 0.63, + "grad_norm": 3.8593722196957, + "learning_rate": 3.237954733946385e-06, + "loss": 0.6976, + "step": 7710 + }, + { + "epoch": 0.63, + "grad_norm": 5.0051146894564935, + "learning_rate": 3.2367238256121035e-06, + "loss": 0.6504, + "step": 7711 + }, + { + "epoch": 0.63, + "grad_norm": 7.05607896581478, + "learning_rate": 3.2354930393076373e-06, + "loss": 0.6607, + "step": 7712 + }, + { + "epoch": 0.63, + "grad_norm": 3.798002792397022, + "learning_rate": 3.234262375118161e-06, + "loss": 0.5919, + "step": 7713 + }, + { + "epoch": 0.63, + "grad_norm": 4.028723396190554, + "learning_rate": 3.233031833128848e-06, + "loss": 0.7423, + "step": 7714 + }, + { + "epoch": 0.63, + "grad_norm": 5.786384515398065, + "learning_rate": 3.2318014134248565e-06, + "loss": 0.8331, + "step": 7715 + }, + { + "epoch": 0.63, + "grad_norm": 2.6654172140159296, + "learning_rate": 3.230571116091341e-06, + "loss": 0.7741, + "step": 7716 + }, + { + "epoch": 0.63, + "grad_norm": 3.967654398945249, + "learning_rate": 3.229340941213448e-06, + "loss": 0.6815, + "step": 7717 + }, + { + "epoch": 0.63, + "grad_norm": 5.447210932324046, + "learning_rate": 3.228110888876308e-06, + "loss": 0.6838, + "step": 7718 + }, + { + "epoch": 0.63, + "grad_norm": 4.349344794487387, + "learning_rate": 3.226880959165053e-06, + "loss": 0.6382, + "step": 7719 + }, + { + "epoch": 0.63, + "grad_norm": 3.8974510665640474, + "learning_rate": 3.225651152164799e-06, + "loss": 0.5971, + "step": 7720 + }, + { + "epoch": 0.63, + "grad_norm": 18.835150563214153, + "learning_rate": 3.2244214679606574e-06, + "loss": 0.6938, + "step": 7721 + }, + { + "epoch": 0.63, + "grad_norm": 7.590028560985334, + "learning_rate": 3.22319190663773e-06, + "loss": 0.7676, + "step": 7722 + }, + { + "epoch": 0.63, + "grad_norm": 40.435510990333455, + "learning_rate": 3.2219624682811125e-06, + "loss": 0.7199, + "step": 7723 + }, + { + "epoch": 0.63, + "grad_norm": 4.087583585415945, + "learning_rate": 3.2207331529758856e-06, + "loss": 0.6098, + "step": 7724 + }, + { + "epoch": 0.63, + "grad_norm": 5.628691103812373, + "learning_rate": 3.2195039608071278e-06, + "loss": 0.6713, + "step": 7725 + }, + { + "epoch": 0.63, + "grad_norm": 2.912436390534725, + "learning_rate": 3.2182748918599064e-06, + "loss": 0.6006, + "step": 7726 + }, + { + "epoch": 0.63, + "grad_norm": 3.0509772199619407, + "learning_rate": 3.2170459462192827e-06, + "loss": 0.683, + "step": 7727 + }, + { + "epoch": 0.63, + "grad_norm": 3.0222839387223805, + "learning_rate": 3.215817123970305e-06, + "loss": 0.6247, + "step": 7728 + }, + { + "epoch": 0.63, + "grad_norm": 4.677206881731413, + "learning_rate": 3.214588425198016e-06, + "loss": 0.6999, + "step": 7729 + }, + { + "epoch": 0.63, + "grad_norm": 5.514324746090309, + "learning_rate": 3.213359849987452e-06, + "loss": 0.6766, + "step": 7730 + }, + { + "epoch": 0.63, + "grad_norm": 3.223193905586136, + "learning_rate": 3.212131398423634e-06, + "loss": 0.8193, + "step": 7731 + }, + { + "epoch": 0.63, + "grad_norm": 4.287674588437586, + "learning_rate": 3.2109030705915805e-06, + "loss": 0.9024, + "step": 7732 + }, + { + "epoch": 0.63, + "grad_norm": 2.8462741984231394, + "learning_rate": 3.2096748665763e-06, + "loss": 0.6102, + "step": 7733 + }, + { + "epoch": 0.63, + "grad_norm": 6.23799182649523, + "learning_rate": 3.208446786462791e-06, + "loss": 0.6404, + "step": 7734 + }, + { + "epoch": 0.63, + "grad_norm": 2.5296982293489747, + "learning_rate": 3.2072188303360462e-06, + "loss": 0.6327, + "step": 7735 + }, + { + "epoch": 0.63, + "grad_norm": 4.5288065371547255, + "learning_rate": 3.2059909982810456e-06, + "loss": 0.7014, + "step": 7736 + }, + { + "epoch": 0.63, + "grad_norm": 5.417449009010182, + "learning_rate": 3.2047632903827664e-06, + "loss": 0.6281, + "step": 7737 + }, + { + "epoch": 0.63, + "grad_norm": 4.350418374016844, + "learning_rate": 3.2035357067261686e-06, + "loss": 0.5926, + "step": 7738 + }, + { + "epoch": 0.63, + "grad_norm": 3.0377568791259586, + "learning_rate": 3.202308247396212e-06, + "loss": 0.6447, + "step": 7739 + }, + { + "epoch": 0.63, + "grad_norm": 4.496318808813016, + "learning_rate": 3.201080912477843e-06, + "loss": 0.6377, + "step": 7740 + }, + { + "epoch": 0.63, + "grad_norm": 7.598672818303001, + "learning_rate": 3.199853702056003e-06, + "loss": 0.863, + "step": 7741 + }, + { + "epoch": 0.63, + "grad_norm": 13.039716597322176, + "learning_rate": 3.198626616215621e-06, + "loss": 0.662, + "step": 7742 + }, + { + "epoch": 0.63, + "grad_norm": 3.408917508565585, + "learning_rate": 3.197399655041621e-06, + "loss": 0.693, + "step": 7743 + }, + { + "epoch": 0.63, + "grad_norm": 7.22095126804445, + "learning_rate": 3.196172818618914e-06, + "loss": 0.6231, + "step": 7744 + }, + { + "epoch": 0.63, + "grad_norm": 3.656461033961272, + "learning_rate": 3.194946107032405e-06, + "loss": 0.7085, + "step": 7745 + }, + { + "epoch": 0.63, + "grad_norm": 4.92919446388533, + "learning_rate": 3.1937195203669907e-06, + "loss": 0.7219, + "step": 7746 + }, + { + "epoch": 0.63, + "grad_norm": 8.851805241243225, + "learning_rate": 3.192493058707559e-06, + "loss": 0.7888, + "step": 7747 + }, + { + "epoch": 0.63, + "grad_norm": 3.499339926878046, + "learning_rate": 3.1912667221389892e-06, + "loss": 0.6251, + "step": 7748 + }, + { + "epoch": 0.63, + "grad_norm": 6.799931601253928, + "learning_rate": 3.1900405107461506e-06, + "loss": 0.7246, + "step": 7749 + }, + { + "epoch": 0.63, + "grad_norm": 2.6846434007196076, + "learning_rate": 3.1888144246139067e-06, + "loss": 0.7842, + "step": 7750 + }, + { + "epoch": 0.63, + "grad_norm": 4.585604911486977, + "learning_rate": 3.187588463827107e-06, + "loss": 0.6561, + "step": 7751 + }, + { + "epoch": 0.63, + "grad_norm": 3.718118408356869, + "learning_rate": 3.1863626284705997e-06, + "loss": 0.6814, + "step": 7752 + }, + { + "epoch": 0.63, + "grad_norm": 5.190609558842971, + "learning_rate": 3.185136918629216e-06, + "loss": 0.6963, + "step": 7753 + }, + { + "epoch": 0.63, + "grad_norm": 3.239222862174707, + "learning_rate": 3.1839113343877848e-06, + "loss": 0.5497, + "step": 7754 + }, + { + "epoch": 0.63, + "grad_norm": 9.88778798528499, + "learning_rate": 3.182685875831124e-06, + "loss": 0.7924, + "step": 7755 + }, + { + "epoch": 0.63, + "grad_norm": 3.1218864768920374, + "learning_rate": 3.1814605430440458e-06, + "loss": 0.5899, + "step": 7756 + }, + { + "epoch": 0.63, + "grad_norm": 3.15791075684033, + "learning_rate": 3.180235336111346e-06, + "loss": 0.7058, + "step": 7757 + }, + { + "epoch": 0.63, + "grad_norm": 4.377376120293765, + "learning_rate": 3.17901025511782e-06, + "loss": 0.6661, + "step": 7758 + }, + { + "epoch": 0.63, + "grad_norm": 3.409969053505127, + "learning_rate": 3.1777853001482493e-06, + "loss": 0.5112, + "step": 7759 + }, + { + "epoch": 0.63, + "grad_norm": 3.980900907828548, + "learning_rate": 3.1765604712874115e-06, + "loss": 0.7136, + "step": 7760 + }, + { + "epoch": 0.63, + "grad_norm": 5.151238656542228, + "learning_rate": 3.1753357686200693e-06, + "loss": 0.5439, + "step": 7761 + }, + { + "epoch": 0.63, + "grad_norm": 22.615372455978253, + "learning_rate": 3.1741111922309797e-06, + "loss": 0.6819, + "step": 7762 + }, + { + "epoch": 0.63, + "grad_norm": 7.980533539373199, + "learning_rate": 3.1728867422048957e-06, + "loss": 0.6054, + "step": 7763 + }, + { + "epoch": 0.63, + "grad_norm": 4.7561622208903795, + "learning_rate": 3.171662418626551e-06, + "loss": 0.7378, + "step": 7764 + }, + { + "epoch": 0.63, + "grad_norm": 6.126044575079399, + "learning_rate": 3.1704382215806794e-06, + "loss": 0.6452, + "step": 7765 + }, + { + "epoch": 0.63, + "grad_norm": 3.5784985141591794, + "learning_rate": 3.1692141511520025e-06, + "loss": 0.7608, + "step": 7766 + }, + { + "epoch": 0.63, + "grad_norm": 4.4288329191768545, + "learning_rate": 3.1679902074252344e-06, + "loss": 0.7558, + "step": 7767 + }, + { + "epoch": 0.63, + "grad_norm": 3.4271202605406024, + "learning_rate": 3.1667663904850786e-06, + "loss": 0.8049, + "step": 7768 + }, + { + "epoch": 0.63, + "grad_norm": 4.220449535006005, + "learning_rate": 3.165542700416232e-06, + "loss": 0.4805, + "step": 7769 + }, + { + "epoch": 0.63, + "grad_norm": 3.3989277540852494, + "learning_rate": 3.1643191373033833e-06, + "loss": 0.5789, + "step": 7770 + }, + { + "epoch": 0.63, + "grad_norm": 3.022969568252157, + "learning_rate": 3.1630957012312063e-06, + "loss": 0.847, + "step": 7771 + }, + { + "epoch": 0.63, + "grad_norm": 3.7117206380202394, + "learning_rate": 3.161872392284373e-06, + "loss": 0.6714, + "step": 7772 + }, + { + "epoch": 0.63, + "grad_norm": 6.240200871913442, + "learning_rate": 3.160649210547544e-06, + "loss": 0.6754, + "step": 7773 + }, + { + "epoch": 0.63, + "grad_norm": 2.527652181232202, + "learning_rate": 3.1594261561053707e-06, + "loss": 0.6578, + "step": 7774 + }, + { + "epoch": 0.63, + "grad_norm": 2.890328872037365, + "learning_rate": 3.158203229042498e-06, + "loss": 0.7807, + "step": 7775 + }, + { + "epoch": 0.63, + "grad_norm": 4.279387722613669, + "learning_rate": 3.156980429443559e-06, + "loss": 0.675, + "step": 7776 + }, + { + "epoch": 0.63, + "grad_norm": 4.3813053920552, + "learning_rate": 3.1557577573931786e-06, + "loss": 0.6648, + "step": 7777 + }, + { + "epoch": 0.63, + "grad_norm": 7.8366371850988665, + "learning_rate": 3.154535212975973e-06, + "loss": 0.6509, + "step": 7778 + }, + { + "epoch": 0.63, + "grad_norm": 6.510129993214789, + "learning_rate": 3.1533127962765497e-06, + "loss": 0.6698, + "step": 7779 + }, + { + "epoch": 0.63, + "grad_norm": 3.256387733347844, + "learning_rate": 3.1520905073795096e-06, + "loss": 0.7841, + "step": 7780 + }, + { + "epoch": 0.63, + "grad_norm": 5.144551442583872, + "learning_rate": 3.150868346369441e-06, + "loss": 0.7633, + "step": 7781 + }, + { + "epoch": 0.63, + "grad_norm": 12.38142037042164, + "learning_rate": 3.1496463133309274e-06, + "loss": 0.7002, + "step": 7782 + }, + { + "epoch": 0.63, + "grad_norm": 3.018917575865357, + "learning_rate": 3.14842440834854e-06, + "loss": 0.6307, + "step": 7783 + }, + { + "epoch": 0.63, + "grad_norm": 21.90359593025437, + "learning_rate": 3.1472026315068404e-06, + "loss": 0.5818, + "step": 7784 + }, + { + "epoch": 0.63, + "grad_norm": 5.974350587656121, + "learning_rate": 3.1459809828903865e-06, + "loss": 0.6482, + "step": 7785 + }, + { + "epoch": 0.63, + "grad_norm": 13.927370647612653, + "learning_rate": 3.144759462583721e-06, + "loss": 0.7035, + "step": 7786 + }, + { + "epoch": 0.63, + "grad_norm": 31.941265019583575, + "learning_rate": 3.1435380706713823e-06, + "loss": 0.7151, + "step": 7787 + }, + { + "epoch": 0.63, + "grad_norm": 8.55445053760908, + "learning_rate": 3.1423168072378986e-06, + "loss": 0.5533, + "step": 7788 + }, + { + "epoch": 0.63, + "grad_norm": 6.0138693672080334, + "learning_rate": 3.1410956723677888e-06, + "loss": 0.6116, + "step": 7789 + }, + { + "epoch": 0.63, + "grad_norm": 5.008743160972371, + "learning_rate": 3.1398746661455647e-06, + "loss": 0.7882, + "step": 7790 + }, + { + "epoch": 0.63, + "grad_norm": 4.551174500873657, + "learning_rate": 3.1386537886557244e-06, + "loss": 0.6925, + "step": 7791 + }, + { + "epoch": 0.63, + "grad_norm": 3.5890832512231343, + "learning_rate": 3.137433039982763e-06, + "loss": 0.6555, + "step": 7792 + }, + { + "epoch": 0.63, + "grad_norm": 9.65990171509788, + "learning_rate": 3.1362124202111614e-06, + "loss": 0.6497, + "step": 7793 + }, + { + "epoch": 0.63, + "grad_norm": 7.137935208217813, + "learning_rate": 3.134991929425396e-06, + "loss": 0.7044, + "step": 7794 + }, + { + "epoch": 0.63, + "grad_norm": 4.887859221024265, + "learning_rate": 3.1337715677099325e-06, + "loss": 0.6397, + "step": 7795 + }, + { + "epoch": 0.63, + "grad_norm": 4.722730914851677, + "learning_rate": 3.1325513351492286e-06, + "loss": 0.6903, + "step": 7796 + }, + { + "epoch": 0.63, + "grad_norm": 11.819912797652126, + "learning_rate": 3.131331231827729e-06, + "loss": 0.7927, + "step": 7797 + }, + { + "epoch": 0.63, + "grad_norm": 3.897105251050104, + "learning_rate": 3.130111257829874e-06, + "loss": 0.5588, + "step": 7798 + }, + { + "epoch": 0.63, + "grad_norm": 6.406932038833458, + "learning_rate": 3.1288914132400948e-06, + "loss": 0.8049, + "step": 7799 + }, + { + "epoch": 0.63, + "grad_norm": 24.688988420277415, + "learning_rate": 3.127671698142811e-06, + "loss": 0.5677, + "step": 7800 + }, + { + "epoch": 0.63, + "grad_norm": 3.878597056739613, + "learning_rate": 3.1264521126224345e-06, + "loss": 0.6835, + "step": 7801 + }, + { + "epoch": 0.63, + "grad_norm": 2.6840381167648113, + "learning_rate": 3.1252326567633686e-06, + "loss": 0.5967, + "step": 7802 + }, + { + "epoch": 0.63, + "grad_norm": 4.468656781635662, + "learning_rate": 3.1240133306500096e-06, + "loss": 0.7435, + "step": 7803 + }, + { + "epoch": 0.63, + "grad_norm": 3.006163746955273, + "learning_rate": 3.122794134366738e-06, + "loss": 0.6444, + "step": 7804 + }, + { + "epoch": 0.63, + "grad_norm": 11.017568369656365, + "learning_rate": 3.1215750679979316e-06, + "loss": 0.7563, + "step": 7805 + }, + { + "epoch": 0.63, + "grad_norm": 4.02654974514555, + "learning_rate": 3.120356131627959e-06, + "loss": 0.9429, + "step": 7806 + }, + { + "epoch": 0.63, + "grad_norm": 7.012079005629595, + "learning_rate": 3.119137325341178e-06, + "loss": 0.5927, + "step": 7807 + }, + { + "epoch": 0.63, + "grad_norm": 4.21845084670605, + "learning_rate": 3.117918649221936e-06, + "loss": 0.5881, + "step": 7808 + }, + { + "epoch": 0.63, + "grad_norm": 4.665467673248985, + "learning_rate": 3.116700103354575e-06, + "loss": 0.7387, + "step": 7809 + }, + { + "epoch": 0.63, + "grad_norm": 3.387636063968424, + "learning_rate": 3.115481687823425e-06, + "loss": 0.5892, + "step": 7810 + }, + { + "epoch": 0.63, + "grad_norm": 3.1986666881053, + "learning_rate": 3.114263402712807e-06, + "loss": 0.5072, + "step": 7811 + }, + { + "epoch": 0.63, + "grad_norm": 3.6582221794064966, + "learning_rate": 3.113045248107035e-06, + "loss": 0.6263, + "step": 7812 + }, + { + "epoch": 0.63, + "grad_norm": 3.202176878332542, + "learning_rate": 3.1118272240904136e-06, + "loss": 0.6164, + "step": 7813 + }, + { + "epoch": 0.63, + "grad_norm": 5.006164122249263, + "learning_rate": 3.110609330747237e-06, + "loss": 0.6311, + "step": 7814 + }, + { + "epoch": 0.63, + "grad_norm": 5.559994708593936, + "learning_rate": 3.109391568161792e-06, + "loss": 0.7401, + "step": 7815 + }, + { + "epoch": 0.63, + "grad_norm": 2.9547972782051644, + "learning_rate": 3.108173936418355e-06, + "loss": 0.5967, + "step": 7816 + }, + { + "epoch": 0.63, + "grad_norm": 3.0577294516536573, + "learning_rate": 3.106956435601194e-06, + "loss": 0.6676, + "step": 7817 + }, + { + "epoch": 0.63, + "grad_norm": 2.7416254123662127, + "learning_rate": 3.105739065794565e-06, + "loss": 0.7813, + "step": 7818 + }, + { + "epoch": 0.64, + "grad_norm": 3.750782139307154, + "learning_rate": 3.104521827082721e-06, + "loss": 0.6834, + "step": 7819 + }, + { + "epoch": 0.64, + "grad_norm": 3.1726802635563955, + "learning_rate": 3.1033047195499013e-06, + "loss": 0.5241, + "step": 7820 + }, + { + "epoch": 0.64, + "grad_norm": 6.520254021356529, + "learning_rate": 3.102087743280337e-06, + "loss": 0.6318, + "step": 7821 + }, + { + "epoch": 0.64, + "grad_norm": 4.515664299037804, + "learning_rate": 3.1008708983582525e-06, + "loss": 0.5974, + "step": 7822 + }, + { + "epoch": 0.64, + "grad_norm": 4.205600514002894, + "learning_rate": 3.0996541848678598e-06, + "loss": 0.6768, + "step": 7823 + }, + { + "epoch": 0.64, + "grad_norm": 5.388180188522335, + "learning_rate": 3.0984376028933623e-06, + "loss": 0.7288, + "step": 7824 + }, + { + "epoch": 0.64, + "grad_norm": 43.8329588145411, + "learning_rate": 3.0972211525189566e-06, + "loss": 0.6351, + "step": 7825 + }, + { + "epoch": 0.64, + "grad_norm": 3.6485111273942277, + "learning_rate": 3.096004833828827e-06, + "loss": 0.6738, + "step": 7826 + }, + { + "epoch": 0.64, + "grad_norm": 3.427899421891699, + "learning_rate": 3.0947886469071512e-06, + "loss": 0.83, + "step": 7827 + }, + { + "epoch": 0.64, + "grad_norm": 3.2538144675701868, + "learning_rate": 3.0935725918380977e-06, + "loss": 0.6581, + "step": 7828 + }, + { + "epoch": 0.64, + "grad_norm": 5.297058177906108, + "learning_rate": 3.0923566687058264e-06, + "loss": 0.6391, + "step": 7829 + }, + { + "epoch": 0.64, + "grad_norm": 3.36028536088647, + "learning_rate": 3.0911408775944836e-06, + "loss": 0.6735, + "step": 7830 + }, + { + "epoch": 0.64, + "grad_norm": 27.407577340052498, + "learning_rate": 3.0899252185882106e-06, + "loss": 0.6654, + "step": 7831 + }, + { + "epoch": 0.64, + "grad_norm": 3.9111025150663843, + "learning_rate": 3.0887096917711408e-06, + "loss": 0.6961, + "step": 7832 + }, + { + "epoch": 0.64, + "grad_norm": 4.422645980999589, + "learning_rate": 3.0874942972273937e-06, + "loss": 0.5759, + "step": 7833 + }, + { + "epoch": 0.64, + "grad_norm": 3.5753318802998315, + "learning_rate": 3.086279035041083e-06, + "loss": 0.7835, + "step": 7834 + }, + { + "epoch": 0.64, + "grad_norm": 2.615130735915641, + "learning_rate": 3.0850639052963135e-06, + "loss": 0.7615, + "step": 7835 + }, + { + "epoch": 0.64, + "grad_norm": 5.192258680627292, + "learning_rate": 3.0838489080771804e-06, + "loss": 0.5789, + "step": 7836 + }, + { + "epoch": 0.64, + "grad_norm": 5.838917166902251, + "learning_rate": 3.082634043467767e-06, + "loss": 0.6546, + "step": 7837 + }, + { + "epoch": 0.64, + "grad_norm": 2.9469886056947705, + "learning_rate": 3.0814193115521496e-06, + "loss": 0.715, + "step": 7838 + }, + { + "epoch": 0.64, + "grad_norm": 3.817014681098799, + "learning_rate": 3.0802047124143964e-06, + "loss": 0.7515, + "step": 7839 + }, + { + "epoch": 0.64, + "grad_norm": 3.3198898935645875, + "learning_rate": 3.078990246138566e-06, + "loss": 0.5867, + "step": 7840 + }, + { + "epoch": 0.64, + "grad_norm": 7.8286379030413045, + "learning_rate": 3.077775912808706e-06, + "loss": 0.5958, + "step": 7841 + }, + { + "epoch": 0.64, + "grad_norm": 4.295519938350132, + "learning_rate": 3.0765617125088554e-06, + "loss": 0.7171, + "step": 7842 + }, + { + "epoch": 0.64, + "grad_norm": 3.969462832878753, + "learning_rate": 3.075347645323048e-06, + "loss": 0.6072, + "step": 7843 + }, + { + "epoch": 0.64, + "grad_norm": 5.640517531966812, + "learning_rate": 3.074133711335299e-06, + "loss": 0.6917, + "step": 7844 + }, + { + "epoch": 0.64, + "grad_norm": 3.9185191450811088, + "learning_rate": 3.072919910629625e-06, + "loss": 0.7194, + "step": 7845 + }, + { + "epoch": 0.64, + "grad_norm": 6.523923374258665, + "learning_rate": 3.071706243290026e-06, + "loss": 0.7285, + "step": 7846 + }, + { + "epoch": 0.64, + "grad_norm": 9.72633181915045, + "learning_rate": 3.0704927094004964e-06, + "loss": 0.6619, + "step": 7847 + }, + { + "epoch": 0.64, + "grad_norm": 2.573777202958262, + "learning_rate": 3.0692793090450217e-06, + "loss": 0.5786, + "step": 7848 + }, + { + "epoch": 0.64, + "grad_norm": 4.061796164522386, + "learning_rate": 3.068066042307576e-06, + "loss": 0.7024, + "step": 7849 + }, + { + "epoch": 0.64, + "grad_norm": 16.607891963948443, + "learning_rate": 3.0668529092721246e-06, + "loss": 0.6591, + "step": 7850 + }, + { + "epoch": 0.64, + "grad_norm": 9.213118749684922, + "learning_rate": 3.0656399100226218e-06, + "loss": 0.593, + "step": 7851 + }, + { + "epoch": 0.64, + "grad_norm": 3.569839625820368, + "learning_rate": 3.0644270446430184e-06, + "loss": 0.6297, + "step": 7852 + }, + { + "epoch": 0.64, + "grad_norm": 2.8726355481413335, + "learning_rate": 3.0632143132172503e-06, + "loss": 0.6432, + "step": 7853 + }, + { + "epoch": 0.64, + "grad_norm": 4.682413917202822, + "learning_rate": 3.062001715829247e-06, + "loss": 0.6283, + "step": 7854 + }, + { + "epoch": 0.64, + "grad_norm": 15.480529211341592, + "learning_rate": 3.0607892525629283e-06, + "loss": 0.7385, + "step": 7855 + }, + { + "epoch": 0.64, + "grad_norm": 5.937287464227921, + "learning_rate": 3.059576923502204e-06, + "loss": 0.6936, + "step": 7856 + }, + { + "epoch": 0.64, + "grad_norm": 3.0541846827672634, + "learning_rate": 3.0583647287309744e-06, + "loss": 0.6334, + "step": 7857 + }, + { + "epoch": 0.64, + "grad_norm": 7.769415266227085, + "learning_rate": 3.05715266833313e-06, + "loss": 0.8101, + "step": 7858 + }, + { + "epoch": 0.64, + "grad_norm": 3.357925122052827, + "learning_rate": 3.0559407423925536e-06, + "loss": 0.6588, + "step": 7859 + }, + { + "epoch": 0.64, + "grad_norm": 6.2840431057745345, + "learning_rate": 3.0547289509931194e-06, + "loss": 0.6509, + "step": 7860 + }, + { + "epoch": 0.64, + "grad_norm": 32.238299815732894, + "learning_rate": 3.05351729421869e-06, + "loss": 0.7241, + "step": 7861 + }, + { + "epoch": 0.64, + "grad_norm": 12.773983193619882, + "learning_rate": 3.0523057721531217e-06, + "loss": 0.6083, + "step": 7862 + }, + { + "epoch": 0.64, + "grad_norm": 9.512312813758063, + "learning_rate": 3.051094384880256e-06, + "loss": 0.4938, + "step": 7863 + }, + { + "epoch": 0.64, + "grad_norm": 4.207905589942727, + "learning_rate": 3.0498831324839294e-06, + "loss": 0.628, + "step": 7864 + }, + { + "epoch": 0.64, + "grad_norm": 4.042296915855348, + "learning_rate": 3.048672015047971e-06, + "loss": 0.7337, + "step": 7865 + }, + { + "epoch": 0.64, + "grad_norm": 5.391390544497846, + "learning_rate": 3.047461032656195e-06, + "loss": 0.8529, + "step": 7866 + }, + { + "epoch": 0.64, + "grad_norm": 4.532974773520633, + "learning_rate": 3.0462501853924088e-06, + "loss": 0.6345, + "step": 7867 + }, + { + "epoch": 0.64, + "grad_norm": 4.799622434355538, + "learning_rate": 3.0450394733404115e-06, + "loss": 0.7364, + "step": 7868 + }, + { + "epoch": 0.64, + "grad_norm": 13.465004451670874, + "learning_rate": 3.0438288965839947e-06, + "loss": 0.5904, + "step": 7869 + }, + { + "epoch": 0.64, + "grad_norm": 6.242620737775251, + "learning_rate": 3.0426184552069327e-06, + "loss": 0.6309, + "step": 7870 + }, + { + "epoch": 0.64, + "grad_norm": 6.9349626310096495, + "learning_rate": 3.0414081492929993e-06, + "loss": 0.4354, + "step": 7871 + }, + { + "epoch": 0.64, + "grad_norm": 7.43634561969817, + "learning_rate": 3.0401979789259533e-06, + "loss": 0.6585, + "step": 7872 + }, + { + "epoch": 0.64, + "grad_norm": 4.335818042484232, + "learning_rate": 3.0389879441895485e-06, + "loss": 0.6891, + "step": 7873 + }, + { + "epoch": 0.64, + "grad_norm": 5.041383983211005, + "learning_rate": 3.0377780451675243e-06, + "loss": 0.7062, + "step": 7874 + }, + { + "epoch": 0.64, + "grad_norm": 8.098438813868885, + "learning_rate": 3.036568281943615e-06, + "loss": 0.6207, + "step": 7875 + }, + { + "epoch": 0.64, + "grad_norm": 3.331323271312619, + "learning_rate": 3.035358654601545e-06, + "loss": 0.6325, + "step": 7876 + }, + { + "epoch": 0.64, + "grad_norm": 8.172012236661782, + "learning_rate": 3.034149163225025e-06, + "loss": 0.6375, + "step": 7877 + }, + { + "epoch": 0.64, + "grad_norm": 4.956578636567212, + "learning_rate": 3.03293980789776e-06, + "loss": 0.6135, + "step": 7878 + }, + { + "epoch": 0.64, + "grad_norm": 19.415122842998876, + "learning_rate": 3.0317305887034466e-06, + "loss": 0.7273, + "step": 7879 + }, + { + "epoch": 0.64, + "grad_norm": 8.885271264185215, + "learning_rate": 3.030521505725771e-06, + "loss": 0.6618, + "step": 7880 + }, + { + "epoch": 0.64, + "grad_norm": 11.457323142400096, + "learning_rate": 3.029312559048406e-06, + "loss": 0.5922, + "step": 7881 + }, + { + "epoch": 0.64, + "grad_norm": 5.7852882438453666, + "learning_rate": 3.0281037487550235e-06, + "loss": 0.5909, + "step": 7882 + }, + { + "epoch": 0.64, + "grad_norm": 4.8593047139245575, + "learning_rate": 3.0268950749292747e-06, + "loss": 0.8284, + "step": 7883 + }, + { + "epoch": 0.64, + "grad_norm": 3.070505913406939, + "learning_rate": 3.025686537654812e-06, + "loss": 0.7446, + "step": 7884 + }, + { + "epoch": 0.64, + "grad_norm": 5.877066261886097, + "learning_rate": 3.0244781370152705e-06, + "loss": 0.7533, + "step": 7885 + }, + { + "epoch": 0.64, + "grad_norm": 4.491271405104326, + "learning_rate": 3.023269873094281e-06, + "loss": 0.5516, + "step": 7886 + }, + { + "epoch": 0.64, + "grad_norm": 5.075265876201668, + "learning_rate": 3.0220617459754638e-06, + "loss": 0.6125, + "step": 7887 + }, + { + "epoch": 0.64, + "grad_norm": 2.8901075478741336, + "learning_rate": 3.020853755742428e-06, + "loss": 0.6526, + "step": 7888 + }, + { + "epoch": 0.64, + "grad_norm": 22.458422196900415, + "learning_rate": 3.0196459024787745e-06, + "loss": 0.5759, + "step": 7889 + }, + { + "epoch": 0.64, + "grad_norm": 3.9186420432648004, + "learning_rate": 3.018438186268094e-06, + "loss": 0.6842, + "step": 7890 + }, + { + "epoch": 0.64, + "grad_norm": 6.9748673377967805, + "learning_rate": 3.0172306071939666e-06, + "loss": 0.5512, + "step": 7891 + }, + { + "epoch": 0.64, + "grad_norm": 6.937072774044195, + "learning_rate": 3.0160231653399656e-06, + "loss": 0.7846, + "step": 7892 + }, + { + "epoch": 0.64, + "grad_norm": 5.135000466827242, + "learning_rate": 3.014815860789654e-06, + "loss": 0.6697, + "step": 7893 + }, + { + "epoch": 0.64, + "grad_norm": 4.299658009216886, + "learning_rate": 3.0136086936265853e-06, + "loss": 0.7657, + "step": 7894 + }, + { + "epoch": 0.64, + "grad_norm": 28.91912794836449, + "learning_rate": 3.0124016639343023e-06, + "loss": 0.6206, + "step": 7895 + }, + { + "epoch": 0.64, + "grad_norm": 4.823366619539434, + "learning_rate": 3.011194771796339e-06, + "loss": 0.6931, + "step": 7896 + }, + { + "epoch": 0.64, + "grad_norm": 4.475263454895488, + "learning_rate": 3.0099880172962197e-06, + "loss": 0.8125, + "step": 7897 + }, + { + "epoch": 0.64, + "grad_norm": 5.79657639753959, + "learning_rate": 3.008781400517461e-06, + "loss": 0.5828, + "step": 7898 + }, + { + "epoch": 0.64, + "grad_norm": 14.183035035030214, + "learning_rate": 3.007574921543565e-06, + "loss": 0.84, + "step": 7899 + }, + { + "epoch": 0.64, + "grad_norm": 9.636072225940968, + "learning_rate": 3.0063685804580306e-06, + "loss": 0.5457, + "step": 7900 + }, + { + "epoch": 0.64, + "grad_norm": 3.869545403430963, + "learning_rate": 3.005162377344343e-06, + "loss": 0.6299, + "step": 7901 + }, + { + "epoch": 0.64, + "grad_norm": 17.072487959173916, + "learning_rate": 3.0039563122859815e-06, + "loss": 0.6958, + "step": 7902 + }, + { + "epoch": 0.64, + "grad_norm": 4.569244186342764, + "learning_rate": 3.0027503853664097e-06, + "loss": 0.6301, + "step": 7903 + }, + { + "epoch": 0.64, + "grad_norm": 13.062297963859766, + "learning_rate": 3.001544596669087e-06, + "loss": 0.7271, + "step": 7904 + }, + { + "epoch": 0.64, + "grad_norm": 9.463149920474777, + "learning_rate": 3.0003389462774625e-06, + "loss": 0.6611, + "step": 7905 + }, + { + "epoch": 0.64, + "grad_norm": 4.240716493379181, + "learning_rate": 2.9991334342749725e-06, + "loss": 0.7052, + "step": 7906 + }, + { + "epoch": 0.64, + "grad_norm": 5.76850876516233, + "learning_rate": 2.9979280607450466e-06, + "loss": 0.6932, + "step": 7907 + }, + { + "epoch": 0.64, + "grad_norm": 5.742749648571481, + "learning_rate": 2.9967228257711063e-06, + "loss": 0.7938, + "step": 7908 + }, + { + "epoch": 0.64, + "grad_norm": 4.145215614532756, + "learning_rate": 2.995517729436561e-06, + "loss": 0.5623, + "step": 7909 + }, + { + "epoch": 0.64, + "grad_norm": 4.886022292183818, + "learning_rate": 2.994312771824809e-06, + "loss": 0.5979, + "step": 7910 + }, + { + "epoch": 0.64, + "grad_norm": 10.086279461239739, + "learning_rate": 2.9931079530192418e-06, + "loss": 0.6673, + "step": 7911 + }, + { + "epoch": 0.64, + "grad_norm": 5.825988826120475, + "learning_rate": 2.9919032731032406e-06, + "loss": 0.7695, + "step": 7912 + }, + { + "epoch": 0.64, + "grad_norm": 5.584574557801636, + "learning_rate": 2.990698732160178e-06, + "loss": 0.6012, + "step": 7913 + }, + { + "epoch": 0.64, + "grad_norm": 3.2434054284591785, + "learning_rate": 2.9894943302734137e-06, + "loss": 0.6601, + "step": 7914 + }, + { + "epoch": 0.64, + "grad_norm": 4.341394232799937, + "learning_rate": 2.9882900675263026e-06, + "loss": 0.62, + "step": 7915 + }, + { + "epoch": 0.64, + "grad_norm": 6.141136561869252, + "learning_rate": 2.9870859440021845e-06, + "loss": 0.6394, + "step": 7916 + }, + { + "epoch": 0.64, + "grad_norm": 9.981751957236176, + "learning_rate": 2.9858819597843923e-06, + "loss": 0.6822, + "step": 7917 + }, + { + "epoch": 0.64, + "grad_norm": 7.416965121903551, + "learning_rate": 2.9846781149562515e-06, + "loss": 0.7304, + "step": 7918 + }, + { + "epoch": 0.64, + "grad_norm": 4.274431051083681, + "learning_rate": 2.9834744096010738e-06, + "loss": 0.5364, + "step": 7919 + }, + { + "epoch": 0.64, + "grad_norm": 4.969932756795815, + "learning_rate": 2.982270843802165e-06, + "loss": 0.4988, + "step": 7920 + }, + { + "epoch": 0.64, + "grad_norm": 9.357437356718279, + "learning_rate": 2.9810674176428184e-06, + "loss": 0.7122, + "step": 7921 + }, + { + "epoch": 0.64, + "grad_norm": 3.574239072455284, + "learning_rate": 2.979864131206319e-06, + "loss": 0.6666, + "step": 7922 + }, + { + "epoch": 0.64, + "grad_norm": 3.6839782779232797, + "learning_rate": 2.9786609845759416e-06, + "loss": 0.5263, + "step": 7923 + }, + { + "epoch": 0.64, + "grad_norm": 3.3508871678142755, + "learning_rate": 2.977457977834951e-06, + "loss": 0.6691, + "step": 7924 + }, + { + "epoch": 0.64, + "grad_norm": 4.185103107716808, + "learning_rate": 2.9762551110666027e-06, + "loss": 0.7456, + "step": 7925 + }, + { + "epoch": 0.64, + "grad_norm": 9.41904428498282, + "learning_rate": 2.975052384354144e-06, + "loss": 0.6896, + "step": 7926 + }, + { + "epoch": 0.64, + "grad_norm": 3.3044620151225677, + "learning_rate": 2.97384979778081e-06, + "loss": 0.6696, + "step": 7927 + }, + { + "epoch": 0.64, + "grad_norm": 3.231869928374368, + "learning_rate": 2.972647351429828e-06, + "loss": 0.654, + "step": 7928 + }, + { + "epoch": 0.64, + "grad_norm": 3.2922325244580324, + "learning_rate": 2.9714450453844156e-06, + "loss": 0.6341, + "step": 7929 + }, + { + "epoch": 0.64, + "grad_norm": 9.100957700942446, + "learning_rate": 2.970242879727778e-06, + "loss": 0.6161, + "step": 7930 + }, + { + "epoch": 0.64, + "grad_norm": 3.983404084257971, + "learning_rate": 2.9690408545431138e-06, + "loss": 0.6426, + "step": 7931 + }, + { + "epoch": 0.64, + "grad_norm": 9.35888262770671, + "learning_rate": 2.967838969913609e-06, + "loss": 0.6427, + "step": 7932 + }, + { + "epoch": 0.64, + "grad_norm": 3.8283775129531388, + "learning_rate": 2.9666372259224442e-06, + "loss": 0.5771, + "step": 7933 + }, + { + "epoch": 0.64, + "grad_norm": 4.353232357024093, + "learning_rate": 2.9654356226527857e-06, + "loss": 0.7037, + "step": 7934 + }, + { + "epoch": 0.64, + "grad_norm": 3.1626257251002183, + "learning_rate": 2.9642341601877954e-06, + "loss": 0.7246, + "step": 7935 + }, + { + "epoch": 0.64, + "grad_norm": 4.394324187380833, + "learning_rate": 2.9630328386106165e-06, + "loss": 0.6876, + "step": 7936 + }, + { + "epoch": 0.64, + "grad_norm": 3.649140026101249, + "learning_rate": 2.9618316580043915e-06, + "loss": 0.554, + "step": 7937 + }, + { + "epoch": 0.64, + "grad_norm": 71.34016249697791, + "learning_rate": 2.9606306184522503e-06, + "loss": 0.5776, + "step": 7938 + }, + { + "epoch": 0.64, + "grad_norm": 3.720611466257484, + "learning_rate": 2.95942972003731e-06, + "loss": 0.693, + "step": 7939 + }, + { + "epoch": 0.64, + "grad_norm": 3.8802664737751256, + "learning_rate": 2.958228962842682e-06, + "loss": 0.7109, + "step": 7940 + }, + { + "epoch": 0.64, + "grad_norm": 7.826390114148268, + "learning_rate": 2.957028346951466e-06, + "loss": 0.6693, + "step": 7941 + }, + { + "epoch": 0.65, + "grad_norm": 3.4790995891865046, + "learning_rate": 2.955827872446753e-06, + "loss": 0.5502, + "step": 7942 + }, + { + "epoch": 0.65, + "grad_norm": 13.062916621124748, + "learning_rate": 2.954627539411621e-06, + "loss": 0.7426, + "step": 7943 + }, + { + "epoch": 0.65, + "grad_norm": 4.8941872484225915, + "learning_rate": 2.953427347929142e-06, + "loss": 0.7372, + "step": 7944 + }, + { + "epoch": 0.65, + "grad_norm": 3.2018462171097166, + "learning_rate": 2.9522272980823773e-06, + "loss": 0.8031, + "step": 7945 + }, + { + "epoch": 0.65, + "grad_norm": 4.230947170917448, + "learning_rate": 2.9510273899543774e-06, + "loss": 0.5772, + "step": 7946 + }, + { + "epoch": 0.65, + "grad_norm": 4.036943717057517, + "learning_rate": 2.949827623628183e-06, + "loss": 0.6124, + "step": 7947 + }, + { + "epoch": 0.65, + "grad_norm": 2.3796032971634973, + "learning_rate": 2.948627999186826e-06, + "loss": 0.6637, + "step": 7948 + }, + { + "epoch": 0.65, + "grad_norm": 2.765529867414433, + "learning_rate": 2.9474285167133297e-06, + "loss": 0.5128, + "step": 7949 + }, + { + "epoch": 0.65, + "grad_norm": 10.674421582559516, + "learning_rate": 2.9462291762907024e-06, + "loss": 0.7475, + "step": 7950 + }, + { + "epoch": 0.65, + "grad_norm": 4.064782182364504, + "learning_rate": 2.9450299780019476e-06, + "loss": 0.6172, + "step": 7951 + }, + { + "epoch": 0.65, + "grad_norm": 3.703773336780566, + "learning_rate": 2.9438309219300578e-06, + "loss": 0.7579, + "step": 7952 + }, + { + "epoch": 0.65, + "grad_norm": 5.634757393313631, + "learning_rate": 2.942632008158015e-06, + "loss": 0.5659, + "step": 7953 + }, + { + "epoch": 0.65, + "grad_norm": 6.474832530656683, + "learning_rate": 2.9414332367687914e-06, + "loss": 0.8499, + "step": 7954 + }, + { + "epoch": 0.65, + "grad_norm": 5.428702717606891, + "learning_rate": 2.9402346078453513e-06, + "loss": 0.64, + "step": 7955 + }, + { + "epoch": 0.65, + "grad_norm": 7.94889601818546, + "learning_rate": 2.9390361214706443e-06, + "loss": 0.7082, + "step": 7956 + }, + { + "epoch": 0.65, + "grad_norm": 3.145697355851298, + "learning_rate": 2.9378377777276134e-06, + "loss": 0.6718, + "step": 7957 + }, + { + "epoch": 0.65, + "grad_norm": 3.0426558195478544, + "learning_rate": 2.936639576699194e-06, + "loss": 0.6516, + "step": 7958 + }, + { + "epoch": 0.65, + "grad_norm": 4.912128450013469, + "learning_rate": 2.935441518468307e-06, + "loss": 0.6268, + "step": 7959 + }, + { + "epoch": 0.65, + "grad_norm": 7.756751233757057, + "learning_rate": 2.9342436031178677e-06, + "loss": 0.7175, + "step": 7960 + }, + { + "epoch": 0.65, + "grad_norm": 2.774060271416822, + "learning_rate": 2.9330458307307774e-06, + "loss": 0.596, + "step": 7961 + }, + { + "epoch": 0.65, + "grad_norm": 6.143130770325557, + "learning_rate": 2.9318482013899306e-06, + "loss": 0.6759, + "step": 7962 + }, + { + "epoch": 0.65, + "grad_norm": 3.0279437038225416, + "learning_rate": 2.930650715178211e-06, + "loss": 0.7146, + "step": 7963 + }, + { + "epoch": 0.65, + "grad_norm": 3.6286172762672035, + "learning_rate": 2.92945337217849e-06, + "loss": 0.6897, + "step": 7964 + }, + { + "epoch": 0.65, + "grad_norm": 3.3096147681560106, + "learning_rate": 2.9282561724736335e-06, + "loss": 0.7997, + "step": 7965 + }, + { + "epoch": 0.65, + "grad_norm": 7.5184808043191635, + "learning_rate": 2.9270591161464946e-06, + "loss": 0.5893, + "step": 7966 + }, + { + "epoch": 0.65, + "grad_norm": 6.4450835896070044, + "learning_rate": 2.9258622032799165e-06, + "loss": 0.7474, + "step": 7967 + }, + { + "epoch": 0.65, + "grad_norm": 4.141192623134453, + "learning_rate": 2.9246654339567373e-06, + "loss": 0.7651, + "step": 7968 + }, + { + "epoch": 0.65, + "grad_norm": 8.341503198541163, + "learning_rate": 2.923468808259774e-06, + "loss": 0.5452, + "step": 7969 + }, + { + "epoch": 0.65, + "grad_norm": 4.912407446912591, + "learning_rate": 2.9222723262718456e-06, + "loss": 0.6942, + "step": 7970 + }, + { + "epoch": 0.65, + "grad_norm": 4.805692582141735, + "learning_rate": 2.921075988075753e-06, + "loss": 0.6534, + "step": 7971 + }, + { + "epoch": 0.65, + "grad_norm": 4.009331023721363, + "learning_rate": 2.9198797937542935e-06, + "loss": 0.68, + "step": 7972 + }, + { + "epoch": 0.65, + "grad_norm": 4.273268378117469, + "learning_rate": 2.918683743390248e-06, + "loss": 0.688, + "step": 7973 + }, + { + "epoch": 0.65, + "grad_norm": 6.026614079388815, + "learning_rate": 2.917487837066395e-06, + "loss": 0.7275, + "step": 7974 + }, + { + "epoch": 0.65, + "grad_norm": 5.313160018907861, + "learning_rate": 2.9162920748654955e-06, + "loss": 0.6443, + "step": 7975 + }, + { + "epoch": 0.65, + "grad_norm": 3.607403742104908, + "learning_rate": 2.915096456870305e-06, + "loss": 0.6542, + "step": 7976 + }, + { + "epoch": 0.65, + "grad_norm": 2.862426215231995, + "learning_rate": 2.913900983163565e-06, + "loss": 0.5966, + "step": 7977 + }, + { + "epoch": 0.65, + "grad_norm": 2.941519325575196, + "learning_rate": 2.9127056538280142e-06, + "loss": 0.7731, + "step": 7978 + }, + { + "epoch": 0.65, + "grad_norm": 3.8829764860047624, + "learning_rate": 2.9115104689463724e-06, + "loss": 0.7495, + "step": 7979 + }, + { + "epoch": 0.65, + "grad_norm": 4.846146106556295, + "learning_rate": 2.910315428601359e-06, + "loss": 0.6769, + "step": 7980 + }, + { + "epoch": 0.65, + "grad_norm": 5.43896740777116, + "learning_rate": 2.9091205328756755e-06, + "loss": 0.7169, + "step": 7981 + }, + { + "epoch": 0.65, + "grad_norm": 6.055833871896981, + "learning_rate": 2.907925781852017e-06, + "loss": 0.6893, + "step": 7982 + }, + { + "epoch": 0.65, + "grad_norm": 2.3383098832766898, + "learning_rate": 2.906731175613066e-06, + "loss": 0.4464, + "step": 7983 + }, + { + "epoch": 0.65, + "grad_norm": 4.4110259476385485, + "learning_rate": 2.905536714241497e-06, + "loss": 0.6174, + "step": 7984 + }, + { + "epoch": 0.65, + "grad_norm": 3.9106396925019045, + "learning_rate": 2.9043423978199764e-06, + "loss": 0.7601, + "step": 7985 + }, + { + "epoch": 0.65, + "grad_norm": 4.928270083696889, + "learning_rate": 2.903148226431155e-06, + "loss": 0.5821, + "step": 7986 + }, + { + "epoch": 0.65, + "grad_norm": 3.418163235308268, + "learning_rate": 2.901954200157682e-06, + "loss": 0.7019, + "step": 7987 + }, + { + "epoch": 0.65, + "grad_norm": 2.5586526169629757, + "learning_rate": 2.900760319082189e-06, + "loss": 0.5839, + "step": 7988 + }, + { + "epoch": 0.65, + "grad_norm": 8.021811364147736, + "learning_rate": 2.899566583287299e-06, + "loss": 0.7865, + "step": 7989 + }, + { + "epoch": 0.65, + "grad_norm": 2.5165847932544025, + "learning_rate": 2.898372992855627e-06, + "loss": 0.5454, + "step": 7990 + }, + { + "epoch": 0.65, + "grad_norm": 3.2709619933447973, + "learning_rate": 2.897179547869775e-06, + "loss": 0.6818, + "step": 7991 + }, + { + "epoch": 0.65, + "grad_norm": 3.5076440096854045, + "learning_rate": 2.8959862484123407e-06, + "loss": 0.8244, + "step": 7992 + }, + { + "epoch": 0.65, + "grad_norm": 9.566804698825475, + "learning_rate": 2.8947930945659043e-06, + "loss": 0.6592, + "step": 7993 + }, + { + "epoch": 0.65, + "grad_norm": 3.746943799016381, + "learning_rate": 2.8936000864130427e-06, + "loss": 0.6335, + "step": 7994 + }, + { + "epoch": 0.65, + "grad_norm": 4.284303628858223, + "learning_rate": 2.8924072240363182e-06, + "loss": 0.7517, + "step": 7995 + }, + { + "epoch": 0.65, + "grad_norm": 2.719195740432593, + "learning_rate": 2.8912145075182844e-06, + "loss": 0.5689, + "step": 7996 + }, + { + "epoch": 0.65, + "grad_norm": 7.210201003929519, + "learning_rate": 2.890021936941483e-06, + "loss": 0.816, + "step": 7997 + }, + { + "epoch": 0.65, + "grad_norm": 4.9637647127954665, + "learning_rate": 2.8888295123884507e-06, + "loss": 0.5769, + "step": 7998 + }, + { + "epoch": 0.65, + "grad_norm": 3.344290299609074, + "learning_rate": 2.887637233941709e-06, + "loss": 0.7094, + "step": 7999 + }, + { + "epoch": 0.65, + "grad_norm": 3.8171927691933503, + "learning_rate": 2.8864451016837703e-06, + "loss": 0.6812, + "step": 8000 + }, + { + "epoch": 0.65, + "grad_norm": 7.207018494530118, + "learning_rate": 2.88525311569714e-06, + "loss": 0.6815, + "step": 8001 + }, + { + "epoch": 0.65, + "grad_norm": 3.379976828641701, + "learning_rate": 2.884061276064309e-06, + "loss": 0.6165, + "step": 8002 + }, + { + "epoch": 0.65, + "grad_norm": 3.018867886467191, + "learning_rate": 2.882869582867761e-06, + "loss": 0.752, + "step": 8003 + }, + { + "epoch": 0.65, + "grad_norm": 3.4021895370054533, + "learning_rate": 2.8816780361899664e-06, + "loss": 0.6062, + "step": 8004 + }, + { + "epoch": 0.65, + "grad_norm": 4.569776692017455, + "learning_rate": 2.880486636113392e-06, + "loss": 0.6702, + "step": 8005 + }, + { + "epoch": 0.65, + "grad_norm": 5.325271848292634, + "learning_rate": 2.8792953827204884e-06, + "loss": 0.7343, + "step": 8006 + }, + { + "epoch": 0.65, + "grad_norm": 3.2729458613252556, + "learning_rate": 2.878104276093695e-06, + "loss": 0.6263, + "step": 8007 + }, + { + "epoch": 0.65, + "grad_norm": 6.023866598421868, + "learning_rate": 2.87691331631545e-06, + "loss": 0.6179, + "step": 8008 + }, + { + "epoch": 0.65, + "grad_norm": 3.4525442210827513, + "learning_rate": 2.875722503468168e-06, + "loss": 0.7279, + "step": 8009 + }, + { + "epoch": 0.65, + "grad_norm": 6.522536383425385, + "learning_rate": 2.874531837634266e-06, + "loss": 0.6595, + "step": 8010 + }, + { + "epoch": 0.65, + "grad_norm": 2.6069834590391516, + "learning_rate": 2.8733413188961416e-06, + "loss": 0.6464, + "step": 8011 + }, + { + "epoch": 0.65, + "grad_norm": 3.833461329120667, + "learning_rate": 2.872150947336191e-06, + "loss": 0.5774, + "step": 8012 + }, + { + "epoch": 0.65, + "grad_norm": 14.295096291704, + "learning_rate": 2.870960723036793e-06, + "loss": 0.7079, + "step": 8013 + }, + { + "epoch": 0.65, + "grad_norm": 3.4686198610194823, + "learning_rate": 2.869770646080316e-06, + "loss": 0.7404, + "step": 8014 + }, + { + "epoch": 0.65, + "grad_norm": 3.694525856744964, + "learning_rate": 2.8685807165491275e-06, + "loss": 0.7049, + "step": 8015 + }, + { + "epoch": 0.65, + "grad_norm": 3.5126109050783083, + "learning_rate": 2.86739093452557e-06, + "loss": 0.6991, + "step": 8016 + }, + { + "epoch": 0.65, + "grad_norm": 5.27960053021177, + "learning_rate": 2.8662013000919897e-06, + "loss": 0.7257, + "step": 8017 + }, + { + "epoch": 0.65, + "grad_norm": 2.915218762908385, + "learning_rate": 2.865011813330713e-06, + "loss": 0.6121, + "step": 8018 + }, + { + "epoch": 0.65, + "grad_norm": 2.9744899449321274, + "learning_rate": 2.863822474324064e-06, + "loss": 0.7253, + "step": 8019 + }, + { + "epoch": 0.65, + "grad_norm": 3.063975824891128, + "learning_rate": 2.862633283154348e-06, + "loss": 0.6413, + "step": 8020 + }, + { + "epoch": 0.65, + "grad_norm": 3.476180188181614, + "learning_rate": 2.8614442399038713e-06, + "loss": 0.5547, + "step": 8021 + }, + { + "epoch": 0.65, + "grad_norm": 3.5944207966133868, + "learning_rate": 2.860255344654914e-06, + "loss": 0.61, + "step": 8022 + }, + { + "epoch": 0.65, + "grad_norm": 4.9939928969105285, + "learning_rate": 2.8590665974897626e-06, + "loss": 0.5697, + "step": 8023 + }, + { + "epoch": 0.65, + "grad_norm": 2.609342772961525, + "learning_rate": 2.857877998490682e-06, + "loss": 0.6324, + "step": 8024 + }, + { + "epoch": 0.65, + "grad_norm": 3.4910581802080674, + "learning_rate": 2.8566895477399303e-06, + "loss": 0.758, + "step": 8025 + }, + { + "epoch": 0.65, + "grad_norm": 22.555353816985132, + "learning_rate": 2.8555012453197594e-06, + "loss": 0.7427, + "step": 8026 + }, + { + "epoch": 0.65, + "grad_norm": 3.3924341478690874, + "learning_rate": 2.8543130913124036e-06, + "loss": 0.6537, + "step": 8027 + }, + { + "epoch": 0.65, + "grad_norm": 6.540463838343723, + "learning_rate": 2.853125085800096e-06, + "loss": 0.7307, + "step": 8028 + }, + { + "epoch": 0.65, + "grad_norm": 4.608088004437503, + "learning_rate": 2.851937228865046e-06, + "loss": 0.7204, + "step": 8029 + }, + { + "epoch": 0.65, + "grad_norm": 4.750358123066622, + "learning_rate": 2.850749520589467e-06, + "loss": 0.7658, + "step": 8030 + }, + { + "epoch": 0.65, + "grad_norm": 5.290827648545436, + "learning_rate": 2.849561961055554e-06, + "loss": 0.758, + "step": 8031 + }, + { + "epoch": 0.65, + "grad_norm": 3.0124113658921785, + "learning_rate": 2.848374550345492e-06, + "loss": 0.6691, + "step": 8032 + }, + { + "epoch": 0.65, + "grad_norm": 3.548019509451185, + "learning_rate": 2.847187288541461e-06, + "loss": 0.7344, + "step": 8033 + }, + { + "epoch": 0.65, + "grad_norm": 3.2523457725426064, + "learning_rate": 2.8460001757256225e-06, + "loss": 0.6396, + "step": 8034 + }, + { + "epoch": 0.65, + "grad_norm": 3.04257344946543, + "learning_rate": 2.8448132119801387e-06, + "loss": 0.7396, + "step": 8035 + }, + { + "epoch": 0.65, + "grad_norm": 8.140818571161578, + "learning_rate": 2.843626397387146e-06, + "loss": 0.6545, + "step": 8036 + }, + { + "epoch": 0.65, + "grad_norm": 2.7542264824030367, + "learning_rate": 2.842439732028787e-06, + "loss": 0.7558, + "step": 8037 + }, + { + "epoch": 0.65, + "grad_norm": 6.4777330081682685, + "learning_rate": 2.8412532159871835e-06, + "loss": 0.7565, + "step": 8038 + }, + { + "epoch": 0.65, + "grad_norm": 2.9347423493914984, + "learning_rate": 2.840066849344448e-06, + "loss": 0.6668, + "step": 8039 + }, + { + "epoch": 0.65, + "grad_norm": 14.075107441588635, + "learning_rate": 2.838880632182689e-06, + "loss": 0.6859, + "step": 8040 + }, + { + "epoch": 0.65, + "grad_norm": 2.8346265119003657, + "learning_rate": 2.837694564583997e-06, + "loss": 0.709, + "step": 8041 + }, + { + "epoch": 0.65, + "grad_norm": 8.108622104523713, + "learning_rate": 2.836508646630457e-06, + "loss": 0.6318, + "step": 8042 + }, + { + "epoch": 0.65, + "grad_norm": 2.446374545868996, + "learning_rate": 2.835322878404139e-06, + "loss": 0.5959, + "step": 8043 + }, + { + "epoch": 0.65, + "grad_norm": 2.6309871248743137, + "learning_rate": 2.834137259987109e-06, + "loss": 0.8056, + "step": 8044 + }, + { + "epoch": 0.65, + "grad_norm": 4.897188935304473, + "learning_rate": 2.832951791461417e-06, + "loss": 0.7282, + "step": 8045 + }, + { + "epoch": 0.65, + "grad_norm": 3.230830528350498, + "learning_rate": 2.831766472909107e-06, + "loss": 0.7623, + "step": 8046 + }, + { + "epoch": 0.65, + "grad_norm": 5.104243282469706, + "learning_rate": 2.83058130441221e-06, + "loss": 0.6459, + "step": 8047 + }, + { + "epoch": 0.65, + "grad_norm": 3.895120005138428, + "learning_rate": 2.8293962860527463e-06, + "loss": 0.6943, + "step": 8048 + }, + { + "epoch": 0.65, + "grad_norm": 2.669712617957591, + "learning_rate": 2.828211417912727e-06, + "loss": 0.6413, + "step": 8049 + }, + { + "epoch": 0.65, + "grad_norm": 7.4947910697641635, + "learning_rate": 2.82702670007415e-06, + "loss": 0.6932, + "step": 8050 + }, + { + "epoch": 0.65, + "grad_norm": 3.3743044314570665, + "learning_rate": 2.82584213261901e-06, + "loss": 0.6865, + "step": 8051 + }, + { + "epoch": 0.65, + "grad_norm": 6.645393736516758, + "learning_rate": 2.8246577156292814e-06, + "loss": 0.7015, + "step": 8052 + }, + { + "epoch": 0.65, + "grad_norm": 4.256401060582648, + "learning_rate": 2.8234734491869388e-06, + "loss": 0.6964, + "step": 8053 + }, + { + "epoch": 0.65, + "grad_norm": 3.5846555797881288, + "learning_rate": 2.822289333373937e-06, + "loss": 0.6724, + "step": 8054 + }, + { + "epoch": 0.65, + "grad_norm": 17.647839782732696, + "learning_rate": 2.821105368272226e-06, + "loss": 0.5622, + "step": 8055 + }, + { + "epoch": 0.65, + "grad_norm": 2.9535726185612163, + "learning_rate": 2.8199215539637427e-06, + "loss": 0.7646, + "step": 8056 + }, + { + "epoch": 0.65, + "grad_norm": 6.010230176300674, + "learning_rate": 2.818737890530413e-06, + "loss": 0.6683, + "step": 8057 + }, + { + "epoch": 0.65, + "grad_norm": 4.239535044533543, + "learning_rate": 2.8175543780541583e-06, + "loss": 0.8028, + "step": 8058 + }, + { + "epoch": 0.65, + "grad_norm": 3.2369724267153903, + "learning_rate": 2.816371016616879e-06, + "loss": 0.7309, + "step": 8059 + }, + { + "epoch": 0.65, + "grad_norm": 5.9465045609551614, + "learning_rate": 2.815187806300478e-06, + "loss": 0.5982, + "step": 8060 + }, + { + "epoch": 0.65, + "grad_norm": 8.21157044415239, + "learning_rate": 2.8140047471868364e-06, + "loss": 0.7354, + "step": 8061 + }, + { + "epoch": 0.65, + "grad_norm": 10.13291753451248, + "learning_rate": 2.812821839357831e-06, + "loss": 0.7539, + "step": 8062 + }, + { + "epoch": 0.65, + "grad_norm": 5.9758329503293695, + "learning_rate": 2.8116390828953257e-06, + "loss": 0.691, + "step": 8063 + }, + { + "epoch": 0.65, + "grad_norm": 4.822796671840841, + "learning_rate": 2.8104564778811735e-06, + "loss": 0.6881, + "step": 8064 + }, + { + "epoch": 0.66, + "grad_norm": 5.331684154175175, + "learning_rate": 2.8092740243972205e-06, + "loss": 0.6795, + "step": 8065 + }, + { + "epoch": 0.66, + "grad_norm": 3.737394938090677, + "learning_rate": 2.8080917225252977e-06, + "loss": 0.6896, + "step": 8066 + }, + { + "epoch": 0.66, + "grad_norm": 2.5397599396491572, + "learning_rate": 2.806909572347231e-06, + "loss": 0.653, + "step": 8067 + }, + { + "epoch": 0.66, + "grad_norm": 5.553295341303807, + "learning_rate": 2.805727573944831e-06, + "loss": 0.6577, + "step": 8068 + }, + { + "epoch": 0.66, + "grad_norm": 2.7603569793241465, + "learning_rate": 2.804545727399899e-06, + "loss": 0.5979, + "step": 8069 + }, + { + "epoch": 0.66, + "grad_norm": 4.30557996129583, + "learning_rate": 2.8033640327942235e-06, + "loss": 0.7343, + "step": 8070 + }, + { + "epoch": 0.66, + "grad_norm": 3.7599864026680043, + "learning_rate": 2.8021824902095914e-06, + "loss": 0.7741, + "step": 8071 + }, + { + "epoch": 0.66, + "grad_norm": 4.425154338026033, + "learning_rate": 2.8010010997277692e-06, + "loss": 0.6454, + "step": 8072 + }, + { + "epoch": 0.66, + "grad_norm": 9.292637563655099, + "learning_rate": 2.7998198614305145e-06, + "loss": 0.6662, + "step": 8073 + }, + { + "epoch": 0.66, + "grad_norm": 3.0078072550963215, + "learning_rate": 2.798638775399583e-06, + "loss": 0.6474, + "step": 8074 + }, + { + "epoch": 0.66, + "grad_norm": 2.4950065604598426, + "learning_rate": 2.7974578417167052e-06, + "loss": 0.6051, + "step": 8075 + }, + { + "epoch": 0.66, + "grad_norm": 2.77412315501453, + "learning_rate": 2.796277060463616e-06, + "loss": 0.6506, + "step": 8076 + }, + { + "epoch": 0.66, + "grad_norm": 89.2254811421721, + "learning_rate": 2.7950964317220266e-06, + "loss": 0.638, + "step": 8077 + }, + { + "epoch": 0.66, + "grad_norm": 9.063342351241381, + "learning_rate": 2.79391595557365e-06, + "loss": 0.6091, + "step": 8078 + }, + { + "epoch": 0.66, + "grad_norm": 7.1340359037385115, + "learning_rate": 2.79273563210018e-06, + "loss": 0.5987, + "step": 8079 + }, + { + "epoch": 0.66, + "grad_norm": 3.740413829044198, + "learning_rate": 2.7915554613833e-06, + "loss": 0.6965, + "step": 8080 + }, + { + "epoch": 0.66, + "grad_norm": 3.089046791368479, + "learning_rate": 2.7903754435046914e-06, + "loss": 0.6006, + "step": 8081 + }, + { + "epoch": 0.66, + "grad_norm": 12.496913360956905, + "learning_rate": 2.7891955785460124e-06, + "loss": 0.7316, + "step": 8082 + }, + { + "epoch": 0.66, + "grad_norm": 2.263579399050377, + "learning_rate": 2.78801586658892e-06, + "loss": 0.7201, + "step": 8083 + }, + { + "epoch": 0.66, + "grad_norm": 3.128008181608036, + "learning_rate": 2.786836307715056e-06, + "loss": 0.671, + "step": 8084 + }, + { + "epoch": 0.66, + "grad_norm": 5.005441463228108, + "learning_rate": 2.7856569020060576e-06, + "loss": 0.6981, + "step": 8085 + }, + { + "epoch": 0.66, + "grad_norm": 3.5765022728610334, + "learning_rate": 2.7844776495435435e-06, + "loss": 0.7256, + "step": 8086 + }, + { + "epoch": 0.66, + "grad_norm": 2.916427159046024, + "learning_rate": 2.7832985504091242e-06, + "loss": 0.6826, + "step": 8087 + }, + { + "epoch": 0.66, + "grad_norm": 2.259328775828344, + "learning_rate": 2.782119604684407e-06, + "loss": 0.784, + "step": 8088 + }, + { + "epoch": 0.66, + "grad_norm": 3.011025343382135, + "learning_rate": 2.780940812450974e-06, + "loss": 0.6897, + "step": 8089 + }, + { + "epoch": 0.66, + "grad_norm": 9.132058530184704, + "learning_rate": 2.779762173790411e-06, + "loss": 0.7162, + "step": 8090 + }, + { + "epoch": 0.66, + "grad_norm": 3.8341124318769713, + "learning_rate": 2.778583688784283e-06, + "loss": 0.6693, + "step": 8091 + }, + { + "epoch": 0.66, + "grad_norm": 5.064764614926233, + "learning_rate": 2.7774053575141534e-06, + "loss": 0.675, + "step": 8092 + }, + { + "epoch": 0.66, + "grad_norm": 4.023152511188378, + "learning_rate": 2.7762271800615654e-06, + "loss": 0.4862, + "step": 8093 + }, + { + "epoch": 0.66, + "grad_norm": 4.7788938895923625, + "learning_rate": 2.7750491565080628e-06, + "loss": 0.5944, + "step": 8094 + }, + { + "epoch": 0.66, + "grad_norm": 4.192379958973519, + "learning_rate": 2.773871286935164e-06, + "loss": 0.5599, + "step": 8095 + }, + { + "epoch": 0.66, + "grad_norm": 3.3720122497226215, + "learning_rate": 2.772693571424391e-06, + "loss": 0.5955, + "step": 8096 + }, + { + "epoch": 0.66, + "grad_norm": 2.5577324112883493, + "learning_rate": 2.771516010057247e-06, + "loss": 0.6597, + "step": 8097 + }, + { + "epoch": 0.66, + "grad_norm": 3.6533423604480904, + "learning_rate": 2.7703386029152246e-06, + "loss": 0.6618, + "step": 8098 + }, + { + "epoch": 0.66, + "grad_norm": 2.9683340010962227, + "learning_rate": 2.769161350079812e-06, + "loss": 0.5322, + "step": 8099 + }, + { + "epoch": 0.66, + "grad_norm": 3.708425913454942, + "learning_rate": 2.767984251632479e-06, + "loss": 0.7231, + "step": 8100 + }, + { + "epoch": 0.66, + "grad_norm": 2.3598118702840942, + "learning_rate": 2.7668073076546936e-06, + "loss": 0.7126, + "step": 8101 + }, + { + "epoch": 0.66, + "grad_norm": 2.4044414523470983, + "learning_rate": 2.7656305182279e-06, + "loss": 0.7585, + "step": 8102 + }, + { + "epoch": 0.66, + "grad_norm": 2.879798893839328, + "learning_rate": 2.7644538834335446e-06, + "loss": 0.6973, + "step": 8103 + }, + { + "epoch": 0.66, + "grad_norm": 3.752774397731517, + "learning_rate": 2.7632774033530575e-06, + "loss": 0.6072, + "step": 8104 + }, + { + "epoch": 0.66, + "grad_norm": 5.784226603803175, + "learning_rate": 2.7621010780678546e-06, + "loss": 0.572, + "step": 8105 + }, + { + "epoch": 0.66, + "grad_norm": 5.218206124068918, + "learning_rate": 2.7609249076593507e-06, + "loss": 0.7062, + "step": 8106 + }, + { + "epoch": 0.66, + "grad_norm": 3.942241814504858, + "learning_rate": 2.75974889220894e-06, + "loss": 0.6435, + "step": 8107 + }, + { + "epoch": 0.66, + "grad_norm": 3.6680174676185695, + "learning_rate": 2.7585730317980154e-06, + "loss": 0.6204, + "step": 8108 + }, + { + "epoch": 0.66, + "grad_norm": 6.0356555201357365, + "learning_rate": 2.7573973265079456e-06, + "loss": 0.7886, + "step": 8109 + }, + { + "epoch": 0.66, + "grad_norm": 4.179336771332034, + "learning_rate": 2.756221776420104e-06, + "loss": 0.4285, + "step": 8110 + }, + { + "epoch": 0.66, + "grad_norm": 2.586153514957241, + "learning_rate": 2.7550463816158437e-06, + "loss": 0.9171, + "step": 8111 + }, + { + "epoch": 0.66, + "grad_norm": 2.574352646633711, + "learning_rate": 2.753871142176506e-06, + "loss": 0.6009, + "step": 8112 + }, + { + "epoch": 0.66, + "grad_norm": 5.190698838544873, + "learning_rate": 2.7526960581834316e-06, + "loss": 0.7553, + "step": 8113 + }, + { + "epoch": 0.66, + "grad_norm": 3.3618261508897382, + "learning_rate": 2.751521129717939e-06, + "loss": 0.6209, + "step": 8114 + }, + { + "epoch": 0.66, + "grad_norm": 3.2833187956151426, + "learning_rate": 2.7503463568613425e-06, + "loss": 0.6505, + "step": 8115 + }, + { + "epoch": 0.66, + "grad_norm": 3.4587743625127083, + "learning_rate": 2.749171739694941e-06, + "loss": 0.6915, + "step": 8116 + }, + { + "epoch": 0.66, + "grad_norm": 3.5522995137920397, + "learning_rate": 2.747997278300029e-06, + "loss": 0.6012, + "step": 8117 + }, + { + "epoch": 0.66, + "grad_norm": 8.236393816434713, + "learning_rate": 2.7468229727578836e-06, + "loss": 0.6107, + "step": 8118 + }, + { + "epoch": 0.66, + "grad_norm": 4.267780455576585, + "learning_rate": 2.745648823149778e-06, + "loss": 1.0247, + "step": 8119 + }, + { + "epoch": 0.66, + "grad_norm": 3.6651676247798903, + "learning_rate": 2.744474829556968e-06, + "loss": 0.814, + "step": 8120 + }, + { + "epoch": 0.66, + "grad_norm": 3.862417019236534, + "learning_rate": 2.743300992060701e-06, + "loss": 0.7157, + "step": 8121 + }, + { + "epoch": 0.66, + "grad_norm": 4.809328871151626, + "learning_rate": 2.7421273107422157e-06, + "loss": 0.698, + "step": 8122 + }, + { + "epoch": 0.66, + "grad_norm": 5.306268007817394, + "learning_rate": 2.740953785682735e-06, + "loss": 0.6299, + "step": 8123 + }, + { + "epoch": 0.66, + "grad_norm": 3.2365114206748578, + "learning_rate": 2.7397804169634785e-06, + "loss": 0.6519, + "step": 8124 + }, + { + "epoch": 0.66, + "grad_norm": 3.9779067661232146, + "learning_rate": 2.7386072046656466e-06, + "loss": 0.6022, + "step": 8125 + }, + { + "epoch": 0.66, + "grad_norm": 2.8569241628554756, + "learning_rate": 2.737434148870437e-06, + "loss": 0.6969, + "step": 8126 + }, + { + "epoch": 0.66, + "grad_norm": 8.249849309220107, + "learning_rate": 2.73626124965903e-06, + "loss": 0.6716, + "step": 8127 + }, + { + "epoch": 0.66, + "grad_norm": 5.473925444958698, + "learning_rate": 2.7350885071125993e-06, + "loss": 0.5792, + "step": 8128 + }, + { + "epoch": 0.66, + "grad_norm": 14.151669351812043, + "learning_rate": 2.7339159213123047e-06, + "loss": 0.7819, + "step": 8129 + }, + { + "epoch": 0.66, + "grad_norm": 3.271079193189441, + "learning_rate": 2.732743492339294e-06, + "loss": 0.7488, + "step": 8130 + }, + { + "epoch": 0.66, + "grad_norm": 2.8123534476903784, + "learning_rate": 2.7315712202747123e-06, + "loss": 0.5749, + "step": 8131 + }, + { + "epoch": 0.66, + "grad_norm": 3.080516209679372, + "learning_rate": 2.730399105199683e-06, + "loss": 0.7644, + "step": 8132 + }, + { + "epoch": 0.66, + "grad_norm": 9.666565114778587, + "learning_rate": 2.7292271471953287e-06, + "loss": 0.6489, + "step": 8133 + }, + { + "epoch": 0.66, + "grad_norm": 4.768538734082018, + "learning_rate": 2.728055346342753e-06, + "loss": 0.7818, + "step": 8134 + }, + { + "epoch": 0.66, + "grad_norm": 2.897447991631518, + "learning_rate": 2.7268837027230532e-06, + "loss": 0.8416, + "step": 8135 + }, + { + "epoch": 0.66, + "grad_norm": 2.2968322726515984, + "learning_rate": 2.725712216417314e-06, + "loss": 0.5824, + "step": 8136 + }, + { + "epoch": 0.66, + "grad_norm": 6.151412765665947, + "learning_rate": 2.724540887506607e-06, + "loss": 0.6942, + "step": 8137 + }, + { + "epoch": 0.66, + "grad_norm": 4.983144827871983, + "learning_rate": 2.7233697160720006e-06, + "loss": 0.6166, + "step": 8138 + }, + { + "epoch": 0.66, + "grad_norm": 10.496092195886966, + "learning_rate": 2.7221987021945424e-06, + "loss": 0.6779, + "step": 8139 + }, + { + "epoch": 0.66, + "grad_norm": 5.399288633401184, + "learning_rate": 2.7210278459552786e-06, + "loss": 0.6359, + "step": 8140 + }, + { + "epoch": 0.66, + "grad_norm": 40.49926859289402, + "learning_rate": 2.7198571474352365e-06, + "loss": 0.6678, + "step": 8141 + }, + { + "epoch": 0.66, + "grad_norm": 4.068194332199744, + "learning_rate": 2.7186866067154377e-06, + "loss": 0.6917, + "step": 8142 + }, + { + "epoch": 0.66, + "grad_norm": 4.214361757073315, + "learning_rate": 2.717516223876888e-06, + "loss": 0.6056, + "step": 8143 + }, + { + "epoch": 0.66, + "grad_norm": 2.366611601620602, + "learning_rate": 2.7163459990005885e-06, + "loss": 0.5489, + "step": 8144 + }, + { + "epoch": 0.66, + "grad_norm": 3.433824867741869, + "learning_rate": 2.715175932167525e-06, + "loss": 0.6497, + "step": 8145 + }, + { + "epoch": 0.66, + "grad_norm": 8.416421347979638, + "learning_rate": 2.714006023458673e-06, + "loss": 0.6585, + "step": 8146 + }, + { + "epoch": 0.66, + "grad_norm": 4.370174083445899, + "learning_rate": 2.712836272955001e-06, + "loss": 0.5897, + "step": 8147 + }, + { + "epoch": 0.66, + "grad_norm": 7.079149995938893, + "learning_rate": 2.7116666807374557e-06, + "loss": 0.6861, + "step": 8148 + }, + { + "epoch": 0.66, + "grad_norm": 4.123743866742354, + "learning_rate": 2.7104972468869867e-06, + "loss": 0.669, + "step": 8149 + }, + { + "epoch": 0.66, + "grad_norm": 3.310417937479774, + "learning_rate": 2.7093279714845223e-06, + "loss": 0.6096, + "step": 8150 + }, + { + "epoch": 0.66, + "grad_norm": 7.5286076165290625, + "learning_rate": 2.7081588546109875e-06, + "loss": 0.7812, + "step": 8151 + }, + { + "epoch": 0.66, + "grad_norm": 16.493016333175053, + "learning_rate": 2.7069898963472906e-06, + "loss": 0.7276, + "step": 8152 + }, + { + "epoch": 0.66, + "grad_norm": 4.300271825650717, + "learning_rate": 2.7058210967743294e-06, + "loss": 0.6913, + "step": 8153 + }, + { + "epoch": 0.66, + "grad_norm": 4.57384285911414, + "learning_rate": 2.704652455972997e-06, + "loss": 0.7158, + "step": 8154 + }, + { + "epoch": 0.66, + "grad_norm": 4.351206435000293, + "learning_rate": 2.7034839740241634e-06, + "loss": 0.743, + "step": 8155 + }, + { + "epoch": 0.66, + "grad_norm": 3.0215783665567226, + "learning_rate": 2.7023156510087012e-06, + "loss": 0.5758, + "step": 8156 + }, + { + "epoch": 0.66, + "grad_norm": 3.7332458177399337, + "learning_rate": 2.701147487007461e-06, + "loss": 0.7216, + "step": 8157 + }, + { + "epoch": 0.66, + "grad_norm": 29.436091927951935, + "learning_rate": 2.6999794821012915e-06, + "loss": 0.7029, + "step": 8158 + }, + { + "epoch": 0.66, + "grad_norm": 28.091473821487718, + "learning_rate": 2.6988116363710243e-06, + "loss": 0.7103, + "step": 8159 + }, + { + "epoch": 0.66, + "grad_norm": 2.6741269095688835, + "learning_rate": 2.697643949897479e-06, + "loss": 0.5242, + "step": 8160 + }, + { + "epoch": 0.66, + "grad_norm": 4.066508999322499, + "learning_rate": 2.696476422761474e-06, + "loss": 0.8965, + "step": 8161 + }, + { + "epoch": 0.66, + "grad_norm": 2.766408325277755, + "learning_rate": 2.6953090550437994e-06, + "loss": 0.5919, + "step": 8162 + }, + { + "epoch": 0.66, + "grad_norm": 4.319680001264087, + "learning_rate": 2.694141846825252e-06, + "loss": 0.6082, + "step": 8163 + }, + { + "epoch": 0.66, + "grad_norm": 2.7161134675993197, + "learning_rate": 2.6929747981866066e-06, + "loss": 0.6127, + "step": 8164 + }, + { + "epoch": 0.66, + "grad_norm": 3.748930160644413, + "learning_rate": 2.6918079092086323e-06, + "loss": 0.5933, + "step": 8165 + }, + { + "epoch": 0.66, + "grad_norm": 4.402770707724097, + "learning_rate": 2.6906411799720856e-06, + "loss": 0.5895, + "step": 8166 + }, + { + "epoch": 0.66, + "grad_norm": 4.766736266458826, + "learning_rate": 2.689474610557709e-06, + "loss": 0.7376, + "step": 8167 + }, + { + "epoch": 0.66, + "grad_norm": 5.26013466914022, + "learning_rate": 2.688308201046236e-06, + "loss": 0.6647, + "step": 8168 + }, + { + "epoch": 0.66, + "grad_norm": 4.934365711994622, + "learning_rate": 2.6871419515183934e-06, + "loss": 0.7761, + "step": 8169 + }, + { + "epoch": 0.66, + "grad_norm": 3.1207334549338785, + "learning_rate": 2.6859758620548904e-06, + "loss": 0.6577, + "step": 8170 + }, + { + "epoch": 0.66, + "grad_norm": 4.540244856917245, + "learning_rate": 2.6848099327364263e-06, + "loss": 0.5619, + "step": 8171 + }, + { + "epoch": 0.66, + "grad_norm": 7.544296323887684, + "learning_rate": 2.683644163643694e-06, + "loss": 0.5972, + "step": 8172 + }, + { + "epoch": 0.66, + "grad_norm": 5.913912830134446, + "learning_rate": 2.6824785548573685e-06, + "loss": 0.6488, + "step": 8173 + }, + { + "epoch": 0.66, + "grad_norm": 9.218652830581636, + "learning_rate": 2.6813131064581237e-06, + "loss": 0.7573, + "step": 8174 + }, + { + "epoch": 0.66, + "grad_norm": 4.940829344477792, + "learning_rate": 2.6801478185266076e-06, + "loss": 0.6524, + "step": 8175 + }, + { + "epoch": 0.66, + "grad_norm": 4.802348762504626, + "learning_rate": 2.6789826911434714e-06, + "loss": 0.6368, + "step": 8176 + }, + { + "epoch": 0.66, + "grad_norm": 6.0293096270980895, + "learning_rate": 2.6778177243893475e-06, + "loss": 0.8511, + "step": 8177 + }, + { + "epoch": 0.66, + "grad_norm": 3.2633117713110504, + "learning_rate": 2.6766529183448566e-06, + "loss": 0.802, + "step": 8178 + }, + { + "epoch": 0.66, + "grad_norm": 6.686260005792027, + "learning_rate": 2.6754882730906145e-06, + "loss": 0.5111, + "step": 8179 + }, + { + "epoch": 0.66, + "grad_norm": 6.396596808027793, + "learning_rate": 2.674323788707218e-06, + "loss": 0.833, + "step": 8180 + }, + { + "epoch": 0.66, + "grad_norm": 13.006997384344544, + "learning_rate": 2.673159465275264e-06, + "loss": 0.558, + "step": 8181 + }, + { + "epoch": 0.66, + "grad_norm": 13.289912997776192, + "learning_rate": 2.6719953028753214e-06, + "loss": 0.6079, + "step": 8182 + }, + { + "epoch": 0.66, + "grad_norm": 6.083173780646696, + "learning_rate": 2.670831301587964e-06, + "loss": 0.8162, + "step": 8183 + }, + { + "epoch": 0.66, + "grad_norm": 9.201957025999572, + "learning_rate": 2.6696674614937466e-06, + "loss": 0.7288, + "step": 8184 + }, + { + "epoch": 0.66, + "grad_norm": 17.598100752302532, + "learning_rate": 2.668503782673212e-06, + "loss": 0.7004, + "step": 8185 + }, + { + "epoch": 0.66, + "grad_norm": 6.483641831618135, + "learning_rate": 2.667340265206897e-06, + "loss": 0.8014, + "step": 8186 + }, + { + "epoch": 0.66, + "grad_norm": 3.5587194007847933, + "learning_rate": 2.6661769091753244e-06, + "loss": 0.7161, + "step": 8187 + }, + { + "epoch": 0.67, + "grad_norm": 8.318247796483886, + "learning_rate": 2.665013714659004e-06, + "loss": 0.6558, + "step": 8188 + }, + { + "epoch": 0.67, + "grad_norm": 6.389200576753488, + "learning_rate": 2.6638506817384346e-06, + "loss": 0.6297, + "step": 8189 + }, + { + "epoch": 0.67, + "grad_norm": 7.707717064787286, + "learning_rate": 2.66268781049411e-06, + "loss": 0.9265, + "step": 8190 + }, + { + "epoch": 0.67, + "grad_norm": 7.042260655258461, + "learning_rate": 2.661525101006506e-06, + "loss": 0.6435, + "step": 8191 + }, + { + "epoch": 0.67, + "grad_norm": 4.1712809946919265, + "learning_rate": 2.660362553356087e-06, + "loss": 0.6226, + "step": 8192 + }, + { + "epoch": 0.67, + "grad_norm": 6.965248040716757, + "learning_rate": 2.659200167623313e-06, + "loss": 0.5982, + "step": 8193 + }, + { + "epoch": 0.67, + "grad_norm": 7.610740591713689, + "learning_rate": 2.658037943888626e-06, + "loss": 0.505, + "step": 8194 + }, + { + "epoch": 0.67, + "grad_norm": 3.5886055930808562, + "learning_rate": 2.6568758822324605e-06, + "loss": 0.7438, + "step": 8195 + }, + { + "epoch": 0.67, + "grad_norm": 3.931046275279916, + "learning_rate": 2.655713982735234e-06, + "loss": 0.676, + "step": 8196 + }, + { + "epoch": 0.67, + "grad_norm": 4.715776715617235, + "learning_rate": 2.6545522454773643e-06, + "loss": 0.6227, + "step": 8197 + }, + { + "epoch": 0.67, + "grad_norm": 3.4117764528628163, + "learning_rate": 2.653390670539244e-06, + "loss": 0.6489, + "step": 8198 + }, + { + "epoch": 0.67, + "grad_norm": 3.1189164504067204, + "learning_rate": 2.652229258001268e-06, + "loss": 0.7726, + "step": 8199 + }, + { + "epoch": 0.67, + "grad_norm": 6.742417384081913, + "learning_rate": 2.651068007943809e-06, + "loss": 0.7546, + "step": 8200 + }, + { + "epoch": 0.67, + "grad_norm": 7.067713286725016, + "learning_rate": 2.6499069204472346e-06, + "loss": 0.5828, + "step": 8201 + }, + { + "epoch": 0.67, + "grad_norm": 5.2475697301286965, + "learning_rate": 2.648745995591898e-06, + "loss": 0.8124, + "step": 8202 + }, + { + "epoch": 0.67, + "grad_norm": 9.905347852964248, + "learning_rate": 2.647585233458142e-06, + "loss": 0.7315, + "step": 8203 + }, + { + "epoch": 0.67, + "grad_norm": 28.31208804022421, + "learning_rate": 2.6464246341263023e-06, + "loss": 0.6172, + "step": 8204 + }, + { + "epoch": 0.67, + "grad_norm": 12.742042122893908, + "learning_rate": 2.645264197676694e-06, + "loss": 0.7066, + "step": 8205 + }, + { + "epoch": 0.67, + "grad_norm": 4.906364811344351, + "learning_rate": 2.6441039241896325e-06, + "loss": 0.6999, + "step": 8206 + }, + { + "epoch": 0.67, + "grad_norm": 6.262589629087726, + "learning_rate": 2.6429438137454133e-06, + "loss": 0.88, + "step": 8207 + }, + { + "epoch": 0.67, + "grad_norm": 4.546216797871973, + "learning_rate": 2.6417838664243232e-06, + "loss": 0.7516, + "step": 8208 + }, + { + "epoch": 0.67, + "grad_norm": 10.1806082319197, + "learning_rate": 2.6406240823066387e-06, + "loss": 0.5961, + "step": 8209 + }, + { + "epoch": 0.67, + "grad_norm": 18.23638104296995, + "learning_rate": 2.6394644614726215e-06, + "loss": 0.8248, + "step": 8210 + }, + { + "epoch": 0.67, + "grad_norm": 7.429546987646909, + "learning_rate": 2.638305004002528e-06, + "loss": 0.9113, + "step": 8211 + }, + { + "epoch": 0.67, + "grad_norm": 8.732817295019055, + "learning_rate": 2.6371457099765975e-06, + "loss": 0.6727, + "step": 8212 + }, + { + "epoch": 0.67, + "grad_norm": 9.124099909141329, + "learning_rate": 2.6359865794750635e-06, + "loss": 0.807, + "step": 8213 + }, + { + "epoch": 0.67, + "grad_norm": 6.402880912308503, + "learning_rate": 2.6348276125781423e-06, + "loss": 0.5557, + "step": 8214 + }, + { + "epoch": 0.67, + "grad_norm": 12.446808329390967, + "learning_rate": 2.633668809366044e-06, + "loss": 0.6702, + "step": 8215 + }, + { + "epoch": 0.67, + "grad_norm": 9.981870623522754, + "learning_rate": 2.632510169918963e-06, + "loss": 0.5764, + "step": 8216 + }, + { + "epoch": 0.67, + "grad_norm": 13.76705176843207, + "learning_rate": 2.6313516943170836e-06, + "loss": 0.6461, + "step": 8217 + }, + { + "epoch": 0.67, + "grad_norm": 6.052215955873879, + "learning_rate": 2.630193382640583e-06, + "loss": 0.6595, + "step": 8218 + }, + { + "epoch": 0.67, + "grad_norm": 15.38530910237124, + "learning_rate": 2.6290352349696196e-06, + "loss": 0.7473, + "step": 8219 + }, + { + "epoch": 0.67, + "grad_norm": 10.268505480326876, + "learning_rate": 2.627877251384351e-06, + "loss": 0.6669, + "step": 8220 + }, + { + "epoch": 0.67, + "grad_norm": 12.654271352594717, + "learning_rate": 2.6267194319649087e-06, + "loss": 0.5112, + "step": 8221 + }, + { + "epoch": 0.67, + "grad_norm": 18.583044626326892, + "learning_rate": 2.625561776791427e-06, + "loss": 0.675, + "step": 8222 + }, + { + "epoch": 0.67, + "grad_norm": 22.33390887622557, + "learning_rate": 2.6244042859440195e-06, + "loss": 0.6994, + "step": 8223 + }, + { + "epoch": 0.67, + "grad_norm": 16.730588975323435, + "learning_rate": 2.623246959502795e-06, + "loss": 0.4905, + "step": 8224 + }, + { + "epoch": 0.67, + "grad_norm": 14.676263907689403, + "learning_rate": 2.622089797547846e-06, + "loss": 0.6832, + "step": 8225 + }, + { + "epoch": 0.67, + "grad_norm": 14.253781704791074, + "learning_rate": 2.6209328001592538e-06, + "loss": 0.7666, + "step": 8226 + }, + { + "epoch": 0.67, + "grad_norm": 33.8195551811475, + "learning_rate": 2.619775967417096e-06, + "loss": 0.7291, + "step": 8227 + }, + { + "epoch": 0.67, + "grad_norm": 24.18578915126938, + "learning_rate": 2.6186192994014238e-06, + "loss": 0.722, + "step": 8228 + }, + { + "epoch": 0.67, + "grad_norm": 14.377973655685794, + "learning_rate": 2.6174627961922926e-06, + "loss": 0.6936, + "step": 8229 + }, + { + "epoch": 0.67, + "grad_norm": 32.98636212297213, + "learning_rate": 2.6163064578697363e-06, + "loss": 0.6515, + "step": 8230 + }, + { + "epoch": 0.67, + "grad_norm": 9.386963435014797, + "learning_rate": 2.615150284513783e-06, + "loss": 0.7523, + "step": 8231 + }, + { + "epoch": 0.67, + "grad_norm": 15.205331967586337, + "learning_rate": 2.613994276204447e-06, + "loss": 0.5983, + "step": 8232 + }, + { + "epoch": 0.67, + "grad_norm": 28.6514188069815, + "learning_rate": 2.6128384330217283e-06, + "loss": 0.7293, + "step": 8233 + }, + { + "epoch": 0.67, + "grad_norm": 17.831687296780995, + "learning_rate": 2.6116827550456247e-06, + "loss": 0.6784, + "step": 8234 + }, + { + "epoch": 0.67, + "grad_norm": 19.516646057738622, + "learning_rate": 2.610527242356109e-06, + "loss": 0.779, + "step": 8235 + }, + { + "epoch": 0.67, + "grad_norm": 33.49779404976335, + "learning_rate": 2.609371895033156e-06, + "loss": 0.8037, + "step": 8236 + }, + { + "epoch": 0.67, + "grad_norm": 50.56590111142393, + "learning_rate": 2.608216713156717e-06, + "loss": 0.6721, + "step": 8237 + }, + { + "epoch": 0.67, + "grad_norm": 49.68248270482413, + "learning_rate": 2.6070616968067446e-06, + "loss": 0.7132, + "step": 8238 + }, + { + "epoch": 0.67, + "grad_norm": 19.86493983513108, + "learning_rate": 2.60590684606317e-06, + "loss": 0.6762, + "step": 8239 + }, + { + "epoch": 0.67, + "grad_norm": 12.342508565066408, + "learning_rate": 2.6047521610059153e-06, + "loss": 0.7483, + "step": 8240 + }, + { + "epoch": 0.67, + "grad_norm": 11.843376396440245, + "learning_rate": 2.603597641714893e-06, + "loss": 0.765, + "step": 8241 + }, + { + "epoch": 0.67, + "grad_norm": 17.959175600565697, + "learning_rate": 2.6024432882700012e-06, + "loss": 0.7787, + "step": 8242 + }, + { + "epoch": 0.67, + "grad_norm": 28.307255426551528, + "learning_rate": 2.601289100751132e-06, + "loss": 0.714, + "step": 8243 + }, + { + "epoch": 0.67, + "grad_norm": 26.479581534129256, + "learning_rate": 2.6001350792381587e-06, + "loss": 0.8364, + "step": 8244 + }, + { + "epoch": 0.67, + "grad_norm": 14.725010026610999, + "learning_rate": 2.5989812238109504e-06, + "loss": 0.6629, + "step": 8245 + }, + { + "epoch": 0.67, + "grad_norm": 16.52815762956759, + "learning_rate": 2.5978275345493577e-06, + "loss": 0.7947, + "step": 8246 + }, + { + "epoch": 0.67, + "grad_norm": 48.565615276067895, + "learning_rate": 2.5966740115332283e-06, + "loss": 0.6574, + "step": 8247 + }, + { + "epoch": 0.67, + "grad_norm": 33.74968617743713, + "learning_rate": 2.5955206548423867e-06, + "loss": 0.6726, + "step": 8248 + }, + { + "epoch": 0.67, + "grad_norm": 10.138348242999951, + "learning_rate": 2.5943674645566576e-06, + "loss": 0.8163, + "step": 8249 + }, + { + "epoch": 0.67, + "grad_norm": 23.37531335896041, + "learning_rate": 2.5932144407558468e-06, + "loss": 0.8429, + "step": 8250 + }, + { + "epoch": 0.67, + "grad_norm": 26.3693816235651, + "learning_rate": 2.592061583519749e-06, + "loss": 0.8271, + "step": 8251 + }, + { + "epoch": 0.67, + "grad_norm": 17.781584712441497, + "learning_rate": 2.5909088929281534e-06, + "loss": 0.739, + "step": 8252 + }, + { + "epoch": 0.67, + "grad_norm": 21.240776158318038, + "learning_rate": 2.5897563690608307e-06, + "loss": 0.8345, + "step": 8253 + }, + { + "epoch": 0.67, + "grad_norm": 20.702491218654632, + "learning_rate": 2.5886040119975443e-06, + "loss": 0.7993, + "step": 8254 + }, + { + "epoch": 0.67, + "grad_norm": 12.629814151892004, + "learning_rate": 2.587451821818041e-06, + "loss": 0.7768, + "step": 8255 + }, + { + "epoch": 0.67, + "grad_norm": 40.903013276169, + "learning_rate": 2.586299798602065e-06, + "loss": 0.9042, + "step": 8256 + }, + { + "epoch": 0.67, + "grad_norm": 10.475404344049656, + "learning_rate": 2.5851479424293403e-06, + "loss": 0.7024, + "step": 8257 + }, + { + "epoch": 0.67, + "grad_norm": 13.456025969614958, + "learning_rate": 2.5839962533795813e-06, + "loss": 0.786, + "step": 8258 + }, + { + "epoch": 0.67, + "grad_norm": 24.09518736140198, + "learning_rate": 2.582844731532496e-06, + "loss": 0.7525, + "step": 8259 + }, + { + "epoch": 0.67, + "grad_norm": 16.993367992918316, + "learning_rate": 2.5816933769677753e-06, + "loss": 0.7025, + "step": 8260 + }, + { + "epoch": 0.67, + "grad_norm": 9.65078052737918, + "learning_rate": 2.5805421897650996e-06, + "loss": 0.7227, + "step": 8261 + }, + { + "epoch": 0.67, + "grad_norm": 33.898179329554765, + "learning_rate": 2.5793911700041362e-06, + "loss": 0.8241, + "step": 8262 + }, + { + "epoch": 0.67, + "grad_norm": 12.844796282842156, + "learning_rate": 2.578240317764548e-06, + "loss": 0.8568, + "step": 8263 + }, + { + "epoch": 0.67, + "grad_norm": 7.8673172758208185, + "learning_rate": 2.5770896331259778e-06, + "loss": 0.7153, + "step": 8264 + }, + { + "epoch": 0.67, + "grad_norm": 14.084314347936505, + "learning_rate": 2.5759391161680587e-06, + "loss": 0.732, + "step": 8265 + }, + { + "epoch": 0.67, + "grad_norm": 60.96104111663253, + "learning_rate": 2.574788766970418e-06, + "loss": 0.8572, + "step": 8266 + }, + { + "epoch": 0.67, + "grad_norm": 7.003459601832733, + "learning_rate": 2.5736385856126656e-06, + "loss": 0.691, + "step": 8267 + }, + { + "epoch": 0.67, + "grad_norm": 30.690852447772407, + "learning_rate": 2.5724885721744e-06, + "loss": 0.8399, + "step": 8268 + }, + { + "epoch": 0.67, + "grad_norm": 6.949819403904213, + "learning_rate": 2.5713387267352084e-06, + "loss": 0.7524, + "step": 8269 + }, + { + "epoch": 0.67, + "grad_norm": 8.421339360589064, + "learning_rate": 2.570189049374671e-06, + "loss": 0.6622, + "step": 8270 + }, + { + "epoch": 0.67, + "grad_norm": 12.09286622812239, + "learning_rate": 2.569039540172349e-06, + "loss": 0.6285, + "step": 8271 + }, + { + "epoch": 0.67, + "grad_norm": 21.914259386453125, + "learning_rate": 2.5678901992077993e-06, + "loss": 0.6467, + "step": 8272 + }, + { + "epoch": 0.67, + "grad_norm": 17.869546336735226, + "learning_rate": 2.566741026560562e-06, + "loss": 0.6533, + "step": 8273 + }, + { + "epoch": 0.67, + "grad_norm": 13.474181994400592, + "learning_rate": 2.5655920223101662e-06, + "loss": 0.7067, + "step": 8274 + }, + { + "epoch": 0.67, + "grad_norm": 13.422399016321672, + "learning_rate": 2.564443186536131e-06, + "loss": 0.6449, + "step": 8275 + }, + { + "epoch": 0.67, + "grad_norm": 10.905226227658213, + "learning_rate": 2.5632945193179603e-06, + "loss": 0.5227, + "step": 8276 + }, + { + "epoch": 0.67, + "grad_norm": 7.384792416364311, + "learning_rate": 2.562146020735154e-06, + "loss": 0.8417, + "step": 8277 + }, + { + "epoch": 0.67, + "grad_norm": 7.488766354034559, + "learning_rate": 2.5609976908671906e-06, + "loss": 0.7317, + "step": 8278 + }, + { + "epoch": 0.67, + "grad_norm": 46.21264585044283, + "learning_rate": 2.559849529793547e-06, + "loss": 0.5848, + "step": 8279 + }, + { + "epoch": 0.67, + "grad_norm": 14.41707088896261, + "learning_rate": 2.55870153759368e-06, + "loss": 0.7727, + "step": 8280 + }, + { + "epoch": 0.67, + "grad_norm": 6.240576917652196, + "learning_rate": 2.5575537143470386e-06, + "loss": 0.6971, + "step": 8281 + }, + { + "epoch": 0.67, + "grad_norm": 11.603423489036432, + "learning_rate": 2.556406060133059e-06, + "loss": 0.5985, + "step": 8282 + }, + { + "epoch": 0.67, + "grad_norm": 7.499365229119412, + "learning_rate": 2.555258575031164e-06, + "loss": 0.6869, + "step": 8283 + }, + { + "epoch": 0.67, + "grad_norm": 16.838022291498124, + "learning_rate": 2.5541112591207705e-06, + "loss": 0.6011, + "step": 8284 + }, + { + "epoch": 0.67, + "grad_norm": 18.39698225368846, + "learning_rate": 2.5529641124812776e-06, + "loss": 0.7313, + "step": 8285 + }, + { + "epoch": 0.67, + "grad_norm": 20.58441821433014, + "learning_rate": 2.5518171351920773e-06, + "loss": 0.7285, + "step": 8286 + }, + { + "epoch": 0.67, + "grad_norm": 13.179078535201766, + "learning_rate": 2.550670327332546e-06, + "loss": 0.819, + "step": 8287 + }, + { + "epoch": 0.67, + "grad_norm": 10.770960324872755, + "learning_rate": 2.5495236889820507e-06, + "loss": 0.6965, + "step": 8288 + }, + { + "epoch": 0.67, + "grad_norm": 9.767263230814526, + "learning_rate": 2.5483772202199452e-06, + "loss": 0.7579, + "step": 8289 + }, + { + "epoch": 0.67, + "grad_norm": 6.965736691152021, + "learning_rate": 2.5472309211255707e-06, + "loss": 0.7983, + "step": 8290 + }, + { + "epoch": 0.67, + "grad_norm": 24.97808997137343, + "learning_rate": 2.546084791778263e-06, + "loss": 0.7247, + "step": 8291 + }, + { + "epoch": 0.67, + "grad_norm": 12.992255972259166, + "learning_rate": 2.5449388322573365e-06, + "loss": 0.7102, + "step": 8292 + }, + { + "epoch": 0.67, + "grad_norm": 12.308641024431868, + "learning_rate": 2.5437930426421053e-06, + "loss": 0.4962, + "step": 8293 + }, + { + "epoch": 0.67, + "grad_norm": 9.471606044559692, + "learning_rate": 2.542647423011857e-06, + "loss": 0.7973, + "step": 8294 + }, + { + "epoch": 0.67, + "grad_norm": 25.998663466745278, + "learning_rate": 2.541501973445882e-06, + "loss": 0.6297, + "step": 8295 + }, + { + "epoch": 0.67, + "grad_norm": 9.288472817795983, + "learning_rate": 2.540356694023448e-06, + "loss": 0.6668, + "step": 8296 + }, + { + "epoch": 0.67, + "grad_norm": 11.995650766777597, + "learning_rate": 2.5392115848238203e-06, + "loss": 0.673, + "step": 8297 + }, + { + "epoch": 0.67, + "grad_norm": 7.736268342679664, + "learning_rate": 2.538066645926245e-06, + "loss": 0.7518, + "step": 8298 + }, + { + "epoch": 0.67, + "grad_norm": 5.223080692719563, + "learning_rate": 2.536921877409958e-06, + "loss": 0.7897, + "step": 8299 + }, + { + "epoch": 0.67, + "grad_norm": 8.73998769571859, + "learning_rate": 2.535777279354189e-06, + "loss": 0.7061, + "step": 8300 + }, + { + "epoch": 0.67, + "grad_norm": 7.554764900826824, + "learning_rate": 2.5346328518381447e-06, + "loss": 0.5404, + "step": 8301 + }, + { + "epoch": 0.67, + "grad_norm": 23.340008431974912, + "learning_rate": 2.5334885949410327e-06, + "loss": 0.7239, + "step": 8302 + }, + { + "epoch": 0.67, + "grad_norm": 34.47352832458383, + "learning_rate": 2.5323445087420385e-06, + "loss": 0.5376, + "step": 8303 + }, + { + "epoch": 0.67, + "grad_norm": 18.71218663547627, + "learning_rate": 2.531200593320343e-06, + "loss": 0.735, + "step": 8304 + }, + { + "epoch": 0.67, + "grad_norm": 7.445078966527546, + "learning_rate": 2.530056848755112e-06, + "loss": 0.7634, + "step": 8305 + }, + { + "epoch": 0.67, + "grad_norm": 14.618680343058326, + "learning_rate": 2.5289132751254985e-06, + "loss": 0.7824, + "step": 8306 + }, + { + "epoch": 0.67, + "grad_norm": 7.306603566080601, + "learning_rate": 2.5277698725106462e-06, + "loss": 0.7184, + "step": 8307 + }, + { + "epoch": 0.67, + "grad_norm": 14.592755503060456, + "learning_rate": 2.526626640989683e-06, + "loss": 0.5805, + "step": 8308 + }, + { + "epoch": 0.67, + "grad_norm": 4.836907742020782, + "learning_rate": 2.525483580641732e-06, + "loss": 0.7413, + "step": 8309 + }, + { + "epoch": 0.67, + "grad_norm": 10.70722702735294, + "learning_rate": 2.524340691545896e-06, + "loss": 0.7518, + "step": 8310 + }, + { + "epoch": 0.68, + "grad_norm": 9.253993510084053, + "learning_rate": 2.523197973781274e-06, + "loss": 0.6853, + "step": 8311 + }, + { + "epoch": 0.68, + "grad_norm": 7.9131644903090494, + "learning_rate": 2.5220554274269475e-06, + "loss": 0.6818, + "step": 8312 + }, + { + "epoch": 0.68, + "grad_norm": 10.27759790776848, + "learning_rate": 2.5209130525619884e-06, + "loss": 0.5856, + "step": 8313 + }, + { + "epoch": 0.68, + "grad_norm": 9.309877892345757, + "learning_rate": 2.519770849265455e-06, + "loss": 0.7758, + "step": 8314 + }, + { + "epoch": 0.68, + "grad_norm": 14.283158577072696, + "learning_rate": 2.518628817616394e-06, + "loss": 0.7568, + "step": 8315 + }, + { + "epoch": 0.68, + "grad_norm": 9.672996938056864, + "learning_rate": 2.517486957693844e-06, + "loss": 0.7764, + "step": 8316 + }, + { + "epoch": 0.68, + "grad_norm": 31.567805309846364, + "learning_rate": 2.516345269576827e-06, + "loss": 0.7585, + "step": 8317 + }, + { + "epoch": 0.68, + "grad_norm": 10.860002033694853, + "learning_rate": 2.5152037533443575e-06, + "loss": 0.5649, + "step": 8318 + }, + { + "epoch": 0.68, + "grad_norm": 8.556735012495583, + "learning_rate": 2.514062409075433e-06, + "loss": 0.7482, + "step": 8319 + }, + { + "epoch": 0.68, + "grad_norm": 12.074569219762061, + "learning_rate": 2.512921236849043e-06, + "loss": 0.6966, + "step": 8320 + }, + { + "epoch": 0.68, + "grad_norm": 10.889438682207485, + "learning_rate": 2.5117802367441613e-06, + "loss": 0.5698, + "step": 8321 + }, + { + "epoch": 0.68, + "grad_norm": 6.115744169201199, + "learning_rate": 2.510639408839757e-06, + "loss": 0.7425, + "step": 8322 + }, + { + "epoch": 0.68, + "grad_norm": 6.761739507948528, + "learning_rate": 2.5094987532147786e-06, + "loss": 0.688, + "step": 8323 + }, + { + "epoch": 0.68, + "grad_norm": 17.73763991038439, + "learning_rate": 2.5083582699481667e-06, + "loss": 0.6337, + "step": 8324 + }, + { + "epoch": 0.68, + "grad_norm": 5.663564222221587, + "learning_rate": 2.507217959118854e-06, + "loss": 0.6352, + "step": 8325 + }, + { + "epoch": 0.68, + "grad_norm": 332.42161985430135, + "learning_rate": 2.5060778208057533e-06, + "loss": 0.6817, + "step": 8326 + }, + { + "epoch": 0.68, + "grad_norm": 8.030513051004808, + "learning_rate": 2.50493785508777e-06, + "loss": 0.6418, + "step": 8327 + }, + { + "epoch": 0.68, + "grad_norm": 17.22949239095917, + "learning_rate": 2.5037980620437963e-06, + "loss": 0.4806, + "step": 8328 + }, + { + "epoch": 0.68, + "grad_norm": 4.038479820147205, + "learning_rate": 2.502658441752716e-06, + "loss": 0.7588, + "step": 8329 + }, + { + "epoch": 0.68, + "grad_norm": 11.106475270562205, + "learning_rate": 2.501518994293396e-06, + "loss": 0.5461, + "step": 8330 + }, + { + "epoch": 0.68, + "grad_norm": 18.50909960720777, + "learning_rate": 2.500379719744691e-06, + "loss": 0.7187, + "step": 8331 + }, + { + "epoch": 0.68, + "grad_norm": 13.828695655621319, + "learning_rate": 2.499240618185451e-06, + "loss": 0.7435, + "step": 8332 + }, + { + "epoch": 0.68, + "grad_norm": 7.669078711104005, + "learning_rate": 2.498101689694506e-06, + "loss": 0.782, + "step": 8333 + }, + { + "epoch": 0.68, + "grad_norm": 5.84590706742733, + "learning_rate": 2.4969629343506767e-06, + "loss": 0.8187, + "step": 8334 + }, + { + "epoch": 0.68, + "grad_norm": 33.6666776788197, + "learning_rate": 2.495824352232771e-06, + "loss": 0.5788, + "step": 8335 + }, + { + "epoch": 0.68, + "grad_norm": 3.593049303548369, + "learning_rate": 2.4946859434195904e-06, + "loss": 0.6803, + "step": 8336 + }, + { + "epoch": 0.68, + "grad_norm": 8.795124251341287, + "learning_rate": 2.4935477079899167e-06, + "loss": 0.6579, + "step": 8337 + }, + { + "epoch": 0.68, + "grad_norm": 25.447033559238918, + "learning_rate": 2.4924096460225223e-06, + "loss": 0.7124, + "step": 8338 + }, + { + "epoch": 0.68, + "grad_norm": 7.060984452592301, + "learning_rate": 2.4912717575961703e-06, + "loss": 0.7204, + "step": 8339 + }, + { + "epoch": 0.68, + "grad_norm": 13.048779600965632, + "learning_rate": 2.4901340427896097e-06, + "loss": 0.5793, + "step": 8340 + }, + { + "epoch": 0.68, + "grad_norm": 6.823225016563455, + "learning_rate": 2.4889965016815766e-06, + "loss": 0.6722, + "step": 8341 + }, + { + "epoch": 0.68, + "grad_norm": 25.295642173460735, + "learning_rate": 2.487859134350794e-06, + "loss": 0.6823, + "step": 8342 + }, + { + "epoch": 0.68, + "grad_norm": 7.6884366581580315, + "learning_rate": 2.4867219408759797e-06, + "loss": 0.7397, + "step": 8343 + }, + { + "epoch": 0.68, + "grad_norm": 3.6688460649343075, + "learning_rate": 2.4855849213358314e-06, + "loss": 0.6021, + "step": 8344 + }, + { + "epoch": 0.68, + "grad_norm": 5.027713290032436, + "learning_rate": 2.4844480758090366e-06, + "loss": 0.6631, + "step": 8345 + }, + { + "epoch": 0.68, + "grad_norm": 6.887070393685942, + "learning_rate": 2.483311404374276e-06, + "loss": 0.6126, + "step": 8346 + }, + { + "epoch": 0.68, + "grad_norm": 50.70892855282612, + "learning_rate": 2.4821749071102132e-06, + "loss": 0.672, + "step": 8347 + }, + { + "epoch": 0.68, + "grad_norm": 3.9507228051166736, + "learning_rate": 2.4810385840955e-06, + "loss": 0.6836, + "step": 8348 + }, + { + "epoch": 0.68, + "grad_norm": 6.45342169485433, + "learning_rate": 2.4799024354087758e-06, + "loss": 0.7652, + "step": 8349 + }, + { + "epoch": 0.68, + "grad_norm": 5.344406007275595, + "learning_rate": 2.478766461128672e-06, + "loss": 0.5764, + "step": 8350 + }, + { + "epoch": 0.68, + "grad_norm": 6.187591748854834, + "learning_rate": 2.477630661333803e-06, + "loss": 0.6414, + "step": 8351 + }, + { + "epoch": 0.68, + "grad_norm": 23.064222526382782, + "learning_rate": 2.476495036102776e-06, + "loss": 0.5848, + "step": 8352 + }, + { + "epoch": 0.68, + "grad_norm": 8.4341490723611, + "learning_rate": 2.475359585514182e-06, + "loss": 0.803, + "step": 8353 + }, + { + "epoch": 0.68, + "grad_norm": 18.23331533632398, + "learning_rate": 2.474224309646601e-06, + "loss": 0.6365, + "step": 8354 + }, + { + "epoch": 0.68, + "grad_norm": 10.63785711862461, + "learning_rate": 2.4730892085786018e-06, + "loss": 0.607, + "step": 8355 + }, + { + "epoch": 0.68, + "grad_norm": 4.905295188032726, + "learning_rate": 2.4719542823887375e-06, + "loss": 0.6336, + "step": 8356 + }, + { + "epoch": 0.68, + "grad_norm": 4.571715588261021, + "learning_rate": 2.470819531155557e-06, + "loss": 0.633, + "step": 8357 + }, + { + "epoch": 0.68, + "grad_norm": 3.080861161179966, + "learning_rate": 2.4696849549575878e-06, + "loss": 0.5545, + "step": 8358 + }, + { + "epoch": 0.68, + "grad_norm": 5.551542271881336, + "learning_rate": 2.4685505538733562e-06, + "loss": 0.7473, + "step": 8359 + }, + { + "epoch": 0.68, + "grad_norm": 8.773371456743488, + "learning_rate": 2.4674163279813617e-06, + "loss": 0.7983, + "step": 8360 + }, + { + "epoch": 0.68, + "grad_norm": 12.303284594775095, + "learning_rate": 2.4662822773601055e-06, + "loss": 0.822, + "step": 8361 + }, + { + "epoch": 0.68, + "grad_norm": 4.432959065770363, + "learning_rate": 2.465148402088069e-06, + "loss": 0.5756, + "step": 8362 + }, + { + "epoch": 0.68, + "grad_norm": 9.635145768457829, + "learning_rate": 2.464014702243722e-06, + "loss": 0.5492, + "step": 8363 + }, + { + "epoch": 0.68, + "grad_norm": 4.632664306787909, + "learning_rate": 2.4628811779055277e-06, + "loss": 0.7039, + "step": 8364 + }, + { + "epoch": 0.68, + "grad_norm": 9.380741959344734, + "learning_rate": 2.461747829151929e-06, + "loss": 0.6809, + "step": 8365 + }, + { + "epoch": 0.68, + "grad_norm": 4.6158743438646175, + "learning_rate": 2.4606146560613663e-06, + "loss": 0.6149, + "step": 8366 + }, + { + "epoch": 0.68, + "grad_norm": 4.733452003609559, + "learning_rate": 2.4594816587122557e-06, + "loss": 0.5411, + "step": 8367 + }, + { + "epoch": 0.68, + "grad_norm": 4.018816770527788, + "learning_rate": 2.4583488371830115e-06, + "loss": 0.6974, + "step": 8368 + }, + { + "epoch": 0.68, + "grad_norm": 4.632716556586067, + "learning_rate": 2.457216191552032e-06, + "loss": 0.5839, + "step": 8369 + }, + { + "epoch": 0.68, + "grad_norm": 11.8896188836641, + "learning_rate": 2.4560837218977006e-06, + "loss": 0.6084, + "step": 8370 + }, + { + "epoch": 0.68, + "grad_norm": 2.945213617712697, + "learning_rate": 2.454951428298395e-06, + "loss": 0.7008, + "step": 8371 + }, + { + "epoch": 0.68, + "grad_norm": 7.114222287552107, + "learning_rate": 2.4538193108324742e-06, + "loss": 0.5078, + "step": 8372 + }, + { + "epoch": 0.68, + "grad_norm": 5.577578606011007, + "learning_rate": 2.4526873695782928e-06, + "loss": 0.6545, + "step": 8373 + }, + { + "epoch": 0.68, + "grad_norm": 6.516088559992444, + "learning_rate": 2.451555604614181e-06, + "loss": 0.6411, + "step": 8374 + }, + { + "epoch": 0.68, + "grad_norm": 4.301408951108873, + "learning_rate": 2.450424016018469e-06, + "loss": 0.6349, + "step": 8375 + }, + { + "epoch": 0.68, + "grad_norm": 10.600475921349892, + "learning_rate": 2.449292603869467e-06, + "loss": 0.6219, + "step": 8376 + }, + { + "epoch": 0.68, + "grad_norm": 5.393286904023358, + "learning_rate": 2.4481613682454796e-06, + "loss": 0.8101, + "step": 8377 + }, + { + "epoch": 0.68, + "grad_norm": 4.77895276643497, + "learning_rate": 2.4470303092247926e-06, + "loss": 0.6838, + "step": 8378 + }, + { + "epoch": 0.68, + "grad_norm": 13.797426485172773, + "learning_rate": 2.4458994268856835e-06, + "loss": 0.6073, + "step": 8379 + }, + { + "epoch": 0.68, + "grad_norm": 10.2079912010546, + "learning_rate": 2.4447687213064157e-06, + "loss": 0.6675, + "step": 8380 + }, + { + "epoch": 0.68, + "grad_norm": 6.90362430005197, + "learning_rate": 2.4436381925652397e-06, + "loss": 0.6694, + "step": 8381 + }, + { + "epoch": 0.68, + "grad_norm": 10.103958531934417, + "learning_rate": 2.442507840740399e-06, + "loss": 0.6136, + "step": 8382 + }, + { + "epoch": 0.68, + "grad_norm": 9.164638555755664, + "learning_rate": 2.4413776659101172e-06, + "loss": 0.5395, + "step": 8383 + }, + { + "epoch": 0.68, + "grad_norm": 21.06306834013, + "learning_rate": 2.4402476681526125e-06, + "loss": 0.8171, + "step": 8384 + }, + { + "epoch": 0.68, + "grad_norm": 9.470979333296741, + "learning_rate": 2.4391178475460873e-06, + "loss": 0.635, + "step": 8385 + }, + { + "epoch": 0.68, + "grad_norm": 5.208569615915241, + "learning_rate": 2.437988204168732e-06, + "loss": 0.7552, + "step": 8386 + }, + { + "epoch": 0.68, + "grad_norm": 4.93668510208142, + "learning_rate": 2.4368587380987246e-06, + "loss": 0.6462, + "step": 8387 + }, + { + "epoch": 0.68, + "grad_norm": 5.213279772036837, + "learning_rate": 2.435729449414229e-06, + "loss": 0.6508, + "step": 8388 + }, + { + "epoch": 0.68, + "grad_norm": 4.520355371713811, + "learning_rate": 2.4346003381934036e-06, + "loss": 0.5953, + "step": 8389 + }, + { + "epoch": 0.68, + "grad_norm": 8.457956602423597, + "learning_rate": 2.433471404514386e-06, + "loss": 0.5243, + "step": 8390 + }, + { + "epoch": 0.68, + "grad_norm": 4.307006121715574, + "learning_rate": 2.432342648455309e-06, + "loss": 0.5886, + "step": 8391 + }, + { + "epoch": 0.68, + "grad_norm": 5.659180606121157, + "learning_rate": 2.431214070094289e-06, + "loss": 0.6104, + "step": 8392 + }, + { + "epoch": 0.68, + "grad_norm": 7.20484361301813, + "learning_rate": 2.4300856695094287e-06, + "loss": 0.7232, + "step": 8393 + }, + { + "epoch": 0.68, + "grad_norm": 5.387265914547474, + "learning_rate": 2.42895744677882e-06, + "loss": 0.6597, + "step": 8394 + }, + { + "epoch": 0.68, + "grad_norm": 9.139932113045544, + "learning_rate": 2.427829401980547e-06, + "loss": 0.8518, + "step": 8395 + }, + { + "epoch": 0.68, + "grad_norm": 6.4238743651347585, + "learning_rate": 2.4267015351926747e-06, + "loss": 0.7006, + "step": 8396 + }, + { + "epoch": 0.68, + "grad_norm": 12.991008989031487, + "learning_rate": 2.4255738464932573e-06, + "loss": 0.7193, + "step": 8397 + }, + { + "epoch": 0.68, + "grad_norm": 5.529679732086161, + "learning_rate": 2.4244463359603415e-06, + "loss": 0.5918, + "step": 8398 + }, + { + "epoch": 0.68, + "grad_norm": 4.677988042702766, + "learning_rate": 2.423319003671956e-06, + "loss": 0.5842, + "step": 8399 + }, + { + "epoch": 0.68, + "grad_norm": 3.5476273423304305, + "learning_rate": 2.42219184970612e-06, + "loss": 0.6553, + "step": 8400 + }, + { + "epoch": 0.68, + "grad_norm": 6.158805973732745, + "learning_rate": 2.4210648741408364e-06, + "loss": 0.6194, + "step": 8401 + }, + { + "epoch": 0.68, + "grad_norm": 5.538379827666495, + "learning_rate": 2.419938077054105e-06, + "loss": 0.7514, + "step": 8402 + }, + { + "epoch": 0.68, + "grad_norm": 4.258544415595995, + "learning_rate": 2.418811458523903e-06, + "loss": 0.6597, + "step": 8403 + }, + { + "epoch": 0.68, + "grad_norm": 9.909075842941341, + "learning_rate": 2.4176850186281993e-06, + "loss": 0.7525, + "step": 8404 + }, + { + "epoch": 0.68, + "grad_norm": 9.061979521252233, + "learning_rate": 2.4165587574449533e-06, + "loss": 0.6153, + "step": 8405 + }, + { + "epoch": 0.68, + "grad_norm": 5.8261232271363, + "learning_rate": 2.4154326750521084e-06, + "loss": 0.7194, + "step": 8406 + }, + { + "epoch": 0.68, + "grad_norm": 4.063607746638771, + "learning_rate": 2.4143067715275965e-06, + "loss": 0.6049, + "step": 8407 + }, + { + "epoch": 0.68, + "grad_norm": 8.94359803917466, + "learning_rate": 2.4131810469493343e-06, + "loss": 0.6781, + "step": 8408 + }, + { + "epoch": 0.68, + "grad_norm": 3.6059435897762584, + "learning_rate": 2.412055501395234e-06, + "loss": 0.7044, + "step": 8409 + }, + { + "epoch": 0.68, + "grad_norm": 8.089435314273251, + "learning_rate": 2.410930134943187e-06, + "loss": 0.6873, + "step": 8410 + }, + { + "epoch": 0.68, + "grad_norm": 4.202930531359204, + "learning_rate": 2.4098049476710767e-06, + "loss": 0.6739, + "step": 8411 + }, + { + "epoch": 0.68, + "grad_norm": 9.240341042382903, + "learning_rate": 2.4086799396567755e-06, + "loss": 0.6435, + "step": 8412 + }, + { + "epoch": 0.68, + "grad_norm": 29.518656371527673, + "learning_rate": 2.407555110978136e-06, + "loss": 0.7528, + "step": 8413 + }, + { + "epoch": 0.68, + "grad_norm": 6.564930486049343, + "learning_rate": 2.4064304617130076e-06, + "loss": 0.5093, + "step": 8414 + }, + { + "epoch": 0.68, + "grad_norm": 5.92270354347785, + "learning_rate": 2.4053059919392197e-06, + "loss": 0.6441, + "step": 8415 + }, + { + "epoch": 0.68, + "grad_norm": 4.682227112888556, + "learning_rate": 2.4041817017345963e-06, + "loss": 0.6611, + "step": 8416 + }, + { + "epoch": 0.68, + "grad_norm": 7.132099312397348, + "learning_rate": 2.4030575911769443e-06, + "loss": 0.6452, + "step": 8417 + }, + { + "epoch": 0.68, + "grad_norm": 5.271616264140666, + "learning_rate": 2.4019336603440567e-06, + "loss": 0.6882, + "step": 8418 + }, + { + "epoch": 0.68, + "grad_norm": 3.866446913623662, + "learning_rate": 2.400809909313721e-06, + "loss": 0.608, + "step": 8419 + }, + { + "epoch": 0.68, + "grad_norm": 4.430563855304474, + "learning_rate": 2.3996863381637046e-06, + "loss": 0.5782, + "step": 8420 + }, + { + "epoch": 0.68, + "grad_norm": 5.0549862715664595, + "learning_rate": 2.398562946971767e-06, + "loss": 0.7621, + "step": 8421 + }, + { + "epoch": 0.68, + "grad_norm": 6.318287326484502, + "learning_rate": 2.3974397358156516e-06, + "loss": 0.763, + "step": 8422 + }, + { + "epoch": 0.68, + "grad_norm": 8.306378489138103, + "learning_rate": 2.396316704773095e-06, + "loss": 0.4981, + "step": 8423 + }, + { + "epoch": 0.68, + "grad_norm": 8.65721648549689, + "learning_rate": 2.395193853921815e-06, + "loss": 0.6421, + "step": 8424 + }, + { + "epoch": 0.68, + "grad_norm": 5.554766796455805, + "learning_rate": 2.394071183339523e-06, + "loss": 0.6486, + "step": 8425 + }, + { + "epoch": 0.68, + "grad_norm": 5.340749926459876, + "learning_rate": 2.3929486931039143e-06, + "loss": 0.5543, + "step": 8426 + }, + { + "epoch": 0.68, + "grad_norm": 9.19554677699654, + "learning_rate": 2.391826383292671e-06, + "loss": 0.8047, + "step": 8427 + }, + { + "epoch": 0.68, + "grad_norm": 6.122861600085072, + "learning_rate": 2.390704253983464e-06, + "loss": 0.6904, + "step": 8428 + }, + { + "epoch": 0.68, + "grad_norm": 7.113127938262337, + "learning_rate": 2.3895823052539503e-06, + "loss": 0.658, + "step": 8429 + }, + { + "epoch": 0.68, + "grad_norm": 10.604292520170947, + "learning_rate": 2.38846053718178e-06, + "loss": 0.4966, + "step": 8430 + }, + { + "epoch": 0.68, + "grad_norm": 4.188822387185473, + "learning_rate": 2.3873389498445814e-06, + "loss": 0.5807, + "step": 8431 + }, + { + "epoch": 0.68, + "grad_norm": 5.201203351394706, + "learning_rate": 2.3862175433199823e-06, + "loss": 0.6183, + "step": 8432 + }, + { + "epoch": 0.68, + "grad_norm": 4.955220540322817, + "learning_rate": 2.3850963176855833e-06, + "loss": 0.6121, + "step": 8433 + }, + { + "epoch": 0.69, + "grad_norm": 45.505539801826345, + "learning_rate": 2.383975273018986e-06, + "loss": 0.6639, + "step": 8434 + }, + { + "epoch": 0.69, + "grad_norm": 3.9689423829973576, + "learning_rate": 2.382854409397772e-06, + "loss": 0.7081, + "step": 8435 + }, + { + "epoch": 0.69, + "grad_norm": 6.7115536062502095, + "learning_rate": 2.381733726899509e-06, + "loss": 0.6311, + "step": 8436 + }, + { + "epoch": 0.69, + "grad_norm": 8.730370587025364, + "learning_rate": 2.3806132256017607e-06, + "loss": 0.5415, + "step": 8437 + }, + { + "epoch": 0.69, + "grad_norm": 14.413680394853325, + "learning_rate": 2.3794929055820677e-06, + "loss": 0.5518, + "step": 8438 + }, + { + "epoch": 0.69, + "grad_norm": 4.719599601542386, + "learning_rate": 2.3783727669179695e-06, + "loss": 0.6229, + "step": 8439 + }, + { + "epoch": 0.69, + "grad_norm": 10.074699043363571, + "learning_rate": 2.3772528096869796e-06, + "loss": 0.6762, + "step": 8440 + }, + { + "epoch": 0.69, + "grad_norm": 11.543848847790601, + "learning_rate": 2.376133033966611e-06, + "loss": 0.7442, + "step": 8441 + }, + { + "epoch": 0.69, + "grad_norm": 6.107740359912297, + "learning_rate": 2.375013439834358e-06, + "loss": 0.7017, + "step": 8442 + }, + { + "epoch": 0.69, + "grad_norm": 29.291324378564525, + "learning_rate": 2.3738940273677007e-06, + "loss": 0.7695, + "step": 8443 + }, + { + "epoch": 0.69, + "grad_norm": 5.929525708405646, + "learning_rate": 2.3727747966441144e-06, + "loss": 0.5682, + "step": 8444 + }, + { + "epoch": 0.69, + "grad_norm": 6.302905681186479, + "learning_rate": 2.371655747741053e-06, + "loss": 0.7086, + "step": 8445 + }, + { + "epoch": 0.69, + "grad_norm": 5.272121119177552, + "learning_rate": 2.370536880735967e-06, + "loss": 0.5548, + "step": 8446 + }, + { + "epoch": 0.69, + "grad_norm": 7.558809159287319, + "learning_rate": 2.3694181957062812e-06, + "loss": 0.7837, + "step": 8447 + }, + { + "epoch": 0.69, + "grad_norm": 5.780219238769598, + "learning_rate": 2.3682996927294216e-06, + "loss": 0.6779, + "step": 8448 + }, + { + "epoch": 0.69, + "grad_norm": 3.5837353857638967, + "learning_rate": 2.367181371882792e-06, + "loss": 0.5634, + "step": 8449 + }, + { + "epoch": 0.69, + "grad_norm": 4.898956436274996, + "learning_rate": 2.366063233243791e-06, + "loss": 0.7431, + "step": 8450 + }, + { + "epoch": 0.69, + "grad_norm": 6.73302771299475, + "learning_rate": 2.364945276889799e-06, + "loss": 0.6452, + "step": 8451 + }, + { + "epoch": 0.69, + "grad_norm": 8.296626556600119, + "learning_rate": 2.3638275028981854e-06, + "loss": 0.7699, + "step": 8452 + }, + { + "epoch": 0.69, + "grad_norm": 5.636523699326703, + "learning_rate": 2.362709911346307e-06, + "loss": 0.6516, + "step": 8453 + }, + { + "epoch": 0.69, + "grad_norm": 5.316881936498307, + "learning_rate": 2.361592502311507e-06, + "loss": 0.8503, + "step": 8454 + }, + { + "epoch": 0.69, + "grad_norm": 12.2817977397594, + "learning_rate": 2.3604752758711207e-06, + "loss": 0.8209, + "step": 8455 + }, + { + "epoch": 0.69, + "grad_norm": 2.9092249999626363, + "learning_rate": 2.3593582321024625e-06, + "loss": 0.6413, + "step": 8456 + }, + { + "epoch": 0.69, + "grad_norm": 7.328579796523897, + "learning_rate": 2.3582413710828445e-06, + "loss": 0.6957, + "step": 8457 + }, + { + "epoch": 0.69, + "grad_norm": 20.932373350132284, + "learning_rate": 2.357124692889556e-06, + "loss": 0.5434, + "step": 8458 + }, + { + "epoch": 0.69, + "grad_norm": 3.293253765874854, + "learning_rate": 2.356008197599881e-06, + "loss": 0.6483, + "step": 8459 + }, + { + "epoch": 0.69, + "grad_norm": 5.9979749443588055, + "learning_rate": 2.354891885291086e-06, + "loss": 0.7811, + "step": 8460 + }, + { + "epoch": 0.69, + "grad_norm": 8.280738057088614, + "learning_rate": 2.3537757560404263e-06, + "loss": 0.6826, + "step": 8461 + }, + { + "epoch": 0.69, + "grad_norm": 5.9218038486902955, + "learning_rate": 2.3526598099251473e-06, + "loss": 0.555, + "step": 8462 + }, + { + "epoch": 0.69, + "grad_norm": 5.827334751349018, + "learning_rate": 2.3515440470224778e-06, + "loss": 0.596, + "step": 8463 + }, + { + "epoch": 0.69, + "grad_norm": 29.19993497652659, + "learning_rate": 2.3504284674096366e-06, + "loss": 0.6562, + "step": 8464 + }, + { + "epoch": 0.69, + "grad_norm": 9.29031311426116, + "learning_rate": 2.3493130711638295e-06, + "loss": 0.6663, + "step": 8465 + }, + { + "epoch": 0.69, + "grad_norm": 9.578376961952015, + "learning_rate": 2.348197858362248e-06, + "loss": 0.7251, + "step": 8466 + }, + { + "epoch": 0.69, + "grad_norm": 3.4244944882291874, + "learning_rate": 2.347082829082072e-06, + "loss": 0.6724, + "step": 8467 + }, + { + "epoch": 0.69, + "grad_norm": 83.68547392394662, + "learning_rate": 2.345967983400466e-06, + "loss": 0.5521, + "step": 8468 + }, + { + "epoch": 0.69, + "grad_norm": 3.8595389978510886, + "learning_rate": 2.3448533213945884e-06, + "loss": 0.5407, + "step": 8469 + }, + { + "epoch": 0.69, + "grad_norm": 6.834497654367603, + "learning_rate": 2.3437388431415774e-06, + "loss": 0.7813, + "step": 8470 + }, + { + "epoch": 0.69, + "grad_norm": 3.861775158801616, + "learning_rate": 2.3426245487185663e-06, + "loss": 0.5242, + "step": 8471 + }, + { + "epoch": 0.69, + "grad_norm": 9.60543108929251, + "learning_rate": 2.3415104382026678e-06, + "loss": 0.6846, + "step": 8472 + }, + { + "epoch": 0.69, + "grad_norm": 21.80984637096357, + "learning_rate": 2.3403965116709863e-06, + "loss": 0.66, + "step": 8473 + }, + { + "epoch": 0.69, + "grad_norm": 3.143262241399748, + "learning_rate": 2.339282769200611e-06, + "loss": 0.8833, + "step": 8474 + }, + { + "epoch": 0.69, + "grad_norm": 9.656634210889026, + "learning_rate": 2.338169210868623e-06, + "loss": 0.6943, + "step": 8475 + }, + { + "epoch": 0.69, + "grad_norm": 4.391053079549587, + "learning_rate": 2.3370558367520856e-06, + "loss": 0.716, + "step": 8476 + }, + { + "epoch": 0.69, + "grad_norm": 4.3907963189668795, + "learning_rate": 2.3359426469280507e-06, + "loss": 0.5086, + "step": 8477 + }, + { + "epoch": 0.69, + "grad_norm": 2.6999854148092344, + "learning_rate": 2.3348296414735595e-06, + "loss": 0.6079, + "step": 8478 + }, + { + "epoch": 0.69, + "grad_norm": 6.533069419882555, + "learning_rate": 2.3337168204656392e-06, + "loss": 0.7044, + "step": 8479 + }, + { + "epoch": 0.69, + "grad_norm": 2.978556990570742, + "learning_rate": 2.332604183981303e-06, + "loss": 0.5809, + "step": 8480 + }, + { + "epoch": 0.69, + "grad_norm": 6.490537664901921, + "learning_rate": 2.3314917320975504e-06, + "loss": 0.6561, + "step": 8481 + }, + { + "epoch": 0.69, + "grad_norm": 7.402779595205896, + "learning_rate": 2.3303794648913745e-06, + "loss": 0.6203, + "step": 8482 + }, + { + "epoch": 0.69, + "grad_norm": 2.7463131735289066, + "learning_rate": 2.329267382439749e-06, + "loss": 0.6419, + "step": 8483 + }, + { + "epoch": 0.69, + "grad_norm": 8.098558513650591, + "learning_rate": 2.3281554848196347e-06, + "loss": 0.6464, + "step": 8484 + }, + { + "epoch": 0.69, + "grad_norm": 7.228655708504801, + "learning_rate": 2.3270437721079885e-06, + "loss": 0.6681, + "step": 8485 + }, + { + "epoch": 0.69, + "grad_norm": 10.982143389579852, + "learning_rate": 2.3259322443817397e-06, + "loss": 0.7015, + "step": 8486 + }, + { + "epoch": 0.69, + "grad_norm": 14.438136988785494, + "learning_rate": 2.3248209017178186e-06, + "loss": 0.722, + "step": 8487 + }, + { + "epoch": 0.69, + "grad_norm": 3.668595189596921, + "learning_rate": 2.3237097441931333e-06, + "loss": 0.8091, + "step": 8488 + }, + { + "epoch": 0.69, + "grad_norm": 10.528631488265987, + "learning_rate": 2.3225987718845873e-06, + "loss": 0.7232, + "step": 8489 + }, + { + "epoch": 0.69, + "grad_norm": 16.643916404353256, + "learning_rate": 2.321487984869064e-06, + "loss": 0.5179, + "step": 8490 + }, + { + "epoch": 0.69, + "grad_norm": 3.201848526256978, + "learning_rate": 2.3203773832234368e-06, + "loss": 0.7617, + "step": 8491 + }, + { + "epoch": 0.69, + "grad_norm": 8.002247498385998, + "learning_rate": 2.319266967024569e-06, + "loss": 0.6016, + "step": 8492 + }, + { + "epoch": 0.69, + "grad_norm": 5.659220661693019, + "learning_rate": 2.318156736349304e-06, + "loss": 0.733, + "step": 8493 + }, + { + "epoch": 0.69, + "grad_norm": 3.534320849692937, + "learning_rate": 2.317046691274481e-06, + "loss": 0.745, + "step": 8494 + }, + { + "epoch": 0.69, + "grad_norm": 3.2198993212320612, + "learning_rate": 2.3159368318769176e-06, + "loss": 0.5961, + "step": 8495 + }, + { + "epoch": 0.69, + "grad_norm": 3.789454027844675, + "learning_rate": 2.314827158233428e-06, + "loss": 0.8784, + "step": 8496 + }, + { + "epoch": 0.69, + "grad_norm": 10.409392099085872, + "learning_rate": 2.313717670420804e-06, + "loss": 0.5898, + "step": 8497 + }, + { + "epoch": 0.69, + "grad_norm": 4.23445199585428, + "learning_rate": 2.312608368515834e-06, + "loss": 0.7891, + "step": 8498 + }, + { + "epoch": 0.69, + "grad_norm": 4.530778679043598, + "learning_rate": 2.3114992525952855e-06, + "loss": 0.5663, + "step": 8499 + }, + { + "epoch": 0.69, + "grad_norm": 6.263107656925747, + "learning_rate": 2.3103903227359177e-06, + "loss": 0.772, + "step": 8500 + }, + { + "epoch": 0.69, + "grad_norm": 3.9271039211966627, + "learning_rate": 2.309281579014474e-06, + "loss": 0.7443, + "step": 8501 + }, + { + "epoch": 0.69, + "grad_norm": 7.8495665083021064, + "learning_rate": 2.3081730215076853e-06, + "loss": 0.6638, + "step": 8502 + }, + { + "epoch": 0.69, + "grad_norm": 10.832249253696789, + "learning_rate": 2.307064650292275e-06, + "loss": 0.6766, + "step": 8503 + }, + { + "epoch": 0.69, + "grad_norm": 2.9341253765305284, + "learning_rate": 2.305956465444945e-06, + "loss": 0.632, + "step": 8504 + }, + { + "epoch": 0.69, + "grad_norm": 5.825577665956119, + "learning_rate": 2.304848467042394e-06, + "loss": 0.6561, + "step": 8505 + }, + { + "epoch": 0.69, + "grad_norm": 6.264117929053284, + "learning_rate": 2.303740655161296e-06, + "loss": 0.7126, + "step": 8506 + }, + { + "epoch": 0.69, + "grad_norm": 3.0985761816714943, + "learning_rate": 2.3026330298783232e-06, + "loss": 0.6879, + "step": 8507 + }, + { + "epoch": 0.69, + "grad_norm": 10.111526910074229, + "learning_rate": 2.301525591270129e-06, + "loss": 0.6291, + "step": 8508 + }, + { + "epoch": 0.69, + "grad_norm": 6.436197761418853, + "learning_rate": 2.3004183394133535e-06, + "loss": 0.775, + "step": 8509 + }, + { + "epoch": 0.69, + "grad_norm": 6.226197413334194, + "learning_rate": 2.299311274384628e-06, + "loss": 0.8087, + "step": 8510 + }, + { + "epoch": 0.69, + "grad_norm": 8.275333382510041, + "learning_rate": 2.2982043962605653e-06, + "loss": 0.6988, + "step": 8511 + }, + { + "epoch": 0.69, + "grad_norm": 3.4940877325906765, + "learning_rate": 2.2970977051177745e-06, + "loss": 0.7818, + "step": 8512 + }, + { + "epoch": 0.69, + "grad_norm": 7.997752836258018, + "learning_rate": 2.2959912010328372e-06, + "loss": 0.5971, + "step": 8513 + }, + { + "epoch": 0.69, + "grad_norm": 14.66546642836317, + "learning_rate": 2.2948848840823367e-06, + "loss": 0.6045, + "step": 8514 + }, + { + "epoch": 0.69, + "grad_norm": 11.35736695368174, + "learning_rate": 2.293778754342835e-06, + "loss": 0.4988, + "step": 8515 + }, + { + "epoch": 0.69, + "grad_norm": 4.349383743325977, + "learning_rate": 2.292672811890882e-06, + "loss": 0.5822, + "step": 8516 + }, + { + "epoch": 0.69, + "grad_norm": 12.173691667256804, + "learning_rate": 2.2915670568030183e-06, + "loss": 0.5628, + "step": 8517 + }, + { + "epoch": 0.69, + "grad_norm": 2.5985034630801134, + "learning_rate": 2.290461489155768e-06, + "loss": 0.668, + "step": 8518 + }, + { + "epoch": 0.69, + "grad_norm": 12.878706968707768, + "learning_rate": 2.289356109025644e-06, + "loss": 0.5932, + "step": 8519 + }, + { + "epoch": 0.69, + "grad_norm": 16.124017807522456, + "learning_rate": 2.288250916489142e-06, + "loss": 0.8018, + "step": 8520 + }, + { + "epoch": 0.69, + "grad_norm": 8.702467351221129, + "learning_rate": 2.287145911622754e-06, + "loss": 0.7048, + "step": 8521 + }, + { + "epoch": 0.69, + "grad_norm": 3.43799033040984, + "learning_rate": 2.2860410945029483e-06, + "loss": 0.4545, + "step": 8522 + }, + { + "epoch": 0.69, + "grad_norm": 10.593057633909131, + "learning_rate": 2.284936465206189e-06, + "loss": 0.6866, + "step": 8523 + }, + { + "epoch": 0.69, + "grad_norm": 7.94601214930685, + "learning_rate": 2.283832023808922e-06, + "loss": 0.6446, + "step": 8524 + }, + { + "epoch": 0.69, + "grad_norm": 6.228455379869982, + "learning_rate": 2.2827277703875806e-06, + "loss": 0.6375, + "step": 8525 + }, + { + "epoch": 0.69, + "grad_norm": 9.453672403257391, + "learning_rate": 2.2816237050185875e-06, + "loss": 0.625, + "step": 8526 + }, + { + "epoch": 0.69, + "grad_norm": 3.593239479790527, + "learning_rate": 2.2805198277783484e-06, + "loss": 0.6203, + "step": 8527 + }, + { + "epoch": 0.69, + "grad_norm": 4.639895124065902, + "learning_rate": 2.279416138743262e-06, + "loss": 0.6008, + "step": 8528 + }, + { + "epoch": 0.69, + "grad_norm": 8.612577966601357, + "learning_rate": 2.278312637989708e-06, + "loss": 0.6895, + "step": 8529 + }, + { + "epoch": 0.69, + "grad_norm": 6.034300664018397, + "learning_rate": 2.277209325594058e-06, + "loss": 0.6863, + "step": 8530 + }, + { + "epoch": 0.69, + "grad_norm": 3.1526826458246653, + "learning_rate": 2.2761062016326667e-06, + "loss": 0.7628, + "step": 8531 + }, + { + "epoch": 0.69, + "grad_norm": 5.060198575844125, + "learning_rate": 2.275003266181877e-06, + "loss": 0.6354, + "step": 8532 + }, + { + "epoch": 0.69, + "grad_norm": 4.317361763016478, + "learning_rate": 2.2739005193180196e-06, + "loss": 0.7614, + "step": 8533 + }, + { + "epoch": 0.69, + "grad_norm": 9.294788949003523, + "learning_rate": 2.2727979611174096e-06, + "loss": 0.7102, + "step": 8534 + }, + { + "epoch": 0.69, + "grad_norm": 3.093610412782277, + "learning_rate": 2.2716955916563544e-06, + "loss": 0.6581, + "step": 8535 + }, + { + "epoch": 0.69, + "grad_norm": 4.795755252504728, + "learning_rate": 2.270593411011141e-06, + "loss": 0.7487, + "step": 8536 + }, + { + "epoch": 0.69, + "grad_norm": 4.1270339416754664, + "learning_rate": 2.2694914192580506e-06, + "loss": 0.5363, + "step": 8537 + }, + { + "epoch": 0.69, + "grad_norm": 2.330987429467569, + "learning_rate": 2.2683896164733476e-06, + "loss": 0.649, + "step": 8538 + }, + { + "epoch": 0.69, + "grad_norm": 22.83871459355381, + "learning_rate": 2.267288002733283e-06, + "loss": 0.7177, + "step": 8539 + }, + { + "epoch": 0.69, + "grad_norm": 3.4255811193506145, + "learning_rate": 2.266186578114094e-06, + "loss": 0.6266, + "step": 8540 + }, + { + "epoch": 0.69, + "grad_norm": 6.116036650768977, + "learning_rate": 2.2650853426920065e-06, + "loss": 0.6037, + "step": 8541 + }, + { + "epoch": 0.69, + "grad_norm": 5.610822245688341, + "learning_rate": 2.2639842965432353e-06, + "loss": 0.5242, + "step": 8542 + }, + { + "epoch": 0.69, + "grad_norm": 5.34312530611171, + "learning_rate": 2.262883439743976e-06, + "loss": 0.6515, + "step": 8543 + }, + { + "epoch": 0.69, + "grad_norm": 4.056419835352032, + "learning_rate": 2.261782772370419e-06, + "loss": 0.5427, + "step": 8544 + }, + { + "epoch": 0.69, + "grad_norm": 5.037809901717693, + "learning_rate": 2.2606822944987357e-06, + "loss": 0.7306, + "step": 8545 + }, + { + "epoch": 0.69, + "grad_norm": 57.07562588312198, + "learning_rate": 2.2595820062050854e-06, + "loss": 0.6369, + "step": 8546 + }, + { + "epoch": 0.69, + "grad_norm": 3.970436896810602, + "learning_rate": 2.258481907565613e-06, + "loss": 0.6802, + "step": 8547 + }, + { + "epoch": 0.69, + "grad_norm": 4.020054873050856, + "learning_rate": 2.2573819986564576e-06, + "loss": 0.638, + "step": 8548 + }, + { + "epoch": 0.69, + "grad_norm": 6.9172050748144684, + "learning_rate": 2.2562822795537364e-06, + "loss": 0.5828, + "step": 8549 + }, + { + "epoch": 0.69, + "grad_norm": 4.212179949032205, + "learning_rate": 2.2551827503335556e-06, + "loss": 0.7656, + "step": 8550 + }, + { + "epoch": 0.69, + "grad_norm": 4.981269966080089, + "learning_rate": 2.254083411072013e-06, + "loss": 0.7586, + "step": 8551 + }, + { + "epoch": 0.69, + "grad_norm": 4.107953207312299, + "learning_rate": 2.252984261845188e-06, + "loss": 0.6383, + "step": 8552 + }, + { + "epoch": 0.69, + "grad_norm": 4.2747194570202565, + "learning_rate": 2.2518853027291487e-06, + "loss": 0.7714, + "step": 8553 + }, + { + "epoch": 0.69, + "grad_norm": 3.5880206085975415, + "learning_rate": 2.250786533799948e-06, + "loss": 0.7179, + "step": 8554 + }, + { + "epoch": 0.69, + "grad_norm": 3.37566109180717, + "learning_rate": 2.249687955133632e-06, + "loss": 0.6726, + "step": 8555 + }, + { + "epoch": 0.69, + "grad_norm": 4.778730200727405, + "learning_rate": 2.2485895668062263e-06, + "loss": 0.5623, + "step": 8556 + }, + { + "epoch": 0.69, + "grad_norm": 3.3290876360003567, + "learning_rate": 2.2474913688937457e-06, + "loss": 0.505, + "step": 8557 + }, + { + "epoch": 0.7, + "grad_norm": 7.21595884769128, + "learning_rate": 2.2463933614721965e-06, + "loss": 0.7445, + "step": 8558 + }, + { + "epoch": 0.7, + "grad_norm": 6.541253184190403, + "learning_rate": 2.245295544617562e-06, + "loss": 0.6479, + "step": 8559 + }, + { + "epoch": 0.7, + "grad_norm": 5.15514891370725, + "learning_rate": 2.2441979184058223e-06, + "loss": 0.4837, + "step": 8560 + }, + { + "epoch": 0.7, + "grad_norm": 4.535841138822577, + "learning_rate": 2.2431004829129368e-06, + "loss": 0.6225, + "step": 8561 + }, + { + "epoch": 0.7, + "grad_norm": 8.495294118664097, + "learning_rate": 2.2420032382148584e-06, + "loss": 0.6908, + "step": 8562 + }, + { + "epoch": 0.7, + "grad_norm": 6.302123796596489, + "learning_rate": 2.240906184387522e-06, + "loss": 0.8441, + "step": 8563 + }, + { + "epoch": 0.7, + "grad_norm": 4.272287677467053, + "learning_rate": 2.239809321506848e-06, + "loss": 0.6411, + "step": 8564 + }, + { + "epoch": 0.7, + "grad_norm": 4.84266452244706, + "learning_rate": 2.2387126496487526e-06, + "loss": 0.679, + "step": 8565 + }, + { + "epoch": 0.7, + "grad_norm": 3.412564497109921, + "learning_rate": 2.2376161688891247e-06, + "loss": 0.5688, + "step": 8566 + }, + { + "epoch": 0.7, + "grad_norm": 5.565136612770939, + "learning_rate": 2.2365198793038526e-06, + "loss": 0.532, + "step": 8567 + }, + { + "epoch": 0.7, + "grad_norm": 5.134678779141866, + "learning_rate": 2.2354237809688038e-06, + "loss": 0.609, + "step": 8568 + }, + { + "epoch": 0.7, + "grad_norm": 3.903154857176654, + "learning_rate": 2.234327873959839e-06, + "loss": 0.7834, + "step": 8569 + }, + { + "epoch": 0.7, + "grad_norm": 26.88418753654049, + "learning_rate": 2.233232158352799e-06, + "loss": 0.5264, + "step": 8570 + }, + { + "epoch": 0.7, + "grad_norm": 6.640729250089596, + "learning_rate": 2.2321366342235124e-06, + "loss": 0.793, + "step": 8571 + }, + { + "epoch": 0.7, + "grad_norm": 5.552826160781963, + "learning_rate": 2.2310413016478003e-06, + "loss": 0.6787, + "step": 8572 + }, + { + "epoch": 0.7, + "grad_norm": 3.1976662784308076, + "learning_rate": 2.2299461607014654e-06, + "loss": 0.6362, + "step": 8573 + }, + { + "epoch": 0.7, + "grad_norm": 3.83499189961259, + "learning_rate": 2.2288512114602986e-06, + "loss": 0.658, + "step": 8574 + }, + { + "epoch": 0.7, + "grad_norm": 3.1671346641349203, + "learning_rate": 2.2277564540000736e-06, + "loss": 0.6246, + "step": 8575 + }, + { + "epoch": 0.7, + "grad_norm": 21.794237250179386, + "learning_rate": 2.2266618883965597e-06, + "loss": 0.6937, + "step": 8576 + }, + { + "epoch": 0.7, + "grad_norm": 5.497314374601564, + "learning_rate": 2.2255675147255036e-06, + "loss": 0.727, + "step": 8577 + }, + { + "epoch": 0.7, + "grad_norm": 6.967444537740059, + "learning_rate": 2.2244733330626484e-06, + "loss": 0.876, + "step": 8578 + }, + { + "epoch": 0.7, + "grad_norm": 2.9057236235301986, + "learning_rate": 2.2233793434837108e-06, + "loss": 0.7735, + "step": 8579 + }, + { + "epoch": 0.7, + "grad_norm": 3.607714048234826, + "learning_rate": 2.222285546064408e-06, + "loss": 0.6462, + "step": 8580 + }, + { + "epoch": 0.7, + "grad_norm": 4.309307427657575, + "learning_rate": 2.2211919408804357e-06, + "loss": 0.7832, + "step": 8581 + }, + { + "epoch": 0.7, + "grad_norm": 4.5708567457274505, + "learning_rate": 2.220098528007475e-06, + "loss": 0.6513, + "step": 8582 + }, + { + "epoch": 0.7, + "grad_norm": 8.52777755423946, + "learning_rate": 2.2190053075212024e-06, + "loss": 0.6828, + "step": 8583 + }, + { + "epoch": 0.7, + "grad_norm": 6.924271821075603, + "learning_rate": 2.217912279497271e-06, + "loss": 0.6942, + "step": 8584 + }, + { + "epoch": 0.7, + "grad_norm": 4.572383361116938, + "learning_rate": 2.216819444011331e-06, + "loss": 0.496, + "step": 8585 + }, + { + "epoch": 0.7, + "grad_norm": 4.022850263665472, + "learning_rate": 2.2157268011390065e-06, + "loss": 0.6326, + "step": 8586 + }, + { + "epoch": 0.7, + "grad_norm": 11.49366626481796, + "learning_rate": 2.2146343509559205e-06, + "loss": 0.6046, + "step": 8587 + }, + { + "epoch": 0.7, + "grad_norm": 5.265303261808111, + "learning_rate": 2.213542093537675e-06, + "loss": 0.6467, + "step": 8588 + }, + { + "epoch": 0.7, + "grad_norm": 3.6295894233424613, + "learning_rate": 2.21245002895986e-06, + "loss": 0.5624, + "step": 8589 + }, + { + "epoch": 0.7, + "grad_norm": 8.489376458113789, + "learning_rate": 2.2113581572980568e-06, + "loss": 0.6682, + "step": 8590 + }, + { + "epoch": 0.7, + "grad_norm": 3.3694218358218158, + "learning_rate": 2.2102664786278276e-06, + "loss": 0.6715, + "step": 8591 + }, + { + "epoch": 0.7, + "grad_norm": 7.747761745052959, + "learning_rate": 2.2091749930247242e-06, + "loss": 0.6099, + "step": 8592 + }, + { + "epoch": 0.7, + "grad_norm": 5.337270094939879, + "learning_rate": 2.2080837005642813e-06, + "loss": 0.5992, + "step": 8593 + }, + { + "epoch": 0.7, + "grad_norm": 4.063349748652745, + "learning_rate": 2.206992601322028e-06, + "loss": 0.676, + "step": 8594 + }, + { + "epoch": 0.7, + "grad_norm": 4.038287272502322, + "learning_rate": 2.2059016953734723e-06, + "loss": 0.7289, + "step": 8595 + }, + { + "epoch": 0.7, + "grad_norm": 8.565532030679988, + "learning_rate": 2.204810982794111e-06, + "loss": 0.6195, + "step": 8596 + }, + { + "epoch": 0.7, + "grad_norm": 3.5109210707237546, + "learning_rate": 2.2037204636594316e-06, + "loss": 0.7609, + "step": 8597 + }, + { + "epoch": 0.7, + "grad_norm": 5.167555257794369, + "learning_rate": 2.2026301380449026e-06, + "loss": 0.6475, + "step": 8598 + }, + { + "epoch": 0.7, + "grad_norm": 4.215673434487035, + "learning_rate": 2.2015400060259824e-06, + "loss": 0.5562, + "step": 8599 + }, + { + "epoch": 0.7, + "grad_norm": 3.492322207632711, + "learning_rate": 2.200450067678112e-06, + "loss": 0.6747, + "step": 8600 + }, + { + "epoch": 0.7, + "grad_norm": 7.538421366908801, + "learning_rate": 2.199360323076726e-06, + "loss": 0.5867, + "step": 8601 + }, + { + "epoch": 0.7, + "grad_norm": 3.9624585278136246, + "learning_rate": 2.1982707722972383e-06, + "loss": 0.5787, + "step": 8602 + }, + { + "epoch": 0.7, + "grad_norm": 3.8517341600905386, + "learning_rate": 2.1971814154150562e-06, + "loss": 0.6847, + "step": 8603 + }, + { + "epoch": 0.7, + "grad_norm": 12.074191000544618, + "learning_rate": 2.1960922525055684e-06, + "loss": 0.7161, + "step": 8604 + }, + { + "epoch": 0.7, + "grad_norm": 3.4149049782222063, + "learning_rate": 2.195003283644151e-06, + "loss": 0.632, + "step": 8605 + }, + { + "epoch": 0.7, + "grad_norm": 3.9831308149650426, + "learning_rate": 2.1939145089061685e-06, + "loss": 0.6732, + "step": 8606 + }, + { + "epoch": 0.7, + "grad_norm": 4.9795316517203645, + "learning_rate": 2.1928259283669686e-06, + "loss": 0.6318, + "step": 8607 + }, + { + "epoch": 0.7, + "grad_norm": 4.914866971486496, + "learning_rate": 2.1917375421018914e-06, + "loss": 0.5883, + "step": 8608 + }, + { + "epoch": 0.7, + "grad_norm": 4.219046929349652, + "learning_rate": 2.1906493501862574e-06, + "loss": 0.7631, + "step": 8609 + }, + { + "epoch": 0.7, + "grad_norm": 3.589514452798457, + "learning_rate": 2.189561352695379e-06, + "loss": 0.6832, + "step": 8610 + }, + { + "epoch": 0.7, + "grad_norm": 3.6479538532774494, + "learning_rate": 2.188473549704551e-06, + "loss": 0.5522, + "step": 8611 + }, + { + "epoch": 0.7, + "grad_norm": 2.9681243550939893, + "learning_rate": 2.1873859412890565e-06, + "loss": 0.5881, + "step": 8612 + }, + { + "epoch": 0.7, + "grad_norm": 6.140650664220086, + "learning_rate": 2.186298527524164e-06, + "loss": 0.695, + "step": 8613 + }, + { + "epoch": 0.7, + "grad_norm": 3.973758915821884, + "learning_rate": 2.1852113084851286e-06, + "loss": 0.721, + "step": 8614 + }, + { + "epoch": 0.7, + "grad_norm": 3.94096062650955, + "learning_rate": 2.1841242842471955e-06, + "loss": 0.7336, + "step": 8615 + }, + { + "epoch": 0.7, + "grad_norm": 7.352036398785971, + "learning_rate": 2.1830374548855905e-06, + "loss": 0.719, + "step": 8616 + }, + { + "epoch": 0.7, + "grad_norm": 3.5623424192841093, + "learning_rate": 2.181950820475532e-06, + "loss": 0.5465, + "step": 8617 + }, + { + "epoch": 0.7, + "grad_norm": 2.663431350659676, + "learning_rate": 2.1808643810922207e-06, + "loss": 0.8053, + "step": 8618 + }, + { + "epoch": 0.7, + "grad_norm": 3.5390657890432964, + "learning_rate": 2.1797781368108458e-06, + "loss": 0.7685, + "step": 8619 + }, + { + "epoch": 0.7, + "grad_norm": 3.2513384100568024, + "learning_rate": 2.178692087706581e-06, + "loss": 0.8052, + "step": 8620 + }, + { + "epoch": 0.7, + "grad_norm": 5.376188954480933, + "learning_rate": 2.177606233854586e-06, + "loss": 0.783, + "step": 8621 + }, + { + "epoch": 0.7, + "grad_norm": 3.9474292900630097, + "learning_rate": 2.176520575330013e-06, + "loss": 0.5182, + "step": 8622 + }, + { + "epoch": 0.7, + "grad_norm": 4.129261860413936, + "learning_rate": 2.1754351122079926e-06, + "loss": 0.6717, + "step": 8623 + }, + { + "epoch": 0.7, + "grad_norm": 2.6080343585247787, + "learning_rate": 2.1743498445636492e-06, + "loss": 0.6319, + "step": 8624 + }, + { + "epoch": 0.7, + "grad_norm": 4.1331609567671945, + "learning_rate": 2.173264772472088e-06, + "loss": 0.6355, + "step": 8625 + }, + { + "epoch": 0.7, + "grad_norm": 7.064389889346811, + "learning_rate": 2.172179896008403e-06, + "loss": 0.6934, + "step": 8626 + }, + { + "epoch": 0.7, + "grad_norm": 9.705151724425937, + "learning_rate": 2.1710952152476732e-06, + "loss": 0.5991, + "step": 8627 + }, + { + "epoch": 0.7, + "grad_norm": 2.6314630520282916, + "learning_rate": 2.1700107302649686e-06, + "loss": 0.5289, + "step": 8628 + }, + { + "epoch": 0.7, + "grad_norm": 5.677628131094843, + "learning_rate": 2.16892644113534e-06, + "loss": 0.4892, + "step": 8629 + }, + { + "epoch": 0.7, + "grad_norm": 5.918374935852546, + "learning_rate": 2.167842347933826e-06, + "loss": 0.6989, + "step": 8630 + }, + { + "epoch": 0.7, + "grad_norm": 3.8988428176289256, + "learning_rate": 2.1667584507354584e-06, + "loss": 0.5952, + "step": 8631 + }, + { + "epoch": 0.7, + "grad_norm": 5.740871241603859, + "learning_rate": 2.165674749615242e-06, + "loss": 0.6833, + "step": 8632 + }, + { + "epoch": 0.7, + "grad_norm": 4.14773882994138, + "learning_rate": 2.1645912446481805e-06, + "loss": 0.7358, + "step": 8633 + }, + { + "epoch": 0.7, + "grad_norm": 4.255777688225042, + "learning_rate": 2.1635079359092566e-06, + "loss": 0.8461, + "step": 8634 + }, + { + "epoch": 0.7, + "grad_norm": 6.3807009075069825, + "learning_rate": 2.162424823473445e-06, + "loss": 0.813, + "step": 8635 + }, + { + "epoch": 0.7, + "grad_norm": 6.941771014334811, + "learning_rate": 2.1613419074157026e-06, + "loss": 0.7084, + "step": 8636 + }, + { + "epoch": 0.7, + "grad_norm": 16.63107834524546, + "learning_rate": 2.1602591878109724e-06, + "loss": 0.6655, + "step": 8637 + }, + { + "epoch": 0.7, + "grad_norm": 5.501016707335661, + "learning_rate": 2.1591766647341904e-06, + "loss": 0.5609, + "step": 8638 + }, + { + "epoch": 0.7, + "grad_norm": 4.267035702882348, + "learning_rate": 2.158094338260267e-06, + "loss": 0.6609, + "step": 8639 + }, + { + "epoch": 0.7, + "grad_norm": 3.5787856039940538, + "learning_rate": 2.157012208464111e-06, + "loss": 0.8027, + "step": 8640 + }, + { + "epoch": 0.7, + "grad_norm": 4.975055960926863, + "learning_rate": 2.1559302754206092e-06, + "loss": 0.7628, + "step": 8641 + }, + { + "epoch": 0.7, + "grad_norm": 5.782341608527495, + "learning_rate": 2.154848539204642e-06, + "loss": 0.7448, + "step": 8642 + }, + { + "epoch": 0.7, + "grad_norm": 4.432465579265484, + "learning_rate": 2.153766999891071e-06, + "loss": 0.8149, + "step": 8643 + }, + { + "epoch": 0.7, + "grad_norm": 4.611631649127151, + "learning_rate": 2.1526856575547444e-06, + "loss": 0.7374, + "step": 8644 + }, + { + "epoch": 0.7, + "grad_norm": 4.688766355622498, + "learning_rate": 2.151604512270499e-06, + "loss": 0.7187, + "step": 8645 + }, + { + "epoch": 0.7, + "grad_norm": 7.678656616077013, + "learning_rate": 2.1505235641131538e-06, + "loss": 0.5581, + "step": 8646 + }, + { + "epoch": 0.7, + "grad_norm": 3.782042452398749, + "learning_rate": 2.1494428131575218e-06, + "loss": 0.717, + "step": 8647 + }, + { + "epoch": 0.7, + "grad_norm": 5.055834975843569, + "learning_rate": 2.1483622594783937e-06, + "loss": 0.6277, + "step": 8648 + }, + { + "epoch": 0.7, + "grad_norm": 3.8526455857836583, + "learning_rate": 2.147281903150555e-06, + "loss": 0.7904, + "step": 8649 + }, + { + "epoch": 0.7, + "grad_norm": 12.190045974194316, + "learning_rate": 2.1462017442487688e-06, + "loss": 0.7114, + "step": 8650 + }, + { + "epoch": 0.7, + "grad_norm": 5.79155826084793, + "learning_rate": 2.1451217828477945e-06, + "loss": 0.6396, + "step": 8651 + }, + { + "epoch": 0.7, + "grad_norm": 3.6803239673866, + "learning_rate": 2.144042019022365e-06, + "loss": 0.7278, + "step": 8652 + }, + { + "epoch": 0.7, + "grad_norm": 2.700124218420424, + "learning_rate": 2.142962452847212e-06, + "loss": 0.6705, + "step": 8653 + }, + { + "epoch": 0.7, + "grad_norm": 3.4313381194175867, + "learning_rate": 2.141883084397047e-06, + "loss": 0.5537, + "step": 8654 + }, + { + "epoch": 0.7, + "grad_norm": 3.7096628004087755, + "learning_rate": 2.1408039137465664e-06, + "loss": 0.6182, + "step": 8655 + }, + { + "epoch": 0.7, + "grad_norm": 5.338593365876373, + "learning_rate": 2.1397249409704603e-06, + "loss": 0.7251, + "step": 8656 + }, + { + "epoch": 0.7, + "grad_norm": 2.836407930869923, + "learning_rate": 2.138646166143396e-06, + "loss": 0.6073, + "step": 8657 + }, + { + "epoch": 0.7, + "grad_norm": 3.5080304047599493, + "learning_rate": 2.1375675893400373e-06, + "loss": 0.5967, + "step": 8658 + }, + { + "epoch": 0.7, + "grad_norm": 3.020425501738531, + "learning_rate": 2.136489210635021e-06, + "loss": 0.6938, + "step": 8659 + }, + { + "epoch": 0.7, + "grad_norm": 2.991417668725965, + "learning_rate": 2.1354110301029834e-06, + "loss": 0.6449, + "step": 8660 + }, + { + "epoch": 0.7, + "grad_norm": 5.169612186110909, + "learning_rate": 2.1343330478185398e-06, + "loss": 0.508, + "step": 8661 + }, + { + "epoch": 0.7, + "grad_norm": 5.940615779555634, + "learning_rate": 2.13325526385629e-06, + "loss": 0.7991, + "step": 8662 + }, + { + "epoch": 0.7, + "grad_norm": 6.65234367608044, + "learning_rate": 2.13217767829083e-06, + "loss": 0.5911, + "step": 8663 + }, + { + "epoch": 0.7, + "grad_norm": 3.212048018037774, + "learning_rate": 2.131100291196731e-06, + "loss": 0.6948, + "step": 8664 + }, + { + "epoch": 0.7, + "grad_norm": 4.820565321202793, + "learning_rate": 2.130023102648556e-06, + "loss": 0.7052, + "step": 8665 + }, + { + "epoch": 0.7, + "grad_norm": 2.357035210674939, + "learning_rate": 2.128946112720851e-06, + "loss": 0.6269, + "step": 8666 + }, + { + "epoch": 0.7, + "grad_norm": 3.2865651071642765, + "learning_rate": 2.1278693214881552e-06, + "loss": 0.7986, + "step": 8667 + }, + { + "epoch": 0.7, + "grad_norm": 10.197509087785358, + "learning_rate": 2.126792729024986e-06, + "loss": 0.6764, + "step": 8668 + }, + { + "epoch": 0.7, + "grad_norm": 3.2599414332477554, + "learning_rate": 2.1257163354058502e-06, + "loss": 0.7318, + "step": 8669 + }, + { + "epoch": 0.7, + "grad_norm": 2.7568378724860976, + "learning_rate": 2.1246401407052437e-06, + "loss": 0.5436, + "step": 8670 + }, + { + "epoch": 0.7, + "grad_norm": 6.1365012097545755, + "learning_rate": 2.1235641449976437e-06, + "loss": 0.6607, + "step": 8671 + }, + { + "epoch": 0.7, + "grad_norm": 6.382133686428175, + "learning_rate": 2.1224883483575166e-06, + "loss": 0.8555, + "step": 8672 + }, + { + "epoch": 0.7, + "grad_norm": 3.198341717226936, + "learning_rate": 2.1214127508593124e-06, + "loss": 0.5251, + "step": 8673 + }, + { + "epoch": 0.7, + "grad_norm": 2.9528514812501467, + "learning_rate": 2.120337352577472e-06, + "loss": 0.6841, + "step": 8674 + }, + { + "epoch": 0.7, + "grad_norm": 3.811048466123399, + "learning_rate": 2.119262153586418e-06, + "loss": 0.7342, + "step": 8675 + }, + { + "epoch": 0.7, + "grad_norm": 4.047916400272804, + "learning_rate": 2.118187153960563e-06, + "loss": 0.6661, + "step": 8676 + }, + { + "epoch": 0.7, + "grad_norm": 5.788036242962474, + "learning_rate": 2.1171123537743023e-06, + "loss": 0.5965, + "step": 8677 + }, + { + "epoch": 0.7, + "grad_norm": 2.8984934838449035, + "learning_rate": 2.1160377531020185e-06, + "loss": 0.6307, + "step": 8678 + }, + { + "epoch": 0.7, + "grad_norm": 3.2751604517817663, + "learning_rate": 2.1149633520180813e-06, + "loss": 0.5921, + "step": 8679 + }, + { + "epoch": 0.7, + "grad_norm": 5.734705131610686, + "learning_rate": 2.1138891505968444e-06, + "loss": 0.6421, + "step": 8680 + }, + { + "epoch": 0.71, + "grad_norm": 3.523416707367139, + "learning_rate": 2.112815148912652e-06, + "loss": 0.6321, + "step": 8681 + }, + { + "epoch": 0.71, + "grad_norm": 9.018799935431145, + "learning_rate": 2.111741347039829e-06, + "loss": 0.6609, + "step": 8682 + }, + { + "epoch": 0.71, + "grad_norm": 3.0199819550339067, + "learning_rate": 2.110667745052693e-06, + "loss": 0.5086, + "step": 8683 + }, + { + "epoch": 0.71, + "grad_norm": 6.709487236395212, + "learning_rate": 2.109594343025541e-06, + "loss": 0.591, + "step": 8684 + }, + { + "epoch": 0.71, + "grad_norm": 3.3334942662345957, + "learning_rate": 2.1085211410326605e-06, + "loss": 0.7547, + "step": 8685 + }, + { + "epoch": 0.71, + "grad_norm": 2.9600302950983965, + "learning_rate": 2.1074481391483233e-06, + "loss": 0.6807, + "step": 8686 + }, + { + "epoch": 0.71, + "grad_norm": 4.127885370736339, + "learning_rate": 2.1063753374467854e-06, + "loss": 0.6041, + "step": 8687 + }, + { + "epoch": 0.71, + "grad_norm": 3.4425164921147737, + "learning_rate": 2.1053027360022965e-06, + "loss": 0.8104, + "step": 8688 + }, + { + "epoch": 0.71, + "grad_norm": 3.9323000269657804, + "learning_rate": 2.1042303348890825e-06, + "loss": 0.653, + "step": 8689 + }, + { + "epoch": 0.71, + "grad_norm": 2.569468756731211, + "learning_rate": 2.1031581341813646e-06, + "loss": 0.6515, + "step": 8690 + }, + { + "epoch": 0.71, + "grad_norm": 9.363600749813932, + "learning_rate": 2.1020861339533438e-06, + "loss": 0.6311, + "step": 8691 + }, + { + "epoch": 0.71, + "grad_norm": 4.137352027724873, + "learning_rate": 2.1010143342792096e-06, + "loss": 0.6009, + "step": 8692 + }, + { + "epoch": 0.71, + "grad_norm": 7.151476899836574, + "learning_rate": 2.099942735233136e-06, + "loss": 0.7406, + "step": 8693 + }, + { + "epoch": 0.71, + "grad_norm": 5.151903497882219, + "learning_rate": 2.0988713368892848e-06, + "loss": 0.7441, + "step": 8694 + }, + { + "epoch": 0.71, + "grad_norm": 8.598946775632346, + "learning_rate": 2.0978001393218054e-06, + "loss": 0.54, + "step": 8695 + }, + { + "epoch": 0.71, + "grad_norm": 4.578597796144323, + "learning_rate": 2.0967291426048288e-06, + "loss": 0.6434, + "step": 8696 + }, + { + "epoch": 0.71, + "grad_norm": 8.592868960907758, + "learning_rate": 2.0956583468124787e-06, + "loss": 0.6214, + "step": 8697 + }, + { + "epoch": 0.71, + "grad_norm": 14.500333153251693, + "learning_rate": 2.0945877520188552e-06, + "loss": 0.6546, + "step": 8698 + }, + { + "epoch": 0.71, + "grad_norm": 6.461235125731297, + "learning_rate": 2.093517358298055e-06, + "loss": 0.6259, + "step": 8699 + }, + { + "epoch": 0.71, + "grad_norm": 3.2599787728836094, + "learning_rate": 2.0924471657241526e-06, + "loss": 0.6931, + "step": 8700 + }, + { + "epoch": 0.71, + "grad_norm": 4.762429091968062, + "learning_rate": 2.091377174371215e-06, + "loss": 0.5189, + "step": 8701 + }, + { + "epoch": 0.71, + "grad_norm": 6.4486011061203925, + "learning_rate": 2.090307384313292e-06, + "loss": 0.7062, + "step": 8702 + }, + { + "epoch": 0.71, + "grad_norm": 6.487457445290819, + "learning_rate": 2.089237795624417e-06, + "loss": 0.5634, + "step": 8703 + }, + { + "epoch": 0.71, + "grad_norm": 2.5894098966713157, + "learning_rate": 2.0881684083786173e-06, + "loss": 0.5507, + "step": 8704 + }, + { + "epoch": 0.71, + "grad_norm": 7.5365149028032885, + "learning_rate": 2.0870992226498947e-06, + "loss": 0.747, + "step": 8705 + }, + { + "epoch": 0.71, + "grad_norm": 2.9580458546394914, + "learning_rate": 2.0860302385122493e-06, + "loss": 0.7489, + "step": 8706 + }, + { + "epoch": 0.71, + "grad_norm": 2.929235240916999, + "learning_rate": 2.084961456039657e-06, + "loss": 0.5252, + "step": 8707 + }, + { + "epoch": 0.71, + "grad_norm": 4.994150618570016, + "learning_rate": 2.0838928753060887e-06, + "loss": 0.5555, + "step": 8708 + }, + { + "epoch": 0.71, + "grad_norm": 28.759874795858693, + "learning_rate": 2.082824496385494e-06, + "loss": 0.5985, + "step": 8709 + }, + { + "epoch": 0.71, + "grad_norm": 4.379750912784285, + "learning_rate": 2.0817563193518115e-06, + "loss": 0.7479, + "step": 8710 + }, + { + "epoch": 0.71, + "grad_norm": 15.810220790301898, + "learning_rate": 2.0806883442789694e-06, + "loss": 0.695, + "step": 8711 + }, + { + "epoch": 0.71, + "grad_norm": 3.2393711778677816, + "learning_rate": 2.0796205712408718e-06, + "loss": 0.6269, + "step": 8712 + }, + { + "epoch": 0.71, + "grad_norm": 2.8221422208054125, + "learning_rate": 2.0785530003114206e-06, + "loss": 0.7105, + "step": 8713 + }, + { + "epoch": 0.71, + "grad_norm": 4.147768406746869, + "learning_rate": 2.0774856315644955e-06, + "loss": 0.7037, + "step": 8714 + }, + { + "epoch": 0.71, + "grad_norm": 4.075343137593721, + "learning_rate": 2.0764184650739677e-06, + "loss": 0.6125, + "step": 8715 + }, + { + "epoch": 0.71, + "grad_norm": 3.809925517865381, + "learning_rate": 2.0753515009136905e-06, + "loss": 0.7524, + "step": 8716 + }, + { + "epoch": 0.71, + "grad_norm": 2.5202275142128707, + "learning_rate": 2.074284739157505e-06, + "loss": 0.7352, + "step": 8717 + }, + { + "epoch": 0.71, + "grad_norm": 4.356608614327902, + "learning_rate": 2.0732181798792366e-06, + "loss": 0.603, + "step": 8718 + }, + { + "epoch": 0.71, + "grad_norm": 3.1177854093008612, + "learning_rate": 2.0721518231526977e-06, + "loss": 0.7067, + "step": 8719 + }, + { + "epoch": 0.71, + "grad_norm": 4.264112895053691, + "learning_rate": 2.0710856690516893e-06, + "loss": 0.5412, + "step": 8720 + }, + { + "epoch": 0.71, + "grad_norm": 4.52024655669802, + "learning_rate": 2.0700197176499927e-06, + "loss": 0.6411, + "step": 8721 + }, + { + "epoch": 0.71, + "grad_norm": 4.6499806411241495, + "learning_rate": 2.0689539690213823e-06, + "loss": 0.7169, + "step": 8722 + }, + { + "epoch": 0.71, + "grad_norm": 5.413234635787713, + "learning_rate": 2.0678884232396106e-06, + "loss": 0.707, + "step": 8723 + }, + { + "epoch": 0.71, + "grad_norm": 3.5517990822081735, + "learning_rate": 2.066823080378426e-06, + "loss": 0.7047, + "step": 8724 + }, + { + "epoch": 0.71, + "grad_norm": 4.956214019018222, + "learning_rate": 2.065757940511549e-06, + "loss": 0.5625, + "step": 8725 + }, + { + "epoch": 0.71, + "grad_norm": 4.4319365568827545, + "learning_rate": 2.0646930037127003e-06, + "loss": 0.7116, + "step": 8726 + }, + { + "epoch": 0.71, + "grad_norm": 3.333851219894353, + "learning_rate": 2.0636282700555775e-06, + "loss": 0.6362, + "step": 8727 + }, + { + "epoch": 0.71, + "grad_norm": 5.577300691087735, + "learning_rate": 2.0625637396138666e-06, + "loss": 0.6129, + "step": 8728 + }, + { + "epoch": 0.71, + "grad_norm": 13.152794198042924, + "learning_rate": 2.0614994124612413e-06, + "loss": 0.6509, + "step": 8729 + }, + { + "epoch": 0.71, + "grad_norm": 5.161030402290788, + "learning_rate": 2.0604352886713574e-06, + "loss": 0.7231, + "step": 8730 + }, + { + "epoch": 0.71, + "grad_norm": 3.1113872836751413, + "learning_rate": 2.059371368317864e-06, + "loss": 0.6442, + "step": 8731 + }, + { + "epoch": 0.71, + "grad_norm": 3.6569494167890872, + "learning_rate": 2.0583076514743844e-06, + "loss": 0.7688, + "step": 8732 + }, + { + "epoch": 0.71, + "grad_norm": 2.7580933591632664, + "learning_rate": 2.0572441382145397e-06, + "loss": 0.6358, + "step": 8733 + }, + { + "epoch": 0.71, + "grad_norm": 2.8104458085992507, + "learning_rate": 2.0561808286119294e-06, + "loss": 0.6583, + "step": 8734 + }, + { + "epoch": 0.71, + "grad_norm": 4.366274906959839, + "learning_rate": 2.0551177227401397e-06, + "loss": 0.6322, + "step": 8735 + }, + { + "epoch": 0.71, + "grad_norm": 4.083506917762391, + "learning_rate": 2.054054820672748e-06, + "loss": 0.6904, + "step": 8736 + }, + { + "epoch": 0.71, + "grad_norm": 7.218458812277885, + "learning_rate": 2.052992122483312e-06, + "loss": 0.5426, + "step": 8737 + }, + { + "epoch": 0.71, + "grad_norm": 2.8177185258560553, + "learning_rate": 2.051929628245377e-06, + "loss": 0.6697, + "step": 8738 + }, + { + "epoch": 0.71, + "grad_norm": 3.3944690494797713, + "learning_rate": 2.0508673380324723e-06, + "loss": 0.5708, + "step": 8739 + }, + { + "epoch": 0.71, + "grad_norm": 3.0659302369352384, + "learning_rate": 2.0498052519181193e-06, + "loss": 0.7051, + "step": 8740 + }, + { + "epoch": 0.71, + "grad_norm": 3.649979616514557, + "learning_rate": 2.0487433699758184e-06, + "loss": 0.7239, + "step": 8741 + }, + { + "epoch": 0.71, + "grad_norm": 4.11572919664713, + "learning_rate": 2.0476816922790575e-06, + "loss": 0.6405, + "step": 8742 + }, + { + "epoch": 0.71, + "grad_norm": 3.415681130598828, + "learning_rate": 2.0466202189013145e-06, + "loss": 0.6504, + "step": 8743 + }, + { + "epoch": 0.71, + "grad_norm": 7.959075286556643, + "learning_rate": 2.0455589499160484e-06, + "loss": 0.6164, + "step": 8744 + }, + { + "epoch": 0.71, + "grad_norm": 2.614939214660841, + "learning_rate": 2.0444978853967057e-06, + "loss": 0.6671, + "step": 8745 + }, + { + "epoch": 0.71, + "grad_norm": 3.519863362194282, + "learning_rate": 2.0434370254167166e-06, + "loss": 0.6475, + "step": 8746 + }, + { + "epoch": 0.71, + "grad_norm": 2.4098669032329227, + "learning_rate": 2.0423763700495037e-06, + "loss": 0.4122, + "step": 8747 + }, + { + "epoch": 0.71, + "grad_norm": 3.360617735215812, + "learning_rate": 2.041315919368466e-06, + "loss": 0.796, + "step": 8748 + }, + { + "epoch": 0.71, + "grad_norm": 3.135359601540009, + "learning_rate": 2.040255673446999e-06, + "loss": 0.621, + "step": 8749 + }, + { + "epoch": 0.71, + "grad_norm": 2.9458632091083037, + "learning_rate": 2.039195632358475e-06, + "loss": 0.7715, + "step": 8750 + }, + { + "epoch": 0.71, + "grad_norm": 3.31503705409151, + "learning_rate": 2.038135796176256e-06, + "loss": 0.7427, + "step": 8751 + }, + { + "epoch": 0.71, + "grad_norm": 5.172875999063851, + "learning_rate": 2.0370761649736892e-06, + "loss": 0.6782, + "step": 8752 + }, + { + "epoch": 0.71, + "grad_norm": 3.0013392021312173, + "learning_rate": 2.0360167388241063e-06, + "loss": 0.7386, + "step": 8753 + }, + { + "epoch": 0.71, + "grad_norm": 2.9064158610937847, + "learning_rate": 2.0349575178008298e-06, + "loss": 0.8823, + "step": 8754 + }, + { + "epoch": 0.71, + "grad_norm": 5.832432484605943, + "learning_rate": 2.0338985019771606e-06, + "loss": 0.6298, + "step": 8755 + }, + { + "epoch": 0.71, + "grad_norm": 9.547488560837657, + "learning_rate": 2.0328396914263925e-06, + "loss": 0.7966, + "step": 8756 + }, + { + "epoch": 0.71, + "grad_norm": 7.685151162193827, + "learning_rate": 2.031781086221801e-06, + "loss": 0.6159, + "step": 8757 + }, + { + "epoch": 0.71, + "grad_norm": 3.9834943806801983, + "learning_rate": 2.0307226864366483e-06, + "loss": 0.6155, + "step": 8758 + }, + { + "epoch": 0.71, + "grad_norm": 2.958339369303112, + "learning_rate": 2.029664492144181e-06, + "loss": 0.5695, + "step": 8759 + }, + { + "epoch": 0.71, + "grad_norm": 2.355163702464426, + "learning_rate": 2.028606503417632e-06, + "loss": 0.5086, + "step": 8760 + }, + { + "epoch": 0.71, + "grad_norm": 11.997588756694606, + "learning_rate": 2.027548720330224e-06, + "loss": 0.6777, + "step": 8761 + }, + { + "epoch": 0.71, + "grad_norm": 2.966795201439225, + "learning_rate": 2.026491142955159e-06, + "loss": 0.5589, + "step": 8762 + }, + { + "epoch": 0.71, + "grad_norm": 2.351763077065818, + "learning_rate": 2.025433771365632e-06, + "loss": 0.5699, + "step": 8763 + }, + { + "epoch": 0.71, + "grad_norm": 4.301127495955502, + "learning_rate": 2.0243766056348167e-06, + "loss": 0.7164, + "step": 8764 + }, + { + "epoch": 0.71, + "grad_norm": 2.775612509667861, + "learning_rate": 2.0233196458358773e-06, + "loss": 0.7505, + "step": 8765 + }, + { + "epoch": 0.71, + "grad_norm": 2.902540412913684, + "learning_rate": 2.02226289204196e-06, + "loss": 0.5775, + "step": 8766 + }, + { + "epoch": 0.71, + "grad_norm": 4.592198878891836, + "learning_rate": 2.021206344326199e-06, + "loss": 0.5671, + "step": 8767 + }, + { + "epoch": 0.71, + "grad_norm": 2.9187244078518146, + "learning_rate": 2.0201500027617167e-06, + "loss": 0.7872, + "step": 8768 + }, + { + "epoch": 0.71, + "grad_norm": 3.130563729588463, + "learning_rate": 2.0190938674216146e-06, + "loss": 0.5342, + "step": 8769 + }, + { + "epoch": 0.71, + "grad_norm": 3.482246278832916, + "learning_rate": 2.0180379383789907e-06, + "loss": 0.5205, + "step": 8770 + }, + { + "epoch": 0.71, + "grad_norm": 4.352517286409447, + "learning_rate": 2.016982215706913e-06, + "loss": 0.7255, + "step": 8771 + }, + { + "epoch": 0.71, + "grad_norm": 3.6667276761978616, + "learning_rate": 2.0159266994784504e-06, + "loss": 0.6406, + "step": 8772 + }, + { + "epoch": 0.71, + "grad_norm": 8.038854512780896, + "learning_rate": 2.0148713897666485e-06, + "loss": 0.5379, + "step": 8773 + }, + { + "epoch": 0.71, + "grad_norm": 2.8582175226159774, + "learning_rate": 2.013816286644543e-06, + "loss": 0.7832, + "step": 8774 + }, + { + "epoch": 0.71, + "grad_norm": 3.1584807172865745, + "learning_rate": 2.0127613901851537e-06, + "loss": 0.8039, + "step": 8775 + }, + { + "epoch": 0.71, + "grad_norm": 4.71440524692816, + "learning_rate": 2.0117067004614838e-06, + "loss": 0.6369, + "step": 8776 + }, + { + "epoch": 0.71, + "grad_norm": 3.382746967430863, + "learning_rate": 2.0106522175465292e-06, + "loss": 0.671, + "step": 8777 + }, + { + "epoch": 0.71, + "grad_norm": 3.1467605862658012, + "learning_rate": 2.0095979415132603e-06, + "loss": 0.6567, + "step": 8778 + }, + { + "epoch": 0.71, + "grad_norm": 3.505394019386553, + "learning_rate": 2.0085438724346446e-06, + "loss": 0.7686, + "step": 8779 + }, + { + "epoch": 0.71, + "grad_norm": 3.236953113885481, + "learning_rate": 2.007490010383627e-06, + "loss": 0.5824, + "step": 8780 + }, + { + "epoch": 0.71, + "grad_norm": 5.154553815704456, + "learning_rate": 2.006436355433145e-06, + "loss": 0.6867, + "step": 8781 + }, + { + "epoch": 0.71, + "grad_norm": 5.033077935833127, + "learning_rate": 2.0053829076561158e-06, + "loss": 0.7493, + "step": 8782 + }, + { + "epoch": 0.71, + "grad_norm": 13.297693329221463, + "learning_rate": 2.004329667125444e-06, + "loss": 0.6757, + "step": 8783 + }, + { + "epoch": 0.71, + "grad_norm": 9.07637789094105, + "learning_rate": 2.0032766339140246e-06, + "loss": 0.5927, + "step": 8784 + }, + { + "epoch": 0.71, + "grad_norm": 3.227209380649744, + "learning_rate": 2.0022238080947275e-06, + "loss": 0.6673, + "step": 8785 + }, + { + "epoch": 0.71, + "grad_norm": 3.832185161291698, + "learning_rate": 2.0011711897404207e-06, + "loss": 0.6972, + "step": 8786 + }, + { + "epoch": 0.71, + "grad_norm": 2.8703507096471452, + "learning_rate": 2.000118778923947e-06, + "loss": 0.5868, + "step": 8787 + }, + { + "epoch": 0.71, + "grad_norm": 7.193109091535448, + "learning_rate": 1.9990665757181455e-06, + "loss": 0.6396, + "step": 8788 + }, + { + "epoch": 0.71, + "grad_norm": 2.756965544552875, + "learning_rate": 1.9980145801958316e-06, + "loss": 0.7477, + "step": 8789 + }, + { + "epoch": 0.71, + "grad_norm": 4.567258164627998, + "learning_rate": 1.9969627924298114e-06, + "loss": 0.7473, + "step": 8790 + }, + { + "epoch": 0.71, + "grad_norm": 16.96634245215981, + "learning_rate": 1.9959112124928743e-06, + "loss": 0.8128, + "step": 8791 + }, + { + "epoch": 0.71, + "grad_norm": 4.326706004376088, + "learning_rate": 1.9948598404577944e-06, + "loss": 0.6632, + "step": 8792 + }, + { + "epoch": 0.71, + "grad_norm": 2.531978564691185, + "learning_rate": 1.993808676397338e-06, + "loss": 0.7591, + "step": 8793 + }, + { + "epoch": 0.71, + "grad_norm": 7.527630960837817, + "learning_rate": 1.992757720384248e-06, + "loss": 0.6943, + "step": 8794 + }, + { + "epoch": 0.71, + "grad_norm": 6.343451868325262, + "learning_rate": 1.9917069724912603e-06, + "loss": 0.6974, + "step": 8795 + }, + { + "epoch": 0.71, + "grad_norm": 2.471764167291829, + "learning_rate": 1.990656432791092e-06, + "loss": 0.6072, + "step": 8796 + }, + { + "epoch": 0.71, + "grad_norm": 8.582828636829632, + "learning_rate": 1.9896061013564467e-06, + "loss": 0.5871, + "step": 8797 + }, + { + "epoch": 0.71, + "grad_norm": 2.4587343788493907, + "learning_rate": 1.988555978260013e-06, + "loss": 0.5145, + "step": 8798 + }, + { + "epoch": 0.71, + "grad_norm": 2.9815703130217637, + "learning_rate": 1.987506063574468e-06, + "loss": 0.6576, + "step": 8799 + }, + { + "epoch": 0.71, + "grad_norm": 3.2030165514381377, + "learning_rate": 1.9864563573724725e-06, + "loss": 0.6835, + "step": 8800 + }, + { + "epoch": 0.71, + "grad_norm": 7.861961822558024, + "learning_rate": 1.98540685972667e-06, + "loss": 0.6036, + "step": 8801 + }, + { + "epoch": 0.71, + "grad_norm": 3.20920395854436, + "learning_rate": 1.9843575707096955e-06, + "loss": 0.7583, + "step": 8802 + }, + { + "epoch": 0.71, + "grad_norm": 11.64728827343324, + "learning_rate": 1.9833084903941657e-06, + "loss": 0.624, + "step": 8803 + }, + { + "epoch": 0.72, + "grad_norm": 2.9001418924202986, + "learning_rate": 1.9822596188526834e-06, + "loss": 0.6414, + "step": 8804 + }, + { + "epoch": 0.72, + "grad_norm": 12.843775200413022, + "learning_rate": 1.981210956157834e-06, + "loss": 0.7241, + "step": 8805 + }, + { + "epoch": 0.72, + "grad_norm": 4.864499375595119, + "learning_rate": 1.9801625023821968e-06, + "loss": 0.6794, + "step": 8806 + }, + { + "epoch": 0.72, + "grad_norm": 3.8144753717202486, + "learning_rate": 1.9791142575983286e-06, + "loss": 0.6899, + "step": 8807 + }, + { + "epoch": 0.72, + "grad_norm": 2.98050843228975, + "learning_rate": 1.9780662218787733e-06, + "loss": 0.6075, + "step": 8808 + }, + { + "epoch": 0.72, + "grad_norm": 5.571411515945465, + "learning_rate": 1.977018395296064e-06, + "loss": 0.5819, + "step": 8809 + }, + { + "epoch": 0.72, + "grad_norm": 6.47989052207137, + "learning_rate": 1.975970777922717e-06, + "loss": 0.6682, + "step": 8810 + }, + { + "epoch": 0.72, + "grad_norm": 2.6616961134977277, + "learning_rate": 1.9749233698312327e-06, + "loss": 0.6291, + "step": 8811 + }, + { + "epoch": 0.72, + "grad_norm": 8.259310514297201, + "learning_rate": 1.973876171094097e-06, + "loss": 0.7177, + "step": 8812 + }, + { + "epoch": 0.72, + "grad_norm": 7.500422836903766, + "learning_rate": 1.9728291817837857e-06, + "loss": 0.6698, + "step": 8813 + }, + { + "epoch": 0.72, + "grad_norm": 5.085636074316168, + "learning_rate": 1.9717824019727567e-06, + "loss": 0.6926, + "step": 8814 + }, + { + "epoch": 0.72, + "grad_norm": 3.6010372384968923, + "learning_rate": 1.9707358317334497e-06, + "loss": 0.7064, + "step": 8815 + }, + { + "epoch": 0.72, + "grad_norm": 6.570934873563309, + "learning_rate": 1.9696894711382997e-06, + "loss": 0.621, + "step": 8816 + }, + { + "epoch": 0.72, + "grad_norm": 6.828100193374712, + "learning_rate": 1.9686433202597178e-06, + "loss": 0.6842, + "step": 8817 + }, + { + "epoch": 0.72, + "grad_norm": 4.159167754686179, + "learning_rate": 1.9675973791701057e-06, + "loss": 0.7297, + "step": 8818 + }, + { + "epoch": 0.72, + "grad_norm": 4.39609840413412, + "learning_rate": 1.966551647941847e-06, + "loss": 0.7244, + "step": 8819 + }, + { + "epoch": 0.72, + "grad_norm": 2.187336405449309, + "learning_rate": 1.9655061266473158e-06, + "loss": 0.6542, + "step": 8820 + }, + { + "epoch": 0.72, + "grad_norm": 4.371349603679339, + "learning_rate": 1.9644608153588674e-06, + "loss": 0.7111, + "step": 8821 + }, + { + "epoch": 0.72, + "grad_norm": 3.23061652058898, + "learning_rate": 1.963415714148842e-06, + "loss": 0.7292, + "step": 8822 + }, + { + "epoch": 0.72, + "grad_norm": 8.340844833874831, + "learning_rate": 1.962370823089571e-06, + "loss": 0.6715, + "step": 8823 + }, + { + "epoch": 0.72, + "grad_norm": 11.777039717234098, + "learning_rate": 1.9613261422533657e-06, + "loss": 0.6475, + "step": 8824 + }, + { + "epoch": 0.72, + "grad_norm": 6.288982565294828, + "learning_rate": 1.9602816717125243e-06, + "loss": 0.619, + "step": 8825 + }, + { + "epoch": 0.72, + "grad_norm": 3.5372321756169103, + "learning_rate": 1.9592374115393293e-06, + "loss": 0.6137, + "step": 8826 + }, + { + "epoch": 0.72, + "grad_norm": 2.401546727963931, + "learning_rate": 1.958193361806053e-06, + "loss": 0.6387, + "step": 8827 + }, + { + "epoch": 0.72, + "grad_norm": 4.502049295304319, + "learning_rate": 1.9571495225849475e-06, + "loss": 0.5887, + "step": 8828 + }, + { + "epoch": 0.72, + "grad_norm": 3.0368141758375082, + "learning_rate": 1.9561058939482562e-06, + "loss": 0.6378, + "step": 8829 + }, + { + "epoch": 0.72, + "grad_norm": 5.279880187545452, + "learning_rate": 1.9550624759682028e-06, + "loss": 0.6048, + "step": 8830 + }, + { + "epoch": 0.72, + "grad_norm": 3.459920744456565, + "learning_rate": 1.9540192687169984e-06, + "loss": 0.6234, + "step": 8831 + }, + { + "epoch": 0.72, + "grad_norm": 2.6907751830788924, + "learning_rate": 1.95297627226684e-06, + "loss": 0.7046, + "step": 8832 + }, + { + "epoch": 0.72, + "grad_norm": 3.8818150432192287, + "learning_rate": 1.951933486689907e-06, + "loss": 0.5922, + "step": 8833 + }, + { + "epoch": 0.72, + "grad_norm": 3.2705707612512147, + "learning_rate": 1.9508909120583715e-06, + "loss": 0.5511, + "step": 8834 + }, + { + "epoch": 0.72, + "grad_norm": 4.139808327956517, + "learning_rate": 1.9498485484443813e-06, + "loss": 0.7152, + "step": 8835 + }, + { + "epoch": 0.72, + "grad_norm": 6.001292193704591, + "learning_rate": 1.948806395920079e-06, + "loss": 0.6956, + "step": 8836 + }, + { + "epoch": 0.72, + "grad_norm": 4.080190400459213, + "learning_rate": 1.947764454557585e-06, + "loss": 0.7407, + "step": 8837 + }, + { + "epoch": 0.72, + "grad_norm": 3.615498639777865, + "learning_rate": 1.9467227244290105e-06, + "loss": 0.7361, + "step": 8838 + }, + { + "epoch": 0.72, + "grad_norm": 5.0511357492068525, + "learning_rate": 1.945681205606448e-06, + "loss": 0.6729, + "step": 8839 + }, + { + "epoch": 0.72, + "grad_norm": 3.533593971268565, + "learning_rate": 1.9446398981619757e-06, + "loss": 0.4941, + "step": 8840 + }, + { + "epoch": 0.72, + "grad_norm": 4.733292801297826, + "learning_rate": 1.9435988021676626e-06, + "loss": 0.6545, + "step": 8841 + }, + { + "epoch": 0.72, + "grad_norm": 9.594415058582966, + "learning_rate": 1.942557917695555e-06, + "loss": 0.5628, + "step": 8842 + }, + { + "epoch": 0.72, + "grad_norm": 3.1979531907412144, + "learning_rate": 1.9415172448176945e-06, + "loss": 0.7919, + "step": 8843 + }, + { + "epoch": 0.72, + "grad_norm": 2.98315139661177, + "learning_rate": 1.940476783606095e-06, + "loss": 0.6954, + "step": 8844 + }, + { + "epoch": 0.72, + "grad_norm": 4.848243160187772, + "learning_rate": 1.939436534132768e-06, + "loss": 0.7195, + "step": 8845 + }, + { + "epoch": 0.72, + "grad_norm": 4.1605302466262675, + "learning_rate": 1.938396496469704e-06, + "loss": 0.6755, + "step": 8846 + }, + { + "epoch": 0.72, + "grad_norm": 14.056649281012927, + "learning_rate": 1.937356670688878e-06, + "loss": 0.6979, + "step": 8847 + }, + { + "epoch": 0.72, + "grad_norm": 8.780443747167748, + "learning_rate": 1.936317056862256e-06, + "loss": 0.6964, + "step": 8848 + }, + { + "epoch": 0.72, + "grad_norm": 3.3643636349117223, + "learning_rate": 1.9352776550617824e-06, + "loss": 0.6031, + "step": 8849 + }, + { + "epoch": 0.72, + "grad_norm": 3.3227013309111584, + "learning_rate": 1.934238465359396e-06, + "loss": 0.5447, + "step": 8850 + }, + { + "epoch": 0.72, + "grad_norm": 3.135112387352472, + "learning_rate": 1.9331994878270077e-06, + "loss": 0.6779, + "step": 8851 + }, + { + "epoch": 0.72, + "grad_norm": 9.366334045097481, + "learning_rate": 1.9321607225365267e-06, + "loss": 0.7106, + "step": 8852 + }, + { + "epoch": 0.72, + "grad_norm": 3.517658218963076, + "learning_rate": 1.931122169559839e-06, + "loss": 0.557, + "step": 8853 + }, + { + "epoch": 0.72, + "grad_norm": 3.5169309975828154, + "learning_rate": 1.9300838289688216e-06, + "loss": 0.5963, + "step": 8854 + }, + { + "epoch": 0.72, + "grad_norm": 6.773285088108598, + "learning_rate": 1.9290457008353336e-06, + "loss": 0.6356, + "step": 8855 + }, + { + "epoch": 0.72, + "grad_norm": 7.5664047014484, + "learning_rate": 1.9280077852312194e-06, + "loss": 0.7356, + "step": 8856 + }, + { + "epoch": 0.72, + "grad_norm": 3.908910187668835, + "learning_rate": 1.926970082228309e-06, + "loss": 0.5346, + "step": 8857 + }, + { + "epoch": 0.72, + "grad_norm": 3.970747605647887, + "learning_rate": 1.9259325918984167e-06, + "loss": 0.6697, + "step": 8858 + }, + { + "epoch": 0.72, + "grad_norm": 8.903257348224061, + "learning_rate": 1.924895314313347e-06, + "loss": 0.659, + "step": 8859 + }, + { + "epoch": 0.72, + "grad_norm": 2.5131874768384823, + "learning_rate": 1.9238582495448814e-06, + "loss": 0.783, + "step": 8860 + }, + { + "epoch": 0.72, + "grad_norm": 3.2968781216152383, + "learning_rate": 1.9228213976647964e-06, + "loss": 0.5727, + "step": 8861 + }, + { + "epoch": 0.72, + "grad_norm": 4.186467857850592, + "learning_rate": 1.9217847587448464e-06, + "loss": 0.7955, + "step": 8862 + }, + { + "epoch": 0.72, + "grad_norm": 4.175171325936079, + "learning_rate": 1.9207483328567726e-06, + "loss": 0.732, + "step": 8863 + }, + { + "epoch": 0.72, + "grad_norm": 3.1490724095877782, + "learning_rate": 1.919712120072303e-06, + "loss": 0.6484, + "step": 8864 + }, + { + "epoch": 0.72, + "grad_norm": 2.9071704239129392, + "learning_rate": 1.9186761204631476e-06, + "loss": 0.4801, + "step": 8865 + }, + { + "epoch": 0.72, + "grad_norm": 2.3813989347091855, + "learning_rate": 1.9176403341010087e-06, + "loss": 0.6109, + "step": 8866 + }, + { + "epoch": 0.72, + "grad_norm": 2.8342140976000767, + "learning_rate": 1.9166047610575646e-06, + "loss": 0.6841, + "step": 8867 + }, + { + "epoch": 0.72, + "grad_norm": 1.918152166754916, + "learning_rate": 1.915569401404488e-06, + "loss": 0.77, + "step": 8868 + }, + { + "epoch": 0.72, + "grad_norm": 3.768150286762269, + "learning_rate": 1.9145342552134293e-06, + "loss": 0.6882, + "step": 8869 + }, + { + "epoch": 0.72, + "grad_norm": 3.649873713760616, + "learning_rate": 1.9134993225560283e-06, + "loss": 0.6548, + "step": 8870 + }, + { + "epoch": 0.72, + "grad_norm": 4.863850684322349, + "learning_rate": 1.912464603503908e-06, + "loss": 0.7566, + "step": 8871 + }, + { + "epoch": 0.72, + "grad_norm": 5.851115547488806, + "learning_rate": 1.9114300981286763e-06, + "loss": 0.5994, + "step": 8872 + }, + { + "epoch": 0.72, + "grad_norm": 3.5132057714789506, + "learning_rate": 1.9103958065019307e-06, + "loss": 0.7338, + "step": 8873 + }, + { + "epoch": 0.72, + "grad_norm": 4.751211097863998, + "learning_rate": 1.9093617286952476e-06, + "loss": 0.7096, + "step": 8874 + }, + { + "epoch": 0.72, + "grad_norm": 4.485628580346519, + "learning_rate": 1.908327864780195e-06, + "loss": 0.4865, + "step": 8875 + }, + { + "epoch": 0.72, + "grad_norm": 4.414869053437688, + "learning_rate": 1.9072942148283202e-06, + "loss": 0.5486, + "step": 8876 + }, + { + "epoch": 0.72, + "grad_norm": 4.237645749272, + "learning_rate": 1.9062607789111598e-06, + "loss": 0.7081, + "step": 8877 + }, + { + "epoch": 0.72, + "grad_norm": 5.988538903555468, + "learning_rate": 1.905227557100231e-06, + "loss": 0.6915, + "step": 8878 + }, + { + "epoch": 0.72, + "grad_norm": 6.0746211516613435, + "learning_rate": 1.904194549467044e-06, + "loss": 0.805, + "step": 8879 + }, + { + "epoch": 0.72, + "grad_norm": 4.6827267273987605, + "learning_rate": 1.9031617560830861e-06, + "loss": 0.7081, + "step": 8880 + }, + { + "epoch": 0.72, + "grad_norm": 4.590668992869816, + "learning_rate": 1.902129177019833e-06, + "loss": 0.5751, + "step": 8881 + }, + { + "epoch": 0.72, + "grad_norm": 3.1395077087252523, + "learning_rate": 1.9010968123487478e-06, + "loss": 0.7919, + "step": 8882 + }, + { + "epoch": 0.72, + "grad_norm": 5.511577507063931, + "learning_rate": 1.9000646621412762e-06, + "loss": 0.5962, + "step": 8883 + }, + { + "epoch": 0.72, + "grad_norm": 20.164675316732144, + "learning_rate": 1.899032726468848e-06, + "loss": 0.6879, + "step": 8884 + }, + { + "epoch": 0.72, + "grad_norm": 4.46412725949072, + "learning_rate": 1.8980010054028792e-06, + "loss": 0.5249, + "step": 8885 + }, + { + "epoch": 0.72, + "grad_norm": 7.207493164539404, + "learning_rate": 1.8969694990147742e-06, + "loss": 0.7571, + "step": 8886 + }, + { + "epoch": 0.72, + "grad_norm": 4.2069825022497795, + "learning_rate": 1.895938207375918e-06, + "loss": 0.5319, + "step": 8887 + }, + { + "epoch": 0.72, + "grad_norm": 3.8124152684909336, + "learning_rate": 1.894907130557681e-06, + "loss": 0.5642, + "step": 8888 + }, + { + "epoch": 0.72, + "grad_norm": 2.685926505522822, + "learning_rate": 1.8938762686314238e-06, + "loss": 0.6805, + "step": 8889 + }, + { + "epoch": 0.72, + "grad_norm": 3.8138392264987586, + "learning_rate": 1.892845621668486e-06, + "loss": 0.6468, + "step": 8890 + }, + { + "epoch": 0.72, + "grad_norm": 3.931445920141013, + "learning_rate": 1.891815189740196e-06, + "loss": 0.5371, + "step": 8891 + }, + { + "epoch": 0.72, + "grad_norm": 7.345806439582732, + "learning_rate": 1.890784972917864e-06, + "loss": 0.6336, + "step": 8892 + }, + { + "epoch": 0.72, + "grad_norm": 3.428362852982109, + "learning_rate": 1.8897549712727903e-06, + "loss": 0.7927, + "step": 8893 + }, + { + "epoch": 0.72, + "grad_norm": 3.3390805881812033, + "learning_rate": 1.8887251848762567e-06, + "loss": 0.5877, + "step": 8894 + }, + { + "epoch": 0.72, + "grad_norm": 7.351594676809807, + "learning_rate": 1.8876956137995284e-06, + "loss": 0.5394, + "step": 8895 + }, + { + "epoch": 0.72, + "grad_norm": 2.2130512581854758, + "learning_rate": 1.8866662581138646e-06, + "loss": 0.6815, + "step": 8896 + }, + { + "epoch": 0.72, + "grad_norm": 5.258494473815784, + "learning_rate": 1.8856371178904947e-06, + "loss": 0.7325, + "step": 8897 + }, + { + "epoch": 0.72, + "grad_norm": 3.9004540796423877, + "learning_rate": 1.8846081932006476e-06, + "loss": 0.6311, + "step": 8898 + }, + { + "epoch": 0.72, + "grad_norm": 2.430892785125821, + "learning_rate": 1.883579484115528e-06, + "loss": 0.7303, + "step": 8899 + }, + { + "epoch": 0.72, + "grad_norm": 11.19710787708259, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.6601, + "step": 8900 + }, + { + "epoch": 0.72, + "grad_norm": 5.002406894291113, + "learning_rate": 1.881522713044236e-06, + "loss": 0.614, + "step": 8901 + }, + { + "epoch": 0.72, + "grad_norm": 3.863070105141923, + "learning_rate": 1.8804946512004053e-06, + "loss": 0.8421, + "step": 8902 + }, + { + "epoch": 0.72, + "grad_norm": 8.00026687953639, + "learning_rate": 1.8794668052459863e-06, + "loss": 0.5922, + "step": 8903 + }, + { + "epoch": 0.72, + "grad_norm": 6.733748844287857, + "learning_rate": 1.878439175252113e-06, + "loss": 0.6112, + "step": 8904 + }, + { + "epoch": 0.72, + "grad_norm": 3.737647186540963, + "learning_rate": 1.8774117612899034e-06, + "loss": 0.6153, + "step": 8905 + }, + { + "epoch": 0.72, + "grad_norm": 4.700916158126489, + "learning_rate": 1.87638456343046e-06, + "loss": 0.5457, + "step": 8906 + }, + { + "epoch": 0.72, + "grad_norm": 3.2856501841996058, + "learning_rate": 1.8753575817448745e-06, + "loss": 0.6974, + "step": 8907 + }, + { + "epoch": 0.72, + "grad_norm": 3.2838574532711218, + "learning_rate": 1.8743308163042167e-06, + "loss": 0.5804, + "step": 8908 + }, + { + "epoch": 0.72, + "grad_norm": 5.105916117254069, + "learning_rate": 1.873304267179551e-06, + "loss": 0.6725, + "step": 8909 + }, + { + "epoch": 0.72, + "grad_norm": 5.153860723511032, + "learning_rate": 1.8722779344419139e-06, + "loss": 0.7657, + "step": 8910 + }, + { + "epoch": 0.72, + "grad_norm": 13.466183605469553, + "learning_rate": 1.871251818162339e-06, + "loss": 0.8554, + "step": 8911 + }, + { + "epoch": 0.72, + "grad_norm": 4.629660635048524, + "learning_rate": 1.8702259184118387e-06, + "loss": 0.6891, + "step": 8912 + }, + { + "epoch": 0.72, + "grad_norm": 2.746441486492676, + "learning_rate": 1.8692002352614098e-06, + "loss": 0.6385, + "step": 8913 + }, + { + "epoch": 0.72, + "grad_norm": 4.018041203452597, + "learning_rate": 1.868174768782039e-06, + "loss": 0.6318, + "step": 8914 + }, + { + "epoch": 0.72, + "grad_norm": 3.560944326930053, + "learning_rate": 1.8671495190446925e-06, + "loss": 0.6133, + "step": 8915 + }, + { + "epoch": 0.72, + "grad_norm": 32.677818019842654, + "learning_rate": 1.8661244861203288e-06, + "loss": 0.7663, + "step": 8916 + }, + { + "epoch": 0.72, + "grad_norm": 3.887371866724973, + "learning_rate": 1.8650996700798797e-06, + "loss": 0.5237, + "step": 8917 + }, + { + "epoch": 0.72, + "grad_norm": 3.9806102594400885, + "learning_rate": 1.864075070994274e-06, + "loss": 0.6477, + "step": 8918 + }, + { + "epoch": 0.72, + "grad_norm": 2.6242920161706467, + "learning_rate": 1.863050688934419e-06, + "loss": 0.6185, + "step": 8919 + }, + { + "epoch": 0.72, + "grad_norm": 2.778203570254656, + "learning_rate": 1.8620265239712066e-06, + "loss": 0.7441, + "step": 8920 + }, + { + "epoch": 0.72, + "grad_norm": 3.806447036424115, + "learning_rate": 1.8610025761755184e-06, + "loss": 0.6068, + "step": 8921 + }, + { + "epoch": 0.72, + "grad_norm": 3.6615885983520875, + "learning_rate": 1.859978845618215e-06, + "loss": 0.5936, + "step": 8922 + }, + { + "epoch": 0.72, + "grad_norm": 3.118603952204705, + "learning_rate": 1.8589553323701503e-06, + "loss": 0.6658, + "step": 8923 + }, + { + "epoch": 0.72, + "grad_norm": 5.223911553332602, + "learning_rate": 1.8579320365021508e-06, + "loss": 0.5449, + "step": 8924 + }, + { + "epoch": 0.72, + "grad_norm": 6.435947445253762, + "learning_rate": 1.8569089580850403e-06, + "loss": 0.6364, + "step": 8925 + }, + { + "epoch": 0.72, + "grad_norm": 4.963754850974254, + "learning_rate": 1.855886097189618e-06, + "loss": 0.667, + "step": 8926 + }, + { + "epoch": 0.73, + "grad_norm": 2.8400800060155227, + "learning_rate": 1.8548634538866772e-06, + "loss": 0.766, + "step": 8927 + }, + { + "epoch": 0.73, + "grad_norm": 4.115290829574984, + "learning_rate": 1.8538410282469888e-06, + "loss": 0.5938, + "step": 8928 + }, + { + "epoch": 0.73, + "grad_norm": 3.811712575342287, + "learning_rate": 1.852818820341311e-06, + "loss": 0.6294, + "step": 8929 + }, + { + "epoch": 0.73, + "grad_norm": 2.2137187414278205, + "learning_rate": 1.8517968302403872e-06, + "loss": 0.5604, + "step": 8930 + }, + { + "epoch": 0.73, + "grad_norm": 5.263109231578008, + "learning_rate": 1.8507750580149436e-06, + "loss": 0.6258, + "step": 8931 + }, + { + "epoch": 0.73, + "grad_norm": 4.121221600036374, + "learning_rate": 1.8497535037356967e-06, + "loss": 0.7666, + "step": 8932 + }, + { + "epoch": 0.73, + "grad_norm": 3.77503342284328, + "learning_rate": 1.8487321674733412e-06, + "loss": 0.6392, + "step": 8933 + }, + { + "epoch": 0.73, + "grad_norm": 9.33547330763729, + "learning_rate": 1.847711049298564e-06, + "loss": 0.8678, + "step": 8934 + }, + { + "epoch": 0.73, + "grad_norm": 4.334042439527358, + "learning_rate": 1.84669014928203e-06, + "loss": 0.6495, + "step": 8935 + }, + { + "epoch": 0.73, + "grad_norm": 4.017867873332193, + "learning_rate": 1.845669467494393e-06, + "loss": 0.5016, + "step": 8936 + }, + { + "epoch": 0.73, + "grad_norm": 5.247128543463194, + "learning_rate": 1.8446490040062898e-06, + "loss": 0.5412, + "step": 8937 + }, + { + "epoch": 0.73, + "grad_norm": 3.496608306137211, + "learning_rate": 1.8436287588883416e-06, + "loss": 0.6213, + "step": 8938 + }, + { + "epoch": 0.73, + "grad_norm": 3.3600632311866896, + "learning_rate": 1.8426087322111597e-06, + "loss": 0.6747, + "step": 8939 + }, + { + "epoch": 0.73, + "grad_norm": 4.525386929664864, + "learning_rate": 1.8415889240453316e-06, + "loss": 0.6864, + "step": 8940 + }, + { + "epoch": 0.73, + "grad_norm": 13.08742272246945, + "learning_rate": 1.840569334461439e-06, + "loss": 0.7082, + "step": 8941 + }, + { + "epoch": 0.73, + "grad_norm": 4.54220366001043, + "learning_rate": 1.8395499635300423e-06, + "loss": 0.7895, + "step": 8942 + }, + { + "epoch": 0.73, + "grad_norm": 3.130245164444048, + "learning_rate": 1.8385308113216876e-06, + "loss": 0.6243, + "step": 8943 + }, + { + "epoch": 0.73, + "grad_norm": 2.8823914782576554, + "learning_rate": 1.8375118779069067e-06, + "loss": 0.7373, + "step": 8944 + }, + { + "epoch": 0.73, + "grad_norm": 4.111831772144409, + "learning_rate": 1.836493163356215e-06, + "loss": 0.795, + "step": 8945 + }, + { + "epoch": 0.73, + "grad_norm": 7.67936925723455, + "learning_rate": 1.8354746677401174e-06, + "loss": 0.5647, + "step": 8946 + }, + { + "epoch": 0.73, + "grad_norm": 7.997706317521311, + "learning_rate": 1.8344563911290964e-06, + "loss": 0.7289, + "step": 8947 + }, + { + "epoch": 0.73, + "grad_norm": 6.268636044870036, + "learning_rate": 1.8334383335936269e-06, + "loss": 0.7539, + "step": 8948 + }, + { + "epoch": 0.73, + "grad_norm": 8.24434019604976, + "learning_rate": 1.832420495204163e-06, + "loss": 0.6067, + "step": 8949 + }, + { + "epoch": 0.73, + "grad_norm": 3.6026947510567098, + "learning_rate": 1.8314028760311458e-06, + "loss": 0.7028, + "step": 8950 + }, + { + "epoch": 0.73, + "grad_norm": 3.9820277836451767, + "learning_rate": 1.8303854761449984e-06, + "loss": 0.5997, + "step": 8951 + }, + { + "epoch": 0.73, + "grad_norm": 3.3255929843193117, + "learning_rate": 1.8293682956161357e-06, + "loss": 0.6649, + "step": 8952 + }, + { + "epoch": 0.73, + "grad_norm": 3.8811180403990075, + "learning_rate": 1.8283513345149507e-06, + "loss": 0.6645, + "step": 8953 + }, + { + "epoch": 0.73, + "grad_norm": 4.259735814374588, + "learning_rate": 1.8273345929118225e-06, + "loss": 0.8205, + "step": 8954 + }, + { + "epoch": 0.73, + "grad_norm": 3.8203005597936253, + "learning_rate": 1.8263180708771184e-06, + "loss": 0.6583, + "step": 8955 + }, + { + "epoch": 0.73, + "grad_norm": 2.6201288642527136, + "learning_rate": 1.825301768481187e-06, + "loss": 0.7451, + "step": 8956 + }, + { + "epoch": 0.73, + "grad_norm": 3.4385288483430356, + "learning_rate": 1.824285685794363e-06, + "loss": 0.8086, + "step": 8957 + }, + { + "epoch": 0.73, + "grad_norm": 4.486546745669859, + "learning_rate": 1.8232698228869633e-06, + "loss": 0.6086, + "step": 8958 + }, + { + "epoch": 0.73, + "grad_norm": 11.169820215423293, + "learning_rate": 1.8222541798292965e-06, + "loss": 0.7319, + "step": 8959 + }, + { + "epoch": 0.73, + "grad_norm": 5.678950261741719, + "learning_rate": 1.821238756691649e-06, + "loss": 0.7987, + "step": 8960 + }, + { + "epoch": 0.73, + "grad_norm": 4.783690915444024, + "learning_rate": 1.820223553544293e-06, + "loss": 0.6954, + "step": 8961 + }, + { + "epoch": 0.73, + "grad_norm": 3.3347573949184564, + "learning_rate": 1.8192085704574902e-06, + "loss": 0.5746, + "step": 8962 + }, + { + "epoch": 0.73, + "grad_norm": 4.555647604637633, + "learning_rate": 1.8181938075014821e-06, + "loss": 0.8035, + "step": 8963 + }, + { + "epoch": 0.73, + "grad_norm": 4.528980627802501, + "learning_rate": 1.817179264746497e-06, + "loss": 0.8058, + "step": 8964 + }, + { + "epoch": 0.73, + "grad_norm": 5.776142902647297, + "learning_rate": 1.8161649422627458e-06, + "loss": 0.6615, + "step": 8965 + }, + { + "epoch": 0.73, + "grad_norm": 3.279929496658013, + "learning_rate": 1.8151508401204298e-06, + "loss": 0.6595, + "step": 8966 + }, + { + "epoch": 0.73, + "grad_norm": 4.359658071853933, + "learning_rate": 1.8141369583897283e-06, + "loss": 0.4988, + "step": 8967 + }, + { + "epoch": 0.73, + "grad_norm": 2.88296926229359, + "learning_rate": 1.813123297140808e-06, + "loss": 0.7011, + "step": 8968 + }, + { + "epoch": 0.73, + "grad_norm": 5.47099188690207, + "learning_rate": 1.8121098564438249e-06, + "loss": 0.4852, + "step": 8969 + }, + { + "epoch": 0.73, + "grad_norm": 7.7618395462869, + "learning_rate": 1.8110966363689093e-06, + "loss": 0.6238, + "step": 8970 + }, + { + "epoch": 0.73, + "grad_norm": 7.841487017123696, + "learning_rate": 1.8100836369861869e-06, + "loss": 0.6894, + "step": 8971 + }, + { + "epoch": 0.73, + "grad_norm": 9.846068274846333, + "learning_rate": 1.8090708583657606e-06, + "loss": 0.7645, + "step": 8972 + }, + { + "epoch": 0.73, + "grad_norm": 3.4077122700992106, + "learning_rate": 1.8080583005777241e-06, + "loss": 0.6646, + "step": 8973 + }, + { + "epoch": 0.73, + "grad_norm": 2.363809129038152, + "learning_rate": 1.8070459636921517e-06, + "loss": 0.6734, + "step": 8974 + }, + { + "epoch": 0.73, + "grad_norm": 7.689613686336973, + "learning_rate": 1.8060338477791011e-06, + "loss": 0.6706, + "step": 8975 + }, + { + "epoch": 0.73, + "grad_norm": 5.503918679902205, + "learning_rate": 1.805021952908621e-06, + "loss": 0.5106, + "step": 8976 + }, + { + "epoch": 0.73, + "grad_norm": 6.228365668188462, + "learning_rate": 1.8040102791507385e-06, + "loss": 0.7783, + "step": 8977 + }, + { + "epoch": 0.73, + "grad_norm": 3.740134013002249, + "learning_rate": 1.8029988265754688e-06, + "loss": 0.6774, + "step": 8978 + }, + { + "epoch": 0.73, + "grad_norm": 2.972612345685512, + "learning_rate": 1.8019875952528087e-06, + "loss": 0.7327, + "step": 8979 + }, + { + "epoch": 0.73, + "grad_norm": 4.830794121883064, + "learning_rate": 1.800976585252745e-06, + "loss": 0.4537, + "step": 8980 + }, + { + "epoch": 0.73, + "grad_norm": 16.78888868962632, + "learning_rate": 1.799965796645242e-06, + "loss": 0.6034, + "step": 8981 + }, + { + "epoch": 0.73, + "grad_norm": 3.6369935798976267, + "learning_rate": 1.7989552295002593e-06, + "loss": 0.6731, + "step": 8982 + }, + { + "epoch": 0.73, + "grad_norm": 3.644768700621319, + "learning_rate": 1.7979448838877262e-06, + "loss": 0.5378, + "step": 8983 + }, + { + "epoch": 0.73, + "grad_norm": 2.6747102535784393, + "learning_rate": 1.7969347598775705e-06, + "loss": 0.6811, + "step": 8984 + }, + { + "epoch": 0.73, + "grad_norm": 8.431429027706097, + "learning_rate": 1.7959248575396982e-06, + "loss": 0.6829, + "step": 8985 + }, + { + "epoch": 0.73, + "grad_norm": 4.431836853961968, + "learning_rate": 1.7949151769439983e-06, + "loss": 0.6992, + "step": 8986 + }, + { + "epoch": 0.73, + "grad_norm": 10.761560258546373, + "learning_rate": 1.7939057181603504e-06, + "loss": 0.7632, + "step": 8987 + }, + { + "epoch": 0.73, + "grad_norm": 4.628241603904614, + "learning_rate": 1.7928964812586126e-06, + "loss": 0.8677, + "step": 8988 + }, + { + "epoch": 0.73, + "grad_norm": 2.8317954292852643, + "learning_rate": 1.7918874663086355e-06, + "loss": 0.6575, + "step": 8989 + }, + { + "epoch": 0.73, + "grad_norm": 66.91201835222382, + "learning_rate": 1.7908786733802419e-06, + "loss": 0.8439, + "step": 8990 + }, + { + "epoch": 0.73, + "grad_norm": 5.504491479135456, + "learning_rate": 1.789870102543252e-06, + "loss": 0.6043, + "step": 8991 + }, + { + "epoch": 0.73, + "grad_norm": 10.810557621017534, + "learning_rate": 1.788861753867464e-06, + "loss": 0.5702, + "step": 8992 + }, + { + "epoch": 0.73, + "grad_norm": 3.0093973864062384, + "learning_rate": 1.7878536274226598e-06, + "loss": 0.5665, + "step": 8993 + }, + { + "epoch": 0.73, + "grad_norm": 5.417772893934148, + "learning_rate": 1.7868457232786117e-06, + "loss": 0.6604, + "step": 8994 + }, + { + "epoch": 0.73, + "grad_norm": 5.059833991515718, + "learning_rate": 1.7858380415050696e-06, + "loss": 0.7275, + "step": 8995 + }, + { + "epoch": 0.73, + "grad_norm": 4.971516046770104, + "learning_rate": 1.7848305821717766e-06, + "loss": 0.7033, + "step": 8996 + }, + { + "epoch": 0.73, + "grad_norm": 7.384158316000213, + "learning_rate": 1.7838233453484476e-06, + "loss": 0.6184, + "step": 8997 + }, + { + "epoch": 0.73, + "grad_norm": 23.44364335545117, + "learning_rate": 1.7828163311047963e-06, + "loss": 0.7064, + "step": 8998 + }, + { + "epoch": 0.73, + "grad_norm": 10.883352435369922, + "learning_rate": 1.7818095395105116e-06, + "loss": 0.5967, + "step": 8999 + }, + { + "epoch": 0.73, + "grad_norm": 4.29345362283638, + "learning_rate": 1.780802970635268e-06, + "loss": 0.7215, + "step": 9000 + }, + { + "epoch": 0.73, + "grad_norm": 6.696126834789653, + "learning_rate": 1.7797966245487314e-06, + "loss": 0.6664, + "step": 9001 + }, + { + "epoch": 0.73, + "grad_norm": 3.6294596186497943, + "learning_rate": 1.7787905013205437e-06, + "loss": 0.6883, + "step": 9002 + }, + { + "epoch": 0.73, + "grad_norm": 3.5979563728844113, + "learning_rate": 1.7777846010203359e-06, + "loss": 0.6018, + "step": 9003 + }, + { + "epoch": 0.73, + "grad_norm": 3.459464342918987, + "learning_rate": 1.7767789237177208e-06, + "loss": 0.5564, + "step": 9004 + }, + { + "epoch": 0.73, + "grad_norm": 4.719924776994022, + "learning_rate": 1.7757734694823004e-06, + "loss": 0.735, + "step": 9005 + }, + { + "epoch": 0.73, + "grad_norm": 4.720136840744433, + "learning_rate": 1.7747682383836563e-06, + "loss": 0.7211, + "step": 9006 + }, + { + "epoch": 0.73, + "grad_norm": 7.264085624524268, + "learning_rate": 1.7737632304913592e-06, + "loss": 0.7236, + "step": 9007 + }, + { + "epoch": 0.73, + "grad_norm": 3.4478113986501246, + "learning_rate": 1.7727584458749608e-06, + "loss": 0.6115, + "step": 9008 + }, + { + "epoch": 0.73, + "grad_norm": 9.773460893580113, + "learning_rate": 1.7717538846039984e-06, + "loss": 0.6398, + "step": 9009 + }, + { + "epoch": 0.73, + "grad_norm": 3.1112526191049374, + "learning_rate": 1.7707495467479934e-06, + "loss": 0.7122, + "step": 9010 + }, + { + "epoch": 0.73, + "grad_norm": 6.582481412180845, + "learning_rate": 1.7697454323764518e-06, + "loss": 0.6642, + "step": 9011 + }, + { + "epoch": 0.73, + "grad_norm": 4.412118142441393, + "learning_rate": 1.7687415415588672e-06, + "loss": 0.6743, + "step": 9012 + }, + { + "epoch": 0.73, + "grad_norm": 12.78464136476746, + "learning_rate": 1.7677378743647116e-06, + "loss": 0.6451, + "step": 9013 + }, + { + "epoch": 0.73, + "grad_norm": 7.1328121900297266, + "learning_rate": 1.7667344308634488e-06, + "loss": 0.8853, + "step": 9014 + }, + { + "epoch": 0.73, + "grad_norm": 4.349985568047294, + "learning_rate": 1.7657312111245218e-06, + "loss": 0.6934, + "step": 9015 + }, + { + "epoch": 0.73, + "grad_norm": 4.221360289547216, + "learning_rate": 1.7647282152173594e-06, + "loss": 0.6002, + "step": 9016 + }, + { + "epoch": 0.73, + "grad_norm": 3.604380148727912, + "learning_rate": 1.763725443211376e-06, + "loss": 0.4612, + "step": 9017 + }, + { + "epoch": 0.73, + "grad_norm": 4.58516107331622, + "learning_rate": 1.7627228951759673e-06, + "loss": 0.6356, + "step": 9018 + }, + { + "epoch": 0.73, + "grad_norm": 4.9419297235067114, + "learning_rate": 1.7617205711805196e-06, + "loss": 0.7621, + "step": 9019 + }, + { + "epoch": 0.73, + "grad_norm": 4.854320444043853, + "learning_rate": 1.7607184712943964e-06, + "loss": 0.6562, + "step": 9020 + }, + { + "epoch": 0.73, + "grad_norm": 4.222895018022861, + "learning_rate": 1.7597165955869528e-06, + "loss": 0.7388, + "step": 9021 + }, + { + "epoch": 0.73, + "grad_norm": 5.515256703478686, + "learning_rate": 1.7587149441275236e-06, + "loss": 0.683, + "step": 9022 + }, + { + "epoch": 0.73, + "grad_norm": 2.9721383949852735, + "learning_rate": 1.7577135169854286e-06, + "loss": 0.6173, + "step": 9023 + }, + { + "epoch": 0.73, + "grad_norm": 4.287168177606552, + "learning_rate": 1.7567123142299718e-06, + "loss": 0.7624, + "step": 9024 + }, + { + "epoch": 0.73, + "grad_norm": 4.179905343394945, + "learning_rate": 1.7557113359304461e-06, + "loss": 0.6658, + "step": 9025 + }, + { + "epoch": 0.73, + "grad_norm": 3.5768020742337057, + "learning_rate": 1.7547105821561238e-06, + "loss": 0.7563, + "step": 9026 + }, + { + "epoch": 0.73, + "grad_norm": 2.8418063680582013, + "learning_rate": 1.7537100529762619e-06, + "loss": 0.6087, + "step": 9027 + }, + { + "epoch": 0.73, + "grad_norm": 4.63907027952988, + "learning_rate": 1.7527097484601057e-06, + "loss": 0.7878, + "step": 9028 + }, + { + "epoch": 0.73, + "grad_norm": 3.773055042408477, + "learning_rate": 1.751709668676882e-06, + "loss": 0.6239, + "step": 9029 + }, + { + "epoch": 0.73, + "grad_norm": 4.300900392702149, + "learning_rate": 1.7507098136958017e-06, + "loss": 0.7563, + "step": 9030 + }, + { + "epoch": 0.73, + "grad_norm": 5.995543001320013, + "learning_rate": 1.7497101835860603e-06, + "loss": 0.6281, + "step": 9031 + }, + { + "epoch": 0.73, + "grad_norm": 7.037518494021243, + "learning_rate": 1.748710778416841e-06, + "loss": 0.7558, + "step": 9032 + }, + { + "epoch": 0.73, + "grad_norm": 6.1753280524523095, + "learning_rate": 1.7477115982573078e-06, + "loss": 0.7854, + "step": 9033 + }, + { + "epoch": 0.73, + "grad_norm": 2.7143625032684087, + "learning_rate": 1.7467126431766084e-06, + "loss": 0.7144, + "step": 9034 + }, + { + "epoch": 0.73, + "grad_norm": 5.201435086480116, + "learning_rate": 1.7457139132438816e-06, + "loss": 0.7627, + "step": 9035 + }, + { + "epoch": 0.73, + "grad_norm": 4.2234905298335335, + "learning_rate": 1.7447154085282398e-06, + "loss": 0.5716, + "step": 9036 + }, + { + "epoch": 0.73, + "grad_norm": 4.411111418167788, + "learning_rate": 1.7437171290987898e-06, + "loss": 0.8133, + "step": 9037 + }, + { + "epoch": 0.73, + "grad_norm": 2.8236004070386422, + "learning_rate": 1.7427190750246164e-06, + "loss": 0.684, + "step": 9038 + }, + { + "epoch": 0.73, + "grad_norm": 7.281846612935211, + "learning_rate": 1.7417212463747945e-06, + "loss": 0.5949, + "step": 9039 + }, + { + "epoch": 0.73, + "grad_norm": 2.95905044721102, + "learning_rate": 1.7407236432183778e-06, + "loss": 0.7019, + "step": 9040 + }, + { + "epoch": 0.73, + "grad_norm": 3.6531974406127885, + "learning_rate": 1.7397262656244057e-06, + "loss": 0.698, + "step": 9041 + }, + { + "epoch": 0.73, + "grad_norm": 3.2760151276700524, + "learning_rate": 1.7387291136619071e-06, + "loss": 0.7477, + "step": 9042 + }, + { + "epoch": 0.73, + "grad_norm": 11.437757978207243, + "learning_rate": 1.7377321873998858e-06, + "loss": 0.7306, + "step": 9043 + }, + { + "epoch": 0.73, + "grad_norm": 4.47655015391583, + "learning_rate": 1.7367354869073394e-06, + "loss": 0.7008, + "step": 9044 + }, + { + "epoch": 0.73, + "grad_norm": 2.8784992560830913, + "learning_rate": 1.735739012253243e-06, + "loss": 0.5629, + "step": 9045 + }, + { + "epoch": 0.73, + "grad_norm": 6.088462584267213, + "learning_rate": 1.7347427635065622e-06, + "loss": 0.6368, + "step": 9046 + }, + { + "epoch": 0.73, + "grad_norm": 4.498229947807873, + "learning_rate": 1.7337467407362418e-06, + "loss": 0.6819, + "step": 9047 + }, + { + "epoch": 0.73, + "grad_norm": 5.098128921328518, + "learning_rate": 1.7327509440112112e-06, + "loss": 0.6594, + "step": 9048 + }, + { + "epoch": 0.73, + "grad_norm": 3.126851388231115, + "learning_rate": 1.7317553734003894e-06, + "loss": 0.6863, + "step": 9049 + }, + { + "epoch": 0.74, + "grad_norm": 5.052787635697249, + "learning_rate": 1.7307600289726745e-06, + "loss": 0.5893, + "step": 9050 + }, + { + "epoch": 0.74, + "grad_norm": 4.777614082166385, + "learning_rate": 1.72976491079695e-06, + "loss": 0.7043, + "step": 9051 + }, + { + "epoch": 0.74, + "grad_norm": 4.139907213065849, + "learning_rate": 1.7287700189420831e-06, + "loss": 0.6776, + "step": 9052 + }, + { + "epoch": 0.74, + "grad_norm": 4.512297868201302, + "learning_rate": 1.7277753534769304e-06, + "loss": 0.7316, + "step": 9053 + }, + { + "epoch": 0.74, + "grad_norm": 6.001483969036824, + "learning_rate": 1.7267809144703251e-06, + "loss": 0.6071, + "step": 9054 + }, + { + "epoch": 0.74, + "grad_norm": 6.349328884197159, + "learning_rate": 1.7257867019910933e-06, + "loss": 0.6583, + "step": 9055 + }, + { + "epoch": 0.74, + "grad_norm": 3.5299168070237603, + "learning_rate": 1.7247927161080346e-06, + "loss": 0.5047, + "step": 9056 + }, + { + "epoch": 0.74, + "grad_norm": 4.227965674527148, + "learning_rate": 1.7237989568899444e-06, + "loss": 0.641, + "step": 9057 + }, + { + "epoch": 0.74, + "grad_norm": 3.8061821226187074, + "learning_rate": 1.7228054244055952e-06, + "loss": 0.583, + "step": 9058 + }, + { + "epoch": 0.74, + "grad_norm": 3.3558384297001114, + "learning_rate": 1.7218121187237436e-06, + "loss": 0.6177, + "step": 9059 + }, + { + "epoch": 0.74, + "grad_norm": 8.038592903515427, + "learning_rate": 1.7208190399131359e-06, + "loss": 0.5733, + "step": 9060 + }, + { + "epoch": 0.74, + "grad_norm": 2.7442964032355848, + "learning_rate": 1.7198261880424967e-06, + "loss": 0.629, + "step": 9061 + }, + { + "epoch": 0.74, + "grad_norm": 2.854857229577612, + "learning_rate": 1.7188335631805426e-06, + "loss": 0.5712, + "step": 9062 + }, + { + "epoch": 0.74, + "grad_norm": 4.02428763612648, + "learning_rate": 1.717841165395962e-06, + "loss": 0.7264, + "step": 9063 + }, + { + "epoch": 0.74, + "grad_norm": 6.718385428002132, + "learning_rate": 1.7168489947574407e-06, + "loss": 0.7513, + "step": 9064 + }, + { + "epoch": 0.74, + "grad_norm": 4.440736186316236, + "learning_rate": 1.715857051333642e-06, + "loss": 0.7002, + "step": 9065 + }, + { + "epoch": 0.74, + "grad_norm": 3.6065433015276014, + "learning_rate": 1.7148653351932116e-06, + "loss": 0.7542, + "step": 9066 + }, + { + "epoch": 0.74, + "grad_norm": 3.3725003325413216, + "learning_rate": 1.713873846404787e-06, + "loss": 0.6819, + "step": 9067 + }, + { + "epoch": 0.74, + "grad_norm": 12.63661843072848, + "learning_rate": 1.7128825850369819e-06, + "loss": 0.5876, + "step": 9068 + }, + { + "epoch": 0.74, + "grad_norm": 5.193358105257273, + "learning_rate": 1.7118915511584022e-06, + "loss": 0.4654, + "step": 9069 + }, + { + "epoch": 0.74, + "grad_norm": 20.656710694762054, + "learning_rate": 1.7109007448376274e-06, + "loss": 0.6683, + "step": 9070 + }, + { + "epoch": 0.74, + "grad_norm": 3.709800434236687, + "learning_rate": 1.7099101661432326e-06, + "loss": 0.5542, + "step": 9071 + }, + { + "epoch": 0.74, + "grad_norm": 11.913372999313664, + "learning_rate": 1.7089198151437708e-06, + "loss": 0.608, + "step": 9072 + }, + { + "epoch": 0.74, + "grad_norm": 4.870034372793257, + "learning_rate": 1.7079296919077781e-06, + "loss": 0.5864, + "step": 9073 + }, + { + "epoch": 0.74, + "grad_norm": 4.264145769317549, + "learning_rate": 1.7069397965037816e-06, + "loss": 0.6341, + "step": 9074 + }, + { + "epoch": 0.74, + "grad_norm": 4.389151236961843, + "learning_rate": 1.7059501290002855e-06, + "loss": 0.7465, + "step": 9075 + }, + { + "epoch": 0.74, + "grad_norm": 4.998991433883439, + "learning_rate": 1.7049606894657817e-06, + "loss": 0.6225, + "step": 9076 + }, + { + "epoch": 0.74, + "grad_norm": 3.268987331458217, + "learning_rate": 1.7039714779687438e-06, + "loss": 0.6913, + "step": 9077 + }, + { + "epoch": 0.74, + "grad_norm": 7.400244461200818, + "learning_rate": 1.7029824945776346e-06, + "loss": 0.6046, + "step": 9078 + }, + { + "epoch": 0.74, + "grad_norm": 11.676297328591891, + "learning_rate": 1.701993739360895e-06, + "loss": 0.7142, + "step": 9079 + }, + { + "epoch": 0.74, + "grad_norm": 4.156450850923716, + "learning_rate": 1.7010052123869564e-06, + "loss": 0.6187, + "step": 9080 + }, + { + "epoch": 0.74, + "grad_norm": 3.9160448808124215, + "learning_rate": 1.700016913724229e-06, + "loss": 0.8204, + "step": 9081 + }, + { + "epoch": 0.74, + "grad_norm": 4.4581595964277945, + "learning_rate": 1.6990288434411094e-06, + "loss": 0.8151, + "step": 9082 + }, + { + "epoch": 0.74, + "grad_norm": 6.108067224138785, + "learning_rate": 1.6980410016059789e-06, + "loss": 0.7362, + "step": 9083 + }, + { + "epoch": 0.74, + "grad_norm": 7.249607278894565, + "learning_rate": 1.6970533882872004e-06, + "loss": 0.6182, + "step": 9084 + }, + { + "epoch": 0.74, + "grad_norm": 4.903282138217733, + "learning_rate": 1.6960660035531256e-06, + "loss": 0.6303, + "step": 9085 + }, + { + "epoch": 0.74, + "grad_norm": 3.7111491747140537, + "learning_rate": 1.6950788474720852e-06, + "loss": 0.698, + "step": 9086 + }, + { + "epoch": 0.74, + "grad_norm": 5.685673776479529, + "learning_rate": 1.6940919201124001e-06, + "loss": 0.5428, + "step": 9087 + }, + { + "epoch": 0.74, + "grad_norm": 5.537884616404571, + "learning_rate": 1.6931052215423693e-06, + "loss": 0.6795, + "step": 9088 + }, + { + "epoch": 0.74, + "grad_norm": 4.4066587046520835, + "learning_rate": 1.6921187518302795e-06, + "loss": 0.6103, + "step": 9089 + }, + { + "epoch": 0.74, + "grad_norm": 5.522610604200022, + "learning_rate": 1.6911325110444005e-06, + "loss": 0.6918, + "step": 9090 + }, + { + "epoch": 0.74, + "grad_norm": 4.694734407310773, + "learning_rate": 1.6901464992529837e-06, + "loss": 0.5909, + "step": 9091 + }, + { + "epoch": 0.74, + "grad_norm": 3.302509582619649, + "learning_rate": 1.6891607165242718e-06, + "loss": 0.7304, + "step": 9092 + }, + { + "epoch": 0.74, + "grad_norm": 5.768673489039729, + "learning_rate": 1.688175162926483e-06, + "loss": 0.7259, + "step": 9093 + }, + { + "epoch": 0.74, + "grad_norm": 2.8808571421889364, + "learning_rate": 1.6871898385278278e-06, + "loss": 0.6389, + "step": 9094 + }, + { + "epoch": 0.74, + "grad_norm": 12.20263962194019, + "learning_rate": 1.686204743396495e-06, + "loss": 0.5056, + "step": 9095 + }, + { + "epoch": 0.74, + "grad_norm": 4.170177527202357, + "learning_rate": 1.6852198776006596e-06, + "loss": 0.6973, + "step": 9096 + }, + { + "epoch": 0.74, + "grad_norm": 4.790392057339692, + "learning_rate": 1.68423524120848e-06, + "loss": 0.6513, + "step": 9097 + }, + { + "epoch": 0.74, + "grad_norm": 8.725997075970518, + "learning_rate": 1.6832508342880981e-06, + "loss": 0.6928, + "step": 9098 + }, + { + "epoch": 0.74, + "grad_norm": 4.153388006904061, + "learning_rate": 1.6822666569076434e-06, + "loss": 0.6925, + "step": 9099 + }, + { + "epoch": 0.74, + "grad_norm": 3.978065682747705, + "learning_rate": 1.6812827091352252e-06, + "loss": 0.6699, + "step": 9100 + }, + { + "epoch": 0.74, + "grad_norm": 5.3207723377084, + "learning_rate": 1.6802989910389416e-06, + "loss": 0.6841, + "step": 9101 + }, + { + "epoch": 0.74, + "grad_norm": 3.4069671825968824, + "learning_rate": 1.67931550268687e-06, + "loss": 0.6806, + "step": 9102 + }, + { + "epoch": 0.74, + "grad_norm": 3.015712646777389, + "learning_rate": 1.6783322441470745e-06, + "loss": 0.672, + "step": 9103 + }, + { + "epoch": 0.74, + "grad_norm": 7.9885619403915475, + "learning_rate": 1.6773492154876008e-06, + "loss": 0.67, + "step": 9104 + }, + { + "epoch": 0.74, + "grad_norm": 14.551906968357082, + "learning_rate": 1.6763664167764847e-06, + "loss": 0.7617, + "step": 9105 + }, + { + "epoch": 0.74, + "grad_norm": 6.667806995767433, + "learning_rate": 1.6753838480817397e-06, + "loss": 0.7167, + "step": 9106 + }, + { + "epoch": 0.74, + "grad_norm": 5.008091582743248, + "learning_rate": 1.674401509471364e-06, + "loss": 0.6347, + "step": 9107 + }, + { + "epoch": 0.74, + "grad_norm": 3.547632990834146, + "learning_rate": 1.673419401013347e-06, + "loss": 0.5644, + "step": 9108 + }, + { + "epoch": 0.74, + "grad_norm": 4.339606931924342, + "learning_rate": 1.6724375227756501e-06, + "loss": 0.5805, + "step": 9109 + }, + { + "epoch": 0.74, + "grad_norm": 4.131366870584318, + "learning_rate": 1.6714558748262298e-06, + "loss": 0.6864, + "step": 9110 + }, + { + "epoch": 0.74, + "grad_norm": 3.986157075215698, + "learning_rate": 1.6704744572330206e-06, + "loss": 0.6885, + "step": 9111 + }, + { + "epoch": 0.74, + "grad_norm": 6.749418945188202, + "learning_rate": 1.6694932700639444e-06, + "loss": 0.6904, + "step": 9112 + }, + { + "epoch": 0.74, + "grad_norm": 7.0648936777508355, + "learning_rate": 1.6685123133869046e-06, + "loss": 0.7836, + "step": 9113 + }, + { + "epoch": 0.74, + "grad_norm": 3.515946587550001, + "learning_rate": 1.6675315872697879e-06, + "loss": 0.6884, + "step": 9114 + }, + { + "epoch": 0.74, + "grad_norm": 10.604344868341466, + "learning_rate": 1.6665510917804712e-06, + "loss": 0.6752, + "step": 9115 + }, + { + "epoch": 0.74, + "grad_norm": 3.615890680680915, + "learning_rate": 1.6655708269868055e-06, + "loss": 0.7058, + "step": 9116 + }, + { + "epoch": 0.74, + "grad_norm": 5.14001618855776, + "learning_rate": 1.6645907929566345e-06, + "loss": 0.729, + "step": 9117 + }, + { + "epoch": 0.74, + "grad_norm": 10.46455098034888, + "learning_rate": 1.6636109897577813e-06, + "loss": 0.6776, + "step": 9118 + }, + { + "epoch": 0.74, + "grad_norm": 4.569020223502847, + "learning_rate": 1.6626314174580565e-06, + "loss": 0.5976, + "step": 9119 + }, + { + "epoch": 0.74, + "grad_norm": 5.676520332948421, + "learning_rate": 1.661652076125252e-06, + "loss": 0.7312, + "step": 9120 + }, + { + "epoch": 0.74, + "grad_norm": 5.755037101965318, + "learning_rate": 1.6606729658271413e-06, + "loss": 0.7252, + "step": 9121 + }, + { + "epoch": 0.74, + "grad_norm": 3.4341892797644964, + "learning_rate": 1.6596940866314915e-06, + "loss": 0.6965, + "step": 9122 + }, + { + "epoch": 0.74, + "grad_norm": 5.475785196018253, + "learning_rate": 1.65871543860604e-06, + "loss": 0.6194, + "step": 9123 + }, + { + "epoch": 0.74, + "grad_norm": 4.912743834214711, + "learning_rate": 1.6577370218185197e-06, + "loss": 0.6062, + "step": 9124 + }, + { + "epoch": 0.74, + "grad_norm": 7.577586886506765, + "learning_rate": 1.656758836336641e-06, + "loss": 0.7639, + "step": 9125 + }, + { + "epoch": 0.74, + "grad_norm": 2.883934903324397, + "learning_rate": 1.655780882228103e-06, + "loss": 0.5834, + "step": 9126 + }, + { + "epoch": 0.74, + "grad_norm": 9.15424994558803, + "learning_rate": 1.6548031595605829e-06, + "loss": 0.7172, + "step": 9127 + }, + { + "epoch": 0.74, + "grad_norm": 3.256171763045889, + "learning_rate": 1.6538256684017512e-06, + "loss": 0.6774, + "step": 9128 + }, + { + "epoch": 0.74, + "grad_norm": 2.9494340733529523, + "learning_rate": 1.6528484088192487e-06, + "loss": 0.5602, + "step": 9129 + }, + { + "epoch": 0.74, + "grad_norm": 3.810263468724041, + "learning_rate": 1.6518713808807135e-06, + "loss": 0.7268, + "step": 9130 + }, + { + "epoch": 0.74, + "grad_norm": 3.3056735247106093, + "learning_rate": 1.6508945846537606e-06, + "loss": 0.9031, + "step": 9131 + }, + { + "epoch": 0.74, + "grad_norm": 3.635794463167354, + "learning_rate": 1.6499180202059883e-06, + "loss": 0.6024, + "step": 9132 + }, + { + "epoch": 0.74, + "grad_norm": 5.051135977375669, + "learning_rate": 1.648941687604984e-06, + "loss": 0.7451, + "step": 9133 + }, + { + "epoch": 0.74, + "grad_norm": 4.32487712961461, + "learning_rate": 1.6479655869183142e-06, + "loss": 0.8083, + "step": 9134 + }, + { + "epoch": 0.74, + "grad_norm": 4.509850515877558, + "learning_rate": 1.6469897182135347e-06, + "loss": 0.5401, + "step": 9135 + }, + { + "epoch": 0.74, + "grad_norm": 3.0609954209440535, + "learning_rate": 1.6460140815581754e-06, + "loss": 0.7148, + "step": 9136 + }, + { + "epoch": 0.74, + "grad_norm": 2.890736995360674, + "learning_rate": 1.6450386770197625e-06, + "loss": 0.6671, + "step": 9137 + }, + { + "epoch": 0.74, + "grad_norm": 4.40560969211236, + "learning_rate": 1.6440635046657971e-06, + "loss": 0.6227, + "step": 9138 + }, + { + "epoch": 0.74, + "grad_norm": 6.215995185891323, + "learning_rate": 1.6430885645637667e-06, + "loss": 0.748, + "step": 9139 + }, + { + "epoch": 0.74, + "grad_norm": 3.339770772935581, + "learning_rate": 1.6421138567811456e-06, + "loss": 0.6585, + "step": 9140 + }, + { + "epoch": 0.74, + "grad_norm": 5.417085367573536, + "learning_rate": 1.6411393813853893e-06, + "loss": 0.6055, + "step": 9141 + }, + { + "epoch": 0.74, + "grad_norm": 4.202176153914713, + "learning_rate": 1.6401651384439365e-06, + "loss": 0.6097, + "step": 9142 + }, + { + "epoch": 0.74, + "grad_norm": 4.367131862255048, + "learning_rate": 1.63919112802421e-06, + "loss": 0.7113, + "step": 9143 + }, + { + "epoch": 0.74, + "grad_norm": 11.779382813060646, + "learning_rate": 1.6382173501936206e-06, + "loss": 0.724, + "step": 9144 + }, + { + "epoch": 0.74, + "grad_norm": 2.6634392830037816, + "learning_rate": 1.6372438050195577e-06, + "loss": 0.6127, + "step": 9145 + }, + { + "epoch": 0.74, + "grad_norm": 3.907296089291729, + "learning_rate": 1.6362704925693957e-06, + "loss": 0.703, + "step": 9146 + }, + { + "epoch": 0.74, + "grad_norm": 6.189456493705393, + "learning_rate": 1.6352974129104964e-06, + "loss": 0.5996, + "step": 9147 + }, + { + "epoch": 0.74, + "grad_norm": 4.498296102419997, + "learning_rate": 1.6343245661102031e-06, + "loss": 0.6815, + "step": 9148 + }, + { + "epoch": 0.74, + "grad_norm": 5.196170975003428, + "learning_rate": 1.6333519522358416e-06, + "loss": 0.701, + "step": 9149 + }, + { + "epoch": 0.74, + "grad_norm": 3.334316168853089, + "learning_rate": 1.6323795713547208e-06, + "loss": 0.7045, + "step": 9150 + }, + { + "epoch": 0.74, + "grad_norm": 22.819685436074494, + "learning_rate": 1.6314074235341403e-06, + "loss": 0.659, + "step": 9151 + }, + { + "epoch": 0.74, + "grad_norm": 5.540659253063088, + "learning_rate": 1.6304355088413747e-06, + "loss": 0.7287, + "step": 9152 + }, + { + "epoch": 0.74, + "grad_norm": 4.687392666501798, + "learning_rate": 1.6294638273436902e-06, + "loss": 0.6199, + "step": 9153 + }, + { + "epoch": 0.74, + "grad_norm": 7.664401345340307, + "learning_rate": 1.6284923791083312e-06, + "loss": 0.7385, + "step": 9154 + }, + { + "epoch": 0.74, + "grad_norm": 12.026549945268178, + "learning_rate": 1.6275211642025285e-06, + "loss": 0.6975, + "step": 9155 + }, + { + "epoch": 0.74, + "grad_norm": 5.861625300862647, + "learning_rate": 1.6265501826934959e-06, + "loss": 0.7269, + "step": 9156 + }, + { + "epoch": 0.74, + "grad_norm": 4.152682572236951, + "learning_rate": 1.6255794346484305e-06, + "loss": 0.621, + "step": 9157 + }, + { + "epoch": 0.74, + "grad_norm": 9.234341237890744, + "learning_rate": 1.6246089201345167e-06, + "loss": 0.7241, + "step": 9158 + }, + { + "epoch": 0.74, + "grad_norm": 8.769690963293272, + "learning_rate": 1.6236386392189175e-06, + "loss": 0.7735, + "step": 9159 + }, + { + "epoch": 0.74, + "grad_norm": 3.861439507922802, + "learning_rate": 1.622668591968785e-06, + "loss": 0.6127, + "step": 9160 + }, + { + "epoch": 0.74, + "grad_norm": 4.405117975854151, + "learning_rate": 1.6216987784512512e-06, + "loss": 0.7164, + "step": 9161 + }, + { + "epoch": 0.74, + "grad_norm": 6.999821958661862, + "learning_rate": 1.620729198733434e-06, + "loss": 0.7372, + "step": 9162 + }, + { + "epoch": 0.74, + "grad_norm": 4.234094529083211, + "learning_rate": 1.6197598528824338e-06, + "loss": 0.5409, + "step": 9163 + }, + { + "epoch": 0.74, + "grad_norm": 3.923997652612684, + "learning_rate": 1.6187907409653335e-06, + "loss": 0.6248, + "step": 9164 + }, + { + "epoch": 0.74, + "grad_norm": 4.964865302354988, + "learning_rate": 1.617821863049206e-06, + "loss": 0.6974, + "step": 9165 + }, + { + "epoch": 0.74, + "grad_norm": 5.090789297409314, + "learning_rate": 1.6168532192010993e-06, + "loss": 0.7865, + "step": 9166 + }, + { + "epoch": 0.74, + "grad_norm": 7.018303851147221, + "learning_rate": 1.6158848094880535e-06, + "loss": 0.5518, + "step": 9167 + }, + { + "epoch": 0.74, + "grad_norm": 3.8169427169205816, + "learning_rate": 1.6149166339770877e-06, + "loss": 0.6302, + "step": 9168 + }, + { + "epoch": 0.74, + "grad_norm": 7.209963653850187, + "learning_rate": 1.6139486927352048e-06, + "loss": 0.6847, + "step": 9169 + }, + { + "epoch": 0.74, + "grad_norm": 7.226084564230448, + "learning_rate": 1.6129809858293926e-06, + "loss": 0.6822, + "step": 9170 + }, + { + "epoch": 0.74, + "grad_norm": 3.655494651526364, + "learning_rate": 1.6120135133266208e-06, + "loss": 0.6129, + "step": 9171 + }, + { + "epoch": 0.74, + "grad_norm": 3.5918371148134525, + "learning_rate": 1.6110462752938482e-06, + "loss": 0.6865, + "step": 9172 + }, + { + "epoch": 0.75, + "grad_norm": 5.490431937751609, + "learning_rate": 1.6100792717980106e-06, + "loss": 0.7239, + "step": 9173 + }, + { + "epoch": 0.75, + "grad_norm": 3.684896562475687, + "learning_rate": 1.6091125029060335e-06, + "loss": 0.5958, + "step": 9174 + }, + { + "epoch": 0.75, + "grad_norm": 13.42468390490575, + "learning_rate": 1.6081459686848217e-06, + "loss": 0.7426, + "step": 9175 + }, + { + "epoch": 0.75, + "grad_norm": 5.73150237237512, + "learning_rate": 1.6071796692012663e-06, + "loss": 0.844, + "step": 9176 + }, + { + "epoch": 0.75, + "grad_norm": 5.643259136436941, + "learning_rate": 1.6062136045222388e-06, + "loss": 0.665, + "step": 9177 + }, + { + "epoch": 0.75, + "grad_norm": 3.333573845412961, + "learning_rate": 1.6052477747146006e-06, + "loss": 0.637, + "step": 9178 + }, + { + "epoch": 0.75, + "grad_norm": 5.2401039201135875, + "learning_rate": 1.6042821798451914e-06, + "loss": 0.6692, + "step": 9179 + }, + { + "epoch": 0.75, + "grad_norm": 7.129691568387476, + "learning_rate": 1.6033168199808352e-06, + "loss": 0.7452, + "step": 9180 + }, + { + "epoch": 0.75, + "grad_norm": 4.120948592788868, + "learning_rate": 1.6023516951883455e-06, + "loss": 0.7169, + "step": 9181 + }, + { + "epoch": 0.75, + "grad_norm": 7.693447296054683, + "learning_rate": 1.6013868055345084e-06, + "loss": 0.5477, + "step": 9182 + }, + { + "epoch": 0.75, + "grad_norm": 4.259877268147181, + "learning_rate": 1.6004221510861057e-06, + "loss": 0.5187, + "step": 9183 + }, + { + "epoch": 0.75, + "grad_norm": 7.118176312147558, + "learning_rate": 1.5994577319098936e-06, + "loss": 0.7644, + "step": 9184 + }, + { + "epoch": 0.75, + "grad_norm": 6.93585007449809, + "learning_rate": 1.5984935480726199e-06, + "loss": 0.729, + "step": 9185 + }, + { + "epoch": 0.75, + "grad_norm": 4.977732233558026, + "learning_rate": 1.5975295996410107e-06, + "loss": 0.6384, + "step": 9186 + }, + { + "epoch": 0.75, + "grad_norm": 4.80102309224025, + "learning_rate": 1.5965658866817751e-06, + "loss": 0.6703, + "step": 9187 + }, + { + "epoch": 0.75, + "grad_norm": 3.945065652565553, + "learning_rate": 1.5956024092616129e-06, + "loss": 0.6165, + "step": 9188 + }, + { + "epoch": 0.75, + "grad_norm": 3.046224525980792, + "learning_rate": 1.5946391674471968e-06, + "loss": 0.4656, + "step": 9189 + }, + { + "epoch": 0.75, + "grad_norm": 3.97849703743684, + "learning_rate": 1.5936761613051937e-06, + "loss": 0.6367, + "step": 9190 + }, + { + "epoch": 0.75, + "grad_norm": 7.265006225011878, + "learning_rate": 1.5927133909022469e-06, + "loss": 0.6929, + "step": 9191 + }, + { + "epoch": 0.75, + "grad_norm": 6.517950305423383, + "learning_rate": 1.5917508563049888e-06, + "loss": 0.7377, + "step": 9192 + }, + { + "epoch": 0.75, + "grad_norm": 2.6027746513511745, + "learning_rate": 1.5907885575800318e-06, + "loss": 0.6322, + "step": 9193 + }, + { + "epoch": 0.75, + "grad_norm": 3.5160709055603894, + "learning_rate": 1.5898264947939729e-06, + "loss": 0.7594, + "step": 9194 + }, + { + "epoch": 0.75, + "grad_norm": 6.451420791104515, + "learning_rate": 1.5888646680133923e-06, + "loss": 0.4824, + "step": 9195 + }, + { + "epoch": 0.75, + "grad_norm": 4.678427133251209, + "learning_rate": 1.5879030773048536e-06, + "loss": 0.61, + "step": 9196 + }, + { + "epoch": 0.75, + "grad_norm": 5.212637963787614, + "learning_rate": 1.5869417227349077e-06, + "loss": 0.8648, + "step": 9197 + }, + { + "epoch": 0.75, + "grad_norm": 2.9362544649209155, + "learning_rate": 1.5859806043700838e-06, + "loss": 0.5822, + "step": 9198 + }, + { + "epoch": 0.75, + "grad_norm": 4.05145130873793, + "learning_rate": 1.5850197222768998e-06, + "loss": 0.6556, + "step": 9199 + }, + { + "epoch": 0.75, + "grad_norm": 18.3590440407811, + "learning_rate": 1.5840590765218538e-06, + "loss": 0.5171, + "step": 9200 + }, + { + "epoch": 0.75, + "grad_norm": 2.595370668199203, + "learning_rate": 1.5830986671714283e-06, + "loss": 0.6932, + "step": 9201 + }, + { + "epoch": 0.75, + "grad_norm": 3.7085766653583896, + "learning_rate": 1.5821384942920876e-06, + "loss": 0.7209, + "step": 9202 + }, + { + "epoch": 0.75, + "grad_norm": 4.257791650298892, + "learning_rate": 1.5811785579502852e-06, + "loss": 0.5832, + "step": 9203 + }, + { + "epoch": 0.75, + "grad_norm": 4.921515809682467, + "learning_rate": 1.580218858212454e-06, + "loss": 0.6732, + "step": 9204 + }, + { + "epoch": 0.75, + "grad_norm": 4.387574225923554, + "learning_rate": 1.5792593951450085e-06, + "loss": 0.6015, + "step": 9205 + }, + { + "epoch": 0.75, + "grad_norm": 3.988231355404521, + "learning_rate": 1.578300168814353e-06, + "loss": 0.8296, + "step": 9206 + }, + { + "epoch": 0.75, + "grad_norm": 4.9590633483187805, + "learning_rate": 1.5773411792868692e-06, + "loss": 0.7607, + "step": 9207 + }, + { + "epoch": 0.75, + "grad_norm": 3.2155625784212076, + "learning_rate": 1.57638242662893e-06, + "loss": 0.8566, + "step": 9208 + }, + { + "epoch": 0.75, + "grad_norm": 3.875293183795946, + "learning_rate": 1.5754239109068804e-06, + "loss": 0.6667, + "step": 9209 + }, + { + "epoch": 0.75, + "grad_norm": 4.518434355778084, + "learning_rate": 1.574465632187061e-06, + "loss": 0.7105, + "step": 9210 + }, + { + "epoch": 0.75, + "grad_norm": 4.092580564897229, + "learning_rate": 1.5735075905357882e-06, + "loss": 0.5526, + "step": 9211 + }, + { + "epoch": 0.75, + "grad_norm": 4.659085709354241, + "learning_rate": 1.572549786019364e-06, + "loss": 0.5401, + "step": 9212 + }, + { + "epoch": 0.75, + "grad_norm": 4.908088813060428, + "learning_rate": 1.5715922187040771e-06, + "loss": 0.5642, + "step": 9213 + }, + { + "epoch": 0.75, + "grad_norm": 7.16254994324948, + "learning_rate": 1.5706348886561955e-06, + "loss": 0.7392, + "step": 9214 + }, + { + "epoch": 0.75, + "grad_norm": 17.269429750689987, + "learning_rate": 1.5696777959419729e-06, + "loss": 0.6543, + "step": 9215 + }, + { + "epoch": 0.75, + "grad_norm": 3.385640826487743, + "learning_rate": 1.5687209406276443e-06, + "loss": 0.6619, + "step": 9216 + }, + { + "epoch": 0.75, + "grad_norm": 4.856734977688676, + "learning_rate": 1.5677643227794332e-06, + "loss": 0.6164, + "step": 9217 + }, + { + "epoch": 0.75, + "grad_norm": 9.699678129090218, + "learning_rate": 1.5668079424635424e-06, + "loss": 0.5915, + "step": 9218 + }, + { + "epoch": 0.75, + "grad_norm": 6.769990278525421, + "learning_rate": 1.565851799746157e-06, + "loss": 0.6369, + "step": 9219 + }, + { + "epoch": 0.75, + "grad_norm": 5.307646279136852, + "learning_rate": 1.5648958946934523e-06, + "loss": 0.6575, + "step": 9220 + }, + { + "epoch": 0.75, + "grad_norm": 3.976543767937419, + "learning_rate": 1.563940227371581e-06, + "loss": 0.6661, + "step": 9221 + }, + { + "epoch": 0.75, + "grad_norm": 5.2934098774266785, + "learning_rate": 1.5629847978466805e-06, + "loss": 0.5588, + "step": 9222 + }, + { + "epoch": 0.75, + "grad_norm": 3.8612276933948975, + "learning_rate": 1.5620296061848722e-06, + "loss": 0.5009, + "step": 9223 + }, + { + "epoch": 0.75, + "grad_norm": 3.927145591459146, + "learning_rate": 1.561074652452264e-06, + "loss": 0.6675, + "step": 9224 + }, + { + "epoch": 0.75, + "grad_norm": 3.3608913380276064, + "learning_rate": 1.5601199367149432e-06, + "loss": 0.6538, + "step": 9225 + }, + { + "epoch": 0.75, + "grad_norm": 3.59653933875556, + "learning_rate": 1.5591654590389798e-06, + "loss": 0.6122, + "step": 9226 + }, + { + "epoch": 0.75, + "grad_norm": 4.250110997902879, + "learning_rate": 1.558211219490434e-06, + "loss": 0.7701, + "step": 9227 + }, + { + "epoch": 0.75, + "grad_norm": 8.47222757288281, + "learning_rate": 1.5572572181353435e-06, + "loss": 0.7227, + "step": 9228 + }, + { + "epoch": 0.75, + "grad_norm": 4.29823036502934, + "learning_rate": 1.5563034550397305e-06, + "loss": 0.6022, + "step": 9229 + }, + { + "epoch": 0.75, + "grad_norm": 8.753653718420093, + "learning_rate": 1.5553499302695996e-06, + "loss": 0.5702, + "step": 9230 + }, + { + "epoch": 0.75, + "grad_norm": 6.82887321416608, + "learning_rate": 1.5543966438909451e-06, + "loss": 0.6388, + "step": 9231 + }, + { + "epoch": 0.75, + "grad_norm": 14.36021441245323, + "learning_rate": 1.5534435959697363e-06, + "loss": 0.5621, + "step": 9232 + }, + { + "epoch": 0.75, + "grad_norm": 5.02160843440359, + "learning_rate": 1.5524907865719336e-06, + "loss": 0.6572, + "step": 9233 + }, + { + "epoch": 0.75, + "grad_norm": 9.689530491544076, + "learning_rate": 1.5515382157634756e-06, + "loss": 0.6774, + "step": 9234 + }, + { + "epoch": 0.75, + "grad_norm": 2.745878161202803, + "learning_rate": 1.5505858836102866e-06, + "loss": 0.653, + "step": 9235 + }, + { + "epoch": 0.75, + "grad_norm": 10.68215910521105, + "learning_rate": 1.5496337901782737e-06, + "loss": 0.7023, + "step": 9236 + }, + { + "epoch": 0.75, + "grad_norm": 10.267370816841499, + "learning_rate": 1.548681935533326e-06, + "loss": 0.5201, + "step": 9237 + }, + { + "epoch": 0.75, + "grad_norm": 5.03302229731131, + "learning_rate": 1.5477303197413213e-06, + "loss": 0.748, + "step": 9238 + }, + { + "epoch": 0.75, + "grad_norm": 3.0474537566442126, + "learning_rate": 1.5467789428681145e-06, + "loss": 0.7339, + "step": 9239 + }, + { + "epoch": 0.75, + "grad_norm": 4.320054887296264, + "learning_rate": 1.5458278049795495e-06, + "loss": 0.7397, + "step": 9240 + }, + { + "epoch": 0.75, + "grad_norm": 7.302954571483545, + "learning_rate": 1.5448769061414497e-06, + "loss": 0.8012, + "step": 9241 + }, + { + "epoch": 0.75, + "grad_norm": 3.0207065053807693, + "learning_rate": 1.5439262464196236e-06, + "loss": 0.5929, + "step": 9242 + }, + { + "epoch": 0.75, + "grad_norm": 13.230253778180412, + "learning_rate": 1.5429758258798622e-06, + "loss": 0.5731, + "step": 9243 + }, + { + "epoch": 0.75, + "grad_norm": 6.959799588574407, + "learning_rate": 1.542025644587939e-06, + "loss": 0.5715, + "step": 9244 + }, + { + "epoch": 0.75, + "grad_norm": 3.257006355714011, + "learning_rate": 1.5410757026096163e-06, + "loss": 0.6638, + "step": 9245 + }, + { + "epoch": 0.75, + "grad_norm": 8.227118253275556, + "learning_rate": 1.5401260000106321e-06, + "loss": 0.6396, + "step": 9246 + }, + { + "epoch": 0.75, + "grad_norm": 5.811584241571735, + "learning_rate": 1.5391765368567173e-06, + "loss": 0.7566, + "step": 9247 + }, + { + "epoch": 0.75, + "grad_norm": 5.569791300293353, + "learning_rate": 1.5382273132135745e-06, + "loss": 0.7143, + "step": 9248 + }, + { + "epoch": 0.75, + "grad_norm": 10.15907716062236, + "learning_rate": 1.5372783291469002e-06, + "loss": 0.7944, + "step": 9249 + }, + { + "epoch": 0.75, + "grad_norm": 4.704889215605705, + "learning_rate": 1.5363295847223685e-06, + "loss": 0.6312, + "step": 9250 + }, + { + "epoch": 0.75, + "grad_norm": 6.157170630343912, + "learning_rate": 1.5353810800056367e-06, + "loss": 0.7463, + "step": 9251 + }, + { + "epoch": 0.75, + "grad_norm": 5.111424047629009, + "learning_rate": 1.5344328150623516e-06, + "loss": 0.8069, + "step": 9252 + }, + { + "epoch": 0.75, + "grad_norm": 3.4527324622912072, + "learning_rate": 1.5334847899581344e-06, + "loss": 0.6963, + "step": 9253 + }, + { + "epoch": 0.75, + "grad_norm": 4.066958643787716, + "learning_rate": 1.5325370047586003e-06, + "loss": 0.6095, + "step": 9254 + }, + { + "epoch": 0.75, + "grad_norm": 7.533251528204885, + "learning_rate": 1.531589459529335e-06, + "loss": 0.5975, + "step": 9255 + }, + { + "epoch": 0.75, + "grad_norm": 4.161754736588987, + "learning_rate": 1.5306421543359195e-06, + "loss": 0.62, + "step": 9256 + }, + { + "epoch": 0.75, + "grad_norm": 3.302096731690723, + "learning_rate": 1.5296950892439106e-06, + "loss": 0.7082, + "step": 9257 + }, + { + "epoch": 0.75, + "grad_norm": 6.289009720198772, + "learning_rate": 1.528748264318854e-06, + "loss": 0.6463, + "step": 9258 + }, + { + "epoch": 0.75, + "grad_norm": 27.717254730586927, + "learning_rate": 1.527801679626274e-06, + "loss": 0.6503, + "step": 9259 + }, + { + "epoch": 0.75, + "grad_norm": 3.824479183718752, + "learning_rate": 1.526855335231679e-06, + "loss": 0.7478, + "step": 9260 + }, + { + "epoch": 0.75, + "grad_norm": 3.5740542654717222, + "learning_rate": 1.5259092312005668e-06, + "loss": 0.8412, + "step": 9261 + }, + { + "epoch": 0.75, + "grad_norm": 4.253259984536387, + "learning_rate": 1.5249633675984072e-06, + "loss": 0.5094, + "step": 9262 + }, + { + "epoch": 0.75, + "grad_norm": 129.67896132521813, + "learning_rate": 1.5240177444906651e-06, + "loss": 0.6777, + "step": 9263 + }, + { + "epoch": 0.75, + "grad_norm": 7.472854077658867, + "learning_rate": 1.5230723619427795e-06, + "loss": 0.7814, + "step": 9264 + }, + { + "epoch": 0.75, + "grad_norm": 3.1834672142842084, + "learning_rate": 1.5221272200201808e-06, + "loss": 0.7614, + "step": 9265 + }, + { + "epoch": 0.75, + "grad_norm": 3.8019906331171027, + "learning_rate": 1.5211823187882774e-06, + "loss": 0.7666, + "step": 9266 + }, + { + "epoch": 0.75, + "grad_norm": 4.466781684155773, + "learning_rate": 1.5202376583124617e-06, + "loss": 0.7644, + "step": 9267 + }, + { + "epoch": 0.75, + "grad_norm": 5.509186117026691, + "learning_rate": 1.5192932386581105e-06, + "loss": 0.7377, + "step": 9268 + }, + { + "epoch": 0.75, + "grad_norm": 5.506211265222116, + "learning_rate": 1.5183490598905814e-06, + "loss": 0.615, + "step": 9269 + }, + { + "epoch": 0.75, + "grad_norm": 3.649314717925277, + "learning_rate": 1.5174051220752216e-06, + "loss": 0.6686, + "step": 9270 + }, + { + "epoch": 0.75, + "grad_norm": 4.648718334978951, + "learning_rate": 1.5164614252773545e-06, + "loss": 0.7374, + "step": 9271 + }, + { + "epoch": 0.75, + "grad_norm": 6.535012781862225, + "learning_rate": 1.5155179695622918e-06, + "loss": 0.6309, + "step": 9272 + }, + { + "epoch": 0.75, + "grad_norm": 3.9433851342779307, + "learning_rate": 1.514574754995326e-06, + "loss": 0.4907, + "step": 9273 + }, + { + "epoch": 0.75, + "grad_norm": 6.350863966303147, + "learning_rate": 1.5136317816417333e-06, + "loss": 0.7643, + "step": 9274 + }, + { + "epoch": 0.75, + "grad_norm": 4.894091796859776, + "learning_rate": 1.5126890495667734e-06, + "loss": 0.6533, + "step": 9275 + }, + { + "epoch": 0.75, + "grad_norm": 7.639719974319914, + "learning_rate": 1.5117465588356871e-06, + "loss": 0.8022, + "step": 9276 + }, + { + "epoch": 0.75, + "grad_norm": 4.347682022479323, + "learning_rate": 1.5108043095137048e-06, + "loss": 0.6198, + "step": 9277 + }, + { + "epoch": 0.75, + "grad_norm": 3.191939686103781, + "learning_rate": 1.5098623016660325e-06, + "loss": 0.5919, + "step": 9278 + }, + { + "epoch": 0.75, + "grad_norm": 5.20371465162756, + "learning_rate": 1.5089205353578663e-06, + "loss": 0.5907, + "step": 9279 + }, + { + "epoch": 0.75, + "grad_norm": 2.855046168317262, + "learning_rate": 1.507979010654379e-06, + "loss": 0.6376, + "step": 9280 + }, + { + "epoch": 0.75, + "grad_norm": 5.142402084139453, + "learning_rate": 1.5070377276207348e-06, + "loss": 0.6523, + "step": 9281 + }, + { + "epoch": 0.75, + "grad_norm": 4.838668671451622, + "learning_rate": 1.50609668632207e-06, + "loss": 0.5614, + "step": 9282 + }, + { + "epoch": 0.75, + "grad_norm": 3.2125028433098923, + "learning_rate": 1.505155886823516e-06, + "loss": 0.7854, + "step": 9283 + }, + { + "epoch": 0.75, + "grad_norm": 3.4174997687799715, + "learning_rate": 1.5042153291901796e-06, + "loss": 0.609, + "step": 9284 + }, + { + "epoch": 0.75, + "grad_norm": 3.845669736688184, + "learning_rate": 1.5032750134871527e-06, + "loss": 0.7142, + "step": 9285 + }, + { + "epoch": 0.75, + "grad_norm": 3.7994916286810474, + "learning_rate": 1.5023349397795128e-06, + "loss": 0.7153, + "step": 9286 + }, + { + "epoch": 0.75, + "grad_norm": 3.235914159338443, + "learning_rate": 1.5013951081323186e-06, + "loss": 0.6226, + "step": 9287 + }, + { + "epoch": 0.75, + "grad_norm": 4.123545209455686, + "learning_rate": 1.5004555186106124e-06, + "loss": 0.692, + "step": 9288 + }, + { + "epoch": 0.75, + "grad_norm": 3.2544455607248937, + "learning_rate": 1.499516171279417e-06, + "loss": 0.6108, + "step": 9289 + }, + { + "epoch": 0.75, + "grad_norm": 4.435548968747164, + "learning_rate": 1.4985770662037453e-06, + "loss": 0.5999, + "step": 9290 + }, + { + "epoch": 0.75, + "grad_norm": 3.816375521089074, + "learning_rate": 1.4976382034485876e-06, + "loss": 0.6891, + "step": 9291 + }, + { + "epoch": 0.75, + "grad_norm": 3.9966555691748056, + "learning_rate": 1.4966995830789167e-06, + "loss": 0.8782, + "step": 9292 + }, + { + "epoch": 0.75, + "grad_norm": 7.4538069667697195, + "learning_rate": 1.4957612051596953e-06, + "loss": 0.6575, + "step": 9293 + }, + { + "epoch": 0.75, + "grad_norm": 4.935559227840784, + "learning_rate": 1.494823069755863e-06, + "loss": 0.7139, + "step": 9294 + }, + { + "epoch": 0.75, + "grad_norm": 3.0903064557130886, + "learning_rate": 1.4938851769323449e-06, + "loss": 0.6205, + "step": 9295 + }, + { + "epoch": 0.76, + "grad_norm": 5.808422615849854, + "learning_rate": 1.4929475267540467e-06, + "loss": 0.8061, + "step": 9296 + }, + { + "epoch": 0.76, + "grad_norm": 6.791235467919618, + "learning_rate": 1.4920101192858637e-06, + "loss": 0.7637, + "step": 9297 + }, + { + "epoch": 0.76, + "grad_norm": 3.683634453110716, + "learning_rate": 1.4910729545926689e-06, + "loss": 0.7237, + "step": 9298 + }, + { + "epoch": 0.76, + "grad_norm": 3.622448925006723, + "learning_rate": 1.4901360327393177e-06, + "loss": 0.5661, + "step": 9299 + }, + { + "epoch": 0.76, + "grad_norm": 4.695114106657661, + "learning_rate": 1.4891993537906563e-06, + "loss": 0.6312, + "step": 9300 + }, + { + "epoch": 0.76, + "grad_norm": 3.812612347237318, + "learning_rate": 1.488262917811502e-06, + "loss": 0.6054, + "step": 9301 + }, + { + "epoch": 0.76, + "grad_norm": 7.009663055766406, + "learning_rate": 1.487326724866668e-06, + "loss": 0.6748, + "step": 9302 + }, + { + "epoch": 0.76, + "grad_norm": 6.083384332947613, + "learning_rate": 1.4863907750209399e-06, + "loss": 0.6129, + "step": 9303 + }, + { + "epoch": 0.76, + "grad_norm": 8.3833174880032, + "learning_rate": 1.485455068339095e-06, + "loss": 0.7168, + "step": 9304 + }, + { + "epoch": 0.76, + "grad_norm": 2.913862213041282, + "learning_rate": 1.484519604885888e-06, + "loss": 0.7563, + "step": 9305 + }, + { + "epoch": 0.76, + "grad_norm": 13.314362639088515, + "learning_rate": 1.4835843847260605e-06, + "loss": 0.552, + "step": 9306 + }, + { + "epoch": 0.76, + "grad_norm": 3.4298889728525794, + "learning_rate": 1.4826494079243353e-06, + "loss": 0.7031, + "step": 9307 + }, + { + "epoch": 0.76, + "grad_norm": 4.877332375157771, + "learning_rate": 1.4817146745454174e-06, + "loss": 0.7611, + "step": 9308 + }, + { + "epoch": 0.76, + "grad_norm": 3.6126484827195955, + "learning_rate": 1.4807801846539977e-06, + "loss": 0.7667, + "step": 9309 + }, + { + "epoch": 0.76, + "grad_norm": 6.170541528714656, + "learning_rate": 1.4798459383147462e-06, + "loss": 0.7008, + "step": 9310 + }, + { + "epoch": 0.76, + "grad_norm": 3.407821872231295, + "learning_rate": 1.4789119355923227e-06, + "loss": 0.759, + "step": 9311 + }, + { + "epoch": 0.76, + "grad_norm": 7.012052171750457, + "learning_rate": 1.4779781765513612e-06, + "loss": 0.6111, + "step": 9312 + }, + { + "epoch": 0.76, + "grad_norm": 18.56320732367359, + "learning_rate": 1.4770446612564887e-06, + "loss": 0.7102, + "step": 9313 + }, + { + "epoch": 0.76, + "grad_norm": 4.955881004437763, + "learning_rate": 1.4761113897723078e-06, + "loss": 0.5928, + "step": 9314 + }, + { + "epoch": 0.76, + "grad_norm": 4.730250532559149, + "learning_rate": 1.475178362163407e-06, + "loss": 0.6538, + "step": 9315 + }, + { + "epoch": 0.76, + "grad_norm": 4.7496379987671595, + "learning_rate": 1.4742455784943576e-06, + "loss": 0.7231, + "step": 9316 + }, + { + "epoch": 0.76, + "grad_norm": 11.364472325159019, + "learning_rate": 1.4733130388297124e-06, + "loss": 0.5832, + "step": 9317 + }, + { + "epoch": 0.76, + "grad_norm": 4.457145344889489, + "learning_rate": 1.4723807432340125e-06, + "loss": 0.7458, + "step": 9318 + }, + { + "epoch": 0.76, + "grad_norm": 6.800535402176218, + "learning_rate": 1.4714486917717753e-06, + "loss": 0.7445, + "step": 9319 + }, + { + "epoch": 0.76, + "grad_norm": 3.714306216479424, + "learning_rate": 1.4705168845075095e-06, + "loss": 0.6098, + "step": 9320 + }, + { + "epoch": 0.76, + "grad_norm": 5.306299729887863, + "learning_rate": 1.4695853215056955e-06, + "loss": 0.6536, + "step": 9321 + }, + { + "epoch": 0.76, + "grad_norm": 4.20643848952529, + "learning_rate": 1.4686540028308083e-06, + "loss": 0.7495, + "step": 9322 + }, + { + "epoch": 0.76, + "grad_norm": 3.6561084069340892, + "learning_rate": 1.4677229285472988e-06, + "loss": 0.6338, + "step": 9323 + }, + { + "epoch": 0.76, + "grad_norm": 4.265606389719064, + "learning_rate": 1.4667920987196028e-06, + "loss": 0.6615, + "step": 9324 + }, + { + "epoch": 0.76, + "grad_norm": 3.028822361190528, + "learning_rate": 1.4658615134121417e-06, + "loss": 0.7723, + "step": 9325 + }, + { + "epoch": 0.76, + "grad_norm": 5.743626541381983, + "learning_rate": 1.4649311726893151e-06, + "loss": 0.7238, + "step": 9326 + }, + { + "epoch": 0.76, + "grad_norm": 4.676897935008805, + "learning_rate": 1.4640010766155128e-06, + "loss": 0.6896, + "step": 9327 + }, + { + "epoch": 0.76, + "grad_norm": 15.156779956523764, + "learning_rate": 1.4630712252550977e-06, + "loss": 0.6965, + "step": 9328 + }, + { + "epoch": 0.76, + "grad_norm": 4.01535544773294, + "learning_rate": 1.4621416186724257e-06, + "loss": 0.604, + "step": 9329 + }, + { + "epoch": 0.76, + "grad_norm": 5.867567225253043, + "learning_rate": 1.4612122569318282e-06, + "loss": 0.5211, + "step": 9330 + }, + { + "epoch": 0.76, + "grad_norm": 3.968299325985637, + "learning_rate": 1.4602831400976263e-06, + "loss": 0.6325, + "step": 9331 + }, + { + "epoch": 0.76, + "grad_norm": 5.332360261243705, + "learning_rate": 1.4593542682341193e-06, + "loss": 0.6537, + "step": 9332 + }, + { + "epoch": 0.76, + "grad_norm": 5.237917683230502, + "learning_rate": 1.4584256414055886e-06, + "loss": 0.5422, + "step": 9333 + }, + { + "epoch": 0.76, + "grad_norm": 3.7038639417554093, + "learning_rate": 1.4574972596763066e-06, + "loss": 0.7572, + "step": 9334 + }, + { + "epoch": 0.76, + "grad_norm": 4.366763674415903, + "learning_rate": 1.456569123110516e-06, + "loss": 0.5537, + "step": 9335 + }, + { + "epoch": 0.76, + "grad_norm": 4.048691976731345, + "learning_rate": 1.4556412317724556e-06, + "loss": 0.6057, + "step": 9336 + }, + { + "epoch": 0.76, + "grad_norm": 3.1770592548547096, + "learning_rate": 1.4547135857263372e-06, + "loss": 0.8133, + "step": 9337 + }, + { + "epoch": 0.76, + "grad_norm": 4.273052178917577, + "learning_rate": 1.4537861850363633e-06, + "loss": 0.6866, + "step": 9338 + }, + { + "epoch": 0.76, + "grad_norm": 7.993579487150804, + "learning_rate": 1.452859029766714e-06, + "loss": 0.5804, + "step": 9339 + }, + { + "epoch": 0.76, + "grad_norm": 3.5101684811171205, + "learning_rate": 1.4519321199815544e-06, + "loss": 0.6341, + "step": 9340 + }, + { + "epoch": 0.76, + "grad_norm": 3.7801582414463124, + "learning_rate": 1.4510054557450332e-06, + "loss": 0.6836, + "step": 9341 + }, + { + "epoch": 0.76, + "grad_norm": 4.788131555611861, + "learning_rate": 1.4500790371212786e-06, + "loss": 0.5263, + "step": 9342 + }, + { + "epoch": 0.76, + "grad_norm": 4.148113292645213, + "learning_rate": 1.4491528641744085e-06, + "loss": 0.6957, + "step": 9343 + }, + { + "epoch": 0.76, + "grad_norm": 5.82267111825369, + "learning_rate": 1.448226936968517e-06, + "loss": 0.5848, + "step": 9344 + }, + { + "epoch": 0.76, + "grad_norm": 3.306365422177082, + "learning_rate": 1.4473012555676862e-06, + "loss": 0.698, + "step": 9345 + }, + { + "epoch": 0.76, + "grad_norm": 3.9184169230978316, + "learning_rate": 1.4463758200359783e-06, + "loss": 0.7738, + "step": 9346 + }, + { + "epoch": 0.76, + "grad_norm": 3.9343299528086617, + "learning_rate": 1.4454506304374394e-06, + "loss": 0.6179, + "step": 9347 + }, + { + "epoch": 0.76, + "grad_norm": 3.8637123397143474, + "learning_rate": 1.4445256868360979e-06, + "loss": 0.5942, + "step": 9348 + }, + { + "epoch": 0.76, + "grad_norm": 6.378754228020261, + "learning_rate": 1.4436009892959647e-06, + "loss": 0.5277, + "step": 9349 + }, + { + "epoch": 0.76, + "grad_norm": 5.988587805054817, + "learning_rate": 1.4426765378810376e-06, + "loss": 0.6473, + "step": 9350 + }, + { + "epoch": 0.76, + "grad_norm": 7.990524893043351, + "learning_rate": 1.4417523326552911e-06, + "loss": 0.7076, + "step": 9351 + }, + { + "epoch": 0.76, + "grad_norm": 3.8324272212929333, + "learning_rate": 1.4408283736826894e-06, + "loss": 0.7753, + "step": 9352 + }, + { + "epoch": 0.76, + "grad_norm": 3.045654808839438, + "learning_rate": 1.4399046610271726e-06, + "loss": 0.6739, + "step": 9353 + }, + { + "epoch": 0.76, + "grad_norm": 5.6403461302166225, + "learning_rate": 1.4389811947526733e-06, + "loss": 0.5497, + "step": 9354 + }, + { + "epoch": 0.76, + "grad_norm": 6.415072690864279, + "learning_rate": 1.4380579749230938e-06, + "loss": 0.689, + "step": 9355 + }, + { + "epoch": 0.76, + "grad_norm": 7.0265031471379595, + "learning_rate": 1.4371350016023323e-06, + "loss": 0.6661, + "step": 9356 + }, + { + "epoch": 0.76, + "grad_norm": 4.259754088083147, + "learning_rate": 1.4362122748542617e-06, + "loss": 0.7027, + "step": 9357 + }, + { + "epoch": 0.76, + "grad_norm": 5.793872018476198, + "learning_rate": 1.4352897947427396e-06, + "loss": 0.7106, + "step": 9358 + }, + { + "epoch": 0.76, + "grad_norm": 3.24953483362668, + "learning_rate": 1.434367561331611e-06, + "loss": 0.49, + "step": 9359 + }, + { + "epoch": 0.76, + "grad_norm": 5.098133111989722, + "learning_rate": 1.433445574684698e-06, + "loss": 0.6826, + "step": 9360 + }, + { + "epoch": 0.76, + "grad_norm": 3.3460340922132232, + "learning_rate": 1.4325238348658082e-06, + "loss": 0.6714, + "step": 9361 + }, + { + "epoch": 0.76, + "grad_norm": 3.890265257009263, + "learning_rate": 1.4316023419387303e-06, + "loss": 0.6585, + "step": 9362 + }, + { + "epoch": 0.76, + "grad_norm": 5.057762003928424, + "learning_rate": 1.43068109596724e-06, + "loss": 0.7342, + "step": 9363 + }, + { + "epoch": 0.76, + "grad_norm": 3.427564669345006, + "learning_rate": 1.4297600970150927e-06, + "loss": 0.6393, + "step": 9364 + }, + { + "epoch": 0.76, + "grad_norm": 2.6128378002813206, + "learning_rate": 1.4288393451460248e-06, + "loss": 0.6295, + "step": 9365 + }, + { + "epoch": 0.76, + "grad_norm": 7.0161774381701125, + "learning_rate": 1.4279188404237615e-06, + "loss": 0.6829, + "step": 9366 + }, + { + "epoch": 0.76, + "grad_norm": 3.1804544126040577, + "learning_rate": 1.4269985829120065e-06, + "loss": 0.6144, + "step": 9367 + }, + { + "epoch": 0.76, + "grad_norm": 11.214863897223797, + "learning_rate": 1.426078572674447e-06, + "loss": 0.5799, + "step": 9368 + }, + { + "epoch": 0.76, + "grad_norm": 25.30369612817867, + "learning_rate": 1.4251588097747515e-06, + "loss": 0.6735, + "step": 9369 + }, + { + "epoch": 0.76, + "grad_norm": 3.630466790980653, + "learning_rate": 1.4242392942765775e-06, + "loss": 0.675, + "step": 9370 + }, + { + "epoch": 0.76, + "grad_norm": 6.3586736554188095, + "learning_rate": 1.4233200262435592e-06, + "loss": 0.712, + "step": 9371 + }, + { + "epoch": 0.76, + "grad_norm": 5.179197510191876, + "learning_rate": 1.422401005739314e-06, + "loss": 0.5815, + "step": 9372 + }, + { + "epoch": 0.76, + "grad_norm": 5.113403459690238, + "learning_rate": 1.4214822328274485e-06, + "loss": 0.6101, + "step": 9373 + }, + { + "epoch": 0.76, + "grad_norm": 2.9333260586841323, + "learning_rate": 1.4205637075715418e-06, + "loss": 0.7115, + "step": 9374 + }, + { + "epoch": 0.76, + "grad_norm": 6.9652998235247345, + "learning_rate": 1.4196454300351665e-06, + "loss": 0.7521, + "step": 9375 + }, + { + "epoch": 0.76, + "grad_norm": 3.619509654471619, + "learning_rate": 1.418727400281869e-06, + "loss": 0.5868, + "step": 9376 + }, + { + "epoch": 0.76, + "grad_norm": 5.222293923131044, + "learning_rate": 1.4178096183751866e-06, + "loss": 0.6987, + "step": 9377 + }, + { + "epoch": 0.76, + "grad_norm": 6.382648874908834, + "learning_rate": 1.4168920843786326e-06, + "loss": 0.7036, + "step": 9378 + }, + { + "epoch": 0.76, + "grad_norm": 2.6352075156029686, + "learning_rate": 1.4159747983557093e-06, + "loss": 0.5236, + "step": 9379 + }, + { + "epoch": 0.76, + "grad_norm": 2.066683403265199, + "learning_rate": 1.4150577603698962e-06, + "loss": 0.5709, + "step": 9380 + }, + { + "epoch": 0.76, + "grad_norm": 6.5213741077241805, + "learning_rate": 1.4141409704846592e-06, + "loss": 0.5405, + "step": 9381 + }, + { + "epoch": 0.76, + "grad_norm": 3.637692591390936, + "learning_rate": 1.4132244287634456e-06, + "loss": 0.6722, + "step": 9382 + }, + { + "epoch": 0.76, + "grad_norm": 3.6366763484426174, + "learning_rate": 1.4123081352696838e-06, + "loss": 0.586, + "step": 9383 + }, + { + "epoch": 0.76, + "grad_norm": 2.773732006360121, + "learning_rate": 1.4113920900667905e-06, + "loss": 0.5606, + "step": 9384 + }, + { + "epoch": 0.76, + "grad_norm": 3.187365333462406, + "learning_rate": 1.4104762932181592e-06, + "loss": 0.6227, + "step": 9385 + }, + { + "epoch": 0.76, + "grad_norm": 2.1990589750044864, + "learning_rate": 1.4095607447871711e-06, + "loss": 0.7466, + "step": 9386 + }, + { + "epoch": 0.76, + "grad_norm": 3.849252959376761, + "learning_rate": 1.4086454448371873e-06, + "loss": 0.6404, + "step": 9387 + }, + { + "epoch": 0.76, + "grad_norm": 4.294010139524278, + "learning_rate": 1.4077303934315511e-06, + "loss": 0.6122, + "step": 9388 + }, + { + "epoch": 0.76, + "grad_norm": 4.0210574665377194, + "learning_rate": 1.4068155906335906e-06, + "loss": 0.6887, + "step": 9389 + }, + { + "epoch": 0.76, + "grad_norm": 5.575880392902406, + "learning_rate": 1.4059010365066145e-06, + "loss": 0.6074, + "step": 9390 + }, + { + "epoch": 0.76, + "grad_norm": 3.9841927375665755, + "learning_rate": 1.4049867311139182e-06, + "loss": 0.7425, + "step": 9391 + }, + { + "epoch": 0.76, + "grad_norm": 7.987163491207309, + "learning_rate": 1.4040726745187749e-06, + "loss": 0.7611, + "step": 9392 + }, + { + "epoch": 0.76, + "grad_norm": 3.913856026919917, + "learning_rate": 1.4031588667844476e-06, + "loss": 0.6164, + "step": 9393 + }, + { + "epoch": 0.76, + "grad_norm": 7.953483679783007, + "learning_rate": 1.402245307974171e-06, + "loss": 0.6473, + "step": 9394 + }, + { + "epoch": 0.76, + "grad_norm": 7.532476624848864, + "learning_rate": 1.4013319981511736e-06, + "loss": 0.7298, + "step": 9395 + }, + { + "epoch": 0.76, + "grad_norm": 4.20155485111007, + "learning_rate": 1.4004189373786614e-06, + "loss": 0.7231, + "step": 9396 + }, + { + "epoch": 0.76, + "grad_norm": 5.394815073032657, + "learning_rate": 1.3995061257198224e-06, + "loss": 0.5554, + "step": 9397 + }, + { + "epoch": 0.76, + "grad_norm": 3.2714839651335272, + "learning_rate": 1.398593563237831e-06, + "loss": 0.7769, + "step": 9398 + }, + { + "epoch": 0.76, + "grad_norm": 4.4275263483779606, + "learning_rate": 1.3976812499958397e-06, + "loss": 0.5575, + "step": 9399 + }, + { + "epoch": 0.76, + "grad_norm": 3.5053858514271337, + "learning_rate": 1.3967691860569915e-06, + "loss": 0.6995, + "step": 9400 + }, + { + "epoch": 0.76, + "grad_norm": 4.505526921097421, + "learning_rate": 1.3958573714844005e-06, + "loss": 0.6892, + "step": 9401 + }, + { + "epoch": 0.76, + "grad_norm": 4.092998180178473, + "learning_rate": 1.3949458063411742e-06, + "loss": 0.7358, + "step": 9402 + }, + { + "epoch": 0.76, + "grad_norm": 3.0186226234192866, + "learning_rate": 1.3940344906903957e-06, + "loss": 0.4459, + "step": 9403 + }, + { + "epoch": 0.76, + "grad_norm": 3.514920750122406, + "learning_rate": 1.3931234245951375e-06, + "loss": 0.5682, + "step": 9404 + }, + { + "epoch": 0.76, + "grad_norm": 4.459417237539226, + "learning_rate": 1.3922126081184484e-06, + "loss": 0.6886, + "step": 9405 + }, + { + "epoch": 0.76, + "grad_norm": 2.7926230018193845, + "learning_rate": 1.3913020413233625e-06, + "loss": 0.796, + "step": 9406 + }, + { + "epoch": 0.76, + "grad_norm": 3.058280350991169, + "learning_rate": 1.3903917242729004e-06, + "loss": 0.5742, + "step": 9407 + }, + { + "epoch": 0.76, + "grad_norm": 42.39492807120868, + "learning_rate": 1.3894816570300557e-06, + "loss": 0.7018, + "step": 9408 + }, + { + "epoch": 0.76, + "grad_norm": 8.678357776462851, + "learning_rate": 1.3885718396578157e-06, + "loss": 0.7281, + "step": 9409 + }, + { + "epoch": 0.76, + "grad_norm": 4.681286840037638, + "learning_rate": 1.3876622722191425e-06, + "loss": 0.5698, + "step": 9410 + }, + { + "epoch": 0.76, + "grad_norm": 5.840682750296039, + "learning_rate": 1.3867529547769865e-06, + "loss": 0.5836, + "step": 9411 + }, + { + "epoch": 0.76, + "grad_norm": 3.466426624789153, + "learning_rate": 1.3858438873942765e-06, + "loss": 0.7828, + "step": 9412 + }, + { + "epoch": 0.76, + "grad_norm": 4.662761279982415, + "learning_rate": 1.3849350701339265e-06, + "loss": 0.6137, + "step": 9413 + }, + { + "epoch": 0.76, + "grad_norm": 3.026888218262693, + "learning_rate": 1.3840265030588323e-06, + "loss": 0.4448, + "step": 9414 + }, + { + "epoch": 0.76, + "grad_norm": 4.827438691696988, + "learning_rate": 1.3831181862318704e-06, + "loss": 0.6629, + "step": 9415 + }, + { + "epoch": 0.76, + "grad_norm": 3.8725643768166362, + "learning_rate": 1.3822101197159049e-06, + "loss": 0.6567, + "step": 9416 + }, + { + "epoch": 0.76, + "grad_norm": 4.657553514633317, + "learning_rate": 1.3813023035737778e-06, + "loss": 0.4273, + "step": 9417 + }, + { + "epoch": 0.76, + "grad_norm": 19.357686246995534, + "learning_rate": 1.3803947378683174e-06, + "loss": 0.7513, + "step": 9418 + }, + { + "epoch": 0.77, + "grad_norm": 5.610892118403507, + "learning_rate": 1.3794874226623323e-06, + "loss": 0.7867, + "step": 9419 + }, + { + "epoch": 0.77, + "grad_norm": 3.8942101571835606, + "learning_rate": 1.3785803580186141e-06, + "loss": 0.6229, + "step": 9420 + }, + { + "epoch": 0.77, + "grad_norm": 9.136690708914886, + "learning_rate": 1.3776735439999379e-06, + "loss": 0.5678, + "step": 9421 + }, + { + "epoch": 0.77, + "grad_norm": 5.614065362734215, + "learning_rate": 1.3767669806690586e-06, + "loss": 0.7685, + "step": 9422 + }, + { + "epoch": 0.77, + "grad_norm": 3.0858022072414943, + "learning_rate": 1.3758606680887194e-06, + "loss": 0.6561, + "step": 9423 + }, + { + "epoch": 0.77, + "grad_norm": 3.767367994011455, + "learning_rate": 1.37495460632164e-06, + "loss": 0.5858, + "step": 9424 + }, + { + "epoch": 0.77, + "grad_norm": 2.657141547611293, + "learning_rate": 1.3740487954305288e-06, + "loss": 0.718, + "step": 9425 + }, + { + "epoch": 0.77, + "grad_norm": 4.026550591224162, + "learning_rate": 1.3731432354780716e-06, + "loss": 0.6113, + "step": 9426 + }, + { + "epoch": 0.77, + "grad_norm": 16.161578514508417, + "learning_rate": 1.3722379265269393e-06, + "loss": 0.6813, + "step": 9427 + }, + { + "epoch": 0.77, + "grad_norm": 8.296344156904274, + "learning_rate": 1.3713328686397832e-06, + "loss": 0.802, + "step": 9428 + }, + { + "epoch": 0.77, + "grad_norm": 3.0271976617204652, + "learning_rate": 1.3704280618792415e-06, + "loss": 0.717, + "step": 9429 + }, + { + "epoch": 0.77, + "grad_norm": 4.739558449333144, + "learning_rate": 1.3695235063079322e-06, + "loss": 0.6639, + "step": 9430 + }, + { + "epoch": 0.77, + "grad_norm": 15.09090945733377, + "learning_rate": 1.3686192019884542e-06, + "loss": 0.6142, + "step": 9431 + }, + { + "epoch": 0.77, + "grad_norm": 3.5850617805810323, + "learning_rate": 1.3677151489833933e-06, + "loss": 0.7216, + "step": 9432 + }, + { + "epoch": 0.77, + "grad_norm": 2.5573239524107496, + "learning_rate": 1.3668113473553157e-06, + "loss": 0.6464, + "step": 9433 + }, + { + "epoch": 0.77, + "grad_norm": 5.633427485422954, + "learning_rate": 1.3659077971667689e-06, + "loss": 0.7828, + "step": 9434 + }, + { + "epoch": 0.77, + "grad_norm": 3.880682936681103, + "learning_rate": 1.365004498480283e-06, + "loss": 0.6984, + "step": 9435 + }, + { + "epoch": 0.77, + "grad_norm": 16.854026774746146, + "learning_rate": 1.3641014513583755e-06, + "loss": 0.6061, + "step": 9436 + }, + { + "epoch": 0.77, + "grad_norm": 2.2413485418308796, + "learning_rate": 1.3631986558635408e-06, + "loss": 0.5517, + "step": 9437 + }, + { + "epoch": 0.77, + "grad_norm": 5.871668771035668, + "learning_rate": 1.3622961120582567e-06, + "loss": 0.6006, + "step": 9438 + }, + { + "epoch": 0.77, + "grad_norm": 14.038943024850285, + "learning_rate": 1.3613938200049886e-06, + "loss": 0.5614, + "step": 9439 + }, + { + "epoch": 0.77, + "grad_norm": 3.546456078083907, + "learning_rate": 1.3604917797661782e-06, + "loss": 0.703, + "step": 9440 + }, + { + "epoch": 0.77, + "grad_norm": 3.2314983478417942, + "learning_rate": 1.3595899914042531e-06, + "loss": 0.5713, + "step": 9441 + }, + { + "epoch": 0.77, + "grad_norm": 3.752462864977622, + "learning_rate": 1.358688454981621e-06, + "loss": 0.7105, + "step": 9442 + }, + { + "epoch": 0.77, + "grad_norm": 13.505514208892068, + "learning_rate": 1.3577871705606765e-06, + "loss": 0.5521, + "step": 9443 + }, + { + "epoch": 0.77, + "grad_norm": 4.972439527237001, + "learning_rate": 1.3568861382037934e-06, + "loss": 0.6371, + "step": 9444 + }, + { + "epoch": 0.77, + "grad_norm": 4.84470654981171, + "learning_rate": 1.3559853579733274e-06, + "loss": 0.8575, + "step": 9445 + }, + { + "epoch": 0.77, + "grad_norm": 3.37562565935764, + "learning_rate": 1.3550848299316216e-06, + "loss": 0.5746, + "step": 9446 + }, + { + "epoch": 0.77, + "grad_norm": 3.2630794649236337, + "learning_rate": 1.354184554140993e-06, + "loss": 0.7595, + "step": 9447 + }, + { + "epoch": 0.77, + "grad_norm": 4.797906327124326, + "learning_rate": 1.353284530663751e-06, + "loss": 0.6748, + "step": 9448 + }, + { + "epoch": 0.77, + "grad_norm": 4.498425560107589, + "learning_rate": 1.3523847595621792e-06, + "loss": 0.7346, + "step": 9449 + }, + { + "epoch": 0.77, + "grad_norm": 3.5797633960400828, + "learning_rate": 1.3514852408985513e-06, + "loss": 0.5507, + "step": 9450 + }, + { + "epoch": 0.77, + "grad_norm": 3.5582291379528, + "learning_rate": 1.3505859747351174e-06, + "loss": 0.6038, + "step": 9451 + }, + { + "epoch": 0.77, + "grad_norm": 4.807824396820554, + "learning_rate": 1.3496869611341107e-06, + "loss": 0.6591, + "step": 9452 + }, + { + "epoch": 0.77, + "grad_norm": 2.814359334846196, + "learning_rate": 1.348788200157753e-06, + "loss": 0.717, + "step": 9453 + }, + { + "epoch": 0.77, + "grad_norm": 13.986395786837159, + "learning_rate": 1.347889691868241e-06, + "loss": 0.6358, + "step": 9454 + }, + { + "epoch": 0.77, + "grad_norm": 3.336861632328459, + "learning_rate": 1.3469914363277582e-06, + "loss": 0.6312, + "step": 9455 + }, + { + "epoch": 0.77, + "grad_norm": 2.9235675377273656, + "learning_rate": 1.3460934335984677e-06, + "loss": 0.4495, + "step": 9456 + }, + { + "epoch": 0.77, + "grad_norm": 3.077665290491094, + "learning_rate": 1.34519568374252e-06, + "loss": 0.6599, + "step": 9457 + }, + { + "epoch": 0.77, + "grad_norm": 5.249627745329606, + "learning_rate": 1.3442981868220423e-06, + "loss": 0.5464, + "step": 9458 + }, + { + "epoch": 0.77, + "grad_norm": 2.7728833694350725, + "learning_rate": 1.343400942899149e-06, + "loss": 0.6116, + "step": 9459 + }, + { + "epoch": 0.77, + "grad_norm": 3.7730671031244585, + "learning_rate": 1.3425039520359352e-06, + "loss": 0.6769, + "step": 9460 + }, + { + "epoch": 0.77, + "grad_norm": 4.421558918751307, + "learning_rate": 1.3416072142944768e-06, + "loss": 0.53, + "step": 9461 + }, + { + "epoch": 0.77, + "grad_norm": 2.869963322675455, + "learning_rate": 1.340710729736835e-06, + "loss": 0.643, + "step": 9462 + }, + { + "epoch": 0.77, + "grad_norm": 3.2423807961942686, + "learning_rate": 1.3398144984250493e-06, + "loss": 0.776, + "step": 9463 + }, + { + "epoch": 0.77, + "grad_norm": 9.9924529501169, + "learning_rate": 1.3389185204211487e-06, + "loss": 0.6591, + "step": 9464 + }, + { + "epoch": 0.77, + "grad_norm": 7.117505675698146, + "learning_rate": 1.3380227957871366e-06, + "loss": 0.5951, + "step": 9465 + }, + { + "epoch": 0.77, + "grad_norm": 3.275376050156914, + "learning_rate": 1.337127324585008e-06, + "loss": 0.616, + "step": 9466 + }, + { + "epoch": 0.77, + "grad_norm": 3.4346046303001825, + "learning_rate": 1.3362321068767293e-06, + "loss": 0.592, + "step": 9467 + }, + { + "epoch": 0.77, + "grad_norm": 3.4120153758497995, + "learning_rate": 1.3353371427242585e-06, + "loss": 0.7462, + "step": 9468 + }, + { + "epoch": 0.77, + "grad_norm": 3.664979299883388, + "learning_rate": 1.3344424321895328e-06, + "loss": 0.7389, + "step": 9469 + }, + { + "epoch": 0.77, + "grad_norm": 2.2984499813772734, + "learning_rate": 1.3335479753344688e-06, + "loss": 0.7003, + "step": 9470 + }, + { + "epoch": 0.77, + "grad_norm": 4.168994906173609, + "learning_rate": 1.3326537722209727e-06, + "loss": 0.6077, + "step": 9471 + }, + { + "epoch": 0.77, + "grad_norm": 4.20905239145414, + "learning_rate": 1.3317598229109258e-06, + "loss": 0.5668, + "step": 9472 + }, + { + "epoch": 0.77, + "grad_norm": 6.972859705232672, + "learning_rate": 1.3308661274661988e-06, + "loss": 0.6262, + "step": 9473 + }, + { + "epoch": 0.77, + "grad_norm": 3.2884191701654233, + "learning_rate": 1.3299726859486361e-06, + "loss": 0.669, + "step": 9474 + }, + { + "epoch": 0.77, + "grad_norm": 4.061886599429941, + "learning_rate": 1.3290794984200734e-06, + "loss": 0.6047, + "step": 9475 + }, + { + "epoch": 0.77, + "grad_norm": 3.4633455103451714, + "learning_rate": 1.3281865649423231e-06, + "loss": 0.6355, + "step": 9476 + }, + { + "epoch": 0.77, + "grad_norm": 3.279719834942962, + "learning_rate": 1.3272938855771805e-06, + "loss": 0.7319, + "step": 9477 + }, + { + "epoch": 0.77, + "grad_norm": 12.929307332529223, + "learning_rate": 1.3264014603864278e-06, + "loss": 0.7332, + "step": 9478 + }, + { + "epoch": 0.77, + "grad_norm": 4.894570486158061, + "learning_rate": 1.3255092894318256e-06, + "loss": 0.6998, + "step": 9479 + }, + { + "epoch": 0.77, + "grad_norm": 8.48825942957005, + "learning_rate": 1.3246173727751166e-06, + "loss": 0.5335, + "step": 9480 + }, + { + "epoch": 0.77, + "grad_norm": 2.6854420005954913, + "learning_rate": 1.323725710478026e-06, + "loss": 0.6232, + "step": 9481 + }, + { + "epoch": 0.77, + "grad_norm": 4.189099177111844, + "learning_rate": 1.3228343026022656e-06, + "loss": 0.6521, + "step": 9482 + }, + { + "epoch": 0.77, + "grad_norm": 4.843425941346352, + "learning_rate": 1.321943149209523e-06, + "loss": 0.7574, + "step": 9483 + }, + { + "epoch": 0.77, + "grad_norm": 3.7605790907966896, + "learning_rate": 1.3210522503614753e-06, + "loss": 0.6304, + "step": 9484 + }, + { + "epoch": 0.77, + "grad_norm": 2.656388881656097, + "learning_rate": 1.3201616061197763e-06, + "loss": 0.5772, + "step": 9485 + }, + { + "epoch": 0.77, + "grad_norm": 4.242821918812973, + "learning_rate": 1.3192712165460648e-06, + "loss": 0.6511, + "step": 9486 + }, + { + "epoch": 0.77, + "grad_norm": 3.945219436104987, + "learning_rate": 1.31838108170196e-06, + "loss": 0.7387, + "step": 9487 + }, + { + "epoch": 0.77, + "grad_norm": 3.516458437000437, + "learning_rate": 1.3174912016490649e-06, + "loss": 0.7046, + "step": 9488 + }, + { + "epoch": 0.77, + "grad_norm": 3.5464214990016734, + "learning_rate": 1.316601576448967e-06, + "loss": 0.5098, + "step": 9489 + }, + { + "epoch": 0.77, + "grad_norm": 3.660385989974557, + "learning_rate": 1.315712206163231e-06, + "loss": 0.5736, + "step": 9490 + }, + { + "epoch": 0.77, + "grad_norm": 7.127447316552168, + "learning_rate": 1.3148230908534098e-06, + "loss": 0.7929, + "step": 9491 + }, + { + "epoch": 0.77, + "grad_norm": 26.23366192024085, + "learning_rate": 1.3139342305810349e-06, + "loss": 0.6779, + "step": 9492 + }, + { + "epoch": 0.77, + "grad_norm": 3.1936162666647743, + "learning_rate": 1.3130456254076206e-06, + "loss": 0.7702, + "step": 9493 + }, + { + "epoch": 0.77, + "grad_norm": 6.514636194832057, + "learning_rate": 1.3121572753946638e-06, + "loss": 0.727, + "step": 9494 + }, + { + "epoch": 0.77, + "grad_norm": 3.267814276339569, + "learning_rate": 1.3112691806036425e-06, + "loss": 0.5855, + "step": 9495 + }, + { + "epoch": 0.77, + "grad_norm": 5.683781091152204, + "learning_rate": 1.310381341096022e-06, + "loss": 0.6916, + "step": 9496 + }, + { + "epoch": 0.77, + "grad_norm": 2.9322214508221713, + "learning_rate": 1.3094937569332428e-06, + "loss": 0.6743, + "step": 9497 + }, + { + "epoch": 0.77, + "grad_norm": 3.373301687035292, + "learning_rate": 1.3086064281767346e-06, + "loss": 0.5708, + "step": 9498 + }, + { + "epoch": 0.77, + "grad_norm": 3.037177140777006, + "learning_rate": 1.307719354887904e-06, + "loss": 0.8457, + "step": 9499 + }, + { + "epoch": 0.77, + "grad_norm": 4.500426649586497, + "learning_rate": 1.3068325371281433e-06, + "loss": 0.6916, + "step": 9500 + }, + { + "epoch": 0.77, + "grad_norm": 3.734558616073429, + "learning_rate": 1.3059459749588243e-06, + "loss": 0.6509, + "step": 9501 + }, + { + "epoch": 0.77, + "grad_norm": 4.6652459661091745, + "learning_rate": 1.3050596684413025e-06, + "loss": 0.608, + "step": 9502 + }, + { + "epoch": 0.77, + "grad_norm": 3.682720377105289, + "learning_rate": 1.3041736176369184e-06, + "loss": 0.6993, + "step": 9503 + }, + { + "epoch": 0.77, + "grad_norm": 3.2513865670415867, + "learning_rate": 1.3032878226069895e-06, + "loss": 0.6689, + "step": 9504 + }, + { + "epoch": 0.77, + "grad_norm": 3.036004580320126, + "learning_rate": 1.302402283412821e-06, + "loss": 0.6038, + "step": 9505 + }, + { + "epoch": 0.77, + "grad_norm": 12.426136977120445, + "learning_rate": 1.3015170001156962e-06, + "loss": 0.652, + "step": 9506 + }, + { + "epoch": 0.77, + "grad_norm": 7.952317677931165, + "learning_rate": 1.300631972776883e-06, + "loss": 0.7531, + "step": 9507 + }, + { + "epoch": 0.77, + "grad_norm": 6.42523116334164, + "learning_rate": 1.299747201457629e-06, + "loss": 0.8523, + "step": 9508 + }, + { + "epoch": 0.77, + "grad_norm": 6.272477864862201, + "learning_rate": 1.2988626862191684e-06, + "loss": 0.6346, + "step": 9509 + }, + { + "epoch": 0.77, + "grad_norm": 4.79702736986493, + "learning_rate": 1.2979784271227146e-06, + "loss": 0.6283, + "step": 9510 + }, + { + "epoch": 0.77, + "grad_norm": 2.979345806540136, + "learning_rate": 1.2970944242294614e-06, + "loss": 0.482, + "step": 9511 + }, + { + "epoch": 0.77, + "grad_norm": 5.288386849289338, + "learning_rate": 1.2962106776005917e-06, + "loss": 0.55, + "step": 9512 + }, + { + "epoch": 0.77, + "grad_norm": 3.199472690350407, + "learning_rate": 1.2953271872972638e-06, + "loss": 0.6315, + "step": 9513 + }, + { + "epoch": 0.77, + "grad_norm": 2.9096214029363225, + "learning_rate": 1.2944439533806207e-06, + "loss": 0.774, + "step": 9514 + }, + { + "epoch": 0.77, + "grad_norm": 3.008805231043488, + "learning_rate": 1.2935609759117873e-06, + "loss": 0.7406, + "step": 9515 + }, + { + "epoch": 0.77, + "grad_norm": 4.150798833911846, + "learning_rate": 1.2926782549518734e-06, + "loss": 0.6784, + "step": 9516 + }, + { + "epoch": 0.77, + "grad_norm": 9.215084121911525, + "learning_rate": 1.2917957905619672e-06, + "loss": 0.6568, + "step": 9517 + }, + { + "epoch": 0.77, + "grad_norm": 5.649516436408424, + "learning_rate": 1.2909135828031398e-06, + "loss": 0.7832, + "step": 9518 + }, + { + "epoch": 0.77, + "grad_norm": 3.491442266883123, + "learning_rate": 1.2900316317364498e-06, + "loss": 0.7138, + "step": 9519 + }, + { + "epoch": 0.77, + "grad_norm": 10.552972195007301, + "learning_rate": 1.2891499374229276e-06, + "loss": 0.4464, + "step": 9520 + }, + { + "epoch": 0.77, + "grad_norm": 6.478727050024747, + "learning_rate": 1.2882684999235967e-06, + "loss": 0.7565, + "step": 9521 + }, + { + "epoch": 0.77, + "grad_norm": 4.02253963072119, + "learning_rate": 1.2873873192994552e-06, + "loss": 0.5654, + "step": 9522 + }, + { + "epoch": 0.77, + "grad_norm": 7.426251196401269, + "learning_rate": 1.2865063956114893e-06, + "loss": 0.5125, + "step": 9523 + }, + { + "epoch": 0.77, + "grad_norm": 3.106882263675633, + "learning_rate": 1.2856257289206625e-06, + "loss": 0.5261, + "step": 9524 + }, + { + "epoch": 0.77, + "grad_norm": 2.5022550530711145, + "learning_rate": 1.2847453192879217e-06, + "loss": 0.8325, + "step": 9525 + }, + { + "epoch": 0.77, + "grad_norm": 3.9709878161306817, + "learning_rate": 1.2838651667742014e-06, + "loss": 0.6347, + "step": 9526 + }, + { + "epoch": 0.77, + "grad_norm": 3.1397708119233965, + "learning_rate": 1.2829852714404068e-06, + "loss": 0.6313, + "step": 9527 + }, + { + "epoch": 0.77, + "grad_norm": 4.45783865869915, + "learning_rate": 1.2821056333474368e-06, + "loss": 0.6782, + "step": 9528 + }, + { + "epoch": 0.77, + "grad_norm": 2.2183880303718646, + "learning_rate": 1.281226252556166e-06, + "loss": 0.7428, + "step": 9529 + }, + { + "epoch": 0.77, + "grad_norm": 4.042599479424637, + "learning_rate": 1.280347129127455e-06, + "loss": 0.6635, + "step": 9530 + }, + { + "epoch": 0.77, + "grad_norm": 2.5031459900373916, + "learning_rate": 1.2794682631221423e-06, + "loss": 0.4689, + "step": 9531 + }, + { + "epoch": 0.77, + "grad_norm": 16.64032822868164, + "learning_rate": 1.278589654601055e-06, + "loss": 0.6319, + "step": 9532 + }, + { + "epoch": 0.77, + "grad_norm": 5.056688915720502, + "learning_rate": 1.2777113036249927e-06, + "loss": 0.7046, + "step": 9533 + }, + { + "epoch": 0.77, + "grad_norm": 6.390324709845071, + "learning_rate": 1.2768332102547464e-06, + "loss": 0.628, + "step": 9534 + }, + { + "epoch": 0.77, + "grad_norm": 3.1254744860918717, + "learning_rate": 1.275955374551086e-06, + "loss": 0.5482, + "step": 9535 + }, + { + "epoch": 0.77, + "grad_norm": 4.781001368342637, + "learning_rate": 1.2750777965747601e-06, + "loss": 0.7425, + "step": 9536 + }, + { + "epoch": 0.77, + "grad_norm": 2.931844168629806, + "learning_rate": 1.2742004763865063e-06, + "loss": 0.6571, + "step": 9537 + }, + { + "epoch": 0.77, + "grad_norm": 2.38800903016534, + "learning_rate": 1.273323414047038e-06, + "loss": 0.6273, + "step": 9538 + }, + { + "epoch": 0.77, + "grad_norm": 6.726295790302656, + "learning_rate": 1.2724466096170568e-06, + "loss": 0.8395, + "step": 9539 + }, + { + "epoch": 0.77, + "grad_norm": 3.666764789221523, + "learning_rate": 1.2715700631572387e-06, + "loss": 0.6851, + "step": 9540 + }, + { + "epoch": 0.77, + "grad_norm": 3.693267124076448, + "learning_rate": 1.2706937747282493e-06, + "loss": 0.7828, + "step": 9541 + }, + { + "epoch": 0.78, + "grad_norm": 5.632303995361531, + "learning_rate": 1.2698177443907322e-06, + "loss": 0.7265, + "step": 9542 + }, + { + "epoch": 0.78, + "grad_norm": 3.0788167771280786, + "learning_rate": 1.2689419722053132e-06, + "loss": 0.5517, + "step": 9543 + }, + { + "epoch": 0.78, + "grad_norm": 3.3799337836105963, + "learning_rate": 1.2680664582326042e-06, + "loss": 0.4752, + "step": 9544 + }, + { + "epoch": 0.78, + "grad_norm": 5.66525494421117, + "learning_rate": 1.2671912025331922e-06, + "loss": 0.7874, + "step": 9545 + }, + { + "epoch": 0.78, + "grad_norm": 5.992751150613818, + "learning_rate": 1.2663162051676565e-06, + "loss": 0.5712, + "step": 9546 + }, + { + "epoch": 0.78, + "grad_norm": 3.7405047888835554, + "learning_rate": 1.2654414661965447e-06, + "loss": 0.6324, + "step": 9547 + }, + { + "epoch": 0.78, + "grad_norm": 14.240767975016839, + "learning_rate": 1.2645669856804005e-06, + "loss": 0.5388, + "step": 9548 + }, + { + "epoch": 0.78, + "grad_norm": 2.709086698385279, + "learning_rate": 1.2636927636797407e-06, + "loss": 0.6597, + "step": 9549 + }, + { + "epoch": 0.78, + "grad_norm": 4.011325047864485, + "learning_rate": 1.2628188002550662e-06, + "loss": 0.6425, + "step": 9550 + }, + { + "epoch": 0.78, + "grad_norm": 3.486298032839116, + "learning_rate": 1.2619450954668633e-06, + "loss": 0.7064, + "step": 9551 + }, + { + "epoch": 0.78, + "grad_norm": 2.82453304462159, + "learning_rate": 1.2610716493755965e-06, + "loss": 0.6387, + "step": 9552 + }, + { + "epoch": 0.78, + "grad_norm": 4.287445732403542, + "learning_rate": 1.2601984620417136e-06, + "loss": 0.7065, + "step": 9553 + }, + { + "epoch": 0.78, + "grad_norm": 3.039181482293806, + "learning_rate": 1.2593255335256438e-06, + "loss": 0.6161, + "step": 9554 + }, + { + "epoch": 0.78, + "grad_norm": 2.887711658599226, + "learning_rate": 1.2584528638878014e-06, + "loss": 0.7509, + "step": 9555 + }, + { + "epoch": 0.78, + "grad_norm": 7.124386944298144, + "learning_rate": 1.2575804531885783e-06, + "loss": 0.6278, + "step": 9556 + }, + { + "epoch": 0.78, + "grad_norm": 4.966553770182493, + "learning_rate": 1.2567083014883536e-06, + "loss": 0.8015, + "step": 9557 + }, + { + "epoch": 0.78, + "grad_norm": 3.359991493600607, + "learning_rate": 1.2558364088474838e-06, + "loss": 0.5198, + "step": 9558 + }, + { + "epoch": 0.78, + "grad_norm": 2.9359154034670802, + "learning_rate": 1.25496477532631e-06, + "loss": 0.7131, + "step": 9559 + }, + { + "epoch": 0.78, + "grad_norm": 2.340351344368538, + "learning_rate": 1.2540934009851541e-06, + "loss": 0.5602, + "step": 9560 + }, + { + "epoch": 0.78, + "grad_norm": 3.0011070473618746, + "learning_rate": 1.2532222858843202e-06, + "loss": 0.6853, + "step": 9561 + }, + { + "epoch": 0.78, + "grad_norm": 3.9353496464230457, + "learning_rate": 1.2523514300840967e-06, + "loss": 0.617, + "step": 9562 + }, + { + "epoch": 0.78, + "grad_norm": 4.3507879199612045, + "learning_rate": 1.2514808336447499e-06, + "loss": 0.7897, + "step": 9563 + }, + { + "epoch": 0.78, + "grad_norm": 2.664658563841125, + "learning_rate": 1.2506104966265336e-06, + "loss": 0.6507, + "step": 9564 + }, + { + "epoch": 0.78, + "grad_norm": 3.610129299176213, + "learning_rate": 1.2497404190896795e-06, + "loss": 0.8481, + "step": 9565 + }, + { + "epoch": 0.78, + "grad_norm": 3.231669621724826, + "learning_rate": 1.2488706010944012e-06, + "loss": 0.7117, + "step": 9566 + }, + { + "epoch": 0.78, + "grad_norm": 5.410990380846471, + "learning_rate": 1.248001042700897e-06, + "loss": 0.7147, + "step": 9567 + }, + { + "epoch": 0.78, + "grad_norm": 3.329839871992406, + "learning_rate": 1.2471317439693436e-06, + "loss": 0.5731, + "step": 9568 + }, + { + "epoch": 0.78, + "grad_norm": 2.8676361042348844, + "learning_rate": 1.2462627049599052e-06, + "loss": 0.6977, + "step": 9569 + }, + { + "epoch": 0.78, + "grad_norm": 4.043671699356178, + "learning_rate": 1.2453939257327213e-06, + "loss": 0.4871, + "step": 9570 + }, + { + "epoch": 0.78, + "grad_norm": 5.322868473881842, + "learning_rate": 1.24452540634792e-06, + "loss": 0.6787, + "step": 9571 + }, + { + "epoch": 0.78, + "grad_norm": 3.570523032201416, + "learning_rate": 1.2436571468656071e-06, + "loss": 0.7608, + "step": 9572 + }, + { + "epoch": 0.78, + "grad_norm": 3.039044964921788, + "learning_rate": 1.242789147345872e-06, + "loss": 0.619, + "step": 9573 + }, + { + "epoch": 0.78, + "grad_norm": 4.224375005791324, + "learning_rate": 1.2419214078487846e-06, + "loss": 0.7531, + "step": 9574 + }, + { + "epoch": 0.78, + "grad_norm": 9.119077338920082, + "learning_rate": 1.2410539284343975e-06, + "loss": 0.6441, + "step": 9575 + }, + { + "epoch": 0.78, + "grad_norm": 4.910610252005452, + "learning_rate": 1.2401867091627485e-06, + "loss": 0.6464, + "step": 9576 + }, + { + "epoch": 0.78, + "grad_norm": 7.088794565466001, + "learning_rate": 1.2393197500938508e-06, + "loss": 0.6115, + "step": 9577 + }, + { + "epoch": 0.78, + "grad_norm": 4.022615217698594, + "learning_rate": 1.2384530512877074e-06, + "loss": 0.6389, + "step": 9578 + }, + { + "epoch": 0.78, + "grad_norm": 4.1652060062719904, + "learning_rate": 1.237586612804298e-06, + "loss": 0.6482, + "step": 9579 + }, + { + "epoch": 0.78, + "grad_norm": 3.3097162079473255, + "learning_rate": 1.2367204347035845e-06, + "loss": 0.862, + "step": 9580 + }, + { + "epoch": 0.78, + "grad_norm": 23.44444919315233, + "learning_rate": 1.235854517045511e-06, + "loss": 0.6533, + "step": 9581 + }, + { + "epoch": 0.78, + "grad_norm": 4.56494358636794, + "learning_rate": 1.2349888598900078e-06, + "loss": 0.7958, + "step": 9582 + }, + { + "epoch": 0.78, + "grad_norm": 3.467798125316924, + "learning_rate": 1.2341234632969817e-06, + "loss": 0.6072, + "step": 9583 + }, + { + "epoch": 0.78, + "grad_norm": 2.6837300904263093, + "learning_rate": 1.2332583273263227e-06, + "loss": 0.6819, + "step": 9584 + }, + { + "epoch": 0.78, + "grad_norm": 6.28537589498293, + "learning_rate": 1.232393452037907e-06, + "loss": 0.5772, + "step": 9585 + }, + { + "epoch": 0.78, + "grad_norm": 3.833562915235655, + "learning_rate": 1.2315288374915852e-06, + "loss": 0.713, + "step": 9586 + }, + { + "epoch": 0.78, + "grad_norm": 3.6677176934363485, + "learning_rate": 1.2306644837471971e-06, + "loss": 0.6154, + "step": 9587 + }, + { + "epoch": 0.78, + "grad_norm": 3.758829091242493, + "learning_rate": 1.229800390864559e-06, + "loss": 0.6044, + "step": 9588 + }, + { + "epoch": 0.78, + "grad_norm": 3.186526174599644, + "learning_rate": 1.2289365589034746e-06, + "loss": 0.7094, + "step": 9589 + }, + { + "epoch": 0.78, + "grad_norm": 8.028294912543311, + "learning_rate": 1.2280729879237247e-06, + "loss": 0.6784, + "step": 9590 + }, + { + "epoch": 0.78, + "grad_norm": 8.164653773605256, + "learning_rate": 1.2272096779850728e-06, + "loss": 0.7694, + "step": 9591 + }, + { + "epoch": 0.78, + "grad_norm": 5.063424299452289, + "learning_rate": 1.2263466291472692e-06, + "loss": 0.784, + "step": 9592 + }, + { + "epoch": 0.78, + "grad_norm": 5.924236981931554, + "learning_rate": 1.2254838414700371e-06, + "loss": 0.7228, + "step": 9593 + }, + { + "epoch": 0.78, + "grad_norm": 2.894697716079355, + "learning_rate": 1.224621315013091e-06, + "loss": 0.6904, + "step": 9594 + }, + { + "epoch": 0.78, + "grad_norm": 3.173362067409352, + "learning_rate": 1.2237590498361202e-06, + "loss": 0.6775, + "step": 9595 + }, + { + "epoch": 0.78, + "grad_norm": 2.6684281719755236, + "learning_rate": 1.2228970459988015e-06, + "loss": 0.5385, + "step": 9596 + }, + { + "epoch": 0.78, + "grad_norm": 3.3744139799994706, + "learning_rate": 1.2220353035607902e-06, + "loss": 0.6454, + "step": 9597 + }, + { + "epoch": 0.78, + "grad_norm": 2.805794200279848, + "learning_rate": 1.221173822581722e-06, + "loss": 0.7744, + "step": 9598 + }, + { + "epoch": 0.78, + "grad_norm": 6.720023113604179, + "learning_rate": 1.220312603121222e-06, + "loss": 0.5788, + "step": 9599 + }, + { + "epoch": 0.78, + "grad_norm": 5.026312604866843, + "learning_rate": 1.2194516452388861e-06, + "loss": 0.4923, + "step": 9600 + }, + { + "epoch": 0.78, + "grad_norm": 2.659868431870634, + "learning_rate": 1.2185909489943015e-06, + "loss": 0.6683, + "step": 9601 + }, + { + "epoch": 0.78, + "grad_norm": 7.019878786826271, + "learning_rate": 1.217730514447032e-06, + "loss": 0.7073, + "step": 9602 + }, + { + "epoch": 0.78, + "grad_norm": 3.6557179406078246, + "learning_rate": 1.2168703416566274e-06, + "loss": 0.7031, + "step": 9603 + }, + { + "epoch": 0.78, + "grad_norm": 4.547170879504895, + "learning_rate": 1.2160104306826154e-06, + "loss": 0.7275, + "step": 9604 + }, + { + "epoch": 0.78, + "grad_norm": 5.177141151994431, + "learning_rate": 1.2151507815845077e-06, + "loss": 0.6608, + "step": 9605 + }, + { + "epoch": 0.78, + "grad_norm": 3.9473281936320475, + "learning_rate": 1.214291394421796e-06, + "loss": 0.6326, + "step": 9606 + }, + { + "epoch": 0.78, + "grad_norm": 3.7195551900189874, + "learning_rate": 1.213432269253958e-06, + "loss": 0.685, + "step": 9607 + }, + { + "epoch": 0.78, + "grad_norm": 2.399625797093667, + "learning_rate": 1.2125734061404488e-06, + "loss": 0.6829, + "step": 9608 + }, + { + "epoch": 0.78, + "grad_norm": 5.269585512658102, + "learning_rate": 1.2117148051407064e-06, + "loss": 0.5668, + "step": 9609 + }, + { + "epoch": 0.78, + "grad_norm": 6.455420343327664, + "learning_rate": 1.2108564663141541e-06, + "loss": 0.6362, + "step": 9610 + }, + { + "epoch": 0.78, + "grad_norm": 3.0188580984211977, + "learning_rate": 1.209998389720191e-06, + "loss": 0.7286, + "step": 9611 + }, + { + "epoch": 0.78, + "grad_norm": 4.727330074761765, + "learning_rate": 1.2091405754182061e-06, + "loss": 0.7701, + "step": 9612 + }, + { + "epoch": 0.78, + "grad_norm": 3.9516304969028067, + "learning_rate": 1.2082830234675597e-06, + "loss": 0.6331, + "step": 9613 + }, + { + "epoch": 0.78, + "grad_norm": 4.726769937936566, + "learning_rate": 1.2074257339276041e-06, + "loss": 0.6636, + "step": 9614 + }, + { + "epoch": 0.78, + "grad_norm": 5.2240850101689835, + "learning_rate": 1.206568706857668e-06, + "loss": 0.5978, + "step": 9615 + }, + { + "epoch": 0.78, + "grad_norm": 4.068590841150558, + "learning_rate": 1.205711942317061e-06, + "loss": 0.742, + "step": 9616 + }, + { + "epoch": 0.78, + "grad_norm": 4.86774571973402, + "learning_rate": 1.2048554403650803e-06, + "loss": 0.7562, + "step": 9617 + }, + { + "epoch": 0.78, + "grad_norm": 4.2669033346874805, + "learning_rate": 1.2039992010609974e-06, + "loss": 0.7791, + "step": 9618 + }, + { + "epoch": 0.78, + "grad_norm": 2.9145475573449207, + "learning_rate": 1.203143224464075e-06, + "loss": 0.6959, + "step": 9619 + }, + { + "epoch": 0.78, + "grad_norm": 3.497617366248506, + "learning_rate": 1.2022875106335446e-06, + "loss": 0.5946, + "step": 9620 + }, + { + "epoch": 0.78, + "grad_norm": 4.079856301069102, + "learning_rate": 1.2014320596286327e-06, + "loss": 0.6903, + "step": 9621 + }, + { + "epoch": 0.78, + "grad_norm": 4.574458224097774, + "learning_rate": 1.2005768715085402e-06, + "loss": 0.6076, + "step": 9622 + }, + { + "epoch": 0.78, + "grad_norm": 5.618204258576541, + "learning_rate": 1.19972194633245e-06, + "loss": 0.6766, + "step": 9623 + }, + { + "epoch": 0.78, + "grad_norm": 5.069222147867285, + "learning_rate": 1.1988672841595312e-06, + "loss": 0.7764, + "step": 9624 + }, + { + "epoch": 0.78, + "grad_norm": 3.0841799032921338, + "learning_rate": 1.1980128850489298e-06, + "loss": 0.637, + "step": 9625 + }, + { + "epoch": 0.78, + "grad_norm": 3.227425537046121, + "learning_rate": 1.1971587490597759e-06, + "loss": 0.556, + "step": 9626 + }, + { + "epoch": 0.78, + "grad_norm": 4.298069711101177, + "learning_rate": 1.1963048762511802e-06, + "loss": 0.5983, + "step": 9627 + }, + { + "epoch": 0.78, + "grad_norm": 4.219900479613822, + "learning_rate": 1.1954512666822383e-06, + "loss": 0.8055, + "step": 9628 + }, + { + "epoch": 0.78, + "grad_norm": 5.152423102015065, + "learning_rate": 1.1945979204120244e-06, + "loss": 0.6486, + "step": 9629 + }, + { + "epoch": 0.78, + "grad_norm": 4.187802495101171, + "learning_rate": 1.1937448374995936e-06, + "loss": 0.5941, + "step": 9630 + }, + { + "epoch": 0.78, + "grad_norm": 4.566822192955822, + "learning_rate": 1.1928920180039877e-06, + "loss": 0.687, + "step": 9631 + }, + { + "epoch": 0.78, + "grad_norm": 2.980511266830398, + "learning_rate": 1.1920394619842257e-06, + "loss": 0.7644, + "step": 9632 + }, + { + "epoch": 0.78, + "grad_norm": 7.3526787683167365, + "learning_rate": 1.1911871694993093e-06, + "loss": 0.6765, + "step": 9633 + }, + { + "epoch": 0.78, + "grad_norm": 3.203667054744695, + "learning_rate": 1.1903351406082224e-06, + "loss": 0.6395, + "step": 9634 + }, + { + "epoch": 0.78, + "grad_norm": 4.0058431664561605, + "learning_rate": 1.1894833753699325e-06, + "loss": 0.682, + "step": 9635 + }, + { + "epoch": 0.78, + "grad_norm": 5.207957447236932, + "learning_rate": 1.1886318738433844e-06, + "loss": 0.679, + "step": 9636 + }, + { + "epoch": 0.78, + "grad_norm": 6.874841198116699, + "learning_rate": 1.1877806360875111e-06, + "loss": 0.6493, + "step": 9637 + }, + { + "epoch": 0.78, + "grad_norm": 5.071974534828488, + "learning_rate": 1.186929662161221e-06, + "loss": 0.4878, + "step": 9638 + }, + { + "epoch": 0.78, + "grad_norm": 7.971429247966249, + "learning_rate": 1.1860789521234072e-06, + "loss": 0.7024, + "step": 9639 + }, + { + "epoch": 0.78, + "grad_norm": 3.6947777562316024, + "learning_rate": 1.1852285060329445e-06, + "loss": 0.8092, + "step": 9640 + }, + { + "epoch": 0.78, + "grad_norm": 3.8582083359088646, + "learning_rate": 1.1843783239486878e-06, + "loss": 0.6184, + "step": 9641 + }, + { + "epoch": 0.78, + "grad_norm": 4.293907610858088, + "learning_rate": 1.1835284059294772e-06, + "loss": 0.6328, + "step": 9642 + }, + { + "epoch": 0.78, + "grad_norm": 3.1754457334937585, + "learning_rate": 1.1826787520341305e-06, + "loss": 0.6122, + "step": 9643 + }, + { + "epoch": 0.78, + "grad_norm": 12.246519715294442, + "learning_rate": 1.181829362321451e-06, + "loss": 0.7338, + "step": 9644 + }, + { + "epoch": 0.78, + "grad_norm": 2.499651219999791, + "learning_rate": 1.180980236850221e-06, + "loss": 0.6928, + "step": 9645 + }, + { + "epoch": 0.78, + "grad_norm": 3.594425801623606, + "learning_rate": 1.180131375679205e-06, + "loss": 0.6358, + "step": 9646 + }, + { + "epoch": 0.78, + "grad_norm": 4.392444984731222, + "learning_rate": 1.1792827788671496e-06, + "loss": 0.6205, + "step": 9647 + }, + { + "epoch": 0.78, + "grad_norm": 13.604272230403495, + "learning_rate": 1.178434446472782e-06, + "loss": 0.8644, + "step": 9648 + }, + { + "epoch": 0.78, + "grad_norm": 2.6024776033915433, + "learning_rate": 1.1775863785548147e-06, + "loss": 0.588, + "step": 9649 + }, + { + "epoch": 0.78, + "grad_norm": 14.537394509933167, + "learning_rate": 1.1767385751719362e-06, + "loss": 0.6458, + "step": 9650 + }, + { + "epoch": 0.78, + "grad_norm": 2.5381805411280784, + "learning_rate": 1.175891036382823e-06, + "loss": 0.6513, + "step": 9651 + }, + { + "epoch": 0.78, + "grad_norm": 3.0303118121987103, + "learning_rate": 1.1750437622461293e-06, + "loss": 0.5681, + "step": 9652 + }, + { + "epoch": 0.78, + "grad_norm": 3.9068296982847395, + "learning_rate": 1.17419675282049e-06, + "loss": 0.6265, + "step": 9653 + }, + { + "epoch": 0.78, + "grad_norm": 5.75146600736371, + "learning_rate": 1.1733500081645243e-06, + "loss": 0.7751, + "step": 9654 + }, + { + "epoch": 0.78, + "grad_norm": 4.398045793440526, + "learning_rate": 1.1725035283368335e-06, + "loss": 0.7736, + "step": 9655 + }, + { + "epoch": 0.78, + "grad_norm": 5.303020747673285, + "learning_rate": 1.1716573133959985e-06, + "loss": 0.6145, + "step": 9656 + }, + { + "epoch": 0.78, + "grad_norm": 9.648148665950508, + "learning_rate": 1.1708113634005813e-06, + "loss": 0.7501, + "step": 9657 + }, + { + "epoch": 0.78, + "grad_norm": 2.9303155619223737, + "learning_rate": 1.1699656784091311e-06, + "loss": 0.8575, + "step": 9658 + }, + { + "epoch": 0.78, + "grad_norm": 3.832602747573734, + "learning_rate": 1.1691202584801692e-06, + "loss": 0.7392, + "step": 9659 + }, + { + "epoch": 0.78, + "grad_norm": 2.804885902566445, + "learning_rate": 1.1682751036722078e-06, + "loss": 0.7396, + "step": 9660 + }, + { + "epoch": 0.78, + "grad_norm": 3.1948167754758066, + "learning_rate": 1.1674302140437344e-06, + "loss": 0.7027, + "step": 9661 + }, + { + "epoch": 0.78, + "grad_norm": 4.90538211396826, + "learning_rate": 1.1665855896532235e-06, + "loss": 0.4959, + "step": 9662 + }, + { + "epoch": 0.78, + "grad_norm": 4.5585920990480835, + "learning_rate": 1.165741230559127e-06, + "loss": 0.739, + "step": 9663 + }, + { + "epoch": 0.78, + "grad_norm": 5.568827427781763, + "learning_rate": 1.1648971368198786e-06, + "loss": 0.7025, + "step": 9664 + }, + { + "epoch": 0.78, + "grad_norm": 2.5972258888476776, + "learning_rate": 1.1640533084938988e-06, + "loss": 0.6907, + "step": 9665 + }, + { + "epoch": 0.79, + "grad_norm": 10.820020156140478, + "learning_rate": 1.1632097456395802e-06, + "loss": 0.6587, + "step": 9666 + }, + { + "epoch": 0.79, + "grad_norm": 5.653303323184138, + "learning_rate": 1.1623664483153069e-06, + "loss": 0.6687, + "step": 9667 + }, + { + "epoch": 0.79, + "grad_norm": 6.160000970875404, + "learning_rate": 1.1615234165794381e-06, + "loss": 0.6767, + "step": 9668 + }, + { + "epoch": 0.79, + "grad_norm": 2.892261587940089, + "learning_rate": 1.160680650490319e-06, + "loss": 0.6827, + "step": 9669 + }, + { + "epoch": 0.79, + "grad_norm": 4.898715714603023, + "learning_rate": 1.1598381501062738e-06, + "loss": 0.6711, + "step": 9670 + }, + { + "epoch": 0.79, + "grad_norm": 3.0244597515304785, + "learning_rate": 1.1589959154856063e-06, + "loss": 0.5147, + "step": 9671 + }, + { + "epoch": 0.79, + "grad_norm": 5.1039331120685265, + "learning_rate": 1.1581539466866094e-06, + "loss": 0.6583, + "step": 9672 + }, + { + "epoch": 0.79, + "grad_norm": 4.556082072404434, + "learning_rate": 1.1573122437675465e-06, + "loss": 0.5773, + "step": 9673 + }, + { + "epoch": 0.79, + "grad_norm": 8.277379927436868, + "learning_rate": 1.1564708067866743e-06, + "loss": 0.7724, + "step": 9674 + }, + { + "epoch": 0.79, + "grad_norm": 4.304968043971542, + "learning_rate": 1.1556296358022207e-06, + "loss": 0.5938, + "step": 9675 + }, + { + "epoch": 0.79, + "grad_norm": 4.159142398812623, + "learning_rate": 1.1547887308724043e-06, + "loss": 0.6771, + "step": 9676 + }, + { + "epoch": 0.79, + "grad_norm": 4.22566451472834, + "learning_rate": 1.153948092055419e-06, + "loss": 0.5606, + "step": 9677 + }, + { + "epoch": 0.79, + "grad_norm": 3.170054018350635, + "learning_rate": 1.1531077194094426e-06, + "loss": 0.5805, + "step": 9678 + }, + { + "epoch": 0.79, + "grad_norm": 5.05262277175399, + "learning_rate": 1.1522676129926324e-06, + "loss": 0.6084, + "step": 9679 + }, + { + "epoch": 0.79, + "grad_norm": 2.8360146728026927, + "learning_rate": 1.1514277728631323e-06, + "loss": 0.7273, + "step": 9680 + }, + { + "epoch": 0.79, + "grad_norm": 3.4645596947399273, + "learning_rate": 1.1505881990790634e-06, + "loss": 0.6756, + "step": 9681 + }, + { + "epoch": 0.79, + "grad_norm": 2.6532793512664568, + "learning_rate": 1.1497488916985273e-06, + "loss": 0.6044, + "step": 9682 + }, + { + "epoch": 0.79, + "grad_norm": 4.609439698468369, + "learning_rate": 1.148909850779612e-06, + "loss": 0.6893, + "step": 9683 + }, + { + "epoch": 0.79, + "grad_norm": 2.945774241740003, + "learning_rate": 1.1480710763803826e-06, + "loss": 0.5901, + "step": 9684 + }, + { + "epoch": 0.79, + "grad_norm": 3.644527362727375, + "learning_rate": 1.147232568558891e-06, + "loss": 0.5732, + "step": 9685 + }, + { + "epoch": 0.79, + "grad_norm": 4.192537875676673, + "learning_rate": 1.146394327373162e-06, + "loss": 0.8358, + "step": 9686 + }, + { + "epoch": 0.79, + "grad_norm": 7.062437611859238, + "learning_rate": 1.1455563528812113e-06, + "loss": 0.8054, + "step": 9687 + }, + { + "epoch": 0.79, + "grad_norm": 3.5419914276762747, + "learning_rate": 1.1447186451410308e-06, + "loss": 0.693, + "step": 9688 + }, + { + "epoch": 0.79, + "grad_norm": 6.129239809713161, + "learning_rate": 1.143881204210593e-06, + "loss": 0.7837, + "step": 9689 + }, + { + "epoch": 0.79, + "grad_norm": 3.7934839233525963, + "learning_rate": 1.143044030147858e-06, + "loss": 0.7311, + "step": 9690 + }, + { + "epoch": 0.79, + "grad_norm": 3.943349551292075, + "learning_rate": 1.1422071230107607e-06, + "loss": 0.6903, + "step": 9691 + }, + { + "epoch": 0.79, + "grad_norm": 5.535187730837632, + "learning_rate": 1.141370482857222e-06, + "loss": 0.6173, + "step": 9692 + }, + { + "epoch": 0.79, + "grad_norm": 5.133055386900323, + "learning_rate": 1.14053410974514e-06, + "loss": 0.6514, + "step": 9693 + }, + { + "epoch": 0.79, + "grad_norm": 5.152983562475722, + "learning_rate": 1.1396980037324e-06, + "loss": 0.6253, + "step": 9694 + }, + { + "epoch": 0.79, + "grad_norm": 3.943519991692357, + "learning_rate": 1.138862164876865e-06, + "loss": 0.5579, + "step": 9695 + }, + { + "epoch": 0.79, + "grad_norm": 4.488075935419111, + "learning_rate": 1.1380265932363783e-06, + "loss": 0.7353, + "step": 9696 + }, + { + "epoch": 0.79, + "grad_norm": 6.378047870688172, + "learning_rate": 1.1371912888687698e-06, + "loss": 0.8865, + "step": 9697 + }, + { + "epoch": 0.79, + "grad_norm": 3.633542562262393, + "learning_rate": 1.1363562518318465e-06, + "loss": 0.5558, + "step": 9698 + }, + { + "epoch": 0.79, + "grad_norm": 5.09808064828012, + "learning_rate": 1.1355214821833983e-06, + "loss": 0.627, + "step": 9699 + }, + { + "epoch": 0.79, + "grad_norm": 11.095714047175939, + "learning_rate": 1.1346869799811943e-06, + "loss": 0.6412, + "step": 9700 + }, + { + "epoch": 0.79, + "grad_norm": 3.4342924423023096, + "learning_rate": 1.1338527452829912e-06, + "loss": 0.6934, + "step": 9701 + }, + { + "epoch": 0.79, + "grad_norm": 3.9846550784591903, + "learning_rate": 1.1330187781465207e-06, + "loss": 0.5665, + "step": 9702 + }, + { + "epoch": 0.79, + "grad_norm": 3.0256006505485447, + "learning_rate": 1.1321850786294986e-06, + "loss": 0.6522, + "step": 9703 + }, + { + "epoch": 0.79, + "grad_norm": 2.7547328963372366, + "learning_rate": 1.131351646789624e-06, + "loss": 0.6399, + "step": 9704 + }, + { + "epoch": 0.79, + "grad_norm": 2.3947916901946, + "learning_rate": 1.1305184826845745e-06, + "loss": 0.7279, + "step": 9705 + }, + { + "epoch": 0.79, + "grad_norm": 4.678641501940377, + "learning_rate": 1.1296855863720103e-06, + "loss": 0.6577, + "step": 9706 + }, + { + "epoch": 0.79, + "grad_norm": 3.401625757577636, + "learning_rate": 1.1288529579095713e-06, + "loss": 0.6255, + "step": 9707 + }, + { + "epoch": 0.79, + "grad_norm": 2.786864622235492, + "learning_rate": 1.128020597354884e-06, + "loss": 0.5278, + "step": 9708 + }, + { + "epoch": 0.79, + "grad_norm": 4.276418898701669, + "learning_rate": 1.12718850476555e-06, + "loss": 0.5811, + "step": 9709 + }, + { + "epoch": 0.79, + "grad_norm": 4.642329986414064, + "learning_rate": 1.1263566801991583e-06, + "loss": 0.7763, + "step": 9710 + }, + { + "epoch": 0.79, + "grad_norm": 2.762623222175062, + "learning_rate": 1.1255251237132746e-06, + "loss": 0.5785, + "step": 9711 + }, + { + "epoch": 0.79, + "grad_norm": 7.231099144134598, + "learning_rate": 1.124693835365448e-06, + "loss": 0.7966, + "step": 9712 + }, + { + "epoch": 0.79, + "grad_norm": 2.8210258430420136, + "learning_rate": 1.1238628152132093e-06, + "loss": 0.6635, + "step": 9713 + }, + { + "epoch": 0.79, + "grad_norm": 4.7451956962387865, + "learning_rate": 1.1230320633140678e-06, + "loss": 0.6041, + "step": 9714 + }, + { + "epoch": 0.79, + "grad_norm": 2.6179270218608206, + "learning_rate": 1.122201579725521e-06, + "loss": 0.6041, + "step": 9715 + }, + { + "epoch": 0.79, + "grad_norm": 3.9323621128508375, + "learning_rate": 1.12137136450504e-06, + "loss": 0.5517, + "step": 9716 + }, + { + "epoch": 0.79, + "grad_norm": 5.000328275867567, + "learning_rate": 1.1205414177100837e-06, + "loss": 0.5663, + "step": 9717 + }, + { + "epoch": 0.79, + "grad_norm": 3.0234454540137583, + "learning_rate": 1.1197117393980883e-06, + "loss": 0.6707, + "step": 9718 + }, + { + "epoch": 0.79, + "grad_norm": 10.592957426203395, + "learning_rate": 1.1188823296264734e-06, + "loss": 0.6716, + "step": 9719 + }, + { + "epoch": 0.79, + "grad_norm": 15.539374023295242, + "learning_rate": 1.118053188452638e-06, + "loss": 0.5131, + "step": 9720 + }, + { + "epoch": 0.79, + "grad_norm": 3.34681886379964, + "learning_rate": 1.117224315933964e-06, + "loss": 0.6771, + "step": 9721 + }, + { + "epoch": 0.79, + "grad_norm": 5.259963234921147, + "learning_rate": 1.1163957121278163e-06, + "loss": 0.7317, + "step": 9722 + }, + { + "epoch": 0.79, + "grad_norm": 6.394483156480688, + "learning_rate": 1.1155673770915377e-06, + "loss": 0.6714, + "step": 9723 + }, + { + "epoch": 0.79, + "grad_norm": 3.06166633541685, + "learning_rate": 1.1147393108824556e-06, + "loss": 0.6505, + "step": 9724 + }, + { + "epoch": 0.79, + "grad_norm": 6.620930835666329, + "learning_rate": 1.113911513557877e-06, + "loss": 0.5993, + "step": 9725 + }, + { + "epoch": 0.79, + "grad_norm": 4.356851727848269, + "learning_rate": 1.1130839851750908e-06, + "loss": 0.7491, + "step": 9726 + }, + { + "epoch": 0.79, + "grad_norm": 5.197260372620817, + "learning_rate": 1.112256725791367e-06, + "loss": 0.5711, + "step": 9727 + }, + { + "epoch": 0.79, + "grad_norm": 5.916259531545235, + "learning_rate": 1.1114297354639553e-06, + "loss": 0.7968, + "step": 9728 + }, + { + "epoch": 0.79, + "grad_norm": 4.837624168103878, + "learning_rate": 1.1106030142500917e-06, + "loss": 0.7132, + "step": 9729 + }, + { + "epoch": 0.79, + "grad_norm": 2.6842887453832907, + "learning_rate": 1.1097765622069878e-06, + "loss": 0.6206, + "step": 9730 + }, + { + "epoch": 0.79, + "grad_norm": 5.805029141042495, + "learning_rate": 1.1089503793918438e-06, + "loss": 0.5622, + "step": 9731 + }, + { + "epoch": 0.79, + "grad_norm": 4.198211582905947, + "learning_rate": 1.1081244658618306e-06, + "loss": 0.8159, + "step": 9732 + }, + { + "epoch": 0.79, + "grad_norm": 5.898727013812226, + "learning_rate": 1.107298821674111e-06, + "loss": 0.6261, + "step": 9733 + }, + { + "epoch": 0.79, + "grad_norm": 7.105839467981459, + "learning_rate": 1.1064734468858223e-06, + "loss": 0.5587, + "step": 9734 + }, + { + "epoch": 0.79, + "grad_norm": 4.5615799781334845, + "learning_rate": 1.1056483415540874e-06, + "loss": 0.6985, + "step": 9735 + }, + { + "epoch": 0.79, + "grad_norm": 3.892344535612827, + "learning_rate": 1.104823505736009e-06, + "loss": 0.6041, + "step": 9736 + }, + { + "epoch": 0.79, + "grad_norm": 5.642601068388329, + "learning_rate": 1.1039989394886686e-06, + "loss": 0.6085, + "step": 9737 + }, + { + "epoch": 0.79, + "grad_norm": 4.855958276444511, + "learning_rate": 1.1031746428691354e-06, + "loss": 0.7245, + "step": 9738 + }, + { + "epoch": 0.79, + "grad_norm": 4.539357229552136, + "learning_rate": 1.1023506159344498e-06, + "loss": 0.7087, + "step": 9739 + }, + { + "epoch": 0.79, + "grad_norm": 6.767155304205483, + "learning_rate": 1.1015268587416455e-06, + "loss": 0.741, + "step": 9740 + }, + { + "epoch": 0.79, + "grad_norm": 3.522013878855432, + "learning_rate": 1.1007033713477277e-06, + "loss": 0.7851, + "step": 9741 + }, + { + "epoch": 0.79, + "grad_norm": 2.4456256187860443, + "learning_rate": 1.0998801538096904e-06, + "loss": 0.6785, + "step": 9742 + }, + { + "epoch": 0.79, + "grad_norm": 4.624646460868328, + "learning_rate": 1.0990572061845034e-06, + "loss": 0.745, + "step": 9743 + }, + { + "epoch": 0.79, + "grad_norm": 4.0752240110712785, + "learning_rate": 1.0982345285291184e-06, + "loss": 0.7212, + "step": 9744 + }, + { + "epoch": 0.79, + "grad_norm": 3.3468582299379555, + "learning_rate": 1.0974121209004746e-06, + "loss": 0.5652, + "step": 9745 + }, + { + "epoch": 0.79, + "grad_norm": 3.2845251821572865, + "learning_rate": 1.0965899833554821e-06, + "loss": 0.6849, + "step": 9746 + }, + { + "epoch": 0.79, + "grad_norm": 2.361084013711998, + "learning_rate": 1.0957681159510418e-06, + "loss": 0.8287, + "step": 9747 + }, + { + "epoch": 0.79, + "grad_norm": 3.2001332072406288, + "learning_rate": 1.09494651874403e-06, + "loss": 0.6247, + "step": 9748 + }, + { + "epoch": 0.79, + "grad_norm": 3.978647846563255, + "learning_rate": 1.0941251917913082e-06, + "loss": 0.7044, + "step": 9749 + }, + { + "epoch": 0.79, + "grad_norm": 3.061369104160993, + "learning_rate": 1.093304135149717e-06, + "loss": 0.683, + "step": 9750 + }, + { + "epoch": 0.79, + "grad_norm": 6.820948028367795, + "learning_rate": 1.0924833488760778e-06, + "loss": 0.7039, + "step": 9751 + }, + { + "epoch": 0.79, + "grad_norm": 9.612384506598756, + "learning_rate": 1.091662833027195e-06, + "loss": 0.7224, + "step": 9752 + }, + { + "epoch": 0.79, + "grad_norm": 2.22698314263358, + "learning_rate": 1.0908425876598512e-06, + "loss": 0.6723, + "step": 9753 + }, + { + "epoch": 0.79, + "grad_norm": 3.6679925662629467, + "learning_rate": 1.090022612830816e-06, + "loss": 0.6476, + "step": 9754 + }, + { + "epoch": 0.79, + "grad_norm": 5.046718509361913, + "learning_rate": 1.0892029085968343e-06, + "loss": 0.8094, + "step": 9755 + }, + { + "epoch": 0.79, + "grad_norm": 3.1467281063049084, + "learning_rate": 1.0883834750146366e-06, + "loss": 0.7074, + "step": 9756 + }, + { + "epoch": 0.79, + "grad_norm": 5.749920143474759, + "learning_rate": 1.0875643121409307e-06, + "loss": 0.6944, + "step": 9757 + }, + { + "epoch": 0.79, + "grad_norm": 2.931383088425106, + "learning_rate": 1.0867454200324123e-06, + "loss": 0.6525, + "step": 9758 + }, + { + "epoch": 0.79, + "grad_norm": 3.3960350593838005, + "learning_rate": 1.0859267987457478e-06, + "loss": 0.6391, + "step": 9759 + }, + { + "epoch": 0.79, + "grad_norm": 2.839040912644251, + "learning_rate": 1.085108448337595e-06, + "loss": 0.7594, + "step": 9760 + }, + { + "epoch": 0.79, + "grad_norm": 3.465451141606628, + "learning_rate": 1.0842903688645879e-06, + "loss": 0.6289, + "step": 9761 + }, + { + "epoch": 0.79, + "grad_norm": 3.8371713358598587, + "learning_rate": 1.0834725603833414e-06, + "loss": 0.6642, + "step": 9762 + }, + { + "epoch": 0.79, + "grad_norm": 4.125293622236857, + "learning_rate": 1.0826550229504552e-06, + "loss": 0.6999, + "step": 9763 + }, + { + "epoch": 0.79, + "grad_norm": 2.765093682787993, + "learning_rate": 1.0818377566225075e-06, + "loss": 0.7862, + "step": 9764 + }, + { + "epoch": 0.79, + "grad_norm": 3.327115086233531, + "learning_rate": 1.0810207614560575e-06, + "loss": 0.6173, + "step": 9765 + }, + { + "epoch": 0.79, + "grad_norm": 7.117884598842083, + "learning_rate": 1.0802040375076457e-06, + "loss": 0.7942, + "step": 9766 + }, + { + "epoch": 0.79, + "grad_norm": 3.944794301192135, + "learning_rate": 1.0793875848337964e-06, + "loss": 0.7212, + "step": 9767 + }, + { + "epoch": 0.79, + "grad_norm": 3.9042654880637957, + "learning_rate": 1.0785714034910128e-06, + "loss": 0.6901, + "step": 9768 + }, + { + "epoch": 0.79, + "grad_norm": 4.4270346318516, + "learning_rate": 1.077755493535778e-06, + "loss": 0.737, + "step": 9769 + }, + { + "epoch": 0.79, + "grad_norm": 5.013169963814797, + "learning_rate": 1.0769398550245613e-06, + "loss": 0.7072, + "step": 9770 + }, + { + "epoch": 0.79, + "grad_norm": 4.949637448756055, + "learning_rate": 1.0761244880138078e-06, + "loss": 0.6744, + "step": 9771 + }, + { + "epoch": 0.79, + "grad_norm": 3.2484788298471954, + "learning_rate": 1.0753093925599467e-06, + "loss": 0.712, + "step": 9772 + }, + { + "epoch": 0.79, + "grad_norm": 8.510977166981402, + "learning_rate": 1.0744945687193858e-06, + "loss": 0.766, + "step": 9773 + }, + { + "epoch": 0.79, + "grad_norm": 3.3349905675965297, + "learning_rate": 1.0736800165485194e-06, + "loss": 0.667, + "step": 9774 + }, + { + "epoch": 0.79, + "grad_norm": 2.3178571140266717, + "learning_rate": 1.072865736103718e-06, + "loss": 0.7812, + "step": 9775 + }, + { + "epoch": 0.79, + "grad_norm": 4.8316602969272235, + "learning_rate": 1.0720517274413338e-06, + "loss": 0.6317, + "step": 9776 + }, + { + "epoch": 0.79, + "grad_norm": 3.1927301207945282, + "learning_rate": 1.0712379906177034e-06, + "loss": 0.7594, + "step": 9777 + }, + { + "epoch": 0.79, + "grad_norm": 2.894156867274025, + "learning_rate": 1.070424525689142e-06, + "loss": 0.4974, + "step": 9778 + }, + { + "epoch": 0.79, + "grad_norm": 6.65724442669717, + "learning_rate": 1.0696113327119461e-06, + "loss": 0.7553, + "step": 9779 + }, + { + "epoch": 0.79, + "grad_norm": 8.213544208588305, + "learning_rate": 1.068798411742392e-06, + "loss": 0.5794, + "step": 9780 + }, + { + "epoch": 0.79, + "grad_norm": 4.029607735526111, + "learning_rate": 1.0679857628367423e-06, + "loss": 0.5589, + "step": 9781 + }, + { + "epoch": 0.79, + "grad_norm": 3.035865218068489, + "learning_rate": 1.0671733860512346e-06, + "loss": 0.7249, + "step": 9782 + }, + { + "epoch": 0.79, + "grad_norm": 3.2117165727754124, + "learning_rate": 1.0663612814420927e-06, + "loss": 0.5294, + "step": 9783 + }, + { + "epoch": 0.79, + "grad_norm": 2.9296379865594915, + "learning_rate": 1.0655494490655183e-06, + "loss": 0.6008, + "step": 9784 + }, + { + "epoch": 0.79, + "grad_norm": 6.690473689729569, + "learning_rate": 1.0647378889776956e-06, + "loss": 0.6787, + "step": 9785 + }, + { + "epoch": 0.79, + "grad_norm": 3.2495891997445536, + "learning_rate": 1.0639266012347892e-06, + "loss": 0.571, + "step": 9786 + }, + { + "epoch": 0.79, + "grad_norm": 4.588433892039435, + "learning_rate": 1.0631155858929448e-06, + "loss": 0.574, + "step": 9787 + }, + { + "epoch": 0.79, + "grad_norm": 12.087009859965255, + "learning_rate": 1.0623048430082917e-06, + "loss": 0.6206, + "step": 9788 + }, + { + "epoch": 0.8, + "grad_norm": 2.534138459952571, + "learning_rate": 1.0614943726369354e-06, + "loss": 0.6891, + "step": 9789 + }, + { + "epoch": 0.8, + "grad_norm": 3.282467796066206, + "learning_rate": 1.060684174834969e-06, + "loss": 0.483, + "step": 9790 + }, + { + "epoch": 0.8, + "grad_norm": 7.760725227511191, + "learning_rate": 1.059874249658462e-06, + "loss": 0.662, + "step": 9791 + }, + { + "epoch": 0.8, + "grad_norm": 4.110396428224749, + "learning_rate": 1.0590645971634655e-06, + "loss": 0.5828, + "step": 9792 + }, + { + "epoch": 0.8, + "grad_norm": 3.1462311175106166, + "learning_rate": 1.0582552174060133e-06, + "loss": 0.6508, + "step": 9793 + }, + { + "epoch": 0.8, + "grad_norm": 2.8866488415353535, + "learning_rate": 1.057446110442118e-06, + "loss": 0.7141, + "step": 9794 + }, + { + "epoch": 0.8, + "grad_norm": 2.6226958772210525, + "learning_rate": 1.0566372763277777e-06, + "loss": 0.6764, + "step": 9795 + }, + { + "epoch": 0.8, + "grad_norm": 3.2512020029050253, + "learning_rate": 1.0558287151189656e-06, + "loss": 0.6086, + "step": 9796 + }, + { + "epoch": 0.8, + "grad_norm": 6.3873777482631215, + "learning_rate": 1.055020426871643e-06, + "loss": 0.7259, + "step": 9797 + }, + { + "epoch": 0.8, + "grad_norm": 3.5571791715349765, + "learning_rate": 1.0542124116417456e-06, + "loss": 0.621, + "step": 9798 + }, + { + "epoch": 0.8, + "grad_norm": 2.7555725063588077, + "learning_rate": 1.0534046694851945e-06, + "loss": 0.5893, + "step": 9799 + }, + { + "epoch": 0.8, + "grad_norm": 6.802592347371291, + "learning_rate": 1.0525972004578904e-06, + "loss": 0.5954, + "step": 9800 + }, + { + "epoch": 0.8, + "grad_norm": 4.179481322058555, + "learning_rate": 1.051790004615713e-06, + "loss": 0.5572, + "step": 9801 + }, + { + "epoch": 0.8, + "grad_norm": 2.6510495370101723, + "learning_rate": 1.0509830820145294e-06, + "loss": 0.7066, + "step": 9802 + }, + { + "epoch": 0.8, + "grad_norm": 4.549669132313497, + "learning_rate": 1.0501764327101793e-06, + "loss": 0.6037, + "step": 9803 + }, + { + "epoch": 0.8, + "grad_norm": 3.3420122030448396, + "learning_rate": 1.0493700567584935e-06, + "loss": 0.6085, + "step": 9804 + }, + { + "epoch": 0.8, + "grad_norm": 4.143601182458291, + "learning_rate": 1.048563954215272e-06, + "loss": 0.8227, + "step": 9805 + }, + { + "epoch": 0.8, + "grad_norm": 3.8398553918063425, + "learning_rate": 1.0477581251363066e-06, + "loss": 0.6747, + "step": 9806 + }, + { + "epoch": 0.8, + "grad_norm": 2.8450218432357515, + "learning_rate": 1.0469525695773636e-06, + "loss": 0.7192, + "step": 9807 + }, + { + "epoch": 0.8, + "grad_norm": 4.071998571654696, + "learning_rate": 1.0461472875941935e-06, + "loss": 0.6987, + "step": 9808 + }, + { + "epoch": 0.8, + "grad_norm": 6.200122686823859, + "learning_rate": 1.0453422792425273e-06, + "loss": 0.7143, + "step": 9809 + }, + { + "epoch": 0.8, + "grad_norm": 3.1071614119043374, + "learning_rate": 1.0445375445780747e-06, + "loss": 0.5628, + "step": 9810 + }, + { + "epoch": 0.8, + "grad_norm": 3.5983636434502366, + "learning_rate": 1.0437330836565317e-06, + "loss": 0.5938, + "step": 9811 + }, + { + "epoch": 0.8, + "grad_norm": 2.546479104936366, + "learning_rate": 1.0429288965335683e-06, + "loss": 0.5832, + "step": 9812 + }, + { + "epoch": 0.8, + "grad_norm": 3.4932302962130124, + "learning_rate": 1.0421249832648416e-06, + "loss": 0.7725, + "step": 9813 + }, + { + "epoch": 0.8, + "grad_norm": 2.7445871570196374, + "learning_rate": 1.0413213439059855e-06, + "loss": 0.5961, + "step": 9814 + }, + { + "epoch": 0.8, + "grad_norm": 3.4847538198474868, + "learning_rate": 1.0405179785126201e-06, + "loss": 0.6386, + "step": 9815 + }, + { + "epoch": 0.8, + "grad_norm": 4.790984485566235, + "learning_rate": 1.0397148871403412e-06, + "loss": 0.5645, + "step": 9816 + }, + { + "epoch": 0.8, + "grad_norm": 3.0407499743680524, + "learning_rate": 1.0389120698447286e-06, + "loss": 0.5538, + "step": 9817 + }, + { + "epoch": 0.8, + "grad_norm": 3.2291153915641915, + "learning_rate": 1.0381095266813413e-06, + "loss": 0.6546, + "step": 9818 + }, + { + "epoch": 0.8, + "grad_norm": 3.414551047727841, + "learning_rate": 1.0373072577057197e-06, + "loss": 0.6793, + "step": 9819 + }, + { + "epoch": 0.8, + "grad_norm": 3.8562338931200952, + "learning_rate": 1.0365052629733884e-06, + "loss": 0.6466, + "step": 9820 + }, + { + "epoch": 0.8, + "grad_norm": 2.8985592894916477, + "learning_rate": 1.0357035425398482e-06, + "loss": 0.5847, + "step": 9821 + }, + { + "epoch": 0.8, + "grad_norm": 2.9384587727277904, + "learning_rate": 1.034902096460585e-06, + "loss": 0.6297, + "step": 9822 + }, + { + "epoch": 0.8, + "grad_norm": 2.917657446062314, + "learning_rate": 1.0341009247910626e-06, + "loss": 0.7864, + "step": 9823 + }, + { + "epoch": 0.8, + "grad_norm": 3.729609375101682, + "learning_rate": 1.0333000275867284e-06, + "loss": 0.6788, + "step": 9824 + }, + { + "epoch": 0.8, + "grad_norm": 3.6943581375243597, + "learning_rate": 1.0324994049030085e-06, + "loss": 0.5271, + "step": 9825 + }, + { + "epoch": 0.8, + "grad_norm": 3.9860330169121614, + "learning_rate": 1.0316990567953101e-06, + "loss": 0.6025, + "step": 9826 + }, + { + "epoch": 0.8, + "grad_norm": 2.636193133743554, + "learning_rate": 1.0308989833190241e-06, + "loss": 0.7409, + "step": 9827 + }, + { + "epoch": 0.8, + "grad_norm": 2.4835857648800244, + "learning_rate": 1.030099184529519e-06, + "loss": 0.6271, + "step": 9828 + }, + { + "epoch": 0.8, + "grad_norm": 3.1346390387416663, + "learning_rate": 1.0292996604821482e-06, + "loss": 0.7568, + "step": 9829 + }, + { + "epoch": 0.8, + "grad_norm": 3.7144664418359605, + "learning_rate": 1.0285004112322428e-06, + "loss": 0.6225, + "step": 9830 + }, + { + "epoch": 0.8, + "grad_norm": 2.4355144159678597, + "learning_rate": 1.0277014368351152e-06, + "loss": 0.5937, + "step": 9831 + }, + { + "epoch": 0.8, + "grad_norm": 4.515312259428321, + "learning_rate": 1.0269027373460589e-06, + "loss": 0.7651, + "step": 9832 + }, + { + "epoch": 0.8, + "grad_norm": 3.0296099442093793, + "learning_rate": 1.0261043128203508e-06, + "loss": 0.5965, + "step": 9833 + }, + { + "epoch": 0.8, + "grad_norm": 8.30637490607456, + "learning_rate": 1.025306163313246e-06, + "loss": 0.6246, + "step": 9834 + }, + { + "epoch": 0.8, + "grad_norm": 3.453953429327857, + "learning_rate": 1.02450828887998e-06, + "loss": 0.7047, + "step": 9835 + }, + { + "epoch": 0.8, + "grad_norm": 3.2883844110558766, + "learning_rate": 1.0237106895757738e-06, + "loss": 0.5855, + "step": 9836 + }, + { + "epoch": 0.8, + "grad_norm": 4.291481196393002, + "learning_rate": 1.022913365455825e-06, + "loss": 0.6593, + "step": 9837 + }, + { + "epoch": 0.8, + "grad_norm": 2.462518202903803, + "learning_rate": 1.0221163165753122e-06, + "loss": 0.695, + "step": 9838 + }, + { + "epoch": 0.8, + "grad_norm": 4.755127102563516, + "learning_rate": 1.0213195429893963e-06, + "loss": 0.6545, + "step": 9839 + }, + { + "epoch": 0.8, + "grad_norm": 3.2668505756551944, + "learning_rate": 1.0205230447532217e-06, + "loss": 0.6016, + "step": 9840 + }, + { + "epoch": 0.8, + "grad_norm": 8.225575090310823, + "learning_rate": 1.0197268219219087e-06, + "loss": 0.66, + "step": 9841 + }, + { + "epoch": 0.8, + "grad_norm": 3.2115669509100533, + "learning_rate": 1.0189308745505598e-06, + "loss": 0.7311, + "step": 9842 + }, + { + "epoch": 0.8, + "grad_norm": 2.703656660165645, + "learning_rate": 1.0181352026942632e-06, + "loss": 0.5958, + "step": 9843 + }, + { + "epoch": 0.8, + "grad_norm": 6.347740546803845, + "learning_rate": 1.017339806408082e-06, + "loss": 0.5058, + "step": 9844 + }, + { + "epoch": 0.8, + "grad_norm": 3.7415659875750897, + "learning_rate": 1.0165446857470635e-06, + "loss": 0.5069, + "step": 9845 + }, + { + "epoch": 0.8, + "grad_norm": 9.315507809385082, + "learning_rate": 1.015749840766233e-06, + "loss": 0.6386, + "step": 9846 + }, + { + "epoch": 0.8, + "grad_norm": 3.994838929632411, + "learning_rate": 1.0149552715206024e-06, + "loss": 0.6315, + "step": 9847 + }, + { + "epoch": 0.8, + "grad_norm": 2.343483464463572, + "learning_rate": 1.0141609780651585e-06, + "loss": 0.6322, + "step": 9848 + }, + { + "epoch": 0.8, + "grad_norm": 2.8765354442463704, + "learning_rate": 1.0133669604548702e-06, + "loss": 0.7813, + "step": 9849 + }, + { + "epoch": 0.8, + "grad_norm": 3.017737827426694, + "learning_rate": 1.0125732187446918e-06, + "loss": 0.7039, + "step": 9850 + }, + { + "epoch": 0.8, + "grad_norm": 2.6575724236641967, + "learning_rate": 1.0117797529895535e-06, + "loss": 0.6462, + "step": 9851 + }, + { + "epoch": 0.8, + "grad_norm": 4.011062257505342, + "learning_rate": 1.0109865632443684e-06, + "loss": 0.7091, + "step": 9852 + }, + { + "epoch": 0.8, + "grad_norm": 2.842731910061206, + "learning_rate": 1.0101936495640285e-06, + "loss": 0.6484, + "step": 9853 + }, + { + "epoch": 0.8, + "grad_norm": 2.9779938108617507, + "learning_rate": 1.0094010120034115e-06, + "loss": 0.7331, + "step": 9854 + }, + { + "epoch": 0.8, + "grad_norm": 2.797181126354803, + "learning_rate": 1.008608650617371e-06, + "loss": 0.6564, + "step": 9855 + }, + { + "epoch": 0.8, + "grad_norm": 2.7150580818574706, + "learning_rate": 1.0078165654607425e-06, + "loss": 0.6411, + "step": 9856 + }, + { + "epoch": 0.8, + "grad_norm": 5.477065946580599, + "learning_rate": 1.0070247565883462e-06, + "loss": 0.6293, + "step": 9857 + }, + { + "epoch": 0.8, + "grad_norm": 2.996772066276645, + "learning_rate": 1.0062332240549782e-06, + "loss": 0.6806, + "step": 9858 + }, + { + "epoch": 0.8, + "grad_norm": 6.118761970564444, + "learning_rate": 1.0054419679154182e-06, + "loss": 0.7158, + "step": 9859 + }, + { + "epoch": 0.8, + "grad_norm": 4.5072894417883305, + "learning_rate": 1.0046509882244243e-06, + "loss": 0.5304, + "step": 9860 + }, + { + "epoch": 0.8, + "grad_norm": 8.209889708484853, + "learning_rate": 1.0038602850367401e-06, + "loss": 0.7886, + "step": 9861 + }, + { + "epoch": 0.8, + "grad_norm": 4.827637373309874, + "learning_rate": 1.0030698584070848e-06, + "loss": 0.6521, + "step": 9862 + }, + { + "epoch": 0.8, + "grad_norm": 2.60922441210416, + "learning_rate": 1.002279708390163e-06, + "loss": 0.6337, + "step": 9863 + }, + { + "epoch": 0.8, + "grad_norm": 5.138348283360533, + "learning_rate": 1.0014898350406577e-06, + "loss": 0.5664, + "step": 9864 + }, + { + "epoch": 0.8, + "grad_norm": 3.9101353720032335, + "learning_rate": 1.0007002384132325e-06, + "loss": 0.5833, + "step": 9865 + }, + { + "epoch": 0.8, + "grad_norm": 2.7448521102382304, + "learning_rate": 9.999109185625321e-07, + "loss": 0.6864, + "step": 9866 + }, + { + "epoch": 0.8, + "grad_norm": 2.2845213466571432, + "learning_rate": 9.991218755431814e-07, + "loss": 0.6316, + "step": 9867 + }, + { + "epoch": 0.8, + "grad_norm": 8.489723838723888, + "learning_rate": 9.983331094097903e-07, + "loss": 0.6779, + "step": 9868 + }, + { + "epoch": 0.8, + "grad_norm": 5.302306223808616, + "learning_rate": 9.975446202169432e-07, + "loss": 0.5683, + "step": 9869 + }, + { + "epoch": 0.8, + "grad_norm": 4.744571955249638, + "learning_rate": 9.967564080192122e-07, + "loss": 0.7098, + "step": 9870 + }, + { + "epoch": 0.8, + "grad_norm": 3.8680602372490682, + "learning_rate": 9.959684728711417e-07, + "loss": 0.8085, + "step": 9871 + }, + { + "epoch": 0.8, + "grad_norm": 3.3022222488447164, + "learning_rate": 9.951808148272656e-07, + "loss": 0.7149, + "step": 9872 + }, + { + "epoch": 0.8, + "grad_norm": 3.1886163022263077, + "learning_rate": 9.943934339420941e-07, + "loss": 0.7712, + "step": 9873 + }, + { + "epoch": 0.8, + "grad_norm": 4.330511260748767, + "learning_rate": 9.936063302701165e-07, + "loss": 0.8742, + "step": 9874 + }, + { + "epoch": 0.8, + "grad_norm": 3.2640347591500034, + "learning_rate": 9.928195038658085e-07, + "loss": 0.5693, + "step": 9875 + }, + { + "epoch": 0.8, + "grad_norm": 12.38615577161832, + "learning_rate": 9.92032954783621e-07, + "loss": 0.682, + "step": 9876 + }, + { + "epoch": 0.8, + "grad_norm": 3.1347015589706433, + "learning_rate": 9.91246683077992e-07, + "loss": 0.7185, + "step": 9877 + }, + { + "epoch": 0.8, + "grad_norm": 3.5947056110095885, + "learning_rate": 9.904606888033307e-07, + "loss": 0.7086, + "step": 9878 + }, + { + "epoch": 0.8, + "grad_norm": 2.7228555737199014, + "learning_rate": 9.896749720140375e-07, + "loss": 0.7253, + "step": 9879 + }, + { + "epoch": 0.8, + "grad_norm": 7.816999087187546, + "learning_rate": 9.888895327644876e-07, + "loss": 0.6724, + "step": 9880 + }, + { + "epoch": 0.8, + "grad_norm": 3.669850136064356, + "learning_rate": 9.881043711090366e-07, + "loss": 0.6906, + "step": 9881 + }, + { + "epoch": 0.8, + "grad_norm": 2.9955352572892413, + "learning_rate": 9.873194871020252e-07, + "loss": 0.6552, + "step": 9882 + }, + { + "epoch": 0.8, + "grad_norm": 4.418101307687318, + "learning_rate": 9.865348807977698e-07, + "loss": 0.7678, + "step": 9883 + }, + { + "epoch": 0.8, + "grad_norm": 3.9007923991874645, + "learning_rate": 9.857505522505745e-07, + "loss": 0.7791, + "step": 9884 + }, + { + "epoch": 0.8, + "grad_norm": 2.843120049423034, + "learning_rate": 9.849665015147136e-07, + "loss": 0.5385, + "step": 9885 + }, + { + "epoch": 0.8, + "grad_norm": 3.623885339692565, + "learning_rate": 9.841827286444532e-07, + "loss": 0.5117, + "step": 9886 + }, + { + "epoch": 0.8, + "grad_norm": 7.32091976815324, + "learning_rate": 9.833992336940328e-07, + "loss": 0.7094, + "step": 9887 + }, + { + "epoch": 0.8, + "grad_norm": 5.4631310249121805, + "learning_rate": 9.826160167176768e-07, + "loss": 0.6313, + "step": 9888 + }, + { + "epoch": 0.8, + "grad_norm": 2.5846001876292672, + "learning_rate": 9.818330777695878e-07, + "loss": 0.6994, + "step": 9889 + }, + { + "epoch": 0.8, + "grad_norm": 5.0506085763568676, + "learning_rate": 9.81050416903951e-07, + "loss": 0.5898, + "step": 9890 + }, + { + "epoch": 0.8, + "grad_norm": 2.5686981938386353, + "learning_rate": 9.802680341749303e-07, + "loss": 0.6972, + "step": 9891 + }, + { + "epoch": 0.8, + "grad_norm": 2.3391065713945363, + "learning_rate": 9.794859296366704e-07, + "loss": 0.5776, + "step": 9892 + }, + { + "epoch": 0.8, + "grad_norm": 11.017948277806132, + "learning_rate": 9.787041033433014e-07, + "loss": 0.429, + "step": 9893 + }, + { + "epoch": 0.8, + "grad_norm": 5.6097893167574115, + "learning_rate": 9.77922555348927e-07, + "loss": 0.625, + "step": 9894 + }, + { + "epoch": 0.8, + "grad_norm": 3.553094928138148, + "learning_rate": 9.771412857076379e-07, + "loss": 0.5638, + "step": 9895 + }, + { + "epoch": 0.8, + "grad_norm": 3.2993411124900534, + "learning_rate": 9.763602944735018e-07, + "loss": 0.7334, + "step": 9896 + }, + { + "epoch": 0.8, + "grad_norm": 3.20098221947586, + "learning_rate": 9.755795817005686e-07, + "loss": 0.5926, + "step": 9897 + }, + { + "epoch": 0.8, + "grad_norm": 8.868622972391147, + "learning_rate": 9.747991474428682e-07, + "loss": 0.6959, + "step": 9898 + }, + { + "epoch": 0.8, + "grad_norm": 4.930077251644765, + "learning_rate": 9.740189917544102e-07, + "loss": 0.5402, + "step": 9899 + }, + { + "epoch": 0.8, + "grad_norm": 5.6389110324195295, + "learning_rate": 9.73239114689189e-07, + "loss": 0.7216, + "step": 9900 + }, + { + "epoch": 0.8, + "grad_norm": 5.8879918247849545, + "learning_rate": 9.724595163011741e-07, + "loss": 0.5999, + "step": 9901 + }, + { + "epoch": 0.8, + "grad_norm": 2.9601317312061295, + "learning_rate": 9.716801966443211e-07, + "loss": 0.5953, + "step": 9902 + }, + { + "epoch": 0.8, + "grad_norm": 12.242294430189657, + "learning_rate": 9.709011557725639e-07, + "loss": 0.5665, + "step": 9903 + }, + { + "epoch": 0.8, + "grad_norm": 2.900616768327703, + "learning_rate": 9.701223937398152e-07, + "loss": 0.7114, + "step": 9904 + }, + { + "epoch": 0.8, + "grad_norm": 4.764342312992366, + "learning_rate": 9.693439105999715e-07, + "loss": 0.8228, + "step": 9905 + }, + { + "epoch": 0.8, + "grad_norm": 3.0850936729241676, + "learning_rate": 9.68565706406907e-07, + "loss": 0.6997, + "step": 9906 + }, + { + "epoch": 0.8, + "grad_norm": 3.245720389276964, + "learning_rate": 9.677877812144803e-07, + "loss": 0.4705, + "step": 9907 + }, + { + "epoch": 0.8, + "grad_norm": 6.250918850932546, + "learning_rate": 9.670101350765276e-07, + "loss": 0.5754, + "step": 9908 + }, + { + "epoch": 0.8, + "grad_norm": 3.1019898222030506, + "learning_rate": 9.66232768046868e-07, + "loss": 0.8221, + "step": 9909 + }, + { + "epoch": 0.8, + "grad_norm": 3.9075477351777055, + "learning_rate": 9.654556801793002e-07, + "loss": 0.6233, + "step": 9910 + }, + { + "epoch": 0.8, + "grad_norm": 2.95268295266713, + "learning_rate": 9.646788715276024e-07, + "loss": 0.7627, + "step": 9911 + }, + { + "epoch": 0.81, + "grad_norm": 3.9429748279105863, + "learning_rate": 9.63902342145534e-07, + "loss": 0.5924, + "step": 9912 + }, + { + "epoch": 0.81, + "grad_norm": 6.556515030329859, + "learning_rate": 9.631260920868386e-07, + "loss": 0.7955, + "step": 9913 + }, + { + "epoch": 0.81, + "grad_norm": 7.014493961271654, + "learning_rate": 9.62350121405235e-07, + "loss": 0.7527, + "step": 9914 + }, + { + "epoch": 0.81, + "grad_norm": 5.044064711572825, + "learning_rate": 9.615744301544256e-07, + "loss": 0.6153, + "step": 9915 + }, + { + "epoch": 0.81, + "grad_norm": 3.5663222594727806, + "learning_rate": 9.607990183880944e-07, + "loss": 0.6786, + "step": 9916 + }, + { + "epoch": 0.81, + "grad_norm": 6.631948114406123, + "learning_rate": 9.600238861599047e-07, + "loss": 0.6252, + "step": 9917 + }, + { + "epoch": 0.81, + "grad_norm": 4.244549301947422, + "learning_rate": 9.592490335234993e-07, + "loss": 0.5268, + "step": 9918 + }, + { + "epoch": 0.81, + "grad_norm": 17.741883362213013, + "learning_rate": 9.584744605325024e-07, + "loss": 0.7255, + "step": 9919 + }, + { + "epoch": 0.81, + "grad_norm": 3.908729417462394, + "learning_rate": 9.577001672405218e-07, + "loss": 0.88, + "step": 9920 + }, + { + "epoch": 0.81, + "grad_norm": 2.6733531510879756, + "learning_rate": 9.569261537011421e-07, + "loss": 0.4844, + "step": 9921 + }, + { + "epoch": 0.81, + "grad_norm": 4.3824202752278305, + "learning_rate": 9.561524199679284e-07, + "loss": 0.786, + "step": 9922 + }, + { + "epoch": 0.81, + "grad_norm": 3.3814556966659737, + "learning_rate": 9.553789660944318e-07, + "loss": 0.715, + "step": 9923 + }, + { + "epoch": 0.81, + "grad_norm": 7.21167255596437, + "learning_rate": 9.54605792134175e-07, + "loss": 0.6129, + "step": 9924 + }, + { + "epoch": 0.81, + "grad_norm": 4.770964950828427, + "learning_rate": 9.538328981406714e-07, + "loss": 0.6037, + "step": 9925 + }, + { + "epoch": 0.81, + "grad_norm": 6.126381299311236, + "learning_rate": 9.530602841674064e-07, + "loss": 0.5184, + "step": 9926 + }, + { + "epoch": 0.81, + "grad_norm": 2.923778646833415, + "learning_rate": 9.522879502678522e-07, + "loss": 0.76, + "step": 9927 + }, + { + "epoch": 0.81, + "grad_norm": 3.2053625728389443, + "learning_rate": 9.515158964954585e-07, + "loss": 0.7445, + "step": 9928 + }, + { + "epoch": 0.81, + "grad_norm": 3.621953503704296, + "learning_rate": 9.507441229036551e-07, + "loss": 0.6254, + "step": 9929 + }, + { + "epoch": 0.81, + "grad_norm": 10.117085393736515, + "learning_rate": 9.499726295458572e-07, + "loss": 0.7325, + "step": 9930 + }, + { + "epoch": 0.81, + "grad_norm": 3.2295081819081135, + "learning_rate": 9.492014164754521e-07, + "loss": 0.5779, + "step": 9931 + }, + { + "epoch": 0.81, + "grad_norm": 2.606483573577365, + "learning_rate": 9.484304837458158e-07, + "loss": 0.6866, + "step": 9932 + }, + { + "epoch": 0.81, + "grad_norm": 2.76858179425622, + "learning_rate": 9.476598314102992e-07, + "loss": 0.6533, + "step": 9933 + }, + { + "epoch": 0.81, + "grad_norm": 5.365714699325056, + "learning_rate": 9.468894595222399e-07, + "loss": 0.6083, + "step": 9934 + }, + { + "epoch": 0.81, + "grad_norm": 2.537466135233147, + "learning_rate": 9.46119368134949e-07, + "loss": 0.6682, + "step": 9935 + }, + { + "epoch": 0.81, + "grad_norm": 2.672354938808407, + "learning_rate": 9.453495573017241e-07, + "loss": 0.6396, + "step": 9936 + }, + { + "epoch": 0.81, + "grad_norm": 3.1029966936616122, + "learning_rate": 9.445800270758404e-07, + "loss": 0.5554, + "step": 9937 + }, + { + "epoch": 0.81, + "grad_norm": 3.667919478908177, + "learning_rate": 9.438107775105538e-07, + "loss": 0.6622, + "step": 9938 + }, + { + "epoch": 0.81, + "grad_norm": 4.481492053441964, + "learning_rate": 9.430418086591008e-07, + "loss": 0.6594, + "step": 9939 + }, + { + "epoch": 0.81, + "grad_norm": 3.732630352314224, + "learning_rate": 9.422731205746988e-07, + "loss": 0.6862, + "step": 9940 + }, + { + "epoch": 0.81, + "grad_norm": 3.9530019307486466, + "learning_rate": 9.41504713310547e-07, + "loss": 0.7385, + "step": 9941 + }, + { + "epoch": 0.81, + "grad_norm": 9.053886732090607, + "learning_rate": 9.407365869198226e-07, + "loss": 0.6474, + "step": 9942 + }, + { + "epoch": 0.81, + "grad_norm": 3.6758936164347076, + "learning_rate": 9.399687414556885e-07, + "loss": 0.6395, + "step": 9943 + }, + { + "epoch": 0.81, + "grad_norm": 3.8841177307556918, + "learning_rate": 9.392011769712784e-07, + "loss": 0.7285, + "step": 9944 + }, + { + "epoch": 0.81, + "grad_norm": 2.562305562817162, + "learning_rate": 9.384338935197174e-07, + "loss": 0.6913, + "step": 9945 + }, + { + "epoch": 0.81, + "grad_norm": 8.272170620929478, + "learning_rate": 9.376668911541042e-07, + "loss": 0.8153, + "step": 9946 + }, + { + "epoch": 0.81, + "grad_norm": 2.599613293050827, + "learning_rate": 9.369001699275199e-07, + "loss": 0.7047, + "step": 9947 + }, + { + "epoch": 0.81, + "grad_norm": 5.567350074408844, + "learning_rate": 9.361337298930284e-07, + "loss": 0.5762, + "step": 9948 + }, + { + "epoch": 0.81, + "grad_norm": 4.325903761824358, + "learning_rate": 9.353675711036697e-07, + "loss": 0.6129, + "step": 9949 + }, + { + "epoch": 0.81, + "grad_norm": 3.660770102574959, + "learning_rate": 9.346016936124708e-07, + "loss": 0.6449, + "step": 9950 + }, + { + "epoch": 0.81, + "grad_norm": 5.817692395970429, + "learning_rate": 9.338360974724298e-07, + "loss": 0.6095, + "step": 9951 + }, + { + "epoch": 0.81, + "grad_norm": 8.418374291451418, + "learning_rate": 9.330707827365354e-07, + "loss": 0.715, + "step": 9952 + }, + { + "epoch": 0.81, + "grad_norm": 6.890711367873007, + "learning_rate": 9.323057494577498e-07, + "loss": 0.6922, + "step": 9953 + }, + { + "epoch": 0.81, + "grad_norm": 2.0488552272629583, + "learning_rate": 9.315409976890172e-07, + "loss": 0.6094, + "step": 9954 + }, + { + "epoch": 0.81, + "grad_norm": 5.277842085185813, + "learning_rate": 9.307765274832664e-07, + "loss": 0.6416, + "step": 9955 + }, + { + "epoch": 0.81, + "grad_norm": 9.701786330251842, + "learning_rate": 9.300123388934001e-07, + "loss": 0.7129, + "step": 9956 + }, + { + "epoch": 0.81, + "grad_norm": 2.685363438627029, + "learning_rate": 9.292484319723094e-07, + "loss": 0.6335, + "step": 9957 + }, + { + "epoch": 0.81, + "grad_norm": 5.091373219660217, + "learning_rate": 9.284848067728569e-07, + "loss": 0.6499, + "step": 9958 + }, + { + "epoch": 0.81, + "grad_norm": 5.263243363282859, + "learning_rate": 9.277214633478926e-07, + "loss": 0.7102, + "step": 9959 + }, + { + "epoch": 0.81, + "grad_norm": 9.469385285515743, + "learning_rate": 9.269584017502431e-07, + "loss": 0.8157, + "step": 9960 + }, + { + "epoch": 0.81, + "grad_norm": 4.874019528623201, + "learning_rate": 9.261956220327195e-07, + "loss": 0.6714, + "step": 9961 + }, + { + "epoch": 0.81, + "grad_norm": 10.350105670674443, + "learning_rate": 9.254331242481102e-07, + "loss": 0.7947, + "step": 9962 + }, + { + "epoch": 0.81, + "grad_norm": 3.386553640009264, + "learning_rate": 9.246709084491839e-07, + "loss": 0.6018, + "step": 9963 + }, + { + "epoch": 0.81, + "grad_norm": 2.548136398112541, + "learning_rate": 9.239089746886909e-07, + "loss": 0.6337, + "step": 9964 + }, + { + "epoch": 0.81, + "grad_norm": 3.886245952536533, + "learning_rate": 9.231473230193611e-07, + "loss": 0.594, + "step": 9965 + }, + { + "epoch": 0.81, + "grad_norm": 4.813430292578901, + "learning_rate": 9.223859534939073e-07, + "loss": 0.7768, + "step": 9966 + }, + { + "epoch": 0.81, + "grad_norm": 5.206574699364554, + "learning_rate": 9.216248661650196e-07, + "loss": 0.5511, + "step": 9967 + }, + { + "epoch": 0.81, + "grad_norm": 3.1499657962144294, + "learning_rate": 9.208640610853719e-07, + "loss": 0.632, + "step": 9968 + }, + { + "epoch": 0.81, + "grad_norm": 4.785945467020094, + "learning_rate": 9.201035383076152e-07, + "loss": 0.7104, + "step": 9969 + }, + { + "epoch": 0.81, + "grad_norm": 2.6062730959721105, + "learning_rate": 9.19343297884383e-07, + "loss": 0.546, + "step": 9970 + }, + { + "epoch": 0.81, + "grad_norm": 4.312447485612665, + "learning_rate": 9.185833398682886e-07, + "loss": 0.71, + "step": 9971 + }, + { + "epoch": 0.81, + "grad_norm": 3.0433521237370815, + "learning_rate": 9.178236643119242e-07, + "loss": 0.7095, + "step": 9972 + }, + { + "epoch": 0.81, + "grad_norm": 5.3710641076397065, + "learning_rate": 9.170642712678674e-07, + "loss": 0.7192, + "step": 9973 + }, + { + "epoch": 0.81, + "grad_norm": 4.744952186413227, + "learning_rate": 9.163051607886703e-07, + "loss": 0.715, + "step": 9974 + }, + { + "epoch": 0.81, + "grad_norm": 4.208452409065429, + "learning_rate": 9.155463329268699e-07, + "loss": 0.6187, + "step": 9975 + }, + { + "epoch": 0.81, + "grad_norm": 3.5017488865379813, + "learning_rate": 9.147877877349815e-07, + "loss": 0.751, + "step": 9976 + }, + { + "epoch": 0.81, + "grad_norm": 12.000348819317006, + "learning_rate": 9.140295252655002e-07, + "loss": 0.616, + "step": 9977 + }, + { + "epoch": 0.81, + "grad_norm": 3.1433084888266336, + "learning_rate": 9.132715455709035e-07, + "loss": 0.6289, + "step": 9978 + }, + { + "epoch": 0.81, + "grad_norm": 35.7302882179897, + "learning_rate": 9.125138487036467e-07, + "loss": 0.7065, + "step": 9979 + }, + { + "epoch": 0.81, + "grad_norm": 4.902207323945893, + "learning_rate": 9.1175643471617e-07, + "loss": 0.6192, + "step": 9980 + }, + { + "epoch": 0.81, + "grad_norm": 3.2746288505432606, + "learning_rate": 9.109993036608883e-07, + "loss": 0.6142, + "step": 9981 + }, + { + "epoch": 0.81, + "grad_norm": 4.800546252378688, + "learning_rate": 9.102424555902023e-07, + "loss": 0.7213, + "step": 9982 + }, + { + "epoch": 0.81, + "grad_norm": 20.975126133655735, + "learning_rate": 9.094858905564902e-07, + "loss": 0.669, + "step": 9983 + }, + { + "epoch": 0.81, + "grad_norm": 2.8746858668462645, + "learning_rate": 9.0872960861211e-07, + "loss": 0.7381, + "step": 9984 + }, + { + "epoch": 0.81, + "grad_norm": 5.067209310758347, + "learning_rate": 9.079736098094006e-07, + "loss": 0.5981, + "step": 9985 + }, + { + "epoch": 0.81, + "grad_norm": 2.50363557741976, + "learning_rate": 9.072178942006838e-07, + "loss": 0.5365, + "step": 9986 + }, + { + "epoch": 0.81, + "grad_norm": 3.4652050748271845, + "learning_rate": 9.064624618382595e-07, + "loss": 0.548, + "step": 9987 + }, + { + "epoch": 0.81, + "grad_norm": 10.690335971055255, + "learning_rate": 9.057073127744065e-07, + "loss": 0.6651, + "step": 9988 + }, + { + "epoch": 0.81, + "grad_norm": 6.79897315345429, + "learning_rate": 9.049524470613885e-07, + "loss": 0.6373, + "step": 9989 + }, + { + "epoch": 0.81, + "grad_norm": 5.229264430109422, + "learning_rate": 9.041978647514454e-07, + "loss": 0.5997, + "step": 9990 + }, + { + "epoch": 0.81, + "grad_norm": 2.4722038457767863, + "learning_rate": 9.034435658967999e-07, + "loss": 0.5981, + "step": 9991 + }, + { + "epoch": 0.81, + "grad_norm": 2.8959921227574075, + "learning_rate": 9.026895505496519e-07, + "loss": 0.5882, + "step": 9992 + }, + { + "epoch": 0.81, + "grad_norm": 3.8511478104590635, + "learning_rate": 9.019358187621874e-07, + "loss": 0.6288, + "step": 9993 + }, + { + "epoch": 0.81, + "grad_norm": 4.146488149715795, + "learning_rate": 9.011823705865674e-07, + "loss": 0.704, + "step": 9994 + }, + { + "epoch": 0.81, + "grad_norm": 2.903316464439226, + "learning_rate": 9.004292060749347e-07, + "loss": 0.7238, + "step": 9995 + }, + { + "epoch": 0.81, + "grad_norm": 3.230917624133374, + "learning_rate": 8.996763252794166e-07, + "loss": 0.7223, + "step": 9996 + }, + { + "epoch": 0.81, + "grad_norm": 4.521846836940077, + "learning_rate": 8.989237282521118e-07, + "loss": 0.8728, + "step": 9997 + }, + { + "epoch": 0.81, + "grad_norm": 3.7138821547713445, + "learning_rate": 8.981714150451093e-07, + "loss": 0.7436, + "step": 9998 + }, + { + "epoch": 0.81, + "grad_norm": 3.194124361155184, + "learning_rate": 8.974193857104702e-07, + "loss": 0.5838, + "step": 9999 + }, + { + "epoch": 0.81, + "grad_norm": 4.403892501063843, + "learning_rate": 8.966676403002434e-07, + "loss": 0.5369, + "step": 10000 + }, + { + "epoch": 0.81, + "grad_norm": 4.375022464560412, + "learning_rate": 8.959161788664522e-07, + "loss": 0.5794, + "step": 10001 + }, + { + "epoch": 0.81, + "grad_norm": 3.1482155647840733, + "learning_rate": 8.951650014611019e-07, + "loss": 0.7108, + "step": 10002 + }, + { + "epoch": 0.81, + "grad_norm": 2.821499133219986, + "learning_rate": 8.944141081361818e-07, + "loss": 0.6749, + "step": 10003 + }, + { + "epoch": 0.81, + "grad_norm": 5.211414831541902, + "learning_rate": 8.936634989436537e-07, + "loss": 0.5111, + "step": 10004 + }, + { + "epoch": 0.81, + "grad_norm": 3.3177974720829506, + "learning_rate": 8.929131739354691e-07, + "loss": 0.531, + "step": 10005 + }, + { + "epoch": 0.81, + "grad_norm": 3.9594489519800056, + "learning_rate": 8.921631331635516e-07, + "loss": 0.7919, + "step": 10006 + }, + { + "epoch": 0.81, + "grad_norm": 9.729668434642653, + "learning_rate": 8.914133766798117e-07, + "loss": 0.7486, + "step": 10007 + }, + { + "epoch": 0.81, + "grad_norm": 13.118682432926416, + "learning_rate": 8.906639045361343e-07, + "loss": 0.6797, + "step": 10008 + }, + { + "epoch": 0.81, + "grad_norm": 5.434523461612034, + "learning_rate": 8.899147167843908e-07, + "loss": 0.6095, + "step": 10009 + }, + { + "epoch": 0.81, + "grad_norm": 8.96529726489475, + "learning_rate": 8.89165813476428e-07, + "loss": 0.7011, + "step": 10010 + }, + { + "epoch": 0.81, + "grad_norm": 4.2660321046956975, + "learning_rate": 8.884171946640746e-07, + "loss": 0.6122, + "step": 10011 + }, + { + "epoch": 0.81, + "grad_norm": 5.399336702770959, + "learning_rate": 8.876688603991407e-07, + "loss": 0.7188, + "step": 10012 + }, + { + "epoch": 0.81, + "grad_norm": 5.137013802557782, + "learning_rate": 8.869208107334131e-07, + "loss": 0.6184, + "step": 10013 + }, + { + "epoch": 0.81, + "grad_norm": 3.5346420944263865, + "learning_rate": 8.861730457186651e-07, + "loss": 0.7026, + "step": 10014 + }, + { + "epoch": 0.81, + "grad_norm": 13.443712801252248, + "learning_rate": 8.85425565406644e-07, + "loss": 0.6192, + "step": 10015 + }, + { + "epoch": 0.81, + "grad_norm": 3.8982771683310378, + "learning_rate": 8.846783698490835e-07, + "loss": 0.6048, + "step": 10016 + }, + { + "epoch": 0.81, + "grad_norm": 5.969153616681193, + "learning_rate": 8.839314590976894e-07, + "loss": 0.6681, + "step": 10017 + }, + { + "epoch": 0.81, + "grad_norm": 5.834030993747555, + "learning_rate": 8.831848332041571e-07, + "loss": 0.5791, + "step": 10018 + }, + { + "epoch": 0.81, + "grad_norm": 4.010729864195363, + "learning_rate": 8.824384922201556e-07, + "loss": 0.7278, + "step": 10019 + }, + { + "epoch": 0.81, + "grad_norm": 3.2691051437356857, + "learning_rate": 8.81692436197335e-07, + "loss": 0.6184, + "step": 10020 + }, + { + "epoch": 0.81, + "grad_norm": 5.275393904335757, + "learning_rate": 8.809466651873305e-07, + "loss": 0.6494, + "step": 10021 + }, + { + "epoch": 0.81, + "grad_norm": 2.6415277801299886, + "learning_rate": 8.802011792417515e-07, + "loss": 0.7521, + "step": 10022 + }, + { + "epoch": 0.81, + "grad_norm": 4.43963955885498, + "learning_rate": 8.794559784121936e-07, + "loss": 0.5756, + "step": 10023 + }, + { + "epoch": 0.81, + "grad_norm": 3.113050618765771, + "learning_rate": 8.787110627502243e-07, + "loss": 0.479, + "step": 10024 + }, + { + "epoch": 0.81, + "grad_norm": 3.11635687300156, + "learning_rate": 8.779664323074011e-07, + "loss": 0.732, + "step": 10025 + }, + { + "epoch": 0.81, + "grad_norm": 3.2315641957310066, + "learning_rate": 8.772220871352549e-07, + "loss": 0.6677, + "step": 10026 + }, + { + "epoch": 0.81, + "grad_norm": 15.812044950390879, + "learning_rate": 8.76478027285298e-07, + "loss": 0.6643, + "step": 10027 + }, + { + "epoch": 0.81, + "grad_norm": 4.934738090176063, + "learning_rate": 8.757342528090268e-07, + "loss": 0.633, + "step": 10028 + }, + { + "epoch": 0.81, + "grad_norm": 2.373447810651177, + "learning_rate": 8.749907637579136e-07, + "loss": 0.6666, + "step": 10029 + }, + { + "epoch": 0.81, + "grad_norm": 3.0448811834514706, + "learning_rate": 8.742475601834133e-07, + "loss": 0.681, + "step": 10030 + }, + { + "epoch": 0.81, + "grad_norm": 3.7632109114332377, + "learning_rate": 8.735046421369581e-07, + "loss": 0.6288, + "step": 10031 + }, + { + "epoch": 0.81, + "grad_norm": 2.970402716900649, + "learning_rate": 8.727620096699658e-07, + "loss": 0.513, + "step": 10032 + }, + { + "epoch": 0.81, + "grad_norm": 2.7748752909336254, + "learning_rate": 8.720196628338278e-07, + "loss": 0.4794, + "step": 10033 + }, + { + "epoch": 0.81, + "grad_norm": 3.5168284070517495, + "learning_rate": 8.71277601679923e-07, + "loss": 0.7166, + "step": 10034 + }, + { + "epoch": 0.82, + "grad_norm": 3.6616367936068, + "learning_rate": 8.705358262596042e-07, + "loss": 0.7462, + "step": 10035 + }, + { + "epoch": 0.82, + "grad_norm": 5.091964927511263, + "learning_rate": 8.697943366242079e-07, + "loss": 0.6282, + "step": 10036 + }, + { + "epoch": 0.82, + "grad_norm": 3.034898650078384, + "learning_rate": 8.690531328250489e-07, + "loss": 0.5632, + "step": 10037 + }, + { + "epoch": 0.82, + "grad_norm": 3.3716848570428213, + "learning_rate": 8.683122149134232e-07, + "loss": 0.8308, + "step": 10038 + }, + { + "epoch": 0.82, + "grad_norm": 9.818542424694925, + "learning_rate": 8.675715829406084e-07, + "loss": 0.6914, + "step": 10039 + }, + { + "epoch": 0.82, + "grad_norm": 8.703049490539447, + "learning_rate": 8.668312369578586e-07, + "loss": 0.6717, + "step": 10040 + }, + { + "epoch": 0.82, + "grad_norm": 17.006251696228503, + "learning_rate": 8.660911770164132e-07, + "loss": 0.6394, + "step": 10041 + }, + { + "epoch": 0.82, + "grad_norm": 3.9283875912049804, + "learning_rate": 8.65351403167487e-07, + "loss": 0.5303, + "step": 10042 + }, + { + "epoch": 0.82, + "grad_norm": 3.956962778763141, + "learning_rate": 8.646119154622784e-07, + "loss": 0.6878, + "step": 10043 + }, + { + "epoch": 0.82, + "grad_norm": 4.723838136464106, + "learning_rate": 8.638727139519637e-07, + "loss": 0.6048, + "step": 10044 + }, + { + "epoch": 0.82, + "grad_norm": 4.642161893501136, + "learning_rate": 8.631337986876987e-07, + "loss": 0.6733, + "step": 10045 + }, + { + "epoch": 0.82, + "grad_norm": 3.3225533693272133, + "learning_rate": 8.62395169720624e-07, + "loss": 0.7483, + "step": 10046 + }, + { + "epoch": 0.82, + "grad_norm": 3.5344546736925704, + "learning_rate": 8.616568271018549e-07, + "loss": 0.7195, + "step": 10047 + }, + { + "epoch": 0.82, + "grad_norm": 2.757047265554324, + "learning_rate": 8.609187708824923e-07, + "loss": 0.7156, + "step": 10048 + }, + { + "epoch": 0.82, + "grad_norm": 2.873518495746369, + "learning_rate": 8.601810011136119e-07, + "loss": 0.6754, + "step": 10049 + }, + { + "epoch": 0.82, + "grad_norm": 6.134221879628959, + "learning_rate": 8.594435178462729e-07, + "loss": 0.595, + "step": 10050 + }, + { + "epoch": 0.82, + "grad_norm": 3.296454296851781, + "learning_rate": 8.587063211315138e-07, + "loss": 0.748, + "step": 10051 + }, + { + "epoch": 0.82, + "grad_norm": 2.9996258929563226, + "learning_rate": 8.579694110203512e-07, + "loss": 0.6052, + "step": 10052 + }, + { + "epoch": 0.82, + "grad_norm": 4.164977674490247, + "learning_rate": 8.572327875637876e-07, + "loss": 0.6497, + "step": 10053 + }, + { + "epoch": 0.82, + "grad_norm": 2.6727321287613868, + "learning_rate": 8.564964508127987e-07, + "loss": 0.6241, + "step": 10054 + }, + { + "epoch": 0.82, + "grad_norm": 6.630090955781284, + "learning_rate": 8.557604008183462e-07, + "loss": 0.6606, + "step": 10055 + }, + { + "epoch": 0.82, + "grad_norm": 5.62966583843766, + "learning_rate": 8.550246376313681e-07, + "loss": 0.6881, + "step": 10056 + }, + { + "epoch": 0.82, + "grad_norm": 4.890753279891143, + "learning_rate": 8.542891613027843e-07, + "loss": 0.6692, + "step": 10057 + }, + { + "epoch": 0.82, + "grad_norm": 4.633094972123343, + "learning_rate": 8.535539718834929e-07, + "loss": 0.7126, + "step": 10058 + }, + { + "epoch": 0.82, + "grad_norm": 3.066437100145936, + "learning_rate": 8.528190694243759e-07, + "loss": 0.7466, + "step": 10059 + }, + { + "epoch": 0.82, + "grad_norm": 3.3593599244267396, + "learning_rate": 8.520844539762918e-07, + "loss": 0.6855, + "step": 10060 + }, + { + "epoch": 0.82, + "grad_norm": 18.54404783434007, + "learning_rate": 8.513501255900802e-07, + "loss": 0.6152, + "step": 10061 + }, + { + "epoch": 0.82, + "grad_norm": 3.779066328568525, + "learning_rate": 8.506160843165629e-07, + "loss": 0.8059, + "step": 10062 + }, + { + "epoch": 0.82, + "grad_norm": 3.1639471885558628, + "learning_rate": 8.498823302065395e-07, + "loss": 0.5708, + "step": 10063 + }, + { + "epoch": 0.82, + "grad_norm": 17.83718328452912, + "learning_rate": 8.491488633107897e-07, + "loss": 0.795, + "step": 10064 + }, + { + "epoch": 0.82, + "grad_norm": 6.003342601866175, + "learning_rate": 8.484156836800739e-07, + "loss": 0.5638, + "step": 10065 + }, + { + "epoch": 0.82, + "grad_norm": 6.333901284514175, + "learning_rate": 8.476827913651337e-07, + "loss": 0.8183, + "step": 10066 + }, + { + "epoch": 0.82, + "grad_norm": 2.6577947985809796, + "learning_rate": 8.469501864166902e-07, + "loss": 0.7356, + "step": 10067 + }, + { + "epoch": 0.82, + "grad_norm": 4.542498708977644, + "learning_rate": 8.462178688854423e-07, + "loss": 0.6353, + "step": 10068 + }, + { + "epoch": 0.82, + "grad_norm": 10.434213955943274, + "learning_rate": 8.454858388220744e-07, + "loss": 0.6221, + "step": 10069 + }, + { + "epoch": 0.82, + "grad_norm": 3.8546958778127074, + "learning_rate": 8.447540962772426e-07, + "loss": 0.6192, + "step": 10070 + }, + { + "epoch": 0.82, + "grad_norm": 2.2740977978185803, + "learning_rate": 8.440226413015928e-07, + "loss": 0.7045, + "step": 10071 + }, + { + "epoch": 0.82, + "grad_norm": 6.322790581030244, + "learning_rate": 8.432914739457432e-07, + "loss": 0.7421, + "step": 10072 + }, + { + "epoch": 0.82, + "grad_norm": 2.878544547668449, + "learning_rate": 8.425605942602977e-07, + "loss": 0.6284, + "step": 10073 + }, + { + "epoch": 0.82, + "grad_norm": 3.324119782403429, + "learning_rate": 8.418300022958359e-07, + "loss": 0.5786, + "step": 10074 + }, + { + "epoch": 0.82, + "grad_norm": 4.791672976030121, + "learning_rate": 8.41099698102919e-07, + "loss": 0.5112, + "step": 10075 + }, + { + "epoch": 0.82, + "grad_norm": 3.136160297104687, + "learning_rate": 8.403696817320922e-07, + "loss": 0.7406, + "step": 10076 + }, + { + "epoch": 0.82, + "grad_norm": 3.066838639519394, + "learning_rate": 8.396399532338722e-07, + "loss": 0.6602, + "step": 10077 + }, + { + "epoch": 0.82, + "grad_norm": 4.688393503306328, + "learning_rate": 8.389105126587644e-07, + "loss": 0.5227, + "step": 10078 + }, + { + "epoch": 0.82, + "grad_norm": 2.9545023242377226, + "learning_rate": 8.38181360057248e-07, + "loss": 0.5662, + "step": 10079 + }, + { + "epoch": 0.82, + "grad_norm": 6.427798172336533, + "learning_rate": 8.37452495479788e-07, + "loss": 0.7295, + "step": 10080 + }, + { + "epoch": 0.82, + "grad_norm": 4.184494303271157, + "learning_rate": 8.36723918976825e-07, + "loss": 0.7596, + "step": 10081 + }, + { + "epoch": 0.82, + "grad_norm": 3.931254765805409, + "learning_rate": 8.359956305987805e-07, + "loss": 0.6669, + "step": 10082 + }, + { + "epoch": 0.82, + "grad_norm": 6.752607265065419, + "learning_rate": 8.352676303960561e-07, + "loss": 0.7041, + "step": 10083 + }, + { + "epoch": 0.82, + "grad_norm": 2.3524220642091382, + "learning_rate": 8.345399184190362e-07, + "loss": 0.6889, + "step": 10084 + }, + { + "epoch": 0.82, + "grad_norm": 3.5619192014658085, + "learning_rate": 8.33812494718082e-07, + "loss": 0.6599, + "step": 10085 + }, + { + "epoch": 0.82, + "grad_norm": 4.870899849411198, + "learning_rate": 8.330853593435345e-07, + "loss": 0.7056, + "step": 10086 + }, + { + "epoch": 0.82, + "grad_norm": 2.398365406564716, + "learning_rate": 8.323585123457179e-07, + "loss": 0.5399, + "step": 10087 + }, + { + "epoch": 0.82, + "grad_norm": 3.100343712333666, + "learning_rate": 8.316319537749328e-07, + "loss": 0.5487, + "step": 10088 + }, + { + "epoch": 0.82, + "grad_norm": 4.064413469988187, + "learning_rate": 8.309056836814656e-07, + "loss": 0.7184, + "step": 10089 + }, + { + "epoch": 0.82, + "grad_norm": 3.494413252339731, + "learning_rate": 8.301797021155733e-07, + "loss": 0.5436, + "step": 10090 + }, + { + "epoch": 0.82, + "grad_norm": 3.1773139635434604, + "learning_rate": 8.294540091275022e-07, + "loss": 0.8159, + "step": 10091 + }, + { + "epoch": 0.82, + "grad_norm": 14.091969817918459, + "learning_rate": 8.28728604767473e-07, + "loss": 0.6656, + "step": 10092 + }, + { + "epoch": 0.82, + "grad_norm": 10.58721334926195, + "learning_rate": 8.280034890856886e-07, + "loss": 0.6846, + "step": 10093 + }, + { + "epoch": 0.82, + "grad_norm": 3.557409951412148, + "learning_rate": 8.272786621323326e-07, + "loss": 0.6401, + "step": 10094 + }, + { + "epoch": 0.82, + "grad_norm": 2.69140007905225, + "learning_rate": 8.265541239575653e-07, + "loss": 0.5285, + "step": 10095 + }, + { + "epoch": 0.82, + "grad_norm": 5.637270821416528, + "learning_rate": 8.258298746115334e-07, + "loss": 0.6103, + "step": 10096 + }, + { + "epoch": 0.82, + "grad_norm": 2.8348275711569757, + "learning_rate": 8.251059141443545e-07, + "loss": 0.6203, + "step": 10097 + }, + { + "epoch": 0.82, + "grad_norm": 3.3500300781312875, + "learning_rate": 8.243822426061348e-07, + "loss": 0.6617, + "step": 10098 + }, + { + "epoch": 0.82, + "grad_norm": 5.414726643368543, + "learning_rate": 8.236588600469558e-07, + "loss": 0.6935, + "step": 10099 + }, + { + "epoch": 0.82, + "grad_norm": 4.03935550027246, + "learning_rate": 8.229357665168791e-07, + "loss": 0.6556, + "step": 10100 + }, + { + "epoch": 0.82, + "grad_norm": 5.480901250831511, + "learning_rate": 8.222129620659497e-07, + "loss": 0.5506, + "step": 10101 + }, + { + "epoch": 0.82, + "grad_norm": 3.177275605231011, + "learning_rate": 8.214904467441887e-07, + "loss": 0.5894, + "step": 10102 + }, + { + "epoch": 0.82, + "grad_norm": 22.891657069117663, + "learning_rate": 8.207682206015988e-07, + "loss": 0.6457, + "step": 10103 + }, + { + "epoch": 0.82, + "grad_norm": 2.850670177976615, + "learning_rate": 8.200462836881612e-07, + "loss": 0.6119, + "step": 10104 + }, + { + "epoch": 0.82, + "grad_norm": 9.147155185340978, + "learning_rate": 8.19324636053841e-07, + "loss": 0.6607, + "step": 10105 + }, + { + "epoch": 0.82, + "grad_norm": 6.187151596075321, + "learning_rate": 8.186032777485803e-07, + "loss": 0.6391, + "step": 10106 + }, + { + "epoch": 0.82, + "grad_norm": 5.270641072708848, + "learning_rate": 8.178822088222992e-07, + "loss": 0.6316, + "step": 10107 + }, + { + "epoch": 0.82, + "grad_norm": 6.023323030820504, + "learning_rate": 8.171614293249036e-07, + "loss": 0.7677, + "step": 10108 + }, + { + "epoch": 0.82, + "grad_norm": 8.19160943650215, + "learning_rate": 8.164409393062744e-07, + "loss": 0.6565, + "step": 10109 + }, + { + "epoch": 0.82, + "grad_norm": 2.9053125784745752, + "learning_rate": 8.157207388162741e-07, + "loss": 0.6957, + "step": 10110 + }, + { + "epoch": 0.82, + "grad_norm": 10.482786565300222, + "learning_rate": 8.150008279047439e-07, + "loss": 0.6524, + "step": 10111 + }, + { + "epoch": 0.82, + "grad_norm": 6.324983642143745, + "learning_rate": 8.142812066215083e-07, + "loss": 0.6409, + "step": 10112 + }, + { + "epoch": 0.82, + "grad_norm": 6.970866170137658, + "learning_rate": 8.135618750163677e-07, + "loss": 0.694, + "step": 10113 + }, + { + "epoch": 0.82, + "grad_norm": 4.172965195778272, + "learning_rate": 8.12842833139107e-07, + "loss": 0.6435, + "step": 10114 + }, + { + "epoch": 0.82, + "grad_norm": 3.2351037799756264, + "learning_rate": 8.12124081039486e-07, + "loss": 0.6842, + "step": 10115 + }, + { + "epoch": 0.82, + "grad_norm": 6.935783492440107, + "learning_rate": 8.114056187672481e-07, + "loss": 0.6755, + "step": 10116 + }, + { + "epoch": 0.82, + "grad_norm": 4.096022214321478, + "learning_rate": 8.106874463721143e-07, + "loss": 0.633, + "step": 10117 + }, + { + "epoch": 0.82, + "grad_norm": 5.154422332599943, + "learning_rate": 8.099695639037869e-07, + "loss": 0.7158, + "step": 10118 + }, + { + "epoch": 0.82, + "grad_norm": 2.6058249391847434, + "learning_rate": 8.09251971411949e-07, + "loss": 0.5274, + "step": 10119 + }, + { + "epoch": 0.82, + "grad_norm": 2.815111526971434, + "learning_rate": 8.085346689462609e-07, + "loss": 0.6454, + "step": 10120 + }, + { + "epoch": 0.82, + "grad_norm": 3.7593263902721223, + "learning_rate": 8.078176565563661e-07, + "loss": 0.6661, + "step": 10121 + }, + { + "epoch": 0.82, + "grad_norm": 4.994278030161245, + "learning_rate": 8.071009342918861e-07, + "loss": 0.6275, + "step": 10122 + }, + { + "epoch": 0.82, + "grad_norm": 4.208489094655455, + "learning_rate": 8.063845022024219e-07, + "loss": 0.6778, + "step": 10123 + }, + { + "epoch": 0.82, + "grad_norm": 4.296257192510657, + "learning_rate": 8.056683603375553e-07, + "loss": 0.6065, + "step": 10124 + }, + { + "epoch": 0.82, + "grad_norm": 6.979817946677779, + "learning_rate": 8.049525087468469e-07, + "loss": 0.6094, + "step": 10125 + }, + { + "epoch": 0.82, + "grad_norm": 2.9296188092921005, + "learning_rate": 8.042369474798401e-07, + "loss": 0.6018, + "step": 10126 + }, + { + "epoch": 0.82, + "grad_norm": 3.3887398248505303, + "learning_rate": 8.035216765860537e-07, + "loss": 0.5247, + "step": 10127 + }, + { + "epoch": 0.82, + "grad_norm": 4.436193239621094, + "learning_rate": 8.028066961149921e-07, + "loss": 0.6429, + "step": 10128 + }, + { + "epoch": 0.82, + "grad_norm": 7.551590110345325, + "learning_rate": 8.020920061161352e-07, + "loss": 0.6206, + "step": 10129 + }, + { + "epoch": 0.82, + "grad_norm": 2.496423160290504, + "learning_rate": 8.013776066389434e-07, + "loss": 0.558, + "step": 10130 + }, + { + "epoch": 0.82, + "grad_norm": 3.670744597907161, + "learning_rate": 8.006634977328575e-07, + "loss": 0.706, + "step": 10131 + }, + { + "epoch": 0.82, + "grad_norm": 5.848619146668664, + "learning_rate": 7.999496794472977e-07, + "loss": 0.7227, + "step": 10132 + }, + { + "epoch": 0.82, + "grad_norm": 9.869707989799299, + "learning_rate": 7.992361518316677e-07, + "loss": 0.7191, + "step": 10133 + }, + { + "epoch": 0.82, + "grad_norm": 8.498997857008085, + "learning_rate": 7.98522914935344e-07, + "loss": 0.6804, + "step": 10134 + }, + { + "epoch": 0.82, + "grad_norm": 6.126768132874517, + "learning_rate": 7.978099688076912e-07, + "loss": 0.5459, + "step": 10135 + }, + { + "epoch": 0.82, + "grad_norm": 13.289793451007261, + "learning_rate": 7.970973134980475e-07, + "loss": 0.6822, + "step": 10136 + }, + { + "epoch": 0.82, + "grad_norm": 3.864888186080762, + "learning_rate": 7.963849490557335e-07, + "loss": 0.4448, + "step": 10137 + }, + { + "epoch": 0.82, + "grad_norm": 1.9225289849221263, + "learning_rate": 7.956728755300474e-07, + "loss": 0.5732, + "step": 10138 + }, + { + "epoch": 0.82, + "grad_norm": 5.519012125235013, + "learning_rate": 7.949610929702728e-07, + "loss": 0.6365, + "step": 10139 + }, + { + "epoch": 0.82, + "grad_norm": 4.75023876574547, + "learning_rate": 7.942496014256673e-07, + "loss": 0.7651, + "step": 10140 + }, + { + "epoch": 0.82, + "grad_norm": 4.231250425666945, + "learning_rate": 7.9353840094547e-07, + "loss": 0.7076, + "step": 10141 + }, + { + "epoch": 0.82, + "grad_norm": 7.2890702116742485, + "learning_rate": 7.928274915789035e-07, + "loss": 0.6996, + "step": 10142 + }, + { + "epoch": 0.82, + "grad_norm": 6.20125422483117, + "learning_rate": 7.921168733751633e-07, + "loss": 0.6531, + "step": 10143 + }, + { + "epoch": 0.82, + "grad_norm": 2.581696168456722, + "learning_rate": 7.914065463834314e-07, + "loss": 0.6917, + "step": 10144 + }, + { + "epoch": 0.82, + "grad_norm": 5.303097198431156, + "learning_rate": 7.906965106528647e-07, + "loss": 0.6645, + "step": 10145 + }, + { + "epoch": 0.82, + "grad_norm": 3.6990325959823056, + "learning_rate": 7.899867662326049e-07, + "loss": 0.7903, + "step": 10146 + }, + { + "epoch": 0.82, + "grad_norm": 7.150008280987295, + "learning_rate": 7.89277313171769e-07, + "loss": 0.6497, + "step": 10147 + }, + { + "epoch": 0.82, + "grad_norm": 4.100901674782403, + "learning_rate": 7.885681515194549e-07, + "loss": 0.5572, + "step": 10148 + }, + { + "epoch": 0.82, + "grad_norm": 3.1770106590830047, + "learning_rate": 7.878592813247443e-07, + "loss": 0.6542, + "step": 10149 + }, + { + "epoch": 0.82, + "grad_norm": 3.5956830684505516, + "learning_rate": 7.871507026366909e-07, + "loss": 0.8315, + "step": 10150 + }, + { + "epoch": 0.82, + "grad_norm": 4.22113903684988, + "learning_rate": 7.864424155043366e-07, + "loss": 0.5046, + "step": 10151 + }, + { + "epoch": 0.82, + "grad_norm": 2.3724472663445773, + "learning_rate": 7.857344199766964e-07, + "loss": 0.6814, + "step": 10152 + }, + { + "epoch": 0.82, + "grad_norm": 4.874729327220722, + "learning_rate": 7.850267161027709e-07, + "loss": 0.8528, + "step": 10153 + }, + { + "epoch": 0.82, + "grad_norm": 5.4719900299038, + "learning_rate": 7.843193039315361e-07, + "loss": 0.6331, + "step": 10154 + }, + { + "epoch": 0.82, + "grad_norm": 4.86477705255406, + "learning_rate": 7.836121835119498e-07, + "loss": 0.6353, + "step": 10155 + }, + { + "epoch": 0.82, + "grad_norm": 4.007241314171843, + "learning_rate": 7.829053548929488e-07, + "loss": 0.6617, + "step": 10156 + }, + { + "epoch": 0.82, + "grad_norm": 3.843163189941457, + "learning_rate": 7.821988181234497e-07, + "loss": 0.6907, + "step": 10157 + }, + { + "epoch": 0.83, + "grad_norm": 2.824561973661893, + "learning_rate": 7.814925732523504e-07, + "loss": 0.6195, + "step": 10158 + }, + { + "epoch": 0.83, + "grad_norm": 3.3628067254773377, + "learning_rate": 7.807866203285258e-07, + "loss": 0.6719, + "step": 10159 + }, + { + "epoch": 0.83, + "grad_norm": 2.7102201205108396, + "learning_rate": 7.800809594008346e-07, + "loss": 0.6202, + "step": 10160 + }, + { + "epoch": 0.83, + "grad_norm": 8.493171717989103, + "learning_rate": 7.793755905181111e-07, + "loss": 0.6924, + "step": 10161 + }, + { + "epoch": 0.83, + "grad_norm": 8.186740370391155, + "learning_rate": 7.78670513729174e-07, + "loss": 0.7066, + "step": 10162 + }, + { + "epoch": 0.83, + "grad_norm": 3.876260709066584, + "learning_rate": 7.779657290828146e-07, + "loss": 0.6074, + "step": 10163 + }, + { + "epoch": 0.83, + "grad_norm": 3.619848332205266, + "learning_rate": 7.772612366278121e-07, + "loss": 0.7117, + "step": 10164 + }, + { + "epoch": 0.83, + "grad_norm": 3.9413923770978916, + "learning_rate": 7.76557036412921e-07, + "loss": 0.6303, + "step": 10165 + }, + { + "epoch": 0.83, + "grad_norm": 3.4733039403608457, + "learning_rate": 7.758531284868742e-07, + "loss": 0.5753, + "step": 10166 + }, + { + "epoch": 0.83, + "grad_norm": 13.141796085533507, + "learning_rate": 7.7514951289839e-07, + "loss": 0.8038, + "step": 10167 + }, + { + "epoch": 0.83, + "grad_norm": 6.029904041482119, + "learning_rate": 7.744461896961598e-07, + "loss": 0.5812, + "step": 10168 + }, + { + "epoch": 0.83, + "grad_norm": 3.519758699994979, + "learning_rate": 7.737431589288619e-07, + "loss": 0.7023, + "step": 10169 + }, + { + "epoch": 0.83, + "grad_norm": 3.3300568765930016, + "learning_rate": 7.730404206451459e-07, + "loss": 0.6397, + "step": 10170 + }, + { + "epoch": 0.83, + "grad_norm": 5.680880422768918, + "learning_rate": 7.723379748936494e-07, + "loss": 0.5487, + "step": 10171 + }, + { + "epoch": 0.83, + "grad_norm": 4.822683081387333, + "learning_rate": 7.716358217229841e-07, + "loss": 0.6671, + "step": 10172 + }, + { + "epoch": 0.83, + "grad_norm": 3.7748097314750066, + "learning_rate": 7.709339611817429e-07, + "loss": 0.7738, + "step": 10173 + }, + { + "epoch": 0.83, + "grad_norm": 3.4303245913987572, + "learning_rate": 7.702323933185013e-07, + "loss": 0.7037, + "step": 10174 + }, + { + "epoch": 0.83, + "grad_norm": 2.8167239006212763, + "learning_rate": 7.695311181818111e-07, + "loss": 0.7308, + "step": 10175 + }, + { + "epoch": 0.83, + "grad_norm": 3.0217795272544095, + "learning_rate": 7.688301358202043e-07, + "loss": 0.7025, + "step": 10176 + }, + { + "epoch": 0.83, + "grad_norm": 9.061168689610298, + "learning_rate": 7.681294462821925e-07, + "loss": 0.5613, + "step": 10177 + }, + { + "epoch": 0.83, + "grad_norm": 5.125386807553893, + "learning_rate": 7.674290496162707e-07, + "loss": 0.5784, + "step": 10178 + }, + { + "epoch": 0.83, + "grad_norm": 3.073319161000866, + "learning_rate": 7.667289458709088e-07, + "loss": 0.6692, + "step": 10179 + }, + { + "epoch": 0.83, + "grad_norm": 23.21586193756055, + "learning_rate": 7.660291350945581e-07, + "loss": 0.6006, + "step": 10180 + }, + { + "epoch": 0.83, + "grad_norm": 4.410568116311697, + "learning_rate": 7.653296173356512e-07, + "loss": 0.5988, + "step": 10181 + }, + { + "epoch": 0.83, + "grad_norm": 3.4146482517699717, + "learning_rate": 7.646303926425986e-07, + "loss": 0.6404, + "step": 10182 + }, + { + "epoch": 0.83, + "grad_norm": 4.819260532191193, + "learning_rate": 7.639314610637905e-07, + "loss": 0.7422, + "step": 10183 + }, + { + "epoch": 0.83, + "grad_norm": 4.126538264960124, + "learning_rate": 7.632328226475971e-07, + "loss": 0.7315, + "step": 10184 + }, + { + "epoch": 0.83, + "grad_norm": 3.0229372395694156, + "learning_rate": 7.625344774423704e-07, + "loss": 0.4861, + "step": 10185 + }, + { + "epoch": 0.83, + "grad_norm": 5.454984074248593, + "learning_rate": 7.618364254964378e-07, + "loss": 0.535, + "step": 10186 + }, + { + "epoch": 0.83, + "grad_norm": 4.5718627092307225, + "learning_rate": 7.611386668581117e-07, + "loss": 0.6587, + "step": 10187 + }, + { + "epoch": 0.83, + "grad_norm": 3.075648257710709, + "learning_rate": 7.604412015756796e-07, + "loss": 0.7122, + "step": 10188 + }, + { + "epoch": 0.83, + "grad_norm": 3.203526440911372, + "learning_rate": 7.597440296974112e-07, + "loss": 0.5986, + "step": 10189 + }, + { + "epoch": 0.83, + "grad_norm": 6.188141744831746, + "learning_rate": 7.590471512715547e-07, + "loss": 0.4143, + "step": 10190 + }, + { + "epoch": 0.83, + "grad_norm": 3.781234348822351, + "learning_rate": 7.58350566346337e-07, + "loss": 0.5905, + "step": 10191 + }, + { + "epoch": 0.83, + "grad_norm": 13.772789176008088, + "learning_rate": 7.576542749699695e-07, + "loss": 0.5904, + "step": 10192 + }, + { + "epoch": 0.83, + "grad_norm": 2.873932427968937, + "learning_rate": 7.569582771906364e-07, + "loss": 0.6637, + "step": 10193 + }, + { + "epoch": 0.83, + "grad_norm": 11.71370104878234, + "learning_rate": 7.562625730565088e-07, + "loss": 0.753, + "step": 10194 + }, + { + "epoch": 0.83, + "grad_norm": 5.5976354786786295, + "learning_rate": 7.555671626157312e-07, + "loss": 0.6404, + "step": 10195 + }, + { + "epoch": 0.83, + "grad_norm": 3.3844331630609803, + "learning_rate": 7.548720459164316e-07, + "loss": 0.7377, + "step": 10196 + }, + { + "epoch": 0.83, + "grad_norm": 5.307981339011157, + "learning_rate": 7.541772230067157e-07, + "loss": 0.6314, + "step": 10197 + }, + { + "epoch": 0.83, + "grad_norm": 5.061631733208505, + "learning_rate": 7.53482693934669e-07, + "loss": 0.7617, + "step": 10198 + }, + { + "epoch": 0.83, + "grad_norm": 4.451421598327279, + "learning_rate": 7.527884587483592e-07, + "loss": 0.668, + "step": 10199 + }, + { + "epoch": 0.83, + "grad_norm": 4.771655132737514, + "learning_rate": 7.520945174958294e-07, + "loss": 0.6895, + "step": 10200 + }, + { + "epoch": 0.83, + "grad_norm": 22.414786560626762, + "learning_rate": 7.514008702251068e-07, + "loss": 0.5476, + "step": 10201 + }, + { + "epoch": 0.83, + "grad_norm": 4.674627558912777, + "learning_rate": 7.50707516984196e-07, + "loss": 0.5944, + "step": 10202 + }, + { + "epoch": 0.83, + "grad_norm": 3.854452834521621, + "learning_rate": 7.500144578210805e-07, + "loss": 0.6728, + "step": 10203 + }, + { + "epoch": 0.83, + "grad_norm": 4.17955301544156, + "learning_rate": 7.49321692783725e-07, + "loss": 0.7063, + "step": 10204 + }, + { + "epoch": 0.83, + "grad_norm": 7.769539727100411, + "learning_rate": 7.486292219200714e-07, + "loss": 0.7398, + "step": 10205 + }, + { + "epoch": 0.83, + "grad_norm": 3.1111028518844046, + "learning_rate": 7.47937045278046e-07, + "loss": 0.7386, + "step": 10206 + }, + { + "epoch": 0.83, + "grad_norm": 3.904299973843954, + "learning_rate": 7.472451629055483e-07, + "loss": 0.6721, + "step": 10207 + }, + { + "epoch": 0.83, + "grad_norm": 4.211919171774436, + "learning_rate": 7.46553574850466e-07, + "loss": 0.8329, + "step": 10208 + }, + { + "epoch": 0.83, + "grad_norm": 6.216305997681818, + "learning_rate": 7.458622811606553e-07, + "loss": 0.565, + "step": 10209 + }, + { + "epoch": 0.83, + "grad_norm": 2.968840945883631, + "learning_rate": 7.451712818839629e-07, + "loss": 0.5918, + "step": 10210 + }, + { + "epoch": 0.83, + "grad_norm": 4.570607731068357, + "learning_rate": 7.444805770682068e-07, + "loss": 0.7387, + "step": 10211 + }, + { + "epoch": 0.83, + "grad_norm": 2.710315573917204, + "learning_rate": 7.437901667611908e-07, + "loss": 0.7255, + "step": 10212 + }, + { + "epoch": 0.83, + "grad_norm": 2.489205059000977, + "learning_rate": 7.431000510106945e-07, + "loss": 0.5279, + "step": 10213 + }, + { + "epoch": 0.83, + "grad_norm": 4.545759072508146, + "learning_rate": 7.424102298644775e-07, + "loss": 0.75, + "step": 10214 + }, + { + "epoch": 0.83, + "grad_norm": 3.5425442188725946, + "learning_rate": 7.417207033702827e-07, + "loss": 0.4542, + "step": 10215 + }, + { + "epoch": 0.83, + "grad_norm": 6.963534589952645, + "learning_rate": 7.410314715758255e-07, + "loss": 0.64, + "step": 10216 + }, + { + "epoch": 0.83, + "grad_norm": 4.365877125503975, + "learning_rate": 7.403425345288079e-07, + "loss": 0.6509, + "step": 10217 + }, + { + "epoch": 0.83, + "grad_norm": 3.354505495325425, + "learning_rate": 7.39653892276907e-07, + "loss": 0.5866, + "step": 10218 + }, + { + "epoch": 0.83, + "grad_norm": 2.6790065935553384, + "learning_rate": 7.389655448677834e-07, + "loss": 0.5339, + "step": 10219 + }, + { + "epoch": 0.83, + "grad_norm": 3.2477589082173246, + "learning_rate": 7.382774923490738e-07, + "loss": 0.6434, + "step": 10220 + }, + { + "epoch": 0.83, + "grad_norm": 5.067106079956164, + "learning_rate": 7.375897347683942e-07, + "loss": 0.532, + "step": 10221 + }, + { + "epoch": 0.83, + "grad_norm": 3.705076837922779, + "learning_rate": 7.36902272173346e-07, + "loss": 0.6304, + "step": 10222 + }, + { + "epoch": 0.83, + "grad_norm": 5.287578836010013, + "learning_rate": 7.362151046115007e-07, + "loss": 0.5771, + "step": 10223 + }, + { + "epoch": 0.83, + "grad_norm": 3.2333213765429933, + "learning_rate": 7.355282321304185e-07, + "loss": 0.7157, + "step": 10224 + }, + { + "epoch": 0.83, + "grad_norm": 5.287089846042632, + "learning_rate": 7.348416547776327e-07, + "loss": 0.6911, + "step": 10225 + }, + { + "epoch": 0.83, + "grad_norm": 8.34536524856903, + "learning_rate": 7.341553726006611e-07, + "loss": 0.5208, + "step": 10226 + }, + { + "epoch": 0.83, + "grad_norm": 4.067430452718088, + "learning_rate": 7.334693856469982e-07, + "loss": 0.6789, + "step": 10227 + }, + { + "epoch": 0.83, + "grad_norm": 8.551517404564226, + "learning_rate": 7.327836939641175e-07, + "loss": 0.5792, + "step": 10228 + }, + { + "epoch": 0.83, + "grad_norm": 5.678720595104767, + "learning_rate": 7.320982975994739e-07, + "loss": 0.6594, + "step": 10229 + }, + { + "epoch": 0.83, + "grad_norm": 5.080679204551104, + "learning_rate": 7.314131966005e-07, + "loss": 0.7425, + "step": 10230 + }, + { + "epoch": 0.83, + "grad_norm": 6.4504270401508155, + "learning_rate": 7.307283910146118e-07, + "loss": 0.5143, + "step": 10231 + }, + { + "epoch": 0.83, + "grad_norm": 5.955054458292446, + "learning_rate": 7.300438808891985e-07, + "loss": 0.5932, + "step": 10232 + }, + { + "epoch": 0.83, + "grad_norm": 5.943403200607857, + "learning_rate": 7.293596662716362e-07, + "loss": 0.6345, + "step": 10233 + }, + { + "epoch": 0.83, + "grad_norm": 3.6936250023573356, + "learning_rate": 7.286757472092749e-07, + "loss": 0.6373, + "step": 10234 + }, + { + "epoch": 0.83, + "grad_norm": 2.87598512920276, + "learning_rate": 7.279921237494464e-07, + "loss": 0.7345, + "step": 10235 + }, + { + "epoch": 0.83, + "grad_norm": 4.4833261912587865, + "learning_rate": 7.273087959394609e-07, + "loss": 0.6137, + "step": 10236 + }, + { + "epoch": 0.83, + "grad_norm": 2.991248061720026, + "learning_rate": 7.266257638266106e-07, + "loss": 0.6686, + "step": 10237 + }, + { + "epoch": 0.83, + "grad_norm": 4.524742297759488, + "learning_rate": 7.259430274581647e-07, + "loss": 0.7008, + "step": 10238 + }, + { + "epoch": 0.83, + "grad_norm": 7.39141564428959, + "learning_rate": 7.252605868813722e-07, + "loss": 0.6848, + "step": 10239 + }, + { + "epoch": 0.83, + "grad_norm": 5.836627278870458, + "learning_rate": 7.245784421434643e-07, + "loss": 0.6215, + "step": 10240 + }, + { + "epoch": 0.83, + "grad_norm": 3.1396570859644592, + "learning_rate": 7.23896593291647e-07, + "loss": 0.5527, + "step": 10241 + }, + { + "epoch": 0.83, + "grad_norm": 4.097785791460001, + "learning_rate": 7.232150403731126e-07, + "loss": 0.7235, + "step": 10242 + }, + { + "epoch": 0.83, + "grad_norm": 3.688406583616061, + "learning_rate": 7.225337834350237e-07, + "loss": 0.6325, + "step": 10243 + }, + { + "epoch": 0.83, + "grad_norm": 3.648541327707882, + "learning_rate": 7.218528225245314e-07, + "loss": 0.6635, + "step": 10244 + }, + { + "epoch": 0.83, + "grad_norm": 26.55656057431152, + "learning_rate": 7.211721576887609e-07, + "loss": 0.72, + "step": 10245 + }, + { + "epoch": 0.83, + "grad_norm": 5.880070357657142, + "learning_rate": 7.204917889748181e-07, + "loss": 0.7435, + "step": 10246 + }, + { + "epoch": 0.83, + "grad_norm": 6.262622038231014, + "learning_rate": 7.198117164297908e-07, + "loss": 0.6219, + "step": 10247 + }, + { + "epoch": 0.83, + "grad_norm": 4.607431292423183, + "learning_rate": 7.191319401007423e-07, + "loss": 0.7331, + "step": 10248 + }, + { + "epoch": 0.83, + "grad_norm": 3.435736569694444, + "learning_rate": 7.184524600347187e-07, + "loss": 0.7149, + "step": 10249 + }, + { + "epoch": 0.83, + "grad_norm": 4.653171952035075, + "learning_rate": 7.177732762787426e-07, + "loss": 0.72, + "step": 10250 + }, + { + "epoch": 0.83, + "grad_norm": 4.977890959625817, + "learning_rate": 7.170943888798199e-07, + "loss": 0.8009, + "step": 10251 + }, + { + "epoch": 0.83, + "grad_norm": 2.959268097093111, + "learning_rate": 7.164157978849329e-07, + "loss": 0.6028, + "step": 10252 + }, + { + "epoch": 0.83, + "grad_norm": 3.018126538434741, + "learning_rate": 7.15737503341043e-07, + "loss": 0.669, + "step": 10253 + }, + { + "epoch": 0.83, + "grad_norm": 4.264711722504301, + "learning_rate": 7.150595052950954e-07, + "loss": 0.6819, + "step": 10254 + }, + { + "epoch": 0.83, + "grad_norm": 3.7745904981180027, + "learning_rate": 7.143818037940098e-07, + "loss": 0.5662, + "step": 10255 + }, + { + "epoch": 0.83, + "grad_norm": 4.463814512383072, + "learning_rate": 7.137043988846881e-07, + "loss": 0.6456, + "step": 10256 + }, + { + "epoch": 0.83, + "grad_norm": 3.023813075919316, + "learning_rate": 7.130272906140095e-07, + "loss": 0.6044, + "step": 10257 + }, + { + "epoch": 0.83, + "grad_norm": 7.706398060493753, + "learning_rate": 7.123504790288371e-07, + "loss": 0.7286, + "step": 10258 + }, + { + "epoch": 0.83, + "grad_norm": 4.733282957754952, + "learning_rate": 7.116739641760085e-07, + "loss": 0.6908, + "step": 10259 + }, + { + "epoch": 0.83, + "grad_norm": 4.712365755064234, + "learning_rate": 7.109977461023415e-07, + "loss": 0.556, + "step": 10260 + }, + { + "epoch": 0.83, + "grad_norm": 2.5456775869275616, + "learning_rate": 7.103218248546379e-07, + "loss": 0.5377, + "step": 10261 + }, + { + "epoch": 0.83, + "grad_norm": 4.281573303018533, + "learning_rate": 7.09646200479674e-07, + "loss": 0.6947, + "step": 10262 + }, + { + "epoch": 0.83, + "grad_norm": 8.865958573238945, + "learning_rate": 7.089708730242067e-07, + "loss": 0.7508, + "step": 10263 + }, + { + "epoch": 0.83, + "grad_norm": 4.342278155483451, + "learning_rate": 7.082958425349734e-07, + "loss": 0.567, + "step": 10264 + }, + { + "epoch": 0.83, + "grad_norm": 5.786765807652377, + "learning_rate": 7.076211090586909e-07, + "loss": 0.6085, + "step": 10265 + }, + { + "epoch": 0.83, + "grad_norm": 5.77668001919161, + "learning_rate": 7.069466726420543e-07, + "loss": 0.6629, + "step": 10266 + }, + { + "epoch": 0.83, + "grad_norm": 2.784683119806058, + "learning_rate": 7.062725333317399e-07, + "loss": 0.6982, + "step": 10267 + }, + { + "epoch": 0.83, + "grad_norm": 4.342476773267422, + "learning_rate": 7.055986911744017e-07, + "loss": 0.5913, + "step": 10268 + }, + { + "epoch": 0.83, + "grad_norm": 6.940907527501119, + "learning_rate": 7.04925146216674e-07, + "loss": 0.8289, + "step": 10269 + }, + { + "epoch": 0.83, + "grad_norm": 3.2146721293793554, + "learning_rate": 7.042518985051705e-07, + "loss": 0.5996, + "step": 10270 + }, + { + "epoch": 0.83, + "grad_norm": 3.93633227172791, + "learning_rate": 7.035789480864824e-07, + "loss": 0.564, + "step": 10271 + }, + { + "epoch": 0.83, + "grad_norm": 2.8101480225352993, + "learning_rate": 7.029062950071847e-07, + "loss": 0.5076, + "step": 10272 + }, + { + "epoch": 0.83, + "grad_norm": 5.08787573823197, + "learning_rate": 7.022339393138272e-07, + "loss": 0.6075, + "step": 10273 + }, + { + "epoch": 0.83, + "grad_norm": 3.2671836232172513, + "learning_rate": 7.015618810529428e-07, + "loss": 0.7458, + "step": 10274 + }, + { + "epoch": 0.83, + "grad_norm": 3.9281403047501344, + "learning_rate": 7.008901202710416e-07, + "loss": 0.8771, + "step": 10275 + }, + { + "epoch": 0.83, + "grad_norm": 3.1279623744196385, + "learning_rate": 7.002186570146141e-07, + "loss": 0.5684, + "step": 10276 + }, + { + "epoch": 0.83, + "grad_norm": 4.0917047361638375, + "learning_rate": 6.995474913301287e-07, + "loss": 0.7017, + "step": 10277 + }, + { + "epoch": 0.83, + "grad_norm": 4.481461830892188, + "learning_rate": 6.988766232640337e-07, + "loss": 0.6429, + "step": 10278 + }, + { + "epoch": 0.83, + "grad_norm": 6.993127680316164, + "learning_rate": 6.982060528627594e-07, + "loss": 0.8608, + "step": 10279 + }, + { + "epoch": 0.83, + "grad_norm": 6.8092471373883745, + "learning_rate": 6.975357801727117e-07, + "loss": 0.6497, + "step": 10280 + }, + { + "epoch": 0.84, + "grad_norm": 2.678803408196946, + "learning_rate": 6.968658052402805e-07, + "loss": 0.5866, + "step": 10281 + }, + { + "epoch": 0.84, + "grad_norm": 9.344964932791905, + "learning_rate": 6.961961281118285e-07, + "loss": 0.7114, + "step": 10282 + }, + { + "epoch": 0.84, + "grad_norm": 4.401805319053445, + "learning_rate": 6.955267488337048e-07, + "loss": 0.6884, + "step": 10283 + }, + { + "epoch": 0.84, + "grad_norm": 5.788684589390815, + "learning_rate": 6.948576674522317e-07, + "loss": 0.5688, + "step": 10284 + }, + { + "epoch": 0.84, + "grad_norm": 5.665107987038337, + "learning_rate": 6.941888840137162e-07, + "loss": 0.5753, + "step": 10285 + }, + { + "epoch": 0.84, + "grad_norm": 5.291629653143022, + "learning_rate": 6.935203985644423e-07, + "loss": 0.7368, + "step": 10286 + }, + { + "epoch": 0.84, + "grad_norm": 5.4084795505447065, + "learning_rate": 6.928522111506713e-07, + "loss": 0.6546, + "step": 10287 + }, + { + "epoch": 0.84, + "grad_norm": 9.345589560433593, + "learning_rate": 6.921843218186492e-07, + "loss": 0.7892, + "step": 10288 + }, + { + "epoch": 0.84, + "grad_norm": 3.6173610479553737, + "learning_rate": 6.915167306145943e-07, + "loss": 0.6149, + "step": 10289 + }, + { + "epoch": 0.84, + "grad_norm": 3.2295157064303006, + "learning_rate": 6.908494375847114e-07, + "loss": 0.7342, + "step": 10290 + }, + { + "epoch": 0.84, + "grad_norm": 3.854541269762568, + "learning_rate": 6.901824427751785e-07, + "loss": 0.6512, + "step": 10291 + }, + { + "epoch": 0.84, + "grad_norm": 5.720851724348029, + "learning_rate": 6.895157462321589e-07, + "loss": 0.6955, + "step": 10292 + }, + { + "epoch": 0.84, + "grad_norm": 15.804938239872524, + "learning_rate": 6.88849348001791e-07, + "loss": 0.6454, + "step": 10293 + }, + { + "epoch": 0.84, + "grad_norm": 4.651269146874924, + "learning_rate": 6.88183248130192e-07, + "loss": 0.7272, + "step": 10294 + }, + { + "epoch": 0.84, + "grad_norm": 3.8057621411456464, + "learning_rate": 6.875174466634638e-07, + "loss": 0.6901, + "step": 10295 + }, + { + "epoch": 0.84, + "grad_norm": 2.978275668062095, + "learning_rate": 6.868519436476795e-07, + "loss": 0.7389, + "step": 10296 + }, + { + "epoch": 0.84, + "grad_norm": 2.9691941368260433, + "learning_rate": 6.861867391289e-07, + "loss": 0.6854, + "step": 10297 + }, + { + "epoch": 0.84, + "grad_norm": 6.43910238964205, + "learning_rate": 6.855218331531594e-07, + "loss": 0.8386, + "step": 10298 + }, + { + "epoch": 0.84, + "grad_norm": 3.2261013951058333, + "learning_rate": 6.848572257664749e-07, + "loss": 0.7331, + "step": 10299 + }, + { + "epoch": 0.84, + "grad_norm": 5.3338752568327, + "learning_rate": 6.841929170148403e-07, + "loss": 0.6453, + "step": 10300 + }, + { + "epoch": 0.84, + "grad_norm": 8.512909804575376, + "learning_rate": 6.835289069442308e-07, + "loss": 0.5218, + "step": 10301 + }, + { + "epoch": 0.84, + "grad_norm": 2.874504027138462, + "learning_rate": 6.828651956006e-07, + "loss": 0.7418, + "step": 10302 + }, + { + "epoch": 0.84, + "grad_norm": 3.8434801054176924, + "learning_rate": 6.822017830298788e-07, + "loss": 0.5339, + "step": 10303 + }, + { + "epoch": 0.84, + "grad_norm": 14.45940776824948, + "learning_rate": 6.815386692779829e-07, + "loss": 0.6591, + "step": 10304 + }, + { + "epoch": 0.84, + "grad_norm": 8.933732830244342, + "learning_rate": 6.808758543908012e-07, + "loss": 0.6542, + "step": 10305 + }, + { + "epoch": 0.84, + "grad_norm": 3.334997529908435, + "learning_rate": 6.802133384142068e-07, + "loss": 0.6655, + "step": 10306 + }, + { + "epoch": 0.84, + "grad_norm": 8.224923364123644, + "learning_rate": 6.795511213940492e-07, + "loss": 0.6344, + "step": 10307 + }, + { + "epoch": 0.84, + "grad_norm": 5.425023392372662, + "learning_rate": 6.788892033761579e-07, + "loss": 0.8, + "step": 10308 + }, + { + "epoch": 0.84, + "grad_norm": 4.057378608699342, + "learning_rate": 6.782275844063402e-07, + "loss": 0.6748, + "step": 10309 + }, + { + "epoch": 0.84, + "grad_norm": 3.530998033490358, + "learning_rate": 6.775662645303871e-07, + "loss": 0.5981, + "step": 10310 + }, + { + "epoch": 0.84, + "grad_norm": 3.0777349118971693, + "learning_rate": 6.769052437940649e-07, + "loss": 0.7284, + "step": 10311 + }, + { + "epoch": 0.84, + "grad_norm": 3.317813205319253, + "learning_rate": 6.762445222431191e-07, + "loss": 0.6273, + "step": 10312 + }, + { + "epoch": 0.84, + "grad_norm": 4.7540914251439546, + "learning_rate": 6.755840999232776e-07, + "loss": 0.7435, + "step": 10313 + }, + { + "epoch": 0.84, + "grad_norm": 3.950362125580203, + "learning_rate": 6.749239768802457e-07, + "loss": 0.6815, + "step": 10314 + }, + { + "epoch": 0.84, + "grad_norm": 3.394164293821075, + "learning_rate": 6.742641531597077e-07, + "loss": 0.6544, + "step": 10315 + }, + { + "epoch": 0.84, + "grad_norm": 2.9423269808752077, + "learning_rate": 6.736046288073261e-07, + "loss": 0.6683, + "step": 10316 + }, + { + "epoch": 0.84, + "grad_norm": 2.926102349419464, + "learning_rate": 6.729454038687461e-07, + "loss": 0.6786, + "step": 10317 + }, + { + "epoch": 0.84, + "grad_norm": 6.1844165819940145, + "learning_rate": 6.722864783895899e-07, + "loss": 0.6564, + "step": 10318 + }, + { + "epoch": 0.84, + "grad_norm": 7.913077866984318, + "learning_rate": 6.716278524154579e-07, + "loss": 0.6802, + "step": 10319 + }, + { + "epoch": 0.84, + "grad_norm": 5.347027814148411, + "learning_rate": 6.70969525991933e-07, + "loss": 0.627, + "step": 10320 + }, + { + "epoch": 0.84, + "grad_norm": 3.8244963417087936, + "learning_rate": 6.703114991645754e-07, + "loss": 0.6633, + "step": 10321 + }, + { + "epoch": 0.84, + "grad_norm": 6.193735581475296, + "learning_rate": 6.696537719789231e-07, + "loss": 0.7494, + "step": 10322 + }, + { + "epoch": 0.84, + "grad_norm": 3.1135273151725276, + "learning_rate": 6.689963444804954e-07, + "loss": 0.6579, + "step": 10323 + }, + { + "epoch": 0.84, + "grad_norm": 2.866006617273913, + "learning_rate": 6.683392167147917e-07, + "loss": 0.6197, + "step": 10324 + }, + { + "epoch": 0.84, + "grad_norm": 4.108516330537227, + "learning_rate": 6.676823887272888e-07, + "loss": 0.6185, + "step": 10325 + }, + { + "epoch": 0.84, + "grad_norm": 6.062323830574421, + "learning_rate": 6.670258605634422e-07, + "loss": 0.7935, + "step": 10326 + }, + { + "epoch": 0.84, + "grad_norm": 3.1354579115801857, + "learning_rate": 6.663696322686897e-07, + "loss": 0.6264, + "step": 10327 + }, + { + "epoch": 0.84, + "grad_norm": 13.144135383962132, + "learning_rate": 6.657137038884453e-07, + "loss": 0.8409, + "step": 10328 + }, + { + "epoch": 0.84, + "grad_norm": 10.44201426781892, + "learning_rate": 6.650580754681035e-07, + "loss": 0.7576, + "step": 10329 + }, + { + "epoch": 0.84, + "grad_norm": 3.450999863278629, + "learning_rate": 6.644027470530367e-07, + "loss": 0.6023, + "step": 10330 + }, + { + "epoch": 0.84, + "grad_norm": 2.605468927205407, + "learning_rate": 6.637477186886004e-07, + "loss": 0.7032, + "step": 10331 + }, + { + "epoch": 0.84, + "grad_norm": 9.356896866243238, + "learning_rate": 6.63092990420125e-07, + "loss": 0.5973, + "step": 10332 + }, + { + "epoch": 0.84, + "grad_norm": 3.021726192174664, + "learning_rate": 6.624385622929214e-07, + "loss": 0.6924, + "step": 10333 + }, + { + "epoch": 0.84, + "grad_norm": 4.823269583980918, + "learning_rate": 6.617844343522817e-07, + "loss": 0.706, + "step": 10334 + }, + { + "epoch": 0.84, + "grad_norm": 2.3900572299403176, + "learning_rate": 6.611306066434747e-07, + "loss": 0.6583, + "step": 10335 + }, + { + "epoch": 0.84, + "grad_norm": 5.711679502227634, + "learning_rate": 6.604770792117493e-07, + "loss": 0.6994, + "step": 10336 + }, + { + "epoch": 0.84, + "grad_norm": 16.79376202724141, + "learning_rate": 6.598238521023332e-07, + "loss": 0.6974, + "step": 10337 + }, + { + "epoch": 0.84, + "grad_norm": 2.9757197337152905, + "learning_rate": 6.591709253604356e-07, + "loss": 0.6157, + "step": 10338 + }, + { + "epoch": 0.84, + "grad_norm": 3.215857467464239, + "learning_rate": 6.585182990312405e-07, + "loss": 0.7551, + "step": 10339 + }, + { + "epoch": 0.84, + "grad_norm": 7.121030621894883, + "learning_rate": 6.578659731599169e-07, + "loss": 0.5982, + "step": 10340 + }, + { + "epoch": 0.84, + "grad_norm": 6.626078812745635, + "learning_rate": 6.572139477916084e-07, + "loss": 0.591, + "step": 10341 + }, + { + "epoch": 0.84, + "grad_norm": 4.037539856414299, + "learning_rate": 6.565622229714392e-07, + "loss": 0.5354, + "step": 10342 + }, + { + "epoch": 0.84, + "grad_norm": 2.968027033660957, + "learning_rate": 6.559107987445124e-07, + "loss": 0.8826, + "step": 10343 + }, + { + "epoch": 0.84, + "grad_norm": 4.53264512312851, + "learning_rate": 6.552596751559098e-07, + "loss": 0.742, + "step": 10344 + }, + { + "epoch": 0.84, + "grad_norm": 4.204865704765471, + "learning_rate": 6.546088522506955e-07, + "loss": 0.7021, + "step": 10345 + }, + { + "epoch": 0.84, + "grad_norm": 5.867827255980681, + "learning_rate": 6.539583300739089e-07, + "loss": 0.6873, + "step": 10346 + }, + { + "epoch": 0.84, + "grad_norm": 7.307166362198696, + "learning_rate": 6.533081086705711e-07, + "loss": 0.6685, + "step": 10347 + }, + { + "epoch": 0.84, + "grad_norm": 7.40612694342808, + "learning_rate": 6.526581880856819e-07, + "loss": 0.5511, + "step": 10348 + }, + { + "epoch": 0.84, + "grad_norm": 5.20080167432009, + "learning_rate": 6.520085683642191e-07, + "loss": 0.6783, + "step": 10349 + }, + { + "epoch": 0.84, + "grad_norm": 2.5813895676085608, + "learning_rate": 6.513592495511406e-07, + "loss": 0.6618, + "step": 10350 + }, + { + "epoch": 0.84, + "grad_norm": 2.6116065994577693, + "learning_rate": 6.507102316913816e-07, + "loss": 0.5529, + "step": 10351 + }, + { + "epoch": 0.84, + "grad_norm": 4.80458611424017, + "learning_rate": 6.500615148298617e-07, + "loss": 0.6887, + "step": 10352 + }, + { + "epoch": 0.84, + "grad_norm": 21.334037622840906, + "learning_rate": 6.494130990114733e-07, + "loss": 0.7477, + "step": 10353 + }, + { + "epoch": 0.84, + "grad_norm": 3.6788465851705574, + "learning_rate": 6.487649842810939e-07, + "loss": 0.7162, + "step": 10354 + }, + { + "epoch": 0.84, + "grad_norm": 5.3090341604953775, + "learning_rate": 6.481171706835737e-07, + "loss": 0.5675, + "step": 10355 + }, + { + "epoch": 0.84, + "grad_norm": 4.0817200217799305, + "learning_rate": 6.474696582637474e-07, + "loss": 0.6988, + "step": 10356 + }, + { + "epoch": 0.84, + "grad_norm": 87.68690045900183, + "learning_rate": 6.46822447066427e-07, + "loss": 0.7554, + "step": 10357 + }, + { + "epoch": 0.84, + "grad_norm": 4.502143765734597, + "learning_rate": 6.461755371364015e-07, + "loss": 0.6506, + "step": 10358 + }, + { + "epoch": 0.84, + "grad_norm": 3.1037843511659853, + "learning_rate": 6.455289285184446e-07, + "loss": 0.6503, + "step": 10359 + }, + { + "epoch": 0.84, + "grad_norm": 2.484675077768453, + "learning_rate": 6.448826212573023e-07, + "loss": 0.7002, + "step": 10360 + }, + { + "epoch": 0.84, + "grad_norm": 5.5047779931021115, + "learning_rate": 6.44236615397707e-07, + "loss": 0.537, + "step": 10361 + }, + { + "epoch": 0.84, + "grad_norm": 5.808955311385058, + "learning_rate": 6.435909109843619e-07, + "loss": 0.6739, + "step": 10362 + }, + { + "epoch": 0.84, + "grad_norm": 2.8508680085392153, + "learning_rate": 6.429455080619568e-07, + "loss": 0.6523, + "step": 10363 + }, + { + "epoch": 0.84, + "grad_norm": 4.329015422887391, + "learning_rate": 6.42300406675156e-07, + "loss": 0.6689, + "step": 10364 + }, + { + "epoch": 0.84, + "grad_norm": 5.613391125960158, + "learning_rate": 6.416556068686064e-07, + "loss": 0.7513, + "step": 10365 + }, + { + "epoch": 0.84, + "grad_norm": 2.739849790229807, + "learning_rate": 6.410111086869314e-07, + "loss": 0.4758, + "step": 10366 + }, + { + "epoch": 0.84, + "grad_norm": 6.131785198292752, + "learning_rate": 6.403669121747336e-07, + "loss": 0.5984, + "step": 10367 + }, + { + "epoch": 0.84, + "grad_norm": 2.3467058360087165, + "learning_rate": 6.397230173765967e-07, + "loss": 0.6914, + "step": 10368 + }, + { + "epoch": 0.84, + "grad_norm": 4.338622187972352, + "learning_rate": 6.390794243370801e-07, + "loss": 0.6395, + "step": 10369 + }, + { + "epoch": 0.84, + "grad_norm": 20.877496552494325, + "learning_rate": 6.384361331007271e-07, + "loss": 0.7061, + "step": 10370 + }, + { + "epoch": 0.84, + "grad_norm": 3.3398472569820887, + "learning_rate": 6.377931437120555e-07, + "loss": 0.6416, + "step": 10371 + }, + { + "epoch": 0.84, + "grad_norm": 2.734674335682519, + "learning_rate": 6.371504562155656e-07, + "loss": 0.6, + "step": 10372 + }, + { + "epoch": 0.84, + "grad_norm": 2.9118978312716317, + "learning_rate": 6.365080706557352e-07, + "loss": 0.6582, + "step": 10373 + }, + { + "epoch": 0.84, + "grad_norm": 3.5333695170524897, + "learning_rate": 6.358659870770212e-07, + "loss": 0.6903, + "step": 10374 + }, + { + "epoch": 0.84, + "grad_norm": 3.592788041985399, + "learning_rate": 6.3522420552386e-07, + "loss": 0.8557, + "step": 10375 + }, + { + "epoch": 0.84, + "grad_norm": 5.290180713457035, + "learning_rate": 6.34582726040665e-07, + "loss": 0.7188, + "step": 10376 + }, + { + "epoch": 0.84, + "grad_norm": 3.2193692414814046, + "learning_rate": 6.339415486718336e-07, + "loss": 0.8291, + "step": 10377 + }, + { + "epoch": 0.84, + "grad_norm": 3.1535420078102843, + "learning_rate": 6.333006734617375e-07, + "loss": 0.6187, + "step": 10378 + }, + { + "epoch": 0.84, + "grad_norm": 7.260409625376434, + "learning_rate": 6.326601004547301e-07, + "loss": 0.4959, + "step": 10379 + }, + { + "epoch": 0.84, + "grad_norm": 10.432916900277515, + "learning_rate": 6.320198296951435e-07, + "loss": 0.6387, + "step": 10380 + }, + { + "epoch": 0.84, + "grad_norm": 3.3128386379719243, + "learning_rate": 6.31379861227287e-07, + "loss": 0.6815, + "step": 10381 + }, + { + "epoch": 0.84, + "grad_norm": 4.046337035488539, + "learning_rate": 6.307401950954517e-07, + "loss": 0.7009, + "step": 10382 + }, + { + "epoch": 0.84, + "grad_norm": 5.815238562421174, + "learning_rate": 6.30100831343905e-07, + "loss": 0.4688, + "step": 10383 + }, + { + "epoch": 0.84, + "grad_norm": 53.539435612051825, + "learning_rate": 6.29461770016897e-07, + "loss": 0.7678, + "step": 10384 + }, + { + "epoch": 0.84, + "grad_norm": 19.587524240326776, + "learning_rate": 6.288230111586524e-07, + "loss": 0.5885, + "step": 10385 + }, + { + "epoch": 0.84, + "grad_norm": 10.117443535484943, + "learning_rate": 6.281845548133796e-07, + "loss": 0.6438, + "step": 10386 + }, + { + "epoch": 0.84, + "grad_norm": 3.044261323403354, + "learning_rate": 6.27546401025263e-07, + "loss": 0.5458, + "step": 10387 + }, + { + "epoch": 0.84, + "grad_norm": 3.65084815691572, + "learning_rate": 6.26908549838467e-07, + "loss": 0.6376, + "step": 10388 + }, + { + "epoch": 0.84, + "grad_norm": 21.497912503281153, + "learning_rate": 6.262710012971329e-07, + "loss": 0.6523, + "step": 10389 + }, + { + "epoch": 0.84, + "grad_norm": 4.506621217933862, + "learning_rate": 6.256337554453862e-07, + "loss": 0.7805, + "step": 10390 + }, + { + "epoch": 0.84, + "grad_norm": 6.337912195930157, + "learning_rate": 6.24996812327327e-07, + "loss": 0.6193, + "step": 10391 + }, + { + "epoch": 0.84, + "grad_norm": 3.6775326311364274, + "learning_rate": 6.243601719870346e-07, + "loss": 0.5932, + "step": 10392 + }, + { + "epoch": 0.84, + "grad_norm": 3.9853114690390354, + "learning_rate": 6.237238344685703e-07, + "loss": 0.7941, + "step": 10393 + }, + { + "epoch": 0.84, + "grad_norm": 5.7886714788766644, + "learning_rate": 6.230877998159724e-07, + "loss": 0.6257, + "step": 10394 + }, + { + "epoch": 0.84, + "grad_norm": 6.260932374432806, + "learning_rate": 6.224520680732582e-07, + "loss": 0.6847, + "step": 10395 + }, + { + "epoch": 0.84, + "grad_norm": 4.299173570488173, + "learning_rate": 6.218166392844227e-07, + "loss": 0.6663, + "step": 10396 + }, + { + "epoch": 0.84, + "grad_norm": 4.375333355386297, + "learning_rate": 6.211815134934446e-07, + "loss": 0.6135, + "step": 10397 + }, + { + "epoch": 0.84, + "grad_norm": 2.5452650103996453, + "learning_rate": 6.205466907442764e-07, + "loss": 0.6365, + "step": 10398 + }, + { + "epoch": 0.84, + "grad_norm": 3.9901833225220193, + "learning_rate": 6.19912171080852e-07, + "loss": 0.6014, + "step": 10399 + }, + { + "epoch": 0.84, + "grad_norm": 3.261833758382411, + "learning_rate": 6.192779545470856e-07, + "loss": 0.8548, + "step": 10400 + }, + { + "epoch": 0.84, + "grad_norm": 3.5044936715148247, + "learning_rate": 6.186440411868683e-07, + "loss": 0.6833, + "step": 10401 + }, + { + "epoch": 0.84, + "grad_norm": 3.498299056057299, + "learning_rate": 6.180104310440705e-07, + "loss": 0.6055, + "step": 10402 + }, + { + "epoch": 0.84, + "grad_norm": 4.807397300682668, + "learning_rate": 6.173771241625409e-07, + "loss": 0.6662, + "step": 10403 + }, + { + "epoch": 0.85, + "grad_norm": 4.991125347854919, + "learning_rate": 6.167441205861108e-07, + "loss": 0.6206, + "step": 10404 + }, + { + "epoch": 0.85, + "grad_norm": 8.090919405503609, + "learning_rate": 6.161114203585866e-07, + "loss": 0.649, + "step": 10405 + }, + { + "epoch": 0.85, + "grad_norm": 8.471272683824242, + "learning_rate": 6.154790235237546e-07, + "loss": 0.6031, + "step": 10406 + }, + { + "epoch": 0.85, + "grad_norm": 3.8302448620665666, + "learning_rate": 6.148469301253834e-07, + "loss": 0.603, + "step": 10407 + }, + { + "epoch": 0.85, + "grad_norm": 5.12344653475466, + "learning_rate": 6.142151402072133e-07, + "loss": 0.585, + "step": 10408 + }, + { + "epoch": 0.85, + "grad_norm": 4.341409406176881, + "learning_rate": 6.135836538129725e-07, + "loss": 0.6119, + "step": 10409 + }, + { + "epoch": 0.85, + "grad_norm": 11.994580200705164, + "learning_rate": 6.129524709863605e-07, + "loss": 0.7164, + "step": 10410 + }, + { + "epoch": 0.85, + "grad_norm": 4.771347418688263, + "learning_rate": 6.123215917710617e-07, + "loss": 0.6189, + "step": 10411 + }, + { + "epoch": 0.85, + "grad_norm": 4.46114124270828, + "learning_rate": 6.116910162107348e-07, + "loss": 0.6445, + "step": 10412 + }, + { + "epoch": 0.85, + "grad_norm": 3.2124967338696395, + "learning_rate": 6.110607443490218e-07, + "loss": 0.6684, + "step": 10413 + }, + { + "epoch": 0.85, + "grad_norm": 2.4512186008441392, + "learning_rate": 6.104307762295403e-07, + "loss": 0.6613, + "step": 10414 + }, + { + "epoch": 0.85, + "grad_norm": 7.234811722280214, + "learning_rate": 6.098011118958885e-07, + "loss": 0.6208, + "step": 10415 + }, + { + "epoch": 0.85, + "grad_norm": 3.7458987287811345, + "learning_rate": 6.091717513916424e-07, + "loss": 0.6757, + "step": 10416 + }, + { + "epoch": 0.85, + "grad_norm": 3.1862721466114, + "learning_rate": 6.085426947603568e-07, + "loss": 0.6142, + "step": 10417 + }, + { + "epoch": 0.85, + "grad_norm": 3.5773256137198226, + "learning_rate": 6.079139420455688e-07, + "loss": 0.7204, + "step": 10418 + }, + { + "epoch": 0.85, + "grad_norm": 2.9527655987048163, + "learning_rate": 6.072854932907901e-07, + "loss": 0.4914, + "step": 10419 + }, + { + "epoch": 0.85, + "grad_norm": 2.713211585221999, + "learning_rate": 6.066573485395155e-07, + "loss": 0.6181, + "step": 10420 + }, + { + "epoch": 0.85, + "grad_norm": 7.123617165101472, + "learning_rate": 6.060295078352135e-07, + "loss": 0.7754, + "step": 10421 + }, + { + "epoch": 0.85, + "grad_norm": 3.1498774063466732, + "learning_rate": 6.054019712213377e-07, + "loss": 0.7949, + "step": 10422 + }, + { + "epoch": 0.85, + "grad_norm": 4.337723456755252, + "learning_rate": 6.047747387413156e-07, + "loss": 0.7654, + "step": 10423 + }, + { + "epoch": 0.85, + "grad_norm": 4.663180619022543, + "learning_rate": 6.041478104385556e-07, + "loss": 0.7326, + "step": 10424 + }, + { + "epoch": 0.85, + "grad_norm": 4.92796186203844, + "learning_rate": 6.035211863564461e-07, + "loss": 0.5615, + "step": 10425 + }, + { + "epoch": 0.85, + "grad_norm": 7.54693451654569, + "learning_rate": 6.028948665383527e-07, + "loss": 0.643, + "step": 10426 + }, + { + "epoch": 0.85, + "grad_norm": 4.756255508057335, + "learning_rate": 6.022688510276226e-07, + "loss": 0.7324, + "step": 10427 + }, + { + "epoch": 0.85, + "grad_norm": 3.799498802717264, + "learning_rate": 6.016431398675764e-07, + "loss": 0.625, + "step": 10428 + }, + { + "epoch": 0.85, + "grad_norm": 19.40892480915589, + "learning_rate": 6.010177331015205e-07, + "loss": 0.613, + "step": 10429 + }, + { + "epoch": 0.85, + "grad_norm": 4.081445016357576, + "learning_rate": 6.003926307727359e-07, + "loss": 0.5737, + "step": 10430 + }, + { + "epoch": 0.85, + "grad_norm": 5.422477275692693, + "learning_rate": 5.997678329244822e-07, + "loss": 0.6934, + "step": 10431 + }, + { + "epoch": 0.85, + "grad_norm": 2.8998179351185183, + "learning_rate": 5.991433396000013e-07, + "loss": 0.5573, + "step": 10432 + }, + { + "epoch": 0.85, + "grad_norm": 3.075276476336215, + "learning_rate": 5.985191508425109e-07, + "loss": 0.541, + "step": 10433 + }, + { + "epoch": 0.85, + "grad_norm": 3.1398148373933443, + "learning_rate": 5.978952666952109e-07, + "loss": 0.5644, + "step": 10434 + }, + { + "epoch": 0.85, + "grad_norm": 5.354432132988761, + "learning_rate": 5.972716872012746e-07, + "loss": 0.7347, + "step": 10435 + }, + { + "epoch": 0.85, + "grad_norm": 6.0921637779136235, + "learning_rate": 5.966484124038602e-07, + "loss": 0.604, + "step": 10436 + }, + { + "epoch": 0.85, + "grad_norm": 3.775606925354813, + "learning_rate": 5.960254423461009e-07, + "loss": 0.5325, + "step": 10437 + }, + { + "epoch": 0.85, + "grad_norm": 2.762244894743621, + "learning_rate": 5.954027770711112e-07, + "loss": 0.6813, + "step": 10438 + }, + { + "epoch": 0.85, + "grad_norm": 3.5560300413945294, + "learning_rate": 5.947804166219834e-07, + "loss": 0.768, + "step": 10439 + }, + { + "epoch": 0.85, + "grad_norm": 4.215138672799785, + "learning_rate": 5.941583610417878e-07, + "loss": 0.74, + "step": 10440 + }, + { + "epoch": 0.85, + "grad_norm": 2.770448243603704, + "learning_rate": 5.935366103735757e-07, + "loss": 0.6183, + "step": 10441 + }, + { + "epoch": 0.85, + "grad_norm": 6.107574466474058, + "learning_rate": 5.929151646603742e-07, + "loss": 0.6654, + "step": 10442 + }, + { + "epoch": 0.85, + "grad_norm": 3.8323745782748917, + "learning_rate": 5.922940239451935e-07, + "loss": 0.6157, + "step": 10443 + }, + { + "epoch": 0.85, + "grad_norm": 5.290518424973234, + "learning_rate": 5.916731882710186e-07, + "loss": 0.6526, + "step": 10444 + }, + { + "epoch": 0.85, + "grad_norm": 5.360297804639516, + "learning_rate": 5.910526576808173e-07, + "loss": 0.5134, + "step": 10445 + }, + { + "epoch": 0.85, + "grad_norm": 2.822586854552277, + "learning_rate": 5.904324322175331e-07, + "loss": 0.5253, + "step": 10446 + }, + { + "epoch": 0.85, + "grad_norm": 3.831853653025244, + "learning_rate": 5.8981251192409e-07, + "loss": 0.6633, + "step": 10447 + }, + { + "epoch": 0.85, + "grad_norm": 5.747059534916783, + "learning_rate": 5.891928968433891e-07, + "loss": 0.5285, + "step": 10448 + }, + { + "epoch": 0.85, + "grad_norm": 4.4877683130208945, + "learning_rate": 5.885735870183118e-07, + "loss": 0.5811, + "step": 10449 + }, + { + "epoch": 0.85, + "grad_norm": 3.775932976594435, + "learning_rate": 5.879545824917199e-07, + "loss": 0.5389, + "step": 10450 + }, + { + "epoch": 0.85, + "grad_norm": 3.750757655951693, + "learning_rate": 5.873358833064507e-07, + "loss": 0.5711, + "step": 10451 + }, + { + "epoch": 0.85, + "grad_norm": 4.417306371635409, + "learning_rate": 5.867174895053235e-07, + "loss": 0.6208, + "step": 10452 + }, + { + "epoch": 0.85, + "grad_norm": 4.104949924456998, + "learning_rate": 5.860994011311344e-07, + "loss": 0.6585, + "step": 10453 + }, + { + "epoch": 0.85, + "grad_norm": 5.501678328833904, + "learning_rate": 5.854816182266593e-07, + "loss": 0.5378, + "step": 10454 + }, + { + "epoch": 0.85, + "grad_norm": 3.420261198676261, + "learning_rate": 5.848641408346517e-07, + "loss": 0.6376, + "step": 10455 + }, + { + "epoch": 0.85, + "grad_norm": 5.197292639161484, + "learning_rate": 5.842469689978447e-07, + "loss": 0.7506, + "step": 10456 + }, + { + "epoch": 0.85, + "grad_norm": 2.8908635631430477, + "learning_rate": 5.836301027589525e-07, + "loss": 0.7235, + "step": 10457 + }, + { + "epoch": 0.85, + "grad_norm": 4.861022318169697, + "learning_rate": 5.830135421606642e-07, + "loss": 0.6076, + "step": 10458 + }, + { + "epoch": 0.85, + "grad_norm": 7.489755608726509, + "learning_rate": 5.823972872456512e-07, + "loss": 0.7943, + "step": 10459 + }, + { + "epoch": 0.85, + "grad_norm": 10.59410304310968, + "learning_rate": 5.817813380565612e-07, + "loss": 0.6886, + "step": 10460 + }, + { + "epoch": 0.85, + "grad_norm": 6.03736415315865, + "learning_rate": 5.811656946360222e-07, + "loss": 0.7753, + "step": 10461 + }, + { + "epoch": 0.85, + "grad_norm": 4.336831404605428, + "learning_rate": 5.805503570266396e-07, + "loss": 0.6668, + "step": 10462 + }, + { + "epoch": 0.85, + "grad_norm": 2.5533491592726203, + "learning_rate": 5.799353252710005e-07, + "loss": 0.6407, + "step": 10463 + }, + { + "epoch": 0.85, + "grad_norm": 3.6278356642623897, + "learning_rate": 5.793205994116674e-07, + "loss": 0.8978, + "step": 10464 + }, + { + "epoch": 0.85, + "grad_norm": 5.5589684501172165, + "learning_rate": 5.78706179491183e-07, + "loss": 0.723, + "step": 10465 + }, + { + "epoch": 0.85, + "grad_norm": 3.3143701313246003, + "learning_rate": 5.780920655520711e-07, + "loss": 0.7077, + "step": 10466 + }, + { + "epoch": 0.85, + "grad_norm": 2.57361119760105, + "learning_rate": 5.774782576368304e-07, + "loss": 0.7415, + "step": 10467 + }, + { + "epoch": 0.85, + "grad_norm": 2.947981599599951, + "learning_rate": 5.768647557879408e-07, + "loss": 0.635, + "step": 10468 + }, + { + "epoch": 0.85, + "grad_norm": 3.227808313926324, + "learning_rate": 5.762515600478596e-07, + "loss": 0.7014, + "step": 10469 + }, + { + "epoch": 0.85, + "grad_norm": 12.114263232259527, + "learning_rate": 5.756386704590255e-07, + "loss": 0.8008, + "step": 10470 + }, + { + "epoch": 0.85, + "grad_norm": 3.464787519314774, + "learning_rate": 5.750260870638541e-07, + "loss": 0.7135, + "step": 10471 + }, + { + "epoch": 0.85, + "grad_norm": 5.601030370219566, + "learning_rate": 5.744138099047375e-07, + "loss": 0.7965, + "step": 10472 + }, + { + "epoch": 0.85, + "grad_norm": 41.707977187014066, + "learning_rate": 5.738018390240535e-07, + "loss": 0.6188, + "step": 10473 + }, + { + "epoch": 0.85, + "grad_norm": 8.908503250200823, + "learning_rate": 5.731901744641499e-07, + "loss": 0.7987, + "step": 10474 + }, + { + "epoch": 0.85, + "grad_norm": 3.6928638454145704, + "learning_rate": 5.725788162673612e-07, + "loss": 0.5576, + "step": 10475 + }, + { + "epoch": 0.85, + "grad_norm": 3.903003052043101, + "learning_rate": 5.719677644759941e-07, + "loss": 0.6336, + "step": 10476 + }, + { + "epoch": 0.85, + "grad_norm": 3.1266846127333547, + "learning_rate": 5.713570191323398e-07, + "loss": 0.6908, + "step": 10477 + }, + { + "epoch": 0.85, + "grad_norm": 3.4547086037485526, + "learning_rate": 5.707465802786655e-07, + "loss": 0.5827, + "step": 10478 + }, + { + "epoch": 0.85, + "grad_norm": 3.1352188120714954, + "learning_rate": 5.701364479572152e-07, + "loss": 0.6586, + "step": 10479 + }, + { + "epoch": 0.85, + "grad_norm": 3.9516907517193705, + "learning_rate": 5.695266222102175e-07, + "loss": 0.6951, + "step": 10480 + }, + { + "epoch": 0.85, + "grad_norm": 4.272577748854382, + "learning_rate": 5.689171030798723e-07, + "loss": 0.5986, + "step": 10481 + }, + { + "epoch": 0.85, + "grad_norm": 4.2666762537922205, + "learning_rate": 5.683078906083644e-07, + "loss": 0.4899, + "step": 10482 + }, + { + "epoch": 0.85, + "grad_norm": 3.4245318131258835, + "learning_rate": 5.676989848378545e-07, + "loss": 0.6647, + "step": 10483 + }, + { + "epoch": 0.85, + "grad_norm": 2.605718714393821, + "learning_rate": 5.670903858104837e-07, + "loss": 0.5798, + "step": 10484 + }, + { + "epoch": 0.85, + "grad_norm": 3.508762339610457, + "learning_rate": 5.664820935683695e-07, + "loss": 0.5445, + "step": 10485 + }, + { + "epoch": 0.85, + "grad_norm": 7.027852891063986, + "learning_rate": 5.658741081536101e-07, + "loss": 0.6171, + "step": 10486 + }, + { + "epoch": 0.85, + "grad_norm": 2.6425636235422147, + "learning_rate": 5.652664296082822e-07, + "loss": 0.6663, + "step": 10487 + }, + { + "epoch": 0.85, + "grad_norm": 7.187846163584768, + "learning_rate": 5.64659057974441e-07, + "loss": 0.6666, + "step": 10488 + }, + { + "epoch": 0.85, + "grad_norm": 5.777206419105017, + "learning_rate": 5.640519932941202e-07, + "loss": 0.6063, + "step": 10489 + }, + { + "epoch": 0.85, + "grad_norm": 12.686746745228717, + "learning_rate": 5.634452356093317e-07, + "loss": 0.5177, + "step": 10490 + }, + { + "epoch": 0.85, + "grad_norm": 3.6741115956239074, + "learning_rate": 5.628387849620687e-07, + "loss": 0.5503, + "step": 10491 + }, + { + "epoch": 0.85, + "grad_norm": 4.163605372311336, + "learning_rate": 5.622326413942997e-07, + "loss": 0.7316, + "step": 10492 + }, + { + "epoch": 0.85, + "grad_norm": 3.3348619975581553, + "learning_rate": 5.616268049479756e-07, + "loss": 0.6219, + "step": 10493 + }, + { + "epoch": 0.85, + "grad_norm": 6.475155001732228, + "learning_rate": 5.610212756650219e-07, + "loss": 0.576, + "step": 10494 + }, + { + "epoch": 0.85, + "grad_norm": 4.21222858749342, + "learning_rate": 5.604160535873465e-07, + "loss": 0.6862, + "step": 10495 + }, + { + "epoch": 0.85, + "grad_norm": 3.10594804741805, + "learning_rate": 5.598111387568339e-07, + "loss": 0.59, + "step": 10496 + }, + { + "epoch": 0.85, + "grad_norm": 2.987327285622205, + "learning_rate": 5.592065312153477e-07, + "loss": 0.6251, + "step": 10497 + }, + { + "epoch": 0.85, + "grad_norm": 4.37267722475284, + "learning_rate": 5.586022310047317e-07, + "loss": 0.6098, + "step": 10498 + }, + { + "epoch": 0.85, + "grad_norm": 4.414321777178975, + "learning_rate": 5.579982381668058e-07, + "loss": 0.7805, + "step": 10499 + }, + { + "epoch": 0.85, + "grad_norm": 8.555182853057014, + "learning_rate": 5.573945527433733e-07, + "loss": 0.6006, + "step": 10500 + }, + { + "epoch": 0.85, + "grad_norm": 6.523839739145099, + "learning_rate": 5.567911747762084e-07, + "loss": 0.7815, + "step": 10501 + }, + { + "epoch": 0.85, + "grad_norm": 4.3531984322880275, + "learning_rate": 5.561881043070721e-07, + "loss": 0.8642, + "step": 10502 + }, + { + "epoch": 0.85, + "grad_norm": 2.9368808756037335, + "learning_rate": 5.555853413776991e-07, + "loss": 0.584, + "step": 10503 + }, + { + "epoch": 0.85, + "grad_norm": 3.257545599693184, + "learning_rate": 5.549828860298046e-07, + "loss": 0.649, + "step": 10504 + }, + { + "epoch": 0.85, + "grad_norm": 3.2443289063388794, + "learning_rate": 5.543807383050826e-07, + "loss": 0.7033, + "step": 10505 + }, + { + "epoch": 0.85, + "grad_norm": 14.636871459198412, + "learning_rate": 5.537788982452052e-07, + "loss": 0.6668, + "step": 10506 + }, + { + "epoch": 0.85, + "grad_norm": 3.420298604385048, + "learning_rate": 5.531773658918254e-07, + "loss": 0.6947, + "step": 10507 + }, + { + "epoch": 0.85, + "grad_norm": 4.9134211305509865, + "learning_rate": 5.525761412865693e-07, + "loss": 0.639, + "step": 10508 + }, + { + "epoch": 0.85, + "grad_norm": 4.562912416516313, + "learning_rate": 5.519752244710491e-07, + "loss": 0.681, + "step": 10509 + }, + { + "epoch": 0.85, + "grad_norm": 3.918343456024627, + "learning_rate": 5.513746154868499e-07, + "loss": 0.7008, + "step": 10510 + }, + { + "epoch": 0.85, + "grad_norm": 14.64249338792327, + "learning_rate": 5.507743143755373e-07, + "loss": 0.6881, + "step": 10511 + }, + { + "epoch": 0.85, + "grad_norm": 8.6273248730356, + "learning_rate": 5.501743211786575e-07, + "loss": 0.7638, + "step": 10512 + }, + { + "epoch": 0.85, + "grad_norm": 4.652435617616853, + "learning_rate": 5.495746359377335e-07, + "loss": 0.6598, + "step": 10513 + }, + { + "epoch": 0.85, + "grad_norm": 5.190453697518684, + "learning_rate": 5.48975258694267e-07, + "loss": 0.5521, + "step": 10514 + }, + { + "epoch": 0.85, + "grad_norm": 3.3803604285741278, + "learning_rate": 5.483761894897371e-07, + "loss": 0.6289, + "step": 10515 + }, + { + "epoch": 0.85, + "grad_norm": 6.560965467319808, + "learning_rate": 5.477774283656062e-07, + "loss": 0.6772, + "step": 10516 + }, + { + "epoch": 0.85, + "grad_norm": 2.6701711485625834, + "learning_rate": 5.471789753633095e-07, + "loss": 0.6921, + "step": 10517 + }, + { + "epoch": 0.85, + "grad_norm": 3.038743598112128, + "learning_rate": 5.465808305242659e-07, + "loss": 0.6807, + "step": 10518 + }, + { + "epoch": 0.85, + "grad_norm": 2.749553157839282, + "learning_rate": 5.459829938898697e-07, + "loss": 0.5905, + "step": 10519 + }, + { + "epoch": 0.85, + "grad_norm": 10.854706603114828, + "learning_rate": 5.453854655014956e-07, + "loss": 0.6775, + "step": 10520 + }, + { + "epoch": 0.85, + "grad_norm": 4.269071265175622, + "learning_rate": 5.447882454004955e-07, + "loss": 0.686, + "step": 10521 + }, + { + "epoch": 0.85, + "grad_norm": 2.853907333078846, + "learning_rate": 5.441913336282001e-07, + "loss": 0.6965, + "step": 10522 + }, + { + "epoch": 0.85, + "grad_norm": 5.114523722564825, + "learning_rate": 5.435947302259215e-07, + "loss": 0.5129, + "step": 10523 + }, + { + "epoch": 0.85, + "grad_norm": 6.197091259142938, + "learning_rate": 5.429984352349466e-07, + "loss": 0.6902, + "step": 10524 + }, + { + "epoch": 0.85, + "grad_norm": 4.676453452667284, + "learning_rate": 5.424024486965446e-07, + "loss": 0.6663, + "step": 10525 + }, + { + "epoch": 0.85, + "grad_norm": 3.366709298636746, + "learning_rate": 5.418067706519603e-07, + "loss": 0.5944, + "step": 10526 + }, + { + "epoch": 0.86, + "grad_norm": 18.273426121956494, + "learning_rate": 5.412114011424191e-07, + "loss": 0.7503, + "step": 10527 + }, + { + "epoch": 0.86, + "grad_norm": 3.7596731263709464, + "learning_rate": 5.406163402091236e-07, + "loss": 0.694, + "step": 10528 + }, + { + "epoch": 0.86, + "grad_norm": 4.37285851142754, + "learning_rate": 5.400215878932547e-07, + "loss": 0.5341, + "step": 10529 + }, + { + "epoch": 0.86, + "grad_norm": 5.272860907097611, + "learning_rate": 5.39427144235975e-07, + "loss": 0.5827, + "step": 10530 + }, + { + "epoch": 0.86, + "grad_norm": 6.676266335920648, + "learning_rate": 5.388330092784222e-07, + "loss": 0.6228, + "step": 10531 + }, + { + "epoch": 0.86, + "grad_norm": 7.362440420769961, + "learning_rate": 5.382391830617162e-07, + "loss": 0.5789, + "step": 10532 + }, + { + "epoch": 0.86, + "grad_norm": 2.8814051450583795, + "learning_rate": 5.376456656269524e-07, + "loss": 0.7283, + "step": 10533 + }, + { + "epoch": 0.86, + "grad_norm": 3.4488748727174645, + "learning_rate": 5.370524570152059e-07, + "loss": 0.6834, + "step": 10534 + }, + { + "epoch": 0.86, + "grad_norm": 3.19758406223143, + "learning_rate": 5.364595572675302e-07, + "loss": 0.7506, + "step": 10535 + }, + { + "epoch": 0.86, + "grad_norm": 4.629791989870899, + "learning_rate": 5.358669664249566e-07, + "loss": 0.6763, + "step": 10536 + }, + { + "epoch": 0.86, + "grad_norm": 4.324939374829556, + "learning_rate": 5.35274684528499e-07, + "loss": 0.7324, + "step": 10537 + }, + { + "epoch": 0.86, + "grad_norm": 3.887977618355025, + "learning_rate": 5.346827116191438e-07, + "loss": 0.6359, + "step": 10538 + }, + { + "epoch": 0.86, + "grad_norm": 4.2207115213857, + "learning_rate": 5.340910477378625e-07, + "loss": 0.5644, + "step": 10539 + }, + { + "epoch": 0.86, + "grad_norm": 4.739609769218641, + "learning_rate": 5.334996929256003e-07, + "loss": 0.7145, + "step": 10540 + }, + { + "epoch": 0.86, + "grad_norm": 3.570825006490705, + "learning_rate": 5.329086472232825e-07, + "loss": 0.6594, + "step": 10541 + }, + { + "epoch": 0.86, + "grad_norm": 5.051548996510039, + "learning_rate": 5.323179106718129e-07, + "loss": 0.5454, + "step": 10542 + }, + { + "epoch": 0.86, + "grad_norm": 4.418263224865038, + "learning_rate": 5.31727483312075e-07, + "loss": 0.6165, + "step": 10543 + }, + { + "epoch": 0.86, + "grad_norm": 4.83260981620051, + "learning_rate": 5.311373651849305e-07, + "loss": 0.7012, + "step": 10544 + }, + { + "epoch": 0.86, + "grad_norm": 4.277805244576161, + "learning_rate": 5.305475563312174e-07, + "loss": 0.666, + "step": 10545 + }, + { + "epoch": 0.86, + "grad_norm": 4.975542131461341, + "learning_rate": 5.299580567917573e-07, + "loss": 0.6073, + "step": 10546 + }, + { + "epoch": 0.86, + "grad_norm": 11.605812194946333, + "learning_rate": 5.293688666073438e-07, + "loss": 0.6732, + "step": 10547 + }, + { + "epoch": 0.86, + "grad_norm": 4.2409241908450275, + "learning_rate": 5.287799858187548e-07, + "loss": 0.6426, + "step": 10548 + }, + { + "epoch": 0.86, + "grad_norm": 14.305969226693401, + "learning_rate": 5.281914144667427e-07, + "loss": 0.76, + "step": 10549 + }, + { + "epoch": 0.86, + "grad_norm": 4.733803971508274, + "learning_rate": 5.276031525920427e-07, + "loss": 0.6391, + "step": 10550 + }, + { + "epoch": 0.86, + "grad_norm": 3.2113097195839706, + "learning_rate": 5.270152002353651e-07, + "loss": 0.5095, + "step": 10551 + }, + { + "epoch": 0.86, + "grad_norm": 5.85936567432933, + "learning_rate": 5.264275574373989e-07, + "loss": 0.6679, + "step": 10552 + }, + { + "epoch": 0.86, + "grad_norm": 4.652227304738535, + "learning_rate": 5.258402242388156e-07, + "loss": 0.6637, + "step": 10553 + }, + { + "epoch": 0.86, + "grad_norm": 7.369486736826186, + "learning_rate": 5.252532006802585e-07, + "loss": 0.6037, + "step": 10554 + }, + { + "epoch": 0.86, + "grad_norm": 2.8911085096934186, + "learning_rate": 5.246664868023565e-07, + "loss": 0.766, + "step": 10555 + }, + { + "epoch": 0.86, + "grad_norm": 3.8186751391199176, + "learning_rate": 5.240800826457115e-07, + "loss": 0.605, + "step": 10556 + }, + { + "epoch": 0.86, + "grad_norm": 4.803947863403988, + "learning_rate": 5.234939882509083e-07, + "loss": 0.6399, + "step": 10557 + }, + { + "epoch": 0.86, + "grad_norm": 3.965588522005686, + "learning_rate": 5.229082036585076e-07, + "loss": 0.738, + "step": 10558 + }, + { + "epoch": 0.86, + "grad_norm": 3.48272340087464, + "learning_rate": 5.223227289090482e-07, + "loss": 0.685, + "step": 10559 + }, + { + "epoch": 0.86, + "grad_norm": 4.23140765331175, + "learning_rate": 5.217375640430522e-07, + "loss": 0.6292, + "step": 10560 + }, + { + "epoch": 0.86, + "grad_norm": 5.897979897944252, + "learning_rate": 5.211527091010116e-07, + "loss": 0.7619, + "step": 10561 + }, + { + "epoch": 0.86, + "grad_norm": 3.112186185375023, + "learning_rate": 5.205681641234062e-07, + "loss": 0.619, + "step": 10562 + }, + { + "epoch": 0.86, + "grad_norm": 18.232828557853026, + "learning_rate": 5.199839291506875e-07, + "loss": 0.7551, + "step": 10563 + }, + { + "epoch": 0.86, + "grad_norm": 4.3861718664890725, + "learning_rate": 5.194000042232906e-07, + "loss": 0.5812, + "step": 10564 + }, + { + "epoch": 0.86, + "grad_norm": 4.735029090022982, + "learning_rate": 5.188163893816239e-07, + "loss": 0.577, + "step": 10565 + }, + { + "epoch": 0.86, + "grad_norm": 3.658741322798877, + "learning_rate": 5.182330846660815e-07, + "loss": 0.5866, + "step": 10566 + }, + { + "epoch": 0.86, + "grad_norm": 5.522653060913649, + "learning_rate": 5.176500901170273e-07, + "loss": 0.606, + "step": 10567 + }, + { + "epoch": 0.86, + "grad_norm": 4.395711799783536, + "learning_rate": 5.170674057748109e-07, + "loss": 0.5854, + "step": 10568 + }, + { + "epoch": 0.86, + "grad_norm": 18.455156662017206, + "learning_rate": 5.16485031679757e-07, + "loss": 0.6455, + "step": 10569 + }, + { + "epoch": 0.86, + "grad_norm": 3.470361169422147, + "learning_rate": 5.159029678721683e-07, + "loss": 0.5632, + "step": 10570 + }, + { + "epoch": 0.86, + "grad_norm": 5.320427189128552, + "learning_rate": 5.153212143923292e-07, + "loss": 0.7743, + "step": 10571 + }, + { + "epoch": 0.86, + "grad_norm": 3.5693918549923382, + "learning_rate": 5.147397712804992e-07, + "loss": 0.6212, + "step": 10572 + }, + { + "epoch": 0.86, + "grad_norm": 3.7455786315052007, + "learning_rate": 5.141586385769204e-07, + "loss": 0.6655, + "step": 10573 + }, + { + "epoch": 0.86, + "grad_norm": 2.94125506023899, + "learning_rate": 5.135778163218074e-07, + "loss": 0.6415, + "step": 10574 + }, + { + "epoch": 0.86, + "grad_norm": 8.278718512856946, + "learning_rate": 5.129973045553593e-07, + "loss": 0.6008, + "step": 10575 + }, + { + "epoch": 0.86, + "grad_norm": 3.6833335543368757, + "learning_rate": 5.1241710331775e-07, + "loss": 0.7002, + "step": 10576 + }, + { + "epoch": 0.86, + "grad_norm": 6.460548296974436, + "learning_rate": 5.118372126491322e-07, + "loss": 0.5877, + "step": 10577 + }, + { + "epoch": 0.86, + "grad_norm": 5.054272156481077, + "learning_rate": 5.112576325896401e-07, + "loss": 0.7049, + "step": 10578 + }, + { + "epoch": 0.86, + "grad_norm": 3.291200695720283, + "learning_rate": 5.106783631793826e-07, + "loss": 0.6278, + "step": 10579 + }, + { + "epoch": 0.86, + "grad_norm": 4.540267510675974, + "learning_rate": 5.100994044584511e-07, + "loss": 0.7307, + "step": 10580 + }, + { + "epoch": 0.86, + "grad_norm": 2.882817940978211, + "learning_rate": 5.095207564669097e-07, + "loss": 0.5502, + "step": 10581 + }, + { + "epoch": 0.86, + "grad_norm": 3.7507092049889557, + "learning_rate": 5.089424192448078e-07, + "loss": 0.6053, + "step": 10582 + }, + { + "epoch": 0.86, + "grad_norm": 3.1805385700120308, + "learning_rate": 5.08364392832168e-07, + "loss": 0.6144, + "step": 10583 + }, + { + "epoch": 0.86, + "grad_norm": 8.522537145484804, + "learning_rate": 5.077866772689932e-07, + "loss": 0.7216, + "step": 10584 + }, + { + "epoch": 0.86, + "grad_norm": 8.361047956031662, + "learning_rate": 5.07209272595266e-07, + "loss": 0.4926, + "step": 10585 + }, + { + "epoch": 0.86, + "grad_norm": 8.20122524918878, + "learning_rate": 5.066321788509465e-07, + "loss": 0.7026, + "step": 10586 + }, + { + "epoch": 0.86, + "grad_norm": 4.283186653538668, + "learning_rate": 5.060553960759729e-07, + "loss": 0.7298, + "step": 10587 + }, + { + "epoch": 0.86, + "grad_norm": 2.9345050601400904, + "learning_rate": 5.054789243102615e-07, + "loss": 0.6809, + "step": 10588 + }, + { + "epoch": 0.86, + "grad_norm": 5.887505501756078, + "learning_rate": 5.049027635937087e-07, + "loss": 0.5909, + "step": 10589 + }, + { + "epoch": 0.86, + "grad_norm": 3.9275803618180674, + "learning_rate": 5.043269139661872e-07, + "loss": 0.6625, + "step": 10590 + }, + { + "epoch": 0.86, + "grad_norm": 309.3909502206394, + "learning_rate": 5.037513754675516e-07, + "loss": 0.6389, + "step": 10591 + }, + { + "epoch": 0.86, + "grad_norm": 4.056319338162214, + "learning_rate": 5.031761481376318e-07, + "loss": 0.637, + "step": 10592 + }, + { + "epoch": 0.86, + "grad_norm": 4.397426432059715, + "learning_rate": 5.026012320162365e-07, + "loss": 0.5116, + "step": 10593 + }, + { + "epoch": 0.86, + "grad_norm": 3.6227971417607354, + "learning_rate": 5.02026627143154e-07, + "loss": 0.7308, + "step": 10594 + }, + { + "epoch": 0.86, + "grad_norm": 4.343959560266296, + "learning_rate": 5.014523335581495e-07, + "loss": 0.7387, + "step": 10595 + }, + { + "epoch": 0.86, + "grad_norm": 3.2530948217614117, + "learning_rate": 5.008783513009696e-07, + "loss": 0.5354, + "step": 10596 + }, + { + "epoch": 0.86, + "grad_norm": 3.5275053557443607, + "learning_rate": 5.003046804113354e-07, + "loss": 0.7055, + "step": 10597 + }, + { + "epoch": 0.86, + "grad_norm": 4.820721482380367, + "learning_rate": 4.997313209289512e-07, + "loss": 0.6841, + "step": 10598 + }, + { + "epoch": 0.86, + "grad_norm": 4.576153449877427, + "learning_rate": 4.991582728934952e-07, + "loss": 0.6943, + "step": 10599 + }, + { + "epoch": 0.86, + "grad_norm": 3.8452697545324006, + "learning_rate": 4.985855363446268e-07, + "loss": 0.6361, + "step": 10600 + }, + { + "epoch": 0.86, + "grad_norm": 6.88192464799193, + "learning_rate": 4.980131113219822e-07, + "loss": 0.6391, + "step": 10601 + }, + { + "epoch": 0.86, + "grad_norm": 3.9129811219456907, + "learning_rate": 4.974409978651762e-07, + "loss": 0.6869, + "step": 10602 + }, + { + "epoch": 0.86, + "grad_norm": 5.449472163573471, + "learning_rate": 4.96869196013805e-07, + "loss": 0.6459, + "step": 10603 + }, + { + "epoch": 0.86, + "grad_norm": 4.172261244328076, + "learning_rate": 4.962977058074381e-07, + "loss": 0.576, + "step": 10604 + }, + { + "epoch": 0.86, + "grad_norm": 11.059593329921459, + "learning_rate": 4.957265272856288e-07, + "loss": 0.715, + "step": 10605 + }, + { + "epoch": 0.86, + "grad_norm": 2.7194163077427613, + "learning_rate": 4.951556604879049e-07, + "loss": 0.6763, + "step": 10606 + }, + { + "epoch": 0.86, + "grad_norm": 6.198100077429061, + "learning_rate": 4.945851054537737e-07, + "loss": 0.5365, + "step": 10607 + }, + { + "epoch": 0.86, + "grad_norm": 4.563358032232945, + "learning_rate": 4.940148622227225e-07, + "loss": 0.6101, + "step": 10608 + }, + { + "epoch": 0.86, + "grad_norm": 7.297522348946438, + "learning_rate": 4.934449308342131e-07, + "loss": 0.5722, + "step": 10609 + }, + { + "epoch": 0.86, + "grad_norm": 4.147626008345462, + "learning_rate": 4.928753113276918e-07, + "loss": 0.5664, + "step": 10610 + }, + { + "epoch": 0.86, + "grad_norm": 4.3861938013427295, + "learning_rate": 4.92306003742577e-07, + "loss": 0.4749, + "step": 10611 + }, + { + "epoch": 0.86, + "grad_norm": 4.681590036620992, + "learning_rate": 4.917370081182698e-07, + "loss": 0.5637, + "step": 10612 + }, + { + "epoch": 0.86, + "grad_norm": 3.7230256106035844, + "learning_rate": 4.91168324494149e-07, + "loss": 0.6898, + "step": 10613 + }, + { + "epoch": 0.86, + "grad_norm": 3.3049238813477837, + "learning_rate": 4.905999529095695e-07, + "loss": 0.6471, + "step": 10614 + }, + { + "epoch": 0.86, + "grad_norm": 3.94147644657435, + "learning_rate": 4.900318934038662e-07, + "loss": 0.5835, + "step": 10615 + }, + { + "epoch": 0.86, + "grad_norm": 4.337598647202033, + "learning_rate": 4.894641460163536e-07, + "loss": 0.6927, + "step": 10616 + }, + { + "epoch": 0.86, + "grad_norm": 2.895531550852072, + "learning_rate": 4.888967107863229e-07, + "loss": 0.5616, + "step": 10617 + }, + { + "epoch": 0.86, + "grad_norm": 2.216273778060793, + "learning_rate": 4.883295877530431e-07, + "loss": 0.4925, + "step": 10618 + }, + { + "epoch": 0.86, + "grad_norm": 3.545956964017675, + "learning_rate": 4.877627769557658e-07, + "loss": 0.8262, + "step": 10619 + }, + { + "epoch": 0.86, + "grad_norm": 3.5256952436828572, + "learning_rate": 4.871962784337131e-07, + "loss": 0.6437, + "step": 10620 + }, + { + "epoch": 0.86, + "grad_norm": 3.9170066278168165, + "learning_rate": 4.866300922260947e-07, + "loss": 0.5866, + "step": 10621 + }, + { + "epoch": 0.86, + "grad_norm": 6.8906195712446605, + "learning_rate": 4.86064218372091e-07, + "loss": 0.73, + "step": 10622 + }, + { + "epoch": 0.86, + "grad_norm": 7.924580649900985, + "learning_rate": 4.854986569108667e-07, + "loss": 0.5662, + "step": 10623 + }, + { + "epoch": 0.86, + "grad_norm": 4.631069010927226, + "learning_rate": 4.849334078815609e-07, + "loss": 0.708, + "step": 10624 + }, + { + "epoch": 0.86, + "grad_norm": 3.8392169604301745, + "learning_rate": 4.843684713232916e-07, + "loss": 0.7795, + "step": 10625 + }, + { + "epoch": 0.86, + "grad_norm": 4.601164038347287, + "learning_rate": 4.838038472751582e-07, + "loss": 0.6441, + "step": 10626 + }, + { + "epoch": 0.86, + "grad_norm": 4.701846148230725, + "learning_rate": 4.832395357762337e-07, + "loss": 0.695, + "step": 10627 + }, + { + "epoch": 0.86, + "grad_norm": 28.658827084440855, + "learning_rate": 4.826755368655739e-07, + "loss": 0.5909, + "step": 10628 + }, + { + "epoch": 0.86, + "grad_norm": 61.79259142274256, + "learning_rate": 4.821118505822093e-07, + "loss": 0.8125, + "step": 10629 + }, + { + "epoch": 0.86, + "grad_norm": 11.260471148525255, + "learning_rate": 4.815484769651529e-07, + "loss": 0.5481, + "step": 10630 + }, + { + "epoch": 0.86, + "grad_norm": 4.294809915851474, + "learning_rate": 4.809854160533923e-07, + "loss": 0.5414, + "step": 10631 + }, + { + "epoch": 0.86, + "grad_norm": 3.882161200259773, + "learning_rate": 4.804226678858936e-07, + "loss": 0.6763, + "step": 10632 + }, + { + "epoch": 0.86, + "grad_norm": 4.290195397872298, + "learning_rate": 4.79860232501606e-07, + "loss": 0.5978, + "step": 10633 + }, + { + "epoch": 0.86, + "grad_norm": 4.267078407063617, + "learning_rate": 4.7929810993945e-07, + "loss": 0.68, + "step": 10634 + }, + { + "epoch": 0.86, + "grad_norm": 2.6229768447322295, + "learning_rate": 4.787363002383299e-07, + "loss": 0.5809, + "step": 10635 + }, + { + "epoch": 0.86, + "grad_norm": 4.704167181546853, + "learning_rate": 4.781748034371253e-07, + "loss": 0.7956, + "step": 10636 + }, + { + "epoch": 0.86, + "grad_norm": 3.598967932543625, + "learning_rate": 4.776136195746972e-07, + "loss": 0.8454, + "step": 10637 + }, + { + "epoch": 0.86, + "grad_norm": 5.57438540593825, + "learning_rate": 4.770527486898808e-07, + "loss": 0.6648, + "step": 10638 + }, + { + "epoch": 0.86, + "grad_norm": 49.875161375206574, + "learning_rate": 4.764921908214948e-07, + "loss": 0.6805, + "step": 10639 + }, + { + "epoch": 0.86, + "grad_norm": 9.200926855274849, + "learning_rate": 4.759319460083295e-07, + "loss": 0.6228, + "step": 10640 + }, + { + "epoch": 0.86, + "grad_norm": 5.884702202467515, + "learning_rate": 4.75372014289161e-07, + "loss": 0.564, + "step": 10641 + }, + { + "epoch": 0.86, + "grad_norm": 3.5923936922622794, + "learning_rate": 4.748123957027379e-07, + "loss": 0.4832, + "step": 10642 + }, + { + "epoch": 0.86, + "grad_norm": 4.114500342579102, + "learning_rate": 4.7425309028778954e-07, + "loss": 0.6898, + "step": 10643 + }, + { + "epoch": 0.86, + "grad_norm": 11.659995047574768, + "learning_rate": 4.7369409808302457e-07, + "loss": 0.7404, + "step": 10644 + }, + { + "epoch": 0.86, + "grad_norm": 5.294514529587528, + "learning_rate": 4.731354191271265e-07, + "loss": 0.8083, + "step": 10645 + }, + { + "epoch": 0.86, + "grad_norm": 4.482671858791675, + "learning_rate": 4.725770534587637e-07, + "loss": 0.5486, + "step": 10646 + }, + { + "epoch": 0.86, + "grad_norm": 4.311126718705619, + "learning_rate": 4.7201900111657316e-07, + "loss": 0.6297, + "step": 10647 + }, + { + "epoch": 0.86, + "grad_norm": 3.29032404457384, + "learning_rate": 4.714612621391795e-07, + "loss": 0.4806, + "step": 10648 + }, + { + "epoch": 0.86, + "grad_norm": 3.843451611443095, + "learning_rate": 4.709038365651808e-07, + "loss": 0.5216, + "step": 10649 + }, + { + "epoch": 0.86, + "grad_norm": 4.776116006946723, + "learning_rate": 4.7034672443315274e-07, + "loss": 0.7396, + "step": 10650 + }, + { + "epoch": 0.87, + "grad_norm": 3.165981709076825, + "learning_rate": 4.697899257816535e-07, + "loss": 0.6285, + "step": 10651 + }, + { + "epoch": 0.87, + "grad_norm": 5.324935612151433, + "learning_rate": 4.6923344064921604e-07, + "loss": 0.5604, + "step": 10652 + }, + { + "epoch": 0.87, + "grad_norm": 3.5979812727132616, + "learning_rate": 4.6867726907435295e-07, + "loss": 0.733, + "step": 10653 + }, + { + "epoch": 0.87, + "grad_norm": 11.359870204681577, + "learning_rate": 4.6812141109555286e-07, + "loss": 0.6604, + "step": 10654 + }, + { + "epoch": 0.87, + "grad_norm": 2.708531677156082, + "learning_rate": 4.6756586675128724e-07, + "loss": 0.5802, + "step": 10655 + }, + { + "epoch": 0.87, + "grad_norm": 8.61350395342908, + "learning_rate": 4.670106360800025e-07, + "loss": 0.5981, + "step": 10656 + }, + { + "epoch": 0.87, + "grad_norm": 2.7432617918576176, + "learning_rate": 4.6645571912012245e-07, + "loss": 0.6139, + "step": 10657 + }, + { + "epoch": 0.87, + "grad_norm": 3.194952948779785, + "learning_rate": 4.659011159100535e-07, + "loss": 0.6095, + "step": 10658 + }, + { + "epoch": 0.87, + "grad_norm": 5.0003352575296525, + "learning_rate": 4.653468264881761e-07, + "loss": 0.705, + "step": 10659 + }, + { + "epoch": 0.87, + "grad_norm": 9.921719853622454, + "learning_rate": 4.647928508928512e-07, + "loss": 0.8292, + "step": 10660 + }, + { + "epoch": 0.87, + "grad_norm": 3.9676314298486037, + "learning_rate": 4.642391891624159e-07, + "loss": 0.714, + "step": 10661 + }, + { + "epoch": 0.87, + "grad_norm": 4.002695521266337, + "learning_rate": 4.6368584133518914e-07, + "loss": 0.6978, + "step": 10662 + }, + { + "epoch": 0.87, + "grad_norm": 4.146433874369412, + "learning_rate": 4.6313280744946396e-07, + "loss": 0.9193, + "step": 10663 + }, + { + "epoch": 0.87, + "grad_norm": 4.409137241818238, + "learning_rate": 4.625800875435166e-07, + "loss": 0.6236, + "step": 10664 + }, + { + "epoch": 0.87, + "grad_norm": 7.746631306750918, + "learning_rate": 4.620276816555963e-07, + "loss": 0.7241, + "step": 10665 + }, + { + "epoch": 0.87, + "grad_norm": 6.417075639192836, + "learning_rate": 4.6147558982393427e-07, + "loss": 0.8172, + "step": 10666 + }, + { + "epoch": 0.87, + "grad_norm": 3.4633927289202093, + "learning_rate": 4.6092381208673875e-07, + "loss": 0.5396, + "step": 10667 + }, + { + "epoch": 0.87, + "grad_norm": 3.424281430579176, + "learning_rate": 4.6037234848219424e-07, + "loss": 0.5794, + "step": 10668 + }, + { + "epoch": 0.87, + "grad_norm": 2.3934324478734585, + "learning_rate": 4.59821199048468e-07, + "loss": 0.6771, + "step": 10669 + }, + { + "epoch": 0.87, + "grad_norm": 2.4545270456651376, + "learning_rate": 4.592703638237017e-07, + "loss": 0.6353, + "step": 10670 + }, + { + "epoch": 0.87, + "grad_norm": 4.619502455479935, + "learning_rate": 4.5871984284601765e-07, + "loss": 0.6405, + "step": 10671 + }, + { + "epoch": 0.87, + "grad_norm": 7.174178438437912, + "learning_rate": 4.5816963615351486e-07, + "loss": 0.5364, + "step": 10672 + }, + { + "epoch": 0.87, + "grad_norm": 3.3650846113824997, + "learning_rate": 4.576197437842705e-07, + "loss": 0.5696, + "step": 10673 + }, + { + "epoch": 0.87, + "grad_norm": 3.24642689059261, + "learning_rate": 4.5707016577634156e-07, + "loss": 0.7475, + "step": 10674 + }, + { + "epoch": 0.87, + "grad_norm": 4.658879225526947, + "learning_rate": 4.565209021677608e-07, + "loss": 0.7568, + "step": 10675 + }, + { + "epoch": 0.87, + "grad_norm": 3.4726092868542944, + "learning_rate": 4.5597195299654285e-07, + "loss": 0.6556, + "step": 10676 + }, + { + "epoch": 0.87, + "grad_norm": 4.829099294606015, + "learning_rate": 4.554233183006762e-07, + "loss": 0.8412, + "step": 10677 + }, + { + "epoch": 0.87, + "grad_norm": 3.477059135767142, + "learning_rate": 4.5487499811813163e-07, + "loss": 0.6292, + "step": 10678 + }, + { + "epoch": 0.87, + "grad_norm": 4.443836704857228, + "learning_rate": 4.5432699248685597e-07, + "loss": 0.7187, + "step": 10679 + }, + { + "epoch": 0.87, + "grad_norm": 3.017378970938267, + "learning_rate": 4.537793014447739e-07, + "loss": 0.6979, + "step": 10680 + }, + { + "epoch": 0.87, + "grad_norm": 2.817235505882643, + "learning_rate": 4.532319250297901e-07, + "loss": 0.6393, + "step": 10681 + }, + { + "epoch": 0.87, + "grad_norm": 6.188456478269907, + "learning_rate": 4.526848632797848e-07, + "loss": 0.5979, + "step": 10682 + }, + { + "epoch": 0.87, + "grad_norm": 3.600301302996873, + "learning_rate": 4.5213811623261994e-07, + "loss": 0.6585, + "step": 10683 + }, + { + "epoch": 0.87, + "grad_norm": 4.438135214543242, + "learning_rate": 4.515916839261325e-07, + "loss": 0.7192, + "step": 10684 + }, + { + "epoch": 0.87, + "grad_norm": 29.624191162479622, + "learning_rate": 4.5104556639814055e-07, + "loss": 0.6984, + "step": 10685 + }, + { + "epoch": 0.87, + "grad_norm": 4.114865999884099, + "learning_rate": 4.504997636864378e-07, + "loss": 0.6885, + "step": 10686 + }, + { + "epoch": 0.87, + "grad_norm": 4.816215245279651, + "learning_rate": 4.4995427582879725e-07, + "loss": 0.722, + "step": 10687 + }, + { + "epoch": 0.87, + "grad_norm": 4.518576055772596, + "learning_rate": 4.494091028629699e-07, + "loss": 0.6212, + "step": 10688 + }, + { + "epoch": 0.87, + "grad_norm": 9.414125447537662, + "learning_rate": 4.488642448266861e-07, + "loss": 0.6031, + "step": 10689 + }, + { + "epoch": 0.87, + "grad_norm": 4.013569962235598, + "learning_rate": 4.4831970175765293e-07, + "loss": 0.6197, + "step": 10690 + }, + { + "epoch": 0.87, + "grad_norm": 3.477407045598223, + "learning_rate": 4.4777547369355523e-07, + "loss": 0.7972, + "step": 10691 + }, + { + "epoch": 0.87, + "grad_norm": 4.830427915339826, + "learning_rate": 4.472315606720601e-07, + "loss": 0.7668, + "step": 10692 + }, + { + "epoch": 0.87, + "grad_norm": 3.4004253754601206, + "learning_rate": 4.4668796273080515e-07, + "loss": 0.6022, + "step": 10693 + }, + { + "epoch": 0.87, + "grad_norm": 10.372112958292616, + "learning_rate": 4.461446799074143e-07, + "loss": 0.4801, + "step": 10694 + }, + { + "epoch": 0.87, + "grad_norm": 5.1516987134369066, + "learning_rate": 4.4560171223948457e-07, + "loss": 0.6883, + "step": 10695 + }, + { + "epoch": 0.87, + "grad_norm": 3.6732652956552703, + "learning_rate": 4.4505905976459374e-07, + "loss": 0.5188, + "step": 10696 + }, + { + "epoch": 0.87, + "grad_norm": 3.583479564699838, + "learning_rate": 4.445167225202962e-07, + "loss": 0.5876, + "step": 10697 + }, + { + "epoch": 0.87, + "grad_norm": 4.366386547835916, + "learning_rate": 4.4397470054412415e-07, + "loss": 0.6699, + "step": 10698 + }, + { + "epoch": 0.87, + "grad_norm": 3.5871270381617655, + "learning_rate": 4.434329938735921e-07, + "loss": 0.5064, + "step": 10699 + }, + { + "epoch": 0.87, + "grad_norm": 3.2086242418846607, + "learning_rate": 4.428916025461855e-07, + "loss": 0.6888, + "step": 10700 + }, + { + "epoch": 0.87, + "grad_norm": 4.997365684494536, + "learning_rate": 4.4235052659937437e-07, + "loss": 0.5655, + "step": 10701 + }, + { + "epoch": 0.87, + "grad_norm": 2.7224291586339917, + "learning_rate": 4.418097660706039e-07, + "loss": 0.5888, + "step": 10702 + }, + { + "epoch": 0.87, + "grad_norm": 5.109252829432643, + "learning_rate": 4.4126932099729903e-07, + "loss": 0.608, + "step": 10703 + }, + { + "epoch": 0.87, + "grad_norm": 11.04049901556695, + "learning_rate": 4.40729191416861e-07, + "loss": 0.7794, + "step": 10704 + }, + { + "epoch": 0.87, + "grad_norm": 4.793674277503852, + "learning_rate": 4.40189377366671e-07, + "loss": 0.633, + "step": 10705 + }, + { + "epoch": 0.87, + "grad_norm": 2.897300522636713, + "learning_rate": 4.396498788840864e-07, + "loss": 0.6524, + "step": 10706 + }, + { + "epoch": 0.87, + "grad_norm": 2.73339379744802, + "learning_rate": 4.3911069600644396e-07, + "loss": 0.6855, + "step": 10707 + }, + { + "epoch": 0.87, + "grad_norm": 4.155855647493374, + "learning_rate": 4.3857182877105997e-07, + "loss": 0.7516, + "step": 10708 + }, + { + "epoch": 0.87, + "grad_norm": 3.8774944705788434, + "learning_rate": 4.380332772152257e-07, + "loss": 0.689, + "step": 10709 + }, + { + "epoch": 0.87, + "grad_norm": 4.063910709280885, + "learning_rate": 4.3749504137621413e-07, + "loss": 0.5842, + "step": 10710 + }, + { + "epoch": 0.87, + "grad_norm": 9.913068545118325, + "learning_rate": 4.369571212912732e-07, + "loss": 0.7191, + "step": 10711 + }, + { + "epoch": 0.87, + "grad_norm": 3.579721761886697, + "learning_rate": 4.36419516997631e-07, + "loss": 0.6416, + "step": 10712 + }, + { + "epoch": 0.87, + "grad_norm": 3.4509672210560884, + "learning_rate": 4.3588222853249207e-07, + "loss": 0.8461, + "step": 10713 + }, + { + "epoch": 0.87, + "grad_norm": 3.241141784706939, + "learning_rate": 4.3534525593304177e-07, + "loss": 0.6639, + "step": 10714 + }, + { + "epoch": 0.87, + "grad_norm": 8.656809468557332, + "learning_rate": 4.348085992364415e-07, + "loss": 0.6811, + "step": 10715 + }, + { + "epoch": 0.87, + "grad_norm": 6.0464387930865895, + "learning_rate": 4.342722584798298e-07, + "loss": 0.6983, + "step": 10716 + }, + { + "epoch": 0.87, + "grad_norm": 4.568471405690998, + "learning_rate": 4.33736233700327e-07, + "loss": 0.5458, + "step": 10717 + }, + { + "epoch": 0.87, + "grad_norm": 7.755403977461108, + "learning_rate": 4.332005249350274e-07, + "loss": 0.6554, + "step": 10718 + }, + { + "epoch": 0.87, + "grad_norm": 2.469411592698162, + "learning_rate": 4.3266513222100846e-07, + "loss": 0.646, + "step": 10719 + }, + { + "epoch": 0.87, + "grad_norm": 4.1011309925795425, + "learning_rate": 4.3213005559531893e-07, + "loss": 0.6755, + "step": 10720 + }, + { + "epoch": 0.87, + "grad_norm": 2.8232968513780894, + "learning_rate": 4.31595295094992e-07, + "loss": 0.7011, + "step": 10721 + }, + { + "epoch": 0.87, + "grad_norm": 7.189138763845999, + "learning_rate": 4.3106085075703576e-07, + "loss": 0.5735, + "step": 10722 + }, + { + "epoch": 0.87, + "grad_norm": 4.9764717974193395, + "learning_rate": 4.3052672261843564e-07, + "loss": 0.7338, + "step": 10723 + }, + { + "epoch": 0.87, + "grad_norm": 3.965778039093081, + "learning_rate": 4.2999291071615934e-07, + "loss": 0.6783, + "step": 10724 + }, + { + "epoch": 0.87, + "grad_norm": 7.1017850062923245, + "learning_rate": 4.294594150871489e-07, + "loss": 0.7119, + "step": 10725 + }, + { + "epoch": 0.87, + "grad_norm": 4.09854637897419, + "learning_rate": 4.289262357683255e-07, + "loss": 0.579, + "step": 10726 + }, + { + "epoch": 0.87, + "grad_norm": 3.395027399396893, + "learning_rate": 4.283933727965872e-07, + "loss": 0.6879, + "step": 10727 + }, + { + "epoch": 0.87, + "grad_norm": 2.561964856597612, + "learning_rate": 4.278608262088141e-07, + "loss": 0.5705, + "step": 10728 + }, + { + "epoch": 0.87, + "grad_norm": 9.143239212745891, + "learning_rate": 4.2732859604185994e-07, + "loss": 0.8148, + "step": 10729 + }, + { + "epoch": 0.87, + "grad_norm": 3.236095773149485, + "learning_rate": 4.267966823325581e-07, + "loss": 0.8488, + "step": 10730 + }, + { + "epoch": 0.87, + "grad_norm": 3.1534033487066147, + "learning_rate": 4.2626508511772247e-07, + "loss": 0.5129, + "step": 10731 + }, + { + "epoch": 0.87, + "grad_norm": 5.01009947183095, + "learning_rate": 4.2573380443414083e-07, + "loss": 0.7155, + "step": 10732 + }, + { + "epoch": 0.87, + "grad_norm": 5.560043806292731, + "learning_rate": 4.2520284031858206e-07, + "loss": 0.7596, + "step": 10733 + }, + { + "epoch": 0.87, + "grad_norm": 3.6784894874431444, + "learning_rate": 4.2467219280779183e-07, + "loss": 0.6808, + "step": 10734 + }, + { + "epoch": 0.87, + "grad_norm": 5.4886375162283, + "learning_rate": 4.241418619384946e-07, + "loss": 0.6529, + "step": 10735 + }, + { + "epoch": 0.87, + "grad_norm": 2.836470937591066, + "learning_rate": 4.236118477473927e-07, + "loss": 0.6393, + "step": 10736 + }, + { + "epoch": 0.87, + "grad_norm": 2.80145808918984, + "learning_rate": 4.230821502711657e-07, + "loss": 0.7841, + "step": 10737 + }, + { + "epoch": 0.87, + "grad_norm": 4.098709739120388, + "learning_rate": 4.225527695464732e-07, + "loss": 0.5967, + "step": 10738 + }, + { + "epoch": 0.87, + "grad_norm": 7.4095888310814475, + "learning_rate": 4.2202370560995076e-07, + "loss": 0.5959, + "step": 10739 + }, + { + "epoch": 0.87, + "grad_norm": 3.9753327539729195, + "learning_rate": 4.2149495849821365e-07, + "loss": 0.679, + "step": 10740 + }, + { + "epoch": 0.87, + "grad_norm": 3.8556580480137637, + "learning_rate": 4.209665282478531e-07, + "loss": 0.7672, + "step": 10741 + }, + { + "epoch": 0.87, + "grad_norm": 4.723370368903523, + "learning_rate": 4.2043841489544156e-07, + "loss": 0.6165, + "step": 10742 + }, + { + "epoch": 0.87, + "grad_norm": 27.050325456903867, + "learning_rate": 4.199106184775259e-07, + "loss": 0.551, + "step": 10743 + }, + { + "epoch": 0.87, + "grad_norm": 5.467307375636786, + "learning_rate": 4.193831390306352e-07, + "loss": 0.7173, + "step": 10744 + }, + { + "epoch": 0.87, + "grad_norm": 3.0401008008734647, + "learning_rate": 4.188559765912731e-07, + "loss": 0.5714, + "step": 10745 + }, + { + "epoch": 0.87, + "grad_norm": 4.128529810729945, + "learning_rate": 4.183291311959231e-07, + "loss": 0.641, + "step": 10746 + }, + { + "epoch": 0.87, + "grad_norm": 2.485994496870722, + "learning_rate": 4.1780260288104504e-07, + "loss": 0.4747, + "step": 10747 + }, + { + "epoch": 0.87, + "grad_norm": 4.043381728166808, + "learning_rate": 4.172763916830785e-07, + "loss": 0.7358, + "step": 10748 + }, + { + "epoch": 0.87, + "grad_norm": 4.481993697998047, + "learning_rate": 4.167504976384418e-07, + "loss": 0.6272, + "step": 10749 + }, + { + "epoch": 0.87, + "grad_norm": 3.5276661692307334, + "learning_rate": 4.1622492078352783e-07, + "loss": 0.6207, + "step": 10750 + }, + { + "epoch": 0.87, + "grad_norm": 2.339290533645544, + "learning_rate": 4.156996611547126e-07, + "loss": 0.5842, + "step": 10751 + }, + { + "epoch": 0.87, + "grad_norm": 5.441490813999628, + "learning_rate": 4.1517471878834536e-07, + "loss": 0.5888, + "step": 10752 + }, + { + "epoch": 0.87, + "grad_norm": 8.177837937407425, + "learning_rate": 4.1465009372075647e-07, + "loss": 0.5467, + "step": 10753 + }, + { + "epoch": 0.87, + "grad_norm": 3.867151964631239, + "learning_rate": 4.141257859882525e-07, + "loss": 0.6577, + "step": 10754 + }, + { + "epoch": 0.87, + "grad_norm": 3.6094521000745527, + "learning_rate": 4.136017956271188e-07, + "loss": 0.6963, + "step": 10755 + }, + { + "epoch": 0.87, + "grad_norm": 5.441529914069627, + "learning_rate": 4.130781226736197e-07, + "loss": 0.6772, + "step": 10756 + }, + { + "epoch": 0.87, + "grad_norm": 3.9750203704806393, + "learning_rate": 4.125547671639957e-07, + "loss": 0.7016, + "step": 10757 + }, + { + "epoch": 0.87, + "grad_norm": 2.9720566962814456, + "learning_rate": 4.1203172913446774e-07, + "loss": 0.7542, + "step": 10758 + }, + { + "epoch": 0.87, + "grad_norm": 2.375481272321517, + "learning_rate": 4.1150900862123145e-07, + "loss": 0.5636, + "step": 10759 + }, + { + "epoch": 0.87, + "grad_norm": 2.9125915907391353, + "learning_rate": 4.109866056604633e-07, + "loss": 0.5838, + "step": 10760 + }, + { + "epoch": 0.87, + "grad_norm": 7.754166412312943, + "learning_rate": 4.1046452028831786e-07, + "loss": 0.7411, + "step": 10761 + }, + { + "epoch": 0.87, + "grad_norm": 3.2216364944218623, + "learning_rate": 4.099427525409239e-07, + "loss": 0.8449, + "step": 10762 + }, + { + "epoch": 0.87, + "grad_norm": 8.691515045128018, + "learning_rate": 4.0942130245439414e-07, + "loss": 0.8273, + "step": 10763 + }, + { + "epoch": 0.87, + "grad_norm": 3.1807086757421694, + "learning_rate": 4.089001700648143e-07, + "loss": 0.6819, + "step": 10764 + }, + { + "epoch": 0.87, + "grad_norm": 3.0881183837050785, + "learning_rate": 4.0837935540825214e-07, + "loss": 0.729, + "step": 10765 + }, + { + "epoch": 0.87, + "grad_norm": 4.812208167391301, + "learning_rate": 4.078588585207477e-07, + "loss": 0.7293, + "step": 10766 + }, + { + "epoch": 0.87, + "grad_norm": 3.316448712526414, + "learning_rate": 4.0733867943832607e-07, + "loss": 0.5389, + "step": 10767 + }, + { + "epoch": 0.87, + "grad_norm": 7.863321853301671, + "learning_rate": 4.068188181969851e-07, + "loss": 0.5217, + "step": 10768 + }, + { + "epoch": 0.87, + "grad_norm": 3.706238670079514, + "learning_rate": 4.0629927483270326e-07, + "loss": 0.7675, + "step": 10769 + }, + { + "epoch": 0.87, + "grad_norm": 5.291048939953106, + "learning_rate": 4.0578004938143624e-07, + "loss": 0.5705, + "step": 10770 + }, + { + "epoch": 0.87, + "grad_norm": 22.34300728010316, + "learning_rate": 4.0526114187911636e-07, + "loss": 0.5999, + "step": 10771 + }, + { + "epoch": 0.87, + "grad_norm": 2.3222506841879937, + "learning_rate": 4.047425523616577e-07, + "loss": 0.6545, + "step": 10772 + }, + { + "epoch": 0.87, + "grad_norm": 3.886792034387991, + "learning_rate": 4.0422428086494713e-07, + "loss": 0.5307, + "step": 10773 + }, + { + "epoch": 0.88, + "grad_norm": 7.531583274532244, + "learning_rate": 4.037063274248548e-07, + "loss": 0.5338, + "step": 10774 + }, + { + "epoch": 0.88, + "grad_norm": 3.635812634928802, + "learning_rate": 4.0318869207722433e-07, + "loss": 0.6593, + "step": 10775 + }, + { + "epoch": 0.88, + "grad_norm": 5.122985585651787, + "learning_rate": 4.026713748578809e-07, + "loss": 0.7173, + "step": 10776 + }, + { + "epoch": 0.88, + "grad_norm": 6.834422601280923, + "learning_rate": 4.0215437580262584e-07, + "loss": 0.6338, + "step": 10777 + }, + { + "epoch": 0.88, + "grad_norm": 3.237068795115182, + "learning_rate": 4.0163769494723836e-07, + "loss": 0.8367, + "step": 10778 + }, + { + "epoch": 0.88, + "grad_norm": 6.514229708552531, + "learning_rate": 4.0112133232747596e-07, + "loss": 0.6697, + "step": 10779 + }, + { + "epoch": 0.88, + "grad_norm": 5.851794017685984, + "learning_rate": 4.006052879790734e-07, + "loss": 0.6796, + "step": 10780 + }, + { + "epoch": 0.88, + "grad_norm": 8.39969653473002, + "learning_rate": 4.0008956193774597e-07, + "loss": 0.5769, + "step": 10781 + }, + { + "epoch": 0.88, + "grad_norm": 3.807743145700366, + "learning_rate": 3.995741542391834e-07, + "loss": 0.8037, + "step": 10782 + }, + { + "epoch": 0.88, + "grad_norm": 4.525909173501873, + "learning_rate": 3.9905906491905676e-07, + "loss": 0.8172, + "step": 10783 + }, + { + "epoch": 0.88, + "grad_norm": 4.234772635311939, + "learning_rate": 3.98544294013013e-07, + "loss": 0.7275, + "step": 10784 + }, + { + "epoch": 0.88, + "grad_norm": 3.9127935712544764, + "learning_rate": 3.9802984155667744e-07, + "loss": 0.5657, + "step": 10785 + }, + { + "epoch": 0.88, + "grad_norm": 24.338474949032975, + "learning_rate": 3.9751570758565284e-07, + "loss": 0.6679, + "step": 10786 + }, + { + "epoch": 0.88, + "grad_norm": 3.4639828238236094, + "learning_rate": 3.970018921355201e-07, + "loss": 0.7563, + "step": 10787 + }, + { + "epoch": 0.88, + "grad_norm": 3.3006041412268003, + "learning_rate": 3.964883952418402e-07, + "loss": 0.7276, + "step": 10788 + }, + { + "epoch": 0.88, + "grad_norm": 5.138867709159208, + "learning_rate": 3.9597521694014875e-07, + "loss": 0.6436, + "step": 10789 + }, + { + "epoch": 0.88, + "grad_norm": 3.484028622598305, + "learning_rate": 3.9546235726596273e-07, + "loss": 0.7474, + "step": 10790 + }, + { + "epoch": 0.88, + "grad_norm": 3.6119127333558207, + "learning_rate": 3.949498162547727e-07, + "loss": 0.6258, + "step": 10791 + }, + { + "epoch": 0.88, + "grad_norm": 4.158881464932098, + "learning_rate": 3.9443759394205303e-07, + "loss": 0.8304, + "step": 10792 + }, + { + "epoch": 0.88, + "grad_norm": 3.4376506485900324, + "learning_rate": 3.9392569036324936e-07, + "loss": 0.6527, + "step": 10793 + }, + { + "epoch": 0.88, + "grad_norm": 4.1278923531363025, + "learning_rate": 3.9341410555379103e-07, + "loss": 0.6089, + "step": 10794 + }, + { + "epoch": 0.88, + "grad_norm": 16.98344433356969, + "learning_rate": 3.929028395490819e-07, + "loss": 0.6806, + "step": 10795 + }, + { + "epoch": 0.88, + "grad_norm": 4.610832433200752, + "learning_rate": 3.923918923845038e-07, + "loss": 0.7049, + "step": 10796 + }, + { + "epoch": 0.88, + "grad_norm": 12.824302302694603, + "learning_rate": 3.9188126409542003e-07, + "loss": 0.7757, + "step": 10797 + }, + { + "epoch": 0.88, + "grad_norm": 3.2547259255027003, + "learning_rate": 3.9137095471716793e-07, + "loss": 0.6869, + "step": 10798 + }, + { + "epoch": 0.88, + "grad_norm": 6.449743102549887, + "learning_rate": 3.908609642850636e-07, + "loss": 0.6903, + "step": 10799 + }, + { + "epoch": 0.88, + "grad_norm": 8.712775558014576, + "learning_rate": 3.9035129283440165e-07, + "loss": 0.6427, + "step": 10800 + }, + { + "epoch": 0.88, + "grad_norm": 5.89446468517924, + "learning_rate": 3.898419404004555e-07, + "loss": 0.6401, + "step": 10801 + }, + { + "epoch": 0.88, + "grad_norm": 3.6309736738123393, + "learning_rate": 3.893329070184754e-07, + "loss": 0.7009, + "step": 10802 + }, + { + "epoch": 0.88, + "grad_norm": 3.052039845785766, + "learning_rate": 3.88824192723688e-07, + "loss": 0.7364, + "step": 10803 + }, + { + "epoch": 0.88, + "grad_norm": 3.6550574277065744, + "learning_rate": 3.8831579755130243e-07, + "loss": 0.7283, + "step": 10804 + }, + { + "epoch": 0.88, + "grad_norm": 4.480008765502872, + "learning_rate": 3.878077215365006e-07, + "loss": 0.6016, + "step": 10805 + }, + { + "epoch": 0.88, + "grad_norm": 3.24592424617487, + "learning_rate": 3.872999647144454e-07, + "loss": 0.5713, + "step": 10806 + }, + { + "epoch": 0.88, + "grad_norm": 3.3614592661674436, + "learning_rate": 3.867925271202755e-07, + "loss": 0.745, + "step": 10807 + }, + { + "epoch": 0.88, + "grad_norm": 3.61736760124939, + "learning_rate": 3.8628540878911105e-07, + "loss": 0.7137, + "step": 10808 + }, + { + "epoch": 0.88, + "grad_norm": 4.091836165510375, + "learning_rate": 3.857786097560462e-07, + "loss": 0.6086, + "step": 10809 + }, + { + "epoch": 0.88, + "grad_norm": 4.389002292409124, + "learning_rate": 3.852721300561546e-07, + "loss": 0.6205, + "step": 10810 + }, + { + "epoch": 0.88, + "grad_norm": 3.8487328516598094, + "learning_rate": 3.8476596972449043e-07, + "loss": 0.6733, + "step": 10811 + }, + { + "epoch": 0.88, + "grad_norm": 22.488528845979367, + "learning_rate": 3.84260128796079e-07, + "loss": 0.6, + "step": 10812 + }, + { + "epoch": 0.88, + "grad_norm": 4.12196490116398, + "learning_rate": 3.8375460730593005e-07, + "loss": 0.6711, + "step": 10813 + }, + { + "epoch": 0.88, + "grad_norm": 4.105090609792938, + "learning_rate": 3.8324940528902845e-07, + "loss": 0.6909, + "step": 10814 + }, + { + "epoch": 0.88, + "grad_norm": 3.1301202383461035, + "learning_rate": 3.8274452278033836e-07, + "loss": 0.6067, + "step": 10815 + }, + { + "epoch": 0.88, + "grad_norm": 6.690917712219051, + "learning_rate": 3.8223995981479855e-07, + "loss": 0.7009, + "step": 10816 + }, + { + "epoch": 0.88, + "grad_norm": 4.088085968418896, + "learning_rate": 3.8173571642733056e-07, + "loss": 0.4663, + "step": 10817 + }, + { + "epoch": 0.88, + "grad_norm": 2.864656508894658, + "learning_rate": 3.812317926528297e-07, + "loss": 0.57, + "step": 10818 + }, + { + "epoch": 0.88, + "grad_norm": 4.0731893650097035, + "learning_rate": 3.80728188526171e-07, + "loss": 0.8613, + "step": 10819 + }, + { + "epoch": 0.88, + "grad_norm": 15.872279201065492, + "learning_rate": 3.8022490408220757e-07, + "loss": 0.6205, + "step": 10820 + }, + { + "epoch": 0.88, + "grad_norm": 6.715321180448168, + "learning_rate": 3.797219393557677e-07, + "loss": 0.6215, + "step": 10821 + }, + { + "epoch": 0.88, + "grad_norm": 5.611492008451022, + "learning_rate": 3.792192943816625e-07, + "loss": 0.8028, + "step": 10822 + }, + { + "epoch": 0.88, + "grad_norm": 3.672703192369442, + "learning_rate": 3.787169691946763e-07, + "loss": 0.5803, + "step": 10823 + }, + { + "epoch": 0.88, + "grad_norm": 4.222360364051522, + "learning_rate": 3.78214963829574e-07, + "loss": 0.5888, + "step": 10824 + }, + { + "epoch": 0.88, + "grad_norm": 6.768556937185829, + "learning_rate": 3.7771327832109795e-07, + "loss": 0.7202, + "step": 10825 + }, + { + "epoch": 0.88, + "grad_norm": 3.3218425955721544, + "learning_rate": 3.772119127039675e-07, + "loss": 0.6861, + "step": 10826 + }, + { + "epoch": 0.88, + "grad_norm": 13.579685371603079, + "learning_rate": 3.7671086701287994e-07, + "loss": 0.6153, + "step": 10827 + }, + { + "epoch": 0.88, + "grad_norm": 8.805727296263525, + "learning_rate": 3.762101412825098e-07, + "loss": 0.4538, + "step": 10828 + }, + { + "epoch": 0.88, + "grad_norm": 3.4703984014085156, + "learning_rate": 3.757097355475131e-07, + "loss": 0.6611, + "step": 10829 + }, + { + "epoch": 0.88, + "grad_norm": 4.523117551334587, + "learning_rate": 3.752096498425184e-07, + "loss": 0.6932, + "step": 10830 + }, + { + "epoch": 0.88, + "grad_norm": 6.265916493785175, + "learning_rate": 3.7470988420213796e-07, + "loss": 0.691, + "step": 10831 + }, + { + "epoch": 0.88, + "grad_norm": 5.668490577994119, + "learning_rate": 3.7421043866095465e-07, + "loss": 0.737, + "step": 10832 + }, + { + "epoch": 0.88, + "grad_norm": 3.284505113694467, + "learning_rate": 3.7371131325353695e-07, + "loss": 0.7774, + "step": 10833 + }, + { + "epoch": 0.88, + "grad_norm": 2.9792978064078075, + "learning_rate": 3.73212508014425e-07, + "loss": 0.6887, + "step": 10834 + }, + { + "epoch": 0.88, + "grad_norm": 8.134236049816344, + "learning_rate": 3.727140229781401e-07, + "loss": 0.6874, + "step": 10835 + }, + { + "epoch": 0.88, + "grad_norm": 7.138657160872964, + "learning_rate": 3.722158581791813e-07, + "loss": 0.6499, + "step": 10836 + }, + { + "epoch": 0.88, + "grad_norm": 3.377567140291227, + "learning_rate": 3.7171801365202266e-07, + "loss": 0.6613, + "step": 10837 + }, + { + "epoch": 0.88, + "grad_norm": 4.651548329583782, + "learning_rate": 3.7122048943112165e-07, + "loss": 0.7572, + "step": 10838 + }, + { + "epoch": 0.88, + "grad_norm": 3.429868901871164, + "learning_rate": 3.707232855509063e-07, + "loss": 0.7628, + "step": 10839 + }, + { + "epoch": 0.88, + "grad_norm": 3.03028237683892, + "learning_rate": 3.702264020457885e-07, + "loss": 0.6038, + "step": 10840 + }, + { + "epoch": 0.88, + "grad_norm": 4.351056190404571, + "learning_rate": 3.6972983895015467e-07, + "loss": 0.6581, + "step": 10841 + }, + { + "epoch": 0.88, + "grad_norm": 18.841013864930726, + "learning_rate": 3.6923359629837117e-07, + "loss": 0.7694, + "step": 10842 + }, + { + "epoch": 0.88, + "grad_norm": 5.015133105399184, + "learning_rate": 3.687376741247811e-07, + "loss": 0.7624, + "step": 10843 + }, + { + "epoch": 0.88, + "grad_norm": 4.408370547016043, + "learning_rate": 3.682420724637031e-07, + "loss": 0.6106, + "step": 10844 + }, + { + "epoch": 0.88, + "grad_norm": 2.5022558791540153, + "learning_rate": 3.677467913494398e-07, + "loss": 0.6293, + "step": 10845 + }, + { + "epoch": 0.88, + "grad_norm": 5.776660231366383, + "learning_rate": 3.6725183081626424e-07, + "loss": 0.7405, + "step": 10846 + }, + { + "epoch": 0.88, + "grad_norm": 4.746430416783898, + "learning_rate": 3.6675719089843245e-07, + "loss": 0.6463, + "step": 10847 + }, + { + "epoch": 0.88, + "grad_norm": 6.316204321422619, + "learning_rate": 3.662628716301758e-07, + "loss": 0.5675, + "step": 10848 + }, + { + "epoch": 0.88, + "grad_norm": 4.400179913488069, + "learning_rate": 3.657688730457054e-07, + "loss": 0.6398, + "step": 10849 + }, + { + "epoch": 0.88, + "grad_norm": 3.286898495998081, + "learning_rate": 3.6527519517920886e-07, + "loss": 0.752, + "step": 10850 + }, + { + "epoch": 0.88, + "grad_norm": 3.0158511600315054, + "learning_rate": 3.64781838064851e-07, + "loss": 0.4571, + "step": 10851 + }, + { + "epoch": 0.88, + "grad_norm": 3.6313709695818357, + "learning_rate": 3.642888017367763e-07, + "loss": 0.64, + "step": 10852 + }, + { + "epoch": 0.88, + "grad_norm": 4.846280850082306, + "learning_rate": 3.63796086229104e-07, + "loss": 0.7249, + "step": 10853 + }, + { + "epoch": 0.88, + "grad_norm": 4.08817591106842, + "learning_rate": 3.633036915759358e-07, + "loss": 0.6381, + "step": 10854 + }, + { + "epoch": 0.88, + "grad_norm": 7.201152463679357, + "learning_rate": 3.628116178113461e-07, + "loss": 0.8171, + "step": 10855 + }, + { + "epoch": 0.88, + "grad_norm": 4.606024947970406, + "learning_rate": 3.6231986496939153e-07, + "loss": 0.5831, + "step": 10856 + }, + { + "epoch": 0.88, + "grad_norm": 7.281297639751838, + "learning_rate": 3.618284330841032e-07, + "loss": 0.7388, + "step": 10857 + }, + { + "epoch": 0.88, + "grad_norm": 3.6732143611130845, + "learning_rate": 3.6133732218949223e-07, + "loss": 0.6119, + "step": 10858 + }, + { + "epoch": 0.88, + "grad_norm": 4.408555356895679, + "learning_rate": 3.608465323195454e-07, + "loss": 0.6128, + "step": 10859 + }, + { + "epoch": 0.88, + "grad_norm": 9.460299777518019, + "learning_rate": 3.603560635082287e-07, + "loss": 0.6298, + "step": 10860 + }, + { + "epoch": 0.88, + "grad_norm": 5.5701445925225785, + "learning_rate": 3.598659157894868e-07, + "loss": 0.7678, + "step": 10861 + }, + { + "epoch": 0.88, + "grad_norm": 5.58053137794694, + "learning_rate": 3.593760891972392e-07, + "loss": 0.6718, + "step": 10862 + }, + { + "epoch": 0.88, + "grad_norm": 6.558371047788067, + "learning_rate": 3.5888658376538654e-07, + "loss": 0.58, + "step": 10863 + }, + { + "epoch": 0.88, + "grad_norm": 3.494515364394613, + "learning_rate": 3.583973995278056e-07, + "loss": 0.7092, + "step": 10864 + }, + { + "epoch": 0.88, + "grad_norm": 2.7476416955410006, + "learning_rate": 3.5790853651835043e-07, + "loss": 0.574, + "step": 10865 + }, + { + "epoch": 0.88, + "grad_norm": 2.8351782462483706, + "learning_rate": 3.574199947708529e-07, + "loss": 0.6011, + "step": 10866 + }, + { + "epoch": 0.88, + "grad_norm": 5.00961626519902, + "learning_rate": 3.5693177431912473e-07, + "loss": 0.781, + "step": 10867 + }, + { + "epoch": 0.88, + "grad_norm": 3.9902535354987, + "learning_rate": 3.564438751969523e-07, + "loss": 0.5352, + "step": 10868 + }, + { + "epoch": 0.88, + "grad_norm": 4.832401005192465, + "learning_rate": 3.55956297438102e-07, + "loss": 0.6396, + "step": 10869 + }, + { + "epoch": 0.88, + "grad_norm": 6.607730565230629, + "learning_rate": 3.554690410763173e-07, + "loss": 0.6863, + "step": 10870 + }, + { + "epoch": 0.88, + "grad_norm": 6.44160700052036, + "learning_rate": 3.5498210614532013e-07, + "loss": 0.7407, + "step": 10871 + }, + { + "epoch": 0.88, + "grad_norm": 3.3963547794851108, + "learning_rate": 3.5449549267880803e-07, + "loss": 0.6021, + "step": 10872 + }, + { + "epoch": 0.88, + "grad_norm": 3.433347639161826, + "learning_rate": 3.5400920071045787e-07, + "loss": 0.721, + "step": 10873 + }, + { + "epoch": 0.88, + "grad_norm": 6.9244875742490155, + "learning_rate": 3.5352323027392497e-07, + "loss": 0.7314, + "step": 10874 + }, + { + "epoch": 0.88, + "grad_norm": 5.165494561102616, + "learning_rate": 3.530375814028414e-07, + "loss": 0.7358, + "step": 10875 + }, + { + "epoch": 0.88, + "grad_norm": 14.7942220002371, + "learning_rate": 3.525522541308163e-07, + "loss": 0.6501, + "step": 10876 + }, + { + "epoch": 0.88, + "grad_norm": 4.20282943510205, + "learning_rate": 3.520672484914384e-07, + "loss": 0.7039, + "step": 10877 + }, + { + "epoch": 0.88, + "grad_norm": 3.123767783444509, + "learning_rate": 3.51582564518273e-07, + "loss": 0.6741, + "step": 10878 + }, + { + "epoch": 0.88, + "grad_norm": 4.173298116133693, + "learning_rate": 3.510982022448628e-07, + "loss": 0.7465, + "step": 10879 + }, + { + "epoch": 0.88, + "grad_norm": 4.388081029425778, + "learning_rate": 3.506141617047282e-07, + "loss": 0.8184, + "step": 10880 + }, + { + "epoch": 0.88, + "grad_norm": 3.9527444792979405, + "learning_rate": 3.5013044293136957e-07, + "loss": 0.6049, + "step": 10881 + }, + { + "epoch": 0.88, + "grad_norm": 8.214400547552188, + "learning_rate": 3.496470459582624e-07, + "loss": 0.721, + "step": 10882 + }, + { + "epoch": 0.88, + "grad_norm": 3.019469287232392, + "learning_rate": 3.4916397081885935e-07, + "loss": 0.6151, + "step": 10883 + }, + { + "epoch": 0.88, + "grad_norm": 2.7579013561345302, + "learning_rate": 3.4868121754659533e-07, + "loss": 0.6887, + "step": 10884 + }, + { + "epoch": 0.88, + "grad_norm": 5.3501040291615505, + "learning_rate": 3.48198786174877e-07, + "loss": 0.7722, + "step": 10885 + }, + { + "epoch": 0.88, + "grad_norm": 3.1480402326081856, + "learning_rate": 3.477166767370932e-07, + "loss": 0.6364, + "step": 10886 + }, + { + "epoch": 0.88, + "grad_norm": 6.992128079977042, + "learning_rate": 3.4723488926660777e-07, + "loss": 0.5978, + "step": 10887 + }, + { + "epoch": 0.88, + "grad_norm": 3.1966450633847643, + "learning_rate": 3.467534237967651e-07, + "loss": 0.6207, + "step": 10888 + }, + { + "epoch": 0.88, + "grad_norm": 4.275108895684727, + "learning_rate": 3.462722803608848e-07, + "loss": 0.7046, + "step": 10889 + }, + { + "epoch": 0.88, + "grad_norm": 2.5054385421132968, + "learning_rate": 3.457914589922645e-07, + "loss": 0.641, + "step": 10890 + }, + { + "epoch": 0.88, + "grad_norm": 4.428693232769049, + "learning_rate": 3.4531095972418103e-07, + "loss": 0.6462, + "step": 10891 + }, + { + "epoch": 0.88, + "grad_norm": 2.5645204253873395, + "learning_rate": 3.448307825898872e-07, + "loss": 0.6611, + "step": 10892 + }, + { + "epoch": 0.88, + "grad_norm": 3.477850476916759, + "learning_rate": 3.443509276226148e-07, + "loss": 0.7433, + "step": 10893 + }, + { + "epoch": 0.88, + "grad_norm": 9.552941877146717, + "learning_rate": 3.438713948555722e-07, + "loss": 0.6224, + "step": 10894 + }, + { + "epoch": 0.88, + "grad_norm": 4.62616435209956, + "learning_rate": 3.433921843219468e-07, + "loss": 0.6496, + "step": 10895 + }, + { + "epoch": 0.88, + "grad_norm": 3.6714003301140625, + "learning_rate": 3.4291329605490196e-07, + "loss": 0.723, + "step": 10896 + }, + { + "epoch": 0.89, + "grad_norm": 5.767331545991184, + "learning_rate": 3.4243473008758134e-07, + "loss": 0.7232, + "step": 10897 + }, + { + "epoch": 0.89, + "grad_norm": 3.304808188817758, + "learning_rate": 3.4195648645310443e-07, + "loss": 0.7345, + "step": 10898 + }, + { + "epoch": 0.89, + "grad_norm": 5.1712891953343645, + "learning_rate": 3.4147856518456757e-07, + "loss": 0.7991, + "step": 10899 + }, + { + "epoch": 0.89, + "grad_norm": 3.4617159556750137, + "learning_rate": 3.4100096631504597e-07, + "loss": 0.6601, + "step": 10900 + }, + { + "epoch": 0.89, + "grad_norm": 4.555856076024046, + "learning_rate": 3.4052368987759323e-07, + "loss": 0.7221, + "step": 10901 + }, + { + "epoch": 0.89, + "grad_norm": 14.385081937100026, + "learning_rate": 3.400467359052395e-07, + "loss": 0.5517, + "step": 10902 + }, + { + "epoch": 0.89, + "grad_norm": 21.449272974765854, + "learning_rate": 3.3957010443099294e-07, + "loss": 0.6947, + "step": 10903 + }, + { + "epoch": 0.89, + "grad_norm": 3.2195911324029507, + "learning_rate": 3.3909379548784095e-07, + "loss": 0.5112, + "step": 10904 + }, + { + "epoch": 0.89, + "grad_norm": 5.057611814677612, + "learning_rate": 3.386178091087444e-07, + "loss": 0.6487, + "step": 10905 + }, + { + "epoch": 0.89, + "grad_norm": 3.0765013536103134, + "learning_rate": 3.381421453266465e-07, + "loss": 0.57, + "step": 10906 + }, + { + "epoch": 0.89, + "grad_norm": 5.559972952182055, + "learning_rate": 3.3766680417446574e-07, + "loss": 0.7629, + "step": 10907 + }, + { + "epoch": 0.89, + "grad_norm": 4.8210301378962, + "learning_rate": 3.371917856850981e-07, + "loss": 0.6382, + "step": 10908 + }, + { + "epoch": 0.89, + "grad_norm": 5.398364042425473, + "learning_rate": 3.3671708989141905e-07, + "loss": 0.648, + "step": 10909 + }, + { + "epoch": 0.89, + "grad_norm": 3.32764146430838, + "learning_rate": 3.3624271682627884e-07, + "loss": 0.6049, + "step": 10910 + }, + { + "epoch": 0.89, + "grad_norm": 8.365182820873128, + "learning_rate": 3.357686665225096e-07, + "loss": 0.576, + "step": 10911 + }, + { + "epoch": 0.89, + "grad_norm": 4.28327169175371, + "learning_rate": 3.3529493901291567e-07, + "loss": 0.6841, + "step": 10912 + }, + { + "epoch": 0.89, + "grad_norm": 6.183675003210733, + "learning_rate": 3.3482153433028407e-07, + "loss": 0.7028, + "step": 10913 + }, + { + "epoch": 0.89, + "grad_norm": 9.483625523776135, + "learning_rate": 3.3434845250737593e-07, + "loss": 0.6664, + "step": 10914 + }, + { + "epoch": 0.89, + "grad_norm": 29.371961055661107, + "learning_rate": 3.3387569357693274e-07, + "loss": 0.6688, + "step": 10915 + }, + { + "epoch": 0.89, + "grad_norm": 3.50031587473056, + "learning_rate": 3.3340325757167224e-07, + "loss": 0.5737, + "step": 10916 + }, + { + "epoch": 0.89, + "grad_norm": 4.071057120881112, + "learning_rate": 3.3293114452428944e-07, + "loss": 0.4962, + "step": 10917 + }, + { + "epoch": 0.89, + "grad_norm": 3.14887072082782, + "learning_rate": 3.3245935446745815e-07, + "loss": 0.5862, + "step": 10918 + }, + { + "epoch": 0.89, + "grad_norm": 6.829552335957282, + "learning_rate": 3.3198788743382784e-07, + "loss": 0.7145, + "step": 10919 + }, + { + "epoch": 0.89, + "grad_norm": 5.971437757259194, + "learning_rate": 3.3151674345602844e-07, + "loss": 0.643, + "step": 10920 + }, + { + "epoch": 0.89, + "grad_norm": 9.7659533856254, + "learning_rate": 3.310459225666651e-07, + "loss": 0.7354, + "step": 10921 + }, + { + "epoch": 0.89, + "grad_norm": 4.039331298549966, + "learning_rate": 3.3057542479832285e-07, + "loss": 0.6404, + "step": 10922 + }, + { + "epoch": 0.89, + "grad_norm": 2.524898602175444, + "learning_rate": 3.301052501835622e-07, + "loss": 0.6528, + "step": 10923 + }, + { + "epoch": 0.89, + "grad_norm": 3.2135190610748077, + "learning_rate": 3.296353987549222e-07, + "loss": 0.5357, + "step": 10924 + }, + { + "epoch": 0.89, + "grad_norm": 4.241760176126978, + "learning_rate": 3.2916587054491967e-07, + "loss": 0.6785, + "step": 10925 + }, + { + "epoch": 0.89, + "grad_norm": 7.035880113520567, + "learning_rate": 3.286966655860485e-07, + "loss": 0.7147, + "step": 10926 + }, + { + "epoch": 0.89, + "grad_norm": 3.4895870297386273, + "learning_rate": 3.282277839107817e-07, + "loss": 0.6999, + "step": 10927 + }, + { + "epoch": 0.89, + "grad_norm": 9.452803755456355, + "learning_rate": 3.277592255515671e-07, + "loss": 0.5655, + "step": 10928 + }, + { + "epoch": 0.89, + "grad_norm": 3.7550536108174133, + "learning_rate": 3.2729099054083393e-07, + "loss": 0.6218, + "step": 10929 + }, + { + "epoch": 0.89, + "grad_norm": 3.409548020345975, + "learning_rate": 3.2682307891098606e-07, + "loss": 0.6936, + "step": 10930 + }, + { + "epoch": 0.89, + "grad_norm": 3.845889888304671, + "learning_rate": 3.263554906944055e-07, + "loss": 0.7564, + "step": 10931 + }, + { + "epoch": 0.89, + "grad_norm": 3.650318890663278, + "learning_rate": 3.2588822592345304e-07, + "loss": 0.7519, + "step": 10932 + }, + { + "epoch": 0.89, + "grad_norm": 4.163174538561021, + "learning_rate": 3.2542128463046495e-07, + "loss": 0.4984, + "step": 10933 + }, + { + "epoch": 0.89, + "grad_norm": 6.3085841970843815, + "learning_rate": 3.249546668477588e-07, + "loss": 0.629, + "step": 10934 + }, + { + "epoch": 0.89, + "grad_norm": 3.9483163598080626, + "learning_rate": 3.244883726076253e-07, + "loss": 0.6657, + "step": 10935 + }, + { + "epoch": 0.89, + "grad_norm": 4.006543373567025, + "learning_rate": 3.240224019423366e-07, + "loss": 0.7121, + "step": 10936 + }, + { + "epoch": 0.89, + "grad_norm": 3.4621533129599507, + "learning_rate": 3.235567548841401e-07, + "loss": 0.7952, + "step": 10937 + }, + { + "epoch": 0.89, + "grad_norm": 3.1730685169617985, + "learning_rate": 3.2309143146526114e-07, + "loss": 0.726, + "step": 10938 + }, + { + "epoch": 0.89, + "grad_norm": 2.912219007765076, + "learning_rate": 3.226264317179029e-07, + "loss": 0.7009, + "step": 10939 + }, + { + "epoch": 0.89, + "grad_norm": 3.8435243721097962, + "learning_rate": 3.2216175567424737e-07, + "loss": 0.5869, + "step": 10940 + }, + { + "epoch": 0.89, + "grad_norm": 7.084705475421821, + "learning_rate": 3.2169740336645274e-07, + "loss": 0.6604, + "step": 10941 + }, + { + "epoch": 0.89, + "grad_norm": 4.607484392596619, + "learning_rate": 3.2123337482665385e-07, + "loss": 0.5276, + "step": 10942 + }, + { + "epoch": 0.89, + "grad_norm": 6.1201509016279205, + "learning_rate": 3.2076967008696614e-07, + "loss": 0.7097, + "step": 10943 + }, + { + "epoch": 0.89, + "grad_norm": 4.117041536382245, + "learning_rate": 3.2030628917948006e-07, + "loss": 0.6372, + "step": 10944 + }, + { + "epoch": 0.89, + "grad_norm": 3.364394496305914, + "learning_rate": 3.198432321362643e-07, + "loss": 0.5792, + "step": 10945 + }, + { + "epoch": 0.89, + "grad_norm": 13.844525175317687, + "learning_rate": 3.193804989893656e-07, + "loss": 0.6414, + "step": 10946 + }, + { + "epoch": 0.89, + "grad_norm": 5.01081783945343, + "learning_rate": 3.189180897708083e-07, + "loss": 0.5943, + "step": 10947 + }, + { + "epoch": 0.89, + "grad_norm": 5.371180046602053, + "learning_rate": 3.184560045125934e-07, + "loss": 0.6251, + "step": 10948 + }, + { + "epoch": 0.89, + "grad_norm": 9.389195349538713, + "learning_rate": 3.1799424324670035e-07, + "loss": 0.6732, + "step": 10949 + }, + { + "epoch": 0.89, + "grad_norm": 5.819654660523757, + "learning_rate": 3.175328060050864e-07, + "loss": 0.5288, + "step": 10950 + }, + { + "epoch": 0.89, + "grad_norm": 3.8829556961393905, + "learning_rate": 3.170716928196854e-07, + "loss": 0.7548, + "step": 10951 + }, + { + "epoch": 0.89, + "grad_norm": 4.684812362635046, + "learning_rate": 3.1661090372240965e-07, + "loss": 0.6741, + "step": 10952 + }, + { + "epoch": 0.89, + "grad_norm": 3.7338990775230925, + "learning_rate": 3.161504387451475e-07, + "loss": 0.6004, + "step": 10953 + }, + { + "epoch": 0.89, + "grad_norm": 6.849774123859998, + "learning_rate": 3.156902979197679e-07, + "loss": 0.6495, + "step": 10954 + }, + { + "epoch": 0.89, + "grad_norm": 9.581000665819154, + "learning_rate": 3.1523048127811426e-07, + "loss": 0.5655, + "step": 10955 + }, + { + "epoch": 0.89, + "grad_norm": 6.887475936059228, + "learning_rate": 3.147709888520084e-07, + "loss": 0.6411, + "step": 10956 + }, + { + "epoch": 0.89, + "grad_norm": 10.829624710704543, + "learning_rate": 3.1431182067325207e-07, + "loss": 0.6369, + "step": 10957 + }, + { + "epoch": 0.89, + "grad_norm": 3.878753810976778, + "learning_rate": 3.1385297677362035e-07, + "loss": 0.7126, + "step": 10958 + }, + { + "epoch": 0.89, + "grad_norm": 2.780466212127189, + "learning_rate": 3.133944571848696e-07, + "loss": 0.6138, + "step": 10959 + }, + { + "epoch": 0.89, + "grad_norm": 2.3058794313638065, + "learning_rate": 3.129362619387305e-07, + "loss": 0.7082, + "step": 10960 + }, + { + "epoch": 0.89, + "grad_norm": 5.446517143292522, + "learning_rate": 3.124783910669155e-07, + "loss": 0.7126, + "step": 10961 + }, + { + "epoch": 0.89, + "grad_norm": 3.3075086431795166, + "learning_rate": 3.120208446011108e-07, + "loss": 0.8188, + "step": 10962 + }, + { + "epoch": 0.89, + "grad_norm": 5.549107875505797, + "learning_rate": 3.1156362257298065e-07, + "loss": 0.6251, + "step": 10963 + }, + { + "epoch": 0.89, + "grad_norm": 4.620396362887429, + "learning_rate": 3.111067250141697e-07, + "loss": 0.5695, + "step": 10964 + }, + { + "epoch": 0.89, + "grad_norm": 2.9912729095760766, + "learning_rate": 3.106501519562971e-07, + "loss": 0.6634, + "step": 10965 + }, + { + "epoch": 0.89, + "grad_norm": 25.75748257553001, + "learning_rate": 3.1019390343096033e-07, + "loss": 0.7508, + "step": 10966 + }, + { + "epoch": 0.89, + "grad_norm": 16.94716932791019, + "learning_rate": 3.097379794697342e-07, + "loss": 0.5963, + "step": 10967 + }, + { + "epoch": 0.89, + "grad_norm": 2.7729427253941332, + "learning_rate": 3.0928238010417275e-07, + "loss": 0.6435, + "step": 10968 + }, + { + "epoch": 0.89, + "grad_norm": 7.919912434676973, + "learning_rate": 3.088271053658054e-07, + "loss": 0.6582, + "step": 10969 + }, + { + "epoch": 0.89, + "grad_norm": 5.811574226645277, + "learning_rate": 3.0837215528614127e-07, + "loss": 0.6864, + "step": 10970 + }, + { + "epoch": 0.89, + "grad_norm": 21.720939131055648, + "learning_rate": 3.079175298966647e-07, + "loss": 0.6347, + "step": 10971 + }, + { + "epoch": 0.89, + "grad_norm": 6.784288646832833, + "learning_rate": 3.0746322922883933e-07, + "loss": 0.665, + "step": 10972 + }, + { + "epoch": 0.89, + "grad_norm": 3.764678459861052, + "learning_rate": 3.0700925331410447e-07, + "loss": 0.6391, + "step": 10973 + }, + { + "epoch": 0.89, + "grad_norm": 3.5031464046138905, + "learning_rate": 3.0655560218387835e-07, + "loss": 0.6944, + "step": 10974 + }, + { + "epoch": 0.89, + "grad_norm": 3.535400395103272, + "learning_rate": 3.0610227586955753e-07, + "loss": 0.7876, + "step": 10975 + }, + { + "epoch": 0.89, + "grad_norm": 3.633990322605817, + "learning_rate": 3.0564927440251355e-07, + "loss": 0.7006, + "step": 10976 + }, + { + "epoch": 0.89, + "grad_norm": 4.360386217156877, + "learning_rate": 3.051965978140997e-07, + "loss": 0.5755, + "step": 10977 + }, + { + "epoch": 0.89, + "grad_norm": 2.3788548101949814, + "learning_rate": 3.047442461356409e-07, + "loss": 0.7298, + "step": 10978 + }, + { + "epoch": 0.89, + "grad_norm": 7.423088251507555, + "learning_rate": 3.0429221939844433e-07, + "loss": 0.7105, + "step": 10979 + }, + { + "epoch": 0.89, + "grad_norm": 3.1681020190668243, + "learning_rate": 3.0384051763379327e-07, + "loss": 0.8693, + "step": 10980 + }, + { + "epoch": 0.89, + "grad_norm": 3.1250691843570118, + "learning_rate": 3.0338914087294667e-07, + "loss": 0.5456, + "step": 10981 + }, + { + "epoch": 0.89, + "grad_norm": 2.2767691600742492, + "learning_rate": 3.029380891471445e-07, + "loss": 0.6396, + "step": 10982 + }, + { + "epoch": 0.89, + "grad_norm": 3.2631299435276335, + "learning_rate": 3.0248736248760126e-07, + "loss": 0.7114, + "step": 10983 + }, + { + "epoch": 0.89, + "grad_norm": 5.383009363404805, + "learning_rate": 3.0203696092551193e-07, + "loss": 0.6221, + "step": 10984 + }, + { + "epoch": 0.89, + "grad_norm": 4.66888702240897, + "learning_rate": 3.015868844920444e-07, + "loss": 0.6413, + "step": 10985 + }, + { + "epoch": 0.89, + "grad_norm": 3.327183694571496, + "learning_rate": 3.011371332183488e-07, + "loss": 0.6769, + "step": 10986 + }, + { + "epoch": 0.89, + "grad_norm": 7.734963464205543, + "learning_rate": 3.0068770713554965e-07, + "loss": 0.7668, + "step": 10987 + }, + { + "epoch": 0.89, + "grad_norm": 11.050728398062168, + "learning_rate": 3.002386062747503e-07, + "loss": 0.6344, + "step": 10988 + }, + { + "epoch": 0.89, + "grad_norm": 4.536299399405929, + "learning_rate": 2.997898306670322e-07, + "loss": 0.5377, + "step": 10989 + }, + { + "epoch": 0.89, + "grad_norm": 4.067352075678169, + "learning_rate": 2.993413803434525e-07, + "loss": 0.6885, + "step": 10990 + }, + { + "epoch": 0.89, + "grad_norm": 7.296190236092171, + "learning_rate": 2.988932553350471e-07, + "loss": 0.646, + "step": 10991 + }, + { + "epoch": 0.89, + "grad_norm": 3.126252517031877, + "learning_rate": 2.9844545567282835e-07, + "loss": 0.7359, + "step": 10992 + }, + { + "epoch": 0.89, + "grad_norm": 3.647144943073153, + "learning_rate": 2.979979813877881e-07, + "loss": 0.7087, + "step": 10993 + }, + { + "epoch": 0.89, + "grad_norm": 7.786361848617197, + "learning_rate": 2.9755083251089334e-07, + "loss": 0.6814, + "step": 10994 + }, + { + "epoch": 0.89, + "grad_norm": 5.947825742287276, + "learning_rate": 2.971040090730909e-07, + "loss": 0.6607, + "step": 10995 + }, + { + "epoch": 0.89, + "grad_norm": 2.6824023817478513, + "learning_rate": 2.966575111053027e-07, + "loss": 0.6799, + "step": 10996 + }, + { + "epoch": 0.89, + "grad_norm": 7.290364883255014, + "learning_rate": 2.9621133863842913e-07, + "loss": 0.7455, + "step": 10997 + }, + { + "epoch": 0.89, + "grad_norm": 4.814323049416905, + "learning_rate": 2.957654917033487e-07, + "loss": 0.5305, + "step": 10998 + }, + { + "epoch": 0.89, + "grad_norm": 11.724736310381989, + "learning_rate": 2.953199703309162e-07, + "loss": 0.6842, + "step": 10999 + }, + { + "epoch": 0.89, + "grad_norm": 5.85119626303855, + "learning_rate": 2.948747745519648e-07, + "loss": 0.5625, + "step": 11000 + }, + { + "epoch": 0.89, + "grad_norm": 4.289126343156414, + "learning_rate": 2.9442990439730477e-07, + "loss": 0.7346, + "step": 11001 + }, + { + "epoch": 0.89, + "grad_norm": 6.147511325838741, + "learning_rate": 2.939853598977249e-07, + "loss": 0.8817, + "step": 11002 + }, + { + "epoch": 0.89, + "grad_norm": 4.074792092739971, + "learning_rate": 2.935411410839889e-07, + "loss": 0.5382, + "step": 11003 + }, + { + "epoch": 0.89, + "grad_norm": 5.62727757776066, + "learning_rate": 2.9309724798684105e-07, + "loss": 0.7896, + "step": 11004 + }, + { + "epoch": 0.89, + "grad_norm": 3.6789632017200478, + "learning_rate": 2.926536806370006e-07, + "loss": 0.5883, + "step": 11005 + }, + { + "epoch": 0.89, + "grad_norm": 2.8308157892618664, + "learning_rate": 2.922104390651642e-07, + "loss": 0.7303, + "step": 11006 + }, + { + "epoch": 0.89, + "grad_norm": 3.1674429898774004, + "learning_rate": 2.9176752330200895e-07, + "loss": 0.6527, + "step": 11007 + }, + { + "epoch": 0.89, + "grad_norm": 3.2703844883961812, + "learning_rate": 2.9132493337818644e-07, + "loss": 0.7117, + "step": 11008 + }, + { + "epoch": 0.89, + "grad_norm": 5.449218084393396, + "learning_rate": 2.908826693243266e-07, + "loss": 0.6392, + "step": 11009 + }, + { + "epoch": 0.89, + "grad_norm": 10.042307722425726, + "learning_rate": 2.9044073117103777e-07, + "loss": 0.5633, + "step": 11010 + }, + { + "epoch": 0.89, + "grad_norm": 3.7124749921247826, + "learning_rate": 2.8999911894890434e-07, + "loss": 0.6788, + "step": 11011 + }, + { + "epoch": 0.89, + "grad_norm": 3.6686671637432933, + "learning_rate": 2.895578326884879e-07, + "loss": 0.7725, + "step": 11012 + }, + { + "epoch": 0.89, + "grad_norm": 12.499661304534454, + "learning_rate": 2.891168724203286e-07, + "loss": 0.5301, + "step": 11013 + }, + { + "epoch": 0.89, + "grad_norm": 4.923511311841258, + "learning_rate": 2.8867623817494415e-07, + "loss": 0.7701, + "step": 11014 + }, + { + "epoch": 0.89, + "grad_norm": 4.282808010497953, + "learning_rate": 2.882359299828286e-07, + "loss": 0.4871, + "step": 11015 + }, + { + "epoch": 0.89, + "grad_norm": 5.279265734248698, + "learning_rate": 2.877959478744546e-07, + "loss": 0.6954, + "step": 11016 + }, + { + "epoch": 0.89, + "grad_norm": 3.992050200322682, + "learning_rate": 2.8735629188027247e-07, + "loss": 0.745, + "step": 11017 + }, + { + "epoch": 0.89, + "grad_norm": 10.735040539927928, + "learning_rate": 2.869169620307072e-07, + "loss": 0.6784, + "step": 11018 + }, + { + "epoch": 0.89, + "grad_norm": 2.8885986017060663, + "learning_rate": 2.8647795835616387e-07, + "loss": 0.7013, + "step": 11019 + }, + { + "epoch": 0.9, + "grad_norm": 4.931027669111495, + "learning_rate": 2.8603928088702547e-07, + "loss": 0.5912, + "step": 11020 + }, + { + "epoch": 0.9, + "grad_norm": 4.738511406984342, + "learning_rate": 2.856009296536505e-07, + "loss": 0.5725, + "step": 11021 + }, + { + "epoch": 0.9, + "grad_norm": 2.9900060693603896, + "learning_rate": 2.8516290468637467e-07, + "loss": 0.6641, + "step": 11022 + }, + { + "epoch": 0.9, + "grad_norm": 3.3331611004904107, + "learning_rate": 2.847252060155131e-07, + "loss": 0.5742, + "step": 11023 + }, + { + "epoch": 0.9, + "grad_norm": 3.753065822721993, + "learning_rate": 2.842878336713578e-07, + "loss": 0.8139, + "step": 11024 + }, + { + "epoch": 0.9, + "grad_norm": 3.0790590520907166, + "learning_rate": 2.838507876841767e-07, + "loss": 0.658, + "step": 11025 + }, + { + "epoch": 0.9, + "grad_norm": 4.094361228382077, + "learning_rate": 2.834140680842157e-07, + "loss": 0.6788, + "step": 11026 + }, + { + "epoch": 0.9, + "grad_norm": 6.1670024060023465, + "learning_rate": 2.829776749016999e-07, + "loss": 0.7216, + "step": 11027 + }, + { + "epoch": 0.9, + "grad_norm": 3.8104025176980767, + "learning_rate": 2.8254160816682975e-07, + "loss": 0.7401, + "step": 11028 + }, + { + "epoch": 0.9, + "grad_norm": 4.505179201853745, + "learning_rate": 2.8210586790978323e-07, + "loss": 0.6331, + "step": 11029 + }, + { + "epoch": 0.9, + "grad_norm": 5.759404068733508, + "learning_rate": 2.81670454160719e-07, + "loss": 0.7486, + "step": 11030 + }, + { + "epoch": 0.9, + "grad_norm": 5.196715188777045, + "learning_rate": 2.8123536694976636e-07, + "loss": 0.5044, + "step": 11031 + }, + { + "epoch": 0.9, + "grad_norm": 4.06660649156897, + "learning_rate": 2.8080060630703896e-07, + "loss": 0.6347, + "step": 11032 + }, + { + "epoch": 0.9, + "grad_norm": 7.477711581152729, + "learning_rate": 2.803661722626233e-07, + "loss": 0.6174, + "step": 11033 + }, + { + "epoch": 0.9, + "grad_norm": 4.101523609885264, + "learning_rate": 2.799320648465864e-07, + "loss": 0.8228, + "step": 11034 + }, + { + "epoch": 0.9, + "grad_norm": 3.625755814722213, + "learning_rate": 2.7949828408897097e-07, + "loss": 0.7009, + "step": 11035 + }, + { + "epoch": 0.9, + "grad_norm": 4.843786626435055, + "learning_rate": 2.7906483001979623e-07, + "loss": 0.7467, + "step": 11036 + }, + { + "epoch": 0.9, + "grad_norm": 4.05239168898531, + "learning_rate": 2.7863170266906215e-07, + "loss": 0.6315, + "step": 11037 + }, + { + "epoch": 0.9, + "grad_norm": 2.2900184133245327, + "learning_rate": 2.7819890206674083e-07, + "loss": 0.511, + "step": 11038 + }, + { + "epoch": 0.9, + "grad_norm": 2.3478426629191254, + "learning_rate": 2.777664282427872e-07, + "loss": 0.6157, + "step": 11039 + }, + { + "epoch": 0.9, + "grad_norm": 4.732189669581965, + "learning_rate": 2.773342812271301e-07, + "loss": 0.7367, + "step": 11040 + }, + { + "epoch": 0.9, + "grad_norm": 6.484904005981082, + "learning_rate": 2.7690246104967735e-07, + "loss": 0.6805, + "step": 11041 + }, + { + "epoch": 0.9, + "grad_norm": 3.179213619844034, + "learning_rate": 2.7647096774031267e-07, + "loss": 0.6705, + "step": 11042 + }, + { + "epoch": 0.9, + "grad_norm": 5.33654241393421, + "learning_rate": 2.760398013289001e-07, + "loss": 0.6427, + "step": 11043 + }, + { + "epoch": 0.9, + "grad_norm": 5.840324108276913, + "learning_rate": 2.7560896184527674e-07, + "loss": 0.6388, + "step": 11044 + }, + { + "epoch": 0.9, + "grad_norm": 3.9816053213673377, + "learning_rate": 2.7517844931926106e-07, + "loss": 0.8055, + "step": 11045 + }, + { + "epoch": 0.9, + "grad_norm": 4.262387984297056, + "learning_rate": 2.7474826378064647e-07, + "loss": 0.7576, + "step": 11046 + }, + { + "epoch": 0.9, + "grad_norm": 9.057910352882542, + "learning_rate": 2.7431840525920407e-07, + "loss": 0.7114, + "step": 11047 + }, + { + "epoch": 0.9, + "grad_norm": 5.172748901330774, + "learning_rate": 2.73888873784684e-07, + "loss": 0.778, + "step": 11048 + }, + { + "epoch": 0.9, + "grad_norm": 4.8044785611634655, + "learning_rate": 2.7345966938681134e-07, + "loss": 0.5451, + "step": 11049 + }, + { + "epoch": 0.9, + "grad_norm": 2.6873342534314486, + "learning_rate": 2.730307920952913e-07, + "loss": 0.6718, + "step": 11050 + }, + { + "epoch": 0.9, + "grad_norm": 4.4728732922107275, + "learning_rate": 2.7260224193980335e-07, + "loss": 0.6887, + "step": 11051 + }, + { + "epoch": 0.9, + "grad_norm": 2.5400663468894873, + "learning_rate": 2.7217401895000664e-07, + "loss": 0.7207, + "step": 11052 + }, + { + "epoch": 0.9, + "grad_norm": 30.80619272167525, + "learning_rate": 2.7174612315553627e-07, + "loss": 0.6757, + "step": 11053 + }, + { + "epoch": 0.9, + "grad_norm": 4.078369844990921, + "learning_rate": 2.713185545860053e-07, + "loss": 0.7554, + "step": 11054 + }, + { + "epoch": 0.9, + "grad_norm": 7.3351131093809085, + "learning_rate": 2.708913132710056e-07, + "loss": 0.5357, + "step": 11055 + }, + { + "epoch": 0.9, + "grad_norm": 4.937600040195671, + "learning_rate": 2.7046439924010295e-07, + "loss": 0.5559, + "step": 11056 + }, + { + "epoch": 0.9, + "grad_norm": 3.947098860009083, + "learning_rate": 2.7003781252284533e-07, + "loss": 0.6379, + "step": 11057 + }, + { + "epoch": 0.9, + "grad_norm": 8.937500308756222, + "learning_rate": 2.6961155314875144e-07, + "loss": 0.5549, + "step": 11058 + }, + { + "epoch": 0.9, + "grad_norm": 4.740888046462372, + "learning_rate": 2.6918562114732374e-07, + "loss": 0.7494, + "step": 11059 + }, + { + "epoch": 0.9, + "grad_norm": 2.7691718979419084, + "learning_rate": 2.687600165480392e-07, + "loss": 0.6387, + "step": 11060 + }, + { + "epoch": 0.9, + "grad_norm": 4.193479970646864, + "learning_rate": 2.6833473938035094e-07, + "loss": 0.6808, + "step": 11061 + }, + { + "epoch": 0.9, + "grad_norm": 5.989482028595132, + "learning_rate": 2.679097896736921e-07, + "loss": 0.5675, + "step": 11062 + }, + { + "epoch": 0.9, + "grad_norm": 3.038569809133305, + "learning_rate": 2.6748516745747187e-07, + "loss": 0.5682, + "step": 11063 + }, + { + "epoch": 0.9, + "grad_norm": 4.210461827183721, + "learning_rate": 2.670608727610763e-07, + "loss": 0.6391, + "step": 11064 + }, + { + "epoch": 0.9, + "grad_norm": 3.858660906488997, + "learning_rate": 2.6663690561386903e-07, + "loss": 0.7706, + "step": 11065 + }, + { + "epoch": 0.9, + "grad_norm": 3.18010352232131, + "learning_rate": 2.6621326604519216e-07, + "loss": 0.6139, + "step": 11066 + }, + { + "epoch": 0.9, + "grad_norm": 4.873795037276898, + "learning_rate": 2.6578995408436283e-07, + "loss": 0.6138, + "step": 11067 + }, + { + "epoch": 0.9, + "grad_norm": 2.7048552037389864, + "learning_rate": 2.653669697606781e-07, + "loss": 0.7192, + "step": 11068 + }, + { + "epoch": 0.9, + "grad_norm": 5.944068909352404, + "learning_rate": 2.649443131034113e-07, + "loss": 0.5785, + "step": 11069 + }, + { + "epoch": 0.9, + "grad_norm": 5.275556438146503, + "learning_rate": 2.645219841418123e-07, + "loss": 0.5607, + "step": 11070 + }, + { + "epoch": 0.9, + "grad_norm": 2.8747941865006155, + "learning_rate": 2.6409998290510884e-07, + "loss": 0.7647, + "step": 11071 + }, + { + "epoch": 0.9, + "grad_norm": 4.900992328853086, + "learning_rate": 2.6367830942250596e-07, + "loss": 0.5738, + "step": 11072 + }, + { + "epoch": 0.9, + "grad_norm": 6.8776815581718935, + "learning_rate": 2.6325696372318687e-07, + "loss": 0.7264, + "step": 11073 + }, + { + "epoch": 0.9, + "grad_norm": 6.526611136912102, + "learning_rate": 2.6283594583631e-07, + "loss": 0.7287, + "step": 11074 + }, + { + "epoch": 0.9, + "grad_norm": 3.772144936698093, + "learning_rate": 2.6241525579101425e-07, + "loss": 0.6975, + "step": 11075 + }, + { + "epoch": 0.9, + "grad_norm": 3.9206089330944818, + "learning_rate": 2.61994893616413e-07, + "loss": 0.5787, + "step": 11076 + }, + { + "epoch": 0.9, + "grad_norm": 3.7637230687007723, + "learning_rate": 2.615748593415979e-07, + "loss": 0.6346, + "step": 11077 + }, + { + "epoch": 0.9, + "grad_norm": 4.426360126121607, + "learning_rate": 2.6115515299563856e-07, + "loss": 0.4801, + "step": 11078 + }, + { + "epoch": 0.9, + "grad_norm": 5.7396289326706365, + "learning_rate": 2.6073577460758003e-07, + "loss": 0.6079, + "step": 11079 + }, + { + "epoch": 0.9, + "grad_norm": 3.394920379151465, + "learning_rate": 2.6031672420644694e-07, + "loss": 0.6492, + "step": 11080 + }, + { + "epoch": 0.9, + "grad_norm": 3.1043298234267467, + "learning_rate": 2.5989800182123994e-07, + "loss": 0.711, + "step": 11081 + }, + { + "epoch": 0.9, + "grad_norm": 7.656738845633479, + "learning_rate": 2.5947960748093805e-07, + "loss": 0.5637, + "step": 11082 + }, + { + "epoch": 0.9, + "grad_norm": 14.313786296976895, + "learning_rate": 2.5906154121449587e-07, + "loss": 0.7494, + "step": 11083 + }, + { + "epoch": 0.9, + "grad_norm": 4.095595547293801, + "learning_rate": 2.5864380305084646e-07, + "loss": 0.6043, + "step": 11084 + }, + { + "epoch": 0.9, + "grad_norm": 3.2503757638554864, + "learning_rate": 2.5822639301889995e-07, + "loss": 0.5951, + "step": 11085 + }, + { + "epoch": 0.9, + "grad_norm": 4.421316136387105, + "learning_rate": 2.578093111475433e-07, + "loss": 0.731, + "step": 11086 + }, + { + "epoch": 0.9, + "grad_norm": 2.4469570776847727, + "learning_rate": 2.573925574656422e-07, + "loss": 0.6792, + "step": 11087 + }, + { + "epoch": 0.9, + "grad_norm": 4.180240511963956, + "learning_rate": 2.5697613200203697e-07, + "loss": 0.7852, + "step": 11088 + }, + { + "epoch": 0.9, + "grad_norm": 3.560192570109727, + "learning_rate": 2.5656003478554903e-07, + "loss": 0.6535, + "step": 11089 + }, + { + "epoch": 0.9, + "grad_norm": 9.993804265099204, + "learning_rate": 2.5614426584497363e-07, + "loss": 0.6224, + "step": 11090 + }, + { + "epoch": 0.9, + "grad_norm": 3.685717836360914, + "learning_rate": 2.5572882520908505e-07, + "loss": 0.7154, + "step": 11091 + }, + { + "epoch": 0.9, + "grad_norm": 5.076263093940613, + "learning_rate": 2.553137129066335e-07, + "loss": 0.5295, + "step": 11092 + }, + { + "epoch": 0.9, + "grad_norm": 5.590691796037563, + "learning_rate": 2.548989289663484e-07, + "loss": 0.5224, + "step": 11093 + }, + { + "epoch": 0.9, + "grad_norm": 4.0380278217603385, + "learning_rate": 2.5448447341693493e-07, + "loss": 0.8946, + "step": 11094 + }, + { + "epoch": 0.9, + "grad_norm": 4.33410672132875, + "learning_rate": 2.540703462870758e-07, + "loss": 0.5321, + "step": 11095 + }, + { + "epoch": 0.9, + "grad_norm": 2.8223743977094635, + "learning_rate": 2.5365654760543313e-07, + "loss": 0.6875, + "step": 11096 + }, + { + "epoch": 0.9, + "grad_norm": 10.155282123370107, + "learning_rate": 2.5324307740064113e-07, + "loss": 0.7869, + "step": 11097 + }, + { + "epoch": 0.9, + "grad_norm": 3.105071004926456, + "learning_rate": 2.5282993570131697e-07, + "loss": 0.7059, + "step": 11098 + }, + { + "epoch": 0.9, + "grad_norm": 4.759174445219804, + "learning_rate": 2.524171225360511e-07, + "loss": 0.7714, + "step": 11099 + }, + { + "epoch": 0.9, + "grad_norm": 3.305611317761764, + "learning_rate": 2.5200463793341455e-07, + "loss": 0.6946, + "step": 11100 + }, + { + "epoch": 0.9, + "grad_norm": 3.5013903699175417, + "learning_rate": 2.5159248192195284e-07, + "loss": 0.6914, + "step": 11101 + }, + { + "epoch": 0.9, + "grad_norm": 4.004572463957777, + "learning_rate": 2.5118065453018867e-07, + "loss": 0.6863, + "step": 11102 + }, + { + "epoch": 0.9, + "grad_norm": 2.8414661158451766, + "learning_rate": 2.5076915578662597e-07, + "loss": 0.4927, + "step": 11103 + }, + { + "epoch": 0.9, + "grad_norm": 7.158135883358956, + "learning_rate": 2.503579857197402e-07, + "loss": 0.8022, + "step": 11104 + }, + { + "epoch": 0.9, + "grad_norm": 6.303931678287403, + "learning_rate": 2.4994714435798815e-07, + "loss": 0.4349, + "step": 11105 + }, + { + "epoch": 0.9, + "grad_norm": 9.152109380009543, + "learning_rate": 2.495366317298026e-07, + "loss": 0.647, + "step": 11106 + }, + { + "epoch": 0.9, + "grad_norm": 3.6233721171321203, + "learning_rate": 2.4912644786359354e-07, + "loss": 0.5429, + "step": 11107 + }, + { + "epoch": 0.9, + "grad_norm": 2.936108732280984, + "learning_rate": 2.4871659278774884e-07, + "loss": 0.6106, + "step": 11108 + }, + { + "epoch": 0.9, + "grad_norm": 76.5995454463314, + "learning_rate": 2.483070665306314e-07, + "loss": 0.831, + "step": 11109 + }, + { + "epoch": 0.9, + "grad_norm": 9.110038819834891, + "learning_rate": 2.4789786912058524e-07, + "loss": 0.8331, + "step": 11110 + }, + { + "epoch": 0.9, + "grad_norm": 6.213034492420114, + "learning_rate": 2.474890005859271e-07, + "loss": 0.5754, + "step": 11111 + }, + { + "epoch": 0.9, + "grad_norm": 3.1989941600754412, + "learning_rate": 2.470804609549554e-07, + "loss": 0.4403, + "step": 11112 + }, + { + "epoch": 0.9, + "grad_norm": 6.387931422354827, + "learning_rate": 2.466722502559416e-07, + "loss": 0.5919, + "step": 11113 + }, + { + "epoch": 0.9, + "grad_norm": 2.7793936208278027, + "learning_rate": 2.4626436851713844e-07, + "loss": 0.7198, + "step": 11114 + }, + { + "epoch": 0.9, + "grad_norm": 24.675438852653613, + "learning_rate": 2.458568157667729e-07, + "loss": 0.4868, + "step": 11115 + }, + { + "epoch": 0.9, + "grad_norm": 2.968899497997956, + "learning_rate": 2.454495920330502e-07, + "loss": 0.549, + "step": 11116 + }, + { + "epoch": 0.9, + "grad_norm": 2.9724963433551848, + "learning_rate": 2.450426973441516e-07, + "loss": 0.4692, + "step": 11117 + }, + { + "epoch": 0.9, + "grad_norm": 6.113019026443085, + "learning_rate": 2.4463613172823975e-07, + "loss": 0.6257, + "step": 11118 + }, + { + "epoch": 0.9, + "grad_norm": 4.8362695841092, + "learning_rate": 2.442298952134492e-07, + "loss": 0.5985, + "step": 11119 + }, + { + "epoch": 0.9, + "grad_norm": 2.992228939025299, + "learning_rate": 2.4382398782789416e-07, + "loss": 0.7287, + "step": 11120 + }, + { + "epoch": 0.9, + "grad_norm": 4.313877136232641, + "learning_rate": 2.4341840959966724e-07, + "loss": 0.7235, + "step": 11121 + }, + { + "epoch": 0.9, + "grad_norm": 8.472511654987843, + "learning_rate": 2.430131605568353e-07, + "loss": 0.8167, + "step": 11122 + }, + { + "epoch": 0.9, + "grad_norm": 4.92453178971149, + "learning_rate": 2.4260824072744714e-07, + "loss": 0.7042, + "step": 11123 + }, + { + "epoch": 0.9, + "grad_norm": 2.7753498312351743, + "learning_rate": 2.42203650139522e-07, + "loss": 0.5991, + "step": 11124 + }, + { + "epoch": 0.9, + "grad_norm": 2.59708602228783, + "learning_rate": 2.4179938882106235e-07, + "loss": 0.6161, + "step": 11125 + }, + { + "epoch": 0.9, + "grad_norm": 3.9195639880987545, + "learning_rate": 2.413954568000454e-07, + "loss": 0.7473, + "step": 11126 + }, + { + "epoch": 0.9, + "grad_norm": 3.3863210309823724, + "learning_rate": 2.409918541044248e-07, + "loss": 0.6121, + "step": 11127 + }, + { + "epoch": 0.9, + "grad_norm": 13.656219673341297, + "learning_rate": 2.405885807621333e-07, + "loss": 0.6579, + "step": 11128 + }, + { + "epoch": 0.9, + "grad_norm": 2.7701955902738185, + "learning_rate": 2.4018563680107964e-07, + "loss": 0.5286, + "step": 11129 + }, + { + "epoch": 0.9, + "grad_norm": 5.1333374481865555, + "learning_rate": 2.397830222491515e-07, + "loss": 0.591, + "step": 11130 + }, + { + "epoch": 0.9, + "grad_norm": 4.906035418301558, + "learning_rate": 2.393807371342094e-07, + "loss": 0.6907, + "step": 11131 + }, + { + "epoch": 0.9, + "grad_norm": 3.38353178579262, + "learning_rate": 2.38978781484096e-07, + "loss": 0.6791, + "step": 11132 + }, + { + "epoch": 0.9, + "grad_norm": 3.645503942555014, + "learning_rate": 2.3857715532662915e-07, + "loss": 0.6961, + "step": 11133 + }, + { + "epoch": 0.9, + "grad_norm": 6.296236525567647, + "learning_rate": 2.3817585868960323e-07, + "loss": 0.7069, + "step": 11134 + }, + { + "epoch": 0.9, + "grad_norm": 2.8484121432260605, + "learning_rate": 2.3777489160079104e-07, + "loss": 0.8108, + "step": 11135 + }, + { + "epoch": 0.9, + "grad_norm": 6.71723743756462, + "learning_rate": 2.3737425408794202e-07, + "loss": 0.6915, + "step": 11136 + }, + { + "epoch": 0.9, + "grad_norm": 4.11595224928552, + "learning_rate": 2.3697394617878232e-07, + "loss": 0.854, + "step": 11137 + }, + { + "epoch": 0.9, + "grad_norm": 5.462369659612821, + "learning_rate": 2.3657396790101539e-07, + "loss": 0.7799, + "step": 11138 + }, + { + "epoch": 0.9, + "grad_norm": 4.1844888416580766, + "learning_rate": 2.3617431928232405e-07, + "loss": 0.6329, + "step": 11139 + }, + { + "epoch": 0.9, + "grad_norm": 7.2286873241493526, + "learning_rate": 2.3577500035036505e-07, + "loss": 0.5984, + "step": 11140 + }, + { + "epoch": 0.9, + "grad_norm": 5.965449536007669, + "learning_rate": 2.3537601113277299e-07, + "loss": 0.7, + "step": 11141 + }, + { + "epoch": 0.9, + "grad_norm": 3.8296834453207613, + "learning_rate": 2.349773516571624e-07, + "loss": 0.5032, + "step": 11142 + }, + { + "epoch": 0.91, + "grad_norm": 4.66201295833367, + "learning_rate": 2.3457902195112236e-07, + "loss": 0.6273, + "step": 11143 + }, + { + "epoch": 0.91, + "grad_norm": 6.59917799988265, + "learning_rate": 2.3418102204221972e-07, + "loss": 0.6182, + "step": 11144 + }, + { + "epoch": 0.91, + "grad_norm": 3.269615318244128, + "learning_rate": 2.3378335195799739e-07, + "loss": 0.7314, + "step": 11145 + }, + { + "epoch": 0.91, + "grad_norm": 4.881587472595604, + "learning_rate": 2.3338601172597842e-07, + "loss": 0.6883, + "step": 11146 + }, + { + "epoch": 0.91, + "grad_norm": 29.10442500021408, + "learning_rate": 2.3298900137365966e-07, + "loss": 0.6143, + "step": 11147 + }, + { + "epoch": 0.91, + "grad_norm": 2.758332840153624, + "learning_rate": 2.3259232092851857e-07, + "loss": 0.6094, + "step": 11148 + }, + { + "epoch": 0.91, + "grad_norm": 3.3708754290049634, + "learning_rate": 2.3219597041800713e-07, + "loss": 0.6694, + "step": 11149 + }, + { + "epoch": 0.91, + "grad_norm": 3.5667350172006196, + "learning_rate": 2.31799949869555e-07, + "loss": 0.6215, + "step": 11150 + }, + { + "epoch": 0.91, + "grad_norm": 4.60462786089303, + "learning_rate": 2.314042593105692e-07, + "loss": 0.66, + "step": 11151 + }, + { + "epoch": 0.91, + "grad_norm": 10.131368632742777, + "learning_rate": 2.3100889876843335e-07, + "loss": 0.6525, + "step": 11152 + }, + { + "epoch": 0.91, + "grad_norm": 3.883522401708305, + "learning_rate": 2.3061386827051114e-07, + "loss": 0.6799, + "step": 11153 + }, + { + "epoch": 0.91, + "grad_norm": 3.755193991968222, + "learning_rate": 2.3021916784413845e-07, + "loss": 0.6643, + "step": 11154 + }, + { + "epoch": 0.91, + "grad_norm": 6.489369986521138, + "learning_rate": 2.2982479751663344e-07, + "loss": 0.6152, + "step": 11155 + }, + { + "epoch": 0.91, + "grad_norm": 3.542210881578721, + "learning_rate": 2.2943075731528764e-07, + "loss": 0.6141, + "step": 11156 + }, + { + "epoch": 0.91, + "grad_norm": 2.4501734368800485, + "learning_rate": 2.290370472673714e-07, + "loss": 0.6118, + "step": 11157 + }, + { + "epoch": 0.91, + "grad_norm": 2.8648890370561024, + "learning_rate": 2.2864366740013188e-07, + "loss": 0.6098, + "step": 11158 + }, + { + "epoch": 0.91, + "grad_norm": 3.331476229469243, + "learning_rate": 2.2825061774079337e-07, + "loss": 0.637, + "step": 11159 + }, + { + "epoch": 0.91, + "grad_norm": 11.827249495520467, + "learning_rate": 2.2785789831655803e-07, + "loss": 0.6456, + "step": 11160 + }, + { + "epoch": 0.91, + "grad_norm": 4.784397517162743, + "learning_rate": 2.2746550915460297e-07, + "loss": 0.5943, + "step": 11161 + }, + { + "epoch": 0.91, + "grad_norm": 4.35396000818709, + "learning_rate": 2.2707345028208593e-07, + "loss": 0.6139, + "step": 11162 + }, + { + "epoch": 0.91, + "grad_norm": 5.719855164084684, + "learning_rate": 2.2668172172613912e-07, + "loss": 0.7485, + "step": 11163 + }, + { + "epoch": 0.91, + "grad_norm": 3.88158982756313, + "learning_rate": 2.2629032351387247e-07, + "loss": 0.7047, + "step": 11164 + }, + { + "epoch": 0.91, + "grad_norm": 5.796267768771133, + "learning_rate": 2.258992556723727e-07, + "loss": 0.6729, + "step": 11165 + }, + { + "epoch": 0.91, + "grad_norm": 3.066477015841981, + "learning_rate": 2.2550851822870423e-07, + "loss": 0.6729, + "step": 11166 + }, + { + "epoch": 0.91, + "grad_norm": 10.031857418393574, + "learning_rate": 2.251181112099099e-07, + "loss": 0.5465, + "step": 11167 + }, + { + "epoch": 0.91, + "grad_norm": 2.675819055436363, + "learning_rate": 2.2472803464300697e-07, + "loss": 0.7149, + "step": 11168 + }, + { + "epoch": 0.91, + "grad_norm": 3.5101733525460443, + "learning_rate": 2.2433828855499218e-07, + "loss": 0.5845, + "step": 11169 + }, + { + "epoch": 0.91, + "grad_norm": 3.599381142452844, + "learning_rate": 2.239488729728373e-07, + "loss": 0.7133, + "step": 11170 + }, + { + "epoch": 0.91, + "grad_norm": 6.624830753928612, + "learning_rate": 2.23559787923493e-07, + "loss": 0.6637, + "step": 11171 + }, + { + "epoch": 0.91, + "grad_norm": 3.3807407363981623, + "learning_rate": 2.2317103343388603e-07, + "loss": 0.937, + "step": 11172 + }, + { + "epoch": 0.91, + "grad_norm": 4.136022318660975, + "learning_rate": 2.2278260953092158e-07, + "loss": 0.705, + "step": 11173 + }, + { + "epoch": 0.91, + "grad_norm": 3.213316628655878, + "learning_rate": 2.2239451624148035e-07, + "loss": 0.5909, + "step": 11174 + }, + { + "epoch": 0.91, + "grad_norm": 4.113919450818377, + "learning_rate": 2.220067535924203e-07, + "loss": 0.5676, + "step": 11175 + }, + { + "epoch": 0.91, + "grad_norm": 3.689508213542701, + "learning_rate": 2.2161932161057888e-07, + "loss": 0.6524, + "step": 11176 + }, + { + "epoch": 0.91, + "grad_norm": 3.690227101161742, + "learning_rate": 2.2123222032276625e-07, + "loss": 0.6971, + "step": 11177 + }, + { + "epoch": 0.91, + "grad_norm": 4.31144987650552, + "learning_rate": 2.2084544975577383e-07, + "loss": 0.6283, + "step": 11178 + }, + { + "epoch": 0.91, + "grad_norm": 4.627351558278359, + "learning_rate": 2.2045900993636793e-07, + "loss": 0.6298, + "step": 11179 + }, + { + "epoch": 0.91, + "grad_norm": 7.181540816920443, + "learning_rate": 2.2007290089129386e-07, + "loss": 0.6105, + "step": 11180 + }, + { + "epoch": 0.91, + "grad_norm": 2.4111880089800546, + "learning_rate": 2.1968712264727187e-07, + "loss": 0.6496, + "step": 11181 + }, + { + "epoch": 0.91, + "grad_norm": 3.7301495699497518, + "learning_rate": 2.193016752310001e-07, + "loss": 0.7502, + "step": 11182 + }, + { + "epoch": 0.91, + "grad_norm": 6.946330232235449, + "learning_rate": 2.1891655866915496e-07, + "loss": 0.6652, + "step": 11183 + }, + { + "epoch": 0.91, + "grad_norm": 5.9617569552395135, + "learning_rate": 2.185317729883868e-07, + "loss": 0.5079, + "step": 11184 + }, + { + "epoch": 0.91, + "grad_norm": 3.4539595084818053, + "learning_rate": 2.1814731821532765e-07, + "loss": 0.6411, + "step": 11185 + }, + { + "epoch": 0.91, + "grad_norm": 3.7295998632425245, + "learning_rate": 2.1776319437658233e-07, + "loss": 0.6514, + "step": 11186 + }, + { + "epoch": 0.91, + "grad_norm": 5.180756978128458, + "learning_rate": 2.173794014987357e-07, + "loss": 0.6396, + "step": 11187 + }, + { + "epoch": 0.91, + "grad_norm": 2.3700950684986877, + "learning_rate": 2.1699593960834876e-07, + "loss": 0.6384, + "step": 11188 + }, + { + "epoch": 0.91, + "grad_norm": 5.435322993458829, + "learning_rate": 2.1661280873195855e-07, + "loss": 0.8065, + "step": 11189 + }, + { + "epoch": 0.91, + "grad_norm": 4.588651937813416, + "learning_rate": 2.1623000889608113e-07, + "loss": 0.6653, + "step": 11190 + }, + { + "epoch": 0.91, + "grad_norm": 3.8417818868419378, + "learning_rate": 2.1584754012720755e-07, + "loss": 0.7299, + "step": 11191 + }, + { + "epoch": 0.91, + "grad_norm": 3.321297919657786, + "learning_rate": 2.1546540245180825e-07, + "loss": 0.5849, + "step": 11192 + }, + { + "epoch": 0.91, + "grad_norm": 4.812304121023848, + "learning_rate": 2.150835958963282e-07, + "loss": 0.5711, + "step": 11193 + }, + { + "epoch": 0.91, + "grad_norm": 4.789822625841763, + "learning_rate": 2.147021204871924e-07, + "loss": 0.6448, + "step": 11194 + }, + { + "epoch": 0.91, + "grad_norm": 4.482624387945147, + "learning_rate": 2.1432097625080028e-07, + "loss": 0.4734, + "step": 11195 + }, + { + "epoch": 0.91, + "grad_norm": 4.15611770572156, + "learning_rate": 2.1394016321353074e-07, + "loss": 0.4833, + "step": 11196 + }, + { + "epoch": 0.91, + "grad_norm": 5.533211608966728, + "learning_rate": 2.13559681401736e-07, + "loss": 0.6356, + "step": 11197 + }, + { + "epoch": 0.91, + "grad_norm": 4.668946241240258, + "learning_rate": 2.1317953084175003e-07, + "loss": 0.6324, + "step": 11198 + }, + { + "epoch": 0.91, + "grad_norm": 15.00257056308707, + "learning_rate": 2.1279971155988066e-07, + "loss": 0.5525, + "step": 11199 + }, + { + "epoch": 0.91, + "grad_norm": 6.806304167910922, + "learning_rate": 2.1242022358241354e-07, + "loss": 0.6445, + "step": 11200 + }, + { + "epoch": 0.91, + "grad_norm": 3.343260844327897, + "learning_rate": 2.1204106693561265e-07, + "loss": 0.6619, + "step": 11201 + }, + { + "epoch": 0.91, + "grad_norm": 3.950683077485783, + "learning_rate": 2.1166224164571757e-07, + "loss": 0.4888, + "step": 11202 + }, + { + "epoch": 0.91, + "grad_norm": 3.5932287198342583, + "learning_rate": 2.1128374773894512e-07, + "loss": 0.7357, + "step": 11203 + }, + { + "epoch": 0.91, + "grad_norm": 4.2658072215347875, + "learning_rate": 2.1090558524148875e-07, + "loss": 0.718, + "step": 11204 + }, + { + "epoch": 0.91, + "grad_norm": 3.3954486428975836, + "learning_rate": 2.1052775417952088e-07, + "loss": 0.66, + "step": 11205 + }, + { + "epoch": 0.91, + "grad_norm": 7.184776914767399, + "learning_rate": 2.1015025457919002e-07, + "loss": 0.6501, + "step": 11206 + }, + { + "epoch": 0.91, + "grad_norm": 6.020422422928121, + "learning_rate": 2.0977308646662032e-07, + "loss": 0.5798, + "step": 11207 + }, + { + "epoch": 0.91, + "grad_norm": 5.383636768910911, + "learning_rate": 2.0939624986791473e-07, + "loss": 0.6269, + "step": 11208 + }, + { + "epoch": 0.91, + "grad_norm": 7.392695274835778, + "learning_rate": 2.0901974480915355e-07, + "loss": 0.6152, + "step": 11209 + }, + { + "epoch": 0.91, + "grad_norm": 12.184743807836144, + "learning_rate": 2.08643571316392e-07, + "loss": 0.5746, + "step": 11210 + }, + { + "epoch": 0.91, + "grad_norm": 4.578078483932733, + "learning_rate": 2.0826772941566376e-07, + "loss": 0.637, + "step": 11211 + }, + { + "epoch": 0.91, + "grad_norm": 4.292490272675671, + "learning_rate": 2.0789221913298075e-07, + "loss": 0.5561, + "step": 11212 + }, + { + "epoch": 0.91, + "grad_norm": 3.900359661084928, + "learning_rate": 2.075170404943294e-07, + "loss": 0.6385, + "step": 11213 + }, + { + "epoch": 0.91, + "grad_norm": 4.637713822962244, + "learning_rate": 2.0714219352567455e-07, + "loss": 0.6683, + "step": 11214 + }, + { + "epoch": 0.91, + "grad_norm": 4.596075403319375, + "learning_rate": 2.0676767825295873e-07, + "loss": 0.7032, + "step": 11215 + }, + { + "epoch": 0.91, + "grad_norm": 2.8509918175013684, + "learning_rate": 2.0639349470210014e-07, + "loss": 0.6394, + "step": 11216 + }, + { + "epoch": 0.91, + "grad_norm": 3.820180651945835, + "learning_rate": 2.0601964289899467e-07, + "loss": 0.8167, + "step": 11217 + }, + { + "epoch": 0.91, + "grad_norm": 15.241877536564646, + "learning_rate": 2.05646122869515e-07, + "loss": 0.6918, + "step": 11218 + }, + { + "epoch": 0.91, + "grad_norm": 11.633888495411068, + "learning_rate": 2.0527293463951158e-07, + "loss": 0.6618, + "step": 11219 + }, + { + "epoch": 0.91, + "grad_norm": 15.251399605112153, + "learning_rate": 2.0490007823481096e-07, + "loss": 0.6036, + "step": 11220 + }, + { + "epoch": 0.91, + "grad_norm": 3.349610011431575, + "learning_rate": 2.0452755368121803e-07, + "loss": 0.6235, + "step": 11221 + }, + { + "epoch": 0.91, + "grad_norm": 5.595683221298956, + "learning_rate": 2.0415536100451273e-07, + "loss": 0.4783, + "step": 11222 + }, + { + "epoch": 0.91, + "grad_norm": 5.956363776287118, + "learning_rate": 2.037835002304539e-07, + "loss": 0.6666, + "step": 11223 + }, + { + "epoch": 0.91, + "grad_norm": 12.635621690223486, + "learning_rate": 2.0341197138477652e-07, + "loss": 0.7744, + "step": 11224 + }, + { + "epoch": 0.91, + "grad_norm": 6.839538448672353, + "learning_rate": 2.030407744931917e-07, + "loss": 0.6424, + "step": 11225 + }, + { + "epoch": 0.91, + "grad_norm": 6.538209436943901, + "learning_rate": 2.0266990958138998e-07, + "loss": 0.7572, + "step": 11226 + }, + { + "epoch": 0.91, + "grad_norm": 5.329314131426762, + "learning_rate": 2.0229937667503641e-07, + "loss": 0.5788, + "step": 11227 + }, + { + "epoch": 0.91, + "grad_norm": 4.981178461071761, + "learning_rate": 2.0192917579977545e-07, + "loss": 0.6999, + "step": 11228 + }, + { + "epoch": 0.91, + "grad_norm": 3.9967243742484726, + "learning_rate": 2.0155930698122661e-07, + "loss": 0.704, + "step": 11229 + }, + { + "epoch": 0.91, + "grad_norm": 4.680105901722793, + "learning_rate": 2.011897702449872e-07, + "loss": 0.7099, + "step": 11230 + }, + { + "epoch": 0.91, + "grad_norm": 6.067912557989318, + "learning_rate": 2.008205656166312e-07, + "loss": 0.6702, + "step": 11231 + }, + { + "epoch": 0.91, + "grad_norm": 4.629518653451745, + "learning_rate": 2.0045169312171043e-07, + "loss": 0.4777, + "step": 11232 + }, + { + "epoch": 0.91, + "grad_norm": 14.303071857238, + "learning_rate": 2.0008315278575274e-07, + "loss": 0.6023, + "step": 11233 + }, + { + "epoch": 0.91, + "grad_norm": 2.9550083491589967, + "learning_rate": 1.9971494463426332e-07, + "loss": 0.5514, + "step": 11234 + }, + { + "epoch": 0.91, + "grad_norm": 4.45169761435258, + "learning_rate": 1.993470686927257e-07, + "loss": 0.5483, + "step": 11235 + }, + { + "epoch": 0.91, + "grad_norm": 14.136993320354271, + "learning_rate": 1.989795249865978e-07, + "loss": 0.6777, + "step": 11236 + }, + { + "epoch": 0.91, + "grad_norm": 3.448756818442511, + "learning_rate": 1.9861231354131705e-07, + "loss": 0.7651, + "step": 11237 + }, + { + "epoch": 0.91, + "grad_norm": 6.710480896720181, + "learning_rate": 1.9824543438229593e-07, + "loss": 0.5539, + "step": 11238 + }, + { + "epoch": 0.91, + "grad_norm": 11.228298048873127, + "learning_rate": 1.978788875349247e-07, + "loss": 0.5702, + "step": 11239 + }, + { + "epoch": 0.91, + "grad_norm": 4.245243401907516, + "learning_rate": 1.9751267302457132e-07, + "loss": 0.5553, + "step": 11240 + }, + { + "epoch": 0.91, + "grad_norm": 3.8167119126457365, + "learning_rate": 1.971467908765795e-07, + "loss": 0.6471, + "step": 11241 + }, + { + "epoch": 0.91, + "grad_norm": 3.2591391861311845, + "learning_rate": 1.9678124111627229e-07, + "loss": 0.61, + "step": 11242 + }, + { + "epoch": 0.91, + "grad_norm": 3.27032283104346, + "learning_rate": 1.9641602376894552e-07, + "loss": 0.8078, + "step": 11243 + }, + { + "epoch": 0.91, + "grad_norm": 4.396870639714731, + "learning_rate": 1.960511388598768e-07, + "loss": 0.5992, + "step": 11244 + }, + { + "epoch": 0.91, + "grad_norm": 3.452741420176858, + "learning_rate": 1.9568658641431648e-07, + "loss": 0.7191, + "step": 11245 + }, + { + "epoch": 0.91, + "grad_norm": 4.469746243397283, + "learning_rate": 1.9532236645749492e-07, + "loss": 0.6226, + "step": 11246 + }, + { + "epoch": 0.91, + "grad_norm": 3.0071098089703447, + "learning_rate": 1.9495847901461916e-07, + "loss": 0.6423, + "step": 11247 + }, + { + "epoch": 0.91, + "grad_norm": 2.927309478084663, + "learning_rate": 1.9459492411087078e-07, + "loss": 0.6625, + "step": 11248 + }, + { + "epoch": 0.91, + "grad_norm": 3.8751220068643923, + "learning_rate": 1.9423170177141182e-07, + "loss": 0.6653, + "step": 11249 + }, + { + "epoch": 0.91, + "grad_norm": 3.563704809780736, + "learning_rate": 1.938688120213783e-07, + "loss": 0.5827, + "step": 11250 + }, + { + "epoch": 0.91, + "grad_norm": 3.358420934460345, + "learning_rate": 1.9350625488588458e-07, + "loss": 0.5478, + "step": 11251 + }, + { + "epoch": 0.91, + "grad_norm": 5.2048093345710065, + "learning_rate": 1.9314403039002228e-07, + "loss": 0.6095, + "step": 11252 + }, + { + "epoch": 0.91, + "grad_norm": 2.5368372327602633, + "learning_rate": 1.927821385588602e-07, + "loss": 0.6251, + "step": 11253 + }, + { + "epoch": 0.91, + "grad_norm": 2.8808988217708147, + "learning_rate": 1.924205794174422e-07, + "loss": 0.6186, + "step": 11254 + }, + { + "epoch": 0.91, + "grad_norm": 4.215083189640467, + "learning_rate": 1.9205935299079158e-07, + "loss": 0.5443, + "step": 11255 + }, + { + "epoch": 0.91, + "grad_norm": 3.968943102852748, + "learning_rate": 1.916984593039073e-07, + "loss": 0.7671, + "step": 11256 + }, + { + "epoch": 0.91, + "grad_norm": 2.658793552877997, + "learning_rate": 1.913378983817643e-07, + "loss": 0.6221, + "step": 11257 + }, + { + "epoch": 0.91, + "grad_norm": 13.225970960842027, + "learning_rate": 1.9097767024931713e-07, + "loss": 0.8828, + "step": 11258 + }, + { + "epoch": 0.91, + "grad_norm": 3.787876191138275, + "learning_rate": 1.906177749314947e-07, + "loss": 0.7026, + "step": 11259 + }, + { + "epoch": 0.91, + "grad_norm": 3.5243636606237465, + "learning_rate": 1.902582124532054e-07, + "loss": 0.5846, + "step": 11260 + }, + { + "epoch": 0.91, + "grad_norm": 3.940901314921539, + "learning_rate": 1.8989898283933216e-07, + "loss": 0.6955, + "step": 11261 + }, + { + "epoch": 0.91, + "grad_norm": 3.4868704721265336, + "learning_rate": 1.8954008611473618e-07, + "loss": 0.5596, + "step": 11262 + }, + { + "epoch": 0.91, + "grad_norm": 18.703640149227553, + "learning_rate": 1.8918152230425534e-07, + "loss": 0.4744, + "step": 11263 + }, + { + "epoch": 0.91, + "grad_norm": 4.945525287111293, + "learning_rate": 1.8882329143270429e-07, + "loss": 0.6621, + "step": 11264 + }, + { + "epoch": 0.91, + "grad_norm": 5.204010961577526, + "learning_rate": 1.8846539352487591e-07, + "loss": 0.4942, + "step": 11265 + }, + { + "epoch": 0.92, + "grad_norm": 3.5048726028688013, + "learning_rate": 1.8810782860553712e-07, + "loss": 0.7458, + "step": 11266 + }, + { + "epoch": 0.92, + "grad_norm": 2.659341861036358, + "learning_rate": 1.8775059669943586e-07, + "loss": 0.6877, + "step": 11267 + }, + { + "epoch": 0.92, + "grad_norm": 3.3723421273103655, + "learning_rate": 1.873936978312929e-07, + "loss": 0.5985, + "step": 11268 + }, + { + "epoch": 0.92, + "grad_norm": 3.0123279451882095, + "learning_rate": 1.8703713202580963e-07, + "loss": 0.7331, + "step": 11269 + }, + { + "epoch": 0.92, + "grad_norm": 7.625981246359626, + "learning_rate": 1.8668089930766077e-07, + "loss": 0.6579, + "step": 11270 + }, + { + "epoch": 0.92, + "grad_norm": 3.0689827099492053, + "learning_rate": 1.8632499970150154e-07, + "loss": 0.6426, + "step": 11271 + }, + { + "epoch": 0.92, + "grad_norm": 21.540314721514395, + "learning_rate": 1.859694332319617e-07, + "loss": 0.6223, + "step": 11272 + }, + { + "epoch": 0.92, + "grad_norm": 5.964900391084288, + "learning_rate": 1.8561419992364826e-07, + "loss": 0.6281, + "step": 11273 + }, + { + "epoch": 0.92, + "grad_norm": 4.6903264018260105, + "learning_rate": 1.8525929980114653e-07, + "loss": 0.7361, + "step": 11274 + }, + { + "epoch": 0.92, + "grad_norm": 6.964590575087678, + "learning_rate": 1.8490473288901744e-07, + "loss": 0.6393, + "step": 11275 + }, + { + "epoch": 0.92, + "grad_norm": 4.547461324425597, + "learning_rate": 1.8455049921179858e-07, + "loss": 0.7326, + "step": 11276 + }, + { + "epoch": 0.92, + "grad_norm": 48.91079470325377, + "learning_rate": 1.8419659879400587e-07, + "loss": 0.4434, + "step": 11277 + }, + { + "epoch": 0.92, + "grad_norm": 7.599910658665928, + "learning_rate": 1.8384303166013194e-07, + "loss": 0.7121, + "step": 11278 + }, + { + "epoch": 0.92, + "grad_norm": 5.094208246499218, + "learning_rate": 1.8348979783464505e-07, + "loss": 0.6975, + "step": 11279 + }, + { + "epoch": 0.92, + "grad_norm": 7.1835376821648484, + "learning_rate": 1.831368973419906e-07, + "loss": 0.5416, + "step": 11280 + }, + { + "epoch": 0.92, + "grad_norm": 4.078551495747529, + "learning_rate": 1.827843302065929e-07, + "loss": 0.4897, + "step": 11281 + }, + { + "epoch": 0.92, + "grad_norm": 6.288439876156027, + "learning_rate": 1.8243209645285143e-07, + "loss": 0.6914, + "step": 11282 + }, + { + "epoch": 0.92, + "grad_norm": 4.5239103333466275, + "learning_rate": 1.8208019610514273e-07, + "loss": 0.6296, + "step": 11283 + }, + { + "epoch": 0.92, + "grad_norm": 2.647979380089214, + "learning_rate": 1.8172862918782008e-07, + "loss": 0.6212, + "step": 11284 + }, + { + "epoch": 0.92, + "grad_norm": 2.42850606551405, + "learning_rate": 1.8137739572521518e-07, + "loss": 0.6969, + "step": 11285 + }, + { + "epoch": 0.92, + "grad_norm": 2.8193799941343847, + "learning_rate": 1.8102649574163523e-07, + "loss": 0.6285, + "step": 11286 + }, + { + "epoch": 0.92, + "grad_norm": 5.3924909696544265, + "learning_rate": 1.8067592926136412e-07, + "loss": 0.5888, + "step": 11287 + }, + { + "epoch": 0.92, + "grad_norm": 3.7818778346585287, + "learning_rate": 1.803256963086636e-07, + "loss": 0.5445, + "step": 11288 + }, + { + "epoch": 0.92, + "grad_norm": 3.255098907805088, + "learning_rate": 1.7997579690777257e-07, + "loss": 0.8356, + "step": 11289 + }, + { + "epoch": 0.92, + "grad_norm": 2.7259441838723153, + "learning_rate": 1.7962623108290556e-07, + "loss": 0.4971, + "step": 11290 + }, + { + "epoch": 0.92, + "grad_norm": 4.696404589431635, + "learning_rate": 1.7927699885825488e-07, + "loss": 0.5833, + "step": 11291 + }, + { + "epoch": 0.92, + "grad_norm": 5.084098117617025, + "learning_rate": 1.7892810025798958e-07, + "loss": 0.8636, + "step": 11292 + }, + { + "epoch": 0.92, + "grad_norm": 5.779511802895045, + "learning_rate": 1.7857953530625528e-07, + "loss": 0.6311, + "step": 11293 + }, + { + "epoch": 0.92, + "grad_norm": 4.813675047655089, + "learning_rate": 1.7823130402717604e-07, + "loss": 0.8099, + "step": 11294 + }, + { + "epoch": 0.92, + "grad_norm": 6.518444299799028, + "learning_rate": 1.7788340644485093e-07, + "loss": 0.6243, + "step": 11295 + }, + { + "epoch": 0.92, + "grad_norm": 3.352692611521556, + "learning_rate": 1.7753584258335677e-07, + "loss": 0.6686, + "step": 11296 + }, + { + "epoch": 0.92, + "grad_norm": 4.7854030750804855, + "learning_rate": 1.7718861246674656e-07, + "loss": 0.6203, + "step": 11297 + }, + { + "epoch": 0.92, + "grad_norm": 3.0625260606515714, + "learning_rate": 1.768417161190511e-07, + "loss": 0.7503, + "step": 11298 + }, + { + "epoch": 0.92, + "grad_norm": 8.733551191478266, + "learning_rate": 1.7649515356427839e-07, + "loss": 0.7507, + "step": 11299 + }, + { + "epoch": 0.92, + "grad_norm": 3.9132102446045893, + "learning_rate": 1.76148924826412e-07, + "loss": 0.5885, + "step": 11300 + }, + { + "epoch": 0.92, + "grad_norm": 5.394820205406352, + "learning_rate": 1.758030299294139e-07, + "loss": 0.7972, + "step": 11301 + }, + { + "epoch": 0.92, + "grad_norm": 5.415622490564768, + "learning_rate": 1.754574688972216e-07, + "loss": 0.6228, + "step": 11302 + }, + { + "epoch": 0.92, + "grad_norm": 3.6203674526266982, + "learning_rate": 1.7511224175375097e-07, + "loss": 0.6477, + "step": 11303 + }, + { + "epoch": 0.92, + "grad_norm": 3.482400390919232, + "learning_rate": 1.7476734852289235e-07, + "loss": 0.6297, + "step": 11304 + }, + { + "epoch": 0.92, + "grad_norm": 4.646189629126471, + "learning_rate": 1.7442278922851551e-07, + "loss": 0.6205, + "step": 11305 + }, + { + "epoch": 0.92, + "grad_norm": 2.5846996323918274, + "learning_rate": 1.7407856389446588e-07, + "loss": 0.5584, + "step": 11306 + }, + { + "epoch": 0.92, + "grad_norm": 4.119611207171882, + "learning_rate": 1.73734672544566e-07, + "loss": 0.7873, + "step": 11307 + }, + { + "epoch": 0.92, + "grad_norm": 3.8005402386654237, + "learning_rate": 1.7339111520261686e-07, + "loss": 0.7073, + "step": 11308 + }, + { + "epoch": 0.92, + "grad_norm": 4.046992935276586, + "learning_rate": 1.7304789189239167e-07, + "loss": 0.8008, + "step": 11309 + }, + { + "epoch": 0.92, + "grad_norm": 5.040735739036495, + "learning_rate": 1.7270500263764645e-07, + "loss": 0.6108, + "step": 11310 + }, + { + "epoch": 0.92, + "grad_norm": 8.466417338929515, + "learning_rate": 1.7236244746210994e-07, + "loss": 0.6214, + "step": 11311 + }, + { + "epoch": 0.92, + "grad_norm": 4.110455408198369, + "learning_rate": 1.7202022638948878e-07, + "loss": 0.6717, + "step": 11312 + }, + { + "epoch": 0.92, + "grad_norm": 4.671910819961705, + "learning_rate": 1.7167833944346846e-07, + "loss": 0.6942, + "step": 11313 + }, + { + "epoch": 0.92, + "grad_norm": 20.625401341803297, + "learning_rate": 1.7133678664770726e-07, + "loss": 0.5982, + "step": 11314 + }, + { + "epoch": 0.92, + "grad_norm": 3.667073387587954, + "learning_rate": 1.7099556802584628e-07, + "loss": 0.5206, + "step": 11315 + }, + { + "epoch": 0.92, + "grad_norm": 3.804106872063484, + "learning_rate": 1.7065468360149607e-07, + "loss": 0.8144, + "step": 11316 + }, + { + "epoch": 0.92, + "grad_norm": 3.945172852137348, + "learning_rate": 1.7031413339825054e-07, + "loss": 0.762, + "step": 11317 + }, + { + "epoch": 0.92, + "grad_norm": 4.399638969248526, + "learning_rate": 1.6997391743967696e-07, + "loss": 0.7223, + "step": 11318 + }, + { + "epoch": 0.92, + "grad_norm": 5.530700456349516, + "learning_rate": 1.696340357493209e-07, + "loss": 0.616, + "step": 11319 + }, + { + "epoch": 0.92, + "grad_norm": 3.77537007080804, + "learning_rate": 1.6929448835070418e-07, + "loss": 0.7288, + "step": 11320 + }, + { + "epoch": 0.92, + "grad_norm": 3.2395221834056325, + "learning_rate": 1.689552752673246e-07, + "loss": 0.6597, + "step": 11321 + }, + { + "epoch": 0.92, + "grad_norm": 3.04079371987463, + "learning_rate": 1.686163965226606e-07, + "loss": 0.7368, + "step": 11322 + }, + { + "epoch": 0.92, + "grad_norm": 3.843562519496911, + "learning_rate": 1.6827785214016123e-07, + "loss": 0.6692, + "step": 11323 + }, + { + "epoch": 0.92, + "grad_norm": 4.8703317084375755, + "learning_rate": 1.6793964214325776e-07, + "loss": 0.5896, + "step": 11324 + }, + { + "epoch": 0.92, + "grad_norm": 4.904465482176586, + "learning_rate": 1.6760176655535643e-07, + "loss": 0.5499, + "step": 11325 + }, + { + "epoch": 0.92, + "grad_norm": 4.345299328999583, + "learning_rate": 1.672642253998402e-07, + "loss": 0.6296, + "step": 11326 + }, + { + "epoch": 0.92, + "grad_norm": 3.0821982158403833, + "learning_rate": 1.6692701870006933e-07, + "loss": 0.706, + "step": 11327 + }, + { + "epoch": 0.92, + "grad_norm": 2.8088281504890302, + "learning_rate": 1.665901464793801e-07, + "loss": 0.5967, + "step": 11328 + }, + { + "epoch": 0.92, + "grad_norm": 5.5914595375943685, + "learning_rate": 1.6625360876108608e-07, + "loss": 0.6793, + "step": 11329 + }, + { + "epoch": 0.92, + "grad_norm": 4.05874173295695, + "learning_rate": 1.6591740556847812e-07, + "loss": 0.6682, + "step": 11330 + }, + { + "epoch": 0.92, + "grad_norm": 2.849166669824276, + "learning_rate": 1.655815369248237e-07, + "loss": 0.7779, + "step": 11331 + }, + { + "epoch": 0.92, + "grad_norm": 2.9577242451888446, + "learning_rate": 1.65246002853367e-07, + "loss": 0.6344, + "step": 11332 + }, + { + "epoch": 0.92, + "grad_norm": 3.8005021807318426, + "learning_rate": 1.649108033773289e-07, + "loss": 0.6756, + "step": 11333 + }, + { + "epoch": 0.92, + "grad_norm": 5.5086865869507875, + "learning_rate": 1.6457593851990805e-07, + "loss": 0.5032, + "step": 11334 + }, + { + "epoch": 0.92, + "grad_norm": 5.752048430276614, + "learning_rate": 1.6424140830427816e-07, + "loss": 0.6865, + "step": 11335 + }, + { + "epoch": 0.92, + "grad_norm": 3.5670783083033104, + "learning_rate": 1.6390721275359123e-07, + "loss": 0.7315, + "step": 11336 + }, + { + "epoch": 0.92, + "grad_norm": 3.9611069283397486, + "learning_rate": 1.6357335189097546e-07, + "loss": 0.6658, + "step": 11337 + }, + { + "epoch": 0.92, + "grad_norm": 6.907016723052516, + "learning_rate": 1.632398257395368e-07, + "loss": 0.6841, + "step": 11338 + }, + { + "epoch": 0.92, + "grad_norm": 13.774494562248503, + "learning_rate": 1.6290663432235622e-07, + "loss": 0.5817, + "step": 11339 + }, + { + "epoch": 0.92, + "grad_norm": 2.1466833584054457, + "learning_rate": 1.6257377766249416e-07, + "loss": 0.6933, + "step": 11340 + }, + { + "epoch": 0.92, + "grad_norm": 4.677183835696159, + "learning_rate": 1.6224125578298611e-07, + "loss": 0.7218, + "step": 11341 + }, + { + "epoch": 0.92, + "grad_norm": 10.862130663735634, + "learning_rate": 1.6190906870684365e-07, + "loss": 0.7622, + "step": 11342 + }, + { + "epoch": 0.92, + "grad_norm": 3.1152603199156434, + "learning_rate": 1.6157721645705615e-07, + "loss": 0.6921, + "step": 11343 + }, + { + "epoch": 0.92, + "grad_norm": 3.982612311057777, + "learning_rate": 1.6124569905659136e-07, + "loss": 0.6354, + "step": 11344 + }, + { + "epoch": 0.92, + "grad_norm": 3.3627137885472655, + "learning_rate": 1.6091451652839151e-07, + "loss": 0.6055, + "step": 11345 + }, + { + "epoch": 0.92, + "grad_norm": 2.981164909693949, + "learning_rate": 1.6058366889537546e-07, + "loss": 0.561, + "step": 11346 + }, + { + "epoch": 0.92, + "grad_norm": 3.7013246737156655, + "learning_rate": 1.6025315618044211e-07, + "loss": 0.6128, + "step": 11347 + }, + { + "epoch": 0.92, + "grad_norm": 23.816482656442975, + "learning_rate": 1.5992297840646376e-07, + "loss": 0.6945, + "step": 11348 + }, + { + "epoch": 0.92, + "grad_norm": 2.5154817244944305, + "learning_rate": 1.5959313559629098e-07, + "loss": 0.6949, + "step": 11349 + }, + { + "epoch": 0.92, + "grad_norm": 4.028092005877914, + "learning_rate": 1.5926362777274994e-07, + "loss": 0.4796, + "step": 11350 + }, + { + "epoch": 0.92, + "grad_norm": 3.3833095600247156, + "learning_rate": 1.589344549586469e-07, + "loss": 0.6234, + "step": 11351 + }, + { + "epoch": 0.92, + "grad_norm": 5.042563100392468, + "learning_rate": 1.5860561717676137e-07, + "loss": 0.6392, + "step": 11352 + }, + { + "epoch": 0.92, + "grad_norm": 3.464226622332731, + "learning_rate": 1.5827711444985017e-07, + "loss": 0.5578, + "step": 11353 + }, + { + "epoch": 0.92, + "grad_norm": 5.4081553398162105, + "learning_rate": 1.57948946800649e-07, + "loss": 0.7801, + "step": 11354 + }, + { + "epoch": 0.92, + "grad_norm": 8.559692279775795, + "learning_rate": 1.576211142518691e-07, + "loss": 0.6881, + "step": 11355 + }, + { + "epoch": 0.92, + "grad_norm": 2.9308364806958203, + "learning_rate": 1.572936168261985e-07, + "loss": 0.7093, + "step": 11356 + }, + { + "epoch": 0.92, + "grad_norm": 3.7556613182043166, + "learning_rate": 1.5696645454630121e-07, + "loss": 0.6598, + "step": 11357 + }, + { + "epoch": 0.92, + "grad_norm": 3.8895442592773337, + "learning_rate": 1.5663962743481976e-07, + "loss": 0.611, + "step": 11358 + }, + { + "epoch": 0.92, + "grad_norm": 5.245928206072252, + "learning_rate": 1.5631313551437266e-07, + "loss": 0.6779, + "step": 11359 + }, + { + "epoch": 0.92, + "grad_norm": 2.8060822507163423, + "learning_rate": 1.559869788075541e-07, + "loss": 0.7006, + "step": 11360 + }, + { + "epoch": 0.92, + "grad_norm": 4.553019662325742, + "learning_rate": 1.5566115733693766e-07, + "loss": 0.7098, + "step": 11361 + }, + { + "epoch": 0.92, + "grad_norm": 12.987882498736571, + "learning_rate": 1.5533567112507196e-07, + "loss": 0.7036, + "step": 11362 + }, + { + "epoch": 0.92, + "grad_norm": 5.506611439697978, + "learning_rate": 1.5501052019448183e-07, + "loss": 0.6388, + "step": 11363 + }, + { + "epoch": 0.92, + "grad_norm": 3.54126202649975, + "learning_rate": 1.5468570456766973e-07, + "loss": 0.6579, + "step": 11364 + }, + { + "epoch": 0.92, + "grad_norm": 4.111291338264455, + "learning_rate": 1.5436122426711664e-07, + "loss": 0.5988, + "step": 11365 + }, + { + "epoch": 0.92, + "grad_norm": 5.477972852199743, + "learning_rate": 1.5403707931527735e-07, + "loss": 0.5365, + "step": 11366 + }, + { + "epoch": 0.92, + "grad_norm": 2.9252105481445767, + "learning_rate": 1.537132697345839e-07, + "loss": 0.6941, + "step": 11367 + }, + { + "epoch": 0.92, + "grad_norm": 5.370815188033459, + "learning_rate": 1.5338979554744782e-07, + "loss": 0.671, + "step": 11368 + }, + { + "epoch": 0.92, + "grad_norm": 2.804232315146814, + "learning_rate": 1.5306665677625453e-07, + "loss": 0.6526, + "step": 11369 + }, + { + "epoch": 0.92, + "grad_norm": 4.269627434998638, + "learning_rate": 1.5274385344336728e-07, + "loss": 0.5296, + "step": 11370 + }, + { + "epoch": 0.92, + "grad_norm": 8.860232817952454, + "learning_rate": 1.5242138557112595e-07, + "loss": 0.6667, + "step": 11371 + }, + { + "epoch": 0.92, + "grad_norm": 4.078921810451814, + "learning_rate": 1.5209925318184827e-07, + "loss": 0.683, + "step": 11372 + }, + { + "epoch": 0.92, + "grad_norm": 3.8827854380777485, + "learning_rate": 1.5177745629782638e-07, + "loss": 0.655, + "step": 11373 + }, + { + "epoch": 0.92, + "grad_norm": 3.627943026625959, + "learning_rate": 1.514559949413319e-07, + "loss": 0.7929, + "step": 11374 + }, + { + "epoch": 0.92, + "grad_norm": 3.3382801012273338, + "learning_rate": 1.5113486913461152e-07, + "loss": 0.7024, + "step": 11375 + }, + { + "epoch": 0.92, + "grad_norm": 10.312793837771833, + "learning_rate": 1.5081407889988908e-07, + "loss": 0.6685, + "step": 11376 + }, + { + "epoch": 0.92, + "grad_norm": 4.985840613105751, + "learning_rate": 1.5049362425936576e-07, + "loss": 0.5905, + "step": 11377 + }, + { + "epoch": 0.92, + "grad_norm": 4.668605820625122, + "learning_rate": 1.5017350523521823e-07, + "loss": 0.7027, + "step": 11378 + }, + { + "epoch": 0.92, + "grad_norm": 4.23191987741091, + "learning_rate": 1.49853721849601e-07, + "loss": 0.5432, + "step": 11379 + }, + { + "epoch": 0.92, + "grad_norm": 4.218237886594436, + "learning_rate": 1.4953427412464527e-07, + "loss": 0.6174, + "step": 11380 + }, + { + "epoch": 0.92, + "grad_norm": 4.311840916897585, + "learning_rate": 1.4921516208246002e-07, + "loss": 0.6073, + "step": 11381 + }, + { + "epoch": 0.92, + "grad_norm": 3.3755069862100195, + "learning_rate": 1.48896385745127e-07, + "loss": 0.8342, + "step": 11382 + }, + { + "epoch": 0.92, + "grad_norm": 4.345442506283233, + "learning_rate": 1.4857794513471025e-07, + "loss": 0.6335, + "step": 11383 + }, + { + "epoch": 0.92, + "grad_norm": 3.1581361945411155, + "learning_rate": 1.482598402732466e-07, + "loss": 0.6706, + "step": 11384 + }, + { + "epoch": 0.92, + "grad_norm": 3.059322864683502, + "learning_rate": 1.4794207118275007e-07, + "loss": 0.5244, + "step": 11385 + }, + { + "epoch": 0.92, + "grad_norm": 6.527982839451223, + "learning_rate": 1.4762463788521474e-07, + "loss": 0.7469, + "step": 11386 + }, + { + "epoch": 0.92, + "grad_norm": 5.870894137829264, + "learning_rate": 1.4730754040260642e-07, + "loss": 0.7489, + "step": 11387 + }, + { + "epoch": 0.92, + "grad_norm": 5.14039387263071, + "learning_rate": 1.4699077875687252e-07, + "loss": 0.5167, + "step": 11388 + }, + { + "epoch": 0.93, + "grad_norm": 9.401638749942542, + "learning_rate": 1.466743529699327e-07, + "loss": 0.6808, + "step": 11389 + }, + { + "epoch": 0.93, + "grad_norm": 2.6071930152666014, + "learning_rate": 1.463582630636873e-07, + "loss": 0.5333, + "step": 11390 + }, + { + "epoch": 0.93, + "grad_norm": 2.6917431700755707, + "learning_rate": 1.4604250906001093e-07, + "loss": 0.5804, + "step": 11391 + }, + { + "epoch": 0.93, + "grad_norm": 3.353136079190856, + "learning_rate": 1.4572709098075565e-07, + "loss": 0.7591, + "step": 11392 + }, + { + "epoch": 0.93, + "grad_norm": 4.780160289960551, + "learning_rate": 1.4541200884775119e-07, + "loss": 0.6824, + "step": 11393 + }, + { + "epoch": 0.93, + "grad_norm": 12.947381580933802, + "learning_rate": 1.4509726268280233e-07, + "loss": 0.6537, + "step": 11394 + }, + { + "epoch": 0.93, + "grad_norm": 3.6487765281032, + "learning_rate": 1.447828525076933e-07, + "loss": 0.6034, + "step": 11395 + }, + { + "epoch": 0.93, + "grad_norm": 2.7371953806911082, + "learning_rate": 1.4446877834418004e-07, + "loss": 0.721, + "step": 11396 + }, + { + "epoch": 0.93, + "grad_norm": 3.272631827861146, + "learning_rate": 1.4415504021400128e-07, + "loss": 0.6393, + "step": 11397 + }, + { + "epoch": 0.93, + "grad_norm": 3.897589784273589, + "learning_rate": 1.43841638138868e-07, + "loss": 0.6521, + "step": 11398 + }, + { + "epoch": 0.93, + "grad_norm": 4.110382566654573, + "learning_rate": 1.4352857214047056e-07, + "loss": 0.5612, + "step": 11399 + }, + { + "epoch": 0.93, + "grad_norm": 3.258100866122637, + "learning_rate": 1.4321584224047502e-07, + "loss": 0.6846, + "step": 11400 + }, + { + "epoch": 0.93, + "grad_norm": 3.1329123979100104, + "learning_rate": 1.4290344846052406e-07, + "loss": 0.692, + "step": 11401 + }, + { + "epoch": 0.93, + "grad_norm": 2.768243901488135, + "learning_rate": 1.4259139082223761e-07, + "loss": 0.6342, + "step": 11402 + }, + { + "epoch": 0.93, + "grad_norm": 3.7383615746483367, + "learning_rate": 1.422796693472106e-07, + "loss": 0.6043, + "step": 11403 + }, + { + "epoch": 0.93, + "grad_norm": 5.813280930536397, + "learning_rate": 1.41968284057018e-07, + "loss": 0.6294, + "step": 11404 + }, + { + "epoch": 0.93, + "grad_norm": 3.510748005571704, + "learning_rate": 1.4165723497320815e-07, + "loss": 0.6244, + "step": 11405 + }, + { + "epoch": 0.93, + "grad_norm": 49.73435916302651, + "learning_rate": 1.413465221173088e-07, + "loss": 0.6415, + "step": 11406 + }, + { + "epoch": 0.93, + "grad_norm": 5.6395586627568965, + "learning_rate": 1.410361455108228e-07, + "loss": 0.8055, + "step": 11407 + }, + { + "epoch": 0.93, + "grad_norm": 3.803798780051327, + "learning_rate": 1.4072610517523068e-07, + "loss": 0.6204, + "step": 11408 + }, + { + "epoch": 0.93, + "grad_norm": 2.595097040542259, + "learning_rate": 1.404164011319875e-07, + "loss": 0.6475, + "step": 11409 + }, + { + "epoch": 0.93, + "grad_norm": 2.9503674995130216, + "learning_rate": 1.401070334025284e-07, + "loss": 0.6113, + "step": 11410 + }, + { + "epoch": 0.93, + "grad_norm": 6.8650050775455735, + "learning_rate": 1.3979800200826289e-07, + "loss": 0.5771, + "step": 11411 + }, + { + "epoch": 0.93, + "grad_norm": 3.059204302336034, + "learning_rate": 1.3948930697057772e-07, + "loss": 0.6457, + "step": 11412 + }, + { + "epoch": 0.93, + "grad_norm": 2.5066812251668362, + "learning_rate": 1.3918094831083696e-07, + "loss": 0.6207, + "step": 11413 + }, + { + "epoch": 0.93, + "grad_norm": 3.4666209134087476, + "learning_rate": 1.3887292605038128e-07, + "loss": 0.7905, + "step": 11414 + }, + { + "epoch": 0.93, + "grad_norm": 4.547030065393692, + "learning_rate": 1.3856524021052696e-07, + "loss": 0.6082, + "step": 11415 + }, + { + "epoch": 0.93, + "grad_norm": 9.204413778944172, + "learning_rate": 1.3825789081256812e-07, + "loss": 0.7553, + "step": 11416 + }, + { + "epoch": 0.93, + "grad_norm": 6.833168621264518, + "learning_rate": 1.3795087787777494e-07, + "loss": 0.5879, + "step": 11417 + }, + { + "epoch": 0.93, + "grad_norm": 3.9049100321032477, + "learning_rate": 1.3764420142739543e-07, + "loss": 0.6388, + "step": 11418 + }, + { + "epoch": 0.93, + "grad_norm": 3.9527516075623437, + "learning_rate": 1.373378614826526e-07, + "loss": 0.6209, + "step": 11419 + }, + { + "epoch": 0.93, + "grad_norm": 6.063438106064102, + "learning_rate": 1.3703185806474838e-07, + "loss": 0.7724, + "step": 11420 + }, + { + "epoch": 0.93, + "grad_norm": 4.026549451805104, + "learning_rate": 1.367261911948592e-07, + "loss": 0.7508, + "step": 11421 + }, + { + "epoch": 0.93, + "grad_norm": 3.3084445404464207, + "learning_rate": 1.364208608941392e-07, + "loss": 0.5567, + "step": 11422 + }, + { + "epoch": 0.93, + "grad_norm": 4.447217249115067, + "learning_rate": 1.3611586718371871e-07, + "loss": 0.589, + "step": 11423 + }, + { + "epoch": 0.93, + "grad_norm": 2.7712875353268376, + "learning_rate": 1.3581121008470644e-07, + "loss": 0.5829, + "step": 11424 + }, + { + "epoch": 0.93, + "grad_norm": 3.119636383680576, + "learning_rate": 1.3550688961818602e-07, + "loss": 0.7012, + "step": 11425 + }, + { + "epoch": 0.93, + "grad_norm": 4.394405906267207, + "learning_rate": 1.3520290580521734e-07, + "loss": 0.6759, + "step": 11426 + }, + { + "epoch": 0.93, + "grad_norm": 6.357454642491195, + "learning_rate": 1.348992586668396e-07, + "loss": 0.6017, + "step": 11427 + }, + { + "epoch": 0.93, + "grad_norm": 4.5308063680424375, + "learning_rate": 1.3459594822406607e-07, + "loss": 0.6301, + "step": 11428 + }, + { + "epoch": 0.93, + "grad_norm": 7.658950334816602, + "learning_rate": 1.3429297449788825e-07, + "loss": 0.9281, + "step": 11429 + }, + { + "epoch": 0.93, + "grad_norm": 3.945622023335836, + "learning_rate": 1.3399033750927327e-07, + "loss": 0.7386, + "step": 11430 + }, + { + "epoch": 0.93, + "grad_norm": 2.615094996485289, + "learning_rate": 1.3368803727916658e-07, + "loss": 0.5747, + "step": 11431 + }, + { + "epoch": 0.93, + "grad_norm": 4.82000048914744, + "learning_rate": 1.3338607382848811e-07, + "loss": 0.6137, + "step": 11432 + }, + { + "epoch": 0.93, + "grad_norm": 4.041368937015934, + "learning_rate": 1.3308444717813562e-07, + "loss": 0.604, + "step": 11433 + }, + { + "epoch": 0.93, + "grad_norm": 8.452854769154436, + "learning_rate": 1.3278315734898516e-07, + "loss": 0.6382, + "step": 11434 + }, + { + "epoch": 0.93, + "grad_norm": 3.735592822997537, + "learning_rate": 1.3248220436188565e-07, + "loss": 0.4741, + "step": 11435 + }, + { + "epoch": 0.93, + "grad_norm": 3.7318679891647264, + "learning_rate": 1.3218158823766646e-07, + "loss": 0.7383, + "step": 11436 + }, + { + "epoch": 0.93, + "grad_norm": 4.410538387383327, + "learning_rate": 1.3188130899713102e-07, + "loss": 0.7998, + "step": 11437 + }, + { + "epoch": 0.93, + "grad_norm": 6.089742488283766, + "learning_rate": 1.3158136666106215e-07, + "loss": 0.6571, + "step": 11438 + }, + { + "epoch": 0.93, + "grad_norm": 6.547591457278495, + "learning_rate": 1.3128176125021653e-07, + "loss": 0.6286, + "step": 11439 + }, + { + "epoch": 0.93, + "grad_norm": 2.866402969474792, + "learning_rate": 1.3098249278532814e-07, + "loss": 0.6275, + "step": 11440 + }, + { + "epoch": 0.93, + "grad_norm": 5.069474245946652, + "learning_rate": 1.30683561287111e-07, + "loss": 0.6068, + "step": 11441 + }, + { + "epoch": 0.93, + "grad_norm": 3.3593093735361617, + "learning_rate": 1.3038496677624968e-07, + "loss": 0.7212, + "step": 11442 + }, + { + "epoch": 0.93, + "grad_norm": 2.8443034469932384, + "learning_rate": 1.3008670927341037e-07, + "loss": 0.4995, + "step": 11443 + }, + { + "epoch": 0.93, + "grad_norm": 2.7166993764204466, + "learning_rate": 1.297887887992344e-07, + "loss": 0.5849, + "step": 11444 + }, + { + "epoch": 0.93, + "grad_norm": 2.750668719298837, + "learning_rate": 1.2949120537434024e-07, + "loss": 0.5086, + "step": 11445 + }, + { + "epoch": 0.93, + "grad_norm": 10.18295930415933, + "learning_rate": 1.2919395901932087e-07, + "loss": 0.7181, + "step": 11446 + }, + { + "epoch": 0.93, + "grad_norm": 4.335804195975361, + "learning_rate": 1.288970497547498e-07, + "loss": 0.6296, + "step": 11447 + }, + { + "epoch": 0.93, + "grad_norm": 3.9923644499337567, + "learning_rate": 1.2860047760117344e-07, + "loss": 0.6693, + "step": 11448 + }, + { + "epoch": 0.93, + "grad_norm": 3.829525422343924, + "learning_rate": 1.28304242579117e-07, + "loss": 0.7851, + "step": 11449 + }, + { + "epoch": 0.93, + "grad_norm": 3.7603471583677086, + "learning_rate": 1.280083447090813e-07, + "loss": 0.6702, + "step": 11450 + }, + { + "epoch": 0.93, + "grad_norm": 8.334774438674412, + "learning_rate": 1.2771278401154496e-07, + "loss": 0.6578, + "step": 11451 + }, + { + "epoch": 0.93, + "grad_norm": 4.729800875038276, + "learning_rate": 1.2741756050696275e-07, + "loss": 0.7373, + "step": 11452 + }, + { + "epoch": 0.93, + "grad_norm": 3.660710829268898, + "learning_rate": 1.2712267421576497e-07, + "loss": 0.682, + "step": 11453 + }, + { + "epoch": 0.93, + "grad_norm": 3.1866141556836154, + "learning_rate": 1.268281251583614e-07, + "loss": 0.6424, + "step": 11454 + }, + { + "epoch": 0.93, + "grad_norm": 3.880055463916356, + "learning_rate": 1.265339133551341e-07, + "loss": 0.6087, + "step": 11455 + }, + { + "epoch": 0.93, + "grad_norm": 3.30578707194263, + "learning_rate": 1.2624003882644674e-07, + "loss": 0.7275, + "step": 11456 + }, + { + "epoch": 0.93, + "grad_norm": 6.042591611668468, + "learning_rate": 1.25946501592637e-07, + "loss": 0.7509, + "step": 11457 + }, + { + "epoch": 0.93, + "grad_norm": 2.663008684694424, + "learning_rate": 1.2565330167401747e-07, + "loss": 0.5814, + "step": 11458 + }, + { + "epoch": 0.93, + "grad_norm": 18.815938197177395, + "learning_rate": 1.253604390908819e-07, + "loss": 0.7523, + "step": 11459 + }, + { + "epoch": 0.93, + "grad_norm": 4.525150600376938, + "learning_rate": 1.2506791386349693e-07, + "loss": 0.7165, + "step": 11460 + }, + { + "epoch": 0.93, + "grad_norm": 5.096269770592939, + "learning_rate": 1.2477572601210796e-07, + "loss": 0.7191, + "step": 11461 + }, + { + "epoch": 0.93, + "grad_norm": 5.543790499615598, + "learning_rate": 1.2448387555693498e-07, + "loss": 0.6205, + "step": 11462 + }, + { + "epoch": 0.93, + "grad_norm": 7.447283179539826, + "learning_rate": 1.2419236251817735e-07, + "loss": 0.7706, + "step": 11463 + }, + { + "epoch": 0.93, + "grad_norm": 4.067276293011203, + "learning_rate": 1.2390118691600838e-07, + "loss": 0.7817, + "step": 11464 + }, + { + "epoch": 0.93, + "grad_norm": 5.872606758268423, + "learning_rate": 1.236103487705792e-07, + "loss": 0.638, + "step": 11465 + }, + { + "epoch": 0.93, + "grad_norm": 3.008389368255949, + "learning_rate": 1.2331984810201869e-07, + "loss": 0.6085, + "step": 11466 + }, + { + "epoch": 0.93, + "grad_norm": 4.6678813332588405, + "learning_rate": 1.2302968493043078e-07, + "loss": 0.7046, + "step": 11467 + }, + { + "epoch": 0.93, + "grad_norm": 3.354952063264658, + "learning_rate": 1.2273985927589715e-07, + "loss": 0.5203, + "step": 11468 + }, + { + "epoch": 0.93, + "grad_norm": 4.388074469745559, + "learning_rate": 1.2245037115847402e-07, + "loss": 0.6352, + "step": 11469 + }, + { + "epoch": 0.93, + "grad_norm": 4.363366641435926, + "learning_rate": 1.2216122059819757e-07, + "loss": 0.6699, + "step": 11470 + }, + { + "epoch": 0.93, + "grad_norm": 4.5356698483628985, + "learning_rate": 1.2187240761507736e-07, + "loss": 0.9072, + "step": 11471 + }, + { + "epoch": 0.93, + "grad_norm": 5.301440655624692, + "learning_rate": 1.2158393222910235e-07, + "loss": 0.5446, + "step": 11472 + }, + { + "epoch": 0.93, + "grad_norm": 10.592560003178411, + "learning_rate": 1.2129579446023665e-07, + "loss": 0.7756, + "step": 11473 + }, + { + "epoch": 0.93, + "grad_norm": 3.3125618821169147, + "learning_rate": 1.2100799432842037e-07, + "loss": 0.6231, + "step": 11474 + }, + { + "epoch": 0.93, + "grad_norm": 3.066028756926879, + "learning_rate": 1.2072053185357146e-07, + "loss": 0.7225, + "step": 11475 + }, + { + "epoch": 0.93, + "grad_norm": 3.222692303239116, + "learning_rate": 1.2043340705558405e-07, + "loss": 0.5683, + "step": 11476 + }, + { + "epoch": 0.93, + "grad_norm": 7.062321074809714, + "learning_rate": 1.201466199543294e-07, + "loss": 0.6948, + "step": 11477 + }, + { + "epoch": 0.93, + "grad_norm": 3.9573957233842276, + "learning_rate": 1.1986017056965448e-07, + "loss": 0.616, + "step": 11478 + }, + { + "epoch": 0.93, + "grad_norm": 4.704614401418179, + "learning_rate": 1.1957405892138397e-07, + "loss": 0.68, + "step": 11479 + }, + { + "epoch": 0.93, + "grad_norm": 4.5823885844589345, + "learning_rate": 1.1928828502931867e-07, + "loss": 0.609, + "step": 11480 + }, + { + "epoch": 0.93, + "grad_norm": 6.304430283723194, + "learning_rate": 1.1900284891323499e-07, + "loss": 0.6984, + "step": 11481 + }, + { + "epoch": 0.93, + "grad_norm": 3.4489755957564174, + "learning_rate": 1.1871775059288771e-07, + "loss": 0.6543, + "step": 11482 + }, + { + "epoch": 0.93, + "grad_norm": 5.072475991937842, + "learning_rate": 1.1843299008800712e-07, + "loss": 0.6035, + "step": 11483 + }, + { + "epoch": 0.93, + "grad_norm": 11.761782002413566, + "learning_rate": 1.1814856741830027e-07, + "loss": 0.6944, + "step": 11484 + }, + { + "epoch": 0.93, + "grad_norm": 3.9315357284272667, + "learning_rate": 1.1786448260345141e-07, + "loss": 0.8571, + "step": 11485 + }, + { + "epoch": 0.93, + "grad_norm": 7.267408502137549, + "learning_rate": 1.175807356631209e-07, + "loss": 0.7623, + "step": 11486 + }, + { + "epoch": 0.93, + "grad_norm": 6.81775875186139, + "learning_rate": 1.1729732661694582e-07, + "loss": 0.6502, + "step": 11487 + }, + { + "epoch": 0.93, + "grad_norm": 2.6358801138093613, + "learning_rate": 1.1701425548453938e-07, + "loss": 0.6235, + "step": 11488 + }, + { + "epoch": 0.93, + "grad_norm": 12.343909592743612, + "learning_rate": 1.1673152228549256e-07, + "loss": 0.4986, + "step": 11489 + }, + { + "epoch": 0.93, + "grad_norm": 6.2864122965684075, + "learning_rate": 1.1644912703937194e-07, + "loss": 0.573, + "step": 11490 + }, + { + "epoch": 0.93, + "grad_norm": 5.355532116362043, + "learning_rate": 1.1616706976572134e-07, + "loss": 0.5227, + "step": 11491 + }, + { + "epoch": 0.93, + "grad_norm": 3.3531169924161017, + "learning_rate": 1.1588535048406013e-07, + "loss": 0.6328, + "step": 11492 + }, + { + "epoch": 0.93, + "grad_norm": 3.8756372832388304, + "learning_rate": 1.1560396921388551e-07, + "loss": 0.6779, + "step": 11493 + }, + { + "epoch": 0.93, + "grad_norm": 9.408758765338888, + "learning_rate": 1.1532292597467188e-07, + "loss": 0.5711, + "step": 11494 + }, + { + "epoch": 0.93, + "grad_norm": 3.4668620983892366, + "learning_rate": 1.1504222078586757e-07, + "loss": 0.5696, + "step": 11495 + }, + { + "epoch": 0.93, + "grad_norm": 4.5817165494331435, + "learning_rate": 1.1476185366689985e-07, + "loss": 0.7049, + "step": 11496 + }, + { + "epoch": 0.93, + "grad_norm": 4.154842225515676, + "learning_rate": 1.1448182463717205e-07, + "loss": 0.7533, + "step": 11497 + }, + { + "epoch": 0.93, + "grad_norm": 3.9670265430670297, + "learning_rate": 1.142021337160637e-07, + "loss": 0.7129, + "step": 11498 + }, + { + "epoch": 0.93, + "grad_norm": 3.421862648752209, + "learning_rate": 1.1392278092293041e-07, + "loss": 0.8248, + "step": 11499 + }, + { + "epoch": 0.93, + "grad_norm": 4.1245467483504745, + "learning_rate": 1.1364376627710727e-07, + "loss": 0.5923, + "step": 11500 + }, + { + "epoch": 0.93, + "grad_norm": 10.428484925579507, + "learning_rate": 1.1336508979790217e-07, + "loss": 0.7293, + "step": 11501 + }, + { + "epoch": 0.93, + "grad_norm": 5.31349496955677, + "learning_rate": 1.1308675150460136e-07, + "loss": 0.6367, + "step": 11502 + }, + { + "epoch": 0.93, + "grad_norm": 4.807113153984954, + "learning_rate": 1.1280875141646774e-07, + "loss": 0.565, + "step": 11503 + }, + { + "epoch": 0.93, + "grad_norm": 4.779421150724952, + "learning_rate": 1.1253108955274094e-07, + "loss": 0.7455, + "step": 11504 + }, + { + "epoch": 0.93, + "grad_norm": 8.916204004292753, + "learning_rate": 1.1225376593263726e-07, + "loss": 0.5611, + "step": 11505 + }, + { + "epoch": 0.93, + "grad_norm": 3.1897827783342025, + "learning_rate": 1.11976780575348e-07, + "loss": 0.7628, + "step": 11506 + }, + { + "epoch": 0.93, + "grad_norm": 3.125904759399018, + "learning_rate": 1.1170013350004449e-07, + "loss": 0.616, + "step": 11507 + }, + { + "epoch": 0.93, + "grad_norm": 3.6804669900038514, + "learning_rate": 1.1142382472586921e-07, + "loss": 0.5992, + "step": 11508 + }, + { + "epoch": 0.93, + "grad_norm": 3.479507763164218, + "learning_rate": 1.111478542719474e-07, + "loss": 0.6898, + "step": 11509 + }, + { + "epoch": 0.93, + "grad_norm": 3.166576669367274, + "learning_rate": 1.1087222215737603e-07, + "loss": 0.6678, + "step": 11510 + }, + { + "epoch": 0.93, + "grad_norm": 6.5907576553474, + "learning_rate": 1.1059692840123204e-07, + "loss": 0.6673, + "step": 11511 + }, + { + "epoch": 0.94, + "grad_norm": 3.949853025962199, + "learning_rate": 1.1032197302256686e-07, + "loss": 0.6254, + "step": 11512 + }, + { + "epoch": 0.94, + "grad_norm": 4.305377308077892, + "learning_rate": 1.1004735604040862e-07, + "loss": 0.6232, + "step": 11513 + }, + { + "epoch": 0.94, + "grad_norm": 7.385771144314364, + "learning_rate": 1.0977307747376431e-07, + "loss": 0.737, + "step": 11514 + }, + { + "epoch": 0.94, + "grad_norm": 3.9879489698615607, + "learning_rate": 1.0949913734161266e-07, + "loss": 0.5812, + "step": 11515 + }, + { + "epoch": 0.94, + "grad_norm": 2.718159896092306, + "learning_rate": 1.0922553566291516e-07, + "loss": 0.6668, + "step": 11516 + }, + { + "epoch": 0.94, + "grad_norm": 6.151474753800585, + "learning_rate": 1.0895227245660444e-07, + "loss": 0.7592, + "step": 11517 + }, + { + "epoch": 0.94, + "grad_norm": 3.175089066326356, + "learning_rate": 1.0867934774159372e-07, + "loss": 0.8187, + "step": 11518 + }, + { + "epoch": 0.94, + "grad_norm": 4.131506931443423, + "learning_rate": 1.0840676153677066e-07, + "loss": 0.5617, + "step": 11519 + }, + { + "epoch": 0.94, + "grad_norm": 2.6311422849833903, + "learning_rate": 1.0813451386099904e-07, + "loss": 0.6008, + "step": 11520 + }, + { + "epoch": 0.94, + "grad_norm": 5.0694915706349555, + "learning_rate": 1.0786260473312104e-07, + "loss": 0.6224, + "step": 11521 + }, + { + "epoch": 0.94, + "grad_norm": 2.406061541795044, + "learning_rate": 1.0759103417195438e-07, + "loss": 0.6534, + "step": 11522 + }, + { + "epoch": 0.94, + "grad_norm": 2.968554981058841, + "learning_rate": 1.0731980219629346e-07, + "loss": 0.6147, + "step": 11523 + }, + { + "epoch": 0.94, + "grad_norm": 3.8488686832202417, + "learning_rate": 1.0704890882490827e-07, + "loss": 0.4803, + "step": 11524 + }, + { + "epoch": 0.94, + "grad_norm": 5.342749147250126, + "learning_rate": 1.0677835407654824e-07, + "loss": 0.5724, + "step": 11525 + }, + { + "epoch": 0.94, + "grad_norm": 38.42224631027316, + "learning_rate": 1.0650813796993508e-07, + "loss": 0.546, + "step": 11526 + }, + { + "epoch": 0.94, + "grad_norm": 3.243083961626768, + "learning_rate": 1.0623826052377217e-07, + "loss": 0.5683, + "step": 11527 + }, + { + "epoch": 0.94, + "grad_norm": 3.339100282442589, + "learning_rate": 1.0596872175673456e-07, + "loss": 0.5925, + "step": 11528 + }, + { + "epoch": 0.94, + "grad_norm": 6.323886679719286, + "learning_rate": 1.0569952168747677e-07, + "loss": 0.7036, + "step": 11529 + }, + { + "epoch": 0.94, + "grad_norm": 4.297906812413702, + "learning_rate": 1.0543066033462946e-07, + "loss": 0.7611, + "step": 11530 + }, + { + "epoch": 0.94, + "grad_norm": 2.5633968139913996, + "learning_rate": 1.0516213771679885e-07, + "loss": 0.6412, + "step": 11531 + }, + { + "epoch": 0.94, + "grad_norm": 5.471481590573864, + "learning_rate": 1.0489395385256896e-07, + "loss": 0.6177, + "step": 11532 + }, + { + "epoch": 0.94, + "grad_norm": 3.1911007804717975, + "learning_rate": 1.046261087604994e-07, + "loss": 0.6781, + "step": 11533 + }, + { + "epoch": 0.94, + "grad_norm": 3.5118392570540125, + "learning_rate": 1.0435860245912754e-07, + "loss": 0.7317, + "step": 11534 + }, + { + "epoch": 0.94, + "grad_norm": 2.7864538147219746, + "learning_rate": 1.0409143496696528e-07, + "loss": 0.7429, + "step": 11535 + }, + { + "epoch": 0.94, + "grad_norm": 3.3808046819415942, + "learning_rate": 1.038246063025028e-07, + "loss": 0.6297, + "step": 11536 + }, + { + "epoch": 0.94, + "grad_norm": 9.732503340033963, + "learning_rate": 1.03558116484207e-07, + "loss": 0.6748, + "step": 11537 + }, + { + "epoch": 0.94, + "grad_norm": 4.293665581081362, + "learning_rate": 1.0329196553051924e-07, + "loss": 0.5564, + "step": 11538 + }, + { + "epoch": 0.94, + "grad_norm": 3.660290581264573, + "learning_rate": 1.0302615345986034e-07, + "loss": 0.6979, + "step": 11539 + }, + { + "epoch": 0.94, + "grad_norm": 3.3771867679522973, + "learning_rate": 1.0276068029062559e-07, + "loss": 0.7345, + "step": 11540 + }, + { + "epoch": 0.94, + "grad_norm": 4.516735338652766, + "learning_rate": 1.024955460411875e-07, + "loss": 0.6474, + "step": 11541 + }, + { + "epoch": 0.94, + "grad_norm": 3.6534943145331114, + "learning_rate": 1.0223075072989418e-07, + "loss": 0.5188, + "step": 11542 + }, + { + "epoch": 0.94, + "grad_norm": 5.811678301905325, + "learning_rate": 1.019662943750721e-07, + "loss": 0.7598, + "step": 11543 + }, + { + "epoch": 0.94, + "grad_norm": 3.0269610934528606, + "learning_rate": 1.0170217699502272e-07, + "loss": 0.5633, + "step": 11544 + }, + { + "epoch": 0.94, + "grad_norm": 6.031090401356234, + "learning_rate": 1.0143839860802529e-07, + "loss": 0.6686, + "step": 11545 + }, + { + "epoch": 0.94, + "grad_norm": 3.9537633934158403, + "learning_rate": 1.0117495923233467e-07, + "loss": 0.78, + "step": 11546 + }, + { + "epoch": 0.94, + "grad_norm": 2.7836766586563786, + "learning_rate": 1.0091185888618238e-07, + "loss": 0.6807, + "step": 11547 + }, + { + "epoch": 0.94, + "grad_norm": 5.525225967591341, + "learning_rate": 1.0064909758777719e-07, + "loss": 0.5995, + "step": 11548 + }, + { + "epoch": 0.94, + "grad_norm": 4.036200857961824, + "learning_rate": 1.0038667535530233e-07, + "loss": 0.6269, + "step": 11549 + }, + { + "epoch": 0.94, + "grad_norm": 4.768220441354685, + "learning_rate": 1.001245922069205e-07, + "loss": 0.6867, + "step": 11550 + }, + { + "epoch": 0.94, + "grad_norm": 3.631182825766013, + "learning_rate": 9.98628481607683e-08, + "loss": 0.6584, + "step": 11551 + }, + { + "epoch": 0.94, + "grad_norm": 3.521279969963515, + "learning_rate": 9.960144323496179e-08, + "loss": 0.6244, + "step": 11552 + }, + { + "epoch": 0.94, + "grad_norm": 8.285872302571256, + "learning_rate": 9.934037744759096e-08, + "loss": 0.6541, + "step": 11553 + }, + { + "epoch": 0.94, + "grad_norm": 5.755804522446817, + "learning_rate": 9.907965081672244e-08, + "loss": 0.6331, + "step": 11554 + }, + { + "epoch": 0.94, + "grad_norm": 3.5099633342407204, + "learning_rate": 9.881926336040126e-08, + "loss": 0.7255, + "step": 11555 + }, + { + "epoch": 0.94, + "grad_norm": 5.651743321305104, + "learning_rate": 9.855921509664745e-08, + "loss": 0.6239, + "step": 11556 + }, + { + "epoch": 0.94, + "grad_norm": 3.7542328891519845, + "learning_rate": 9.829950604345772e-08, + "loss": 0.6934, + "step": 11557 + }, + { + "epoch": 0.94, + "grad_norm": 3.4735214247980846, + "learning_rate": 9.804013621880548e-08, + "loss": 0.6757, + "step": 11558 + }, + { + "epoch": 0.94, + "grad_norm": 3.3913642322877506, + "learning_rate": 9.778110564064191e-08, + "loss": 0.6834, + "step": 11559 + }, + { + "epoch": 0.94, + "grad_norm": 2.984380678780318, + "learning_rate": 9.752241432689214e-08, + "loss": 0.7108, + "step": 11560 + }, + { + "epoch": 0.94, + "grad_norm": 3.1182360513861322, + "learning_rate": 9.72640622954607e-08, + "loss": 0.6659, + "step": 11561 + }, + { + "epoch": 0.94, + "grad_norm": 4.439015078897651, + "learning_rate": 9.700604956422554e-08, + "loss": 0.5725, + "step": 11562 + }, + { + "epoch": 0.94, + "grad_norm": 3.3307809303430456, + "learning_rate": 9.674837615104349e-08, + "loss": 0.7122, + "step": 11563 + }, + { + "epoch": 0.94, + "grad_norm": 3.7068015658384317, + "learning_rate": 9.649104207374749e-08, + "loss": 0.6315, + "step": 11564 + }, + { + "epoch": 0.94, + "grad_norm": 2.3523424097740366, + "learning_rate": 9.623404735014608e-08, + "loss": 0.6933, + "step": 11565 + }, + { + "epoch": 0.94, + "grad_norm": 5.176665596445569, + "learning_rate": 9.597739199802614e-08, + "loss": 0.5512, + "step": 11566 + }, + { + "epoch": 0.94, + "grad_norm": 3.475544379390946, + "learning_rate": 9.572107603514846e-08, + "loss": 0.5305, + "step": 11567 + }, + { + "epoch": 0.94, + "grad_norm": 3.675809172104682, + "learning_rate": 9.54650994792522e-08, + "loss": 0.5767, + "step": 11568 + }, + { + "epoch": 0.94, + "grad_norm": 5.737765280557999, + "learning_rate": 9.520946234805206e-08, + "loss": 0.5663, + "step": 11569 + }, + { + "epoch": 0.94, + "grad_norm": 2.9182776775666692, + "learning_rate": 9.495416465924113e-08, + "loss": 0.5466, + "step": 11570 + }, + { + "epoch": 0.94, + "grad_norm": 3.6868794542786842, + "learning_rate": 9.469920643048636e-08, + "loss": 0.6532, + "step": 11571 + }, + { + "epoch": 0.94, + "grad_norm": 5.0995567761750795, + "learning_rate": 9.444458767943254e-08, + "loss": 0.7676, + "step": 11572 + }, + { + "epoch": 0.94, + "grad_norm": 2.697674843721375, + "learning_rate": 9.419030842370114e-08, + "loss": 0.6472, + "step": 11573 + }, + { + "epoch": 0.94, + "grad_norm": 5.4233984982670895, + "learning_rate": 9.393636868089029e-08, + "loss": 0.823, + "step": 11574 + }, + { + "epoch": 0.94, + "grad_norm": 3.6890610300496998, + "learning_rate": 9.368276846857427e-08, + "loss": 0.633, + "step": 11575 + }, + { + "epoch": 0.94, + "grad_norm": 10.782577575098676, + "learning_rate": 9.342950780430238e-08, + "loss": 0.6728, + "step": 11576 + }, + { + "epoch": 0.94, + "grad_norm": 4.266639622208362, + "learning_rate": 9.317658670560336e-08, + "loss": 0.6915, + "step": 11577 + }, + { + "epoch": 0.94, + "grad_norm": 2.9196072722320987, + "learning_rate": 9.292400518998102e-08, + "loss": 0.653, + "step": 11578 + }, + { + "epoch": 0.94, + "grad_norm": 4.968598092054262, + "learning_rate": 9.267176327491412e-08, + "loss": 0.7488, + "step": 11579 + }, + { + "epoch": 0.94, + "grad_norm": 3.858103519916151, + "learning_rate": 9.241986097786093e-08, + "loss": 0.6532, + "step": 11580 + }, + { + "epoch": 0.94, + "grad_norm": 2.4807037938636998, + "learning_rate": 9.216829831625363e-08, + "loss": 0.6437, + "step": 11581 + }, + { + "epoch": 0.94, + "grad_norm": 4.200296841777683, + "learning_rate": 9.191707530750271e-08, + "loss": 0.7319, + "step": 11582 + }, + { + "epoch": 0.94, + "grad_norm": 4.18615010952583, + "learning_rate": 9.166619196899318e-08, + "loss": 0.6507, + "step": 11583 + }, + { + "epoch": 0.94, + "grad_norm": 4.572003535381461, + "learning_rate": 9.141564831808947e-08, + "loss": 0.6512, + "step": 11584 + }, + { + "epoch": 0.94, + "grad_norm": 2.910539420515676, + "learning_rate": 9.116544437212993e-08, + "loss": 0.7063, + "step": 11585 + }, + { + "epoch": 0.94, + "grad_norm": 7.726111085651036, + "learning_rate": 9.091558014842961e-08, + "loss": 0.5578, + "step": 11586 + }, + { + "epoch": 0.94, + "grad_norm": 6.525478925624177, + "learning_rate": 9.066605566428188e-08, + "loss": 0.6738, + "step": 11587 + }, + { + "epoch": 0.94, + "grad_norm": 3.350555394766436, + "learning_rate": 9.041687093695461e-08, + "loss": 0.6915, + "step": 11588 + }, + { + "epoch": 0.94, + "grad_norm": 2.838150938541724, + "learning_rate": 9.01680259836929e-08, + "loss": 0.6678, + "step": 11589 + }, + { + "epoch": 0.94, + "grad_norm": 4.639352425093215, + "learning_rate": 8.991952082171851e-08, + "loss": 0.588, + "step": 11590 + }, + { + "epoch": 0.94, + "grad_norm": 3.3533410319936827, + "learning_rate": 8.967135546823047e-08, + "loss": 0.5919, + "step": 11591 + }, + { + "epoch": 0.94, + "grad_norm": 5.016141101764093, + "learning_rate": 8.942352994040227e-08, + "loss": 0.6153, + "step": 11592 + }, + { + "epoch": 0.94, + "grad_norm": 15.523717066604496, + "learning_rate": 8.917604425538518e-08, + "loss": 0.6691, + "step": 11593 + }, + { + "epoch": 0.94, + "grad_norm": 10.429911331746677, + "learning_rate": 8.892889843030717e-08, + "loss": 0.6266, + "step": 11594 + }, + { + "epoch": 0.94, + "grad_norm": 2.931049488674767, + "learning_rate": 8.868209248227178e-08, + "loss": 0.7576, + "step": 11595 + }, + { + "epoch": 0.94, + "grad_norm": 3.821839098993725, + "learning_rate": 8.843562642835979e-08, + "loss": 0.5988, + "step": 11596 + }, + { + "epoch": 0.94, + "grad_norm": 5.566454634037884, + "learning_rate": 8.818950028562811e-08, + "loss": 0.7029, + "step": 11597 + }, + { + "epoch": 0.94, + "grad_norm": 3.961223750544519, + "learning_rate": 8.794371407111091e-08, + "loss": 0.6638, + "step": 11598 + }, + { + "epoch": 0.94, + "grad_norm": 4.36013230181537, + "learning_rate": 8.769826780181678e-08, + "loss": 0.5431, + "step": 11599 + }, + { + "epoch": 0.94, + "grad_norm": 5.5354828177064785, + "learning_rate": 8.745316149473382e-08, + "loss": 0.6708, + "step": 11600 + }, + { + "epoch": 0.94, + "grad_norm": 2.726120754900734, + "learning_rate": 8.720839516682344e-08, + "loss": 0.6106, + "step": 11601 + }, + { + "epoch": 0.94, + "grad_norm": 3.3694560656560912, + "learning_rate": 8.6963968835026e-08, + "loss": 0.4937, + "step": 11602 + }, + { + "epoch": 0.94, + "grad_norm": 4.784349168717699, + "learning_rate": 8.671988251625685e-08, + "loss": 0.6154, + "step": 11603 + }, + { + "epoch": 0.94, + "grad_norm": 3.4841503991185947, + "learning_rate": 8.647613622740746e-08, + "loss": 0.6955, + "step": 11604 + }, + { + "epoch": 0.94, + "grad_norm": 3.3273731289619217, + "learning_rate": 8.623272998534882e-08, + "loss": 0.6842, + "step": 11605 + }, + { + "epoch": 0.94, + "grad_norm": 9.537243778071526, + "learning_rate": 8.598966380692408e-08, + "loss": 0.5794, + "step": 11606 + }, + { + "epoch": 0.94, + "grad_norm": 2.9748760151315534, + "learning_rate": 8.574693770895648e-08, + "loss": 0.7394, + "step": 11607 + }, + { + "epoch": 0.94, + "grad_norm": 4.7055747418133365, + "learning_rate": 8.550455170824313e-08, + "loss": 0.7158, + "step": 11608 + }, + { + "epoch": 0.94, + "grad_norm": 3.8182130679469277, + "learning_rate": 8.526250582155893e-08, + "loss": 0.6823, + "step": 11609 + }, + { + "epoch": 0.94, + "grad_norm": 3.163062967380128, + "learning_rate": 8.502080006565495e-08, + "loss": 0.7314, + "step": 11610 + }, + { + "epoch": 0.94, + "grad_norm": 6.835815105432434, + "learning_rate": 8.477943445725889e-08, + "loss": 0.6437, + "step": 11611 + }, + { + "epoch": 0.94, + "grad_norm": 5.743579343542501, + "learning_rate": 8.45384090130752e-08, + "loss": 0.5644, + "step": 11612 + }, + { + "epoch": 0.94, + "grad_norm": 4.744159961895033, + "learning_rate": 8.429772374978384e-08, + "loss": 0.782, + "step": 11613 + }, + { + "epoch": 0.94, + "grad_norm": 4.792524804149734, + "learning_rate": 8.405737868404151e-08, + "loss": 0.6003, + "step": 11614 + }, + { + "epoch": 0.94, + "grad_norm": 4.3166922619617765, + "learning_rate": 8.381737383248156e-08, + "loss": 0.6129, + "step": 11615 + }, + { + "epoch": 0.94, + "grad_norm": 6.499222075008327, + "learning_rate": 8.357770921171516e-08, + "loss": 0.574, + "step": 11616 + }, + { + "epoch": 0.94, + "grad_norm": 6.521274010017859, + "learning_rate": 8.333838483832679e-08, + "loss": 0.5296, + "step": 11617 + }, + { + "epoch": 0.94, + "grad_norm": 5.855771900680462, + "learning_rate": 8.309940072888046e-08, + "loss": 0.6511, + "step": 11618 + }, + { + "epoch": 0.94, + "grad_norm": 4.772660268051909, + "learning_rate": 8.286075689991457e-08, + "loss": 0.7796, + "step": 11619 + }, + { + "epoch": 0.94, + "grad_norm": 2.980918799755801, + "learning_rate": 8.262245336794594e-08, + "loss": 0.6253, + "step": 11620 + }, + { + "epoch": 0.94, + "grad_norm": 5.179517333585579, + "learning_rate": 8.238449014946526e-08, + "loss": 0.6407, + "step": 11621 + }, + { + "epoch": 0.94, + "grad_norm": 3.1512457581422826, + "learning_rate": 8.214686726094157e-08, + "loss": 0.6343, + "step": 11622 + }, + { + "epoch": 0.94, + "grad_norm": 3.82226874289729, + "learning_rate": 8.19095847188206e-08, + "loss": 0.5487, + "step": 11623 + }, + { + "epoch": 0.94, + "grad_norm": 7.873913725748984, + "learning_rate": 8.167264253952256e-08, + "loss": 0.8426, + "step": 11624 + }, + { + "epoch": 0.94, + "grad_norm": 6.504261537819809, + "learning_rate": 8.143604073944656e-08, + "loss": 0.7083, + "step": 11625 + }, + { + "epoch": 0.94, + "grad_norm": 9.504890110119282, + "learning_rate": 8.11997793349667e-08, + "loss": 0.615, + "step": 11626 + }, + { + "epoch": 0.94, + "grad_norm": 25.717031275301753, + "learning_rate": 8.096385834243325e-08, + "loss": 0.4832, + "step": 11627 + }, + { + "epoch": 0.94, + "grad_norm": 2.6888519422814947, + "learning_rate": 8.072827777817316e-08, + "loss": 0.6875, + "step": 11628 + }, + { + "epoch": 0.94, + "grad_norm": 3.711849887015135, + "learning_rate": 8.049303765849059e-08, + "loss": 0.5983, + "step": 11629 + }, + { + "epoch": 0.94, + "grad_norm": 4.15815723616945, + "learning_rate": 8.025813799966586e-08, + "loss": 0.7489, + "step": 11630 + }, + { + "epoch": 0.94, + "grad_norm": 2.924321452685983, + "learning_rate": 8.002357881795486e-08, + "loss": 0.6797, + "step": 11631 + }, + { + "epoch": 0.94, + "grad_norm": 2.9912399391153275, + "learning_rate": 7.978936012959126e-08, + "loss": 0.6462, + "step": 11632 + }, + { + "epoch": 0.94, + "grad_norm": 2.1665609563727517, + "learning_rate": 7.955548195078433e-08, + "loss": 0.5375, + "step": 11633 + }, + { + "epoch": 0.94, + "grad_norm": 6.145731813981746, + "learning_rate": 7.932194429771945e-08, + "loss": 0.6375, + "step": 11634 + }, + { + "epoch": 0.94, + "grad_norm": 8.795368409432303, + "learning_rate": 7.908874718655923e-08, + "loss": 0.6494, + "step": 11635 + }, + { + "epoch": 0.95, + "grad_norm": 3.2244219716125446, + "learning_rate": 7.88558906334419e-08, + "loss": 0.5431, + "step": 11636 + }, + { + "epoch": 0.95, + "grad_norm": 2.9539547923962974, + "learning_rate": 7.862337465448344e-08, + "loss": 0.629, + "step": 11637 + }, + { + "epoch": 0.95, + "grad_norm": 3.97953857435795, + "learning_rate": 7.839119926577488e-08, + "loss": 0.6806, + "step": 11638 + }, + { + "epoch": 0.95, + "grad_norm": 5.762450643057052, + "learning_rate": 7.815936448338446e-08, + "loss": 0.6174, + "step": 11639 + }, + { + "epoch": 0.95, + "grad_norm": 2.7535604536714646, + "learning_rate": 7.792787032335657e-08, + "loss": 0.4925, + "step": 11640 + }, + { + "epoch": 0.95, + "grad_norm": 5.890220407167974, + "learning_rate": 7.769671680171232e-08, + "loss": 0.6196, + "step": 11641 + }, + { + "epoch": 0.95, + "grad_norm": 4.809098251359767, + "learning_rate": 7.74659039344483e-08, + "loss": 0.6616, + "step": 11642 + }, + { + "epoch": 0.95, + "grad_norm": 4.490832267627192, + "learning_rate": 7.723543173753789e-08, + "loss": 0.6091, + "step": 11643 + }, + { + "epoch": 0.95, + "grad_norm": 9.444773869167516, + "learning_rate": 7.700530022693275e-08, + "loss": 0.68, + "step": 11644 + }, + { + "epoch": 0.95, + "grad_norm": 2.343394088195895, + "learning_rate": 7.677550941855793e-08, + "loss": 0.6036, + "step": 11645 + }, + { + "epoch": 0.95, + "grad_norm": 4.606753780027665, + "learning_rate": 7.654605932831793e-08, + "loss": 0.6125, + "step": 11646 + }, + { + "epoch": 0.95, + "grad_norm": 6.98710454191637, + "learning_rate": 7.631694997209061e-08, + "loss": 0.6896, + "step": 11647 + }, + { + "epoch": 0.95, + "grad_norm": 5.865091923210574, + "learning_rate": 7.60881813657327e-08, + "loss": 0.7415, + "step": 11648 + }, + { + "epoch": 0.95, + "grad_norm": 6.0396325940630975, + "learning_rate": 7.585975352507547e-08, + "loss": 0.5673, + "step": 11649 + }, + { + "epoch": 0.95, + "grad_norm": 3.24069278097471, + "learning_rate": 7.5631666465929e-08, + "loss": 0.746, + "step": 11650 + }, + { + "epoch": 0.95, + "grad_norm": 6.487372615802543, + "learning_rate": 7.540392020407739e-08, + "loss": 0.7738, + "step": 11651 + }, + { + "epoch": 0.95, + "grad_norm": 3.109664530159642, + "learning_rate": 7.517651475528187e-08, + "loss": 0.538, + "step": 11652 + }, + { + "epoch": 0.95, + "grad_norm": 9.15964529060189, + "learning_rate": 7.49494501352821e-08, + "loss": 0.7096, + "step": 11653 + }, + { + "epoch": 0.95, + "grad_norm": 4.857504601036801, + "learning_rate": 7.472272635978995e-08, + "loss": 0.6357, + "step": 11654 + }, + { + "epoch": 0.95, + "grad_norm": 5.36471050006214, + "learning_rate": 7.44963434444973e-08, + "loss": 0.5389, + "step": 11655 + }, + { + "epoch": 0.95, + "grad_norm": 3.33508908282511, + "learning_rate": 7.427030140507108e-08, + "loss": 0.6442, + "step": 11656 + }, + { + "epoch": 0.95, + "grad_norm": 4.391788444758637, + "learning_rate": 7.404460025715543e-08, + "loss": 0.7423, + "step": 11657 + }, + { + "epoch": 0.95, + "grad_norm": 6.528137374660082, + "learning_rate": 7.381924001636953e-08, + "loss": 0.7136, + "step": 11658 + }, + { + "epoch": 0.95, + "grad_norm": 4.554457221550705, + "learning_rate": 7.359422069831035e-08, + "loss": 0.7168, + "step": 11659 + }, + { + "epoch": 0.95, + "grad_norm": 2.4903430032411733, + "learning_rate": 7.336954231855042e-08, + "loss": 0.4979, + "step": 11660 + }, + { + "epoch": 0.95, + "grad_norm": 4.360894000283392, + "learning_rate": 7.314520489263787e-08, + "loss": 0.6154, + "step": 11661 + }, + { + "epoch": 0.95, + "grad_norm": 4.090449773781666, + "learning_rate": 7.29212084361003e-08, + "loss": 0.7313, + "step": 11662 + }, + { + "epoch": 0.95, + "grad_norm": 2.935725075978512, + "learning_rate": 7.269755296443748e-08, + "loss": 0.6681, + "step": 11663 + }, + { + "epoch": 0.95, + "grad_norm": 4.669406930782915, + "learning_rate": 7.247423849312984e-08, + "loss": 0.5562, + "step": 11664 + }, + { + "epoch": 0.95, + "grad_norm": 3.336626173636811, + "learning_rate": 7.225126503763057e-08, + "loss": 0.6231, + "step": 11665 + }, + { + "epoch": 0.95, + "grad_norm": 3.636211872373431, + "learning_rate": 7.202863261337178e-08, + "loss": 0.7553, + "step": 11666 + }, + { + "epoch": 0.95, + "grad_norm": 3.150574842900758, + "learning_rate": 7.180634123576058e-08, + "loss": 0.647, + "step": 11667 + }, + { + "epoch": 0.95, + "grad_norm": 3.8888324209728045, + "learning_rate": 7.158439092018077e-08, + "loss": 0.8087, + "step": 11668 + }, + { + "epoch": 0.95, + "grad_norm": 8.136560649361392, + "learning_rate": 7.13627816819934e-08, + "loss": 0.72, + "step": 11669 + }, + { + "epoch": 0.95, + "grad_norm": 2.9991879772076735, + "learning_rate": 7.114151353653399e-08, + "loss": 0.5567, + "step": 11670 + }, + { + "epoch": 0.95, + "grad_norm": 2.7172871566518326, + "learning_rate": 7.092058649911748e-08, + "loss": 0.5748, + "step": 11671 + }, + { + "epoch": 0.95, + "grad_norm": 2.9132600838390226, + "learning_rate": 7.070000058503169e-08, + "loss": 0.6371, + "step": 11672 + }, + { + "epoch": 0.95, + "grad_norm": 5.962452321979046, + "learning_rate": 7.047975580954436e-08, + "loss": 0.6457, + "step": 11673 + }, + { + "epoch": 0.95, + "grad_norm": 11.20241206756556, + "learning_rate": 7.025985218789555e-08, + "loss": 0.6162, + "step": 11674 + }, + { + "epoch": 0.95, + "grad_norm": 3.346096299542828, + "learning_rate": 7.004028973530586e-08, + "loss": 0.8032, + "step": 11675 + }, + { + "epoch": 0.95, + "grad_norm": 5.851467428148096, + "learning_rate": 6.982106846696979e-08, + "loss": 0.7072, + "step": 11676 + }, + { + "epoch": 0.95, + "grad_norm": 11.020489786730803, + "learning_rate": 6.9602188398058e-08, + "loss": 0.5978, + "step": 11677 + }, + { + "epoch": 0.95, + "grad_norm": 4.001187987405929, + "learning_rate": 6.938364954372001e-08, + "loss": 0.7337, + "step": 11678 + }, + { + "epoch": 0.95, + "grad_norm": 7.460073231275004, + "learning_rate": 6.91654519190782e-08, + "loss": 0.6974, + "step": 11679 + }, + { + "epoch": 0.95, + "grad_norm": 4.553490945055479, + "learning_rate": 6.894759553923547e-08, + "loss": 0.7167, + "step": 11680 + }, + { + "epoch": 0.95, + "grad_norm": 2.8631416112897083, + "learning_rate": 6.873008041926643e-08, + "loss": 0.6052, + "step": 11681 + }, + { + "epoch": 0.95, + "grad_norm": 10.48298385823776, + "learning_rate": 6.851290657422627e-08, + "loss": 0.6784, + "step": 11682 + }, + { + "epoch": 0.95, + "grad_norm": 3.320154206976553, + "learning_rate": 6.829607401914462e-08, + "loss": 0.6991, + "step": 11683 + }, + { + "epoch": 0.95, + "grad_norm": 5.932955269226586, + "learning_rate": 6.807958276902615e-08, + "loss": 0.6533, + "step": 11684 + }, + { + "epoch": 0.95, + "grad_norm": 5.223752866619851, + "learning_rate": 6.786343283885554e-08, + "loss": 0.6863, + "step": 11685 + }, + { + "epoch": 0.95, + "grad_norm": 6.974001810193784, + "learning_rate": 6.764762424359029e-08, + "loss": 0.691, + "step": 11686 + }, + { + "epoch": 0.95, + "grad_norm": 4.3220393808556, + "learning_rate": 6.743215699816564e-08, + "loss": 0.8174, + "step": 11687 + }, + { + "epoch": 0.95, + "grad_norm": 2.855817887731506, + "learning_rate": 6.721703111749412e-08, + "loss": 0.6676, + "step": 11688 + }, + { + "epoch": 0.95, + "grad_norm": 2.7986218182754286, + "learning_rate": 6.700224661646326e-08, + "loss": 0.6008, + "step": 11689 + }, + { + "epoch": 0.95, + "grad_norm": 4.217314689381992, + "learning_rate": 6.678780350993786e-08, + "loss": 0.7453, + "step": 11690 + }, + { + "epoch": 0.95, + "grad_norm": 4.096674792101529, + "learning_rate": 6.657370181275823e-08, + "loss": 0.6831, + "step": 11691 + }, + { + "epoch": 0.95, + "grad_norm": 3.3553204887401304, + "learning_rate": 6.635994153974257e-08, + "loss": 0.7659, + "step": 11692 + }, + { + "epoch": 0.95, + "grad_norm": 3.758865732524541, + "learning_rate": 6.61465227056829e-08, + "loss": 0.5297, + "step": 11693 + }, + { + "epoch": 0.95, + "grad_norm": 4.082034937843226, + "learning_rate": 6.593344532535073e-08, + "loss": 0.6807, + "step": 11694 + }, + { + "epoch": 0.95, + "grad_norm": 3.0757607724979907, + "learning_rate": 6.572070941349095e-08, + "loss": 0.7898, + "step": 11695 + }, + { + "epoch": 0.95, + "grad_norm": 2.7172509878037627, + "learning_rate": 6.550831498482679e-08, + "loss": 0.6053, + "step": 11696 + }, + { + "epoch": 0.95, + "grad_norm": 8.806184060462922, + "learning_rate": 6.529626205405759e-08, + "loss": 0.7493, + "step": 11697 + }, + { + "epoch": 0.95, + "grad_norm": 3.9409835634147736, + "learning_rate": 6.508455063585883e-08, + "loss": 0.725, + "step": 11698 + }, + { + "epoch": 0.95, + "grad_norm": 2.081419349184579, + "learning_rate": 6.487318074488159e-08, + "loss": 0.6261, + "step": 11699 + }, + { + "epoch": 0.95, + "grad_norm": 12.1516430982968, + "learning_rate": 6.466215239575469e-08, + "loss": 0.5925, + "step": 11700 + }, + { + "epoch": 0.95, + "grad_norm": 5.661992939388067, + "learning_rate": 6.445146560308202e-08, + "loss": 0.5893, + "step": 11701 + }, + { + "epoch": 0.95, + "grad_norm": 3.8211793726593872, + "learning_rate": 6.42411203814447e-08, + "loss": 0.5849, + "step": 11702 + }, + { + "epoch": 0.95, + "grad_norm": 4.167181838430979, + "learning_rate": 6.403111674539996e-08, + "loss": 0.6441, + "step": 11703 + }, + { + "epoch": 0.95, + "grad_norm": 5.5875787897976705, + "learning_rate": 6.38214547094812e-08, + "loss": 0.5131, + "step": 11704 + }, + { + "epoch": 0.95, + "grad_norm": 2.673228563884166, + "learning_rate": 6.361213428819901e-08, + "loss": 0.6417, + "step": 11705 + }, + { + "epoch": 0.95, + "grad_norm": 3.0925866395972355, + "learning_rate": 6.340315549603903e-08, + "loss": 0.7816, + "step": 11706 + }, + { + "epoch": 0.95, + "grad_norm": 3.326546306893215, + "learning_rate": 6.319451834746415e-08, + "loss": 0.7483, + "step": 11707 + }, + { + "epoch": 0.95, + "grad_norm": 5.7795352368544854, + "learning_rate": 6.298622285691337e-08, + "loss": 0.632, + "step": 11708 + }, + { + "epoch": 0.95, + "grad_norm": 3.5534990874002412, + "learning_rate": 6.277826903880125e-08, + "loss": 0.6951, + "step": 11709 + }, + { + "epoch": 0.95, + "grad_norm": 4.161709880855212, + "learning_rate": 6.257065690752129e-08, + "loss": 0.6208, + "step": 11710 + }, + { + "epoch": 0.95, + "grad_norm": 8.90725377683745, + "learning_rate": 6.236338647743922e-08, + "loss": 0.6842, + "step": 11711 + }, + { + "epoch": 0.95, + "grad_norm": 4.4160285104456465, + "learning_rate": 6.215645776290191e-08, + "loss": 0.678, + "step": 11712 + }, + { + "epoch": 0.95, + "grad_norm": 3.311361410569505, + "learning_rate": 6.194987077822845e-08, + "loss": 0.6691, + "step": 11713 + }, + { + "epoch": 0.95, + "grad_norm": 3.091133285935937, + "learning_rate": 6.174362553771685e-08, + "loss": 0.6524, + "step": 11714 + }, + { + "epoch": 0.95, + "grad_norm": 3.6902023663328367, + "learning_rate": 6.153772205563957e-08, + "loss": 0.5426, + "step": 11715 + }, + { + "epoch": 0.95, + "grad_norm": 2.8404791017445543, + "learning_rate": 6.133216034624745e-08, + "loss": 0.7112, + "step": 11716 + }, + { + "epoch": 0.95, + "grad_norm": 5.033195540933517, + "learning_rate": 6.112694042376632e-08, + "loss": 0.5975, + "step": 11717 + }, + { + "epoch": 0.95, + "grad_norm": 4.558822748664367, + "learning_rate": 6.092206230239817e-08, + "loss": 0.6435, + "step": 11718 + }, + { + "epoch": 0.95, + "grad_norm": 3.6303853319449413, + "learning_rate": 6.071752599632274e-08, + "loss": 0.5443, + "step": 11719 + }, + { + "epoch": 0.95, + "grad_norm": 6.148070415844036, + "learning_rate": 6.051333151969484e-08, + "loss": 0.6088, + "step": 11720 + }, + { + "epoch": 0.95, + "grad_norm": 9.901224407908957, + "learning_rate": 6.030947888664595e-08, + "loss": 0.7554, + "step": 11721 + }, + { + "epoch": 0.95, + "grad_norm": 8.911564597551182, + "learning_rate": 6.010596811128366e-08, + "loss": 0.6903, + "step": 11722 + }, + { + "epoch": 0.95, + "grad_norm": 10.908731564824842, + "learning_rate": 5.990279920769227e-08, + "loss": 0.6494, + "step": 11723 + }, + { + "epoch": 0.95, + "grad_norm": 3.30618045758593, + "learning_rate": 5.969997218993328e-08, + "loss": 0.878, + "step": 11724 + }, + { + "epoch": 0.95, + "grad_norm": 3.0259451785408618, + "learning_rate": 5.9497487072042726e-08, + "loss": 0.6883, + "step": 11725 + }, + { + "epoch": 0.95, + "grad_norm": 4.340156293807322, + "learning_rate": 5.929534386803437e-08, + "loss": 0.5766, + "step": 11726 + }, + { + "epoch": 0.95, + "grad_norm": 3.5549552286110973, + "learning_rate": 5.909354259189648e-08, + "loss": 0.6149, + "step": 11727 + }, + { + "epoch": 0.95, + "grad_norm": 3.9749930530491513, + "learning_rate": 5.889208325759677e-08, + "loss": 0.6626, + "step": 11728 + }, + { + "epoch": 0.95, + "grad_norm": 7.390236351493411, + "learning_rate": 5.86909658790763e-08, + "loss": 0.5118, + "step": 11729 + }, + { + "epoch": 0.95, + "grad_norm": 5.125566525101464, + "learning_rate": 5.8490190470254505e-08, + "loss": 0.6327, + "step": 11730 + }, + { + "epoch": 0.95, + "grad_norm": 4.259811792140969, + "learning_rate": 5.8289757045025816e-08, + "loss": 0.7539, + "step": 11731 + }, + { + "epoch": 0.95, + "grad_norm": 7.5175206300375805, + "learning_rate": 5.8089665617260816e-08, + "loss": 0.5818, + "step": 11732 + }, + { + "epoch": 0.95, + "grad_norm": 3.7699829581998667, + "learning_rate": 5.7889916200808414e-08, + "loss": 0.6107, + "step": 11733 + }, + { + "epoch": 0.95, + "grad_norm": 4.9176823580550355, + "learning_rate": 5.769050880949201e-08, + "loss": 0.6737, + "step": 11734 + }, + { + "epoch": 0.95, + "grad_norm": 5.322091305529035, + "learning_rate": 5.7491443457111105e-08, + "loss": 0.8031, + "step": 11735 + }, + { + "epoch": 0.95, + "grad_norm": 4.814411466178407, + "learning_rate": 5.729272015744303e-08, + "loss": 0.6445, + "step": 11736 + }, + { + "epoch": 0.95, + "grad_norm": 5.632955372670994, + "learning_rate": 5.709433892424121e-08, + "loss": 0.5588, + "step": 11737 + }, + { + "epoch": 0.95, + "grad_norm": 3.831056469865346, + "learning_rate": 5.689629977123412e-08, + "loss": 0.7361, + "step": 11738 + }, + { + "epoch": 0.95, + "grad_norm": 3.52884708347657, + "learning_rate": 5.6698602712126906e-08, + "loss": 0.6379, + "step": 11739 + }, + { + "epoch": 0.95, + "grad_norm": 2.8764701210074586, + "learning_rate": 5.6501247760602506e-08, + "loss": 0.7537, + "step": 11740 + }, + { + "epoch": 0.95, + "grad_norm": 3.497824934773602, + "learning_rate": 5.6304234930318336e-08, + "loss": 0.6144, + "step": 11741 + }, + { + "epoch": 0.95, + "grad_norm": 3.979743264884185, + "learning_rate": 5.610756423490904e-08, + "loss": 0.7633, + "step": 11742 + }, + { + "epoch": 0.95, + "grad_norm": 2.7418776773373006, + "learning_rate": 5.591123568798596e-08, + "loss": 0.5232, + "step": 11743 + }, + { + "epoch": 0.95, + "grad_norm": 7.234840138021113, + "learning_rate": 5.571524930313543e-08, + "loss": 0.6694, + "step": 11744 + }, + { + "epoch": 0.95, + "grad_norm": 3.302785926799452, + "learning_rate": 5.551960509392218e-08, + "loss": 0.7003, + "step": 11745 + }, + { + "epoch": 0.95, + "grad_norm": 3.8394740752944747, + "learning_rate": 5.532430307388481e-08, + "loss": 0.5596, + "step": 11746 + }, + { + "epoch": 0.95, + "grad_norm": 3.87822818498878, + "learning_rate": 5.5129343256539734e-08, + "loss": 0.7971, + "step": 11747 + }, + { + "epoch": 0.95, + "grad_norm": 2.2671029104677416, + "learning_rate": 5.493472565538005e-08, + "loss": 0.6643, + "step": 11748 + }, + { + "epoch": 0.95, + "grad_norm": 3.981859394682588, + "learning_rate": 5.474045028387387e-08, + "loss": 0.6538, + "step": 11749 + }, + { + "epoch": 0.95, + "grad_norm": 5.011321146903895, + "learning_rate": 5.4546517155465996e-08, + "loss": 0.5559, + "step": 11750 + }, + { + "epoch": 0.95, + "grad_norm": 6.722255364012913, + "learning_rate": 5.435292628357902e-08, + "loss": 0.5028, + "step": 11751 + }, + { + "epoch": 0.95, + "grad_norm": 3.9585529911247788, + "learning_rate": 5.415967768160946e-08, + "loss": 0.5237, + "step": 11752 + }, + { + "epoch": 0.95, + "grad_norm": 4.3637407933531325, + "learning_rate": 5.396677136293216e-08, + "loss": 0.6942, + "step": 11753 + }, + { + "epoch": 0.95, + "grad_norm": 10.38383821701632, + "learning_rate": 5.377420734089644e-08, + "loss": 0.6323, + "step": 11754 + }, + { + "epoch": 0.95, + "grad_norm": 16.965801943513956, + "learning_rate": 5.3581985628830545e-08, + "loss": 0.5266, + "step": 11755 + }, + { + "epoch": 0.95, + "grad_norm": 4.786846301516611, + "learning_rate": 5.3390106240036046e-08, + "loss": 0.6443, + "step": 11756 + }, + { + "epoch": 0.95, + "grad_norm": 2.956081327722214, + "learning_rate": 5.319856918779232e-08, + "loss": 0.5561, + "step": 11757 + }, + { + "epoch": 0.95, + "grad_norm": 4.499820576197688, + "learning_rate": 5.3007374485355424e-08, + "loss": 0.6006, + "step": 11758 + }, + { + "epoch": 0.96, + "grad_norm": 5.162631407623525, + "learning_rate": 5.281652214595701e-08, + "loss": 0.5894, + "step": 11759 + }, + { + "epoch": 0.96, + "grad_norm": 3.112365071169805, + "learning_rate": 5.262601218280539e-08, + "loss": 0.5843, + "step": 11760 + }, + { + "epoch": 0.96, + "grad_norm": 2.8853421401107124, + "learning_rate": 5.243584460908446e-08, + "loss": 0.5181, + "step": 11761 + }, + { + "epoch": 0.96, + "grad_norm": 4.11792934507338, + "learning_rate": 5.2246019437956486e-08, + "loss": 0.6572, + "step": 11762 + }, + { + "epoch": 0.96, + "grad_norm": 5.92477116497485, + "learning_rate": 5.2056536682557054e-08, + "loss": 0.6777, + "step": 11763 + }, + { + "epoch": 0.96, + "grad_norm": 3.9361211085662506, + "learning_rate": 5.186739635600013e-08, + "loss": 0.7874, + "step": 11764 + }, + { + "epoch": 0.96, + "grad_norm": 2.9373416501277725, + "learning_rate": 5.167859847137524e-08, + "loss": 0.727, + "step": 11765 + }, + { + "epoch": 0.96, + "grad_norm": 3.4091266528517106, + "learning_rate": 5.149014304174915e-08, + "loss": 0.7605, + "step": 11766 + }, + { + "epoch": 0.96, + "grad_norm": 7.291767057113755, + "learning_rate": 5.13020300801631e-08, + "loss": 0.6447, + "step": 11767 + }, + { + "epoch": 0.96, + "grad_norm": 5.736239395341825, + "learning_rate": 5.111425959963612e-08, + "loss": 0.6843, + "step": 11768 + }, + { + "epoch": 0.96, + "grad_norm": 3.224492789528648, + "learning_rate": 5.092683161316281e-08, + "loss": 0.6628, + "step": 11769 + }, + { + "epoch": 0.96, + "grad_norm": 4.292948962405509, + "learning_rate": 5.0739746133715574e-08, + "loss": 0.5603, + "step": 11770 + }, + { + "epoch": 0.96, + "grad_norm": 2.9891738149979004, + "learning_rate": 5.055300317424017e-08, + "loss": 0.7368, + "step": 11771 + }, + { + "epoch": 0.96, + "grad_norm": 18.401821849542376, + "learning_rate": 5.036660274766181e-08, + "loss": 0.594, + "step": 11772 + }, + { + "epoch": 0.96, + "grad_norm": 5.535712182828714, + "learning_rate": 5.018054486687962e-08, + "loss": 0.6591, + "step": 11773 + }, + { + "epoch": 0.96, + "grad_norm": 4.350381755532951, + "learning_rate": 4.999482954477053e-08, + "loss": 0.7204, + "step": 11774 + }, + { + "epoch": 0.96, + "grad_norm": 3.3175349348044554, + "learning_rate": 4.980945679418703e-08, + "loss": 0.6631, + "step": 11775 + }, + { + "epoch": 0.96, + "grad_norm": 8.368479457097477, + "learning_rate": 4.96244266279583e-08, + "loss": 0.5932, + "step": 11776 + }, + { + "epoch": 0.96, + "grad_norm": 4.804503287296305, + "learning_rate": 4.94397390588891e-08, + "loss": 0.6023, + "step": 11777 + }, + { + "epoch": 0.96, + "grad_norm": 6.8984197259125315, + "learning_rate": 4.9255394099761436e-08, + "loss": 0.6383, + "step": 11778 + }, + { + "epoch": 0.96, + "grad_norm": 11.166330058521746, + "learning_rate": 4.907139176333286e-08, + "loss": 0.5621, + "step": 11779 + }, + { + "epoch": 0.96, + "grad_norm": 3.817361965547026, + "learning_rate": 4.8887732062337656e-08, + "loss": 0.6938, + "step": 11780 + }, + { + "epoch": 0.96, + "grad_norm": 2.3044762273354675, + "learning_rate": 4.8704415009486194e-08, + "loss": 0.6966, + "step": 11781 + }, + { + "epoch": 0.96, + "grad_norm": 3.0063241892957127, + "learning_rate": 4.8521440617465e-08, + "loss": 0.6059, + "step": 11782 + }, + { + "epoch": 0.96, + "grad_norm": 3.576583988628541, + "learning_rate": 4.833880889893727e-08, + "loss": 0.6574, + "step": 11783 + }, + { + "epoch": 0.96, + "grad_norm": 4.647600597087502, + "learning_rate": 4.815651986654235e-08, + "loss": 0.629, + "step": 11784 + }, + { + "epoch": 0.96, + "grad_norm": 4.880565343466998, + "learning_rate": 4.7974573532895695e-08, + "loss": 0.7421, + "step": 11785 + }, + { + "epoch": 0.96, + "grad_norm": 5.369678854894292, + "learning_rate": 4.77929699105889e-08, + "loss": 0.7461, + "step": 11786 + }, + { + "epoch": 0.96, + "grad_norm": 6.825459718938507, + "learning_rate": 4.761170901219025e-08, + "loss": 0.7085, + "step": 11787 + }, + { + "epoch": 0.96, + "grad_norm": 4.817402987415686, + "learning_rate": 4.743079085024416e-08, + "loss": 0.6638, + "step": 11788 + }, + { + "epoch": 0.96, + "grad_norm": 3.1856757977129075, + "learning_rate": 4.725021543727115e-08, + "loss": 0.7412, + "step": 11789 + }, + { + "epoch": 0.96, + "grad_norm": 4.000049906005819, + "learning_rate": 4.706998278576846e-08, + "loss": 0.6355, + "step": 11790 + }, + { + "epoch": 0.96, + "grad_norm": 5.619029392557585, + "learning_rate": 4.68900929082089e-08, + "loss": 0.725, + "step": 11791 + }, + { + "epoch": 0.96, + "grad_norm": 7.161004327045112, + "learning_rate": 4.671054581704304e-08, + "loss": 0.6832, + "step": 11792 + }, + { + "epoch": 0.96, + "grad_norm": 3.770603912796601, + "learning_rate": 4.653134152469541e-08, + "loss": 0.7151, + "step": 11793 + }, + { + "epoch": 0.96, + "grad_norm": 3.1447001719861545, + "learning_rate": 4.635248004356885e-08, + "loss": 0.7283, + "step": 11794 + }, + { + "epoch": 0.96, + "grad_norm": 3.6507668736272922, + "learning_rate": 4.6173961386041246e-08, + "loss": 0.6539, + "step": 11795 + }, + { + "epoch": 0.96, + "grad_norm": 13.851385403214774, + "learning_rate": 4.5995785564467155e-08, + "loss": 0.6265, + "step": 11796 + }, + { + "epoch": 0.96, + "grad_norm": 3.9219720999886225, + "learning_rate": 4.581795259117783e-08, + "loss": 0.5695, + "step": 11797 + }, + { + "epoch": 0.96, + "grad_norm": 8.214854891611218, + "learning_rate": 4.564046247848008e-08, + "loss": 0.7844, + "step": 11798 + }, + { + "epoch": 0.96, + "grad_norm": 4.687880779138726, + "learning_rate": 4.546331523865799e-08, + "loss": 0.7868, + "step": 11799 + }, + { + "epoch": 0.96, + "grad_norm": 2.947492742888208, + "learning_rate": 4.528651088397063e-08, + "loss": 0.6984, + "step": 11800 + }, + { + "epoch": 0.96, + "grad_norm": 4.064461748502639, + "learning_rate": 4.5110049426653755e-08, + "loss": 0.7241, + "step": 11801 + }, + { + "epoch": 0.96, + "grad_norm": 3.5106466271044736, + "learning_rate": 4.49339308789204e-08, + "loss": 0.7736, + "step": 11802 + }, + { + "epoch": 0.96, + "grad_norm": 11.366460903777558, + "learning_rate": 4.475815525295857e-08, + "loss": 0.7007, + "step": 11803 + }, + { + "epoch": 0.96, + "grad_norm": 4.117332181173183, + "learning_rate": 4.458272256093355e-08, + "loss": 0.7438, + "step": 11804 + }, + { + "epoch": 0.96, + "grad_norm": 41.72140384334036, + "learning_rate": 4.440763281498561e-08, + "loss": 0.5759, + "step": 11805 + }, + { + "epoch": 0.96, + "grad_norm": 4.3315864226899, + "learning_rate": 4.423288602723286e-08, + "loss": 0.6719, + "step": 11806 + }, + { + "epoch": 0.96, + "grad_norm": 2.9160590023268895, + "learning_rate": 4.405848220976838e-08, + "loss": 0.7821, + "step": 11807 + }, + { + "epoch": 0.96, + "grad_norm": 4.547706385640694, + "learning_rate": 4.388442137466198e-08, + "loss": 0.7137, + "step": 11808 + }, + { + "epoch": 0.96, + "grad_norm": 4.007870718501078, + "learning_rate": 4.3710703533959566e-08, + "loss": 0.7098, + "step": 11809 + }, + { + "epoch": 0.96, + "grad_norm": 4.213252084335085, + "learning_rate": 4.35373286996843e-08, + "loss": 0.6617, + "step": 11810 + }, + { + "epoch": 0.96, + "grad_norm": 3.3318056721847014, + "learning_rate": 4.3364296883834364e-08, + "loss": 0.823, + "step": 11811 + }, + { + "epoch": 0.96, + "grad_norm": 3.2881173728429283, + "learning_rate": 4.319160809838463e-08, + "loss": 0.5753, + "step": 11812 + }, + { + "epoch": 0.96, + "grad_norm": 3.0660704359374944, + "learning_rate": 4.301926235528664e-08, + "loss": 0.5376, + "step": 11813 + }, + { + "epoch": 0.96, + "grad_norm": 4.68932158352393, + "learning_rate": 4.2847259666466414e-08, + "loss": 0.5945, + "step": 11814 + }, + { + "epoch": 0.96, + "grad_norm": 2.0674276419708786, + "learning_rate": 4.2675600043829425e-08, + "loss": 0.6133, + "step": 11815 + }, + { + "epoch": 0.96, + "grad_norm": 6.610089690095713, + "learning_rate": 4.250428349925451e-08, + "loss": 0.7862, + "step": 11816 + }, + { + "epoch": 0.96, + "grad_norm": 3.7059969871160576, + "learning_rate": 4.233331004459829e-08, + "loss": 0.6245, + "step": 11817 + }, + { + "epoch": 0.96, + "grad_norm": 3.323119209650115, + "learning_rate": 4.2162679691692966e-08, + "loss": 0.8316, + "step": 11818 + }, + { + "epoch": 0.96, + "grad_norm": 2.92159898112092, + "learning_rate": 4.199239245234743e-08, + "loss": 0.791, + "step": 11819 + }, + { + "epoch": 0.96, + "grad_norm": 2.5131701947476306, + "learning_rate": 4.18224483383467e-08, + "loss": 0.7938, + "step": 11820 + }, + { + "epoch": 0.96, + "grad_norm": 4.166327737543794, + "learning_rate": 4.165284736145136e-08, + "loss": 0.7224, + "step": 11821 + }, + { + "epoch": 0.96, + "grad_norm": 4.793683203033816, + "learning_rate": 4.148358953339926e-08, + "loss": 0.6067, + "step": 11822 + }, + { + "epoch": 0.96, + "grad_norm": 2.700167702150854, + "learning_rate": 4.131467486590435e-08, + "loss": 0.5525, + "step": 11823 + }, + { + "epoch": 0.96, + "grad_norm": 5.547321442230637, + "learning_rate": 4.114610337065672e-08, + "loss": 0.676, + "step": 11824 + }, + { + "epoch": 0.96, + "grad_norm": 5.0759249232115, + "learning_rate": 4.0977875059322046e-08, + "loss": 0.7486, + "step": 11825 + }, + { + "epoch": 0.96, + "grad_norm": 2.9483857624522316, + "learning_rate": 4.080998994354324e-08, + "loss": 0.6341, + "step": 11826 + }, + { + "epoch": 0.96, + "grad_norm": 8.960801444921684, + "learning_rate": 4.064244803493822e-08, + "loss": 0.5047, + "step": 11827 + }, + { + "epoch": 0.96, + "grad_norm": 4.512891620984063, + "learning_rate": 4.0475249345102716e-08, + "loss": 0.7384, + "step": 11828 + }, + { + "epoch": 0.96, + "grad_norm": 3.607577900643964, + "learning_rate": 4.0308393885608034e-08, + "loss": 0.7112, + "step": 11829 + }, + { + "epoch": 0.96, + "grad_norm": 3.409797416262915, + "learning_rate": 4.0141881668000485e-08, + "loss": 0.6734, + "step": 11830 + }, + { + "epoch": 0.96, + "grad_norm": 4.978103726536734, + "learning_rate": 3.997571270380529e-08, + "loss": 0.7975, + "step": 11831 + }, + { + "epoch": 0.96, + "grad_norm": 3.6102855850208857, + "learning_rate": 3.98098870045216e-08, + "loss": 0.7464, + "step": 11832 + }, + { + "epoch": 0.96, + "grad_norm": 3.824873472225501, + "learning_rate": 3.964440458162577e-08, + "loss": 0.6344, + "step": 11833 + }, + { + "epoch": 0.96, + "grad_norm": 4.98441774699736, + "learning_rate": 3.947926544656977e-08, + "loss": 0.7687, + "step": 11834 + }, + { + "epoch": 0.96, + "grad_norm": 3.2292704784398385, + "learning_rate": 3.931446961078278e-08, + "loss": 0.5859, + "step": 11835 + }, + { + "epoch": 0.96, + "grad_norm": 3.4870169994153906, + "learning_rate": 3.9150017085669566e-08, + "loss": 0.4805, + "step": 11836 + }, + { + "epoch": 0.96, + "grad_norm": 5.062580293442506, + "learning_rate": 3.898590788261103e-08, + "loss": 0.7161, + "step": 11837 + }, + { + "epoch": 0.96, + "grad_norm": 6.100431525644177, + "learning_rate": 3.8822142012964747e-08, + "loss": 0.7341, + "step": 11838 + }, + { + "epoch": 0.96, + "grad_norm": 3.3314324069473447, + "learning_rate": 3.8658719488064985e-08, + "loss": 0.6781, + "step": 11839 + }, + { + "epoch": 0.96, + "grad_norm": 2.450516359856029, + "learning_rate": 3.8495640319221036e-08, + "loss": 0.6023, + "step": 11840 + }, + { + "epoch": 0.96, + "grad_norm": 2.976975946829453, + "learning_rate": 3.8332904517718315e-08, + "loss": 0.6164, + "step": 11841 + }, + { + "epoch": 0.96, + "grad_norm": 8.095341290807635, + "learning_rate": 3.817051209482003e-08, + "loss": 0.71, + "step": 11842 + }, + { + "epoch": 0.96, + "grad_norm": 9.611964994972334, + "learning_rate": 3.800846306176498e-08, + "loss": 0.7238, + "step": 11843 + }, + { + "epoch": 0.96, + "grad_norm": 3.2114721599806795, + "learning_rate": 3.7846757429766955e-08, + "loss": 0.6205, + "step": 11844 + }, + { + "epoch": 0.96, + "grad_norm": 4.361437741075271, + "learning_rate": 3.7685395210018127e-08, + "loss": 0.82, + "step": 11845 + }, + { + "epoch": 0.96, + "grad_norm": 3.5935391999504485, + "learning_rate": 3.7524376413685114e-08, + "loss": 0.5529, + "step": 11846 + }, + { + "epoch": 0.96, + "grad_norm": 10.300596126210026, + "learning_rate": 3.736370105191178e-08, + "loss": 0.6262, + "step": 11847 + }, + { + "epoch": 0.96, + "grad_norm": 5.053305308054646, + "learning_rate": 3.7203369135817016e-08, + "loss": 0.7948, + "step": 11848 + }, + { + "epoch": 0.96, + "grad_norm": 2.696827430525888, + "learning_rate": 3.704338067649804e-08, + "loss": 0.5176, + "step": 11849 + }, + { + "epoch": 0.96, + "grad_norm": 5.351856593592406, + "learning_rate": 3.688373568502601e-08, + "loss": 0.7051, + "step": 11850 + }, + { + "epoch": 0.96, + "grad_norm": 3.5107290070064825, + "learning_rate": 3.672443417245042e-08, + "loss": 0.6872, + "step": 11851 + }, + { + "epoch": 0.96, + "grad_norm": 4.572847814659825, + "learning_rate": 3.656547614979522e-08, + "loss": 0.548, + "step": 11852 + }, + { + "epoch": 0.96, + "grad_norm": 3.6960078321425676, + "learning_rate": 3.640686162806106e-08, + "loss": 0.644, + "step": 11853 + }, + { + "epoch": 0.96, + "grad_norm": 4.94524423191858, + "learning_rate": 3.6248590618225834e-08, + "loss": 0.5384, + "step": 11854 + }, + { + "epoch": 0.96, + "grad_norm": 5.15555236867171, + "learning_rate": 3.609066313124243e-08, + "loss": 0.5936, + "step": 11855 + }, + { + "epoch": 0.96, + "grad_norm": 6.012299533848973, + "learning_rate": 3.593307917804045e-08, + "loss": 0.6281, + "step": 11856 + }, + { + "epoch": 0.96, + "grad_norm": 4.093419828481794, + "learning_rate": 3.577583876952562e-08, + "loss": 0.6121, + "step": 11857 + }, + { + "epoch": 0.96, + "grad_norm": 4.3229737969701425, + "learning_rate": 3.561894191658033e-08, + "loss": 0.6845, + "step": 11858 + }, + { + "epoch": 0.96, + "grad_norm": 3.0526804458579693, + "learning_rate": 3.546238863006202e-08, + "loss": 0.5692, + "step": 11859 + }, + { + "epoch": 0.96, + "grad_norm": 3.3962669548380418, + "learning_rate": 3.5306178920806456e-08, + "loss": 0.689, + "step": 11860 + }, + { + "epoch": 0.96, + "grad_norm": 12.461258436497639, + "learning_rate": 3.515031279962333e-08, + "loss": 0.6997, + "step": 11861 + }, + { + "epoch": 0.96, + "grad_norm": 3.528824179039459, + "learning_rate": 3.499479027729957e-08, + "loss": 0.6958, + "step": 11862 + }, + { + "epoch": 0.96, + "grad_norm": 5.124628103678376, + "learning_rate": 3.483961136459879e-08, + "loss": 0.7675, + "step": 11863 + }, + { + "epoch": 0.96, + "grad_norm": 8.159251235931286, + "learning_rate": 3.468477607226017e-08, + "loss": 0.6361, + "step": 11864 + }, + { + "epoch": 0.96, + "grad_norm": 3.281044459149608, + "learning_rate": 3.453028441099959e-08, + "loss": 0.6438, + "step": 11865 + }, + { + "epoch": 0.96, + "grad_norm": 3.220204242004862, + "learning_rate": 3.437613639150794e-08, + "loss": 0.7162, + "step": 11866 + }, + { + "epoch": 0.96, + "grad_norm": 5.2679109447005725, + "learning_rate": 3.422233202445391e-08, + "loss": 0.7529, + "step": 11867 + }, + { + "epoch": 0.96, + "grad_norm": 3.7577588171765166, + "learning_rate": 3.406887132048176e-08, + "loss": 0.6714, + "step": 11868 + }, + { + "epoch": 0.96, + "grad_norm": 3.251742605212685, + "learning_rate": 3.3915754290211876e-08, + "loss": 0.4785, + "step": 11869 + }, + { + "epoch": 0.96, + "grad_norm": 10.602164482178486, + "learning_rate": 3.37629809442408e-08, + "loss": 0.6755, + "step": 11870 + }, + { + "epoch": 0.96, + "grad_norm": 4.620589946245881, + "learning_rate": 3.361055129314117e-08, + "loss": 0.5591, + "step": 11871 + }, + { + "epoch": 0.96, + "grad_norm": 7.256418404793536, + "learning_rate": 3.345846534746289e-08, + "loss": 0.7291, + "step": 11872 + }, + { + "epoch": 0.96, + "grad_norm": 4.373951585327049, + "learning_rate": 3.330672311773031e-08, + "loss": 0.5641, + "step": 11873 + }, + { + "epoch": 0.96, + "grad_norm": 4.472418803154782, + "learning_rate": 3.3155324614445593e-08, + "loss": 0.7203, + "step": 11874 + }, + { + "epoch": 0.96, + "grad_norm": 6.221621712633217, + "learning_rate": 3.3004269848085914e-08, + "loss": 0.6343, + "step": 11875 + }, + { + "epoch": 0.96, + "grad_norm": 5.9469173023246045, + "learning_rate": 3.285355882910568e-08, + "loss": 0.6024, + "step": 11876 + }, + { + "epoch": 0.96, + "grad_norm": 3.236680186386587, + "learning_rate": 3.270319156793544e-08, + "loss": 0.7358, + "step": 11877 + }, + { + "epoch": 0.96, + "grad_norm": 3.2664858863859503, + "learning_rate": 3.255316807498077e-08, + "loss": 0.5884, + "step": 11878 + }, + { + "epoch": 0.96, + "grad_norm": 3.7959539348916485, + "learning_rate": 3.2403488360624455e-08, + "loss": 0.5568, + "step": 11879 + }, + { + "epoch": 0.96, + "grad_norm": 4.888618167181351, + "learning_rate": 3.225415243522489e-08, + "loss": 0.5406, + "step": 11880 + }, + { + "epoch": 0.96, + "grad_norm": 5.133780143768868, + "learning_rate": 3.21051603091177e-08, + "loss": 0.6207, + "step": 11881 + }, + { + "epoch": 0.97, + "grad_norm": 3.0867649333185887, + "learning_rate": 3.195651199261407e-08, + "loss": 0.6324, + "step": 11882 + }, + { + "epoch": 0.97, + "grad_norm": 3.429140874548844, + "learning_rate": 3.180820749600133e-08, + "loss": 0.6677, + "step": 11883 + }, + { + "epoch": 0.97, + "grad_norm": 2.4855478431729603, + "learning_rate": 3.1660246829542385e-08, + "loss": 0.8144, + "step": 11884 + }, + { + "epoch": 0.97, + "grad_norm": 11.997469807122755, + "learning_rate": 3.151263000347793e-08, + "loss": 0.8046, + "step": 11885 + }, + { + "epoch": 0.97, + "grad_norm": 2.9222589268612085, + "learning_rate": 3.136535702802423e-08, + "loss": 0.6575, + "step": 11886 + }, + { + "epoch": 0.97, + "grad_norm": 4.27314531850364, + "learning_rate": 3.121842791337204e-08, + "loss": 0.6022, + "step": 11887 + }, + { + "epoch": 0.97, + "grad_norm": 3.3107370203826387, + "learning_rate": 3.107184266969099e-08, + "loss": 0.6944, + "step": 11888 + }, + { + "epoch": 0.97, + "grad_norm": 3.032218957667251, + "learning_rate": 3.0925601307125184e-08, + "loss": 0.7337, + "step": 11889 + }, + { + "epoch": 0.97, + "grad_norm": 4.2891704378221736, + "learning_rate": 3.077970383579598e-08, + "loss": 0.7441, + "step": 11890 + }, + { + "epoch": 0.97, + "grad_norm": 3.255225485438617, + "learning_rate": 3.06341502658003e-08, + "loss": 0.5414, + "step": 11891 + }, + { + "epoch": 0.97, + "grad_norm": 4.557984584508957, + "learning_rate": 3.048894060721064e-08, + "loss": 0.6986, + "step": 11892 + }, + { + "epoch": 0.97, + "grad_norm": 3.4357919453777073, + "learning_rate": 3.0344074870077287e-08, + "loss": 0.6044, + "step": 11893 + }, + { + "epoch": 0.97, + "grad_norm": 5.182388493527507, + "learning_rate": 3.0199553064425014e-08, + "loss": 0.7402, + "step": 11894 + }, + { + "epoch": 0.97, + "grad_norm": 5.7900889342726245, + "learning_rate": 3.005537520025637e-08, + "loss": 0.6623, + "step": 11895 + }, + { + "epoch": 0.97, + "grad_norm": 4.7116717154188485, + "learning_rate": 2.9911541287549474e-08, + "loss": 0.6877, + "step": 11896 + }, + { + "epoch": 0.97, + "grad_norm": 5.214135387615386, + "learning_rate": 2.9768051336257487e-08, + "loss": 0.5149, + "step": 11897 + }, + { + "epoch": 0.97, + "grad_norm": 4.362429168296588, + "learning_rate": 2.9624905356311905e-08, + "loss": 0.5293, + "step": 11898 + }, + { + "epoch": 0.97, + "grad_norm": 3.531257564276267, + "learning_rate": 2.948210335761925e-08, + "loss": 0.6374, + "step": 11899 + }, + { + "epoch": 0.97, + "grad_norm": 5.732242561278232, + "learning_rate": 2.9339645350061617e-08, + "loss": 0.5524, + "step": 11900 + }, + { + "epoch": 0.97, + "grad_norm": 6.7269994188701325, + "learning_rate": 2.9197531343498344e-08, + "loss": 0.7478, + "step": 11901 + }, + { + "epoch": 0.97, + "grad_norm": 4.096987047475449, + "learning_rate": 2.9055761347764887e-08, + "loss": 0.6621, + "step": 11902 + }, + { + "epoch": 0.97, + "grad_norm": 3.0051812682952237, + "learning_rate": 2.8914335372672296e-08, + "loss": 0.6073, + "step": 11903 + }, + { + "epoch": 0.97, + "grad_norm": 4.961099537086095, + "learning_rate": 2.8773253428008296e-08, + "loss": 0.616, + "step": 11904 + }, + { + "epoch": 0.97, + "grad_norm": 11.941392703574312, + "learning_rate": 2.863251552353674e-08, + "loss": 0.7127, + "step": 11905 + }, + { + "epoch": 0.97, + "grad_norm": 5.152404616828218, + "learning_rate": 2.8492121668997064e-08, + "loss": 0.6741, + "step": 11906 + }, + { + "epoch": 0.97, + "grad_norm": 3.6530869510621367, + "learning_rate": 2.8352071874105934e-08, + "loss": 0.845, + "step": 11907 + }, + { + "epoch": 0.97, + "grad_norm": 3.3309374590549248, + "learning_rate": 2.8212366148555602e-08, + "loss": 0.6494, + "step": 11908 + }, + { + "epoch": 0.97, + "grad_norm": 3.635360543471996, + "learning_rate": 2.8073004502014445e-08, + "loss": 0.5611, + "step": 11909 + }, + { + "epoch": 0.97, + "grad_norm": 3.7786526075395983, + "learning_rate": 2.7933986944126967e-08, + "loss": 0.6531, + "step": 11910 + }, + { + "epoch": 0.97, + "grad_norm": 3.669604467143136, + "learning_rate": 2.7795313484514362e-08, + "loss": 0.612, + "step": 11911 + }, + { + "epoch": 0.97, + "grad_norm": 4.799484129297882, + "learning_rate": 2.7656984132773955e-08, + "loss": 0.5849, + "step": 11912 + }, + { + "epoch": 0.97, + "grad_norm": 6.053339119941887, + "learning_rate": 2.7518998898478644e-08, + "loss": 0.5381, + "step": 11913 + }, + { + "epoch": 0.97, + "grad_norm": 5.544861288384643, + "learning_rate": 2.7381357791177454e-08, + "loss": 0.9079, + "step": 11914 + }, + { + "epoch": 0.97, + "grad_norm": 7.036877878927683, + "learning_rate": 2.724406082039721e-08, + "loss": 0.685, + "step": 11915 + }, + { + "epoch": 0.97, + "grad_norm": 2.6061658072696514, + "learning_rate": 2.7107107995638648e-08, + "loss": 0.7237, + "step": 11916 + }, + { + "epoch": 0.97, + "grad_norm": 4.913288140064642, + "learning_rate": 2.697049932637974e-08, + "loss": 0.7011, + "step": 11917 + }, + { + "epoch": 0.97, + "grad_norm": 4.077282708178528, + "learning_rate": 2.6834234822076255e-08, + "loss": 0.6142, + "step": 11918 + }, + { + "epoch": 0.97, + "grad_norm": 4.568058483596346, + "learning_rate": 2.6698314492156208e-08, + "loss": 0.6412, + "step": 11919 + }, + { + "epoch": 0.97, + "grad_norm": 4.848466570194925, + "learning_rate": 2.6562738346027627e-08, + "loss": 0.6093, + "step": 11920 + }, + { + "epoch": 0.97, + "grad_norm": 4.619506795616202, + "learning_rate": 2.642750639307301e-08, + "loss": 0.6488, + "step": 11921 + }, + { + "epoch": 0.97, + "grad_norm": 2.810065753423881, + "learning_rate": 2.629261864265098e-08, + "loss": 0.6464, + "step": 11922 + }, + { + "epoch": 0.97, + "grad_norm": 3.5801625892157, + "learning_rate": 2.6158075104096848e-08, + "loss": 0.6557, + "step": 11923 + }, + { + "epoch": 0.97, + "grad_norm": 6.436228517053414, + "learning_rate": 2.6023875786722053e-08, + "loss": 0.548, + "step": 11924 + }, + { + "epoch": 0.97, + "grad_norm": 3.8137573521015966, + "learning_rate": 2.589002069981361e-08, + "loss": 0.592, + "step": 11925 + }, + { + "epoch": 0.97, + "grad_norm": 8.421799636832723, + "learning_rate": 2.5756509852635226e-08, + "loss": 0.674, + "step": 11926 + }, + { + "epoch": 0.97, + "grad_norm": 2.637088447190258, + "learning_rate": 2.562334325442728e-08, + "loss": 0.598, + "step": 11927 + }, + { + "epoch": 0.97, + "grad_norm": 3.044061400170564, + "learning_rate": 2.5490520914404627e-08, + "loss": 0.6312, + "step": 11928 + }, + { + "epoch": 0.97, + "grad_norm": 4.072431683870927, + "learning_rate": 2.5358042841760466e-08, + "loss": 0.6144, + "step": 11929 + }, + { + "epoch": 0.97, + "grad_norm": 3.844565037271914, + "learning_rate": 2.5225909045661913e-08, + "loss": 0.7093, + "step": 11930 + }, + { + "epoch": 0.97, + "grad_norm": 6.413785624713688, + "learning_rate": 2.509411953525498e-08, + "loss": 0.6617, + "step": 11931 + }, + { + "epoch": 0.97, + "grad_norm": 3.985518335231157, + "learning_rate": 2.4962674319659595e-08, + "loss": 0.6247, + "step": 11932 + }, + { + "epoch": 0.97, + "grad_norm": 3.5328705697973914, + "learning_rate": 2.4831573407972377e-08, + "loss": 0.5599, + "step": 11933 + }, + { + "epoch": 0.97, + "grad_norm": 4.375236493316782, + "learning_rate": 2.4700816809266615e-08, + "loss": 0.7279, + "step": 11934 + }, + { + "epoch": 0.97, + "grad_norm": 11.30994058504707, + "learning_rate": 2.4570404532591187e-08, + "loss": 0.7648, + "step": 11935 + }, + { + "epoch": 0.97, + "grad_norm": 5.655322030546553, + "learning_rate": 2.4440336586971648e-08, + "loss": 0.6675, + "step": 11936 + }, + { + "epoch": 0.97, + "grad_norm": 3.2225337307210533, + "learning_rate": 2.4310612981409686e-08, + "loss": 0.7029, + "step": 11937 + }, + { + "epoch": 0.97, + "grad_norm": 3.6279750790971654, + "learning_rate": 2.418123372488257e-08, + "loss": 0.6478, + "step": 11938 + }, + { + "epoch": 0.97, + "grad_norm": 17.593306659512876, + "learning_rate": 2.4052198826344796e-08, + "loss": 0.6248, + "step": 11939 + }, + { + "epoch": 0.97, + "grad_norm": 4.7657603621801305, + "learning_rate": 2.3923508294725893e-08, + "loss": 0.7203, + "step": 11940 + }, + { + "epoch": 0.97, + "grad_norm": 5.025901495485195, + "learning_rate": 2.3795162138932072e-08, + "loss": 0.7016, + "step": 11941 + }, + { + "epoch": 0.97, + "grad_norm": 5.236126639015923, + "learning_rate": 2.3667160367845664e-08, + "loss": 0.7081, + "step": 11942 + }, + { + "epoch": 0.97, + "grad_norm": 5.515405300164409, + "learning_rate": 2.35395029903257e-08, + "loss": 0.6129, + "step": 11943 + }, + { + "epoch": 0.97, + "grad_norm": 4.481965799805051, + "learning_rate": 2.3412190015206226e-08, + "loss": 0.6469, + "step": 11944 + }, + { + "epoch": 0.97, + "grad_norm": 3.6278191276225726, + "learning_rate": 2.328522145129908e-08, + "loss": 0.5895, + "step": 11945 + }, + { + "epoch": 0.97, + "grad_norm": 3.425379394785436, + "learning_rate": 2.3158597307390007e-08, + "loss": 0.5487, + "step": 11946 + }, + { + "epoch": 0.97, + "grad_norm": 4.424205359844178, + "learning_rate": 2.303231759224256e-08, + "loss": 0.6771, + "step": 11947 + }, + { + "epoch": 0.97, + "grad_norm": 3.3230942387174562, + "learning_rate": 2.290638231459641e-08, + "loss": 0.6647, + "step": 11948 + }, + { + "epoch": 0.97, + "grad_norm": 4.059729867198876, + "learning_rate": 2.2780791483167363e-08, + "loss": 0.5848, + "step": 11949 + }, + { + "epoch": 0.97, + "grad_norm": 3.984873707440627, + "learning_rate": 2.2655545106646803e-08, + "loss": 0.6941, + "step": 11950 + }, + { + "epoch": 0.97, + "grad_norm": 4.103170728555762, + "learning_rate": 2.253064319370224e-08, + "loss": 0.7415, + "step": 11951 + }, + { + "epoch": 0.97, + "grad_norm": 2.5447513306076495, + "learning_rate": 2.240608575297787e-08, + "loss": 0.546, + "step": 11952 + }, + { + "epoch": 0.97, + "grad_norm": 9.241307061082546, + "learning_rate": 2.2281872793093462e-08, + "loss": 0.5532, + "step": 11953 + }, + { + "epoch": 0.97, + "grad_norm": 4.150659586550228, + "learning_rate": 2.2158004322646033e-08, + "loss": 0.7274, + "step": 11954 + }, + { + "epoch": 0.97, + "grad_norm": 2.8765592241886964, + "learning_rate": 2.2034480350208166e-08, + "loss": 0.6793, + "step": 11955 + }, + { + "epoch": 0.97, + "grad_norm": 2.606224735162423, + "learning_rate": 2.1911300884328023e-08, + "loss": 0.6543, + "step": 11956 + }, + { + "epoch": 0.97, + "grad_norm": 3.4238020604322577, + "learning_rate": 2.17884659335299e-08, + "loss": 0.7113, + "step": 11957 + }, + { + "epoch": 0.97, + "grad_norm": 8.118626053682032, + "learning_rate": 2.1665975506315885e-08, + "loss": 0.6308, + "step": 11958 + }, + { + "epoch": 0.97, + "grad_norm": 3.3156694969478715, + "learning_rate": 2.1543829611162524e-08, + "loss": 0.6807, + "step": 11959 + }, + { + "epoch": 0.97, + "grad_norm": 3.0513316303596443, + "learning_rate": 2.1422028256523065e-08, + "loss": 0.6433, + "step": 11960 + }, + { + "epoch": 0.97, + "grad_norm": 3.133452014898096, + "learning_rate": 2.130057145082687e-08, + "loss": 0.6414, + "step": 11961 + }, + { + "epoch": 0.97, + "grad_norm": 6.032224333862911, + "learning_rate": 2.1179459202479436e-08, + "loss": 0.7187, + "step": 11962 + }, + { + "epoch": 0.97, + "grad_norm": 4.218850570435848, + "learning_rate": 2.1058691519862952e-08, + "loss": 0.6403, + "step": 11963 + }, + { + "epoch": 0.97, + "grad_norm": 3.0767606734852957, + "learning_rate": 2.0938268411335172e-08, + "loss": 0.7581, + "step": 11964 + }, + { + "epoch": 0.97, + "grad_norm": 2.8919751913242755, + "learning_rate": 2.081818988522999e-08, + "loss": 0.6509, + "step": 11965 + }, + { + "epoch": 0.97, + "grad_norm": 2.4201619947583435, + "learning_rate": 2.069845594985742e-08, + "loss": 0.5854, + "step": 11966 + }, + { + "epoch": 0.97, + "grad_norm": 2.822856858224823, + "learning_rate": 2.0579066613503618e-08, + "loss": 0.6901, + "step": 11967 + }, + { + "epoch": 0.97, + "grad_norm": 3.52859049395303, + "learning_rate": 2.046002188443197e-08, + "loss": 0.7091, + "step": 11968 + }, + { + "epoch": 0.97, + "grad_norm": 4.884165994831136, + "learning_rate": 2.0341321770880327e-08, + "loss": 0.5925, + "step": 11969 + }, + { + "epoch": 0.97, + "grad_norm": 5.834157460728985, + "learning_rate": 2.0222966281063794e-08, + "loss": 0.6902, + "step": 11970 + }, + { + "epoch": 0.97, + "grad_norm": 5.618860980312664, + "learning_rate": 2.0104955423173034e-08, + "loss": 0.556, + "step": 11971 + }, + { + "epoch": 0.97, + "grad_norm": 3.311309050986231, + "learning_rate": 1.9987289205375958e-08, + "loss": 0.6753, + "step": 11972 + }, + { + "epoch": 0.97, + "grad_norm": 4.028842695582181, + "learning_rate": 1.986996763581439e-08, + "loss": 0.8232, + "step": 11973 + }, + { + "epoch": 0.97, + "grad_norm": 3.3395948196662353, + "learning_rate": 1.9752990722609057e-08, + "loss": 0.7651, + "step": 11974 + }, + { + "epoch": 0.97, + "grad_norm": 3.600131193015411, + "learning_rate": 1.9636358473855145e-08, + "loss": 0.584, + "step": 11975 + }, + { + "epoch": 0.97, + "grad_norm": 4.408745712856693, + "learning_rate": 1.9520070897623976e-08, + "loss": 0.6481, + "step": 11976 + }, + { + "epoch": 0.97, + "grad_norm": 2.3645400616337406, + "learning_rate": 1.9404128001963562e-08, + "loss": 0.7192, + "step": 11977 + }, + { + "epoch": 0.97, + "grad_norm": 7.472001271313164, + "learning_rate": 1.9288529794898037e-08, + "loss": 0.6289, + "step": 11978 + }, + { + "epoch": 0.97, + "grad_norm": 5.307054297205913, + "learning_rate": 1.9173276284427666e-08, + "loss": 0.6486, + "step": 11979 + }, + { + "epoch": 0.97, + "grad_norm": 4.581064860353578, + "learning_rate": 1.905836747852774e-08, + "loss": 0.7518, + "step": 11980 + }, + { + "epoch": 0.97, + "grad_norm": 3.0210183235128683, + "learning_rate": 1.8943803385151894e-08, + "loss": 0.6633, + "step": 11981 + }, + { + "epoch": 0.97, + "grad_norm": 4.13423809176151, + "learning_rate": 1.882958401222823e-08, + "loss": 0.6693, + "step": 11982 + }, + { + "epoch": 0.97, + "grad_norm": 6.383761537258242, + "learning_rate": 1.8715709367660984e-08, + "loss": 0.6425, + "step": 11983 + }, + { + "epoch": 0.97, + "grad_norm": 4.300049821055086, + "learning_rate": 1.8602179459331625e-08, + "loss": 0.7342, + "step": 11984 + }, + { + "epoch": 0.97, + "grad_norm": 3.9354804177743037, + "learning_rate": 1.8488994295096653e-08, + "loss": 0.6204, + "step": 11985 + }, + { + "epoch": 0.97, + "grad_norm": 2.75486604852116, + "learning_rate": 1.8376153882789792e-08, + "loss": 0.6579, + "step": 11986 + }, + { + "epoch": 0.97, + "grad_norm": 2.418300504387471, + "learning_rate": 1.8263658230219804e-08, + "loss": 0.8194, + "step": 11987 + }, + { + "epoch": 0.97, + "grad_norm": 3.807686015398802, + "learning_rate": 1.815150734517268e-08, + "loss": 0.6236, + "step": 11988 + }, + { + "epoch": 0.97, + "grad_norm": 4.369865360356615, + "learning_rate": 1.8039701235409434e-08, + "loss": 0.7407, + "step": 11989 + }, + { + "epoch": 0.97, + "grad_norm": 3.032980589017032, + "learning_rate": 1.792823990866721e-08, + "loss": 0.7408, + "step": 11990 + }, + { + "epoch": 0.97, + "grad_norm": 3.9728061904199237, + "learning_rate": 1.7817123372661505e-08, + "loss": 0.6209, + "step": 11991 + }, + { + "epoch": 0.97, + "grad_norm": 4.307731537824287, + "learning_rate": 1.770635163508061e-08, + "loss": 0.6387, + "step": 11992 + }, + { + "epoch": 0.97, + "grad_norm": 4.556848522893382, + "learning_rate": 1.7595924703591726e-08, + "loss": 0.7818, + "step": 11993 + }, + { + "epoch": 0.97, + "grad_norm": 3.2671355048469275, + "learning_rate": 1.7485842585835966e-08, + "loss": 0.6736, + "step": 11994 + }, + { + "epoch": 0.97, + "grad_norm": 6.461901095139359, + "learning_rate": 1.7376105289432786e-08, + "loss": 0.6878, + "step": 11995 + }, + { + "epoch": 0.97, + "grad_norm": 15.840750276869194, + "learning_rate": 1.7266712821976673e-08, + "loss": 0.6524, + "step": 11996 + }, + { + "epoch": 0.97, + "grad_norm": 3.386002456212125, + "learning_rate": 1.715766519103712e-08, + "loss": 0.6708, + "step": 11997 + }, + { + "epoch": 0.97, + "grad_norm": 3.2631949771024416, + "learning_rate": 1.704896240416254e-08, + "loss": 0.5577, + "step": 11998 + }, + { + "epoch": 0.97, + "grad_norm": 5.241975701142466, + "learning_rate": 1.694060446887469e-08, + "loss": 0.6202, + "step": 11999 + }, + { + "epoch": 0.97, + "grad_norm": 12.330298388112183, + "learning_rate": 1.6832591392673127e-08, + "loss": 0.9061, + "step": 12000 + }, + { + "epoch": 0.97, + "grad_norm": 4.024984042225761, + "learning_rate": 7.887651627436933e-06, + "loss": 0.6954, + "step": 12001 + }, + { + "epoch": 0.97, + "grad_norm": 3.136642290189405, + "learning_rate": 7.887293688015853e-06, + "loss": 0.6799, + "step": 12002 + }, + { + "epoch": 0.97, + "grad_norm": 2.826007311876671, + "learning_rate": 7.886935726393908e-06, + "loss": 0.5704, + "step": 12003 + }, + { + "epoch": 0.97, + "grad_norm": 3.673132617932286, + "learning_rate": 7.886577742573856e-06, + "loss": 0.7033, + "step": 12004 + }, + { + "epoch": 0.98, + "grad_norm": 3.180031638412777, + "learning_rate": 7.886219736558448e-06, + "loss": 0.5536, + "step": 12005 + }, + { + "epoch": 0.98, + "grad_norm": 6.115056360310386, + "learning_rate": 7.885861708350437e-06, + "loss": 0.6069, + "step": 12006 + }, + { + "epoch": 0.98, + "grad_norm": 5.516314784807653, + "learning_rate": 7.885503657952575e-06, + "loss": 0.6268, + "step": 12007 + }, + { + "epoch": 0.98, + "grad_norm": 4.033164723146516, + "learning_rate": 7.885145585367615e-06, + "loss": 0.5566, + "step": 12008 + }, + { + "epoch": 0.98, + "grad_norm": 3.721533168735317, + "learning_rate": 7.884787490598312e-06, + "loss": 0.6053, + "step": 12009 + }, + { + "epoch": 0.98, + "grad_norm": 11.374917535692543, + "learning_rate": 7.884429373647419e-06, + "loss": 0.7036, + "step": 12010 + }, + { + "epoch": 0.98, + "grad_norm": 6.440950636513003, + "learning_rate": 7.884071234517687e-06, + "loss": 0.6475, + "step": 12011 + }, + { + "epoch": 0.98, + "grad_norm": 3.8169925255485677, + "learning_rate": 7.883713073211874e-06, + "loss": 0.603, + "step": 12012 + }, + { + "epoch": 0.98, + "grad_norm": 3.195059509655681, + "learning_rate": 7.883354889732731e-06, + "loss": 0.608, + "step": 12013 + }, + { + "epoch": 0.98, + "grad_norm": 4.504968783982464, + "learning_rate": 7.882996684083013e-06, + "loss": 0.7708, + "step": 12014 + }, + { + "epoch": 0.98, + "grad_norm": 4.061421903967798, + "learning_rate": 7.882638456265475e-06, + "loss": 0.7702, + "step": 12015 + }, + { + "epoch": 0.98, + "grad_norm": 2.703162273490401, + "learning_rate": 7.882280206282871e-06, + "loss": 0.5519, + "step": 12016 + }, + { + "epoch": 0.98, + "grad_norm": 16.447392410081385, + "learning_rate": 7.881921934137952e-06, + "loss": 0.5959, + "step": 12017 + }, + { + "epoch": 0.98, + "grad_norm": 2.8289420634636224, + "learning_rate": 7.881563639833479e-06, + "loss": 0.7197, + "step": 12018 + }, + { + "epoch": 0.98, + "grad_norm": 4.244201348888967, + "learning_rate": 7.881205323372206e-06, + "loss": 0.7613, + "step": 12019 + }, + { + "epoch": 0.98, + "grad_norm": 3.335400755659604, + "learning_rate": 7.880846984756883e-06, + "loss": 0.8804, + "step": 12020 + }, + { + "epoch": 0.98, + "grad_norm": 3.1421324244639526, + "learning_rate": 7.88048862399027e-06, + "loss": 0.7064, + "step": 12021 + }, + { + "epoch": 0.98, + "grad_norm": 4.022727506743707, + "learning_rate": 7.880130241075121e-06, + "loss": 0.7558, + "step": 12022 + }, + { + "epoch": 0.98, + "grad_norm": 4.007836802587387, + "learning_rate": 7.879771836014191e-06, + "loss": 0.6004, + "step": 12023 + }, + { + "epoch": 0.98, + "grad_norm": 3.863983054511877, + "learning_rate": 7.879413408810239e-06, + "loss": 0.5285, + "step": 12024 + }, + { + "epoch": 0.98, + "grad_norm": 7.181891100001127, + "learning_rate": 7.879054959466017e-06, + "loss": 0.6728, + "step": 12025 + }, + { + "epoch": 0.98, + "grad_norm": 6.439894165032388, + "learning_rate": 7.878696487984282e-06, + "loss": 0.7008, + "step": 12026 + }, + { + "epoch": 0.98, + "grad_norm": 4.734452638445886, + "learning_rate": 7.878337994367793e-06, + "loss": 0.6403, + "step": 12027 + }, + { + "epoch": 0.98, + "grad_norm": 10.700781186542399, + "learning_rate": 7.877979478619303e-06, + "loss": 0.7694, + "step": 12028 + }, + { + "epoch": 0.98, + "grad_norm": 4.334030451751, + "learning_rate": 7.877620940741571e-06, + "loss": 0.7115, + "step": 12029 + }, + { + "epoch": 0.98, + "grad_norm": 3.484183309697673, + "learning_rate": 7.877262380737353e-06, + "loss": 0.6892, + "step": 12030 + }, + { + "epoch": 0.98, + "grad_norm": 4.830946426998408, + "learning_rate": 7.876903798609408e-06, + "loss": 0.705, + "step": 12031 + }, + { + "epoch": 0.98, + "grad_norm": 4.191861307537942, + "learning_rate": 7.87654519436049e-06, + "loss": 0.781, + "step": 12032 + }, + { + "epoch": 0.98, + "grad_norm": 3.2180020003697463, + "learning_rate": 7.876186567993358e-06, + "loss": 0.7363, + "step": 12033 + }, + { + "epoch": 0.98, + "grad_norm": 3.6913941949566786, + "learning_rate": 7.875827919510769e-06, + "loss": 0.5523, + "step": 12034 + }, + { + "epoch": 0.98, + "grad_norm": 4.233631716969953, + "learning_rate": 7.875469248915481e-06, + "loss": 0.6651, + "step": 12035 + }, + { + "epoch": 0.98, + "grad_norm": 5.2013003086881175, + "learning_rate": 7.875110556210252e-06, + "loss": 0.7688, + "step": 12036 + }, + { + "epoch": 0.98, + "grad_norm": 5.036950821191211, + "learning_rate": 7.874751841397841e-06, + "loss": 0.5058, + "step": 12037 + }, + { + "epoch": 0.98, + "grad_norm": 20.539660031748948, + "learning_rate": 7.874393104481004e-06, + "loss": 0.6575, + "step": 12038 + }, + { + "epoch": 0.98, + "grad_norm": 4.237282620069438, + "learning_rate": 7.874034345462502e-06, + "loss": 0.6526, + "step": 12039 + }, + { + "epoch": 0.98, + "grad_norm": 5.976683686078058, + "learning_rate": 7.87367556434509e-06, + "loss": 0.8748, + "step": 12040 + }, + { + "epoch": 0.98, + "grad_norm": 3.8723361335971447, + "learning_rate": 7.873316761131531e-06, + "loss": 0.7453, + "step": 12041 + }, + { + "epoch": 0.98, + "grad_norm": 2.8145918262647327, + "learning_rate": 7.87295793582458e-06, + "loss": 0.5655, + "step": 12042 + }, + { + "epoch": 0.98, + "grad_norm": 12.971621644696427, + "learning_rate": 7.872599088427e-06, + "loss": 0.7022, + "step": 12043 + }, + { + "epoch": 0.98, + "grad_norm": 48.36840574529206, + "learning_rate": 7.872240218941545e-06, + "loss": 0.5662, + "step": 12044 + }, + { + "epoch": 0.98, + "grad_norm": 5.128550993468869, + "learning_rate": 7.87188132737098e-06, + "loss": 0.7513, + "step": 12045 + }, + { + "epoch": 0.98, + "grad_norm": 4.118101313610252, + "learning_rate": 7.87152241371806e-06, + "loss": 0.5864, + "step": 12046 + }, + { + "epoch": 0.98, + "grad_norm": 4.079900967589845, + "learning_rate": 7.871163477985548e-06, + "loss": 0.6023, + "step": 12047 + }, + { + "epoch": 0.98, + "grad_norm": 7.159664989094367, + "learning_rate": 7.870804520176203e-06, + "loss": 0.711, + "step": 12048 + }, + { + "epoch": 0.98, + "grad_norm": 6.110818642805435, + "learning_rate": 7.870445540292784e-06, + "loss": 0.6605, + "step": 12049 + }, + { + "epoch": 0.98, + "grad_norm": 2.243267073620121, + "learning_rate": 7.870086538338054e-06, + "loss": 0.5194, + "step": 12050 + }, + { + "epoch": 0.98, + "grad_norm": 2.681849544295601, + "learning_rate": 7.869727514314767e-06, + "loss": 0.6116, + "step": 12051 + }, + { + "epoch": 0.98, + "grad_norm": 3.8018800112959603, + "learning_rate": 7.869368468225692e-06, + "loss": 0.6738, + "step": 12052 + }, + { + "epoch": 0.98, + "grad_norm": 59.764163924914094, + "learning_rate": 7.869009400073583e-06, + "loss": 0.6759, + "step": 12053 + }, + { + "epoch": 0.98, + "grad_norm": 3.604168693360398, + "learning_rate": 7.868650309861206e-06, + "loss": 0.8029, + "step": 12054 + }, + { + "epoch": 0.98, + "grad_norm": 3.73004260525144, + "learning_rate": 7.86829119759132e-06, + "loss": 0.702, + "step": 12055 + }, + { + "epoch": 0.98, + "grad_norm": 2.85374198083674, + "learning_rate": 7.867932063266685e-06, + "loss": 0.729, + "step": 12056 + }, + { + "epoch": 0.98, + "grad_norm": 2.7857704353130623, + "learning_rate": 7.867572906890064e-06, + "loss": 0.5993, + "step": 12057 + }, + { + "epoch": 0.98, + "grad_norm": 3.052394242109492, + "learning_rate": 7.867213728464219e-06, + "loss": 0.6259, + "step": 12058 + }, + { + "epoch": 0.98, + "grad_norm": 24.82425349092041, + "learning_rate": 7.866854527991908e-06, + "loss": 0.7488, + "step": 12059 + }, + { + "epoch": 0.98, + "grad_norm": 4.809873699165875, + "learning_rate": 7.866495305475898e-06, + "loss": 0.6165, + "step": 12060 + }, + { + "epoch": 0.98, + "grad_norm": 4.806074512909129, + "learning_rate": 7.86613606091895e-06, + "loss": 0.675, + "step": 12061 + }, + { + "epoch": 0.98, + "grad_norm": 3.3655606508869096, + "learning_rate": 7.865776794323823e-06, + "loss": 0.6545, + "step": 12062 + }, + { + "epoch": 0.98, + "grad_norm": 5.042723634600169, + "learning_rate": 7.865417505693282e-06, + "loss": 0.7312, + "step": 12063 + }, + { + "epoch": 0.98, + "grad_norm": 4.463062454187894, + "learning_rate": 7.86505819503009e-06, + "loss": 0.6281, + "step": 12064 + }, + { + "epoch": 0.98, + "grad_norm": 5.60971747600775, + "learning_rate": 7.86469886233701e-06, + "loss": 0.664, + "step": 12065 + }, + { + "epoch": 0.98, + "grad_norm": 3.6456850600461785, + "learning_rate": 7.864339507616803e-06, + "loss": 0.6528, + "step": 12066 + }, + { + "epoch": 0.98, + "grad_norm": 3.8403800076799977, + "learning_rate": 7.863980130872235e-06, + "loss": 0.6592, + "step": 12067 + }, + { + "epoch": 0.98, + "grad_norm": 33.9275048898015, + "learning_rate": 7.863620732106067e-06, + "loss": 0.6925, + "step": 12068 + }, + { + "epoch": 0.98, + "grad_norm": 3.7099841141422387, + "learning_rate": 7.863261311321062e-06, + "loss": 0.7796, + "step": 12069 + }, + { + "epoch": 0.98, + "grad_norm": 3.7105492781550895, + "learning_rate": 7.862901868519986e-06, + "loss": 0.6531, + "step": 12070 + }, + { + "epoch": 0.98, + "grad_norm": 7.727475759019637, + "learning_rate": 7.862542403705599e-06, + "loss": 0.6272, + "step": 12071 + }, + { + "epoch": 0.98, + "grad_norm": 4.774757700743257, + "learning_rate": 7.86218291688067e-06, + "loss": 0.5291, + "step": 12072 + }, + { + "epoch": 0.98, + "grad_norm": 5.217142021739866, + "learning_rate": 7.861823408047959e-06, + "loss": 0.8212, + "step": 12073 + }, + { + "epoch": 0.98, + "grad_norm": 4.792473091687254, + "learning_rate": 7.861463877210234e-06, + "loss": 0.8523, + "step": 12074 + }, + { + "epoch": 0.98, + "grad_norm": 6.217593114573845, + "learning_rate": 7.861104324370255e-06, + "loss": 0.6507, + "step": 12075 + }, + { + "epoch": 0.98, + "grad_norm": 3.091993839071186, + "learning_rate": 7.860744749530791e-06, + "loss": 0.7532, + "step": 12076 + }, + { + "epoch": 0.98, + "grad_norm": 4.016470200004109, + "learning_rate": 7.860385152694603e-06, + "loss": 0.5071, + "step": 12077 + }, + { + "epoch": 0.98, + "grad_norm": 4.620649092064307, + "learning_rate": 7.86002553386446e-06, + "loss": 0.6416, + "step": 12078 + }, + { + "epoch": 0.98, + "grad_norm": 3.554086480348715, + "learning_rate": 7.859665893043124e-06, + "loss": 0.6021, + "step": 12079 + }, + { + "epoch": 0.98, + "grad_norm": 6.638508230438698, + "learning_rate": 7.859306230233363e-06, + "loss": 0.5603, + "step": 12080 + }, + { + "epoch": 0.98, + "grad_norm": 7.202945507426494, + "learning_rate": 7.858946545437938e-06, + "loss": 0.7015, + "step": 12081 + }, + { + "epoch": 0.98, + "grad_norm": 5.138652319020925, + "learning_rate": 7.858586838659621e-06, + "loss": 0.5554, + "step": 12082 + }, + { + "epoch": 0.98, + "grad_norm": 3.8305891334146374, + "learning_rate": 7.858227109901172e-06, + "loss": 0.5977, + "step": 12083 + }, + { + "epoch": 0.98, + "grad_norm": 4.003655404513807, + "learning_rate": 7.85786735916536e-06, + "loss": 0.7089, + "step": 12084 + }, + { + "epoch": 0.98, + "grad_norm": 3.283468005369923, + "learning_rate": 7.857507586454951e-06, + "loss": 0.6506, + "step": 12085 + }, + { + "epoch": 0.98, + "grad_norm": 5.957433420269763, + "learning_rate": 7.85714779177271e-06, + "loss": 0.7231, + "step": 12086 + }, + { + "epoch": 0.98, + "grad_norm": 3.351845708326642, + "learning_rate": 7.856787975121407e-06, + "loss": 0.7154, + "step": 12087 + }, + { + "epoch": 0.98, + "grad_norm": 6.995854611678724, + "learning_rate": 7.856428136503804e-06, + "loss": 0.6035, + "step": 12088 + }, + { + "epoch": 0.98, + "grad_norm": 6.286548863116367, + "learning_rate": 7.85606827592267e-06, + "loss": 0.745, + "step": 12089 + }, + { + "epoch": 0.98, + "grad_norm": 3.7995685039775933, + "learning_rate": 7.855708393380775e-06, + "loss": 0.6579, + "step": 12090 + }, + { + "epoch": 0.98, + "grad_norm": 6.695566935500905, + "learning_rate": 7.85534848888088e-06, + "loss": 0.9223, + "step": 12091 + }, + { + "epoch": 0.98, + "grad_norm": 5.747206759230759, + "learning_rate": 7.854988562425758e-06, + "loss": 0.7343, + "step": 12092 + }, + { + "epoch": 0.98, + "grad_norm": 2.243430323428925, + "learning_rate": 7.854628614018172e-06, + "loss": 0.4484, + "step": 12093 + }, + { + "epoch": 0.98, + "grad_norm": 3.1351353516418046, + "learning_rate": 7.854268643660893e-06, + "loss": 0.5022, + "step": 12094 + }, + { + "epoch": 0.98, + "grad_norm": 3.596635838346699, + "learning_rate": 7.853908651356688e-06, + "loss": 0.6252, + "step": 12095 + }, + { + "epoch": 0.98, + "grad_norm": 10.757898620532243, + "learning_rate": 7.853548637108323e-06, + "loss": 0.5191, + "step": 12096 + }, + { + "epoch": 0.98, + "grad_norm": 5.8036339138670545, + "learning_rate": 7.85318860091857e-06, + "loss": 0.738, + "step": 12097 + }, + { + "epoch": 0.98, + "grad_norm": 18.586551832690887, + "learning_rate": 7.852828542790195e-06, + "loss": 0.5293, + "step": 12098 + }, + { + "epoch": 0.98, + "grad_norm": 5.1815913123352, + "learning_rate": 7.852468462725966e-06, + "loss": 0.5379, + "step": 12099 + }, + { + "epoch": 0.98, + "grad_norm": 4.494366370070267, + "learning_rate": 7.852108360728655e-06, + "loss": 0.6193, + "step": 12100 + }, + { + "epoch": 0.98, + "grad_norm": 5.130268160062507, + "learning_rate": 7.851748236801026e-06, + "loss": 0.5823, + "step": 12101 + }, + { + "epoch": 0.98, + "grad_norm": 4.166718538744968, + "learning_rate": 7.851388090945853e-06, + "loss": 0.5748, + "step": 12102 + }, + { + "epoch": 0.98, + "grad_norm": 3.818580947298578, + "learning_rate": 7.851027923165899e-06, + "loss": 0.7684, + "step": 12103 + }, + { + "epoch": 0.98, + "grad_norm": 5.1699384974859885, + "learning_rate": 7.850667733463941e-06, + "loss": 0.5763, + "step": 12104 + }, + { + "epoch": 0.98, + "grad_norm": 4.778738939474952, + "learning_rate": 7.850307521842742e-06, + "loss": 0.6607, + "step": 12105 + }, + { + "epoch": 0.98, + "grad_norm": 3.675896105279259, + "learning_rate": 7.849947288305075e-06, + "loss": 0.6721, + "step": 12106 + }, + { + "epoch": 0.98, + "grad_norm": 4.158405425690384, + "learning_rate": 7.84958703285371e-06, + "loss": 0.5552, + "step": 12107 + }, + { + "epoch": 0.98, + "grad_norm": 3.049232752823057, + "learning_rate": 7.849226755491417e-06, + "loss": 0.6009, + "step": 12108 + }, + { + "epoch": 0.98, + "grad_norm": 3.094498278259343, + "learning_rate": 7.848866456220965e-06, + "loss": 0.634, + "step": 12109 + }, + { + "epoch": 0.98, + "grad_norm": 4.524825118523398, + "learning_rate": 7.848506135045123e-06, + "loss": 0.6453, + "step": 12110 + }, + { + "epoch": 0.98, + "grad_norm": 3.9514437764359, + "learning_rate": 7.848145791966668e-06, + "loss": 0.6701, + "step": 12111 + }, + { + "epoch": 0.98, + "grad_norm": 3.5648903724874574, + "learning_rate": 7.847785426988364e-06, + "loss": 0.4488, + "step": 12112 + }, + { + "epoch": 0.98, + "grad_norm": 2.843288321049604, + "learning_rate": 7.847425040112984e-06, + "loss": 0.6156, + "step": 12113 + }, + { + "epoch": 0.98, + "grad_norm": 4.4286040758995675, + "learning_rate": 7.8470646313433e-06, + "loss": 0.5989, + "step": 12114 + }, + { + "epoch": 0.98, + "grad_norm": 3.9923110497977414, + "learning_rate": 7.84670420068208e-06, + "loss": 0.6909, + "step": 12115 + }, + { + "epoch": 0.98, + "grad_norm": 3.10144542464045, + "learning_rate": 7.846343748132102e-06, + "loss": 0.7802, + "step": 12116 + }, + { + "epoch": 0.98, + "grad_norm": 3.8305742267968874, + "learning_rate": 7.845983273696131e-06, + "loss": 0.7178, + "step": 12117 + }, + { + "epoch": 0.98, + "grad_norm": 3.246697472986766, + "learning_rate": 7.845622777376942e-06, + "loss": 0.6418, + "step": 12118 + }, + { + "epoch": 0.98, + "grad_norm": 2.9757394344174033, + "learning_rate": 7.845262259177305e-06, + "loss": 0.653, + "step": 12119 + }, + { + "epoch": 0.98, + "grad_norm": 6.571458522104288, + "learning_rate": 7.844901719099996e-06, + "loss": 0.5585, + "step": 12120 + }, + { + "epoch": 0.98, + "grad_norm": 3.233114897438143, + "learning_rate": 7.844541157147781e-06, + "loss": 0.64, + "step": 12121 + }, + { + "epoch": 0.98, + "grad_norm": 3.3106215455909243, + "learning_rate": 7.84418057332344e-06, + "loss": 0.6281, + "step": 12122 + }, + { + "epoch": 0.98, + "grad_norm": 4.061517380522789, + "learning_rate": 7.843819967629737e-06, + "loss": 0.7036, + "step": 12123 + }, + { + "epoch": 0.98, + "grad_norm": 4.012100530517519, + "learning_rate": 7.843459340069452e-06, + "loss": 0.7446, + "step": 12124 + }, + { + "epoch": 0.98, + "grad_norm": 7.17489777599761, + "learning_rate": 7.843098690645355e-06, + "loss": 0.5927, + "step": 12125 + }, + { + "epoch": 0.98, + "grad_norm": 3.523158296552429, + "learning_rate": 7.842738019360218e-06, + "loss": 0.655, + "step": 12126 + }, + { + "epoch": 0.98, + "grad_norm": 2.598426058363254, + "learning_rate": 7.842377326216818e-06, + "loss": 0.5008, + "step": 12127 + }, + { + "epoch": 0.99, + "grad_norm": 4.198243621338775, + "learning_rate": 7.842016611217924e-06, + "loss": 0.5445, + "step": 12128 + }, + { + "epoch": 0.99, + "grad_norm": 5.084293806153699, + "learning_rate": 7.841655874366313e-06, + "loss": 0.5282, + "step": 12129 + }, + { + "epoch": 0.99, + "grad_norm": 3.7836362295918757, + "learning_rate": 7.841295115664756e-06, + "loss": 0.7669, + "step": 12130 + }, + { + "epoch": 0.99, + "grad_norm": 3.6056221378375475, + "learning_rate": 7.84093433511603e-06, + "loss": 0.7224, + "step": 12131 + }, + { + "epoch": 0.99, + "grad_norm": 5.598493650674104, + "learning_rate": 7.840573532722905e-06, + "loss": 0.6373, + "step": 12132 + }, + { + "epoch": 0.99, + "grad_norm": 5.355764080741471, + "learning_rate": 7.84021270848816e-06, + "loss": 0.7114, + "step": 12133 + }, + { + "epoch": 0.99, + "grad_norm": 6.220595285582811, + "learning_rate": 7.839851862414566e-06, + "loss": 0.6773, + "step": 12134 + }, + { + "epoch": 0.99, + "grad_norm": 9.393262452020393, + "learning_rate": 7.8394909945049e-06, + "loss": 0.7257, + "step": 12135 + }, + { + "epoch": 0.99, + "grad_norm": 3.282048592073847, + "learning_rate": 7.839130104761932e-06, + "loss": 0.5825, + "step": 12136 + }, + { + "epoch": 0.99, + "grad_norm": 2.974934652597153, + "learning_rate": 7.838769193188443e-06, + "loss": 0.4903, + "step": 12137 + }, + { + "epoch": 0.99, + "grad_norm": 4.491033237232289, + "learning_rate": 7.838408259787205e-06, + "loss": 0.7295, + "step": 12138 + }, + { + "epoch": 0.99, + "grad_norm": 4.652569682851361, + "learning_rate": 7.838047304560993e-06, + "loss": 0.7767, + "step": 12139 + }, + { + "epoch": 0.99, + "grad_norm": 5.575782123747067, + "learning_rate": 7.837686327512585e-06, + "loss": 0.7394, + "step": 12140 + }, + { + "epoch": 0.99, + "grad_norm": 2.5846419254656623, + "learning_rate": 7.837325328644754e-06, + "loss": 0.538, + "step": 12141 + }, + { + "epoch": 0.99, + "grad_norm": 3.9482960618292404, + "learning_rate": 7.836964307960276e-06, + "loss": 0.5625, + "step": 12142 + }, + { + "epoch": 0.99, + "grad_norm": 5.316577658107811, + "learning_rate": 7.836603265461929e-06, + "loss": 0.6206, + "step": 12143 + }, + { + "epoch": 0.99, + "grad_norm": 10.624688009719822, + "learning_rate": 7.836242201152486e-06, + "loss": 0.8618, + "step": 12144 + }, + { + "epoch": 0.99, + "grad_norm": 4.931925501763064, + "learning_rate": 7.835881115034725e-06, + "loss": 0.5689, + "step": 12145 + }, + { + "epoch": 0.99, + "grad_norm": 3.1920217374094872, + "learning_rate": 7.835520007111424e-06, + "loss": 0.5673, + "step": 12146 + }, + { + "epoch": 0.99, + "grad_norm": 7.054961152554092, + "learning_rate": 7.835158877385356e-06, + "loss": 0.6172, + "step": 12147 + }, + { + "epoch": 0.99, + "grad_norm": 2.824546434544536, + "learning_rate": 7.8347977258593e-06, + "loss": 0.6955, + "step": 12148 + }, + { + "epoch": 0.99, + "grad_norm": 14.735700032706571, + "learning_rate": 7.834436552536035e-06, + "loss": 0.5175, + "step": 12149 + }, + { + "epoch": 0.99, + "grad_norm": 8.832659947223265, + "learning_rate": 7.834075357418334e-06, + "loss": 0.5754, + "step": 12150 + }, + { + "epoch": 0.99, + "grad_norm": 16.041721059089777, + "learning_rate": 7.833714140508977e-06, + "loss": 0.6403, + "step": 12151 + }, + { + "epoch": 0.99, + "grad_norm": 4.787231194546511, + "learning_rate": 7.83335290181074e-06, + "loss": 0.6287, + "step": 12152 + }, + { + "epoch": 0.99, + "grad_norm": 3.6284973769571454, + "learning_rate": 7.832991641326401e-06, + "loss": 0.6701, + "step": 12153 + }, + { + "epoch": 0.99, + "grad_norm": 3.4158574058330275, + "learning_rate": 7.832630359058739e-06, + "loss": 0.7507, + "step": 12154 + }, + { + "epoch": 0.99, + "grad_norm": 5.918548430325671, + "learning_rate": 7.83226905501053e-06, + "loss": 0.6509, + "step": 12155 + }, + { + "epoch": 0.99, + "grad_norm": 4.353082124426365, + "learning_rate": 7.831907729184553e-06, + "loss": 0.7693, + "step": 12156 + }, + { + "epoch": 0.99, + "grad_norm": 7.711173132583299, + "learning_rate": 7.831546381583588e-06, + "loss": 0.5678, + "step": 12157 + }, + { + "epoch": 0.99, + "grad_norm": 3.136214583937381, + "learning_rate": 7.83118501221041e-06, + "loss": 0.6021, + "step": 12158 + }, + { + "epoch": 0.99, + "grad_norm": 3.8386719091651615, + "learning_rate": 7.8308236210678e-06, + "loss": 0.6955, + "step": 12159 + }, + { + "epoch": 0.99, + "grad_norm": 9.676051206522784, + "learning_rate": 7.830462208158537e-06, + "loss": 0.7687, + "step": 12160 + }, + { + "epoch": 0.99, + "grad_norm": 3.3403014014492682, + "learning_rate": 7.830100773485398e-06, + "loss": 0.814, + "step": 12161 + }, + { + "epoch": 0.99, + "grad_norm": 4.825282859915182, + "learning_rate": 7.829739317051163e-06, + "loss": 0.6187, + "step": 12162 + }, + { + "epoch": 0.99, + "grad_norm": 3.6935016475696996, + "learning_rate": 7.829377838858614e-06, + "loss": 0.7142, + "step": 12163 + }, + { + "epoch": 0.99, + "grad_norm": 44.18085326653864, + "learning_rate": 7.829016338910526e-06, + "loss": 0.6578, + "step": 12164 + }, + { + "epoch": 0.99, + "grad_norm": 3.3725435330629177, + "learning_rate": 7.828654817209682e-06, + "loss": 0.649, + "step": 12165 + }, + { + "epoch": 0.99, + "grad_norm": 5.371158673500937, + "learning_rate": 7.82829327375886e-06, + "loss": 0.6118, + "step": 12166 + }, + { + "epoch": 0.99, + "grad_norm": 3.8597236504292964, + "learning_rate": 7.827931708560841e-06, + "loss": 0.7847, + "step": 12167 + }, + { + "epoch": 0.99, + "grad_norm": 4.140979186240384, + "learning_rate": 7.827570121618404e-06, + "loss": 0.6216, + "step": 12168 + }, + { + "epoch": 0.99, + "grad_norm": 5.741194175582843, + "learning_rate": 7.82720851293433e-06, + "loss": 0.5707, + "step": 12169 + }, + { + "epoch": 0.99, + "grad_norm": 3.997725445356984, + "learning_rate": 7.8268468825114e-06, + "loss": 0.7038, + "step": 12170 + }, + { + "epoch": 0.99, + "grad_norm": 4.2115428708702884, + "learning_rate": 7.826485230352395e-06, + "loss": 0.5636, + "step": 12171 + }, + { + "epoch": 0.99, + "grad_norm": 7.211196934784885, + "learning_rate": 7.826123556460093e-06, + "loss": 0.5492, + "step": 12172 + }, + { + "epoch": 0.99, + "grad_norm": 4.8228237443080815, + "learning_rate": 7.825761860837276e-06, + "loss": 0.7031, + "step": 12173 + }, + { + "epoch": 0.99, + "grad_norm": 5.97291479259059, + "learning_rate": 7.825400143486727e-06, + "loss": 0.6916, + "step": 12174 + }, + { + "epoch": 0.99, + "grad_norm": 7.07356665638762, + "learning_rate": 7.825038404411226e-06, + "loss": 0.6724, + "step": 12175 + }, + { + "epoch": 0.99, + "grad_norm": 3.2675880360296268, + "learning_rate": 7.824676643613556e-06, + "loss": 0.6483, + "step": 12176 + }, + { + "epoch": 0.99, + "grad_norm": 4.5353829941079935, + "learning_rate": 7.824314861096495e-06, + "loss": 0.4635, + "step": 12177 + }, + { + "epoch": 0.99, + "grad_norm": 2.846249778732734, + "learning_rate": 7.82395305686283e-06, + "loss": 0.6092, + "step": 12178 + }, + { + "epoch": 0.99, + "grad_norm": 3.5522368199597723, + "learning_rate": 7.82359123091534e-06, + "loss": 0.5557, + "step": 12179 + }, + { + "epoch": 0.99, + "grad_norm": 10.855329604628054, + "learning_rate": 7.823229383256805e-06, + "loss": 0.7271, + "step": 12180 + }, + { + "epoch": 0.99, + "grad_norm": 6.264178411998487, + "learning_rate": 7.822867513890011e-06, + "loss": 0.7451, + "step": 12181 + }, + { + "epoch": 0.99, + "grad_norm": 4.078196816084956, + "learning_rate": 7.82250562281774e-06, + "loss": 0.5085, + "step": 12182 + }, + { + "epoch": 0.99, + "grad_norm": 7.354194632814517, + "learning_rate": 7.822143710042771e-06, + "loss": 0.5789, + "step": 12183 + }, + { + "epoch": 0.99, + "grad_norm": 8.609939176473407, + "learning_rate": 7.821781775567891e-06, + "loss": 0.7198, + "step": 12184 + }, + { + "epoch": 0.99, + "grad_norm": 5.886965359507057, + "learning_rate": 7.821419819395881e-06, + "loss": 0.5406, + "step": 12185 + }, + { + "epoch": 0.99, + "grad_norm": 4.631325517278753, + "learning_rate": 7.821057841529525e-06, + "loss": 0.7126, + "step": 12186 + }, + { + "epoch": 0.99, + "grad_norm": 3.179115714905711, + "learning_rate": 7.820695841971606e-06, + "loss": 0.7723, + "step": 12187 + }, + { + "epoch": 0.99, + "grad_norm": 3.0981813321885125, + "learning_rate": 7.820333820724908e-06, + "loss": 0.7101, + "step": 12188 + }, + { + "epoch": 0.99, + "grad_norm": 7.9788668476993045, + "learning_rate": 7.819971777792212e-06, + "loss": 0.689, + "step": 12189 + }, + { + "epoch": 0.99, + "grad_norm": 2.9278301025900983, + "learning_rate": 7.819609713176305e-06, + "loss": 0.6265, + "step": 12190 + }, + { + "epoch": 0.99, + "grad_norm": 4.1349316035422525, + "learning_rate": 7.819247626879972e-06, + "loss": 0.6366, + "step": 12191 + }, + { + "epoch": 0.99, + "grad_norm": 3.496437332582988, + "learning_rate": 7.818885518905992e-06, + "loss": 0.6069, + "step": 12192 + }, + { + "epoch": 0.99, + "grad_norm": 3.9195632965385196, + "learning_rate": 7.818523389257151e-06, + "loss": 0.7834, + "step": 12193 + }, + { + "epoch": 0.99, + "grad_norm": 37.19072104576985, + "learning_rate": 7.818161237936238e-06, + "loss": 0.7115, + "step": 12194 + }, + { + "epoch": 0.99, + "grad_norm": 3.685572604659147, + "learning_rate": 7.817799064946033e-06, + "loss": 0.6284, + "step": 12195 + }, + { + "epoch": 0.99, + "grad_norm": 4.119569996234796, + "learning_rate": 7.817436870289324e-06, + "loss": 0.6561, + "step": 12196 + }, + { + "epoch": 0.99, + "grad_norm": 3.3272859247826427, + "learning_rate": 7.817074653968891e-06, + "loss": 0.6308, + "step": 12197 + }, + { + "epoch": 0.99, + "grad_norm": 3.5846458993900034, + "learning_rate": 7.816712415987523e-06, + "loss": 0.6964, + "step": 12198 + }, + { + "epoch": 0.99, + "grad_norm": 6.74395545842818, + "learning_rate": 7.816350156348006e-06, + "loss": 0.6677, + "step": 12199 + }, + { + "epoch": 0.99, + "grad_norm": 3.3677456728084936, + "learning_rate": 7.815987875053123e-06, + "loss": 0.6797, + "step": 12200 + }, + { + "epoch": 0.99, + "grad_norm": 3.2920651685559967, + "learning_rate": 7.81562557210566e-06, + "loss": 0.6911, + "step": 12201 + }, + { + "epoch": 0.99, + "grad_norm": 3.0304128828059613, + "learning_rate": 7.815263247508406e-06, + "loss": 0.6402, + "step": 12202 + }, + { + "epoch": 0.99, + "grad_norm": 5.460447163868425, + "learning_rate": 7.814900901264142e-06, + "loss": 0.7705, + "step": 12203 + }, + { + "epoch": 0.99, + "grad_norm": 9.911453053210254, + "learning_rate": 7.814538533375658e-06, + "loss": 0.6264, + "step": 12204 + }, + { + "epoch": 0.99, + "grad_norm": 3.5953934074809766, + "learning_rate": 7.814176143845737e-06, + "loss": 0.6217, + "step": 12205 + }, + { + "epoch": 0.99, + "grad_norm": 3.3373822463462273, + "learning_rate": 7.81381373267717e-06, + "loss": 0.5611, + "step": 12206 + }, + { + "epoch": 0.99, + "grad_norm": 31.8902307086028, + "learning_rate": 7.81345129987274e-06, + "loss": 0.6458, + "step": 12207 + }, + { + "epoch": 0.99, + "grad_norm": 5.098533876643183, + "learning_rate": 7.813088845435235e-06, + "loss": 0.735, + "step": 12208 + }, + { + "epoch": 0.99, + "grad_norm": 4.278928865920938, + "learning_rate": 7.812726369367441e-06, + "loss": 0.6014, + "step": 12209 + }, + { + "epoch": 0.99, + "grad_norm": 3.4908477693681808, + "learning_rate": 7.812363871672147e-06, + "loss": 0.6013, + "step": 12210 + }, + { + "epoch": 0.99, + "grad_norm": 4.549320650953422, + "learning_rate": 7.812001352352138e-06, + "loss": 0.7671, + "step": 12211 + }, + { + "epoch": 0.99, + "grad_norm": 5.2217724415360385, + "learning_rate": 7.811638811410203e-06, + "loss": 0.6527, + "step": 12212 + }, + { + "epoch": 0.99, + "grad_norm": 8.963712999094353, + "learning_rate": 7.811276248849129e-06, + "loss": 0.6332, + "step": 12213 + }, + { + "epoch": 0.99, + "grad_norm": 5.886074458142891, + "learning_rate": 7.810913664671706e-06, + "loss": 0.665, + "step": 12214 + }, + { + "epoch": 0.99, + "grad_norm": 6.471175164766711, + "learning_rate": 7.810551058880718e-06, + "loss": 0.7012, + "step": 12215 + }, + { + "epoch": 0.99, + "grad_norm": 2.7793148174480793, + "learning_rate": 7.810188431478955e-06, + "loss": 0.6479, + "step": 12216 + }, + { + "epoch": 0.99, + "grad_norm": 3.7220735311819544, + "learning_rate": 7.809825782469207e-06, + "loss": 0.7588, + "step": 12217 + }, + { + "epoch": 0.99, + "grad_norm": 2.8507931515297433, + "learning_rate": 7.80946311185426e-06, + "loss": 0.4817, + "step": 12218 + }, + { + "epoch": 0.99, + "grad_norm": 3.0043089701166386, + "learning_rate": 7.809100419636906e-06, + "loss": 0.5407, + "step": 12219 + }, + { + "epoch": 0.99, + "grad_norm": 8.795581797951318, + "learning_rate": 7.808737705819929e-06, + "loss": 0.6904, + "step": 12220 + }, + { + "epoch": 0.99, + "grad_norm": 3.492431613558116, + "learning_rate": 7.80837497040612e-06, + "loss": 0.6938, + "step": 12221 + }, + { + "epoch": 0.99, + "grad_norm": 3.6165768628534427, + "learning_rate": 7.80801221339827e-06, + "loss": 0.744, + "step": 12222 + }, + { + "epoch": 0.99, + "grad_norm": 3.7602955462066348, + "learning_rate": 7.807649434799168e-06, + "loss": 0.7081, + "step": 12223 + }, + { + "epoch": 0.99, + "grad_norm": 2.5990608601790273, + "learning_rate": 7.8072866346116e-06, + "loss": 0.7011, + "step": 12224 + }, + { + "epoch": 0.99, + "grad_norm": 3.017271375863101, + "learning_rate": 7.806923812838357e-06, + "loss": 0.6235, + "step": 12225 + }, + { + "epoch": 0.99, + "grad_norm": 4.1901361269949, + "learning_rate": 7.806560969482232e-06, + "loss": 0.7299, + "step": 12226 + }, + { + "epoch": 0.99, + "grad_norm": 3.566871661270214, + "learning_rate": 7.806198104546012e-06, + "loss": 0.7182, + "step": 12227 + }, + { + "epoch": 0.99, + "grad_norm": 5.002970092188824, + "learning_rate": 7.805835218032487e-06, + "loss": 0.4558, + "step": 12228 + }, + { + "epoch": 0.99, + "grad_norm": 3.9166014475720896, + "learning_rate": 7.80547230994445e-06, + "loss": 0.6167, + "step": 12229 + }, + { + "epoch": 0.99, + "grad_norm": 3.264870537355649, + "learning_rate": 7.805109380284688e-06, + "loss": 0.5233, + "step": 12230 + }, + { + "epoch": 0.99, + "grad_norm": 3.6615798534304567, + "learning_rate": 7.804746429055994e-06, + "loss": 0.5422, + "step": 12231 + }, + { + "epoch": 0.99, + "grad_norm": 3.717846446727007, + "learning_rate": 7.804383456261156e-06, + "loss": 0.8171, + "step": 12232 + }, + { + "epoch": 0.99, + "grad_norm": 3.5926100277554824, + "learning_rate": 7.804020461902968e-06, + "loss": 0.6136, + "step": 12233 + }, + { + "epoch": 0.99, + "grad_norm": 3.9472504729452473, + "learning_rate": 7.803657445984221e-06, + "loss": 0.7147, + "step": 12234 + }, + { + "epoch": 0.99, + "grad_norm": 4.703175604467212, + "learning_rate": 7.803294408507704e-06, + "loss": 0.7633, + "step": 12235 + }, + { + "epoch": 0.99, + "grad_norm": 18.94742197454528, + "learning_rate": 7.80293134947621e-06, + "loss": 0.6135, + "step": 12236 + }, + { + "epoch": 0.99, + "grad_norm": 10.956400725383462, + "learning_rate": 7.802568268892531e-06, + "loss": 0.6734, + "step": 12237 + }, + { + "epoch": 0.99, + "grad_norm": 5.307158523155442, + "learning_rate": 7.802205166759457e-06, + "loss": 0.6009, + "step": 12238 + }, + { + "epoch": 0.99, + "grad_norm": 2.2162234448504177, + "learning_rate": 7.801842043079784e-06, + "loss": 0.5927, + "step": 12239 + }, + { + "epoch": 0.99, + "grad_norm": 7.164349261371752, + "learning_rate": 7.801478897856298e-06, + "loss": 0.6616, + "step": 12240 + }, + { + "epoch": 0.99, + "grad_norm": 3.9214110772691244, + "learning_rate": 7.801115731091797e-06, + "loss": 0.6727, + "step": 12241 + }, + { + "epoch": 0.99, + "grad_norm": 3.4363097490008188, + "learning_rate": 7.80075254278907e-06, + "loss": 0.6967, + "step": 12242 + }, + { + "epoch": 0.99, + "grad_norm": 3.2572995279125725, + "learning_rate": 7.80038933295091e-06, + "loss": 0.7039, + "step": 12243 + }, + { + "epoch": 0.99, + "grad_norm": 6.978419475256046, + "learning_rate": 7.80002610158011e-06, + "loss": 0.6751, + "step": 12244 + }, + { + "epoch": 0.99, + "grad_norm": 4.165558393552936, + "learning_rate": 7.799662848679464e-06, + "loss": 0.7314, + "step": 12245 + }, + { + "epoch": 0.99, + "grad_norm": 4.629007274193252, + "learning_rate": 7.799299574251766e-06, + "loss": 0.5777, + "step": 12246 + }, + { + "epoch": 0.99, + "grad_norm": 5.105145079313559, + "learning_rate": 7.798936278299804e-06, + "loss": 0.617, + "step": 12247 + }, + { + "epoch": 0.99, + "grad_norm": 7.9822088811219745, + "learning_rate": 7.798572960826378e-06, + "loss": 0.6453, + "step": 12248 + }, + { + "epoch": 0.99, + "grad_norm": 6.709508600257386, + "learning_rate": 7.798209621834279e-06, + "loss": 0.6615, + "step": 12249 + }, + { + "epoch": 0.99, + "grad_norm": 3.4662543990604107, + "learning_rate": 7.7978462613263e-06, + "loss": 0.695, + "step": 12250 + }, + { + "epoch": 1.0, + "grad_norm": 3.778282186841342, + "learning_rate": 7.797482879305233e-06, + "loss": 0.6307, + "step": 12251 + }, + { + "epoch": 1.0, + "grad_norm": 3.7972152053114674, + "learning_rate": 7.797119475773877e-06, + "loss": 0.8989, + "step": 12252 + }, + { + "epoch": 1.0, + "grad_norm": 12.109735931431942, + "learning_rate": 7.796756050735023e-06, + "loss": 0.6249, + "step": 12253 + }, + { + "epoch": 1.0, + "grad_norm": 4.064435333074841, + "learning_rate": 7.796392604191468e-06, + "loss": 0.8667, + "step": 12254 + }, + { + "epoch": 1.0, + "grad_norm": 4.611726736971056, + "learning_rate": 7.796029136146003e-06, + "loss": 0.6571, + "step": 12255 + }, + { + "epoch": 1.0, + "grad_norm": 14.420739561213098, + "learning_rate": 7.795665646601425e-06, + "loss": 0.5519, + "step": 12256 + }, + { + "epoch": 1.0, + "grad_norm": 8.158273952733538, + "learning_rate": 7.795302135560527e-06, + "loss": 0.6887, + "step": 12257 + }, + { + "epoch": 1.0, + "grad_norm": 3.271767687201891, + "learning_rate": 7.794938603026107e-06, + "loss": 0.6402, + "step": 12258 + }, + { + "epoch": 1.0, + "grad_norm": 4.771347499546004, + "learning_rate": 7.794575049000961e-06, + "loss": 0.7685, + "step": 12259 + }, + { + "epoch": 1.0, + "grad_norm": 11.082559439950336, + "learning_rate": 7.79421147348788e-06, + "loss": 0.7517, + "step": 12260 + }, + { + "epoch": 1.0, + "grad_norm": 3.1195492010686197, + "learning_rate": 7.793847876489662e-06, + "loss": 0.6749, + "step": 12261 + }, + { + "epoch": 1.0, + "grad_norm": 3.747118224137669, + "learning_rate": 7.793484258009103e-06, + "loss": 0.6963, + "step": 12262 + }, + { + "epoch": 1.0, + "grad_norm": 4.260413779070824, + "learning_rate": 7.793120618048997e-06, + "loss": 0.7857, + "step": 12263 + }, + { + "epoch": 1.0, + "grad_norm": 5.865520868853068, + "learning_rate": 7.792756956612143e-06, + "loss": 0.7947, + "step": 12264 + }, + { + "epoch": 1.0, + "grad_norm": 4.8332355000111376, + "learning_rate": 7.792393273701337e-06, + "loss": 0.5821, + "step": 12265 + }, + { + "epoch": 1.0, + "grad_norm": 25.033079898992558, + "learning_rate": 7.792029569319374e-06, + "loss": 0.7391, + "step": 12266 + }, + { + "epoch": 1.0, + "grad_norm": 6.894461244006182, + "learning_rate": 7.791665843469049e-06, + "loss": 0.6408, + "step": 12267 + }, + { + "epoch": 1.0, + "grad_norm": 5.434250883738288, + "learning_rate": 7.791302096153162e-06, + "loss": 0.8545, + "step": 12268 + }, + { + "epoch": 1.0, + "grad_norm": 35.93544476333542, + "learning_rate": 7.790938327374508e-06, + "loss": 0.6269, + "step": 12269 + }, + { + "epoch": 1.0, + "grad_norm": 13.647866478969739, + "learning_rate": 7.790574537135886e-06, + "loss": 0.721, + "step": 12270 + }, + { + "epoch": 1.0, + "grad_norm": 3.4969679179110615, + "learning_rate": 7.790210725440091e-06, + "loss": 0.637, + "step": 12271 + }, + { + "epoch": 1.0, + "grad_norm": 3.018765412240935, + "learning_rate": 7.789846892289921e-06, + "loss": 0.7249, + "step": 12272 + }, + { + "epoch": 1.0, + "grad_norm": 3.9287572774896864, + "learning_rate": 7.789483037688174e-06, + "loss": 0.7201, + "step": 12273 + }, + { + "epoch": 1.0, + "grad_norm": 2.475719742855069, + "learning_rate": 7.789119161637649e-06, + "loss": 0.6552, + "step": 12274 + }, + { + "epoch": 1.0, + "grad_norm": 4.689685354833785, + "learning_rate": 7.78875526414114e-06, + "loss": 0.5868, + "step": 12275 + }, + { + "epoch": 1.0, + "grad_norm": 3.9910993335853244, + "learning_rate": 7.788391345201449e-06, + "loss": 0.4659, + "step": 12276 + }, + { + "epoch": 1.0, + "grad_norm": 4.0144784681909025, + "learning_rate": 7.788027404821375e-06, + "loss": 0.8007, + "step": 12277 + }, + { + "epoch": 1.0, + "grad_norm": 2.538671321675504, + "learning_rate": 7.78766344300371e-06, + "loss": 0.4756, + "step": 12278 + }, + { + "epoch": 1.0, + "grad_norm": 7.304400678411963, + "learning_rate": 7.78729945975126e-06, + "loss": 0.6857, + "step": 12279 + }, + { + "epoch": 1.0, + "grad_norm": 3.226066453952374, + "learning_rate": 7.786935455066817e-06, + "loss": 0.5848, + "step": 12280 + }, + { + "epoch": 1.0, + "grad_norm": 5.534743052487797, + "learning_rate": 7.786571428953187e-06, + "loss": 0.7197, + "step": 12281 + }, + { + "epoch": 1.0, + "grad_norm": 3.8865146800842147, + "learning_rate": 7.786207381413164e-06, + "loss": 0.5744, + "step": 12282 + }, + { + "epoch": 1.0, + "grad_norm": 5.95261662451427, + "learning_rate": 7.785843312449548e-06, + "loss": 0.6293, + "step": 12283 + }, + { + "epoch": 1.0, + "grad_norm": 3.195883462505136, + "learning_rate": 7.78547922206514e-06, + "loss": 0.6475, + "step": 12284 + }, + { + "epoch": 1.0, + "grad_norm": 3.75891372454614, + "learning_rate": 7.785115110262738e-06, + "loss": 0.7816, + "step": 12285 + }, + { + "epoch": 1.0, + "grad_norm": 3.556380028332088, + "learning_rate": 7.784750977045143e-06, + "loss": 0.7241, + "step": 12286 + }, + { + "epoch": 1.0, + "grad_norm": 3.8345500910780563, + "learning_rate": 7.784386822415152e-06, + "loss": 0.7368, + "step": 12287 + }, + { + "epoch": 1.0, + "grad_norm": 4.313241620150329, + "learning_rate": 7.784022646375569e-06, + "loss": 0.5734, + "step": 12288 + }, + { + "epoch": 1.0, + "grad_norm": 4.228884839750384, + "learning_rate": 7.783658448929193e-06, + "loss": 0.4883, + "step": 12289 + }, + { + "epoch": 1.0, + "grad_norm": 4.661439332288415, + "learning_rate": 7.783294230078823e-06, + "loss": 0.6979, + "step": 12290 + }, + { + "epoch": 1.0, + "grad_norm": 8.249220768611895, + "learning_rate": 7.78292998982726e-06, + "loss": 0.6955, + "step": 12291 + }, + { + "epoch": 1.0, + "grad_norm": 3.1187762738556417, + "learning_rate": 7.782565728177304e-06, + "loss": 0.5674, + "step": 12292 + }, + { + "epoch": 1.0, + "grad_norm": 8.556128914340617, + "learning_rate": 7.782201445131761e-06, + "loss": 0.677, + "step": 12293 + }, + { + "epoch": 1.0, + "grad_norm": 30.013634363000687, + "learning_rate": 7.781837140693425e-06, + "loss": 0.6098, + "step": 12294 + }, + { + "epoch": 1.0, + "grad_norm": 3.6626451992633204, + "learning_rate": 7.781472814865099e-06, + "loss": 0.7141, + "step": 12295 + }, + { + "epoch": 1.0, + "grad_norm": 4.189813590714736, + "learning_rate": 7.781108467649588e-06, + "loss": 0.5932, + "step": 12296 + }, + { + "epoch": 1.0, + "grad_norm": 7.4752300479225315, + "learning_rate": 7.780744099049689e-06, + "loss": 0.6724, + "step": 12297 + }, + { + "epoch": 1.0, + "grad_norm": 3.3388877413636706, + "learning_rate": 7.780379709068206e-06, + "loss": 0.7507, + "step": 12298 + }, + { + "epoch": 1.0, + "grad_norm": 2.6990917016073555, + "learning_rate": 7.780015297707942e-06, + "loss": 0.7497, + "step": 12299 + }, + { + "epoch": 1.0, + "grad_norm": 2.8370745145690477, + "learning_rate": 7.779650864971695e-06, + "loss": 0.4812, + "step": 12300 + }, + { + "epoch": 1.0, + "grad_norm": 4.1805838085100095, + "learning_rate": 7.779286410862273e-06, + "loss": 0.5362, + "step": 12301 + }, + { + "epoch": 1.0, + "grad_norm": 3.269065170056007, + "learning_rate": 7.778921935382473e-06, + "loss": 0.7703, + "step": 12302 + }, + { + "epoch": 1.0, + "grad_norm": 5.339250280794474, + "learning_rate": 7.778557438535099e-06, + "loss": 0.6341, + "step": 12303 + }, + { + "epoch": 1.0, + "grad_norm": 2.9216049089902607, + "learning_rate": 7.778192920322955e-06, + "loss": 0.6253, + "step": 12304 + }, + { + "epoch": 1.0, + "grad_norm": 20.6498819775808, + "learning_rate": 7.777828380748844e-06, + "loss": 0.7632, + "step": 12305 + }, + { + "epoch": 1.0, + "grad_norm": 4.740886465965859, + "learning_rate": 7.777463819815568e-06, + "loss": 0.6718, + "step": 12306 + }, + { + "epoch": 1.0, + "grad_norm": 5.251076687239021, + "learning_rate": 7.777099237525929e-06, + "loss": 0.5973, + "step": 12307 + }, + { + "epoch": 1.0, + "grad_norm": 3.4352323146561954, + "learning_rate": 7.776734633882731e-06, + "loss": 0.6901, + "step": 12308 + }, + { + "epoch": 1.0, + "grad_norm": 2.980752899021085, + "learning_rate": 7.776370008888781e-06, + "loss": 0.6319, + "step": 12309 + }, + { + "epoch": 1.0, + "grad_norm": 3.401668790743825, + "learning_rate": 7.77600536254688e-06, + "loss": 0.606, + "step": 12310 + }, + { + "epoch": 1.0, + "grad_norm": 2.7812082880721687, + "learning_rate": 7.77564069485983e-06, + "loss": 0.5914, + "step": 12311 + }, + { + "epoch": 1.0, + "grad_norm": 8.209408090889218, + "learning_rate": 7.775276005830434e-06, + "loss": 0.8583, + "step": 12312 + }, + { + "epoch": 1.0, + "grad_norm": 3.185361682188218, + "learning_rate": 7.774911295461503e-06, + "loss": 0.6666, + "step": 12313 + }, + { + "epoch": 1.0, + "grad_norm": 3.498702307938696, + "learning_rate": 7.774546563755833e-06, + "loss": 0.484, + "step": 12314 + }, + { + "epoch": 1.0, + "grad_norm": 5.472964213982246, + "learning_rate": 7.774181810716236e-06, + "loss": 0.5913, + "step": 12315 + }, + { + "epoch": 1.0, + "grad_norm": 2.833738539254753, + "learning_rate": 7.773817036345513e-06, + "loss": 0.6254, + "step": 12316 + }, + { + "epoch": 1.0, + "grad_norm": 7.236101394034412, + "learning_rate": 7.773452240646466e-06, + "loss": 0.6619, + "step": 12317 + }, + { + "epoch": 1.0, + "grad_norm": 4.9409095194088675, + "learning_rate": 7.773087423621905e-06, + "loss": 0.6499, + "step": 12318 + }, + { + "epoch": 1.0, + "grad_norm": 5.177145996419497, + "learning_rate": 7.772722585274633e-06, + "loss": 0.7187, + "step": 12319 + }, + { + "epoch": 1.0, + "grad_norm": 5.385487705384896, + "learning_rate": 7.772357725607455e-06, + "loss": 0.5783, + "step": 12320 + }, + { + "epoch": 1.0, + "grad_norm": 3.986181230282297, + "learning_rate": 7.771992844623177e-06, + "loss": 0.6453, + "step": 12321 + }, + { + "epoch": 1.0, + "grad_norm": 4.369590576159136, + "learning_rate": 7.771627942324605e-06, + "loss": 0.6456, + "step": 12322 + }, + { + "epoch": 1.0, + "grad_norm": 6.991346849888702, + "learning_rate": 7.771263018714544e-06, + "loss": 0.4831, + "step": 12323 + }, + { + "epoch": 1.0, + "grad_norm": 33.44804825094083, + "learning_rate": 7.7708980737958e-06, + "loss": 0.6116, + "step": 12324 + }, + { + "epoch": 1.0, + "grad_norm": 3.0716189454435665, + "learning_rate": 7.77053310757118e-06, + "loss": 0.5712, + "step": 12325 + }, + { + "epoch": 1.0, + "grad_norm": 4.089502726707574, + "learning_rate": 7.77016812004349e-06, + "loss": 0.6704, + "step": 12326 + }, + { + "epoch": 1.0, + "grad_norm": 4.657682099783619, + "learning_rate": 7.769803111215534e-06, + "loss": 0.4638, + "step": 12327 + }, + { + "epoch": 1.0, + "grad_norm": 4.723065857048151, + "learning_rate": 7.769438081090121e-06, + "loss": 0.597, + "step": 12328 + }, + { + "epoch": 1.0, + "grad_norm": 6.691568424305026, + "learning_rate": 7.76907302967006e-06, + "loss": 0.8181, + "step": 12329 + }, + { + "epoch": 1.0, + "grad_norm": 3.0383961866918083, + "learning_rate": 7.768707956958154e-06, + "loss": 0.6252, + "step": 12330 + }, + { + "epoch": 1.0, + "grad_norm": 3.191519093750927, + "learning_rate": 7.76834286295721e-06, + "loss": 0.7377, + "step": 12331 + }, + { + "epoch": 1.0, + "grad_norm": 4.050261980539512, + "learning_rate": 7.76797774767004e-06, + "loss": 0.6642, + "step": 12332 + }, + { + "epoch": 1.0, + "grad_norm": 16.666896987375384, + "learning_rate": 7.767612611099444e-06, + "loss": 0.7379, + "step": 12333 + }, + { + "epoch": 1.0, + "grad_norm": 4.610156634550446, + "learning_rate": 7.767247453248237e-06, + "loss": 0.7204, + "step": 12334 + }, + { + "epoch": 1.0, + "grad_norm": 3.0943573096149506, + "learning_rate": 7.766882274119222e-06, + "loss": 0.5846, + "step": 12335 + }, + { + "epoch": 1.0, + "grad_norm": 3.548686559472081, + "learning_rate": 7.766517073715208e-06, + "loss": 0.7203, + "step": 12336 + }, + { + "epoch": 1.0, + "grad_norm": 3.4692453031314496, + "learning_rate": 7.766151852039006e-06, + "loss": 0.5866, + "step": 12337 + }, + { + "epoch": 1.0, + "grad_norm": 3.4946597367643375, + "learning_rate": 7.76578660909342e-06, + "loss": 0.5318, + "step": 12338 + }, + { + "epoch": 1.0, + "grad_norm": 4.370031047749948, + "learning_rate": 7.765421344881261e-06, + "loss": 0.6678, + "step": 12339 + }, + { + "epoch": 1.0, + "grad_norm": 3.71542157403533, + "learning_rate": 7.765056059405335e-06, + "loss": 0.6277, + "step": 12340 + }, + { + "epoch": 1.0, + "grad_norm": 5.94370950967478, + "learning_rate": 7.764690752668454e-06, + "loss": 0.6381, + "step": 12341 + }, + { + "epoch": 1.0, + "grad_norm": 4.911420117400132, + "learning_rate": 7.764325424673425e-06, + "loss": 0.4531, + "step": 12342 + }, + { + "epoch": 1.0, + "grad_norm": 4.102001959781842, + "learning_rate": 7.763960075423059e-06, + "loss": 0.6942, + "step": 12343 + }, + { + "epoch": 1.0, + "grad_norm": 2.5015750285894587, + "learning_rate": 7.763594704920161e-06, + "loss": 0.5658, + "step": 12344 + }, + { + "epoch": 1.0, + "grad_norm": 2.953171671566058, + "learning_rate": 7.763229313167547e-06, + "loss": 0.5321, + "step": 12345 + }, + { + "epoch": 1.0, + "grad_norm": 7.127827544512035, + "learning_rate": 7.762863900168019e-06, + "loss": 0.7065, + "step": 12346 + }, + { + "epoch": 1.0, + "grad_norm": 3.3827313382861, + "learning_rate": 7.762498465924391e-06, + "loss": 0.7649, + "step": 12347 + }, + { + "epoch": 1.0, + "grad_norm": 6.547389372714532, + "learning_rate": 7.762133010439474e-06, + "loss": 0.5579, + "step": 12348 + }, + { + "epoch": 1.0, + "grad_norm": 3.98740693862126, + "learning_rate": 7.761767533716076e-06, + "loss": 0.6622, + "step": 12349 + }, + { + "epoch": 1.0, + "grad_norm": 2.9972146281650947, + "learning_rate": 7.761402035757007e-06, + "loss": 0.7119, + "step": 12350 + }, + { + "epoch": 1.0, + "grad_norm": 6.761449708977114, + "learning_rate": 7.761036516565077e-06, + "loss": 0.5546, + "step": 12351 + }, + { + "epoch": 1.0, + "grad_norm": 5.014182086625072, + "learning_rate": 7.760670976143098e-06, + "loss": 0.7446, + "step": 12352 + }, + { + "epoch": 1.0, + "grad_norm": 5.390113396572107, + "learning_rate": 7.76030541449388e-06, + "loss": 0.7511, + "step": 12353 + }, + { + "epoch": 1.0, + "grad_norm": 4.489083824453948, + "learning_rate": 7.759939831620234e-06, + "loss": 0.5598, + "step": 12354 + }, + { + "epoch": 1.0, + "grad_norm": 5.4785876621263325, + "learning_rate": 7.75957422752497e-06, + "loss": 0.764, + "step": 12355 + }, + { + "epoch": 1.0, + "grad_norm": 3.301134677945862, + "learning_rate": 7.759208602210903e-06, + "loss": 0.621, + "step": 12356 + }, + { + "epoch": 1.0, + "grad_norm": 5.137221195664433, + "learning_rate": 7.758842955680841e-06, + "loss": 0.6051, + "step": 12357 + }, + { + "epoch": 1.0, + "grad_norm": 7.161915928381801, + "learning_rate": 7.758477287937594e-06, + "loss": 0.5628, + "step": 12358 + }, + { + "epoch": 1.0, + "grad_norm": 14.221602585639925, + "learning_rate": 7.758111598983978e-06, + "loss": 0.5629, + "step": 12359 + }, + { + "epoch": 1.0, + "grad_norm": 10.61359663720406, + "learning_rate": 7.7577458888228e-06, + "loss": 0.7102, + "step": 12360 + }, + { + "epoch": 1.0, + "grad_norm": 19.547655154438917, + "learning_rate": 7.757380157456876e-06, + "loss": 0.607, + "step": 12361 + }, + { + "epoch": 1.0, + "grad_norm": 6.079260282167976, + "learning_rate": 7.757014404889017e-06, + "loss": 0.5267, + "step": 12362 + }, + { + "epoch": 1.0, + "grad_norm": 12.455611052289733, + "learning_rate": 7.756648631122034e-06, + "loss": 0.7443, + "step": 12363 + }, + { + "epoch": 1.0, + "grad_norm": 2.995062351657669, + "learning_rate": 7.756282836158743e-06, + "loss": 0.5046, + "step": 12364 + }, + { + "epoch": 1.0, + "grad_norm": 3.9298575264214537, + "learning_rate": 7.755917020001952e-06, + "loss": 0.6369, + "step": 12365 + }, + { + "epoch": 1.0, + "grad_norm": 2.671893253903679, + "learning_rate": 7.755551182654478e-06, + "loss": 0.6545, + "step": 12366 + }, + { + "epoch": 1.0, + "grad_norm": 2.8006294274097616, + "learning_rate": 7.75518532411913e-06, + "loss": 0.7098, + "step": 12367 + }, + { + "epoch": 1.0, + "grad_norm": 9.810284457455396, + "learning_rate": 7.754819444398725e-06, + "loss": 0.6724, + "step": 12368 + }, + { + "epoch": 1.0, + "grad_norm": 3.6523321395386037, + "learning_rate": 7.754453543496071e-06, + "loss": 0.64, + "step": 12369 + }, + { + "epoch": 1.0, + "grad_norm": 2.3938093764643162, + "learning_rate": 7.754087621413989e-06, + "loss": 0.6504, + "step": 12370 + }, + { + "epoch": 1.0, + "grad_norm": 4.151078216143737, + "learning_rate": 7.753721678155287e-06, + "loss": 0.8103, + "step": 12371 + }, + { + "epoch": 1.0, + "grad_norm": 7.634980881799524, + "learning_rate": 7.75335571372278e-06, + "loss": 0.7608, + "step": 12372 + }, + { + "epoch": 1.0, + "grad_norm": 17.874650859569957, + "learning_rate": 7.752989728119283e-06, + "loss": 0.6029, + "step": 12373 + }, + { + "epoch": 1.01, + "grad_norm": 3.8044805748291934, + "learning_rate": 7.752623721347609e-06, + "loss": 0.5672, + "step": 12374 + }, + { + "epoch": 1.01, + "grad_norm": 3.4340875179578347, + "learning_rate": 7.752257693410574e-06, + "loss": 0.544, + "step": 12375 + }, + { + "epoch": 1.01, + "grad_norm": 4.24443514873277, + "learning_rate": 7.75189164431099e-06, + "loss": 0.5404, + "step": 12376 + }, + { + "epoch": 1.01, + "grad_norm": 4.998389835300329, + "learning_rate": 7.751525574051672e-06, + "loss": 0.5918, + "step": 12377 + }, + { + "epoch": 1.01, + "grad_norm": 4.390147408671915, + "learning_rate": 7.751159482635437e-06, + "loss": 0.5597, + "step": 12378 + }, + { + "epoch": 1.01, + "grad_norm": 3.5628917431910634, + "learning_rate": 7.750793370065098e-06, + "loss": 0.5366, + "step": 12379 + }, + { + "epoch": 1.01, + "grad_norm": 4.95645296554164, + "learning_rate": 7.750427236343471e-06, + "loss": 0.6, + "step": 12380 + }, + { + "epoch": 1.01, + "grad_norm": 4.259826541221953, + "learning_rate": 7.75006108147337e-06, + "loss": 0.6176, + "step": 12381 + }, + { + "epoch": 1.01, + "grad_norm": 8.25881958656223, + "learning_rate": 7.749694905457612e-06, + "loss": 0.6598, + "step": 12382 + }, + { + "epoch": 1.01, + "grad_norm": 5.012678935908401, + "learning_rate": 7.749328708299012e-06, + "loss": 0.6759, + "step": 12383 + }, + { + "epoch": 1.01, + "grad_norm": 2.9763067338894955, + "learning_rate": 7.748962490000385e-06, + "loss": 0.6029, + "step": 12384 + }, + { + "epoch": 1.01, + "grad_norm": 3.302962317854548, + "learning_rate": 7.748596250564548e-06, + "loss": 0.611, + "step": 12385 + }, + { + "epoch": 1.01, + "grad_norm": 4.275089073351172, + "learning_rate": 7.748229989994317e-06, + "loss": 0.5557, + "step": 12386 + }, + { + "epoch": 1.01, + "grad_norm": 5.855164554664778, + "learning_rate": 7.747863708292508e-06, + "loss": 0.5917, + "step": 12387 + }, + { + "epoch": 1.01, + "grad_norm": 4.005887088186894, + "learning_rate": 7.747497405461936e-06, + "loss": 0.7338, + "step": 12388 + }, + { + "epoch": 1.01, + "grad_norm": 4.331118169381667, + "learning_rate": 7.747131081505419e-06, + "loss": 0.6196, + "step": 12389 + }, + { + "epoch": 1.01, + "grad_norm": 7.306210302354887, + "learning_rate": 7.746764736425774e-06, + "loss": 0.7645, + "step": 12390 + }, + { + "epoch": 1.01, + "grad_norm": 3.3845578069745885, + "learning_rate": 7.746398370225818e-06, + "loss": 0.8394, + "step": 12391 + }, + { + "epoch": 1.01, + "grad_norm": 5.143218388978716, + "learning_rate": 7.746031982908367e-06, + "loss": 0.6478, + "step": 12392 + }, + { + "epoch": 1.01, + "grad_norm": 4.235429469520762, + "learning_rate": 7.74566557447624e-06, + "loss": 0.6428, + "step": 12393 + }, + { + "epoch": 1.01, + "grad_norm": 3.641673011416448, + "learning_rate": 7.745299144932251e-06, + "loss": 0.6982, + "step": 12394 + }, + { + "epoch": 1.01, + "grad_norm": 3.656750697109626, + "learning_rate": 7.744932694279219e-06, + "loss": 0.6602, + "step": 12395 + }, + { + "epoch": 1.01, + "grad_norm": 3.980877697518048, + "learning_rate": 7.744566222519964e-06, + "loss": 0.6246, + "step": 12396 + }, + { + "epoch": 1.01, + "grad_norm": 5.759409884924394, + "learning_rate": 7.744199729657303e-06, + "loss": 0.7747, + "step": 12397 + }, + { + "epoch": 1.01, + "grad_norm": 3.5644067378487727, + "learning_rate": 7.74383321569405e-06, + "loss": 0.5402, + "step": 12398 + }, + { + "epoch": 1.01, + "grad_norm": 3.474110855441311, + "learning_rate": 7.74346668063303e-06, + "loss": 0.6459, + "step": 12399 + }, + { + "epoch": 1.01, + "grad_norm": 8.8094401957294, + "learning_rate": 7.743100124477054e-06, + "loss": 0.6544, + "step": 12400 + }, + { + "epoch": 1.01, + "grad_norm": 4.508837876645364, + "learning_rate": 7.742733547228947e-06, + "loss": 0.7154, + "step": 12401 + }, + { + "epoch": 1.01, + "grad_norm": 8.482146241296723, + "learning_rate": 7.742366948891523e-06, + "loss": 0.4897, + "step": 12402 + }, + { + "epoch": 1.01, + "grad_norm": 2.8012339544173175, + "learning_rate": 7.742000329467605e-06, + "loss": 0.7675, + "step": 12403 + }, + { + "epoch": 1.01, + "grad_norm": 4.811890366880647, + "learning_rate": 7.741633688960007e-06, + "loss": 0.7713, + "step": 12404 + }, + { + "epoch": 1.01, + "grad_norm": 3.718089944762814, + "learning_rate": 7.741267027371553e-06, + "loss": 0.6709, + "step": 12405 + }, + { + "epoch": 1.01, + "grad_norm": 4.208618513478443, + "learning_rate": 7.74090034470506e-06, + "loss": 0.6052, + "step": 12406 + }, + { + "epoch": 1.01, + "grad_norm": 3.5628861606361024, + "learning_rate": 7.740533640963347e-06, + "loss": 0.7031, + "step": 12407 + }, + { + "epoch": 1.01, + "grad_norm": 4.951995912330768, + "learning_rate": 7.740166916149234e-06, + "loss": 0.6086, + "step": 12408 + }, + { + "epoch": 1.01, + "grad_norm": 4.066257857836169, + "learning_rate": 7.739800170265542e-06, + "loss": 0.496, + "step": 12409 + }, + { + "epoch": 1.01, + "grad_norm": 2.7838069601218964, + "learning_rate": 7.739433403315088e-06, + "loss": 0.5929, + "step": 12410 + }, + { + "epoch": 1.01, + "grad_norm": 3.9309172196933706, + "learning_rate": 7.739066615300697e-06, + "loss": 0.6071, + "step": 12411 + }, + { + "epoch": 1.01, + "grad_norm": 4.266282188232434, + "learning_rate": 7.738699806225185e-06, + "loss": 0.652, + "step": 12412 + }, + { + "epoch": 1.01, + "grad_norm": 4.682750071130759, + "learning_rate": 7.738332976091374e-06, + "loss": 0.7113, + "step": 12413 + }, + { + "epoch": 1.01, + "grad_norm": 4.234492710801553, + "learning_rate": 7.737966124902086e-06, + "loss": 0.7745, + "step": 12414 + }, + { + "epoch": 1.01, + "grad_norm": 3.8162428891457822, + "learning_rate": 7.737599252660139e-06, + "loss": 0.6393, + "step": 12415 + }, + { + "epoch": 1.01, + "grad_norm": 4.488736193502105, + "learning_rate": 7.737232359368355e-06, + "loss": 0.6527, + "step": 12416 + }, + { + "epoch": 1.01, + "grad_norm": 2.6143265710864934, + "learning_rate": 7.736865445029555e-06, + "loss": 0.6947, + "step": 12417 + }, + { + "epoch": 1.01, + "grad_norm": 3.4623748071856157, + "learning_rate": 7.736498509646562e-06, + "loss": 0.5658, + "step": 12418 + }, + { + "epoch": 1.01, + "grad_norm": 3.5217046523318696, + "learning_rate": 7.736131553222195e-06, + "loss": 0.5268, + "step": 12419 + }, + { + "epoch": 1.01, + "grad_norm": 3.374979470035191, + "learning_rate": 7.735764575759278e-06, + "loss": 0.6547, + "step": 12420 + }, + { + "epoch": 1.01, + "grad_norm": 3.485433535474984, + "learning_rate": 7.73539757726063e-06, + "loss": 0.5143, + "step": 12421 + }, + { + "epoch": 1.01, + "grad_norm": 7.004160809032969, + "learning_rate": 7.735030557729075e-06, + "loss": 0.7766, + "step": 12422 + }, + { + "epoch": 1.01, + "grad_norm": 3.6036504120675192, + "learning_rate": 7.734663517167436e-06, + "loss": 0.7153, + "step": 12423 + }, + { + "epoch": 1.01, + "grad_norm": 3.884095417512094, + "learning_rate": 7.734296455578531e-06, + "loss": 0.6505, + "step": 12424 + }, + { + "epoch": 1.01, + "grad_norm": 4.573398702571422, + "learning_rate": 7.733929372965185e-06, + "loss": 0.7059, + "step": 12425 + }, + { + "epoch": 1.01, + "grad_norm": 3.8139982635944367, + "learning_rate": 7.733562269330222e-06, + "loss": 0.6952, + "step": 12426 + }, + { + "epoch": 1.01, + "grad_norm": 5.194551261366177, + "learning_rate": 7.733195144676463e-06, + "loss": 0.5159, + "step": 12427 + }, + { + "epoch": 1.01, + "grad_norm": 4.860328106309098, + "learning_rate": 7.732827999006732e-06, + "loss": 0.7511, + "step": 12428 + }, + { + "epoch": 1.01, + "grad_norm": 3.3449992575824705, + "learning_rate": 7.732460832323849e-06, + "loss": 0.6291, + "step": 12429 + }, + { + "epoch": 1.01, + "grad_norm": 6.181628492683932, + "learning_rate": 7.732093644630641e-06, + "loss": 0.7886, + "step": 12430 + }, + { + "epoch": 1.01, + "grad_norm": 5.136907784533668, + "learning_rate": 7.73172643592993e-06, + "loss": 0.6918, + "step": 12431 + }, + { + "epoch": 1.01, + "grad_norm": 2.4852940212195125, + "learning_rate": 7.73135920622454e-06, + "loss": 0.6246, + "step": 12432 + }, + { + "epoch": 1.01, + "grad_norm": 4.105300684373457, + "learning_rate": 7.730991955517291e-06, + "loss": 0.6206, + "step": 12433 + }, + { + "epoch": 1.01, + "grad_norm": 6.681406266577509, + "learning_rate": 7.730624683811012e-06, + "loss": 0.6622, + "step": 12434 + }, + { + "epoch": 1.01, + "grad_norm": 137.76017076187927, + "learning_rate": 7.730257391108524e-06, + "loss": 0.6682, + "step": 12435 + }, + { + "epoch": 1.01, + "grad_norm": 2.5196184702541746, + "learning_rate": 7.729890077412655e-06, + "loss": 0.6022, + "step": 12436 + }, + { + "epoch": 1.01, + "grad_norm": 3.7161633214146996, + "learning_rate": 7.729522742726221e-06, + "loss": 0.6222, + "step": 12437 + }, + { + "epoch": 1.01, + "grad_norm": 4.63742921396767, + "learning_rate": 7.729155387052057e-06, + "loss": 0.766, + "step": 12438 + }, + { + "epoch": 1.01, + "grad_norm": 3.6814001993959615, + "learning_rate": 7.72878801039298e-06, + "loss": 0.5864, + "step": 12439 + }, + { + "epoch": 1.01, + "grad_norm": 4.246354017028032, + "learning_rate": 7.728420612751816e-06, + "loss": 0.5371, + "step": 12440 + }, + { + "epoch": 1.01, + "grad_norm": 3.89690299307031, + "learning_rate": 7.728053194131393e-06, + "loss": 0.5069, + "step": 12441 + }, + { + "epoch": 1.01, + "grad_norm": 2.512753592726871, + "learning_rate": 7.727685754534535e-06, + "loss": 0.5506, + "step": 12442 + }, + { + "epoch": 1.01, + "grad_norm": 3.1359035638049595, + "learning_rate": 7.727318293964066e-06, + "loss": 0.703, + "step": 12443 + }, + { + "epoch": 1.01, + "grad_norm": 3.4010716550034585, + "learning_rate": 7.726950812422812e-06, + "loss": 0.6649, + "step": 12444 + }, + { + "epoch": 1.01, + "grad_norm": 3.4902385325799314, + "learning_rate": 7.7265833099136e-06, + "loss": 0.7175, + "step": 12445 + }, + { + "epoch": 1.01, + "grad_norm": 8.341618495245067, + "learning_rate": 7.726215786439253e-06, + "loss": 0.6683, + "step": 12446 + }, + { + "epoch": 1.01, + "grad_norm": 3.9105637151793555, + "learning_rate": 7.7258482420026e-06, + "loss": 0.8218, + "step": 12447 + }, + { + "epoch": 1.01, + "grad_norm": 4.898789746602815, + "learning_rate": 7.725480676606465e-06, + "loss": 0.581, + "step": 12448 + }, + { + "epoch": 1.01, + "grad_norm": 2.4825356702267585, + "learning_rate": 7.725113090253673e-06, + "loss": 0.5684, + "step": 12449 + }, + { + "epoch": 1.01, + "grad_norm": 6.118383132410117, + "learning_rate": 7.724745482947055e-06, + "loss": 0.7814, + "step": 12450 + }, + { + "epoch": 1.01, + "grad_norm": 12.024728912950485, + "learning_rate": 7.724377854689436e-06, + "loss": 0.6322, + "step": 12451 + }, + { + "epoch": 1.01, + "grad_norm": 5.378213907319591, + "learning_rate": 7.724010205483639e-06, + "loss": 0.6503, + "step": 12452 + }, + { + "epoch": 1.01, + "grad_norm": 2.908838228926188, + "learning_rate": 7.723642535332493e-06, + "loss": 0.5902, + "step": 12453 + }, + { + "epoch": 1.01, + "grad_norm": 2.7925478097847245, + "learning_rate": 7.72327484423883e-06, + "loss": 0.564, + "step": 12454 + }, + { + "epoch": 1.01, + "grad_norm": 3.219172884101676, + "learning_rate": 7.72290713220547e-06, + "loss": 0.5676, + "step": 12455 + }, + { + "epoch": 1.01, + "grad_norm": 4.325319515868642, + "learning_rate": 7.722539399235242e-06, + "loss": 0.7188, + "step": 12456 + }, + { + "epoch": 1.01, + "grad_norm": 3.1005123523180393, + "learning_rate": 7.722171645330978e-06, + "loss": 0.5075, + "step": 12457 + }, + { + "epoch": 1.01, + "grad_norm": 51.844239775420874, + "learning_rate": 7.721803870495502e-06, + "loss": 0.667, + "step": 12458 + }, + { + "epoch": 1.01, + "grad_norm": 3.614372489244219, + "learning_rate": 7.72143607473164e-06, + "loss": 0.7487, + "step": 12459 + }, + { + "epoch": 1.01, + "grad_norm": 4.729110265982259, + "learning_rate": 7.721068258042227e-06, + "loss": 0.5708, + "step": 12460 + }, + { + "epoch": 1.01, + "grad_norm": 9.235611099044382, + "learning_rate": 7.720700420430083e-06, + "loss": 0.7087, + "step": 12461 + }, + { + "epoch": 1.01, + "grad_norm": 2.980037951333018, + "learning_rate": 7.72033256189804e-06, + "loss": 0.6202, + "step": 12462 + }, + { + "epoch": 1.01, + "grad_norm": 4.610652505152559, + "learning_rate": 7.719964682448927e-06, + "loss": 0.6729, + "step": 12463 + }, + { + "epoch": 1.01, + "grad_norm": 4.203383173467424, + "learning_rate": 7.719596782085575e-06, + "loss": 0.6963, + "step": 12464 + }, + { + "epoch": 1.01, + "grad_norm": 5.938864009496398, + "learning_rate": 7.719228860810806e-06, + "loss": 0.6215, + "step": 12465 + }, + { + "epoch": 1.01, + "grad_norm": 4.42999822004694, + "learning_rate": 7.718860918627456e-06, + "loss": 0.4941, + "step": 12466 + }, + { + "epoch": 1.01, + "grad_norm": 3.790169057616943, + "learning_rate": 7.718492955538351e-06, + "loss": 0.6048, + "step": 12467 + }, + { + "epoch": 1.01, + "grad_norm": 5.252885183751117, + "learning_rate": 7.718124971546318e-06, + "loss": 0.6138, + "step": 12468 + }, + { + "epoch": 1.01, + "grad_norm": 3.0266282733554575, + "learning_rate": 7.717756966654193e-06, + "loss": 0.6817, + "step": 12469 + }, + { + "epoch": 1.01, + "grad_norm": 2.8871540398253113, + "learning_rate": 7.717388940864801e-06, + "loss": 0.6417, + "step": 12470 + }, + { + "epoch": 1.01, + "grad_norm": 7.537664204470688, + "learning_rate": 7.717020894180972e-06, + "loss": 0.5406, + "step": 12471 + }, + { + "epoch": 1.01, + "grad_norm": 3.0147322826724676, + "learning_rate": 7.716652826605535e-06, + "loss": 0.6454, + "step": 12472 + }, + { + "epoch": 1.01, + "grad_norm": 6.485109186116596, + "learning_rate": 7.716284738141325e-06, + "loss": 0.5993, + "step": 12473 + }, + { + "epoch": 1.01, + "grad_norm": 4.197615314683759, + "learning_rate": 7.715916628791165e-06, + "loss": 0.5768, + "step": 12474 + }, + { + "epoch": 1.01, + "grad_norm": 3.6062381656100704, + "learning_rate": 7.715548498557893e-06, + "loss": 0.6595, + "step": 12475 + }, + { + "epoch": 1.01, + "grad_norm": 2.761853757796361, + "learning_rate": 7.715180347444333e-06, + "loss": 0.6605, + "step": 12476 + }, + { + "epoch": 1.01, + "grad_norm": 3.711674679767731, + "learning_rate": 7.714812175453321e-06, + "loss": 0.7607, + "step": 12477 + }, + { + "epoch": 1.01, + "grad_norm": 5.940281561281353, + "learning_rate": 7.714443982587685e-06, + "loss": 0.578, + "step": 12478 + }, + { + "epoch": 1.01, + "grad_norm": 6.8326576933770875, + "learning_rate": 7.714075768850257e-06, + "loss": 0.5773, + "step": 12479 + }, + { + "epoch": 1.01, + "grad_norm": 5.306155783080045, + "learning_rate": 7.713707534243868e-06, + "loss": 0.7211, + "step": 12480 + }, + { + "epoch": 1.01, + "grad_norm": 4.158634480674293, + "learning_rate": 7.71333927877135e-06, + "loss": 0.6481, + "step": 12481 + }, + { + "epoch": 1.01, + "grad_norm": 2.843609143817566, + "learning_rate": 7.712971002435533e-06, + "loss": 0.633, + "step": 12482 + }, + { + "epoch": 1.01, + "grad_norm": 3.867345166845518, + "learning_rate": 7.712602705239249e-06, + "loss": 0.58, + "step": 12483 + }, + { + "epoch": 1.01, + "grad_norm": 3.5395612668829197, + "learning_rate": 7.712234387185333e-06, + "loss": 0.6477, + "step": 12484 + }, + { + "epoch": 1.01, + "grad_norm": 3.309771716147315, + "learning_rate": 7.711866048276614e-06, + "loss": 0.611, + "step": 12485 + }, + { + "epoch": 1.01, + "grad_norm": 3.7774767869502615, + "learning_rate": 7.711497688515926e-06, + "loss": 0.6026, + "step": 12486 + }, + { + "epoch": 1.01, + "grad_norm": 2.6896578225234253, + "learning_rate": 7.711129307906098e-06, + "loss": 0.8223, + "step": 12487 + }, + { + "epoch": 1.01, + "grad_norm": 3.5091284485037595, + "learning_rate": 7.710760906449967e-06, + "loss": 0.6385, + "step": 12488 + }, + { + "epoch": 1.01, + "grad_norm": 3.2257552192719334, + "learning_rate": 7.710392484150361e-06, + "loss": 0.667, + "step": 12489 + }, + { + "epoch": 1.01, + "grad_norm": 3.2266761286620094, + "learning_rate": 7.71002404101012e-06, + "loss": 0.5546, + "step": 12490 + }, + { + "epoch": 1.01, + "grad_norm": 2.960008109392364, + "learning_rate": 7.70965557703207e-06, + "loss": 0.4834, + "step": 12491 + }, + { + "epoch": 1.01, + "grad_norm": 2.8925774836362432, + "learning_rate": 7.709287092219045e-06, + "loss": 0.5245, + "step": 12492 + }, + { + "epoch": 1.01, + "grad_norm": 12.788409531014182, + "learning_rate": 7.708918586573881e-06, + "loss": 0.729, + "step": 12493 + }, + { + "epoch": 1.01, + "grad_norm": 3.896006725309682, + "learning_rate": 7.708550060099411e-06, + "loss": 0.6387, + "step": 12494 + }, + { + "epoch": 1.01, + "grad_norm": 11.052491606860372, + "learning_rate": 7.708181512798467e-06, + "loss": 0.5589, + "step": 12495 + }, + { + "epoch": 1.01, + "grad_norm": 5.239690051420297, + "learning_rate": 7.707812944673886e-06, + "loss": 0.7396, + "step": 12496 + }, + { + "epoch": 1.02, + "grad_norm": 3.4831620878914693, + "learning_rate": 7.7074443557285e-06, + "loss": 0.593, + "step": 12497 + }, + { + "epoch": 1.02, + "grad_norm": 4.379393889441705, + "learning_rate": 7.70707574596514e-06, + "loss": 0.591, + "step": 12498 + }, + { + "epoch": 1.02, + "grad_norm": 2.7208026474898066, + "learning_rate": 7.706707115386648e-06, + "loss": 0.6065, + "step": 12499 + }, + { + "epoch": 1.02, + "grad_norm": 4.934889838350088, + "learning_rate": 7.70633846399585e-06, + "loss": 0.7773, + "step": 12500 + }, + { + "epoch": 1.02, + "grad_norm": 2.955300990181328, + "learning_rate": 7.705969791795585e-06, + "loss": 0.6696, + "step": 12501 + }, + { + "epoch": 1.02, + "grad_norm": 3.3891291853942334, + "learning_rate": 7.70560109878869e-06, + "loss": 0.6165, + "step": 12502 + }, + { + "epoch": 1.02, + "grad_norm": 4.463789616446766, + "learning_rate": 7.705232384977994e-06, + "loss": 0.7136, + "step": 12503 + }, + { + "epoch": 1.02, + "grad_norm": 4.404550775750048, + "learning_rate": 7.704863650366337e-06, + "loss": 0.7982, + "step": 12504 + }, + { + "epoch": 1.02, + "grad_norm": 4.408852042468807, + "learning_rate": 7.704494894956551e-06, + "loss": 0.7204, + "step": 12505 + }, + { + "epoch": 1.02, + "grad_norm": 3.2997349924763775, + "learning_rate": 7.704126118751476e-06, + "loss": 0.7004, + "step": 12506 + }, + { + "epoch": 1.02, + "grad_norm": 5.109205518581624, + "learning_rate": 7.703757321753942e-06, + "loss": 0.5842, + "step": 12507 + }, + { + "epoch": 1.02, + "grad_norm": 3.907151286069359, + "learning_rate": 7.703388503966787e-06, + "loss": 0.6245, + "step": 12508 + }, + { + "epoch": 1.02, + "grad_norm": 3.270189421701857, + "learning_rate": 7.703019665392848e-06, + "loss": 0.7375, + "step": 12509 + }, + { + "epoch": 1.02, + "grad_norm": 3.6054525953614935, + "learning_rate": 7.702650806034962e-06, + "loss": 0.7707, + "step": 12510 + }, + { + "epoch": 1.02, + "grad_norm": 5.277238984730477, + "learning_rate": 7.70228192589596e-06, + "loss": 0.5775, + "step": 12511 + }, + { + "epoch": 1.02, + "grad_norm": 2.861479295290755, + "learning_rate": 7.701913024978684e-06, + "loss": 0.64, + "step": 12512 + }, + { + "epoch": 1.02, + "grad_norm": 4.683086539504576, + "learning_rate": 7.701544103285967e-06, + "loss": 0.6542, + "step": 12513 + }, + { + "epoch": 1.02, + "grad_norm": 3.886111698955071, + "learning_rate": 7.701175160820648e-06, + "loss": 0.6161, + "step": 12514 + }, + { + "epoch": 1.02, + "grad_norm": 4.335351822417811, + "learning_rate": 7.700806197585564e-06, + "loss": 0.6632, + "step": 12515 + }, + { + "epoch": 1.02, + "grad_norm": 3.770021228495931, + "learning_rate": 7.70043721358355e-06, + "loss": 0.5825, + "step": 12516 + }, + { + "epoch": 1.02, + "grad_norm": 4.109337837525772, + "learning_rate": 7.700068208817444e-06, + "loss": 0.6602, + "step": 12517 + }, + { + "epoch": 1.02, + "grad_norm": 3.7824469160379546, + "learning_rate": 7.699699183290084e-06, + "loss": 0.5915, + "step": 12518 + }, + { + "epoch": 1.02, + "grad_norm": 3.3741714656917883, + "learning_rate": 7.699330137004306e-06, + "loss": 0.6083, + "step": 12519 + }, + { + "epoch": 1.02, + "grad_norm": 4.113577975332614, + "learning_rate": 7.69896106996295e-06, + "loss": 0.6592, + "step": 12520 + }, + { + "epoch": 1.02, + "grad_norm": 3.284420853677757, + "learning_rate": 7.698591982168851e-06, + "loss": 0.6026, + "step": 12521 + }, + { + "epoch": 1.02, + "grad_norm": 3.7657081335262608, + "learning_rate": 7.698222873624847e-06, + "loss": 0.6015, + "step": 12522 + }, + { + "epoch": 1.02, + "grad_norm": 2.533041664474653, + "learning_rate": 7.697853744333781e-06, + "loss": 0.5321, + "step": 12523 + }, + { + "epoch": 1.02, + "grad_norm": 5.874571038245419, + "learning_rate": 7.697484594298485e-06, + "loss": 0.734, + "step": 12524 + }, + { + "epoch": 1.02, + "grad_norm": 3.0987508486379047, + "learning_rate": 7.697115423521802e-06, + "loss": 0.645, + "step": 12525 + }, + { + "epoch": 1.02, + "grad_norm": 3.9874398749366544, + "learning_rate": 7.696746232006569e-06, + "loss": 0.6241, + "step": 12526 + }, + { + "epoch": 1.02, + "grad_norm": 3.4565971877879744, + "learning_rate": 7.696377019755624e-06, + "loss": 0.6687, + "step": 12527 + }, + { + "epoch": 1.02, + "grad_norm": 5.557081188894988, + "learning_rate": 7.696007786771806e-06, + "loss": 0.5783, + "step": 12528 + }, + { + "epoch": 1.02, + "grad_norm": 2.8604984111035843, + "learning_rate": 7.695638533057956e-06, + "loss": 0.6064, + "step": 12529 + }, + { + "epoch": 1.02, + "grad_norm": 3.1284752666066225, + "learning_rate": 7.69526925861691e-06, + "loss": 0.6775, + "step": 12530 + }, + { + "epoch": 1.02, + "grad_norm": 3.1044202777037664, + "learning_rate": 7.694899963451512e-06, + "loss": 0.5536, + "step": 12531 + }, + { + "epoch": 1.02, + "grad_norm": 3.393868298344729, + "learning_rate": 7.694530647564597e-06, + "loss": 0.7347, + "step": 12532 + }, + { + "epoch": 1.02, + "grad_norm": 4.015780076086292, + "learning_rate": 7.694161310959007e-06, + "loss": 0.6135, + "step": 12533 + }, + { + "epoch": 1.02, + "grad_norm": 4.761142339762222, + "learning_rate": 7.693791953637584e-06, + "loss": 0.5762, + "step": 12534 + }, + { + "epoch": 1.02, + "grad_norm": 3.5468236845590657, + "learning_rate": 7.693422575603162e-06, + "loss": 0.7304, + "step": 12535 + }, + { + "epoch": 1.02, + "grad_norm": 17.37870368059202, + "learning_rate": 7.693053176858586e-06, + "loss": 0.6916, + "step": 12536 + }, + { + "epoch": 1.02, + "grad_norm": 4.964013871202618, + "learning_rate": 7.692683757406696e-06, + "loss": 0.5789, + "step": 12537 + }, + { + "epoch": 1.02, + "grad_norm": 2.3521395362245916, + "learning_rate": 7.692314317250331e-06, + "loss": 0.6145, + "step": 12538 + }, + { + "epoch": 1.02, + "grad_norm": 2.566645138834135, + "learning_rate": 7.691944856392333e-06, + "loss": 0.5797, + "step": 12539 + }, + { + "epoch": 1.02, + "grad_norm": 2.6899042136466536, + "learning_rate": 7.69157537483554e-06, + "loss": 0.6343, + "step": 12540 + }, + { + "epoch": 1.02, + "grad_norm": 4.6904359585547155, + "learning_rate": 7.691205872582797e-06, + "loss": 0.6079, + "step": 12541 + }, + { + "epoch": 1.02, + "grad_norm": 3.19248809338908, + "learning_rate": 7.690836349636945e-06, + "loss": 0.5855, + "step": 12542 + }, + { + "epoch": 1.02, + "grad_norm": 4.04093009566446, + "learning_rate": 7.690466806000822e-06, + "loss": 0.833, + "step": 12543 + }, + { + "epoch": 1.02, + "grad_norm": 6.493340837376103, + "learning_rate": 7.69009724167727e-06, + "loss": 0.6643, + "step": 12544 + }, + { + "epoch": 1.02, + "grad_norm": 11.094288996913862, + "learning_rate": 7.689727656669132e-06, + "loss": 0.5741, + "step": 12545 + }, + { + "epoch": 1.02, + "grad_norm": 7.199239490640086, + "learning_rate": 7.689358050979252e-06, + "loss": 0.4498, + "step": 12546 + }, + { + "epoch": 1.02, + "grad_norm": 9.338233991109362, + "learning_rate": 7.688988424610468e-06, + "loss": 0.597, + "step": 12547 + }, + { + "epoch": 1.02, + "grad_norm": 3.031397304346165, + "learning_rate": 7.688618777565623e-06, + "loss": 0.5712, + "step": 12548 + }, + { + "epoch": 1.02, + "grad_norm": 3.3270259212646858, + "learning_rate": 7.68824910984756e-06, + "loss": 0.7088, + "step": 12549 + }, + { + "epoch": 1.02, + "grad_norm": 5.361687459090794, + "learning_rate": 7.687879421459123e-06, + "loss": 0.8544, + "step": 12550 + }, + { + "epoch": 1.02, + "grad_norm": 2.5667657686767544, + "learning_rate": 7.687509712403152e-06, + "loss": 0.6679, + "step": 12551 + }, + { + "epoch": 1.02, + "grad_norm": 55.11664932699268, + "learning_rate": 7.68713998268249e-06, + "loss": 0.6991, + "step": 12552 + }, + { + "epoch": 1.02, + "grad_norm": 3.755015059995825, + "learning_rate": 7.686770232299982e-06, + "loss": 0.636, + "step": 12553 + }, + { + "epoch": 1.02, + "grad_norm": 4.325059288928103, + "learning_rate": 7.68640046125847e-06, + "loss": 0.7616, + "step": 12554 + }, + { + "epoch": 1.02, + "grad_norm": 4.356956566199944, + "learning_rate": 7.686030669560796e-06, + "loss": 0.5423, + "step": 12555 + }, + { + "epoch": 1.02, + "grad_norm": 3.8784764522094157, + "learning_rate": 7.685660857209805e-06, + "loss": 0.6149, + "step": 12556 + }, + { + "epoch": 1.02, + "grad_norm": 2.6067373594337373, + "learning_rate": 7.685291024208338e-06, + "loss": 0.5454, + "step": 12557 + }, + { + "epoch": 1.02, + "grad_norm": 4.015451592080818, + "learning_rate": 7.684921170559243e-06, + "loss": 0.5329, + "step": 12558 + }, + { + "epoch": 1.02, + "grad_norm": 5.460954957993479, + "learning_rate": 7.68455129626536e-06, + "loss": 0.5194, + "step": 12559 + }, + { + "epoch": 1.02, + "grad_norm": 17.44754904089569, + "learning_rate": 7.684181401329535e-06, + "loss": 0.5676, + "step": 12560 + }, + { + "epoch": 1.02, + "grad_norm": 4.959332331093466, + "learning_rate": 7.68381148575461e-06, + "loss": 0.6397, + "step": 12561 + }, + { + "epoch": 1.02, + "grad_norm": 2.9243296428501235, + "learning_rate": 7.683441549543435e-06, + "loss": 0.5269, + "step": 12562 + }, + { + "epoch": 1.02, + "grad_norm": 5.801959713766648, + "learning_rate": 7.683071592698847e-06, + "loss": 0.6858, + "step": 12563 + }, + { + "epoch": 1.02, + "grad_norm": 45.56470308297, + "learning_rate": 7.682701615223695e-06, + "loss": 0.7867, + "step": 12564 + }, + { + "epoch": 1.02, + "grad_norm": 3.7277751535654393, + "learning_rate": 7.682331617120823e-06, + "loss": 0.7609, + "step": 12565 + }, + { + "epoch": 1.02, + "grad_norm": 7.314442079940626, + "learning_rate": 7.681961598393077e-06, + "loss": 0.7517, + "step": 12566 + }, + { + "epoch": 1.02, + "grad_norm": 16.591712719960395, + "learning_rate": 7.6815915590433e-06, + "loss": 0.6199, + "step": 12567 + }, + { + "epoch": 1.02, + "grad_norm": 4.288676630101633, + "learning_rate": 7.681221499074338e-06, + "loss": 0.5563, + "step": 12568 + }, + { + "epoch": 1.02, + "grad_norm": 4.67781462152573, + "learning_rate": 7.680851418489037e-06, + "loss": 0.7763, + "step": 12569 + }, + { + "epoch": 1.02, + "grad_norm": 3.2820391513501885, + "learning_rate": 7.680481317290243e-06, + "loss": 0.7015, + "step": 12570 + }, + { + "epoch": 1.02, + "grad_norm": 4.037068985052755, + "learning_rate": 7.680111195480801e-06, + "loss": 0.7217, + "step": 12571 + }, + { + "epoch": 1.02, + "grad_norm": 5.492997210151447, + "learning_rate": 7.679741053063557e-06, + "loss": 0.5099, + "step": 12572 + }, + { + "epoch": 1.02, + "grad_norm": 2.7877565694175805, + "learning_rate": 7.679370890041358e-06, + "loss": 0.5245, + "step": 12573 + }, + { + "epoch": 1.02, + "grad_norm": 5.502229127065097, + "learning_rate": 7.679000706417049e-06, + "loss": 0.5252, + "step": 12574 + }, + { + "epoch": 1.02, + "grad_norm": 2.6868943961811547, + "learning_rate": 7.678630502193476e-06, + "loss": 0.8192, + "step": 12575 + }, + { + "epoch": 1.02, + "grad_norm": 5.155918508588384, + "learning_rate": 7.678260277373488e-06, + "loss": 0.6738, + "step": 12576 + }, + { + "epoch": 1.02, + "grad_norm": 8.159239974960599, + "learning_rate": 7.677890031959928e-06, + "loss": 0.6146, + "step": 12577 + }, + { + "epoch": 1.02, + "grad_norm": 4.63249666100707, + "learning_rate": 7.677519765955647e-06, + "loss": 0.6395, + "step": 12578 + }, + { + "epoch": 1.02, + "grad_norm": 4.086485087761686, + "learning_rate": 7.677149479363487e-06, + "loss": 0.5852, + "step": 12579 + }, + { + "epoch": 1.02, + "grad_norm": 3.488293765484528, + "learning_rate": 7.6767791721863e-06, + "loss": 0.5882, + "step": 12580 + }, + { + "epoch": 1.02, + "grad_norm": 6.466325233015778, + "learning_rate": 7.676408844426934e-06, + "loss": 0.6721, + "step": 12581 + }, + { + "epoch": 1.02, + "grad_norm": 2.6952983509431783, + "learning_rate": 7.676038496088232e-06, + "loss": 0.4849, + "step": 12582 + }, + { + "epoch": 1.02, + "grad_norm": 3.0064643513799, + "learning_rate": 7.675668127173043e-06, + "loss": 0.6906, + "step": 12583 + }, + { + "epoch": 1.02, + "grad_norm": 3.7211621564519684, + "learning_rate": 7.675297737684217e-06, + "loss": 0.6305, + "step": 12584 + }, + { + "epoch": 1.02, + "grad_norm": 7.0064852257260775, + "learning_rate": 7.6749273276246e-06, + "loss": 0.5429, + "step": 12585 + }, + { + "epoch": 1.02, + "grad_norm": 3.0022255296393427, + "learning_rate": 7.674556896997041e-06, + "loss": 0.5432, + "step": 12586 + }, + { + "epoch": 1.02, + "grad_norm": 4.644900194514009, + "learning_rate": 7.674186445804387e-06, + "loss": 0.763, + "step": 12587 + }, + { + "epoch": 1.02, + "grad_norm": 4.122263336429523, + "learning_rate": 7.673815974049489e-06, + "loss": 0.6086, + "step": 12588 + }, + { + "epoch": 1.02, + "grad_norm": 4.252663133854405, + "learning_rate": 7.673445481735191e-06, + "loss": 0.697, + "step": 12589 + }, + { + "epoch": 1.02, + "grad_norm": 2.6400001535358597, + "learning_rate": 7.673074968864347e-06, + "loss": 0.5344, + "step": 12590 + }, + { + "epoch": 1.02, + "grad_norm": 4.686475514425044, + "learning_rate": 7.672704435439805e-06, + "loss": 0.7779, + "step": 12591 + }, + { + "epoch": 1.02, + "grad_norm": 3.097133767992966, + "learning_rate": 7.672333881464411e-06, + "loss": 0.5638, + "step": 12592 + }, + { + "epoch": 1.02, + "grad_norm": 3.638546275863654, + "learning_rate": 7.671963306941017e-06, + "loss": 0.6188, + "step": 12593 + }, + { + "epoch": 1.02, + "grad_norm": 4.509845926533034, + "learning_rate": 7.67159271187247e-06, + "loss": 0.7021, + "step": 12594 + }, + { + "epoch": 1.02, + "grad_norm": 2.6588582068874804, + "learning_rate": 7.671222096261624e-06, + "loss": 0.612, + "step": 12595 + }, + { + "epoch": 1.02, + "grad_norm": 4.517744275470992, + "learning_rate": 7.670851460111323e-06, + "loss": 0.6097, + "step": 12596 + }, + { + "epoch": 1.02, + "grad_norm": 5.041977547756016, + "learning_rate": 7.670480803424422e-06, + "loss": 0.6301, + "step": 12597 + }, + { + "epoch": 1.02, + "grad_norm": 3.2306451631836945, + "learning_rate": 7.670110126203767e-06, + "loss": 0.6299, + "step": 12598 + }, + { + "epoch": 1.02, + "grad_norm": 5.986142420766472, + "learning_rate": 7.669739428452211e-06, + "loss": 0.6896, + "step": 12599 + }, + { + "epoch": 1.02, + "grad_norm": 4.6298990483191425, + "learning_rate": 7.669368710172603e-06, + "loss": 0.6252, + "step": 12600 + }, + { + "epoch": 1.02, + "grad_norm": 3.534689909909978, + "learning_rate": 7.668997971367793e-06, + "loss": 0.4609, + "step": 12601 + }, + { + "epoch": 1.02, + "grad_norm": 2.71198053992389, + "learning_rate": 7.668627212040633e-06, + "loss": 0.7169, + "step": 12602 + }, + { + "epoch": 1.02, + "grad_norm": 3.741814230254302, + "learning_rate": 7.668256432193974e-06, + "loss": 0.7502, + "step": 12603 + }, + { + "epoch": 1.02, + "grad_norm": 6.4266329533502455, + "learning_rate": 7.667885631830665e-06, + "loss": 0.6781, + "step": 12604 + }, + { + "epoch": 1.02, + "grad_norm": 3.070013793096365, + "learning_rate": 7.66751481095356e-06, + "loss": 0.6919, + "step": 12605 + }, + { + "epoch": 1.02, + "grad_norm": 2.7622846635810046, + "learning_rate": 7.667143969565507e-06, + "loss": 0.642, + "step": 12606 + }, + { + "epoch": 1.02, + "grad_norm": 4.194138834810723, + "learning_rate": 7.66677310766936e-06, + "loss": 0.6854, + "step": 12607 + }, + { + "epoch": 1.02, + "grad_norm": 3.7363510699447517, + "learning_rate": 7.66640222526797e-06, + "loss": 0.6769, + "step": 12608 + }, + { + "epoch": 1.02, + "grad_norm": 12.241112970295672, + "learning_rate": 7.666031322364188e-06, + "loss": 0.5354, + "step": 12609 + }, + { + "epoch": 1.02, + "grad_norm": 3.454955727844363, + "learning_rate": 7.665660398960867e-06, + "loss": 0.6226, + "step": 12610 + }, + { + "epoch": 1.02, + "grad_norm": 4.291899800526837, + "learning_rate": 7.665289455060857e-06, + "loss": 0.5728, + "step": 12611 + }, + { + "epoch": 1.02, + "grad_norm": 2.4431201295869376, + "learning_rate": 7.664918490667016e-06, + "loss": 0.5855, + "step": 12612 + }, + { + "epoch": 1.02, + "grad_norm": 2.85912174151412, + "learning_rate": 7.664547505782187e-06, + "loss": 0.6685, + "step": 12613 + }, + { + "epoch": 1.02, + "grad_norm": 6.108523369578036, + "learning_rate": 7.664176500409231e-06, + "loss": 0.5269, + "step": 12614 + }, + { + "epoch": 1.02, + "grad_norm": 3.8746210000527834, + "learning_rate": 7.663805474550998e-06, + "loss": 0.5418, + "step": 12615 + }, + { + "epoch": 1.02, + "grad_norm": 3.2637285308097974, + "learning_rate": 7.663434428210339e-06, + "loss": 0.6343, + "step": 12616 + }, + { + "epoch": 1.02, + "grad_norm": 3.7334445523450617, + "learning_rate": 7.663063361390109e-06, + "loss": 0.7649, + "step": 12617 + }, + { + "epoch": 1.02, + "grad_norm": 3.1571098780109486, + "learning_rate": 7.66269227409316e-06, + "loss": 0.5962, + "step": 12618 + }, + { + "epoch": 1.02, + "grad_norm": 2.7328510710828033, + "learning_rate": 7.662321166322346e-06, + "loss": 0.6699, + "step": 12619 + }, + { + "epoch": 1.02, + "grad_norm": 4.258015310199422, + "learning_rate": 7.661950038080521e-06, + "loss": 0.6852, + "step": 12620 + }, + { + "epoch": 1.03, + "grad_norm": 4.640945031137924, + "learning_rate": 7.661578889370538e-06, + "loss": 0.6567, + "step": 12621 + }, + { + "epoch": 1.03, + "grad_norm": 7.609780829004056, + "learning_rate": 7.66120772019525e-06, + "loss": 0.6189, + "step": 12622 + }, + { + "epoch": 1.03, + "grad_norm": 3.3958081065183845, + "learning_rate": 7.660836530557514e-06, + "loss": 0.5997, + "step": 12623 + }, + { + "epoch": 1.03, + "grad_norm": 3.1015283651929795, + "learning_rate": 7.66046532046018e-06, + "loss": 0.55, + "step": 12624 + }, + { + "epoch": 1.03, + "grad_norm": 3.316638205052729, + "learning_rate": 7.660094089906105e-06, + "loss": 0.719, + "step": 12625 + }, + { + "epoch": 1.03, + "grad_norm": 2.9895636026662147, + "learning_rate": 7.659722838898144e-06, + "loss": 0.6164, + "step": 12626 + }, + { + "epoch": 1.03, + "grad_norm": 4.738051178839015, + "learning_rate": 7.65935156743915e-06, + "loss": 0.6487, + "step": 12627 + }, + { + "epoch": 1.03, + "grad_norm": 3.3178545810205744, + "learning_rate": 7.658980275531977e-06, + "loss": 0.5264, + "step": 12628 + }, + { + "epoch": 1.03, + "grad_norm": 3.7736308503544413, + "learning_rate": 7.65860896317948e-06, + "loss": 0.7128, + "step": 12629 + }, + { + "epoch": 1.03, + "grad_norm": 2.8919043028994262, + "learning_rate": 7.658237630384518e-06, + "loss": 0.5955, + "step": 12630 + }, + { + "epoch": 1.03, + "grad_norm": 3.7289010251378465, + "learning_rate": 7.657866277149943e-06, + "loss": 0.5479, + "step": 12631 + }, + { + "epoch": 1.03, + "grad_norm": 3.228984425810598, + "learning_rate": 7.65749490347861e-06, + "loss": 0.713, + "step": 12632 + }, + { + "epoch": 1.03, + "grad_norm": 2.5258839598154768, + "learning_rate": 7.657123509373376e-06, + "loss": 0.6011, + "step": 12633 + }, + { + "epoch": 1.03, + "grad_norm": 4.030415988029781, + "learning_rate": 7.656752094837097e-06, + "loss": 0.6751, + "step": 12634 + }, + { + "epoch": 1.03, + "grad_norm": 3.7202505144906794, + "learning_rate": 7.656380659872627e-06, + "loss": 0.6943, + "step": 12635 + }, + { + "epoch": 1.03, + "grad_norm": 4.641821520473564, + "learning_rate": 7.656009204482822e-06, + "loss": 0.7023, + "step": 12636 + }, + { + "epoch": 1.03, + "grad_norm": 4.1640511210328155, + "learning_rate": 7.65563772867054e-06, + "loss": 0.6807, + "step": 12637 + }, + { + "epoch": 1.03, + "grad_norm": 3.3880663182288764, + "learning_rate": 7.655266232438636e-06, + "loss": 0.652, + "step": 12638 + }, + { + "epoch": 1.03, + "grad_norm": 7.361598382079345, + "learning_rate": 7.654894715789968e-06, + "loss": 0.6858, + "step": 12639 + }, + { + "epoch": 1.03, + "grad_norm": 4.264258619611187, + "learning_rate": 7.654523178727391e-06, + "loss": 0.569, + "step": 12640 + }, + { + "epoch": 1.03, + "grad_norm": 2.9905351205532913, + "learning_rate": 7.654151621253762e-06, + "loss": 0.6036, + "step": 12641 + }, + { + "epoch": 1.03, + "grad_norm": 2.457708442124658, + "learning_rate": 7.653780043371939e-06, + "loss": 0.583, + "step": 12642 + }, + { + "epoch": 1.03, + "grad_norm": 4.0053163690024505, + "learning_rate": 7.653408445084779e-06, + "loss": 0.447, + "step": 12643 + }, + { + "epoch": 1.03, + "grad_norm": 5.426722192810419, + "learning_rate": 7.653036826395138e-06, + "loss": 0.5981, + "step": 12644 + }, + { + "epoch": 1.03, + "grad_norm": 5.259307618019066, + "learning_rate": 7.652665187305874e-06, + "loss": 0.6231, + "step": 12645 + }, + { + "epoch": 1.03, + "grad_norm": 2.849345243363587, + "learning_rate": 7.652293527819845e-06, + "loss": 0.6454, + "step": 12646 + }, + { + "epoch": 1.03, + "grad_norm": 6.475845877783626, + "learning_rate": 7.651921847939909e-06, + "loss": 0.5483, + "step": 12647 + }, + { + "epoch": 1.03, + "grad_norm": 4.911356482245304, + "learning_rate": 7.651550147668925e-06, + "loss": 0.599, + "step": 12648 + }, + { + "epoch": 1.03, + "grad_norm": 3.822108633375212, + "learning_rate": 7.651178427009746e-06, + "loss": 0.4859, + "step": 12649 + }, + { + "epoch": 1.03, + "grad_norm": 3.208152452620052, + "learning_rate": 7.650806685965237e-06, + "loss": 0.6283, + "step": 12650 + }, + { + "epoch": 1.03, + "grad_norm": 5.013681535076231, + "learning_rate": 7.650434924538253e-06, + "loss": 0.5204, + "step": 12651 + }, + { + "epoch": 1.03, + "grad_norm": 4.195904411173517, + "learning_rate": 7.650063142731652e-06, + "loss": 0.5806, + "step": 12652 + }, + { + "epoch": 1.03, + "grad_norm": 3.505536869019851, + "learning_rate": 7.649691340548291e-06, + "loss": 0.5668, + "step": 12653 + }, + { + "epoch": 1.03, + "grad_norm": 5.851559676617221, + "learning_rate": 7.649319517991034e-06, + "loss": 0.6395, + "step": 12654 + }, + { + "epoch": 1.03, + "grad_norm": 2.2066569061475505, + "learning_rate": 7.648947675062737e-06, + "loss": 0.5282, + "step": 12655 + }, + { + "epoch": 1.03, + "grad_norm": 2.554987259007454, + "learning_rate": 7.64857581176626e-06, + "loss": 0.4915, + "step": 12656 + }, + { + "epoch": 1.03, + "grad_norm": 3.176595981309338, + "learning_rate": 7.648203928104458e-06, + "loss": 0.6313, + "step": 12657 + }, + { + "epoch": 1.03, + "grad_norm": 4.662850864986201, + "learning_rate": 7.647832024080197e-06, + "loss": 0.6297, + "step": 12658 + }, + { + "epoch": 1.03, + "grad_norm": 3.631005298730444, + "learning_rate": 7.647460099696333e-06, + "loss": 0.7317, + "step": 12659 + }, + { + "epoch": 1.03, + "grad_norm": 3.540099823594697, + "learning_rate": 7.647088154955728e-06, + "loss": 0.6325, + "step": 12660 + }, + { + "epoch": 1.03, + "grad_norm": 3.250794334498882, + "learning_rate": 7.64671618986124e-06, + "loss": 0.5916, + "step": 12661 + }, + { + "epoch": 1.03, + "grad_norm": 4.434718213820831, + "learning_rate": 7.646344204415729e-06, + "loss": 0.5714, + "step": 12662 + }, + { + "epoch": 1.03, + "grad_norm": 2.2102645742289724, + "learning_rate": 7.645972198622056e-06, + "loss": 0.5076, + "step": 12663 + }, + { + "epoch": 1.03, + "grad_norm": 3.392975242736115, + "learning_rate": 7.645600172483083e-06, + "loss": 0.6118, + "step": 12664 + }, + { + "epoch": 1.03, + "grad_norm": 3.918041500269964, + "learning_rate": 7.645228126001668e-06, + "loss": 0.617, + "step": 12665 + }, + { + "epoch": 1.03, + "grad_norm": 6.485057942493537, + "learning_rate": 7.644856059180669e-06, + "loss": 0.7169, + "step": 12666 + }, + { + "epoch": 1.03, + "grad_norm": 4.570162958975371, + "learning_rate": 7.644483972022955e-06, + "loss": 0.5793, + "step": 12667 + }, + { + "epoch": 1.03, + "grad_norm": 3.5811980945416915, + "learning_rate": 7.644111864531381e-06, + "loss": 0.6468, + "step": 12668 + }, + { + "epoch": 1.03, + "grad_norm": 4.537947090902917, + "learning_rate": 7.643739736708811e-06, + "loss": 0.621, + "step": 12669 + }, + { + "epoch": 1.03, + "grad_norm": 8.401402696966562, + "learning_rate": 7.6433675885581e-06, + "loss": 0.651, + "step": 12670 + }, + { + "epoch": 1.03, + "grad_norm": 3.166768705021124, + "learning_rate": 7.64299542008212e-06, + "loss": 0.6323, + "step": 12671 + }, + { + "epoch": 1.03, + "grad_norm": 3.4287667384016784, + "learning_rate": 7.642623231283725e-06, + "loss": 0.4893, + "step": 12672 + }, + { + "epoch": 1.03, + "grad_norm": 3.923935838757038, + "learning_rate": 7.64225102216578e-06, + "loss": 0.7357, + "step": 12673 + }, + { + "epoch": 1.03, + "grad_norm": 3.263290077962503, + "learning_rate": 7.641878792731146e-06, + "loss": 0.6786, + "step": 12674 + }, + { + "epoch": 1.03, + "grad_norm": 2.5469350279805347, + "learning_rate": 7.641506542982686e-06, + "loss": 0.6096, + "step": 12675 + }, + { + "epoch": 1.03, + "grad_norm": 4.7633702929307615, + "learning_rate": 7.641134272923259e-06, + "loss": 0.5758, + "step": 12676 + }, + { + "epoch": 1.03, + "grad_norm": 2.6467588717374784, + "learning_rate": 7.640761982555732e-06, + "loss": 0.6636, + "step": 12677 + }, + { + "epoch": 1.03, + "grad_norm": 3.5489589829612123, + "learning_rate": 7.640389671882963e-06, + "loss": 0.6997, + "step": 12678 + }, + { + "epoch": 1.03, + "grad_norm": 3.5096145876932643, + "learning_rate": 7.64001734090782e-06, + "loss": 0.6318, + "step": 12679 + }, + { + "epoch": 1.03, + "grad_norm": 3.8685306095150707, + "learning_rate": 7.63964498963316e-06, + "loss": 0.5047, + "step": 12680 + }, + { + "epoch": 1.03, + "grad_norm": 3.3489888692499936, + "learning_rate": 7.639272618061852e-06, + "loss": 0.7346, + "step": 12681 + }, + { + "epoch": 1.03, + "grad_norm": 2.8764381734466036, + "learning_rate": 7.638900226196756e-06, + "loss": 0.6137, + "step": 12682 + }, + { + "epoch": 1.03, + "grad_norm": 3.4309636151388476, + "learning_rate": 7.638527814040735e-06, + "loss": 0.5165, + "step": 12683 + }, + { + "epoch": 1.03, + "grad_norm": 4.079100724581614, + "learning_rate": 7.638155381596655e-06, + "loss": 0.5057, + "step": 12684 + }, + { + "epoch": 1.03, + "grad_norm": 3.8923304718753995, + "learning_rate": 7.637782928867376e-06, + "loss": 0.6078, + "step": 12685 + }, + { + "epoch": 1.03, + "grad_norm": 5.994973961507608, + "learning_rate": 7.637410455855764e-06, + "loss": 0.5187, + "step": 12686 + }, + { + "epoch": 1.03, + "grad_norm": 16.141474167517647, + "learning_rate": 7.637037962564683e-06, + "loss": 0.7251, + "step": 12687 + }, + { + "epoch": 1.03, + "grad_norm": 4.1994825318012845, + "learning_rate": 7.636665448996999e-06, + "loss": 0.6299, + "step": 12688 + }, + { + "epoch": 1.03, + "grad_norm": 5.567059959206812, + "learning_rate": 7.636292915155574e-06, + "loss": 0.6624, + "step": 12689 + }, + { + "epoch": 1.03, + "grad_norm": 5.098566248758252, + "learning_rate": 7.635920361043271e-06, + "loss": 0.5266, + "step": 12690 + }, + { + "epoch": 1.03, + "grad_norm": 4.666446824047482, + "learning_rate": 7.635547786662958e-06, + "loss": 0.6295, + "step": 12691 + }, + { + "epoch": 1.03, + "grad_norm": 2.521160221289315, + "learning_rate": 7.635175192017496e-06, + "loss": 0.637, + "step": 12692 + }, + { + "epoch": 1.03, + "grad_norm": 11.346480274163584, + "learning_rate": 7.634802577109755e-06, + "loss": 0.6331, + "step": 12693 + }, + { + "epoch": 1.03, + "grad_norm": 2.8889141559473956, + "learning_rate": 7.634429941942596e-06, + "loss": 0.5706, + "step": 12694 + }, + { + "epoch": 1.03, + "grad_norm": 15.643610117436824, + "learning_rate": 7.634057286518885e-06, + "loss": 0.5741, + "step": 12695 + }, + { + "epoch": 1.03, + "grad_norm": 6.189224929460995, + "learning_rate": 7.63368461084149e-06, + "loss": 0.4159, + "step": 12696 + }, + { + "epoch": 1.03, + "grad_norm": 3.606080223399179, + "learning_rate": 7.633311914913274e-06, + "loss": 0.6609, + "step": 12697 + }, + { + "epoch": 1.03, + "grad_norm": 2.957572907874164, + "learning_rate": 7.632939198737102e-06, + "loss": 0.5185, + "step": 12698 + }, + { + "epoch": 1.03, + "grad_norm": 13.065508946227473, + "learning_rate": 7.63256646231584e-06, + "loss": 0.65, + "step": 12699 + }, + { + "epoch": 1.03, + "grad_norm": 4.549873516079415, + "learning_rate": 7.632193705652358e-06, + "loss": 0.6508, + "step": 12700 + }, + { + "epoch": 1.03, + "grad_norm": 5.528590277276484, + "learning_rate": 7.631820928749517e-06, + "loss": 0.7976, + "step": 12701 + }, + { + "epoch": 1.03, + "grad_norm": 3.376794564510867, + "learning_rate": 7.631448131610188e-06, + "loss": 0.7881, + "step": 12702 + }, + { + "epoch": 1.03, + "grad_norm": 4.71868224284807, + "learning_rate": 7.631075314237233e-06, + "loss": 0.6213, + "step": 12703 + }, + { + "epoch": 1.03, + "grad_norm": 8.95824015129035, + "learning_rate": 7.630702476633522e-06, + "loss": 0.5107, + "step": 12704 + }, + { + "epoch": 1.03, + "grad_norm": 3.1156414482767962, + "learning_rate": 7.63032961880192e-06, + "loss": 0.6331, + "step": 12705 + }, + { + "epoch": 1.03, + "grad_norm": 4.209895955685698, + "learning_rate": 7.629956740745294e-06, + "loss": 0.6437, + "step": 12706 + }, + { + "epoch": 1.03, + "grad_norm": 8.740229868541386, + "learning_rate": 7.629583842466512e-06, + "loss": 0.5358, + "step": 12707 + }, + { + "epoch": 1.03, + "grad_norm": 3.7898023146714688, + "learning_rate": 7.629210923968443e-06, + "loss": 0.7315, + "step": 12708 + }, + { + "epoch": 1.03, + "grad_norm": 3.9090961923385388, + "learning_rate": 7.628837985253952e-06, + "loss": 0.7224, + "step": 12709 + }, + { + "epoch": 1.03, + "grad_norm": 14.424070680951075, + "learning_rate": 7.628465026325905e-06, + "loss": 0.6609, + "step": 12710 + }, + { + "epoch": 1.03, + "grad_norm": 3.978638724331431, + "learning_rate": 7.628092047187173e-06, + "loss": 0.5598, + "step": 12711 + }, + { + "epoch": 1.03, + "grad_norm": 2.8596370126196216, + "learning_rate": 7.627719047840622e-06, + "loss": 0.7251, + "step": 12712 + }, + { + "epoch": 1.03, + "grad_norm": 5.649291025963399, + "learning_rate": 7.627346028289121e-06, + "loss": 0.4565, + "step": 12713 + }, + { + "epoch": 1.03, + "grad_norm": 7.2442414412241725, + "learning_rate": 7.626972988535538e-06, + "loss": 0.5738, + "step": 12714 + }, + { + "epoch": 1.03, + "grad_norm": 5.078459145574682, + "learning_rate": 7.626599928582741e-06, + "loss": 0.5754, + "step": 12715 + }, + { + "epoch": 1.03, + "grad_norm": 4.321479761150848, + "learning_rate": 7.626226848433599e-06, + "loss": 0.6033, + "step": 12716 + }, + { + "epoch": 1.03, + "grad_norm": 6.6860391090558, + "learning_rate": 7.625853748090981e-06, + "loss": 0.7958, + "step": 12717 + }, + { + "epoch": 1.03, + "grad_norm": 3.850835155819159, + "learning_rate": 7.6254806275577545e-06, + "loss": 0.6412, + "step": 12718 + }, + { + "epoch": 1.03, + "grad_norm": 4.4608686724929845, + "learning_rate": 7.625107486836789e-06, + "loss": 0.7587, + "step": 12719 + }, + { + "epoch": 1.03, + "grad_norm": 3.4215482857655237, + "learning_rate": 7.6247343259309535e-06, + "loss": 0.483, + "step": 12720 + }, + { + "epoch": 1.03, + "grad_norm": 3.098025831168619, + "learning_rate": 7.6243611448431195e-06, + "loss": 0.7063, + "step": 12721 + }, + { + "epoch": 1.03, + "grad_norm": 10.727304840829941, + "learning_rate": 7.623987943576153e-06, + "loss": 0.5673, + "step": 12722 + }, + { + "epoch": 1.03, + "grad_norm": 2.773103092325417, + "learning_rate": 7.623614722132926e-06, + "loss": 0.69, + "step": 12723 + }, + { + "epoch": 1.03, + "grad_norm": 2.611113493629122, + "learning_rate": 7.623241480516307e-06, + "loss": 0.7246, + "step": 12724 + }, + { + "epoch": 1.03, + "grad_norm": 2.703047847146327, + "learning_rate": 7.622868218729167e-06, + "loss": 0.6533, + "step": 12725 + }, + { + "epoch": 1.03, + "grad_norm": 4.8948907064671285, + "learning_rate": 7.622494936774376e-06, + "loss": 0.6401, + "step": 12726 + }, + { + "epoch": 1.03, + "grad_norm": 3.010706603838789, + "learning_rate": 7.622121634654802e-06, + "loss": 0.575, + "step": 12727 + }, + { + "epoch": 1.03, + "grad_norm": 14.463933934117026, + "learning_rate": 7.621748312373318e-06, + "loss": 0.4634, + "step": 12728 + }, + { + "epoch": 1.03, + "grad_norm": 3.5977220452248284, + "learning_rate": 7.621374969932793e-06, + "loss": 0.5652, + "step": 12729 + }, + { + "epoch": 1.03, + "grad_norm": 4.733752512886142, + "learning_rate": 7.6210016073361e-06, + "loss": 0.5665, + "step": 12730 + }, + { + "epoch": 1.03, + "grad_norm": 6.214524943664311, + "learning_rate": 7.620628224586106e-06, + "loss": 0.656, + "step": 12731 + }, + { + "epoch": 1.03, + "grad_norm": 7.204604365567833, + "learning_rate": 7.620254821685687e-06, + "loss": 0.6533, + "step": 12732 + }, + { + "epoch": 1.03, + "grad_norm": 2.985986789259628, + "learning_rate": 7.619881398637709e-06, + "loss": 0.6811, + "step": 12733 + }, + { + "epoch": 1.03, + "grad_norm": 4.627074874276101, + "learning_rate": 7.619507955445047e-06, + "loss": 0.4404, + "step": 12734 + }, + { + "epoch": 1.03, + "grad_norm": 3.461723280278261, + "learning_rate": 7.619134492110569e-06, + "loss": 0.589, + "step": 12735 + }, + { + "epoch": 1.03, + "grad_norm": 4.658443577476422, + "learning_rate": 7.61876100863715e-06, + "loss": 0.6971, + "step": 12736 + }, + { + "epoch": 1.03, + "grad_norm": 3.628238462327136, + "learning_rate": 7.61838750502766e-06, + "loss": 0.6149, + "step": 12737 + }, + { + "epoch": 1.03, + "grad_norm": 2.9273601008684715, + "learning_rate": 7.618013981284973e-06, + "loss": 0.5583, + "step": 12738 + }, + { + "epoch": 1.03, + "grad_norm": 15.400414693616444, + "learning_rate": 7.617640437411958e-06, + "loss": 0.6308, + "step": 12739 + }, + { + "epoch": 1.03, + "grad_norm": 3.6699417830876864, + "learning_rate": 7.617266873411489e-06, + "loss": 0.7399, + "step": 12740 + }, + { + "epoch": 1.03, + "grad_norm": 4.996752595734896, + "learning_rate": 7.616893289286438e-06, + "loss": 0.7019, + "step": 12741 + }, + { + "epoch": 1.03, + "grad_norm": 3.556986995240959, + "learning_rate": 7.616519685039678e-06, + "loss": 0.6845, + "step": 12742 + }, + { + "epoch": 1.03, + "grad_norm": 3.44127620543055, + "learning_rate": 7.616146060674081e-06, + "loss": 0.6563, + "step": 12743 + }, + { + "epoch": 1.04, + "grad_norm": 5.036999803877084, + "learning_rate": 7.6157724161925195e-06, + "loss": 0.6513, + "step": 12744 + }, + { + "epoch": 1.04, + "grad_norm": 9.55447748952306, + "learning_rate": 7.615398751597869e-06, + "loss": 0.5298, + "step": 12745 + }, + { + "epoch": 1.04, + "grad_norm": 4.325238728687551, + "learning_rate": 7.615025066893001e-06, + "loss": 0.4261, + "step": 12746 + }, + { + "epoch": 1.04, + "grad_norm": 9.517232044134829, + "learning_rate": 7.614651362080787e-06, + "loss": 0.7219, + "step": 12747 + }, + { + "epoch": 1.04, + "grad_norm": 3.1950782344470507, + "learning_rate": 7.614277637164103e-06, + "loss": 0.5758, + "step": 12748 + }, + { + "epoch": 1.04, + "grad_norm": 3.257063695380154, + "learning_rate": 7.613903892145822e-06, + "loss": 0.6144, + "step": 12749 + }, + { + "epoch": 1.04, + "grad_norm": 2.397110433945175, + "learning_rate": 7.6135301270288175e-06, + "loss": 0.7748, + "step": 12750 + }, + { + "epoch": 1.04, + "grad_norm": 9.31565973881787, + "learning_rate": 7.613156341815962e-06, + "loss": 0.544, + "step": 12751 + }, + { + "epoch": 1.04, + "grad_norm": 5.01815608637562, + "learning_rate": 7.612782536510134e-06, + "loss": 0.5427, + "step": 12752 + }, + { + "epoch": 1.04, + "grad_norm": 7.558146419952685, + "learning_rate": 7.612408711114203e-06, + "loss": 0.7821, + "step": 12753 + }, + { + "epoch": 1.04, + "grad_norm": 3.2531385248784357, + "learning_rate": 7.612034865631046e-06, + "loss": 0.7159, + "step": 12754 + }, + { + "epoch": 1.04, + "grad_norm": 4.064756667661807, + "learning_rate": 7.611661000063537e-06, + "loss": 0.6756, + "step": 12755 + }, + { + "epoch": 1.04, + "grad_norm": 5.457252790122411, + "learning_rate": 7.61128711441455e-06, + "loss": 0.6291, + "step": 12756 + }, + { + "epoch": 1.04, + "grad_norm": 3.1934871898818322, + "learning_rate": 7.6109132086869606e-06, + "loss": 0.5896, + "step": 12757 + }, + { + "epoch": 1.04, + "grad_norm": 2.9152751405486406, + "learning_rate": 7.6105392828836445e-06, + "loss": 0.609, + "step": 12758 + }, + { + "epoch": 1.04, + "grad_norm": 2.5100458126505565, + "learning_rate": 7.610165337007475e-06, + "loss": 0.6877, + "step": 12759 + }, + { + "epoch": 1.04, + "grad_norm": 6.166657944816198, + "learning_rate": 7.609791371061328e-06, + "loss": 0.6328, + "step": 12760 + }, + { + "epoch": 1.04, + "grad_norm": 2.4854246456665208, + "learning_rate": 7.609417385048081e-06, + "loss": 0.5329, + "step": 12761 + }, + { + "epoch": 1.04, + "grad_norm": 2.4282461551566312, + "learning_rate": 7.609043378970607e-06, + "loss": 0.5994, + "step": 12762 + }, + { + "epoch": 1.04, + "grad_norm": 2.634963939819283, + "learning_rate": 7.608669352831783e-06, + "loss": 0.5741, + "step": 12763 + }, + { + "epoch": 1.04, + "grad_norm": 8.22524159691337, + "learning_rate": 7.6082953066344855e-06, + "loss": 0.6962, + "step": 12764 + }, + { + "epoch": 1.04, + "grad_norm": 5.089818500257091, + "learning_rate": 7.60792124038159e-06, + "loss": 0.5954, + "step": 12765 + }, + { + "epoch": 1.04, + "grad_norm": 2.864265783619446, + "learning_rate": 7.607547154075971e-06, + "loss": 0.3764, + "step": 12766 + }, + { + "epoch": 1.04, + "grad_norm": 2.986366872991414, + "learning_rate": 7.607173047720507e-06, + "loss": 0.5681, + "step": 12767 + }, + { + "epoch": 1.04, + "grad_norm": 4.699899032336854, + "learning_rate": 7.606798921318076e-06, + "loss": 0.7856, + "step": 12768 + }, + { + "epoch": 1.04, + "grad_norm": 5.651795465220876, + "learning_rate": 7.606424774871553e-06, + "loss": 0.504, + "step": 12769 + }, + { + "epoch": 1.04, + "grad_norm": 7.733345413134611, + "learning_rate": 7.606050608383813e-06, + "loss": 0.54, + "step": 12770 + }, + { + "epoch": 1.04, + "grad_norm": 2.6129903048693426, + "learning_rate": 7.605676421857734e-06, + "loss": 0.5981, + "step": 12771 + }, + { + "epoch": 1.04, + "grad_norm": 4.026272806869781, + "learning_rate": 7.6053022152961955e-06, + "loss": 0.678, + "step": 12772 + }, + { + "epoch": 1.04, + "grad_norm": 4.65748943283699, + "learning_rate": 7.6049279887020735e-06, + "loss": 0.5444, + "step": 12773 + }, + { + "epoch": 1.04, + "grad_norm": 12.02601042081635, + "learning_rate": 7.604553742078245e-06, + "loss": 0.6976, + "step": 12774 + }, + { + "epoch": 1.04, + "grad_norm": 13.724336846252793, + "learning_rate": 7.604179475427587e-06, + "loss": 0.666, + "step": 12775 + }, + { + "epoch": 1.04, + "grad_norm": 2.9504382687062423, + "learning_rate": 7.603805188752978e-06, + "loss": 0.6547, + "step": 12776 + }, + { + "epoch": 1.04, + "grad_norm": 17.464439827354802, + "learning_rate": 7.6034308820572975e-06, + "loss": 0.6583, + "step": 12777 + }, + { + "epoch": 1.04, + "grad_norm": 4.550452991851988, + "learning_rate": 7.603056555343422e-06, + "loss": 0.5131, + "step": 12778 + }, + { + "epoch": 1.04, + "grad_norm": 3.456177792292817, + "learning_rate": 7.602682208614229e-06, + "loss": 0.7705, + "step": 12779 + }, + { + "epoch": 1.04, + "grad_norm": 7.334727550407669, + "learning_rate": 7.602307841872599e-06, + "loss": 0.6911, + "step": 12780 + }, + { + "epoch": 1.04, + "grad_norm": 3.4165144011491817, + "learning_rate": 7.601933455121409e-06, + "loss": 0.5651, + "step": 12781 + }, + { + "epoch": 1.04, + "grad_norm": 9.218716007509439, + "learning_rate": 7.60155904836354e-06, + "loss": 0.4555, + "step": 12782 + }, + { + "epoch": 1.04, + "grad_norm": 4.223146443670349, + "learning_rate": 7.601184621601867e-06, + "loss": 0.6172, + "step": 12783 + }, + { + "epoch": 1.04, + "grad_norm": 3.3109871212909776, + "learning_rate": 7.600810174839271e-06, + "loss": 0.5285, + "step": 12784 + }, + { + "epoch": 1.04, + "grad_norm": 3.419517487598908, + "learning_rate": 7.600435708078631e-06, + "loss": 0.5786, + "step": 12785 + }, + { + "epoch": 1.04, + "grad_norm": 4.815783386241288, + "learning_rate": 7.600061221322829e-06, + "loss": 0.6246, + "step": 12786 + }, + { + "epoch": 1.04, + "grad_norm": 2.9430935855323996, + "learning_rate": 7.599686714574741e-06, + "loss": 0.5619, + "step": 12787 + }, + { + "epoch": 1.04, + "grad_norm": 3.5133423983408116, + "learning_rate": 7.599312187837247e-06, + "loss": 0.724, + "step": 12788 + }, + { + "epoch": 1.04, + "grad_norm": 4.962449945598559, + "learning_rate": 7.598937641113226e-06, + "loss": 0.6352, + "step": 12789 + }, + { + "epoch": 1.04, + "grad_norm": 4.084565556809841, + "learning_rate": 7.598563074405563e-06, + "loss": 0.6259, + "step": 12790 + }, + { + "epoch": 1.04, + "grad_norm": 3.5886149354437262, + "learning_rate": 7.598188487717133e-06, + "loss": 0.6265, + "step": 12791 + }, + { + "epoch": 1.04, + "grad_norm": 6.560635682657924, + "learning_rate": 7.597813881050817e-06, + "loss": 0.5909, + "step": 12792 + }, + { + "epoch": 1.04, + "grad_norm": 2.649151892526432, + "learning_rate": 7.597439254409498e-06, + "loss": 0.6349, + "step": 12793 + }, + { + "epoch": 1.04, + "grad_norm": 3.1693242036844738, + "learning_rate": 7.597064607796054e-06, + "loss": 0.5471, + "step": 12794 + }, + { + "epoch": 1.04, + "grad_norm": 2.6411156987136692, + "learning_rate": 7.596689941213366e-06, + "loss": 0.5808, + "step": 12795 + }, + { + "epoch": 1.04, + "grad_norm": 3.2723051211658087, + "learning_rate": 7.596315254664317e-06, + "loss": 0.6621, + "step": 12796 + }, + { + "epoch": 1.04, + "grad_norm": 3.4525280464515835, + "learning_rate": 7.5959405481517855e-06, + "loss": 0.4874, + "step": 12797 + }, + { + "epoch": 1.04, + "grad_norm": 5.526233481973619, + "learning_rate": 7.595565821678653e-06, + "loss": 0.6286, + "step": 12798 + }, + { + "epoch": 1.04, + "grad_norm": 2.872959149562405, + "learning_rate": 7.595191075247803e-06, + "loss": 0.615, + "step": 12799 + }, + { + "epoch": 1.04, + "grad_norm": 3.698519515892671, + "learning_rate": 7.594816308862114e-06, + "loss": 0.4336, + "step": 12800 + }, + { + "epoch": 1.04, + "grad_norm": 3.163842019446634, + "learning_rate": 7.594441522524469e-06, + "loss": 0.6808, + "step": 12801 + }, + { + "epoch": 1.04, + "grad_norm": 5.5689324479126245, + "learning_rate": 7.594066716237751e-06, + "loss": 0.5975, + "step": 12802 + }, + { + "epoch": 1.04, + "grad_norm": 3.235896546149228, + "learning_rate": 7.593691890004841e-06, + "loss": 0.4524, + "step": 12803 + }, + { + "epoch": 1.04, + "grad_norm": 2.616976415961154, + "learning_rate": 7.593317043828618e-06, + "loss": 0.5607, + "step": 12804 + }, + { + "epoch": 1.04, + "grad_norm": 2.933131474160084, + "learning_rate": 7.592942177711971e-06, + "loss": 0.59, + "step": 12805 + }, + { + "epoch": 1.04, + "grad_norm": 4.270078724733852, + "learning_rate": 7.592567291657778e-06, + "loss": 0.6675, + "step": 12806 + }, + { + "epoch": 1.04, + "grad_norm": 4.330723914274467, + "learning_rate": 7.592192385668919e-06, + "loss": 0.7049, + "step": 12807 + }, + { + "epoch": 1.04, + "grad_norm": 2.9772652714102548, + "learning_rate": 7.591817459748283e-06, + "loss": 0.3996, + "step": 12808 + }, + { + "epoch": 1.04, + "grad_norm": 3.6057363048709923, + "learning_rate": 7.591442513898748e-06, + "loss": 0.7368, + "step": 12809 + }, + { + "epoch": 1.04, + "grad_norm": 3.260327628676037, + "learning_rate": 7.5910675481232e-06, + "loss": 0.6862, + "step": 12810 + }, + { + "epoch": 1.04, + "grad_norm": 2.6641322549182025, + "learning_rate": 7.59069256242452e-06, + "loss": 0.5374, + "step": 12811 + }, + { + "epoch": 1.04, + "grad_norm": 8.605123424224642, + "learning_rate": 7.5903175568055924e-06, + "loss": 0.5916, + "step": 12812 + }, + { + "epoch": 1.04, + "grad_norm": 4.02195230421896, + "learning_rate": 7.5899425312693e-06, + "loss": 0.5072, + "step": 12813 + }, + { + "epoch": 1.04, + "grad_norm": 7.945457106233641, + "learning_rate": 7.589567485818528e-06, + "loss": 0.6073, + "step": 12814 + }, + { + "epoch": 1.04, + "grad_norm": 3.7491013436098926, + "learning_rate": 7.589192420456159e-06, + "loss": 0.6567, + "step": 12815 + }, + { + "epoch": 1.04, + "grad_norm": 3.041146457022362, + "learning_rate": 7.588817335185077e-06, + "loss": 0.6743, + "step": 12816 + }, + { + "epoch": 1.04, + "grad_norm": 5.3290351982731, + "learning_rate": 7.588442230008164e-06, + "loss": 0.5651, + "step": 12817 + }, + { + "epoch": 1.04, + "grad_norm": 2.900515589125215, + "learning_rate": 7.5880671049283095e-06, + "loss": 0.7394, + "step": 12818 + }, + { + "epoch": 1.04, + "grad_norm": 2.8867849310449536, + "learning_rate": 7.5876919599483935e-06, + "loss": 0.6317, + "step": 12819 + }, + { + "epoch": 1.04, + "grad_norm": 3.0397792305371705, + "learning_rate": 7.587316795071303e-06, + "loss": 0.7619, + "step": 12820 + }, + { + "epoch": 1.04, + "grad_norm": 5.8819838721710225, + "learning_rate": 7.586941610299918e-06, + "loss": 0.5767, + "step": 12821 + }, + { + "epoch": 1.04, + "grad_norm": 3.0413282927489536, + "learning_rate": 7.58656640563713e-06, + "loss": 0.5173, + "step": 12822 + }, + { + "epoch": 1.04, + "grad_norm": 3.2020903224621637, + "learning_rate": 7.58619118108582e-06, + "loss": 0.6467, + "step": 12823 + }, + { + "epoch": 1.04, + "grad_norm": 5.0346044767259235, + "learning_rate": 7.585815936648875e-06, + "loss": 0.766, + "step": 12824 + }, + { + "epoch": 1.04, + "grad_norm": 5.074782026541122, + "learning_rate": 7.585440672329179e-06, + "loss": 0.6455, + "step": 12825 + }, + { + "epoch": 1.04, + "grad_norm": 2.9257779028272393, + "learning_rate": 7.585065388129618e-06, + "loss": 0.6246, + "step": 12826 + }, + { + "epoch": 1.04, + "grad_norm": 3.3028295750054903, + "learning_rate": 7.584690084053077e-06, + "loss": 0.6052, + "step": 12827 + }, + { + "epoch": 1.04, + "grad_norm": 4.223614400729803, + "learning_rate": 7.584314760102442e-06, + "loss": 0.6191, + "step": 12828 + }, + { + "epoch": 1.04, + "grad_norm": 4.445941894319534, + "learning_rate": 7.583939416280599e-06, + "loss": 0.6843, + "step": 12829 + }, + { + "epoch": 1.04, + "grad_norm": 3.057786288720754, + "learning_rate": 7.5835640525904355e-06, + "loss": 0.7344, + "step": 12830 + }, + { + "epoch": 1.04, + "grad_norm": 4.952058114611515, + "learning_rate": 7.583188669034836e-06, + "loss": 0.621, + "step": 12831 + }, + { + "epoch": 1.04, + "grad_norm": 6.136468826486243, + "learning_rate": 7.582813265616686e-06, + "loss": 0.7438, + "step": 12832 + }, + { + "epoch": 1.04, + "grad_norm": 5.744980675253817, + "learning_rate": 7.5824378423388745e-06, + "loss": 0.6039, + "step": 12833 + }, + { + "epoch": 1.04, + "grad_norm": 2.0920290602443576, + "learning_rate": 7.582062399204286e-06, + "loss": 0.6646, + "step": 12834 + }, + { + "epoch": 1.04, + "grad_norm": 4.18517096491782, + "learning_rate": 7.581686936215811e-06, + "loss": 0.6616, + "step": 12835 + }, + { + "epoch": 1.04, + "grad_norm": 3.124520912811146, + "learning_rate": 7.581311453376332e-06, + "loss": 0.6432, + "step": 12836 + }, + { + "epoch": 1.04, + "grad_norm": 2.624355000407767, + "learning_rate": 7.580935950688737e-06, + "loss": 0.4822, + "step": 12837 + }, + { + "epoch": 1.04, + "grad_norm": 3.3228417315567778, + "learning_rate": 7.580560428155917e-06, + "loss": 0.5396, + "step": 12838 + }, + { + "epoch": 1.04, + "grad_norm": 2.111096381939276, + "learning_rate": 7.580184885780755e-06, + "loss": 0.5327, + "step": 12839 + }, + { + "epoch": 1.04, + "grad_norm": 4.116813057621899, + "learning_rate": 7.579809323566141e-06, + "loss": 0.5965, + "step": 12840 + }, + { + "epoch": 1.04, + "grad_norm": 2.678693845015011, + "learning_rate": 7.579433741514962e-06, + "loss": 0.6427, + "step": 12841 + }, + { + "epoch": 1.04, + "grad_norm": 4.288166769486887, + "learning_rate": 7.579058139630107e-06, + "loss": 0.5799, + "step": 12842 + }, + { + "epoch": 1.04, + "grad_norm": 5.145233701073364, + "learning_rate": 7.578682517914462e-06, + "loss": 0.5781, + "step": 12843 + }, + { + "epoch": 1.04, + "grad_norm": 4.170856938477209, + "learning_rate": 7.578306876370918e-06, + "loss": 0.6935, + "step": 12844 + }, + { + "epoch": 1.04, + "grad_norm": 3.204411420820357, + "learning_rate": 7.577931215002359e-06, + "loss": 0.721, + "step": 12845 + }, + { + "epoch": 1.04, + "grad_norm": 3.1376649062294817, + "learning_rate": 7.577555533811678e-06, + "loss": 0.613, + "step": 12846 + }, + { + "epoch": 1.04, + "grad_norm": 6.764195349377388, + "learning_rate": 7.577179832801762e-06, + "loss": 0.6847, + "step": 12847 + }, + { + "epoch": 1.04, + "grad_norm": 5.208624196821827, + "learning_rate": 7.5768041119755e-06, + "loss": 0.7508, + "step": 12848 + }, + { + "epoch": 1.04, + "grad_norm": 4.6310628681771036, + "learning_rate": 7.57642837133578e-06, + "loss": 0.6596, + "step": 12849 + }, + { + "epoch": 1.04, + "grad_norm": 4.085638005543012, + "learning_rate": 7.576052610885492e-06, + "loss": 0.7066, + "step": 12850 + }, + { + "epoch": 1.04, + "grad_norm": 5.2807557012953135, + "learning_rate": 7.575676830627525e-06, + "loss": 0.5935, + "step": 12851 + }, + { + "epoch": 1.04, + "grad_norm": 3.9332013067601572, + "learning_rate": 7.57530103056477e-06, + "loss": 0.924, + "step": 12852 + }, + { + "epoch": 1.04, + "grad_norm": 5.7821358569883845, + "learning_rate": 7.574925210700112e-06, + "loss": 0.6504, + "step": 12853 + }, + { + "epoch": 1.04, + "grad_norm": 5.942539675122486, + "learning_rate": 7.574549371036447e-06, + "loss": 0.5952, + "step": 12854 + }, + { + "epoch": 1.04, + "grad_norm": 2.832936696465376, + "learning_rate": 7.574173511576661e-06, + "loss": 0.6834, + "step": 12855 + }, + { + "epoch": 1.04, + "grad_norm": 4.435250721417182, + "learning_rate": 7.5737976323236455e-06, + "loss": 0.5961, + "step": 12856 + }, + { + "epoch": 1.04, + "grad_norm": 3.318390130152538, + "learning_rate": 7.5734217332802884e-06, + "loss": 0.5799, + "step": 12857 + }, + { + "epoch": 1.04, + "grad_norm": 3.090246126662872, + "learning_rate": 7.573045814449482e-06, + "loss": 0.5969, + "step": 12858 + }, + { + "epoch": 1.04, + "grad_norm": 3.3261611515366223, + "learning_rate": 7.572669875834118e-06, + "loss": 0.635, + "step": 12859 + }, + { + "epoch": 1.04, + "grad_norm": 2.73147751416171, + "learning_rate": 7.572293917437084e-06, + "loss": 0.6056, + "step": 12860 + }, + { + "epoch": 1.04, + "grad_norm": 3.229177332884211, + "learning_rate": 7.571917939261272e-06, + "loss": 0.6744, + "step": 12861 + }, + { + "epoch": 1.04, + "grad_norm": 5.06987894923852, + "learning_rate": 7.5715419413095734e-06, + "loss": 0.6544, + "step": 12862 + }, + { + "epoch": 1.04, + "grad_norm": 4.36480891428641, + "learning_rate": 7.57116592358488e-06, + "loss": 0.4624, + "step": 12863 + }, + { + "epoch": 1.04, + "grad_norm": 3.010204142447662, + "learning_rate": 7.570789886090083e-06, + "loss": 0.526, + "step": 12864 + }, + { + "epoch": 1.04, + "grad_norm": 6.566813077356942, + "learning_rate": 7.5704138288280714e-06, + "loss": 0.6147, + "step": 12865 + }, + { + "epoch": 1.04, + "grad_norm": 3.7478303308783616, + "learning_rate": 7.57003775180174e-06, + "loss": 0.6144, + "step": 12866 + }, + { + "epoch": 1.05, + "grad_norm": 2.9057936474336468, + "learning_rate": 7.569661655013978e-06, + "loss": 0.5585, + "step": 12867 + }, + { + "epoch": 1.05, + "grad_norm": 3.3389219202585854, + "learning_rate": 7.569285538467679e-06, + "loss": 0.719, + "step": 12868 + }, + { + "epoch": 1.05, + "grad_norm": 4.361067155938908, + "learning_rate": 7.568909402165732e-06, + "loss": 0.7157, + "step": 12869 + }, + { + "epoch": 1.05, + "grad_norm": 2.926865809629079, + "learning_rate": 7.568533246111034e-06, + "loss": 0.6856, + "step": 12870 + }, + { + "epoch": 1.05, + "grad_norm": 3.10965533698562, + "learning_rate": 7.5681570703064745e-06, + "loss": 0.6095, + "step": 12871 + }, + { + "epoch": 1.05, + "grad_norm": 6.978224756012123, + "learning_rate": 7.567780874754945e-06, + "loss": 0.5196, + "step": 12872 + }, + { + "epoch": 1.05, + "grad_norm": 2.697163511376358, + "learning_rate": 7.567404659459341e-06, + "loss": 0.6253, + "step": 12873 + }, + { + "epoch": 1.05, + "grad_norm": 4.302982332376722, + "learning_rate": 7.567028424422551e-06, + "loss": 0.5521, + "step": 12874 + }, + { + "epoch": 1.05, + "grad_norm": 7.0034166533003415, + "learning_rate": 7.566652169647472e-06, + "loss": 0.7202, + "step": 12875 + }, + { + "epoch": 1.05, + "grad_norm": 5.4408456646420715, + "learning_rate": 7.566275895136996e-06, + "loss": 0.6429, + "step": 12876 + }, + { + "epoch": 1.05, + "grad_norm": 2.705834714059999, + "learning_rate": 7.565899600894015e-06, + "loss": 0.498, + "step": 12877 + }, + { + "epoch": 1.05, + "grad_norm": 3.2415108009721076, + "learning_rate": 7.565523286921423e-06, + "loss": 0.7025, + "step": 12878 + }, + { + "epoch": 1.05, + "grad_norm": 4.616854295087875, + "learning_rate": 7.565146953222116e-06, + "loss": 0.7654, + "step": 12879 + }, + { + "epoch": 1.05, + "grad_norm": 2.191030664565779, + "learning_rate": 7.564770599798984e-06, + "loss": 0.5182, + "step": 12880 + }, + { + "epoch": 1.05, + "grad_norm": 5.587930036843815, + "learning_rate": 7.564394226654923e-06, + "loss": 0.6505, + "step": 12881 + }, + { + "epoch": 1.05, + "grad_norm": 3.689544741376294, + "learning_rate": 7.564017833792825e-06, + "loss": 0.7206, + "step": 12882 + }, + { + "epoch": 1.05, + "grad_norm": 3.7347983475460436, + "learning_rate": 7.563641421215586e-06, + "loss": 0.7166, + "step": 12883 + }, + { + "epoch": 1.05, + "grad_norm": 2.1950899116882363, + "learning_rate": 7.5632649889261e-06, + "loss": 0.6308, + "step": 12884 + }, + { + "epoch": 1.05, + "grad_norm": 2.6291732741040246, + "learning_rate": 7.562888536927262e-06, + "loss": 0.5754, + "step": 12885 + }, + { + "epoch": 1.05, + "grad_norm": 5.134022269848872, + "learning_rate": 7.562512065221964e-06, + "loss": 0.5737, + "step": 12886 + }, + { + "epoch": 1.05, + "grad_norm": 2.412242116634721, + "learning_rate": 7.562135573813104e-06, + "loss": 0.6684, + "step": 12887 + }, + { + "epoch": 1.05, + "grad_norm": 3.063713919672888, + "learning_rate": 7.561759062703575e-06, + "loss": 0.5779, + "step": 12888 + }, + { + "epoch": 1.05, + "grad_norm": 5.1595520351443005, + "learning_rate": 7.561382531896273e-06, + "loss": 0.6646, + "step": 12889 + }, + { + "epoch": 1.05, + "grad_norm": 2.7146067153071907, + "learning_rate": 7.561005981394092e-06, + "loss": 0.545, + "step": 12890 + }, + { + "epoch": 1.05, + "grad_norm": 2.4833305272159945, + "learning_rate": 7.560629411199928e-06, + "loss": 0.5678, + "step": 12891 + }, + { + "epoch": 1.05, + "grad_norm": 5.084120901006132, + "learning_rate": 7.560252821316677e-06, + "loss": 0.6182, + "step": 12892 + }, + { + "epoch": 1.05, + "grad_norm": 3.0139425149079426, + "learning_rate": 7.559876211747234e-06, + "loss": 0.6031, + "step": 12893 + }, + { + "epoch": 1.05, + "grad_norm": 3.3030395034120947, + "learning_rate": 7.559499582494495e-06, + "loss": 0.5666, + "step": 12894 + }, + { + "epoch": 1.05, + "grad_norm": 3.583735421756559, + "learning_rate": 7.559122933561356e-06, + "loss": 0.544, + "step": 12895 + }, + { + "epoch": 1.05, + "grad_norm": 6.827200608305828, + "learning_rate": 7.5587462649507134e-06, + "loss": 0.5064, + "step": 12896 + }, + { + "epoch": 1.05, + "grad_norm": 3.3939484459747904, + "learning_rate": 7.558369576665464e-06, + "loss": 0.5746, + "step": 12897 + }, + { + "epoch": 1.05, + "grad_norm": 2.8013861541149865, + "learning_rate": 7.557992868708501e-06, + "loss": 0.628, + "step": 12898 + }, + { + "epoch": 1.05, + "grad_norm": 4.005491801836468, + "learning_rate": 7.557616141082727e-06, + "loss": 0.6334, + "step": 12899 + }, + { + "epoch": 1.05, + "grad_norm": 2.8944989689176843, + "learning_rate": 7.5572393937910325e-06, + "loss": 0.6981, + "step": 12900 + }, + { + "epoch": 1.05, + "grad_norm": 31.879377491954656, + "learning_rate": 7.556862626836317e-06, + "loss": 0.6903, + "step": 12901 + }, + { + "epoch": 1.05, + "grad_norm": 2.282639491219605, + "learning_rate": 7.556485840221478e-06, + "loss": 0.7524, + "step": 12902 + }, + { + "epoch": 1.05, + "grad_norm": 5.453569689945408, + "learning_rate": 7.5561090339494126e-06, + "loss": 0.5688, + "step": 12903 + }, + { + "epoch": 1.05, + "grad_norm": 4.297351724448505, + "learning_rate": 7.555732208023017e-06, + "loss": 0.5193, + "step": 12904 + }, + { + "epoch": 1.05, + "grad_norm": 5.553114862082581, + "learning_rate": 7.5553553624451905e-06, + "loss": 0.6624, + "step": 12905 + }, + { + "epoch": 1.05, + "grad_norm": 4.554025873564326, + "learning_rate": 7.5549784972188275e-06, + "loss": 0.664, + "step": 12906 + }, + { + "epoch": 1.05, + "grad_norm": 2.7215684347286344, + "learning_rate": 7.55460161234683e-06, + "loss": 0.6802, + "step": 12907 + }, + { + "epoch": 1.05, + "grad_norm": 2.529484433345862, + "learning_rate": 7.5542247078320925e-06, + "loss": 0.6583, + "step": 12908 + }, + { + "epoch": 1.05, + "grad_norm": 3.4351283090074105, + "learning_rate": 7.553847783677515e-06, + "loss": 0.6228, + "step": 12909 + }, + { + "epoch": 1.05, + "grad_norm": 2.8329958193872615, + "learning_rate": 7.553470839885994e-06, + "loss": 0.7529, + "step": 12910 + }, + { + "epoch": 1.05, + "grad_norm": 4.146702165606948, + "learning_rate": 7.553093876460431e-06, + "loss": 0.6535, + "step": 12911 + }, + { + "epoch": 1.05, + "grad_norm": 4.552828545682671, + "learning_rate": 7.552716893403721e-06, + "loss": 0.4728, + "step": 12912 + }, + { + "epoch": 1.05, + "grad_norm": 3.0591043155616564, + "learning_rate": 7.552339890718765e-06, + "loss": 0.4705, + "step": 12913 + }, + { + "epoch": 1.05, + "grad_norm": 18.64032005174934, + "learning_rate": 7.55196286840846e-06, + "loss": 0.5905, + "step": 12914 + }, + { + "epoch": 1.05, + "grad_norm": 3.176329840718672, + "learning_rate": 7.551585826475707e-06, + "loss": 0.6987, + "step": 12915 + }, + { + "epoch": 1.05, + "grad_norm": 7.183984656313434, + "learning_rate": 7.551208764923403e-06, + "loss": 0.6651, + "step": 12916 + }, + { + "epoch": 1.05, + "grad_norm": 2.4136293255632717, + "learning_rate": 7.550831683754449e-06, + "loss": 0.4232, + "step": 12917 + }, + { + "epoch": 1.05, + "grad_norm": 2.4527900065214037, + "learning_rate": 7.550454582971745e-06, + "loss": 0.5677, + "step": 12918 + }, + { + "epoch": 1.05, + "grad_norm": 3.465747657421707, + "learning_rate": 7.550077462578188e-06, + "loss": 0.6474, + "step": 12919 + }, + { + "epoch": 1.05, + "grad_norm": 2.891478918299922, + "learning_rate": 7.5497003225766795e-06, + "loss": 0.6477, + "step": 12920 + }, + { + "epoch": 1.05, + "grad_norm": 4.44268419784709, + "learning_rate": 7.549323162970119e-06, + "loss": 0.5989, + "step": 12921 + }, + { + "epoch": 1.05, + "grad_norm": 6.736434592398039, + "learning_rate": 7.548945983761407e-06, + "loss": 0.7118, + "step": 12922 + }, + { + "epoch": 1.05, + "grad_norm": 21.55659486119966, + "learning_rate": 7.548568784953443e-06, + "loss": 0.5354, + "step": 12923 + }, + { + "epoch": 1.05, + "grad_norm": 4.545342723425328, + "learning_rate": 7.548191566549128e-06, + "loss": 0.7615, + "step": 12924 + }, + { + "epoch": 1.05, + "grad_norm": 4.597743276211596, + "learning_rate": 7.547814328551363e-06, + "loss": 0.5849, + "step": 12925 + }, + { + "epoch": 1.05, + "grad_norm": 4.5639859704389485, + "learning_rate": 7.547437070963046e-06, + "loss": 0.6209, + "step": 12926 + }, + { + "epoch": 1.05, + "grad_norm": 2.8344832636185395, + "learning_rate": 7.547059793787082e-06, + "loss": 0.545, + "step": 12927 + }, + { + "epoch": 1.05, + "grad_norm": 2.6199364508971943, + "learning_rate": 7.546682497026368e-06, + "loss": 0.6768, + "step": 12928 + }, + { + "epoch": 1.05, + "grad_norm": 22.088523405703608, + "learning_rate": 7.546305180683806e-06, + "loss": 0.7606, + "step": 12929 + }, + { + "epoch": 1.05, + "grad_norm": 5.220863625599206, + "learning_rate": 7.545927844762297e-06, + "loss": 0.5198, + "step": 12930 + }, + { + "epoch": 1.05, + "grad_norm": 6.245333746181903, + "learning_rate": 7.545550489264746e-06, + "loss": 0.6268, + "step": 12931 + }, + { + "epoch": 1.05, + "grad_norm": 3.4138271374914577, + "learning_rate": 7.545173114194051e-06, + "loss": 0.5801, + "step": 12932 + }, + { + "epoch": 1.05, + "grad_norm": 4.026446039957563, + "learning_rate": 7.544795719553113e-06, + "loss": 0.5299, + "step": 12933 + }, + { + "epoch": 1.05, + "grad_norm": 7.555896978053495, + "learning_rate": 7.544418305344836e-06, + "loss": 0.7816, + "step": 12934 + }, + { + "epoch": 1.05, + "grad_norm": 4.283123159716678, + "learning_rate": 7.544040871572122e-06, + "loss": 0.6527, + "step": 12935 + }, + { + "epoch": 1.05, + "grad_norm": 3.6196072724572073, + "learning_rate": 7.5436634182378735e-06, + "loss": 0.6432, + "step": 12936 + }, + { + "epoch": 1.05, + "grad_norm": 3.89965917088967, + "learning_rate": 7.54328594534499e-06, + "loss": 0.6019, + "step": 12937 + }, + { + "epoch": 1.05, + "grad_norm": 2.822201854569433, + "learning_rate": 7.542908452896376e-06, + "loss": 0.6069, + "step": 12938 + }, + { + "epoch": 1.05, + "grad_norm": 4.346270456196262, + "learning_rate": 7.5425309408949346e-06, + "loss": 0.5748, + "step": 12939 + }, + { + "epoch": 1.05, + "grad_norm": 3.696427546959457, + "learning_rate": 7.542153409343568e-06, + "loss": 0.6025, + "step": 12940 + }, + { + "epoch": 1.05, + "grad_norm": 6.426884662177392, + "learning_rate": 7.541775858245179e-06, + "loss": 0.7392, + "step": 12941 + }, + { + "epoch": 1.05, + "grad_norm": 2.242249684746652, + "learning_rate": 7.541398287602668e-06, + "loss": 0.5272, + "step": 12942 + }, + { + "epoch": 1.05, + "grad_norm": 4.611115678953091, + "learning_rate": 7.541020697418944e-06, + "loss": 0.5293, + "step": 12943 + }, + { + "epoch": 1.05, + "grad_norm": 5.543807748018826, + "learning_rate": 7.540643087696906e-06, + "loss": 0.7165, + "step": 12944 + }, + { + "epoch": 1.05, + "grad_norm": 2.394840366377686, + "learning_rate": 7.540265458439457e-06, + "loss": 0.561, + "step": 12945 + }, + { + "epoch": 1.05, + "grad_norm": 3.7510754180689383, + "learning_rate": 7.539887809649505e-06, + "loss": 0.6649, + "step": 12946 + }, + { + "epoch": 1.05, + "grad_norm": 3.271408650176617, + "learning_rate": 7.539510141329949e-06, + "loss": 0.5564, + "step": 12947 + }, + { + "epoch": 1.05, + "grad_norm": 2.7833473760466307, + "learning_rate": 7.539132453483696e-06, + "loss": 0.6426, + "step": 12948 + }, + { + "epoch": 1.05, + "grad_norm": 4.143611131463908, + "learning_rate": 7.538754746113649e-06, + "loss": 0.6389, + "step": 12949 + }, + { + "epoch": 1.05, + "grad_norm": 12.08943732076139, + "learning_rate": 7.5383770192227115e-06, + "loss": 0.5431, + "step": 12950 + }, + { + "epoch": 1.05, + "grad_norm": 4.768896154153527, + "learning_rate": 7.53799927281379e-06, + "loss": 0.6396, + "step": 12951 + }, + { + "epoch": 1.05, + "grad_norm": 3.3455061762708485, + "learning_rate": 7.537621506889787e-06, + "loss": 0.6087, + "step": 12952 + }, + { + "epoch": 1.05, + "grad_norm": 5.27275111067098, + "learning_rate": 7.537243721453609e-06, + "loss": 0.6011, + "step": 12953 + }, + { + "epoch": 1.05, + "grad_norm": 3.3120248364910463, + "learning_rate": 7.536865916508158e-06, + "loss": 0.775, + "step": 12954 + }, + { + "epoch": 1.05, + "grad_norm": 2.92865777603432, + "learning_rate": 7.536488092056343e-06, + "loss": 0.6719, + "step": 12955 + }, + { + "epoch": 1.05, + "grad_norm": 21.51719720798822, + "learning_rate": 7.536110248101066e-06, + "loss": 0.6667, + "step": 12956 + }, + { + "epoch": 1.05, + "grad_norm": 7.982117329036535, + "learning_rate": 7.5357323846452336e-06, + "loss": 0.8352, + "step": 12957 + }, + { + "epoch": 1.05, + "grad_norm": 2.445754806062073, + "learning_rate": 7.535354501691751e-06, + "loss": 0.705, + "step": 12958 + }, + { + "epoch": 1.05, + "grad_norm": 7.3088583547263175, + "learning_rate": 7.534976599243524e-06, + "loss": 0.5363, + "step": 12959 + }, + { + "epoch": 1.05, + "grad_norm": 2.2036612728870257, + "learning_rate": 7.534598677303457e-06, + "loss": 0.6128, + "step": 12960 + }, + { + "epoch": 1.05, + "grad_norm": 4.256759567307507, + "learning_rate": 7.534220735874459e-06, + "loss": 0.5546, + "step": 12961 + }, + { + "epoch": 1.05, + "grad_norm": 3.209387645788221, + "learning_rate": 7.533842774959433e-06, + "loss": 0.7113, + "step": 12962 + }, + { + "epoch": 1.05, + "grad_norm": 2.8565533203506557, + "learning_rate": 7.533464794561285e-06, + "loss": 0.5947, + "step": 12963 + }, + { + "epoch": 1.05, + "grad_norm": 5.952308731330324, + "learning_rate": 7.533086794682925e-06, + "loss": 0.5474, + "step": 12964 + }, + { + "epoch": 1.05, + "grad_norm": 3.3088632128223727, + "learning_rate": 7.5327087753272555e-06, + "loss": 0.5409, + "step": 12965 + }, + { + "epoch": 1.05, + "grad_norm": 2.9167789673277866, + "learning_rate": 7.532330736497187e-06, + "loss": 0.6823, + "step": 12966 + }, + { + "epoch": 1.05, + "grad_norm": 2.5216256001648847, + "learning_rate": 7.531952678195621e-06, + "loss": 0.6508, + "step": 12967 + }, + { + "epoch": 1.05, + "grad_norm": 4.206423326585319, + "learning_rate": 7.531574600425468e-06, + "loss": 0.5374, + "step": 12968 + }, + { + "epoch": 1.05, + "grad_norm": 3.205265185722258, + "learning_rate": 7.531196503189637e-06, + "loss": 0.6661, + "step": 12969 + }, + { + "epoch": 1.05, + "grad_norm": 2.5274990819323646, + "learning_rate": 7.530818386491032e-06, + "loss": 0.5368, + "step": 12970 + }, + { + "epoch": 1.05, + "grad_norm": 2.2976003245348555, + "learning_rate": 7.53044025033256e-06, + "loss": 0.6919, + "step": 12971 + }, + { + "epoch": 1.05, + "grad_norm": 4.50511898097329, + "learning_rate": 7.5300620947171295e-06, + "loss": 0.7339, + "step": 12972 + }, + { + "epoch": 1.05, + "grad_norm": 3.100494668097852, + "learning_rate": 7.52968391964765e-06, + "loss": 0.6969, + "step": 12973 + }, + { + "epoch": 1.05, + "grad_norm": 5.392658362833077, + "learning_rate": 7.529305725127028e-06, + "loss": 0.6843, + "step": 12974 + }, + { + "epoch": 1.05, + "grad_norm": 4.987288503908171, + "learning_rate": 7.528927511158172e-06, + "loss": 0.5408, + "step": 12975 + }, + { + "epoch": 1.05, + "grad_norm": 2.626879004492817, + "learning_rate": 7.528549277743989e-06, + "loss": 0.4995, + "step": 12976 + }, + { + "epoch": 1.05, + "grad_norm": 2.8047086178959493, + "learning_rate": 7.5281710248873866e-06, + "loss": 0.6311, + "step": 12977 + }, + { + "epoch": 1.05, + "grad_norm": 2.47872598683856, + "learning_rate": 7.527792752591276e-06, + "loss": 0.6015, + "step": 12978 + }, + { + "epoch": 1.05, + "grad_norm": 4.884974458147338, + "learning_rate": 7.527414460858563e-06, + "loss": 0.4657, + "step": 12979 + }, + { + "epoch": 1.05, + "grad_norm": 6.553667367967418, + "learning_rate": 7.527036149692157e-06, + "loss": 0.6715, + "step": 12980 + }, + { + "epoch": 1.05, + "grad_norm": 6.211611045747593, + "learning_rate": 7.52665781909497e-06, + "loss": 0.596, + "step": 12981 + }, + { + "epoch": 1.05, + "grad_norm": 4.3696167344970185, + "learning_rate": 7.526279469069908e-06, + "loss": 0.7348, + "step": 12982 + }, + { + "epoch": 1.05, + "grad_norm": 4.114145999740935, + "learning_rate": 7.52590109961988e-06, + "loss": 0.5073, + "step": 12983 + }, + { + "epoch": 1.05, + "grad_norm": 27.3030639853401, + "learning_rate": 7.525522710747794e-06, + "loss": 0.7124, + "step": 12984 + }, + { + "epoch": 1.05, + "grad_norm": 8.636906289517063, + "learning_rate": 7.525144302456566e-06, + "loss": 0.6407, + "step": 12985 + }, + { + "epoch": 1.05, + "grad_norm": 2.3467047615989722, + "learning_rate": 7.524765874749098e-06, + "loss": 0.6442, + "step": 12986 + }, + { + "epoch": 1.05, + "grad_norm": 2.9629395407962105, + "learning_rate": 7.524387427628306e-06, + "loss": 0.6355, + "step": 12987 + }, + { + "epoch": 1.05, + "grad_norm": 6.632899706427862, + "learning_rate": 7.524008961097094e-06, + "loss": 0.7496, + "step": 12988 + }, + { + "epoch": 1.05, + "grad_norm": 2.784499931967794, + "learning_rate": 7.5236304751583765e-06, + "loss": 0.6299, + "step": 12989 + }, + { + "epoch": 1.06, + "grad_norm": 3.812928489468325, + "learning_rate": 7.523251969815062e-06, + "loss": 0.6023, + "step": 12990 + }, + { + "epoch": 1.06, + "grad_norm": 3.227227109717069, + "learning_rate": 7.52287344507006e-06, + "loss": 0.5626, + "step": 12991 + }, + { + "epoch": 1.06, + "grad_norm": 3.5066681185408695, + "learning_rate": 7.522494900926284e-06, + "loss": 0.5426, + "step": 12992 + }, + { + "epoch": 1.06, + "grad_norm": 2.80311215468122, + "learning_rate": 7.522116337386642e-06, + "loss": 0.574, + "step": 12993 + }, + { + "epoch": 1.06, + "grad_norm": 3.8688784179005697, + "learning_rate": 7.521737754454046e-06, + "loss": 0.5773, + "step": 12994 + }, + { + "epoch": 1.06, + "grad_norm": 2.6756710898649727, + "learning_rate": 7.521359152131407e-06, + "loss": 0.662, + "step": 12995 + }, + { + "epoch": 1.06, + "grad_norm": 4.109892498738257, + "learning_rate": 7.520980530421635e-06, + "loss": 0.6704, + "step": 12996 + }, + { + "epoch": 1.06, + "grad_norm": 2.9935996837741086, + "learning_rate": 7.520601889327643e-06, + "loss": 0.5862, + "step": 12997 + }, + { + "epoch": 1.06, + "grad_norm": 5.566543373292843, + "learning_rate": 7.520223228852342e-06, + "loss": 0.5596, + "step": 12998 + }, + { + "epoch": 1.06, + "grad_norm": 6.662874817246582, + "learning_rate": 7.519844548998642e-06, + "loss": 0.4694, + "step": 12999 + }, + { + "epoch": 1.06, + "grad_norm": 3.4734137977205592, + "learning_rate": 7.5194658497694564e-06, + "loss": 0.4194, + "step": 13000 + }, + { + "epoch": 1.06, + "grad_norm": 3.0767028833173113, + "learning_rate": 7.519087131167697e-06, + "loss": 0.6477, + "step": 13001 + }, + { + "epoch": 1.06, + "grad_norm": 3.1101376604430406, + "learning_rate": 7.5187083931962744e-06, + "loss": 0.5146, + "step": 13002 + }, + { + "epoch": 1.06, + "grad_norm": 4.192752146596859, + "learning_rate": 7.5183296358581025e-06, + "loss": 0.5482, + "step": 13003 + }, + { + "epoch": 1.06, + "grad_norm": 2.730969386956924, + "learning_rate": 7.5179508591560925e-06, + "loss": 0.5271, + "step": 13004 + }, + { + "epoch": 1.06, + "grad_norm": 3.6263101866310254, + "learning_rate": 7.517572063093157e-06, + "loss": 0.7015, + "step": 13005 + }, + { + "epoch": 1.06, + "grad_norm": 4.2242336701075756, + "learning_rate": 7.51719324767221e-06, + "loss": 0.618, + "step": 13006 + }, + { + "epoch": 1.06, + "grad_norm": 3.19168906340524, + "learning_rate": 7.5168144128961625e-06, + "loss": 0.5821, + "step": 13007 + }, + { + "epoch": 1.06, + "grad_norm": 3.2437606904974796, + "learning_rate": 7.516435558767927e-06, + "loss": 0.6609, + "step": 13008 + }, + { + "epoch": 1.06, + "grad_norm": 9.121274810725644, + "learning_rate": 7.516056685290421e-06, + "loss": 0.6599, + "step": 13009 + }, + { + "epoch": 1.06, + "grad_norm": 3.9445503014105667, + "learning_rate": 7.5156777924665515e-06, + "loss": 0.6084, + "step": 13010 + }, + { + "epoch": 1.06, + "grad_norm": 6.5464683272119935, + "learning_rate": 7.515298880299236e-06, + "loss": 0.5868, + "step": 13011 + }, + { + "epoch": 1.06, + "grad_norm": 2.3965096227047296, + "learning_rate": 7.514919948791385e-06, + "loss": 0.6254, + "step": 13012 + }, + { + "epoch": 1.06, + "grad_norm": 36.13939920301798, + "learning_rate": 7.514540997945915e-06, + "loss": 0.6169, + "step": 13013 + }, + { + "epoch": 1.06, + "grad_norm": 5.051357332271329, + "learning_rate": 7.514162027765739e-06, + "loss": 0.6199, + "step": 13014 + }, + { + "epoch": 1.06, + "grad_norm": 3.649929083914388, + "learning_rate": 7.51378303825377e-06, + "loss": 0.6355, + "step": 13015 + }, + { + "epoch": 1.06, + "grad_norm": 2.7801950489804104, + "learning_rate": 7.513404029412923e-06, + "loss": 0.6716, + "step": 13016 + }, + { + "epoch": 1.06, + "grad_norm": 7.20404319693001, + "learning_rate": 7.5130250012461125e-06, + "loss": 0.6472, + "step": 13017 + }, + { + "epoch": 1.06, + "grad_norm": 4.210849432276456, + "learning_rate": 7.512645953756252e-06, + "loss": 0.6148, + "step": 13018 + }, + { + "epoch": 1.06, + "grad_norm": 3.6746032785353115, + "learning_rate": 7.512266886946258e-06, + "loss": 0.6568, + "step": 13019 + }, + { + "epoch": 1.06, + "grad_norm": 2.810460607281335, + "learning_rate": 7.511887800819042e-06, + "loss": 0.5361, + "step": 13020 + }, + { + "epoch": 1.06, + "grad_norm": 3.1242753321128203, + "learning_rate": 7.511508695377522e-06, + "loss": 0.5355, + "step": 13021 + }, + { + "epoch": 1.06, + "grad_norm": 3.7511597008032385, + "learning_rate": 7.511129570624611e-06, + "loss": 0.6807, + "step": 13022 + }, + { + "epoch": 1.06, + "grad_norm": 2.7224367570107697, + "learning_rate": 7.510750426563225e-06, + "loss": 0.652, + "step": 13023 + }, + { + "epoch": 1.06, + "grad_norm": 3.737486058646932, + "learning_rate": 7.510371263196277e-06, + "loss": 0.6574, + "step": 13024 + }, + { + "epoch": 1.06, + "grad_norm": 4.603673008665052, + "learning_rate": 7.509992080526687e-06, + "loss": 0.5906, + "step": 13025 + }, + { + "epoch": 1.06, + "grad_norm": 4.016496267754109, + "learning_rate": 7.5096128785573676e-06, + "loss": 0.612, + "step": 13026 + }, + { + "epoch": 1.06, + "grad_norm": 4.302334775672012, + "learning_rate": 7.509233657291235e-06, + "loss": 0.6664, + "step": 13027 + }, + { + "epoch": 1.06, + "grad_norm": 2.9798563576339276, + "learning_rate": 7.508854416731204e-06, + "loss": 0.6026, + "step": 13028 + }, + { + "epoch": 1.06, + "grad_norm": 3.2743207860352648, + "learning_rate": 7.508475156880193e-06, + "loss": 0.4926, + "step": 13029 + }, + { + "epoch": 1.06, + "grad_norm": 3.295863186400481, + "learning_rate": 7.508095877741116e-06, + "loss": 0.5906, + "step": 13030 + }, + { + "epoch": 1.06, + "grad_norm": 5.1871386537799955, + "learning_rate": 7.5077165793168905e-06, + "loss": 0.7731, + "step": 13031 + }, + { + "epoch": 1.06, + "grad_norm": 3.692072194444417, + "learning_rate": 7.5073372616104326e-06, + "loss": 0.638, + "step": 13032 + }, + { + "epoch": 1.06, + "grad_norm": 3.6704819569980986, + "learning_rate": 7.50695792462466e-06, + "loss": 0.8383, + "step": 13033 + }, + { + "epoch": 1.06, + "grad_norm": 4.263780895661941, + "learning_rate": 7.506578568362488e-06, + "loss": 0.5926, + "step": 13034 + }, + { + "epoch": 1.06, + "grad_norm": 2.7738588951559975, + "learning_rate": 7.506199192826835e-06, + "loss": 0.4862, + "step": 13035 + }, + { + "epoch": 1.06, + "grad_norm": 4.4427784709790386, + "learning_rate": 7.5058197980206145e-06, + "loss": 0.544, + "step": 13036 + }, + { + "epoch": 1.06, + "grad_norm": 3.7691587650125653, + "learning_rate": 7.50544038394675e-06, + "loss": 0.6153, + "step": 13037 + }, + { + "epoch": 1.06, + "grad_norm": 3.35902403741966, + "learning_rate": 7.505060950608154e-06, + "loss": 0.7068, + "step": 13038 + }, + { + "epoch": 1.06, + "grad_norm": 9.763970095984241, + "learning_rate": 7.504681498007744e-06, + "loss": 0.58, + "step": 13039 + }, + { + "epoch": 1.06, + "grad_norm": 5.857111260063662, + "learning_rate": 7.50430202614844e-06, + "loss": 0.5357, + "step": 13040 + }, + { + "epoch": 1.06, + "grad_norm": 5.716448218697416, + "learning_rate": 7.503922535033159e-06, + "loss": 0.5835, + "step": 13041 + }, + { + "epoch": 1.06, + "grad_norm": 3.7624159394126333, + "learning_rate": 7.503543024664819e-06, + "loss": 0.5003, + "step": 13042 + }, + { + "epoch": 1.06, + "grad_norm": 4.460675269904419, + "learning_rate": 7.5031634950463385e-06, + "loss": 0.6573, + "step": 13043 + }, + { + "epoch": 1.06, + "grad_norm": 2.3468656457059875, + "learning_rate": 7.502783946180634e-06, + "loss": 0.6669, + "step": 13044 + }, + { + "epoch": 1.06, + "grad_norm": 2.6538654253751712, + "learning_rate": 7.502404378070625e-06, + "loss": 0.6655, + "step": 13045 + }, + { + "epoch": 1.06, + "grad_norm": 2.392353197394934, + "learning_rate": 7.502024790719231e-06, + "loss": 0.6067, + "step": 13046 + }, + { + "epoch": 1.06, + "grad_norm": 12.680554096727802, + "learning_rate": 7.501645184129369e-06, + "loss": 0.7067, + "step": 13047 + }, + { + "epoch": 1.06, + "grad_norm": 3.203523074504997, + "learning_rate": 7.501265558303958e-06, + "loss": 0.6952, + "step": 13048 + }, + { + "epoch": 1.06, + "grad_norm": 5.835395677164653, + "learning_rate": 7.500885913245919e-06, + "loss": 0.6415, + "step": 13049 + }, + { + "epoch": 1.06, + "grad_norm": 5.116114378839112, + "learning_rate": 7.500506248958171e-06, + "loss": 0.6454, + "step": 13050 + }, + { + "epoch": 1.06, + "grad_norm": 6.753197175918499, + "learning_rate": 7.50012656544363e-06, + "loss": 0.7574, + "step": 13051 + }, + { + "epoch": 1.06, + "grad_norm": 3.5937447995557683, + "learning_rate": 7.499746862705218e-06, + "loss": 0.549, + "step": 13052 + }, + { + "epoch": 1.06, + "grad_norm": 2.0345778510253947, + "learning_rate": 7.499367140745854e-06, + "loss": 0.692, + "step": 13053 + }, + { + "epoch": 1.06, + "grad_norm": 3.7789440099958203, + "learning_rate": 7.498987399568459e-06, + "loss": 0.6218, + "step": 13054 + }, + { + "epoch": 1.06, + "grad_norm": 5.956870211003573, + "learning_rate": 7.498607639175952e-06, + "loss": 0.5634, + "step": 13055 + }, + { + "epoch": 1.06, + "grad_norm": 4.942193661542927, + "learning_rate": 7.498227859571252e-06, + "loss": 0.7677, + "step": 13056 + }, + { + "epoch": 1.06, + "grad_norm": 2.8438836614209873, + "learning_rate": 7.49784806075728e-06, + "loss": 0.4879, + "step": 13057 + }, + { + "epoch": 1.06, + "grad_norm": 2.250099262413133, + "learning_rate": 7.497468242736956e-06, + "loss": 0.5448, + "step": 13058 + }, + { + "epoch": 1.06, + "grad_norm": 7.759401909023542, + "learning_rate": 7.497088405513202e-06, + "loss": 0.4052, + "step": 13059 + }, + { + "epoch": 1.06, + "grad_norm": 2.5429105166024333, + "learning_rate": 7.496708549088938e-06, + "loss": 0.5191, + "step": 13060 + }, + { + "epoch": 1.06, + "grad_norm": 4.907347266909476, + "learning_rate": 7.496328673467082e-06, + "loss": 0.6831, + "step": 13061 + }, + { + "epoch": 1.06, + "grad_norm": 3.9018292592070254, + "learning_rate": 7.495948778650559e-06, + "loss": 0.5689, + "step": 13062 + }, + { + "epoch": 1.06, + "grad_norm": 2.9649616462911834, + "learning_rate": 7.495568864642288e-06, + "loss": 0.7447, + "step": 13063 + }, + { + "epoch": 1.06, + "grad_norm": 3.5333521105053673, + "learning_rate": 7.49518893144519e-06, + "loss": 0.7198, + "step": 13064 + }, + { + "epoch": 1.06, + "grad_norm": 2.9993076997153083, + "learning_rate": 7.494808979062187e-06, + "loss": 0.5752, + "step": 13065 + }, + { + "epoch": 1.06, + "grad_norm": 3.685347400052185, + "learning_rate": 7.4944290074962e-06, + "loss": 0.7822, + "step": 13066 + }, + { + "epoch": 1.06, + "grad_norm": 6.190997307115064, + "learning_rate": 7.494049016750152e-06, + "loss": 0.6422, + "step": 13067 + }, + { + "epoch": 1.06, + "grad_norm": 3.586246774580454, + "learning_rate": 7.493669006826964e-06, + "loss": 0.6159, + "step": 13068 + }, + { + "epoch": 1.06, + "grad_norm": 3.637529991540786, + "learning_rate": 7.493288977729556e-06, + "loss": 0.6231, + "step": 13069 + }, + { + "epoch": 1.06, + "grad_norm": 2.975063591428215, + "learning_rate": 7.492908929460854e-06, + "loss": 0.6252, + "step": 13070 + }, + { + "epoch": 1.06, + "grad_norm": 3.880210367107952, + "learning_rate": 7.492528862023777e-06, + "loss": 0.7183, + "step": 13071 + }, + { + "epoch": 1.06, + "grad_norm": 3.8759185342633016, + "learning_rate": 7.492148775421248e-06, + "loss": 0.5719, + "step": 13072 + }, + { + "epoch": 1.06, + "grad_norm": 2.6444228116965918, + "learning_rate": 7.491768669656191e-06, + "loss": 0.7625, + "step": 13073 + }, + { + "epoch": 1.06, + "grad_norm": 35.218429409114826, + "learning_rate": 7.491388544731528e-06, + "loss": 0.4292, + "step": 13074 + }, + { + "epoch": 1.06, + "grad_norm": 3.4159379577313604, + "learning_rate": 7.4910084006501816e-06, + "loss": 0.6335, + "step": 13075 + }, + { + "epoch": 1.06, + "grad_norm": 3.039646119686925, + "learning_rate": 7.490628237415074e-06, + "loss": 0.6722, + "step": 13076 + }, + { + "epoch": 1.06, + "grad_norm": 3.461297144351354, + "learning_rate": 7.49024805502913e-06, + "loss": 0.5702, + "step": 13077 + }, + { + "epoch": 1.06, + "grad_norm": 3.5340864901680686, + "learning_rate": 7.489867853495271e-06, + "loss": 0.5333, + "step": 13078 + }, + { + "epoch": 1.06, + "grad_norm": 2.1197618044316617, + "learning_rate": 7.489487632816424e-06, + "loss": 0.542, + "step": 13079 + }, + { + "epoch": 1.06, + "grad_norm": 11.10742497632765, + "learning_rate": 7.489107392995507e-06, + "loss": 0.6233, + "step": 13080 + }, + { + "epoch": 1.06, + "grad_norm": 4.090226583930624, + "learning_rate": 7.488727134035449e-06, + "loss": 0.5619, + "step": 13081 + }, + { + "epoch": 1.06, + "grad_norm": 2.603861603129147, + "learning_rate": 7.48834685593917e-06, + "loss": 0.5104, + "step": 13082 + }, + { + "epoch": 1.06, + "grad_norm": 4.14082112092457, + "learning_rate": 7.487966558709596e-06, + "loss": 0.6035, + "step": 13083 + }, + { + "epoch": 1.06, + "grad_norm": 3.141612130828562, + "learning_rate": 7.487586242349652e-06, + "loss": 0.7136, + "step": 13084 + }, + { + "epoch": 1.06, + "grad_norm": 7.1735149052796, + "learning_rate": 7.487205906862259e-06, + "loss": 0.5953, + "step": 13085 + }, + { + "epoch": 1.06, + "grad_norm": 3.5794536967046753, + "learning_rate": 7.486825552250345e-06, + "loss": 0.6325, + "step": 13086 + }, + { + "epoch": 1.06, + "grad_norm": 5.767455056747688, + "learning_rate": 7.486445178516834e-06, + "loss": 0.5969, + "step": 13087 + }, + { + "epoch": 1.06, + "grad_norm": 3.1255294996060705, + "learning_rate": 7.48606478566465e-06, + "loss": 0.5852, + "step": 13088 + }, + { + "epoch": 1.06, + "grad_norm": 3.8281320256781823, + "learning_rate": 7.485684373696715e-06, + "loss": 0.7062, + "step": 13089 + }, + { + "epoch": 1.06, + "grad_norm": 7.260757385922951, + "learning_rate": 7.48530394261596e-06, + "loss": 0.4995, + "step": 13090 + }, + { + "epoch": 1.06, + "grad_norm": 6.443329719967807, + "learning_rate": 7.4849234924253065e-06, + "loss": 0.6251, + "step": 13091 + }, + { + "epoch": 1.06, + "grad_norm": 2.8739589027070602, + "learning_rate": 7.484543023127679e-06, + "loss": 0.7199, + "step": 13092 + }, + { + "epoch": 1.06, + "grad_norm": 2.6622625021658095, + "learning_rate": 7.484162534726005e-06, + "loss": 0.6285, + "step": 13093 + }, + { + "epoch": 1.06, + "grad_norm": 3.283872090441876, + "learning_rate": 7.4837820272232105e-06, + "loss": 0.549, + "step": 13094 + }, + { + "epoch": 1.06, + "grad_norm": 2.961479817296063, + "learning_rate": 7.48340150062222e-06, + "loss": 0.8426, + "step": 13095 + }, + { + "epoch": 1.06, + "grad_norm": 3.1667071341589974, + "learning_rate": 7.48302095492596e-06, + "loss": 0.673, + "step": 13096 + }, + { + "epoch": 1.06, + "grad_norm": 4.976002653264443, + "learning_rate": 7.482640390137356e-06, + "loss": 0.7217, + "step": 13097 + }, + { + "epoch": 1.06, + "grad_norm": 3.9857283799431436, + "learning_rate": 7.482259806259334e-06, + "loss": 0.6385, + "step": 13098 + }, + { + "epoch": 1.06, + "grad_norm": 7.9234256162612615, + "learning_rate": 7.481879203294822e-06, + "loss": 0.5577, + "step": 13099 + }, + { + "epoch": 1.06, + "grad_norm": 4.229645067691927, + "learning_rate": 7.481498581246746e-06, + "loss": 0.6828, + "step": 13100 + }, + { + "epoch": 1.06, + "grad_norm": 5.737875261869866, + "learning_rate": 7.48111794011803e-06, + "loss": 0.6518, + "step": 13101 + }, + { + "epoch": 1.06, + "grad_norm": 8.521616978414704, + "learning_rate": 7.480737279911605e-06, + "loss": 0.5183, + "step": 13102 + }, + { + "epoch": 1.06, + "grad_norm": 3.2798927639312967, + "learning_rate": 7.4803566006303955e-06, + "loss": 0.4544, + "step": 13103 + }, + { + "epoch": 1.06, + "grad_norm": 15.073779547788366, + "learning_rate": 7.4799759022773275e-06, + "loss": 0.6635, + "step": 13104 + }, + { + "epoch": 1.06, + "grad_norm": 4.552393725827752, + "learning_rate": 7.47959518485533e-06, + "loss": 0.554, + "step": 13105 + }, + { + "epoch": 1.06, + "grad_norm": 3.4897679900275986, + "learning_rate": 7.479214448367332e-06, + "loss": 0.543, + "step": 13106 + }, + { + "epoch": 1.06, + "grad_norm": 6.7181230743250895, + "learning_rate": 7.478833692816259e-06, + "loss": 0.7602, + "step": 13107 + }, + { + "epoch": 1.06, + "grad_norm": 3.2840881241311366, + "learning_rate": 7.478452918205038e-06, + "loss": 0.5727, + "step": 13108 + }, + { + "epoch": 1.06, + "grad_norm": 2.1440826203088044, + "learning_rate": 7.478072124536598e-06, + "loss": 0.4487, + "step": 13109 + }, + { + "epoch": 1.06, + "grad_norm": 3.2503063605270217, + "learning_rate": 7.4776913118138664e-06, + "loss": 0.6186, + "step": 13110 + }, + { + "epoch": 1.06, + "grad_norm": 4.555547705490123, + "learning_rate": 7.477310480039771e-06, + "loss": 0.5847, + "step": 13111 + }, + { + "epoch": 1.06, + "grad_norm": 11.169269764016597, + "learning_rate": 7.476929629217242e-06, + "loss": 0.5817, + "step": 13112 + }, + { + "epoch": 1.07, + "grad_norm": 4.05308104316401, + "learning_rate": 7.4765487593492044e-06, + "loss": 0.8092, + "step": 13113 + }, + { + "epoch": 1.07, + "grad_norm": 4.195311370397483, + "learning_rate": 7.476167870438592e-06, + "loss": 0.6955, + "step": 13114 + }, + { + "epoch": 1.07, + "grad_norm": 5.420620062852729, + "learning_rate": 7.475786962488329e-06, + "loss": 0.6482, + "step": 13115 + }, + { + "epoch": 1.07, + "grad_norm": 3.2088742909149732, + "learning_rate": 7.475406035501346e-06, + "loss": 0.5608, + "step": 13116 + }, + { + "epoch": 1.07, + "grad_norm": 3.6922266571041056, + "learning_rate": 7.475025089480571e-06, + "loss": 0.7131, + "step": 13117 + }, + { + "epoch": 1.07, + "grad_norm": 4.092314974821069, + "learning_rate": 7.474644124428933e-06, + "loss": 0.3942, + "step": 13118 + }, + { + "epoch": 1.07, + "grad_norm": 2.655913910175776, + "learning_rate": 7.474263140349365e-06, + "loss": 0.6576, + "step": 13119 + }, + { + "epoch": 1.07, + "grad_norm": 13.708168113365288, + "learning_rate": 7.473882137244792e-06, + "loss": 0.6796, + "step": 13120 + }, + { + "epoch": 1.07, + "grad_norm": 3.2163496780001144, + "learning_rate": 7.473501115118145e-06, + "loss": 0.6863, + "step": 13121 + }, + { + "epoch": 1.07, + "grad_norm": 3.6653105698485375, + "learning_rate": 7.473120073972353e-06, + "loss": 0.5001, + "step": 13122 + }, + { + "epoch": 1.07, + "grad_norm": 3.174045713888869, + "learning_rate": 7.472739013810348e-06, + "loss": 0.6836, + "step": 13123 + }, + { + "epoch": 1.07, + "grad_norm": 3.0753418214289, + "learning_rate": 7.4723579346350595e-06, + "loss": 0.6006, + "step": 13124 + }, + { + "epoch": 1.07, + "grad_norm": 10.134506659967762, + "learning_rate": 7.471976836449416e-06, + "loss": 0.5719, + "step": 13125 + }, + { + "epoch": 1.07, + "grad_norm": 8.52158483458711, + "learning_rate": 7.4715957192563494e-06, + "loss": 0.5088, + "step": 13126 + }, + { + "epoch": 1.07, + "grad_norm": 3.2188856173128295, + "learning_rate": 7.47121458305879e-06, + "loss": 0.7113, + "step": 13127 + }, + { + "epoch": 1.07, + "grad_norm": 2.943558597435543, + "learning_rate": 7.470833427859667e-06, + "loss": 0.5373, + "step": 13128 + }, + { + "epoch": 1.07, + "grad_norm": 8.352764074871018, + "learning_rate": 7.4704522536619116e-06, + "loss": 0.6991, + "step": 13129 + }, + { + "epoch": 1.07, + "grad_norm": 2.7641227938140593, + "learning_rate": 7.470071060468457e-06, + "loss": 0.7818, + "step": 13130 + }, + { + "epoch": 1.07, + "grad_norm": 6.756026062200751, + "learning_rate": 7.469689848282231e-06, + "loss": 0.5774, + "step": 13131 + }, + { + "epoch": 1.07, + "grad_norm": 4.407893082266839, + "learning_rate": 7.469308617106168e-06, + "loss": 0.5675, + "step": 13132 + }, + { + "epoch": 1.07, + "grad_norm": 3.6523045812510775, + "learning_rate": 7.468927366943198e-06, + "loss": 0.5938, + "step": 13133 + }, + { + "epoch": 1.07, + "grad_norm": 3.39645892320719, + "learning_rate": 7.4685460977962495e-06, + "loss": 0.6581, + "step": 13134 + }, + { + "epoch": 1.07, + "grad_norm": 3.241529608313073, + "learning_rate": 7.468164809668259e-06, + "loss": 0.646, + "step": 13135 + }, + { + "epoch": 1.07, + "grad_norm": 2.487257642585635, + "learning_rate": 7.467783502562156e-06, + "loss": 0.4596, + "step": 13136 + }, + { + "epoch": 1.07, + "grad_norm": 2.9710902676024653, + "learning_rate": 7.467402176480873e-06, + "loss": 0.6213, + "step": 13137 + }, + { + "epoch": 1.07, + "grad_norm": 4.7309006817942585, + "learning_rate": 7.46702083142734e-06, + "loss": 0.4781, + "step": 13138 + }, + { + "epoch": 1.07, + "grad_norm": 144.67675691725674, + "learning_rate": 7.466639467404492e-06, + "loss": 0.7385, + "step": 13139 + }, + { + "epoch": 1.07, + "grad_norm": 2.5156514317444847, + "learning_rate": 7.4662580844152596e-06, + "loss": 0.6209, + "step": 13140 + }, + { + "epoch": 1.07, + "grad_norm": 8.305305820388769, + "learning_rate": 7.465876682462576e-06, + "loss": 0.5491, + "step": 13141 + }, + { + "epoch": 1.07, + "grad_norm": 2.15168257955364, + "learning_rate": 7.465495261549373e-06, + "loss": 0.6536, + "step": 13142 + }, + { + "epoch": 1.07, + "grad_norm": 3.736901266723794, + "learning_rate": 7.465113821678587e-06, + "loss": 0.5547, + "step": 13143 + }, + { + "epoch": 1.07, + "grad_norm": 3.096421973622719, + "learning_rate": 7.464732362853146e-06, + "loss": 0.6518, + "step": 13144 + }, + { + "epoch": 1.07, + "grad_norm": 7.109904852316536, + "learning_rate": 7.464350885075986e-06, + "loss": 0.672, + "step": 13145 + }, + { + "epoch": 1.07, + "grad_norm": 3.5381466282286, + "learning_rate": 7.4639693883500384e-06, + "loss": 0.6154, + "step": 13146 + }, + { + "epoch": 1.07, + "grad_norm": 5.438163040758922, + "learning_rate": 7.46358787267824e-06, + "loss": 0.5683, + "step": 13147 + }, + { + "epoch": 1.07, + "grad_norm": 2.9702431266512956, + "learning_rate": 7.46320633806352e-06, + "loss": 0.5774, + "step": 13148 + }, + { + "epoch": 1.07, + "grad_norm": 3.157234807952063, + "learning_rate": 7.462824784508815e-06, + "loss": 0.5533, + "step": 13149 + }, + { + "epoch": 1.07, + "grad_norm": 2.9748905805705768, + "learning_rate": 7.462443212017059e-06, + "loss": 0.6478, + "step": 13150 + }, + { + "epoch": 1.07, + "grad_norm": 4.461351213565799, + "learning_rate": 7.462061620591183e-06, + "loss": 0.4918, + "step": 13151 + }, + { + "epoch": 1.07, + "grad_norm": 2.74558813142874, + "learning_rate": 7.4616800102341235e-06, + "loss": 0.4109, + "step": 13152 + }, + { + "epoch": 1.07, + "grad_norm": 3.751227098727027, + "learning_rate": 7.461298380948815e-06, + "loss": 0.5711, + "step": 13153 + }, + { + "epoch": 1.07, + "grad_norm": 7.318146318969377, + "learning_rate": 7.46091673273819e-06, + "loss": 0.5473, + "step": 13154 + }, + { + "epoch": 1.07, + "grad_norm": 7.136078183314036, + "learning_rate": 7.460535065605184e-06, + "loss": 0.5208, + "step": 13155 + }, + { + "epoch": 1.07, + "grad_norm": 3.882304640969959, + "learning_rate": 7.460153379552734e-06, + "loss": 0.7574, + "step": 13156 + }, + { + "epoch": 1.07, + "grad_norm": 3.263320254215945, + "learning_rate": 7.459771674583771e-06, + "loss": 0.5641, + "step": 13157 + }, + { + "epoch": 1.07, + "grad_norm": 3.9454302819181706, + "learning_rate": 7.4593899507012334e-06, + "loss": 0.8165, + "step": 13158 + }, + { + "epoch": 1.07, + "grad_norm": 5.023684017649677, + "learning_rate": 7.459008207908053e-06, + "loss": 0.6519, + "step": 13159 + }, + { + "epoch": 1.07, + "grad_norm": 2.946288482614118, + "learning_rate": 7.458626446207168e-06, + "loss": 0.5857, + "step": 13160 + }, + { + "epoch": 1.07, + "grad_norm": 3.3237427377397304, + "learning_rate": 7.4582446656015125e-06, + "loss": 0.5407, + "step": 13161 + }, + { + "epoch": 1.07, + "grad_norm": 2.4057548289416175, + "learning_rate": 7.457862866094022e-06, + "loss": 0.5911, + "step": 13162 + }, + { + "epoch": 1.07, + "grad_norm": 5.7306448242100165, + "learning_rate": 7.457481047687631e-06, + "loss": 0.546, + "step": 13163 + }, + { + "epoch": 1.07, + "grad_norm": 2.43236916047915, + "learning_rate": 7.457099210385279e-06, + "loss": 0.6269, + "step": 13164 + }, + { + "epoch": 1.07, + "grad_norm": 12.018064351585961, + "learning_rate": 7.456717354189898e-06, + "loss": 0.5036, + "step": 13165 + }, + { + "epoch": 1.07, + "grad_norm": 5.049084473895779, + "learning_rate": 7.456335479104429e-06, + "loss": 0.7227, + "step": 13166 + }, + { + "epoch": 1.07, + "grad_norm": 3.62912891290483, + "learning_rate": 7.455953585131801e-06, + "loss": 0.5387, + "step": 13167 + }, + { + "epoch": 1.07, + "grad_norm": 3.9047174348882008, + "learning_rate": 7.455571672274957e-06, + "loss": 0.5964, + "step": 13168 + }, + { + "epoch": 1.07, + "grad_norm": 3.266857483593641, + "learning_rate": 7.455189740536832e-06, + "loss": 0.5561, + "step": 13169 + }, + { + "epoch": 1.07, + "grad_norm": 4.161309609801212, + "learning_rate": 7.454807789920361e-06, + "loss": 0.683, + "step": 13170 + }, + { + "epoch": 1.07, + "grad_norm": 9.909369323452003, + "learning_rate": 7.454425820428481e-06, + "loss": 0.6305, + "step": 13171 + }, + { + "epoch": 1.07, + "grad_norm": 3.0868172990934695, + "learning_rate": 7.4540438320641304e-06, + "loss": 0.6774, + "step": 13172 + }, + { + "epoch": 1.07, + "grad_norm": 2.3203630580355004, + "learning_rate": 7.453661824830247e-06, + "loss": 0.6174, + "step": 13173 + }, + { + "epoch": 1.07, + "grad_norm": 6.682073889503434, + "learning_rate": 7.453279798729766e-06, + "loss": 0.6479, + "step": 13174 + }, + { + "epoch": 1.07, + "grad_norm": 5.2843051067858875, + "learning_rate": 7.452897753765626e-06, + "loss": 0.6467, + "step": 13175 + }, + { + "epoch": 1.07, + "grad_norm": 3.012566680341504, + "learning_rate": 7.452515689940765e-06, + "loss": 0.5223, + "step": 13176 + }, + { + "epoch": 1.07, + "grad_norm": 3.4024672574784858, + "learning_rate": 7.45213360725812e-06, + "loss": 0.5819, + "step": 13177 + }, + { + "epoch": 1.07, + "grad_norm": 2.8784912182560967, + "learning_rate": 7.45175150572063e-06, + "loss": 0.7457, + "step": 13178 + }, + { + "epoch": 1.07, + "grad_norm": 5.583025305898903, + "learning_rate": 7.451369385331229e-06, + "loss": 0.6961, + "step": 13179 + }, + { + "epoch": 1.07, + "grad_norm": 4.218089368153732, + "learning_rate": 7.450987246092862e-06, + "loss": 0.5456, + "step": 13180 + }, + { + "epoch": 1.07, + "grad_norm": 3.4169816808152, + "learning_rate": 7.450605088008462e-06, + "loss": 0.7615, + "step": 13181 + }, + { + "epoch": 1.07, + "grad_norm": 4.147815156131929, + "learning_rate": 7.45022291108097e-06, + "loss": 0.6199, + "step": 13182 + }, + { + "epoch": 1.07, + "grad_norm": 4.814323223907262, + "learning_rate": 7.4498407153133215e-06, + "loss": 0.5564, + "step": 13183 + }, + { + "epoch": 1.07, + "grad_norm": 3.227284197187449, + "learning_rate": 7.4494585007084594e-06, + "loss": 0.6277, + "step": 13184 + }, + { + "epoch": 1.07, + "grad_norm": 3.722205002668175, + "learning_rate": 7.449076267269321e-06, + "loss": 0.653, + "step": 13185 + }, + { + "epoch": 1.07, + "grad_norm": 2.8504798535520344, + "learning_rate": 7.448694014998844e-06, + "loss": 0.6865, + "step": 13186 + }, + { + "epoch": 1.07, + "grad_norm": 3.588612035387599, + "learning_rate": 7.4483117438999685e-06, + "loss": 0.5696, + "step": 13187 + }, + { + "epoch": 1.07, + "grad_norm": 3.206604201549923, + "learning_rate": 7.447929453975635e-06, + "loss": 0.5462, + "step": 13188 + }, + { + "epoch": 1.07, + "grad_norm": 8.687200740721579, + "learning_rate": 7.4475471452287816e-06, + "loss": 0.6277, + "step": 13189 + }, + { + "epoch": 1.07, + "grad_norm": 2.5196362813600133, + "learning_rate": 7.447164817662349e-06, + "loss": 0.6248, + "step": 13190 + }, + { + "epoch": 1.07, + "grad_norm": 4.39504533981986, + "learning_rate": 7.4467824712792744e-06, + "loss": 0.842, + "step": 13191 + }, + { + "epoch": 1.07, + "grad_norm": 7.392500980030107, + "learning_rate": 7.446400106082501e-06, + "loss": 0.6453, + "step": 13192 + }, + { + "epoch": 1.07, + "grad_norm": 3.297465587889023, + "learning_rate": 7.446017722074968e-06, + "loss": 0.5973, + "step": 13193 + }, + { + "epoch": 1.07, + "grad_norm": 6.351685119573751, + "learning_rate": 7.445635319259615e-06, + "loss": 0.755, + "step": 13194 + }, + { + "epoch": 1.07, + "grad_norm": 2.7229964663372557, + "learning_rate": 7.445252897639381e-06, + "loss": 0.6806, + "step": 13195 + }, + { + "epoch": 1.07, + "grad_norm": 5.003098543224561, + "learning_rate": 7.444870457217209e-06, + "loss": 0.7158, + "step": 13196 + }, + { + "epoch": 1.07, + "grad_norm": 2.976671300658637, + "learning_rate": 7.44448799799604e-06, + "loss": 0.6609, + "step": 13197 + }, + { + "epoch": 1.07, + "grad_norm": 3.3555567529529555, + "learning_rate": 7.444105519978812e-06, + "loss": 0.5668, + "step": 13198 + }, + { + "epoch": 1.07, + "grad_norm": 3.588637739002608, + "learning_rate": 7.443723023168466e-06, + "loss": 0.6229, + "step": 13199 + }, + { + "epoch": 1.07, + "grad_norm": 2.7747849810965084, + "learning_rate": 7.443340507567947e-06, + "loss": 0.5135, + "step": 13200 + }, + { + "epoch": 1.07, + "grad_norm": 3.004624738836836, + "learning_rate": 7.4429579731801915e-06, + "loss": 0.451, + "step": 13201 + }, + { + "epoch": 1.07, + "grad_norm": 3.88518218219603, + "learning_rate": 7.442575420008145e-06, + "loss": 0.5553, + "step": 13202 + }, + { + "epoch": 1.07, + "grad_norm": 2.329906907265006, + "learning_rate": 7.442192848054745e-06, + "loss": 0.6215, + "step": 13203 + }, + { + "epoch": 1.07, + "grad_norm": 2.4662115365504453, + "learning_rate": 7.441810257322937e-06, + "loss": 0.611, + "step": 13204 + }, + { + "epoch": 1.07, + "grad_norm": 5.730812734292271, + "learning_rate": 7.44142764781566e-06, + "loss": 0.6095, + "step": 13205 + }, + { + "epoch": 1.07, + "grad_norm": 3.865176831121156, + "learning_rate": 7.441045019535857e-06, + "loss": 0.6881, + "step": 13206 + }, + { + "epoch": 1.07, + "grad_norm": 3.395602676329784, + "learning_rate": 7.440662372486469e-06, + "loss": 0.7076, + "step": 13207 + }, + { + "epoch": 1.07, + "grad_norm": 2.2138721793505067, + "learning_rate": 7.440279706670441e-06, + "loss": 0.4204, + "step": 13208 + }, + { + "epoch": 1.07, + "grad_norm": 4.597840672592666, + "learning_rate": 7.439897022090713e-06, + "loss": 0.6595, + "step": 13209 + }, + { + "epoch": 1.07, + "grad_norm": 2.85241927117721, + "learning_rate": 7.439514318750228e-06, + "loss": 0.6681, + "step": 13210 + }, + { + "epoch": 1.07, + "grad_norm": 3.082806122929316, + "learning_rate": 7.439131596651929e-06, + "loss": 0.6682, + "step": 13211 + }, + { + "epoch": 1.07, + "grad_norm": 2.933447515645228, + "learning_rate": 7.438748855798758e-06, + "loss": 0.5394, + "step": 13212 + }, + { + "epoch": 1.07, + "grad_norm": 2.334173410533451, + "learning_rate": 7.43836609619366e-06, + "loss": 0.5864, + "step": 13213 + }, + { + "epoch": 1.07, + "grad_norm": 3.126398221300579, + "learning_rate": 7.437983317839577e-06, + "loss": 0.5398, + "step": 13214 + }, + { + "epoch": 1.07, + "grad_norm": 8.091837987166922, + "learning_rate": 7.4376005207394495e-06, + "loss": 0.6768, + "step": 13215 + }, + { + "epoch": 1.07, + "grad_norm": 4.526932198455699, + "learning_rate": 7.437217704896225e-06, + "loss": 0.5944, + "step": 13216 + }, + { + "epoch": 1.07, + "grad_norm": 3.040644472058373, + "learning_rate": 7.436834870312846e-06, + "loss": 0.6438, + "step": 13217 + }, + { + "epoch": 1.07, + "grad_norm": 3.2714842496701118, + "learning_rate": 7.436452016992254e-06, + "loss": 0.7201, + "step": 13218 + }, + { + "epoch": 1.07, + "grad_norm": 11.422882965630448, + "learning_rate": 7.436069144937394e-06, + "loss": 0.4795, + "step": 13219 + }, + { + "epoch": 1.07, + "grad_norm": 3.209457339080209, + "learning_rate": 7.435686254151211e-06, + "loss": 0.5999, + "step": 13220 + }, + { + "epoch": 1.07, + "grad_norm": 4.694160598414946, + "learning_rate": 7.4353033446366495e-06, + "loss": 0.6036, + "step": 13221 + }, + { + "epoch": 1.07, + "grad_norm": 2.730136495648104, + "learning_rate": 7.434920416396651e-06, + "loss": 0.5434, + "step": 13222 + }, + { + "epoch": 1.07, + "grad_norm": 3.5184394502310377, + "learning_rate": 7.434537469434162e-06, + "loss": 0.6205, + "step": 13223 + }, + { + "epoch": 1.07, + "grad_norm": 7.466522293819573, + "learning_rate": 7.434154503752128e-06, + "loss": 0.5954, + "step": 13224 + }, + { + "epoch": 1.07, + "grad_norm": 3.1657789386471173, + "learning_rate": 7.433771519353492e-06, + "loss": 0.6656, + "step": 13225 + }, + { + "epoch": 1.07, + "grad_norm": 17.029234997514077, + "learning_rate": 7.433388516241198e-06, + "loss": 0.561, + "step": 13226 + }, + { + "epoch": 1.07, + "grad_norm": 4.005451057316696, + "learning_rate": 7.433005494418192e-06, + "loss": 0.5955, + "step": 13227 + }, + { + "epoch": 1.07, + "grad_norm": 3.7188709869458316, + "learning_rate": 7.432622453887419e-06, + "loss": 0.6584, + "step": 13228 + }, + { + "epoch": 1.07, + "grad_norm": 3.568972283936614, + "learning_rate": 7.432239394651826e-06, + "loss": 0.7238, + "step": 13229 + }, + { + "epoch": 1.07, + "grad_norm": 4.012677303820611, + "learning_rate": 7.4318563167143565e-06, + "loss": 0.6036, + "step": 13230 + }, + { + "epoch": 1.07, + "grad_norm": 6.133045280193642, + "learning_rate": 7.431473220077955e-06, + "loss": 0.5659, + "step": 13231 + }, + { + "epoch": 1.07, + "grad_norm": 4.356657568114009, + "learning_rate": 7.43109010474557e-06, + "loss": 0.5803, + "step": 13232 + }, + { + "epoch": 1.07, + "grad_norm": 3.2742651428548557, + "learning_rate": 7.430706970720145e-06, + "loss": 0.4892, + "step": 13233 + }, + { + "epoch": 1.07, + "grad_norm": 2.757850994158556, + "learning_rate": 7.430323818004629e-06, + "loss": 0.5953, + "step": 13234 + }, + { + "epoch": 1.07, + "grad_norm": 4.574939438902401, + "learning_rate": 7.429940646601964e-06, + "loss": 0.7631, + "step": 13235 + }, + { + "epoch": 1.08, + "grad_norm": 2.914063817014665, + "learning_rate": 7.429557456515098e-06, + "loss": 0.5647, + "step": 13236 + }, + { + "epoch": 1.08, + "grad_norm": 5.4051270973396255, + "learning_rate": 7.42917424774698e-06, + "loss": 0.6855, + "step": 13237 + }, + { + "epoch": 1.08, + "grad_norm": 3.990199675551497, + "learning_rate": 7.428791020300552e-06, + "loss": 0.5289, + "step": 13238 + }, + { + "epoch": 1.08, + "grad_norm": 2.8648522652203527, + "learning_rate": 7.428407774178764e-06, + "loss": 0.6821, + "step": 13239 + }, + { + "epoch": 1.08, + "grad_norm": 2.9273038537099056, + "learning_rate": 7.428024509384561e-06, + "loss": 0.5467, + "step": 13240 + }, + { + "epoch": 1.08, + "grad_norm": 6.027222672783492, + "learning_rate": 7.427641225920892e-06, + "loss": 0.6703, + "step": 13241 + }, + { + "epoch": 1.08, + "grad_norm": 3.822647847278044, + "learning_rate": 7.427257923790703e-06, + "loss": 0.6269, + "step": 13242 + }, + { + "epoch": 1.08, + "grad_norm": 3.4220047782158236, + "learning_rate": 7.426874602996941e-06, + "loss": 0.541, + "step": 13243 + }, + { + "epoch": 1.08, + "grad_norm": 5.716366484184259, + "learning_rate": 7.426491263542551e-06, + "loss": 0.6579, + "step": 13244 + }, + { + "epoch": 1.08, + "grad_norm": 4.398584835132485, + "learning_rate": 7.426107905430486e-06, + "loss": 0.5246, + "step": 13245 + }, + { + "epoch": 1.08, + "grad_norm": 2.6068763792276077, + "learning_rate": 7.42572452866369e-06, + "loss": 0.5123, + "step": 13246 + }, + { + "epoch": 1.08, + "grad_norm": 3.4856252089901023, + "learning_rate": 7.425341133245112e-06, + "loss": 0.7347, + "step": 13247 + }, + { + "epoch": 1.08, + "grad_norm": 5.232532327047942, + "learning_rate": 7.424957719177699e-06, + "loss": 0.6251, + "step": 13248 + }, + { + "epoch": 1.08, + "grad_norm": 4.564120716691231, + "learning_rate": 7.424574286464401e-06, + "loss": 0.5374, + "step": 13249 + }, + { + "epoch": 1.08, + "grad_norm": 3.810498962768359, + "learning_rate": 7.424190835108165e-06, + "loss": 0.5396, + "step": 13250 + }, + { + "epoch": 1.08, + "grad_norm": 4.2126697330723175, + "learning_rate": 7.423807365111939e-06, + "loss": 0.4812, + "step": 13251 + }, + { + "epoch": 1.08, + "grad_norm": 3.9497893100034194, + "learning_rate": 7.423423876478672e-06, + "loss": 0.6903, + "step": 13252 + }, + { + "epoch": 1.08, + "grad_norm": 23.72419201543532, + "learning_rate": 7.423040369211313e-06, + "loss": 0.5933, + "step": 13253 + }, + { + "epoch": 1.08, + "grad_norm": 5.209033948351683, + "learning_rate": 7.422656843312811e-06, + "loss": 0.7385, + "step": 13254 + }, + { + "epoch": 1.08, + "grad_norm": 3.1981060680149396, + "learning_rate": 7.422273298786115e-06, + "loss": 0.5485, + "step": 13255 + }, + { + "epoch": 1.08, + "grad_norm": 6.478489272254062, + "learning_rate": 7.421889735634172e-06, + "loss": 0.5808, + "step": 13256 + }, + { + "epoch": 1.08, + "grad_norm": 7.438475442305074, + "learning_rate": 7.421506153859934e-06, + "loss": 0.5563, + "step": 13257 + }, + { + "epoch": 1.08, + "grad_norm": 4.084610723725969, + "learning_rate": 7.42112255346635e-06, + "loss": 0.5692, + "step": 13258 + }, + { + "epoch": 1.08, + "grad_norm": 5.691684887734501, + "learning_rate": 7.420738934456369e-06, + "loss": 0.5496, + "step": 13259 + }, + { + "epoch": 1.08, + "grad_norm": 6.447687030692418, + "learning_rate": 7.42035529683294e-06, + "loss": 0.6503, + "step": 13260 + }, + { + "epoch": 1.08, + "grad_norm": 6.71435985491528, + "learning_rate": 7.419971640599013e-06, + "loss": 0.7384, + "step": 13261 + }, + { + "epoch": 1.08, + "grad_norm": 2.696361319372669, + "learning_rate": 7.41958796575754e-06, + "loss": 0.6365, + "step": 13262 + }, + { + "epoch": 1.08, + "grad_norm": 12.85285184184139, + "learning_rate": 7.4192042723114696e-06, + "loss": 0.4558, + "step": 13263 + }, + { + "epoch": 1.08, + "grad_norm": 4.687942120444293, + "learning_rate": 7.418820560263751e-06, + "loss": 0.7266, + "step": 13264 + }, + { + "epoch": 1.08, + "grad_norm": 5.82047509039928, + "learning_rate": 7.418436829617337e-06, + "loss": 0.5765, + "step": 13265 + }, + { + "epoch": 1.08, + "grad_norm": 4.787652403067645, + "learning_rate": 7.418053080375177e-06, + "loss": 0.6128, + "step": 13266 + }, + { + "epoch": 1.08, + "grad_norm": 4.967867724919784, + "learning_rate": 7.417669312540221e-06, + "loss": 0.5525, + "step": 13267 + }, + { + "epoch": 1.08, + "grad_norm": 5.715021338770556, + "learning_rate": 7.4172855261154204e-06, + "loss": 0.657, + "step": 13268 + }, + { + "epoch": 1.08, + "grad_norm": 7.1322204561142355, + "learning_rate": 7.4169017211037275e-06, + "loss": 0.6177, + "step": 13269 + }, + { + "epoch": 1.08, + "grad_norm": 6.27198894969328, + "learning_rate": 7.416517897508092e-06, + "loss": 0.5589, + "step": 13270 + }, + { + "epoch": 1.08, + "grad_norm": 3.6378189931235965, + "learning_rate": 7.416134055331466e-06, + "loss": 0.6485, + "step": 13271 + }, + { + "epoch": 1.08, + "grad_norm": 3.5028604880007137, + "learning_rate": 7.415750194576799e-06, + "loss": 0.5357, + "step": 13272 + }, + { + "epoch": 1.08, + "grad_norm": 5.5031192830231115, + "learning_rate": 7.415366315247043e-06, + "loss": 0.5948, + "step": 13273 + }, + { + "epoch": 1.08, + "grad_norm": 4.192262296590541, + "learning_rate": 7.4149824173451534e-06, + "loss": 0.617, + "step": 13274 + }, + { + "epoch": 1.08, + "grad_norm": 5.836848821924808, + "learning_rate": 7.414598500874078e-06, + "loss": 0.5832, + "step": 13275 + }, + { + "epoch": 1.08, + "grad_norm": 4.005217070584467, + "learning_rate": 7.414214565836771e-06, + "loss": 0.6577, + "step": 13276 + }, + { + "epoch": 1.08, + "grad_norm": 5.994712981097223, + "learning_rate": 7.413830612236181e-06, + "loss": 0.5213, + "step": 13277 + }, + { + "epoch": 1.08, + "grad_norm": 2.7161736183945147, + "learning_rate": 7.4134466400752655e-06, + "loss": 0.5844, + "step": 13278 + }, + { + "epoch": 1.08, + "grad_norm": 3.6209869495040685, + "learning_rate": 7.413062649356975e-06, + "loss": 0.6991, + "step": 13279 + }, + { + "epoch": 1.08, + "grad_norm": 3.5316345985980577, + "learning_rate": 7.412678640084258e-06, + "loss": 0.5137, + "step": 13280 + }, + { + "epoch": 1.08, + "grad_norm": 3.809810081804549, + "learning_rate": 7.4122946122600735e-06, + "loss": 0.6427, + "step": 13281 + }, + { + "epoch": 1.08, + "grad_norm": 2.3050533970022165, + "learning_rate": 7.4119105658873714e-06, + "loss": 0.5149, + "step": 13282 + }, + { + "epoch": 1.08, + "grad_norm": 6.3686477171519815, + "learning_rate": 7.411526500969104e-06, + "loss": 0.5543, + "step": 13283 + }, + { + "epoch": 1.08, + "grad_norm": 3.2811517802933134, + "learning_rate": 7.411142417508225e-06, + "loss": 0.6836, + "step": 13284 + }, + { + "epoch": 1.08, + "grad_norm": 2.384303154240263, + "learning_rate": 7.410758315507688e-06, + "loss": 0.563, + "step": 13285 + }, + { + "epoch": 1.08, + "grad_norm": 3.7188045243265084, + "learning_rate": 7.410374194970447e-06, + "loss": 0.5262, + "step": 13286 + }, + { + "epoch": 1.08, + "grad_norm": 4.986806125269831, + "learning_rate": 7.409990055899454e-06, + "loss": 0.6662, + "step": 13287 + }, + { + "epoch": 1.08, + "grad_norm": 2.8175814740369836, + "learning_rate": 7.409605898297664e-06, + "loss": 0.5922, + "step": 13288 + }, + { + "epoch": 1.08, + "grad_norm": 3.2760113226705196, + "learning_rate": 7.409221722168029e-06, + "loss": 0.7339, + "step": 13289 + }, + { + "epoch": 1.08, + "grad_norm": 3.324158103436725, + "learning_rate": 7.408837527513507e-06, + "loss": 0.7418, + "step": 13290 + }, + { + "epoch": 1.08, + "grad_norm": 3.948392042379822, + "learning_rate": 7.408453314337047e-06, + "loss": 0.5154, + "step": 13291 + }, + { + "epoch": 1.08, + "grad_norm": 3.625910203446188, + "learning_rate": 7.408069082641608e-06, + "loss": 0.7366, + "step": 13292 + }, + { + "epoch": 1.08, + "grad_norm": 1.7810246925783204, + "learning_rate": 7.4076848324301406e-06, + "loss": 0.4648, + "step": 13293 + }, + { + "epoch": 1.08, + "grad_norm": 2.180323254466273, + "learning_rate": 7.407300563705603e-06, + "loss": 0.4685, + "step": 13294 + }, + { + "epoch": 1.08, + "grad_norm": 4.207731895752863, + "learning_rate": 7.4069162764709464e-06, + "loss": 0.5888, + "step": 13295 + }, + { + "epoch": 1.08, + "grad_norm": 2.9353263058128847, + "learning_rate": 7.4065319707291275e-06, + "loss": 0.7236, + "step": 13296 + }, + { + "epoch": 1.08, + "grad_norm": 2.6740823768348636, + "learning_rate": 7.4061476464831005e-06, + "loss": 0.6026, + "step": 13297 + }, + { + "epoch": 1.08, + "grad_norm": 6.4536367009218445, + "learning_rate": 7.4057633037358225e-06, + "loss": 0.5732, + "step": 13298 + }, + { + "epoch": 1.08, + "grad_norm": 3.021613168144522, + "learning_rate": 7.405378942490245e-06, + "loss": 0.5626, + "step": 13299 + }, + { + "epoch": 1.08, + "grad_norm": 3.884493995489164, + "learning_rate": 7.404994562749328e-06, + "loss": 0.5363, + "step": 13300 + }, + { + "epoch": 1.08, + "grad_norm": 3.606482309089173, + "learning_rate": 7.404610164516023e-06, + "loss": 0.6295, + "step": 13301 + }, + { + "epoch": 1.08, + "grad_norm": 3.92212260819278, + "learning_rate": 7.4042257477932875e-06, + "loss": 0.6155, + "step": 13302 + }, + { + "epoch": 1.08, + "grad_norm": 4.436738081189434, + "learning_rate": 7.403841312584079e-06, + "loss": 0.5412, + "step": 13303 + }, + { + "epoch": 1.08, + "grad_norm": 3.07810492648197, + "learning_rate": 7.40345685889135e-06, + "loss": 0.4993, + "step": 13304 + }, + { + "epoch": 1.08, + "grad_norm": 4.703002431451, + "learning_rate": 7.4030723867180585e-06, + "loss": 0.5208, + "step": 13305 + }, + { + "epoch": 1.08, + "grad_norm": 5.671345571351434, + "learning_rate": 7.4026878960671625e-06, + "loss": 0.5952, + "step": 13306 + }, + { + "epoch": 1.08, + "grad_norm": 19.630485219315084, + "learning_rate": 7.402303386941614e-06, + "loss": 0.6579, + "step": 13307 + }, + { + "epoch": 1.08, + "grad_norm": 3.324344719823664, + "learning_rate": 7.401918859344373e-06, + "loss": 0.625, + "step": 13308 + }, + { + "epoch": 1.08, + "grad_norm": 4.585859163190357, + "learning_rate": 7.401534313278396e-06, + "loss": 0.6206, + "step": 13309 + }, + { + "epoch": 1.08, + "grad_norm": 3.013115062020825, + "learning_rate": 7.401149748746639e-06, + "loss": 0.5492, + "step": 13310 + }, + { + "epoch": 1.08, + "grad_norm": 3.045598600906526, + "learning_rate": 7.400765165752059e-06, + "loss": 0.5649, + "step": 13311 + }, + { + "epoch": 1.08, + "grad_norm": 3.196164519692179, + "learning_rate": 7.400380564297613e-06, + "loss": 0.5525, + "step": 13312 + }, + { + "epoch": 1.08, + "grad_norm": 3.7137650803776516, + "learning_rate": 7.399995944386258e-06, + "loss": 0.6448, + "step": 13313 + }, + { + "epoch": 1.08, + "grad_norm": 3.4122336660597683, + "learning_rate": 7.399611306020953e-06, + "loss": 0.5411, + "step": 13314 + }, + { + "epoch": 1.08, + "grad_norm": 4.986289778777504, + "learning_rate": 7.399226649204654e-06, + "loss": 0.5427, + "step": 13315 + }, + { + "epoch": 1.08, + "grad_norm": 3.8400061286585783, + "learning_rate": 7.398841973940318e-06, + "loss": 0.495, + "step": 13316 + }, + { + "epoch": 1.08, + "grad_norm": 2.838412044844545, + "learning_rate": 7.398457280230905e-06, + "loss": 0.6105, + "step": 13317 + }, + { + "epoch": 1.08, + "grad_norm": 3.6803561987982767, + "learning_rate": 7.398072568079372e-06, + "loss": 0.5566, + "step": 13318 + }, + { + "epoch": 1.08, + "grad_norm": 2.7778889598182857, + "learning_rate": 7.397687837488677e-06, + "loss": 0.6425, + "step": 13319 + }, + { + "epoch": 1.08, + "grad_norm": 2.3961841412477765, + "learning_rate": 7.397303088461779e-06, + "loss": 0.5832, + "step": 13320 + }, + { + "epoch": 1.08, + "grad_norm": 5.666398766276481, + "learning_rate": 7.396918321001634e-06, + "loss": 0.5932, + "step": 13321 + }, + { + "epoch": 1.08, + "grad_norm": 2.3758657592424353, + "learning_rate": 7.396533535111203e-06, + "loss": 0.5025, + "step": 13322 + }, + { + "epoch": 1.08, + "grad_norm": 3.5756966468747815, + "learning_rate": 7.396148730793444e-06, + "loss": 0.6582, + "step": 13323 + }, + { + "epoch": 1.08, + "grad_norm": 5.5686691646075355, + "learning_rate": 7.395763908051317e-06, + "loss": 0.6816, + "step": 13324 + }, + { + "epoch": 1.08, + "grad_norm": 3.5179475067981065, + "learning_rate": 7.395379066887778e-06, + "loss": 0.6145, + "step": 13325 + }, + { + "epoch": 1.08, + "grad_norm": 2.415214604886496, + "learning_rate": 7.3949942073057876e-06, + "loss": 0.6418, + "step": 13326 + }, + { + "epoch": 1.08, + "grad_norm": 2.1617353859133095, + "learning_rate": 7.394609329308306e-06, + "loss": 0.6327, + "step": 13327 + }, + { + "epoch": 1.08, + "grad_norm": 4.791422361591413, + "learning_rate": 7.394224432898293e-06, + "loss": 0.5652, + "step": 13328 + }, + { + "epoch": 1.08, + "grad_norm": 3.700889320458536, + "learning_rate": 7.3938395180787044e-06, + "loss": 0.4456, + "step": 13329 + }, + { + "epoch": 1.08, + "grad_norm": 3.619616252579102, + "learning_rate": 7.393454584852504e-06, + "loss": 0.5571, + "step": 13330 + }, + { + "epoch": 1.08, + "grad_norm": 3.6210679625726665, + "learning_rate": 7.393069633222652e-06, + "loss": 0.6491, + "step": 13331 + }, + { + "epoch": 1.08, + "grad_norm": 3.2160924152875046, + "learning_rate": 7.392684663192103e-06, + "loss": 0.5991, + "step": 13332 + }, + { + "epoch": 1.08, + "grad_norm": 3.141763455438355, + "learning_rate": 7.392299674763823e-06, + "loss": 0.4313, + "step": 13333 + }, + { + "epoch": 1.08, + "grad_norm": 3.742697564526375, + "learning_rate": 7.391914667940768e-06, + "loss": 0.4995, + "step": 13334 + }, + { + "epoch": 1.08, + "grad_norm": 3.926847512834141, + "learning_rate": 7.3915296427259e-06, + "loss": 0.5404, + "step": 13335 + }, + { + "epoch": 1.08, + "grad_norm": 2.8536085142060097, + "learning_rate": 7.391144599122181e-06, + "loss": 0.642, + "step": 13336 + }, + { + "epoch": 1.08, + "grad_norm": 2.6778900294622217, + "learning_rate": 7.3907595371325705e-06, + "loss": 0.6263, + "step": 13337 + }, + { + "epoch": 1.08, + "grad_norm": 3.313658519856464, + "learning_rate": 7.390374456760027e-06, + "loss": 0.6721, + "step": 13338 + }, + { + "epoch": 1.08, + "grad_norm": 4.847492133877503, + "learning_rate": 7.389989358007514e-06, + "loss": 0.533, + "step": 13339 + }, + { + "epoch": 1.08, + "grad_norm": 4.9897180092268805, + "learning_rate": 7.389604240877994e-06, + "loss": 0.6389, + "step": 13340 + }, + { + "epoch": 1.08, + "grad_norm": 2.941041372140006, + "learning_rate": 7.3892191053744255e-06, + "loss": 0.6186, + "step": 13341 + }, + { + "epoch": 1.08, + "grad_norm": 3.655602471796342, + "learning_rate": 7.38883395149977e-06, + "loss": 0.6595, + "step": 13342 + }, + { + "epoch": 1.08, + "grad_norm": 4.365545216103017, + "learning_rate": 7.38844877925699e-06, + "loss": 0.6479, + "step": 13343 + }, + { + "epoch": 1.08, + "grad_norm": 6.118006719455853, + "learning_rate": 7.388063588649047e-06, + "loss": 0.6941, + "step": 13344 + }, + { + "epoch": 1.08, + "grad_norm": 3.6435395296446598, + "learning_rate": 7.387678379678903e-06, + "loss": 0.5773, + "step": 13345 + }, + { + "epoch": 1.08, + "grad_norm": 4.179386428825421, + "learning_rate": 7.38729315234952e-06, + "loss": 0.7434, + "step": 13346 + }, + { + "epoch": 1.08, + "grad_norm": 4.311718568944002, + "learning_rate": 7.386907906663858e-06, + "loss": 0.6326, + "step": 13347 + }, + { + "epoch": 1.08, + "grad_norm": 3.812304148030748, + "learning_rate": 7.3865226426248826e-06, + "loss": 0.4957, + "step": 13348 + }, + { + "epoch": 1.08, + "grad_norm": 4.769599135106726, + "learning_rate": 7.386137360235554e-06, + "loss": 0.3403, + "step": 13349 + }, + { + "epoch": 1.08, + "grad_norm": 5.987181216629483, + "learning_rate": 7.385752059498834e-06, + "loss": 0.6991, + "step": 13350 + }, + { + "epoch": 1.08, + "grad_norm": 2.900970074701525, + "learning_rate": 7.3853667404176886e-06, + "loss": 0.5882, + "step": 13351 + }, + { + "epoch": 1.08, + "grad_norm": 3.817701135238978, + "learning_rate": 7.384981402995077e-06, + "loss": 0.6783, + "step": 13352 + }, + { + "epoch": 1.08, + "grad_norm": 8.626573916368784, + "learning_rate": 7.384596047233964e-06, + "loss": 0.6283, + "step": 13353 + }, + { + "epoch": 1.08, + "grad_norm": 3.963501283380743, + "learning_rate": 7.384210673137311e-06, + "loss": 0.5692, + "step": 13354 + }, + { + "epoch": 1.08, + "grad_norm": 3.4777590324014485, + "learning_rate": 7.383825280708084e-06, + "loss": 0.466, + "step": 13355 + }, + { + "epoch": 1.08, + "grad_norm": 3.4197899990700336, + "learning_rate": 7.3834398699492436e-06, + "loss": 0.5533, + "step": 13356 + }, + { + "epoch": 1.08, + "grad_norm": 2.7537233246253368, + "learning_rate": 7.383054440863755e-06, + "loss": 0.7064, + "step": 13357 + }, + { + "epoch": 1.08, + "grad_norm": 3.677770330574415, + "learning_rate": 7.382668993454581e-06, + "loss": 0.696, + "step": 13358 + }, + { + "epoch": 1.09, + "grad_norm": 4.807014526354837, + "learning_rate": 7.3822835277246855e-06, + "loss": 0.7088, + "step": 13359 + }, + { + "epoch": 1.09, + "grad_norm": 2.43781972069784, + "learning_rate": 7.381898043677033e-06, + "loss": 0.6017, + "step": 13360 + }, + { + "epoch": 1.09, + "grad_norm": 3.3695284052223653, + "learning_rate": 7.381512541314586e-06, + "loss": 0.4097, + "step": 13361 + }, + { + "epoch": 1.09, + "grad_norm": 2.9678324283700683, + "learning_rate": 7.381127020640311e-06, + "loss": 0.6378, + "step": 13362 + }, + { + "epoch": 1.09, + "grad_norm": 3.8003089829825143, + "learning_rate": 7.38074148165717e-06, + "loss": 0.5963, + "step": 13363 + }, + { + "epoch": 1.09, + "grad_norm": 6.196766435437576, + "learning_rate": 7.3803559243681284e-06, + "loss": 0.673, + "step": 13364 + }, + { + "epoch": 1.09, + "grad_norm": 2.294039519703108, + "learning_rate": 7.379970348776152e-06, + "loss": 0.553, + "step": 13365 + }, + { + "epoch": 1.09, + "grad_norm": 5.899757171549006, + "learning_rate": 7.379584754884203e-06, + "loss": 0.7049, + "step": 13366 + }, + { + "epoch": 1.09, + "grad_norm": 3.7059539796944874, + "learning_rate": 7.379199142695249e-06, + "loss": 0.6357, + "step": 13367 + }, + { + "epoch": 1.09, + "grad_norm": 3.1902426560339863, + "learning_rate": 7.378813512212254e-06, + "loss": 0.5575, + "step": 13368 + }, + { + "epoch": 1.09, + "grad_norm": 2.9200037299678754, + "learning_rate": 7.378427863438183e-06, + "loss": 0.5566, + "step": 13369 + }, + { + "epoch": 1.09, + "grad_norm": 9.749428249369979, + "learning_rate": 7.378042196376001e-06, + "loss": 0.5314, + "step": 13370 + }, + { + "epoch": 1.09, + "grad_norm": 6.991227830196902, + "learning_rate": 7.377656511028672e-06, + "loss": 0.6268, + "step": 13371 + }, + { + "epoch": 1.09, + "grad_norm": 5.3220751691566, + "learning_rate": 7.377270807399166e-06, + "loss": 0.5925, + "step": 13372 + }, + { + "epoch": 1.09, + "grad_norm": 5.954751373968552, + "learning_rate": 7.376885085490446e-06, + "loss": 0.6219, + "step": 13373 + }, + { + "epoch": 1.09, + "grad_norm": 2.6724790075494553, + "learning_rate": 7.376499345305476e-06, + "loss": 0.4996, + "step": 13374 + }, + { + "epoch": 1.09, + "grad_norm": 3.1289076724745697, + "learning_rate": 7.376113586847226e-06, + "loss": 0.6523, + "step": 13375 + }, + { + "epoch": 1.09, + "grad_norm": 3.8733865557818734, + "learning_rate": 7.375727810118658e-06, + "loss": 0.5935, + "step": 13376 + }, + { + "epoch": 1.09, + "grad_norm": 3.9038676807925157, + "learning_rate": 7.375342015122743e-06, + "loss": 0.5889, + "step": 13377 + }, + { + "epoch": 1.09, + "grad_norm": 2.7039781874129614, + "learning_rate": 7.374956201862442e-06, + "loss": 0.6089, + "step": 13378 + }, + { + "epoch": 1.09, + "grad_norm": 5.138483684534424, + "learning_rate": 7.374570370340727e-06, + "loss": 0.5297, + "step": 13379 + }, + { + "epoch": 1.09, + "grad_norm": 3.206866366391148, + "learning_rate": 7.374184520560561e-06, + "loss": 0.524, + "step": 13380 + }, + { + "epoch": 1.09, + "grad_norm": 3.655404803967594, + "learning_rate": 7.3737986525249125e-06, + "loss": 0.5493, + "step": 13381 + }, + { + "epoch": 1.09, + "grad_norm": 2.4628375799818008, + "learning_rate": 7.373412766236747e-06, + "loss": 0.4964, + "step": 13382 + }, + { + "epoch": 1.09, + "grad_norm": 3.2655762919473554, + "learning_rate": 7.373026861699033e-06, + "loss": 0.6149, + "step": 13383 + }, + { + "epoch": 1.09, + "grad_norm": 3.4053705260825016, + "learning_rate": 7.372640938914739e-06, + "loss": 0.6629, + "step": 13384 + }, + { + "epoch": 1.09, + "grad_norm": 8.471285142533315, + "learning_rate": 7.37225499788683e-06, + "loss": 0.5483, + "step": 13385 + }, + { + "epoch": 1.09, + "grad_norm": 3.9783224020055434, + "learning_rate": 7.371869038618273e-06, + "loss": 0.5021, + "step": 13386 + }, + { + "epoch": 1.09, + "grad_norm": 3.840001067143406, + "learning_rate": 7.3714830611120395e-06, + "loss": 0.5964, + "step": 13387 + }, + { + "epoch": 1.09, + "grad_norm": 3.5814599815133668, + "learning_rate": 7.371097065371093e-06, + "loss": 0.6482, + "step": 13388 + }, + { + "epoch": 1.09, + "grad_norm": 5.617559810603534, + "learning_rate": 7.370711051398406e-06, + "loss": 0.6026, + "step": 13389 + }, + { + "epoch": 1.09, + "grad_norm": 3.3820942783628993, + "learning_rate": 7.370325019196941e-06, + "loss": 0.5133, + "step": 13390 + }, + { + "epoch": 1.09, + "grad_norm": 4.054158108350162, + "learning_rate": 7.369938968769672e-06, + "loss": 0.4765, + "step": 13391 + }, + { + "epoch": 1.09, + "grad_norm": 2.969236352258403, + "learning_rate": 7.369552900119563e-06, + "loss": 0.5939, + "step": 13392 + }, + { + "epoch": 1.09, + "grad_norm": 2.3418001747287067, + "learning_rate": 7.369166813249586e-06, + "loss": 0.6913, + "step": 13393 + }, + { + "epoch": 1.09, + "grad_norm": 2.852197552130543, + "learning_rate": 7.368780708162706e-06, + "loss": 0.5712, + "step": 13394 + }, + { + "epoch": 1.09, + "grad_norm": 4.690951798802362, + "learning_rate": 7.368394584861895e-06, + "loss": 0.796, + "step": 13395 + }, + { + "epoch": 1.09, + "grad_norm": 5.977634419900721, + "learning_rate": 7.368008443350121e-06, + "loss": 0.5366, + "step": 13396 + }, + { + "epoch": 1.09, + "grad_norm": 2.6512689487073464, + "learning_rate": 7.367622283630353e-06, + "loss": 0.7687, + "step": 13397 + }, + { + "epoch": 1.09, + "grad_norm": 4.154744275206109, + "learning_rate": 7.3672361057055585e-06, + "loss": 0.6488, + "step": 13398 + }, + { + "epoch": 1.09, + "grad_norm": 3.55996297500704, + "learning_rate": 7.366849909578711e-06, + "loss": 0.5997, + "step": 13399 + }, + { + "epoch": 1.09, + "grad_norm": 5.680832161136861, + "learning_rate": 7.366463695252776e-06, + "loss": 0.7422, + "step": 13400 + }, + { + "epoch": 1.09, + "grad_norm": 2.4042707631818256, + "learning_rate": 7.366077462730724e-06, + "loss": 0.5856, + "step": 13401 + }, + { + "epoch": 1.09, + "grad_norm": 3.123338294120542, + "learning_rate": 7.3656912120155265e-06, + "loss": 0.4939, + "step": 13402 + }, + { + "epoch": 1.09, + "grad_norm": 3.651941762600341, + "learning_rate": 7.365304943110152e-06, + "loss": 0.6437, + "step": 13403 + }, + { + "epoch": 1.09, + "grad_norm": 3.9283965068159477, + "learning_rate": 7.364918656017572e-06, + "loss": 0.5593, + "step": 13404 + }, + { + "epoch": 1.09, + "grad_norm": 6.46375267756135, + "learning_rate": 7.364532350740755e-06, + "loss": 0.7014, + "step": 13405 + }, + { + "epoch": 1.09, + "grad_norm": 5.996651642159815, + "learning_rate": 7.3641460272826715e-06, + "loss": 0.8186, + "step": 13406 + }, + { + "epoch": 1.09, + "grad_norm": 2.8991978700595733, + "learning_rate": 7.3637596856462945e-06, + "loss": 0.6996, + "step": 13407 + }, + { + "epoch": 1.09, + "grad_norm": 2.4830519091859036, + "learning_rate": 7.363373325834591e-06, + "loss": 0.487, + "step": 13408 + }, + { + "epoch": 1.09, + "grad_norm": 3.868601773523832, + "learning_rate": 7.362986947850534e-06, + "loss": 0.6429, + "step": 13409 + }, + { + "epoch": 1.09, + "grad_norm": 2.920457296137699, + "learning_rate": 7.362600551697094e-06, + "loss": 0.6708, + "step": 13410 + }, + { + "epoch": 1.09, + "grad_norm": 2.342645145660642, + "learning_rate": 7.3622141373772426e-06, + "loss": 0.521, + "step": 13411 + }, + { + "epoch": 1.09, + "grad_norm": 6.374356136300773, + "learning_rate": 7.36182770489395e-06, + "loss": 0.6696, + "step": 13412 + }, + { + "epoch": 1.09, + "grad_norm": 3.047173642193051, + "learning_rate": 7.3614412542501876e-06, + "loss": 0.513, + "step": 13413 + }, + { + "epoch": 1.09, + "grad_norm": 4.268807851668942, + "learning_rate": 7.361054785448928e-06, + "loss": 0.5674, + "step": 13414 + }, + { + "epoch": 1.09, + "grad_norm": 3.365899486108061, + "learning_rate": 7.360668298493142e-06, + "loss": 0.6785, + "step": 13415 + }, + { + "epoch": 1.09, + "grad_norm": 2.7832589555461102, + "learning_rate": 7.3602817933858015e-06, + "loss": 0.6633, + "step": 13416 + }, + { + "epoch": 1.09, + "grad_norm": 4.115218613629016, + "learning_rate": 7.359895270129878e-06, + "loss": 0.7278, + "step": 13417 + }, + { + "epoch": 1.09, + "grad_norm": 4.93241876152276, + "learning_rate": 7.359508728728344e-06, + "loss": 0.6568, + "step": 13418 + }, + { + "epoch": 1.09, + "grad_norm": 3.244464674999036, + "learning_rate": 7.359122169184171e-06, + "loss": 0.5654, + "step": 13419 + }, + { + "epoch": 1.09, + "grad_norm": 3.208430393175036, + "learning_rate": 7.358735591500333e-06, + "loss": 0.6408, + "step": 13420 + }, + { + "epoch": 1.09, + "grad_norm": 2.9771764239776575, + "learning_rate": 7.3583489956798e-06, + "loss": 0.5922, + "step": 13421 + }, + { + "epoch": 1.09, + "grad_norm": 2.6172242079302737, + "learning_rate": 7.357962381725548e-06, + "loss": 0.5347, + "step": 13422 + }, + { + "epoch": 1.09, + "grad_norm": 3.0308398789347737, + "learning_rate": 7.357575749640545e-06, + "loss": 0.7112, + "step": 13423 + }, + { + "epoch": 1.09, + "grad_norm": 4.253798493946265, + "learning_rate": 7.357189099427767e-06, + "loss": 0.5547, + "step": 13424 + }, + { + "epoch": 1.09, + "grad_norm": 3.730119601981954, + "learning_rate": 7.3568024310901875e-06, + "loss": 0.4586, + "step": 13425 + }, + { + "epoch": 1.09, + "grad_norm": 3.1542016367072003, + "learning_rate": 7.356415744630779e-06, + "loss": 0.6294, + "step": 13426 + }, + { + "epoch": 1.09, + "grad_norm": 9.481997050228143, + "learning_rate": 7.3560290400525125e-06, + "loss": 0.5434, + "step": 13427 + }, + { + "epoch": 1.09, + "grad_norm": 2.683830841207533, + "learning_rate": 7.355642317358366e-06, + "loss": 0.6899, + "step": 13428 + }, + { + "epoch": 1.09, + "grad_norm": 2.1936319839719096, + "learning_rate": 7.355255576551309e-06, + "loss": 0.5321, + "step": 13429 + }, + { + "epoch": 1.09, + "grad_norm": 3.9381660235072116, + "learning_rate": 7.354868817634317e-06, + "loss": 0.6586, + "step": 13430 + }, + { + "epoch": 1.09, + "grad_norm": 7.079058053557462, + "learning_rate": 7.354482040610363e-06, + "loss": 0.7153, + "step": 13431 + }, + { + "epoch": 1.09, + "grad_norm": 3.7320977034790244, + "learning_rate": 7.354095245482423e-06, + "loss": 0.441, + "step": 13432 + }, + { + "epoch": 1.09, + "grad_norm": 2.9172066726457424, + "learning_rate": 7.353708432253469e-06, + "loss": 0.5116, + "step": 13433 + }, + { + "epoch": 1.09, + "grad_norm": 3.329639278298864, + "learning_rate": 7.353321600926476e-06, + "loss": 0.6715, + "step": 13434 + }, + { + "epoch": 1.09, + "grad_norm": 3.3439046228501197, + "learning_rate": 7.352934751504418e-06, + "loss": 0.5837, + "step": 13435 + }, + { + "epoch": 1.09, + "grad_norm": 5.574842284300025, + "learning_rate": 7.352547883990271e-06, + "loss": 0.6844, + "step": 13436 + }, + { + "epoch": 1.09, + "grad_norm": 4.360454148616133, + "learning_rate": 7.352160998387007e-06, + "loss": 0.6115, + "step": 13437 + }, + { + "epoch": 1.09, + "grad_norm": 2.9082686960950763, + "learning_rate": 7.3517740946976035e-06, + "loss": 0.4728, + "step": 13438 + }, + { + "epoch": 1.09, + "grad_norm": 6.181744259678195, + "learning_rate": 7.351387172925033e-06, + "loss": 0.5889, + "step": 13439 + }, + { + "epoch": 1.09, + "grad_norm": 10.918153480782195, + "learning_rate": 7.351000233072274e-06, + "loss": 0.7201, + "step": 13440 + }, + { + "epoch": 1.09, + "grad_norm": 10.20131753397193, + "learning_rate": 7.3506132751422985e-06, + "loss": 0.5955, + "step": 13441 + }, + { + "epoch": 1.09, + "grad_norm": 2.357334017449769, + "learning_rate": 7.3502262991380835e-06, + "loss": 0.6144, + "step": 13442 + }, + { + "epoch": 1.09, + "grad_norm": 2.931643129704439, + "learning_rate": 7.3498393050626034e-06, + "loss": 0.465, + "step": 13443 + }, + { + "epoch": 1.09, + "grad_norm": 3.4207700503272296, + "learning_rate": 7.349452292918835e-06, + "loss": 0.6511, + "step": 13444 + }, + { + "epoch": 1.09, + "grad_norm": 3.5144603866879165, + "learning_rate": 7.349065262709754e-06, + "loss": 0.5947, + "step": 13445 + }, + { + "epoch": 1.09, + "grad_norm": 2.3408194512040215, + "learning_rate": 7.348678214438337e-06, + "loss": 0.5454, + "step": 13446 + }, + { + "epoch": 1.09, + "grad_norm": 3.3527654093400265, + "learning_rate": 7.348291148107557e-06, + "loss": 0.5844, + "step": 13447 + }, + { + "epoch": 1.09, + "grad_norm": 6.73129166832152, + "learning_rate": 7.3479040637203935e-06, + "loss": 0.5802, + "step": 13448 + }, + { + "epoch": 1.09, + "grad_norm": 4.4470233452306385, + "learning_rate": 7.347516961279821e-06, + "loss": 0.6498, + "step": 13449 + }, + { + "epoch": 1.09, + "grad_norm": 3.3274888497661697, + "learning_rate": 7.3471298407888165e-06, + "loss": 0.5489, + "step": 13450 + }, + { + "epoch": 1.09, + "grad_norm": 5.73446100552803, + "learning_rate": 7.346742702250358e-06, + "loss": 0.642, + "step": 13451 + }, + { + "epoch": 1.09, + "grad_norm": 2.4952394942598954, + "learning_rate": 7.346355545667419e-06, + "loss": 0.5061, + "step": 13452 + }, + { + "epoch": 1.09, + "grad_norm": 18.803310916818997, + "learning_rate": 7.345968371042981e-06, + "loss": 0.5707, + "step": 13453 + }, + { + "epoch": 1.09, + "grad_norm": 3.8711459653014866, + "learning_rate": 7.345581178380018e-06, + "loss": 0.5937, + "step": 13454 + }, + { + "epoch": 1.09, + "grad_norm": 4.704245202427621, + "learning_rate": 7.345193967681508e-06, + "loss": 0.6444, + "step": 13455 + }, + { + "epoch": 1.09, + "grad_norm": 3.6535478592179382, + "learning_rate": 7.344806738950425e-06, + "loss": 0.5379, + "step": 13456 + }, + { + "epoch": 1.09, + "grad_norm": 5.5212331133772725, + "learning_rate": 7.344419492189753e-06, + "loss": 0.4924, + "step": 13457 + }, + { + "epoch": 1.09, + "grad_norm": 3.5534392047519785, + "learning_rate": 7.344032227402465e-06, + "loss": 0.4625, + "step": 13458 + }, + { + "epoch": 1.09, + "grad_norm": 3.7249069663835366, + "learning_rate": 7.343644944591539e-06, + "loss": 0.6002, + "step": 13459 + }, + { + "epoch": 1.09, + "grad_norm": 5.9612294800676, + "learning_rate": 7.343257643759953e-06, + "loss": 0.599, + "step": 13460 + }, + { + "epoch": 1.09, + "grad_norm": 3.6127049233918727, + "learning_rate": 7.342870324910688e-06, + "loss": 0.7612, + "step": 13461 + }, + { + "epoch": 1.09, + "grad_norm": 4.464996920045892, + "learning_rate": 7.34248298804672e-06, + "loss": 0.6314, + "step": 13462 + }, + { + "epoch": 1.09, + "grad_norm": 3.517255524805919, + "learning_rate": 7.342095633171025e-06, + "loss": 0.7793, + "step": 13463 + }, + { + "epoch": 1.09, + "grad_norm": 2.7706442378490292, + "learning_rate": 7.3417082602865845e-06, + "loss": 0.6191, + "step": 13464 + }, + { + "epoch": 1.09, + "grad_norm": 4.555545635871531, + "learning_rate": 7.341320869396376e-06, + "loss": 0.717, + "step": 13465 + }, + { + "epoch": 1.09, + "grad_norm": 3.8157611043444413, + "learning_rate": 7.34093346050338e-06, + "loss": 0.638, + "step": 13466 + }, + { + "epoch": 1.09, + "grad_norm": 2.9898488651488053, + "learning_rate": 7.3405460336105726e-06, + "loss": 0.5815, + "step": 13467 + }, + { + "epoch": 1.09, + "grad_norm": 18.969251562968797, + "learning_rate": 7.340158588720934e-06, + "loss": 0.4965, + "step": 13468 + }, + { + "epoch": 1.09, + "grad_norm": 2.292176529253343, + "learning_rate": 7.339771125837443e-06, + "loss": 0.4479, + "step": 13469 + }, + { + "epoch": 1.09, + "grad_norm": 4.724094616916793, + "learning_rate": 7.339383644963078e-06, + "loss": 0.693, + "step": 13470 + }, + { + "epoch": 1.09, + "grad_norm": 3.5563815092278364, + "learning_rate": 7.338996146100822e-06, + "loss": 0.606, + "step": 13471 + }, + { + "epoch": 1.09, + "grad_norm": 3.5694023145069265, + "learning_rate": 7.338608629253649e-06, + "loss": 0.5273, + "step": 13472 + }, + { + "epoch": 1.09, + "grad_norm": 3.814591843391919, + "learning_rate": 7.338221094424545e-06, + "loss": 0.4615, + "step": 13473 + }, + { + "epoch": 1.09, + "grad_norm": 2.3815621487596803, + "learning_rate": 7.337833541616486e-06, + "loss": 0.5943, + "step": 13474 + }, + { + "epoch": 1.09, + "grad_norm": 2.8345208171741776, + "learning_rate": 7.337445970832451e-06, + "loss": 0.5789, + "step": 13475 + }, + { + "epoch": 1.09, + "grad_norm": 3.990302422910012, + "learning_rate": 7.337058382075421e-06, + "loss": 0.6557, + "step": 13476 + }, + { + "epoch": 1.09, + "grad_norm": 4.377866920367567, + "learning_rate": 7.336670775348379e-06, + "loss": 0.6824, + "step": 13477 + }, + { + "epoch": 1.09, + "grad_norm": 2.089018424450257, + "learning_rate": 7.336283150654303e-06, + "loss": 0.5621, + "step": 13478 + }, + { + "epoch": 1.09, + "grad_norm": 2.5477827818324004, + "learning_rate": 7.335895507996174e-06, + "loss": 0.4, + "step": 13479 + }, + { + "epoch": 1.09, + "grad_norm": 3.696650310536078, + "learning_rate": 7.33550784737697e-06, + "loss": 0.493, + "step": 13480 + }, + { + "epoch": 1.09, + "grad_norm": 2.5574118612161696, + "learning_rate": 7.335120168799675e-06, + "loss": 0.5891, + "step": 13481 + }, + { + "epoch": 1.1, + "grad_norm": 3.386627649159391, + "learning_rate": 7.33473247226727e-06, + "loss": 0.6264, + "step": 13482 + }, + { + "epoch": 1.1, + "grad_norm": 18.408373989860383, + "learning_rate": 7.334344757782735e-06, + "loss": 0.618, + "step": 13483 + }, + { + "epoch": 1.1, + "grad_norm": 2.9146463133579195, + "learning_rate": 7.333957025349051e-06, + "loss": 0.66, + "step": 13484 + }, + { + "epoch": 1.1, + "grad_norm": 6.9267255808415005, + "learning_rate": 7.3335692749692e-06, + "loss": 0.7153, + "step": 13485 + }, + { + "epoch": 1.1, + "grad_norm": 4.579137866249911, + "learning_rate": 7.333181506646163e-06, + "loss": 0.5246, + "step": 13486 + }, + { + "epoch": 1.1, + "grad_norm": 3.3301909924003104, + "learning_rate": 7.332793720382921e-06, + "loss": 0.513, + "step": 13487 + }, + { + "epoch": 1.1, + "grad_norm": 3.6475195832753027, + "learning_rate": 7.332405916182457e-06, + "loss": 0.4976, + "step": 13488 + }, + { + "epoch": 1.1, + "grad_norm": 2.6061015914992938, + "learning_rate": 7.332018094047752e-06, + "loss": 0.5899, + "step": 13489 + }, + { + "epoch": 1.1, + "grad_norm": 3.6413188892324535, + "learning_rate": 7.33163025398179e-06, + "loss": 0.6308, + "step": 13490 + }, + { + "epoch": 1.1, + "grad_norm": 2.6946965463132937, + "learning_rate": 7.3312423959875514e-06, + "loss": 0.5647, + "step": 13491 + }, + { + "epoch": 1.1, + "grad_norm": 5.141701003957877, + "learning_rate": 7.330854520068017e-06, + "loss": 0.5795, + "step": 13492 + }, + { + "epoch": 1.1, + "grad_norm": 2.4440313294029488, + "learning_rate": 7.3304666262261716e-06, + "loss": 0.636, + "step": 13493 + }, + { + "epoch": 1.1, + "grad_norm": 3.2288124472461246, + "learning_rate": 7.330078714464997e-06, + "loss": 0.6276, + "step": 13494 + }, + { + "epoch": 1.1, + "grad_norm": 3.341430905712477, + "learning_rate": 7.329690784787478e-06, + "loss": 0.5773, + "step": 13495 + }, + { + "epoch": 1.1, + "grad_norm": 2.2399777233803424, + "learning_rate": 7.329302837196592e-06, + "loss": 0.5462, + "step": 13496 + }, + { + "epoch": 1.1, + "grad_norm": 5.605321198857471, + "learning_rate": 7.328914871695327e-06, + "loss": 0.6128, + "step": 13497 + }, + { + "epoch": 1.1, + "grad_norm": 2.874917193864595, + "learning_rate": 7.328526888286666e-06, + "loss": 0.571, + "step": 13498 + }, + { + "epoch": 1.1, + "grad_norm": 3.4971126242514456, + "learning_rate": 7.328138886973589e-06, + "loss": 0.6595, + "step": 13499 + }, + { + "epoch": 1.1, + "grad_norm": 12.190328503403988, + "learning_rate": 7.327750867759081e-06, + "loss": 0.5138, + "step": 13500 + }, + { + "epoch": 1.1, + "grad_norm": 3.466214100452097, + "learning_rate": 7.327362830646127e-06, + "loss": 0.6627, + "step": 13501 + }, + { + "epoch": 1.1, + "grad_norm": 4.3354947054144075, + "learning_rate": 7.32697477563771e-06, + "loss": 0.5575, + "step": 13502 + }, + { + "epoch": 1.1, + "grad_norm": 4.2027677586716345, + "learning_rate": 7.326586702736813e-06, + "loss": 0.7054, + "step": 13503 + }, + { + "epoch": 1.1, + "grad_norm": 3.2703466827775682, + "learning_rate": 7.326198611946419e-06, + "loss": 0.5926, + "step": 13504 + }, + { + "epoch": 1.1, + "grad_norm": 3.624998139152789, + "learning_rate": 7.325810503269514e-06, + "loss": 0.6104, + "step": 13505 + }, + { + "epoch": 1.1, + "grad_norm": 3.0080727254089044, + "learning_rate": 7.325422376709082e-06, + "loss": 0.5625, + "step": 13506 + }, + { + "epoch": 1.1, + "grad_norm": 3.954914749444578, + "learning_rate": 7.325034232268107e-06, + "loss": 0.6458, + "step": 13507 + }, + { + "epoch": 1.1, + "grad_norm": 2.896013255977765, + "learning_rate": 7.3246460699495725e-06, + "loss": 0.5809, + "step": 13508 + }, + { + "epoch": 1.1, + "grad_norm": 17.46524313233495, + "learning_rate": 7.324257889756464e-06, + "loss": 0.5315, + "step": 13509 + }, + { + "epoch": 1.1, + "grad_norm": 3.577513130146795, + "learning_rate": 7.323869691691767e-06, + "loss": 0.6148, + "step": 13510 + }, + { + "epoch": 1.1, + "grad_norm": 3.8764419185000416, + "learning_rate": 7.323481475758467e-06, + "loss": 0.6576, + "step": 13511 + }, + { + "epoch": 1.1, + "grad_norm": 4.847178116767892, + "learning_rate": 7.323093241959546e-06, + "loss": 0.5483, + "step": 13512 + }, + { + "epoch": 1.1, + "grad_norm": 2.712003006796985, + "learning_rate": 7.322704990297992e-06, + "loss": 0.5917, + "step": 13513 + }, + { + "epoch": 1.1, + "grad_norm": 2.897607361700453, + "learning_rate": 7.322316720776788e-06, + "loss": 0.6499, + "step": 13514 + }, + { + "epoch": 1.1, + "grad_norm": 4.558409061649964, + "learning_rate": 7.321928433398922e-06, + "loss": 0.5401, + "step": 13515 + }, + { + "epoch": 1.1, + "grad_norm": 4.719251330486398, + "learning_rate": 7.32154012816738e-06, + "loss": 0.5595, + "step": 13516 + }, + { + "epoch": 1.1, + "grad_norm": 3.238679698966101, + "learning_rate": 7.321151805085143e-06, + "loss": 0.8019, + "step": 13517 + }, + { + "epoch": 1.1, + "grad_norm": 2.5570233681813996, + "learning_rate": 7.320763464155202e-06, + "loss": 0.6095, + "step": 13518 + }, + { + "epoch": 1.1, + "grad_norm": 3.445873274398044, + "learning_rate": 7.320375105380541e-06, + "loss": 0.6063, + "step": 13519 + }, + { + "epoch": 1.1, + "grad_norm": 4.606409674015923, + "learning_rate": 7.319986728764146e-06, + "loss": 0.5613, + "step": 13520 + }, + { + "epoch": 1.1, + "grad_norm": 3.5550820380594765, + "learning_rate": 7.319598334309001e-06, + "loss": 0.506, + "step": 13521 + }, + { + "epoch": 1.1, + "grad_norm": 2.7548082115758032, + "learning_rate": 7.319209922018098e-06, + "loss": 0.6644, + "step": 13522 + }, + { + "epoch": 1.1, + "grad_norm": 3.4446181681762518, + "learning_rate": 7.31882149189442e-06, + "loss": 0.5624, + "step": 13523 + }, + { + "epoch": 1.1, + "grad_norm": 7.762712303756813, + "learning_rate": 7.318433043940954e-06, + "loss": 0.5479, + "step": 13524 + }, + { + "epoch": 1.1, + "grad_norm": 3.6672561799376724, + "learning_rate": 7.318044578160685e-06, + "loss": 0.5259, + "step": 13525 + }, + { + "epoch": 1.1, + "grad_norm": 3.2079248353381953, + "learning_rate": 7.317656094556605e-06, + "loss": 0.5229, + "step": 13526 + }, + { + "epoch": 1.1, + "grad_norm": 2.448107930050599, + "learning_rate": 7.317267593131698e-06, + "loss": 0.5732, + "step": 13527 + }, + { + "epoch": 1.1, + "grad_norm": 3.48208376292226, + "learning_rate": 7.316879073888951e-06, + "loss": 0.6546, + "step": 13528 + }, + { + "epoch": 1.1, + "grad_norm": 4.87011793167953, + "learning_rate": 7.31649053683135e-06, + "loss": 0.5394, + "step": 13529 + }, + { + "epoch": 1.1, + "grad_norm": 2.4065631656474915, + "learning_rate": 7.316101981961885e-06, + "loss": 0.4963, + "step": 13530 + }, + { + "epoch": 1.1, + "grad_norm": 3.4261978583146524, + "learning_rate": 7.315713409283543e-06, + "loss": 0.5597, + "step": 13531 + }, + { + "epoch": 1.1, + "grad_norm": 3.0708375064384312, + "learning_rate": 7.315324818799313e-06, + "loss": 0.6599, + "step": 13532 + }, + { + "epoch": 1.1, + "grad_norm": 3.119498607794073, + "learning_rate": 7.31493621051218e-06, + "loss": 0.5502, + "step": 13533 + }, + { + "epoch": 1.1, + "grad_norm": 3.107578346702478, + "learning_rate": 7.314547584425136e-06, + "loss": 0.5194, + "step": 13534 + }, + { + "epoch": 1.1, + "grad_norm": 3.458998292235858, + "learning_rate": 7.314158940541165e-06, + "loss": 0.5856, + "step": 13535 + }, + { + "epoch": 1.1, + "grad_norm": 4.956017720784595, + "learning_rate": 7.313770278863258e-06, + "loss": 0.7769, + "step": 13536 + }, + { + "epoch": 1.1, + "grad_norm": 3.159744534267069, + "learning_rate": 7.313381599394401e-06, + "loss": 0.5113, + "step": 13537 + }, + { + "epoch": 1.1, + "grad_norm": 2.941019969044103, + "learning_rate": 7.312992902137587e-06, + "loss": 0.6011, + "step": 13538 + }, + { + "epoch": 1.1, + "grad_norm": 4.015922627952936, + "learning_rate": 7.312604187095801e-06, + "loss": 0.5856, + "step": 13539 + }, + { + "epoch": 1.1, + "grad_norm": 4.05314211414396, + "learning_rate": 7.3122154542720335e-06, + "loss": 0.496, + "step": 13540 + }, + { + "epoch": 1.1, + "grad_norm": 3.4463788117050544, + "learning_rate": 7.311826703669271e-06, + "loss": 0.4943, + "step": 13541 + }, + { + "epoch": 1.1, + "grad_norm": 7.18866687740904, + "learning_rate": 7.311437935290508e-06, + "loss": 0.6552, + "step": 13542 + }, + { + "epoch": 1.1, + "grad_norm": 6.341320372758751, + "learning_rate": 7.311049149138729e-06, + "loss": 0.5455, + "step": 13543 + }, + { + "epoch": 1.1, + "grad_norm": 4.408898606957012, + "learning_rate": 7.310660345216924e-06, + "loss": 0.6043, + "step": 13544 + }, + { + "epoch": 1.1, + "grad_norm": 3.2899498532532228, + "learning_rate": 7.310271523528084e-06, + "loss": 0.5835, + "step": 13545 + }, + { + "epoch": 1.1, + "grad_norm": 4.270912281069678, + "learning_rate": 7.309882684075199e-06, + "loss": 0.5058, + "step": 13546 + }, + { + "epoch": 1.1, + "grad_norm": 3.1832429803423357, + "learning_rate": 7.309493826861258e-06, + "loss": 0.6059, + "step": 13547 + }, + { + "epoch": 1.1, + "grad_norm": 3.467091211329391, + "learning_rate": 7.309104951889252e-06, + "loss": 0.649, + "step": 13548 + }, + { + "epoch": 1.1, + "grad_norm": 4.0405913863912275, + "learning_rate": 7.308716059162169e-06, + "loss": 0.5789, + "step": 13549 + }, + { + "epoch": 1.1, + "grad_norm": 4.178791172911453, + "learning_rate": 7.308327148683e-06, + "loss": 0.5797, + "step": 13550 + }, + { + "epoch": 1.1, + "grad_norm": 2.3173689717226282, + "learning_rate": 7.3079382204547365e-06, + "loss": 0.4513, + "step": 13551 + }, + { + "epoch": 1.1, + "grad_norm": 7.577770580343595, + "learning_rate": 7.307549274480369e-06, + "loss": 0.6018, + "step": 13552 + }, + { + "epoch": 1.1, + "grad_norm": 9.44835882331754, + "learning_rate": 7.3071603107628865e-06, + "loss": 0.5657, + "step": 13553 + }, + { + "epoch": 1.1, + "grad_norm": 4.47599878889487, + "learning_rate": 7.306771329305281e-06, + "loss": 0.6772, + "step": 13554 + }, + { + "epoch": 1.1, + "grad_norm": 4.315885006547513, + "learning_rate": 7.306382330110544e-06, + "loss": 0.6037, + "step": 13555 + }, + { + "epoch": 1.1, + "grad_norm": 4.069729257195434, + "learning_rate": 7.305993313181666e-06, + "loss": 0.4309, + "step": 13556 + }, + { + "epoch": 1.1, + "grad_norm": 3.46152939428144, + "learning_rate": 7.305604278521636e-06, + "loss": 0.5809, + "step": 13557 + }, + { + "epoch": 1.1, + "grad_norm": 2.721081011203209, + "learning_rate": 7.305215226133451e-06, + "loss": 0.4471, + "step": 13558 + }, + { + "epoch": 1.1, + "grad_norm": 2.7144132328526247, + "learning_rate": 7.304826156020096e-06, + "loss": 0.5945, + "step": 13559 + }, + { + "epoch": 1.1, + "grad_norm": 8.37350810475523, + "learning_rate": 7.304437068184567e-06, + "loss": 0.5545, + "step": 13560 + }, + { + "epoch": 1.1, + "grad_norm": 3.4706792239582818, + "learning_rate": 7.304047962629854e-06, + "loss": 0.5397, + "step": 13561 + }, + { + "epoch": 1.1, + "grad_norm": 5.737522773980567, + "learning_rate": 7.303658839358949e-06, + "loss": 0.6053, + "step": 13562 + }, + { + "epoch": 1.1, + "grad_norm": 7.991863958193902, + "learning_rate": 7.303269698374844e-06, + "loss": 0.788, + "step": 13563 + }, + { + "epoch": 1.1, + "grad_norm": 5.1569081698735895, + "learning_rate": 7.302880539680532e-06, + "loss": 0.587, + "step": 13564 + }, + { + "epoch": 1.1, + "grad_norm": 3.669724445338854, + "learning_rate": 7.302491363279004e-06, + "loss": 0.5617, + "step": 13565 + }, + { + "epoch": 1.1, + "grad_norm": 2.4799806545465546, + "learning_rate": 7.302102169173254e-06, + "loss": 0.485, + "step": 13566 + }, + { + "epoch": 1.1, + "grad_norm": 4.329439633083559, + "learning_rate": 7.301712957366273e-06, + "loss": 0.5074, + "step": 13567 + }, + { + "epoch": 1.1, + "grad_norm": 3.5407603098365454, + "learning_rate": 7.301323727861056e-06, + "loss": 0.4944, + "step": 13568 + }, + { + "epoch": 1.1, + "grad_norm": 4.0054824256282195, + "learning_rate": 7.300934480660593e-06, + "loss": 0.7084, + "step": 13569 + }, + { + "epoch": 1.1, + "grad_norm": 2.8037781900403838, + "learning_rate": 7.300545215767878e-06, + "loss": 0.6442, + "step": 13570 + }, + { + "epoch": 1.1, + "grad_norm": 5.946360632985224, + "learning_rate": 7.300155933185905e-06, + "loss": 0.7092, + "step": 13571 + }, + { + "epoch": 1.1, + "grad_norm": 4.141908972931778, + "learning_rate": 7.299766632917666e-06, + "loss": 0.529, + "step": 13572 + }, + { + "epoch": 1.1, + "grad_norm": 3.8391000099197234, + "learning_rate": 7.299377314966156e-06, + "loss": 0.4806, + "step": 13573 + }, + { + "epoch": 1.1, + "grad_norm": 5.4165428611223225, + "learning_rate": 7.298987979334367e-06, + "loss": 0.6158, + "step": 13574 + }, + { + "epoch": 1.1, + "grad_norm": 4.054570256657723, + "learning_rate": 7.298598626025293e-06, + "loss": 0.7201, + "step": 13575 + }, + { + "epoch": 1.1, + "grad_norm": 9.248602861189125, + "learning_rate": 7.298209255041929e-06, + "loss": 0.5464, + "step": 13576 + }, + { + "epoch": 1.1, + "grad_norm": 10.873927472003286, + "learning_rate": 7.2978198663872665e-06, + "loss": 0.7012, + "step": 13577 + }, + { + "epoch": 1.1, + "grad_norm": 3.666704866617444, + "learning_rate": 7.297430460064302e-06, + "loss": 0.5661, + "step": 13578 + }, + { + "epoch": 1.1, + "grad_norm": 3.406852661198488, + "learning_rate": 7.297041036076029e-06, + "loss": 0.5875, + "step": 13579 + }, + { + "epoch": 1.1, + "grad_norm": 3.8922825883595644, + "learning_rate": 7.296651594425441e-06, + "loss": 0.7036, + "step": 13580 + }, + { + "epoch": 1.1, + "grad_norm": 2.939279568834087, + "learning_rate": 7.296262135115533e-06, + "loss": 0.489, + "step": 13581 + }, + { + "epoch": 1.1, + "grad_norm": 3.9001271509756954, + "learning_rate": 7.2958726581493e-06, + "loss": 0.5783, + "step": 13582 + }, + { + "epoch": 1.1, + "grad_norm": 5.090768593288156, + "learning_rate": 7.295483163529736e-06, + "loss": 0.5157, + "step": 13583 + }, + { + "epoch": 1.1, + "grad_norm": 5.695163722737587, + "learning_rate": 7.295093651259837e-06, + "loss": 0.4091, + "step": 13584 + }, + { + "epoch": 1.1, + "grad_norm": 6.1503587818411685, + "learning_rate": 7.294704121342596e-06, + "loss": 0.7059, + "step": 13585 + }, + { + "epoch": 1.1, + "grad_norm": 4.595982494244561, + "learning_rate": 7.294314573781012e-06, + "loss": 0.5023, + "step": 13586 + }, + { + "epoch": 1.1, + "grad_norm": 4.811992358783734, + "learning_rate": 7.293925008578075e-06, + "loss": 0.5458, + "step": 13587 + }, + { + "epoch": 1.1, + "grad_norm": 2.539335337156614, + "learning_rate": 7.2935354257367855e-06, + "loss": 0.5069, + "step": 13588 + }, + { + "epoch": 1.1, + "grad_norm": 2.9754689287191525, + "learning_rate": 7.293145825260135e-06, + "loss": 0.4656, + "step": 13589 + }, + { + "epoch": 1.1, + "grad_norm": 4.374643705646175, + "learning_rate": 7.292756207151122e-06, + "loss": 0.5121, + "step": 13590 + }, + { + "epoch": 1.1, + "grad_norm": 11.160481761635483, + "learning_rate": 7.292366571412741e-06, + "loss": 0.5037, + "step": 13591 + }, + { + "epoch": 1.1, + "grad_norm": 5.483865060339781, + "learning_rate": 7.29197691804799e-06, + "loss": 0.684, + "step": 13592 + }, + { + "epoch": 1.1, + "grad_norm": 3.31470691122908, + "learning_rate": 7.2915872470598605e-06, + "loss": 0.4371, + "step": 13593 + }, + { + "epoch": 1.1, + "grad_norm": 2.72764792064609, + "learning_rate": 7.291197558451353e-06, + "loss": 0.6355, + "step": 13594 + }, + { + "epoch": 1.1, + "grad_norm": 4.746384486518186, + "learning_rate": 7.290807852225462e-06, + "loss": 0.6652, + "step": 13595 + }, + { + "epoch": 1.1, + "grad_norm": 4.1724240882029635, + "learning_rate": 7.290418128385186e-06, + "loss": 0.617, + "step": 13596 + }, + { + "epoch": 1.1, + "grad_norm": 8.395128518659952, + "learning_rate": 7.290028386933518e-06, + "loss": 0.5053, + "step": 13597 + }, + { + "epoch": 1.1, + "grad_norm": 2.537125099289778, + "learning_rate": 7.289638627873459e-06, + "loss": 0.4778, + "step": 13598 + }, + { + "epoch": 1.1, + "grad_norm": 3.828844361020568, + "learning_rate": 7.289248851208003e-06, + "loss": 0.668, + "step": 13599 + }, + { + "epoch": 1.1, + "grad_norm": 3.738144377230147, + "learning_rate": 7.288859056940148e-06, + "loss": 0.4496, + "step": 13600 + }, + { + "epoch": 1.1, + "grad_norm": 2.5324930536936354, + "learning_rate": 7.288469245072891e-06, + "loss": 0.5376, + "step": 13601 + }, + { + "epoch": 1.1, + "grad_norm": 4.017819418566862, + "learning_rate": 7.288079415609229e-06, + "loss": 0.6945, + "step": 13602 + }, + { + "epoch": 1.1, + "grad_norm": 7.879159822813833, + "learning_rate": 7.287689568552161e-06, + "loss": 0.7104, + "step": 13603 + }, + { + "epoch": 1.1, + "grad_norm": 2.844743820253322, + "learning_rate": 7.287299703904682e-06, + "loss": 0.6756, + "step": 13604 + }, + { + "epoch": 1.1, + "grad_norm": 3.5662607641722843, + "learning_rate": 7.2869098216697934e-06, + "loss": 0.5007, + "step": 13605 + }, + { + "epoch": 1.11, + "grad_norm": 2.5600655453042065, + "learning_rate": 7.286519921850489e-06, + "loss": 0.671, + "step": 13606 + }, + { + "epoch": 1.11, + "grad_norm": 8.035498911679829, + "learning_rate": 7.28613000444977e-06, + "loss": 0.6832, + "step": 13607 + }, + { + "epoch": 1.11, + "grad_norm": 4.526303496241881, + "learning_rate": 7.285740069470633e-06, + "loss": 0.6311, + "step": 13608 + }, + { + "epoch": 1.11, + "grad_norm": 6.597877776277447, + "learning_rate": 7.285350116916074e-06, + "loss": 0.5635, + "step": 13609 + }, + { + "epoch": 1.11, + "grad_norm": 3.5848214898594453, + "learning_rate": 7.284960146789097e-06, + "loss": 0.5096, + "step": 13610 + }, + { + "epoch": 1.11, + "grad_norm": 2.4774790972932212, + "learning_rate": 7.284570159092696e-06, + "loss": 0.6451, + "step": 13611 + }, + { + "epoch": 1.11, + "grad_norm": 4.099595520630844, + "learning_rate": 7.284180153829872e-06, + "loss": 0.5972, + "step": 13612 + }, + { + "epoch": 1.11, + "grad_norm": 3.5914068408288857, + "learning_rate": 7.283790131003623e-06, + "loss": 0.5741, + "step": 13613 + }, + { + "epoch": 1.11, + "grad_norm": 3.1171858573239404, + "learning_rate": 7.283400090616948e-06, + "loss": 0.5853, + "step": 13614 + }, + { + "epoch": 1.11, + "grad_norm": 2.805075246820459, + "learning_rate": 7.283010032672844e-06, + "loss": 0.5167, + "step": 13615 + }, + { + "epoch": 1.11, + "grad_norm": 3.331404559748941, + "learning_rate": 7.282619957174315e-06, + "loss": 0.5707, + "step": 13616 + }, + { + "epoch": 1.11, + "grad_norm": 4.827123802040391, + "learning_rate": 7.282229864124356e-06, + "loss": 0.45, + "step": 13617 + }, + { + "epoch": 1.11, + "grad_norm": 4.674823034587977, + "learning_rate": 7.2818397535259685e-06, + "loss": 0.6018, + "step": 13618 + }, + { + "epoch": 1.11, + "grad_norm": 2.4115448723124886, + "learning_rate": 7.281449625382151e-06, + "loss": 0.5965, + "step": 13619 + }, + { + "epoch": 1.11, + "grad_norm": 2.8496826932441754, + "learning_rate": 7.281059479695906e-06, + "loss": 0.4554, + "step": 13620 + }, + { + "epoch": 1.11, + "grad_norm": 3.1166968602242227, + "learning_rate": 7.280669316470229e-06, + "loss": 0.6485, + "step": 13621 + }, + { + "epoch": 1.11, + "grad_norm": 3.597477060902457, + "learning_rate": 7.2802791357081236e-06, + "loss": 0.616, + "step": 13622 + }, + { + "epoch": 1.11, + "grad_norm": 2.54740171127477, + "learning_rate": 7.279888937412587e-06, + "loss": 0.6266, + "step": 13623 + }, + { + "epoch": 1.11, + "grad_norm": 3.7886230570323183, + "learning_rate": 7.279498721586623e-06, + "loss": 0.7589, + "step": 13624 + }, + { + "epoch": 1.11, + "grad_norm": 3.600326995782971, + "learning_rate": 7.279108488233231e-06, + "loss": 0.4986, + "step": 13625 + }, + { + "epoch": 1.11, + "grad_norm": 2.365628714708935, + "learning_rate": 7.2787182373554085e-06, + "loss": 0.5026, + "step": 13626 + }, + { + "epoch": 1.11, + "grad_norm": 3.8072110976070106, + "learning_rate": 7.278327968956159e-06, + "loss": 0.6004, + "step": 13627 + }, + { + "epoch": 1.11, + "grad_norm": 3.4000086205518705, + "learning_rate": 7.277937683038484e-06, + "loss": 0.6412, + "step": 13628 + }, + { + "epoch": 1.11, + "grad_norm": 12.052754719416749, + "learning_rate": 7.277547379605383e-06, + "loss": 0.7265, + "step": 13629 + }, + { + "epoch": 1.11, + "grad_norm": 3.290521225808238, + "learning_rate": 7.2771570586598576e-06, + "loss": 0.5871, + "step": 13630 + }, + { + "epoch": 1.11, + "grad_norm": 8.44642544046495, + "learning_rate": 7.276766720204907e-06, + "loss": 0.5083, + "step": 13631 + }, + { + "epoch": 1.11, + "grad_norm": 3.618153842211457, + "learning_rate": 7.276376364243536e-06, + "loss": 0.5141, + "step": 13632 + }, + { + "epoch": 1.11, + "grad_norm": 3.22437007278642, + "learning_rate": 7.275985990778745e-06, + "loss": 0.5498, + "step": 13633 + }, + { + "epoch": 1.11, + "grad_norm": 3.1396820332048248, + "learning_rate": 7.275595599813534e-06, + "loss": 0.4359, + "step": 13634 + }, + { + "epoch": 1.11, + "grad_norm": 2.2956769549208964, + "learning_rate": 7.275205191350907e-06, + "loss": 0.6314, + "step": 13635 + }, + { + "epoch": 1.11, + "grad_norm": 5.373027564793716, + "learning_rate": 7.274814765393864e-06, + "loss": 0.5064, + "step": 13636 + }, + { + "epoch": 1.11, + "grad_norm": 3.2087241182326713, + "learning_rate": 7.274424321945408e-06, + "loss": 0.6844, + "step": 13637 + }, + { + "epoch": 1.11, + "grad_norm": 3.850744892233335, + "learning_rate": 7.274033861008542e-06, + "loss": 0.6981, + "step": 13638 + }, + { + "epoch": 1.11, + "grad_norm": 2.8135283505772137, + "learning_rate": 7.273643382586266e-06, + "loss": 0.5551, + "step": 13639 + }, + { + "epoch": 1.11, + "grad_norm": 4.823781858190363, + "learning_rate": 7.273252886681585e-06, + "loss": 0.5302, + "step": 13640 + }, + { + "epoch": 1.11, + "grad_norm": 6.95097058641305, + "learning_rate": 7.2728623732975e-06, + "loss": 0.5494, + "step": 13641 + }, + { + "epoch": 1.11, + "grad_norm": 3.938086509159436, + "learning_rate": 7.272471842437015e-06, + "loss": 0.6281, + "step": 13642 + }, + { + "epoch": 1.11, + "grad_norm": 3.2491894769278615, + "learning_rate": 7.272081294103131e-06, + "loss": 0.5295, + "step": 13643 + }, + { + "epoch": 1.11, + "grad_norm": 2.9046462949286327, + "learning_rate": 7.271690728298852e-06, + "loss": 0.5519, + "step": 13644 + }, + { + "epoch": 1.11, + "grad_norm": 4.528543221855132, + "learning_rate": 7.271300145027182e-06, + "loss": 0.5703, + "step": 13645 + }, + { + "epoch": 1.11, + "grad_norm": 3.560129751321755, + "learning_rate": 7.2709095442911236e-06, + "loss": 0.7193, + "step": 13646 + }, + { + "epoch": 1.11, + "grad_norm": 2.8951147975485854, + "learning_rate": 7.27051892609368e-06, + "loss": 0.6322, + "step": 13647 + }, + { + "epoch": 1.11, + "grad_norm": 3.8027551698005913, + "learning_rate": 7.2701282904378525e-06, + "loss": 0.53, + "step": 13648 + }, + { + "epoch": 1.11, + "grad_norm": 3.009623067233355, + "learning_rate": 7.269737637326649e-06, + "loss": 0.5484, + "step": 13649 + }, + { + "epoch": 1.11, + "grad_norm": 3.230212847971009, + "learning_rate": 7.269346966763071e-06, + "loss": 0.4083, + "step": 13650 + }, + { + "epoch": 1.11, + "grad_norm": 4.214078454548609, + "learning_rate": 7.268956278750122e-06, + "loss": 0.5674, + "step": 13651 + }, + { + "epoch": 1.11, + "grad_norm": 4.050525411313377, + "learning_rate": 7.2685655732908064e-06, + "loss": 0.6467, + "step": 13652 + }, + { + "epoch": 1.11, + "grad_norm": 3.3000538062489673, + "learning_rate": 7.268174850388131e-06, + "loss": 0.5376, + "step": 13653 + }, + { + "epoch": 1.11, + "grad_norm": 2.8406946846862366, + "learning_rate": 7.267784110045096e-06, + "loss": 0.557, + "step": 13654 + }, + { + "epoch": 1.11, + "grad_norm": 8.22274558048246, + "learning_rate": 7.267393352264708e-06, + "loss": 0.5333, + "step": 13655 + }, + { + "epoch": 1.11, + "grad_norm": 2.9546001891123845, + "learning_rate": 7.267002577049972e-06, + "loss": 0.4562, + "step": 13656 + }, + { + "epoch": 1.11, + "grad_norm": 7.384747069188464, + "learning_rate": 7.266611784403892e-06, + "loss": 0.6083, + "step": 13657 + }, + { + "epoch": 1.11, + "grad_norm": 4.960575889470557, + "learning_rate": 7.266220974329472e-06, + "loss": 0.4839, + "step": 13658 + }, + { + "epoch": 1.11, + "grad_norm": 5.760522025014579, + "learning_rate": 7.265830146829719e-06, + "loss": 0.8203, + "step": 13659 + }, + { + "epoch": 1.11, + "grad_norm": 2.720569229733579, + "learning_rate": 7.2654393019076365e-06, + "loss": 0.7054, + "step": 13660 + }, + { + "epoch": 1.11, + "grad_norm": 5.322842485156056, + "learning_rate": 7.265048439566231e-06, + "loss": 0.6491, + "step": 13661 + }, + { + "epoch": 1.11, + "grad_norm": 4.830703679417691, + "learning_rate": 7.2646575598085065e-06, + "loss": 0.6712, + "step": 13662 + }, + { + "epoch": 1.11, + "grad_norm": 2.3070340659312154, + "learning_rate": 7.264266662637469e-06, + "loss": 0.6548, + "step": 13663 + }, + { + "epoch": 1.11, + "grad_norm": 3.1204332912785344, + "learning_rate": 7.263875748056125e-06, + "loss": 0.6155, + "step": 13664 + }, + { + "epoch": 1.11, + "grad_norm": 3.5955227083838075, + "learning_rate": 7.2634848160674805e-06, + "loss": 0.6416, + "step": 13665 + }, + { + "epoch": 1.11, + "grad_norm": 2.7656666887570722, + "learning_rate": 7.26309386667454e-06, + "loss": 0.4581, + "step": 13666 + }, + { + "epoch": 1.11, + "grad_norm": 3.2051705283746683, + "learning_rate": 7.26270289988031e-06, + "loss": 0.5317, + "step": 13667 + }, + { + "epoch": 1.11, + "grad_norm": 3.60560063568297, + "learning_rate": 7.2623119156877976e-06, + "loss": 0.6239, + "step": 13668 + }, + { + "epoch": 1.11, + "grad_norm": 6.668021961127223, + "learning_rate": 7.261920914100008e-06, + "loss": 0.5405, + "step": 13669 + }, + { + "epoch": 1.11, + "grad_norm": 2.7457893845248758, + "learning_rate": 7.261529895119949e-06, + "loss": 0.6306, + "step": 13670 + }, + { + "epoch": 1.11, + "grad_norm": 2.6840329869681816, + "learning_rate": 7.2611388587506245e-06, + "loss": 0.5027, + "step": 13671 + }, + { + "epoch": 1.11, + "grad_norm": 4.168427182404945, + "learning_rate": 7.260747804995045e-06, + "loss": 0.5979, + "step": 13672 + }, + { + "epoch": 1.11, + "grad_norm": 3.041583516612649, + "learning_rate": 7.260356733856215e-06, + "loss": 0.6731, + "step": 13673 + }, + { + "epoch": 1.11, + "grad_norm": 4.9262524762455335, + "learning_rate": 7.2599656453371426e-06, + "loss": 0.6092, + "step": 13674 + }, + { + "epoch": 1.11, + "grad_norm": 2.9129632448592915, + "learning_rate": 7.259574539440833e-06, + "loss": 0.7235, + "step": 13675 + }, + { + "epoch": 1.11, + "grad_norm": 2.7629937667746383, + "learning_rate": 7.259183416170296e-06, + "loss": 0.6612, + "step": 13676 + }, + { + "epoch": 1.11, + "grad_norm": 3.6304337433268703, + "learning_rate": 7.2587922755285374e-06, + "loss": 0.6052, + "step": 13677 + }, + { + "epoch": 1.11, + "grad_norm": 3.4560103336122503, + "learning_rate": 7.258401117518565e-06, + "loss": 0.6594, + "step": 13678 + }, + { + "epoch": 1.11, + "grad_norm": 4.938568660329166, + "learning_rate": 7.258009942143387e-06, + "loss": 0.6374, + "step": 13679 + }, + { + "epoch": 1.11, + "grad_norm": 3.625959071665032, + "learning_rate": 7.257618749406012e-06, + "loss": 0.4654, + "step": 13680 + }, + { + "epoch": 1.11, + "grad_norm": 3.342589056076939, + "learning_rate": 7.257227539309445e-06, + "loss": 0.7786, + "step": 13681 + }, + { + "epoch": 1.11, + "grad_norm": 3.9275297612554554, + "learning_rate": 7.256836311856697e-06, + "loss": 0.5244, + "step": 13682 + }, + { + "epoch": 1.11, + "grad_norm": 2.8750540339371806, + "learning_rate": 7.256445067050774e-06, + "loss": 0.5843, + "step": 13683 + }, + { + "epoch": 1.11, + "grad_norm": 2.867133340954212, + "learning_rate": 7.2560538048946874e-06, + "loss": 0.4666, + "step": 13684 + }, + { + "epoch": 1.11, + "grad_norm": 3.348985641589021, + "learning_rate": 7.255662525391443e-06, + "loss": 0.6404, + "step": 13685 + }, + { + "epoch": 1.11, + "grad_norm": 3.7940246344832738, + "learning_rate": 7.2552712285440485e-06, + "loss": 0.6081, + "step": 13686 + }, + { + "epoch": 1.11, + "grad_norm": 2.3820628047988133, + "learning_rate": 7.2548799143555145e-06, + "loss": 0.6042, + "step": 13687 + }, + { + "epoch": 1.11, + "grad_norm": 5.82420927209422, + "learning_rate": 7.2544885828288514e-06, + "loss": 0.586, + "step": 13688 + }, + { + "epoch": 1.11, + "grad_norm": 3.749865580424538, + "learning_rate": 7.254097233967065e-06, + "loss": 0.5887, + "step": 13689 + }, + { + "epoch": 1.11, + "grad_norm": 4.573537936515338, + "learning_rate": 7.253705867773167e-06, + "loss": 0.621, + "step": 13690 + }, + { + "epoch": 1.11, + "grad_norm": 2.883335237272948, + "learning_rate": 7.253314484250165e-06, + "loss": 0.4828, + "step": 13691 + }, + { + "epoch": 1.11, + "grad_norm": 168.65855162998915, + "learning_rate": 7.25292308340107e-06, + "loss": 0.6059, + "step": 13692 + }, + { + "epoch": 1.11, + "grad_norm": 2.0567768512908606, + "learning_rate": 7.25253166522889e-06, + "loss": 0.5182, + "step": 13693 + }, + { + "epoch": 1.11, + "grad_norm": 2.55393114590187, + "learning_rate": 7.252140229736635e-06, + "loss": 0.6399, + "step": 13694 + }, + { + "epoch": 1.11, + "grad_norm": 9.703428966641564, + "learning_rate": 7.251748776927315e-06, + "loss": 0.631, + "step": 13695 + }, + { + "epoch": 1.11, + "grad_norm": 5.8407023873118025, + "learning_rate": 7.25135730680394e-06, + "loss": 0.7433, + "step": 13696 + }, + { + "epoch": 1.11, + "grad_norm": 3.275765330255824, + "learning_rate": 7.25096581936952e-06, + "loss": 0.7617, + "step": 13697 + }, + { + "epoch": 1.11, + "grad_norm": 2.076815594301001, + "learning_rate": 7.2505743146270656e-06, + "loss": 0.4659, + "step": 13698 + }, + { + "epoch": 1.11, + "grad_norm": 2.3373062117208154, + "learning_rate": 7.250182792579587e-06, + "loss": 0.5542, + "step": 13699 + }, + { + "epoch": 1.11, + "grad_norm": 6.326351678182756, + "learning_rate": 7.249791253230094e-06, + "loss": 0.6495, + "step": 13700 + }, + { + "epoch": 1.11, + "grad_norm": 2.588511802418635, + "learning_rate": 7.2493996965815976e-06, + "loss": 0.5579, + "step": 13701 + }, + { + "epoch": 1.11, + "grad_norm": 3.4345067324826126, + "learning_rate": 7.249008122637109e-06, + "loss": 0.508, + "step": 13702 + }, + { + "epoch": 1.11, + "grad_norm": 3.455976206609785, + "learning_rate": 7.248616531399639e-06, + "loss": 0.6621, + "step": 13703 + }, + { + "epoch": 1.11, + "grad_norm": 2.2468877595006487, + "learning_rate": 7.2482249228721965e-06, + "loss": 0.6294, + "step": 13704 + }, + { + "epoch": 1.11, + "grad_norm": 3.336489416247208, + "learning_rate": 7.247833297057796e-06, + "loss": 0.5929, + "step": 13705 + }, + { + "epoch": 1.11, + "grad_norm": 3.352030269271094, + "learning_rate": 7.247441653959448e-06, + "loss": 0.6797, + "step": 13706 + }, + { + "epoch": 1.11, + "grad_norm": 6.455987224174443, + "learning_rate": 7.247049993580162e-06, + "loss": 0.5335, + "step": 13707 + }, + { + "epoch": 1.11, + "grad_norm": 4.161599268951711, + "learning_rate": 7.24665831592295e-06, + "loss": 0.703, + "step": 13708 + }, + { + "epoch": 1.11, + "grad_norm": 5.127961260542747, + "learning_rate": 7.246266620990825e-06, + "loss": 0.6588, + "step": 13709 + }, + { + "epoch": 1.11, + "grad_norm": 11.152777537696295, + "learning_rate": 7.245874908786798e-06, + "loss": 0.6498, + "step": 13710 + }, + { + "epoch": 1.11, + "grad_norm": 2.965389147515734, + "learning_rate": 7.245483179313884e-06, + "loss": 0.541, + "step": 13711 + }, + { + "epoch": 1.11, + "grad_norm": 3.4586407513574806, + "learning_rate": 7.245091432575088e-06, + "loss": 0.576, + "step": 13712 + }, + { + "epoch": 1.11, + "grad_norm": 3.1803393192682385, + "learning_rate": 7.244699668573428e-06, + "loss": 0.6211, + "step": 13713 + }, + { + "epoch": 1.11, + "grad_norm": 3.205115546836763, + "learning_rate": 7.2443078873119145e-06, + "loss": 0.4287, + "step": 13714 + }, + { + "epoch": 1.11, + "grad_norm": 11.735905986620926, + "learning_rate": 7.243916088793561e-06, + "loss": 0.769, + "step": 13715 + }, + { + "epoch": 1.11, + "grad_norm": 8.72597881360186, + "learning_rate": 7.243524273021379e-06, + "loss": 0.6037, + "step": 13716 + }, + { + "epoch": 1.11, + "grad_norm": 3.0367790302394773, + "learning_rate": 7.2431324399983806e-06, + "loss": 0.6474, + "step": 13717 + }, + { + "epoch": 1.11, + "grad_norm": 3.4717628797993445, + "learning_rate": 7.242740589727579e-06, + "loss": 0.5807, + "step": 13718 + }, + { + "epoch": 1.11, + "grad_norm": 8.153124968326608, + "learning_rate": 7.242348722211991e-06, + "loss": 0.4099, + "step": 13719 + }, + { + "epoch": 1.11, + "grad_norm": 3.3190918125883537, + "learning_rate": 7.241956837454622e-06, + "loss": 0.5153, + "step": 13720 + }, + { + "epoch": 1.11, + "grad_norm": 5.327688805317918, + "learning_rate": 7.241564935458493e-06, + "loss": 0.6314, + "step": 13721 + }, + { + "epoch": 1.11, + "grad_norm": 2.288197494273073, + "learning_rate": 7.241173016226613e-06, + "loss": 0.5377, + "step": 13722 + }, + { + "epoch": 1.11, + "grad_norm": 2.6292898211266262, + "learning_rate": 7.240781079761998e-06, + "loss": 0.5014, + "step": 13723 + }, + { + "epoch": 1.11, + "grad_norm": 2.1768929274770397, + "learning_rate": 7.240389126067658e-06, + "loss": 0.4638, + "step": 13724 + }, + { + "epoch": 1.11, + "grad_norm": 6.2558964187211314, + "learning_rate": 7.2399971551466105e-06, + "loss": 0.4922, + "step": 13725 + }, + { + "epoch": 1.11, + "grad_norm": 5.031110740890489, + "learning_rate": 7.2396051670018685e-06, + "loss": 0.5348, + "step": 13726 + }, + { + "epoch": 1.11, + "grad_norm": 5.314010823657378, + "learning_rate": 7.239213161636446e-06, + "loss": 0.6956, + "step": 13727 + }, + { + "epoch": 1.11, + "grad_norm": 3.6657947949212337, + "learning_rate": 7.238821139053354e-06, + "loss": 0.63, + "step": 13728 + }, + { + "epoch": 1.12, + "grad_norm": 3.0904740632647276, + "learning_rate": 7.238429099255613e-06, + "loss": 0.6213, + "step": 13729 + }, + { + "epoch": 1.12, + "grad_norm": 2.7797692311792477, + "learning_rate": 7.238037042246233e-06, + "loss": 0.6163, + "step": 13730 + }, + { + "epoch": 1.12, + "grad_norm": 3.2708110106374897, + "learning_rate": 7.23764496802823e-06, + "loss": 0.4747, + "step": 13731 + }, + { + "epoch": 1.12, + "grad_norm": 8.342987777374985, + "learning_rate": 7.237252876604617e-06, + "loss": 0.6911, + "step": 13732 + }, + { + "epoch": 1.12, + "grad_norm": 2.9431556945703083, + "learning_rate": 7.236860767978411e-06, + "loss": 0.5518, + "step": 13733 + }, + { + "epoch": 1.12, + "grad_norm": 2.947640758211327, + "learning_rate": 7.2364686421526265e-06, + "loss": 0.5633, + "step": 13734 + }, + { + "epoch": 1.12, + "grad_norm": 3.535012343741763, + "learning_rate": 7.236076499130279e-06, + "loss": 0.5511, + "step": 13735 + }, + { + "epoch": 1.12, + "grad_norm": 4.528931065468862, + "learning_rate": 7.235684338914382e-06, + "loss": 0.5711, + "step": 13736 + }, + { + "epoch": 1.12, + "grad_norm": 2.651878840428295, + "learning_rate": 7.235292161507952e-06, + "loss": 0.5151, + "step": 13737 + }, + { + "epoch": 1.12, + "grad_norm": 3.734653233718342, + "learning_rate": 7.234899966914005e-06, + "loss": 0.6354, + "step": 13738 + }, + { + "epoch": 1.12, + "grad_norm": 3.6016425562290704, + "learning_rate": 7.234507755135557e-06, + "loss": 0.6073, + "step": 13739 + }, + { + "epoch": 1.12, + "grad_norm": 3.2749555987118457, + "learning_rate": 7.234115526175621e-06, + "loss": 0.665, + "step": 13740 + }, + { + "epoch": 1.12, + "grad_norm": 2.3730387744696624, + "learning_rate": 7.233723280037216e-06, + "loss": 0.568, + "step": 13741 + }, + { + "epoch": 1.12, + "grad_norm": 4.2483717443397335, + "learning_rate": 7.233331016723357e-06, + "loss": 0.67, + "step": 13742 + }, + { + "epoch": 1.12, + "grad_norm": 3.264771619915305, + "learning_rate": 7.2329387362370605e-06, + "loss": 0.6045, + "step": 13743 + }, + { + "epoch": 1.12, + "grad_norm": 3.8350954358834795, + "learning_rate": 7.232546438581341e-06, + "loss": 0.6234, + "step": 13744 + }, + { + "epoch": 1.12, + "grad_norm": 2.9479147912347066, + "learning_rate": 7.232154123759217e-06, + "loss": 0.4711, + "step": 13745 + }, + { + "epoch": 1.12, + "grad_norm": 2.911420313334315, + "learning_rate": 7.231761791773705e-06, + "loss": 0.6761, + "step": 13746 + }, + { + "epoch": 1.12, + "grad_norm": 4.022530378352138, + "learning_rate": 7.231369442627821e-06, + "loss": 0.6366, + "step": 13747 + }, + { + "epoch": 1.12, + "grad_norm": 2.4230112214579047, + "learning_rate": 7.23097707632458e-06, + "loss": 0.5104, + "step": 13748 + }, + { + "epoch": 1.12, + "grad_norm": 4.310376389057019, + "learning_rate": 7.230584692867003e-06, + "loss": 0.4716, + "step": 13749 + }, + { + "epoch": 1.12, + "grad_norm": 12.180468241073962, + "learning_rate": 7.230192292258105e-06, + "loss": 0.5348, + "step": 13750 + }, + { + "epoch": 1.12, + "grad_norm": 2.845133121693099, + "learning_rate": 7.229799874500902e-06, + "loss": 0.6172, + "step": 13751 + }, + { + "epoch": 1.12, + "grad_norm": 5.329772625426198, + "learning_rate": 7.229407439598413e-06, + "loss": 0.5634, + "step": 13752 + }, + { + "epoch": 1.12, + "grad_norm": 9.164802219986807, + "learning_rate": 7.2290149875536555e-06, + "loss": 0.5821, + "step": 13753 + }, + { + "epoch": 1.12, + "grad_norm": 2.9236811019280515, + "learning_rate": 7.228622518369647e-06, + "loss": 0.4933, + "step": 13754 + }, + { + "epoch": 1.12, + "grad_norm": 3.255980576626028, + "learning_rate": 7.228230032049405e-06, + "loss": 0.5434, + "step": 13755 + }, + { + "epoch": 1.12, + "grad_norm": 4.39678082320362, + "learning_rate": 7.2278375285959455e-06, + "loss": 0.6526, + "step": 13756 + }, + { + "epoch": 1.12, + "grad_norm": 3.5103525411741052, + "learning_rate": 7.227445008012291e-06, + "loss": 0.6443, + "step": 13757 + }, + { + "epoch": 1.12, + "grad_norm": 2.1891001436810886, + "learning_rate": 7.227052470301454e-06, + "loss": 0.6112, + "step": 13758 + }, + { + "epoch": 1.12, + "grad_norm": 4.463762184761607, + "learning_rate": 7.226659915466459e-06, + "loss": 0.4771, + "step": 13759 + }, + { + "epoch": 1.12, + "grad_norm": 3.5798292919014494, + "learning_rate": 7.226267343510319e-06, + "loss": 0.6131, + "step": 13760 + }, + { + "epoch": 1.12, + "grad_norm": 2.895298596538284, + "learning_rate": 7.225874754436055e-06, + "loss": 0.7167, + "step": 13761 + }, + { + "epoch": 1.12, + "grad_norm": 2.834722205240585, + "learning_rate": 7.225482148246687e-06, + "loss": 0.6605, + "step": 13762 + }, + { + "epoch": 1.12, + "grad_norm": 2.3004681876366866, + "learning_rate": 7.225089524945231e-06, + "loss": 0.5126, + "step": 13763 + }, + { + "epoch": 1.12, + "grad_norm": 2.1027116224513867, + "learning_rate": 7.224696884534708e-06, + "loss": 0.4398, + "step": 13764 + }, + { + "epoch": 1.12, + "grad_norm": 4.6301745030118555, + "learning_rate": 7.224304227018135e-06, + "loss": 0.5517, + "step": 13765 + }, + { + "epoch": 1.12, + "grad_norm": 3.5367749463926073, + "learning_rate": 7.223911552398534e-06, + "loss": 0.4263, + "step": 13766 + }, + { + "epoch": 1.12, + "grad_norm": 19.024696152630558, + "learning_rate": 7.223518860678922e-06, + "loss": 0.6696, + "step": 13767 + }, + { + "epoch": 1.12, + "grad_norm": 7.8105903389616405, + "learning_rate": 7.2231261518623185e-06, + "loss": 0.6686, + "step": 13768 + }, + { + "epoch": 1.12, + "grad_norm": 8.583645713253404, + "learning_rate": 7.222733425951745e-06, + "loss": 0.6792, + "step": 13769 + }, + { + "epoch": 1.12, + "grad_norm": 2.686878938979837, + "learning_rate": 7.22234068295022e-06, + "loss": 0.6836, + "step": 13770 + }, + { + "epoch": 1.12, + "grad_norm": 4.175943040189135, + "learning_rate": 7.221947922860764e-06, + "loss": 0.559, + "step": 13771 + }, + { + "epoch": 1.12, + "grad_norm": 3.842331323402045, + "learning_rate": 7.221555145686396e-06, + "loss": 0.7488, + "step": 13772 + }, + { + "epoch": 1.12, + "grad_norm": 5.2834534469433665, + "learning_rate": 7.221162351430135e-06, + "loss": 0.5906, + "step": 13773 + }, + { + "epoch": 1.12, + "grad_norm": 6.82704507702863, + "learning_rate": 7.220769540095006e-06, + "loss": 0.5866, + "step": 13774 + }, + { + "epoch": 1.12, + "grad_norm": 3.591465783066619, + "learning_rate": 7.220376711684025e-06, + "loss": 0.5638, + "step": 13775 + }, + { + "epoch": 1.12, + "grad_norm": 4.26573050633974, + "learning_rate": 7.219983866200213e-06, + "loss": 0.6138, + "step": 13776 + }, + { + "epoch": 1.12, + "grad_norm": 2.729945733359918, + "learning_rate": 7.219591003646592e-06, + "loss": 0.5794, + "step": 13777 + }, + { + "epoch": 1.12, + "grad_norm": 3.3486741755387546, + "learning_rate": 7.2191981240261825e-06, + "loss": 0.5606, + "step": 13778 + }, + { + "epoch": 1.12, + "grad_norm": 3.5433319340297142, + "learning_rate": 7.2188052273420055e-06, + "loss": 0.6684, + "step": 13779 + }, + { + "epoch": 1.12, + "grad_norm": 8.62487166182258, + "learning_rate": 7.218412313597081e-06, + "loss": 0.6376, + "step": 13780 + }, + { + "epoch": 1.12, + "grad_norm": 8.412388793335955, + "learning_rate": 7.21801938279443e-06, + "loss": 0.5682, + "step": 13781 + }, + { + "epoch": 1.12, + "grad_norm": 3.5715871970199546, + "learning_rate": 7.217626434937076e-06, + "loss": 0.5783, + "step": 13782 + }, + { + "epoch": 1.12, + "grad_norm": 5.098582570731343, + "learning_rate": 7.217233470028039e-06, + "loss": 0.6361, + "step": 13783 + }, + { + "epoch": 1.12, + "grad_norm": 4.559091280972446, + "learning_rate": 7.216840488070341e-06, + "loss": 0.5303, + "step": 13784 + }, + { + "epoch": 1.12, + "grad_norm": 3.4899853791386235, + "learning_rate": 7.216447489067002e-06, + "loss": 0.5092, + "step": 13785 + }, + { + "epoch": 1.12, + "grad_norm": 4.434691305521869, + "learning_rate": 7.216054473021046e-06, + "loss": 0.6606, + "step": 13786 + }, + { + "epoch": 1.12, + "grad_norm": 2.5709695605313176, + "learning_rate": 7.215661439935494e-06, + "loss": 0.6045, + "step": 13787 + }, + { + "epoch": 1.12, + "grad_norm": 6.072519950124297, + "learning_rate": 7.215268389813369e-06, + "loss": 0.7048, + "step": 13788 + }, + { + "epoch": 1.12, + "grad_norm": 2.9344085640848854, + "learning_rate": 7.214875322657691e-06, + "loss": 0.4699, + "step": 13789 + }, + { + "epoch": 1.12, + "grad_norm": 2.8935437444044805, + "learning_rate": 7.214482238471485e-06, + "loss": 0.5875, + "step": 13790 + }, + { + "epoch": 1.12, + "grad_norm": 3.909764683105959, + "learning_rate": 7.2140891372577724e-06, + "loss": 0.7566, + "step": 13791 + }, + { + "epoch": 1.12, + "grad_norm": 2.8950347683647606, + "learning_rate": 7.213696019019576e-06, + "loss": 0.6663, + "step": 13792 + }, + { + "epoch": 1.12, + "grad_norm": 2.616807920059923, + "learning_rate": 7.213302883759917e-06, + "loss": 0.5711, + "step": 13793 + }, + { + "epoch": 1.12, + "grad_norm": 6.539236654690772, + "learning_rate": 7.21290973148182e-06, + "loss": 0.6792, + "step": 13794 + }, + { + "epoch": 1.12, + "grad_norm": 3.1885493683863806, + "learning_rate": 7.212516562188309e-06, + "loss": 0.6979, + "step": 13795 + }, + { + "epoch": 1.12, + "grad_norm": 2.742976003919392, + "learning_rate": 7.212123375882404e-06, + "loss": 0.5894, + "step": 13796 + }, + { + "epoch": 1.12, + "grad_norm": 2.5769838867925365, + "learning_rate": 7.211730172567131e-06, + "loss": 0.6136, + "step": 13797 + }, + { + "epoch": 1.12, + "grad_norm": 2.7449666248796567, + "learning_rate": 7.211336952245511e-06, + "loss": 0.65, + "step": 13798 + }, + { + "epoch": 1.12, + "grad_norm": 3.55244069683914, + "learning_rate": 7.2109437149205705e-06, + "loss": 0.6071, + "step": 13799 + }, + { + "epoch": 1.12, + "grad_norm": 2.5151252493599605, + "learning_rate": 7.2105504605953315e-06, + "loss": 0.3771, + "step": 13800 + }, + { + "epoch": 1.12, + "grad_norm": 3.663623033127427, + "learning_rate": 7.210157189272817e-06, + "loss": 0.6494, + "step": 13801 + }, + { + "epoch": 1.12, + "grad_norm": 3.8786365793688997, + "learning_rate": 7.209763900956053e-06, + "loss": 0.5431, + "step": 13802 + }, + { + "epoch": 1.12, + "grad_norm": 3.5515019897879285, + "learning_rate": 7.209370595648061e-06, + "loss": 0.671, + "step": 13803 + }, + { + "epoch": 1.12, + "grad_norm": 2.8296975687995567, + "learning_rate": 7.208977273351867e-06, + "loss": 0.5313, + "step": 13804 + }, + { + "epoch": 1.12, + "grad_norm": 2.7700278535294585, + "learning_rate": 7.208583934070496e-06, + "loss": 0.5599, + "step": 13805 + }, + { + "epoch": 1.12, + "grad_norm": 6.317564579045138, + "learning_rate": 7.208190577806969e-06, + "loss": 0.6371, + "step": 13806 + }, + { + "epoch": 1.12, + "grad_norm": 3.3483814303048294, + "learning_rate": 7.207797204564315e-06, + "loss": 0.7047, + "step": 13807 + }, + { + "epoch": 1.12, + "grad_norm": 8.083423535140293, + "learning_rate": 7.2074038143455576e-06, + "loss": 0.6715, + "step": 13808 + }, + { + "epoch": 1.12, + "grad_norm": 3.0789909503231887, + "learning_rate": 7.207010407153719e-06, + "loss": 0.568, + "step": 13809 + }, + { + "epoch": 1.12, + "grad_norm": 4.6439865729004355, + "learning_rate": 7.2066169829918245e-06, + "loss": 0.6112, + "step": 13810 + }, + { + "epoch": 1.12, + "grad_norm": 2.724868689608358, + "learning_rate": 7.206223541862902e-06, + "loss": 0.6705, + "step": 13811 + }, + { + "epoch": 1.12, + "grad_norm": 7.727766980671883, + "learning_rate": 7.2058300837699755e-06, + "loss": 0.7059, + "step": 13812 + }, + { + "epoch": 1.12, + "grad_norm": 2.7906760545338956, + "learning_rate": 7.20543660871607e-06, + "loss": 0.7561, + "step": 13813 + }, + { + "epoch": 1.12, + "grad_norm": 2.631927545966649, + "learning_rate": 7.205043116704211e-06, + "loss": 0.5901, + "step": 13814 + }, + { + "epoch": 1.12, + "grad_norm": 18.108004916024033, + "learning_rate": 7.204649607737424e-06, + "loss": 0.441, + "step": 13815 + }, + { + "epoch": 1.12, + "grad_norm": 3.8382259607021068, + "learning_rate": 7.204256081818735e-06, + "loss": 0.69, + "step": 13816 + }, + { + "epoch": 1.12, + "grad_norm": 4.791665815119219, + "learning_rate": 7.203862538951171e-06, + "loss": 0.4563, + "step": 13817 + }, + { + "epoch": 1.12, + "grad_norm": 2.6611345991951083, + "learning_rate": 7.2034689791377555e-06, + "loss": 0.5902, + "step": 13818 + }, + { + "epoch": 1.12, + "grad_norm": 4.15457437587834, + "learning_rate": 7.203075402381516e-06, + "loss": 0.5801, + "step": 13819 + }, + { + "epoch": 1.12, + "grad_norm": 5.477199139999866, + "learning_rate": 7.20268180868548e-06, + "loss": 0.6452, + "step": 13820 + }, + { + "epoch": 1.12, + "grad_norm": 4.904122212453779, + "learning_rate": 7.202288198052673e-06, + "loss": 0.5942, + "step": 13821 + }, + { + "epoch": 1.12, + "grad_norm": 3.976966525544117, + "learning_rate": 7.201894570486119e-06, + "loss": 0.6664, + "step": 13822 + }, + { + "epoch": 1.12, + "grad_norm": 3.427898919844347, + "learning_rate": 7.201500925988848e-06, + "loss": 0.4538, + "step": 13823 + }, + { + "epoch": 1.12, + "grad_norm": 6.2578702341618, + "learning_rate": 7.201107264563887e-06, + "loss": 0.5162, + "step": 13824 + }, + { + "epoch": 1.12, + "grad_norm": 4.05762462007046, + "learning_rate": 7.200713586214261e-06, + "loss": 0.5559, + "step": 13825 + }, + { + "epoch": 1.12, + "grad_norm": 4.628852187013995, + "learning_rate": 7.200319890942996e-06, + "loss": 0.6448, + "step": 13826 + }, + { + "epoch": 1.12, + "grad_norm": 3.9620794070063985, + "learning_rate": 7.199926178753123e-06, + "loss": 0.456, + "step": 13827 + }, + { + "epoch": 1.12, + "grad_norm": 4.394319225017799, + "learning_rate": 7.199532449647666e-06, + "loss": 0.5275, + "step": 13828 + }, + { + "epoch": 1.12, + "grad_norm": 8.62773341941716, + "learning_rate": 7.199138703629654e-06, + "loss": 0.681, + "step": 13829 + }, + { + "epoch": 1.12, + "grad_norm": 3.617551161454771, + "learning_rate": 7.198744940702113e-06, + "loss": 0.6179, + "step": 13830 + }, + { + "epoch": 1.12, + "grad_norm": 10.562824619541656, + "learning_rate": 7.1983511608680735e-06, + "loss": 0.4861, + "step": 13831 + }, + { + "epoch": 1.12, + "grad_norm": 3.5139947185503213, + "learning_rate": 7.197957364130562e-06, + "loss": 0.6002, + "step": 13832 + }, + { + "epoch": 1.12, + "grad_norm": 2.6797663263308773, + "learning_rate": 7.197563550492605e-06, + "loss": 0.6601, + "step": 13833 + }, + { + "epoch": 1.12, + "grad_norm": 4.512459034070046, + "learning_rate": 7.197169719957233e-06, + "loss": 0.5329, + "step": 13834 + }, + { + "epoch": 1.12, + "grad_norm": 2.904311365987366, + "learning_rate": 7.196775872527473e-06, + "loss": 0.4502, + "step": 13835 + }, + { + "epoch": 1.12, + "grad_norm": 5.691522272756192, + "learning_rate": 7.196382008206353e-06, + "loss": 0.6618, + "step": 13836 + }, + { + "epoch": 1.12, + "grad_norm": 3.4168930110864237, + "learning_rate": 7.195988126996902e-06, + "loss": 0.6327, + "step": 13837 + }, + { + "epoch": 1.12, + "grad_norm": 3.4863749768076877, + "learning_rate": 7.195594228902148e-06, + "loss": 0.6133, + "step": 13838 + }, + { + "epoch": 1.12, + "grad_norm": 3.5017008467170587, + "learning_rate": 7.195200313925119e-06, + "loss": 0.5927, + "step": 13839 + }, + { + "epoch": 1.12, + "grad_norm": 3.874689883190253, + "learning_rate": 7.1948063820688475e-06, + "loss": 0.5838, + "step": 13840 + }, + { + "epoch": 1.12, + "grad_norm": 5.238980568930057, + "learning_rate": 7.19441243333636e-06, + "loss": 0.5301, + "step": 13841 + }, + { + "epoch": 1.12, + "grad_norm": 3.497707253411865, + "learning_rate": 7.194018467730683e-06, + "loss": 0.5963, + "step": 13842 + }, + { + "epoch": 1.12, + "grad_norm": 2.7609624200498035, + "learning_rate": 7.193624485254852e-06, + "loss": 0.5497, + "step": 13843 + }, + { + "epoch": 1.12, + "grad_norm": 2.6990515158370294, + "learning_rate": 7.1932304859118915e-06, + "loss": 0.4653, + "step": 13844 + }, + { + "epoch": 1.12, + "grad_norm": 3.213511660305424, + "learning_rate": 7.192836469704832e-06, + "loss": 0.565, + "step": 13845 + }, + { + "epoch": 1.12, + "grad_norm": 3.404358032919781, + "learning_rate": 7.192442436636704e-06, + "loss": 0.4971, + "step": 13846 + }, + { + "epoch": 1.12, + "grad_norm": 4.059642568573351, + "learning_rate": 7.192048386710537e-06, + "loss": 0.5758, + "step": 13847 + }, + { + "epoch": 1.12, + "grad_norm": 2.3726941122453185, + "learning_rate": 7.191654319929361e-06, + "loss": 0.5411, + "step": 13848 + }, + { + "epoch": 1.12, + "grad_norm": 2.753535090242175, + "learning_rate": 7.191260236296206e-06, + "loss": 0.4848, + "step": 13849 + }, + { + "epoch": 1.12, + "grad_norm": 2.877288808729877, + "learning_rate": 7.190866135814101e-06, + "loss": 0.6716, + "step": 13850 + }, + { + "epoch": 1.12, + "grad_norm": 3.3204068582815216, + "learning_rate": 7.1904720184860774e-06, + "loss": 0.7113, + "step": 13851 + }, + { + "epoch": 1.13, + "grad_norm": 4.277001926729191, + "learning_rate": 7.190077884315166e-06, + "loss": 0.6293, + "step": 13852 + }, + { + "epoch": 1.13, + "grad_norm": 4.400370386253204, + "learning_rate": 7.1896837333043975e-06, + "loss": 0.5902, + "step": 13853 + }, + { + "epoch": 1.13, + "grad_norm": 5.448542043581633, + "learning_rate": 7.189289565456801e-06, + "loss": 0.5405, + "step": 13854 + }, + { + "epoch": 1.13, + "grad_norm": 3.824459056322676, + "learning_rate": 7.188895380775409e-06, + "loss": 0.5968, + "step": 13855 + }, + { + "epoch": 1.13, + "grad_norm": 3.317817256082273, + "learning_rate": 7.188501179263252e-06, + "loss": 0.6357, + "step": 13856 + }, + { + "epoch": 1.13, + "grad_norm": 2.6726714125510203, + "learning_rate": 7.18810696092336e-06, + "loss": 0.5401, + "step": 13857 + }, + { + "epoch": 1.13, + "grad_norm": 3.7628811782106806, + "learning_rate": 7.187712725758765e-06, + "loss": 0.6106, + "step": 13858 + }, + { + "epoch": 1.13, + "grad_norm": 2.8866577996147993, + "learning_rate": 7.1873184737724985e-06, + "loss": 0.5815, + "step": 13859 + }, + { + "epoch": 1.13, + "grad_norm": 7.6066320686319635, + "learning_rate": 7.186924204967593e-06, + "loss": 0.548, + "step": 13860 + }, + { + "epoch": 1.13, + "grad_norm": 4.269830986547436, + "learning_rate": 7.186529919347077e-06, + "loss": 0.573, + "step": 13861 + }, + { + "epoch": 1.13, + "grad_norm": 3.174004846347893, + "learning_rate": 7.186135616913985e-06, + "loss": 0.4036, + "step": 13862 + }, + { + "epoch": 1.13, + "grad_norm": 3.006786151509747, + "learning_rate": 7.185741297671348e-06, + "loss": 0.5308, + "step": 13863 + }, + { + "epoch": 1.13, + "grad_norm": 2.995313483494356, + "learning_rate": 7.185346961622199e-06, + "loss": 0.5799, + "step": 13864 + }, + { + "epoch": 1.13, + "grad_norm": 5.406691261706471, + "learning_rate": 7.184952608769569e-06, + "loss": 0.6245, + "step": 13865 + }, + { + "epoch": 1.13, + "grad_norm": 2.830366075239703, + "learning_rate": 7.184558239116488e-06, + "loss": 0.638, + "step": 13866 + }, + { + "epoch": 1.13, + "grad_norm": 2.3652318803646497, + "learning_rate": 7.184163852665993e-06, + "loss": 0.5574, + "step": 13867 + }, + { + "epoch": 1.13, + "grad_norm": 2.561945064372659, + "learning_rate": 7.1837694494211145e-06, + "loss": 0.5379, + "step": 13868 + }, + { + "epoch": 1.13, + "grad_norm": 2.6509450773172216, + "learning_rate": 7.183375029384884e-06, + "loss": 0.5769, + "step": 13869 + }, + { + "epoch": 1.13, + "grad_norm": 1.9732951562469092, + "learning_rate": 7.182980592560334e-06, + "loss": 0.69, + "step": 13870 + }, + { + "epoch": 1.13, + "grad_norm": 3.4653405882391244, + "learning_rate": 7.1825861389505005e-06, + "loss": 0.6843, + "step": 13871 + }, + { + "epoch": 1.13, + "grad_norm": 4.255171132710585, + "learning_rate": 7.1821916685584135e-06, + "loss": 0.5056, + "step": 13872 + }, + { + "epoch": 1.13, + "grad_norm": 3.912955800085182, + "learning_rate": 7.181797181387107e-06, + "loss": 0.6407, + "step": 13873 + }, + { + "epoch": 1.13, + "grad_norm": 11.051891974625585, + "learning_rate": 7.181402677439614e-06, + "loss": 0.4559, + "step": 13874 + }, + { + "epoch": 1.13, + "grad_norm": 2.6112562406306647, + "learning_rate": 7.181008156718969e-06, + "loss": 0.656, + "step": 13875 + }, + { + "epoch": 1.13, + "grad_norm": 3.46346601593531, + "learning_rate": 7.180613619228206e-06, + "loss": 0.5725, + "step": 13876 + }, + { + "epoch": 1.13, + "grad_norm": 3.162121858153935, + "learning_rate": 7.180219064970356e-06, + "loss": 0.425, + "step": 13877 + }, + { + "epoch": 1.13, + "grad_norm": 3.3750302099340024, + "learning_rate": 7.179824493948455e-06, + "loss": 0.6943, + "step": 13878 + }, + { + "epoch": 1.13, + "grad_norm": 2.421298215040587, + "learning_rate": 7.179429906165536e-06, + "loss": 0.5966, + "step": 13879 + }, + { + "epoch": 1.13, + "grad_norm": 3.857748371382513, + "learning_rate": 7.179035301624634e-06, + "loss": 0.4828, + "step": 13880 + }, + { + "epoch": 1.13, + "grad_norm": 2.882910367105991, + "learning_rate": 7.178640680328782e-06, + "loss": 0.6013, + "step": 13881 + }, + { + "epoch": 1.13, + "grad_norm": 4.021187163042069, + "learning_rate": 7.178246042281015e-06, + "loss": 0.6163, + "step": 13882 + }, + { + "epoch": 1.13, + "grad_norm": 3.740447707479144, + "learning_rate": 7.177851387484366e-06, + "loss": 0.5675, + "step": 13883 + }, + { + "epoch": 1.13, + "grad_norm": 2.4682595381741272, + "learning_rate": 7.177456715941872e-06, + "loss": 0.5142, + "step": 13884 + }, + { + "epoch": 1.13, + "grad_norm": 4.065577893387178, + "learning_rate": 7.1770620276565664e-06, + "loss": 0.6111, + "step": 13885 + }, + { + "epoch": 1.13, + "grad_norm": 3.5442062194116515, + "learning_rate": 7.176667322631484e-06, + "loss": 0.6339, + "step": 13886 + }, + { + "epoch": 1.13, + "grad_norm": 8.189626846772645, + "learning_rate": 7.176272600869658e-06, + "loss": 0.6138, + "step": 13887 + }, + { + "epoch": 1.13, + "grad_norm": 4.035802494212654, + "learning_rate": 7.175877862374127e-06, + "loss": 0.6243, + "step": 13888 + }, + { + "epoch": 1.13, + "grad_norm": 6.5514011498489175, + "learning_rate": 7.175483107147926e-06, + "loss": 0.6103, + "step": 13889 + }, + { + "epoch": 1.13, + "grad_norm": 3.378438197949367, + "learning_rate": 7.175088335194087e-06, + "loss": 0.5298, + "step": 13890 + }, + { + "epoch": 1.13, + "grad_norm": 2.7796497625254704, + "learning_rate": 7.174693546515648e-06, + "loss": 0.6225, + "step": 13891 + }, + { + "epoch": 1.13, + "grad_norm": 9.984470715459475, + "learning_rate": 7.174298741115644e-06, + "loss": 0.6573, + "step": 13892 + }, + { + "epoch": 1.13, + "grad_norm": 6.784402754254416, + "learning_rate": 7.1739039189971095e-06, + "loss": 0.5571, + "step": 13893 + }, + { + "epoch": 1.13, + "grad_norm": 2.8388458596600556, + "learning_rate": 7.173509080163083e-06, + "loss": 0.5664, + "step": 13894 + }, + { + "epoch": 1.13, + "grad_norm": 49.990620700172826, + "learning_rate": 7.1731142246165975e-06, + "loss": 0.6003, + "step": 13895 + }, + { + "epoch": 1.13, + "grad_norm": 2.1482417742483153, + "learning_rate": 7.172719352360692e-06, + "loss": 0.5578, + "step": 13896 + }, + { + "epoch": 1.13, + "grad_norm": 5.4210498332023045, + "learning_rate": 7.1723244633984005e-06, + "loss": 0.6748, + "step": 13897 + }, + { + "epoch": 1.13, + "grad_norm": 5.506822762437197, + "learning_rate": 7.171929557732761e-06, + "loss": 0.5981, + "step": 13898 + }, + { + "epoch": 1.13, + "grad_norm": 4.055471858355198, + "learning_rate": 7.171534635366808e-06, + "loss": 0.5702, + "step": 13899 + }, + { + "epoch": 1.13, + "grad_norm": 2.9263229491751015, + "learning_rate": 7.17113969630358e-06, + "loss": 0.6373, + "step": 13900 + }, + { + "epoch": 1.13, + "grad_norm": 3.3697766408580376, + "learning_rate": 7.1707447405461125e-06, + "loss": 0.6359, + "step": 13901 + }, + { + "epoch": 1.13, + "grad_norm": 4.0312086507405605, + "learning_rate": 7.170349768097443e-06, + "loss": 0.5251, + "step": 13902 + }, + { + "epoch": 1.13, + "grad_norm": 5.423719232380969, + "learning_rate": 7.169954778960608e-06, + "loss": 0.7085, + "step": 13903 + }, + { + "epoch": 1.13, + "grad_norm": 4.074180620964624, + "learning_rate": 7.169559773138647e-06, + "loss": 0.5344, + "step": 13904 + }, + { + "epoch": 1.13, + "grad_norm": 3.5229812874395168, + "learning_rate": 7.169164750634594e-06, + "loss": 0.5216, + "step": 13905 + }, + { + "epoch": 1.13, + "grad_norm": 4.248614185204561, + "learning_rate": 7.168769711451488e-06, + "loss": 0.6562, + "step": 13906 + }, + { + "epoch": 1.13, + "grad_norm": 2.9487944776994457, + "learning_rate": 7.168374655592365e-06, + "loss": 0.5842, + "step": 13907 + }, + { + "epoch": 1.13, + "grad_norm": 1.8052205599697673, + "learning_rate": 7.167979583060265e-06, + "loss": 0.4995, + "step": 13908 + }, + { + "epoch": 1.13, + "grad_norm": 2.9625159902876086, + "learning_rate": 7.167584493858225e-06, + "loss": 0.551, + "step": 13909 + }, + { + "epoch": 1.13, + "grad_norm": 7.254494796730688, + "learning_rate": 7.167189387989283e-06, + "loss": 0.4748, + "step": 13910 + }, + { + "epoch": 1.13, + "grad_norm": 3.1591906525983178, + "learning_rate": 7.166794265456475e-06, + "loss": 0.6117, + "step": 13911 + }, + { + "epoch": 1.13, + "grad_norm": 2.1517150867522874, + "learning_rate": 7.166399126262842e-06, + "loss": 0.6234, + "step": 13912 + }, + { + "epoch": 1.13, + "grad_norm": 2.7310674796524035, + "learning_rate": 7.16600397041142e-06, + "loss": 0.7281, + "step": 13913 + }, + { + "epoch": 1.13, + "grad_norm": 3.480284811360218, + "learning_rate": 7.165608797905249e-06, + "loss": 0.6267, + "step": 13914 + }, + { + "epoch": 1.13, + "grad_norm": 12.568051698152436, + "learning_rate": 7.165213608747367e-06, + "loss": 0.5812, + "step": 13915 + }, + { + "epoch": 1.13, + "grad_norm": 3.436715474336368, + "learning_rate": 7.164818402940813e-06, + "loss": 0.5472, + "step": 13916 + }, + { + "epoch": 1.13, + "grad_norm": 4.472822346034293, + "learning_rate": 7.164423180488625e-06, + "loss": 0.7649, + "step": 13917 + }, + { + "epoch": 1.13, + "grad_norm": 4.587258830421729, + "learning_rate": 7.164027941393843e-06, + "loss": 0.5744, + "step": 13918 + }, + { + "epoch": 1.13, + "grad_norm": 2.526690831142121, + "learning_rate": 7.163632685659504e-06, + "loss": 0.6233, + "step": 13919 + }, + { + "epoch": 1.13, + "grad_norm": 2.4346480322733797, + "learning_rate": 7.1632374132886506e-06, + "loss": 0.4204, + "step": 13920 + }, + { + "epoch": 1.13, + "grad_norm": 2.713576049329069, + "learning_rate": 7.1628421242843195e-06, + "loss": 0.641, + "step": 13921 + }, + { + "epoch": 1.13, + "grad_norm": 2.6168031588221403, + "learning_rate": 7.16244681864955e-06, + "loss": 0.5223, + "step": 13922 + }, + { + "epoch": 1.13, + "grad_norm": 2.502868536294418, + "learning_rate": 7.162051496387382e-06, + "loss": 0.5765, + "step": 13923 + }, + { + "epoch": 1.13, + "grad_norm": 5.876081240191781, + "learning_rate": 7.161656157500857e-06, + "loss": 0.5308, + "step": 13924 + }, + { + "epoch": 1.13, + "grad_norm": 2.414597893699812, + "learning_rate": 7.161260801993013e-06, + "loss": 0.5309, + "step": 13925 + }, + { + "epoch": 1.13, + "grad_norm": 2.8507010897142995, + "learning_rate": 7.160865429866891e-06, + "loss": 0.6039, + "step": 13926 + }, + { + "epoch": 1.13, + "grad_norm": 4.724602346032643, + "learning_rate": 7.16047004112553e-06, + "loss": 0.6457, + "step": 13927 + }, + { + "epoch": 1.13, + "grad_norm": 4.895762245453891, + "learning_rate": 7.16007463577197e-06, + "loss": 0.5523, + "step": 13928 + }, + { + "epoch": 1.13, + "grad_norm": 4.62593965272068, + "learning_rate": 7.159679213809253e-06, + "loss": 0.6391, + "step": 13929 + }, + { + "epoch": 1.13, + "grad_norm": 3.8572038488701037, + "learning_rate": 7.159283775240419e-06, + "loss": 0.5466, + "step": 13930 + }, + { + "epoch": 1.13, + "grad_norm": 1.7937155143161267, + "learning_rate": 7.158888320068507e-06, + "loss": 0.482, + "step": 13931 + }, + { + "epoch": 1.13, + "grad_norm": 4.508318606547678, + "learning_rate": 7.1584928482965586e-06, + "loss": 0.6574, + "step": 13932 + }, + { + "epoch": 1.13, + "grad_norm": 8.163163784060702, + "learning_rate": 7.158097359927616e-06, + "loss": 0.5256, + "step": 13933 + }, + { + "epoch": 1.13, + "grad_norm": 4.774549549628053, + "learning_rate": 7.157701854964719e-06, + "loss": 0.615, + "step": 13934 + }, + { + "epoch": 1.13, + "grad_norm": 6.143098145913764, + "learning_rate": 7.1573063334109085e-06, + "loss": 0.5513, + "step": 13935 + }, + { + "epoch": 1.13, + "grad_norm": 3.786266960761607, + "learning_rate": 7.1569107952692255e-06, + "loss": 0.4934, + "step": 13936 + }, + { + "epoch": 1.13, + "grad_norm": 4.202427215202946, + "learning_rate": 7.156515240542712e-06, + "loss": 0.6978, + "step": 13937 + }, + { + "epoch": 1.13, + "grad_norm": 3.8030948185834896, + "learning_rate": 7.15611966923441e-06, + "loss": 0.4359, + "step": 13938 + }, + { + "epoch": 1.13, + "grad_norm": 3.0354891578202623, + "learning_rate": 7.15572408134736e-06, + "loss": 0.6155, + "step": 13939 + }, + { + "epoch": 1.13, + "grad_norm": 3.210985712073173, + "learning_rate": 7.155328476884603e-06, + "loss": 0.4971, + "step": 13940 + }, + { + "epoch": 1.13, + "grad_norm": 3.3833210534093885, + "learning_rate": 7.154932855849184e-06, + "loss": 0.6064, + "step": 13941 + }, + { + "epoch": 1.13, + "grad_norm": 11.078452034845732, + "learning_rate": 7.154537218244142e-06, + "loss": 0.4711, + "step": 13942 + }, + { + "epoch": 1.13, + "grad_norm": 6.777079861804997, + "learning_rate": 7.154141564072521e-06, + "loss": 0.5551, + "step": 13943 + }, + { + "epoch": 1.13, + "grad_norm": 6.1910307906881075, + "learning_rate": 7.153745893337361e-06, + "loss": 0.4641, + "step": 13944 + }, + { + "epoch": 1.13, + "grad_norm": 5.171532798503356, + "learning_rate": 7.153350206041706e-06, + "loss": 0.6698, + "step": 13945 + }, + { + "epoch": 1.13, + "grad_norm": 2.8881978638426244, + "learning_rate": 7.152954502188599e-06, + "loss": 0.5547, + "step": 13946 + }, + { + "epoch": 1.13, + "grad_norm": 3.7962827328488213, + "learning_rate": 7.152558781781082e-06, + "loss": 0.5981, + "step": 13947 + }, + { + "epoch": 1.13, + "grad_norm": 9.834960910606593, + "learning_rate": 7.152163044822197e-06, + "loss": 0.7151, + "step": 13948 + }, + { + "epoch": 1.13, + "grad_norm": 2.454312955107933, + "learning_rate": 7.151767291314989e-06, + "loss": 0.5448, + "step": 13949 + }, + { + "epoch": 1.13, + "grad_norm": 3.3624556147759557, + "learning_rate": 7.151371521262498e-06, + "loss": 0.6813, + "step": 13950 + }, + { + "epoch": 1.13, + "grad_norm": 4.001379219637956, + "learning_rate": 7.150975734667769e-06, + "loss": 0.5665, + "step": 13951 + }, + { + "epoch": 1.13, + "grad_norm": 3.5636964560510527, + "learning_rate": 7.150579931533844e-06, + "loss": 0.5313, + "step": 13952 + }, + { + "epoch": 1.13, + "grad_norm": 4.780778811902173, + "learning_rate": 7.150184111863768e-06, + "loss": 0.5076, + "step": 13953 + }, + { + "epoch": 1.13, + "grad_norm": 2.3583493059284857, + "learning_rate": 7.149788275660585e-06, + "loss": 0.6196, + "step": 13954 + }, + { + "epoch": 1.13, + "grad_norm": 2.647393173341142, + "learning_rate": 7.149392422927337e-06, + "loss": 0.6494, + "step": 13955 + }, + { + "epoch": 1.13, + "grad_norm": 3.842129682618392, + "learning_rate": 7.1489965536670666e-06, + "loss": 0.686, + "step": 13956 + }, + { + "epoch": 1.13, + "grad_norm": 8.626116757099544, + "learning_rate": 7.148600667882821e-06, + "loss": 0.5927, + "step": 13957 + }, + { + "epoch": 1.13, + "grad_norm": 2.422068799728741, + "learning_rate": 7.148204765577643e-06, + "loss": 0.552, + "step": 13958 + }, + { + "epoch": 1.13, + "grad_norm": 2.9359730971372464, + "learning_rate": 7.147808846754576e-06, + "loss": 0.5935, + "step": 13959 + }, + { + "epoch": 1.13, + "grad_norm": 6.70259163523777, + "learning_rate": 7.147412911416664e-06, + "loss": 0.514, + "step": 13960 + }, + { + "epoch": 1.13, + "grad_norm": 4.186502996518931, + "learning_rate": 7.147016959566953e-06, + "loss": 0.6248, + "step": 13961 + }, + { + "epoch": 1.13, + "grad_norm": 4.404774312829644, + "learning_rate": 7.146620991208486e-06, + "loss": 0.6203, + "step": 13962 + }, + { + "epoch": 1.13, + "grad_norm": 2.700011087551851, + "learning_rate": 7.146225006344309e-06, + "loss": 0.6009, + "step": 13963 + }, + { + "epoch": 1.13, + "grad_norm": 2.6353260233051246, + "learning_rate": 7.145829004977465e-06, + "loss": 0.5804, + "step": 13964 + }, + { + "epoch": 1.13, + "grad_norm": 3.0009819240038045, + "learning_rate": 7.145432987111001e-06, + "loss": 0.5519, + "step": 13965 + }, + { + "epoch": 1.13, + "grad_norm": 2.7416414520584564, + "learning_rate": 7.14503695274796e-06, + "loss": 0.6597, + "step": 13966 + }, + { + "epoch": 1.13, + "grad_norm": 3.721123781980044, + "learning_rate": 7.144640901891389e-06, + "loss": 0.6586, + "step": 13967 + }, + { + "epoch": 1.13, + "grad_norm": 3.5919646519659296, + "learning_rate": 7.144244834544331e-06, + "loss": 0.4226, + "step": 13968 + }, + { + "epoch": 1.13, + "grad_norm": 3.6910003192928236, + "learning_rate": 7.143848750709835e-06, + "loss": 0.4933, + "step": 13969 + }, + { + "epoch": 1.13, + "grad_norm": 5.127393070600145, + "learning_rate": 7.143452650390944e-06, + "loss": 0.614, + "step": 13970 + }, + { + "epoch": 1.13, + "grad_norm": 3.688656507957647, + "learning_rate": 7.143056533590704e-06, + "loss": 0.5591, + "step": 13971 + }, + { + "epoch": 1.13, + "grad_norm": 4.433085422689032, + "learning_rate": 7.14266040031216e-06, + "loss": 0.6977, + "step": 13972 + }, + { + "epoch": 1.13, + "grad_norm": 3.3030169444795527, + "learning_rate": 7.14226425055836e-06, + "loss": 0.5886, + "step": 13973 + }, + { + "epoch": 1.13, + "grad_norm": 6.788700585354247, + "learning_rate": 7.141868084332349e-06, + "loss": 0.6239, + "step": 13974 + }, + { + "epoch": 1.14, + "grad_norm": 5.092288471339357, + "learning_rate": 7.141471901637173e-06, + "loss": 0.5107, + "step": 13975 + }, + { + "epoch": 1.14, + "grad_norm": 3.0847450809819947, + "learning_rate": 7.141075702475878e-06, + "loss": 0.5263, + "step": 13976 + }, + { + "epoch": 1.14, + "grad_norm": 3.2140987235900393, + "learning_rate": 7.140679486851509e-06, + "loss": 0.6384, + "step": 13977 + }, + { + "epoch": 1.14, + "grad_norm": 3.9898147884053135, + "learning_rate": 7.140283254767118e-06, + "loss": 0.5706, + "step": 13978 + }, + { + "epoch": 1.14, + "grad_norm": 2.9719987561485586, + "learning_rate": 7.139887006225747e-06, + "loss": 0.5458, + "step": 13979 + }, + { + "epoch": 1.14, + "grad_norm": 5.363355221068068, + "learning_rate": 7.139490741230444e-06, + "loss": 0.6443, + "step": 13980 + }, + { + "epoch": 1.14, + "grad_norm": 4.579838594789887, + "learning_rate": 7.139094459784254e-06, + "loss": 0.5244, + "step": 13981 + }, + { + "epoch": 1.14, + "grad_norm": 3.6841996887661916, + "learning_rate": 7.138698161890228e-06, + "loss": 0.6745, + "step": 13982 + }, + { + "epoch": 1.14, + "grad_norm": 6.788198125805596, + "learning_rate": 7.138301847551411e-06, + "loss": 0.5236, + "step": 13983 + }, + { + "epoch": 1.14, + "grad_norm": 8.8394044651797, + "learning_rate": 7.13790551677085e-06, + "loss": 0.4088, + "step": 13984 + }, + { + "epoch": 1.14, + "grad_norm": 3.8168957828482104, + "learning_rate": 7.137509169551592e-06, + "loss": 0.5996, + "step": 13985 + }, + { + "epoch": 1.14, + "grad_norm": 2.718926860154682, + "learning_rate": 7.1371128058966864e-06, + "loss": 0.7695, + "step": 13986 + }, + { + "epoch": 1.14, + "grad_norm": 3.6345128264086712, + "learning_rate": 7.13671642580918e-06, + "loss": 0.5572, + "step": 13987 + }, + { + "epoch": 1.14, + "grad_norm": 1.6837277051765758, + "learning_rate": 7.136320029292122e-06, + "loss": 0.5062, + "step": 13988 + }, + { + "epoch": 1.14, + "grad_norm": 3.408500915513878, + "learning_rate": 7.1359236163485564e-06, + "loss": 0.5344, + "step": 13989 + }, + { + "epoch": 1.14, + "grad_norm": 6.367493853190697, + "learning_rate": 7.1355271869815365e-06, + "loss": 0.8157, + "step": 13990 + }, + { + "epoch": 1.14, + "grad_norm": 5.379678545397891, + "learning_rate": 7.135130741194107e-06, + "loss": 0.6107, + "step": 13991 + }, + { + "epoch": 1.14, + "grad_norm": 4.485392412505868, + "learning_rate": 7.134734278989317e-06, + "loss": 0.493, + "step": 13992 + }, + { + "epoch": 1.14, + "grad_norm": 3.228221703028478, + "learning_rate": 7.134337800370215e-06, + "loss": 0.4726, + "step": 13993 + }, + { + "epoch": 1.14, + "grad_norm": 3.5045283426093894, + "learning_rate": 7.133941305339849e-06, + "loss": 0.6206, + "step": 13994 + }, + { + "epoch": 1.14, + "grad_norm": 3.6389969507063804, + "learning_rate": 7.133544793901269e-06, + "loss": 0.6038, + "step": 13995 + }, + { + "epoch": 1.14, + "grad_norm": 3.3681824363722748, + "learning_rate": 7.133148266057524e-06, + "loss": 0.4501, + "step": 13996 + }, + { + "epoch": 1.14, + "grad_norm": 3.467226798304162, + "learning_rate": 7.13275172181166e-06, + "loss": 0.5774, + "step": 13997 + }, + { + "epoch": 1.14, + "grad_norm": 2.6047589715568207, + "learning_rate": 7.132355161166731e-06, + "loss": 0.566, + "step": 13998 + }, + { + "epoch": 1.14, + "grad_norm": 12.318717076757398, + "learning_rate": 7.131958584125782e-06, + "loss": 0.6792, + "step": 13999 + }, + { + "epoch": 1.14, + "grad_norm": 3.4507065763203415, + "learning_rate": 7.131561990691864e-06, + "loss": 0.4846, + "step": 14000 + }, + { + "epoch": 1.14, + "grad_norm": 6.242212347105321, + "learning_rate": 7.131165380868026e-06, + "loss": 0.5442, + "step": 14001 + }, + { + "epoch": 1.14, + "grad_norm": 6.930273107287269, + "learning_rate": 7.130768754657319e-06, + "loss": 0.6367, + "step": 14002 + }, + { + "epoch": 1.14, + "grad_norm": 2.8929932563322205, + "learning_rate": 7.130372112062791e-06, + "loss": 0.5218, + "step": 14003 + }, + { + "epoch": 1.14, + "grad_norm": 4.39733226831647, + "learning_rate": 7.1299754530874936e-06, + "loss": 0.5871, + "step": 14004 + }, + { + "epoch": 1.14, + "grad_norm": 17.02333657347272, + "learning_rate": 7.129578777734472e-06, + "loss": 0.5311, + "step": 14005 + }, + { + "epoch": 1.14, + "grad_norm": 3.039717864113113, + "learning_rate": 7.129182086006784e-06, + "loss": 0.7625, + "step": 14006 + }, + { + "epoch": 1.14, + "grad_norm": 8.325642489843645, + "learning_rate": 7.128785377907475e-06, + "loss": 0.5331, + "step": 14007 + }, + { + "epoch": 1.14, + "grad_norm": 3.6732080647469347, + "learning_rate": 7.128388653439595e-06, + "loss": 0.499, + "step": 14008 + }, + { + "epoch": 1.14, + "grad_norm": 7.177687497926602, + "learning_rate": 7.127991912606196e-06, + "loss": 0.4268, + "step": 14009 + }, + { + "epoch": 1.14, + "grad_norm": 2.7350986391699617, + "learning_rate": 7.127595155410329e-06, + "loss": 0.6696, + "step": 14010 + }, + { + "epoch": 1.14, + "grad_norm": 11.704710100305793, + "learning_rate": 7.1271983818550426e-06, + "loss": 0.7394, + "step": 14011 + }, + { + "epoch": 1.14, + "grad_norm": 6.599395200235075, + "learning_rate": 7.126801591943389e-06, + "loss": 0.4981, + "step": 14012 + }, + { + "epoch": 1.14, + "grad_norm": 4.0670380243567905, + "learning_rate": 7.12640478567842e-06, + "loss": 0.6661, + "step": 14013 + }, + { + "epoch": 1.14, + "grad_norm": 3.709272454058723, + "learning_rate": 7.126007963063186e-06, + "loss": 0.5569, + "step": 14014 + }, + { + "epoch": 1.14, + "grad_norm": 3.4942600626040203, + "learning_rate": 7.125611124100739e-06, + "loss": 0.556, + "step": 14015 + }, + { + "epoch": 1.14, + "grad_norm": 2.4631165118017972, + "learning_rate": 7.125214268794129e-06, + "loss": 0.5246, + "step": 14016 + }, + { + "epoch": 1.14, + "grad_norm": 2.485823977905314, + "learning_rate": 7.1248173971464065e-06, + "loss": 0.5405, + "step": 14017 + }, + { + "epoch": 1.14, + "grad_norm": 3.2236929857975953, + "learning_rate": 7.124420509160626e-06, + "loss": 0.681, + "step": 14018 + }, + { + "epoch": 1.14, + "grad_norm": 8.685896867173078, + "learning_rate": 7.124023604839836e-06, + "loss": 0.5715, + "step": 14019 + }, + { + "epoch": 1.14, + "grad_norm": 3.3273328093794254, + "learning_rate": 7.123626684187092e-06, + "loss": 0.5432, + "step": 14020 + }, + { + "epoch": 1.14, + "grad_norm": 3.529042101830504, + "learning_rate": 7.123229747205442e-06, + "loss": 0.6158, + "step": 14021 + }, + { + "epoch": 1.14, + "grad_norm": 3.776446771979209, + "learning_rate": 7.1228327938979435e-06, + "loss": 0.761, + "step": 14022 + }, + { + "epoch": 1.14, + "grad_norm": 3.181729733610844, + "learning_rate": 7.122435824267644e-06, + "loss": 0.5828, + "step": 14023 + }, + { + "epoch": 1.14, + "grad_norm": 7.813596500994247, + "learning_rate": 7.122038838317598e-06, + "loss": 0.4946, + "step": 14024 + }, + { + "epoch": 1.14, + "grad_norm": 2.890600938310227, + "learning_rate": 7.121641836050855e-06, + "loss": 0.5451, + "step": 14025 + }, + { + "epoch": 1.14, + "grad_norm": 4.225950405658553, + "learning_rate": 7.121244817470472e-06, + "loss": 0.5887, + "step": 14026 + }, + { + "epoch": 1.14, + "grad_norm": 2.761608940991486, + "learning_rate": 7.1208477825795e-06, + "loss": 0.5644, + "step": 14027 + }, + { + "epoch": 1.14, + "grad_norm": 2.5772166823562412, + "learning_rate": 7.120450731380991e-06, + "loss": 0.7592, + "step": 14028 + }, + { + "epoch": 1.14, + "grad_norm": 2.742152906290321, + "learning_rate": 7.120053663877997e-06, + "loss": 0.5988, + "step": 14029 + }, + { + "epoch": 1.14, + "grad_norm": 6.385341121748592, + "learning_rate": 7.119656580073575e-06, + "loss": 0.5873, + "step": 14030 + }, + { + "epoch": 1.14, + "grad_norm": 2.3756375522211814, + "learning_rate": 7.119259479970775e-06, + "loss": 0.4631, + "step": 14031 + }, + { + "epoch": 1.14, + "grad_norm": 2.6471526773981915, + "learning_rate": 7.1188623635726515e-06, + "loss": 0.5567, + "step": 14032 + }, + { + "epoch": 1.14, + "grad_norm": 6.804030897702289, + "learning_rate": 7.118465230882258e-06, + "loss": 0.6458, + "step": 14033 + }, + { + "epoch": 1.14, + "grad_norm": 6.329285581444827, + "learning_rate": 7.118068081902647e-06, + "loss": 0.6643, + "step": 14034 + }, + { + "epoch": 1.14, + "grad_norm": 3.0937024644037896, + "learning_rate": 7.117670916636874e-06, + "loss": 0.5862, + "step": 14035 + }, + { + "epoch": 1.14, + "grad_norm": 6.433043270063747, + "learning_rate": 7.117273735087993e-06, + "loss": 0.6547, + "step": 14036 + }, + { + "epoch": 1.14, + "grad_norm": 5.843080992640058, + "learning_rate": 7.116876537259054e-06, + "loss": 0.5489, + "step": 14037 + }, + { + "epoch": 1.14, + "grad_norm": 3.656064659562861, + "learning_rate": 7.116479323153116e-06, + "loss": 0.5703, + "step": 14038 + }, + { + "epoch": 1.14, + "grad_norm": 5.3032634176140645, + "learning_rate": 7.116082092773231e-06, + "loss": 0.5514, + "step": 14039 + }, + { + "epoch": 1.14, + "grad_norm": 2.7270044372363675, + "learning_rate": 7.1156848461224545e-06, + "loss": 0.6904, + "step": 14040 + }, + { + "epoch": 1.14, + "grad_norm": 2.6540333651140378, + "learning_rate": 7.115287583203839e-06, + "loss": 0.6183, + "step": 14041 + }, + { + "epoch": 1.14, + "grad_norm": 4.009342461023394, + "learning_rate": 7.114890304020441e-06, + "loss": 0.4273, + "step": 14042 + }, + { + "epoch": 1.14, + "grad_norm": 2.681541107577118, + "learning_rate": 7.114493008575315e-06, + "loss": 0.5448, + "step": 14043 + }, + { + "epoch": 1.14, + "grad_norm": 6.476858165003556, + "learning_rate": 7.1140956968715154e-06, + "loss": 0.4766, + "step": 14044 + }, + { + "epoch": 1.14, + "grad_norm": 2.127736994985274, + "learning_rate": 7.113698368912096e-06, + "loss": 0.5371, + "step": 14045 + }, + { + "epoch": 1.14, + "grad_norm": 6.492346649264869, + "learning_rate": 7.113301024700115e-06, + "loss": 0.5217, + "step": 14046 + }, + { + "epoch": 1.14, + "grad_norm": 3.857623321669221, + "learning_rate": 7.112903664238624e-06, + "loss": 0.5586, + "step": 14047 + }, + { + "epoch": 1.14, + "grad_norm": 4.231953093247358, + "learning_rate": 7.112506287530682e-06, + "loss": 0.5895, + "step": 14048 + }, + { + "epoch": 1.14, + "grad_norm": 2.7119821708871474, + "learning_rate": 7.11210889457934e-06, + "loss": 0.596, + "step": 14049 + }, + { + "epoch": 1.14, + "grad_norm": 3.2373965081330343, + "learning_rate": 7.111711485387659e-06, + "loss": 0.667, + "step": 14050 + }, + { + "epoch": 1.14, + "grad_norm": 3.1633006482257935, + "learning_rate": 7.111314059958692e-06, + "loss": 0.6395, + "step": 14051 + }, + { + "epoch": 1.14, + "grad_norm": 6.181949659053406, + "learning_rate": 7.110916618295493e-06, + "loss": 0.6003, + "step": 14052 + }, + { + "epoch": 1.14, + "grad_norm": 3.4255721513063606, + "learning_rate": 7.11051916040112e-06, + "loss": 0.5798, + "step": 14053 + }, + { + "epoch": 1.14, + "grad_norm": 3.6079143569189767, + "learning_rate": 7.110121686278631e-06, + "loss": 0.6137, + "step": 14054 + }, + { + "epoch": 1.14, + "grad_norm": 5.87133046763248, + "learning_rate": 7.109724195931078e-06, + "loss": 0.525, + "step": 14055 + }, + { + "epoch": 1.14, + "grad_norm": 3.293642929718121, + "learning_rate": 7.109326689361521e-06, + "loss": 0.5557, + "step": 14056 + }, + { + "epoch": 1.14, + "grad_norm": 3.62988946486616, + "learning_rate": 7.108929166573014e-06, + "loss": 0.5175, + "step": 14057 + }, + { + "epoch": 1.14, + "grad_norm": 4.763547408904908, + "learning_rate": 7.108531627568615e-06, + "loss": 0.5919, + "step": 14058 + }, + { + "epoch": 1.14, + "grad_norm": 13.518437752041928, + "learning_rate": 7.108134072351381e-06, + "loss": 0.5254, + "step": 14059 + }, + { + "epoch": 1.14, + "grad_norm": 2.002461124491446, + "learning_rate": 7.107736500924369e-06, + "loss": 0.5139, + "step": 14060 + }, + { + "epoch": 1.14, + "grad_norm": 5.21698623549741, + "learning_rate": 7.107338913290635e-06, + "loss": 0.5652, + "step": 14061 + }, + { + "epoch": 1.14, + "grad_norm": 4.624539691500908, + "learning_rate": 7.106941309453235e-06, + "loss": 0.5224, + "step": 14062 + }, + { + "epoch": 1.14, + "grad_norm": 8.93781793302301, + "learning_rate": 7.106543689415228e-06, + "loss": 0.5506, + "step": 14063 + }, + { + "epoch": 1.14, + "grad_norm": 2.600324887881603, + "learning_rate": 7.106146053179672e-06, + "loss": 0.5911, + "step": 14064 + }, + { + "epoch": 1.14, + "grad_norm": 2.955799788303194, + "learning_rate": 7.105748400749624e-06, + "loss": 0.6704, + "step": 14065 + }, + { + "epoch": 1.14, + "grad_norm": 4.081984541821989, + "learning_rate": 7.10535073212814e-06, + "loss": 0.6583, + "step": 14066 + }, + { + "epoch": 1.14, + "grad_norm": 4.248804887688949, + "learning_rate": 7.10495304731828e-06, + "loss": 0.6981, + "step": 14067 + }, + { + "epoch": 1.14, + "grad_norm": 6.92853198105978, + "learning_rate": 7.104555346323098e-06, + "loss": 0.5545, + "step": 14068 + }, + { + "epoch": 1.14, + "grad_norm": 3.8659932647849367, + "learning_rate": 7.104157629145658e-06, + "loss": 0.618, + "step": 14069 + }, + { + "epoch": 1.14, + "grad_norm": 3.1260199424631945, + "learning_rate": 7.103759895789013e-06, + "loss": 0.5967, + "step": 14070 + }, + { + "epoch": 1.14, + "grad_norm": 3.1766996080036813, + "learning_rate": 7.103362146256223e-06, + "loss": 0.5136, + "step": 14071 + }, + { + "epoch": 1.14, + "grad_norm": 3.8031647423475508, + "learning_rate": 7.102964380550348e-06, + "loss": 0.6074, + "step": 14072 + }, + { + "epoch": 1.14, + "grad_norm": 9.07410922745925, + "learning_rate": 7.102566598674443e-06, + "loss": 0.6495, + "step": 14073 + }, + { + "epoch": 1.14, + "grad_norm": 3.9226776713759994, + "learning_rate": 7.102168800631569e-06, + "loss": 0.5438, + "step": 14074 + }, + { + "epoch": 1.14, + "grad_norm": 3.9220484951316323, + "learning_rate": 7.101770986424785e-06, + "loss": 0.577, + "step": 14075 + }, + { + "epoch": 1.14, + "grad_norm": 3.970188428490416, + "learning_rate": 7.101373156057148e-06, + "loss": 0.5468, + "step": 14076 + }, + { + "epoch": 1.14, + "grad_norm": 4.903892395965639, + "learning_rate": 7.10097530953172e-06, + "loss": 0.7002, + "step": 14077 + }, + { + "epoch": 1.14, + "grad_norm": 4.093239525825034, + "learning_rate": 7.100577446851555e-06, + "loss": 0.5743, + "step": 14078 + }, + { + "epoch": 1.14, + "grad_norm": 2.897996512013464, + "learning_rate": 7.100179568019719e-06, + "loss": 0.6799, + "step": 14079 + }, + { + "epoch": 1.14, + "grad_norm": 3.735222923519625, + "learning_rate": 7.099781673039265e-06, + "loss": 0.5399, + "step": 14080 + }, + { + "epoch": 1.14, + "grad_norm": 2.868410546521567, + "learning_rate": 7.099383761913257e-06, + "loss": 0.6331, + "step": 14081 + }, + { + "epoch": 1.14, + "grad_norm": 5.462939334244097, + "learning_rate": 7.0989858346447515e-06, + "loss": 0.4825, + "step": 14082 + }, + { + "epoch": 1.14, + "grad_norm": 7.11011658215929, + "learning_rate": 7.098587891236811e-06, + "loss": 0.5513, + "step": 14083 + }, + { + "epoch": 1.14, + "grad_norm": 3.451756389908537, + "learning_rate": 7.098189931692494e-06, + "loss": 0.5442, + "step": 14084 + }, + { + "epoch": 1.14, + "grad_norm": 4.842599459390585, + "learning_rate": 7.097791956014859e-06, + "loss": 0.733, + "step": 14085 + }, + { + "epoch": 1.14, + "grad_norm": 3.112264745473636, + "learning_rate": 7.097393964206968e-06, + "loss": 0.6751, + "step": 14086 + }, + { + "epoch": 1.14, + "grad_norm": 2.811174693668927, + "learning_rate": 7.096995956271881e-06, + "loss": 0.742, + "step": 14087 + }, + { + "epoch": 1.14, + "grad_norm": 3.405638353854878, + "learning_rate": 7.0965979322126574e-06, + "loss": 0.5813, + "step": 14088 + }, + { + "epoch": 1.14, + "grad_norm": 3.6003122621451245, + "learning_rate": 7.096199892032359e-06, + "loss": 0.6734, + "step": 14089 + }, + { + "epoch": 1.14, + "grad_norm": 2.7739248661018934, + "learning_rate": 7.095801835734046e-06, + "loss": 0.5432, + "step": 14090 + }, + { + "epoch": 1.14, + "grad_norm": 3.5569595655297386, + "learning_rate": 7.095403763320777e-06, + "loss": 0.6025, + "step": 14091 + }, + { + "epoch": 1.14, + "grad_norm": 4.584418015281892, + "learning_rate": 7.095005674795616e-06, + "loss": 0.6391, + "step": 14092 + }, + { + "epoch": 1.14, + "grad_norm": 3.6938939539169318, + "learning_rate": 7.094607570161625e-06, + "loss": 0.6481, + "step": 14093 + }, + { + "epoch": 1.14, + "grad_norm": 5.1040573879123325, + "learning_rate": 7.09420944942186e-06, + "loss": 0.5178, + "step": 14094 + }, + { + "epoch": 1.14, + "grad_norm": 3.2501333268378096, + "learning_rate": 7.093811312579385e-06, + "loss": 0.6699, + "step": 14095 + }, + { + "epoch": 1.14, + "grad_norm": 2.726086705995903, + "learning_rate": 7.0934131596372615e-06, + "loss": 0.5765, + "step": 14096 + }, + { + "epoch": 1.14, + "grad_norm": 4.575038192925408, + "learning_rate": 7.0930149905985525e-06, + "loss": 0.5689, + "step": 14097 + }, + { + "epoch": 1.15, + "grad_norm": 12.649958184097981, + "learning_rate": 7.092616805466316e-06, + "loss": 0.6007, + "step": 14098 + }, + { + "epoch": 1.15, + "grad_norm": 4.656171235696052, + "learning_rate": 7.092218604243615e-06, + "loss": 0.6753, + "step": 14099 + }, + { + "epoch": 1.15, + "grad_norm": 2.2936373055637858, + "learning_rate": 7.091820386933513e-06, + "loss": 0.5615, + "step": 14100 + }, + { + "epoch": 1.15, + "grad_norm": 3.3618159627256192, + "learning_rate": 7.091422153539072e-06, + "loss": 0.6564, + "step": 14101 + }, + { + "epoch": 1.15, + "grad_norm": 3.208953388488813, + "learning_rate": 7.091023904063352e-06, + "loss": 0.5159, + "step": 14102 + }, + { + "epoch": 1.15, + "grad_norm": 3.3376953747366103, + "learning_rate": 7.0906256385094145e-06, + "loss": 0.6552, + "step": 14103 + }, + { + "epoch": 1.15, + "grad_norm": 4.059295072807514, + "learning_rate": 7.090227356880325e-06, + "loss": 0.6571, + "step": 14104 + }, + { + "epoch": 1.15, + "grad_norm": 3.5955086851826072, + "learning_rate": 7.089829059179145e-06, + "loss": 0.4719, + "step": 14105 + }, + { + "epoch": 1.15, + "grad_norm": 2.7183274192340905, + "learning_rate": 7.089430745408936e-06, + "loss": 0.6439, + "step": 14106 + }, + { + "epoch": 1.15, + "grad_norm": 9.77472418476916, + "learning_rate": 7.08903241557276e-06, + "loss": 0.5568, + "step": 14107 + }, + { + "epoch": 1.15, + "grad_norm": 3.247580974979504, + "learning_rate": 7.088634069673683e-06, + "loss": 0.5631, + "step": 14108 + }, + { + "epoch": 1.15, + "grad_norm": 2.6667231495117663, + "learning_rate": 7.088235707714763e-06, + "loss": 0.5781, + "step": 14109 + }, + { + "epoch": 1.15, + "grad_norm": 3.2584384235731254, + "learning_rate": 7.0878373296990685e-06, + "loss": 0.5411, + "step": 14110 + }, + { + "epoch": 1.15, + "grad_norm": 11.403141698690456, + "learning_rate": 7.087438935629659e-06, + "loss": 0.5395, + "step": 14111 + }, + { + "epoch": 1.15, + "grad_norm": 2.2351864915359125, + "learning_rate": 7.0870405255096e-06, + "loss": 0.5342, + "step": 14112 + }, + { + "epoch": 1.15, + "grad_norm": 2.822381014634726, + "learning_rate": 7.0866420993419535e-06, + "loss": 0.5907, + "step": 14113 + }, + { + "epoch": 1.15, + "grad_norm": 3.6866002641339866, + "learning_rate": 7.086243657129784e-06, + "loss": 0.5102, + "step": 14114 + }, + { + "epoch": 1.15, + "grad_norm": 7.3050480577659815, + "learning_rate": 7.085845198876154e-06, + "loss": 0.6958, + "step": 14115 + }, + { + "epoch": 1.15, + "grad_norm": 5.99677284558523, + "learning_rate": 7.085446724584129e-06, + "loss": 0.6622, + "step": 14116 + }, + { + "epoch": 1.15, + "grad_norm": 2.922262982628735, + "learning_rate": 7.085048234256771e-06, + "loss": 0.5063, + "step": 14117 + }, + { + "epoch": 1.15, + "grad_norm": 4.52880605265708, + "learning_rate": 7.084649727897145e-06, + "loss": 0.5515, + "step": 14118 + }, + { + "epoch": 1.15, + "grad_norm": 3.243147010824444, + "learning_rate": 7.084251205508315e-06, + "loss": 0.5647, + "step": 14119 + }, + { + "epoch": 1.15, + "grad_norm": 2.470191200775277, + "learning_rate": 7.083852667093346e-06, + "loss": 0.5193, + "step": 14120 + }, + { + "epoch": 1.15, + "grad_norm": 2.5320749709149935, + "learning_rate": 7.083454112655302e-06, + "loss": 0.5184, + "step": 14121 + }, + { + "epoch": 1.15, + "grad_norm": 4.390295479158579, + "learning_rate": 7.083055542197248e-06, + "loss": 0.5423, + "step": 14122 + }, + { + "epoch": 1.15, + "grad_norm": 3.066019523434415, + "learning_rate": 7.082656955722247e-06, + "loss": 0.5862, + "step": 14123 + }, + { + "epoch": 1.15, + "grad_norm": 18.12779303562252, + "learning_rate": 7.082258353233365e-06, + "loss": 0.5815, + "step": 14124 + }, + { + "epoch": 1.15, + "grad_norm": 3.4149559418994193, + "learning_rate": 7.081859734733667e-06, + "loss": 0.4436, + "step": 14125 + }, + { + "epoch": 1.15, + "grad_norm": 14.92517512936975, + "learning_rate": 7.0814611002262194e-06, + "loss": 0.6932, + "step": 14126 + }, + { + "epoch": 1.15, + "grad_norm": 2.2794132881214884, + "learning_rate": 7.081062449714084e-06, + "loss": 0.6581, + "step": 14127 + }, + { + "epoch": 1.15, + "grad_norm": 3.5659905356015447, + "learning_rate": 7.080663783200328e-06, + "loss": 0.5788, + "step": 14128 + }, + { + "epoch": 1.15, + "grad_norm": 3.706520014161506, + "learning_rate": 7.080265100688018e-06, + "loss": 0.628, + "step": 14129 + }, + { + "epoch": 1.15, + "grad_norm": 4.35230767820653, + "learning_rate": 7.079866402180218e-06, + "loss": 0.5671, + "step": 14130 + }, + { + "epoch": 1.15, + "grad_norm": 2.5077247290695155, + "learning_rate": 7.079467687679993e-06, + "loss": 0.5825, + "step": 14131 + }, + { + "epoch": 1.15, + "grad_norm": 3.1462337223422794, + "learning_rate": 7.079068957190409e-06, + "loss": 0.6244, + "step": 14132 + }, + { + "epoch": 1.15, + "grad_norm": 7.107115742882602, + "learning_rate": 7.078670210714536e-06, + "loss": 0.6741, + "step": 14133 + }, + { + "epoch": 1.15, + "grad_norm": 3.1053063413149697, + "learning_rate": 7.078271448255434e-06, + "loss": 0.5557, + "step": 14134 + }, + { + "epoch": 1.15, + "grad_norm": 2.4951065382384936, + "learning_rate": 7.077872669816172e-06, + "loss": 0.5443, + "step": 14135 + }, + { + "epoch": 1.15, + "grad_norm": 3.4341544438330183, + "learning_rate": 7.077473875399816e-06, + "loss": 0.7928, + "step": 14136 + }, + { + "epoch": 1.15, + "grad_norm": 2.306575633269179, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.6348, + "step": 14137 + }, + { + "epoch": 1.15, + "grad_norm": 2.9829238856004805, + "learning_rate": 7.076676238648089e-06, + "loss": 0.5296, + "step": 14138 + }, + { + "epoch": 1.15, + "grad_norm": 2.641276243233299, + "learning_rate": 7.0762773963188495e-06, + "loss": 0.6552, + "step": 14139 + }, + { + "epoch": 1.15, + "grad_norm": 4.033204496628203, + "learning_rate": 7.075878538024783e-06, + "loss": 0.5109, + "step": 14140 + }, + { + "epoch": 1.15, + "grad_norm": 3.1348349710619723, + "learning_rate": 7.075479663768957e-06, + "loss": 0.6851, + "step": 14141 + }, + { + "epoch": 1.15, + "grad_norm": 2.8099052621020646, + "learning_rate": 7.075080773554437e-06, + "loss": 0.5616, + "step": 14142 + }, + { + "epoch": 1.15, + "grad_norm": 3.8927122830835437, + "learning_rate": 7.0746818673842884e-06, + "loss": 0.5616, + "step": 14143 + }, + { + "epoch": 1.15, + "grad_norm": 4.135964918894633, + "learning_rate": 7.074282945261581e-06, + "loss": 0.6813, + "step": 14144 + }, + { + "epoch": 1.15, + "grad_norm": 2.8281807827288974, + "learning_rate": 7.073884007189383e-06, + "loss": 0.5875, + "step": 14145 + }, + { + "epoch": 1.15, + "grad_norm": 2.5513601234085352, + "learning_rate": 7.073485053170761e-06, + "loss": 0.4731, + "step": 14146 + }, + { + "epoch": 1.15, + "grad_norm": 4.4011919842333675, + "learning_rate": 7.07308608320878e-06, + "loss": 0.6637, + "step": 14147 + }, + { + "epoch": 1.15, + "grad_norm": 3.7373945575770655, + "learning_rate": 7.072687097306512e-06, + "loss": 0.5743, + "step": 14148 + }, + { + "epoch": 1.15, + "grad_norm": 2.475161611195808, + "learning_rate": 7.0722880954670215e-06, + "loss": 0.5747, + "step": 14149 + }, + { + "epoch": 1.15, + "grad_norm": 2.486059182959368, + "learning_rate": 7.071889077693378e-06, + "loss": 0.6948, + "step": 14150 + }, + { + "epoch": 1.15, + "grad_norm": 5.830111930499957, + "learning_rate": 7.071490043988649e-06, + "loss": 0.6594, + "step": 14151 + }, + { + "epoch": 1.15, + "grad_norm": 3.7657416059330777, + "learning_rate": 7.071090994355904e-06, + "loss": 0.7021, + "step": 14152 + }, + { + "epoch": 1.15, + "grad_norm": 8.728802268459566, + "learning_rate": 7.07069192879821e-06, + "loss": 0.5258, + "step": 14153 + }, + { + "epoch": 1.15, + "grad_norm": 3.9561275292081812, + "learning_rate": 7.070292847318636e-06, + "loss": 0.5427, + "step": 14154 + }, + { + "epoch": 1.15, + "grad_norm": 2.916833737248854, + "learning_rate": 7.06989374992025e-06, + "loss": 0.5892, + "step": 14155 + }, + { + "epoch": 1.15, + "grad_norm": 5.290824947251644, + "learning_rate": 7.069494636606121e-06, + "loss": 0.5325, + "step": 14156 + }, + { + "epoch": 1.15, + "grad_norm": 3.6321110330165034, + "learning_rate": 7.069095507379319e-06, + "loss": 0.7114, + "step": 14157 + }, + { + "epoch": 1.15, + "grad_norm": 3.6899015568475297, + "learning_rate": 7.068696362242912e-06, + "loss": 0.5574, + "step": 14158 + }, + { + "epoch": 1.15, + "grad_norm": 3.4814257110268603, + "learning_rate": 7.068297201199969e-06, + "loss": 0.5065, + "step": 14159 + }, + { + "epoch": 1.15, + "grad_norm": 2.8971998233592093, + "learning_rate": 7.067898024253559e-06, + "loss": 0.5867, + "step": 14160 + }, + { + "epoch": 1.15, + "grad_norm": 3.230370994885463, + "learning_rate": 7.067498831406751e-06, + "loss": 0.6415, + "step": 14161 + }, + { + "epoch": 1.15, + "grad_norm": 4.054201325148742, + "learning_rate": 7.067099622662618e-06, + "loss": 0.6904, + "step": 14162 + }, + { + "epoch": 1.15, + "grad_norm": 16.39179449732065, + "learning_rate": 7.066700398024225e-06, + "loss": 0.6, + "step": 14163 + }, + { + "epoch": 1.15, + "grad_norm": 2.318860455813238, + "learning_rate": 7.066301157494641e-06, + "loss": 0.5265, + "step": 14164 + }, + { + "epoch": 1.15, + "grad_norm": 3.896431255934566, + "learning_rate": 7.0659019010769415e-06, + "loss": 0.4681, + "step": 14165 + }, + { + "epoch": 1.15, + "grad_norm": 3.134207206228953, + "learning_rate": 7.065502628774193e-06, + "loss": 0.5775, + "step": 14166 + }, + { + "epoch": 1.15, + "grad_norm": 2.5381122116601778, + "learning_rate": 7.065103340589466e-06, + "loss": 0.5491, + "step": 14167 + }, + { + "epoch": 1.15, + "grad_norm": 7.798928323928904, + "learning_rate": 7.064704036525829e-06, + "loss": 0.5972, + "step": 14168 + }, + { + "epoch": 1.15, + "grad_norm": 3.566298411881335, + "learning_rate": 7.064304716586354e-06, + "loss": 0.6204, + "step": 14169 + }, + { + "epoch": 1.15, + "grad_norm": 4.580900525922911, + "learning_rate": 7.063905380774112e-06, + "loss": 0.5264, + "step": 14170 + }, + { + "epoch": 1.15, + "grad_norm": 5.044030661659421, + "learning_rate": 7.063506029092173e-06, + "loss": 0.6364, + "step": 14171 + }, + { + "epoch": 1.15, + "grad_norm": 5.258222859950595, + "learning_rate": 7.063106661543606e-06, + "loss": 0.5465, + "step": 14172 + }, + { + "epoch": 1.15, + "grad_norm": 5.07461770642589, + "learning_rate": 7.062707278131485e-06, + "loss": 0.6461, + "step": 14173 + }, + { + "epoch": 1.15, + "grad_norm": 3.0162060080548314, + "learning_rate": 7.062307878858877e-06, + "loss": 0.709, + "step": 14174 + }, + { + "epoch": 1.15, + "grad_norm": 2.7004139744413926, + "learning_rate": 7.0619084637288574e-06, + "loss": 0.5251, + "step": 14175 + }, + { + "epoch": 1.15, + "grad_norm": 3.531640933137738, + "learning_rate": 7.0615090327444935e-06, + "loss": 0.5967, + "step": 14176 + }, + { + "epoch": 1.15, + "grad_norm": 2.4479087366468577, + "learning_rate": 7.061109585908858e-06, + "loss": 0.6044, + "step": 14177 + }, + { + "epoch": 1.15, + "grad_norm": 3.505476908621678, + "learning_rate": 7.060710123225025e-06, + "loss": 0.4353, + "step": 14178 + }, + { + "epoch": 1.15, + "grad_norm": 3.565731847618166, + "learning_rate": 7.060310644696062e-06, + "loss": 0.5569, + "step": 14179 + }, + { + "epoch": 1.15, + "grad_norm": 2.907161365099284, + "learning_rate": 7.059911150325043e-06, + "loss": 0.6003, + "step": 14180 + }, + { + "epoch": 1.15, + "grad_norm": 3.7061213433323945, + "learning_rate": 7.059511640115038e-06, + "loss": 0.548, + "step": 14181 + }, + { + "epoch": 1.15, + "grad_norm": 2.1972393226108906, + "learning_rate": 7.059112114069121e-06, + "loss": 0.5637, + "step": 14182 + }, + { + "epoch": 1.15, + "grad_norm": 2.5087399276062468, + "learning_rate": 7.058712572190362e-06, + "loss": 0.6655, + "step": 14183 + }, + { + "epoch": 1.15, + "grad_norm": 3.9807149605159298, + "learning_rate": 7.0583130144818345e-06, + "loss": 0.5794, + "step": 14184 + }, + { + "epoch": 1.15, + "grad_norm": 2.779790463183691, + "learning_rate": 7.057913440946611e-06, + "loss": 0.5326, + "step": 14185 + }, + { + "epoch": 1.15, + "grad_norm": 2.0242194908515527, + "learning_rate": 7.057513851587763e-06, + "loss": 0.6512, + "step": 14186 + }, + { + "epoch": 1.15, + "grad_norm": 3.0648594562789664, + "learning_rate": 7.057114246408363e-06, + "loss": 0.5705, + "step": 14187 + }, + { + "epoch": 1.15, + "grad_norm": 2.749251205679122, + "learning_rate": 7.056714625411482e-06, + "loss": 0.5764, + "step": 14188 + }, + { + "epoch": 1.15, + "grad_norm": 3.6710502819411164, + "learning_rate": 7.056314988600198e-06, + "loss": 0.6023, + "step": 14189 + }, + { + "epoch": 1.15, + "grad_norm": 2.841614284212581, + "learning_rate": 7.055915335977579e-06, + "loss": 0.5274, + "step": 14190 + }, + { + "epoch": 1.15, + "grad_norm": 3.1849817079290332, + "learning_rate": 7.0555156675466994e-06, + "loss": 0.6975, + "step": 14191 + }, + { + "epoch": 1.15, + "grad_norm": 6.19683003953635, + "learning_rate": 7.055115983310632e-06, + "loss": 0.5268, + "step": 14192 + }, + { + "epoch": 1.15, + "grad_norm": 1.9388549574697502, + "learning_rate": 7.054716283272451e-06, + "loss": 0.4735, + "step": 14193 + }, + { + "epoch": 1.15, + "grad_norm": 6.809605985542842, + "learning_rate": 7.054316567435231e-06, + "loss": 0.642, + "step": 14194 + }, + { + "epoch": 1.15, + "grad_norm": 2.4303950371227496, + "learning_rate": 7.053916835802042e-06, + "loss": 0.6966, + "step": 14195 + }, + { + "epoch": 1.15, + "grad_norm": 5.101884248414051, + "learning_rate": 7.053517088375959e-06, + "loss": 0.7299, + "step": 14196 + }, + { + "epoch": 1.15, + "grad_norm": 4.98381854556617, + "learning_rate": 7.053117325160055e-06, + "loss": 0.5358, + "step": 14197 + }, + { + "epoch": 1.15, + "grad_norm": 3.8788729306803917, + "learning_rate": 7.052717546157407e-06, + "loss": 0.6807, + "step": 14198 + }, + { + "epoch": 1.15, + "grad_norm": 6.322838976019162, + "learning_rate": 7.052317751371086e-06, + "loss": 0.5928, + "step": 14199 + }, + { + "epoch": 1.15, + "grad_norm": 2.537546204200766, + "learning_rate": 7.051917940804166e-06, + "loss": 0.6548, + "step": 14200 + }, + { + "epoch": 1.15, + "grad_norm": 2.682024910459504, + "learning_rate": 7.051518114459723e-06, + "loss": 0.6621, + "step": 14201 + }, + { + "epoch": 1.15, + "grad_norm": 4.696853423223693, + "learning_rate": 7.051118272340831e-06, + "loss": 0.5416, + "step": 14202 + }, + { + "epoch": 1.15, + "grad_norm": 2.9290480436392157, + "learning_rate": 7.050718414450563e-06, + "loss": 0.5128, + "step": 14203 + }, + { + "epoch": 1.15, + "grad_norm": 3.849270822665608, + "learning_rate": 7.050318540791994e-06, + "loss": 0.5966, + "step": 14204 + }, + { + "epoch": 1.15, + "grad_norm": 5.1453802120454775, + "learning_rate": 7.0499186513682e-06, + "loss": 0.6246, + "step": 14205 + }, + { + "epoch": 1.15, + "grad_norm": 4.787934275303903, + "learning_rate": 7.049518746182255e-06, + "loss": 0.4698, + "step": 14206 + }, + { + "epoch": 1.15, + "grad_norm": 2.0871450044159077, + "learning_rate": 7.0491188252372344e-06, + "loss": 0.6549, + "step": 14207 + }, + { + "epoch": 1.15, + "grad_norm": 3.4986509024792256, + "learning_rate": 7.0487188885362115e-06, + "loss": 0.6304, + "step": 14208 + }, + { + "epoch": 1.15, + "grad_norm": 3.0676703886409675, + "learning_rate": 7.048318936082264e-06, + "loss": 0.6224, + "step": 14209 + }, + { + "epoch": 1.15, + "grad_norm": 3.866784754285266, + "learning_rate": 7.047918967878465e-06, + "loss": 0.4615, + "step": 14210 + }, + { + "epoch": 1.15, + "grad_norm": 2.6499533088665483, + "learning_rate": 7.047518983927891e-06, + "loss": 0.4718, + "step": 14211 + }, + { + "epoch": 1.15, + "grad_norm": 3.668578097527645, + "learning_rate": 7.047118984233618e-06, + "loss": 0.6364, + "step": 14212 + }, + { + "epoch": 1.15, + "grad_norm": 2.274191053878962, + "learning_rate": 7.04671896879872e-06, + "loss": 0.6038, + "step": 14213 + }, + { + "epoch": 1.15, + "grad_norm": 3.227721908211826, + "learning_rate": 7.046318937626275e-06, + "loss": 0.6532, + "step": 14214 + }, + { + "epoch": 1.15, + "grad_norm": 2.236891619411432, + "learning_rate": 7.0459188907193566e-06, + "loss": 0.6661, + "step": 14215 + }, + { + "epoch": 1.15, + "grad_norm": 3.1897841702304173, + "learning_rate": 7.045518828081041e-06, + "loss": 0.5675, + "step": 14216 + }, + { + "epoch": 1.15, + "grad_norm": 3.862769464211047, + "learning_rate": 7.045118749714408e-06, + "loss": 0.7569, + "step": 14217 + }, + { + "epoch": 1.15, + "grad_norm": 2.7098936322591896, + "learning_rate": 7.044718655622531e-06, + "loss": 0.549, + "step": 14218 + }, + { + "epoch": 1.15, + "grad_norm": 2.3318334835866166, + "learning_rate": 7.044318545808485e-06, + "loss": 0.385, + "step": 14219 + }, + { + "epoch": 1.15, + "grad_norm": 18.438469572456377, + "learning_rate": 7.043918420275348e-06, + "loss": 0.5976, + "step": 14220 + }, + { + "epoch": 1.16, + "grad_norm": 3.64866732986671, + "learning_rate": 7.043518279026198e-06, + "loss": 0.5169, + "step": 14221 + }, + { + "epoch": 1.16, + "grad_norm": 2.8022253428234407, + "learning_rate": 7.04311812206411e-06, + "loss": 0.6423, + "step": 14222 + }, + { + "epoch": 1.16, + "grad_norm": 3.2695242600483043, + "learning_rate": 7.042717949392162e-06, + "loss": 0.5445, + "step": 14223 + }, + { + "epoch": 1.16, + "grad_norm": 2.373034291469507, + "learning_rate": 7.042317761013428e-06, + "loss": 0.6268, + "step": 14224 + }, + { + "epoch": 1.16, + "grad_norm": 4.278726047941581, + "learning_rate": 7.041917556930988e-06, + "loss": 0.5275, + "step": 14225 + }, + { + "epoch": 1.16, + "grad_norm": 7.575303241550505, + "learning_rate": 7.041517337147921e-06, + "loss": 0.5914, + "step": 14226 + }, + { + "epoch": 1.16, + "grad_norm": 4.236060154047391, + "learning_rate": 7.0411171016673005e-06, + "loss": 0.5985, + "step": 14227 + }, + { + "epoch": 1.16, + "grad_norm": 3.72609517028221, + "learning_rate": 7.040716850492204e-06, + "loss": 0.6925, + "step": 14228 + }, + { + "epoch": 1.16, + "grad_norm": 3.562042962853014, + "learning_rate": 7.040316583625712e-06, + "loss": 0.6362, + "step": 14229 + }, + { + "epoch": 1.16, + "grad_norm": 3.077031168319329, + "learning_rate": 7.039916301070902e-06, + "loss": 0.5332, + "step": 14230 + }, + { + "epoch": 1.16, + "grad_norm": 3.2276407717421103, + "learning_rate": 7.03951600283085e-06, + "loss": 0.6154, + "step": 14231 + }, + { + "epoch": 1.16, + "grad_norm": 2.4644680798648673, + "learning_rate": 7.039115688908633e-06, + "loss": 0.65, + "step": 14232 + }, + { + "epoch": 1.16, + "grad_norm": 2.836473678887765, + "learning_rate": 7.038715359307332e-06, + "loss": 0.643, + "step": 14233 + }, + { + "epoch": 1.16, + "grad_norm": 31.734069037054216, + "learning_rate": 7.0383150140300236e-06, + "loss": 0.5541, + "step": 14234 + }, + { + "epoch": 1.16, + "grad_norm": 4.481432072482124, + "learning_rate": 7.037914653079787e-06, + "loss": 0.5053, + "step": 14235 + }, + { + "epoch": 1.16, + "grad_norm": 3.481188855081529, + "learning_rate": 7.037514276459698e-06, + "loss": 0.4337, + "step": 14236 + }, + { + "epoch": 1.16, + "grad_norm": 2.956238670935746, + "learning_rate": 7.0371138841728395e-06, + "loss": 0.5488, + "step": 14237 + }, + { + "epoch": 1.16, + "grad_norm": 2.6783277180415337, + "learning_rate": 7.036713476222288e-06, + "loss": 0.6124, + "step": 14238 + }, + { + "epoch": 1.16, + "grad_norm": 3.499927366795735, + "learning_rate": 7.036313052611121e-06, + "loss": 0.5339, + "step": 14239 + }, + { + "epoch": 1.16, + "grad_norm": 4.1823288130429335, + "learning_rate": 7.035912613342418e-06, + "loss": 0.6013, + "step": 14240 + }, + { + "epoch": 1.16, + "grad_norm": 3.787370209468694, + "learning_rate": 7.03551215841926e-06, + "loss": 0.5942, + "step": 14241 + }, + { + "epoch": 1.16, + "grad_norm": 3.1707196488507363, + "learning_rate": 7.0351116878447234e-06, + "loss": 0.7343, + "step": 14242 + }, + { + "epoch": 1.16, + "grad_norm": 2.620789120139993, + "learning_rate": 7.03471120162189e-06, + "loss": 0.5365, + "step": 14243 + }, + { + "epoch": 1.16, + "grad_norm": 3.697701424448276, + "learning_rate": 7.034310699753838e-06, + "loss": 0.6508, + "step": 14244 + }, + { + "epoch": 1.16, + "grad_norm": 3.205998466773204, + "learning_rate": 7.033910182243646e-06, + "loss": 0.683, + "step": 14245 + }, + { + "epoch": 1.16, + "grad_norm": 2.2415047488304283, + "learning_rate": 7.0335096490943944e-06, + "loss": 0.4936, + "step": 14246 + }, + { + "epoch": 1.16, + "grad_norm": 3.1970524813710424, + "learning_rate": 7.0331091003091645e-06, + "loss": 0.6232, + "step": 14247 + }, + { + "epoch": 1.16, + "grad_norm": 3.275575383702841, + "learning_rate": 7.032708535891035e-06, + "loss": 0.6115, + "step": 14248 + }, + { + "epoch": 1.16, + "grad_norm": 4.266359018917631, + "learning_rate": 7.032307955843084e-06, + "loss": 0.6165, + "step": 14249 + }, + { + "epoch": 1.16, + "grad_norm": 3.1378515633452344, + "learning_rate": 7.031907360168395e-06, + "loss": 0.5524, + "step": 14250 + }, + { + "epoch": 1.16, + "grad_norm": 7.249711138190236, + "learning_rate": 7.031506748870046e-06, + "loss": 0.5644, + "step": 14251 + }, + { + "epoch": 1.16, + "grad_norm": 3.2162692082218745, + "learning_rate": 7.031106121951119e-06, + "loss": 0.632, + "step": 14252 + }, + { + "epoch": 1.16, + "grad_norm": 3.5931438875311055, + "learning_rate": 7.030705479414693e-06, + "loss": 0.6657, + "step": 14253 + }, + { + "epoch": 1.16, + "grad_norm": 7.824382926667766, + "learning_rate": 7.030304821263848e-06, + "loss": 0.6022, + "step": 14254 + }, + { + "epoch": 1.16, + "grad_norm": 4.132514640687481, + "learning_rate": 7.029904147501667e-06, + "loss": 0.5773, + "step": 14255 + }, + { + "epoch": 1.16, + "grad_norm": 4.724806935404955, + "learning_rate": 7.029503458131231e-06, + "loss": 0.5868, + "step": 14256 + }, + { + "epoch": 1.16, + "grad_norm": 4.19500344275097, + "learning_rate": 7.029102753155618e-06, + "loss": 0.6365, + "step": 14257 + }, + { + "epoch": 1.16, + "grad_norm": 4.598390870970992, + "learning_rate": 7.02870203257791e-06, + "loss": 0.5698, + "step": 14258 + }, + { + "epoch": 1.16, + "grad_norm": 4.179967694382992, + "learning_rate": 7.02830129640119e-06, + "loss": 0.6406, + "step": 14259 + }, + { + "epoch": 1.16, + "grad_norm": 3.5470600459981214, + "learning_rate": 7.027900544628538e-06, + "loss": 0.5384, + "step": 14260 + }, + { + "epoch": 1.16, + "grad_norm": 1.944454053254147, + "learning_rate": 7.027499777263036e-06, + "loss": 0.5242, + "step": 14261 + }, + { + "epoch": 1.16, + "grad_norm": 3.719403525036206, + "learning_rate": 7.027098994307764e-06, + "loss": 0.6097, + "step": 14262 + }, + { + "epoch": 1.16, + "grad_norm": 6.2569974974738685, + "learning_rate": 7.026698195765806e-06, + "loss": 0.6217, + "step": 14263 + }, + { + "epoch": 1.16, + "grad_norm": 3.5394625905364725, + "learning_rate": 7.026297381640244e-06, + "loss": 0.6382, + "step": 14264 + }, + { + "epoch": 1.16, + "grad_norm": 4.98790303985018, + "learning_rate": 7.025896551934157e-06, + "loss": 0.5931, + "step": 14265 + }, + { + "epoch": 1.16, + "grad_norm": 4.1845552567708575, + "learning_rate": 7.025495706650628e-06, + "loss": 0.6124, + "step": 14266 + }, + { + "epoch": 1.16, + "grad_norm": 3.283451759215607, + "learning_rate": 7.025094845792741e-06, + "loss": 0.5638, + "step": 14267 + }, + { + "epoch": 1.16, + "grad_norm": 5.135297357234463, + "learning_rate": 7.024693969363577e-06, + "loss": 0.7682, + "step": 14268 + }, + { + "epoch": 1.16, + "grad_norm": 4.288630295475565, + "learning_rate": 7.02429307736622e-06, + "loss": 0.6353, + "step": 14269 + }, + { + "epoch": 1.16, + "grad_norm": 8.26184866605229, + "learning_rate": 7.023892169803748e-06, + "loss": 0.5387, + "step": 14270 + }, + { + "epoch": 1.16, + "grad_norm": 2.685698657168184, + "learning_rate": 7.02349124667925e-06, + "loss": 0.5178, + "step": 14271 + }, + { + "epoch": 1.16, + "grad_norm": 4.350758759440808, + "learning_rate": 7.0230903079958035e-06, + "loss": 0.4892, + "step": 14272 + }, + { + "epoch": 1.16, + "grad_norm": 2.058275065640589, + "learning_rate": 7.022689353756493e-06, + "loss": 0.5459, + "step": 14273 + }, + { + "epoch": 1.16, + "grad_norm": 3.4042189692306204, + "learning_rate": 7.022288383964403e-06, + "loss": 0.6865, + "step": 14274 + }, + { + "epoch": 1.16, + "grad_norm": 3.1462412949832093, + "learning_rate": 7.021887398622616e-06, + "loss": 0.6052, + "step": 14275 + }, + { + "epoch": 1.16, + "grad_norm": 3.90072606020423, + "learning_rate": 7.021486397734214e-06, + "loss": 0.6066, + "step": 14276 + }, + { + "epoch": 1.16, + "grad_norm": 3.5941592100779425, + "learning_rate": 7.0210853813022804e-06, + "loss": 0.5538, + "step": 14277 + }, + { + "epoch": 1.16, + "grad_norm": 19.374855319086677, + "learning_rate": 7.020684349329899e-06, + "loss": 0.5373, + "step": 14278 + }, + { + "epoch": 1.16, + "grad_norm": 7.898111934725016, + "learning_rate": 7.0202833018201556e-06, + "loss": 0.6998, + "step": 14279 + }, + { + "epoch": 1.16, + "grad_norm": 3.444032974062626, + "learning_rate": 7.0198822387761325e-06, + "loss": 0.6366, + "step": 14280 + }, + { + "epoch": 1.16, + "grad_norm": 3.1021112313858734, + "learning_rate": 7.019481160200912e-06, + "loss": 0.6034, + "step": 14281 + }, + { + "epoch": 1.16, + "grad_norm": 4.188810116655731, + "learning_rate": 7.019080066097578e-06, + "loss": 0.5915, + "step": 14282 + }, + { + "epoch": 1.16, + "grad_norm": 2.5392609469184104, + "learning_rate": 7.018678956469217e-06, + "loss": 0.4396, + "step": 14283 + }, + { + "epoch": 1.16, + "grad_norm": 4.364169499956356, + "learning_rate": 7.018277831318911e-06, + "loss": 0.6798, + "step": 14284 + }, + { + "epoch": 1.16, + "grad_norm": 3.058440700169329, + "learning_rate": 7.017876690649747e-06, + "loss": 0.6833, + "step": 14285 + }, + { + "epoch": 1.16, + "grad_norm": 3.730260520261365, + "learning_rate": 7.017475534464806e-06, + "loss": 0.5656, + "step": 14286 + }, + { + "epoch": 1.16, + "grad_norm": 2.4517320227522026, + "learning_rate": 7.017074362767176e-06, + "loss": 0.6382, + "step": 14287 + }, + { + "epoch": 1.16, + "grad_norm": 3.243880959335342, + "learning_rate": 7.016673175559939e-06, + "loss": 0.6384, + "step": 14288 + }, + { + "epoch": 1.16, + "grad_norm": 5.6257644524606265, + "learning_rate": 7.0162719728461804e-06, + "loss": 0.5352, + "step": 14289 + }, + { + "epoch": 1.16, + "grad_norm": 4.600378816527835, + "learning_rate": 7.015870754628985e-06, + "loss": 0.4983, + "step": 14290 + }, + { + "epoch": 1.16, + "grad_norm": 2.4691132195469168, + "learning_rate": 7.01546952091144e-06, + "loss": 0.4924, + "step": 14291 + }, + { + "epoch": 1.16, + "grad_norm": 3.2473224802264307, + "learning_rate": 7.015068271696628e-06, + "loss": 0.5009, + "step": 14292 + }, + { + "epoch": 1.16, + "grad_norm": 3.3686684626597145, + "learning_rate": 7.014667006987634e-06, + "loss": 0.7183, + "step": 14293 + }, + { + "epoch": 1.16, + "grad_norm": 2.820540035102836, + "learning_rate": 7.014265726787546e-06, + "loss": 0.5231, + "step": 14294 + }, + { + "epoch": 1.16, + "grad_norm": 3.789686339556025, + "learning_rate": 7.013864431099446e-06, + "loss": 0.5128, + "step": 14295 + }, + { + "epoch": 1.16, + "grad_norm": 2.335494360825453, + "learning_rate": 7.013463119926425e-06, + "loss": 0.4913, + "step": 14296 + }, + { + "epoch": 1.16, + "grad_norm": 3.534001407306064, + "learning_rate": 7.013061793271563e-06, + "loss": 0.5593, + "step": 14297 + }, + { + "epoch": 1.16, + "grad_norm": 5.18746331545558, + "learning_rate": 7.012660451137947e-06, + "loss": 0.7104, + "step": 14298 + }, + { + "epoch": 1.16, + "grad_norm": 5.6493774004037585, + "learning_rate": 7.012259093528666e-06, + "loss": 0.6258, + "step": 14299 + }, + { + "epoch": 1.16, + "grad_norm": 2.9257381941606098, + "learning_rate": 7.011857720446805e-06, + "loss": 0.6526, + "step": 14300 + }, + { + "epoch": 1.16, + "grad_norm": 2.633411593380638, + "learning_rate": 7.011456331895449e-06, + "loss": 0.6897, + "step": 14301 + }, + { + "epoch": 1.16, + "grad_norm": 2.5222965533559267, + "learning_rate": 7.011054927877683e-06, + "loss": 0.5715, + "step": 14302 + }, + { + "epoch": 1.16, + "grad_norm": 3.292623548327875, + "learning_rate": 7.010653508396598e-06, + "loss": 0.6149, + "step": 14303 + }, + { + "epoch": 1.16, + "grad_norm": 2.597249919260393, + "learning_rate": 7.010252073455277e-06, + "loss": 0.5925, + "step": 14304 + }, + { + "epoch": 1.16, + "grad_norm": 4.415128115344621, + "learning_rate": 7.009850623056807e-06, + "loss": 0.5944, + "step": 14305 + }, + { + "epoch": 1.16, + "grad_norm": 2.4897428841915263, + "learning_rate": 7.009449157204275e-06, + "loss": 0.5302, + "step": 14306 + }, + { + "epoch": 1.16, + "grad_norm": 4.657284668879697, + "learning_rate": 7.00904767590077e-06, + "loss": 0.6341, + "step": 14307 + }, + { + "epoch": 1.16, + "grad_norm": 4.419504732160633, + "learning_rate": 7.008646179149377e-06, + "loss": 0.4999, + "step": 14308 + }, + { + "epoch": 1.16, + "grad_norm": 10.280517335341065, + "learning_rate": 7.008244666953182e-06, + "loss": 0.6955, + "step": 14309 + }, + { + "epoch": 1.16, + "grad_norm": 14.099013589556494, + "learning_rate": 7.007843139315275e-06, + "loss": 0.5176, + "step": 14310 + }, + { + "epoch": 1.16, + "grad_norm": 3.458984400780439, + "learning_rate": 7.007441596238742e-06, + "loss": 0.51, + "step": 14311 + }, + { + "epoch": 1.16, + "grad_norm": 5.383464098332261, + "learning_rate": 7.0070400377266715e-06, + "loss": 0.5011, + "step": 14312 + }, + { + "epoch": 1.16, + "grad_norm": 3.239818942293765, + "learning_rate": 7.00663846378215e-06, + "loss": 0.6235, + "step": 14313 + }, + { + "epoch": 1.16, + "grad_norm": 2.8546949092995773, + "learning_rate": 7.006236874408265e-06, + "loss": 0.4533, + "step": 14314 + }, + { + "epoch": 1.16, + "grad_norm": 4.1804134820621055, + "learning_rate": 7.005835269608106e-06, + "loss": 0.4069, + "step": 14315 + }, + { + "epoch": 1.16, + "grad_norm": 3.8695467842726194, + "learning_rate": 7.005433649384761e-06, + "loss": 0.3909, + "step": 14316 + }, + { + "epoch": 1.16, + "grad_norm": 3.272513515418013, + "learning_rate": 7.0050320137413154e-06, + "loss": 0.5891, + "step": 14317 + }, + { + "epoch": 1.16, + "grad_norm": 3.7416433539010336, + "learning_rate": 7.00463036268086e-06, + "loss": 0.5106, + "step": 14318 + }, + { + "epoch": 1.16, + "grad_norm": 2.5495522678032243, + "learning_rate": 7.004228696206482e-06, + "loss": 0.5411, + "step": 14319 + }, + { + "epoch": 1.16, + "grad_norm": 2.1458036016879634, + "learning_rate": 7.003827014321272e-06, + "loss": 0.472, + "step": 14320 + }, + { + "epoch": 1.16, + "grad_norm": 3.338365066157051, + "learning_rate": 7.0034253170283154e-06, + "loss": 0.681, + "step": 14321 + }, + { + "epoch": 1.16, + "grad_norm": 8.21066698040215, + "learning_rate": 7.003023604330702e-06, + "loss": 0.5258, + "step": 14322 + }, + { + "epoch": 1.16, + "grad_norm": 7.263807912585048, + "learning_rate": 7.002621876231521e-06, + "loss": 0.5927, + "step": 14323 + }, + { + "epoch": 1.16, + "grad_norm": 4.607377515690884, + "learning_rate": 7.002220132733864e-06, + "loss": 0.6158, + "step": 14324 + }, + { + "epoch": 1.16, + "grad_norm": 3.7137709876487484, + "learning_rate": 7.001818373840815e-06, + "loss": 0.5318, + "step": 14325 + }, + { + "epoch": 1.16, + "grad_norm": 5.081545836644973, + "learning_rate": 7.001416599555466e-06, + "loss": 0.5295, + "step": 14326 + }, + { + "epoch": 1.16, + "grad_norm": 2.8397771366828812, + "learning_rate": 7.001014809880906e-06, + "loss": 0.6522, + "step": 14327 + }, + { + "epoch": 1.16, + "grad_norm": 4.086702431562511, + "learning_rate": 7.000613004820225e-06, + "loss": 0.6997, + "step": 14328 + }, + { + "epoch": 1.16, + "grad_norm": 2.930224267265713, + "learning_rate": 7.000211184376512e-06, + "loss": 0.6057, + "step": 14329 + }, + { + "epoch": 1.16, + "grad_norm": 3.5875215759806083, + "learning_rate": 6.999809348552855e-06, + "loss": 0.7486, + "step": 14330 + }, + { + "epoch": 1.16, + "grad_norm": 2.657890963189876, + "learning_rate": 6.999407497352348e-06, + "loss": 0.657, + "step": 14331 + }, + { + "epoch": 1.16, + "grad_norm": 6.243013781429413, + "learning_rate": 6.9990056307780765e-06, + "loss": 0.5178, + "step": 14332 + }, + { + "epoch": 1.16, + "grad_norm": 3.708661312104522, + "learning_rate": 6.998603748833133e-06, + "loss": 0.4944, + "step": 14333 + }, + { + "epoch": 1.16, + "grad_norm": 3.1312206968052423, + "learning_rate": 6.998201851520605e-06, + "loss": 0.6599, + "step": 14334 + }, + { + "epoch": 1.16, + "grad_norm": 3.6656408090384676, + "learning_rate": 6.997799938843587e-06, + "loss": 0.6819, + "step": 14335 + }, + { + "epoch": 1.16, + "grad_norm": 3.2935957007769057, + "learning_rate": 6.997398010805166e-06, + "loss": 0.7279, + "step": 14336 + }, + { + "epoch": 1.16, + "grad_norm": 5.888406169172882, + "learning_rate": 6.9969960674084345e-06, + "loss": 0.6371, + "step": 14337 + }, + { + "epoch": 1.16, + "grad_norm": 3.774438784174132, + "learning_rate": 6.996594108656482e-06, + "loss": 0.5928, + "step": 14338 + }, + { + "epoch": 1.16, + "grad_norm": 2.2188273061923267, + "learning_rate": 6.996192134552397e-06, + "loss": 0.6339, + "step": 14339 + }, + { + "epoch": 1.16, + "grad_norm": 3.709104016510214, + "learning_rate": 6.995790145099276e-06, + "loss": 0.5738, + "step": 14340 + }, + { + "epoch": 1.16, + "grad_norm": 4.842457330602665, + "learning_rate": 6.995388140300205e-06, + "loss": 0.5877, + "step": 14341 + }, + { + "epoch": 1.16, + "grad_norm": 4.681528044720819, + "learning_rate": 6.994986120158278e-06, + "loss": 0.5867, + "step": 14342 + }, + { + "epoch": 1.16, + "grad_norm": 9.32937426644189, + "learning_rate": 6.994584084676583e-06, + "loss": 0.5722, + "step": 14343 + }, + { + "epoch": 1.17, + "grad_norm": 3.1848223850243587, + "learning_rate": 6.994182033858215e-06, + "loss": 0.5362, + "step": 14344 + }, + { + "epoch": 1.17, + "grad_norm": 6.564815504002435, + "learning_rate": 6.9937799677062626e-06, + "loss": 0.6368, + "step": 14345 + }, + { + "epoch": 1.17, + "grad_norm": 3.1987090748418106, + "learning_rate": 6.9933778862238186e-06, + "loss": 0.6182, + "step": 14346 + }, + { + "epoch": 1.17, + "grad_norm": 3.0578874080368137, + "learning_rate": 6.992975789413974e-06, + "loss": 0.4869, + "step": 14347 + }, + { + "epoch": 1.17, + "grad_norm": 25.621097330749468, + "learning_rate": 6.992573677279822e-06, + "loss": 0.5725, + "step": 14348 + }, + { + "epoch": 1.17, + "grad_norm": 3.7363801280548934, + "learning_rate": 6.992171549824453e-06, + "loss": 0.4721, + "step": 14349 + }, + { + "epoch": 1.17, + "grad_norm": 2.7744705247068238, + "learning_rate": 6.991769407050961e-06, + "loss": 0.7033, + "step": 14350 + }, + { + "epoch": 1.17, + "grad_norm": 3.474478192986402, + "learning_rate": 6.991367248962435e-06, + "loss": 0.568, + "step": 14351 + }, + { + "epoch": 1.17, + "grad_norm": 4.546957961790935, + "learning_rate": 6.990965075561971e-06, + "loss": 0.4784, + "step": 14352 + }, + { + "epoch": 1.17, + "grad_norm": 6.11401562522574, + "learning_rate": 6.990562886852658e-06, + "loss": 0.6165, + "step": 14353 + }, + { + "epoch": 1.17, + "grad_norm": 3.9047566310040525, + "learning_rate": 6.99016068283759e-06, + "loss": 0.5949, + "step": 14354 + }, + { + "epoch": 1.17, + "grad_norm": 2.9266982396267665, + "learning_rate": 6.989758463519859e-06, + "loss": 0.6844, + "step": 14355 + }, + { + "epoch": 1.17, + "grad_norm": 3.402016149153466, + "learning_rate": 6.98935622890256e-06, + "loss": 0.6214, + "step": 14356 + }, + { + "epoch": 1.17, + "grad_norm": 2.982808532668181, + "learning_rate": 6.988953978988781e-06, + "loss": 0.6567, + "step": 14357 + }, + { + "epoch": 1.17, + "grad_norm": 3.898175964082523, + "learning_rate": 6.988551713781622e-06, + "loss": 0.6108, + "step": 14358 + }, + { + "epoch": 1.17, + "grad_norm": 4.662277020948855, + "learning_rate": 6.988149433284168e-06, + "loss": 0.6059, + "step": 14359 + }, + { + "epoch": 1.17, + "grad_norm": 3.4111622186054538, + "learning_rate": 6.98774713749952e-06, + "loss": 0.614, + "step": 14360 + }, + { + "epoch": 1.17, + "grad_norm": 2.7320782341051872, + "learning_rate": 6.987344826430766e-06, + "loss": 0.5392, + "step": 14361 + }, + { + "epoch": 1.17, + "grad_norm": 3.4686411469204526, + "learning_rate": 6.986942500081001e-06, + "loss": 0.6384, + "step": 14362 + }, + { + "epoch": 1.17, + "grad_norm": 3.173090377014203, + "learning_rate": 6.986540158453319e-06, + "loss": 0.4716, + "step": 14363 + }, + { + "epoch": 1.17, + "grad_norm": 3.096791265309678, + "learning_rate": 6.986137801550812e-06, + "loss": 0.4752, + "step": 14364 + }, + { + "epoch": 1.17, + "grad_norm": 4.8143501926506875, + "learning_rate": 6.985735429376577e-06, + "loss": 0.6, + "step": 14365 + }, + { + "epoch": 1.17, + "grad_norm": 2.4462063820842443, + "learning_rate": 6.985333041933705e-06, + "loss": 0.6659, + "step": 14366 + }, + { + "epoch": 1.17, + "grad_norm": 3.6728921012953752, + "learning_rate": 6.984930639225291e-06, + "loss": 0.6288, + "step": 14367 + }, + { + "epoch": 1.17, + "grad_norm": 5.223560767515796, + "learning_rate": 6.98452822125443e-06, + "loss": 0.5641, + "step": 14368 + }, + { + "epoch": 1.17, + "grad_norm": 3.0925376791391344, + "learning_rate": 6.984125788024214e-06, + "loss": 0.5045, + "step": 14369 + }, + { + "epoch": 1.17, + "grad_norm": 3.480869296655566, + "learning_rate": 6.98372333953774e-06, + "loss": 0.5843, + "step": 14370 + }, + { + "epoch": 1.17, + "grad_norm": 2.8390158419798173, + "learning_rate": 6.9833208757981e-06, + "loss": 0.6507, + "step": 14371 + }, + { + "epoch": 1.17, + "grad_norm": 3.006454821342152, + "learning_rate": 6.982918396808391e-06, + "loss": 0.593, + "step": 14372 + }, + { + "epoch": 1.17, + "grad_norm": 5.573638379559762, + "learning_rate": 6.982515902571706e-06, + "loss": 0.6283, + "step": 14373 + }, + { + "epoch": 1.17, + "grad_norm": 4.566660085411355, + "learning_rate": 6.982113393091141e-06, + "loss": 0.7519, + "step": 14374 + }, + { + "epoch": 1.17, + "grad_norm": 8.681217432274263, + "learning_rate": 6.981710868369789e-06, + "loss": 0.4871, + "step": 14375 + }, + { + "epoch": 1.17, + "grad_norm": 2.8482653095466577, + "learning_rate": 6.9813083284107476e-06, + "loss": 0.5129, + "step": 14376 + }, + { + "epoch": 1.17, + "grad_norm": 4.175341198785065, + "learning_rate": 6.9809057732171115e-06, + "loss": 0.6001, + "step": 14377 + }, + { + "epoch": 1.17, + "grad_norm": 2.8747275560042667, + "learning_rate": 6.980503202791975e-06, + "loss": 0.6338, + "step": 14378 + }, + { + "epoch": 1.17, + "grad_norm": 3.5022422261109445, + "learning_rate": 6.980100617138433e-06, + "loss": 0.6773, + "step": 14379 + }, + { + "epoch": 1.17, + "grad_norm": 3.962739994178937, + "learning_rate": 6.9796980162595816e-06, + "loss": 0.6986, + "step": 14380 + }, + { + "epoch": 1.17, + "grad_norm": 6.391472358760104, + "learning_rate": 6.979295400158519e-06, + "loss": 0.6048, + "step": 14381 + }, + { + "epoch": 1.17, + "grad_norm": 2.6423739026527975, + "learning_rate": 6.9788927688383375e-06, + "loss": 0.5914, + "step": 14382 + }, + { + "epoch": 1.17, + "grad_norm": 3.135679931640387, + "learning_rate": 6.978490122302134e-06, + "loss": 0.4888, + "step": 14383 + }, + { + "epoch": 1.17, + "grad_norm": 2.239844735822562, + "learning_rate": 6.978087460553005e-06, + "loss": 0.529, + "step": 14384 + }, + { + "epoch": 1.17, + "grad_norm": 4.794632692026889, + "learning_rate": 6.977684783594047e-06, + "loss": 0.5175, + "step": 14385 + }, + { + "epoch": 1.17, + "grad_norm": 2.3501057830500023, + "learning_rate": 6.977282091428354e-06, + "loss": 0.5594, + "step": 14386 + }, + { + "epoch": 1.17, + "grad_norm": 2.870632360650033, + "learning_rate": 6.976879384059025e-06, + "loss": 0.6589, + "step": 14387 + }, + { + "epoch": 1.17, + "grad_norm": 2.8998745782701274, + "learning_rate": 6.976476661489156e-06, + "loss": 0.4675, + "step": 14388 + }, + { + "epoch": 1.17, + "grad_norm": 37.81231139900525, + "learning_rate": 6.976073923721844e-06, + "loss": 0.5786, + "step": 14389 + }, + { + "epoch": 1.17, + "grad_norm": 3.548745545461503, + "learning_rate": 6.975671170760184e-06, + "loss": 0.6791, + "step": 14390 + }, + { + "epoch": 1.17, + "grad_norm": 3.7472913912717685, + "learning_rate": 6.975268402607273e-06, + "loss": 0.6195, + "step": 14391 + }, + { + "epoch": 1.17, + "grad_norm": 4.039606035480137, + "learning_rate": 6.974865619266209e-06, + "loss": 0.5425, + "step": 14392 + }, + { + "epoch": 1.17, + "grad_norm": 3.7674261132181988, + "learning_rate": 6.974462820740089e-06, + "loss": 0.7325, + "step": 14393 + }, + { + "epoch": 1.17, + "grad_norm": 16.828672966850558, + "learning_rate": 6.9740600070320095e-06, + "loss": 0.5822, + "step": 14394 + }, + { + "epoch": 1.17, + "grad_norm": 3.179237926362057, + "learning_rate": 6.973657178145068e-06, + "loss": 0.644, + "step": 14395 + }, + { + "epoch": 1.17, + "grad_norm": 2.6762755940609977, + "learning_rate": 6.9732543340823625e-06, + "loss": 0.5259, + "step": 14396 + }, + { + "epoch": 1.17, + "grad_norm": 10.92428157152755, + "learning_rate": 6.97285147484699e-06, + "loss": 0.62, + "step": 14397 + }, + { + "epoch": 1.17, + "grad_norm": 4.45452400649346, + "learning_rate": 6.972448600442049e-06, + "loss": 0.5079, + "step": 14398 + }, + { + "epoch": 1.17, + "grad_norm": 3.52073172281811, + "learning_rate": 6.972045710870635e-06, + "loss": 0.6075, + "step": 14399 + }, + { + "epoch": 1.17, + "grad_norm": 3.7215429942568576, + "learning_rate": 6.971642806135848e-06, + "loss": 0.4868, + "step": 14400 + }, + { + "epoch": 1.17, + "grad_norm": 3.3293480967158993, + "learning_rate": 6.9712398862407855e-06, + "loss": 0.6403, + "step": 14401 + }, + { + "epoch": 1.17, + "grad_norm": 4.008981933670134, + "learning_rate": 6.970836951188546e-06, + "loss": 0.6132, + "step": 14402 + }, + { + "epoch": 1.17, + "grad_norm": 2.6923034952557425, + "learning_rate": 6.970434000982227e-06, + "loss": 0.5692, + "step": 14403 + }, + { + "epoch": 1.17, + "grad_norm": 2.6762156985686643, + "learning_rate": 6.970031035624927e-06, + "loss": 0.5048, + "step": 14404 + }, + { + "epoch": 1.17, + "grad_norm": 2.267599241682459, + "learning_rate": 6.969628055119743e-06, + "loss": 0.5385, + "step": 14405 + }, + { + "epoch": 1.17, + "grad_norm": 5.771215214722076, + "learning_rate": 6.969225059469778e-06, + "loss": 0.6546, + "step": 14406 + }, + { + "epoch": 1.17, + "grad_norm": 2.9552246423485613, + "learning_rate": 6.9688220486781266e-06, + "loss": 0.6403, + "step": 14407 + }, + { + "epoch": 1.17, + "grad_norm": 2.166165497279776, + "learning_rate": 6.9684190227478876e-06, + "loss": 0.5658, + "step": 14408 + }, + { + "epoch": 1.17, + "grad_norm": 15.575607020082467, + "learning_rate": 6.968015981682163e-06, + "loss": 0.6496, + "step": 14409 + }, + { + "epoch": 1.17, + "grad_norm": 2.6021809587434213, + "learning_rate": 6.96761292548405e-06, + "loss": 0.7036, + "step": 14410 + }, + { + "epoch": 1.17, + "grad_norm": 2.546907069565274, + "learning_rate": 6.967209854156647e-06, + "loss": 0.6077, + "step": 14411 + }, + { + "epoch": 1.17, + "grad_norm": 2.7488686561205156, + "learning_rate": 6.966806767703054e-06, + "loss": 0.6283, + "step": 14412 + }, + { + "epoch": 1.17, + "grad_norm": 2.995010561152073, + "learning_rate": 6.966403666126371e-06, + "loss": 0.4886, + "step": 14413 + }, + { + "epoch": 1.17, + "grad_norm": 3.810030776176775, + "learning_rate": 6.9660005494296965e-06, + "loss": 0.4791, + "step": 14414 + }, + { + "epoch": 1.17, + "grad_norm": 3.42248382242463, + "learning_rate": 6.965597417616131e-06, + "loss": 0.5891, + "step": 14415 + }, + { + "epoch": 1.17, + "grad_norm": 3.1886976142437597, + "learning_rate": 6.965194270688773e-06, + "loss": 0.4953, + "step": 14416 + }, + { + "epoch": 1.17, + "grad_norm": 2.7148153760039633, + "learning_rate": 6.964791108650725e-06, + "loss": 0.568, + "step": 14417 + }, + { + "epoch": 1.17, + "grad_norm": 18.725090757385285, + "learning_rate": 6.964387931505084e-06, + "loss": 0.601, + "step": 14418 + }, + { + "epoch": 1.17, + "grad_norm": 5.304822505435947, + "learning_rate": 6.963984739254952e-06, + "loss": 0.6416, + "step": 14419 + }, + { + "epoch": 1.17, + "grad_norm": 3.0611772645721254, + "learning_rate": 6.963581531903427e-06, + "loss": 0.4845, + "step": 14420 + }, + { + "epoch": 1.17, + "grad_norm": 4.958420776103195, + "learning_rate": 6.963178309453612e-06, + "loss": 0.569, + "step": 14421 + }, + { + "epoch": 1.17, + "grad_norm": 3.6402015672583374, + "learning_rate": 6.9627750719086075e-06, + "loss": 0.356, + "step": 14422 + }, + { + "epoch": 1.17, + "grad_norm": 6.788615198498025, + "learning_rate": 6.9623718192715105e-06, + "loss": 0.5999, + "step": 14423 + }, + { + "epoch": 1.17, + "grad_norm": 2.8605981501853446, + "learning_rate": 6.961968551545425e-06, + "loss": 0.521, + "step": 14424 + }, + { + "epoch": 1.17, + "grad_norm": 3.436802712074126, + "learning_rate": 6.96156526873345e-06, + "loss": 0.6463, + "step": 14425 + }, + { + "epoch": 1.17, + "grad_norm": 2.5790178172689577, + "learning_rate": 6.961161970838689e-06, + "loss": 0.631, + "step": 14426 + }, + { + "epoch": 1.17, + "grad_norm": 4.628268415180049, + "learning_rate": 6.96075865786424e-06, + "loss": 0.5626, + "step": 14427 + }, + { + "epoch": 1.17, + "grad_norm": 2.6567155756065124, + "learning_rate": 6.960355329813205e-06, + "loss": 0.6449, + "step": 14428 + }, + { + "epoch": 1.17, + "grad_norm": 6.994920028866129, + "learning_rate": 6.9599519866886865e-06, + "loss": 0.5483, + "step": 14429 + }, + { + "epoch": 1.17, + "grad_norm": 4.352044545876115, + "learning_rate": 6.959548628493785e-06, + "loss": 0.6923, + "step": 14430 + }, + { + "epoch": 1.17, + "grad_norm": 3.6040236462593978, + "learning_rate": 6.959145255231602e-06, + "loss": 0.5904, + "step": 14431 + }, + { + "epoch": 1.17, + "grad_norm": 8.142022398021625, + "learning_rate": 6.958741866905238e-06, + "loss": 0.3811, + "step": 14432 + }, + { + "epoch": 1.17, + "grad_norm": 3.464647890254119, + "learning_rate": 6.9583384635177966e-06, + "loss": 0.7376, + "step": 14433 + }, + { + "epoch": 1.17, + "grad_norm": 2.934550034235554, + "learning_rate": 6.95793504507238e-06, + "loss": 0.5579, + "step": 14434 + }, + { + "epoch": 1.17, + "grad_norm": 5.489098908607065, + "learning_rate": 6.957531611572087e-06, + "loss": 0.5379, + "step": 14435 + }, + { + "epoch": 1.17, + "grad_norm": 2.4403418770921634, + "learning_rate": 6.957128163020022e-06, + "loss": 0.5621, + "step": 14436 + }, + { + "epoch": 1.17, + "grad_norm": 3.695129049865734, + "learning_rate": 6.956724699419286e-06, + "loss": 0.6381, + "step": 14437 + }, + { + "epoch": 1.17, + "grad_norm": 3.6233689748503255, + "learning_rate": 6.956321220772984e-06, + "loss": 0.5965, + "step": 14438 + }, + { + "epoch": 1.17, + "grad_norm": 3.453742364885673, + "learning_rate": 6.955917727084216e-06, + "loss": 0.7493, + "step": 14439 + }, + { + "epoch": 1.17, + "grad_norm": 3.852200289282606, + "learning_rate": 6.955514218356085e-06, + "loss": 0.6375, + "step": 14440 + }, + { + "epoch": 1.17, + "grad_norm": 8.907921466841959, + "learning_rate": 6.955110694591692e-06, + "loss": 0.5739, + "step": 14441 + }, + { + "epoch": 1.17, + "grad_norm": 3.0053273914422793, + "learning_rate": 6.954707155794144e-06, + "loss": 0.5764, + "step": 14442 + }, + { + "epoch": 1.17, + "grad_norm": 2.4444362137450395, + "learning_rate": 6.95430360196654e-06, + "loss": 0.6312, + "step": 14443 + }, + { + "epoch": 1.17, + "grad_norm": 4.552504380206663, + "learning_rate": 6.953900033111985e-06, + "loss": 0.6743, + "step": 14444 + }, + { + "epoch": 1.17, + "grad_norm": 10.594083810729677, + "learning_rate": 6.95349644923358e-06, + "loss": 0.6278, + "step": 14445 + }, + { + "epoch": 1.17, + "grad_norm": 2.0910392908780358, + "learning_rate": 6.953092850334431e-06, + "loss": 0.5347, + "step": 14446 + }, + { + "epoch": 1.17, + "grad_norm": 2.5170330814506703, + "learning_rate": 6.9526892364176405e-06, + "loss": 0.489, + "step": 14447 + }, + { + "epoch": 1.17, + "grad_norm": 14.807003721571265, + "learning_rate": 6.95228560748631e-06, + "loss": 0.6311, + "step": 14448 + }, + { + "epoch": 1.17, + "grad_norm": 2.69009518789666, + "learning_rate": 6.951881963543544e-06, + "loss": 0.5366, + "step": 14449 + }, + { + "epoch": 1.17, + "grad_norm": 5.047129019330523, + "learning_rate": 6.951478304592448e-06, + "loss": 0.6048, + "step": 14450 + }, + { + "epoch": 1.17, + "grad_norm": 7.696321536972059, + "learning_rate": 6.951074630636124e-06, + "loss": 0.6983, + "step": 14451 + }, + { + "epoch": 1.17, + "grad_norm": 3.995056215344097, + "learning_rate": 6.950670941677678e-06, + "loss": 0.5687, + "step": 14452 + }, + { + "epoch": 1.17, + "grad_norm": 7.888937232835702, + "learning_rate": 6.95026723772021e-06, + "loss": 0.5067, + "step": 14453 + }, + { + "epoch": 1.17, + "grad_norm": 4.184677893703467, + "learning_rate": 6.9498635187668276e-06, + "loss": 0.502, + "step": 14454 + }, + { + "epoch": 1.17, + "grad_norm": 2.700464604971667, + "learning_rate": 6.949459784820633e-06, + "loss": 0.4556, + "step": 14455 + }, + { + "epoch": 1.17, + "grad_norm": 1.9415550473834058, + "learning_rate": 6.9490560358847335e-06, + "loss": 0.5243, + "step": 14456 + }, + { + "epoch": 1.17, + "grad_norm": 6.112133684629271, + "learning_rate": 6.9486522719622305e-06, + "loss": 0.6074, + "step": 14457 + }, + { + "epoch": 1.17, + "grad_norm": 2.558293499464618, + "learning_rate": 6.94824849305623e-06, + "loss": 0.5656, + "step": 14458 + }, + { + "epoch": 1.17, + "grad_norm": 3.034697989777955, + "learning_rate": 6.947844699169837e-06, + "loss": 0.6923, + "step": 14459 + }, + { + "epoch": 1.17, + "grad_norm": 2.986962921572295, + "learning_rate": 6.9474408903061555e-06, + "loss": 0.4884, + "step": 14460 + }, + { + "epoch": 1.17, + "grad_norm": 3.493398004113482, + "learning_rate": 6.94703706646829e-06, + "loss": 0.5289, + "step": 14461 + }, + { + "epoch": 1.17, + "grad_norm": 3.4731142285957204, + "learning_rate": 6.9466332276593474e-06, + "loss": 0.592, + "step": 14462 + }, + { + "epoch": 1.17, + "grad_norm": 8.763589641777372, + "learning_rate": 6.9462293738824315e-06, + "loss": 0.6499, + "step": 14463 + }, + { + "epoch": 1.17, + "grad_norm": 4.52475535840548, + "learning_rate": 6.9458255051406474e-06, + "loss": 0.592, + "step": 14464 + }, + { + "epoch": 1.17, + "grad_norm": 4.695638273405512, + "learning_rate": 6.9454216214371e-06, + "loss": 0.619, + "step": 14465 + }, + { + "epoch": 1.17, + "grad_norm": 6.5616661267180785, + "learning_rate": 6.945017722774898e-06, + "loss": 0.7059, + "step": 14466 + }, + { + "epoch": 1.18, + "grad_norm": 3.266149976332877, + "learning_rate": 6.944613809157146e-06, + "loss": 0.6844, + "step": 14467 + }, + { + "epoch": 1.18, + "grad_norm": 3.073413464543612, + "learning_rate": 6.944209880586946e-06, + "loss": 0.5085, + "step": 14468 + }, + { + "epoch": 1.18, + "grad_norm": 5.373660738438155, + "learning_rate": 6.943805937067407e-06, + "loss": 0.6729, + "step": 14469 + }, + { + "epoch": 1.18, + "grad_norm": 4.305212600092387, + "learning_rate": 6.943401978601636e-06, + "loss": 0.5937, + "step": 14470 + }, + { + "epoch": 1.18, + "grad_norm": 2.9393802118129715, + "learning_rate": 6.942998005192736e-06, + "loss": 0.4203, + "step": 14471 + }, + { + "epoch": 1.18, + "grad_norm": 4.104284032081064, + "learning_rate": 6.9425940168438165e-06, + "loss": 0.685, + "step": 14472 + }, + { + "epoch": 1.18, + "grad_norm": 4.091450107283679, + "learning_rate": 6.94219001355798e-06, + "loss": 0.6502, + "step": 14473 + }, + { + "epoch": 1.18, + "grad_norm": 6.388606407007351, + "learning_rate": 6.9417859953383375e-06, + "loss": 0.5572, + "step": 14474 + }, + { + "epoch": 1.18, + "grad_norm": 2.8529118219582585, + "learning_rate": 6.941381962187992e-06, + "loss": 0.4887, + "step": 14475 + }, + { + "epoch": 1.18, + "grad_norm": 2.4264530983589663, + "learning_rate": 6.940977914110052e-06, + "loss": 0.5473, + "step": 14476 + }, + { + "epoch": 1.18, + "grad_norm": 4.095804444139031, + "learning_rate": 6.940573851107622e-06, + "loss": 0.5881, + "step": 14477 + }, + { + "epoch": 1.18, + "grad_norm": 3.0032376852488394, + "learning_rate": 6.940169773183812e-06, + "loss": 0.6145, + "step": 14478 + }, + { + "epoch": 1.18, + "grad_norm": 3.922028243634884, + "learning_rate": 6.939765680341727e-06, + "loss": 0.7281, + "step": 14479 + }, + { + "epoch": 1.18, + "grad_norm": 3.6186835928882677, + "learning_rate": 6.9393615725844755e-06, + "loss": 0.6334, + "step": 14480 + }, + { + "epoch": 1.18, + "grad_norm": 5.694292881787652, + "learning_rate": 6.9389574499151624e-06, + "loss": 0.6004, + "step": 14481 + }, + { + "epoch": 1.18, + "grad_norm": 4.334504571831336, + "learning_rate": 6.938553312336897e-06, + "loss": 0.5724, + "step": 14482 + }, + { + "epoch": 1.18, + "grad_norm": 4.942372914609437, + "learning_rate": 6.9381491598527875e-06, + "loss": 0.6537, + "step": 14483 + }, + { + "epoch": 1.18, + "grad_norm": 2.880060952924213, + "learning_rate": 6.93774499246594e-06, + "loss": 0.6288, + "step": 14484 + }, + { + "epoch": 1.18, + "grad_norm": 3.3184450531676535, + "learning_rate": 6.937340810179462e-06, + "loss": 0.5929, + "step": 14485 + }, + { + "epoch": 1.18, + "grad_norm": 2.2420099026407723, + "learning_rate": 6.936936612996462e-06, + "loss": 0.4999, + "step": 14486 + }, + { + "epoch": 1.18, + "grad_norm": 2.7566032603695034, + "learning_rate": 6.936532400920048e-06, + "loss": 0.5569, + "step": 14487 + }, + { + "epoch": 1.18, + "grad_norm": 5.287939074943457, + "learning_rate": 6.93612817395333e-06, + "loss": 0.8029, + "step": 14488 + }, + { + "epoch": 1.18, + "grad_norm": 2.5402167100741644, + "learning_rate": 6.935723932099411e-06, + "loss": 0.5409, + "step": 14489 + }, + { + "epoch": 1.18, + "grad_norm": 2.634229023218897, + "learning_rate": 6.935319675361404e-06, + "loss": 0.5988, + "step": 14490 + }, + { + "epoch": 1.18, + "grad_norm": 4.228036784112948, + "learning_rate": 6.934915403742415e-06, + "loss": 0.6141, + "step": 14491 + }, + { + "epoch": 1.18, + "grad_norm": 3.263605324718991, + "learning_rate": 6.934511117245554e-06, + "loss": 0.6113, + "step": 14492 + }, + { + "epoch": 1.18, + "grad_norm": 10.540184288257919, + "learning_rate": 6.934106815873928e-06, + "loss": 0.5666, + "step": 14493 + }, + { + "epoch": 1.18, + "grad_norm": 2.4732038090945974, + "learning_rate": 6.933702499630647e-06, + "loss": 0.6033, + "step": 14494 + }, + { + "epoch": 1.18, + "grad_norm": 2.9096236681682486, + "learning_rate": 6.93329816851882e-06, + "loss": 0.4452, + "step": 14495 + }, + { + "epoch": 1.18, + "grad_norm": 2.803658197668262, + "learning_rate": 6.9328938225415556e-06, + "loss": 0.7844, + "step": 14496 + }, + { + "epoch": 1.18, + "grad_norm": 2.501477241764622, + "learning_rate": 6.9324894617019615e-06, + "loss": 0.4965, + "step": 14497 + }, + { + "epoch": 1.18, + "grad_norm": 2.573260797654089, + "learning_rate": 6.932085086003149e-06, + "loss": 0.6283, + "step": 14498 + }, + { + "epoch": 1.18, + "grad_norm": 4.659134702879103, + "learning_rate": 6.931680695448225e-06, + "loss": 0.4654, + "step": 14499 + }, + { + "epoch": 1.18, + "grad_norm": 4.382496520456605, + "learning_rate": 6.931276290040302e-06, + "loss": 0.6358, + "step": 14500 + }, + { + "epoch": 1.18, + "grad_norm": 3.9526001229338603, + "learning_rate": 6.930871869782488e-06, + "loss": 0.5823, + "step": 14501 + }, + { + "epoch": 1.18, + "grad_norm": 3.6188307498633017, + "learning_rate": 6.9304674346778925e-06, + "loss": 0.7168, + "step": 14502 + }, + { + "epoch": 1.18, + "grad_norm": 4.370348800190304, + "learning_rate": 6.930062984729624e-06, + "loss": 0.6438, + "step": 14503 + }, + { + "epoch": 1.18, + "grad_norm": 5.39690235171182, + "learning_rate": 6.929658519940796e-06, + "loss": 0.5861, + "step": 14504 + }, + { + "epoch": 1.18, + "grad_norm": 3.8506907724649904, + "learning_rate": 6.929254040314514e-06, + "loss": 0.6067, + "step": 14505 + }, + { + "epoch": 1.18, + "grad_norm": 3.143680453525122, + "learning_rate": 6.9288495458538915e-06, + "loss": 0.5675, + "step": 14506 + }, + { + "epoch": 1.18, + "grad_norm": 2.2910626808847936, + "learning_rate": 6.9284450365620385e-06, + "loss": 0.4505, + "step": 14507 + }, + { + "epoch": 1.18, + "grad_norm": 3.245542271536684, + "learning_rate": 6.928040512442064e-06, + "loss": 0.501, + "step": 14508 + }, + { + "epoch": 1.18, + "grad_norm": 3.55701894105794, + "learning_rate": 6.927635973497077e-06, + "loss": 0.5896, + "step": 14509 + }, + { + "epoch": 1.18, + "grad_norm": 4.0393767894626444, + "learning_rate": 6.9272314197301925e-06, + "loss": 0.5646, + "step": 14510 + }, + { + "epoch": 1.18, + "grad_norm": 6.149176482069408, + "learning_rate": 6.926826851144518e-06, + "loss": 0.5843, + "step": 14511 + }, + { + "epoch": 1.18, + "grad_norm": 4.152491306049876, + "learning_rate": 6.9264222677431645e-06, + "loss": 0.6703, + "step": 14512 + }, + { + "epoch": 1.18, + "grad_norm": 3.747954856878307, + "learning_rate": 6.926017669529242e-06, + "loss": 0.6766, + "step": 14513 + }, + { + "epoch": 1.18, + "grad_norm": 5.083977912322783, + "learning_rate": 6.925613056505865e-06, + "loss": 0.7093, + "step": 14514 + }, + { + "epoch": 1.18, + "grad_norm": 2.7289031081883914, + "learning_rate": 6.925208428676142e-06, + "loss": 0.5994, + "step": 14515 + }, + { + "epoch": 1.18, + "grad_norm": 4.157304065905774, + "learning_rate": 6.924803786043185e-06, + "loss": 0.692, + "step": 14516 + }, + { + "epoch": 1.18, + "grad_norm": 3.2148154963327156, + "learning_rate": 6.924399128610104e-06, + "loss": 0.6058, + "step": 14517 + }, + { + "epoch": 1.18, + "grad_norm": 4.183329641774825, + "learning_rate": 6.923994456380012e-06, + "loss": 0.7598, + "step": 14518 + }, + { + "epoch": 1.18, + "grad_norm": 3.35829866966315, + "learning_rate": 6.92358976935602e-06, + "loss": 0.5264, + "step": 14519 + }, + { + "epoch": 1.18, + "grad_norm": 6.219441357927212, + "learning_rate": 6.923185067541241e-06, + "loss": 0.5883, + "step": 14520 + }, + { + "epoch": 1.18, + "grad_norm": 2.7878692265898373, + "learning_rate": 6.9227803509387845e-06, + "loss": 0.5065, + "step": 14521 + }, + { + "epoch": 1.18, + "grad_norm": 4.378925828353985, + "learning_rate": 6.922375619551763e-06, + "loss": 0.6651, + "step": 14522 + }, + { + "epoch": 1.18, + "grad_norm": 2.011147862395951, + "learning_rate": 6.921970873383291e-06, + "loss": 0.5467, + "step": 14523 + }, + { + "epoch": 1.18, + "grad_norm": 5.886382233203103, + "learning_rate": 6.921566112436478e-06, + "loss": 0.6541, + "step": 14524 + }, + { + "epoch": 1.18, + "grad_norm": 3.5457697956374825, + "learning_rate": 6.921161336714437e-06, + "loss": 0.5786, + "step": 14525 + }, + { + "epoch": 1.18, + "grad_norm": 13.733004475248478, + "learning_rate": 6.92075654622028e-06, + "loss": 0.4584, + "step": 14526 + }, + { + "epoch": 1.18, + "grad_norm": 2.859111966772742, + "learning_rate": 6.920351740957121e-06, + "loss": 0.5579, + "step": 14527 + }, + { + "epoch": 1.18, + "grad_norm": 3.1564173528651986, + "learning_rate": 6.9199469209280715e-06, + "loss": 0.6417, + "step": 14528 + }, + { + "epoch": 1.18, + "grad_norm": 3.740323572489624, + "learning_rate": 6.9195420861362435e-06, + "loss": 0.5106, + "step": 14529 + }, + { + "epoch": 1.18, + "grad_norm": 4.080594189494386, + "learning_rate": 6.9191372365847495e-06, + "loss": 0.5823, + "step": 14530 + }, + { + "epoch": 1.18, + "grad_norm": 2.3326303848203733, + "learning_rate": 6.918732372276707e-06, + "loss": 0.587, + "step": 14531 + }, + { + "epoch": 1.18, + "grad_norm": 5.443489220170335, + "learning_rate": 6.9183274932152234e-06, + "loss": 0.6017, + "step": 14532 + }, + { + "epoch": 1.18, + "grad_norm": 4.314880484895241, + "learning_rate": 6.917922599403415e-06, + "loss": 0.5064, + "step": 14533 + }, + { + "epoch": 1.18, + "grad_norm": 4.749167624866858, + "learning_rate": 6.917517690844392e-06, + "loss": 0.5379, + "step": 14534 + }, + { + "epoch": 1.18, + "grad_norm": 8.56354884698868, + "learning_rate": 6.917112767541272e-06, + "loss": 0.6246, + "step": 14535 + }, + { + "epoch": 1.18, + "grad_norm": 3.8989006066748715, + "learning_rate": 6.9167078294971665e-06, + "loss": 0.561, + "step": 14536 + }, + { + "epoch": 1.18, + "grad_norm": 3.654886438435428, + "learning_rate": 6.91630287671519e-06, + "loss": 0.4718, + "step": 14537 + }, + { + "epoch": 1.18, + "grad_norm": 2.2925192699155, + "learning_rate": 6.915897909198453e-06, + "loss": 0.3467, + "step": 14538 + }, + { + "epoch": 1.18, + "grad_norm": 3.671775140571182, + "learning_rate": 6.915492926950074e-06, + "loss": 0.5226, + "step": 14539 + }, + { + "epoch": 1.18, + "grad_norm": 2.955106516372029, + "learning_rate": 6.915087929973164e-06, + "loss": 0.5438, + "step": 14540 + }, + { + "epoch": 1.18, + "grad_norm": 5.910237713387555, + "learning_rate": 6.914682918270839e-06, + "loss": 0.6538, + "step": 14541 + }, + { + "epoch": 1.18, + "grad_norm": 6.385512057467495, + "learning_rate": 6.914277891846209e-06, + "loss": 0.6817, + "step": 14542 + }, + { + "epoch": 1.18, + "grad_norm": 3.7691200491610153, + "learning_rate": 6.913872850702393e-06, + "loss": 0.5884, + "step": 14543 + }, + { + "epoch": 1.18, + "grad_norm": 7.262591535784197, + "learning_rate": 6.913467794842505e-06, + "loss": 0.6593, + "step": 14544 + }, + { + "epoch": 1.18, + "grad_norm": 2.3960757507600294, + "learning_rate": 6.913062724269658e-06, + "loss": 0.4691, + "step": 14545 + }, + { + "epoch": 1.18, + "grad_norm": 3.070385853483246, + "learning_rate": 6.912657638986966e-06, + "loss": 0.5396, + "step": 14546 + }, + { + "epoch": 1.18, + "grad_norm": 3.035048924608097, + "learning_rate": 6.912252538997545e-06, + "loss": 0.4264, + "step": 14547 + }, + { + "epoch": 1.18, + "grad_norm": 3.1272489686554916, + "learning_rate": 6.91184742430451e-06, + "loss": 0.4707, + "step": 14548 + }, + { + "epoch": 1.18, + "grad_norm": 2.7703294734446966, + "learning_rate": 6.911442294910975e-06, + "loss": 0.6093, + "step": 14549 + }, + { + "epoch": 1.18, + "grad_norm": 3.1661571059834515, + "learning_rate": 6.911037150820056e-06, + "loss": 0.7229, + "step": 14550 + }, + { + "epoch": 1.18, + "grad_norm": 3.580717046428166, + "learning_rate": 6.9106319920348685e-06, + "loss": 0.5768, + "step": 14551 + }, + { + "epoch": 1.18, + "grad_norm": 5.683861688888747, + "learning_rate": 6.910226818558528e-06, + "loss": 0.59, + "step": 14552 + }, + { + "epoch": 1.18, + "grad_norm": 2.9539888658386264, + "learning_rate": 6.909821630394147e-06, + "loss": 0.6783, + "step": 14553 + }, + { + "epoch": 1.18, + "grad_norm": 2.794105628728191, + "learning_rate": 6.909416427544844e-06, + "loss": 0.7218, + "step": 14554 + }, + { + "epoch": 1.18, + "grad_norm": 4.237422219470898, + "learning_rate": 6.909011210013734e-06, + "loss": 0.6012, + "step": 14555 + }, + { + "epoch": 1.18, + "grad_norm": 5.354421107418542, + "learning_rate": 6.9086059778039336e-06, + "loss": 0.5679, + "step": 14556 + }, + { + "epoch": 1.18, + "grad_norm": 3.1692471391332324, + "learning_rate": 6.908200730918557e-06, + "loss": 0.5319, + "step": 14557 + }, + { + "epoch": 1.18, + "grad_norm": 3.8358600933617786, + "learning_rate": 6.9077954693607206e-06, + "loss": 0.5093, + "step": 14558 + }, + { + "epoch": 1.18, + "grad_norm": 2.6717782282767693, + "learning_rate": 6.907390193133543e-06, + "loss": 0.6958, + "step": 14559 + }, + { + "epoch": 1.18, + "grad_norm": 3.275165219418645, + "learning_rate": 6.906984902240137e-06, + "loss": 0.4726, + "step": 14560 + }, + { + "epoch": 1.18, + "grad_norm": 3.516761491251228, + "learning_rate": 6.90657959668362e-06, + "loss": 0.5884, + "step": 14561 + }, + { + "epoch": 1.18, + "grad_norm": 1.8342664351162046, + "learning_rate": 6.906174276467109e-06, + "loss": 0.4467, + "step": 14562 + }, + { + "epoch": 1.18, + "grad_norm": 2.109137528488764, + "learning_rate": 6.905768941593721e-06, + "loss": 0.5953, + "step": 14563 + }, + { + "epoch": 1.18, + "grad_norm": 2.9290489374269986, + "learning_rate": 6.905363592066572e-06, + "loss": 0.7976, + "step": 14564 + }, + { + "epoch": 1.18, + "grad_norm": 4.672266777362304, + "learning_rate": 6.904958227888777e-06, + "loss": 0.6124, + "step": 14565 + }, + { + "epoch": 1.18, + "grad_norm": 2.8051911993989123, + "learning_rate": 6.9045528490634575e-06, + "loss": 0.5999, + "step": 14566 + }, + { + "epoch": 1.18, + "grad_norm": 8.549356018637834, + "learning_rate": 6.904147455593725e-06, + "loss": 0.6209, + "step": 14567 + }, + { + "epoch": 1.18, + "grad_norm": 2.2780007663246256, + "learning_rate": 6.9037420474827014e-06, + "loss": 0.4011, + "step": 14568 + }, + { + "epoch": 1.18, + "grad_norm": 3.766290942331301, + "learning_rate": 6.903336624733501e-06, + "loss": 0.6025, + "step": 14569 + }, + { + "epoch": 1.18, + "grad_norm": 8.538594306585793, + "learning_rate": 6.902931187349243e-06, + "loss": 0.6406, + "step": 14570 + }, + { + "epoch": 1.18, + "grad_norm": 2.916951018598615, + "learning_rate": 6.9025257353330435e-06, + "loss": 0.6077, + "step": 14571 + }, + { + "epoch": 1.18, + "grad_norm": 3.5753188419283712, + "learning_rate": 6.902120268688021e-06, + "loss": 0.6457, + "step": 14572 + }, + { + "epoch": 1.18, + "grad_norm": 2.5482159041894694, + "learning_rate": 6.9017147874172915e-06, + "loss": 0.5663, + "step": 14573 + }, + { + "epoch": 1.18, + "grad_norm": 6.1718157765408055, + "learning_rate": 6.901309291523976e-06, + "loss": 0.4969, + "step": 14574 + }, + { + "epoch": 1.18, + "grad_norm": 4.241099917640582, + "learning_rate": 6.900903781011188e-06, + "loss": 0.6229, + "step": 14575 + }, + { + "epoch": 1.18, + "grad_norm": 4.00185011230321, + "learning_rate": 6.90049825588205e-06, + "loss": 0.546, + "step": 14576 + }, + { + "epoch": 1.18, + "grad_norm": 3.0743367067680234, + "learning_rate": 6.900092716139678e-06, + "loss": 0.6437, + "step": 14577 + }, + { + "epoch": 1.18, + "grad_norm": 2.4224029343180944, + "learning_rate": 6.899687161787191e-06, + "loss": 0.504, + "step": 14578 + }, + { + "epoch": 1.18, + "grad_norm": 4.43917239539689, + "learning_rate": 6.899281592827705e-06, + "loss": 0.6521, + "step": 14579 + }, + { + "epoch": 1.18, + "grad_norm": 3.2729537294571003, + "learning_rate": 6.898876009264341e-06, + "loss": 0.6006, + "step": 14580 + }, + { + "epoch": 1.18, + "grad_norm": 3.1362197756417944, + "learning_rate": 6.898470411100218e-06, + "loss": 0.6611, + "step": 14581 + }, + { + "epoch": 1.18, + "grad_norm": 2.8966494068382556, + "learning_rate": 6.898064798338453e-06, + "loss": 0.5743, + "step": 14582 + }, + { + "epoch": 1.18, + "grad_norm": 3.1644909838633324, + "learning_rate": 6.8976591709821635e-06, + "loss": 0.5351, + "step": 14583 + }, + { + "epoch": 1.18, + "grad_norm": 5.052919754624432, + "learning_rate": 6.897253529034474e-06, + "loss": 0.5455, + "step": 14584 + }, + { + "epoch": 1.18, + "grad_norm": 3.0956940313485455, + "learning_rate": 6.896847872498498e-06, + "loss": 0.5472, + "step": 14585 + }, + { + "epoch": 1.18, + "grad_norm": 3.0198119014272535, + "learning_rate": 6.8964422013773555e-06, + "loss": 0.699, + "step": 14586 + }, + { + "epoch": 1.18, + "grad_norm": 3.0018378847112825, + "learning_rate": 6.896036515674168e-06, + "loss": 0.546, + "step": 14587 + }, + { + "epoch": 1.18, + "grad_norm": 2.200639339574012, + "learning_rate": 6.895630815392054e-06, + "loss": 0.5628, + "step": 14588 + }, + { + "epoch": 1.18, + "grad_norm": 2.516244836007579, + "learning_rate": 6.895225100534132e-06, + "loss": 0.5125, + "step": 14589 + }, + { + "epoch": 1.18, + "grad_norm": 5.53199277099304, + "learning_rate": 6.894819371103522e-06, + "loss": 0.64, + "step": 14590 + }, + { + "epoch": 1.19, + "grad_norm": 4.096333944260052, + "learning_rate": 6.894413627103345e-06, + "loss": 0.717, + "step": 14591 + }, + { + "epoch": 1.19, + "grad_norm": 2.530081223163244, + "learning_rate": 6.8940078685367205e-06, + "loss": 0.7187, + "step": 14592 + }, + { + "epoch": 1.19, + "grad_norm": 6.313443132328232, + "learning_rate": 6.8936020954067664e-06, + "loss": 0.5712, + "step": 14593 + }, + { + "epoch": 1.19, + "grad_norm": 3.1724676524861954, + "learning_rate": 6.893196307716606e-06, + "loss": 0.4531, + "step": 14594 + }, + { + "epoch": 1.19, + "grad_norm": 3.806889966091346, + "learning_rate": 6.8927905054693546e-06, + "loss": 0.6022, + "step": 14595 + }, + { + "epoch": 1.19, + "grad_norm": 43.44389870361191, + "learning_rate": 6.892384688668138e-06, + "loss": 0.5661, + "step": 14596 + }, + { + "epoch": 1.19, + "grad_norm": 5.180615163495028, + "learning_rate": 6.891978857316073e-06, + "loss": 0.6604, + "step": 14597 + }, + { + "epoch": 1.19, + "grad_norm": 2.8463665847896977, + "learning_rate": 6.891573011416282e-06, + "loss": 0.5829, + "step": 14598 + }, + { + "epoch": 1.19, + "grad_norm": 3.1951499432145654, + "learning_rate": 6.891167150971884e-06, + "loss": 0.4976, + "step": 14599 + }, + { + "epoch": 1.19, + "grad_norm": 3.799688565093332, + "learning_rate": 6.890761275986e-06, + "loss": 0.595, + "step": 14600 + }, + { + "epoch": 1.19, + "grad_norm": 3.8140165146978275, + "learning_rate": 6.890355386461753e-06, + "loss": 0.4989, + "step": 14601 + }, + { + "epoch": 1.19, + "grad_norm": 3.57989470396492, + "learning_rate": 6.8899494824022615e-06, + "loss": 0.5904, + "step": 14602 + }, + { + "epoch": 1.19, + "grad_norm": 5.7862313252525155, + "learning_rate": 6.8895435638106465e-06, + "loss": 0.5105, + "step": 14603 + }, + { + "epoch": 1.19, + "grad_norm": 3.2940086293065507, + "learning_rate": 6.889137630690031e-06, + "loss": 0.6355, + "step": 14604 + }, + { + "epoch": 1.19, + "grad_norm": 3.275668657174012, + "learning_rate": 6.8887316830435354e-06, + "loss": 0.7609, + "step": 14605 + }, + { + "epoch": 1.19, + "grad_norm": 3.4113767264285606, + "learning_rate": 6.888325720874283e-06, + "loss": 0.5245, + "step": 14606 + }, + { + "epoch": 1.19, + "grad_norm": 2.395612270995112, + "learning_rate": 6.8879197441853895e-06, + "loss": 0.5912, + "step": 14607 + }, + { + "epoch": 1.19, + "grad_norm": 21.61806765454407, + "learning_rate": 6.887513752979983e-06, + "loss": 0.6156, + "step": 14608 + }, + { + "epoch": 1.19, + "grad_norm": 2.9313884028735337, + "learning_rate": 6.887107747261182e-06, + "loss": 0.6651, + "step": 14609 + }, + { + "epoch": 1.19, + "grad_norm": 2.6147975922347384, + "learning_rate": 6.886701727032108e-06, + "loss": 0.544, + "step": 14610 + }, + { + "epoch": 1.19, + "grad_norm": 15.394087523217047, + "learning_rate": 6.886295692295884e-06, + "loss": 0.5341, + "step": 14611 + }, + { + "epoch": 1.19, + "grad_norm": 2.983613310106075, + "learning_rate": 6.885889643055633e-06, + "loss": 0.5799, + "step": 14612 + }, + { + "epoch": 1.19, + "grad_norm": 3.631379638501287, + "learning_rate": 6.885483579314476e-06, + "loss": 0.5539, + "step": 14613 + }, + { + "epoch": 1.19, + "grad_norm": 3.1474430734767767, + "learning_rate": 6.885077501075536e-06, + "loss": 0.654, + "step": 14614 + }, + { + "epoch": 1.19, + "grad_norm": 2.9899482808207094, + "learning_rate": 6.884671408341933e-06, + "loss": 0.4174, + "step": 14615 + }, + { + "epoch": 1.19, + "grad_norm": 3.4593852490438692, + "learning_rate": 6.884265301116793e-06, + "loss": 0.6522, + "step": 14616 + }, + { + "epoch": 1.19, + "grad_norm": 2.6917839437555275, + "learning_rate": 6.8838591794032365e-06, + "loss": 0.5679, + "step": 14617 + }, + { + "epoch": 1.19, + "grad_norm": 4.8672016369957225, + "learning_rate": 6.883453043204387e-06, + "loss": 0.4636, + "step": 14618 + }, + { + "epoch": 1.19, + "grad_norm": 7.735843492066565, + "learning_rate": 6.883046892523366e-06, + "loss": 0.5884, + "step": 14619 + }, + { + "epoch": 1.19, + "grad_norm": 3.2621953416171534, + "learning_rate": 6.8826407273632975e-06, + "loss": 0.5953, + "step": 14620 + }, + { + "epoch": 1.19, + "grad_norm": 2.4575172355996604, + "learning_rate": 6.882234547727306e-06, + "loss": 0.6319, + "step": 14621 + }, + { + "epoch": 1.19, + "grad_norm": 2.7269738607819725, + "learning_rate": 6.881828353618512e-06, + "loss": 0.5386, + "step": 14622 + }, + { + "epoch": 1.19, + "grad_norm": 3.670029071801196, + "learning_rate": 6.881422145040041e-06, + "loss": 0.7119, + "step": 14623 + }, + { + "epoch": 1.19, + "grad_norm": 1.9789427513807505, + "learning_rate": 6.881015921995013e-06, + "loss": 0.5229, + "step": 14624 + }, + { + "epoch": 1.19, + "grad_norm": 4.8383702980540075, + "learning_rate": 6.880609684486557e-06, + "loss": 0.5584, + "step": 14625 + }, + { + "epoch": 1.19, + "grad_norm": 3.171112849874239, + "learning_rate": 6.8802034325177925e-06, + "loss": 0.4344, + "step": 14626 + }, + { + "epoch": 1.19, + "grad_norm": 2.7888561828287166, + "learning_rate": 6.879797166091844e-06, + "loss": 0.6604, + "step": 14627 + }, + { + "epoch": 1.19, + "grad_norm": 2.3024484433218158, + "learning_rate": 6.879390885211835e-06, + "loss": 0.5356, + "step": 14628 + }, + { + "epoch": 1.19, + "grad_norm": 3.6442738281565616, + "learning_rate": 6.878984589880892e-06, + "loss": 0.707, + "step": 14629 + }, + { + "epoch": 1.19, + "grad_norm": 2.363310974206081, + "learning_rate": 6.878578280102136e-06, + "loss": 0.6052, + "step": 14630 + }, + { + "epoch": 1.19, + "grad_norm": 5.932755721244748, + "learning_rate": 6.878171955878693e-06, + "loss": 0.5156, + "step": 14631 + }, + { + "epoch": 1.19, + "grad_norm": 5.737313379889864, + "learning_rate": 6.877765617213685e-06, + "loss": 0.5764, + "step": 14632 + }, + { + "epoch": 1.19, + "grad_norm": 3.077036645983875, + "learning_rate": 6.8773592641102405e-06, + "loss": 0.549, + "step": 14633 + }, + { + "epoch": 1.19, + "grad_norm": 3.356979668110325, + "learning_rate": 6.87695289657148e-06, + "loss": 0.7877, + "step": 14634 + }, + { + "epoch": 1.19, + "grad_norm": 3.6310622040370863, + "learning_rate": 6.87654651460053e-06, + "loss": 0.7438, + "step": 14635 + }, + { + "epoch": 1.19, + "grad_norm": 10.905019053630436, + "learning_rate": 6.876140118200515e-06, + "loss": 0.6713, + "step": 14636 + }, + { + "epoch": 1.19, + "grad_norm": 4.155163479973959, + "learning_rate": 6.87573370737456e-06, + "loss": 0.494, + "step": 14637 + }, + { + "epoch": 1.19, + "grad_norm": 4.065749953678626, + "learning_rate": 6.87532728212579e-06, + "loss": 0.5899, + "step": 14638 + }, + { + "epoch": 1.19, + "grad_norm": 3.2493713143651646, + "learning_rate": 6.874920842457329e-06, + "loss": 0.5877, + "step": 14639 + }, + { + "epoch": 1.19, + "grad_norm": 12.211560184486986, + "learning_rate": 6.874514388372303e-06, + "loss": 0.4738, + "step": 14640 + }, + { + "epoch": 1.19, + "grad_norm": 3.1468040066912026, + "learning_rate": 6.874107919873838e-06, + "loss": 0.5501, + "step": 14641 + }, + { + "epoch": 1.19, + "grad_norm": 6.536938409475869, + "learning_rate": 6.873701436965059e-06, + "loss": 0.5867, + "step": 14642 + }, + { + "epoch": 1.19, + "grad_norm": 5.031300298201698, + "learning_rate": 6.87329493964909e-06, + "loss": 0.4403, + "step": 14643 + }, + { + "epoch": 1.19, + "grad_norm": 5.136219984350847, + "learning_rate": 6.8728884279290574e-06, + "loss": 0.6777, + "step": 14644 + }, + { + "epoch": 1.19, + "grad_norm": 3.959432670409234, + "learning_rate": 6.872481901808089e-06, + "loss": 0.5103, + "step": 14645 + }, + { + "epoch": 1.19, + "grad_norm": 3.316732970983276, + "learning_rate": 6.872075361289309e-06, + "loss": 0.6007, + "step": 14646 + }, + { + "epoch": 1.19, + "grad_norm": 8.495909661677377, + "learning_rate": 6.871668806375843e-06, + "loss": 0.5693, + "step": 14647 + }, + { + "epoch": 1.19, + "grad_norm": 2.9687044358207695, + "learning_rate": 6.871262237070816e-06, + "loss": 0.5396, + "step": 14648 + }, + { + "epoch": 1.19, + "grad_norm": 3.119150259066366, + "learning_rate": 6.870855653377357e-06, + "loss": 0.7173, + "step": 14649 + }, + { + "epoch": 1.19, + "grad_norm": 16.547911475538278, + "learning_rate": 6.87044905529859e-06, + "loss": 0.6296, + "step": 14650 + }, + { + "epoch": 1.19, + "grad_norm": 2.6583243198080657, + "learning_rate": 6.8700424428376435e-06, + "loss": 0.5883, + "step": 14651 + }, + { + "epoch": 1.19, + "grad_norm": 3.5187193300065505, + "learning_rate": 6.869635815997642e-06, + "loss": 0.6531, + "step": 14652 + }, + { + "epoch": 1.19, + "grad_norm": 4.831538175510151, + "learning_rate": 6.869229174781713e-06, + "loss": 0.5324, + "step": 14653 + }, + { + "epoch": 1.19, + "grad_norm": 7.289556028660734, + "learning_rate": 6.868822519192984e-06, + "loss": 0.5805, + "step": 14654 + }, + { + "epoch": 1.19, + "grad_norm": 7.620531642927985, + "learning_rate": 6.86841584923458e-06, + "loss": 0.4925, + "step": 14655 + }, + { + "epoch": 1.19, + "grad_norm": 4.198086297676359, + "learning_rate": 6.868009164909628e-06, + "loss": 0.6057, + "step": 14656 + }, + { + "epoch": 1.19, + "grad_norm": 10.175524707631405, + "learning_rate": 6.867602466221257e-06, + "loss": 0.7283, + "step": 14657 + }, + { + "epoch": 1.19, + "grad_norm": 3.14364453428349, + "learning_rate": 6.867195753172594e-06, + "loss": 0.7116, + "step": 14658 + }, + { + "epoch": 1.19, + "grad_norm": 4.624989917239108, + "learning_rate": 6.866789025766764e-06, + "loss": 0.6377, + "step": 14659 + }, + { + "epoch": 1.19, + "grad_norm": 3.7800738865376347, + "learning_rate": 6.866382284006896e-06, + "loss": 0.609, + "step": 14660 + }, + { + "epoch": 1.19, + "grad_norm": 2.492699277675529, + "learning_rate": 6.865975527896118e-06, + "loss": 0.5471, + "step": 14661 + }, + { + "epoch": 1.19, + "grad_norm": 3.4894303588259037, + "learning_rate": 6.865568757437558e-06, + "loss": 0.5765, + "step": 14662 + }, + { + "epoch": 1.19, + "grad_norm": 2.9572408667518824, + "learning_rate": 6.865161972634341e-06, + "loss": 0.6899, + "step": 14663 + }, + { + "epoch": 1.19, + "grad_norm": 8.698606911187769, + "learning_rate": 6.864755173489597e-06, + "loss": 0.5742, + "step": 14664 + }, + { + "epoch": 1.19, + "grad_norm": 3.5517550221427756, + "learning_rate": 6.864348360006453e-06, + "loss": 0.6839, + "step": 14665 + }, + { + "epoch": 1.19, + "grad_norm": 5.595848727162736, + "learning_rate": 6.863941532188039e-06, + "loss": 0.5442, + "step": 14666 + }, + { + "epoch": 1.19, + "grad_norm": 6.23529839519157, + "learning_rate": 6.86353469003748e-06, + "loss": 0.5915, + "step": 14667 + }, + { + "epoch": 1.19, + "grad_norm": 4.24340321170888, + "learning_rate": 6.863127833557905e-06, + "loss": 0.5916, + "step": 14668 + }, + { + "epoch": 1.19, + "grad_norm": 9.81902068365687, + "learning_rate": 6.862720962752445e-06, + "loss": 0.6966, + "step": 14669 + }, + { + "epoch": 1.19, + "grad_norm": 5.2536189729295355, + "learning_rate": 6.862314077624227e-06, + "loss": 0.6447, + "step": 14670 + }, + { + "epoch": 1.19, + "grad_norm": 2.76684046409634, + "learning_rate": 6.861907178176379e-06, + "loss": 0.6612, + "step": 14671 + }, + { + "epoch": 1.19, + "grad_norm": 3.1660628896964638, + "learning_rate": 6.86150026441203e-06, + "loss": 0.5876, + "step": 14672 + }, + { + "epoch": 1.19, + "grad_norm": 3.886385734107923, + "learning_rate": 6.861093336334309e-06, + "loss": 0.5254, + "step": 14673 + }, + { + "epoch": 1.19, + "grad_norm": 4.20366618065212, + "learning_rate": 6.860686393946345e-06, + "loss": 0.4607, + "step": 14674 + }, + { + "epoch": 1.19, + "grad_norm": 3.552136677567691, + "learning_rate": 6.860279437251267e-06, + "loss": 0.6166, + "step": 14675 + }, + { + "epoch": 1.19, + "grad_norm": 26.150441346228664, + "learning_rate": 6.859872466252204e-06, + "loss": 0.6809, + "step": 14676 + }, + { + "epoch": 1.19, + "grad_norm": 3.392126872146257, + "learning_rate": 6.8594654809522855e-06, + "loss": 0.651, + "step": 14677 + }, + { + "epoch": 1.19, + "grad_norm": 3.6500177388145976, + "learning_rate": 6.8590584813546414e-06, + "loss": 0.5847, + "step": 14678 + }, + { + "epoch": 1.19, + "grad_norm": 2.7755441594658836, + "learning_rate": 6.858651467462399e-06, + "loss": 0.4924, + "step": 14679 + }, + { + "epoch": 1.19, + "grad_norm": 4.6806015103090335, + "learning_rate": 6.85824443927869e-06, + "loss": 0.6493, + "step": 14680 + }, + { + "epoch": 1.19, + "grad_norm": 2.8555758335664594, + "learning_rate": 6.857837396806643e-06, + "loss": 0.5572, + "step": 14681 + }, + { + "epoch": 1.19, + "grad_norm": 2.8441032303112315, + "learning_rate": 6.857430340049391e-06, + "loss": 0.6873, + "step": 14682 + }, + { + "epoch": 1.19, + "grad_norm": 5.360166573160086, + "learning_rate": 6.857023269010058e-06, + "loss": 0.6478, + "step": 14683 + }, + { + "epoch": 1.19, + "grad_norm": 3.710068990816284, + "learning_rate": 6.856616183691777e-06, + "loss": 0.5423, + "step": 14684 + }, + { + "epoch": 1.19, + "grad_norm": 10.778100744832885, + "learning_rate": 6.8562090840976816e-06, + "loss": 0.6969, + "step": 14685 + }, + { + "epoch": 1.19, + "grad_norm": 3.8970660696268014, + "learning_rate": 6.855801970230898e-06, + "loss": 0.616, + "step": 14686 + }, + { + "epoch": 1.19, + "grad_norm": 5.54902674493392, + "learning_rate": 6.855394842094556e-06, + "loss": 0.4647, + "step": 14687 + }, + { + "epoch": 1.19, + "grad_norm": 3.963603016961619, + "learning_rate": 6.854987699691788e-06, + "loss": 0.8043, + "step": 14688 + }, + { + "epoch": 1.19, + "grad_norm": 4.487067875305332, + "learning_rate": 6.854580543025724e-06, + "loss": 0.5265, + "step": 14689 + }, + { + "epoch": 1.19, + "grad_norm": 4.999512694219298, + "learning_rate": 6.854173372099495e-06, + "loss": 0.7449, + "step": 14690 + }, + { + "epoch": 1.19, + "grad_norm": 3.8435834618627327, + "learning_rate": 6.853766186916232e-06, + "loss": 0.479, + "step": 14691 + }, + { + "epoch": 1.19, + "grad_norm": 2.8430511226124664, + "learning_rate": 6.853358987479065e-06, + "loss": 0.578, + "step": 14692 + }, + { + "epoch": 1.19, + "grad_norm": 3.7541736731806092, + "learning_rate": 6.852951773791125e-06, + "loss": 0.5739, + "step": 14693 + }, + { + "epoch": 1.19, + "grad_norm": 2.153530489223778, + "learning_rate": 6.852544545855545e-06, + "loss": 0.5666, + "step": 14694 + }, + { + "epoch": 1.19, + "grad_norm": 2.5408896341506653, + "learning_rate": 6.852137303675455e-06, + "loss": 0.4928, + "step": 14695 + }, + { + "epoch": 1.19, + "grad_norm": 2.8974024862694283, + "learning_rate": 6.851730047253985e-06, + "loss": 0.5409, + "step": 14696 + }, + { + "epoch": 1.19, + "grad_norm": 2.966070322427666, + "learning_rate": 6.851322776594268e-06, + "loss": 0.6679, + "step": 14697 + }, + { + "epoch": 1.19, + "grad_norm": 5.857088659538648, + "learning_rate": 6.850915491699436e-06, + "loss": 0.5394, + "step": 14698 + }, + { + "epoch": 1.19, + "grad_norm": 3.763808319096023, + "learning_rate": 6.8505081925726205e-06, + "loss": 0.497, + "step": 14699 + }, + { + "epoch": 1.19, + "grad_norm": 4.023071609799391, + "learning_rate": 6.85010087921695e-06, + "loss": 0.4907, + "step": 14700 + }, + { + "epoch": 1.19, + "grad_norm": 4.973886727329223, + "learning_rate": 6.849693551635561e-06, + "loss": 0.6239, + "step": 14701 + }, + { + "epoch": 1.19, + "grad_norm": 7.92839766650643, + "learning_rate": 6.849286209831585e-06, + "loss": 0.6243, + "step": 14702 + }, + { + "epoch": 1.19, + "grad_norm": 2.4003191114555795, + "learning_rate": 6.848878853808151e-06, + "loss": 0.6531, + "step": 14703 + }, + { + "epoch": 1.19, + "grad_norm": 2.257789344043934, + "learning_rate": 6.848471483568393e-06, + "loss": 0.6302, + "step": 14704 + }, + { + "epoch": 1.19, + "grad_norm": 2.3079246801389384, + "learning_rate": 6.848064099115444e-06, + "loss": 0.5215, + "step": 14705 + }, + { + "epoch": 1.19, + "grad_norm": 3.182218967420018, + "learning_rate": 6.847656700452436e-06, + "loss": 0.5505, + "step": 14706 + }, + { + "epoch": 1.19, + "grad_norm": 4.305800809092512, + "learning_rate": 6.8472492875825e-06, + "loss": 0.4313, + "step": 14707 + }, + { + "epoch": 1.19, + "grad_norm": 5.908891350014604, + "learning_rate": 6.84684186050877e-06, + "loss": 0.5486, + "step": 14708 + }, + { + "epoch": 1.19, + "grad_norm": 4.760643988354593, + "learning_rate": 6.84643441923438e-06, + "loss": 0.4156, + "step": 14709 + }, + { + "epoch": 1.19, + "grad_norm": 4.0614172785634945, + "learning_rate": 6.846026963762461e-06, + "loss": 0.5884, + "step": 14710 + }, + { + "epoch": 1.19, + "grad_norm": 2.880233172958939, + "learning_rate": 6.8456194940961475e-06, + "loss": 0.6842, + "step": 14711 + }, + { + "epoch": 1.19, + "grad_norm": 2.2193720119458082, + "learning_rate": 6.845212010238571e-06, + "loss": 0.5716, + "step": 14712 + }, + { + "epoch": 1.19, + "grad_norm": 3.510072669291895, + "learning_rate": 6.844804512192864e-06, + "loss": 0.5126, + "step": 14713 + }, + { + "epoch": 1.2, + "grad_norm": 4.689280612629077, + "learning_rate": 6.844396999962164e-06, + "loss": 0.5171, + "step": 14714 + }, + { + "epoch": 1.2, + "grad_norm": 3.213174798211879, + "learning_rate": 6.8439894735496e-06, + "loss": 0.7135, + "step": 14715 + }, + { + "epoch": 1.2, + "grad_norm": 3.2655173950171528, + "learning_rate": 6.843581932958308e-06, + "loss": 0.5901, + "step": 14716 + }, + { + "epoch": 1.2, + "grad_norm": 2.4276917706838317, + "learning_rate": 6.843174378191419e-06, + "loss": 0.4364, + "step": 14717 + }, + { + "epoch": 1.2, + "grad_norm": 3.6318836368014327, + "learning_rate": 6.84276680925207e-06, + "loss": 0.5311, + "step": 14718 + }, + { + "epoch": 1.2, + "grad_norm": 5.136452195101276, + "learning_rate": 6.842359226143394e-06, + "loss": 0.499, + "step": 14719 + }, + { + "epoch": 1.2, + "grad_norm": 3.583300628326307, + "learning_rate": 6.841951628868525e-06, + "loss": 0.6088, + "step": 14720 + }, + { + "epoch": 1.2, + "grad_norm": 3.7510241148183234, + "learning_rate": 6.841544017430595e-06, + "loss": 0.6, + "step": 14721 + }, + { + "epoch": 1.2, + "grad_norm": 2.197192189128557, + "learning_rate": 6.84113639183274e-06, + "loss": 0.4996, + "step": 14722 + }, + { + "epoch": 1.2, + "grad_norm": 4.198562020468372, + "learning_rate": 6.8407287520780944e-06, + "loss": 0.6003, + "step": 14723 + }, + { + "epoch": 1.2, + "grad_norm": 4.179014914255244, + "learning_rate": 6.840321098169791e-06, + "loss": 0.4895, + "step": 14724 + }, + { + "epoch": 1.2, + "grad_norm": 3.1336196797618165, + "learning_rate": 6.839913430110967e-06, + "loss": 0.5997, + "step": 14725 + }, + { + "epoch": 1.2, + "grad_norm": 2.8441183640818877, + "learning_rate": 6.839505747904754e-06, + "loss": 0.5903, + "step": 14726 + }, + { + "epoch": 1.2, + "grad_norm": 4.415429763903203, + "learning_rate": 6.83909805155429e-06, + "loss": 0.5687, + "step": 14727 + }, + { + "epoch": 1.2, + "grad_norm": 3.5673190102561665, + "learning_rate": 6.838690341062708e-06, + "loss": 0.5193, + "step": 14728 + }, + { + "epoch": 1.2, + "grad_norm": 8.364522269054966, + "learning_rate": 6.838282616433143e-06, + "loss": 0.5711, + "step": 14729 + }, + { + "epoch": 1.2, + "grad_norm": 4.931869090260097, + "learning_rate": 6.8378748776687296e-06, + "loss": 0.6281, + "step": 14730 + }, + { + "epoch": 1.2, + "grad_norm": 4.377437607560901, + "learning_rate": 6.837467124772604e-06, + "loss": 0.4964, + "step": 14731 + }, + { + "epoch": 1.2, + "grad_norm": 4.240034619014098, + "learning_rate": 6.8370593577479004e-06, + "loss": 0.7647, + "step": 14732 + }, + { + "epoch": 1.2, + "grad_norm": 3.3790234451697008, + "learning_rate": 6.836651576597756e-06, + "loss": 0.5076, + "step": 14733 + }, + { + "epoch": 1.2, + "grad_norm": 2.0258166149763386, + "learning_rate": 6.836243781325303e-06, + "loss": 0.5449, + "step": 14734 + }, + { + "epoch": 1.2, + "grad_norm": 1.8903200895884404, + "learning_rate": 6.835835971933681e-06, + "loss": 0.5623, + "step": 14735 + }, + { + "epoch": 1.2, + "grad_norm": 6.486703233589475, + "learning_rate": 6.8354281484260235e-06, + "loss": 0.669, + "step": 14736 + }, + { + "epoch": 1.2, + "grad_norm": 7.273531651041171, + "learning_rate": 6.835020310805467e-06, + "loss": 0.5097, + "step": 14737 + }, + { + "epoch": 1.2, + "grad_norm": 3.427515253578516, + "learning_rate": 6.834612459075145e-06, + "loss": 0.6121, + "step": 14738 + }, + { + "epoch": 1.2, + "grad_norm": 3.091071309689994, + "learning_rate": 6.8342045932381964e-06, + "loss": 0.5982, + "step": 14739 + }, + { + "epoch": 1.2, + "grad_norm": 3.2711963059862152, + "learning_rate": 6.8337967132977574e-06, + "loss": 0.5969, + "step": 14740 + }, + { + "epoch": 1.2, + "grad_norm": 4.611297306190468, + "learning_rate": 6.833388819256963e-06, + "loss": 0.6958, + "step": 14741 + }, + { + "epoch": 1.2, + "grad_norm": 4.225762848151504, + "learning_rate": 6.832980911118949e-06, + "loss": 0.4497, + "step": 14742 + }, + { + "epoch": 1.2, + "grad_norm": 3.277378577434065, + "learning_rate": 6.832572988886854e-06, + "loss": 0.6368, + "step": 14743 + }, + { + "epoch": 1.2, + "grad_norm": 3.594241511770091, + "learning_rate": 6.832165052563814e-06, + "loss": 0.6818, + "step": 14744 + }, + { + "epoch": 1.2, + "grad_norm": 2.7157025944403164, + "learning_rate": 6.831757102152964e-06, + "loss": 0.4068, + "step": 14745 + }, + { + "epoch": 1.2, + "grad_norm": 2.927508483529524, + "learning_rate": 6.8313491376574415e-06, + "loss": 0.5261, + "step": 14746 + }, + { + "epoch": 1.2, + "grad_norm": 3.2553186828486167, + "learning_rate": 6.830941159080384e-06, + "loss": 0.6132, + "step": 14747 + }, + { + "epoch": 1.2, + "grad_norm": 2.61550268781451, + "learning_rate": 6.830533166424929e-06, + "loss": 0.472, + "step": 14748 + }, + { + "epoch": 1.2, + "grad_norm": 2.861419432081579, + "learning_rate": 6.830125159694213e-06, + "loss": 0.6692, + "step": 14749 + }, + { + "epoch": 1.2, + "grad_norm": 3.5088264068590456, + "learning_rate": 6.829717138891372e-06, + "loss": 0.5583, + "step": 14750 + }, + { + "epoch": 1.2, + "grad_norm": 2.398749094541213, + "learning_rate": 6.829309104019544e-06, + "loss": 0.5727, + "step": 14751 + }, + { + "epoch": 1.2, + "grad_norm": 2.567437367786468, + "learning_rate": 6.828901055081869e-06, + "loss": 0.5891, + "step": 14752 + }, + { + "epoch": 1.2, + "grad_norm": 2.6729759742067, + "learning_rate": 6.828492992081481e-06, + "loss": 0.6758, + "step": 14753 + }, + { + "epoch": 1.2, + "grad_norm": 3.3962796244401567, + "learning_rate": 6.82808491502152e-06, + "loss": 0.5274, + "step": 14754 + }, + { + "epoch": 1.2, + "grad_norm": 2.664077729817176, + "learning_rate": 6.827676823905123e-06, + "loss": 0.5926, + "step": 14755 + }, + { + "epoch": 1.2, + "grad_norm": 2.759870349512828, + "learning_rate": 6.827268718735427e-06, + "loss": 0.5079, + "step": 14756 + }, + { + "epoch": 1.2, + "grad_norm": 3.548071549483714, + "learning_rate": 6.826860599515571e-06, + "loss": 0.6991, + "step": 14757 + }, + { + "epoch": 1.2, + "grad_norm": 3.0197632775721823, + "learning_rate": 6.826452466248692e-06, + "loss": 0.5358, + "step": 14758 + }, + { + "epoch": 1.2, + "grad_norm": 5.540378939381762, + "learning_rate": 6.82604431893793e-06, + "loss": 0.5853, + "step": 14759 + }, + { + "epoch": 1.2, + "grad_norm": 3.1705217638497794, + "learning_rate": 6.825636157586423e-06, + "loss": 0.5612, + "step": 14760 + }, + { + "epoch": 1.2, + "grad_norm": 22.997665966305444, + "learning_rate": 6.825227982197309e-06, + "loss": 0.5227, + "step": 14761 + }, + { + "epoch": 1.2, + "grad_norm": 3.4443091020898784, + "learning_rate": 6.824819792773725e-06, + "loss": 0.5699, + "step": 14762 + }, + { + "epoch": 1.2, + "grad_norm": 4.253305728855996, + "learning_rate": 6.824411589318811e-06, + "loss": 0.5869, + "step": 14763 + }, + { + "epoch": 1.2, + "grad_norm": 2.8501912031070233, + "learning_rate": 6.8240033718357054e-06, + "loss": 0.6259, + "step": 14764 + }, + { + "epoch": 1.2, + "grad_norm": 6.046963578283579, + "learning_rate": 6.823595140327549e-06, + "loss": 0.5237, + "step": 14765 + }, + { + "epoch": 1.2, + "grad_norm": 2.4391276206912518, + "learning_rate": 6.8231868947974776e-06, + "loss": 0.4432, + "step": 14766 + }, + { + "epoch": 1.2, + "grad_norm": 3.3783655170824334, + "learning_rate": 6.822778635248633e-06, + "loss": 0.4853, + "step": 14767 + }, + { + "epoch": 1.2, + "grad_norm": 2.4873335067755398, + "learning_rate": 6.8223703616841515e-06, + "loss": 0.5551, + "step": 14768 + }, + { + "epoch": 1.2, + "grad_norm": 4.951816874594107, + "learning_rate": 6.8219620741071754e-06, + "loss": 0.6278, + "step": 14769 + }, + { + "epoch": 1.2, + "grad_norm": 6.575744138016724, + "learning_rate": 6.821553772520841e-06, + "loss": 0.6479, + "step": 14770 + }, + { + "epoch": 1.2, + "grad_norm": 3.884841464862106, + "learning_rate": 6.821145456928291e-06, + "loss": 0.4425, + "step": 14771 + }, + { + "epoch": 1.2, + "grad_norm": 2.8873656084345036, + "learning_rate": 6.820737127332664e-06, + "loss": 0.6389, + "step": 14772 + }, + { + "epoch": 1.2, + "grad_norm": 2.7144470153943776, + "learning_rate": 6.820328783737098e-06, + "loss": 0.6162, + "step": 14773 + }, + { + "epoch": 1.2, + "grad_norm": 7.050838496908331, + "learning_rate": 6.819920426144734e-06, + "loss": 0.6888, + "step": 14774 + }, + { + "epoch": 1.2, + "grad_norm": 2.7758330680229695, + "learning_rate": 6.819512054558713e-06, + "loss": 0.4272, + "step": 14775 + }, + { + "epoch": 1.2, + "grad_norm": 3.7347789087962995, + "learning_rate": 6.8191036689821735e-06, + "loss": 0.8185, + "step": 14776 + }, + { + "epoch": 1.2, + "grad_norm": 5.91985486004289, + "learning_rate": 6.8186952694182565e-06, + "loss": 0.5799, + "step": 14777 + }, + { + "epoch": 1.2, + "grad_norm": 3.5697304929618867, + "learning_rate": 6.8182868558701e-06, + "loss": 0.5133, + "step": 14778 + }, + { + "epoch": 1.2, + "grad_norm": 3.6320166054387535, + "learning_rate": 6.817878428340847e-06, + "loss": 0.7047, + "step": 14779 + }, + { + "epoch": 1.2, + "grad_norm": 2.7153134173019517, + "learning_rate": 6.817469986833639e-06, + "loss": 0.5043, + "step": 14780 + }, + { + "epoch": 1.2, + "grad_norm": 3.590402039574638, + "learning_rate": 6.817061531351614e-06, + "loss": 0.5539, + "step": 14781 + }, + { + "epoch": 1.2, + "grad_norm": 2.8156278741555076, + "learning_rate": 6.816653061897912e-06, + "loss": 0.5066, + "step": 14782 + }, + { + "epoch": 1.2, + "grad_norm": 2.905298217699814, + "learning_rate": 6.816244578475677e-06, + "loss": 0.6337, + "step": 14783 + }, + { + "epoch": 1.2, + "grad_norm": 23.51843809477399, + "learning_rate": 6.815836081088047e-06, + "loss": 0.558, + "step": 14784 + }, + { + "epoch": 1.2, + "grad_norm": 2.6620907079667697, + "learning_rate": 6.815427569738164e-06, + "loss": 0.5499, + "step": 14785 + }, + { + "epoch": 1.2, + "grad_norm": 4.295796771178682, + "learning_rate": 6.81501904442917e-06, + "loss": 0.645, + "step": 14786 + }, + { + "epoch": 1.2, + "grad_norm": 2.7262879754511222, + "learning_rate": 6.814610505164205e-06, + "loss": 0.5336, + "step": 14787 + }, + { + "epoch": 1.2, + "grad_norm": 2.254840056023091, + "learning_rate": 6.814201951946412e-06, + "loss": 0.6156, + "step": 14788 + }, + { + "epoch": 1.2, + "grad_norm": 2.5390871360390572, + "learning_rate": 6.81379338477893e-06, + "loss": 0.516, + "step": 14789 + }, + { + "epoch": 1.2, + "grad_norm": 3.746597547876837, + "learning_rate": 6.813384803664902e-06, + "loss": 0.4872, + "step": 14790 + }, + { + "epoch": 1.2, + "grad_norm": 6.5072497101828395, + "learning_rate": 6.812976208607469e-06, + "loss": 0.579, + "step": 14791 + }, + { + "epoch": 1.2, + "grad_norm": 5.057975399305155, + "learning_rate": 6.812567599609774e-06, + "loss": 0.68, + "step": 14792 + }, + { + "epoch": 1.2, + "grad_norm": 6.535378039003077, + "learning_rate": 6.812158976674958e-06, + "loss": 0.6115, + "step": 14793 + }, + { + "epoch": 1.2, + "grad_norm": 3.2654791595422643, + "learning_rate": 6.811750339806161e-06, + "loss": 0.5274, + "step": 14794 + }, + { + "epoch": 1.2, + "grad_norm": 2.1380247791910567, + "learning_rate": 6.81134168900653e-06, + "loss": 0.5939, + "step": 14795 + }, + { + "epoch": 1.2, + "grad_norm": 2.944178700637409, + "learning_rate": 6.810933024279203e-06, + "loss": 0.5175, + "step": 14796 + }, + { + "epoch": 1.2, + "grad_norm": 2.7438478566940256, + "learning_rate": 6.810524345627323e-06, + "loss": 0.5664, + "step": 14797 + }, + { + "epoch": 1.2, + "grad_norm": 2.6719373051241497, + "learning_rate": 6.810115653054033e-06, + "loss": 0.48, + "step": 14798 + }, + { + "epoch": 1.2, + "grad_norm": 3.3390754146364108, + "learning_rate": 6.809706946562475e-06, + "loss": 0.5867, + "step": 14799 + }, + { + "epoch": 1.2, + "grad_norm": 2.6337600865271216, + "learning_rate": 6.809298226155794e-06, + "loss": 0.5322, + "step": 14800 + }, + { + "epoch": 1.2, + "grad_norm": 2.182146828862285, + "learning_rate": 6.80888949183713e-06, + "loss": 0.5538, + "step": 14801 + }, + { + "epoch": 1.2, + "grad_norm": 2.455566695395919, + "learning_rate": 6.808480743609626e-06, + "loss": 0.6108, + "step": 14802 + }, + { + "epoch": 1.2, + "grad_norm": 5.798309466151922, + "learning_rate": 6.8080719814764255e-06, + "loss": 0.455, + "step": 14803 + }, + { + "epoch": 1.2, + "grad_norm": 3.6462677354672675, + "learning_rate": 6.807663205440671e-06, + "loss": 0.522, + "step": 14804 + }, + { + "epoch": 1.2, + "grad_norm": 5.235018800439813, + "learning_rate": 6.807254415505506e-06, + "loss": 0.6477, + "step": 14805 + }, + { + "epoch": 1.2, + "grad_norm": 4.717549413126731, + "learning_rate": 6.806845611674076e-06, + "loss": 0.4998, + "step": 14806 + }, + { + "epoch": 1.2, + "grad_norm": 3.260368951521907, + "learning_rate": 6.80643679394952e-06, + "loss": 0.5702, + "step": 14807 + }, + { + "epoch": 1.2, + "grad_norm": 2.931305929030512, + "learning_rate": 6.806027962334985e-06, + "loss": 0.5549, + "step": 14808 + }, + { + "epoch": 1.2, + "grad_norm": 4.372973972108359, + "learning_rate": 6.8056191168336126e-06, + "loss": 0.726, + "step": 14809 + }, + { + "epoch": 1.2, + "grad_norm": 12.414451429256461, + "learning_rate": 6.805210257448549e-06, + "loss": 0.5219, + "step": 14810 + }, + { + "epoch": 1.2, + "grad_norm": 3.0991981576251546, + "learning_rate": 6.804801384182933e-06, + "loss": 0.7568, + "step": 14811 + }, + { + "epoch": 1.2, + "grad_norm": 3.6387093704475513, + "learning_rate": 6.8043924970399145e-06, + "loss": 0.6711, + "step": 14812 + }, + { + "epoch": 1.2, + "grad_norm": 3.209127374784164, + "learning_rate": 6.803983596022634e-06, + "loss": 0.4246, + "step": 14813 + }, + { + "epoch": 1.2, + "grad_norm": 5.262968533541719, + "learning_rate": 6.8035746811342364e-06, + "loss": 0.7182, + "step": 14814 + }, + { + "epoch": 1.2, + "grad_norm": 6.2062956872002255, + "learning_rate": 6.803165752377864e-06, + "loss": 0.5052, + "step": 14815 + }, + { + "epoch": 1.2, + "grad_norm": 3.779841709785704, + "learning_rate": 6.8027568097566645e-06, + "loss": 0.5955, + "step": 14816 + }, + { + "epoch": 1.2, + "grad_norm": 5.717216278745841, + "learning_rate": 6.8023478532737804e-06, + "loss": 0.7065, + "step": 14817 + }, + { + "epoch": 1.2, + "grad_norm": 4.427116417544992, + "learning_rate": 6.801938882932357e-06, + "loss": 0.5563, + "step": 14818 + }, + { + "epoch": 1.2, + "grad_norm": 3.2308138390740395, + "learning_rate": 6.801529898735537e-06, + "loss": 0.5021, + "step": 14819 + }, + { + "epoch": 1.2, + "grad_norm": 3.2607607253308295, + "learning_rate": 6.8011209006864685e-06, + "loss": 0.5585, + "step": 14820 + }, + { + "epoch": 1.2, + "grad_norm": 2.7978925262568057, + "learning_rate": 6.800711888788294e-06, + "loss": 0.6009, + "step": 14821 + }, + { + "epoch": 1.2, + "grad_norm": 3.55882890093947, + "learning_rate": 6.800302863044159e-06, + "loss": 0.6361, + "step": 14822 + }, + { + "epoch": 1.2, + "grad_norm": 4.731035896279384, + "learning_rate": 6.799893823457209e-06, + "loss": 0.5911, + "step": 14823 + }, + { + "epoch": 1.2, + "grad_norm": 2.336452905005183, + "learning_rate": 6.7994847700305875e-06, + "loss": 0.6552, + "step": 14824 + }, + { + "epoch": 1.2, + "grad_norm": 4.460804962040697, + "learning_rate": 6.7990757027674415e-06, + "loss": 0.728, + "step": 14825 + }, + { + "epoch": 1.2, + "grad_norm": 4.67237122776513, + "learning_rate": 6.798666621670916e-06, + "loss": 0.5155, + "step": 14826 + }, + { + "epoch": 1.2, + "grad_norm": 3.1174122115694933, + "learning_rate": 6.798257526744155e-06, + "loss": 0.5583, + "step": 14827 + }, + { + "epoch": 1.2, + "grad_norm": 2.7560681786351258, + "learning_rate": 6.797848417990307e-06, + "loss": 0.5209, + "step": 14828 + }, + { + "epoch": 1.2, + "grad_norm": 2.3155483478046635, + "learning_rate": 6.797439295412517e-06, + "loss": 0.5019, + "step": 14829 + }, + { + "epoch": 1.2, + "grad_norm": 2.4017957516904818, + "learning_rate": 6.797030159013929e-06, + "loss": 0.4986, + "step": 14830 + }, + { + "epoch": 1.2, + "grad_norm": 5.407251176869368, + "learning_rate": 6.7966210087976885e-06, + "loss": 0.7315, + "step": 14831 + }, + { + "epoch": 1.2, + "grad_norm": 4.1120032518994405, + "learning_rate": 6.796211844766945e-06, + "loss": 0.6064, + "step": 14832 + }, + { + "epoch": 1.2, + "grad_norm": 3.6678911173976836, + "learning_rate": 6.795802666924841e-06, + "loss": 0.5705, + "step": 14833 + }, + { + "epoch": 1.2, + "grad_norm": 3.0792773221101783, + "learning_rate": 6.7953934752745246e-06, + "loss": 0.4362, + "step": 14834 + }, + { + "epoch": 1.2, + "grad_norm": 4.415328386226919, + "learning_rate": 6.794984269819142e-06, + "loss": 0.5291, + "step": 14835 + }, + { + "epoch": 1.2, + "grad_norm": 5.219540193012217, + "learning_rate": 6.794575050561839e-06, + "loss": 0.5592, + "step": 14836 + }, + { + "epoch": 1.21, + "grad_norm": 2.4877305456309386, + "learning_rate": 6.7941658175057635e-06, + "loss": 0.6168, + "step": 14837 + }, + { + "epoch": 1.21, + "grad_norm": 15.844652333724921, + "learning_rate": 6.793756570654061e-06, + "loss": 0.5583, + "step": 14838 + }, + { + "epoch": 1.21, + "grad_norm": 2.8439661517650183, + "learning_rate": 6.793347310009877e-06, + "loss": 0.5444, + "step": 14839 + }, + { + "epoch": 1.21, + "grad_norm": 3.758723230397438, + "learning_rate": 6.792938035576362e-06, + "loss": 0.6927, + "step": 14840 + }, + { + "epoch": 1.21, + "grad_norm": 6.106635580983163, + "learning_rate": 6.792528747356659e-06, + "loss": 0.6323, + "step": 14841 + }, + { + "epoch": 1.21, + "grad_norm": 3.291241225163582, + "learning_rate": 6.792119445353918e-06, + "loss": 0.7022, + "step": 14842 + }, + { + "epoch": 1.21, + "grad_norm": 3.3238712712201055, + "learning_rate": 6.791710129571285e-06, + "loss": 0.6856, + "step": 14843 + }, + { + "epoch": 1.21, + "grad_norm": 2.6371711029896, + "learning_rate": 6.791300800011908e-06, + "loss": 0.617, + "step": 14844 + }, + { + "epoch": 1.21, + "grad_norm": 3.1693597970697853, + "learning_rate": 6.790891456678933e-06, + "loss": 0.5028, + "step": 14845 + }, + { + "epoch": 1.21, + "grad_norm": 2.546506255865266, + "learning_rate": 6.790482099575508e-06, + "loss": 0.3318, + "step": 14846 + }, + { + "epoch": 1.21, + "grad_norm": 3.29771779093766, + "learning_rate": 6.790072728704782e-06, + "loss": 0.8016, + "step": 14847 + }, + { + "epoch": 1.21, + "grad_norm": 2.643095871371575, + "learning_rate": 6.789663344069901e-06, + "loss": 0.4898, + "step": 14848 + }, + { + "epoch": 1.21, + "grad_norm": 2.635116990967789, + "learning_rate": 6.789253945674013e-06, + "loss": 0.7046, + "step": 14849 + }, + { + "epoch": 1.21, + "grad_norm": 5.908303624393926, + "learning_rate": 6.788844533520268e-06, + "loss": 0.7898, + "step": 14850 + }, + { + "epoch": 1.21, + "grad_norm": 3.8810701683758517, + "learning_rate": 6.788435107611811e-06, + "loss": 0.6115, + "step": 14851 + }, + { + "epoch": 1.21, + "grad_norm": 2.689473222381809, + "learning_rate": 6.7880256679517915e-06, + "loss": 0.5451, + "step": 14852 + }, + { + "epoch": 1.21, + "grad_norm": 3.748763429887011, + "learning_rate": 6.7876162145433595e-06, + "loss": 0.6224, + "step": 14853 + }, + { + "epoch": 1.21, + "grad_norm": 2.7967399999634255, + "learning_rate": 6.787206747389661e-06, + "loss": 0.6204, + "step": 14854 + }, + { + "epoch": 1.21, + "grad_norm": 4.25404410975183, + "learning_rate": 6.786797266493843e-06, + "loss": 0.6173, + "step": 14855 + }, + { + "epoch": 1.21, + "grad_norm": 7.17216365104514, + "learning_rate": 6.786387771859059e-06, + "loss": 0.5187, + "step": 14856 + }, + { + "epoch": 1.21, + "grad_norm": 4.504697891761151, + "learning_rate": 6.785978263488454e-06, + "loss": 0.7197, + "step": 14857 + }, + { + "epoch": 1.21, + "grad_norm": 3.0327988185167394, + "learning_rate": 6.785568741385178e-06, + "loss": 0.5814, + "step": 14858 + }, + { + "epoch": 1.21, + "grad_norm": 4.798500100088936, + "learning_rate": 6.785159205552378e-06, + "loss": 0.6174, + "step": 14859 + }, + { + "epoch": 1.21, + "grad_norm": 5.240325917693067, + "learning_rate": 6.784749655993206e-06, + "loss": 0.4095, + "step": 14860 + }, + { + "epoch": 1.21, + "grad_norm": 23.259543597924814, + "learning_rate": 6.7843400927108095e-06, + "loss": 0.4682, + "step": 14861 + }, + { + "epoch": 1.21, + "grad_norm": 4.059762094065877, + "learning_rate": 6.783930515708337e-06, + "loss": 0.5924, + "step": 14862 + }, + { + "epoch": 1.21, + "grad_norm": 3.671885201574418, + "learning_rate": 6.7835209249889385e-06, + "loss": 0.6314, + "step": 14863 + }, + { + "epoch": 1.21, + "grad_norm": 2.577228154260966, + "learning_rate": 6.7831113205557645e-06, + "loss": 0.6713, + "step": 14864 + }, + { + "epoch": 1.21, + "grad_norm": 3.070797974513478, + "learning_rate": 6.782701702411964e-06, + "loss": 0.6325, + "step": 14865 + }, + { + "epoch": 1.21, + "grad_norm": 2.77986767725274, + "learning_rate": 6.7822920705606855e-06, + "loss": 0.6196, + "step": 14866 + }, + { + "epoch": 1.21, + "grad_norm": 4.290208973267117, + "learning_rate": 6.7818824250050774e-06, + "loss": 0.6099, + "step": 14867 + }, + { + "epoch": 1.21, + "grad_norm": 4.475187602130366, + "learning_rate": 6.781472765748294e-06, + "loss": 0.6864, + "step": 14868 + }, + { + "epoch": 1.21, + "grad_norm": 3.5224372833495226, + "learning_rate": 6.7810630927934815e-06, + "loss": 0.6975, + "step": 14869 + }, + { + "epoch": 1.21, + "grad_norm": 2.7956385548333045, + "learning_rate": 6.780653406143792e-06, + "loss": 0.4659, + "step": 14870 + }, + { + "epoch": 1.21, + "grad_norm": 4.070882171854959, + "learning_rate": 6.780243705802374e-06, + "loss": 0.5846, + "step": 14871 + }, + { + "epoch": 1.21, + "grad_norm": 2.700188859238596, + "learning_rate": 6.77983399177238e-06, + "loss": 0.5747, + "step": 14872 + }, + { + "epoch": 1.21, + "grad_norm": 5.198585542092083, + "learning_rate": 6.779424264056958e-06, + "loss": 0.6275, + "step": 14873 + }, + { + "epoch": 1.21, + "grad_norm": 3.9692894750405667, + "learning_rate": 6.77901452265926e-06, + "loss": 0.5938, + "step": 14874 + }, + { + "epoch": 1.21, + "grad_norm": 3.371526530400815, + "learning_rate": 6.778604767582434e-06, + "loss": 0.5454, + "step": 14875 + }, + { + "epoch": 1.21, + "grad_norm": 7.633769723456087, + "learning_rate": 6.7781949988296345e-06, + "loss": 0.5634, + "step": 14876 + }, + { + "epoch": 1.21, + "grad_norm": 4.3664743220412054, + "learning_rate": 6.77778521640401e-06, + "loss": 0.6131, + "step": 14877 + }, + { + "epoch": 1.21, + "grad_norm": 3.8878483251973877, + "learning_rate": 6.777375420308712e-06, + "loss": 0.7577, + "step": 14878 + }, + { + "epoch": 1.21, + "grad_norm": 2.757930371934674, + "learning_rate": 6.77696561054689e-06, + "loss": 0.5119, + "step": 14879 + }, + { + "epoch": 1.21, + "grad_norm": 8.853750486591252, + "learning_rate": 6.776555787121698e-06, + "loss": 0.4906, + "step": 14880 + }, + { + "epoch": 1.21, + "grad_norm": 2.381920428994496, + "learning_rate": 6.776145950036285e-06, + "loss": 0.6058, + "step": 14881 + }, + { + "epoch": 1.21, + "grad_norm": 3.693999691498746, + "learning_rate": 6.775736099293803e-06, + "loss": 0.4644, + "step": 14882 + }, + { + "epoch": 1.21, + "grad_norm": 3.1418668238017147, + "learning_rate": 6.775326234897403e-06, + "loss": 0.5338, + "step": 14883 + }, + { + "epoch": 1.21, + "grad_norm": 4.084425543326978, + "learning_rate": 6.774916356850235e-06, + "loss": 0.5618, + "step": 14884 + }, + { + "epoch": 1.21, + "grad_norm": 2.9138840544893423, + "learning_rate": 6.774506465155455e-06, + "loss": 0.5327, + "step": 14885 + }, + { + "epoch": 1.21, + "grad_norm": 3.7928749916250415, + "learning_rate": 6.774096559816212e-06, + "loss": 0.6097, + "step": 14886 + }, + { + "epoch": 1.21, + "grad_norm": 3.6219685529410848, + "learning_rate": 6.773686640835657e-06, + "loss": 0.6725, + "step": 14887 + }, + { + "epoch": 1.21, + "grad_norm": 3.1858957132247077, + "learning_rate": 6.773276708216943e-06, + "loss": 0.487, + "step": 14888 + }, + { + "epoch": 1.21, + "grad_norm": 3.5544365517229912, + "learning_rate": 6.772866761963223e-06, + "loss": 0.5625, + "step": 14889 + }, + { + "epoch": 1.21, + "grad_norm": 2.9004141275982094, + "learning_rate": 6.772456802077647e-06, + "loss": 0.5485, + "step": 14890 + }, + { + "epoch": 1.21, + "grad_norm": 7.218163923731186, + "learning_rate": 6.772046828563369e-06, + "loss": 0.5172, + "step": 14891 + }, + { + "epoch": 1.21, + "grad_norm": 3.389057976496339, + "learning_rate": 6.771636841423539e-06, + "loss": 0.7012, + "step": 14892 + }, + { + "epoch": 1.21, + "grad_norm": 2.9873556731001734, + "learning_rate": 6.771226840661314e-06, + "loss": 0.5542, + "step": 14893 + }, + { + "epoch": 1.21, + "grad_norm": 5.268274305712097, + "learning_rate": 6.770816826279841e-06, + "loss": 0.4931, + "step": 14894 + }, + { + "epoch": 1.21, + "grad_norm": 2.593137242544547, + "learning_rate": 6.770406798282277e-06, + "loss": 0.6675, + "step": 14895 + }, + { + "epoch": 1.21, + "grad_norm": 2.8748900727611697, + "learning_rate": 6.769996756671773e-06, + "loss": 0.4301, + "step": 14896 + }, + { + "epoch": 1.21, + "grad_norm": 5.810001596382339, + "learning_rate": 6.769586701451481e-06, + "loss": 0.5507, + "step": 14897 + }, + { + "epoch": 1.21, + "grad_norm": 3.3869169879404857, + "learning_rate": 6.769176632624556e-06, + "loss": 0.6071, + "step": 14898 + }, + { + "epoch": 1.21, + "grad_norm": 3.630628339340941, + "learning_rate": 6.7687665501941504e-06, + "loss": 0.7011, + "step": 14899 + }, + { + "epoch": 1.21, + "grad_norm": 2.255777570605704, + "learning_rate": 6.7683564541634165e-06, + "loss": 0.6939, + "step": 14900 + }, + { + "epoch": 1.21, + "grad_norm": 2.4447687968255245, + "learning_rate": 6.7679463445355065e-06, + "loss": 0.5363, + "step": 14901 + }, + { + "epoch": 1.21, + "grad_norm": 3.014235548221006, + "learning_rate": 6.7675362213135775e-06, + "loss": 0.6091, + "step": 14902 + }, + { + "epoch": 1.21, + "grad_norm": 2.878403473308778, + "learning_rate": 6.7671260845007804e-06, + "loss": 0.6734, + "step": 14903 + }, + { + "epoch": 1.21, + "grad_norm": 3.066857329337155, + "learning_rate": 6.76671593410027e-06, + "loss": 0.757, + "step": 14904 + }, + { + "epoch": 1.21, + "grad_norm": 5.008502984016628, + "learning_rate": 6.766305770115198e-06, + "loss": 0.6301, + "step": 14905 + }, + { + "epoch": 1.21, + "grad_norm": 2.739842896526618, + "learning_rate": 6.76589559254872e-06, + "loss": 0.6467, + "step": 14906 + }, + { + "epoch": 1.21, + "grad_norm": 2.645533380458798, + "learning_rate": 6.76548540140399e-06, + "loss": 0.5885, + "step": 14907 + }, + { + "epoch": 1.21, + "grad_norm": 3.282026097214851, + "learning_rate": 6.765075196684162e-06, + "loss": 0.4495, + "step": 14908 + }, + { + "epoch": 1.21, + "grad_norm": 3.1166605245527084, + "learning_rate": 6.764664978392388e-06, + "loss": 0.5759, + "step": 14909 + }, + { + "epoch": 1.21, + "grad_norm": 4.45386723269136, + "learning_rate": 6.7642547465318254e-06, + "loss": 0.6144, + "step": 14910 + }, + { + "epoch": 1.21, + "grad_norm": 3.6279771688108684, + "learning_rate": 6.763844501105627e-06, + "loss": 0.55, + "step": 14911 + }, + { + "epoch": 1.21, + "grad_norm": 9.816154473336146, + "learning_rate": 6.763434242116946e-06, + "loss": 0.5878, + "step": 14912 + }, + { + "epoch": 1.21, + "grad_norm": 4.347939065644158, + "learning_rate": 6.76302396956894e-06, + "loss": 0.6775, + "step": 14913 + }, + { + "epoch": 1.21, + "grad_norm": 4.876321335187087, + "learning_rate": 6.76261368346476e-06, + "loss": 0.5713, + "step": 14914 + }, + { + "epoch": 1.21, + "grad_norm": 6.194061843963838, + "learning_rate": 6.762203383807564e-06, + "loss": 0.5373, + "step": 14915 + }, + { + "epoch": 1.21, + "grad_norm": 2.661532393474939, + "learning_rate": 6.7617930706005055e-06, + "loss": 0.5829, + "step": 14916 + }, + { + "epoch": 1.21, + "grad_norm": 3.0528360491445796, + "learning_rate": 6.761382743846738e-06, + "loss": 0.4923, + "step": 14917 + }, + { + "epoch": 1.21, + "grad_norm": 2.197285914592314, + "learning_rate": 6.7609724035494195e-06, + "loss": 0.6449, + "step": 14918 + }, + { + "epoch": 1.21, + "grad_norm": 3.126276459284213, + "learning_rate": 6.760562049711703e-06, + "loss": 0.5503, + "step": 14919 + }, + { + "epoch": 1.21, + "grad_norm": 9.65785130982446, + "learning_rate": 6.7601516823367455e-06, + "loss": 0.438, + "step": 14920 + }, + { + "epoch": 1.21, + "grad_norm": 3.8038449686159295, + "learning_rate": 6.759741301427699e-06, + "loss": 0.5474, + "step": 14921 + }, + { + "epoch": 1.21, + "grad_norm": 4.047525880087615, + "learning_rate": 6.759330906987723e-06, + "loss": 0.5408, + "step": 14922 + }, + { + "epoch": 1.21, + "grad_norm": 2.7293789238708546, + "learning_rate": 6.758920499019972e-06, + "loss": 0.5825, + "step": 14923 + }, + { + "epoch": 1.21, + "grad_norm": 2.9195180962420704, + "learning_rate": 6.7585100775276005e-06, + "loss": 0.6482, + "step": 14924 + }, + { + "epoch": 1.21, + "grad_norm": 4.526446721901817, + "learning_rate": 6.7580996425137635e-06, + "loss": 0.6229, + "step": 14925 + }, + { + "epoch": 1.21, + "grad_norm": 6.867093010154416, + "learning_rate": 6.75768919398162e-06, + "loss": 0.5328, + "step": 14926 + }, + { + "epoch": 1.21, + "grad_norm": 2.3200207549029024, + "learning_rate": 6.7572787319343245e-06, + "loss": 0.5211, + "step": 14927 + }, + { + "epoch": 1.21, + "grad_norm": 3.348954437484658, + "learning_rate": 6.756868256375032e-06, + "loss": 0.5466, + "step": 14928 + }, + { + "epoch": 1.21, + "grad_norm": 13.573143074996123, + "learning_rate": 6.7564577673069e-06, + "loss": 0.5146, + "step": 14929 + }, + { + "epoch": 1.21, + "grad_norm": 5.146589220741601, + "learning_rate": 6.756047264733085e-06, + "loss": 0.6238, + "step": 14930 + }, + { + "epoch": 1.21, + "grad_norm": 4.0734943970533735, + "learning_rate": 6.755636748656742e-06, + "loss": 0.5783, + "step": 14931 + }, + { + "epoch": 1.21, + "grad_norm": 2.8000534612454495, + "learning_rate": 6.755226219081028e-06, + "loss": 0.551, + "step": 14932 + }, + { + "epoch": 1.21, + "grad_norm": 2.7268253984180957, + "learning_rate": 6.754815676009101e-06, + "loss": 0.575, + "step": 14933 + }, + { + "epoch": 1.21, + "grad_norm": 4.320334683002618, + "learning_rate": 6.754405119444116e-06, + "loss": 0.5083, + "step": 14934 + }, + { + "epoch": 1.21, + "grad_norm": 4.814918800508288, + "learning_rate": 6.753994549389231e-06, + "loss": 0.5302, + "step": 14935 + }, + { + "epoch": 1.21, + "grad_norm": 3.5917751072140507, + "learning_rate": 6.753583965847603e-06, + "loss": 0.5347, + "step": 14936 + }, + { + "epoch": 1.21, + "grad_norm": 3.3404881717304287, + "learning_rate": 6.753173368822388e-06, + "loss": 0.4817, + "step": 14937 + }, + { + "epoch": 1.21, + "grad_norm": 4.714868534948615, + "learning_rate": 6.752762758316744e-06, + "loss": 0.6229, + "step": 14938 + }, + { + "epoch": 1.21, + "grad_norm": 2.490795051857086, + "learning_rate": 6.7523521343338285e-06, + "loss": 0.6258, + "step": 14939 + }, + { + "epoch": 1.21, + "grad_norm": 5.4682101979533595, + "learning_rate": 6.751941496876797e-06, + "loss": 0.6834, + "step": 14940 + }, + { + "epoch": 1.21, + "grad_norm": 2.898522139890144, + "learning_rate": 6.751530845948809e-06, + "loss": 0.6246, + "step": 14941 + }, + { + "epoch": 1.21, + "grad_norm": 2.110916247897185, + "learning_rate": 6.75112018155302e-06, + "loss": 0.5609, + "step": 14942 + }, + { + "epoch": 1.21, + "grad_norm": 2.8411411144098375, + "learning_rate": 6.750709503692592e-06, + "loss": 0.5703, + "step": 14943 + }, + { + "epoch": 1.21, + "grad_norm": 2.3170044236807468, + "learning_rate": 6.750298812370677e-06, + "loss": 0.6, + "step": 14944 + }, + { + "epoch": 1.21, + "grad_norm": 2.3286927833151845, + "learning_rate": 6.749888107590437e-06, + "loss": 0.5649, + "step": 14945 + }, + { + "epoch": 1.21, + "grad_norm": 3.5530501666236534, + "learning_rate": 6.749477389355028e-06, + "loss": 0.5812, + "step": 14946 + }, + { + "epoch": 1.21, + "grad_norm": 6.776717908474441, + "learning_rate": 6.749066657667609e-06, + "loss": 0.4992, + "step": 14947 + }, + { + "epoch": 1.21, + "grad_norm": 2.684583555062618, + "learning_rate": 6.7486559125313374e-06, + "loss": 0.5093, + "step": 14948 + }, + { + "epoch": 1.21, + "grad_norm": 6.409612452124999, + "learning_rate": 6.748245153949372e-06, + "loss": 0.6102, + "step": 14949 + }, + { + "epoch": 1.21, + "grad_norm": 3.8714639620374616, + "learning_rate": 6.747834381924871e-06, + "loss": 0.6109, + "step": 14950 + }, + { + "epoch": 1.21, + "grad_norm": 2.774899167323002, + "learning_rate": 6.747423596460995e-06, + "loss": 0.6473, + "step": 14951 + }, + { + "epoch": 1.21, + "grad_norm": 4.505917853933734, + "learning_rate": 6.747012797560899e-06, + "loss": 0.6114, + "step": 14952 + }, + { + "epoch": 1.21, + "grad_norm": 3.393353915004305, + "learning_rate": 6.746601985227742e-06, + "loss": 0.729, + "step": 14953 + }, + { + "epoch": 1.21, + "grad_norm": 2.947476546006258, + "learning_rate": 6.746191159464685e-06, + "loss": 0.5376, + "step": 14954 + }, + { + "epoch": 1.21, + "grad_norm": 3.006791325250897, + "learning_rate": 6.745780320274888e-06, + "loss": 0.7159, + "step": 14955 + }, + { + "epoch": 1.21, + "grad_norm": 7.165608647939638, + "learning_rate": 6.745369467661507e-06, + "loss": 0.7287, + "step": 14956 + }, + { + "epoch": 1.21, + "grad_norm": 2.7184683162082095, + "learning_rate": 6.744958601627701e-06, + "loss": 0.5261, + "step": 14957 + }, + { + "epoch": 1.21, + "grad_norm": 4.193171917950052, + "learning_rate": 6.744547722176631e-06, + "loss": 0.4563, + "step": 14958 + }, + { + "epoch": 1.21, + "grad_norm": 3.061544060675656, + "learning_rate": 6.744136829311457e-06, + "loss": 0.6054, + "step": 14959 + }, + { + "epoch": 1.22, + "grad_norm": 3.0696397711246304, + "learning_rate": 6.743725923035336e-06, + "loss": 0.604, + "step": 14960 + }, + { + "epoch": 1.22, + "grad_norm": 3.2919222796861534, + "learning_rate": 6.743315003351427e-06, + "loss": 0.6945, + "step": 14961 + }, + { + "epoch": 1.22, + "grad_norm": 2.1196984209047427, + "learning_rate": 6.742904070262894e-06, + "loss": 0.5894, + "step": 14962 + }, + { + "epoch": 1.22, + "grad_norm": 3.3374645049815874, + "learning_rate": 6.742493123772893e-06, + "loss": 0.6631, + "step": 14963 + }, + { + "epoch": 1.22, + "grad_norm": 4.222941325257737, + "learning_rate": 6.7420821638845844e-06, + "loss": 0.5689, + "step": 14964 + }, + { + "epoch": 1.22, + "grad_norm": 5.739845325321258, + "learning_rate": 6.7416711906011275e-06, + "loss": 0.4766, + "step": 14965 + }, + { + "epoch": 1.22, + "grad_norm": 5.154994471723901, + "learning_rate": 6.741260203925686e-06, + "loss": 0.5471, + "step": 14966 + }, + { + "epoch": 1.22, + "grad_norm": 3.290280583049368, + "learning_rate": 6.740849203861416e-06, + "loss": 0.5837, + "step": 14967 + }, + { + "epoch": 1.22, + "grad_norm": 2.969123611814239, + "learning_rate": 6.740438190411479e-06, + "loss": 0.4667, + "step": 14968 + }, + { + "epoch": 1.22, + "grad_norm": 4.030944546187075, + "learning_rate": 6.7400271635790345e-06, + "loss": 0.6172, + "step": 14969 + }, + { + "epoch": 1.22, + "grad_norm": 5.2029443831569, + "learning_rate": 6.739616123367246e-06, + "loss": 0.5081, + "step": 14970 + }, + { + "epoch": 1.22, + "grad_norm": 3.327385103504345, + "learning_rate": 6.739205069779272e-06, + "loss": 0.5785, + "step": 14971 + }, + { + "epoch": 1.22, + "grad_norm": 15.547299826960266, + "learning_rate": 6.738794002818273e-06, + "loss": 0.6687, + "step": 14972 + }, + { + "epoch": 1.22, + "grad_norm": 3.332514931811837, + "learning_rate": 6.738382922487408e-06, + "loss": 0.6012, + "step": 14973 + }, + { + "epoch": 1.22, + "grad_norm": 2.4642680250707407, + "learning_rate": 6.7379718287898425e-06, + "loss": 0.5094, + "step": 14974 + }, + { + "epoch": 1.22, + "grad_norm": 2.9411803759772917, + "learning_rate": 6.737560721728733e-06, + "loss": 0.6168, + "step": 14975 + }, + { + "epoch": 1.22, + "grad_norm": 3.580392598301216, + "learning_rate": 6.7371496013072435e-06, + "loss": 0.4821, + "step": 14976 + }, + { + "epoch": 1.22, + "grad_norm": 2.5370892243056793, + "learning_rate": 6.736738467528532e-06, + "loss": 0.5574, + "step": 14977 + }, + { + "epoch": 1.22, + "grad_norm": 8.459756082518046, + "learning_rate": 6.736327320395764e-06, + "loss": 0.4706, + "step": 14978 + }, + { + "epoch": 1.22, + "grad_norm": 3.121416643139075, + "learning_rate": 6.735916159912098e-06, + "loss": 0.551, + "step": 14979 + }, + { + "epoch": 1.22, + "grad_norm": 3.2487348434294834, + "learning_rate": 6.735504986080696e-06, + "loss": 0.6276, + "step": 14980 + }, + { + "epoch": 1.22, + "grad_norm": 3.8353263301090266, + "learning_rate": 6.735093798904721e-06, + "loss": 0.6296, + "step": 14981 + }, + { + "epoch": 1.22, + "grad_norm": 4.80225052641842, + "learning_rate": 6.734682598387331e-06, + "loss": 0.6556, + "step": 14982 + }, + { + "epoch": 1.22, + "grad_norm": 13.759825068630365, + "learning_rate": 6.734271384531691e-06, + "loss": 0.4623, + "step": 14983 + }, + { + "epoch": 1.22, + "grad_norm": 2.566141694530004, + "learning_rate": 6.733860157340963e-06, + "loss": 0.5698, + "step": 14984 + }, + { + "epoch": 1.22, + "grad_norm": 3.3679456388370324, + "learning_rate": 6.733448916818308e-06, + "loss": 0.5367, + "step": 14985 + }, + { + "epoch": 1.22, + "grad_norm": 3.4333308282531214, + "learning_rate": 6.733037662966886e-06, + "loss": 0.7183, + "step": 14986 + }, + { + "epoch": 1.22, + "grad_norm": 5.990579847629308, + "learning_rate": 6.732626395789863e-06, + "loss": 0.5603, + "step": 14987 + }, + { + "epoch": 1.22, + "grad_norm": 3.4034368872687937, + "learning_rate": 6.7322151152904006e-06, + "loss": 0.6942, + "step": 14988 + }, + { + "epoch": 1.22, + "grad_norm": 2.611527490067071, + "learning_rate": 6.73180382147166e-06, + "loss": 0.5661, + "step": 14989 + }, + { + "epoch": 1.22, + "grad_norm": 3.0432731680594554, + "learning_rate": 6.731392514336802e-06, + "loss": 0.513, + "step": 14990 + }, + { + "epoch": 1.22, + "grad_norm": 4.513425710544245, + "learning_rate": 6.730981193888993e-06, + "loss": 0.6597, + "step": 14991 + }, + { + "epoch": 1.22, + "grad_norm": 9.931240117034832, + "learning_rate": 6.7305698601313925e-06, + "loss": 0.5815, + "step": 14992 + }, + { + "epoch": 1.22, + "grad_norm": 3.688825611488757, + "learning_rate": 6.7301585130671665e-06, + "loss": 0.6309, + "step": 14993 + }, + { + "epoch": 1.22, + "grad_norm": 2.402023152973241, + "learning_rate": 6.729747152699474e-06, + "loss": 0.4699, + "step": 14994 + }, + { + "epoch": 1.22, + "grad_norm": 5.095778514060208, + "learning_rate": 6.729335779031482e-06, + "loss": 0.5455, + "step": 14995 + }, + { + "epoch": 1.22, + "grad_norm": 3.062877155310399, + "learning_rate": 6.728924392066352e-06, + "loss": 0.5898, + "step": 14996 + }, + { + "epoch": 1.22, + "grad_norm": 5.524628374013388, + "learning_rate": 6.7285129918072455e-06, + "loss": 0.7479, + "step": 14997 + }, + { + "epoch": 1.22, + "grad_norm": 3.702879940472238, + "learning_rate": 6.7281015782573265e-06, + "loss": 0.6725, + "step": 14998 + }, + { + "epoch": 1.22, + "grad_norm": 2.812282553187698, + "learning_rate": 6.727690151419761e-06, + "loss": 0.6656, + "step": 14999 + }, + { + "epoch": 1.22, + "grad_norm": 2.465124049605808, + "learning_rate": 6.72727871129771e-06, + "loss": 0.4037, + "step": 15000 + } + ], + "logging_steps": 1.0, + "max_steps": 36936, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "total_flos": 1.2325614325636006e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}