diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7728 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9959072305593453, + "eval_steps": 500, + "global_step": 1098, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002728512960436562, + "grad_norm": 9.15278434753418, + "learning_rate": 1.8181818181818183e-07, + "loss": 1.071, + "step": 1 + }, + { + "epoch": 0.005457025920873124, + "grad_norm": 9.647517204284668, + "learning_rate": 3.6363636363636366e-07, + "loss": 1.0791, + "step": 2 + }, + { + "epoch": 0.008185538881309686, + "grad_norm": 9.722785949707031, + "learning_rate": 5.454545454545455e-07, + "loss": 1.0874, + "step": 3 + }, + { + "epoch": 0.010914051841746248, + "grad_norm": 9.583983421325684, + "learning_rate": 7.272727272727273e-07, + "loss": 1.0872, + "step": 4 + }, + { + "epoch": 0.013642564802182811, + "grad_norm": 9.145880699157715, + "learning_rate": 9.090909090909091e-07, + "loss": 1.0747, + "step": 5 + }, + { + "epoch": 0.01637107776261937, + "grad_norm": 9.105477333068848, + "learning_rate": 1.090909090909091e-06, + "loss": 1.0738, + "step": 6 + }, + { + "epoch": 0.019099590723055934, + "grad_norm": 8.226037979125977, + "learning_rate": 1.2727272727272728e-06, + "loss": 1.0432, + "step": 7 + }, + { + "epoch": 0.021828103683492497, + "grad_norm": 7.477120876312256, + "learning_rate": 1.4545454545454546e-06, + "loss": 1.0028, + "step": 8 + }, + { + "epoch": 0.02455661664392906, + "grad_norm": 6.091797351837158, + "learning_rate": 1.6363636363636365e-06, + "loss": 0.9729, + "step": 9 + }, + { + "epoch": 0.027285129604365622, + "grad_norm": 5.667421817779541, + "learning_rate": 1.8181818181818183e-06, + "loss": 0.9413, + "step": 10 + }, + { + "epoch": 0.030013642564802184, + "grad_norm": 4.3738813400268555, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.8404, + "step": 11 + }, + { + "epoch": 0.03274215552523874, + "grad_norm": 4.342959880828857, + "learning_rate": 2.181818181818182e-06, + "loss": 0.8228, + "step": 12 + }, + { + "epoch": 0.03547066848567531, + "grad_norm": 3.8612661361694336, + "learning_rate": 2.363636363636364e-06, + "loss": 0.8026, + "step": 13 + }, + { + "epoch": 0.03819918144611187, + "grad_norm": 3.513092517852783, + "learning_rate": 2.5454545454545456e-06, + "loss": 0.7898, + "step": 14 + }, + { + "epoch": 0.040927694406548434, + "grad_norm": 4.365728378295898, + "learning_rate": 2.7272727272727272e-06, + "loss": 0.7217, + "step": 15 + }, + { + "epoch": 0.04365620736698499, + "grad_norm": 3.0272576808929443, + "learning_rate": 2.9090909090909093e-06, + "loss": 0.7098, + "step": 16 + }, + { + "epoch": 0.04638472032742155, + "grad_norm": 1.917324423789978, + "learning_rate": 3.090909090909091e-06, + "loss": 0.6913, + "step": 17 + }, + { + "epoch": 0.04911323328785812, + "grad_norm": 1.4591542482376099, + "learning_rate": 3.272727272727273e-06, + "loss": 0.6585, + "step": 18 + }, + { + "epoch": 0.05184174624829468, + "grad_norm": 1.2868497371673584, + "learning_rate": 3.454545454545455e-06, + "loss": 0.6543, + "step": 19 + }, + { + "epoch": 0.054570259208731244, + "grad_norm": 1.2492839097976685, + "learning_rate": 3.6363636363636366e-06, + "loss": 0.6487, + "step": 20 + }, + { + "epoch": 0.0572987721691678, + "grad_norm": 1.1911948919296265, + "learning_rate": 3.818181818181819e-06, + "loss": 0.6244, + "step": 21 + }, + { + "epoch": 0.06002728512960437, + 
"grad_norm": 0.9519822597503662, + "learning_rate": 4.000000000000001e-06, + "loss": 0.6212, + "step": 22 + }, + { + "epoch": 0.06275579809004093, + "grad_norm": 0.92195725440979, + "learning_rate": 4.181818181818182e-06, + "loss": 0.6121, + "step": 23 + }, + { + "epoch": 0.06548431105047749, + "grad_norm": 0.9371785521507263, + "learning_rate": 4.363636363636364e-06, + "loss": 0.595, + "step": 24 + }, + { + "epoch": 0.06821282401091405, + "grad_norm": 0.8275936245918274, + "learning_rate": 4.5454545454545455e-06, + "loss": 0.5948, + "step": 25 + }, + { + "epoch": 0.07094133697135062, + "grad_norm": 0.7959325313568115, + "learning_rate": 4.727272727272728e-06, + "loss": 0.5912, + "step": 26 + }, + { + "epoch": 0.07366984993178717, + "grad_norm": 0.7594197392463684, + "learning_rate": 4.90909090909091e-06, + "loss": 0.5825, + "step": 27 + }, + { + "epoch": 0.07639836289222374, + "grad_norm": 0.7820079326629639, + "learning_rate": 5.090909090909091e-06, + "loss": 0.5589, + "step": 28 + }, + { + "epoch": 0.0791268758526603, + "grad_norm": 0.7125181555747986, + "learning_rate": 5.272727272727273e-06, + "loss": 0.5774, + "step": 29 + }, + { + "epoch": 0.08185538881309687, + "grad_norm": 0.6750136613845825, + "learning_rate": 5.4545454545454545e-06, + "loss": 0.5613, + "step": 30 + }, + { + "epoch": 0.08458390177353342, + "grad_norm": 0.6903117895126343, + "learning_rate": 5.636363636363636e-06, + "loss": 0.5659, + "step": 31 + }, + { + "epoch": 0.08731241473396999, + "grad_norm": 0.7005020380020142, + "learning_rate": 5.8181818181818185e-06, + "loss": 0.565, + "step": 32 + }, + { + "epoch": 0.09004092769440655, + "grad_norm": 0.6101182103157043, + "learning_rate": 6e-06, + "loss": 0.5515, + "step": 33 + }, + { + "epoch": 0.0927694406548431, + "grad_norm": 0.6128501892089844, + "learning_rate": 6.181818181818182e-06, + "loss": 0.5444, + "step": 34 + }, + { + "epoch": 0.09549795361527967, + "grad_norm": 0.6540465950965881, + "learning_rate": 6.363636363636364e-06, + "loss": 0.5529, + "step": 35 + }, + { + "epoch": 0.09822646657571624, + "grad_norm": 0.5642831325531006, + "learning_rate": 6.545454545454546e-06, + "loss": 0.5462, + "step": 36 + }, + { + "epoch": 0.1009549795361528, + "grad_norm": 0.5906216502189636, + "learning_rate": 6.7272727272727275e-06, + "loss": 0.5429, + "step": 37 + }, + { + "epoch": 0.10368349249658936, + "grad_norm": 0.5924307107925415, + "learning_rate": 6.90909090909091e-06, + "loss": 0.5317, + "step": 38 + }, + { + "epoch": 0.10641200545702592, + "grad_norm": 0.5813631415367126, + "learning_rate": 7.0909090909090916e-06, + "loss": 0.5353, + "step": 39 + }, + { + "epoch": 0.10914051841746249, + "grad_norm": 0.5987147092819214, + "learning_rate": 7.272727272727273e-06, + "loss": 0.5262, + "step": 40 + }, + { + "epoch": 0.11186903137789904, + "grad_norm": 0.5837368369102478, + "learning_rate": 7.454545454545456e-06, + "loss": 0.52, + "step": 41 + }, + { + "epoch": 0.1145975443383356, + "grad_norm": 0.5774114727973938, + "learning_rate": 7.636363636363638e-06, + "loss": 0.5186, + "step": 42 + }, + { + "epoch": 0.11732605729877217, + "grad_norm": 0.6058359146118164, + "learning_rate": 7.81818181818182e-06, + "loss": 0.5162, + "step": 43 + }, + { + "epoch": 0.12005457025920874, + "grad_norm": 0.5552087426185608, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5134, + "step": 44 + }, + { + "epoch": 0.12278308321964529, + "grad_norm": 0.5530264973640442, + "learning_rate": 8.181818181818183e-06, + "loss": 0.5115, + "step": 45 + }, + { + "epoch": 0.12551159618008187, + 
"grad_norm": 0.5766640901565552, + "learning_rate": 8.363636363636365e-06, + "loss": 0.5107, + "step": 46 + }, + { + "epoch": 0.12824010914051842, + "grad_norm": 0.5763387680053711, + "learning_rate": 8.545454545454546e-06, + "loss": 0.5169, + "step": 47 + }, + { + "epoch": 0.13096862210095497, + "grad_norm": 0.5950232744216919, + "learning_rate": 8.727272727272728e-06, + "loss": 0.496, + "step": 48 + }, + { + "epoch": 0.13369713506139155, + "grad_norm": 0.550932765007019, + "learning_rate": 8.90909090909091e-06, + "loss": 0.4977, + "step": 49 + }, + { + "epoch": 0.1364256480218281, + "grad_norm": 0.5710775256156921, + "learning_rate": 9.090909090909091e-06, + "loss": 0.4979, + "step": 50 + }, + { + "epoch": 0.13915416098226466, + "grad_norm": 0.5536239743232727, + "learning_rate": 9.272727272727273e-06, + "loss": 0.4901, + "step": 51 + }, + { + "epoch": 0.14188267394270124, + "grad_norm": 0.5787481665611267, + "learning_rate": 9.454545454545456e-06, + "loss": 0.4985, + "step": 52 + }, + { + "epoch": 0.1446111869031378, + "grad_norm": 0.5732221007347107, + "learning_rate": 9.636363636363638e-06, + "loss": 0.4995, + "step": 53 + }, + { + "epoch": 0.14733969986357434, + "grad_norm": 0.5549193024635315, + "learning_rate": 9.81818181818182e-06, + "loss": 0.4786, + "step": 54 + }, + { + "epoch": 0.15006821282401092, + "grad_norm": 0.5745016932487488, + "learning_rate": 1e-05, + "loss": 0.4814, + "step": 55 + }, + { + "epoch": 0.15279672578444747, + "grad_norm": 0.5580504536628723, + "learning_rate": 1.0181818181818182e-05, + "loss": 0.4786, + "step": 56 + }, + { + "epoch": 0.15552523874488403, + "grad_norm": 0.5935932397842407, + "learning_rate": 1.0363636363636364e-05, + "loss": 0.492, + "step": 57 + }, + { + "epoch": 0.1582537517053206, + "grad_norm": 0.5717213153839111, + "learning_rate": 1.0545454545454546e-05, + "loss": 0.47, + "step": 58 + }, + { + "epoch": 0.16098226466575716, + "grad_norm": 0.5752708315849304, + "learning_rate": 1.0727272727272729e-05, + "loss": 0.4728, + "step": 59 + }, + { + "epoch": 0.16371077762619374, + "grad_norm": 0.6000607013702393, + "learning_rate": 1.0909090909090909e-05, + "loss": 0.4667, + "step": 60 + }, + { + "epoch": 0.1664392905866303, + "grad_norm": 0.5734297633171082, + "learning_rate": 1.1090909090909092e-05, + "loss": 0.4666, + "step": 61 + }, + { + "epoch": 0.16916780354706684, + "grad_norm": 0.6120443940162659, + "learning_rate": 1.1272727272727272e-05, + "loss": 0.4629, + "step": 62 + }, + { + "epoch": 0.17189631650750342, + "grad_norm": 0.5620084404945374, + "learning_rate": 1.1454545454545455e-05, + "loss": 0.4587, + "step": 63 + }, + { + "epoch": 0.17462482946793997, + "grad_norm": 0.6068463921546936, + "learning_rate": 1.1636363636363637e-05, + "loss": 0.4671, + "step": 64 + }, + { + "epoch": 0.17735334242837653, + "grad_norm": 0.5794389843940735, + "learning_rate": 1.181818181818182e-05, + "loss": 0.4559, + "step": 65 + }, + { + "epoch": 0.1800818553888131, + "grad_norm": 0.6076776385307312, + "learning_rate": 1.2e-05, + "loss": 0.4536, + "step": 66 + }, + { + "epoch": 0.18281036834924966, + "grad_norm": 0.5867766737937927, + "learning_rate": 1.2181818181818184e-05, + "loss": 0.4535, + "step": 67 + }, + { + "epoch": 0.1855388813096862, + "grad_norm": 0.639927089214325, + "learning_rate": 1.2363636363636364e-05, + "loss": 0.446, + "step": 68 + }, + { + "epoch": 0.1882673942701228, + "grad_norm": 0.6484801173210144, + "learning_rate": 1.2545454545454547e-05, + "loss": 0.4394, + "step": 69 + }, + { + "epoch": 0.19099590723055934, + "grad_norm": 
0.6096192002296448, + "learning_rate": 1.2727272727272728e-05, + "loss": 0.4378, + "step": 70 + }, + { + "epoch": 0.1937244201909959, + "grad_norm": 0.6389114260673523, + "learning_rate": 1.2909090909090912e-05, + "loss": 0.4445, + "step": 71 + }, + { + "epoch": 0.19645293315143247, + "grad_norm": 0.6160559058189392, + "learning_rate": 1.3090909090909092e-05, + "loss": 0.4359, + "step": 72 + }, + { + "epoch": 0.19918144611186903, + "grad_norm": 0.6258884072303772, + "learning_rate": 1.3272727272727275e-05, + "loss": 0.4244, + "step": 73 + }, + { + "epoch": 0.2019099590723056, + "grad_norm": 0.6996473073959351, + "learning_rate": 1.3454545454545455e-05, + "loss": 0.437, + "step": 74 + }, + { + "epoch": 0.20463847203274216, + "grad_norm": 0.6465001702308655, + "learning_rate": 1.3636363636363637e-05, + "loss": 0.4224, + "step": 75 + }, + { + "epoch": 0.2073669849931787, + "grad_norm": 0.6399327516555786, + "learning_rate": 1.381818181818182e-05, + "loss": 0.4258, + "step": 76 + }, + { + "epoch": 0.2100954979536153, + "grad_norm": 0.7422960996627808, + "learning_rate": 1.4e-05, + "loss": 0.4199, + "step": 77 + }, + { + "epoch": 0.21282401091405184, + "grad_norm": 0.6545052528381348, + "learning_rate": 1.4181818181818183e-05, + "loss": 0.4243, + "step": 78 + }, + { + "epoch": 0.2155525238744884, + "grad_norm": 0.6757943630218506, + "learning_rate": 1.4363636363636365e-05, + "loss": 0.4093, + "step": 79 + }, + { + "epoch": 0.21828103683492497, + "grad_norm": 1.1193770170211792, + "learning_rate": 1.4545454545454546e-05, + "loss": 0.4144, + "step": 80 + }, + { + "epoch": 0.22100954979536153, + "grad_norm": 0.8388747572898865, + "learning_rate": 1.4727272727272728e-05, + "loss": 0.4079, + "step": 81 + }, + { + "epoch": 0.22373806275579808, + "grad_norm": 0.7611749172210693, + "learning_rate": 1.4909090909090911e-05, + "loss": 0.4066, + "step": 82 + }, + { + "epoch": 0.22646657571623466, + "grad_norm": 0.8053273558616638, + "learning_rate": 1.5090909090909091e-05, + "loss": 0.4045, + "step": 83 + }, + { + "epoch": 0.2291950886766712, + "grad_norm": 0.8546133637428284, + "learning_rate": 1.5272727272727276e-05, + "loss": 0.3915, + "step": 84 + }, + { + "epoch": 0.23192360163710776, + "grad_norm": 0.7772160172462463, + "learning_rate": 1.5454545454545454e-05, + "loss": 0.3951, + "step": 85 + }, + { + "epoch": 0.23465211459754434, + "grad_norm": 0.7378780245780945, + "learning_rate": 1.563636363636364e-05, + "loss": 0.3938, + "step": 86 + }, + { + "epoch": 0.2373806275579809, + "grad_norm": 0.8644944429397583, + "learning_rate": 1.5818181818181818e-05, + "loss": 0.3924, + "step": 87 + }, + { + "epoch": 0.24010914051841747, + "grad_norm": 0.7195233702659607, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.3832, + "step": 88 + }, + { + "epoch": 0.24283765347885403, + "grad_norm": 0.714919924736023, + "learning_rate": 1.6181818181818184e-05, + "loss": 0.3865, + "step": 89 + }, + { + "epoch": 0.24556616643929058, + "grad_norm": 0.7049028873443604, + "learning_rate": 1.6363636363636366e-05, + "loss": 0.3889, + "step": 90 + }, + { + "epoch": 0.24829467939972716, + "grad_norm": 0.6938926577568054, + "learning_rate": 1.6545454545454548e-05, + "loss": 0.3854, + "step": 91 + }, + { + "epoch": 0.25102319236016374, + "grad_norm": 0.7241790294647217, + "learning_rate": 1.672727272727273e-05, + "loss": 0.3745, + "step": 92 + }, + { + "epoch": 0.25375170532060026, + "grad_norm": 0.7766016721725464, + "learning_rate": 1.690909090909091e-05, + "loss": 0.3766, + "step": 93 + }, + { + "epoch": 
0.25648021828103684, + "grad_norm": 0.6900867223739624, + "learning_rate": 1.7090909090909092e-05, + "loss": 0.3661, + "step": 94 + }, + { + "epoch": 0.2592087312414734, + "grad_norm": 0.7429078221321106, + "learning_rate": 1.7272727272727274e-05, + "loss": 0.369, + "step": 95 + }, + { + "epoch": 0.26193724420190995, + "grad_norm": 0.7221667766571045, + "learning_rate": 1.7454545454545456e-05, + "loss": 0.36, + "step": 96 + }, + { + "epoch": 0.2646657571623465, + "grad_norm": 0.8097471594810486, + "learning_rate": 1.7636363636363637e-05, + "loss": 0.3594, + "step": 97 + }, + { + "epoch": 0.2673942701227831, + "grad_norm": 0.7111004590988159, + "learning_rate": 1.781818181818182e-05, + "loss": 0.3649, + "step": 98 + }, + { + "epoch": 0.27012278308321963, + "grad_norm": 0.8246558904647827, + "learning_rate": 1.8e-05, + "loss": 0.3726, + "step": 99 + }, + { + "epoch": 0.2728512960436562, + "grad_norm": 0.7303751111030579, + "learning_rate": 1.8181818181818182e-05, + "loss": 0.356, + "step": 100 + }, + { + "epoch": 0.2755798090040928, + "grad_norm": 0.7322264909744263, + "learning_rate": 1.8363636363636367e-05, + "loss": 0.3542, + "step": 101 + }, + { + "epoch": 0.2783083219645293, + "grad_norm": 0.7856989502906799, + "learning_rate": 1.8545454545454545e-05, + "loss": 0.3479, + "step": 102 + }, + { + "epoch": 0.2810368349249659, + "grad_norm": 0.6907728910446167, + "learning_rate": 1.872727272727273e-05, + "loss": 0.3428, + "step": 103 + }, + { + "epoch": 0.2837653478854025, + "grad_norm": 0.8893505930900574, + "learning_rate": 1.8909090909090912e-05, + "loss": 0.3453, + "step": 104 + }, + { + "epoch": 0.286493860845839, + "grad_norm": 0.8685447573661804, + "learning_rate": 1.9090909090909094e-05, + "loss": 0.3458, + "step": 105 + }, + { + "epoch": 0.2892223738062756, + "grad_norm": 0.7228476405143738, + "learning_rate": 1.9272727272727275e-05, + "loss": 0.3329, + "step": 106 + }, + { + "epoch": 0.29195088676671216, + "grad_norm": 0.8950058221817017, + "learning_rate": 1.9454545454545457e-05, + "loss": 0.3318, + "step": 107 + }, + { + "epoch": 0.2946793997271487, + "grad_norm": 0.7596233487129211, + "learning_rate": 1.963636363636364e-05, + "loss": 0.3435, + "step": 108 + }, + { + "epoch": 0.29740791268758526, + "grad_norm": 0.745883047580719, + "learning_rate": 1.981818181818182e-05, + "loss": 0.3379, + "step": 109 + }, + { + "epoch": 0.30013642564802184, + "grad_norm": 0.729240894317627, + "learning_rate": 2e-05, + "loss": 0.3338, + "step": 110 + }, + { + "epoch": 0.30286493860845837, + "grad_norm": 0.7607076168060303, + "learning_rate": 1.9999949446003432e-05, + "loss": 0.3302, + "step": 111 + }, + { + "epoch": 0.30559345156889495, + "grad_norm": 0.7514939308166504, + "learning_rate": 1.9999797784524866e-05, + "loss": 0.3305, + "step": 112 + }, + { + "epoch": 0.3083219645293315, + "grad_norm": 0.6919752955436707, + "learning_rate": 1.9999545017097726e-05, + "loss": 0.3256, + "step": 113 + }, + { + "epoch": 0.31105047748976805, + "grad_norm": 0.745072603225708, + "learning_rate": 1.999919114627769e-05, + "loss": 0.3234, + "step": 114 + }, + { + "epoch": 0.31377899045020463, + "grad_norm": 0.7782521843910217, + "learning_rate": 1.9998736175642674e-05, + "loss": 0.3341, + "step": 115 + }, + { + "epoch": 0.3165075034106412, + "grad_norm": 0.70183265209198, + "learning_rate": 1.9998180109792793e-05, + "loss": 0.3079, + "step": 116 + }, + { + "epoch": 0.31923601637107774, + "grad_norm": 0.8084478378295898, + "learning_rate": 1.999752295435032e-05, + "loss": 0.3245, + "step": 117 + }, + { + 
"epoch": 0.3219645293315143, + "grad_norm": 0.70412278175354, + "learning_rate": 1.999676471595962e-05, + "loss": 0.318, + "step": 118 + }, + { + "epoch": 0.3246930422919509, + "grad_norm": 0.74163818359375, + "learning_rate": 1.9995905402287094e-05, + "loss": 0.3139, + "step": 119 + }, + { + "epoch": 0.3274215552523875, + "grad_norm": 0.6615740656852722, + "learning_rate": 1.9994945022021085e-05, + "loss": 0.3058, + "step": 120 + }, + { + "epoch": 0.330150068212824, + "grad_norm": 0.8651596903800964, + "learning_rate": 1.9993883584871807e-05, + "loss": 0.3171, + "step": 121 + }, + { + "epoch": 0.3328785811732606, + "grad_norm": 0.712332010269165, + "learning_rate": 1.9992721101571238e-05, + "loss": 0.2981, + "step": 122 + }, + { + "epoch": 0.33560709413369716, + "grad_norm": 0.7112699747085571, + "learning_rate": 1.999145758387301e-05, + "loss": 0.3184, + "step": 123 + }, + { + "epoch": 0.3383356070941337, + "grad_norm": 0.7106016874313354, + "learning_rate": 1.9990093044552304e-05, + "loss": 0.2973, + "step": 124 + }, + { + "epoch": 0.34106412005457026, + "grad_norm": 0.7371537685394287, + "learning_rate": 1.9988627497405696e-05, + "loss": 0.3082, + "step": 125 + }, + { + "epoch": 0.34379263301500684, + "grad_norm": 0.6868679523468018, + "learning_rate": 1.9987060957251047e-05, + "loss": 0.3042, + "step": 126 + }, + { + "epoch": 0.34652114597544337, + "grad_norm": 0.7139438986778259, + "learning_rate": 1.9985393439927325e-05, + "loss": 0.3072, + "step": 127 + }, + { + "epoch": 0.34924965893587995, + "grad_norm": 0.7296750545501709, + "learning_rate": 1.998362496229446e-05, + "loss": 0.3042, + "step": 128 + }, + { + "epoch": 0.3519781718963165, + "grad_norm": 0.6611348390579224, + "learning_rate": 1.9981755542233175e-05, + "loss": 0.2931, + "step": 129 + }, + { + "epoch": 0.35470668485675305, + "grad_norm": 0.6664003133773804, + "learning_rate": 1.997978519864481e-05, + "loss": 0.3008, + "step": 130 + }, + { + "epoch": 0.35743519781718963, + "grad_norm": 0.6638462543487549, + "learning_rate": 1.9977713951451102e-05, + "loss": 0.3034, + "step": 131 + }, + { + "epoch": 0.3601637107776262, + "grad_norm": 0.7581326961517334, + "learning_rate": 1.9975541821594028e-05, + "loss": 0.3027, + "step": 132 + }, + { + "epoch": 0.36289222373806274, + "grad_norm": 0.6742005348205566, + "learning_rate": 1.9973268831035547e-05, + "loss": 0.2966, + "step": 133 + }, + { + "epoch": 0.3656207366984993, + "grad_norm": 0.7190444469451904, + "learning_rate": 1.9970895002757413e-05, + "loss": 0.2928, + "step": 134 + }, + { + "epoch": 0.3683492496589359, + "grad_norm": 0.6588881015777588, + "learning_rate": 1.996842036076093e-05, + "loss": 0.2966, + "step": 135 + }, + { + "epoch": 0.3710777626193724, + "grad_norm": 0.7533734440803528, + "learning_rate": 1.99658449300667e-05, + "loss": 0.2939, + "step": 136 + }, + { + "epoch": 0.373806275579809, + "grad_norm": 0.6481987833976746, + "learning_rate": 1.9963168736714395e-05, + "loss": 0.2903, + "step": 137 + }, + { + "epoch": 0.3765347885402456, + "grad_norm": 0.6547331809997559, + "learning_rate": 1.9960391807762462e-05, + "loss": 0.2849, + "step": 138 + }, + { + "epoch": 0.3792633015006821, + "grad_norm": 0.6273127794265747, + "learning_rate": 1.9957514171287875e-05, + "loss": 0.2858, + "step": 139 + }, + { + "epoch": 0.3819918144611187, + "grad_norm": 0.6538695096969604, + "learning_rate": 1.995453585638584e-05, + "loss": 0.2884, + "step": 140 + }, + { + "epoch": 0.38472032742155526, + "grad_norm": 0.6586217284202576, + "learning_rate": 1.9951456893169497e-05, + 
"loss": 0.2807, + "step": 141 + }, + { + "epoch": 0.3874488403819918, + "grad_norm": 0.6591027975082397, + "learning_rate": 1.994827731276963e-05, + "loss": 0.2919, + "step": 142 + }, + { + "epoch": 0.39017735334242837, + "grad_norm": 0.616389274597168, + "learning_rate": 1.994499714733434e-05, + "loss": 0.2843, + "step": 143 + }, + { + "epoch": 0.39290586630286495, + "grad_norm": 0.6623064875602722, + "learning_rate": 1.9941616430028713e-05, + "loss": 0.2841, + "step": 144 + }, + { + "epoch": 0.3956343792633015, + "grad_norm": 0.617335319519043, + "learning_rate": 1.993813519503451e-05, + "loss": 0.2805, + "step": 145 + }, + { + "epoch": 0.39836289222373805, + "grad_norm": 0.6545597314834595, + "learning_rate": 1.9934553477549795e-05, + "loss": 0.2816, + "step": 146 + }, + { + "epoch": 0.40109140518417463, + "grad_norm": 0.6183223724365234, + "learning_rate": 1.99308713137886e-05, + "loss": 0.2766, + "step": 147 + }, + { + "epoch": 0.4038199181446112, + "grad_norm": 0.6186001896858215, + "learning_rate": 1.992708874098054e-05, + "loss": 0.272, + "step": 148 + }, + { + "epoch": 0.40654843110504774, + "grad_norm": 0.6923176646232605, + "learning_rate": 1.992320579737045e-05, + "loss": 0.2771, + "step": 149 + }, + { + "epoch": 0.4092769440654843, + "grad_norm": 0.659702718257904, + "learning_rate": 1.9919222522217998e-05, + "loss": 0.2795, + "step": 150 + }, + { + "epoch": 0.4120054570259209, + "grad_norm": 0.6168344616889954, + "learning_rate": 1.9915138955797272e-05, + "loss": 0.2759, + "step": 151 + }, + { + "epoch": 0.4147339699863574, + "grad_norm": 0.6312188506126404, + "learning_rate": 1.9910955139396395e-05, + "loss": 0.274, + "step": 152 + }, + { + "epoch": 0.417462482946794, + "grad_norm": 0.6516560316085815, + "learning_rate": 1.99066711153171e-05, + "loss": 0.2716, + "step": 153 + }, + { + "epoch": 0.4201909959072306, + "grad_norm": 0.6017420291900635, + "learning_rate": 1.990228692687429e-05, + "loss": 0.2659, + "step": 154 + }, + { + "epoch": 0.4229195088676671, + "grad_norm": 0.6232393980026245, + "learning_rate": 1.9897802618395614e-05, + "loss": 0.2747, + "step": 155 + }, + { + "epoch": 0.4256480218281037, + "grad_norm": 0.611860454082489, + "learning_rate": 1.9893218235221016e-05, + "loss": 0.2704, + "step": 156 + }, + { + "epoch": 0.42837653478854026, + "grad_norm": 0.6973581314086914, + "learning_rate": 1.988853382370228e-05, + "loss": 0.2801, + "step": 157 + }, + { + "epoch": 0.4311050477489768, + "grad_norm": 0.5803492069244385, + "learning_rate": 1.988374943120254e-05, + "loss": 0.2726, + "step": 158 + }, + { + "epoch": 0.43383356070941337, + "grad_norm": 0.6552534699440002, + "learning_rate": 1.9878865106095838e-05, + "loss": 0.2585, + "step": 159 + }, + { + "epoch": 0.43656207366984995, + "grad_norm": 0.5801110863685608, + "learning_rate": 1.9873880897766597e-05, + "loss": 0.2628, + "step": 160 + }, + { + "epoch": 0.4392905866302865, + "grad_norm": 0.7232557535171509, + "learning_rate": 1.9868796856609154e-05, + "loss": 0.2661, + "step": 161 + }, + { + "epoch": 0.44201909959072305, + "grad_norm": 0.6580977439880371, + "learning_rate": 1.9863613034027224e-05, + "loss": 0.2679, + "step": 162 + }, + { + "epoch": 0.44474761255115963, + "grad_norm": 0.5943677425384521, + "learning_rate": 1.9858329482433404e-05, + "loss": 0.2585, + "step": 163 + }, + { + "epoch": 0.44747612551159616, + "grad_norm": 0.6069351434707642, + "learning_rate": 1.985294625524861e-05, + "loss": 0.2627, + "step": 164 + }, + { + "epoch": 0.45020463847203274, + "grad_norm": 0.5810995101928711, + 
"learning_rate": 1.984746340690159e-05, + "loss": 0.2622, + "step": 165 + }, + { + "epoch": 0.4529331514324693, + "grad_norm": 0.6588981747627258, + "learning_rate": 1.9841880992828306e-05, + "loss": 0.26, + "step": 166 + }, + { + "epoch": 0.45566166439290584, + "grad_norm": 0.5870964527130127, + "learning_rate": 1.983619906947144e-05, + "loss": 0.2649, + "step": 167 + }, + { + "epoch": 0.4583901773533424, + "grad_norm": 0.6157024502754211, + "learning_rate": 1.9830417694279766e-05, + "loss": 0.2576, + "step": 168 + }, + { + "epoch": 0.461118690313779, + "grad_norm": 0.5841497778892517, + "learning_rate": 1.9824536925707622e-05, + "loss": 0.2506, + "step": 169 + }, + { + "epoch": 0.4638472032742155, + "grad_norm": 0.616535484790802, + "learning_rate": 1.981855682321427e-05, + "loss": 0.2556, + "step": 170 + }, + { + "epoch": 0.4665757162346521, + "grad_norm": 0.6029694080352783, + "learning_rate": 1.9812477447263324e-05, + "loss": 0.2567, + "step": 171 + }, + { + "epoch": 0.4693042291950887, + "grad_norm": 0.5664442777633667, + "learning_rate": 1.9806298859322143e-05, + "loss": 0.2549, + "step": 172 + }, + { + "epoch": 0.47203274215552526, + "grad_norm": 0.5878991484642029, + "learning_rate": 1.980002112186118e-05, + "loss": 0.2608, + "step": 173 + }, + { + "epoch": 0.4747612551159618, + "grad_norm": 0.5764362812042236, + "learning_rate": 1.979364429835339e-05, + "loss": 0.2514, + "step": 174 + }, + { + "epoch": 0.47748976807639837, + "grad_norm": 0.6009331345558167, + "learning_rate": 1.9787168453273546e-05, + "loss": 0.2488, + "step": 175 + }, + { + "epoch": 0.48021828103683495, + "grad_norm": 0.6152218580245972, + "learning_rate": 1.978059365209762e-05, + "loss": 0.2608, + "step": 176 + }, + { + "epoch": 0.4829467939972715, + "grad_norm": 0.5641770958900452, + "learning_rate": 1.9773919961302113e-05, + "loss": 0.2612, + "step": 177 + }, + { + "epoch": 0.48567530695770805, + "grad_norm": 0.5944207906723022, + "learning_rate": 1.9767147448363366e-05, + "loss": 0.2635, + "step": 178 + }, + { + "epoch": 0.48840381991814463, + "grad_norm": 0.5424714684486389, + "learning_rate": 1.9760276181756905e-05, + "loss": 0.2566, + "step": 179 + }, + { + "epoch": 0.49113233287858116, + "grad_norm": 0.6175258755683899, + "learning_rate": 1.975330623095672e-05, + "loss": 0.2543, + "step": 180 + }, + { + "epoch": 0.49386084583901774, + "grad_norm": 0.5744900107383728, + "learning_rate": 1.9746237666434588e-05, + "loss": 0.2471, + "step": 181 + }, + { + "epoch": 0.4965893587994543, + "grad_norm": 0.5477325320243835, + "learning_rate": 1.9739070559659347e-05, + "loss": 0.255, + "step": 182 + }, + { + "epoch": 0.49931787175989084, + "grad_norm": 0.599983811378479, + "learning_rate": 1.973180498309618e-05, + "loss": 0.252, + "step": 183 + }, + { + "epoch": 0.5020463847203275, + "grad_norm": 0.5861368179321289, + "learning_rate": 1.9724441010205865e-05, + "loss": 0.2527, + "step": 184 + }, + { + "epoch": 0.504774897680764, + "grad_norm": 0.5599756836891174, + "learning_rate": 1.9716978715444056e-05, + "loss": 0.249, + "step": 185 + }, + { + "epoch": 0.5075034106412005, + "grad_norm": 0.5516128540039062, + "learning_rate": 1.9709418174260523e-05, + "loss": 0.2433, + "step": 186 + }, + { + "epoch": 0.5102319236016372, + "grad_norm": 0.5438660383224487, + "learning_rate": 1.9701759463098377e-05, + "loss": 0.2479, + "step": 187 + }, + { + "epoch": 0.5129604365620737, + "grad_norm": 0.5530990362167358, + "learning_rate": 1.9694002659393306e-05, + "loss": 0.2503, + "step": 188 + }, + { + "epoch": 0.5156889495225102, 
+ "grad_norm": 0.5980175733566284, + "learning_rate": 1.9686147841572803e-05, + "loss": 0.2452, + "step": 189 + }, + { + "epoch": 0.5184174624829468, + "grad_norm": 0.5309689044952393, + "learning_rate": 1.9678195089055347e-05, + "loss": 0.2381, + "step": 190 + }, + { + "epoch": 0.5211459754433834, + "grad_norm": 0.5269597172737122, + "learning_rate": 1.967014448224963e-05, + "loss": 0.24, + "step": 191 + }, + { + "epoch": 0.5238744884038199, + "grad_norm": 0.5427187085151672, + "learning_rate": 1.9661996102553716e-05, + "loss": 0.2453, + "step": 192 + }, + { + "epoch": 0.5266030013642565, + "grad_norm": 0.5384966731071472, + "learning_rate": 1.965375003235424e-05, + "loss": 0.2487, + "step": 193 + }, + { + "epoch": 0.529331514324693, + "grad_norm": 0.5402637124061584, + "learning_rate": 1.9645406355025565e-05, + "loss": 0.2412, + "step": 194 + }, + { + "epoch": 0.5320600272851296, + "grad_norm": 0.556141197681427, + "learning_rate": 1.9636965154928932e-05, + "loss": 0.2453, + "step": 195 + }, + { + "epoch": 0.5347885402455662, + "grad_norm": 0.5466804504394531, + "learning_rate": 1.9628426517411625e-05, + "loss": 0.2418, + "step": 196 + }, + { + "epoch": 0.5375170532060027, + "grad_norm": 0.5225583910942078, + "learning_rate": 1.9619790528806092e-05, + "loss": 0.242, + "step": 197 + }, + { + "epoch": 0.5402455661664393, + "grad_norm": 0.5193293690681458, + "learning_rate": 1.9611057276429085e-05, + "loss": 0.2352, + "step": 198 + }, + { + "epoch": 0.5429740791268759, + "grad_norm": 0.5324943661689758, + "learning_rate": 1.9602226848580762e-05, + "loss": 0.2473, + "step": 199 + }, + { + "epoch": 0.5457025920873124, + "grad_norm": 0.5119441747665405, + "learning_rate": 1.959329933454381e-05, + "loss": 0.2391, + "step": 200 + }, + { + "epoch": 0.548431105047749, + "grad_norm": 0.5389058589935303, + "learning_rate": 1.958427482458253e-05, + "loss": 0.239, + "step": 201 + }, + { + "epoch": 0.5511596180081856, + "grad_norm": 0.5182425379753113, + "learning_rate": 1.957515340994193e-05, + "loss": 0.2392, + "step": 202 + }, + { + "epoch": 0.5538881309686221, + "grad_norm": 0.5547599196434021, + "learning_rate": 1.95659351828468e-05, + "loss": 0.2439, + "step": 203 + }, + { + "epoch": 0.5566166439290586, + "grad_norm": 0.5299258232116699, + "learning_rate": 1.9556620236500794e-05, + "loss": 0.2423, + "step": 204 + }, + { + "epoch": 0.5593451568894953, + "grad_norm": 0.5086020231246948, + "learning_rate": 1.954720866508546e-05, + "loss": 0.234, + "step": 205 + }, + { + "epoch": 0.5620736698499318, + "grad_norm": 0.5384035706520081, + "learning_rate": 1.9537700563759303e-05, + "loss": 0.2405, + "step": 206 + }, + { + "epoch": 0.5648021828103683, + "grad_norm": 0.5312212705612183, + "learning_rate": 1.9528096028656835e-05, + "loss": 0.2419, + "step": 207 + }, + { + "epoch": 0.567530695770805, + "grad_norm": 0.5274946093559265, + "learning_rate": 1.9518395156887574e-05, + "loss": 0.2318, + "step": 208 + }, + { + "epoch": 0.5702592087312415, + "grad_norm": 0.5244715213775635, + "learning_rate": 1.9508598046535095e-05, + "loss": 0.2363, + "step": 209 + }, + { + "epoch": 0.572987721691678, + "grad_norm": 0.529675304889679, + "learning_rate": 1.949870479665602e-05, + "loss": 0.242, + "step": 210 + }, + { + "epoch": 0.5757162346521146, + "grad_norm": 0.510682225227356, + "learning_rate": 1.9488715507279e-05, + "loss": 0.2368, + "step": 211 + }, + { + "epoch": 0.5784447476125512, + "grad_norm": 0.5228487253189087, + "learning_rate": 1.9478630279403737e-05, + "loss": 0.2319, + "step": 212 + }, + { + "epoch": 
0.5811732605729877, + "grad_norm": 0.5426226854324341, + "learning_rate": 1.9468449214999956e-05, + "loss": 0.239, + "step": 213 + }, + { + "epoch": 0.5839017735334243, + "grad_norm": 0.5186337828636169, + "learning_rate": 1.9458172417006347e-05, + "loss": 0.2386, + "step": 214 + }, + { + "epoch": 0.5866302864938608, + "grad_norm": 0.5370061993598938, + "learning_rate": 1.9447799989329557e-05, + "loss": 0.2387, + "step": 215 + }, + { + "epoch": 0.5893587994542974, + "grad_norm": 0.5024000406265259, + "learning_rate": 1.943733203684312e-05, + "loss": 0.2317, + "step": 216 + }, + { + "epoch": 0.592087312414734, + "grad_norm": 0.5344046354293823, + "learning_rate": 1.9426768665386397e-05, + "loss": 0.2308, + "step": 217 + }, + { + "epoch": 0.5948158253751705, + "grad_norm": 0.4995371699333191, + "learning_rate": 1.9416109981763526e-05, + "loss": 0.2342, + "step": 218 + }, + { + "epoch": 0.597544338335607, + "grad_norm": 0.5385648608207703, + "learning_rate": 1.9405356093742314e-05, + "loss": 0.2313, + "step": 219 + }, + { + "epoch": 0.6002728512960437, + "grad_norm": 0.5008872747421265, + "learning_rate": 1.939450711005316e-05, + "loss": 0.2365, + "step": 220 + }, + { + "epoch": 0.6030013642564802, + "grad_norm": 0.504681408405304, + "learning_rate": 1.9383563140387966e-05, + "loss": 0.2333, + "step": 221 + }, + { + "epoch": 0.6057298772169167, + "grad_norm": 0.49399399757385254, + "learning_rate": 1.9372524295399014e-05, + "loss": 0.2352, + "step": 222 + }, + { + "epoch": 0.6084583901773534, + "grad_norm": 0.5233116149902344, + "learning_rate": 1.9361390686697847e-05, + "loss": 0.2351, + "step": 223 + }, + { + "epoch": 0.6111869031377899, + "grad_norm": 0.5101408958435059, + "learning_rate": 1.9350162426854152e-05, + "loss": 0.2335, + "step": 224 + }, + { + "epoch": 0.6139154160982264, + "grad_norm": 0.5167925357818604, + "learning_rate": 1.9338839629394606e-05, + "loss": 0.233, + "step": 225 + }, + { + "epoch": 0.616643929058663, + "grad_norm": 0.4952821731567383, + "learning_rate": 1.9327422408801744e-05, + "loss": 0.2356, + "step": 226 + }, + { + "epoch": 0.6193724420190996, + "grad_norm": 0.48435306549072266, + "learning_rate": 1.9315910880512792e-05, + "loss": 0.2293, + "step": 227 + }, + { + "epoch": 0.6221009549795361, + "grad_norm": 0.5238944292068481, + "learning_rate": 1.93043051609185e-05, + "loss": 0.2294, + "step": 228 + }, + { + "epoch": 0.6248294679399727, + "grad_norm": 0.48778635263442993, + "learning_rate": 1.929260536736198e-05, + "loss": 0.2357, + "step": 229 + }, + { + "epoch": 0.6275579809004093, + "grad_norm": 0.5128819942474365, + "learning_rate": 1.9280811618137486e-05, + "loss": 0.2283, + "step": 230 + }, + { + "epoch": 0.6302864938608458, + "grad_norm": 0.49600908160209656, + "learning_rate": 1.926892403248925e-05, + "loss": 0.2225, + "step": 231 + }, + { + "epoch": 0.6330150068212824, + "grad_norm": 0.49010199308395386, + "learning_rate": 1.9256942730610268e-05, + "loss": 0.2301, + "step": 232 + }, + { + "epoch": 0.635743519781719, + "grad_norm": 0.5124602913856506, + "learning_rate": 1.9244867833641078e-05, + "loss": 0.2334, + "step": 233 + }, + { + "epoch": 0.6384720327421555, + "grad_norm": 0.4958963394165039, + "learning_rate": 1.9232699463668543e-05, + "loss": 0.2314, + "step": 234 + }, + { + "epoch": 0.6412005457025921, + "grad_norm": 0.4773724675178528, + "learning_rate": 1.9220437743724605e-05, + "loss": 0.2318, + "step": 235 + }, + { + "epoch": 0.6439290586630286, + "grad_norm": 0.4998438060283661, + "learning_rate": 1.9208082797785057e-05, + "loss": 
0.22, + "step": 236 + }, + { + "epoch": 0.6466575716234653, + "grad_norm": 0.48424261808395386, + "learning_rate": 1.9195634750768276e-05, + "loss": 0.2156, + "step": 237 + }, + { + "epoch": 0.6493860845839018, + "grad_norm": 0.5186326503753662, + "learning_rate": 1.9183093728533966e-05, + "loss": 0.2338, + "step": 238 + }, + { + "epoch": 0.6521145975443383, + "grad_norm": 0.49726244807243347, + "learning_rate": 1.9170459857881888e-05, + "loss": 0.2256, + "step": 239 + }, + { + "epoch": 0.654843110504775, + "grad_norm": 0.4714222550392151, + "learning_rate": 1.9157733266550577e-05, + "loss": 0.2259, + "step": 240 + }, + { + "epoch": 0.6575716234652115, + "grad_norm": 0.5003750324249268, + "learning_rate": 1.9144914083216036e-05, + "loss": 0.2253, + "step": 241 + }, + { + "epoch": 0.660300136425648, + "grad_norm": 0.4727269411087036, + "learning_rate": 1.913200243749046e-05, + "loss": 0.2258, + "step": 242 + }, + { + "epoch": 0.6630286493860846, + "grad_norm": 0.5212213397026062, + "learning_rate": 1.91189984599209e-05, + "loss": 0.2322, + "step": 243 + }, + { + "epoch": 0.6657571623465212, + "grad_norm": 0.5002415776252747, + "learning_rate": 1.910590228198798e-05, + "loss": 0.2277, + "step": 244 + }, + { + "epoch": 0.6684856753069577, + "grad_norm": 0.4715561270713806, + "learning_rate": 1.9092714036104508e-05, + "loss": 0.2317, + "step": 245 + }, + { + "epoch": 0.6712141882673943, + "grad_norm": 0.47772514820098877, + "learning_rate": 1.9079433855614203e-05, + "loss": 0.2247, + "step": 246 + }, + { + "epoch": 0.6739427012278308, + "grad_norm": 0.47400856018066406, + "learning_rate": 1.9066061874790302e-05, + "loss": 0.2254, + "step": 247 + }, + { + "epoch": 0.6766712141882674, + "grad_norm": 0.4679079055786133, + "learning_rate": 1.9052598228834217e-05, + "loss": 0.2167, + "step": 248 + }, + { + "epoch": 0.679399727148704, + "grad_norm": 0.48590168356895447, + "learning_rate": 1.9039043053874175e-05, + "loss": 0.2216, + "step": 249 + }, + { + "epoch": 0.6821282401091405, + "grad_norm": 0.4846552610397339, + "learning_rate": 1.9025396486963827e-05, + "loss": 0.2247, + "step": 250 + }, + { + "epoch": 0.684856753069577, + "grad_norm": 0.4776105582714081, + "learning_rate": 1.9011658666080873e-05, + "loss": 0.2278, + "step": 251 + }, + { + "epoch": 0.6875852660300137, + "grad_norm": 0.4800094664096832, + "learning_rate": 1.8997829730125662e-05, + "loss": 0.2276, + "step": 252 + }, + { + "epoch": 0.6903137789904502, + "grad_norm": 0.47760075330734253, + "learning_rate": 1.898390981891979e-05, + "loss": 0.2189, + "step": 253 + }, + { + "epoch": 0.6930422919508867, + "grad_norm": 0.4844151735305786, + "learning_rate": 1.8969899073204687e-05, + "loss": 0.2236, + "step": 254 + }, + { + "epoch": 0.6957708049113234, + "grad_norm": 0.475306898355484, + "learning_rate": 1.895579763464019e-05, + "loss": 0.224, + "step": 255 + }, + { + "epoch": 0.6984993178717599, + "grad_norm": 0.4959429204463959, + "learning_rate": 1.8941605645803115e-05, + "loss": 0.2237, + "step": 256 + }, + { + "epoch": 0.7012278308321964, + "grad_norm": 0.4867900609970093, + "learning_rate": 1.8927323250185815e-05, + "loss": 0.2319, + "step": 257 + }, + { + "epoch": 0.703956343792633, + "grad_norm": 0.4814487099647522, + "learning_rate": 1.891295059219472e-05, + "loss": 0.2186, + "step": 258 + }, + { + "epoch": 0.7066848567530696, + "grad_norm": 0.4731517732143402, + "learning_rate": 1.88984878171489e-05, + "loss": 0.2205, + "step": 259 + }, + { + "epoch": 0.7094133697135061, + "grad_norm": 0.45691797137260437, + "learning_rate": 
1.888393507127856e-05, + "loss": 0.2107, + "step": 260 + }, + { + "epoch": 0.7121418826739427, + "grad_norm": 0.5063825249671936, + "learning_rate": 1.8869292501723602e-05, + "loss": 0.2269, + "step": 261 + }, + { + "epoch": 0.7148703956343793, + "grad_norm": 0.4973117411136627, + "learning_rate": 1.8854560256532098e-05, + "loss": 0.2248, + "step": 262 + }, + { + "epoch": 0.7175989085948158, + "grad_norm": 0.49024084210395813, + "learning_rate": 1.8839738484658835e-05, + "loss": 0.2268, + "step": 263 + }, + { + "epoch": 0.7203274215552524, + "grad_norm": 0.4630759060382843, + "learning_rate": 1.8824827335963767e-05, + "loss": 0.2109, + "step": 264 + }, + { + "epoch": 0.723055934515689, + "grad_norm": 0.477979451417923, + "learning_rate": 1.8809826961210527e-05, + "loss": 0.2216, + "step": 265 + }, + { + "epoch": 0.7257844474761255, + "grad_norm": 0.4574492871761322, + "learning_rate": 1.879473751206489e-05, + "loss": 0.2182, + "step": 266 + }, + { + "epoch": 0.7285129604365621, + "grad_norm": 0.5049831867218018, + "learning_rate": 1.8779559141093256e-05, + "loss": 0.2238, + "step": 267 + }, + { + "epoch": 0.7312414733969986, + "grad_norm": 0.46383431553840637, + "learning_rate": 1.876429200176108e-05, + "loss": 0.2202, + "step": 268 + }, + { + "epoch": 0.7339699863574352, + "grad_norm": 0.48974180221557617, + "learning_rate": 1.8748936248431353e-05, + "loss": 0.2184, + "step": 269 + }, + { + "epoch": 0.7366984993178718, + "grad_norm": 0.4690982699394226, + "learning_rate": 1.8733492036363007e-05, + "loss": 0.2219, + "step": 270 + }, + { + "epoch": 0.7394270122783083, + "grad_norm": 0.48219865560531616, + "learning_rate": 1.871795952170937e-05, + "loss": 0.2209, + "step": 271 + }, + { + "epoch": 0.7421555252387448, + "grad_norm": 0.4637184739112854, + "learning_rate": 1.8702338861516587e-05, + "loss": 0.2131, + "step": 272 + }, + { + "epoch": 0.7448840381991815, + "grad_norm": 0.45264101028442383, + "learning_rate": 1.8686630213722015e-05, + "loss": 0.2167, + "step": 273 + }, + { + "epoch": 0.747612551159618, + "grad_norm": 0.4602806270122528, + "learning_rate": 1.867083373715264e-05, + "loss": 0.2194, + "step": 274 + }, + { + "epoch": 0.7503410641200545, + "grad_norm": 0.47461724281311035, + "learning_rate": 1.8654949591523467e-05, + "loss": 0.2195, + "step": 275 + }, + { + "epoch": 0.7530695770804912, + "grad_norm": 0.4658590257167816, + "learning_rate": 1.86389779374359e-05, + "loss": 0.2285, + "step": 276 + }, + { + "epoch": 0.7557980900409277, + "grad_norm": 0.4593490660190582, + "learning_rate": 1.8622918936376133e-05, + "loss": 0.2113, + "step": 277 + }, + { + "epoch": 0.7585266030013642, + "grad_norm": 0.4694429636001587, + "learning_rate": 1.8606772750713503e-05, + "loss": 0.2222, + "step": 278 + }, + { + "epoch": 0.7612551159618008, + "grad_norm": 0.4493769407272339, + "learning_rate": 1.8590539543698852e-05, + "loss": 0.2151, + "step": 279 + }, + { + "epoch": 0.7639836289222374, + "grad_norm": 0.4646337628364563, + "learning_rate": 1.857421947946288e-05, + "loss": 0.2208, + "step": 280 + }, + { + "epoch": 0.7667121418826739, + "grad_norm": 0.4551270008087158, + "learning_rate": 1.8557812723014476e-05, + "loss": 0.2113, + "step": 281 + }, + { + "epoch": 0.7694406548431105, + "grad_norm": 0.46589139103889465, + "learning_rate": 1.8541319440239066e-05, + "loss": 0.2207, + "step": 282 + }, + { + "epoch": 0.772169167803547, + "grad_norm": 0.4452350437641144, + "learning_rate": 1.8524739797896924e-05, + "loss": 0.2169, + "step": 283 + }, + { + "epoch": 0.7748976807639836, + 
"grad_norm": 0.49259626865386963, + "learning_rate": 1.8508073963621482e-05, + "loss": 0.2192, + "step": 284 + }, + { + "epoch": 0.7776261937244202, + "grad_norm": 0.450286328792572, + "learning_rate": 1.8491322105917645e-05, + "loss": 0.2187, + "step": 285 + }, + { + "epoch": 0.7803547066848567, + "grad_norm": 0.4535258710384369, + "learning_rate": 1.847448439416009e-05, + "loss": 0.218, + "step": 286 + }, + { + "epoch": 0.7830832196452933, + "grad_norm": 0.44347622990608215, + "learning_rate": 1.845756099859154e-05, + "loss": 0.2154, + "step": 287 + }, + { + "epoch": 0.7858117326057299, + "grad_norm": 0.45118656754493713, + "learning_rate": 1.8440552090321047e-05, + "loss": 0.21, + "step": 288 + }, + { + "epoch": 0.7885402455661664, + "grad_norm": 0.4592891335487366, + "learning_rate": 1.842345784132227e-05, + "loss": 0.2165, + "step": 289 + }, + { + "epoch": 0.791268758526603, + "grad_norm": 0.499129056930542, + "learning_rate": 1.8406278424431737e-05, + "loss": 0.2165, + "step": 290 + }, + { + "epoch": 0.7939972714870396, + "grad_norm": 0.4674071669578552, + "learning_rate": 1.838901401334708e-05, + "loss": 0.2183, + "step": 291 + }, + { + "epoch": 0.7967257844474761, + "grad_norm": 0.4629931151866913, + "learning_rate": 1.8371664782625287e-05, + "loss": 0.2164, + "step": 292 + }, + { + "epoch": 0.7994542974079127, + "grad_norm": 0.45491263270378113, + "learning_rate": 1.835423090768096e-05, + "loss": 0.2201, + "step": 293 + }, + { + "epoch": 0.8021828103683493, + "grad_norm": 0.4651404619216919, + "learning_rate": 1.8336712564784506e-05, + "loss": 0.2182, + "step": 294 + }, + { + "epoch": 0.8049113233287858, + "grad_norm": 0.4813602566719055, + "learning_rate": 1.8319109931060367e-05, + "loss": 0.2211, + "step": 295 + }, + { + "epoch": 0.8076398362892224, + "grad_norm": 0.4485262334346771, + "learning_rate": 1.8301423184485253e-05, + "loss": 0.2095, + "step": 296 + }, + { + "epoch": 0.810368349249659, + "grad_norm": 0.4614250361919403, + "learning_rate": 1.82836525038863e-05, + "loss": 0.2111, + "step": 297 + }, + { + "epoch": 0.8130968622100955, + "grad_norm": 0.47213491797447205, + "learning_rate": 1.8265798068939295e-05, + "loss": 0.216, + "step": 298 + }, + { + "epoch": 0.8158253751705321, + "grad_norm": 0.44635578989982605, + "learning_rate": 1.824786006016685e-05, + "loss": 0.208, + "step": 299 + }, + { + "epoch": 0.8185538881309686, + "grad_norm": 0.4536116123199463, + "learning_rate": 1.8229838658936566e-05, + "loss": 0.2105, + "step": 300 + }, + { + "epoch": 0.8212824010914052, + "grad_norm": 0.4396938681602478, + "learning_rate": 1.821173404745922e-05, + "loss": 0.2116, + "step": 301 + }, + { + "epoch": 0.8240109140518418, + "grad_norm": 0.4701569080352783, + "learning_rate": 1.81935464087869e-05, + "loss": 0.2143, + "step": 302 + }, + { + "epoch": 0.8267394270122783, + "grad_norm": 0.46065476536750793, + "learning_rate": 1.8175275926811173e-05, + "loss": 0.2163, + "step": 303 + }, + { + "epoch": 0.8294679399727148, + "grad_norm": 0.444499135017395, + "learning_rate": 1.815692278626122e-05, + "loss": 0.2109, + "step": 304 + }, + { + "epoch": 0.8321964529331515, + "grad_norm": 0.4454192519187927, + "learning_rate": 1.813848717270195e-05, + "loss": 0.2154, + "step": 305 + }, + { + "epoch": 0.834924965893588, + "grad_norm": 0.44666001200675964, + "learning_rate": 1.8119969272532164e-05, + "loss": 0.2158, + "step": 306 + }, + { + "epoch": 0.8376534788540245, + "grad_norm": 0.43889233469963074, + "learning_rate": 1.8101369272982633e-05, + "loss": 0.2121, + "step": 307 + }, + { + 
"epoch": 0.8403819918144612, + "grad_norm": 0.47276782989501953, + "learning_rate": 1.808268736211421e-05, + "loss": 0.215, + "step": 308 + }, + { + "epoch": 0.8431105047748977, + "grad_norm": 0.4451788365840912, + "learning_rate": 1.806392372881596e-05, + "loss": 0.217, + "step": 309 + }, + { + "epoch": 0.8458390177353342, + "grad_norm": 0.43395736813545227, + "learning_rate": 1.8045078562803203e-05, + "loss": 0.2137, + "step": 310 + }, + { + "epoch": 0.8485675306957708, + "grad_norm": 0.4520686864852905, + "learning_rate": 1.8026152054615633e-05, + "loss": 0.2133, + "step": 311 + }, + { + "epoch": 0.8512960436562074, + "grad_norm": 0.42937785387039185, + "learning_rate": 1.800714439561538e-05, + "loss": 0.2133, + "step": 312 + }, + { + "epoch": 0.8540245566166439, + "grad_norm": 0.4670831859111786, + "learning_rate": 1.7988055777985066e-05, + "loss": 0.2113, + "step": 313 + }, + { + "epoch": 0.8567530695770805, + "grad_norm": 0.4341495931148529, + "learning_rate": 1.7968886394725876e-05, + "loss": 0.2072, + "step": 314 + }, + { + "epoch": 0.859481582537517, + "grad_norm": 0.44559651613235474, + "learning_rate": 1.7949636439655592e-05, + "loss": 0.2173, + "step": 315 + }, + { + "epoch": 0.8622100954979536, + "grad_norm": 0.4362984299659729, + "learning_rate": 1.793030610740665e-05, + "loss": 0.2092, + "step": 316 + }, + { + "epoch": 0.8649386084583902, + "grad_norm": 0.4457165598869324, + "learning_rate": 1.7910895593424166e-05, + "loss": 0.2043, + "step": 317 + }, + { + "epoch": 0.8676671214188267, + "grad_norm": 0.4453994333744049, + "learning_rate": 1.789140509396394e-05, + "loss": 0.2149, + "step": 318 + }, + { + "epoch": 0.8703956343792633, + "grad_norm": 0.42760151624679565, + "learning_rate": 1.7871834806090502e-05, + "loss": 0.2081, + "step": 319 + }, + { + "epoch": 0.8731241473396999, + "grad_norm": 0.4355124533176422, + "learning_rate": 1.7852184927675113e-05, + "loss": 0.2087, + "step": 320 + }, + { + "epoch": 0.8758526603001364, + "grad_norm": 0.4316492974758148, + "learning_rate": 1.7832455657393745e-05, + "loss": 0.2062, + "step": 321 + }, + { + "epoch": 0.878581173260573, + "grad_norm": 0.43141260743141174, + "learning_rate": 1.7812647194725093e-05, + "loss": 0.2117, + "step": 322 + }, + { + "epoch": 0.8813096862210096, + "grad_norm": 0.44423919916152954, + "learning_rate": 1.7792759739948546e-05, + "loss": 0.2086, + "step": 323 + }, + { + "epoch": 0.8840381991814461, + "grad_norm": 0.4427170753479004, + "learning_rate": 1.777279349414217e-05, + "loss": 0.2065, + "step": 324 + }, + { + "epoch": 0.8867667121418826, + "grad_norm": 0.4190024733543396, + "learning_rate": 1.7752748659180662e-05, + "loss": 0.2104, + "step": 325 + }, + { + "epoch": 0.8894952251023193, + "grad_norm": 0.42245593667030334, + "learning_rate": 1.7732625437733338e-05, + "loss": 0.211, + "step": 326 + }, + { + "epoch": 0.8922237380627558, + "grad_norm": 0.4226483702659607, + "learning_rate": 1.771242403326204e-05, + "loss": 0.2093, + "step": 327 + }, + { + "epoch": 0.8949522510231923, + "grad_norm": 0.4086417853832245, + "learning_rate": 1.7692144650019125e-05, + "loss": 0.2046, + "step": 328 + }, + { + "epoch": 0.897680763983629, + "grad_norm": 0.432595431804657, + "learning_rate": 1.767178749304536e-05, + "loss": 0.2073, + "step": 329 + }, + { + "epoch": 0.9004092769440655, + "grad_norm": 0.44269153475761414, + "learning_rate": 1.765135276816787e-05, + "loss": 0.2129, + "step": 330 + }, + { + "epoch": 0.903137789904502, + "grad_norm": 0.42717134952545166, + "learning_rate": 1.7630840681998068e-05, + 
"loss": 0.2071, + "step": 331 + }, + { + "epoch": 0.9058663028649386, + "grad_norm": 0.4242989420890808, + "learning_rate": 1.7610251441929532e-05, + "loss": 0.2078, + "step": 332 + }, + { + "epoch": 0.9085948158253752, + "grad_norm": 0.4230920970439911, + "learning_rate": 1.758958525613594e-05, + "loss": 0.2064, + "step": 333 + }, + { + "epoch": 0.9113233287858117, + "grad_norm": 0.45626723766326904, + "learning_rate": 1.7568842333568952e-05, + "loss": 0.2147, + "step": 334 + }, + { + "epoch": 0.9140518417462483, + "grad_norm": 0.4308648109436035, + "learning_rate": 1.754802288395609e-05, + "loss": 0.2084, + "step": 335 + }, + { + "epoch": 0.9167803547066848, + "grad_norm": 0.4306204617023468, + "learning_rate": 1.7527127117798635e-05, + "loss": 0.2122, + "step": 336 + }, + { + "epoch": 0.9195088676671214, + "grad_norm": 0.4448958933353424, + "learning_rate": 1.750615524636948e-05, + "loss": 0.2056, + "step": 337 + }, + { + "epoch": 0.922237380627558, + "grad_norm": 0.4293544590473175, + "learning_rate": 1.7485107481711014e-05, + "loss": 0.2073, + "step": 338 + }, + { + "epoch": 0.9249658935879945, + "grad_norm": 0.45121076703071594, + "learning_rate": 1.7463984036632956e-05, + "loss": 0.2105, + "step": 339 + }, + { + "epoch": 0.927694406548431, + "grad_norm": 0.4212591350078583, + "learning_rate": 1.7442785124710227e-05, + "loss": 0.2065, + "step": 340 + }, + { + "epoch": 0.9304229195088677, + "grad_norm": 0.417019784450531, + "learning_rate": 1.742151096028076e-05, + "loss": 0.2117, + "step": 341 + }, + { + "epoch": 0.9331514324693042, + "grad_norm": 0.44269001483917236, + "learning_rate": 1.7400161758443377e-05, + "loss": 0.2098, + "step": 342 + }, + { + "epoch": 0.9358799454297408, + "grad_norm": 0.45144540071487427, + "learning_rate": 1.7378737735055562e-05, + "loss": 0.2031, + "step": 343 + }, + { + "epoch": 0.9386084583901774, + "grad_norm": 0.4351907968521118, + "learning_rate": 1.735723910673132e-05, + "loss": 0.2104, + "step": 344 + }, + { + "epoch": 0.9413369713506139, + "grad_norm": 0.42139601707458496, + "learning_rate": 1.7335666090838965e-05, + "loss": 0.2109, + "step": 345 + }, + { + "epoch": 0.9440654843110505, + "grad_norm": 0.42321038246154785, + "learning_rate": 1.7314018905498932e-05, + "loss": 0.207, + "step": 346 + }, + { + "epoch": 0.946793997271487, + "grad_norm": 0.409960001707077, + "learning_rate": 1.729229776958157e-05, + "loss": 0.2022, + "step": 347 + }, + { + "epoch": 0.9495225102319236, + "grad_norm": 0.42684659361839294, + "learning_rate": 1.7270502902704925e-05, + "loss": 0.2122, + "step": 348 + }, + { + "epoch": 0.9522510231923602, + "grad_norm": 0.4144516587257385, + "learning_rate": 1.7248634525232523e-05, + "loss": 0.2083, + "step": 349 + }, + { + "epoch": 0.9549795361527967, + "grad_norm": 0.431145578622818, + "learning_rate": 1.7226692858271133e-05, + "loss": 0.2113, + "step": 350 + }, + { + "epoch": 0.9577080491132333, + "grad_norm": 0.41966184973716736, + "learning_rate": 1.7204678123668556e-05, + "loss": 0.2064, + "step": 351 + }, + { + "epoch": 0.9604365620736699, + "grad_norm": 0.41143694519996643, + "learning_rate": 1.718259054401135e-05, + "loss": 0.2042, + "step": 352 + }, + { + "epoch": 0.9631650750341064, + "grad_norm": 0.40677252411842346, + "learning_rate": 1.71604303426226e-05, + "loss": 0.2079, + "step": 353 + }, + { + "epoch": 0.965893587994543, + "grad_norm": 0.41135165095329285, + "learning_rate": 1.7138197743559656e-05, + "loss": 0.2062, + "step": 354 + }, + { + "epoch": 0.9686221009549796, + "grad_norm": 0.42349115014076233, + 
"learning_rate": 1.7115892971611864e-05, + "loss": 0.2112, + "step": 355 + }, + { + "epoch": 0.9713506139154161, + "grad_norm": 0.4229361414909363, + "learning_rate": 1.7093516252298296e-05, + "loss": 0.2064, + "step": 356 + }, + { + "epoch": 0.9740791268758526, + "grad_norm": 0.4161188304424286, + "learning_rate": 1.7071067811865477e-05, + "loss": 0.2053, + "step": 357 + }, + { + "epoch": 0.9768076398362893, + "grad_norm": 0.4185996651649475, + "learning_rate": 1.7048547877285078e-05, + "loss": 0.2021, + "step": 358 + }, + { + "epoch": 0.9795361527967258, + "grad_norm": 0.4161181151866913, + "learning_rate": 1.7025956676251636e-05, + "loss": 0.2104, + "step": 359 + }, + { + "epoch": 0.9822646657571623, + "grad_norm": 0.41911646723747253, + "learning_rate": 1.7003294437180254e-05, + "loss": 0.2043, + "step": 360 + }, + { + "epoch": 0.984993178717599, + "grad_norm": 0.4234631061553955, + "learning_rate": 1.6980561389204285e-05, + "loss": 0.203, + "step": 361 + }, + { + "epoch": 0.9877216916780355, + "grad_norm": 0.43413975834846497, + "learning_rate": 1.695775776217301e-05, + "loss": 0.2091, + "step": 362 + }, + { + "epoch": 0.990450204638472, + "grad_norm": 0.4408881962299347, + "learning_rate": 1.6934883786649333e-05, + "loss": 0.205, + "step": 363 + }, + { + "epoch": 0.9931787175989086, + "grad_norm": 0.4064873456954956, + "learning_rate": 1.6911939693907422e-05, + "loss": 0.2018, + "step": 364 + }, + { + "epoch": 0.9959072305593452, + "grad_norm": 0.42045751214027405, + "learning_rate": 1.6888925715930396e-05, + "loss": 0.206, + "step": 365 + }, + { + "epoch": 0.9986357435197817, + "grad_norm": 0.42600393295288086, + "learning_rate": 1.686584208540797e-05, + "loss": 0.2106, + "step": 366 + }, + { + "epoch": 1.0013642564802183, + "grad_norm": 0.4242832064628601, + "learning_rate": 1.68426890357341e-05, + "loss": 0.1921, + "step": 367 + }, + { + "epoch": 1.004092769440655, + "grad_norm": 0.4603778123855591, + "learning_rate": 1.6819466801004622e-05, + "loss": 0.1769, + "step": 368 + }, + { + "epoch": 1.0068212824010914, + "grad_norm": 0.4325129985809326, + "learning_rate": 1.6796175616014894e-05, + "loss": 0.1739, + "step": 369 + }, + { + "epoch": 1.009549795361528, + "grad_norm": 0.4734160602092743, + "learning_rate": 1.6772815716257414e-05, + "loss": 0.172, + "step": 370 + }, + { + "epoch": 1.0122783083219646, + "grad_norm": 0.4947826862335205, + "learning_rate": 1.6749387337919434e-05, + "loss": 0.1713, + "step": 371 + }, + { + "epoch": 1.015006821282401, + "grad_norm": 0.5005810260772705, + "learning_rate": 1.672589071788059e-05, + "loss": 0.1732, + "step": 372 + }, + { + "epoch": 1.0177353342428377, + "grad_norm": 0.4360048472881317, + "learning_rate": 1.6702326093710493e-05, + "loss": 0.174, + "step": 373 + }, + { + "epoch": 1.0204638472032743, + "grad_norm": 0.44616055488586426, + "learning_rate": 1.6678693703666327e-05, + "loss": 0.1817, + "step": 374 + }, + { + "epoch": 1.0231923601637107, + "grad_norm": 0.42599302530288696, + "learning_rate": 1.6654993786690445e-05, + "loss": 0.1714, + "step": 375 + }, + { + "epoch": 1.0259208731241474, + "grad_norm": 0.42496803402900696, + "learning_rate": 1.6631226582407954e-05, + "loss": 0.1752, + "step": 376 + }, + { + "epoch": 1.028649386084584, + "grad_norm": 0.4142780005931854, + "learning_rate": 1.6607392331124282e-05, + "loss": 0.1689, + "step": 377 + }, + { + "epoch": 1.0313778990450204, + "grad_norm": 0.4593532681465149, + "learning_rate": 1.6583491273822763e-05, + "loss": 0.1788, + "step": 378 + }, + { + "epoch": 1.034106412005457, + 
"grad_norm": 0.44729867577552795, + "learning_rate": 1.6559523652162192e-05, + "loss": 0.1741, + "step": 379 + }, + { + "epoch": 1.0368349249658937, + "grad_norm": 0.46795180439949036, + "learning_rate": 1.653548970847438e-05, + "loss": 0.176, + "step": 380 + }, + { + "epoch": 1.03956343792633, + "grad_norm": 0.40044230222702026, + "learning_rate": 1.651138968576171e-05, + "loss": 0.1716, + "step": 381 + }, + { + "epoch": 1.0422919508867667, + "grad_norm": 0.42729461193084717, + "learning_rate": 1.6487223827694673e-05, + "loss": 0.1728, + "step": 382 + }, + { + "epoch": 1.0450204638472034, + "grad_norm": 0.41781216859817505, + "learning_rate": 1.646299237860941e-05, + "loss": 0.1789, + "step": 383 + }, + { + "epoch": 1.0477489768076398, + "grad_norm": 0.4189138114452362, + "learning_rate": 1.643869558350524e-05, + "loss": 0.1772, + "step": 384 + }, + { + "epoch": 1.0504774897680764, + "grad_norm": 0.4230622947216034, + "learning_rate": 1.6414333688042186e-05, + "loss": 0.1752, + "step": 385 + }, + { + "epoch": 1.053206002728513, + "grad_norm": 0.41642749309539795, + "learning_rate": 1.638990693853848e-05, + "loss": 0.1764, + "step": 386 + }, + { + "epoch": 1.0559345156889495, + "grad_norm": 0.4181545376777649, + "learning_rate": 1.6365415581968086e-05, + "loss": 0.1725, + "step": 387 + }, + { + "epoch": 1.058663028649386, + "grad_norm": 0.437956839799881, + "learning_rate": 1.6340859865958193e-05, + "loss": 0.1768, + "step": 388 + }, + { + "epoch": 1.0613915416098227, + "grad_norm": 0.4101565480232239, + "learning_rate": 1.631624003878672e-05, + "loss": 0.1706, + "step": 389 + }, + { + "epoch": 1.0641200545702592, + "grad_norm": 0.41338515281677246, + "learning_rate": 1.6291556349379794e-05, + "loss": 0.1729, + "step": 390 + }, + { + "epoch": 1.0668485675306958, + "grad_norm": 0.4137190580368042, + "learning_rate": 1.6266809047309253e-05, + "loss": 0.1797, + "step": 391 + }, + { + "epoch": 1.0695770804911324, + "grad_norm": 0.4079779088497162, + "learning_rate": 1.6241998382790095e-05, + "loss": 0.1741, + "step": 392 + }, + { + "epoch": 1.0723055934515688, + "grad_norm": 0.4103091061115265, + "learning_rate": 1.6217124606677973e-05, + "loss": 0.17, + "step": 393 + }, + { + "epoch": 1.0750341064120055, + "grad_norm": 0.3932867646217346, + "learning_rate": 1.6192187970466646e-05, + "loss": 0.1727, + "step": 394 + }, + { + "epoch": 1.077762619372442, + "grad_norm": 0.4191529154777527, + "learning_rate": 1.6167188726285433e-05, + "loss": 0.1747, + "step": 395 + }, + { + "epoch": 1.0804911323328785, + "grad_norm": 0.4195106327533722, + "learning_rate": 1.6142127126896682e-05, + "loss": 0.1716, + "step": 396 + }, + { + "epoch": 1.0832196452933152, + "grad_norm": 0.403487890958786, + "learning_rate": 1.611700342569319e-05, + "loss": 0.1757, + "step": 397 + }, + { + "epoch": 1.0859481582537518, + "grad_norm": 0.41124963760375977, + "learning_rate": 1.6091817876695655e-05, + "loss": 0.1729, + "step": 398 + }, + { + "epoch": 1.0886766712141882, + "grad_norm": 0.4118644595146179, + "learning_rate": 1.606657073455012e-05, + "loss": 0.169, + "step": 399 + }, + { + "epoch": 1.0914051841746248, + "grad_norm": 0.4055117666721344, + "learning_rate": 1.6041262254525362e-05, + "loss": 0.1741, + "step": 400 + }, + { + "epoch": 1.0941336971350615, + "grad_norm": 0.39282718300819397, + "learning_rate": 1.601589269251035e-05, + "loss": 0.174, + "step": 401 + }, + { + "epoch": 1.096862210095498, + "grad_norm": 0.41776013374328613, + "learning_rate": 1.599046230501163e-05, + "loss": 0.1801, + "step": 402 + }, + { 
+ "epoch": 1.0995907230559345, + "grad_norm": 0.40125736594200134, + "learning_rate": 1.5964971349150746e-05, + "loss": 0.173, + "step": 403 + }, + { + "epoch": 1.1023192360163712, + "grad_norm": 0.40280765295028687, + "learning_rate": 1.593942008266164e-05, + "loss": 0.1727, + "step": 404 + }, + { + "epoch": 1.1050477489768076, + "grad_norm": 0.4229116141796112, + "learning_rate": 1.591380876388804e-05, + "loss": 0.1787, + "step": 405 + }, + { + "epoch": 1.1077762619372442, + "grad_norm": 0.393923819065094, + "learning_rate": 1.5888137651780847e-05, + "loss": 0.1707, + "step": 406 + }, + { + "epoch": 1.1105047748976808, + "grad_norm": 0.40177997946739197, + "learning_rate": 1.5862407005895524e-05, + "loss": 0.1696, + "step": 407 + }, + { + "epoch": 1.1132332878581173, + "grad_norm": 0.385484516620636, + "learning_rate": 1.583661708638947e-05, + "loss": 0.1698, + "step": 408 + }, + { + "epoch": 1.115961800818554, + "grad_norm": 0.41319334506988525, + "learning_rate": 1.5810768154019386e-05, + "loss": 0.1708, + "step": 409 + }, + { + "epoch": 1.1186903137789905, + "grad_norm": 0.4051019251346588, + "learning_rate": 1.5784860470138633e-05, + "loss": 0.1725, + "step": 410 + }, + { + "epoch": 1.121418826739427, + "grad_norm": 0.4432063102722168, + "learning_rate": 1.5758894296694614e-05, + "loss": 0.1802, + "step": 411 + }, + { + "epoch": 1.1241473396998636, + "grad_norm": 0.4144032597541809, + "learning_rate": 1.573286989622609e-05, + "loss": 0.1768, + "step": 412 + }, + { + "epoch": 1.1268758526603002, + "grad_norm": 0.40015333890914917, + "learning_rate": 1.5706787531860557e-05, + "loss": 0.1737, + "step": 413 + }, + { + "epoch": 1.1296043656207366, + "grad_norm": 0.40774694085121155, + "learning_rate": 1.568064746731156e-05, + "loss": 0.1764, + "step": 414 + }, + { + "epoch": 1.1323328785811733, + "grad_norm": 0.41893133521080017, + "learning_rate": 1.565444996687605e-05, + "loss": 0.1751, + "step": 415 + }, + { + "epoch": 1.13506139154161, + "grad_norm": 0.41963663697242737, + "learning_rate": 1.5628195295431696e-05, + "loss": 0.1752, + "step": 416 + }, + { + "epoch": 1.1377899045020463, + "grad_norm": 0.41315096616744995, + "learning_rate": 1.5601883718434207e-05, + "loss": 0.1736, + "step": 417 + }, + { + "epoch": 1.140518417462483, + "grad_norm": 0.40985772013664246, + "learning_rate": 1.557551550191467e-05, + "loss": 0.1725, + "step": 418 + }, + { + "epoch": 1.1432469304229196, + "grad_norm": 0.39127951860427856, + "learning_rate": 1.554909091247682e-05, + "loss": 0.1713, + "step": 419 + }, + { + "epoch": 1.145975443383356, + "grad_norm": 0.42498472332954407, + "learning_rate": 1.5522610217294377e-05, + "loss": 0.1713, + "step": 420 + }, + { + "epoch": 1.1487039563437926, + "grad_norm": 0.39464524388313293, + "learning_rate": 1.549607368410834e-05, + "loss": 0.1697, + "step": 421 + }, + { + "epoch": 1.1514324693042293, + "grad_norm": 0.3958282172679901, + "learning_rate": 1.5469481581224274e-05, + "loss": 0.1744, + "step": 422 + }, + { + "epoch": 1.1541609822646657, + "grad_norm": 0.40584421157836914, + "learning_rate": 1.544283417750958e-05, + "loss": 0.1738, + "step": 423 + }, + { + "epoch": 1.1568894952251023, + "grad_norm": 0.4232189953327179, + "learning_rate": 1.5416131742390827e-05, + "loss": 0.1755, + "step": 424 + }, + { + "epoch": 1.159618008185539, + "grad_norm": 0.46699848771095276, + "learning_rate": 1.5389374545850973e-05, + "loss": 0.1742, + "step": 425 + }, + { + "epoch": 1.1623465211459754, + "grad_norm": 0.41914746165275574, + "learning_rate": 1.5362562858426655e-05, 
+ "loss": 0.1741, + "step": 426 + }, + { + "epoch": 1.165075034106412, + "grad_norm": 0.3851917088031769, + "learning_rate": 1.533569695120547e-05, + "loss": 0.1722, + "step": 427 + }, + { + "epoch": 1.1678035470668486, + "grad_norm": 0.4126559793949127, + "learning_rate": 1.530877709582321e-05, + "loss": 0.1731, + "step": 428 + }, + { + "epoch": 1.170532060027285, + "grad_norm": 0.3970852196216583, + "learning_rate": 1.5281803564461135e-05, + "loss": 0.1674, + "step": 429 + }, + { + "epoch": 1.1732605729877217, + "grad_norm": 0.409260094165802, + "learning_rate": 1.5254776629843204e-05, + "loss": 0.177, + "step": 430 + }, + { + "epoch": 1.1759890859481583, + "grad_norm": 0.4050799608230591, + "learning_rate": 1.522769656523333e-05, + "loss": 0.173, + "step": 431 + }, + { + "epoch": 1.1787175989085947, + "grad_norm": 0.39923250675201416, + "learning_rate": 1.5200563644432614e-05, + "loss": 0.1685, + "step": 432 + }, + { + "epoch": 1.1814461118690314, + "grad_norm": 0.39861583709716797, + "learning_rate": 1.5173378141776569e-05, + "loss": 0.1755, + "step": 433 + }, + { + "epoch": 1.184174624829468, + "grad_norm": 0.39778876304626465, + "learning_rate": 1.5146140332132359e-05, + "loss": 0.1734, + "step": 434 + }, + { + "epoch": 1.1869031377899044, + "grad_norm": 0.383896142244339, + "learning_rate": 1.5118850490896012e-05, + "loss": 0.1709, + "step": 435 + }, + { + "epoch": 1.189631650750341, + "grad_norm": 0.3906187117099762, + "learning_rate": 1.5091508893989633e-05, + "loss": 0.1713, + "step": 436 + }, + { + "epoch": 1.1923601637107777, + "grad_norm": 0.3931368589401245, + "learning_rate": 1.5064115817858622e-05, + "loss": 0.1773, + "step": 437 + }, + { + "epoch": 1.195088676671214, + "grad_norm": 0.4080300033092499, + "learning_rate": 1.5036671539468879e-05, + "loss": 0.1748, + "step": 438 + }, + { + "epoch": 1.1978171896316507, + "grad_norm": 0.4110572040081024, + "learning_rate": 1.5009176336303987e-05, + "loss": 0.1758, + "step": 439 + }, + { + "epoch": 1.2005457025920874, + "grad_norm": 0.38402611017227173, + "learning_rate": 1.4981630486362435e-05, + "loss": 0.1706, + "step": 440 + }, + { + "epoch": 1.2032742155525238, + "grad_norm": 0.4152999222278595, + "learning_rate": 1.4954034268154777e-05, + "loss": 0.1748, + "step": 441 + }, + { + "epoch": 1.2060027285129604, + "grad_norm": 0.3882010877132416, + "learning_rate": 1.4926387960700843e-05, + "loss": 0.1732, + "step": 442 + }, + { + "epoch": 1.208731241473397, + "grad_norm": 0.40996554493904114, + "learning_rate": 1.4898691843526897e-05, + "loss": 0.1773, + "step": 443 + }, + { + "epoch": 1.2114597544338335, + "grad_norm": 0.41083115339279175, + "learning_rate": 1.4870946196662822e-05, + "loss": 0.1727, + "step": 444 + }, + { + "epoch": 1.21418826739427, + "grad_norm": 0.39368975162506104, + "learning_rate": 1.4843151300639282e-05, + "loss": 0.1726, + "step": 445 + }, + { + "epoch": 1.2169167803547067, + "grad_norm": 0.4009232223033905, + "learning_rate": 1.4815307436484898e-05, + "loss": 0.1724, + "step": 446 + }, + { + "epoch": 1.2196452933151432, + "grad_norm": 0.3860257565975189, + "learning_rate": 1.4787414885723386e-05, + "loss": 0.1712, + "step": 447 + }, + { + "epoch": 1.2223738062755798, + "grad_norm": 0.37950554490089417, + "learning_rate": 1.4759473930370738e-05, + "loss": 0.1723, + "step": 448 + }, + { + "epoch": 1.2251023192360164, + "grad_norm": 0.3731255829334259, + "learning_rate": 1.4731484852932338e-05, + "loss": 0.1682, + "step": 449 + }, + { + "epoch": 1.2278308321964528, + "grad_norm": 0.39131075143814087, + 
"learning_rate": 1.4703447936400135e-05, + "loss": 0.169, + "step": 450 + }, + { + "epoch": 1.2305593451568895, + "grad_norm": 0.4061223566532135, + "learning_rate": 1.4675363464249763e-05, + "loss": 0.1769, + "step": 451 + }, + { + "epoch": 1.233287858117326, + "grad_norm": 0.3892337679862976, + "learning_rate": 1.4647231720437687e-05, + "loss": 0.173, + "step": 452 + }, + { + "epoch": 1.2360163710777625, + "grad_norm": 0.396017462015152, + "learning_rate": 1.461905298939832e-05, + "loss": 0.1737, + "step": 453 + }, + { + "epoch": 1.2387448840381992, + "grad_norm": 0.3907962739467621, + "learning_rate": 1.4590827556041158e-05, + "loss": 0.1699, + "step": 454 + }, + { + "epoch": 1.2414733969986358, + "grad_norm": 0.3987230658531189, + "learning_rate": 1.4562555705747894e-05, + "loss": 0.1755, + "step": 455 + }, + { + "epoch": 1.2442019099590724, + "grad_norm": 0.38845115900039673, + "learning_rate": 1.4534237724369534e-05, + "loss": 0.1743, + "step": 456 + }, + { + "epoch": 1.2469304229195088, + "grad_norm": 0.3965071737766266, + "learning_rate": 1.4505873898223498e-05, + "loss": 0.1729, + "step": 457 + }, + { + "epoch": 1.2496589358799455, + "grad_norm": 0.38698264956474304, + "learning_rate": 1.4477464514090745e-05, + "loss": 0.1711, + "step": 458 + }, + { + "epoch": 1.252387448840382, + "grad_norm": 0.39602330327033997, + "learning_rate": 1.4449009859212857e-05, + "loss": 0.1773, + "step": 459 + }, + { + "epoch": 1.2551159618008185, + "grad_norm": 0.3965539336204529, + "learning_rate": 1.4420510221289137e-05, + "loss": 0.1731, + "step": 460 + }, + { + "epoch": 1.2578444747612552, + "grad_norm": 0.3928743600845337, + "learning_rate": 1.4391965888473705e-05, + "loss": 0.1688, + "step": 461 + }, + { + "epoch": 1.2605729877216918, + "grad_norm": 0.3851865828037262, + "learning_rate": 1.4363377149372584e-05, + "loss": 0.1726, + "step": 462 + }, + { + "epoch": 1.2633015006821282, + "grad_norm": 0.3963593542575836, + "learning_rate": 1.4334744293040773e-05, + "loss": 0.1715, + "step": 463 + }, + { + "epoch": 1.2660300136425648, + "grad_norm": 0.37571755051612854, + "learning_rate": 1.430606760897934e-05, + "loss": 0.1723, + "step": 464 + }, + { + "epoch": 1.2687585266030013, + "grad_norm": 0.3996609151363373, + "learning_rate": 1.4277347387132482e-05, + "loss": 0.1706, + "step": 465 + }, + { + "epoch": 1.271487039563438, + "grad_norm": 0.403372585773468, + "learning_rate": 1.4248583917884595e-05, + "loss": 0.1714, + "step": 466 + }, + { + "epoch": 1.2742155525238745, + "grad_norm": 0.4006504714488983, + "learning_rate": 1.4219777492057349e-05, + "loss": 0.1712, + "step": 467 + }, + { + "epoch": 1.2769440654843112, + "grad_norm": 0.3809853494167328, + "learning_rate": 1.4190928400906731e-05, + "loss": 0.1652, + "step": 468 + }, + { + "epoch": 1.2796725784447476, + "grad_norm": 0.39101433753967285, + "learning_rate": 1.4162036936120115e-05, + "loss": 0.1725, + "step": 469 + }, + { + "epoch": 1.2824010914051842, + "grad_norm": 0.4025862514972687, + "learning_rate": 1.4133103389813302e-05, + "loss": 0.1736, + "step": 470 + }, + { + "epoch": 1.2851296043656206, + "grad_norm": 0.3864237666130066, + "learning_rate": 1.410412805452757e-05, + "loss": 0.1708, + "step": 471 + }, + { + "epoch": 1.2878581173260573, + "grad_norm": 0.37752318382263184, + "learning_rate": 1.4075111223226721e-05, + "loss": 0.1742, + "step": 472 + }, + { + "epoch": 1.290586630286494, + "grad_norm": 0.38657379150390625, + "learning_rate": 1.4046053189294114e-05, + "loss": 0.175, + "step": 473 + }, + { + "epoch": 
1.2933151432469305, + "grad_norm": 0.3829413652420044, + "learning_rate": 1.4016954246529697e-05, + "loss": 0.1702, + "step": 474 + }, + { + "epoch": 1.296043656207367, + "grad_norm": 0.3982481360435486, + "learning_rate": 1.3987814689147041e-05, + "loss": 0.1708, + "step": 475 + }, + { + "epoch": 1.2987721691678036, + "grad_norm": 0.3672139048576355, + "learning_rate": 1.3958634811770361e-05, + "loss": 0.1673, + "step": 476 + }, + { + "epoch": 1.30150068212824, + "grad_norm": 0.39407840371131897, + "learning_rate": 1.3929414909431544e-05, + "loss": 0.174, + "step": 477 + }, + { + "epoch": 1.3042291950886766, + "grad_norm": 0.4047017991542816, + "learning_rate": 1.3900155277567157e-05, + "loss": 0.1721, + "step": 478 + }, + { + "epoch": 1.3069577080491133, + "grad_norm": 0.3893299102783203, + "learning_rate": 1.3870856212015468e-05, + "loss": 0.1757, + "step": 479 + }, + { + "epoch": 1.30968622100955, + "grad_norm": 0.38834965229034424, + "learning_rate": 1.3841518009013446e-05, + "loss": 0.1762, + "step": 480 + }, + { + "epoch": 1.3124147339699863, + "grad_norm": 0.39646992087364197, + "learning_rate": 1.3812140965193775e-05, + "loss": 0.1771, + "step": 481 + }, + { + "epoch": 1.315143246930423, + "grad_norm": 0.37843289971351624, + "learning_rate": 1.378272537758185e-05, + "loss": 0.1746, + "step": 482 + }, + { + "epoch": 1.3178717598908594, + "grad_norm": 0.378549724817276, + "learning_rate": 1.3753271543592772e-05, + "loss": 0.1711, + "step": 483 + }, + { + "epoch": 1.320600272851296, + "grad_norm": 0.3854270279407501, + "learning_rate": 1.3723779761028349e-05, + "loss": 0.1747, + "step": 484 + }, + { + "epoch": 1.3233287858117326, + "grad_norm": 0.38288038969039917, + "learning_rate": 1.3694250328074072e-05, + "loss": 0.1783, + "step": 485 + }, + { + "epoch": 1.3260572987721693, + "grad_norm": 0.3755180537700653, + "learning_rate": 1.3664683543296114e-05, + "loss": 0.1706, + "step": 486 + }, + { + "epoch": 1.3287858117326057, + "grad_norm": 0.38838502764701843, + "learning_rate": 1.3635079705638298e-05, + "loss": 0.1734, + "step": 487 + }, + { + "epoch": 1.3315143246930423, + "grad_norm": 0.3818054497241974, + "learning_rate": 1.3605439114419095e-05, + "loss": 0.1694, + "step": 488 + }, + { + "epoch": 1.3342428376534787, + "grad_norm": 0.3814438283443451, + "learning_rate": 1.3575762069328567e-05, + "loss": 0.1744, + "step": 489 + }, + { + "epoch": 1.3369713506139154, + "grad_norm": 0.39339813590049744, + "learning_rate": 1.3546048870425356e-05, + "loss": 0.1722, + "step": 490 + }, + { + "epoch": 1.339699863574352, + "grad_norm": 0.380854070186615, + "learning_rate": 1.3516299818133664e-05, + "loss": 0.1695, + "step": 491 + }, + { + "epoch": 1.3424283765347886, + "grad_norm": 0.3817788064479828, + "learning_rate": 1.3486515213240188e-05, + "loss": 0.1712, + "step": 492 + }, + { + "epoch": 1.345156889495225, + "grad_norm": 0.38205572962760925, + "learning_rate": 1.3456695356891079e-05, + "loss": 0.1709, + "step": 493 + }, + { + "epoch": 1.3478854024556617, + "grad_norm": 0.3941291570663452, + "learning_rate": 1.3426840550588933e-05, + "loss": 0.1731, + "step": 494 + }, + { + "epoch": 1.350613915416098, + "grad_norm": 0.3804328143596649, + "learning_rate": 1.33969510961897e-05, + "loss": 0.1723, + "step": 495 + }, + { + "epoch": 1.3533424283765347, + "grad_norm": 0.3907669186592102, + "learning_rate": 1.3367027295899652e-05, + "loss": 0.1737, + "step": 496 + }, + { + "epoch": 1.3560709413369714, + "grad_norm": 0.3885650038719177, + "learning_rate": 1.3337069452272332e-05, + "loss": 
0.1726, + "step": 497 + }, + { + "epoch": 1.358799454297408, + "grad_norm": 0.38135266304016113, + "learning_rate": 1.3307077868205487e-05, + "loss": 0.1757, + "step": 498 + }, + { + "epoch": 1.3615279672578444, + "grad_norm": 0.36993685364723206, + "learning_rate": 1.3277052846937997e-05, + "loss": 0.1718, + "step": 499 + }, + { + "epoch": 1.364256480218281, + "grad_norm": 0.37970247864723206, + "learning_rate": 1.3246994692046837e-05, + "loss": 0.1746, + "step": 500 + }, + { + "epoch": 1.3669849931787175, + "grad_norm": 0.3805428445339203, + "learning_rate": 1.321690370744397e-05, + "loss": 0.1782, + "step": 501 + }, + { + "epoch": 1.369713506139154, + "grad_norm": 0.3677096962928772, + "learning_rate": 1.3186780197373306e-05, + "loss": 0.1691, + "step": 502 + }, + { + "epoch": 1.3724420190995907, + "grad_norm": 0.38988423347473145, + "learning_rate": 1.3156624466407607e-05, + "loss": 0.1708, + "step": 503 + }, + { + "epoch": 1.3751705320600274, + "grad_norm": 0.38532352447509766, + "learning_rate": 1.3126436819445423e-05, + "loss": 0.174, + "step": 504 + }, + { + "epoch": 1.3778990450204638, + "grad_norm": 0.3830656409263611, + "learning_rate": 1.309621756170799e-05, + "loss": 0.1737, + "step": 505 + }, + { + "epoch": 1.3806275579809004, + "grad_norm": 0.3703681230545044, + "learning_rate": 1.3065966998736155e-05, + "loss": 0.1687, + "step": 506 + }, + { + "epoch": 1.3833560709413368, + "grad_norm": 0.3779120445251465, + "learning_rate": 1.3035685436387297e-05, + "loss": 0.1687, + "step": 507 + }, + { + "epoch": 1.3860845839017735, + "grad_norm": 0.3700374364852905, + "learning_rate": 1.300537318083221e-05, + "loss": 0.173, + "step": 508 + }, + { + "epoch": 1.38881309686221, + "grad_norm": 0.38560089468955994, + "learning_rate": 1.297503053855203e-05, + "loss": 0.1706, + "step": 509 + }, + { + "epoch": 1.3915416098226467, + "grad_norm": 0.37079334259033203, + "learning_rate": 1.2944657816335124e-05, + "loss": 0.1715, + "step": 510 + }, + { + "epoch": 1.3942701227830832, + "grad_norm": 0.37695810198783875, + "learning_rate": 1.2914255321273987e-05, + "loss": 0.1746, + "step": 511 + }, + { + "epoch": 1.3969986357435198, + "grad_norm": 0.37229034304618835, + "learning_rate": 1.2883823360762149e-05, + "loss": 0.1692, + "step": 512 + }, + { + "epoch": 1.3997271487039564, + "grad_norm": 0.3717256486415863, + "learning_rate": 1.2853362242491054e-05, + "loss": 0.1722, + "step": 513 + }, + { + "epoch": 1.4024556616643928, + "grad_norm": 0.3814941346645355, + "learning_rate": 1.2822872274446958e-05, + "loss": 0.1692, + "step": 514 + }, + { + "epoch": 1.4051841746248295, + "grad_norm": 0.40019774436950684, + "learning_rate": 1.2792353764907803e-05, + "loss": 0.1736, + "step": 515 + }, + { + "epoch": 1.407912687585266, + "grad_norm": 0.3744332790374756, + "learning_rate": 1.276180702244012e-05, + "loss": 0.1698, + "step": 516 + }, + { + "epoch": 1.4106412005457025, + "grad_norm": 0.3784146010875702, + "learning_rate": 1.273123235589589e-05, + "loss": 0.1701, + "step": 517 + }, + { + "epoch": 1.4133697135061392, + "grad_norm": 0.371044784784317, + "learning_rate": 1.2700630074409427e-05, + "loss": 0.1661, + "step": 518 + }, + { + "epoch": 1.4160982264665758, + "grad_norm": 0.37950316071510315, + "learning_rate": 1.2670000487394268e-05, + "loss": 0.167, + "step": 519 + }, + { + "epoch": 1.4188267394270122, + "grad_norm": 0.3809194564819336, + "learning_rate": 1.2639343904540008e-05, + "loss": 0.172, + "step": 520 + }, + { + "epoch": 1.4215552523874488, + "grad_norm": 0.3829333186149597, + 
"learning_rate": 1.260866063580921e-05, + "loss": 0.1715, + "step": 521 + }, + { + "epoch": 1.4242837653478855, + "grad_norm": 0.4065547585487366, + "learning_rate": 1.2577950991434249e-05, + "loss": 0.1681, + "step": 522 + }, + { + "epoch": 1.427012278308322, + "grad_norm": 0.39221319556236267, + "learning_rate": 1.254721528191417e-05, + "loss": 0.1721, + "step": 523 + }, + { + "epoch": 1.4297407912687585, + "grad_norm": 0.3714672327041626, + "learning_rate": 1.2516453818011567e-05, + "loss": 0.1698, + "step": 524 + }, + { + "epoch": 1.4324693042291952, + "grad_norm": 0.38517430424690247, + "learning_rate": 1.2485666910749427e-05, + "loss": 0.1718, + "step": 525 + }, + { + "epoch": 1.4351978171896316, + "grad_norm": 0.3771488070487976, + "learning_rate": 1.2454854871407993e-05, + "loss": 0.1705, + "step": 526 + }, + { + "epoch": 1.4379263301500682, + "grad_norm": 0.39493903517723083, + "learning_rate": 1.242401801152161e-05, + "loss": 0.1725, + "step": 527 + }, + { + "epoch": 1.4406548431105048, + "grad_norm": 0.36978915333747864, + "learning_rate": 1.2393156642875579e-05, + "loss": 0.1746, + "step": 528 + }, + { + "epoch": 1.4433833560709413, + "grad_norm": 0.37171533703804016, + "learning_rate": 1.2362271077503007e-05, + "loss": 0.1681, + "step": 529 + }, + { + "epoch": 1.446111869031378, + "grad_norm": 0.36533740162849426, + "learning_rate": 1.2331361627681645e-05, + "loss": 0.1663, + "step": 530 + }, + { + "epoch": 1.4488403819918145, + "grad_norm": 0.3751925826072693, + "learning_rate": 1.2300428605930736e-05, + "loss": 0.1731, + "step": 531 + }, + { + "epoch": 1.451568894952251, + "grad_norm": 0.3715348541736603, + "learning_rate": 1.2269472325007858e-05, + "loss": 0.1699, + "step": 532 + }, + { + "epoch": 1.4542974079126876, + "grad_norm": 0.3716941773891449, + "learning_rate": 1.2238493097905754e-05, + "loss": 0.1698, + "step": 533 + }, + { + "epoch": 1.4570259208731242, + "grad_norm": 0.3676636517047882, + "learning_rate": 1.2207491237849174e-05, + "loss": 0.1678, + "step": 534 + }, + { + "epoch": 1.4597544338335606, + "grad_norm": 0.3839435875415802, + "learning_rate": 1.2176467058291699e-05, + "loss": 0.1699, + "step": 535 + }, + { + "epoch": 1.4624829467939973, + "grad_norm": 0.3931127190589905, + "learning_rate": 1.2145420872912586e-05, + "loss": 0.1773, + "step": 536 + }, + { + "epoch": 1.465211459754434, + "grad_norm": 0.3639325499534607, + "learning_rate": 1.2114352995613582e-05, + "loss": 0.1671, + "step": 537 + }, + { + "epoch": 1.4679399727148703, + "grad_norm": 0.3724636137485504, + "learning_rate": 1.2083263740515764e-05, + "loss": 0.168, + "step": 538 + }, + { + "epoch": 1.470668485675307, + "grad_norm": 0.3660443425178528, + "learning_rate": 1.2052153421956343e-05, + "loss": 0.1661, + "step": 539 + }, + { + "epoch": 1.4733969986357436, + "grad_norm": 0.36283448338508606, + "learning_rate": 1.2021022354485514e-05, + "loss": 0.1656, + "step": 540 + }, + { + "epoch": 1.4761255115961802, + "grad_norm": 0.3686031103134155, + "learning_rate": 1.1989870852863254e-05, + "loss": 0.1693, + "step": 541 + }, + { + "epoch": 1.4788540245566166, + "grad_norm": 0.3700789511203766, + "learning_rate": 1.1958699232056135e-05, + "loss": 0.1681, + "step": 542 + }, + { + "epoch": 1.4815825375170533, + "grad_norm": 0.37259745597839355, + "learning_rate": 1.1927507807234169e-05, + "loss": 0.1706, + "step": 543 + }, + { + "epoch": 1.4843110504774897, + "grad_norm": 0.3932602107524872, + "learning_rate": 1.1896296893767588e-05, + "loss": 0.1702, + "step": 544 + }, + { + "epoch": 
1.4870395634379263, + "grad_norm": 0.3860296308994293, + "learning_rate": 1.186506680722367e-05, + "loss": 0.1713, + "step": 545 + }, + { + "epoch": 1.489768076398363, + "grad_norm": 0.37699058651924133, + "learning_rate": 1.1833817863363563e-05, + "loss": 0.1683, + "step": 546 + }, + { + "epoch": 1.4924965893587996, + "grad_norm": 0.37668129801750183, + "learning_rate": 1.180255037813906e-05, + "loss": 0.1698, + "step": 547 + }, + { + "epoch": 1.495225102319236, + "grad_norm": 0.3677523732185364, + "learning_rate": 1.1771264667689428e-05, + "loss": 0.1678, + "step": 548 + }, + { + "epoch": 1.4979536152796726, + "grad_norm": 0.3719595670700073, + "learning_rate": 1.1739961048338213e-05, + "loss": 0.1636, + "step": 549 + }, + { + "epoch": 1.500682128240109, + "grad_norm": 0.3700611889362335, + "learning_rate": 1.1708639836590024e-05, + "loss": 0.1673, + "step": 550 + }, + { + "epoch": 1.5034106412005457, + "grad_norm": 0.37999942898750305, + "learning_rate": 1.1677301349127349e-05, + "loss": 0.1706, + "step": 551 + }, + { + "epoch": 1.5061391541609823, + "grad_norm": 0.37487849593162537, + "learning_rate": 1.164594590280734e-05, + "loss": 0.1715, + "step": 552 + }, + { + "epoch": 1.508867667121419, + "grad_norm": 0.37367403507232666, + "learning_rate": 1.161457381465863e-05, + "loss": 0.1698, + "step": 553 + }, + { + "epoch": 1.5115961800818554, + "grad_norm": 0.39386484026908875, + "learning_rate": 1.15831854018781e-05, + "loss": 0.1699, + "step": 554 + }, + { + "epoch": 1.514324693042292, + "grad_norm": 0.37812918424606323, + "learning_rate": 1.1551780981827699e-05, + "loss": 0.1684, + "step": 555 + }, + { + "epoch": 1.5170532060027284, + "grad_norm": 0.3666611611843109, + "learning_rate": 1.1520360872031208e-05, + "loss": 0.1679, + "step": 556 + }, + { + "epoch": 1.519781718963165, + "grad_norm": 0.36553552746772766, + "learning_rate": 1.148892539017106e-05, + "loss": 0.1645, + "step": 557 + }, + { + "epoch": 1.5225102319236017, + "grad_norm": 0.3726014792919159, + "learning_rate": 1.1457474854085095e-05, + "loss": 0.1746, + "step": 558 + }, + { + "epoch": 1.5252387448840383, + "grad_norm": 0.3792051672935486, + "learning_rate": 1.1426009581763377e-05, + "loss": 0.1682, + "step": 559 + }, + { + "epoch": 1.5279672578444747, + "grad_norm": 0.3550755977630615, + "learning_rate": 1.139452989134496e-05, + "loss": 0.1668, + "step": 560 + }, + { + "epoch": 1.5306957708049114, + "grad_norm": 0.3755435049533844, + "learning_rate": 1.1363036101114671e-05, + "loss": 0.165, + "step": 561 + }, + { + "epoch": 1.5334242837653478, + "grad_norm": 0.3794499337673187, + "learning_rate": 1.1331528529499909e-05, + "loss": 0.1677, + "step": 562 + }, + { + "epoch": 1.5361527967257844, + "grad_norm": 0.3994825780391693, + "learning_rate": 1.1300007495067403e-05, + "loss": 0.173, + "step": 563 + }, + { + "epoch": 1.538881309686221, + "grad_norm": 0.3852636516094208, + "learning_rate": 1.1268473316520007e-05, + "loss": 0.171, + "step": 564 + }, + { + "epoch": 1.5416098226466577, + "grad_norm": 0.3853999674320221, + "learning_rate": 1.123692631269348e-05, + "loss": 0.1719, + "step": 565 + }, + { + "epoch": 1.544338335607094, + "grad_norm": 0.3695131540298462, + "learning_rate": 1.1205366802553231e-05, + "loss": 0.171, + "step": 566 + }, + { + "epoch": 1.5470668485675307, + "grad_norm": 0.3661561906337738, + "learning_rate": 1.1173795105191146e-05, + "loss": 0.166, + "step": 567 + }, + { + "epoch": 1.5497953615279672, + "grad_norm": 0.38201916217803955, + "learning_rate": 1.1142211539822318e-05, + "loss": 0.1673, + 
"step": 568 + }, + { + "epoch": 1.5525238744884038, + "grad_norm": 0.37060391902923584, + "learning_rate": 1.1110616425781833e-05, + "loss": 0.1675, + "step": 569 + }, + { + "epoch": 1.5552523874488404, + "grad_norm": 0.3723720908164978, + "learning_rate": 1.1079010082521557e-05, + "loss": 0.1688, + "step": 570 + }, + { + "epoch": 1.557980900409277, + "grad_norm": 0.3776531219482422, + "learning_rate": 1.1047392829606876e-05, + "loss": 0.1675, + "step": 571 + }, + { + "epoch": 1.5607094133697135, + "grad_norm": 0.38429945707321167, + "learning_rate": 1.101576498671349e-05, + "loss": 0.1671, + "step": 572 + }, + { + "epoch": 1.56343792633015, + "grad_norm": 0.3864508271217346, + "learning_rate": 1.098412687362418e-05, + "loss": 0.1721, + "step": 573 + }, + { + "epoch": 1.5661664392905865, + "grad_norm": 0.3664016127586365, + "learning_rate": 1.095247881022555e-05, + "loss": 0.1684, + "step": 574 + }, + { + "epoch": 1.5688949522510232, + "grad_norm": 0.3717040717601776, + "learning_rate": 1.0920821116504816e-05, + "loss": 0.1722, + "step": 575 + }, + { + "epoch": 1.5716234652114598, + "grad_norm": 0.3679288625717163, + "learning_rate": 1.0889154112546569e-05, + "loss": 0.1696, + "step": 576 + }, + { + "epoch": 1.5743519781718964, + "grad_norm": 0.3595617413520813, + "learning_rate": 1.0857478118529534e-05, + "loss": 0.1618, + "step": 577 + }, + { + "epoch": 1.5770804911323328, + "grad_norm": 0.36247459053993225, + "learning_rate": 1.0825793454723325e-05, + "loss": 0.1702, + "step": 578 + }, + { + "epoch": 1.5798090040927695, + "grad_norm": 0.36833876371383667, + "learning_rate": 1.079410044148522e-05, + "loss": 0.1684, + "step": 579 + }, + { + "epoch": 1.5825375170532059, + "grad_norm": 0.3675031065940857, + "learning_rate": 1.0762399399256917e-05, + "loss": 0.1635, + "step": 580 + }, + { + "epoch": 1.5852660300136425, + "grad_norm": 0.3601101040840149, + "learning_rate": 1.0730690648561293e-05, + "loss": 0.1659, + "step": 581 + }, + { + "epoch": 1.5879945429740792, + "grad_norm": 0.372632771730423, + "learning_rate": 1.0698974509999159e-05, + "loss": 0.1686, + "step": 582 + }, + { + "epoch": 1.5907230559345158, + "grad_norm": 0.37733328342437744, + "learning_rate": 1.0667251304246028e-05, + "loss": 0.1711, + "step": 583 + }, + { + "epoch": 1.5934515688949522, + "grad_norm": 0.36464056372642517, + "learning_rate": 1.0635521352048873e-05, + "loss": 0.1681, + "step": 584 + }, + { + "epoch": 1.5961800818553888, + "grad_norm": 0.37589231133461, + "learning_rate": 1.0603784974222862e-05, + "loss": 0.1688, + "step": 585 + }, + { + "epoch": 1.5989085948158253, + "grad_norm": 0.3540600836277008, + "learning_rate": 1.057204249164815e-05, + "loss": 0.1685, + "step": 586 + }, + { + "epoch": 1.601637107776262, + "grad_norm": 0.36046862602233887, + "learning_rate": 1.0540294225266608e-05, + "loss": 0.1642, + "step": 587 + }, + { + "epoch": 1.6043656207366985, + "grad_norm": 0.36235296726226807, + "learning_rate": 1.0508540496078582e-05, + "loss": 0.1686, + "step": 588 + }, + { + "epoch": 1.6070941336971352, + "grad_norm": 0.35960090160369873, + "learning_rate": 1.0476781625139655e-05, + "loss": 0.169, + "step": 589 + }, + { + "epoch": 1.6098226466575716, + "grad_norm": 0.37027955055236816, + "learning_rate": 1.0445017933557404e-05, + "loss": 0.1695, + "step": 590 + }, + { + "epoch": 1.6125511596180082, + "grad_norm": 0.37015828490257263, + "learning_rate": 1.0413249742488132e-05, + "loss": 0.1641, + "step": 591 + }, + { + "epoch": 1.6152796725784446, + "grad_norm": 0.36656126379966736, + "learning_rate": 
1.0381477373133652e-05, + "loss": 0.1667, + "step": 592 + }, + { + "epoch": 1.6180081855388813, + "grad_norm": 0.364135205745697, + "learning_rate": 1.0349701146738007e-05, + "loss": 0.1652, + "step": 593 + }, + { + "epoch": 1.620736698499318, + "grad_norm": 0.36405521631240845, + "learning_rate": 1.0317921384584245e-05, + "loss": 0.1692, + "step": 594 + }, + { + "epoch": 1.6234652114597545, + "grad_norm": 0.36980974674224854, + "learning_rate": 1.0286138407991171e-05, + "loss": 0.1689, + "step": 595 + }, + { + "epoch": 1.626193724420191, + "grad_norm": 0.37447232007980347, + "learning_rate": 1.0254352538310075e-05, + "loss": 0.1729, + "step": 596 + }, + { + "epoch": 1.6289222373806276, + "grad_norm": 0.3681443929672241, + "learning_rate": 1.0222564096921505e-05, + "loss": 0.1714, + "step": 597 + }, + { + "epoch": 1.631650750341064, + "grad_norm": 0.36758869886398315, + "learning_rate": 1.0190773405232024e-05, + "loss": 0.1649, + "step": 598 + }, + { + "epoch": 1.6343792633015006, + "grad_norm": 0.36638176441192627, + "learning_rate": 1.0158980784670927e-05, + "loss": 0.1677, + "step": 599 + }, + { + "epoch": 1.6371077762619373, + "grad_norm": 0.35130685567855835, + "learning_rate": 1.012718655668702e-05, + "loss": 0.1643, + "step": 600 + }, + { + "epoch": 1.639836289222374, + "grad_norm": 0.3654751777648926, + "learning_rate": 1.0095391042745362e-05, + "loss": 0.1643, + "step": 601 + }, + { + "epoch": 1.6425648021828103, + "grad_norm": 0.3721977472305298, + "learning_rate": 1.0063594564324014e-05, + "loss": 0.1655, + "step": 602 + }, + { + "epoch": 1.645293315143247, + "grad_norm": 0.3742852509021759, + "learning_rate": 1.0031797442910788e-05, + "loss": 0.1694, + "step": 603 + }, + { + "epoch": 1.6480218281036834, + "grad_norm": 0.37592101097106934, + "learning_rate": 1e-05, + "loss": 0.1667, + "step": 604 + }, + { + "epoch": 1.65075034106412, + "grad_norm": 0.36879512667655945, + "learning_rate": 9.968202557089213e-06, + "loss": 0.1687, + "step": 605 + }, + { + "epoch": 1.6534788540245566, + "grad_norm": 0.36528080701828003, + "learning_rate": 9.936405435675991e-06, + "loss": 0.1679, + "step": 606 + }, + { + "epoch": 1.6562073669849933, + "grad_norm": 0.37423449754714966, + "learning_rate": 9.904608957254643e-06, + "loss": 0.1719, + "step": 607 + }, + { + "epoch": 1.65893587994543, + "grad_norm": 0.3570186495780945, + "learning_rate": 9.872813443312984e-06, + "loss": 0.1664, + "step": 608 + }, + { + "epoch": 1.6616643929058663, + "grad_norm": 0.357686847448349, + "learning_rate": 9.84101921532908e-06, + "loss": 0.1641, + "step": 609 + }, + { + "epoch": 1.6643929058663027, + "grad_norm": 0.35586750507354736, + "learning_rate": 9.809226594767979e-06, + "loss": 0.1618, + "step": 610 + }, + { + "epoch": 1.6671214188267394, + "grad_norm": 0.35907599329948425, + "learning_rate": 9.777435903078493e-06, + "loss": 0.1652, + "step": 611 + }, + { + "epoch": 1.669849931787176, + "grad_norm": 0.3660561144351959, + "learning_rate": 9.745647461689932e-06, + "loss": 0.1667, + "step": 612 + }, + { + "epoch": 1.6725784447476126, + "grad_norm": 0.36144155263900757, + "learning_rate": 9.713861592008834e-06, + "loss": 0.1639, + "step": 613 + }, + { + "epoch": 1.6753069577080493, + "grad_norm": 0.37603816390037537, + "learning_rate": 9.682078615415755e-06, + "loss": 0.1659, + "step": 614 + }, + { + "epoch": 1.6780354706684857, + "grad_norm": 0.3800089359283447, + "learning_rate": 9.650298853261998e-06, + "loss": 0.1673, + "step": 615 + }, + { + "epoch": 1.680763983628922, + "grad_norm": 0.3737815320491791, + 
"learning_rate": 9.618522626866351e-06, + "loss": 0.1654, + "step": 616 + }, + { + "epoch": 1.6834924965893587, + "grad_norm": 0.38408902287483215, + "learning_rate": 9.586750257511868e-06, + "loss": 0.1707, + "step": 617 + }, + { + "epoch": 1.6862210095497954, + "grad_norm": 0.36487987637519836, + "learning_rate": 9.554982066442601e-06, + "loss": 0.1689, + "step": 618 + }, + { + "epoch": 1.688949522510232, + "grad_norm": 0.35596176981925964, + "learning_rate": 9.523218374860348e-06, + "loss": 0.1664, + "step": 619 + }, + { + "epoch": 1.6916780354706686, + "grad_norm": 0.3624376654624939, + "learning_rate": 9.49145950392142e-06, + "loss": 0.1668, + "step": 620 + }, + { + "epoch": 1.694406548431105, + "grad_norm": 0.368150532245636, + "learning_rate": 9.459705774733397e-06, + "loss": 0.1707, + "step": 621 + }, + { + "epoch": 1.6971350613915415, + "grad_norm": 0.3632226586341858, + "learning_rate": 9.427957508351852e-06, + "loss": 0.1696, + "step": 622 + }, + { + "epoch": 1.699863574351978, + "grad_norm": 0.359633207321167, + "learning_rate": 9.39621502577714e-06, + "loss": 0.1668, + "step": 623 + }, + { + "epoch": 1.7025920873124147, + "grad_norm": 0.36725008487701416, + "learning_rate": 9.364478647951132e-06, + "loss": 0.1652, + "step": 624 + }, + { + "epoch": 1.7053206002728514, + "grad_norm": 0.3794236481189728, + "learning_rate": 9.332748695753973e-06, + "loss": 0.1701, + "step": 625 + }, + { + "epoch": 1.708049113233288, + "grad_norm": 0.3663654327392578, + "learning_rate": 9.301025490000843e-06, + "loss": 0.169, + "step": 626 + }, + { + "epoch": 1.7107776261937244, + "grad_norm": 0.3565012812614441, + "learning_rate": 9.26930935143871e-06, + "loss": 0.1691, + "step": 627 + }, + { + "epoch": 1.7135061391541608, + "grad_norm": 0.3741627633571625, + "learning_rate": 9.237600600743086e-06, + "loss": 0.1688, + "step": 628 + }, + { + "epoch": 1.7162346521145975, + "grad_norm": 0.36277323961257935, + "learning_rate": 9.20589955851478e-06, + "loss": 0.1675, + "step": 629 + }, + { + "epoch": 1.718963165075034, + "grad_norm": 0.35570815205574036, + "learning_rate": 9.174206545276678e-06, + "loss": 0.1673, + "step": 630 + }, + { + "epoch": 1.7216916780354707, + "grad_norm": 0.3580315411090851, + "learning_rate": 9.14252188147047e-06, + "loss": 0.1615, + "step": 631 + }, + { + "epoch": 1.7244201909959074, + "grad_norm": 0.3664811849594116, + "learning_rate": 9.11084588745343e-06, + "loss": 0.1677, + "step": 632 + }, + { + "epoch": 1.7271487039563438, + "grad_norm": 0.37821367383003235, + "learning_rate": 9.07917888349519e-06, + "loss": 0.1671, + "step": 633 + }, + { + "epoch": 1.7298772169167802, + "grad_norm": 0.35077428817749023, + "learning_rate": 9.047521189774456e-06, + "loss": 0.1653, + "step": 634 + }, + { + "epoch": 1.7326057298772168, + "grad_norm": 0.3558577597141266, + "learning_rate": 9.015873126375822e-06, + "loss": 0.1638, + "step": 635 + }, + { + "epoch": 1.7353342428376535, + "grad_norm": 0.34969979524612427, + "learning_rate": 8.984235013286512e-06, + "loss": 0.1628, + "step": 636 + }, + { + "epoch": 1.73806275579809, + "grad_norm": 0.37003517150878906, + "learning_rate": 8.952607170393126e-06, + "loss": 0.1691, + "step": 637 + }, + { + "epoch": 1.7407912687585267, + "grad_norm": 0.36189723014831543, + "learning_rate": 8.920989917478446e-06, + "loss": 0.1624, + "step": 638 + }, + { + "epoch": 1.7435197817189632, + "grad_norm": 0.3659520745277405, + "learning_rate": 8.88938357421817e-06, + "loss": 0.1669, + "step": 639 + }, + { + "epoch": 1.7462482946793996, + "grad_norm": 
0.37601613998413086, + "learning_rate": 8.857788460177685e-06, + "loss": 0.1691, + "step": 640 + }, + { + "epoch": 1.7489768076398362, + "grad_norm": 0.36393433809280396, + "learning_rate": 8.826204894808856e-06, + "loss": 0.1643, + "step": 641 + }, + { + "epoch": 1.7517053206002728, + "grad_norm": 0.3598528802394867, + "learning_rate": 8.79463319744677e-06, + "loss": 0.1686, + "step": 642 + }, + { + "epoch": 1.7544338335607095, + "grad_norm": 0.3672749102115631, + "learning_rate": 8.763073687306523e-06, + "loss": 0.1661, + "step": 643 + }, + { + "epoch": 1.7571623465211461, + "grad_norm": 0.3553427457809448, + "learning_rate": 8.731526683479991e-06, + "loss": 0.168, + "step": 644 + }, + { + "epoch": 1.7598908594815825, + "grad_norm": 0.36217623949050903, + "learning_rate": 8.699992504932599e-06, + "loss": 0.1667, + "step": 645 + }, + { + "epoch": 1.762619372442019, + "grad_norm": 0.3513911962509155, + "learning_rate": 8.668471470500094e-06, + "loss": 0.1646, + "step": 646 + }, + { + "epoch": 1.7653478854024556, + "grad_norm": 0.36681661009788513, + "learning_rate": 8.63696389888533e-06, + "loss": 0.1702, + "step": 647 + }, + { + "epoch": 1.7680763983628922, + "grad_norm": 0.35328978300094604, + "learning_rate": 8.605470108655046e-06, + "loss": 0.1675, + "step": 648 + }, + { + "epoch": 1.7708049113233288, + "grad_norm": 0.3467901349067688, + "learning_rate": 8.573990418236626e-06, + "loss": 0.1684, + "step": 649 + }, + { + "epoch": 1.7735334242837655, + "grad_norm": 0.355685293674469, + "learning_rate": 8.542525145914907e-06, + "loss": 0.1637, + "step": 650 + }, + { + "epoch": 1.776261937244202, + "grad_norm": 0.393255352973938, + "learning_rate": 8.511074609828944e-06, + "loss": 0.1665, + "step": 651 + }, + { + "epoch": 1.7789904502046383, + "grad_norm": 0.34803488850593567, + "learning_rate": 8.479639127968793e-06, + "loss": 0.1643, + "step": 652 + }, + { + "epoch": 1.781718963165075, + "grad_norm": 0.37034231424331665, + "learning_rate": 8.448219018172303e-06, + "loss": 0.1684, + "step": 653 + }, + { + "epoch": 1.7844474761255116, + "grad_norm": 0.3687577247619629, + "learning_rate": 8.416814598121901e-06, + "loss": 0.1657, + "step": 654 + }, + { + "epoch": 1.7871759890859482, + "grad_norm": 0.3742063045501709, + "learning_rate": 8.385426185341374e-06, + "loss": 0.161, + "step": 655 + }, + { + "epoch": 1.7899045020463848, + "grad_norm": 0.36773720383644104, + "learning_rate": 8.35405409719266e-06, + "loss": 0.1648, + "step": 656 + }, + { + "epoch": 1.7926330150068213, + "grad_norm": 0.3640764653682709, + "learning_rate": 8.322698650872656e-06, + "loss": 0.1662, + "step": 657 + }, + { + "epoch": 1.795361527967258, + "grad_norm": 0.3551373779773712, + "learning_rate": 8.291360163409978e-06, + "loss": 0.1612, + "step": 658 + }, + { + "epoch": 1.7980900409276943, + "grad_norm": 0.3683207631111145, + "learning_rate": 8.260038951661787e-06, + "loss": 0.1691, + "step": 659 + }, + { + "epoch": 1.800818553888131, + "grad_norm": 0.37494397163391113, + "learning_rate": 8.228735332310575e-06, + "loss": 0.1682, + "step": 660 + }, + { + "epoch": 1.8035470668485676, + "grad_norm": 0.35092827677726746, + "learning_rate": 8.197449621860944e-06, + "loss": 0.1669, + "step": 661 + }, + { + "epoch": 1.8062755798090042, + "grad_norm": 0.342723548412323, + "learning_rate": 8.16618213663644e-06, + "loss": 0.1617, + "step": 662 + }, + { + "epoch": 1.8090040927694406, + "grad_norm": 0.3787606656551361, + "learning_rate": 8.134933192776333e-06, + "loss": 0.1714, + "step": 663 + }, + { + "epoch": 
1.8117326057298773, + "grad_norm": 0.36189302802085876, + "learning_rate": 8.103703106232416e-06, + "loss": 0.1658, + "step": 664 + }, + { + "epoch": 1.8144611186903137, + "grad_norm": 0.3539157807826996, + "learning_rate": 8.072492192765833e-06, + "loss": 0.1609, + "step": 665 + }, + { + "epoch": 1.8171896316507503, + "grad_norm": 0.35902145504951477, + "learning_rate": 8.041300767943867e-06, + "loss": 0.1636, + "step": 666 + }, + { + "epoch": 1.819918144611187, + "grad_norm": 0.3730703890323639, + "learning_rate": 8.010129147136749e-06, + "loss": 0.1665, + "step": 667 + }, + { + "epoch": 1.8226466575716236, + "grad_norm": 0.3461478650569916, + "learning_rate": 7.978977645514488e-06, + "loss": 0.1615, + "step": 668 + }, + { + "epoch": 1.82537517053206, + "grad_norm": 0.36380550265312195, + "learning_rate": 7.947846578043658e-06, + "loss": 0.1667, + "step": 669 + }, + { + "epoch": 1.8281036834924966, + "grad_norm": 0.3588198125362396, + "learning_rate": 7.916736259484239e-06, + "loss": 0.1634, + "step": 670 + }, + { + "epoch": 1.830832196452933, + "grad_norm": 0.3543216586112976, + "learning_rate": 7.885647004386421e-06, + "loss": 0.1637, + "step": 671 + }, + { + "epoch": 1.8335607094133697, + "grad_norm": 0.3527906835079193, + "learning_rate": 7.854579127087418e-06, + "loss": 0.1636, + "step": 672 + }, + { + "epoch": 1.8362892223738063, + "grad_norm": 0.3463229537010193, + "learning_rate": 7.823532941708305e-06, + "loss": 0.1617, + "step": 673 + }, + { + "epoch": 1.839017735334243, + "grad_norm": 0.36647194623947144, + "learning_rate": 7.792508762150833e-06, + "loss": 0.1672, + "step": 674 + }, + { + "epoch": 1.8417462482946794, + "grad_norm": 0.35416069626808167, + "learning_rate": 7.761506902094248e-06, + "loss": 0.1633, + "step": 675 + }, + { + "epoch": 1.844474761255116, + "grad_norm": 0.3427625000476837, + "learning_rate": 7.730527674992143e-06, + "loss": 0.1662, + "step": 676 + }, + { + "epoch": 1.8472032742155524, + "grad_norm": 0.3429381251335144, + "learning_rate": 7.699571394069269e-06, + "loss": 0.1594, + "step": 677 + }, + { + "epoch": 1.849931787175989, + "grad_norm": 0.3431464433670044, + "learning_rate": 7.668638372318359e-06, + "loss": 0.1596, + "step": 678 + }, + { + "epoch": 1.8526603001364257, + "grad_norm": 0.36854469776153564, + "learning_rate": 7.637728922496996e-06, + "loss": 0.1686, + "step": 679 + }, + { + "epoch": 1.8553888130968623, + "grad_norm": 0.3657079339027405, + "learning_rate": 7.606843357124426e-06, + "loss": 0.1684, + "step": 680 + }, + { + "epoch": 1.8581173260572987, + "grad_norm": 0.3696242868900299, + "learning_rate": 7.575981988478393e-06, + "loss": 0.1693, + "step": 681 + }, + { + "epoch": 1.8608458390177354, + "grad_norm": 0.36820054054260254, + "learning_rate": 7.545145128592009e-06, + "loss": 0.169, + "step": 682 + }, + { + "epoch": 1.8635743519781718, + "grad_norm": 0.35837873816490173, + "learning_rate": 7.514333089250577e-06, + "loss": 0.1659, + "step": 683 + }, + { + "epoch": 1.8663028649386084, + "grad_norm": 0.35923945903778076, + "learning_rate": 7.483546181988437e-06, + "loss": 0.1683, + "step": 684 + }, + { + "epoch": 1.869031377899045, + "grad_norm": 0.35063204169273376, + "learning_rate": 7.452784718085834e-06, + "loss": 0.1611, + "step": 685 + }, + { + "epoch": 1.8717598908594817, + "grad_norm": 0.3463192582130432, + "learning_rate": 7.422049008565757e-06, + "loss": 0.1648, + "step": 686 + }, + { + "epoch": 1.874488403819918, + "grad_norm": 0.34152331948280334, + "learning_rate": 7.391339364190794e-06, + "loss": 0.1602, + "step": 
687 + }, + { + "epoch": 1.8772169167803547, + "grad_norm": 0.34842997789382935, + "learning_rate": 7.360656095459995e-06, + "loss": 0.1644, + "step": 688 + }, + { + "epoch": 1.8799454297407912, + "grad_norm": 0.3455371856689453, + "learning_rate": 7.329999512605738e-06, + "loss": 0.1631, + "step": 689 + }, + { + "epoch": 1.8826739427012278, + "grad_norm": 0.34828466176986694, + "learning_rate": 7.299369925590575e-06, + "loss": 0.1618, + "step": 690 + }, + { + "epoch": 1.8854024556616644, + "grad_norm": 0.36350730061531067, + "learning_rate": 7.268767644104114e-06, + "loss": 0.1653, + "step": 691 + }, + { + "epoch": 1.888130968622101, + "grad_norm": 0.35511481761932373, + "learning_rate": 7.2381929775598835e-06, + "loss": 0.1598, + "step": 692 + }, + { + "epoch": 1.8908594815825375, + "grad_norm": 0.34677204489707947, + "learning_rate": 7.207646235092201e-06, + "loss": 0.1603, + "step": 693 + }, + { + "epoch": 1.893587994542974, + "grad_norm": 0.3588501811027527, + "learning_rate": 7.1771277255530456e-06, + "loss": 0.1638, + "step": 694 + }, + { + "epoch": 1.8963165075034105, + "grad_norm": 0.3464423716068268, + "learning_rate": 7.14663775750895e-06, + "loss": 0.1635, + "step": 695 + }, + { + "epoch": 1.8990450204638472, + "grad_norm": 0.3492993116378784, + "learning_rate": 7.116176639237853e-06, + "loss": 0.1629, + "step": 696 + }, + { + "epoch": 1.9017735334242838, + "grad_norm": 0.35050979256629944, + "learning_rate": 7.085744678726013e-06, + "loss": 0.1661, + "step": 697 + }, + { + "epoch": 1.9045020463847204, + "grad_norm": 0.3559916317462921, + "learning_rate": 7.05534218366488e-06, + "loss": 0.165, + "step": 698 + }, + { + "epoch": 1.9072305593451568, + "grad_norm": 0.34668344259262085, + "learning_rate": 7.024969461447973e-06, + "loss": 0.1624, + "step": 699 + }, + { + "epoch": 1.9099590723055935, + "grad_norm": 0.3400524854660034, + "learning_rate": 6.994626819167789e-06, + "loss": 0.1631, + "step": 700 + }, + { + "epoch": 1.9126875852660299, + "grad_norm": 0.3597893714904785, + "learning_rate": 6.964314563612709e-06, + "loss": 0.166, + "step": 701 + }, + { + "epoch": 1.9154160982264665, + "grad_norm": 0.35054320096969604, + "learning_rate": 6.934033001263847e-06, + "loss": 0.1667, + "step": 702 + }, + { + "epoch": 1.9181446111869032, + "grad_norm": 0.3557494878768921, + "learning_rate": 6.9037824382920145e-06, + "loss": 0.1644, + "step": 703 + }, + { + "epoch": 1.9208731241473398, + "grad_norm": 0.34000909328460693, + "learning_rate": 6.873563180554583e-06, + "loss": 0.1603, + "step": 704 + }, + { + "epoch": 1.9236016371077762, + "grad_norm": 0.35631030797958374, + "learning_rate": 6.843375533592395e-06, + "loss": 0.1661, + "step": 705 + }, + { + "epoch": 1.9263301500682128, + "grad_norm": 0.3655133843421936, + "learning_rate": 6.813219802626698e-06, + "loss": 0.1639, + "step": 706 + }, + { + "epoch": 1.9290586630286493, + "grad_norm": 0.3506717085838318, + "learning_rate": 6.783096292556035e-06, + "loss": 0.1674, + "step": 707 + }, + { + "epoch": 1.931787175989086, + "grad_norm": 0.34663382172584534, + "learning_rate": 6.7530053079531664e-06, + "loss": 0.1652, + "step": 708 + }, + { + "epoch": 1.9345156889495225, + "grad_norm": 0.34880849719047546, + "learning_rate": 6.722947153062003e-06, + "loss": 0.1647, + "step": 709 + }, + { + "epoch": 1.9372442019099592, + "grad_norm": 0.3882060945034027, + "learning_rate": 6.692922131794517e-06, + "loss": 0.1693, + "step": 710 + }, + { + "epoch": 1.9399727148703958, + "grad_norm": 0.35033705830574036, + "learning_rate": 
6.662930547727668e-06, + "loss": 0.1648, + "step": 711 + }, + { + "epoch": 1.9427012278308322, + "grad_norm": 0.35732313990592957, + "learning_rate": 6.632972704100349e-06, + "loss": 0.1621, + "step": 712 + }, + { + "epoch": 1.9454297407912686, + "grad_norm": 0.33822888135910034, + "learning_rate": 6.603048903810305e-06, + "loss": 0.1598, + "step": 713 + }, + { + "epoch": 1.9481582537517053, + "grad_norm": 0.3523855209350586, + "learning_rate": 6.573159449411071e-06, + "loss": 0.164, + "step": 714 + }, + { + "epoch": 1.950886766712142, + "grad_norm": 0.38246482610702515, + "learning_rate": 6.5433046431089205e-06, + "loss": 0.1668, + "step": 715 + }, + { + "epoch": 1.9536152796725785, + "grad_norm": 0.3540845811367035, + "learning_rate": 6.513484786759818e-06, + "loss": 0.1646, + "step": 716 + }, + { + "epoch": 1.9563437926330152, + "grad_norm": 0.34382006525993347, + "learning_rate": 6.483700181866337e-06, + "loss": 0.1621, + "step": 717 + }, + { + "epoch": 1.9590723055934516, + "grad_norm": 0.35310429334640503, + "learning_rate": 6.453951129574644e-06, + "loss": 0.1678, + "step": 718 + }, + { + "epoch": 1.961800818553888, + "grad_norm": 0.3334411680698395, + "learning_rate": 6.42423793067144e-06, + "loss": 0.1572, + "step": 719 + }, + { + "epoch": 1.9645293315143246, + "grad_norm": 0.3550149202346802, + "learning_rate": 6.39456088558091e-06, + "loss": 0.1668, + "step": 720 + }, + { + "epoch": 1.9672578444747613, + "grad_norm": 0.35232090950012207, + "learning_rate": 6.364920294361701e-06, + "loss": 0.1621, + "step": 721 + }, + { + "epoch": 1.969986357435198, + "grad_norm": 0.3518090844154358, + "learning_rate": 6.335316456703891e-06, + "loss": 0.1589, + "step": 722 + }, + { + "epoch": 1.9727148703956345, + "grad_norm": 0.34866175055503845, + "learning_rate": 6.3057496719259314e-06, + "loss": 0.1664, + "step": 723 + }, + { + "epoch": 1.975443383356071, + "grad_norm": 0.3425263464450836, + "learning_rate": 6.276220238971653e-06, + "loss": 0.1606, + "step": 724 + }, + { + "epoch": 1.9781718963165074, + "grad_norm": 0.35735857486724854, + "learning_rate": 6.2467284564072294e-06, + "loss": 0.1632, + "step": 725 + }, + { + "epoch": 1.980900409276944, + "grad_norm": 0.342393159866333, + "learning_rate": 6.2172746224181524e-06, + "loss": 0.1604, + "step": 726 + }, + { + "epoch": 1.9836289222373806, + "grad_norm": 0.34849631786346436, + "learning_rate": 6.187859034806225e-06, + "loss": 0.1619, + "step": 727 + }, + { + "epoch": 1.9863574351978173, + "grad_norm": 0.34562066197395325, + "learning_rate": 6.158481990986558e-06, + "loss": 0.1637, + "step": 728 + }, + { + "epoch": 1.989085948158254, + "grad_norm": 0.3466956913471222, + "learning_rate": 6.1291437879845335e-06, + "loss": 0.1628, + "step": 729 + }, + { + "epoch": 1.9918144611186903, + "grad_norm": 0.3530554175376892, + "learning_rate": 6.099844722432844e-06, + "loss": 0.1608, + "step": 730 + }, + { + "epoch": 1.9945429740791267, + "grad_norm": 0.33960825204849243, + "learning_rate": 6.07058509056846e-06, + "loss": 0.1602, + "step": 731 + }, + { + "epoch": 1.9972714870395634, + "grad_norm": 0.3492552936077118, + "learning_rate": 6.041365188229641e-06, + "loss": 0.1669, + "step": 732 + }, + { + "epoch": 2.0, + "grad_norm": 0.3499084413051605, + "learning_rate": 6.012185310852962e-06, + "loss": 0.1638, + "step": 733 + }, + { + "epoch": 2.0027285129604366, + "grad_norm": 0.3995380997657776, + "learning_rate": 5.983045753470308e-06, + "loss": 0.134, + "step": 734 + }, + { + "epoch": 2.0054570259208733, + "grad_norm": 0.4106523096561432, + 
"learning_rate": 5.9539468107058885e-06, + "loss": 0.1382, + "step": 735 + }, + { + "epoch": 2.00818553888131, + "grad_norm": 0.3607160151004791, + "learning_rate": 5.924888776773281e-06, + "loss": 0.137, + "step": 736 + }, + { + "epoch": 2.010914051841746, + "grad_norm": 0.3377709686756134, + "learning_rate": 5.895871945472434e-06, + "loss": 0.1303, + "step": 737 + }, + { + "epoch": 2.0136425648021827, + "grad_norm": 0.33374232053756714, + "learning_rate": 5.866896610186701e-06, + "loss": 0.1324, + "step": 738 + }, + { + "epoch": 2.0163710777626194, + "grad_norm": 0.3930217921733856, + "learning_rate": 5.8379630638798845e-06, + "loss": 0.1324, + "step": 739 + }, + { + "epoch": 2.019099590723056, + "grad_norm": 0.44687938690185547, + "learning_rate": 5.809071599093272e-06, + "loss": 0.1345, + "step": 740 + }, + { + "epoch": 2.0218281036834926, + "grad_norm": 0.4330204129219055, + "learning_rate": 5.780222507942654e-06, + "loss": 0.1288, + "step": 741 + }, + { + "epoch": 2.0245566166439293, + "grad_norm": 0.41968345642089844, + "learning_rate": 5.7514160821154085e-06, + "loss": 0.1313, + "step": 742 + }, + { + "epoch": 2.0272851296043655, + "grad_norm": 0.3952399790287018, + "learning_rate": 5.7226526128675234e-06, + "loss": 0.1326, + "step": 743 + }, + { + "epoch": 2.030013642564802, + "grad_norm": 0.37134867906570435, + "learning_rate": 5.693932391020664e-06, + "loss": 0.1299, + "step": 744 + }, + { + "epoch": 2.0327421555252387, + "grad_norm": 0.39517930150032043, + "learning_rate": 5.665255706959231e-06, + "loss": 0.1331, + "step": 745 + }, + { + "epoch": 2.0354706684856754, + "grad_norm": 0.3553452491760254, + "learning_rate": 5.63662285062742e-06, + "loss": 0.1293, + "step": 746 + }, + { + "epoch": 2.038199181446112, + "grad_norm": 0.3541392683982849, + "learning_rate": 5.608034111526298e-06, + "loss": 0.1329, + "step": 747 + }, + { + "epoch": 2.0409276944065486, + "grad_norm": 0.3674290180206299, + "learning_rate": 5.579489778710867e-06, + "loss": 0.1323, + "step": 748 + }, + { + "epoch": 2.043656207366985, + "grad_norm": 0.3576567471027374, + "learning_rate": 5.550990140787147e-06, + "loss": 0.1314, + "step": 749 + }, + { + "epoch": 2.0463847203274215, + "grad_norm": 0.34979933500289917, + "learning_rate": 5.522535485909258e-06, + "loss": 0.1288, + "step": 750 + }, + { + "epoch": 2.049113233287858, + "grad_norm": 0.3478044867515564, + "learning_rate": 5.494126101776505e-06, + "loss": 0.1308, + "step": 751 + }, + { + "epoch": 2.0518417462482947, + "grad_norm": 0.35146066546440125, + "learning_rate": 5.465762275630471e-06, + "loss": 0.1301, + "step": 752 + }, + { + "epoch": 2.0545702592087314, + "grad_norm": 0.36018314957618713, + "learning_rate": 5.437444294252108e-06, + "loss": 0.1306, + "step": 753 + }, + { + "epoch": 2.057298772169168, + "grad_norm": 0.37735897302627563, + "learning_rate": 5.409172443958844e-06, + "loss": 0.1353, + "step": 754 + }, + { + "epoch": 2.060027285129604, + "grad_norm": 0.36527469754219055, + "learning_rate": 5.380947010601681e-06, + "loss": 0.1338, + "step": 755 + }, + { + "epoch": 2.062755798090041, + "grad_norm": 0.3785851001739502, + "learning_rate": 5.352768279562315e-06, + "loss": 0.1299, + "step": 756 + }, + { + "epoch": 2.0654843110504775, + "grad_norm": 0.3725038468837738, + "learning_rate": 5.324636535750238e-06, + "loss": 0.1312, + "step": 757 + }, + { + "epoch": 2.068212824010914, + "grad_norm": 0.3610471487045288, + "learning_rate": 5.2965520635998676e-06, + "loss": 0.1299, + "step": 758 + }, + { + "epoch": 2.0709413369713507, + "grad_norm": 
0.3752604126930237, + "learning_rate": 5.268515147067666e-06, + "loss": 0.1322, + "step": 759 + }, + { + "epoch": 2.0736698499317874, + "grad_norm": 0.3641640245914459, + "learning_rate": 5.240526069629265e-06, + "loss": 0.1314, + "step": 760 + }, + { + "epoch": 2.0763983628922236, + "grad_norm": 0.3561566472053528, + "learning_rate": 5.212585114276614e-06, + "loss": 0.1301, + "step": 761 + }, + { + "epoch": 2.07912687585266, + "grad_norm": 0.3664422333240509, + "learning_rate": 5.184692563515104e-06, + "loss": 0.1315, + "step": 762 + }, + { + "epoch": 2.081855388813097, + "grad_norm": 0.3476559519767761, + "learning_rate": 5.156848699360719e-06, + "loss": 0.1293, + "step": 763 + }, + { + "epoch": 2.0845839017735335, + "grad_norm": 0.35268449783325195, + "learning_rate": 5.129053803337181e-06, + "loss": 0.1313, + "step": 764 + }, + { + "epoch": 2.08731241473397, + "grad_norm": 0.36053764820098877, + "learning_rate": 5.101308156473104e-06, + "loss": 0.1304, + "step": 765 + }, + { + "epoch": 2.0900409276944067, + "grad_norm": 0.3567509949207306, + "learning_rate": 5.073612039299157e-06, + "loss": 0.1309, + "step": 766 + }, + { + "epoch": 2.092769440654843, + "grad_norm": 0.35490044951438904, + "learning_rate": 5.045965731845223e-06, + "loss": 0.132, + "step": 767 + }, + { + "epoch": 2.0954979536152796, + "grad_norm": 0.3639454245567322, + "learning_rate": 5.018369513637567e-06, + "loss": 0.1311, + "step": 768 + }, + { + "epoch": 2.098226466575716, + "grad_norm": 0.3668364882469177, + "learning_rate": 4.990823663696013e-06, + "loss": 0.1301, + "step": 769 + }, + { + "epoch": 2.100954979536153, + "grad_norm": 0.3602748215198517, + "learning_rate": 4.963328460531127e-06, + "loss": 0.1316, + "step": 770 + }, + { + "epoch": 2.1036834924965895, + "grad_norm": 0.3594209551811218, + "learning_rate": 4.9358841821413775e-06, + "loss": 0.1292, + "step": 771 + }, + { + "epoch": 2.106412005457026, + "grad_norm": 0.36325401067733765, + "learning_rate": 4.908491106010368e-06, + "loss": 0.1333, + "step": 772 + }, + { + "epoch": 2.1091405184174623, + "grad_norm": 0.36963802576065063, + "learning_rate": 4.881149509103993e-06, + "loss": 0.1297, + "step": 773 + }, + { + "epoch": 2.111869031377899, + "grad_norm": 0.36008724570274353, + "learning_rate": 4.853859667867641e-06, + "loss": 0.1299, + "step": 774 + }, + { + "epoch": 2.1145975443383356, + "grad_norm": 0.3590433895587921, + "learning_rate": 4.826621858223431e-06, + "loss": 0.1315, + "step": 775 + }, + { + "epoch": 2.117326057298772, + "grad_norm": 0.3663657009601593, + "learning_rate": 4.799436355567391e-06, + "loss": 0.1314, + "step": 776 + }, + { + "epoch": 2.120054570259209, + "grad_norm": 0.3599216938018799, + "learning_rate": 4.772303434766669e-06, + "loss": 0.1307, + "step": 777 + }, + { + "epoch": 2.1227830832196455, + "grad_norm": 0.36558717489242554, + "learning_rate": 4.745223370156797e-06, + "loss": 0.1323, + "step": 778 + }, + { + "epoch": 2.1255115961800817, + "grad_norm": 0.3608526289463043, + "learning_rate": 4.7181964355388695e-06, + "loss": 0.1319, + "step": 779 + }, + { + "epoch": 2.1282401091405183, + "grad_norm": 0.35885030031204224, + "learning_rate": 4.691222904176791e-06, + "loss": 0.1323, + "step": 780 + }, + { + "epoch": 2.130968622100955, + "grad_norm": 0.3633890151977539, + "learning_rate": 4.664303048794533e-06, + "loss": 0.1323, + "step": 781 + }, + { + "epoch": 2.1336971350613916, + "grad_norm": 0.36755603551864624, + "learning_rate": 4.63743714157335e-06, + "loss": 0.1342, + "step": 782 + }, + { + "epoch": 
2.136425648021828, + "grad_norm": 0.3608950674533844, + "learning_rate": 4.610625454149033e-06, + "loss": 0.1319, + "step": 783 + }, + { + "epoch": 2.139154160982265, + "grad_norm": 0.35920849442481995, + "learning_rate": 4.583868257609171e-06, + "loss": 0.1312, + "step": 784 + }, + { + "epoch": 2.141882673942701, + "grad_norm": 0.36166754364967346, + "learning_rate": 4.55716582249042e-06, + "loss": 0.1323, + "step": 785 + }, + { + "epoch": 2.1446111869031377, + "grad_norm": 0.37216901779174805, + "learning_rate": 4.530518418775734e-06, + "loss": 0.1304, + "step": 786 + }, + { + "epoch": 2.1473396998635743, + "grad_norm": 0.3564962148666382, + "learning_rate": 4.50392631589166e-06, + "loss": 0.1324, + "step": 787 + }, + { + "epoch": 2.150068212824011, + "grad_norm": 0.35313740372657776, + "learning_rate": 4.477389782705628e-06, + "loss": 0.1297, + "step": 788 + }, + { + "epoch": 2.1527967257844476, + "grad_norm": 0.35927248001098633, + "learning_rate": 4.4509090875231865e-06, + "loss": 0.1331, + "step": 789 + }, + { + "epoch": 2.155525238744884, + "grad_norm": 0.3569164276123047, + "learning_rate": 4.424484498085335e-06, + "loss": 0.1328, + "step": 790 + }, + { + "epoch": 2.1582537517053204, + "grad_norm": 0.36442428827285767, + "learning_rate": 4.398116281565794e-06, + "loss": 0.1313, + "step": 791 + }, + { + "epoch": 2.160982264665757, + "grad_norm": 0.35785555839538574, + "learning_rate": 4.371804704568309e-06, + "loss": 0.1296, + "step": 792 + }, + { + "epoch": 2.1637107776261937, + "grad_norm": 0.3561733663082123, + "learning_rate": 4.345550033123954e-06, + "loss": 0.1306, + "step": 793 + }, + { + "epoch": 2.1664392905866303, + "grad_norm": 0.36286094784736633, + "learning_rate": 4.319352532688444e-06, + "loss": 0.1324, + "step": 794 + }, + { + "epoch": 2.169167803547067, + "grad_norm": 0.35654789209365845, + "learning_rate": 4.293212468139447e-06, + "loss": 0.1292, + "step": 795 + }, + { + "epoch": 2.1718963165075036, + "grad_norm": 0.3664185404777527, + "learning_rate": 4.267130103773911e-06, + "loss": 0.1304, + "step": 796 + }, + { + "epoch": 2.17462482946794, + "grad_norm": 0.3567107021808624, + "learning_rate": 4.241105703305388e-06, + "loss": 0.1297, + "step": 797 + }, + { + "epoch": 2.1773533424283764, + "grad_norm": 0.3579164743423462, + "learning_rate": 4.2151395298613675e-06, + "loss": 0.1305, + "step": 798 + }, + { + "epoch": 2.180081855388813, + "grad_norm": 0.35019171237945557, + "learning_rate": 4.189231845980618e-06, + "loss": 0.131, + "step": 799 + }, + { + "epoch": 2.1828103683492497, + "grad_norm": 0.3644610047340393, + "learning_rate": 4.163382913610533e-06, + "loss": 0.1314, + "step": 800 + }, + { + "epoch": 2.1855388813096863, + "grad_norm": 0.3550662398338318, + "learning_rate": 4.137592994104479e-06, + "loss": 0.1297, + "step": 801 + }, + { + "epoch": 2.188267394270123, + "grad_norm": 0.37268707156181335, + "learning_rate": 4.111862348219158e-06, + "loss": 0.1311, + "step": 802 + }, + { + "epoch": 2.190995907230559, + "grad_norm": 0.35408931970596313, + "learning_rate": 4.086191236111964e-06, + "loss": 0.13, + "step": 803 + }, + { + "epoch": 2.193724420190996, + "grad_norm": 0.36723992228507996, + "learning_rate": 4.060579917338362e-06, + "loss": 0.1345, + "step": 804 + }, + { + "epoch": 2.1964529331514324, + "grad_norm": 0.3593861758708954, + "learning_rate": 4.035028650849255e-06, + "loss": 0.1322, + "step": 805 + }, + { + "epoch": 2.199181446111869, + "grad_norm": 0.3648458421230316, + "learning_rate": 4.009537694988372e-06, + "loss": 0.1341, + "step": 806 + 
}, + { + "epoch": 2.2019099590723057, + "grad_norm": 0.361258327960968, + "learning_rate": 3.984107307489652e-06, + "loss": 0.1305, + "step": 807 + }, + { + "epoch": 2.2046384720327423, + "grad_norm": 0.35904020071029663, + "learning_rate": 3.958737745474638e-06, + "loss": 0.1279, + "step": 808 + }, + { + "epoch": 2.2073669849931785, + "grad_norm": 0.3607647120952606, + "learning_rate": 3.933429265449882e-06, + "loss": 0.1299, + "step": 809 + }, + { + "epoch": 2.210095497953615, + "grad_norm": 0.3666648268699646, + "learning_rate": 3.908182123304344e-06, + "loss": 0.1309, + "step": 810 + }, + { + "epoch": 2.212824010914052, + "grad_norm": 0.3656477928161621, + "learning_rate": 3.882996574306818e-06, + "loss": 0.1334, + "step": 811 + }, + { + "epoch": 2.2155525238744884, + "grad_norm": 0.35920169949531555, + "learning_rate": 3.857872873103322e-06, + "loss": 0.1328, + "step": 812 + }, + { + "epoch": 2.218281036834925, + "grad_norm": 0.3563489317893982, + "learning_rate": 3.832811273714569e-06, + "loss": 0.1327, + "step": 813 + }, + { + "epoch": 2.2210095497953617, + "grad_norm": 0.363343209028244, + "learning_rate": 3.807812029533362e-06, + "loss": 0.13, + "step": 814 + }, + { + "epoch": 2.223738062755798, + "grad_norm": 0.36663034558296204, + "learning_rate": 3.78287539332203e-06, + "loss": 0.1319, + "step": 815 + }, + { + "epoch": 2.2264665757162345, + "grad_norm": 0.35951149463653564, + "learning_rate": 3.7580016172099067e-06, + "loss": 0.1306, + "step": 816 + }, + { + "epoch": 2.229195088676671, + "grad_norm": 0.35489046573638916, + "learning_rate": 3.7331909526907527e-06, + "loss": 0.1293, + "step": 817 + }, + { + "epoch": 2.231923601637108, + "grad_norm": 0.3771286904811859, + "learning_rate": 3.708443650620206e-06, + "loss": 0.1338, + "step": 818 + }, + { + "epoch": 2.2346521145975444, + "grad_norm": 0.35692450404167175, + "learning_rate": 3.6837599612132826e-06, + "loss": 0.1314, + "step": 819 + }, + { + "epoch": 2.237380627557981, + "grad_norm": 0.3585570454597473, + "learning_rate": 3.659140134041812e-06, + "loss": 0.1319, + "step": 820 + }, + { + "epoch": 2.2401091405184177, + "grad_norm": 0.3692927658557892, + "learning_rate": 3.6345844180319157e-06, + "loss": 0.1355, + "step": 821 + }, + { + "epoch": 2.242837653478854, + "grad_norm": 0.3518622815608978, + "learning_rate": 3.6100930614615204e-06, + "loss": 0.1298, + "step": 822 + }, + { + "epoch": 2.2455661664392905, + "grad_norm": 0.34735947847366333, + "learning_rate": 3.5856663119578174e-06, + "loss": 0.1315, + "step": 823 + }, + { + "epoch": 2.248294679399727, + "grad_norm": 0.339832603931427, + "learning_rate": 3.5613044164947617e-06, + "loss": 0.1267, + "step": 824 + }, + { + "epoch": 2.251023192360164, + "grad_norm": 0.3616288900375366, + "learning_rate": 3.5370076213905904e-06, + "loss": 0.1294, + "step": 825 + }, + { + "epoch": 2.2537517053206004, + "grad_norm": 0.35375145077705383, + "learning_rate": 3.5127761723053313e-06, + "loss": 0.1299, + "step": 826 + }, + { + "epoch": 2.2564802182810366, + "grad_norm": 0.35924383997917175, + "learning_rate": 3.4886103142382944e-06, + "loss": 0.1306, + "step": 827 + }, + { + "epoch": 2.2592087312414733, + "grad_norm": 0.3569202423095703, + "learning_rate": 3.46451029152562e-06, + "loss": 0.1303, + "step": 828 + }, + { + "epoch": 2.26193724420191, + "grad_norm": 0.36816731095314026, + "learning_rate": 3.440476347837811e-06, + "loss": 0.1317, + "step": 829 + }, + { + "epoch": 2.2646657571623465, + "grad_norm": 0.3677736520767212, + "learning_rate": 3.41650872617724e-06, + "loss": 
0.1323, + "step": 830 + }, + { + "epoch": 2.267394270122783, + "grad_norm": 0.35778266191482544, + "learning_rate": 3.392607668875718e-06, + "loss": 0.1316, + "step": 831 + }, + { + "epoch": 2.27012278308322, + "grad_norm": 0.35744157433509827, + "learning_rate": 3.3687734175920505e-06, + "loss": 0.1296, + "step": 832 + }, + { + "epoch": 2.2728512960436564, + "grad_norm": 0.37661606073379517, + "learning_rate": 3.3450062133095572e-06, + "loss": 0.1326, + "step": 833 + }, + { + "epoch": 2.2755798090040926, + "grad_norm": 0.36716973781585693, + "learning_rate": 3.321306296333673e-06, + "loss": 0.1325, + "step": 834 + }, + { + "epoch": 2.2783083219645293, + "grad_norm": 0.3580392301082611, + "learning_rate": 3.29767390628951e-06, + "loss": 0.1317, + "step": 835 + }, + { + "epoch": 2.281036834924966, + "grad_norm": 0.36350300908088684, + "learning_rate": 3.274109282119413e-06, + "loss": 0.1311, + "step": 836 + }, + { + "epoch": 2.2837653478854025, + "grad_norm": 0.36548492312431335, + "learning_rate": 3.2506126620805666e-06, + "loss": 0.1325, + "step": 837 + }, + { + "epoch": 2.286493860845839, + "grad_norm": 0.3625844717025757, + "learning_rate": 3.2271842837425917e-06, + "loss": 0.1297, + "step": 838 + }, + { + "epoch": 2.2892223738062754, + "grad_norm": 0.3575722277164459, + "learning_rate": 3.203824383985108e-06, + "loss": 0.1315, + "step": 839 + }, + { + "epoch": 2.291950886766712, + "grad_norm": 0.3623971939086914, + "learning_rate": 3.180533198995379e-06, + "loss": 0.132, + "step": 840 + }, + { + "epoch": 2.2946793997271486, + "grad_norm": 0.3681948184967041, + "learning_rate": 3.157310964265903e-06, + "loss": 0.1308, + "step": 841 + }, + { + "epoch": 2.2974079126875853, + "grad_norm": 0.36443206667900085, + "learning_rate": 3.134157914592032e-06, + "loss": 0.1328, + "step": 842 + }, + { + "epoch": 2.300136425648022, + "grad_norm": 0.3680581748485565, + "learning_rate": 3.1110742840696063e-06, + "loss": 0.1314, + "step": 843 + }, + { + "epoch": 2.3028649386084585, + "grad_norm": 0.3506981134414673, + "learning_rate": 3.088060306092582e-06, + "loss": 0.1268, + "step": 844 + }, + { + "epoch": 2.305593451568895, + "grad_norm": 0.3596420884132385, + "learning_rate": 3.0651162133506707e-06, + "loss": 0.1317, + "step": 845 + }, + { + "epoch": 2.3083219645293314, + "grad_norm": 0.359647274017334, + "learning_rate": 3.042242237826991e-06, + "loss": 0.1278, + "step": 846 + }, + { + "epoch": 2.311050477489768, + "grad_norm": 0.36838117241859436, + "learning_rate": 3.0194386107957175e-06, + "loss": 0.1337, + "step": 847 + }, + { + "epoch": 2.3137789904502046, + "grad_norm": 0.3791147768497467, + "learning_rate": 2.996705562819747e-06, + "loss": 0.1325, + "step": 848 + }, + { + "epoch": 2.3165075034106413, + "grad_norm": 0.3608884811401367, + "learning_rate": 2.9740433237483667e-06, + "loss": 0.1299, + "step": 849 + }, + { + "epoch": 2.319236016371078, + "grad_norm": 0.35787829756736755, + "learning_rate": 2.951452122714926e-06, + "loss": 0.131, + "step": 850 + }, + { + "epoch": 2.321964529331514, + "grad_norm": 0.3690457046031952, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.1327, + "step": 851 + }, + { + "epoch": 2.3246930422919507, + "grad_norm": 0.3544095456600189, + "learning_rate": 2.906483747701705e-06, + "loss": 0.1268, + "step": 852 + }, + { + "epoch": 2.3274215552523874, + "grad_norm": 0.3589238226413727, + "learning_rate": 2.88410702838814e-06, + "loss": 0.1298, + "step": 853 + }, + { + "epoch": 2.330150068212824, + "grad_norm": 0.34518349170684814, + "learning_rate": 
2.861802256440348e-06, + "loss": 0.1286, + "step": 854 + }, + { + "epoch": 2.3328785811732606, + "grad_norm": 0.35874345898628235, + "learning_rate": 2.8395696573774034e-06, + "loss": 0.1296, + "step": 855 + }, + { + "epoch": 2.3356070941336973, + "grad_norm": 0.35607776045799255, + "learning_rate": 2.8174094559886535e-06, + "loss": 0.1282, + "step": 856 + }, + { + "epoch": 2.338335607094134, + "grad_norm": 0.36033570766448975, + "learning_rate": 2.795321876331446e-06, + "loss": 0.1327, + "step": 857 + }, + { + "epoch": 2.34106412005457, + "grad_norm": 0.3491268455982208, + "learning_rate": 2.773307141728867e-06, + "loss": 0.1279, + "step": 858 + }, + { + "epoch": 2.3437926330150067, + "grad_norm": 0.3684820234775543, + "learning_rate": 2.751365474767479e-06, + "loss": 0.1331, + "step": 859 + }, + { + "epoch": 2.3465211459754434, + "grad_norm": 0.3787662088871002, + "learning_rate": 2.729497097295075e-06, + "loss": 0.1365, + "step": 860 + }, + { + "epoch": 2.34924965893588, + "grad_norm": 0.3510342538356781, + "learning_rate": 2.70770223041843e-06, + "loss": 0.1286, + "step": 861 + }, + { + "epoch": 2.3519781718963166, + "grad_norm": 0.36617404222488403, + "learning_rate": 2.6859810945010687e-06, + "loss": 0.1328, + "step": 862 + }, + { + "epoch": 2.354706684856753, + "grad_norm": 0.3623158633708954, + "learning_rate": 2.6643339091610376e-06, + "loss": 0.1311, + "step": 863 + }, + { + "epoch": 2.3574351978171895, + "grad_norm": 0.3640574812889099, + "learning_rate": 2.642760893268684e-06, + "loss": 0.1289, + "step": 864 + }, + { + "epoch": 2.360163710777626, + "grad_norm": 0.3659525513648987, + "learning_rate": 2.621262264944444e-06, + "loss": 0.132, + "step": 865 + }, + { + "epoch": 2.3628922237380627, + "grad_norm": 0.3544803857803345, + "learning_rate": 2.5998382415566258e-06, + "loss": 0.1305, + "step": 866 + }, + { + "epoch": 2.3656207366984994, + "grad_norm": 0.35486936569213867, + "learning_rate": 2.5784890397192395e-06, + "loss": 0.1312, + "step": 867 + }, + { + "epoch": 2.368349249658936, + "grad_norm": 0.35350891947746277, + "learning_rate": 2.55721487528978e-06, + "loss": 0.1319, + "step": 868 + }, + { + "epoch": 2.3710777626193726, + "grad_norm": 0.35865113139152527, + "learning_rate": 2.5360159633670456e-06, + "loss": 0.1305, + "step": 869 + }, + { + "epoch": 2.373806275579809, + "grad_norm": 0.3628556430339813, + "learning_rate": 2.514892518288988e-06, + "loss": 0.1299, + "step": 870 + }, + { + "epoch": 2.3765347885402455, + "grad_norm": 0.34977447986602783, + "learning_rate": 2.4938447536305243e-06, + "loss": 0.1311, + "step": 871 + }, + { + "epoch": 2.379263301500682, + "grad_norm": 0.3562906086444855, + "learning_rate": 2.4728728822013683e-06, + "loss": 0.1299, + "step": 872 + }, + { + "epoch": 2.3819918144611187, + "grad_norm": 0.3648015558719635, + "learning_rate": 2.451977116043911e-06, + "loss": 0.1311, + "step": 873 + }, + { + "epoch": 2.3847203274215554, + "grad_norm": 0.3597666025161743, + "learning_rate": 2.431157666431052e-06, + "loss": 0.1307, + "step": 874 + }, + { + "epoch": 2.3874488403819916, + "grad_norm": 0.35988011956214905, + "learning_rate": 2.410414743864059e-06, + "loss": 0.1317, + "step": 875 + }, + { + "epoch": 2.390177353342428, + "grad_norm": 0.37350034713745117, + "learning_rate": 2.3897485580704684e-06, + "loss": 0.1307, + "step": 876 + }, + { + "epoch": 2.392905866302865, + "grad_norm": 0.36840546131134033, + "learning_rate": 2.369159318001937e-06, + "loss": 0.1318, + "step": 877 + }, + { + "epoch": 2.3956343792633015, + "grad_norm": 
0.36420130729675293, + "learning_rate": 2.348647231832131e-06, + "loss": 0.1304, + "step": 878 + }, + { + "epoch": 2.398362892223738, + "grad_norm": 0.3545297086238861, + "learning_rate": 2.3282125069546437e-06, + "loss": 0.1269, + "step": 879 + }, + { + "epoch": 2.4010914051841747, + "grad_norm": 0.36249154806137085, + "learning_rate": 2.30785534998088e-06, + "loss": 0.133, + "step": 880 + }, + { + "epoch": 2.4038199181446114, + "grad_norm": 0.3670268952846527, + "learning_rate": 2.2875759667379616e-06, + "loss": 0.1292, + "step": 881 + }, + { + "epoch": 2.4065484311050476, + "grad_norm": 0.36384454369544983, + "learning_rate": 2.267374562266662e-06, + "loss": 0.1285, + "step": 882 + }, + { + "epoch": 2.409276944065484, + "grad_norm": 0.35740095376968384, + "learning_rate": 2.2472513408193385e-06, + "loss": 0.1305, + "step": 883 + }, + { + "epoch": 2.412005457025921, + "grad_norm": 0.3598315119743347, + "learning_rate": 2.227206505857834e-06, + "loss": 0.1319, + "step": 884 + }, + { + "epoch": 2.4147339699863575, + "grad_norm": 0.36150842905044556, + "learning_rate": 2.207240260051453e-06, + "loss": 0.1325, + "step": 885 + }, + { + "epoch": 2.417462482946794, + "grad_norm": 0.3574983477592468, + "learning_rate": 2.1873528052749094e-06, + "loss": 0.131, + "step": 886 + }, + { + "epoch": 2.4201909959072307, + "grad_norm": 0.35541966557502747, + "learning_rate": 2.167544342606256e-06, + "loss": 0.1276, + "step": 887 + }, + { + "epoch": 2.422919508867667, + "grad_norm": 0.3616240322589874, + "learning_rate": 2.147815072324886e-06, + "loss": 0.1328, + "step": 888 + }, + { + "epoch": 2.4256480218281036, + "grad_norm": 0.35617539286613464, + "learning_rate": 2.1281651939094996e-06, + "loss": 0.1289, + "step": 889 + }, + { + "epoch": 2.42837653478854, + "grad_norm": 0.3464270830154419, + "learning_rate": 2.1085949060360654e-06, + "loss": 0.1298, + "step": 890 + }, + { + "epoch": 2.431105047748977, + "grad_norm": 0.35398560762405396, + "learning_rate": 2.089104406575837e-06, + "loss": 0.1334, + "step": 891 + }, + { + "epoch": 2.4338335607094135, + "grad_norm": 0.36852145195007324, + "learning_rate": 2.0696938925933505e-06, + "loss": 0.1304, + "step": 892 + }, + { + "epoch": 2.43656207366985, + "grad_norm": 0.3582163155078888, + "learning_rate": 2.0503635603444094e-06, + "loss": 0.1307, + "step": 893 + }, + { + "epoch": 2.4392905866302863, + "grad_norm": 0.3476228713989258, + "learning_rate": 2.0311136052741274e-06, + "loss": 0.1265, + "step": 894 + }, + { + "epoch": 2.442019099590723, + "grad_norm": 0.3634602129459381, + "learning_rate": 2.0119442220149356e-06, + "loss": 0.1327, + "step": 895 + }, + { + "epoch": 2.4447476125511596, + "grad_norm": 0.363454133272171, + "learning_rate": 1.9928556043846215e-06, + "loss": 0.134, + "step": 896 + }, + { + "epoch": 2.447476125511596, + "grad_norm": 0.3611748516559601, + "learning_rate": 1.9738479453843685e-06, + "loss": 0.1305, + "step": 897 + }, + { + "epoch": 2.450204638472033, + "grad_norm": 0.34824034571647644, + "learning_rate": 1.9549214371968008e-06, + "loss": 0.1285, + "step": 898 + }, + { + "epoch": 2.4529331514324695, + "grad_norm": 0.3596573770046234, + "learning_rate": 1.936076271184044e-06, + "loss": 0.1281, + "step": 899 + }, + { + "epoch": 2.4556616643929057, + "grad_norm": 0.35721272230148315, + "learning_rate": 1.917312637885791e-06, + "loss": 0.1305, + "step": 900 + }, + { + "epoch": 2.4583901773533423, + "grad_norm": 0.36631426215171814, + "learning_rate": 1.898630727017371e-06, + "loss": 0.134, + "step": 901 + }, + { + "epoch": 
2.461118690313779, + "grad_norm": 0.36680275201797485, + "learning_rate": 1.8800307274678364e-06, + "loss": 0.1325, + "step": 902 + }, + { + "epoch": 2.4638472032742156, + "grad_norm": 0.3748106062412262, + "learning_rate": 1.861512827298051e-06, + "loss": 0.1309, + "step": 903 + }, + { + "epoch": 2.466575716234652, + "grad_norm": 0.37186112999916077, + "learning_rate": 1.8430772137387853e-06, + "loss": 0.1333, + "step": 904 + }, + { + "epoch": 2.469304229195089, + "grad_norm": 0.36231905221939087, + "learning_rate": 1.8247240731888293e-06, + "loss": 0.13, + "step": 905 + }, + { + "epoch": 2.472032742155525, + "grad_norm": 0.35124829411506653, + "learning_rate": 1.8064535912131032e-06, + "loss": 0.1274, + "step": 906 + }, + { + "epoch": 2.4747612551159617, + "grad_norm": 0.35125380754470825, + "learning_rate": 1.7882659525407842e-06, + "loss": 0.1278, + "step": 907 + }, + { + "epoch": 2.4774897680763983, + "grad_norm": 0.3610575497150421, + "learning_rate": 1.7701613410634367e-06, + "loss": 0.1288, + "step": 908 + }, + { + "epoch": 2.480218281036835, + "grad_norm": 0.35818085074424744, + "learning_rate": 1.752139939833154e-06, + "loss": 0.1305, + "step": 909 + }, + { + "epoch": 2.4829467939972716, + "grad_norm": 0.36310091614723206, + "learning_rate": 1.7342019310607062e-06, + "loss": 0.1288, + "step": 910 + }, + { + "epoch": 2.485675306957708, + "grad_norm": 0.36673158407211304, + "learning_rate": 1.7163474961137029e-06, + "loss": 0.1307, + "step": 911 + }, + { + "epoch": 2.488403819918145, + "grad_norm": 0.3627110719680786, + "learning_rate": 1.6985768155147498e-06, + "loss": 0.1296, + "step": 912 + }, + { + "epoch": 2.491132332878581, + "grad_norm": 0.3594669997692108, + "learning_rate": 1.6808900689396334e-06, + "loss": 0.1281, + "step": 913 + }, + { + "epoch": 2.4938608458390177, + "grad_norm": 0.35554230213165283, + "learning_rate": 1.6632874352154982e-06, + "loss": 0.1273, + "step": 914 + }, + { + "epoch": 2.4965893587994543, + "grad_norm": 0.3572724163532257, + "learning_rate": 1.645769092319045e-06, + "loss": 0.1299, + "step": 915 + }, + { + "epoch": 2.499317871759891, + "grad_norm": 0.3588331639766693, + "learning_rate": 1.6283352173747148e-06, + "loss": 0.1308, + "step": 916 + }, + { + "epoch": 2.5020463847203276, + "grad_norm": 0.36220690608024597, + "learning_rate": 1.6109859866529253e-06, + "loss": 0.1297, + "step": 917 + }, + { + "epoch": 2.504774897680764, + "grad_norm": 0.35867708921432495, + "learning_rate": 1.5937215755682667e-06, + "loss": 0.1298, + "step": 918 + }, + { + "epoch": 2.5075034106412004, + "grad_norm": 0.3584694266319275, + "learning_rate": 1.5765421586777285e-06, + "loss": 0.1293, + "step": 919 + }, + { + "epoch": 2.510231923601637, + "grad_norm": 0.3651615381240845, + "learning_rate": 1.559447909678954e-06, + "loss": 0.1301, + "step": 920 + }, + { + "epoch": 2.5129604365620737, + "grad_norm": 0.3454124331474304, + "learning_rate": 1.5424390014084644e-06, + "loss": 0.1261, + "step": 921 + }, + { + "epoch": 2.5156889495225103, + "grad_norm": 0.36464953422546387, + "learning_rate": 1.5255156058399124e-06, + "loss": 0.1309, + "step": 922 + }, + { + "epoch": 2.518417462482947, + "grad_norm": 0.3527681827545166, + "learning_rate": 1.5086778940823544e-06, + "loss": 0.1271, + "step": 923 + }, + { + "epoch": 2.5211459754433836, + "grad_norm": 0.3712303042411804, + "learning_rate": 1.4919260363785215e-06, + "loss": 0.1311, + "step": 924 + }, + { + "epoch": 2.52387448840382, + "grad_norm": 0.3462398052215576, + "learning_rate": 1.4752602021030794e-06, + "loss": 
0.1274, + "step": 925 + }, + { + "epoch": 2.5266030013642564, + "grad_norm": 0.36644744873046875, + "learning_rate": 1.4586805597609333e-06, + "loss": 0.1292, + "step": 926 + }, + { + "epoch": 2.529331514324693, + "grad_norm": 0.3596111536026001, + "learning_rate": 1.4421872769855262e-06, + "loss": 0.1311, + "step": 927 + }, + { + "epoch": 2.5320600272851297, + "grad_norm": 0.3601299822330475, + "learning_rate": 1.4257805205371233e-06, + "loss": 0.1306, + "step": 928 + }, + { + "epoch": 2.5347885402455663, + "grad_norm": 0.3638714551925659, + "learning_rate": 1.409460456301147e-06, + "loss": 0.1295, + "step": 929 + }, + { + "epoch": 2.5375170532060025, + "grad_norm": 0.3720818758010864, + "learning_rate": 1.3932272492864984e-06, + "loss": 0.1281, + "step": 930 + }, + { + "epoch": 2.540245566166439, + "grad_norm": 0.3580470085144043, + "learning_rate": 1.3770810636238685e-06, + "loss": 0.1282, + "step": 931 + }, + { + "epoch": 2.542974079126876, + "grad_norm": 0.3591982126235962, + "learning_rate": 1.3610220625641002e-06, + "loss": 0.1292, + "step": 932 + }, + { + "epoch": 2.5457025920873124, + "grad_norm": 0.3552996516227722, + "learning_rate": 1.3450504084765381e-06, + "loss": 0.1294, + "step": 933 + }, + { + "epoch": 2.548431105047749, + "grad_norm": 0.3630991578102112, + "learning_rate": 1.3291662628473634e-06, + "loss": 0.1296, + "step": 934 + }, + { + "epoch": 2.5511596180081857, + "grad_norm": 0.34676593542099, + "learning_rate": 1.313369786277987e-06, + "loss": 0.1281, + "step": 935 + }, + { + "epoch": 2.5538881309686223, + "grad_norm": 0.35284459590911865, + "learning_rate": 1.2976611384834148e-06, + "loss": 0.1285, + "step": 936 + }, + { + "epoch": 2.5566166439290585, + "grad_norm": 0.3661840856075287, + "learning_rate": 1.2820404782906315e-06, + "loss": 0.1304, + "step": 937 + }, + { + "epoch": 2.559345156889495, + "grad_norm": 0.34972235560417175, + "learning_rate": 1.266507963636997e-06, + "loss": 0.1268, + "step": 938 + }, + { + "epoch": 2.562073669849932, + "grad_norm": 0.34828442335128784, + "learning_rate": 1.2510637515686497e-06, + "loss": 0.1252, + "step": 939 + }, + { + "epoch": 2.5648021828103684, + "grad_norm": 0.3637445569038391, + "learning_rate": 1.2357079982389197e-06, + "loss": 0.1308, + "step": 940 + }, + { + "epoch": 2.567530695770805, + "grad_norm": 0.3512468934059143, + "learning_rate": 1.2204408589067462e-06, + "loss": 0.1287, + "step": 941 + }, + { + "epoch": 2.5702592087312413, + "grad_norm": 0.36545827984809875, + "learning_rate": 1.2052624879351105e-06, + "loss": 0.1306, + "step": 942 + }, + { + "epoch": 2.572987721691678, + "grad_norm": 0.3595888018608093, + "learning_rate": 1.190173038789476e-06, + "loss": 0.1291, + "step": 943 + }, + { + "epoch": 2.5757162346521145, + "grad_norm": 0.34953707456588745, + "learning_rate": 1.175172664036235e-06, + "loss": 0.1271, + "step": 944 + }, + { + "epoch": 2.578444747612551, + "grad_norm": 0.35142770409584045, + "learning_rate": 1.1602615153411666e-06, + "loss": 0.1296, + "step": 945 + }, + { + "epoch": 2.581173260572988, + "grad_norm": 0.3567049205303192, + "learning_rate": 1.1454397434679022e-06, + "loss": 0.1268, + "step": 946 + }, + { + "epoch": 2.5839017735334244, + "grad_norm": 0.3721398413181305, + "learning_rate": 1.1307074982764022e-06, + "loss": 0.1334, + "step": 947 + }, + { + "epoch": 2.586630286493861, + "grad_norm": 0.36237263679504395, + "learning_rate": 1.116064928721442e-06, + "loss": 0.1299, + "step": 948 + }, + { + "epoch": 2.5893587994542973, + "grad_norm": 0.356036514043808, + "learning_rate": 
1.1015121828511033e-06, + "loss": 0.13, + "step": 949 + }, + { + "epoch": 2.592087312414734, + "grad_norm": 0.36223796010017395, + "learning_rate": 1.0870494078052796e-06, + "loss": 0.1302, + "step": 950 + }, + { + "epoch": 2.5948158253751705, + "grad_norm": 0.3553941547870636, + "learning_rate": 1.0726767498141877e-06, + "loss": 0.1283, + "step": 951 + }, + { + "epoch": 2.597544338335607, + "grad_norm": 0.3495614230632782, + "learning_rate": 1.0583943541968856e-06, + "loss": 0.1285, + "step": 952 + }, + { + "epoch": 2.600272851296044, + "grad_norm": 0.3539895713329315, + "learning_rate": 1.044202365359811e-06, + "loss": 0.1288, + "step": 953 + }, + { + "epoch": 2.60300136425648, + "grad_norm": 0.35513320565223694, + "learning_rate": 1.0301009267953145e-06, + "loss": 0.1281, + "step": 954 + }, + { + "epoch": 2.6057298772169166, + "grad_norm": 0.36777645349502563, + "learning_rate": 1.0160901810802114e-06, + "loss": 0.131, + "step": 955 + }, + { + "epoch": 2.6084583901773533, + "grad_norm": 0.3543359637260437, + "learning_rate": 1.0021702698743408e-06, + "loss": 0.126, + "step": 956 + }, + { + "epoch": 2.61118690313779, + "grad_norm": 0.35314980149269104, + "learning_rate": 9.883413339191295e-07, + "loss": 0.1289, + "step": 957 + }, + { + "epoch": 2.6139154160982265, + "grad_norm": 0.36196306347846985, + "learning_rate": 9.746035130361741e-07, + "loss": 0.13, + "step": 958 + }, + { + "epoch": 2.616643929058663, + "grad_norm": 0.3647453486919403, + "learning_rate": 9.609569461258262e-07, + "loss": 0.1306, + "step": 959 + }, + { + "epoch": 2.6193724420191, + "grad_norm": 0.35779696702957153, + "learning_rate": 9.474017711657835e-07, + "loss": 0.1284, + "step": 960 + }, + { + "epoch": 2.622100954979536, + "grad_norm": 0.36195996403694153, + "learning_rate": 9.339381252097001e-07, + "loss": 0.1313, + "step": 961 + }, + { + "epoch": 2.6248294679399726, + "grad_norm": 0.3700987696647644, + "learning_rate": 9.205661443857994e-07, + "loss": 0.1299, + "step": 962 + }, + { + "epoch": 2.6275579809004093, + "grad_norm": 0.35640883445739746, + "learning_rate": 9.072859638954956e-07, + "loss": 0.1281, + "step": 963 + }, + { + "epoch": 2.630286493860846, + "grad_norm": 0.35907936096191406, + "learning_rate": 8.940977180120247e-07, + "loss": 0.1272, + "step": 964 + }, + { + "epoch": 2.6330150068212825, + "grad_norm": 0.3632573187351227, + "learning_rate": 8.810015400790994e-07, + "loss": 0.1308, + "step": 965 + }, + { + "epoch": 2.6357435197817187, + "grad_norm": 0.3489466607570648, + "learning_rate": 8.67997562509546e-07, + "loss": 0.1283, + "step": 966 + }, + { + "epoch": 2.6384720327421554, + "grad_norm": 0.35598084330558777, + "learning_rate": 8.550859167839665e-07, + "loss": 0.1292, + "step": 967 + }, + { + "epoch": 2.641200545702592, + "grad_norm": 0.3620053827762604, + "learning_rate": 8.42266733449425e-07, + "loss": 0.1278, + "step": 968 + }, + { + "epoch": 2.6439290586630286, + "grad_norm": 0.36157307028770447, + "learning_rate": 8.295401421181126e-07, + "loss": 0.13, + "step": 969 + }, + { + "epoch": 2.6466575716234653, + "grad_norm": 0.3444899916648865, + "learning_rate": 8.169062714660347e-07, + "loss": 0.127, + "step": 970 + }, + { + "epoch": 2.649386084583902, + "grad_norm": 0.35807591676712036, + "learning_rate": 8.043652492317256e-07, + "loss": 0.1297, + "step": 971 + }, + { + "epoch": 2.6521145975443385, + "grad_norm": 0.3608627915382385, + "learning_rate": 7.919172022149458e-07, + "loss": 0.1281, + "step": 972 + }, + { + "epoch": 2.6548431105047747, + "grad_norm": 0.35188964009284973, + 
"learning_rate": 7.795622562753957e-07, + "loss": 0.1272, + "step": 973 + }, + { + "epoch": 2.6575716234652114, + "grad_norm": 0.3593100309371948, + "learning_rate": 7.673005363314578e-07, + "loss": 0.1302, + "step": 974 + }, + { + "epoch": 2.660300136425648, + "grad_norm": 0.35982316732406616, + "learning_rate": 7.551321663589229e-07, + "loss": 0.1304, + "step": 975 + }, + { + "epoch": 2.6630286493860846, + "grad_norm": 0.3450589179992676, + "learning_rate": 7.430572693897342e-07, + "loss": 0.128, + "step": 976 + }, + { + "epoch": 2.6657571623465213, + "grad_norm": 0.34911608695983887, + "learning_rate": 7.310759675107515e-07, + "loss": 0.1282, + "step": 977 + }, + { + "epoch": 2.6684856753069575, + "grad_norm": 0.36377087235450745, + "learning_rate": 7.19188381862519e-07, + "loss": 0.1338, + "step": 978 + }, + { + "epoch": 2.6712141882673945, + "grad_norm": 0.34635117650032043, + "learning_rate": 7.073946326380243e-07, + "loss": 0.1276, + "step": 979 + }, + { + "epoch": 2.6739427012278307, + "grad_norm": 0.3502279818058014, + "learning_rate": 6.956948390814977e-07, + "loss": 0.1291, + "step": 980 + }, + { + "epoch": 2.6766712141882674, + "grad_norm": 0.36715587973594666, + "learning_rate": 6.840891194872112e-07, + "loss": 0.1344, + "step": 981 + }, + { + "epoch": 2.679399727148704, + "grad_norm": 0.35529640316963196, + "learning_rate": 6.725775911982602e-07, + "loss": 0.1302, + "step": 982 + }, + { + "epoch": 2.6821282401091406, + "grad_norm": 0.35221484303474426, + "learning_rate": 6.61160370605397e-07, + "loss": 0.1265, + "step": 983 + }, + { + "epoch": 2.6848567530695773, + "grad_norm": 0.34295523166656494, + "learning_rate": 6.498375731458529e-07, + "loss": 0.1275, + "step": 984 + }, + { + "epoch": 2.6875852660300135, + "grad_norm": 0.36550015211105347, + "learning_rate": 6.386093133021554e-07, + "loss": 0.1277, + "step": 985 + }, + { + "epoch": 2.69031377899045, + "grad_norm": 0.3569300174713135, + "learning_rate": 6.274757046009871e-07, + "loss": 0.1281, + "step": 986 + }, + { + "epoch": 2.6930422919508867, + "grad_norm": 0.35022974014282227, + "learning_rate": 6.164368596120351e-07, + "loss": 0.1263, + "step": 987 + }, + { + "epoch": 2.6957708049113234, + "grad_norm": 0.35484209656715393, + "learning_rate": 6.054928899468427e-07, + "loss": 0.1278, + "step": 988 + }, + { + "epoch": 2.69849931787176, + "grad_norm": 0.3532489836215973, + "learning_rate": 5.946439062576903e-07, + "loss": 0.1284, + "step": 989 + }, + { + "epoch": 2.701227830832196, + "grad_norm": 0.3485071659088135, + "learning_rate": 5.83890018236476e-07, + "loss": 0.1272, + "step": 990 + }, + { + "epoch": 2.7039563437926333, + "grad_norm": 0.3542632758617401, + "learning_rate": 5.732313346136032e-07, + "loss": 0.1265, + "step": 991 + }, + { + "epoch": 2.7066848567530695, + "grad_norm": 0.3596004843711853, + "learning_rate": 5.626679631568832e-07, + "loss": 0.1282, + "step": 992 + }, + { + "epoch": 2.709413369713506, + "grad_norm": 0.36018607020378113, + "learning_rate": 5.52200010670444e-07, + "loss": 0.1277, + "step": 993 + }, + { + "epoch": 2.7121418826739427, + "grad_norm": 0.35304713249206543, + "learning_rate": 5.418275829936537e-07, + "loss": 0.1307, + "step": 994 + }, + { + "epoch": 2.7148703956343794, + "grad_norm": 0.3582627475261688, + "learning_rate": 5.315507850000456e-07, + "loss": 0.1284, + "step": 995 + }, + { + "epoch": 2.717598908594816, + "grad_norm": 0.35284796357154846, + "learning_rate": 5.213697205962631e-07, + "loss": 0.1273, + "step": 996 + }, + { + "epoch": 2.720327421555252, + "grad_norm": 
0.35549217462539673, + "learning_rate": 5.112844927210048e-07, + "loss": 0.1283, + "step": 997 + }, + { + "epoch": 2.723055934515689, + "grad_norm": 0.35494813323020935, + "learning_rate": 5.012952033439844e-07, + "loss": 0.1243, + "step": 998 + }, + { + "epoch": 2.7257844474761255, + "grad_norm": 0.3586711585521698, + "learning_rate": 4.914019534649039e-07, + "loss": 0.1304, + "step": 999 + }, + { + "epoch": 2.728512960436562, + "grad_norm": 0.3513246774673462, + "learning_rate": 4.816048431124265e-07, + "loss": 0.1262, + "step": 1000 + }, + { + "epoch": 2.7312414733969987, + "grad_norm": 0.36048534512519836, + "learning_rate": 4.7190397134316946e-07, + "loss": 0.1298, + "step": 1001 + }, + { + "epoch": 2.733969986357435, + "grad_norm": 0.3561542332172394, + "learning_rate": 4.6229943624069963e-07, + "loss": 0.1304, + "step": 1002 + }, + { + "epoch": 2.736698499317872, + "grad_norm": 0.35859954357147217, + "learning_rate": 4.5279133491454406e-07, + "loss": 0.129, + "step": 1003 + }, + { + "epoch": 2.739427012278308, + "grad_norm": 0.35953471064567566, + "learning_rate": 4.4337976349920763e-07, + "loss": 0.1294, + "step": 1004 + }, + { + "epoch": 2.742155525238745, + "grad_norm": 0.3629186451435089, + "learning_rate": 4.3406481715319916e-07, + "loss": 0.1305, + "step": 1005 + }, + { + "epoch": 2.7448840381991815, + "grad_norm": 0.36121776700019836, + "learning_rate": 4.248465900580734e-07, + "loss": 0.1299, + "step": 1006 + }, + { + "epoch": 2.747612551159618, + "grad_norm": 0.3531065583229065, + "learning_rate": 4.1572517541747294e-07, + "loss": 0.1304, + "step": 1007 + }, + { + "epoch": 2.7503410641200547, + "grad_norm": 0.35587796568870544, + "learning_rate": 4.0670066545619224e-07, + "loss": 0.1289, + "step": 1008 + }, + { + "epoch": 2.753069577080491, + "grad_norm": 0.35462117195129395, + "learning_rate": 3.9777315141923847e-07, + "loss": 0.1286, + "step": 1009 + }, + { + "epoch": 2.7557980900409276, + "grad_norm": 0.34857916831970215, + "learning_rate": 3.889427235709153e-07, + "loss": 0.1282, + "step": 1010 + }, + { + "epoch": 2.758526603001364, + "grad_norm": 0.35415270924568176, + "learning_rate": 3.802094711939075e-07, + "loss": 0.1275, + "step": 1011 + }, + { + "epoch": 2.761255115961801, + "grad_norm": 0.37276458740234375, + "learning_rate": 3.715734825883766e-07, + "loss": 0.1338, + "step": 1012 + }, + { + "epoch": 2.7639836289222375, + "grad_norm": 0.3584205210208893, + "learning_rate": 3.6303484507106965e-07, + "loss": 0.1276, + "step": 1013 + }, + { + "epoch": 2.7667121418826737, + "grad_norm": 0.360146164894104, + "learning_rate": 3.5459364497443696e-07, + "loss": 0.1275, + "step": 1014 + }, + { + "epoch": 2.7694406548431107, + "grad_norm": 0.3654356896877289, + "learning_rate": 3.462499676457598e-07, + "loss": 0.1277, + "step": 1015 + }, + { + "epoch": 2.772169167803547, + "grad_norm": 0.3602968752384186, + "learning_rate": 3.38003897446284e-07, + "loss": 0.1319, + "step": 1016 + }, + { + "epoch": 2.7748976807639836, + "grad_norm": 0.3531845808029175, + "learning_rate": 3.298555177503726e-07, + "loss": 0.1311, + "step": 1017 + }, + { + "epoch": 2.77762619372442, + "grad_norm": 0.35511648654937744, + "learning_rate": 3.2180491094465414e-07, + "loss": 0.1292, + "step": 1018 + }, + { + "epoch": 2.780354706684857, + "grad_norm": 0.35548272728919983, + "learning_rate": 3.138521584272003e-07, + "loss": 0.1299, + "step": 1019 + }, + { + "epoch": 2.7830832196452935, + "grad_norm": 0.36467936635017395, + "learning_rate": 3.059973406066963e-07, + "loss": 0.1318, + "step": 1020 + }, 
+ { + "epoch": 2.7858117326057297, + "grad_norm": 0.36441510915756226, + "learning_rate": 2.982405369016272e-07, + "loss": 0.1305, + "step": 1021 + }, + { + "epoch": 2.7885402455661663, + "grad_norm": 0.35912269353866577, + "learning_rate": 2.905818257394799e-07, + "loss": 0.1266, + "step": 1022 + }, + { + "epoch": 2.791268758526603, + "grad_norm": 0.3510645925998688, + "learning_rate": 2.830212845559466e-07, + "loss": 0.1292, + "step": 1023 + }, + { + "epoch": 2.7939972714870396, + "grad_norm": 0.38425615429878235, + "learning_rate": 2.7555898979413796e-07, + "loss": 0.1261, + "step": 1024 + }, + { + "epoch": 2.796725784447476, + "grad_norm": 0.36103349924087524, + "learning_rate": 2.6819501690382275e-07, + "loss": 0.131, + "step": 1025 + }, + { + "epoch": 2.799454297407913, + "grad_norm": 0.3564034104347229, + "learning_rate": 2.609294403406537e-07, + "loss": 0.1285, + "step": 1026 + }, + { + "epoch": 2.8021828103683495, + "grad_norm": 0.35618945956230164, + "learning_rate": 2.537623335654127e-07, + "loss": 0.1303, + "step": 1027 + }, + { + "epoch": 2.8049113233287857, + "grad_norm": 0.3517012298107147, + "learning_rate": 2.4669376904328244e-07, + "loss": 0.128, + "step": 1028 + }, + { + "epoch": 2.8076398362892223, + "grad_norm": 0.35787534713745117, + "learning_rate": 2.397238182430994e-07, + "loss": 0.1287, + "step": 1029 + }, + { + "epoch": 2.810368349249659, + "grad_norm": 0.3706185221672058, + "learning_rate": 2.3285255163663535e-07, + "loss": 0.1316, + "step": 1030 + }, + { + "epoch": 2.8130968622100956, + "grad_norm": 0.3420504331588745, + "learning_rate": 2.2608003869788786e-07, + "loss": 0.1248, + "step": 1031 + }, + { + "epoch": 2.815825375170532, + "grad_norm": 0.3569451868534088, + "learning_rate": 2.1940634790238003e-07, + "loss": 0.1279, + "step": 1032 + }, + { + "epoch": 2.8185538881309684, + "grad_norm": 0.3604009747505188, + "learning_rate": 2.1283154672645522e-07, + "loss": 0.1291, + "step": 1033 + }, + { + "epoch": 2.821282401091405, + "grad_norm": 0.3657495975494385, + "learning_rate": 2.063557016466111e-07, + "loss": 0.1295, + "step": 1034 + }, + { + "epoch": 2.8240109140518417, + "grad_norm": 0.3558668792247772, + "learning_rate": 1.999788781388201e-07, + "loss": 0.1268, + "step": 1035 + }, + { + "epoch": 2.8267394270122783, + "grad_norm": 0.36042869091033936, + "learning_rate": 1.9370114067785995e-07, + "loss": 0.129, + "step": 1036 + }, + { + "epoch": 2.829467939972715, + "grad_norm": 0.3488910496234894, + "learning_rate": 1.8752255273667752e-07, + "loss": 0.1274, + "step": 1037 + }, + { + "epoch": 2.8321964529331516, + "grad_norm": 0.35946106910705566, + "learning_rate": 1.8144317678573497e-07, + "loss": 0.1276, + "step": 1038 + }, + { + "epoch": 2.8349249658935882, + "grad_norm": 0.35392606258392334, + "learning_rate": 1.7546307429238129e-07, + "loss": 0.1289, + "step": 1039 + }, + { + "epoch": 2.8376534788540244, + "grad_norm": 0.35671889781951904, + "learning_rate": 1.6958230572023504e-07, + "loss": 0.1288, + "step": 1040 + }, + { + "epoch": 2.840381991814461, + "grad_norm": 0.3586263656616211, + "learning_rate": 1.6380093052856482e-07, + "loss": 0.1332, + "step": 1041 + }, + { + "epoch": 2.8431105047748977, + "grad_norm": 0.34798717498779297, + "learning_rate": 1.5811900717169537e-07, + "loss": 0.128, + "step": 1042 + }, + { + "epoch": 2.8458390177353343, + "grad_norm": 0.369842529296875, + "learning_rate": 1.5253659309841463e-07, + "loss": 0.1337, + "step": 1043 + }, + { + "epoch": 2.848567530695771, + "grad_norm": 0.3586486577987671, + "learning_rate": 
1.4705374475138978e-07, + "loss": 0.1281, + "step": 1044 + }, + { + "epoch": 2.851296043656207, + "grad_norm": 0.3524888753890991, + "learning_rate": 1.416705175666e-07, + "loss": 0.1283, + "step": 1045 + }, + { + "epoch": 2.854024556616644, + "grad_norm": 0.3938431739807129, + "learning_rate": 1.3638696597277678e-07, + "loss": 0.1287, + "step": 1046 + }, + { + "epoch": 2.8567530695770804, + "grad_norm": 0.3598809540271759, + "learning_rate": 1.3120314339084782e-07, + "loss": 0.1289, + "step": 1047 + }, + { + "epoch": 2.859481582537517, + "grad_norm": 0.3557843565940857, + "learning_rate": 1.2611910223340408e-07, + "loss": 0.1287, + "step": 1048 + }, + { + "epoch": 2.8622100954979537, + "grad_norm": 0.3524629473686218, + "learning_rate": 1.2113489390416565e-07, + "loss": 0.1274, + "step": 1049 + }, + { + "epoch": 2.8649386084583903, + "grad_norm": 0.36476588249206543, + "learning_rate": 1.1625056879746133e-07, + "loss": 0.1313, + "step": 1050 + }, + { + "epoch": 2.867667121418827, + "grad_norm": 0.3563539683818817, + "learning_rate": 1.1146617629772316e-07, + "loss": 0.128, + "step": 1051 + }, + { + "epoch": 2.870395634379263, + "grad_norm": 0.36203011870384216, + "learning_rate": 1.0678176477898372e-07, + "loss": 0.1299, + "step": 1052 + }, + { + "epoch": 2.8731241473397, + "grad_norm": 0.3595244288444519, + "learning_rate": 1.0219738160438753e-07, + "loss": 0.1296, + "step": 1053 + }, + { + "epoch": 2.8758526603001364, + "grad_norm": 0.3583170473575592, + "learning_rate": 9.771307312571254e-08, + "loss": 0.1283, + "step": 1054 + }, + { + "epoch": 2.878581173260573, + "grad_norm": 0.3593871593475342, + "learning_rate": 9.332888468290168e-08, + "loss": 0.1298, + "step": 1055 + }, + { + "epoch": 2.8813096862210097, + "grad_norm": 0.3553028702735901, + "learning_rate": 8.90448606036054e-08, + "loss": 0.1286, + "step": 1056 + }, + { + "epoch": 2.884038199181446, + "grad_norm": 0.34894633293151855, + "learning_rate": 8.486104420272979e-08, + "loss": 0.1249, + "step": 1057 + }, + { + "epoch": 2.8867667121418825, + "grad_norm": 0.35654279589653015, + "learning_rate": 8.077747778200474e-08, + "loss": 0.1293, + "step": 1058 + }, + { + "epoch": 2.889495225102319, + "grad_norm": 0.3499641716480255, + "learning_rate": 7.679420262954984e-08, + "loss": 0.1293, + "step": 1059 + }, + { + "epoch": 2.892223738062756, + "grad_norm": 0.3609579801559448, + "learning_rate": 7.291125901946027e-08, + "loss": 0.1303, + "step": 1060 + }, + { + "epoch": 2.8949522510231924, + "grad_norm": 0.3622332215309143, + "learning_rate": 6.912868621140045e-08, + "loss": 0.1294, + "step": 1061 + }, + { + "epoch": 2.897680763983629, + "grad_norm": 0.348332941532135, + "learning_rate": 6.544652245020433e-08, + "loss": 0.1281, + "step": 1062 + }, + { + "epoch": 2.9004092769440657, + "grad_norm": 0.3568393588066101, + "learning_rate": 6.18648049654913e-08, + "loss": 0.1284, + "step": 1063 + }, + { + "epoch": 2.903137789904502, + "grad_norm": 0.3721940815448761, + "learning_rate": 5.838356997128869e-08, + "loss": 0.1287, + "step": 1064 + }, + { + "epoch": 2.9058663028649385, + "grad_norm": 0.3597653806209564, + "learning_rate": 5.500285266566319e-08, + "loss": 0.1314, + "step": 1065 + }, + { + "epoch": 2.908594815825375, + "grad_norm": 0.364513635635376, + "learning_rate": 5.1722687230369995e-08, + "loss": 0.1294, + "step": 1066 + }, + { + "epoch": 2.911323328785812, + "grad_norm": 0.36251965165138245, + "learning_rate": 4.854310683050312e-08, + "loss": 0.1288, + "step": 1067 + }, + { + "epoch": 2.9140518417462484, + "grad_norm": 
0.35791829228401184, + "learning_rate": 4.5464143614162294e-08, + "loss": 0.1305, + "step": 1068 + }, + { + "epoch": 2.9167803547066846, + "grad_norm": 0.347622275352478, + "learning_rate": 4.2485828712126584e-08, + "loss": 0.1281, + "step": 1069 + }, + { + "epoch": 2.9195088676671213, + "grad_norm": 0.35234710574150085, + "learning_rate": 3.96081922375402e-08, + "loss": 0.1269, + "step": 1070 + }, + { + "epoch": 2.922237380627558, + "grad_norm": 0.3629789352416992, + "learning_rate": 3.683126328560826e-08, + "loss": 0.1298, + "step": 1071 + }, + { + "epoch": 2.9249658935879945, + "grad_norm": 0.35642895102500916, + "learning_rate": 3.4155069933301535e-08, + "loss": 0.1292, + "step": 1072 + }, + { + "epoch": 2.927694406548431, + "grad_norm": 0.3609231114387512, + "learning_rate": 3.1579639239074364e-08, + "loss": 0.131, + "step": 1073 + }, + { + "epoch": 2.930422919508868, + "grad_norm": 0.36312758922576904, + "learning_rate": 2.9104997242590528e-08, + "loss": 0.1284, + "step": 1074 + }, + { + "epoch": 2.9331514324693044, + "grad_norm": 0.36244162917137146, + "learning_rate": 2.673116896445671e-08, + "loss": 0.1286, + "step": 1075 + }, + { + "epoch": 2.9358799454297406, + "grad_norm": 0.35929951071739197, + "learning_rate": 2.4458178405974974e-08, + "loss": 0.13, + "step": 1076 + }, + { + "epoch": 2.9386084583901773, + "grad_norm": 0.3554922342300415, + "learning_rate": 2.2286048548897378e-08, + "loss": 0.1286, + "step": 1077 + }, + { + "epoch": 2.941336971350614, + "grad_norm": 0.3549968898296356, + "learning_rate": 2.0214801355192826e-08, + "loss": 0.1286, + "step": 1078 + }, + { + "epoch": 2.9440654843110505, + "grad_norm": 0.34757283329963684, + "learning_rate": 1.824445776682504e-08, + "loss": 0.127, + "step": 1079 + }, + { + "epoch": 2.946793997271487, + "grad_norm": 0.36166948080062866, + "learning_rate": 1.6375037705543827e-08, + "loss": 0.129, + "step": 1080 + }, + { + "epoch": 2.9495225102319234, + "grad_norm": 0.3565935492515564, + "learning_rate": 1.4606560072679687e-08, + "loss": 0.1273, + "step": 1081 + }, + { + "epoch": 2.9522510231923604, + "grad_norm": 0.3589683473110199, + "learning_rate": 1.2939042748955078e-08, + "loss": 0.1284, + "step": 1082 + }, + { + "epoch": 2.9549795361527966, + "grad_norm": 0.35582536458969116, + "learning_rate": 1.1372502594303448e-08, + "loss": 0.1289, + "step": 1083 + }, + { + "epoch": 2.9577080491132333, + "grad_norm": 0.3483884930610657, + "learning_rate": 9.906955447697153e-09, + "loss": 0.1262, + "step": 1084 + }, + { + "epoch": 2.96043656207367, + "grad_norm": 0.35305240750312805, + "learning_rate": 8.542416126989805e-09, + "loss": 0.1257, + "step": 1085 + }, + { + "epoch": 2.9631650750341065, + "grad_norm": 0.3558148145675659, + "learning_rate": 7.278898428764169e-09, + "loss": 0.1287, + "step": 1086 + }, + { + "epoch": 2.965893587994543, + "grad_norm": 0.35575419664382935, + "learning_rate": 6.1164151281944974e-09, + "loss": 0.1281, + "step": 1087 + }, + { + "epoch": 2.9686221009549794, + "grad_norm": 0.3598300814628601, + "learning_rate": 5.054977978916631e-09, + "loss": 0.1282, + "step": 1088 + }, + { + "epoch": 2.971350613915416, + "grad_norm": 0.35684746503829956, + "learning_rate": 4.094597712908099e-09, + "loss": 0.1276, + "step": 1089 + }, + { + "epoch": 2.9740791268758526, + "grad_norm": 0.3472525477409363, + "learning_rate": 3.2352840403804264e-09, + "loss": 0.1277, + "step": 1090 + }, + { + "epoch": 2.9768076398362893, + "grad_norm": 0.3728668689727783, + "learning_rate": 2.477045649681431e-09, + "loss": 0.1335, + "step": 1091 
+ }, + { + "epoch": 2.979536152796726, + "grad_norm": 0.3551958203315735, + "learning_rate": 1.8198902072097402e-09, + "loss": 0.1292, + "step": 1092 + }, + { + "epoch": 2.982264665757162, + "grad_norm": 0.3531115651130676, + "learning_rate": 1.2638243573293019e-09, + "loss": 0.1297, + "step": 1093 + }, + { + "epoch": 2.984993178717599, + "grad_norm": 0.3493654727935791, + "learning_rate": 8.088537223116533e-10, + "loss": 0.1267, + "step": 1094 + }, + { + "epoch": 2.9877216916780354, + "grad_norm": 0.3511788547039032, + "learning_rate": 4.549829022748586e-10, + "loss": 0.1254, + "step": 1095 + }, + { + "epoch": 2.990450204638472, + "grad_norm": 0.36361077427864075, + "learning_rate": 2.02215475132439e-10, + "loss": 0.1318, + "step": 1096 + }, + { + "epoch": 2.9931787175989086, + "grad_norm": 0.36402377486228943, + "learning_rate": 5.0553996568947216e-11, + "loss": 0.1314, + "step": 1097 + }, + { + "epoch": 2.9959072305593453, + "grad_norm": 0.3575246036052704, + "learning_rate": 0.0, + "loss": 0.1289, + "step": 1098 + }, + { + "epoch": 2.9959072305593453, + "step": 1098, + "total_flos": 3.496351998494638e+18, + "train_loss": 0.2091554065246834, + "train_runtime": 8168.8424, + "train_samples_per_second": 17.224, + "train_steps_per_second": 0.134 + } + ], + "logging_steps": 1, + "max_steps": 1098, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 999999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.496351998494638e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}