diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15575 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9997747240369452, + "eval_steps": 500, + "global_step": 2219, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004505519261094841, + "grad_norm": 26.28510414143239, + "learning_rate": 0.0, + "loss": 2.0443, + "step": 1 + }, + { + "epoch": 0.0009011038522189682, + "grad_norm": 8.222095044715916, + "learning_rate": 2.2106472945750374e-06, + "loss": 1.8172, + "step": 2 + }, + { + "epoch": 0.0013516557783284523, + "grad_norm": 8.58152531674897, + "learning_rate": 3.5037930642221103e-06, + "loss": 1.8449, + "step": 3 + }, + { + "epoch": 0.0018022077044379365, + "grad_norm": 7.344261765285006, + "learning_rate": 4.421294589150075e-06, + "loss": 1.7721, + "step": 4 + }, + { + "epoch": 0.0022527596305474207, + "grad_norm": 6.058933858290448, + "learning_rate": 5.132964061160519e-06, + "loss": 1.6875, + "step": 5 + }, + { + "epoch": 0.0027033115566569045, + "grad_norm": 6.31581635219131, + "learning_rate": 5.714440358797148e-06, + "loss": 1.6827, + "step": 6 + }, + { + "epoch": 0.0031538634827663887, + "grad_norm": 4.974134087549639, + "learning_rate": 6.2060715633585576e-06, + "loss": 1.5961, + "step": 7 + }, + { + "epoch": 0.003604415408875873, + "grad_norm": 3.6875663810117874, + "learning_rate": 6.631941883725112e-06, + "loss": 1.5536, + "step": 8 + }, + { + "epoch": 0.004054967334985357, + "grad_norm": 4.4525551165323165, + "learning_rate": 7.007586128444221e-06, + "loss": 1.5576, + "step": 9 + }, + { + "epoch": 0.004505519261094841, + "grad_norm": 3.059175705061217, + "learning_rate": 7.343611355735557e-06, + "loss": 1.5529, + "step": 10 + }, + { + "epoch": 0.004956071187204325, + "grad_norm": 3.226233366612977, + "learning_rate": 7.647583148507886e-06, + "loss": 1.5459, + "step": 11 + }, + { + "epoch": 0.005406623113313809, + "grad_norm": 2.9477028598078077, + "learning_rate": 7.925087653372185e-06, + "loss": 1.4634, + "step": 12 + }, + { + "epoch": 0.005857175039423294, + "grad_norm": 2.720800057963264, + "learning_rate": 8.180367051646621e-06, + "loss": 1.5685, + "step": 13 + }, + { + "epoch": 0.0063077269655327775, + "grad_norm": 2.9145535800847022, + "learning_rate": 8.416718857933595e-06, + "loss": 1.4726, + "step": 14 + }, + { + "epoch": 0.006758278891642262, + "grad_norm": 3.1869297889189068, + "learning_rate": 8.636757125382629e-06, + "loss": 1.5608, + "step": 15 + }, + { + "epoch": 0.007208830817751746, + "grad_norm": 2.729040726576262, + "learning_rate": 8.84258917830015e-06, + "loss": 1.4937, + "step": 16 + }, + { + "epoch": 0.00765938274386123, + "grad_norm": 2.597750532724102, + "learning_rate": 9.03593867168606e-06, + "loss": 1.4803, + "step": 17 + }, + { + "epoch": 0.008109934669970714, + "grad_norm": 2.775091573536583, + "learning_rate": 9.218233423019256e-06, + "loss": 1.5027, + "step": 18 + }, + { + "epoch": 0.008560486596080198, + "grad_norm": 2.5089161733494807, + "learning_rate": 9.390669465144928e-06, + "loss": 1.5034, + "step": 19 + }, + { + "epoch": 0.009011038522189683, + "grad_norm": 2.690280429996506, + "learning_rate": 9.554258650310593e-06, + "loss": 1.4345, + "step": 20 + }, + { + "epoch": 0.009461590448299166, + "grad_norm": 2.8272167884838284, + "learning_rate": 9.709864627580668e-06, + "loss": 1.4647, + "step": 21 + }, + { + "epoch": 0.00991214237440865, + 
"grad_norm": 2.378855815301042, + "learning_rate": 9.858230443082923e-06, + "loss": 1.4172, + "step": 22 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 2.3805829415787887, + "learning_rate": 1e-05, + "loss": 1.4745, + "step": 23 + }, + { + "epoch": 0.010813246226627618, + "grad_norm": 2.386709390440998, + "learning_rate": 1e-05, + "loss": 1.4517, + "step": 24 + }, + { + "epoch": 0.011263798152737103, + "grad_norm": 2.518404890144143, + "learning_rate": 1e-05, + "loss": 1.4705, + "step": 25 + }, + { + "epoch": 0.011714350078846587, + "grad_norm": 2.5205110824340933, + "learning_rate": 1e-05, + "loss": 1.4409, + "step": 26 + }, + { + "epoch": 0.012164902004956072, + "grad_norm": 2.608849272769937, + "learning_rate": 1e-05, + "loss": 1.4649, + "step": 27 + }, + { + "epoch": 0.012615453931065555, + "grad_norm": 2.724386963149695, + "learning_rate": 1e-05, + "loss": 1.4349, + "step": 28 + }, + { + "epoch": 0.01306600585717504, + "grad_norm": 2.396900395164592, + "learning_rate": 1e-05, + "loss": 1.4657, + "step": 29 + }, + { + "epoch": 0.013516557783284524, + "grad_norm": 2.301795861679911, + "learning_rate": 1e-05, + "loss": 1.4494, + "step": 30 + }, + { + "epoch": 0.013967109709394007, + "grad_norm": 2.4681646640580177, + "learning_rate": 1e-05, + "loss": 1.3577, + "step": 31 + }, + { + "epoch": 0.014417661635503492, + "grad_norm": 2.2370170283767967, + "learning_rate": 1e-05, + "loss": 1.4006, + "step": 32 + }, + { + "epoch": 0.014868213561612977, + "grad_norm": 2.3618258090169224, + "learning_rate": 1e-05, + "loss": 1.4705, + "step": 33 + }, + { + "epoch": 0.01531876548772246, + "grad_norm": 2.264234680658778, + "learning_rate": 1e-05, + "loss": 1.4492, + "step": 34 + }, + { + "epoch": 0.015769317413831946, + "grad_norm": 2.6366014899470094, + "learning_rate": 1e-05, + "loss": 1.4061, + "step": 35 + }, + { + "epoch": 0.016219869339941427, + "grad_norm": 2.4059100692684936, + "learning_rate": 1e-05, + "loss": 1.4546, + "step": 36 + }, + { + "epoch": 0.01667042126605091, + "grad_norm": 2.351443360609274, + "learning_rate": 1e-05, + "loss": 1.4255, + "step": 37 + }, + { + "epoch": 0.017120973192160396, + "grad_norm": 2.2251584813515963, + "learning_rate": 1e-05, + "loss": 1.4403, + "step": 38 + }, + { + "epoch": 0.01757152511826988, + "grad_norm": 2.4666722222794655, + "learning_rate": 1e-05, + "loss": 1.4411, + "step": 39 + }, + { + "epoch": 0.018022077044379366, + "grad_norm": 2.4349781065931113, + "learning_rate": 1e-05, + "loss": 1.4366, + "step": 40 + }, + { + "epoch": 0.01847262897048885, + "grad_norm": 2.6644037834229235, + "learning_rate": 1e-05, + "loss": 1.4366, + "step": 41 + }, + { + "epoch": 0.01892318089659833, + "grad_norm": 2.3668664318822477, + "learning_rate": 1e-05, + "loss": 1.3755, + "step": 42 + }, + { + "epoch": 0.019373732822707816, + "grad_norm": 2.289846872939494, + "learning_rate": 1e-05, + "loss": 1.383, + "step": 43 + }, + { + "epoch": 0.0198242847488173, + "grad_norm": 2.3829712616178944, + "learning_rate": 1e-05, + "loss": 1.3999, + "step": 44 + }, + { + "epoch": 0.020274836674926786, + "grad_norm": 2.2758409182803225, + "learning_rate": 1e-05, + "loss": 1.413, + "step": 45 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 2.6859081702334326, + "learning_rate": 1e-05, + "loss": 1.4004, + "step": 46 + }, + { + "epoch": 0.021175940527145755, + "grad_norm": 2.6459078901377158, + "learning_rate": 1e-05, + "loss": 1.3606, + "step": 47 + }, + { + "epoch": 0.021626492453255236, + "grad_norm": 2.30611696528225, + "learning_rate": 1e-05, + "loss": 1.4375, 
+ "step": 48 + }, + { + "epoch": 0.02207704437936472, + "grad_norm": 2.289938283664605, + "learning_rate": 1e-05, + "loss": 1.4189, + "step": 49 + }, + { + "epoch": 0.022527596305474205, + "grad_norm": 2.276199761500501, + "learning_rate": 1e-05, + "loss": 1.4109, + "step": 50 + }, + { + "epoch": 0.02297814823158369, + "grad_norm": 2.452090967696992, + "learning_rate": 1e-05, + "loss": 1.3708, + "step": 51 + }, + { + "epoch": 0.023428700157693175, + "grad_norm": 2.2807167446012806, + "learning_rate": 1e-05, + "loss": 1.3852, + "step": 52 + }, + { + "epoch": 0.02387925208380266, + "grad_norm": 2.334407850543259, + "learning_rate": 1e-05, + "loss": 1.4386, + "step": 53 + }, + { + "epoch": 0.024329804009912144, + "grad_norm": 2.5155186958910916, + "learning_rate": 1e-05, + "loss": 1.3805, + "step": 54 + }, + { + "epoch": 0.024780355936021625, + "grad_norm": 2.3936403403814515, + "learning_rate": 1e-05, + "loss": 1.3909, + "step": 55 + }, + { + "epoch": 0.02523090786213111, + "grad_norm": 2.4199229822775976, + "learning_rate": 1e-05, + "loss": 1.3892, + "step": 56 + }, + { + "epoch": 0.025681459788240595, + "grad_norm": 2.3730631304687138, + "learning_rate": 1e-05, + "loss": 1.4074, + "step": 57 + }, + { + "epoch": 0.02613201171435008, + "grad_norm": 2.2334835941143365, + "learning_rate": 1e-05, + "loss": 1.3509, + "step": 58 + }, + { + "epoch": 0.026582563640459564, + "grad_norm": 2.374174087054882, + "learning_rate": 1e-05, + "loss": 1.4301, + "step": 59 + }, + { + "epoch": 0.02703311556656905, + "grad_norm": 2.321815293727669, + "learning_rate": 1e-05, + "loss": 1.3976, + "step": 60 + }, + { + "epoch": 0.02748366749267853, + "grad_norm": 2.504847959880212, + "learning_rate": 1e-05, + "loss": 1.3677, + "step": 61 + }, + { + "epoch": 0.027934219418788014, + "grad_norm": 2.2418023299937198, + "learning_rate": 1e-05, + "loss": 1.3904, + "step": 62 + }, + { + "epoch": 0.0283847713448975, + "grad_norm": 2.2804423991950484, + "learning_rate": 1e-05, + "loss": 1.3717, + "step": 63 + }, + { + "epoch": 0.028835323271006984, + "grad_norm": 2.219781167888153, + "learning_rate": 1e-05, + "loss": 1.3992, + "step": 64 + }, + { + "epoch": 0.02928587519711647, + "grad_norm": 2.3330607239335737, + "learning_rate": 1e-05, + "loss": 1.4061, + "step": 65 + }, + { + "epoch": 0.029736427123225953, + "grad_norm": 2.4560005590707696, + "learning_rate": 1e-05, + "loss": 1.3819, + "step": 66 + }, + { + "epoch": 0.030186979049335434, + "grad_norm": 2.1318116937147025, + "learning_rate": 1e-05, + "loss": 1.3692, + "step": 67 + }, + { + "epoch": 0.03063753097544492, + "grad_norm": 2.2285877498888333, + "learning_rate": 1e-05, + "loss": 1.4126, + "step": 68 + }, + { + "epoch": 0.031088082901554404, + "grad_norm": 2.343917803713368, + "learning_rate": 1e-05, + "loss": 1.3298, + "step": 69 + }, + { + "epoch": 0.03153863482766389, + "grad_norm": 2.242840254335559, + "learning_rate": 1e-05, + "loss": 1.3887, + "step": 70 + }, + { + "epoch": 0.03198918675377337, + "grad_norm": 2.2775060074244853, + "learning_rate": 1e-05, + "loss": 1.3772, + "step": 71 + }, + { + "epoch": 0.032439738679882854, + "grad_norm": 2.0385163756818803, + "learning_rate": 1e-05, + "loss": 1.3449, + "step": 72 + }, + { + "epoch": 0.03289029060599234, + "grad_norm": 2.5082054302710666, + "learning_rate": 1e-05, + "loss": 1.3692, + "step": 73 + }, + { + "epoch": 0.03334084253210182, + "grad_norm": 2.600727120152394, + "learning_rate": 1e-05, + "loss": 1.4177, + "step": 74 + }, + { + "epoch": 0.03379139445821131, + "grad_norm": 2.2862002649869884, + 
"learning_rate": 1e-05, + "loss": 1.3511, + "step": 75 + }, + { + "epoch": 0.03424194638432079, + "grad_norm": 2.427908692557385, + "learning_rate": 1e-05, + "loss": 1.3903, + "step": 76 + }, + { + "epoch": 0.034692498310430274, + "grad_norm": 2.1941151719137033, + "learning_rate": 1e-05, + "loss": 1.3511, + "step": 77 + }, + { + "epoch": 0.03514305023653976, + "grad_norm": 2.030520849428415, + "learning_rate": 1e-05, + "loss": 1.3776, + "step": 78 + }, + { + "epoch": 0.03559360216264924, + "grad_norm": 2.375592708504076, + "learning_rate": 1e-05, + "loss": 1.3724, + "step": 79 + }, + { + "epoch": 0.03604415408875873, + "grad_norm": 2.2108433561033864, + "learning_rate": 1e-05, + "loss": 1.3972, + "step": 80 + }, + { + "epoch": 0.03649470601486821, + "grad_norm": 2.4539765524786334, + "learning_rate": 1e-05, + "loss": 1.3872, + "step": 81 + }, + { + "epoch": 0.0369452579409777, + "grad_norm": 2.2671405110557306, + "learning_rate": 1e-05, + "loss": 1.3848, + "step": 82 + }, + { + "epoch": 0.03739580986708718, + "grad_norm": 2.4849840452865544, + "learning_rate": 1e-05, + "loss": 1.3752, + "step": 83 + }, + { + "epoch": 0.03784636179319666, + "grad_norm": 2.172247922858377, + "learning_rate": 1e-05, + "loss": 1.3554, + "step": 84 + }, + { + "epoch": 0.03829691371930615, + "grad_norm": 2.1670772679155914, + "learning_rate": 1e-05, + "loss": 1.3743, + "step": 85 + }, + { + "epoch": 0.03874746564541563, + "grad_norm": 2.191176057713791, + "learning_rate": 1e-05, + "loss": 1.3902, + "step": 86 + }, + { + "epoch": 0.03919801757152512, + "grad_norm": 2.334687357837179, + "learning_rate": 1e-05, + "loss": 1.3638, + "step": 87 + }, + { + "epoch": 0.0396485694976346, + "grad_norm": 2.0920629315766246, + "learning_rate": 1e-05, + "loss": 1.3687, + "step": 88 + }, + { + "epoch": 0.04009912142374409, + "grad_norm": 2.2442009548950725, + "learning_rate": 1e-05, + "loss": 1.3735, + "step": 89 + }, + { + "epoch": 0.04054967334985357, + "grad_norm": 2.4644253798107654, + "learning_rate": 1e-05, + "loss": 1.3352, + "step": 90 + }, + { + "epoch": 0.04100022527596305, + "grad_norm": 3.1495591252992865, + "learning_rate": 1e-05, + "loss": 1.3492, + "step": 91 + }, + { + "epoch": 0.04145077720207254, + "grad_norm": 2.2158388490503933, + "learning_rate": 1e-05, + "loss": 1.391, + "step": 92 + }, + { + "epoch": 0.04190132912818202, + "grad_norm": 2.2643615549031253, + "learning_rate": 1e-05, + "loss": 1.3922, + "step": 93 + }, + { + "epoch": 0.04235188105429151, + "grad_norm": 2.277253984187307, + "learning_rate": 1e-05, + "loss": 1.3235, + "step": 94 + }, + { + "epoch": 0.04280243298040099, + "grad_norm": 2.4652877710696646, + "learning_rate": 1e-05, + "loss": 1.3388, + "step": 95 + }, + { + "epoch": 0.04325298490651047, + "grad_norm": 2.3757029746456726, + "learning_rate": 1e-05, + "loss": 1.393, + "step": 96 + }, + { + "epoch": 0.04370353683261996, + "grad_norm": 2.5231305121620835, + "learning_rate": 1e-05, + "loss": 1.3897, + "step": 97 + }, + { + "epoch": 0.04415408875872944, + "grad_norm": 2.1369778554629395, + "learning_rate": 1e-05, + "loss": 1.3642, + "step": 98 + }, + { + "epoch": 0.04460464068483893, + "grad_norm": 2.1260402398799743, + "learning_rate": 1e-05, + "loss": 1.3484, + "step": 99 + }, + { + "epoch": 0.04505519261094841, + "grad_norm": 2.309709263259357, + "learning_rate": 1e-05, + "loss": 1.3736, + "step": 100 + }, + { + "epoch": 0.0455057445370579, + "grad_norm": 2.1011110408842506, + "learning_rate": 1e-05, + "loss": 1.3783, + "step": 101 + }, + { + "epoch": 0.04595629646316738, + 
"grad_norm": 2.1914831067805336, + "learning_rate": 1e-05, + "loss": 1.3469, + "step": 102 + }, + { + "epoch": 0.04640684838927686, + "grad_norm": 2.241251638945157, + "learning_rate": 1e-05, + "loss": 1.3201, + "step": 103 + }, + { + "epoch": 0.04685740031538635, + "grad_norm": 2.209731187051675, + "learning_rate": 1e-05, + "loss": 1.343, + "step": 104 + }, + { + "epoch": 0.04730795224149583, + "grad_norm": 2.322197381391765, + "learning_rate": 1e-05, + "loss": 1.3629, + "step": 105 + }, + { + "epoch": 0.04775850416760532, + "grad_norm": 2.5964458053354282, + "learning_rate": 1e-05, + "loss": 1.3011, + "step": 106 + }, + { + "epoch": 0.0482090560937148, + "grad_norm": 2.3635572590606606, + "learning_rate": 1e-05, + "loss": 1.3734, + "step": 107 + }, + { + "epoch": 0.04865960801982429, + "grad_norm": 2.3162924552707094, + "learning_rate": 1e-05, + "loss": 1.3296, + "step": 108 + }, + { + "epoch": 0.04911015994593377, + "grad_norm": 2.280106792732187, + "learning_rate": 1e-05, + "loss": 1.386, + "step": 109 + }, + { + "epoch": 0.04956071187204325, + "grad_norm": 2.24924470423268, + "learning_rate": 1e-05, + "loss": 1.3415, + "step": 110 + }, + { + "epoch": 0.05001126379815274, + "grad_norm": 2.267342003393844, + "learning_rate": 1e-05, + "loss": 1.2756, + "step": 111 + }, + { + "epoch": 0.05046181572426222, + "grad_norm": 2.1193683158088987, + "learning_rate": 1e-05, + "loss": 1.3389, + "step": 112 + }, + { + "epoch": 0.05091236765037171, + "grad_norm": 2.187500036093264, + "learning_rate": 1e-05, + "loss": 1.3599, + "step": 113 + }, + { + "epoch": 0.05136291957648119, + "grad_norm": 2.2362627224448626, + "learning_rate": 1e-05, + "loss": 1.346, + "step": 114 + }, + { + "epoch": 0.05181347150259067, + "grad_norm": 2.189663703662425, + "learning_rate": 1e-05, + "loss": 1.3714, + "step": 115 + }, + { + "epoch": 0.05226402342870016, + "grad_norm": 2.300810784853383, + "learning_rate": 1e-05, + "loss": 1.3997, + "step": 116 + }, + { + "epoch": 0.05271457535480964, + "grad_norm": 2.3150676660413927, + "learning_rate": 1e-05, + "loss": 1.3265, + "step": 117 + }, + { + "epoch": 0.05316512728091913, + "grad_norm": 2.2842180322320575, + "learning_rate": 1e-05, + "loss": 1.37, + "step": 118 + }, + { + "epoch": 0.05361567920702861, + "grad_norm": 2.2297837308028696, + "learning_rate": 1e-05, + "loss": 1.4089, + "step": 119 + }, + { + "epoch": 0.0540662311331381, + "grad_norm": 2.402291874295927, + "learning_rate": 1e-05, + "loss": 1.4087, + "step": 120 + }, + { + "epoch": 0.05451678305924758, + "grad_norm": 2.290491505586159, + "learning_rate": 1e-05, + "loss": 1.3614, + "step": 121 + }, + { + "epoch": 0.05496733498535706, + "grad_norm": 2.065772896787654, + "learning_rate": 1e-05, + "loss": 1.3346, + "step": 122 + }, + { + "epoch": 0.05541788691146655, + "grad_norm": 2.370429850220542, + "learning_rate": 1e-05, + "loss": 1.3106, + "step": 123 + }, + { + "epoch": 0.05586843883757603, + "grad_norm": 2.2132653252379053, + "learning_rate": 1e-05, + "loss": 1.3197, + "step": 124 + }, + { + "epoch": 0.05631899076368552, + "grad_norm": 2.154390296688665, + "learning_rate": 1e-05, + "loss": 1.3043, + "step": 125 + }, + { + "epoch": 0.056769542689795, + "grad_norm": 2.304743722679258, + "learning_rate": 1e-05, + "loss": 1.3378, + "step": 126 + }, + { + "epoch": 0.057220094615904486, + "grad_norm": 1.9985607835453456, + "learning_rate": 1e-05, + "loss": 1.3635, + "step": 127 + }, + { + "epoch": 0.05767064654201397, + "grad_norm": 2.3799647674283366, + "learning_rate": 1e-05, + "loss": 1.391, + "step": 128 + }, 
+ { + "epoch": 0.05812119846812345, + "grad_norm": 2.2473413860736375, + "learning_rate": 1e-05, + "loss": 1.351, + "step": 129 + }, + { + "epoch": 0.05857175039423294, + "grad_norm": 2.377688590024442, + "learning_rate": 1e-05, + "loss": 1.3401, + "step": 130 + }, + { + "epoch": 0.05902230232034242, + "grad_norm": 2.2073273079626836, + "learning_rate": 1e-05, + "loss": 1.3573, + "step": 131 + }, + { + "epoch": 0.059472854246451906, + "grad_norm": 2.190356460206687, + "learning_rate": 1e-05, + "loss": 1.3501, + "step": 132 + }, + { + "epoch": 0.05992340617256139, + "grad_norm": 2.0280828601230962, + "learning_rate": 1e-05, + "loss": 1.2865, + "step": 133 + }, + { + "epoch": 0.06037395809867087, + "grad_norm": 2.3849567762471136, + "learning_rate": 1e-05, + "loss": 1.3469, + "step": 134 + }, + { + "epoch": 0.06082451002478036, + "grad_norm": 2.183864843860172, + "learning_rate": 1e-05, + "loss": 1.2862, + "step": 135 + }, + { + "epoch": 0.06127506195088984, + "grad_norm": 2.1465217395570586, + "learning_rate": 1e-05, + "loss": 1.3769, + "step": 136 + }, + { + "epoch": 0.061725613876999326, + "grad_norm": 2.2304904790498217, + "learning_rate": 1e-05, + "loss": 1.3463, + "step": 137 + }, + { + "epoch": 0.06217616580310881, + "grad_norm": 2.245862844678027, + "learning_rate": 1e-05, + "loss": 1.3316, + "step": 138 + }, + { + "epoch": 0.0626267177292183, + "grad_norm": 2.0441903472041445, + "learning_rate": 1e-05, + "loss": 1.3554, + "step": 139 + }, + { + "epoch": 0.06307726965532778, + "grad_norm": 2.1541155197686557, + "learning_rate": 1e-05, + "loss": 1.3408, + "step": 140 + }, + { + "epoch": 0.06352782158143726, + "grad_norm": 2.103704298075221, + "learning_rate": 1e-05, + "loss": 1.3109, + "step": 141 + }, + { + "epoch": 0.06397837350754675, + "grad_norm": 2.143436032315168, + "learning_rate": 1e-05, + "loss": 1.344, + "step": 142 + }, + { + "epoch": 0.06442892543365623, + "grad_norm": 1.9869633772847095, + "learning_rate": 1e-05, + "loss": 1.3568, + "step": 143 + }, + { + "epoch": 0.06487947735976571, + "grad_norm": 2.0902245114076834, + "learning_rate": 1e-05, + "loss": 1.3089, + "step": 144 + }, + { + "epoch": 0.0653300292858752, + "grad_norm": 2.3943683613913453, + "learning_rate": 1e-05, + "loss": 1.3242, + "step": 145 + }, + { + "epoch": 0.06578058121198468, + "grad_norm": 2.0650060989119723, + "learning_rate": 1e-05, + "loss": 1.3943, + "step": 146 + }, + { + "epoch": 0.06623113313809416, + "grad_norm": 2.209294484404208, + "learning_rate": 1e-05, + "loss": 1.3244, + "step": 147 + }, + { + "epoch": 0.06668168506420365, + "grad_norm": 2.2369927550510846, + "learning_rate": 1e-05, + "loss": 1.3697, + "step": 148 + }, + { + "epoch": 0.06713223699031313, + "grad_norm": 2.0944369232164486, + "learning_rate": 1e-05, + "loss": 1.3656, + "step": 149 + }, + { + "epoch": 0.06758278891642262, + "grad_norm": 2.938063368949922, + "learning_rate": 1e-05, + "loss": 1.3448, + "step": 150 + }, + { + "epoch": 0.0680333408425321, + "grad_norm": 2.2018520024011092, + "learning_rate": 1e-05, + "loss": 1.3635, + "step": 151 + }, + { + "epoch": 0.06848389276864159, + "grad_norm": 2.088835022914261, + "learning_rate": 1e-05, + "loss": 1.3868, + "step": 152 + }, + { + "epoch": 0.06893444469475107, + "grad_norm": 2.27935479884058, + "learning_rate": 1e-05, + "loss": 1.3474, + "step": 153 + }, + { + "epoch": 0.06938499662086055, + "grad_norm": 2.1759713573337045, + "learning_rate": 1e-05, + "loss": 1.3319, + "step": 154 + }, + { + "epoch": 0.06983554854697004, + "grad_norm": 2.0986604325792273, + 
"learning_rate": 1e-05, + "loss": 1.3298, + "step": 155 + }, + { + "epoch": 0.07028610047307952, + "grad_norm": 2.0356679638693005, + "learning_rate": 1e-05, + "loss": 1.3449, + "step": 156 + }, + { + "epoch": 0.07073665239918901, + "grad_norm": 2.019473324752186, + "learning_rate": 1e-05, + "loss": 1.3056, + "step": 157 + }, + { + "epoch": 0.07118720432529849, + "grad_norm": 2.0150736309932267, + "learning_rate": 1e-05, + "loss": 1.3257, + "step": 158 + }, + { + "epoch": 0.07163775625140797, + "grad_norm": 2.0434827081661826, + "learning_rate": 1e-05, + "loss": 1.2815, + "step": 159 + }, + { + "epoch": 0.07208830817751746, + "grad_norm": 1.97423799741727, + "learning_rate": 1e-05, + "loss": 1.3446, + "step": 160 + }, + { + "epoch": 0.07253886010362694, + "grad_norm": 2.086993457301471, + "learning_rate": 1e-05, + "loss": 1.3419, + "step": 161 + }, + { + "epoch": 0.07298941202973643, + "grad_norm": 2.0583664808721416, + "learning_rate": 1e-05, + "loss": 1.3712, + "step": 162 + }, + { + "epoch": 0.07343996395584591, + "grad_norm": 1.972502617783117, + "learning_rate": 1e-05, + "loss": 1.297, + "step": 163 + }, + { + "epoch": 0.0738905158819554, + "grad_norm": 1.9403535984947597, + "learning_rate": 1e-05, + "loss": 1.2888, + "step": 164 + }, + { + "epoch": 0.07434106780806488, + "grad_norm": 2.0284573800273358, + "learning_rate": 1e-05, + "loss": 1.2949, + "step": 165 + }, + { + "epoch": 0.07479161973417436, + "grad_norm": 2.255410970147385, + "learning_rate": 1e-05, + "loss": 1.3494, + "step": 166 + }, + { + "epoch": 0.07524217166028385, + "grad_norm": 1.9373576070929324, + "learning_rate": 1e-05, + "loss": 1.3459, + "step": 167 + }, + { + "epoch": 0.07569272358639333, + "grad_norm": 2.0643336639670635, + "learning_rate": 1e-05, + "loss": 1.3367, + "step": 168 + }, + { + "epoch": 0.07614327551250281, + "grad_norm": 2.19869988638025, + "learning_rate": 1e-05, + "loss": 1.3395, + "step": 169 + }, + { + "epoch": 0.0765938274386123, + "grad_norm": 2.0235337047791413, + "learning_rate": 1e-05, + "loss": 1.2544, + "step": 170 + }, + { + "epoch": 0.07704437936472179, + "grad_norm": 2.0789686417964157, + "learning_rate": 1e-05, + "loss": 1.3147, + "step": 171 + }, + { + "epoch": 0.07749493129083126, + "grad_norm": 2.146030217715539, + "learning_rate": 1e-05, + "loss": 1.2945, + "step": 172 + }, + { + "epoch": 0.07794548321694075, + "grad_norm": 2.2702052358432585, + "learning_rate": 1e-05, + "loss": 1.3205, + "step": 173 + }, + { + "epoch": 0.07839603514305024, + "grad_norm": 2.074969369993277, + "learning_rate": 1e-05, + "loss": 1.3681, + "step": 174 + }, + { + "epoch": 0.07884658706915972, + "grad_norm": 2.0882868293617833, + "learning_rate": 1e-05, + "loss": 1.3324, + "step": 175 + }, + { + "epoch": 0.0792971389952692, + "grad_norm": 2.2428849524821275, + "learning_rate": 1e-05, + "loss": 1.3431, + "step": 176 + }, + { + "epoch": 0.07974769092137869, + "grad_norm": 1.9430898055672532, + "learning_rate": 1e-05, + "loss": 1.2792, + "step": 177 + }, + { + "epoch": 0.08019824284748818, + "grad_norm": 2.151887159652184, + "learning_rate": 1e-05, + "loss": 1.3077, + "step": 178 + }, + { + "epoch": 0.08064879477359765, + "grad_norm": 2.3642708132508754, + "learning_rate": 1e-05, + "loss": 1.3221, + "step": 179 + }, + { + "epoch": 0.08109934669970714, + "grad_norm": 2.0851971556207447, + "learning_rate": 1e-05, + "loss": 1.3298, + "step": 180 + }, + { + "epoch": 0.08154989862581663, + "grad_norm": 2.1372303530266765, + "learning_rate": 1e-05, + "loss": 1.2822, + "step": 181 + }, + { + "epoch": 
0.0820004505519261, + "grad_norm": 2.025927429224001, + "learning_rate": 1e-05, + "loss": 1.3042, + "step": 182 + }, + { + "epoch": 0.08245100247803559, + "grad_norm": 2.215614207426725, + "learning_rate": 1e-05, + "loss": 1.2804, + "step": 183 + }, + { + "epoch": 0.08290155440414508, + "grad_norm": 2.147530296171878, + "learning_rate": 1e-05, + "loss": 1.3324, + "step": 184 + }, + { + "epoch": 0.08335210633025456, + "grad_norm": 2.1468751331920455, + "learning_rate": 1e-05, + "loss": 1.3324, + "step": 185 + }, + { + "epoch": 0.08380265825636404, + "grad_norm": 1.8657617259301975, + "learning_rate": 1e-05, + "loss": 1.3313, + "step": 186 + }, + { + "epoch": 0.08425321018247353, + "grad_norm": 2.1810123393988707, + "learning_rate": 1e-05, + "loss": 1.3239, + "step": 187 + }, + { + "epoch": 0.08470376210858302, + "grad_norm": 2.26292004700728, + "learning_rate": 1e-05, + "loss": 1.3161, + "step": 188 + }, + { + "epoch": 0.0851543140346925, + "grad_norm": 2.2356541214457555, + "learning_rate": 1e-05, + "loss": 1.2959, + "step": 189 + }, + { + "epoch": 0.08560486596080198, + "grad_norm": 2.340511553816989, + "learning_rate": 1e-05, + "loss": 1.3412, + "step": 190 + }, + { + "epoch": 0.08605541788691147, + "grad_norm": 1.9727356523181143, + "learning_rate": 1e-05, + "loss": 1.2852, + "step": 191 + }, + { + "epoch": 0.08650596981302094, + "grad_norm": 1.9979926064436653, + "learning_rate": 1e-05, + "loss": 1.3461, + "step": 192 + }, + { + "epoch": 0.08695652173913043, + "grad_norm": 2.088054262265907, + "learning_rate": 1e-05, + "loss": 1.3649, + "step": 193 + }, + { + "epoch": 0.08740707366523992, + "grad_norm": 2.049726673942621, + "learning_rate": 1e-05, + "loss": 1.281, + "step": 194 + }, + { + "epoch": 0.08785762559134941, + "grad_norm": 2.2460539886132325, + "learning_rate": 1e-05, + "loss": 1.2916, + "step": 195 + }, + { + "epoch": 0.08830817751745888, + "grad_norm": 2.294882839143102, + "learning_rate": 1e-05, + "loss": 1.3023, + "step": 196 + }, + { + "epoch": 0.08875872944356837, + "grad_norm": 2.0105964141212778, + "learning_rate": 1e-05, + "loss": 1.2945, + "step": 197 + }, + { + "epoch": 0.08920928136967786, + "grad_norm": 2.1261627253589355, + "learning_rate": 1e-05, + "loss": 1.3162, + "step": 198 + }, + { + "epoch": 0.08965983329578733, + "grad_norm": 2.1558255460157207, + "learning_rate": 1e-05, + "loss": 1.3119, + "step": 199 + }, + { + "epoch": 0.09011038522189682, + "grad_norm": 2.3615824846619637, + "learning_rate": 1e-05, + "loss": 1.355, + "step": 200 + }, + { + "epoch": 0.09056093714800631, + "grad_norm": 2.1010711545597682, + "learning_rate": 1e-05, + "loss": 1.2998, + "step": 201 + }, + { + "epoch": 0.0910114890741158, + "grad_norm": 2.1120748106921043, + "learning_rate": 1e-05, + "loss": 1.25, + "step": 202 + }, + { + "epoch": 0.09146204100022527, + "grad_norm": 2.1820679615413705, + "learning_rate": 1e-05, + "loss": 1.266, + "step": 203 + }, + { + "epoch": 0.09191259292633476, + "grad_norm": 2.294622936183944, + "learning_rate": 1e-05, + "loss": 1.3235, + "step": 204 + }, + { + "epoch": 0.09236314485244425, + "grad_norm": 2.154727340275725, + "learning_rate": 1e-05, + "loss": 1.2828, + "step": 205 + }, + { + "epoch": 0.09281369677855372, + "grad_norm": 1.9902518644805207, + "learning_rate": 1e-05, + "loss": 1.3093, + "step": 206 + }, + { + "epoch": 0.09326424870466321, + "grad_norm": 2.1266591160221684, + "learning_rate": 1e-05, + "loss": 1.2783, + "step": 207 + }, + { + "epoch": 0.0937148006307727, + "grad_norm": 2.10569335618675, + "learning_rate": 1e-05, + "loss": 
1.3027, + "step": 208 + }, + { + "epoch": 0.09416535255688219, + "grad_norm": 2.0834789625484156, + "learning_rate": 1e-05, + "loss": 1.2972, + "step": 209 + }, + { + "epoch": 0.09461590448299166, + "grad_norm": 2.201298293871812, + "learning_rate": 1e-05, + "loss": 1.3269, + "step": 210 + }, + { + "epoch": 0.09506645640910115, + "grad_norm": 2.072133926573633, + "learning_rate": 1e-05, + "loss": 1.2411, + "step": 211 + }, + { + "epoch": 0.09551700833521064, + "grad_norm": 2.160638257283922, + "learning_rate": 1e-05, + "loss": 1.3115, + "step": 212 + }, + { + "epoch": 0.09596756026132011, + "grad_norm": 2.1756470916488544, + "learning_rate": 1e-05, + "loss": 1.3439, + "step": 213 + }, + { + "epoch": 0.0964181121874296, + "grad_norm": 1.9143146267256204, + "learning_rate": 1e-05, + "loss": 1.2647, + "step": 214 + }, + { + "epoch": 0.09686866411353909, + "grad_norm": 2.3050841717765578, + "learning_rate": 1e-05, + "loss": 1.3248, + "step": 215 + }, + { + "epoch": 0.09731921603964858, + "grad_norm": 2.4165019734863815, + "learning_rate": 1e-05, + "loss": 1.258, + "step": 216 + }, + { + "epoch": 0.09776976796575805, + "grad_norm": 2.2372311885078084, + "learning_rate": 1e-05, + "loss": 1.2905, + "step": 217 + }, + { + "epoch": 0.09822031989186754, + "grad_norm": 2.04193161203431, + "learning_rate": 1e-05, + "loss": 1.3026, + "step": 218 + }, + { + "epoch": 0.09867087181797703, + "grad_norm": 1.954613609808351, + "learning_rate": 1e-05, + "loss": 1.2869, + "step": 219 + }, + { + "epoch": 0.0991214237440865, + "grad_norm": 1.976265979622734, + "learning_rate": 1e-05, + "loss": 1.2519, + "step": 220 + }, + { + "epoch": 0.09957197567019599, + "grad_norm": 2.344636898145692, + "learning_rate": 1e-05, + "loss": 1.261, + "step": 221 + }, + { + "epoch": 0.10002252759630548, + "grad_norm": 2.079147567257483, + "learning_rate": 1e-05, + "loss": 1.3017, + "step": 222 + }, + { + "epoch": 0.10047307952241495, + "grad_norm": 1.9433808675623756, + "learning_rate": 1e-05, + "loss": 1.3133, + "step": 223 + }, + { + "epoch": 0.10092363144852444, + "grad_norm": 2.1198616346721937, + "learning_rate": 1e-05, + "loss": 1.3428, + "step": 224 + }, + { + "epoch": 0.10137418337463393, + "grad_norm": 2.2434673582395495, + "learning_rate": 1e-05, + "loss": 1.2892, + "step": 225 + }, + { + "epoch": 0.10182473530074342, + "grad_norm": 2.1693804368854783, + "learning_rate": 1e-05, + "loss": 1.3214, + "step": 226 + }, + { + "epoch": 0.10227528722685289, + "grad_norm": 2.1854057215784124, + "learning_rate": 1e-05, + "loss": 1.2451, + "step": 227 + }, + { + "epoch": 0.10272583915296238, + "grad_norm": 2.2579876090328095, + "learning_rate": 1e-05, + "loss": 1.283, + "step": 228 + }, + { + "epoch": 0.10317639107907187, + "grad_norm": 2.1231468309558963, + "learning_rate": 1e-05, + "loss": 1.355, + "step": 229 + }, + { + "epoch": 0.10362694300518134, + "grad_norm": 1.872151743679353, + "learning_rate": 1e-05, + "loss": 1.324, + "step": 230 + }, + { + "epoch": 0.10407749493129083, + "grad_norm": 2.22398898445514, + "learning_rate": 1e-05, + "loss": 1.328, + "step": 231 + }, + { + "epoch": 0.10452804685740032, + "grad_norm": 2.1507743930092404, + "learning_rate": 1e-05, + "loss": 1.3638, + "step": 232 + }, + { + "epoch": 0.1049785987835098, + "grad_norm": 2.2800866611460138, + "learning_rate": 1e-05, + "loss": 1.2881, + "step": 233 + }, + { + "epoch": 0.10542915070961928, + "grad_norm": 2.2289538826428816, + "learning_rate": 1e-05, + "loss": 1.2792, + "step": 234 + }, + { + "epoch": 0.10587970263572877, + "grad_norm": 
2.009099245481224, + "learning_rate": 1e-05, + "loss": 1.3218, + "step": 235 + }, + { + "epoch": 0.10633025456183826, + "grad_norm": 2.006015485548607, + "learning_rate": 1e-05, + "loss": 1.3089, + "step": 236 + }, + { + "epoch": 0.10678080648794773, + "grad_norm": 2.1255543388528726, + "learning_rate": 1e-05, + "loss": 1.3231, + "step": 237 + }, + { + "epoch": 0.10723135841405722, + "grad_norm": 2.173061330250712, + "learning_rate": 1e-05, + "loss": 1.2384, + "step": 238 + }, + { + "epoch": 0.1076819103401667, + "grad_norm": 2.2818871812577597, + "learning_rate": 1e-05, + "loss": 1.2709, + "step": 239 + }, + { + "epoch": 0.1081324622662762, + "grad_norm": 2.3229635064658707, + "learning_rate": 1e-05, + "loss": 1.2506, + "step": 240 + }, + { + "epoch": 0.10858301419238567, + "grad_norm": 2.0566676446728005, + "learning_rate": 1e-05, + "loss": 1.292, + "step": 241 + }, + { + "epoch": 0.10903356611849516, + "grad_norm": 2.155920765024828, + "learning_rate": 1e-05, + "loss": 1.3153, + "step": 242 + }, + { + "epoch": 0.10948411804460464, + "grad_norm": 2.0841140126822943, + "learning_rate": 1e-05, + "loss": 1.2806, + "step": 243 + }, + { + "epoch": 0.10993466997071412, + "grad_norm": 2.1393721399250687, + "learning_rate": 1e-05, + "loss": 1.3261, + "step": 244 + }, + { + "epoch": 0.11038522189682361, + "grad_norm": 2.1221868581545116, + "learning_rate": 1e-05, + "loss": 1.2892, + "step": 245 + }, + { + "epoch": 0.1108357738229331, + "grad_norm": 2.1755867538401263, + "learning_rate": 1e-05, + "loss": 1.324, + "step": 246 + }, + { + "epoch": 0.11128632574904258, + "grad_norm": 2.319444842074109, + "learning_rate": 1e-05, + "loss": 1.3057, + "step": 247 + }, + { + "epoch": 0.11173687767515206, + "grad_norm": 2.1103232906021083, + "learning_rate": 1e-05, + "loss": 1.322, + "step": 248 + }, + { + "epoch": 0.11218742960126155, + "grad_norm": 2.1680871784458136, + "learning_rate": 1e-05, + "loss": 1.3126, + "step": 249 + }, + { + "epoch": 0.11263798152737103, + "grad_norm": 1.9897648507815235, + "learning_rate": 1e-05, + "loss": 1.3266, + "step": 250 + }, + { + "epoch": 0.11308853345348051, + "grad_norm": 2.1452185458575572, + "learning_rate": 1e-05, + "loss": 1.2991, + "step": 251 + }, + { + "epoch": 0.11353908537959, + "grad_norm": 2.0639404649517687, + "learning_rate": 1e-05, + "loss": 1.3345, + "step": 252 + }, + { + "epoch": 0.11398963730569948, + "grad_norm": 2.2265474509862733, + "learning_rate": 1e-05, + "loss": 1.3338, + "step": 253 + }, + { + "epoch": 0.11444018923180897, + "grad_norm": 2.1158310661962862, + "learning_rate": 1e-05, + "loss": 1.2721, + "step": 254 + }, + { + "epoch": 0.11489074115791845, + "grad_norm": 2.150280623912654, + "learning_rate": 1e-05, + "loss": 1.3154, + "step": 255 + }, + { + "epoch": 0.11534129308402793, + "grad_norm": 2.0166941783038634, + "learning_rate": 1e-05, + "loss": 1.2752, + "step": 256 + }, + { + "epoch": 0.11579184501013742, + "grad_norm": 2.108035622623124, + "learning_rate": 1e-05, + "loss": 1.2931, + "step": 257 + }, + { + "epoch": 0.1162423969362469, + "grad_norm": 2.042157781896836, + "learning_rate": 1e-05, + "loss": 1.3069, + "step": 258 + }, + { + "epoch": 0.11669294886235639, + "grad_norm": 2.0251060091348783, + "learning_rate": 1e-05, + "loss": 1.2555, + "step": 259 + }, + { + "epoch": 0.11714350078846587, + "grad_norm": 2.295329853473669, + "learning_rate": 1e-05, + "loss": 1.3083, + "step": 260 + }, + { + "epoch": 0.11759405271457536, + "grad_norm": 2.0714716088770815, + "learning_rate": 1e-05, + "loss": 1.2937, + "step": 261 + }, + { + 
"epoch": 0.11804460464068484, + "grad_norm": 2.396354053446123, + "learning_rate": 1e-05, + "loss": 1.2919, + "step": 262 + }, + { + "epoch": 0.11849515656679432, + "grad_norm": 2.200235892723831, + "learning_rate": 1e-05, + "loss": 1.3423, + "step": 263 + }, + { + "epoch": 0.11894570849290381, + "grad_norm": 1.9819306761172546, + "learning_rate": 1e-05, + "loss": 1.3062, + "step": 264 + }, + { + "epoch": 0.11939626041901329, + "grad_norm": 1.9841435404879328, + "learning_rate": 1e-05, + "loss": 1.2746, + "step": 265 + }, + { + "epoch": 0.11984681234512277, + "grad_norm": 2.159956082135299, + "learning_rate": 1e-05, + "loss": 1.3104, + "step": 266 + }, + { + "epoch": 0.12029736427123226, + "grad_norm": 1.9200585591152597, + "learning_rate": 1e-05, + "loss": 1.2983, + "step": 267 + }, + { + "epoch": 0.12074791619734174, + "grad_norm": 2.5259249629148592, + "learning_rate": 1e-05, + "loss": 1.2992, + "step": 268 + }, + { + "epoch": 0.12119846812345123, + "grad_norm": 1.9398891739080681, + "learning_rate": 1e-05, + "loss": 1.2928, + "step": 269 + }, + { + "epoch": 0.12164902004956071, + "grad_norm": 1.9790756436442474, + "learning_rate": 1e-05, + "loss": 1.3188, + "step": 270 + }, + { + "epoch": 0.1220995719756702, + "grad_norm": 2.242052817808025, + "learning_rate": 1e-05, + "loss": 1.2804, + "step": 271 + }, + { + "epoch": 0.12255012390177968, + "grad_norm": 2.122027788554486, + "learning_rate": 1e-05, + "loss": 1.2669, + "step": 272 + }, + { + "epoch": 0.12300067582788916, + "grad_norm": 2.1527010134914786, + "learning_rate": 1e-05, + "loss": 1.3339, + "step": 273 + }, + { + "epoch": 0.12345122775399865, + "grad_norm": 2.1764628846093816, + "learning_rate": 1e-05, + "loss": 1.2766, + "step": 274 + }, + { + "epoch": 0.12390177968010813, + "grad_norm": 1.947845094899021, + "learning_rate": 1e-05, + "loss": 1.264, + "step": 275 + }, + { + "epoch": 0.12435233160621761, + "grad_norm": 2.368267399084212, + "learning_rate": 1e-05, + "loss": 1.3515, + "step": 276 + }, + { + "epoch": 0.1248028835323271, + "grad_norm": 2.0997819018442163, + "learning_rate": 1e-05, + "loss": 1.3069, + "step": 277 + }, + { + "epoch": 0.1252534354584366, + "grad_norm": 1.9939134555995226, + "learning_rate": 1e-05, + "loss": 1.3191, + "step": 278 + }, + { + "epoch": 0.12570398738454608, + "grad_norm": 1.9066572393040075, + "learning_rate": 1e-05, + "loss": 1.3135, + "step": 279 + }, + { + "epoch": 0.12615453931065557, + "grad_norm": 1.9057042423036947, + "learning_rate": 1e-05, + "loss": 1.3153, + "step": 280 + }, + { + "epoch": 0.12660509123676503, + "grad_norm": 2.150247021172038, + "learning_rate": 1e-05, + "loss": 1.3303, + "step": 281 + }, + { + "epoch": 0.12705564316287452, + "grad_norm": 2.0992788299613783, + "learning_rate": 1e-05, + "loss": 1.3254, + "step": 282 + }, + { + "epoch": 0.127506195088984, + "grad_norm": 2.133783599904729, + "learning_rate": 1e-05, + "loss": 1.2773, + "step": 283 + }, + { + "epoch": 0.1279567470150935, + "grad_norm": 1.9316963608148732, + "learning_rate": 1e-05, + "loss": 1.3435, + "step": 284 + }, + { + "epoch": 0.12840729894120298, + "grad_norm": 1.974806893876162, + "learning_rate": 1e-05, + "loss": 1.2523, + "step": 285 + }, + { + "epoch": 0.12885785086731247, + "grad_norm": 1.9743808573222912, + "learning_rate": 1e-05, + "loss": 1.3285, + "step": 286 + }, + { + "epoch": 0.12930840279342193, + "grad_norm": 2.103180773196452, + "learning_rate": 1e-05, + "loss": 1.2892, + "step": 287 + }, + { + "epoch": 0.12975895471953142, + "grad_norm": 1.8556401356475218, + "learning_rate": 
1e-05, + "loss": 1.291, + "step": 288 + }, + { + "epoch": 0.1302095066456409, + "grad_norm": 1.9207818760870192, + "learning_rate": 1e-05, + "loss": 1.2789, + "step": 289 + }, + { + "epoch": 0.1306600585717504, + "grad_norm": 2.0033928497304627, + "learning_rate": 1e-05, + "loss": 1.2813, + "step": 290 + }, + { + "epoch": 0.13111061049785988, + "grad_norm": 1.8633066695261629, + "learning_rate": 1e-05, + "loss": 1.2738, + "step": 291 + }, + { + "epoch": 0.13156116242396937, + "grad_norm": 1.8657542972869336, + "learning_rate": 1e-05, + "loss": 1.3087, + "step": 292 + }, + { + "epoch": 0.13201171435007886, + "grad_norm": 2.1494510639786335, + "learning_rate": 1e-05, + "loss": 1.2836, + "step": 293 + }, + { + "epoch": 0.13246226627618832, + "grad_norm": 2.0505108843320503, + "learning_rate": 1e-05, + "loss": 1.3038, + "step": 294 + }, + { + "epoch": 0.1329128182022978, + "grad_norm": 2.112493335275763, + "learning_rate": 1e-05, + "loss": 1.2872, + "step": 295 + }, + { + "epoch": 0.1333633701284073, + "grad_norm": 1.924080602241478, + "learning_rate": 1e-05, + "loss": 1.2903, + "step": 296 + }, + { + "epoch": 0.13381392205451678, + "grad_norm": 2.0436320967486115, + "learning_rate": 1e-05, + "loss": 1.3554, + "step": 297 + }, + { + "epoch": 0.13426447398062627, + "grad_norm": 1.9545825121390592, + "learning_rate": 1e-05, + "loss": 1.2739, + "step": 298 + }, + { + "epoch": 0.13471502590673576, + "grad_norm": 1.9665488651762209, + "learning_rate": 1e-05, + "loss": 1.304, + "step": 299 + }, + { + "epoch": 0.13516557783284525, + "grad_norm": 1.9449462490428069, + "learning_rate": 1e-05, + "loss": 1.2848, + "step": 300 + }, + { + "epoch": 0.1356161297589547, + "grad_norm": 1.9470051383361042, + "learning_rate": 1e-05, + "loss": 1.2981, + "step": 301 + }, + { + "epoch": 0.1360666816850642, + "grad_norm": 1.9373980424275328, + "learning_rate": 1e-05, + "loss": 1.2315, + "step": 302 + }, + { + "epoch": 0.13651723361117368, + "grad_norm": 1.875270255478935, + "learning_rate": 1e-05, + "loss": 1.3268, + "step": 303 + }, + { + "epoch": 0.13696778553728317, + "grad_norm": 2.2044670214107707, + "learning_rate": 1e-05, + "loss": 1.2953, + "step": 304 + }, + { + "epoch": 0.13741833746339266, + "grad_norm": 2.075304026490976, + "learning_rate": 1e-05, + "loss": 1.288, + "step": 305 + }, + { + "epoch": 0.13786888938950215, + "grad_norm": 2.011073798276399, + "learning_rate": 1e-05, + "loss": 1.2467, + "step": 306 + }, + { + "epoch": 0.13831944131561164, + "grad_norm": 2.034679330715142, + "learning_rate": 1e-05, + "loss": 1.3292, + "step": 307 + }, + { + "epoch": 0.1387699932417211, + "grad_norm": 1.9483449655122804, + "learning_rate": 1e-05, + "loss": 1.319, + "step": 308 + }, + { + "epoch": 0.13922054516783058, + "grad_norm": 2.04678116650454, + "learning_rate": 1e-05, + "loss": 1.2547, + "step": 309 + }, + { + "epoch": 0.13967109709394007, + "grad_norm": 2.1949720256189336, + "learning_rate": 1e-05, + "loss": 1.2852, + "step": 310 + }, + { + "epoch": 0.14012164902004956, + "grad_norm": 2.0350442826126174, + "learning_rate": 1e-05, + "loss": 1.3086, + "step": 311 + }, + { + "epoch": 0.14057220094615905, + "grad_norm": 2.0258670899912095, + "learning_rate": 1e-05, + "loss": 1.271, + "step": 312 + }, + { + "epoch": 0.14102275287226854, + "grad_norm": 2.171456270634506, + "learning_rate": 1e-05, + "loss": 1.2978, + "step": 313 + }, + { + "epoch": 0.14147330479837802, + "grad_norm": 2.149145869935288, + "learning_rate": 1e-05, + "loss": 1.3287, + "step": 314 + }, + { + "epoch": 0.14192385672448748, + 
"grad_norm": 2.175629086835601, + "learning_rate": 1e-05, + "loss": 1.2797, + "step": 315 + }, + { + "epoch": 0.14237440865059697, + "grad_norm": 2.14936647254275, + "learning_rate": 1e-05, + "loss": 1.2917, + "step": 316 + }, + { + "epoch": 0.14282496057670646, + "grad_norm": 1.9906810415024248, + "learning_rate": 1e-05, + "loss": 1.2675, + "step": 317 + }, + { + "epoch": 0.14327551250281595, + "grad_norm": 1.988086950463775, + "learning_rate": 1e-05, + "loss": 1.2886, + "step": 318 + }, + { + "epoch": 0.14372606442892544, + "grad_norm": 1.9476086425243724, + "learning_rate": 1e-05, + "loss": 1.2644, + "step": 319 + }, + { + "epoch": 0.14417661635503493, + "grad_norm": 2.0995866280534394, + "learning_rate": 1e-05, + "loss": 1.3236, + "step": 320 + }, + { + "epoch": 0.1446271682811444, + "grad_norm": 1.996319309918401, + "learning_rate": 1e-05, + "loss": 1.2763, + "step": 321 + }, + { + "epoch": 0.14507772020725387, + "grad_norm": 2.2000731970049165, + "learning_rate": 1e-05, + "loss": 1.3132, + "step": 322 + }, + { + "epoch": 0.14552827213336336, + "grad_norm": 2.004990334407573, + "learning_rate": 1e-05, + "loss": 1.3002, + "step": 323 + }, + { + "epoch": 0.14597882405947285, + "grad_norm": 1.9688564258244992, + "learning_rate": 1e-05, + "loss": 1.2174, + "step": 324 + }, + { + "epoch": 0.14642937598558234, + "grad_norm": 2.181618588764316, + "learning_rate": 1e-05, + "loss": 1.3192, + "step": 325 + }, + { + "epoch": 0.14687992791169183, + "grad_norm": 2.069099208462894, + "learning_rate": 1e-05, + "loss": 1.3353, + "step": 326 + }, + { + "epoch": 0.14733047983780131, + "grad_norm": 2.0591506357520393, + "learning_rate": 1e-05, + "loss": 1.263, + "step": 327 + }, + { + "epoch": 0.1477810317639108, + "grad_norm": 2.041408533263358, + "learning_rate": 1e-05, + "loss": 1.3209, + "step": 328 + }, + { + "epoch": 0.14823158369002026, + "grad_norm": 1.9928222049055881, + "learning_rate": 1e-05, + "loss": 1.2589, + "step": 329 + }, + { + "epoch": 0.14868213561612975, + "grad_norm": 1.9717547921546856, + "learning_rate": 1e-05, + "loss": 1.2733, + "step": 330 + }, + { + "epoch": 0.14913268754223924, + "grad_norm": 2.0663499407777537, + "learning_rate": 1e-05, + "loss": 1.2612, + "step": 331 + }, + { + "epoch": 0.14958323946834873, + "grad_norm": 2.131614625072467, + "learning_rate": 1e-05, + "loss": 1.2867, + "step": 332 + }, + { + "epoch": 0.15003379139445822, + "grad_norm": 2.001144065765142, + "learning_rate": 1e-05, + "loss": 1.2885, + "step": 333 + }, + { + "epoch": 0.1504843433205677, + "grad_norm": 2.324747529344625, + "learning_rate": 1e-05, + "loss": 1.2727, + "step": 334 + }, + { + "epoch": 0.1509348952466772, + "grad_norm": 1.9493047619407076, + "learning_rate": 1e-05, + "loss": 1.282, + "step": 335 + }, + { + "epoch": 0.15138544717278665, + "grad_norm": 1.9618381937389244, + "learning_rate": 1e-05, + "loss": 1.2747, + "step": 336 + }, + { + "epoch": 0.15183599909889614, + "grad_norm": 2.021673157769538, + "learning_rate": 1e-05, + "loss": 1.2915, + "step": 337 + }, + { + "epoch": 0.15228655102500563, + "grad_norm": 2.0609723042172923, + "learning_rate": 1e-05, + "loss": 1.2915, + "step": 338 + }, + { + "epoch": 0.15273710295111512, + "grad_norm": 2.2411227954856856, + "learning_rate": 1e-05, + "loss": 1.311, + "step": 339 + }, + { + "epoch": 0.1531876548772246, + "grad_norm": 2.0293089438224015, + "learning_rate": 1e-05, + "loss": 1.2973, + "step": 340 + }, + { + "epoch": 0.1536382068033341, + "grad_norm": 1.8179160445288032, + "learning_rate": 1e-05, + "loss": 1.2857, + "step": 341 + 
}, + { + "epoch": 0.15408875872944358, + "grad_norm": 1.9623466371508445, + "learning_rate": 1e-05, + "loss": 1.2892, + "step": 342 + }, + { + "epoch": 0.15453931065555304, + "grad_norm": 1.9603857645386529, + "learning_rate": 1e-05, + "loss": 1.3028, + "step": 343 + }, + { + "epoch": 0.15498986258166253, + "grad_norm": 1.84250642856154, + "learning_rate": 1e-05, + "loss": 1.2759, + "step": 344 + }, + { + "epoch": 0.15544041450777202, + "grad_norm": 2.2444300057918998, + "learning_rate": 1e-05, + "loss": 1.2607, + "step": 345 + }, + { + "epoch": 0.1558909664338815, + "grad_norm": 1.960685051584487, + "learning_rate": 1e-05, + "loss": 1.2981, + "step": 346 + }, + { + "epoch": 0.156341518359991, + "grad_norm": 1.9720183471848456, + "learning_rate": 1e-05, + "loss": 1.2658, + "step": 347 + }, + { + "epoch": 0.15679207028610048, + "grad_norm": 2.1108681327485526, + "learning_rate": 1e-05, + "loss": 1.3252, + "step": 348 + }, + { + "epoch": 0.15724262221220997, + "grad_norm": 1.9526787609837528, + "learning_rate": 1e-05, + "loss": 1.3097, + "step": 349 + }, + { + "epoch": 0.15769317413831943, + "grad_norm": 2.102470525566385, + "learning_rate": 1e-05, + "loss": 1.2793, + "step": 350 + }, + { + "epoch": 0.15814372606442892, + "grad_norm": 1.9043399917790473, + "learning_rate": 1e-05, + "loss": 1.2901, + "step": 351 + }, + { + "epoch": 0.1585942779905384, + "grad_norm": 2.0704873691866217, + "learning_rate": 1e-05, + "loss": 1.2534, + "step": 352 + }, + { + "epoch": 0.1590448299166479, + "grad_norm": 1.8935409303536357, + "learning_rate": 1e-05, + "loss": 1.2705, + "step": 353 + }, + { + "epoch": 0.15949538184275738, + "grad_norm": 2.05329124751458, + "learning_rate": 1e-05, + "loss": 1.3187, + "step": 354 + }, + { + "epoch": 0.15994593376886687, + "grad_norm": 2.0597273322745937, + "learning_rate": 1e-05, + "loss": 1.2566, + "step": 355 + }, + { + "epoch": 0.16039648569497636, + "grad_norm": 1.994080738314847, + "learning_rate": 1e-05, + "loss": 1.2945, + "step": 356 + }, + { + "epoch": 0.16084703762108582, + "grad_norm": 1.9178614859606067, + "learning_rate": 1e-05, + "loss": 1.2729, + "step": 357 + }, + { + "epoch": 0.1612975895471953, + "grad_norm": 2.1202612219294097, + "learning_rate": 1e-05, + "loss": 1.2783, + "step": 358 + }, + { + "epoch": 0.1617481414733048, + "grad_norm": 2.0639508875751145, + "learning_rate": 1e-05, + "loss": 1.3166, + "step": 359 + }, + { + "epoch": 0.16219869339941428, + "grad_norm": 1.9219411874218848, + "learning_rate": 1e-05, + "loss": 1.2851, + "step": 360 + }, + { + "epoch": 0.16264924532552377, + "grad_norm": 1.8555787792800178, + "learning_rate": 1e-05, + "loss": 1.3075, + "step": 361 + }, + { + "epoch": 0.16309979725163326, + "grad_norm": 2.0051025638227364, + "learning_rate": 1e-05, + "loss": 1.2787, + "step": 362 + }, + { + "epoch": 0.16355034917774275, + "grad_norm": 2.071493880001985, + "learning_rate": 1e-05, + "loss": 1.3449, + "step": 363 + }, + { + "epoch": 0.1640009011038522, + "grad_norm": 2.0435140721243785, + "learning_rate": 1e-05, + "loss": 1.3004, + "step": 364 + }, + { + "epoch": 0.1644514530299617, + "grad_norm": 2.020256783371866, + "learning_rate": 1e-05, + "loss": 1.2824, + "step": 365 + }, + { + "epoch": 0.16490200495607119, + "grad_norm": 2.095838466374123, + "learning_rate": 1e-05, + "loss": 1.2231, + "step": 366 + }, + { + "epoch": 0.16535255688218067, + "grad_norm": 2.079247303020079, + "learning_rate": 1e-05, + "loss": 1.2719, + "step": 367 + }, + { + "epoch": 0.16580310880829016, + "grad_norm": 2.0004956596544683, + 
"learning_rate": 1e-05, + "loss": 1.3348, + "step": 368 + }, + { + "epoch": 0.16625366073439965, + "grad_norm": 2.0327189616174994, + "learning_rate": 1e-05, + "loss": 1.281, + "step": 369 + }, + { + "epoch": 0.1667042126605091, + "grad_norm": 2.0939532929360927, + "learning_rate": 1e-05, + "loss": 1.2308, + "step": 370 + }, + { + "epoch": 0.1671547645866186, + "grad_norm": 1.942640042738766, + "learning_rate": 1e-05, + "loss": 1.2309, + "step": 371 + }, + { + "epoch": 0.1676053165127281, + "grad_norm": 2.251110736450933, + "learning_rate": 1e-05, + "loss": 1.2885, + "step": 372 + }, + { + "epoch": 0.16805586843883757, + "grad_norm": 2.2861326119505048, + "learning_rate": 1e-05, + "loss": 1.272, + "step": 373 + }, + { + "epoch": 0.16850642036494706, + "grad_norm": 1.954729928516754, + "learning_rate": 1e-05, + "loss": 1.3243, + "step": 374 + }, + { + "epoch": 0.16895697229105655, + "grad_norm": 1.9117136264725754, + "learning_rate": 1e-05, + "loss": 1.3125, + "step": 375 + }, + { + "epoch": 0.16940752421716604, + "grad_norm": 2.1945864188944153, + "learning_rate": 1e-05, + "loss": 1.3087, + "step": 376 + }, + { + "epoch": 0.1698580761432755, + "grad_norm": 1.9058131074988187, + "learning_rate": 1e-05, + "loss": 1.2924, + "step": 377 + }, + { + "epoch": 0.170308628069385, + "grad_norm": 1.8407007694236421, + "learning_rate": 1e-05, + "loss": 1.2937, + "step": 378 + }, + { + "epoch": 0.17075917999549448, + "grad_norm": 2.098941464662954, + "learning_rate": 1e-05, + "loss": 1.3028, + "step": 379 + }, + { + "epoch": 0.17120973192160396, + "grad_norm": 1.9882687348840964, + "learning_rate": 1e-05, + "loss": 1.3079, + "step": 380 + }, + { + "epoch": 0.17166028384771345, + "grad_norm": 2.1011227957223704, + "learning_rate": 1e-05, + "loss": 1.2818, + "step": 381 + }, + { + "epoch": 0.17211083577382294, + "grad_norm": 2.0966958079367126, + "learning_rate": 1e-05, + "loss": 1.2357, + "step": 382 + }, + { + "epoch": 0.17256138769993243, + "grad_norm": 2.043553305436292, + "learning_rate": 1e-05, + "loss": 1.2624, + "step": 383 + }, + { + "epoch": 0.1730119396260419, + "grad_norm": 1.9157164138264542, + "learning_rate": 1e-05, + "loss": 1.2901, + "step": 384 + }, + { + "epoch": 0.17346249155215138, + "grad_norm": 2.0489774956843014, + "learning_rate": 1e-05, + "loss": 1.2954, + "step": 385 + }, + { + "epoch": 0.17391304347826086, + "grad_norm": 2.0355571212846666, + "learning_rate": 1e-05, + "loss": 1.2809, + "step": 386 + }, + { + "epoch": 0.17436359540437035, + "grad_norm": 1.7634131824701134, + "learning_rate": 1e-05, + "loss": 1.3056, + "step": 387 + }, + { + "epoch": 0.17481414733047984, + "grad_norm": 2.162318565634982, + "learning_rate": 1e-05, + "loss": 1.2765, + "step": 388 + }, + { + "epoch": 0.17526469925658933, + "grad_norm": 2.112678284922863, + "learning_rate": 1e-05, + "loss": 1.2962, + "step": 389 + }, + { + "epoch": 0.17571525118269882, + "grad_norm": 2.1494215295790795, + "learning_rate": 1e-05, + "loss": 1.2652, + "step": 390 + }, + { + "epoch": 0.17616580310880828, + "grad_norm": 2.064498142531983, + "learning_rate": 1e-05, + "loss": 1.2803, + "step": 391 + }, + { + "epoch": 0.17661635503491777, + "grad_norm": 1.8046181497392817, + "learning_rate": 1e-05, + "loss": 1.3499, + "step": 392 + }, + { + "epoch": 0.17706690696102725, + "grad_norm": 1.859711338145051, + "learning_rate": 1e-05, + "loss": 1.3156, + "step": 393 + }, + { + "epoch": 0.17751745888713674, + "grad_norm": 1.9854435645998658, + "learning_rate": 1e-05, + "loss": 1.2698, + "step": 394 + }, + { + "epoch": 
0.17796801081324623, + "grad_norm": 1.917316632528049, + "learning_rate": 1e-05, + "loss": 1.3331, + "step": 395 + }, + { + "epoch": 0.17841856273935572, + "grad_norm": 2.189833730293825, + "learning_rate": 1e-05, + "loss": 1.2522, + "step": 396 + }, + { + "epoch": 0.1788691146654652, + "grad_norm": 2.016849754761407, + "learning_rate": 1e-05, + "loss": 1.2866, + "step": 397 + }, + { + "epoch": 0.17931966659157467, + "grad_norm": 2.0055991366271435, + "learning_rate": 1e-05, + "loss": 1.2886, + "step": 398 + }, + { + "epoch": 0.17977021851768415, + "grad_norm": 1.9429713181519759, + "learning_rate": 1e-05, + "loss": 1.2578, + "step": 399 + }, + { + "epoch": 0.18022077044379364, + "grad_norm": 1.9769582665952863, + "learning_rate": 1e-05, + "loss": 1.3172, + "step": 400 + }, + { + "epoch": 0.18067132236990313, + "grad_norm": 2.033065302057002, + "learning_rate": 1e-05, + "loss": 1.2717, + "step": 401 + }, + { + "epoch": 0.18112187429601262, + "grad_norm": 2.044143490422974, + "learning_rate": 1e-05, + "loss": 1.2887, + "step": 402 + }, + { + "epoch": 0.1815724262221221, + "grad_norm": 2.2312132814181664, + "learning_rate": 1e-05, + "loss": 1.2564, + "step": 403 + }, + { + "epoch": 0.1820229781482316, + "grad_norm": 1.9692640587423784, + "learning_rate": 1e-05, + "loss": 1.3192, + "step": 404 + }, + { + "epoch": 0.18247353007434106, + "grad_norm": 2.0299575879307072, + "learning_rate": 1e-05, + "loss": 1.2624, + "step": 405 + }, + { + "epoch": 0.18292408200045054, + "grad_norm": 1.8904771035708423, + "learning_rate": 1e-05, + "loss": 1.3114, + "step": 406 + }, + { + "epoch": 0.18337463392656003, + "grad_norm": 1.841332211929519, + "learning_rate": 1e-05, + "loss": 1.2567, + "step": 407 + }, + { + "epoch": 0.18382518585266952, + "grad_norm": 2.1136872115933265, + "learning_rate": 1e-05, + "loss": 1.3072, + "step": 408 + }, + { + "epoch": 0.184275737778779, + "grad_norm": 2.105056901824731, + "learning_rate": 1e-05, + "loss": 1.2333, + "step": 409 + }, + { + "epoch": 0.1847262897048885, + "grad_norm": 2.1391861806470733, + "learning_rate": 1e-05, + "loss": 1.2697, + "step": 410 + }, + { + "epoch": 0.18517684163099798, + "grad_norm": 1.9386019547928421, + "learning_rate": 1e-05, + "loss": 1.2688, + "step": 411 + }, + { + "epoch": 0.18562739355710745, + "grad_norm": 2.2075012555281837, + "learning_rate": 1e-05, + "loss": 1.2554, + "step": 412 + }, + { + "epoch": 0.18607794548321693, + "grad_norm": 1.9657658945733893, + "learning_rate": 1e-05, + "loss": 1.2818, + "step": 413 + }, + { + "epoch": 0.18652849740932642, + "grad_norm": 2.062999023916708, + "learning_rate": 1e-05, + "loss": 1.3001, + "step": 414 + }, + { + "epoch": 0.1869790493354359, + "grad_norm": 1.987774567017447, + "learning_rate": 1e-05, + "loss": 1.2666, + "step": 415 + }, + { + "epoch": 0.1874296012615454, + "grad_norm": 1.965562259197448, + "learning_rate": 1e-05, + "loss": 1.2754, + "step": 416 + }, + { + "epoch": 0.18788015318765489, + "grad_norm": 1.9874935900845858, + "learning_rate": 1e-05, + "loss": 1.298, + "step": 417 + }, + { + "epoch": 0.18833070511376437, + "grad_norm": 1.9579248305312746, + "learning_rate": 1e-05, + "loss": 1.2804, + "step": 418 + }, + { + "epoch": 0.18878125703987383, + "grad_norm": 1.9969577424329485, + "learning_rate": 1e-05, + "loss": 1.2285, + "step": 419 + }, + { + "epoch": 0.18923180896598332, + "grad_norm": 2.2398185632472356, + "learning_rate": 1e-05, + "loss": 1.3058, + "step": 420 + }, + { + "epoch": 0.1896823608920928, + "grad_norm": 1.9797098368258668, + "learning_rate": 1e-05, + 
"loss": 1.3245, + "step": 421 + }, + { + "epoch": 0.1901329128182023, + "grad_norm": 2.0799083824667104, + "learning_rate": 1e-05, + "loss": 1.2954, + "step": 422 + }, + { + "epoch": 0.1905834647443118, + "grad_norm": 1.9912326179789581, + "learning_rate": 1e-05, + "loss": 1.2557, + "step": 423 + }, + { + "epoch": 0.19103401667042127, + "grad_norm": 1.9822348840500932, + "learning_rate": 1e-05, + "loss": 1.2825, + "step": 424 + }, + { + "epoch": 0.19148456859653076, + "grad_norm": 1.955087556533265, + "learning_rate": 1e-05, + "loss": 1.2856, + "step": 425 + }, + { + "epoch": 0.19193512052264022, + "grad_norm": 1.8832447554381153, + "learning_rate": 1e-05, + "loss": 1.27, + "step": 426 + }, + { + "epoch": 0.1923856724487497, + "grad_norm": 2.075850607998512, + "learning_rate": 1e-05, + "loss": 1.2924, + "step": 427 + }, + { + "epoch": 0.1928362243748592, + "grad_norm": 1.9545773350054239, + "learning_rate": 1e-05, + "loss": 1.3046, + "step": 428 + }, + { + "epoch": 0.1932867763009687, + "grad_norm": 1.8106472670812626, + "learning_rate": 1e-05, + "loss": 1.241, + "step": 429 + }, + { + "epoch": 0.19373732822707818, + "grad_norm": 2.202580923848979, + "learning_rate": 1e-05, + "loss": 1.3186, + "step": 430 + }, + { + "epoch": 0.19418788015318766, + "grad_norm": 1.9802743607989126, + "learning_rate": 1e-05, + "loss": 1.2508, + "step": 431 + }, + { + "epoch": 0.19463843207929715, + "grad_norm": 2.2448484146253644, + "learning_rate": 1e-05, + "loss": 1.287, + "step": 432 + }, + { + "epoch": 0.1950889840054066, + "grad_norm": 2.0777067227076507, + "learning_rate": 1e-05, + "loss": 1.2443, + "step": 433 + }, + { + "epoch": 0.1955395359315161, + "grad_norm": 1.9836012166807184, + "learning_rate": 1e-05, + "loss": 1.2806, + "step": 434 + }, + { + "epoch": 0.1959900878576256, + "grad_norm": 1.9285909883485441, + "learning_rate": 1e-05, + "loss": 1.3033, + "step": 435 + }, + { + "epoch": 0.19644063978373508, + "grad_norm": 1.9878147349260273, + "learning_rate": 1e-05, + "loss": 1.2759, + "step": 436 + }, + { + "epoch": 0.19689119170984457, + "grad_norm": 1.9557036605383904, + "learning_rate": 1e-05, + "loss": 1.2758, + "step": 437 + }, + { + "epoch": 0.19734174363595405, + "grad_norm": 1.884658796228399, + "learning_rate": 1e-05, + "loss": 1.2771, + "step": 438 + }, + { + "epoch": 0.19779229556206354, + "grad_norm": 1.816275867047091, + "learning_rate": 1e-05, + "loss": 1.2955, + "step": 439 + }, + { + "epoch": 0.198242847488173, + "grad_norm": 2.10656333138919, + "learning_rate": 1e-05, + "loss": 1.2617, + "step": 440 + }, + { + "epoch": 0.1986933994142825, + "grad_norm": 2.0984562962870803, + "learning_rate": 1e-05, + "loss": 1.2222, + "step": 441 + }, + { + "epoch": 0.19914395134039198, + "grad_norm": 1.9348752922906438, + "learning_rate": 1e-05, + "loss": 1.2168, + "step": 442 + }, + { + "epoch": 0.19959450326650147, + "grad_norm": 1.8927559482208312, + "learning_rate": 1e-05, + "loss": 1.2447, + "step": 443 + }, + { + "epoch": 0.20004505519261095, + "grad_norm": 1.8915183032229956, + "learning_rate": 1e-05, + "loss": 1.2556, + "step": 444 + }, + { + "epoch": 0.20049560711872044, + "grad_norm": 1.8590151705324758, + "learning_rate": 1e-05, + "loss": 1.3036, + "step": 445 + }, + { + "epoch": 0.2009461590448299, + "grad_norm": 2.0188235465064834, + "learning_rate": 1e-05, + "loss": 1.2813, + "step": 446 + }, + { + "epoch": 0.2013967109709394, + "grad_norm": 1.882861465804375, + "learning_rate": 1e-05, + "loss": 1.247, + "step": 447 + }, + { + "epoch": 0.20184726289704888, + "grad_norm": 
1.964160284373517, + "learning_rate": 1e-05, + "loss": 1.2808, + "step": 448 + }, + { + "epoch": 0.20229781482315837, + "grad_norm": 2.1704836512761783, + "learning_rate": 1e-05, + "loss": 1.2391, + "step": 449 + }, + { + "epoch": 0.20274836674926786, + "grad_norm": 1.7979559458234273, + "learning_rate": 1e-05, + "loss": 1.2787, + "step": 450 + }, + { + "epoch": 0.20319891867537734, + "grad_norm": 1.9531550503549668, + "learning_rate": 1e-05, + "loss": 1.3082, + "step": 451 + }, + { + "epoch": 0.20364947060148683, + "grad_norm": 1.6802653824191487, + "learning_rate": 1e-05, + "loss": 1.2592, + "step": 452 + }, + { + "epoch": 0.2041000225275963, + "grad_norm": 1.9009883595109776, + "learning_rate": 1e-05, + "loss": 1.2307, + "step": 453 + }, + { + "epoch": 0.20455057445370578, + "grad_norm": 1.924227930900898, + "learning_rate": 1e-05, + "loss": 1.2701, + "step": 454 + }, + { + "epoch": 0.20500112637981527, + "grad_norm": 2.072514033394317, + "learning_rate": 1e-05, + "loss": 1.2749, + "step": 455 + }, + { + "epoch": 0.20545167830592476, + "grad_norm": 2.1258082293123697, + "learning_rate": 1e-05, + "loss": 1.3065, + "step": 456 + }, + { + "epoch": 0.20590223023203424, + "grad_norm": 2.0719552458789545, + "learning_rate": 1e-05, + "loss": 1.2453, + "step": 457 + }, + { + "epoch": 0.20635278215814373, + "grad_norm": 1.8200768626343444, + "learning_rate": 1e-05, + "loss": 1.2943, + "step": 458 + }, + { + "epoch": 0.20680333408425322, + "grad_norm": 2.1138401076737146, + "learning_rate": 1e-05, + "loss": 1.2944, + "step": 459 + }, + { + "epoch": 0.20725388601036268, + "grad_norm": 2.2416430586143186, + "learning_rate": 1e-05, + "loss": 1.2885, + "step": 460 + }, + { + "epoch": 0.20770443793647217, + "grad_norm": 2.0189173757078067, + "learning_rate": 1e-05, + "loss": 1.2722, + "step": 461 + }, + { + "epoch": 0.20815498986258166, + "grad_norm": 1.9056409736980289, + "learning_rate": 1e-05, + "loss": 1.2481, + "step": 462 + }, + { + "epoch": 0.20860554178869115, + "grad_norm": 1.873869775109084, + "learning_rate": 1e-05, + "loss": 1.2947, + "step": 463 + }, + { + "epoch": 0.20905609371480063, + "grad_norm": 2.030085128944356, + "learning_rate": 1e-05, + "loss": 1.3246, + "step": 464 + }, + { + "epoch": 0.20950664564091012, + "grad_norm": 1.996424509377663, + "learning_rate": 1e-05, + "loss": 1.2581, + "step": 465 + }, + { + "epoch": 0.2099571975670196, + "grad_norm": 1.9704691056393633, + "learning_rate": 1e-05, + "loss": 1.2676, + "step": 466 + }, + { + "epoch": 0.21040774949312907, + "grad_norm": 2.1302562487677505, + "learning_rate": 1e-05, + "loss": 1.2801, + "step": 467 + }, + { + "epoch": 0.21085830141923856, + "grad_norm": 1.8898483209227444, + "learning_rate": 1e-05, + "loss": 1.1885, + "step": 468 + }, + { + "epoch": 0.21130885334534805, + "grad_norm": 1.9769508731754923, + "learning_rate": 1e-05, + "loss": 1.2947, + "step": 469 + }, + { + "epoch": 0.21175940527145753, + "grad_norm": 2.051397240160048, + "learning_rate": 1e-05, + "loss": 1.285, + "step": 470 + }, + { + "epoch": 0.21220995719756702, + "grad_norm": 1.827454330187214, + "learning_rate": 1e-05, + "loss": 1.2488, + "step": 471 + }, + { + "epoch": 0.2126605091236765, + "grad_norm": 1.8965671551423218, + "learning_rate": 1e-05, + "loss": 1.2383, + "step": 472 + }, + { + "epoch": 0.213111061049786, + "grad_norm": 1.9374248152180933, + "learning_rate": 1e-05, + "loss": 1.2655, + "step": 473 + }, + { + "epoch": 0.21356161297589546, + "grad_norm": 1.9152642282375247, + "learning_rate": 1e-05, + "loss": 1.269, + "step": 474 + }, + 
{ + "epoch": 0.21401216490200495, + "grad_norm": 2.1286034986619597, + "learning_rate": 1e-05, + "loss": 1.2549, + "step": 475 + }, + { + "epoch": 0.21446271682811444, + "grad_norm": 2.1239685351695416, + "learning_rate": 1e-05, + "loss": 1.2934, + "step": 476 + }, + { + "epoch": 0.21491326875422392, + "grad_norm": 2.387986420548799, + "learning_rate": 1e-05, + "loss": 1.2209, + "step": 477 + }, + { + "epoch": 0.2153638206803334, + "grad_norm": 2.0729872120608066, + "learning_rate": 1e-05, + "loss": 1.2738, + "step": 478 + }, + { + "epoch": 0.2158143726064429, + "grad_norm": 1.9997296687320307, + "learning_rate": 1e-05, + "loss": 1.2029, + "step": 479 + }, + { + "epoch": 0.2162649245325524, + "grad_norm": 2.138097688719461, + "learning_rate": 1e-05, + "loss": 1.2721, + "step": 480 + }, + { + "epoch": 0.21671547645866185, + "grad_norm": 2.0653824446435483, + "learning_rate": 1e-05, + "loss": 1.2819, + "step": 481 + }, + { + "epoch": 0.21716602838477134, + "grad_norm": 2.072177431387959, + "learning_rate": 1e-05, + "loss": 1.2505, + "step": 482 + }, + { + "epoch": 0.21761658031088082, + "grad_norm": 1.9308751828170199, + "learning_rate": 1e-05, + "loss": 1.2837, + "step": 483 + }, + { + "epoch": 0.2180671322369903, + "grad_norm": 1.8944222007186382, + "learning_rate": 1e-05, + "loss": 1.2319, + "step": 484 + }, + { + "epoch": 0.2185176841630998, + "grad_norm": 1.9077748456798524, + "learning_rate": 1e-05, + "loss": 1.2364, + "step": 485 + }, + { + "epoch": 0.2189682360892093, + "grad_norm": 1.9631759804488607, + "learning_rate": 1e-05, + "loss": 1.2822, + "step": 486 + }, + { + "epoch": 0.21941878801531878, + "grad_norm": 1.8012739283429102, + "learning_rate": 1e-05, + "loss": 1.2737, + "step": 487 + }, + { + "epoch": 0.21986933994142824, + "grad_norm": 1.9756749334181785, + "learning_rate": 1e-05, + "loss": 1.2922, + "step": 488 + }, + { + "epoch": 0.22031989186753773, + "grad_norm": 1.9260508139187147, + "learning_rate": 1e-05, + "loss": 1.2276, + "step": 489 + }, + { + "epoch": 0.22077044379364721, + "grad_norm": 1.8435682883122477, + "learning_rate": 1e-05, + "loss": 1.2318, + "step": 490 + }, + { + "epoch": 0.2212209957197567, + "grad_norm": 1.864809666456412, + "learning_rate": 1e-05, + "loss": 1.3182, + "step": 491 + }, + { + "epoch": 0.2216715476458662, + "grad_norm": 1.8619755761060044, + "learning_rate": 1e-05, + "loss": 1.2504, + "step": 492 + }, + { + "epoch": 0.22212209957197568, + "grad_norm": 2.047961696264194, + "learning_rate": 1e-05, + "loss": 1.2295, + "step": 493 + }, + { + "epoch": 0.22257265149808517, + "grad_norm": 1.8984426277668016, + "learning_rate": 1e-05, + "loss": 1.2497, + "step": 494 + }, + { + "epoch": 0.22302320342419463, + "grad_norm": 2.0451801593017143, + "learning_rate": 1e-05, + "loss": 1.2555, + "step": 495 + }, + { + "epoch": 0.22347375535030412, + "grad_norm": 1.8921979333307704, + "learning_rate": 1e-05, + "loss": 1.2358, + "step": 496 + }, + { + "epoch": 0.2239243072764136, + "grad_norm": 1.9640954151850023, + "learning_rate": 1e-05, + "loss": 1.2156, + "step": 497 + }, + { + "epoch": 0.2243748592025231, + "grad_norm": 1.9433705241324468, + "learning_rate": 1e-05, + "loss": 1.2705, + "step": 498 + }, + { + "epoch": 0.22482541112863258, + "grad_norm": 1.932678859581744, + "learning_rate": 1e-05, + "loss": 1.2651, + "step": 499 + }, + { + "epoch": 0.22527596305474207, + "grad_norm": 1.914088641607698, + "learning_rate": 1e-05, + "loss": 1.2678, + "step": 500 + }, + { + "epoch": 0.22572651498085156, + "grad_norm": 1.9092930210655947, + "learning_rate": 
1e-05, + "loss": 1.2299, + "step": 501 + }, + { + "epoch": 0.22617706690696102, + "grad_norm": 1.9149221414856479, + "learning_rate": 1e-05, + "loss": 1.2219, + "step": 502 + }, + { + "epoch": 0.2266276188330705, + "grad_norm": 2.0766417297709387, + "learning_rate": 1e-05, + "loss": 1.2767, + "step": 503 + }, + { + "epoch": 0.22707817075918, + "grad_norm": 1.928305726584179, + "learning_rate": 1e-05, + "loss": 1.2458, + "step": 504 + }, + { + "epoch": 0.22752872268528948, + "grad_norm": 2.096065242803092, + "learning_rate": 1e-05, + "loss": 1.2576, + "step": 505 + }, + { + "epoch": 0.22797927461139897, + "grad_norm": 2.0912227374818437, + "learning_rate": 1e-05, + "loss": 1.2611, + "step": 506 + }, + { + "epoch": 0.22842982653750846, + "grad_norm": 1.9815852280061006, + "learning_rate": 1e-05, + "loss": 1.2675, + "step": 507 + }, + { + "epoch": 0.22888037846361795, + "grad_norm": 2.044754812467016, + "learning_rate": 1e-05, + "loss": 1.2407, + "step": 508 + }, + { + "epoch": 0.2293309303897274, + "grad_norm": 1.829492525033062, + "learning_rate": 1e-05, + "loss": 1.3004, + "step": 509 + }, + { + "epoch": 0.2297814823158369, + "grad_norm": 2.1996988452198334, + "learning_rate": 1e-05, + "loss": 1.2411, + "step": 510 + }, + { + "epoch": 0.23023203424194638, + "grad_norm": 2.182608604414491, + "learning_rate": 1e-05, + "loss": 1.2737, + "step": 511 + }, + { + "epoch": 0.23068258616805587, + "grad_norm": 2.110776994389581, + "learning_rate": 1e-05, + "loss": 1.2685, + "step": 512 + }, + { + "epoch": 0.23113313809416536, + "grad_norm": 1.9067786503382023, + "learning_rate": 1e-05, + "loss": 1.2384, + "step": 513 + }, + { + "epoch": 0.23158369002027485, + "grad_norm": 1.929296727816064, + "learning_rate": 1e-05, + "loss": 1.2546, + "step": 514 + }, + { + "epoch": 0.23203424194638433, + "grad_norm": 2.0741880381317537, + "learning_rate": 1e-05, + "loss": 1.3031, + "step": 515 + }, + { + "epoch": 0.2324847938724938, + "grad_norm": 1.9270434416614235, + "learning_rate": 1e-05, + "loss": 1.2574, + "step": 516 + }, + { + "epoch": 0.23293534579860328, + "grad_norm": 1.9078088919166587, + "learning_rate": 1e-05, + "loss": 1.2369, + "step": 517 + }, + { + "epoch": 0.23338589772471277, + "grad_norm": 2.0310839047428293, + "learning_rate": 1e-05, + "loss": 1.22, + "step": 518 + }, + { + "epoch": 0.23383644965082226, + "grad_norm": 1.9361095982022545, + "learning_rate": 1e-05, + "loss": 1.2526, + "step": 519 + }, + { + "epoch": 0.23428700157693175, + "grad_norm": 1.8978662543954028, + "learning_rate": 1e-05, + "loss": 1.2516, + "step": 520 + }, + { + "epoch": 0.23473755350304124, + "grad_norm": 1.9683996368018823, + "learning_rate": 1e-05, + "loss": 1.2844, + "step": 521 + }, + { + "epoch": 0.23518810542915072, + "grad_norm": 1.9425722201345397, + "learning_rate": 1e-05, + "loss": 1.2705, + "step": 522 + }, + { + "epoch": 0.23563865735526018, + "grad_norm": 1.9426821596520805, + "learning_rate": 1e-05, + "loss": 1.2745, + "step": 523 + }, + { + "epoch": 0.23608920928136967, + "grad_norm": 2.081070297565642, + "learning_rate": 1e-05, + "loss": 1.3034, + "step": 524 + }, + { + "epoch": 0.23653976120747916, + "grad_norm": 1.8796976038406434, + "learning_rate": 1e-05, + "loss": 1.2481, + "step": 525 + }, + { + "epoch": 0.23699031313358865, + "grad_norm": 1.865460478581203, + "learning_rate": 1e-05, + "loss": 1.3021, + "step": 526 + }, + { + "epoch": 0.23744086505969814, + "grad_norm": 1.818970277211085, + "learning_rate": 1e-05, + "loss": 1.243, + "step": 527 + }, + { + "epoch": 0.23789141698580762, + 
"grad_norm": 1.9903490262035735, + "learning_rate": 1e-05, + "loss": 1.2598, + "step": 528 + }, + { + "epoch": 0.23834196891191708, + "grad_norm": 1.8917885928616347, + "learning_rate": 1e-05, + "loss": 1.3025, + "step": 529 + }, + { + "epoch": 0.23879252083802657, + "grad_norm": 1.9596289984736852, + "learning_rate": 1e-05, + "loss": 1.2954, + "step": 530 + }, + { + "epoch": 0.23924307276413606, + "grad_norm": 1.9621410635607637, + "learning_rate": 1e-05, + "loss": 1.2208, + "step": 531 + }, + { + "epoch": 0.23969362469024555, + "grad_norm": 1.922080949168734, + "learning_rate": 1e-05, + "loss": 1.2036, + "step": 532 + }, + { + "epoch": 0.24014417661635504, + "grad_norm": 2.0146667015909427, + "learning_rate": 1e-05, + "loss": 1.2538, + "step": 533 + }, + { + "epoch": 0.24059472854246453, + "grad_norm": 2.0416674478286763, + "learning_rate": 1e-05, + "loss": 1.2441, + "step": 534 + }, + { + "epoch": 0.241045280468574, + "grad_norm": 2.082711792086268, + "learning_rate": 1e-05, + "loss": 1.2083, + "step": 535 + }, + { + "epoch": 0.24149583239468347, + "grad_norm": 2.151044272905431, + "learning_rate": 1e-05, + "loss": 1.2281, + "step": 536 + }, + { + "epoch": 0.24194638432079296, + "grad_norm": 1.9625368046490017, + "learning_rate": 1e-05, + "loss": 1.275, + "step": 537 + }, + { + "epoch": 0.24239693624690245, + "grad_norm": 2.0426023983025985, + "learning_rate": 1e-05, + "loss": 1.2283, + "step": 538 + }, + { + "epoch": 0.24284748817301194, + "grad_norm": 1.853693957393131, + "learning_rate": 1e-05, + "loss": 1.2646, + "step": 539 + }, + { + "epoch": 0.24329804009912143, + "grad_norm": 2.0254715870273596, + "learning_rate": 1e-05, + "loss": 1.2516, + "step": 540 + }, + { + "epoch": 0.24374859202523091, + "grad_norm": 2.0387179440881753, + "learning_rate": 1e-05, + "loss": 1.2457, + "step": 541 + }, + { + "epoch": 0.2441991439513404, + "grad_norm": 1.9099190321454138, + "learning_rate": 1e-05, + "loss": 1.2443, + "step": 542 + }, + { + "epoch": 0.24464969587744986, + "grad_norm": 1.9176306660283398, + "learning_rate": 1e-05, + "loss": 1.2296, + "step": 543 + }, + { + "epoch": 0.24510024780355935, + "grad_norm": 2.099460935168799, + "learning_rate": 1e-05, + "loss": 1.24, + "step": 544 + }, + { + "epoch": 0.24555079972966884, + "grad_norm": 1.9843854834957728, + "learning_rate": 1e-05, + "loss": 1.2776, + "step": 545 + }, + { + "epoch": 0.24600135165577833, + "grad_norm": 2.0908098000374475, + "learning_rate": 1e-05, + "loss": 1.277, + "step": 546 + }, + { + "epoch": 0.24645190358188782, + "grad_norm": 1.857303589542034, + "learning_rate": 1e-05, + "loss": 1.2682, + "step": 547 + }, + { + "epoch": 0.2469024555079973, + "grad_norm": 1.8466377771443474, + "learning_rate": 1e-05, + "loss": 1.2519, + "step": 548 + }, + { + "epoch": 0.2473530074341068, + "grad_norm": 1.9712909163231243, + "learning_rate": 1e-05, + "loss": 1.2531, + "step": 549 + }, + { + "epoch": 0.24780355936021625, + "grad_norm": 1.7932411789570453, + "learning_rate": 1e-05, + "loss": 1.2663, + "step": 550 + }, + { + "epoch": 0.24825411128632574, + "grad_norm": 1.9020112246587428, + "learning_rate": 1e-05, + "loss": 1.2591, + "step": 551 + }, + { + "epoch": 0.24870466321243523, + "grad_norm": 1.9559681862087972, + "learning_rate": 1e-05, + "loss": 1.3088, + "step": 552 + }, + { + "epoch": 0.24915521513854472, + "grad_norm": 1.8097002149071548, + "learning_rate": 1e-05, + "loss": 1.2609, + "step": 553 + }, + { + "epoch": 0.2496057670646542, + "grad_norm": 2.0104367704997435, + "learning_rate": 1e-05, + "loss": 1.2931, + "step": 
554 + }, + { + "epoch": 0.2500563189907637, + "grad_norm": 1.9436277754193785, + "learning_rate": 1e-05, + "loss": 1.3028, + "step": 555 + }, + { + "epoch": 0.2505068709168732, + "grad_norm": 1.8798357276671347, + "learning_rate": 1e-05, + "loss": 1.2211, + "step": 556 + }, + { + "epoch": 0.25095742284298267, + "grad_norm": 1.9024477751630549, + "learning_rate": 1e-05, + "loss": 1.2961, + "step": 557 + }, + { + "epoch": 0.25140797476909216, + "grad_norm": 1.8153727085142268, + "learning_rate": 1e-05, + "loss": 1.2941, + "step": 558 + }, + { + "epoch": 0.25185852669520165, + "grad_norm": 2.2310326015353983, + "learning_rate": 1e-05, + "loss": 1.29, + "step": 559 + }, + { + "epoch": 0.25230907862131113, + "grad_norm": 1.8442397109048703, + "learning_rate": 1e-05, + "loss": 1.2141, + "step": 560 + }, + { + "epoch": 0.25275963054742057, + "grad_norm": 1.9019081996535443, + "learning_rate": 1e-05, + "loss": 1.2646, + "step": 561 + }, + { + "epoch": 0.25321018247353005, + "grad_norm": 1.8380479192111479, + "learning_rate": 1e-05, + "loss": 1.2558, + "step": 562 + }, + { + "epoch": 0.25366073439963954, + "grad_norm": 1.8426028450767409, + "learning_rate": 1e-05, + "loss": 1.2492, + "step": 563 + }, + { + "epoch": 0.25411128632574903, + "grad_norm": 2.010881620497572, + "learning_rate": 1e-05, + "loss": 1.2736, + "step": 564 + }, + { + "epoch": 0.2545618382518585, + "grad_norm": 1.8697749380936282, + "learning_rate": 1e-05, + "loss": 1.2506, + "step": 565 + }, + { + "epoch": 0.255012390177968, + "grad_norm": 1.8935160673483178, + "learning_rate": 1e-05, + "loss": 1.2153, + "step": 566 + }, + { + "epoch": 0.2554629421040775, + "grad_norm": 1.9153294504976563, + "learning_rate": 1e-05, + "loss": 1.2979, + "step": 567 + }, + { + "epoch": 0.255913494030187, + "grad_norm": 1.9618080034916596, + "learning_rate": 1e-05, + "loss": 1.2654, + "step": 568 + }, + { + "epoch": 0.25636404595629647, + "grad_norm": 1.993769279135054, + "learning_rate": 1e-05, + "loss": 1.3167, + "step": 569 + }, + { + "epoch": 0.25681459788240596, + "grad_norm": 1.9148075559939128, + "learning_rate": 1e-05, + "loss": 1.2521, + "step": 570 + }, + { + "epoch": 0.25726514980851545, + "grad_norm": 1.8930065639482323, + "learning_rate": 1e-05, + "loss": 1.2972, + "step": 571 + }, + { + "epoch": 0.25771570173462494, + "grad_norm": 2.135229844218331, + "learning_rate": 1e-05, + "loss": 1.2488, + "step": 572 + }, + { + "epoch": 0.2581662536607344, + "grad_norm": 2.0760526206165153, + "learning_rate": 1e-05, + "loss": 1.218, + "step": 573 + }, + { + "epoch": 0.25861680558684386, + "grad_norm": 1.938146636441883, + "learning_rate": 1e-05, + "loss": 1.257, + "step": 574 + }, + { + "epoch": 0.25906735751295334, + "grad_norm": 1.9346531249502403, + "learning_rate": 1e-05, + "loss": 1.2362, + "step": 575 + }, + { + "epoch": 0.25951790943906283, + "grad_norm": 1.8983542752101255, + "learning_rate": 1e-05, + "loss": 1.245, + "step": 576 + }, + { + "epoch": 0.2599684613651723, + "grad_norm": 1.7958759437626646, + "learning_rate": 1e-05, + "loss": 1.253, + "step": 577 + }, + { + "epoch": 0.2604190132912818, + "grad_norm": 1.8849719480056717, + "learning_rate": 1e-05, + "loss": 1.2436, + "step": 578 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 1.7589070794953543, + "learning_rate": 1e-05, + "loss": 1.2681, + "step": 579 + }, + { + "epoch": 0.2613201171435008, + "grad_norm": 1.9961918535347094, + "learning_rate": 1e-05, + "loss": 1.2522, + "step": 580 + }, + { + "epoch": 0.2617706690696103, + "grad_norm": 2.110755417506112, + 
"learning_rate": 1e-05, + "loss": 1.2925, + "step": 581 + }, + { + "epoch": 0.26222122099571976, + "grad_norm": 2.2152176744911043, + "learning_rate": 1e-05, + "loss": 1.2577, + "step": 582 + }, + { + "epoch": 0.26267177292182925, + "grad_norm": 1.8224419298462682, + "learning_rate": 1e-05, + "loss": 1.252, + "step": 583 + }, + { + "epoch": 0.26312232484793874, + "grad_norm": 2.1189296581041877, + "learning_rate": 1e-05, + "loss": 1.2493, + "step": 584 + }, + { + "epoch": 0.2635728767740482, + "grad_norm": 1.9324131683927712, + "learning_rate": 1e-05, + "loss": 1.2796, + "step": 585 + }, + { + "epoch": 0.2640234287001577, + "grad_norm": 2.2033632424316294, + "learning_rate": 1e-05, + "loss": 1.2669, + "step": 586 + }, + { + "epoch": 0.2644739806262672, + "grad_norm": 1.7928066330460684, + "learning_rate": 1e-05, + "loss": 1.2663, + "step": 587 + }, + { + "epoch": 0.26492453255237663, + "grad_norm": 2.0178291312075833, + "learning_rate": 1e-05, + "loss": 1.2465, + "step": 588 + }, + { + "epoch": 0.2653750844784861, + "grad_norm": 1.7942754093241515, + "learning_rate": 1e-05, + "loss": 1.2118, + "step": 589 + }, + { + "epoch": 0.2658256364045956, + "grad_norm": 2.0418669185015124, + "learning_rate": 1e-05, + "loss": 1.2664, + "step": 590 + }, + { + "epoch": 0.2662761883307051, + "grad_norm": 1.9047054777681043, + "learning_rate": 1e-05, + "loss": 1.2726, + "step": 591 + }, + { + "epoch": 0.2667267402568146, + "grad_norm": 1.8933659600328823, + "learning_rate": 1e-05, + "loss": 1.3021, + "step": 592 + }, + { + "epoch": 0.2671772921829241, + "grad_norm": 1.8504998391516625, + "learning_rate": 1e-05, + "loss": 1.2382, + "step": 593 + }, + { + "epoch": 0.26762784410903356, + "grad_norm": 1.9144625800421582, + "learning_rate": 1e-05, + "loss": 1.2684, + "step": 594 + }, + { + "epoch": 0.26807839603514305, + "grad_norm": 1.9094798541668625, + "learning_rate": 1e-05, + "loss": 1.2533, + "step": 595 + }, + { + "epoch": 0.26852894796125254, + "grad_norm": 1.8852457983238309, + "learning_rate": 1e-05, + "loss": 1.2162, + "step": 596 + }, + { + "epoch": 0.26897949988736203, + "grad_norm": 1.9097091467304366, + "learning_rate": 1e-05, + "loss": 1.3164, + "step": 597 + }, + { + "epoch": 0.2694300518134715, + "grad_norm": 1.8950963782713985, + "learning_rate": 1e-05, + "loss": 1.2247, + "step": 598 + }, + { + "epoch": 0.269880603739581, + "grad_norm": 1.6843883133171236, + "learning_rate": 1e-05, + "loss": 1.2235, + "step": 599 + }, + { + "epoch": 0.2703311556656905, + "grad_norm": 2.0418070127443153, + "learning_rate": 1e-05, + "loss": 1.2505, + "step": 600 + }, + { + "epoch": 0.2707817075918, + "grad_norm": 1.7928717250400767, + "learning_rate": 1e-05, + "loss": 1.2421, + "step": 601 + }, + { + "epoch": 0.2712322595179094, + "grad_norm": 1.9859554696363064, + "learning_rate": 1e-05, + "loss": 1.229, + "step": 602 + }, + { + "epoch": 0.2716828114440189, + "grad_norm": 2.0638907923686234, + "learning_rate": 1e-05, + "loss": 1.2687, + "step": 603 + }, + { + "epoch": 0.2721333633701284, + "grad_norm": 1.9984427057305065, + "learning_rate": 1e-05, + "loss": 1.2763, + "step": 604 + }, + { + "epoch": 0.2725839152962379, + "grad_norm": 1.830478514385362, + "learning_rate": 1e-05, + "loss": 1.2383, + "step": 605 + }, + { + "epoch": 0.27303446722234737, + "grad_norm": 1.9236051373849303, + "learning_rate": 1e-05, + "loss": 1.2394, + "step": 606 + }, + { + "epoch": 0.27348501914845685, + "grad_norm": 1.8885633057049556, + "learning_rate": 1e-05, + "loss": 1.2794, + "step": 607 + }, + { + "epoch": 
0.27393557107456634, + "grad_norm": 2.0478146941614344, + "learning_rate": 1e-05, + "loss": 1.2718, + "step": 608 + }, + { + "epoch": 0.27438612300067583, + "grad_norm": 1.885228281361915, + "learning_rate": 1e-05, + "loss": 1.1898, + "step": 609 + }, + { + "epoch": 0.2748366749267853, + "grad_norm": 1.965907132172426, + "learning_rate": 1e-05, + "loss": 1.2515, + "step": 610 + }, + { + "epoch": 0.2752872268528948, + "grad_norm": 1.971709861173124, + "learning_rate": 1e-05, + "loss": 1.2287, + "step": 611 + }, + { + "epoch": 0.2757377787790043, + "grad_norm": 1.9506731322089625, + "learning_rate": 1e-05, + "loss": 1.2697, + "step": 612 + }, + { + "epoch": 0.2761883307051138, + "grad_norm": 2.037557087427492, + "learning_rate": 1e-05, + "loss": 1.2547, + "step": 613 + }, + { + "epoch": 0.27663888263122327, + "grad_norm": 1.9533614493348617, + "learning_rate": 1e-05, + "loss": 1.2732, + "step": 614 + }, + { + "epoch": 0.27708943455733276, + "grad_norm": 1.9467276392821233, + "learning_rate": 1e-05, + "loss": 1.2485, + "step": 615 + }, + { + "epoch": 0.2775399864834422, + "grad_norm": 1.8261245698686148, + "learning_rate": 1e-05, + "loss": 1.2801, + "step": 616 + }, + { + "epoch": 0.2779905384095517, + "grad_norm": 1.7677999183226603, + "learning_rate": 1e-05, + "loss": 1.2581, + "step": 617 + }, + { + "epoch": 0.27844109033566117, + "grad_norm": 1.8793767659029388, + "learning_rate": 1e-05, + "loss": 1.2155, + "step": 618 + }, + { + "epoch": 0.27889164226177066, + "grad_norm": 1.96490874825221, + "learning_rate": 1e-05, + "loss": 1.2412, + "step": 619 + }, + { + "epoch": 0.27934219418788014, + "grad_norm": 1.9862147573110902, + "learning_rate": 1e-05, + "loss": 1.2088, + "step": 620 + }, + { + "epoch": 0.27979274611398963, + "grad_norm": 1.9880527565841493, + "learning_rate": 1e-05, + "loss": 1.3268, + "step": 621 + }, + { + "epoch": 0.2802432980400991, + "grad_norm": 1.918947122409961, + "learning_rate": 1e-05, + "loss": 1.2982, + "step": 622 + }, + { + "epoch": 0.2806938499662086, + "grad_norm": 2.1011233128899054, + "learning_rate": 1e-05, + "loss": 1.2747, + "step": 623 + }, + { + "epoch": 0.2811444018923181, + "grad_norm": 2.1554619814183726, + "learning_rate": 1e-05, + "loss": 1.2303, + "step": 624 + }, + { + "epoch": 0.2815949538184276, + "grad_norm": 1.9287282770735685, + "learning_rate": 1e-05, + "loss": 1.272, + "step": 625 + }, + { + "epoch": 0.2820455057445371, + "grad_norm": 1.9136176641149933, + "learning_rate": 1e-05, + "loss": 1.2338, + "step": 626 + }, + { + "epoch": 0.28249605767064656, + "grad_norm": 1.7970486871003493, + "learning_rate": 1e-05, + "loss": 1.2632, + "step": 627 + }, + { + "epoch": 0.28294660959675605, + "grad_norm": 1.8635294906179631, + "learning_rate": 1e-05, + "loss": 1.242, + "step": 628 + }, + { + "epoch": 0.28339716152286554, + "grad_norm": 1.8187208723363777, + "learning_rate": 1e-05, + "loss": 1.2687, + "step": 629 + }, + { + "epoch": 0.28384771344897497, + "grad_norm": 1.9285995653602948, + "learning_rate": 1e-05, + "loss": 1.2532, + "step": 630 + }, + { + "epoch": 0.28429826537508446, + "grad_norm": 1.7634240721942334, + "learning_rate": 1e-05, + "loss": 1.2467, + "step": 631 + }, + { + "epoch": 0.28474881730119395, + "grad_norm": 1.9044966044859704, + "learning_rate": 1e-05, + "loss": 1.2523, + "step": 632 + }, + { + "epoch": 0.28519936922730343, + "grad_norm": 1.9045151073343214, + "learning_rate": 1e-05, + "loss": 1.3006, + "step": 633 + }, + { + "epoch": 0.2856499211534129, + "grad_norm": 1.95462789696124, + "learning_rate": 1e-05, + "loss": 
1.2299, + "step": 634 + }, + { + "epoch": 0.2861004730795224, + "grad_norm": 1.772589567631728, + "learning_rate": 1e-05, + "loss": 1.2576, + "step": 635 + }, + { + "epoch": 0.2865510250056319, + "grad_norm": 1.9744864517778673, + "learning_rate": 1e-05, + "loss": 1.2105, + "step": 636 + }, + { + "epoch": 0.2870015769317414, + "grad_norm": 1.9986746713780499, + "learning_rate": 1e-05, + "loss": 1.2213, + "step": 637 + }, + { + "epoch": 0.2874521288578509, + "grad_norm": 1.8563633099604115, + "learning_rate": 1e-05, + "loss": 1.2318, + "step": 638 + }, + { + "epoch": 0.28790268078396036, + "grad_norm": 1.8947838304224478, + "learning_rate": 1e-05, + "loss": 1.2159, + "step": 639 + }, + { + "epoch": 0.28835323271006985, + "grad_norm": 1.8021751312403476, + "learning_rate": 1e-05, + "loss": 1.2135, + "step": 640 + }, + { + "epoch": 0.28880378463617934, + "grad_norm": 1.9584176887843077, + "learning_rate": 1e-05, + "loss": 1.2968, + "step": 641 + }, + { + "epoch": 0.2892543365622888, + "grad_norm": 2.1133640919058885, + "learning_rate": 1e-05, + "loss": 1.2968, + "step": 642 + }, + { + "epoch": 0.2897048884883983, + "grad_norm": 1.8296540316036876, + "learning_rate": 1e-05, + "loss": 1.2356, + "step": 643 + }, + { + "epoch": 0.29015544041450775, + "grad_norm": 1.7767733447837024, + "learning_rate": 1e-05, + "loss": 1.2734, + "step": 644 + }, + { + "epoch": 0.29060599234061724, + "grad_norm": 1.779280238237704, + "learning_rate": 1e-05, + "loss": 1.2466, + "step": 645 + }, + { + "epoch": 0.2910565442667267, + "grad_norm": 1.8168390816308468, + "learning_rate": 1e-05, + "loss": 1.2492, + "step": 646 + }, + { + "epoch": 0.2915070961928362, + "grad_norm": 2.041111946701274, + "learning_rate": 1e-05, + "loss": 1.1899, + "step": 647 + }, + { + "epoch": 0.2919576481189457, + "grad_norm": 1.9122640157995485, + "learning_rate": 1e-05, + "loss": 1.2323, + "step": 648 + }, + { + "epoch": 0.2924082000450552, + "grad_norm": 1.9426734430959265, + "learning_rate": 1e-05, + "loss": 1.2979, + "step": 649 + }, + { + "epoch": 0.2928587519711647, + "grad_norm": 2.0810089200553206, + "learning_rate": 1e-05, + "loss": 1.2241, + "step": 650 + }, + { + "epoch": 0.29330930389727417, + "grad_norm": 1.885071653308782, + "learning_rate": 1e-05, + "loss": 1.2606, + "step": 651 + }, + { + "epoch": 0.29375985582338365, + "grad_norm": 1.9440547245925526, + "learning_rate": 1e-05, + "loss": 1.2584, + "step": 652 + }, + { + "epoch": 0.29421040774949314, + "grad_norm": 1.8334532422194452, + "learning_rate": 1e-05, + "loss": 1.2687, + "step": 653 + }, + { + "epoch": 0.29466095967560263, + "grad_norm": 2.0064599716232743, + "learning_rate": 1e-05, + "loss": 1.2758, + "step": 654 + }, + { + "epoch": 0.2951115116017121, + "grad_norm": 2.0338711008764716, + "learning_rate": 1e-05, + "loss": 1.3173, + "step": 655 + }, + { + "epoch": 0.2955620635278216, + "grad_norm": 1.9613884552495555, + "learning_rate": 1e-05, + "loss": 1.2838, + "step": 656 + }, + { + "epoch": 0.29601261545393104, + "grad_norm": 1.794221735858151, + "learning_rate": 1e-05, + "loss": 1.2701, + "step": 657 + }, + { + "epoch": 0.2964631673800405, + "grad_norm": 1.7636679479293365, + "learning_rate": 1e-05, + "loss": 1.2454, + "step": 658 + }, + { + "epoch": 0.29691371930615, + "grad_norm": 1.9078463684005733, + "learning_rate": 1e-05, + "loss": 1.2439, + "step": 659 + }, + { + "epoch": 0.2973642712322595, + "grad_norm": 2.1209677317693454, + "learning_rate": 1e-05, + "loss": 1.2354, + "step": 660 + }, + { + "epoch": 0.297814823158369, + "grad_norm": 
2.0008354557103623, + "learning_rate": 1e-05, + "loss": 1.2186, + "step": 661 + }, + { + "epoch": 0.2982653750844785, + "grad_norm": 1.8965814673419596, + "learning_rate": 1e-05, + "loss": 1.2603, + "step": 662 + }, + { + "epoch": 0.29871592701058797, + "grad_norm": 1.9719058474306004, + "learning_rate": 1e-05, + "loss": 1.2781, + "step": 663 + }, + { + "epoch": 0.29916647893669746, + "grad_norm": 1.7544263698970395, + "learning_rate": 1e-05, + "loss": 1.2813, + "step": 664 + }, + { + "epoch": 0.29961703086280694, + "grad_norm": 1.8711638364612417, + "learning_rate": 1e-05, + "loss": 1.2925, + "step": 665 + }, + { + "epoch": 0.30006758278891643, + "grad_norm": 2.0782600818572377, + "learning_rate": 1e-05, + "loss": 1.2416, + "step": 666 + }, + { + "epoch": 0.3005181347150259, + "grad_norm": 1.7494014354815353, + "learning_rate": 1e-05, + "loss": 1.263, + "step": 667 + }, + { + "epoch": 0.3009686866411354, + "grad_norm": 1.8595411629469847, + "learning_rate": 1e-05, + "loss": 1.2161, + "step": 668 + }, + { + "epoch": 0.3014192385672449, + "grad_norm": 1.8264571962662959, + "learning_rate": 1e-05, + "loss": 1.2698, + "step": 669 + }, + { + "epoch": 0.3018697904933544, + "grad_norm": 1.9191326189499924, + "learning_rate": 1e-05, + "loss": 1.2612, + "step": 670 + }, + { + "epoch": 0.3023203424194638, + "grad_norm": 2.160600233993354, + "learning_rate": 1e-05, + "loss": 1.2138, + "step": 671 + }, + { + "epoch": 0.3027708943455733, + "grad_norm": 1.7713948983525214, + "learning_rate": 1e-05, + "loss": 1.2791, + "step": 672 + }, + { + "epoch": 0.3032214462716828, + "grad_norm": 1.7448790476109295, + "learning_rate": 1e-05, + "loss": 1.2671, + "step": 673 + }, + { + "epoch": 0.3036719981977923, + "grad_norm": 1.8631161510984464, + "learning_rate": 1e-05, + "loss": 1.2832, + "step": 674 + }, + { + "epoch": 0.30412255012390177, + "grad_norm": 1.8206084628951194, + "learning_rate": 1e-05, + "loss": 1.2223, + "step": 675 + }, + { + "epoch": 0.30457310205001126, + "grad_norm": 1.9898347824902163, + "learning_rate": 1e-05, + "loss": 1.3051, + "step": 676 + }, + { + "epoch": 0.30502365397612075, + "grad_norm": 1.9515819844602575, + "learning_rate": 1e-05, + "loss": 1.2252, + "step": 677 + }, + { + "epoch": 0.30547420590223023, + "grad_norm": 1.9669775761642008, + "learning_rate": 1e-05, + "loss": 1.3134, + "step": 678 + }, + { + "epoch": 0.3059247578283397, + "grad_norm": 2.041871634160802, + "learning_rate": 1e-05, + "loss": 1.2395, + "step": 679 + }, + { + "epoch": 0.3063753097544492, + "grad_norm": 1.6875829319333566, + "learning_rate": 1e-05, + "loss": 1.2713, + "step": 680 + }, + { + "epoch": 0.3068258616805587, + "grad_norm": 1.8023021843830136, + "learning_rate": 1e-05, + "loss": 1.2073, + "step": 681 + }, + { + "epoch": 0.3072764136066682, + "grad_norm": 2.028430485499546, + "learning_rate": 1e-05, + "loss": 1.2381, + "step": 682 + }, + { + "epoch": 0.3077269655327777, + "grad_norm": 1.9448083800430478, + "learning_rate": 1e-05, + "loss": 1.2199, + "step": 683 + }, + { + "epoch": 0.30817751745888716, + "grad_norm": 1.820721473843506, + "learning_rate": 1e-05, + "loss": 1.2015, + "step": 684 + }, + { + "epoch": 0.3086280693849966, + "grad_norm": 1.949589011134347, + "learning_rate": 1e-05, + "loss": 1.2072, + "step": 685 + }, + { + "epoch": 0.3090786213111061, + "grad_norm": 1.8732994550966031, + "learning_rate": 1e-05, + "loss": 1.2967, + "step": 686 + }, + { + "epoch": 0.30952917323721557, + "grad_norm": 1.932829278229781, + "learning_rate": 1e-05, + "loss": 1.2673, + "step": 687 + }, + { + 
"epoch": 0.30997972516332506, + "grad_norm": 1.9086931128881206, + "learning_rate": 1e-05, + "loss": 1.2758, + "step": 688 + }, + { + "epoch": 0.31043027708943455, + "grad_norm": 1.760708860736313, + "learning_rate": 1e-05, + "loss": 1.2494, + "step": 689 + }, + { + "epoch": 0.31088082901554404, + "grad_norm": 1.9712662595531913, + "learning_rate": 1e-05, + "loss": 1.2472, + "step": 690 + }, + { + "epoch": 0.3113313809416535, + "grad_norm": 1.8962750227529488, + "learning_rate": 1e-05, + "loss": 1.2289, + "step": 691 + }, + { + "epoch": 0.311781932867763, + "grad_norm": 2.0300733168828597, + "learning_rate": 1e-05, + "loss": 1.2079, + "step": 692 + }, + { + "epoch": 0.3122324847938725, + "grad_norm": 1.8056562646023957, + "learning_rate": 1e-05, + "loss": 1.2212, + "step": 693 + }, + { + "epoch": 0.312683036719982, + "grad_norm": 1.7827128325999113, + "learning_rate": 1e-05, + "loss": 1.3003, + "step": 694 + }, + { + "epoch": 0.3131335886460915, + "grad_norm": 1.9333767688649806, + "learning_rate": 1e-05, + "loss": 1.2323, + "step": 695 + }, + { + "epoch": 0.31358414057220096, + "grad_norm": 1.8373807667649689, + "learning_rate": 1e-05, + "loss": 1.2484, + "step": 696 + }, + { + "epoch": 0.31403469249831045, + "grad_norm": 1.9226577869565329, + "learning_rate": 1e-05, + "loss": 1.2749, + "step": 697 + }, + { + "epoch": 0.31448524442441994, + "grad_norm": 1.984429483011316, + "learning_rate": 1e-05, + "loss": 1.2305, + "step": 698 + }, + { + "epoch": 0.3149357963505294, + "grad_norm": 1.7189193892169639, + "learning_rate": 1e-05, + "loss": 1.2669, + "step": 699 + }, + { + "epoch": 0.31538634827663886, + "grad_norm": 1.8465549129475556, + "learning_rate": 1e-05, + "loss": 1.2127, + "step": 700 + }, + { + "epoch": 0.31583690020274835, + "grad_norm": 1.8360690944861742, + "learning_rate": 1e-05, + "loss": 1.2512, + "step": 701 + }, + { + "epoch": 0.31628745212885784, + "grad_norm": 1.5944897716492685, + "learning_rate": 1e-05, + "loss": 1.2394, + "step": 702 + }, + { + "epoch": 0.3167380040549673, + "grad_norm": 1.9074999633821268, + "learning_rate": 1e-05, + "loss": 1.2171, + "step": 703 + }, + { + "epoch": 0.3171885559810768, + "grad_norm": 1.992601939946842, + "learning_rate": 1e-05, + "loss": 1.2643, + "step": 704 + }, + { + "epoch": 0.3176391079071863, + "grad_norm": 1.930457455101624, + "learning_rate": 1e-05, + "loss": 1.2193, + "step": 705 + }, + { + "epoch": 0.3180896598332958, + "grad_norm": 1.9228635882655845, + "learning_rate": 1e-05, + "loss": 1.2466, + "step": 706 + }, + { + "epoch": 0.3185402117594053, + "grad_norm": 2.042348035086456, + "learning_rate": 1e-05, + "loss": 1.2935, + "step": 707 + }, + { + "epoch": 0.31899076368551477, + "grad_norm": 1.8468885660035388, + "learning_rate": 1e-05, + "loss": 1.2542, + "step": 708 + }, + { + "epoch": 0.31944131561162425, + "grad_norm": 1.93678191805368, + "learning_rate": 1e-05, + "loss": 1.2067, + "step": 709 + }, + { + "epoch": 0.31989186753773374, + "grad_norm": 1.8568046179383066, + "learning_rate": 1e-05, + "loss": 1.2448, + "step": 710 + }, + { + "epoch": 0.32034241946384323, + "grad_norm": 1.936668150957869, + "learning_rate": 1e-05, + "loss": 1.2612, + "step": 711 + }, + { + "epoch": 0.3207929713899527, + "grad_norm": 1.7810908935731147, + "learning_rate": 1e-05, + "loss": 1.2294, + "step": 712 + }, + { + "epoch": 0.32124352331606215, + "grad_norm": 1.9363587940760691, + "learning_rate": 1e-05, + "loss": 1.2686, + "step": 713 + }, + { + "epoch": 0.32169407524217164, + "grad_norm": 1.828765552522956, + "learning_rate": 1e-05, + 
"loss": 1.2174, + "step": 714 + }, + { + "epoch": 0.32214462716828113, + "grad_norm": 1.9851127184045374, + "learning_rate": 1e-05, + "loss": 1.2122, + "step": 715 + }, + { + "epoch": 0.3225951790943906, + "grad_norm": 1.6979971611419993, + "learning_rate": 1e-05, + "loss": 1.2277, + "step": 716 + }, + { + "epoch": 0.3230457310205001, + "grad_norm": 1.798659712836174, + "learning_rate": 1e-05, + "loss": 1.2619, + "step": 717 + }, + { + "epoch": 0.3234962829466096, + "grad_norm": 1.9369672921677843, + "learning_rate": 1e-05, + "loss": 1.1789, + "step": 718 + }, + { + "epoch": 0.3239468348727191, + "grad_norm": 1.7233293500872506, + "learning_rate": 1e-05, + "loss": 1.2636, + "step": 719 + }, + { + "epoch": 0.32439738679882857, + "grad_norm": 1.695406821888204, + "learning_rate": 1e-05, + "loss": 1.2562, + "step": 720 + }, + { + "epoch": 0.32484793872493806, + "grad_norm": 1.7870571922328964, + "learning_rate": 1e-05, + "loss": 1.2992, + "step": 721 + }, + { + "epoch": 0.32529849065104754, + "grad_norm": 1.987769705851601, + "learning_rate": 1e-05, + "loss": 1.263, + "step": 722 + }, + { + "epoch": 0.32574904257715703, + "grad_norm": 2.073377619924158, + "learning_rate": 1e-05, + "loss": 1.2416, + "step": 723 + }, + { + "epoch": 0.3261995945032665, + "grad_norm": 1.8075389951410414, + "learning_rate": 1e-05, + "loss": 1.2239, + "step": 724 + }, + { + "epoch": 0.326650146429376, + "grad_norm": 1.8767063676009834, + "learning_rate": 1e-05, + "loss": 1.2238, + "step": 725 + }, + { + "epoch": 0.3271006983554855, + "grad_norm": 1.9335488853913598, + "learning_rate": 1e-05, + "loss": 1.2512, + "step": 726 + }, + { + "epoch": 0.32755125028159493, + "grad_norm": 1.9775223677374711, + "learning_rate": 1e-05, + "loss": 1.2708, + "step": 727 + }, + { + "epoch": 0.3280018022077044, + "grad_norm": 1.9912742641164474, + "learning_rate": 1e-05, + "loss": 1.252, + "step": 728 + }, + { + "epoch": 0.3284523541338139, + "grad_norm": 2.09177823369937, + "learning_rate": 1e-05, + "loss": 1.2249, + "step": 729 + }, + { + "epoch": 0.3289029060599234, + "grad_norm": 1.9218260164559409, + "learning_rate": 1e-05, + "loss": 1.2589, + "step": 730 + }, + { + "epoch": 0.3293534579860329, + "grad_norm": 1.816883454509941, + "learning_rate": 1e-05, + "loss": 1.2504, + "step": 731 + }, + { + "epoch": 0.32980400991214237, + "grad_norm": 1.9463305657675822, + "learning_rate": 1e-05, + "loss": 1.2332, + "step": 732 + }, + { + "epoch": 0.33025456183825186, + "grad_norm": 1.7032723184588883, + "learning_rate": 1e-05, + "loss": 1.2376, + "step": 733 + }, + { + "epoch": 0.33070511376436135, + "grad_norm": 1.991976651796448, + "learning_rate": 1e-05, + "loss": 1.2354, + "step": 734 + }, + { + "epoch": 0.33115566569047084, + "grad_norm": 1.8782791838639008, + "learning_rate": 1e-05, + "loss": 1.2649, + "step": 735 + }, + { + "epoch": 0.3316062176165803, + "grad_norm": 1.6540855870657907, + "learning_rate": 1e-05, + "loss": 1.2287, + "step": 736 + }, + { + "epoch": 0.3320567695426898, + "grad_norm": 1.9522705189092093, + "learning_rate": 1e-05, + "loss": 1.2337, + "step": 737 + }, + { + "epoch": 0.3325073214687993, + "grad_norm": 1.935043846337164, + "learning_rate": 1e-05, + "loss": 1.2327, + "step": 738 + }, + { + "epoch": 0.3329578733949088, + "grad_norm": 1.807798892625471, + "learning_rate": 1e-05, + "loss": 1.2422, + "step": 739 + }, + { + "epoch": 0.3334084253210182, + "grad_norm": 1.8889784912562422, + "learning_rate": 1e-05, + "loss": 1.2715, + "step": 740 + }, + { + "epoch": 0.3338589772471277, + "grad_norm": 
1.723950690185663, + "learning_rate": 1e-05, + "loss": 1.2289, + "step": 741 + }, + { + "epoch": 0.3343095291732372, + "grad_norm": 1.5995009443472274, + "learning_rate": 1e-05, + "loss": 1.2203, + "step": 742 + }, + { + "epoch": 0.3347600810993467, + "grad_norm": 1.945758037730679, + "learning_rate": 1e-05, + "loss": 1.2116, + "step": 743 + }, + { + "epoch": 0.3352106330254562, + "grad_norm": 1.9733192591587503, + "learning_rate": 1e-05, + "loss": 1.232, + "step": 744 + }, + { + "epoch": 0.33566118495156566, + "grad_norm": 2.0633809873560884, + "learning_rate": 1e-05, + "loss": 1.2626, + "step": 745 + }, + { + "epoch": 0.33611173687767515, + "grad_norm": 2.0365944648132155, + "learning_rate": 1e-05, + "loss": 1.1862, + "step": 746 + }, + { + "epoch": 0.33656228880378464, + "grad_norm": 1.779828087983778, + "learning_rate": 1e-05, + "loss": 1.2454, + "step": 747 + }, + { + "epoch": 0.3370128407298941, + "grad_norm": 1.8123635566033534, + "learning_rate": 1e-05, + "loss": 1.2256, + "step": 748 + }, + { + "epoch": 0.3374633926560036, + "grad_norm": 2.1513724728075894, + "learning_rate": 1e-05, + "loss": 1.2773, + "step": 749 + }, + { + "epoch": 0.3379139445821131, + "grad_norm": 2.0077394950898326, + "learning_rate": 1e-05, + "loss": 1.2122, + "step": 750 + }, + { + "epoch": 0.3383644965082226, + "grad_norm": 1.9381550250884332, + "learning_rate": 1e-05, + "loss": 1.2519, + "step": 751 + }, + { + "epoch": 0.3388150484343321, + "grad_norm": 2.025361026523369, + "learning_rate": 1e-05, + "loss": 1.2078, + "step": 752 + }, + { + "epoch": 0.33926560036044157, + "grad_norm": 1.8400031398551076, + "learning_rate": 1e-05, + "loss": 1.2663, + "step": 753 + }, + { + "epoch": 0.339716152286551, + "grad_norm": 1.8809149555202167, + "learning_rate": 1e-05, + "loss": 1.2829, + "step": 754 + }, + { + "epoch": 0.3401667042126605, + "grad_norm": 1.9664266559297683, + "learning_rate": 1e-05, + "loss": 1.2397, + "step": 755 + }, + { + "epoch": 0.34061725613877, + "grad_norm": 2.041521078939078, + "learning_rate": 1e-05, + "loss": 1.2566, + "step": 756 + }, + { + "epoch": 0.34106780806487946, + "grad_norm": 1.6817834186608238, + "learning_rate": 1e-05, + "loss": 1.2897, + "step": 757 + }, + { + "epoch": 0.34151835999098895, + "grad_norm": 1.9397442643785014, + "learning_rate": 1e-05, + "loss": 1.264, + "step": 758 + }, + { + "epoch": 0.34196891191709844, + "grad_norm": 2.1191890181952964, + "learning_rate": 1e-05, + "loss": 1.184, + "step": 759 + }, + { + "epoch": 0.3424194638432079, + "grad_norm": 1.9793801207753674, + "learning_rate": 1e-05, + "loss": 1.2168, + "step": 760 + }, + { + "epoch": 0.3428700157693174, + "grad_norm": 1.9456244095521873, + "learning_rate": 1e-05, + "loss": 1.2407, + "step": 761 + }, + { + "epoch": 0.3433205676954269, + "grad_norm": 1.6993983808068778, + "learning_rate": 1e-05, + "loss": 1.2893, + "step": 762 + }, + { + "epoch": 0.3437711196215364, + "grad_norm": 1.769049718489682, + "learning_rate": 1e-05, + "loss": 1.2378, + "step": 763 + }, + { + "epoch": 0.3442216715476459, + "grad_norm": 1.9129320101600475, + "learning_rate": 1e-05, + "loss": 1.2181, + "step": 764 + }, + { + "epoch": 0.34467222347375537, + "grad_norm": 1.8656987912590273, + "learning_rate": 1e-05, + "loss": 1.1849, + "step": 765 + }, + { + "epoch": 0.34512277539986486, + "grad_norm": 2.098538830379863, + "learning_rate": 1e-05, + "loss": 1.2239, + "step": 766 + }, + { + "epoch": 0.34557332732597434, + "grad_norm": 1.8523257138169724, + "learning_rate": 1e-05, + "loss": 1.223, + "step": 767 + }, + { + "epoch": 
0.3460238792520838, + "grad_norm": 1.7717813153696425, + "learning_rate": 1e-05, + "loss": 1.2795, + "step": 768 + }, + { + "epoch": 0.34647443117819327, + "grad_norm": 1.932633789733974, + "learning_rate": 1e-05, + "loss": 1.2513, + "step": 769 + }, + { + "epoch": 0.34692498310430275, + "grad_norm": 1.9188570240882774, + "learning_rate": 1e-05, + "loss": 1.2028, + "step": 770 + }, + { + "epoch": 0.34737553503041224, + "grad_norm": 1.9153912274518279, + "learning_rate": 1e-05, + "loss": 1.2137, + "step": 771 + }, + { + "epoch": 0.34782608695652173, + "grad_norm": 1.7891212005130266, + "learning_rate": 1e-05, + "loss": 1.1921, + "step": 772 + }, + { + "epoch": 0.3482766388826312, + "grad_norm": 2.1043655596657884, + "learning_rate": 1e-05, + "loss": 1.2318, + "step": 773 + }, + { + "epoch": 0.3487271908087407, + "grad_norm": 1.8048577556595926, + "learning_rate": 1e-05, + "loss": 1.2283, + "step": 774 + }, + { + "epoch": 0.3491777427348502, + "grad_norm": 1.8763088020409484, + "learning_rate": 1e-05, + "loss": 1.2978, + "step": 775 + }, + { + "epoch": 0.3496282946609597, + "grad_norm": 1.8610132874734748, + "learning_rate": 1e-05, + "loss": 1.222, + "step": 776 + }, + { + "epoch": 0.35007884658706917, + "grad_norm": 1.8498527649552017, + "learning_rate": 1e-05, + "loss": 1.2687, + "step": 777 + }, + { + "epoch": 0.35052939851317866, + "grad_norm": 1.7764009803913332, + "learning_rate": 1e-05, + "loss": 1.2407, + "step": 778 + }, + { + "epoch": 0.35097995043928815, + "grad_norm": 1.8972165021959277, + "learning_rate": 1e-05, + "loss": 1.2507, + "step": 779 + }, + { + "epoch": 0.35143050236539763, + "grad_norm": 1.8941513283311089, + "learning_rate": 1e-05, + "loss": 1.3047, + "step": 780 + }, + { + "epoch": 0.3518810542915071, + "grad_norm": 1.7516035467723052, + "learning_rate": 1e-05, + "loss": 1.2209, + "step": 781 + }, + { + "epoch": 0.35233160621761656, + "grad_norm": 1.7561059495858193, + "learning_rate": 1e-05, + "loss": 1.1997, + "step": 782 + }, + { + "epoch": 0.35278215814372604, + "grad_norm": 1.7901381312825824, + "learning_rate": 1e-05, + "loss": 1.2098, + "step": 783 + }, + { + "epoch": 0.35323271006983553, + "grad_norm": 1.7508908641532699, + "learning_rate": 1e-05, + "loss": 1.2572, + "step": 784 + }, + { + "epoch": 0.353683261995945, + "grad_norm": 1.8488980584698382, + "learning_rate": 1e-05, + "loss": 1.2055, + "step": 785 + }, + { + "epoch": 0.3541338139220545, + "grad_norm": 1.7706487550217227, + "learning_rate": 1e-05, + "loss": 1.2951, + "step": 786 + }, + { + "epoch": 0.354584365848164, + "grad_norm": 1.9049654575616393, + "learning_rate": 1e-05, + "loss": 1.2546, + "step": 787 + }, + { + "epoch": 0.3550349177742735, + "grad_norm": 1.9533455484883948, + "learning_rate": 1e-05, + "loss": 1.2211, + "step": 788 + }, + { + "epoch": 0.35548546970038297, + "grad_norm": 1.744879836527318, + "learning_rate": 1e-05, + "loss": 1.2455, + "step": 789 + }, + { + "epoch": 0.35593602162649246, + "grad_norm": 1.884398471923905, + "learning_rate": 1e-05, + "loss": 1.2628, + "step": 790 + }, + { + "epoch": 0.35638657355260195, + "grad_norm": 1.914153193744963, + "learning_rate": 1e-05, + "loss": 1.2213, + "step": 791 + }, + { + "epoch": 0.35683712547871144, + "grad_norm": 1.7549023438590918, + "learning_rate": 1e-05, + "loss": 1.2402, + "step": 792 + }, + { + "epoch": 0.3572876774048209, + "grad_norm": 1.848486134709626, + "learning_rate": 1e-05, + "loss": 1.2392, + "step": 793 + }, + { + "epoch": 0.3577382293309304, + "grad_norm": 1.7459577103976147, + "learning_rate": 1e-05, + 
"loss": 1.2305, + "step": 794 + }, + { + "epoch": 0.3581887812570399, + "grad_norm": 1.7715126663952638, + "learning_rate": 1e-05, + "loss": 1.2304, + "step": 795 + }, + { + "epoch": 0.35863933318314933, + "grad_norm": 1.9331485517321647, + "learning_rate": 1e-05, + "loss": 1.211, + "step": 796 + }, + { + "epoch": 0.3590898851092588, + "grad_norm": 1.9171014142037934, + "learning_rate": 1e-05, + "loss": 1.233, + "step": 797 + }, + { + "epoch": 0.3595404370353683, + "grad_norm": 1.8777048679057136, + "learning_rate": 1e-05, + "loss": 1.2483, + "step": 798 + }, + { + "epoch": 0.3599909889614778, + "grad_norm": 2.2202998663962754, + "learning_rate": 1e-05, + "loss": 1.272, + "step": 799 + }, + { + "epoch": 0.3604415408875873, + "grad_norm": 1.9769852229050389, + "learning_rate": 1e-05, + "loss": 1.236, + "step": 800 + }, + { + "epoch": 0.3608920928136968, + "grad_norm": 1.7462248980283235, + "learning_rate": 1e-05, + "loss": 1.296, + "step": 801 + }, + { + "epoch": 0.36134264473980626, + "grad_norm": 1.919523299895141, + "learning_rate": 1e-05, + "loss": 1.208, + "step": 802 + }, + { + "epoch": 0.36179319666591575, + "grad_norm": 1.7725751095521172, + "learning_rate": 1e-05, + "loss": 1.2697, + "step": 803 + }, + { + "epoch": 0.36224374859202524, + "grad_norm": 1.998043101972712, + "learning_rate": 1e-05, + "loss": 1.2168, + "step": 804 + }, + { + "epoch": 0.3626943005181347, + "grad_norm": 1.93511292404548, + "learning_rate": 1e-05, + "loss": 1.261, + "step": 805 + }, + { + "epoch": 0.3631448524442442, + "grad_norm": 1.939482584712453, + "learning_rate": 1e-05, + "loss": 1.2666, + "step": 806 + }, + { + "epoch": 0.3635954043703537, + "grad_norm": 1.9633358814141688, + "learning_rate": 1e-05, + "loss": 1.1979, + "step": 807 + }, + { + "epoch": 0.3640459562964632, + "grad_norm": 1.974552504756949, + "learning_rate": 1e-05, + "loss": 1.2228, + "step": 808 + }, + { + "epoch": 0.3644965082225726, + "grad_norm": 1.8943325796584518, + "learning_rate": 1e-05, + "loss": 1.2809, + "step": 809 + }, + { + "epoch": 0.3649470601486821, + "grad_norm": 1.8415647742821555, + "learning_rate": 1e-05, + "loss": 1.2498, + "step": 810 + }, + { + "epoch": 0.3653976120747916, + "grad_norm": 1.7198111870455757, + "learning_rate": 1e-05, + "loss": 1.2474, + "step": 811 + }, + { + "epoch": 0.3658481640009011, + "grad_norm": 1.8184528638787918, + "learning_rate": 1e-05, + "loss": 1.261, + "step": 812 + }, + { + "epoch": 0.3662987159270106, + "grad_norm": 1.908720974845513, + "learning_rate": 1e-05, + "loss": 1.2648, + "step": 813 + }, + { + "epoch": 0.36674926785312006, + "grad_norm": 1.7975656253941963, + "learning_rate": 1e-05, + "loss": 1.2426, + "step": 814 + }, + { + "epoch": 0.36719981977922955, + "grad_norm": 1.9362132595172787, + "learning_rate": 1e-05, + "loss": 1.256, + "step": 815 + }, + { + "epoch": 0.36765037170533904, + "grad_norm": 2.018637379819173, + "learning_rate": 1e-05, + "loss": 1.2344, + "step": 816 + }, + { + "epoch": 0.36810092363144853, + "grad_norm": 1.9802241053742995, + "learning_rate": 1e-05, + "loss": 1.2589, + "step": 817 + }, + { + "epoch": 0.368551475557558, + "grad_norm": 1.8099024787025881, + "learning_rate": 1e-05, + "loss": 1.2971, + "step": 818 + }, + { + "epoch": 0.3690020274836675, + "grad_norm": 2.0168776458192545, + "learning_rate": 1e-05, + "loss": 1.2287, + "step": 819 + }, + { + "epoch": 0.369452579409777, + "grad_norm": 1.8166821194359652, + "learning_rate": 1e-05, + "loss": 1.221, + "step": 820 + }, + { + "epoch": 0.3699031313358865, + "grad_norm": 1.8788222274730944, + 
"learning_rate": 1e-05, + "loss": 1.2146, + "step": 821 + }, + { + "epoch": 0.37035368326199597, + "grad_norm": 1.8597354689004102, + "learning_rate": 1e-05, + "loss": 1.3146, + "step": 822 + }, + { + "epoch": 0.3708042351881054, + "grad_norm": 1.8490024962659892, + "learning_rate": 1e-05, + "loss": 1.2531, + "step": 823 + }, + { + "epoch": 0.3712547871142149, + "grad_norm": 1.9932131168744172, + "learning_rate": 1e-05, + "loss": 1.2258, + "step": 824 + }, + { + "epoch": 0.3717053390403244, + "grad_norm": 1.792362296656891, + "learning_rate": 1e-05, + "loss": 1.2201, + "step": 825 + }, + { + "epoch": 0.37215589096643387, + "grad_norm": 1.938705231070029, + "learning_rate": 1e-05, + "loss": 1.2439, + "step": 826 + }, + { + "epoch": 0.37260644289254335, + "grad_norm": 1.7842902067000777, + "learning_rate": 1e-05, + "loss": 1.2847, + "step": 827 + }, + { + "epoch": 0.37305699481865284, + "grad_norm": 1.7616563146804334, + "learning_rate": 1e-05, + "loss": 1.2702, + "step": 828 + }, + { + "epoch": 0.37350754674476233, + "grad_norm": 1.7580772109432945, + "learning_rate": 1e-05, + "loss": 1.2636, + "step": 829 + }, + { + "epoch": 0.3739580986708718, + "grad_norm": 1.8708258964354627, + "learning_rate": 1e-05, + "loss": 1.2103, + "step": 830 + }, + { + "epoch": 0.3744086505969813, + "grad_norm": 2.0918050039950344, + "learning_rate": 1e-05, + "loss": 1.2447, + "step": 831 + }, + { + "epoch": 0.3748592025230908, + "grad_norm": 2.01351718562748, + "learning_rate": 1e-05, + "loss": 1.2506, + "step": 832 + }, + { + "epoch": 0.3753097544492003, + "grad_norm": 1.8425337248709375, + "learning_rate": 1e-05, + "loss": 1.2987, + "step": 833 + }, + { + "epoch": 0.37576030637530977, + "grad_norm": 1.8605102177776403, + "learning_rate": 1e-05, + "loss": 1.2332, + "step": 834 + }, + { + "epoch": 0.37621085830141926, + "grad_norm": 1.6946568207636692, + "learning_rate": 1e-05, + "loss": 1.2329, + "step": 835 + }, + { + "epoch": 0.37666141022752875, + "grad_norm": 1.8772757818015977, + "learning_rate": 1e-05, + "loss": 1.221, + "step": 836 + }, + { + "epoch": 0.3771119621536382, + "grad_norm": 1.842851166865783, + "learning_rate": 1e-05, + "loss": 1.2356, + "step": 837 + }, + { + "epoch": 0.37756251407974767, + "grad_norm": 1.8849523873973497, + "learning_rate": 1e-05, + "loss": 1.2147, + "step": 838 + }, + { + "epoch": 0.37801306600585716, + "grad_norm": 1.8533083831614017, + "learning_rate": 1e-05, + "loss": 1.2316, + "step": 839 + }, + { + "epoch": 0.37846361793196664, + "grad_norm": 1.9790724756108462, + "learning_rate": 1e-05, + "loss": 1.2492, + "step": 840 + }, + { + "epoch": 0.37891416985807613, + "grad_norm": 1.9415222438339077, + "learning_rate": 1e-05, + "loss": 1.2088, + "step": 841 + }, + { + "epoch": 0.3793647217841856, + "grad_norm": 2.0064179702380525, + "learning_rate": 1e-05, + "loss": 1.2054, + "step": 842 + }, + { + "epoch": 0.3798152737102951, + "grad_norm": 1.9995002419984607, + "learning_rate": 1e-05, + "loss": 1.2479, + "step": 843 + }, + { + "epoch": 0.3802658256364046, + "grad_norm": 1.746526951976324, + "learning_rate": 1e-05, + "loss": 1.2677, + "step": 844 + }, + { + "epoch": 0.3807163775625141, + "grad_norm": 1.8472340617635299, + "learning_rate": 1e-05, + "loss": 1.2229, + "step": 845 + }, + { + "epoch": 0.3811669294886236, + "grad_norm": 1.6989791803179513, + "learning_rate": 1e-05, + "loss": 1.248, + "step": 846 + }, + { + "epoch": 0.38161748141473306, + "grad_norm": 1.7861084739049955, + "learning_rate": 1e-05, + "loss": 1.2479, + "step": 847 + }, + { + "epoch": 
0.38206803334084255, + "grad_norm": 1.769562122846481, + "learning_rate": 1e-05, + "loss": 1.2067, + "step": 848 + }, + { + "epoch": 0.38251858526695204, + "grad_norm": 1.884505014051771, + "learning_rate": 1e-05, + "loss": 1.2004, + "step": 849 + }, + { + "epoch": 0.3829691371930615, + "grad_norm": 1.8608952507144012, + "learning_rate": 1e-05, + "loss": 1.2203, + "step": 850 + }, + { + "epoch": 0.38341968911917096, + "grad_norm": 1.8068312924208156, + "learning_rate": 1e-05, + "loss": 1.2325, + "step": 851 + }, + { + "epoch": 0.38387024104528045, + "grad_norm": 1.9077098493511166, + "learning_rate": 1e-05, + "loss": 1.292, + "step": 852 + }, + { + "epoch": 0.38432079297138994, + "grad_norm": 1.7981833545840775, + "learning_rate": 1e-05, + "loss": 1.1984, + "step": 853 + }, + { + "epoch": 0.3847713448974994, + "grad_norm": 1.856513701528131, + "learning_rate": 1e-05, + "loss": 1.2531, + "step": 854 + }, + { + "epoch": 0.3852218968236089, + "grad_norm": 1.857015365071636, + "learning_rate": 1e-05, + "loss": 1.2707, + "step": 855 + }, + { + "epoch": 0.3856724487497184, + "grad_norm": 1.9707250335863131, + "learning_rate": 1e-05, + "loss": 1.2419, + "step": 856 + }, + { + "epoch": 0.3861230006758279, + "grad_norm": 1.9106073526456564, + "learning_rate": 1e-05, + "loss": 1.2054, + "step": 857 + }, + { + "epoch": 0.3865735526019374, + "grad_norm": 1.9501533883167215, + "learning_rate": 1e-05, + "loss": 1.2286, + "step": 858 + }, + { + "epoch": 0.38702410452804686, + "grad_norm": 1.9959109986598897, + "learning_rate": 1e-05, + "loss": 1.2468, + "step": 859 + }, + { + "epoch": 0.38747465645415635, + "grad_norm": 1.6484250314284374, + "learning_rate": 1e-05, + "loss": 1.2478, + "step": 860 + }, + { + "epoch": 0.38792520838026584, + "grad_norm": 1.9677741572608782, + "learning_rate": 1e-05, + "loss": 1.2097, + "step": 861 + }, + { + "epoch": 0.38837576030637533, + "grad_norm": 1.8559375675050855, + "learning_rate": 1e-05, + "loss": 1.2389, + "step": 862 + }, + { + "epoch": 0.3888263122324848, + "grad_norm": 1.7870988194556259, + "learning_rate": 1e-05, + "loss": 1.2223, + "step": 863 + }, + { + "epoch": 0.3892768641585943, + "grad_norm": 1.8287900417708567, + "learning_rate": 1e-05, + "loss": 1.2405, + "step": 864 + }, + { + "epoch": 0.38972741608470374, + "grad_norm": 1.951938431706165, + "learning_rate": 1e-05, + "loss": 1.2226, + "step": 865 + }, + { + "epoch": 0.3901779680108132, + "grad_norm": 1.752310475841562, + "learning_rate": 1e-05, + "loss": 1.2216, + "step": 866 + }, + { + "epoch": 0.3906285199369227, + "grad_norm": 1.9543454009939398, + "learning_rate": 1e-05, + "loss": 1.2375, + "step": 867 + }, + { + "epoch": 0.3910790718630322, + "grad_norm": 1.8219769800646242, + "learning_rate": 1e-05, + "loss": 1.2417, + "step": 868 + }, + { + "epoch": 0.3915296237891417, + "grad_norm": 1.8046726965804514, + "learning_rate": 1e-05, + "loss": 1.2429, + "step": 869 + }, + { + "epoch": 0.3919801757152512, + "grad_norm": 1.7627905770282928, + "learning_rate": 1e-05, + "loss": 1.1608, + "step": 870 + }, + { + "epoch": 0.39243072764136067, + "grad_norm": 1.815967903711212, + "learning_rate": 1e-05, + "loss": 1.26, + "step": 871 + }, + { + "epoch": 0.39288127956747015, + "grad_norm": 1.7445848284617353, + "learning_rate": 1e-05, + "loss": 1.2577, + "step": 872 + }, + { + "epoch": 0.39333183149357964, + "grad_norm": 1.868942646909875, + "learning_rate": 1e-05, + "loss": 1.214, + "step": 873 + }, + { + "epoch": 0.39378238341968913, + "grad_norm": 1.8210881389865994, + "learning_rate": 1e-05, + "loss": 
1.2435, + "step": 874 + }, + { + "epoch": 0.3942329353457986, + "grad_norm": 1.705569170189639, + "learning_rate": 1e-05, + "loss": 1.241, + "step": 875 + }, + { + "epoch": 0.3946834872719081, + "grad_norm": 1.9540522749998606, + "learning_rate": 1e-05, + "loss": 1.2463, + "step": 876 + }, + { + "epoch": 0.3951340391980176, + "grad_norm": 1.8532976849033589, + "learning_rate": 1e-05, + "loss": 1.2684, + "step": 877 + }, + { + "epoch": 0.3955845911241271, + "grad_norm": 2.0267127643103375, + "learning_rate": 1e-05, + "loss": 1.2621, + "step": 878 + }, + { + "epoch": 0.3960351430502365, + "grad_norm": 1.7195863840982162, + "learning_rate": 1e-05, + "loss": 1.2528, + "step": 879 + }, + { + "epoch": 0.396485694976346, + "grad_norm": 1.9520831100294582, + "learning_rate": 1e-05, + "loss": 1.23, + "step": 880 + }, + { + "epoch": 0.3969362469024555, + "grad_norm": 1.6876082251249087, + "learning_rate": 1e-05, + "loss": 1.2124, + "step": 881 + }, + { + "epoch": 0.397386798828565, + "grad_norm": 1.9033954135692357, + "learning_rate": 1e-05, + "loss": 1.2337, + "step": 882 + }, + { + "epoch": 0.39783735075467447, + "grad_norm": 2.053902208410709, + "learning_rate": 1e-05, + "loss": 1.2482, + "step": 883 + }, + { + "epoch": 0.39828790268078396, + "grad_norm": 1.95607947734673, + "learning_rate": 1e-05, + "loss": 1.222, + "step": 884 + }, + { + "epoch": 0.39873845460689344, + "grad_norm": 1.7916894763559215, + "learning_rate": 1e-05, + "loss": 1.2135, + "step": 885 + }, + { + "epoch": 0.39918900653300293, + "grad_norm": 1.734770833440029, + "learning_rate": 1e-05, + "loss": 1.2173, + "step": 886 + }, + { + "epoch": 0.3996395584591124, + "grad_norm": 1.8631984425859183, + "learning_rate": 1e-05, + "loss": 1.1849, + "step": 887 + }, + { + "epoch": 0.4000901103852219, + "grad_norm": 1.8462058887209238, + "learning_rate": 1e-05, + "loss": 1.2135, + "step": 888 + }, + { + "epoch": 0.4005406623113314, + "grad_norm": 1.7876150498138967, + "learning_rate": 1e-05, + "loss": 1.2054, + "step": 889 + }, + { + "epoch": 0.4009912142374409, + "grad_norm": 1.854313355156316, + "learning_rate": 1e-05, + "loss": 1.2385, + "step": 890 + }, + { + "epoch": 0.4014417661635504, + "grad_norm": 1.8444446458921264, + "learning_rate": 1e-05, + "loss": 1.2009, + "step": 891 + }, + { + "epoch": 0.4018923180896598, + "grad_norm": 1.9383431386705288, + "learning_rate": 1e-05, + "loss": 1.2204, + "step": 892 + }, + { + "epoch": 0.4023428700157693, + "grad_norm": 2.1113776609966255, + "learning_rate": 1e-05, + "loss": 1.2534, + "step": 893 + }, + { + "epoch": 0.4027934219418788, + "grad_norm": 1.8768166031930706, + "learning_rate": 1e-05, + "loss": 1.2349, + "step": 894 + }, + { + "epoch": 0.40324397386798827, + "grad_norm": 1.773780616193697, + "learning_rate": 1e-05, + "loss": 1.2152, + "step": 895 + }, + { + "epoch": 0.40369452579409776, + "grad_norm": 1.6736686600690427, + "learning_rate": 1e-05, + "loss": 1.2171, + "step": 896 + }, + { + "epoch": 0.40414507772020725, + "grad_norm": 1.7515241224968952, + "learning_rate": 1e-05, + "loss": 1.2017, + "step": 897 + }, + { + "epoch": 0.40459562964631673, + "grad_norm": 1.9593202732085018, + "learning_rate": 1e-05, + "loss": 1.2439, + "step": 898 + }, + { + "epoch": 0.4050461815724262, + "grad_norm": 1.8267834579292501, + "learning_rate": 1e-05, + "loss": 1.2498, + "step": 899 + }, + { + "epoch": 0.4054967334985357, + "grad_norm": 1.8041221315512324, + "learning_rate": 1e-05, + "loss": 1.2342, + "step": 900 + }, + { + "epoch": 0.4059472854246452, + "grad_norm": 1.769381272532187, + 
"learning_rate": 1e-05, + "loss": 1.2233, + "step": 901 + }, + { + "epoch": 0.4063978373507547, + "grad_norm": 1.8129209057417408, + "learning_rate": 1e-05, + "loss": 1.2703, + "step": 902 + }, + { + "epoch": 0.4068483892768642, + "grad_norm": 1.9108631223024326, + "learning_rate": 1e-05, + "loss": 1.1581, + "step": 903 + }, + { + "epoch": 0.40729894120297366, + "grad_norm": 1.9726083051261702, + "learning_rate": 1e-05, + "loss": 1.2196, + "step": 904 + }, + { + "epoch": 0.40774949312908315, + "grad_norm": 1.8478866238047853, + "learning_rate": 1e-05, + "loss": 1.2491, + "step": 905 + }, + { + "epoch": 0.4082000450551926, + "grad_norm": 1.9997563451103388, + "learning_rate": 1e-05, + "loss": 1.2393, + "step": 906 + }, + { + "epoch": 0.40865059698130207, + "grad_norm": 1.8595271723897708, + "learning_rate": 1e-05, + "loss": 1.2191, + "step": 907 + }, + { + "epoch": 0.40910114890741156, + "grad_norm": 1.9314412730516308, + "learning_rate": 1e-05, + "loss": 1.2966, + "step": 908 + }, + { + "epoch": 0.40955170083352105, + "grad_norm": 1.9087951304482553, + "learning_rate": 1e-05, + "loss": 1.1789, + "step": 909 + }, + { + "epoch": 0.41000225275963054, + "grad_norm": 1.7270848651641488, + "learning_rate": 1e-05, + "loss": 1.2484, + "step": 910 + }, + { + "epoch": 0.41045280468574, + "grad_norm": 1.8133435393778934, + "learning_rate": 1e-05, + "loss": 1.2341, + "step": 911 + }, + { + "epoch": 0.4109033566118495, + "grad_norm": 1.7739710977749763, + "learning_rate": 1e-05, + "loss": 1.2591, + "step": 912 + }, + { + "epoch": 0.411353908537959, + "grad_norm": 1.8018454180699817, + "learning_rate": 1e-05, + "loss": 1.2302, + "step": 913 + }, + { + "epoch": 0.4118044604640685, + "grad_norm": 1.899747818272689, + "learning_rate": 1e-05, + "loss": 1.1914, + "step": 914 + }, + { + "epoch": 0.412255012390178, + "grad_norm": 1.6447189441511958, + "learning_rate": 1e-05, + "loss": 1.2483, + "step": 915 + }, + { + "epoch": 0.41270556431628747, + "grad_norm": 1.8335610585183395, + "learning_rate": 1e-05, + "loss": 1.2451, + "step": 916 + }, + { + "epoch": 0.41315611624239695, + "grad_norm": 1.7885413084914048, + "learning_rate": 1e-05, + "loss": 1.2423, + "step": 917 + }, + { + "epoch": 0.41360666816850644, + "grad_norm": 1.714313154932247, + "learning_rate": 1e-05, + "loss": 1.2214, + "step": 918 + }, + { + "epoch": 0.41405722009461593, + "grad_norm": 1.8999263091456016, + "learning_rate": 1e-05, + "loss": 1.211, + "step": 919 + }, + { + "epoch": 0.41450777202072536, + "grad_norm": 1.7615025789848202, + "learning_rate": 1e-05, + "loss": 1.2339, + "step": 920 + }, + { + "epoch": 0.41495832394683485, + "grad_norm": 1.984694925916507, + "learning_rate": 1e-05, + "loss": 1.2596, + "step": 921 + }, + { + "epoch": 0.41540887587294434, + "grad_norm": 1.8309947728006515, + "learning_rate": 1e-05, + "loss": 1.2188, + "step": 922 + }, + { + "epoch": 0.4158594277990538, + "grad_norm": 1.8714866019169583, + "learning_rate": 1e-05, + "loss": 1.2282, + "step": 923 + }, + { + "epoch": 0.4163099797251633, + "grad_norm": 1.6965979518458885, + "learning_rate": 1e-05, + "loss": 1.207, + "step": 924 + }, + { + "epoch": 0.4167605316512728, + "grad_norm": 1.8774421905009213, + "learning_rate": 1e-05, + "loss": 1.2191, + "step": 925 + }, + { + "epoch": 0.4172110835773823, + "grad_norm": 1.759408329157933, + "learning_rate": 1e-05, + "loss": 1.2521, + "step": 926 + }, + { + "epoch": 0.4176616355034918, + "grad_norm": 1.8325304285702253, + "learning_rate": 1e-05, + "loss": 1.2394, + "step": 927 + }, + { + "epoch": 
0.41811218742960127, + "grad_norm": 1.8659079533070821, + "learning_rate": 1e-05, + "loss": 1.2526, + "step": 928 + }, + { + "epoch": 0.41856273935571076, + "grad_norm": 1.88343915219476, + "learning_rate": 1e-05, + "loss": 1.2834, + "step": 929 + }, + { + "epoch": 0.41901329128182024, + "grad_norm": 1.8107333099552414, + "learning_rate": 1e-05, + "loss": 1.2034, + "step": 930 + }, + { + "epoch": 0.41946384320792973, + "grad_norm": 1.8811280025175747, + "learning_rate": 1e-05, + "loss": 1.2346, + "step": 931 + }, + { + "epoch": 0.4199143951340392, + "grad_norm": 1.6863157009590324, + "learning_rate": 1e-05, + "loss": 1.2468, + "step": 932 + }, + { + "epoch": 0.4203649470601487, + "grad_norm": 1.6448935128144737, + "learning_rate": 1e-05, + "loss": 1.2001, + "step": 933 + }, + { + "epoch": 0.42081549898625814, + "grad_norm": 1.9123116226428942, + "learning_rate": 1e-05, + "loss": 1.2457, + "step": 934 + }, + { + "epoch": 0.42126605091236763, + "grad_norm": 1.8960168476037824, + "learning_rate": 1e-05, + "loss": 1.2395, + "step": 935 + }, + { + "epoch": 0.4217166028384771, + "grad_norm": 1.7512695559940004, + "learning_rate": 1e-05, + "loss": 1.2755, + "step": 936 + }, + { + "epoch": 0.4221671547645866, + "grad_norm": 2.112142633500385, + "learning_rate": 1e-05, + "loss": 1.2534, + "step": 937 + }, + { + "epoch": 0.4226177066906961, + "grad_norm": 1.8517229815085718, + "learning_rate": 1e-05, + "loss": 1.2409, + "step": 938 + }, + { + "epoch": 0.4230682586168056, + "grad_norm": 1.958729062600093, + "learning_rate": 1e-05, + "loss": 1.2611, + "step": 939 + }, + { + "epoch": 0.42351881054291507, + "grad_norm": 1.7826395039325846, + "learning_rate": 1e-05, + "loss": 1.2239, + "step": 940 + }, + { + "epoch": 0.42396936246902456, + "grad_norm": 1.7178891678540262, + "learning_rate": 1e-05, + "loss": 1.255, + "step": 941 + }, + { + "epoch": 0.42441991439513405, + "grad_norm": 1.7876765829669556, + "learning_rate": 1e-05, + "loss": 1.2414, + "step": 942 + }, + { + "epoch": 0.42487046632124353, + "grad_norm": 1.8427359715446618, + "learning_rate": 1e-05, + "loss": 1.2635, + "step": 943 + }, + { + "epoch": 0.425321018247353, + "grad_norm": 2.0491975947455736, + "learning_rate": 1e-05, + "loss": 1.2849, + "step": 944 + }, + { + "epoch": 0.4257715701734625, + "grad_norm": 1.6888517145569675, + "learning_rate": 1e-05, + "loss": 1.2113, + "step": 945 + }, + { + "epoch": 0.426222122099572, + "grad_norm": 1.7689491278837048, + "learning_rate": 1e-05, + "loss": 1.247, + "step": 946 + }, + { + "epoch": 0.4266726740256815, + "grad_norm": 1.796391862762361, + "learning_rate": 1e-05, + "loss": 1.2066, + "step": 947 + }, + { + "epoch": 0.4271232259517909, + "grad_norm": 1.8311369964039967, + "learning_rate": 1e-05, + "loss": 1.2364, + "step": 948 + }, + { + "epoch": 0.4275737778779004, + "grad_norm": 1.9450482355011398, + "learning_rate": 1e-05, + "loss": 1.1701, + "step": 949 + }, + { + "epoch": 0.4280243298040099, + "grad_norm": 1.8497392645254098, + "learning_rate": 1e-05, + "loss": 1.2733, + "step": 950 + }, + { + "epoch": 0.4284748817301194, + "grad_norm": 1.9061611409755275, + "learning_rate": 1e-05, + "loss": 1.2525, + "step": 951 + }, + { + "epoch": 0.42892543365622887, + "grad_norm": 1.814663803548855, + "learning_rate": 1e-05, + "loss": 1.2185, + "step": 952 + }, + { + "epoch": 0.42937598558233836, + "grad_norm": 1.7845164252468, + "learning_rate": 1e-05, + "loss": 1.2948, + "step": 953 + }, + { + "epoch": 0.42982653750844785, + "grad_norm": 1.824363044629698, + "learning_rate": 1e-05, + "loss": 
1.2208, + "step": 954 + }, + { + "epoch": 0.43027708943455734, + "grad_norm": 1.86807240064721, + "learning_rate": 1e-05, + "loss": 1.233, + "step": 955 + }, + { + "epoch": 0.4307276413606668, + "grad_norm": 2.0124202223012113, + "learning_rate": 1e-05, + "loss": 1.284, + "step": 956 + }, + { + "epoch": 0.4311781932867763, + "grad_norm": 1.6962010066906617, + "learning_rate": 1e-05, + "loss": 1.2338, + "step": 957 + }, + { + "epoch": 0.4316287452128858, + "grad_norm": 1.8014128500101874, + "learning_rate": 1e-05, + "loss": 1.2282, + "step": 958 + }, + { + "epoch": 0.4320792971389953, + "grad_norm": 1.86509842034215, + "learning_rate": 1e-05, + "loss": 1.2806, + "step": 959 + }, + { + "epoch": 0.4325298490651048, + "grad_norm": 1.9463495755610225, + "learning_rate": 1e-05, + "loss": 1.2613, + "step": 960 + }, + { + "epoch": 0.43298040099121426, + "grad_norm": 2.015400634180842, + "learning_rate": 1e-05, + "loss": 1.2242, + "step": 961 + }, + { + "epoch": 0.4334309529173237, + "grad_norm": 1.7459452192089768, + "learning_rate": 1e-05, + "loss": 1.2269, + "step": 962 + }, + { + "epoch": 0.4338815048434332, + "grad_norm": 1.6682578292669217, + "learning_rate": 1e-05, + "loss": 1.2277, + "step": 963 + }, + { + "epoch": 0.4343320567695427, + "grad_norm": 1.8473342106833701, + "learning_rate": 1e-05, + "loss": 1.2663, + "step": 964 + }, + { + "epoch": 0.43478260869565216, + "grad_norm": 1.7929104274279675, + "learning_rate": 1e-05, + "loss": 1.2174, + "step": 965 + }, + { + "epoch": 0.43523316062176165, + "grad_norm": 1.9113226803614998, + "learning_rate": 1e-05, + "loss": 1.219, + "step": 966 + }, + { + "epoch": 0.43568371254787114, + "grad_norm": 1.7820315389432446, + "learning_rate": 1e-05, + "loss": 1.2093, + "step": 967 + }, + { + "epoch": 0.4361342644739806, + "grad_norm": 1.7912651903425194, + "learning_rate": 1e-05, + "loss": 1.21, + "step": 968 + }, + { + "epoch": 0.4365848164000901, + "grad_norm": 1.7021897938320005, + "learning_rate": 1e-05, + "loss": 1.2658, + "step": 969 + }, + { + "epoch": 0.4370353683261996, + "grad_norm": 1.713127972915882, + "learning_rate": 1e-05, + "loss": 1.2431, + "step": 970 + }, + { + "epoch": 0.4374859202523091, + "grad_norm": 1.976456964290821, + "learning_rate": 1e-05, + "loss": 1.2396, + "step": 971 + }, + { + "epoch": 0.4379364721784186, + "grad_norm": 1.690709484557212, + "learning_rate": 1e-05, + "loss": 1.1973, + "step": 972 + }, + { + "epoch": 0.43838702410452807, + "grad_norm": 1.7631626297677152, + "learning_rate": 1e-05, + "loss": 1.2917, + "step": 973 + }, + { + "epoch": 0.43883757603063756, + "grad_norm": 1.9321613627415042, + "learning_rate": 1e-05, + "loss": 1.2996, + "step": 974 + }, + { + "epoch": 0.439288127956747, + "grad_norm": 1.9212148960520163, + "learning_rate": 1e-05, + "loss": 1.2572, + "step": 975 + }, + { + "epoch": 0.4397386798828565, + "grad_norm": 1.8545946670818378, + "learning_rate": 1e-05, + "loss": 1.2415, + "step": 976 + }, + { + "epoch": 0.44018923180896596, + "grad_norm": 1.7357455740514371, + "learning_rate": 1e-05, + "loss": 1.2113, + "step": 977 + }, + { + "epoch": 0.44063978373507545, + "grad_norm": 1.7103721758032753, + "learning_rate": 1e-05, + "loss": 1.2573, + "step": 978 + }, + { + "epoch": 0.44109033566118494, + "grad_norm": 1.621500284952924, + "learning_rate": 1e-05, + "loss": 1.2004, + "step": 979 + }, + { + "epoch": 0.44154088758729443, + "grad_norm": 1.8143231838877647, + "learning_rate": 1e-05, + "loss": 1.2248, + "step": 980 + }, + { + "epoch": 0.4419914395134039, + "grad_norm": 1.817884639012165, + 
"learning_rate": 1e-05, + "loss": 1.2326, + "step": 981 + }, + { + "epoch": 0.4424419914395134, + "grad_norm": 1.8566661422575435, + "learning_rate": 1e-05, + "loss": 1.2371, + "step": 982 + }, + { + "epoch": 0.4428925433656229, + "grad_norm": 1.5932009456479728, + "learning_rate": 1e-05, + "loss": 1.243, + "step": 983 + }, + { + "epoch": 0.4433430952917324, + "grad_norm": 1.7374272520952616, + "learning_rate": 1e-05, + "loss": 1.2126, + "step": 984 + }, + { + "epoch": 0.44379364721784187, + "grad_norm": 1.8573512532324357, + "learning_rate": 1e-05, + "loss": 1.2225, + "step": 985 + }, + { + "epoch": 0.44424419914395136, + "grad_norm": 1.7423933948751205, + "learning_rate": 1e-05, + "loss": 1.2409, + "step": 986 + }, + { + "epoch": 0.44469475107006085, + "grad_norm": 1.7647060259539527, + "learning_rate": 1e-05, + "loss": 1.2877, + "step": 987 + }, + { + "epoch": 0.44514530299617033, + "grad_norm": 1.748668121574136, + "learning_rate": 1e-05, + "loss": 1.2248, + "step": 988 + }, + { + "epoch": 0.44559585492227977, + "grad_norm": 1.793432386539002, + "learning_rate": 1e-05, + "loss": 1.2415, + "step": 989 + }, + { + "epoch": 0.44604640684838925, + "grad_norm": 1.853915821574754, + "learning_rate": 1e-05, + "loss": 1.2288, + "step": 990 + }, + { + "epoch": 0.44649695877449874, + "grad_norm": 1.8560130103305665, + "learning_rate": 1e-05, + "loss": 1.1669, + "step": 991 + }, + { + "epoch": 0.44694751070060823, + "grad_norm": 1.7161372582429646, + "learning_rate": 1e-05, + "loss": 1.2325, + "step": 992 + }, + { + "epoch": 0.4473980626267177, + "grad_norm": 1.8150571727108074, + "learning_rate": 1e-05, + "loss": 1.2581, + "step": 993 + }, + { + "epoch": 0.4478486145528272, + "grad_norm": 1.7526906399784734, + "learning_rate": 1e-05, + "loss": 1.203, + "step": 994 + }, + { + "epoch": 0.4482991664789367, + "grad_norm": 1.7912002199012331, + "learning_rate": 1e-05, + "loss": 1.2332, + "step": 995 + }, + { + "epoch": 0.4487497184050462, + "grad_norm": 1.8425462851521288, + "learning_rate": 1e-05, + "loss": 1.227, + "step": 996 + }, + { + "epoch": 0.44920027033115567, + "grad_norm": 1.6377109358434168, + "learning_rate": 1e-05, + "loss": 1.2332, + "step": 997 + }, + { + "epoch": 0.44965082225726516, + "grad_norm": 2.0834025665164333, + "learning_rate": 1e-05, + "loss": 1.2755, + "step": 998 + }, + { + "epoch": 0.45010137418337465, + "grad_norm": 1.8628186553040025, + "learning_rate": 1e-05, + "loss": 1.217, + "step": 999 + }, + { + "epoch": 0.45055192610948414, + "grad_norm": 1.6906460456680688, + "learning_rate": 1e-05, + "loss": 1.2358, + "step": 1000 + }, + { + "epoch": 0.4510024780355936, + "grad_norm": 1.933170044940621, + "learning_rate": 1e-05, + "loss": 1.1983, + "step": 1001 + }, + { + "epoch": 0.4514530299617031, + "grad_norm": 1.9063631080079892, + "learning_rate": 1e-05, + "loss": 1.1888, + "step": 1002 + }, + { + "epoch": 0.45190358188781254, + "grad_norm": 2.0736812951315926, + "learning_rate": 1e-05, + "loss": 1.2588, + "step": 1003 + }, + { + "epoch": 0.45235413381392203, + "grad_norm": 1.7951054172338083, + "learning_rate": 1e-05, + "loss": 1.2045, + "step": 1004 + }, + { + "epoch": 0.4528046857400315, + "grad_norm": 1.9804276436292978, + "learning_rate": 1e-05, + "loss": 1.2217, + "step": 1005 + }, + { + "epoch": 0.453255237666141, + "grad_norm": 1.7688903889613266, + "learning_rate": 1e-05, + "loss": 1.2471, + "step": 1006 + }, + { + "epoch": 0.4537057895922505, + "grad_norm": 1.9064149035191789, + "learning_rate": 1e-05, + "loss": 1.21, + "step": 1007 + }, + { + "epoch": 
0.45415634151836, + "grad_norm": 1.7500629600876403, + "learning_rate": 1e-05, + "loss": 1.2002, + "step": 1008 + }, + { + "epoch": 0.4546068934444695, + "grad_norm": 1.7935387124911324, + "learning_rate": 1e-05, + "loss": 1.2256, + "step": 1009 + }, + { + "epoch": 0.45505744537057896, + "grad_norm": 1.7428898938844228, + "learning_rate": 1e-05, + "loss": 1.1975, + "step": 1010 + }, + { + "epoch": 0.45550799729668845, + "grad_norm": 1.6232895579332256, + "learning_rate": 1e-05, + "loss": 1.212, + "step": 1011 + }, + { + "epoch": 0.45595854922279794, + "grad_norm": 1.6606790097851276, + "learning_rate": 1e-05, + "loss": 1.2307, + "step": 1012 + }, + { + "epoch": 0.4564091011489074, + "grad_norm": 1.8620402336756303, + "learning_rate": 1e-05, + "loss": 1.2091, + "step": 1013 + }, + { + "epoch": 0.4568596530750169, + "grad_norm": 1.7066063911320382, + "learning_rate": 1e-05, + "loss": 1.2301, + "step": 1014 + }, + { + "epoch": 0.4573102050011264, + "grad_norm": 2.0393744218354337, + "learning_rate": 1e-05, + "loss": 1.1975, + "step": 1015 + }, + { + "epoch": 0.4577607569272359, + "grad_norm": 1.9529652845428371, + "learning_rate": 1e-05, + "loss": 1.187, + "step": 1016 + }, + { + "epoch": 0.4582113088533453, + "grad_norm": 1.7144968408997552, + "learning_rate": 1e-05, + "loss": 1.278, + "step": 1017 + }, + { + "epoch": 0.4586618607794548, + "grad_norm": 1.8255706948280142, + "learning_rate": 1e-05, + "loss": 1.1724, + "step": 1018 + }, + { + "epoch": 0.4591124127055643, + "grad_norm": 1.7648007224714204, + "learning_rate": 1e-05, + "loss": 1.2123, + "step": 1019 + }, + { + "epoch": 0.4595629646316738, + "grad_norm": 1.822346875626478, + "learning_rate": 1e-05, + "loss": 1.2033, + "step": 1020 + }, + { + "epoch": 0.4600135165577833, + "grad_norm": 1.7709385612833495, + "learning_rate": 1e-05, + "loss": 1.2185, + "step": 1021 + }, + { + "epoch": 0.46046406848389276, + "grad_norm": 1.6569703238652924, + "learning_rate": 1e-05, + "loss": 1.2058, + "step": 1022 + }, + { + "epoch": 0.46091462041000225, + "grad_norm": 1.8915853122658959, + "learning_rate": 1e-05, + "loss": 1.2038, + "step": 1023 + }, + { + "epoch": 0.46136517233611174, + "grad_norm": 1.7120760960014065, + "learning_rate": 1e-05, + "loss": 1.2368, + "step": 1024 + }, + { + "epoch": 0.4618157242622212, + "grad_norm": 1.651758686931775, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 1025 + }, + { + "epoch": 0.4622662761883307, + "grad_norm": 1.837127284255619, + "learning_rate": 1e-05, + "loss": 1.2651, + "step": 1026 + }, + { + "epoch": 0.4627168281144402, + "grad_norm": 1.9652775493658583, + "learning_rate": 1e-05, + "loss": 1.2607, + "step": 1027 + }, + { + "epoch": 0.4631673800405497, + "grad_norm": 1.872687538220321, + "learning_rate": 1e-05, + "loss": 1.2191, + "step": 1028 + }, + { + "epoch": 0.4636179319666592, + "grad_norm": 1.671868424456358, + "learning_rate": 1e-05, + "loss": 1.2015, + "step": 1029 + }, + { + "epoch": 0.46406848389276867, + "grad_norm": 1.846576568317387, + "learning_rate": 1e-05, + "loss": 1.2195, + "step": 1030 + }, + { + "epoch": 0.4645190358188781, + "grad_norm": 1.7524205473229308, + "learning_rate": 1e-05, + "loss": 1.1984, + "step": 1031 + }, + { + "epoch": 0.4649695877449876, + "grad_norm": 1.7582846412885695, + "learning_rate": 1e-05, + "loss": 1.267, + "step": 1032 + }, + { + "epoch": 0.4654201396710971, + "grad_norm": 1.9264915792558468, + "learning_rate": 1e-05, + "loss": 1.2354, + "step": 1033 + }, + { + "epoch": 0.46587069159720657, + "grad_norm": 1.773101550711459, + "learning_rate": 
1e-05, + "loss": 1.2185, + "step": 1034 + }, + { + "epoch": 0.46632124352331605, + "grad_norm": 1.8624304126504854, + "learning_rate": 1e-05, + "loss": 1.2514, + "step": 1035 + }, + { + "epoch": 0.46677179544942554, + "grad_norm": 1.858008867057068, + "learning_rate": 1e-05, + "loss": 1.2676, + "step": 1036 + }, + { + "epoch": 0.46722234737553503, + "grad_norm": 1.8245165875314857, + "learning_rate": 1e-05, + "loss": 1.2499, + "step": 1037 + }, + { + "epoch": 0.4676728993016445, + "grad_norm": 1.9287810094534832, + "learning_rate": 1e-05, + "loss": 1.213, + "step": 1038 + }, + { + "epoch": 0.468123451227754, + "grad_norm": 1.9290438800960605, + "learning_rate": 1e-05, + "loss": 1.2774, + "step": 1039 + }, + { + "epoch": 0.4685740031538635, + "grad_norm": 1.89641334900627, + "learning_rate": 1e-05, + "loss": 1.1682, + "step": 1040 + }, + { + "epoch": 0.469024555079973, + "grad_norm": 1.8319785958476764, + "learning_rate": 1e-05, + "loss": 1.2416, + "step": 1041 + }, + { + "epoch": 0.46947510700608247, + "grad_norm": 1.842667663741529, + "learning_rate": 1e-05, + "loss": 1.173, + "step": 1042 + }, + { + "epoch": 0.46992565893219196, + "grad_norm": 1.7958175868902506, + "learning_rate": 1e-05, + "loss": 1.2279, + "step": 1043 + }, + { + "epoch": 0.47037621085830145, + "grad_norm": 1.7208189699628653, + "learning_rate": 1e-05, + "loss": 1.2433, + "step": 1044 + }, + { + "epoch": 0.4708267627844109, + "grad_norm": 1.9064596547765817, + "learning_rate": 1e-05, + "loss": 1.2874, + "step": 1045 + }, + { + "epoch": 0.47127731471052037, + "grad_norm": 1.9590432871635806, + "learning_rate": 1e-05, + "loss": 1.249, + "step": 1046 + }, + { + "epoch": 0.47172786663662986, + "grad_norm": 1.9061918225215113, + "learning_rate": 1e-05, + "loss": 1.222, + "step": 1047 + }, + { + "epoch": 0.47217841856273934, + "grad_norm": 1.8327957069073397, + "learning_rate": 1e-05, + "loss": 1.2052, + "step": 1048 + }, + { + "epoch": 0.47262897048884883, + "grad_norm": 1.889909285426708, + "learning_rate": 1e-05, + "loss": 1.2188, + "step": 1049 + }, + { + "epoch": 0.4730795224149583, + "grad_norm": 1.8888538532149346, + "learning_rate": 1e-05, + "loss": 1.2571, + "step": 1050 + }, + { + "epoch": 0.4735300743410678, + "grad_norm": 1.7364925862968676, + "learning_rate": 1e-05, + "loss": 1.2166, + "step": 1051 + }, + { + "epoch": 0.4739806262671773, + "grad_norm": 1.940114901098188, + "learning_rate": 1e-05, + "loss": 1.1676, + "step": 1052 + }, + { + "epoch": 0.4744311781932868, + "grad_norm": 1.7824750248703818, + "learning_rate": 1e-05, + "loss": 1.1862, + "step": 1053 + }, + { + "epoch": 0.4748817301193963, + "grad_norm": 1.6632088404087289, + "learning_rate": 1e-05, + "loss": 1.2078, + "step": 1054 + }, + { + "epoch": 0.47533228204550576, + "grad_norm": 1.8535270432778796, + "learning_rate": 1e-05, + "loss": 1.1857, + "step": 1055 + }, + { + "epoch": 0.47578283397161525, + "grad_norm": 1.7490217032145572, + "learning_rate": 1e-05, + "loss": 1.2767, + "step": 1056 + }, + { + "epoch": 0.47623338589772474, + "grad_norm": 2.2784924514306497, + "learning_rate": 1e-05, + "loss": 1.2643, + "step": 1057 + }, + { + "epoch": 0.47668393782383417, + "grad_norm": 1.9290938964960718, + "learning_rate": 1e-05, + "loss": 1.2154, + "step": 1058 + }, + { + "epoch": 0.47713448974994366, + "grad_norm": 1.910082105214886, + "learning_rate": 1e-05, + "loss": 1.2332, + "step": 1059 + }, + { + "epoch": 0.47758504167605315, + "grad_norm": 1.884840709045603, + "learning_rate": 1e-05, + "loss": 1.232, + "step": 1060 + }, + { + "epoch": 
0.47803559360216263, + "grad_norm": 1.7051054150373703, + "learning_rate": 1e-05, + "loss": 1.2019, + "step": 1061 + }, + { + "epoch": 0.4784861455282721, + "grad_norm": 1.7260457359308374, + "learning_rate": 1e-05, + "loss": 1.2419, + "step": 1062 + }, + { + "epoch": 0.4789366974543816, + "grad_norm": 1.7309120879574524, + "learning_rate": 1e-05, + "loss": 1.2377, + "step": 1063 + }, + { + "epoch": 0.4793872493804911, + "grad_norm": 1.9683552553648596, + "learning_rate": 1e-05, + "loss": 1.239, + "step": 1064 + }, + { + "epoch": 0.4798378013066006, + "grad_norm": 1.901677488014881, + "learning_rate": 1e-05, + "loss": 1.2173, + "step": 1065 + }, + { + "epoch": 0.4802883532327101, + "grad_norm": 1.8308295379837334, + "learning_rate": 1e-05, + "loss": 1.2389, + "step": 1066 + }, + { + "epoch": 0.48073890515881956, + "grad_norm": 1.7820508423112793, + "learning_rate": 1e-05, + "loss": 1.2076, + "step": 1067 + }, + { + "epoch": 0.48118945708492905, + "grad_norm": 1.8331685742222972, + "learning_rate": 1e-05, + "loss": 1.2598, + "step": 1068 + }, + { + "epoch": 0.48164000901103854, + "grad_norm": 1.7856887837426858, + "learning_rate": 1e-05, + "loss": 1.225, + "step": 1069 + }, + { + "epoch": 0.482090560937148, + "grad_norm": 1.9523050077246389, + "learning_rate": 1e-05, + "loss": 1.2284, + "step": 1070 + }, + { + "epoch": 0.4825411128632575, + "grad_norm": 1.7941205953769455, + "learning_rate": 1e-05, + "loss": 1.2355, + "step": 1071 + }, + { + "epoch": 0.48299166478936695, + "grad_norm": 1.977534031281802, + "learning_rate": 1e-05, + "loss": 1.2914, + "step": 1072 + }, + { + "epoch": 0.48344221671547644, + "grad_norm": 1.8023645961903285, + "learning_rate": 1e-05, + "loss": 1.2393, + "step": 1073 + }, + { + "epoch": 0.4838927686415859, + "grad_norm": 1.7954481615528401, + "learning_rate": 1e-05, + "loss": 1.205, + "step": 1074 + }, + { + "epoch": 0.4843433205676954, + "grad_norm": 1.7348221983360275, + "learning_rate": 1e-05, + "loss": 1.2209, + "step": 1075 + }, + { + "epoch": 0.4847938724938049, + "grad_norm": 1.7307336849738606, + "learning_rate": 1e-05, + "loss": 1.2444, + "step": 1076 + }, + { + "epoch": 0.4852444244199144, + "grad_norm": 1.9162431895429781, + "learning_rate": 1e-05, + "loss": 1.2518, + "step": 1077 + }, + { + "epoch": 0.4856949763460239, + "grad_norm": 1.97778966257397, + "learning_rate": 1e-05, + "loss": 1.2082, + "step": 1078 + }, + { + "epoch": 0.48614552827213336, + "grad_norm": 1.699217758383844, + "learning_rate": 1e-05, + "loss": 1.2253, + "step": 1079 + }, + { + "epoch": 0.48659608019824285, + "grad_norm": 1.6883380824264893, + "learning_rate": 1e-05, + "loss": 1.1954, + "step": 1080 + }, + { + "epoch": 0.48704663212435234, + "grad_norm": 1.8429241298457815, + "learning_rate": 1e-05, + "loss": 1.2117, + "step": 1081 + }, + { + "epoch": 0.48749718405046183, + "grad_norm": 1.8325220765257848, + "learning_rate": 1e-05, + "loss": 1.2305, + "step": 1082 + }, + { + "epoch": 0.4879477359765713, + "grad_norm": 1.8472108956272706, + "learning_rate": 1e-05, + "loss": 1.2733, + "step": 1083 + }, + { + "epoch": 0.4883982879026808, + "grad_norm": 1.8369136976527052, + "learning_rate": 1e-05, + "loss": 1.2339, + "step": 1084 + }, + { + "epoch": 0.4888488398287903, + "grad_norm": 1.6162764037207624, + "learning_rate": 1e-05, + "loss": 1.237, + "step": 1085 + }, + { + "epoch": 0.4892993917548997, + "grad_norm": 1.851187210448935, + "learning_rate": 1e-05, + "loss": 1.2331, + "step": 1086 + }, + { + "epoch": 0.4897499436810092, + "grad_norm": 1.8322504715167574, + 
"learning_rate": 1e-05, + "loss": 1.2485, + "step": 1087 + }, + { + "epoch": 0.4902004956071187, + "grad_norm": 1.9160880478705529, + "learning_rate": 1e-05, + "loss": 1.2001, + "step": 1088 + }, + { + "epoch": 0.4906510475332282, + "grad_norm": 1.8071171427065056, + "learning_rate": 1e-05, + "loss": 1.2349, + "step": 1089 + }, + { + "epoch": 0.4911015994593377, + "grad_norm": 1.622019523490204, + "learning_rate": 1e-05, + "loss": 1.2269, + "step": 1090 + }, + { + "epoch": 0.49155215138544717, + "grad_norm": 1.828774747448718, + "learning_rate": 1e-05, + "loss": 1.2238, + "step": 1091 + }, + { + "epoch": 0.49200270331155666, + "grad_norm": 1.6336547605350142, + "learning_rate": 1e-05, + "loss": 1.2508, + "step": 1092 + }, + { + "epoch": 0.49245325523766614, + "grad_norm": 1.9854826065315856, + "learning_rate": 1e-05, + "loss": 1.2405, + "step": 1093 + }, + { + "epoch": 0.49290380716377563, + "grad_norm": 1.9172141076014846, + "learning_rate": 1e-05, + "loss": 1.1921, + "step": 1094 + }, + { + "epoch": 0.4933543590898851, + "grad_norm": 1.7647119828471034, + "learning_rate": 1e-05, + "loss": 1.2039, + "step": 1095 + }, + { + "epoch": 0.4938049110159946, + "grad_norm": 1.7055858189962692, + "learning_rate": 1e-05, + "loss": 1.2411, + "step": 1096 + }, + { + "epoch": 0.4942554629421041, + "grad_norm": 1.794010258913513, + "learning_rate": 1e-05, + "loss": 1.2476, + "step": 1097 + }, + { + "epoch": 0.4947060148682136, + "grad_norm": 1.6278628378054147, + "learning_rate": 1e-05, + "loss": 1.2451, + "step": 1098 + }, + { + "epoch": 0.49515656679432307, + "grad_norm": 1.8243087009320678, + "learning_rate": 1e-05, + "loss": 1.2588, + "step": 1099 + }, + { + "epoch": 0.4956071187204325, + "grad_norm": 1.6759172576522992, + "learning_rate": 1e-05, + "loss": 1.2367, + "step": 1100 + }, + { + "epoch": 0.496057670646542, + "grad_norm": 1.7807547764841059, + "learning_rate": 1e-05, + "loss": 1.2052, + "step": 1101 + }, + { + "epoch": 0.4965082225726515, + "grad_norm": 1.7560604438541885, + "learning_rate": 1e-05, + "loss": 1.2343, + "step": 1102 + }, + { + "epoch": 0.49695877449876097, + "grad_norm": 1.6398485896511423, + "learning_rate": 1e-05, + "loss": 1.1882, + "step": 1103 + }, + { + "epoch": 0.49740932642487046, + "grad_norm": 1.6943249402448588, + "learning_rate": 1e-05, + "loss": 1.2074, + "step": 1104 + }, + { + "epoch": 0.49785987835097995, + "grad_norm": 1.6998993181668858, + "learning_rate": 1e-05, + "loss": 1.2035, + "step": 1105 + }, + { + "epoch": 0.49831043027708943, + "grad_norm": 1.7167357490770654, + "learning_rate": 1e-05, + "loss": 1.1755, + "step": 1106 + }, + { + "epoch": 0.4987609822031989, + "grad_norm": 1.856023947993202, + "learning_rate": 1e-05, + "loss": 1.173, + "step": 1107 + }, + { + "epoch": 0.4992115341293084, + "grad_norm": 1.7209075128988316, + "learning_rate": 1e-05, + "loss": 1.231, + "step": 1108 + }, + { + "epoch": 0.4996620860554179, + "grad_norm": 1.7321911152152958, + "learning_rate": 1e-05, + "loss": 1.2232, + "step": 1109 + }, + { + "epoch": 0.5001126379815274, + "grad_norm": 1.834703729970928, + "learning_rate": 1e-05, + "loss": 1.2282, + "step": 1110 + }, + { + "epoch": 0.5005631899076368, + "grad_norm": 1.703690752117525, + "learning_rate": 1e-05, + "loss": 1.2261, + "step": 1111 + }, + { + "epoch": 0.5010137418337464, + "grad_norm": 1.8971254709341026, + "learning_rate": 1e-05, + "loss": 1.2252, + "step": 1112 + }, + { + "epoch": 0.5014642937598558, + "grad_norm": 1.6885863371613574, + "learning_rate": 1e-05, + "loss": 1.1882, + "step": 1113 + }, + { + 
"epoch": 0.5019148456859653, + "grad_norm": 1.7867795820634051, + "learning_rate": 1e-05, + "loss": 1.2161, + "step": 1114 + }, + { + "epoch": 0.5023653976120748, + "grad_norm": 1.7866472158501792, + "learning_rate": 1e-05, + "loss": 1.2407, + "step": 1115 + }, + { + "epoch": 0.5028159495381843, + "grad_norm": 1.7145118086058329, + "learning_rate": 1e-05, + "loss": 1.2003, + "step": 1116 + }, + { + "epoch": 0.5032665014642937, + "grad_norm": 1.8420656895369183, + "learning_rate": 1e-05, + "loss": 1.245, + "step": 1117 + }, + { + "epoch": 0.5037170533904033, + "grad_norm": 1.8881987897318155, + "learning_rate": 1e-05, + "loss": 1.2149, + "step": 1118 + }, + { + "epoch": 0.5041676053165127, + "grad_norm": 1.662823224734846, + "learning_rate": 1e-05, + "loss": 1.2788, + "step": 1119 + }, + { + "epoch": 0.5046181572426223, + "grad_norm": 1.6455901767468586, + "learning_rate": 1e-05, + "loss": 1.2202, + "step": 1120 + }, + { + "epoch": 0.5050687091687317, + "grad_norm": 1.6179584810828676, + "learning_rate": 1e-05, + "loss": 1.2427, + "step": 1121 + }, + { + "epoch": 0.5055192610948411, + "grad_norm": 1.7161125239457964, + "learning_rate": 1e-05, + "loss": 1.2381, + "step": 1122 + }, + { + "epoch": 0.5059698130209507, + "grad_norm": 1.7815519260922321, + "learning_rate": 1e-05, + "loss": 1.2181, + "step": 1123 + }, + { + "epoch": 0.5064203649470601, + "grad_norm": 1.8429736829187442, + "learning_rate": 1e-05, + "loss": 1.2138, + "step": 1124 + }, + { + "epoch": 0.5068709168731697, + "grad_norm": 1.7426923228641775, + "learning_rate": 1e-05, + "loss": 1.2327, + "step": 1125 + }, + { + "epoch": 0.5073214687992791, + "grad_norm": 1.8345535027242015, + "learning_rate": 1e-05, + "loss": 1.184, + "step": 1126 + }, + { + "epoch": 0.5077720207253886, + "grad_norm": 1.7373886015772988, + "learning_rate": 1e-05, + "loss": 1.2015, + "step": 1127 + }, + { + "epoch": 0.5082225726514981, + "grad_norm": 1.7368499412838856, + "learning_rate": 1e-05, + "loss": 1.2256, + "step": 1128 + }, + { + "epoch": 0.5086731245776076, + "grad_norm": 1.666050196887123, + "learning_rate": 1e-05, + "loss": 1.1866, + "step": 1129 + }, + { + "epoch": 0.509123676503717, + "grad_norm": 1.7223799725798914, + "learning_rate": 1e-05, + "loss": 1.2122, + "step": 1130 + }, + { + "epoch": 0.5095742284298266, + "grad_norm": 1.784934201113948, + "learning_rate": 1e-05, + "loss": 1.1619, + "step": 1131 + }, + { + "epoch": 0.510024780355936, + "grad_norm": 1.7354950600530021, + "learning_rate": 1e-05, + "loss": 1.2096, + "step": 1132 + }, + { + "epoch": 0.5104753322820456, + "grad_norm": 1.7779168631616875, + "learning_rate": 1e-05, + "loss": 1.2107, + "step": 1133 + }, + { + "epoch": 0.510925884208155, + "grad_norm": 1.8849253874183658, + "learning_rate": 1e-05, + "loss": 1.2027, + "step": 1134 + }, + { + "epoch": 0.5113764361342644, + "grad_norm": 1.8102302267850547, + "learning_rate": 1e-05, + "loss": 1.2296, + "step": 1135 + }, + { + "epoch": 0.511826988060374, + "grad_norm": 1.7416638130497022, + "learning_rate": 1e-05, + "loss": 1.2782, + "step": 1136 + }, + { + "epoch": 0.5122775399864834, + "grad_norm": 1.945864303499097, + "learning_rate": 1e-05, + "loss": 1.2051, + "step": 1137 + }, + { + "epoch": 0.5127280919125929, + "grad_norm": 1.7873013100729083, + "learning_rate": 1e-05, + "loss": 1.2439, + "step": 1138 + }, + { + "epoch": 0.5131786438387024, + "grad_norm": 1.7613293131158412, + "learning_rate": 1e-05, + "loss": 1.2109, + "step": 1139 + }, + { + "epoch": 0.5136291957648119, + "grad_norm": 1.6975318487518078, + 
"learning_rate": 1e-05, + "loss": 1.2473, + "step": 1140 + }, + { + "epoch": 0.5140797476909214, + "grad_norm": 1.7674609411831723, + "learning_rate": 1e-05, + "loss": 1.239, + "step": 1141 + }, + { + "epoch": 0.5145302996170309, + "grad_norm": 1.7714493294198033, + "learning_rate": 1e-05, + "loss": 1.2214, + "step": 1142 + }, + { + "epoch": 0.5149808515431403, + "grad_norm": 1.9020951604324543, + "learning_rate": 1e-05, + "loss": 1.2285, + "step": 1143 + }, + { + "epoch": 0.5154314034692499, + "grad_norm": 1.7262916022035897, + "learning_rate": 1e-05, + "loss": 1.1955, + "step": 1144 + }, + { + "epoch": 0.5158819553953593, + "grad_norm": 1.8065375077357888, + "learning_rate": 1e-05, + "loss": 1.2188, + "step": 1145 + }, + { + "epoch": 0.5163325073214688, + "grad_norm": 1.9000422236133612, + "learning_rate": 1e-05, + "loss": 1.2332, + "step": 1146 + }, + { + "epoch": 0.5167830592475783, + "grad_norm": 1.820127159138276, + "learning_rate": 1e-05, + "loss": 1.1912, + "step": 1147 + }, + { + "epoch": 0.5172336111736877, + "grad_norm": 2.0478804185280914, + "learning_rate": 1e-05, + "loss": 1.1894, + "step": 1148 + }, + { + "epoch": 0.5176841630997973, + "grad_norm": 1.6483893814399748, + "learning_rate": 1e-05, + "loss": 1.2214, + "step": 1149 + }, + { + "epoch": 0.5181347150259067, + "grad_norm": 1.8585701430807764, + "learning_rate": 1e-05, + "loss": 1.2955, + "step": 1150 + }, + { + "epoch": 0.5185852669520162, + "grad_norm": 1.7839142062682194, + "learning_rate": 1e-05, + "loss": 1.2145, + "step": 1151 + }, + { + "epoch": 0.5190358188781257, + "grad_norm": 1.8390477603191218, + "learning_rate": 1e-05, + "loss": 1.2206, + "step": 1152 + }, + { + "epoch": 0.5194863708042352, + "grad_norm": 1.8242871766456614, + "learning_rate": 1e-05, + "loss": 1.2778, + "step": 1153 + }, + { + "epoch": 0.5199369227303446, + "grad_norm": 2.0086407164587268, + "learning_rate": 1e-05, + "loss": 1.2485, + "step": 1154 + }, + { + "epoch": 0.5203874746564542, + "grad_norm": 1.6880513466993963, + "learning_rate": 1e-05, + "loss": 1.251, + "step": 1155 + }, + { + "epoch": 0.5208380265825636, + "grad_norm": 1.7131540209069966, + "learning_rate": 1e-05, + "loss": 1.2352, + "step": 1156 + }, + { + "epoch": 0.5212885785086732, + "grad_norm": 1.77424723685144, + "learning_rate": 1e-05, + "loss": 1.196, + "step": 1157 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 1.694871403898286, + "learning_rate": 1e-05, + "loss": 1.1821, + "step": 1158 + }, + { + "epoch": 0.5221896823608921, + "grad_norm": 1.5785506005919077, + "learning_rate": 1e-05, + "loss": 1.218, + "step": 1159 + }, + { + "epoch": 0.5226402342870016, + "grad_norm": 1.8796378901257582, + "learning_rate": 1e-05, + "loss": 1.191, + "step": 1160 + }, + { + "epoch": 0.5230907862131111, + "grad_norm": 1.7924233055693763, + "learning_rate": 1e-05, + "loss": 1.2295, + "step": 1161 + }, + { + "epoch": 0.5235413381392205, + "grad_norm": 1.7781927608468, + "learning_rate": 1e-05, + "loss": 1.2309, + "step": 1162 + }, + { + "epoch": 0.52399189006533, + "grad_norm": 1.799427621383034, + "learning_rate": 1e-05, + "loss": 1.1959, + "step": 1163 + }, + { + "epoch": 0.5244424419914395, + "grad_norm": 1.8834615193453594, + "learning_rate": 1e-05, + "loss": 1.2077, + "step": 1164 + }, + { + "epoch": 0.524892993917549, + "grad_norm": 1.8016925557595358, + "learning_rate": 1e-05, + "loss": 1.1745, + "step": 1165 + }, + { + "epoch": 0.5253435458436585, + "grad_norm": 1.705402400130282, + "learning_rate": 1e-05, + "loss": 1.227, + "step": 1166 + }, + { + "epoch": 
0.5257940977697679, + "grad_norm": 1.7686687798590577, + "learning_rate": 1e-05, + "loss": 1.2233, + "step": 1167 + }, + { + "epoch": 0.5262446496958775, + "grad_norm": 1.658258877352761, + "learning_rate": 1e-05, + "loss": 1.2014, + "step": 1168 + }, + { + "epoch": 0.5266952016219869, + "grad_norm": 1.8191890012375032, + "learning_rate": 1e-05, + "loss": 1.2677, + "step": 1169 + }, + { + "epoch": 0.5271457535480965, + "grad_norm": 1.7701142620729842, + "learning_rate": 1e-05, + "loss": 1.214, + "step": 1170 + }, + { + "epoch": 0.5275963054742059, + "grad_norm": 1.8133893275054933, + "learning_rate": 1e-05, + "loss": 1.2402, + "step": 1171 + }, + { + "epoch": 0.5280468574003154, + "grad_norm": 1.740931963668351, + "learning_rate": 1e-05, + "loss": 1.1761, + "step": 1172 + }, + { + "epoch": 0.5284974093264249, + "grad_norm": 1.7818969524312216, + "learning_rate": 1e-05, + "loss": 1.2553, + "step": 1173 + }, + { + "epoch": 0.5289479612525344, + "grad_norm": 1.7237927032769915, + "learning_rate": 1e-05, + "loss": 1.2491, + "step": 1174 + }, + { + "epoch": 0.5293985131786438, + "grad_norm": 1.7559928322729434, + "learning_rate": 1e-05, + "loss": 1.2524, + "step": 1175 + }, + { + "epoch": 0.5298490651047533, + "grad_norm": 1.832205300455528, + "learning_rate": 1e-05, + "loss": 1.2073, + "step": 1176 + }, + { + "epoch": 0.5302996170308628, + "grad_norm": 2.0093659961338277, + "learning_rate": 1e-05, + "loss": 1.2001, + "step": 1177 + }, + { + "epoch": 0.5307501689569722, + "grad_norm": 1.9178053907781756, + "learning_rate": 1e-05, + "loss": 1.1835, + "step": 1178 + }, + { + "epoch": 0.5312007208830818, + "grad_norm": 1.7443974234038635, + "learning_rate": 1e-05, + "loss": 1.1814, + "step": 1179 + }, + { + "epoch": 0.5316512728091912, + "grad_norm": 1.9130309444097395, + "learning_rate": 1e-05, + "loss": 1.2044, + "step": 1180 + }, + { + "epoch": 0.5321018247353008, + "grad_norm": 2.0645138856559093, + "learning_rate": 1e-05, + "loss": 1.2481, + "step": 1181 + }, + { + "epoch": 0.5325523766614102, + "grad_norm": 1.9165903238442665, + "learning_rate": 1e-05, + "loss": 1.2731, + "step": 1182 + }, + { + "epoch": 0.5330029285875197, + "grad_norm": 1.7745230817996194, + "learning_rate": 1e-05, + "loss": 1.2351, + "step": 1183 + }, + { + "epoch": 0.5334534805136292, + "grad_norm": 1.779111603058722, + "learning_rate": 1e-05, + "loss": 1.2212, + "step": 1184 + }, + { + "epoch": 0.5339040324397387, + "grad_norm": 1.7484601964668258, + "learning_rate": 1e-05, + "loss": 1.192, + "step": 1185 + }, + { + "epoch": 0.5343545843658482, + "grad_norm": 1.6836885724980257, + "learning_rate": 1e-05, + "loss": 1.2735, + "step": 1186 + }, + { + "epoch": 0.5348051362919577, + "grad_norm": 1.8114416538391223, + "learning_rate": 1e-05, + "loss": 1.2181, + "step": 1187 + }, + { + "epoch": 0.5352556882180671, + "grad_norm": 1.8448999435632383, + "learning_rate": 1e-05, + "loss": 1.2197, + "step": 1188 + }, + { + "epoch": 0.5357062401441767, + "grad_norm": 1.9435339007338701, + "learning_rate": 1e-05, + "loss": 1.2194, + "step": 1189 + }, + { + "epoch": 0.5361567920702861, + "grad_norm": 1.8025109235583145, + "learning_rate": 1e-05, + "loss": 1.2042, + "step": 1190 + }, + { + "epoch": 0.5366073439963955, + "grad_norm": 1.6700090053309786, + "learning_rate": 1e-05, + "loss": 1.2033, + "step": 1191 + }, + { + "epoch": 0.5370578959225051, + "grad_norm": 1.6849205350953573, + "learning_rate": 1e-05, + "loss": 1.2215, + "step": 1192 + }, + { + "epoch": 0.5375084478486145, + "grad_norm": 1.7147161960304929, + "learning_rate": 
1e-05, + "loss": 1.2341, + "step": 1193 + }, + { + "epoch": 0.5379589997747241, + "grad_norm": 1.7746421868361555, + "learning_rate": 1e-05, + "loss": 1.1747, + "step": 1194 + }, + { + "epoch": 0.5384095517008335, + "grad_norm": 1.8599995128441753, + "learning_rate": 1e-05, + "loss": 1.2487, + "step": 1195 + }, + { + "epoch": 0.538860103626943, + "grad_norm": 1.7399761480508251, + "learning_rate": 1e-05, + "loss": 1.1832, + "step": 1196 + }, + { + "epoch": 0.5393106555530525, + "grad_norm": 1.799614681377465, + "learning_rate": 1e-05, + "loss": 1.1554, + "step": 1197 + }, + { + "epoch": 0.539761207479162, + "grad_norm": 1.892989010192148, + "learning_rate": 1e-05, + "loss": 1.2031, + "step": 1198 + }, + { + "epoch": 0.5402117594052714, + "grad_norm": 1.7929467713485536, + "learning_rate": 1e-05, + "loss": 1.202, + "step": 1199 + }, + { + "epoch": 0.540662311331381, + "grad_norm": 1.655986074624804, + "learning_rate": 1e-05, + "loss": 1.2022, + "step": 1200 + }, + { + "epoch": 0.5411128632574904, + "grad_norm": 1.8235445374371355, + "learning_rate": 1e-05, + "loss": 1.2826, + "step": 1201 + }, + { + "epoch": 0.5415634151836, + "grad_norm": 1.6988588552090702, + "learning_rate": 1e-05, + "loss": 1.2316, + "step": 1202 + }, + { + "epoch": 0.5420139671097094, + "grad_norm": 1.8175071133701286, + "learning_rate": 1e-05, + "loss": 1.1956, + "step": 1203 + }, + { + "epoch": 0.5424645190358188, + "grad_norm": 1.8050236092770742, + "learning_rate": 1e-05, + "loss": 1.2068, + "step": 1204 + }, + { + "epoch": 0.5429150709619284, + "grad_norm": 1.869406592824594, + "learning_rate": 1e-05, + "loss": 1.2322, + "step": 1205 + }, + { + "epoch": 0.5433656228880378, + "grad_norm": 1.7853907899983779, + "learning_rate": 1e-05, + "loss": 1.2501, + "step": 1206 + }, + { + "epoch": 0.5438161748141473, + "grad_norm": 1.8810947527108874, + "learning_rate": 1e-05, + "loss": 1.1695, + "step": 1207 + }, + { + "epoch": 0.5442667267402568, + "grad_norm": 1.945647569363387, + "learning_rate": 1e-05, + "loss": 1.1909, + "step": 1208 + }, + { + "epoch": 0.5447172786663663, + "grad_norm": 1.9087443366683496, + "learning_rate": 1e-05, + "loss": 1.2084, + "step": 1209 + }, + { + "epoch": 0.5451678305924758, + "grad_norm": 1.8357196765209838, + "learning_rate": 1e-05, + "loss": 1.2312, + "step": 1210 + }, + { + "epoch": 0.5456183825185853, + "grad_norm": 1.8138651667673715, + "learning_rate": 1e-05, + "loss": 1.2034, + "step": 1211 + }, + { + "epoch": 0.5460689344446947, + "grad_norm": 1.8099605933670535, + "learning_rate": 1e-05, + "loss": 1.2638, + "step": 1212 + }, + { + "epoch": 0.5465194863708043, + "grad_norm": 1.773024385547567, + "learning_rate": 1e-05, + "loss": 1.1959, + "step": 1213 + }, + { + "epoch": 0.5469700382969137, + "grad_norm": 1.7383391718814192, + "learning_rate": 1e-05, + "loss": 1.2083, + "step": 1214 + }, + { + "epoch": 0.5474205902230233, + "grad_norm": 1.9103185288324496, + "learning_rate": 1e-05, + "loss": 1.1993, + "step": 1215 + }, + { + "epoch": 0.5478711421491327, + "grad_norm": 1.700385208833561, + "learning_rate": 1e-05, + "loss": 1.2476, + "step": 1216 + }, + { + "epoch": 0.5483216940752421, + "grad_norm": 1.6383806331975956, + "learning_rate": 1e-05, + "loss": 1.2314, + "step": 1217 + }, + { + "epoch": 0.5487722460013517, + "grad_norm": 1.6895621880809684, + "learning_rate": 1e-05, + "loss": 1.2214, + "step": 1218 + }, + { + "epoch": 0.5492227979274611, + "grad_norm": 1.7873286157278356, + "learning_rate": 1e-05, + "loss": 1.2299, + "step": 1219 + }, + { + "epoch": 0.5496733498535706, + 
"grad_norm": 1.930894859148649, + "learning_rate": 1e-05, + "loss": 1.2487, + "step": 1220 + }, + { + "epoch": 0.5501239017796801, + "grad_norm": 2.0135423824053675, + "learning_rate": 1e-05, + "loss": 1.2151, + "step": 1221 + }, + { + "epoch": 0.5505744537057896, + "grad_norm": 1.6520001512756237, + "learning_rate": 1e-05, + "loss": 1.2274, + "step": 1222 + }, + { + "epoch": 0.551025005631899, + "grad_norm": 1.7675149095183413, + "learning_rate": 1e-05, + "loss": 1.1628, + "step": 1223 + }, + { + "epoch": 0.5514755575580086, + "grad_norm": 1.670533487591758, + "learning_rate": 1e-05, + "loss": 1.2193, + "step": 1224 + }, + { + "epoch": 0.551926109484118, + "grad_norm": 1.7402222912391112, + "learning_rate": 1e-05, + "loss": 1.1725, + "step": 1225 + }, + { + "epoch": 0.5523766614102276, + "grad_norm": 1.8433878045228402, + "learning_rate": 1e-05, + "loss": 1.2262, + "step": 1226 + }, + { + "epoch": 0.552827213336337, + "grad_norm": 1.7083694572915074, + "learning_rate": 1e-05, + "loss": 1.1985, + "step": 1227 + }, + { + "epoch": 0.5532777652624465, + "grad_norm": 1.749289550207512, + "learning_rate": 1e-05, + "loss": 1.1765, + "step": 1228 + }, + { + "epoch": 0.553728317188556, + "grad_norm": 1.790532022586267, + "learning_rate": 1e-05, + "loss": 1.2446, + "step": 1229 + }, + { + "epoch": 0.5541788691146655, + "grad_norm": 1.8473192179714735, + "learning_rate": 1e-05, + "loss": 1.2218, + "step": 1230 + }, + { + "epoch": 0.554629421040775, + "grad_norm": 1.765691891742622, + "learning_rate": 1e-05, + "loss": 1.2279, + "step": 1231 + }, + { + "epoch": 0.5550799729668844, + "grad_norm": 1.7761654789595203, + "learning_rate": 1e-05, + "loss": 1.2273, + "step": 1232 + }, + { + "epoch": 0.5555305248929939, + "grad_norm": 1.808037518854502, + "learning_rate": 1e-05, + "loss": 1.2546, + "step": 1233 + }, + { + "epoch": 0.5559810768191034, + "grad_norm": 1.7323127037119805, + "learning_rate": 1e-05, + "loss": 1.251, + "step": 1234 + }, + { + "epoch": 0.5564316287452129, + "grad_norm": 1.7872621678328413, + "learning_rate": 1e-05, + "loss": 1.2239, + "step": 1235 + }, + { + "epoch": 0.5568821806713223, + "grad_norm": 1.5816400521913458, + "learning_rate": 1e-05, + "loss": 1.2246, + "step": 1236 + }, + { + "epoch": 0.5573327325974319, + "grad_norm": 1.78128066934941, + "learning_rate": 1e-05, + "loss": 1.214, + "step": 1237 + }, + { + "epoch": 0.5577832845235413, + "grad_norm": 1.6569491612432181, + "learning_rate": 1e-05, + "loss": 1.2065, + "step": 1238 + }, + { + "epoch": 0.5582338364496509, + "grad_norm": 1.8776190258672327, + "learning_rate": 1e-05, + "loss": 1.1834, + "step": 1239 + }, + { + "epoch": 0.5586843883757603, + "grad_norm": 1.7696075196857524, + "learning_rate": 1e-05, + "loss": 1.2107, + "step": 1240 + }, + { + "epoch": 0.5591349403018698, + "grad_norm": 1.729313440288939, + "learning_rate": 1e-05, + "loss": 1.2318, + "step": 1241 + }, + { + "epoch": 0.5595854922279793, + "grad_norm": 1.6447216627865269, + "learning_rate": 1e-05, + "loss": 1.1784, + "step": 1242 + }, + { + "epoch": 0.5600360441540888, + "grad_norm": 1.6745547082296826, + "learning_rate": 1e-05, + "loss": 1.1845, + "step": 1243 + }, + { + "epoch": 0.5604865960801982, + "grad_norm": 1.8037442589388843, + "learning_rate": 1e-05, + "loss": 1.1784, + "step": 1244 + }, + { + "epoch": 0.5609371480063077, + "grad_norm": 1.82565372827476, + "learning_rate": 1e-05, + "loss": 1.2405, + "step": 1245 + }, + { + "epoch": 0.5613876999324172, + "grad_norm": 1.8098475115447121, + "learning_rate": 1e-05, + "loss": 1.2163, + "step": 
1246 + }, + { + "epoch": 0.5618382518585266, + "grad_norm": 1.9220636481752578, + "learning_rate": 1e-05, + "loss": 1.2288, + "step": 1247 + }, + { + "epoch": 0.5622888037846362, + "grad_norm": 1.710731088857779, + "learning_rate": 1e-05, + "loss": 1.2183, + "step": 1248 + }, + { + "epoch": 0.5627393557107456, + "grad_norm": 1.8052982521525502, + "learning_rate": 1e-05, + "loss": 1.1942, + "step": 1249 + }, + { + "epoch": 0.5631899076368552, + "grad_norm": 1.640205511214467, + "learning_rate": 1e-05, + "loss": 1.2008, + "step": 1250 + }, + { + "epoch": 0.5636404595629646, + "grad_norm": 2.0393672038821564, + "learning_rate": 1e-05, + "loss": 1.2977, + "step": 1251 + }, + { + "epoch": 0.5640910114890741, + "grad_norm": 1.8026799358887735, + "learning_rate": 1e-05, + "loss": 1.1981, + "step": 1252 + }, + { + "epoch": 0.5645415634151836, + "grad_norm": 1.718751835030151, + "learning_rate": 1e-05, + "loss": 1.2208, + "step": 1253 + }, + { + "epoch": 0.5649921153412931, + "grad_norm": 1.6918767310612728, + "learning_rate": 1e-05, + "loss": 1.2205, + "step": 1254 + }, + { + "epoch": 0.5654426672674026, + "grad_norm": 1.6717442792677766, + "learning_rate": 1e-05, + "loss": 1.1739, + "step": 1255 + }, + { + "epoch": 0.5658932191935121, + "grad_norm": 1.961048303687513, + "learning_rate": 1e-05, + "loss": 1.206, + "step": 1256 + }, + { + "epoch": 0.5663437711196215, + "grad_norm": 1.8056887914479824, + "learning_rate": 1e-05, + "loss": 1.2089, + "step": 1257 + }, + { + "epoch": 0.5667943230457311, + "grad_norm": 1.6984295897919814, + "learning_rate": 1e-05, + "loss": 1.2279, + "step": 1258 + }, + { + "epoch": 0.5672448749718405, + "grad_norm": 1.7622743101852951, + "learning_rate": 1e-05, + "loss": 1.1912, + "step": 1259 + }, + { + "epoch": 0.5676954268979499, + "grad_norm": 1.583640334650597, + "learning_rate": 1e-05, + "loss": 1.2112, + "step": 1260 + }, + { + "epoch": 0.5681459788240595, + "grad_norm": 1.6079332863360147, + "learning_rate": 1e-05, + "loss": 1.1877, + "step": 1261 + }, + { + "epoch": 0.5685965307501689, + "grad_norm": 1.6952757868493882, + "learning_rate": 1e-05, + "loss": 1.1768, + "step": 1262 + }, + { + "epoch": 0.5690470826762785, + "grad_norm": 1.668674496114196, + "learning_rate": 1e-05, + "loss": 1.174, + "step": 1263 + }, + { + "epoch": 0.5694976346023879, + "grad_norm": 1.9683106986964076, + "learning_rate": 1e-05, + "loss": 1.229, + "step": 1264 + }, + { + "epoch": 0.5699481865284974, + "grad_norm": 1.739019753022858, + "learning_rate": 1e-05, + "loss": 1.17, + "step": 1265 + }, + { + "epoch": 0.5703987384546069, + "grad_norm": 1.692161131470966, + "learning_rate": 1e-05, + "loss": 1.1942, + "step": 1266 + }, + { + "epoch": 0.5708492903807164, + "grad_norm": 1.7293550537323032, + "learning_rate": 1e-05, + "loss": 1.1882, + "step": 1267 + }, + { + "epoch": 0.5712998423068258, + "grad_norm": 1.7932383980869326, + "learning_rate": 1e-05, + "loss": 1.2103, + "step": 1268 + }, + { + "epoch": 0.5717503942329354, + "grad_norm": 1.7337813641116824, + "learning_rate": 1e-05, + "loss": 1.1794, + "step": 1269 + }, + { + "epoch": 0.5722009461590448, + "grad_norm": 1.7602788271157532, + "learning_rate": 1e-05, + "loss": 1.2374, + "step": 1270 + }, + { + "epoch": 0.5726514980851544, + "grad_norm": 1.663773026864633, + "learning_rate": 1e-05, + "loss": 1.2071, + "step": 1271 + }, + { + "epoch": 0.5731020500112638, + "grad_norm": 1.806661906427446, + "learning_rate": 1e-05, + "loss": 1.1879, + "step": 1272 + }, + { + "epoch": 0.5735526019373732, + "grad_norm": 1.722384599755511, + 
"learning_rate": 1e-05, + "loss": 1.1938, + "step": 1273 + }, + { + "epoch": 0.5740031538634828, + "grad_norm": 1.8231291007726889, + "learning_rate": 1e-05, + "loss": 1.2121, + "step": 1274 + }, + { + "epoch": 0.5744537057895922, + "grad_norm": 1.8167541605452744, + "learning_rate": 1e-05, + "loss": 1.2475, + "step": 1275 + }, + { + "epoch": 0.5749042577157017, + "grad_norm": 1.7818831527774315, + "learning_rate": 1e-05, + "loss": 1.2115, + "step": 1276 + }, + { + "epoch": 0.5753548096418112, + "grad_norm": 1.6617095262867325, + "learning_rate": 1e-05, + "loss": 1.227, + "step": 1277 + }, + { + "epoch": 0.5758053615679207, + "grad_norm": 1.7007240432875168, + "learning_rate": 1e-05, + "loss": 1.2356, + "step": 1278 + }, + { + "epoch": 0.5762559134940302, + "grad_norm": 1.76925981886769, + "learning_rate": 1e-05, + "loss": 1.1294, + "step": 1279 + }, + { + "epoch": 0.5767064654201397, + "grad_norm": 1.8815257385659285, + "learning_rate": 1e-05, + "loss": 1.1888, + "step": 1280 + }, + { + "epoch": 0.5771570173462491, + "grad_norm": 1.5160018182898973, + "learning_rate": 1e-05, + "loss": 1.2336, + "step": 1281 + }, + { + "epoch": 0.5776075692723587, + "grad_norm": 1.6549581185662503, + "learning_rate": 1e-05, + "loss": 1.251, + "step": 1282 + }, + { + "epoch": 0.5780581211984681, + "grad_norm": 1.756618270675133, + "learning_rate": 1e-05, + "loss": 1.232, + "step": 1283 + }, + { + "epoch": 0.5785086731245777, + "grad_norm": 1.7419052722691228, + "learning_rate": 1e-05, + "loss": 1.17, + "step": 1284 + }, + { + "epoch": 0.5789592250506871, + "grad_norm": 1.7160979101151757, + "learning_rate": 1e-05, + "loss": 1.2386, + "step": 1285 + }, + { + "epoch": 0.5794097769767966, + "grad_norm": 1.8243826066374753, + "learning_rate": 1e-05, + "loss": 1.2472, + "step": 1286 + }, + { + "epoch": 0.5798603289029061, + "grad_norm": 1.933479040541196, + "learning_rate": 1e-05, + "loss": 1.2354, + "step": 1287 + }, + { + "epoch": 0.5803108808290155, + "grad_norm": 1.719961458326181, + "learning_rate": 1e-05, + "loss": 1.1676, + "step": 1288 + }, + { + "epoch": 0.580761432755125, + "grad_norm": 1.627151067117838, + "learning_rate": 1e-05, + "loss": 1.1455, + "step": 1289 + }, + { + "epoch": 0.5812119846812345, + "grad_norm": 1.9107495551066946, + "learning_rate": 1e-05, + "loss": 1.2433, + "step": 1290 + }, + { + "epoch": 0.581662536607344, + "grad_norm": 1.7825032477174885, + "learning_rate": 1e-05, + "loss": 1.2024, + "step": 1291 + }, + { + "epoch": 0.5821130885334534, + "grad_norm": 1.6348409962935258, + "learning_rate": 1e-05, + "loss": 1.2242, + "step": 1292 + }, + { + "epoch": 0.582563640459563, + "grad_norm": 1.699968988824632, + "learning_rate": 1e-05, + "loss": 1.2582, + "step": 1293 + }, + { + "epoch": 0.5830141923856724, + "grad_norm": 1.8974440062338904, + "learning_rate": 1e-05, + "loss": 1.1932, + "step": 1294 + }, + { + "epoch": 0.583464744311782, + "grad_norm": 1.8263445611498854, + "learning_rate": 1e-05, + "loss": 1.1732, + "step": 1295 + }, + { + "epoch": 0.5839152962378914, + "grad_norm": 1.8527801966410529, + "learning_rate": 1e-05, + "loss": 1.2289, + "step": 1296 + }, + { + "epoch": 0.584365848164001, + "grad_norm": 1.7644064170863516, + "learning_rate": 1e-05, + "loss": 1.2403, + "step": 1297 + }, + { + "epoch": 0.5848164000901104, + "grad_norm": 1.686109827756735, + "learning_rate": 1e-05, + "loss": 1.2233, + "step": 1298 + }, + { + "epoch": 0.5852669520162199, + "grad_norm": 1.6650733956443948, + "learning_rate": 1e-05, + "loss": 1.1874, + "step": 1299 + }, + { + "epoch": 
0.5857175039423294, + "grad_norm": 1.871982358828454, + "learning_rate": 1e-05, + "loss": 1.188, + "step": 1300 + }, + { + "epoch": 0.5861680558684388, + "grad_norm": 1.8230592097279792, + "learning_rate": 1e-05, + "loss": 1.2099, + "step": 1301 + }, + { + "epoch": 0.5866186077945483, + "grad_norm": 1.6857220231101702, + "learning_rate": 1e-05, + "loss": 1.2375, + "step": 1302 + }, + { + "epoch": 0.5870691597206578, + "grad_norm": 1.8959198732393088, + "learning_rate": 1e-05, + "loss": 1.1937, + "step": 1303 + }, + { + "epoch": 0.5875197116467673, + "grad_norm": 1.7160428014338607, + "learning_rate": 1e-05, + "loss": 1.1922, + "step": 1304 + }, + { + "epoch": 0.5879702635728767, + "grad_norm": 1.7226545455972886, + "learning_rate": 1e-05, + "loss": 1.2047, + "step": 1305 + }, + { + "epoch": 0.5884208154989863, + "grad_norm": 1.8027359299917531, + "learning_rate": 1e-05, + "loss": 1.2287, + "step": 1306 + }, + { + "epoch": 0.5888713674250957, + "grad_norm": 1.7292176768544731, + "learning_rate": 1e-05, + "loss": 1.2721, + "step": 1307 + }, + { + "epoch": 0.5893219193512053, + "grad_norm": 1.776100854875579, + "learning_rate": 1e-05, + "loss": 1.2539, + "step": 1308 + }, + { + "epoch": 0.5897724712773147, + "grad_norm": 1.81697377707163, + "learning_rate": 1e-05, + "loss": 1.2325, + "step": 1309 + }, + { + "epoch": 0.5902230232034242, + "grad_norm": 1.8478715783483286, + "learning_rate": 1e-05, + "loss": 1.1742, + "step": 1310 + }, + { + "epoch": 0.5906735751295337, + "grad_norm": 1.6954920926484498, + "learning_rate": 1e-05, + "loss": 1.1995, + "step": 1311 + }, + { + "epoch": 0.5911241270556432, + "grad_norm": 1.7772856974838311, + "learning_rate": 1e-05, + "loss": 1.2106, + "step": 1312 + }, + { + "epoch": 0.5915746789817526, + "grad_norm": 1.790739411389188, + "learning_rate": 1e-05, + "loss": 1.1941, + "step": 1313 + }, + { + "epoch": 0.5920252309078621, + "grad_norm": 1.6940522084311165, + "learning_rate": 1e-05, + "loss": 1.2367, + "step": 1314 + }, + { + "epoch": 0.5924757828339716, + "grad_norm": 1.8268985226553285, + "learning_rate": 1e-05, + "loss": 1.2083, + "step": 1315 + }, + { + "epoch": 0.592926334760081, + "grad_norm": 1.9954520134048521, + "learning_rate": 1e-05, + "loss": 1.179, + "step": 1316 + }, + { + "epoch": 0.5933768866861906, + "grad_norm": 1.8629824076550818, + "learning_rate": 1e-05, + "loss": 1.2344, + "step": 1317 + }, + { + "epoch": 0.5938274386123, + "grad_norm": 2.0103774181218332, + "learning_rate": 1e-05, + "loss": 1.2594, + "step": 1318 + }, + { + "epoch": 0.5942779905384096, + "grad_norm": 1.6660420981044828, + "learning_rate": 1e-05, + "loss": 1.1565, + "step": 1319 + }, + { + "epoch": 0.594728542464519, + "grad_norm": 1.6853788872552433, + "learning_rate": 1e-05, + "loss": 1.1985, + "step": 1320 + }, + { + "epoch": 0.5951790943906285, + "grad_norm": 1.8487969481917361, + "learning_rate": 1e-05, + "loss": 1.2616, + "step": 1321 + }, + { + "epoch": 0.595629646316738, + "grad_norm": 1.8469994164982355, + "learning_rate": 1e-05, + "loss": 1.2165, + "step": 1322 + }, + { + "epoch": 0.5960801982428475, + "grad_norm": 1.7808612583191794, + "learning_rate": 1e-05, + "loss": 1.196, + "step": 1323 + }, + { + "epoch": 0.596530750168957, + "grad_norm": 1.6874331263721494, + "learning_rate": 1e-05, + "loss": 1.2272, + "step": 1324 + }, + { + "epoch": 0.5969813020950665, + "grad_norm": 1.7758083439810735, + "learning_rate": 1e-05, + "loss": 1.1748, + "step": 1325 + }, + { + "epoch": 0.5974318540211759, + "grad_norm": 1.6678835969274108, + "learning_rate": 1e-05, + 
"loss": 1.2267, + "step": 1326 + }, + { + "epoch": 0.5978824059472855, + "grad_norm": 1.803673440473812, + "learning_rate": 1e-05, + "loss": 1.1627, + "step": 1327 + }, + { + "epoch": 0.5983329578733949, + "grad_norm": 1.8984680335014743, + "learning_rate": 1e-05, + "loss": 1.2117, + "step": 1328 + }, + { + "epoch": 0.5987835097995043, + "grad_norm": 1.8927002548152252, + "learning_rate": 1e-05, + "loss": 1.2069, + "step": 1329 + }, + { + "epoch": 0.5992340617256139, + "grad_norm": 1.7466479887423474, + "learning_rate": 1e-05, + "loss": 1.2336, + "step": 1330 + }, + { + "epoch": 0.5996846136517233, + "grad_norm": 1.8992416636445528, + "learning_rate": 1e-05, + "loss": 1.1989, + "step": 1331 + }, + { + "epoch": 0.6001351655778329, + "grad_norm": 1.8491010305275333, + "learning_rate": 1e-05, + "loss": 1.2014, + "step": 1332 + }, + { + "epoch": 0.6005857175039423, + "grad_norm": 1.819825114115824, + "learning_rate": 1e-05, + "loss": 1.2468, + "step": 1333 + }, + { + "epoch": 0.6010362694300518, + "grad_norm": 1.7991712243380846, + "learning_rate": 1e-05, + "loss": 1.1827, + "step": 1334 + }, + { + "epoch": 0.6014868213561613, + "grad_norm": 1.7071817076402331, + "learning_rate": 1e-05, + "loss": 1.2202, + "step": 1335 + }, + { + "epoch": 0.6019373732822708, + "grad_norm": 1.7303007334112557, + "learning_rate": 1e-05, + "loss": 1.2097, + "step": 1336 + }, + { + "epoch": 0.6023879252083802, + "grad_norm": 1.5340971873303832, + "learning_rate": 1e-05, + "loss": 1.1909, + "step": 1337 + }, + { + "epoch": 0.6028384771344898, + "grad_norm": 1.6591955553042324, + "learning_rate": 1e-05, + "loss": 1.2635, + "step": 1338 + }, + { + "epoch": 0.6032890290605992, + "grad_norm": 1.8169846626883783, + "learning_rate": 1e-05, + "loss": 1.2285, + "step": 1339 + }, + { + "epoch": 0.6037395809867088, + "grad_norm": 2.0689419123732202, + "learning_rate": 1e-05, + "loss": 1.2503, + "step": 1340 + }, + { + "epoch": 0.6041901329128182, + "grad_norm": 1.8044095506809605, + "learning_rate": 1e-05, + "loss": 1.2008, + "step": 1341 + }, + { + "epoch": 0.6046406848389276, + "grad_norm": 1.7016093429282768, + "learning_rate": 1e-05, + "loss": 1.2318, + "step": 1342 + }, + { + "epoch": 0.6050912367650372, + "grad_norm": 1.6926471157882403, + "learning_rate": 1e-05, + "loss": 1.1924, + "step": 1343 + }, + { + "epoch": 0.6055417886911466, + "grad_norm": 1.726036101180517, + "learning_rate": 1e-05, + "loss": 1.2183, + "step": 1344 + }, + { + "epoch": 0.6059923406172562, + "grad_norm": 1.6284686389088292, + "learning_rate": 1e-05, + "loss": 1.2263, + "step": 1345 + }, + { + "epoch": 0.6064428925433656, + "grad_norm": 1.8219348319849917, + "learning_rate": 1e-05, + "loss": 1.2356, + "step": 1346 + }, + { + "epoch": 0.6068934444694751, + "grad_norm": 1.867304570054251, + "learning_rate": 1e-05, + "loss": 1.2246, + "step": 1347 + }, + { + "epoch": 0.6073439963955846, + "grad_norm": 1.7241172819179993, + "learning_rate": 1e-05, + "loss": 1.1936, + "step": 1348 + }, + { + "epoch": 0.6077945483216941, + "grad_norm": 1.6534720493893311, + "learning_rate": 1e-05, + "loss": 1.2811, + "step": 1349 + }, + { + "epoch": 0.6082451002478035, + "grad_norm": 1.6786794751334155, + "learning_rate": 1e-05, + "loss": 1.1915, + "step": 1350 + }, + { + "epoch": 0.6086956521739131, + "grad_norm": 1.7585178814796325, + "learning_rate": 1e-05, + "loss": 1.1843, + "step": 1351 + }, + { + "epoch": 0.6091462041000225, + "grad_norm": 1.7960034529195434, + "learning_rate": 1e-05, + "loss": 1.1596, + "step": 1352 + }, + { + "epoch": 0.6095967560261321, + 
"grad_norm": 1.6927853300950366, + "learning_rate": 1e-05, + "loss": 1.2017, + "step": 1353 + }, + { + "epoch": 0.6100473079522415, + "grad_norm": 1.7275520080961777, + "learning_rate": 1e-05, + "loss": 1.2036, + "step": 1354 + }, + { + "epoch": 0.610497859878351, + "grad_norm": 1.9279648803221345, + "learning_rate": 1e-05, + "loss": 1.2357, + "step": 1355 + }, + { + "epoch": 0.6109484118044605, + "grad_norm": 1.716209317173508, + "learning_rate": 1e-05, + "loss": 1.2058, + "step": 1356 + }, + { + "epoch": 0.6113989637305699, + "grad_norm": 1.7036585157626531, + "learning_rate": 1e-05, + "loss": 1.1875, + "step": 1357 + }, + { + "epoch": 0.6118495156566794, + "grad_norm": 1.6337918035391776, + "learning_rate": 1e-05, + "loss": 1.2264, + "step": 1358 + }, + { + "epoch": 0.6123000675827889, + "grad_norm": 1.5820117076593503, + "learning_rate": 1e-05, + "loss": 1.2066, + "step": 1359 + }, + { + "epoch": 0.6127506195088984, + "grad_norm": 1.930044225209007, + "learning_rate": 1e-05, + "loss": 1.2437, + "step": 1360 + }, + { + "epoch": 0.6132011714350079, + "grad_norm": 1.6566093920031004, + "learning_rate": 1e-05, + "loss": 1.1883, + "step": 1361 + }, + { + "epoch": 0.6136517233611174, + "grad_norm": 1.779881594851804, + "learning_rate": 1e-05, + "loss": 1.2089, + "step": 1362 + }, + { + "epoch": 0.6141022752872268, + "grad_norm": 1.764450046969482, + "learning_rate": 1e-05, + "loss": 1.2103, + "step": 1363 + }, + { + "epoch": 0.6145528272133364, + "grad_norm": 1.7755722026363077, + "learning_rate": 1e-05, + "loss": 1.2131, + "step": 1364 + }, + { + "epoch": 0.6150033791394458, + "grad_norm": 1.7162557771636968, + "learning_rate": 1e-05, + "loss": 1.2231, + "step": 1365 + }, + { + "epoch": 0.6154539310655553, + "grad_norm": 1.7411428223191674, + "learning_rate": 1e-05, + "loss": 1.1688, + "step": 1366 + }, + { + "epoch": 0.6159044829916648, + "grad_norm": 1.5915237847294443, + "learning_rate": 1e-05, + "loss": 1.1696, + "step": 1367 + }, + { + "epoch": 0.6163550349177743, + "grad_norm": 1.6602744896219306, + "learning_rate": 1e-05, + "loss": 1.203, + "step": 1368 + }, + { + "epoch": 0.6168055868438838, + "grad_norm": 1.6811181986810635, + "learning_rate": 1e-05, + "loss": 1.1846, + "step": 1369 + }, + { + "epoch": 0.6172561387699932, + "grad_norm": 1.8880690213174565, + "learning_rate": 1e-05, + "loss": 1.2611, + "step": 1370 + }, + { + "epoch": 0.6177066906961027, + "grad_norm": 1.6852043893589732, + "learning_rate": 1e-05, + "loss": 1.2065, + "step": 1371 + }, + { + "epoch": 0.6181572426222122, + "grad_norm": 1.7370535118725405, + "learning_rate": 1e-05, + "loss": 1.2133, + "step": 1372 + }, + { + "epoch": 0.6186077945483217, + "grad_norm": 1.8432188590958254, + "learning_rate": 1e-05, + "loss": 1.1533, + "step": 1373 + }, + { + "epoch": 0.6190583464744311, + "grad_norm": 1.7090225875378293, + "learning_rate": 1e-05, + "loss": 1.2126, + "step": 1374 + }, + { + "epoch": 0.6195088984005407, + "grad_norm": 1.7471266346091723, + "learning_rate": 1e-05, + "loss": 1.229, + "step": 1375 + }, + { + "epoch": 0.6199594503266501, + "grad_norm": 1.6751556261213432, + "learning_rate": 1e-05, + "loss": 1.1679, + "step": 1376 + }, + { + "epoch": 0.6204100022527597, + "grad_norm": 1.840837168317006, + "learning_rate": 1e-05, + "loss": 1.2597, + "step": 1377 + }, + { + "epoch": 0.6208605541788691, + "grad_norm": 1.7050669339799445, + "learning_rate": 1e-05, + "loss": 1.1779, + "step": 1378 + }, + { + "epoch": 0.6213111061049786, + "grad_norm": 1.675799407147007, + "learning_rate": 1e-05, + "loss": 1.1792, + 
"step": 1379 + }, + { + "epoch": 0.6217616580310881, + "grad_norm": 1.8471267408877814, + "learning_rate": 1e-05, + "loss": 1.1939, + "step": 1380 + }, + { + "epoch": 0.6222122099571976, + "grad_norm": 1.7646840616526538, + "learning_rate": 1e-05, + "loss": 1.1809, + "step": 1381 + }, + { + "epoch": 0.622662761883307, + "grad_norm": 1.613532346023576, + "learning_rate": 1e-05, + "loss": 1.2278, + "step": 1382 + }, + { + "epoch": 0.6231133138094165, + "grad_norm": 1.7524354574585752, + "learning_rate": 1e-05, + "loss": 1.1992, + "step": 1383 + }, + { + "epoch": 0.623563865735526, + "grad_norm": 1.8946756153547877, + "learning_rate": 1e-05, + "loss": 1.218, + "step": 1384 + }, + { + "epoch": 0.6240144176616355, + "grad_norm": 1.8070652923583572, + "learning_rate": 1e-05, + "loss": 1.2454, + "step": 1385 + }, + { + "epoch": 0.624464969587745, + "grad_norm": 1.934785501668537, + "learning_rate": 1e-05, + "loss": 1.1521, + "step": 1386 + }, + { + "epoch": 0.6249155215138544, + "grad_norm": 1.7400549394661604, + "learning_rate": 1e-05, + "loss": 1.196, + "step": 1387 + }, + { + "epoch": 0.625366073439964, + "grad_norm": 1.607968603657251, + "learning_rate": 1e-05, + "loss": 1.199, + "step": 1388 + }, + { + "epoch": 0.6258166253660734, + "grad_norm": 1.8044675154447942, + "learning_rate": 1e-05, + "loss": 1.1741, + "step": 1389 + }, + { + "epoch": 0.626267177292183, + "grad_norm": 1.7688196483248884, + "learning_rate": 1e-05, + "loss": 1.1747, + "step": 1390 + }, + { + "epoch": 0.6267177292182924, + "grad_norm": 1.759460497999574, + "learning_rate": 1e-05, + "loss": 1.1521, + "step": 1391 + }, + { + "epoch": 0.6271682811444019, + "grad_norm": 1.7143078451595086, + "learning_rate": 1e-05, + "loss": 1.2089, + "step": 1392 + }, + { + "epoch": 0.6276188330705114, + "grad_norm": 1.7577585746100481, + "learning_rate": 1e-05, + "loss": 1.2127, + "step": 1393 + }, + { + "epoch": 0.6280693849966209, + "grad_norm": 1.7184496518733532, + "learning_rate": 1e-05, + "loss": 1.2616, + "step": 1394 + }, + { + "epoch": 0.6285199369227303, + "grad_norm": 1.8030703326828137, + "learning_rate": 1e-05, + "loss": 1.2213, + "step": 1395 + }, + { + "epoch": 0.6289704888488399, + "grad_norm": 1.7208734506499097, + "learning_rate": 1e-05, + "loss": 1.2436, + "step": 1396 + }, + { + "epoch": 0.6294210407749493, + "grad_norm": 1.8016050010759244, + "learning_rate": 1e-05, + "loss": 1.2276, + "step": 1397 + }, + { + "epoch": 0.6298715927010587, + "grad_norm": 1.828897332337839, + "learning_rate": 1e-05, + "loss": 1.1914, + "step": 1398 + }, + { + "epoch": 0.6303221446271683, + "grad_norm": 1.7219910585566085, + "learning_rate": 1e-05, + "loss": 1.1792, + "step": 1399 + }, + { + "epoch": 0.6307726965532777, + "grad_norm": 1.8449535380865347, + "learning_rate": 1e-05, + "loss": 1.2105, + "step": 1400 + }, + { + "epoch": 0.6312232484793873, + "grad_norm": 1.6087786279962788, + "learning_rate": 1e-05, + "loss": 1.1846, + "step": 1401 + }, + { + "epoch": 0.6316738004054967, + "grad_norm": 2.0102078883332144, + "learning_rate": 1e-05, + "loss": 1.2488, + "step": 1402 + }, + { + "epoch": 0.6321243523316062, + "grad_norm": 1.7119388385471392, + "learning_rate": 1e-05, + "loss": 1.2099, + "step": 1403 + }, + { + "epoch": 0.6325749042577157, + "grad_norm": 1.6487861408950628, + "learning_rate": 1e-05, + "loss": 1.1942, + "step": 1404 + }, + { + "epoch": 0.6330254561838252, + "grad_norm": 1.6649600775876516, + "learning_rate": 1e-05, + "loss": 1.2256, + "step": 1405 + }, + { + "epoch": 0.6334760081099347, + "grad_norm": 
1.782388209434633, + "learning_rate": 1e-05, + "loss": 1.2136, + "step": 1406 + }, + { + "epoch": 0.6339265600360442, + "grad_norm": 1.7316483411168198, + "learning_rate": 1e-05, + "loss": 1.2479, + "step": 1407 + }, + { + "epoch": 0.6343771119621536, + "grad_norm": 1.8474748436342696, + "learning_rate": 1e-05, + "loss": 1.2448, + "step": 1408 + }, + { + "epoch": 0.6348276638882632, + "grad_norm": 1.8881771017116113, + "learning_rate": 1e-05, + "loss": 1.2296, + "step": 1409 + }, + { + "epoch": 0.6352782158143726, + "grad_norm": 1.6815920585030426, + "learning_rate": 1e-05, + "loss": 1.2309, + "step": 1410 + }, + { + "epoch": 0.635728767740482, + "grad_norm": 1.7368229873136294, + "learning_rate": 1e-05, + "loss": 1.1924, + "step": 1411 + }, + { + "epoch": 0.6361793196665916, + "grad_norm": 1.622168896559517, + "learning_rate": 1e-05, + "loss": 1.2089, + "step": 1412 + }, + { + "epoch": 0.636629871592701, + "grad_norm": 1.912293481955647, + "learning_rate": 1e-05, + "loss": 1.1906, + "step": 1413 + }, + { + "epoch": 0.6370804235188106, + "grad_norm": 1.6915846270575778, + "learning_rate": 1e-05, + "loss": 1.179, + "step": 1414 + }, + { + "epoch": 0.63753097544492, + "grad_norm": 1.7556753029844074, + "learning_rate": 1e-05, + "loss": 1.1796, + "step": 1415 + }, + { + "epoch": 0.6379815273710295, + "grad_norm": 1.6918747056678867, + "learning_rate": 1e-05, + "loss": 1.2222, + "step": 1416 + }, + { + "epoch": 0.638432079297139, + "grad_norm": 1.7310795539635673, + "learning_rate": 1e-05, + "loss": 1.1973, + "step": 1417 + }, + { + "epoch": 0.6388826312232485, + "grad_norm": 1.6894866324396298, + "learning_rate": 1e-05, + "loss": 1.2472, + "step": 1418 + }, + { + "epoch": 0.6393331831493579, + "grad_norm": 1.7382727610437598, + "learning_rate": 1e-05, + "loss": 1.2301, + "step": 1419 + }, + { + "epoch": 0.6397837350754675, + "grad_norm": 1.943319888398003, + "learning_rate": 1e-05, + "loss": 1.1965, + "step": 1420 + }, + { + "epoch": 0.6402342870015769, + "grad_norm": 1.696952940528892, + "learning_rate": 1e-05, + "loss": 1.2014, + "step": 1421 + }, + { + "epoch": 0.6406848389276865, + "grad_norm": 1.714810552322869, + "learning_rate": 1e-05, + "loss": 1.183, + "step": 1422 + }, + { + "epoch": 0.6411353908537959, + "grad_norm": 1.7048297703948376, + "learning_rate": 1e-05, + "loss": 1.2197, + "step": 1423 + }, + { + "epoch": 0.6415859427799054, + "grad_norm": 1.7726722279557274, + "learning_rate": 1e-05, + "loss": 1.2121, + "step": 1424 + }, + { + "epoch": 0.6420364947060149, + "grad_norm": 1.6887525653748252, + "learning_rate": 1e-05, + "loss": 1.2374, + "step": 1425 + }, + { + "epoch": 0.6424870466321243, + "grad_norm": 1.8340053037918147, + "learning_rate": 1e-05, + "loss": 1.2096, + "step": 1426 + }, + { + "epoch": 0.6429375985582338, + "grad_norm": 1.7779256927355376, + "learning_rate": 1e-05, + "loss": 1.1675, + "step": 1427 + }, + { + "epoch": 0.6433881504843433, + "grad_norm": 1.7405618803764125, + "learning_rate": 1e-05, + "loss": 1.1953, + "step": 1428 + }, + { + "epoch": 0.6438387024104528, + "grad_norm": 1.7971599220737207, + "learning_rate": 1e-05, + "loss": 1.2136, + "step": 1429 + }, + { + "epoch": 0.6442892543365623, + "grad_norm": 1.7114769331843658, + "learning_rate": 1e-05, + "loss": 1.2055, + "step": 1430 + }, + { + "epoch": 0.6447398062626718, + "grad_norm": 1.7760274627193031, + "learning_rate": 1e-05, + "loss": 1.156, + "step": 1431 + }, + { + "epoch": 0.6451903581887812, + "grad_norm": 1.7056122994068905, + "learning_rate": 1e-05, + "loss": 1.2215, + "step": 1432 + }, 
+ { + "epoch": 0.6456409101148908, + "grad_norm": 1.7839219911420403, + "learning_rate": 1e-05, + "loss": 1.2541, + "step": 1433 + }, + { + "epoch": 0.6460914620410002, + "grad_norm": 1.8744821096312974, + "learning_rate": 1e-05, + "loss": 1.2339, + "step": 1434 + }, + { + "epoch": 0.6465420139671098, + "grad_norm": 1.839865376865044, + "learning_rate": 1e-05, + "loss": 1.2439, + "step": 1435 + }, + { + "epoch": 0.6469925658932192, + "grad_norm": 1.8639431131353121, + "learning_rate": 1e-05, + "loss": 1.1954, + "step": 1436 + }, + { + "epoch": 0.6474431178193287, + "grad_norm": 1.8293807546531986, + "learning_rate": 1e-05, + "loss": 1.175, + "step": 1437 + }, + { + "epoch": 0.6478936697454382, + "grad_norm": 1.6694526478693845, + "learning_rate": 1e-05, + "loss": 1.1901, + "step": 1438 + }, + { + "epoch": 0.6483442216715476, + "grad_norm": 1.8084879536612948, + "learning_rate": 1e-05, + "loss": 1.2469, + "step": 1439 + }, + { + "epoch": 0.6487947735976571, + "grad_norm": 1.7432929325836355, + "learning_rate": 1e-05, + "loss": 1.223, + "step": 1440 + }, + { + "epoch": 0.6492453255237666, + "grad_norm": 1.5605053909073308, + "learning_rate": 1e-05, + "loss": 1.2234, + "step": 1441 + }, + { + "epoch": 0.6496958774498761, + "grad_norm": 1.6995774197072129, + "learning_rate": 1e-05, + "loss": 1.2038, + "step": 1442 + }, + { + "epoch": 0.6501464293759855, + "grad_norm": 1.649924736851965, + "learning_rate": 1e-05, + "loss": 1.2179, + "step": 1443 + }, + { + "epoch": 0.6505969813020951, + "grad_norm": 1.677375810646424, + "learning_rate": 1e-05, + "loss": 1.215, + "step": 1444 + }, + { + "epoch": 0.6510475332282045, + "grad_norm": 1.687153477900548, + "learning_rate": 1e-05, + "loss": 1.2227, + "step": 1445 + }, + { + "epoch": 0.6514980851543141, + "grad_norm": 1.7842664158579138, + "learning_rate": 1e-05, + "loss": 1.2258, + "step": 1446 + }, + { + "epoch": 0.6519486370804235, + "grad_norm": 1.7699069004158208, + "learning_rate": 1e-05, + "loss": 1.2028, + "step": 1447 + }, + { + "epoch": 0.652399189006533, + "grad_norm": 1.7048564627062228, + "learning_rate": 1e-05, + "loss": 1.1685, + "step": 1448 + }, + { + "epoch": 0.6528497409326425, + "grad_norm": 1.7209083290062486, + "learning_rate": 1e-05, + "loss": 1.2508, + "step": 1449 + }, + { + "epoch": 0.653300292858752, + "grad_norm": 1.7614080078327725, + "learning_rate": 1e-05, + "loss": 1.1635, + "step": 1450 + }, + { + "epoch": 0.6537508447848615, + "grad_norm": 1.9896284472985386, + "learning_rate": 1e-05, + "loss": 1.1742, + "step": 1451 + }, + { + "epoch": 0.654201396710971, + "grad_norm": 2.0460215106674284, + "learning_rate": 1e-05, + "loss": 1.2027, + "step": 1452 + }, + { + "epoch": 0.6546519486370804, + "grad_norm": 1.7851444286720592, + "learning_rate": 1e-05, + "loss": 1.1954, + "step": 1453 + }, + { + "epoch": 0.6551025005631899, + "grad_norm": 1.875687692273614, + "learning_rate": 1e-05, + "loss": 1.2338, + "step": 1454 + }, + { + "epoch": 0.6555530524892994, + "grad_norm": 1.8123153895280242, + "learning_rate": 1e-05, + "loss": 1.2024, + "step": 1455 + }, + { + "epoch": 0.6560036044154088, + "grad_norm": 1.6178100607967243, + "learning_rate": 1e-05, + "loss": 1.1982, + "step": 1456 + }, + { + "epoch": 0.6564541563415184, + "grad_norm": 1.7615096838359687, + "learning_rate": 1e-05, + "loss": 1.1939, + "step": 1457 + }, + { + "epoch": 0.6569047082676278, + "grad_norm": 1.8122650299160075, + "learning_rate": 1e-05, + "loss": 1.1995, + "step": 1458 + }, + { + "epoch": 0.6573552601937374, + "grad_norm": 1.7898039749755277, + 
"learning_rate": 1e-05, + "loss": 1.2364, + "step": 1459 + }, + { + "epoch": 0.6578058121198468, + "grad_norm": 1.7004419153173866, + "learning_rate": 1e-05, + "loss": 1.1836, + "step": 1460 + }, + { + "epoch": 0.6582563640459563, + "grad_norm": 1.6632662674579606, + "learning_rate": 1e-05, + "loss": 1.2021, + "step": 1461 + }, + { + "epoch": 0.6587069159720658, + "grad_norm": 1.744403489143352, + "learning_rate": 1e-05, + "loss": 1.1767, + "step": 1462 + }, + { + "epoch": 0.6591574678981753, + "grad_norm": 1.8525400652515043, + "learning_rate": 1e-05, + "loss": 1.1795, + "step": 1463 + }, + { + "epoch": 0.6596080198242847, + "grad_norm": 1.788612318743038, + "learning_rate": 1e-05, + "loss": 1.2182, + "step": 1464 + }, + { + "epoch": 0.6600585717503943, + "grad_norm": 1.8299636065907605, + "learning_rate": 1e-05, + "loss": 1.2462, + "step": 1465 + }, + { + "epoch": 0.6605091236765037, + "grad_norm": 1.8145114327639338, + "learning_rate": 1e-05, + "loss": 1.2028, + "step": 1466 + }, + { + "epoch": 0.6609596756026132, + "grad_norm": 1.7717768279354946, + "learning_rate": 1e-05, + "loss": 1.2048, + "step": 1467 + }, + { + "epoch": 0.6614102275287227, + "grad_norm": 1.7837418410858035, + "learning_rate": 1e-05, + "loss": 1.1785, + "step": 1468 + }, + { + "epoch": 0.6618607794548321, + "grad_norm": 1.8712209185517494, + "learning_rate": 1e-05, + "loss": 1.2264, + "step": 1469 + }, + { + "epoch": 0.6623113313809417, + "grad_norm": 1.7860580091025848, + "learning_rate": 1e-05, + "loss": 1.1879, + "step": 1470 + }, + { + "epoch": 0.6627618833070511, + "grad_norm": 1.6606100128063292, + "learning_rate": 1e-05, + "loss": 1.1584, + "step": 1471 + }, + { + "epoch": 0.6632124352331606, + "grad_norm": 1.6797327456630582, + "learning_rate": 1e-05, + "loss": 1.2382, + "step": 1472 + }, + { + "epoch": 0.6636629871592701, + "grad_norm": 1.779156003997943, + "learning_rate": 1e-05, + "loss": 1.2082, + "step": 1473 + }, + { + "epoch": 0.6641135390853796, + "grad_norm": 1.8478302534625182, + "learning_rate": 1e-05, + "loss": 1.2254, + "step": 1474 + }, + { + "epoch": 0.664564091011489, + "grad_norm": 1.6148156185454428, + "learning_rate": 1e-05, + "loss": 1.249, + "step": 1475 + }, + { + "epoch": 0.6650146429375986, + "grad_norm": 1.75522802249122, + "learning_rate": 1e-05, + "loss": 1.2592, + "step": 1476 + }, + { + "epoch": 0.665465194863708, + "grad_norm": 1.5213446285248269, + "learning_rate": 1e-05, + "loss": 1.2248, + "step": 1477 + }, + { + "epoch": 0.6659157467898176, + "grad_norm": 1.7710285474424141, + "learning_rate": 1e-05, + "loss": 1.2387, + "step": 1478 + }, + { + "epoch": 0.666366298715927, + "grad_norm": 1.680337020579166, + "learning_rate": 1e-05, + "loss": 1.289, + "step": 1479 + }, + { + "epoch": 0.6668168506420364, + "grad_norm": 1.7109187930443421, + "learning_rate": 1e-05, + "loss": 1.194, + "step": 1480 + }, + { + "epoch": 0.667267402568146, + "grad_norm": 1.8027228771531587, + "learning_rate": 1e-05, + "loss": 1.2099, + "step": 1481 + }, + { + "epoch": 0.6677179544942554, + "grad_norm": 1.6876214039504391, + "learning_rate": 1e-05, + "loss": 1.1938, + "step": 1482 + }, + { + "epoch": 0.668168506420365, + "grad_norm": 1.8428577669555128, + "learning_rate": 1e-05, + "loss": 1.1474, + "step": 1483 + }, + { + "epoch": 0.6686190583464744, + "grad_norm": 1.6964020418470995, + "learning_rate": 1e-05, + "loss": 1.1333, + "step": 1484 + }, + { + "epoch": 0.6690696102725839, + "grad_norm": 1.7276792301617687, + "learning_rate": 1e-05, + "loss": 1.2064, + "step": 1485 + }, + { + "epoch": 
0.6695201621986934, + "grad_norm": 1.7431273311944988, + "learning_rate": 1e-05, + "loss": 1.1688, + "step": 1486 + }, + { + "epoch": 0.6699707141248029, + "grad_norm": 1.8501026168048027, + "learning_rate": 1e-05, + "loss": 1.2096, + "step": 1487 + }, + { + "epoch": 0.6704212660509123, + "grad_norm": 1.7677780866844144, + "learning_rate": 1e-05, + "loss": 1.2035, + "step": 1488 + }, + { + "epoch": 0.6708718179770219, + "grad_norm": 1.7150756049457243, + "learning_rate": 1e-05, + "loss": 1.2254, + "step": 1489 + }, + { + "epoch": 0.6713223699031313, + "grad_norm": 1.7671950790182738, + "learning_rate": 1e-05, + "loss": 1.1698, + "step": 1490 + }, + { + "epoch": 0.6717729218292409, + "grad_norm": 1.5858226806327806, + "learning_rate": 1e-05, + "loss": 1.2184, + "step": 1491 + }, + { + "epoch": 0.6722234737553503, + "grad_norm": 1.717479513832325, + "learning_rate": 1e-05, + "loss": 1.2017, + "step": 1492 + }, + { + "epoch": 0.6726740256814598, + "grad_norm": 1.7178760535041833, + "learning_rate": 1e-05, + "loss": 1.1896, + "step": 1493 + }, + { + "epoch": 0.6731245776075693, + "grad_norm": 1.7378252367998972, + "learning_rate": 1e-05, + "loss": 1.1888, + "step": 1494 + }, + { + "epoch": 0.6735751295336787, + "grad_norm": 1.686646531763602, + "learning_rate": 1e-05, + "loss": 1.2322, + "step": 1495 + }, + { + "epoch": 0.6740256814597883, + "grad_norm": 1.7785805981710803, + "learning_rate": 1e-05, + "loss": 1.1652, + "step": 1496 + }, + { + "epoch": 0.6744762333858977, + "grad_norm": 1.863415067381876, + "learning_rate": 1e-05, + "loss": 1.1878, + "step": 1497 + }, + { + "epoch": 0.6749267853120072, + "grad_norm": 1.7432798280237887, + "learning_rate": 1e-05, + "loss": 1.2158, + "step": 1498 + }, + { + "epoch": 0.6753773372381167, + "grad_norm": 1.6729316504931013, + "learning_rate": 1e-05, + "loss": 1.2433, + "step": 1499 + }, + { + "epoch": 0.6758278891642262, + "grad_norm": 1.6629313114328055, + "learning_rate": 1e-05, + "loss": 1.1942, + "step": 1500 + }, + { + "epoch": 0.6762784410903356, + "grad_norm": 1.6065777404365544, + "learning_rate": 1e-05, + "loss": 1.2327, + "step": 1501 + }, + { + "epoch": 0.6767289930164452, + "grad_norm": 1.5557977992450556, + "learning_rate": 1e-05, + "loss": 1.1825, + "step": 1502 + }, + { + "epoch": 0.6771795449425546, + "grad_norm": 1.9016853376062381, + "learning_rate": 1e-05, + "loss": 1.1943, + "step": 1503 + }, + { + "epoch": 0.6776300968686642, + "grad_norm": 1.590043710512899, + "learning_rate": 1e-05, + "loss": 1.2031, + "step": 1504 + }, + { + "epoch": 0.6780806487947736, + "grad_norm": 1.7796113694185258, + "learning_rate": 1e-05, + "loss": 1.2222, + "step": 1505 + }, + { + "epoch": 0.6785312007208831, + "grad_norm": 1.7367622130464204, + "learning_rate": 1e-05, + "loss": 1.2028, + "step": 1506 + }, + { + "epoch": 0.6789817526469926, + "grad_norm": 1.5468093833959076, + "learning_rate": 1e-05, + "loss": 1.1449, + "step": 1507 + }, + { + "epoch": 0.679432304573102, + "grad_norm": 1.615544002799072, + "learning_rate": 1e-05, + "loss": 1.181, + "step": 1508 + }, + { + "epoch": 0.6798828564992115, + "grad_norm": 1.6654340696921206, + "learning_rate": 1e-05, + "loss": 1.1835, + "step": 1509 + }, + { + "epoch": 0.680333408425321, + "grad_norm": 1.8111265166768526, + "learning_rate": 1e-05, + "loss": 1.2335, + "step": 1510 + }, + { + "epoch": 0.6807839603514305, + "grad_norm": 1.9126157729140019, + "learning_rate": 1e-05, + "loss": 1.2202, + "step": 1511 + }, + { + "epoch": 0.68123451227754, + "grad_norm": 1.8063762771293985, + "learning_rate": 1e-05, 
+ "loss": 1.1887, + "step": 1512 + }, + { + "epoch": 0.6816850642036495, + "grad_norm": 1.808564622464707, + "learning_rate": 1e-05, + "loss": 1.1837, + "step": 1513 + }, + { + "epoch": 0.6821356161297589, + "grad_norm": 1.7637059867029303, + "learning_rate": 1e-05, + "loss": 1.2166, + "step": 1514 + }, + { + "epoch": 0.6825861680558685, + "grad_norm": 1.6100181389970576, + "learning_rate": 1e-05, + "loss": 1.2362, + "step": 1515 + }, + { + "epoch": 0.6830367199819779, + "grad_norm": 1.8426325458109705, + "learning_rate": 1e-05, + "loss": 1.2488, + "step": 1516 + }, + { + "epoch": 0.6834872719080874, + "grad_norm": 1.829455745565599, + "learning_rate": 1e-05, + "loss": 1.2223, + "step": 1517 + }, + { + "epoch": 0.6839378238341969, + "grad_norm": 1.679105198586097, + "learning_rate": 1e-05, + "loss": 1.1963, + "step": 1518 + }, + { + "epoch": 0.6843883757603064, + "grad_norm": 1.8167196778392032, + "learning_rate": 1e-05, + "loss": 1.234, + "step": 1519 + }, + { + "epoch": 0.6848389276864159, + "grad_norm": 1.8156765386085207, + "learning_rate": 1e-05, + "loss": 1.2101, + "step": 1520 + }, + { + "epoch": 0.6852894796125254, + "grad_norm": 1.7612340227331704, + "learning_rate": 1e-05, + "loss": 1.2334, + "step": 1521 + }, + { + "epoch": 0.6857400315386348, + "grad_norm": 1.7988044156380567, + "learning_rate": 1e-05, + "loss": 1.2129, + "step": 1522 + }, + { + "epoch": 0.6861905834647443, + "grad_norm": 1.9904012936293098, + "learning_rate": 1e-05, + "loss": 1.2339, + "step": 1523 + }, + { + "epoch": 0.6866411353908538, + "grad_norm": 1.8835220745766155, + "learning_rate": 1e-05, + "loss": 1.2215, + "step": 1524 + }, + { + "epoch": 0.6870916873169632, + "grad_norm": 1.6911101505931265, + "learning_rate": 1e-05, + "loss": 1.2243, + "step": 1525 + }, + { + "epoch": 0.6875422392430728, + "grad_norm": 1.9621820059570811, + "learning_rate": 1e-05, + "loss": 1.2103, + "step": 1526 + }, + { + "epoch": 0.6879927911691822, + "grad_norm": 1.5896032578241706, + "learning_rate": 1e-05, + "loss": 1.2022, + "step": 1527 + }, + { + "epoch": 0.6884433430952918, + "grad_norm": 1.6167331916388736, + "learning_rate": 1e-05, + "loss": 1.2181, + "step": 1528 + }, + { + "epoch": 0.6888938950214012, + "grad_norm": 1.8187292950298952, + "learning_rate": 1e-05, + "loss": 1.2165, + "step": 1529 + }, + { + "epoch": 0.6893444469475107, + "grad_norm": 1.8082302021678878, + "learning_rate": 1e-05, + "loss": 1.1962, + "step": 1530 + }, + { + "epoch": 0.6897949988736202, + "grad_norm": 1.7671311925432198, + "learning_rate": 1e-05, + "loss": 1.1896, + "step": 1531 + }, + { + "epoch": 0.6902455507997297, + "grad_norm": 1.5678329258738315, + "learning_rate": 1e-05, + "loss": 1.1907, + "step": 1532 + }, + { + "epoch": 0.6906961027258391, + "grad_norm": 1.695281139458708, + "learning_rate": 1e-05, + "loss": 1.2393, + "step": 1533 + }, + { + "epoch": 0.6911466546519487, + "grad_norm": 1.7283322631447566, + "learning_rate": 1e-05, + "loss": 1.2076, + "step": 1534 + }, + { + "epoch": 0.6915972065780581, + "grad_norm": 1.8763264466537393, + "learning_rate": 1e-05, + "loss": 1.2439, + "step": 1535 + }, + { + "epoch": 0.6920477585041676, + "grad_norm": 1.6627011146386719, + "learning_rate": 1e-05, + "loss": 1.2044, + "step": 1536 + }, + { + "epoch": 0.6924983104302771, + "grad_norm": 1.7034196511170738, + "learning_rate": 1e-05, + "loss": 1.2172, + "step": 1537 + }, + { + "epoch": 0.6929488623563865, + "grad_norm": 1.6600270926222587, + "learning_rate": 1e-05, + "loss": 1.224, + "step": 1538 + }, + { + "epoch": 0.6933994142824961, + 
"grad_norm": 1.7470572585648798, + "learning_rate": 1e-05, + "loss": 1.2131, + "step": 1539 + }, + { + "epoch": 0.6938499662086055, + "grad_norm": 1.6756058848466726, + "learning_rate": 1e-05, + "loss": 1.2199, + "step": 1540 + }, + { + "epoch": 0.694300518134715, + "grad_norm": 1.6866150389106547, + "learning_rate": 1e-05, + "loss": 1.1937, + "step": 1541 + }, + { + "epoch": 0.6947510700608245, + "grad_norm": 1.8182803750396543, + "learning_rate": 1e-05, + "loss": 1.2329, + "step": 1542 + }, + { + "epoch": 0.695201621986934, + "grad_norm": 1.8179083545886239, + "learning_rate": 1e-05, + "loss": 1.2352, + "step": 1543 + }, + { + "epoch": 0.6956521739130435, + "grad_norm": 1.7826450014663868, + "learning_rate": 1e-05, + "loss": 1.2025, + "step": 1544 + }, + { + "epoch": 0.696102725839153, + "grad_norm": 1.6686981124730706, + "learning_rate": 1e-05, + "loss": 1.1848, + "step": 1545 + }, + { + "epoch": 0.6965532777652624, + "grad_norm": 1.8117545255777887, + "learning_rate": 1e-05, + "loss": 1.1969, + "step": 1546 + }, + { + "epoch": 0.697003829691372, + "grad_norm": 1.6142263149647902, + "learning_rate": 1e-05, + "loss": 1.1976, + "step": 1547 + }, + { + "epoch": 0.6974543816174814, + "grad_norm": 1.6610024951944695, + "learning_rate": 1e-05, + "loss": 1.2529, + "step": 1548 + }, + { + "epoch": 0.6979049335435908, + "grad_norm": 1.6013470250591175, + "learning_rate": 1e-05, + "loss": 1.1785, + "step": 1549 + }, + { + "epoch": 0.6983554854697004, + "grad_norm": 1.7140287273065915, + "learning_rate": 1e-05, + "loss": 1.1858, + "step": 1550 + }, + { + "epoch": 0.6988060373958098, + "grad_norm": 1.7758993147566013, + "learning_rate": 1e-05, + "loss": 1.2053, + "step": 1551 + }, + { + "epoch": 0.6992565893219194, + "grad_norm": 1.797803196846316, + "learning_rate": 1e-05, + "loss": 1.2303, + "step": 1552 + }, + { + "epoch": 0.6997071412480288, + "grad_norm": 1.8656479863941466, + "learning_rate": 1e-05, + "loss": 1.2259, + "step": 1553 + }, + { + "epoch": 0.7001576931741383, + "grad_norm": 1.726345010700787, + "learning_rate": 1e-05, + "loss": 1.1792, + "step": 1554 + }, + { + "epoch": 0.7006082451002478, + "grad_norm": 1.7547131217994272, + "learning_rate": 1e-05, + "loss": 1.2178, + "step": 1555 + }, + { + "epoch": 0.7010587970263573, + "grad_norm": 1.5810118037989145, + "learning_rate": 1e-05, + "loss": 1.1964, + "step": 1556 + }, + { + "epoch": 0.7015093489524667, + "grad_norm": 1.6349651639810352, + "learning_rate": 1e-05, + "loss": 1.1872, + "step": 1557 + }, + { + "epoch": 0.7019599008785763, + "grad_norm": 1.8351718999767024, + "learning_rate": 1e-05, + "loss": 1.2411, + "step": 1558 + }, + { + "epoch": 0.7024104528046857, + "grad_norm": 1.79325340556643, + "learning_rate": 1e-05, + "loss": 1.1546, + "step": 1559 + }, + { + "epoch": 0.7028610047307953, + "grad_norm": 1.6932223721377095, + "learning_rate": 1e-05, + "loss": 1.216, + "step": 1560 + }, + { + "epoch": 0.7033115566569047, + "grad_norm": 1.7572509603512143, + "learning_rate": 1e-05, + "loss": 1.1863, + "step": 1561 + }, + { + "epoch": 0.7037621085830142, + "grad_norm": 1.7345565420160527, + "learning_rate": 1e-05, + "loss": 1.1564, + "step": 1562 + }, + { + "epoch": 0.7042126605091237, + "grad_norm": 1.642208909407649, + "learning_rate": 1e-05, + "loss": 1.1407, + "step": 1563 + }, + { + "epoch": 0.7046632124352331, + "grad_norm": 1.7821873058298203, + "learning_rate": 1e-05, + "loss": 1.1878, + "step": 1564 + }, + { + "epoch": 0.7051137643613427, + "grad_norm": 1.6436486535365389, + "learning_rate": 1e-05, + "loss": 1.1418, + 
"step": 1565 + }, + { + "epoch": 0.7055643162874521, + "grad_norm": 1.7879936125811284, + "learning_rate": 1e-05, + "loss": 1.2328, + "step": 1566 + }, + { + "epoch": 0.7060148682135616, + "grad_norm": 1.7849417260891618, + "learning_rate": 1e-05, + "loss": 1.1732, + "step": 1567 + }, + { + "epoch": 0.7064654201396711, + "grad_norm": 1.5640189291154702, + "learning_rate": 1e-05, + "loss": 1.1912, + "step": 1568 + }, + { + "epoch": 0.7069159720657806, + "grad_norm": 1.847230070236875, + "learning_rate": 1e-05, + "loss": 1.2405, + "step": 1569 + }, + { + "epoch": 0.70736652399189, + "grad_norm": 1.7915035073553303, + "learning_rate": 1e-05, + "loss": 1.248, + "step": 1570 + }, + { + "epoch": 0.7078170759179996, + "grad_norm": 1.669729647831837, + "learning_rate": 1e-05, + "loss": 1.228, + "step": 1571 + }, + { + "epoch": 0.708267627844109, + "grad_norm": 1.753022180495031, + "learning_rate": 1e-05, + "loss": 1.195, + "step": 1572 + }, + { + "epoch": 0.7087181797702186, + "grad_norm": 1.8953343083160659, + "learning_rate": 1e-05, + "loss": 1.2168, + "step": 1573 + }, + { + "epoch": 0.709168731696328, + "grad_norm": 1.7374549036280298, + "learning_rate": 1e-05, + "loss": 1.1432, + "step": 1574 + }, + { + "epoch": 0.7096192836224375, + "grad_norm": 1.8178560560782415, + "learning_rate": 1e-05, + "loss": 1.2117, + "step": 1575 + }, + { + "epoch": 0.710069835548547, + "grad_norm": 1.7536064319434517, + "learning_rate": 1e-05, + "loss": 1.188, + "step": 1576 + }, + { + "epoch": 0.7105203874746564, + "grad_norm": 1.8131392516496259, + "learning_rate": 1e-05, + "loss": 1.1563, + "step": 1577 + }, + { + "epoch": 0.7109709394007659, + "grad_norm": 1.8091956809767324, + "learning_rate": 1e-05, + "loss": 1.1893, + "step": 1578 + }, + { + "epoch": 0.7114214913268754, + "grad_norm": 1.6431862046099364, + "learning_rate": 1e-05, + "loss": 1.2216, + "step": 1579 + }, + { + "epoch": 0.7118720432529849, + "grad_norm": 1.7559439281770257, + "learning_rate": 1e-05, + "loss": 1.1981, + "step": 1580 + }, + { + "epoch": 0.7123225951790944, + "grad_norm": 1.8701009634372734, + "learning_rate": 1e-05, + "loss": 1.1879, + "step": 1581 + }, + { + "epoch": 0.7127731471052039, + "grad_norm": 1.7142600163138828, + "learning_rate": 1e-05, + "loss": 1.2113, + "step": 1582 + }, + { + "epoch": 0.7132236990313133, + "grad_norm": 1.9174721675261706, + "learning_rate": 1e-05, + "loss": 1.2255, + "step": 1583 + }, + { + "epoch": 0.7136742509574229, + "grad_norm": 1.7559177222668094, + "learning_rate": 1e-05, + "loss": 1.1863, + "step": 1584 + }, + { + "epoch": 0.7141248028835323, + "grad_norm": 1.748516334016906, + "learning_rate": 1e-05, + "loss": 1.2024, + "step": 1585 + }, + { + "epoch": 0.7145753548096418, + "grad_norm": 1.7933589848387923, + "learning_rate": 1e-05, + "loss": 1.1801, + "step": 1586 + }, + { + "epoch": 0.7150259067357513, + "grad_norm": 1.9270765133493692, + "learning_rate": 1e-05, + "loss": 1.1939, + "step": 1587 + }, + { + "epoch": 0.7154764586618608, + "grad_norm": 1.7346018389981674, + "learning_rate": 1e-05, + "loss": 1.2283, + "step": 1588 + }, + { + "epoch": 0.7159270105879703, + "grad_norm": 1.7325242832750585, + "learning_rate": 1e-05, + "loss": 1.1963, + "step": 1589 + }, + { + "epoch": 0.7163775625140798, + "grad_norm": 1.7444000627612912, + "learning_rate": 1e-05, + "loss": 1.1531, + "step": 1590 + }, + { + "epoch": 0.7168281144401892, + "grad_norm": 1.8912072864723126, + "learning_rate": 1e-05, + "loss": 1.1566, + "step": 1591 + }, + { + "epoch": 0.7172786663662987, + "grad_norm": 
1.733590476278565, + "learning_rate": 1e-05, + "loss": 1.2231, + "step": 1592 + }, + { + "epoch": 0.7177292182924082, + "grad_norm": 1.749969276816314, + "learning_rate": 1e-05, + "loss": 1.2064, + "step": 1593 + }, + { + "epoch": 0.7181797702185176, + "grad_norm": 1.7989817552283915, + "learning_rate": 1e-05, + "loss": 1.2495, + "step": 1594 + }, + { + "epoch": 0.7186303221446272, + "grad_norm": 1.6821062511170501, + "learning_rate": 1e-05, + "loss": 1.2158, + "step": 1595 + }, + { + "epoch": 0.7190808740707366, + "grad_norm": 1.724992081467261, + "learning_rate": 1e-05, + "loss": 1.1243, + "step": 1596 + }, + { + "epoch": 0.7195314259968462, + "grad_norm": 1.5558985298158818, + "learning_rate": 1e-05, + "loss": 1.1994, + "step": 1597 + }, + { + "epoch": 0.7199819779229556, + "grad_norm": 1.6563535657227137, + "learning_rate": 1e-05, + "loss": 1.2341, + "step": 1598 + }, + { + "epoch": 0.7204325298490651, + "grad_norm": 1.710169957463851, + "learning_rate": 1e-05, + "loss": 1.2425, + "step": 1599 + }, + { + "epoch": 0.7208830817751746, + "grad_norm": 1.7955061952535722, + "learning_rate": 1e-05, + "loss": 1.217, + "step": 1600 + }, + { + "epoch": 0.7213336337012841, + "grad_norm": 1.963471144681431, + "learning_rate": 1e-05, + "loss": 1.1847, + "step": 1601 + }, + { + "epoch": 0.7217841856273935, + "grad_norm": 1.7220523584163132, + "learning_rate": 1e-05, + "loss": 1.2386, + "step": 1602 + }, + { + "epoch": 0.7222347375535031, + "grad_norm": 1.6971781187080373, + "learning_rate": 1e-05, + "loss": 1.2152, + "step": 1603 + }, + { + "epoch": 0.7226852894796125, + "grad_norm": 1.7770582851966428, + "learning_rate": 1e-05, + "loss": 1.1843, + "step": 1604 + }, + { + "epoch": 0.723135841405722, + "grad_norm": 1.79805152863676, + "learning_rate": 1e-05, + "loss": 1.2518, + "step": 1605 + }, + { + "epoch": 0.7235863933318315, + "grad_norm": 1.9077231530916592, + "learning_rate": 1e-05, + "loss": 1.1668, + "step": 1606 + }, + { + "epoch": 0.7240369452579409, + "grad_norm": 1.5492100134600801, + "learning_rate": 1e-05, + "loss": 1.2113, + "step": 1607 + }, + { + "epoch": 0.7244874971840505, + "grad_norm": 1.9142045705248958, + "learning_rate": 1e-05, + "loss": 1.2466, + "step": 1608 + }, + { + "epoch": 0.7249380491101599, + "grad_norm": 1.6101992830507001, + "learning_rate": 1e-05, + "loss": 1.1995, + "step": 1609 + }, + { + "epoch": 0.7253886010362695, + "grad_norm": 1.7515548027062005, + "learning_rate": 1e-05, + "loss": 1.1555, + "step": 1610 + }, + { + "epoch": 0.7258391529623789, + "grad_norm": 1.6762276735393804, + "learning_rate": 1e-05, + "loss": 1.229, + "step": 1611 + }, + { + "epoch": 0.7262897048884884, + "grad_norm": 1.627656753569499, + "learning_rate": 1e-05, + "loss": 1.1657, + "step": 1612 + }, + { + "epoch": 0.7267402568145979, + "grad_norm": 1.7496013678598379, + "learning_rate": 1e-05, + "loss": 1.2081, + "step": 1613 + }, + { + "epoch": 0.7271908087407074, + "grad_norm": 1.8545784833141947, + "learning_rate": 1e-05, + "loss": 1.2059, + "step": 1614 + }, + { + "epoch": 0.7276413606668168, + "grad_norm": 2.00188202430048, + "learning_rate": 1e-05, + "loss": 1.2155, + "step": 1615 + }, + { + "epoch": 0.7280919125929264, + "grad_norm": 1.9014407802443913, + "learning_rate": 1e-05, + "loss": 1.2485, + "step": 1616 + }, + { + "epoch": 0.7285424645190358, + "grad_norm": 1.6826040941086706, + "learning_rate": 1e-05, + "loss": 1.2039, + "step": 1617 + }, + { + "epoch": 0.7289930164451452, + "grad_norm": 1.7676232423354608, + "learning_rate": 1e-05, + "loss": 1.2278, + "step": 1618 + }, 
+ { + "epoch": 0.7294435683712548, + "grad_norm": 1.7829262719988204, + "learning_rate": 1e-05, + "loss": 1.1751, + "step": 1619 + }, + { + "epoch": 0.7298941202973642, + "grad_norm": 1.6967225202117673, + "learning_rate": 1e-05, + "loss": 1.2246, + "step": 1620 + }, + { + "epoch": 0.7303446722234738, + "grad_norm": 1.7824532206264245, + "learning_rate": 1e-05, + "loss": 1.1883, + "step": 1621 + }, + { + "epoch": 0.7307952241495832, + "grad_norm": 1.8185538924815645, + "learning_rate": 1e-05, + "loss": 1.2015, + "step": 1622 + }, + { + "epoch": 0.7312457760756927, + "grad_norm": 1.6654605627242565, + "learning_rate": 1e-05, + "loss": 1.1439, + "step": 1623 + }, + { + "epoch": 0.7316963280018022, + "grad_norm": 1.8029330897676907, + "learning_rate": 1e-05, + "loss": 1.2079, + "step": 1624 + }, + { + "epoch": 0.7321468799279117, + "grad_norm": 1.7215814233611655, + "learning_rate": 1e-05, + "loss": 1.1704, + "step": 1625 + }, + { + "epoch": 0.7325974318540212, + "grad_norm": 1.6040166139979504, + "learning_rate": 1e-05, + "loss": 1.1985, + "step": 1626 + }, + { + "epoch": 0.7330479837801307, + "grad_norm": 1.7937428268112714, + "learning_rate": 1e-05, + "loss": 1.2049, + "step": 1627 + }, + { + "epoch": 0.7334985357062401, + "grad_norm": 1.9822342995484614, + "learning_rate": 1e-05, + "loss": 1.1894, + "step": 1628 + }, + { + "epoch": 0.7339490876323497, + "grad_norm": 1.7672706967543768, + "learning_rate": 1e-05, + "loss": 1.2336, + "step": 1629 + }, + { + "epoch": 0.7343996395584591, + "grad_norm": 1.631220283803437, + "learning_rate": 1e-05, + "loss": 1.2061, + "step": 1630 + }, + { + "epoch": 0.7348501914845686, + "grad_norm": 1.7624571913302236, + "learning_rate": 1e-05, + "loss": 1.2216, + "step": 1631 + }, + { + "epoch": 0.7353007434106781, + "grad_norm": 1.7053116058692035, + "learning_rate": 1e-05, + "loss": 1.1874, + "step": 1632 + }, + { + "epoch": 0.7357512953367875, + "grad_norm": 1.783535417096981, + "learning_rate": 1e-05, + "loss": 1.2203, + "step": 1633 + }, + { + "epoch": 0.7362018472628971, + "grad_norm": 1.847392777457146, + "learning_rate": 1e-05, + "loss": 1.2229, + "step": 1634 + }, + { + "epoch": 0.7366523991890065, + "grad_norm": 1.9496620431006855, + "learning_rate": 1e-05, + "loss": 1.2311, + "step": 1635 + }, + { + "epoch": 0.737102951115116, + "grad_norm": 1.8380697829735286, + "learning_rate": 1e-05, + "loss": 1.2176, + "step": 1636 + }, + { + "epoch": 0.7375535030412255, + "grad_norm": 1.5333611622857826, + "learning_rate": 1e-05, + "loss": 1.1909, + "step": 1637 + }, + { + "epoch": 0.738004054967335, + "grad_norm": 1.9704628520280674, + "learning_rate": 1e-05, + "loss": 1.1882, + "step": 1638 + }, + { + "epoch": 0.7384546068934444, + "grad_norm": 1.7125255731444782, + "learning_rate": 1e-05, + "loss": 1.1701, + "step": 1639 + }, + { + "epoch": 0.738905158819554, + "grad_norm": 1.7479134783417707, + "learning_rate": 1e-05, + "loss": 1.2084, + "step": 1640 + }, + { + "epoch": 0.7393557107456634, + "grad_norm": 1.6900244842691479, + "learning_rate": 1e-05, + "loss": 1.1688, + "step": 1641 + }, + { + "epoch": 0.739806262671773, + "grad_norm": 1.7304043413819028, + "learning_rate": 1e-05, + "loss": 1.2006, + "step": 1642 + }, + { + "epoch": 0.7402568145978824, + "grad_norm": 1.7264034979823348, + "learning_rate": 1e-05, + "loss": 1.2405, + "step": 1643 + }, + { + "epoch": 0.7407073665239919, + "grad_norm": 1.7754883669138675, + "learning_rate": 1e-05, + "loss": 1.1632, + "step": 1644 + }, + { + "epoch": 0.7411579184501014, + "grad_norm": 1.6539240626012026, + 
"learning_rate": 1e-05, + "loss": 1.2114, + "step": 1645 + }, + { + "epoch": 0.7416084703762108, + "grad_norm": 1.7747504452166638, + "learning_rate": 1e-05, + "loss": 1.1517, + "step": 1646 + }, + { + "epoch": 0.7420590223023203, + "grad_norm": 1.6843163661125733, + "learning_rate": 1e-05, + "loss": 1.1739, + "step": 1647 + }, + { + "epoch": 0.7425095742284298, + "grad_norm": 1.654873351842606, + "learning_rate": 1e-05, + "loss": 1.2159, + "step": 1648 + }, + { + "epoch": 0.7429601261545393, + "grad_norm": 1.7765851776191028, + "learning_rate": 1e-05, + "loss": 1.1683, + "step": 1649 + }, + { + "epoch": 0.7434106780806488, + "grad_norm": 1.766473346190109, + "learning_rate": 1e-05, + "loss": 1.1727, + "step": 1650 + }, + { + "epoch": 0.7438612300067583, + "grad_norm": 1.7732215218572276, + "learning_rate": 1e-05, + "loss": 1.1981, + "step": 1651 + }, + { + "epoch": 0.7443117819328677, + "grad_norm": 1.765700001263744, + "learning_rate": 1e-05, + "loss": 1.199, + "step": 1652 + }, + { + "epoch": 0.7447623338589773, + "grad_norm": 1.6173817287431251, + "learning_rate": 1e-05, + "loss": 1.1757, + "step": 1653 + }, + { + "epoch": 0.7452128857850867, + "grad_norm": 1.7428494378215253, + "learning_rate": 1e-05, + "loss": 1.215, + "step": 1654 + }, + { + "epoch": 0.7456634377111963, + "grad_norm": 1.814149946273443, + "learning_rate": 1e-05, + "loss": 1.173, + "step": 1655 + }, + { + "epoch": 0.7461139896373057, + "grad_norm": 1.7816463532898834, + "learning_rate": 1e-05, + "loss": 1.1542, + "step": 1656 + }, + { + "epoch": 0.7465645415634152, + "grad_norm": 1.6724681088945228, + "learning_rate": 1e-05, + "loss": 1.151, + "step": 1657 + }, + { + "epoch": 0.7470150934895247, + "grad_norm": 1.7304730114747535, + "learning_rate": 1e-05, + "loss": 1.1957, + "step": 1658 + }, + { + "epoch": 0.7474656454156342, + "grad_norm": 1.9411514227103825, + "learning_rate": 1e-05, + "loss": 1.1814, + "step": 1659 + }, + { + "epoch": 0.7479161973417436, + "grad_norm": 1.568979236716191, + "learning_rate": 1e-05, + "loss": 1.2002, + "step": 1660 + }, + { + "epoch": 0.7483667492678531, + "grad_norm": 1.6982277329516287, + "learning_rate": 1e-05, + "loss": 1.1841, + "step": 1661 + }, + { + "epoch": 0.7488173011939626, + "grad_norm": 1.6759893107103216, + "learning_rate": 1e-05, + "loss": 1.2325, + "step": 1662 + }, + { + "epoch": 0.749267853120072, + "grad_norm": 1.6620514013752155, + "learning_rate": 1e-05, + "loss": 1.1932, + "step": 1663 + }, + { + "epoch": 0.7497184050461816, + "grad_norm": 1.8423705266471422, + "learning_rate": 1e-05, + "loss": 1.2196, + "step": 1664 + }, + { + "epoch": 0.750168956972291, + "grad_norm": 1.708307386102823, + "learning_rate": 1e-05, + "loss": 1.1776, + "step": 1665 + }, + { + "epoch": 0.7506195088984006, + "grad_norm": 1.6022053767174338, + "learning_rate": 1e-05, + "loss": 1.2495, + "step": 1666 + }, + { + "epoch": 0.75107006082451, + "grad_norm": 1.561987979726797, + "learning_rate": 1e-05, + "loss": 1.1684, + "step": 1667 + }, + { + "epoch": 0.7515206127506195, + "grad_norm": 1.8992134971270394, + "learning_rate": 1e-05, + "loss": 1.1751, + "step": 1668 + }, + { + "epoch": 0.751971164676729, + "grad_norm": 1.8111560222561016, + "learning_rate": 1e-05, + "loss": 1.1868, + "step": 1669 + }, + { + "epoch": 0.7524217166028385, + "grad_norm": 1.8190244202436154, + "learning_rate": 1e-05, + "loss": 1.2032, + "step": 1670 + }, + { + "epoch": 0.752872268528948, + "grad_norm": 1.6469750299329777, + "learning_rate": 1e-05, + "loss": 1.2034, + "step": 1671 + }, + { + "epoch": 
0.7533228204550575, + "grad_norm": 1.8926937557156034, + "learning_rate": 1e-05, + "loss": 1.2443, + "step": 1672 + }, + { + "epoch": 0.7537733723811669, + "grad_norm": 1.8153985153002195, + "learning_rate": 1e-05, + "loss": 1.2156, + "step": 1673 + }, + { + "epoch": 0.7542239243072764, + "grad_norm": 1.7348525698050452, + "learning_rate": 1e-05, + "loss": 1.2106, + "step": 1674 + }, + { + "epoch": 0.7546744762333859, + "grad_norm": 1.661136537982251, + "learning_rate": 1e-05, + "loss": 1.169, + "step": 1675 + }, + { + "epoch": 0.7551250281594953, + "grad_norm": 1.6169169082935835, + "learning_rate": 1e-05, + "loss": 1.2337, + "step": 1676 + }, + { + "epoch": 0.7555755800856049, + "grad_norm": 1.7406684017522813, + "learning_rate": 1e-05, + "loss": 1.2192, + "step": 1677 + }, + { + "epoch": 0.7560261320117143, + "grad_norm": 1.8106982172244952, + "learning_rate": 1e-05, + "loss": 1.1707, + "step": 1678 + }, + { + "epoch": 0.7564766839378239, + "grad_norm": 1.6359043165430858, + "learning_rate": 1e-05, + "loss": 1.2168, + "step": 1679 + }, + { + "epoch": 0.7569272358639333, + "grad_norm": 1.7071707918397376, + "learning_rate": 1e-05, + "loss": 1.2194, + "step": 1680 + }, + { + "epoch": 0.7573777877900428, + "grad_norm": 1.8100793765227274, + "learning_rate": 1e-05, + "loss": 1.1894, + "step": 1681 + }, + { + "epoch": 0.7578283397161523, + "grad_norm": 1.838337039847543, + "learning_rate": 1e-05, + "loss": 1.2753, + "step": 1682 + }, + { + "epoch": 0.7582788916422618, + "grad_norm": 1.6451626473007042, + "learning_rate": 1e-05, + "loss": 1.1912, + "step": 1683 + }, + { + "epoch": 0.7587294435683712, + "grad_norm": 1.7967639278107814, + "learning_rate": 1e-05, + "loss": 1.2071, + "step": 1684 + }, + { + "epoch": 0.7591799954944808, + "grad_norm": 1.6660682047340702, + "learning_rate": 1e-05, + "loss": 1.1666, + "step": 1685 + }, + { + "epoch": 0.7596305474205902, + "grad_norm": 1.644060554767546, + "learning_rate": 1e-05, + "loss": 1.2144, + "step": 1686 + }, + { + "epoch": 0.7600810993466998, + "grad_norm": 1.7529047759013408, + "learning_rate": 1e-05, + "loss": 1.1907, + "step": 1687 + }, + { + "epoch": 0.7605316512728092, + "grad_norm": 1.7081032194106698, + "learning_rate": 1e-05, + "loss": 1.205, + "step": 1688 + }, + { + "epoch": 0.7609822031989186, + "grad_norm": 1.6534657086617957, + "learning_rate": 1e-05, + "loss": 1.1923, + "step": 1689 + }, + { + "epoch": 0.7614327551250282, + "grad_norm": 1.7220403657284369, + "learning_rate": 1e-05, + "loss": 1.1939, + "step": 1690 + }, + { + "epoch": 0.7618833070511376, + "grad_norm": 1.6483255299122568, + "learning_rate": 1e-05, + "loss": 1.1956, + "step": 1691 + }, + { + "epoch": 0.7623338589772471, + "grad_norm": 1.796404617306694, + "learning_rate": 1e-05, + "loss": 1.2375, + "step": 1692 + }, + { + "epoch": 0.7627844109033566, + "grad_norm": 1.7868456616778812, + "learning_rate": 1e-05, + "loss": 1.1456, + "step": 1693 + }, + { + "epoch": 0.7632349628294661, + "grad_norm": 1.7165716057910696, + "learning_rate": 1e-05, + "loss": 1.1772, + "step": 1694 + }, + { + "epoch": 0.7636855147555756, + "grad_norm": 1.7780125742011994, + "learning_rate": 1e-05, + "loss": 1.1915, + "step": 1695 + }, + { + "epoch": 0.7641360666816851, + "grad_norm": 1.6989972107664026, + "learning_rate": 1e-05, + "loss": 1.2096, + "step": 1696 + }, + { + "epoch": 0.7645866186077945, + "grad_norm": 1.6383750155560448, + "learning_rate": 1e-05, + "loss": 1.235, + "step": 1697 + }, + { + "epoch": 0.7650371705339041, + "grad_norm": 1.8560501612834974, + "learning_rate": 
1e-05, + "loss": 1.2285, + "step": 1698 + }, + { + "epoch": 0.7654877224600135, + "grad_norm": 1.909815939253733, + "learning_rate": 1e-05, + "loss": 1.2085, + "step": 1699 + }, + { + "epoch": 0.765938274386123, + "grad_norm": 1.7146072974709152, + "learning_rate": 1e-05, + "loss": 1.1551, + "step": 1700 + }, + { + "epoch": 0.7663888263122325, + "grad_norm": 1.7092825861043606, + "learning_rate": 1e-05, + "loss": 1.1833, + "step": 1701 + }, + { + "epoch": 0.7668393782383419, + "grad_norm": 1.9280825836498556, + "learning_rate": 1e-05, + "loss": 1.1698, + "step": 1702 + }, + { + "epoch": 0.7672899301644515, + "grad_norm": 1.5595640210310495, + "learning_rate": 1e-05, + "loss": 1.2188, + "step": 1703 + }, + { + "epoch": 0.7677404820905609, + "grad_norm": 1.5818772470951465, + "learning_rate": 1e-05, + "loss": 1.1521, + "step": 1704 + }, + { + "epoch": 0.7681910340166704, + "grad_norm": 1.7331777117877314, + "learning_rate": 1e-05, + "loss": 1.2049, + "step": 1705 + }, + { + "epoch": 0.7686415859427799, + "grad_norm": 1.7539450858019312, + "learning_rate": 1e-05, + "loss": 1.2063, + "step": 1706 + }, + { + "epoch": 0.7690921378688894, + "grad_norm": 1.7403663376766787, + "learning_rate": 1e-05, + "loss": 1.1681, + "step": 1707 + }, + { + "epoch": 0.7695426897949988, + "grad_norm": 1.7345378343179203, + "learning_rate": 1e-05, + "loss": 1.1693, + "step": 1708 + }, + { + "epoch": 0.7699932417211084, + "grad_norm": 1.81637035399044, + "learning_rate": 1e-05, + "loss": 1.2041, + "step": 1709 + }, + { + "epoch": 0.7704437936472178, + "grad_norm": 1.6417819735404637, + "learning_rate": 1e-05, + "loss": 1.1634, + "step": 1710 + }, + { + "epoch": 0.7708943455733274, + "grad_norm": 1.8461910039021008, + "learning_rate": 1e-05, + "loss": 1.1926, + "step": 1711 + }, + { + "epoch": 0.7713448974994368, + "grad_norm": 1.6440018690387175, + "learning_rate": 1e-05, + "loss": 1.1833, + "step": 1712 + }, + { + "epoch": 0.7717954494255463, + "grad_norm": 1.8508994498444773, + "learning_rate": 1e-05, + "loss": 1.2303, + "step": 1713 + }, + { + "epoch": 0.7722460013516558, + "grad_norm": 1.7443234713383082, + "learning_rate": 1e-05, + "loss": 1.2142, + "step": 1714 + }, + { + "epoch": 0.7726965532777652, + "grad_norm": 1.7168353461896915, + "learning_rate": 1e-05, + "loss": 1.2048, + "step": 1715 + }, + { + "epoch": 0.7731471052038748, + "grad_norm": 1.6190431946454633, + "learning_rate": 1e-05, + "loss": 1.1731, + "step": 1716 + }, + { + "epoch": 0.7735976571299842, + "grad_norm": 1.819388151551019, + "learning_rate": 1e-05, + "loss": 1.199, + "step": 1717 + }, + { + "epoch": 0.7740482090560937, + "grad_norm": 1.7030116397503874, + "learning_rate": 1e-05, + "loss": 1.1851, + "step": 1718 + }, + { + "epoch": 0.7744987609822032, + "grad_norm": 1.7229972907895592, + "learning_rate": 1e-05, + "loss": 1.2051, + "step": 1719 + }, + { + "epoch": 0.7749493129083127, + "grad_norm": 1.7504320739342736, + "learning_rate": 1e-05, + "loss": 1.2057, + "step": 1720 + }, + { + "epoch": 0.7753998648344221, + "grad_norm": 1.6323365236334266, + "learning_rate": 1e-05, + "loss": 1.1922, + "step": 1721 + }, + { + "epoch": 0.7758504167605317, + "grad_norm": 1.7472841784426592, + "learning_rate": 1e-05, + "loss": 1.1621, + "step": 1722 + }, + { + "epoch": 0.7763009686866411, + "grad_norm": 1.6314516314515433, + "learning_rate": 1e-05, + "loss": 1.1741, + "step": 1723 + }, + { + "epoch": 0.7767515206127507, + "grad_norm": 1.7208094873956943, + "learning_rate": 1e-05, + "loss": 1.2228, + "step": 1724 + }, + { + "epoch": 
0.7772020725388601, + "grad_norm": 1.6848426955294054, + "learning_rate": 1e-05, + "loss": 1.1896, + "step": 1725 + }, + { + "epoch": 0.7776526244649696, + "grad_norm": 1.866021427559413, + "learning_rate": 1e-05, + "loss": 1.2043, + "step": 1726 + }, + { + "epoch": 0.7781031763910791, + "grad_norm": 1.8912442275828933, + "learning_rate": 1e-05, + "loss": 1.1582, + "step": 1727 + }, + { + "epoch": 0.7785537283171886, + "grad_norm": 1.71422552420474, + "learning_rate": 1e-05, + "loss": 1.1946, + "step": 1728 + }, + { + "epoch": 0.779004280243298, + "grad_norm": 1.6427767378374356, + "learning_rate": 1e-05, + "loss": 1.2069, + "step": 1729 + }, + { + "epoch": 0.7794548321694075, + "grad_norm": 1.6066707395149764, + "learning_rate": 1e-05, + "loss": 1.1584, + "step": 1730 + }, + { + "epoch": 0.779905384095517, + "grad_norm": 1.7181703507041595, + "learning_rate": 1e-05, + "loss": 1.1939, + "step": 1731 + }, + { + "epoch": 0.7803559360216265, + "grad_norm": 1.7829887511924638, + "learning_rate": 1e-05, + "loss": 1.1991, + "step": 1732 + }, + { + "epoch": 0.780806487947736, + "grad_norm": 1.673584325888017, + "learning_rate": 1e-05, + "loss": 1.2075, + "step": 1733 + }, + { + "epoch": 0.7812570398738454, + "grad_norm": 1.757262862429782, + "learning_rate": 1e-05, + "loss": 1.243, + "step": 1734 + }, + { + "epoch": 0.781707591799955, + "grad_norm": 1.8297059704504663, + "learning_rate": 1e-05, + "loss": 1.2154, + "step": 1735 + }, + { + "epoch": 0.7821581437260644, + "grad_norm": 1.7922089407986124, + "learning_rate": 1e-05, + "loss": 1.2178, + "step": 1736 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 1.699372079542919, + "learning_rate": 1e-05, + "loss": 1.1847, + "step": 1737 + }, + { + "epoch": 0.7830592475782834, + "grad_norm": 1.812467744918672, + "learning_rate": 1e-05, + "loss": 1.1993, + "step": 1738 + }, + { + "epoch": 0.7835097995043929, + "grad_norm": 1.806682015541317, + "learning_rate": 1e-05, + "loss": 1.2317, + "step": 1739 + }, + { + "epoch": 0.7839603514305024, + "grad_norm": 2.0428325187686154, + "learning_rate": 1e-05, + "loss": 1.22, + "step": 1740 + }, + { + "epoch": 0.7844109033566119, + "grad_norm": 1.6579826323082347, + "learning_rate": 1e-05, + "loss": 1.152, + "step": 1741 + }, + { + "epoch": 0.7848614552827213, + "grad_norm": 1.6688203186414754, + "learning_rate": 1e-05, + "loss": 1.2005, + "step": 1742 + }, + { + "epoch": 0.7853120072088308, + "grad_norm": 1.646632420536205, + "learning_rate": 1e-05, + "loss": 1.1693, + "step": 1743 + }, + { + "epoch": 0.7857625591349403, + "grad_norm": 1.7542126824558943, + "learning_rate": 1e-05, + "loss": 1.1658, + "step": 1744 + }, + { + "epoch": 0.7862131110610497, + "grad_norm": 1.9774026203805404, + "learning_rate": 1e-05, + "loss": 1.1946, + "step": 1745 + }, + { + "epoch": 0.7866636629871593, + "grad_norm": 1.9150771509861109, + "learning_rate": 1e-05, + "loss": 1.1811, + "step": 1746 + }, + { + "epoch": 0.7871142149132687, + "grad_norm": 1.9746730698587291, + "learning_rate": 1e-05, + "loss": 1.2175, + "step": 1747 + }, + { + "epoch": 0.7875647668393783, + "grad_norm": 1.8540179428694414, + "learning_rate": 1e-05, + "loss": 1.2127, + "step": 1748 + }, + { + "epoch": 0.7880153187654877, + "grad_norm": 1.7878971194265023, + "learning_rate": 1e-05, + "loss": 1.1843, + "step": 1749 + }, + { + "epoch": 0.7884658706915972, + "grad_norm": 1.7091001177940246, + "learning_rate": 1e-05, + "loss": 1.2027, + "step": 1750 + }, + { + "epoch": 0.7889164226177067, + "grad_norm": 1.6117635444294602, + "learning_rate": 1e-05, + 
"loss": 1.2083, + "step": 1751 + }, + { + "epoch": 0.7893669745438162, + "grad_norm": 1.8820711179069227, + "learning_rate": 1e-05, + "loss": 1.2228, + "step": 1752 + }, + { + "epoch": 0.7898175264699256, + "grad_norm": 1.6934011531025193, + "learning_rate": 1e-05, + "loss": 1.1865, + "step": 1753 + }, + { + "epoch": 0.7902680783960352, + "grad_norm": 1.74142677650193, + "learning_rate": 1e-05, + "loss": 1.2068, + "step": 1754 + }, + { + "epoch": 0.7907186303221446, + "grad_norm": 1.8215817246840316, + "learning_rate": 1e-05, + "loss": 1.1876, + "step": 1755 + }, + { + "epoch": 0.7911691822482542, + "grad_norm": 1.7720699713860941, + "learning_rate": 1e-05, + "loss": 1.2015, + "step": 1756 + }, + { + "epoch": 0.7916197341743636, + "grad_norm": 1.7469765363624727, + "learning_rate": 1e-05, + "loss": 1.1414, + "step": 1757 + }, + { + "epoch": 0.792070286100473, + "grad_norm": 1.8180964693581891, + "learning_rate": 1e-05, + "loss": 1.2038, + "step": 1758 + }, + { + "epoch": 0.7925208380265826, + "grad_norm": 1.6796050239341567, + "learning_rate": 1e-05, + "loss": 1.2388, + "step": 1759 + }, + { + "epoch": 0.792971389952692, + "grad_norm": 1.8410353568960462, + "learning_rate": 1e-05, + "loss": 1.1842, + "step": 1760 + }, + { + "epoch": 0.7934219418788016, + "grad_norm": 1.7180786562319967, + "learning_rate": 1e-05, + "loss": 1.2057, + "step": 1761 + }, + { + "epoch": 0.793872493804911, + "grad_norm": 1.7048127655518062, + "learning_rate": 1e-05, + "loss": 1.1976, + "step": 1762 + }, + { + "epoch": 0.7943230457310205, + "grad_norm": 1.875493721146123, + "learning_rate": 1e-05, + "loss": 1.2299, + "step": 1763 + }, + { + "epoch": 0.79477359765713, + "grad_norm": 1.6894023148868924, + "learning_rate": 1e-05, + "loss": 1.1831, + "step": 1764 + }, + { + "epoch": 0.7952241495832395, + "grad_norm": 1.7007836557399247, + "learning_rate": 1e-05, + "loss": 1.1997, + "step": 1765 + }, + { + "epoch": 0.7956747015093489, + "grad_norm": 1.7234616576952273, + "learning_rate": 1e-05, + "loss": 1.1705, + "step": 1766 + }, + { + "epoch": 0.7961252534354585, + "grad_norm": 1.6785410428689458, + "learning_rate": 1e-05, + "loss": 1.2251, + "step": 1767 + }, + { + "epoch": 0.7965758053615679, + "grad_norm": 1.7915367124382264, + "learning_rate": 1e-05, + "loss": 1.2132, + "step": 1768 + }, + { + "epoch": 0.7970263572876775, + "grad_norm": 1.7408723133985373, + "learning_rate": 1e-05, + "loss": 1.18, + "step": 1769 + }, + { + "epoch": 0.7974769092137869, + "grad_norm": 1.579411643669021, + "learning_rate": 1e-05, + "loss": 1.1705, + "step": 1770 + }, + { + "epoch": 0.7979274611398963, + "grad_norm": 1.7059046806169196, + "learning_rate": 1e-05, + "loss": 1.2293, + "step": 1771 + }, + { + "epoch": 0.7983780130660059, + "grad_norm": 1.556289164485916, + "learning_rate": 1e-05, + "loss": 1.2469, + "step": 1772 + }, + { + "epoch": 0.7988285649921153, + "grad_norm": 1.80268197301653, + "learning_rate": 1e-05, + "loss": 1.2354, + "step": 1773 + }, + { + "epoch": 0.7992791169182248, + "grad_norm": 1.6570194827647189, + "learning_rate": 1e-05, + "loss": 1.2282, + "step": 1774 + }, + { + "epoch": 0.7997296688443343, + "grad_norm": 1.6781340182826867, + "learning_rate": 1e-05, + "loss": 1.1614, + "step": 1775 + }, + { + "epoch": 0.8001802207704438, + "grad_norm": 1.9247040424846475, + "learning_rate": 1e-05, + "loss": 1.179, + "step": 1776 + }, + { + "epoch": 0.8006307726965533, + "grad_norm": 1.7275158966315673, + "learning_rate": 1e-05, + "loss": 1.2007, + "step": 1777 + }, + { + "epoch": 0.8010813246226628, + "grad_norm": 
1.752167686702428, + "learning_rate": 1e-05, + "loss": 1.1741, + "step": 1778 + }, + { + "epoch": 0.8015318765487722, + "grad_norm": 1.6709785896835017, + "learning_rate": 1e-05, + "loss": 1.2334, + "step": 1779 + }, + { + "epoch": 0.8019824284748818, + "grad_norm": 1.7216061976974615, + "learning_rate": 1e-05, + "loss": 1.1788, + "step": 1780 + }, + { + "epoch": 0.8024329804009912, + "grad_norm": 1.659990474406928, + "learning_rate": 1e-05, + "loss": 1.1738, + "step": 1781 + }, + { + "epoch": 0.8028835323271007, + "grad_norm": 1.7641067294313082, + "learning_rate": 1e-05, + "loss": 1.1689, + "step": 1782 + }, + { + "epoch": 0.8033340842532102, + "grad_norm": 1.8687111423180383, + "learning_rate": 1e-05, + "loss": 1.2167, + "step": 1783 + }, + { + "epoch": 0.8037846361793196, + "grad_norm": 1.6467592276784964, + "learning_rate": 1e-05, + "loss": 1.1728, + "step": 1784 + }, + { + "epoch": 0.8042351881054292, + "grad_norm": 1.8845041751538107, + "learning_rate": 1e-05, + "loss": 1.1963, + "step": 1785 + }, + { + "epoch": 0.8046857400315386, + "grad_norm": 1.6606200852666957, + "learning_rate": 1e-05, + "loss": 1.2419, + "step": 1786 + }, + { + "epoch": 0.8051362919576481, + "grad_norm": 1.7195875080928165, + "learning_rate": 1e-05, + "loss": 1.2053, + "step": 1787 + }, + { + "epoch": 0.8055868438837576, + "grad_norm": 1.7450042789170206, + "learning_rate": 1e-05, + "loss": 1.1653, + "step": 1788 + }, + { + "epoch": 0.8060373958098671, + "grad_norm": 1.607903307986601, + "learning_rate": 1e-05, + "loss": 1.2252, + "step": 1789 + }, + { + "epoch": 0.8064879477359765, + "grad_norm": 1.6291198311827304, + "learning_rate": 1e-05, + "loss": 1.1807, + "step": 1790 + }, + { + "epoch": 0.8069384996620861, + "grad_norm": 1.7570954418935782, + "learning_rate": 1e-05, + "loss": 1.1372, + "step": 1791 + }, + { + "epoch": 0.8073890515881955, + "grad_norm": 1.6940183248513656, + "learning_rate": 1e-05, + "loss": 1.1764, + "step": 1792 + }, + { + "epoch": 0.8078396035143051, + "grad_norm": 1.6715246144432852, + "learning_rate": 1e-05, + "loss": 1.1682, + "step": 1793 + }, + { + "epoch": 0.8082901554404145, + "grad_norm": 1.622579390147315, + "learning_rate": 1e-05, + "loss": 1.1824, + "step": 1794 + }, + { + "epoch": 0.808740707366524, + "grad_norm": 1.6122444971448244, + "learning_rate": 1e-05, + "loss": 1.1938, + "step": 1795 + }, + { + "epoch": 0.8091912592926335, + "grad_norm": 1.673915226874395, + "learning_rate": 1e-05, + "loss": 1.2029, + "step": 1796 + }, + { + "epoch": 0.809641811218743, + "grad_norm": 1.6289651786498598, + "learning_rate": 1e-05, + "loss": 1.221, + "step": 1797 + }, + { + "epoch": 0.8100923631448524, + "grad_norm": 1.8574225182558701, + "learning_rate": 1e-05, + "loss": 1.1884, + "step": 1798 + }, + { + "epoch": 0.8105429150709619, + "grad_norm": 1.7759255504996503, + "learning_rate": 1e-05, + "loss": 1.1806, + "step": 1799 + }, + { + "epoch": 0.8109934669970714, + "grad_norm": 1.6065268073103804, + "learning_rate": 1e-05, + "loss": 1.1875, + "step": 1800 + }, + { + "epoch": 0.8114440189231809, + "grad_norm": 1.8073045942786299, + "learning_rate": 1e-05, + "loss": 1.1745, + "step": 1801 + }, + { + "epoch": 0.8118945708492904, + "grad_norm": 1.5086722747406716, + "learning_rate": 1e-05, + "loss": 1.206, + "step": 1802 + }, + { + "epoch": 0.8123451227753998, + "grad_norm": 1.6410734376734943, + "learning_rate": 1e-05, + "loss": 1.2089, + "step": 1803 + }, + { + "epoch": 0.8127956747015094, + "grad_norm": 1.6696031782373022, + "learning_rate": 1e-05, + "loss": 1.2102, + "step": 1804 
+ }, + { + "epoch": 0.8132462266276188, + "grad_norm": 1.72933695131258, + "learning_rate": 1e-05, + "loss": 1.1674, + "step": 1805 + }, + { + "epoch": 0.8136967785537284, + "grad_norm": 1.6713287790566425, + "learning_rate": 1e-05, + "loss": 1.1421, + "step": 1806 + }, + { + "epoch": 0.8141473304798378, + "grad_norm": 1.555689526971806, + "learning_rate": 1e-05, + "loss": 1.2045, + "step": 1807 + }, + { + "epoch": 0.8145978824059473, + "grad_norm": 1.7107816419209407, + "learning_rate": 1e-05, + "loss": 1.254, + "step": 1808 + }, + { + "epoch": 0.8150484343320568, + "grad_norm": 1.6978797539741441, + "learning_rate": 1e-05, + "loss": 1.1999, + "step": 1809 + }, + { + "epoch": 0.8154989862581663, + "grad_norm": 1.7342072656013985, + "learning_rate": 1e-05, + "loss": 1.161, + "step": 1810 + }, + { + "epoch": 0.8159495381842757, + "grad_norm": 1.7215478015529795, + "learning_rate": 1e-05, + "loss": 1.1938, + "step": 1811 + }, + { + "epoch": 0.8164000901103852, + "grad_norm": 1.8168631125275996, + "learning_rate": 1e-05, + "loss": 1.1559, + "step": 1812 + }, + { + "epoch": 0.8168506420364947, + "grad_norm": 1.6730864612404082, + "learning_rate": 1e-05, + "loss": 1.2148, + "step": 1813 + }, + { + "epoch": 0.8173011939626041, + "grad_norm": 1.794971257052246, + "learning_rate": 1e-05, + "loss": 1.1674, + "step": 1814 + }, + { + "epoch": 0.8177517458887137, + "grad_norm": 1.833276009223717, + "learning_rate": 1e-05, + "loss": 1.1529, + "step": 1815 + }, + { + "epoch": 0.8182022978148231, + "grad_norm": 1.755511955138208, + "learning_rate": 1e-05, + "loss": 1.1837, + "step": 1816 + }, + { + "epoch": 0.8186528497409327, + "grad_norm": 1.8568036299566444, + "learning_rate": 1e-05, + "loss": 1.2397, + "step": 1817 + }, + { + "epoch": 0.8191034016670421, + "grad_norm": 1.730612575135526, + "learning_rate": 1e-05, + "loss": 1.2339, + "step": 1818 + }, + { + "epoch": 0.8195539535931516, + "grad_norm": 1.8010708713586632, + "learning_rate": 1e-05, + "loss": 1.2136, + "step": 1819 + }, + { + "epoch": 0.8200045055192611, + "grad_norm": 1.7522398396456647, + "learning_rate": 1e-05, + "loss": 1.1606, + "step": 1820 + }, + { + "epoch": 0.8204550574453706, + "grad_norm": 1.7871145554348764, + "learning_rate": 1e-05, + "loss": 1.2302, + "step": 1821 + }, + { + "epoch": 0.82090560937148, + "grad_norm": 1.5881068278749262, + "learning_rate": 1e-05, + "loss": 1.2138, + "step": 1822 + }, + { + "epoch": 0.8213561612975896, + "grad_norm": 1.8361277896093844, + "learning_rate": 1e-05, + "loss": 1.1689, + "step": 1823 + }, + { + "epoch": 0.821806713223699, + "grad_norm": 1.5772982242765807, + "learning_rate": 1e-05, + "loss": 1.2147, + "step": 1824 + }, + { + "epoch": 0.8222572651498086, + "grad_norm": 1.6907674511280488, + "learning_rate": 1e-05, + "loss": 1.1599, + "step": 1825 + }, + { + "epoch": 0.822707817075918, + "grad_norm": 1.6915559112820235, + "learning_rate": 1e-05, + "loss": 1.2331, + "step": 1826 + }, + { + "epoch": 0.8231583690020274, + "grad_norm": 1.7271988450470883, + "learning_rate": 1e-05, + "loss": 1.2356, + "step": 1827 + }, + { + "epoch": 0.823608920928137, + "grad_norm": 1.6495148178297072, + "learning_rate": 1e-05, + "loss": 1.1727, + "step": 1828 + }, + { + "epoch": 0.8240594728542464, + "grad_norm": 1.5429531334169466, + "learning_rate": 1e-05, + "loss": 1.2308, + "step": 1829 + }, + { + "epoch": 0.824510024780356, + "grad_norm": 1.6528167550186859, + "learning_rate": 1e-05, + "loss": 1.221, + "step": 1830 + }, + { + "epoch": 0.8249605767064654, + "grad_norm": 1.7028016391985052, + 
"learning_rate": 1e-05, + "loss": 1.2303, + "step": 1831 + }, + { + "epoch": 0.8254111286325749, + "grad_norm": 1.6519648763476065, + "learning_rate": 1e-05, + "loss": 1.1734, + "step": 1832 + }, + { + "epoch": 0.8258616805586844, + "grad_norm": 1.7831930073112774, + "learning_rate": 1e-05, + "loss": 1.1996, + "step": 1833 + }, + { + "epoch": 0.8263122324847939, + "grad_norm": 1.569943737291928, + "learning_rate": 1e-05, + "loss": 1.166, + "step": 1834 + }, + { + "epoch": 0.8267627844109033, + "grad_norm": 1.749828893584674, + "learning_rate": 1e-05, + "loss": 1.1314, + "step": 1835 + }, + { + "epoch": 0.8272133363370129, + "grad_norm": 1.49333625744596, + "learning_rate": 1e-05, + "loss": 1.1943, + "step": 1836 + }, + { + "epoch": 0.8276638882631223, + "grad_norm": 1.701130962185549, + "learning_rate": 1e-05, + "loss": 1.1847, + "step": 1837 + }, + { + "epoch": 0.8281144401892319, + "grad_norm": 1.7597771845463768, + "learning_rate": 1e-05, + "loss": 1.2266, + "step": 1838 + }, + { + "epoch": 0.8285649921153413, + "grad_norm": 1.6822295737976576, + "learning_rate": 1e-05, + "loss": 1.1882, + "step": 1839 + }, + { + "epoch": 0.8290155440414507, + "grad_norm": 1.7578620391435773, + "learning_rate": 1e-05, + "loss": 1.1696, + "step": 1840 + }, + { + "epoch": 0.8294660959675603, + "grad_norm": 1.7515912207578863, + "learning_rate": 1e-05, + "loss": 1.1676, + "step": 1841 + }, + { + "epoch": 0.8299166478936697, + "grad_norm": 1.7140724085890104, + "learning_rate": 1e-05, + "loss": 1.1911, + "step": 1842 + }, + { + "epoch": 0.8303671998197792, + "grad_norm": 1.639741990912705, + "learning_rate": 1e-05, + "loss": 1.1805, + "step": 1843 + }, + { + "epoch": 0.8308177517458887, + "grad_norm": 1.5237143397840152, + "learning_rate": 1e-05, + "loss": 1.1468, + "step": 1844 + }, + { + "epoch": 0.8312683036719982, + "grad_norm": 1.8327144880859152, + "learning_rate": 1e-05, + "loss": 1.2012, + "step": 1845 + }, + { + "epoch": 0.8317188555981077, + "grad_norm": 1.784939482353893, + "learning_rate": 1e-05, + "loss": 1.2006, + "step": 1846 + }, + { + "epoch": 0.8321694075242172, + "grad_norm": 1.6765228375047296, + "learning_rate": 1e-05, + "loss": 1.1382, + "step": 1847 + }, + { + "epoch": 0.8326199594503266, + "grad_norm": 1.6807484329690845, + "learning_rate": 1e-05, + "loss": 1.1825, + "step": 1848 + }, + { + "epoch": 0.8330705113764362, + "grad_norm": 1.6556503576432688, + "learning_rate": 1e-05, + "loss": 1.1642, + "step": 1849 + }, + { + "epoch": 0.8335210633025456, + "grad_norm": 1.8457718442378077, + "learning_rate": 1e-05, + "loss": 1.2018, + "step": 1850 + }, + { + "epoch": 0.8339716152286551, + "grad_norm": 1.7386324171860397, + "learning_rate": 1e-05, + "loss": 1.2086, + "step": 1851 + }, + { + "epoch": 0.8344221671547646, + "grad_norm": 1.7834582941382333, + "learning_rate": 1e-05, + "loss": 1.1935, + "step": 1852 + }, + { + "epoch": 0.8348727190808741, + "grad_norm": 1.5578726204866538, + "learning_rate": 1e-05, + "loss": 1.1852, + "step": 1853 + }, + { + "epoch": 0.8353232710069836, + "grad_norm": 1.645561699294569, + "learning_rate": 1e-05, + "loss": 1.1927, + "step": 1854 + }, + { + "epoch": 0.835773822933093, + "grad_norm": 1.628572224310932, + "learning_rate": 1e-05, + "loss": 1.2055, + "step": 1855 + }, + { + "epoch": 0.8362243748592025, + "grad_norm": 1.7569072711235298, + "learning_rate": 1e-05, + "loss": 1.2308, + "step": 1856 + }, + { + "epoch": 0.836674926785312, + "grad_norm": 1.6861628602830974, + "learning_rate": 1e-05, + "loss": 1.187, + "step": 1857 + }, + { + "epoch": 
0.8371254787114215, + "grad_norm": 1.6393350502863893, + "learning_rate": 1e-05, + "loss": 1.19, + "step": 1858 + }, + { + "epoch": 0.8375760306375309, + "grad_norm": 1.7432904377757557, + "learning_rate": 1e-05, + "loss": 1.1788, + "step": 1859 + }, + { + "epoch": 0.8380265825636405, + "grad_norm": 1.6497139581484863, + "learning_rate": 1e-05, + "loss": 1.1959, + "step": 1860 + }, + { + "epoch": 0.8384771344897499, + "grad_norm": 1.7551824550788737, + "learning_rate": 1e-05, + "loss": 1.2106, + "step": 1861 + }, + { + "epoch": 0.8389276864158595, + "grad_norm": 1.7720142131386385, + "learning_rate": 1e-05, + "loss": 1.1487, + "step": 1862 + }, + { + "epoch": 0.8393782383419689, + "grad_norm": 1.6155334459556048, + "learning_rate": 1e-05, + "loss": 1.2256, + "step": 1863 + }, + { + "epoch": 0.8398287902680784, + "grad_norm": 1.7381417026213979, + "learning_rate": 1e-05, + "loss": 1.1875, + "step": 1864 + }, + { + "epoch": 0.8402793421941879, + "grad_norm": 1.6518693993382525, + "learning_rate": 1e-05, + "loss": 1.187, + "step": 1865 + }, + { + "epoch": 0.8407298941202974, + "grad_norm": 1.7581564397885485, + "learning_rate": 1e-05, + "loss": 1.1662, + "step": 1866 + }, + { + "epoch": 0.8411804460464068, + "grad_norm": 1.7407070017452284, + "learning_rate": 1e-05, + "loss": 1.1695, + "step": 1867 + }, + { + "epoch": 0.8416309979725163, + "grad_norm": 1.696193545487891, + "learning_rate": 1e-05, + "loss": 1.2007, + "step": 1868 + }, + { + "epoch": 0.8420815498986258, + "grad_norm": 1.675694788012286, + "learning_rate": 1e-05, + "loss": 1.2231, + "step": 1869 + }, + { + "epoch": 0.8425321018247353, + "grad_norm": 1.732119679262739, + "learning_rate": 1e-05, + "loss": 1.1746, + "step": 1870 + }, + { + "epoch": 0.8429826537508448, + "grad_norm": 1.7359533865764607, + "learning_rate": 1e-05, + "loss": 1.2073, + "step": 1871 + }, + { + "epoch": 0.8434332056769542, + "grad_norm": 1.6255239440036602, + "learning_rate": 1e-05, + "loss": 1.2126, + "step": 1872 + }, + { + "epoch": 0.8438837576030638, + "grad_norm": 1.603652875682749, + "learning_rate": 1e-05, + "loss": 1.1704, + "step": 1873 + }, + { + "epoch": 0.8443343095291732, + "grad_norm": 1.7171999769504531, + "learning_rate": 1e-05, + "loss": 1.1826, + "step": 1874 + }, + { + "epoch": 0.8447848614552828, + "grad_norm": 1.7695406001085834, + "learning_rate": 1e-05, + "loss": 1.1859, + "step": 1875 + }, + { + "epoch": 0.8452354133813922, + "grad_norm": 1.608752510331531, + "learning_rate": 1e-05, + "loss": 1.1943, + "step": 1876 + }, + { + "epoch": 0.8456859653075017, + "grad_norm": 1.7423124752559593, + "learning_rate": 1e-05, + "loss": 1.2367, + "step": 1877 + }, + { + "epoch": 0.8461365172336112, + "grad_norm": 1.68481610871632, + "learning_rate": 1e-05, + "loss": 1.1829, + "step": 1878 + }, + { + "epoch": 0.8465870691597207, + "grad_norm": 1.8527501439577547, + "learning_rate": 1e-05, + "loss": 1.2476, + "step": 1879 + }, + { + "epoch": 0.8470376210858301, + "grad_norm": 1.9968473859003448, + "learning_rate": 1e-05, + "loss": 1.1167, + "step": 1880 + }, + { + "epoch": 0.8474881730119396, + "grad_norm": 1.7678137412578787, + "learning_rate": 1e-05, + "loss": 1.1633, + "step": 1881 + }, + { + "epoch": 0.8479387249380491, + "grad_norm": 1.556477379139996, + "learning_rate": 1e-05, + "loss": 1.1773, + "step": 1882 + }, + { + "epoch": 0.8483892768641585, + "grad_norm": 1.7924479162860214, + "learning_rate": 1e-05, + "loss": 1.1828, + "step": 1883 + }, + { + "epoch": 0.8488398287902681, + "grad_norm": 1.6292880584988436, + "learning_rate": 1e-05, 
+ "loss": 1.1717, + "step": 1884 + }, + { + "epoch": 0.8492903807163775, + "grad_norm": 1.8370736351623214, + "learning_rate": 1e-05, + "loss": 1.1136, + "step": 1885 + }, + { + "epoch": 0.8497409326424871, + "grad_norm": 1.712098595924913, + "learning_rate": 1e-05, + "loss": 1.2015, + "step": 1886 + }, + { + "epoch": 0.8501914845685965, + "grad_norm": 1.732302685864171, + "learning_rate": 1e-05, + "loss": 1.1663, + "step": 1887 + }, + { + "epoch": 0.850642036494706, + "grad_norm": 1.7428759852946527, + "learning_rate": 1e-05, + "loss": 1.157, + "step": 1888 + }, + { + "epoch": 0.8510925884208155, + "grad_norm": 1.6311039887003997, + "learning_rate": 1e-05, + "loss": 1.1947, + "step": 1889 + }, + { + "epoch": 0.851543140346925, + "grad_norm": 1.6693782203489504, + "learning_rate": 1e-05, + "loss": 1.1194, + "step": 1890 + }, + { + "epoch": 0.8519936922730345, + "grad_norm": 1.6151122944453822, + "learning_rate": 1e-05, + "loss": 1.2245, + "step": 1891 + }, + { + "epoch": 0.852444244199144, + "grad_norm": 1.6666911048541473, + "learning_rate": 1e-05, + "loss": 1.1455, + "step": 1892 + }, + { + "epoch": 0.8528947961252534, + "grad_norm": 1.7271702493835965, + "learning_rate": 1e-05, + "loss": 1.2418, + "step": 1893 + }, + { + "epoch": 0.853345348051363, + "grad_norm": 1.625278036644014, + "learning_rate": 1e-05, + "loss": 1.1439, + "step": 1894 + }, + { + "epoch": 0.8537958999774724, + "grad_norm": 1.8266926896290014, + "learning_rate": 1e-05, + "loss": 1.1757, + "step": 1895 + }, + { + "epoch": 0.8542464519035818, + "grad_norm": 1.8088659008337555, + "learning_rate": 1e-05, + "loss": 1.2177, + "step": 1896 + }, + { + "epoch": 0.8546970038296914, + "grad_norm": 1.5188719773527812, + "learning_rate": 1e-05, + "loss": 1.1786, + "step": 1897 + }, + { + "epoch": 0.8551475557558008, + "grad_norm": 1.698235192975051, + "learning_rate": 1e-05, + "loss": 1.1502, + "step": 1898 + }, + { + "epoch": 0.8555981076819104, + "grad_norm": 1.682321920659013, + "learning_rate": 1e-05, + "loss": 1.1755, + "step": 1899 + }, + { + "epoch": 0.8560486596080198, + "grad_norm": 1.7211441240079381, + "learning_rate": 1e-05, + "loss": 1.1948, + "step": 1900 + }, + { + "epoch": 0.8564992115341293, + "grad_norm": 1.5671613014454864, + "learning_rate": 1e-05, + "loss": 1.1532, + "step": 1901 + }, + { + "epoch": 0.8569497634602388, + "grad_norm": 1.7570383061221853, + "learning_rate": 1e-05, + "loss": 1.1508, + "step": 1902 + }, + { + "epoch": 0.8574003153863483, + "grad_norm": 1.8218014930265243, + "learning_rate": 1e-05, + "loss": 1.1678, + "step": 1903 + }, + { + "epoch": 0.8578508673124577, + "grad_norm": 1.5796342577148133, + "learning_rate": 1e-05, + "loss": 1.1813, + "step": 1904 + }, + { + "epoch": 0.8583014192385673, + "grad_norm": 1.6305157420347662, + "learning_rate": 1e-05, + "loss": 1.2015, + "step": 1905 + }, + { + "epoch": 0.8587519711646767, + "grad_norm": 1.5123328622529004, + "learning_rate": 1e-05, + "loss": 1.1888, + "step": 1906 + }, + { + "epoch": 0.8592025230907863, + "grad_norm": 1.7615806652394799, + "learning_rate": 1e-05, + "loss": 1.1527, + "step": 1907 + }, + { + "epoch": 0.8596530750168957, + "grad_norm": 1.8080875330863884, + "learning_rate": 1e-05, + "loss": 1.1483, + "step": 1908 + }, + { + "epoch": 0.8601036269430051, + "grad_norm": 1.6531915110793447, + "learning_rate": 1e-05, + "loss": 1.1528, + "step": 1909 + }, + { + "epoch": 0.8605541788691147, + "grad_norm": 1.7109784395736596, + "learning_rate": 1e-05, + "loss": 1.1782, + "step": 1910 + }, + { + "epoch": 0.8610047307952241, + 
"grad_norm": 1.7292321924749698, + "learning_rate": 1e-05, + "loss": 1.1707, + "step": 1911 + }, + { + "epoch": 0.8614552827213336, + "grad_norm": 1.8983091050254104, + "learning_rate": 1e-05, + "loss": 1.2272, + "step": 1912 + }, + { + "epoch": 0.8619058346474431, + "grad_norm": 1.6990088023494063, + "learning_rate": 1e-05, + "loss": 1.1365, + "step": 1913 + }, + { + "epoch": 0.8623563865735526, + "grad_norm": 1.8009079667610153, + "learning_rate": 1e-05, + "loss": 1.1821, + "step": 1914 + }, + { + "epoch": 0.8628069384996621, + "grad_norm": 1.6029370701232601, + "learning_rate": 1e-05, + "loss": 1.1858, + "step": 1915 + }, + { + "epoch": 0.8632574904257716, + "grad_norm": 1.736786043826039, + "learning_rate": 1e-05, + "loss": 1.174, + "step": 1916 + }, + { + "epoch": 0.863708042351881, + "grad_norm": 1.4943056336840133, + "learning_rate": 1e-05, + "loss": 1.1687, + "step": 1917 + }, + { + "epoch": 0.8641585942779906, + "grad_norm": 1.7782797122098448, + "learning_rate": 1e-05, + "loss": 1.137, + "step": 1918 + }, + { + "epoch": 0.8646091462041, + "grad_norm": 1.5722811399895673, + "learning_rate": 1e-05, + "loss": 1.224, + "step": 1919 + }, + { + "epoch": 0.8650596981302096, + "grad_norm": 1.8196960961003943, + "learning_rate": 1e-05, + "loss": 1.1641, + "step": 1920 + }, + { + "epoch": 0.865510250056319, + "grad_norm": 1.5014356478018795, + "learning_rate": 1e-05, + "loss": 1.2079, + "step": 1921 + }, + { + "epoch": 0.8659608019824285, + "grad_norm": 1.7155412730644062, + "learning_rate": 1e-05, + "loss": 1.1497, + "step": 1922 + }, + { + "epoch": 0.866411353908538, + "grad_norm": 1.7479774465805296, + "learning_rate": 1e-05, + "loss": 1.201, + "step": 1923 + }, + { + "epoch": 0.8668619058346474, + "grad_norm": 1.8635908641176946, + "learning_rate": 1e-05, + "loss": 1.2052, + "step": 1924 + }, + { + "epoch": 0.8673124577607569, + "grad_norm": 1.6981131344398972, + "learning_rate": 1e-05, + "loss": 1.1979, + "step": 1925 + }, + { + "epoch": 0.8677630096868664, + "grad_norm": 1.8298095483126138, + "learning_rate": 1e-05, + "loss": 1.1667, + "step": 1926 + }, + { + "epoch": 0.8682135616129759, + "grad_norm": 1.7194418701797443, + "learning_rate": 1e-05, + "loss": 1.2701, + "step": 1927 + }, + { + "epoch": 0.8686641135390853, + "grad_norm": 1.7527047305719925, + "learning_rate": 1e-05, + "loss": 1.1983, + "step": 1928 + }, + { + "epoch": 0.8691146654651949, + "grad_norm": 1.7420697895766035, + "learning_rate": 1e-05, + "loss": 1.1632, + "step": 1929 + }, + { + "epoch": 0.8695652173913043, + "grad_norm": 1.8830249678980415, + "learning_rate": 1e-05, + "loss": 1.2274, + "step": 1930 + }, + { + "epoch": 0.8700157693174139, + "grad_norm": 1.7902124544077, + "learning_rate": 1e-05, + "loss": 1.1519, + "step": 1931 + }, + { + "epoch": 0.8704663212435233, + "grad_norm": 1.8125679907386705, + "learning_rate": 1e-05, + "loss": 1.2235, + "step": 1932 + }, + { + "epoch": 0.8709168731696328, + "grad_norm": 1.5364302166060644, + "learning_rate": 1e-05, + "loss": 1.1675, + "step": 1933 + }, + { + "epoch": 0.8713674250957423, + "grad_norm": 1.71309437243926, + "learning_rate": 1e-05, + "loss": 1.1971, + "step": 1934 + }, + { + "epoch": 0.8718179770218518, + "grad_norm": 1.8382299502817083, + "learning_rate": 1e-05, + "loss": 1.1889, + "step": 1935 + }, + { + "epoch": 0.8722685289479613, + "grad_norm": 1.4800146563735435, + "learning_rate": 1e-05, + "loss": 1.1834, + "step": 1936 + }, + { + "epoch": 0.8727190808740707, + "grad_norm": 1.8419305600469726, + "learning_rate": 1e-05, + "loss": 1.1682, + "step": 
1937 + }, + { + "epoch": 0.8731696328001802, + "grad_norm": 1.6402860515722093, + "learning_rate": 1e-05, + "loss": 1.191, + "step": 1938 + }, + { + "epoch": 0.8736201847262897, + "grad_norm": 1.5750853324413905, + "learning_rate": 1e-05, + "loss": 1.1931, + "step": 1939 + }, + { + "epoch": 0.8740707366523992, + "grad_norm": 1.7122071346392769, + "learning_rate": 1e-05, + "loss": 1.2012, + "step": 1940 + }, + { + "epoch": 0.8745212885785086, + "grad_norm": 1.962267954992774, + "learning_rate": 1e-05, + "loss": 1.2298, + "step": 1941 + }, + { + "epoch": 0.8749718405046182, + "grad_norm": 1.6470105267146145, + "learning_rate": 1e-05, + "loss": 1.1909, + "step": 1942 + }, + { + "epoch": 0.8754223924307276, + "grad_norm": 1.696760791120632, + "learning_rate": 1e-05, + "loss": 1.1283, + "step": 1943 + }, + { + "epoch": 0.8758729443568372, + "grad_norm": 1.6512854525210154, + "learning_rate": 1e-05, + "loss": 1.1498, + "step": 1944 + }, + { + "epoch": 0.8763234962829466, + "grad_norm": 1.7346141415486804, + "learning_rate": 1e-05, + "loss": 1.182, + "step": 1945 + }, + { + "epoch": 0.8767740482090561, + "grad_norm": 1.7333886670705998, + "learning_rate": 1e-05, + "loss": 1.175, + "step": 1946 + }, + { + "epoch": 0.8772246001351656, + "grad_norm": 1.7197986361204711, + "learning_rate": 1e-05, + "loss": 1.189, + "step": 1947 + }, + { + "epoch": 0.8776751520612751, + "grad_norm": 1.5322161639690512, + "learning_rate": 1e-05, + "loss": 1.2137, + "step": 1948 + }, + { + "epoch": 0.8781257039873845, + "grad_norm": 1.8152703348833092, + "learning_rate": 1e-05, + "loss": 1.2056, + "step": 1949 + }, + { + "epoch": 0.878576255913494, + "grad_norm": 1.572175310710983, + "learning_rate": 1e-05, + "loss": 1.2318, + "step": 1950 + }, + { + "epoch": 0.8790268078396035, + "grad_norm": 1.8533395139877775, + "learning_rate": 1e-05, + "loss": 1.2218, + "step": 1951 + }, + { + "epoch": 0.879477359765713, + "grad_norm": 1.7114207143773543, + "learning_rate": 1e-05, + "loss": 1.1862, + "step": 1952 + }, + { + "epoch": 0.8799279116918225, + "grad_norm": 2.0850669987105075, + "learning_rate": 1e-05, + "loss": 1.1992, + "step": 1953 + }, + { + "epoch": 0.8803784636179319, + "grad_norm": 1.684177597482306, + "learning_rate": 1e-05, + "loss": 1.254, + "step": 1954 + }, + { + "epoch": 0.8808290155440415, + "grad_norm": 1.5762065907916154, + "learning_rate": 1e-05, + "loss": 1.1994, + "step": 1955 + }, + { + "epoch": 0.8812795674701509, + "grad_norm": 1.791877464246084, + "learning_rate": 1e-05, + "loss": 1.1817, + "step": 1956 + }, + { + "epoch": 0.8817301193962604, + "grad_norm": 1.619852262764895, + "learning_rate": 1e-05, + "loss": 1.1919, + "step": 1957 + }, + { + "epoch": 0.8821806713223699, + "grad_norm": 1.7205647996126916, + "learning_rate": 1e-05, + "loss": 1.1754, + "step": 1958 + }, + { + "epoch": 0.8826312232484794, + "grad_norm": 1.5313646610580463, + "learning_rate": 1e-05, + "loss": 1.236, + "step": 1959 + }, + { + "epoch": 0.8830817751745889, + "grad_norm": 1.5825029652532525, + "learning_rate": 1e-05, + "loss": 1.1857, + "step": 1960 + }, + { + "epoch": 0.8835323271006984, + "grad_norm": 1.6849573240870548, + "learning_rate": 1e-05, + "loss": 1.1486, + "step": 1961 + }, + { + "epoch": 0.8839828790268078, + "grad_norm": 1.8040464344018217, + "learning_rate": 1e-05, + "loss": 1.1794, + "step": 1962 + }, + { + "epoch": 0.8844334309529174, + "grad_norm": 1.6890087972359196, + "learning_rate": 1e-05, + "loss": 1.1859, + "step": 1963 + }, + { + "epoch": 0.8848839828790268, + "grad_norm": 1.617560696442198, + 
"learning_rate": 1e-05, + "loss": 1.2189, + "step": 1964 + }, + { + "epoch": 0.8853345348051362, + "grad_norm": 1.6811332514125268, + "learning_rate": 1e-05, + "loss": 1.2115, + "step": 1965 + }, + { + "epoch": 0.8857850867312458, + "grad_norm": 1.7534007222902268, + "learning_rate": 1e-05, + "loss": 1.1822, + "step": 1966 + }, + { + "epoch": 0.8862356386573552, + "grad_norm": 1.5847852679799652, + "learning_rate": 1e-05, + "loss": 1.2162, + "step": 1967 + }, + { + "epoch": 0.8866861905834648, + "grad_norm": 1.7632898753820936, + "learning_rate": 1e-05, + "loss": 1.2327, + "step": 1968 + }, + { + "epoch": 0.8871367425095742, + "grad_norm": 1.6310639286552508, + "learning_rate": 1e-05, + "loss": 1.179, + "step": 1969 + }, + { + "epoch": 0.8875872944356837, + "grad_norm": 1.843730565121264, + "learning_rate": 1e-05, + "loss": 1.1949, + "step": 1970 + }, + { + "epoch": 0.8880378463617932, + "grad_norm": 1.781736616332847, + "learning_rate": 1e-05, + "loss": 1.1743, + "step": 1971 + }, + { + "epoch": 0.8884883982879027, + "grad_norm": 1.5943169480925572, + "learning_rate": 1e-05, + "loss": 1.2062, + "step": 1972 + }, + { + "epoch": 0.8889389502140121, + "grad_norm": 1.4993920792787943, + "learning_rate": 1e-05, + "loss": 1.1956, + "step": 1973 + }, + { + "epoch": 0.8893895021401217, + "grad_norm": 1.540824279950419, + "learning_rate": 1e-05, + "loss": 1.2087, + "step": 1974 + }, + { + "epoch": 0.8898400540662311, + "grad_norm": 1.7252658755827255, + "learning_rate": 1e-05, + "loss": 1.1728, + "step": 1975 + }, + { + "epoch": 0.8902906059923407, + "grad_norm": 1.7052017798554122, + "learning_rate": 1e-05, + "loss": 1.1743, + "step": 1976 + }, + { + "epoch": 0.8907411579184501, + "grad_norm": 1.6612625789667403, + "learning_rate": 1e-05, + "loss": 1.2041, + "step": 1977 + }, + { + "epoch": 0.8911917098445595, + "grad_norm": 1.6587744402696805, + "learning_rate": 1e-05, + "loss": 1.2129, + "step": 1978 + }, + { + "epoch": 0.8916422617706691, + "grad_norm": 1.6447097169325238, + "learning_rate": 1e-05, + "loss": 1.1925, + "step": 1979 + }, + { + "epoch": 0.8920928136967785, + "grad_norm": 1.65524150981378, + "learning_rate": 1e-05, + "loss": 1.1487, + "step": 1980 + }, + { + "epoch": 0.892543365622888, + "grad_norm": 1.7112097802046498, + "learning_rate": 1e-05, + "loss": 1.2032, + "step": 1981 + }, + { + "epoch": 0.8929939175489975, + "grad_norm": 1.7547125950361586, + "learning_rate": 1e-05, + "loss": 1.1676, + "step": 1982 + }, + { + "epoch": 0.893444469475107, + "grad_norm": 1.6425869215805582, + "learning_rate": 1e-05, + "loss": 1.1941, + "step": 1983 + }, + { + "epoch": 0.8938950214012165, + "grad_norm": 1.4451572493199791, + "learning_rate": 1e-05, + "loss": 1.1587, + "step": 1984 + }, + { + "epoch": 0.894345573327326, + "grad_norm": 1.572571629382231, + "learning_rate": 1e-05, + "loss": 1.1649, + "step": 1985 + }, + { + "epoch": 0.8947961252534354, + "grad_norm": 1.7864696673674751, + "learning_rate": 1e-05, + "loss": 1.182, + "step": 1986 + }, + { + "epoch": 0.895246677179545, + "grad_norm": 1.6268030280618402, + "learning_rate": 1e-05, + "loss": 1.1916, + "step": 1987 + }, + { + "epoch": 0.8956972291056544, + "grad_norm": 1.6707602189816138, + "learning_rate": 1e-05, + "loss": 1.2154, + "step": 1988 + }, + { + "epoch": 0.896147781031764, + "grad_norm": 1.6520039500127561, + "learning_rate": 1e-05, + "loss": 1.2046, + "step": 1989 + }, + { + "epoch": 0.8965983329578734, + "grad_norm": 1.8866840650686438, + "learning_rate": 1e-05, + "loss": 1.1683, + "step": 1990 + }, + { + "epoch": 
0.8970488848839829, + "grad_norm": 1.6635248858057377, + "learning_rate": 1e-05, + "loss": 1.1416, + "step": 1991 + }, + { + "epoch": 0.8974994368100924, + "grad_norm": 1.6801657472532312, + "learning_rate": 1e-05, + "loss": 1.2457, + "step": 1992 + }, + { + "epoch": 0.8979499887362018, + "grad_norm": 1.7513150745030448, + "learning_rate": 1e-05, + "loss": 1.1839, + "step": 1993 + }, + { + "epoch": 0.8984005406623113, + "grad_norm": 1.7926390263455134, + "learning_rate": 1e-05, + "loss": 1.1908, + "step": 1994 + }, + { + "epoch": 0.8988510925884208, + "grad_norm": 1.7440011648485394, + "learning_rate": 1e-05, + "loss": 1.1758, + "step": 1995 + }, + { + "epoch": 0.8993016445145303, + "grad_norm": 1.802874400960212, + "learning_rate": 1e-05, + "loss": 1.153, + "step": 1996 + }, + { + "epoch": 0.8997521964406398, + "grad_norm": 1.674275456851461, + "learning_rate": 1e-05, + "loss": 1.2032, + "step": 1997 + }, + { + "epoch": 0.9002027483667493, + "grad_norm": 1.70597562331596, + "learning_rate": 1e-05, + "loss": 1.2026, + "step": 1998 + }, + { + "epoch": 0.9006533002928587, + "grad_norm": 1.6645508955826498, + "learning_rate": 1e-05, + "loss": 1.204, + "step": 1999 + }, + { + "epoch": 0.9011038522189683, + "grad_norm": 1.6437466582223337, + "learning_rate": 1e-05, + "loss": 1.1853, + "step": 2000 + }, + { + "epoch": 0.9015544041450777, + "grad_norm": 1.752087077429046, + "learning_rate": 1e-05, + "loss": 1.2628, + "step": 2001 + }, + { + "epoch": 0.9020049560711872, + "grad_norm": 1.669099591859999, + "learning_rate": 1e-05, + "loss": 1.2362, + "step": 2002 + }, + { + "epoch": 0.9024555079972967, + "grad_norm": 1.6895858952402134, + "learning_rate": 1e-05, + "loss": 1.1824, + "step": 2003 + }, + { + "epoch": 0.9029060599234062, + "grad_norm": 1.6481183332774054, + "learning_rate": 1e-05, + "loss": 1.1734, + "step": 2004 + }, + { + "epoch": 0.9033566118495157, + "grad_norm": 1.8295713923594479, + "learning_rate": 1e-05, + "loss": 1.1994, + "step": 2005 + }, + { + "epoch": 0.9038071637756251, + "grad_norm": 1.8460703606205773, + "learning_rate": 1e-05, + "loss": 1.1777, + "step": 2006 + }, + { + "epoch": 0.9042577157017346, + "grad_norm": 1.6241536868747677, + "learning_rate": 1e-05, + "loss": 1.1913, + "step": 2007 + }, + { + "epoch": 0.9047082676278441, + "grad_norm": 1.7175802542448775, + "learning_rate": 1e-05, + "loss": 1.1814, + "step": 2008 + }, + { + "epoch": 0.9051588195539536, + "grad_norm": 1.5828181067077758, + "learning_rate": 1e-05, + "loss": 1.1803, + "step": 2009 + }, + { + "epoch": 0.905609371480063, + "grad_norm": 1.8423204404690332, + "learning_rate": 1e-05, + "loss": 1.1417, + "step": 2010 + }, + { + "epoch": 0.9060599234061726, + "grad_norm": 1.683197073295049, + "learning_rate": 1e-05, + "loss": 1.1936, + "step": 2011 + }, + { + "epoch": 0.906510475332282, + "grad_norm": 1.8219461398772028, + "learning_rate": 1e-05, + "loss": 1.1845, + "step": 2012 + }, + { + "epoch": 0.9069610272583916, + "grad_norm": 1.7365303973164858, + "learning_rate": 1e-05, + "loss": 1.1968, + "step": 2013 + }, + { + "epoch": 0.907411579184501, + "grad_norm": 1.7252487536901362, + "learning_rate": 1e-05, + "loss": 1.1837, + "step": 2014 + }, + { + "epoch": 0.9078621311106105, + "grad_norm": 1.7973801355067949, + "learning_rate": 1e-05, + "loss": 1.217, + "step": 2015 + }, + { + "epoch": 0.90831268303672, + "grad_norm": 1.7309101052129412, + "learning_rate": 1e-05, + "loss": 1.1768, + "step": 2016 + }, + { + "epoch": 0.9087632349628295, + "grad_norm": 1.651613975910536, + "learning_rate": 1e-05, + 
"loss": 1.1767, + "step": 2017 + }, + { + "epoch": 0.909213786888939, + "grad_norm": 1.5932297428511177, + "learning_rate": 1e-05, + "loss": 1.1776, + "step": 2018 + }, + { + "epoch": 0.9096643388150484, + "grad_norm": 1.7249736812948915, + "learning_rate": 1e-05, + "loss": 1.2391, + "step": 2019 + }, + { + "epoch": 0.9101148907411579, + "grad_norm": 1.7592791830484773, + "learning_rate": 1e-05, + "loss": 1.1678, + "step": 2020 + }, + { + "epoch": 0.9105654426672674, + "grad_norm": 1.8001511327251414, + "learning_rate": 1e-05, + "loss": 1.1881, + "step": 2021 + }, + { + "epoch": 0.9110159945933769, + "grad_norm": 1.5958915091678874, + "learning_rate": 1e-05, + "loss": 1.1829, + "step": 2022 + }, + { + "epoch": 0.9114665465194863, + "grad_norm": 1.7053854208063959, + "learning_rate": 1e-05, + "loss": 1.1706, + "step": 2023 + }, + { + "epoch": 0.9119170984455959, + "grad_norm": 1.603121726297471, + "learning_rate": 1e-05, + "loss": 1.157, + "step": 2024 + }, + { + "epoch": 0.9123676503717053, + "grad_norm": 1.741748960234155, + "learning_rate": 1e-05, + "loss": 1.1776, + "step": 2025 + }, + { + "epoch": 0.9128182022978149, + "grad_norm": 1.6666156009125006, + "learning_rate": 1e-05, + "loss": 1.2061, + "step": 2026 + }, + { + "epoch": 0.9132687542239243, + "grad_norm": 1.6999991012738433, + "learning_rate": 1e-05, + "loss": 1.1603, + "step": 2027 + }, + { + "epoch": 0.9137193061500338, + "grad_norm": 1.5071174447484084, + "learning_rate": 1e-05, + "loss": 1.157, + "step": 2028 + }, + { + "epoch": 0.9141698580761433, + "grad_norm": 1.5934489986133855, + "learning_rate": 1e-05, + "loss": 1.1652, + "step": 2029 + }, + { + "epoch": 0.9146204100022528, + "grad_norm": 1.7691389722709743, + "learning_rate": 1e-05, + "loss": 1.2329, + "step": 2030 + }, + { + "epoch": 0.9150709619283622, + "grad_norm": 1.612525557681105, + "learning_rate": 1e-05, + "loss": 1.2585, + "step": 2031 + }, + { + "epoch": 0.9155215138544718, + "grad_norm": 1.823896119381473, + "learning_rate": 1e-05, + "loss": 1.1784, + "step": 2032 + }, + { + "epoch": 0.9159720657805812, + "grad_norm": 1.6001182819638253, + "learning_rate": 1e-05, + "loss": 1.1729, + "step": 2033 + }, + { + "epoch": 0.9164226177066906, + "grad_norm": 1.7114437153741997, + "learning_rate": 1e-05, + "loss": 1.2322, + "step": 2034 + }, + { + "epoch": 0.9168731696328002, + "grad_norm": 1.7578588566571445, + "learning_rate": 1e-05, + "loss": 1.1919, + "step": 2035 + }, + { + "epoch": 0.9173237215589096, + "grad_norm": 1.684811816484171, + "learning_rate": 1e-05, + "loss": 1.1927, + "step": 2036 + }, + { + "epoch": 0.9177742734850192, + "grad_norm": 1.6758870836191506, + "learning_rate": 1e-05, + "loss": 1.1887, + "step": 2037 + }, + { + "epoch": 0.9182248254111286, + "grad_norm": 1.784155879996892, + "learning_rate": 1e-05, + "loss": 1.1696, + "step": 2038 + }, + { + "epoch": 0.9186753773372381, + "grad_norm": 1.7001875941313467, + "learning_rate": 1e-05, + "loss": 1.2048, + "step": 2039 + }, + { + "epoch": 0.9191259292633476, + "grad_norm": 1.5193718812259038, + "learning_rate": 1e-05, + "loss": 1.14, + "step": 2040 + }, + { + "epoch": 0.9195764811894571, + "grad_norm": 1.7140562171701816, + "learning_rate": 1e-05, + "loss": 1.2036, + "step": 2041 + }, + { + "epoch": 0.9200270331155666, + "grad_norm": 1.6658501846960208, + "learning_rate": 1e-05, + "loss": 1.2263, + "step": 2042 + }, + { + "epoch": 0.9204775850416761, + "grad_norm": 1.6245115612898635, + "learning_rate": 1e-05, + "loss": 1.1818, + "step": 2043 + }, + { + "epoch": 0.9209281369677855, + 
"grad_norm": 1.7707988391102771, + "learning_rate": 1e-05, + "loss": 1.1845, + "step": 2044 + }, + { + "epoch": 0.9213786888938951, + "grad_norm": 1.700605838458128, + "learning_rate": 1e-05, + "loss": 1.2123, + "step": 2045 + }, + { + "epoch": 0.9218292408200045, + "grad_norm": 1.8486779226587806, + "learning_rate": 1e-05, + "loss": 1.1775, + "step": 2046 + }, + { + "epoch": 0.9222797927461139, + "grad_norm": 1.8102808308048854, + "learning_rate": 1e-05, + "loss": 1.1988, + "step": 2047 + }, + { + "epoch": 0.9227303446722235, + "grad_norm": 1.5288101087416415, + "learning_rate": 1e-05, + "loss": 1.1526, + "step": 2048 + }, + { + "epoch": 0.9231808965983329, + "grad_norm": 1.604554857518699, + "learning_rate": 1e-05, + "loss": 1.1632, + "step": 2049 + }, + { + "epoch": 0.9236314485244425, + "grad_norm": 1.8100618174923913, + "learning_rate": 1e-05, + "loss": 1.1515, + "step": 2050 + }, + { + "epoch": 0.9240820004505519, + "grad_norm": 1.6319685863655795, + "learning_rate": 1e-05, + "loss": 1.1523, + "step": 2051 + }, + { + "epoch": 0.9245325523766614, + "grad_norm": 1.632142134315281, + "learning_rate": 1e-05, + "loss": 1.2029, + "step": 2052 + }, + { + "epoch": 0.9249831043027709, + "grad_norm": 1.8187454131617482, + "learning_rate": 1e-05, + "loss": 1.1761, + "step": 2053 + }, + { + "epoch": 0.9254336562288804, + "grad_norm": 1.6826786299296643, + "learning_rate": 1e-05, + "loss": 1.202, + "step": 2054 + }, + { + "epoch": 0.9258842081549898, + "grad_norm": 1.7098050761568442, + "learning_rate": 1e-05, + "loss": 1.2148, + "step": 2055 + }, + { + "epoch": 0.9263347600810994, + "grad_norm": 1.639004737641357, + "learning_rate": 1e-05, + "loss": 1.171, + "step": 2056 + }, + { + "epoch": 0.9267853120072088, + "grad_norm": 1.623859948439882, + "learning_rate": 1e-05, + "loss": 1.1859, + "step": 2057 + }, + { + "epoch": 0.9272358639333184, + "grad_norm": 1.7021487486732494, + "learning_rate": 1e-05, + "loss": 1.2176, + "step": 2058 + }, + { + "epoch": 0.9276864158594278, + "grad_norm": 1.7717027001668448, + "learning_rate": 1e-05, + "loss": 1.2052, + "step": 2059 + }, + { + "epoch": 0.9281369677855373, + "grad_norm": 1.797295137865806, + "learning_rate": 1e-05, + "loss": 1.1753, + "step": 2060 + }, + { + "epoch": 0.9285875197116468, + "grad_norm": 1.6138084929771288, + "learning_rate": 1e-05, + "loss": 1.1809, + "step": 2061 + }, + { + "epoch": 0.9290380716377562, + "grad_norm": 1.698916134145838, + "learning_rate": 1e-05, + "loss": 1.1883, + "step": 2062 + }, + { + "epoch": 0.9294886235638657, + "grad_norm": 1.8018354499014477, + "learning_rate": 1e-05, + "loss": 1.1626, + "step": 2063 + }, + { + "epoch": 0.9299391754899752, + "grad_norm": 1.7321854344599588, + "learning_rate": 1e-05, + "loss": 1.2136, + "step": 2064 + }, + { + "epoch": 0.9303897274160847, + "grad_norm": 1.634004679541849, + "learning_rate": 1e-05, + "loss": 1.149, + "step": 2065 + }, + { + "epoch": 0.9308402793421942, + "grad_norm": 1.7033993770183842, + "learning_rate": 1e-05, + "loss": 1.1627, + "step": 2066 + }, + { + "epoch": 0.9312908312683037, + "grad_norm": 1.888189431846741, + "learning_rate": 1e-05, + "loss": 1.1643, + "step": 2067 + }, + { + "epoch": 0.9317413831944131, + "grad_norm": 1.705533244923517, + "learning_rate": 1e-05, + "loss": 1.1602, + "step": 2068 + }, + { + "epoch": 0.9321919351205227, + "grad_norm": 1.7096701473629878, + "learning_rate": 1e-05, + "loss": 1.1988, + "step": 2069 + }, + { + "epoch": 0.9326424870466321, + "grad_norm": 1.630666376640028, + "learning_rate": 1e-05, + "loss": 1.1713, + 
"step": 2070 + }, + { + "epoch": 0.9330930389727417, + "grad_norm": 1.8493121915990847, + "learning_rate": 1e-05, + "loss": 1.207, + "step": 2071 + }, + { + "epoch": 0.9335435908988511, + "grad_norm": 1.815624099029328, + "learning_rate": 1e-05, + "loss": 1.182, + "step": 2072 + }, + { + "epoch": 0.9339941428249606, + "grad_norm": 1.6384160771080598, + "learning_rate": 1e-05, + "loss": 1.1903, + "step": 2073 + }, + { + "epoch": 0.9344446947510701, + "grad_norm": 1.6288305079143965, + "learning_rate": 1e-05, + "loss": 1.1821, + "step": 2074 + }, + { + "epoch": 0.9348952466771795, + "grad_norm": 1.6490190090104768, + "learning_rate": 1e-05, + "loss": 1.1727, + "step": 2075 + }, + { + "epoch": 0.935345798603289, + "grad_norm": 1.6223468525759248, + "learning_rate": 1e-05, + "loss": 1.179, + "step": 2076 + }, + { + "epoch": 0.9357963505293985, + "grad_norm": 1.7946445207666113, + "learning_rate": 1e-05, + "loss": 1.2144, + "step": 2077 + }, + { + "epoch": 0.936246902455508, + "grad_norm": 1.6897767509695476, + "learning_rate": 1e-05, + "loss": 1.2137, + "step": 2078 + }, + { + "epoch": 0.9366974543816174, + "grad_norm": 1.6188498752212164, + "learning_rate": 1e-05, + "loss": 1.1902, + "step": 2079 + }, + { + "epoch": 0.937148006307727, + "grad_norm": 1.681733067879976, + "learning_rate": 1e-05, + "loss": 1.2483, + "step": 2080 + }, + { + "epoch": 0.9375985582338364, + "grad_norm": 1.725178267652548, + "learning_rate": 1e-05, + "loss": 1.1524, + "step": 2081 + }, + { + "epoch": 0.938049110159946, + "grad_norm": 1.7196779773962776, + "learning_rate": 1e-05, + "loss": 1.2441, + "step": 2082 + }, + { + "epoch": 0.9384996620860554, + "grad_norm": 1.72948940896615, + "learning_rate": 1e-05, + "loss": 1.1142, + "step": 2083 + }, + { + "epoch": 0.9389502140121649, + "grad_norm": 1.7511010335092747, + "learning_rate": 1e-05, + "loss": 1.2238, + "step": 2084 + }, + { + "epoch": 0.9394007659382744, + "grad_norm": 1.838057472974213, + "learning_rate": 1e-05, + "loss": 1.1773, + "step": 2085 + }, + { + "epoch": 0.9398513178643839, + "grad_norm": 1.6591311122470425, + "learning_rate": 1e-05, + "loss": 1.1552, + "step": 2086 + }, + { + "epoch": 0.9403018697904933, + "grad_norm": 1.5791720221435517, + "learning_rate": 1e-05, + "loss": 1.1942, + "step": 2087 + }, + { + "epoch": 0.9407524217166029, + "grad_norm": 1.6260408967438869, + "learning_rate": 1e-05, + "loss": 1.1934, + "step": 2088 + }, + { + "epoch": 0.9412029736427123, + "grad_norm": 1.8301203454759956, + "learning_rate": 1e-05, + "loss": 1.1426, + "step": 2089 + }, + { + "epoch": 0.9416535255688218, + "grad_norm": 1.6393692809665887, + "learning_rate": 1e-05, + "loss": 1.1764, + "step": 2090 + }, + { + "epoch": 0.9421040774949313, + "grad_norm": 1.6165840529847568, + "learning_rate": 1e-05, + "loss": 1.1811, + "step": 2091 + }, + { + "epoch": 0.9425546294210407, + "grad_norm": 1.5798081012769836, + "learning_rate": 1e-05, + "loss": 1.2356, + "step": 2092 + }, + { + "epoch": 0.9430051813471503, + "grad_norm": 1.727700337892747, + "learning_rate": 1e-05, + "loss": 1.1749, + "step": 2093 + }, + { + "epoch": 0.9434557332732597, + "grad_norm": 1.9714736909888164, + "learning_rate": 1e-05, + "loss": 1.1896, + "step": 2094 + }, + { + "epoch": 0.9439062851993693, + "grad_norm": 1.6580452596611326, + "learning_rate": 1e-05, + "loss": 1.1921, + "step": 2095 + }, + { + "epoch": 0.9443568371254787, + "grad_norm": 1.866349427049117, + "learning_rate": 1e-05, + "loss": 1.2059, + "step": 2096 + }, + { + "epoch": 0.9448073890515882, + "grad_norm": 
1.7313573957932022, + "learning_rate": 1e-05, + "loss": 1.175, + "step": 2097 + }, + { + "epoch": 0.9452579409776977, + "grad_norm": 1.7221346323183881, + "learning_rate": 1e-05, + "loss": 1.1439, + "step": 2098 + }, + { + "epoch": 0.9457084929038072, + "grad_norm": 1.6381714881988407, + "learning_rate": 1e-05, + "loss": 1.2122, + "step": 2099 + }, + { + "epoch": 0.9461590448299166, + "grad_norm": 1.6447047547046818, + "learning_rate": 1e-05, + "loss": 1.2059, + "step": 2100 + }, + { + "epoch": 0.9466095967560262, + "grad_norm": 1.6758634414894065, + "learning_rate": 1e-05, + "loss": 1.0905, + "step": 2101 + }, + { + "epoch": 0.9470601486821356, + "grad_norm": 1.624740500489032, + "learning_rate": 1e-05, + "loss": 1.1452, + "step": 2102 + }, + { + "epoch": 0.947510700608245, + "grad_norm": 1.634954570703128, + "learning_rate": 1e-05, + "loss": 1.1753, + "step": 2103 + }, + { + "epoch": 0.9479612525343546, + "grad_norm": 1.8032264648997465, + "learning_rate": 1e-05, + "loss": 1.1978, + "step": 2104 + }, + { + "epoch": 0.948411804460464, + "grad_norm": 1.7215681096454558, + "learning_rate": 1e-05, + "loss": 1.177, + "step": 2105 + }, + { + "epoch": 0.9488623563865736, + "grad_norm": 1.6686611213810127, + "learning_rate": 1e-05, + "loss": 1.1717, + "step": 2106 + }, + { + "epoch": 0.949312908312683, + "grad_norm": 1.881143362237432, + "learning_rate": 1e-05, + "loss": 1.1908, + "step": 2107 + }, + { + "epoch": 0.9497634602387925, + "grad_norm": 1.6451020286185194, + "learning_rate": 1e-05, + "loss": 1.1849, + "step": 2108 + }, + { + "epoch": 0.950214012164902, + "grad_norm": 1.754167561683326, + "learning_rate": 1e-05, + "loss": 1.1952, + "step": 2109 + }, + { + "epoch": 0.9506645640910115, + "grad_norm": 1.7556968199681056, + "learning_rate": 1e-05, + "loss": 1.1717, + "step": 2110 + }, + { + "epoch": 0.951115116017121, + "grad_norm": 1.7095415110303946, + "learning_rate": 1e-05, + "loss": 1.2258, + "step": 2111 + }, + { + "epoch": 0.9515656679432305, + "grad_norm": 1.706848125511313, + "learning_rate": 1e-05, + "loss": 1.2049, + "step": 2112 + }, + { + "epoch": 0.9520162198693399, + "grad_norm": 1.5703031012611752, + "learning_rate": 1e-05, + "loss": 1.2062, + "step": 2113 + }, + { + "epoch": 0.9524667717954495, + "grad_norm": 1.5547854742449227, + "learning_rate": 1e-05, + "loss": 1.1723, + "step": 2114 + }, + { + "epoch": 0.9529173237215589, + "grad_norm": 1.5813108924809067, + "learning_rate": 1e-05, + "loss": 1.1692, + "step": 2115 + }, + { + "epoch": 0.9533678756476683, + "grad_norm": 1.6400217382575737, + "learning_rate": 1e-05, + "loss": 1.2242, + "step": 2116 + }, + { + "epoch": 0.9538184275737779, + "grad_norm": 1.719910875150272, + "learning_rate": 1e-05, + "loss": 1.2065, + "step": 2117 + }, + { + "epoch": 0.9542689794998873, + "grad_norm": 1.7754453365847451, + "learning_rate": 1e-05, + "loss": 1.2161, + "step": 2118 + }, + { + "epoch": 0.9547195314259969, + "grad_norm": 1.5679928263707885, + "learning_rate": 1e-05, + "loss": 1.1943, + "step": 2119 + }, + { + "epoch": 0.9551700833521063, + "grad_norm": 1.5326901408842268, + "learning_rate": 1e-05, + "loss": 1.1702, + "step": 2120 + }, + { + "epoch": 0.9556206352782158, + "grad_norm": 1.7934219244055345, + "learning_rate": 1e-05, + "loss": 1.1612, + "step": 2121 + }, + { + "epoch": 0.9560711872043253, + "grad_norm": 1.5903967174339735, + "learning_rate": 1e-05, + "loss": 1.223, + "step": 2122 + }, + { + "epoch": 0.9565217391304348, + "grad_norm": 1.8576688324700052, + "learning_rate": 1e-05, + "loss": 1.1855, + "step": 2123 + }, 
+ { + "epoch": 0.9569722910565442, + "grad_norm": 1.5024626501001406, + "learning_rate": 1e-05, + "loss": 1.1583, + "step": 2124 + }, + { + "epoch": 0.9574228429826538, + "grad_norm": 1.913175515389359, + "learning_rate": 1e-05, + "loss": 1.1555, + "step": 2125 + }, + { + "epoch": 0.9578733949087632, + "grad_norm": 1.6565160630402702, + "learning_rate": 1e-05, + "loss": 1.1382, + "step": 2126 + }, + { + "epoch": 0.9583239468348728, + "grad_norm": 1.7612841877743974, + "learning_rate": 1e-05, + "loss": 1.1882, + "step": 2127 + }, + { + "epoch": 0.9587744987609822, + "grad_norm": 1.6999960214638121, + "learning_rate": 1e-05, + "loss": 1.127, + "step": 2128 + }, + { + "epoch": 0.9592250506870917, + "grad_norm": 1.7865687508854, + "learning_rate": 1e-05, + "loss": 1.2001, + "step": 2129 + }, + { + "epoch": 0.9596756026132012, + "grad_norm": 1.7229040927458896, + "learning_rate": 1e-05, + "loss": 1.1457, + "step": 2130 + }, + { + "epoch": 0.9601261545393106, + "grad_norm": 1.6649741455846447, + "learning_rate": 1e-05, + "loss": 1.138, + "step": 2131 + }, + { + "epoch": 0.9605767064654201, + "grad_norm": 1.6951738897843966, + "learning_rate": 1e-05, + "loss": 1.202, + "step": 2132 + }, + { + "epoch": 0.9610272583915296, + "grad_norm": 1.6004134883352765, + "learning_rate": 1e-05, + "loss": 1.2104, + "step": 2133 + }, + { + "epoch": 0.9614778103176391, + "grad_norm": 1.6271378214931058, + "learning_rate": 1e-05, + "loss": 1.1665, + "step": 2134 + }, + { + "epoch": 0.9619283622437486, + "grad_norm": 1.5995941621214527, + "learning_rate": 1e-05, + "loss": 1.1781, + "step": 2135 + }, + { + "epoch": 0.9623789141698581, + "grad_norm": 1.6152784854202955, + "learning_rate": 1e-05, + "loss": 1.2213, + "step": 2136 + }, + { + "epoch": 0.9628294660959675, + "grad_norm": 1.8385052853015975, + "learning_rate": 1e-05, + "loss": 1.1412, + "step": 2137 + }, + { + "epoch": 0.9632800180220771, + "grad_norm": 1.7068694440237158, + "learning_rate": 1e-05, + "loss": 1.1898, + "step": 2138 + }, + { + "epoch": 0.9637305699481865, + "grad_norm": 1.8055622353281682, + "learning_rate": 1e-05, + "loss": 1.2078, + "step": 2139 + }, + { + "epoch": 0.964181121874296, + "grad_norm": 1.5892068342229684, + "learning_rate": 1e-05, + "loss": 1.215, + "step": 2140 + }, + { + "epoch": 0.9646316738004055, + "grad_norm": 1.6626282035716575, + "learning_rate": 1e-05, + "loss": 1.1684, + "step": 2141 + }, + { + "epoch": 0.965082225726515, + "grad_norm": 1.5815313004686862, + "learning_rate": 1e-05, + "loss": 1.1426, + "step": 2142 + }, + { + "epoch": 0.9655327776526245, + "grad_norm": 1.6104144911672769, + "learning_rate": 1e-05, + "loss": 1.2065, + "step": 2143 + }, + { + "epoch": 0.9659833295787339, + "grad_norm": 1.7223066918002348, + "learning_rate": 1e-05, + "loss": 1.216, + "step": 2144 + }, + { + "epoch": 0.9664338815048434, + "grad_norm": 1.626978120340767, + "learning_rate": 1e-05, + "loss": 1.1555, + "step": 2145 + }, + { + "epoch": 0.9668844334309529, + "grad_norm": 1.5977128449160958, + "learning_rate": 1e-05, + "loss": 1.1722, + "step": 2146 + }, + { + "epoch": 0.9673349853570624, + "grad_norm": 1.7382985565577844, + "learning_rate": 1e-05, + "loss": 1.1813, + "step": 2147 + }, + { + "epoch": 0.9677855372831718, + "grad_norm": 1.7100089095382642, + "learning_rate": 1e-05, + "loss": 1.186, + "step": 2148 + }, + { + "epoch": 0.9682360892092814, + "grad_norm": 1.5140761682833617, + "learning_rate": 1e-05, + "loss": 1.2091, + "step": 2149 + }, + { + "epoch": 0.9686866411353908, + "grad_norm": 1.631458054037132, + 
"learning_rate": 1e-05, + "loss": 1.1977, + "step": 2150 + }, + { + "epoch": 0.9691371930615004, + "grad_norm": 1.9407101557307815, + "learning_rate": 1e-05, + "loss": 1.244, + "step": 2151 + }, + { + "epoch": 0.9695877449876098, + "grad_norm": 1.644705809226952, + "learning_rate": 1e-05, + "loss": 1.1926, + "step": 2152 + }, + { + "epoch": 0.9700382969137193, + "grad_norm": 1.7738926775892274, + "learning_rate": 1e-05, + "loss": 1.1801, + "step": 2153 + }, + { + "epoch": 0.9704888488398288, + "grad_norm": 1.632866466849786, + "learning_rate": 1e-05, + "loss": 1.1703, + "step": 2154 + }, + { + "epoch": 0.9709394007659383, + "grad_norm": 1.6489079715720707, + "learning_rate": 1e-05, + "loss": 1.1688, + "step": 2155 + }, + { + "epoch": 0.9713899526920478, + "grad_norm": 1.6991057253621062, + "learning_rate": 1e-05, + "loss": 1.1681, + "step": 2156 + }, + { + "epoch": 0.9718405046181573, + "grad_norm": 1.6331345480206434, + "learning_rate": 1e-05, + "loss": 1.1734, + "step": 2157 + }, + { + "epoch": 0.9722910565442667, + "grad_norm": 1.6750211080976438, + "learning_rate": 1e-05, + "loss": 1.2016, + "step": 2158 + }, + { + "epoch": 0.9727416084703762, + "grad_norm": 1.750153664716623, + "learning_rate": 1e-05, + "loss": 1.1745, + "step": 2159 + }, + { + "epoch": 0.9731921603964857, + "grad_norm": 1.6629057903262463, + "learning_rate": 1e-05, + "loss": 1.1462, + "step": 2160 + }, + { + "epoch": 0.9736427123225951, + "grad_norm": 1.633481623623148, + "learning_rate": 1e-05, + "loss": 1.212, + "step": 2161 + }, + { + "epoch": 0.9740932642487047, + "grad_norm": 1.6677123407725638, + "learning_rate": 1e-05, + "loss": 1.1675, + "step": 2162 + }, + { + "epoch": 0.9745438161748141, + "grad_norm": 1.6603950206096891, + "learning_rate": 1e-05, + "loss": 1.1501, + "step": 2163 + }, + { + "epoch": 0.9749943681009237, + "grad_norm": 1.5383013282675635, + "learning_rate": 1e-05, + "loss": 1.167, + "step": 2164 + }, + { + "epoch": 0.9754449200270331, + "grad_norm": 1.6339350842028757, + "learning_rate": 1e-05, + "loss": 1.1703, + "step": 2165 + }, + { + "epoch": 0.9758954719531426, + "grad_norm": 1.6419327750701314, + "learning_rate": 1e-05, + "loss": 1.2046, + "step": 2166 + }, + { + "epoch": 0.9763460238792521, + "grad_norm": 1.6071485373938563, + "learning_rate": 1e-05, + "loss": 1.1564, + "step": 2167 + }, + { + "epoch": 0.9767965758053616, + "grad_norm": 1.6645829201081386, + "learning_rate": 1e-05, + "loss": 1.1567, + "step": 2168 + }, + { + "epoch": 0.977247127731471, + "grad_norm": 1.834241571711109, + "learning_rate": 1e-05, + "loss": 1.1959, + "step": 2169 + }, + { + "epoch": 0.9776976796575806, + "grad_norm": 1.7851782257705597, + "learning_rate": 1e-05, + "loss": 1.1073, + "step": 2170 + }, + { + "epoch": 0.97814823158369, + "grad_norm": 1.6662785700861493, + "learning_rate": 1e-05, + "loss": 1.2193, + "step": 2171 + }, + { + "epoch": 0.9785987835097995, + "grad_norm": 1.6137357355336408, + "learning_rate": 1e-05, + "loss": 1.1989, + "step": 2172 + }, + { + "epoch": 0.979049335435909, + "grad_norm": 1.769977579305047, + "learning_rate": 1e-05, + "loss": 1.1943, + "step": 2173 + }, + { + "epoch": 0.9794998873620184, + "grad_norm": 1.5440600304095895, + "learning_rate": 1e-05, + "loss": 1.2057, + "step": 2174 + }, + { + "epoch": 0.979950439288128, + "grad_norm": 1.5650860905655992, + "learning_rate": 1e-05, + "loss": 1.163, + "step": 2175 + }, + { + "epoch": 0.9804009912142374, + "grad_norm": 1.683105123119004, + "learning_rate": 1e-05, + "loss": 1.1673, + "step": 2176 + }, + { + "epoch": 
0.980851543140347, + "grad_norm": 1.6772024747381555, + "learning_rate": 1e-05, + "loss": 1.2483, + "step": 2177 + }, + { + "epoch": 0.9813020950664564, + "grad_norm": 1.9429115939809587, + "learning_rate": 1e-05, + "loss": 1.1323, + "step": 2178 + }, + { + "epoch": 0.9817526469925659, + "grad_norm": 1.7168014980135007, + "learning_rate": 1e-05, + "loss": 1.1686, + "step": 2179 + }, + { + "epoch": 0.9822031989186754, + "grad_norm": 1.627958580173763, + "learning_rate": 1e-05, + "loss": 1.1377, + "step": 2180 + }, + { + "epoch": 0.9826537508447849, + "grad_norm": 1.8630592315550862, + "learning_rate": 1e-05, + "loss": 1.197, + "step": 2181 + }, + { + "epoch": 0.9831043027708943, + "grad_norm": 1.5226363816071424, + "learning_rate": 1e-05, + "loss": 1.2748, + "step": 2182 + }, + { + "epoch": 0.9835548546970039, + "grad_norm": 1.640663627306306, + "learning_rate": 1e-05, + "loss": 1.1931, + "step": 2183 + }, + { + "epoch": 0.9840054066231133, + "grad_norm": 1.792672713213748, + "learning_rate": 1e-05, + "loss": 1.1694, + "step": 2184 + }, + { + "epoch": 0.9844559585492227, + "grad_norm": 1.657565748047637, + "learning_rate": 1e-05, + "loss": 1.1532, + "step": 2185 + }, + { + "epoch": 0.9849065104753323, + "grad_norm": 1.599593629920567, + "learning_rate": 1e-05, + "loss": 1.2105, + "step": 2186 + }, + { + "epoch": 0.9853570624014417, + "grad_norm": 1.6978249473374794, + "learning_rate": 1e-05, + "loss": 1.1582, + "step": 2187 + }, + { + "epoch": 0.9858076143275513, + "grad_norm": 1.6437621469181267, + "learning_rate": 1e-05, + "loss": 1.1904, + "step": 2188 + }, + { + "epoch": 0.9862581662536607, + "grad_norm": 1.6908522774335355, + "learning_rate": 1e-05, + "loss": 1.1329, + "step": 2189 + }, + { + "epoch": 0.9867087181797702, + "grad_norm": 1.7108282497456526, + "learning_rate": 1e-05, + "loss": 1.1667, + "step": 2190 + }, + { + "epoch": 0.9871592701058797, + "grad_norm": 1.650745417574159, + "learning_rate": 1e-05, + "loss": 1.1542, + "step": 2191 + }, + { + "epoch": 0.9876098220319892, + "grad_norm": 1.8415193285070817, + "learning_rate": 1e-05, + "loss": 1.1436, + "step": 2192 + }, + { + "epoch": 0.9880603739580986, + "grad_norm": 1.7265003600000148, + "learning_rate": 1e-05, + "loss": 1.1802, + "step": 2193 + }, + { + "epoch": 0.9885109258842082, + "grad_norm": 1.809638358823917, + "learning_rate": 1e-05, + "loss": 1.1701, + "step": 2194 + }, + { + "epoch": 0.9889614778103176, + "grad_norm": 1.6772766425201677, + "learning_rate": 1e-05, + "loss": 1.1852, + "step": 2195 + }, + { + "epoch": 0.9894120297364272, + "grad_norm": 1.7234560256340692, + "learning_rate": 1e-05, + "loss": 1.2143, + "step": 2196 + }, + { + "epoch": 0.9898625816625366, + "grad_norm": 1.7230785087089597, + "learning_rate": 1e-05, + "loss": 1.1305, + "step": 2197 + }, + { + "epoch": 0.9903131335886461, + "grad_norm": 1.5677372020042342, + "learning_rate": 1e-05, + "loss": 1.2346, + "step": 2198 + }, + { + "epoch": 0.9907636855147556, + "grad_norm": 1.6534627672302962, + "learning_rate": 1e-05, + "loss": 1.1358, + "step": 2199 + }, + { + "epoch": 0.991214237440865, + "grad_norm": 1.6677962601647158, + "learning_rate": 1e-05, + "loss": 1.2025, + "step": 2200 + }, + { + "epoch": 0.9916647893669746, + "grad_norm": 1.5957431691240571, + "learning_rate": 1e-05, + "loss": 1.2021, + "step": 2201 + }, + { + "epoch": 0.992115341293084, + "grad_norm": 1.6825396781316948, + "learning_rate": 1e-05, + "loss": 1.1906, + "step": 2202 + }, + { + "epoch": 0.9925658932191935, + "grad_norm": 1.631478160156966, + "learning_rate": 1e-05, + 
"loss": 1.2176, + "step": 2203 + }, + { + "epoch": 0.993016445145303, + "grad_norm": 1.6181064708610031, + "learning_rate": 1e-05, + "loss": 1.1497, + "step": 2204 + }, + { + "epoch": 0.9934669970714125, + "grad_norm": 1.6731227448613548, + "learning_rate": 1e-05, + "loss": 1.2529, + "step": 2205 + }, + { + "epoch": 0.9939175489975219, + "grad_norm": 1.7203610581692894, + "learning_rate": 1e-05, + "loss": 1.2136, + "step": 2206 + }, + { + "epoch": 0.9943681009236315, + "grad_norm": 1.641928282303174, + "learning_rate": 1e-05, + "loss": 1.2041, + "step": 2207 + }, + { + "epoch": 0.9948186528497409, + "grad_norm": 1.753504116694314, + "learning_rate": 1e-05, + "loss": 1.2071, + "step": 2208 + }, + { + "epoch": 0.9952692047758505, + "grad_norm": 1.7634227391080495, + "learning_rate": 1e-05, + "loss": 1.1856, + "step": 2209 + }, + { + "epoch": 0.9957197567019599, + "grad_norm": 1.6307678361862732, + "learning_rate": 1e-05, + "loss": 1.1747, + "step": 2210 + }, + { + "epoch": 0.9961703086280694, + "grad_norm": 1.676694512275526, + "learning_rate": 1e-05, + "loss": 1.1732, + "step": 2211 + }, + { + "epoch": 0.9966208605541789, + "grad_norm": 1.5468525592889175, + "learning_rate": 1e-05, + "loss": 1.1757, + "step": 2212 + }, + { + "epoch": 0.9970714124802883, + "grad_norm": 1.762742264188603, + "learning_rate": 1e-05, + "loss": 1.1747, + "step": 2213 + }, + { + "epoch": 0.9975219644063978, + "grad_norm": 1.606944223414691, + "learning_rate": 1e-05, + "loss": 1.1967, + "step": 2214 + }, + { + "epoch": 0.9979725163325073, + "grad_norm": 1.603067947399285, + "learning_rate": 1e-05, + "loss": 1.1813, + "step": 2215 + }, + { + "epoch": 0.9984230682586168, + "grad_norm": 1.6802311418144311, + "learning_rate": 1e-05, + "loss": 1.2049, + "step": 2216 + }, + { + "epoch": 0.9988736201847263, + "grad_norm": 1.4778117214314126, + "learning_rate": 1e-05, + "loss": 1.1737, + "step": 2217 + }, + { + "epoch": 0.9993241721108358, + "grad_norm": 1.6302785209744532, + "learning_rate": 1e-05, + "loss": 1.1906, + "step": 2218 + }, + { + "epoch": 0.9997747240369452, + "grad_norm": 1.568318632001129, + "learning_rate": 1e-05, + "loss": 1.191, + "step": 2219 + }, + { + "epoch": 0.9997747240369452, + "step": 2219, + "total_flos": 3591718242877440.0, + "train_loss": 1.239272991082001, + "train_runtime": 334031.5377, + "train_samples_per_second": 0.851, + "train_steps_per_second": 0.007 + } + ], + "logging_steps": 1.0, + "max_steps": 2219, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3591718242877440.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}