{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5118, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019538882375928098, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 18.159, "step": 1 }, { "epoch": 0.00039077764751856197, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 18.2968, "step": 2 }, { "epoch": 0.0005861664712778429, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 18.881, "step": 3 }, { "epoch": 0.0007815552950371239, "grad_norm": 16.800939559936523, "learning_rate": 1.298701298701299e-07, "loss": 18.863, "step": 4 }, { "epoch": 0.0009769441187964048, "grad_norm": 16.903465270996094, "learning_rate": 2.597402597402598e-07, "loss": 19.0107, "step": 5 }, { "epoch": 0.0011723329425556857, "grad_norm": 17.979801177978516, "learning_rate": 3.896103896103896e-07, "loss": 18.9991, "step": 6 }, { "epoch": 0.0013677217663149667, "grad_norm": 17.979801177978516, "learning_rate": 3.896103896103896e-07, "loss": 18.9156, "step": 7 }, { "epoch": 0.0015631105900742479, "grad_norm": 19.2469425201416, "learning_rate": 5.194805194805196e-07, "loss": 19.1087, "step": 8 }, { "epoch": 0.0017584994138335288, "grad_norm": 19.234683990478516, "learning_rate": 6.493506493506493e-07, "loss": 19.2963, "step": 9 }, { "epoch": 0.0019538882375928096, "grad_norm": 15.376220703125, "learning_rate": 7.792207792207792e-07, "loss": 17.9463, "step": 10 }, { "epoch": 0.0021492770613520907, "grad_norm": 15.44509506225586, "learning_rate": 9.090909090909091e-07, "loss": 17.3, "step": 11 }, { "epoch": 0.0023446658851113715, "grad_norm": 17.092782974243164, "learning_rate": 1.0389610389610392e-06, "loss": 19.3398, "step": 12 }, { "epoch": 0.0025400547088706526, "grad_norm": 18.010583877563477, "learning_rate": 1.168831168831169e-06, "loss": 18.7621, "step": 13 }, { "epoch": 0.0027354435326299334, "grad_norm": 15.407889366149902, "learning_rate": 1.2987012987012986e-06, "loss": 18.2084, "step": 14 }, { "epoch": 0.0029308323563892145, "grad_norm": 15.01679801940918, "learning_rate": 1.4285714285714286e-06, "loss": 17.2003, "step": 15 }, { "epoch": 0.0031262211801484957, "grad_norm": 16.848499298095703, "learning_rate": 1.5584415584415584e-06, "loss": 20.4405, "step": 16 }, { "epoch": 0.0033216100039077765, "grad_norm": 15.197953224182129, "learning_rate": 1.6883116883116885e-06, "loss": 16.384, "step": 17 }, { "epoch": 0.0035169988276670576, "grad_norm": 16.76958465576172, "learning_rate": 1.8181818181818183e-06, "loss": 17.7668, "step": 18 }, { "epoch": 0.0037123876514263384, "grad_norm": 18.03081512451172, "learning_rate": 1.9480519480519483e-06, "loss": 19.181, "step": 19 }, { "epoch": 0.003907776475185619, "grad_norm": 17.887229919433594, "learning_rate": 2.0779220779220784e-06, "loss": 18.8617, "step": 20 }, { "epoch": 0.004103165298944901, "grad_norm": 17.887229919433594, "learning_rate": 2.0779220779220784e-06, "loss": 18.5894, "step": 21 }, { "epoch": 0.0042985541227041815, "grad_norm": 15.389242172241211, "learning_rate": 2.207792207792208e-06, "loss": 17.2314, "step": 22 }, { "epoch": 0.004493942946463462, "grad_norm": 14.811219215393066, "learning_rate": 2.337662337662338e-06, "loss": 16.7673, "step": 23 }, { "epoch": 0.004689331770222743, "grad_norm": 17.104415893554688, "learning_rate": 2.4675324675324676e-06, "loss": 17.2118, "step": 24 }, { "epoch": 0.0048847205939820245, "grad_norm": 17.192047119140625, "learning_rate": 2.597402597402597e-06, "loss": 15.7282, "step": 25 }, { "epoch": 0.005080109417741305, "grad_norm": 18.115428924560547, "learning_rate": 2.7272727272727272e-06, "loss": 18.8801, "step": 26 }, { "epoch": 0.005275498241500586, "grad_norm": 16.415836334228516, "learning_rate": 2.8571428571428573e-06, "loss": 16.1387, "step": 27 }, { "epoch": 0.005470887065259867, "grad_norm": 15.510481834411621, "learning_rate": 2.9870129870129873e-06, "loss": 15.881, "step": 28 }, { "epoch": 0.005666275889019148, "grad_norm": 14.883086204528809, "learning_rate": 3.116883116883117e-06, "loss": 17.2384, "step": 29 }, { "epoch": 0.005861664712778429, "grad_norm": 17.018898010253906, "learning_rate": 3.246753246753247e-06, "loss": 17.8932, "step": 30 }, { "epoch": 0.00605705353653771, "grad_norm": 17.309856414794922, "learning_rate": 3.376623376623377e-06, "loss": 17.3969, "step": 31 }, { "epoch": 0.0062524423602969914, "grad_norm": 16.117368698120117, "learning_rate": 3.506493506493507e-06, "loss": 17.9658, "step": 32 }, { "epoch": 0.006447831184056272, "grad_norm": 11.697212219238281, "learning_rate": 3.6363636363636366e-06, "loss": 13.7439, "step": 33 }, { "epoch": 0.006643220007815553, "grad_norm": 12.093951225280762, "learning_rate": 3.7662337662337666e-06, "loss": 15.5621, "step": 34 }, { "epoch": 0.006838608831574834, "grad_norm": 15.0689058303833, "learning_rate": 3.896103896103897e-06, "loss": 17.4019, "step": 35 }, { "epoch": 0.007033997655334115, "grad_norm": 13.318926811218262, "learning_rate": 4.025974025974026e-06, "loss": 16.8068, "step": 36 }, { "epoch": 0.007229386479093396, "grad_norm": 15.324155807495117, "learning_rate": 4.155844155844157e-06, "loss": 16.4426, "step": 37 }, { "epoch": 0.007424775302852677, "grad_norm": 14.826874732971191, "learning_rate": 4.2857142857142855e-06, "loss": 16.845, "step": 38 }, { "epoch": 0.0076201641266119575, "grad_norm": 13.863877296447754, "learning_rate": 4.415584415584416e-06, "loss": 17.7616, "step": 39 }, { "epoch": 0.007815552950371238, "grad_norm": 17.50107192993164, "learning_rate": 4.5454545454545455e-06, "loss": 17.392, "step": 40 }, { "epoch": 0.008010941774130519, "grad_norm": 16.738174438476562, "learning_rate": 4.675324675324676e-06, "loss": 16.4532, "step": 41 }, { "epoch": 0.008206330597889801, "grad_norm": 9.62713623046875, "learning_rate": 4.805194805194806e-06, "loss": 15.1813, "step": 42 }, { "epoch": 0.008401719421649082, "grad_norm": 14.606769561767578, "learning_rate": 4.935064935064935e-06, "loss": 17.5544, "step": 43 }, { "epoch": 0.008597108245408363, "grad_norm": 14.058442115783691, "learning_rate": 5.064935064935065e-06, "loss": 15.967, "step": 44 }, { "epoch": 0.008792497069167644, "grad_norm": 15.973811149597168, "learning_rate": 5.194805194805194e-06, "loss": 17.6413, "step": 45 }, { "epoch": 0.008987885892926924, "grad_norm": 14.27064037322998, "learning_rate": 5.324675324675325e-06, "loss": 16.7137, "step": 46 }, { "epoch": 0.009183274716686205, "grad_norm": 12.279301643371582, "learning_rate": 5.4545454545454545e-06, "loss": 14.5723, "step": 47 }, { "epoch": 0.009378663540445486, "grad_norm": 13.54649543762207, "learning_rate": 5.584415584415585e-06, "loss": 15.7591, "step": 48 }, { "epoch": 0.009574052364204768, "grad_norm": 11.76611042022705, "learning_rate": 5.7142857142857145e-06, "loss": 15.5008, "step": 49 }, { "epoch": 0.009769441187964049, "grad_norm": 11.76115894317627, "learning_rate": 5.844155844155844e-06, "loss": 14.9208, "step": 50 }, { "epoch": 0.00996483001172333, "grad_norm": 12.934619903564453, "learning_rate": 5.9740259740259746e-06, "loss": 14.8215, "step": 51 }, { "epoch": 0.01016021883548261, "grad_norm": 11.615428924560547, "learning_rate": 6.103896103896104e-06, "loss": 14.7069, "step": 52 }, { "epoch": 0.010355607659241891, "grad_norm": 11.138904571533203, "learning_rate": 6.233766233766234e-06, "loss": 15.4979, "step": 53 }, { "epoch": 0.010550996483001172, "grad_norm": 11.6993408203125, "learning_rate": 6.363636363636364e-06, "loss": 15.0529, "step": 54 }, { "epoch": 0.010746385306760453, "grad_norm": 12.620027542114258, "learning_rate": 6.493506493506494e-06, "loss": 16.0377, "step": 55 }, { "epoch": 0.010941774130519734, "grad_norm": 11.459622383117676, "learning_rate": 6.623376623376624e-06, "loss": 14.9392, "step": 56 }, { "epoch": 0.011137162954279016, "grad_norm": 11.75425910949707, "learning_rate": 6.753246753246754e-06, "loss": 14.584, "step": 57 }, { "epoch": 0.011332551778038297, "grad_norm": 11.240898132324219, "learning_rate": 6.8831168831168835e-06, "loss": 12.9321, "step": 58 }, { "epoch": 0.011527940601797577, "grad_norm": 12.680749893188477, "learning_rate": 7.012987012987014e-06, "loss": 15.2367, "step": 59 }, { "epoch": 0.011723329425556858, "grad_norm": 12.971368789672852, "learning_rate": 7.1428571428571436e-06, "loss": 14.4319, "step": 60 }, { "epoch": 0.011918718249316139, "grad_norm": 12.750476837158203, "learning_rate": 7.272727272727273e-06, "loss": 15.405, "step": 61 }, { "epoch": 0.01211410707307542, "grad_norm": 13.858943939208984, "learning_rate": 7.402597402597404e-06, "loss": 14.7354, "step": 62 }, { "epoch": 0.0123094958968347, "grad_norm": 11.960023880004883, "learning_rate": 7.532467532467533e-06, "loss": 14.515, "step": 63 }, { "epoch": 0.012504884720593983, "grad_norm": 13.708508491516113, "learning_rate": 7.662337662337663e-06, "loss": 14.8207, "step": 64 }, { "epoch": 0.012700273544353264, "grad_norm": 11.947013854980469, "learning_rate": 7.792207792207793e-06, "loss": 14.1906, "step": 65 }, { "epoch": 0.012895662368112544, "grad_norm": 13.315733909606934, "learning_rate": 7.922077922077924e-06, "loss": 13.5782, "step": 66 }, { "epoch": 0.013091051191871825, "grad_norm": 11.261037826538086, "learning_rate": 8.051948051948052e-06, "loss": 13.8077, "step": 67 }, { "epoch": 0.013286440015631106, "grad_norm": 11.723245620727539, "learning_rate": 8.181818181818183e-06, "loss": 12.6595, "step": 68 }, { "epoch": 0.013481828839390387, "grad_norm": 13.365504264831543, "learning_rate": 8.311688311688313e-06, "loss": 15.6787, "step": 69 }, { "epoch": 0.013677217663149667, "grad_norm": 13.995412826538086, "learning_rate": 8.441558441558442e-06, "loss": 14.7963, "step": 70 }, { "epoch": 0.013872606486908948, "grad_norm": 12.938873291015625, "learning_rate": 8.571428571428571e-06, "loss": 13.5083, "step": 71 }, { "epoch": 0.01406799531066823, "grad_norm": 12.961465835571289, "learning_rate": 8.701298701298701e-06, "loss": 15.0235, "step": 72 }, { "epoch": 0.014263384134427511, "grad_norm": 12.395155906677246, "learning_rate": 8.831168831168832e-06, "loss": 14.7144, "step": 73 }, { "epoch": 0.014458772958186792, "grad_norm": 12.128456115722656, "learning_rate": 8.96103896103896e-06, "loss": 13.4361, "step": 74 }, { "epoch": 0.014654161781946073, "grad_norm": 12.000514030456543, "learning_rate": 9.090909090909091e-06, "loss": 12.8412, "step": 75 }, { "epoch": 0.014849550605705353, "grad_norm": 14.545674324035645, "learning_rate": 9.220779220779221e-06, "loss": 14.0329, "step": 76 }, { "epoch": 0.015044939429464634, "grad_norm": 15.808619499206543, "learning_rate": 9.350649350649352e-06, "loss": 14.0651, "step": 77 }, { "epoch": 0.015240328253223915, "grad_norm": 14.302594184875488, "learning_rate": 9.48051948051948e-06, "loss": 14.3772, "step": 78 }, { "epoch": 0.015435717076983196, "grad_norm": 14.302594184875488, "learning_rate": 9.48051948051948e-06, "loss": 14.3133, "step": 79 }, { "epoch": 0.015631105900742476, "grad_norm": 14.733464241027832, "learning_rate": 9.610389610389611e-06, "loss": 13.9054, "step": 80 }, { "epoch": 0.015826494724501757, "grad_norm": 14.733464241027832, "learning_rate": 9.610389610389611e-06, "loss": 14.3721, "step": 81 }, { "epoch": 0.016021883548261038, "grad_norm": 13.027717590332031, "learning_rate": 9.740259740259742e-06, "loss": 13.9081, "step": 82 }, { "epoch": 0.016217272372020322, "grad_norm": 13.918047904968262, "learning_rate": 9.87012987012987e-06, "loss": 14.737, "step": 83 }, { "epoch": 0.016412661195779603, "grad_norm": 12.087900161743164, "learning_rate": 1e-05, "loss": 13.6766, "step": 84 }, { "epoch": 0.016608050019538884, "grad_norm": 12.087900161743164, "learning_rate": 1e-05, "loss": 14.2403, "step": 85 }, { "epoch": 0.016803438843298164, "grad_norm": 13.96923828125, "learning_rate": 1.012987012987013e-05, "loss": 13.2121, "step": 86 }, { "epoch": 0.016998827667057445, "grad_norm": 11.908881187438965, "learning_rate": 1.025974025974026e-05, "loss": 13.5275, "step": 87 }, { "epoch": 0.017194216490816726, "grad_norm": 12.391776084899902, "learning_rate": 1.0389610389610389e-05, "loss": 14.2606, "step": 88 }, { "epoch": 0.017389605314576007, "grad_norm": 12.35890007019043, "learning_rate": 1.0519480519480521e-05, "loss": 14.1348, "step": 89 }, { "epoch": 0.017584994138335287, "grad_norm": 12.73941421508789, "learning_rate": 1.064935064935065e-05, "loss": 13.7867, "step": 90 }, { "epoch": 0.017780382962094568, "grad_norm": 12.822901725769043, "learning_rate": 1.077922077922078e-05, "loss": 13.2284, "step": 91 }, { "epoch": 0.01797577178585385, "grad_norm": 11.797019004821777, "learning_rate": 1.0909090909090909e-05, "loss": 12.2814, "step": 92 }, { "epoch": 0.01817116060961313, "grad_norm": 10.80223560333252, "learning_rate": 1.1038961038961041e-05, "loss": 12.0659, "step": 93 }, { "epoch": 0.01836654943337241, "grad_norm": 12.917786598205566, "learning_rate": 1.116883116883117e-05, "loss": 15.0527, "step": 94 }, { "epoch": 0.01856193825713169, "grad_norm": 11.387985229492188, "learning_rate": 1.12987012987013e-05, "loss": 13.0093, "step": 95 }, { "epoch": 0.01875732708089097, "grad_norm": 12.817272186279297, "learning_rate": 1.1428571428571429e-05, "loss": 13.1882, "step": 96 }, { "epoch": 0.018952715904650252, "grad_norm": 13.179692268371582, "learning_rate": 1.155844155844156e-05, "loss": 13.6843, "step": 97 }, { "epoch": 0.019148104728409537, "grad_norm": 13.599560737609863, "learning_rate": 1.1688311688311688e-05, "loss": 13.4325, "step": 98 }, { "epoch": 0.019343493552168817, "grad_norm": 12.171883583068848, "learning_rate": 1.181818181818182e-05, "loss": 13.1249, "step": 99 }, { "epoch": 0.019538882375928098, "grad_norm": 12.852227210998535, "learning_rate": 1.1948051948051949e-05, "loss": 11.6582, "step": 100 }, { "epoch": 0.01973427119968738, "grad_norm": 14.056069374084473, "learning_rate": 1.207792207792208e-05, "loss": 14.5759, "step": 101 }, { "epoch": 0.01992966002344666, "grad_norm": 12.38291072845459, "learning_rate": 1.2207792207792208e-05, "loss": 13.5927, "step": 102 }, { "epoch": 0.02012504884720594, "grad_norm": 11.649094581604004, "learning_rate": 1.2337662337662339e-05, "loss": 13.3672, "step": 103 }, { "epoch": 0.02032043767096522, "grad_norm": 12.83110237121582, "learning_rate": 1.2467532467532468e-05, "loss": 14.0482, "step": 104 }, { "epoch": 0.020515826494724502, "grad_norm": 13.384620666503906, "learning_rate": 1.25974025974026e-05, "loss": 12.7853, "step": 105 }, { "epoch": 0.020711215318483783, "grad_norm": 17.137697219848633, "learning_rate": 1.2727272727272728e-05, "loss": 13.875, "step": 106 }, { "epoch": 0.020906604142243063, "grad_norm": 11.776130676269531, "learning_rate": 1.2857142857142859e-05, "loss": 12.762, "step": 107 }, { "epoch": 0.021101992966002344, "grad_norm": 13.227246284484863, "learning_rate": 1.2987012987012988e-05, "loss": 13.6374, "step": 108 }, { "epoch": 0.021297381789761625, "grad_norm": 11.719467163085938, "learning_rate": 1.311688311688312e-05, "loss": 13.027, "step": 109 }, { "epoch": 0.021492770613520906, "grad_norm": 11.77922248840332, "learning_rate": 1.3246753246753249e-05, "loss": 12.9044, "step": 110 }, { "epoch": 0.021688159437280186, "grad_norm": 13.76087760925293, "learning_rate": 1.3376623376623377e-05, "loss": 12.3403, "step": 111 }, { "epoch": 0.021883548261039467, "grad_norm": 10.988027572631836, "learning_rate": 1.3506493506493508e-05, "loss": 12.9417, "step": 112 }, { "epoch": 0.02207893708479875, "grad_norm": 13.056401252746582, "learning_rate": 1.3636363636363637e-05, "loss": 12.7452, "step": 113 }, { "epoch": 0.022274325908558032, "grad_norm": 9.933206558227539, "learning_rate": 1.3766233766233767e-05, "loss": 13.4601, "step": 114 }, { "epoch": 0.022469714732317313, "grad_norm": 13.234012603759766, "learning_rate": 1.3896103896103896e-05, "loss": 13.0663, "step": 115 }, { "epoch": 0.022665103556076593, "grad_norm": 13.284384727478027, "learning_rate": 1.4025974025974028e-05, "loss": 12.7403, "step": 116 }, { "epoch": 0.022860492379835874, "grad_norm": 14.263381958007812, "learning_rate": 1.4155844155844157e-05, "loss": 14.07, "step": 117 }, { "epoch": 0.023055881203595155, "grad_norm": 17.6854248046875, "learning_rate": 1.4285714285714287e-05, "loss": 13.6625, "step": 118 }, { "epoch": 0.023251270027354436, "grad_norm": 7.057964324951172, "learning_rate": 1.4415584415584416e-05, "loss": 11.2206, "step": 119 }, { "epoch": 0.023446658851113716, "grad_norm": 11.81092357635498, "learning_rate": 1.4545454545454546e-05, "loss": 11.8661, "step": 120 }, { "epoch": 0.023642047674872997, "grad_norm": 10.942760467529297, "learning_rate": 1.4675324675324675e-05, "loss": 12.795, "step": 121 }, { "epoch": 0.023837436498632278, "grad_norm": 10.352446556091309, "learning_rate": 1.4805194805194807e-05, "loss": 12.6598, "step": 122 }, { "epoch": 0.02403282532239156, "grad_norm": 8.63533878326416, "learning_rate": 1.4935064935064936e-05, "loss": 12.3454, "step": 123 }, { "epoch": 0.02422821414615084, "grad_norm": 7.654079437255859, "learning_rate": 1.5064935064935066e-05, "loss": 14.233, "step": 124 }, { "epoch": 0.02442360296991012, "grad_norm": 7.536815166473389, "learning_rate": 1.5194805194805195e-05, "loss": 13.4265, "step": 125 }, { "epoch": 0.0246189917936694, "grad_norm": 7.602650165557861, "learning_rate": 1.5324675324675326e-05, "loss": 11.8791, "step": 126 }, { "epoch": 0.02481438061742868, "grad_norm": 10.716212272644043, "learning_rate": 1.5454545454545454e-05, "loss": 13.0185, "step": 127 }, { "epoch": 0.025009769441187966, "grad_norm": 9.813230514526367, "learning_rate": 1.5584415584415587e-05, "loss": 12.7345, "step": 128 }, { "epoch": 0.025205158264947247, "grad_norm": 11.557696342468262, "learning_rate": 1.5714285714285715e-05, "loss": 11.8847, "step": 129 }, { "epoch": 0.025400547088706527, "grad_norm": 7.174719333648682, "learning_rate": 1.5844155844155847e-05, "loss": 13.8129, "step": 130 }, { "epoch": 0.025595935912465808, "grad_norm": 6.26338005065918, "learning_rate": 1.5974025974025976e-05, "loss": 13.1788, "step": 131 }, { "epoch": 0.02579132473622509, "grad_norm": 5.924908638000488, "learning_rate": 1.6103896103896105e-05, "loss": 12.8278, "step": 132 }, { "epoch": 0.02598671355998437, "grad_norm": 13.797770500183105, "learning_rate": 1.6233766233766234e-05, "loss": 11.9606, "step": 133 }, { "epoch": 0.02618210238374365, "grad_norm": 6.960059642791748, "learning_rate": 1.6363636363636366e-05, "loss": 13.5888, "step": 134 }, { "epoch": 0.02637749120750293, "grad_norm": 11.013506889343262, "learning_rate": 1.6493506493506495e-05, "loss": 10.7678, "step": 135 }, { "epoch": 0.02657288003126221, "grad_norm": 14.11983871459961, "learning_rate": 1.6623376623376627e-05, "loss": 12.7964, "step": 136 }, { "epoch": 0.026768268855021492, "grad_norm": 16.5230712890625, "learning_rate": 1.6753246753246756e-05, "loss": 13.8711, "step": 137 }, { "epoch": 0.026963657678780773, "grad_norm": 9.295743942260742, "learning_rate": 1.6883116883116884e-05, "loss": 12.0177, "step": 138 }, { "epoch": 0.027159046502540054, "grad_norm": 17.73920249938965, "learning_rate": 1.7012987012987013e-05, "loss": 13.5454, "step": 139 }, { "epoch": 0.027354435326299335, "grad_norm": 11.929683685302734, "learning_rate": 1.7142857142857142e-05, "loss": 11.7523, "step": 140 }, { "epoch": 0.027549824150058615, "grad_norm": 7.2855424880981445, "learning_rate": 1.7272727272727274e-05, "loss": 11.6333, "step": 141 }, { "epoch": 0.027745212973817896, "grad_norm": 7.469539165496826, "learning_rate": 1.7402597402597403e-05, "loss": 13.5007, "step": 142 }, { "epoch": 0.027940601797577177, "grad_norm": 7.750254154205322, "learning_rate": 1.7532467532467535e-05, "loss": 12.3127, "step": 143 }, { "epoch": 0.02813599062133646, "grad_norm": 8.031920433044434, "learning_rate": 1.7662337662337664e-05, "loss": 11.4355, "step": 144 }, { "epoch": 0.028331379445095742, "grad_norm": 12.954949378967285, "learning_rate": 1.7792207792207792e-05, "loss": 13.1203, "step": 145 }, { "epoch": 0.028526768268855023, "grad_norm": 8.291078567504883, "learning_rate": 1.792207792207792e-05, "loss": 13.5938, "step": 146 }, { "epoch": 0.028722157092614303, "grad_norm": 9.29495620727539, "learning_rate": 1.8051948051948053e-05, "loss": 13.3973, "step": 147 }, { "epoch": 0.028917545916373584, "grad_norm": 7.3590192794799805, "learning_rate": 1.8181818181818182e-05, "loss": 12.2823, "step": 148 }, { "epoch": 0.029112934740132865, "grad_norm": 13.084951400756836, "learning_rate": 1.8311688311688314e-05, "loss": 11.847, "step": 149 }, { "epoch": 0.029308323563892145, "grad_norm": 7.148357391357422, "learning_rate": 1.8441558441558443e-05, "loss": 12.9952, "step": 150 }, { "epoch": 0.029503712387651426, "grad_norm": 6.886097431182861, "learning_rate": 1.8571428571428575e-05, "loss": 12.9661, "step": 151 }, { "epoch": 0.029699101211410707, "grad_norm": 7.425508499145508, "learning_rate": 1.8701298701298704e-05, "loss": 12.0623, "step": 152 }, { "epoch": 0.029894490035169988, "grad_norm": 6.25291633605957, "learning_rate": 1.8831168831168833e-05, "loss": 12.4557, "step": 153 }, { "epoch": 0.03008987885892927, "grad_norm": 10.292274475097656, "learning_rate": 1.896103896103896e-05, "loss": 13.0426, "step": 154 }, { "epoch": 0.03028526768268855, "grad_norm": 5.456650733947754, "learning_rate": 1.9090909090909094e-05, "loss": 11.7076, "step": 155 }, { "epoch": 0.03048065650644783, "grad_norm": 9.853564262390137, "learning_rate": 1.9220779220779222e-05, "loss": 13.1965, "step": 156 }, { "epoch": 0.03067604533020711, "grad_norm": 6.572847366333008, "learning_rate": 1.9350649350649354e-05, "loss": 10.9203, "step": 157 }, { "epoch": 0.03087143415396639, "grad_norm": 5.121306419372559, "learning_rate": 1.9480519480519483e-05, "loss": 12.3719, "step": 158 }, { "epoch": 0.031066822977725676, "grad_norm": 7.446605682373047, "learning_rate": 1.9610389610389612e-05, "loss": 12.1669, "step": 159 }, { "epoch": 0.03126221180148495, "grad_norm": 9.406105995178223, "learning_rate": 1.974025974025974e-05, "loss": 12.3783, "step": 160 }, { "epoch": 0.031457600625244234, "grad_norm": 7.835089683532715, "learning_rate": 1.9870129870129873e-05, "loss": 11.9766, "step": 161 }, { "epoch": 0.031652989449003514, "grad_norm": 9.111235618591309, "learning_rate": 2e-05, "loss": 13.4484, "step": 162 }, { "epoch": 0.031848378272762795, "grad_norm": 9.407170295715332, "learning_rate": 1.9999997997344768e-05, "loss": 13.429, "step": 163 }, { "epoch": 0.032043767096522076, "grad_norm": 13.125298500061035, "learning_rate": 1.9999991989379874e-05, "loss": 13.0715, "step": 164 }, { "epoch": 0.03223915592028136, "grad_norm": 6.865466117858887, "learning_rate": 1.9999981976107724e-05, "loss": 12.4914, "step": 165 }, { "epoch": 0.032434544744040644, "grad_norm": 4.316725730895996, "learning_rate": 1.999996795753233e-05, "loss": 11.7864, "step": 166 }, { "epoch": 0.032629933567799925, "grad_norm": 7.447556972503662, "learning_rate": 1.9999949933659307e-05, "loss": 13.1166, "step": 167 }, { "epoch": 0.032825322391559206, "grad_norm": 6.406036376953125, "learning_rate": 1.999992790449587e-05, "loss": 12.461, "step": 168 }, { "epoch": 0.033020711215318486, "grad_norm": 7.764867305755615, "learning_rate": 1.9999901870050848e-05, "loss": 11.5924, "step": 169 }, { "epoch": 0.03321610003907777, "grad_norm": 7.2458624839782715, "learning_rate": 1.9999871830334667e-05, "loss": 12.7767, "step": 170 }, { "epoch": 0.03341148886283705, "grad_norm": 5.0661396980285645, "learning_rate": 1.9999837785359356e-05, "loss": 11.0157, "step": 171 }, { "epoch": 0.03360687768659633, "grad_norm": 7.504276752471924, "learning_rate": 1.999979973513855e-05, "loss": 10.8756, "step": 172 }, { "epoch": 0.03380226651035561, "grad_norm": 9.193746566772461, "learning_rate": 1.9999757679687498e-05, "loss": 13.5181, "step": 173 }, { "epoch": 0.03399765533411489, "grad_norm": 13.39449405670166, "learning_rate": 1.9999711619023036e-05, "loss": 13.0867, "step": 174 }, { "epoch": 0.03419304415787417, "grad_norm": 10.203667640686035, "learning_rate": 1.9999661553163615e-05, "loss": 11.7621, "step": 175 }, { "epoch": 0.03438843298163345, "grad_norm": 6.521947383880615, "learning_rate": 1.999960748212929e-05, "loss": 12.5624, "step": 176 }, { "epoch": 0.03458382180539273, "grad_norm": 7.569502830505371, "learning_rate": 1.9999549405941714e-05, "loss": 11.277, "step": 177 }, { "epoch": 0.03477921062915201, "grad_norm": 6.592679023742676, "learning_rate": 1.9999487324624152e-05, "loss": 12.7629, "step": 178 }, { "epoch": 0.034974599452911294, "grad_norm": 8.883557319641113, "learning_rate": 1.9999421238201465e-05, "loss": 11.6271, "step": 179 }, { "epoch": 0.035169988276670575, "grad_norm": 6.424564361572266, "learning_rate": 1.9999351146700133e-05, "loss": 12.3532, "step": 180 }, { "epoch": 0.035365377100429855, "grad_norm": 7.161871910095215, "learning_rate": 1.9999277050148217e-05, "loss": 12.4987, "step": 181 }, { "epoch": 0.035560765924189136, "grad_norm": 6.069089412689209, "learning_rate": 1.9999198948575404e-05, "loss": 11.6968, "step": 182 }, { "epoch": 0.03575615474794842, "grad_norm": 8.944799423217773, "learning_rate": 1.9999116842012973e-05, "loss": 12.5904, "step": 183 }, { "epoch": 0.0359515435717077, "grad_norm": 18.52510643005371, "learning_rate": 1.999903073049381e-05, "loss": 12.4171, "step": 184 }, { "epoch": 0.03614693239546698, "grad_norm": 5.86504602432251, "learning_rate": 1.9998940614052404e-05, "loss": 12.2995, "step": 185 }, { "epoch": 0.03634232121922626, "grad_norm": 7.921477794647217, "learning_rate": 1.999884649272485e-05, "loss": 12.1127, "step": 186 }, { "epoch": 0.03653771004298554, "grad_norm": 5.69157600402832, "learning_rate": 1.999874836654885e-05, "loss": 11.7619, "step": 187 }, { "epoch": 0.03673309886674482, "grad_norm": 5.952302932739258, "learning_rate": 1.9998646235563702e-05, "loss": 11.5656, "step": 188 }, { "epoch": 0.0369284876905041, "grad_norm": 5.239631175994873, "learning_rate": 1.9998540099810318e-05, "loss": 11.8899, "step": 189 }, { "epoch": 0.03712387651426338, "grad_norm": 5.957993507385254, "learning_rate": 1.9998429959331204e-05, "loss": 12.6495, "step": 190 }, { "epoch": 0.03731926533802266, "grad_norm": 6.800102233886719, "learning_rate": 1.9998315814170473e-05, "loss": 12.0542, "step": 191 }, { "epoch": 0.03751465416178194, "grad_norm": 7.192645072937012, "learning_rate": 1.999819766437385e-05, "loss": 12.5862, "step": 192 }, { "epoch": 0.037710042985541224, "grad_norm": 6.849038124084473, "learning_rate": 1.999807550998865e-05, "loss": 11.8042, "step": 193 }, { "epoch": 0.037905431809300505, "grad_norm": 5.00752592086792, "learning_rate": 1.9997949351063807e-05, "loss": 12.224, "step": 194 }, { "epoch": 0.038100820633059786, "grad_norm": 8.420522689819336, "learning_rate": 1.999781918764985e-05, "loss": 12.8256, "step": 195 }, { "epoch": 0.03829620945681907, "grad_norm": 7.47088098526001, "learning_rate": 1.9997685019798913e-05, "loss": 12.2079, "step": 196 }, { "epoch": 0.038491598280578354, "grad_norm": 8.45734691619873, "learning_rate": 1.9997546847564727e-05, "loss": 11.7802, "step": 197 }, { "epoch": 0.038686987104337635, "grad_norm": 10.523774147033691, "learning_rate": 1.9997404671002645e-05, "loss": 11.5177, "step": 198 }, { "epoch": 0.038882375928096916, "grad_norm": 14.023902893066406, "learning_rate": 1.999725849016961e-05, "loss": 13.3077, "step": 199 }, { "epoch": 0.039077764751856196, "grad_norm": 6.061577320098877, "learning_rate": 1.9997108305124165e-05, "loss": 10.655, "step": 200 }, { "epoch": 0.03927315357561548, "grad_norm": 18.192489624023438, "learning_rate": 1.9996954115926477e-05, "loss": 12.3448, "step": 201 }, { "epoch": 0.03946854239937476, "grad_norm": 7.7355780601501465, "learning_rate": 1.9996795922638293e-05, "loss": 11.3258, "step": 202 }, { "epoch": 0.03966393122313404, "grad_norm": 8.186224937438965, "learning_rate": 1.9996633725322977e-05, "loss": 13.2872, "step": 203 }, { "epoch": 0.03985932004689332, "grad_norm": 8.051215171813965, "learning_rate": 1.9996467524045492e-05, "loss": 12.226, "step": 204 }, { "epoch": 0.0400547088706526, "grad_norm": 8.462077140808105, "learning_rate": 1.9996297318872413e-05, "loss": 12.6791, "step": 205 }, { "epoch": 0.04025009769441188, "grad_norm": 7.602774620056152, "learning_rate": 1.999612310987191e-05, "loss": 12.0686, "step": 206 }, { "epoch": 0.04044548651817116, "grad_norm": 5.274927139282227, "learning_rate": 1.9995944897113753e-05, "loss": 13.5675, "step": 207 }, { "epoch": 0.04064087534193044, "grad_norm": 6.607327461242676, "learning_rate": 1.9995762680669326e-05, "loss": 11.4844, "step": 208 }, { "epoch": 0.04083626416568972, "grad_norm": 12.272229194641113, "learning_rate": 1.9995576460611617e-05, "loss": 13.1514, "step": 209 }, { "epoch": 0.041031652989449004, "grad_norm": 11.912178993225098, "learning_rate": 1.9995386237015208e-05, "loss": 11.9987, "step": 210 }, { "epoch": 0.041227041813208284, "grad_norm": 8.682504653930664, "learning_rate": 1.999519200995629e-05, "loss": 12.7482, "step": 211 }, { "epoch": 0.041422430636967565, "grad_norm": 12.171260833740234, "learning_rate": 1.9994993779512656e-05, "loss": 11.9504, "step": 212 }, { "epoch": 0.041617819460726846, "grad_norm": 15.512253761291504, "learning_rate": 1.9994791545763707e-05, "loss": 13.1384, "step": 213 }, { "epoch": 0.04181320828448613, "grad_norm": 7.158082962036133, "learning_rate": 1.9994585308790437e-05, "loss": 10.9148, "step": 214 }, { "epoch": 0.04200859710824541, "grad_norm": 7.843011379241943, "learning_rate": 1.999437506867546e-05, "loss": 12.889, "step": 215 }, { "epoch": 0.04220398593200469, "grad_norm": 7.392290115356445, "learning_rate": 1.9994160825502975e-05, "loss": 13.2473, "step": 216 }, { "epoch": 0.04239937475576397, "grad_norm": 7.660642623901367, "learning_rate": 1.99939425793588e-05, "loss": 11.0385, "step": 217 }, { "epoch": 0.04259476357952325, "grad_norm": 7.692232131958008, "learning_rate": 1.9993720330330343e-05, "loss": 12.1109, "step": 218 }, { "epoch": 0.04279015240328253, "grad_norm": 5.706768989562988, "learning_rate": 1.9993494078506626e-05, "loss": 12.2178, "step": 219 }, { "epoch": 0.04298554122704181, "grad_norm": 5.357436180114746, "learning_rate": 1.9993263823978268e-05, "loss": 12.0862, "step": 220 }, { "epoch": 0.04318093005080109, "grad_norm": 7.361543655395508, "learning_rate": 1.9993029566837493e-05, "loss": 13.2949, "step": 221 }, { "epoch": 0.04337631887456037, "grad_norm": 8.009748458862305, "learning_rate": 1.9992791307178133e-05, "loss": 10.9137, "step": 222 }, { "epoch": 0.04357170769831965, "grad_norm": 6.979041576385498, "learning_rate": 1.9992549045095607e-05, "loss": 12.2136, "step": 223 }, { "epoch": 0.043767096522078934, "grad_norm": 8.310110092163086, "learning_rate": 1.999230278068696e-05, "loss": 11.1071, "step": 224 }, { "epoch": 0.043962485345838215, "grad_norm": 8.9794282913208, "learning_rate": 1.9992052514050824e-05, "loss": 11.9055, "step": 225 }, { "epoch": 0.0441578741695975, "grad_norm": 15.550240516662598, "learning_rate": 1.9991798245287438e-05, "loss": 12.0324, "step": 226 }, { "epoch": 0.04435326299335678, "grad_norm": 7.737309455871582, "learning_rate": 1.999153997449865e-05, "loss": 12.1904, "step": 227 }, { "epoch": 0.044548651817116064, "grad_norm": 7.448500633239746, "learning_rate": 1.9991277701787895e-05, "loss": 11.7476, "step": 228 }, { "epoch": 0.044744040640875345, "grad_norm": 11.323410034179688, "learning_rate": 1.999101142726023e-05, "loss": 12.7668, "step": 229 }, { "epoch": 0.044939429464634625, "grad_norm": 10.129905700683594, "learning_rate": 1.9990741151022302e-05, "loss": 10.9619, "step": 230 }, { "epoch": 0.045134818288393906, "grad_norm": 10.254401206970215, "learning_rate": 1.9990466873182367e-05, "loss": 10.8593, "step": 231 }, { "epoch": 0.04533020711215319, "grad_norm": 9.408255577087402, "learning_rate": 1.999018859385028e-05, "loss": 11.9756, "step": 232 }, { "epoch": 0.04552559593591247, "grad_norm": 12.784259796142578, "learning_rate": 1.9989906313137502e-05, "loss": 12.2801, "step": 233 }, { "epoch": 0.04572098475967175, "grad_norm": 10.667810440063477, "learning_rate": 1.9989620031157098e-05, "loss": 12.3217, "step": 234 }, { "epoch": 0.04591637358343103, "grad_norm": 8.796360969543457, "learning_rate": 1.9989329748023728e-05, "loss": 12.3541, "step": 235 }, { "epoch": 0.04611176240719031, "grad_norm": 9.64390754699707, "learning_rate": 1.998903546385366e-05, "loss": 12.063, "step": 236 }, { "epoch": 0.04630715123094959, "grad_norm": 10.169878959655762, "learning_rate": 1.9988737178764764e-05, "loss": 12.7145, "step": 237 }, { "epoch": 0.04650254005470887, "grad_norm": 10.228228569030762, "learning_rate": 1.9988434892876515e-05, "loss": 11.8427, "step": 238 }, { "epoch": 0.04669792887846815, "grad_norm": 13.552098274230957, "learning_rate": 1.9988128606309988e-05, "loss": 12.853, "step": 239 }, { "epoch": 0.04689331770222743, "grad_norm": 6.725545406341553, "learning_rate": 1.9987818319187857e-05, "loss": 11.8622, "step": 240 }, { "epoch": 0.047088706525986714, "grad_norm": 6.490736484527588, "learning_rate": 1.9987504031634403e-05, "loss": 12.8614, "step": 241 }, { "epoch": 0.047284095349745994, "grad_norm": 7.924977779388428, "learning_rate": 1.9987185743775507e-05, "loss": 12.0172, "step": 242 }, { "epoch": 0.047479484173505275, "grad_norm": 7.123000144958496, "learning_rate": 1.998686345573866e-05, "loss": 12.1521, "step": 243 }, { "epoch": 0.047674872997264556, "grad_norm": 6.6585845947265625, "learning_rate": 1.9986537167652937e-05, "loss": 11.3805, "step": 244 }, { "epoch": 0.047870261821023836, "grad_norm": 5.809377193450928, "learning_rate": 1.9986206879649033e-05, "loss": 11.1165, "step": 245 }, { "epoch": 0.04806565064478312, "grad_norm": 6.047052383422852, "learning_rate": 1.9985872591859242e-05, "loss": 10.7509, "step": 246 }, { "epoch": 0.0482610394685424, "grad_norm": 12.317660331726074, "learning_rate": 1.9985534304417448e-05, "loss": 11.8707, "step": 247 }, { "epoch": 0.04845642829230168, "grad_norm": 8.26644229888916, "learning_rate": 1.9985192017459153e-05, "loss": 12.3413, "step": 248 }, { "epoch": 0.04865181711606096, "grad_norm": 8.184246063232422, "learning_rate": 1.9984845731121447e-05, "loss": 10.454, "step": 249 }, { "epoch": 0.04884720593982024, "grad_norm": 6.725503921508789, "learning_rate": 1.9984495445543035e-05, "loss": 10.6297, "step": 250 }, { "epoch": 0.04904259476357952, "grad_norm": 14.417993545532227, "learning_rate": 1.9984141160864214e-05, "loss": 12.5653, "step": 251 }, { "epoch": 0.0492379835873388, "grad_norm": 7.391936302185059, "learning_rate": 1.9983782877226888e-05, "loss": 11.6914, "step": 252 }, { "epoch": 0.04943337241109808, "grad_norm": 9.729405403137207, "learning_rate": 1.9983420594774556e-05, "loss": 12.3673, "step": 253 }, { "epoch": 0.04962876123485736, "grad_norm": 13.310443878173828, "learning_rate": 1.998305431365233e-05, "loss": 12.5482, "step": 254 }, { "epoch": 0.049824150058616644, "grad_norm": 12.740357398986816, "learning_rate": 1.9982684034006906e-05, "loss": 11.3177, "step": 255 }, { "epoch": 0.05001953888237593, "grad_norm": 13.545549392700195, "learning_rate": 1.9982309755986607e-05, "loss": 12.0766, "step": 256 }, { "epoch": 0.05021492770613521, "grad_norm": 8.222143173217773, "learning_rate": 1.9981931479741336e-05, "loss": 10.2483, "step": 257 }, { "epoch": 0.05041031652989449, "grad_norm": 17.599727630615234, "learning_rate": 1.9981549205422602e-05, "loss": 12.1119, "step": 258 }, { "epoch": 0.050605705353653774, "grad_norm": 9.997603416442871, "learning_rate": 1.998116293318352e-05, "loss": 11.3142, "step": 259 }, { "epoch": 0.050801094177413054, "grad_norm": 15.584720611572266, "learning_rate": 1.99807726631788e-05, "loss": 10.2511, "step": 260 }, { "epoch": 0.050996483001172335, "grad_norm": 15.335906982421875, "learning_rate": 1.9980378395564772e-05, "loss": 11.6443, "step": 261 }, { "epoch": 0.051191871824931616, "grad_norm": 12.51326847076416, "learning_rate": 1.997998013049933e-05, "loss": 12.287, "step": 262 }, { "epoch": 0.0513872606486909, "grad_norm": 8.552204132080078, "learning_rate": 1.997957786814201e-05, "loss": 11.6877, "step": 263 }, { "epoch": 0.05158264947245018, "grad_norm": 6.9093241691589355, "learning_rate": 1.9979171608653926e-05, "loss": 12.0065, "step": 264 }, { "epoch": 0.05177803829620946, "grad_norm": 13.546882629394531, "learning_rate": 1.997876135219779e-05, "loss": 11.411, "step": 265 }, { "epoch": 0.05197342711996874, "grad_norm": 11.272443771362305, "learning_rate": 1.997834709893793e-05, "loss": 12.0057, "step": 266 }, { "epoch": 0.05216881594372802, "grad_norm": 8.439807891845703, "learning_rate": 1.9977928849040263e-05, "loss": 11.9195, "step": 267 }, { "epoch": 0.0523642047674873, "grad_norm": 10.880560874938965, "learning_rate": 1.9977506602672317e-05, "loss": 11.9902, "step": 268 }, { "epoch": 0.05255959359124658, "grad_norm": 7.07627010345459, "learning_rate": 1.9977080360003206e-05, "loss": 11.8873, "step": 269 }, { "epoch": 0.05275498241500586, "grad_norm": 14.284347534179688, "learning_rate": 1.9976650121203663e-05, "loss": 10.994, "step": 270 }, { "epoch": 0.05295037123876514, "grad_norm": 8.325899124145508, "learning_rate": 1.9976215886446005e-05, "loss": 12.2574, "step": 271 }, { "epoch": 0.05314576006252442, "grad_norm": 9.194845199584961, "learning_rate": 1.9975777655904157e-05, "loss": 12.5032, "step": 272 }, { "epoch": 0.053341148886283704, "grad_norm": 11.341529846191406, "learning_rate": 1.997533542975365e-05, "loss": 10.0915, "step": 273 }, { "epoch": 0.053536537710042985, "grad_norm": 9.99206256866455, "learning_rate": 1.9974889208171602e-05, "loss": 11.9477, "step": 274 }, { "epoch": 0.053731926533802266, "grad_norm": 8.00964069366455, "learning_rate": 1.9974438991336743e-05, "loss": 11.3129, "step": 275 }, { "epoch": 0.053927315357561546, "grad_norm": 9.96445083618164, "learning_rate": 1.9973984779429396e-05, "loss": 11.4211, "step": 276 }, { "epoch": 0.05412270418132083, "grad_norm": 11.463232040405273, "learning_rate": 1.997352657263149e-05, "loss": 12.0209, "step": 277 }, { "epoch": 0.05431809300508011, "grad_norm": 9.95913314819336, "learning_rate": 1.997306437112655e-05, "loss": 12.3077, "step": 278 }, { "epoch": 0.05451348182883939, "grad_norm": 13.20826244354248, "learning_rate": 1.9972598175099698e-05, "loss": 12.2697, "step": 279 }, { "epoch": 0.05470887065259867, "grad_norm": 9.180987358093262, "learning_rate": 1.9972127984737667e-05, "loss": 12.1211, "step": 280 }, { "epoch": 0.05490425947635795, "grad_norm": 9.276026725769043, "learning_rate": 1.9971653800228782e-05, "loss": 12.6417, "step": 281 }, { "epoch": 0.05509964830011723, "grad_norm": 11.767862319946289, "learning_rate": 1.997117562176296e-05, "loss": 12.017, "step": 282 }, { "epoch": 0.05529503712387651, "grad_norm": 9.630742073059082, "learning_rate": 1.9970693449531735e-05, "loss": 10.7546, "step": 283 }, { "epoch": 0.05549042594763579, "grad_norm": 10.929373741149902, "learning_rate": 1.9970207283728227e-05, "loss": 11.5425, "step": 284 }, { "epoch": 0.05568581477139507, "grad_norm": 12.832663536071777, "learning_rate": 1.9969717124547165e-05, "loss": 11.6781, "step": 285 }, { "epoch": 0.055881203595154354, "grad_norm": 20.87643051147461, "learning_rate": 1.9969222972184873e-05, "loss": 12.2216, "step": 286 }, { "epoch": 0.05607659241891364, "grad_norm": 10.373276710510254, "learning_rate": 1.9968724826839268e-05, "loss": 12.3443, "step": 287 }, { "epoch": 0.05627198124267292, "grad_norm": 8.58100414276123, "learning_rate": 1.996822268870988e-05, "loss": 10.6689, "step": 288 }, { "epoch": 0.0564673700664322, "grad_norm": 8.117618560791016, "learning_rate": 1.9967716557997824e-05, "loss": 12.3755, "step": 289 }, { "epoch": 0.056662758890191484, "grad_norm": 10.374201774597168, "learning_rate": 1.9967206434905832e-05, "loss": 12.1325, "step": 290 }, { "epoch": 0.056858147713950764, "grad_norm": 8.217313766479492, "learning_rate": 1.996669231963821e-05, "loss": 10.9547, "step": 291 }, { "epoch": 0.057053536537710045, "grad_norm": 7.501300811767578, "learning_rate": 1.9966174212400885e-05, "loss": 11.9509, "step": 292 }, { "epoch": 0.057248925361469326, "grad_norm": 6.74676513671875, "learning_rate": 1.9965652113401374e-05, "loss": 11.4133, "step": 293 }, { "epoch": 0.05744431418522861, "grad_norm": 11.078866958618164, "learning_rate": 1.9965126022848796e-05, "loss": 12.1601, "step": 294 }, { "epoch": 0.05763970300898789, "grad_norm": 7.624950885772705, "learning_rate": 1.9964595940953863e-05, "loss": 11.3853, "step": 295 }, { "epoch": 0.05783509183274717, "grad_norm": 7.737289905548096, "learning_rate": 1.9964061867928892e-05, "loss": 11.9907, "step": 296 }, { "epoch": 0.05803048065650645, "grad_norm": 9.63924503326416, "learning_rate": 1.9963523803987792e-05, "loss": 11.5848, "step": 297 }, { "epoch": 0.05822586948026573, "grad_norm": 7.582948207855225, "learning_rate": 1.996298174934608e-05, "loss": 11.4951, "step": 298 }, { "epoch": 0.05842125830402501, "grad_norm": 8.697115898132324, "learning_rate": 1.996243570422086e-05, "loss": 12.8351, "step": 299 }, { "epoch": 0.05861664712778429, "grad_norm": 9.946210861206055, "learning_rate": 1.9961885668830844e-05, "loss": 10.6673, "step": 300 }, { "epoch": 0.05881203595154357, "grad_norm": 6.278243064880371, "learning_rate": 1.9961331643396336e-05, "loss": 11.1935, "step": 301 }, { "epoch": 0.05900742477530285, "grad_norm": 8.534592628479004, "learning_rate": 1.9960773628139244e-05, "loss": 12.0799, "step": 302 }, { "epoch": 0.05920281359906213, "grad_norm": 6.725213050842285, "learning_rate": 1.9960211623283064e-05, "loss": 10.8297, "step": 303 }, { "epoch": 0.059398202422821414, "grad_norm": 11.384440422058105, "learning_rate": 1.9959645629052905e-05, "loss": 11.9136, "step": 304 }, { "epoch": 0.059593591246580695, "grad_norm": 11.529409408569336, "learning_rate": 1.9959075645675457e-05, "loss": 10.7427, "step": 305 }, { "epoch": 0.059788980070339975, "grad_norm": 12.019720077514648, "learning_rate": 1.9958501673379018e-05, "loss": 12.0401, "step": 306 }, { "epoch": 0.059984368894099256, "grad_norm": 8.388792037963867, "learning_rate": 1.9957923712393486e-05, "loss": 13.0926, "step": 307 }, { "epoch": 0.06017975771785854, "grad_norm": 12.770638465881348, "learning_rate": 1.9957341762950346e-05, "loss": 11.7429, "step": 308 }, { "epoch": 0.06037514654161782, "grad_norm": 5.92092227935791, "learning_rate": 1.995675582528269e-05, "loss": 10.5684, "step": 309 }, { "epoch": 0.0605705353653771, "grad_norm": 7.7345781326293945, "learning_rate": 1.995616589962521e-05, "loss": 12.3008, "step": 310 }, { "epoch": 0.06076592418913638, "grad_norm": 8.71921443939209, "learning_rate": 1.9955571986214182e-05, "loss": 13.1848, "step": 311 }, { "epoch": 0.06096131301289566, "grad_norm": 6.207778453826904, "learning_rate": 1.9954974085287487e-05, "loss": 12.2107, "step": 312 }, { "epoch": 0.06115670183665494, "grad_norm": 7.688232421875, "learning_rate": 1.9954372197084607e-05, "loss": 11.741, "step": 313 }, { "epoch": 0.06135209066041422, "grad_norm": 9.183834075927734, "learning_rate": 1.995376632184661e-05, "loss": 12.838, "step": 314 }, { "epoch": 0.0615474794841735, "grad_norm": 9.513407707214355, "learning_rate": 1.995315645981618e-05, "loss": 11.5654, "step": 315 }, { "epoch": 0.06174286830793278, "grad_norm": 8.319108009338379, "learning_rate": 1.995254261123757e-05, "loss": 12.2501, "step": 316 }, { "epoch": 0.06193825713169207, "grad_norm": 6.513503551483154, "learning_rate": 1.9951924776356657e-05, "loss": 11.0926, "step": 317 }, { "epoch": 0.06213364595545135, "grad_norm": 14.238517761230469, "learning_rate": 1.99513029554209e-05, "loss": 13.1257, "step": 318 }, { "epoch": 0.06232903477921063, "grad_norm": 11.35081672668457, "learning_rate": 1.9950677148679357e-05, "loss": 11.6198, "step": 319 }, { "epoch": 0.0625244236029699, "grad_norm": 7.610073566436768, "learning_rate": 1.9950047356382684e-05, "loss": 10.8325, "step": 320 }, { "epoch": 0.0627198124267292, "grad_norm": 7.52583646774292, "learning_rate": 1.9949413578783133e-05, "loss": 11.8231, "step": 321 }, { "epoch": 0.06291520125048847, "grad_norm": 12.769768714904785, "learning_rate": 1.9948775816134547e-05, "loss": 12.0816, "step": 322 }, { "epoch": 0.06311059007424775, "grad_norm": 13.12612533569336, "learning_rate": 1.9948134068692372e-05, "loss": 11.0729, "step": 323 }, { "epoch": 0.06330597889800703, "grad_norm": 8.223774909973145, "learning_rate": 1.994748833671365e-05, "loss": 10.8119, "step": 324 }, { "epoch": 0.06350136772176632, "grad_norm": 8.893241882324219, "learning_rate": 1.9946838620457015e-05, "loss": 11.7001, "step": 325 }, { "epoch": 0.06369675654552559, "grad_norm": 9.47516918182373, "learning_rate": 1.9946184920182704e-05, "loss": 12.1686, "step": 326 }, { "epoch": 0.06389214536928488, "grad_norm": 9.479887962341309, "learning_rate": 1.9945527236152533e-05, "loss": 11.6604, "step": 327 }, { "epoch": 0.06408753419304415, "grad_norm": 9.156216621398926, "learning_rate": 1.994486556862993e-05, "loss": 11.3712, "step": 328 }, { "epoch": 0.06428292301680344, "grad_norm": 8.44137191772461, "learning_rate": 1.994419991787992e-05, "loss": 11.162, "step": 329 }, { "epoch": 0.06447831184056271, "grad_norm": 8.18656063079834, "learning_rate": 1.9943530284169107e-05, "loss": 10.9821, "step": 330 }, { "epoch": 0.064673700664322, "grad_norm": 12.09242057800293, "learning_rate": 1.9942856667765706e-05, "loss": 10.8379, "step": 331 }, { "epoch": 0.06486908948808129, "grad_norm": 9.561943054199219, "learning_rate": 1.994217906893952e-05, "loss": 12.13, "step": 332 }, { "epoch": 0.06506447831184056, "grad_norm": 11.300248146057129, "learning_rate": 1.9941497487961946e-05, "loss": 11.8315, "step": 333 }, { "epoch": 0.06525986713559985, "grad_norm": 22.479005813598633, "learning_rate": 1.9940811925105984e-05, "loss": 12.5274, "step": 334 }, { "epoch": 0.06545525595935912, "grad_norm": 12.22573184967041, "learning_rate": 1.9940122380646218e-05, "loss": 12.6328, "step": 335 }, { "epoch": 0.06565064478311841, "grad_norm": 12.668004989624023, "learning_rate": 1.993942885485883e-05, "loss": 12.9096, "step": 336 }, { "epoch": 0.06584603360687769, "grad_norm": 14.226447105407715, "learning_rate": 1.9938731348021607e-05, "loss": 12.423, "step": 337 }, { "epoch": 0.06604142243063697, "grad_norm": 11.14017105102539, "learning_rate": 1.9938029860413914e-05, "loss": 11.2108, "step": 338 }, { "epoch": 0.06623681125439625, "grad_norm": 13.295299530029297, "learning_rate": 1.9937324392316725e-05, "loss": 11.8767, "step": 339 }, { "epoch": 0.06643220007815553, "grad_norm": 9.23791790008545, "learning_rate": 1.9936614944012595e-05, "loss": 11.8994, "step": 340 }, { "epoch": 0.06662758890191481, "grad_norm": 12.534093856811523, "learning_rate": 1.9935901515785686e-05, "loss": 11.687, "step": 341 }, { "epoch": 0.0668229777256741, "grad_norm": 8.44124984741211, "learning_rate": 1.9935184107921747e-05, "loss": 11.3491, "step": 342 }, { "epoch": 0.06701836654943337, "grad_norm": 8.509767532348633, "learning_rate": 1.993446272070812e-05, "loss": 10.6972, "step": 343 }, { "epoch": 0.06721375537319266, "grad_norm": 10.905438423156738, "learning_rate": 1.993373735443374e-05, "loss": 11.0463, "step": 344 }, { "epoch": 0.06740914419695193, "grad_norm": 8.30171012878418, "learning_rate": 1.993300800938915e-05, "loss": 10.7473, "step": 345 }, { "epoch": 0.06760453302071122, "grad_norm": 7.070130825042725, "learning_rate": 1.993227468586646e-05, "loss": 10.6074, "step": 346 }, { "epoch": 0.06779992184447049, "grad_norm": 10.217448234558105, "learning_rate": 1.99315373841594e-05, "loss": 10.8733, "step": 347 }, { "epoch": 0.06799531066822978, "grad_norm": 7.3030805587768555, "learning_rate": 1.993079610456328e-05, "loss": 11.4374, "step": 348 }, { "epoch": 0.06819069949198905, "grad_norm": 10.487565994262695, "learning_rate": 1.993005084737501e-05, "loss": 11.2645, "step": 349 }, { "epoch": 0.06838608831574834, "grad_norm": 24.83467674255371, "learning_rate": 1.9929301612893076e-05, "loss": 11.8643, "step": 350 }, { "epoch": 0.06858147713950762, "grad_norm": 10.412206649780273, "learning_rate": 1.992854840141758e-05, "loss": 10.8554, "step": 351 }, { "epoch": 0.0687768659632669, "grad_norm": 9.083630561828613, "learning_rate": 1.9927791213250202e-05, "loss": 11.3935, "step": 352 }, { "epoch": 0.06897225478702618, "grad_norm": 10.593680381774902, "learning_rate": 1.992703004869422e-05, "loss": 10.8293, "step": 353 }, { "epoch": 0.06916764361078546, "grad_norm": 33.345863342285156, "learning_rate": 1.9926264908054506e-05, "loss": 11.751, "step": 354 }, { "epoch": 0.06936303243454474, "grad_norm": 15.0076322555542, "learning_rate": 1.992549579163752e-05, "loss": 11.2289, "step": 355 }, { "epoch": 0.06955842125830403, "grad_norm": 12.340163230895996, "learning_rate": 1.992472269975132e-05, "loss": 10.9912, "step": 356 }, { "epoch": 0.0697538100820633, "grad_norm": 10.287991523742676, "learning_rate": 1.992394563270555e-05, "loss": 11.4063, "step": 357 }, { "epoch": 0.06994919890582259, "grad_norm": 19.07567596435547, "learning_rate": 1.9923164590811455e-05, "loss": 11.4794, "step": 358 }, { "epoch": 0.07014458772958186, "grad_norm": 8.652302742004395, "learning_rate": 1.992237957438186e-05, "loss": 11.6414, "step": 359 }, { "epoch": 0.07033997655334115, "grad_norm": 16.885883331298828, "learning_rate": 1.9921590583731195e-05, "loss": 10.9357, "step": 360 }, { "epoch": 0.07053536537710042, "grad_norm": 8.816641807556152, "learning_rate": 1.9920797619175466e-05, "loss": 10.4371, "step": 361 }, { "epoch": 0.07073075420085971, "grad_norm": 12.317264556884766, "learning_rate": 1.992000068103229e-05, "loss": 12.5566, "step": 362 }, { "epoch": 0.070926143024619, "grad_norm": 10.080653190612793, "learning_rate": 1.991919976962086e-05, "loss": 12.1561, "step": 363 }, { "epoch": 0.07112153184837827, "grad_norm": 10.67681884765625, "learning_rate": 1.9918394885261966e-05, "loss": 11.6402, "step": 364 }, { "epoch": 0.07131692067213756, "grad_norm": 9.851771354675293, "learning_rate": 1.9917586028277987e-05, "loss": 11.2789, "step": 365 }, { "epoch": 0.07151230949589683, "grad_norm": 16.777217864990234, "learning_rate": 1.99167731989929e-05, "loss": 11.3211, "step": 366 }, { "epoch": 0.07170769831965612, "grad_norm": 10.766368865966797, "learning_rate": 1.991595639773227e-05, "loss": 10.4509, "step": 367 }, { "epoch": 0.0719030871434154, "grad_norm": 9.359428405761719, "learning_rate": 1.9915135624823243e-05, "loss": 11.8863, "step": 368 }, { "epoch": 0.07209847596717468, "grad_norm": 10.939064025878906, "learning_rate": 1.9914310880594572e-05, "loss": 12.8301, "step": 369 }, { "epoch": 0.07229386479093396, "grad_norm": 15.838484764099121, "learning_rate": 1.991348216537659e-05, "loss": 11.0152, "step": 370 }, { "epoch": 0.07248925361469324, "grad_norm": 16.605043411254883, "learning_rate": 1.9912649479501222e-05, "loss": 12.3227, "step": 371 }, { "epoch": 0.07268464243845252, "grad_norm": 7.643439769744873, "learning_rate": 1.9911812823301985e-05, "loss": 10.9601, "step": 372 }, { "epoch": 0.0728800312622118, "grad_norm": 9.731760025024414, "learning_rate": 1.9910972197113986e-05, "loss": 11.597, "step": 373 }, { "epoch": 0.07307542008597108, "grad_norm": 11.740914344787598, "learning_rate": 1.9910127601273922e-05, "loss": 11.6196, "step": 374 }, { "epoch": 0.07327080890973037, "grad_norm": 10.138679504394531, "learning_rate": 1.9909279036120082e-05, "loss": 11.4332, "step": 375 }, { "epoch": 0.07346619773348964, "grad_norm": 8.679524421691895, "learning_rate": 1.9908426501992337e-05, "loss": 12.6449, "step": 376 }, { "epoch": 0.07366158655724893, "grad_norm": 13.47012710571289, "learning_rate": 1.9907569999232158e-05, "loss": 10.7538, "step": 377 }, { "epoch": 0.0738569753810082, "grad_norm": 7.856994152069092, "learning_rate": 1.9906709528182602e-05, "loss": 12.6015, "step": 378 }, { "epoch": 0.07405236420476749, "grad_norm": 9.596478462219238, "learning_rate": 1.990584508918831e-05, "loss": 11.9684, "step": 379 }, { "epoch": 0.07424775302852676, "grad_norm": 11.648253440856934, "learning_rate": 1.9904976682595518e-05, "loss": 12.2241, "step": 380 }, { "epoch": 0.07444314185228605, "grad_norm": 8.147643089294434, "learning_rate": 1.9904104308752053e-05, "loss": 11.0065, "step": 381 }, { "epoch": 0.07463853067604533, "grad_norm": 9.387744903564453, "learning_rate": 1.9903227968007324e-05, "loss": 11.2652, "step": 382 }, { "epoch": 0.07483391949980461, "grad_norm": 7.9869818687438965, "learning_rate": 1.9902347660712336e-05, "loss": 10.4613, "step": 383 }, { "epoch": 0.07502930832356389, "grad_norm": 12.57647705078125, "learning_rate": 1.9901463387219677e-05, "loss": 13.2274, "step": 384 }, { "epoch": 0.07522469714732317, "grad_norm": 16.30275535583496, "learning_rate": 1.9900575147883525e-05, "loss": 10.7593, "step": 385 }, { "epoch": 0.07542008597108245, "grad_norm": 8.52121353149414, "learning_rate": 1.9899682943059652e-05, "loss": 11.4779, "step": 386 }, { "epoch": 0.07561547479484174, "grad_norm": 9.90449047088623, "learning_rate": 1.9898786773105407e-05, "loss": 12.9545, "step": 387 }, { "epoch": 0.07581086361860101, "grad_norm": 11.800531387329102, "learning_rate": 1.989788663837974e-05, "loss": 12.4126, "step": 388 }, { "epoch": 0.0760062524423603, "grad_norm": 7.592154502868652, "learning_rate": 1.989698253924318e-05, "loss": 11.6059, "step": 389 }, { "epoch": 0.07620164126611957, "grad_norm": 13.224897384643555, "learning_rate": 1.9896074476057846e-05, "loss": 11.1759, "step": 390 }, { "epoch": 0.07639703008987886, "grad_norm": 7.516778469085693, "learning_rate": 1.9895162449187452e-05, "loss": 12.0758, "step": 391 }, { "epoch": 0.07659241891363815, "grad_norm": 10.208797454833984, "learning_rate": 1.989424645899728e-05, "loss": 11.1313, "step": 392 }, { "epoch": 0.07678780773739742, "grad_norm": 14.074166297912598, "learning_rate": 1.9893326505854228e-05, "loss": 11.9664, "step": 393 }, { "epoch": 0.07698319656115671, "grad_norm": 9.178504943847656, "learning_rate": 1.989240259012675e-05, "loss": 11.5146, "step": 394 }, { "epoch": 0.07717858538491598, "grad_norm": 10.253775596618652, "learning_rate": 1.9891474712184916e-05, "loss": 11.8304, "step": 395 }, { "epoch": 0.07737397420867527, "grad_norm": 11.380575180053711, "learning_rate": 1.9890542872400368e-05, "loss": 10.6585, "step": 396 }, { "epoch": 0.07756936303243454, "grad_norm": 13.956233024597168, "learning_rate": 1.988960707114633e-05, "loss": 11.6757, "step": 397 }, { "epoch": 0.07776475185619383, "grad_norm": 9.273847579956055, "learning_rate": 1.9888667308797622e-05, "loss": 11.9675, "step": 398 }, { "epoch": 0.0779601406799531, "grad_norm": 8.791987419128418, "learning_rate": 1.988772358573065e-05, "loss": 10.9511, "step": 399 }, { "epoch": 0.07815552950371239, "grad_norm": 10.47549057006836, "learning_rate": 1.9886775902323405e-05, "loss": 11.0095, "step": 400 }, { "epoch": 0.07835091832747167, "grad_norm": 9.35380744934082, "learning_rate": 1.9885824258955462e-05, "loss": 11.3787, "step": 401 }, { "epoch": 0.07854630715123095, "grad_norm": 9.344833374023438, "learning_rate": 1.9884868656007982e-05, "loss": 12.0148, "step": 402 }, { "epoch": 0.07874169597499023, "grad_norm": 10.613936424255371, "learning_rate": 1.988390909386372e-05, "loss": 11.4727, "step": 403 }, { "epoch": 0.07893708479874952, "grad_norm": 9.871330261230469, "learning_rate": 1.9882945572907002e-05, "loss": 11.2291, "step": 404 }, { "epoch": 0.07913247362250879, "grad_norm": 9.063941955566406, "learning_rate": 1.9881978093523754e-05, "loss": 11.2424, "step": 405 }, { "epoch": 0.07932786244626808, "grad_norm": 10.158225059509277, "learning_rate": 1.988100665610148e-05, "loss": 11.7028, "step": 406 }, { "epoch": 0.07952325127002735, "grad_norm": 20.984745025634766, "learning_rate": 1.988003126102927e-05, "loss": 12.7531, "step": 407 }, { "epoch": 0.07971864009378664, "grad_norm": 10.408671379089355, "learning_rate": 1.9879051908697803e-05, "loss": 9.6428, "step": 408 }, { "epoch": 0.07991402891754591, "grad_norm": 11.17059326171875, "learning_rate": 1.9878068599499333e-05, "loss": 11.5591, "step": 409 }, { "epoch": 0.0801094177413052, "grad_norm": 17.67782211303711, "learning_rate": 1.9877081333827714e-05, "loss": 12.1476, "step": 410 }, { "epoch": 0.08030480656506447, "grad_norm": 10.006389617919922, "learning_rate": 1.987609011207837e-05, "loss": 9.6199, "step": 411 }, { "epoch": 0.08050019538882376, "grad_norm": 11.344561576843262, "learning_rate": 1.987509493464832e-05, "loss": 11.0631, "step": 412 }, { "epoch": 0.08069558421258304, "grad_norm": 7.435112476348877, "learning_rate": 1.987409580193616e-05, "loss": 9.9161, "step": 413 }, { "epoch": 0.08089097303634232, "grad_norm": 11.36435317993164, "learning_rate": 1.987309271434208e-05, "loss": 12.5855, "step": 414 }, { "epoch": 0.0810863618601016, "grad_norm": 16.058595657348633, "learning_rate": 1.9872085672267842e-05, "loss": 11.5972, "step": 415 }, { "epoch": 0.08128175068386088, "grad_norm": 12.252034187316895, "learning_rate": 1.9871074676116802e-05, "loss": 9.6958, "step": 416 }, { "epoch": 0.08147713950762016, "grad_norm": 11.003941535949707, "learning_rate": 1.9870059726293892e-05, "loss": 10.7821, "step": 417 }, { "epoch": 0.08167252833137945, "grad_norm": 13.808038711547852, "learning_rate": 1.986904082320563e-05, "loss": 11.5391, "step": 418 }, { "epoch": 0.08186791715513872, "grad_norm": 12.557829856872559, "learning_rate": 1.986801796726012e-05, "loss": 12.2014, "step": 419 }, { "epoch": 0.08206330597889801, "grad_norm": 14.017550468444824, "learning_rate": 1.9866991158867047e-05, "loss": 10.9317, "step": 420 }, { "epoch": 0.08225869480265728, "grad_norm": 12.062165260314941, "learning_rate": 1.986596039843768e-05, "loss": 10.8878, "step": 421 }, { "epoch": 0.08245408362641657, "grad_norm": 10.640327453613281, "learning_rate": 1.9864925686384874e-05, "loss": 10.019, "step": 422 }, { "epoch": 0.08264947245017586, "grad_norm": 6.800312519073486, "learning_rate": 1.986388702312306e-05, "loss": 10.3276, "step": 423 }, { "epoch": 0.08284486127393513, "grad_norm": 8.374358177185059, "learning_rate": 1.9862844409068252e-05, "loss": 11.5054, "step": 424 }, { "epoch": 0.08304025009769442, "grad_norm": 11.760390281677246, "learning_rate": 1.9861797844638055e-05, "loss": 10.7713, "step": 425 }, { "epoch": 0.08323563892145369, "grad_norm": 15.597134590148926, "learning_rate": 1.9860747330251647e-05, "loss": 12.2257, "step": 426 }, { "epoch": 0.08343102774521298, "grad_norm": 11.854767799377441, "learning_rate": 1.985969286632979e-05, "loss": 12.0446, "step": 427 }, { "epoch": 0.08362641656897225, "grad_norm": 8.871596336364746, "learning_rate": 1.9858634453294835e-05, "loss": 11.391, "step": 428 }, { "epoch": 0.08382180539273154, "grad_norm": 6.248795032501221, "learning_rate": 1.9857572091570704e-05, "loss": 10.8528, "step": 429 }, { "epoch": 0.08401719421649081, "grad_norm": 10.798820495605469, "learning_rate": 1.985650578158291e-05, "loss": 10.1009, "step": 430 }, { "epoch": 0.0842125830402501, "grad_norm": 11.153851509094238, "learning_rate": 1.9855435523758542e-05, "loss": 10.4832, "step": 431 }, { "epoch": 0.08440797186400938, "grad_norm": 7.624647617340088, "learning_rate": 1.9854361318526273e-05, "loss": 12.0875, "step": 432 }, { "epoch": 0.08460336068776866, "grad_norm": 9.647201538085938, "learning_rate": 1.9853283166316348e-05, "loss": 11.6031, "step": 433 }, { "epoch": 0.08479874951152794, "grad_norm": 10.949299812316895, "learning_rate": 1.9852201067560607e-05, "loss": 11.5041, "step": 434 }, { "epoch": 0.08499413833528723, "grad_norm": 12.349892616271973, "learning_rate": 1.9851115022692468e-05, "loss": 11.2026, "step": 435 }, { "epoch": 0.0851895271590465, "grad_norm": 9.193042755126953, "learning_rate": 1.9850025032146916e-05, "loss": 10.3315, "step": 436 }, { "epoch": 0.08538491598280579, "grad_norm": 14.063774108886719, "learning_rate": 1.984893109636053e-05, "loss": 12.1384, "step": 437 }, { "epoch": 0.08558030480656506, "grad_norm": 13.098580360412598, "learning_rate": 1.984783321577147e-05, "loss": 10.7132, "step": 438 }, { "epoch": 0.08577569363032435, "grad_norm": 9.289295196533203, "learning_rate": 1.9846731390819463e-05, "loss": 11.0615, "step": 439 }, { "epoch": 0.08597108245408362, "grad_norm": 13.094406127929688, "learning_rate": 1.984562562194583e-05, "loss": 11.685, "step": 440 }, { "epoch": 0.08616647127784291, "grad_norm": 10.86340045928955, "learning_rate": 1.9844515909593466e-05, "loss": 11.3287, "step": 441 }, { "epoch": 0.08636186010160218, "grad_norm": 14.637931823730469, "learning_rate": 1.984340225420684e-05, "loss": 11.9579, "step": 442 }, { "epoch": 0.08655724892536147, "grad_norm": 16.27992820739746, "learning_rate": 1.984228465623201e-05, "loss": 11.3188, "step": 443 }, { "epoch": 0.08675263774912075, "grad_norm": 8.437994003295898, "learning_rate": 1.9841163116116607e-05, "loss": 10.107, "step": 444 }, { "epoch": 0.08694802657288003, "grad_norm": 12.490747451782227, "learning_rate": 1.9840037634309846e-05, "loss": 12.4261, "step": 445 }, { "epoch": 0.0871434153966393, "grad_norm": 9.785431861877441, "learning_rate": 1.9838908211262512e-05, "loss": 10.9658, "step": 446 }, { "epoch": 0.0873388042203986, "grad_norm": 9.814424514770508, "learning_rate": 1.983777484742698e-05, "loss": 12.4832, "step": 447 }, { "epoch": 0.08753419304415787, "grad_norm": 9.344550132751465, "learning_rate": 1.983663754325719e-05, "loss": 11.3044, "step": 448 }, { "epoch": 0.08772958186791716, "grad_norm": 12.776989936828613, "learning_rate": 1.9835496299208675e-05, "loss": 12.2846, "step": 449 }, { "epoch": 0.08792497069167643, "grad_norm": 6.569207668304443, "learning_rate": 1.9834351115738535e-05, "loss": 11.6907, "step": 450 }, { "epoch": 0.08812035951543572, "grad_norm": 8.714112281799316, "learning_rate": 1.9833201993305452e-05, "loss": 11.3617, "step": 451 }, { "epoch": 0.088315748339195, "grad_norm": 9.547415733337402, "learning_rate": 1.9832048932369684e-05, "loss": 12.0239, "step": 452 }, { "epoch": 0.08851113716295428, "grad_norm": 8.278121948242188, "learning_rate": 1.9830891933393068e-05, "loss": 11.1598, "step": 453 }, { "epoch": 0.08870652598671357, "grad_norm": 12.530584335327148, "learning_rate": 1.982973099683902e-05, "loss": 11.3444, "step": 454 }, { "epoch": 0.08890191481047284, "grad_norm": 7.621415615081787, "learning_rate": 1.982856612317253e-05, "loss": 10.6134, "step": 455 }, { "epoch": 0.08909730363423213, "grad_norm": 8.175232887268066, "learning_rate": 1.9827397312860165e-05, "loss": 12.4467, "step": 456 }, { "epoch": 0.0892926924579914, "grad_norm": 12.001221656799316, "learning_rate": 1.982622456637007e-05, "loss": 10.5231, "step": 457 }, { "epoch": 0.08948808128175069, "grad_norm": 10.777896881103516, "learning_rate": 1.982504788417197e-05, "loss": 11.8483, "step": 458 }, { "epoch": 0.08968347010550996, "grad_norm": 11.15634822845459, "learning_rate": 1.982386726673716e-05, "loss": 10.4165, "step": 459 }, { "epoch": 0.08987885892926925, "grad_norm": 15.486494064331055, "learning_rate": 1.9822682714538512e-05, "loss": 12.3944, "step": 460 }, { "epoch": 0.09007424775302852, "grad_norm": 17.380624771118164, "learning_rate": 1.982149422805048e-05, "loss": 11.0951, "step": 461 }, { "epoch": 0.09026963657678781, "grad_norm": 23.118669509887695, "learning_rate": 1.9820301807749084e-05, "loss": 11.6186, "step": 462 }, { "epoch": 0.09046502540054709, "grad_norm": 12.999380111694336, "learning_rate": 1.9819105454111932e-05, "loss": 11.1989, "step": 463 }, { "epoch": 0.09066041422430637, "grad_norm": 10.317048072814941, "learning_rate": 1.9817905167618194e-05, "loss": 11.1085, "step": 464 }, { "epoch": 0.09085580304806565, "grad_norm": 9.965431213378906, "learning_rate": 1.981670094874863e-05, "loss": 11.5608, "step": 465 }, { "epoch": 0.09105119187182494, "grad_norm": 8.059679985046387, "learning_rate": 1.981549279798556e-05, "loss": 10.2311, "step": 466 }, { "epoch": 0.09124658069558421, "grad_norm": 12.043992042541504, "learning_rate": 1.981428071581289e-05, "loss": 11.189, "step": 467 }, { "epoch": 0.0914419695193435, "grad_norm": 8.804059028625488, "learning_rate": 1.9813064702716094e-05, "loss": 12.1722, "step": 468 }, { "epoch": 0.09163735834310277, "grad_norm": 9.039908409118652, "learning_rate": 1.9811844759182226e-05, "loss": 9.9737, "step": 469 }, { "epoch": 0.09183274716686206, "grad_norm": 11.43051528930664, "learning_rate": 1.9810620885699908e-05, "loss": 11.2245, "step": 470 }, { "epoch": 0.09202813599062133, "grad_norm": 12.444183349609375, "learning_rate": 1.9809393082759342e-05, "loss": 13.1322, "step": 471 }, { "epoch": 0.09222352481438062, "grad_norm": 8.038171768188477, "learning_rate": 1.98081613508523e-05, "loss": 12.363, "step": 472 }, { "epoch": 0.0924189136381399, "grad_norm": 14.102073669433594, "learning_rate": 1.9806925690472128e-05, "loss": 11.5428, "step": 473 }, { "epoch": 0.09261430246189918, "grad_norm": 10.017738342285156, "learning_rate": 1.980568610211375e-05, "loss": 12.0738, "step": 474 }, { "epoch": 0.09280969128565845, "grad_norm": 8.079500198364258, "learning_rate": 1.980444258627365e-05, "loss": 10.8434, "step": 475 }, { "epoch": 0.09300508010941774, "grad_norm": 10.891678810119629, "learning_rate": 1.9803195143449905e-05, "loss": 11.4258, "step": 476 }, { "epoch": 0.09320046893317702, "grad_norm": 8.259817123413086, "learning_rate": 1.9801943774142152e-05, "loss": 11.2229, "step": 477 }, { "epoch": 0.0933958577569363, "grad_norm": 9.344334602355957, "learning_rate": 1.9800688478851603e-05, "loss": 11.9778, "step": 478 }, { "epoch": 0.09359124658069558, "grad_norm": 11.072267532348633, "learning_rate": 1.979942925808104e-05, "loss": 11.1355, "step": 479 }, { "epoch": 0.09378663540445487, "grad_norm": 11.740673065185547, "learning_rate": 1.9798166112334824e-05, "loss": 9.7471, "step": 480 }, { "epoch": 0.09398202422821414, "grad_norm": 8.792943954467773, "learning_rate": 1.9796899042118883e-05, "loss": 10.8433, "step": 481 }, { "epoch": 0.09417741305197343, "grad_norm": 9.371011734008789, "learning_rate": 1.9795628047940714e-05, "loss": 11.7686, "step": 482 }, { "epoch": 0.09437280187573271, "grad_norm": 11.211883544921875, "learning_rate": 1.9794353130309395e-05, "loss": 11.6966, "step": 483 }, { "epoch": 0.09456819069949199, "grad_norm": 16.014528274536133, "learning_rate": 1.9793074289735566e-05, "loss": 11.0515, "step": 484 }, { "epoch": 0.09476357952325128, "grad_norm": 8.87293815612793, "learning_rate": 1.9791791526731444e-05, "loss": 9.333, "step": 485 }, { "epoch": 0.09495896834701055, "grad_norm": 18.065631866455078, "learning_rate": 1.9790504841810817e-05, "loss": 10.9915, "step": 486 }, { "epoch": 0.09515435717076984, "grad_norm": 9.98987865447998, "learning_rate": 1.9789214235489042e-05, "loss": 10.7225, "step": 487 }, { "epoch": 0.09534974599452911, "grad_norm": 21.688825607299805, "learning_rate": 1.9787919708283044e-05, "loss": 11.0898, "step": 488 }, { "epoch": 0.0955451348182884, "grad_norm": 8.642053604125977, "learning_rate": 1.9786621260711322e-05, "loss": 9.916, "step": 489 }, { "epoch": 0.09574052364204767, "grad_norm": 10.877992630004883, "learning_rate": 1.978531889329395e-05, "loss": 12.2251, "step": 490 }, { "epoch": 0.09593591246580696, "grad_norm": 9.516289710998535, "learning_rate": 1.9784012606552557e-05, "loss": 12.1528, "step": 491 }, { "epoch": 0.09613130128956623, "grad_norm": 8.622530937194824, "learning_rate": 1.978270240101036e-05, "loss": 11.5102, "step": 492 }, { "epoch": 0.09632669011332552, "grad_norm": 8.162901878356934, "learning_rate": 1.9781388277192132e-05, "loss": 10.8012, "step": 493 }, { "epoch": 0.0965220789370848, "grad_norm": 10.565152168273926, "learning_rate": 1.9780070235624222e-05, "loss": 10.1863, "step": 494 }, { "epoch": 0.09671746776084408, "grad_norm": 7.8273749351501465, "learning_rate": 1.977874827683455e-05, "loss": 11.5141, "step": 495 }, { "epoch": 0.09691285658460336, "grad_norm": 9.111390113830566, "learning_rate": 1.9777422401352594e-05, "loss": 11.9115, "step": 496 }, { "epoch": 0.09710824540836265, "grad_norm": 8.689447402954102, "learning_rate": 1.9776092609709416e-05, "loss": 9.9112, "step": 497 }, { "epoch": 0.09730363423212192, "grad_norm": 9.937636375427246, "learning_rate": 1.9774758902437634e-05, "loss": 10.3904, "step": 498 }, { "epoch": 0.0974990230558812, "grad_norm": 12.398046493530273, "learning_rate": 1.9773421280071437e-05, "loss": 10.1024, "step": 499 }, { "epoch": 0.09769441187964048, "grad_norm": 9.194086074829102, "learning_rate": 1.977207974314659e-05, "loss": 11.3953, "step": 500 }, { "epoch": 0.09788980070339977, "grad_norm": 7.784787178039551, "learning_rate": 1.977073429220042e-05, "loss": 10.7072, "step": 501 }, { "epoch": 0.09808518952715904, "grad_norm": 15.563216209411621, "learning_rate": 1.976938492777182e-05, "loss": 10.7934, "step": 502 }, { "epoch": 0.09828057835091833, "grad_norm": 10.296627044677734, "learning_rate": 1.9768031650401248e-05, "loss": 11.5148, "step": 503 }, { "epoch": 0.0984759671746776, "grad_norm": 9.36056137084961, "learning_rate": 1.976667446063074e-05, "loss": 11.3866, "step": 504 }, { "epoch": 0.09867135599843689, "grad_norm": 15.350499153137207, "learning_rate": 1.976531335900389e-05, "loss": 13.5321, "step": 505 }, { "epoch": 0.09886674482219616, "grad_norm": 11.02067756652832, "learning_rate": 1.9763948346065865e-05, "loss": 9.5924, "step": 506 }, { "epoch": 0.09906213364595545, "grad_norm": 10.623998641967773, "learning_rate": 1.976257942236339e-05, "loss": 10.4124, "step": 507 }, { "epoch": 0.09925752246971473, "grad_norm": 11.29489517211914, "learning_rate": 1.976120658844476e-05, "loss": 10.2982, "step": 508 }, { "epoch": 0.09945291129347401, "grad_norm": 15.255050659179688, "learning_rate": 1.9759829844859843e-05, "loss": 11.665, "step": 509 }, { "epoch": 0.09964830011723329, "grad_norm": 10.669962882995605, "learning_rate": 1.9758449192160067e-05, "loss": 11.3428, "step": 510 }, { "epoch": 0.09984368894099258, "grad_norm": 11.840312004089355, "learning_rate": 1.9757064630898422e-05, "loss": 11.7798, "step": 511 }, { "epoch": 0.10003907776475186, "grad_norm": 9.912818908691406, "learning_rate": 1.9755676161629474e-05, "loss": 9.7258, "step": 512 }, { "epoch": 0.10023446658851114, "grad_norm": 9.993812561035156, "learning_rate": 1.9754283784909344e-05, "loss": 11.4899, "step": 513 }, { "epoch": 0.10042985541227042, "grad_norm": 8.480623245239258, "learning_rate": 1.975288750129572e-05, "loss": 10.9759, "step": 514 }, { "epoch": 0.1006252442360297, "grad_norm": 8.028963088989258, "learning_rate": 1.975148731134786e-05, "loss": 11.3543, "step": 515 }, { "epoch": 0.10082063305978899, "grad_norm": 9.241572380065918, "learning_rate": 1.9750083215626588e-05, "loss": 10.422, "step": 516 }, { "epoch": 0.10101602188354826, "grad_norm": 8.950488090515137, "learning_rate": 1.974867521469428e-05, "loss": 11.0477, "step": 517 }, { "epoch": 0.10121141070730755, "grad_norm": 9.833844184875488, "learning_rate": 1.9747263309114884e-05, "loss": 12.0173, "step": 518 }, { "epoch": 0.10140679953106682, "grad_norm": 8.743428230285645, "learning_rate": 1.9745847499453918e-05, "loss": 11.5385, "step": 519 }, { "epoch": 0.10160218835482611, "grad_norm": 7.573606014251709, "learning_rate": 1.9744427786278456e-05, "loss": 10.3587, "step": 520 }, { "epoch": 0.10179757717858538, "grad_norm": 11.663070678710938, "learning_rate": 1.9743004170157137e-05, "loss": 11.1453, "step": 521 }, { "epoch": 0.10199296600234467, "grad_norm": 9.460543632507324, "learning_rate": 1.9741576651660158e-05, "loss": 12.3356, "step": 522 }, { "epoch": 0.10218835482610394, "grad_norm": 11.570713996887207, "learning_rate": 1.9740145231359295e-05, "loss": 10.9296, "step": 523 }, { "epoch": 0.10238374364986323, "grad_norm": 11.01218318939209, "learning_rate": 1.9738709909827866e-05, "loss": 10.4656, "step": 524 }, { "epoch": 0.1025791324736225, "grad_norm": 10.380640029907227, "learning_rate": 1.9737270687640767e-05, "loss": 11.2092, "step": 525 }, { "epoch": 0.1027745212973818, "grad_norm": 12.392244338989258, "learning_rate": 1.9735827565374447e-05, "loss": 11.3003, "step": 526 }, { "epoch": 0.10296991012114107, "grad_norm": 13.215970993041992, "learning_rate": 1.9734380543606932e-05, "loss": 11.0708, "step": 527 }, { "epoch": 0.10316529894490035, "grad_norm": 8.894550323486328, "learning_rate": 1.9732929622917785e-05, "loss": 10.2508, "step": 528 }, { "epoch": 0.10336068776865963, "grad_norm": 9.56419849395752, "learning_rate": 1.9731474803888156e-05, "loss": 10.2722, "step": 529 }, { "epoch": 0.10355607659241892, "grad_norm": 17.47562026977539, "learning_rate": 1.973001608710074e-05, "loss": 12.7116, "step": 530 }, { "epoch": 0.10375146541617819, "grad_norm": 14.577072143554688, "learning_rate": 1.9728553473139795e-05, "loss": 11.1043, "step": 531 }, { "epoch": 0.10394685423993748, "grad_norm": 9.963520050048828, "learning_rate": 1.9727086962591153e-05, "loss": 12.1775, "step": 532 }, { "epoch": 0.10414224306369675, "grad_norm": 9.970325469970703, "learning_rate": 1.9725616556042188e-05, "loss": 11.3413, "step": 533 }, { "epoch": 0.10433763188745604, "grad_norm": 10.894572257995605, "learning_rate": 1.972414225408185e-05, "loss": 10.9785, "step": 534 }, { "epoch": 0.10453302071121531, "grad_norm": 10.209842681884766, "learning_rate": 1.9722664057300635e-05, "loss": 10.9789, "step": 535 }, { "epoch": 0.1047284095349746, "grad_norm": 8.99254322052002, "learning_rate": 1.9721181966290614e-05, "loss": 11.8011, "step": 536 }, { "epoch": 0.10492379835873387, "grad_norm": 9.370485305786133, "learning_rate": 1.971969598164541e-05, "loss": 9.4472, "step": 537 }, { "epoch": 0.10511918718249316, "grad_norm": 8.806024551391602, "learning_rate": 1.97182061039602e-05, "loss": 10.4979, "step": 538 }, { "epoch": 0.10531457600625244, "grad_norm": 9.16971492767334, "learning_rate": 1.971671233383173e-05, "loss": 11.3504, "step": 539 }, { "epoch": 0.10550996483001172, "grad_norm": 8.441150665283203, "learning_rate": 1.9715214671858303e-05, "loss": 10.4157, "step": 540 }, { "epoch": 0.105705353653771, "grad_norm": 12.984179496765137, "learning_rate": 1.9713713118639778e-05, "loss": 10.6053, "step": 541 }, { "epoch": 0.10590074247753029, "grad_norm": 13.540075302124023, "learning_rate": 1.9712207674777572e-05, "loss": 10.1771, "step": 542 }, { "epoch": 0.10609613130128957, "grad_norm": 13.014501571655273, "learning_rate": 1.9710698340874663e-05, "loss": 11.6918, "step": 543 }, { "epoch": 0.10629152012504885, "grad_norm": 13.211419105529785, "learning_rate": 1.9709185117535586e-05, "loss": 11.1228, "step": 544 }, { "epoch": 0.10648690894880813, "grad_norm": 13.059433937072754, "learning_rate": 1.9707668005366435e-05, "loss": 11.5204, "step": 545 }, { "epoch": 0.10668229777256741, "grad_norm": 12.231781959533691, "learning_rate": 1.9706147004974858e-05, "loss": 11.4718, "step": 546 }, { "epoch": 0.1068776865963267, "grad_norm": 9.824128150939941, "learning_rate": 1.9704622116970067e-05, "loss": 9.9623, "step": 547 }, { "epoch": 0.10707307542008597, "grad_norm": 8.887593269348145, "learning_rate": 1.9703093341962822e-05, "loss": 10.9046, "step": 548 }, { "epoch": 0.10726846424384526, "grad_norm": 9.651532173156738, "learning_rate": 1.970156068056545e-05, "loss": 11.2181, "step": 549 }, { "epoch": 0.10746385306760453, "grad_norm": 11.166255950927734, "learning_rate": 1.9700024133391826e-05, "loss": 12.0837, "step": 550 }, { "epoch": 0.10765924189136382, "grad_norm": 9.001782417297363, "learning_rate": 1.9698483701057384e-05, "loss": 10.9068, "step": 551 }, { "epoch": 0.10785463071512309, "grad_norm": 7.795221328735352, "learning_rate": 1.969693938417912e-05, "loss": 11.1963, "step": 552 }, { "epoch": 0.10805001953888238, "grad_norm": 11.027246475219727, "learning_rate": 1.9695391183375575e-05, "loss": 11.813, "step": 553 }, { "epoch": 0.10824540836264165, "grad_norm": 8.13861083984375, "learning_rate": 1.9693839099266855e-05, "loss": 11.1423, "step": 554 }, { "epoch": 0.10844079718640094, "grad_norm": 9.753663063049316, "learning_rate": 1.9692283132474616e-05, "loss": 11.9865, "step": 555 }, { "epoch": 0.10863618601016022, "grad_norm": 14.523602485656738, "learning_rate": 1.969072328362207e-05, "loss": 12.1644, "step": 556 }, { "epoch": 0.1088315748339195, "grad_norm": 9.290946006774902, "learning_rate": 1.968915955333399e-05, "loss": 10.767, "step": 557 }, { "epoch": 0.10902696365767878, "grad_norm": 8.2459077835083, "learning_rate": 1.9687591942236696e-05, "loss": 10.6439, "step": 558 }, { "epoch": 0.10922235248143806, "grad_norm": 9.10664176940918, "learning_rate": 1.968602045095806e-05, "loss": 10.5131, "step": 559 }, { "epoch": 0.10941774130519734, "grad_norm": 14.061135292053223, "learning_rate": 1.9684445080127518e-05, "loss": 10.7492, "step": 560 }, { "epoch": 0.10961313012895663, "grad_norm": 11.38359260559082, "learning_rate": 1.968286583037606e-05, "loss": 10.3915, "step": 561 }, { "epoch": 0.1098085189527159, "grad_norm": 7.756126403808594, "learning_rate": 1.9681282702336215e-05, "loss": 11.4694, "step": 562 }, { "epoch": 0.11000390777647519, "grad_norm": 9.486867904663086, "learning_rate": 1.9679695696642075e-05, "loss": 10.9713, "step": 563 }, { "epoch": 0.11019929660023446, "grad_norm": 7.7375264167785645, "learning_rate": 1.9678104813929292e-05, "loss": 11.4329, "step": 564 }, { "epoch": 0.11039468542399375, "grad_norm": 10.03549861907959, "learning_rate": 1.9676510054835064e-05, "loss": 12.6544, "step": 565 }, { "epoch": 0.11059007424775302, "grad_norm": 16.695438385009766, "learning_rate": 1.9674911419998133e-05, "loss": 11.2948, "step": 566 }, { "epoch": 0.11078546307151231, "grad_norm": 11.747823715209961, "learning_rate": 1.9673308910058807e-05, "loss": 10.7342, "step": 567 }, { "epoch": 0.11098085189527158, "grad_norm": 13.515654563903809, "learning_rate": 1.9671702525658946e-05, "loss": 11.5844, "step": 568 }, { "epoch": 0.11117624071903087, "grad_norm": 13.833075523376465, "learning_rate": 1.967009226744195e-05, "loss": 11.388, "step": 569 }, { "epoch": 0.11137162954279015, "grad_norm": 9.444323539733887, "learning_rate": 1.9668478136052776e-05, "loss": 10.6101, "step": 570 }, { "epoch": 0.11156701836654943, "grad_norm": 13.008130073547363, "learning_rate": 1.966686013213794e-05, "loss": 9.3365, "step": 571 }, { "epoch": 0.11176240719030871, "grad_norm": 11.551283836364746, "learning_rate": 1.96652382563455e-05, "loss": 10.8201, "step": 572 }, { "epoch": 0.111957796014068, "grad_norm": 9.139278411865234, "learning_rate": 1.9663612509325064e-05, "loss": 9.7352, "step": 573 }, { "epoch": 0.11215318483782728, "grad_norm": 9.821352005004883, "learning_rate": 1.9661982891727803e-05, "loss": 10.9528, "step": 574 }, { "epoch": 0.11234857366158656, "grad_norm": 16.140743255615234, "learning_rate": 1.966034940420642e-05, "loss": 12.4749, "step": 575 }, { "epoch": 0.11254396248534584, "grad_norm": 11.304898262023926, "learning_rate": 1.965871204741518e-05, "loss": 10.5936, "step": 576 }, { "epoch": 0.11273935130910512, "grad_norm": 10.936923027038574, "learning_rate": 1.9657070822009903e-05, "loss": 11.798, "step": 577 }, { "epoch": 0.1129347401328644, "grad_norm": 12.186397552490234, "learning_rate": 1.965542572864794e-05, "loss": 11.4177, "step": 578 }, { "epoch": 0.11313012895662368, "grad_norm": 12.100482940673828, "learning_rate": 1.9653776767988208e-05, "loss": 11.537, "step": 579 }, { "epoch": 0.11332551778038297, "grad_norm": 17.33932876586914, "learning_rate": 1.9652123940691163e-05, "loss": 11.5476, "step": 580 }, { "epoch": 0.11352090660414224, "grad_norm": 36.325687408447266, "learning_rate": 1.965046724741882e-05, "loss": 10.6124, "step": 581 }, { "epoch": 0.11371629542790153, "grad_norm": 12.444219589233398, "learning_rate": 1.964880668883473e-05, "loss": 10.8188, "step": 582 }, { "epoch": 0.1139116842516608, "grad_norm": 11.519243240356445, "learning_rate": 1.9647142265603998e-05, "loss": 10.4957, "step": 583 }, { "epoch": 0.11410707307542009, "grad_norm": 12.104283332824707, "learning_rate": 1.9645473978393283e-05, "loss": 11.0054, "step": 584 }, { "epoch": 0.11430246189917936, "grad_norm": 10.913578033447266, "learning_rate": 1.9643801827870778e-05, "loss": 10.4854, "step": 585 }, { "epoch": 0.11449785072293865, "grad_norm": 11.549110412597656, "learning_rate": 1.964212581470624e-05, "loss": 10.5053, "step": 586 }, { "epoch": 0.11469323954669793, "grad_norm": 13.377059936523438, "learning_rate": 1.964044593957096e-05, "loss": 11.7833, "step": 587 }, { "epoch": 0.11488862837045721, "grad_norm": 13.456280708312988, "learning_rate": 1.9638762203137774e-05, "loss": 11.9166, "step": 588 }, { "epoch": 0.11508401719421649, "grad_norm": 8.56250286102295, "learning_rate": 1.9637074606081084e-05, "loss": 10.6182, "step": 589 }, { "epoch": 0.11527940601797577, "grad_norm": 15.668073654174805, "learning_rate": 1.9635383149076818e-05, "loss": 9.1693, "step": 590 }, { "epoch": 0.11547479484173505, "grad_norm": 10.171055793762207, "learning_rate": 1.963368783280245e-05, "loss": 10.1066, "step": 591 }, { "epoch": 0.11567018366549434, "grad_norm": 8.605528831481934, "learning_rate": 1.9631988657937018e-05, "loss": 9.7485, "step": 592 }, { "epoch": 0.11586557248925361, "grad_norm": 10.60910415649414, "learning_rate": 1.963028562516109e-05, "loss": 11.2119, "step": 593 }, { "epoch": 0.1160609613130129, "grad_norm": 10.013952255249023, "learning_rate": 1.962857873515678e-05, "loss": 10.3981, "step": 594 }, { "epoch": 0.11625635013677217, "grad_norm": 11.97077465057373, "learning_rate": 1.9626867988607755e-05, "loss": 11.9388, "step": 595 }, { "epoch": 0.11645173896053146, "grad_norm": 9.654369354248047, "learning_rate": 1.9625153386199225e-05, "loss": 10.5881, "step": 596 }, { "epoch": 0.11664712778429073, "grad_norm": 8.684782028198242, "learning_rate": 1.9623434928617936e-05, "loss": 10.8913, "step": 597 }, { "epoch": 0.11684251660805002, "grad_norm": 9.611656188964844, "learning_rate": 1.962171261655218e-05, "loss": 10.6154, "step": 598 }, { "epoch": 0.1170379054318093, "grad_norm": 10.223007202148438, "learning_rate": 1.9619986450691808e-05, "loss": 10.6338, "step": 599 }, { "epoch": 0.11723329425556858, "grad_norm": 10.545039176940918, "learning_rate": 1.961825643172819e-05, "loss": 11.1157, "step": 600 }, { "epoch": 0.11742868307932786, "grad_norm": 21.339628219604492, "learning_rate": 1.9616522560354265e-05, "loss": 11.5241, "step": 601 }, { "epoch": 0.11762407190308714, "grad_norm": 10.725296020507812, "learning_rate": 1.9614784837264493e-05, "loss": 10.6171, "step": 602 }, { "epoch": 0.11781946072684643, "grad_norm": 11.286303520202637, "learning_rate": 1.961304326315489e-05, "loss": 11.0865, "step": 603 }, { "epoch": 0.1180148495506057, "grad_norm": 8.145142555236816, "learning_rate": 1.961129783872301e-05, "loss": 10.4172, "step": 604 }, { "epoch": 0.11821023837436499, "grad_norm": 22.039506912231445, "learning_rate": 1.960954856466795e-05, "loss": 10.4725, "step": 605 }, { "epoch": 0.11840562719812427, "grad_norm": 10.486015319824219, "learning_rate": 1.9607795441690343e-05, "loss": 11.1312, "step": 606 }, { "epoch": 0.11860101602188355, "grad_norm": 8.452276229858398, "learning_rate": 1.9606038470492378e-05, "loss": 9.6146, "step": 607 }, { "epoch": 0.11879640484564283, "grad_norm": 9.439180374145508, "learning_rate": 1.9604277651777774e-05, "loss": 10.1609, "step": 608 }, { "epoch": 0.11899179366940212, "grad_norm": 9.560111999511719, "learning_rate": 1.960251298625179e-05, "loss": 10.1234, "step": 609 }, { "epoch": 0.11918718249316139, "grad_norm": 11.915494918823242, "learning_rate": 1.960074447462123e-05, "loss": 11.364, "step": 610 }, { "epoch": 0.11938257131692068, "grad_norm": 10.901596069335938, "learning_rate": 1.9598972117594445e-05, "loss": 11.3737, "step": 611 }, { "epoch": 0.11957796014067995, "grad_norm": 8.30836296081543, "learning_rate": 1.959719591588131e-05, "loss": 11.3081, "step": 612 }, { "epoch": 0.11977334896443924, "grad_norm": 10.83170223236084, "learning_rate": 1.959541587019325e-05, "loss": 11.5524, "step": 613 }, { "epoch": 0.11996873778819851, "grad_norm": 12.46660041809082, "learning_rate": 1.959363198124323e-05, "loss": 10.2701, "step": 614 }, { "epoch": 0.1201641266119578, "grad_norm": 8.10671329498291, "learning_rate": 1.959184424974576e-05, "loss": 9.8735, "step": 615 }, { "epoch": 0.12035951543571707, "grad_norm": 25.261287689208984, "learning_rate": 1.9590052676416872e-05, "loss": 11.7366, "step": 616 }, { "epoch": 0.12055490425947636, "grad_norm": 14.915023803710938, "learning_rate": 1.9588257261974147e-05, "loss": 10.4955, "step": 617 }, { "epoch": 0.12075029308323564, "grad_norm": 17.870332717895508, "learning_rate": 1.958645800713671e-05, "loss": 11.134, "step": 618 }, { "epoch": 0.12094568190699492, "grad_norm": 8.873143196105957, "learning_rate": 1.958465491262522e-05, "loss": 9.836, "step": 619 }, { "epoch": 0.1211410707307542, "grad_norm": 10.999107360839844, "learning_rate": 1.9582847979161864e-05, "loss": 10.983, "step": 620 }, { "epoch": 0.12133645955451348, "grad_norm": 12.157037734985352, "learning_rate": 1.9581037207470382e-05, "loss": 10.6826, "step": 621 }, { "epoch": 0.12153184837827276, "grad_norm": 13.782594680786133, "learning_rate": 1.957922259827604e-05, "loss": 11.2176, "step": 622 }, { "epoch": 0.12172723720203205, "grad_norm": 11.599889755249023, "learning_rate": 1.957740415230565e-05, "loss": 11.4917, "step": 623 }, { "epoch": 0.12192262602579132, "grad_norm": 12.650994300842285, "learning_rate": 1.957558187028755e-05, "loss": 10.5263, "step": 624 }, { "epoch": 0.12211801484955061, "grad_norm": 8.949012756347656, "learning_rate": 1.9573755752951625e-05, "loss": 11.2604, "step": 625 }, { "epoch": 0.12231340367330988, "grad_norm": 9.102046966552734, "learning_rate": 1.957192580102929e-05, "loss": 10.1976, "step": 626 }, { "epoch": 0.12250879249706917, "grad_norm": 11.349761009216309, "learning_rate": 1.9570092015253497e-05, "loss": 10.8624, "step": 627 }, { "epoch": 0.12270418132082844, "grad_norm": 7.718044757843018, "learning_rate": 1.9568254396358735e-05, "loss": 10.177, "step": 628 }, { "epoch": 0.12289957014458773, "grad_norm": 11.953253746032715, "learning_rate": 1.9566412945081028e-05, "loss": 11.9735, "step": 629 }, { "epoch": 0.123094958968347, "grad_norm": 11.112428665161133, "learning_rate": 1.9564567662157932e-05, "loss": 10.7037, "step": 630 }, { "epoch": 0.12329034779210629, "grad_norm": 11.177035331726074, "learning_rate": 1.9562718548328544e-05, "loss": 11.331, "step": 631 }, { "epoch": 0.12348573661586557, "grad_norm": 17.98822021484375, "learning_rate": 1.9560865604333487e-05, "loss": 12.0022, "step": 632 }, { "epoch": 0.12368112543962485, "grad_norm": 21.346542358398438, "learning_rate": 1.955900883091493e-05, "loss": 11.8838, "step": 633 }, { "epoch": 0.12387651426338414, "grad_norm": 41.60587692260742, "learning_rate": 1.955714822881656e-05, "loss": 11.0619, "step": 634 }, { "epoch": 0.12407190308714341, "grad_norm": 11.272843360900879, "learning_rate": 1.9555283798783605e-05, "loss": 11.3297, "step": 635 }, { "epoch": 0.1242672919109027, "grad_norm": 19.868553161621094, "learning_rate": 1.9553415541562833e-05, "loss": 12.7203, "step": 636 }, { "epoch": 0.12446268073466198, "grad_norm": 16.160825729370117, "learning_rate": 1.9551543457902544e-05, "loss": 10.2777, "step": 637 }, { "epoch": 0.12465806955842126, "grad_norm": 9.786787986755371, "learning_rate": 1.9549667548552557e-05, "loss": 12.1408, "step": 638 }, { "epoch": 0.12485345838218054, "grad_norm": 9.837309837341309, "learning_rate": 1.9547787814264228e-05, "loss": 11.0809, "step": 639 }, { "epoch": 0.1250488472059398, "grad_norm": 12.483675003051758, "learning_rate": 1.954590425579046e-05, "loss": 10.9152, "step": 640 }, { "epoch": 0.1252442360296991, "grad_norm": 15.183236122131348, "learning_rate": 1.954401687388567e-05, "loss": 11.2427, "step": 641 }, { "epoch": 0.1254396248534584, "grad_norm": 12.204520225524902, "learning_rate": 1.9542125669305813e-05, "loss": 11.4033, "step": 642 }, { "epoch": 0.12563501367721766, "grad_norm": 26.315473556518555, "learning_rate": 1.954023064280838e-05, "loss": 10.4009, "step": 643 }, { "epoch": 0.12583040250097693, "grad_norm": 10.41172981262207, "learning_rate": 1.9538331795152383e-05, "loss": 10.8517, "step": 644 }, { "epoch": 0.12602579132473624, "grad_norm": 9.543086051940918, "learning_rate": 1.9536429127098372e-05, "loss": 11.2244, "step": 645 }, { "epoch": 0.1262211801484955, "grad_norm": 8.386434555053711, "learning_rate": 1.9534522639408422e-05, "loss": 10.9421, "step": 646 }, { "epoch": 0.12641656897225478, "grad_norm": 10.363856315612793, "learning_rate": 1.9532612332846143e-05, "loss": 11.924, "step": 647 }, { "epoch": 0.12661195779601406, "grad_norm": 17.130685806274414, "learning_rate": 1.9530698208176674e-05, "loss": 10.718, "step": 648 }, { "epoch": 0.12680734661977336, "grad_norm": 9.325024604797363, "learning_rate": 1.9528780266166674e-05, "loss": 10.0044, "step": 649 }, { "epoch": 0.12700273544353263, "grad_norm": 10.563580513000488, "learning_rate": 1.9526858507584347e-05, "loss": 10.5433, "step": 650 }, { "epoch": 0.1271981242672919, "grad_norm": 9.915545463562012, "learning_rate": 1.952493293319941e-05, "loss": 10.2272, "step": 651 }, { "epoch": 0.12739351309105118, "grad_norm": 9.59200668334961, "learning_rate": 1.9523003543783122e-05, "loss": 10.0584, "step": 652 }, { "epoch": 0.12758890191481048, "grad_norm": 20.589073181152344, "learning_rate": 1.9521070340108258e-05, "loss": 10.1349, "step": 653 }, { "epoch": 0.12778429073856976, "grad_norm": 13.821552276611328, "learning_rate": 1.9519133322949124e-05, "loss": 10.4477, "step": 654 }, { "epoch": 0.12797967956232903, "grad_norm": 13.572846412658691, "learning_rate": 1.9517192493081563e-05, "loss": 10.4236, "step": 655 }, { "epoch": 0.1281750683860883, "grad_norm": 29.654455184936523, "learning_rate": 1.9515247851282936e-05, "loss": 11.0006, "step": 656 }, { "epoch": 0.1283704572098476, "grad_norm": 18.17194175720215, "learning_rate": 1.9513299398332126e-05, "loss": 10.744, "step": 657 }, { "epoch": 0.12856584603360688, "grad_norm": 15.245993614196777, "learning_rate": 1.9511347135009556e-05, "loss": 10.9422, "step": 658 }, { "epoch": 0.12876123485736615, "grad_norm": 9.658390045166016, "learning_rate": 1.9509391062097163e-05, "loss": 10.7485, "step": 659 }, { "epoch": 0.12895662368112543, "grad_norm": 19.675369262695312, "learning_rate": 1.950743118037842e-05, "loss": 10.4852, "step": 660 }, { "epoch": 0.12915201250488473, "grad_norm": 16.983983993530273, "learning_rate": 1.9505467490638317e-05, "loss": 10.9365, "step": 661 }, { "epoch": 0.129347401328644, "grad_norm": 12.646660804748535, "learning_rate": 1.950349999366337e-05, "loss": 11.257, "step": 662 }, { "epoch": 0.12954279015240328, "grad_norm": 10.177793502807617, "learning_rate": 1.9501528690241632e-05, "loss": 10.5301, "step": 663 }, { "epoch": 0.12973817897616258, "grad_norm": 25.35265350341797, "learning_rate": 1.9499553581162658e-05, "loss": 10.7971, "step": 664 }, { "epoch": 0.12993356779992185, "grad_norm": 9.132247924804688, "learning_rate": 1.9497574667217553e-05, "loss": 10.6206, "step": 665 }, { "epoch": 0.13012895662368112, "grad_norm": 12.48060131072998, "learning_rate": 1.9495591949198926e-05, "loss": 11.35, "step": 666 }, { "epoch": 0.1303243454474404, "grad_norm": 18.60825538635254, "learning_rate": 1.9493605427900922e-05, "loss": 10.384, "step": 667 }, { "epoch": 0.1305197342711997, "grad_norm": 10.191056251525879, "learning_rate": 1.94916151041192e-05, "loss": 10.8634, "step": 668 }, { "epoch": 0.13071512309495897, "grad_norm": 13.43942928314209, "learning_rate": 1.9489620978650946e-05, "loss": 10.8465, "step": 669 }, { "epoch": 0.13091051191871825, "grad_norm": 9.228296279907227, "learning_rate": 1.9487623052294875e-05, "loss": 11.0102, "step": 670 }, { "epoch": 0.13110590074247752, "grad_norm": 10.576723098754883, "learning_rate": 1.9485621325851213e-05, "loss": 10.9432, "step": 671 }, { "epoch": 0.13130128956623682, "grad_norm": 12.026520729064941, "learning_rate": 1.9483615800121717e-05, "loss": 11.577, "step": 672 }, { "epoch": 0.1314966783899961, "grad_norm": 12.680463790893555, "learning_rate": 1.948160647590966e-05, "loss": 10.7402, "step": 673 }, { "epoch": 0.13169206721375537, "grad_norm": 9.338397026062012, "learning_rate": 1.9479593354019843e-05, "loss": 10.3435, "step": 674 }, { "epoch": 0.13188745603751464, "grad_norm": 9.28903865814209, "learning_rate": 1.9477576435258575e-05, "loss": 10.9857, "step": 675 }, { "epoch": 0.13208284486127395, "grad_norm": 14.938960075378418, "learning_rate": 1.94755557204337e-05, "loss": 9.6975, "step": 676 }, { "epoch": 0.13227823368503322, "grad_norm": 20.733600616455078, "learning_rate": 1.947353121035458e-05, "loss": 10.5668, "step": 677 }, { "epoch": 0.1324736225087925, "grad_norm": 16.638071060180664, "learning_rate": 1.9471502905832088e-05, "loss": 11.0326, "step": 678 }, { "epoch": 0.13266901133255177, "grad_norm": 11.815279960632324, "learning_rate": 1.946947080767863e-05, "loss": 10.4106, "step": 679 }, { "epoch": 0.13286440015631107, "grad_norm": 11.046647071838379, "learning_rate": 1.9467434916708118e-05, "loss": 10.8296, "step": 680 }, { "epoch": 0.13305978898007034, "grad_norm": 14.552698135375977, "learning_rate": 1.9465395233735992e-05, "loss": 12.1462, "step": 681 }, { "epoch": 0.13325517780382962, "grad_norm": 17.292787551879883, "learning_rate": 1.946335175957921e-05, "loss": 11.1024, "step": 682 }, { "epoch": 0.1334505666275889, "grad_norm": 19.84165382385254, "learning_rate": 1.9461304495056247e-05, "loss": 10.2571, "step": 683 }, { "epoch": 0.1336459554513482, "grad_norm": 10.883853912353516, "learning_rate": 1.945925344098709e-05, "loss": 9.9972, "step": 684 }, { "epoch": 0.13384134427510747, "grad_norm": 12.545206069946289, "learning_rate": 1.9457198598193257e-05, "loss": 10.76, "step": 685 }, { "epoch": 0.13403673309886674, "grad_norm": 8.704316139221191, "learning_rate": 1.9455139967497773e-05, "loss": 10.6959, "step": 686 }, { "epoch": 0.134232121922626, "grad_norm": 11.672131538391113, "learning_rate": 1.945307754972518e-05, "loss": 11.8341, "step": 687 }, { "epoch": 0.13442751074638531, "grad_norm": 10.522601127624512, "learning_rate": 1.945101134570155e-05, "loss": 10.5467, "step": 688 }, { "epoch": 0.1346228995701446, "grad_norm": 10.88532543182373, "learning_rate": 1.9448941356254453e-05, "loss": 11.1203, "step": 689 }, { "epoch": 0.13481828839390386, "grad_norm": 10.269328117370605, "learning_rate": 1.9446867582212987e-05, "loss": 10.3559, "step": 690 }, { "epoch": 0.13501367721766314, "grad_norm": 11.798625946044922, "learning_rate": 1.9444790024407766e-05, "loss": 9.7001, "step": 691 }, { "epoch": 0.13520906604142244, "grad_norm": 10.87350845336914, "learning_rate": 1.944270868367091e-05, "loss": 11.1203, "step": 692 }, { "epoch": 0.1354044548651817, "grad_norm": 15.473928451538086, "learning_rate": 1.9440623560836064e-05, "loss": 10.6878, "step": 693 }, { "epoch": 0.13559984368894099, "grad_norm": 10.6022310256958, "learning_rate": 1.9438534656738386e-05, "loss": 10.6721, "step": 694 }, { "epoch": 0.1357952325127003, "grad_norm": 9.145467758178711, "learning_rate": 1.9436441972214545e-05, "loss": 10.7177, "step": 695 }, { "epoch": 0.13599062133645956, "grad_norm": 10.879833221435547, "learning_rate": 1.9434345508102726e-05, "loss": 10.2057, "step": 696 }, { "epoch": 0.13618601016021883, "grad_norm": 9.55604362487793, "learning_rate": 1.943224526524263e-05, "loss": 10.0616, "step": 697 }, { "epoch": 0.1363813989839781, "grad_norm": 11.760533332824707, "learning_rate": 1.9430141244475468e-05, "loss": 10.8176, "step": 698 }, { "epoch": 0.1365767878077374, "grad_norm": 17.020631790161133, "learning_rate": 1.9428033446643964e-05, "loss": 10.1112, "step": 699 }, { "epoch": 0.13677217663149668, "grad_norm": 12.788175582885742, "learning_rate": 1.942592187259236e-05, "loss": 11.4064, "step": 700 }, { "epoch": 0.13696756545525596, "grad_norm": 8.893692016601562, "learning_rate": 1.94238065231664e-05, "loss": 10.5016, "step": 701 }, { "epoch": 0.13716295427901523, "grad_norm": 10.239310264587402, "learning_rate": 1.942168739921336e-05, "loss": 10.837, "step": 702 }, { "epoch": 0.13735834310277453, "grad_norm": 11.442367553710938, "learning_rate": 1.9419564501582e-05, "loss": 11.4701, "step": 703 }, { "epoch": 0.1375537319265338, "grad_norm": 19.924348831176758, "learning_rate": 1.9417437831122615e-05, "loss": 10.4355, "step": 704 }, { "epoch": 0.13774912075029308, "grad_norm": 11.848478317260742, "learning_rate": 1.9415307388687007e-05, "loss": 12.0589, "step": 705 }, { "epoch": 0.13794450957405235, "grad_norm": 10.013603210449219, "learning_rate": 1.9413173175128472e-05, "loss": 10.827, "step": 706 }, { "epoch": 0.13813989839781166, "grad_norm": 15.492804527282715, "learning_rate": 1.9411035191301836e-05, "loss": 8.7754, "step": 707 }, { "epoch": 0.13833528722157093, "grad_norm": 15.368468284606934, "learning_rate": 1.9408893438063428e-05, "loss": 11.1972, "step": 708 }, { "epoch": 0.1385306760453302, "grad_norm": 8.977986335754395, "learning_rate": 1.9406747916271088e-05, "loss": 10.1648, "step": 709 }, { "epoch": 0.13872606486908948, "grad_norm": 10.708306312561035, "learning_rate": 1.940459862678416e-05, "loss": 10.5425, "step": 710 }, { "epoch": 0.13892145369284878, "grad_norm": 9.737686157226562, "learning_rate": 1.9402445570463502e-05, "loss": 9.6175, "step": 711 }, { "epoch": 0.13911684251660805, "grad_norm": 11.807138442993164, "learning_rate": 1.940028874817148e-05, "loss": 11.2366, "step": 712 }, { "epoch": 0.13931223134036733, "grad_norm": 9.303187370300293, "learning_rate": 1.9398128160771973e-05, "loss": 9.8696, "step": 713 }, { "epoch": 0.1395076201641266, "grad_norm": 19.42157745361328, "learning_rate": 1.9395963809130355e-05, "loss": 10.5176, "step": 714 }, { "epoch": 0.1397030089878859, "grad_norm": 10.6793794631958, "learning_rate": 1.9393795694113526e-05, "loss": 10.3265, "step": 715 }, { "epoch": 0.13989839781164518, "grad_norm": 10.98534870147705, "learning_rate": 1.9391623816589874e-05, "loss": 10.4471, "step": 716 }, { "epoch": 0.14009378663540445, "grad_norm": 9.268878936767578, "learning_rate": 1.9389448177429305e-05, "loss": 9.6479, "step": 717 }, { "epoch": 0.14028917545916372, "grad_norm": 6.744013786315918, "learning_rate": 1.9387268777503237e-05, "loss": 9.1637, "step": 718 }, { "epoch": 0.14048456428292302, "grad_norm": 10.005885124206543, "learning_rate": 1.938508561768458e-05, "loss": 9.3408, "step": 719 }, { "epoch": 0.1406799531066823, "grad_norm": 11.842926979064941, "learning_rate": 1.938289869884776e-05, "loss": 9.6945, "step": 720 }, { "epoch": 0.14087534193044157, "grad_norm": 10.483861923217773, "learning_rate": 1.9380708021868708e-05, "loss": 10.5852, "step": 721 }, { "epoch": 0.14107073075420085, "grad_norm": 10.77122688293457, "learning_rate": 1.9378513587624855e-05, "loss": 10.0683, "step": 722 }, { "epoch": 0.14126611957796015, "grad_norm": 10.417200088500977, "learning_rate": 1.937631539699514e-05, "loss": 9.9922, "step": 723 }, { "epoch": 0.14146150840171942, "grad_norm": 11.36428165435791, "learning_rate": 1.9374113450860012e-05, "loss": 10.2745, "step": 724 }, { "epoch": 0.1416568972254787, "grad_norm": 10.181743621826172, "learning_rate": 1.937190775010141e-05, "loss": 11.0796, "step": 725 }, { "epoch": 0.141852286049238, "grad_norm": 10.130002975463867, "learning_rate": 1.936969829560279e-05, "loss": 10.2408, "step": 726 }, { "epoch": 0.14204767487299727, "grad_norm": 12.734686851501465, "learning_rate": 1.9367485088249106e-05, "loss": 11.2567, "step": 727 }, { "epoch": 0.14224306369675654, "grad_norm": 12.297183990478516, "learning_rate": 1.936526812892682e-05, "loss": 10.2194, "step": 728 }, { "epoch": 0.14243845252051582, "grad_norm": 13.363584518432617, "learning_rate": 1.936304741852389e-05, "loss": 11.4515, "step": 729 }, { "epoch": 0.14263384134427512, "grad_norm": 12.167123794555664, "learning_rate": 1.936082295792978e-05, "loss": 11.2041, "step": 730 }, { "epoch": 0.1428292301680344, "grad_norm": 8.581811904907227, "learning_rate": 1.9358594748035455e-05, "loss": 9.6672, "step": 731 }, { "epoch": 0.14302461899179367, "grad_norm": 28.514732360839844, "learning_rate": 1.9356362789733383e-05, "loss": 10.7176, "step": 732 }, { "epoch": 0.14322000781555294, "grad_norm": 9.339582443237305, "learning_rate": 1.9354127083917535e-05, "loss": 10.3786, "step": 733 }, { "epoch": 0.14341539663931224, "grad_norm": 9.966968536376953, "learning_rate": 1.9351887631483372e-05, "loss": 11.0023, "step": 734 }, { "epoch": 0.14361078546307152, "grad_norm": 9.983609199523926, "learning_rate": 1.9349644433327875e-05, "loss": 10.4865, "step": 735 }, { "epoch": 0.1438061742868308, "grad_norm": 8.336555480957031, "learning_rate": 1.9347397490349507e-05, "loss": 10.2784, "step": 736 }, { "epoch": 0.14400156311059006, "grad_norm": 10.444098472595215, "learning_rate": 1.934514680344824e-05, "loss": 10.4828, "step": 737 }, { "epoch": 0.14419695193434937, "grad_norm": 9.705633163452148, "learning_rate": 1.9342892373525547e-05, "loss": 10.3624, "step": 738 }, { "epoch": 0.14439234075810864, "grad_norm": 7.4516921043396, "learning_rate": 1.9340634201484397e-05, "loss": 10.4507, "step": 739 }, { "epoch": 0.1445877295818679, "grad_norm": 8.806925773620605, "learning_rate": 1.9338372288229253e-05, "loss": 9.7585, "step": 740 }, { "epoch": 0.1447831184056272, "grad_norm": 10.420880317687988, "learning_rate": 1.9336106634666086e-05, "loss": 10.4827, "step": 741 }, { "epoch": 0.1449785072293865, "grad_norm": 14.015484809875488, "learning_rate": 1.933383724170236e-05, "loss": 11.1924, "step": 742 }, { "epoch": 0.14517389605314576, "grad_norm": 12.910109519958496, "learning_rate": 1.9331564110247035e-05, "loss": 10.4148, "step": 743 }, { "epoch": 0.14536928487690504, "grad_norm": 38.42607116699219, "learning_rate": 1.9329287241210574e-05, "loss": 11.107, "step": 744 }, { "epoch": 0.1455646737006643, "grad_norm": 16.01734161376953, "learning_rate": 1.932700663550493e-05, "loss": 10.0358, "step": 745 }, { "epoch": 0.1457600625244236, "grad_norm": 12.651199340820312, "learning_rate": 1.932472229404356e-05, "loss": 10.2062, "step": 746 }, { "epoch": 0.14595545134818289, "grad_norm": 13.449583053588867, "learning_rate": 1.9322434217741413e-05, "loss": 9.2374, "step": 747 }, { "epoch": 0.14615084017194216, "grad_norm": 15.523850440979004, "learning_rate": 1.932014240751493e-05, "loss": 11.7652, "step": 748 }, { "epoch": 0.14634622899570143, "grad_norm": 16.57215690612793, "learning_rate": 1.931784686428206e-05, "loss": 10.8459, "step": 749 }, { "epoch": 0.14654161781946073, "grad_norm": 11.21127700805664, "learning_rate": 1.9315547588962232e-05, "loss": 11.843, "step": 750 }, { "epoch": 0.14673700664322, "grad_norm": 9.245950698852539, "learning_rate": 1.9313244582476383e-05, "loss": 11.2157, "step": 751 }, { "epoch": 0.14693239546697928, "grad_norm": 9.136964797973633, "learning_rate": 1.9310937845746934e-05, "loss": 9.6121, "step": 752 }, { "epoch": 0.14712778429073858, "grad_norm": 10.128212928771973, "learning_rate": 1.930862737969781e-05, "loss": 10.1339, "step": 753 }, { "epoch": 0.14732317311449786, "grad_norm": 13.177711486816406, "learning_rate": 1.9306313185254417e-05, "loss": 10.2268, "step": 754 }, { "epoch": 0.14751856193825713, "grad_norm": 12.620972633361816, "learning_rate": 1.9303995263343665e-05, "loss": 10.7249, "step": 755 }, { "epoch": 0.1477139507620164, "grad_norm": 10.05113697052002, "learning_rate": 1.9301673614893957e-05, "loss": 10.9795, "step": 756 }, { "epoch": 0.1479093395857757, "grad_norm": 9.155972480773926, "learning_rate": 1.9299348240835182e-05, "loss": 9.1259, "step": 757 }, { "epoch": 0.14810472840953498, "grad_norm": 10.70669174194336, "learning_rate": 1.9297019142098725e-05, "loss": 10.718, "step": 758 }, { "epoch": 0.14830011723329425, "grad_norm": 13.88758659362793, "learning_rate": 1.9294686319617463e-05, "loss": 10.4523, "step": 759 }, { "epoch": 0.14849550605705353, "grad_norm": 17.19888687133789, "learning_rate": 1.9292349774325766e-05, "loss": 11.894, "step": 760 }, { "epoch": 0.14869089488081283, "grad_norm": 21.77732276916504, "learning_rate": 1.9290009507159488e-05, "loss": 12.2076, "step": 761 }, { "epoch": 0.1488862837045721, "grad_norm": 11.067423820495605, "learning_rate": 1.928766551905598e-05, "loss": 9.8428, "step": 762 }, { "epoch": 0.14908167252833138, "grad_norm": 16.04343605041504, "learning_rate": 1.9285317810954083e-05, "loss": 10.0406, "step": 763 }, { "epoch": 0.14927706135209065, "grad_norm": 10.960137367248535, "learning_rate": 1.9282966383794127e-05, "loss": 10.2735, "step": 764 }, { "epoch": 0.14947245017584995, "grad_norm": 10.08834171295166, "learning_rate": 1.9280611238517932e-05, "loss": 10.0519, "step": 765 }, { "epoch": 0.14966783899960923, "grad_norm": 14.802245140075684, "learning_rate": 1.9278252376068805e-05, "loss": 11.5211, "step": 766 }, { "epoch": 0.1498632278233685, "grad_norm": 19.65907859802246, "learning_rate": 1.927588979739155e-05, "loss": 11.1272, "step": 767 }, { "epoch": 0.15005861664712777, "grad_norm": 10.54307746887207, "learning_rate": 1.9273523503432444e-05, "loss": 10.9068, "step": 768 }, { "epoch": 0.15025400547088708, "grad_norm": 14.132508277893066, "learning_rate": 1.9271153495139266e-05, "loss": 10.1415, "step": 769 }, { "epoch": 0.15044939429464635, "grad_norm": 11.745526313781738, "learning_rate": 1.926877977346128e-05, "loss": 11.2422, "step": 770 }, { "epoch": 0.15064478311840562, "grad_norm": 9.170060157775879, "learning_rate": 1.9266402339349227e-05, "loss": 10.488, "step": 771 }, { "epoch": 0.1508401719421649, "grad_norm": 10.40644645690918, "learning_rate": 1.926402119375535e-05, "loss": 9.9418, "step": 772 }, { "epoch": 0.1510355607659242, "grad_norm": 10.796622276306152, "learning_rate": 1.9261636337633374e-05, "loss": 9.6721, "step": 773 }, { "epoch": 0.15123094958968347, "grad_norm": 12.15422534942627, "learning_rate": 1.92592477719385e-05, "loss": 10.767, "step": 774 }, { "epoch": 0.15142633841344275, "grad_norm": 8.324102401733398, "learning_rate": 1.925685549762743e-05, "loss": 10.0255, "step": 775 }, { "epoch": 0.15162172723720202, "grad_norm": 10.961838722229004, "learning_rate": 1.925445951565834e-05, "loss": 10.8336, "step": 776 }, { "epoch": 0.15181711606096132, "grad_norm": 11.413140296936035, "learning_rate": 1.9252059826990893e-05, "loss": 9.7799, "step": 777 }, { "epoch": 0.1520125048847206, "grad_norm": 11.092761993408203, "learning_rate": 1.9249656432586248e-05, "loss": 10.4188, "step": 778 }, { "epoch": 0.15220789370847987, "grad_norm": 13.403634071350098, "learning_rate": 1.924724933340703e-05, "loss": 10.6561, "step": 779 }, { "epoch": 0.15240328253223914, "grad_norm": 13.839494705200195, "learning_rate": 1.924483853041736e-05, "loss": 10.3124, "step": 780 }, { "epoch": 0.15259867135599844, "grad_norm": 16.102264404296875, "learning_rate": 1.924242402458284e-05, "loss": 10.6515, "step": 781 }, { "epoch": 0.15279406017975772, "grad_norm": 10.754448890686035, "learning_rate": 1.9240005816870554e-05, "loss": 10.3905, "step": 782 }, { "epoch": 0.152989449003517, "grad_norm": 14.738258361816406, "learning_rate": 1.9237583908249072e-05, "loss": 9.8646, "step": 783 }, { "epoch": 0.1531848378272763, "grad_norm": 17.20542335510254, "learning_rate": 1.9235158299688436e-05, "loss": 11.8768, "step": 784 }, { "epoch": 0.15338022665103557, "grad_norm": 12.346606254577637, "learning_rate": 1.9232728992160187e-05, "loss": 10.3553, "step": 785 }, { "epoch": 0.15357561547479484, "grad_norm": 12.346830368041992, "learning_rate": 1.923029598663733e-05, "loss": 10.2946, "step": 786 }, { "epoch": 0.15377100429855411, "grad_norm": 9.510976791381836, "learning_rate": 1.9227859284094367e-05, "loss": 9.9759, "step": 787 }, { "epoch": 0.15396639312231342, "grad_norm": 7.273228645324707, "learning_rate": 1.9225418885507264e-05, "loss": 9.3106, "step": 788 }, { "epoch": 0.1541617819460727, "grad_norm": 10.699312210083008, "learning_rate": 1.9222974791853485e-05, "loss": 10.2201, "step": 789 }, { "epoch": 0.15435717076983196, "grad_norm": 10.579397201538086, "learning_rate": 1.922052700411196e-05, "loss": 10.8552, "step": 790 }, { "epoch": 0.15455255959359124, "grad_norm": 8.557092666625977, "learning_rate": 1.9218075523263104e-05, "loss": 9.5428, "step": 791 }, { "epoch": 0.15474794841735054, "grad_norm": 9.219961166381836, "learning_rate": 1.9215620350288816e-05, "loss": 11.1206, "step": 792 }, { "epoch": 0.1549433372411098, "grad_norm": 9.186723709106445, "learning_rate": 1.9213161486172464e-05, "loss": 10.8128, "step": 793 }, { "epoch": 0.1551387260648691, "grad_norm": 14.034444808959961, "learning_rate": 1.9210698931898903e-05, "loss": 10.4753, "step": 794 }, { "epoch": 0.15533411488862836, "grad_norm": 13.460503578186035, "learning_rate": 1.920823268845446e-05, "loss": 10.4392, "step": 795 }, { "epoch": 0.15552950371238766, "grad_norm": 17.306602478027344, "learning_rate": 1.920576275682694e-05, "loss": 10.153, "step": 796 }, { "epoch": 0.15572489253614694, "grad_norm": 14.931540489196777, "learning_rate": 1.9203289138005634e-05, "loss": 11.0139, "step": 797 }, { "epoch": 0.1559202813599062, "grad_norm": 11.157635688781738, "learning_rate": 1.92008118329813e-05, "loss": 10.3161, "step": 798 }, { "epoch": 0.15611567018366548, "grad_norm": 16.768131256103516, "learning_rate": 1.919833084274617e-05, "loss": 9.3059, "step": 799 }, { "epoch": 0.15631105900742479, "grad_norm": 26.313011169433594, "learning_rate": 1.9195846168293967e-05, "loss": 9.0071, "step": 800 }, { "epoch": 0.15650644783118406, "grad_norm": 10.458858489990234, "learning_rate": 1.919335781061987e-05, "loss": 10.5491, "step": 801 }, { "epoch": 0.15670183665494333, "grad_norm": 8.418832778930664, "learning_rate": 1.919086577072055e-05, "loss": 9.9025, "step": 802 }, { "epoch": 0.1568972254787026, "grad_norm": 13.979361534118652, "learning_rate": 1.918837004959415e-05, "loss": 10.9538, "step": 803 }, { "epoch": 0.1570926143024619, "grad_norm": 8.292892456054688, "learning_rate": 1.9185870648240278e-05, "loss": 9.9292, "step": 804 }, { "epoch": 0.15728800312622118, "grad_norm": 18.740543365478516, "learning_rate": 1.9183367567660024e-05, "loss": 9.2841, "step": 805 }, { "epoch": 0.15748339194998046, "grad_norm": 11.974870681762695, "learning_rate": 1.9180860808855944e-05, "loss": 9.4875, "step": 806 }, { "epoch": 0.15767878077373973, "grad_norm": 11.385255813598633, "learning_rate": 1.917835037283208e-05, "loss": 10.3209, "step": 807 }, { "epoch": 0.15787416959749903, "grad_norm": 8.198461532592773, "learning_rate": 1.9175836260593937e-05, "loss": 9.3359, "step": 808 }, { "epoch": 0.1580695584212583, "grad_norm": 23.158065795898438, "learning_rate": 1.9173318473148495e-05, "loss": 10.8434, "step": 809 }, { "epoch": 0.15826494724501758, "grad_norm": 9.892696380615234, "learning_rate": 1.9170797011504206e-05, "loss": 10.5046, "step": 810 }, { "epoch": 0.15846033606877685, "grad_norm": 23.545766830444336, "learning_rate": 1.9168271876670994e-05, "loss": 9.549, "step": 811 }, { "epoch": 0.15865572489253615, "grad_norm": 10.899819374084473, "learning_rate": 1.9165743069660253e-05, "loss": 10.2441, "step": 812 }, { "epoch": 0.15885111371629543, "grad_norm": 22.16189193725586, "learning_rate": 1.9163210591484852e-05, "loss": 10.6823, "step": 813 }, { "epoch": 0.1590465025400547, "grad_norm": 10.303801536560059, "learning_rate": 1.916067444315912e-05, "loss": 10.7417, "step": 814 }, { "epoch": 0.159241891363814, "grad_norm": 11.283565521240234, "learning_rate": 1.915813462569887e-05, "loss": 9.949, "step": 815 }, { "epoch": 0.15943728018757328, "grad_norm": 15.853772163391113, "learning_rate": 1.9155591140121373e-05, "loss": 11.3669, "step": 816 }, { "epoch": 0.15963266901133255, "grad_norm": 16.68556022644043, "learning_rate": 1.9153043987445377e-05, "loss": 10.1564, "step": 817 }, { "epoch": 0.15982805783509182, "grad_norm": 7.7740654945373535, "learning_rate": 1.9150493168691097e-05, "loss": 11.4006, "step": 818 }, { "epoch": 0.16002344665885113, "grad_norm": 9.202394485473633, "learning_rate": 1.9147938684880213e-05, "loss": 11.4376, "step": 819 }, { "epoch": 0.1602188354826104, "grad_norm": 8.620976448059082, "learning_rate": 1.914538053703587e-05, "loss": 10.9465, "step": 820 }, { "epoch": 0.16041422430636967, "grad_norm": 8.079070091247559, "learning_rate": 1.9142818726182696e-05, "loss": 10.0092, "step": 821 }, { "epoch": 0.16060961313012895, "grad_norm": 8.546801567077637, "learning_rate": 1.9140253253346767e-05, "loss": 9.9966, "step": 822 }, { "epoch": 0.16080500195388825, "grad_norm": 8.872438430786133, "learning_rate": 1.9137684119555642e-05, "loss": 10.4949, "step": 823 }, { "epoch": 0.16100039077764752, "grad_norm": 8.324982643127441, "learning_rate": 1.913511132583833e-05, "loss": 10.6894, "step": 824 }, { "epoch": 0.1611957796014068, "grad_norm": 11.035906791687012, "learning_rate": 1.9132534873225323e-05, "loss": 10.7705, "step": 825 }, { "epoch": 0.16139116842516607, "grad_norm": 14.182964324951172, "learning_rate": 1.9129954762748567e-05, "loss": 11.0233, "step": 826 }, { "epoch": 0.16158655724892537, "grad_norm": 11.811083793640137, "learning_rate": 1.9127370995441475e-05, "loss": 10.7647, "step": 827 }, { "epoch": 0.16178194607268465, "grad_norm": 10.472794532775879, "learning_rate": 1.912478357233893e-05, "loss": 10.7073, "step": 828 }, { "epoch": 0.16197733489644392, "grad_norm": 12.881394386291504, "learning_rate": 1.9122192494477268e-05, "loss": 10.2255, "step": 829 }, { "epoch": 0.1621727237202032, "grad_norm": 11.292678833007812, "learning_rate": 1.9119597762894302e-05, "loss": 11.345, "step": 830 }, { "epoch": 0.1623681125439625, "grad_norm": 10.249587059020996, "learning_rate": 1.9116999378629305e-05, "loss": 10.1139, "step": 831 }, { "epoch": 0.16256350136772177, "grad_norm": 14.024880409240723, "learning_rate": 1.9114397342723005e-05, "loss": 10.1549, "step": 832 }, { "epoch": 0.16275889019148104, "grad_norm": 13.929058074951172, "learning_rate": 1.9111791656217598e-05, "loss": 10.9506, "step": 833 }, { "epoch": 0.16295427901524032, "grad_norm": 14.429847717285156, "learning_rate": 1.9109182320156744e-05, "loss": 9.965, "step": 834 }, { "epoch": 0.16314966783899962, "grad_norm": 10.850347518920898, "learning_rate": 1.9106569335585566e-05, "loss": 10.1431, "step": 835 }, { "epoch": 0.1633450566627589, "grad_norm": 9.337977409362793, "learning_rate": 1.9103952703550642e-05, "loss": 11.1465, "step": 836 }, { "epoch": 0.16354044548651817, "grad_norm": 8.037405014038086, "learning_rate": 1.9101332425100016e-05, "loss": 10.2202, "step": 837 }, { "epoch": 0.16373583431027744, "grad_norm": 10.364066123962402, "learning_rate": 1.9098708501283188e-05, "loss": 9.844, "step": 838 }, { "epoch": 0.16393122313403674, "grad_norm": 7.588009834289551, "learning_rate": 1.9096080933151123e-05, "loss": 9.8606, "step": 839 }, { "epoch": 0.16412661195779601, "grad_norm": 16.311908721923828, "learning_rate": 1.9093449721756246e-05, "loss": 10.1571, "step": 840 }, { "epoch": 0.1643220007815553, "grad_norm": 9.902872085571289, "learning_rate": 1.9090814868152435e-05, "loss": 8.7677, "step": 841 }, { "epoch": 0.16451738960531456, "grad_norm": 9.914548873901367, "learning_rate": 1.908817637339503e-05, "loss": 9.726, "step": 842 }, { "epoch": 0.16471277842907386, "grad_norm": 10.072443962097168, "learning_rate": 1.9085534238540834e-05, "loss": 10.4707, "step": 843 }, { "epoch": 0.16490816725283314, "grad_norm": 9.244873046875, "learning_rate": 1.9082888464648103e-05, "loss": 10.3142, "step": 844 }, { "epoch": 0.1651035560765924, "grad_norm": 8.684638023376465, "learning_rate": 1.9080239052776554e-05, "loss": 10.3544, "step": 845 }, { "epoch": 0.1652989449003517, "grad_norm": 12.886144638061523, "learning_rate": 1.907758600398735e-05, "loss": 10.8514, "step": 846 }, { "epoch": 0.165494333724111, "grad_norm": 13.057008743286133, "learning_rate": 1.9074929319343126e-05, "loss": 9.6809, "step": 847 }, { "epoch": 0.16568972254787026, "grad_norm": 11.642423629760742, "learning_rate": 1.9072268999907967e-05, "loss": 10.628, "step": 848 }, { "epoch": 0.16588511137162953, "grad_norm": 13.759044647216797, "learning_rate": 1.9069605046747414e-05, "loss": 10.9427, "step": 849 }, { "epoch": 0.16608050019538884, "grad_norm": 9.928781509399414, "learning_rate": 1.906693746092846e-05, "loss": 9.9875, "step": 850 }, { "epoch": 0.1662758890191481, "grad_norm": 15.753911018371582, "learning_rate": 1.9064266243519553e-05, "loss": 10.0349, "step": 851 }, { "epoch": 0.16647127784290738, "grad_norm": 10.317689895629883, "learning_rate": 1.9061591395590606e-05, "loss": 9.6778, "step": 852 }, { "epoch": 0.16666666666666666, "grad_norm": 9.685894012451172, "learning_rate": 1.9058912918212976e-05, "loss": 9.3574, "step": 853 }, { "epoch": 0.16686205549042596, "grad_norm": 12.898737907409668, "learning_rate": 1.9056230812459476e-05, "loss": 10.8164, "step": 854 }, { "epoch": 0.16705744431418523, "grad_norm": 11.205578804016113, "learning_rate": 1.905354507940437e-05, "loss": 9.3245, "step": 855 }, { "epoch": 0.1672528331379445, "grad_norm": 13.233284950256348, "learning_rate": 1.905085572012338e-05, "loss": 10.7563, "step": 856 }, { "epoch": 0.16744822196170378, "grad_norm": 12.376396179199219, "learning_rate": 1.904816273569368e-05, "loss": 10.7316, "step": 857 }, { "epoch": 0.16764361078546308, "grad_norm": 16.27195930480957, "learning_rate": 1.904546612719389e-05, "loss": 10.638, "step": 858 }, { "epoch": 0.16783899960922236, "grad_norm": 7.701115131378174, "learning_rate": 1.9042765895704085e-05, "loss": 9.1598, "step": 859 }, { "epoch": 0.16803438843298163, "grad_norm": 36.823429107666016, "learning_rate": 1.9040062042305796e-05, "loss": 10.7289, "step": 860 }, { "epoch": 0.1682297772567409, "grad_norm": 9.800819396972656, "learning_rate": 1.9037354568081995e-05, "loss": 10.1007, "step": 861 }, { "epoch": 0.1684251660805002, "grad_norm": 12.288325309753418, "learning_rate": 1.9034643474117114e-05, "loss": 9.1896, "step": 862 }, { "epoch": 0.16862055490425948, "grad_norm": 10.411764144897461, "learning_rate": 1.9031928761497026e-05, "loss": 10.3171, "step": 863 }, { "epoch": 0.16881594372801875, "grad_norm": 10.256525993347168, "learning_rate": 1.9029210431309063e-05, "loss": 9.6621, "step": 864 }, { "epoch": 0.16901133255177803, "grad_norm": 13.477582931518555, "learning_rate": 1.9026488484641995e-05, "loss": 9.3424, "step": 865 }, { "epoch": 0.16920672137553733, "grad_norm": 11.819972038269043, "learning_rate": 1.9023762922586052e-05, "loss": 9.5409, "step": 866 }, { "epoch": 0.1694021101992966, "grad_norm": 12.532838821411133, "learning_rate": 1.9021033746232903e-05, "loss": 9.0871, "step": 867 }, { "epoch": 0.16959749902305588, "grad_norm": 15.850306510925293, "learning_rate": 1.9018300956675667e-05, "loss": 9.578, "step": 868 }, { "epoch": 0.16979288784681515, "grad_norm": 46.099002838134766, "learning_rate": 1.9015564555008912e-05, "loss": 9.3759, "step": 869 }, { "epoch": 0.16998827667057445, "grad_norm": 20.86892318725586, "learning_rate": 1.901282454232865e-05, "loss": 11.2573, "step": 870 }, { "epoch": 0.17018366549433372, "grad_norm": 14.856077194213867, "learning_rate": 1.9010080919732347e-05, "loss": 10.2307, "step": 871 }, { "epoch": 0.170379054318093, "grad_norm": 11.684233665466309, "learning_rate": 1.9007333688318902e-05, "loss": 9.5953, "step": 872 }, { "epoch": 0.17057444314185227, "grad_norm": 21.605602264404297, "learning_rate": 1.9004582849188668e-05, "loss": 11.3202, "step": 873 }, { "epoch": 0.17076983196561157, "grad_norm": 20.927377700805664, "learning_rate": 1.9001828403443444e-05, "loss": 11.4154, "step": 874 }, { "epoch": 0.17096522078937085, "grad_norm": 13.724664688110352, "learning_rate": 1.8999070352186474e-05, "loss": 10.2794, "step": 875 }, { "epoch": 0.17116060961313012, "grad_norm": 11.638361930847168, "learning_rate": 1.8996308696522435e-05, "loss": 10.5548, "step": 876 }, { "epoch": 0.17135599843688942, "grad_norm": 23.183778762817383, "learning_rate": 1.8993543437557462e-05, "loss": 11.002, "step": 877 }, { "epoch": 0.1715513872606487, "grad_norm": 30.77211570739746, "learning_rate": 1.8990774576399124e-05, "loss": 11.6167, "step": 878 }, { "epoch": 0.17174677608440797, "grad_norm": 113.89714813232422, "learning_rate": 1.898800211415644e-05, "loss": 11.3287, "step": 879 }, { "epoch": 0.17194216490816724, "grad_norm": 8.955188751220703, "learning_rate": 1.898522605193986e-05, "loss": 10.0213, "step": 880 }, { "epoch": 0.17213755373192655, "grad_norm": 14.04103946685791, "learning_rate": 1.8982446390861286e-05, "loss": 10.7902, "step": 881 }, { "epoch": 0.17233294255568582, "grad_norm": 10.359882354736328, "learning_rate": 1.8979663132034062e-05, "loss": 10.6917, "step": 882 }, { "epoch": 0.1725283313794451, "grad_norm": 8.169754981994629, "learning_rate": 1.897687627657297e-05, "loss": 10.1715, "step": 883 }, { "epoch": 0.17272372020320437, "grad_norm": 10.486644744873047, "learning_rate": 1.8974085825594226e-05, "loss": 10.3867, "step": 884 }, { "epoch": 0.17291910902696367, "grad_norm": 34.91582489013672, "learning_rate": 1.8971291780215496e-05, "loss": 9.8048, "step": 885 }, { "epoch": 0.17311449785072294, "grad_norm": 11.902545928955078, "learning_rate": 1.8968494141555882e-05, "loss": 10.4441, "step": 886 }, { "epoch": 0.17330988667448222, "grad_norm": 11.266779899597168, "learning_rate": 1.8965692910735925e-05, "loss": 10.7083, "step": 887 }, { "epoch": 0.1735052754982415, "grad_norm": 25.55230712890625, "learning_rate": 1.8962888088877607e-05, "loss": 9.981, "step": 888 }, { "epoch": 0.1737006643220008, "grad_norm": 10.482330322265625, "learning_rate": 1.8960079677104342e-05, "loss": 9.7049, "step": 889 }, { "epoch": 0.17389605314576007, "grad_norm": 9.570494651794434, "learning_rate": 1.895726767654099e-05, "loss": 9.6493, "step": 890 }, { "epoch": 0.17409144196951934, "grad_norm": 10.879228591918945, "learning_rate": 1.895445208831384e-05, "loss": 10.1414, "step": 891 }, { "epoch": 0.1742868307932786, "grad_norm": 14.008447647094727, "learning_rate": 1.8951632913550625e-05, "loss": 10.9662, "step": 892 }, { "epoch": 0.17448221961703791, "grad_norm": 10.85938549041748, "learning_rate": 1.8948810153380514e-05, "loss": 9.9632, "step": 893 }, { "epoch": 0.1746776084407972, "grad_norm": 10.80557918548584, "learning_rate": 1.8945983808934105e-05, "loss": 10.1232, "step": 894 }, { "epoch": 0.17487299726455646, "grad_norm": 14.553106307983398, "learning_rate": 1.894315388134344e-05, "loss": 11.7757, "step": 895 }, { "epoch": 0.17506838608831574, "grad_norm": 10.305713653564453, "learning_rate": 1.8940320371741995e-05, "loss": 10.6498, "step": 896 }, { "epoch": 0.17526377491207504, "grad_norm": 19.3458194732666, "learning_rate": 1.893748328126468e-05, "loss": 10.2667, "step": 897 }, { "epoch": 0.1754591637358343, "grad_norm": 10.901649475097656, "learning_rate": 1.8934642611047828e-05, "loss": 9.6728, "step": 898 }, { "epoch": 0.17565455255959359, "grad_norm": 7.951071262359619, "learning_rate": 1.8931798362229227e-05, "loss": 9.3648, "step": 899 }, { "epoch": 0.17584994138335286, "grad_norm": 12.390594482421875, "learning_rate": 1.8928950535948077e-05, "loss": 9.9003, "step": 900 }, { "epoch": 0.17604533020711216, "grad_norm": 10.91873836517334, "learning_rate": 1.8926099133345025e-05, "loss": 10.0891, "step": 901 }, { "epoch": 0.17624071903087143, "grad_norm": 10.91873836517334, "learning_rate": 1.8926099133345025e-05, "loss": 11.0351, "step": 902 }, { "epoch": 0.1764361078546307, "grad_norm": 12.28889274597168, "learning_rate": 1.8923244155562147e-05, "loss": 10.2997, "step": 903 }, { "epoch": 0.17663149667839, "grad_norm": 13.323399543762207, "learning_rate": 1.8920385603742955e-05, "loss": 10.6953, "step": 904 }, { "epoch": 0.17682688550214928, "grad_norm": 9.011679649353027, "learning_rate": 1.8917523479032378e-05, "loss": 10.1096, "step": 905 }, { "epoch": 0.17702227432590856, "grad_norm": 13.647565841674805, "learning_rate": 1.8914657782576795e-05, "loss": 9.576, "step": 906 }, { "epoch": 0.17721766314966783, "grad_norm": 18.961706161499023, "learning_rate": 1.8911788515524e-05, "loss": 10.1923, "step": 907 }, { "epoch": 0.17741305197342713, "grad_norm": 15.747967720031738, "learning_rate": 1.8908915679023222e-05, "loss": 10.5809, "step": 908 }, { "epoch": 0.1776084407971864, "grad_norm": 14.569042205810547, "learning_rate": 1.8906039274225126e-05, "loss": 10.8621, "step": 909 }, { "epoch": 0.17780382962094568, "grad_norm": 14.774585723876953, "learning_rate": 1.8903159302281803e-05, "loss": 10.6481, "step": 910 }, { "epoch": 0.17799921844470495, "grad_norm": 11.765643119812012, "learning_rate": 1.890027576434677e-05, "loss": 10.7216, "step": 911 }, { "epoch": 0.17819460726846426, "grad_norm": 9.366642951965332, "learning_rate": 1.889738866157497e-05, "loss": 10.1954, "step": 912 }, { "epoch": 0.17838999609222353, "grad_norm": 10.075114250183105, "learning_rate": 1.8894497995122775e-05, "loss": 10.356, "step": 913 }, { "epoch": 0.1785853849159828, "grad_norm": 12.980185508728027, "learning_rate": 1.8891603766147994e-05, "loss": 10.7433, "step": 914 }, { "epoch": 0.17878077373974208, "grad_norm": 27.603477478027344, "learning_rate": 1.888870597580985e-05, "loss": 9.8225, "step": 915 }, { "epoch": 0.17897616256350138, "grad_norm": 9.369039535522461, "learning_rate": 1.8885804625269004e-05, "loss": 9.6697, "step": 916 }, { "epoch": 0.17917155138726065, "grad_norm": 9.285225868225098, "learning_rate": 1.888289971568753e-05, "loss": 11.0268, "step": 917 }, { "epoch": 0.17936694021101993, "grad_norm": 7.665126800537109, "learning_rate": 1.8879991248228942e-05, "loss": 10.105, "step": 918 }, { "epoch": 0.1795623290347792, "grad_norm": 21.36392593383789, "learning_rate": 1.8877079224058167e-05, "loss": 10.8853, "step": 919 }, { "epoch": 0.1797577178585385, "grad_norm": 8.818321228027344, "learning_rate": 1.8874163644341556e-05, "loss": 10.3977, "step": 920 }, { "epoch": 0.17995310668229778, "grad_norm": 21.660226821899414, "learning_rate": 1.8871244510246898e-05, "loss": 10.5013, "step": 921 }, { "epoch": 0.18014849550605705, "grad_norm": 15.823149681091309, "learning_rate": 1.8868321822943396e-05, "loss": 10.5462, "step": 922 }, { "epoch": 0.18034388432981632, "grad_norm": 8.88244342803955, "learning_rate": 1.886539558360167e-05, "loss": 10.4773, "step": 923 }, { "epoch": 0.18053927315357562, "grad_norm": 9.925248146057129, "learning_rate": 1.8862465793393775e-05, "loss": 10.5078, "step": 924 }, { "epoch": 0.1807346619773349, "grad_norm": 16.658496856689453, "learning_rate": 1.8859532453493182e-05, "loss": 10.6751, "step": 925 }, { "epoch": 0.18093005080109417, "grad_norm": 8.997674942016602, "learning_rate": 1.8856595565074786e-05, "loss": 9.6502, "step": 926 }, { "epoch": 0.18112543962485345, "grad_norm": 14.015752792358398, "learning_rate": 1.88536551293149e-05, "loss": 11.0057, "step": 927 }, { "epoch": 0.18132082844861275, "grad_norm": 9.927313804626465, "learning_rate": 1.8850711147391257e-05, "loss": 9.5504, "step": 928 }, { "epoch": 0.18151621727237202, "grad_norm": 9.687064170837402, "learning_rate": 1.8847763620483023e-05, "loss": 9.8367, "step": 929 }, { "epoch": 0.1817116060961313, "grad_norm": 11.51573371887207, "learning_rate": 1.8844812549770765e-05, "loss": 10.774, "step": 930 }, { "epoch": 0.18190699491989057, "grad_norm": 14.399081230163574, "learning_rate": 1.884185793643648e-05, "loss": 10.9734, "step": 931 }, { "epoch": 0.18210238374364987, "grad_norm": 11.186507225036621, "learning_rate": 1.8838899781663587e-05, "loss": 10.5601, "step": 932 }, { "epoch": 0.18229777256740914, "grad_norm": 8.207215309143066, "learning_rate": 1.8835938086636913e-05, "loss": 11.1318, "step": 933 }, { "epoch": 0.18249316139116842, "grad_norm": 10.036437034606934, "learning_rate": 1.883297285254271e-05, "loss": 9.1535, "step": 934 }, { "epoch": 0.18268855021492772, "grad_norm": 8.014676094055176, "learning_rate": 1.8830004080568648e-05, "loss": 9.3372, "step": 935 }, { "epoch": 0.182883939038687, "grad_norm": 11.679019927978516, "learning_rate": 1.8827031771903814e-05, "loss": 9.9121, "step": 936 }, { "epoch": 0.18307932786244627, "grad_norm": 8.254961013793945, "learning_rate": 1.8824055927738706e-05, "loss": 10.4003, "step": 937 }, { "epoch": 0.18327471668620554, "grad_norm": 10.134276390075684, "learning_rate": 1.8821076549265246e-05, "loss": 11.0081, "step": 938 }, { "epoch": 0.18347010550996484, "grad_norm": 8.73877239227295, "learning_rate": 1.8818093637676763e-05, "loss": 9.1944, "step": 939 }, { "epoch": 0.18366549433372412, "grad_norm": 33.710693359375, "learning_rate": 1.8815107194168007e-05, "loss": 10.5759, "step": 940 }, { "epoch": 0.1838608831574834, "grad_norm": 7.068522930145264, "learning_rate": 1.8812117219935143e-05, "loss": 10.0031, "step": 941 }, { "epoch": 0.18405627198124266, "grad_norm": 16.002485275268555, "learning_rate": 1.880912371617575e-05, "loss": 9.6456, "step": 942 }, { "epoch": 0.18425166080500197, "grad_norm": 32.332855224609375, "learning_rate": 1.8806126684088813e-05, "loss": 11.6795, "step": 943 }, { "epoch": 0.18444704962876124, "grad_norm": 9.968116760253906, "learning_rate": 1.8803126124874748e-05, "loss": 9.8382, "step": 944 }, { "epoch": 0.1846424384525205, "grad_norm": 11.146265983581543, "learning_rate": 1.880012203973536e-05, "loss": 10.6834, "step": 945 }, { "epoch": 0.1848378272762798, "grad_norm": 11.863654136657715, "learning_rate": 1.879711442987388e-05, "loss": 10.1275, "step": 946 }, { "epoch": 0.1850332161000391, "grad_norm": 11.573152542114258, "learning_rate": 1.8794103296494954e-05, "loss": 12.0389, "step": 947 }, { "epoch": 0.18522860492379836, "grad_norm": 10.724802017211914, "learning_rate": 1.8791088640804634e-05, "loss": 10.1915, "step": 948 }, { "epoch": 0.18542399374755764, "grad_norm": 11.720141410827637, "learning_rate": 1.878807046401038e-05, "loss": 9.1108, "step": 949 }, { "epoch": 0.1856193825713169, "grad_norm": 11.617509841918945, "learning_rate": 1.878504876732107e-05, "loss": 10.0505, "step": 950 }, { "epoch": 0.1858147713950762, "grad_norm": 12.604987144470215, "learning_rate": 1.878202355194698e-05, "loss": 9.9923, "step": 951 }, { "epoch": 0.18601016021883549, "grad_norm": 9.539042472839355, "learning_rate": 1.877899481909981e-05, "loss": 9.6927, "step": 952 }, { "epoch": 0.18620554904259476, "grad_norm": 9.913569450378418, "learning_rate": 1.8775962569992662e-05, "loss": 10.0365, "step": 953 }, { "epoch": 0.18640093786635403, "grad_norm": 35.884490966796875, "learning_rate": 1.8772926805840036e-05, "loss": 10.3549, "step": 954 }, { "epoch": 0.18659632669011333, "grad_norm": 16.634611129760742, "learning_rate": 1.876988752785786e-05, "loss": 10.5787, "step": 955 }, { "epoch": 0.1867917155138726, "grad_norm": 51.59297180175781, "learning_rate": 1.8766844737263456e-05, "loss": 10.6785, "step": 956 }, { "epoch": 0.18698710433763188, "grad_norm": 10.21894645690918, "learning_rate": 1.8763798435275554e-05, "loss": 10.3058, "step": 957 }, { "epoch": 0.18718249316139116, "grad_norm": 10.180886268615723, "learning_rate": 1.8760748623114295e-05, "loss": 10.3053, "step": 958 }, { "epoch": 0.18737788198515046, "grad_norm": 19.828392028808594, "learning_rate": 1.8757695302001223e-05, "loss": 10.1661, "step": 959 }, { "epoch": 0.18757327080890973, "grad_norm": 10.160892486572266, "learning_rate": 1.8754638473159285e-05, "loss": 11.049, "step": 960 }, { "epoch": 0.187768659632669, "grad_norm": 12.746784210205078, "learning_rate": 1.875157813781284e-05, "loss": 9.86, "step": 961 }, { "epoch": 0.18796404845642828, "grad_norm": 14.9076509475708, "learning_rate": 1.874851429718765e-05, "loss": 10.0313, "step": 962 }, { "epoch": 0.18815943728018758, "grad_norm": 8.683619499206543, "learning_rate": 1.8745446952510868e-05, "loss": 10.4593, "step": 963 }, { "epoch": 0.18835482610394685, "grad_norm": 17.066213607788086, "learning_rate": 1.8742376105011072e-05, "loss": 10.2991, "step": 964 }, { "epoch": 0.18855021492770613, "grad_norm": 17.554244995117188, "learning_rate": 1.873930175591822e-05, "loss": 10.2592, "step": 965 }, { "epoch": 0.18874560375146543, "grad_norm": 11.156131744384766, "learning_rate": 1.8736223906463698e-05, "loss": 9.3706, "step": 966 }, { "epoch": 0.1889409925752247, "grad_norm": 8.87385368347168, "learning_rate": 1.873314255788027e-05, "loss": 8.2635, "step": 967 }, { "epoch": 0.18913638139898398, "grad_norm": 11.072525978088379, "learning_rate": 1.8730057711402113e-05, "loss": 9.8821, "step": 968 }, { "epoch": 0.18933177022274325, "grad_norm": 10.544384002685547, "learning_rate": 1.872696936826481e-05, "loss": 9.6465, "step": 969 }, { "epoch": 0.18952715904650255, "grad_norm": 18.760879516601562, "learning_rate": 1.872387752970533e-05, "loss": 9.3924, "step": 970 }, { "epoch": 0.18972254787026183, "grad_norm": 10.454094886779785, "learning_rate": 1.8720782196962055e-05, "loss": 10.16, "step": 971 }, { "epoch": 0.1899179366940211, "grad_norm": 13.4886474609375, "learning_rate": 1.8717683371274763e-05, "loss": 11.129, "step": 972 }, { "epoch": 0.19011332551778037, "grad_norm": 8.18844223022461, "learning_rate": 1.8714581053884628e-05, "loss": 9.3449, "step": 973 }, { "epoch": 0.19030871434153968, "grad_norm": 7.980138301849365, "learning_rate": 1.871147524603422e-05, "loss": 10.3902, "step": 974 }, { "epoch": 0.19050410316529895, "grad_norm": 11.241415977478027, "learning_rate": 1.870836594896752e-05, "loss": 9.4598, "step": 975 }, { "epoch": 0.19069949198905822, "grad_norm": 8.223668098449707, "learning_rate": 1.8705253163929895e-05, "loss": 8.8009, "step": 976 }, { "epoch": 0.1908948808128175, "grad_norm": 8.505722999572754, "learning_rate": 1.870213689216811e-05, "loss": 9.5131, "step": 977 }, { "epoch": 0.1910902696365768, "grad_norm": 13.652040481567383, "learning_rate": 1.8699017134930327e-05, "loss": 9.904, "step": 978 }, { "epoch": 0.19128565846033607, "grad_norm": 8.11439037322998, "learning_rate": 1.869589389346611e-05, "loss": 9.4733, "step": 979 }, { "epoch": 0.19148104728409535, "grad_norm": 11.425436019897461, "learning_rate": 1.8692767169026415e-05, "loss": 10.8716, "step": 980 }, { "epoch": 0.19167643610785462, "grad_norm": 10.17133617401123, "learning_rate": 1.8689636962863588e-05, "loss": 9.3576, "step": 981 }, { "epoch": 0.19187182493161392, "grad_norm": 9.900402069091797, "learning_rate": 1.8686503276231372e-05, "loss": 10.6097, "step": 982 }, { "epoch": 0.1920672137553732, "grad_norm": 27.443452835083008, "learning_rate": 1.8683366110384912e-05, "loss": 11.2632, "step": 983 }, { "epoch": 0.19226260257913247, "grad_norm": 11.592450141906738, "learning_rate": 1.868022546658074e-05, "loss": 8.5889, "step": 984 }, { "epoch": 0.19245799140289174, "grad_norm": 16.061792373657227, "learning_rate": 1.8677081346076777e-05, "loss": 10.4353, "step": 985 }, { "epoch": 0.19265338022665104, "grad_norm": 19.611604690551758, "learning_rate": 1.8673933750132342e-05, "loss": 10.8018, "step": 986 }, { "epoch": 0.19284876905041032, "grad_norm": 10.408438682556152, "learning_rate": 1.8670782680008145e-05, "loss": 10.5335, "step": 987 }, { "epoch": 0.1930441578741696, "grad_norm": 6.8224873542785645, "learning_rate": 1.866762813696629e-05, "loss": 8.9648, "step": 988 }, { "epoch": 0.19323954669792887, "grad_norm": 8.705338478088379, "learning_rate": 1.866447012227026e-05, "loss": 10.8897, "step": 989 }, { "epoch": 0.19343493552168817, "grad_norm": 9.13908863067627, "learning_rate": 1.8661308637184955e-05, "loss": 10.2841, "step": 990 }, { "epoch": 0.19363032434544744, "grad_norm": 11.744199752807617, "learning_rate": 1.8658143682976636e-05, "loss": 10.8832, "step": 991 }, { "epoch": 0.19382571316920671, "grad_norm": 10.985271453857422, "learning_rate": 1.8654975260912963e-05, "loss": 11.1635, "step": 992 }, { "epoch": 0.194021101992966, "grad_norm": 31.188379287719727, "learning_rate": 1.8651803372263e-05, "loss": 10.509, "step": 993 }, { "epoch": 0.1942164908167253, "grad_norm": 22.462879180908203, "learning_rate": 1.864862801829717e-05, "loss": 11.0881, "step": 994 }, { "epoch": 0.19441187964048456, "grad_norm": 7.340974807739258, "learning_rate": 1.8645449200287317e-05, "loss": 10.0884, "step": 995 }, { "epoch": 0.19460726846424384, "grad_norm": 27.147672653198242, "learning_rate": 1.8642266919506644e-05, "loss": 9.2816, "step": 996 }, { "epoch": 0.19480265728800314, "grad_norm": 12.003215789794922, "learning_rate": 1.8639081177229763e-05, "loss": 11.5175, "step": 997 }, { "epoch": 0.1949980461117624, "grad_norm": 8.73083782196045, "learning_rate": 1.8635891974732655e-05, "loss": 10.0267, "step": 998 }, { "epoch": 0.1951934349355217, "grad_norm": 8.434202194213867, "learning_rate": 1.8632699313292702e-05, "loss": 10.004, "step": 999 }, { "epoch": 0.19538882375928096, "grad_norm": 14.637370109558105, "learning_rate": 1.8629503194188652e-05, "loss": 9.7757, "step": 1000 }, { "epoch": 0.19558421258304026, "grad_norm": 10.80057430267334, "learning_rate": 1.8626303618700664e-05, "loss": 9.355, "step": 1001 }, { "epoch": 0.19577960140679954, "grad_norm": 9.21177864074707, "learning_rate": 1.862310058811026e-05, "loss": 10.9338, "step": 1002 }, { "epoch": 0.1959749902305588, "grad_norm": 7.865182876586914, "learning_rate": 1.8619894103700354e-05, "loss": 8.2683, "step": 1003 }, { "epoch": 0.19617037905431808, "grad_norm": 13.538627624511719, "learning_rate": 1.861668416675524e-05, "loss": 11.2537, "step": 1004 }, { "epoch": 0.19636576787807739, "grad_norm": 9.453564643859863, "learning_rate": 1.86134707785606e-05, "loss": 9.258, "step": 1005 }, { "epoch": 0.19656115670183666, "grad_norm": 8.552481651306152, "learning_rate": 1.86102539404035e-05, "loss": 10.1384, "step": 1006 }, { "epoch": 0.19675654552559593, "grad_norm": 27.59398651123047, "learning_rate": 1.860703365357238e-05, "loss": 10.9349, "step": 1007 }, { "epoch": 0.1969519343493552, "grad_norm": 9.849193572998047, "learning_rate": 1.8603809919357063e-05, "loss": 11.5077, "step": 1008 }, { "epoch": 0.1971473231731145, "grad_norm": 8.214253425598145, "learning_rate": 1.8600582739048752e-05, "loss": 9.8551, "step": 1009 }, { "epoch": 0.19734271199687378, "grad_norm": 14.345209121704102, "learning_rate": 1.859735211394004e-05, "loss": 8.9469, "step": 1010 }, { "epoch": 0.19753810082063306, "grad_norm": 24.646177291870117, "learning_rate": 1.859411804532489e-05, "loss": 11.3968, "step": 1011 }, { "epoch": 0.19773348964439233, "grad_norm": 12.425567626953125, "learning_rate": 1.8590880534498644e-05, "loss": 10.8658, "step": 1012 }, { "epoch": 0.19792887846815163, "grad_norm": 10.923027038574219, "learning_rate": 1.8587639582758032e-05, "loss": 10.8557, "step": 1013 }, { "epoch": 0.1981242672919109, "grad_norm": 8.746227264404297, "learning_rate": 1.858439519140115e-05, "loss": 9.8136, "step": 1014 }, { "epoch": 0.19831965611567018, "grad_norm": 9.470844268798828, "learning_rate": 1.858114736172748e-05, "loss": 9.5208, "step": 1015 }, { "epoch": 0.19851504493942945, "grad_norm": 10.059723854064941, "learning_rate": 1.8577896095037875e-05, "loss": 9.477, "step": 1016 }, { "epoch": 0.19871043376318875, "grad_norm": 11.267123222351074, "learning_rate": 1.8574641392634573e-05, "loss": 10.8301, "step": 1017 }, { "epoch": 0.19890582258694803, "grad_norm": 10.173998832702637, "learning_rate": 1.857138325582118e-05, "loss": 9.8479, "step": 1018 }, { "epoch": 0.1991012114107073, "grad_norm": 9.12119197845459, "learning_rate": 1.8568121685902683e-05, "loss": 10.9387, "step": 1019 }, { "epoch": 0.19929660023446658, "grad_norm": 15.177370071411133, "learning_rate": 1.856485668418544e-05, "loss": 9.1175, "step": 1020 }, { "epoch": 0.19949198905822588, "grad_norm": 10.732454299926758, "learning_rate": 1.856158825197719e-05, "loss": 10.0042, "step": 1021 }, { "epoch": 0.19968737788198515, "grad_norm": 7.855399131774902, "learning_rate": 1.8558316390587036e-05, "loss": 9.8825, "step": 1022 }, { "epoch": 0.19988276670574442, "grad_norm": 11.231322288513184, "learning_rate": 1.8555041101325466e-05, "loss": 10.0247, "step": 1023 }, { "epoch": 0.20007815552950373, "grad_norm": 10.086041450500488, "learning_rate": 1.855176238550433e-05, "loss": 9.5155, "step": 1024 }, { "epoch": 0.200273544353263, "grad_norm": 11.300803184509277, "learning_rate": 1.8548480244436855e-05, "loss": 10.328, "step": 1025 }, { "epoch": 0.20046893317702227, "grad_norm": 12.238363265991211, "learning_rate": 1.8545194679437646e-05, "loss": 10.4964, "step": 1026 }, { "epoch": 0.20066432200078155, "grad_norm": 10.1900053024292, "learning_rate": 1.8541905691822667e-05, "loss": 8.9627, "step": 1027 }, { "epoch": 0.20085971082454085, "grad_norm": 14.579061508178711, "learning_rate": 1.8538613282909263e-05, "loss": 10.9295, "step": 1028 }, { "epoch": 0.20105509964830012, "grad_norm": 11.488454818725586, "learning_rate": 1.853531745401615e-05, "loss": 9.7865, "step": 1029 }, { "epoch": 0.2012504884720594, "grad_norm": 11.466203689575195, "learning_rate": 1.8532018206463404e-05, "loss": 9.4745, "step": 1030 }, { "epoch": 0.20144587729581867, "grad_norm": 12.232128143310547, "learning_rate": 1.8528715541572475e-05, "loss": 10.396, "step": 1031 }, { "epoch": 0.20164126611957797, "grad_norm": 10.786626815795898, "learning_rate": 1.852540946066619e-05, "loss": 10.9871, "step": 1032 }, { "epoch": 0.20183665494333725, "grad_norm": 18.612485885620117, "learning_rate": 1.852209996506873e-05, "loss": 11.799, "step": 1033 }, { "epoch": 0.20203204376709652, "grad_norm": 12.24704360961914, "learning_rate": 1.851878705610565e-05, "loss": 10.1001, "step": 1034 }, { "epoch": 0.2022274325908558, "grad_norm": 10.10387134552002, "learning_rate": 1.8515470735103883e-05, "loss": 9.901, "step": 1035 }, { "epoch": 0.2024228214146151, "grad_norm": 18.21666145324707, "learning_rate": 1.8512151003391708e-05, "loss": 10.8576, "step": 1036 }, { "epoch": 0.20261821023837437, "grad_norm": 9.849905014038086, "learning_rate": 1.8508827862298783e-05, "loss": 10.298, "step": 1037 }, { "epoch": 0.20281359906213364, "grad_norm": 9.479056358337402, "learning_rate": 1.8505501313156135e-05, "loss": 10.3479, "step": 1038 }, { "epoch": 0.20300898788589292, "grad_norm": 75.90926361083984, "learning_rate": 1.8502171357296144e-05, "loss": 11.1973, "step": 1039 }, { "epoch": 0.20320437670965222, "grad_norm": 9.197609901428223, "learning_rate": 1.8498837996052563e-05, "loss": 9.2738, "step": 1040 }, { "epoch": 0.2033997655334115, "grad_norm": 13.858948707580566, "learning_rate": 1.8495501230760505e-05, "loss": 11.1317, "step": 1041 }, { "epoch": 0.20359515435717077, "grad_norm": 10.339825630187988, "learning_rate": 1.849216106275645e-05, "loss": 8.5552, "step": 1042 }, { "epoch": 0.20379054318093004, "grad_norm": 9.709257125854492, "learning_rate": 1.8488817493378238e-05, "loss": 10.1478, "step": 1043 }, { "epoch": 0.20398593200468934, "grad_norm": 9.045165061950684, "learning_rate": 1.8485470523965073e-05, "loss": 11.3759, "step": 1044 }, { "epoch": 0.20418132082844861, "grad_norm": 7.252753734588623, "learning_rate": 1.8482120155857517e-05, "loss": 10.4401, "step": 1045 }, { "epoch": 0.2043767096522079, "grad_norm": 7.973372936248779, "learning_rate": 1.8478766390397503e-05, "loss": 11.0516, "step": 1046 }, { "epoch": 0.20457209847596716, "grad_norm": 18.249378204345703, "learning_rate": 1.8475409228928314e-05, "loss": 10.5308, "step": 1047 }, { "epoch": 0.20476748729972646, "grad_norm": 10.62024211883545, "learning_rate": 1.8472048672794597e-05, "loss": 10.3611, "step": 1048 }, { "epoch": 0.20496287612348574, "grad_norm": 8.392167091369629, "learning_rate": 1.846868472334236e-05, "loss": 9.6377, "step": 1049 }, { "epoch": 0.205158264947245, "grad_norm": 12.937665939331055, "learning_rate": 1.8465317381918968e-05, "loss": 9.0667, "step": 1050 }, { "epoch": 0.20535365377100429, "grad_norm": 7.9081573486328125, "learning_rate": 1.8461946649873146e-05, "loss": 10.0356, "step": 1051 }, { "epoch": 0.2055490425947636, "grad_norm": 9.666654586791992, "learning_rate": 1.845857252855498e-05, "loss": 10.3346, "step": 1052 }, { "epoch": 0.20574443141852286, "grad_norm": 9.820070266723633, "learning_rate": 1.845519501931591e-05, "loss": 10.1557, "step": 1053 }, { "epoch": 0.20593982024228213, "grad_norm": 8.45248031616211, "learning_rate": 1.845181412350873e-05, "loss": 10.1601, "step": 1054 }, { "epoch": 0.20613520906604144, "grad_norm": 13.481633186340332, "learning_rate": 1.8448429842487593e-05, "loss": 10.1516, "step": 1055 }, { "epoch": 0.2063305978898007, "grad_norm": 9.514289855957031, "learning_rate": 1.844504217760801e-05, "loss": 9.6983, "step": 1056 }, { "epoch": 0.20652598671355998, "grad_norm": 8.149691581726074, "learning_rate": 1.844165113022685e-05, "loss": 10.0595, "step": 1057 }, { "epoch": 0.20672137553731926, "grad_norm": 44.551353454589844, "learning_rate": 1.8438256701702326e-05, "loss": 11.4602, "step": 1058 }, { "epoch": 0.20691676436107856, "grad_norm": 11.108083724975586, "learning_rate": 1.8434858893394015e-05, "loss": 9.0455, "step": 1059 }, { "epoch": 0.20711215318483783, "grad_norm": 10.105135917663574, "learning_rate": 1.843145770666285e-05, "loss": 9.4722, "step": 1060 }, { "epoch": 0.2073075420085971, "grad_norm": 14.592355728149414, "learning_rate": 1.8428053142871106e-05, "loss": 9.2339, "step": 1061 }, { "epoch": 0.20750293083235638, "grad_norm": 10.019697189331055, "learning_rate": 1.8424645203382415e-05, "loss": 9.3722, "step": 1062 }, { "epoch": 0.20769831965611568, "grad_norm": 8.274886131286621, "learning_rate": 1.8421233889561766e-05, "loss": 8.479, "step": 1063 }, { "epoch": 0.20789370847987496, "grad_norm": 9.821599006652832, "learning_rate": 1.8417819202775497e-05, "loss": 9.8521, "step": 1064 }, { "epoch": 0.20808909730363423, "grad_norm": 8.140969276428223, "learning_rate": 1.8414401144391288e-05, "loss": 8.7278, "step": 1065 }, { "epoch": 0.2082844861273935, "grad_norm": 10.504135131835938, "learning_rate": 1.841097971577819e-05, "loss": 9.6927, "step": 1066 }, { "epoch": 0.2084798749511528, "grad_norm": 10.516091346740723, "learning_rate": 1.8407554918306583e-05, "loss": 9.8247, "step": 1067 }, { "epoch": 0.20867526377491208, "grad_norm": 10.204903602600098, "learning_rate": 1.840412675334821e-05, "loss": 10.1618, "step": 1068 }, { "epoch": 0.20887065259867135, "grad_norm": 10.61201286315918, "learning_rate": 1.840069522227615e-05, "loss": 10.2239, "step": 1069 }, { "epoch": 0.20906604142243063, "grad_norm": 22.232080459594727, "learning_rate": 1.8397260326464843e-05, "loss": 10.6251, "step": 1070 }, { "epoch": 0.20926143024618993, "grad_norm": 11.604631423950195, "learning_rate": 1.839382206729007e-05, "loss": 10.1518, "step": 1071 }, { "epoch": 0.2094568190699492, "grad_norm": 12.9887113571167, "learning_rate": 1.8390380446128965e-05, "loss": 9.9893, "step": 1072 }, { "epoch": 0.20965220789370848, "grad_norm": 10.097661972045898, "learning_rate": 1.8386935464359997e-05, "loss": 10.3341, "step": 1073 }, { "epoch": 0.20984759671746775, "grad_norm": 9.482799530029297, "learning_rate": 1.8383487123362992e-05, "loss": 9.2421, "step": 1074 }, { "epoch": 0.21004298554122705, "grad_norm": 11.762171745300293, "learning_rate": 1.8380035424519116e-05, "loss": 9.2298, "step": 1075 }, { "epoch": 0.21023837436498632, "grad_norm": 17.297704696655273, "learning_rate": 1.8376580369210882e-05, "loss": 9.5459, "step": 1076 }, { "epoch": 0.2104337631887456, "grad_norm": 17.030080795288086, "learning_rate": 1.837312195882215e-05, "loss": 8.9085, "step": 1077 }, { "epoch": 0.21062915201250487, "grad_norm": 80.03797912597656, "learning_rate": 1.8369660194738118e-05, "loss": 10.4275, "step": 1078 }, { "epoch": 0.21082454083626417, "grad_norm": 10.00363826751709, "learning_rate": 1.8366195078345327e-05, "loss": 8.8138, "step": 1079 }, { "epoch": 0.21101992966002345, "grad_norm": 10.246661186218262, "learning_rate": 1.836272661103167e-05, "loss": 10.1692, "step": 1080 }, { "epoch": 0.21121531848378272, "grad_norm": 14.411211967468262, "learning_rate": 1.8359254794186368e-05, "loss": 10.7699, "step": 1081 }, { "epoch": 0.211410707307542, "grad_norm": 9.111126899719238, "learning_rate": 1.83557796292e-05, "loss": 8.5393, "step": 1082 }, { "epoch": 0.2116060961313013, "grad_norm": 43.198951721191406, "learning_rate": 1.8352301117464476e-05, "loss": 10.8663, "step": 1083 }, { "epoch": 0.21180148495506057, "grad_norm": 12.209263801574707, "learning_rate": 1.834881926037304e-05, "loss": 9.4677, "step": 1084 }, { "epoch": 0.21199687377881984, "grad_norm": 6.728679656982422, "learning_rate": 1.8345334059320294e-05, "loss": 9.0522, "step": 1085 }, { "epoch": 0.21219226260257915, "grad_norm": 9.289886474609375, "learning_rate": 1.8341845515702164e-05, "loss": 10.0314, "step": 1086 }, { "epoch": 0.21238765142633842, "grad_norm": 11.61264419555664, "learning_rate": 1.8338353630915918e-05, "loss": 11.3864, "step": 1087 }, { "epoch": 0.2125830402500977, "grad_norm": 10.6533842086792, "learning_rate": 1.8334858406360168e-05, "loss": 10.0839, "step": 1088 }, { "epoch": 0.21277842907385697, "grad_norm": 15.192487716674805, "learning_rate": 1.833135984343486e-05, "loss": 10.1023, "step": 1089 }, { "epoch": 0.21297381789761627, "grad_norm": 23.368717193603516, "learning_rate": 1.8327857943541273e-05, "loss": 10.8994, "step": 1090 }, { "epoch": 0.21316920672137554, "grad_norm": 12.789835929870605, "learning_rate": 1.8324352708082033e-05, "loss": 10.3748, "step": 1091 }, { "epoch": 0.21336459554513482, "grad_norm": 52.898136138916016, "learning_rate": 1.832084413846109e-05, "loss": 10.3177, "step": 1092 }, { "epoch": 0.2135599843688941, "grad_norm": 11.188353538513184, "learning_rate": 1.8317332236083738e-05, "loss": 10.2827, "step": 1093 }, { "epoch": 0.2137553731926534, "grad_norm": 10.416289329528809, "learning_rate": 1.8313817002356602e-05, "loss": 10.7478, "step": 1094 }, { "epoch": 0.21395076201641267, "grad_norm": 10.080784797668457, "learning_rate": 1.8310298438687645e-05, "loss": 10.2522, "step": 1095 }, { "epoch": 0.21414615084017194, "grad_norm": 9.515165328979492, "learning_rate": 1.8306776546486155e-05, "loss": 9.1797, "step": 1096 }, { "epoch": 0.2143415396639312, "grad_norm": 8.99358081817627, "learning_rate": 1.8303251327162763e-05, "loss": 9.541, "step": 1097 }, { "epoch": 0.21453692848769051, "grad_norm": 8.59024429321289, "learning_rate": 1.8299722782129428e-05, "loss": 9.7474, "step": 1098 }, { "epoch": 0.2147323173114498, "grad_norm": 11.45220947265625, "learning_rate": 1.8296190912799444e-05, "loss": 9.8606, "step": 1099 }, { "epoch": 0.21492770613520906, "grad_norm": 8.631582260131836, "learning_rate": 1.829265572058743e-05, "loss": 10.3564, "step": 1100 }, { "epoch": 0.21512309495896834, "grad_norm": 76.1949691772461, "learning_rate": 1.8289117206909347e-05, "loss": 10.4504, "step": 1101 }, { "epoch": 0.21531848378272764, "grad_norm": 20.741443634033203, "learning_rate": 1.8285575373182474e-05, "loss": 11.4318, "step": 1102 }, { "epoch": 0.2155138726064869, "grad_norm": 14.55085277557373, "learning_rate": 1.8282030220825424e-05, "loss": 9.5378, "step": 1103 }, { "epoch": 0.21570926143024619, "grad_norm": 8.860641479492188, "learning_rate": 1.8278481751258145e-05, "loss": 9.9277, "step": 1104 }, { "epoch": 0.21590465025400546, "grad_norm": 11.90178394317627, "learning_rate": 1.8274929965901905e-05, "loss": 10.8726, "step": 1105 }, { "epoch": 0.21610003907776476, "grad_norm": 10.75643539428711, "learning_rate": 1.827137486617931e-05, "loss": 10.498, "step": 1106 }, { "epoch": 0.21629542790152403, "grad_norm": 10.311312675476074, "learning_rate": 1.8267816453514283e-05, "loss": 9.9612, "step": 1107 }, { "epoch": 0.2164908167252833, "grad_norm": 8.801495552062988, "learning_rate": 1.8264254729332083e-05, "loss": 9.3258, "step": 1108 }, { "epoch": 0.21668620554904258, "grad_norm": 25.568994522094727, "learning_rate": 1.8260689695059285e-05, "loss": 10.9771, "step": 1109 }, { "epoch": 0.21688159437280188, "grad_norm": 11.811704635620117, "learning_rate": 1.8257121352123803e-05, "loss": 9.777, "step": 1110 }, { "epoch": 0.21707698319656116, "grad_norm": 10.467910766601562, "learning_rate": 1.8253549701954863e-05, "loss": 8.6843, "step": 1111 }, { "epoch": 0.21727237202032043, "grad_norm": 16.515195846557617, "learning_rate": 1.8249974745983023e-05, "loss": 10.9219, "step": 1112 }, { "epoch": 0.2174677608440797, "grad_norm": 9.21031665802002, "learning_rate": 1.8246396485640168e-05, "loss": 10.6956, "step": 1113 }, { "epoch": 0.217663149667839, "grad_norm": 11.476834297180176, "learning_rate": 1.82428149223595e-05, "loss": 10.8747, "step": 1114 }, { "epoch": 0.21785853849159828, "grad_norm": 10.673362731933594, "learning_rate": 1.8239230057575542e-05, "loss": 10.5956, "step": 1115 }, { "epoch": 0.21805392731535755, "grad_norm": 12.2440824508667, "learning_rate": 1.823564189272415e-05, "loss": 10.2206, "step": 1116 }, { "epoch": 0.21824931613911686, "grad_norm": 9.135146141052246, "learning_rate": 1.8232050429242495e-05, "loss": 10.1133, "step": 1117 }, { "epoch": 0.21844470496287613, "grad_norm": 8.728495597839355, "learning_rate": 1.8228455668569065e-05, "loss": 9.8545, "step": 1118 }, { "epoch": 0.2186400937866354, "grad_norm": 9.4779052734375, "learning_rate": 1.8224857612143675e-05, "loss": 10.7018, "step": 1119 }, { "epoch": 0.21883548261039468, "grad_norm": 18.96925163269043, "learning_rate": 1.822125626140746e-05, "loss": 10.017, "step": 1120 }, { "epoch": 0.21903087143415398, "grad_norm": 10.221548080444336, "learning_rate": 1.8217651617802873e-05, "loss": 11.3653, "step": 1121 }, { "epoch": 0.21922626025791325, "grad_norm": 13.457391738891602, "learning_rate": 1.8214043682773685e-05, "loss": 10.3343, "step": 1122 }, { "epoch": 0.21942164908167253, "grad_norm": 11.418845176696777, "learning_rate": 1.8210432457764982e-05, "loss": 9.7316, "step": 1123 }, { "epoch": 0.2196170379054318, "grad_norm": 8.411755561828613, "learning_rate": 1.8206817944223178e-05, "loss": 10.5238, "step": 1124 }, { "epoch": 0.2198124267291911, "grad_norm": 9.965346336364746, "learning_rate": 1.8203200143595993e-05, "loss": 10.655, "step": 1125 }, { "epoch": 0.22000781555295038, "grad_norm": 14.216445922851562, "learning_rate": 1.8199579057332472e-05, "loss": 9.4912, "step": 1126 }, { "epoch": 0.22020320437670965, "grad_norm": 18.472694396972656, "learning_rate": 1.8195954686882972e-05, "loss": 10.7054, "step": 1127 }, { "epoch": 0.22039859320046892, "grad_norm": 10.878667831420898, "learning_rate": 1.8192327033699168e-05, "loss": 10.9642, "step": 1128 }, { "epoch": 0.22059398202422822, "grad_norm": 9.860835075378418, "learning_rate": 1.8188696099234036e-05, "loss": 10.7126, "step": 1129 }, { "epoch": 0.2207893708479875, "grad_norm": 10.548471450805664, "learning_rate": 1.8185061884941893e-05, "loss": 10.1515, "step": 1130 }, { "epoch": 0.22098475967174677, "grad_norm": 18.488536834716797, "learning_rate": 1.8181424392278347e-05, "loss": 9.2223, "step": 1131 }, { "epoch": 0.22118014849550605, "grad_norm": 12.526021003723145, "learning_rate": 1.8177783622700328e-05, "loss": 10.6216, "step": 1132 }, { "epoch": 0.22137553731926535, "grad_norm": 9.050341606140137, "learning_rate": 1.8174139577666078e-05, "loss": 8.8958, "step": 1133 }, { "epoch": 0.22157092614302462, "grad_norm": 14.065995216369629, "learning_rate": 1.8170492258635147e-05, "loss": 10.3226, "step": 1134 }, { "epoch": 0.2217663149667839, "grad_norm": 8.310761451721191, "learning_rate": 1.8166841667068403e-05, "loss": 9.8635, "step": 1135 }, { "epoch": 0.22196170379054317, "grad_norm": 11.958816528320312, "learning_rate": 1.816318780442802e-05, "loss": 11.0582, "step": 1136 }, { "epoch": 0.22215709261430247, "grad_norm": 23.47361183166504, "learning_rate": 1.8159530672177486e-05, "loss": 10.164, "step": 1137 }, { "epoch": 0.22235248143806174, "grad_norm": 18.067411422729492, "learning_rate": 1.815587027178159e-05, "loss": 12.0752, "step": 1138 }, { "epoch": 0.22254787026182102, "grad_norm": 11.176780700683594, "learning_rate": 1.8152206604706445e-05, "loss": 10.1291, "step": 1139 }, { "epoch": 0.2227432590855803, "grad_norm": 12.644645690917969, "learning_rate": 1.8148539672419457e-05, "loss": 9.5169, "step": 1140 }, { "epoch": 0.2229386479093396, "grad_norm": 13.717794418334961, "learning_rate": 1.8144869476389342e-05, "loss": 10.9828, "step": 1141 }, { "epoch": 0.22313403673309887, "grad_norm": 8.076347351074219, "learning_rate": 1.8141196018086136e-05, "loss": 9.2707, "step": 1142 }, { "epoch": 0.22332942555685814, "grad_norm": 10.27889633178711, "learning_rate": 1.8137519298981172e-05, "loss": 11.1033, "step": 1143 }, { "epoch": 0.22352481438061741, "grad_norm": 10.209543228149414, "learning_rate": 1.8133839320547085e-05, "loss": 9.5142, "step": 1144 }, { "epoch": 0.22372020320437672, "grad_norm": 11.779301643371582, "learning_rate": 1.8130156084257824e-05, "loss": 9.8682, "step": 1145 }, { "epoch": 0.223915592028136, "grad_norm": 8.995259284973145, "learning_rate": 1.8126469591588643e-05, "loss": 9.0289, "step": 1146 }, { "epoch": 0.22411098085189526, "grad_norm": 8.813530921936035, "learning_rate": 1.8122779844016088e-05, "loss": 9.8038, "step": 1147 }, { "epoch": 0.22430636967565457, "grad_norm": 26.275314331054688, "learning_rate": 1.8119086843018024e-05, "loss": 10.352, "step": 1148 }, { "epoch": 0.22450175849941384, "grad_norm": 8.708663940429688, "learning_rate": 1.8115390590073612e-05, "loss": 9.6875, "step": 1149 }, { "epoch": 0.2246971473231731, "grad_norm": 7.769646167755127, "learning_rate": 1.8111691086663316e-05, "loss": 9.5119, "step": 1150 }, { "epoch": 0.2248925361469324, "grad_norm": 7.051853656768799, "learning_rate": 1.81079883342689e-05, "loss": 10.1524, "step": 1151 }, { "epoch": 0.2250879249706917, "grad_norm": 11.214014053344727, "learning_rate": 1.8104282334373428e-05, "loss": 10.8635, "step": 1152 }, { "epoch": 0.22528331379445096, "grad_norm": 8.857877731323242, "learning_rate": 1.8100573088461275e-05, "loss": 8.7015, "step": 1153 }, { "epoch": 0.22547870261821024, "grad_norm": 9.525921821594238, "learning_rate": 1.809686059801811e-05, "loss": 8.728, "step": 1154 }, { "epoch": 0.2256740914419695, "grad_norm": 9.54394817352295, "learning_rate": 1.809314486453089e-05, "loss": 9.559, "step": 1155 }, { "epoch": 0.2258694802657288, "grad_norm": 13.451454162597656, "learning_rate": 1.8089425889487894e-05, "loss": 10.4519, "step": 1156 }, { "epoch": 0.22606486908948809, "grad_norm": 67.2001724243164, "learning_rate": 1.808570367437868e-05, "loss": 10.6649, "step": 1157 }, { "epoch": 0.22626025791324736, "grad_norm": 8.727229118347168, "learning_rate": 1.8081978220694113e-05, "loss": 9.2873, "step": 1158 }, { "epoch": 0.22645564673700663, "grad_norm": 9.808565139770508, "learning_rate": 1.8078249529926353e-05, "loss": 10.2171, "step": 1159 }, { "epoch": 0.22665103556076593, "grad_norm": 9.564712524414062, "learning_rate": 1.8074517603568855e-05, "loss": 9.0959, "step": 1160 }, { "epoch": 0.2268464243845252, "grad_norm": 13.291114807128906, "learning_rate": 1.807078244311637e-05, "loss": 9.4776, "step": 1161 }, { "epoch": 0.22704181320828448, "grad_norm": 22.615671157836914, "learning_rate": 1.806704405006495e-05, "loss": 9.9774, "step": 1162 }, { "epoch": 0.22723720203204376, "grad_norm": 7.402246952056885, "learning_rate": 1.8063302425911936e-05, "loss": 9.3268, "step": 1163 }, { "epoch": 0.22743259085580306, "grad_norm": 24.33492660522461, "learning_rate": 1.805955757215596e-05, "loss": 9.1528, "step": 1164 }, { "epoch": 0.22762797967956233, "grad_norm": 8.760977745056152, "learning_rate": 1.805580949029696e-05, "loss": 10.8391, "step": 1165 }, { "epoch": 0.2278233685033216, "grad_norm": 11.081182479858398, "learning_rate": 1.8052058181836152e-05, "loss": 11.2433, "step": 1166 }, { "epoch": 0.22801875732708088, "grad_norm": 12.74425983428955, "learning_rate": 1.804830364827606e-05, "loss": 9.6439, "step": 1167 }, { "epoch": 0.22821414615084018, "grad_norm": 14.619860649108887, "learning_rate": 1.804454589112048e-05, "loss": 10.2618, "step": 1168 }, { "epoch": 0.22840953497459945, "grad_norm": 10.31884765625, "learning_rate": 1.8040784911874517e-05, "loss": 10.6417, "step": 1169 }, { "epoch": 0.22860492379835873, "grad_norm": 7.929812908172607, "learning_rate": 1.803702071204456e-05, "loss": 9.5073, "step": 1170 }, { "epoch": 0.228800312622118, "grad_norm": 8.551639556884766, "learning_rate": 1.803325329313829e-05, "loss": 10.1904, "step": 1171 }, { "epoch": 0.2289957014458773, "grad_norm": 9.057095527648926, "learning_rate": 1.8029482656664672e-05, "loss": 10.4331, "step": 1172 }, { "epoch": 0.22919109026963658, "grad_norm": 9.363543510437012, "learning_rate": 1.8025708804133962e-05, "loss": 9.9363, "step": 1173 }, { "epoch": 0.22938647909339585, "grad_norm": 10.03451156616211, "learning_rate": 1.802193173705771e-05, "loss": 10.2171, "step": 1174 }, { "epoch": 0.22958186791715515, "grad_norm": 9.010741233825684, "learning_rate": 1.8018151456948738e-05, "loss": 10.2098, "step": 1175 }, { "epoch": 0.22977725674091443, "grad_norm": 8.350069999694824, "learning_rate": 1.8014367965321178e-05, "loss": 9.5197, "step": 1176 }, { "epoch": 0.2299726455646737, "grad_norm": 9.755274772644043, "learning_rate": 1.8010581263690428e-05, "loss": 10.6485, "step": 1177 }, { "epoch": 0.23016803438843297, "grad_norm": 15.555088996887207, "learning_rate": 1.8006791353573186e-05, "loss": 9.9917, "step": 1178 }, { "epoch": 0.23036342321219228, "grad_norm": 13.880017280578613, "learning_rate": 1.8002998236487423e-05, "loss": 10.0376, "step": 1179 }, { "epoch": 0.23055881203595155, "grad_norm": 11.880322456359863, "learning_rate": 1.79992019139524e-05, "loss": 11.0284, "step": 1180 }, { "epoch": 0.23075420085971082, "grad_norm": 12.753463745117188, "learning_rate": 1.7995402387488665e-05, "loss": 11.3415, "step": 1181 }, { "epoch": 0.2309495896834701, "grad_norm": 10.973676681518555, "learning_rate": 1.7991599658618044e-05, "loss": 10.3221, "step": 1182 }, { "epoch": 0.2311449785072294, "grad_norm": 8.763169288635254, "learning_rate": 1.798779372886365e-05, "loss": 9.3201, "step": 1183 }, { "epoch": 0.23134036733098867, "grad_norm": 9.328156471252441, "learning_rate": 1.7983984599749876e-05, "loss": 9.4876, "step": 1184 }, { "epoch": 0.23153575615474795, "grad_norm": 14.645464897155762, "learning_rate": 1.7980172272802398e-05, "loss": 9.8452, "step": 1185 }, { "epoch": 0.23173114497850722, "grad_norm": 9.281057357788086, "learning_rate": 1.797635674954817e-05, "loss": 10.4572, "step": 1186 }, { "epoch": 0.23192653380226652, "grad_norm": 9.923699378967285, "learning_rate": 1.7972538031515418e-05, "loss": 8.9601, "step": 1187 }, { "epoch": 0.2321219226260258, "grad_norm": 11.165024757385254, "learning_rate": 1.796871612023367e-05, "loss": 9.1202, "step": 1188 }, { "epoch": 0.23231731144978507, "grad_norm": 16.26569938659668, "learning_rate": 1.796489101723372e-05, "loss": 10.9923, "step": 1189 }, { "epoch": 0.23251270027354434, "grad_norm": 13.922295570373535, "learning_rate": 1.7961062724047632e-05, "loss": 10.104, "step": 1190 }, { "epoch": 0.23270808909730364, "grad_norm": 10.160478591918945, "learning_rate": 1.7957231242208763e-05, "loss": 9.7135, "step": 1191 }, { "epoch": 0.23290347792106292, "grad_norm": 12.048813819885254, "learning_rate": 1.7953396573251738e-05, "loss": 9.2826, "step": 1192 }, { "epoch": 0.2330988667448222, "grad_norm": 10.418081283569336, "learning_rate": 1.794955871871246e-05, "loss": 9.1642, "step": 1193 }, { "epoch": 0.23329425556858147, "grad_norm": 12.009995460510254, "learning_rate": 1.7945717680128107e-05, "loss": 10.4241, "step": 1194 }, { "epoch": 0.23348964439234077, "grad_norm": 10.573904991149902, "learning_rate": 1.7941873459037144e-05, "loss": 10.4035, "step": 1195 }, { "epoch": 0.23368503321610004, "grad_norm": 8.724102973937988, "learning_rate": 1.7938026056979288e-05, "loss": 9.8846, "step": 1196 }, { "epoch": 0.23388042203985931, "grad_norm": 11.088909149169922, "learning_rate": 1.7934175475495552e-05, "loss": 11.0432, "step": 1197 }, { "epoch": 0.2340758108636186, "grad_norm": 11.072525978088379, "learning_rate": 1.793032171612821e-05, "loss": 9.5168, "step": 1198 }, { "epoch": 0.2342711996873779, "grad_norm": 14.48965072631836, "learning_rate": 1.7926464780420813e-05, "loss": 9.6476, "step": 1199 }, { "epoch": 0.23446658851113716, "grad_norm": 25.64013671875, "learning_rate": 1.792260466991818e-05, "loss": 11.0423, "step": 1200 }, { "epoch": 0.23466197733489644, "grad_norm": 15.797342300415039, "learning_rate": 1.791874138616641e-05, "loss": 10.7754, "step": 1201 }, { "epoch": 0.2348573661586557, "grad_norm": 9.64517593383789, "learning_rate": 1.791487493071287e-05, "loss": 9.4815, "step": 1202 }, { "epoch": 0.235052754982415, "grad_norm": 9.780092239379883, "learning_rate": 1.791100530510619e-05, "loss": 9.5045, "step": 1203 }, { "epoch": 0.2352481438061743, "grad_norm": 10.812311172485352, "learning_rate": 1.7907132510896275e-05, "loss": 9.8301, "step": 1204 }, { "epoch": 0.23544353262993356, "grad_norm": 22.109315872192383, "learning_rate": 1.7903256549634304e-05, "loss": 9.8362, "step": 1205 }, { "epoch": 0.23563892145369286, "grad_norm": 13.416193008422852, "learning_rate": 1.7899377422872716e-05, "loss": 9.0167, "step": 1206 }, { "epoch": 0.23583431027745214, "grad_norm": 10.781249046325684, "learning_rate": 1.789549513216522e-05, "loss": 9.0485, "step": 1207 }, { "epoch": 0.2360296991012114, "grad_norm": 16.785064697265625, "learning_rate": 1.78916096790668e-05, "loss": 10.2705, "step": 1208 }, { "epoch": 0.23622508792497068, "grad_norm": 16.255413055419922, "learning_rate": 1.7887721065133695e-05, "loss": 10.4822, "step": 1209 }, { "epoch": 0.23642047674872999, "grad_norm": 13.382121086120605, "learning_rate": 1.7883829291923423e-05, "loss": 10.0025, "step": 1210 }, { "epoch": 0.23661586557248926, "grad_norm": 10.204498291015625, "learning_rate": 1.787993436099475e-05, "loss": 9.5152, "step": 1211 }, { "epoch": 0.23681125439624853, "grad_norm": 7.8883562088012695, "learning_rate": 1.7876036273907726e-05, "loss": 10.241, "step": 1212 }, { "epoch": 0.2370066432200078, "grad_norm": 22.64422607421875, "learning_rate": 1.7872135032223648e-05, "loss": 11.2415, "step": 1213 }, { "epoch": 0.2372020320437671, "grad_norm": 9.7103853225708, "learning_rate": 1.786823063750509e-05, "loss": 10.1178, "step": 1214 }, { "epoch": 0.23739742086752638, "grad_norm": 11.876055717468262, "learning_rate": 1.786432309131588e-05, "loss": 10.7425, "step": 1215 }, { "epoch": 0.23759280969128566, "grad_norm": 21.693655014038086, "learning_rate": 1.7860412395221113e-05, "loss": 9.4291, "step": 1216 }, { "epoch": 0.23778819851504493, "grad_norm": 15.459426879882812, "learning_rate": 1.7856498550787144e-05, "loss": 10.1343, "step": 1217 }, { "epoch": 0.23798358733880423, "grad_norm": 8.189459800720215, "learning_rate": 1.7852581559581594e-05, "loss": 9.7895, "step": 1218 }, { "epoch": 0.2381789761625635, "grad_norm": 10.463160514831543, "learning_rate": 1.7848661423173328e-05, "loss": 10.3121, "step": 1219 }, { "epoch": 0.23837436498632278, "grad_norm": 13.797211647033691, "learning_rate": 1.7844738143132494e-05, "loss": 10.7717, "step": 1220 }, { "epoch": 0.23856975381008205, "grad_norm": 9.269835472106934, "learning_rate": 1.784081172103048e-05, "loss": 9.1177, "step": 1221 }, { "epoch": 0.23876514263384135, "grad_norm": 9.249794006347656, "learning_rate": 1.7836882158439945e-05, "loss": 9.1639, "step": 1222 }, { "epoch": 0.23896053145760063, "grad_norm": 8.747522354125977, "learning_rate": 1.7832949456934797e-05, "loss": 9.1398, "step": 1223 }, { "epoch": 0.2391559202813599, "grad_norm": 9.711735725402832, "learning_rate": 1.7829013618090208e-05, "loss": 9.5402, "step": 1224 }, { "epoch": 0.23935130910511918, "grad_norm": 12.270694732666016, "learning_rate": 1.78250746434826e-05, "loss": 9.57, "step": 1225 }, { "epoch": 0.23954669792887848, "grad_norm": 10.28706169128418, "learning_rate": 1.782113253468966e-05, "loss": 9.8925, "step": 1226 }, { "epoch": 0.23974208675263775, "grad_norm": 11.051925659179688, "learning_rate": 1.781718729329032e-05, "loss": 10.4482, "step": 1227 }, { "epoch": 0.23993747557639702, "grad_norm": 10.1842679977417, "learning_rate": 1.7813238920864772e-05, "loss": 10.0631, "step": 1228 }, { "epoch": 0.2401328644001563, "grad_norm": 12.414724349975586, "learning_rate": 1.7809287418994463e-05, "loss": 9.2048, "step": 1229 }, { "epoch": 0.2403282532239156, "grad_norm": 13.164472579956055, "learning_rate": 1.7805332789262096e-05, "loss": 9.9888, "step": 1230 }, { "epoch": 0.24052364204767487, "grad_norm": 10.315644264221191, "learning_rate": 1.7801375033251617e-05, "loss": 9.5832, "step": 1231 }, { "epoch": 0.24071903087143415, "grad_norm": 10.495373725891113, "learning_rate": 1.7797414152548233e-05, "loss": 9.2239, "step": 1232 }, { "epoch": 0.24091441969519342, "grad_norm": 9.411508560180664, "learning_rate": 1.7793450148738398e-05, "loss": 9.907, "step": 1233 }, { "epoch": 0.24110980851895272, "grad_norm": 11.17962646484375, "learning_rate": 1.7789483023409822e-05, "loss": 9.591, "step": 1234 }, { "epoch": 0.241305197342712, "grad_norm": 8.910758972167969, "learning_rate": 1.778551277815146e-05, "loss": 8.7132, "step": 1235 }, { "epoch": 0.24150058616647127, "grad_norm": 19.328405380249023, "learning_rate": 1.7781539414553516e-05, "loss": 9.9229, "step": 1236 }, { "epoch": 0.24169597499023057, "grad_norm": 10.288520812988281, "learning_rate": 1.7777562934207447e-05, "loss": 9.5067, "step": 1237 }, { "epoch": 0.24189136381398985, "grad_norm": 14.588144302368164, "learning_rate": 1.777358333870596e-05, "loss": 9.5383, "step": 1238 }, { "epoch": 0.24208675263774912, "grad_norm": 13.42604923248291, "learning_rate": 1.7769600629643005e-05, "loss": 10.9858, "step": 1239 }, { "epoch": 0.2422821414615084, "grad_norm": 10.372969627380371, "learning_rate": 1.7765614808613775e-05, "loss": 9.9459, "step": 1240 }, { "epoch": 0.2424775302852677, "grad_norm": 11.277698516845703, "learning_rate": 1.7761625877214725e-05, "loss": 9.0366, "step": 1241 }, { "epoch": 0.24267291910902697, "grad_norm": 10.354175567626953, "learning_rate": 1.7757633837043536e-05, "loss": 10.0163, "step": 1242 }, { "epoch": 0.24286830793278624, "grad_norm": 11.91468620300293, "learning_rate": 1.7753638689699152e-05, "loss": 9.5083, "step": 1243 }, { "epoch": 0.24306369675654552, "grad_norm": 10.422640800476074, "learning_rate": 1.7749640436781748e-05, "loss": 9.8963, "step": 1244 }, { "epoch": 0.24325908558030482, "grad_norm": 10.919748306274414, "learning_rate": 1.7745639079892752e-05, "loss": 10.2643, "step": 1245 }, { "epoch": 0.2434544744040641, "grad_norm": 11.70514965057373, "learning_rate": 1.774163462063483e-05, "loss": 10.4714, "step": 1246 }, { "epoch": 0.24364986322782337, "grad_norm": 11.730195045471191, "learning_rate": 1.773762706061189e-05, "loss": 10.9369, "step": 1247 }, { "epoch": 0.24384525205158264, "grad_norm": 10.884676933288574, "learning_rate": 1.773361640142909e-05, "loss": 9.495, "step": 1248 }, { "epoch": 0.24404064087534194, "grad_norm": 11.580656051635742, "learning_rate": 1.7729602644692818e-05, "loss": 11.6008, "step": 1249 }, { "epoch": 0.24423602969910121, "grad_norm": 10.542391777038574, "learning_rate": 1.7725585792010714e-05, "loss": 10.0219, "step": 1250 }, { "epoch": 0.2444314185228605, "grad_norm": 12.857534408569336, "learning_rate": 1.7721565844991643e-05, "loss": 10.5635, "step": 1251 }, { "epoch": 0.24462680734661976, "grad_norm": 10.392827987670898, "learning_rate": 1.7717542805245728e-05, "loss": 9.2779, "step": 1252 }, { "epoch": 0.24482219617037906, "grad_norm": 9.28751277923584, "learning_rate": 1.771351667438432e-05, "loss": 10.2842, "step": 1253 }, { "epoch": 0.24501758499413834, "grad_norm": 14.696070671081543, "learning_rate": 1.770948745402e-05, "loss": 10.1721, "step": 1254 }, { "epoch": 0.2452129738178976, "grad_norm": 10.734374046325684, "learning_rate": 1.7705455145766607e-05, "loss": 9.8766, "step": 1255 }, { "epoch": 0.24540836264165689, "grad_norm": 13.675962448120117, "learning_rate": 1.77014197512392e-05, "loss": 9.8617, "step": 1256 }, { "epoch": 0.2456037514654162, "grad_norm": 8.64175796508789, "learning_rate": 1.7697381272054083e-05, "loss": 10.7436, "step": 1257 }, { "epoch": 0.24579914028917546, "grad_norm": 15.539191246032715, "learning_rate": 1.769333970982879e-05, "loss": 8.8873, "step": 1258 }, { "epoch": 0.24599452911293473, "grad_norm": 13.888678550720215, "learning_rate": 1.7689295066182092e-05, "loss": 10.2032, "step": 1259 }, { "epoch": 0.246189917936694, "grad_norm": 9.026545524597168, "learning_rate": 1.7685247342733996e-05, "loss": 10.028, "step": 1260 }, { "epoch": 0.2463853067604533, "grad_norm": 7.562590599060059, "learning_rate": 1.768119654110574e-05, "loss": 8.2881, "step": 1261 }, { "epoch": 0.24658069558421258, "grad_norm": 8.620518684387207, "learning_rate": 1.7677142662919794e-05, "loss": 9.7138, "step": 1262 }, { "epoch": 0.24677608440797186, "grad_norm": 9.384614944458008, "learning_rate": 1.7673085709799865e-05, "loss": 9.9614, "step": 1263 }, { "epoch": 0.24697147323173113, "grad_norm": 9.137187004089355, "learning_rate": 1.766902568337089e-05, "loss": 10.3298, "step": 1264 }, { "epoch": 0.24716686205549043, "grad_norm": 13.199517250061035, "learning_rate": 1.7664962585259028e-05, "loss": 9.4269, "step": 1265 }, { "epoch": 0.2473622508792497, "grad_norm": 7.940667629241943, "learning_rate": 1.7660896417091686e-05, "loss": 9.6282, "step": 1266 }, { "epoch": 0.24755763970300898, "grad_norm": 11.642074584960938, "learning_rate": 1.7656827180497484e-05, "loss": 10.6028, "step": 1267 }, { "epoch": 0.24775302852676828, "grad_norm": 9.434550285339355, "learning_rate": 1.7652754877106275e-05, "loss": 9.8025, "step": 1268 }, { "epoch": 0.24794841735052756, "grad_norm": 11.158660888671875, "learning_rate": 1.764867950854915e-05, "loss": 10.3788, "step": 1269 }, { "epoch": 0.24814380617428683, "grad_norm": 10.036881446838379, "learning_rate": 1.764460107645842e-05, "loss": 9.859, "step": 1270 }, { "epoch": 0.2483391949980461, "grad_norm": 8.490141868591309, "learning_rate": 1.764051958246762e-05, "loss": 9.2948, "step": 1271 }, { "epoch": 0.2485345838218054, "grad_norm": 10.146963119506836, "learning_rate": 1.763643502821152e-05, "loss": 9.7118, "step": 1272 }, { "epoch": 0.24872997264556468, "grad_norm": 7.648308277130127, "learning_rate": 1.7632347415326105e-05, "loss": 8.9883, "step": 1273 }, { "epoch": 0.24892536146932395, "grad_norm": 13.093494415283203, "learning_rate": 1.762825674544859e-05, "loss": 10.2064, "step": 1274 }, { "epoch": 0.24912075029308323, "grad_norm": 8.967086791992188, "learning_rate": 1.7624163020217418e-05, "loss": 9.5987, "step": 1275 }, { "epoch": 0.24931613911684253, "grad_norm": 8.7538480758667, "learning_rate": 1.7620066241272257e-05, "loss": 9.1515, "step": 1276 }, { "epoch": 0.2495115279406018, "grad_norm": 8.5569429397583, "learning_rate": 1.761596641025399e-05, "loss": 10.0197, "step": 1277 }, { "epoch": 0.24970691676436108, "grad_norm": 10.225385665893555, "learning_rate": 1.7611863528804728e-05, "loss": 9.0034, "step": 1278 }, { "epoch": 0.24990230558812035, "grad_norm": 9.1556978225708, "learning_rate": 1.76077575985678e-05, "loss": 8.9998, "step": 1279 }, { "epoch": 0.2500976944118796, "grad_norm": 8.018303871154785, "learning_rate": 1.760364862118776e-05, "loss": 9.8576, "step": 1280 }, { "epoch": 0.2502930832356389, "grad_norm": 13.046024322509766, "learning_rate": 1.7599536598310384e-05, "loss": 10.976, "step": 1281 }, { "epoch": 0.2504884720593982, "grad_norm": 11.197158813476562, "learning_rate": 1.759542153158266e-05, "loss": 10.0729, "step": 1282 }, { "epoch": 0.25068386088315747, "grad_norm": 8.969386100769043, "learning_rate": 1.7591303422652802e-05, "loss": 9.7935, "step": 1283 }, { "epoch": 0.2508792497069168, "grad_norm": 8.279151916503906, "learning_rate": 1.7587182273170244e-05, "loss": 9.8509, "step": 1284 }, { "epoch": 0.251074638530676, "grad_norm": 8.590933799743652, "learning_rate": 1.7583058084785626e-05, "loss": 8.6509, "step": 1285 }, { "epoch": 0.2512700273544353, "grad_norm": 8.488905906677246, "learning_rate": 1.757893085915082e-05, "loss": 8.9383, "step": 1286 }, { "epoch": 0.2514654161781946, "grad_norm": 19.685009002685547, "learning_rate": 1.7574800597918907e-05, "loss": 9.711, "step": 1287 }, { "epoch": 0.25166080500195387, "grad_norm": 17.11168098449707, "learning_rate": 1.7570667302744186e-05, "loss": 10.0746, "step": 1288 }, { "epoch": 0.25185619382571317, "grad_norm": 9.783052444458008, "learning_rate": 1.7566530975282166e-05, "loss": 10.2021, "step": 1289 }, { "epoch": 0.25205158264947247, "grad_norm": 7.9754767417907715, "learning_rate": 1.7562391617189576e-05, "loss": 9.676, "step": 1290 }, { "epoch": 0.2522469714732317, "grad_norm": 8.846024513244629, "learning_rate": 1.755824923012436e-05, "loss": 9.3899, "step": 1291 }, { "epoch": 0.252442360296991, "grad_norm": 13.300436019897461, "learning_rate": 1.755410381574567e-05, "loss": 10.2287, "step": 1292 }, { "epoch": 0.25263774912075027, "grad_norm": 10.40530776977539, "learning_rate": 1.7549955375713874e-05, "loss": 9.9376, "step": 1293 }, { "epoch": 0.25283313794450957, "grad_norm": 8.713654518127441, "learning_rate": 1.7545803911690552e-05, "loss": 8.6172, "step": 1294 }, { "epoch": 0.25302852676826887, "grad_norm": 8.503681182861328, "learning_rate": 1.754164942533849e-05, "loss": 10.3257, "step": 1295 }, { "epoch": 0.2532239155920281, "grad_norm": 9.206268310546875, "learning_rate": 1.7537491918321695e-05, "loss": 9.8505, "step": 1296 }, { "epoch": 0.2534193044157874, "grad_norm": 10.204298973083496, "learning_rate": 1.7533331392305375e-05, "loss": 10.1417, "step": 1297 }, { "epoch": 0.2536146932395467, "grad_norm": 43.227294921875, "learning_rate": 1.7529167848955947e-05, "loss": 11.0423, "step": 1298 }, { "epoch": 0.25381008206330596, "grad_norm": 8.194405555725098, "learning_rate": 1.7525001289941043e-05, "loss": 9.587, "step": 1299 }, { "epoch": 0.25400547088706527, "grad_norm": 8.818645477294922, "learning_rate": 1.75208317169295e-05, "loss": 9.5864, "step": 1300 }, { "epoch": 0.25420085971082457, "grad_norm": 13.115413665771484, "learning_rate": 1.7516659131591358e-05, "loss": 9.8418, "step": 1301 }, { "epoch": 0.2543962485345838, "grad_norm": 11.651979446411133, "learning_rate": 1.7512483535597868e-05, "loss": 10.203, "step": 1302 }, { "epoch": 0.2545916373583431, "grad_norm": 7.130728244781494, "learning_rate": 1.7508304930621486e-05, "loss": 8.2777, "step": 1303 }, { "epoch": 0.25478702618210236, "grad_norm": 9.03930377960205, "learning_rate": 1.7504123318335876e-05, "loss": 10.2767, "step": 1304 }, { "epoch": 0.25498241500586166, "grad_norm": 10.161637306213379, "learning_rate": 1.74999387004159e-05, "loss": 10.3646, "step": 1305 }, { "epoch": 0.25517780382962096, "grad_norm": 9.122706413269043, "learning_rate": 1.749575107853763e-05, "loss": 10.1747, "step": 1306 }, { "epoch": 0.2553731926533802, "grad_norm": 7.738844394683838, "learning_rate": 1.7491560454378333e-05, "loss": 9.664, "step": 1307 }, { "epoch": 0.2555685814771395, "grad_norm": 10.311731338500977, "learning_rate": 1.748736682961649e-05, "loss": 10.4507, "step": 1308 }, { "epoch": 0.2557639703008988, "grad_norm": 10.760847091674805, "learning_rate": 1.7483170205931775e-05, "loss": 9.6038, "step": 1309 }, { "epoch": 0.25595935912465806, "grad_norm": 15.414864540100098, "learning_rate": 1.7478970585005068e-05, "loss": 10.4599, "step": 1310 }, { "epoch": 0.25615474794841736, "grad_norm": 10.18409538269043, "learning_rate": 1.747476796851845e-05, "loss": 9.9979, "step": 1311 }, { "epoch": 0.2563501367721766, "grad_norm": 8.863759994506836, "learning_rate": 1.747056235815519e-05, "loss": 9.6182, "step": 1312 }, { "epoch": 0.2565455255959359, "grad_norm": 41.86937713623047, "learning_rate": 1.7466353755599773e-05, "loss": 9.6356, "step": 1313 }, { "epoch": 0.2567409144196952, "grad_norm": 8.577277183532715, "learning_rate": 1.7462142162537876e-05, "loss": 9.5527, "step": 1314 }, { "epoch": 0.25693630324345446, "grad_norm": 10.327160835266113, "learning_rate": 1.7457927580656367e-05, "loss": 9.6522, "step": 1315 }, { "epoch": 0.25713169206721376, "grad_norm": 18.54293441772461, "learning_rate": 1.7453710011643318e-05, "loss": 9.6847, "step": 1316 }, { "epoch": 0.25732708089097306, "grad_norm": 8.194198608398438, "learning_rate": 1.7449489457188005e-05, "loss": 9.6983, "step": 1317 }, { "epoch": 0.2575224697147323, "grad_norm": 10.148292541503906, "learning_rate": 1.744526591898088e-05, "loss": 9.8361, "step": 1318 }, { "epoch": 0.2577178585384916, "grad_norm": 9.461729049682617, "learning_rate": 1.744103939871361e-05, "loss": 9.5904, "step": 1319 }, { "epoch": 0.25791324736225085, "grad_norm": 8.193283081054688, "learning_rate": 1.743680989807904e-05, "loss": 9.0122, "step": 1320 }, { "epoch": 0.25810863618601015, "grad_norm": 11.62541675567627, "learning_rate": 1.743257741877122e-05, "loss": 10.3077, "step": 1321 }, { "epoch": 0.25830402500976946, "grad_norm": 10.485865592956543, "learning_rate": 1.7428341962485387e-05, "loss": 9.9068, "step": 1322 }, { "epoch": 0.2584994138335287, "grad_norm": 27.449573516845703, "learning_rate": 1.7424103530917978e-05, "loss": 9.5062, "step": 1323 }, { "epoch": 0.258694802657288, "grad_norm": 10.768765449523926, "learning_rate": 1.741986212576661e-05, "loss": 9.3951, "step": 1324 }, { "epoch": 0.2588901914810473, "grad_norm": 8.962034225463867, "learning_rate": 1.74156177487301e-05, "loss": 9.4387, "step": 1325 }, { "epoch": 0.25908558030480655, "grad_norm": 8.598007202148438, "learning_rate": 1.741137040150846e-05, "loss": 9.9422, "step": 1326 }, { "epoch": 0.25928096912856585, "grad_norm": 11.843433380126953, "learning_rate": 1.740712008580287e-05, "loss": 10.5862, "step": 1327 }, { "epoch": 0.25947635795232515, "grad_norm": 8.559103012084961, "learning_rate": 1.7402866803315726e-05, "loss": 9.9117, "step": 1328 }, { "epoch": 0.2596717467760844, "grad_norm": 7.808802127838135, "learning_rate": 1.7398610555750596e-05, "loss": 8.8458, "step": 1329 }, { "epoch": 0.2598671355998437, "grad_norm": 7.943463325500488, "learning_rate": 1.7394351344812236e-05, "loss": 9.1846, "step": 1330 }, { "epoch": 0.26006252442360295, "grad_norm": 8.736802101135254, "learning_rate": 1.7390089172206594e-05, "loss": 9.6133, "step": 1331 }, { "epoch": 0.26025791324736225, "grad_norm": 10.777621269226074, "learning_rate": 1.73858240396408e-05, "loss": 9.2166, "step": 1332 }, { "epoch": 0.26045330207112155, "grad_norm": 12.80964469909668, "learning_rate": 1.738155594882318e-05, "loss": 10.6826, "step": 1333 }, { "epoch": 0.2606486908948808, "grad_norm": 11.080780982971191, "learning_rate": 1.737728490146323e-05, "loss": 10.1382, "step": 1334 }, { "epoch": 0.2608440797186401, "grad_norm": 9.645936012268066, "learning_rate": 1.7373010899271636e-05, "loss": 10.6446, "step": 1335 }, { "epoch": 0.2610394685423994, "grad_norm": 13.024704933166504, "learning_rate": 1.7368733943960278e-05, "loss": 10.8037, "step": 1336 }, { "epoch": 0.26123485736615865, "grad_norm": 9.12859058380127, "learning_rate": 1.7364454037242196e-05, "loss": 9.665, "step": 1337 }, { "epoch": 0.26143024618991795, "grad_norm": 12.750452041625977, "learning_rate": 1.736017118083163e-05, "loss": 10.8155, "step": 1338 }, { "epoch": 0.2616256350136772, "grad_norm": 22.314653396606445, "learning_rate": 1.7355885376444007e-05, "loss": 10.0157, "step": 1339 }, { "epoch": 0.2618210238374365, "grad_norm": 9.28205680847168, "learning_rate": 1.7351596625795914e-05, "loss": 10.2323, "step": 1340 }, { "epoch": 0.2620164126611958, "grad_norm": 9.391312599182129, "learning_rate": 1.7347304930605135e-05, "loss": 8.8491, "step": 1341 }, { "epoch": 0.26221180148495504, "grad_norm": 20.81572723388672, "learning_rate": 1.734301029259062e-05, "loss": 10.0474, "step": 1342 }, { "epoch": 0.26240719030871434, "grad_norm": 8.251876831054688, "learning_rate": 1.7338712713472508e-05, "loss": 10.0025, "step": 1343 }, { "epoch": 0.26260257913247365, "grad_norm": 9.096527099609375, "learning_rate": 1.733441219497211e-05, "loss": 10.1042, "step": 1344 }, { "epoch": 0.2627979679562329, "grad_norm": 9.74231243133545, "learning_rate": 1.733010873881193e-05, "loss": 9.763, "step": 1345 }, { "epoch": 0.2629933567799922, "grad_norm": 7.231594562530518, "learning_rate": 1.7325802346715622e-05, "loss": 8.3591, "step": 1346 }, { "epoch": 0.26318874560375144, "grad_norm": 7.799582481384277, "learning_rate": 1.7321493020408033e-05, "loss": 9.7482, "step": 1347 }, { "epoch": 0.26338413442751074, "grad_norm": 14.953970909118652, "learning_rate": 1.7317180761615183e-05, "loss": 9.7535, "step": 1348 }, { "epoch": 0.26357952325127004, "grad_norm": 10.339173316955566, "learning_rate": 1.731286557206427e-05, "loss": 9.9535, "step": 1349 }, { "epoch": 0.2637749120750293, "grad_norm": 16.204267501831055, "learning_rate": 1.7308547453483653e-05, "loss": 9.771, "step": 1350 }, { "epoch": 0.2639703008987886, "grad_norm": 8.685306549072266, "learning_rate": 1.730422640760288e-05, "loss": 9.1089, "step": 1351 }, { "epoch": 0.2641656897225479, "grad_norm": 11.356358528137207, "learning_rate": 1.729990243615266e-05, "loss": 9.6862, "step": 1352 }, { "epoch": 0.26436107854630714, "grad_norm": 8.473872184753418, "learning_rate": 1.7295575540864878e-05, "loss": 9.07, "step": 1353 }, { "epoch": 0.26455646737006644, "grad_norm": 16.833782196044922, "learning_rate": 1.729124572347259e-05, "loss": 10.145, "step": 1354 }, { "epoch": 0.26475185619382574, "grad_norm": 25.658811569213867, "learning_rate": 1.7286912985710023e-05, "loss": 9.8756, "step": 1355 }, { "epoch": 0.264947245017585, "grad_norm": 8.838557243347168, "learning_rate": 1.7282577329312573e-05, "loss": 9.8892, "step": 1356 }, { "epoch": 0.2651426338413443, "grad_norm": 9.928974151611328, "learning_rate": 1.7278238756016805e-05, "loss": 9.5268, "step": 1357 }, { "epoch": 0.26533802266510353, "grad_norm": 8.144798278808594, "learning_rate": 1.7273897267560448e-05, "loss": 9.2465, "step": 1358 }, { "epoch": 0.26553341148886284, "grad_norm": 10.5440092086792, "learning_rate": 1.7269552865682414e-05, "loss": 9.5009, "step": 1359 }, { "epoch": 0.26572880031262214, "grad_norm": 9.936861991882324, "learning_rate": 1.726520555212276e-05, "loss": 10.153, "step": 1360 }, { "epoch": 0.2659241891363814, "grad_norm": 9.90738296508789, "learning_rate": 1.7260855328622725e-05, "loss": 10.5246, "step": 1361 }, { "epoch": 0.2661195779601407, "grad_norm": 9.371439933776855, "learning_rate": 1.7256502196924707e-05, "loss": 9.0179, "step": 1362 }, { "epoch": 0.2663149667839, "grad_norm": 9.719748497009277, "learning_rate": 1.725214615877227e-05, "loss": 10.1903, "step": 1363 }, { "epoch": 0.26651035560765923, "grad_norm": 11.304025650024414, "learning_rate": 1.7247787215910144e-05, "loss": 10.3357, "step": 1364 }, { "epoch": 0.26670574443141853, "grad_norm": 9.024648666381836, "learning_rate": 1.724342537008422e-05, "loss": 9.9289, "step": 1365 }, { "epoch": 0.2669011332551778, "grad_norm": 8.572917938232422, "learning_rate": 1.7239060623041552e-05, "loss": 8.4542, "step": 1366 }, { "epoch": 0.2670965220789371, "grad_norm": 9.70483112335205, "learning_rate": 1.7234692976530357e-05, "loss": 9.5483, "step": 1367 }, { "epoch": 0.2672919109026964, "grad_norm": 9.806533813476562, "learning_rate": 1.723032243230002e-05, "loss": 9.6311, "step": 1368 }, { "epoch": 0.26748729972645563, "grad_norm": 18.847232818603516, "learning_rate": 1.7225948992101067e-05, "loss": 9.3228, "step": 1369 }, { "epoch": 0.26768268855021493, "grad_norm": 12.063652038574219, "learning_rate": 1.7221572657685205e-05, "loss": 8.5508, "step": 1370 }, { "epoch": 0.26787807737397423, "grad_norm": 12.360920906066895, "learning_rate": 1.7217193430805286e-05, "loss": 9.7002, "step": 1371 }, { "epoch": 0.2680734661977335, "grad_norm": 9.31651782989502, "learning_rate": 1.7212811313215337e-05, "loss": 8.3971, "step": 1372 }, { "epoch": 0.2682688550214928, "grad_norm": 10.297537803649902, "learning_rate": 1.7208426306670522e-05, "loss": 9.3896, "step": 1373 }, { "epoch": 0.268464243845252, "grad_norm": 9.430858612060547, "learning_rate": 1.7204038412927177e-05, "loss": 9.7238, "step": 1374 }, { "epoch": 0.26865963266901133, "grad_norm": 9.866272926330566, "learning_rate": 1.7199647633742783e-05, "loss": 9.8938, "step": 1375 }, { "epoch": 0.26885502149277063, "grad_norm": 13.442451477050781, "learning_rate": 1.7195253970875994e-05, "loss": 9.7914, "step": 1376 }, { "epoch": 0.2690504103165299, "grad_norm": 8.995190620422363, "learning_rate": 1.71908574260866e-05, "loss": 9.7674, "step": 1377 }, { "epoch": 0.2692457991402892, "grad_norm": 10.069921493530273, "learning_rate": 1.7186458001135557e-05, "loss": 10.0635, "step": 1378 }, { "epoch": 0.2694411879640485, "grad_norm": 7.587112903594971, "learning_rate": 1.7182055697784972e-05, "loss": 8.6594, "step": 1379 }, { "epoch": 0.2696365767878077, "grad_norm": 9.20854663848877, "learning_rate": 1.71776505177981e-05, "loss": 9.5423, "step": 1380 }, { "epoch": 0.269831965611567, "grad_norm": 9.881065368652344, "learning_rate": 1.717324246293936e-05, "loss": 10.2872, "step": 1381 }, { "epoch": 0.27002735443532627, "grad_norm": 9.00755500793457, "learning_rate": 1.7168831534974306e-05, "loss": 9.0091, "step": 1382 }, { "epoch": 0.2702227432590856, "grad_norm": 31.67610740661621, "learning_rate": 1.7164417735669657e-05, "loss": 11.0973, "step": 1383 }, { "epoch": 0.2704181320828449, "grad_norm": 18.58461570739746, "learning_rate": 1.7160001066793275e-05, "loss": 8.8906, "step": 1384 }, { "epoch": 0.2706135209066041, "grad_norm": 8.737338066101074, "learning_rate": 1.7155581530114174e-05, "loss": 9.6321, "step": 1385 }, { "epoch": 0.2708089097303634, "grad_norm": 8.752185821533203, "learning_rate": 1.7151159127402518e-05, "loss": 9.5535, "step": 1386 }, { "epoch": 0.2710042985541227, "grad_norm": 14.492600440979004, "learning_rate": 1.7146733860429614e-05, "loss": 9.779, "step": 1387 }, { "epoch": 0.27119968737788197, "grad_norm": 7.780018329620361, "learning_rate": 1.7142305730967913e-05, "loss": 10.2408, "step": 1388 }, { "epoch": 0.27139507620164127, "grad_norm": 9.412854194641113, "learning_rate": 1.7137874740791027e-05, "loss": 10.4512, "step": 1389 }, { "epoch": 0.2715904650254006, "grad_norm": 10.296211242675781, "learning_rate": 1.71334408916737e-05, "loss": 11.2377, "step": 1390 }, { "epoch": 0.2717858538491598, "grad_norm": 14.402783393859863, "learning_rate": 1.712900418539183e-05, "loss": 10.3899, "step": 1391 }, { "epoch": 0.2719812426729191, "grad_norm": 11.80636215209961, "learning_rate": 1.7124564623722455e-05, "loss": 9.8263, "step": 1392 }, { "epoch": 0.27217663149667837, "grad_norm": 7.489226341247559, "learning_rate": 1.7120122208443755e-05, "loss": 10.4005, "step": 1393 }, { "epoch": 0.27237202032043767, "grad_norm": 9.502429008483887, "learning_rate": 1.7115676941335057e-05, "loss": 10.4095, "step": 1394 }, { "epoch": 0.27256740914419697, "grad_norm": 12.124082565307617, "learning_rate": 1.7111228824176827e-05, "loss": 10.1853, "step": 1395 }, { "epoch": 0.2727627979679562, "grad_norm": 11.134305000305176, "learning_rate": 1.7106777858750673e-05, "loss": 9.9589, "step": 1396 }, { "epoch": 0.2729581867917155, "grad_norm": 10.44892692565918, "learning_rate": 1.7102324046839346e-05, "loss": 9.6726, "step": 1397 }, { "epoch": 0.2731535756154748, "grad_norm": 14.50195598602295, "learning_rate": 1.709786739022674e-05, "loss": 10.07, "step": 1398 }, { "epoch": 0.27334896443923407, "grad_norm": 13.216436386108398, "learning_rate": 1.709340789069788e-05, "loss": 8.6138, "step": 1399 }, { "epoch": 0.27354435326299337, "grad_norm": 10.18933391571045, "learning_rate": 1.7088945550038935e-05, "loss": 9.2372, "step": 1400 }, { "epoch": 0.2737397420867526, "grad_norm": 10.000385284423828, "learning_rate": 1.708448037003721e-05, "loss": 9.677, "step": 1401 }, { "epoch": 0.2739351309105119, "grad_norm": 11.34453296661377, "learning_rate": 1.708001235248115e-05, "loss": 9.6858, "step": 1402 }, { "epoch": 0.2741305197342712, "grad_norm": 9.784234046936035, "learning_rate": 1.7075541499160336e-05, "loss": 10.3645, "step": 1403 }, { "epoch": 0.27432590855803046, "grad_norm": 8.841297149658203, "learning_rate": 1.7071067811865477e-05, "loss": 9.9277, "step": 1404 }, { "epoch": 0.27452129738178976, "grad_norm": 8.425395011901855, "learning_rate": 1.706659129238843e-05, "loss": 9.4929, "step": 1405 }, { "epoch": 0.27471668620554907, "grad_norm": 8.213043212890625, "learning_rate": 1.7062111942522177e-05, "loss": 9.5276, "step": 1406 }, { "epoch": 0.2749120750293083, "grad_norm": 12.275148391723633, "learning_rate": 1.7057629764060842e-05, "loss": 9.3295, "step": 1407 }, { "epoch": 0.2751074638530676, "grad_norm": 10.934953689575195, "learning_rate": 1.7053144758799668e-05, "loss": 8.5463, "step": 1408 }, { "epoch": 0.27530285267682686, "grad_norm": 7.493126392364502, "learning_rate": 1.7048656928535045e-05, "loss": 9.9411, "step": 1409 }, { "epoch": 0.27549824150058616, "grad_norm": 10.218108177185059, "learning_rate": 1.704416627506449e-05, "loss": 10.0827, "step": 1410 }, { "epoch": 0.27569363032434546, "grad_norm": 10.551949501037598, "learning_rate": 1.703967280018664e-05, "loss": 10.771, "step": 1411 }, { "epoch": 0.2758890191481047, "grad_norm": 10.218538284301758, "learning_rate": 1.703517650570128e-05, "loss": 10.8535, "step": 1412 }, { "epoch": 0.276084407971864, "grad_norm": 10.639214515686035, "learning_rate": 1.703067739340931e-05, "loss": 9.3686, "step": 1413 }, { "epoch": 0.2762797967956233, "grad_norm": 10.4620361328125, "learning_rate": 1.702617546511277e-05, "loss": 9.0547, "step": 1414 }, { "epoch": 0.27647518561938256, "grad_norm": 9.750692367553711, "learning_rate": 1.7021670722614817e-05, "loss": 9.6319, "step": 1415 }, { "epoch": 0.27667057444314186, "grad_norm": 9.28245735168457, "learning_rate": 1.7017163167719743e-05, "loss": 9.6182, "step": 1416 }, { "epoch": 0.27686596326690116, "grad_norm": 14.629377365112305, "learning_rate": 1.701265280223296e-05, "loss": 9.664, "step": 1417 }, { "epoch": 0.2770613520906604, "grad_norm": 10.068516731262207, "learning_rate": 1.7008139627961012e-05, "loss": 9.9891, "step": 1418 }, { "epoch": 0.2772567409144197, "grad_norm": 13.83179759979248, "learning_rate": 1.700362364671157e-05, "loss": 10.4564, "step": 1419 }, { "epoch": 0.27745212973817895, "grad_norm": 10.409890174865723, "learning_rate": 1.699910486029342e-05, "loss": 9.4946, "step": 1420 }, { "epoch": 0.27764751856193826, "grad_norm": 9.16124439239502, "learning_rate": 1.699458327051647e-05, "loss": 9.3257, "step": 1421 }, { "epoch": 0.27784290738569756, "grad_norm": 7.58655309677124, "learning_rate": 1.699005887919177e-05, "loss": 9.4247, "step": 1422 }, { "epoch": 0.2780382962094568, "grad_norm": 8.163040161132812, "learning_rate": 1.698553168813147e-05, "loss": 10.097, "step": 1423 }, { "epoch": 0.2782336850332161, "grad_norm": 11.548359870910645, "learning_rate": 1.6981001699148853e-05, "loss": 8.9478, "step": 1424 }, { "epoch": 0.2784290738569754, "grad_norm": 16.783096313476562, "learning_rate": 1.697646891405832e-05, "loss": 10.8209, "step": 1425 }, { "epoch": 0.27862446268073465, "grad_norm": 12.144583702087402, "learning_rate": 1.6971933334675395e-05, "loss": 9.2343, "step": 1426 }, { "epoch": 0.27881985150449395, "grad_norm": 13.720064163208008, "learning_rate": 1.6967394962816713e-05, "loss": 9.8624, "step": 1427 }, { "epoch": 0.2790152403282532, "grad_norm": 10.508441925048828, "learning_rate": 1.696285380030004e-05, "loss": 9.7736, "step": 1428 }, { "epoch": 0.2792106291520125, "grad_norm": 10.424206733703613, "learning_rate": 1.6958309848944244e-05, "loss": 10.8513, "step": 1429 }, { "epoch": 0.2794060179757718, "grad_norm": 10.918111801147461, "learning_rate": 1.6953763110569325e-05, "loss": 9.6206, "step": 1430 }, { "epoch": 0.27960140679953105, "grad_norm": 7.589666366577148, "learning_rate": 1.694921358699639e-05, "loss": 10.1118, "step": 1431 }, { "epoch": 0.27979679562329035, "grad_norm": 10.684234619140625, "learning_rate": 1.6944661280047667e-05, "loss": 9.1125, "step": 1432 }, { "epoch": 0.27999218444704965, "grad_norm": 8.044968605041504, "learning_rate": 1.6940106191546493e-05, "loss": 9.7331, "step": 1433 }, { "epoch": 0.2801875732708089, "grad_norm": 7.673303604125977, "learning_rate": 1.6935548323317326e-05, "loss": 10.1433, "step": 1434 }, { "epoch": 0.2803829620945682, "grad_norm": 9.639922142028809, "learning_rate": 1.693098767718573e-05, "loss": 9.7468, "step": 1435 }, { "epoch": 0.28057835091832745, "grad_norm": 7.536942005157471, "learning_rate": 1.6926424254978388e-05, "loss": 9.557, "step": 1436 }, { "epoch": 0.28077373974208675, "grad_norm": 9.589834213256836, "learning_rate": 1.692185805852309e-05, "loss": 8.855, "step": 1437 }, { "epoch": 0.28096912856584605, "grad_norm": 8.762582778930664, "learning_rate": 1.6917289089648742e-05, "loss": 8.8605, "step": 1438 }, { "epoch": 0.2811645173896053, "grad_norm": 9.100251197814941, "learning_rate": 1.6912717350185353e-05, "loss": 9.8335, "step": 1439 }, { "epoch": 0.2813599062133646, "grad_norm": 8.672943115234375, "learning_rate": 1.6908142841964055e-05, "loss": 9.9712, "step": 1440 }, { "epoch": 0.2815552950371239, "grad_norm": 8.250885963439941, "learning_rate": 1.6903565566817073e-05, "loss": 10.1313, "step": 1441 }, { "epoch": 0.28175068386088314, "grad_norm": 67.72570037841797, "learning_rate": 1.689898552657775e-05, "loss": 7.4254, "step": 1442 }, { "epoch": 0.28194607268464245, "grad_norm": 12.001933097839355, "learning_rate": 1.689440272308054e-05, "loss": 9.6222, "step": 1443 }, { "epoch": 0.2821414615084017, "grad_norm": 8.292913436889648, "learning_rate": 1.688981715816099e-05, "loss": 9.6154, "step": 1444 }, { "epoch": 0.282336850332161, "grad_norm": 7.754610061645508, "learning_rate": 1.688522883365576e-05, "loss": 10.2804, "step": 1445 }, { "epoch": 0.2825322391559203, "grad_norm": 11.714853286743164, "learning_rate": 1.6880637751402623e-05, "loss": 10.2299, "step": 1446 }, { "epoch": 0.28272762797967954, "grad_norm": 9.187288284301758, "learning_rate": 1.687604391324045e-05, "loss": 10.3888, "step": 1447 }, { "epoch": 0.28292301680343884, "grad_norm": 13.724797248840332, "learning_rate": 1.6871447321009215e-05, "loss": 10.1753, "step": 1448 }, { "epoch": 0.28311840562719814, "grad_norm": 8.328743934631348, "learning_rate": 1.6866847976549988e-05, "loss": 9.6432, "step": 1449 }, { "epoch": 0.2833137944509574, "grad_norm": 10.776053428649902, "learning_rate": 1.686224588170496e-05, "loss": 10.8265, "step": 1450 }, { "epoch": 0.2835091832747167, "grad_norm": 9.196435928344727, "learning_rate": 1.6857641038317404e-05, "loss": 9.3719, "step": 1451 }, { "epoch": 0.283704572098476, "grad_norm": 11.425103187561035, "learning_rate": 1.6853033448231712e-05, "loss": 10.7306, "step": 1452 }, { "epoch": 0.28389996092223524, "grad_norm": 9.222018241882324, "learning_rate": 1.684842311329336e-05, "loss": 9.7921, "step": 1453 }, { "epoch": 0.28409534974599454, "grad_norm": 10.456576347351074, "learning_rate": 1.684381003534893e-05, "loss": 10.2656, "step": 1454 }, { "epoch": 0.2842907385697538, "grad_norm": 8.699284553527832, "learning_rate": 1.683919421624611e-05, "loss": 9.208, "step": 1455 }, { "epoch": 0.2844861273935131, "grad_norm": 10.197521209716797, "learning_rate": 1.683457565783367e-05, "loss": 9.3122, "step": 1456 }, { "epoch": 0.2846815162172724, "grad_norm": 15.487138748168945, "learning_rate": 1.6829954361961492e-05, "loss": 9.501, "step": 1457 }, { "epoch": 0.28487690504103164, "grad_norm": 10.236659049987793, "learning_rate": 1.6825330330480545e-05, "loss": 9.9833, "step": 1458 }, { "epoch": 0.28507229386479094, "grad_norm": 9.559532165527344, "learning_rate": 1.68207035652429e-05, "loss": 9.6962, "step": 1459 }, { "epoch": 0.28526768268855024, "grad_norm": 10.54056453704834, "learning_rate": 1.6816074068101715e-05, "loss": 8.1149, "step": 1460 }, { "epoch": 0.2854630715123095, "grad_norm": 8.070180892944336, "learning_rate": 1.6811441840911252e-05, "loss": 8.8212, "step": 1461 }, { "epoch": 0.2856584603360688, "grad_norm": 11.812846183776855, "learning_rate": 1.6806806885526864e-05, "loss": 10.4092, "step": 1462 }, { "epoch": 0.28585384915982803, "grad_norm": 11.560187339782715, "learning_rate": 1.6802169203804992e-05, "loss": 9.4563, "step": 1463 }, { "epoch": 0.28604923798358733, "grad_norm": 14.845040321350098, "learning_rate": 1.679752879760317e-05, "loss": 9.9109, "step": 1464 }, { "epoch": 0.28624462680734664, "grad_norm": 17.191116333007812, "learning_rate": 1.6792885668780026e-05, "loss": 10.3424, "step": 1465 }, { "epoch": 0.2864400156311059, "grad_norm": 9.194401741027832, "learning_rate": 1.6788239819195277e-05, "loss": 9.9862, "step": 1466 }, { "epoch": 0.2866354044548652, "grad_norm": 7.765769958496094, "learning_rate": 1.6783591250709726e-05, "loss": 9.7139, "step": 1467 }, { "epoch": 0.2868307932786245, "grad_norm": 14.478752136230469, "learning_rate": 1.6778939965185278e-05, "loss": 10.2629, "step": 1468 }, { "epoch": 0.28702618210238373, "grad_norm": 9.576632499694824, "learning_rate": 1.677428596448491e-05, "loss": 10.4502, "step": 1469 }, { "epoch": 0.28722157092614303, "grad_norm": 11.754566192626953, "learning_rate": 1.6769629250472696e-05, "loss": 9.6511, "step": 1470 }, { "epoch": 0.2874169597499023, "grad_norm": 10.358704566955566, "learning_rate": 1.6764969825013795e-05, "loss": 9.386, "step": 1471 }, { "epoch": 0.2876123485736616, "grad_norm": 10.96502685546875, "learning_rate": 1.676030768997445e-05, "loss": 9.974, "step": 1472 }, { "epoch": 0.2878077373974209, "grad_norm": 12.029976844787598, "learning_rate": 1.6755642847221992e-05, "loss": 10.6669, "step": 1473 }, { "epoch": 0.28800312622118013, "grad_norm": 12.324685096740723, "learning_rate": 1.6750975298624838e-05, "loss": 9.6188, "step": 1474 }, { "epoch": 0.28819851504493943, "grad_norm": 10.851834297180176, "learning_rate": 1.674630504605248e-05, "loss": 9.612, "step": 1475 }, { "epoch": 0.28839390386869873, "grad_norm": 11.32132339477539, "learning_rate": 1.6741632091375505e-05, "loss": 10.2633, "step": 1476 }, { "epoch": 0.288589292692458, "grad_norm": 10.077210426330566, "learning_rate": 1.6736956436465573e-05, "loss": 10.4579, "step": 1477 }, { "epoch": 0.2887846815162173, "grad_norm": 12.926339149475098, "learning_rate": 1.673227808319543e-05, "loss": 11.0451, "step": 1478 }, { "epoch": 0.2889800703399766, "grad_norm": 16.012737274169922, "learning_rate": 1.6727597033438903e-05, "loss": 9.7502, "step": 1479 }, { "epoch": 0.2891754591637358, "grad_norm": 13.25388240814209, "learning_rate": 1.6722913289070896e-05, "loss": 9.355, "step": 1480 }, { "epoch": 0.28937084798749513, "grad_norm": 9.597887992858887, "learning_rate": 1.671822685196739e-05, "loss": 8.222, "step": 1481 }, { "epoch": 0.2895662368112544, "grad_norm": 9.557008743286133, "learning_rate": 1.6713537724005457e-05, "loss": 9.1877, "step": 1482 }, { "epoch": 0.2897616256350137, "grad_norm": 12.110699653625488, "learning_rate": 1.6708845907063234e-05, "loss": 10.3352, "step": 1483 }, { "epoch": 0.289957014458773, "grad_norm": 10.078930854797363, "learning_rate": 1.6704151403019938e-05, "loss": 10.5232, "step": 1484 }, { "epoch": 0.2901524032825322, "grad_norm": 11.133780479431152, "learning_rate": 1.6699454213755868e-05, "loss": 10.0798, "step": 1485 }, { "epoch": 0.2903477921062915, "grad_norm": 11.386680603027344, "learning_rate": 1.669475434115239e-05, "loss": 10.3686, "step": 1486 }, { "epoch": 0.2905431809300508, "grad_norm": 12.639415740966797, "learning_rate": 1.669005178709195e-05, "loss": 9.2262, "step": 1487 }, { "epoch": 0.29073856975381007, "grad_norm": 12.360422134399414, "learning_rate": 1.6685346553458067e-05, "loss": 9.796, "step": 1488 }, { "epoch": 0.2909339585775694, "grad_norm": 9.941367149353027, "learning_rate": 1.6680638642135337e-05, "loss": 8.8607, "step": 1489 }, { "epoch": 0.2911293474013286, "grad_norm": 8.1162109375, "learning_rate": 1.6675928055009417e-05, "loss": 8.7807, "step": 1490 }, { "epoch": 0.2913247362250879, "grad_norm": 9.627894401550293, "learning_rate": 1.667121479396705e-05, "loss": 8.5564, "step": 1491 }, { "epoch": 0.2915201250488472, "grad_norm": 8.931618690490723, "learning_rate": 1.6666498860896036e-05, "loss": 9.3509, "step": 1492 }, { "epoch": 0.29171551387260647, "grad_norm": 11.936795234680176, "learning_rate": 1.666178025768526e-05, "loss": 10.2361, "step": 1493 }, { "epoch": 0.29191090269636577, "grad_norm": 10.509285926818848, "learning_rate": 1.6657058986224667e-05, "loss": 9.7758, "step": 1494 }, { "epoch": 0.29210629152012507, "grad_norm": 6.613710403442383, "learning_rate": 1.6652335048405275e-05, "loss": 9.2565, "step": 1495 }, { "epoch": 0.2923016803438843, "grad_norm": 27.692249298095703, "learning_rate": 1.6647608446119163e-05, "loss": 10.6953, "step": 1496 }, { "epoch": 0.2924970691676436, "grad_norm": 12.839340209960938, "learning_rate": 1.6642879181259483e-05, "loss": 9.8403, "step": 1497 }, { "epoch": 0.29269245799140287, "grad_norm": 8.90999984741211, "learning_rate": 1.663814725572045e-05, "loss": 10.0489, "step": 1498 }, { "epoch": 0.29288784681516217, "grad_norm": 10.096855163574219, "learning_rate": 1.6633412671397357e-05, "loss": 10.1652, "step": 1499 }, { "epoch": 0.29308323563892147, "grad_norm": 8.625805854797363, "learning_rate": 1.6628675430186543e-05, "loss": 9.3864, "step": 1500 }, { "epoch": 0.2932786244626807, "grad_norm": 12.385692596435547, "learning_rate": 1.662393553398542e-05, "loss": 8.7917, "step": 1501 }, { "epoch": 0.29347401328644, "grad_norm": 10.390081405639648, "learning_rate": 1.661919298469247e-05, "loss": 10.0085, "step": 1502 }, { "epoch": 0.2936694021101993, "grad_norm": 16.05337142944336, "learning_rate": 1.6614447784207226e-05, "loss": 10.1085, "step": 1503 }, { "epoch": 0.29386479093395856, "grad_norm": 10.438437461853027, "learning_rate": 1.660969993443029e-05, "loss": 9.9805, "step": 1504 }, { "epoch": 0.29406017975771787, "grad_norm": 11.209931373596191, "learning_rate": 1.6604949437263322e-05, "loss": 10.66, "step": 1505 }, { "epoch": 0.29425556858147717, "grad_norm": 11.702326774597168, "learning_rate": 1.6600196294609047e-05, "loss": 9.4944, "step": 1506 }, { "epoch": 0.2944509574052364, "grad_norm": 9.846290588378906, "learning_rate": 1.6595440508371238e-05, "loss": 9.8013, "step": 1507 }, { "epoch": 0.2946463462289957, "grad_norm": 8.696939468383789, "learning_rate": 1.6590682080454748e-05, "loss": 9.4621, "step": 1508 }, { "epoch": 0.29484173505275496, "grad_norm": 9.543524742126465, "learning_rate": 1.6585921012765465e-05, "loss": 9.3651, "step": 1509 }, { "epoch": 0.29503712387651426, "grad_norm": 11.260859489440918, "learning_rate": 1.6581157307210345e-05, "loss": 9.5992, "step": 1510 }, { "epoch": 0.29523251270027356, "grad_norm": 10.757518768310547, "learning_rate": 1.6576390965697403e-05, "loss": 10.2075, "step": 1511 }, { "epoch": 0.2954279015240328, "grad_norm": 9.129809379577637, "learning_rate": 1.6571621990135708e-05, "loss": 9.8907, "step": 1512 }, { "epoch": 0.2956232903477921, "grad_norm": 8.124305725097656, "learning_rate": 1.6566850382435378e-05, "loss": 7.8761, "step": 1513 }, { "epoch": 0.2958186791715514, "grad_norm": 8.545839309692383, "learning_rate": 1.6562076144507595e-05, "loss": 9.7005, "step": 1514 }, { "epoch": 0.29601406799531066, "grad_norm": 8.617210388183594, "learning_rate": 1.6557299278264586e-05, "loss": 9.6294, "step": 1515 }, { "epoch": 0.29620945681906996, "grad_norm": 10.131733894348145, "learning_rate": 1.6552519785619633e-05, "loss": 8.6846, "step": 1516 }, { "epoch": 0.2964048456428292, "grad_norm": 12.793623924255371, "learning_rate": 1.6547737668487078e-05, "loss": 9.1373, "step": 1517 }, { "epoch": 0.2966002344665885, "grad_norm": 10.047884941101074, "learning_rate": 1.6542952928782302e-05, "loss": 10.5081, "step": 1518 }, { "epoch": 0.2967956232903478, "grad_norm": 11.640722274780273, "learning_rate": 1.6538165568421742e-05, "loss": 8.9963, "step": 1519 }, { "epoch": 0.29699101211410706, "grad_norm": 9.68021297454834, "learning_rate": 1.6533375589322883e-05, "loss": 10.3106, "step": 1520 }, { "epoch": 0.29718640093786636, "grad_norm": 29.985857009887695, "learning_rate": 1.6528582993404266e-05, "loss": 10.0603, "step": 1521 }, { "epoch": 0.29738178976162566, "grad_norm": 10.854527473449707, "learning_rate": 1.652378778258547e-05, "loss": 9.6001, "step": 1522 }, { "epoch": 0.2975771785853849, "grad_norm": 11.029958724975586, "learning_rate": 1.6518989958787126e-05, "loss": 9.0629, "step": 1523 }, { "epoch": 0.2977725674091442, "grad_norm": 8.962169647216797, "learning_rate": 1.6514189523930913e-05, "loss": 8.4699, "step": 1524 }, { "epoch": 0.29796795623290345, "grad_norm": 8.509246826171875, "learning_rate": 1.6509386479939554e-05, "loss": 9.2693, "step": 1525 }, { "epoch": 0.29816334505666275, "grad_norm": 10.364874839782715, "learning_rate": 1.650458082873682e-05, "loss": 8.625, "step": 1526 }, { "epoch": 0.29835873388042206, "grad_norm": 10.83619499206543, "learning_rate": 1.6499772572247518e-05, "loss": 10.3062, "step": 1527 }, { "epoch": 0.2985541227041813, "grad_norm": 13.112933158874512, "learning_rate": 1.64949617123975e-05, "loss": 9.9893, "step": 1528 }, { "epoch": 0.2987495115279406, "grad_norm": 7.441256046295166, "learning_rate": 1.649014825111368e-05, "loss": 9.1927, "step": 1529 }, { "epoch": 0.2989449003516999, "grad_norm": 14.765359878540039, "learning_rate": 1.6485332190323984e-05, "loss": 10.6637, "step": 1530 }, { "epoch": 0.29914028917545915, "grad_norm": 11.117351531982422, "learning_rate": 1.64805135319574e-05, "loss": 9.4884, "step": 1531 }, { "epoch": 0.29933567799921845, "grad_norm": 6.486770153045654, "learning_rate": 1.647569227794395e-05, "loss": 9.4596, "step": 1532 }, { "epoch": 0.2995310668229777, "grad_norm": 9.962038040161133, "learning_rate": 1.64708684302147e-05, "loss": 9.6684, "step": 1533 }, { "epoch": 0.299726455646737, "grad_norm": 8.842194557189941, "learning_rate": 1.6466041990701743e-05, "loss": 8.9051, "step": 1534 }, { "epoch": 0.2999218444704963, "grad_norm": 10.530966758728027, "learning_rate": 1.646121296133822e-05, "loss": 10.2824, "step": 1535 }, { "epoch": 0.30011723329425555, "grad_norm": 10.525800704956055, "learning_rate": 1.645638134405831e-05, "loss": 9.6589, "step": 1536 }, { "epoch": 0.30031262211801485, "grad_norm": 8.37814998626709, "learning_rate": 1.6451547140797223e-05, "loss": 9.7822, "step": 1537 }, { "epoch": 0.30050801094177415, "grad_norm": 8.279181480407715, "learning_rate": 1.6446710353491213e-05, "loss": 9.5246, "step": 1538 }, { "epoch": 0.3007033997655334, "grad_norm": 10.936617851257324, "learning_rate": 1.6441870984077556e-05, "loss": 9.5018, "step": 1539 }, { "epoch": 0.3008987885892927, "grad_norm": 8.712897300720215, "learning_rate": 1.6437029034494574e-05, "loss": 10.3973, "step": 1540 }, { "epoch": 0.301094177413052, "grad_norm": 24.52730369567871, "learning_rate": 1.6432184506681615e-05, "loss": 9.4271, "step": 1541 }, { "epoch": 0.30128956623681125, "grad_norm": 11.265439987182617, "learning_rate": 1.6427337402579065e-05, "loss": 8.8542, "step": 1542 }, { "epoch": 0.30148495506057055, "grad_norm": 8.999545097351074, "learning_rate": 1.6422487724128343e-05, "loss": 8.8798, "step": 1543 }, { "epoch": 0.3016803438843298, "grad_norm": 10.456027030944824, "learning_rate": 1.6417635473271893e-05, "loss": 9.0947, "step": 1544 }, { "epoch": 0.3018757327080891, "grad_norm": 15.720943450927734, "learning_rate": 1.6412780651953187e-05, "loss": 9.244, "step": 1545 }, { "epoch": 0.3020711215318484, "grad_norm": 11.093077659606934, "learning_rate": 1.6407923262116736e-05, "loss": 10.4142, "step": 1546 }, { "epoch": 0.30226651035560764, "grad_norm": 10.72560977935791, "learning_rate": 1.6403063305708076e-05, "loss": 9.9932, "step": 1547 }, { "epoch": 0.30246189917936694, "grad_norm": 10.00912094116211, "learning_rate": 1.6398200784673773e-05, "loss": 10.841, "step": 1548 }, { "epoch": 0.30265728800312625, "grad_norm": 11.052425384521484, "learning_rate": 1.639333570096141e-05, "loss": 9.2576, "step": 1549 }, { "epoch": 0.3028526768268855, "grad_norm": 8.651049613952637, "learning_rate": 1.638846805651961e-05, "loss": 9.9265, "step": 1550 }, { "epoch": 0.3030480656506448, "grad_norm": 7.818670272827148, "learning_rate": 1.638359785329802e-05, "loss": 8.814, "step": 1551 }, { "epoch": 0.30324345447440404, "grad_norm": 9.49549674987793, "learning_rate": 1.6378725093247297e-05, "loss": 9.3743, "step": 1552 }, { "epoch": 0.30343884329816334, "grad_norm": 7.494236469268799, "learning_rate": 1.637384977831914e-05, "loss": 10.0042, "step": 1553 }, { "epoch": 0.30363423212192264, "grad_norm": 10.093852043151855, "learning_rate": 1.636897191046626e-05, "loss": 9.2741, "step": 1554 }, { "epoch": 0.3038296209456819, "grad_norm": 9.646723747253418, "learning_rate": 1.63640914916424e-05, "loss": 10.0743, "step": 1555 }, { "epoch": 0.3040250097694412, "grad_norm": 14.061182975769043, "learning_rate": 1.6359208523802308e-05, "loss": 9.7071, "step": 1556 }, { "epoch": 0.3042203985932005, "grad_norm": 7.9713664054870605, "learning_rate": 1.6354323008901774e-05, "loss": 8.947, "step": 1557 }, { "epoch": 0.30441578741695974, "grad_norm": 7.878183364868164, "learning_rate": 1.63494349488976e-05, "loss": 9.4146, "step": 1558 }, { "epoch": 0.30461117624071904, "grad_norm": 9.49204158782959, "learning_rate": 1.63445443457476e-05, "loss": 10.5473, "step": 1559 }, { "epoch": 0.3048065650644783, "grad_norm": 9.174206733703613, "learning_rate": 1.6339651201410613e-05, "loss": 9.1709, "step": 1560 }, { "epoch": 0.3050019538882376, "grad_norm": 8.276881217956543, "learning_rate": 1.6334755517846496e-05, "loss": 9.2224, "step": 1561 }, { "epoch": 0.3051973427119969, "grad_norm": 11.268220901489258, "learning_rate": 1.632985729701612e-05, "loss": 9.8205, "step": 1562 }, { "epoch": 0.30539273153575613, "grad_norm": 9.72193717956543, "learning_rate": 1.6324956540881386e-05, "loss": 8.4075, "step": 1563 }, { "epoch": 0.30558812035951544, "grad_norm": 10.148341178894043, "learning_rate": 1.6320053251405187e-05, "loss": 9.7546, "step": 1564 }, { "epoch": 0.30578350918327474, "grad_norm": 9.791842460632324, "learning_rate": 1.6315147430551442e-05, "loss": 9.1544, "step": 1565 }, { "epoch": 0.305978898007034, "grad_norm": 27.13488006591797, "learning_rate": 1.6310239080285088e-05, "loss": 9.4843, "step": 1566 }, { "epoch": 0.3061742868307933, "grad_norm": 8.317152976989746, "learning_rate": 1.6305328202572078e-05, "loss": 9.1695, "step": 1567 }, { "epoch": 0.3063696756545526, "grad_norm": 13.713216781616211, "learning_rate": 1.630041479937936e-05, "loss": 9.5583, "step": 1568 }, { "epoch": 0.30656506447831183, "grad_norm": 27.447023391723633, "learning_rate": 1.629549887267491e-05, "loss": 10.8052, "step": 1569 }, { "epoch": 0.30676045330207113, "grad_norm": 11.13930892944336, "learning_rate": 1.6290580424427713e-05, "loss": 9.2008, "step": 1570 }, { "epoch": 0.3069558421258304, "grad_norm": 12.243010520935059, "learning_rate": 1.628565945660775e-05, "loss": 8.071, "step": 1571 }, { "epoch": 0.3071512309495897, "grad_norm": 9.27982234954834, "learning_rate": 1.6280735971186033e-05, "loss": 9.7449, "step": 1572 }, { "epoch": 0.307346619773349, "grad_norm": 10.390607833862305, "learning_rate": 1.6275809970134565e-05, "loss": 9.8616, "step": 1573 }, { "epoch": 0.30754200859710823, "grad_norm": 12.398528099060059, "learning_rate": 1.6270881455426358e-05, "loss": 9.521, "step": 1574 }, { "epoch": 0.30773739742086753, "grad_norm": 11.779969215393066, "learning_rate": 1.6265950429035445e-05, "loss": 10.1166, "step": 1575 }, { "epoch": 0.30793278624462683, "grad_norm": 10.970566749572754, "learning_rate": 1.6261016892936844e-05, "loss": 10.4658, "step": 1576 }, { "epoch": 0.3081281750683861, "grad_norm": 7.940483093261719, "learning_rate": 1.6256080849106603e-05, "loss": 9.0797, "step": 1577 }, { "epoch": 0.3083235638921454, "grad_norm": 11.488015174865723, "learning_rate": 1.6251142299521746e-05, "loss": 8.9782, "step": 1578 }, { "epoch": 0.3085189527159046, "grad_norm": 8.953866958618164, "learning_rate": 1.6246201246160327e-05, "loss": 8.9, "step": 1579 }, { "epoch": 0.30871434153966393, "grad_norm": 12.140908241271973, "learning_rate": 1.6241257691001386e-05, "loss": 11.0306, "step": 1580 }, { "epoch": 0.30890973036342323, "grad_norm": 10.783838272094727, "learning_rate": 1.623631163602497e-05, "loss": 9.4288, "step": 1581 }, { "epoch": 0.3091051191871825, "grad_norm": 10.663506507873535, "learning_rate": 1.6231363083212128e-05, "loss": 9.1419, "step": 1582 }, { "epoch": 0.3093005080109418, "grad_norm": 28.735572814941406, "learning_rate": 1.622641203454491e-05, "loss": 9.655, "step": 1583 }, { "epoch": 0.3094958968347011, "grad_norm": 9.377638816833496, "learning_rate": 1.6221458492006367e-05, "loss": 9.819, "step": 1584 }, { "epoch": 0.3096912856584603, "grad_norm": 16.58684539794922, "learning_rate": 1.6216502457580542e-05, "loss": 10.2696, "step": 1585 }, { "epoch": 0.3098866744822196, "grad_norm": 11.1240816116333, "learning_rate": 1.621154393325248e-05, "loss": 9.5811, "step": 1586 }, { "epoch": 0.31008206330597887, "grad_norm": 8.305037498474121, "learning_rate": 1.6206582921008233e-05, "loss": 9.3075, "step": 1587 }, { "epoch": 0.3102774521297382, "grad_norm": 10.81719970703125, "learning_rate": 1.620161942283483e-05, "loss": 10.3475, "step": 1588 }, { "epoch": 0.3104728409534975, "grad_norm": 14.510613441467285, "learning_rate": 1.619665344072031e-05, "loss": 9.751, "step": 1589 }, { "epoch": 0.3106682297772567, "grad_norm": 9.30660629272461, "learning_rate": 1.6191684976653707e-05, "loss": 9.7671, "step": 1590 }, { "epoch": 0.310863618601016, "grad_norm": 8.766847610473633, "learning_rate": 1.6186714032625036e-05, "loss": 9.5391, "step": 1591 }, { "epoch": 0.3110590074247753, "grad_norm": 8.718366622924805, "learning_rate": 1.6181740610625324e-05, "loss": 9.3201, "step": 1592 }, { "epoch": 0.31125439624853457, "grad_norm": 7.148858547210693, "learning_rate": 1.6176764712646575e-05, "loss": 9.3771, "step": 1593 }, { "epoch": 0.31144978507229387, "grad_norm": 14.913329124450684, "learning_rate": 1.617178634068179e-05, "loss": 10.153, "step": 1594 }, { "epoch": 0.3116451738960531, "grad_norm": 9.894980430603027, "learning_rate": 1.6166805496724965e-05, "loss": 9.9227, "step": 1595 }, { "epoch": 0.3118405627198124, "grad_norm": 8.090249061584473, "learning_rate": 1.6161822182771084e-05, "loss": 10.1949, "step": 1596 }, { "epoch": 0.3120359515435717, "grad_norm": 10.747549057006836, "learning_rate": 1.6156836400816112e-05, "loss": 9.2133, "step": 1597 }, { "epoch": 0.31223134036733097, "grad_norm": 8.165460586547852, "learning_rate": 1.6151848152857017e-05, "loss": 7.9304, "step": 1598 }, { "epoch": 0.31242672919109027, "grad_norm": 9.286834716796875, "learning_rate": 1.6146857440891744e-05, "loss": 9.6123, "step": 1599 }, { "epoch": 0.31262211801484957, "grad_norm": 10.36357307434082, "learning_rate": 1.6141864266919228e-05, "loss": 9.3243, "step": 1600 }, { "epoch": 0.3128175068386088, "grad_norm": 13.824201583862305, "learning_rate": 1.613686863293939e-05, "loss": 9.7002, "step": 1601 }, { "epoch": 0.3130128956623681, "grad_norm": 8.38644027709961, "learning_rate": 1.6131870540953135e-05, "loss": 9.3296, "step": 1602 }, { "epoch": 0.3132082844861274, "grad_norm": 39.436885833740234, "learning_rate": 1.6126869992962357e-05, "loss": 9.4538, "step": 1603 }, { "epoch": 0.31340367330988667, "grad_norm": 21.05567741394043, "learning_rate": 1.6121866990969926e-05, "loss": 9.7832, "step": 1604 }, { "epoch": 0.31359906213364597, "grad_norm": 9.28034496307373, "learning_rate": 1.611686153697971e-05, "loss": 9.8811, "step": 1605 }, { "epoch": 0.3137944509574052, "grad_norm": 7.713963985443115, "learning_rate": 1.611185363299654e-05, "loss": 9.5874, "step": 1606 }, { "epoch": 0.3139898397811645, "grad_norm": 8.904071807861328, "learning_rate": 1.6106843281026237e-05, "loss": 10.1699, "step": 1607 }, { "epoch": 0.3141852286049238, "grad_norm": 11.779389381408691, "learning_rate": 1.6101830483075603e-05, "loss": 9.974, "step": 1608 }, { "epoch": 0.31438061742868306, "grad_norm": 10.87238597869873, "learning_rate": 1.6096815241152427e-05, "loss": 10.2095, "step": 1609 }, { "epoch": 0.31457600625244236, "grad_norm": 8.63390827178955, "learning_rate": 1.6091797557265455e-05, "loss": 9.8568, "step": 1610 }, { "epoch": 0.31477139507620167, "grad_norm": 13.952505111694336, "learning_rate": 1.6086777433424435e-05, "loss": 9.9195, "step": 1611 }, { "epoch": 0.3149667838999609, "grad_norm": 7.961946964263916, "learning_rate": 1.6081754871640084e-05, "loss": 8.5242, "step": 1612 }, { "epoch": 0.3151621727237202, "grad_norm": 9.592717170715332, "learning_rate": 1.607672987392409e-05, "loss": 9.3546, "step": 1613 }, { "epoch": 0.31535756154747946, "grad_norm": 6.381619930267334, "learning_rate": 1.6071702442289115e-05, "loss": 9.2855, "step": 1614 }, { "epoch": 0.31555295037123876, "grad_norm": 10.898557662963867, "learning_rate": 1.6066672578748814e-05, "loss": 9.8498, "step": 1615 }, { "epoch": 0.31574833919499806, "grad_norm": 13.637084007263184, "learning_rate": 1.6061640285317795e-05, "loss": 10.5978, "step": 1616 }, { "epoch": 0.3159437280187573, "grad_norm": 10.367321014404297, "learning_rate": 1.605660556401165e-05, "loss": 8.6832, "step": 1617 }, { "epoch": 0.3161391168425166, "grad_norm": 9.48491382598877, "learning_rate": 1.6051568416846936e-05, "loss": 9.1167, "step": 1618 }, { "epoch": 0.3163345056662759, "grad_norm": 12.643638610839844, "learning_rate": 1.6046528845841195e-05, "loss": 9.4473, "step": 1619 }, { "epoch": 0.31652989449003516, "grad_norm": 9.137619972229004, "learning_rate": 1.604148685301293e-05, "loss": 9.1735, "step": 1620 }, { "epoch": 0.31672528331379446, "grad_norm": 13.785964012145996, "learning_rate": 1.603644244038161e-05, "loss": 10.3451, "step": 1621 }, { "epoch": 0.3169206721375537, "grad_norm": 10.2887544631958, "learning_rate": 1.6031395609967683e-05, "loss": 8.9395, "step": 1622 }, { "epoch": 0.317116060961313, "grad_norm": 12.658092498779297, "learning_rate": 1.6026346363792565e-05, "loss": 8.7574, "step": 1623 }, { "epoch": 0.3173114497850723, "grad_norm": 9.074075698852539, "learning_rate": 1.6021294703878632e-05, "loss": 9.1438, "step": 1624 }, { "epoch": 0.31750683860883155, "grad_norm": 9.075652122497559, "learning_rate": 1.6016240632249224e-05, "loss": 9.7798, "step": 1625 }, { "epoch": 0.31770222743259086, "grad_norm": 9.15244197845459, "learning_rate": 1.6011184150928667e-05, "loss": 8.891, "step": 1626 }, { "epoch": 0.31789761625635016, "grad_norm": 8.605587005615234, "learning_rate": 1.600612526194223e-05, "loss": 9.6025, "step": 1627 }, { "epoch": 0.3180930050801094, "grad_norm": 8.151103973388672, "learning_rate": 1.6001063967316156e-05, "loss": 9.012, "step": 1628 }, { "epoch": 0.3182883939038687, "grad_norm": 9.278593063354492, "learning_rate": 1.599600026907765e-05, "loss": 9.6449, "step": 1629 }, { "epoch": 0.318483782727628, "grad_norm": 9.308077812194824, "learning_rate": 1.5990934169254885e-05, "loss": 10.2807, "step": 1630 }, { "epoch": 0.31867917155138725, "grad_norm": 10.449454307556152, "learning_rate": 1.598586566987699e-05, "loss": 10.107, "step": 1631 }, { "epoch": 0.31887456037514655, "grad_norm": 11.943887710571289, "learning_rate": 1.598079477297405e-05, "loss": 9.5064, "step": 1632 }, { "epoch": 0.3190699491989058, "grad_norm": 7.746683120727539, "learning_rate": 1.5975721480577124e-05, "loss": 8.9351, "step": 1633 }, { "epoch": 0.3192653380226651, "grad_norm": 14.373797416687012, "learning_rate": 1.597064579471822e-05, "loss": 9.4663, "step": 1634 }, { "epoch": 0.3194607268464244, "grad_norm": 8.865236282348633, "learning_rate": 1.5965567717430304e-05, "loss": 9.718, "step": 1635 }, { "epoch": 0.31965611567018365, "grad_norm": 11.387980461120605, "learning_rate": 1.5960487250747314e-05, "loss": 9.918, "step": 1636 }, { "epoch": 0.31985150449394295, "grad_norm": 9.649009704589844, "learning_rate": 1.5955404396704127e-05, "loss": 8.6794, "step": 1637 }, { "epoch": 0.32004689331770225, "grad_norm": 10.171775817871094, "learning_rate": 1.595031915733658e-05, "loss": 10.2739, "step": 1638 }, { "epoch": 0.3202422821414615, "grad_norm": 17.069379806518555, "learning_rate": 1.5945231534681483e-05, "loss": 8.3431, "step": 1639 }, { "epoch": 0.3204376709652208, "grad_norm": 9.645978927612305, "learning_rate": 1.5940141530776573e-05, "loss": 9.5865, "step": 1640 }, { "epoch": 0.32063305978898005, "grad_norm": 9.110332489013672, "learning_rate": 1.593504914766056e-05, "loss": 9.1653, "step": 1641 }, { "epoch": 0.32082844861273935, "grad_norm": 21.16898536682129, "learning_rate": 1.5929954387373103e-05, "loss": 10.0758, "step": 1642 }, { "epoch": 0.32102383743649865, "grad_norm": 13.377840042114258, "learning_rate": 1.5924857251954805e-05, "loss": 9.098, "step": 1643 }, { "epoch": 0.3212192262602579, "grad_norm": 9.212141036987305, "learning_rate": 1.5919757743447237e-05, "loss": 9.5257, "step": 1644 }, { "epoch": 0.3214146150840172, "grad_norm": 10.903090476989746, "learning_rate": 1.5914655863892907e-05, "loss": 10.3237, "step": 1645 }, { "epoch": 0.3216100039077765, "grad_norm": 10.470623016357422, "learning_rate": 1.5909551615335274e-05, "loss": 9.6061, "step": 1646 }, { "epoch": 0.32180539273153574, "grad_norm": 19.114530563354492, "learning_rate": 1.5904444999818748e-05, "loss": 9.5387, "step": 1647 }, { "epoch": 0.32200078155529505, "grad_norm": 10.143712997436523, "learning_rate": 1.5899336019388686e-05, "loss": 9.6485, "step": 1648 }, { "epoch": 0.3221961703790543, "grad_norm": 12.910961151123047, "learning_rate": 1.5894224676091395e-05, "loss": 9.614, "step": 1649 }, { "epoch": 0.3223915592028136, "grad_norm": 9.364129066467285, "learning_rate": 1.588911097197413e-05, "loss": 10.3305, "step": 1650 }, { "epoch": 0.3225869480265729, "grad_norm": 11.014893531799316, "learning_rate": 1.588399490908508e-05, "loss": 10.3429, "step": 1651 }, { "epoch": 0.32278233685033214, "grad_norm": 11.558199882507324, "learning_rate": 1.5878876489473397e-05, "loss": 9.7504, "step": 1652 }, { "epoch": 0.32297772567409144, "grad_norm": 11.256964683532715, "learning_rate": 1.587375571518916e-05, "loss": 9.6539, "step": 1653 }, { "epoch": 0.32317311449785074, "grad_norm": 10.854321479797363, "learning_rate": 1.58686325882834e-05, "loss": 9.5224, "step": 1654 }, { "epoch": 0.32336850332161, "grad_norm": 12.20022201538086, "learning_rate": 1.586350711080809e-05, "loss": 9.9883, "step": 1655 }, { "epoch": 0.3235638921453693, "grad_norm": 9.082717895507812, "learning_rate": 1.5858379284816136e-05, "loss": 10.2387, "step": 1656 }, { "epoch": 0.3237592809691286, "grad_norm": 9.35047721862793, "learning_rate": 1.58532491123614e-05, "loss": 9.0864, "step": 1657 }, { "epoch": 0.32395466979288784, "grad_norm": 9.99332332611084, "learning_rate": 1.5848116595498676e-05, "loss": 9.0237, "step": 1658 }, { "epoch": 0.32415005861664714, "grad_norm": 8.102620124816895, "learning_rate": 1.5842981736283686e-05, "loss": 9.7664, "step": 1659 }, { "epoch": 0.3243454474404064, "grad_norm": 9.926274299621582, "learning_rate": 1.583784453677311e-05, "loss": 8.8169, "step": 1660 }, { "epoch": 0.3245408362641657, "grad_norm": 8.649129867553711, "learning_rate": 1.5832704999024552e-05, "loss": 8.4021, "step": 1661 }, { "epoch": 0.324736225087925, "grad_norm": 10.611956596374512, "learning_rate": 1.5827563125096557e-05, "loss": 10.3733, "step": 1662 }, { "epoch": 0.32493161391168424, "grad_norm": 8.192151069641113, "learning_rate": 1.582241891704861e-05, "loss": 9.454, "step": 1663 }, { "epoch": 0.32512700273544354, "grad_norm": 10.12881851196289, "learning_rate": 1.581727237694112e-05, "loss": 10.0913, "step": 1664 }, { "epoch": 0.32532239155920284, "grad_norm": 9.004416465759277, "learning_rate": 1.5812123506835435e-05, "loss": 9.4404, "step": 1665 }, { "epoch": 0.3255177803829621, "grad_norm": 13.841079711914062, "learning_rate": 1.580697230879384e-05, "loss": 9.7292, "step": 1666 }, { "epoch": 0.3257131692067214, "grad_norm": 12.088377952575684, "learning_rate": 1.580181878487955e-05, "loss": 9.7977, "step": 1667 }, { "epoch": 0.32590855803048063, "grad_norm": 7.993840217590332, "learning_rate": 1.5796662937156714e-05, "loss": 8.9673, "step": 1668 }, { "epoch": 0.32610394685423993, "grad_norm": 8.977736473083496, "learning_rate": 1.5791504767690403e-05, "loss": 9.04, "step": 1669 }, { "epoch": 0.32629933567799924, "grad_norm": 13.686468124389648, "learning_rate": 1.5786344278546627e-05, "loss": 10.4661, "step": 1670 }, { "epoch": 0.3264947245017585, "grad_norm": 7.914223670959473, "learning_rate": 1.5781181471792326e-05, "loss": 9.5493, "step": 1671 }, { "epoch": 0.3266901133255178, "grad_norm": 7.73126745223999, "learning_rate": 1.5776016349495355e-05, "loss": 10.191, "step": 1672 }, { "epoch": 0.3268855021492771, "grad_norm": 12.177936553955078, "learning_rate": 1.5770848913724516e-05, "loss": 10.3791, "step": 1673 }, { "epoch": 0.32708089097303633, "grad_norm": 20.568925857543945, "learning_rate": 1.576567916654952e-05, "loss": 9.4601, "step": 1674 }, { "epoch": 0.32727627979679563, "grad_norm": 12.07807731628418, "learning_rate": 1.5760507110041017e-05, "loss": 10.3394, "step": 1675 }, { "epoch": 0.3274716686205549, "grad_norm": 7.828347206115723, "learning_rate": 1.5755332746270574e-05, "loss": 9.282, "step": 1676 }, { "epoch": 0.3276670574443142, "grad_norm": 6.870009422302246, "learning_rate": 1.575015607731068e-05, "loss": 9.4014, "step": 1677 }, { "epoch": 0.3278624462680735, "grad_norm": 9.491081237792969, "learning_rate": 1.5744977105234756e-05, "loss": 10.2289, "step": 1678 }, { "epoch": 0.32805783509183273, "grad_norm": 8.813817977905273, "learning_rate": 1.5739795832117142e-05, "loss": 8.1194, "step": 1679 }, { "epoch": 0.32825322391559203, "grad_norm": 8.379867553710938, "learning_rate": 1.5734612260033094e-05, "loss": 9.8672, "step": 1680 }, { "epoch": 0.32844861273935133, "grad_norm": 13.194073677062988, "learning_rate": 1.57294263910588e-05, "loss": 9.9284, "step": 1681 }, { "epoch": 0.3286440015631106, "grad_norm": 6.913089275360107, "learning_rate": 1.572423822727136e-05, "loss": 9.1499, "step": 1682 }, { "epoch": 0.3288393903868699, "grad_norm": 11.585538864135742, "learning_rate": 1.571904777074879e-05, "loss": 9.3277, "step": 1683 }, { "epoch": 0.3290347792106291, "grad_norm": 10.432095527648926, "learning_rate": 1.5713855023570032e-05, "loss": 9.8317, "step": 1684 }, { "epoch": 0.3292301680343884, "grad_norm": 8.566866874694824, "learning_rate": 1.5708659987814945e-05, "loss": 8.5996, "step": 1685 }, { "epoch": 0.32942555685814773, "grad_norm": 18.440052032470703, "learning_rate": 1.5703462665564296e-05, "loss": 10.0485, "step": 1686 }, { "epoch": 0.329620945681907, "grad_norm": 65.54505920410156, "learning_rate": 1.5698263058899778e-05, "loss": 8.6768, "step": 1687 }, { "epoch": 0.3298163345056663, "grad_norm": 10.525182723999023, "learning_rate": 1.5693061169903998e-05, "loss": 9.9954, "step": 1688 }, { "epoch": 0.3300117233294256, "grad_norm": 11.578959465026855, "learning_rate": 1.5687857000660467e-05, "loss": 10.4881, "step": 1689 }, { "epoch": 0.3302071121531848, "grad_norm": 11.128236770629883, "learning_rate": 1.568265055325362e-05, "loss": 10.3662, "step": 1690 }, { "epoch": 0.3304025009769441, "grad_norm": 9.159039497375488, "learning_rate": 1.5677441829768802e-05, "loss": 9.982, "step": 1691 }, { "epoch": 0.3305978898007034, "grad_norm": 15.565377235412598, "learning_rate": 1.5672230832292264e-05, "loss": 10.0356, "step": 1692 }, { "epoch": 0.33079327862446267, "grad_norm": 12.125045776367188, "learning_rate": 1.566701756291118e-05, "loss": 9.6776, "step": 1693 }, { "epoch": 0.330988667448222, "grad_norm": 9.87791633605957, "learning_rate": 1.5661802023713618e-05, "loss": 10.6344, "step": 1694 }, { "epoch": 0.3311840562719812, "grad_norm": 8.393369674682617, "learning_rate": 1.5656584216788567e-05, "loss": 9.7636, "step": 1695 }, { "epoch": 0.3313794450957405, "grad_norm": 11.450444221496582, "learning_rate": 1.565136414422592e-05, "loss": 11.0457, "step": 1696 }, { "epoch": 0.3315748339194998, "grad_norm": 10.343286514282227, "learning_rate": 1.564614180811648e-05, "loss": 9.8457, "step": 1697 }, { "epoch": 0.33177022274325907, "grad_norm": 8.537278175354004, "learning_rate": 1.5640917210551955e-05, "loss": 9.4423, "step": 1698 }, { "epoch": 0.33196561156701837, "grad_norm": 16.415647506713867, "learning_rate": 1.5635690353624955e-05, "loss": 9.2602, "step": 1699 }, { "epoch": 0.33216100039077767, "grad_norm": 12.417532920837402, "learning_rate": 1.5630461239429e-05, "loss": 10.9, "step": 1700 }, { "epoch": 0.3323563892145369, "grad_norm": 6.117668628692627, "learning_rate": 1.5625229870058515e-05, "loss": 8.3, "step": 1701 }, { "epoch": 0.3325517780382962, "grad_norm": 8.208422660827637, "learning_rate": 1.561999624760882e-05, "loss": 9.441, "step": 1702 }, { "epoch": 0.33274716686205547, "grad_norm": 7.063049793243408, "learning_rate": 1.5614760374176153e-05, "loss": 9.227, "step": 1703 }, { "epoch": 0.33294255568581477, "grad_norm": 27.353660583496094, "learning_rate": 1.5609522251857634e-05, "loss": 10.7016, "step": 1704 }, { "epoch": 0.33313794450957407, "grad_norm": 8.364476203918457, "learning_rate": 1.5604281882751297e-05, "loss": 9.6392, "step": 1705 }, { "epoch": 0.3333333333333333, "grad_norm": 12.525715827941895, "learning_rate": 1.5599039268956072e-05, "loss": 10.7313, "step": 1706 }, { "epoch": 0.3335287221570926, "grad_norm": 10.193623542785645, "learning_rate": 1.5593794412571798e-05, "loss": 9.5809, "step": 1707 }, { "epoch": 0.3337241109808519, "grad_norm": 12.494501113891602, "learning_rate": 1.5588547315699188e-05, "loss": 9.9022, "step": 1708 }, { "epoch": 0.33391949980461116, "grad_norm": 13.320052146911621, "learning_rate": 1.5583297980439873e-05, "loss": 10.4568, "step": 1709 }, { "epoch": 0.33411488862837047, "grad_norm": 8.454721450805664, "learning_rate": 1.5578046408896378e-05, "loss": 9.1103, "step": 1710 }, { "epoch": 0.3343102774521297, "grad_norm": 10.274154663085938, "learning_rate": 1.5572792603172117e-05, "loss": 9.895, "step": 1711 }, { "epoch": 0.334505666275889, "grad_norm": 9.209165573120117, "learning_rate": 1.5567536565371404e-05, "loss": 10.0295, "step": 1712 }, { "epoch": 0.3347010550996483, "grad_norm": 57.516273498535156, "learning_rate": 1.5562278297599444e-05, "loss": 9.9245, "step": 1713 }, { "epoch": 0.33489644392340756, "grad_norm": 8.862438201904297, "learning_rate": 1.5557017801962334e-05, "loss": 9.4419, "step": 1714 }, { "epoch": 0.33509183274716686, "grad_norm": 8.271980285644531, "learning_rate": 1.5551755080567076e-05, "loss": 9.4185, "step": 1715 }, { "epoch": 0.33528722157092616, "grad_norm": 9.476627349853516, "learning_rate": 1.554649013552154e-05, "loss": 9.6879, "step": 1716 }, { "epoch": 0.3354826103946854, "grad_norm": 10.101987838745117, "learning_rate": 1.554122296893451e-05, "loss": 8.4631, "step": 1717 }, { "epoch": 0.3356779992184447, "grad_norm": 9.779273986816406, "learning_rate": 1.5535953582915643e-05, "loss": 9.814, "step": 1718 }, { "epoch": 0.335873388042204, "grad_norm": 12.370697021484375, "learning_rate": 1.5530681979575496e-05, "loss": 10.0133, "step": 1719 }, { "epoch": 0.33606877686596326, "grad_norm": 7.9293694496154785, "learning_rate": 1.5525408161025507e-05, "loss": 9.9495, "step": 1720 }, { "epoch": 0.33626416568972256, "grad_norm": 19.309507369995117, "learning_rate": 1.5520132129378006e-05, "loss": 9.9442, "step": 1721 }, { "epoch": 0.3364595545134818, "grad_norm": 7.228127956390381, "learning_rate": 1.551485388674621e-05, "loss": 9.0594, "step": 1722 }, { "epoch": 0.3366549433372411, "grad_norm": 18.292306900024414, "learning_rate": 1.5509573435244215e-05, "loss": 9.2468, "step": 1723 }, { "epoch": 0.3368503321610004, "grad_norm": 14.835461616516113, "learning_rate": 1.5504290776987005e-05, "loss": 10.1081, "step": 1724 }, { "epoch": 0.33704572098475966, "grad_norm": 9.334471702575684, "learning_rate": 1.5499005914090447e-05, "loss": 9.1263, "step": 1725 }, { "epoch": 0.33724110980851896, "grad_norm": 12.06333065032959, "learning_rate": 1.5493718848671302e-05, "loss": 8.5533, "step": 1726 }, { "epoch": 0.33743649863227826, "grad_norm": 10.127058029174805, "learning_rate": 1.5488429582847194e-05, "loss": 8.5659, "step": 1727 }, { "epoch": 0.3376318874560375, "grad_norm": 9.882468223571777, "learning_rate": 1.5483138118736643e-05, "loss": 8.3786, "step": 1728 }, { "epoch": 0.3378272762797968, "grad_norm": 9.601874351501465, "learning_rate": 1.5477844458459046e-05, "loss": 10.2354, "step": 1729 }, { "epoch": 0.33802266510355605, "grad_norm": 11.464741706848145, "learning_rate": 1.5472548604134675e-05, "loss": 8.6515, "step": 1730 }, { "epoch": 0.33821805392731535, "grad_norm": 9.604881286621094, "learning_rate": 1.5467250557884685e-05, "loss": 10.6031, "step": 1731 }, { "epoch": 0.33841344275107466, "grad_norm": 8.280776023864746, "learning_rate": 1.5461950321831105e-05, "loss": 10.2086, "step": 1732 }, { "epoch": 0.3386088315748339, "grad_norm": 10.27190113067627, "learning_rate": 1.545664789809685e-05, "loss": 9.7946, "step": 1733 }, { "epoch": 0.3388042203985932, "grad_norm": 8.053266525268555, "learning_rate": 1.54513432888057e-05, "loss": 9.3896, "step": 1734 }, { "epoch": 0.3389996092223525, "grad_norm": 9.985084533691406, "learning_rate": 1.544603649608232e-05, "loss": 10.0078, "step": 1735 }, { "epoch": 0.33919499804611175, "grad_norm": 10.035849571228027, "learning_rate": 1.5440727522052247e-05, "loss": 10.6757, "step": 1736 }, { "epoch": 0.33939038686987105, "grad_norm": 11.540842056274414, "learning_rate": 1.543541636884188e-05, "loss": 9.3436, "step": 1737 }, { "epoch": 0.3395857756936303, "grad_norm": 7.954980373382568, "learning_rate": 1.5430103038578513e-05, "loss": 9.0339, "step": 1738 }, { "epoch": 0.3397811645173896, "grad_norm": 19.364994049072266, "learning_rate": 1.542478753339029e-05, "loss": 8.4139, "step": 1739 }, { "epoch": 0.3399765533411489, "grad_norm": 8.921440124511719, "learning_rate": 1.5419469855406243e-05, "loss": 8.6741, "step": 1740 }, { "epoch": 0.34017194216490815, "grad_norm": 26.762006759643555, "learning_rate": 1.5414150006756262e-05, "loss": 10.0279, "step": 1741 }, { "epoch": 0.34036733098866745, "grad_norm": 7.822473526000977, "learning_rate": 1.540882798957111e-05, "loss": 9.2955, "step": 1742 }, { "epoch": 0.34056271981242675, "grad_norm": 9.464631080627441, "learning_rate": 1.5403503805982428e-05, "loss": 9.8347, "step": 1743 }, { "epoch": 0.340758108636186, "grad_norm": 8.8168363571167, "learning_rate": 1.539817745812271e-05, "loss": 10.2843, "step": 1744 }, { "epoch": 0.3409534974599453, "grad_norm": 9.104181289672852, "learning_rate": 1.539284894812533e-05, "loss": 9.0267, "step": 1745 }, { "epoch": 0.34114888628370454, "grad_norm": 9.172795295715332, "learning_rate": 1.5387518278124512e-05, "loss": 10.059, "step": 1746 }, { "epoch": 0.34134427510746385, "grad_norm": 8.707510948181152, "learning_rate": 1.5382185450255365e-05, "loss": 9.7586, "step": 1747 }, { "epoch": 0.34153966393122315, "grad_norm": 9.747553825378418, "learning_rate": 1.5376850466653844e-05, "loss": 9.5557, "step": 1748 }, { "epoch": 0.3417350527549824, "grad_norm": 10.045906066894531, "learning_rate": 1.537151332945678e-05, "loss": 9.9312, "step": 1749 }, { "epoch": 0.3419304415787417, "grad_norm": 11.068819046020508, "learning_rate": 1.5366174040801866e-05, "loss": 9.7978, "step": 1750 }, { "epoch": 0.342125830402501, "grad_norm": 11.23110580444336, "learning_rate": 1.5360832602827644e-05, "loss": 10.2003, "step": 1751 }, { "epoch": 0.34232121922626024, "grad_norm": 8.130640029907227, "learning_rate": 1.5355489017673534e-05, "loss": 9.0259, "step": 1752 }, { "epoch": 0.34251660805001954, "grad_norm": 9.70206069946289, "learning_rate": 1.5350143287479796e-05, "loss": 9.0877, "step": 1753 }, { "epoch": 0.34271199687377885, "grad_norm": 10.980212211608887, "learning_rate": 1.5344795414387575e-05, "loss": 10.7845, "step": 1754 }, { "epoch": 0.3429073856975381, "grad_norm": 9.177943229675293, "learning_rate": 1.5339445400538852e-05, "loss": 9.9836, "step": 1755 }, { "epoch": 0.3431027745212974, "grad_norm": 8.501229286193848, "learning_rate": 1.5334093248076472e-05, "loss": 9.2696, "step": 1756 }, { "epoch": 0.34329816334505664, "grad_norm": 10.649053573608398, "learning_rate": 1.5328738959144146e-05, "loss": 8.7284, "step": 1757 }, { "epoch": 0.34349355216881594, "grad_norm": 9.538617134094238, "learning_rate": 1.532338253588643e-05, "loss": 9.8119, "step": 1758 }, { "epoch": 0.34368894099257524, "grad_norm": 7.793221473693848, "learning_rate": 1.5318023980448735e-05, "loss": 9.6133, "step": 1759 }, { "epoch": 0.3438843298163345, "grad_norm": 7.8072590827941895, "learning_rate": 1.531266329497733e-05, "loss": 9.7504, "step": 1760 }, { "epoch": 0.3440797186400938, "grad_norm": 7.636564254760742, "learning_rate": 1.5307300481619334e-05, "loss": 9.3218, "step": 1761 }, { "epoch": 0.3442751074638531, "grad_norm": 6.859006404876709, "learning_rate": 1.5301935542522725e-05, "loss": 9.4971, "step": 1762 }, { "epoch": 0.34447049628761234, "grad_norm": 7.057887554168701, "learning_rate": 1.5296568479836326e-05, "loss": 8.7375, "step": 1763 }, { "epoch": 0.34466588511137164, "grad_norm": 10.694537162780762, "learning_rate": 1.5291199295709808e-05, "loss": 9.8011, "step": 1764 }, { "epoch": 0.3448612739351309, "grad_norm": 7.027894973754883, "learning_rate": 1.5285827992293703e-05, "loss": 9.0979, "step": 1765 }, { "epoch": 0.3450566627588902, "grad_norm": 9.455521583557129, "learning_rate": 1.528045457173938e-05, "loss": 9.0439, "step": 1766 }, { "epoch": 0.3452520515826495, "grad_norm": 8.299240112304688, "learning_rate": 1.5275079036199062e-05, "loss": 9.5298, "step": 1767 }, { "epoch": 0.34544744040640873, "grad_norm": 192.6204071044922, "learning_rate": 1.5269701387825815e-05, "loss": 9.5424, "step": 1768 }, { "epoch": 0.34564282923016804, "grad_norm": 35.3787956237793, "learning_rate": 1.526432162877356e-05, "loss": 9.8712, "step": 1769 }, { "epoch": 0.34583821805392734, "grad_norm": 12.05440616607666, "learning_rate": 1.525893976119705e-05, "loss": 9.401, "step": 1770 }, { "epoch": 0.3460336068776866, "grad_norm": 16.06743621826172, "learning_rate": 1.5253555787251899e-05, "loss": 9.285, "step": 1771 }, { "epoch": 0.3462289957014459, "grad_norm": 9.969664573669434, "learning_rate": 1.5248169709094551e-05, "loss": 9.3563, "step": 1772 }, { "epoch": 0.34642438452520513, "grad_norm": 43.8010139465332, "learning_rate": 1.5242781528882296e-05, "loss": 10.163, "step": 1773 }, { "epoch": 0.34661977334896443, "grad_norm": 10.185831069946289, "learning_rate": 1.5237391248773273e-05, "loss": 10.1241, "step": 1774 }, { "epoch": 0.34681516217272373, "grad_norm": 58.117881774902344, "learning_rate": 1.5231998870926448e-05, "loss": 10.2288, "step": 1775 }, { "epoch": 0.347010550996483, "grad_norm": 9.52348804473877, "learning_rate": 1.522660439750164e-05, "loss": 8.9495, "step": 1776 }, { "epoch": 0.3472059398202423, "grad_norm": 18.378759384155273, "learning_rate": 1.5221207830659506e-05, "loss": 10.8928, "step": 1777 }, { "epoch": 0.3474013286440016, "grad_norm": 12.922347068786621, "learning_rate": 1.521580917256154e-05, "loss": 10.3628, "step": 1778 }, { "epoch": 0.34759671746776083, "grad_norm": 16.34457778930664, "learning_rate": 1.5210408425370062e-05, "loss": 10.6221, "step": 1779 }, { "epoch": 0.34779210629152013, "grad_norm": 7.653688907623291, "learning_rate": 1.5205005591248248e-05, "loss": 8.9949, "step": 1780 }, { "epoch": 0.34798749511527943, "grad_norm": 10.004982948303223, "learning_rate": 1.5199600672360096e-05, "loss": 8.9597, "step": 1781 }, { "epoch": 0.3481828839390387, "grad_norm": 12.164095878601074, "learning_rate": 1.5194193670870447e-05, "loss": 9.9517, "step": 1782 }, { "epoch": 0.348378272762798, "grad_norm": 10.798413276672363, "learning_rate": 1.5188784588944972e-05, "loss": 9.9719, "step": 1783 }, { "epoch": 0.3485736615865572, "grad_norm": 10.619060516357422, "learning_rate": 1.5183373428750177e-05, "loss": 8.9537, "step": 1784 }, { "epoch": 0.34876905041031653, "grad_norm": 27.77444839477539, "learning_rate": 1.5177960192453399e-05, "loss": 10.713, "step": 1785 }, { "epoch": 0.34896443923407583, "grad_norm": 12.40777587890625, "learning_rate": 1.5172544882222804e-05, "loss": 9.9726, "step": 1786 }, { "epoch": 0.3491598280578351, "grad_norm": 8.204813003540039, "learning_rate": 1.5167127500227397e-05, "loss": 9.753, "step": 1787 }, { "epoch": 0.3493552168815944, "grad_norm": 10.18585205078125, "learning_rate": 1.5161708048637002e-05, "loss": 9.7292, "step": 1788 }, { "epoch": 0.3495506057053537, "grad_norm": 9.334511756896973, "learning_rate": 1.5156286529622284e-05, "loss": 9.4344, "step": 1789 }, { "epoch": 0.3497459945291129, "grad_norm": 12.067798614501953, "learning_rate": 1.5150862945354728e-05, "loss": 10.4729, "step": 1790 }, { "epoch": 0.3499413833528722, "grad_norm": 9.017687797546387, "learning_rate": 1.5145437298006642e-05, "loss": 9.6782, "step": 1791 }, { "epoch": 0.35013677217663147, "grad_norm": 8.172805786132812, "learning_rate": 1.5140009589751174e-05, "loss": 9.9983, "step": 1792 }, { "epoch": 0.3503321610003908, "grad_norm": 13.06108570098877, "learning_rate": 1.5134579822762287e-05, "loss": 9.3318, "step": 1793 }, { "epoch": 0.3505275498241501, "grad_norm": 8.837092399597168, "learning_rate": 1.512914799921477e-05, "loss": 10.2875, "step": 1794 }, { "epoch": 0.3507229386479093, "grad_norm": 7.7178263664245605, "learning_rate": 1.512371412128424e-05, "loss": 9.5769, "step": 1795 }, { "epoch": 0.3509183274716686, "grad_norm": 7.829312324523926, "learning_rate": 1.5118278191147129e-05, "loss": 9.539, "step": 1796 }, { "epoch": 0.3511137162954279, "grad_norm": 9.15108585357666, "learning_rate": 1.5112840210980698e-05, "loss": 9.4839, "step": 1797 }, { "epoch": 0.35130910511918717, "grad_norm": 9.459131240844727, "learning_rate": 1.5107400182963025e-05, "loss": 10.9973, "step": 1798 }, { "epoch": 0.35150449394294647, "grad_norm": 25.22171401977539, "learning_rate": 1.5101958109273016e-05, "loss": 10.1825, "step": 1799 }, { "epoch": 0.3516998827667057, "grad_norm": 10.554967880249023, "learning_rate": 1.5096513992090386e-05, "loss": 9.6381, "step": 1800 }, { "epoch": 0.351895271590465, "grad_norm": 9.855436325073242, "learning_rate": 1.5091067833595672e-05, "loss": 10.3401, "step": 1801 }, { "epoch": 0.3520906604142243, "grad_norm": 7.670804500579834, "learning_rate": 1.5085619635970235e-05, "loss": 8.7629, "step": 1802 }, { "epoch": 0.35228604923798357, "grad_norm": 12.818094253540039, "learning_rate": 1.508016940139624e-05, "loss": 10.2233, "step": 1803 }, { "epoch": 0.35248143806174287, "grad_norm": 9.649227142333984, "learning_rate": 1.5074717132056678e-05, "loss": 9.2624, "step": 1804 }, { "epoch": 0.35267682688550217, "grad_norm": 9.296341896057129, "learning_rate": 1.5069262830135356e-05, "loss": 9.6492, "step": 1805 }, { "epoch": 0.3528722157092614, "grad_norm": 16.60502815246582, "learning_rate": 1.5063806497816887e-05, "loss": 9.9349, "step": 1806 }, { "epoch": 0.3530676045330207, "grad_norm": 23.677478790283203, "learning_rate": 1.5058348137286704e-05, "loss": 10.1734, "step": 1807 }, { "epoch": 0.35326299335678, "grad_norm": 9.695001602172852, "learning_rate": 1.5052887750731046e-05, "loss": 9.4951, "step": 1808 }, { "epoch": 0.35345838218053927, "grad_norm": 8.7704439163208, "learning_rate": 1.5047425340336972e-05, "loss": 8.7867, "step": 1809 }, { "epoch": 0.35365377100429857, "grad_norm": 8.951241493225098, "learning_rate": 1.5041960908292343e-05, "loss": 8.2299, "step": 1810 }, { "epoch": 0.3538491598280578, "grad_norm": 9.119094848632812, "learning_rate": 1.5036494456785837e-05, "loss": 9.7607, "step": 1811 }, { "epoch": 0.3540445486518171, "grad_norm": 13.40417194366455, "learning_rate": 1.5031025988006935e-05, "loss": 10.2292, "step": 1812 }, { "epoch": 0.3542399374755764, "grad_norm": 19.00713348388672, "learning_rate": 1.502555550414593e-05, "loss": 10.1842, "step": 1813 }, { "epoch": 0.35443532629933566, "grad_norm": 12.991162300109863, "learning_rate": 1.5020083007393919e-05, "loss": 10.2614, "step": 1814 }, { "epoch": 0.35463071512309496, "grad_norm": 9.391922950744629, "learning_rate": 1.5014608499942808e-05, "loss": 9.3905, "step": 1815 }, { "epoch": 0.35482610394685427, "grad_norm": 9.280734062194824, "learning_rate": 1.5009131983985308e-05, "loss": 9.9366, "step": 1816 }, { "epoch": 0.3550214927706135, "grad_norm": 7.751325607299805, "learning_rate": 1.5003653461714934e-05, "loss": 9.3776, "step": 1817 }, { "epoch": 0.3552168815943728, "grad_norm": 7.636277198791504, "learning_rate": 1.4998172935326003e-05, "loss": 9.1454, "step": 1818 }, { "epoch": 0.35541227041813206, "grad_norm": 8.686695098876953, "learning_rate": 1.4992690407013635e-05, "loss": 9.4679, "step": 1819 }, { "epoch": 0.35560765924189136, "grad_norm": 9.64791488647461, "learning_rate": 1.4987205878973755e-05, "loss": 9.3872, "step": 1820 }, { "epoch": 0.35580304806565066, "grad_norm": 11.299792289733887, "learning_rate": 1.4981719353403085e-05, "loss": 10.114, "step": 1821 }, { "epoch": 0.3559984368894099, "grad_norm": 12.038385391235352, "learning_rate": 1.497623083249915e-05, "loss": 8.477, "step": 1822 }, { "epoch": 0.3561938257131692, "grad_norm": 9.453662872314453, "learning_rate": 1.4970740318460271e-05, "loss": 9.853, "step": 1823 }, { "epoch": 0.3563892145369285, "grad_norm": 9.599242210388184, "learning_rate": 1.4965247813485571e-05, "loss": 9.8869, "step": 1824 }, { "epoch": 0.35658460336068776, "grad_norm": 11.355889320373535, "learning_rate": 1.4959753319774968e-05, "loss": 9.2784, "step": 1825 }, { "epoch": 0.35677999218444706, "grad_norm": 11.106788635253906, "learning_rate": 1.495425683952918e-05, "loss": 10.4052, "step": 1826 }, { "epoch": 0.3569753810082063, "grad_norm": 8.836578369140625, "learning_rate": 1.4948758374949715e-05, "loss": 8.5516, "step": 1827 }, { "epoch": 0.3571707698319656, "grad_norm": 11.409369468688965, "learning_rate": 1.4943257928238879e-05, "loss": 9.7967, "step": 1828 }, { "epoch": 0.3573661586557249, "grad_norm": 7.801117420196533, "learning_rate": 1.4937755501599774e-05, "loss": 8.9674, "step": 1829 }, { "epoch": 0.35756154747948415, "grad_norm": 8.499917984008789, "learning_rate": 1.493225109723629e-05, "loss": 9.0892, "step": 1830 }, { "epoch": 0.35775693630324346, "grad_norm": 10.434982299804688, "learning_rate": 1.4926744717353112e-05, "loss": 9.8643, "step": 1831 }, { "epoch": 0.35795232512700276, "grad_norm": 9.691671371459961, "learning_rate": 1.4921236364155717e-05, "loss": 9.6132, "step": 1832 }, { "epoch": 0.358147713950762, "grad_norm": 7.935885429382324, "learning_rate": 1.4915726039850376e-05, "loss": 8.4978, "step": 1833 }, { "epoch": 0.3583431027745213, "grad_norm": 51.44169998168945, "learning_rate": 1.4910213746644136e-05, "loss": 9.921, "step": 1834 }, { "epoch": 0.35853849159828055, "grad_norm": 10.459129333496094, "learning_rate": 1.4904699486744848e-05, "loss": 9.5983, "step": 1835 }, { "epoch": 0.35873388042203985, "grad_norm": 9.238507270812988, "learning_rate": 1.489918326236114e-05, "loss": 8.4888, "step": 1836 }, { "epoch": 0.35892926924579915, "grad_norm": 17.272464752197266, "learning_rate": 1.4893665075702435e-05, "loss": 9.608, "step": 1837 }, { "epoch": 0.3591246580695584, "grad_norm": 8.849822044372559, "learning_rate": 1.4888144928978936e-05, "loss": 9.3512, "step": 1838 }, { "epoch": 0.3593200468933177, "grad_norm": 8.948539733886719, "learning_rate": 1.4882622824401632e-05, "loss": 8.9959, "step": 1839 }, { "epoch": 0.359515435717077, "grad_norm": 8.089892387390137, "learning_rate": 1.4877098764182301e-05, "loss": 9.1757, "step": 1840 }, { "epoch": 0.35971082454083625, "grad_norm": 9.682193756103516, "learning_rate": 1.4871572750533499e-05, "loss": 10.1555, "step": 1841 }, { "epoch": 0.35990621336459555, "grad_norm": 10.889745712280273, "learning_rate": 1.4866044785668563e-05, "loss": 8.9293, "step": 1842 }, { "epoch": 0.36010160218835485, "grad_norm": 8.232142448425293, "learning_rate": 1.486051487180162e-05, "loss": 9.5754, "step": 1843 }, { "epoch": 0.3602969910121141, "grad_norm": 8.604398727416992, "learning_rate": 1.4854983011147568e-05, "loss": 10.1241, "step": 1844 }, { "epoch": 0.3604923798358734, "grad_norm": 11.409536361694336, "learning_rate": 1.4849449205922088e-05, "loss": 10.0654, "step": 1845 }, { "epoch": 0.36068776865963265, "grad_norm": 12.24126148223877, "learning_rate": 1.4843913458341646e-05, "loss": 10.1095, "step": 1846 }, { "epoch": 0.36088315748339195, "grad_norm": 8.643593788146973, "learning_rate": 1.483837577062348e-05, "loss": 9.2632, "step": 1847 }, { "epoch": 0.36107854630715125, "grad_norm": 8.110797882080078, "learning_rate": 1.4832836144985601e-05, "loss": 8.6348, "step": 1848 }, { "epoch": 0.3612739351309105, "grad_norm": 10.314324378967285, "learning_rate": 1.4827294583646804e-05, "loss": 8.5782, "step": 1849 }, { "epoch": 0.3614693239546698, "grad_norm": 9.651543617248535, "learning_rate": 1.4821751088826655e-05, "loss": 9.8624, "step": 1850 }, { "epoch": 0.3616647127784291, "grad_norm": 9.259075164794922, "learning_rate": 1.4816205662745498e-05, "loss": 9.2482, "step": 1851 }, { "epoch": 0.36186010160218834, "grad_norm": 11.75296688079834, "learning_rate": 1.4810658307624445e-05, "loss": 8.6998, "step": 1852 }, { "epoch": 0.36205549042594765, "grad_norm": 12.283703804016113, "learning_rate": 1.4805109025685391e-05, "loss": 10.5427, "step": 1853 }, { "epoch": 0.3622508792497069, "grad_norm": 8.814277648925781, "learning_rate": 1.479955781915099e-05, "loss": 10.1233, "step": 1854 }, { "epoch": 0.3624462680734662, "grad_norm": 8.639643669128418, "learning_rate": 1.479400469024467e-05, "loss": 9.5759, "step": 1855 }, { "epoch": 0.3626416568972255, "grad_norm": 10.140213012695312, "learning_rate": 1.4788449641190637e-05, "loss": 10.0116, "step": 1856 }, { "epoch": 0.36283704572098474, "grad_norm": 9.60897445678711, "learning_rate": 1.4782892674213855e-05, "loss": 9.7824, "step": 1857 }, { "epoch": 0.36303243454474404, "grad_norm": 11.40339469909668, "learning_rate": 1.477733379154007e-05, "loss": 9.0601, "step": 1858 }, { "epoch": 0.36322782336850334, "grad_norm": 10.77801513671875, "learning_rate": 1.4771772995395779e-05, "loss": 8.0438, "step": 1859 }, { "epoch": 0.3634232121922626, "grad_norm": 10.345446586608887, "learning_rate": 1.476621028800826e-05, "loss": 9.349, "step": 1860 }, { "epoch": 0.3636186010160219, "grad_norm": 10.751293182373047, "learning_rate": 1.4760645671605545e-05, "loss": 9.2973, "step": 1861 }, { "epoch": 0.36381398983978114, "grad_norm": 12.415898323059082, "learning_rate": 1.475507914841644e-05, "loss": 9.0775, "step": 1862 }, { "epoch": 0.36400937866354044, "grad_norm": 9.695807456970215, "learning_rate": 1.4749510720670506e-05, "loss": 9.8121, "step": 1863 }, { "epoch": 0.36420476748729974, "grad_norm": 14.2394437789917, "learning_rate": 1.4743940390598072e-05, "loss": 10.325, "step": 1864 }, { "epoch": 0.364400156311059, "grad_norm": 10.003837585449219, "learning_rate": 1.4738368160430229e-05, "loss": 8.9347, "step": 1865 }, { "epoch": 0.3645955451348183, "grad_norm": 9.896443367004395, "learning_rate": 1.4732794032398832e-05, "loss": 9.6103, "step": 1866 }, { "epoch": 0.3647909339585776, "grad_norm": 8.426640510559082, "learning_rate": 1.4727218008736485e-05, "loss": 8.767, "step": 1867 }, { "epoch": 0.36498632278233684, "grad_norm": 8.99334716796875, "learning_rate": 1.4721640091676566e-05, "loss": 9.5023, "step": 1868 }, { "epoch": 0.36518171160609614, "grad_norm": 10.854475021362305, "learning_rate": 1.4716060283453198e-05, "loss": 9.2807, "step": 1869 }, { "epoch": 0.36537710042985544, "grad_norm": 10.936485290527344, "learning_rate": 1.4710478586301268e-05, "loss": 8.7673, "step": 1870 }, { "epoch": 0.3655724892536147, "grad_norm": 10.797525405883789, "learning_rate": 1.4704895002456424e-05, "loss": 9.6278, "step": 1871 }, { "epoch": 0.365767878077374, "grad_norm": 10.396178245544434, "learning_rate": 1.469930953415506e-05, "loss": 9.7169, "step": 1872 }, { "epoch": 0.36596326690113323, "grad_norm": 35.18453598022461, "learning_rate": 1.4693722183634332e-05, "loss": 9.0131, "step": 1873 }, { "epoch": 0.36615865572489253, "grad_norm": 11.25657844543457, "learning_rate": 1.4688132953132145e-05, "loss": 10.948, "step": 1874 }, { "epoch": 0.36635404454865184, "grad_norm": 11.835681915283203, "learning_rate": 1.4682541844887163e-05, "loss": 9.5515, "step": 1875 }, { "epoch": 0.3665494333724111, "grad_norm": 8.967255592346191, "learning_rate": 1.4676948861138793e-05, "loss": 9.7112, "step": 1876 }, { "epoch": 0.3667448221961704, "grad_norm": 12.371103286743164, "learning_rate": 1.4671354004127203e-05, "loss": 9.8506, "step": 1877 }, { "epoch": 0.3669402110199297, "grad_norm": 10.877632141113281, "learning_rate": 1.4665757276093306e-05, "loss": 10.1536, "step": 1878 }, { "epoch": 0.36713559984368893, "grad_norm": 12.124642372131348, "learning_rate": 1.4660158679278764e-05, "loss": 10.031, "step": 1879 }, { "epoch": 0.36733098866744823, "grad_norm": 9.705178260803223, "learning_rate": 1.465455821592599e-05, "loss": 9.7276, "step": 1880 }, { "epoch": 0.3675263774912075, "grad_norm": 9.293903350830078, "learning_rate": 1.4648955888278144e-05, "loss": 9.5486, "step": 1881 }, { "epoch": 0.3677217663149668, "grad_norm": 8.219917297363281, "learning_rate": 1.464335169857913e-05, "loss": 9.5627, "step": 1882 }, { "epoch": 0.3679171551387261, "grad_norm": 9.594404220581055, "learning_rate": 1.4637745649073602e-05, "loss": 9.9368, "step": 1883 }, { "epoch": 0.36811254396248533, "grad_norm": 18.886594772338867, "learning_rate": 1.4632137742006955e-05, "loss": 9.9665, "step": 1884 }, { "epoch": 0.36830793278624463, "grad_norm": 8.912412643432617, "learning_rate": 1.4626527979625334e-05, "loss": 9.4875, "step": 1885 }, { "epoch": 0.36850332161000393, "grad_norm": 8.886992454528809, "learning_rate": 1.4620916364175617e-05, "loss": 9.886, "step": 1886 }, { "epoch": 0.3686987104337632, "grad_norm": 9.128472328186035, "learning_rate": 1.4615302897905435e-05, "loss": 8.9634, "step": 1887 }, { "epoch": 0.3688940992575225, "grad_norm": 8.628862380981445, "learning_rate": 1.4609687583063155e-05, "loss": 9.6416, "step": 1888 }, { "epoch": 0.3690894880812817, "grad_norm": 10.561972618103027, "learning_rate": 1.4604070421897883e-05, "loss": 10.5866, "step": 1889 }, { "epoch": 0.369284876905041, "grad_norm": 7.685659885406494, "learning_rate": 1.4598451416659468e-05, "loss": 8.9428, "step": 1890 }, { "epoch": 0.36948026572880033, "grad_norm": 13.756418228149414, "learning_rate": 1.459283056959849e-05, "loss": 9.4792, "step": 1891 }, { "epoch": 0.3696756545525596, "grad_norm": 9.282796859741211, "learning_rate": 1.4587207882966283e-05, "loss": 9.8491, "step": 1892 }, { "epoch": 0.3698710433763189, "grad_norm": 9.124377250671387, "learning_rate": 1.45815833590149e-05, "loss": 9.6051, "step": 1893 }, { "epoch": 0.3700664322000782, "grad_norm": 9.004120826721191, "learning_rate": 1.4575956999997144e-05, "loss": 8.636, "step": 1894 }, { "epoch": 0.3702618210238374, "grad_norm": 8.152937889099121, "learning_rate": 1.4570328808166539e-05, "loss": 8.6843, "step": 1895 }, { "epoch": 0.3704572098475967, "grad_norm": 9.83988094329834, "learning_rate": 1.4564698785777353e-05, "loss": 9.4453, "step": 1896 }, { "epoch": 0.370652598671356, "grad_norm": 9.798221588134766, "learning_rate": 1.455906693508459e-05, "loss": 9.9185, "step": 1897 }, { "epoch": 0.37084798749511527, "grad_norm": 8.114480018615723, "learning_rate": 1.4553433258343974e-05, "loss": 9.3029, "step": 1898 }, { "epoch": 0.3710433763188746, "grad_norm": 10.6480712890625, "learning_rate": 1.454779775781197e-05, "loss": 10.0314, "step": 1899 }, { "epoch": 0.3712387651426338, "grad_norm": 9.375577926635742, "learning_rate": 1.4542160435745771e-05, "loss": 9.2519, "step": 1900 }, { "epoch": 0.3714341539663931, "grad_norm": 8.865910530090332, "learning_rate": 1.45365212944033e-05, "loss": 9.4088, "step": 1901 }, { "epoch": 0.3716295427901524, "grad_norm": 8.14194107055664, "learning_rate": 1.4530880336043212e-05, "loss": 8.474, "step": 1902 }, { "epoch": 0.37182493161391167, "grad_norm": 10.09233570098877, "learning_rate": 1.4525237562924879e-05, "loss": 9.8282, "step": 1903 }, { "epoch": 0.37202032043767097, "grad_norm": 7.718667507171631, "learning_rate": 1.4519592977308409e-05, "loss": 8.3733, "step": 1904 }, { "epoch": 0.37221570926143027, "grad_norm": 13.187790870666504, "learning_rate": 1.4513946581454634e-05, "loss": 10.4114, "step": 1905 }, { "epoch": 0.3724110980851895, "grad_norm": 7.69195556640625, "learning_rate": 1.4508298377625112e-05, "loss": 8.8665, "step": 1906 }, { "epoch": 0.3726064869089488, "grad_norm": 9.133148193359375, "learning_rate": 1.4502648368082123e-05, "loss": 8.5269, "step": 1907 }, { "epoch": 0.37280187573270807, "grad_norm": 14.639871597290039, "learning_rate": 1.4496996555088672e-05, "loss": 10.605, "step": 1908 }, { "epoch": 0.37299726455646737, "grad_norm": 10.028745651245117, "learning_rate": 1.4491342940908487e-05, "loss": 10.1821, "step": 1909 }, { "epoch": 0.37319265338022667, "grad_norm": 14.973143577575684, "learning_rate": 1.448568752780601e-05, "loss": 10.1627, "step": 1910 }, { "epoch": 0.3733880422039859, "grad_norm": 8.591376304626465, "learning_rate": 1.4480030318046417e-05, "loss": 8.6355, "step": 1911 }, { "epoch": 0.3735834310277452, "grad_norm": 9.579111099243164, "learning_rate": 1.447437131389559e-05, "loss": 9.7195, "step": 1912 }, { "epoch": 0.3737788198515045, "grad_norm": 8.835308074951172, "learning_rate": 1.446871051762014e-05, "loss": 8.7154, "step": 1913 }, { "epoch": 0.37397420867526376, "grad_norm": 10.81924057006836, "learning_rate": 1.4463047931487391e-05, "loss": 10.5608, "step": 1914 }, { "epoch": 0.37416959749902307, "grad_norm": 9.480253219604492, "learning_rate": 1.4457383557765385e-05, "loss": 9.7263, "step": 1915 }, { "epoch": 0.3743649863227823, "grad_norm": 16.763931274414062, "learning_rate": 1.4451717398722875e-05, "loss": 9.2355, "step": 1916 }, { "epoch": 0.3745603751465416, "grad_norm": 13.234987258911133, "learning_rate": 1.4446049456629339e-05, "loss": 10.6605, "step": 1917 }, { "epoch": 0.3747557639703009, "grad_norm": 8.773869514465332, "learning_rate": 1.4440379733754961e-05, "loss": 9.5099, "step": 1918 }, { "epoch": 0.37495115279406016, "grad_norm": 27.581687927246094, "learning_rate": 1.4434708232370642e-05, "loss": 10.4511, "step": 1919 }, { "epoch": 0.37514654161781946, "grad_norm": 10.067249298095703, "learning_rate": 1.4429034954747996e-05, "loss": 10.0346, "step": 1920 }, { "epoch": 0.37534193044157876, "grad_norm": 10.091471672058105, "learning_rate": 1.4423359903159342e-05, "loss": 9.173, "step": 1921 }, { "epoch": 0.375537319265338, "grad_norm": 7.579031467437744, "learning_rate": 1.441768307987772e-05, "loss": 9.0195, "step": 1922 }, { "epoch": 0.3757327080890973, "grad_norm": 8.659554481506348, "learning_rate": 1.4412004487176872e-05, "loss": 9.3103, "step": 1923 }, { "epoch": 0.37592809691285656, "grad_norm": 14.094969749450684, "learning_rate": 1.4406324127331246e-05, "loss": 10.1186, "step": 1924 }, { "epoch": 0.37612348573661586, "grad_norm": 9.010276794433594, "learning_rate": 1.440064200261601e-05, "loss": 10.1677, "step": 1925 }, { "epoch": 0.37631887456037516, "grad_norm": 7.850002288818359, "learning_rate": 1.4394958115307027e-05, "loss": 9.3536, "step": 1926 }, { "epoch": 0.3765142633841344, "grad_norm": 11.411676406860352, "learning_rate": 1.4389272467680871e-05, "loss": 10.2704, "step": 1927 }, { "epoch": 0.3767096522078937, "grad_norm": 8.324441909790039, "learning_rate": 1.4383585062014817e-05, "loss": 8.2264, "step": 1928 }, { "epoch": 0.376905041031653, "grad_norm": 17.55536460876465, "learning_rate": 1.4377895900586856e-05, "loss": 8.7914, "step": 1929 }, { "epoch": 0.37710042985541226, "grad_norm": 8.358119010925293, "learning_rate": 1.4372204985675666e-05, "loss": 8.6207, "step": 1930 }, { "epoch": 0.37729581867917156, "grad_norm": 8.256921768188477, "learning_rate": 1.4366512319560642e-05, "loss": 8.901, "step": 1931 }, { "epoch": 0.37749120750293086, "grad_norm": 9.609137535095215, "learning_rate": 1.4360817904521866e-05, "loss": 9.44, "step": 1932 }, { "epoch": 0.3776865963266901, "grad_norm": 8.766924858093262, "learning_rate": 1.4355121742840132e-05, "loss": 9.1647, "step": 1933 }, { "epoch": 0.3778819851504494, "grad_norm": 8.230937004089355, "learning_rate": 1.434942383679693e-05, "loss": 8.404, "step": 1934 }, { "epoch": 0.37807737397420865, "grad_norm": 7.7953643798828125, "learning_rate": 1.4343724188674446e-05, "loss": 10.0905, "step": 1935 }, { "epoch": 0.37827276279796795, "grad_norm": 9.239731788635254, "learning_rate": 1.4338022800755567e-05, "loss": 9.3502, "step": 1936 }, { "epoch": 0.37846815162172726, "grad_norm": 8.503083229064941, "learning_rate": 1.4332319675323879e-05, "loss": 9.2081, "step": 1937 }, { "epoch": 0.3786635404454865, "grad_norm": 9.946684837341309, "learning_rate": 1.4326614814663655e-05, "loss": 9.744, "step": 1938 }, { "epoch": 0.3788589292692458, "grad_norm": 10.687923431396484, "learning_rate": 1.4320908221059876e-05, "loss": 9.8484, "step": 1939 }, { "epoch": 0.3790543180930051, "grad_norm": 9.50954532623291, "learning_rate": 1.43151998967982e-05, "loss": 9.7537, "step": 1940 }, { "epoch": 0.37924970691676435, "grad_norm": 8.836617469787598, "learning_rate": 1.4309489844165e-05, "loss": 8.8867, "step": 1941 }, { "epoch": 0.37944509574052365, "grad_norm": 11.293417930603027, "learning_rate": 1.4303778065447319e-05, "loss": 9.4723, "step": 1942 }, { "epoch": 0.3796404845642829, "grad_norm": 9.453107833862305, "learning_rate": 1.4298064562932906e-05, "loss": 9.4513, "step": 1943 }, { "epoch": 0.3798358733880422, "grad_norm": 12.666526794433594, "learning_rate": 1.4292349338910199e-05, "loss": 10.6713, "step": 1944 }, { "epoch": 0.3800312622118015, "grad_norm": 7.939197540283203, "learning_rate": 1.4286632395668318e-05, "loss": 9.7284, "step": 1945 }, { "epoch": 0.38022665103556075, "grad_norm": 8.85693359375, "learning_rate": 1.4280913735497077e-05, "loss": 9.59, "step": 1946 }, { "epoch": 0.38042203985932005, "grad_norm": 7.0706658363342285, "learning_rate": 1.427519336068698e-05, "loss": 9.5534, "step": 1947 }, { "epoch": 0.38061742868307935, "grad_norm": 8.621312141418457, "learning_rate": 1.4269471273529212e-05, "loss": 8.332, "step": 1948 }, { "epoch": 0.3808128175068386, "grad_norm": 11.882975578308105, "learning_rate": 1.4263747476315646e-05, "loss": 10.2817, "step": 1949 }, { "epoch": 0.3810082063305979, "grad_norm": 10.135407447814941, "learning_rate": 1.4258021971338843e-05, "loss": 10.2998, "step": 1950 }, { "epoch": 0.38120359515435714, "grad_norm": 14.021071434020996, "learning_rate": 1.4252294760892042e-05, "loss": 9.3759, "step": 1951 }, { "epoch": 0.38139898397811645, "grad_norm": 161.7584991455078, "learning_rate": 1.4246565847269174e-05, "loss": 10.2743, "step": 1952 }, { "epoch": 0.38159437280187575, "grad_norm": 8.837624549865723, "learning_rate": 1.424083523276484e-05, "loss": 8.2313, "step": 1953 }, { "epoch": 0.381789761625635, "grad_norm": 16.58618927001953, "learning_rate": 1.4235102919674338e-05, "loss": 10.6641, "step": 1954 }, { "epoch": 0.3819851504493943, "grad_norm": 11.371005058288574, "learning_rate": 1.4229368910293625e-05, "loss": 8.9532, "step": 1955 }, { "epoch": 0.3821805392731536, "grad_norm": 10.293523788452148, "learning_rate": 1.422363320691936e-05, "loss": 8.8939, "step": 1956 }, { "epoch": 0.38237592809691284, "grad_norm": 9.805085182189941, "learning_rate": 1.4217895811848872e-05, "loss": 10.1148, "step": 1957 }, { "epoch": 0.38257131692067214, "grad_norm": 17.808725357055664, "learning_rate": 1.4212156727380155e-05, "loss": 9.7582, "step": 1958 }, { "epoch": 0.38276670574443145, "grad_norm": 11.809649467468262, "learning_rate": 1.42064159558119e-05, "loss": 9.4327, "step": 1959 }, { "epoch": 0.3829620945681907, "grad_norm": 8.403098106384277, "learning_rate": 1.4200673499443456e-05, "loss": 9.1216, "step": 1960 }, { "epoch": 0.38315748339195, "grad_norm": 22.475610733032227, "learning_rate": 1.4194929360574863e-05, "loss": 10.0235, "step": 1961 }, { "epoch": 0.38335287221570924, "grad_norm": 20.79180335998535, "learning_rate": 1.4189183541506821e-05, "loss": 9.8155, "step": 1962 }, { "epoch": 0.38354826103946854, "grad_norm": 9.602819442749023, "learning_rate": 1.4183436044540713e-05, "loss": 9.6023, "step": 1963 }, { "epoch": 0.38374364986322784, "grad_norm": 7.785224914550781, "learning_rate": 1.417768687197859e-05, "loss": 9.0554, "step": 1964 }, { "epoch": 0.3839390386869871, "grad_norm": 9.529844284057617, "learning_rate": 1.417193602612317e-05, "loss": 9.3794, "step": 1965 }, { "epoch": 0.3841344275107464, "grad_norm": 15.077787399291992, "learning_rate": 1.4166183509277848e-05, "loss": 9.3298, "step": 1966 }, { "epoch": 0.3843298163345057, "grad_norm": 11.620304107666016, "learning_rate": 1.4160429323746687e-05, "loss": 11.1838, "step": 1967 }, { "epoch": 0.38452520515826494, "grad_norm": 8.911892890930176, "learning_rate": 1.4154673471834414e-05, "loss": 9.7006, "step": 1968 }, { "epoch": 0.38472059398202424, "grad_norm": 7.756126403808594, "learning_rate": 1.4148915955846429e-05, "loss": 9.1773, "step": 1969 }, { "epoch": 0.3849159828057835, "grad_norm": 11.736945152282715, "learning_rate": 1.4143156778088795e-05, "loss": 9.1653, "step": 1970 }, { "epoch": 0.3851113716295428, "grad_norm": 10.644880294799805, "learning_rate": 1.4137395940868244e-05, "loss": 10.2337, "step": 1971 }, { "epoch": 0.3853067604533021, "grad_norm": 9.106091499328613, "learning_rate": 1.4131633446492163e-05, "loss": 8.9762, "step": 1972 }, { "epoch": 0.38550214927706133, "grad_norm": 10.716482162475586, "learning_rate": 1.4125869297268615e-05, "loss": 10.7699, "step": 1973 }, { "epoch": 0.38569753810082064, "grad_norm": 10.083828926086426, "learning_rate": 1.4120103495506321e-05, "loss": 9.2582, "step": 1974 }, { "epoch": 0.38589292692457994, "grad_norm": 9.934428215026855, "learning_rate": 1.4114336043514662e-05, "loss": 10.5586, "step": 1975 }, { "epoch": 0.3860883157483392, "grad_norm": 9.987054824829102, "learning_rate": 1.4108566943603683e-05, "loss": 9.2885, "step": 1976 }, { "epoch": 0.3862837045720985, "grad_norm": 18.581371307373047, "learning_rate": 1.4102796198084085e-05, "loss": 8.4393, "step": 1977 }, { "epoch": 0.38647909339585773, "grad_norm": 9.174590110778809, "learning_rate": 1.4097023809267235e-05, "loss": 8.9276, "step": 1978 }, { "epoch": 0.38667448221961703, "grad_norm": 9.374712944030762, "learning_rate": 1.409124977946515e-05, "loss": 9.4317, "step": 1979 }, { "epoch": 0.38686987104337633, "grad_norm": 10.881209373474121, "learning_rate": 1.408547411099051e-05, "loss": 9.9214, "step": 1980 }, { "epoch": 0.3870652598671356, "grad_norm": 8.33880615234375, "learning_rate": 1.4079696806156649e-05, "loss": 9.9988, "step": 1981 }, { "epoch": 0.3872606486908949, "grad_norm": 10.133203506469727, "learning_rate": 1.4073917867277557e-05, "loss": 10.4433, "step": 1982 }, { "epoch": 0.3874560375146542, "grad_norm": 9.079756736755371, "learning_rate": 1.4068137296667884e-05, "loss": 9.9795, "step": 1983 }, { "epoch": 0.38765142633841343, "grad_norm": 9.376028060913086, "learning_rate": 1.4062355096642917e-05, "loss": 9.1507, "step": 1984 }, { "epoch": 0.38784681516217273, "grad_norm": 8.1322660446167, "learning_rate": 1.4056571269518612e-05, "loss": 9.5236, "step": 1985 }, { "epoch": 0.388042203985932, "grad_norm": 11.051843643188477, "learning_rate": 1.4050785817611571e-05, "loss": 8.7444, "step": 1986 }, { "epoch": 0.3882375928096913, "grad_norm": 8.349790573120117, "learning_rate": 1.4044998743239051e-05, "loss": 10.1119, "step": 1987 }, { "epoch": 0.3884329816334506, "grad_norm": 75.24749755859375, "learning_rate": 1.403921004871895e-05, "loss": 9.6965, "step": 1988 }, { "epoch": 0.3886283704572098, "grad_norm": 19.189090728759766, "learning_rate": 1.4033419736369823e-05, "loss": 9.2351, "step": 1989 }, { "epoch": 0.38882375928096913, "grad_norm": 7.177303314208984, "learning_rate": 1.4027627808510866e-05, "loss": 9.4034, "step": 1990 }, { "epoch": 0.38901914810472843, "grad_norm": 7.710775375366211, "learning_rate": 1.4021834267461932e-05, "loss": 9.4265, "step": 1991 }, { "epoch": 0.3892145369284877, "grad_norm": 9.059228897094727, "learning_rate": 1.4016039115543508e-05, "loss": 9.1341, "step": 1992 }, { "epoch": 0.389409925752247, "grad_norm": 41.08426284790039, "learning_rate": 1.4010242355076733e-05, "loss": 9.6277, "step": 1993 }, { "epoch": 0.3896053145760063, "grad_norm": 8.896717071533203, "learning_rate": 1.4004443988383393e-05, "loss": 9.0697, "step": 1994 }, { "epoch": 0.3898007033997655, "grad_norm": 24.87018585205078, "learning_rate": 1.3998644017785912e-05, "loss": 9.1842, "step": 1995 }, { "epoch": 0.3899960922235248, "grad_norm": 10.012320518493652, "learning_rate": 1.3992842445607358e-05, "loss": 8.4979, "step": 1996 }, { "epoch": 0.39019148104728407, "grad_norm": 9.255597114562988, "learning_rate": 1.3987039274171441e-05, "loss": 9.1657, "step": 1997 }, { "epoch": 0.3903868698710434, "grad_norm": 10.156935691833496, "learning_rate": 1.3981234505802513e-05, "loss": 9.3369, "step": 1998 }, { "epoch": 0.3905822586948027, "grad_norm": 14.97067928314209, "learning_rate": 1.3975428142825562e-05, "loss": 10.339, "step": 1999 }, { "epoch": 0.3907776475185619, "grad_norm": 8.015941619873047, "learning_rate": 1.3969620187566216e-05, "loss": 10.3577, "step": 2000 }, { "epoch": 0.3909730363423212, "grad_norm": 8.700507164001465, "learning_rate": 1.3963810642350741e-05, "loss": 9.7226, "step": 2001 }, { "epoch": 0.3911684251660805, "grad_norm": 7.423156261444092, "learning_rate": 1.3957999509506044e-05, "loss": 8.3885, "step": 2002 }, { "epoch": 0.39136381398983977, "grad_norm": 10.251492500305176, "learning_rate": 1.395218679135966e-05, "loss": 10.1336, "step": 2003 }, { "epoch": 0.39155920281359907, "grad_norm": 8.170392990112305, "learning_rate": 1.3946372490239769e-05, "loss": 7.9025, "step": 2004 }, { "epoch": 0.3917545916373583, "grad_norm": 16.20621109008789, "learning_rate": 1.394055660847517e-05, "loss": 9.9719, "step": 2005 }, { "epoch": 0.3919499804611176, "grad_norm": 22.03818702697754, "learning_rate": 1.3934739148395311e-05, "loss": 8.8452, "step": 2006 }, { "epoch": 0.3921453692848769, "grad_norm": 24.292627334594727, "learning_rate": 1.3928920112330265e-05, "loss": 9.796, "step": 2007 }, { "epoch": 0.39234075810863617, "grad_norm": 9.747627258300781, "learning_rate": 1.3923099502610733e-05, "loss": 8.3834, "step": 2008 }, { "epoch": 0.39253614693239547, "grad_norm": 8.528168678283691, "learning_rate": 1.3917277321568052e-05, "loss": 9.0326, "step": 2009 }, { "epoch": 0.39273153575615477, "grad_norm": 10.370828628540039, "learning_rate": 1.3911453571534189e-05, "loss": 10.0083, "step": 2010 }, { "epoch": 0.392926924579914, "grad_norm": 9.663119316101074, "learning_rate": 1.390562825484173e-05, "loss": 8.2309, "step": 2011 }, { "epoch": 0.3931223134036733, "grad_norm": 9.895528793334961, "learning_rate": 1.3899801373823905e-05, "loss": 10.2185, "step": 2012 }, { "epoch": 0.39331770222743256, "grad_norm": 12.388712882995605, "learning_rate": 1.389397293081455e-05, "loss": 9.9484, "step": 2013 }, { "epoch": 0.39351309105119187, "grad_norm": 11.636601448059082, "learning_rate": 1.3888142928148143e-05, "loss": 8.8707, "step": 2014 }, { "epoch": 0.39370847987495117, "grad_norm": 11.408031463623047, "learning_rate": 1.3882311368159782e-05, "loss": 9.5439, "step": 2015 }, { "epoch": 0.3939038686987104, "grad_norm": 8.283649444580078, "learning_rate": 1.3876478253185183e-05, "loss": 8.3897, "step": 2016 }, { "epoch": 0.3940992575224697, "grad_norm": 11.562610626220703, "learning_rate": 1.3870643585560697e-05, "loss": 9.9744, "step": 2017 }, { "epoch": 0.394294646346229, "grad_norm": 9.294994354248047, "learning_rate": 1.3864807367623282e-05, "loss": 9.6295, "step": 2018 }, { "epoch": 0.39449003516998826, "grad_norm": 7.365604877471924, "learning_rate": 1.3858969601710533e-05, "loss": 9.4103, "step": 2019 }, { "epoch": 0.39468542399374756, "grad_norm": 10.17297077178955, "learning_rate": 1.3853130290160647e-05, "loss": 9.4888, "step": 2020 }, { "epoch": 0.39488081281750687, "grad_norm": 9.276739120483398, "learning_rate": 1.3847289435312455e-05, "loss": 9.889, "step": 2021 }, { "epoch": 0.3950762016412661, "grad_norm": 8.32766342163086, "learning_rate": 1.3841447039505398e-05, "loss": 8.3295, "step": 2022 }, { "epoch": 0.3952715904650254, "grad_norm": 9.893261909484863, "learning_rate": 1.383560310507954e-05, "loss": 9.938, "step": 2023 }, { "epoch": 0.39546697928878466, "grad_norm": 15.601113319396973, "learning_rate": 1.3829757634375556e-05, "loss": 9.2815, "step": 2024 }, { "epoch": 0.39566236811254396, "grad_norm": 13.371784210205078, "learning_rate": 1.3823910629734743e-05, "loss": 9.9588, "step": 2025 }, { "epoch": 0.39585775693630326, "grad_norm": 29.384428024291992, "learning_rate": 1.3818062093499e-05, "loss": 9.993, "step": 2026 }, { "epoch": 0.3960531457600625, "grad_norm": 9.52156925201416, "learning_rate": 1.3812212028010854e-05, "loss": 9.4732, "step": 2027 }, { "epoch": 0.3962485345838218, "grad_norm": 9.161752700805664, "learning_rate": 1.3806360435613433e-05, "loss": 9.4128, "step": 2028 }, { "epoch": 0.3964439234075811, "grad_norm": 13.252968788146973, "learning_rate": 1.3800507318650485e-05, "loss": 10.0995, "step": 2029 }, { "epoch": 0.39663931223134036, "grad_norm": 10.196971893310547, "learning_rate": 1.3794652679466363e-05, "loss": 9.456, "step": 2030 }, { "epoch": 0.39683470105509966, "grad_norm": 11.998358726501465, "learning_rate": 1.3788796520406033e-05, "loss": 9.2391, "step": 2031 }, { "epoch": 0.3970300898788589, "grad_norm": 8.397653579711914, "learning_rate": 1.3782938843815068e-05, "loss": 9.5083, "step": 2032 }, { "epoch": 0.3972254787026182, "grad_norm": 9.211454391479492, "learning_rate": 1.3777079652039649e-05, "loss": 8.3023, "step": 2033 }, { "epoch": 0.3974208675263775, "grad_norm": 17.611833572387695, "learning_rate": 1.3771218947426566e-05, "loss": 9.8341, "step": 2034 }, { "epoch": 0.39761625635013675, "grad_norm": 7.4314656257629395, "learning_rate": 1.3765356732323209e-05, "loss": 8.9265, "step": 2035 }, { "epoch": 0.39781164517389606, "grad_norm": 18.303077697753906, "learning_rate": 1.3759493009077583e-05, "loss": 9.1009, "step": 2036 }, { "epoch": 0.39800703399765536, "grad_norm": 15.656684875488281, "learning_rate": 1.3753627780038286e-05, "loss": 8.9401, "step": 2037 }, { "epoch": 0.3982024228214146, "grad_norm": 46.9091682434082, "learning_rate": 1.3747761047554531e-05, "loss": 10.1341, "step": 2038 }, { "epoch": 0.3983978116451739, "grad_norm": 8.730496406555176, "learning_rate": 1.3741892813976117e-05, "loss": 8.5978, "step": 2039 }, { "epoch": 0.39859320046893315, "grad_norm": 7.332820892333984, "learning_rate": 1.373602308165346e-05, "loss": 8.8639, "step": 2040 }, { "epoch": 0.39878858929269245, "grad_norm": 8.697312355041504, "learning_rate": 1.3730151852937567e-05, "loss": 8.9305, "step": 2041 }, { "epoch": 0.39898397811645175, "grad_norm": 8.44970703125, "learning_rate": 1.372427913018005e-05, "loss": 8.6963, "step": 2042 }, { "epoch": 0.399179366940211, "grad_norm": 9.921573638916016, "learning_rate": 1.371840491573312e-05, "loss": 9.7979, "step": 2043 }, { "epoch": 0.3993747557639703, "grad_norm": 20.099802017211914, "learning_rate": 1.3712529211949575e-05, "loss": 8.9746, "step": 2044 }, { "epoch": 0.3995701445877296, "grad_norm": 8.397631645202637, "learning_rate": 1.3706652021182818e-05, "loss": 9.6269, "step": 2045 }, { "epoch": 0.39976553341148885, "grad_norm": 14.477246284484863, "learning_rate": 1.370077334578685e-05, "loss": 9.4552, "step": 2046 }, { "epoch": 0.39996092223524815, "grad_norm": 18.402416229248047, "learning_rate": 1.369489318811626e-05, "loss": 8.1664, "step": 2047 }, { "epoch": 0.40015631105900745, "grad_norm": 56.74452590942383, "learning_rate": 1.3689011550526237e-05, "loss": 10.9544, "step": 2048 }, { "epoch": 0.4003516998827667, "grad_norm": 10.717354774475098, "learning_rate": 1.3683128435372558e-05, "loss": 9.1311, "step": 2049 }, { "epoch": 0.400547088706526, "grad_norm": 7.775894641876221, "learning_rate": 1.3677243845011589e-05, "loss": 9.6921, "step": 2050 }, { "epoch": 0.40074247753028525, "grad_norm": 8.910210609436035, "learning_rate": 1.36713577818003e-05, "loss": 9.7363, "step": 2051 }, { "epoch": 0.40093786635404455, "grad_norm": 7.493995189666748, "learning_rate": 1.3665470248096236e-05, "loss": 8.3686, "step": 2052 }, { "epoch": 0.40113325517780385, "grad_norm": 7.656559467315674, "learning_rate": 1.3659581246257533e-05, "loss": 8.1144, "step": 2053 }, { "epoch": 0.4013286440015631, "grad_norm": 7.61382532119751, "learning_rate": 1.365369077864293e-05, "loss": 8.3759, "step": 2054 }, { "epoch": 0.4015240328253224, "grad_norm": 7.527225971221924, "learning_rate": 1.3647798847611733e-05, "loss": 9.0688, "step": 2055 }, { "epoch": 0.4017194216490817, "grad_norm": 11.350974082946777, "learning_rate": 1.3641905455523846e-05, "loss": 10.2505, "step": 2056 }, { "epoch": 0.40191481047284094, "grad_norm": 9.301215171813965, "learning_rate": 1.3636010604739757e-05, "loss": 9.9992, "step": 2057 }, { "epoch": 0.40211019929660025, "grad_norm": 8.098173141479492, "learning_rate": 1.3630114297620535e-05, "loss": 9.7471, "step": 2058 }, { "epoch": 0.4023055881203595, "grad_norm": 7.174412727355957, "learning_rate": 1.362421653652784e-05, "loss": 9.3435, "step": 2059 }, { "epoch": 0.4025009769441188, "grad_norm": 11.994549751281738, "learning_rate": 1.3618317323823897e-05, "loss": 8.4048, "step": 2060 }, { "epoch": 0.4026963657678781, "grad_norm": 9.520535469055176, "learning_rate": 1.3612416661871532e-05, "loss": 8.5934, "step": 2061 }, { "epoch": 0.40289175459163734, "grad_norm": 11.950779914855957, "learning_rate": 1.360651455303414e-05, "loss": 9.5535, "step": 2062 }, { "epoch": 0.40308714341539664, "grad_norm": 8.245354652404785, "learning_rate": 1.3600610999675703e-05, "loss": 9.6193, "step": 2063 }, { "epoch": 0.40328253223915594, "grad_norm": 7.312138080596924, "learning_rate": 1.3594706004160772e-05, "loss": 9.3063, "step": 2064 }, { "epoch": 0.4034779210629152, "grad_norm": 8.881611824035645, "learning_rate": 1.3588799568854485e-05, "loss": 10.3356, "step": 2065 }, { "epoch": 0.4036733098866745, "grad_norm": 10.751225471496582, "learning_rate": 1.358289169612255e-05, "loss": 10.3858, "step": 2066 }, { "epoch": 0.40386869871043374, "grad_norm": 8.918109893798828, "learning_rate": 1.3576982388331258e-05, "loss": 9.4066, "step": 2067 }, { "epoch": 0.40406408753419304, "grad_norm": 8.959725379943848, "learning_rate": 1.3571071647847465e-05, "loss": 9.8884, "step": 2068 }, { "epoch": 0.40425947635795234, "grad_norm": 9.951619148254395, "learning_rate": 1.3565159477038608e-05, "loss": 8.6897, "step": 2069 }, { "epoch": 0.4044548651817116, "grad_norm": 16.672077178955078, "learning_rate": 1.3559245878272695e-05, "loss": 9.3297, "step": 2070 }, { "epoch": 0.4046502540054709, "grad_norm": 9.029973983764648, "learning_rate": 1.3553330853918305e-05, "loss": 10.0507, "step": 2071 }, { "epoch": 0.4048456428292302, "grad_norm": 9.713790893554688, "learning_rate": 1.354741440634459e-05, "loss": 10.0247, "step": 2072 }, { "epoch": 0.40504103165298944, "grad_norm": 7.559501647949219, "learning_rate": 1.3541496537921273e-05, "loss": 9.4727, "step": 2073 }, { "epoch": 0.40523642047674874, "grad_norm": 21.059707641601562, "learning_rate": 1.353557725101864e-05, "loss": 9.0248, "step": 2074 }, { "epoch": 0.405431809300508, "grad_norm": 7.639827728271484, "learning_rate": 1.352965654800755e-05, "loss": 9.2796, "step": 2075 }, { "epoch": 0.4056271981242673, "grad_norm": 8.415217399597168, "learning_rate": 1.3523734431259432e-05, "loss": 9.8698, "step": 2076 }, { "epoch": 0.4058225869480266, "grad_norm": 8.58553695678711, "learning_rate": 1.3517810903146273e-05, "loss": 8.7702, "step": 2077 }, { "epoch": 0.40601797577178583, "grad_norm": 7.218175888061523, "learning_rate": 1.3511885966040632e-05, "loss": 9.2946, "step": 2078 }, { "epoch": 0.40621336459554513, "grad_norm": 8.998862266540527, "learning_rate": 1.3505959622315632e-05, "loss": 9.6173, "step": 2079 }, { "epoch": 0.40640875341930444, "grad_norm": 27.732744216918945, "learning_rate": 1.3500031874344956e-05, "loss": 10.2917, "step": 2080 }, { "epoch": 0.4066041422430637, "grad_norm": 8.803112983703613, "learning_rate": 1.3494102724502849e-05, "loss": 10.2354, "step": 2081 }, { "epoch": 0.406799531066823, "grad_norm": 10.404772758483887, "learning_rate": 1.3488172175164122e-05, "loss": 9.5116, "step": 2082 }, { "epoch": 0.4069949198905823, "grad_norm": 8.896175384521484, "learning_rate": 1.3482240228704143e-05, "loss": 9.2856, "step": 2083 }, { "epoch": 0.40719030871434153, "grad_norm": 7.903563022613525, "learning_rate": 1.3476306887498842e-05, "loss": 8.9899, "step": 2084 }, { "epoch": 0.40738569753810083, "grad_norm": 9.580747604370117, "learning_rate": 1.3470372153924707e-05, "loss": 9.3801, "step": 2085 }, { "epoch": 0.4075810863618601, "grad_norm": 8.003578186035156, "learning_rate": 1.346443603035878e-05, "loss": 8.8302, "step": 2086 }, { "epoch": 0.4077764751856194, "grad_norm": 9.539640426635742, "learning_rate": 1.3458498519178665e-05, "loss": 9.8276, "step": 2087 }, { "epoch": 0.4079718640093787, "grad_norm": 9.275850296020508, "learning_rate": 1.3452559622762518e-05, "loss": 8.6797, "step": 2088 }, { "epoch": 0.40816725283313793, "grad_norm": 6.939610481262207, "learning_rate": 1.3446619343489053e-05, "loss": 8.6884, "step": 2089 }, { "epoch": 0.40836264165689723, "grad_norm": 12.585261344909668, "learning_rate": 1.3440677683737538e-05, "loss": 10.2965, "step": 2090 }, { "epoch": 0.40855803048065653, "grad_norm": 7.093527793884277, "learning_rate": 1.3434734645887788e-05, "loss": 8.9571, "step": 2091 }, { "epoch": 0.4087534193044158, "grad_norm": 7.72236967086792, "learning_rate": 1.3428790232320177e-05, "loss": 8.7031, "step": 2092 }, { "epoch": 0.4089488081281751, "grad_norm": 7.893250465393066, "learning_rate": 1.3422844445415628e-05, "loss": 8.4756, "step": 2093 }, { "epoch": 0.4091441969519343, "grad_norm": 10.6410551071167, "learning_rate": 1.341689728755561e-05, "loss": 9.0713, "step": 2094 }, { "epoch": 0.4093395857756936, "grad_norm": 9.671658515930176, "learning_rate": 1.3410948761122145e-05, "loss": 10.2595, "step": 2095 }, { "epoch": 0.40953497459945293, "grad_norm": 13.780243873596191, "learning_rate": 1.3404998868497803e-05, "loss": 9.4971, "step": 2096 }, { "epoch": 0.4097303634232122, "grad_norm": 22.337764739990234, "learning_rate": 1.3399047612065703e-05, "loss": 8.2947, "step": 2097 }, { "epoch": 0.4099257522469715, "grad_norm": 8.596807479858398, "learning_rate": 1.3393094994209505e-05, "loss": 8.9651, "step": 2098 }, { "epoch": 0.4101211410707308, "grad_norm": 8.460862159729004, "learning_rate": 1.338714101731342e-05, "loss": 8.8179, "step": 2099 }, { "epoch": 0.41031652989449, "grad_norm": 8.91547679901123, "learning_rate": 1.3381185683762197e-05, "loss": 9.2624, "step": 2100 }, { "epoch": 0.4105119187182493, "grad_norm": 8.833627700805664, "learning_rate": 1.3375228995941135e-05, "loss": 9.1694, "step": 2101 }, { "epoch": 0.41070730754200857, "grad_norm": 8.893487930297852, "learning_rate": 1.336927095623607e-05, "loss": 9.4226, "step": 2102 }, { "epoch": 0.41090269636576787, "grad_norm": 25.64516830444336, "learning_rate": 1.3363311567033384e-05, "loss": 9.3881, "step": 2103 }, { "epoch": 0.4110980851895272, "grad_norm": 9.39368724822998, "learning_rate": 1.3357350830719997e-05, "loss": 9.8792, "step": 2104 }, { "epoch": 0.4112934740132864, "grad_norm": 7.454868316650391, "learning_rate": 1.3351388749683369e-05, "loss": 9.0325, "step": 2105 }, { "epoch": 0.4114888628370457, "grad_norm": 13.555609703063965, "learning_rate": 1.33454253263115e-05, "loss": 9.6081, "step": 2106 }, { "epoch": 0.411684251660805, "grad_norm": 8.101183891296387, "learning_rate": 1.3339460562992922e-05, "loss": 10.0975, "step": 2107 }, { "epoch": 0.41187964048456427, "grad_norm": 11.35245418548584, "learning_rate": 1.333349446211671e-05, "loss": 10.0821, "step": 2108 }, { "epoch": 0.41207502930832357, "grad_norm": 10.807251930236816, "learning_rate": 1.3327527026072473e-05, "loss": 10.1509, "step": 2109 }, { "epoch": 0.41227041813208287, "grad_norm": 6.899622440338135, "learning_rate": 1.3321558257250355e-05, "loss": 8.7787, "step": 2110 }, { "epoch": 0.4124658069558421, "grad_norm": 9.099929809570312, "learning_rate": 1.3315588158041032e-05, "loss": 9.8039, "step": 2111 }, { "epoch": 0.4126611957796014, "grad_norm": 7.551453590393066, "learning_rate": 1.3309616730835715e-05, "loss": 9.2101, "step": 2112 }, { "epoch": 0.41285658460336067, "grad_norm": 11.950295448303223, "learning_rate": 1.3303643978026145e-05, "loss": 10.7997, "step": 2113 }, { "epoch": 0.41305197342711997, "grad_norm": 8.216484069824219, "learning_rate": 1.3297669902004598e-05, "loss": 9.7015, "step": 2114 }, { "epoch": 0.41324736225087927, "grad_norm": 7.783416748046875, "learning_rate": 1.3291694505163872e-05, "loss": 8.1667, "step": 2115 }, { "epoch": 0.4134427510746385, "grad_norm": 7.72036600112915, "learning_rate": 1.32857177898973e-05, "loss": 9.1759, "step": 2116 }, { "epoch": 0.4136381398983978, "grad_norm": 9.248005867004395, "learning_rate": 1.3279739758598746e-05, "loss": 9.5641, "step": 2117 }, { "epoch": 0.4138335287221571, "grad_norm": 8.15285587310791, "learning_rate": 1.3273760413662596e-05, "loss": 8.9547, "step": 2118 }, { "epoch": 0.41402891754591636, "grad_norm": 9.635090827941895, "learning_rate": 1.3267779757483761e-05, "loss": 9.9121, "step": 2119 }, { "epoch": 0.41422430636967567, "grad_norm": 8.915787696838379, "learning_rate": 1.3261797792457678e-05, "loss": 9.6434, "step": 2120 }, { "epoch": 0.4144196951934349, "grad_norm": 7.4869303703308105, "learning_rate": 1.3255814520980312e-05, "loss": 9.0176, "step": 2121 }, { "epoch": 0.4146150840171942, "grad_norm": 13.5423002243042, "learning_rate": 1.3249829945448151e-05, "loss": 9.7015, "step": 2122 }, { "epoch": 0.4148104728409535, "grad_norm": 11.67705249786377, "learning_rate": 1.32438440682582e-05, "loss": 10.2476, "step": 2123 }, { "epoch": 0.41500586166471276, "grad_norm": 9.512430191040039, "learning_rate": 1.3237856891807992e-05, "loss": 9.7233, "step": 2124 }, { "epoch": 0.41520125048847206, "grad_norm": 88.60699462890625, "learning_rate": 1.3231868418495573e-05, "loss": 11.1191, "step": 2125 }, { "epoch": 0.41539663931223136, "grad_norm": 7.701420307159424, "learning_rate": 1.3225878650719514e-05, "loss": 9.1693, "step": 2126 }, { "epoch": 0.4155920281359906, "grad_norm": 9.494232177734375, "learning_rate": 1.3219887590878903e-05, "loss": 9.4935, "step": 2127 }, { "epoch": 0.4157874169597499, "grad_norm": 8.114104270935059, "learning_rate": 1.3213895241373348e-05, "loss": 9.3104, "step": 2128 }, { "epoch": 0.41598280578350916, "grad_norm": 8.193770408630371, "learning_rate": 1.320790160460297e-05, "loss": 10.0441, "step": 2129 }, { "epoch": 0.41617819460726846, "grad_norm": 9.657182693481445, "learning_rate": 1.3201906682968404e-05, "loss": 9.3909, "step": 2130 }, { "epoch": 0.41637358343102776, "grad_norm": 9.406392097473145, "learning_rate": 1.3195910478870804e-05, "loss": 9.7109, "step": 2131 }, { "epoch": 0.416568972254787, "grad_norm": 7.698124885559082, "learning_rate": 1.3189912994711836e-05, "loss": 8.3727, "step": 2132 }, { "epoch": 0.4167643610785463, "grad_norm": 10.945155143737793, "learning_rate": 1.3183914232893675e-05, "loss": 9.561, "step": 2133 }, { "epoch": 0.4169597499023056, "grad_norm": 8.722308158874512, "learning_rate": 1.3177914195819018e-05, "loss": 10.1848, "step": 2134 }, { "epoch": 0.41715513872606486, "grad_norm": 10.87885856628418, "learning_rate": 1.3171912885891063e-05, "loss": 9.3673, "step": 2135 }, { "epoch": 0.41735052754982416, "grad_norm": 7.66211462020874, "learning_rate": 1.3165910305513521e-05, "loss": 9.4162, "step": 2136 }, { "epoch": 0.4175459163735834, "grad_norm": 11.434732437133789, "learning_rate": 1.3159906457090607e-05, "loss": 9.6539, "step": 2137 }, { "epoch": 0.4177413051973427, "grad_norm": 38.60456085205078, "learning_rate": 1.3153901343027058e-05, "loss": 8.932, "step": 2138 }, { "epoch": 0.417936694021102, "grad_norm": 9.288236618041992, "learning_rate": 1.3147894965728103e-05, "loss": 9.7608, "step": 2139 }, { "epoch": 0.41813208284486125, "grad_norm": 9.36833667755127, "learning_rate": 1.314188732759948e-05, "loss": 9.2929, "step": 2140 }, { "epoch": 0.41832747166862055, "grad_norm": 9.080362319946289, "learning_rate": 1.3135878431047442e-05, "loss": 9.847, "step": 2141 }, { "epoch": 0.41852286049237986, "grad_norm": 8.36670970916748, "learning_rate": 1.3129868278478733e-05, "loss": 9.0836, "step": 2142 }, { "epoch": 0.4187182493161391, "grad_norm": 8.57102108001709, "learning_rate": 1.3123856872300607e-05, "loss": 9.3054, "step": 2143 }, { "epoch": 0.4189136381398984, "grad_norm": 7.616203308105469, "learning_rate": 1.3117844214920818e-05, "loss": 8.6007, "step": 2144 }, { "epoch": 0.4191090269636577, "grad_norm": 6.933886528015137, "learning_rate": 1.3111830308747627e-05, "loss": 7.9322, "step": 2145 }, { "epoch": 0.41930441578741695, "grad_norm": 10.009140968322754, "learning_rate": 1.3105815156189782e-05, "loss": 8.7556, "step": 2146 }, { "epoch": 0.41949980461117625, "grad_norm": 9.128080368041992, "learning_rate": 1.3099798759656546e-05, "loss": 9.5527, "step": 2147 }, { "epoch": 0.4196951934349355, "grad_norm": 10.222684860229492, "learning_rate": 1.3093781121557667e-05, "loss": 9.1902, "step": 2148 }, { "epoch": 0.4198905822586948, "grad_norm": 10.032594680786133, "learning_rate": 1.3087762244303399e-05, "loss": 8.6057, "step": 2149 }, { "epoch": 0.4200859710824541, "grad_norm": 8.646735191345215, "learning_rate": 1.308174213030449e-05, "loss": 9.2424, "step": 2150 }, { "epoch": 0.42028135990621335, "grad_norm": 8.629133224487305, "learning_rate": 1.3075720781972176e-05, "loss": 10.0302, "step": 2151 }, { "epoch": 0.42047674872997265, "grad_norm": 10.1091890335083, "learning_rate": 1.3069698201718202e-05, "loss": 9.4035, "step": 2152 }, { "epoch": 0.42067213755373195, "grad_norm": 9.671299934387207, "learning_rate": 1.3063674391954795e-05, "loss": 10.2995, "step": 2153 }, { "epoch": 0.4208675263774912, "grad_norm": 7.490504264831543, "learning_rate": 1.3057649355094678e-05, "loss": 9.9019, "step": 2154 }, { "epoch": 0.4210629152012505, "grad_norm": 8.988183975219727, "learning_rate": 1.3051623093551062e-05, "loss": 8.8773, "step": 2155 }, { "epoch": 0.42125830402500974, "grad_norm": 11.667671203613281, "learning_rate": 1.3045595609737659e-05, "loss": 10.0266, "step": 2156 }, { "epoch": 0.42145369284876905, "grad_norm": 15.264968872070312, "learning_rate": 1.3039566906068657e-05, "loss": 9.9159, "step": 2157 }, { "epoch": 0.42164908167252835, "grad_norm": 11.067122459411621, "learning_rate": 1.3033536984958744e-05, "loss": 10.0485, "step": 2158 }, { "epoch": 0.4218444704962876, "grad_norm": 7.868844032287598, "learning_rate": 1.3027505848823086e-05, "loss": 9.4467, "step": 2159 }, { "epoch": 0.4220398593200469, "grad_norm": 7.70686149597168, "learning_rate": 1.3021473500077342e-05, "loss": 8.182, "step": 2160 }, { "epoch": 0.4222352481438062, "grad_norm": 7.336475849151611, "learning_rate": 1.301543994113766e-05, "loss": 8.5334, "step": 2161 }, { "epoch": 0.42243063696756544, "grad_norm": 9.529787063598633, "learning_rate": 1.300940517442066e-05, "loss": 10.0935, "step": 2162 }, { "epoch": 0.42262602579132474, "grad_norm": 9.209541320800781, "learning_rate": 1.3003369202343454e-05, "loss": 9.6814, "step": 2163 }, { "epoch": 0.422821414615084, "grad_norm": 8.053021430969238, "learning_rate": 1.299733202732364e-05, "loss": 8.7216, "step": 2164 }, { "epoch": 0.4230168034388433, "grad_norm": 8.257932662963867, "learning_rate": 1.2991293651779296e-05, "loss": 9.8704, "step": 2165 }, { "epoch": 0.4232121922626026, "grad_norm": 10.342300415039062, "learning_rate": 1.298525407812897e-05, "loss": 9.5254, "step": 2166 }, { "epoch": 0.42340758108636184, "grad_norm": 10.803786277770996, "learning_rate": 1.2979213308791709e-05, "loss": 9.3382, "step": 2167 }, { "epoch": 0.42360296991012114, "grad_norm": 7.010225296020508, "learning_rate": 1.2973171346187021e-05, "loss": 8.1361, "step": 2168 }, { "epoch": 0.42379835873388044, "grad_norm": 8.592170715332031, "learning_rate": 1.2967128192734903e-05, "loss": 9.1814, "step": 2169 }, { "epoch": 0.4239937475576397, "grad_norm": 11.579129219055176, "learning_rate": 1.2961083850855826e-05, "loss": 9.909, "step": 2170 }, { "epoch": 0.424189136381399, "grad_norm": 7.7738118171691895, "learning_rate": 1.2955038322970737e-05, "loss": 8.9426, "step": 2171 }, { "epoch": 0.4243845252051583, "grad_norm": 9.89653205871582, "learning_rate": 1.2948991611501053e-05, "loss": 10.6861, "step": 2172 }, { "epoch": 0.42457991402891754, "grad_norm": 7.241786479949951, "learning_rate": 1.2942943718868675e-05, "loss": 8.2366, "step": 2173 }, { "epoch": 0.42477530285267684, "grad_norm": 9.199349403381348, "learning_rate": 1.293689464749597e-05, "loss": 8.7557, "step": 2174 }, { "epoch": 0.4249706916764361, "grad_norm": 6.656001567840576, "learning_rate": 1.293084439980578e-05, "loss": 8.5495, "step": 2175 }, { "epoch": 0.4251660805001954, "grad_norm": 9.568596839904785, "learning_rate": 1.2924792978221414e-05, "loss": 10.432, "step": 2176 }, { "epoch": 0.4253614693239547, "grad_norm": 7.979373931884766, "learning_rate": 1.2918740385166655e-05, "loss": 8.3202, "step": 2177 }, { "epoch": 0.42555685814771393, "grad_norm": 8.833368301391602, "learning_rate": 1.2912686623065755e-05, "loss": 9.2741, "step": 2178 }, { "epoch": 0.42575224697147324, "grad_norm": 6.991685390472412, "learning_rate": 1.2906631694343436e-05, "loss": 8.5693, "step": 2179 }, { "epoch": 0.42594763579523254, "grad_norm": 10.28195571899414, "learning_rate": 1.2900575601424883e-05, "loss": 10.0393, "step": 2180 }, { "epoch": 0.4261430246189918, "grad_norm": 8.920546531677246, "learning_rate": 1.289451834673575e-05, "loss": 9.9295, "step": 2181 }, { "epoch": 0.4263384134427511, "grad_norm": 8.196661949157715, "learning_rate": 1.288845993270215e-05, "loss": 8.2215, "step": 2182 }, { "epoch": 0.42653380226651033, "grad_norm": 9.465472221374512, "learning_rate": 1.2882400361750676e-05, "loss": 8.8123, "step": 2183 }, { "epoch": 0.42672919109026963, "grad_norm": 8.361612319946289, "learning_rate": 1.2876339636308366e-05, "loss": 8.811, "step": 2184 }, { "epoch": 0.42692457991402893, "grad_norm": 8.781848907470703, "learning_rate": 1.287027775880273e-05, "loss": 9.1459, "step": 2185 }, { "epoch": 0.4271199687377882, "grad_norm": 10.067546844482422, "learning_rate": 1.2864214731661743e-05, "loss": 9.4827, "step": 2186 }, { "epoch": 0.4273153575615475, "grad_norm": 8.98030948638916, "learning_rate": 1.2858150557313832e-05, "loss": 9.0296, "step": 2187 }, { "epoch": 0.4275107463853068, "grad_norm": 8.424452781677246, "learning_rate": 1.285208523818789e-05, "loss": 8.3527, "step": 2188 }, { "epoch": 0.42770613520906603, "grad_norm": 12.284327507019043, "learning_rate": 1.2846018776713257e-05, "loss": 9.3219, "step": 2189 }, { "epoch": 0.42790152403282533, "grad_norm": 7.31266975402832, "learning_rate": 1.2839951175319752e-05, "loss": 8.6924, "step": 2190 }, { "epoch": 0.4280969128565846, "grad_norm": 8.193886756896973, "learning_rate": 1.2833882436437626e-05, "loss": 8.9165, "step": 2191 }, { "epoch": 0.4282923016803439, "grad_norm": 9.161237716674805, "learning_rate": 1.2827812562497603e-05, "loss": 9.3125, "step": 2192 }, { "epoch": 0.4284876905041032, "grad_norm": 8.897019386291504, "learning_rate": 1.2821741555930856e-05, "loss": 9.0099, "step": 2193 }, { "epoch": 0.4286830793278624, "grad_norm": 8.073427200317383, "learning_rate": 1.2815669419169013e-05, "loss": 9.546, "step": 2194 }, { "epoch": 0.42887846815162173, "grad_norm": 26.498016357421875, "learning_rate": 1.2809596154644148e-05, "loss": 9.2152, "step": 2195 }, { "epoch": 0.42907385697538103, "grad_norm": 9.393672943115234, "learning_rate": 1.2803521764788795e-05, "loss": 8.4541, "step": 2196 }, { "epoch": 0.4292692457991403, "grad_norm": 8.790255546569824, "learning_rate": 1.279744625203594e-05, "loss": 10.2608, "step": 2197 }, { "epoch": 0.4294646346228996, "grad_norm": 8.156511306762695, "learning_rate": 1.2791369618819008e-05, "loss": 7.7263, "step": 2198 }, { "epoch": 0.4296600234466589, "grad_norm": 6.234302997589111, "learning_rate": 1.2785291867571883e-05, "loss": 8.707, "step": 2199 }, { "epoch": 0.4298554122704181, "grad_norm": 10.79444408416748, "learning_rate": 1.2779213000728889e-05, "loss": 10.0896, "step": 2200 }, { "epoch": 0.4300508010941774, "grad_norm": 9.629950523376465, "learning_rate": 1.2773133020724804e-05, "loss": 10.357, "step": 2201 }, { "epoch": 0.43024618991793667, "grad_norm": 9.38863754272461, "learning_rate": 1.2767051929994848e-05, "loss": 9.3469, "step": 2202 }, { "epoch": 0.430441578741696, "grad_norm": 7.55531120300293, "learning_rate": 1.2760969730974692e-05, "loss": 9.691, "step": 2203 }, { "epoch": 0.4306369675654553, "grad_norm": 7.783170223236084, "learning_rate": 1.2754886426100436e-05, "loss": 9.4499, "step": 2204 }, { "epoch": 0.4308323563892145, "grad_norm": 7.9393086433410645, "learning_rate": 1.2748802017808637e-05, "loss": 10.4184, "step": 2205 }, { "epoch": 0.4310277452129738, "grad_norm": 8.78271770477295, "learning_rate": 1.274271650853629e-05, "loss": 8.9384, "step": 2206 }, { "epoch": 0.4312231340367331, "grad_norm": 38.12883377075195, "learning_rate": 1.2736629900720832e-05, "loss": 9.033, "step": 2207 }, { "epoch": 0.43141852286049237, "grad_norm": 9.394990921020508, "learning_rate": 1.2730542196800132e-05, "loss": 9.2351, "step": 2208 }, { "epoch": 0.43161391168425167, "grad_norm": 9.552518844604492, "learning_rate": 1.2724453399212513e-05, "loss": 8.9445, "step": 2209 }, { "epoch": 0.4318093005080109, "grad_norm": 8.833136558532715, "learning_rate": 1.2718363510396721e-05, "loss": 9.0274, "step": 2210 }, { "epoch": 0.4320046893317702, "grad_norm": 8.848443984985352, "learning_rate": 1.2712272532791947e-05, "loss": 10.6383, "step": 2211 }, { "epoch": 0.4322000781555295, "grad_norm": 8.959117889404297, "learning_rate": 1.270618046883782e-05, "loss": 9.0019, "step": 2212 }, { "epoch": 0.43239546697928877, "grad_norm": 8.169061660766602, "learning_rate": 1.2700087320974394e-05, "loss": 8.4105, "step": 2213 }, { "epoch": 0.43259085580304807, "grad_norm": 7.978032112121582, "learning_rate": 1.269399309164217e-05, "loss": 8.4926, "step": 2214 }, { "epoch": 0.43278624462680737, "grad_norm": 8.165763854980469, "learning_rate": 1.2687897783282072e-05, "loss": 8.1933, "step": 2215 }, { "epoch": 0.4329816334505666, "grad_norm": 8.540172576904297, "learning_rate": 1.2681801398335467e-05, "loss": 9.6549, "step": 2216 }, { "epoch": 0.4331770222743259, "grad_norm": 20.1662654876709, "learning_rate": 1.267570393924414e-05, "loss": 9.2204, "step": 2217 }, { "epoch": 0.43337241109808516, "grad_norm": 20.048967361450195, "learning_rate": 1.2669605408450314e-05, "loss": 9.9814, "step": 2218 }, { "epoch": 0.43356779992184447, "grad_norm": 9.27750015258789, "learning_rate": 1.2663505808396639e-05, "loss": 9.7194, "step": 2219 }, { "epoch": 0.43376318874560377, "grad_norm": 8.13779354095459, "learning_rate": 1.2657405141526196e-05, "loss": 9.7392, "step": 2220 }, { "epoch": 0.433958577569363, "grad_norm": 12.067617416381836, "learning_rate": 1.2651303410282489e-05, "loss": 9.1701, "step": 2221 }, { "epoch": 0.4341539663931223, "grad_norm": 5.860738277435303, "learning_rate": 1.2645200617109456e-05, "loss": 8.9809, "step": 2222 }, { "epoch": 0.4343493552168816, "grad_norm": 19.50615119934082, "learning_rate": 1.2639096764451451e-05, "loss": 10.4136, "step": 2223 }, { "epoch": 0.43454474404064086, "grad_norm": 9.074341773986816, "learning_rate": 1.2632991854753257e-05, "loss": 9.2914, "step": 2224 }, { "epoch": 0.43474013286440016, "grad_norm": 10.70407485961914, "learning_rate": 1.2626885890460079e-05, "loss": 10.488, "step": 2225 }, { "epoch": 0.4349355216881594, "grad_norm": 40.406402587890625, "learning_rate": 1.262077887401755e-05, "loss": 9.7503, "step": 2226 }, { "epoch": 0.4351309105119187, "grad_norm": 10.195911407470703, "learning_rate": 1.2614670807871711e-05, "loss": 9.8839, "step": 2227 }, { "epoch": 0.435326299335678, "grad_norm": 7.769820690155029, "learning_rate": 1.2608561694469042e-05, "loss": 8.9693, "step": 2228 }, { "epoch": 0.43552168815943726, "grad_norm": 10.964240074157715, "learning_rate": 1.2602451536256426e-05, "loss": 9.7091, "step": 2229 }, { "epoch": 0.43571707698319656, "grad_norm": 10.297403335571289, "learning_rate": 1.2596340335681174e-05, "loss": 9.5607, "step": 2230 }, { "epoch": 0.43591246580695586, "grad_norm": 8.6085205078125, "learning_rate": 1.2590228095191011e-05, "loss": 9.6563, "step": 2231 }, { "epoch": 0.4361078546307151, "grad_norm": 7.507328987121582, "learning_rate": 1.2584114817234076e-05, "loss": 8.9352, "step": 2232 }, { "epoch": 0.4363032434544744, "grad_norm": 10.990617752075195, "learning_rate": 1.2578000504258931e-05, "loss": 9.4201, "step": 2233 }, { "epoch": 0.4364986322782337, "grad_norm": 8.079955101013184, "learning_rate": 1.2571885158714545e-05, "loss": 10.1628, "step": 2234 }, { "epoch": 0.43669402110199296, "grad_norm": 10.56502628326416, "learning_rate": 1.2565768783050304e-05, "loss": 9.422, "step": 2235 }, { "epoch": 0.43688940992575226, "grad_norm": 8.347862243652344, "learning_rate": 1.255965137971601e-05, "loss": 10.2135, "step": 2236 }, { "epoch": 0.4370847987495115, "grad_norm": 13.142292022705078, "learning_rate": 1.255353295116187e-05, "loss": 9.6481, "step": 2237 }, { "epoch": 0.4372801875732708, "grad_norm": 7.691909313201904, "learning_rate": 1.2547413499838506e-05, "loss": 9.0773, "step": 2238 }, { "epoch": 0.4374755763970301, "grad_norm": 14.73984146118164, "learning_rate": 1.2541293028196946e-05, "loss": 9.5216, "step": 2239 }, { "epoch": 0.43767096522078935, "grad_norm": 9.408361434936523, "learning_rate": 1.2535171538688632e-05, "loss": 9.2028, "step": 2240 }, { "epoch": 0.43786635404454866, "grad_norm": 10.333508491516113, "learning_rate": 1.2529049033765405e-05, "loss": 10.0852, "step": 2241 }, { "epoch": 0.43806174286830796, "grad_norm": 9.904654502868652, "learning_rate": 1.2522925515879525e-05, "loss": 10.235, "step": 2242 }, { "epoch": 0.4382571316920672, "grad_norm": 10.235962867736816, "learning_rate": 1.2516800987483651e-05, "loss": 10.2423, "step": 2243 }, { "epoch": 0.4384525205158265, "grad_norm": 9.246352195739746, "learning_rate": 1.251067545103084e-05, "loss": 9.4063, "step": 2244 }, { "epoch": 0.43864790933958575, "grad_norm": 28.940309524536133, "learning_rate": 1.2504548908974562e-05, "loss": 10.0929, "step": 2245 }, { "epoch": 0.43884329816334505, "grad_norm": 8.908502578735352, "learning_rate": 1.2498421363768692e-05, "loss": 9.6554, "step": 2246 }, { "epoch": 0.43903868698710435, "grad_norm": 8.206972122192383, "learning_rate": 1.2492292817867498e-05, "loss": 8.6696, "step": 2247 }, { "epoch": 0.4392340758108636, "grad_norm": 10.232550621032715, "learning_rate": 1.248616327372565e-05, "loss": 9.515, "step": 2248 }, { "epoch": 0.4394294646346229, "grad_norm": 8.533775329589844, "learning_rate": 1.2480032733798229e-05, "loss": 9.4387, "step": 2249 }, { "epoch": 0.4396248534583822, "grad_norm": 10.445351600646973, "learning_rate": 1.2473901200540699e-05, "loss": 10.2788, "step": 2250 }, { "epoch": 0.43982024228214145, "grad_norm": 11.097278594970703, "learning_rate": 1.2467768676408936e-05, "loss": 9.391, "step": 2251 }, { "epoch": 0.44001563110590075, "grad_norm": 10.348945617675781, "learning_rate": 1.24616351638592e-05, "loss": 9.4205, "step": 2252 }, { "epoch": 0.44021101992966, "grad_norm": 12.105810165405273, "learning_rate": 1.2455500665348154e-05, "loss": 10.2328, "step": 2253 }, { "epoch": 0.4404064087534193, "grad_norm": 14.303231239318848, "learning_rate": 1.2449365183332862e-05, "loss": 9.1009, "step": 2254 }, { "epoch": 0.4406017975771786, "grad_norm": 8.684207916259766, "learning_rate": 1.2443228720270768e-05, "loss": 8.7528, "step": 2255 }, { "epoch": 0.44079718640093785, "grad_norm": 10.567802429199219, "learning_rate": 1.243709127861972e-05, "loss": 9.976, "step": 2256 }, { "epoch": 0.44099257522469715, "grad_norm": 7.749636173248291, "learning_rate": 1.2430952860837948e-05, "loss": 9.5294, "step": 2257 }, { "epoch": 0.44118796404845645, "grad_norm": 13.847437858581543, "learning_rate": 1.2424813469384084e-05, "loss": 11.1775, "step": 2258 }, { "epoch": 0.4413833528722157, "grad_norm": 7.527254104614258, "learning_rate": 1.2418673106717145e-05, "loss": 9.2885, "step": 2259 }, { "epoch": 0.441578741695975, "grad_norm": 8.540417671203613, "learning_rate": 1.2412531775296534e-05, "loss": 8.9638, "step": 2260 }, { "epoch": 0.4417741305197343, "grad_norm": 8.876973152160645, "learning_rate": 1.2406389477582048e-05, "loss": 10.072, "step": 2261 }, { "epoch": 0.44196951934349354, "grad_norm": 8.543230056762695, "learning_rate": 1.2400246216033867e-05, "loss": 8.6821, "step": 2262 }, { "epoch": 0.44216490816725285, "grad_norm": 12.249687194824219, "learning_rate": 1.2394101993112558e-05, "loss": 9.7186, "step": 2263 }, { "epoch": 0.4423602969910121, "grad_norm": 7.321566581726074, "learning_rate": 1.2387956811279069e-05, "loss": 9.8925, "step": 2264 }, { "epoch": 0.4425556858147714, "grad_norm": 6.195285797119141, "learning_rate": 1.2381810672994742e-05, "loss": 9.0145, "step": 2265 }, { "epoch": 0.4427510746385307, "grad_norm": 9.123198509216309, "learning_rate": 1.2375663580721296e-05, "loss": 9.6302, "step": 2266 }, { "epoch": 0.44294646346228994, "grad_norm": 10.450803756713867, "learning_rate": 1.2369515536920826e-05, "loss": 10.1972, "step": 2267 }, { "epoch": 0.44314185228604924, "grad_norm": 8.73688793182373, "learning_rate": 1.2363366544055821e-05, "loss": 9.3621, "step": 2268 }, { "epoch": 0.44333724110980854, "grad_norm": 8.812823295593262, "learning_rate": 1.2357216604589139e-05, "loss": 9.6422, "step": 2269 }, { "epoch": 0.4435326299335678, "grad_norm": 7.929076194763184, "learning_rate": 1.2351065720984027e-05, "loss": 9.6526, "step": 2270 }, { "epoch": 0.4437280187573271, "grad_norm": 7.984181880950928, "learning_rate": 1.2344913895704099e-05, "loss": 9.0684, "step": 2271 }, { "epoch": 0.44392340758108634, "grad_norm": 9.382901191711426, "learning_rate": 1.2338761131213357e-05, "loss": 9.6309, "step": 2272 }, { "epoch": 0.44411879640484564, "grad_norm": 8.028267860412598, "learning_rate": 1.2332607429976169e-05, "loss": 8.8354, "step": 2273 }, { "epoch": 0.44431418522860494, "grad_norm": 8.45240306854248, "learning_rate": 1.2326452794457289e-05, "loss": 10.0709, "step": 2274 }, { "epoch": 0.4445095740523642, "grad_norm": 7.99829626083374, "learning_rate": 1.2320297227121834e-05, "loss": 9.2649, "step": 2275 }, { "epoch": 0.4447049628761235, "grad_norm": 7.859771728515625, "learning_rate": 1.2314140730435305e-05, "loss": 8.5114, "step": 2276 }, { "epoch": 0.4449003516998828, "grad_norm": 11.960403442382812, "learning_rate": 1.2307983306863565e-05, "loss": 10.3548, "step": 2277 }, { "epoch": 0.44509574052364204, "grad_norm": 7.42790412902832, "learning_rate": 1.2301824958872857e-05, "loss": 9.0629, "step": 2278 }, { "epoch": 0.44529112934740134, "grad_norm": 7.955683708190918, "learning_rate": 1.229566568892979e-05, "loss": 8.9718, "step": 2279 }, { "epoch": 0.4454865181711606, "grad_norm": 14.33918571472168, "learning_rate": 1.2289505499501341e-05, "loss": 9.9159, "step": 2280 }, { "epoch": 0.4456819069949199, "grad_norm": 9.731377601623535, "learning_rate": 1.2283344393054862e-05, "loss": 9.4328, "step": 2281 }, { "epoch": 0.4458772958186792, "grad_norm": 10.249810218811035, "learning_rate": 1.227718237205806e-05, "loss": 9.5776, "step": 2282 }, { "epoch": 0.44607268464243843, "grad_norm": 7.93415641784668, "learning_rate": 1.2271019438979023e-05, "loss": 9.3234, "step": 2283 }, { "epoch": 0.44626807346619773, "grad_norm": 10.255139350891113, "learning_rate": 1.226485559628619e-05, "loss": 9.2531, "step": 2284 }, { "epoch": 0.44646346228995704, "grad_norm": 10.489916801452637, "learning_rate": 1.2258690846448382e-05, "loss": 9.2626, "step": 2285 }, { "epoch": 0.4466588511137163, "grad_norm": 8.567852973937988, "learning_rate": 1.225252519193476e-05, "loss": 9.6984, "step": 2286 }, { "epoch": 0.4468542399374756, "grad_norm": 9.522313117980957, "learning_rate": 1.2246358635214868e-05, "loss": 9.6132, "step": 2287 }, { "epoch": 0.44704962876123483, "grad_norm": 8.825810432434082, "learning_rate": 1.2240191178758598e-05, "loss": 9.7715, "step": 2288 }, { "epoch": 0.44724501758499413, "grad_norm": 9.867698669433594, "learning_rate": 1.2234022825036213e-05, "loss": 10.4578, "step": 2289 }, { "epoch": 0.44744040640875343, "grad_norm": 7.061227798461914, "learning_rate": 1.2227853576518329e-05, "loss": 9.2461, "step": 2290 }, { "epoch": 0.4476357952325127, "grad_norm": 9.363693237304688, "learning_rate": 1.222168343567592e-05, "loss": 10.1934, "step": 2291 }, { "epoch": 0.447831184056272, "grad_norm": 7.705371856689453, "learning_rate": 1.2215512404980321e-05, "loss": 9.0384, "step": 2292 }, { "epoch": 0.4480265728800313, "grad_norm": 10.034061431884766, "learning_rate": 1.2209340486903218e-05, "loss": 9.4011, "step": 2293 }, { "epoch": 0.44822196170379053, "grad_norm": 9.613932609558105, "learning_rate": 1.2203167683916658e-05, "loss": 9.2143, "step": 2294 }, { "epoch": 0.44841735052754983, "grad_norm": 10.576947212219238, "learning_rate": 1.2196993998493043e-05, "loss": 9.119, "step": 2295 }, { "epoch": 0.44861273935130913, "grad_norm": 7.123161315917969, "learning_rate": 1.219081943310512e-05, "loss": 9.2717, "step": 2296 }, { "epoch": 0.4488081281750684, "grad_norm": 10.57288932800293, "learning_rate": 1.2184643990225998e-05, "loss": 9.8432, "step": 2297 }, { "epoch": 0.4490035169988277, "grad_norm": 9.966277122497559, "learning_rate": 1.2178467672329136e-05, "loss": 9.3059, "step": 2298 }, { "epoch": 0.4491989058225869, "grad_norm": 8.62827205657959, "learning_rate": 1.2172290481888331e-05, "loss": 10.1742, "step": 2299 }, { "epoch": 0.4493942946463462, "grad_norm": 7.116940975189209, "learning_rate": 1.2166112421377749e-05, "loss": 8.0496, "step": 2300 }, { "epoch": 0.44958968347010553, "grad_norm": 8.18814754486084, "learning_rate": 1.2159933493271894e-05, "loss": 8.8701, "step": 2301 }, { "epoch": 0.4497850722938648, "grad_norm": 13.047163009643555, "learning_rate": 1.2153753700045614e-05, "loss": 9.4084, "step": 2302 }, { "epoch": 0.4499804611176241, "grad_norm": 6.840201377868652, "learning_rate": 1.2147573044174113e-05, "loss": 8.7816, "step": 2303 }, { "epoch": 0.4501758499413834, "grad_norm": 9.08859920501709, "learning_rate": 1.2141391528132931e-05, "loss": 9.1049, "step": 2304 }, { "epoch": 0.4503712387651426, "grad_norm": 8.56104850769043, "learning_rate": 1.2135209154397962e-05, "loss": 9.6122, "step": 2305 }, { "epoch": 0.4505666275889019, "grad_norm": 9.848429679870605, "learning_rate": 1.2129025925445435e-05, "loss": 9.074, "step": 2306 }, { "epoch": 0.45076201641266117, "grad_norm": 9.95374584197998, "learning_rate": 1.2122841843751925e-05, "loss": 10.2276, "step": 2307 }, { "epoch": 0.45095740523642047, "grad_norm": 7.756079196929932, "learning_rate": 1.211665691179435e-05, "loss": 8.8907, "step": 2308 }, { "epoch": 0.4511527940601798, "grad_norm": 8.182064056396484, "learning_rate": 1.2110471132049971e-05, "loss": 8.6392, "step": 2309 }, { "epoch": 0.451348182883939, "grad_norm": 8.239133834838867, "learning_rate": 1.2104284506996378e-05, "loss": 8.9845, "step": 2310 }, { "epoch": 0.4515435717076983, "grad_norm": 9.457928657531738, "learning_rate": 1.2098097039111511e-05, "loss": 9.528, "step": 2311 }, { "epoch": 0.4517389605314576, "grad_norm": 9.49638843536377, "learning_rate": 1.2091908730873641e-05, "loss": 10.0566, "step": 2312 }, { "epoch": 0.45193434935521687, "grad_norm": 22.126733779907227, "learning_rate": 1.2085719584761378e-05, "loss": 8.9613, "step": 2313 }, { "epoch": 0.45212973817897617, "grad_norm": 30.99988555908203, "learning_rate": 1.2079529603253666e-05, "loss": 10.345, "step": 2314 }, { "epoch": 0.4523251270027354, "grad_norm": 7.838778972625732, "learning_rate": 1.2073338788829787e-05, "loss": 9.2702, "step": 2315 }, { "epoch": 0.4525205158264947, "grad_norm": 7.920363903045654, "learning_rate": 1.2067147143969353e-05, "loss": 9.3191, "step": 2316 }, { "epoch": 0.452715904650254, "grad_norm": 9.7103271484375, "learning_rate": 1.206095467115231e-05, "loss": 8.8647, "step": 2317 }, { "epoch": 0.45291129347401327, "grad_norm": 7.919499397277832, "learning_rate": 1.2054761372858937e-05, "loss": 9.1471, "step": 2318 }, { "epoch": 0.45310668229777257, "grad_norm": 6.757690906524658, "learning_rate": 1.204856725156984e-05, "loss": 9.449, "step": 2319 }, { "epoch": 0.45330207112153187, "grad_norm": 6.938695430755615, "learning_rate": 1.204237230976596e-05, "loss": 8.7082, "step": 2320 }, { "epoch": 0.4534974599452911, "grad_norm": 6.6420369148254395, "learning_rate": 1.203617654992856e-05, "loss": 8.5528, "step": 2321 }, { "epoch": 0.4536928487690504, "grad_norm": 6.399874687194824, "learning_rate": 1.2029979974539233e-05, "loss": 8.5661, "step": 2322 }, { "epoch": 0.4538882375928097, "grad_norm": 7.898861885070801, "learning_rate": 1.2023782586079908e-05, "loss": 9.201, "step": 2323 }, { "epoch": 0.45408362641656896, "grad_norm": 9.081944465637207, "learning_rate": 1.2017584387032826e-05, "loss": 9.1618, "step": 2324 }, { "epoch": 0.45427901524032827, "grad_norm": 9.277242660522461, "learning_rate": 1.2011385379880555e-05, "loss": 9.4893, "step": 2325 }, { "epoch": 0.4544744040640875, "grad_norm": 8.17261791229248, "learning_rate": 1.2005185567105994e-05, "loss": 9.4795, "step": 2326 }, { "epoch": 0.4546697928878468, "grad_norm": 9.705378532409668, "learning_rate": 1.1998984951192361e-05, "loss": 8.5081, "step": 2327 }, { "epoch": 0.4548651817116061, "grad_norm": 8.096948623657227, "learning_rate": 1.1992783534623191e-05, "loss": 9.8414, "step": 2328 }, { "epoch": 0.45506057053536536, "grad_norm": 9.253615379333496, "learning_rate": 1.198658131988235e-05, "loss": 8.4891, "step": 2329 }, { "epoch": 0.45525595935912466, "grad_norm": 9.506009101867676, "learning_rate": 1.198037830945401e-05, "loss": 8.114, "step": 2330 }, { "epoch": 0.45545134818288396, "grad_norm": 8.674689292907715, "learning_rate": 1.1974174505822677e-05, "loss": 8.6011, "step": 2331 }, { "epoch": 0.4556467370066432, "grad_norm": 8.619670867919922, "learning_rate": 1.1967969911473162e-05, "loss": 9.3079, "step": 2332 }, { "epoch": 0.4558421258304025, "grad_norm": 9.665189743041992, "learning_rate": 1.1961764528890599e-05, "loss": 9.5076, "step": 2333 }, { "epoch": 0.45603751465416176, "grad_norm": 16.26235008239746, "learning_rate": 1.1955558360560438e-05, "loss": 9.5821, "step": 2334 }, { "epoch": 0.45623290347792106, "grad_norm": 9.290892601013184, "learning_rate": 1.194935140896844e-05, "loss": 9.7788, "step": 2335 }, { "epoch": 0.45642829230168036, "grad_norm": 8.773561477661133, "learning_rate": 1.194314367660068e-05, "loss": 9.1339, "step": 2336 }, { "epoch": 0.4566236811254396, "grad_norm": 10.138145446777344, "learning_rate": 1.1936935165943553e-05, "loss": 9.6766, "step": 2337 }, { "epoch": 0.4568190699491989, "grad_norm": 10.880075454711914, "learning_rate": 1.1930725879483756e-05, "loss": 9.9402, "step": 2338 }, { "epoch": 0.4570144587729582, "grad_norm": 9.441678047180176, "learning_rate": 1.19245158197083e-05, "loss": 9.9798, "step": 2339 }, { "epoch": 0.45720984759671746, "grad_norm": 9.227924346923828, "learning_rate": 1.1918304989104515e-05, "loss": 9.4905, "step": 2340 }, { "epoch": 0.45740523642047676, "grad_norm": 11.096552848815918, "learning_rate": 1.191209339016002e-05, "loss": 8.7374, "step": 2341 }, { "epoch": 0.457600625244236, "grad_norm": 8.83130931854248, "learning_rate": 1.190588102536276e-05, "loss": 9.031, "step": 2342 }, { "epoch": 0.4577960140679953, "grad_norm": 9.083894729614258, "learning_rate": 1.1899667897200978e-05, "loss": 9.2363, "step": 2343 }, { "epoch": 0.4579914028917546, "grad_norm": 10.509827613830566, "learning_rate": 1.1893454008163226e-05, "loss": 9.105, "step": 2344 }, { "epoch": 0.45818679171551385, "grad_norm": 8.675061225891113, "learning_rate": 1.1887239360738356e-05, "loss": 9.1982, "step": 2345 }, { "epoch": 0.45838218053927315, "grad_norm": 9.243605613708496, "learning_rate": 1.1881023957415535e-05, "loss": 9.5825, "step": 2346 }, { "epoch": 0.45857756936303246, "grad_norm": 17.001510620117188, "learning_rate": 1.1874807800684217e-05, "loss": 10.4223, "step": 2347 }, { "epoch": 0.4587729581867917, "grad_norm": 8.395930290222168, "learning_rate": 1.186859089303417e-05, "loss": 9.7094, "step": 2348 }, { "epoch": 0.458968347010551, "grad_norm": 9.87113094329834, "learning_rate": 1.1862373236955454e-05, "loss": 9.786, "step": 2349 }, { "epoch": 0.4591637358343103, "grad_norm": 10.49590015411377, "learning_rate": 1.1856154834938437e-05, "loss": 9.9623, "step": 2350 }, { "epoch": 0.45935912465806955, "grad_norm": 10.947039604187012, "learning_rate": 1.1849935689473786e-05, "loss": 10.264, "step": 2351 }, { "epoch": 0.45955451348182885, "grad_norm": 8.210731506347656, "learning_rate": 1.1843715803052453e-05, "loss": 9.5774, "step": 2352 }, { "epoch": 0.4597499023055881, "grad_norm": 6.829330921173096, "learning_rate": 1.1837495178165706e-05, "loss": 9.1155, "step": 2353 }, { "epoch": 0.4599452911293474, "grad_norm": 7.158986568450928, "learning_rate": 1.1831273817305088e-05, "loss": 9.4125, "step": 2354 }, { "epoch": 0.4601406799531067, "grad_norm": 11.706311225891113, "learning_rate": 1.1825051722962454e-05, "loss": 9.8573, "step": 2355 }, { "epoch": 0.46033606877686595, "grad_norm": 40.86763000488281, "learning_rate": 1.1818828897629941e-05, "loss": 9.155, "step": 2356 }, { "epoch": 0.46053145760062525, "grad_norm": 10.87190055847168, "learning_rate": 1.1812605343799989e-05, "loss": 9.4604, "step": 2357 }, { "epoch": 0.46072684642438455, "grad_norm": 9.73469352722168, "learning_rate": 1.180638106396532e-05, "loss": 9.801, "step": 2358 }, { "epoch": 0.4609222352481438, "grad_norm": 7.995573997497559, "learning_rate": 1.1800156060618955e-05, "loss": 10.1415, "step": 2359 }, { "epoch": 0.4611176240719031, "grad_norm": 8.469133377075195, "learning_rate": 1.1793930336254195e-05, "loss": 8.9077, "step": 2360 }, { "epoch": 0.46131301289566234, "grad_norm": 18.08845329284668, "learning_rate": 1.1787703893364645e-05, "loss": 9.7153, "step": 2361 }, { "epoch": 0.46150840171942165, "grad_norm": 9.333691596984863, "learning_rate": 1.1781476734444181e-05, "loss": 9.3963, "step": 2362 }, { "epoch": 0.46170379054318095, "grad_norm": 9.010549545288086, "learning_rate": 1.1775248861986978e-05, "loss": 9.5966, "step": 2363 }, { "epoch": 0.4618991793669402, "grad_norm": 11.042677879333496, "learning_rate": 1.176902027848749e-05, "loss": 9.724, "step": 2364 }, { "epoch": 0.4620945681906995, "grad_norm": 10.08812427520752, "learning_rate": 1.176279098644046e-05, "loss": 9.7276, "step": 2365 }, { "epoch": 0.4622899570144588, "grad_norm": 13.08978271484375, "learning_rate": 1.1756560988340912e-05, "loss": 9.7234, "step": 2366 }, { "epoch": 0.46248534583821804, "grad_norm": 10.918002128601074, "learning_rate": 1.1750330286684153e-05, "loss": 8.2166, "step": 2367 }, { "epoch": 0.46268073466197734, "grad_norm": 8.465288162231445, "learning_rate": 1.174409888396577e-05, "loss": 8.8377, "step": 2368 }, { "epoch": 0.4628761234857366, "grad_norm": 11.169697761535645, "learning_rate": 1.173786678268164e-05, "loss": 8.7151, "step": 2369 }, { "epoch": 0.4630715123094959, "grad_norm": 9.257155418395996, "learning_rate": 1.1731633985327906e-05, "loss": 9.4025, "step": 2370 }, { "epoch": 0.4632669011332552, "grad_norm": 8.561311721801758, "learning_rate": 1.1725400494401e-05, "loss": 9.0156, "step": 2371 }, { "epoch": 0.46346228995701444, "grad_norm": 8.800094604492188, "learning_rate": 1.171916631239763e-05, "loss": 9.1216, "step": 2372 }, { "epoch": 0.46365767878077374, "grad_norm": 11.316170692443848, "learning_rate": 1.1712931441814776e-05, "loss": 9.9333, "step": 2373 }, { "epoch": 0.46385306760453304, "grad_norm": 9.657631874084473, "learning_rate": 1.1706695885149702e-05, "loss": 10.1079, "step": 2374 }, { "epoch": 0.4640484564282923, "grad_norm": 8.509729385375977, "learning_rate": 1.1700459644899938e-05, "loss": 9.6815, "step": 2375 }, { "epoch": 0.4642438452520516, "grad_norm": 9.730488777160645, "learning_rate": 1.1694222723563291e-05, "loss": 9.8981, "step": 2376 }, { "epoch": 0.46443923407581084, "grad_norm": 6.831174373626709, "learning_rate": 1.1687985123637846e-05, "loss": 8.5715, "step": 2377 }, { "epoch": 0.46463462289957014, "grad_norm": 36.317893981933594, "learning_rate": 1.1681746847621951e-05, "loss": 10.3437, "step": 2378 }, { "epoch": 0.46483001172332944, "grad_norm": 9.85550308227539, "learning_rate": 1.1675507898014235e-05, "loss": 9.3706, "step": 2379 }, { "epoch": 0.4650254005470887, "grad_norm": 10.740527153015137, "learning_rate": 1.1669268277313585e-05, "loss": 9.1447, "step": 2380 }, { "epoch": 0.465220789370848, "grad_norm": 6.926548957824707, "learning_rate": 1.1663027988019166e-05, "loss": 9.047, "step": 2381 }, { "epoch": 0.4654161781946073, "grad_norm": 8.198529243469238, "learning_rate": 1.1656787032630405e-05, "loss": 8.1967, "step": 2382 }, { "epoch": 0.46561156701836653, "grad_norm": 9.073921203613281, "learning_rate": 1.1650545413647001e-05, "loss": 9.9589, "step": 2383 }, { "epoch": 0.46580695584212584, "grad_norm": 13.952261924743652, "learning_rate": 1.1644303133568915e-05, "loss": 9.2896, "step": 2384 }, { "epoch": 0.46600234466588514, "grad_norm": 8.263925552368164, "learning_rate": 1.1638060194896378e-05, "loss": 9.2413, "step": 2385 }, { "epoch": 0.4661977334896444, "grad_norm": 22.786531448364258, "learning_rate": 1.163181660012987e-05, "loss": 9.0289, "step": 2386 }, { "epoch": 0.4663931223134037, "grad_norm": 7.734263896942139, "learning_rate": 1.1625572351770157e-05, "loss": 8.9236, "step": 2387 }, { "epoch": 0.46658851113716293, "grad_norm": 17.39560890197754, "learning_rate": 1.1619327452318248e-05, "loss": 9.8094, "step": 2388 }, { "epoch": 0.46678389996092223, "grad_norm": 7.780709743499756, "learning_rate": 1.1613081904275419e-05, "loss": 8.9536, "step": 2389 }, { "epoch": 0.46697928878468153, "grad_norm": 8.669890403747559, "learning_rate": 1.1606835710143207e-05, "loss": 8.8066, "step": 2390 }, { "epoch": 0.4671746776084408, "grad_norm": 25.645999908447266, "learning_rate": 1.1600588872423406e-05, "loss": 9.4202, "step": 2391 }, { "epoch": 0.4673700664322001, "grad_norm": 9.888745307922363, "learning_rate": 1.1594341393618071e-05, "loss": 10.3101, "step": 2392 }, { "epoch": 0.4675654552559594, "grad_norm": 10.650516510009766, "learning_rate": 1.158809327622951e-05, "loss": 9.7502, "step": 2393 }, { "epoch": 0.46776084407971863, "grad_norm": 9.753227233886719, "learning_rate": 1.1581844522760287e-05, "loss": 8.6243, "step": 2394 }, { "epoch": 0.46795623290347793, "grad_norm": 8.801817893981934, "learning_rate": 1.157559513571322e-05, "loss": 8.637, "step": 2395 }, { "epoch": 0.4681516217272372, "grad_norm": 11.485738754272461, "learning_rate": 1.1569345117591384e-05, "loss": 10.5581, "step": 2396 }, { "epoch": 0.4683470105509965, "grad_norm": 43.89450454711914, "learning_rate": 1.1563094470898106e-05, "loss": 9.4397, "step": 2397 }, { "epoch": 0.4685423993747558, "grad_norm": 11.00967788696289, "learning_rate": 1.1556843198136966e-05, "loss": 10.098, "step": 2398 }, { "epoch": 0.468737788198515, "grad_norm": 8.109318733215332, "learning_rate": 1.155059130181179e-05, "loss": 9.6306, "step": 2399 }, { "epoch": 0.46893317702227433, "grad_norm": 7.781679630279541, "learning_rate": 1.1544338784426656e-05, "loss": 8.9134, "step": 2400 }, { "epoch": 0.46912856584603363, "grad_norm": 15.912310600280762, "learning_rate": 1.1538085648485895e-05, "loss": 8.9027, "step": 2401 }, { "epoch": 0.4693239546697929, "grad_norm": 9.630857467651367, "learning_rate": 1.153183189649408e-05, "loss": 9.6047, "step": 2402 }, { "epoch": 0.4695193434935522, "grad_norm": 7.696481704711914, "learning_rate": 1.1525577530956029e-05, "loss": 9.589, "step": 2403 }, { "epoch": 0.4697147323173114, "grad_norm": 11.7659273147583, "learning_rate": 1.1519322554376816e-05, "loss": 10.0371, "step": 2404 }, { "epoch": 0.4699101211410707, "grad_norm": 8.038793563842773, "learning_rate": 1.151306696926175e-05, "loss": 9.0711, "step": 2405 }, { "epoch": 0.47010550996483, "grad_norm": 8.523825645446777, "learning_rate": 1.1506810778116389e-05, "loss": 8.9531, "step": 2406 }, { "epoch": 0.47030089878858927, "grad_norm": 8.917826652526855, "learning_rate": 1.1500553983446527e-05, "loss": 9.3239, "step": 2407 }, { "epoch": 0.4704962876123486, "grad_norm": 12.260668754577637, "learning_rate": 1.1494296587758212e-05, "loss": 10.3139, "step": 2408 }, { "epoch": 0.4706916764361079, "grad_norm": 9.119510650634766, "learning_rate": 1.148803859355772e-05, "loss": 9.6787, "step": 2409 }, { "epoch": 0.4708870652598671, "grad_norm": 8.856493949890137, "learning_rate": 1.148178000335157e-05, "loss": 10.5187, "step": 2410 }, { "epoch": 0.4710824540836264, "grad_norm": 11.283249855041504, "learning_rate": 1.1475520819646526e-05, "loss": 10.0481, "step": 2411 }, { "epoch": 0.4712778429073857, "grad_norm": 6.657747745513916, "learning_rate": 1.1469261044949584e-05, "loss": 7.8504, "step": 2412 }, { "epoch": 0.47147323173114497, "grad_norm": 10.082439422607422, "learning_rate": 1.146300068176798e-05, "loss": 10.1751, "step": 2413 }, { "epoch": 0.47166862055490427, "grad_norm": 7.962443828582764, "learning_rate": 1.1456739732609179e-05, "loss": 9.62, "step": 2414 }, { "epoch": 0.4718640093786635, "grad_norm": 6.457741737365723, "learning_rate": 1.145047819998089e-05, "loss": 8.9817, "step": 2415 }, { "epoch": 0.4720593982024228, "grad_norm": 9.077668190002441, "learning_rate": 1.1444216086391052e-05, "loss": 9.7593, "step": 2416 }, { "epoch": 0.4722547870261821, "grad_norm": 9.026140213012695, "learning_rate": 1.143795339434783e-05, "loss": 9.2714, "step": 2417 }, { "epoch": 0.47245017584994137, "grad_norm": 6.757838249206543, "learning_rate": 1.1431690126359632e-05, "loss": 8.3541, "step": 2418 }, { "epoch": 0.47264556467370067, "grad_norm": 10.788949966430664, "learning_rate": 1.142542628493509e-05, "loss": 8.5847, "step": 2419 }, { "epoch": 0.47284095349745997, "grad_norm": 8.040674209594727, "learning_rate": 1.1419161872583065e-05, "loss": 8.9276, "step": 2420 }, { "epoch": 0.4730363423212192, "grad_norm": 7.822730541229248, "learning_rate": 1.1412896891812655e-05, "loss": 8.8715, "step": 2421 }, { "epoch": 0.4732317311449785, "grad_norm": 9.462562561035156, "learning_rate": 1.140663134513317e-05, "loss": 8.9585, "step": 2422 }, { "epoch": 0.47342711996873776, "grad_norm": 10.111047744750977, "learning_rate": 1.140036523505416e-05, "loss": 9.907, "step": 2423 }, { "epoch": 0.47362250879249707, "grad_norm": 9.930951118469238, "learning_rate": 1.1394098564085399e-05, "loss": 8.905, "step": 2424 }, { "epoch": 0.47381789761625637, "grad_norm": 12.305794715881348, "learning_rate": 1.1387831334736878e-05, "loss": 10.4031, "step": 2425 }, { "epoch": 0.4740132864400156, "grad_norm": 9.197370529174805, "learning_rate": 1.1381563549518823e-05, "loss": 9.1929, "step": 2426 }, { "epoch": 0.4742086752637749, "grad_norm": 10.884814262390137, "learning_rate": 1.1375295210941674e-05, "loss": 8.6907, "step": 2427 }, { "epoch": 0.4744040640875342, "grad_norm": 7.242812156677246, "learning_rate": 1.1369026321516094e-05, "loss": 9.2762, "step": 2428 }, { "epoch": 0.47459945291129346, "grad_norm": 13.42102336883545, "learning_rate": 1.136275688375297e-05, "loss": 9.0727, "step": 2429 }, { "epoch": 0.47479484173505276, "grad_norm": 8.243850708007812, "learning_rate": 1.1356486900163404e-05, "loss": 9.6639, "step": 2430 }, { "epoch": 0.474990230558812, "grad_norm": 9.975531578063965, "learning_rate": 1.1350216373258723e-05, "loss": 9.575, "step": 2431 }, { "epoch": 0.4751856193825713, "grad_norm": 9.604329109191895, "learning_rate": 1.1343945305550464e-05, "loss": 9.822, "step": 2432 }, { "epoch": 0.4753810082063306, "grad_norm": 9.020915985107422, "learning_rate": 1.1337673699550382e-05, "loss": 8.9249, "step": 2433 }, { "epoch": 0.47557639703008986, "grad_norm": 7.667820453643799, "learning_rate": 1.1331401557770461e-05, "loss": 9.5894, "step": 2434 }, { "epoch": 0.47577178585384916, "grad_norm": 7.572841644287109, "learning_rate": 1.1325128882722876e-05, "loss": 8.8098, "step": 2435 }, { "epoch": 0.47596717467760846, "grad_norm": 11.557333946228027, "learning_rate": 1.131885567692003e-05, "loss": 10.3338, "step": 2436 }, { "epoch": 0.4761625635013677, "grad_norm": 7.1445841789245605, "learning_rate": 1.1312581942874546e-05, "loss": 8.5131, "step": 2437 }, { "epoch": 0.476357952325127, "grad_norm": 7.750548362731934, "learning_rate": 1.1306307683099237e-05, "loss": 9.4567, "step": 2438 }, { "epoch": 0.4765533411488863, "grad_norm": 9.577622413635254, "learning_rate": 1.1300032900107147e-05, "loss": 9.1148, "step": 2439 }, { "epoch": 0.47674872997264556, "grad_norm": 10.267053604125977, "learning_rate": 1.1293757596411522e-05, "loss": 9.7057, "step": 2440 }, { "epoch": 0.47694411879640486, "grad_norm": 9.346061706542969, "learning_rate": 1.128748177452581e-05, "loss": 9.019, "step": 2441 }, { "epoch": 0.4771395076201641, "grad_norm": 7.570725917816162, "learning_rate": 1.1281205436963676e-05, "loss": 8.3706, "step": 2442 }, { "epoch": 0.4773348964439234, "grad_norm": 7.138862133026123, "learning_rate": 1.1274928586238987e-05, "loss": 8.349, "step": 2443 }, { "epoch": 0.4775302852676827, "grad_norm": 10.648791313171387, "learning_rate": 1.1268651224865818e-05, "loss": 9.6219, "step": 2444 }, { "epoch": 0.47772567409144195, "grad_norm": 9.657983779907227, "learning_rate": 1.1262373355358447e-05, "loss": 8.6772, "step": 2445 }, { "epoch": 0.47792106291520126, "grad_norm": 7.355937480926514, "learning_rate": 1.1256094980231356e-05, "loss": 8.7624, "step": 2446 }, { "epoch": 0.47811645173896056, "grad_norm": 9.921847343444824, "learning_rate": 1.1249816101999227e-05, "loss": 8.6981, "step": 2447 }, { "epoch": 0.4783118405627198, "grad_norm": 10.855193138122559, "learning_rate": 1.1243536723176947e-05, "loss": 9.7601, "step": 2448 }, { "epoch": 0.4785072293864791, "grad_norm": 8.516083717346191, "learning_rate": 1.1237256846279603e-05, "loss": 8.7526, "step": 2449 }, { "epoch": 0.47870261821023835, "grad_norm": 9.826043128967285, "learning_rate": 1.1230976473822478e-05, "loss": 9.6618, "step": 2450 }, { "epoch": 0.47889800703399765, "grad_norm": 8.2230224609375, "learning_rate": 1.1224695608321057e-05, "loss": 9.1489, "step": 2451 }, { "epoch": 0.47909339585775695, "grad_norm": 53.27069091796875, "learning_rate": 1.1218414252291024e-05, "loss": 10.0209, "step": 2452 }, { "epoch": 0.4792887846815162, "grad_norm": 9.501374244689941, "learning_rate": 1.1212132408248256e-05, "loss": 9.2953, "step": 2453 }, { "epoch": 0.4794841735052755, "grad_norm": 8.565271377563477, "learning_rate": 1.1205850078708824e-05, "loss": 9.3541, "step": 2454 }, { "epoch": 0.4796795623290348, "grad_norm": 7.600032806396484, "learning_rate": 1.1199567266189e-05, "loss": 9.3391, "step": 2455 }, { "epoch": 0.47987495115279405, "grad_norm": 7.484233856201172, "learning_rate": 1.1193283973205242e-05, "loss": 9.673, "step": 2456 }, { "epoch": 0.48007033997655335, "grad_norm": 9.007365226745605, "learning_rate": 1.1187000202274207e-05, "loss": 10.5114, "step": 2457 }, { "epoch": 0.4802657288003126, "grad_norm": 10.141722679138184, "learning_rate": 1.118071595591274e-05, "loss": 10.2439, "step": 2458 }, { "epoch": 0.4804611176240719, "grad_norm": 26.891250610351562, "learning_rate": 1.1174431236637875e-05, "loss": 9.4861, "step": 2459 }, { "epoch": 0.4806565064478312, "grad_norm": 9.505500793457031, "learning_rate": 1.1168146046966838e-05, "loss": 9.4145, "step": 2460 }, { "epoch": 0.48085189527159045, "grad_norm": 8.005602836608887, "learning_rate": 1.1161860389417041e-05, "loss": 8.5351, "step": 2461 }, { "epoch": 0.48104728409534975, "grad_norm": 7.418105602264404, "learning_rate": 1.1155574266506089e-05, "loss": 9.4643, "step": 2462 }, { "epoch": 0.48124267291910905, "grad_norm": 9.839845657348633, "learning_rate": 1.1149287680751763e-05, "loss": 10.0309, "step": 2463 }, { "epoch": 0.4814380617428683, "grad_norm": 8.617295265197754, "learning_rate": 1.1143000634672043e-05, "loss": 8.7564, "step": 2464 }, { "epoch": 0.4816334505666276, "grad_norm": 10.28796100616455, "learning_rate": 1.1136713130785079e-05, "loss": 9.6796, "step": 2465 }, { "epoch": 0.48182883939038684, "grad_norm": 12.032356262207031, "learning_rate": 1.1130425171609218e-05, "loss": 9.0274, "step": 2466 }, { "epoch": 0.48202422821414614, "grad_norm": 15.842183113098145, "learning_rate": 1.112413675966298e-05, "loss": 10.0809, "step": 2467 }, { "epoch": 0.48221961703790545, "grad_norm": 8.806893348693848, "learning_rate": 1.1117847897465071e-05, "loss": 9.4578, "step": 2468 }, { "epoch": 0.4824150058616647, "grad_norm": 24.973302841186523, "learning_rate": 1.111155858753437e-05, "loss": 9.3003, "step": 2469 }, { "epoch": 0.482610394685424, "grad_norm": 63.59320831298828, "learning_rate": 1.1105268832389946e-05, "loss": 9.683, "step": 2470 }, { "epoch": 0.4828057835091833, "grad_norm": 8.895898818969727, "learning_rate": 1.109897863455104e-05, "loss": 9.9377, "step": 2471 }, { "epoch": 0.48300117233294254, "grad_norm": 8.31209659576416, "learning_rate": 1.1092687996537075e-05, "loss": 9.3796, "step": 2472 }, { "epoch": 0.48319656115670184, "grad_norm": 8.310145378112793, "learning_rate": 1.1086396920867642e-05, "loss": 9.5811, "step": 2473 }, { "epoch": 0.48339194998046114, "grad_norm": 9.453574180603027, "learning_rate": 1.1080105410062512e-05, "loss": 9.3392, "step": 2474 }, { "epoch": 0.4835873388042204, "grad_norm": 9.653833389282227, "learning_rate": 1.1073813466641633e-05, "loss": 8.8944, "step": 2475 }, { "epoch": 0.4837827276279797, "grad_norm": 9.032988548278809, "learning_rate": 1.1067521093125123e-05, "loss": 9.4389, "step": 2476 }, { "epoch": 0.48397811645173894, "grad_norm": 9.604551315307617, "learning_rate": 1.106122829203327e-05, "loss": 9.1107, "step": 2477 }, { "epoch": 0.48417350527549824, "grad_norm": 10.697450637817383, "learning_rate": 1.1054935065886541e-05, "loss": 9.6396, "step": 2478 }, { "epoch": 0.48436889409925754, "grad_norm": 8.172533988952637, "learning_rate": 1.1048641417205563e-05, "loss": 8.8635, "step": 2479 }, { "epoch": 0.4845642829230168, "grad_norm": 9.067036628723145, "learning_rate": 1.1042347348511145e-05, "loss": 10.2947, "step": 2480 }, { "epoch": 0.4847596717467761, "grad_norm": 12.501672744750977, "learning_rate": 1.103605286232425e-05, "loss": 9.8823, "step": 2481 }, { "epoch": 0.4849550605705354, "grad_norm": 13.687049865722656, "learning_rate": 1.1029757961166019e-05, "loss": 9.2825, "step": 2482 }, { "epoch": 0.48515044939429464, "grad_norm": 9.006345748901367, "learning_rate": 1.1023462647557752e-05, "loss": 9.474, "step": 2483 }, { "epoch": 0.48534583821805394, "grad_norm": 8.601507186889648, "learning_rate": 1.1017166924020921e-05, "loss": 9.5004, "step": 2484 }, { "epoch": 0.4855412270418132, "grad_norm": 7.638338565826416, "learning_rate": 1.101087079307716e-05, "loss": 8.4473, "step": 2485 }, { "epoch": 0.4857366158655725, "grad_norm": 10.555083274841309, "learning_rate": 1.100457425724826e-05, "loss": 8.6913, "step": 2486 }, { "epoch": 0.4859320046893318, "grad_norm": 8.18532943725586, "learning_rate": 1.0998277319056181e-05, "loss": 9.7484, "step": 2487 }, { "epoch": 0.48612739351309103, "grad_norm": 9.937145233154297, "learning_rate": 1.0991979981023044e-05, "loss": 8.4777, "step": 2488 }, { "epoch": 0.48632278233685033, "grad_norm": 9.026718139648438, "learning_rate": 1.098568224567113e-05, "loss": 8.9193, "step": 2489 }, { "epoch": 0.48651817116060964, "grad_norm": 9.864764213562012, "learning_rate": 1.0979384115522871e-05, "loss": 10.205, "step": 2490 }, { "epoch": 0.4867135599843689, "grad_norm": 7.276735305786133, "learning_rate": 1.097308559310087e-05, "loss": 8.0698, "step": 2491 }, { "epoch": 0.4869089488081282, "grad_norm": 7.894010543823242, "learning_rate": 1.0966786680927875e-05, "loss": 10.2766, "step": 2492 }, { "epoch": 0.48710433763188743, "grad_norm": 8.936746597290039, "learning_rate": 1.0960487381526801e-05, "loss": 9.2279, "step": 2493 }, { "epoch": 0.48729972645564673, "grad_norm": 8.326064109802246, "learning_rate": 1.095418769742071e-05, "loss": 8.7967, "step": 2494 }, { "epoch": 0.48749511527940603, "grad_norm": 10.585051536560059, "learning_rate": 1.0947887631132823e-05, "loss": 9.2622, "step": 2495 }, { "epoch": 0.4876905041031653, "grad_norm": 7.402519702911377, "learning_rate": 1.0941587185186512e-05, "loss": 8.8416, "step": 2496 }, { "epoch": 0.4878858929269246, "grad_norm": 8.184988975524902, "learning_rate": 1.0935286362105298e-05, "loss": 9.509, "step": 2497 }, { "epoch": 0.4880812817506839, "grad_norm": 10.121121406555176, "learning_rate": 1.092898516441286e-05, "loss": 9.354, "step": 2498 }, { "epoch": 0.48827667057444313, "grad_norm": 9.630965232849121, "learning_rate": 1.092268359463302e-05, "loss": 8.4544, "step": 2499 }, { "epoch": 0.48847205939820243, "grad_norm": 7.4092183113098145, "learning_rate": 1.0916381655289756e-05, "loss": 8.9632, "step": 2500 }, { "epoch": 0.48866744822196173, "grad_norm": 9.727022171020508, "learning_rate": 1.091007934890719e-05, "loss": 8.9544, "step": 2501 }, { "epoch": 0.488862837045721, "grad_norm": 8.538817405700684, "learning_rate": 1.090377667800959e-05, "loss": 8.3954, "step": 2502 }, { "epoch": 0.4890582258694803, "grad_norm": 9.476492881774902, "learning_rate": 1.0897473645121373e-05, "loss": 9.1442, "step": 2503 }, { "epoch": 0.4892536146932395, "grad_norm": 18.45824432373047, "learning_rate": 1.0891170252767094e-05, "loss": 8.5243, "step": 2504 }, { "epoch": 0.4894490035169988, "grad_norm": 9.022347450256348, "learning_rate": 1.0884866503471463e-05, "loss": 9.0409, "step": 2505 }, { "epoch": 0.48964439234075813, "grad_norm": 11.24704647064209, "learning_rate": 1.0878562399759324e-05, "loss": 9.3097, "step": 2506 }, { "epoch": 0.4898397811645174, "grad_norm": 14.451510429382324, "learning_rate": 1.087225794415567e-05, "loss": 9.3767, "step": 2507 }, { "epoch": 0.4900351699882767, "grad_norm": 17.859617233276367, "learning_rate": 1.0865953139185625e-05, "loss": 10.09, "step": 2508 }, { "epoch": 0.490230558812036, "grad_norm": 6.762642860412598, "learning_rate": 1.0859647987374467e-05, "loss": 8.9209, "step": 2509 }, { "epoch": 0.4904259476357952, "grad_norm": 10.67780876159668, "learning_rate": 1.0853342491247598e-05, "loss": 9.9544, "step": 2510 }, { "epoch": 0.4906213364595545, "grad_norm": 10.100765228271484, "learning_rate": 1.084703665333057e-05, "loss": 9.2581, "step": 2511 }, { "epoch": 0.49081672528331377, "grad_norm": 9.13901138305664, "learning_rate": 1.0840730476149064e-05, "loss": 9.1497, "step": 2512 }, { "epoch": 0.49101211410707307, "grad_norm": 10.920374870300293, "learning_rate": 1.0834423962228902e-05, "loss": 9.2326, "step": 2513 }, { "epoch": 0.4912075029308324, "grad_norm": 10.368356704711914, "learning_rate": 1.0828117114096035e-05, "loss": 9.8312, "step": 2514 }, { "epoch": 0.4914028917545916, "grad_norm": 6.879678249359131, "learning_rate": 1.0821809934276555e-05, "loss": 9.4539, "step": 2515 }, { "epoch": 0.4915982805783509, "grad_norm": 8.457839012145996, "learning_rate": 1.0815502425296685e-05, "loss": 9.1, "step": 2516 }, { "epoch": 0.4917936694021102, "grad_norm": 8.530153274536133, "learning_rate": 1.0809194589682772e-05, "loss": 10.0003, "step": 2517 }, { "epoch": 0.49198905822586947, "grad_norm": 8.18465518951416, "learning_rate": 1.0802886429961303e-05, "loss": 9.053, "step": 2518 }, { "epoch": 0.49218444704962877, "grad_norm": 9.027351379394531, "learning_rate": 1.0796577948658893e-05, "loss": 9.2156, "step": 2519 }, { "epoch": 0.492379835873388, "grad_norm": 10.641546249389648, "learning_rate": 1.0790269148302283e-05, "loss": 9.5959, "step": 2520 }, { "epoch": 0.4925752246971473, "grad_norm": 10.388471603393555, "learning_rate": 1.0783960031418345e-05, "loss": 10.1863, "step": 2521 }, { "epoch": 0.4927706135209066, "grad_norm": 10.69629192352295, "learning_rate": 1.0777650600534076e-05, "loss": 9.5259, "step": 2522 }, { "epoch": 0.49296600234466587, "grad_norm": 8.71397876739502, "learning_rate": 1.0771340858176595e-05, "loss": 9.2118, "step": 2523 }, { "epoch": 0.49316139116842517, "grad_norm": 8.098600387573242, "learning_rate": 1.0765030806873155e-05, "loss": 9.6289, "step": 2524 }, { "epoch": 0.49335677999218447, "grad_norm": 10.088919639587402, "learning_rate": 1.0758720449151125e-05, "loss": 8.5031, "step": 2525 }, { "epoch": 0.4935521688159437, "grad_norm": 7.400728225708008, "learning_rate": 1.0752409787538e-05, "loss": 9.5247, "step": 2526 }, { "epoch": 0.493747557639703, "grad_norm": 8.490511894226074, "learning_rate": 1.0746098824561397e-05, "loss": 9.7487, "step": 2527 }, { "epoch": 0.49394294646346226, "grad_norm": 10.189058303833008, "learning_rate": 1.0739787562749049e-05, "loss": 9.1151, "step": 2528 }, { "epoch": 0.49413833528722156, "grad_norm": 9.887974739074707, "learning_rate": 1.0733476004628817e-05, "loss": 10.338, "step": 2529 }, { "epoch": 0.49433372411098087, "grad_norm": 27.863605499267578, "learning_rate": 1.0727164152728671e-05, "loss": 9.5593, "step": 2530 }, { "epoch": 0.4945291129347401, "grad_norm": 7.720983028411865, "learning_rate": 1.0720852009576705e-05, "loss": 9.0009, "step": 2531 }, { "epoch": 0.4947245017584994, "grad_norm": 9.274893760681152, "learning_rate": 1.0714539577701131e-05, "loss": 8.6465, "step": 2532 }, { "epoch": 0.4949198905822587, "grad_norm": 8.545907974243164, "learning_rate": 1.0708226859630273e-05, "loss": 9.3616, "step": 2533 }, { "epoch": 0.49511527940601796, "grad_norm": 9.02899169921875, "learning_rate": 1.0701913857892567e-05, "loss": 8.4511, "step": 2534 }, { "epoch": 0.49531066822977726, "grad_norm": 7.919510841369629, "learning_rate": 1.0695600575016571e-05, "loss": 8.8204, "step": 2535 }, { "epoch": 0.49550605705353656, "grad_norm": 9.303332328796387, "learning_rate": 1.0689287013530949e-05, "loss": 10.0516, "step": 2536 }, { "epoch": 0.4957014458772958, "grad_norm": 10.700124740600586, "learning_rate": 1.0682973175964476e-05, "loss": 9.2763, "step": 2537 }, { "epoch": 0.4958968347010551, "grad_norm": 8.245739936828613, "learning_rate": 1.0676659064846044e-05, "loss": 8.3476, "step": 2538 }, { "epoch": 0.49609222352481436, "grad_norm": 10.72991943359375, "learning_rate": 1.067034468270465e-05, "loss": 9.4349, "step": 2539 }, { "epoch": 0.49628761234857366, "grad_norm": 9.243351936340332, "learning_rate": 1.0664030032069396e-05, "loss": 7.9045, "step": 2540 }, { "epoch": 0.49648300117233296, "grad_norm": 9.342486381530762, "learning_rate": 1.06577151154695e-05, "loss": 9.3108, "step": 2541 }, { "epoch": 0.4966783899960922, "grad_norm": 7.926912307739258, "learning_rate": 1.0651399935434281e-05, "loss": 9.807, "step": 2542 }, { "epoch": 0.4968737788198515, "grad_norm": 9.867131233215332, "learning_rate": 1.0645084494493166e-05, "loss": 10.9449, "step": 2543 }, { "epoch": 0.4970691676436108, "grad_norm": 7.8673834800720215, "learning_rate": 1.063876879517568e-05, "loss": 9.6844, "step": 2544 }, { "epoch": 0.49726455646737006, "grad_norm": 9.949667930603027, "learning_rate": 1.0632452840011463e-05, "loss": 9.6345, "step": 2545 }, { "epoch": 0.49745994529112936, "grad_norm": 9.161530494689941, "learning_rate": 1.0626136631530245e-05, "loss": 9.8698, "step": 2546 }, { "epoch": 0.4976553341148886, "grad_norm": 8.347764015197754, "learning_rate": 1.0619820172261867e-05, "loss": 9.2639, "step": 2547 }, { "epoch": 0.4978507229386479, "grad_norm": 7.256474494934082, "learning_rate": 1.0613503464736267e-05, "loss": 8.2839, "step": 2548 }, { "epoch": 0.4980461117624072, "grad_norm": 10.699699401855469, "learning_rate": 1.0607186511483485e-05, "loss": 9.0246, "step": 2549 }, { "epoch": 0.49824150058616645, "grad_norm": 8.31127643585205, "learning_rate": 1.0600869315033651e-05, "loss": 8.6217, "step": 2550 }, { "epoch": 0.49843688940992575, "grad_norm": 8.519987106323242, "learning_rate": 1.0594551877917003e-05, "loss": 8.2195, "step": 2551 }, { "epoch": 0.49863227823368506, "grad_norm": 10.063212394714355, "learning_rate": 1.058823420266387e-05, "loss": 10.2044, "step": 2552 }, { "epoch": 0.4988276670574443, "grad_norm": 12.953171730041504, "learning_rate": 1.0581916291804675e-05, "loss": 8.8876, "step": 2553 }, { "epoch": 0.4990230558812036, "grad_norm": 11.009001731872559, "learning_rate": 1.0575598147869939e-05, "loss": 9.1471, "step": 2554 }, { "epoch": 0.49921844470496285, "grad_norm": 11.144474983215332, "learning_rate": 1.0569279773390273e-05, "loss": 9.1009, "step": 2555 }, { "epoch": 0.49941383352872215, "grad_norm": 8.041287422180176, "learning_rate": 1.0562961170896384e-05, "loss": 8.7397, "step": 2556 }, { "epoch": 0.49960922235248145, "grad_norm": 9.339242935180664, "learning_rate": 1.0556642342919071e-05, "loss": 10.0537, "step": 2557 }, { "epoch": 0.4998046111762407, "grad_norm": 9.279107093811035, "learning_rate": 1.0550323291989217e-05, "loss": 8.9358, "step": 2558 }, { "epoch": 0.5, "grad_norm": 7.196199417114258, "learning_rate": 1.0544004020637799e-05, "loss": 8.1397, "step": 2559 }, { "epoch": 0.5001953888237592, "grad_norm": 7.969322681427002, "learning_rate": 1.053768453139588e-05, "loss": 8.3753, "step": 2560 }, { "epoch": 0.5003907776475186, "grad_norm": 9.300406455993652, "learning_rate": 1.0531364826794612e-05, "loss": 9.3357, "step": 2561 }, { "epoch": 0.5005861664712778, "grad_norm": 12.185202598571777, "learning_rate": 1.0525044909365237e-05, "loss": 8.2229, "step": 2562 }, { "epoch": 0.5007815552950371, "grad_norm": 17.077205657958984, "learning_rate": 1.0518724781639073e-05, "loss": 9.7043, "step": 2563 }, { "epoch": 0.5009769441187965, "grad_norm": 11.250316619873047, "learning_rate": 1.0512404446147531e-05, "loss": 9.4062, "step": 2564 }, { "epoch": 0.5011723329425557, "grad_norm": 7.564905166625977, "learning_rate": 1.0506083905422097e-05, "loss": 9.1755, "step": 2565 }, { "epoch": 0.5013677217663149, "grad_norm": 15.252607345581055, "learning_rate": 1.049976316199435e-05, "loss": 9.1116, "step": 2566 }, { "epoch": 0.5015631105900743, "grad_norm": 9.757753372192383, "learning_rate": 1.0493442218395938e-05, "loss": 8.4352, "step": 2567 }, { "epoch": 0.5017584994138335, "grad_norm": 11.019709587097168, "learning_rate": 1.04871210771586e-05, "loss": 9.4381, "step": 2568 }, { "epoch": 0.5019538882375928, "grad_norm": 8.746038436889648, "learning_rate": 1.0480799740814145e-05, "loss": 8.9848, "step": 2569 }, { "epoch": 0.502149277061352, "grad_norm": 8.94467544555664, "learning_rate": 1.047447821189447e-05, "loss": 9.3281, "step": 2570 }, { "epoch": 0.5023446658851114, "grad_norm": 13.826160430908203, "learning_rate": 1.046815649293154e-05, "loss": 10.3383, "step": 2571 }, { "epoch": 0.5025400547088706, "grad_norm": 8.326482772827148, "learning_rate": 1.0461834586457398e-05, "loss": 9.0154, "step": 2572 }, { "epoch": 0.5027354435326299, "grad_norm": 10.193694114685059, "learning_rate": 1.0455512495004167e-05, "loss": 10.2128, "step": 2573 }, { "epoch": 0.5029308323563892, "grad_norm": 8.055763244628906, "learning_rate": 1.044919022110404e-05, "loss": 9.2993, "step": 2574 }, { "epoch": 0.5031262211801485, "grad_norm": 7.485759735107422, "learning_rate": 1.0442867767289282e-05, "loss": 8.1856, "step": 2575 }, { "epoch": 0.5033216100039077, "grad_norm": 11.85636043548584, "learning_rate": 1.0436545136092234e-05, "loss": 10.6178, "step": 2576 }, { "epoch": 0.5035169988276671, "grad_norm": 12.691350936889648, "learning_rate": 1.0430222330045306e-05, "loss": 9.6323, "step": 2577 }, { "epoch": 0.5037123876514263, "grad_norm": 7.107717514038086, "learning_rate": 1.0423899351680979e-05, "loss": 9.3238, "step": 2578 }, { "epoch": 0.5039077764751856, "grad_norm": 8.05132007598877, "learning_rate": 1.0417576203531801e-05, "loss": 9.8049, "step": 2579 }, { "epoch": 0.5041031652989449, "grad_norm": 8.098458290100098, "learning_rate": 1.0411252888130388e-05, "loss": 8.9939, "step": 2580 }, { "epoch": 0.5042985541227042, "grad_norm": 15.144835472106934, "learning_rate": 1.0404929408009425e-05, "loss": 9.4043, "step": 2581 }, { "epoch": 0.5044939429464634, "grad_norm": 8.488953590393066, "learning_rate": 1.0398605765701663e-05, "loss": 9.0053, "step": 2582 }, { "epoch": 0.5046893317702228, "grad_norm": 7.072780132293701, "learning_rate": 1.0392281963739918e-05, "loss": 8.9614, "step": 2583 }, { "epoch": 0.504884720593982, "grad_norm": 13.088480949401855, "learning_rate": 1.0385958004657069e-05, "loss": 9.5415, "step": 2584 }, { "epoch": 0.5050801094177413, "grad_norm": 10.07627010345459, "learning_rate": 1.0379633890986054e-05, "loss": 9.8575, "step": 2585 }, { "epoch": 0.5052754982415005, "grad_norm": 9.738160133361816, "learning_rate": 1.037330962525988e-05, "loss": 9.3236, "step": 2586 }, { "epoch": 0.5054708870652599, "grad_norm": 11.041217803955078, "learning_rate": 1.036698521001161e-05, "loss": 9.0937, "step": 2587 }, { "epoch": 0.5056662758890191, "grad_norm": 6.805891990661621, "learning_rate": 1.036066064777437e-05, "loss": 9.6693, "step": 2588 }, { "epoch": 0.5058616647127784, "grad_norm": 7.843379497528076, "learning_rate": 1.0354335941081344e-05, "loss": 8.8285, "step": 2589 }, { "epoch": 0.5060570535365377, "grad_norm": 9.05453872680664, "learning_rate": 1.0348011092465772e-05, "loss": 8.7001, "step": 2590 }, { "epoch": 0.506252442360297, "grad_norm": 7.629778861999512, "learning_rate": 1.0341686104460954e-05, "loss": 9.5514, "step": 2591 }, { "epoch": 0.5064478311840562, "grad_norm": 12.36606502532959, "learning_rate": 1.0335360979600244e-05, "loss": 9.2567, "step": 2592 }, { "epoch": 0.5066432200078156, "grad_norm": 9.524723052978516, "learning_rate": 1.0329035720417049e-05, "loss": 9.4839, "step": 2593 }, { "epoch": 0.5068386088315748, "grad_norm": 6.781744956970215, "learning_rate": 1.0322710329444831e-05, "loss": 8.4025, "step": 2594 }, { "epoch": 0.5070339976553341, "grad_norm": 7.558021068572998, "learning_rate": 1.0316384809217108e-05, "loss": 8.1913, "step": 2595 }, { "epoch": 0.5072293864790934, "grad_norm": 10.1206693649292, "learning_rate": 1.0310059162267448e-05, "loss": 8.9099, "step": 2596 }, { "epoch": 0.5074247753028527, "grad_norm": 10.372179985046387, "learning_rate": 1.0303733391129467e-05, "loss": 9.707, "step": 2597 }, { "epoch": 0.5076201641266119, "grad_norm": 8.356359481811523, "learning_rate": 1.0297407498336832e-05, "loss": 9.1372, "step": 2598 }, { "epoch": 0.5078155529503713, "grad_norm": 10.27808666229248, "learning_rate": 1.029108148642326e-05, "loss": 8.5818, "step": 2599 }, { "epoch": 0.5080109417741305, "grad_norm": 7.9865336418151855, "learning_rate": 1.0284755357922515e-05, "loss": 8.9857, "step": 2600 }, { "epoch": 0.5082063305978898, "grad_norm": 7.481165409088135, "learning_rate": 1.027842911536841e-05, "loss": 9.6992, "step": 2601 }, { "epoch": 0.5084017194216491, "grad_norm": 8.574689865112305, "learning_rate": 1.02721027612948e-05, "loss": 8.7809, "step": 2602 }, { "epoch": 0.5085971082454084, "grad_norm": 9.692758560180664, "learning_rate": 1.0265776298235586e-05, "loss": 9.4764, "step": 2603 }, { "epoch": 0.5087924970691676, "grad_norm": 9.18383502960205, "learning_rate": 1.0259449728724712e-05, "loss": 9.5221, "step": 2604 }, { "epoch": 0.5089878858929269, "grad_norm": 9.158015251159668, "learning_rate": 1.0253123055296167e-05, "loss": 8.7137, "step": 2605 }, { "epoch": 0.5091832747166862, "grad_norm": 9.627050399780273, "learning_rate": 1.0246796280483982e-05, "loss": 9.5247, "step": 2606 }, { "epoch": 0.5093786635404455, "grad_norm": 7.9211812019348145, "learning_rate": 1.0240469406822221e-05, "loss": 8.6351, "step": 2607 }, { "epoch": 0.5095740523642047, "grad_norm": 11.340261459350586, "learning_rate": 1.0234142436845e-05, "loss": 9.7535, "step": 2608 }, { "epoch": 0.5097694411879641, "grad_norm": 8.429910659790039, "learning_rate": 1.0227815373086463e-05, "loss": 8.8982, "step": 2609 }, { "epoch": 0.5099648300117233, "grad_norm": 14.658758163452148, "learning_rate": 1.0221488218080796e-05, "loss": 9.8136, "step": 2610 }, { "epoch": 0.5101602188354826, "grad_norm": 9.807266235351562, "learning_rate": 1.0215160974362224e-05, "loss": 9.6128, "step": 2611 }, { "epoch": 0.5103556076592419, "grad_norm": 8.956068992614746, "learning_rate": 1.0208833644464997e-05, "loss": 10.4443, "step": 2612 }, { "epoch": 0.5105509964830012, "grad_norm": 8.865155220031738, "learning_rate": 1.0202506230923417e-05, "loss": 8.4965, "step": 2613 }, { "epoch": 0.5107463853067604, "grad_norm": 9.319807052612305, "learning_rate": 1.0196178736271803e-05, "loss": 9.5889, "step": 2614 }, { "epoch": 0.5109417741305198, "grad_norm": 11.961516380310059, "learning_rate": 1.0189851163044511e-05, "loss": 8.8658, "step": 2615 }, { "epoch": 0.511137162954279, "grad_norm": 9.452198028564453, "learning_rate": 1.0183523513775935e-05, "loss": 8.5611, "step": 2616 }, { "epoch": 0.5113325517780383, "grad_norm": 7.758265972137451, "learning_rate": 1.0177195791000496e-05, "loss": 8.7061, "step": 2617 }, { "epoch": 0.5115279406017976, "grad_norm": 15.241104125976562, "learning_rate": 1.017086799725264e-05, "loss": 9.2781, "step": 2618 }, { "epoch": 0.5117233294255569, "grad_norm": 8.775419235229492, "learning_rate": 1.0164540135066846e-05, "loss": 8.6912, "step": 2619 }, { "epoch": 0.5119187182493161, "grad_norm": 10.642983436584473, "learning_rate": 1.015821220697762e-05, "loss": 9.1323, "step": 2620 }, { "epoch": 0.5121141070730755, "grad_norm": 8.464726448059082, "learning_rate": 1.0151884215519491e-05, "loss": 9.4481, "step": 2621 }, { "epoch": 0.5123094958968347, "grad_norm": 10.788630485534668, "learning_rate": 1.0145556163227021e-05, "loss": 8.8519, "step": 2622 }, { "epoch": 0.512504884720594, "grad_norm": 10.610641479492188, "learning_rate": 1.013922805263479e-05, "loss": 8.0585, "step": 2623 }, { "epoch": 0.5127002735443532, "grad_norm": 11.172237396240234, "learning_rate": 1.0132899886277398e-05, "loss": 10.3538, "step": 2624 }, { "epoch": 0.5128956623681126, "grad_norm": 7.80220890045166, "learning_rate": 1.0126571666689475e-05, "loss": 9.3621, "step": 2625 }, { "epoch": 0.5130910511918718, "grad_norm": 7.723788261413574, "learning_rate": 1.0120243396405676e-05, "loss": 8.9266, "step": 2626 }, { "epoch": 0.5132864400156311, "grad_norm": 14.545475959777832, "learning_rate": 1.011391507796066e-05, "loss": 10.129, "step": 2627 }, { "epoch": 0.5134818288393904, "grad_norm": 7.488824844360352, "learning_rate": 1.0107586713889117e-05, "loss": 9.7446, "step": 2628 }, { "epoch": 0.5136772176631497, "grad_norm": 8.845382690429688, "learning_rate": 1.0101258306725754e-05, "loss": 9.5032, "step": 2629 }, { "epoch": 0.5138726064869089, "grad_norm": 8.629199028015137, "learning_rate": 1.0094929859005299e-05, "loss": 9.31, "step": 2630 }, { "epoch": 0.5140679953106683, "grad_norm": 11.23619556427002, "learning_rate": 1.0088601373262486e-05, "loss": 9.6429, "step": 2631 }, { "epoch": 0.5142633841344275, "grad_norm": 11.032987594604492, "learning_rate": 1.0082272852032073e-05, "loss": 8.6772, "step": 2632 }, { "epoch": 0.5144587729581868, "grad_norm": 10.961579322814941, "learning_rate": 1.0075944297848827e-05, "loss": 8.5175, "step": 2633 }, { "epoch": 0.5146541617819461, "grad_norm": 7.2397894859313965, "learning_rate": 1.0069615713247534e-05, "loss": 8.9605, "step": 2634 }, { "epoch": 0.5148495506057054, "grad_norm": 7.577977180480957, "learning_rate": 1.0063287100762987e-05, "loss": 9.0421, "step": 2635 }, { "epoch": 0.5150449394294646, "grad_norm": 8.063026428222656, "learning_rate": 1.005695846292999e-05, "loss": 8.8904, "step": 2636 }, { "epoch": 0.515240328253224, "grad_norm": 10.646279335021973, "learning_rate": 1.0050629802283357e-05, "loss": 9.6499, "step": 2637 }, { "epoch": 0.5154357170769832, "grad_norm": 16.412921905517578, "learning_rate": 1.0044301121357923e-05, "loss": 9.8436, "step": 2638 }, { "epoch": 0.5156311059007425, "grad_norm": 10.519708633422852, "learning_rate": 1.003797242268851e-05, "loss": 7.6298, "step": 2639 }, { "epoch": 0.5158264947245017, "grad_norm": 9.18188762664795, "learning_rate": 1.0031643708809965e-05, "loss": 10.5935, "step": 2640 }, { "epoch": 0.5160218835482611, "grad_norm": 8.290685653686523, "learning_rate": 1.0025314982257131e-05, "loss": 8.5635, "step": 2641 }, { "epoch": 0.5162172723720203, "grad_norm": 11.62686538696289, "learning_rate": 1.0018986245564862e-05, "loss": 9.1133, "step": 2642 }, { "epoch": 0.5164126611957796, "grad_norm": 9.008712768554688, "learning_rate": 1.001265750126801e-05, "loss": 10.4709, "step": 2643 }, { "epoch": 0.5166080500195389, "grad_norm": 7.980377674102783, "learning_rate": 1.0006328751901438e-05, "loss": 9.1196, "step": 2644 }, { "epoch": 0.5168034388432982, "grad_norm": 12.153450012207031, "learning_rate": 1e-05, "loss": 9.0372, "step": 2645 }, { "epoch": 0.5169988276670574, "grad_norm": 9.671964645385742, "learning_rate": 9.993671248098567e-06, "loss": 9.165, "step": 2646 }, { "epoch": 0.5171942164908168, "grad_norm": 7.296621322631836, "learning_rate": 9.987342498731993e-06, "loss": 8.58, "step": 2647 }, { "epoch": 0.517389605314576, "grad_norm": 7.797286033630371, "learning_rate": 9.981013754435142e-06, "loss": 9.5087, "step": 2648 }, { "epoch": 0.5175849941383353, "grad_norm": 7.952873706817627, "learning_rate": 9.97468501774287e-06, "loss": 9.1935, "step": 2649 }, { "epoch": 0.5177803829620946, "grad_norm": 11.546721458435059, "learning_rate": 9.968356291190038e-06, "loss": 8.7435, "step": 2650 }, { "epoch": 0.5179757717858539, "grad_norm": 9.659433364868164, "learning_rate": 9.962027577311491e-06, "loss": 7.5912, "step": 2651 }, { "epoch": 0.5181711606096131, "grad_norm": 9.96871566772461, "learning_rate": 9.955698878642082e-06, "loss": 9.4127, "step": 2652 }, { "epoch": 0.5183665494333725, "grad_norm": 8.684602737426758, "learning_rate": 9.949370197716644e-06, "loss": 9.9006, "step": 2653 }, { "epoch": 0.5185619382571317, "grad_norm": 12.929977416992188, "learning_rate": 9.943041537070014e-06, "loss": 9.6474, "step": 2654 }, { "epoch": 0.518757327080891, "grad_norm": 10.658129692077637, "learning_rate": 9.936712899237018e-06, "loss": 9.7753, "step": 2655 }, { "epoch": 0.5189527159046503, "grad_norm": 33.21316146850586, "learning_rate": 9.930384286752469e-06, "loss": 7.8055, "step": 2656 }, { "epoch": 0.5191481047284096, "grad_norm": 7.257182598114014, "learning_rate": 9.924055702151176e-06, "loss": 9.2182, "step": 2657 }, { "epoch": 0.5193434935521688, "grad_norm": 8.233306884765625, "learning_rate": 9.91772714796793e-06, "loss": 9.055, "step": 2658 }, { "epoch": 0.519538882375928, "grad_norm": 8.045198440551758, "learning_rate": 9.911398626737517e-06, "loss": 8.596, "step": 2659 }, { "epoch": 0.5197342711996874, "grad_norm": 8.076264381408691, "learning_rate": 9.905070140994705e-06, "loss": 9.4418, "step": 2660 }, { "epoch": 0.5199296600234466, "grad_norm": 10.012219429016113, "learning_rate": 9.898741693274247e-06, "loss": 9.1936, "step": 2661 }, { "epoch": 0.5201250488472059, "grad_norm": 8.335098266601562, "learning_rate": 9.892413286110886e-06, "loss": 9.2575, "step": 2662 }, { "epoch": 0.5203204376709653, "grad_norm": 8.148777961730957, "learning_rate": 9.886084922039344e-06, "loss": 9.0273, "step": 2663 }, { "epoch": 0.5205158264947245, "grad_norm": 6.868932247161865, "learning_rate": 9.879756603594328e-06, "loss": 9.055, "step": 2664 }, { "epoch": 0.5207112153184837, "grad_norm": 8.413737297058105, "learning_rate": 9.873428333310527e-06, "loss": 9.4662, "step": 2665 }, { "epoch": 0.5209066041422431, "grad_norm": 8.817794799804688, "learning_rate": 9.867100113722605e-06, "loss": 9.1913, "step": 2666 }, { "epoch": 0.5211019929660023, "grad_norm": 10.635193824768066, "learning_rate": 9.860771947365214e-06, "loss": 9.1389, "step": 2667 }, { "epoch": 0.5212973817897616, "grad_norm": 7.9307026863098145, "learning_rate": 9.85444383677298e-06, "loss": 8.9327, "step": 2668 }, { "epoch": 0.521492770613521, "grad_norm": 8.434563636779785, "learning_rate": 9.84811578448051e-06, "loss": 9.2359, "step": 2669 }, { "epoch": 0.5216881594372802, "grad_norm": 7.498775482177734, "learning_rate": 9.841787793022383e-06, "loss": 9.6017, "step": 2670 }, { "epoch": 0.5218835482610394, "grad_norm": 10.593565940856934, "learning_rate": 9.835459864933156e-06, "loss": 10.0484, "step": 2671 }, { "epoch": 0.5220789370847988, "grad_norm": 7.587101459503174, "learning_rate": 9.829132002747363e-06, "loss": 8.3536, "step": 2672 }, { "epoch": 0.522274325908558, "grad_norm": 10.2027587890625, "learning_rate": 9.822804208999508e-06, "loss": 10.2716, "step": 2673 }, { "epoch": 0.5224697147323173, "grad_norm": 9.8612699508667, "learning_rate": 9.816476486224067e-06, "loss": 10.2911, "step": 2674 }, { "epoch": 0.5226651035560765, "grad_norm": 6.963353633880615, "learning_rate": 9.810148836955492e-06, "loss": 9.2523, "step": 2675 }, { "epoch": 0.5228604923798359, "grad_norm": 9.44482135772705, "learning_rate": 9.8038212637282e-06, "loss": 9.9173, "step": 2676 }, { "epoch": 0.5230558812035951, "grad_norm": 9.125898361206055, "learning_rate": 9.797493769076587e-06, "loss": 8.8245, "step": 2677 }, { "epoch": 0.5232512700273544, "grad_norm": 10.172958374023438, "learning_rate": 9.791166355535004e-06, "loss": 9.8728, "step": 2678 }, { "epoch": 0.5234466588511137, "grad_norm": 10.32078742980957, "learning_rate": 9.78483902563778e-06, "loss": 8.6425, "step": 2679 }, { "epoch": 0.523642047674873, "grad_norm": 11.205142974853516, "learning_rate": 9.778511781919205e-06, "loss": 10.0009, "step": 2680 }, { "epoch": 0.5238374364986322, "grad_norm": 7.736554145812988, "learning_rate": 9.772184626913537e-06, "loss": 8.3589, "step": 2681 }, { "epoch": 0.5240328253223916, "grad_norm": 8.641042709350586, "learning_rate": 9.765857563155e-06, "loss": 9.821, "step": 2682 }, { "epoch": 0.5242282141461508, "grad_norm": 8.605805397033691, "learning_rate": 9.759530593177777e-06, "loss": 9.7728, "step": 2683 }, { "epoch": 0.5244236029699101, "grad_norm": 8.84467887878418, "learning_rate": 9.75320371951602e-06, "loss": 8.7332, "step": 2684 }, { "epoch": 0.5246189917936694, "grad_norm": 10.527255058288574, "learning_rate": 9.746876944703833e-06, "loss": 9.2599, "step": 2685 }, { "epoch": 0.5248143806174287, "grad_norm": 10.367681503295898, "learning_rate": 9.740550271275287e-06, "loss": 8.7876, "step": 2686 }, { "epoch": 0.5250097694411879, "grad_norm": 8.358341217041016, "learning_rate": 9.734223701764415e-06, "loss": 8.8515, "step": 2687 }, { "epoch": 0.5252051582649473, "grad_norm": 9.358244895935059, "learning_rate": 9.7278972387052e-06, "loss": 9.2275, "step": 2688 }, { "epoch": 0.5254005470887065, "grad_norm": 9.502291679382324, "learning_rate": 9.721570884631591e-06, "loss": 9.2014, "step": 2689 }, { "epoch": 0.5255959359124658, "grad_norm": 9.14945125579834, "learning_rate": 9.715244642077485e-06, "loss": 9.0812, "step": 2690 }, { "epoch": 0.5257913247362251, "grad_norm": 10.896734237670898, "learning_rate": 9.708918513576742e-06, "loss": 9.2459, "step": 2691 }, { "epoch": 0.5259867135599844, "grad_norm": 8.33903980255127, "learning_rate": 9.702592501663172e-06, "loss": 9.0402, "step": 2692 }, { "epoch": 0.5261821023837436, "grad_norm": 8.217243194580078, "learning_rate": 9.696266608870536e-06, "loss": 8.9903, "step": 2693 }, { "epoch": 0.5263774912075029, "grad_norm": 9.530319213867188, "learning_rate": 9.689940837732554e-06, "loss": 8.7124, "step": 2694 }, { "epoch": 0.5265728800312622, "grad_norm": 7.9275360107421875, "learning_rate": 9.683615190782892e-06, "loss": 9.1352, "step": 2695 }, { "epoch": 0.5267682688550215, "grad_norm": 9.213961601257324, "learning_rate": 9.67728967055517e-06, "loss": 8.9705, "step": 2696 }, { "epoch": 0.5269636576787807, "grad_norm": 7.555400848388672, "learning_rate": 9.670964279582953e-06, "loss": 8.2625, "step": 2697 }, { "epoch": 0.5271590465025401, "grad_norm": 8.472102165222168, "learning_rate": 9.664639020399757e-06, "loss": 9.0743, "step": 2698 }, { "epoch": 0.5273544353262993, "grad_norm": 10.481719970703125, "learning_rate": 9.658313895539045e-06, "loss": 9.0627, "step": 2699 }, { "epoch": 0.5275498241500586, "grad_norm": 8.405017852783203, "learning_rate": 9.651988907534228e-06, "loss": 9.979, "step": 2700 }, { "epoch": 0.5277452129738179, "grad_norm": 7.347189903259277, "learning_rate": 9.645664058918656e-06, "loss": 9.4459, "step": 2701 }, { "epoch": 0.5279406017975772, "grad_norm": 6.290065765380859, "learning_rate": 9.63933935222563e-06, "loss": 8.7453, "step": 2702 }, { "epoch": 0.5281359906213364, "grad_norm": 7.323775291442871, "learning_rate": 9.633014789988391e-06, "loss": 8.7035, "step": 2703 }, { "epoch": 0.5283313794450958, "grad_norm": 11.120918273925781, "learning_rate": 9.626690374740124e-06, "loss": 9.9083, "step": 2704 }, { "epoch": 0.528526768268855, "grad_norm": 8.071630477905273, "learning_rate": 9.620366109013949e-06, "loss": 9.4713, "step": 2705 }, { "epoch": 0.5287221570926143, "grad_norm": 7.903960227966309, "learning_rate": 9.614041995342935e-06, "loss": 8.9713, "step": 2706 }, { "epoch": 0.5289175459163736, "grad_norm": 9.752288818359375, "learning_rate": 9.607718036260083e-06, "loss": 8.2165, "step": 2707 }, { "epoch": 0.5291129347401329, "grad_norm": 7.624960899353027, "learning_rate": 9.601394234298337e-06, "loss": 8.3963, "step": 2708 }, { "epoch": 0.5293083235638921, "grad_norm": 9.899323463439941, "learning_rate": 9.595070591990575e-06, "loss": 9.0401, "step": 2709 }, { "epoch": 0.5295037123876515, "grad_norm": 7.964188098907471, "learning_rate": 9.588747111869614e-06, "loss": 9.2519, "step": 2710 }, { "epoch": 0.5296991012114107, "grad_norm": 6.899359703063965, "learning_rate": 9.5824237964682e-06, "loss": 8.4557, "step": 2711 }, { "epoch": 0.52989449003517, "grad_norm": 6.853302955627441, "learning_rate": 9.576100648319023e-06, "loss": 8.5504, "step": 2712 }, { "epoch": 0.5300898788589292, "grad_norm": 10.143784523010254, "learning_rate": 9.569777669954694e-06, "loss": 9.248, "step": 2713 }, { "epoch": 0.5302852676826886, "grad_norm": 10.853752136230469, "learning_rate": 9.563454863907771e-06, "loss": 9.5435, "step": 2714 }, { "epoch": 0.5304806565064478, "grad_norm": 9.94910717010498, "learning_rate": 9.557132232710725e-06, "loss": 9.2622, "step": 2715 }, { "epoch": 0.5306760453302071, "grad_norm": 8.490160942077637, "learning_rate": 9.550809778895967e-06, "loss": 8.1029, "step": 2716 }, { "epoch": 0.5308714341539664, "grad_norm": 8.913311958312988, "learning_rate": 9.544487504995838e-06, "loss": 9.7678, "step": 2717 }, { "epoch": 0.5310668229777257, "grad_norm": 7.795229911804199, "learning_rate": 9.538165413542607e-06, "loss": 10.2682, "step": 2718 }, { "epoch": 0.5312622118014849, "grad_norm": 9.823403358459473, "learning_rate": 9.531843507068466e-06, "loss": 9.1957, "step": 2719 }, { "epoch": 0.5314576006252443, "grad_norm": 8.27929401397705, "learning_rate": 9.525521788105534e-06, "loss": 9.0988, "step": 2720 }, { "epoch": 0.5316529894490035, "grad_norm": 12.38388729095459, "learning_rate": 9.519200259185856e-06, "loss": 9.1459, "step": 2721 }, { "epoch": 0.5318483782727628, "grad_norm": 7.560203552246094, "learning_rate": 9.512878922841403e-06, "loss": 8.4563, "step": 2722 }, { "epoch": 0.5320437670965221, "grad_norm": 6.650539398193359, "learning_rate": 9.506557781604067e-06, "loss": 9.0784, "step": 2723 }, { "epoch": 0.5322391559202814, "grad_norm": 8.1847505569458, "learning_rate": 9.500236838005656e-06, "loss": 9.5058, "step": 2724 }, { "epoch": 0.5324345447440406, "grad_norm": 6.476805686950684, "learning_rate": 9.493916094577908e-06, "loss": 8.3882, "step": 2725 }, { "epoch": 0.5326299335678, "grad_norm": 14.108317375183105, "learning_rate": 9.487595553852476e-06, "loss": 9.6569, "step": 2726 }, { "epoch": 0.5328253223915592, "grad_norm": 8.654928207397461, "learning_rate": 9.481275218360933e-06, "loss": 9.746, "step": 2727 }, { "epoch": 0.5330207112153185, "grad_norm": 7.9840521812438965, "learning_rate": 9.474955090634768e-06, "loss": 9.4968, "step": 2728 }, { "epoch": 0.5332161000390777, "grad_norm": 9.199403762817383, "learning_rate": 9.468635173205392e-06, "loss": 9.8293, "step": 2729 }, { "epoch": 0.5334114888628371, "grad_norm": 7.720449447631836, "learning_rate": 9.462315468604126e-06, "loss": 8.8497, "step": 2730 }, { "epoch": 0.5336068776865963, "grad_norm": 8.197383880615234, "learning_rate": 9.455995979362206e-06, "loss": 9.5901, "step": 2731 }, { "epoch": 0.5338022665103556, "grad_norm": 7.391048908233643, "learning_rate": 9.449676708010788e-06, "loss": 8.9208, "step": 2732 }, { "epoch": 0.5339976553341149, "grad_norm": 7.571177005767822, "learning_rate": 9.443357657080932e-06, "loss": 8.9337, "step": 2733 }, { "epoch": 0.5341930441578742, "grad_norm": 6.633815288543701, "learning_rate": 9.437038829103618e-06, "loss": 9.1504, "step": 2734 }, { "epoch": 0.5343884329816334, "grad_norm": 15.273905754089355, "learning_rate": 9.43072022660973e-06, "loss": 10.3509, "step": 2735 }, { "epoch": 0.5345838218053928, "grad_norm": 20.139923095703125, "learning_rate": 9.424401852130068e-06, "loss": 10.224, "step": 2736 }, { "epoch": 0.534779210629152, "grad_norm": 9.288772583007812, "learning_rate": 9.418083708195332e-06, "loss": 9.6518, "step": 2737 }, { "epoch": 0.5349745994529113, "grad_norm": 9.217354774475098, "learning_rate": 9.411765797336136e-06, "loss": 8.8565, "step": 2738 }, { "epoch": 0.5351699882766706, "grad_norm": 9.57825756072998, "learning_rate": 9.405448122083002e-06, "loss": 9.4921, "step": 2739 }, { "epoch": 0.5353653771004299, "grad_norm": 8.773568153381348, "learning_rate": 9.399130684966354e-06, "loss": 9.2346, "step": 2740 }, { "epoch": 0.5355607659241891, "grad_norm": 7.933095932006836, "learning_rate": 9.392813488516521e-06, "loss": 10.1161, "step": 2741 }, { "epoch": 0.5357561547479485, "grad_norm": 8.663396835327148, "learning_rate": 9.386496535263736e-06, "loss": 8.5159, "step": 2742 }, { "epoch": 0.5359515435717077, "grad_norm": 9.957633972167969, "learning_rate": 9.380179827738138e-06, "loss": 9.9456, "step": 2743 }, { "epoch": 0.536146932395467, "grad_norm": 9.319618225097656, "learning_rate": 9.37386336846976e-06, "loss": 9.237, "step": 2744 }, { "epoch": 0.5363423212192263, "grad_norm": 8.81218433380127, "learning_rate": 9.367547159988542e-06, "loss": 8.0087, "step": 2745 }, { "epoch": 0.5365377100429856, "grad_norm": 9.204264640808105, "learning_rate": 9.361231204824324e-06, "loss": 9.5934, "step": 2746 }, { "epoch": 0.5367330988667448, "grad_norm": 6.535369396209717, "learning_rate": 9.354915505506839e-06, "loss": 8.9945, "step": 2747 }, { "epoch": 0.536928487690504, "grad_norm": 9.098174095153809, "learning_rate": 9.34860006456572e-06, "loss": 9.3909, "step": 2748 }, { "epoch": 0.5371238765142634, "grad_norm": 8.996417045593262, "learning_rate": 9.342284884530504e-06, "loss": 8.5684, "step": 2749 }, { "epoch": 0.5373192653380227, "grad_norm": 14.172801971435547, "learning_rate": 9.335969967930607e-06, "loss": 9.9654, "step": 2750 }, { "epoch": 0.5375146541617819, "grad_norm": 11.021417617797852, "learning_rate": 9.329655317295354e-06, "loss": 9.4326, "step": 2751 }, { "epoch": 0.5377100429855413, "grad_norm": 12.336243629455566, "learning_rate": 9.323340935153958e-06, "loss": 9.0942, "step": 2752 }, { "epoch": 0.5379054318093005, "grad_norm": 8.203696250915527, "learning_rate": 9.317026824035525e-06, "loss": 8.7552, "step": 2753 }, { "epoch": 0.5381008206330598, "grad_norm": 8.273969650268555, "learning_rate": 9.310712986469055e-06, "loss": 9.7479, "step": 2754 }, { "epoch": 0.5382962094568191, "grad_norm": 7.83505392074585, "learning_rate": 9.30439942498343e-06, "loss": 9.5113, "step": 2755 }, { "epoch": 0.5384915982805784, "grad_norm": 10.40118408203125, "learning_rate": 9.298086142107436e-06, "loss": 9.2196, "step": 2756 }, { "epoch": 0.5386869871043376, "grad_norm": 9.551358222961426, "learning_rate": 9.291773140369732e-06, "loss": 8.7518, "step": 2757 }, { "epoch": 0.538882375928097, "grad_norm": 8.112876892089844, "learning_rate": 9.28546042229887e-06, "loss": 7.146, "step": 2758 }, { "epoch": 0.5390777647518562, "grad_norm": 10.979324340820312, "learning_rate": 9.279147990423297e-06, "loss": 9.6383, "step": 2759 }, { "epoch": 0.5392731535756154, "grad_norm": 9.588387489318848, "learning_rate": 9.272835847271332e-06, "loss": 10.5762, "step": 2760 }, { "epoch": 0.5394685423993748, "grad_norm": 8.50763988494873, "learning_rate": 9.266523995371186e-06, "loss": 8.8917, "step": 2761 }, { "epoch": 0.539663931223134, "grad_norm": 9.856245040893555, "learning_rate": 9.260212437250955e-06, "loss": 9.8669, "step": 2762 }, { "epoch": 0.5398593200468933, "grad_norm": 8.32020092010498, "learning_rate": 9.253901175438607e-06, "loss": 8.4172, "step": 2763 }, { "epoch": 0.5400547088706525, "grad_norm": 8.922029495239258, "learning_rate": 9.247590212462001e-06, "loss": 9.3984, "step": 2764 }, { "epoch": 0.5402500976944119, "grad_norm": 6.328679084777832, "learning_rate": 9.241279550848877e-06, "loss": 8.091, "step": 2765 }, { "epoch": 0.5404454865181711, "grad_norm": 7.473681449890137, "learning_rate": 9.234969193126847e-06, "loss": 8.7435, "step": 2766 }, { "epoch": 0.5406408753419304, "grad_norm": 7.918021202087402, "learning_rate": 9.228659141823407e-06, "loss": 8.6152, "step": 2767 }, { "epoch": 0.5408362641656898, "grad_norm": 7.37488317489624, "learning_rate": 9.222349399465927e-06, "loss": 8.4447, "step": 2768 }, { "epoch": 0.541031652989449, "grad_norm": 10.067622184753418, "learning_rate": 9.216039968581656e-06, "loss": 9.1785, "step": 2769 }, { "epoch": 0.5412270418132082, "grad_norm": 10.376371383666992, "learning_rate": 9.209730851697718e-06, "loss": 10.3289, "step": 2770 }, { "epoch": 0.5414224306369676, "grad_norm": 7.040582180023193, "learning_rate": 9.203422051341109e-06, "loss": 8.9531, "step": 2771 }, { "epoch": 0.5416178194607268, "grad_norm": 7.681628704071045, "learning_rate": 9.197113570038699e-06, "loss": 9.2074, "step": 2772 }, { "epoch": 0.5418132082844861, "grad_norm": 8.371284484863281, "learning_rate": 9.190805410317231e-06, "loss": 9.2613, "step": 2773 }, { "epoch": 0.5420085971082454, "grad_norm": 8.331684112548828, "learning_rate": 9.184497574703318e-06, "loss": 9.1944, "step": 2774 }, { "epoch": 0.5422039859320047, "grad_norm": 10.406688690185547, "learning_rate": 9.178190065723447e-06, "loss": 9.0204, "step": 2775 }, { "epoch": 0.5423993747557639, "grad_norm": 10.491332054138184, "learning_rate": 9.171882885903967e-06, "loss": 9.5959, "step": 2776 }, { "epoch": 0.5425947635795233, "grad_norm": 7.148529529571533, "learning_rate": 9.165576037771102e-06, "loss": 8.2787, "step": 2777 }, { "epoch": 0.5427901524032825, "grad_norm": 6.761819362640381, "learning_rate": 9.159269523850939e-06, "loss": 9.3, "step": 2778 }, { "epoch": 0.5429855412270418, "grad_norm": 8.8622407913208, "learning_rate": 9.152963346669432e-06, "loss": 9.5788, "step": 2779 }, { "epoch": 0.5431809300508011, "grad_norm": 9.01783561706543, "learning_rate": 9.146657508752404e-06, "loss": 9.5756, "step": 2780 }, { "epoch": 0.5433763188745604, "grad_norm": 7.692517280578613, "learning_rate": 9.140352012625538e-06, "loss": 7.3885, "step": 2781 }, { "epoch": 0.5435717076983196, "grad_norm": 8.276142120361328, "learning_rate": 9.134046860814377e-06, "loss": 8.3795, "step": 2782 }, { "epoch": 0.5437670965220789, "grad_norm": 7.867055416107178, "learning_rate": 9.127742055844334e-06, "loss": 9.4628, "step": 2783 }, { "epoch": 0.5439624853458382, "grad_norm": 8.413628578186035, "learning_rate": 9.121437600240679e-06, "loss": 8.7052, "step": 2784 }, { "epoch": 0.5441578741695975, "grad_norm": 8.16491413116455, "learning_rate": 9.11513349652854e-06, "loss": 10.1515, "step": 2785 }, { "epoch": 0.5443532629933567, "grad_norm": 7.399144649505615, "learning_rate": 9.108829747232909e-06, "loss": 8.2059, "step": 2786 }, { "epoch": 0.5445486518171161, "grad_norm": 8.344947814941406, "learning_rate": 9.102526354878632e-06, "loss": 9.4465, "step": 2787 }, { "epoch": 0.5447440406408753, "grad_norm": 7.7732157707214355, "learning_rate": 9.096223321990414e-06, "loss": 9.7633, "step": 2788 }, { "epoch": 0.5449394294646346, "grad_norm": 9.581931114196777, "learning_rate": 9.089920651092812e-06, "loss": 8.6713, "step": 2789 }, { "epoch": 0.5451348182883939, "grad_norm": 10.937223434448242, "learning_rate": 9.083618344710246e-06, "loss": 9.332, "step": 2790 }, { "epoch": 0.5453302071121532, "grad_norm": 8.541001319885254, "learning_rate": 9.07731640536698e-06, "loss": 9.2417, "step": 2791 }, { "epoch": 0.5455255959359124, "grad_norm": 8.981481552124023, "learning_rate": 9.071014835587144e-06, "loss": 9.0391, "step": 2792 }, { "epoch": 0.5457209847596718, "grad_norm": 7.546681880950928, "learning_rate": 9.064713637894706e-06, "loss": 8.8833, "step": 2793 }, { "epoch": 0.545916373583431, "grad_norm": 10.02629280090332, "learning_rate": 9.058412814813493e-06, "loss": 10.0756, "step": 2794 }, { "epoch": 0.5461117624071903, "grad_norm": 8.090208053588867, "learning_rate": 9.05211236886718e-06, "loss": 9.1745, "step": 2795 }, { "epoch": 0.5463071512309496, "grad_norm": 10.446197509765625, "learning_rate": 9.045812302579293e-06, "loss": 9.5297, "step": 2796 }, { "epoch": 0.5465025400547089, "grad_norm": 7.993022441864014, "learning_rate": 9.039512618473202e-06, "loss": 9.2405, "step": 2797 }, { "epoch": 0.5466979288784681, "grad_norm": 7.007857799530029, "learning_rate": 9.033213319072126e-06, "loss": 8.2401, "step": 2798 }, { "epoch": 0.5468933177022274, "grad_norm": 7.0248260498046875, "learning_rate": 9.026914406899134e-06, "loss": 8.3184, "step": 2799 }, { "epoch": 0.5470887065259867, "grad_norm": 7.492818355560303, "learning_rate": 9.02061588447713e-06, "loss": 8.7988, "step": 2800 }, { "epoch": 0.547284095349746, "grad_norm": 8.054252624511719, "learning_rate": 9.014317754328873e-06, "loss": 9.4373, "step": 2801 }, { "epoch": 0.5474794841735052, "grad_norm": 8.825780868530273, "learning_rate": 9.008020018976959e-06, "loss": 8.9278, "step": 2802 }, { "epoch": 0.5476748729972646, "grad_norm": 7.805006980895996, "learning_rate": 9.00172268094382e-06, "loss": 9.6183, "step": 2803 }, { "epoch": 0.5478702618210238, "grad_norm": 8.974769592285156, "learning_rate": 8.995425742751743e-06, "loss": 9.6281, "step": 2804 }, { "epoch": 0.5480656506447831, "grad_norm": 9.293949127197266, "learning_rate": 8.989129206922844e-06, "loss": 7.8442, "step": 2805 }, { "epoch": 0.5482610394685424, "grad_norm": 8.424934387207031, "learning_rate": 8.98283307597908e-06, "loss": 8.1946, "step": 2806 }, { "epoch": 0.5484564282923017, "grad_norm": 8.770522117614746, "learning_rate": 8.97653735244225e-06, "loss": 9.0785, "step": 2807 }, { "epoch": 0.5486518171160609, "grad_norm": 8.615560531616211, "learning_rate": 8.970242038833985e-06, "loss": 9.4919, "step": 2808 }, { "epoch": 0.5488472059398203, "grad_norm": 10.273186683654785, "learning_rate": 8.963947137675754e-06, "loss": 9.844, "step": 2809 }, { "epoch": 0.5490425947635795, "grad_norm": 9.942178726196289, "learning_rate": 8.95765265148886e-06, "loss": 9.7945, "step": 2810 }, { "epoch": 0.5492379835873388, "grad_norm": 9.615091323852539, "learning_rate": 8.951358582794438e-06, "loss": 9.2248, "step": 2811 }, { "epoch": 0.5494333724110981, "grad_norm": 8.68079948425293, "learning_rate": 8.945064934113464e-06, "loss": 9.2415, "step": 2812 }, { "epoch": 0.5496287612348574, "grad_norm": 10.706026077270508, "learning_rate": 8.938771707966733e-06, "loss": 9.3499, "step": 2813 }, { "epoch": 0.5498241500586166, "grad_norm": 7.729515075683594, "learning_rate": 8.93247890687488e-06, "loss": 8.0236, "step": 2814 }, { "epoch": 0.550019538882376, "grad_norm": 10.87379264831543, "learning_rate": 8.92618653335837e-06, "loss": 10.2506, "step": 2815 }, { "epoch": 0.5502149277061352, "grad_norm": 7.861601829528809, "learning_rate": 8.919894589937492e-06, "loss": 8.2541, "step": 2816 }, { "epoch": 0.5504103165298945, "grad_norm": 9.677508354187012, "learning_rate": 8.913603079132361e-06, "loss": 9.235, "step": 2817 }, { "epoch": 0.5506057053536537, "grad_norm": 8.08492660522461, "learning_rate": 8.907312003462926e-06, "loss": 9.1055, "step": 2818 }, { "epoch": 0.5508010941774131, "grad_norm": 7.463259696960449, "learning_rate": 8.901021365448959e-06, "loss": 8.8301, "step": 2819 }, { "epoch": 0.5509964830011723, "grad_norm": 8.580756187438965, "learning_rate": 8.894731167610054e-06, "loss": 9.5622, "step": 2820 }, { "epoch": 0.5511918718249316, "grad_norm": 9.969331741333008, "learning_rate": 8.888441412465631e-06, "loss": 9.2752, "step": 2821 }, { "epoch": 0.5513872606486909, "grad_norm": 6.705500602722168, "learning_rate": 8.88215210253493e-06, "loss": 9.0879, "step": 2822 }, { "epoch": 0.5515826494724502, "grad_norm": 8.789013862609863, "learning_rate": 8.87586324033702e-06, "loss": 10.0995, "step": 2823 }, { "epoch": 0.5517780382962094, "grad_norm": 8.352804183959961, "learning_rate": 8.869574828390782e-06, "loss": 9.2406, "step": 2824 }, { "epoch": 0.5519734271199688, "grad_norm": 9.722862243652344, "learning_rate": 8.86328686921492e-06, "loss": 8.4509, "step": 2825 }, { "epoch": 0.552168815943728, "grad_norm": 45.539649963378906, "learning_rate": 8.856999365327959e-06, "loss": 9.6746, "step": 2826 }, { "epoch": 0.5523642047674873, "grad_norm": 7.403289794921875, "learning_rate": 8.850712319248237e-06, "loss": 9.0779, "step": 2827 }, { "epoch": 0.5525595935912466, "grad_norm": 9.596833229064941, "learning_rate": 8.844425733493915e-06, "loss": 10.3165, "step": 2828 }, { "epoch": 0.5527549824150059, "grad_norm": 7.67540168762207, "learning_rate": 8.83813961058296e-06, "loss": 9.3652, "step": 2829 }, { "epoch": 0.5529503712387651, "grad_norm": 7.21089506149292, "learning_rate": 8.831853953033165e-06, "loss": 8.4772, "step": 2830 }, { "epoch": 0.5531457600625245, "grad_norm": 7.019738674163818, "learning_rate": 8.825568763362126e-06, "loss": 9.5753, "step": 2831 }, { "epoch": 0.5533411488862837, "grad_norm": 7.952762126922607, "learning_rate": 8.81928404408726e-06, "loss": 9.3136, "step": 2832 }, { "epoch": 0.553536537710043, "grad_norm": 9.55434513092041, "learning_rate": 8.812999797725793e-06, "loss": 9.0351, "step": 2833 }, { "epoch": 0.5537319265338023, "grad_norm": 7.781782627105713, "learning_rate": 8.806716026794758e-06, "loss": 8.1702, "step": 2834 }, { "epoch": 0.5539273153575616, "grad_norm": 7.829460144042969, "learning_rate": 8.800432733811e-06, "loss": 8.661, "step": 2835 }, { "epoch": 0.5541227041813208, "grad_norm": 8.869157791137695, "learning_rate": 8.794149921291176e-06, "loss": 9.5003, "step": 2836 }, { "epoch": 0.5543180930050801, "grad_norm": 7.07666540145874, "learning_rate": 8.787867591751746e-06, "loss": 8.7779, "step": 2837 }, { "epoch": 0.5545134818288394, "grad_norm": 7.235918045043945, "learning_rate": 8.781585747708976e-06, "loss": 9.5523, "step": 2838 }, { "epoch": 0.5547088706525987, "grad_norm": 8.273935317993164, "learning_rate": 8.775304391678943e-06, "loss": 10.2941, "step": 2839 }, { "epoch": 0.5549042594763579, "grad_norm": 12.045279502868652, "learning_rate": 8.769023526177524e-06, "loss": 9.8395, "step": 2840 }, { "epoch": 0.5550996483001173, "grad_norm": 10.213648796081543, "learning_rate": 8.7627431537204e-06, "loss": 9.3265, "step": 2841 }, { "epoch": 0.5552950371238765, "grad_norm": 9.113309860229492, "learning_rate": 8.756463276823054e-06, "loss": 9.5764, "step": 2842 }, { "epoch": 0.5554904259476358, "grad_norm": 9.318734169006348, "learning_rate": 8.750183898000775e-06, "loss": 9.885, "step": 2843 }, { "epoch": 0.5556858147713951, "grad_norm": 9.162623405456543, "learning_rate": 8.743905019768646e-06, "loss": 10.3492, "step": 2844 }, { "epoch": 0.5558812035951544, "grad_norm": 7.756526470184326, "learning_rate": 8.737626644641553e-06, "loss": 8.9615, "step": 2845 }, { "epoch": 0.5560765924189136, "grad_norm": 18.73146629333496, "learning_rate": 8.731348775134182e-06, "loss": 8.7326, "step": 2846 }, { "epoch": 0.556271981242673, "grad_norm": 18.97466468811035, "learning_rate": 8.725071413761013e-06, "loss": 9.347, "step": 2847 }, { "epoch": 0.5564673700664322, "grad_norm": 7.583729267120361, "learning_rate": 8.718794563036326e-06, "loss": 8.5347, "step": 2848 }, { "epoch": 0.5566627588901915, "grad_norm": 9.999347686767578, "learning_rate": 8.712518225474191e-06, "loss": 8.9671, "step": 2849 }, { "epoch": 0.5568581477139508, "grad_norm": 8.69705867767334, "learning_rate": 8.70624240358848e-06, "loss": 9.2673, "step": 2850 }, { "epoch": 0.5570535365377101, "grad_norm": 7.3665618896484375, "learning_rate": 8.699967099892851e-06, "loss": 9.4397, "step": 2851 }, { "epoch": 0.5572489253614693, "grad_norm": 8.915785789489746, "learning_rate": 8.693692316900768e-06, "loss": 9.6176, "step": 2852 }, { "epoch": 0.5574443141852286, "grad_norm": 8.13288402557373, "learning_rate": 8.68741805712546e-06, "loss": 9.3996, "step": 2853 }, { "epoch": 0.5576397030089879, "grad_norm": 6.944223403930664, "learning_rate": 8.681144323079973e-06, "loss": 9.267, "step": 2854 }, { "epoch": 0.5578350918327472, "grad_norm": 9.68083667755127, "learning_rate": 8.67487111727713e-06, "loss": 8.8771, "step": 2855 }, { "epoch": 0.5580304806565064, "grad_norm": 9.229497909545898, "learning_rate": 8.668598442229545e-06, "loss": 9.5669, "step": 2856 }, { "epoch": 0.5582258694802658, "grad_norm": 8.748711585998535, "learning_rate": 8.66232630044962e-06, "loss": 9.3087, "step": 2857 }, { "epoch": 0.558421258304025, "grad_norm": 8.638025283813477, "learning_rate": 8.656054694449541e-06, "loss": 8.0628, "step": 2858 }, { "epoch": 0.5586166471277842, "grad_norm": 11.083617210388184, "learning_rate": 8.64978362674128e-06, "loss": 9.7794, "step": 2859 }, { "epoch": 0.5588120359515436, "grad_norm": 12.017905235290527, "learning_rate": 8.6435130998366e-06, "loss": 8.9731, "step": 2860 }, { "epoch": 0.5590074247753029, "grad_norm": 8.4308500289917, "learning_rate": 8.637243116247036e-06, "loss": 7.6834, "step": 2861 }, { "epoch": 0.5592028135990621, "grad_norm": 8.182673454284668, "learning_rate": 8.630973678483911e-06, "loss": 8.5101, "step": 2862 }, { "epoch": 0.5593982024228215, "grad_norm": 8.208181381225586, "learning_rate": 8.624704789058331e-06, "loss": 9.0657, "step": 2863 }, { "epoch": 0.5595935912465807, "grad_norm": 10.765786170959473, "learning_rate": 8.618436450481182e-06, "loss": 9.9012, "step": 2864 }, { "epoch": 0.55978898007034, "grad_norm": 10.154743194580078, "learning_rate": 8.612168665263125e-06, "loss": 7.8456, "step": 2865 }, { "epoch": 0.5599843688940993, "grad_norm": 13.1641263961792, "learning_rate": 8.605901435914608e-06, "loss": 9.3338, "step": 2866 }, { "epoch": 0.5601797577178586, "grad_norm": 7.61334228515625, "learning_rate": 8.599634764945845e-06, "loss": 7.9552, "step": 2867 }, { "epoch": 0.5603751465416178, "grad_norm": 9.793645858764648, "learning_rate": 8.593368654866835e-06, "loss": 8.9891, "step": 2868 }, { "epoch": 0.5605705353653772, "grad_norm": 8.298504829406738, "learning_rate": 8.587103108187352e-06, "loss": 8.6191, "step": 2869 }, { "epoch": 0.5607659241891364, "grad_norm": 10.942178726196289, "learning_rate": 8.580838127416937e-06, "loss": 9.8686, "step": 2870 }, { "epoch": 0.5609613130128956, "grad_norm": 8.616482734680176, "learning_rate": 8.574573715064913e-06, "loss": 9.1715, "step": 2871 }, { "epoch": 0.5611567018366549, "grad_norm": 16.707321166992188, "learning_rate": 8.56830987364037e-06, "loss": 9.2851, "step": 2872 }, { "epoch": 0.5613520906604142, "grad_norm": 8.741340637207031, "learning_rate": 8.562046605652174e-06, "loss": 10.2423, "step": 2873 }, { "epoch": 0.5615474794841735, "grad_norm": 9.816158294677734, "learning_rate": 8.555783913608955e-06, "loss": 9.1263, "step": 2874 }, { "epoch": 0.5617428683079327, "grad_norm": 12.084359169006348, "learning_rate": 8.549521800019115e-06, "loss": 9.1611, "step": 2875 }, { "epoch": 0.5619382571316921, "grad_norm": 7.766233921051025, "learning_rate": 8.543260267390824e-06, "loss": 9.5226, "step": 2876 }, { "epoch": 0.5621336459554513, "grad_norm": 9.25195026397705, "learning_rate": 8.536999318232025e-06, "loss": 9.8658, "step": 2877 }, { "epoch": 0.5623290347792106, "grad_norm": 7.865414619445801, "learning_rate": 8.53073895505042e-06, "loss": 7.745, "step": 2878 }, { "epoch": 0.56252442360297, "grad_norm": 9.51463508605957, "learning_rate": 8.52447918035348e-06, "loss": 9.9384, "step": 2879 }, { "epoch": 0.5627198124267292, "grad_norm": 11.417374610900879, "learning_rate": 8.518219996648435e-06, "loss": 9.4388, "step": 2880 }, { "epoch": 0.5629152012504884, "grad_norm": 10.57020092010498, "learning_rate": 8.511961406442286e-06, "loss": 8.6496, "step": 2881 }, { "epoch": 0.5631105900742478, "grad_norm": 20.026182174682617, "learning_rate": 8.505703412241793e-06, "loss": 10.1787, "step": 2882 }, { "epoch": 0.563305978898007, "grad_norm": 10.828873634338379, "learning_rate": 8.499446016553475e-06, "loss": 9.5856, "step": 2883 }, { "epoch": 0.5635013677217663, "grad_norm": 9.27635383605957, "learning_rate": 8.493189221883615e-06, "loss": 7.8634, "step": 2884 }, { "epoch": 0.5636967565455256, "grad_norm": 15.371089935302734, "learning_rate": 8.486933030738252e-06, "loss": 9.4457, "step": 2885 }, { "epoch": 0.5638921453692849, "grad_norm": 8.049420356750488, "learning_rate": 8.480677445623187e-06, "loss": 9.6421, "step": 2886 }, { "epoch": 0.5640875341930441, "grad_norm": 11.85643482208252, "learning_rate": 8.474422469043974e-06, "loss": 9.5314, "step": 2887 }, { "epoch": 0.5642829230168034, "grad_norm": 9.827743530273438, "learning_rate": 8.468168103505926e-06, "loss": 9.3676, "step": 2888 }, { "epoch": 0.5644783118405627, "grad_norm": 7.995309352874756, "learning_rate": 8.461914351514109e-06, "loss": 8.5075, "step": 2889 }, { "epoch": 0.564673700664322, "grad_norm": 9.013062477111816, "learning_rate": 8.455661215573345e-06, "loss": 8.5168, "step": 2890 }, { "epoch": 0.5648690894880812, "grad_norm": 12.336776733398438, "learning_rate": 8.449408698188212e-06, "loss": 10.6045, "step": 2891 }, { "epoch": 0.5650644783118406, "grad_norm": 8.412564277648926, "learning_rate": 8.443156801863037e-06, "loss": 8.6074, "step": 2892 }, { "epoch": 0.5652598671355998, "grad_norm": 6.834690093994141, "learning_rate": 8.436905529101896e-06, "loss": 8.5053, "step": 2893 }, { "epoch": 0.5654552559593591, "grad_norm": 7.084744930267334, "learning_rate": 8.430654882408619e-06, "loss": 8.0449, "step": 2894 }, { "epoch": 0.5656506447831184, "grad_norm": 8.889432907104492, "learning_rate": 8.424404864286784e-06, "loss": 8.8036, "step": 2895 }, { "epoch": 0.5658460336068777, "grad_norm": 8.221407890319824, "learning_rate": 8.418155477239718e-06, "loss": 9.1227, "step": 2896 }, { "epoch": 0.5660414224306369, "grad_norm": 8.30092716217041, "learning_rate": 8.411906723770493e-06, "loss": 8.8562, "step": 2897 }, { "epoch": 0.5662368112543963, "grad_norm": 8.753860473632812, "learning_rate": 8.40565860638193e-06, "loss": 8.9539, "step": 2898 }, { "epoch": 0.5664322000781555, "grad_norm": 9.127002716064453, "learning_rate": 8.399411127576597e-06, "loss": 9.6091, "step": 2899 }, { "epoch": 0.5666275889019148, "grad_norm": 9.005349159240723, "learning_rate": 8.393164289856797e-06, "loss": 9.0778, "step": 2900 }, { "epoch": 0.5668229777256741, "grad_norm": 8.229484558105469, "learning_rate": 8.386918095724584e-06, "loss": 9.9055, "step": 2901 }, { "epoch": 0.5670183665494334, "grad_norm": 6.597065448760986, "learning_rate": 8.380672547681757e-06, "loss": 8.0167, "step": 2902 }, { "epoch": 0.5672137553731926, "grad_norm": 9.146075248718262, "learning_rate": 8.374427648229845e-06, "loss": 8.7352, "step": 2903 }, { "epoch": 0.567409144196952, "grad_norm": 9.293710708618164, "learning_rate": 8.368183399870132e-06, "loss": 9.692, "step": 2904 }, { "epoch": 0.5676045330207112, "grad_norm": 7.453103542327881, "learning_rate": 8.361939805103628e-06, "loss": 8.6005, "step": 2905 }, { "epoch": 0.5677999218444705, "grad_norm": 6.154191970825195, "learning_rate": 8.355696866431087e-06, "loss": 9.0064, "step": 2906 }, { "epoch": 0.5679953106682297, "grad_norm": 10.142219543457031, "learning_rate": 8.349454586353e-06, "loss": 10.9779, "step": 2907 }, { "epoch": 0.5681906994919891, "grad_norm": 6.828899383544922, "learning_rate": 8.343212967369598e-06, "loss": 9.5795, "step": 2908 }, { "epoch": 0.5683860883157483, "grad_norm": 8.226543426513672, "learning_rate": 8.336972011980837e-06, "loss": 8.374, "step": 2909 }, { "epoch": 0.5685814771395076, "grad_norm": 7.978690147399902, "learning_rate": 8.330731722686416e-06, "loss": 8.69, "step": 2910 }, { "epoch": 0.5687768659632669, "grad_norm": 7.589662075042725, "learning_rate": 8.324492101985766e-06, "loss": 9.587, "step": 2911 }, { "epoch": 0.5689722547870262, "grad_norm": 7.409912109375, "learning_rate": 8.31825315237805e-06, "loss": 8.7215, "step": 2912 }, { "epoch": 0.5691676436107854, "grad_norm": 8.238944053649902, "learning_rate": 8.312014876362157e-06, "loss": 8.2848, "step": 2913 }, { "epoch": 0.5693630324345448, "grad_norm": 8.096856117248535, "learning_rate": 8.305777276436712e-06, "loss": 8.7827, "step": 2914 }, { "epoch": 0.569558421258304, "grad_norm": 7.199119567871094, "learning_rate": 8.299540355100066e-06, "loss": 8.9888, "step": 2915 }, { "epoch": 0.5697538100820633, "grad_norm": 9.162145614624023, "learning_rate": 8.293304114850302e-06, "loss": 9.1039, "step": 2916 }, { "epoch": 0.5699491989058226, "grad_norm": 8.96921157836914, "learning_rate": 8.287068558185225e-06, "loss": 9.3564, "step": 2917 }, { "epoch": 0.5701445877295819, "grad_norm": 8.391603469848633, "learning_rate": 8.280833687602372e-06, "loss": 9.2859, "step": 2918 }, { "epoch": 0.5703399765533411, "grad_norm": 8.860815048217773, "learning_rate": 8.274599505599002e-06, "loss": 8.9089, "step": 2919 }, { "epoch": 0.5705353653771005, "grad_norm": 10.831894874572754, "learning_rate": 8.268366014672095e-06, "loss": 8.3156, "step": 2920 }, { "epoch": 0.5707307542008597, "grad_norm": 8.517152786254883, "learning_rate": 8.262133217318363e-06, "loss": 7.8869, "step": 2921 }, { "epoch": 0.570926143024619, "grad_norm": 8.543846130371094, "learning_rate": 8.255901116034231e-06, "loss": 8.3345, "step": 2922 }, { "epoch": 0.5711215318483783, "grad_norm": 32.423622131347656, "learning_rate": 8.249669713315849e-06, "loss": 10.3243, "step": 2923 }, { "epoch": 0.5713169206721376, "grad_norm": 9.708186149597168, "learning_rate": 8.24343901165909e-06, "loss": 9.1622, "step": 2924 }, { "epoch": 0.5715123094958968, "grad_norm": 10.76029109954834, "learning_rate": 8.237209013559542e-06, "loss": 9.2075, "step": 2925 }, { "epoch": 0.5717076983196561, "grad_norm": 9.259683609008789, "learning_rate": 8.230979721512511e-06, "loss": 9.4542, "step": 2926 }, { "epoch": 0.5719030871434154, "grad_norm": 7.613196849822998, "learning_rate": 8.224751138013025e-06, "loss": 9.2146, "step": 2927 }, { "epoch": 0.5720984759671747, "grad_norm": 8.023900985717773, "learning_rate": 8.21852326555582e-06, "loss": 8.7284, "step": 2928 }, { "epoch": 0.5722938647909339, "grad_norm": 7.808635711669922, "learning_rate": 8.212296106635358e-06, "loss": 9.2191, "step": 2929 }, { "epoch": 0.5724892536146933, "grad_norm": 10.434918403625488, "learning_rate": 8.206069663745806e-06, "loss": 9.3047, "step": 2930 }, { "epoch": 0.5726846424384525, "grad_norm": 9.0089111328125, "learning_rate": 8.19984393938105e-06, "loss": 9.5568, "step": 2931 }, { "epoch": 0.5728800312622118, "grad_norm": 9.408156394958496, "learning_rate": 8.193618936034683e-06, "loss": 9.1575, "step": 2932 }, { "epoch": 0.5730754200859711, "grad_norm": 8.474098205566406, "learning_rate": 8.187394656200013e-06, "loss": 10.0839, "step": 2933 }, { "epoch": 0.5732708089097304, "grad_norm": 7.609988689422607, "learning_rate": 8.181171102370062e-06, "loss": 8.9113, "step": 2934 }, { "epoch": 0.5734661977334896, "grad_norm": 9.248665809631348, "learning_rate": 8.174948277037548e-06, "loss": 8.7736, "step": 2935 }, { "epoch": 0.573661586557249, "grad_norm": 8.241129875183105, "learning_rate": 8.168726182694913e-06, "loss": 9.8128, "step": 2936 }, { "epoch": 0.5738569753810082, "grad_norm": 9.227700233459473, "learning_rate": 8.162504821834296e-06, "loss": 9.1306, "step": 2937 }, { "epoch": 0.5740523642047675, "grad_norm": 7.878133773803711, "learning_rate": 8.156284196947549e-06, "loss": 8.7135, "step": 2938 }, { "epoch": 0.5742477530285268, "grad_norm": 8.619709014892578, "learning_rate": 8.150064310526217e-06, "loss": 8.6783, "step": 2939 }, { "epoch": 0.5744431418522861, "grad_norm": 9.555558204650879, "learning_rate": 8.143845165061564e-06, "loss": 9.4556, "step": 2940 }, { "epoch": 0.5746385306760453, "grad_norm": 7.761295795440674, "learning_rate": 8.13762676304455e-06, "loss": 9.2814, "step": 2941 }, { "epoch": 0.5748339194998046, "grad_norm": 8.48897647857666, "learning_rate": 8.131409106965836e-06, "loss": 8.5359, "step": 2942 }, { "epoch": 0.5750293083235639, "grad_norm": 6.357192039489746, "learning_rate": 8.125192199315788e-06, "loss": 9.0464, "step": 2943 }, { "epoch": 0.5752246971473232, "grad_norm": 8.602022171020508, "learning_rate": 8.11897604258447e-06, "loss": 8.2716, "step": 2944 }, { "epoch": 0.5754200859710824, "grad_norm": 12.048428535461426, "learning_rate": 8.112760639261646e-06, "loss": 9.6382, "step": 2945 }, { "epoch": 0.5756154747948418, "grad_norm": 8.437899589538574, "learning_rate": 8.106545991836778e-06, "loss": 10.1358, "step": 2946 }, { "epoch": 0.575810863618601, "grad_norm": 8.29260540008545, "learning_rate": 8.100332102799026e-06, "loss": 9.3783, "step": 2947 }, { "epoch": 0.5760062524423603, "grad_norm": 10.251049995422363, "learning_rate": 8.094118974637245e-06, "loss": 9.6561, "step": 2948 }, { "epoch": 0.5762016412661196, "grad_norm": 7.310251235961914, "learning_rate": 8.087906609839984e-06, "loss": 9.7492, "step": 2949 }, { "epoch": 0.5763970300898789, "grad_norm": 9.511523246765137, "learning_rate": 8.081695010895489e-06, "loss": 9.3339, "step": 2950 }, { "epoch": 0.5765924189136381, "grad_norm": 9.167460441589355, "learning_rate": 8.075484180291702e-06, "loss": 9.8695, "step": 2951 }, { "epoch": 0.5767878077373975, "grad_norm": 9.307744026184082, "learning_rate": 8.069274120516249e-06, "loss": 9.5672, "step": 2952 }, { "epoch": 0.5769831965611567, "grad_norm": 6.490602493286133, "learning_rate": 8.06306483405645e-06, "loss": 9.5354, "step": 2953 }, { "epoch": 0.577178585384916, "grad_norm": 8.028519630432129, "learning_rate": 8.056856323399321e-06, "loss": 8.4964, "step": 2954 }, { "epoch": 0.5773739742086753, "grad_norm": 9.199743270874023, "learning_rate": 8.050648591031562e-06, "loss": 9.4259, "step": 2955 }, { "epoch": 0.5775693630324346, "grad_norm": 8.184178352355957, "learning_rate": 8.044441639439564e-06, "loss": 10.0353, "step": 2956 }, { "epoch": 0.5777647518561938, "grad_norm": 7.361752510070801, "learning_rate": 8.038235471109401e-06, "loss": 8.7964, "step": 2957 }, { "epoch": 0.5779601406799532, "grad_norm": 8.456011772155762, "learning_rate": 8.03203008852684e-06, "loss": 9.1222, "step": 2958 }, { "epoch": 0.5781555295037124, "grad_norm": 34.3315315246582, "learning_rate": 8.025825494177324e-06, "loss": 8.6756, "step": 2959 }, { "epoch": 0.5783509183274717, "grad_norm": 7.625585556030273, "learning_rate": 8.01962169054599e-06, "loss": 9.5105, "step": 2960 }, { "epoch": 0.5785463071512309, "grad_norm": 10.44150447845459, "learning_rate": 8.013418680117652e-06, "loss": 9.686, "step": 2961 }, { "epoch": 0.5787416959749903, "grad_norm": 7.208309650421143, "learning_rate": 8.007216465376809e-06, "loss": 8.249, "step": 2962 }, { "epoch": 0.5789370847987495, "grad_norm": 10.5735502243042, "learning_rate": 8.00101504880764e-06, "loss": 8.962, "step": 2963 }, { "epoch": 0.5791324736225087, "grad_norm": 7.800056457519531, "learning_rate": 7.994814432894008e-06, "loss": 8.4317, "step": 2964 }, { "epoch": 0.5793278624462681, "grad_norm": 14.920567512512207, "learning_rate": 7.988614620119446e-06, "loss": 8.8176, "step": 2965 }, { "epoch": 0.5795232512700274, "grad_norm": 6.216339588165283, "learning_rate": 7.982415612967177e-06, "loss": 8.0652, "step": 2966 }, { "epoch": 0.5797186400937866, "grad_norm": 11.770721435546875, "learning_rate": 7.976217413920093e-06, "loss": 9.7224, "step": 2967 }, { "epoch": 0.579914028917546, "grad_norm": 14.045543670654297, "learning_rate": 7.970020025460765e-06, "loss": 8.9005, "step": 2968 }, { "epoch": 0.5801094177413052, "grad_norm": 7.276342868804932, "learning_rate": 7.963823450071442e-06, "loss": 9.1735, "step": 2969 }, { "epoch": 0.5803048065650644, "grad_norm": 10.889382362365723, "learning_rate": 7.957627690234042e-06, "loss": 9.7824, "step": 2970 }, { "epoch": 0.5805001953888238, "grad_norm": 8.489128112792969, "learning_rate": 7.95143274843016e-06, "loss": 9.0536, "step": 2971 }, { "epoch": 0.580695584212583, "grad_norm": 7.522155284881592, "learning_rate": 7.945238627141063e-06, "loss": 9.0415, "step": 2972 }, { "epoch": 0.5808909730363423, "grad_norm": 9.565140724182129, "learning_rate": 7.93904532884769e-06, "loss": 9.0799, "step": 2973 }, { "epoch": 0.5810863618601017, "grad_norm": 10.56672477722168, "learning_rate": 7.932852856030646e-06, "loss": 9.5377, "step": 2974 }, { "epoch": 0.5812817506838609, "grad_norm": 7.589129447937012, "learning_rate": 7.926661211170213e-06, "loss": 8.8639, "step": 2975 }, { "epoch": 0.5814771395076201, "grad_norm": 9.25190544128418, "learning_rate": 7.920470396746334e-06, "loss": 9.2866, "step": 2976 }, { "epoch": 0.5816725283313794, "grad_norm": 9.842716217041016, "learning_rate": 7.914280415238624e-06, "loss": 9.493, "step": 2977 }, { "epoch": 0.5818679171551387, "grad_norm": 8.670297622680664, "learning_rate": 7.90809126912636e-06, "loss": 8.3575, "step": 2978 }, { "epoch": 0.582063305978898, "grad_norm": 115.61127471923828, "learning_rate": 7.90190296088849e-06, "loss": 10.4128, "step": 2979 }, { "epoch": 0.5822586948026572, "grad_norm": 8.339439392089844, "learning_rate": 7.895715493003623e-06, "loss": 8.7615, "step": 2980 }, { "epoch": 0.5824540836264166, "grad_norm": 7.4925537109375, "learning_rate": 7.88952886795003e-06, "loss": 9.2757, "step": 2981 }, { "epoch": 0.5826494724501758, "grad_norm": 9.57435131072998, "learning_rate": 7.883343088205648e-06, "loss": 8.7239, "step": 2982 }, { "epoch": 0.5828448612739351, "grad_norm": 9.265599250793457, "learning_rate": 7.877158156248075e-06, "loss": 9.0901, "step": 2983 }, { "epoch": 0.5830402500976944, "grad_norm": 8.441311836242676, "learning_rate": 7.870974074554566e-06, "loss": 8.9896, "step": 2984 }, { "epoch": 0.5832356389214537, "grad_norm": 9.167561531066895, "learning_rate": 7.86479084560204e-06, "loss": 9.6028, "step": 2985 }, { "epoch": 0.5834310277452129, "grad_norm": 8.28317642211914, "learning_rate": 7.858608471867069e-06, "loss": 8.3372, "step": 2986 }, { "epoch": 0.5836264165689723, "grad_norm": 9.223546981811523, "learning_rate": 7.852426955825889e-06, "loss": 9.7081, "step": 2987 }, { "epoch": 0.5838218053927315, "grad_norm": 8.843958854675293, "learning_rate": 7.846246299954386e-06, "loss": 9.6832, "step": 2988 }, { "epoch": 0.5840171942164908, "grad_norm": 7.481940269470215, "learning_rate": 7.840066506728108e-06, "loss": 8.7187, "step": 2989 }, { "epoch": 0.5842125830402501, "grad_norm": 8.704890251159668, "learning_rate": 7.833887578622254e-06, "loss": 9.2202, "step": 2990 }, { "epoch": 0.5844079718640094, "grad_norm": 70.9208755493164, "learning_rate": 7.827709518111674e-06, "loss": 10.2329, "step": 2991 }, { "epoch": 0.5846033606877686, "grad_norm": 7.739467144012451, "learning_rate": 7.821532327670871e-06, "loss": 9.1888, "step": 2992 }, { "epoch": 0.584798749511528, "grad_norm": 11.358851432800293, "learning_rate": 7.815356009774006e-06, "loss": 9.6033, "step": 2993 }, { "epoch": 0.5849941383352872, "grad_norm": 10.573582649230957, "learning_rate": 7.809180566894882e-06, "loss": 8.9731, "step": 2994 }, { "epoch": 0.5851895271590465, "grad_norm": 8.335413932800293, "learning_rate": 7.80300600150696e-06, "loss": 8.6475, "step": 2995 }, { "epoch": 0.5853849159828057, "grad_norm": 8.019357681274414, "learning_rate": 7.796832316083347e-06, "loss": 8.9185, "step": 2996 }, { "epoch": 0.5855803048065651, "grad_norm": 8.108970642089844, "learning_rate": 7.790659513096787e-06, "loss": 9.8417, "step": 2997 }, { "epoch": 0.5857756936303243, "grad_norm": 6.562138557434082, "learning_rate": 7.784487595019686e-06, "loss": 9.0271, "step": 2998 }, { "epoch": 0.5859710824540836, "grad_norm": 6.990952014923096, "learning_rate": 7.778316564324085e-06, "loss": 9.229, "step": 2999 }, { "epoch": 0.5861664712778429, "grad_norm": 9.679683685302734, "learning_rate": 7.772146423481676e-06, "loss": 9.4981, "step": 3000 }, { "epoch": 0.5863618601016022, "grad_norm": 8.7763090133667, "learning_rate": 7.76597717496379e-06, "loss": 8.3711, "step": 3001 }, { "epoch": 0.5865572489253614, "grad_norm": 9.195281028747559, "learning_rate": 7.759808821241406e-06, "loss": 9.6284, "step": 3002 }, { "epoch": 0.5867526377491208, "grad_norm": 6.703279972076416, "learning_rate": 7.753641364785139e-06, "loss": 8.2272, "step": 3003 }, { "epoch": 0.58694802657288, "grad_norm": 7.0960001945495605, "learning_rate": 7.747474808065244e-06, "loss": 7.9143, "step": 3004 }, { "epoch": 0.5871434153966393, "grad_norm": 7.982921123504639, "learning_rate": 7.741309153551625e-06, "loss": 8.9272, "step": 3005 }, { "epoch": 0.5873388042203986, "grad_norm": 49.128475189208984, "learning_rate": 7.735144403713811e-06, "loss": 10.1243, "step": 3006 }, { "epoch": 0.5875341930441579, "grad_norm": 7.292541027069092, "learning_rate": 7.72898056102098e-06, "loss": 8.2249, "step": 3007 }, { "epoch": 0.5877295818679171, "grad_norm": 7.324792861938477, "learning_rate": 7.722817627941943e-06, "loss": 9.1744, "step": 3008 }, { "epoch": 0.5879249706916765, "grad_norm": 7.497281551361084, "learning_rate": 7.716655606945143e-06, "loss": 10.0566, "step": 3009 }, { "epoch": 0.5881203595154357, "grad_norm": 6.438875675201416, "learning_rate": 7.710494500498662e-06, "loss": 8.9169, "step": 3010 }, { "epoch": 0.588315748339195, "grad_norm": 8.44289779663086, "learning_rate": 7.704334311070215e-06, "loss": 9.1258, "step": 3011 }, { "epoch": 0.5885111371629543, "grad_norm": 9.694618225097656, "learning_rate": 7.698175041127148e-06, "loss": 10.1942, "step": 3012 }, { "epoch": 0.5887065259867136, "grad_norm": 10.61161994934082, "learning_rate": 7.69201669313644e-06, "loss": 8.6437, "step": 3013 }, { "epoch": 0.5889019148104728, "grad_norm": 8.284984588623047, "learning_rate": 7.685859269564702e-06, "loss": 9.9219, "step": 3014 }, { "epoch": 0.5890973036342321, "grad_norm": 8.922584533691406, "learning_rate": 7.679702772878171e-06, "loss": 9.5434, "step": 3015 }, { "epoch": 0.5892926924579914, "grad_norm": 7.449310302734375, "learning_rate": 7.673547205542718e-06, "loss": 8.126, "step": 3016 }, { "epoch": 0.5894880812817507, "grad_norm": 8.603376388549805, "learning_rate": 7.667392570023834e-06, "loss": 9.302, "step": 3017 }, { "epoch": 0.5896834701055099, "grad_norm": 53.00111389160156, "learning_rate": 7.661238868786648e-06, "loss": 9.0841, "step": 3018 }, { "epoch": 0.5898788589292693, "grad_norm": 8.923293113708496, "learning_rate": 7.655086104295904e-06, "loss": 8.7745, "step": 3019 }, { "epoch": 0.5900742477530285, "grad_norm": 8.259012222290039, "learning_rate": 7.648934279015977e-06, "loss": 9.5202, "step": 3020 }, { "epoch": 0.5902696365767878, "grad_norm": 30.650358200073242, "learning_rate": 7.642783395410863e-06, "loss": 8.6799, "step": 3021 }, { "epoch": 0.5904650254005471, "grad_norm": 8.582069396972656, "learning_rate": 7.636633455944184e-06, "loss": 9.9083, "step": 3022 }, { "epoch": 0.5906604142243064, "grad_norm": 7.850262641906738, "learning_rate": 7.630484463079177e-06, "loss": 8.974, "step": 3023 }, { "epoch": 0.5908558030480656, "grad_norm": 9.134495735168457, "learning_rate": 7.6243364192787095e-06, "loss": 8.9006, "step": 3024 }, { "epoch": 0.591051191871825, "grad_norm": 7.085219383239746, "learning_rate": 7.618189327005261e-06, "loss": 8.6565, "step": 3025 }, { "epoch": 0.5912465806955842, "grad_norm": 7.791258335113525, "learning_rate": 7.612043188720933e-06, "loss": 9.6032, "step": 3026 }, { "epoch": 0.5914419695193435, "grad_norm": 10.506172180175781, "learning_rate": 7.605898006887447e-06, "loss": 9.6001, "step": 3027 }, { "epoch": 0.5916373583431028, "grad_norm": 9.15788459777832, "learning_rate": 7.599753783966136e-06, "loss": 9.9401, "step": 3028 }, { "epoch": 0.5918327471668621, "grad_norm": 10.521397590637207, "learning_rate": 7.5936105224179534e-06, "loss": 9.1462, "step": 3029 }, { "epoch": 0.5920281359906213, "grad_norm": 11.96612548828125, "learning_rate": 7.587468224703467e-06, "loss": 9.2608, "step": 3030 }, { "epoch": 0.5922235248143806, "grad_norm": 8.097817420959473, "learning_rate": 7.581326893282858e-06, "loss": 8.5244, "step": 3031 }, { "epoch": 0.5924189136381399, "grad_norm": 7.517138481140137, "learning_rate": 7.5751865306159174e-06, "loss": 8.6608, "step": 3032 }, { "epoch": 0.5926143024618992, "grad_norm": 8.84758472442627, "learning_rate": 7.569047139162054e-06, "loss": 9.0976, "step": 3033 }, { "epoch": 0.5928096912856584, "grad_norm": 12.096185684204102, "learning_rate": 7.5629087213802846e-06, "loss": 8.6873, "step": 3034 }, { "epoch": 0.5930050801094178, "grad_norm": 9.214247703552246, "learning_rate": 7.556771279729236e-06, "loss": 9.2499, "step": 3035 }, { "epoch": 0.593200468933177, "grad_norm": 7.316731929779053, "learning_rate": 7.550634816667142e-06, "loss": 9.0062, "step": 3036 }, { "epoch": 0.5933958577569363, "grad_norm": 9.038198471069336, "learning_rate": 7.544499334651847e-06, "loss": 8.8277, "step": 3037 }, { "epoch": 0.5935912465806956, "grad_norm": 7.669400691986084, "learning_rate": 7.538364836140804e-06, "loss": 9.1077, "step": 3038 }, { "epoch": 0.5937866354044549, "grad_norm": 11.094863891601562, "learning_rate": 7.532231323591068e-06, "loss": 7.842, "step": 3039 }, { "epoch": 0.5939820242282141, "grad_norm": 10.717040061950684, "learning_rate": 7.526098799459303e-06, "loss": 10.1978, "step": 3040 }, { "epoch": 0.5941774130519735, "grad_norm": 7.92678165435791, "learning_rate": 7.519967266201774e-06, "loss": 8.9518, "step": 3041 }, { "epoch": 0.5943728018757327, "grad_norm": 8.335107803344727, "learning_rate": 7.513836726274352e-06, "loss": 9.5342, "step": 3042 }, { "epoch": 0.594568190699492, "grad_norm": 8.594441413879395, "learning_rate": 7.507707182132507e-06, "loss": 9.543, "step": 3043 }, { "epoch": 0.5947635795232513, "grad_norm": 8.061004638671875, "learning_rate": 7.5015786362313115e-06, "loss": 9.0036, "step": 3044 }, { "epoch": 0.5949589683470106, "grad_norm": 8.662178039550781, "learning_rate": 7.495451091025441e-06, "loss": 8.7278, "step": 3045 }, { "epoch": 0.5951543571707698, "grad_norm": 8.901294708251953, "learning_rate": 7.489324548969163e-06, "loss": 10.3345, "step": 3046 }, { "epoch": 0.5953497459945292, "grad_norm": 10.195141792297363, "learning_rate": 7.483199012516353e-06, "loss": 8.9794, "step": 3047 }, { "epoch": 0.5955451348182884, "grad_norm": 10.048354148864746, "learning_rate": 7.477074484120477e-06, "loss": 8.6309, "step": 3048 }, { "epoch": 0.5957405236420477, "grad_norm": 10.495071411132812, "learning_rate": 7.470950966234597e-06, "loss": 9.6058, "step": 3049 }, { "epoch": 0.5959359124658069, "grad_norm": 7.866878032684326, "learning_rate": 7.464828461311372e-06, "loss": 9.4768, "step": 3050 }, { "epoch": 0.5961313012895663, "grad_norm": 10.282525062561035, "learning_rate": 7.458706971803056e-06, "loss": 9.3288, "step": 3051 }, { "epoch": 0.5963266901133255, "grad_norm": 6.528744220733643, "learning_rate": 7.452586500161496e-06, "loss": 9.0563, "step": 3052 }, { "epoch": 0.5965220789370848, "grad_norm": 8.274612426757812, "learning_rate": 7.446467048838131e-06, "loss": 8.4749, "step": 3053 }, { "epoch": 0.5967174677608441, "grad_norm": 8.588279724121094, "learning_rate": 7.44034862028399e-06, "loss": 8.847, "step": 3054 }, { "epoch": 0.5969128565846034, "grad_norm": 7.771967887878418, "learning_rate": 7.434231216949696e-06, "loss": 9.1657, "step": 3055 }, { "epoch": 0.5971082454083626, "grad_norm": 9.351832389831543, "learning_rate": 7.428114841285458e-06, "loss": 10.4129, "step": 3056 }, { "epoch": 0.597303634232122, "grad_norm": 13.710846900939941, "learning_rate": 7.421999495741072e-06, "loss": 9.5978, "step": 3057 }, { "epoch": 0.5974990230558812, "grad_norm": 7.8330464363098145, "learning_rate": 7.415885182765927e-06, "loss": 9.15, "step": 3058 }, { "epoch": 0.5976944118796405, "grad_norm": 9.916016578674316, "learning_rate": 7.409771904808993e-06, "loss": 8.9455, "step": 3059 }, { "epoch": 0.5978898007033998, "grad_norm": 14.84218692779541, "learning_rate": 7.403659664318828e-06, "loss": 9.5175, "step": 3060 }, { "epoch": 0.5980851895271591, "grad_norm": 7.279349327087402, "learning_rate": 7.397548463743578e-06, "loss": 9.2482, "step": 3061 }, { "epoch": 0.5982805783509183, "grad_norm": 10.927651405334473, "learning_rate": 7.391438305530961e-06, "loss": 8.7706, "step": 3062 }, { "epoch": 0.5984759671746777, "grad_norm": 8.275943756103516, "learning_rate": 7.38532919212829e-06, "loss": 8.6868, "step": 3063 }, { "epoch": 0.5986713559984369, "grad_norm": 7.2512712478637695, "learning_rate": 7.379221125982454e-06, "loss": 9.0557, "step": 3064 }, { "epoch": 0.5988667448221962, "grad_norm": 9.323055267333984, "learning_rate": 7.3731141095399225e-06, "loss": 9.4869, "step": 3065 }, { "epoch": 0.5990621336459554, "grad_norm": 9.154170036315918, "learning_rate": 7.367008145246746e-06, "loss": 9.1243, "step": 3066 }, { "epoch": 0.5992575224697148, "grad_norm": 7.6077775955200195, "learning_rate": 7.360903235548552e-06, "loss": 8.9403, "step": 3067 }, { "epoch": 0.599452911293474, "grad_norm": 13.280308723449707, "learning_rate": 7.354799382890546e-06, "loss": 10.0786, "step": 3068 }, { "epoch": 0.5996483001172332, "grad_norm": 20.785051345825195, "learning_rate": 7.348696589717512e-06, "loss": 9.6481, "step": 3069 }, { "epoch": 0.5998436889409926, "grad_norm": 7.255066394805908, "learning_rate": 7.342594858473807e-06, "loss": 9.4078, "step": 3070 }, { "epoch": 0.6000390777647518, "grad_norm": 8.109085083007812, "learning_rate": 7.336494191603364e-06, "loss": 8.3056, "step": 3071 }, { "epoch": 0.6002344665885111, "grad_norm": 6.681050777435303, "learning_rate": 7.33039459154969e-06, "loss": 9.0156, "step": 3072 }, { "epoch": 0.6004298554122705, "grad_norm": 7.881631851196289, "learning_rate": 7.3242960607558626e-06, "loss": 8.4381, "step": 3073 }, { "epoch": 0.6006252442360297, "grad_norm": 12.575008392333984, "learning_rate": 7.318198601664537e-06, "loss": 10.4451, "step": 3074 }, { "epoch": 0.6008206330597889, "grad_norm": 10.129982948303223, "learning_rate": 7.312102216717929e-06, "loss": 9.4406, "step": 3075 }, { "epoch": 0.6010160218835483, "grad_norm": 6.688687801361084, "learning_rate": 7.3060069083578325e-06, "loss": 8.0307, "step": 3076 }, { "epoch": 0.6012114107073075, "grad_norm": 8.346129417419434, "learning_rate": 7.299912679025609e-06, "loss": 8.1342, "step": 3077 }, { "epoch": 0.6014067995310668, "grad_norm": 8.57900619506836, "learning_rate": 7.293819531162185e-06, "loss": 9.8277, "step": 3078 }, { "epoch": 0.6016021883548262, "grad_norm": 8.378705978393555, "learning_rate": 7.287727467208055e-06, "loss": 7.9267, "step": 3079 }, { "epoch": 0.6017975771785854, "grad_norm": 9.134478569030762, "learning_rate": 7.2816364896032835e-06, "loss": 9.6305, "step": 3080 }, { "epoch": 0.6019929660023446, "grad_norm": 10.45907974243164, "learning_rate": 7.2755466007874905e-06, "loss": 9.1696, "step": 3081 }, { "epoch": 0.602188354826104, "grad_norm": 10.994975090026855, "learning_rate": 7.269457803199868e-06, "loss": 9.1492, "step": 3082 }, { "epoch": 0.6023837436498632, "grad_norm": 10.184895515441895, "learning_rate": 7.263370099279173e-06, "loss": 9.3663, "step": 3083 }, { "epoch": 0.6025791324736225, "grad_norm": 8.31871509552002, "learning_rate": 7.257283491463711e-06, "loss": 9.104, "step": 3084 }, { "epoch": 0.6027745212973817, "grad_norm": 6.585756778717041, "learning_rate": 7.251197982191365e-06, "loss": 8.4756, "step": 3085 }, { "epoch": 0.6029699101211411, "grad_norm": 8.996659278869629, "learning_rate": 7.245113573899567e-06, "loss": 9.7396, "step": 3086 }, { "epoch": 0.6031652989449003, "grad_norm": 9.247672080993652, "learning_rate": 7.239030269025311e-06, "loss": 10.0921, "step": 3087 }, { "epoch": 0.6033606877686596, "grad_norm": 9.721524238586426, "learning_rate": 7.232948070005153e-06, "loss": 8.7793, "step": 3088 }, { "epoch": 0.6035560765924189, "grad_norm": 9.967495918273926, "learning_rate": 7.226866979275198e-06, "loss": 8.1611, "step": 3089 }, { "epoch": 0.6037514654161782, "grad_norm": 8.950152397155762, "learning_rate": 7.220786999271114e-06, "loss": 9.3363, "step": 3090 }, { "epoch": 0.6039468542399374, "grad_norm": 7.841548442840576, "learning_rate": 7.214708132428121e-06, "loss": 8.4895, "step": 3091 }, { "epoch": 0.6041422430636968, "grad_norm": 6.933669567108154, "learning_rate": 7.208630381180995e-06, "loss": 8.7605, "step": 3092 }, { "epoch": 0.604337631887456, "grad_norm": 9.261148452758789, "learning_rate": 7.202553747964063e-06, "loss": 9.047, "step": 3093 }, { "epoch": 0.6045330207112153, "grad_norm": 7.2570672035217285, "learning_rate": 7.196478235211204e-06, "loss": 8.7401, "step": 3094 }, { "epoch": 0.6047284095349746, "grad_norm": 7.566317558288574, "learning_rate": 7.190403845355853e-06, "loss": 9.5184, "step": 3095 }, { "epoch": 0.6049237983587339, "grad_norm": 9.053235054016113, "learning_rate": 7.18433058083099e-06, "loss": 9.1205, "step": 3096 }, { "epoch": 0.6051191871824931, "grad_norm": 8.755714416503906, "learning_rate": 7.178258444069144e-06, "loss": 10.2415, "step": 3097 }, { "epoch": 0.6053145760062525, "grad_norm": 9.840388298034668, "learning_rate": 7.172187437502397e-06, "loss": 8.8317, "step": 3098 }, { "epoch": 0.6055099648300117, "grad_norm": 8.49913501739502, "learning_rate": 7.166117563562375e-06, "loss": 9.3803, "step": 3099 }, { "epoch": 0.605705353653771, "grad_norm": 8.731346130371094, "learning_rate": 7.160048824680251e-06, "loss": 9.8032, "step": 3100 }, { "epoch": 0.6059007424775303, "grad_norm": 7.560633182525635, "learning_rate": 7.153981223286743e-06, "loss": 9.2163, "step": 3101 }, { "epoch": 0.6060961313012896, "grad_norm": 10.798296928405762, "learning_rate": 7.1479147618121135e-06, "loss": 9.709, "step": 3102 }, { "epoch": 0.6062915201250488, "grad_norm": 8.901002883911133, "learning_rate": 7.141849442686168e-06, "loss": 9.0237, "step": 3103 }, { "epoch": 0.6064869089488081, "grad_norm": 9.370978355407715, "learning_rate": 7.1357852683382565e-06, "loss": 9.0017, "step": 3104 }, { "epoch": 0.6066822977725674, "grad_norm": 9.1856050491333, "learning_rate": 7.129722241197269e-06, "loss": 9.2504, "step": 3105 }, { "epoch": 0.6068776865963267, "grad_norm": 8.396125793457031, "learning_rate": 7.123660363691636e-06, "loss": 9.5331, "step": 3106 }, { "epoch": 0.6070730754200859, "grad_norm": 8.387005805969238, "learning_rate": 7.117599638249326e-06, "loss": 8.3431, "step": 3107 }, { "epoch": 0.6072684642438453, "grad_norm": 13.175034523010254, "learning_rate": 7.11154006729785e-06, "loss": 9.4098, "step": 3108 }, { "epoch": 0.6074638530676045, "grad_norm": 9.21987533569336, "learning_rate": 7.1054816532642525e-06, "loss": 9.9374, "step": 3109 }, { "epoch": 0.6076592418913638, "grad_norm": 6.446108341217041, "learning_rate": 7.0994243985751175e-06, "loss": 7.8664, "step": 3110 }, { "epoch": 0.6078546307151231, "grad_norm": 10.859238624572754, "learning_rate": 7.093368305656564e-06, "loss": 8.7809, "step": 3111 }, { "epoch": 0.6080500195388824, "grad_norm": 9.779617309570312, "learning_rate": 7.087313376934244e-06, "loss": 9.1884, "step": 3112 }, { "epoch": 0.6082454083626416, "grad_norm": 9.77717399597168, "learning_rate": 7.081259614833345e-06, "loss": 9.1779, "step": 3113 }, { "epoch": 0.608440797186401, "grad_norm": 9.52149486541748, "learning_rate": 7.07520702177859e-06, "loss": 9.1229, "step": 3114 }, { "epoch": 0.6086361860101602, "grad_norm": 8.325057983398438, "learning_rate": 7.069155600194223e-06, "loss": 9.2613, "step": 3115 }, { "epoch": 0.6088315748339195, "grad_norm": 10.121953010559082, "learning_rate": 7.063105352504031e-06, "loss": 9.1125, "step": 3116 }, { "epoch": 0.6090269636576788, "grad_norm": 10.042227745056152, "learning_rate": 7.057056281131325e-06, "loss": 9.4723, "step": 3117 }, { "epoch": 0.6092223524814381, "grad_norm": 10.98670768737793, "learning_rate": 7.0510083884989476e-06, "loss": 9.4069, "step": 3118 }, { "epoch": 0.6094177413051973, "grad_norm": 10.467682838439941, "learning_rate": 7.044961677029265e-06, "loss": 9.74, "step": 3119 }, { "epoch": 0.6096131301289566, "grad_norm": 7.917074203491211, "learning_rate": 7.038916149144175e-06, "loss": 8.6953, "step": 3120 }, { "epoch": 0.6098085189527159, "grad_norm": 7.649165153503418, "learning_rate": 7.032871807265097e-06, "loss": 9.1816, "step": 3121 }, { "epoch": 0.6100039077764752, "grad_norm": 7.314424991607666, "learning_rate": 7.02682865381298e-06, "loss": 8.7005, "step": 3122 }, { "epoch": 0.6101992966002344, "grad_norm": 8.90090560913086, "learning_rate": 7.020786691208292e-06, "loss": 9.215, "step": 3123 }, { "epoch": 0.6103946854239938, "grad_norm": 7.860108375549316, "learning_rate": 7.014745921871029e-06, "loss": 9.1524, "step": 3124 }, { "epoch": 0.610590074247753, "grad_norm": 7.705721855163574, "learning_rate": 7.008706348220706e-06, "loss": 9.3513, "step": 3125 }, { "epoch": 0.6107854630715123, "grad_norm": 8.295076370239258, "learning_rate": 7.00266797267636e-06, "loss": 9.2205, "step": 3126 }, { "epoch": 0.6109808518952716, "grad_norm": 9.50456714630127, "learning_rate": 6.996630797656547e-06, "loss": 9.4783, "step": 3127 }, { "epoch": 0.6111762407190309, "grad_norm": 7.332135200500488, "learning_rate": 6.990594825579347e-06, "loss": 8.861, "step": 3128 }, { "epoch": 0.6113716295427901, "grad_norm": 7.384237766265869, "learning_rate": 6.984560058862345e-06, "loss": 8.6675, "step": 3129 }, { "epoch": 0.6115670183665495, "grad_norm": 8.145926475524902, "learning_rate": 6.97852649992266e-06, "loss": 8.709, "step": 3130 }, { "epoch": 0.6117624071903087, "grad_norm": 9.77009391784668, "learning_rate": 6.972494151176917e-06, "loss": 8.2443, "step": 3131 }, { "epoch": 0.611957796014068, "grad_norm": 9.653775215148926, "learning_rate": 6.966463015041262e-06, "loss": 8.9126, "step": 3132 }, { "epoch": 0.6121531848378273, "grad_norm": 7.483040809631348, "learning_rate": 6.9604330939313466e-06, "loss": 8.1147, "step": 3133 }, { "epoch": 0.6123485736615866, "grad_norm": 7.499786376953125, "learning_rate": 6.954404390262346e-06, "loss": 8.5026, "step": 3134 }, { "epoch": 0.6125439624853458, "grad_norm": 8.2280912399292, "learning_rate": 6.948376906448941e-06, "loss": 8.6048, "step": 3135 }, { "epoch": 0.6127393513091052, "grad_norm": 9.438916206359863, "learning_rate": 6.942350644905329e-06, "loss": 9.9455, "step": 3136 }, { "epoch": 0.6129347401328644, "grad_norm": 7.607224941253662, "learning_rate": 6.936325608045211e-06, "loss": 8.7415, "step": 3137 }, { "epoch": 0.6131301289566237, "grad_norm": 8.274602890014648, "learning_rate": 6.930301798281803e-06, "loss": 8.9917, "step": 3138 }, { "epoch": 0.6133255177803829, "grad_norm": 18.421751022338867, "learning_rate": 6.924279218027828e-06, "loss": 9.6311, "step": 3139 }, { "epoch": 0.6135209066041423, "grad_norm": 8.55823040008545, "learning_rate": 6.918257869695517e-06, "loss": 8.6787, "step": 3140 }, { "epoch": 0.6137162954279015, "grad_norm": 8.536458969116211, "learning_rate": 6.9122377556966045e-06, "loss": 9.1508, "step": 3141 }, { "epoch": 0.6139116842516608, "grad_norm": 9.035019874572754, "learning_rate": 6.906218878442337e-06, "loss": 9.7352, "step": 3142 }, { "epoch": 0.6141070730754201, "grad_norm": 12.555957794189453, "learning_rate": 6.900201240343458e-06, "loss": 9.1511, "step": 3143 }, { "epoch": 0.6143024618991794, "grad_norm": 7.259243011474609, "learning_rate": 6.894184843810221e-06, "loss": 9.4217, "step": 3144 }, { "epoch": 0.6144978507229386, "grad_norm": 9.768951416015625, "learning_rate": 6.88816969125238e-06, "loss": 8.5827, "step": 3145 }, { "epoch": 0.614693239546698, "grad_norm": 10.937029838562012, "learning_rate": 6.882155785079186e-06, "loss": 9.6322, "step": 3146 }, { "epoch": 0.6148886283704572, "grad_norm": 8.791969299316406, "learning_rate": 6.876143127699397e-06, "loss": 8.9697, "step": 3147 }, { "epoch": 0.6150840171942165, "grad_norm": 26.33353614807129, "learning_rate": 6.8701317215212716e-06, "loss": 9.2435, "step": 3148 }, { "epoch": 0.6152794060179758, "grad_norm": 8.440177917480469, "learning_rate": 6.8641215689525635e-06, "loss": 8.714, "step": 3149 }, { "epoch": 0.6154747948417351, "grad_norm": 8.96982479095459, "learning_rate": 6.858112672400523e-06, "loss": 9.534, "step": 3150 }, { "epoch": 0.6156701836654943, "grad_norm": 10.628748893737793, "learning_rate": 6.852105034271903e-06, "loss": 8.9206, "step": 3151 }, { "epoch": 0.6158655724892537, "grad_norm": 10.883689880371094, "learning_rate": 6.846098656972947e-06, "loss": 9.4001, "step": 3152 }, { "epoch": 0.6160609613130129, "grad_norm": 63.99466323852539, "learning_rate": 6.840093542909396e-06, "loss": 8.4326, "step": 3153 }, { "epoch": 0.6162563501367722, "grad_norm": 8.935728073120117, "learning_rate": 6.834089694486485e-06, "loss": 8.2268, "step": 3154 }, { "epoch": 0.6164517389605314, "grad_norm": 9.16843032836914, "learning_rate": 6.8280871141089415e-06, "loss": 9.8084, "step": 3155 }, { "epoch": 0.6166471277842908, "grad_norm": 33.58384323120117, "learning_rate": 6.822085804180985e-06, "loss": 9.3495, "step": 3156 }, { "epoch": 0.61684251660805, "grad_norm": 34.05607223510742, "learning_rate": 6.816085767106328e-06, "loss": 8.9961, "step": 3157 }, { "epoch": 0.6170379054318093, "grad_norm": 9.352027893066406, "learning_rate": 6.810087005288169e-06, "loss": 9.0201, "step": 3158 }, { "epoch": 0.6172332942555686, "grad_norm": 9.532946586608887, "learning_rate": 6.804089521129202e-06, "loss": 10.2474, "step": 3159 }, { "epoch": 0.6174286830793279, "grad_norm": 11.039137840270996, "learning_rate": 6.798093317031601e-06, "loss": 9.4527, "step": 3160 }, { "epoch": 0.6176240719030871, "grad_norm": 7.1295576095581055, "learning_rate": 6.792098395397036e-06, "loss": 9.0233, "step": 3161 }, { "epoch": 0.6178194607268465, "grad_norm": 6.612754821777344, "learning_rate": 6.786104758626655e-06, "loss": 8.1259, "step": 3162 }, { "epoch": 0.6180148495506057, "grad_norm": 9.287490844726562, "learning_rate": 6.780112409121099e-06, "loss": 9.7588, "step": 3163 }, { "epoch": 0.618210238374365, "grad_norm": 7.553799629211426, "learning_rate": 6.774121349280489e-06, "loss": 8.5617, "step": 3164 }, { "epoch": 0.6184056271981243, "grad_norm": 10.90973949432373, "learning_rate": 6.7681315815044316e-06, "loss": 9.4657, "step": 3165 }, { "epoch": 0.6186010160218836, "grad_norm": 10.601751327514648, "learning_rate": 6.762143108192012e-06, "loss": 8.1576, "step": 3166 }, { "epoch": 0.6187964048456428, "grad_norm": 6.629915237426758, "learning_rate": 6.756155931741803e-06, "loss": 8.4618, "step": 3167 }, { "epoch": 0.6189917936694022, "grad_norm": 10.512444496154785, "learning_rate": 6.750170054551852e-06, "loss": 8.9375, "step": 3168 }, { "epoch": 0.6191871824931614, "grad_norm": 13.67101001739502, "learning_rate": 6.744185479019689e-06, "loss": 9.5742, "step": 3169 }, { "epoch": 0.6193825713169206, "grad_norm": 8.136218070983887, "learning_rate": 6.738202207542325e-06, "loss": 9.0165, "step": 3170 }, { "epoch": 0.61957796014068, "grad_norm": 9.810447692871094, "learning_rate": 6.732220242516243e-06, "loss": 8.9211, "step": 3171 }, { "epoch": 0.6197733489644393, "grad_norm": 9.089319229125977, "learning_rate": 6.726239586337408e-06, "loss": 8.4533, "step": 3172 }, { "epoch": 0.6199687377881985, "grad_norm": 7.8400559425354, "learning_rate": 6.7202602414012555e-06, "loss": 9.4795, "step": 3173 }, { "epoch": 0.6201641266119577, "grad_norm": 11.087610244750977, "learning_rate": 6.714282210102701e-06, "loss": 8.8104, "step": 3174 }, { "epoch": 0.6203595154357171, "grad_norm": 7.8216352462768555, "learning_rate": 6.708305494836131e-06, "loss": 9.7963, "step": 3175 }, { "epoch": 0.6205549042594763, "grad_norm": 7.982954502105713, "learning_rate": 6.702330097995406e-06, "loss": 8.6174, "step": 3176 }, { "epoch": 0.6207502930832356, "grad_norm": 9.812914848327637, "learning_rate": 6.696356021973856e-06, "loss": 9.1891, "step": 3177 }, { "epoch": 0.620945681906995, "grad_norm": 8.950902938842773, "learning_rate": 6.690383269164287e-06, "loss": 8.8326, "step": 3178 }, { "epoch": 0.6211410707307542, "grad_norm": 8.162497520446777, "learning_rate": 6.68441184195897e-06, "loss": 8.6706, "step": 3179 }, { "epoch": 0.6213364595545134, "grad_norm": 11.237481117248535, "learning_rate": 6.6784417427496465e-06, "loss": 8.7699, "step": 3180 }, { "epoch": 0.6215318483782728, "grad_norm": 12.55990982055664, "learning_rate": 6.672472973927528e-06, "loss": 10.0447, "step": 3181 }, { "epoch": 0.621727237202032, "grad_norm": 10.461010932922363, "learning_rate": 6.666505537883292e-06, "loss": 8.6269, "step": 3182 }, { "epoch": 0.6219226260257913, "grad_norm": 7.173439979553223, "learning_rate": 6.660539437007081e-06, "loss": 8.3415, "step": 3183 }, { "epoch": 0.6221180148495506, "grad_norm": 20.17717933654785, "learning_rate": 6.654574673688503e-06, "loss": 9.4778, "step": 3184 }, { "epoch": 0.6223134036733099, "grad_norm": 7.767277717590332, "learning_rate": 6.648611250316635e-06, "loss": 9.8497, "step": 3185 }, { "epoch": 0.6225087924970691, "grad_norm": 16.488759994506836, "learning_rate": 6.642649169280005e-06, "loss": 9.7972, "step": 3186 }, { "epoch": 0.6227041813208285, "grad_norm": 8.96546745300293, "learning_rate": 6.636688432966618e-06, "loss": 9.7068, "step": 3187 }, { "epoch": 0.6228995701445877, "grad_norm": 8.427123069763184, "learning_rate": 6.6307290437639326e-06, "loss": 9.4458, "step": 3188 }, { "epoch": 0.623094958968347, "grad_norm": 7.777038097381592, "learning_rate": 6.624771004058869e-06, "loss": 8.7883, "step": 3189 }, { "epoch": 0.6232903477921062, "grad_norm": 9.135398864746094, "learning_rate": 6.618814316237807e-06, "loss": 9.5184, "step": 3190 }, { "epoch": 0.6234857366158656, "grad_norm": 11.99392032623291, "learning_rate": 6.612858982686584e-06, "loss": 9.166, "step": 3191 }, { "epoch": 0.6236811254396248, "grad_norm": 8.496952056884766, "learning_rate": 6.606905005790498e-06, "loss": 9.4242, "step": 3192 }, { "epoch": 0.6238765142633841, "grad_norm": 8.410484313964844, "learning_rate": 6.6009523879343e-06, "loss": 9.6058, "step": 3193 }, { "epoch": 0.6240719030871434, "grad_norm": 7.824978351593018, "learning_rate": 6.5950011315021986e-06, "loss": 9.4955, "step": 3194 }, { "epoch": 0.6242672919109027, "grad_norm": 6.903197288513184, "learning_rate": 6.589051238877858e-06, "loss": 8.9792, "step": 3195 }, { "epoch": 0.6244626807346619, "grad_norm": 7.065006256103516, "learning_rate": 6.5831027124443935e-06, "loss": 9.3928, "step": 3196 }, { "epoch": 0.6246580695584213, "grad_norm": 7.14594841003418, "learning_rate": 6.5771555545843755e-06, "loss": 8.5202, "step": 3197 }, { "epoch": 0.6248534583821805, "grad_norm": 11.351675033569336, "learning_rate": 6.571209767679827e-06, "loss": 9.4747, "step": 3198 }, { "epoch": 0.6250488472059398, "grad_norm": 8.51473617553711, "learning_rate": 6.565265354112215e-06, "loss": 8.8936, "step": 3199 }, { "epoch": 0.6252442360296991, "grad_norm": 8.77932071685791, "learning_rate": 6.559322316262465e-06, "loss": 9.1062, "step": 3200 }, { "epoch": 0.6254396248534584, "grad_norm": 7.614633560180664, "learning_rate": 6.553380656510948e-06, "loss": 8.5036, "step": 3201 }, { "epoch": 0.6256350136772176, "grad_norm": 9.768440246582031, "learning_rate": 6.547440377237484e-06, "loss": 9.7145, "step": 3202 }, { "epoch": 0.625830402500977, "grad_norm": 10.376811027526855, "learning_rate": 6.5415014808213385e-06, "loss": 8.7293, "step": 3203 }, { "epoch": 0.6260257913247362, "grad_norm": 8.296785354614258, "learning_rate": 6.535563969641223e-06, "loss": 8.2191, "step": 3204 }, { "epoch": 0.6262211801484955, "grad_norm": 10.103215217590332, "learning_rate": 6.529627846075297e-06, "loss": 9.3273, "step": 3205 }, { "epoch": 0.6264165689722548, "grad_norm": 8.04591178894043, "learning_rate": 6.52369311250116e-06, "loss": 9.2947, "step": 3206 }, { "epoch": 0.6266119577960141, "grad_norm": 90.45257568359375, "learning_rate": 6.51775977129586e-06, "loss": 9.6114, "step": 3207 }, { "epoch": 0.6268073466197733, "grad_norm": 7.731884956359863, "learning_rate": 6.511827824835881e-06, "loss": 9.0109, "step": 3208 }, { "epoch": 0.6270027354435326, "grad_norm": 8.872273445129395, "learning_rate": 6.5058972754971535e-06, "loss": 9.6837, "step": 3209 }, { "epoch": 0.6271981242672919, "grad_norm": 13.767102241516113, "learning_rate": 6.499968125655047e-06, "loss": 9.097, "step": 3210 }, { "epoch": 0.6273935130910512, "grad_norm": 6.631365776062012, "learning_rate": 6.4940403776843715e-06, "loss": 8.7749, "step": 3211 }, { "epoch": 0.6275889019148104, "grad_norm": 9.937586784362793, "learning_rate": 6.488114033959371e-06, "loss": 9.6075, "step": 3212 }, { "epoch": 0.6277842907385698, "grad_norm": 7.5149617195129395, "learning_rate": 6.48218909685373e-06, "loss": 9.0256, "step": 3213 }, { "epoch": 0.627979679562329, "grad_norm": 9.131972312927246, "learning_rate": 6.476265568740572e-06, "loss": 8.9044, "step": 3214 }, { "epoch": 0.6281750683860883, "grad_norm": 15.428956031799316, "learning_rate": 6.470343451992452e-06, "loss": 9.7636, "step": 3215 }, { "epoch": 0.6283704572098476, "grad_norm": 8.987717628479004, "learning_rate": 6.464422748981364e-06, "loss": 8.9946, "step": 3216 }, { "epoch": 0.6285658460336069, "grad_norm": 10.1642484664917, "learning_rate": 6.458503462078731e-06, "loss": 10.0214, "step": 3217 }, { "epoch": 0.6287612348573661, "grad_norm": 7.6031413078308105, "learning_rate": 6.452585593655412e-06, "loss": 9.4287, "step": 3218 }, { "epoch": 0.6289566236811255, "grad_norm": 8.994832038879395, "learning_rate": 6.446669146081697e-06, "loss": 9.3486, "step": 3219 }, { "epoch": 0.6291520125048847, "grad_norm": 15.768937110900879, "learning_rate": 6.440754121727308e-06, "loss": 9.4399, "step": 3220 }, { "epoch": 0.629347401328644, "grad_norm": 8.522614479064941, "learning_rate": 6.434840522961395e-06, "loss": 8.7106, "step": 3221 }, { "epoch": 0.6295427901524033, "grad_norm": 9.83200740814209, "learning_rate": 6.428928352152538e-06, "loss": 9.5416, "step": 3222 }, { "epoch": 0.6297381789761626, "grad_norm": 9.205872535705566, "learning_rate": 6.423017611668745e-06, "loss": 8.5827, "step": 3223 }, { "epoch": 0.6299335677999218, "grad_norm": 9.592184066772461, "learning_rate": 6.4171083038774525e-06, "loss": 10.8158, "step": 3224 }, { "epoch": 0.6301289566236812, "grad_norm": 12.21749210357666, "learning_rate": 6.411200431145519e-06, "loss": 10.0817, "step": 3225 }, { "epoch": 0.6303243454474404, "grad_norm": 11.177057266235352, "learning_rate": 6.40529399583923e-06, "loss": 9.248, "step": 3226 }, { "epoch": 0.6305197342711997, "grad_norm": 8.112342834472656, "learning_rate": 6.399389000324301e-06, "loss": 9.1992, "step": 3227 }, { "epoch": 0.6307151230949589, "grad_norm": 14.266334533691406, "learning_rate": 6.393485446965861e-06, "loss": 8.742, "step": 3228 }, { "epoch": 0.6309105119187183, "grad_norm": 8.99083423614502, "learning_rate": 6.387583338128471e-06, "loss": 9.5529, "step": 3229 }, { "epoch": 0.6311059007424775, "grad_norm": 7.230621337890625, "learning_rate": 6.381682676176106e-06, "loss": 8.9044, "step": 3230 }, { "epoch": 0.6313012895662368, "grad_norm": 10.015031814575195, "learning_rate": 6.375783463472165e-06, "loss": 8.8948, "step": 3231 }, { "epoch": 0.6314966783899961, "grad_norm": 6.90847110748291, "learning_rate": 6.369885702379465e-06, "loss": 9.1416, "step": 3232 }, { "epoch": 0.6316920672137554, "grad_norm": 10.970864295959473, "learning_rate": 6.363989395260244e-06, "loss": 10.2324, "step": 3233 }, { "epoch": 0.6318874560375146, "grad_norm": 8.515042304992676, "learning_rate": 6.358094544476154e-06, "loss": 8.9114, "step": 3234 }, { "epoch": 0.632082844861274, "grad_norm": 8.391950607299805, "learning_rate": 6.352201152388269e-06, "loss": 9.033, "step": 3235 }, { "epoch": 0.6322782336850332, "grad_norm": 7.665505886077881, "learning_rate": 6.346309221357072e-06, "loss": 10.047, "step": 3236 }, { "epoch": 0.6324736225087925, "grad_norm": 7.0385212898254395, "learning_rate": 6.340418753742468e-06, "loss": 9.8185, "step": 3237 }, { "epoch": 0.6326690113325518, "grad_norm": 8.163466453552246, "learning_rate": 6.334529751903768e-06, "loss": 9.4894, "step": 3238 }, { "epoch": 0.6328644001563111, "grad_norm": 8.51423454284668, "learning_rate": 6.328642218199702e-06, "loss": 9.8436, "step": 3239 }, { "epoch": 0.6330597889800703, "grad_norm": 7.965399742126465, "learning_rate": 6.32275615498841e-06, "loss": 9.2198, "step": 3240 }, { "epoch": 0.6332551778038297, "grad_norm": 9.111120223999023, "learning_rate": 6.316871564627443e-06, "loss": 9.3589, "step": 3241 }, { "epoch": 0.6334505666275889, "grad_norm": 7.943230152130127, "learning_rate": 6.310988449473763e-06, "loss": 8.1335, "step": 3242 }, { "epoch": 0.6336459554513482, "grad_norm": 8.767099380493164, "learning_rate": 6.30510681188374e-06, "loss": 8.2788, "step": 3243 }, { "epoch": 0.6338413442751074, "grad_norm": 9.133307456970215, "learning_rate": 6.299226654213151e-06, "loss": 9.5274, "step": 3244 }, { "epoch": 0.6340367330988668, "grad_norm": 12.576370239257812, "learning_rate": 6.293347978817182e-06, "loss": 8.6891, "step": 3245 }, { "epoch": 0.634232121922626, "grad_norm": 7.583223342895508, "learning_rate": 6.287470788050427e-06, "loss": 9.4539, "step": 3246 }, { "epoch": 0.6344275107463853, "grad_norm": 9.225921630859375, "learning_rate": 6.281595084266883e-06, "loss": 9.8242, "step": 3247 }, { "epoch": 0.6346228995701446, "grad_norm": 9.983611106872559, "learning_rate": 6.2757208698199475e-06, "loss": 9.5421, "step": 3248 }, { "epoch": 0.6348182883939039, "grad_norm": 13.373017311096191, "learning_rate": 6.269848147062431e-06, "loss": 9.2219, "step": 3249 }, { "epoch": 0.6350136772176631, "grad_norm": 9.064508438110352, "learning_rate": 6.263976918346542e-06, "loss": 9.4761, "step": 3250 }, { "epoch": 0.6352090660414225, "grad_norm": 10.210753440856934, "learning_rate": 6.258107186023885e-06, "loss": 8.3957, "step": 3251 }, { "epoch": 0.6354044548651817, "grad_norm": 7.373657703399658, "learning_rate": 6.252238952445473e-06, "loss": 8.2487, "step": 3252 }, { "epoch": 0.635599843688941, "grad_norm": 7.989492416381836, "learning_rate": 6.246372219961714e-06, "loss": 8.9867, "step": 3253 }, { "epoch": 0.6357952325127003, "grad_norm": 7.448361396789551, "learning_rate": 6.240506990922418e-06, "loss": 9.0057, "step": 3254 }, { "epoch": 0.6359906213364596, "grad_norm": 8.415063858032227, "learning_rate": 6.23464326767679e-06, "loss": 9.4472, "step": 3255 }, { "epoch": 0.6361860101602188, "grad_norm": 10.542451858520508, "learning_rate": 6.228781052573436e-06, "loss": 9.0757, "step": 3256 }, { "epoch": 0.6363813989839782, "grad_norm": 9.972257614135742, "learning_rate": 6.22292034796035e-06, "loss": 9.8713, "step": 3257 }, { "epoch": 0.6365767878077374, "grad_norm": 9.886518478393555, "learning_rate": 6.217061156184932e-06, "loss": 8.1271, "step": 3258 }, { "epoch": 0.6367721766314967, "grad_norm": 9.851444244384766, "learning_rate": 6.211203479593968e-06, "loss": 9.6882, "step": 3259 }, { "epoch": 0.636967565455256, "grad_norm": 9.61854362487793, "learning_rate": 6.205347320533637e-06, "loss": 9.0446, "step": 3260 }, { "epoch": 0.6371629542790153, "grad_norm": 10.928366661071777, "learning_rate": 6.199492681349516e-06, "loss": 10.1104, "step": 3261 }, { "epoch": 0.6373583431027745, "grad_norm": 7.869286060333252, "learning_rate": 6.193639564386567e-06, "loss": 8.8156, "step": 3262 }, { "epoch": 0.6375537319265338, "grad_norm": 10.400713920593262, "learning_rate": 6.18778797198915e-06, "loss": 8.5672, "step": 3263 }, { "epoch": 0.6377491207502931, "grad_norm": 10.945712089538574, "learning_rate": 6.181937906501001e-06, "loss": 10.0861, "step": 3264 }, { "epoch": 0.6379445095740524, "grad_norm": 9.357305526733398, "learning_rate": 6.176089370265259e-06, "loss": 8.6621, "step": 3265 }, { "epoch": 0.6381398983978116, "grad_norm": 11.457706451416016, "learning_rate": 6.170242365624446e-06, "loss": 9.3661, "step": 3266 }, { "epoch": 0.638335287221571, "grad_norm": 13.932377815246582, "learning_rate": 6.164396894920463e-06, "loss": 10.1762, "step": 3267 }, { "epoch": 0.6385306760453302, "grad_norm": 8.705086708068848, "learning_rate": 6.158552960494605e-06, "loss": 9.3154, "step": 3268 }, { "epoch": 0.6387260648690894, "grad_norm": 9.752387046813965, "learning_rate": 6.152710564687552e-06, "loss": 8.626, "step": 3269 }, { "epoch": 0.6389214536928488, "grad_norm": 10.422157287597656, "learning_rate": 6.1468697098393585e-06, "loss": 9.6922, "step": 3270 }, { "epoch": 0.639116842516608, "grad_norm": 8.750487327575684, "learning_rate": 6.141030398289474e-06, "loss": 8.7594, "step": 3271 }, { "epoch": 0.6393122313403673, "grad_norm": 10.304443359375, "learning_rate": 6.135192632376721e-06, "loss": 9.5961, "step": 3272 }, { "epoch": 0.6395076201641267, "grad_norm": 7.825802326202393, "learning_rate": 6.129356414439308e-06, "loss": 9.8164, "step": 3273 }, { "epoch": 0.6397030089878859, "grad_norm": 7.505426406860352, "learning_rate": 6.12352174681482e-06, "loss": 8.8713, "step": 3274 }, { "epoch": 0.6398983978116451, "grad_norm": 8.099054336547852, "learning_rate": 6.117688631840224e-06, "loss": 9.2513, "step": 3275 }, { "epoch": 0.6400937866354045, "grad_norm": 6.938951015472412, "learning_rate": 6.111857071851861e-06, "loss": 9.7799, "step": 3276 }, { "epoch": 0.6402891754591638, "grad_norm": 7.928799629211426, "learning_rate": 6.106027069185455e-06, "loss": 9.7725, "step": 3277 }, { "epoch": 0.640484564282923, "grad_norm": 14.206740379333496, "learning_rate": 6.100198626176101e-06, "loss": 9.5218, "step": 3278 }, { "epoch": 0.6406799531066822, "grad_norm": 8.585370063781738, "learning_rate": 6.094371745158273e-06, "loss": 9.4407, "step": 3279 }, { "epoch": 0.6408753419304416, "grad_norm": 10.159416198730469, "learning_rate": 6.088546428465816e-06, "loss": 10.2069, "step": 3280 }, { "epoch": 0.6410707307542008, "grad_norm": 8.027912139892578, "learning_rate": 6.082722678431952e-06, "loss": 10.0021, "step": 3281 }, { "epoch": 0.6412661195779601, "grad_norm": 7.191110134124756, "learning_rate": 6.076900497389273e-06, "loss": 9.1605, "step": 3282 }, { "epoch": 0.6414615084017194, "grad_norm": 9.171540260314941, "learning_rate": 6.071079887669742e-06, "loss": 9.4411, "step": 3283 }, { "epoch": 0.6416568972254787, "grad_norm": 9.93891716003418, "learning_rate": 6.065260851604694e-06, "loss": 10.1845, "step": 3284 }, { "epoch": 0.6418522860492379, "grad_norm": 7.612746238708496, "learning_rate": 6.0594433915248355e-06, "loss": 7.7668, "step": 3285 }, { "epoch": 0.6420476748729973, "grad_norm": 9.86822509765625, "learning_rate": 6.053627509760238e-06, "loss": 9.5375, "step": 3286 }, { "epoch": 0.6422430636967565, "grad_norm": 9.132811546325684, "learning_rate": 6.047813208640343e-06, "loss": 9.6339, "step": 3287 }, { "epoch": 0.6424384525205158, "grad_norm": 8.128195762634277, "learning_rate": 6.04200049049396e-06, "loss": 8.6282, "step": 3288 }, { "epoch": 0.6426338413442751, "grad_norm": 10.335332870483398, "learning_rate": 6.036189357649263e-06, "loss": 9.745, "step": 3289 }, { "epoch": 0.6428292301680344, "grad_norm": 8.72690486907959, "learning_rate": 6.030379812433788e-06, "loss": 8.6669, "step": 3290 }, { "epoch": 0.6430246189917936, "grad_norm": 7.441038131713867, "learning_rate": 6.024571857174443e-06, "loss": 8.4453, "step": 3291 }, { "epoch": 0.643220007815553, "grad_norm": 7.772334098815918, "learning_rate": 6.018765494197492e-06, "loss": 9.6616, "step": 3292 }, { "epoch": 0.6434153966393122, "grad_norm": 8.764260292053223, "learning_rate": 6.012960725828561e-06, "loss": 9.2197, "step": 3293 }, { "epoch": 0.6436107854630715, "grad_norm": 9.021172523498535, "learning_rate": 6.0071575543926445e-06, "loss": 9.8582, "step": 3294 }, { "epoch": 0.6438061742868308, "grad_norm": 7.492588043212891, "learning_rate": 6.001355982214092e-06, "loss": 9.0723, "step": 3295 }, { "epoch": 0.6440015631105901, "grad_norm": 8.09508228302002, "learning_rate": 5.995556011616611e-06, "loss": 8.796, "step": 3296 }, { "epoch": 0.6441969519343493, "grad_norm": 9.950896263122559, "learning_rate": 5.989757644923271e-06, "loss": 7.8733, "step": 3297 }, { "epoch": 0.6443923407581086, "grad_norm": 8.723230361938477, "learning_rate": 5.983960884456496e-06, "loss": 9.9915, "step": 3298 }, { "epoch": 0.6445877295818679, "grad_norm": 31.13210105895996, "learning_rate": 5.978165732538073e-06, "loss": 9.8638, "step": 3299 }, { "epoch": 0.6447831184056272, "grad_norm": 6.883072376251221, "learning_rate": 5.972372191489136e-06, "loss": 8.3624, "step": 3300 }, { "epoch": 0.6449785072293864, "grad_norm": 7.361454010009766, "learning_rate": 5.9665802636301805e-06, "loss": 8.5977, "step": 3301 }, { "epoch": 0.6451738960531458, "grad_norm": 7.5528459548950195, "learning_rate": 5.960789951281052e-06, "loss": 9.2596, "step": 3302 }, { "epoch": 0.645369284876905, "grad_norm": 9.67246150970459, "learning_rate": 5.955001256760951e-06, "loss": 9.8974, "step": 3303 }, { "epoch": 0.6455646737006643, "grad_norm": 8.841197967529297, "learning_rate": 5.9492141823884295e-06, "loss": 9.5671, "step": 3304 }, { "epoch": 0.6457600625244236, "grad_norm": 8.962235450744629, "learning_rate": 5.943428730481391e-06, "loss": 7.8073, "step": 3305 }, { "epoch": 0.6459554513481829, "grad_norm": 10.199047088623047, "learning_rate": 5.937644903357087e-06, "loss": 9.4195, "step": 3306 }, { "epoch": 0.6461508401719421, "grad_norm": 9.305266380310059, "learning_rate": 5.931862703332121e-06, "loss": 8.6202, "step": 3307 }, { "epoch": 0.6463462289957015, "grad_norm": 8.97860336303711, "learning_rate": 5.926082132722445e-06, "loss": 8.9567, "step": 3308 }, { "epoch": 0.6465416178194607, "grad_norm": 6.764257431030273, "learning_rate": 5.920303193843353e-06, "loss": 8.338, "step": 3309 }, { "epoch": 0.64673700664322, "grad_norm": 7.807435035705566, "learning_rate": 5.914525889009493e-06, "loss": 8.907, "step": 3310 }, { "epoch": 0.6469323954669793, "grad_norm": 10.246101379394531, "learning_rate": 5.908750220534853e-06, "loss": 8.8581, "step": 3311 }, { "epoch": 0.6471277842907386, "grad_norm": 8.756179809570312, "learning_rate": 5.902976190732768e-06, "loss": 8.8028, "step": 3312 }, { "epoch": 0.6473231731144978, "grad_norm": 8.108232498168945, "learning_rate": 5.897203801915918e-06, "loss": 9.0011, "step": 3313 }, { "epoch": 0.6475185619382572, "grad_norm": 7.685898303985596, "learning_rate": 5.8914330563963205e-06, "loss": 9.0275, "step": 3314 }, { "epoch": 0.6477139507620164, "grad_norm": 7.592754364013672, "learning_rate": 5.885663956485341e-06, "loss": 8.0847, "step": 3315 }, { "epoch": 0.6479093395857757, "grad_norm": 8.844161033630371, "learning_rate": 5.8798965044936816e-06, "loss": 9.006, "step": 3316 }, { "epoch": 0.6481047284095349, "grad_norm": 10.648222923278809, "learning_rate": 5.8741307027313875e-06, "loss": 9.1198, "step": 3317 }, { "epoch": 0.6483001172332943, "grad_norm": 9.618973731994629, "learning_rate": 5.868366553507841e-06, "loss": 9.2452, "step": 3318 }, { "epoch": 0.6484955060570535, "grad_norm": 9.56131649017334, "learning_rate": 5.86260405913176e-06, "loss": 9.0775, "step": 3319 }, { "epoch": 0.6486908948808128, "grad_norm": 8.947144508361816, "learning_rate": 5.856843221911206e-06, "loss": 8.8293, "step": 3320 }, { "epoch": 0.6488862837045721, "grad_norm": 7.5485382080078125, "learning_rate": 5.851084044153573e-06, "loss": 9.0039, "step": 3321 }, { "epoch": 0.6490816725283314, "grad_norm": 11.291420936584473, "learning_rate": 5.845326528165588e-06, "loss": 8.9254, "step": 3322 }, { "epoch": 0.6492770613520906, "grad_norm": 8.44905948638916, "learning_rate": 5.839570676253317e-06, "loss": 9.5899, "step": 3323 }, { "epoch": 0.64947245017585, "grad_norm": 8.68692684173584, "learning_rate": 5.833816490722153e-06, "loss": 9.3606, "step": 3324 }, { "epoch": 0.6496678389996092, "grad_norm": 8.023199081420898, "learning_rate": 5.828063973876834e-06, "loss": 8.6232, "step": 3325 }, { "epoch": 0.6498632278233685, "grad_norm": 6.996118068695068, "learning_rate": 5.822313128021413e-06, "loss": 7.3301, "step": 3326 }, { "epoch": 0.6500586166471278, "grad_norm": 6.996118068695068, "learning_rate": 5.822313128021413e-06, "loss": 9.9519, "step": 3327 }, { "epoch": 0.6502540054708871, "grad_norm": 7.3202056884765625, "learning_rate": 5.81656395545929e-06, "loss": 8.1281, "step": 3328 }, { "epoch": 0.6504493942946463, "grad_norm": 9.246292114257812, "learning_rate": 5.810816458493184e-06, "loss": 8.9741, "step": 3329 }, { "epoch": 0.6506447831184057, "grad_norm": 14.636777877807617, "learning_rate": 5.805070639425141e-06, "loss": 9.053, "step": 3330 }, { "epoch": 0.6508401719421649, "grad_norm": 10.433954238891602, "learning_rate": 5.7993265005565494e-06, "loss": 8.3707, "step": 3331 }, { "epoch": 0.6510355607659242, "grad_norm": 9.631535530090332, "learning_rate": 5.793584044188105e-06, "loss": 8.9179, "step": 3332 }, { "epoch": 0.6512309495896834, "grad_norm": 9.813020706176758, "learning_rate": 5.7878432726198495e-06, "loss": 8.7182, "step": 3333 }, { "epoch": 0.6514263384134428, "grad_norm": 11.817461967468262, "learning_rate": 5.782104188151132e-06, "loss": 9.2989, "step": 3334 }, { "epoch": 0.651621727237202, "grad_norm": 9.830137252807617, "learning_rate": 5.776366793080641e-06, "loss": 9.5555, "step": 3335 }, { "epoch": 0.6518171160609613, "grad_norm": 7.121776103973389, "learning_rate": 5.770631089706374e-06, "loss": 8.3879, "step": 3336 }, { "epoch": 0.6520125048847206, "grad_norm": 11.094462394714355, "learning_rate": 5.7648970803256664e-06, "loss": 10.1041, "step": 3337 }, { "epoch": 0.6522078937084799, "grad_norm": 7.530900955200195, "learning_rate": 5.759164767235159e-06, "loss": 9.1009, "step": 3338 }, { "epoch": 0.6524032825322391, "grad_norm": 10.451367378234863, "learning_rate": 5.753434152730829e-06, "loss": 8.9201, "step": 3339 }, { "epoch": 0.6525986713559985, "grad_norm": 9.436930656433105, "learning_rate": 5.747705239107958e-06, "loss": 8.7656, "step": 3340 }, { "epoch": 0.6527940601797577, "grad_norm": 10.836393356323242, "learning_rate": 5.741978028661161e-06, "loss": 9.5958, "step": 3341 }, { "epoch": 0.652989449003517, "grad_norm": 10.820073127746582, "learning_rate": 5.736252523684359e-06, "loss": 9.6317, "step": 3342 }, { "epoch": 0.6531848378272763, "grad_norm": 6.66307258605957, "learning_rate": 5.730528726470792e-06, "loss": 7.7654, "step": 3343 }, { "epoch": 0.6533802266510356, "grad_norm": 9.497946739196777, "learning_rate": 5.724806639313025e-06, "loss": 8.1978, "step": 3344 }, { "epoch": 0.6535756154747948, "grad_norm": 8.500927925109863, "learning_rate": 5.719086264502924e-06, "loss": 9.5653, "step": 3345 }, { "epoch": 0.6537710042985542, "grad_norm": 8.197712898254395, "learning_rate": 5.713367604331688e-06, "loss": 8.9477, "step": 3346 }, { "epoch": 0.6539663931223134, "grad_norm": 9.65852165222168, "learning_rate": 5.707650661089804e-06, "loss": 9.4163, "step": 3347 }, { "epoch": 0.6541617819460727, "grad_norm": 7.677917957305908, "learning_rate": 5.701935437067096e-06, "loss": 9.2371, "step": 3348 }, { "epoch": 0.654357170769832, "grad_norm": 8.639566421508789, "learning_rate": 5.696221934552684e-06, "loss": 8.1582, "step": 3349 }, { "epoch": 0.6545525595935913, "grad_norm": 7.323366165161133, "learning_rate": 5.690510155835005e-06, "loss": 9.698, "step": 3350 }, { "epoch": 0.6547479484173505, "grad_norm": 8.872257232666016, "learning_rate": 5.6848001032018e-06, "loss": 8.4604, "step": 3351 }, { "epoch": 0.6549433372411098, "grad_norm": 8.639331817626953, "learning_rate": 5.679091778940129e-06, "loss": 9.3642, "step": 3352 }, { "epoch": 0.6551387260648691, "grad_norm": 7.850310325622559, "learning_rate": 5.6733851853363455e-06, "loss": 9.2343, "step": 3353 }, { "epoch": 0.6553341148886284, "grad_norm": 8.028477668762207, "learning_rate": 5.667680324676125e-06, "loss": 9.2818, "step": 3354 }, { "epoch": 0.6555295037123876, "grad_norm": 8.717341423034668, "learning_rate": 5.661977199244437e-06, "loss": 9.2635, "step": 3355 }, { "epoch": 0.655724892536147, "grad_norm": 8.757511138916016, "learning_rate": 5.656275811325557e-06, "loss": 9.2249, "step": 3356 }, { "epoch": 0.6559202813599062, "grad_norm": 11.633004188537598, "learning_rate": 5.650576163203075e-06, "loss": 8.787, "step": 3357 }, { "epoch": 0.6561156701836655, "grad_norm": 13.906158447265625, "learning_rate": 5.64487825715987e-06, "loss": 9.0062, "step": 3358 }, { "epoch": 0.6563110590074248, "grad_norm": 8.314506530761719, "learning_rate": 5.639182095478138e-06, "loss": 8.6612, "step": 3359 }, { "epoch": 0.6565064478311841, "grad_norm": 6.556262016296387, "learning_rate": 5.633487680439362e-06, "loss": 9.3609, "step": 3360 }, { "epoch": 0.6567018366549433, "grad_norm": 9.146658897399902, "learning_rate": 5.627795014324335e-06, "loss": 9.2437, "step": 3361 }, { "epoch": 0.6568972254787027, "grad_norm": 8.117045402526855, "learning_rate": 5.622104099413146e-06, "loss": 9.0189, "step": 3362 }, { "epoch": 0.6570926143024619, "grad_norm": 7.30604887008667, "learning_rate": 5.616414937985184e-06, "loss": 8.4643, "step": 3363 }, { "epoch": 0.6572880031262212, "grad_norm": 7.005485534667969, "learning_rate": 5.610727532319131e-06, "loss": 7.646, "step": 3364 }, { "epoch": 0.6574833919499805, "grad_norm": 8.44398307800293, "learning_rate": 5.605041884692977e-06, "loss": 9.31, "step": 3365 }, { "epoch": 0.6576787807737398, "grad_norm": 10.372838973999023, "learning_rate": 5.5993579973839915e-06, "loss": 9.2662, "step": 3366 }, { "epoch": 0.657874169597499, "grad_norm": 7.844415664672852, "learning_rate": 5.5936758726687554e-06, "loss": 9.3544, "step": 3367 }, { "epoch": 0.6580695584212582, "grad_norm": 10.088156700134277, "learning_rate": 5.587995512823133e-06, "loss": 8.2324, "step": 3368 }, { "epoch": 0.6582649472450176, "grad_norm": 10.11949348449707, "learning_rate": 5.582316920122281e-06, "loss": 9.4767, "step": 3369 }, { "epoch": 0.6584603360687769, "grad_norm": 7.221989631652832, "learning_rate": 5.57664009684066e-06, "loss": 8.1311, "step": 3370 }, { "epoch": 0.6586557248925361, "grad_norm": 10.301944732666016, "learning_rate": 5.570965045252006e-06, "loss": 8.7277, "step": 3371 }, { "epoch": 0.6588511137162955, "grad_norm": 7.021796703338623, "learning_rate": 5.565291767629359e-06, "loss": 9.1348, "step": 3372 }, { "epoch": 0.6590465025400547, "grad_norm": 7.4562883377075195, "learning_rate": 5.559620266245038e-06, "loss": 9.2466, "step": 3373 }, { "epoch": 0.659241891363814, "grad_norm": 8.977425575256348, "learning_rate": 5.553950543370663e-06, "loss": 10.1065, "step": 3374 }, { "epoch": 0.6594372801875733, "grad_norm": 10.201164245605469, "learning_rate": 5.548282601277124e-06, "loss": 9.538, "step": 3375 }, { "epoch": 0.6596326690113326, "grad_norm": 7.474252223968506, "learning_rate": 5.542616442234618e-06, "loss": 8.566, "step": 3376 }, { "epoch": 0.6598280578350918, "grad_norm": 13.249435424804688, "learning_rate": 5.536952068512609e-06, "loss": 9.1197, "step": 3377 }, { "epoch": 0.6600234466588512, "grad_norm": 7.932127952575684, "learning_rate": 5.53128948237986e-06, "loss": 8.6702, "step": 3378 }, { "epoch": 0.6602188354826104, "grad_norm": 8.74057674407959, "learning_rate": 5.52562868610441e-06, "loss": 9.1433, "step": 3379 }, { "epoch": 0.6604142243063696, "grad_norm": 9.633611679077148, "learning_rate": 5.5199696819535856e-06, "loss": 9.1596, "step": 3380 }, { "epoch": 0.660609613130129, "grad_norm": 8.033287048339844, "learning_rate": 5.514312472193992e-06, "loss": 8.9639, "step": 3381 }, { "epoch": 0.6608050019538882, "grad_norm": 9.30175495147705, "learning_rate": 5.508657059091516e-06, "loss": 9.6563, "step": 3382 }, { "epoch": 0.6610003907776475, "grad_norm": 9.922159194946289, "learning_rate": 5.50300344491133e-06, "loss": 9.0399, "step": 3383 }, { "epoch": 0.6611957796014069, "grad_norm": 11.326530456542969, "learning_rate": 5.497351631917878e-06, "loss": 8.8985, "step": 3384 }, { "epoch": 0.6613911684251661, "grad_norm": 8.612028121948242, "learning_rate": 5.49170162237489e-06, "loss": 9.1703, "step": 3385 }, { "epoch": 0.6615865572489253, "grad_norm": 8.003447532653809, "learning_rate": 5.4860534185453665e-06, "loss": 8.3363, "step": 3386 }, { "epoch": 0.6617819460726846, "grad_norm": 8.059162139892578, "learning_rate": 5.480407022691594e-06, "loss": 8.5584, "step": 3387 }, { "epoch": 0.661977334896444, "grad_norm": 8.822245597839355, "learning_rate": 5.474762437075123e-06, "loss": 9.7109, "step": 3388 }, { "epoch": 0.6621727237202032, "grad_norm": 8.332511901855469, "learning_rate": 5.46911966395679e-06, "loss": 8.1816, "step": 3389 }, { "epoch": 0.6623681125439624, "grad_norm": 7.900548934936523, "learning_rate": 5.463478705596696e-06, "loss": 9.266, "step": 3390 }, { "epoch": 0.6625635013677218, "grad_norm": 8.51298713684082, "learning_rate": 5.457839564254228e-06, "loss": 9.6674, "step": 3391 }, { "epoch": 0.662758890191481, "grad_norm": 7.403846740722656, "learning_rate": 5.452202242188029e-06, "loss": 8.6096, "step": 3392 }, { "epoch": 0.6629542790152403, "grad_norm": 7.903921127319336, "learning_rate": 5.446566741656028e-06, "loss": 9.7027, "step": 3393 }, { "epoch": 0.6631496678389996, "grad_norm": 10.345064163208008, "learning_rate": 5.440933064915414e-06, "loss": 8.7708, "step": 3394 }, { "epoch": 0.6633450566627589, "grad_norm": 9.622152328491211, "learning_rate": 5.435301214222646e-06, "loss": 9.8728, "step": 3395 }, { "epoch": 0.6635404454865181, "grad_norm": 10.184322357177734, "learning_rate": 5.429671191833464e-06, "loss": 9.5646, "step": 3396 }, { "epoch": 0.6637358343102775, "grad_norm": 7.466825008392334, "learning_rate": 5.424043000002857e-06, "loss": 8.7748, "step": 3397 }, { "epoch": 0.6639312231340367, "grad_norm": 8.833131790161133, "learning_rate": 5.4184166409851e-06, "loss": 9.9261, "step": 3398 }, { "epoch": 0.664126611957796, "grad_norm": 8.251693725585938, "learning_rate": 5.412792117033717e-06, "loss": 7.8546, "step": 3399 }, { "epoch": 0.6643220007815553, "grad_norm": 10.826041221618652, "learning_rate": 5.407169430401511e-06, "loss": 10.1836, "step": 3400 }, { "epoch": 0.6645173896053146, "grad_norm": 11.188115119934082, "learning_rate": 5.401548583340535e-06, "loss": 10.2475, "step": 3401 }, { "epoch": 0.6647127784290738, "grad_norm": 12.538986206054688, "learning_rate": 5.395929578102119e-06, "loss": 10.5756, "step": 3402 }, { "epoch": 0.6649081672528332, "grad_norm": 10.929691314697266, "learning_rate": 5.390312416936845e-06, "loss": 9.0069, "step": 3403 }, { "epoch": 0.6651035560765924, "grad_norm": 7.685513496398926, "learning_rate": 5.384697102094565e-06, "loss": 8.5899, "step": 3404 }, { "epoch": 0.6652989449003517, "grad_norm": 7.4986090660095215, "learning_rate": 5.379083635824385e-06, "loss": 9.4647, "step": 3405 }, { "epoch": 0.6654943337241109, "grad_norm": 9.000301361083984, "learning_rate": 5.3734720203746715e-06, "loss": 9.9078, "step": 3406 }, { "epoch": 0.6656897225478703, "grad_norm": 7.087594032287598, "learning_rate": 5.367862257993046e-06, "loss": 8.7551, "step": 3407 }, { "epoch": 0.6658851113716295, "grad_norm": 9.473038673400879, "learning_rate": 5.362254350926403e-06, "loss": 10.2642, "step": 3408 }, { "epoch": 0.6660805001953888, "grad_norm": 8.565691947937012, "learning_rate": 5.356648301420872e-06, "loss": 9.0764, "step": 3409 }, { "epoch": 0.6662758890191481, "grad_norm": 6.862060070037842, "learning_rate": 5.3510441117218615e-06, "loss": 8.8252, "step": 3410 }, { "epoch": 0.6664712778429074, "grad_norm": 8.263054847717285, "learning_rate": 5.3454417840740125e-06, "loss": 9.004, "step": 3411 }, { "epoch": 0.6666666666666666, "grad_norm": 9.111275672912598, "learning_rate": 5.339841320721239e-06, "loss": 8.0287, "step": 3412 }, { "epoch": 0.666862055490426, "grad_norm": 8.513102531433105, "learning_rate": 5.334242723906701e-06, "loss": 9.5652, "step": 3413 }, { "epoch": 0.6670574443141852, "grad_norm": 8.109259605407715, "learning_rate": 5.328645995872801e-06, "loss": 9.0984, "step": 3414 }, { "epoch": 0.6672528331379445, "grad_norm": 7.018557071685791, "learning_rate": 5.323051138861213e-06, "loss": 9.2285, "step": 3415 }, { "epoch": 0.6674482219617038, "grad_norm": 7.353377819061279, "learning_rate": 5.317458155112842e-06, "loss": 9.5638, "step": 3416 }, { "epoch": 0.6676436107854631, "grad_norm": 9.536938667297363, "learning_rate": 5.31186704686786e-06, "loss": 10.1526, "step": 3417 }, { "epoch": 0.6678389996092223, "grad_norm": 7.111334800720215, "learning_rate": 5.30627781636567e-06, "loss": 9.0047, "step": 3418 }, { "epoch": 0.6680343884329817, "grad_norm": 9.799473762512207, "learning_rate": 5.3006904658449445e-06, "loss": 8.7924, "step": 3419 }, { "epoch": 0.6682297772567409, "grad_norm": 8.26110553741455, "learning_rate": 5.295104997543579e-06, "loss": 9.8125, "step": 3420 }, { "epoch": 0.6684251660805002, "grad_norm": 6.976203441619873, "learning_rate": 5.289521413698735e-06, "loss": 9.0219, "step": 3421 }, { "epoch": 0.6686205549042594, "grad_norm": 9.517335891723633, "learning_rate": 5.283939716546805e-06, "loss": 8.918, "step": 3422 }, { "epoch": 0.6688159437280188, "grad_norm": 12.369084358215332, "learning_rate": 5.2783599083234395e-06, "loss": 9.2844, "step": 3423 }, { "epoch": 0.669011332551778, "grad_norm": 8.553173065185547, "learning_rate": 5.2727819912635155e-06, "loss": 9.8545, "step": 3424 }, { "epoch": 0.6692067213755373, "grad_norm": 77.29253387451172, "learning_rate": 5.267205967601172e-06, "loss": 9.5447, "step": 3425 }, { "epoch": 0.6694021101992966, "grad_norm": 10.61068058013916, "learning_rate": 5.261631839569776e-06, "loss": 9.4725, "step": 3426 }, { "epoch": 0.6695974990230559, "grad_norm": 8.081039428710938, "learning_rate": 5.256059609401931e-06, "loss": 9.1839, "step": 3427 }, { "epoch": 0.6697928878468151, "grad_norm": 8.719537734985352, "learning_rate": 5.250489279329501e-06, "loss": 9.3304, "step": 3428 }, { "epoch": 0.6699882766705745, "grad_norm": 7.648680686950684, "learning_rate": 5.244920851583565e-06, "loss": 8.7841, "step": 3429 }, { "epoch": 0.6701836654943337, "grad_norm": 7.385828971862793, "learning_rate": 5.2393543283944595e-06, "loss": 8.0707, "step": 3430 }, { "epoch": 0.670379054318093, "grad_norm": 8.194306373596191, "learning_rate": 5.233789711991744e-06, "loss": 10.0857, "step": 3431 }, { "epoch": 0.6705744431418523, "grad_norm": 12.389236450195312, "learning_rate": 5.228227004604225e-06, "loss": 9.7494, "step": 3432 }, { "epoch": 0.6707698319656116, "grad_norm": 7.089478492736816, "learning_rate": 5.222666208459933e-06, "loss": 8.8768, "step": 3433 }, { "epoch": 0.6709652207893708, "grad_norm": 9.06132984161377, "learning_rate": 5.217107325786148e-06, "loss": 9.0011, "step": 3434 }, { "epoch": 0.6711606096131302, "grad_norm": 6.347553253173828, "learning_rate": 5.211550358809368e-06, "loss": 8.3051, "step": 3435 }, { "epoch": 0.6713559984368894, "grad_norm": 9.32907485961914, "learning_rate": 5.205995309755335e-06, "loss": 9.5297, "step": 3436 }, { "epoch": 0.6715513872606487, "grad_norm": 9.510924339294434, "learning_rate": 5.2004421808490145e-06, "loss": 8.7285, "step": 3437 }, { "epoch": 0.671746776084408, "grad_norm": 6.453519344329834, "learning_rate": 5.194890974314612e-06, "loss": 8.2006, "step": 3438 }, { "epoch": 0.6719421649081673, "grad_norm": 19.823518753051758, "learning_rate": 5.189341692375558e-06, "loss": 9.7615, "step": 3439 }, { "epoch": 0.6721375537319265, "grad_norm": 9.389676094055176, "learning_rate": 5.183794337254504e-06, "loss": 9.473, "step": 3440 }, { "epoch": 0.6723329425556858, "grad_norm": 7.221741199493408, "learning_rate": 5.1782489111733494e-06, "loss": 8.0322, "step": 3441 }, { "epoch": 0.6725283313794451, "grad_norm": 9.929072380065918, "learning_rate": 5.172705416353199e-06, "loss": 8.7213, "step": 3442 }, { "epoch": 0.6727237202032044, "grad_norm": 13.621519088745117, "learning_rate": 5.167163855014403e-06, "loss": 8.6147, "step": 3443 }, { "epoch": 0.6729191090269636, "grad_norm": 7.856151103973389, "learning_rate": 5.161624229376523e-06, "loss": 8.6556, "step": 3444 }, { "epoch": 0.673114497850723, "grad_norm": 9.297572135925293, "learning_rate": 5.156086541658356e-06, "loss": 9.1263, "step": 3445 }, { "epoch": 0.6733098866744822, "grad_norm": 9.4236421585083, "learning_rate": 5.150550794077911e-06, "loss": 9.3593, "step": 3446 }, { "epoch": 0.6735052754982415, "grad_norm": 8.085559844970703, "learning_rate": 5.145016988852436e-06, "loss": 8.7282, "step": 3447 }, { "epoch": 0.6737006643220008, "grad_norm": 7.528972148895264, "learning_rate": 5.139485128198382e-06, "loss": 8.7748, "step": 3448 }, { "epoch": 0.6738960531457601, "grad_norm": 13.162374496459961, "learning_rate": 5.133955214331439e-06, "loss": 8.553, "step": 3449 }, { "epoch": 0.6740914419695193, "grad_norm": 10.401016235351562, "learning_rate": 5.128427249466503e-06, "loss": 9.7187, "step": 3450 }, { "epoch": 0.6742868307932787, "grad_norm": 7.216558933258057, "learning_rate": 5.122901235817701e-06, "loss": 9.4551, "step": 3451 }, { "epoch": 0.6744822196170379, "grad_norm": 9.662524223327637, "learning_rate": 5.117377175598372e-06, "loss": 8.5878, "step": 3452 }, { "epoch": 0.6746776084407972, "grad_norm": 7.320957660675049, "learning_rate": 5.111855071021068e-06, "loss": 8.5201, "step": 3453 }, { "epoch": 0.6748729972645565, "grad_norm": 10.163082122802734, "learning_rate": 5.10633492429757e-06, "loss": 8.9651, "step": 3454 }, { "epoch": 0.6750683860883158, "grad_norm": 8.646751403808594, "learning_rate": 5.100816737638863e-06, "loss": 9.3415, "step": 3455 }, { "epoch": 0.675263774912075, "grad_norm": 7.870480537414551, "learning_rate": 5.095300513255157e-06, "loss": 9.3154, "step": 3456 }, { "epoch": 0.6754591637358343, "grad_norm": 9.3548583984375, "learning_rate": 5.0897862533558665e-06, "loss": 8.4811, "step": 3457 }, { "epoch": 0.6756545525595936, "grad_norm": 8.656578063964844, "learning_rate": 5.084273960149629e-06, "loss": 8.9353, "step": 3458 }, { "epoch": 0.6758499413833529, "grad_norm": 9.321224212646484, "learning_rate": 5.0787636358442825e-06, "loss": 8.4416, "step": 3459 }, { "epoch": 0.6760453302071121, "grad_norm": 9.321224212646484, "learning_rate": 5.0787636358442825e-06, "loss": 10.2964, "step": 3460 }, { "epoch": 0.6762407190308715, "grad_norm": 8.607054710388184, "learning_rate": 5.07325528264689e-06, "loss": 8.4016, "step": 3461 }, { "epoch": 0.6764361078546307, "grad_norm": 7.984342098236084, "learning_rate": 5.0677489027637115e-06, "loss": 8.8981, "step": 3462 }, { "epoch": 0.67663149667839, "grad_norm": 9.526960372924805, "learning_rate": 5.062244498400228e-06, "loss": 8.2054, "step": 3463 }, { "epoch": 0.6768268855021493, "grad_norm": 12.335521697998047, "learning_rate": 5.05674207176112e-06, "loss": 9.5845, "step": 3464 }, { "epoch": 0.6770222743259086, "grad_norm": 10.526650428771973, "learning_rate": 5.051241625050287e-06, "loss": 9.766, "step": 3465 }, { "epoch": 0.6772176631496678, "grad_norm": 9.057307243347168, "learning_rate": 5.045743160470824e-06, "loss": 8.5928, "step": 3466 }, { "epoch": 0.6774130519734272, "grad_norm": 11.242501258850098, "learning_rate": 5.040246680225034e-06, "loss": 10.1617, "step": 3467 }, { "epoch": 0.6776084407971864, "grad_norm": 8.937871932983398, "learning_rate": 5.034752186514433e-06, "loss": 8.895, "step": 3468 }, { "epoch": 0.6778038296209457, "grad_norm": 8.857319831848145, "learning_rate": 5.029259681539732e-06, "loss": 8.7988, "step": 3469 }, { "epoch": 0.677999218444705, "grad_norm": 8.807754516601562, "learning_rate": 5.023769167500856e-06, "loss": 8.2176, "step": 3470 }, { "epoch": 0.6781946072684643, "grad_norm": 9.361241340637207, "learning_rate": 5.018280646596918e-06, "loss": 9.7489, "step": 3471 }, { "epoch": 0.6783899960922235, "grad_norm": 9.284899711608887, "learning_rate": 5.01279412102625e-06, "loss": 8.8437, "step": 3472 }, { "epoch": 0.6785853849159829, "grad_norm": 11.50267505645752, "learning_rate": 5.007309592986367e-06, "loss": 9.6544, "step": 3473 }, { "epoch": 0.6787807737397421, "grad_norm": 8.50446605682373, "learning_rate": 5.001827064674001e-06, "loss": 9.1788, "step": 3474 }, { "epoch": 0.6789761625635014, "grad_norm": 7.195436477661133, "learning_rate": 4.996346538285066e-06, "loss": 9.0272, "step": 3475 }, { "epoch": 0.6791715513872606, "grad_norm": 7.552397727966309, "learning_rate": 4.9908680160146934e-06, "loss": 8.7314, "step": 3476 }, { "epoch": 0.67936694021102, "grad_norm": 8.463982582092285, "learning_rate": 4.9853915000571915e-06, "loss": 9.6868, "step": 3477 }, { "epoch": 0.6795623290347792, "grad_norm": 14.675905227661133, "learning_rate": 4.979916992606083e-06, "loss": 9.1982, "step": 3478 }, { "epoch": 0.6797577178585384, "grad_norm": 7.972330570220947, "learning_rate": 4.974444495854075e-06, "loss": 8.4736, "step": 3479 }, { "epoch": 0.6799531066822978, "grad_norm": 9.317290306091309, "learning_rate": 4.968974011993067e-06, "loss": 9.9627, "step": 3480 }, { "epoch": 0.680148495506057, "grad_norm": 8.15661907196045, "learning_rate": 4.9635055432141675e-06, "loss": 8.6467, "step": 3481 }, { "epoch": 0.6803438843298163, "grad_norm": 7.992409706115723, "learning_rate": 4.958039091707659e-06, "loss": 10.0247, "step": 3482 }, { "epoch": 0.6805392731535757, "grad_norm": 7.441638946533203, "learning_rate": 4.952574659663032e-06, "loss": 8.4088, "step": 3483 }, { "epoch": 0.6807346619773349, "grad_norm": 7.223865509033203, "learning_rate": 4.947112249268955e-06, "loss": 8.8867, "step": 3484 }, { "epoch": 0.6809300508010941, "grad_norm": 9.672779083251953, "learning_rate": 4.9416518627133006e-06, "loss": 8.8267, "step": 3485 }, { "epoch": 0.6811254396248535, "grad_norm": 8.218040466308594, "learning_rate": 4.936193502183114e-06, "loss": 9.6503, "step": 3486 }, { "epoch": 0.6813208284486127, "grad_norm": 9.12499713897705, "learning_rate": 4.930737169864647e-06, "loss": 9.4916, "step": 3487 }, { "epoch": 0.681516217272372, "grad_norm": 9.555933952331543, "learning_rate": 4.925282867943322e-06, "loss": 8.9187, "step": 3488 }, { "epoch": 0.6817116060961314, "grad_norm": 7.154976844787598, "learning_rate": 4.9198305986037635e-06, "loss": 8.8264, "step": 3489 }, { "epoch": 0.6819069949198906, "grad_norm": 6.26767635345459, "learning_rate": 4.9143803640297675e-06, "loss": 8.361, "step": 3490 }, { "epoch": 0.6821023837436498, "grad_norm": 9.955586433410645, "learning_rate": 4.90893216640433e-06, "loss": 9.5551, "step": 3491 }, { "epoch": 0.6822977725674091, "grad_norm": 7.493185997009277, "learning_rate": 4.903486007909619e-06, "loss": 8.9033, "step": 3492 }, { "epoch": 0.6824931613911684, "grad_norm": 12.523082733154297, "learning_rate": 4.898041890726987e-06, "loss": 9.4942, "step": 3493 }, { "epoch": 0.6826885502149277, "grad_norm": 8.432913780212402, "learning_rate": 4.892599817036978e-06, "loss": 8.9897, "step": 3494 }, { "epoch": 0.6828839390386869, "grad_norm": 11.34340763092041, "learning_rate": 4.887159789019306e-06, "loss": 8.7441, "step": 3495 }, { "epoch": 0.6830793278624463, "grad_norm": 9.538162231445312, "learning_rate": 4.881721808852877e-06, "loss": 9.3453, "step": 3496 }, { "epoch": 0.6832747166862055, "grad_norm": 6.3256120681762695, "learning_rate": 4.876285878715764e-06, "loss": 9.0006, "step": 3497 }, { "epoch": 0.6834701055099648, "grad_norm": 10.030577659606934, "learning_rate": 4.870852000785233e-06, "loss": 10.0886, "step": 3498 }, { "epoch": 0.6836654943337241, "grad_norm": 7.294377326965332, "learning_rate": 4.865420177237714e-06, "loss": 9.1273, "step": 3499 }, { "epoch": 0.6838608831574834, "grad_norm": 8.801251411437988, "learning_rate": 4.859990410248828e-06, "loss": 9.648, "step": 3500 }, { "epoch": 0.6840562719812426, "grad_norm": 8.2620849609375, "learning_rate": 4.854562701993358e-06, "loss": 9.1979, "step": 3501 }, { "epoch": 0.684251660805002, "grad_norm": 10.138997077941895, "learning_rate": 4.849137054645276e-06, "loss": 9.3575, "step": 3502 }, { "epoch": 0.6844470496287612, "grad_norm": 8.214333534240723, "learning_rate": 4.843713470377716e-06, "loss": 8.9957, "step": 3503 }, { "epoch": 0.6846424384525205, "grad_norm": 11.297102928161621, "learning_rate": 4.838291951363e-06, "loss": 8.6428, "step": 3504 }, { "epoch": 0.6848378272762798, "grad_norm": 8.354808807373047, "learning_rate": 4.8328724997726095e-06, "loss": 8.0891, "step": 3505 }, { "epoch": 0.6850332161000391, "grad_norm": 10.444690704345703, "learning_rate": 4.827455117777199e-06, "loss": 9.1011, "step": 3506 }, { "epoch": 0.6852286049237983, "grad_norm": 8.39549732208252, "learning_rate": 4.822039807546607e-06, "loss": 9.0153, "step": 3507 }, { "epoch": 0.6854239937475577, "grad_norm": 13.161046981811523, "learning_rate": 4.816626571249825e-06, "loss": 10.1814, "step": 3508 }, { "epoch": 0.6856193825713169, "grad_norm": 9.3190279006958, "learning_rate": 4.811215411055032e-06, "loss": 9.6779, "step": 3509 }, { "epoch": 0.6858147713950762, "grad_norm": 7.86978006362915, "learning_rate": 4.805806329129554e-06, "loss": 9.9839, "step": 3510 }, { "epoch": 0.6860101602188354, "grad_norm": 11.596525192260742, "learning_rate": 4.800399327639907e-06, "loss": 9.7852, "step": 3511 }, { "epoch": 0.6862055490425948, "grad_norm": 7.724665641784668, "learning_rate": 4.794994408751753e-06, "loss": 9.2349, "step": 3512 }, { "epoch": 0.686400937866354, "grad_norm": 8.651827812194824, "learning_rate": 4.78959157462994e-06, "loss": 9.4594, "step": 3513 }, { "epoch": 0.6865963266901133, "grad_norm": 7.398945331573486, "learning_rate": 4.784190827438462e-06, "loss": 9.0394, "step": 3514 }, { "epoch": 0.6867917155138726, "grad_norm": 8.461224555969238, "learning_rate": 4.7787921693404934e-06, "loss": 9.575, "step": 3515 }, { "epoch": 0.6869871043376319, "grad_norm": 9.653675079345703, "learning_rate": 4.773395602498358e-06, "loss": 8.9556, "step": 3516 }, { "epoch": 0.6871824931613911, "grad_norm": 14.610803604125977, "learning_rate": 4.768001129073553e-06, "loss": 8.9779, "step": 3517 }, { "epoch": 0.6873778819851505, "grad_norm": 7.912567138671875, "learning_rate": 4.762608751226731e-06, "loss": 8.9374, "step": 3518 }, { "epoch": 0.6875732708089097, "grad_norm": 7.511148452758789, "learning_rate": 4.757218471117704e-06, "loss": 9.0939, "step": 3519 }, { "epoch": 0.687768659632669, "grad_norm": 7.186136722564697, "learning_rate": 4.7518302909054515e-06, "loss": 8.2911, "step": 3520 }, { "epoch": 0.6879640484564283, "grad_norm": 8.460360527038574, "learning_rate": 4.7464442127481e-06, "loss": 8.9182, "step": 3521 }, { "epoch": 0.6881594372801876, "grad_norm": 8.387063026428223, "learning_rate": 4.74106023880295e-06, "loss": 9.7751, "step": 3522 }, { "epoch": 0.6883548261039468, "grad_norm": 8.238409042358398, "learning_rate": 4.7356783712264405e-06, "loss": 9.041, "step": 3523 }, { "epoch": 0.6885502149277062, "grad_norm": 6.9534592628479, "learning_rate": 4.730298612174187e-06, "loss": 8.1406, "step": 3524 }, { "epoch": 0.6887456037514654, "grad_norm": 11.205235481262207, "learning_rate": 4.724920963800939e-06, "loss": 9.901, "step": 3525 }, { "epoch": 0.6889409925752247, "grad_norm": 7.504866600036621, "learning_rate": 4.719545428260623e-06, "loss": 8.7138, "step": 3526 }, { "epoch": 0.689136381398984, "grad_norm": 8.635149002075195, "learning_rate": 4.714172007706298e-06, "loss": 9.1709, "step": 3527 }, { "epoch": 0.6893317702227433, "grad_norm": 6.568149566650391, "learning_rate": 4.708800704290193e-06, "loss": 8.9039, "step": 3528 }, { "epoch": 0.6895271590465025, "grad_norm": 6.23305082321167, "learning_rate": 4.703431520163675e-06, "loss": 8.5542, "step": 3529 }, { "epoch": 0.6897225478702618, "grad_norm": 207.06385803222656, "learning_rate": 4.698064457477276e-06, "loss": 10.1821, "step": 3530 }, { "epoch": 0.6899179366940211, "grad_norm": 6.3483076095581055, "learning_rate": 4.692699518380664e-06, "loss": 7.4388, "step": 3531 }, { "epoch": 0.6901133255177804, "grad_norm": 7.407440662384033, "learning_rate": 4.687336705022672e-06, "loss": 7.9367, "step": 3532 }, { "epoch": 0.6903087143415396, "grad_norm": 8.763710975646973, "learning_rate": 4.681976019551268e-06, "loss": 8.208, "step": 3533 }, { "epoch": 0.690504103165299, "grad_norm": 9.159621238708496, "learning_rate": 4.676617464113571e-06, "loss": 9.6811, "step": 3534 }, { "epoch": 0.6906994919890582, "grad_norm": 7.8151679039001465, "learning_rate": 4.6712610408558546e-06, "loss": 9.3893, "step": 3535 }, { "epoch": 0.6908948808128175, "grad_norm": 8.498544692993164, "learning_rate": 4.665906751923526e-06, "loss": 8.9263, "step": 3536 }, { "epoch": 0.6910902696365768, "grad_norm": 7.852433681488037, "learning_rate": 4.660554599461151e-06, "loss": 9.2672, "step": 3537 }, { "epoch": 0.6912856584603361, "grad_norm": 7.7206220626831055, "learning_rate": 4.655204585612426e-06, "loss": 8.5579, "step": 3538 }, { "epoch": 0.6914810472840953, "grad_norm": 8.858148574829102, "learning_rate": 4.649856712520205e-06, "loss": 10.4965, "step": 3539 }, { "epoch": 0.6916764361078547, "grad_norm": 9.706315994262695, "learning_rate": 4.64451098232647e-06, "loss": 9.7618, "step": 3540 }, { "epoch": 0.6918718249316139, "grad_norm": 9.114653587341309, "learning_rate": 4.639167397172358e-06, "loss": 8.7886, "step": 3541 }, { "epoch": 0.6920672137553732, "grad_norm": 7.406974792480469, "learning_rate": 4.633825959198135e-06, "loss": 9.5343, "step": 3542 }, { "epoch": 0.6922626025791325, "grad_norm": 8.273829460144043, "learning_rate": 4.628486670543218e-06, "loss": 9.231, "step": 3543 }, { "epoch": 0.6924579914028918, "grad_norm": 8.927740097045898, "learning_rate": 4.623149533346158e-06, "loss": 9.1105, "step": 3544 }, { "epoch": 0.692653380226651, "grad_norm": 8.607205390930176, "learning_rate": 4.6178145497446404e-06, "loss": 8.9878, "step": 3545 }, { "epoch": 0.6928487690504103, "grad_norm": 9.740939140319824, "learning_rate": 4.61248172187549e-06, "loss": 9.4668, "step": 3546 }, { "epoch": 0.6930441578741696, "grad_norm": 10.739304542541504, "learning_rate": 4.607151051874677e-06, "loss": 9.0491, "step": 3547 }, { "epoch": 0.6932395466979289, "grad_norm": 7.079265117645264, "learning_rate": 4.601822541877291e-06, "loss": 8.5638, "step": 3548 }, { "epoch": 0.6934349355216881, "grad_norm": 10.578570365905762, "learning_rate": 4.596496194017575e-06, "loss": 9.3137, "step": 3549 }, { "epoch": 0.6936303243454475, "grad_norm": 15.079474449157715, "learning_rate": 4.591172010428895e-06, "loss": 8.961, "step": 3550 }, { "epoch": 0.6938257131692067, "grad_norm": 9.929420471191406, "learning_rate": 4.585849993243744e-06, "loss": 9.1616, "step": 3551 }, { "epoch": 0.694021101992966, "grad_norm": 6.517254829406738, "learning_rate": 4.580530144593765e-06, "loss": 6.7743, "step": 3552 }, { "epoch": 0.6942164908167253, "grad_norm": 10.221206665039062, "learning_rate": 4.575212466609713e-06, "loss": 10.227, "step": 3553 }, { "epoch": 0.6944118796404846, "grad_norm": 8.188820838928223, "learning_rate": 4.5698969614214924e-06, "loss": 9.3501, "step": 3554 }, { "epoch": 0.6946072684642438, "grad_norm": 10.827705383300781, "learning_rate": 4.5645836311581215e-06, "loss": 10.4984, "step": 3555 }, { "epoch": 0.6948026572880032, "grad_norm": 8.928410530090332, "learning_rate": 4.559272477947759e-06, "loss": 8.8778, "step": 3556 }, { "epoch": 0.6949980461117624, "grad_norm": 8.749245643615723, "learning_rate": 4.553963503917681e-06, "loss": 9.4237, "step": 3557 }, { "epoch": 0.6951934349355217, "grad_norm": 10.222722053527832, "learning_rate": 4.548656711194302e-06, "loss": 9.4023, "step": 3558 }, { "epoch": 0.695388823759281, "grad_norm": 7.6055450439453125, "learning_rate": 4.543352101903153e-06, "loss": 9.0012, "step": 3559 }, { "epoch": 0.6955842125830403, "grad_norm": 11.432832717895508, "learning_rate": 4.538049678168899e-06, "loss": 9.6928, "step": 3560 }, { "epoch": 0.6957796014067995, "grad_norm": 6.709242343902588, "learning_rate": 4.5327494421153195e-06, "loss": 9.0779, "step": 3561 }, { "epoch": 0.6959749902305589, "grad_norm": 6.713991641998291, "learning_rate": 4.527451395865331e-06, "loss": 7.6969, "step": 3562 }, { "epoch": 0.6961703790543181, "grad_norm": 8.824820518493652, "learning_rate": 4.522155541540961e-06, "loss": 9.0058, "step": 3563 }, { "epoch": 0.6963657678780774, "grad_norm": 8.883805274963379, "learning_rate": 4.51686188126336e-06, "loss": 9.5905, "step": 3564 }, { "epoch": 0.6965611567018366, "grad_norm": 7.526402950286865, "learning_rate": 4.5115704171528105e-06, "loss": 8.956, "step": 3565 }, { "epoch": 0.696756545525596, "grad_norm": 6.879123687744141, "learning_rate": 4.506281151328703e-06, "loss": 7.6253, "step": 3566 }, { "epoch": 0.6969519343493552, "grad_norm": 8.952786445617676, "learning_rate": 4.500994085909557e-06, "loss": 9.9273, "step": 3567 }, { "epoch": 0.6971473231731145, "grad_norm": 7.670935153961182, "learning_rate": 4.495709223013e-06, "loss": 8.6843, "step": 3568 }, { "epoch": 0.6973427119968738, "grad_norm": 8.049132347106934, "learning_rate": 4.490426564755792e-06, "loss": 8.277, "step": 3569 }, { "epoch": 0.6975381008206331, "grad_norm": 7.849486351013184, "learning_rate": 4.485146113253794e-06, "loss": 8.815, "step": 3570 }, { "epoch": 0.6977334896443923, "grad_norm": 7.602990627288818, "learning_rate": 4.479867870621996e-06, "loss": 9.5663, "step": 3571 }, { "epoch": 0.6979288784681517, "grad_norm": 7.465991020202637, "learning_rate": 4.474591838974495e-06, "loss": 9.0858, "step": 3572 }, { "epoch": 0.6981242672919109, "grad_norm": 10.337199211120605, "learning_rate": 4.469318020424508e-06, "loss": 9.0271, "step": 3573 }, { "epoch": 0.6983196561156702, "grad_norm": 8.851112365722656, "learning_rate": 4.464046417084359e-06, "loss": 8.8061, "step": 3574 }, { "epoch": 0.6985150449394295, "grad_norm": 8.679439544677734, "learning_rate": 4.458777031065496e-06, "loss": 9.4217, "step": 3575 }, { "epoch": 0.6987104337631888, "grad_norm": 8.675277709960938, "learning_rate": 4.453509864478464e-06, "loss": 8.6926, "step": 3576 }, { "epoch": 0.698905822586948, "grad_norm": 11.892107963562012, "learning_rate": 4.448244919432929e-06, "loss": 9.2724, "step": 3577 }, { "epoch": 0.6991012114107074, "grad_norm": 10.948375701904297, "learning_rate": 4.442982198037669e-06, "loss": 9.3081, "step": 3578 }, { "epoch": 0.6992966002344666, "grad_norm": 12.927075386047363, "learning_rate": 4.437721702400559e-06, "loss": 8.6354, "step": 3579 }, { "epoch": 0.6994919890582258, "grad_norm": 9.045134544372559, "learning_rate": 4.432463434628601e-06, "loss": 9.3044, "step": 3580 }, { "epoch": 0.6996873778819851, "grad_norm": 9.391888618469238, "learning_rate": 4.4272073968278865e-06, "loss": 9.1682, "step": 3581 }, { "epoch": 0.6998827667057445, "grad_norm": 9.01781177520752, "learning_rate": 4.421953591103627e-06, "loss": 8.4321, "step": 3582 }, { "epoch": 0.7000781555295037, "grad_norm": 9.802136421203613, "learning_rate": 4.416702019560129e-06, "loss": 9.8514, "step": 3583 }, { "epoch": 0.7002735443532629, "grad_norm": 8.904387474060059, "learning_rate": 4.411452684300818e-06, "loss": 9.3018, "step": 3584 }, { "epoch": 0.7004689331770223, "grad_norm": 7.979209899902344, "learning_rate": 4.406205587428206e-06, "loss": 8.6123, "step": 3585 }, { "epoch": 0.7006643220007815, "grad_norm": 17.200040817260742, "learning_rate": 4.400960731043929e-06, "loss": 9.4106, "step": 3586 }, { "epoch": 0.7008597108245408, "grad_norm": 7.51259708404541, "learning_rate": 4.395718117248704e-06, "loss": 9.1072, "step": 3587 }, { "epoch": 0.7010550996483002, "grad_norm": 8.189982414245605, "learning_rate": 4.39047774814237e-06, "loss": 8.2669, "step": 3588 }, { "epoch": 0.7012504884720594, "grad_norm": 6.8721442222595215, "learning_rate": 4.385239625823854e-06, "loss": 8.0266, "step": 3589 }, { "epoch": 0.7014458772958186, "grad_norm": 9.310396194458008, "learning_rate": 4.380003752391181e-06, "loss": 9.1325, "step": 3590 }, { "epoch": 0.701641266119578, "grad_norm": 9.49889850616455, "learning_rate": 4.374770129941491e-06, "loss": 10.0293, "step": 3591 }, { "epoch": 0.7018366549433372, "grad_norm": 9.905617713928223, "learning_rate": 4.369538760571003e-06, "loss": 9.4679, "step": 3592 }, { "epoch": 0.7020320437670965, "grad_norm": 8.300040245056152, "learning_rate": 4.3643096463750504e-06, "loss": 8.839, "step": 3593 }, { "epoch": 0.7022274325908558, "grad_norm": 9.779138565063477, "learning_rate": 4.359082789448048e-06, "loss": 8.1039, "step": 3594 }, { "epoch": 0.7024228214146151, "grad_norm": 8.973949432373047, "learning_rate": 4.3538581918835236e-06, "loss": 10.0138, "step": 3595 }, { "epoch": 0.7026182102383743, "grad_norm": 12.79630184173584, "learning_rate": 4.348635855774082e-06, "loss": 10.153, "step": 3596 }, { "epoch": 0.7028135990621337, "grad_norm": 10.279449462890625, "learning_rate": 4.343415783211438e-06, "loss": 9.1011, "step": 3597 }, { "epoch": 0.7030089878858929, "grad_norm": 7.635469913482666, "learning_rate": 4.338197976286385e-06, "loss": 8.5996, "step": 3598 }, { "epoch": 0.7032043767096522, "grad_norm": 9.318404197692871, "learning_rate": 4.332982437088825e-06, "loss": 9.6105, "step": 3599 }, { "epoch": 0.7033997655334114, "grad_norm": 8.180233001708984, "learning_rate": 4.3277691677077375e-06, "loss": 9.5781, "step": 3600 }, { "epoch": 0.7035951543571708, "grad_norm": 7.368651390075684, "learning_rate": 4.322558170231202e-06, "loss": 9.1421, "step": 3601 }, { "epoch": 0.70379054318093, "grad_norm": 10.323442459106445, "learning_rate": 4.317349446746382e-06, "loss": 9.5176, "step": 3602 }, { "epoch": 0.7039859320046893, "grad_norm": 8.985882759094238, "learning_rate": 4.312142999339537e-06, "loss": 9.0242, "step": 3603 }, { "epoch": 0.7041813208284486, "grad_norm": 9.284278869628906, "learning_rate": 4.306938830096008e-06, "loss": 9.0943, "step": 3604 }, { "epoch": 0.7043767096522079, "grad_norm": 6.843111038208008, "learning_rate": 4.301736941100223e-06, "loss": 8.3563, "step": 3605 }, { "epoch": 0.7045720984759671, "grad_norm": 6.5979905128479, "learning_rate": 4.296537334435707e-06, "loss": 7.6602, "step": 3606 }, { "epoch": 0.7047674872997265, "grad_norm": 10.094749450683594, "learning_rate": 4.291340012185058e-06, "loss": 9.9754, "step": 3607 }, { "epoch": 0.7049628761234857, "grad_norm": 7.74041748046875, "learning_rate": 4.286144976429971e-06, "loss": 9.5605, "step": 3608 }, { "epoch": 0.705158264947245, "grad_norm": 7.466304302215576, "learning_rate": 4.280952229251212e-06, "loss": 8.9152, "step": 3609 }, { "epoch": 0.7053536537710043, "grad_norm": 18.040563583374023, "learning_rate": 4.275761772728644e-06, "loss": 7.9846, "step": 3610 }, { "epoch": 0.7055490425947636, "grad_norm": 9.60568904876709, "learning_rate": 4.2705736089411995e-06, "loss": 9.5358, "step": 3611 }, { "epoch": 0.7057444314185228, "grad_norm": 10.833187103271484, "learning_rate": 4.265387739966907e-06, "loss": 10.1627, "step": 3612 }, { "epoch": 0.7059398202422822, "grad_norm": 15.13516902923584, "learning_rate": 4.26020416788286e-06, "loss": 8.5006, "step": 3613 }, { "epoch": 0.7061352090660414, "grad_norm": 8.97628402709961, "learning_rate": 4.255022894765247e-06, "loss": 9.3216, "step": 3614 }, { "epoch": 0.7063305978898007, "grad_norm": 8.268674850463867, "learning_rate": 4.249843922689322e-06, "loss": 9.4552, "step": 3615 }, { "epoch": 0.70652598671356, "grad_norm": 6.960999488830566, "learning_rate": 4.244667253729431e-06, "loss": 8.6508, "step": 3616 }, { "epoch": 0.7067213755373193, "grad_norm": 7.6391987800598145, "learning_rate": 4.2394928899589884e-06, "loss": 8.8044, "step": 3617 }, { "epoch": 0.7069167643610785, "grad_norm": 9.734604835510254, "learning_rate": 4.234320833450482e-06, "loss": 9.2713, "step": 3618 }, { "epoch": 0.7071121531848378, "grad_norm": 8.456120491027832, "learning_rate": 4.229151086275488e-06, "loss": 9.1271, "step": 3619 }, { "epoch": 0.7073075420085971, "grad_norm": 9.645123481750488, "learning_rate": 4.2239836505046465e-06, "loss": 9.9589, "step": 3620 }, { "epoch": 0.7075029308323564, "grad_norm": 10.434496879577637, "learning_rate": 4.21881852820768e-06, "loss": 9.1162, "step": 3621 }, { "epoch": 0.7076983196561156, "grad_norm": 11.226633071899414, "learning_rate": 4.213655721453373e-06, "loss": 9.5855, "step": 3622 }, { "epoch": 0.707893708479875, "grad_norm": 8.523823738098145, "learning_rate": 4.208495232309601e-06, "loss": 8.8937, "step": 3623 }, { "epoch": 0.7080890973036342, "grad_norm": 10.059098243713379, "learning_rate": 4.203337062843289e-06, "loss": 9.5106, "step": 3624 }, { "epoch": 0.7082844861273935, "grad_norm": 6.950419902801514, "learning_rate": 4.1981812151204515e-06, "loss": 9.1168, "step": 3625 }, { "epoch": 0.7084798749511528, "grad_norm": 7.646895408630371, "learning_rate": 4.19302769120616e-06, "loss": 8.8619, "step": 3626 }, { "epoch": 0.7086752637749121, "grad_norm": 10.336943626403809, "learning_rate": 4.187876493164569e-06, "loss": 9.8278, "step": 3627 }, { "epoch": 0.7088706525986713, "grad_norm": 7.599386692047119, "learning_rate": 4.182727623058883e-06, "loss": 9.5769, "step": 3628 }, { "epoch": 0.7090660414224307, "grad_norm": 7.983441352844238, "learning_rate": 4.177581082951393e-06, "loss": 8.6082, "step": 3629 }, { "epoch": 0.7092614302461899, "grad_norm": 7.941792011260986, "learning_rate": 4.1724368749034434e-06, "loss": 9.0121, "step": 3630 }, { "epoch": 0.7094568190699492, "grad_norm": 8.246491432189941, "learning_rate": 4.167295000975449e-06, "loss": 9.4766, "step": 3631 }, { "epoch": 0.7096522078937085, "grad_norm": 13.80324649810791, "learning_rate": 4.1621554632268935e-06, "loss": 9.6819, "step": 3632 }, { "epoch": 0.7098475967174678, "grad_norm": 8.703511238098145, "learning_rate": 4.1570182637163155e-06, "loss": 8.7774, "step": 3633 }, { "epoch": 0.710042985541227, "grad_norm": 8.409005165100098, "learning_rate": 4.15188340450133e-06, "loss": 8.604, "step": 3634 }, { "epoch": 0.7102383743649863, "grad_norm": 8.394705772399902, "learning_rate": 4.146750887638601e-06, "loss": 8.5099, "step": 3635 }, { "epoch": 0.7104337631887456, "grad_norm": 6.875105381011963, "learning_rate": 4.141620715183867e-06, "loss": 8.7819, "step": 3636 }, { "epoch": 0.7106291520125049, "grad_norm": 9.762279510498047, "learning_rate": 4.136492889191914e-06, "loss": 9.6374, "step": 3637 }, { "epoch": 0.7108245408362641, "grad_norm": 8.945638656616211, "learning_rate": 4.131367411716605e-06, "loss": 9.8514, "step": 3638 }, { "epoch": 0.7110199296600235, "grad_norm": 8.55778980255127, "learning_rate": 4.126244284810842e-06, "loss": 8.1234, "step": 3639 }, { "epoch": 0.7112153184837827, "grad_norm": 5.584824085235596, "learning_rate": 4.1211235105266065e-06, "loss": 8.2136, "step": 3640 }, { "epoch": 0.711410707307542, "grad_norm": 9.62353801727295, "learning_rate": 4.11600509091492e-06, "loss": 9.8375, "step": 3641 }, { "epoch": 0.7116060961313013, "grad_norm": 8.715863227844238, "learning_rate": 4.110889028025874e-06, "loss": 9.1487, "step": 3642 }, { "epoch": 0.7118014849550606, "grad_norm": 7.7625651359558105, "learning_rate": 4.105775323908608e-06, "loss": 8.7921, "step": 3643 }, { "epoch": 0.7119968737788198, "grad_norm": 11.389391899108887, "learning_rate": 4.100663980611317e-06, "loss": 8.8859, "step": 3644 }, { "epoch": 0.7121922626025792, "grad_norm": 7.64915132522583, "learning_rate": 4.095555000181257e-06, "loss": 8.9723, "step": 3645 }, { "epoch": 0.7123876514263384, "grad_norm": 10.494763374328613, "learning_rate": 4.090448384664728e-06, "loss": 10.0071, "step": 3646 }, { "epoch": 0.7125830402500977, "grad_norm": 8.759225845336914, "learning_rate": 4.085344136107096e-06, "loss": 9.3408, "step": 3647 }, { "epoch": 0.712778429073857, "grad_norm": 11.41947078704834, "learning_rate": 4.080242256552761e-06, "loss": 9.6381, "step": 3648 }, { "epoch": 0.7129738178976163, "grad_norm": 6.726463794708252, "learning_rate": 4.075142748045194e-06, "loss": 8.6356, "step": 3649 }, { "epoch": 0.7131692067213755, "grad_norm": 9.784330368041992, "learning_rate": 4.070045612626898e-06, "loss": 9.9572, "step": 3650 }, { "epoch": 0.7133645955451349, "grad_norm": 9.007823944091797, "learning_rate": 4.064950852339442e-06, "loss": 9.215, "step": 3651 }, { "epoch": 0.7135599843688941, "grad_norm": 7.706821441650391, "learning_rate": 4.059858469223428e-06, "loss": 7.903, "step": 3652 }, { "epoch": 0.7137553731926534, "grad_norm": 7.795345783233643, "learning_rate": 4.054768465318521e-06, "loss": 8.621, "step": 3653 }, { "epoch": 0.7139507620164126, "grad_norm": 8.038520812988281, "learning_rate": 4.049680842663416e-06, "loss": 8.8847, "step": 3654 }, { "epoch": 0.714146150840172, "grad_norm": 6.677659511566162, "learning_rate": 4.044595603295876e-06, "loss": 9.6192, "step": 3655 }, { "epoch": 0.7143415396639312, "grad_norm": 8.80317211151123, "learning_rate": 4.039512749252689e-06, "loss": 9.7371, "step": 3656 }, { "epoch": 0.7145369284876905, "grad_norm": 6.420666694641113, "learning_rate": 4.034432282569694e-06, "loss": 8.5476, "step": 3657 }, { "epoch": 0.7147323173114498, "grad_norm": 9.233664512634277, "learning_rate": 4.029354205281784e-06, "loss": 10.0034, "step": 3658 }, { "epoch": 0.7149277061352091, "grad_norm": 9.827657699584961, "learning_rate": 4.024278519422877e-06, "loss": 8.514, "step": 3659 }, { "epoch": 0.7151230949589683, "grad_norm": 8.356546401977539, "learning_rate": 4.019205227025952e-06, "loss": 8.3579, "step": 3660 }, { "epoch": 0.7153184837827277, "grad_norm": 9.356789588928223, "learning_rate": 4.014134330123012e-06, "loss": 8.6536, "step": 3661 }, { "epoch": 0.7155138726064869, "grad_norm": 10.166316032409668, "learning_rate": 4.009065830745116e-06, "loss": 8.6797, "step": 3662 }, { "epoch": 0.7157092614302462, "grad_norm": 79.7872314453125, "learning_rate": 4.0039997309223475e-06, "loss": 10.0384, "step": 3663 }, { "epoch": 0.7159046502540055, "grad_norm": 8.668107986450195, "learning_rate": 3.9989360326838455e-06, "loss": 8.5372, "step": 3664 }, { "epoch": 0.7161000390777648, "grad_norm": 8.511775970458984, "learning_rate": 3.993874738057771e-06, "loss": 8.6834, "step": 3665 }, { "epoch": 0.716295427901524, "grad_norm": 8.757607460021973, "learning_rate": 3.988815849071335e-06, "loss": 8.415, "step": 3666 }, { "epoch": 0.7164908167252834, "grad_norm": 9.092312812805176, "learning_rate": 3.983759367750772e-06, "loss": 8.8122, "step": 3667 }, { "epoch": 0.7166862055490426, "grad_norm": 9.101534843444824, "learning_rate": 3.978705296121372e-06, "loss": 9.1954, "step": 3668 }, { "epoch": 0.7168815943728019, "grad_norm": 8.66796588897705, "learning_rate": 3.973653636207437e-06, "loss": 8.7924, "step": 3669 }, { "epoch": 0.7170769831965611, "grad_norm": 23.35134506225586, "learning_rate": 3.968604390032316e-06, "loss": 8.6947, "step": 3670 }, { "epoch": 0.7172723720203205, "grad_norm": 9.638257026672363, "learning_rate": 3.963557559618392e-06, "loss": 8.8963, "step": 3671 }, { "epoch": 0.7174677608440797, "grad_norm": 7.746241569519043, "learning_rate": 3.958513146987073e-06, "loss": 8.4087, "step": 3672 }, { "epoch": 0.717663149667839, "grad_norm": 8.321249008178711, "learning_rate": 3.953471154158808e-06, "loss": 9.321, "step": 3673 }, { "epoch": 0.7178585384915983, "grad_norm": 7.365278720855713, "learning_rate": 3.948431583153064e-06, "loss": 9.0058, "step": 3674 }, { "epoch": 0.7180539273153576, "grad_norm": 9.134242057800293, "learning_rate": 3.943394435988356e-06, "loss": 9.9397, "step": 3675 }, { "epoch": 0.7182493161391168, "grad_norm": 8.319193840026855, "learning_rate": 3.9383597146822076e-06, "loss": 8.5295, "step": 3676 }, { "epoch": 0.7184447049628762, "grad_norm": 11.505075454711914, "learning_rate": 3.933327421251189e-06, "loss": 9.552, "step": 3677 }, { "epoch": 0.7186400937866354, "grad_norm": 8.731011390686035, "learning_rate": 3.928297557710883e-06, "loss": 8.065, "step": 3678 }, { "epoch": 0.7188354826103946, "grad_norm": 7.6019110679626465, "learning_rate": 3.923270126075913e-06, "loss": 8.741, "step": 3679 }, { "epoch": 0.719030871434154, "grad_norm": 6.32687520980835, "learning_rate": 3.9182451283599156e-06, "loss": 7.7967, "step": 3680 }, { "epoch": 0.7192262602579133, "grad_norm": 18.116029739379883, "learning_rate": 3.913222566575564e-06, "loss": 9.2571, "step": 3681 }, { "epoch": 0.7194216490816725, "grad_norm": 8.875486373901367, "learning_rate": 3.908202442734546e-06, "loss": 8.6051, "step": 3682 }, { "epoch": 0.7196170379054319, "grad_norm": 7.803152561187744, "learning_rate": 3.90318475884758e-06, "loss": 8.6105, "step": 3683 }, { "epoch": 0.7198124267291911, "grad_norm": 7.7268171310424805, "learning_rate": 3.8981695169243986e-06, "loss": 8.451, "step": 3684 }, { "epoch": 0.7200078155529503, "grad_norm": 7.699978828430176, "learning_rate": 3.893156718973767e-06, "loss": 8.5678, "step": 3685 }, { "epoch": 0.7202032043767097, "grad_norm": 7.997290134429932, "learning_rate": 3.888146367003464e-06, "loss": 8.8491, "step": 3686 }, { "epoch": 0.720398593200469, "grad_norm": 7.056305408477783, "learning_rate": 3.883138463020294e-06, "loss": 9.4386, "step": 3687 }, { "epoch": 0.7205939820242282, "grad_norm": 7.773458957672119, "learning_rate": 3.878133009030076e-06, "loss": 8.9443, "step": 3688 }, { "epoch": 0.7207893708479874, "grad_norm": 6.357101917266846, "learning_rate": 3.873130007037647e-06, "loss": 7.9893, "step": 3689 }, { "epoch": 0.7209847596717468, "grad_norm": 14.02954387664795, "learning_rate": 3.868129459046871e-06, "loss": 9.3322, "step": 3690 }, { "epoch": 0.721180148495506, "grad_norm": 10.02996826171875, "learning_rate": 3.863131367060615e-06, "loss": 10.0922, "step": 3691 }, { "epoch": 0.7213755373192653, "grad_norm": 8.229723930358887, "learning_rate": 3.858135733080779e-06, "loss": 9.2091, "step": 3692 }, { "epoch": 0.7215709261430246, "grad_norm": 12.047175407409668, "learning_rate": 3.8531425591082605e-06, "loss": 9.8282, "step": 3693 }, { "epoch": 0.7217663149667839, "grad_norm": 7.275228023529053, "learning_rate": 3.848151847142988e-06, "loss": 9.1976, "step": 3694 }, { "epoch": 0.7219617037905431, "grad_norm": 6.717846393585205, "learning_rate": 3.84316359918389e-06, "loss": 8.3496, "step": 3695 }, { "epoch": 0.7221570926143025, "grad_norm": 8.532776832580566, "learning_rate": 3.838177817228922e-06, "loss": 9.1765, "step": 3696 }, { "epoch": 0.7223524814380617, "grad_norm": 20.730764389038086, "learning_rate": 3.833194503275038e-06, "loss": 9.5346, "step": 3697 }, { "epoch": 0.722547870261821, "grad_norm": 9.638972282409668, "learning_rate": 3.828213659318215e-06, "loss": 9.4929, "step": 3698 }, { "epoch": 0.7227432590855803, "grad_norm": 8.68529987335205, "learning_rate": 3.8232352873534295e-06, "loss": 9.7288, "step": 3699 }, { "epoch": 0.7229386479093396, "grad_norm": 11.741350173950195, "learning_rate": 3.818259389374682e-06, "loss": 8.9153, "step": 3700 }, { "epoch": 0.7231340367330988, "grad_norm": 9.967458724975586, "learning_rate": 3.8132859673749688e-06, "loss": 8.623, "step": 3701 }, { "epoch": 0.7233294255568582, "grad_norm": 7.636810302734375, "learning_rate": 3.8083150233462984e-06, "loss": 8.3056, "step": 3702 }, { "epoch": 0.7235248143806174, "grad_norm": 8.557602882385254, "learning_rate": 3.8033465592796946e-06, "loss": 9.1016, "step": 3703 }, { "epoch": 0.7237202032043767, "grad_norm": 8.045110702514648, "learning_rate": 3.7983805771651728e-06, "loss": 9.2546, "step": 3704 }, { "epoch": 0.723915592028136, "grad_norm": 9.116978645324707, "learning_rate": 3.7934170789917725e-06, "loss": 9.1902, "step": 3705 }, { "epoch": 0.7241109808518953, "grad_norm": 6.71596097946167, "learning_rate": 3.78845606674752e-06, "loss": 8.4867, "step": 3706 }, { "epoch": 0.7243063696756545, "grad_norm": 8.542150497436523, "learning_rate": 3.783497542419463e-06, "loss": 9.0346, "step": 3707 }, { "epoch": 0.7245017584994138, "grad_norm": 10.2990083694458, "learning_rate": 3.778541507993636e-06, "loss": 9.4014, "step": 3708 }, { "epoch": 0.7246971473231731, "grad_norm": 9.195205688476562, "learning_rate": 3.7735879654550935e-06, "loss": 8.6709, "step": 3709 }, { "epoch": 0.7248925361469324, "grad_norm": 10.924129486083984, "learning_rate": 3.7686369167878735e-06, "loss": 8.9261, "step": 3710 }, { "epoch": 0.7250879249706916, "grad_norm": 9.022870063781738, "learning_rate": 3.7636883639750343e-06, "loss": 9.8402, "step": 3711 }, { "epoch": 0.725283313794451, "grad_norm": 98.3774185180664, "learning_rate": 3.7587423089986174e-06, "loss": 8.5148, "step": 3712 }, { "epoch": 0.7254787026182102, "grad_norm": 6.8416924476623535, "learning_rate": 3.753798753839677e-06, "loss": 8.3697, "step": 3713 }, { "epoch": 0.7256740914419695, "grad_norm": 12.198902130126953, "learning_rate": 3.7488577004782578e-06, "loss": 9.4243, "step": 3714 }, { "epoch": 0.7258694802657288, "grad_norm": 7.990806579589844, "learning_rate": 3.7439191508934013e-06, "loss": 8.7914, "step": 3715 }, { "epoch": 0.7260648690894881, "grad_norm": 9.815138816833496, "learning_rate": 3.7389831070631577e-06, "loss": 8.9791, "step": 3716 }, { "epoch": 0.7262602579132473, "grad_norm": 9.926175117492676, "learning_rate": 3.7340495709645585e-06, "loss": 9.9698, "step": 3717 }, { "epoch": 0.7264556467370067, "grad_norm": 8.747008323669434, "learning_rate": 3.7291185445736445e-06, "loss": 9.4764, "step": 3718 }, { "epoch": 0.7266510355607659, "grad_norm": 6.5735697746276855, "learning_rate": 3.724190029865439e-06, "loss": 8.7215, "step": 3719 }, { "epoch": 0.7268464243845252, "grad_norm": 11.640028953552246, "learning_rate": 3.7192640288139704e-06, "loss": 10.2233, "step": 3720 }, { "epoch": 0.7270418132082845, "grad_norm": 7.5807204246521, "learning_rate": 3.7143405433922486e-06, "loss": 9.4529, "step": 3721 }, { "epoch": 0.7272372020320438, "grad_norm": 11.326850891113281, "learning_rate": 3.7094195755722906e-06, "loss": 9.1891, "step": 3722 }, { "epoch": 0.727432590855803, "grad_norm": 8.540494918823242, "learning_rate": 3.7045011273250898e-06, "loss": 9.5362, "step": 3723 }, { "epoch": 0.7276279796795623, "grad_norm": 8.911155700683594, "learning_rate": 3.6995852006206424e-06, "loss": 9.799, "step": 3724 }, { "epoch": 0.7278233685033216, "grad_norm": 9.42534065246582, "learning_rate": 3.6946717974279245e-06, "loss": 9.7036, "step": 3725 }, { "epoch": 0.7280187573270809, "grad_norm": 8.128520011901855, "learning_rate": 3.689760919714912e-06, "loss": 9.2025, "step": 3726 }, { "epoch": 0.7282141461508401, "grad_norm": 9.171087265014648, "learning_rate": 3.6848525694485627e-06, "loss": 9.3736, "step": 3727 }, { "epoch": 0.7284095349745995, "grad_norm": 9.77519416809082, "learning_rate": 3.679946748594818e-06, "loss": 9.7459, "step": 3728 }, { "epoch": 0.7286049237983587, "grad_norm": 8.036867141723633, "learning_rate": 3.675043459118619e-06, "loss": 9.031, "step": 3729 }, { "epoch": 0.728800312622118, "grad_norm": 8.540367126464844, "learning_rate": 3.670142702983878e-06, "loss": 8.9552, "step": 3730 }, { "epoch": 0.7289957014458773, "grad_norm": 10.316926956176758, "learning_rate": 3.6652444821535072e-06, "loss": 8.7552, "step": 3731 }, { "epoch": 0.7291910902696366, "grad_norm": 8.742179870605469, "learning_rate": 3.6603487985893894e-06, "loss": 8.6493, "step": 3732 }, { "epoch": 0.7293864790933958, "grad_norm": 10.511765480041504, "learning_rate": 3.6554556542524046e-06, "loss": 8.7671, "step": 3733 }, { "epoch": 0.7295818679171552, "grad_norm": 19.103931427001953, "learning_rate": 3.6505650511024014e-06, "loss": 9.5827, "step": 3734 }, { "epoch": 0.7297772567409144, "grad_norm": 9.42544174194336, "learning_rate": 3.645676991098227e-06, "loss": 9.5682, "step": 3735 }, { "epoch": 0.7299726455646737, "grad_norm": 8.16747760772705, "learning_rate": 3.6407914761976928e-06, "loss": 9.0845, "step": 3736 }, { "epoch": 0.730168034388433, "grad_norm": 9.376529693603516, "learning_rate": 3.635908508357606e-06, "loss": 9.4353, "step": 3737 }, { "epoch": 0.7303634232121923, "grad_norm": 8.061734199523926, "learning_rate": 3.6310280895337402e-06, "loss": 8.2525, "step": 3738 }, { "epoch": 0.7305588120359515, "grad_norm": 10.184969902038574, "learning_rate": 3.626150221680863e-06, "loss": 9.6473, "step": 3739 }, { "epoch": 0.7307542008597109, "grad_norm": 6.46053409576416, "learning_rate": 3.6212749067527054e-06, "loss": 8.5041, "step": 3740 }, { "epoch": 0.7309495896834701, "grad_norm": 8.612481117248535, "learning_rate": 3.6164021467019817e-06, "loss": 9.8489, "step": 3741 }, { "epoch": 0.7311449785072294, "grad_norm": 11.279267311096191, "learning_rate": 3.6115319434803897e-06, "loss": 10.6624, "step": 3742 }, { "epoch": 0.7313403673309886, "grad_norm": 8.220184326171875, "learning_rate": 3.606664299038589e-06, "loss": 9.1821, "step": 3743 }, { "epoch": 0.731535756154748, "grad_norm": 9.151845932006836, "learning_rate": 3.6017992153262303e-06, "loss": 9.3378, "step": 3744 }, { "epoch": 0.7317311449785072, "grad_norm": 10.015589714050293, "learning_rate": 3.5969366942919237e-06, "loss": 10.7843, "step": 3745 }, { "epoch": 0.7319265338022665, "grad_norm": 8.539863586425781, "learning_rate": 3.5920767378832666e-06, "loss": 9.346, "step": 3746 }, { "epoch": 0.7321219226260258, "grad_norm": 7.310202121734619, "learning_rate": 3.5872193480468155e-06, "loss": 9.3841, "step": 3747 }, { "epoch": 0.7323173114497851, "grad_norm": 9.648876190185547, "learning_rate": 3.5823645267281126e-06, "loss": 9.2589, "step": 3748 }, { "epoch": 0.7325127002735443, "grad_norm": 9.810677528381348, "learning_rate": 3.5775122758716585e-06, "loss": 9.8274, "step": 3749 }, { "epoch": 0.7327080890973037, "grad_norm": 8.219881057739258, "learning_rate": 3.572662597420935e-06, "loss": 9.3783, "step": 3750 }, { "epoch": 0.7329034779210629, "grad_norm": 9.856924057006836, "learning_rate": 3.567815493318385e-06, "loss": 9.4114, "step": 3751 }, { "epoch": 0.7330988667448222, "grad_norm": 7.9009833335876465, "learning_rate": 3.562970965505429e-06, "loss": 9.3978, "step": 3752 }, { "epoch": 0.7332942555685815, "grad_norm": 8.5271577835083, "learning_rate": 3.558129015922448e-06, "loss": 8.2685, "step": 3753 }, { "epoch": 0.7334896443923408, "grad_norm": 15.44652271270752, "learning_rate": 3.5532896465087897e-06, "loss": 9.7942, "step": 3754 }, { "epoch": 0.7336850332161, "grad_norm": 9.352770805358887, "learning_rate": 3.5484528592027778e-06, "loss": 9.0612, "step": 3755 }, { "epoch": 0.7338804220398594, "grad_norm": 11.316088676452637, "learning_rate": 3.543618655941691e-06, "loss": 10.4106, "step": 3756 }, { "epoch": 0.7340758108636186, "grad_norm": 13.240620613098145, "learning_rate": 3.5387870386617827e-06, "loss": 8.9332, "step": 3757 }, { "epoch": 0.7342711996873779, "grad_norm": 13.1504487991333, "learning_rate": 3.5339580092982594e-06, "loss": 11.0741, "step": 3758 }, { "epoch": 0.7344665885111371, "grad_norm": 11.847900390625, "learning_rate": 3.5291315697853044e-06, "loss": 9.0539, "step": 3759 }, { "epoch": 0.7346619773348965, "grad_norm": 8.007011413574219, "learning_rate": 3.5243077220560496e-06, "loss": 9.2799, "step": 3760 }, { "epoch": 0.7348573661586557, "grad_norm": 11.476548194885254, "learning_rate": 3.5194864680426023e-06, "loss": 8.8405, "step": 3761 }, { "epoch": 0.735052754982415, "grad_norm": 10.305270195007324, "learning_rate": 3.5146678096760166e-06, "loss": 9.647, "step": 3762 }, { "epoch": 0.7352481438061743, "grad_norm": 9.828678131103516, "learning_rate": 3.509851748886325e-06, "loss": 10.0625, "step": 3763 }, { "epoch": 0.7354435326299336, "grad_norm": 7.253856658935547, "learning_rate": 3.505038287602499e-06, "loss": 8.6707, "step": 3764 }, { "epoch": 0.7356389214536928, "grad_norm": 9.606746673583984, "learning_rate": 3.500227427752487e-06, "loss": 10.9174, "step": 3765 }, { "epoch": 0.7358343102774522, "grad_norm": 8.862897872924805, "learning_rate": 3.4954191712631856e-06, "loss": 9.2135, "step": 3766 }, { "epoch": 0.7360296991012114, "grad_norm": 7.542580604553223, "learning_rate": 3.4906135200604464e-06, "loss": 8.3897, "step": 3767 }, { "epoch": 0.7362250879249707, "grad_norm": 6.870659351348877, "learning_rate": 3.485810476069089e-06, "loss": 8.4792, "step": 3768 }, { "epoch": 0.73642047674873, "grad_norm": 8.340803146362305, "learning_rate": 3.4810100412128743e-06, "loss": 9.2354, "step": 3769 }, { "epoch": 0.7366158655724893, "grad_norm": 6.958797931671143, "learning_rate": 3.476212217414533e-06, "loss": 8.385, "step": 3770 }, { "epoch": 0.7368112543962485, "grad_norm": 9.973862648010254, "learning_rate": 3.471417006595735e-06, "loss": 9.4786, "step": 3771 }, { "epoch": 0.7370066432200079, "grad_norm": 8.524601936340332, "learning_rate": 3.4666244106771196e-06, "loss": 8.7931, "step": 3772 }, { "epoch": 0.7372020320437671, "grad_norm": 8.614114761352539, "learning_rate": 3.46183443157826e-06, "loss": 8.5033, "step": 3773 }, { "epoch": 0.7373974208675264, "grad_norm": 10.846492767333984, "learning_rate": 3.457047071217703e-06, "loss": 9.0408, "step": 3774 }, { "epoch": 0.7375928096912857, "grad_norm": 9.433976173400879, "learning_rate": 3.4522623315129244e-06, "loss": 9.3712, "step": 3775 }, { "epoch": 0.737788198515045, "grad_norm": 6.5583109855651855, "learning_rate": 3.4474802143803687e-06, "loss": 8.9908, "step": 3776 }, { "epoch": 0.7379835873388042, "grad_norm": 9.018953323364258, "learning_rate": 3.442700721735417e-06, "loss": 8.8893, "step": 3777 }, { "epoch": 0.7381789761625634, "grad_norm": 6.55804443359375, "learning_rate": 3.437923855492409e-06, "loss": 9.0761, "step": 3778 }, { "epoch": 0.7383743649863228, "grad_norm": 10.90300178527832, "learning_rate": 3.4331496175646263e-06, "loss": 9.2355, "step": 3779 }, { "epoch": 0.738569753810082, "grad_norm": 12.411234855651855, "learning_rate": 3.428378009864296e-06, "loss": 10.3317, "step": 3780 }, { "epoch": 0.7387651426338413, "grad_norm": 7.479711532592773, "learning_rate": 3.423609034302601e-06, "loss": 9.0381, "step": 3781 }, { "epoch": 0.7389605314576007, "grad_norm": 8.085942268371582, "learning_rate": 3.4188426927896567e-06, "loss": 9.7472, "step": 3782 }, { "epoch": 0.7391559202813599, "grad_norm": 7.482016086578369, "learning_rate": 3.41407898723454e-06, "loss": 9.0756, "step": 3783 }, { "epoch": 0.7393513091051191, "grad_norm": 6.062737941741943, "learning_rate": 3.409317919545254e-06, "loss": 8.9825, "step": 3784 }, { "epoch": 0.7395466979288785, "grad_norm": 9.465961456298828, "learning_rate": 3.4045594916287616e-06, "loss": 9.1356, "step": 3785 }, { "epoch": 0.7397420867526378, "grad_norm": 8.177811622619629, "learning_rate": 3.399803705390955e-06, "loss": 9.8667, "step": 3786 }, { "epoch": 0.739937475576397, "grad_norm": 6.569251537322998, "learning_rate": 3.3950505627366793e-06, "loss": 8.4834, "step": 3787 }, { "epoch": 0.7401328644001564, "grad_norm": 9.74722957611084, "learning_rate": 3.3903000655697094e-06, "loss": 10.6275, "step": 3788 }, { "epoch": 0.7403282532239156, "grad_norm": 7.447836875915527, "learning_rate": 3.3855522157927756e-06, "loss": 9.1656, "step": 3789 }, { "epoch": 0.7405236420476748, "grad_norm": 8.01734447479248, "learning_rate": 3.380807015307529e-06, "loss": 8.9998, "step": 3790 }, { "epoch": 0.7407190308714342, "grad_norm": 8.140630722045898, "learning_rate": 3.3760644660145794e-06, "loss": 9.3295, "step": 3791 }, { "epoch": 0.7409144196951934, "grad_norm": 6.896175861358643, "learning_rate": 3.37132456981346e-06, "loss": 7.9847, "step": 3792 }, { "epoch": 0.7411098085189527, "grad_norm": 7.749749660491943, "learning_rate": 3.366587328602644e-06, "loss": 8.6011, "step": 3793 }, { "epoch": 0.741305197342712, "grad_norm": 11.753620147705078, "learning_rate": 3.36185274427955e-06, "loss": 8.9515, "step": 3794 }, { "epoch": 0.7415005861664713, "grad_norm": 9.068345069885254, "learning_rate": 3.357120818740519e-06, "loss": 9.6614, "step": 3795 }, { "epoch": 0.7416959749902305, "grad_norm": 7.815624713897705, "learning_rate": 3.3523915538808406e-06, "loss": 8.1013, "step": 3796 }, { "epoch": 0.7418913638139898, "grad_norm": 9.315130233764648, "learning_rate": 3.3476649515947257e-06, "loss": 8.381, "step": 3797 }, { "epoch": 0.7420867526377491, "grad_norm": 7.379053115844727, "learning_rate": 3.3429410137753327e-06, "loss": 9.1761, "step": 3798 }, { "epoch": 0.7422821414615084, "grad_norm": 11.974115371704102, "learning_rate": 3.338219742314738e-06, "loss": 9.9621, "step": 3799 }, { "epoch": 0.7424775302852676, "grad_norm": 8.75280475616455, "learning_rate": 3.333501139103964e-06, "loss": 9.0321, "step": 3800 }, { "epoch": 0.742672919109027, "grad_norm": 14.717581748962402, "learning_rate": 3.328785206032952e-06, "loss": 8.8125, "step": 3801 }, { "epoch": 0.7428683079327862, "grad_norm": 12.672874450683594, "learning_rate": 3.3240719449905846e-06, "loss": 9.2577, "step": 3802 }, { "epoch": 0.7430636967565455, "grad_norm": 7.265257358551025, "learning_rate": 3.3193613578646633e-06, "loss": 7.7047, "step": 3803 }, { "epoch": 0.7432590855803048, "grad_norm": 6.224616527557373, "learning_rate": 3.3146534465419322e-06, "loss": 7.5593, "step": 3804 }, { "epoch": 0.7434544744040641, "grad_norm": 9.308873176574707, "learning_rate": 3.3099482129080518e-06, "loss": 9.8408, "step": 3805 }, { "epoch": 0.7436498632278233, "grad_norm": 7.382295608520508, "learning_rate": 3.30524565884761e-06, "loss": 9.6591, "step": 3806 }, { "epoch": 0.7438452520515827, "grad_norm": 10.499371528625488, "learning_rate": 3.300545786244134e-06, "loss": 8.9012, "step": 3807 }, { "epoch": 0.7440406408753419, "grad_norm": 7.243477821350098, "learning_rate": 3.2958485969800603e-06, "loss": 8.7174, "step": 3808 }, { "epoch": 0.7442360296991012, "grad_norm": 8.12458324432373, "learning_rate": 3.291154092936768e-06, "loss": 8.6591, "step": 3809 }, { "epoch": 0.7444314185228605, "grad_norm": 8.82094669342041, "learning_rate": 3.286462275994543e-06, "loss": 8.1543, "step": 3810 }, { "epoch": 0.7446268073466198, "grad_norm": 9.012158393859863, "learning_rate": 3.281773148032611e-06, "loss": 9.7201, "step": 3811 }, { "epoch": 0.744822196170379, "grad_norm": 9.375688552856445, "learning_rate": 3.277086710929106e-06, "loss": 9.6693, "step": 3812 }, { "epoch": 0.7450175849941383, "grad_norm": 9.3660888671875, "learning_rate": 3.2724029665611e-06, "loss": 8.9164, "step": 3813 }, { "epoch": 0.7452129738178976, "grad_norm": 7.924816608428955, "learning_rate": 3.2677219168045694e-06, "loss": 9.2943, "step": 3814 }, { "epoch": 0.7454083626416569, "grad_norm": 9.506559371948242, "learning_rate": 3.2630435635344283e-06, "loss": 10.2879, "step": 3815 }, { "epoch": 0.7456037514654161, "grad_norm": 10.148906707763672, "learning_rate": 3.2583679086244946e-06, "loss": 9.9814, "step": 3816 }, { "epoch": 0.7457991402891755, "grad_norm": 9.202022552490234, "learning_rate": 3.2536949539475195e-06, "loss": 9.4508, "step": 3817 }, { "epoch": 0.7459945291129347, "grad_norm": 10.081830978393555, "learning_rate": 3.249024701375165e-06, "loss": 9.2082, "step": 3818 }, { "epoch": 0.746189917936694, "grad_norm": 12.018476486206055, "learning_rate": 3.244357152778007e-06, "loss": 9.1257, "step": 3819 }, { "epoch": 0.7463853067604533, "grad_norm": 6.693393707275391, "learning_rate": 3.2396923100255515e-06, "loss": 8.8083, "step": 3820 }, { "epoch": 0.7465806955842126, "grad_norm": 6.632091045379639, "learning_rate": 3.235030174986209e-06, "loss": 7.9511, "step": 3821 }, { "epoch": 0.7467760844079718, "grad_norm": 9.492447853088379, "learning_rate": 3.230370749527306e-06, "loss": 8.7927, "step": 3822 }, { "epoch": 0.7469714732317312, "grad_norm": 8.027215957641602, "learning_rate": 3.225714035515094e-06, "loss": 9.1603, "step": 3823 }, { "epoch": 0.7471668620554904, "grad_norm": 6.63563871383667, "learning_rate": 3.221060034814728e-06, "loss": 7.9951, "step": 3824 }, { "epoch": 0.7473622508792497, "grad_norm": 8.573164939880371, "learning_rate": 3.216408749290276e-06, "loss": 8.796, "step": 3825 }, { "epoch": 0.747557639703009, "grad_norm": 7.453122615814209, "learning_rate": 3.2117601808047294e-06, "loss": 9.077, "step": 3826 }, { "epoch": 0.7477530285267683, "grad_norm": 8.522750854492188, "learning_rate": 3.207114331219978e-06, "loss": 9.0829, "step": 3827 }, { "epoch": 0.7479484173505275, "grad_norm": 7.745052814483643, "learning_rate": 3.202471202396835e-06, "loss": 9.6801, "step": 3828 }, { "epoch": 0.7481438061742869, "grad_norm": 8.938319206237793, "learning_rate": 3.1978307961950107e-06, "loss": 9.351, "step": 3829 }, { "epoch": 0.7483391949980461, "grad_norm": 8.661677360534668, "learning_rate": 3.1931931144731386e-06, "loss": 8.8951, "step": 3830 }, { "epoch": 0.7485345838218054, "grad_norm": 9.34511947631836, "learning_rate": 3.188558159088748e-06, "loss": 8.8357, "step": 3831 }, { "epoch": 0.7487299726455646, "grad_norm": 7.346662521362305, "learning_rate": 3.1839259318982885e-06, "loss": 8.201, "step": 3832 }, { "epoch": 0.748925361469324, "grad_norm": 6.691265106201172, "learning_rate": 3.1792964347571053e-06, "loss": 9.089, "step": 3833 }, { "epoch": 0.7491207502930832, "grad_norm": 8.465272903442383, "learning_rate": 3.174669669519461e-06, "loss": 8.5986, "step": 3834 }, { "epoch": 0.7493161391168425, "grad_norm": 7.529587268829346, "learning_rate": 3.1700456380385124e-06, "loss": 9.1974, "step": 3835 }, { "epoch": 0.7495115279406018, "grad_norm": 10.19723892211914, "learning_rate": 3.1654243421663355e-06, "loss": 9.8661, "step": 3836 }, { "epoch": 0.7497069167643611, "grad_norm": 7.2738213539123535, "learning_rate": 3.1608057837538976e-06, "loss": 9.3369, "step": 3837 }, { "epoch": 0.7499023055881203, "grad_norm": 9.646924018859863, "learning_rate": 3.156189964651073e-06, "loss": 8.6084, "step": 3838 }, { "epoch": 0.7500976944118797, "grad_norm": 10.57065486907959, "learning_rate": 3.1515768867066464e-06, "loss": 9.9245, "step": 3839 }, { "epoch": 0.7502930832356389, "grad_norm": 15.623647689819336, "learning_rate": 3.1469665517682923e-06, "loss": 10.3266, "step": 3840 }, { "epoch": 0.7504884720593982, "grad_norm": 8.014330863952637, "learning_rate": 3.1423589616825987e-06, "loss": 8.9361, "step": 3841 }, { "epoch": 0.7506838608831575, "grad_norm": 6.676121234893799, "learning_rate": 3.1377541182950445e-06, "loss": 8.8935, "step": 3842 }, { "epoch": 0.7508792497069168, "grad_norm": 8.338479042053223, "learning_rate": 3.1331520234500155e-06, "loss": 8.4392, "step": 3843 }, { "epoch": 0.751074638530676, "grad_norm": 9.977602005004883, "learning_rate": 3.1285526789907906e-06, "loss": 9.0042, "step": 3844 }, { "epoch": 0.7512700273544354, "grad_norm": 7.855495452880859, "learning_rate": 3.123956086759553e-06, "loss": 8.7187, "step": 3845 }, { "epoch": 0.7514654161781946, "grad_norm": 9.590307235717773, "learning_rate": 3.119362248597377e-06, "loss": 9.383, "step": 3846 }, { "epoch": 0.7516608050019539, "grad_norm": 7.904482364654541, "learning_rate": 3.1147711663442427e-06, "loss": 8.3656, "step": 3847 }, { "epoch": 0.7518561938257131, "grad_norm": 7.658161163330078, "learning_rate": 3.1101828418390147e-06, "loss": 8.2054, "step": 3848 }, { "epoch": 0.7520515826494725, "grad_norm": 8.585644721984863, "learning_rate": 3.1055972769194655e-06, "loss": 9.3294, "step": 3849 }, { "epoch": 0.7522469714732317, "grad_norm": 7.512440204620361, "learning_rate": 3.1010144734222535e-06, "loss": 8.4882, "step": 3850 }, { "epoch": 0.752442360296991, "grad_norm": 9.79820728302002, "learning_rate": 3.09643443318293e-06, "loss": 9.4896, "step": 3851 }, { "epoch": 0.7526377491207503, "grad_norm": 21.057519912719727, "learning_rate": 3.0918571580359504e-06, "loss": 9.8927, "step": 3852 }, { "epoch": 0.7528331379445096, "grad_norm": 12.370172500610352, "learning_rate": 3.087282649814648e-06, "loss": 9.0297, "step": 3853 }, { "epoch": 0.7530285267682688, "grad_norm": 10.342479705810547, "learning_rate": 3.0827109103512643e-06, "loss": 9.0344, "step": 3854 }, { "epoch": 0.7532239155920282, "grad_norm": 10.107954978942871, "learning_rate": 3.0781419414769132e-06, "loss": 8.8846, "step": 3855 }, { "epoch": 0.7534193044157874, "grad_norm": 7.770096778869629, "learning_rate": 3.0735757450216177e-06, "loss": 9.1557, "step": 3856 }, { "epoch": 0.7536146932395467, "grad_norm": 7.809630870819092, "learning_rate": 3.069012322814272e-06, "loss": 7.8859, "step": 3857 }, { "epoch": 0.753810082063306, "grad_norm": 8.783084869384766, "learning_rate": 3.064451676682678e-06, "loss": 9.449, "step": 3858 }, { "epoch": 0.7540054708870653, "grad_norm": 7.793107032775879, "learning_rate": 3.059893808453508e-06, "loss": 9.1583, "step": 3859 }, { "epoch": 0.7542008597108245, "grad_norm": 6.793116092681885, "learning_rate": 3.0553387199523354e-06, "loss": 8.1216, "step": 3860 }, { "epoch": 0.7543962485345839, "grad_norm": 8.21915340423584, "learning_rate": 3.0507864130036103e-06, "loss": 8.977, "step": 3861 }, { "epoch": 0.7545916373583431, "grad_norm": 8.23648738861084, "learning_rate": 3.046236889430677e-06, "loss": 9.1412, "step": 3862 }, { "epoch": 0.7547870261821024, "grad_norm": 8.978711128234863, "learning_rate": 3.0416901510557593e-06, "loss": 8.8821, "step": 3863 }, { "epoch": 0.7549824150058617, "grad_norm": 8.333256721496582, "learning_rate": 3.037146199699964e-06, "loss": 9.7569, "step": 3864 }, { "epoch": 0.755177803829621, "grad_norm": 6.864106178283691, "learning_rate": 3.0326050371832894e-06, "loss": 8.6864, "step": 3865 }, { "epoch": 0.7553731926533802, "grad_norm": 8.051876068115234, "learning_rate": 3.028066665324607e-06, "loss": 9.2071, "step": 3866 }, { "epoch": 0.7555685814771395, "grad_norm": 6.9381794929504395, "learning_rate": 3.023531085941682e-06, "loss": 9.3013, "step": 3867 }, { "epoch": 0.7557639703008988, "grad_norm": 9.048749923706055, "learning_rate": 3.0189983008511483e-06, "loss": 9.4292, "step": 3868 }, { "epoch": 0.7559593591246581, "grad_norm": 8.086410522460938, "learning_rate": 3.0144683118685336e-06, "loss": 9.2434, "step": 3869 }, { "epoch": 0.7561547479484173, "grad_norm": 7.529863357543945, "learning_rate": 3.009941120808232e-06, "loss": 9.6011, "step": 3870 }, { "epoch": 0.7563501367721767, "grad_norm": 8.00220775604248, "learning_rate": 3.0054167294835314e-06, "loss": 8.9942, "step": 3871 }, { "epoch": 0.7565455255959359, "grad_norm": 8.991710662841797, "learning_rate": 3.0008951397065832e-06, "loss": 9.8394, "step": 3872 }, { "epoch": 0.7567409144196952, "grad_norm": 7.8540472984313965, "learning_rate": 2.996376353288433e-06, "loss": 9.798, "step": 3873 }, { "epoch": 0.7569363032434545, "grad_norm": 9.14909839630127, "learning_rate": 2.991860372038986e-06, "loss": 9.1226, "step": 3874 }, { "epoch": 0.7571316920672138, "grad_norm": 8.964995384216309, "learning_rate": 2.987347197767041e-06, "loss": 8.612, "step": 3875 }, { "epoch": 0.757327080890973, "grad_norm": 9.87065601348877, "learning_rate": 2.9828368322802614e-06, "loss": 8.6584, "step": 3876 }, { "epoch": 0.7575224697147324, "grad_norm": 8.337159156799316, "learning_rate": 2.978329277385186e-06, "loss": 9.5367, "step": 3877 }, { "epoch": 0.7577178585384916, "grad_norm": 22.911712646484375, "learning_rate": 2.973824534887234e-06, "loss": 9.3709, "step": 3878 }, { "epoch": 0.7579132473622509, "grad_norm": 7.285062313079834, "learning_rate": 2.969322606590691e-06, "loss": 8.3937, "step": 3879 }, { "epoch": 0.7581086361860102, "grad_norm": 8.002433776855469, "learning_rate": 2.9648234942987242e-06, "loss": 8.646, "step": 3880 }, { "epoch": 0.7583040250097695, "grad_norm": 9.95106315612793, "learning_rate": 2.960327199813362e-06, "loss": 9.9894, "step": 3881 }, { "epoch": 0.7584994138335287, "grad_norm": 8.543545722961426, "learning_rate": 2.955833724935516e-06, "loss": 8.9347, "step": 3882 }, { "epoch": 0.758694802657288, "grad_norm": 24.709081649780273, "learning_rate": 2.951343071464956e-06, "loss": 9.0592, "step": 3883 }, { "epoch": 0.7588901914810473, "grad_norm": 6.559577941894531, "learning_rate": 2.946855241200335e-06, "loss": 7.936, "step": 3884 }, { "epoch": 0.7590855803048066, "grad_norm": 9.748425483703613, "learning_rate": 2.94237023593916e-06, "loss": 9.623, "step": 3885 }, { "epoch": 0.7592809691285658, "grad_norm": 7.7878336906433105, "learning_rate": 2.9378880574778245e-06, "loss": 9.0149, "step": 3886 }, { "epoch": 0.7594763579523252, "grad_norm": 6.624049663543701, "learning_rate": 2.9334087076115714e-06, "loss": 8.8067, "step": 3887 }, { "epoch": 0.7596717467760844, "grad_norm": 7.3495354652404785, "learning_rate": 2.9289321881345257e-06, "loss": 8.3652, "step": 3888 }, { "epoch": 0.7598671355998436, "grad_norm": 8.031523704528809, "learning_rate": 2.9244585008396696e-06, "loss": 8.8399, "step": 3889 }, { "epoch": 0.760062524423603, "grad_norm": 8.215229034423828, "learning_rate": 2.9199876475188528e-06, "loss": 8.9035, "step": 3890 }, { "epoch": 0.7602579132473622, "grad_norm": 9.209456443786621, "learning_rate": 2.915519629962793e-06, "loss": 9.2322, "step": 3891 }, { "epoch": 0.7604533020711215, "grad_norm": 7.7869873046875, "learning_rate": 2.911054449961067e-06, "loss": 8.837, "step": 3892 }, { "epoch": 0.7606486908948809, "grad_norm": 8.22655963897705, "learning_rate": 2.906592109302123e-06, "loss": 9.6905, "step": 3893 }, { "epoch": 0.7608440797186401, "grad_norm": 10.398638725280762, "learning_rate": 2.9021326097732616e-06, "loss": 8.9069, "step": 3894 }, { "epoch": 0.7610394685423993, "grad_norm": 8.627163887023926, "learning_rate": 2.8976759531606556e-06, "loss": 9.115, "step": 3895 }, { "epoch": 0.7612348573661587, "grad_norm": 10.054001808166504, "learning_rate": 2.893222141249329e-06, "loss": 8.7211, "step": 3896 }, { "epoch": 0.761430246189918, "grad_norm": 8.275874137878418, "learning_rate": 2.8887711758231783e-06, "loss": 8.9959, "step": 3897 }, { "epoch": 0.7616256350136772, "grad_norm": 8.254229545593262, "learning_rate": 2.8843230586649463e-06, "loss": 9.1554, "step": 3898 }, { "epoch": 0.7618210238374366, "grad_norm": 8.058757781982422, "learning_rate": 2.879877791556248e-06, "loss": 8.6234, "step": 3899 }, { "epoch": 0.7620164126611958, "grad_norm": 8.41688060760498, "learning_rate": 2.8754353762775465e-06, "loss": 9.6577, "step": 3900 }, { "epoch": 0.762211801484955, "grad_norm": 50.58344268798828, "learning_rate": 2.8709958146081717e-06, "loss": 9.4878, "step": 3901 }, { "epoch": 0.7624071903087143, "grad_norm": 9.422719955444336, "learning_rate": 2.866559108326302e-06, "loss": 9.6191, "step": 3902 }, { "epoch": 0.7626025791324736, "grad_norm": 7.204468250274658, "learning_rate": 2.8621252592089752e-06, "loss": 8.2942, "step": 3903 }, { "epoch": 0.7627979679562329, "grad_norm": 7.632490158081055, "learning_rate": 2.8576942690320907e-06, "loss": 8.7558, "step": 3904 }, { "epoch": 0.7629933567799921, "grad_norm": 7.333845138549805, "learning_rate": 2.853266139570391e-06, "loss": 8.8135, "step": 3905 }, { "epoch": 0.7631887456037515, "grad_norm": 6.49276065826416, "learning_rate": 2.848840872597485e-06, "loss": 7.6965, "step": 3906 }, { "epoch": 0.7633841344275107, "grad_norm": 7.868816375732422, "learning_rate": 2.8444184698858247e-06, "loss": 7.6861, "step": 3907 }, { "epoch": 0.76357952325127, "grad_norm": 9.188851356506348, "learning_rate": 2.8399989332067258e-06, "loss": 8.7012, "step": 3908 }, { "epoch": 0.7637749120750293, "grad_norm": 9.501800537109375, "learning_rate": 2.8355822643303433e-06, "loss": 8.4651, "step": 3909 }, { "epoch": 0.7639703008987886, "grad_norm": 8.46778678894043, "learning_rate": 2.8311684650256964e-06, "loss": 7.8761, "step": 3910 }, { "epoch": 0.7641656897225478, "grad_norm": 9.765848159790039, "learning_rate": 2.8267575370606427e-06, "loss": 9.5767, "step": 3911 }, { "epoch": 0.7643610785463072, "grad_norm": 11.269289016723633, "learning_rate": 2.8223494822019015e-06, "loss": 9.2451, "step": 3912 }, { "epoch": 0.7645564673700664, "grad_norm": 8.891103744506836, "learning_rate": 2.81794430221503e-06, "loss": 9.3999, "step": 3913 }, { "epoch": 0.7647518561938257, "grad_norm": 9.411876678466797, "learning_rate": 2.8135419988644453e-06, "loss": 9.6113, "step": 3914 }, { "epoch": 0.764947245017585, "grad_norm": 8.9310884475708, "learning_rate": 2.8091425739134036e-06, "loss": 9.8384, "step": 3915 }, { "epoch": 0.7651426338413443, "grad_norm": 9.01473617553711, "learning_rate": 2.804746029124009e-06, "loss": 8.0194, "step": 3916 }, { "epoch": 0.7653380226651035, "grad_norm": 9.225128173828125, "learning_rate": 2.8003523662572195e-06, "loss": 8.8305, "step": 3917 }, { "epoch": 0.7655334114888629, "grad_norm": 13.47047233581543, "learning_rate": 2.7959615870728276e-06, "loss": 9.7281, "step": 3918 }, { "epoch": 0.7657288003126221, "grad_norm": 9.079336166381836, "learning_rate": 2.7915736933294813e-06, "loss": 9.0856, "step": 3919 }, { "epoch": 0.7659241891363814, "grad_norm": 9.223864555358887, "learning_rate": 2.7871886867846642e-06, "loss": 9.1781, "step": 3920 }, { "epoch": 0.7661195779601406, "grad_norm": 8.404414176940918, "learning_rate": 2.7828065691947137e-06, "loss": 9.6851, "step": 3921 }, { "epoch": 0.7663149667839, "grad_norm": 6.931270122528076, "learning_rate": 2.7784273423147966e-06, "loss": 8.5956, "step": 3922 }, { "epoch": 0.7665103556076592, "grad_norm": 9.345423698425293, "learning_rate": 2.774051007898936e-06, "loss": 9.2563, "step": 3923 }, { "epoch": 0.7667057444314185, "grad_norm": 39.78207015991211, "learning_rate": 2.7696775676999833e-06, "loss": 9.5094, "step": 3924 }, { "epoch": 0.7669011332551778, "grad_norm": 9.660195350646973, "learning_rate": 2.765307023469643e-06, "loss": 9.2362, "step": 3925 }, { "epoch": 0.7670965220789371, "grad_norm": 13.873888969421387, "learning_rate": 2.760939376958448e-06, "loss": 8.566, "step": 3926 }, { "epoch": 0.7672919109026963, "grad_norm": 6.7762651443481445, "learning_rate": 2.7565746299157826e-06, "loss": 9.5676, "step": 3927 }, { "epoch": 0.7674872997264557, "grad_norm": 10.185317039489746, "learning_rate": 2.7522127840898603e-06, "loss": 9.5498, "step": 3928 }, { "epoch": 0.7676826885502149, "grad_norm": 6.987491130828857, "learning_rate": 2.747853841227732e-06, "loss": 8.372, "step": 3929 }, { "epoch": 0.7678780773739742, "grad_norm": 8.690187454223633, "learning_rate": 2.743497803075298e-06, "loss": 9.0992, "step": 3930 }, { "epoch": 0.7680734661977335, "grad_norm": 7.142638683319092, "learning_rate": 2.7391446713772775e-06, "loss": 8.3566, "step": 3931 }, { "epoch": 0.7682688550214928, "grad_norm": 7.855913162231445, "learning_rate": 2.7347944478772436e-06, "loss": 9.3046, "step": 3932 }, { "epoch": 0.768464243845252, "grad_norm": 10.175846099853516, "learning_rate": 2.7304471343175876e-06, "loss": 9.1877, "step": 3933 }, { "epoch": 0.7686596326690114, "grad_norm": 8.158573150634766, "learning_rate": 2.726102732439552e-06, "loss": 8.2, "step": 3934 }, { "epoch": 0.7688550214927706, "grad_norm": 13.254216194152832, "learning_rate": 2.7217612439831974e-06, "loss": 9.6704, "step": 3935 }, { "epoch": 0.7690504103165299, "grad_norm": 12.06765365600586, "learning_rate": 2.71742267068743e-06, "loss": 9.2111, "step": 3936 }, { "epoch": 0.7692457991402891, "grad_norm": 7.126428127288818, "learning_rate": 2.713087014289978e-06, "loss": 8.5633, "step": 3937 }, { "epoch": 0.7694411879640485, "grad_norm": 8.78329849243164, "learning_rate": 2.7087542765274135e-06, "loss": 8.7111, "step": 3938 }, { "epoch": 0.7696365767878077, "grad_norm": 7.155397415161133, "learning_rate": 2.704424459135123e-06, "loss": 8.6173, "step": 3939 }, { "epoch": 0.769831965611567, "grad_norm": 8.362483978271484, "learning_rate": 2.7000975638473425e-06, "loss": 9.1443, "step": 3940 }, { "epoch": 0.7700273544353263, "grad_norm": 10.86987590789795, "learning_rate": 2.695773592397123e-06, "loss": 9.1629, "step": 3941 }, { "epoch": 0.7702227432590856, "grad_norm": 9.431424140930176, "learning_rate": 2.6914525465163466e-06, "loss": 7.8835, "step": 3942 }, { "epoch": 0.7704181320828448, "grad_norm": 12.760275840759277, "learning_rate": 2.6871344279357335e-06, "loss": 8.8893, "step": 3943 }, { "epoch": 0.7706135209066042, "grad_norm": 13.065872192382812, "learning_rate": 2.682819238384816e-06, "loss": 9.2098, "step": 3944 }, { "epoch": 0.7708089097303634, "grad_norm": 7.924666404724121, "learning_rate": 2.678506979591969e-06, "loss": 9.6815, "step": 3945 }, { "epoch": 0.7710042985541227, "grad_norm": 8.195263862609863, "learning_rate": 2.6741976532843795e-06, "loss": 8.47, "step": 3946 }, { "epoch": 0.771199687377882, "grad_norm": 8.898924827575684, "learning_rate": 2.6698912611880735e-06, "loss": 8.9015, "step": 3947 }, { "epoch": 0.7713950762016413, "grad_norm": 6.993349075317383, "learning_rate": 2.6655878050278874e-06, "loss": 8.9547, "step": 3948 }, { "epoch": 0.7715904650254005, "grad_norm": 8.768937110900879, "learning_rate": 2.6612872865274954e-06, "loss": 9.3573, "step": 3949 }, { "epoch": 0.7717858538491599, "grad_norm": 7.907637119293213, "learning_rate": 2.656989707409383e-06, "loss": 8.6754, "step": 3950 }, { "epoch": 0.7719812426729191, "grad_norm": 7.222424507141113, "learning_rate": 2.6526950693948692e-06, "loss": 9.0668, "step": 3951 }, { "epoch": 0.7721766314966784, "grad_norm": 7.868619918823242, "learning_rate": 2.648403374204086e-06, "loss": 9.5197, "step": 3952 }, { "epoch": 0.7723720203204377, "grad_norm": 8.713985443115234, "learning_rate": 2.6441146235559933e-06, "loss": 9.4808, "step": 3953 }, { "epoch": 0.772567409144197, "grad_norm": 6.500898361206055, "learning_rate": 2.6398288191683687e-06, "loss": 7.9375, "step": 3954 }, { "epoch": 0.7727627979679562, "grad_norm": 8.31683349609375, "learning_rate": 2.635545962757806e-06, "loss": 8.9627, "step": 3955 }, { "epoch": 0.7729581867917155, "grad_norm": 6.714325904846191, "learning_rate": 2.6312660560397273e-06, "loss": 9.4997, "step": 3956 }, { "epoch": 0.7731535756154748, "grad_norm": 9.260298728942871, "learning_rate": 2.6269891007283664e-06, "loss": 9.3548, "step": 3957 }, { "epoch": 0.7733489644392341, "grad_norm": 8.693593978881836, "learning_rate": 2.6227150985367735e-06, "loss": 10.1707, "step": 3958 }, { "epoch": 0.7735443532629933, "grad_norm": 7.7326765060424805, "learning_rate": 2.618444051176824e-06, "loss": 8.2939, "step": 3959 }, { "epoch": 0.7737397420867527, "grad_norm": 10.119888305664062, "learning_rate": 2.6141759603592033e-06, "loss": 9.2571, "step": 3960 }, { "epoch": 0.7739351309105119, "grad_norm": 10.321942329406738, "learning_rate": 2.6099108277934105e-06, "loss": 9.8407, "step": 3961 }, { "epoch": 0.7741305197342712, "grad_norm": 11.165168762207031, "learning_rate": 2.6056486551877704e-06, "loss": 8.3103, "step": 3962 }, { "epoch": 0.7743259085580305, "grad_norm": 10.171852111816406, "learning_rate": 2.601389444249408e-06, "loss": 9.9649, "step": 3963 }, { "epoch": 0.7745212973817898, "grad_norm": 10.053415298461914, "learning_rate": 2.597133196684277e-06, "loss": 9.0758, "step": 3964 }, { "epoch": 0.774716686205549, "grad_norm": 9.017916679382324, "learning_rate": 2.592879914197131e-06, "loss": 8.4977, "step": 3965 }, { "epoch": 0.7749120750293084, "grad_norm": 26.458412170410156, "learning_rate": 2.5886295984915456e-06, "loss": 9.6598, "step": 3966 }, { "epoch": 0.7751074638530676, "grad_norm": 9.767171859741211, "learning_rate": 2.584382251269899e-06, "loss": 8.8856, "step": 3967 }, { "epoch": 0.7753028526768269, "grad_norm": 8.08504867553711, "learning_rate": 2.5801378742333928e-06, "loss": 8.772, "step": 3968 }, { "epoch": 0.7754982415005862, "grad_norm": 11.8588228225708, "learning_rate": 2.5758964690820254e-06, "loss": 8.7757, "step": 3969 }, { "epoch": 0.7756936303243455, "grad_norm": 7.2053985595703125, "learning_rate": 2.571658037514616e-06, "loss": 9.1364, "step": 3970 }, { "epoch": 0.7758890191481047, "grad_norm": 8.632712364196777, "learning_rate": 2.5674225812287834e-06, "loss": 8.9427, "step": 3971 }, { "epoch": 0.776084407971864, "grad_norm": 7.743727207183838, "learning_rate": 2.563190101920965e-06, "loss": 9.5031, "step": 3972 }, { "epoch": 0.7762797967956233, "grad_norm": 9.737512588500977, "learning_rate": 2.5589606012863968e-06, "loss": 7.5548, "step": 3973 }, { "epoch": 0.7764751856193826, "grad_norm": 9.05333423614502, "learning_rate": 2.5547340810191223e-06, "loss": 9.6562, "step": 3974 }, { "epoch": 0.7766705744431418, "grad_norm": 7.593080997467041, "learning_rate": 2.550510542811999e-06, "loss": 8.6516, "step": 3975 }, { "epoch": 0.7768659632669012, "grad_norm": 9.689908027648926, "learning_rate": 2.5462899883566815e-06, "loss": 9.416, "step": 3976 }, { "epoch": 0.7770613520906604, "grad_norm": 12.020401000976562, "learning_rate": 2.5420724193436377e-06, "loss": 10.2451, "step": 3977 }, { "epoch": 0.7772567409144197, "grad_norm": 8.075044631958008, "learning_rate": 2.5378578374621286e-06, "loss": 8.8815, "step": 3978 }, { "epoch": 0.777452129738179, "grad_norm": 9.765203475952148, "learning_rate": 2.533646244400231e-06, "loss": 8.5774, "step": 3979 }, { "epoch": 0.7776475185619383, "grad_norm": 6.740596771240234, "learning_rate": 2.529437641844813e-06, "loss": 8.7272, "step": 3980 }, { "epoch": 0.7778429073856975, "grad_norm": 8.368038177490234, "learning_rate": 2.525232031481557e-06, "loss": 8.9325, "step": 3981 }, { "epoch": 0.7780382962094569, "grad_norm": 9.283183097839355, "learning_rate": 2.5210294149949332e-06, "loss": 10.0498, "step": 3982 }, { "epoch": 0.7782336850332161, "grad_norm": 8.498453140258789, "learning_rate": 2.5168297940682273e-06, "loss": 8.8612, "step": 3983 }, { "epoch": 0.7784290738569754, "grad_norm": 8.46799087524414, "learning_rate": 2.5126331703835117e-06, "loss": 8.133, "step": 3984 }, { "epoch": 0.7786244626807347, "grad_norm": 6.731637477874756, "learning_rate": 2.5084395456216694e-06, "loss": 8.3384, "step": 3985 }, { "epoch": 0.778819851504494, "grad_norm": 10.091724395751953, "learning_rate": 2.5042489214623754e-06, "loss": 9.0973, "step": 3986 }, { "epoch": 0.7790152403282532, "grad_norm": 7.505545139312744, "learning_rate": 2.5000612995841024e-06, "loss": 8.7713, "step": 3987 }, { "epoch": 0.7792106291520126, "grad_norm": 9.824227333068848, "learning_rate": 2.4958766816641265e-06, "loss": 8.9991, "step": 3988 }, { "epoch": 0.7794060179757718, "grad_norm": 7.5289692878723145, "learning_rate": 2.4916950693785136e-06, "loss": 9.1619, "step": 3989 }, { "epoch": 0.779601406799531, "grad_norm": 17.181425094604492, "learning_rate": 2.4875164644021343e-06, "loss": 9.0135, "step": 3990 }, { "epoch": 0.7797967956232903, "grad_norm": 7.876916408538818, "learning_rate": 2.4833408684086434e-06, "loss": 7.9555, "step": 3991 }, { "epoch": 0.7799921844470497, "grad_norm": 7.679806709289551, "learning_rate": 2.4791682830705034e-06, "loss": 9.2439, "step": 3992 }, { "epoch": 0.7801875732708089, "grad_norm": 6.449591636657715, "learning_rate": 2.474998710058958e-06, "loss": 8.5342, "step": 3993 }, { "epoch": 0.7803829620945681, "grad_norm": 7.2947797775268555, "learning_rate": 2.470832151044056e-06, "loss": 8.8409, "step": 3994 }, { "epoch": 0.7805783509183275, "grad_norm": 28.776031494140625, "learning_rate": 2.4666686076946278e-06, "loss": 9.1977, "step": 3995 }, { "epoch": 0.7807737397420867, "grad_norm": 39.56930160522461, "learning_rate": 2.4625080816783075e-06, "loss": 9.3758, "step": 3996 }, { "epoch": 0.780969128565846, "grad_norm": 8.051578521728516, "learning_rate": 2.4583505746615112e-06, "loss": 8.1979, "step": 3997 }, { "epoch": 0.7811645173896054, "grad_norm": 6.23033332824707, "learning_rate": 2.454196088309453e-06, "loss": 8.2484, "step": 3998 }, { "epoch": 0.7813599062133646, "grad_norm": 8.523757934570312, "learning_rate": 2.450044624286131e-06, "loss": 9.0294, "step": 3999 }, { "epoch": 0.7815552950371238, "grad_norm": 11.340201377868652, "learning_rate": 2.445896184254334e-06, "loss": 9.384, "step": 4000 }, { "epoch": 0.7817506838608832, "grad_norm": 9.148540496826172, "learning_rate": 2.441750769875645e-06, "loss": 9.3063, "step": 4001 }, { "epoch": 0.7819460726846424, "grad_norm": 9.013167381286621, "learning_rate": 2.437608382810427e-06, "loss": 8.689, "step": 4002 }, { "epoch": 0.7821414615084017, "grad_norm": 8.351414680480957, "learning_rate": 2.4334690247178396e-06, "loss": 8.9728, "step": 4003 }, { "epoch": 0.782336850332161, "grad_norm": 12.632786750793457, "learning_rate": 2.429332697255817e-06, "loss": 9.2797, "step": 4004 }, { "epoch": 0.7825322391559203, "grad_norm": 8.268104553222656, "learning_rate": 2.425199402081095e-06, "loss": 8.5112, "step": 4005 }, { "epoch": 0.7827276279796795, "grad_norm": 6.804723262786865, "learning_rate": 2.4210691408491805e-06, "loss": 9.2314, "step": 4006 }, { "epoch": 0.7829230168034389, "grad_norm": 6.3445916175842285, "learning_rate": 2.416941915214377e-06, "loss": 8.6576, "step": 4007 }, { "epoch": 0.7831184056271981, "grad_norm": 16.956613540649414, "learning_rate": 2.412817726829759e-06, "loss": 9.0176, "step": 4008 }, { "epoch": 0.7833137944509574, "grad_norm": 8.851213455200195, "learning_rate": 2.4086965773471993e-06, "loss": 8.9511, "step": 4009 }, { "epoch": 0.7835091832747166, "grad_norm": 8.128167152404785, "learning_rate": 2.40457846841734e-06, "loss": 9.7655, "step": 4010 }, { "epoch": 0.783704572098476, "grad_norm": 7.9139533042907715, "learning_rate": 2.4004634016896176e-06, "loss": 8.9192, "step": 4011 }, { "epoch": 0.7838999609222352, "grad_norm": 23.174137115478516, "learning_rate": 2.3963513788122416e-06, "loss": 9.2223, "step": 4012 }, { "epoch": 0.7840953497459945, "grad_norm": 9.148961067199707, "learning_rate": 2.392242401432202e-06, "loss": 8.3294, "step": 4013 }, { "epoch": 0.7842907385697538, "grad_norm": 10.631275177001953, "learning_rate": 2.3881364711952757e-06, "loss": 8.4833, "step": 4014 }, { "epoch": 0.7844861273935131, "grad_norm": 8.675843238830566, "learning_rate": 2.384033589746012e-06, "loss": 9.5845, "step": 4015 }, { "epoch": 0.7846815162172723, "grad_norm": 10.582771301269531, "learning_rate": 2.379933758727746e-06, "loss": 8.7382, "step": 4016 }, { "epoch": 0.7848769050410317, "grad_norm": 30.143577575683594, "learning_rate": 2.375836979782582e-06, "loss": 8.6157, "step": 4017 }, { "epoch": 0.7850722938647909, "grad_norm": 10.265090942382812, "learning_rate": 2.371743254551414e-06, "loss": 9.9724, "step": 4018 }, { "epoch": 0.7852676826885502, "grad_norm": 9.800621032714844, "learning_rate": 2.3676525846738996e-06, "loss": 8.9021, "step": 4019 }, { "epoch": 0.7854630715123095, "grad_norm": 7.986928939819336, "learning_rate": 2.363564971788486e-06, "loss": 8.969, "step": 4020 }, { "epoch": 0.7856584603360688, "grad_norm": 9.971089363098145, "learning_rate": 2.3594804175323806e-06, "loss": 9.3363, "step": 4021 }, { "epoch": 0.785853849159828, "grad_norm": 10.663592338562012, "learning_rate": 2.355398923541582e-06, "loss": 8.7179, "step": 4022 }, { "epoch": 0.7860492379835874, "grad_norm": 10.637864112854004, "learning_rate": 2.351320491450849e-06, "loss": 9.0681, "step": 4023 }, { "epoch": 0.7862446268073466, "grad_norm": 13.780783653259277, "learning_rate": 2.3472451228937254e-06, "loss": 9.7322, "step": 4024 }, { "epoch": 0.7864400156311059, "grad_norm": 7.653302192687988, "learning_rate": 2.343172819502522e-06, "loss": 9.7462, "step": 4025 }, { "epoch": 0.7866354044548651, "grad_norm": 18.18355941772461, "learning_rate": 2.3391035829083173e-06, "loss": 8.2701, "step": 4026 }, { "epoch": 0.7868307932786245, "grad_norm": 8.42194938659668, "learning_rate": 2.3350374147409748e-06, "loss": 8.2877, "step": 4027 }, { "epoch": 0.7870261821023837, "grad_norm": 9.77413558959961, "learning_rate": 2.330974316629113e-06, "loss": 8.9961, "step": 4028 }, { "epoch": 0.787221570926143, "grad_norm": 9.971951484680176, "learning_rate": 2.3269142902001374e-06, "loss": 8.6043, "step": 4029 }, { "epoch": 0.7874169597499023, "grad_norm": 6.719539642333984, "learning_rate": 2.3228573370802064e-06, "loss": 8.1629, "step": 4030 }, { "epoch": 0.7876123485736616, "grad_norm": 7.909749507904053, "learning_rate": 2.3188034588942633e-06, "loss": 9.08, "step": 4031 }, { "epoch": 0.7878077373974208, "grad_norm": 8.593505859375, "learning_rate": 2.314752657266005e-06, "loss": 8.6831, "step": 4032 }, { "epoch": 0.7880031262211802, "grad_norm": 7.045526504516602, "learning_rate": 2.31070493381791e-06, "loss": 7.8853, "step": 4033 }, { "epoch": 0.7881985150449394, "grad_norm": 8.301514625549316, "learning_rate": 2.306660290171211e-06, "loss": 8.8027, "step": 4034 }, { "epoch": 0.7883939038686987, "grad_norm": 8.381290435791016, "learning_rate": 2.302618727945919e-06, "loss": 8.9424, "step": 4035 }, { "epoch": 0.788589292692458, "grad_norm": 10.212125778198242, "learning_rate": 2.298580248760799e-06, "loss": 9.5702, "step": 4036 }, { "epoch": 0.7887846815162173, "grad_norm": 11.792108535766602, "learning_rate": 2.294544854233395e-06, "loss": 9.9829, "step": 4037 }, { "epoch": 0.7889800703399765, "grad_norm": 9.178725242614746, "learning_rate": 2.2905125459800027e-06, "loss": 9.7524, "step": 4038 }, { "epoch": 0.7891754591637359, "grad_norm": 10.572028160095215, "learning_rate": 2.286483325615685e-06, "loss": 9.0691, "step": 4039 }, { "epoch": 0.7893708479874951, "grad_norm": 7.9066338539123535, "learning_rate": 2.282457194754275e-06, "loss": 8.8206, "step": 4040 }, { "epoch": 0.7895662368112544, "grad_norm": 7.298490524291992, "learning_rate": 2.2784341550083577e-06, "loss": 8.3582, "step": 4041 }, { "epoch": 0.7897616256350137, "grad_norm": 11.32341194152832, "learning_rate": 2.2744142079892906e-06, "loss": 9.8477, "step": 4042 }, { "epoch": 0.789957014458773, "grad_norm": 7.7698845863342285, "learning_rate": 2.2703973553071834e-06, "loss": 8.897, "step": 4043 }, { "epoch": 0.7901524032825322, "grad_norm": 8.147603988647461, "learning_rate": 2.2663835985709138e-06, "loss": 9.0695, "step": 4044 }, { "epoch": 0.7903477921062915, "grad_norm": 7.221194267272949, "learning_rate": 2.2623729393881113e-06, "loss": 9.3573, "step": 4045 }, { "epoch": 0.7905431809300508, "grad_norm": 7.137025356292725, "learning_rate": 2.258365379365174e-06, "loss": 8.1276, "step": 4046 }, { "epoch": 0.7907385697538101, "grad_norm": 8.200143814086914, "learning_rate": 2.25436092010725e-06, "loss": 9.598, "step": 4047 }, { "epoch": 0.7909339585775693, "grad_norm": 8.565221786499023, "learning_rate": 2.250359563218255e-06, "loss": 8.8693, "step": 4048 }, { "epoch": 0.7911293474013287, "grad_norm": 12.058979034423828, "learning_rate": 2.24636131030085e-06, "loss": 9.9144, "step": 4049 }, { "epoch": 0.7913247362250879, "grad_norm": 9.160593032836914, "learning_rate": 2.242366162956465e-06, "loss": 9.7148, "step": 4050 }, { "epoch": 0.7915201250488472, "grad_norm": 11.761293411254883, "learning_rate": 2.2383741227852796e-06, "loss": 8.9634, "step": 4051 }, { "epoch": 0.7917155138726065, "grad_norm": 8.902557373046875, "learning_rate": 2.2343851913862258e-06, "loss": 8.9805, "step": 4052 }, { "epoch": 0.7919109026963658, "grad_norm": 12.234577178955078, "learning_rate": 2.2303993703569993e-06, "loss": 9.676, "step": 4053 }, { "epoch": 0.792106291520125, "grad_norm": 29.982467651367188, "learning_rate": 2.226416661294041e-06, "loss": 8.1496, "step": 4054 }, { "epoch": 0.7923016803438844, "grad_norm": 12.731390953063965, "learning_rate": 2.222437065792554e-06, "loss": 8.9748, "step": 4055 }, { "epoch": 0.7924970691676436, "grad_norm": 7.898471832275391, "learning_rate": 2.218460585446486e-06, "loss": 8.7759, "step": 4056 }, { "epoch": 0.7926924579914029, "grad_norm": 8.303677558898926, "learning_rate": 2.2144872218485435e-06, "loss": 8.8412, "step": 4057 }, { "epoch": 0.7928878468151622, "grad_norm": 9.355086326599121, "learning_rate": 2.210516976590179e-06, "loss": 9.176, "step": 4058 }, { "epoch": 0.7930832356389215, "grad_norm": 8.416228294372559, "learning_rate": 2.2065498512616036e-06, "loss": 8.2258, "step": 4059 }, { "epoch": 0.7932786244626807, "grad_norm": 9.133707046508789, "learning_rate": 2.202585847451768e-06, "loss": 8.8832, "step": 4060 }, { "epoch": 0.79347401328644, "grad_norm": 7.949434757232666, "learning_rate": 2.198624966748385e-06, "loss": 9.0004, "step": 4061 }, { "epoch": 0.7936694021101993, "grad_norm": 7.470557689666748, "learning_rate": 2.1946672107379052e-06, "loss": 8.2045, "step": 4062 }, { "epoch": 0.7938647909339586, "grad_norm": 8.743266105651855, "learning_rate": 2.1907125810055373e-06, "loss": 9.0787, "step": 4063 }, { "epoch": 0.7940601797577178, "grad_norm": 8.27902603149414, "learning_rate": 2.1867610791352313e-06, "loss": 9.0252, "step": 4064 }, { "epoch": 0.7942555685814772, "grad_norm": 8.170581817626953, "learning_rate": 2.1828127067096825e-06, "loss": 7.8556, "step": 4065 }, { "epoch": 0.7944509574052364, "grad_norm": 8.815306663513184, "learning_rate": 2.1788674653103436e-06, "loss": 9.4213, "step": 4066 }, { "epoch": 0.7946463462289957, "grad_norm": 6.542332649230957, "learning_rate": 2.174925356517401e-06, "loss": 7.9175, "step": 4067 }, { "epoch": 0.794841735052755, "grad_norm": 8.297599792480469, "learning_rate": 2.1709863819097952e-06, "loss": 9.1062, "step": 4068 }, { "epoch": 0.7950371238765143, "grad_norm": 9.112894058227539, "learning_rate": 2.1670505430652044e-06, "loss": 7.9776, "step": 4069 }, { "epoch": 0.7952325127002735, "grad_norm": 7.934484958648682, "learning_rate": 2.1631178415600572e-06, "loss": 8.5803, "step": 4070 }, { "epoch": 0.7954279015240329, "grad_norm": 7.044009208679199, "learning_rate": 2.1591882789695207e-06, "loss": 7.5398, "step": 4071 }, { "epoch": 0.7956232903477921, "grad_norm": 10.852298736572266, "learning_rate": 2.1552618568675088e-06, "loss": 7.795, "step": 4072 }, { "epoch": 0.7958186791715514, "grad_norm": 7.112560749053955, "learning_rate": 2.1513385768266713e-06, "loss": 8.4551, "step": 4073 }, { "epoch": 0.7960140679953107, "grad_norm": 7.160801887512207, "learning_rate": 2.1474184404184105e-06, "loss": 8.8399, "step": 4074 }, { "epoch": 0.79620945681907, "grad_norm": 7.856019973754883, "learning_rate": 2.1435014492128547e-06, "loss": 8.5651, "step": 4075 }, { "epoch": 0.7964048456428292, "grad_norm": 11.128726959228516, "learning_rate": 2.139587604778889e-06, "loss": 9.135, "step": 4076 }, { "epoch": 0.7966002344665886, "grad_norm": 11.092827796936035, "learning_rate": 2.1356769086841233e-06, "loss": 8.9956, "step": 4077 }, { "epoch": 0.7967956232903478, "grad_norm": 6.319994926452637, "learning_rate": 2.131769362494913e-06, "loss": 8.4058, "step": 4078 }, { "epoch": 0.7969910121141071, "grad_norm": 7.189726829528809, "learning_rate": 2.1278649677763553e-06, "loss": 8.8374, "step": 4079 }, { "epoch": 0.7971864009378663, "grad_norm": 7.846137046813965, "learning_rate": 2.1239637260922773e-06, "loss": 8.5696, "step": 4080 }, { "epoch": 0.7973817897616257, "grad_norm": 8.4998197555542, "learning_rate": 2.1200656390052523e-06, "loss": 9.4478, "step": 4081 }, { "epoch": 0.7975771785853849, "grad_norm": 8.599392890930176, "learning_rate": 2.116170708076579e-06, "loss": 9.4599, "step": 4082 }, { "epoch": 0.7977725674091442, "grad_norm": 9.487367630004883, "learning_rate": 2.112278934866305e-06, "loss": 8.6577, "step": 4083 }, { "epoch": 0.7979679562329035, "grad_norm": 6.9012885093688965, "learning_rate": 2.1083903209332004e-06, "loss": 8.4376, "step": 4084 }, { "epoch": 0.7981633450566628, "grad_norm": 7.9485297203063965, "learning_rate": 2.1045048678347814e-06, "loss": 8.9016, "step": 4085 }, { "epoch": 0.798358733880422, "grad_norm": 9.573652267456055, "learning_rate": 2.100622577127286e-06, "loss": 8.789, "step": 4086 }, { "epoch": 0.7985541227041814, "grad_norm": 8.524712562561035, "learning_rate": 2.0967434503656993e-06, "loss": 9.0548, "step": 4087 }, { "epoch": 0.7987495115279406, "grad_norm": 6.874051094055176, "learning_rate": 2.0928674891037247e-06, "loss": 9.174, "step": 4088 }, { "epoch": 0.7989449003516998, "grad_norm": 7.7614264488220215, "learning_rate": 2.0889946948938123e-06, "loss": 8.8263, "step": 4089 }, { "epoch": 0.7991402891754592, "grad_norm": 8.428397178649902, "learning_rate": 2.0851250692871327e-06, "loss": 8.4872, "step": 4090 }, { "epoch": 0.7993356779992185, "grad_norm": 7.815609931945801, "learning_rate": 2.081258613833589e-06, "loss": 9.7076, "step": 4091 }, { "epoch": 0.7995310668229777, "grad_norm": 8.944595336914062, "learning_rate": 2.0773953300818204e-06, "loss": 9.3888, "step": 4092 }, { "epoch": 0.7997264556467371, "grad_norm": 8.181797981262207, "learning_rate": 2.073535219579189e-06, "loss": 9.0652, "step": 4093 }, { "epoch": 0.7999218444704963, "grad_norm": 12.623723983764648, "learning_rate": 2.0696782838717934e-06, "loss": 9.017, "step": 4094 }, { "epoch": 0.8001172332942555, "grad_norm": 8.842370986938477, "learning_rate": 2.0658245245044515e-06, "loss": 9.6341, "step": 4095 }, { "epoch": 0.8003126221180149, "grad_norm": 10.332437515258789, "learning_rate": 2.0619739430207162e-06, "loss": 9.6929, "step": 4096 }, { "epoch": 0.8005080109417742, "grad_norm": 7.728264808654785, "learning_rate": 2.0581265409628613e-06, "loss": 8.5661, "step": 4097 }, { "epoch": 0.8007033997655334, "grad_norm": 8.843079566955566, "learning_rate": 2.0542823198718954e-06, "loss": 8.7008, "step": 4098 }, { "epoch": 0.8008987885892926, "grad_norm": 7.340389728546143, "learning_rate": 2.050441281287544e-06, "loss": 9.6193, "step": 4099 }, { "epoch": 0.801094177413052, "grad_norm": 8.187955856323242, "learning_rate": 2.046603426748267e-06, "loss": 9.2346, "step": 4100 }, { "epoch": 0.8012895662368112, "grad_norm": 13.1972017288208, "learning_rate": 2.0427687577912404e-06, "loss": 10.442, "step": 4101 }, { "epoch": 0.8014849550605705, "grad_norm": 7.015872478485107, "learning_rate": 2.038937275952372e-06, "loss": 7.6382, "step": 4102 }, { "epoch": 0.8016803438843298, "grad_norm": 10.845377922058105, "learning_rate": 2.0351089827662827e-06, "loss": 9.1876, "step": 4103 }, { "epoch": 0.8018757327080891, "grad_norm": 6.643104553222656, "learning_rate": 2.0312838797663315e-06, "loss": 8.6192, "step": 4104 }, { "epoch": 0.8020711215318483, "grad_norm": 8.04154109954834, "learning_rate": 2.027461968484583e-06, "loss": 7.2225, "step": 4105 }, { "epoch": 0.8022665103556077, "grad_norm": 9.899718284606934, "learning_rate": 2.023643250451838e-06, "loss": 9.4718, "step": 4106 }, { "epoch": 0.8024618991793669, "grad_norm": 6.92594051361084, "learning_rate": 2.019827727197605e-06, "loss": 8.9539, "step": 4107 }, { "epoch": 0.8026572880031262, "grad_norm": 15.220681190490723, "learning_rate": 2.0160154002501265e-06, "loss": 9.6917, "step": 4108 }, { "epoch": 0.8028526768268855, "grad_norm": 6.64687442779541, "learning_rate": 2.012206271136353e-06, "loss": 8.9374, "step": 4109 }, { "epoch": 0.8030480656506448, "grad_norm": 6.912804126739502, "learning_rate": 2.0084003413819585e-06, "loss": 8.6066, "step": 4110 }, { "epoch": 0.803243454474404, "grad_norm": 8.238934516906738, "learning_rate": 2.0045976125113407e-06, "loss": 8.3299, "step": 4111 }, { "epoch": 0.8034388432981634, "grad_norm": 9.851848602294922, "learning_rate": 2.000798086047604e-06, "loss": 9.4563, "step": 4112 }, { "epoch": 0.8036342321219226, "grad_norm": 8.415313720703125, "learning_rate": 1.997001763512584e-06, "loss": 8.1643, "step": 4113 }, { "epoch": 0.8038296209456819, "grad_norm": 86.34687042236328, "learning_rate": 1.9932086464268187e-06, "loss": 8.9568, "step": 4114 }, { "epoch": 0.8040250097694411, "grad_norm": 7.504203796386719, "learning_rate": 1.989418736309575e-06, "loss": 9.5223, "step": 4115 }, { "epoch": 0.8042203985932005, "grad_norm": 9.638359069824219, "learning_rate": 1.985632034678825e-06, "loss": 9.614, "step": 4116 }, { "epoch": 0.8044157874169597, "grad_norm": 10.84245491027832, "learning_rate": 1.981848543051265e-06, "loss": 9.5363, "step": 4117 }, { "epoch": 0.804611176240719, "grad_norm": 7.719939231872559, "learning_rate": 1.9780682629422954e-06, "loss": 9.0556, "step": 4118 }, { "epoch": 0.8048065650644783, "grad_norm": 8.364896774291992, "learning_rate": 1.9742911958660406e-06, "loss": 8.1381, "step": 4119 }, { "epoch": 0.8050019538882376, "grad_norm": 11.49855899810791, "learning_rate": 1.9705173433353297e-06, "loss": 9.4236, "step": 4120 }, { "epoch": 0.8051973427119968, "grad_norm": 8.680183410644531, "learning_rate": 1.966746706861712e-06, "loss": 9.4808, "step": 4121 }, { "epoch": 0.8053927315357562, "grad_norm": 8.259016990661621, "learning_rate": 1.962979287955441e-06, "loss": 9.5783, "step": 4122 }, { "epoch": 0.8055881203595154, "grad_norm": 7.60164213180542, "learning_rate": 1.9592150881254836e-06, "loss": 8.1726, "step": 4123 }, { "epoch": 0.8057835091832747, "grad_norm": 8.234026908874512, "learning_rate": 1.9554541088795244e-06, "loss": 9.0773, "step": 4124 }, { "epoch": 0.805978898007034, "grad_norm": 10.081842422485352, "learning_rate": 1.951696351723945e-06, "loss": 8.6069, "step": 4125 }, { "epoch": 0.8061742868307933, "grad_norm": 10.049712181091309, "learning_rate": 1.9479418181638508e-06, "loss": 9.2187, "step": 4126 }, { "epoch": 0.8063696756545525, "grad_norm": 9.004876136779785, "learning_rate": 1.9441905097030434e-06, "loss": 7.5274, "step": 4127 }, { "epoch": 0.8065650644783119, "grad_norm": 8.803363800048828, "learning_rate": 1.9404424278440424e-06, "loss": 8.1947, "step": 4128 }, { "epoch": 0.8067604533020711, "grad_norm": 8.290931701660156, "learning_rate": 1.9366975740880676e-06, "loss": 8.6446, "step": 4129 }, { "epoch": 0.8069558421258304, "grad_norm": 7.384932041168213, "learning_rate": 1.932955949935054e-06, "loss": 9.1818, "step": 4130 }, { "epoch": 0.8071512309495897, "grad_norm": 12.507975578308105, "learning_rate": 1.9292175568836314e-06, "loss": 10.3594, "step": 4131 }, { "epoch": 0.807346619773349, "grad_norm": 11.068806648254395, "learning_rate": 1.9254823964311488e-06, "loss": 9.9343, "step": 4132 }, { "epoch": 0.8075420085971082, "grad_norm": 11.15834903717041, "learning_rate": 1.9217504700736487e-06, "loss": 8.3411, "step": 4133 }, { "epoch": 0.8077373974208675, "grad_norm": 12.640612602233887, "learning_rate": 1.91802177930589e-06, "loss": 9.8704, "step": 4134 }, { "epoch": 0.8079327862446268, "grad_norm": 7.127387046813965, "learning_rate": 1.9142963256213234e-06, "loss": 8.5134, "step": 4135 }, { "epoch": 0.8081281750683861, "grad_norm": 8.432134628295898, "learning_rate": 1.910574110512108e-06, "loss": 8.7671, "step": 4136 }, { "epoch": 0.8083235638921453, "grad_norm": 11.820125579833984, "learning_rate": 1.9068551354691124e-06, "loss": 8.6807, "step": 4137 }, { "epoch": 0.8085189527159047, "grad_norm": 7.877838611602783, "learning_rate": 1.903139401981895e-06, "loss": 8.9795, "step": 4138 }, { "epoch": 0.8087143415396639, "grad_norm": 8.861102104187012, "learning_rate": 1.8994269115387276e-06, "loss": 9.2973, "step": 4139 }, { "epoch": 0.8089097303634232, "grad_norm": 8.142215728759766, "learning_rate": 1.8957176656265741e-06, "loss": 8.6584, "step": 4140 }, { "epoch": 0.8091051191871825, "grad_norm": 7.580594539642334, "learning_rate": 1.892011665731106e-06, "loss": 8.9637, "step": 4141 }, { "epoch": 0.8093005080109418, "grad_norm": 10.086572647094727, "learning_rate": 1.8883089133366882e-06, "loss": 9.515, "step": 4142 }, { "epoch": 0.809495896834701, "grad_norm": 6.808967590332031, "learning_rate": 1.8846094099263911e-06, "loss": 8.3363, "step": 4143 }, { "epoch": 0.8096912856584604, "grad_norm": 7.257555961608887, "learning_rate": 1.880913156981977e-06, "loss": 8.6286, "step": 4144 }, { "epoch": 0.8098866744822196, "grad_norm": 8.123153686523438, "learning_rate": 1.8772201559839143e-06, "loss": 8.6086, "step": 4145 }, { "epoch": 0.8100820633059789, "grad_norm": 10.420025825500488, "learning_rate": 1.8735304084113593e-06, "loss": 10.4163, "step": 4146 }, { "epoch": 0.8102774521297382, "grad_norm": 9.424251556396484, "learning_rate": 1.8698439157421767e-06, "loss": 9.763, "step": 4147 }, { "epoch": 0.8104728409534975, "grad_norm": 7.444462776184082, "learning_rate": 1.8661606794529174e-06, "loss": 8.2321, "step": 4148 }, { "epoch": 0.8106682297772567, "grad_norm": 12.281042098999023, "learning_rate": 1.8624807010188305e-06, "loss": 10.1341, "step": 4149 }, { "epoch": 0.810863618601016, "grad_norm": 6.998084545135498, "learning_rate": 1.858803981913866e-06, "loss": 9.0192, "step": 4150 }, { "epoch": 0.8110590074247753, "grad_norm": 7.253018379211426, "learning_rate": 1.8551305236106588e-06, "loss": 8.8867, "step": 4151 }, { "epoch": 0.8112543962485346, "grad_norm": 7.112781524658203, "learning_rate": 1.8514603275805487e-06, "loss": 8.7034, "step": 4152 }, { "epoch": 0.8114497850722938, "grad_norm": 7.53427267074585, "learning_rate": 1.8477933952935568e-06, "loss": 8.161, "step": 4153 }, { "epoch": 0.8116451738960532, "grad_norm": 11.889721870422363, "learning_rate": 1.84412972821841e-06, "loss": 9.7866, "step": 4154 }, { "epoch": 0.8118405627198124, "grad_norm": 11.897488594055176, "learning_rate": 1.840469327822515e-06, "loss": 9.418, "step": 4155 }, { "epoch": 0.8120359515435717, "grad_norm": 10.787230491638184, "learning_rate": 1.8368121955719808e-06, "loss": 10.2132, "step": 4156 }, { "epoch": 0.812231340367331, "grad_norm": 10.199027061462402, "learning_rate": 1.8331583329315972e-06, "loss": 8.7687, "step": 4157 }, { "epoch": 0.8124267291910903, "grad_norm": 9.7492094039917, "learning_rate": 1.8295077413648555e-06, "loss": 9.6211, "step": 4158 }, { "epoch": 0.8126221180148495, "grad_norm": 7.9797515869140625, "learning_rate": 1.825860422333925e-06, "loss": 8.9543, "step": 4159 }, { "epoch": 0.8128175068386089, "grad_norm": 15.008012771606445, "learning_rate": 1.8222163772996749e-06, "loss": 8.6499, "step": 4160 }, { "epoch": 0.8130128956623681, "grad_norm": 8.104180335998535, "learning_rate": 1.8185756077216576e-06, "loss": 8.5021, "step": 4161 }, { "epoch": 0.8132082844861274, "grad_norm": 7.260696887969971, "learning_rate": 1.814938115058109e-06, "loss": 8.7586, "step": 4162 }, { "epoch": 0.8134036733098867, "grad_norm": 7.8021159172058105, "learning_rate": 1.811303900765966e-06, "loss": 8.4024, "step": 4163 }, { "epoch": 0.813599062133646, "grad_norm": 11.577256202697754, "learning_rate": 1.8076729663008374e-06, "loss": 8.7474, "step": 4164 }, { "epoch": 0.8137944509574052, "grad_norm": 12.749056816101074, "learning_rate": 1.804045313117031e-06, "loss": 9.4589, "step": 4165 }, { "epoch": 0.8139898397811646, "grad_norm": 16.994075775146484, "learning_rate": 1.800420942667529e-06, "loss": 8.5454, "step": 4166 }, { "epoch": 0.8141852286049238, "grad_norm": 9.551435470581055, "learning_rate": 1.796799856404009e-06, "loss": 9.6272, "step": 4167 }, { "epoch": 0.8143806174286831, "grad_norm": 8.51735782623291, "learning_rate": 1.7931820557768241e-06, "loss": 7.9052, "step": 4168 }, { "epoch": 0.8145760062524423, "grad_norm": 9.26491928100586, "learning_rate": 1.78956754223502e-06, "loss": 9.768, "step": 4169 }, { "epoch": 0.8147713950762017, "grad_norm": 9.60155200958252, "learning_rate": 1.7859563172263183e-06, "loss": 9.4273, "step": 4170 }, { "epoch": 0.8149667838999609, "grad_norm": 9.635828971862793, "learning_rate": 1.78234838219713e-06, "loss": 9.2051, "step": 4171 }, { "epoch": 0.8151621727237202, "grad_norm": 7.847221851348877, "learning_rate": 1.7787437385925398e-06, "loss": 8.7587, "step": 4172 }, { "epoch": 0.8153575615474795, "grad_norm": 8.79216480255127, "learning_rate": 1.7751423878563267e-06, "loss": 8.9006, "step": 4173 }, { "epoch": 0.8155529503712388, "grad_norm": 8.901490211486816, "learning_rate": 1.771544331430939e-06, "loss": 8.9884, "step": 4174 }, { "epoch": 0.815748339194998, "grad_norm": 8.927732467651367, "learning_rate": 1.7679495707575078e-06, "loss": 8.4172, "step": 4175 }, { "epoch": 0.8159437280187574, "grad_norm": 9.58761215209961, "learning_rate": 1.7643581072758509e-06, "loss": 9.075, "step": 4176 }, { "epoch": 0.8161391168425166, "grad_norm": 8.125791549682617, "learning_rate": 1.7607699424244583e-06, "loss": 8.1893, "step": 4177 }, { "epoch": 0.8163345056662759, "grad_norm": 8.676210403442383, "learning_rate": 1.757185077640503e-06, "loss": 8.5555, "step": 4178 }, { "epoch": 0.8165298944900352, "grad_norm": 7.854306221008301, "learning_rate": 1.7536035143598328e-06, "loss": 9.0119, "step": 4179 }, { "epoch": 0.8167252833137945, "grad_norm": 8.398869514465332, "learning_rate": 1.7500252540169782e-06, "loss": 8.9472, "step": 4180 }, { "epoch": 0.8169206721375537, "grad_norm": 9.091753959655762, "learning_rate": 1.746450298045138e-06, "loss": 9.2992, "step": 4181 }, { "epoch": 0.8171160609613131, "grad_norm": 7.773633003234863, "learning_rate": 1.7428786478762005e-06, "loss": 8.4178, "step": 4182 }, { "epoch": 0.8173114497850723, "grad_norm": 9.261589050292969, "learning_rate": 1.7393103049407145e-06, "loss": 9.1313, "step": 4183 }, { "epoch": 0.8175068386088316, "grad_norm": 8.865499496459961, "learning_rate": 1.73574527066792e-06, "loss": 8.8908, "step": 4184 }, { "epoch": 0.8177022274325909, "grad_norm": 11.810985565185547, "learning_rate": 1.7321835464857162e-06, "loss": 9.0723, "step": 4185 }, { "epoch": 0.8178976162563502, "grad_norm": 13.720571517944336, "learning_rate": 1.7286251338206916e-06, "loss": 9.3866, "step": 4186 }, { "epoch": 0.8180930050801094, "grad_norm": 8.280376434326172, "learning_rate": 1.7250700340980942e-06, "loss": 8.0574, "step": 4187 }, { "epoch": 0.8182883939038686, "grad_norm": 7.536670207977295, "learning_rate": 1.721518248741858e-06, "loss": 8.0419, "step": 4188 }, { "epoch": 0.818483782727628, "grad_norm": 7.725442886352539, "learning_rate": 1.7179697791745797e-06, "loss": 8.6258, "step": 4189 }, { "epoch": 0.8186791715513873, "grad_norm": 8.78536319732666, "learning_rate": 1.7144246268175291e-06, "loss": 8.5599, "step": 4190 }, { "epoch": 0.8188745603751465, "grad_norm": 6.458619117736816, "learning_rate": 1.7108827930906568e-06, "loss": 7.56, "step": 4191 }, { "epoch": 0.8190699491989059, "grad_norm": 8.604883193969727, "learning_rate": 1.7073442794125694e-06, "loss": 9.3799, "step": 4192 }, { "epoch": 0.8192653380226651, "grad_norm": 9.05053997039795, "learning_rate": 1.7038090872005587e-06, "loss": 9.1484, "step": 4193 }, { "epoch": 0.8194607268464243, "grad_norm": 7.796328544616699, "learning_rate": 1.7002772178705717e-06, "loss": 9.0087, "step": 4194 }, { "epoch": 0.8196561156701837, "grad_norm": 8.378427505493164, "learning_rate": 1.6967486728372395e-06, "loss": 9.9526, "step": 4195 }, { "epoch": 0.819851504493943, "grad_norm": 13.551206588745117, "learning_rate": 1.693223453513847e-06, "loss": 9.6657, "step": 4196 }, { "epoch": 0.8200468933177022, "grad_norm": 9.289401054382324, "learning_rate": 1.689701561312359e-06, "loss": 9.4121, "step": 4197 }, { "epoch": 0.8202422821414616, "grad_norm": 8.325844764709473, "learning_rate": 1.6861829976433974e-06, "loss": 8.3365, "step": 4198 }, { "epoch": 0.8204376709652208, "grad_norm": 9.091864585876465, "learning_rate": 1.6826677639162635e-06, "loss": 9.9388, "step": 4199 }, { "epoch": 0.82063305978898, "grad_norm": 7.198275089263916, "learning_rate": 1.6791558615389103e-06, "loss": 8.9716, "step": 4200 }, { "epoch": 0.8208284486127394, "grad_norm": 9.158873558044434, "learning_rate": 1.675647291917969e-06, "loss": 9.7336, "step": 4201 }, { "epoch": 0.8210238374364986, "grad_norm": 8.986889839172363, "learning_rate": 1.6721420564587288e-06, "loss": 8.8077, "step": 4202 }, { "epoch": 0.8212192262602579, "grad_norm": 8.387405395507812, "learning_rate": 1.6686401565651423e-06, "loss": 9.0873, "step": 4203 }, { "epoch": 0.8214146150840171, "grad_norm": 10.184917449951172, "learning_rate": 1.665141593639834e-06, "loss": 9.9188, "step": 4204 }, { "epoch": 0.8216100039077765, "grad_norm": 8.070707321166992, "learning_rate": 1.6616463690840833e-06, "loss": 8.8431, "step": 4205 }, { "epoch": 0.8218053927315357, "grad_norm": 7.336175441741943, "learning_rate": 1.6581544842978391e-06, "loss": 8.619, "step": 4206 }, { "epoch": 0.822000781555295, "grad_norm": 6.866813659667969, "learning_rate": 1.6546659406797072e-06, "loss": 8.6299, "step": 4207 }, { "epoch": 0.8221961703790543, "grad_norm": 8.972613334655762, "learning_rate": 1.6511807396269607e-06, "loss": 9.2132, "step": 4208 }, { "epoch": 0.8223915592028136, "grad_norm": 10.178311347961426, "learning_rate": 1.6476988825355257e-06, "loss": 9.4726, "step": 4209 }, { "epoch": 0.8225869480265728, "grad_norm": 10.723286628723145, "learning_rate": 1.6442203707999994e-06, "loss": 9.7473, "step": 4210 }, { "epoch": 0.8227823368503322, "grad_norm": 7.932458400726318, "learning_rate": 1.6407452058136298e-06, "loss": 9.3409, "step": 4211 }, { "epoch": 0.8229777256740914, "grad_norm": 8.253729820251465, "learning_rate": 1.6372733889683323e-06, "loss": 9.2522, "step": 4212 }, { "epoch": 0.8231731144978507, "grad_norm": 7.544541835784912, "learning_rate": 1.6338049216546725e-06, "loss": 8.1359, "step": 4213 }, { "epoch": 0.82336850332161, "grad_norm": 9.513879776000977, "learning_rate": 1.6303398052618845e-06, "loss": 8.9512, "step": 4214 }, { "epoch": 0.8235638921453693, "grad_norm": 9.312100410461426, "learning_rate": 1.6268780411778529e-06, "loss": 9.0866, "step": 4215 }, { "epoch": 0.8237592809691285, "grad_norm": 7.766869068145752, "learning_rate": 1.6234196307891182e-06, "loss": 8.1376, "step": 4216 }, { "epoch": 0.8239546697928879, "grad_norm": 10.72817325592041, "learning_rate": 1.6199645754808867e-06, "loss": 10.2697, "step": 4217 }, { "epoch": 0.8241500586166471, "grad_norm": 7.453179836273193, "learning_rate": 1.6165128766370097e-06, "loss": 8.6152, "step": 4218 }, { "epoch": 0.8243454474404064, "grad_norm": 9.792024612426758, "learning_rate": 1.6130645356400054e-06, "loss": 8.3674, "step": 4219 }, { "epoch": 0.8245408362641657, "grad_norm": 8.387077331542969, "learning_rate": 1.609619553871037e-06, "loss": 8.6538, "step": 4220 }, { "epoch": 0.824736225087925, "grad_norm": 10.731857299804688, "learning_rate": 1.6061779327099303e-06, "loss": 9.5248, "step": 4221 }, { "epoch": 0.8249316139116842, "grad_norm": 7.778739929199219, "learning_rate": 1.6027396735351575e-06, "loss": 8.5324, "step": 4222 }, { "epoch": 0.8251270027354435, "grad_norm": 9.51624870300293, "learning_rate": 1.5993047777238525e-06, "loss": 8.9079, "step": 4223 }, { "epoch": 0.8253223915592028, "grad_norm": 8.001130104064941, "learning_rate": 1.5958732466517923e-06, "loss": 9.3273, "step": 4224 }, { "epoch": 0.8255177803829621, "grad_norm": 8.451061248779297, "learning_rate": 1.5924450816934177e-06, "loss": 9.2385, "step": 4225 }, { "epoch": 0.8257131692067213, "grad_norm": 6.511702537536621, "learning_rate": 1.5890202842218094e-06, "loss": 8.0991, "step": 4226 }, { "epoch": 0.8259085580304807, "grad_norm": 10.386144638061523, "learning_rate": 1.585598855608711e-06, "loss": 8.8742, "step": 4227 }, { "epoch": 0.8261039468542399, "grad_norm": 7.724105358123779, "learning_rate": 1.5821807972245073e-06, "loss": 9.5477, "step": 4228 }, { "epoch": 0.8262993356779992, "grad_norm": 7.820478439331055, "learning_rate": 1.5787661104382357e-06, "loss": 9.0195, "step": 4229 }, { "epoch": 0.8264947245017585, "grad_norm": 7.830831527709961, "learning_rate": 1.5753547966175875e-06, "loss": 8.3695, "step": 4230 }, { "epoch": 0.8266901133255178, "grad_norm": 8.081059455871582, "learning_rate": 1.5719468571288966e-06, "loss": 8.575, "step": 4231 }, { "epoch": 0.826885502149277, "grad_norm": 8.540038108825684, "learning_rate": 1.5685422933371518e-06, "loss": 8.8372, "step": 4232 }, { "epoch": 0.8270808909730364, "grad_norm": 7.683509826660156, "learning_rate": 1.5651411066059853e-06, "loss": 8.6626, "step": 4233 }, { "epoch": 0.8272762797967956, "grad_norm": 7.379648208618164, "learning_rate": 1.5617432982976776e-06, "loss": 8.4186, "step": 4234 }, { "epoch": 0.8274716686205549, "grad_norm": 8.802104949951172, "learning_rate": 1.5583488697731542e-06, "loss": 9.2311, "step": 4235 }, { "epoch": 0.8276670574443142, "grad_norm": 9.275979995727539, "learning_rate": 1.5549578223919938e-06, "loss": 9.0974, "step": 4236 }, { "epoch": 0.8278624462680735, "grad_norm": 12.322457313537598, "learning_rate": 1.5515701575124108e-06, "loss": 9.4169, "step": 4237 }, { "epoch": 0.8280578350918327, "grad_norm": 10.231083869934082, "learning_rate": 1.548185876491276e-06, "loss": 9.0337, "step": 4238 }, { "epoch": 0.828253223915592, "grad_norm": 8.195764541625977, "learning_rate": 1.5448049806840937e-06, "loss": 8.2788, "step": 4239 }, { "epoch": 0.8284486127393513, "grad_norm": 8.08511734008789, "learning_rate": 1.5414274714450228e-06, "loss": 8.8088, "step": 4240 }, { "epoch": 0.8286440015631106, "grad_norm": 58.29790115356445, "learning_rate": 1.5380533501268547e-06, "loss": 10.0793, "step": 4241 }, { "epoch": 0.8288393903868698, "grad_norm": 9.225850105285645, "learning_rate": 1.534682618081036e-06, "loss": 9.4692, "step": 4242 }, { "epoch": 0.8290347792106292, "grad_norm": 8.517762184143066, "learning_rate": 1.5313152766576434e-06, "loss": 9.4023, "step": 4243 }, { "epoch": 0.8292301680343884, "grad_norm": 25.837026596069336, "learning_rate": 1.527951327205407e-06, "loss": 9.359, "step": 4244 }, { "epoch": 0.8294255568581477, "grad_norm": 9.489827156066895, "learning_rate": 1.5245907710716912e-06, "loss": 8.9078, "step": 4245 }, { "epoch": 0.829620945681907, "grad_norm": 11.948223114013672, "learning_rate": 1.5212336096025005e-06, "loss": 9.2429, "step": 4246 }, { "epoch": 0.8298163345056663, "grad_norm": 8.366604804992676, "learning_rate": 1.5178798441424858e-06, "loss": 9.345, "step": 4247 }, { "epoch": 0.8300117233294255, "grad_norm": 8.934426307678223, "learning_rate": 1.5145294760349317e-06, "loss": 9.0068, "step": 4248 }, { "epoch": 0.8302071121531849, "grad_norm": 8.252093315124512, "learning_rate": 1.5111825066217668e-06, "loss": 9.1736, "step": 4249 }, { "epoch": 0.8304025009769441, "grad_norm": 8.491825103759766, "learning_rate": 1.5078389372435543e-06, "loss": 9.1984, "step": 4250 }, { "epoch": 0.8305978898007034, "grad_norm": 11.35191822052002, "learning_rate": 1.5044987692394997e-06, "loss": 9.3074, "step": 4251 }, { "epoch": 0.8307932786244627, "grad_norm": 8.606689453125, "learning_rate": 1.5011620039474407e-06, "loss": 8.9876, "step": 4252 }, { "epoch": 0.830988667448222, "grad_norm": 9.13046932220459, "learning_rate": 1.4978286427038602e-06, "loss": 9.5026, "step": 4253 }, { "epoch": 0.8311840562719812, "grad_norm": 10.701326370239258, "learning_rate": 1.4944986868438682e-06, "loss": 9.3942, "step": 4254 }, { "epoch": 0.8313794450957406, "grad_norm": 8.744029998779297, "learning_rate": 1.491172137701219e-06, "loss": 8.8568, "step": 4255 }, { "epoch": 0.8315748339194998, "grad_norm": 7.451327800750732, "learning_rate": 1.4878489966082942e-06, "loss": 8.6679, "step": 4256 }, { "epoch": 0.8317702227432591, "grad_norm": 8.913905143737793, "learning_rate": 1.4845292648961219e-06, "loss": 9.3168, "step": 4257 }, { "epoch": 0.8319656115670183, "grad_norm": 9.110503196716309, "learning_rate": 1.48121294389435e-06, "loss": 9.5957, "step": 4258 }, { "epoch": 0.8321610003907777, "grad_norm": 24.264135360717773, "learning_rate": 1.4779000349312754e-06, "loss": 9.6662, "step": 4259 }, { "epoch": 0.8323563892145369, "grad_norm": 7.600764751434326, "learning_rate": 1.4745905393338167e-06, "loss": 9.4055, "step": 4260 }, { "epoch": 0.8325517780382962, "grad_norm": 5.802551746368408, "learning_rate": 1.4712844584275275e-06, "loss": 7.2947, "step": 4261 }, { "epoch": 0.8327471668620555, "grad_norm": 13.387187957763672, "learning_rate": 1.4679817935366014e-06, "loss": 9.8224, "step": 4262 }, { "epoch": 0.8329425556858148, "grad_norm": 8.431346893310547, "learning_rate": 1.4646825459838532e-06, "loss": 9.5619, "step": 4263 }, { "epoch": 0.833137944509574, "grad_norm": 9.317984580993652, "learning_rate": 1.4613867170907391e-06, "loss": 9.7581, "step": 4264 }, { "epoch": 0.8333333333333334, "grad_norm": 7.618305683135986, "learning_rate": 1.4580943081773346e-06, "loss": 8.5974, "step": 4265 }, { "epoch": 0.8335287221570926, "grad_norm": 9.696817398071289, "learning_rate": 1.454805320562358e-06, "loss": 8.7606, "step": 4266 }, { "epoch": 0.8337241109808519, "grad_norm": 8.106192588806152, "learning_rate": 1.4515197555631467e-06, "loss": 9.3544, "step": 4267 }, { "epoch": 0.8339194998046112, "grad_norm": 7.084646224975586, "learning_rate": 1.4482376144956744e-06, "loss": 8.9587, "step": 4268 }, { "epoch": 0.8341148886283705, "grad_norm": 7.673435211181641, "learning_rate": 1.4449588986745367e-06, "loss": 9.1257, "step": 4269 }, { "epoch": 0.8343102774521297, "grad_norm": 9.534943580627441, "learning_rate": 1.441683609412965e-06, "loss": 9.3258, "step": 4270 }, { "epoch": 0.8345056662758891, "grad_norm": 7.976189613342285, "learning_rate": 1.438411748022811e-06, "loss": 9.1592, "step": 4271 }, { "epoch": 0.8347010550996483, "grad_norm": 9.561246871948242, "learning_rate": 1.435143315814561e-06, "loss": 9.6537, "step": 4272 }, { "epoch": 0.8348964439234076, "grad_norm": 7.933861255645752, "learning_rate": 1.4318783140973214e-06, "loss": 9.3873, "step": 4273 }, { "epoch": 0.8350918327471668, "grad_norm": 7.248596668243408, "learning_rate": 1.4286167441788234e-06, "loss": 8.6292, "step": 4274 }, { "epoch": 0.8352872215709262, "grad_norm": 8.570123672485352, "learning_rate": 1.4253586073654325e-06, "loss": 9.133, "step": 4275 }, { "epoch": 0.8354826103946854, "grad_norm": 16.415498733520508, "learning_rate": 1.4221039049621277e-06, "loss": 9.2063, "step": 4276 }, { "epoch": 0.8356779992184447, "grad_norm": 8.9085111618042, "learning_rate": 1.4188526382725255e-06, "loss": 8.7019, "step": 4277 }, { "epoch": 0.835873388042204, "grad_norm": 7.478800296783447, "learning_rate": 1.4156048085988528e-06, "loss": 8.8263, "step": 4278 }, { "epoch": 0.8360687768659633, "grad_norm": 9.699009895324707, "learning_rate": 1.4123604172419714e-06, "loss": 9.9962, "step": 4279 }, { "epoch": 0.8362641656897225, "grad_norm": 8.485151290893555, "learning_rate": 1.4091194655013552e-06, "loss": 9.8214, "step": 4280 }, { "epoch": 0.8364595545134819, "grad_norm": 9.319722175598145, "learning_rate": 1.4058819546751123e-06, "loss": 9.7082, "step": 4281 }, { "epoch": 0.8366549433372411, "grad_norm": 8.54001235961914, "learning_rate": 1.4026478860599602e-06, "loss": 9.5097, "step": 4282 }, { "epoch": 0.8368503321610004, "grad_norm": 148.36419677734375, "learning_rate": 1.3994172609512492e-06, "loss": 9.5145, "step": 4283 }, { "epoch": 0.8370457209847597, "grad_norm": 6.561718940734863, "learning_rate": 1.3961900806429396e-06, "loss": 8.4146, "step": 4284 }, { "epoch": 0.837241109808519, "grad_norm": 9.170931816101074, "learning_rate": 1.3929663464276222e-06, "loss": 9.1873, "step": 4285 }, { "epoch": 0.8374364986322782, "grad_norm": 82.37069702148438, "learning_rate": 1.3897460595965006e-06, "loss": 9.6036, "step": 4286 }, { "epoch": 0.8376318874560376, "grad_norm": 5.722506046295166, "learning_rate": 1.3865292214393977e-06, "loss": 8.3765, "step": 4287 }, { "epoch": 0.8378272762797968, "grad_norm": 8.180362701416016, "learning_rate": 1.3833158332447615e-06, "loss": 8.4419, "step": 4288 }, { "epoch": 0.838022665103556, "grad_norm": 6.897322654724121, "learning_rate": 1.3801058962996482e-06, "loss": 8.6013, "step": 4289 }, { "epoch": 0.8382180539273154, "grad_norm": 9.958105087280273, "learning_rate": 1.3768994118897427e-06, "loss": 10.2248, "step": 4290 }, { "epoch": 0.8384134427510747, "grad_norm": 7.8778767585754395, "learning_rate": 1.3736963812993365e-06, "loss": 9.15, "step": 4291 }, { "epoch": 0.8386088315748339, "grad_norm": 9.966565132141113, "learning_rate": 1.3704968058113487e-06, "loss": 8.7372, "step": 4292 }, { "epoch": 0.8388042203985931, "grad_norm": 7.740724563598633, "learning_rate": 1.3673006867073013e-06, "loss": 8.7413, "step": 4293 }, { "epoch": 0.8389996092223525, "grad_norm": 15.081512451171875, "learning_rate": 1.3641080252673467e-06, "loss": 9.7921, "step": 4294 }, { "epoch": 0.8391949980461118, "grad_norm": 9.276426315307617, "learning_rate": 1.3609188227702387e-06, "loss": 9.8176, "step": 4295 }, { "epoch": 0.839390386869871, "grad_norm": 8.647336959838867, "learning_rate": 1.3577330804933563e-06, "loss": 8.5131, "step": 4296 }, { "epoch": 0.8395857756936304, "grad_norm": 8.417224884033203, "learning_rate": 1.354550799712685e-06, "loss": 7.2477, "step": 4297 }, { "epoch": 0.8397811645173896, "grad_norm": 10.368106842041016, "learning_rate": 1.3513719817028302e-06, "loss": 8.8178, "step": 4298 }, { "epoch": 0.8399765533411488, "grad_norm": 7.813013553619385, "learning_rate": 1.3481966277370061e-06, "loss": 8.5611, "step": 4299 }, { "epoch": 0.8401719421649082, "grad_norm": 9.421406745910645, "learning_rate": 1.3450247390870374e-06, "loss": 8.6661, "step": 4300 }, { "epoch": 0.8403673309886674, "grad_norm": 10.088541984558105, "learning_rate": 1.3418563170233689e-06, "loss": 10.0167, "step": 4301 }, { "epoch": 0.8405627198124267, "grad_norm": 6.547402381896973, "learning_rate": 1.3386913628150477e-06, "loss": 8.4222, "step": 4302 }, { "epoch": 0.840758108636186, "grad_norm": 7.839051723480225, "learning_rate": 1.3355298777297399e-06, "loss": 8.9197, "step": 4303 }, { "epoch": 0.8409534974599453, "grad_norm": 9.462071418762207, "learning_rate": 1.3323718630337145e-06, "loss": 9.1845, "step": 4304 }, { "epoch": 0.8411488862837045, "grad_norm": 9.410916328430176, "learning_rate": 1.329217319991859e-06, "loss": 9.1187, "step": 4305 }, { "epoch": 0.8413442751074639, "grad_norm": 16.695587158203125, "learning_rate": 1.3260662498676613e-06, "loss": 8.5229, "step": 4306 }, { "epoch": 0.8415396639312231, "grad_norm": 8.594900131225586, "learning_rate": 1.3229186539232275e-06, "loss": 9.1592, "step": 4307 }, { "epoch": 0.8417350527549824, "grad_norm": 10.912017822265625, "learning_rate": 1.3197745334192624e-06, "loss": 9.6075, "step": 4308 }, { "epoch": 0.8419304415787418, "grad_norm": 7.126783847808838, "learning_rate": 1.3166338896150887e-06, "loss": 9.0325, "step": 4309 }, { "epoch": 0.842125830402501, "grad_norm": 7.9845757484436035, "learning_rate": 1.313496723768628e-06, "loss": 8.4876, "step": 4310 }, { "epoch": 0.8423212192262602, "grad_norm": 9.117425918579102, "learning_rate": 1.3103630371364151e-06, "loss": 9.6359, "step": 4311 }, { "epoch": 0.8425166080500195, "grad_norm": 9.479337692260742, "learning_rate": 1.3072328309735892e-06, "loss": 9.6399, "step": 4312 }, { "epoch": 0.8427119968737788, "grad_norm": 7.3381476402282715, "learning_rate": 1.30410610653389e-06, "loss": 8.1394, "step": 4313 }, { "epoch": 0.8429073856975381, "grad_norm": 8.921339988708496, "learning_rate": 1.3009828650696743e-06, "loss": 9.6048, "step": 4314 }, { "epoch": 0.8431027745212973, "grad_norm": 7.185836315155029, "learning_rate": 1.2978631078318927e-06, "loss": 9.2513, "step": 4315 }, { "epoch": 0.8432981633450567, "grad_norm": 8.77956485748291, "learning_rate": 1.2947468360701076e-06, "loss": 9.8427, "step": 4316 }, { "epoch": 0.8434935521688159, "grad_norm": 7.311408996582031, "learning_rate": 1.29163405103248e-06, "loss": 9.4744, "step": 4317 }, { "epoch": 0.8436889409925752, "grad_norm": 7.208104133605957, "learning_rate": 1.2885247539657808e-06, "loss": 8.9895, "step": 4318 }, { "epoch": 0.8438843298163345, "grad_norm": 9.21522045135498, "learning_rate": 1.2854189461153753e-06, "loss": 9.1812, "step": 4319 }, { "epoch": 0.8440797186400938, "grad_norm": 8.503985404968262, "learning_rate": 1.2823166287252397e-06, "loss": 9.2833, "step": 4320 }, { "epoch": 0.844275107463853, "grad_norm": 8.361055374145508, "learning_rate": 1.2792178030379454e-06, "loss": 9.3749, "step": 4321 }, { "epoch": 0.8444704962876124, "grad_norm": 6.587082862854004, "learning_rate": 1.2761224702946727e-06, "loss": 9.228, "step": 4322 }, { "epoch": 0.8446658851113716, "grad_norm": 7.459856986999512, "learning_rate": 1.2730306317351926e-06, "loss": 9.1024, "step": 4323 }, { "epoch": 0.8448612739351309, "grad_norm": 9.749171257019043, "learning_rate": 1.2699422885978886e-06, "loss": 9.5284, "step": 4324 }, { "epoch": 0.8450566627588902, "grad_norm": 9.829996109008789, "learning_rate": 1.2668574421197343e-06, "loss": 8.3387, "step": 4325 }, { "epoch": 0.8452520515826495, "grad_norm": 7.526081085205078, "learning_rate": 1.2637760935363053e-06, "loss": 8.2625, "step": 4326 }, { "epoch": 0.8454474404064087, "grad_norm": 9.518244743347168, "learning_rate": 1.2606982440817804e-06, "loss": 9.0902, "step": 4327 }, { "epoch": 0.845642829230168, "grad_norm": 7.615694522857666, "learning_rate": 1.2576238949889308e-06, "loss": 9.7948, "step": 4328 }, { "epoch": 0.8458382180539273, "grad_norm": 7.726398468017578, "learning_rate": 1.2545530474891332e-06, "loss": 8.0975, "step": 4329 }, { "epoch": 0.8460336068776866, "grad_norm": 6.965662956237793, "learning_rate": 1.251485702812353e-06, "loss": 8.5139, "step": 4330 }, { "epoch": 0.8462289957014458, "grad_norm": 7.9974799156188965, "learning_rate": 1.2484218621871603e-06, "loss": 9.1442, "step": 4331 }, { "epoch": 0.8464243845252052, "grad_norm": 10.21238899230957, "learning_rate": 1.2453615268407149e-06, "loss": 9.5565, "step": 4332 }, { "epoch": 0.8466197733489644, "grad_norm": 9.505382537841797, "learning_rate": 1.2423046979987797e-06, "loss": 9.3563, "step": 4333 }, { "epoch": 0.8468151621727237, "grad_norm": 6.460310935974121, "learning_rate": 1.239251376885706e-06, "loss": 8.7227, "step": 4334 }, { "epoch": 0.847010550996483, "grad_norm": 7.768892288208008, "learning_rate": 1.2362015647244485e-06, "loss": 8.3439, "step": 4335 }, { "epoch": 0.8472059398202423, "grad_norm": 8.403698921203613, "learning_rate": 1.2331552627365462e-06, "loss": 9.6446, "step": 4336 }, { "epoch": 0.8474013286440015, "grad_norm": 8.840633392333984, "learning_rate": 1.230112472142142e-06, "loss": 9.7222, "step": 4337 }, { "epoch": 0.8475967174677609, "grad_norm": 9.855066299438477, "learning_rate": 1.2270731941599667e-06, "loss": 9.3417, "step": 4338 }, { "epoch": 0.8477921062915201, "grad_norm": 8.718852043151855, "learning_rate": 1.2240374300073422e-06, "loss": 8.9743, "step": 4339 }, { "epoch": 0.8479874951152794, "grad_norm": 17.223560333251953, "learning_rate": 1.221005180900192e-06, "loss": 9.4087, "step": 4340 }, { "epoch": 0.8481828839390387, "grad_norm": 9.039057731628418, "learning_rate": 1.2179764480530197e-06, "loss": 8.351, "step": 4341 }, { "epoch": 0.848378272762798, "grad_norm": 8.74781608581543, "learning_rate": 1.2149512326789337e-06, "loss": 9.2977, "step": 4342 }, { "epoch": 0.8485736615865572, "grad_norm": 9.709748268127441, "learning_rate": 1.21192953598962e-06, "loss": 9.4355, "step": 4343 }, { "epoch": 0.8487690504103166, "grad_norm": 9.52278995513916, "learning_rate": 1.2089113591953684e-06, "loss": 9.4792, "step": 4344 }, { "epoch": 0.8489644392340758, "grad_norm": 6.382225036621094, "learning_rate": 1.2058967035050461e-06, "loss": 8.2168, "step": 4345 }, { "epoch": 0.8491598280578351, "grad_norm": 9.21106243133545, "learning_rate": 1.2028855701261223e-06, "loss": 10.1486, "step": 4346 }, { "epoch": 0.8493552168815943, "grad_norm": 9.709728240966797, "learning_rate": 1.1998779602646438e-06, "loss": 8.5562, "step": 4347 }, { "epoch": 0.8495506057053537, "grad_norm": 14.302094459533691, "learning_rate": 1.1968738751252562e-06, "loss": 9.4097, "step": 4348 }, { "epoch": 0.8497459945291129, "grad_norm": 7.293747425079346, "learning_rate": 1.1938733159111849e-06, "loss": 8.9235, "step": 4349 }, { "epoch": 0.8499413833528722, "grad_norm": 7.6194376945495605, "learning_rate": 1.1908762838242515e-06, "loss": 9.5107, "step": 4350 }, { "epoch": 0.8501367721766315, "grad_norm": 8.724333763122559, "learning_rate": 1.1878827800648585e-06, "loss": 9.5367, "step": 4351 }, { "epoch": 0.8503321610003908, "grad_norm": 9.795305252075195, "learning_rate": 1.1848928058319942e-06, "loss": 8.3142, "step": 4352 }, { "epoch": 0.85052754982415, "grad_norm": 7.095324516296387, "learning_rate": 1.1819063623232408e-06, "loss": 8.7944, "step": 4353 }, { "epoch": 0.8507229386479094, "grad_norm": 9.359949111938477, "learning_rate": 1.1789234507347568e-06, "loss": 8.3412, "step": 4354 }, { "epoch": 0.8509183274716686, "grad_norm": 10.278728485107422, "learning_rate": 1.1759440722612957e-06, "loss": 9.0669, "step": 4355 }, { "epoch": 0.8511137162954279, "grad_norm": 10.226433753967285, "learning_rate": 1.1729682280961873e-06, "loss": 9.0757, "step": 4356 }, { "epoch": 0.8513091051191872, "grad_norm": 10.047861099243164, "learning_rate": 1.1699959194313525e-06, "loss": 9.5237, "step": 4357 }, { "epoch": 0.8515044939429465, "grad_norm": 7.863394260406494, "learning_rate": 1.1670271474572903e-06, "loss": 8.8631, "step": 4358 }, { "epoch": 0.8516998827667057, "grad_norm": 7.909746170043945, "learning_rate": 1.1640619133630894e-06, "loss": 8.9935, "step": 4359 }, { "epoch": 0.8518952715904651, "grad_norm": 9.45473861694336, "learning_rate": 1.1611002183364152e-06, "loss": 9.7448, "step": 4360 }, { "epoch": 0.8520906604142243, "grad_norm": 7.1263837814331055, "learning_rate": 1.1581420635635198e-06, "loss": 9.5666, "step": 4361 }, { "epoch": 0.8522860492379836, "grad_norm": 8.597981452941895, "learning_rate": 1.1551874502292359e-06, "loss": 8.9688, "step": 4362 }, { "epoch": 0.8524814380617428, "grad_norm": 8.461832046508789, "learning_rate": 1.1522363795169788e-06, "loss": 9.1263, "step": 4363 }, { "epoch": 0.8526768268855022, "grad_norm": 8.200736999511719, "learning_rate": 1.149288852608743e-06, "loss": 9.6539, "step": 4364 }, { "epoch": 0.8528722157092614, "grad_norm": 9.481253623962402, "learning_rate": 1.1463448706851021e-06, "loss": 8.6186, "step": 4365 }, { "epoch": 0.8530676045330207, "grad_norm": 11.275646209716797, "learning_rate": 1.1434044349252172e-06, "loss": 9.864, "step": 4366 }, { "epoch": 0.85326299335678, "grad_norm": 7.154355049133301, "learning_rate": 1.1404675465068193e-06, "loss": 9.6745, "step": 4367 }, { "epoch": 0.8534583821805393, "grad_norm": 9.041812896728516, "learning_rate": 1.1375342066062277e-06, "loss": 9.1627, "step": 4368 }, { "epoch": 0.8536537710042985, "grad_norm": 7.8985090255737305, "learning_rate": 1.1346044163983327e-06, "loss": 8.8172, "step": 4369 }, { "epoch": 0.8538491598280579, "grad_norm": 10.522104263305664, "learning_rate": 1.1316781770566087e-06, "loss": 9.471, "step": 4370 }, { "epoch": 0.8540445486518171, "grad_norm": 6.28630256652832, "learning_rate": 1.128755489753105e-06, "loss": 8.1832, "step": 4371 }, { "epoch": 0.8542399374755764, "grad_norm": 7.923295497894287, "learning_rate": 1.1258363556584473e-06, "loss": 9.5306, "step": 4372 }, { "epoch": 0.8544353262993357, "grad_norm": 7.415898323059082, "learning_rate": 1.1229207759418381e-06, "loss": 8.9138, "step": 4373 }, { "epoch": 0.854630715123095, "grad_norm": 7.470461368560791, "learning_rate": 1.1200087517710623e-06, "loss": 9.5922, "step": 4374 }, { "epoch": 0.8548261039468542, "grad_norm": 8.525838851928711, "learning_rate": 1.11710028431247e-06, "loss": 8.5091, "step": 4375 }, { "epoch": 0.8550214927706136, "grad_norm": 8.140287399291992, "learning_rate": 1.1141953747309986e-06, "loss": 8.9586, "step": 4376 }, { "epoch": 0.8552168815943728, "grad_norm": 18.053083419799805, "learning_rate": 1.1112940241901504e-06, "loss": 9.1289, "step": 4377 }, { "epoch": 0.8554122704181321, "grad_norm": 8.162330627441406, "learning_rate": 1.1083962338520093e-06, "loss": 8.8091, "step": 4378 }, { "epoch": 0.8556076592418914, "grad_norm": 8.185535430908203, "learning_rate": 1.1055020048772268e-06, "loss": 9.3505, "step": 4379 }, { "epoch": 0.8558030480656507, "grad_norm": 9.759170532226562, "learning_rate": 1.102611338425036e-06, "loss": 9.4584, "step": 4380 }, { "epoch": 0.8559984368894099, "grad_norm": 10.33072566986084, "learning_rate": 1.0997242356532335e-06, "loss": 8.1816, "step": 4381 }, { "epoch": 0.8561938257131692, "grad_norm": 7.357562065124512, "learning_rate": 1.096840697718199e-06, "loss": 8.4596, "step": 4382 }, { "epoch": 0.8563892145369285, "grad_norm": 6.7860870361328125, "learning_rate": 1.0939607257748752e-06, "loss": 8.8797, "step": 4383 }, { "epoch": 0.8565846033606878, "grad_norm": 8.139361381530762, "learning_rate": 1.0910843209767797e-06, "loss": 9.3265, "step": 4384 }, { "epoch": 0.856779992184447, "grad_norm": 8.5925874710083, "learning_rate": 1.088211484476005e-06, "loss": 9.0923, "step": 4385 }, { "epoch": 0.8569753810082064, "grad_norm": 11.344609260559082, "learning_rate": 1.0853422174232087e-06, "loss": 9.6272, "step": 4386 }, { "epoch": 0.8571707698319656, "grad_norm": 7.951318740844727, "learning_rate": 1.0824765209676235e-06, "loss": 8.6183, "step": 4387 }, { "epoch": 0.8573661586557249, "grad_norm": 8.583785057067871, "learning_rate": 1.079614396257047e-06, "loss": 9.3787, "step": 4388 }, { "epoch": 0.8575615474794842, "grad_norm": 7.105406284332275, "learning_rate": 1.0767558444378535e-06, "loss": 8.2011, "step": 4389 }, { "epoch": 0.8577569363032435, "grad_norm": 6.523879051208496, "learning_rate": 1.0739008666549767e-06, "loss": 8.861, "step": 4390 }, { "epoch": 0.8579523251270027, "grad_norm": 7.175806522369385, "learning_rate": 1.0710494640519276e-06, "loss": 8.2861, "step": 4391 }, { "epoch": 0.8581477139507621, "grad_norm": 9.211763381958008, "learning_rate": 1.0682016377707772e-06, "loss": 9.3112, "step": 4392 }, { "epoch": 0.8583431027745213, "grad_norm": 7.219457149505615, "learning_rate": 1.065357388952174e-06, "loss": 9.6006, "step": 4393 }, { "epoch": 0.8585384915982806, "grad_norm": 8.839190483093262, "learning_rate": 1.0625167187353236e-06, "loss": 9.7189, "step": 4394 }, { "epoch": 0.8587338804220399, "grad_norm": 8.153175354003906, "learning_rate": 1.0596796282580058e-06, "loss": 8.6085, "step": 4395 }, { "epoch": 0.8589292692457992, "grad_norm": 7.035209655761719, "learning_rate": 1.0568461186565604e-06, "loss": 8.595, "step": 4396 }, { "epoch": 0.8591246580695584, "grad_norm": 11.23974609375, "learning_rate": 1.0540161910658964e-06, "loss": 8.1272, "step": 4397 }, { "epoch": 0.8593200468933178, "grad_norm": 7.754348278045654, "learning_rate": 1.0511898466194903e-06, "loss": 9.4844, "step": 4398 }, { "epoch": 0.859515435717077, "grad_norm": 11.255935668945312, "learning_rate": 1.0483670864493777e-06, "loss": 8.4559, "step": 4399 }, { "epoch": 0.8597108245408362, "grad_norm": 7.187273025512695, "learning_rate": 1.0455479116861645e-06, "loss": 8.0356, "step": 4400 }, { "epoch": 0.8599062133645955, "grad_norm": 10.443448066711426, "learning_rate": 1.0427323234590137e-06, "loss": 9.0796, "step": 4401 }, { "epoch": 0.8601016021883549, "grad_norm": 9.212592124938965, "learning_rate": 1.0399203228956611e-06, "loss": 10.1266, "step": 4402 }, { "epoch": 0.8602969910121141, "grad_norm": 10.160746574401855, "learning_rate": 1.0371119111223949e-06, "loss": 8.8189, "step": 4403 }, { "epoch": 0.8604923798358733, "grad_norm": 8.360414505004883, "learning_rate": 1.034307089264076e-06, "loss": 9.6335, "step": 4404 }, { "epoch": 0.8606877686596327, "grad_norm": 8.571639060974121, "learning_rate": 1.0315058584441185e-06, "loss": 9.2224, "step": 4405 }, { "epoch": 0.860883157483392, "grad_norm": 9.940386772155762, "learning_rate": 1.0287082197845055e-06, "loss": 9.1021, "step": 4406 }, { "epoch": 0.8610785463071512, "grad_norm": 9.578882217407227, "learning_rate": 1.0259141744057755e-06, "loss": 9.4857, "step": 4407 }, { "epoch": 0.8612739351309106, "grad_norm": 7.491061210632324, "learning_rate": 1.0231237234270331e-06, "loss": 9.269, "step": 4408 }, { "epoch": 0.8614693239546698, "grad_norm": 8.327766418457031, "learning_rate": 1.0203368679659399e-06, "loss": 8.8025, "step": 4409 }, { "epoch": 0.861664712778429, "grad_norm": 8.449031829833984, "learning_rate": 1.017553609138715e-06, "loss": 8.0793, "step": 4410 }, { "epoch": 0.8618601016021884, "grad_norm": 7.012770652770996, "learning_rate": 1.0147739480601448e-06, "loss": 8.1449, "step": 4411 }, { "epoch": 0.8620554904259476, "grad_norm": 7.9110918045043945, "learning_rate": 1.0119978858435653e-06, "loss": 9.2474, "step": 4412 }, { "epoch": 0.8622508792497069, "grad_norm": 8.513326644897461, "learning_rate": 1.0092254236008793e-06, "loss": 8.8102, "step": 4413 }, { "epoch": 0.8624462680734662, "grad_norm": 64.09504699707031, "learning_rate": 1.0064565624425404e-06, "loss": 8.6384, "step": 4414 }, { "epoch": 0.8626416568972255, "grad_norm": 8.766754150390625, "learning_rate": 1.0036913034775675e-06, "loss": 8.5842, "step": 4415 }, { "epoch": 0.8628370457209847, "grad_norm": 9.33322525024414, "learning_rate": 1.0009296478135289e-06, "loss": 8.9067, "step": 4416 }, { "epoch": 0.863032434544744, "grad_norm": 9.218510627746582, "learning_rate": 9.981715965565565e-07, "loss": 9.0773, "step": 4417 }, { "epoch": 0.8632278233685033, "grad_norm": 8.177020072937012, "learning_rate": 9.95417150811333e-07, "loss": 7.9319, "step": 4418 }, { "epoch": 0.8634232121922626, "grad_norm": 8.058493614196777, "learning_rate": 9.926663116811019e-07, "loss": 8.6709, "step": 4419 }, { "epoch": 0.8636186010160218, "grad_norm": 8.650046348571777, "learning_rate": 9.89919080267656e-07, "loss": 9.4331, "step": 4420 }, { "epoch": 0.8638139898397812, "grad_norm": 11.028565406799316, "learning_rate": 9.87175457671351e-07, "loss": 9.8931, "step": 4421 }, { "epoch": 0.8640093786635404, "grad_norm": 9.88052749633789, "learning_rate": 9.84435444991092e-07, "loss": 9.7764, "step": 4422 }, { "epoch": 0.8642047674872997, "grad_norm": 8.127789497375488, "learning_rate": 9.816990433243356e-07, "loss": 9.4421, "step": 4423 }, { "epoch": 0.864400156311059, "grad_norm": 7.8543219566345215, "learning_rate": 9.789662537671007e-07, "loss": 8.0521, "step": 4424 }, { "epoch": 0.8645955451348183, "grad_norm": 8.689035415649414, "learning_rate": 9.762370774139484e-07, "loss": 9.1581, "step": 4425 }, { "epoch": 0.8647909339585775, "grad_norm": 9.804941177368164, "learning_rate": 9.735115153580055e-07, "loss": 10.1786, "step": 4426 }, { "epoch": 0.8649863227823369, "grad_norm": 7.875881195068359, "learning_rate": 9.707895686909385e-07, "loss": 8.8083, "step": 4427 }, { "epoch": 0.8651817116060961, "grad_norm": 7.706095218658447, "learning_rate": 9.680712385029755e-07, "loss": 9.2423, "step": 4428 }, { "epoch": 0.8653771004298554, "grad_norm": 12.053943634033203, "learning_rate": 9.65356525882888e-07, "loss": 9.502, "step": 4429 }, { "epoch": 0.8655724892536147, "grad_norm": 10.30573844909668, "learning_rate": 9.626454319180078e-07, "loss": 9.3767, "step": 4430 }, { "epoch": 0.865767878077374, "grad_norm": 9.071868896484375, "learning_rate": 9.599379576942069e-07, "loss": 8.9809, "step": 4431 }, { "epoch": 0.8659632669011332, "grad_norm": 7.747514724731445, "learning_rate": 9.572341042959177e-07, "loss": 8.4974, "step": 4432 }, { "epoch": 0.8661586557248926, "grad_norm": 10.477039337158203, "learning_rate": 9.545338728061125e-07, "loss": 9.1711, "step": 4433 }, { "epoch": 0.8663540445486518, "grad_norm": 12.359572410583496, "learning_rate": 9.518372643063234e-07, "loss": 9.9854, "step": 4434 }, { "epoch": 0.8665494333724111, "grad_norm": 7.519041061401367, "learning_rate": 9.491442798766226e-07, "loss": 8.6956, "step": 4435 }, { "epoch": 0.8667448221961703, "grad_norm": 12.082423210144043, "learning_rate": 9.464549205956319e-07, "loss": 8.4, "step": 4436 }, { "epoch": 0.8669402110199297, "grad_norm": 10.535531997680664, "learning_rate": 9.437691875405275e-07, "loss": 9.6666, "step": 4437 }, { "epoch": 0.8671355998436889, "grad_norm": 18.231430053710938, "learning_rate": 9.410870817870254e-07, "loss": 10.2696, "step": 4438 }, { "epoch": 0.8673309886674482, "grad_norm": 10.223472595214844, "learning_rate": 9.384086044093954e-07, "loss": 9.7457, "step": 4439 }, { "epoch": 0.8675263774912075, "grad_norm": 9.341796875, "learning_rate": 9.35733756480447e-07, "loss": 8.7086, "step": 4440 }, { "epoch": 0.8677217663149668, "grad_norm": 6.032013416290283, "learning_rate": 9.330625390715442e-07, "loss": 8.1348, "step": 4441 }, { "epoch": 0.867917155138726, "grad_norm": 8.410514831542969, "learning_rate": 9.30394953252588e-07, "loss": 8.7077, "step": 4442 }, { "epoch": 0.8681125439624854, "grad_norm": 7.853433609008789, "learning_rate": 9.277310000920337e-07, "loss": 8.4718, "step": 4443 }, { "epoch": 0.8683079327862446, "grad_norm": 8.205535888671875, "learning_rate": 9.250706806568744e-07, "loss": 9.3656, "step": 4444 }, { "epoch": 0.8685033216100039, "grad_norm": 8.704334259033203, "learning_rate": 9.224139960126521e-07, "loss": 9.2046, "step": 4445 }, { "epoch": 0.8686987104337632, "grad_norm": 8.048930168151855, "learning_rate": 9.197609472234493e-07, "loss": 9.0922, "step": 4446 }, { "epoch": 0.8688940992575225, "grad_norm": 7.903109550476074, "learning_rate": 9.171115353518978e-07, "loss": 8.3976, "step": 4447 }, { "epoch": 0.8690894880812817, "grad_norm": 101.25208282470703, "learning_rate": 9.144657614591679e-07, "loss": 9.5005, "step": 4448 }, { "epoch": 0.8692848769050411, "grad_norm": 7.834874629974365, "learning_rate": 9.118236266049707e-07, "loss": 8.7355, "step": 4449 }, { "epoch": 0.8694802657288003, "grad_norm": 6.803002834320068, "learning_rate": 9.091851318475687e-07, "loss": 7.4237, "step": 4450 }, { "epoch": 0.8696756545525596, "grad_norm": 9.04742431640625, "learning_rate": 9.065502782437574e-07, "loss": 8.9199, "step": 4451 }, { "epoch": 0.8698710433763188, "grad_norm": 7.7430548667907715, "learning_rate": 9.039190668488795e-07, "loss": 9.5888, "step": 4452 }, { "epoch": 0.8700664322000782, "grad_norm": 9.58332633972168, "learning_rate": 9.012914987168143e-07, "loss": 9.9074, "step": 4453 }, { "epoch": 0.8702618210238374, "grad_norm": 28.31788444519043, "learning_rate": 8.986675748999885e-07, "loss": 8.6878, "step": 4454 }, { "epoch": 0.8704572098475967, "grad_norm": 7.615555286407471, "learning_rate": 8.960472964493594e-07, "loss": 9.6356, "step": 4455 }, { "epoch": 0.870652598671356, "grad_norm": 6.998612403869629, "learning_rate": 8.934306644144363e-07, "loss": 8.8061, "step": 4456 }, { "epoch": 0.8708479874951153, "grad_norm": 7.023011207580566, "learning_rate": 8.908176798432555e-07, "loss": 7.6729, "step": 4457 }, { "epoch": 0.8710433763188745, "grad_norm": 6.8855485916137695, "learning_rate": 8.88208343782404e-07, "loss": 8.3022, "step": 4458 }, { "epoch": 0.8712387651426339, "grad_norm": 9.884055137634277, "learning_rate": 8.856026572769971e-07, "loss": 9.3856, "step": 4459 }, { "epoch": 0.8714341539663931, "grad_norm": 9.977845191955566, "learning_rate": 8.830006213706965e-07, "loss": 10.5723, "step": 4460 }, { "epoch": 0.8716295427901524, "grad_norm": 6.782526969909668, "learning_rate": 8.804022371056975e-07, "loss": 8.438, "step": 4461 }, { "epoch": 0.8718249316139117, "grad_norm": 9.667460441589355, "learning_rate": 8.77807505522732e-07, "loss": 9.2985, "step": 4462 }, { "epoch": 0.872020320437671, "grad_norm": 9.0558500289917, "learning_rate": 8.752164276610731e-07, "loss": 9.3276, "step": 4463 }, { "epoch": 0.8722157092614302, "grad_norm": 8.599926948547363, "learning_rate": 8.72629004558525e-07, "loss": 8.5314, "step": 4464 }, { "epoch": 0.8724110980851896, "grad_norm": 8.152388572692871, "learning_rate": 8.700452372514345e-07, "loss": 8.797, "step": 4465 }, { "epoch": 0.8726064869089488, "grad_norm": 9.223265647888184, "learning_rate": 8.674651267746769e-07, "loss": 8.859, "step": 4466 }, { "epoch": 0.8728018757327081, "grad_norm": 9.665291786193848, "learning_rate": 8.648886741616691e-07, "loss": 9.4288, "step": 4467 }, { "epoch": 0.8729972645564674, "grad_norm": 7.774353504180908, "learning_rate": 8.623158804443598e-07, "loss": 8.5911, "step": 4468 }, { "epoch": 0.8731926533802267, "grad_norm": 10.168339729309082, "learning_rate": 8.597467466532327e-07, "loss": 8.9385, "step": 4469 }, { "epoch": 0.8733880422039859, "grad_norm": 7.361769676208496, "learning_rate": 8.571812738173047e-07, "loss": 8.7944, "step": 4470 }, { "epoch": 0.8735834310277452, "grad_norm": 8.525863647460938, "learning_rate": 8.546194629641302e-07, "loss": 8.6798, "step": 4471 }, { "epoch": 0.8737788198515045, "grad_norm": 6.9871087074279785, "learning_rate": 8.520613151197899e-07, "loss": 8.3553, "step": 4472 }, { "epoch": 0.8739742086752638, "grad_norm": 8.88776969909668, "learning_rate": 8.49506831308905e-07, "loss": 9.5342, "step": 4473 }, { "epoch": 0.874169597499023, "grad_norm": 6.036227703094482, "learning_rate": 8.469560125546239e-07, "loss": 8.7762, "step": 4474 }, { "epoch": 0.8743649863227824, "grad_norm": 9.236881256103516, "learning_rate": 8.444088598786282e-07, "loss": 9.0996, "step": 4475 }, { "epoch": 0.8745603751465416, "grad_norm": 8.032596588134766, "learning_rate": 8.418653743011329e-07, "loss": 9.2407, "step": 4476 }, { "epoch": 0.8747557639703009, "grad_norm": 8.645380973815918, "learning_rate": 8.39325556840881e-07, "loss": 8.2006, "step": 4477 }, { "epoch": 0.8749511527940602, "grad_norm": 6.382596969604492, "learning_rate": 8.367894085151518e-07, "loss": 7.9927, "step": 4478 }, { "epoch": 0.8751465416178195, "grad_norm": 6.3066253662109375, "learning_rate": 8.342569303397474e-07, "loss": 8.2247, "step": 4479 }, { "epoch": 0.8753419304415787, "grad_norm": 8.22442626953125, "learning_rate": 8.317281233290064e-07, "loss": 8.3415, "step": 4480 }, { "epoch": 0.8755373192653381, "grad_norm": 10.735391616821289, "learning_rate": 8.29202988495793e-07, "loss": 10.3575, "step": 4481 }, { "epoch": 0.8757327080890973, "grad_norm": 8.855950355529785, "learning_rate": 8.266815268515049e-07, "loss": 9.7522, "step": 4482 }, { "epoch": 0.8759280969128566, "grad_norm": 9.162856101989746, "learning_rate": 8.241637394060619e-07, "loss": 9.6998, "step": 4483 }, { "epoch": 0.8761234857366159, "grad_norm": 9.004673957824707, "learning_rate": 8.216496271679198e-07, "loss": 8.7978, "step": 4484 }, { "epoch": 0.8763188745603752, "grad_norm": 7.955142021179199, "learning_rate": 8.191391911440549e-07, "loss": 9.3476, "step": 4485 }, { "epoch": 0.8765142633841344, "grad_norm": 9.796892166137695, "learning_rate": 8.166324323399788e-07, "loss": 8.8969, "step": 4486 }, { "epoch": 0.8767096522078938, "grad_norm": 10.60122013092041, "learning_rate": 8.141293517597238e-07, "loss": 9.199, "step": 4487 }, { "epoch": 0.876905041031653, "grad_norm": 9.102415084838867, "learning_rate": 8.1162995040585e-07, "loss": 10.0809, "step": 4488 }, { "epoch": 0.8771004298554123, "grad_norm": 8.153091430664062, "learning_rate": 8.091342292794491e-07, "loss": 9.1124, "step": 4489 }, { "epoch": 0.8772958186791715, "grad_norm": 11.209802627563477, "learning_rate": 8.066421893801302e-07, "loss": 9.4476, "step": 4490 }, { "epoch": 0.8774912075029309, "grad_norm": 12.699365615844727, "learning_rate": 8.041538317060371e-07, "loss": 8.7293, "step": 4491 }, { "epoch": 0.8776865963266901, "grad_norm": 7.22158145904541, "learning_rate": 8.016691572538315e-07, "loss": 9.8362, "step": 4492 }, { "epoch": 0.8778819851504494, "grad_norm": 10.028066635131836, "learning_rate": 7.99188167018704e-07, "loss": 8.9034, "step": 4493 }, { "epoch": 0.8780773739742087, "grad_norm": 9.291180610656738, "learning_rate": 7.967108619943664e-07, "loss": 9.7576, "step": 4494 }, { "epoch": 0.878272762797968, "grad_norm": 9.38831615447998, "learning_rate": 7.942372431730594e-07, "loss": 9.0136, "step": 4495 }, { "epoch": 0.8784681516217272, "grad_norm": 8.535799026489258, "learning_rate": 7.917673115455415e-07, "loss": 9.3348, "step": 4496 }, { "epoch": 0.8786635404454866, "grad_norm": 10.475357055664062, "learning_rate": 7.893010681010982e-07, "loss": 9.7507, "step": 4497 }, { "epoch": 0.8788589292692458, "grad_norm": 7.551560401916504, "learning_rate": 7.868385138275358e-07, "loss": 9.5255, "step": 4498 }, { "epoch": 0.879054318093005, "grad_norm": 18.42361831665039, "learning_rate": 7.843796497111844e-07, "loss": 8.8792, "step": 4499 }, { "epoch": 0.8792497069167644, "grad_norm": 11.153907775878906, "learning_rate": 7.81924476736896e-07, "loss": 8.9052, "step": 4500 }, { "epoch": 0.8794450957405237, "grad_norm": 5.823765277862549, "learning_rate": 7.79472995888041e-07, "loss": 8.4478, "step": 4501 }, { "epoch": 0.8796404845642829, "grad_norm": 8.497440338134766, "learning_rate": 7.77025208146518e-07, "loss": 9.1186, "step": 4502 }, { "epoch": 0.8798358733880423, "grad_norm": 7.643743515014648, "learning_rate": 7.745811144927362e-07, "loss": 7.929, "step": 4503 }, { "epoch": 0.8800312622118015, "grad_norm": 14.62929916381836, "learning_rate": 7.72140715905636e-07, "loss": 9.1529, "step": 4504 }, { "epoch": 0.8802266510355607, "grad_norm": 11.714295387268066, "learning_rate": 7.697040133626698e-07, "loss": 10.5666, "step": 4505 }, { "epoch": 0.88042203985932, "grad_norm": 8.457733154296875, "learning_rate": 7.67271007839816e-07, "loss": 9.4585, "step": 4506 }, { "epoch": 0.8806174286830794, "grad_norm": 7.4943366050720215, "learning_rate": 7.648417003115637e-07, "loss": 9.0691, "step": 4507 }, { "epoch": 0.8808128175068386, "grad_norm": 10.112883567810059, "learning_rate": 7.624160917509316e-07, "loss": 8.9789, "step": 4508 }, { "epoch": 0.8810082063305978, "grad_norm": 10.249805450439453, "learning_rate": 7.599941831294477e-07, "loss": 9.7161, "step": 4509 }, { "epoch": 0.8812035951543572, "grad_norm": 7.2333083152771, "learning_rate": 7.57575975417163e-07, "loss": 9.1904, "step": 4510 }, { "epoch": 0.8813989839781164, "grad_norm": 10.90276050567627, "learning_rate": 7.551614695826414e-07, "loss": 9.4712, "step": 4511 }, { "epoch": 0.8815943728018757, "grad_norm": 9.032613754272461, "learning_rate": 7.527506665929729e-07, "loss": 8.7427, "step": 4512 }, { "epoch": 0.881789761625635, "grad_norm": 7.372929096221924, "learning_rate": 7.50343567413755e-07, "loss": 9.654, "step": 4513 }, { "epoch": 0.8819851504493943, "grad_norm": 8.577502250671387, "learning_rate": 7.479401730091074e-07, "loss": 9.232, "step": 4514 }, { "epoch": 0.8821805392731535, "grad_norm": 8.171577453613281, "learning_rate": 7.455404843416636e-07, "loss": 8.0958, "step": 4515 }, { "epoch": 0.8823759280969129, "grad_norm": 10.150381088256836, "learning_rate": 7.431445023725736e-07, "loss": 9.2528, "step": 4516 }, { "epoch": 0.8825713169206721, "grad_norm": 9.050222396850586, "learning_rate": 7.40752228061502e-07, "loss": 8.2561, "step": 4517 }, { "epoch": 0.8827667057444314, "grad_norm": 8.562286376953125, "learning_rate": 7.383636623666302e-07, "loss": 9.0159, "step": 4518 }, { "epoch": 0.8829620945681907, "grad_norm": 9.105795860290527, "learning_rate": 7.359788062446527e-07, "loss": 9.1769, "step": 4519 }, { "epoch": 0.88315748339195, "grad_norm": 8.349502563476562, "learning_rate": 7.335976606507766e-07, "loss": 9.4373, "step": 4520 }, { "epoch": 0.8833528722157092, "grad_norm": 9.987401008605957, "learning_rate": 7.312202265387269e-07, "loss": 8.829, "step": 4521 }, { "epoch": 0.8835482610394686, "grad_norm": 8.053004264831543, "learning_rate": 7.288465048607373e-07, "loss": 8.9076, "step": 4522 }, { "epoch": 0.8837436498632278, "grad_norm": 9.053923606872559, "learning_rate": 7.264764965675597e-07, "loss": 8.3517, "step": 4523 }, { "epoch": 0.8839390386869871, "grad_norm": 8.437475204467773, "learning_rate": 7.241102026084534e-07, "loss": 8.5202, "step": 4524 }, { "epoch": 0.8841344275107463, "grad_norm": 12.07729721069336, "learning_rate": 7.21747623931196e-07, "loss": 9.1904, "step": 4525 }, { "epoch": 0.8843298163345057, "grad_norm": 7.962643146514893, "learning_rate": 7.193887614820694e-07, "loss": 10.0814, "step": 4526 }, { "epoch": 0.8845252051582649, "grad_norm": 6.445322036743164, "learning_rate": 7.170336162058755e-07, "loss": 8.7302, "step": 4527 }, { "epoch": 0.8847205939820242, "grad_norm": 9.202902793884277, "learning_rate": 7.146821890459188e-07, "loss": 9.811, "step": 4528 }, { "epoch": 0.8849159828057835, "grad_norm": 7.17478609085083, "learning_rate": 7.123344809440236e-07, "loss": 8.906, "step": 4529 }, { "epoch": 0.8851113716295428, "grad_norm": 7.45881986618042, "learning_rate": 7.099904928405155e-07, "loss": 9.1534, "step": 4530 }, { "epoch": 0.885306760453302, "grad_norm": 9.53923511505127, "learning_rate": 7.076502256742379e-07, "loss": 9.2339, "step": 4531 }, { "epoch": 0.8855021492770614, "grad_norm": 8.295525550842285, "learning_rate": 7.053136803825389e-07, "loss": 8.5461, "step": 4532 }, { "epoch": 0.8856975381008206, "grad_norm": 8.039118766784668, "learning_rate": 7.029808579012765e-07, "loss": 9.0157, "step": 4533 }, { "epoch": 0.8858929269245799, "grad_norm": 8.169330596923828, "learning_rate": 7.00651759164821e-07, "loss": 8.5258, "step": 4534 }, { "epoch": 0.8860883157483392, "grad_norm": 8.783629417419434, "learning_rate": 6.983263851060451e-07, "loss": 9.4092, "step": 4535 }, { "epoch": 0.8862837045720985, "grad_norm": 9.435042381286621, "learning_rate": 6.960047366563372e-07, "loss": 9.1183, "step": 4536 }, { "epoch": 0.8864790933958577, "grad_norm": 9.772560119628906, "learning_rate": 6.936868147455866e-07, "loss": 9.5689, "step": 4537 }, { "epoch": 0.8866744822196171, "grad_norm": 10.278083801269531, "learning_rate": 6.913726203021954e-07, "loss": 9.6147, "step": 4538 }, { "epoch": 0.8868698710433763, "grad_norm": 8.931450843811035, "learning_rate": 6.890621542530684e-07, "loss": 9.0847, "step": 4539 }, { "epoch": 0.8870652598671356, "grad_norm": 11.48383903503418, "learning_rate": 6.867554175236202e-07, "loss": 9.6914, "step": 4540 }, { "epoch": 0.8872606486908948, "grad_norm": 9.49150276184082, "learning_rate": 6.844524110377692e-07, "loss": 9.1198, "step": 4541 }, { "epoch": 0.8874560375146542, "grad_norm": 10.23884391784668, "learning_rate": 6.821531357179434e-07, "loss": 8.8346, "step": 4542 }, { "epoch": 0.8876514263384134, "grad_norm": 7.415590286254883, "learning_rate": 6.798575924850715e-07, "loss": 8.6566, "step": 4543 }, { "epoch": 0.8878468151621727, "grad_norm": 7.1067633628845215, "learning_rate": 6.775657822585913e-07, "loss": 9.1218, "step": 4544 }, { "epoch": 0.888042203985932, "grad_norm": 8.757865905761719, "learning_rate": 6.752777059564431e-07, "loss": 8.2709, "step": 4545 }, { "epoch": 0.8882375928096913, "grad_norm": 8.671647071838379, "learning_rate": 6.729933644950726e-07, "loss": 8.8263, "step": 4546 }, { "epoch": 0.8884329816334505, "grad_norm": 9.775559425354004, "learning_rate": 6.707127587894302e-07, "loss": 8.3645, "step": 4547 }, { "epoch": 0.8886283704572099, "grad_norm": 8.119025230407715, "learning_rate": 6.684358897529675e-07, "loss": 8.2647, "step": 4548 }, { "epoch": 0.8888237592809691, "grad_norm": 8.645852088928223, "learning_rate": 6.661627582976438e-07, "loss": 9.4793, "step": 4549 }, { "epoch": 0.8890191481047284, "grad_norm": 8.791229248046875, "learning_rate": 6.638933653339153e-07, "loss": 9.5033, "step": 4550 }, { "epoch": 0.8892145369284877, "grad_norm": 9.361802101135254, "learning_rate": 6.616277117707493e-07, "loss": 9.4369, "step": 4551 }, { "epoch": 0.889409925752247, "grad_norm": 8.829449653625488, "learning_rate": 6.593657985156055e-07, "loss": 9.1888, "step": 4552 }, { "epoch": 0.8896053145760062, "grad_norm": 7.402151107788086, "learning_rate": 6.571076264744547e-07, "loss": 8.5052, "step": 4553 }, { "epoch": 0.8898007033997656, "grad_norm": 8.041218757629395, "learning_rate": 6.548531965517601e-07, "loss": 8.4622, "step": 4554 }, { "epoch": 0.8899960922235248, "grad_norm": 8.33278751373291, "learning_rate": 6.52602509650495e-07, "loss": 8.6969, "step": 4555 }, { "epoch": 0.8901914810472841, "grad_norm": 6.467602729797363, "learning_rate": 6.503555666721273e-07, "loss": 9.3724, "step": 4556 }, { "epoch": 0.8903868698710434, "grad_norm": 7.473740577697754, "learning_rate": 6.481123685166291e-07, "loss": 8.0025, "step": 4557 }, { "epoch": 0.8905822586948027, "grad_norm": 8.307002067565918, "learning_rate": 6.458729160824706e-07, "loss": 9.0213, "step": 4558 }, { "epoch": 0.8907776475185619, "grad_norm": 9.312746047973633, "learning_rate": 6.436372102666188e-07, "loss": 9.2817, "step": 4559 }, { "epoch": 0.8909730363423212, "grad_norm": 9.156469345092773, "learning_rate": 6.414052519645475e-07, "loss": 9.197, "step": 4560 }, { "epoch": 0.8911684251660805, "grad_norm": 8.96273136138916, "learning_rate": 6.391770420702215e-07, "loss": 10.073, "step": 4561 }, { "epoch": 0.8913638139898398, "grad_norm": 8.047943115234375, "learning_rate": 6.369525814761124e-07, "loss": 9.3223, "step": 4562 }, { "epoch": 0.891559202813599, "grad_norm": 7.415777206420898, "learning_rate": 6.347318710731809e-07, "loss": 9.2714, "step": 4563 }, { "epoch": 0.8917545916373584, "grad_norm": 8.42771053314209, "learning_rate": 6.325149117508955e-07, "loss": 9.1933, "step": 4564 }, { "epoch": 0.8919499804611176, "grad_norm": 7.360374450683594, "learning_rate": 6.303017043972126e-07, "loss": 8.4876, "step": 4565 }, { "epoch": 0.8921453692848769, "grad_norm": 8.370670318603516, "learning_rate": 6.280922498985942e-07, "loss": 8.844, "step": 4566 }, { "epoch": 0.8923407581086362, "grad_norm": 7.138886451721191, "learning_rate": 6.258865491399913e-07, "loss": 8.1337, "step": 4567 }, { "epoch": 0.8925361469323955, "grad_norm": 9.302095413208008, "learning_rate": 6.236846030048605e-07, "loss": 9.2156, "step": 4568 }, { "epoch": 0.8927315357561547, "grad_norm": 8.165783882141113, "learning_rate": 6.214864123751463e-07, "loss": 9.412, "step": 4569 }, { "epoch": 0.8929269245799141, "grad_norm": 10.003043174743652, "learning_rate": 6.192919781312933e-07, "loss": 9.6109, "step": 4570 }, { "epoch": 0.8931223134036733, "grad_norm": 9.300692558288574, "learning_rate": 6.171013011522409e-07, "loss": 9.0853, "step": 4571 }, { "epoch": 0.8933177022274326, "grad_norm": 13.417641639709473, "learning_rate": 6.149143823154213e-07, "loss": 9.0996, "step": 4572 }, { "epoch": 0.8935130910511919, "grad_norm": 10.26941204071045, "learning_rate": 6.127312224967652e-07, "loss": 9.5182, "step": 4573 }, { "epoch": 0.8937084798749512, "grad_norm": 7.442628383636475, "learning_rate": 6.105518225706952e-07, "loss": 8.868, "step": 4574 }, { "epoch": 0.8939038686987104, "grad_norm": 8.16067123413086, "learning_rate": 6.083761834101298e-07, "loss": 8.9479, "step": 4575 }, { "epoch": 0.8940992575224697, "grad_norm": 7.069525718688965, "learning_rate": 6.062043058864775e-07, "loss": 8.8524, "step": 4576 }, { "epoch": 0.894294646346229, "grad_norm": 8.341736793518066, "learning_rate": 6.040361908696457e-07, "loss": 8.4742, "step": 4577 }, { "epoch": 0.8944900351699883, "grad_norm": 10.101537704467773, "learning_rate": 6.018718392280299e-07, "loss": 9.0462, "step": 4578 }, { "epoch": 0.8946854239937475, "grad_norm": 7.973488807678223, "learning_rate": 5.997112518285208e-07, "loss": 9.04, "step": 4579 }, { "epoch": 0.8948808128175069, "grad_norm": 6.7695722579956055, "learning_rate": 5.975544295364998e-07, "loss": 8.5619, "step": 4580 }, { "epoch": 0.8950762016412661, "grad_norm": 7.932429313659668, "learning_rate": 5.954013732158436e-07, "loss": 8.5523, "step": 4581 }, { "epoch": 0.8952715904650254, "grad_norm": 10.644766807556152, "learning_rate": 5.932520837289135e-07, "loss": 10.0759, "step": 4582 }, { "epoch": 0.8954669792887847, "grad_norm": 8.38106918334961, "learning_rate": 5.91106561936573e-07, "loss": 9.2323, "step": 4583 }, { "epoch": 0.895662368112544, "grad_norm": 9.988547325134277, "learning_rate": 5.889648086981659e-07, "loss": 9.3092, "step": 4584 }, { "epoch": 0.8958577569363032, "grad_norm": 7.1622395515441895, "learning_rate": 5.868268248715292e-07, "loss": 7.0635, "step": 4585 }, { "epoch": 0.8960531457600626, "grad_norm": 11.381697654724121, "learning_rate": 5.84692611312997e-07, "loss": 9.1964, "step": 4586 }, { "epoch": 0.8962485345838218, "grad_norm": 10.23323917388916, "learning_rate": 5.825621688773842e-07, "loss": 9.7945, "step": 4587 }, { "epoch": 0.8964439234075811, "grad_norm": 21.31047248840332, "learning_rate": 5.804354984180017e-07, "loss": 8.9616, "step": 4588 }, { "epoch": 0.8966393122313404, "grad_norm": 8.889069557189941, "learning_rate": 5.783126007866435e-07, "loss": 8.7006, "step": 4589 }, { "epoch": 0.8968347010550997, "grad_norm": 8.523070335388184, "learning_rate": 5.761934768335997e-07, "loss": 9.2636, "step": 4590 }, { "epoch": 0.8970300898788589, "grad_norm": 17.558130264282227, "learning_rate": 5.740781274076423e-07, "loss": 9.6848, "step": 4591 }, { "epoch": 0.8972254787026183, "grad_norm": 7.022171497344971, "learning_rate": 5.719665533560381e-07, "loss": 8.2926, "step": 4592 }, { "epoch": 0.8974208675263775, "grad_norm": 12.051819801330566, "learning_rate": 5.698587555245339e-07, "loss": 9.8679, "step": 4593 }, { "epoch": 0.8976162563501368, "grad_norm": 8.585456848144531, "learning_rate": 5.67754734757372e-07, "loss": 9.2112, "step": 4594 }, { "epoch": 0.897811645173896, "grad_norm": 8.774697303771973, "learning_rate": 5.65654491897274e-07, "loss": 9.3123, "step": 4595 }, { "epoch": 0.8980070339976554, "grad_norm": 7.8391313552856445, "learning_rate": 5.635580277854558e-07, "loss": 8.8919, "step": 4596 }, { "epoch": 0.8982024228214146, "grad_norm": 11.301671981811523, "learning_rate": 5.614653432616158e-07, "loss": 9.3398, "step": 4597 }, { "epoch": 0.8983978116451738, "grad_norm": 7.452347755432129, "learning_rate": 5.593764391639367e-07, "loss": 9.0038, "step": 4598 }, { "epoch": 0.8985932004689332, "grad_norm": 6.5044121742248535, "learning_rate": 5.572913163290927e-07, "loss": 8.6604, "step": 4599 }, { "epoch": 0.8987885892926925, "grad_norm": 10.333758354187012, "learning_rate": 5.552099755922369e-07, "loss": 8.8245, "step": 4600 }, { "epoch": 0.8989839781164517, "grad_norm": 7.728987216949463, "learning_rate": 5.531324177870145e-07, "loss": 9.1096, "step": 4601 }, { "epoch": 0.8991793669402111, "grad_norm": 10.902887344360352, "learning_rate": 5.510586437455478e-07, "loss": 8.6363, "step": 4602 }, { "epoch": 0.8993747557639703, "grad_norm": 8.748449325561523, "learning_rate": 5.489886542984524e-07, "loss": 9.1152, "step": 4603 }, { "epoch": 0.8995701445877295, "grad_norm": 8.182178497314453, "learning_rate": 5.469224502748193e-07, "loss": 9.7359, "step": 4604 }, { "epoch": 0.8997655334114889, "grad_norm": 10.667359352111816, "learning_rate": 5.448600325022302e-07, "loss": 8.3748, "step": 4605 }, { "epoch": 0.8999609222352482, "grad_norm": 7.522793292999268, "learning_rate": 5.428014018067441e-07, "loss": 8.6727, "step": 4606 }, { "epoch": 0.9001563110590074, "grad_norm": 7.92480993270874, "learning_rate": 5.407465590129101e-07, "loss": 8.1705, "step": 4607 }, { "epoch": 0.9003516998827668, "grad_norm": 6.827234268188477, "learning_rate": 5.386955049437548e-07, "loss": 8.2938, "step": 4608 }, { "epoch": 0.900547088706526, "grad_norm": 7.39226770401001, "learning_rate": 5.366482404207896e-07, "loss": 9.7872, "step": 4609 }, { "epoch": 0.9007424775302852, "grad_norm": 7.509429454803467, "learning_rate": 5.346047662640075e-07, "loss": 8.5024, "step": 4610 }, { "epoch": 0.9009378663540446, "grad_norm": 9.19665813446045, "learning_rate": 5.325650832918816e-07, "loss": 9.1335, "step": 4611 }, { "epoch": 0.9011332551778038, "grad_norm": 10.296882629394531, "learning_rate": 5.305291923213718e-07, "loss": 9.4044, "step": 4612 }, { "epoch": 0.9013286440015631, "grad_norm": 8.83753776550293, "learning_rate": 5.284970941679113e-07, "loss": 9.4248, "step": 4613 }, { "epoch": 0.9015240328253223, "grad_norm": 8.993282318115234, "learning_rate": 5.264687896454224e-07, "loss": 8.6009, "step": 4614 }, { "epoch": 0.9017194216490817, "grad_norm": 6.61049747467041, "learning_rate": 5.244442795663007e-07, "loss": 7.7182, "step": 4615 }, { "epoch": 0.9019148104728409, "grad_norm": 7.919312477111816, "learning_rate": 5.224235647414277e-07, "loss": 9.915, "step": 4616 }, { "epoch": 0.9021101992966002, "grad_norm": 9.764341354370117, "learning_rate": 5.204066459801615e-07, "loss": 8.7336, "step": 4617 }, { "epoch": 0.9023055881203595, "grad_norm": 7.9409637451171875, "learning_rate": 5.183935240903415e-07, "loss": 8.6041, "step": 4618 }, { "epoch": 0.9025009769441188, "grad_norm": 8.285787582397461, "learning_rate": 5.163841998782837e-07, "loss": 9.4445, "step": 4619 }, { "epoch": 0.902696365767878, "grad_norm": 9.423076629638672, "learning_rate": 5.143786741487877e-07, "loss": 8.5114, "step": 4620 }, { "epoch": 0.9028917545916374, "grad_norm": 11.886407852172852, "learning_rate": 5.123769477051255e-07, "loss": 9.2892, "step": 4621 }, { "epoch": 0.9030871434153966, "grad_norm": 7.831228733062744, "learning_rate": 5.103790213490545e-07, "loss": 8.2329, "step": 4622 }, { "epoch": 0.9032825322391559, "grad_norm": 10.496557235717773, "learning_rate": 5.083848958808035e-07, "loss": 9.1138, "step": 4623 }, { "epoch": 0.9034779210629152, "grad_norm": 17.42132568359375, "learning_rate": 5.063945720990803e-07, "loss": 9.1258, "step": 4624 }, { "epoch": 0.9036733098866745, "grad_norm": 6.3549885749816895, "learning_rate": 5.044080508010751e-07, "loss": 8.6081, "step": 4625 }, { "epoch": 0.9038686987104337, "grad_norm": 8.271417617797852, "learning_rate": 5.02425332782448e-07, "loss": 8.8358, "step": 4626 }, { "epoch": 0.9040640875341931, "grad_norm": 9.527425765991211, "learning_rate": 5.004464188373426e-07, "loss": 8.5627, "step": 4627 }, { "epoch": 0.9042594763579523, "grad_norm": 7.8970232009887695, "learning_rate": 4.984713097583715e-07, "loss": 9.9096, "step": 4628 }, { "epoch": 0.9044548651817116, "grad_norm": 8.61898422241211, "learning_rate": 4.96500006336631e-07, "loss": 8.2139, "step": 4629 }, { "epoch": 0.9046502540054708, "grad_norm": 9.109868049621582, "learning_rate": 4.94532509361686e-07, "loss": 8.6737, "step": 4630 }, { "epoch": 0.9048456428292302, "grad_norm": 8.05453872680664, "learning_rate": 4.925688196215828e-07, "loss": 9.8056, "step": 4631 }, { "epoch": 0.9050410316529894, "grad_norm": 88.9920883178711, "learning_rate": 4.906089379028367e-07, "loss": 10.1364, "step": 4632 }, { "epoch": 0.9052364204767487, "grad_norm": 7.312097072601318, "learning_rate": 4.886528649904465e-07, "loss": 8.6661, "step": 4633 }, { "epoch": 0.905431809300508, "grad_norm": 8.211421966552734, "learning_rate": 4.867006016678744e-07, "loss": 8.9301, "step": 4634 }, { "epoch": 0.9056271981242673, "grad_norm": 12.64998722076416, "learning_rate": 4.84752148717067e-07, "loss": 9.0967, "step": 4635 }, { "epoch": 0.9058225869480265, "grad_norm": 7.412219047546387, "learning_rate": 4.828075069184379e-07, "loss": 7.99, "step": 4636 }, { "epoch": 0.9060179757717859, "grad_norm": 9.487683296203613, "learning_rate": 4.808666770508752e-07, "loss": 8.7177, "step": 4637 }, { "epoch": 0.9062133645955451, "grad_norm": 11.212496757507324, "learning_rate": 4.789296598917459e-07, "loss": 9.1199, "step": 4638 }, { "epoch": 0.9064087534193044, "grad_norm": 8.739277839660645, "learning_rate": 4.769964562168805e-07, "loss": 9.9834, "step": 4639 }, { "epoch": 0.9066041422430637, "grad_norm": 13.263821601867676, "learning_rate": 4.750670668005908e-07, "loss": 9.4101, "step": 4640 }, { "epoch": 0.906799531066823, "grad_norm": 9.429154396057129, "learning_rate": 4.7314149241565413e-07, "loss": 8.9467, "step": 4641 }, { "epoch": 0.9069949198905822, "grad_norm": 9.144912719726562, "learning_rate": 4.712197338333269e-07, "loss": 9.005, "step": 4642 }, { "epoch": 0.9071903087143416, "grad_norm": 8.525959014892578, "learning_rate": 4.693017918233278e-07, "loss": 8.4626, "step": 4643 }, { "epoch": 0.9073856975381008, "grad_norm": 11.013566017150879, "learning_rate": 4.67387667153858e-07, "loss": 9.1099, "step": 4644 }, { "epoch": 0.9075810863618601, "grad_norm": 9.597681999206543, "learning_rate": 4.6547736059157857e-07, "loss": 9.493, "step": 4645 }, { "epoch": 0.9077764751856194, "grad_norm": 6.093164443969727, "learning_rate": 4.6357087290162973e-07, "loss": 7.5166, "step": 4646 }, { "epoch": 0.9079718640093787, "grad_norm": 10.783944129943848, "learning_rate": 4.6166820484761954e-07, "loss": 9.2103, "step": 4647 }, { "epoch": 0.9081672528331379, "grad_norm": 7.36179256439209, "learning_rate": 4.597693571916229e-07, "loss": 8.1978, "step": 4648 }, { "epoch": 0.9083626416568972, "grad_norm": 10.97280216217041, "learning_rate": 4.5787433069418796e-07, "loss": 10.0678, "step": 4649 }, { "epoch": 0.9085580304806565, "grad_norm": 8.537384986877441, "learning_rate": 4.5598312611433324e-07, "loss": 9.7343, "step": 4650 }, { "epoch": 0.9087534193044158, "grad_norm": 7.580404281616211, "learning_rate": 4.540957442095428e-07, "loss": 8.5973, "step": 4651 }, { "epoch": 0.908948808128175, "grad_norm": 8.775703430175781, "learning_rate": 4.522121857357742e-07, "loss": 8.0289, "step": 4652 }, { "epoch": 0.9091441969519344, "grad_norm": 9.910072326660156, "learning_rate": 4.503324514474483e-07, "loss": 9.032, "step": 4653 }, { "epoch": 0.9093395857756936, "grad_norm": 13.489008903503418, "learning_rate": 4.484565420974596e-07, "loss": 9.2559, "step": 4654 }, { "epoch": 0.9095349745994529, "grad_norm": 7.9361467361450195, "learning_rate": 4.4658445843716704e-07, "loss": 8.5171, "step": 4655 }, { "epoch": 0.9097303634232122, "grad_norm": 9.057096481323242, "learning_rate": 4.4471620121639634e-07, "loss": 8.3678, "step": 4656 }, { "epoch": 0.9099257522469715, "grad_norm": 9.839863777160645, "learning_rate": 4.428517711834468e-07, "loss": 8.7929, "step": 4657 }, { "epoch": 0.9101211410707307, "grad_norm": 7.448708534240723, "learning_rate": 4.4099116908507545e-07, "loss": 8.6383, "step": 4658 }, { "epoch": 0.9103165298944901, "grad_norm": 8.156187057495117, "learning_rate": 4.391343956665151e-07, "loss": 8.5996, "step": 4659 }, { "epoch": 0.9105119187182493, "grad_norm": 7.864421844482422, "learning_rate": 4.372814516714596e-07, "loss": 9.6516, "step": 4660 }, { "epoch": 0.9107073075420086, "grad_norm": 9.196493148803711, "learning_rate": 4.3543233784207106e-07, "loss": 9.4084, "step": 4661 }, { "epoch": 0.9109026963657679, "grad_norm": 7.9403910636901855, "learning_rate": 4.335870549189758e-07, "loss": 8.8431, "step": 4662 }, { "epoch": 0.9110980851895272, "grad_norm": 7.183925628662109, "learning_rate": 4.317456036412682e-07, "loss": 8.0112, "step": 4663 }, { "epoch": 0.9112934740132864, "grad_norm": 10.258187294006348, "learning_rate": 4.29907984746506e-07, "loss": 7.8186, "step": 4664 }, { "epoch": 0.9114888628370457, "grad_norm": 8.958974838256836, "learning_rate": 4.2807419897071376e-07, "loss": 8.9419, "step": 4665 }, { "epoch": 0.911684251660805, "grad_norm": 8.800867080688477, "learning_rate": 4.262442470483774e-07, "loss": 9.1317, "step": 4666 }, { "epoch": 0.9118796404845643, "grad_norm": 7.859527111053467, "learning_rate": 4.244181297124528e-07, "loss": 8.9888, "step": 4667 }, { "epoch": 0.9120750293083235, "grad_norm": 8.100984573364258, "learning_rate": 4.225958476943537e-07, "loss": 9.5326, "step": 4668 }, { "epoch": 0.9122704181320829, "grad_norm": 8.952960968017578, "learning_rate": 4.2077740172396074e-07, "loss": 9.0005, "step": 4669 }, { "epoch": 0.9124658069558421, "grad_norm": 7.81179141998291, "learning_rate": 4.189627925296202e-07, "loss": 7.9352, "step": 4670 }, { "epoch": 0.9126611957796014, "grad_norm": 7.670047760009766, "learning_rate": 4.171520208381363e-07, "loss": 9.8982, "step": 4671 }, { "epoch": 0.9128565846033607, "grad_norm": 8.656058311462402, "learning_rate": 4.153450873747822e-07, "loss": 8.8703, "step": 4672 }, { "epoch": 0.91305197342712, "grad_norm": 7.943114757537842, "learning_rate": 4.135419928632889e-07, "loss": 8.4271, "step": 4673 }, { "epoch": 0.9132473622508792, "grad_norm": 7.806142807006836, "learning_rate": 4.117427380258532e-07, "loss": 9.5021, "step": 4674 }, { "epoch": 0.9134427510746386, "grad_norm": 9.020642280578613, "learning_rate": 4.0994732358313085e-07, "loss": 8.1661, "step": 4675 }, { "epoch": 0.9136381398983978, "grad_norm": 35.45269012451172, "learning_rate": 4.0815575025424436e-07, "loss": 10.1253, "step": 4676 }, { "epoch": 0.9138335287221571, "grad_norm": 10.363656997680664, "learning_rate": 4.063680187567698e-07, "loss": 8.9976, "step": 4677 }, { "epoch": 0.9140289175459164, "grad_norm": 7.983797073364258, "learning_rate": 4.0458412980675323e-07, "loss": 9.0811, "step": 4678 }, { "epoch": 0.9142243063696757, "grad_norm": 8.363759994506836, "learning_rate": 4.028040841186942e-07, "loss": 8.5338, "step": 4679 }, { "epoch": 0.9144196951934349, "grad_norm": 8.386238098144531, "learning_rate": 4.0102788240555914e-07, "loss": 9.381, "step": 4680 }, { "epoch": 0.9146150840171943, "grad_norm": 11.326103210449219, "learning_rate": 3.992555253787711e-07, "loss": 9.0722, "step": 4681 }, { "epoch": 0.9148104728409535, "grad_norm": 8.53385066986084, "learning_rate": 3.974870137482123e-07, "loss": 9.4061, "step": 4682 }, { "epoch": 0.9150058616647128, "grad_norm": 10.785691261291504, "learning_rate": 3.957223482222294e-07, "loss": 9.9972, "step": 4683 }, { "epoch": 0.915201250488472, "grad_norm": 8.604122161865234, "learning_rate": 3.9396152950762267e-07, "loss": 8.727, "step": 4684 }, { "epoch": 0.9153966393122314, "grad_norm": 9.557936668395996, "learning_rate": 3.922045583096579e-07, "loss": 9.3909, "step": 4685 }, { "epoch": 0.9155920281359906, "grad_norm": 8.236830711364746, "learning_rate": 3.904514353320543e-07, "loss": 9.4076, "step": 4686 }, { "epoch": 0.9157874169597499, "grad_norm": 8.740063667297363, "learning_rate": 3.887021612769937e-07, "loss": 8.691, "step": 4687 }, { "epoch": 0.9159828057835092, "grad_norm": 8.937166213989258, "learning_rate": 3.869567368451121e-07, "loss": 9.3647, "step": 4688 }, { "epoch": 0.9161781946072685, "grad_norm": 8.27723217010498, "learning_rate": 3.852151627355094e-07, "loss": 7.8112, "step": 4689 }, { "epoch": 0.9163735834310277, "grad_norm": 56.198974609375, "learning_rate": 3.834774396457375e-07, "loss": 7.8735, "step": 4690 }, { "epoch": 0.9165689722547871, "grad_norm": 10.308794021606445, "learning_rate": 3.817435682718096e-07, "loss": 9.0237, "step": 4691 }, { "epoch": 0.9167643610785463, "grad_norm": 10.48629379272461, "learning_rate": 3.8001354930819467e-07, "loss": 9.9133, "step": 4692 }, { "epoch": 0.9169597499023056, "grad_norm": 7.921257019042969, "learning_rate": 3.7828738344782067e-07, "loss": 7.5941, "step": 4693 }, { "epoch": 0.9171551387260649, "grad_norm": 6.670938968658447, "learning_rate": 3.76565071382069e-07, "loss": 8.4198, "step": 4694 }, { "epoch": 0.9173505275498242, "grad_norm": 8.655702590942383, "learning_rate": 3.748466138007778e-07, "loss": 8.5202, "step": 4695 }, { "epoch": 0.9175459163735834, "grad_norm": 9.957679748535156, "learning_rate": 3.7313201139224544e-07, "loss": 9.579, "step": 4696 }, { "epoch": 0.9177413051973428, "grad_norm": 9.686959266662598, "learning_rate": 3.714212648432203e-07, "loss": 10.1259, "step": 4697 }, { "epoch": 0.917936694021102, "grad_norm": 9.702391624450684, "learning_rate": 3.6971437483891427e-07, "loss": 9.2411, "step": 4698 }, { "epoch": 0.9181320828448613, "grad_norm": 7.698165416717529, "learning_rate": 3.6801134206298386e-07, "loss": 9.0435, "step": 4699 }, { "epoch": 0.9183274716686206, "grad_norm": 11.09498119354248, "learning_rate": 3.6631216719755225e-07, "loss": 9.8762, "step": 4700 }, { "epoch": 0.9185228604923799, "grad_norm": 9.750531196594238, "learning_rate": 3.6461685092318735e-07, "loss": 9.7316, "step": 4701 }, { "epoch": 0.9187182493161391, "grad_norm": 69.85716247558594, "learning_rate": 3.6292539391891814e-07, "loss": 10.0667, "step": 4702 }, { "epoch": 0.9189136381398983, "grad_norm": 7.222912311553955, "learning_rate": 3.6123779686222493e-07, "loss": 9.3583, "step": 4703 }, { "epoch": 0.9191090269636577, "grad_norm": 7.434391498565674, "learning_rate": 3.595540604290437e-07, "loss": 9.436, "step": 4704 }, { "epoch": 0.919304415787417, "grad_norm": 9.064720153808594, "learning_rate": 3.578741852937617e-07, "loss": 9.2944, "step": 4705 }, { "epoch": 0.9194998046111762, "grad_norm": 7.21022367477417, "learning_rate": 3.561981721292218e-07, "loss": 8.2714, "step": 4706 }, { "epoch": 0.9196951934349356, "grad_norm": 7.854347229003906, "learning_rate": 3.5452602160672033e-07, "loss": 9.0663, "step": 4707 }, { "epoch": 0.9198905822586948, "grad_norm": 9.648550987243652, "learning_rate": 3.5285773439600377e-07, "loss": 10.1147, "step": 4708 }, { "epoch": 0.920085971082454, "grad_norm": 8.285013198852539, "learning_rate": 3.5119331116527434e-07, "loss": 9.0837, "step": 4709 }, { "epoch": 0.9202813599062134, "grad_norm": 11.687288284301758, "learning_rate": 3.4953275258118314e-07, "loss": 9.547, "step": 4710 }, { "epoch": 0.9204767487299726, "grad_norm": 20.341156005859375, "learning_rate": 3.478760593088382e-07, "loss": 9.4041, "step": 4711 }, { "epoch": 0.9206721375537319, "grad_norm": 11.171713829040527, "learning_rate": 3.462232320117942e-07, "loss": 7.7676, "step": 4712 }, { "epoch": 0.9208675263774913, "grad_norm": 9.766027450561523, "learning_rate": 3.445742713520617e-07, "loss": 9.4757, "step": 4713 }, { "epoch": 0.9210629152012505, "grad_norm": 8.181708335876465, "learning_rate": 3.429291779900989e-07, "loss": 8.9366, "step": 4714 }, { "epoch": 0.9212583040250097, "grad_norm": 14.08752155303955, "learning_rate": 3.4128795258481896e-07, "loss": 9.5898, "step": 4715 }, { "epoch": 0.9214536928487691, "grad_norm": 7.5151591300964355, "learning_rate": 3.396505957935814e-07, "loss": 7.3947, "step": 4716 }, { "epoch": 0.9216490816725283, "grad_norm": 9.194290161132812, "learning_rate": 3.380171082721995e-07, "loss": 9.9539, "step": 4717 }, { "epoch": 0.9218444704962876, "grad_norm": 8.23127269744873, "learning_rate": 3.3638749067493534e-07, "loss": 9.0621, "step": 4718 }, { "epoch": 0.9220398593200468, "grad_norm": 8.573027610778809, "learning_rate": 3.3476174365450255e-07, "loss": 8.5441, "step": 4719 }, { "epoch": 0.9222352481438062, "grad_norm": 10.570746421813965, "learning_rate": 3.331398678620623e-07, "loss": 9.3848, "step": 4720 }, { "epoch": 0.9224306369675654, "grad_norm": 8.211080551147461, "learning_rate": 3.3152186394722506e-07, "loss": 8.2236, "step": 4721 }, { "epoch": 0.9226260257913247, "grad_norm": 8.340023040771484, "learning_rate": 3.299077325580535e-07, "loss": 9.4421, "step": 4722 }, { "epoch": 0.922821414615084, "grad_norm": 7.6149678230285645, "learning_rate": 3.2829747434105606e-07, "loss": 8.7159, "step": 4723 }, { "epoch": 0.9230168034388433, "grad_norm": 8.387945175170898, "learning_rate": 3.2669108994119236e-07, "loss": 9.4417, "step": 4724 }, { "epoch": 0.9232121922626025, "grad_norm": 8.78463363647461, "learning_rate": 3.250885800018677e-07, "loss": 9.0099, "step": 4725 }, { "epoch": 0.9234075810863619, "grad_norm": 7.261391639709473, "learning_rate": 3.234899451649398e-07, "loss": 9.0144, "step": 4726 }, { "epoch": 0.9236029699101211, "grad_norm": 10.1168212890625, "learning_rate": 3.2189518607070756e-07, "loss": 9.5387, "step": 4727 }, { "epoch": 0.9237983587338804, "grad_norm": 8.67878246307373, "learning_rate": 3.203043033579256e-07, "loss": 9.395, "step": 4728 }, { "epoch": 0.9239937475576397, "grad_norm": 7.543828010559082, "learning_rate": 3.1871729766378867e-07, "loss": 8.5915, "step": 4729 }, { "epoch": 0.924189136381399, "grad_norm": 7.960564613342285, "learning_rate": 3.171341696239438e-07, "loss": 9.3239, "step": 4730 }, { "epoch": 0.9243845252051582, "grad_norm": 9.259940147399902, "learning_rate": 3.1555491987248056e-07, "loss": 8.4018, "step": 4731 }, { "epoch": 0.9245799140289176, "grad_norm": 8.424381256103516, "learning_rate": 3.139795490419417e-07, "loss": 8.2413, "step": 4732 }, { "epoch": 0.9247753028526768, "grad_norm": 9.10590648651123, "learning_rate": 3.1240805776330816e-07, "loss": 8.8184, "step": 4733 }, { "epoch": 0.9249706916764361, "grad_norm": 6.929506778717041, "learning_rate": 3.10840446666012e-07, "loss": 8.5184, "step": 4734 }, { "epoch": 0.9251660805001954, "grad_norm": 11.383835792541504, "learning_rate": 3.09276716377932e-07, "loss": 9.9644, "step": 4735 }, { "epoch": 0.9253614693239547, "grad_norm": 7.0921406745910645, "learning_rate": 3.077168675253872e-07, "loss": 9.0621, "step": 4736 }, { "epoch": 0.9255568581477139, "grad_norm": 9.944376945495605, "learning_rate": 3.0616090073314896e-07, "loss": 9.0666, "step": 4737 }, { "epoch": 0.9257522469714732, "grad_norm": 7.62268590927124, "learning_rate": 3.046088166244276e-07, "loss": 8.8974, "step": 4738 }, { "epoch": 0.9259476357952325, "grad_norm": 9.146649360656738, "learning_rate": 3.0306061582088374e-07, "loss": 9.1993, "step": 4739 }, { "epoch": 0.9261430246189918, "grad_norm": 9.146553993225098, "learning_rate": 3.015162989426168e-07, "loss": 9.1427, "step": 4740 }, { "epoch": 0.926338413442751, "grad_norm": 7.872930526733398, "learning_rate": 2.9997586660817643e-07, "loss": 9.728, "step": 4741 }, { "epoch": 0.9265338022665104, "grad_norm": 7.268552780151367, "learning_rate": 2.984393194345514e-07, "loss": 8.4245, "step": 4742 }, { "epoch": 0.9267291910902696, "grad_norm": 9.872431755065918, "learning_rate": 2.969066580371782e-07, "loss": 8.7397, "step": 4743 }, { "epoch": 0.9269245799140289, "grad_norm": 9.95816707611084, "learning_rate": 2.9537788302993477e-07, "loss": 8.6281, "step": 4744 }, { "epoch": 0.9271199687377882, "grad_norm": 6.963932991027832, "learning_rate": 2.9385299502514233e-07, "loss": 8.7163, "step": 4745 }, { "epoch": 0.9273153575615475, "grad_norm": 10.237529754638672, "learning_rate": 2.923319946335668e-07, "loss": 8.6967, "step": 4746 }, { "epoch": 0.9275107463853067, "grad_norm": 8.188759803771973, "learning_rate": 2.9081488246441527e-07, "loss": 8.4961, "step": 4747 }, { "epoch": 0.9277061352090661, "grad_norm": 9.989144325256348, "learning_rate": 2.8930165912533945e-07, "loss": 8.1715, "step": 4748 }, { "epoch": 0.9279015240328253, "grad_norm": 7.172133445739746, "learning_rate": 2.877923252224302e-07, "loss": 8.7734, "step": 4749 }, { "epoch": 0.9280969128565846, "grad_norm": 9.931110382080078, "learning_rate": 2.8628688136022487e-07, "loss": 9.472, "step": 4750 }, { "epoch": 0.9282923016803439, "grad_norm": 8.37661075592041, "learning_rate": 2.847853281416979e-07, "loss": 9.0718, "step": 4751 }, { "epoch": 0.9284876905041032, "grad_norm": 8.126204490661621, "learning_rate": 2.832876661682715e-07, "loss": 8.2512, "step": 4752 }, { "epoch": 0.9286830793278624, "grad_norm": 7.955578327178955, "learning_rate": 2.817938960398014e-07, "loss": 9.0037, "step": 4753 }, { "epoch": 0.9288784681516217, "grad_norm": 9.936150550842285, "learning_rate": 2.803040183545935e-07, "loss": 8.9849, "step": 4754 }, { "epoch": 0.929073856975381, "grad_norm": 10.026246070861816, "learning_rate": 2.78818033709386e-07, "loss": 9.4893, "step": 4755 }, { "epoch": 0.9292692457991403, "grad_norm": 9.931303024291992, "learning_rate": 2.7733594269936606e-07, "loss": 9.9154, "step": 4756 }, { "epoch": 0.9294646346228995, "grad_norm": 7.731067657470703, "learning_rate": 2.7585774591815217e-07, "loss": 8.77, "step": 4757 }, { "epoch": 0.9296600234466589, "grad_norm": 7.984607219696045, "learning_rate": 2.7438344395781305e-07, "loss": 8.161, "step": 4758 }, { "epoch": 0.9298554122704181, "grad_norm": 9.72499942779541, "learning_rate": 2.7291303740884955e-07, "loss": 9.1158, "step": 4759 }, { "epoch": 0.9300508010941774, "grad_norm": 8.763854026794434, "learning_rate": 2.714465268602051e-07, "loss": 9.2098, "step": 4760 }, { "epoch": 0.9302461899179367, "grad_norm": 15.150201797485352, "learning_rate": 2.6998391289926316e-07, "loss": 9.5308, "step": 4761 }, { "epoch": 0.930441578741696, "grad_norm": 9.16210651397705, "learning_rate": 2.6852519611184623e-07, "loss": 9.9065, "step": 4762 }, { "epoch": 0.9306369675654552, "grad_norm": 8.150128364562988, "learning_rate": 2.6707037708221584e-07, "loss": 8.2702, "step": 4763 }, { "epoch": 0.9308323563892146, "grad_norm": 6.748828411102295, "learning_rate": 2.656194563930714e-07, "loss": 9.3573, "step": 4764 }, { "epoch": 0.9310277452129738, "grad_norm": 6.6904683113098145, "learning_rate": 2.6417243462555364e-07, "loss": 8.2785, "step": 4765 }, { "epoch": 0.9312231340367331, "grad_norm": 6.414731025695801, "learning_rate": 2.627293123592367e-07, "loss": 8.253, "step": 4766 }, { "epoch": 0.9314185228604924, "grad_norm": 10.10960578918457, "learning_rate": 2.6129009017213826e-07, "loss": 9.0605, "step": 4767 }, { "epoch": 0.9316139116842517, "grad_norm": 7.682331562042236, "learning_rate": 2.5985476864070935e-07, "loss": 8.9442, "step": 4768 }, { "epoch": 0.9318093005080109, "grad_norm": 9.000534057617188, "learning_rate": 2.584233483398435e-07, "loss": 9.626, "step": 4769 }, { "epoch": 0.9320046893317703, "grad_norm": 9.852577209472656, "learning_rate": 2.5699582984286655e-07, "loss": 8.8923, "step": 4770 }, { "epoch": 0.9322000781555295, "grad_norm": 8.212091445922852, "learning_rate": 2.555722137215455e-07, "loss": 8.4139, "step": 4771 }, { "epoch": 0.9323954669792888, "grad_norm": 7.885027885437012, "learning_rate": 2.541525005460821e-07, "loss": 8.9835, "step": 4772 }, { "epoch": 0.932590855803048, "grad_norm": 8.578592300415039, "learning_rate": 2.5273669088511696e-07, "loss": 9.0651, "step": 4773 }, { "epoch": 0.9327862446268074, "grad_norm": 7.461894989013672, "learning_rate": 2.5132478530572436e-07, "loss": 7.6183, "step": 4774 }, { "epoch": 0.9329816334505666, "grad_norm": 6.767593860626221, "learning_rate": 2.4991678437341515e-07, "loss": 8.8828, "step": 4775 }, { "epoch": 0.9331770222743259, "grad_norm": 8.581154823303223, "learning_rate": 2.485126886521394e-07, "loss": 9.6972, "step": 4776 }, { "epoch": 0.9333724110980852, "grad_norm": 10.012948036193848, "learning_rate": 2.471124987042806e-07, "loss": 9.1716, "step": 4777 }, { "epoch": 0.9335677999218445, "grad_norm": 8.47640323638916, "learning_rate": 2.4571621509065893e-07, "loss": 8.9597, "step": 4778 }, { "epoch": 0.9337631887456037, "grad_norm": 8.2051362991333, "learning_rate": 2.443238383705271e-07, "loss": 8.9585, "step": 4779 }, { "epoch": 0.9339585775693631, "grad_norm": 7.979532718658447, "learning_rate": 2.429353691015779e-07, "loss": 8.8593, "step": 4780 }, { "epoch": 0.9341539663931223, "grad_norm": 8.390580177307129, "learning_rate": 2.4155080783993423e-07, "loss": 9.2431, "step": 4781 }, { "epoch": 0.9343493552168816, "grad_norm": 8.807951927185059, "learning_rate": 2.4017015514015806e-07, "loss": 8.272, "step": 4782 }, { "epoch": 0.9345447440406409, "grad_norm": 11.47719955444336, "learning_rate": 2.387934115552404e-07, "loss": 9.1402, "step": 4783 }, { "epoch": 0.9347401328644002, "grad_norm": 8.282925605773926, "learning_rate": 2.374205776366134e-07, "loss": 8.0761, "step": 4784 }, { "epoch": 0.9349355216881594, "grad_norm": 7.644189834594727, "learning_rate": 2.3605165393413841e-07, "loss": 8.367, "step": 4785 }, { "epoch": 0.9351309105119188, "grad_norm": 8.098487854003906, "learning_rate": 2.3468664099611128e-07, "loss": 8.7649, "step": 4786 }, { "epoch": 0.935326299335678, "grad_norm": 7.511486053466797, "learning_rate": 2.333255393692613e-07, "loss": 8.7467, "step": 4787 }, { "epoch": 0.9355216881594373, "grad_norm": 27.118160247802734, "learning_rate": 2.3196834959875348e-07, "loss": 10.1981, "step": 4788 }, { "epoch": 0.9357170769831966, "grad_norm": 6.575469493865967, "learning_rate": 2.3061507222818303e-07, "loss": 8.2403, "step": 4789 }, { "epoch": 0.9359124658069559, "grad_norm": 7.317230701446533, "learning_rate": 2.292657077995819e-07, "loss": 9.0534, "step": 4790 }, { "epoch": 0.9361078546307151, "grad_norm": 11.103191375732422, "learning_rate": 2.2792025685341113e-07, "loss": 8.3513, "step": 4791 }, { "epoch": 0.9363032434544744, "grad_norm": 7.332716941833496, "learning_rate": 2.2657871992856407e-07, "loss": 8.6779, "step": 4792 }, { "epoch": 0.9364986322782337, "grad_norm": 10.763957023620605, "learning_rate": 2.2524109756236977e-07, "loss": 9.7406, "step": 4793 }, { "epoch": 0.936694021101993, "grad_norm": 8.400541305541992, "learning_rate": 2.2390739029058639e-07, "loss": 9.0757, "step": 4794 }, { "epoch": 0.9368894099257522, "grad_norm": 6.764805793762207, "learning_rate": 2.2257759864740768e-07, "loss": 8.4885, "step": 4795 }, { "epoch": 0.9370847987495116, "grad_norm": 7.343725204467773, "learning_rate": 2.2125172316545206e-07, "loss": 8.0188, "step": 4796 }, { "epoch": 0.9372801875732708, "grad_norm": 8.75682544708252, "learning_rate": 2.1992976437577807e-07, "loss": 8.8639, "step": 4797 }, { "epoch": 0.93747557639703, "grad_norm": 6.371266841888428, "learning_rate": 2.186117228078688e-07, "loss": 7.8495, "step": 4798 }, { "epoch": 0.9376709652207894, "grad_norm": 10.427175521850586, "learning_rate": 2.1729759898964197e-07, "loss": 8.9847, "step": 4799 }, { "epoch": 0.9378663540445487, "grad_norm": 7.948103904724121, "learning_rate": 2.1598739344744323e-07, "loss": 8.7709, "step": 4800 }, { "epoch": 0.9380617428683079, "grad_norm": 7.93149995803833, "learning_rate": 2.146811067060539e-07, "loss": 9.1597, "step": 4801 }, { "epoch": 0.9382571316920673, "grad_norm": 9.593384742736816, "learning_rate": 2.133787392886788e-07, "loss": 8.5564, "step": 4802 }, { "epoch": 0.9384525205158265, "grad_norm": 7.262961387634277, "learning_rate": 2.1208029171695844e-07, "loss": 8.4633, "step": 4803 }, { "epoch": 0.9386479093395858, "grad_norm": 8.87141227722168, "learning_rate": 2.1078576451096123e-07, "loss": 10.1814, "step": 4804 }, { "epoch": 0.9388432981633451, "grad_norm": 7.445680618286133, "learning_rate": 2.0949515818918465e-07, "loss": 9.209, "step": 4805 }, { "epoch": 0.9390386869871044, "grad_norm": 8.99807357788086, "learning_rate": 2.082084732685574e-07, "loss": 9.5791, "step": 4806 }, { "epoch": 0.9392340758108636, "grad_norm": 7.861563205718994, "learning_rate": 2.0692571026443618e-07, "loss": 8.1887, "step": 4807 }, { "epoch": 0.9394294646346228, "grad_norm": 6.860263824462891, "learning_rate": 2.0564686969060887e-07, "loss": 7.9136, "step": 4808 }, { "epoch": 0.9396248534583822, "grad_norm": 8.453475952148438, "learning_rate": 2.0437195205928906e-07, "loss": 9.5563, "step": 4809 }, { "epoch": 0.9398202422821414, "grad_norm": 8.56126594543457, "learning_rate": 2.0310095788112162e-07, "loss": 9.7925, "step": 4810 }, { "epoch": 0.9400156311059007, "grad_norm": 7.933896064758301, "learning_rate": 2.018338876651782e-07, "loss": 8.5813, "step": 4811 }, { "epoch": 0.94021101992966, "grad_norm": 8.358501434326172, "learning_rate": 2.0057074191896176e-07, "loss": 8.4015, "step": 4812 }, { "epoch": 0.9404064087534193, "grad_norm": 12.378544807434082, "learning_rate": 1.9931152114839868e-07, "loss": 9.8506, "step": 4813 }, { "epoch": 0.9406017975771785, "grad_norm": 14.021265029907227, "learning_rate": 1.9805622585785001e-07, "loss": 10.5824, "step": 4814 }, { "epoch": 0.9407971864009379, "grad_norm": 6.565126419067383, "learning_rate": 1.9680485655009573e-07, "loss": 8.7138, "step": 4815 }, { "epoch": 0.9409925752246971, "grad_norm": 7.936339855194092, "learning_rate": 1.955574137263516e-07, "loss": 9.2013, "step": 4816 }, { "epoch": 0.9411879640484564, "grad_norm": 9.885173797607422, "learning_rate": 1.9431389788625576e-07, "loss": 10.3527, "step": 4817 }, { "epoch": 0.9413833528722158, "grad_norm": 9.101330757141113, "learning_rate": 1.930743095278753e-07, "loss": 9.6166, "step": 4818 }, { "epoch": 0.941578741695975, "grad_norm": 7.584497928619385, "learning_rate": 1.9183864914770312e-07, "loss": 9.5762, "step": 4819 }, { "epoch": 0.9417741305197342, "grad_norm": 9.555315971374512, "learning_rate": 1.9060691724065995e-07, "loss": 10.0103, "step": 4820 }, { "epoch": 0.9419695193434936, "grad_norm": 8.41330337524414, "learning_rate": 1.8937911430009338e-07, "loss": 9.634, "step": 4821 }, { "epoch": 0.9421649081672528, "grad_norm": 9.359399795532227, "learning_rate": 1.8815524081777559e-07, "loss": 9.9597, "step": 4822 }, { "epoch": 0.9423602969910121, "grad_norm": 9.938153266906738, "learning_rate": 1.869352972839067e-07, "loss": 9.2335, "step": 4823 }, { "epoch": 0.9425556858147714, "grad_norm": 11.01329517364502, "learning_rate": 1.8571928418711137e-07, "loss": 9.3612, "step": 4824 }, { "epoch": 0.9427510746385307, "grad_norm": 8.913078308105469, "learning_rate": 1.8450720201444116e-07, "loss": 8.643, "step": 4825 }, { "epoch": 0.9429464634622899, "grad_norm": 9.12736701965332, "learning_rate": 1.8329905125137215e-07, "loss": 8.3215, "step": 4826 }, { "epoch": 0.9431418522860492, "grad_norm": 7.773070812225342, "learning_rate": 1.8209483238180726e-07, "loss": 9.3525, "step": 4827 }, { "epoch": 0.9433372411098085, "grad_norm": 6.683079719543457, "learning_rate": 1.8089454588807066e-07, "loss": 8.8364, "step": 4828 }, { "epoch": 0.9435326299335678, "grad_norm": 7.532355785369873, "learning_rate": 1.7969819225091888e-07, "loss": 9.1896, "step": 4829 }, { "epoch": 0.943728018757327, "grad_norm": 8.087076187133789, "learning_rate": 1.7850577194952535e-07, "loss": 8.8348, "step": 4830 }, { "epoch": 0.9439234075810864, "grad_norm": 7.5128607749938965, "learning_rate": 1.773172854614913e-07, "loss": 8.2127, "step": 4831 }, { "epoch": 0.9441187964048456, "grad_norm": 6.844386100769043, "learning_rate": 1.7613273326284374e-07, "loss": 7.821, "step": 4832 }, { "epoch": 0.9443141852286049, "grad_norm": 8.602503776550293, "learning_rate": 1.7495211582803208e-07, "loss": 9.7718, "step": 4833 }, { "epoch": 0.9445095740523642, "grad_norm": 47.355567932128906, "learning_rate": 1.7377543362993132e-07, "loss": 9.2821, "step": 4834 }, { "epoch": 0.9447049628761235, "grad_norm": 8.016357421875, "learning_rate": 1.7260268713983675e-07, "loss": 8.796, "step": 4835 }, { "epoch": 0.9449003516998827, "grad_norm": 11.182827949523926, "learning_rate": 1.714338768274726e-07, "loss": 9.1631, "step": 4836 }, { "epoch": 0.9450957405236421, "grad_norm": 7.747939109802246, "learning_rate": 1.7026900316098217e-07, "loss": 9.759, "step": 4837 }, { "epoch": 0.9452911293474013, "grad_norm": 7.514883995056152, "learning_rate": 1.6910806660693445e-07, "loss": 8.0, "step": 4838 }, { "epoch": 0.9454865181711606, "grad_norm": 6.813232898712158, "learning_rate": 1.6795106763031867e-07, "loss": 8.5752, "step": 4839 }, { "epoch": 0.9456819069949199, "grad_norm": 9.245199203491211, "learning_rate": 1.6679800669455072e-07, "loss": 9.3135, "step": 4840 }, { "epoch": 0.9458772958186792, "grad_norm": 14.45501708984375, "learning_rate": 1.6564888426146674e-07, "loss": 8.9624, "step": 4841 }, { "epoch": 0.9460726846424384, "grad_norm": 9.25383472442627, "learning_rate": 1.6450370079132637e-07, "loss": 8.9619, "step": 4842 }, { "epoch": 0.9462680734661977, "grad_norm": 7.9999003410339355, "learning_rate": 1.6336245674280937e-07, "loss": 8.6831, "step": 4843 }, { "epoch": 0.946463462289957, "grad_norm": 11.17634391784668, "learning_rate": 1.6222515257302118e-07, "loss": 9.9972, "step": 4844 }, { "epoch": 0.9466588511137163, "grad_norm": 7.812686920166016, "learning_rate": 1.6109178873748855e-07, "loss": 8.5462, "step": 4845 }, { "epoch": 0.9468542399374755, "grad_norm": 7.749689102172852, "learning_rate": 1.59962365690155e-07, "loss": 9.3361, "step": 4846 }, { "epoch": 0.9470496287612349, "grad_norm": 7.081580638885498, "learning_rate": 1.5883688388339314e-07, "loss": 8.4492, "step": 4847 }, { "epoch": 0.9472450175849941, "grad_norm": 9.793357849121094, "learning_rate": 1.5771534376799125e-07, "loss": 9.6255, "step": 4848 }, { "epoch": 0.9474404064087534, "grad_norm": 12.532551765441895, "learning_rate": 1.5659774579316222e-07, "loss": 10.3395, "step": 4849 }, { "epoch": 0.9476357952325127, "grad_norm": 8.414477348327637, "learning_rate": 1.554840904065369e-07, "loss": 9.4367, "step": 4850 }, { "epoch": 0.947831184056272, "grad_norm": 6.619372367858887, "learning_rate": 1.5437437805417178e-07, "loss": 8.6572, "step": 4851 }, { "epoch": 0.9480265728800312, "grad_norm": 6.143033504486084, "learning_rate": 1.532686091805391e-07, "loss": 8.9362, "step": 4852 }, { "epoch": 0.9482219617037906, "grad_norm": 8.467583656311035, "learning_rate": 1.521667842285346e-07, "loss": 8.3852, "step": 4853 }, { "epoch": 0.9484173505275498, "grad_norm": 7.625029563903809, "learning_rate": 1.510689036394719e-07, "loss": 8.4104, "step": 4854 }, { "epoch": 0.9486127393513091, "grad_norm": 6.924171447753906, "learning_rate": 1.4997496785308818e-07, "loss": 8.7127, "step": 4855 }, { "epoch": 0.9488081281750684, "grad_norm": 9.200094223022461, "learning_rate": 1.4888497730753627e-07, "loss": 8.9581, "step": 4856 }, { "epoch": 0.9490035169988277, "grad_norm": 9.852042198181152, "learning_rate": 1.4779893243939358e-07, "loss": 9.4307, "step": 4857 }, { "epoch": 0.9491989058225869, "grad_norm": 8.711514472961426, "learning_rate": 1.467168336836544e-07, "loss": 8.9601, "step": 4858 }, { "epoch": 0.9493942946463463, "grad_norm": 8.813514709472656, "learning_rate": 1.4563868147373095e-07, "loss": 8.983, "step": 4859 }, { "epoch": 0.9495896834701055, "grad_norm": 7.923171520233154, "learning_rate": 1.4456447624145998e-07, "loss": 8.7049, "step": 4860 }, { "epoch": 0.9497850722938648, "grad_norm": 8.254159927368164, "learning_rate": 1.4349421841709067e-07, "loss": 7.9401, "step": 4861 }, { "epoch": 0.949980461117624, "grad_norm": 6.768784999847412, "learning_rate": 1.4242790842929567e-07, "loss": 8.5365, "step": 4862 }, { "epoch": 0.9501758499413834, "grad_norm": 7.572927474975586, "learning_rate": 1.4136554670516667e-07, "loss": 8.3918, "step": 4863 }, { "epoch": 0.9503712387651426, "grad_norm": 11.043039321899414, "learning_rate": 1.4030713367021108e-07, "loss": 9.5904, "step": 4864 }, { "epoch": 0.9505666275889019, "grad_norm": 9.595534324645996, "learning_rate": 1.3925266974835538e-07, "loss": 9.3853, "step": 4865 }, { "epoch": 0.9507620164126612, "grad_norm": 7.735306739807129, "learning_rate": 1.3820215536194725e-07, "loss": 8.9673, "step": 4866 }, { "epoch": 0.9509574052364205, "grad_norm": 9.947039604187012, "learning_rate": 1.37155590931749e-07, "loss": 9.6789, "step": 4867 }, { "epoch": 0.9511527940601797, "grad_norm": 7.425265312194824, "learning_rate": 1.3611297687694312e-07, "loss": 9.7541, "step": 4868 }, { "epoch": 0.9513481828839391, "grad_norm": 7.681835651397705, "learning_rate": 1.350743136151267e-07, "loss": 10.2013, "step": 4869 }, { "epoch": 0.9515435717076983, "grad_norm": 7.884913921356201, "learning_rate": 1.3403960156232022e-07, "loss": 8.9761, "step": 4870 }, { "epoch": 0.9517389605314576, "grad_norm": 7.007309913635254, "learning_rate": 1.3300884113295442e-07, "loss": 8.6048, "step": 4871 }, { "epoch": 0.9519343493552169, "grad_norm": 7.391115188598633, "learning_rate": 1.319820327398824e-07, "loss": 8.8546, "step": 4872 }, { "epoch": 0.9521297381789762, "grad_norm": 8.182116508483887, "learning_rate": 1.3095917679437408e-07, "loss": 9.6435, "step": 4873 }, { "epoch": 0.9523251270027354, "grad_norm": 10.858007431030273, "learning_rate": 1.2994027370611173e-07, "loss": 9.0315, "step": 4874 }, { "epoch": 0.9525205158264948, "grad_norm": 8.331448554992676, "learning_rate": 1.2892532388320113e-07, "loss": 9.6667, "step": 4875 }, { "epoch": 0.952715904650254, "grad_norm": 7.683496952056885, "learning_rate": 1.2791432773215928e-07, "loss": 7.4806, "step": 4876 }, { "epoch": 0.9529112934740133, "grad_norm": 8.721888542175293, "learning_rate": 1.2690728565792232e-07, "loss": 9.2332, "step": 4877 }, { "epoch": 0.9531066822977726, "grad_norm": 7.155975341796875, "learning_rate": 1.259041980638398e-07, "loss": 8.8735, "step": 4878 }, { "epoch": 0.9533020711215319, "grad_norm": 8.131467819213867, "learning_rate": 1.2490506535168368e-07, "loss": 8.2475, "step": 4879 }, { "epoch": 0.9534974599452911, "grad_norm": 8.241068840026855, "learning_rate": 1.2390988792163272e-07, "loss": 9.6212, "step": 4880 }, { "epoch": 0.9536928487690504, "grad_norm": 11.174493789672852, "learning_rate": 1.229186661722903e-07, "loss": 9.0657, "step": 4881 }, { "epoch": 0.9538882375928097, "grad_norm": 13.578412055969238, "learning_rate": 1.2193140050066887e-07, "loss": 8.9438, "step": 4882 }, { "epoch": 0.954083626416569, "grad_norm": 6.047148704528809, "learning_rate": 1.2094809130220097e-07, "loss": 8.6007, "step": 4883 }, { "epoch": 0.9542790152403282, "grad_norm": 6.912959098815918, "learning_rate": 1.1996873897073158e-07, "loss": 7.6259, "step": 4884 }, { "epoch": 0.9544744040640876, "grad_norm": 9.91042423248291, "learning_rate": 1.1899334389852135e-07, "loss": 9.8637, "step": 4885 }, { "epoch": 0.9546697928878468, "grad_norm": 6.9881768226623535, "learning_rate": 1.1802190647624667e-07, "loss": 8.7643, "step": 4886 }, { "epoch": 0.9548651817116061, "grad_norm": 8.107124328613281, "learning_rate": 1.1705442709299852e-07, "loss": 8.6659, "step": 4887 }, { "epoch": 0.9550605705353654, "grad_norm": 8.125308990478516, "learning_rate": 1.1609090613628249e-07, "loss": 8.6355, "step": 4888 }, { "epoch": 0.9552559593591247, "grad_norm": 7.603143692016602, "learning_rate": 1.1513134399201765e-07, "loss": 9.0061, "step": 4889 }, { "epoch": 0.9554513481828839, "grad_norm": 10.955138206481934, "learning_rate": 1.1417574104453877e-07, "loss": 8.4504, "step": 4890 }, { "epoch": 0.9556467370066433, "grad_norm": 9.054905891418457, "learning_rate": 1.1322409767659526e-07, "loss": 8.9073, "step": 4891 }, { "epoch": 0.9558421258304025, "grad_norm": 8.954079627990723, "learning_rate": 1.122764142693511e-07, "loss": 9.7326, "step": 4892 }, { "epoch": 0.9560375146541618, "grad_norm": 7.77183198928833, "learning_rate": 1.1133269120237933e-07, "loss": 8.7141, "step": 4893 }, { "epoch": 0.9562329034779211, "grad_norm": 8.300443649291992, "learning_rate": 1.1039292885367315e-07, "loss": 9.2519, "step": 4894 }, { "epoch": 0.9564282923016804, "grad_norm": 9.766745567321777, "learning_rate": 1.0945712759963478e-07, "loss": 9.4207, "step": 4895 }, { "epoch": 0.9566236811254396, "grad_norm": 8.486104965209961, "learning_rate": 1.0852528781508442e-07, "loss": 9.179, "step": 4896 }, { "epoch": 0.9568190699491989, "grad_norm": 7.6745429039001465, "learning_rate": 1.0759740987325017e-07, "loss": 9.4932, "step": 4897 }, { "epoch": 0.9570144587729582, "grad_norm": 9.365317344665527, "learning_rate": 1.0667349414577588e-07, "loss": 8.9638, "step": 4898 }, { "epoch": 0.9572098475967175, "grad_norm": 28.076303482055664, "learning_rate": 1.0575354100272106e-07, "loss": 9.2634, "step": 4899 }, { "epoch": 0.9574052364204767, "grad_norm": 9.686136245727539, "learning_rate": 1.0483755081255209e-07, "loss": 8.7086, "step": 4900 }, { "epoch": 0.9576006252442361, "grad_norm": 8.524884223937988, "learning_rate": 1.0392552394215438e-07, "loss": 9.3499, "step": 4901 }, { "epoch": 0.9577960140679953, "grad_norm": 7.558854103088379, "learning_rate": 1.0301746075682128e-07, "loss": 9.2694, "step": 4902 }, { "epoch": 0.9579914028917546, "grad_norm": 16.856613159179688, "learning_rate": 1.0211336162026187e-07, "loss": 8.7741, "step": 4903 }, { "epoch": 0.9581867917155139, "grad_norm": 8.923369407653809, "learning_rate": 1.0121322689459422e-07, "loss": 8.8212, "step": 4904 }, { "epoch": 0.9583821805392732, "grad_norm": 6.723689079284668, "learning_rate": 1.003170569403511e-07, "loss": 8.2187, "step": 4905 }, { "epoch": 0.9585775693630324, "grad_norm": 9.944154739379883, "learning_rate": 9.942485211647645e-08, "loss": 9.0431, "step": 4906 }, { "epoch": 0.9587729581867918, "grad_norm": 9.219122886657715, "learning_rate": 9.853661278032556e-08, "loss": 9.4427, "step": 4907 }, { "epoch": 0.958968347010551, "grad_norm": 8.637513160705566, "learning_rate": 9.765233928766493e-08, "loss": 7.9597, "step": 4908 }, { "epoch": 0.9591637358343102, "grad_norm": 6.749722957611084, "learning_rate": 9.677203199267682e-08, "loss": 8.2952, "step": 4909 }, { "epoch": 0.9593591246580696, "grad_norm": 22.5472412109375, "learning_rate": 9.589569124794918e-08, "loss": 8.8987, "step": 4910 }, { "epoch": 0.9595545134818289, "grad_norm": 7.759912490844727, "learning_rate": 9.502331740448233e-08, "loss": 9.3352, "step": 4911 }, { "epoch": 0.9597499023055881, "grad_norm": 8.938230514526367, "learning_rate": 9.415491081169237e-08, "loss": 9.2042, "step": 4912 }, { "epoch": 0.9599452911293475, "grad_norm": 8.625635147094727, "learning_rate": 9.329047181740103e-08, "loss": 9.4016, "step": 4913 }, { "epoch": 0.9601406799531067, "grad_norm": 8.247848510742188, "learning_rate": 9.243000076784359e-08, "loss": 8.846, "step": 4914 }, { "epoch": 0.960336068776866, "grad_norm": 7.982619285583496, "learning_rate": 9.157349800766435e-08, "loss": 8.2125, "step": 4915 }, { "epoch": 0.9605314576006252, "grad_norm": 10.335287094116211, "learning_rate": 9.072096387992113e-08, "loss": 8.868, "step": 4916 }, { "epoch": 0.9607268464243846, "grad_norm": 8.667943954467773, "learning_rate": 8.987239872607856e-08, "loss": 9.4956, "step": 4917 }, { "epoch": 0.9609222352481438, "grad_norm": 11.45176887512207, "learning_rate": 8.902780288601587e-08, "loss": 10.3252, "step": 4918 }, { "epoch": 0.961117624071903, "grad_norm": 7.204813480377197, "learning_rate": 8.818717669801691e-08, "loss": 8.4043, "step": 4919 }, { "epoch": 0.9613130128956624, "grad_norm": 8.619112014770508, "learning_rate": 8.735052049878012e-08, "loss": 9.1276, "step": 4920 }, { "epoch": 0.9615084017194216, "grad_norm": 10.103872299194336, "learning_rate": 8.651783462341078e-08, "loss": 9.4011, "step": 4921 }, { "epoch": 0.9617037905431809, "grad_norm": 8.542306900024414, "learning_rate": 8.568911940542879e-08, "loss": 8.6925, "step": 4922 }, { "epoch": 0.9618991793669402, "grad_norm": 11.84721851348877, "learning_rate": 8.48643751767575e-08, "loss": 9.5716, "step": 4923 }, { "epoch": 0.9620945681906995, "grad_norm": 7.118066787719727, "learning_rate": 8.404360226773267e-08, "loss": 8.3886, "step": 4924 }, { "epoch": 0.9622899570144587, "grad_norm": 6.681229114532471, "learning_rate": 8.322680100710023e-08, "loss": 8.4012, "step": 4925 }, { "epoch": 0.9624853458382181, "grad_norm": 8.622283935546875, "learning_rate": 8.241397172201405e-08, "loss": 9.1808, "step": 4926 }, { "epoch": 0.9626807346619773, "grad_norm": 12.33980941772461, "learning_rate": 8.160511473803701e-08, "loss": 8.9697, "step": 4927 }, { "epoch": 0.9628761234857366, "grad_norm": 8.637188911437988, "learning_rate": 8.080023037914331e-08, "loss": 9.0236, "step": 4928 }, { "epoch": 0.963071512309496, "grad_norm": 10.9049072265625, "learning_rate": 7.999931896771284e-08, "loss": 9.8126, "step": 4929 }, { "epoch": 0.9632669011332552, "grad_norm": 7.670109748840332, "learning_rate": 7.920238082453457e-08, "loss": 8.4132, "step": 4930 }, { "epoch": 0.9634622899570144, "grad_norm": 17.906023025512695, "learning_rate": 7.840941626880871e-08, "loss": 9.3315, "step": 4931 }, { "epoch": 0.9636576787807737, "grad_norm": 6.756670951843262, "learning_rate": 7.762042561814009e-08, "loss": 8.0376, "step": 4932 }, { "epoch": 0.963853067604533, "grad_norm": 8.018085479736328, "learning_rate": 7.683540918854593e-08, "loss": 9.0674, "step": 4933 }, { "epoch": 0.9640484564282923, "grad_norm": 7.3927788734436035, "learning_rate": 7.605436729444915e-08, "loss": 9.0571, "step": 4934 }, { "epoch": 0.9642438452520515, "grad_norm": 9.25537109375, "learning_rate": 7.527730024868062e-08, "loss": 9.1265, "step": 4935 }, { "epoch": 0.9644392340758109, "grad_norm": 9.464190483093262, "learning_rate": 7.450420836248029e-08, "loss": 9.5791, "step": 4936 }, { "epoch": 0.9646346228995701, "grad_norm": 8.062870979309082, "learning_rate": 7.373509194549599e-08, "loss": 9.1729, "step": 4937 }, { "epoch": 0.9648300117233294, "grad_norm": 8.228679656982422, "learning_rate": 7.296995130578243e-08, "loss": 8.4348, "step": 4938 }, { "epoch": 0.9650254005470887, "grad_norm": 8.645241737365723, "learning_rate": 7.220878674980225e-08, "loss": 9.5973, "step": 4939 }, { "epoch": 0.965220789370848, "grad_norm": 7.164260387420654, "learning_rate": 7.145159858242379e-08, "loss": 8.5819, "step": 4940 }, { "epoch": 0.9654161781946072, "grad_norm": 8.89592456817627, "learning_rate": 7.069838710692666e-08, "loss": 9.1587, "step": 4941 }, { "epoch": 0.9656115670183666, "grad_norm": 9.88491439819336, "learning_rate": 6.994915262499513e-08, "loss": 8.8914, "step": 4942 }, { "epoch": 0.9658069558421258, "grad_norm": 11.146038055419922, "learning_rate": 6.920389543671913e-08, "loss": 9.8806, "step": 4943 }, { "epoch": 0.9660023446658851, "grad_norm": 8.83292293548584, "learning_rate": 6.846261584059988e-08, "loss": 8.9843, "step": 4944 }, { "epoch": 0.9661977334896444, "grad_norm": 8.25486946105957, "learning_rate": 6.772531413353989e-08, "loss": 7.6047, "step": 4945 }, { "epoch": 0.9663931223134037, "grad_norm": 7.636203289031982, "learning_rate": 6.699199061085515e-08, "loss": 9.1152, "step": 4946 }, { "epoch": 0.9665885111371629, "grad_norm": 8.359781265258789, "learning_rate": 6.626264556626071e-08, "loss": 8.99, "step": 4947 }, { "epoch": 0.9667838999609223, "grad_norm": 8.841544151306152, "learning_rate": 6.553727929188403e-08, "loss": 8.7749, "step": 4948 }, { "epoch": 0.9669792887846815, "grad_norm": 8.096626281738281, "learning_rate": 6.4815892078256e-08, "loss": 9.272, "step": 4949 }, { "epoch": 0.9671746776084408, "grad_norm": 8.442158699035645, "learning_rate": 6.409848421431553e-08, "loss": 9.2086, "step": 4950 }, { "epoch": 0.9673700664322, "grad_norm": 14.558295249938965, "learning_rate": 6.338505598740608e-08, "loss": 9.3925, "step": 4951 }, { "epoch": 0.9675654552559594, "grad_norm": 8.240745544433594, "learning_rate": 6.267560768327796e-08, "loss": 8.2754, "step": 4952 }, { "epoch": 0.9677608440797186, "grad_norm": 8.812482833862305, "learning_rate": 6.197013958608611e-08, "loss": 8.1315, "step": 4953 }, { "epoch": 0.9679562329034779, "grad_norm": 7.7007155418396, "learning_rate": 6.126865197839449e-08, "loss": 9.099, "step": 4954 }, { "epoch": 0.9681516217272372, "grad_norm": 7.753499984741211, "learning_rate": 6.057114514116947e-08, "loss": 8.4374, "step": 4955 }, { "epoch": 0.9683470105509965, "grad_norm": 7.419507026672363, "learning_rate": 5.987761935378422e-08, "loss": 8.917, "step": 4956 }, { "epoch": 0.9685423993747557, "grad_norm": 7.506134986877441, "learning_rate": 5.918807489401768e-08, "loss": 9.2826, "step": 4957 }, { "epoch": 0.9687377881985151, "grad_norm": 7.759176254272461, "learning_rate": 5.8502512038053347e-08, "loss": 8.6572, "step": 4958 }, { "epoch": 0.9689331770222743, "grad_norm": 8.795549392700195, "learning_rate": 5.782093106048159e-08, "loss": 9.0967, "step": 4959 }, { "epoch": 0.9691285658460336, "grad_norm": 8.271385192871094, "learning_rate": 5.714333223429402e-08, "loss": 8.7773, "step": 4960 }, { "epoch": 0.9693239546697929, "grad_norm": 7.687859058380127, "learning_rate": 5.6469715830893556e-08, "loss": 8.9301, "step": 4961 }, { "epoch": 0.9695193434935522, "grad_norm": 11.383176803588867, "learning_rate": 5.5800082120082143e-08, "loss": 9.0446, "step": 4962 }, { "epoch": 0.9697147323173114, "grad_norm": 9.46120548248291, "learning_rate": 5.5134431370069685e-08, "loss": 9.951, "step": 4963 }, { "epoch": 0.9699101211410708, "grad_norm": 9.806598663330078, "learning_rate": 5.447276384746958e-08, "loss": 8.7027, "step": 4964 }, { "epoch": 0.97010550996483, "grad_norm": 8.101316452026367, "learning_rate": 5.3815079817300944e-08, "loss": 8.0066, "step": 4965 }, { "epoch": 0.9703008987885893, "grad_norm": 9.437013626098633, "learning_rate": 5.316137954298528e-08, "loss": 9.5399, "step": 4966 }, { "epoch": 0.9704962876123485, "grad_norm": 11.781790733337402, "learning_rate": 5.251166328635093e-08, "loss": 9.9803, "step": 4967 }, { "epoch": 0.9706916764361079, "grad_norm": 8.37763786315918, "learning_rate": 5.186593130762974e-08, "loss": 8.733, "step": 4968 }, { "epoch": 0.9708870652598671, "grad_norm": 7.655576705932617, "learning_rate": 5.122418386545591e-08, "loss": 8.9133, "step": 4969 }, { "epoch": 0.9710824540836264, "grad_norm": 7.902202129364014, "learning_rate": 5.058642121687163e-08, "loss": 8.4583, "step": 4970 }, { "epoch": 0.9712778429073857, "grad_norm": 8.114288330078125, "learning_rate": 4.995264361731811e-08, "loss": 8.8281, "step": 4971 }, { "epoch": 0.971473231731145, "grad_norm": 9.332640647888184, "learning_rate": 4.9322851320644514e-08, "loss": 9.5133, "step": 4972 }, { "epoch": 0.9716686205549042, "grad_norm": 8.73367691040039, "learning_rate": 4.8697044579101295e-08, "loss": 7.8143, "step": 4973 }, { "epoch": 0.9718640093786636, "grad_norm": 10.668346405029297, "learning_rate": 4.8075223643343494e-08, "loss": 10.1966, "step": 4974 }, { "epoch": 0.9720593982024228, "grad_norm": 11.458003044128418, "learning_rate": 4.745738876243078e-08, "loss": 8.8785, "step": 4975 }, { "epoch": 0.9722547870261821, "grad_norm": 7.601654052734375, "learning_rate": 4.684354018382409e-08, "loss": 7.8087, "step": 4976 }, { "epoch": 0.9724501758499414, "grad_norm": 8.562968254089355, "learning_rate": 4.623367815339008e-08, "loss": 9.594, "step": 4977 }, { "epoch": 0.9726455646737007, "grad_norm": 7.967249870300293, "learning_rate": 4.562780291539559e-08, "loss": 8.6013, "step": 4978 }, { "epoch": 0.9728409534974599, "grad_norm": 10.01623821258545, "learning_rate": 4.5025914712514276e-08, "loss": 9.4209, "step": 4979 }, { "epoch": 0.9730363423212193, "grad_norm": 6.892254829406738, "learning_rate": 4.4428013785819954e-08, "loss": 9.9811, "step": 4980 }, { "epoch": 0.9732317311449785, "grad_norm": 7.683324813842773, "learning_rate": 4.3834100374791075e-08, "loss": 8.6577, "step": 4981 }, { "epoch": 0.9734271199687378, "grad_norm": 8.40973949432373, "learning_rate": 4.3244174717308464e-08, "loss": 9.6727, "step": 4982 }, { "epoch": 0.9736225087924971, "grad_norm": 8.25219440460205, "learning_rate": 4.2658237049655325e-08, "loss": 8.2001, "step": 4983 }, { "epoch": 0.9738178976162564, "grad_norm": 7.207298755645752, "learning_rate": 4.2076287606517275e-08, "loss": 8.6362, "step": 4984 }, { "epoch": 0.9740132864400156, "grad_norm": 11.115377426147461, "learning_rate": 4.149832662098452e-08, "loss": 9.2928, "step": 4985 }, { "epoch": 0.9742086752637749, "grad_norm": 8.779725074768066, "learning_rate": 4.0924354324546335e-08, "loss": 8.5166, "step": 4986 }, { "epoch": 0.9744040640875342, "grad_norm": 10.163342475891113, "learning_rate": 4.035437094709882e-08, "loss": 10.2392, "step": 4987 }, { "epoch": 0.9745994529112935, "grad_norm": 7.949179172515869, "learning_rate": 3.9788376716936025e-08, "loss": 8.439, "step": 4988 }, { "epoch": 0.9747948417350527, "grad_norm": 6.725371360778809, "learning_rate": 3.922637186075884e-08, "loss": 8.2117, "step": 4989 }, { "epoch": 0.9749902305588121, "grad_norm": 8.798787117004395, "learning_rate": 3.866835660366386e-08, "loss": 9.3537, "step": 4990 }, { "epoch": 0.9751856193825713, "grad_norm": 12.254704475402832, "learning_rate": 3.811433116915675e-08, "loss": 9.1208, "step": 4991 }, { "epoch": 0.9753810082063306, "grad_norm": 8.929437637329102, "learning_rate": 3.7564295779141115e-08, "loss": 9.5874, "step": 4992 }, { "epoch": 0.9755763970300899, "grad_norm": 7.432225227355957, "learning_rate": 3.701825065392184e-08, "loss": 8.3311, "step": 4993 }, { "epoch": 0.9757717858538492, "grad_norm": 9.744684219360352, "learning_rate": 3.647619601220953e-08, "loss": 9.9649, "step": 4994 }, { "epoch": 0.9759671746776084, "grad_norm": 7.140275955200195, "learning_rate": 3.593813207111052e-08, "loss": 8.2258, "step": 4995 }, { "epoch": 0.9761625635013678, "grad_norm": 10.876626014709473, "learning_rate": 3.540405904613908e-08, "loss": 9.3578, "step": 4996 }, { "epoch": 0.976357952325127, "grad_norm": 7.864413738250732, "learning_rate": 3.487397715120633e-08, "loss": 9.3616, "step": 4997 }, { "epoch": 0.9765533411488863, "grad_norm": 7.5049638748168945, "learning_rate": 3.4347886598626865e-08, "loss": 9.0532, "step": 4998 }, { "epoch": 0.9767487299726456, "grad_norm": 15.398046493530273, "learning_rate": 3.382578759911659e-08, "loss": 9.5909, "step": 4999 }, { "epoch": 0.9769441187964049, "grad_norm": 7.960659503936768, "learning_rate": 3.330768036179266e-08, "loss": 9.2443, "step": 5000 }, { "epoch": 0.9771395076201641, "grad_norm": 10.92277717590332, "learning_rate": 3.279356509417242e-08, "loss": 8.6086, "step": 5001 }, { "epoch": 0.9773348964439235, "grad_norm": 7.6711039543151855, "learning_rate": 3.228344200217559e-08, "loss": 9.291, "step": 5002 }, { "epoch": 0.9775302852676827, "grad_norm": 6.7085795402526855, "learning_rate": 3.177731129012207e-08, "loss": 8.5472, "step": 5003 }, { "epoch": 0.977725674091442, "grad_norm": 8.609145164489746, "learning_rate": 3.127517316073303e-08, "loss": 9.4066, "step": 5004 }, { "epoch": 0.9779210629152012, "grad_norm": 7.209899425506592, "learning_rate": 3.07770278151287e-08, "loss": 8.5165, "step": 5005 }, { "epoch": 0.9781164517389606, "grad_norm": 9.901558876037598, "learning_rate": 3.0282875452835034e-08, "loss": 8.6032, "step": 5006 }, { "epoch": 0.9783118405627198, "grad_norm": 8.552497863769531, "learning_rate": 2.9792716271772605e-08, "loss": 9.3627, "step": 5007 }, { "epoch": 0.978507229386479, "grad_norm": 9.150060653686523, "learning_rate": 2.9306550468266582e-08, "loss": 9.2595, "step": 5008 }, { "epoch": 0.9787026182102384, "grad_norm": 7.365716934204102, "learning_rate": 2.8824378237042318e-08, "loss": 9.1018, "step": 5009 }, { "epoch": 0.9788980070339977, "grad_norm": 7.778922080993652, "learning_rate": 2.8346199771221995e-08, "loss": 8.7822, "step": 5010 }, { "epoch": 0.9790933958577569, "grad_norm": 8.252825736999512, "learning_rate": 2.7872015262334628e-08, "loss": 9.1626, "step": 5011 }, { "epoch": 0.9792887846815163, "grad_norm": 10.327372550964355, "learning_rate": 2.7401824900302743e-08, "loss": 9.7747, "step": 5012 }, { "epoch": 0.9794841735052755, "grad_norm": 11.820197105407715, "learning_rate": 2.6935628873453468e-08, "loss": 9.6602, "step": 5013 }, { "epoch": 0.9796795623290347, "grad_norm": 83.55939483642578, "learning_rate": 2.6473427368511883e-08, "loss": 10.351, "step": 5014 }, { "epoch": 0.9798749511527941, "grad_norm": 12.766364097595215, "learning_rate": 2.6015220570605458e-08, "loss": 9.6129, "step": 5015 }, { "epoch": 0.9800703399765534, "grad_norm": 10.860968589782715, "learning_rate": 2.5561008663259613e-08, "loss": 9.4036, "step": 5016 }, { "epoch": 0.9802657288003126, "grad_norm": 9.887362480163574, "learning_rate": 2.5110791828399928e-08, "loss": 8.7792, "step": 5017 }, { "epoch": 0.980461117624072, "grad_norm": 9.302169799804688, "learning_rate": 2.466457024635327e-08, "loss": 8.8441, "step": 5018 }, { "epoch": 0.9806565064478312, "grad_norm": 9.141999244689941, "learning_rate": 2.4222344095844452e-08, "loss": 8.8272, "step": 5019 }, { "epoch": 0.9808518952715904, "grad_norm": 9.363895416259766, "learning_rate": 2.3784113553998454e-08, "loss": 9.4373, "step": 5020 }, { "epoch": 0.9810472840953497, "grad_norm": 11.084437370300293, "learning_rate": 2.334987879634043e-08, "loss": 9.3966, "step": 5021 }, { "epoch": 0.981242672919109, "grad_norm": 14.619644165039062, "learning_rate": 2.2919639996795695e-08, "loss": 9.4309, "step": 5022 }, { "epoch": 0.9814380617428683, "grad_norm": 7.190587520599365, "learning_rate": 2.2493397327686404e-08, "loss": 8.5346, "step": 5023 }, { "epoch": 0.9816334505666275, "grad_norm": 9.363481521606445, "learning_rate": 2.207115095973822e-08, "loss": 9.0853, "step": 5024 }, { "epoch": 0.9818288393903869, "grad_norm": 9.095197677612305, "learning_rate": 2.1652901062072517e-08, "loss": 8.4736, "step": 5025 }, { "epoch": 0.9820242282141461, "grad_norm": 8.333402633666992, "learning_rate": 2.1238647802213075e-08, "loss": 9.3244, "step": 5026 }, { "epoch": 0.9822196170379054, "grad_norm": 26.559803009033203, "learning_rate": 2.082839134607828e-08, "loss": 8.7026, "step": 5027 }, { "epoch": 0.9824150058616647, "grad_norm": 11.467344284057617, "learning_rate": 2.0422131857991133e-08, "loss": 9.5603, "step": 5028 }, { "epoch": 0.982610394685424, "grad_norm": 8.219799995422363, "learning_rate": 2.0019869500669255e-08, "loss": 9.8476, "step": 5029 }, { "epoch": 0.9828057835091832, "grad_norm": 13.808085441589355, "learning_rate": 1.9621604435232645e-08, "loss": 8.0784, "step": 5030 }, { "epoch": 0.9830011723329426, "grad_norm": 8.613082885742188, "learning_rate": 1.922733682119815e-08, "loss": 8.7472, "step": 5031 }, { "epoch": 0.9831965611567018, "grad_norm": 21.057920455932617, "learning_rate": 1.8837066816482785e-08, "loss": 8.6952, "step": 5032 }, { "epoch": 0.9833919499804611, "grad_norm": 9.525463104248047, "learning_rate": 1.845079457740151e-08, "loss": 8.8848, "step": 5033 }, { "epoch": 0.9835873388042204, "grad_norm": 9.992208480834961, "learning_rate": 1.8068520258667233e-08, "loss": 8.8986, "step": 5034 }, { "epoch": 0.9837827276279797, "grad_norm": 14.523115158081055, "learning_rate": 1.7690244013394142e-08, "loss": 9.344, "step": 5035 }, { "epoch": 0.9839781164517389, "grad_norm": 8.562339782714844, "learning_rate": 1.7315965993093264e-08, "loss": 9.6286, "step": 5036 }, { "epoch": 0.9841735052754983, "grad_norm": 8.526247024536133, "learning_rate": 1.6945686347674683e-08, "loss": 9.0399, "step": 5037 }, { "epoch": 0.9843688940992575, "grad_norm": 10.577889442443848, "learning_rate": 1.657940522544643e-08, "loss": 9.2172, "step": 5038 }, { "epoch": 0.9845642829230168, "grad_norm": 9.276185035705566, "learning_rate": 1.6217122773115603e-08, "loss": 7.9005, "step": 5039 }, { "epoch": 0.984759671746776, "grad_norm": 7.138728141784668, "learning_rate": 1.585883913578834e-08, "loss": 8.8503, "step": 5040 }, { "epoch": 0.9849550605705354, "grad_norm": 7.980425834655762, "learning_rate": 1.5504554456966525e-08, "loss": 7.087, "step": 5041 }, { "epoch": 0.9851504493942946, "grad_norm": 11.567042350769043, "learning_rate": 1.5154268878554422e-08, "loss": 9.2517, "step": 5042 }, { "epoch": 0.9853458382180539, "grad_norm": 8.005668640136719, "learning_rate": 1.4807982540850919e-08, "loss": 9.2925, "step": 5043 }, { "epoch": 0.9855412270418132, "grad_norm": 9.739407539367676, "learning_rate": 1.4465695582553951e-08, "loss": 9.5434, "step": 5044 }, { "epoch": 0.9857366158655725, "grad_norm": 8.46045970916748, "learning_rate": 1.4127408140761633e-08, "loss": 9.3594, "step": 5045 }, { "epoch": 0.9859320046893317, "grad_norm": 6.429606914520264, "learning_rate": 1.3793120350967804e-08, "loss": 8.3392, "step": 5046 }, { "epoch": 0.9861273935130911, "grad_norm": 9.116524696350098, "learning_rate": 1.3462832347065358e-08, "loss": 9.0163, "step": 5047 }, { "epoch": 0.9863227823368503, "grad_norm": 8.624211311340332, "learning_rate": 1.3136544261344031e-08, "loss": 9.3884, "step": 5048 }, { "epoch": 0.9865181711606096, "grad_norm": 8.262917518615723, "learning_rate": 1.2814256224493727e-08, "loss": 10.0031, "step": 5049 }, { "epoch": 0.9867135599843689, "grad_norm": 7.221947193145752, "learning_rate": 1.2495968365598964e-08, "loss": 8.3295, "step": 5050 }, { "epoch": 0.9869089488081282, "grad_norm": 7.57441520690918, "learning_rate": 1.2181680812145546e-08, "loss": 9.3922, "step": 5051 }, { "epoch": 0.9871043376318874, "grad_norm": 7.57441520690918, "learning_rate": 1.2181680812145546e-08, "loss": 9.2995, "step": 5052 }, { "epoch": 0.9872997264556468, "grad_norm": 8.46338939666748, "learning_rate": 1.1871393690014998e-08, "loss": 8.8303, "step": 5053 }, { "epoch": 0.987495115279406, "grad_norm": 9.874073028564453, "learning_rate": 1.1565107123486797e-08, "loss": 8.1657, "step": 5054 }, { "epoch": 0.9876905041031653, "grad_norm": 10.480962753295898, "learning_rate": 1.1262821235237253e-08, "loss": 9.8809, "step": 5055 }, { "epoch": 0.9878858929269245, "grad_norm": 9.544706344604492, "learning_rate": 1.0964536146342852e-08, "loss": 9.468, "step": 5056 }, { "epoch": 0.9880812817506839, "grad_norm": 9.673017501831055, "learning_rate": 1.0670251976275803e-08, "loss": 8.9854, "step": 5057 }, { "epoch": 0.9882766705744431, "grad_norm": 56.37615966796875, "learning_rate": 1.0379968842905153e-08, "loss": 9.9719, "step": 5058 }, { "epoch": 0.9884720593982024, "grad_norm": 7.474615573883057, "learning_rate": 1.0093686862499008e-08, "loss": 9.6783, "step": 5059 }, { "epoch": 0.9886674482219617, "grad_norm": 7.644552230834961, "learning_rate": 9.811406149721203e-09, "loss": 9.8834, "step": 5060 }, { "epoch": 0.988862837045721, "grad_norm": 9.560027122497559, "learning_rate": 9.533126817635741e-09, "loss": 9.0952, "step": 5061 }, { "epoch": 0.9890582258694802, "grad_norm": 8.46674919128418, "learning_rate": 9.25884897770013e-09, "loss": 8.96, "step": 5062 }, { "epoch": 0.9892536146932396, "grad_norm": 9.916620254516602, "learning_rate": 8.98857273977316e-09, "loss": 8.7321, "step": 5063 }, { "epoch": 0.9894490035169988, "grad_norm": 8.093708038330078, "learning_rate": 8.722298212107128e-09, "loss": 8.8145, "step": 5064 }, { "epoch": 0.9896443923407581, "grad_norm": 6.903542995452881, "learning_rate": 8.460025501353386e-09, "loss": 7.712, "step": 5065 }, { "epoch": 0.9898397811645174, "grad_norm": 7.556875705718994, "learning_rate": 8.201754712561238e-09, "loss": 8.5463, "step": 5066 }, { "epoch": 0.9900351699882767, "grad_norm": 7.805353164672852, "learning_rate": 7.947485949175715e-09, "loss": 8.5967, "step": 5067 }, { "epoch": 0.9902305588120359, "grad_norm": 8.18213176727295, "learning_rate": 7.697219313039794e-09, "loss": 8.5686, "step": 5068 }, { "epoch": 0.9904259476357953, "grad_norm": 9.124711990356445, "learning_rate": 7.450954904392182e-09, "loss": 9.3234, "step": 5069 }, { "epoch": 0.9906213364595545, "grad_norm": 8.662633895874023, "learning_rate": 7.2086928218706446e-09, "loss": 7.7065, "step": 5070 }, { "epoch": 0.9908167252833138, "grad_norm": 7.768984794616699, "learning_rate": 6.970433162507562e-09, "loss": 8.3612, "step": 5071 }, { "epoch": 0.9910121141070731, "grad_norm": 8.390610694885254, "learning_rate": 6.7361760217332654e-09, "loss": 8.2334, "step": 5072 }, { "epoch": 0.9912075029308324, "grad_norm": 7.7365312576293945, "learning_rate": 6.505921493374923e-09, "loss": 8.4446, "step": 5073 }, { "epoch": 0.9914028917545916, "grad_norm": 8.441004753112793, "learning_rate": 6.279669669657651e-09, "loss": 9.3394, "step": 5074 }, { "epoch": 0.9915982805783509, "grad_norm": 10.001994132995605, "learning_rate": 6.057420641202294e-09, "loss": 8.4434, "step": 5075 }, { "epoch": 0.9917936694021102, "grad_norm": 8.94590950012207, "learning_rate": 5.839174497025424e-09, "loss": 8.8657, "step": 5076 }, { "epoch": 0.9919890582258695, "grad_norm": 8.118600845336914, "learning_rate": 5.624931324541561e-09, "loss": 8.8006, "step": 5077 }, { "epoch": 0.9921844470496287, "grad_norm": 10.194698333740234, "learning_rate": 5.414691209563172e-09, "loss": 9.4743, "step": 5078 }, { "epoch": 0.9923798358733881, "grad_norm": 8.967790603637695, "learning_rate": 5.208454236296234e-09, "loss": 9.1476, "step": 5079 }, { "epoch": 0.9925752246971473, "grad_norm": 7.312407970428467, "learning_rate": 5.006220487345781e-09, "loss": 8.651, "step": 5080 }, { "epoch": 0.9927706135209066, "grad_norm": 6.785313606262207, "learning_rate": 4.807990043712574e-09, "loss": 7.4504, "step": 5081 }, { "epoch": 0.9929660023446659, "grad_norm": 10.432636260986328, "learning_rate": 4.613762984794212e-09, "loss": 8.8484, "step": 5082 }, { "epoch": 0.9931613911684252, "grad_norm": 8.083576202392578, "learning_rate": 4.423539388385134e-09, "loss": 9.9983, "step": 5083 }, { "epoch": 0.9933567799921844, "grad_norm": 7.688929080963135, "learning_rate": 4.237319330674394e-09, "loss": 9.4568, "step": 5084 }, { "epoch": 0.9935521688159438, "grad_norm": 8.144380569458008, "learning_rate": 4.055102886250106e-09, "loss": 8.8193, "step": 5085 }, { "epoch": 0.993747557639703, "grad_norm": 10.60041332244873, "learning_rate": 3.876890128095001e-09, "loss": 8.3073, "step": 5086 }, { "epoch": 0.9939429464634623, "grad_norm": 6.922112464904785, "learning_rate": 3.702681127589758e-09, "loss": 8.6703, "step": 5087 }, { "epoch": 0.9941383352872216, "grad_norm": 7.471693992614746, "learning_rate": 3.532475954508563e-09, "loss": 8.9103, "step": 5088 }, { "epoch": 0.9943337241109809, "grad_norm": 10.693000793457031, "learning_rate": 3.3662746770268816e-09, "loss": 9.3337, "step": 5089 }, { "epoch": 0.9945291129347401, "grad_norm": 9.791496276855469, "learning_rate": 3.204077361710356e-09, "loss": 9.2034, "step": 5090 }, { "epoch": 0.9947245017584995, "grad_norm": 9.332125663757324, "learning_rate": 3.0458840735259065e-09, "loss": 8.6006, "step": 5091 }, { "epoch": 0.9949198905822587, "grad_norm": 8.35640811920166, "learning_rate": 2.8916948758339613e-09, "loss": 9.2044, "step": 5092 }, { "epoch": 0.995115279406018, "grad_norm": 9.399685859680176, "learning_rate": 2.7415098303928966e-09, "loss": 8.4741, "step": 5093 }, { "epoch": 0.9953106682297772, "grad_norm": 7.687415599822998, "learning_rate": 2.5953289973568164e-09, "loss": 8.0641, "step": 5094 }, { "epoch": 0.9955060570535366, "grad_norm": 9.725567817687988, "learning_rate": 2.4531524352744417e-09, "loss": 8.7922, "step": 5095 }, { "epoch": 0.9957014458772958, "grad_norm": 8.47549819946289, "learning_rate": 2.3149802010913323e-09, "loss": 9.0966, "step": 5096 }, { "epoch": 0.9958968347010551, "grad_norm": 8.40412425994873, "learning_rate": 2.180812350152106e-09, "loss": 10.0462, "step": 5097 }, { "epoch": 0.9960922235248144, "grad_norm": 9.487896919250488, "learning_rate": 2.0506489361937776e-09, "loss": 9.3278, "step": 5098 }, { "epoch": 0.9962876123485737, "grad_norm": 7.810408592224121, "learning_rate": 1.9244900113501996e-09, "loss": 8.3744, "step": 5099 }, { "epoch": 0.9964830011723329, "grad_norm": 11.756729125976562, "learning_rate": 1.8023356261531732e-09, "loss": 9.7681, "step": 5100 }, { "epoch": 0.9966783899960923, "grad_norm": 8.442209243774414, "learning_rate": 1.6841858295291169e-09, "loss": 9.5311, "step": 5101 }, { "epoch": 0.9968737788198515, "grad_norm": 9.346763610839844, "learning_rate": 1.5700406688001768e-09, "loss": 9.0829, "step": 5102 }, { "epoch": 0.9970691676436108, "grad_norm": 7.547781944274902, "learning_rate": 1.4599001896842268e-09, "loss": 7.8833, "step": 5103 }, { "epoch": 0.9972645564673701, "grad_norm": 8.288264274597168, "learning_rate": 1.3537644362981995e-09, "loss": 9.1846, "step": 5104 }, { "epoch": 0.9974599452911294, "grad_norm": 6.996792793273926, "learning_rate": 1.2516334511514238e-09, "loss": 8.4772, "step": 5105 }, { "epoch": 0.9976553341148886, "grad_norm": 9.186427116394043, "learning_rate": 1.1535072751500676e-09, "loss": 9.8799, "step": 5106 }, { "epoch": 0.997850722938648, "grad_norm": 7.869745254516602, "learning_rate": 1.0593859475982459e-09, "loss": 9.2626, "step": 5107 }, { "epoch": 0.9980461117624072, "grad_norm": 7.204585552215576, "learning_rate": 9.692695061924717e-10, "loss": 8.7089, "step": 5108 }, { "epoch": 0.9982415005861665, "grad_norm": 9.824396133422852, "learning_rate": 8.831579870283158e-10, "loss": 9.2606, "step": 5109 }, { "epoch": 0.9984368894099257, "grad_norm": 8.864141464233398, "learning_rate": 8.010514245970769e-10, "loss": 8.4175, "step": 5110 }, { "epoch": 0.9986322782336851, "grad_norm": 9.267828941345215, "learning_rate": 7.22949851783561e-10, "loss": 9.4053, "step": 5111 }, { "epoch": 0.9988276670574443, "grad_norm": 7.050810813903809, "learning_rate": 6.48853299869412e-10, "loss": 8.1441, "step": 5112 }, { "epoch": 0.9990230558812035, "grad_norm": 9.182863235473633, "learning_rate": 5.78761798534222e-10, "loss": 9.0271, "step": 5113 }, { "epoch": 0.9992184447049629, "grad_norm": 17.024173736572266, "learning_rate": 5.126753758499803e-10, "loss": 10.0235, "step": 5114 }, { "epoch": 0.9994138335287222, "grad_norm": 9.310667991638184, "learning_rate": 4.5059405828884464e-10, "loss": 8.7047, "step": 5115 }, { "epoch": 0.9996092223524814, "grad_norm": 8.904376029968262, "learning_rate": 3.9251787071425963e-10, "loss": 8.4758, "step": 5116 }, { "epoch": 0.9998046111762408, "grad_norm": 6.933563709259033, "learning_rate": 3.384468363876181e-10, "loss": 8.654, "step": 5117 }, { "epoch": 1.0, "grad_norm": 7.946910858154297, "learning_rate": 2.8838097696715083e-10, "loss": 9.0272, "step": 5118 }, { "epoch": 1.0, "step": 5118, "total_flos": 4.1041189744830054e+17, "train_loss": 9.726896564119167, "train_runtime": 62185.3542, "train_samples_per_second": 2.634, "train_steps_per_second": 0.082 } ], "logging_steps": 1.0, "max_steps": 5118, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.1041189744830054e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }