diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30887 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 4395, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011376564277588168, + "grad_norm": 0.9736748337745667, + "learning_rate": 0.0009997724687144482, + "loss": 2.1831, + "step": 1 + }, + { + "epoch": 0.0022753128555176336, + "grad_norm": NaN, + "learning_rate": 0.0009997724687144482, + "loss": 3.1748, + "step": 2 + }, + { + "epoch": 0.0034129692832764505, + "grad_norm": NaN, + "learning_rate": 0.0009997724687144482, + "loss": 4.6431, + "step": 3 + }, + { + "epoch": 0.004550625711035267, + "grad_norm": 1.4586130380630493, + "learning_rate": 0.0009995449374288964, + "loss": 3.0014, + "step": 4 + }, + { + "epoch": 0.005688282138794084, + "grad_norm": 1.1064860820770264, + "learning_rate": 0.0009993174061433449, + "loss": 3.0825, + "step": 5 + }, + { + "epoch": 0.006825938566552901, + "grad_norm": 1.7668497562408447, + "learning_rate": 0.000999089874857793, + "loss": 4.1777, + "step": 6 + }, + { + "epoch": 0.007963594994311717, + "grad_norm": 1.0811728239059448, + "learning_rate": 0.000998862343572241, + "loss": 2.9395, + "step": 7 + }, + { + "epoch": 0.009101251422070534, + "grad_norm": 1.3404239416122437, + "learning_rate": 0.0009986348122866895, + "loss": 3.0641, + "step": 8 + }, + { + "epoch": 0.010238907849829351, + "grad_norm": 1.7664979696273804, + "learning_rate": 0.0009984072810011377, + "loss": 4.9861, + "step": 9 + }, + { + "epoch": 0.011376564277588168, + "grad_norm": 1.3834271430969238, + "learning_rate": 0.000998179749715586, + "loss": 3.3572, + "step": 10 + }, + { + "epoch": 0.012514220705346985, + "grad_norm": 1.6286646127700806, + "learning_rate": 0.0009979522184300341, + "loss": 4.866, + "step": 11 + }, + { + "epoch": 0.013651877133105802, + "grad_norm": 0.9623754620552063, + "learning_rate": 0.0009977246871444823, + "loss": 2.6571, + "step": 12 + }, + { + "epoch": 0.01478953356086462, + "grad_norm": 1.4946273565292358, + "learning_rate": 0.0009974971558589305, + "loss": 3.2579, + "step": 13 + }, + { + "epoch": 0.015927189988623434, + "grad_norm": 1.5644868612289429, + "learning_rate": 0.000997269624573379, + "loss": 4.3634, + "step": 14 + }, + { + "epoch": 0.017064846416382253, + "grad_norm": 1.1862064599990845, + "learning_rate": 0.0009970420932878272, + "loss": 2.5753, + "step": 15 + }, + { + "epoch": 0.01820250284414107, + "grad_norm": 1.8198087215423584, + "learning_rate": 0.0009968145620022754, + "loss": 2.7812, + "step": 16 + }, + { + "epoch": 0.019340159271899887, + "grad_norm": 1.6003659963607788, + "learning_rate": 0.0009965870307167236, + "loss": 3.8034, + "step": 17 + }, + { + "epoch": 0.020477815699658702, + "grad_norm": 0.6375657916069031, + "learning_rate": 0.0009963594994311718, + "loss": 1.8391, + "step": 18 + }, + { + "epoch": 0.02161547212741752, + "grad_norm": 1.5585576295852661, + "learning_rate": 0.00099613196814562, + "loss": 3.863, + "step": 19 + }, + { + "epoch": 0.022753128555176336, + "grad_norm": 1.4096543788909912, + "learning_rate": 0.0009959044368600682, + "loss": 2.7089, + "step": 20 + }, + { + "epoch": 0.023890784982935155, + "grad_norm": 1.2902367115020752, + "learning_rate": 0.0009956769055745164, + "loss": 3.508, + "step": 21 + }, + { + "epoch": 0.02502844141069397, + "grad_norm": 1.1524360179901123, + "learning_rate": 0.0009954493742889647, + "loss": 3.5223, + "step": 22 + }, + { + "epoch": 0.026166097838452786, + "grad_norm": 1.2636401653289795, + "learning_rate": 0.000995221843003413, + "loss": 2.9982, + "step": 23 + }, + { + "epoch": 0.027303754266211604, + "grad_norm": 1.3827288150787354, + "learning_rate": 0.0009949943117178613, + "loss": 4.1671, + "step": 24 + }, + { + "epoch": 0.02844141069397042, + "grad_norm": 1.1216269731521606, + "learning_rate": 0.0009947667804323095, + "loss": 2.0255, + "step": 25 + }, + { + "epoch": 0.02957906712172924, + "grad_norm": 1.6343213319778442, + "learning_rate": 0.0009945392491467577, + "loss": 3.2769, + "step": 26 + }, + { + "epoch": 0.030716723549488054, + "grad_norm": 1.988418698310852, + "learning_rate": 0.000994311717861206, + "loss": 4.1173, + "step": 27 + }, + { + "epoch": 0.03185437997724687, + "grad_norm": 1.2403652667999268, + "learning_rate": 0.0009940841865756541, + "loss": 2.7539, + "step": 28 + }, + { + "epoch": 0.03299203640500569, + "grad_norm": 1.1057771444320679, + "learning_rate": 0.0009938566552901023, + "loss": 3.5235, + "step": 29 + }, + { + "epoch": 0.034129692832764506, + "grad_norm": 1.1171739101409912, + "learning_rate": 0.0009936291240045505, + "loss": 2.8045, + "step": 30 + }, + { + "epoch": 0.03526734926052332, + "grad_norm": 1.4165009260177612, + "learning_rate": 0.000993401592718999, + "loss": 4.1957, + "step": 31 + }, + { + "epoch": 0.03640500568828214, + "grad_norm": 1.4927332401275635, + "learning_rate": 0.0009931740614334472, + "loss": 3.5043, + "step": 32 + }, + { + "epoch": 0.03754266211604096, + "grad_norm": 1.4980137348175049, + "learning_rate": 0.0009929465301478954, + "loss": 4.4631, + "step": 33 + }, + { + "epoch": 0.038680318543799774, + "grad_norm": 1.375657320022583, + "learning_rate": 0.0009927189988623436, + "loss": 3.3355, + "step": 34 + }, + { + "epoch": 0.03981797497155859, + "grad_norm": 1.5445199012756348, + "learning_rate": 0.0009924914675767918, + "loss": 4.7957, + "step": 35 + }, + { + "epoch": 0.040955631399317405, + "grad_norm": 1.644087314605713, + "learning_rate": 0.00099226393629124, + "loss": 3.2537, + "step": 36 + }, + { + "epoch": 0.04209328782707622, + "grad_norm": 1.307708978652954, + "learning_rate": 0.0009920364050056882, + "loss": 3.6423, + "step": 37 + }, + { + "epoch": 0.04323094425483504, + "grad_norm": 1.1098673343658447, + "learning_rate": 0.0009918088737201364, + "loss": 2.7579, + "step": 38 + }, + { + "epoch": 0.04436860068259386, + "grad_norm": 1.436892032623291, + "learning_rate": 0.0009915813424345847, + "loss": 2.6732, + "step": 39 + }, + { + "epoch": 0.04550625711035267, + "grad_norm": 0.9877327680587769, + "learning_rate": 0.000991353811149033, + "loss": 2.4214, + "step": 40 + }, + { + "epoch": 0.04664391353811149, + "grad_norm": 1.6885147094726562, + "learning_rate": 0.0009911262798634813, + "loss": 3.0437, + "step": 41 + }, + { + "epoch": 0.04778156996587031, + "grad_norm": 1.2590283155441284, + "learning_rate": 0.0009908987485779295, + "loss": 3.1304, + "step": 42 + }, + { + "epoch": 0.048919226393629126, + "grad_norm": 1.3581849336624146, + "learning_rate": 0.0009906712172923777, + "loss": 3.5613, + "step": 43 + }, + { + "epoch": 0.05005688282138794, + "grad_norm": 1.5315910577774048, + "learning_rate": 0.000990443686006826, + "loss": 5.662, + "step": 44 + }, + { + "epoch": 0.051194539249146756, + "grad_norm": 0.9183916449546814, + "learning_rate": 0.0009902161547212743, + "loss": 2.6213, + "step": 45 + }, + { + "epoch": 0.05233219567690557, + "grad_norm": 1.1212267875671387, + "learning_rate": 0.0009899886234357223, + "loss": 2.613, + "step": 46 + }, + { + "epoch": 0.053469852104664393, + "grad_norm": 0.9141287803649902, + "learning_rate": 0.0009897610921501705, + "loss": 2.2044, + "step": 47 + }, + { + "epoch": 0.05460750853242321, + "grad_norm": 1.1506439447402954, + "learning_rate": 0.000989533560864619, + "loss": 3.5604, + "step": 48 + }, + { + "epoch": 0.055745164960182024, + "grad_norm": 0.9535030722618103, + "learning_rate": 0.0009893060295790672, + "loss": 2.5689, + "step": 49 + }, + { + "epoch": 0.05688282138794084, + "grad_norm": 0.8492175936698914, + "learning_rate": 0.0009890784982935154, + "loss": 2.4686, + "step": 50 + }, + { + "epoch": 0.05802047781569966, + "grad_norm": 1.176944613456726, + "learning_rate": 0.0009888509670079636, + "loss": 2.9579, + "step": 51 + }, + { + "epoch": 0.05915813424345848, + "grad_norm": 1.0339919328689575, + "learning_rate": 0.0009886234357224118, + "loss": 2.6745, + "step": 52 + }, + { + "epoch": 0.06029579067121729, + "grad_norm": 0.9942947626113892, + "learning_rate": 0.00098839590443686, + "loss": 1.9818, + "step": 53 + }, + { + "epoch": 0.06143344709897611, + "grad_norm": 0.8406325578689575, + "learning_rate": 0.0009881683731513085, + "loss": 1.4172, + "step": 54 + }, + { + "epoch": 0.06257110352673492, + "grad_norm": 1.1649565696716309, + "learning_rate": 0.0009879408418657567, + "loss": 3.7562, + "step": 55 + }, + { + "epoch": 0.06370875995449374, + "grad_norm": 1.0086359977722168, + "learning_rate": 0.0009877133105802047, + "loss": 1.6131, + "step": 56 + }, + { + "epoch": 0.06484641638225255, + "grad_norm": 1.4940587282180786, + "learning_rate": 0.000987485779294653, + "loss": 2.0438, + "step": 57 + }, + { + "epoch": 0.06598407281001138, + "grad_norm": 1.365527868270874, + "learning_rate": 0.0009872582480091013, + "loss": 3.1455, + "step": 58 + }, + { + "epoch": 0.0671217292377702, + "grad_norm": 1.9875714778900146, + "learning_rate": 0.0009870307167235495, + "loss": 4.7546, + "step": 59 + }, + { + "epoch": 0.06825938566552901, + "grad_norm": 1.2569215297698975, + "learning_rate": 0.0009868031854379977, + "loss": 2.9149, + "step": 60 + }, + { + "epoch": 0.06939704209328783, + "grad_norm": 1.7797069549560547, + "learning_rate": 0.000986575654152446, + "loss": 4.0154, + "step": 61 + }, + { + "epoch": 0.07053469852104664, + "grad_norm": 1.348248839378357, + "learning_rate": 0.0009863481228668941, + "loss": 4.3475, + "step": 62 + }, + { + "epoch": 0.07167235494880546, + "grad_norm": 1.334631323814392, + "learning_rate": 0.0009861205915813426, + "loss": 4.094, + "step": 63 + }, + { + "epoch": 0.07281001137656427, + "grad_norm": 1.1636334657669067, + "learning_rate": 0.0009858930602957908, + "loss": 3.4968, + "step": 64 + }, + { + "epoch": 0.07394766780432309, + "grad_norm": 1.045125126838684, + "learning_rate": 0.000985665529010239, + "loss": 2.4535, + "step": 65 + }, + { + "epoch": 0.07508532423208192, + "grad_norm": 0.9353369474411011, + "learning_rate": 0.0009854379977246872, + "loss": 2.0262, + "step": 66 + }, + { + "epoch": 0.07622298065984073, + "grad_norm": 1.0836305618286133, + "learning_rate": 0.0009852104664391354, + "loss": 3.03, + "step": 67 + }, + { + "epoch": 0.07736063708759955, + "grad_norm": 0.8448026776313782, + "learning_rate": 0.0009849829351535836, + "loss": 2.0041, + "step": 68 + }, + { + "epoch": 0.07849829351535836, + "grad_norm": 1.4605052471160889, + "learning_rate": 0.0009847554038680318, + "loss": 3.9341, + "step": 69 + }, + { + "epoch": 0.07963594994311718, + "grad_norm": 1.040484070777893, + "learning_rate": 0.00098452787258248, + "loss": 3.1491, + "step": 70 + }, + { + "epoch": 0.080773606370876, + "grad_norm": 1.9364869594573975, + "learning_rate": 0.0009843003412969285, + "loss": 4.5523, + "step": 71 + }, + { + "epoch": 0.08191126279863481, + "grad_norm": 1.1543667316436768, + "learning_rate": 0.0009840728100113767, + "loss": 3.0408, + "step": 72 + }, + { + "epoch": 0.08304891922639362, + "grad_norm": 1.410697340965271, + "learning_rate": 0.0009838452787258249, + "loss": 3.5993, + "step": 73 + }, + { + "epoch": 0.08418657565415244, + "grad_norm": 1.4086410999298096, + "learning_rate": 0.000983617747440273, + "loss": 3.7115, + "step": 74 + }, + { + "epoch": 0.08532423208191127, + "grad_norm": 1.3084092140197754, + "learning_rate": 0.0009833902161547213, + "loss": 2.0049, + "step": 75 + }, + { + "epoch": 0.08646188850967008, + "grad_norm": 1.598218321800232, + "learning_rate": 0.0009831626848691695, + "loss": 3.0724, + "step": 76 + }, + { + "epoch": 0.0875995449374289, + "grad_norm": 0.9294804334640503, + "learning_rate": 0.0009829351535836177, + "loss": 2.953, + "step": 77 + }, + { + "epoch": 0.08873720136518772, + "grad_norm": 0.882921576499939, + "learning_rate": 0.000982707622298066, + "loss": 1.8918, + "step": 78 + }, + { + "epoch": 0.08987485779294653, + "grad_norm": 0.8168361186981201, + "learning_rate": 0.0009824800910125141, + "loss": 1.9626, + "step": 79 + }, + { + "epoch": 0.09101251422070535, + "grad_norm": 1.6279023885726929, + "learning_rate": 0.0009822525597269626, + "loss": 4.2579, + "step": 80 + }, + { + "epoch": 0.09215017064846416, + "grad_norm": 1.3332301378250122, + "learning_rate": 0.0009820250284414108, + "loss": 2.8298, + "step": 81 + }, + { + "epoch": 0.09328782707622298, + "grad_norm": 1.2498598098754883, + "learning_rate": 0.000981797497155859, + "loss": 2.9971, + "step": 82 + }, + { + "epoch": 0.09442548350398179, + "grad_norm": 0.8949465751647949, + "learning_rate": 0.0009815699658703072, + "loss": 1.7321, + "step": 83 + }, + { + "epoch": 0.09556313993174062, + "grad_norm": 0.8322932124137878, + "learning_rate": 0.0009813424345847554, + "loss": 1.9678, + "step": 84 + }, + { + "epoch": 0.09670079635949944, + "grad_norm": 1.2313084602355957, + "learning_rate": 0.0009811149032992036, + "loss": 2.7896, + "step": 85 + }, + { + "epoch": 0.09783845278725825, + "grad_norm": 0.749366283416748, + "learning_rate": 0.0009808873720136518, + "loss": 1.7082, + "step": 86 + }, + { + "epoch": 0.09897610921501707, + "grad_norm": 1.3737764358520508, + "learning_rate": 0.0009806598407281, + "loss": 4.0502, + "step": 87 + }, + { + "epoch": 0.10011376564277588, + "grad_norm": 1.0343174934387207, + "learning_rate": 0.0009804323094425485, + "loss": 4.1111, + "step": 88 + }, + { + "epoch": 0.1012514220705347, + "grad_norm": 1.1936379671096802, + "learning_rate": 0.0009802047781569967, + "loss": 3.3507, + "step": 89 + }, + { + "epoch": 0.10238907849829351, + "grad_norm": 1.1276384592056274, + "learning_rate": 0.0009799772468714449, + "loss": 2.6818, + "step": 90 + }, + { + "epoch": 0.10352673492605233, + "grad_norm": 2.1285316944122314, + "learning_rate": 0.000979749715585893, + "loss": 5.7983, + "step": 91 + }, + { + "epoch": 0.10466439135381114, + "grad_norm": 1.4918886423110962, + "learning_rate": 0.0009795221843003413, + "loss": 2.4772, + "step": 92 + }, + { + "epoch": 0.10580204778156997, + "grad_norm": 1.229989767074585, + "learning_rate": 0.0009792946530147895, + "loss": 3.1748, + "step": 93 + }, + { + "epoch": 0.10693970420932879, + "grad_norm": 1.2130271196365356, + "learning_rate": 0.000979067121729238, + "loss": 1.964, + "step": 94 + }, + { + "epoch": 0.1080773606370876, + "grad_norm": 1.5376529693603516, + "learning_rate": 0.000978839590443686, + "loss": 3.2146, + "step": 95 + }, + { + "epoch": 0.10921501706484642, + "grad_norm": 1.0450708866119385, + "learning_rate": 0.0009786120591581341, + "loss": 2.9779, + "step": 96 + }, + { + "epoch": 0.11035267349260523, + "grad_norm": 0.8169401288032532, + "learning_rate": 0.0009783845278725826, + "loss": 2.9477, + "step": 97 + }, + { + "epoch": 0.11149032992036405, + "grad_norm": 1.361669898033142, + "learning_rate": 0.0009781569965870308, + "loss": 2.842, + "step": 98 + }, + { + "epoch": 0.11262798634812286, + "grad_norm": 0.8985223174095154, + "learning_rate": 0.000977929465301479, + "loss": 2.3365, + "step": 99 + }, + { + "epoch": 0.11376564277588168, + "grad_norm": 0.9749724864959717, + "learning_rate": 0.0009777019340159272, + "loss": 1.5105, + "step": 100 + }, + { + "epoch": 0.1149032992036405, + "grad_norm": 1.0484577417373657, + "learning_rate": 0.0009774744027303754, + "loss": 1.8364, + "step": 101 + }, + { + "epoch": 0.11604095563139932, + "grad_norm": 0.9952252507209778, + "learning_rate": 0.0009772468714448236, + "loss": 2.8899, + "step": 102 + }, + { + "epoch": 0.11717861205915814, + "grad_norm": 1.3469035625457764, + "learning_rate": 0.000977019340159272, + "loss": 3.7179, + "step": 103 + }, + { + "epoch": 0.11831626848691695, + "grad_norm": 0.9715613722801208, + "learning_rate": 0.0009767918088737202, + "loss": 1.8968, + "step": 104 + }, + { + "epoch": 0.11945392491467577, + "grad_norm": 0.7201526165008545, + "learning_rate": 0.0009765642775881682, + "loss": 1.6332, + "step": 105 + }, + { + "epoch": 0.12059158134243458, + "grad_norm": 0.9688479900360107, + "learning_rate": 0.0009763367463026166, + "loss": 2.8186, + "step": 106 + }, + { + "epoch": 0.1217292377701934, + "grad_norm": 1.2450870275497437, + "learning_rate": 0.0009761092150170649, + "loss": 3.0069, + "step": 107 + }, + { + "epoch": 0.12286689419795221, + "grad_norm": 1.3213499784469604, + "learning_rate": 0.0009758816837315131, + "loss": 3.8223, + "step": 108 + }, + { + "epoch": 0.12400455062571103, + "grad_norm": 1.4514771699905396, + "learning_rate": 0.0009756541524459613, + "loss": 4.2365, + "step": 109 + }, + { + "epoch": 0.12514220705346984, + "grad_norm": 0.8881714344024658, + "learning_rate": 0.0009754266211604096, + "loss": 2.1791, + "step": 110 + }, + { + "epoch": 0.12627986348122866, + "grad_norm": 1.0673104524612427, + "learning_rate": 0.0009751990898748578, + "loss": 2.073, + "step": 111 + }, + { + "epoch": 0.12741751990898748, + "grad_norm": 1.6061453819274902, + "learning_rate": 0.000974971558589306, + "loss": 3.0393, + "step": 112 + }, + { + "epoch": 0.1285551763367463, + "grad_norm": 0.9326847791671753, + "learning_rate": 0.0009747440273037544, + "loss": 1.549, + "step": 113 + }, + { + "epoch": 0.1296928327645051, + "grad_norm": 0.8536593914031982, + "learning_rate": 0.0009745164960182025, + "loss": 1.7561, + "step": 114 + }, + { + "epoch": 0.13083048919226395, + "grad_norm": 1.1350816488265991, + "learning_rate": 0.0009742889647326507, + "loss": 4.0099, + "step": 115 + }, + { + "epoch": 0.13196814562002276, + "grad_norm": 1.5794169902801514, + "learning_rate": 0.000974061433447099, + "loss": 4.4439, + "step": 116 + }, + { + "epoch": 0.13310580204778158, + "grad_norm": 1.0470244884490967, + "learning_rate": 0.0009738339021615472, + "loss": 2.459, + "step": 117 + }, + { + "epoch": 0.1342434584755404, + "grad_norm": 0.9671889543533325, + "learning_rate": 0.0009736063708759955, + "loss": 2.7996, + "step": 118 + }, + { + "epoch": 0.1353811149032992, + "grad_norm": 1.0588785409927368, + "learning_rate": 0.0009733788395904437, + "loss": 2.4111, + "step": 119 + }, + { + "epoch": 0.13651877133105803, + "grad_norm": 1.3822710514068604, + "learning_rate": 0.0009731513083048919, + "loss": 3.7897, + "step": 120 + }, + { + "epoch": 0.13765642775881684, + "grad_norm": 1.1157410144805908, + "learning_rate": 0.0009729237770193402, + "loss": 2.8744, + "step": 121 + }, + { + "epoch": 0.13879408418657566, + "grad_norm": 1.1391868591308594, + "learning_rate": 0.0009726962457337885, + "loss": 3.3561, + "step": 122 + }, + { + "epoch": 0.13993174061433447, + "grad_norm": 0.7412275671958923, + "learning_rate": 0.0009724687144482367, + "loss": 2.0783, + "step": 123 + }, + { + "epoch": 0.1410693970420933, + "grad_norm": 1.821694254875183, + "learning_rate": 0.0009722411831626849, + "loss": 4.1648, + "step": 124 + }, + { + "epoch": 0.1422070534698521, + "grad_norm": 0.9905460476875305, + "learning_rate": 0.0009720136518771331, + "loss": 2.3263, + "step": 125 + }, + { + "epoch": 0.14334470989761092, + "grad_norm": NaN, + "learning_rate": 0.0009720136518771331, + "loss": 3.7223, + "step": 126 + }, + { + "epoch": 0.14448236632536973, + "grad_norm": 0.9033792614936829, + "learning_rate": 0.0009717861205915813, + "loss": 2.0929, + "step": 127 + }, + { + "epoch": 0.14562002275312855, + "grad_norm": 0.9408984184265137, + "learning_rate": 0.0009715585893060296, + "loss": 2.4161, + "step": 128 + }, + { + "epoch": 0.14675767918088736, + "grad_norm": 0.9367227554321289, + "learning_rate": 0.0009713310580204778, + "loss": 2.1832, + "step": 129 + }, + { + "epoch": 0.14789533560864618, + "grad_norm": 1.7745789289474487, + "learning_rate": 0.000971103526734926, + "loss": 3.0164, + "step": 130 + }, + { + "epoch": 0.149032992036405, + "grad_norm": 1.1147022247314453, + "learning_rate": 0.0009708759954493744, + "loss": 2.4758, + "step": 131 + }, + { + "epoch": 0.15017064846416384, + "grad_norm": 1.2068356275558472, + "learning_rate": 0.0009706484641638226, + "loss": 2.1346, + "step": 132 + }, + { + "epoch": 0.15130830489192265, + "grad_norm": 1.1658344268798828, + "learning_rate": 0.0009704209328782708, + "loss": 2.3267, + "step": 133 + }, + { + "epoch": 0.15244596131968147, + "grad_norm": 1.0015108585357666, + "learning_rate": 0.0009701934015927191, + "loss": 2.6728, + "step": 134 + }, + { + "epoch": 0.15358361774744028, + "grad_norm": 0.8626338243484497, + "learning_rate": 0.0009699658703071672, + "loss": 2.2007, + "step": 135 + }, + { + "epoch": 0.1547212741751991, + "grad_norm": 1.3096460103988647, + "learning_rate": 0.0009697383390216154, + "loss": 2.7757, + "step": 136 + }, + { + "epoch": 0.1558589306029579, + "grad_norm": 0.5792064070701599, + "learning_rate": 0.0009695108077360637, + "loss": 1.4733, + "step": 137 + }, + { + "epoch": 0.15699658703071673, + "grad_norm": 1.103218913078308, + "learning_rate": 0.0009692832764505119, + "loss": 3.3553, + "step": 138 + }, + { + "epoch": 0.15813424345847554, + "grad_norm": 1.1538619995117188, + "learning_rate": 0.0009690557451649603, + "loss": 2.6128, + "step": 139 + }, + { + "epoch": 0.15927189988623436, + "grad_norm": 0.8585355877876282, + "learning_rate": 0.0009688282138794085, + "loss": 1.995, + "step": 140 + }, + { + "epoch": 0.16040955631399317, + "grad_norm": 0.9618576765060425, + "learning_rate": 0.0009686006825938567, + "loss": 2.7825, + "step": 141 + }, + { + "epoch": 0.161547212741752, + "grad_norm": 1.1649035215377808, + "learning_rate": 0.000968373151308305, + "loss": 3.4463, + "step": 142 + }, + { + "epoch": 0.1626848691695108, + "grad_norm": 0.7993972301483154, + "learning_rate": 0.0009681456200227532, + "loss": 1.343, + "step": 143 + }, + { + "epoch": 0.16382252559726962, + "grad_norm": 1.05124032497406, + "learning_rate": 0.0009679180887372013, + "loss": 3.3284, + "step": 144 + }, + { + "epoch": 0.16496018202502843, + "grad_norm": 1.1311984062194824, + "learning_rate": 0.0009676905574516496, + "loss": 2.7092, + "step": 145 + }, + { + "epoch": 0.16609783845278725, + "grad_norm": 1.1213219165802002, + "learning_rate": 0.0009674630261660978, + "loss": 2.905, + "step": 146 + }, + { + "epoch": 0.16723549488054607, + "grad_norm": 1.0415153503417969, + "learning_rate": 0.000967235494880546, + "loss": 2.5957, + "step": 147 + }, + { + "epoch": 0.16837315130830488, + "grad_norm": 1.1598129272460938, + "learning_rate": 0.0009670079635949944, + "loss": 2.9467, + "step": 148 + }, + { + "epoch": 0.1695108077360637, + "grad_norm": 1.0593596696853638, + "learning_rate": 0.0009667804323094426, + "loss": 3.2002, + "step": 149 + }, + { + "epoch": 0.17064846416382254, + "grad_norm": 1.003199815750122, + "learning_rate": 0.0009665529010238908, + "loss": 2.107, + "step": 150 + }, + { + "epoch": 0.17178612059158135, + "grad_norm": 1.135500431060791, + "learning_rate": 0.0009663253697383391, + "loss": 2.8572, + "step": 151 + }, + { + "epoch": 0.17292377701934017, + "grad_norm": 3.164407730102539, + "learning_rate": 0.0009660978384527873, + "loss": 2.5331, + "step": 152 + }, + { + "epoch": 0.17406143344709898, + "grad_norm": 1.1366970539093018, + "learning_rate": 0.0009658703071672355, + "loss": 2.8843, + "step": 153 + }, + { + "epoch": 0.1751990898748578, + "grad_norm": 1.3237171173095703, + "learning_rate": 0.0009656427758816837, + "loss": 3.0479, + "step": 154 + }, + { + "epoch": 0.17633674630261661, + "grad_norm": 0.7948922514915466, + "learning_rate": 0.0009654152445961319, + "loss": 2.3427, + "step": 155 + }, + { + "epoch": 0.17747440273037543, + "grad_norm": 1.0152616500854492, + "learning_rate": 0.0009651877133105801, + "loss": 2.1661, + "step": 156 + }, + { + "epoch": 0.17861205915813425, + "grad_norm": 0.6189094185829163, + "learning_rate": 0.0009649601820250285, + "loss": 1.7452, + "step": 157 + }, + { + "epoch": 0.17974971558589306, + "grad_norm": 1.024207353591919, + "learning_rate": 0.0009647326507394767, + "loss": 3.1021, + "step": 158 + }, + { + "epoch": 0.18088737201365188, + "grad_norm": 0.9709880352020264, + "learning_rate": 0.000964505119453925, + "loss": 2.5928, + "step": 159 + }, + { + "epoch": 0.1820250284414107, + "grad_norm": 0.9320815801620483, + "learning_rate": 0.0009642775881683732, + "loss": 2.5435, + "step": 160 + }, + { + "epoch": 0.1831626848691695, + "grad_norm": 1.1968194246292114, + "learning_rate": 0.0009640500568828214, + "loss": 2.7312, + "step": 161 + }, + { + "epoch": 0.18430034129692832, + "grad_norm": 0.6706811189651489, + "learning_rate": 0.0009638225255972697, + "loss": 1.6675, + "step": 162 + }, + { + "epoch": 0.18543799772468714, + "grad_norm": 0.9634714722633362, + "learning_rate": 0.0009635949943117179, + "loss": 1.9415, + "step": 163 + }, + { + "epoch": 0.18657565415244595, + "grad_norm": 1.2071101665496826, + "learning_rate": 0.000963367463026166, + "loss": 2.3851, + "step": 164 + }, + { + "epoch": 0.18771331058020477, + "grad_norm": 1.1103309392929077, + "learning_rate": 0.0009631399317406144, + "loss": 2.4283, + "step": 165 + }, + { + "epoch": 0.18885096700796358, + "grad_norm": 0.7859079241752625, + "learning_rate": 0.0009629124004550626, + "loss": 1.6424, + "step": 166 + }, + { + "epoch": 0.1899886234357224, + "grad_norm": 1.008284568786621, + "learning_rate": 0.0009626848691695108, + "loss": 2.4152, + "step": 167 + }, + { + "epoch": 0.19112627986348124, + "grad_norm": 1.1314949989318848, + "learning_rate": 0.0009624573378839591, + "loss": 3.8599, + "step": 168 + }, + { + "epoch": 0.19226393629124006, + "grad_norm": 1.0891621112823486, + "learning_rate": 0.0009622298065984073, + "loss": 2.8165, + "step": 169 + }, + { + "epoch": 0.19340159271899887, + "grad_norm": 1.5934785604476929, + "learning_rate": 0.0009620022753128555, + "loss": 2.2033, + "step": 170 + }, + { + "epoch": 0.1945392491467577, + "grad_norm": 0.9060195684432983, + "learning_rate": 0.0009617747440273038, + "loss": 1.9928, + "step": 171 + }, + { + "epoch": 0.1956769055745165, + "grad_norm": 0.8390949964523315, + "learning_rate": 0.000961547212741752, + "loss": 2.1069, + "step": 172 + }, + { + "epoch": 0.19681456200227532, + "grad_norm": 1.1905990839004517, + "learning_rate": 0.0009613196814562003, + "loss": 2.5079, + "step": 173 + }, + { + "epoch": 0.19795221843003413, + "grad_norm": 1.3425909280776978, + "learning_rate": 0.0009610921501706485, + "loss": 3.3445, + "step": 174 + }, + { + "epoch": 0.19908987485779295, + "grad_norm": 0.9901446104049683, + "learning_rate": 0.0009608646188850967, + "loss": 3.0084, + "step": 175 + }, + { + "epoch": 0.20022753128555176, + "grad_norm": 1.7754806280136108, + "learning_rate": 0.0009606370875995449, + "loss": 2.8587, + "step": 176 + }, + { + "epoch": 0.20136518771331058, + "grad_norm": 1.0438529253005981, + "learning_rate": 0.0009604095563139932, + "loss": 2.5869, + "step": 177 + }, + { + "epoch": 0.2025028441410694, + "grad_norm": 1.0804096460342407, + "learning_rate": 0.0009601820250284414, + "loss": 3.065, + "step": 178 + }, + { + "epoch": 0.2036405005688282, + "grad_norm": 1.2588906288146973, + "learning_rate": 0.0009599544937428897, + "loss": 1.9953, + "step": 179 + }, + { + "epoch": 0.20477815699658702, + "grad_norm": 1.1029516458511353, + "learning_rate": 0.0009597269624573379, + "loss": 2.2572, + "step": 180 + }, + { + "epoch": 0.20591581342434584, + "grad_norm": 1.1141575574874878, + "learning_rate": 0.0009594994311717862, + "loss": 2.3295, + "step": 181 + }, + { + "epoch": 0.20705346985210465, + "grad_norm": 1.2331255674362183, + "learning_rate": 0.0009592718998862345, + "loss": 2.8308, + "step": 182 + }, + { + "epoch": 0.20819112627986347, + "grad_norm": 1.3520818948745728, + "learning_rate": 0.0009590443686006826, + "loss": 3.12, + "step": 183 + }, + { + "epoch": 0.20932878270762229, + "grad_norm": 1.2728475332260132, + "learning_rate": 0.0009588168373151308, + "loss": 2.8458, + "step": 184 + }, + { + "epoch": 0.21046643913538113, + "grad_norm": 0.779961884021759, + "learning_rate": 0.0009585893060295791, + "loss": 1.7038, + "step": 185 + }, + { + "epoch": 0.21160409556313994, + "grad_norm": 0.7108963131904602, + "learning_rate": 0.0009583617747440273, + "loss": 1.3587, + "step": 186 + }, + { + "epoch": 0.21274175199089876, + "grad_norm": 0.7882399559020996, + "learning_rate": 0.0009581342434584755, + "loss": 1.7968, + "step": 187 + }, + { + "epoch": 0.21387940841865757, + "grad_norm": 1.3022332191467285, + "learning_rate": 0.0009579067121729238, + "loss": 3.4653, + "step": 188 + }, + { + "epoch": 0.2150170648464164, + "grad_norm": 0.9861379265785217, + "learning_rate": 0.000957679180887372, + "loss": 2.5134, + "step": 189 + }, + { + "epoch": 0.2161547212741752, + "grad_norm": 1.3816951513290405, + "learning_rate": 0.0009574516496018203, + "loss": 1.7213, + "step": 190 + }, + { + "epoch": 0.21729237770193402, + "grad_norm": 0.7945815920829773, + "learning_rate": 0.0009572241183162686, + "loss": 1.533, + "step": 191 + }, + { + "epoch": 0.21843003412969283, + "grad_norm": 0.8966066837310791, + "learning_rate": 0.0009569965870307168, + "loss": 1.9149, + "step": 192 + }, + { + "epoch": 0.21956769055745165, + "grad_norm": 1.2404614686965942, + "learning_rate": 0.0009567690557451649, + "loss": 2.8121, + "step": 193 + }, + { + "epoch": 0.22070534698521047, + "grad_norm": 1.0486642122268677, + "learning_rate": 0.0009565415244596132, + "loss": 2.5131, + "step": 194 + }, + { + "epoch": 0.22184300341296928, + "grad_norm": 0.9867016673088074, + "learning_rate": 0.0009563139931740614, + "loss": 2.9784, + "step": 195 + }, + { + "epoch": 0.2229806598407281, + "grad_norm": 1.2857524156570435, + "learning_rate": 0.0009560864618885096, + "loss": 2.7056, + "step": 196 + }, + { + "epoch": 0.2241183162684869, + "grad_norm": 0.8611189126968384, + "learning_rate": 0.0009558589306029579, + "loss": 1.6405, + "step": 197 + }, + { + "epoch": 0.22525597269624573, + "grad_norm": 1.1097594499588013, + "learning_rate": 0.0009556313993174062, + "loss": 2.3999, + "step": 198 + }, + { + "epoch": 0.22639362912400454, + "grad_norm": 1.3522695302963257, + "learning_rate": 0.0009554038680318545, + "loss": 2.4307, + "step": 199 + }, + { + "epoch": 0.22753128555176336, + "grad_norm": 0.7342440485954285, + "learning_rate": 0.0009551763367463027, + "loss": 1.4805, + "step": 200 + }, + { + "epoch": 0.22866894197952217, + "grad_norm": 1.099187970161438, + "learning_rate": 0.0009549488054607509, + "loss": 2.4419, + "step": 201 + }, + { + "epoch": 0.229806598407281, + "grad_norm": 0.7036212682723999, + "learning_rate": 0.0009547212741751992, + "loss": 1.5371, + "step": 202 + }, + { + "epoch": 0.23094425483503983, + "grad_norm": 0.7153437733650208, + "learning_rate": 0.0009544937428896473, + "loss": 2.0429, + "step": 203 + }, + { + "epoch": 0.23208191126279865, + "grad_norm": 0.9110562801361084, + "learning_rate": 0.0009542662116040955, + "loss": 2.2197, + "step": 204 + }, + { + "epoch": 0.23321956769055746, + "grad_norm": 0.9033525586128235, + "learning_rate": 0.0009540386803185438, + "loss": 2.0181, + "step": 205 + }, + { + "epoch": 0.23435722411831628, + "grad_norm": 1.1249139308929443, + "learning_rate": 0.000953811149032992, + "loss": 2.6982, + "step": 206 + }, + { + "epoch": 0.2354948805460751, + "grad_norm": 2.2283267974853516, + "learning_rate": 0.0009535836177474403, + "loss": 3.7574, + "step": 207 + }, + { + "epoch": 0.2366325369738339, + "grad_norm": 1.0475785732269287, + "learning_rate": 0.0009533560864618886, + "loss": 2.3081, + "step": 208 + }, + { + "epoch": 0.23777019340159272, + "grad_norm": 0.9977370500564575, + "learning_rate": 0.0009531285551763368, + "loss": 2.5508, + "step": 209 + }, + { + "epoch": 0.23890784982935154, + "grad_norm": 1.2504466772079468, + "learning_rate": 0.000952901023890785, + "loss": 2.9425, + "step": 210 + }, + { + "epoch": 0.24004550625711035, + "grad_norm": 1.4991356134414673, + "learning_rate": 0.0009526734926052333, + "loss": 2.6766, + "step": 211 + }, + { + "epoch": 0.24118316268486917, + "grad_norm": 0.952013373374939, + "learning_rate": 0.0009524459613196815, + "loss": 1.3622, + "step": 212 + }, + { + "epoch": 0.24232081911262798, + "grad_norm": 0.9106293320655823, + "learning_rate": 0.0009522184300341296, + "loss": 2.2534, + "step": 213 + }, + { + "epoch": 0.2434584755403868, + "grad_norm": 1.6993825435638428, + "learning_rate": 0.0009519908987485779, + "loss": 3.5046, + "step": 214 + }, + { + "epoch": 0.2445961319681456, + "grad_norm": 1.4656329154968262, + "learning_rate": 0.0009517633674630262, + "loss": 3.9916, + "step": 215 + }, + { + "epoch": 0.24573378839590443, + "grad_norm": 0.9613212943077087, + "learning_rate": 0.0009515358361774744, + "loss": 2.7701, + "step": 216 + }, + { + "epoch": 0.24687144482366324, + "grad_norm": 1.357133388519287, + "learning_rate": 0.0009513083048919227, + "loss": 3.3809, + "step": 217 + }, + { + "epoch": 0.24800910125142206, + "grad_norm": 1.2995182275772095, + "learning_rate": 0.0009510807736063709, + "loss": 2.889, + "step": 218 + }, + { + "epoch": 0.24914675767918087, + "grad_norm": 1.4708192348480225, + "learning_rate": 0.0009508532423208191, + "loss": 2.6981, + "step": 219 + }, + { + "epoch": 0.2502844141069397, + "grad_norm": 1.0964555740356445, + "learning_rate": 0.0009506257110352674, + "loss": 3.448, + "step": 220 + }, + { + "epoch": 0.2514220705346985, + "grad_norm": 0.9647961854934692, + "learning_rate": 0.0009503981797497156, + "loss": 1.367, + "step": 221 + }, + { + "epoch": 0.2525597269624573, + "grad_norm": 1.0223067998886108, + "learning_rate": 0.0009501706484641638, + "loss": 2.653, + "step": 222 + }, + { + "epoch": 0.25369738339021614, + "grad_norm": 1.2166616916656494, + "learning_rate": 0.000949943117178612, + "loss": 3.8313, + "step": 223 + }, + { + "epoch": 0.25483503981797495, + "grad_norm": 1.2261899709701538, + "learning_rate": 0.0009497155858930603, + "loss": 3.3107, + "step": 224 + }, + { + "epoch": 0.25597269624573377, + "grad_norm": 1.2194271087646484, + "learning_rate": 0.0009494880546075086, + "loss": 2.7319, + "step": 225 + }, + { + "epoch": 0.2571103526734926, + "grad_norm": 1.196639895439148, + "learning_rate": 0.0009492605233219568, + "loss": 3.1096, + "step": 226 + }, + { + "epoch": 0.2582480091012514, + "grad_norm": 0.9778220057487488, + "learning_rate": 0.000949032992036405, + "loss": 2.5783, + "step": 227 + }, + { + "epoch": 0.2593856655290102, + "grad_norm": 1.0489749908447266, + "learning_rate": 0.0009488054607508533, + "loss": 2.1872, + "step": 228 + }, + { + "epoch": 0.2605233219567691, + "grad_norm": 2.25301194190979, + "learning_rate": 0.0009485779294653015, + "loss": 2.4844, + "step": 229 + }, + { + "epoch": 0.2616609783845279, + "grad_norm": 1.065746784210205, + "learning_rate": 0.0009483503981797497, + "loss": 3.7665, + "step": 230 + }, + { + "epoch": 0.2627986348122867, + "grad_norm": 1.1636378765106201, + "learning_rate": 0.0009481228668941981, + "loss": 2.3635, + "step": 231 + }, + { + "epoch": 0.26393629124004553, + "grad_norm": 0.915281355381012, + "learning_rate": 0.0009478953356086462, + "loss": 2.5689, + "step": 232 + }, + { + "epoch": 0.26507394766780434, + "grad_norm": 1.088327407836914, + "learning_rate": 0.0009476678043230944, + "loss": 2.3029, + "step": 233 + }, + { + "epoch": 0.26621160409556316, + "grad_norm": 0.8353843092918396, + "learning_rate": 0.0009474402730375427, + "loss": 2.1205, + "step": 234 + }, + { + "epoch": 0.267349260523322, + "grad_norm": 0.8425211906433105, + "learning_rate": 0.0009472127417519909, + "loss": 1.6575, + "step": 235 + }, + { + "epoch": 0.2684869169510808, + "grad_norm": 1.309173822402954, + "learning_rate": 0.0009469852104664391, + "loss": 3.7625, + "step": 236 + }, + { + "epoch": 0.2696245733788396, + "grad_norm": 1.0323517322540283, + "learning_rate": 0.0009467576791808874, + "loss": 2.7422, + "step": 237 + }, + { + "epoch": 0.2707622298065984, + "grad_norm": 1.0105620622634888, + "learning_rate": 0.0009465301478953356, + "loss": 2.0942, + "step": 238 + }, + { + "epoch": 0.27189988623435724, + "grad_norm": 1.2887792587280273, + "learning_rate": 0.0009463026166097838, + "loss": 3.3921, + "step": 239 + }, + { + "epoch": 0.27303754266211605, + "grad_norm": 0.8635759949684143, + "learning_rate": 0.0009460750853242322, + "loss": 1.6201, + "step": 240 + }, + { + "epoch": 0.27417519908987487, + "grad_norm": 1.0267283916473389, + "learning_rate": 0.0009458475540386804, + "loss": 2.6196, + "step": 241 + }, + { + "epoch": 0.2753128555176337, + "grad_norm": 1.3228164911270142, + "learning_rate": 0.0009456200227531286, + "loss": 2.9269, + "step": 242 + }, + { + "epoch": 0.2764505119453925, + "grad_norm": 1.1229983568191528, + "learning_rate": 0.0009453924914675768, + "loss": 2.0913, + "step": 243 + }, + { + "epoch": 0.2775881683731513, + "grad_norm": 1.4177569150924683, + "learning_rate": 0.000945164960182025, + "loss": 2.1312, + "step": 244 + }, + { + "epoch": 0.2787258248009101, + "grad_norm": 0.8409637212753296, + "learning_rate": 0.0009449374288964733, + "loss": 2.4469, + "step": 245 + }, + { + "epoch": 0.27986348122866894, + "grad_norm": 0.9386504292488098, + "learning_rate": 0.0009447098976109215, + "loss": 1.9967, + "step": 246 + }, + { + "epoch": 0.28100113765642776, + "grad_norm": 1.1676746606826782, + "learning_rate": 0.0009444823663253697, + "loss": 2.1986, + "step": 247 + }, + { + "epoch": 0.2821387940841866, + "grad_norm": 0.7538396716117859, + "learning_rate": 0.0009442548350398181, + "loss": 1.3639, + "step": 248 + }, + { + "epoch": 0.2832764505119454, + "grad_norm": 0.9919435381889343, + "learning_rate": 0.0009440273037542663, + "loss": 2.5573, + "step": 249 + }, + { + "epoch": 0.2844141069397042, + "grad_norm": 0.7212353944778442, + "learning_rate": 0.0009437997724687145, + "loss": 2.7537, + "step": 250 + }, + { + "epoch": 0.285551763367463, + "grad_norm": 1.0949044227600098, + "learning_rate": 0.0009435722411831627, + "loss": 1.8613, + "step": 251 + }, + { + "epoch": 0.28668941979522183, + "grad_norm": 1.1615073680877686, + "learning_rate": 0.0009433447098976109, + "loss": 2.3952, + "step": 252 + }, + { + "epoch": 0.28782707622298065, + "grad_norm": 1.1122605800628662, + "learning_rate": 0.0009431171786120591, + "loss": 2.5904, + "step": 253 + }, + { + "epoch": 0.28896473265073946, + "grad_norm": 1.0439378023147583, + "learning_rate": 0.0009428896473265074, + "loss": 2.2653, + "step": 254 + }, + { + "epoch": 0.2901023890784983, + "grad_norm": 2.413464069366455, + "learning_rate": 0.0009426621160409556, + "loss": 4.6422, + "step": 255 + }, + { + "epoch": 0.2912400455062571, + "grad_norm": 0.9542593955993652, + "learning_rate": 0.0009424345847554038, + "loss": 2.3552, + "step": 256 + }, + { + "epoch": 0.2923777019340159, + "grad_norm": 1.229703426361084, + "learning_rate": 0.0009422070534698522, + "loss": 2.993, + "step": 257 + }, + { + "epoch": 0.2935153583617747, + "grad_norm": 1.0281733274459839, + "learning_rate": 0.0009419795221843004, + "loss": 2.3383, + "step": 258 + }, + { + "epoch": 0.29465301478953354, + "grad_norm": 0.8514539003372192, + "learning_rate": 0.0009417519908987486, + "loss": 1.9407, + "step": 259 + }, + { + "epoch": 0.29579067121729236, + "grad_norm": 0.6966246962547302, + "learning_rate": 0.0009415244596131969, + "loss": 1.6663, + "step": 260 + }, + { + "epoch": 0.29692832764505117, + "grad_norm": 1.4110140800476074, + "learning_rate": 0.000941296928327645, + "loss": 4.4839, + "step": 261 + }, + { + "epoch": 0.29806598407281, + "grad_norm": 1.094144582748413, + "learning_rate": 0.0009410693970420933, + "loss": 2.2603, + "step": 262 + }, + { + "epoch": 0.2992036405005688, + "grad_norm": 1.0117661952972412, + "learning_rate": 0.0009408418657565415, + "loss": 1.9057, + "step": 263 + }, + { + "epoch": 0.3003412969283277, + "grad_norm": 1.1968410015106201, + "learning_rate": 0.0009406143344709897, + "loss": 2.5717, + "step": 264 + }, + { + "epoch": 0.3014789533560865, + "grad_norm": 1.324204444885254, + "learning_rate": 0.0009403868031854381, + "loss": 2.4079, + "step": 265 + }, + { + "epoch": 0.3026166097838453, + "grad_norm": 1.3995249271392822, + "learning_rate": 0.0009401592718998863, + "loss": 3.3401, + "step": 266 + }, + { + "epoch": 0.3037542662116041, + "grad_norm": 1.0412198305130005, + "learning_rate": 0.0009399317406143345, + "loss": 3.0768, + "step": 267 + }, + { + "epoch": 0.30489192263936293, + "grad_norm": 0.6889500617980957, + "learning_rate": 0.0009397042093287828, + "loss": 1.8718, + "step": 268 + }, + { + "epoch": 0.30602957906712175, + "grad_norm": 1.1345181465148926, + "learning_rate": 0.000939476678043231, + "loss": 3.4683, + "step": 269 + }, + { + "epoch": 0.30716723549488056, + "grad_norm": 0.8793913722038269, + "learning_rate": 0.0009392491467576792, + "loss": 2.2562, + "step": 270 + }, + { + "epoch": 0.3083048919226394, + "grad_norm": 1.0166738033294678, + "learning_rate": 0.0009390216154721274, + "loss": 1.328, + "step": 271 + }, + { + "epoch": 0.3094425483503982, + "grad_norm": 0.817034900188446, + "learning_rate": 0.0009387940841865756, + "loss": 2.4001, + "step": 272 + }, + { + "epoch": 0.310580204778157, + "grad_norm": 0.9647735357284546, + "learning_rate": 0.0009385665529010238, + "loss": 2.2161, + "step": 273 + }, + { + "epoch": 0.3117178612059158, + "grad_norm": 0.9152204394340515, + "learning_rate": 0.0009383390216154722, + "loss": 2.5515, + "step": 274 + }, + { + "epoch": 0.31285551763367464, + "grad_norm": 1.4312481880187988, + "learning_rate": 0.0009381114903299204, + "loss": 4.4949, + "step": 275 + }, + { + "epoch": 0.31399317406143346, + "grad_norm": 0.9626976847648621, + "learning_rate": 0.0009378839590443686, + "loss": 1.8928, + "step": 276 + }, + { + "epoch": 0.31513083048919227, + "grad_norm": 2.2159972190856934, + "learning_rate": 0.0009376564277588169, + "loss": 4.2681, + "step": 277 + }, + { + "epoch": 0.3162684869169511, + "grad_norm": 1.171950101852417, + "learning_rate": 0.0009374288964732651, + "loss": 2.1355, + "step": 278 + }, + { + "epoch": 0.3174061433447099, + "grad_norm": 0.6437592506408691, + "learning_rate": 0.0009372013651877133, + "loss": 1.5357, + "step": 279 + }, + { + "epoch": 0.3185437997724687, + "grad_norm": 1.0927355289459229, + "learning_rate": 0.0009369738339021616, + "loss": 2.1487, + "step": 280 + }, + { + "epoch": 0.31968145620022753, + "grad_norm": 1.1274104118347168, + "learning_rate": 0.0009367463026166097, + "loss": 2.6737, + "step": 281 + }, + { + "epoch": 0.32081911262798635, + "grad_norm": 0.9172748327255249, + "learning_rate": 0.000936518771331058, + "loss": 1.8367, + "step": 282 + }, + { + "epoch": 0.32195676905574516, + "grad_norm": 1.2308052778244019, + "learning_rate": 0.0009362912400455063, + "loss": 2.4757, + "step": 283 + }, + { + "epoch": 0.323094425483504, + "grad_norm": 0.8640644550323486, + "learning_rate": 0.0009360637087599545, + "loss": 2.0472, + "step": 284 + }, + { + "epoch": 0.3242320819112628, + "grad_norm": 1.4058482646942139, + "learning_rate": 0.0009358361774744028, + "loss": 2.8178, + "step": 285 + }, + { + "epoch": 0.3253697383390216, + "grad_norm": 1.1872564554214478, + "learning_rate": 0.000935608646188851, + "loss": 2.3008, + "step": 286 + }, + { + "epoch": 0.3265073947667804, + "grad_norm": 0.7326186299324036, + "learning_rate": 0.0009353811149032992, + "loss": 1.173, + "step": 287 + }, + { + "epoch": 0.32764505119453924, + "grad_norm": 1.229687213897705, + "learning_rate": 0.0009351535836177475, + "loss": 1.8621, + "step": 288 + }, + { + "epoch": 0.32878270762229805, + "grad_norm": 1.0105575323104858, + "learning_rate": 0.0009349260523321957, + "loss": 2.6121, + "step": 289 + }, + { + "epoch": 0.32992036405005687, + "grad_norm": 1.0747400522232056, + "learning_rate": 0.0009346985210466438, + "loss": 2.3038, + "step": 290 + }, + { + "epoch": 0.3310580204778157, + "grad_norm": 1.3244582414627075, + "learning_rate": 0.0009344709897610922, + "loss": 1.9476, + "step": 291 + }, + { + "epoch": 0.3321956769055745, + "grad_norm": 0.8411991000175476, + "learning_rate": 0.0009342434584755404, + "loss": 1.8068, + "step": 292 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.9534935355186462, + "learning_rate": 0.0009340159271899886, + "loss": 2.0956, + "step": 293 + }, + { + "epoch": 0.33447098976109213, + "grad_norm": 1.0585649013519287, + "learning_rate": 0.0009337883959044369, + "loss": 2.7111, + "step": 294 + }, + { + "epoch": 0.33560864618885095, + "grad_norm": 0.7823183536529541, + "learning_rate": 0.0009335608646188851, + "loss": 1.9446, + "step": 295 + }, + { + "epoch": 0.33674630261660976, + "grad_norm": 0.8460807204246521, + "learning_rate": 0.0009333333333333333, + "loss": 2.6191, + "step": 296 + }, + { + "epoch": 0.3378839590443686, + "grad_norm": 1.1409828662872314, + "learning_rate": 0.0009331058020477816, + "loss": 3.1742, + "step": 297 + }, + { + "epoch": 0.3390216154721274, + "grad_norm": 1.1377147436141968, + "learning_rate": 0.0009328782707622299, + "loss": 2.7242, + "step": 298 + }, + { + "epoch": 0.34015927189988626, + "grad_norm": 1.4707083702087402, + "learning_rate": 0.0009326507394766781, + "loss": 3.7897, + "step": 299 + }, + { + "epoch": 0.3412969283276451, + "grad_norm": 0.964156448841095, + "learning_rate": 0.0009324232081911263, + "loss": 2.0425, + "step": 300 + }, + { + "epoch": 0.3424345847554039, + "grad_norm": 1.323971152305603, + "learning_rate": 0.0009321956769055745, + "loss": 3.1034, + "step": 301 + }, + { + "epoch": 0.3435722411831627, + "grad_norm": 1.3859295845031738, + "learning_rate": 0.0009319681456200227, + "loss": 2.9186, + "step": 302 + }, + { + "epoch": 0.3447098976109215, + "grad_norm": 0.8150478601455688, + "learning_rate": 0.000931740614334471, + "loss": 2.3752, + "step": 303 + }, + { + "epoch": 0.34584755403868034, + "grad_norm": 0.8035518527030945, + "learning_rate": 0.0009315130830489192, + "loss": 2.252, + "step": 304 + }, + { + "epoch": 0.34698521046643915, + "grad_norm": 1.2278333902359009, + "learning_rate": 0.0009312855517633675, + "loss": 2.3787, + "step": 305 + }, + { + "epoch": 0.34812286689419797, + "grad_norm": 1.116999626159668, + "learning_rate": 0.0009310580204778157, + "loss": 1.4018, + "step": 306 + }, + { + "epoch": 0.3492605233219568, + "grad_norm": 1.0040169954299927, + "learning_rate": 0.000930830489192264, + "loss": 2.0511, + "step": 307 + }, + { + "epoch": 0.3503981797497156, + "grad_norm": 0.901432991027832, + "learning_rate": 0.0009306029579067123, + "loss": 1.8721, + "step": 308 + }, + { + "epoch": 0.3515358361774744, + "grad_norm": 1.451587438583374, + "learning_rate": 0.0009303754266211605, + "loss": 3.0643, + "step": 309 + }, + { + "epoch": 0.35267349260523323, + "grad_norm": 1.2055668830871582, + "learning_rate": 0.0009301478953356086, + "loss": 3.2057, + "step": 310 + }, + { + "epoch": 0.35381114903299204, + "grad_norm": 0.8358264565467834, + "learning_rate": 0.0009299203640500569, + "loss": 2.0797, + "step": 311 + }, + { + "epoch": 0.35494880546075086, + "grad_norm": 0.7835274338722229, + "learning_rate": 0.0009296928327645051, + "loss": 1.2359, + "step": 312 + }, + { + "epoch": 0.3560864618885097, + "grad_norm": 0.9940115809440613, + "learning_rate": 0.0009294653014789533, + "loss": 2.2001, + "step": 313 + }, + { + "epoch": 0.3572241183162685, + "grad_norm": 1.147113561630249, + "learning_rate": 0.0009292377701934016, + "loss": 2.0408, + "step": 314 + }, + { + "epoch": 0.3583617747440273, + "grad_norm": 0.4681651294231415, + "learning_rate": 0.0009290102389078499, + "loss": 0.7199, + "step": 315 + }, + { + "epoch": 0.3594994311717861, + "grad_norm": 1.2197333574295044, + "learning_rate": 0.0009287827076222981, + "loss": 3.6619, + "step": 316 + }, + { + "epoch": 0.36063708759954494, + "grad_norm": 0.9935864806175232, + "learning_rate": 0.0009285551763367464, + "loss": 2.0533, + "step": 317 + }, + { + "epoch": 0.36177474402730375, + "grad_norm": 0.9229695200920105, + "learning_rate": 0.0009283276450511946, + "loss": 1.4724, + "step": 318 + }, + { + "epoch": 0.36291240045506257, + "grad_norm": 1.3133660554885864, + "learning_rate": 0.0009281001137656428, + "loss": 2.9495, + "step": 319 + }, + { + "epoch": 0.3640500568828214, + "grad_norm": 0.9246838092803955, + "learning_rate": 0.000927872582480091, + "loss": 1.6493, + "step": 320 + }, + { + "epoch": 0.3651877133105802, + "grad_norm": 1.4214354753494263, + "learning_rate": 0.0009276450511945392, + "loss": 2.2151, + "step": 321 + }, + { + "epoch": 0.366325369738339, + "grad_norm": 1.167389988899231, + "learning_rate": 0.0009274175199089874, + "loss": 2.1775, + "step": 322 + }, + { + "epoch": 0.36746302616609783, + "grad_norm": 1.3376412391662598, + "learning_rate": 0.0009271899886234357, + "loss": 3.2754, + "step": 323 + }, + { + "epoch": 0.36860068259385664, + "grad_norm": 0.9739036560058594, + "learning_rate": 0.000926962457337884, + "loss": 2.151, + "step": 324 + }, + { + "epoch": 0.36973833902161546, + "grad_norm": 0.9827941656112671, + "learning_rate": 0.0009267349260523323, + "loss": 3.0028, + "step": 325 + }, + { + "epoch": 0.3708759954493743, + "grad_norm": 0.811549723148346, + "learning_rate": 0.0009265073947667805, + "loss": 2.1614, + "step": 326 + }, + { + "epoch": 0.3720136518771331, + "grad_norm": 0.9825149178504944, + "learning_rate": 0.0009262798634812287, + "loss": 1.7451, + "step": 327 + }, + { + "epoch": 0.3731513083048919, + "grad_norm": 1.237622857093811, + "learning_rate": 0.000926052332195677, + "loss": 3.3102, + "step": 328 + }, + { + "epoch": 0.3742889647326507, + "grad_norm": 0.9079214334487915, + "learning_rate": 0.0009258248009101251, + "loss": 2.0276, + "step": 329 + }, + { + "epoch": 0.37542662116040953, + "grad_norm": 0.7722564935684204, + "learning_rate": 0.0009255972696245733, + "loss": 2.1105, + "step": 330 + }, + { + "epoch": 0.37656427758816835, + "grad_norm": 0.9066492319107056, + "learning_rate": 0.0009253697383390216, + "loss": 3.4384, + "step": 331 + }, + { + "epoch": 0.37770193401592717, + "grad_norm": 0.8562538623809814, + "learning_rate": 0.0009251422070534699, + "loss": 2.9233, + "step": 332 + }, + { + "epoch": 0.378839590443686, + "grad_norm": 1.1646368503570557, + "learning_rate": 0.0009249146757679181, + "loss": 2.551, + "step": 333 + }, + { + "epoch": 0.3799772468714448, + "grad_norm": 0.9063814878463745, + "learning_rate": 0.0009246871444823664, + "loss": 2.3003, + "step": 334 + }, + { + "epoch": 0.38111490329920367, + "grad_norm": 0.993689775466919, + "learning_rate": 0.0009244596131968146, + "loss": 3.3941, + "step": 335 + }, + { + "epoch": 0.3822525597269625, + "grad_norm": 1.2754982709884644, + "learning_rate": 0.0009242320819112628, + "loss": 2.5285, + "step": 336 + }, + { + "epoch": 0.3833902161547213, + "grad_norm": 1.0740411281585693, + "learning_rate": 0.0009240045506257111, + "loss": 1.4935, + "step": 337 + }, + { + "epoch": 0.3845278725824801, + "grad_norm": 1.089651107788086, + "learning_rate": 0.0009237770193401593, + "loss": 2.136, + "step": 338 + }, + { + "epoch": 0.3856655290102389, + "grad_norm": 0.8579985499382019, + "learning_rate": 0.0009235494880546074, + "loss": 1.9851, + "step": 339 + }, + { + "epoch": 0.38680318543799774, + "grad_norm": 1.0594276189804077, + "learning_rate": 0.0009233219567690558, + "loss": 2.244, + "step": 340 + }, + { + "epoch": 0.38794084186575656, + "grad_norm": 0.9024211168289185, + "learning_rate": 0.000923094425483504, + "loss": 2.5911, + "step": 341 + }, + { + "epoch": 0.3890784982935154, + "grad_norm": 1.2810311317443848, + "learning_rate": 0.0009228668941979522, + "loss": 3.0762, + "step": 342 + }, + { + "epoch": 0.3902161547212742, + "grad_norm": 0.9511512517929077, + "learning_rate": 0.0009226393629124005, + "loss": 2.871, + "step": 343 + }, + { + "epoch": 0.391353811149033, + "grad_norm": 1.0560070276260376, + "learning_rate": 0.0009224118316268487, + "loss": 1.4818, + "step": 344 + }, + { + "epoch": 0.3924914675767918, + "grad_norm": 0.7268106341362, + "learning_rate": 0.000922184300341297, + "loss": 0.9631, + "step": 345 + }, + { + "epoch": 0.39362912400455063, + "grad_norm": 1.2521812915802002, + "learning_rate": 0.0009219567690557452, + "loss": 2.0506, + "step": 346 + }, + { + "epoch": 0.39476678043230945, + "grad_norm": 0.8288787603378296, + "learning_rate": 0.0009217292377701934, + "loss": 1.7319, + "step": 347 + }, + { + "epoch": 0.39590443686006827, + "grad_norm": 1.5332680940628052, + "learning_rate": 0.0009215017064846418, + "loss": 1.8226, + "step": 348 + }, + { + "epoch": 0.3970420932878271, + "grad_norm": 0.918202817440033, + "learning_rate": 0.0009212741751990899, + "loss": 1.9544, + "step": 349 + }, + { + "epoch": 0.3981797497155859, + "grad_norm": 1.5337828397750854, + "learning_rate": 0.0009210466439135381, + "loss": 1.7653, + "step": 350 + }, + { + "epoch": 0.3993174061433447, + "grad_norm": 0.8244525194168091, + "learning_rate": 0.0009208191126279864, + "loss": 1.0721, + "step": 351 + }, + { + "epoch": 0.4004550625711035, + "grad_norm": 1.727785587310791, + "learning_rate": 0.0009205915813424346, + "loss": 3.8503, + "step": 352 + }, + { + "epoch": 0.40159271899886234, + "grad_norm": 0.8593099117279053, + "learning_rate": 0.0009203640500568828, + "loss": 1.5773, + "step": 353 + }, + { + "epoch": 0.40273037542662116, + "grad_norm": 0.929568886756897, + "learning_rate": 0.0009201365187713311, + "loss": 2.0465, + "step": 354 + }, + { + "epoch": 0.40386803185437997, + "grad_norm": 0.8598810434341431, + "learning_rate": 0.0009199089874857793, + "loss": 2.3678, + "step": 355 + }, + { + "epoch": 0.4050056882821388, + "grad_norm": 1.19975745677948, + "learning_rate": 0.0009196814562002275, + "loss": 1.9968, + "step": 356 + }, + { + "epoch": 0.4061433447098976, + "grad_norm": 0.8773411512374878, + "learning_rate": 0.0009194539249146759, + "loss": 2.3507, + "step": 357 + }, + { + "epoch": 0.4072810011376564, + "grad_norm": NaN, + "learning_rate": 0.0009194539249146759, + "loss": 3.3585, + "step": 358 + }, + { + "epoch": 0.40841865756541523, + "grad_norm": 1.068331241607666, + "learning_rate": 0.000919226393629124, + "loss": 2.2658, + "step": 359 + }, + { + "epoch": 0.40955631399317405, + "grad_norm": 0.8418660163879395, + "learning_rate": 0.0009189988623435722, + "loss": 1.1336, + "step": 360 + }, + { + "epoch": 0.41069397042093286, + "grad_norm": 1.0328933000564575, + "learning_rate": 0.0009187713310580205, + "loss": 2.308, + "step": 361 + }, + { + "epoch": 0.4118316268486917, + "grad_norm": 0.7511950135231018, + "learning_rate": 0.0009185437997724687, + "loss": 2.0096, + "step": 362 + }, + { + "epoch": 0.4129692832764505, + "grad_norm": 1.0027238130569458, + "learning_rate": 0.0009183162684869169, + "loss": 2.4037, + "step": 363 + }, + { + "epoch": 0.4141069397042093, + "grad_norm": 0.7929302453994751, + "learning_rate": 0.0009180887372013652, + "loss": 1.7147, + "step": 364 + }, + { + "epoch": 0.4152445961319681, + "grad_norm": 0.9058247804641724, + "learning_rate": 0.0009178612059158134, + "loss": 1.2271, + "step": 365 + }, + { + "epoch": 0.41638225255972694, + "grad_norm": 1.107584834098816, + "learning_rate": 0.0009176336746302618, + "loss": 1.976, + "step": 366 + }, + { + "epoch": 0.41751990898748575, + "grad_norm": 0.9150335788726807, + "learning_rate": 0.00091740614334471, + "loss": 1.5669, + "step": 367 + }, + { + "epoch": 0.41865756541524457, + "grad_norm": 1.2605892419815063, + "learning_rate": 0.0009171786120591582, + "loss": 3.2092, + "step": 368 + }, + { + "epoch": 0.4197952218430034, + "grad_norm": 1.8941680192947388, + "learning_rate": 0.0009169510807736064, + "loss": 2.2166, + "step": 369 + }, + { + "epoch": 0.42093287827076226, + "grad_norm": 1.0579288005828857, + "learning_rate": 0.0009167235494880546, + "loss": 2.2364, + "step": 370 + }, + { + "epoch": 0.42207053469852107, + "grad_norm": 0.8153421878814697, + "learning_rate": 0.0009164960182025028, + "loss": 2.2531, + "step": 371 + }, + { + "epoch": 0.4232081911262799, + "grad_norm": 0.9382014274597168, + "learning_rate": 0.0009162684869169511, + "loss": 1.7155, + "step": 372 + }, + { + "epoch": 0.4243458475540387, + "grad_norm": 0.8863131403923035, + "learning_rate": 0.0009160409556313993, + "loss": 2.0033, + "step": 373 + }, + { + "epoch": 0.4254835039817975, + "grad_norm": 1.0132858753204346, + "learning_rate": 0.0009158134243458475, + "loss": 2.7736, + "step": 374 + }, + { + "epoch": 0.42662116040955633, + "grad_norm": 1.237510085105896, + "learning_rate": 0.0009155858930602959, + "loss": 2.3289, + "step": 375 + }, + { + "epoch": 0.42775881683731515, + "grad_norm": 1.4978190660476685, + "learning_rate": 0.0009153583617747441, + "loss": 3.452, + "step": 376 + }, + { + "epoch": 0.42889647326507396, + "grad_norm": 0.9788629412651062, + "learning_rate": 0.0009151308304891923, + "loss": 2.536, + "step": 377 + }, + { + "epoch": 0.4300341296928328, + "grad_norm": 1.0312438011169434, + "learning_rate": 0.0009149032992036406, + "loss": 2.1322, + "step": 378 + }, + { + "epoch": 0.4311717861205916, + "grad_norm": 0.7808533906936646, + "learning_rate": 0.0009146757679180887, + "loss": 1.7681, + "step": 379 + }, + { + "epoch": 0.4323094425483504, + "grad_norm": 1.5941463708877563, + "learning_rate": 0.0009144482366325369, + "loss": 3.6207, + "step": 380 + }, + { + "epoch": 0.4334470989761092, + "grad_norm": 0.9317173361778259, + "learning_rate": 0.0009142207053469852, + "loss": 1.5295, + "step": 381 + }, + { + "epoch": 0.43458475540386804, + "grad_norm": 1.0690711736679077, + "learning_rate": 0.0009139931740614334, + "loss": 2.6297, + "step": 382 + }, + { + "epoch": 0.43572241183162685, + "grad_norm": 0.9129312634468079, + "learning_rate": 0.0009137656427758817, + "loss": 1.3539, + "step": 383 + }, + { + "epoch": 0.43686006825938567, + "grad_norm": 1.0281989574432373, + "learning_rate": 0.00091353811149033, + "loss": 2.2572, + "step": 384 + }, + { + "epoch": 0.4379977246871445, + "grad_norm": 0.9232156872749329, + "learning_rate": 0.0009133105802047782, + "loss": 2.7774, + "step": 385 + }, + { + "epoch": 0.4391353811149033, + "grad_norm": 0.8758882880210876, + "learning_rate": 0.0009130830489192265, + "loss": 1.8537, + "step": 386 + }, + { + "epoch": 0.4402730375426621, + "grad_norm": 0.6057271361351013, + "learning_rate": 0.0009128555176336747, + "loss": 1.1669, + "step": 387 + }, + { + "epoch": 0.44141069397042093, + "grad_norm": 0.764013409614563, + "learning_rate": 0.0009126279863481229, + "loss": 1.5494, + "step": 388 + }, + { + "epoch": 0.44254835039817975, + "grad_norm": 0.7935448288917542, + "learning_rate": 0.0009124004550625711, + "loss": 1.5411, + "step": 389 + }, + { + "epoch": 0.44368600682593856, + "grad_norm": 0.9698971509933472, + "learning_rate": 0.0009121729237770193, + "loss": 2.1653, + "step": 390 + }, + { + "epoch": 0.4448236632536974, + "grad_norm": 0.9462388753890991, + "learning_rate": 0.0009119453924914675, + "loss": 3.2917, + "step": 391 + }, + { + "epoch": 0.4459613196814562, + "grad_norm": 1.1189157962799072, + "learning_rate": 0.0009117178612059159, + "loss": 1.8733, + "step": 392 + }, + { + "epoch": 0.447098976109215, + "grad_norm": 1.251373529434204, + "learning_rate": 0.0009114903299203641, + "loss": 2.239, + "step": 393 + }, + { + "epoch": 0.4482366325369738, + "grad_norm": 1.385899305343628, + "learning_rate": 0.0009112627986348123, + "loss": 3.8287, + "step": 394 + }, + { + "epoch": 0.44937428896473264, + "grad_norm": 0.8532758951187134, + "learning_rate": 0.0009110352673492606, + "loss": 1.5912, + "step": 395 + }, + { + "epoch": 0.45051194539249145, + "grad_norm": 0.9151597023010254, + "learning_rate": 0.0009108077360637088, + "loss": 1.841, + "step": 396 + }, + { + "epoch": 0.45164960182025027, + "grad_norm": 1.054795265197754, + "learning_rate": 0.000910580204778157, + "loss": 2.7137, + "step": 397 + }, + { + "epoch": 0.4527872582480091, + "grad_norm": 0.7954731583595276, + "learning_rate": 0.0009103526734926052, + "loss": 1.7309, + "step": 398 + }, + { + "epoch": 0.4539249146757679, + "grad_norm": 1.1954761743545532, + "learning_rate": 0.0009101251422070534, + "loss": 3.0152, + "step": 399 + }, + { + "epoch": 0.4550625711035267, + "grad_norm": 1.0547521114349365, + "learning_rate": 0.0009098976109215017, + "loss": 3.1303, + "step": 400 + }, + { + "epoch": 0.45620022753128553, + "grad_norm": 1.2710171937942505, + "learning_rate": 0.00090967007963595, + "loss": 3.1468, + "step": 401 + }, + { + "epoch": 0.45733788395904434, + "grad_norm": 0.8694390058517456, + "learning_rate": 0.0009094425483503982, + "loss": 1.815, + "step": 402 + }, + { + "epoch": 0.45847554038680316, + "grad_norm": 0.9537279605865479, + "learning_rate": 0.0009092150170648464, + "loss": 1.9914, + "step": 403 + }, + { + "epoch": 0.459613196814562, + "grad_norm": 0.9804282188415527, + "learning_rate": 0.0009089874857792947, + "loss": 2.4996, + "step": 404 + }, + { + "epoch": 0.46075085324232085, + "grad_norm": 1.2874373197555542, + "learning_rate": 0.0009087599544937429, + "loss": 2.2957, + "step": 405 + }, + { + "epoch": 0.46188850967007966, + "grad_norm": 0.8987186551094055, + "learning_rate": 0.0009085324232081912, + "loss": 2.2726, + "step": 406 + }, + { + "epoch": 0.4630261660978385, + "grad_norm": 1.0590325593948364, + "learning_rate": 0.0009083048919226394, + "loss": 2.5264, + "step": 407 + }, + { + "epoch": 0.4641638225255973, + "grad_norm": 1.0790257453918457, + "learning_rate": 0.0009080773606370875, + "loss": 2.7717, + "step": 408 + }, + { + "epoch": 0.4653014789533561, + "grad_norm": 1.1287912130355835, + "learning_rate": 0.0009078498293515359, + "loss": 1.9665, + "step": 409 + }, + { + "epoch": 0.4664391353811149, + "grad_norm": 0.8903608918190002, + "learning_rate": 0.0009076222980659841, + "loss": 3.0162, + "step": 410 + }, + { + "epoch": 0.46757679180887374, + "grad_norm": 0.7123409509658813, + "learning_rate": 0.0009073947667804323, + "loss": 1.5362, + "step": 411 + }, + { + "epoch": 0.46871444823663255, + "grad_norm": 0.9984010457992554, + "learning_rate": 0.0009071672354948806, + "loss": 2.6421, + "step": 412 + }, + { + "epoch": 0.46985210466439137, + "grad_norm": 0.9573709964752197, + "learning_rate": 0.0009069397042093288, + "loss": 3.0399, + "step": 413 + }, + { + "epoch": 0.4709897610921502, + "grad_norm": 1.0717376470565796, + "learning_rate": 0.000906712172923777, + "loss": 2.5299, + "step": 414 + }, + { + "epoch": 0.472127417519909, + "grad_norm": 1.3975024223327637, + "learning_rate": 0.0009064846416382253, + "loss": 2.6824, + "step": 415 + }, + { + "epoch": 0.4732650739476678, + "grad_norm": 0.9038397073745728, + "learning_rate": 0.0009062571103526736, + "loss": 2.0442, + "step": 416 + }, + { + "epoch": 0.47440273037542663, + "grad_norm": 1.1075880527496338, + "learning_rate": 0.0009060295790671218, + "loss": 2.1013, + "step": 417 + }, + { + "epoch": 0.47554038680318544, + "grad_norm": 0.8074356913566589, + "learning_rate": 0.00090580204778157, + "loss": 2.9299, + "step": 418 + }, + { + "epoch": 0.47667804323094426, + "grad_norm": 0.9493515491485596, + "learning_rate": 0.0009055745164960182, + "loss": 2.0034, + "step": 419 + }, + { + "epoch": 0.4778156996587031, + "grad_norm": 1.3625547885894775, + "learning_rate": 0.0009053469852104664, + "loss": 2.4718, + "step": 420 + }, + { + "epoch": 0.4789533560864619, + "grad_norm": 1.2692298889160156, + "learning_rate": 0.0009051194539249147, + "loss": 2.676, + "step": 421 + }, + { + "epoch": 0.4800910125142207, + "grad_norm": 1.1094417572021484, + "learning_rate": 0.0009048919226393629, + "loss": 1.5969, + "step": 422 + }, + { + "epoch": 0.4812286689419795, + "grad_norm": 0.8246015310287476, + "learning_rate": 0.0009046643913538111, + "loss": 1.9902, + "step": 423 + }, + { + "epoch": 0.48236632536973834, + "grad_norm": 1.2059593200683594, + "learning_rate": 0.0009044368600682594, + "loss": 2.2454, + "step": 424 + }, + { + "epoch": 0.48350398179749715, + "grad_norm": 1.2407282590866089, + "learning_rate": 0.0009042093287827077, + "loss": 1.9269, + "step": 425 + }, + { + "epoch": 0.48464163822525597, + "grad_norm": 1.407323956489563, + "learning_rate": 0.0009039817974971559, + "loss": 3.2682, + "step": 426 + }, + { + "epoch": 0.4857792946530148, + "grad_norm": 1.0163967609405518, + "learning_rate": 0.0009037542662116041, + "loss": 2.3068, + "step": 427 + }, + { + "epoch": 0.4869169510807736, + "grad_norm": 0.9030061364173889, + "learning_rate": 0.0009035267349260523, + "loss": 1.8157, + "step": 428 + }, + { + "epoch": 0.4880546075085324, + "grad_norm": 1.6989500522613525, + "learning_rate": 0.0009032992036405006, + "loss": 3.7374, + "step": 429 + }, + { + "epoch": 0.4891922639362912, + "grad_norm": 1.1923679113388062, + "learning_rate": 0.0009030716723549488, + "loss": 2.565, + "step": 430 + }, + { + "epoch": 0.49032992036405004, + "grad_norm": 0.7808157205581665, + "learning_rate": 0.000902844141069397, + "loss": 2.05, + "step": 431 + }, + { + "epoch": 0.49146757679180886, + "grad_norm": 1.2099822759628296, + "learning_rate": 0.0009026166097838453, + "loss": 2.8765, + "step": 432 + }, + { + "epoch": 0.4926052332195677, + "grad_norm": 0.628551185131073, + "learning_rate": 0.0009023890784982936, + "loss": 1.3558, + "step": 433 + }, + { + "epoch": 0.4937428896473265, + "grad_norm": 1.0964410305023193, + "learning_rate": 0.0009021615472127418, + "loss": 3.4868, + "step": 434 + }, + { + "epoch": 0.4948805460750853, + "grad_norm": 1.1500060558319092, + "learning_rate": 0.0009019340159271901, + "loss": 2.6184, + "step": 435 + }, + { + "epoch": 0.4960182025028441, + "grad_norm": 0.9226061701774597, + "learning_rate": 0.0009017064846416383, + "loss": 2.2053, + "step": 436 + }, + { + "epoch": 0.49715585893060293, + "grad_norm": 0.9873738288879395, + "learning_rate": 0.0009014789533560864, + "loss": 2.4479, + "step": 437 + }, + { + "epoch": 0.49829351535836175, + "grad_norm": 0.6207978129386902, + "learning_rate": 0.0009012514220705347, + "loss": 1.1146, + "step": 438 + }, + { + "epoch": 0.49943117178612056, + "grad_norm": 0.7853182554244995, + "learning_rate": 0.0009010238907849829, + "loss": 2.4183, + "step": 439 + }, + { + "epoch": 0.5005688282138794, + "grad_norm": 1.1462852954864502, + "learning_rate": 0.0009007963594994311, + "loss": 2.3492, + "step": 440 + }, + { + "epoch": 0.5017064846416383, + "grad_norm": 0.7885423302650452, + "learning_rate": 0.0009005688282138795, + "loss": 1.2548, + "step": 441 + }, + { + "epoch": 0.502844141069397, + "grad_norm": 1.071851134300232, + "learning_rate": 0.0009003412969283277, + "loss": 2.1613, + "step": 442 + }, + { + "epoch": 0.5039817974971559, + "grad_norm": 0.934515655040741, + "learning_rate": 0.0009001137656427759, + "loss": 2.1865, + "step": 443 + }, + { + "epoch": 0.5051194539249146, + "grad_norm": 0.8959553241729736, + "learning_rate": 0.0008998862343572242, + "loss": 1.3645, + "step": 444 + }, + { + "epoch": 0.5062571103526735, + "grad_norm": 0.6154829263687134, + "learning_rate": 0.0008996587030716724, + "loss": 0.9074, + "step": 445 + }, + { + "epoch": 0.5073947667804323, + "grad_norm": 0.8270671367645264, + "learning_rate": 0.0008994311717861206, + "loss": 2.3498, + "step": 446 + }, + { + "epoch": 0.5085324232081911, + "grad_norm": 1.0123733282089233, + "learning_rate": 0.0008992036405005688, + "loss": 2.0715, + "step": 447 + }, + { + "epoch": 0.5096700796359499, + "grad_norm": 1.2498971223831177, + "learning_rate": 0.000898976109215017, + "loss": 3.0001, + "step": 448 + }, + { + "epoch": 0.5108077360637088, + "grad_norm": 1.0939654111862183, + "learning_rate": 0.0008987485779294653, + "loss": 1.7905, + "step": 449 + }, + { + "epoch": 0.5119453924914675, + "grad_norm": 1.0476197004318237, + "learning_rate": 0.0008985210466439136, + "loss": 2.68, + "step": 450 + }, + { + "epoch": 0.5130830489192264, + "grad_norm": 1.1398872137069702, + "learning_rate": 0.0008982935153583618, + "loss": 3.1602, + "step": 451 + }, + { + "epoch": 0.5142207053469852, + "grad_norm": 1.092517614364624, + "learning_rate": 0.0008980659840728101, + "loss": 1.772, + "step": 452 + }, + { + "epoch": 0.515358361774744, + "grad_norm": 1.1467770338058472, + "learning_rate": 0.0008978384527872583, + "loss": 3.0966, + "step": 453 + }, + { + "epoch": 0.5164960182025028, + "grad_norm": 0.9609680771827698, + "learning_rate": 0.0008976109215017065, + "loss": 2.1103, + "step": 454 + }, + { + "epoch": 0.5176336746302617, + "grad_norm": 1.0481035709381104, + "learning_rate": 0.0008973833902161548, + "loss": 1.9659, + "step": 455 + }, + { + "epoch": 0.5187713310580204, + "grad_norm": 0.8882219791412354, + "learning_rate": 0.000897155858930603, + "loss": 1.6778, + "step": 456 + }, + { + "epoch": 0.5199089874857793, + "grad_norm": 0.8644529581069946, + "learning_rate": 0.0008969283276450511, + "loss": 2.0891, + "step": 457 + }, + { + "epoch": 0.5210466439135382, + "grad_norm": 0.9775826930999756, + "learning_rate": 0.0008967007963594995, + "loss": 2.6407, + "step": 458 + }, + { + "epoch": 0.5221843003412969, + "grad_norm": 1.4607579708099365, + "learning_rate": 0.0008964732650739477, + "loss": 2.8334, + "step": 459 + }, + { + "epoch": 0.5233219567690558, + "grad_norm": 1.4416364431381226, + "learning_rate": 0.0008962457337883959, + "loss": 2.8912, + "step": 460 + }, + { + "epoch": 0.5244596131968146, + "grad_norm": 0.7647553086280823, + "learning_rate": 0.0008960182025028442, + "loss": 2.1203, + "step": 461 + }, + { + "epoch": 0.5255972696245734, + "grad_norm": 0.9002684950828552, + "learning_rate": 0.0008957906712172924, + "loss": 1.9081, + "step": 462 + }, + { + "epoch": 0.5267349260523322, + "grad_norm": 0.6581302285194397, + "learning_rate": 0.0008955631399317406, + "loss": 2.001, + "step": 463 + }, + { + "epoch": 0.5278725824800911, + "grad_norm": 1.0612741708755493, + "learning_rate": 0.0008953356086461889, + "loss": 1.9066, + "step": 464 + }, + { + "epoch": 0.5290102389078498, + "grad_norm": 1.4963552951812744, + "learning_rate": 0.0008951080773606371, + "loss": 2.1809, + "step": 465 + }, + { + "epoch": 0.5301478953356087, + "grad_norm": 1.1061559915542603, + "learning_rate": 0.0008948805460750852, + "loss": 3.068, + "step": 466 + }, + { + "epoch": 0.5312855517633674, + "grad_norm": 0.9316163659095764, + "learning_rate": 0.0008946530147895336, + "loss": 1.9232, + "step": 467 + }, + { + "epoch": 0.5324232081911263, + "grad_norm": 0.8861683011054993, + "learning_rate": 0.0008944254835039818, + "loss": 2.1368, + "step": 468 + }, + { + "epoch": 0.5335608646188851, + "grad_norm": 0.835658848285675, + "learning_rate": 0.0008941979522184301, + "loss": 1.8731, + "step": 469 + }, + { + "epoch": 0.534698521046644, + "grad_norm": 1.1577521562576294, + "learning_rate": 0.0008939704209328783, + "loss": 3.0726, + "step": 470 + }, + { + "epoch": 0.5358361774744027, + "grad_norm": 1.3069539070129395, + "learning_rate": 0.0008937428896473265, + "loss": 2.8827, + "step": 471 + }, + { + "epoch": 0.5369738339021616, + "grad_norm": 0.884005606174469, + "learning_rate": 0.0008935153583617748, + "loss": 2.2167, + "step": 472 + }, + { + "epoch": 0.5381114903299203, + "grad_norm": 1.0972729921340942, + "learning_rate": 0.000893287827076223, + "loss": 2.8096, + "step": 473 + }, + { + "epoch": 0.5392491467576792, + "grad_norm": 0.6620914340019226, + "learning_rate": 0.0008930602957906712, + "loss": 1.8139, + "step": 474 + }, + { + "epoch": 0.540386803185438, + "grad_norm": 1.0320155620574951, + "learning_rate": 0.0008928327645051196, + "loss": 2.7753, + "step": 475 + }, + { + "epoch": 0.5415244596131968, + "grad_norm": 1.195923924446106, + "learning_rate": 0.0008926052332195677, + "loss": 2.8982, + "step": 476 + }, + { + "epoch": 0.5426621160409556, + "grad_norm": 0.6206729412078857, + "learning_rate": 0.0008923777019340159, + "loss": 1.6572, + "step": 477 + }, + { + "epoch": 0.5437997724687145, + "grad_norm": 0.867581844329834, + "learning_rate": 0.0008921501706484642, + "loss": 1.3072, + "step": 478 + }, + { + "epoch": 0.5449374288964732, + "grad_norm": 1.066758632659912, + "learning_rate": 0.0008919226393629124, + "loss": 2.2283, + "step": 479 + }, + { + "epoch": 0.5460750853242321, + "grad_norm": 0.9295621514320374, + "learning_rate": 0.0008916951080773606, + "loss": 1.8157, + "step": 480 + }, + { + "epoch": 0.5472127417519909, + "grad_norm": 1.0595327615737915, + "learning_rate": 0.0008914675767918089, + "loss": 2.486, + "step": 481 + }, + { + "epoch": 0.5483503981797497, + "grad_norm": 1.1606998443603516, + "learning_rate": 0.0008912400455062571, + "loss": 1.9862, + "step": 482 + }, + { + "epoch": 0.5494880546075085, + "grad_norm": 1.3078361749649048, + "learning_rate": 0.0008910125142207054, + "loss": 2.824, + "step": 483 + }, + { + "epoch": 0.5506257110352674, + "grad_norm": 1.1636848449707031, + "learning_rate": 0.0008907849829351537, + "loss": 2.9846, + "step": 484 + }, + { + "epoch": 0.5517633674630261, + "grad_norm": 1.6126782894134521, + "learning_rate": 0.0008905574516496019, + "loss": 3.7502, + "step": 485 + }, + { + "epoch": 0.552901023890785, + "grad_norm": 0.9118810296058655, + "learning_rate": 0.00089032992036405, + "loss": 2.7482, + "step": 486 + }, + { + "epoch": 0.5540386803185438, + "grad_norm": 0.9121948480606079, + "learning_rate": 0.0008901023890784983, + "loss": 1.8244, + "step": 487 + }, + { + "epoch": 0.5551763367463026, + "grad_norm": 1.749747633934021, + "learning_rate": 0.0008898748577929465, + "loss": 3.4315, + "step": 488 + }, + { + "epoch": 0.5563139931740614, + "grad_norm": 0.9559519290924072, + "learning_rate": 0.0008896473265073947, + "loss": 1.2761, + "step": 489 + }, + { + "epoch": 0.5574516496018203, + "grad_norm": 1.3748880624771118, + "learning_rate": 0.000889419795221843, + "loss": 2.4184, + "step": 490 + }, + { + "epoch": 0.558589306029579, + "grad_norm": 0.9325023889541626, + "learning_rate": 0.0008891922639362912, + "loss": 1.6256, + "step": 491 + }, + { + "epoch": 0.5597269624573379, + "grad_norm": 1.2054123878479004, + "learning_rate": 0.0008889647326507396, + "loss": 2.4543, + "step": 492 + }, + { + "epoch": 0.5608646188850968, + "grad_norm": 1.1576976776123047, + "learning_rate": 0.0008887372013651878, + "loss": 1.7206, + "step": 493 + }, + { + "epoch": 0.5620022753128555, + "grad_norm": 0.8814551830291748, + "learning_rate": 0.000888509670079636, + "loss": 1.8812, + "step": 494 + }, + { + "epoch": 0.5631399317406144, + "grad_norm": 0.8528147339820862, + "learning_rate": 0.0008882821387940843, + "loss": 2.411, + "step": 495 + }, + { + "epoch": 0.5642775881683731, + "grad_norm": 1.0148645639419556, + "learning_rate": 0.0008880546075085324, + "loss": 2.1765, + "step": 496 + }, + { + "epoch": 0.565415244596132, + "grad_norm": 0.7151885032653809, + "learning_rate": 0.0008878270762229806, + "loss": 1.2428, + "step": 497 + }, + { + "epoch": 0.5665529010238908, + "grad_norm": 1.7494755983352661, + "learning_rate": 0.0008875995449374289, + "loss": 4.1417, + "step": 498 + }, + { + "epoch": 0.5676905574516496, + "grad_norm": 0.8421460390090942, + "learning_rate": 0.0008873720136518771, + "loss": 1.7912, + "step": 499 + }, + { + "epoch": 0.5688282138794084, + "grad_norm": 0.9811123013496399, + "learning_rate": 0.0008871444823663254, + "loss": 2.1765, + "step": 500 + }, + { + "epoch": 0.5699658703071673, + "grad_norm": 0.9422712922096252, + "learning_rate": 0.0008869169510807737, + "loss": 1.7157, + "step": 501 + }, + { + "epoch": 0.571103526734926, + "grad_norm": 0.8885846138000488, + "learning_rate": 0.0008866894197952219, + "loss": 2.2024, + "step": 502 + }, + { + "epoch": 0.5722411831626849, + "grad_norm": 1.6120095252990723, + "learning_rate": 0.0008864618885096701, + "loss": 3.8523, + "step": 503 + }, + { + "epoch": 0.5733788395904437, + "grad_norm": 1.2259505987167358, + "learning_rate": 0.0008862343572241184, + "loss": 2.3551, + "step": 504 + }, + { + "epoch": 0.5745164960182025, + "grad_norm": 0.9865310788154602, + "learning_rate": 0.0008860068259385665, + "loss": 1.0719, + "step": 505 + }, + { + "epoch": 0.5756541524459613, + "grad_norm": 0.945303201675415, + "learning_rate": 0.0008857792946530147, + "loss": 1.8286, + "step": 506 + }, + { + "epoch": 0.5767918088737202, + "grad_norm": 1.099501371383667, + "learning_rate": 0.000885551763367463, + "loss": 2.4123, + "step": 507 + }, + { + "epoch": 0.5779294653014789, + "grad_norm": 1.3092284202575684, + "learning_rate": 0.0008853242320819112, + "loss": 2.8365, + "step": 508 + }, + { + "epoch": 0.5790671217292378, + "grad_norm": 0.9153149127960205, + "learning_rate": 0.0008850967007963595, + "loss": 1.4962, + "step": 509 + }, + { + "epoch": 0.5802047781569966, + "grad_norm": 1.0560232400894165, + "learning_rate": 0.0008848691695108078, + "loss": 2.0615, + "step": 510 + }, + { + "epoch": 0.5813424345847554, + "grad_norm": 0.6506679058074951, + "learning_rate": 0.000884641638225256, + "loss": 1.7931, + "step": 511 + }, + { + "epoch": 0.5824800910125142, + "grad_norm": 0.6726229786872864, + "learning_rate": 0.0008844141069397043, + "loss": 1.5145, + "step": 512 + }, + { + "epoch": 0.5836177474402731, + "grad_norm": 0.8806062340736389, + "learning_rate": 0.0008841865756541525, + "loss": 1.3302, + "step": 513 + }, + { + "epoch": 0.5847554038680318, + "grad_norm": 0.8398321270942688, + "learning_rate": 0.0008839590443686007, + "loss": 1.0688, + "step": 514 + }, + { + "epoch": 0.5858930602957907, + "grad_norm": 1.0214747190475464, + "learning_rate": 0.0008837315130830489, + "loss": 2.7142, + "step": 515 + }, + { + "epoch": 0.5870307167235495, + "grad_norm": 1.1030226945877075, + "learning_rate": 0.0008835039817974971, + "loss": 1.7378, + "step": 516 + }, + { + "epoch": 0.5881683731513083, + "grad_norm": 0.9792185425758362, + "learning_rate": 0.0008832764505119454, + "loss": 1.9917, + "step": 517 + }, + { + "epoch": 0.5893060295790671, + "grad_norm": 0.7293819785118103, + "learning_rate": 0.0008830489192263937, + "loss": 1.7711, + "step": 518 + }, + { + "epoch": 0.590443686006826, + "grad_norm": 0.9090821743011475, + "learning_rate": 0.0008828213879408419, + "loss": 2.3188, + "step": 519 + }, + { + "epoch": 0.5915813424345847, + "grad_norm": 0.9316034317016602, + "learning_rate": 0.0008825938566552901, + "loss": 1.9596, + "step": 520 + }, + { + "epoch": 0.5927189988623436, + "grad_norm": 1.022985577583313, + "learning_rate": 0.0008823663253697384, + "loss": 1.7448, + "step": 521 + }, + { + "epoch": 0.5938566552901023, + "grad_norm": 1.1398788690567017, + "learning_rate": 0.0008821387940841866, + "loss": 2.932, + "step": 522 + }, + { + "epoch": 0.5949943117178612, + "grad_norm": 1.1560657024383545, + "learning_rate": 0.0008819112627986348, + "loss": 2.2647, + "step": 523 + }, + { + "epoch": 0.59613196814562, + "grad_norm": 1.1563608646392822, + "learning_rate": 0.0008816837315130831, + "loss": 2.6389, + "step": 524 + }, + { + "epoch": 0.5972696245733788, + "grad_norm": 0.6971971988677979, + "learning_rate": 0.0008814562002275312, + "loss": 1.6904, + "step": 525 + }, + { + "epoch": 0.5984072810011376, + "grad_norm": 1.1338340044021606, + "learning_rate": 0.0008812286689419795, + "loss": 2.7721, + "step": 526 + }, + { + "epoch": 0.5995449374288965, + "grad_norm": 0.9443331360816956, + "learning_rate": 0.0008810011376564278, + "loss": 1.673, + "step": 527 + }, + { + "epoch": 0.6006825938566553, + "grad_norm": 0.7245666980743408, + "learning_rate": 0.000880773606370876, + "loss": 1.716, + "step": 528 + }, + { + "epoch": 0.6018202502844141, + "grad_norm": 0.9884563684463501, + "learning_rate": 0.0008805460750853242, + "loss": 3.0318, + "step": 529 + }, + { + "epoch": 0.602957906712173, + "grad_norm": 1.2986050844192505, + "learning_rate": 0.0008803185437997725, + "loss": 2.7565, + "step": 530 + }, + { + "epoch": 0.6040955631399317, + "grad_norm": 0.9133585691452026, + "learning_rate": 0.0008800910125142207, + "loss": 2.1352, + "step": 531 + }, + { + "epoch": 0.6052332195676906, + "grad_norm": 0.7802772521972656, + "learning_rate": 0.000879863481228669, + "loss": 1.487, + "step": 532 + }, + { + "epoch": 0.6063708759954494, + "grad_norm": 1.488508701324463, + "learning_rate": 0.0008796359499431173, + "loss": 3.6104, + "step": 533 + }, + { + "epoch": 0.6075085324232082, + "grad_norm": 1.4523831605911255, + "learning_rate": 0.0008794084186575654, + "loss": 4.1854, + "step": 534 + }, + { + "epoch": 0.608646188850967, + "grad_norm": 0.9844598174095154, + "learning_rate": 0.0008791808873720137, + "loss": 2.2759, + "step": 535 + }, + { + "epoch": 0.6097838452787259, + "grad_norm": 1.0784645080566406, + "learning_rate": 0.0008789533560864619, + "loss": 2.6678, + "step": 536 + }, + { + "epoch": 0.6109215017064846, + "grad_norm": 1.2045968770980835, + "learning_rate": 0.0008787258248009101, + "loss": 2.4359, + "step": 537 + }, + { + "epoch": 0.6120591581342435, + "grad_norm": 1.1759347915649414, + "learning_rate": 0.0008784982935153584, + "loss": 2.6307, + "step": 538 + }, + { + "epoch": 0.6131968145620023, + "grad_norm": 1.0141427516937256, + "learning_rate": 0.0008782707622298066, + "loss": 2.5533, + "step": 539 + }, + { + "epoch": 0.6143344709897611, + "grad_norm": 0.986733615398407, + "learning_rate": 0.0008780432309442548, + "loss": 1.1977, + "step": 540 + }, + { + "epoch": 0.6154721274175199, + "grad_norm": 1.7550513744354248, + "learning_rate": 0.0008778156996587031, + "loss": 1.977, + "step": 541 + }, + { + "epoch": 0.6166097838452788, + "grad_norm": 1.0284448862075806, + "learning_rate": 0.0008775881683731514, + "loss": 2.2885, + "step": 542 + }, + { + "epoch": 0.6177474402730375, + "grad_norm": 0.8970025181770325, + "learning_rate": 0.0008773606370875996, + "loss": 1.519, + "step": 543 + }, + { + "epoch": 0.6188850967007964, + "grad_norm": 1.1675552129745483, + "learning_rate": 0.0008771331058020478, + "loss": 2.5995, + "step": 544 + }, + { + "epoch": 0.6200227531285551, + "grad_norm": 0.7909214496612549, + "learning_rate": 0.000876905574516496, + "loss": 2.4494, + "step": 545 + }, + { + "epoch": 0.621160409556314, + "grad_norm": 1.1816651821136475, + "learning_rate": 0.0008766780432309442, + "loss": 2.0798, + "step": 546 + }, + { + "epoch": 0.6222980659840728, + "grad_norm": 1.2361869812011719, + "learning_rate": 0.0008764505119453925, + "loss": 2.1153, + "step": 547 + }, + { + "epoch": 0.6234357224118316, + "grad_norm": 1.498818278312683, + "learning_rate": 0.0008762229806598407, + "loss": 3.6819, + "step": 548 + }, + { + "epoch": 0.6245733788395904, + "grad_norm": 1.2410242557525635, + "learning_rate": 0.0008759954493742889, + "loss": 2.3829, + "step": 549 + }, + { + "epoch": 0.6257110352673493, + "grad_norm": 1.1705360412597656, + "learning_rate": 0.0008757679180887373, + "loss": 1.718, + "step": 550 + }, + { + "epoch": 0.626848691695108, + "grad_norm": 0.9694558382034302, + "learning_rate": 0.0008755403868031855, + "loss": 2.1928, + "step": 551 + }, + { + "epoch": 0.6279863481228669, + "grad_norm": 0.6821228265762329, + "learning_rate": 0.0008753128555176338, + "loss": 1.406, + "step": 552 + }, + { + "epoch": 0.6291240045506257, + "grad_norm": 0.8572499752044678, + "learning_rate": 0.000875085324232082, + "loss": 1.6783, + "step": 553 + }, + { + "epoch": 0.6302616609783845, + "grad_norm": 0.8171662092208862, + "learning_rate": 0.0008748577929465301, + "loss": 2.4399, + "step": 554 + }, + { + "epoch": 0.6313993174061433, + "grad_norm": 1.320075273513794, + "learning_rate": 0.0008746302616609784, + "loss": 1.4817, + "step": 555 + }, + { + "epoch": 0.6325369738339022, + "grad_norm": 0.7951433658599854, + "learning_rate": 0.0008744027303754266, + "loss": 1.7655, + "step": 556 + }, + { + "epoch": 0.6336746302616609, + "grad_norm": 0.7013391256332397, + "learning_rate": 0.0008741751990898748, + "loss": 1.3957, + "step": 557 + }, + { + "epoch": 0.6348122866894198, + "grad_norm": 0.8427788019180298, + "learning_rate": 0.0008739476678043232, + "loss": 1.5612, + "step": 558 + }, + { + "epoch": 0.6359499431171786, + "grad_norm": 1.0773825645446777, + "learning_rate": 0.0008737201365187714, + "loss": 1.8739, + "step": 559 + }, + { + "epoch": 0.6370875995449374, + "grad_norm": 0.7293964624404907, + "learning_rate": 0.0008734926052332196, + "loss": 1.1344, + "step": 560 + }, + { + "epoch": 0.6382252559726962, + "grad_norm": 0.8789951205253601, + "learning_rate": 0.0008732650739476679, + "loss": 2.1382, + "step": 561 + }, + { + "epoch": 0.6393629124004551, + "grad_norm": 1.86644446849823, + "learning_rate": 0.0008730375426621161, + "loss": 4.1822, + "step": 562 + }, + { + "epoch": 0.6405005688282139, + "grad_norm": 1.1930569410324097, + "learning_rate": 0.0008728100113765643, + "loss": 2.7447, + "step": 563 + }, + { + "epoch": 0.6416382252559727, + "grad_norm": 0.9682884812355042, + "learning_rate": 0.0008725824800910125, + "loss": 2.0626, + "step": 564 + }, + { + "epoch": 0.6427758816837316, + "grad_norm": 0.7793911695480347, + "learning_rate": 0.0008723549488054607, + "loss": 1.643, + "step": 565 + }, + { + "epoch": 0.6439135381114903, + "grad_norm": 1.0574642419815063, + "learning_rate": 0.0008721274175199089, + "loss": 2.1083, + "step": 566 + }, + { + "epoch": 0.6450511945392492, + "grad_norm": 0.8339124917984009, + "learning_rate": 0.0008718998862343573, + "loss": 1.7013, + "step": 567 + }, + { + "epoch": 0.646188850967008, + "grad_norm": 0.7775769829750061, + "learning_rate": 0.0008716723549488055, + "loss": 1.3428, + "step": 568 + }, + { + "epoch": 0.6473265073947668, + "grad_norm": 1.089309573173523, + "learning_rate": 0.0008714448236632537, + "loss": 2.1378, + "step": 569 + }, + { + "epoch": 0.6484641638225256, + "grad_norm": 1.043135166168213, + "learning_rate": 0.000871217292377702, + "loss": 3.3488, + "step": 570 + }, + { + "epoch": 0.6496018202502845, + "grad_norm": 0.9596251249313354, + "learning_rate": 0.0008709897610921502, + "loss": 2.3173, + "step": 571 + }, + { + "epoch": 0.6507394766780432, + "grad_norm": 0.8774250745773315, + "learning_rate": 0.0008707622298065985, + "loss": 2.1654, + "step": 572 + }, + { + "epoch": 0.6518771331058021, + "grad_norm": 0.7768117189407349, + "learning_rate": 0.0008705346985210466, + "loss": 1.56, + "step": 573 + }, + { + "epoch": 0.6530147895335608, + "grad_norm": 0.9584445953369141, + "learning_rate": 0.0008703071672354948, + "loss": 1.8768, + "step": 574 + }, + { + "epoch": 0.6541524459613197, + "grad_norm": 0.7490730285644531, + "learning_rate": 0.0008700796359499432, + "loss": 1.5192, + "step": 575 + }, + { + "epoch": 0.6552901023890785, + "grad_norm": 0.779798686504364, + "learning_rate": 0.0008698521046643914, + "loss": 1.6678, + "step": 576 + }, + { + "epoch": 0.6564277588168373, + "grad_norm": 1.2081419229507446, + "learning_rate": 0.0008696245733788396, + "loss": 2.5276, + "step": 577 + }, + { + "epoch": 0.6575654152445961, + "grad_norm": 1.5270092487335205, + "learning_rate": 0.0008693970420932879, + "loss": 3.8846, + "step": 578 + }, + { + "epoch": 0.658703071672355, + "grad_norm": 0.9504697918891907, + "learning_rate": 0.0008691695108077361, + "loss": 1.8834, + "step": 579 + }, + { + "epoch": 0.6598407281001137, + "grad_norm": 0.8998041749000549, + "learning_rate": 0.0008689419795221843, + "loss": 2.0045, + "step": 580 + }, + { + "epoch": 0.6609783845278726, + "grad_norm": 0.8134379386901855, + "learning_rate": 0.0008687144482366326, + "loss": 0.9556, + "step": 581 + }, + { + "epoch": 0.6621160409556314, + "grad_norm": 0.7257466316223145, + "learning_rate": 0.0008684869169510808, + "loss": 1.2292, + "step": 582 + }, + { + "epoch": 0.6632536973833902, + "grad_norm": 1.3256492614746094, + "learning_rate": 0.0008682593856655289, + "loss": 3.8102, + "step": 583 + }, + { + "epoch": 0.664391353811149, + "grad_norm": 0.7932450771331787, + "learning_rate": 0.0008680318543799773, + "loss": 2.2484, + "step": 584 + }, + { + "epoch": 0.6655290102389079, + "grad_norm": 1.0340895652770996, + "learning_rate": 0.0008678043230944255, + "loss": 2.1459, + "step": 585 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.0674877166748047, + "learning_rate": 0.0008675767918088737, + "loss": 2.7635, + "step": 586 + }, + { + "epoch": 0.6678043230944255, + "grad_norm": 1.0885896682739258, + "learning_rate": 0.000867349260523322, + "loss": 2.525, + "step": 587 + }, + { + "epoch": 0.6689419795221843, + "grad_norm": 0.6129403710365295, + "learning_rate": 0.0008671217292377702, + "loss": 1.225, + "step": 588 + }, + { + "epoch": 0.6700796359499431, + "grad_norm": 0.9944592118263245, + "learning_rate": 0.0008668941979522184, + "loss": 1.9925, + "step": 589 + }, + { + "epoch": 0.6712172923777019, + "grad_norm": 1.1834667921066284, + "learning_rate": 0.0008666666666666667, + "loss": 2.3584, + "step": 590 + }, + { + "epoch": 0.6723549488054608, + "grad_norm": 1.3296507596969604, + "learning_rate": 0.0008664391353811149, + "loss": 2.7873, + "step": 591 + }, + { + "epoch": 0.6734926052332195, + "grad_norm": 0.7346708178520203, + "learning_rate": 0.0008662116040955633, + "loss": 1.5522, + "step": 592 + }, + { + "epoch": 0.6746302616609784, + "grad_norm": 1.405709147453308, + "learning_rate": 0.0008659840728100114, + "loss": 3.4668, + "step": 593 + }, + { + "epoch": 0.6757679180887372, + "grad_norm": 0.925556480884552, + "learning_rate": 0.0008657565415244596, + "loss": 2.5614, + "step": 594 + }, + { + "epoch": 0.676905574516496, + "grad_norm": 0.6485567688941956, + "learning_rate": 0.0008655290102389079, + "loss": 1.3796, + "step": 595 + }, + { + "epoch": 0.6780432309442548, + "grad_norm": 1.2317935228347778, + "learning_rate": 0.0008653014789533561, + "loss": 2.7619, + "step": 596 + }, + { + "epoch": 0.6791808873720137, + "grad_norm": 1.8749306201934814, + "learning_rate": 0.0008650739476678043, + "loss": 2.0424, + "step": 597 + }, + { + "epoch": 0.6803185437997725, + "grad_norm": 1.0405480861663818, + "learning_rate": 0.0008648464163822526, + "loss": 1.6496, + "step": 598 + }, + { + "epoch": 0.6814562002275313, + "grad_norm": 0.7675598859786987, + "learning_rate": 0.0008646188850967008, + "loss": 1.5025, + "step": 599 + }, + { + "epoch": 0.6825938566552902, + "grad_norm": 0.9507393836975098, + "learning_rate": 0.000864391353811149, + "loss": 2.1572, + "step": 600 + }, + { + "epoch": 0.6837315130830489, + "grad_norm": 0.7208441495895386, + "learning_rate": 0.0008641638225255974, + "loss": 1.8724, + "step": 601 + }, + { + "epoch": 0.6848691695108078, + "grad_norm": 1.0471227169036865, + "learning_rate": 0.0008639362912400455, + "loss": 1.5183, + "step": 602 + }, + { + "epoch": 0.6860068259385665, + "grad_norm": 0.7617486119270325, + "learning_rate": 0.0008637087599544937, + "loss": 1.9664, + "step": 603 + }, + { + "epoch": 0.6871444823663254, + "grad_norm": 1.7303751707077026, + "learning_rate": 0.000863481228668942, + "loss": 3.4134, + "step": 604 + }, + { + "epoch": 0.6882821387940842, + "grad_norm": 1.1979167461395264, + "learning_rate": 0.0008632536973833902, + "loss": 2.1287, + "step": 605 + }, + { + "epoch": 0.689419795221843, + "grad_norm": 0.8797517418861389, + "learning_rate": 0.0008630261660978384, + "loss": 1.4359, + "step": 606 + }, + { + "epoch": 0.6905574516496018, + "grad_norm": 1.3012560606002808, + "learning_rate": 0.0008627986348122867, + "loss": 1.9451, + "step": 607 + }, + { + "epoch": 0.6916951080773607, + "grad_norm": 0.8899447917938232, + "learning_rate": 0.000862571103526735, + "loss": 1.9974, + "step": 608 + }, + { + "epoch": 0.6928327645051194, + "grad_norm": 1.449118733406067, + "learning_rate": 0.0008623435722411832, + "loss": 3.0255, + "step": 609 + }, + { + "epoch": 0.6939704209328783, + "grad_norm": 1.4355418682098389, + "learning_rate": 0.0008621160409556315, + "loss": 3.6942, + "step": 610 + }, + { + "epoch": 0.6951080773606371, + "grad_norm": 0.9907065033912659, + "learning_rate": 0.0008618885096700797, + "loss": 1.8645, + "step": 611 + }, + { + "epoch": 0.6962457337883959, + "grad_norm": 0.8287332057952881, + "learning_rate": 0.0008616609783845278, + "loss": 1.909, + "step": 612 + }, + { + "epoch": 0.6973833902161547, + "grad_norm": 1.207824468612671, + "learning_rate": 0.0008614334470989761, + "loss": 2.1948, + "step": 613 + }, + { + "epoch": 0.6985210466439136, + "grad_norm": 0.6670779585838318, + "learning_rate": 0.0008612059158134243, + "loss": 1.2597, + "step": 614 + }, + { + "epoch": 0.6996587030716723, + "grad_norm": 0.6023557186126709, + "learning_rate": 0.0008609783845278726, + "loss": 1.2117, + "step": 615 + }, + { + "epoch": 0.7007963594994312, + "grad_norm": 1.5673305988311768, + "learning_rate": 0.0008607508532423208, + "loss": 3.1698, + "step": 616 + }, + { + "epoch": 0.70193401592719, + "grad_norm": 0.9000980257987976, + "learning_rate": 0.000860523321956769, + "loss": 1.9187, + "step": 617 + }, + { + "epoch": 0.7030716723549488, + "grad_norm": 0.9901409149169922, + "learning_rate": 0.0008602957906712174, + "loss": 2.3486, + "step": 618 + }, + { + "epoch": 0.7042093287827076, + "grad_norm": 0.7944636940956116, + "learning_rate": 0.0008600682593856656, + "loss": 2.0677, + "step": 619 + }, + { + "epoch": 0.7053469852104665, + "grad_norm": 0.9612780213356018, + "learning_rate": 0.0008598407281001138, + "loss": 1.693, + "step": 620 + }, + { + "epoch": 0.7064846416382252, + "grad_norm": 1.2384001016616821, + "learning_rate": 0.0008596131968145621, + "loss": 2.9955, + "step": 621 + }, + { + "epoch": 0.7076222980659841, + "grad_norm": 1.5061485767364502, + "learning_rate": 0.0008593856655290102, + "loss": 1.945, + "step": 622 + }, + { + "epoch": 0.7087599544937428, + "grad_norm": 1.0285781621932983, + "learning_rate": 0.0008591581342434584, + "loss": 2.5533, + "step": 623 + }, + { + "epoch": 0.7098976109215017, + "grad_norm": 0.783734917640686, + "learning_rate": 0.0008589306029579067, + "loss": 1.6898, + "step": 624 + }, + { + "epoch": 0.7110352673492605, + "grad_norm": 0.7737388610839844, + "learning_rate": 0.000858703071672355, + "loss": 2.4584, + "step": 625 + }, + { + "epoch": 0.7121729237770194, + "grad_norm": 0.6664589047431946, + "learning_rate": 0.0008584755403868032, + "loss": 1.7401, + "step": 626 + }, + { + "epoch": 0.7133105802047781, + "grad_norm": 0.7452929019927979, + "learning_rate": 0.0008582480091012515, + "loss": 1.3864, + "step": 627 + }, + { + "epoch": 0.714448236632537, + "grad_norm": 0.8343956470489502, + "learning_rate": 0.0008580204778156997, + "loss": 1.4364, + "step": 628 + }, + { + "epoch": 0.7155858930602957, + "grad_norm": 0.8422945737838745, + "learning_rate": 0.0008577929465301479, + "loss": 2.3053, + "step": 629 + }, + { + "epoch": 0.7167235494880546, + "grad_norm": 1.2495871782302856, + "learning_rate": 0.0008575654152445962, + "loss": 3.8026, + "step": 630 + }, + { + "epoch": 0.7178612059158134, + "grad_norm": 1.0053642988204956, + "learning_rate": 0.0008573378839590444, + "loss": 2.2322, + "step": 631 + }, + { + "epoch": 0.7189988623435722, + "grad_norm": 1.342475414276123, + "learning_rate": 0.0008571103526734925, + "loss": 2.2204, + "step": 632 + }, + { + "epoch": 0.7201365187713311, + "grad_norm": 1.1478005647659302, + "learning_rate": 0.0008568828213879408, + "loss": 2.0024, + "step": 633 + }, + { + "epoch": 0.7212741751990899, + "grad_norm": 1.3100638389587402, + "learning_rate": 0.0008566552901023891, + "loss": 1.7865, + "step": 634 + }, + { + "epoch": 0.7224118316268487, + "grad_norm": 0.7397316694259644, + "learning_rate": 0.0008564277588168374, + "loss": 0.9055, + "step": 635 + }, + { + "epoch": 0.7235494880546075, + "grad_norm": 1.5261900424957275, + "learning_rate": 0.0008562002275312856, + "loss": 2.518, + "step": 636 + }, + { + "epoch": 0.7246871444823664, + "grad_norm": 1.1781413555145264, + "learning_rate": 0.0008559726962457338, + "loss": 2.4799, + "step": 637 + }, + { + "epoch": 0.7258248009101251, + "grad_norm": 0.9748155474662781, + "learning_rate": 0.0008557451649601821, + "loss": 1.9048, + "step": 638 + }, + { + "epoch": 0.726962457337884, + "grad_norm": 0.9338977336883545, + "learning_rate": 0.0008555176336746303, + "loss": 2.0054, + "step": 639 + }, + { + "epoch": 0.7281001137656428, + "grad_norm": 1.5872938632965088, + "learning_rate": 0.0008552901023890785, + "loss": 2.7386, + "step": 640 + }, + { + "epoch": 0.7292377701934016, + "grad_norm": 0.899350643157959, + "learning_rate": 0.0008550625711035267, + "loss": 1.694, + "step": 641 + }, + { + "epoch": 0.7303754266211604, + "grad_norm": 1.3371248245239258, + "learning_rate": 0.000854835039817975, + "loss": 2.4103, + "step": 642 + }, + { + "epoch": 0.7315130830489193, + "grad_norm": 0.9004554152488708, + "learning_rate": 0.0008546075085324232, + "loss": 1.7984, + "step": 643 + }, + { + "epoch": 0.732650739476678, + "grad_norm": 1.229750394821167, + "learning_rate": 0.0008543799772468715, + "loss": 2.8663, + "step": 644 + }, + { + "epoch": 0.7337883959044369, + "grad_norm": 0.8719218373298645, + "learning_rate": 0.0008541524459613197, + "loss": 1.5535, + "step": 645 + }, + { + "epoch": 0.7349260523321957, + "grad_norm": 1.2239844799041748, + "learning_rate": 0.0008539249146757679, + "loss": 2.5728, + "step": 646 + }, + { + "epoch": 0.7360637087599545, + "grad_norm": 0.9655494093894958, + "learning_rate": 0.0008536973833902162, + "loss": 1.7532, + "step": 647 + }, + { + "epoch": 0.7372013651877133, + "grad_norm": 0.8480125665664673, + "learning_rate": 0.0008534698521046644, + "loss": 1.3947, + "step": 648 + }, + { + "epoch": 0.7383390216154722, + "grad_norm": 1.0782033205032349, + "learning_rate": 0.0008532423208191126, + "loss": 2.2726, + "step": 649 + }, + { + "epoch": 0.7394766780432309, + "grad_norm": 0.8517910838127136, + "learning_rate": 0.000853014789533561, + "loss": 1.7146, + "step": 650 + }, + { + "epoch": 0.7406143344709898, + "grad_norm": 0.6702635288238525, + "learning_rate": 0.0008527872582480091, + "loss": 1.4064, + "step": 651 + }, + { + "epoch": 0.7417519908987485, + "grad_norm": 0.807049572467804, + "learning_rate": 0.0008525597269624573, + "loss": 1.5396, + "step": 652 + }, + { + "epoch": 0.7428896473265074, + "grad_norm": 1.3272011280059814, + "learning_rate": 0.0008523321956769056, + "loss": 2.6018, + "step": 653 + }, + { + "epoch": 0.7440273037542662, + "grad_norm": 0.8511213064193726, + "learning_rate": 0.0008521046643913538, + "loss": 2.0945, + "step": 654 + }, + { + "epoch": 0.745164960182025, + "grad_norm": 1.0640743970870972, + "learning_rate": 0.0008518771331058021, + "loss": 3.3749, + "step": 655 + }, + { + "epoch": 0.7463026166097838, + "grad_norm": 0.9631751179695129, + "learning_rate": 0.0008516496018202503, + "loss": 2.4991, + "step": 656 + }, + { + "epoch": 0.7474402730375427, + "grad_norm": 1.0921177864074707, + "learning_rate": 0.0008514220705346985, + "loss": 1.9951, + "step": 657 + }, + { + "epoch": 0.7485779294653014, + "grad_norm": 1.032777190208435, + "learning_rate": 0.0008511945392491469, + "loss": 1.9648, + "step": 658 + }, + { + "epoch": 0.7497155858930603, + "grad_norm": 0.7291549444198608, + "learning_rate": 0.0008509670079635951, + "loss": 2.0568, + "step": 659 + }, + { + "epoch": 0.7508532423208191, + "grad_norm": 1.0137559175491333, + "learning_rate": 0.0008507394766780433, + "loss": 2.5676, + "step": 660 + }, + { + "epoch": 0.7519908987485779, + "grad_norm": 0.9185921549797058, + "learning_rate": 0.0008505119453924915, + "loss": 1.6855, + "step": 661 + }, + { + "epoch": 0.7531285551763367, + "grad_norm": 1.0048507452011108, + "learning_rate": 0.0008502844141069397, + "loss": 1.5001, + "step": 662 + }, + { + "epoch": 0.7542662116040956, + "grad_norm": 1.2506744861602783, + "learning_rate": 0.0008500568828213879, + "loss": 2.4675, + "step": 663 + }, + { + "epoch": 0.7554038680318543, + "grad_norm": 1.7534509897232056, + "learning_rate": 0.0008498293515358362, + "loss": 4.192, + "step": 664 + }, + { + "epoch": 0.7565415244596132, + "grad_norm": 1.0792453289031982, + "learning_rate": 0.0008496018202502844, + "loss": 2.2308, + "step": 665 + }, + { + "epoch": 0.757679180887372, + "grad_norm": 0.9935582280158997, + "learning_rate": 0.0008493742889647326, + "loss": 1.499, + "step": 666 + }, + { + "epoch": 0.7588168373151308, + "grad_norm": 0.9015896916389465, + "learning_rate": 0.000849146757679181, + "loss": 1.6236, + "step": 667 + }, + { + "epoch": 0.7599544937428896, + "grad_norm": 1.0141130685806274, + "learning_rate": 0.0008489192263936292, + "loss": 1.9243, + "step": 668 + }, + { + "epoch": 0.7610921501706485, + "grad_norm": 0.5651462078094482, + "learning_rate": 0.0008486916951080774, + "loss": 0.778, + "step": 669 + }, + { + "epoch": 0.7622298065984073, + "grad_norm": 1.0191128253936768, + "learning_rate": 0.0008484641638225257, + "loss": 2.0027, + "step": 670 + }, + { + "epoch": 0.7633674630261661, + "grad_norm": 1.0855798721313477, + "learning_rate": 0.0008482366325369738, + "loss": 2.4567, + "step": 671 + }, + { + "epoch": 0.764505119453925, + "grad_norm": 1.7190563678741455, + "learning_rate": 0.000848009101251422, + "loss": 2.7192, + "step": 672 + }, + { + "epoch": 0.7656427758816837, + "grad_norm": 1.0108932256698608, + "learning_rate": 0.0008477815699658703, + "loss": 1.2952, + "step": 673 + }, + { + "epoch": 0.7667804323094426, + "grad_norm": 2.3063066005706787, + "learning_rate": 0.0008475540386803185, + "loss": 1.726, + "step": 674 + }, + { + "epoch": 0.7679180887372014, + "grad_norm": 0.5806533098220825, + "learning_rate": 0.0008473265073947669, + "loss": 1.3231, + "step": 675 + }, + { + "epoch": 0.7690557451649602, + "grad_norm": 0.9485315680503845, + "learning_rate": 0.0008470989761092151, + "loss": 1.7927, + "step": 676 + }, + { + "epoch": 0.770193401592719, + "grad_norm": 0.7798807621002197, + "learning_rate": 0.0008468714448236633, + "loss": 1.4712, + "step": 677 + }, + { + "epoch": 0.7713310580204779, + "grad_norm": 1.0551048517227173, + "learning_rate": 0.0008466439135381116, + "loss": 1.9981, + "step": 678 + }, + { + "epoch": 0.7724687144482366, + "grad_norm": 1.0884678363800049, + "learning_rate": 0.0008464163822525598, + "loss": 2.0978, + "step": 679 + }, + { + "epoch": 0.7736063708759955, + "grad_norm": 0.9386458992958069, + "learning_rate": 0.0008461888509670079, + "loss": 2.5401, + "step": 680 + }, + { + "epoch": 0.7747440273037542, + "grad_norm": 1.1224387884140015, + "learning_rate": 0.0008459613196814562, + "loss": 2.5177, + "step": 681 + }, + { + "epoch": 0.7758816837315131, + "grad_norm": 0.7325891852378845, + "learning_rate": 0.0008457337883959044, + "loss": 1.9845, + "step": 682 + }, + { + "epoch": 0.7770193401592719, + "grad_norm": 0.8216614723205566, + "learning_rate": 0.0008455062571103526, + "loss": 1.7352, + "step": 683 + }, + { + "epoch": 0.7781569965870307, + "grad_norm": 0.8514799475669861, + "learning_rate": 0.000845278725824801, + "loss": 1.6026, + "step": 684 + }, + { + "epoch": 0.7792946530147895, + "grad_norm": 1.0461024045944214, + "learning_rate": 0.0008450511945392492, + "loss": 2.7851, + "step": 685 + }, + { + "epoch": 0.7804323094425484, + "grad_norm": 1.0738078355789185, + "learning_rate": 0.0008448236632536974, + "loss": 2.2409, + "step": 686 + }, + { + "epoch": 0.7815699658703071, + "grad_norm": 1.5895496606826782, + "learning_rate": 0.0008445961319681457, + "loss": 2.6157, + "step": 687 + }, + { + "epoch": 0.782707622298066, + "grad_norm": 1.095568299293518, + "learning_rate": 0.0008443686006825939, + "loss": 3.4401, + "step": 688 + }, + { + "epoch": 0.7838452787258248, + "grad_norm": 1.2721914052963257, + "learning_rate": 0.0008441410693970421, + "loss": 3.7557, + "step": 689 + }, + { + "epoch": 0.7849829351535836, + "grad_norm": 0.8100789785385132, + "learning_rate": 0.0008439135381114903, + "loss": 1.8537, + "step": 690 + }, + { + "epoch": 0.7861205915813424, + "grad_norm": 0.8364444375038147, + "learning_rate": 0.0008436860068259385, + "loss": 1.9646, + "step": 691 + }, + { + "epoch": 0.7872582480091013, + "grad_norm": 0.8454108834266663, + "learning_rate": 0.0008434584755403867, + "loss": 1.3103, + "step": 692 + }, + { + "epoch": 0.78839590443686, + "grad_norm": 1.0426613092422485, + "learning_rate": 0.0008432309442548351, + "loss": 2.1801, + "step": 693 + }, + { + "epoch": 0.7895335608646189, + "grad_norm": 0.6206464171409607, + "learning_rate": 0.0008430034129692833, + "loss": 0.6764, + "step": 694 + }, + { + "epoch": 0.7906712172923777, + "grad_norm": 1.0742131471633911, + "learning_rate": 0.0008427758816837315, + "loss": 2.0438, + "step": 695 + }, + { + "epoch": 0.7918088737201365, + "grad_norm": 1.2915891408920288, + "learning_rate": 0.0008425483503981798, + "loss": 2.3515, + "step": 696 + }, + { + "epoch": 0.7929465301478953, + "grad_norm": 1.545964002609253, + "learning_rate": 0.000842320819112628, + "loss": 3.3463, + "step": 697 + }, + { + "epoch": 0.7940841865756542, + "grad_norm": 0.8047581911087036, + "learning_rate": 0.0008420932878270763, + "loss": 2.392, + "step": 698 + }, + { + "epoch": 0.7952218430034129, + "grad_norm": 1.0554293394088745, + "learning_rate": 0.0008418657565415245, + "loss": 1.8506, + "step": 699 + }, + { + "epoch": 0.7963594994311718, + "grad_norm": 1.518147587776184, + "learning_rate": 0.0008416382252559726, + "loss": 4.3646, + "step": 700 + }, + { + "epoch": 0.7974971558589306, + "grad_norm": 0.942899227142334, + "learning_rate": 0.000841410693970421, + "loss": 1.8301, + "step": 701 + }, + { + "epoch": 0.7986348122866894, + "grad_norm": 1.1088321208953857, + "learning_rate": 0.0008411831626848692, + "loss": 3.0843, + "step": 702 + }, + { + "epoch": 0.7997724687144482, + "grad_norm": 1.0423277616500854, + "learning_rate": 0.0008409556313993174, + "loss": 2.6175, + "step": 703 + }, + { + "epoch": 0.800910125142207, + "grad_norm": 0.7414242625236511, + "learning_rate": 0.0008407281001137657, + "loss": 1.399, + "step": 704 + }, + { + "epoch": 0.8020477815699659, + "grad_norm": 1.0195127725601196, + "learning_rate": 0.0008405005688282139, + "loss": 2.1314, + "step": 705 + }, + { + "epoch": 0.8031854379977247, + "grad_norm": 1.4709278345108032, + "learning_rate": 0.0008402730375426621, + "loss": 3.2922, + "step": 706 + }, + { + "epoch": 0.8043230944254836, + "grad_norm": 1.656906247138977, + "learning_rate": 0.0008400455062571104, + "loss": 3.844, + "step": 707 + }, + { + "epoch": 0.8054607508532423, + "grad_norm": 0.985885500907898, + "learning_rate": 0.0008398179749715586, + "loss": 2.9415, + "step": 708 + }, + { + "epoch": 0.8065984072810012, + "grad_norm": 1.5020736455917358, + "learning_rate": 0.0008395904436860067, + "loss": 2.183, + "step": 709 + }, + { + "epoch": 0.8077360637087599, + "grad_norm": 0.7547074556350708, + "learning_rate": 0.0008393629124004551, + "loss": 1.8464, + "step": 710 + }, + { + "epoch": 0.8088737201365188, + "grad_norm": 1.1752091646194458, + "learning_rate": 0.0008391353811149033, + "loss": 2.5145, + "step": 711 + }, + { + "epoch": 0.8100113765642776, + "grad_norm": 0.7590292096138, + "learning_rate": 0.0008389078498293515, + "loss": 1.4564, + "step": 712 + }, + { + "epoch": 0.8111490329920364, + "grad_norm": 0.8058563470840454, + "learning_rate": 0.0008386803185437998, + "loss": 1.7101, + "step": 713 + }, + { + "epoch": 0.8122866894197952, + "grad_norm": 0.971361517906189, + "learning_rate": 0.000838452787258248, + "loss": 1.8707, + "step": 714 + }, + { + "epoch": 0.8134243458475541, + "grad_norm": 1.3266187906265259, + "learning_rate": 0.0008382252559726962, + "loss": 2.8622, + "step": 715 + }, + { + "epoch": 0.8145620022753128, + "grad_norm": 1.2026985883712769, + "learning_rate": 0.0008379977246871445, + "loss": 2.386, + "step": 716 + }, + { + "epoch": 0.8156996587030717, + "grad_norm": 1.041385531425476, + "learning_rate": 0.0008377701934015928, + "loss": 2.1941, + "step": 717 + }, + { + "epoch": 0.8168373151308305, + "grad_norm": 0.9578999280929565, + "learning_rate": 0.0008375426621160411, + "loss": 2.4792, + "step": 718 + }, + { + "epoch": 0.8179749715585893, + "grad_norm": 1.569138765335083, + "learning_rate": 0.0008373151308304892, + "loss": 2.5056, + "step": 719 + }, + { + "epoch": 0.8191126279863481, + "grad_norm": 1.2005671262741089, + "learning_rate": 0.0008370875995449374, + "loss": 2.8101, + "step": 720 + }, + { + "epoch": 0.820250284414107, + "grad_norm": 0.8075481057167053, + "learning_rate": 0.0008368600682593857, + "loss": 1.1596, + "step": 721 + }, + { + "epoch": 0.8213879408418657, + "grad_norm": 1.131332278251648, + "learning_rate": 0.0008366325369738339, + "loss": 2.2506, + "step": 722 + }, + { + "epoch": 0.8225255972696246, + "grad_norm": 1.0563104152679443, + "learning_rate": 0.0008364050056882821, + "loss": 1.9322, + "step": 723 + }, + { + "epoch": 0.8236632536973834, + "grad_norm": 0.7609984874725342, + "learning_rate": 0.0008361774744027304, + "loss": 1.8229, + "step": 724 + }, + { + "epoch": 0.8248009101251422, + "grad_norm": 0.8261270523071289, + "learning_rate": 0.0008359499431171786, + "loss": 2.785, + "step": 725 + }, + { + "epoch": 0.825938566552901, + "grad_norm": 0.5597876310348511, + "learning_rate": 0.0008357224118316269, + "loss": 1.0098, + "step": 726 + }, + { + "epoch": 0.8270762229806599, + "grad_norm": 1.219977617263794, + "learning_rate": 0.0008354948805460752, + "loss": 3.3632, + "step": 727 + }, + { + "epoch": 0.8282138794084186, + "grad_norm": 1.1048645973205566, + "learning_rate": 0.0008352673492605234, + "loss": 3.235, + "step": 728 + }, + { + "epoch": 0.8293515358361775, + "grad_norm": 1.0460500717163086, + "learning_rate": 0.0008350398179749715, + "loss": 1.7857, + "step": 729 + }, + { + "epoch": 0.8304891922639362, + "grad_norm": 1.144655466079712, + "learning_rate": 0.0008348122866894198, + "loss": 2.25, + "step": 730 + }, + { + "epoch": 0.8316268486916951, + "grad_norm": 0.9807350039482117, + "learning_rate": 0.000834584755403868, + "loss": 2.4989, + "step": 731 + }, + { + "epoch": 0.8327645051194539, + "grad_norm": 0.9099501371383667, + "learning_rate": 0.0008343572241183162, + "loss": 2.3824, + "step": 732 + }, + { + "epoch": 0.8339021615472128, + "grad_norm": 0.6631197333335876, + "learning_rate": 0.0008341296928327645, + "loss": 1.6735, + "step": 733 + }, + { + "epoch": 0.8350398179749715, + "grad_norm": 0.6548817157745361, + "learning_rate": 0.0008339021615472128, + "loss": 1.5283, + "step": 734 + }, + { + "epoch": 0.8361774744027304, + "grad_norm": 0.6837170720100403, + "learning_rate": 0.000833674630261661, + "loss": 1.3024, + "step": 735 + }, + { + "epoch": 0.8373151308304891, + "grad_norm": 0.8492507934570312, + "learning_rate": 0.0008334470989761093, + "loss": 1.6662, + "step": 736 + }, + { + "epoch": 0.838452787258248, + "grad_norm": 1.2175747156143188, + "learning_rate": 0.0008332195676905575, + "loss": 2.804, + "step": 737 + }, + { + "epoch": 0.8395904436860068, + "grad_norm": 1.0982182025909424, + "learning_rate": 0.0008329920364050058, + "loss": 2.1759, + "step": 738 + }, + { + "epoch": 0.8407281001137656, + "grad_norm": 1.106663465499878, + "learning_rate": 0.0008327645051194539, + "loss": 2.0419, + "step": 739 + }, + { + "epoch": 0.8418657565415245, + "grad_norm": 0.7106451988220215, + "learning_rate": 0.0008325369738339021, + "loss": 1.2006, + "step": 740 + }, + { + "epoch": 0.8430034129692833, + "grad_norm": 1.1868878602981567, + "learning_rate": 0.0008323094425483504, + "loss": 2.0938, + "step": 741 + }, + { + "epoch": 0.8441410693970421, + "grad_norm": 1.006433367729187, + "learning_rate": 0.0008320819112627986, + "loss": 1.7796, + "step": 742 + }, + { + "epoch": 0.8452787258248009, + "grad_norm": 1.0175524950027466, + "learning_rate": 0.0008318543799772469, + "loss": 2.5526, + "step": 743 + }, + { + "epoch": 0.8464163822525598, + "grad_norm": 0.9404505491256714, + "learning_rate": 0.0008316268486916952, + "loss": 1.9948, + "step": 744 + }, + { + "epoch": 0.8475540386803185, + "grad_norm": 0.9086321592330933, + "learning_rate": 0.0008313993174061434, + "loss": 1.9277, + "step": 745 + }, + { + "epoch": 0.8486916951080774, + "grad_norm": 0.7922766208648682, + "learning_rate": 0.0008311717861205916, + "loss": 2.3355, + "step": 746 + }, + { + "epoch": 0.8498293515358362, + "grad_norm": 0.8049002289772034, + "learning_rate": 0.0008309442548350399, + "loss": 1.7969, + "step": 747 + }, + { + "epoch": 0.850967007963595, + "grad_norm": 0.8303267955780029, + "learning_rate": 0.000830716723549488, + "loss": 1.4973, + "step": 748 + }, + { + "epoch": 0.8521046643913538, + "grad_norm": 0.7163656949996948, + "learning_rate": 0.0008304891922639362, + "loss": 1.7677, + "step": 749 + }, + { + "epoch": 0.8532423208191127, + "grad_norm": 1.022519588470459, + "learning_rate": 0.0008302616609783845, + "loss": 1.6188, + "step": 750 + }, + { + "epoch": 0.8543799772468714, + "grad_norm": 1.2985116243362427, + "learning_rate": 0.0008300341296928328, + "loss": 2.3702, + "step": 751 + }, + { + "epoch": 0.8555176336746303, + "grad_norm": 0.7469916939735413, + "learning_rate": 0.000829806598407281, + "loss": 1.1039, + "step": 752 + }, + { + "epoch": 0.856655290102389, + "grad_norm": 0.8801395297050476, + "learning_rate": 0.0008295790671217293, + "loss": 1.615, + "step": 753 + }, + { + "epoch": 0.8577929465301479, + "grad_norm": 0.6937797665596008, + "learning_rate": 0.0008293515358361775, + "loss": 1.366, + "step": 754 + }, + { + "epoch": 0.8589306029579067, + "grad_norm": 0.92084801197052, + "learning_rate": 0.0008291240045506257, + "loss": 1.1217, + "step": 755 + }, + { + "epoch": 0.8600682593856656, + "grad_norm": 1.1955760717391968, + "learning_rate": 0.000828896473265074, + "loss": 2.3874, + "step": 756 + }, + { + "epoch": 0.8612059158134243, + "grad_norm": 1.270263433456421, + "learning_rate": 0.0008286689419795222, + "loss": 0.7639, + "step": 757 + }, + { + "epoch": 0.8623435722411832, + "grad_norm": 1.0862644910812378, + "learning_rate": 0.0008284414106939704, + "loss": 1.666, + "step": 758 + }, + { + "epoch": 0.863481228668942, + "grad_norm": 1.0658801794052124, + "learning_rate": 0.0008282138794084187, + "loss": 3.0639, + "step": 759 + }, + { + "epoch": 0.8646188850967008, + "grad_norm": 0.7104091644287109, + "learning_rate": 0.0008279863481228669, + "loss": 1.7136, + "step": 760 + }, + { + "epoch": 0.8657565415244596, + "grad_norm": 1.0300426483154297, + "learning_rate": 0.0008277588168373152, + "loss": 1.9242, + "step": 761 + }, + { + "epoch": 0.8668941979522184, + "grad_norm": 0.9637190699577332, + "learning_rate": 0.0008275312855517634, + "loss": 2.0804, + "step": 762 + }, + { + "epoch": 0.8680318543799772, + "grad_norm": 0.9905063509941101, + "learning_rate": 0.0008273037542662116, + "loss": 1.7953, + "step": 763 + }, + { + "epoch": 0.8691695108077361, + "grad_norm": 1.0103565454483032, + "learning_rate": 0.0008270762229806599, + "loss": 2.3104, + "step": 764 + }, + { + "epoch": 0.8703071672354948, + "grad_norm": 0.6847909688949585, + "learning_rate": 0.0008268486916951081, + "loss": 0.9037, + "step": 765 + }, + { + "epoch": 0.8714448236632537, + "grad_norm": 0.958070695400238, + "learning_rate": 0.0008266211604095563, + "loss": 1.3769, + "step": 766 + }, + { + "epoch": 0.8725824800910125, + "grad_norm": 1.081380009651184, + "learning_rate": 0.0008263936291240047, + "loss": 2.6554, + "step": 767 + }, + { + "epoch": 0.8737201365187713, + "grad_norm": 1.003764271736145, + "learning_rate": 0.0008261660978384528, + "loss": 2.3287, + "step": 768 + }, + { + "epoch": 0.8748577929465301, + "grad_norm": 0.9018321633338928, + "learning_rate": 0.000825938566552901, + "loss": 1.1058, + "step": 769 + }, + { + "epoch": 0.875995449374289, + "grad_norm": 1.702228307723999, + "learning_rate": 0.0008257110352673493, + "loss": 2.4671, + "step": 770 + }, + { + "epoch": 0.8771331058020477, + "grad_norm": 0.7024636268615723, + "learning_rate": 0.0008254835039817975, + "loss": 1.3952, + "step": 771 + }, + { + "epoch": 0.8782707622298066, + "grad_norm": 0.8887207508087158, + "learning_rate": 0.0008252559726962457, + "loss": 1.8992, + "step": 772 + }, + { + "epoch": 0.8794084186575654, + "grad_norm": 0.8344920873641968, + "learning_rate": 0.000825028441410694, + "loss": 1.003, + "step": 773 + }, + { + "epoch": 0.8805460750853242, + "grad_norm": 1.174970030784607, + "learning_rate": 0.0008248009101251422, + "loss": 2.1063, + "step": 774 + }, + { + "epoch": 0.8816837315130831, + "grad_norm": 1.4693725109100342, + "learning_rate": 0.0008245733788395904, + "loss": 2.349, + "step": 775 + }, + { + "epoch": 0.8828213879408419, + "grad_norm": 0.8909391164779663, + "learning_rate": 0.0008243458475540388, + "loss": 2.3083, + "step": 776 + }, + { + "epoch": 0.8839590443686007, + "grad_norm": 1.178287148475647, + "learning_rate": 0.000824118316268487, + "loss": 2.2453, + "step": 777 + }, + { + "epoch": 0.8850967007963595, + "grad_norm": 1.1147403717041016, + "learning_rate": 0.0008238907849829351, + "loss": 3.2198, + "step": 778 + }, + { + "epoch": 0.8862343572241184, + "grad_norm": 0.8934231996536255, + "learning_rate": 0.0008236632536973834, + "loss": 2.1325, + "step": 779 + }, + { + "epoch": 0.8873720136518771, + "grad_norm": 0.7500196695327759, + "learning_rate": 0.0008234357224118316, + "loss": 1.062, + "step": 780 + }, + { + "epoch": 0.888509670079636, + "grad_norm": 1.1905717849731445, + "learning_rate": 0.0008232081911262799, + "loss": 2.6013, + "step": 781 + }, + { + "epoch": 0.8896473265073948, + "grad_norm": 0.7128300070762634, + "learning_rate": 0.0008229806598407281, + "loss": 1.4425, + "step": 782 + }, + { + "epoch": 0.8907849829351536, + "grad_norm": 0.9810397028923035, + "learning_rate": 0.0008227531285551763, + "loss": 2.2742, + "step": 783 + }, + { + "epoch": 0.8919226393629124, + "grad_norm": 1.1497716903686523, + "learning_rate": 0.0008225255972696247, + "loss": 2.6226, + "step": 784 + }, + { + "epoch": 0.8930602957906713, + "grad_norm": 1.3112133741378784, + "learning_rate": 0.0008222980659840729, + "loss": 1.8846, + "step": 785 + }, + { + "epoch": 0.89419795221843, + "grad_norm": 1.030337929725647, + "learning_rate": 0.0008220705346985211, + "loss": 2.6641, + "step": 786 + }, + { + "epoch": 0.8953356086461889, + "grad_norm": 1.1316418647766113, + "learning_rate": 0.0008218430034129693, + "loss": 1.8058, + "step": 787 + }, + { + "epoch": 0.8964732650739476, + "grad_norm": 1.315293788909912, + "learning_rate": 0.0008216154721274175, + "loss": 1.6608, + "step": 788 + }, + { + "epoch": 0.8976109215017065, + "grad_norm": 1.0237807035446167, + "learning_rate": 0.0008213879408418657, + "loss": 2.9869, + "step": 789 + }, + { + "epoch": 0.8987485779294653, + "grad_norm": 1.0724749565124512, + "learning_rate": 0.000821160409556314, + "loss": 1.7659, + "step": 790 + }, + { + "epoch": 0.8998862343572241, + "grad_norm": 0.8942409753799438, + "learning_rate": 0.0008209328782707622, + "loss": 1.5329, + "step": 791 + }, + { + "epoch": 0.9010238907849829, + "grad_norm": 1.5801607370376587, + "learning_rate": 0.0008207053469852104, + "loss": 1.8717, + "step": 792 + }, + { + "epoch": 0.9021615472127418, + "grad_norm": 0.8906893730163574, + "learning_rate": 0.0008204778156996588, + "loss": 1.5824, + "step": 793 + }, + { + "epoch": 0.9032992036405005, + "grad_norm": 0.6598983407020569, + "learning_rate": 0.000820250284414107, + "loss": 1.4168, + "step": 794 + }, + { + "epoch": 0.9044368600682594, + "grad_norm": 0.7640515565872192, + "learning_rate": 0.0008200227531285552, + "loss": 1.7068, + "step": 795 + }, + { + "epoch": 0.9055745164960182, + "grad_norm": 1.3205150365829468, + "learning_rate": 0.0008197952218430035, + "loss": 2.5249, + "step": 796 + }, + { + "epoch": 0.906712172923777, + "grad_norm": 1.2995262145996094, + "learning_rate": 0.0008195676905574516, + "loss": 2.9522, + "step": 797 + }, + { + "epoch": 0.9078498293515358, + "grad_norm": 0.8465941548347473, + "learning_rate": 0.0008193401592718998, + "loss": 2.0508, + "step": 798 + }, + { + "epoch": 0.9089874857792947, + "grad_norm": 1.2404921054840088, + "learning_rate": 0.0008191126279863481, + "loss": 2.4488, + "step": 799 + }, + { + "epoch": 0.9101251422070534, + "grad_norm": 1.00394868850708, + "learning_rate": 0.0008188850967007963, + "loss": 1.665, + "step": 800 + }, + { + "epoch": 0.9112627986348123, + "grad_norm": 1.0288304090499878, + "learning_rate": 0.0008186575654152447, + "loss": 2.2679, + "step": 801 + }, + { + "epoch": 0.9124004550625711, + "grad_norm": 1.0812597274780273, + "learning_rate": 0.0008184300341296929, + "loss": 2.092, + "step": 802 + }, + { + "epoch": 0.9135381114903299, + "grad_norm": 0.9165686964988708, + "learning_rate": 0.0008182025028441411, + "loss": 1.6572, + "step": 803 + }, + { + "epoch": 0.9146757679180887, + "grad_norm": 1.254876971244812, + "learning_rate": 0.0008179749715585894, + "loss": 2.2742, + "step": 804 + }, + { + "epoch": 0.9158134243458476, + "grad_norm": 0.8111211657524109, + "learning_rate": 0.0008177474402730376, + "loss": 1.4599, + "step": 805 + }, + { + "epoch": 0.9169510807736063, + "grad_norm": 1.2796216011047363, + "learning_rate": 0.0008175199089874858, + "loss": 2.8677, + "step": 806 + }, + { + "epoch": 0.9180887372013652, + "grad_norm": 0.5638169050216675, + "learning_rate": 0.000817292377701934, + "loss": 1.0711, + "step": 807 + }, + { + "epoch": 0.919226393629124, + "grad_norm": 1.0181145668029785, + "learning_rate": 0.0008170648464163822, + "loss": 2.546, + "step": 808 + }, + { + "epoch": 0.9203640500568828, + "grad_norm": 1.0140022039413452, + "learning_rate": 0.0008168373151308304, + "loss": 1.6735, + "step": 809 + }, + { + "epoch": 0.9215017064846417, + "grad_norm": 0.6265180706977844, + "learning_rate": 0.0008166097838452788, + "loss": 1.581, + "step": 810 + }, + { + "epoch": 0.9226393629124005, + "grad_norm": 0.9092225432395935, + "learning_rate": 0.000816382252559727, + "loss": 2.0735, + "step": 811 + }, + { + "epoch": 0.9237770193401593, + "grad_norm": 1.3408722877502441, + "learning_rate": 0.0008161547212741752, + "loss": 3.3588, + "step": 812 + }, + { + "epoch": 0.9249146757679181, + "grad_norm": 1.0309351682662964, + "learning_rate": 0.0008159271899886235, + "loss": 2.2884, + "step": 813 + }, + { + "epoch": 0.926052332195677, + "grad_norm": 1.130393147468567, + "learning_rate": 0.0008156996587030717, + "loss": 1.994, + "step": 814 + }, + { + "epoch": 0.9271899886234357, + "grad_norm": 0.9755035042762756, + "learning_rate": 0.0008154721274175199, + "loss": 2.4471, + "step": 815 + }, + { + "epoch": 0.9283276450511946, + "grad_norm": 0.7888014912605286, + "learning_rate": 0.0008152445961319681, + "loss": 1.7634, + "step": 816 + }, + { + "epoch": 0.9294653014789533, + "grad_norm": 1.2425562143325806, + "learning_rate": 0.0008150170648464163, + "loss": 2.6725, + "step": 817 + }, + { + "epoch": 0.9306029579067122, + "grad_norm": 1.112234354019165, + "learning_rate": 0.0008147895335608646, + "loss": 2.0245, + "step": 818 + }, + { + "epoch": 0.931740614334471, + "grad_norm": 1.8099498748779297, + "learning_rate": 0.0008145620022753129, + "loss": 3.6121, + "step": 819 + }, + { + "epoch": 0.9328782707622298, + "grad_norm": 0.594555139541626, + "learning_rate": 0.0008143344709897611, + "loss": 1.1207, + "step": 820 + }, + { + "epoch": 0.9340159271899886, + "grad_norm": 1.2875663042068481, + "learning_rate": 0.0008141069397042094, + "loss": 2.6515, + "step": 821 + }, + { + "epoch": 0.9351535836177475, + "grad_norm": 1.2231221199035645, + "learning_rate": 0.0008138794084186576, + "loss": 2.1126, + "step": 822 + }, + { + "epoch": 0.9362912400455062, + "grad_norm": 0.6779431700706482, + "learning_rate": 0.0008136518771331058, + "loss": 1.0446, + "step": 823 + }, + { + "epoch": 0.9374288964732651, + "grad_norm": 0.8786545991897583, + "learning_rate": 0.0008134243458475541, + "loss": 1.8822, + "step": 824 + }, + { + "epoch": 0.9385665529010239, + "grad_norm": 0.7241141200065613, + "learning_rate": 0.0008131968145620023, + "loss": 1.3241, + "step": 825 + }, + { + "epoch": 0.9397042093287827, + "grad_norm": 1.032473087310791, + "learning_rate": 0.0008129692832764504, + "loss": 2.3931, + "step": 826 + }, + { + "epoch": 0.9408418657565415, + "grad_norm": 0.8265206217765808, + "learning_rate": 0.0008127417519908988, + "loss": 1.842, + "step": 827 + }, + { + "epoch": 0.9419795221843004, + "grad_norm": 0.9826211929321289, + "learning_rate": 0.000812514220705347, + "loss": 2.6482, + "step": 828 + }, + { + "epoch": 0.9431171786120591, + "grad_norm": 1.065434455871582, + "learning_rate": 0.0008122866894197952, + "loss": 2.2127, + "step": 829 + }, + { + "epoch": 0.944254835039818, + "grad_norm": 0.6911349892616272, + "learning_rate": 0.0008120591581342435, + "loss": 1.7043, + "step": 830 + }, + { + "epoch": 0.9453924914675768, + "grad_norm": 1.031029224395752, + "learning_rate": 0.0008118316268486917, + "loss": 1.971, + "step": 831 + }, + { + "epoch": 0.9465301478953356, + "grad_norm": 0.9776269197463989, + "learning_rate": 0.0008116040955631399, + "loss": 1.7573, + "step": 832 + }, + { + "epoch": 0.9476678043230944, + "grad_norm": 1.006766438484192, + "learning_rate": 0.0008113765642775882, + "loss": 2.1867, + "step": 833 + }, + { + "epoch": 0.9488054607508533, + "grad_norm": 1.4474374055862427, + "learning_rate": 0.0008111490329920365, + "loss": 1.8203, + "step": 834 + }, + { + "epoch": 0.949943117178612, + "grad_norm": 1.0589663982391357, + "learning_rate": 0.0008109215017064847, + "loss": 2.376, + "step": 835 + }, + { + "epoch": 0.9510807736063709, + "grad_norm": 0.8817835450172424, + "learning_rate": 0.0008106939704209329, + "loss": 2.1685, + "step": 836 + }, + { + "epoch": 0.9522184300341296, + "grad_norm": 1.0257823467254639, + "learning_rate": 0.0008104664391353811, + "loss": 2.7541, + "step": 837 + }, + { + "epoch": 0.9533560864618885, + "grad_norm": 0.6072942018508911, + "learning_rate": 0.0008102389078498293, + "loss": 0.9657, + "step": 838 + }, + { + "epoch": 0.9544937428896473, + "grad_norm": 0.6976664066314697, + "learning_rate": 0.0008100113765642776, + "loss": 1.7921, + "step": 839 + }, + { + "epoch": 0.9556313993174061, + "grad_norm": 0.7202094197273254, + "learning_rate": 0.0008097838452787258, + "loss": 1.6851, + "step": 840 + }, + { + "epoch": 0.9567690557451649, + "grad_norm": 0.7134101390838623, + "learning_rate": 0.0008095563139931741, + "loss": 1.2245, + "step": 841 + }, + { + "epoch": 0.9579067121729238, + "grad_norm": 0.9187147617340088, + "learning_rate": 0.0008093287827076223, + "loss": 1.9654, + "step": 842 + }, + { + "epoch": 0.9590443686006825, + "grad_norm": 1.6910814046859741, + "learning_rate": 0.0008091012514220706, + "loss": 3.1648, + "step": 843 + }, + { + "epoch": 0.9601820250284414, + "grad_norm": 1.1383386850357056, + "learning_rate": 0.0008088737201365189, + "loss": 1.5833, + "step": 844 + }, + { + "epoch": 0.9613196814562003, + "grad_norm": 1.1125869750976562, + "learning_rate": 0.0008086461888509671, + "loss": 2.4418, + "step": 845 + }, + { + "epoch": 0.962457337883959, + "grad_norm": 1.096062183380127, + "learning_rate": 0.0008084186575654152, + "loss": 1.5872, + "step": 846 + }, + { + "epoch": 0.9635949943117179, + "grad_norm": 1.1159650087356567, + "learning_rate": 0.0008081911262798635, + "loss": 0.9669, + "step": 847 + }, + { + "epoch": 0.9647326507394767, + "grad_norm": 0.8109821081161499, + "learning_rate": 0.0008079635949943117, + "loss": 2.3282, + "step": 848 + }, + { + "epoch": 0.9658703071672355, + "grad_norm": 0.6839150786399841, + "learning_rate": 0.0008077360637087599, + "loss": 1.5686, + "step": 849 + }, + { + "epoch": 0.9670079635949943, + "grad_norm": 1.0192281007766724, + "learning_rate": 0.0008075085324232082, + "loss": 2.3769, + "step": 850 + }, + { + "epoch": 0.9681456200227532, + "grad_norm": 1.0800739526748657, + "learning_rate": 0.0008072810011376565, + "loss": 1.9699, + "step": 851 + }, + { + "epoch": 0.9692832764505119, + "grad_norm": 1.4787782430648804, + "learning_rate": 0.0008070534698521047, + "loss": 2.1332, + "step": 852 + }, + { + "epoch": 0.9704209328782708, + "grad_norm": 0.8719042539596558, + "learning_rate": 0.000806825938566553, + "loss": 1.9816, + "step": 853 + }, + { + "epoch": 0.9715585893060296, + "grad_norm": 0.4718307554721832, + "learning_rate": 0.0008065984072810012, + "loss": 0.907, + "step": 854 + }, + { + "epoch": 0.9726962457337884, + "grad_norm": 0.7292342185974121, + "learning_rate": 0.0008063708759954493, + "loss": 1.3498, + "step": 855 + }, + { + "epoch": 0.9738339021615472, + "grad_norm": 1.0442382097244263, + "learning_rate": 0.0008061433447098976, + "loss": 0.8467, + "step": 856 + }, + { + "epoch": 0.9749715585893061, + "grad_norm": 1.4007940292358398, + "learning_rate": 0.0008059158134243458, + "loss": 2.5427, + "step": 857 + }, + { + "epoch": 0.9761092150170648, + "grad_norm": 0.6884713172912598, + "learning_rate": 0.000805688282138794, + "loss": 1.4121, + "step": 858 + }, + { + "epoch": 0.9772468714448237, + "grad_norm": 0.8210100531578064, + "learning_rate": 0.0008054607508532424, + "loss": 1.4311, + "step": 859 + }, + { + "epoch": 0.9783845278725825, + "grad_norm": 1.017969012260437, + "learning_rate": 0.0008052332195676906, + "loss": 2.1577, + "step": 860 + }, + { + "epoch": 0.9795221843003413, + "grad_norm": 0.6463725566864014, + "learning_rate": 0.0008050056882821389, + "loss": 1.1913, + "step": 861 + }, + { + "epoch": 0.9806598407281001, + "grad_norm": 0.609213650226593, + "learning_rate": 0.0008047781569965871, + "loss": 1.4609, + "step": 862 + }, + { + "epoch": 0.981797497155859, + "grad_norm": 1.2324273586273193, + "learning_rate": 0.0008045506257110353, + "loss": 1.5182, + "step": 863 + }, + { + "epoch": 0.9829351535836177, + "grad_norm": 1.6558321714401245, + "learning_rate": 0.0008043230944254836, + "loss": 2.2892, + "step": 864 + }, + { + "epoch": 0.9840728100113766, + "grad_norm": 1.1968799829483032, + "learning_rate": 0.0008040955631399317, + "loss": 2.6779, + "step": 865 + }, + { + "epoch": 0.9852104664391353, + "grad_norm": 0.6827085614204407, + "learning_rate": 0.0008038680318543799, + "loss": 1.8453, + "step": 866 + }, + { + "epoch": 0.9863481228668942, + "grad_norm": 0.70023113489151, + "learning_rate": 0.0008036405005688282, + "loss": 1.5166, + "step": 867 + }, + { + "epoch": 0.987485779294653, + "grad_norm": 1.1971696615219116, + "learning_rate": 0.0008034129692832765, + "loss": 3.0315, + "step": 868 + }, + { + "epoch": 0.9886234357224118, + "grad_norm": 1.1876801252365112, + "learning_rate": 0.0008031854379977247, + "loss": 2.263, + "step": 869 + }, + { + "epoch": 0.9897610921501706, + "grad_norm": 1.1676205396652222, + "learning_rate": 0.000802957906712173, + "loss": 1.9647, + "step": 870 + }, + { + "epoch": 0.9908987485779295, + "grad_norm": 1.2230088710784912, + "learning_rate": 0.0008027303754266212, + "loss": 3.6692, + "step": 871 + }, + { + "epoch": 0.9920364050056882, + "grad_norm": 0.9202735424041748, + "learning_rate": 0.0008025028441410694, + "loss": 1.9178, + "step": 872 + }, + { + "epoch": 0.9931740614334471, + "grad_norm": 0.685429036617279, + "learning_rate": 0.0008022753128555177, + "loss": 1.6819, + "step": 873 + }, + { + "epoch": 0.9943117178612059, + "grad_norm": 0.9149478077888489, + "learning_rate": 0.0008020477815699659, + "loss": 1.7829, + "step": 874 + }, + { + "epoch": 0.9954493742889647, + "grad_norm": 0.9612428545951843, + "learning_rate": 0.000801820250284414, + "loss": 2.3288, + "step": 875 + }, + { + "epoch": 0.9965870307167235, + "grad_norm": 1.1229710578918457, + "learning_rate": 0.0008015927189988624, + "loss": 1.8854, + "step": 876 + }, + { + "epoch": 0.9977246871444824, + "grad_norm": 1.3799995183944702, + "learning_rate": 0.0008013651877133106, + "loss": 3.6562, + "step": 877 + }, + { + "epoch": 0.9988623435722411, + "grad_norm": 0.7298029661178589, + "learning_rate": 0.0008011376564277588, + "loss": 1.7173, + "step": 878 + }, + { + "epoch": 1.0, + "grad_norm": 1.1531485319137573, + "learning_rate": 0.0008009101251422071, + "loss": 2.2511, + "step": 879 + }, + { + "epoch": 1.0, + "eval_f1": 0.8883, + "eval_gen_len": 49.5182, + "eval_loss": 1.9363936185836792, + "eval_precision": 0.8875, + "eval_recall": 0.8892, + "eval_rouge1": 0.4398, + "eval_rouge2": 0.1855, + "eval_rougeL": 0.3668, + "eval_rougeLsum": 0.411, + "eval_runtime": 28.6515, + "eval_samples_per_second": 3.839, + "eval_steps_per_second": 0.489, + "step": 879 + }, + { + "epoch": 1.0011376564277588, + "grad_norm": 0.6251806616783142, + "learning_rate": 0.0008006825938566553, + "loss": 1.0413, + "step": 880 + }, + { + "epoch": 1.0022753128555177, + "grad_norm": 0.9199168086051941, + "learning_rate": 0.0008004550625711036, + "loss": 1.691, + "step": 881 + }, + { + "epoch": 1.0034129692832765, + "grad_norm": 1.4455974102020264, + "learning_rate": 0.0008002275312855518, + "loss": 4.0842, + "step": 882 + }, + { + "epoch": 1.0045506257110353, + "grad_norm": 1.1115312576293945, + "learning_rate": 0.0008, + "loss": 2.4961, + "step": 883 + }, + { + "epoch": 1.005688282138794, + "grad_norm": 0.952612042427063, + "learning_rate": 0.0007997724687144482, + "loss": 1.8239, + "step": 884 + }, + { + "epoch": 1.006825938566553, + "grad_norm": 0.9800905585289001, + "learning_rate": 0.0007995449374288965, + "loss": 1.981, + "step": 885 + }, + { + "epoch": 1.0079635949943118, + "grad_norm": 1.0975840091705322, + "learning_rate": 0.0007993174061433447, + "loss": 2.4143, + "step": 886 + }, + { + "epoch": 1.0091012514220705, + "grad_norm": 0.936336874961853, + "learning_rate": 0.000799089874857793, + "loss": 1.6641, + "step": 887 + }, + { + "epoch": 1.0102389078498293, + "grad_norm": 0.5330931544303894, + "learning_rate": 0.0007988623435722412, + "loss": 1.2285, + "step": 888 + }, + { + "epoch": 1.0113765642775883, + "grad_norm": 1.2912349700927734, + "learning_rate": 0.0007986348122866894, + "loss": 1.594, + "step": 889 + }, + { + "epoch": 1.012514220705347, + "grad_norm": 0.8078871369361877, + "learning_rate": 0.0007984072810011377, + "loss": 1.0581, + "step": 890 + }, + { + "epoch": 1.0136518771331058, + "grad_norm": 1.071419358253479, + "learning_rate": 0.0007981797497155859, + "loss": 1.7577, + "step": 891 + }, + { + "epoch": 1.0147895335608645, + "grad_norm": 1.0178149938583374, + "learning_rate": 0.0007979522184300341, + "loss": 2.4778, + "step": 892 + }, + { + "epoch": 1.0159271899886235, + "grad_norm": 0.9748836159706116, + "learning_rate": 0.0007977246871444825, + "loss": 1.8737, + "step": 893 + }, + { + "epoch": 1.0170648464163823, + "grad_norm": 0.7067264914512634, + "learning_rate": 0.0007974971558589306, + "loss": 1.3954, + "step": 894 + }, + { + "epoch": 1.018202502844141, + "grad_norm": 1.2109535932540894, + "learning_rate": 0.0007972696245733788, + "loss": 2.3141, + "step": 895 + }, + { + "epoch": 1.0193401592718998, + "grad_norm": 0.8363299369812012, + "learning_rate": 0.0007970420932878271, + "loss": 2.0917, + "step": 896 + }, + { + "epoch": 1.0204778156996588, + "grad_norm": 0.7036440968513489, + "learning_rate": 0.0007968145620022753, + "loss": 1.5766, + "step": 897 + }, + { + "epoch": 1.0216154721274175, + "grad_norm": 0.6679667830467224, + "learning_rate": 0.0007965870307167235, + "loss": 1.8018, + "step": 898 + }, + { + "epoch": 1.0227531285551763, + "grad_norm": 1.0804336071014404, + "learning_rate": 0.0007963594994311718, + "loss": 2.0411, + "step": 899 + }, + { + "epoch": 1.023890784982935, + "grad_norm": 0.9264572262763977, + "learning_rate": 0.00079613196814562, + "loss": 2.1497, + "step": 900 + }, + { + "epoch": 1.025028441410694, + "grad_norm": 1.2212860584259033, + "learning_rate": 0.0007959044368600682, + "loss": 2.7216, + "step": 901 + }, + { + "epoch": 1.0261660978384528, + "grad_norm": 1.1044024229049683, + "learning_rate": 0.0007956769055745166, + "loss": 2.3266, + "step": 902 + }, + { + "epoch": 1.0273037542662116, + "grad_norm": 0.7855750918388367, + "learning_rate": 0.0007954493742889648, + "loss": 1.4662, + "step": 903 + }, + { + "epoch": 1.0284414106939703, + "grad_norm": 1.0114825963974, + "learning_rate": 0.000795221843003413, + "loss": 2.0779, + "step": 904 + }, + { + "epoch": 1.0295790671217293, + "grad_norm": 0.9704268574714661, + "learning_rate": 0.0007949943117178612, + "loss": 1.6038, + "step": 905 + }, + { + "epoch": 1.030716723549488, + "grad_norm": 1.155097246170044, + "learning_rate": 0.0007947667804323094, + "loss": 2.13, + "step": 906 + }, + { + "epoch": 1.0318543799772468, + "grad_norm": 1.3842484951019287, + "learning_rate": 0.0007945392491467577, + "loss": 3.5606, + "step": 907 + }, + { + "epoch": 1.0329920364050056, + "grad_norm": 1.134261131286621, + "learning_rate": 0.0007943117178612059, + "loss": 2.9482, + "step": 908 + }, + { + "epoch": 1.0341296928327646, + "grad_norm": 1.0498160123825073, + "learning_rate": 0.0007940841865756541, + "loss": 2.3777, + "step": 909 + }, + { + "epoch": 1.0352673492605233, + "grad_norm": 0.7719895243644714, + "learning_rate": 0.0007938566552901025, + "loss": 2.2765, + "step": 910 + }, + { + "epoch": 1.036405005688282, + "grad_norm": 1.402180790901184, + "learning_rate": 0.0007936291240045507, + "loss": 3.0459, + "step": 911 + }, + { + "epoch": 1.0375426621160408, + "grad_norm": 0.8468472361564636, + "learning_rate": 0.0007934015927189989, + "loss": 0.922, + "step": 912 + }, + { + "epoch": 1.0386803185437998, + "grad_norm": 0.8631690740585327, + "learning_rate": 0.0007931740614334472, + "loss": 1.2322, + "step": 913 + }, + { + "epoch": 1.0398179749715586, + "grad_norm": 1.3747214078903198, + "learning_rate": 0.0007929465301478953, + "loss": 2.485, + "step": 914 + }, + { + "epoch": 1.0409556313993173, + "grad_norm": 0.7905521988868713, + "learning_rate": 0.0007927189988623435, + "loss": 2.1488, + "step": 915 + }, + { + "epoch": 1.0420932878270763, + "grad_norm": 1.0372867584228516, + "learning_rate": 0.0007924914675767918, + "loss": 2.3263, + "step": 916 + }, + { + "epoch": 1.043230944254835, + "grad_norm": 0.964745819568634, + "learning_rate": 0.00079226393629124, + "loss": 1.4245, + "step": 917 + }, + { + "epoch": 1.0443686006825939, + "grad_norm": 1.2599427700042725, + "learning_rate": 0.0007920364050056883, + "loss": 2.8828, + "step": 918 + }, + { + "epoch": 1.0455062571103526, + "grad_norm": 0.5311685800552368, + "learning_rate": 0.0007918088737201366, + "loss": 1.2969, + "step": 919 + }, + { + "epoch": 1.0466439135381116, + "grad_norm": 1.0544791221618652, + "learning_rate": 0.0007915813424345848, + "loss": 2.3014, + "step": 920 + }, + { + "epoch": 1.0477815699658704, + "grad_norm": 0.7876049280166626, + "learning_rate": 0.000791353811149033, + "loss": 2.2945, + "step": 921 + }, + { + "epoch": 1.0489192263936291, + "grad_norm": 0.7309691309928894, + "learning_rate": 0.0007911262798634813, + "loss": 1.2384, + "step": 922 + }, + { + "epoch": 1.0500568828213879, + "grad_norm": 0.8504576086997986, + "learning_rate": 0.0007908987485779294, + "loss": 1.719, + "step": 923 + }, + { + "epoch": 1.0511945392491469, + "grad_norm": 1.7143735885620117, + "learning_rate": 0.0007906712172923777, + "loss": 3.9275, + "step": 924 + }, + { + "epoch": 1.0523321956769056, + "grad_norm": 1.8625571727752686, + "learning_rate": 0.0007904436860068259, + "loss": 4.4724, + "step": 925 + }, + { + "epoch": 1.0534698521046644, + "grad_norm": 1.0929653644561768, + "learning_rate": 0.0007902161547212741, + "loss": 1.9623, + "step": 926 + }, + { + "epoch": 1.0546075085324231, + "grad_norm": 1.0899792909622192, + "learning_rate": 0.0007899886234357225, + "loss": 1.6986, + "step": 927 + }, + { + "epoch": 1.0557451649601821, + "grad_norm": 1.1779440641403198, + "learning_rate": 0.0007897610921501707, + "loss": 1.905, + "step": 928 + }, + { + "epoch": 1.0568828213879409, + "grad_norm": 0.6737589240074158, + "learning_rate": 0.0007895335608646189, + "loss": 1.734, + "step": 929 + }, + { + "epoch": 1.0580204778156996, + "grad_norm": 1.0531206130981445, + "learning_rate": 0.0007893060295790672, + "loss": 1.4693, + "step": 930 + }, + { + "epoch": 1.0591581342434584, + "grad_norm": 0.5907047390937805, + "learning_rate": 0.0007890784982935154, + "loss": 1.4901, + "step": 931 + }, + { + "epoch": 1.0602957906712174, + "grad_norm": 0.8621008396148682, + "learning_rate": 0.0007888509670079636, + "loss": 1.5857, + "step": 932 + }, + { + "epoch": 1.0614334470989761, + "grad_norm": 0.9032453298568726, + "learning_rate": 0.0007886234357224118, + "loss": 1.8026, + "step": 933 + }, + { + "epoch": 1.062571103526735, + "grad_norm": 1.4888174533843994, + "learning_rate": 0.00078839590443686, + "loss": 1.8748, + "step": 934 + }, + { + "epoch": 1.0637087599544937, + "grad_norm": 0.6122904419898987, + "learning_rate": 0.0007881683731513083, + "loss": 0.7945, + "step": 935 + }, + { + "epoch": 1.0648464163822526, + "grad_norm": 1.013519048690796, + "learning_rate": 0.0007879408418657566, + "loss": 1.9398, + "step": 936 + }, + { + "epoch": 1.0659840728100114, + "grad_norm": 0.7138906717300415, + "learning_rate": 0.0007877133105802048, + "loss": 1.6637, + "step": 937 + }, + { + "epoch": 1.0671217292377702, + "grad_norm": 1.3058874607086182, + "learning_rate": 0.000787485779294653, + "loss": 3.3544, + "step": 938 + }, + { + "epoch": 1.068259385665529, + "grad_norm": 0.9075451493263245, + "learning_rate": 0.0007872582480091013, + "loss": 1.7222, + "step": 939 + }, + { + "epoch": 1.069397042093288, + "grad_norm": 0.9286370873451233, + "learning_rate": 0.0007870307167235495, + "loss": 2.0651, + "step": 940 + }, + { + "epoch": 1.0705346985210467, + "grad_norm": 0.6360165476799011, + "learning_rate": 0.0007868031854379977, + "loss": 0.89, + "step": 941 + }, + { + "epoch": 1.0716723549488054, + "grad_norm": 1.2726821899414062, + "learning_rate": 0.000786575654152446, + "loss": 2.7395, + "step": 942 + }, + { + "epoch": 1.0728100113765642, + "grad_norm": 1.024568796157837, + "learning_rate": 0.0007863481228668941, + "loss": 2.1989, + "step": 943 + }, + { + "epoch": 1.0739476678043232, + "grad_norm": 1.1119788885116577, + "learning_rate": 0.0007861205915813425, + "loss": 3.14, + "step": 944 + }, + { + "epoch": 1.075085324232082, + "grad_norm": 1.0225321054458618, + "learning_rate": 0.0007858930602957907, + "loss": 2.8281, + "step": 945 + }, + { + "epoch": 1.0762229806598407, + "grad_norm": 0.8482196927070618, + "learning_rate": 0.0007856655290102389, + "loss": 1.4616, + "step": 946 + }, + { + "epoch": 1.0773606370875997, + "grad_norm": 0.9475064277648926, + "learning_rate": 0.0007854379977246872, + "loss": 2.9296, + "step": 947 + }, + { + "epoch": 1.0784982935153584, + "grad_norm": 0.5353087186813354, + "learning_rate": 0.0007852104664391354, + "loss": 0.9475, + "step": 948 + }, + { + "epoch": 1.0796359499431172, + "grad_norm": 1.2120368480682373, + "learning_rate": 0.0007849829351535836, + "loss": 3.095, + "step": 949 + }, + { + "epoch": 1.080773606370876, + "grad_norm": 1.1500520706176758, + "learning_rate": 0.0007847554038680319, + "loss": 2.4183, + "step": 950 + }, + { + "epoch": 1.0819112627986347, + "grad_norm": 0.629219651222229, + "learning_rate": 0.0007845278725824802, + "loss": 1.1091, + "step": 951 + }, + { + "epoch": 1.0830489192263937, + "grad_norm": 0.6308894753456116, + "learning_rate": 0.0007843003412969284, + "loss": 1.0829, + "step": 952 + }, + { + "epoch": 1.0841865756541524, + "grad_norm": 1.4992165565490723, + "learning_rate": 0.0007840728100113766, + "loss": 3.7885, + "step": 953 + }, + { + "epoch": 1.0853242320819112, + "grad_norm": 1.0505317449569702, + "learning_rate": 0.0007838452787258248, + "loss": 2.2605, + "step": 954 + }, + { + "epoch": 1.0864618885096702, + "grad_norm": 0.898767352104187, + "learning_rate": 0.000783617747440273, + "loss": 3.0967, + "step": 955 + }, + { + "epoch": 1.087599544937429, + "grad_norm": 1.2503669261932373, + "learning_rate": 0.0007833902161547213, + "loss": 2.2391, + "step": 956 + }, + { + "epoch": 1.0887372013651877, + "grad_norm": 1.2384415864944458, + "learning_rate": 0.0007831626848691695, + "loss": 3.554, + "step": 957 + }, + { + "epoch": 1.0898748577929465, + "grad_norm": 1.1613341569900513, + "learning_rate": 0.0007829351535836177, + "loss": 1.7405, + "step": 958 + }, + { + "epoch": 1.0910125142207054, + "grad_norm": 0.6541109681129456, + "learning_rate": 0.000782707622298066, + "loss": 1.5068, + "step": 959 + }, + { + "epoch": 1.0921501706484642, + "grad_norm": 1.1372532844543457, + "learning_rate": 0.0007824800910125143, + "loss": 2.6415, + "step": 960 + }, + { + "epoch": 1.093287827076223, + "grad_norm": 0.944439709186554, + "learning_rate": 0.0007822525597269625, + "loss": 2.2019, + "step": 961 + }, + { + "epoch": 1.0944254835039817, + "grad_norm": 1.1232166290283203, + "learning_rate": 0.0007820250284414107, + "loss": 2.172, + "step": 962 + }, + { + "epoch": 1.0955631399317407, + "grad_norm": 1.3538726568222046, + "learning_rate": 0.0007817974971558589, + "loss": 2.0305, + "step": 963 + }, + { + "epoch": 1.0967007963594995, + "grad_norm": 0.7923217415809631, + "learning_rate": 0.0007815699658703072, + "loss": 1.2107, + "step": 964 + }, + { + "epoch": 1.0978384527872582, + "grad_norm": 0.9362553358078003, + "learning_rate": 0.0007813424345847554, + "loss": 1.9665, + "step": 965 + }, + { + "epoch": 1.098976109215017, + "grad_norm": 1.2658346891403198, + "learning_rate": 0.0007811149032992036, + "loss": 2.9011, + "step": 966 + }, + { + "epoch": 1.100113765642776, + "grad_norm": 1.526066780090332, + "learning_rate": 0.0007808873720136519, + "loss": 1.7315, + "step": 967 + }, + { + "epoch": 1.1012514220705347, + "grad_norm": 1.3018304109573364, + "learning_rate": 0.0007806598407281002, + "loss": 3.2345, + "step": 968 + }, + { + "epoch": 1.1023890784982935, + "grad_norm": 0.8176997900009155, + "learning_rate": 0.0007804323094425484, + "loss": 1.467, + "step": 969 + }, + { + "epoch": 1.1035267349260522, + "grad_norm": 0.8336053490638733, + "learning_rate": 0.0007802047781569967, + "loss": 1.4161, + "step": 970 + }, + { + "epoch": 1.1046643913538112, + "grad_norm": 0.7302894592285156, + "learning_rate": 0.0007799772468714449, + "loss": 1.3151, + "step": 971 + }, + { + "epoch": 1.10580204778157, + "grad_norm": 0.9276258945465088, + "learning_rate": 0.000779749715585893, + "loss": 2.3525, + "step": 972 + }, + { + "epoch": 1.1069397042093287, + "grad_norm": 0.8648287057876587, + "learning_rate": 0.0007795221843003413, + "loss": 1.7804, + "step": 973 + }, + { + "epoch": 1.1080773606370875, + "grad_norm": 1.4959267377853394, + "learning_rate": 0.0007792946530147895, + "loss": 2.9281, + "step": 974 + }, + { + "epoch": 1.1092150170648465, + "grad_norm": 0.7826813459396362, + "learning_rate": 0.0007790671217292377, + "loss": 0.9164, + "step": 975 + }, + { + "epoch": 1.1103526734926052, + "grad_norm": 1.157572865486145, + "learning_rate": 0.000778839590443686, + "loss": 1.9721, + "step": 976 + }, + { + "epoch": 1.111490329920364, + "grad_norm": 1.137681484222412, + "learning_rate": 0.0007786120591581343, + "loss": 1.5874, + "step": 977 + }, + { + "epoch": 1.1126279863481228, + "grad_norm": 0.9332581162452698, + "learning_rate": 0.0007783845278725825, + "loss": 2.5314, + "step": 978 + }, + { + "epoch": 1.1137656427758817, + "grad_norm": 1.1413997411727905, + "learning_rate": 0.0007781569965870308, + "loss": 3.0732, + "step": 979 + }, + { + "epoch": 1.1149032992036405, + "grad_norm": 0.7028293609619141, + "learning_rate": 0.000777929465301479, + "loss": 0.8924, + "step": 980 + }, + { + "epoch": 1.1160409556313993, + "grad_norm": 1.0388150215148926, + "learning_rate": 0.0007777019340159272, + "loss": 1.9138, + "step": 981 + }, + { + "epoch": 1.117178612059158, + "grad_norm": 0.9347336292266846, + "learning_rate": 0.0007774744027303754, + "loss": 1.5226, + "step": 982 + }, + { + "epoch": 1.118316268486917, + "grad_norm": 0.9350568056106567, + "learning_rate": 0.0007772468714448236, + "loss": 2.6978, + "step": 983 + }, + { + "epoch": 1.1194539249146758, + "grad_norm": 1.407080888748169, + "learning_rate": 0.0007770193401592718, + "loss": 1.3464, + "step": 984 + }, + { + "epoch": 1.1205915813424345, + "grad_norm": 1.077770471572876, + "learning_rate": 0.0007767918088737202, + "loss": 2.123, + "step": 985 + }, + { + "epoch": 1.1217292377701935, + "grad_norm": 1.041999101638794, + "learning_rate": 0.0007765642775881684, + "loss": 2.5526, + "step": 986 + }, + { + "epoch": 1.1228668941979523, + "grad_norm": 1.0291063785552979, + "learning_rate": 0.0007763367463026167, + "loss": 2.1948, + "step": 987 + }, + { + "epoch": 1.124004550625711, + "grad_norm": 0.9935250282287598, + "learning_rate": 0.0007761092150170649, + "loss": 3.3882, + "step": 988 + }, + { + "epoch": 1.1251422070534698, + "grad_norm": 1.035112977027893, + "learning_rate": 0.0007758816837315131, + "loss": 2.0542, + "step": 989 + }, + { + "epoch": 1.1262798634812285, + "grad_norm": 1.0226819515228271, + "learning_rate": 0.0007756541524459614, + "loss": 1.8742, + "step": 990 + }, + { + "epoch": 1.1274175199089875, + "grad_norm": 1.0159075260162354, + "learning_rate": 0.0007754266211604095, + "loss": 1.3203, + "step": 991 + }, + { + "epoch": 1.1285551763367463, + "grad_norm": 2.0587501525878906, + "learning_rate": 0.0007751990898748577, + "loss": 3.9035, + "step": 992 + }, + { + "epoch": 1.129692832764505, + "grad_norm": 0.9951366186141968, + "learning_rate": 0.000774971558589306, + "loss": 2.5265, + "step": 993 + }, + { + "epoch": 1.130830489192264, + "grad_norm": 0.7112089395523071, + "learning_rate": 0.0007747440273037543, + "loss": 1.3928, + "step": 994 + }, + { + "epoch": 1.1319681456200228, + "grad_norm": 1.0504612922668457, + "learning_rate": 0.0007745164960182025, + "loss": 2.0178, + "step": 995 + }, + { + "epoch": 1.1331058020477816, + "grad_norm": 0.7228176593780518, + "learning_rate": 0.0007742889647326508, + "loss": 1.4623, + "step": 996 + }, + { + "epoch": 1.1342434584755403, + "grad_norm": 1.1857068538665771, + "learning_rate": 0.000774061433447099, + "loss": 2.5628, + "step": 997 + }, + { + "epoch": 1.1353811149032993, + "grad_norm": 0.6408101916313171, + "learning_rate": 0.0007738339021615472, + "loss": 1.3296, + "step": 998 + }, + { + "epoch": 1.136518771331058, + "grad_norm": 0.9753408432006836, + "learning_rate": 0.0007736063708759955, + "loss": 2.0656, + "step": 999 + }, + { + "epoch": 1.1376564277588168, + "grad_norm": 1.139440894126892, + "learning_rate": 0.0007733788395904437, + "loss": 2.2056, + "step": 1000 + }, + { + "epoch": 1.1387940841865756, + "grad_norm": 0.6815441846847534, + "learning_rate": 0.0007731513083048918, + "loss": 1.2574, + "step": 1001 + }, + { + "epoch": 1.1399317406143346, + "grad_norm": 1.032520055770874, + "learning_rate": 0.0007729237770193402, + "loss": 2.0944, + "step": 1002 + }, + { + "epoch": 1.1410693970420933, + "grad_norm": 0.8344050049781799, + "learning_rate": 0.0007726962457337884, + "loss": 1.5627, + "step": 1003 + }, + { + "epoch": 1.142207053469852, + "grad_norm": 1.2502180337905884, + "learning_rate": 0.0007724687144482366, + "loss": 2.0755, + "step": 1004 + }, + { + "epoch": 1.1433447098976108, + "grad_norm": 1.008817195892334, + "learning_rate": 0.0007722411831626849, + "loss": 2.4644, + "step": 1005 + }, + { + "epoch": 1.1444823663253698, + "grad_norm": 1.0237065553665161, + "learning_rate": 0.0007720136518771331, + "loss": 1.4251, + "step": 1006 + }, + { + "epoch": 1.1456200227531286, + "grad_norm": 0.9443050026893616, + "learning_rate": 0.0007717861205915814, + "loss": 2.5625, + "step": 1007 + }, + { + "epoch": 1.1467576791808873, + "grad_norm": 0.8654250502586365, + "learning_rate": 0.0007715585893060296, + "loss": 1.999, + "step": 1008 + }, + { + "epoch": 1.147895335608646, + "grad_norm": 1.082979679107666, + "learning_rate": 0.0007713310580204778, + "loss": 1.2456, + "step": 1009 + }, + { + "epoch": 1.149032992036405, + "grad_norm": 1.07938814163208, + "learning_rate": 0.0007711035267349262, + "loss": 2.0854, + "step": 1010 + }, + { + "epoch": 1.1501706484641638, + "grad_norm": 0.6003649234771729, + "learning_rate": 0.0007708759954493743, + "loss": 1.4027, + "step": 1011 + }, + { + "epoch": 1.1513083048919226, + "grad_norm": 1.157963514328003, + "learning_rate": 0.0007706484641638225, + "loss": 3.3311, + "step": 1012 + }, + { + "epoch": 1.1524459613196814, + "grad_norm": 0.5869401097297668, + "learning_rate": 0.0007704209328782708, + "loss": 1.4386, + "step": 1013 + }, + { + "epoch": 1.1535836177474403, + "grad_norm": 0.9778612852096558, + "learning_rate": 0.000770193401592719, + "loss": 2.9109, + "step": 1014 + }, + { + "epoch": 1.154721274175199, + "grad_norm": 0.9791373610496521, + "learning_rate": 0.0007699658703071672, + "loss": 1.5482, + "step": 1015 + }, + { + "epoch": 1.1558589306029579, + "grad_norm": 1.0005147457122803, + "learning_rate": 0.0007697383390216155, + "loss": 2.9281, + "step": 1016 + }, + { + "epoch": 1.1569965870307168, + "grad_norm": 0.6844426989555359, + "learning_rate": 0.0007695108077360637, + "loss": 0.9119, + "step": 1017 + }, + { + "epoch": 1.1581342434584756, + "grad_norm": 0.9615586400032043, + "learning_rate": 0.000769283276450512, + "loss": 2.3095, + "step": 1018 + }, + { + "epoch": 1.1592718998862344, + "grad_norm": 1.1434069871902466, + "learning_rate": 0.0007690557451649603, + "loss": 2.035, + "step": 1019 + }, + { + "epoch": 1.1604095563139931, + "grad_norm": 1.1195374727249146, + "learning_rate": 0.0007688282138794085, + "loss": 2.8448, + "step": 1020 + }, + { + "epoch": 1.1615472127417519, + "grad_norm": 0.9364283084869385, + "learning_rate": 0.0007686006825938566, + "loss": 2.0258, + "step": 1021 + }, + { + "epoch": 1.1626848691695109, + "grad_norm": 0.37525808811187744, + "learning_rate": 0.0007683731513083049, + "loss": 0.439, + "step": 1022 + }, + { + "epoch": 1.1638225255972696, + "grad_norm": 1.8962267637252808, + "learning_rate": 0.0007681456200227531, + "loss": 2.1773, + "step": 1023 + }, + { + "epoch": 1.1649601820250284, + "grad_norm": 1.661497712135315, + "learning_rate": 0.0007679180887372013, + "loss": 2.577, + "step": 1024 + }, + { + "epoch": 1.1660978384527874, + "grad_norm": 1.1258926391601562, + "learning_rate": 0.0007676905574516496, + "loss": 2.3722, + "step": 1025 + }, + { + "epoch": 1.1672354948805461, + "grad_norm": 0.9760286808013916, + "learning_rate": 0.0007674630261660978, + "loss": 2.135, + "step": 1026 + }, + { + "epoch": 1.1683731513083049, + "grad_norm": 0.89606112241745, + "learning_rate": 0.0007672354948805462, + "loss": 1.8733, + "step": 1027 + }, + { + "epoch": 1.1695108077360636, + "grad_norm": 1.0964630842208862, + "learning_rate": 0.0007670079635949944, + "loss": 1.9628, + "step": 1028 + }, + { + "epoch": 1.1706484641638226, + "grad_norm": 1.214050054550171, + "learning_rate": 0.0007667804323094426, + "loss": 2.3792, + "step": 1029 + }, + { + "epoch": 1.1717861205915814, + "grad_norm": 1.242618203163147, + "learning_rate": 0.0007665529010238908, + "loss": 3.0571, + "step": 1030 + }, + { + "epoch": 1.1729237770193401, + "grad_norm": 0.7527234554290771, + "learning_rate": 0.000766325369738339, + "loss": 1.7859, + "step": 1031 + }, + { + "epoch": 1.174061433447099, + "grad_norm": 0.9902629256248474, + "learning_rate": 0.0007660978384527872, + "loss": 2.1558, + "step": 1032 + }, + { + "epoch": 1.1751990898748579, + "grad_norm": 1.109411597251892, + "learning_rate": 0.0007658703071672355, + "loss": 2.4196, + "step": 1033 + }, + { + "epoch": 1.1763367463026166, + "grad_norm": 0.9211401343345642, + "learning_rate": 0.0007656427758816837, + "loss": 1.8503, + "step": 1034 + }, + { + "epoch": 1.1774744027303754, + "grad_norm": 0.9370328187942505, + "learning_rate": 0.000765415244596132, + "loss": 2.7057, + "step": 1035 + }, + { + "epoch": 1.1786120591581342, + "grad_norm": 0.8330713510513306, + "learning_rate": 0.0007651877133105803, + "loss": 1.6611, + "step": 1036 + }, + { + "epoch": 1.1797497155858931, + "grad_norm": 0.7571448087692261, + "learning_rate": 0.0007649601820250285, + "loss": 1.2492, + "step": 1037 + }, + { + "epoch": 1.180887372013652, + "grad_norm": 1.6156978607177734, + "learning_rate": 0.0007647326507394767, + "loss": 2.814, + "step": 1038 + }, + { + "epoch": 1.1820250284414107, + "grad_norm": 1.8085129261016846, + "learning_rate": 0.000764505119453925, + "loss": 3.3276, + "step": 1039 + }, + { + "epoch": 1.1831626848691694, + "grad_norm": 1.0824229717254639, + "learning_rate": 0.0007642775881683731, + "loss": 2.5017, + "step": 1040 + }, + { + "epoch": 1.1843003412969284, + "grad_norm": 0.6034184098243713, + "learning_rate": 0.0007640500568828213, + "loss": 1.304, + "step": 1041 + }, + { + "epoch": 1.1854379977246872, + "grad_norm": 0.71455317735672, + "learning_rate": 0.0007638225255972696, + "loss": 1.8677, + "step": 1042 + }, + { + "epoch": 1.186575654152446, + "grad_norm": 0.7810432314872742, + "learning_rate": 0.0007635949943117178, + "loss": 2.6322, + "step": 1043 + }, + { + "epoch": 1.1877133105802047, + "grad_norm": 0.7499434351921082, + "learning_rate": 0.0007633674630261661, + "loss": 1.8316, + "step": 1044 + }, + { + "epoch": 1.1888509670079637, + "grad_norm": 0.9468162059783936, + "learning_rate": 0.0007631399317406144, + "loss": 1.5508, + "step": 1045 + }, + { + "epoch": 1.1899886234357224, + "grad_norm": 1.3421801328659058, + "learning_rate": 0.0007629124004550626, + "loss": 2.2196, + "step": 1046 + }, + { + "epoch": 1.1911262798634812, + "grad_norm": 0.9094765782356262, + "learning_rate": 0.0007626848691695109, + "loss": 2.1863, + "step": 1047 + }, + { + "epoch": 1.1922639362912402, + "grad_norm": 0.902521014213562, + "learning_rate": 0.0007624573378839591, + "loss": 1.4415, + "step": 1048 + }, + { + "epoch": 1.193401592718999, + "grad_norm": 0.8511042594909668, + "learning_rate": 0.0007622298065984073, + "loss": 1.5148, + "step": 1049 + }, + { + "epoch": 1.1945392491467577, + "grad_norm": 0.836769163608551, + "learning_rate": 0.0007620022753128555, + "loss": 1.819, + "step": 1050 + }, + { + "epoch": 1.1956769055745164, + "grad_norm": 0.7466940879821777, + "learning_rate": 0.0007617747440273037, + "loss": 2.2535, + "step": 1051 + }, + { + "epoch": 1.1968145620022752, + "grad_norm": 0.8963870406150818, + "learning_rate": 0.000761547212741752, + "loss": 1.8274, + "step": 1052 + }, + { + "epoch": 1.1979522184300342, + "grad_norm": 1.2822279930114746, + "learning_rate": 0.0007613196814562003, + "loss": 2.1197, + "step": 1053 + }, + { + "epoch": 1.199089874857793, + "grad_norm": 1.0114504098892212, + "learning_rate": 0.0007610921501706485, + "loss": 3.1561, + "step": 1054 + }, + { + "epoch": 1.2002275312855517, + "grad_norm": 0.7050504088401794, + "learning_rate": 0.0007608646188850967, + "loss": 0.8309, + "step": 1055 + }, + { + "epoch": 1.2013651877133107, + "grad_norm": 1.2295100688934326, + "learning_rate": 0.000760637087599545, + "loss": 2.0593, + "step": 1056 + }, + { + "epoch": 1.2025028441410694, + "grad_norm": 1.2942835092544556, + "learning_rate": 0.0007604095563139932, + "loss": 2.06, + "step": 1057 + }, + { + "epoch": 1.2036405005688282, + "grad_norm": 0.652757465839386, + "learning_rate": 0.0007601820250284414, + "loss": 1.2173, + "step": 1058 + }, + { + "epoch": 1.204778156996587, + "grad_norm": 0.7156775593757629, + "learning_rate": 0.0007599544937428896, + "loss": 1.6357, + "step": 1059 + }, + { + "epoch": 1.2059158134243457, + "grad_norm": 0.8230730891227722, + "learning_rate": 0.0007597269624573379, + "loss": 1.0645, + "step": 1060 + }, + { + "epoch": 1.2070534698521047, + "grad_norm": 1.35136079788208, + "learning_rate": 0.0007594994311717861, + "loss": 3.901, + "step": 1061 + }, + { + "epoch": 1.2081911262798635, + "grad_norm": 0.6722981333732605, + "learning_rate": 0.0007592718998862344, + "loss": 1.5358, + "step": 1062 + }, + { + "epoch": 1.2093287827076222, + "grad_norm": 1.035335659980774, + "learning_rate": 0.0007590443686006826, + "loss": 1.7922, + "step": 1063 + }, + { + "epoch": 1.2104664391353812, + "grad_norm": 1.9629613161087036, + "learning_rate": 0.0007588168373151308, + "loss": 3.3534, + "step": 1064 + }, + { + "epoch": 1.21160409556314, + "grad_norm": 0.9386662244796753, + "learning_rate": 0.0007585893060295791, + "loss": 1.7383, + "step": 1065 + }, + { + "epoch": 1.2127417519908987, + "grad_norm": 1.02280592918396, + "learning_rate": 0.0007583617747440273, + "loss": 2.3867, + "step": 1066 + }, + { + "epoch": 1.2138794084186575, + "grad_norm": 0.9958323836326599, + "learning_rate": 0.0007581342434584756, + "loss": 2.2671, + "step": 1067 + }, + { + "epoch": 1.2150170648464165, + "grad_norm": 0.9132734537124634, + "learning_rate": 0.0007579067121729239, + "loss": 1.5166, + "step": 1068 + }, + { + "epoch": 1.2161547212741752, + "grad_norm": 0.9900468587875366, + "learning_rate": 0.000757679180887372, + "loss": 1.9609, + "step": 1069 + }, + { + "epoch": 1.217292377701934, + "grad_norm": 0.7868292331695557, + "learning_rate": 0.0007574516496018203, + "loss": 1.2568, + "step": 1070 + }, + { + "epoch": 1.2184300341296928, + "grad_norm": 0.7199138402938843, + "learning_rate": 0.0007572241183162685, + "loss": 1.6196, + "step": 1071 + }, + { + "epoch": 1.2195676905574517, + "grad_norm": 1.0280647277832031, + "learning_rate": 0.0007569965870307167, + "loss": 2.3159, + "step": 1072 + }, + { + "epoch": 1.2207053469852105, + "grad_norm": 1.435230016708374, + "learning_rate": 0.000756769055745165, + "loss": 1.9331, + "step": 1073 + }, + { + "epoch": 1.2218430034129693, + "grad_norm": 0.982181966304779, + "learning_rate": 0.0007565415244596132, + "loss": 1.9982, + "step": 1074 + }, + { + "epoch": 1.222980659840728, + "grad_norm": 1.21809720993042, + "learning_rate": 0.0007563139931740614, + "loss": 3.0575, + "step": 1075 + }, + { + "epoch": 1.224118316268487, + "grad_norm": 1.0095391273498535, + "learning_rate": 0.0007560864618885098, + "loss": 2.1901, + "step": 1076 + }, + { + "epoch": 1.2252559726962458, + "grad_norm": 0.8512126207351685, + "learning_rate": 0.000755858930602958, + "loss": 1.5792, + "step": 1077 + }, + { + "epoch": 1.2263936291240045, + "grad_norm": 1.3197203874588013, + "learning_rate": 0.0007556313993174062, + "loss": 2.7239, + "step": 1078 + }, + { + "epoch": 1.2275312855517633, + "grad_norm": 1.1738183498382568, + "learning_rate": 0.0007554038680318544, + "loss": 2.4295, + "step": 1079 + }, + { + "epoch": 1.2286689419795223, + "grad_norm": 0.743088960647583, + "learning_rate": 0.0007551763367463026, + "loss": 1.134, + "step": 1080 + }, + { + "epoch": 1.229806598407281, + "grad_norm": 1.0175997018814087, + "learning_rate": 0.0007549488054607508, + "loss": 3.1313, + "step": 1081 + }, + { + "epoch": 1.2309442548350398, + "grad_norm": 1.1242859363555908, + "learning_rate": 0.0007547212741751991, + "loss": 2.3403, + "step": 1082 + }, + { + "epoch": 1.2320819112627985, + "grad_norm": 1.0247671604156494, + "learning_rate": 0.0007544937428896473, + "loss": 1.6092, + "step": 1083 + }, + { + "epoch": 1.2332195676905575, + "grad_norm": 0.7052936553955078, + "learning_rate": 0.0007542662116040955, + "loss": 1.915, + "step": 1084 + }, + { + "epoch": 1.2343572241183163, + "grad_norm": 1.157301425933838, + "learning_rate": 0.0007540386803185439, + "loss": 1.9053, + "step": 1085 + }, + { + "epoch": 1.235494880546075, + "grad_norm": 0.7313311100006104, + "learning_rate": 0.0007538111490329921, + "loss": 2.1036, + "step": 1086 + }, + { + "epoch": 1.236632536973834, + "grad_norm": 0.998532772064209, + "learning_rate": 0.0007535836177474404, + "loss": 1.6923, + "step": 1087 + }, + { + "epoch": 1.2377701934015928, + "grad_norm": 0.8534971475601196, + "learning_rate": 0.0007533560864618886, + "loss": 2.4127, + "step": 1088 + }, + { + "epoch": 1.2389078498293515, + "grad_norm": 0.6717026233673096, + "learning_rate": 0.0007531285551763367, + "loss": 1.4987, + "step": 1089 + }, + { + "epoch": 1.2400455062571103, + "grad_norm": 0.9222111105918884, + "learning_rate": 0.000752901023890785, + "loss": 1.3934, + "step": 1090 + }, + { + "epoch": 1.241183162684869, + "grad_norm": 0.8172008395195007, + "learning_rate": 0.0007526734926052332, + "loss": 1.4544, + "step": 1091 + }, + { + "epoch": 1.242320819112628, + "grad_norm": 0.8321571946144104, + "learning_rate": 0.0007524459613196814, + "loss": 1.4483, + "step": 1092 + }, + { + "epoch": 1.2434584755403868, + "grad_norm": 0.663096010684967, + "learning_rate": 0.0007522184300341298, + "loss": 1.1409, + "step": 1093 + }, + { + "epoch": 1.2445961319681456, + "grad_norm": 0.871487021446228, + "learning_rate": 0.000751990898748578, + "loss": 1.6311, + "step": 1094 + }, + { + "epoch": 1.2457337883959045, + "grad_norm": 0.6653394103050232, + "learning_rate": 0.0007517633674630262, + "loss": 0.9919, + "step": 1095 + }, + { + "epoch": 1.2468714448236633, + "grad_norm": 1.093170404434204, + "learning_rate": 0.0007515358361774745, + "loss": 1.4075, + "step": 1096 + }, + { + "epoch": 1.248009101251422, + "grad_norm": 0.6205571293830872, + "learning_rate": 0.0007513083048919227, + "loss": 1.814, + "step": 1097 + }, + { + "epoch": 1.2491467576791808, + "grad_norm": 1.6945722103118896, + "learning_rate": 0.0007510807736063708, + "loss": 2.4636, + "step": 1098 + }, + { + "epoch": 1.2502844141069396, + "grad_norm": 0.8336583375930786, + "learning_rate": 0.0007508532423208191, + "loss": 1.9484, + "step": 1099 + }, + { + "epoch": 1.2514220705346986, + "grad_norm": 0.9842813014984131, + "learning_rate": 0.0007506257110352673, + "loss": 2.2475, + "step": 1100 + }, + { + "epoch": 1.2525597269624573, + "grad_norm": 0.6157675981521606, + "learning_rate": 0.0007503981797497155, + "loss": 1.1768, + "step": 1101 + }, + { + "epoch": 1.253697383390216, + "grad_norm": 1.6868700981140137, + "learning_rate": 0.0007501706484641639, + "loss": 1.702, + "step": 1102 + }, + { + "epoch": 1.254835039817975, + "grad_norm": 0.7272259593009949, + "learning_rate": 0.0007499431171786121, + "loss": 1.5946, + "step": 1103 + }, + { + "epoch": 1.2559726962457338, + "grad_norm": 1.0236316919326782, + "learning_rate": 0.0007497155858930603, + "loss": 1.6014, + "step": 1104 + }, + { + "epoch": 1.2571103526734926, + "grad_norm": 0.9424517750740051, + "learning_rate": 0.0007494880546075086, + "loss": 2.589, + "step": 1105 + }, + { + "epoch": 1.2582480091012513, + "grad_norm": 0.8940749168395996, + "learning_rate": 0.0007492605233219568, + "loss": 2.1612, + "step": 1106 + }, + { + "epoch": 1.25938566552901, + "grad_norm": 1.2272729873657227, + "learning_rate": 0.000749032992036405, + "loss": 2.1587, + "step": 1107 + }, + { + "epoch": 1.260523321956769, + "grad_norm": 0.6935333609580994, + "learning_rate": 0.0007488054607508532, + "loss": 1.8994, + "step": 1108 + }, + { + "epoch": 1.2616609783845278, + "grad_norm": 0.9777292013168335, + "learning_rate": 0.0007485779294653014, + "loss": 1.6954, + "step": 1109 + }, + { + "epoch": 1.2627986348122868, + "grad_norm": 0.7491214275360107, + "learning_rate": 0.0007483503981797498, + "loss": 0.9805, + "step": 1110 + }, + { + "epoch": 1.2639362912400456, + "grad_norm": 1.3111625909805298, + "learning_rate": 0.000748122866894198, + "loss": 3.0255, + "step": 1111 + }, + { + "epoch": 1.2650739476678043, + "grad_norm": 0.9766296744346619, + "learning_rate": 0.0007478953356086462, + "loss": 1.5356, + "step": 1112 + }, + { + "epoch": 1.266211604095563, + "grad_norm": 0.731914758682251, + "learning_rate": 0.0007476678043230945, + "loss": 1.7081, + "step": 1113 + }, + { + "epoch": 1.2673492605233219, + "grad_norm": 2.02380633354187, + "learning_rate": 0.0007474402730375427, + "loss": 4.2285, + "step": 1114 + }, + { + "epoch": 1.2684869169510808, + "grad_norm": 1.3702422380447388, + "learning_rate": 0.0007472127417519909, + "loss": 1.6421, + "step": 1115 + }, + { + "epoch": 1.2696245733788396, + "grad_norm": 1.1319351196289062, + "learning_rate": 0.0007469852104664392, + "loss": 2.3609, + "step": 1116 + }, + { + "epoch": 1.2707622298065984, + "grad_norm": 0.7127083539962769, + "learning_rate": 0.0007467576791808874, + "loss": 1.7728, + "step": 1117 + }, + { + "epoch": 1.2718998862343573, + "grad_norm": 1.783090353012085, + "learning_rate": 0.0007465301478953355, + "loss": 3.132, + "step": 1118 + }, + { + "epoch": 1.273037542662116, + "grad_norm": 0.9179248809814453, + "learning_rate": 0.0007463026166097839, + "loss": 2.0857, + "step": 1119 + }, + { + "epoch": 1.2741751990898749, + "grad_norm": 0.9802193641662598, + "learning_rate": 0.0007460750853242321, + "loss": 2.608, + "step": 1120 + }, + { + "epoch": 1.2753128555176336, + "grad_norm": 0.855622410774231, + "learning_rate": 0.0007458475540386803, + "loss": 1.9717, + "step": 1121 + }, + { + "epoch": 1.2764505119453924, + "grad_norm": 1.1232682466506958, + "learning_rate": 0.0007456200227531286, + "loss": 2.113, + "step": 1122 + }, + { + "epoch": 1.2775881683731514, + "grad_norm": 0.699447512626648, + "learning_rate": 0.0007453924914675768, + "loss": 1.2087, + "step": 1123 + }, + { + "epoch": 1.2787258248009101, + "grad_norm": 1.3320637941360474, + "learning_rate": 0.000745164960182025, + "loss": 1.3834, + "step": 1124 + }, + { + "epoch": 1.2798634812286689, + "grad_norm": 1.0277353525161743, + "learning_rate": 0.0007449374288964733, + "loss": 2.2678, + "step": 1125 + }, + { + "epoch": 1.2810011376564279, + "grad_norm": 0.8935559391975403, + "learning_rate": 0.0007447098976109215, + "loss": 1.816, + "step": 1126 + }, + { + "epoch": 1.2821387940841866, + "grad_norm": 1.2181432247161865, + "learning_rate": 0.0007444823663253698, + "loss": 2.2487, + "step": 1127 + }, + { + "epoch": 1.2832764505119454, + "grad_norm": 1.007584810256958, + "learning_rate": 0.000744254835039818, + "loss": 2.1345, + "step": 1128 + }, + { + "epoch": 1.2844141069397041, + "grad_norm": 1.0149571895599365, + "learning_rate": 0.0007440273037542662, + "loss": 2.4568, + "step": 1129 + }, + { + "epoch": 1.285551763367463, + "grad_norm": 0.6311346888542175, + "learning_rate": 0.0007437997724687145, + "loss": 1.5611, + "step": 1130 + }, + { + "epoch": 1.286689419795222, + "grad_norm": 1.3615020513534546, + "learning_rate": 0.0007435722411831627, + "loss": 2.5493, + "step": 1131 + }, + { + "epoch": 1.2878270762229806, + "grad_norm": 0.7496081590652466, + "learning_rate": 0.0007433447098976109, + "loss": 1.5885, + "step": 1132 + }, + { + "epoch": 1.2889647326507394, + "grad_norm": 0.9829562306404114, + "learning_rate": 0.0007431171786120592, + "loss": 1.8779, + "step": 1133 + }, + { + "epoch": 1.2901023890784984, + "grad_norm": 1.2002413272857666, + "learning_rate": 0.0007428896473265074, + "loss": 1.9528, + "step": 1134 + }, + { + "epoch": 1.2912400455062572, + "grad_norm": 0.7884618639945984, + "learning_rate": 0.0007426621160409557, + "loss": 0.8373, + "step": 1135 + }, + { + "epoch": 1.292377701934016, + "grad_norm": 0.734190821647644, + "learning_rate": 0.000742434584755404, + "loss": 1.6223, + "step": 1136 + }, + { + "epoch": 1.2935153583617747, + "grad_norm": 1.1885126829147339, + "learning_rate": 0.0007422070534698521, + "loss": 2.1086, + "step": 1137 + }, + { + "epoch": 1.2946530147895334, + "grad_norm": 1.3277819156646729, + "learning_rate": 0.0007419795221843003, + "loss": 2.6113, + "step": 1138 + }, + { + "epoch": 1.2957906712172924, + "grad_norm": 1.1494975090026855, + "learning_rate": 0.0007417519908987486, + "loss": 2.57, + "step": 1139 + }, + { + "epoch": 1.2969283276450512, + "grad_norm": 0.7995481491088867, + "learning_rate": 0.0007415244596131968, + "loss": 2.041, + "step": 1140 + }, + { + "epoch": 1.29806598407281, + "grad_norm": 1.1029703617095947, + "learning_rate": 0.000741296928327645, + "loss": 2.4597, + "step": 1141 + }, + { + "epoch": 1.299203640500569, + "grad_norm": 0.808023989200592, + "learning_rate": 0.0007410693970420933, + "loss": 1.195, + "step": 1142 + }, + { + "epoch": 1.3003412969283277, + "grad_norm": 1.3540676832199097, + "learning_rate": 0.0007408418657565415, + "loss": 2.8982, + "step": 1143 + }, + { + "epoch": 1.3014789533560864, + "grad_norm": 0.6362115740776062, + "learning_rate": 0.0007406143344709898, + "loss": 0.9607, + "step": 1144 + }, + { + "epoch": 1.3026166097838452, + "grad_norm": 0.5855313539505005, + "learning_rate": 0.0007403868031854381, + "loss": 1.1911, + "step": 1145 + }, + { + "epoch": 1.3037542662116042, + "grad_norm": 0.902195930480957, + "learning_rate": 0.0007401592718998863, + "loss": 1.8625, + "step": 1146 + }, + { + "epoch": 1.304891922639363, + "grad_norm": 0.9085184335708618, + "learning_rate": 0.0007399317406143344, + "loss": 2.2494, + "step": 1147 + }, + { + "epoch": 1.3060295790671217, + "grad_norm": 0.8404464721679688, + "learning_rate": 0.0007397042093287827, + "loss": 1.7344, + "step": 1148 + }, + { + "epoch": 1.3071672354948807, + "grad_norm": 1.5872386693954468, + "learning_rate": 0.0007394766780432309, + "loss": 2.6294, + "step": 1149 + }, + { + "epoch": 1.3083048919226394, + "grad_norm": 0.81778484582901, + "learning_rate": 0.0007392491467576792, + "loss": 1.716, + "step": 1150 + }, + { + "epoch": 1.3094425483503982, + "grad_norm": 0.9477559924125671, + "learning_rate": 0.0007390216154721274, + "loss": 1.7293, + "step": 1151 + }, + { + "epoch": 1.310580204778157, + "grad_norm": 1.0724713802337646, + "learning_rate": 0.0007387940841865757, + "loss": 2.1255, + "step": 1152 + }, + { + "epoch": 1.3117178612059157, + "grad_norm": 0.7965221405029297, + "learning_rate": 0.000738566552901024, + "loss": 1.1067, + "step": 1153 + }, + { + "epoch": 1.3128555176336747, + "grad_norm": 0.9288459420204163, + "learning_rate": 0.0007383390216154722, + "loss": 2.1202, + "step": 1154 + }, + { + "epoch": 1.3139931740614335, + "grad_norm": 1.221725344657898, + "learning_rate": 0.0007381114903299204, + "loss": 2.4878, + "step": 1155 + }, + { + "epoch": 1.3151308304891922, + "grad_norm": 1.0420432090759277, + "learning_rate": 0.0007378839590443687, + "loss": 1.9784, + "step": 1156 + }, + { + "epoch": 1.3162684869169512, + "grad_norm": 1.3146973848342896, + "learning_rate": 0.0007376564277588168, + "loss": 2.385, + "step": 1157 + }, + { + "epoch": 1.31740614334471, + "grad_norm": 1.1637381315231323, + "learning_rate": 0.000737428896473265, + "loss": 1.8719, + "step": 1158 + }, + { + "epoch": 1.3185437997724687, + "grad_norm": 1.2052867412567139, + "learning_rate": 0.0007372013651877133, + "loss": 3.4914, + "step": 1159 + }, + { + "epoch": 1.3196814562002275, + "grad_norm": 0.9300926327705383, + "learning_rate": 0.0007369738339021615, + "loss": 2.7666, + "step": 1160 + }, + { + "epoch": 1.3208191126279862, + "grad_norm": 1.0949327945709229, + "learning_rate": 0.0007367463026166098, + "loss": 2.3889, + "step": 1161 + }, + { + "epoch": 1.3219567690557452, + "grad_norm": 1.1416770219802856, + "learning_rate": 0.0007365187713310581, + "loss": 2.082, + "step": 1162 + }, + { + "epoch": 1.323094425483504, + "grad_norm": 0.6161019802093506, + "learning_rate": 0.0007362912400455063, + "loss": 1.1583, + "step": 1163 + }, + { + "epoch": 1.3242320819112627, + "grad_norm": 1.192044734954834, + "learning_rate": 0.0007360637087599545, + "loss": 3.2384, + "step": 1164 + }, + { + "epoch": 1.3253697383390217, + "grad_norm": 0.7260434031486511, + "learning_rate": 0.0007358361774744028, + "loss": 1.4132, + "step": 1165 + }, + { + "epoch": 1.3265073947667805, + "grad_norm": 0.8180050253868103, + "learning_rate": 0.0007356086461888509, + "loss": 1.8317, + "step": 1166 + }, + { + "epoch": 1.3276450511945392, + "grad_norm": 0.7380133867263794, + "learning_rate": 0.0007353811149032991, + "loss": 1.4301, + "step": 1167 + }, + { + "epoch": 1.328782707622298, + "grad_norm": 0.9578267335891724, + "learning_rate": 0.0007351535836177474, + "loss": 1.9568, + "step": 1168 + }, + { + "epoch": 1.3299203640500568, + "grad_norm": 0.9573442339897156, + "learning_rate": 0.0007349260523321957, + "loss": 2.6368, + "step": 1169 + }, + { + "epoch": 1.3310580204778157, + "grad_norm": 0.8377273678779602, + "learning_rate": 0.000734698521046644, + "loss": 1.9496, + "step": 1170 + }, + { + "epoch": 1.3321956769055745, + "grad_norm": 1.150334358215332, + "learning_rate": 0.0007344709897610922, + "loss": 1.8031, + "step": 1171 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.1262184381484985, + "learning_rate": 0.0007342434584755404, + "loss": 2.4094, + "step": 1172 + }, + { + "epoch": 1.3344709897610922, + "grad_norm": 0.8950188159942627, + "learning_rate": 0.0007340159271899887, + "loss": 1.557, + "step": 1173 + }, + { + "epoch": 1.335608646188851, + "grad_norm": 1.0164450407028198, + "learning_rate": 0.0007337883959044369, + "loss": 2.5082, + "step": 1174 + }, + { + "epoch": 1.3367463026166098, + "grad_norm": 0.6207723021507263, + "learning_rate": 0.0007335608646188851, + "loss": 1.6945, + "step": 1175 + }, + { + "epoch": 1.3378839590443685, + "grad_norm": 1.3576558828353882, + "learning_rate": 0.0007333333333333333, + "loss": 2.383, + "step": 1176 + }, + { + "epoch": 1.3390216154721273, + "grad_norm": 1.040291428565979, + "learning_rate": 0.0007331058020477816, + "loss": 1.887, + "step": 1177 + }, + { + "epoch": 1.3401592718998863, + "grad_norm": 1.2558809518814087, + "learning_rate": 0.0007328782707622298, + "loss": 2.6059, + "step": 1178 + }, + { + "epoch": 1.341296928327645, + "grad_norm": 1.1690802574157715, + "learning_rate": 0.0007326507394766781, + "loss": 2.0956, + "step": 1179 + }, + { + "epoch": 1.342434584755404, + "grad_norm": 0.6343255043029785, + "learning_rate": 0.0007324232081911263, + "loss": 1.1936, + "step": 1180 + }, + { + "epoch": 1.3435722411831628, + "grad_norm": 1.210610270500183, + "learning_rate": 0.0007321956769055745, + "loss": 2.7752, + "step": 1181 + }, + { + "epoch": 1.3447098976109215, + "grad_norm": 1.1297094821929932, + "learning_rate": 0.0007319681456200228, + "loss": 1.6202, + "step": 1182 + }, + { + "epoch": 1.3458475540386803, + "grad_norm": 1.0680432319641113, + "learning_rate": 0.000731740614334471, + "loss": 2.5643, + "step": 1183 + }, + { + "epoch": 1.346985210466439, + "grad_norm": 1.117632508277893, + "learning_rate": 0.0007315130830489192, + "loss": 2.2324, + "step": 1184 + }, + { + "epoch": 1.348122866894198, + "grad_norm": 1.2003607749938965, + "learning_rate": 0.0007312855517633676, + "loss": 1.7135, + "step": 1185 + }, + { + "epoch": 1.3492605233219568, + "grad_norm": 1.0429855585098267, + "learning_rate": 0.0007310580204778157, + "loss": 1.7824, + "step": 1186 + }, + { + "epoch": 1.3503981797497155, + "grad_norm": 1.0069884061813354, + "learning_rate": 0.0007308304891922639, + "loss": 1.6408, + "step": 1187 + }, + { + "epoch": 1.3515358361774745, + "grad_norm": 0.7664169669151306, + "learning_rate": 0.0007306029579067122, + "loss": 1.3137, + "step": 1188 + }, + { + "epoch": 1.3526734926052333, + "grad_norm": 0.9586772322654724, + "learning_rate": 0.0007303754266211604, + "loss": 1.861, + "step": 1189 + }, + { + "epoch": 1.353811149032992, + "grad_norm": 0.6649326086044312, + "learning_rate": 0.0007301478953356086, + "loss": 1.0684, + "step": 1190 + }, + { + "epoch": 1.3549488054607508, + "grad_norm": 1.4369438886642456, + "learning_rate": 0.0007299203640500569, + "loss": 3.2283, + "step": 1191 + }, + { + "epoch": 1.3560864618885096, + "grad_norm": 1.1447542905807495, + "learning_rate": 0.0007296928327645051, + "loss": 2.1419, + "step": 1192 + }, + { + "epoch": 1.3572241183162685, + "grad_norm": 0.9492262601852417, + "learning_rate": 0.0007294653014789535, + "loss": 1.9544, + "step": 1193 + }, + { + "epoch": 1.3583617747440273, + "grad_norm": 0.8491740822792053, + "learning_rate": 0.0007292377701934017, + "loss": 1.5818, + "step": 1194 + }, + { + "epoch": 1.359499431171786, + "grad_norm": 1.0647927522659302, + "learning_rate": 0.0007290102389078499, + "loss": 2.6932, + "step": 1195 + }, + { + "epoch": 1.360637087599545, + "grad_norm": 1.270956039428711, + "learning_rate": 0.0007287827076222981, + "loss": 1.9941, + "step": 1196 + }, + { + "epoch": 1.3617747440273038, + "grad_norm": 0.9028446674346924, + "learning_rate": 0.0007285551763367463, + "loss": 2.2445, + "step": 1197 + }, + { + "epoch": 1.3629124004550626, + "grad_norm": 0.8218393921852112, + "learning_rate": 0.0007283276450511945, + "loss": 1.9432, + "step": 1198 + }, + { + "epoch": 1.3640500568828213, + "grad_norm": 0.7149525284767151, + "learning_rate": 0.0007281001137656428, + "loss": 0.8669, + "step": 1199 + }, + { + "epoch": 1.36518771331058, + "grad_norm": 0.9978352189064026, + "learning_rate": 0.000727872582480091, + "loss": 1.4849, + "step": 1200 + }, + { + "epoch": 1.366325369738339, + "grad_norm": 0.9793040752410889, + "learning_rate": 0.0007276450511945392, + "loss": 1.7734, + "step": 1201 + }, + { + "epoch": 1.3674630261660978, + "grad_norm": 0.8100583553314209, + "learning_rate": 0.0007274175199089876, + "loss": 1.7183, + "step": 1202 + }, + { + "epoch": 1.3686006825938566, + "grad_norm": 1.051924467086792, + "learning_rate": 0.0007271899886234358, + "loss": 2.0836, + "step": 1203 + }, + { + "epoch": 1.3697383390216156, + "grad_norm": 1.057503342628479, + "learning_rate": 0.000726962457337884, + "loss": 2.1088, + "step": 1204 + }, + { + "epoch": 1.3708759954493743, + "grad_norm": 1.1179509162902832, + "learning_rate": 0.0007267349260523322, + "loss": 1.452, + "step": 1205 + }, + { + "epoch": 1.372013651877133, + "grad_norm": 1.2062307596206665, + "learning_rate": 0.0007265073947667804, + "loss": 2.4706, + "step": 1206 + }, + { + "epoch": 1.3731513083048918, + "grad_norm": 1.5062495470046997, + "learning_rate": 0.0007262798634812286, + "loss": 3.7995, + "step": 1207 + }, + { + "epoch": 1.3742889647326506, + "grad_norm": 1.0101311206817627, + "learning_rate": 0.0007260523321956769, + "loss": 1.889, + "step": 1208 + }, + { + "epoch": 1.3754266211604096, + "grad_norm": 0.5304610729217529, + "learning_rate": 0.0007258248009101251, + "loss": 0.8576, + "step": 1209 + }, + { + "epoch": 1.3765642775881684, + "grad_norm": 0.82547926902771, + "learning_rate": 0.0007255972696245733, + "loss": 1.8134, + "step": 1210 + }, + { + "epoch": 1.377701934015927, + "grad_norm": 1.220291256904602, + "learning_rate": 0.0007253697383390217, + "loss": 2.1109, + "step": 1211 + }, + { + "epoch": 1.378839590443686, + "grad_norm": 0.7384538650512695, + "learning_rate": 0.0007251422070534699, + "loss": 1.9597, + "step": 1212 + }, + { + "epoch": 1.3799772468714449, + "grad_norm": 1.0133798122406006, + "learning_rate": 0.0007249146757679182, + "loss": 2.7964, + "step": 1213 + }, + { + "epoch": 1.3811149032992036, + "grad_norm": 1.00083327293396, + "learning_rate": 0.0007246871444823664, + "loss": 2.6457, + "step": 1214 + }, + { + "epoch": 1.3822525597269624, + "grad_norm": 0.9185119271278381, + "learning_rate": 0.0007244596131968145, + "loss": 1.5573, + "step": 1215 + }, + { + "epoch": 1.3833902161547214, + "grad_norm": 0.8959344029426575, + "learning_rate": 0.0007242320819112628, + "loss": 1.5507, + "step": 1216 + }, + { + "epoch": 1.3845278725824801, + "grad_norm": 0.7419568300247192, + "learning_rate": 0.000724004550625711, + "loss": 1.599, + "step": 1217 + }, + { + "epoch": 1.3856655290102389, + "grad_norm": 1.2727500200271606, + "learning_rate": 0.0007237770193401592, + "loss": 1.8304, + "step": 1218 + }, + { + "epoch": 1.3868031854379979, + "grad_norm": 0.6277416944503784, + "learning_rate": 0.0007235494880546076, + "loss": 1.4432, + "step": 1219 + }, + { + "epoch": 1.3879408418657566, + "grad_norm": 1.2073726654052734, + "learning_rate": 0.0007233219567690558, + "loss": 2.4847, + "step": 1220 + }, + { + "epoch": 1.3890784982935154, + "grad_norm": 1.075276255607605, + "learning_rate": 0.000723094425483504, + "loss": 2.4881, + "step": 1221 + }, + { + "epoch": 1.3902161547212741, + "grad_norm": 1.0907241106033325, + "learning_rate": 0.0007228668941979523, + "loss": 2.7326, + "step": 1222 + }, + { + "epoch": 1.391353811149033, + "grad_norm": 0.9791719317436218, + "learning_rate": 0.0007226393629124005, + "loss": 1.8307, + "step": 1223 + }, + { + "epoch": 1.3924914675767919, + "grad_norm": 1.1889147758483887, + "learning_rate": 0.0007224118316268487, + "loss": 3.306, + "step": 1224 + }, + { + "epoch": 1.3936291240045506, + "grad_norm": 0.9719458818435669, + "learning_rate": 0.0007221843003412969, + "loss": 1.7917, + "step": 1225 + }, + { + "epoch": 1.3947667804323094, + "grad_norm": 1.136434555053711, + "learning_rate": 0.0007219567690557451, + "loss": 2.1016, + "step": 1226 + }, + { + "epoch": 1.3959044368600684, + "grad_norm": 0.9792470335960388, + "learning_rate": 0.0007217292377701933, + "loss": 1.5415, + "step": 1227 + }, + { + "epoch": 1.3970420932878271, + "grad_norm": 0.5439932346343994, + "learning_rate": 0.0007215017064846417, + "loss": 0.6703, + "step": 1228 + }, + { + "epoch": 1.398179749715586, + "grad_norm": 1.0243198871612549, + "learning_rate": 0.0007212741751990899, + "loss": 1.7786, + "step": 1229 + }, + { + "epoch": 1.3993174061433447, + "grad_norm": 1.2160857915878296, + "learning_rate": 0.0007210466439135381, + "loss": 2.0586, + "step": 1230 + }, + { + "epoch": 1.4004550625711034, + "grad_norm": 1.0249682664871216, + "learning_rate": 0.0007208191126279864, + "loss": 2.3755, + "step": 1231 + }, + { + "epoch": 1.4015927189988624, + "grad_norm": 1.3977047204971313, + "learning_rate": 0.0007205915813424346, + "loss": 2.1492, + "step": 1232 + }, + { + "epoch": 1.4027303754266212, + "grad_norm": 0.7847321033477783, + "learning_rate": 0.0007203640500568829, + "loss": 1.2055, + "step": 1233 + }, + { + "epoch": 1.40386803185438, + "grad_norm": 1.104148030281067, + "learning_rate": 0.0007201365187713311, + "loss": 3.1498, + "step": 1234 + }, + { + "epoch": 1.405005688282139, + "grad_norm": 0.87827068567276, + "learning_rate": 0.0007199089874857792, + "loss": 2.2748, + "step": 1235 + }, + { + "epoch": 1.4061433447098977, + "grad_norm": 1.0710291862487793, + "learning_rate": 0.0007196814562002276, + "loss": 1.4276, + "step": 1236 + }, + { + "epoch": 1.4072810011376564, + "grad_norm": 0.950809121131897, + "learning_rate": 0.0007194539249146758, + "loss": 1.8939, + "step": 1237 + }, + { + "epoch": 1.4084186575654152, + "grad_norm": 1.010000228881836, + "learning_rate": 0.000719226393629124, + "loss": 1.6589, + "step": 1238 + }, + { + "epoch": 1.409556313993174, + "grad_norm": 1.1753206253051758, + "learning_rate": 0.0007189988623435723, + "loss": 2.3059, + "step": 1239 + }, + { + "epoch": 1.410693970420933, + "grad_norm": 1.1671147346496582, + "learning_rate": 0.0007187713310580205, + "loss": 2.4946, + "step": 1240 + }, + { + "epoch": 1.4118316268486917, + "grad_norm": 0.8529374599456787, + "learning_rate": 0.0007185437997724687, + "loss": 2.0972, + "step": 1241 + }, + { + "epoch": 1.4129692832764504, + "grad_norm": 1.1962100267410278, + "learning_rate": 0.000718316268486917, + "loss": 2.4916, + "step": 1242 + }, + { + "epoch": 1.4141069397042094, + "grad_norm": 0.8587897419929504, + "learning_rate": 0.0007180887372013652, + "loss": 0.9147, + "step": 1243 + }, + { + "epoch": 1.4152445961319682, + "grad_norm": 0.9558615684509277, + "learning_rate": 0.0007178612059158133, + "loss": 1.8377, + "step": 1244 + }, + { + "epoch": 1.416382252559727, + "grad_norm": 0.694108784198761, + "learning_rate": 0.0007176336746302617, + "loss": 1.4134, + "step": 1245 + }, + { + "epoch": 1.4175199089874857, + "grad_norm": 0.855204164981842, + "learning_rate": 0.0007174061433447099, + "loss": 2.012, + "step": 1246 + }, + { + "epoch": 1.4186575654152445, + "grad_norm": 1.4311326742172241, + "learning_rate": 0.0007171786120591581, + "loss": 3.41, + "step": 1247 + }, + { + "epoch": 1.4197952218430034, + "grad_norm": 0.9610998630523682, + "learning_rate": 0.0007169510807736064, + "loss": 1.6547, + "step": 1248 + }, + { + "epoch": 1.4209328782707622, + "grad_norm": 0.7780663371086121, + "learning_rate": 0.0007167235494880546, + "loss": 1.9053, + "step": 1249 + }, + { + "epoch": 1.4220705346985212, + "grad_norm": 1.3516772985458374, + "learning_rate": 0.0007164960182025028, + "loss": 0.701, + "step": 1250 + }, + { + "epoch": 1.42320819112628, + "grad_norm": 1.596235990524292, + "learning_rate": 0.0007162684869169511, + "loss": 2.6895, + "step": 1251 + }, + { + "epoch": 1.4243458475540387, + "grad_norm": 0.9624771475791931, + "learning_rate": 0.0007160409556313994, + "loss": 1.341, + "step": 1252 + }, + { + "epoch": 1.4254835039817975, + "grad_norm": 0.8453911542892456, + "learning_rate": 0.0007158134243458477, + "loss": 2.0326, + "step": 1253 + }, + { + "epoch": 1.4266211604095562, + "grad_norm": 1.0875787734985352, + "learning_rate": 0.0007155858930602958, + "loss": 1.4671, + "step": 1254 + }, + { + "epoch": 1.4277588168373152, + "grad_norm": 1.0011261701583862, + "learning_rate": 0.000715358361774744, + "loss": 1.4844, + "step": 1255 + }, + { + "epoch": 1.428896473265074, + "grad_norm": 0.7726243734359741, + "learning_rate": 0.0007151308304891923, + "loss": 1.7404, + "step": 1256 + }, + { + "epoch": 1.4300341296928327, + "grad_norm": 0.7656590342521667, + "learning_rate": 0.0007149032992036405, + "loss": 1.6766, + "step": 1257 + }, + { + "epoch": 1.4311717861205917, + "grad_norm": 0.8595698475837708, + "learning_rate": 0.0007146757679180887, + "loss": 2.1565, + "step": 1258 + }, + { + "epoch": 1.4323094425483505, + "grad_norm": 1.003932237625122, + "learning_rate": 0.000714448236632537, + "loss": 1.721, + "step": 1259 + }, + { + "epoch": 1.4334470989761092, + "grad_norm": 0.8126673102378845, + "learning_rate": 0.0007142207053469852, + "loss": 2.0854, + "step": 1260 + }, + { + "epoch": 1.434584755403868, + "grad_norm": 0.9045354723930359, + "learning_rate": 0.0007139931740614335, + "loss": 1.5096, + "step": 1261 + }, + { + "epoch": 1.4357224118316267, + "grad_norm": 0.923866331577301, + "learning_rate": 0.0007137656427758818, + "loss": 2.2521, + "step": 1262 + }, + { + "epoch": 1.4368600682593857, + "grad_norm": 0.8187153339385986, + "learning_rate": 0.00071353811149033, + "loss": 2.3217, + "step": 1263 + }, + { + "epoch": 1.4379977246871445, + "grad_norm": 1.0635052919387817, + "learning_rate": 0.0007133105802047781, + "loss": 1.8729, + "step": 1264 + }, + { + "epoch": 1.4391353811149032, + "grad_norm": 0.8792582750320435, + "learning_rate": 0.0007130830489192264, + "loss": 1.9955, + "step": 1265 + }, + { + "epoch": 1.4402730375426622, + "grad_norm": 1.3968185186386108, + "learning_rate": 0.0007128555176336746, + "loss": 4.4219, + "step": 1266 + }, + { + "epoch": 1.441410693970421, + "grad_norm": 0.956078827381134, + "learning_rate": 0.0007126279863481228, + "loss": 1.5945, + "step": 1267 + }, + { + "epoch": 1.4425483503981797, + "grad_norm": 0.8696045875549316, + "learning_rate": 0.0007124004550625711, + "loss": 1.1966, + "step": 1268 + }, + { + "epoch": 1.4436860068259385, + "grad_norm": 0.7060182690620422, + "learning_rate": 0.0007121729237770194, + "loss": 1.3995, + "step": 1269 + }, + { + "epoch": 1.4448236632536973, + "grad_norm": 0.8254784941673279, + "learning_rate": 0.0007119453924914676, + "loss": 1.0873, + "step": 1270 + }, + { + "epoch": 1.4459613196814562, + "grad_norm": 1.0359618663787842, + "learning_rate": 0.0007117178612059159, + "loss": 2.2489, + "step": 1271 + }, + { + "epoch": 1.447098976109215, + "grad_norm": 0.9475293159484863, + "learning_rate": 0.0007114903299203641, + "loss": 1.8545, + "step": 1272 + }, + { + "epoch": 1.4482366325369738, + "grad_norm": 0.7588447332382202, + "learning_rate": 0.0007112627986348122, + "loss": 1.5502, + "step": 1273 + }, + { + "epoch": 1.4493742889647327, + "grad_norm": 1.24519681930542, + "learning_rate": 0.0007110352673492605, + "loss": 2.6479, + "step": 1274 + }, + { + "epoch": 1.4505119453924915, + "grad_norm": 0.6365463137626648, + "learning_rate": 0.0007108077360637087, + "loss": 1.404, + "step": 1275 + }, + { + "epoch": 1.4516496018202503, + "grad_norm": 1.4220560789108276, + "learning_rate": 0.000710580204778157, + "loss": 2.4303, + "step": 1276 + }, + { + "epoch": 1.452787258248009, + "grad_norm": 0.965912938117981, + "learning_rate": 0.0007103526734926053, + "loss": 2.1538, + "step": 1277 + }, + { + "epoch": 1.4539249146757678, + "grad_norm": 1.103785514831543, + "learning_rate": 0.0007101251422070535, + "loss": 2.8817, + "step": 1278 + }, + { + "epoch": 1.4550625711035268, + "grad_norm": 1.0863354206085205, + "learning_rate": 0.0007098976109215018, + "loss": 3.1132, + "step": 1279 + }, + { + "epoch": 1.4562002275312855, + "grad_norm": 0.988666296005249, + "learning_rate": 0.00070967007963595, + "loss": 1.915, + "step": 1280 + }, + { + "epoch": 1.4573378839590443, + "grad_norm": 1.0589752197265625, + "learning_rate": 0.0007094425483503982, + "loss": 1.6478, + "step": 1281 + }, + { + "epoch": 1.4584755403868033, + "grad_norm": 0.9732983708381653, + "learning_rate": 0.0007092150170648465, + "loss": 1.834, + "step": 1282 + }, + { + "epoch": 1.459613196814562, + "grad_norm": 0.932620644569397, + "learning_rate": 0.0007089874857792946, + "loss": 2.184, + "step": 1283 + }, + { + "epoch": 1.4607508532423208, + "grad_norm": 1.0213489532470703, + "learning_rate": 0.0007087599544937428, + "loss": 1.9008, + "step": 1284 + }, + { + "epoch": 1.4618885096700796, + "grad_norm": 0.8882502317428589, + "learning_rate": 0.0007085324232081911, + "loss": 1.6468, + "step": 1285 + }, + { + "epoch": 1.4630261660978385, + "grad_norm": 0.794425904750824, + "learning_rate": 0.0007083048919226394, + "loss": 2.0429, + "step": 1286 + }, + { + "epoch": 1.4641638225255973, + "grad_norm": 1.1039729118347168, + "learning_rate": 0.0007080773606370876, + "loss": 2.3133, + "step": 1287 + }, + { + "epoch": 1.465301478953356, + "grad_norm": 1.3886933326721191, + "learning_rate": 0.0007078498293515359, + "loss": 2.0589, + "step": 1288 + }, + { + "epoch": 1.466439135381115, + "grad_norm": 0.6157169938087463, + "learning_rate": 0.0007076222980659841, + "loss": 1.0264, + "step": 1289 + }, + { + "epoch": 1.4675767918088738, + "grad_norm": 1.0444914102554321, + "learning_rate": 0.0007073947667804323, + "loss": 2.8021, + "step": 1290 + }, + { + "epoch": 1.4687144482366326, + "grad_norm": 0.9021384716033936, + "learning_rate": 0.0007071672354948806, + "loss": 1.5786, + "step": 1291 + }, + { + "epoch": 1.4698521046643913, + "grad_norm": 0.9910659790039062, + "learning_rate": 0.0007069397042093288, + "loss": 1.3046, + "step": 1292 + }, + { + "epoch": 1.47098976109215, + "grad_norm": 0.8417410254478455, + "learning_rate": 0.0007067121729237769, + "loss": 1.388, + "step": 1293 + }, + { + "epoch": 1.472127417519909, + "grad_norm": 0.8313772082328796, + "learning_rate": 0.0007064846416382253, + "loss": 1.5113, + "step": 1294 + }, + { + "epoch": 1.4732650739476678, + "grad_norm": 0.9730493426322937, + "learning_rate": 0.0007062571103526735, + "loss": 1.3389, + "step": 1295 + }, + { + "epoch": 1.4744027303754266, + "grad_norm": 0.9900069832801819, + "learning_rate": 0.0007060295790671218, + "loss": 2.1483, + "step": 1296 + }, + { + "epoch": 1.4755403868031856, + "grad_norm": 0.8475415110588074, + "learning_rate": 0.00070580204778157, + "loss": 1.397, + "step": 1297 + }, + { + "epoch": 1.4766780432309443, + "grad_norm": 1.0743128061294556, + "learning_rate": 0.0007055745164960182, + "loss": 1.8463, + "step": 1298 + }, + { + "epoch": 1.477815699658703, + "grad_norm": 1.1737279891967773, + "learning_rate": 0.0007053469852104665, + "loss": 3.4954, + "step": 1299 + }, + { + "epoch": 1.4789533560864618, + "grad_norm": 1.0930012464523315, + "learning_rate": 0.0007051194539249147, + "loss": 2.65, + "step": 1300 + }, + { + "epoch": 1.4800910125142206, + "grad_norm": 0.6050748229026794, + "learning_rate": 0.0007048919226393629, + "loss": 1.1043, + "step": 1301 + }, + { + "epoch": 1.4812286689419796, + "grad_norm": 2.5956709384918213, + "learning_rate": 0.0007046643913538113, + "loss": 4.9918, + "step": 1302 + }, + { + "epoch": 1.4823663253697383, + "grad_norm": 1.5760893821716309, + "learning_rate": 0.0007044368600682594, + "loss": 3.41, + "step": 1303 + }, + { + "epoch": 1.483503981797497, + "grad_norm": 1.2234876155853271, + "learning_rate": 0.0007042093287827076, + "loss": 2.0168, + "step": 1304 + }, + { + "epoch": 1.484641638225256, + "grad_norm": 1.5653973817825317, + "learning_rate": 0.0007039817974971559, + "loss": 3.4326, + "step": 1305 + }, + { + "epoch": 1.4857792946530148, + "grad_norm": 0.9075149297714233, + "learning_rate": 0.0007037542662116041, + "loss": 1.5127, + "step": 1306 + }, + { + "epoch": 1.4869169510807736, + "grad_norm": 0.8964717984199524, + "learning_rate": 0.0007035267349260523, + "loss": 1.9019, + "step": 1307 + }, + { + "epoch": 1.4880546075085324, + "grad_norm": 0.8409413695335388, + "learning_rate": 0.0007032992036405006, + "loss": 1.4982, + "step": 1308 + }, + { + "epoch": 1.4891922639362911, + "grad_norm": 0.7918345928192139, + "learning_rate": 0.0007030716723549488, + "loss": 1.5671, + "step": 1309 + }, + { + "epoch": 1.49032992036405, + "grad_norm": 0.9930965900421143, + "learning_rate": 0.000702844141069397, + "loss": 1.7916, + "step": 1310 + }, + { + "epoch": 1.4914675767918089, + "grad_norm": 0.7483389377593994, + "learning_rate": 0.0007026166097838454, + "loss": 1.2254, + "step": 1311 + }, + { + "epoch": 1.4926052332195676, + "grad_norm": 0.5501680374145508, + "learning_rate": 0.0007023890784982935, + "loss": 1.4911, + "step": 1312 + }, + { + "epoch": 1.4937428896473266, + "grad_norm": 0.6416218280792236, + "learning_rate": 0.0007021615472127417, + "loss": 1.1327, + "step": 1313 + }, + { + "epoch": 1.4948805460750854, + "grad_norm": 1.3638542890548706, + "learning_rate": 0.00070193401592719, + "loss": 2.1457, + "step": 1314 + }, + { + "epoch": 1.4960182025028441, + "grad_norm": 0.625977635383606, + "learning_rate": 0.0007017064846416382, + "loss": 0.9247, + "step": 1315 + }, + { + "epoch": 1.4971558589306029, + "grad_norm": 0.9826652407646179, + "learning_rate": 0.0007014789533560865, + "loss": 2.6433, + "step": 1316 + }, + { + "epoch": 1.4982935153583616, + "grad_norm": 0.8866605162620544, + "learning_rate": 0.0007012514220705347, + "loss": 1.3826, + "step": 1317 + }, + { + "epoch": 1.4994311717861206, + "grad_norm": 0.7106355428695679, + "learning_rate": 0.0007010238907849829, + "loss": 1.3616, + "step": 1318 + }, + { + "epoch": 1.5005688282138794, + "grad_norm": 0.9405243992805481, + "learning_rate": 0.0007007963594994313, + "loss": 1.6128, + "step": 1319 + }, + { + "epoch": 1.5017064846416384, + "grad_norm": 0.947182834148407, + "learning_rate": 0.0007005688282138795, + "loss": 3.0259, + "step": 1320 + }, + { + "epoch": 1.5028441410693971, + "grad_norm": 0.89002925157547, + "learning_rate": 0.0007003412969283277, + "loss": 2.0293, + "step": 1321 + }, + { + "epoch": 1.5039817974971559, + "grad_norm": 0.582564115524292, + "learning_rate": 0.0007001137656427759, + "loss": 0.5041, + "step": 1322 + }, + { + "epoch": 1.5051194539249146, + "grad_norm": 0.961336612701416, + "learning_rate": 0.0006998862343572241, + "loss": 2.0847, + "step": 1323 + }, + { + "epoch": 1.5062571103526734, + "grad_norm": 1.3698042631149292, + "learning_rate": 0.0006996587030716723, + "loss": 2.7407, + "step": 1324 + }, + { + "epoch": 1.5073947667804322, + "grad_norm": 1.4105736017227173, + "learning_rate": 0.0006994311717861206, + "loss": 2.8289, + "step": 1325 + }, + { + "epoch": 1.5085324232081911, + "grad_norm": 0.9374104142189026, + "learning_rate": 0.0006992036405005688, + "loss": 1.7586, + "step": 1326 + }, + { + "epoch": 1.50967007963595, + "grad_norm": 0.7621793746948242, + "learning_rate": 0.000698976109215017, + "loss": 1.3067, + "step": 1327 + }, + { + "epoch": 1.5108077360637089, + "grad_norm": 1.1107509136199951, + "learning_rate": 0.0006987485779294654, + "loss": 2.591, + "step": 1328 + }, + { + "epoch": 1.5119453924914676, + "grad_norm": 0.9593746662139893, + "learning_rate": 0.0006985210466439136, + "loss": 2.4804, + "step": 1329 + }, + { + "epoch": 1.5130830489192264, + "grad_norm": 0.7596848011016846, + "learning_rate": 0.0006982935153583618, + "loss": 0.9857, + "step": 1330 + }, + { + "epoch": 1.5142207053469852, + "grad_norm": 0.8829966187477112, + "learning_rate": 0.0006980659840728101, + "loss": 1.1982, + "step": 1331 + }, + { + "epoch": 1.515358361774744, + "grad_norm": 0.8279595375061035, + "learning_rate": 0.0006978384527872582, + "loss": 2.2226, + "step": 1332 + }, + { + "epoch": 1.5164960182025027, + "grad_norm": 1.1019062995910645, + "learning_rate": 0.0006976109215017064, + "loss": 1.7367, + "step": 1333 + }, + { + "epoch": 1.5176336746302617, + "grad_norm": 1.7979804277420044, + "learning_rate": 0.0006973833902161547, + "loss": 1.959, + "step": 1334 + }, + { + "epoch": 1.5187713310580204, + "grad_norm": 0.8705268502235413, + "learning_rate": 0.0006971558589306029, + "loss": 2.1202, + "step": 1335 + }, + { + "epoch": 1.5199089874857794, + "grad_norm": 1.0724592208862305, + "learning_rate": 0.0006969283276450513, + "loss": 1.7139, + "step": 1336 + }, + { + "epoch": 1.5210466439135382, + "grad_norm": 0.9775965809822083, + "learning_rate": 0.0006967007963594995, + "loss": 2.0016, + "step": 1337 + }, + { + "epoch": 1.522184300341297, + "grad_norm": 0.9448524713516235, + "learning_rate": 0.0006964732650739477, + "loss": 1.6153, + "step": 1338 + }, + { + "epoch": 1.5233219567690557, + "grad_norm": 1.5415128469467163, + "learning_rate": 0.000696245733788396, + "loss": 3.1016, + "step": 1339 + }, + { + "epoch": 1.5244596131968144, + "grad_norm": 0.9991744160652161, + "learning_rate": 0.0006960182025028442, + "loss": 1.5704, + "step": 1340 + }, + { + "epoch": 1.5255972696245734, + "grad_norm": 0.5757598280906677, + "learning_rate": 0.0006957906712172923, + "loss": 1.0319, + "step": 1341 + }, + { + "epoch": 1.5267349260523322, + "grad_norm": 0.7392444014549255, + "learning_rate": 0.0006955631399317406, + "loss": 1.6469, + "step": 1342 + }, + { + "epoch": 1.5278725824800912, + "grad_norm": 0.8269877433776855, + "learning_rate": 0.0006953356086461888, + "loss": 1.5095, + "step": 1343 + }, + { + "epoch": 1.52901023890785, + "grad_norm": 0.8942914605140686, + "learning_rate": 0.000695108077360637, + "loss": 1.7443, + "step": 1344 + }, + { + "epoch": 1.5301478953356087, + "grad_norm": 0.9986234307289124, + "learning_rate": 0.0006948805460750854, + "loss": 2.1091, + "step": 1345 + }, + { + "epoch": 1.5312855517633674, + "grad_norm": 1.5111842155456543, + "learning_rate": 0.0006946530147895336, + "loss": 2.9399, + "step": 1346 + }, + { + "epoch": 1.5324232081911262, + "grad_norm": 0.8107113242149353, + "learning_rate": 0.0006944254835039818, + "loss": 1.5483, + "step": 1347 + }, + { + "epoch": 1.533560864618885, + "grad_norm": 0.9013845324516296, + "learning_rate": 0.0006941979522184301, + "loss": 2.0372, + "step": 1348 + }, + { + "epoch": 1.534698521046644, + "grad_norm": 0.812639057636261, + "learning_rate": 0.0006939704209328783, + "loss": 1.7029, + "step": 1349 + }, + { + "epoch": 1.5358361774744027, + "grad_norm": 1.5611118078231812, + "learning_rate": 0.0006937428896473265, + "loss": 3.1895, + "step": 1350 + }, + { + "epoch": 1.5369738339021617, + "grad_norm": 0.7234958410263062, + "learning_rate": 0.0006935153583617747, + "loss": 1.7994, + "step": 1351 + }, + { + "epoch": 1.5381114903299204, + "grad_norm": 0.5042601227760315, + "learning_rate": 0.0006932878270762229, + "loss": 0.6559, + "step": 1352 + }, + { + "epoch": 1.5392491467576792, + "grad_norm": 0.7862391471862793, + "learning_rate": 0.0006930602957906712, + "loss": 1.2773, + "step": 1353 + }, + { + "epoch": 1.540386803185438, + "grad_norm": 0.8332534432411194, + "learning_rate": 0.0006928327645051195, + "loss": 1.7685, + "step": 1354 + }, + { + "epoch": 1.5415244596131967, + "grad_norm": 0.7258904576301575, + "learning_rate": 0.0006926052332195677, + "loss": 0.8505, + "step": 1355 + }, + { + "epoch": 1.5426621160409555, + "grad_norm": 0.6030761003494263, + "learning_rate": 0.000692377701934016, + "loss": 1.2053, + "step": 1356 + }, + { + "epoch": 1.5437997724687145, + "grad_norm": 1.0888185501098633, + "learning_rate": 0.0006921501706484642, + "loss": 2.0252, + "step": 1357 + }, + { + "epoch": 1.5449374288964732, + "grad_norm": 1.2503432035446167, + "learning_rate": 0.0006919226393629124, + "loss": 3.6095, + "step": 1358 + }, + { + "epoch": 1.5460750853242322, + "grad_norm": 0.6536726951599121, + "learning_rate": 0.0006916951080773607, + "loss": 1.171, + "step": 1359 + }, + { + "epoch": 1.547212741751991, + "grad_norm": 0.8668822646141052, + "learning_rate": 0.000691467576791809, + "loss": 2.234, + "step": 1360 + }, + { + "epoch": 1.5483503981797497, + "grad_norm": 1.2817611694335938, + "learning_rate": 0.000691240045506257, + "loss": 1.7949, + "step": 1361 + }, + { + "epoch": 1.5494880546075085, + "grad_norm": 0.9210306406021118, + "learning_rate": 0.0006910125142207054, + "loss": 0.6781, + "step": 1362 + }, + { + "epoch": 1.5506257110352673, + "grad_norm": 1.0318769216537476, + "learning_rate": 0.0006907849829351536, + "loss": 2.4866, + "step": 1363 + }, + { + "epoch": 1.551763367463026, + "grad_norm": 1.0677587985992432, + "learning_rate": 0.0006905574516496018, + "loss": 2.1298, + "step": 1364 + }, + { + "epoch": 1.552901023890785, + "grad_norm": 0.6409241557121277, + "learning_rate": 0.0006903299203640501, + "loss": 1.4516, + "step": 1365 + }, + { + "epoch": 1.5540386803185438, + "grad_norm": 1.06160569190979, + "learning_rate": 0.0006901023890784983, + "loss": 2.8236, + "step": 1366 + }, + { + "epoch": 1.5551763367463027, + "grad_norm": 1.5889126062393188, + "learning_rate": 0.0006898748577929465, + "loss": 2.9584, + "step": 1367 + }, + { + "epoch": 1.5563139931740615, + "grad_norm": 0.5789588093757629, + "learning_rate": 0.0006896473265073948, + "loss": 1.2789, + "step": 1368 + }, + { + "epoch": 1.5574516496018203, + "grad_norm": 1.5852688550949097, + "learning_rate": 0.000689419795221843, + "loss": 3.2372, + "step": 1369 + }, + { + "epoch": 1.558589306029579, + "grad_norm": 0.8930657505989075, + "learning_rate": 0.0006891922639362913, + "loss": 1.2431, + "step": 1370 + }, + { + "epoch": 1.5597269624573378, + "grad_norm": 1.1231560707092285, + "learning_rate": 0.0006889647326507395, + "loss": 2.3293, + "step": 1371 + }, + { + "epoch": 1.5608646188850968, + "grad_norm": 1.0928031206130981, + "learning_rate": 0.0006887372013651877, + "loss": 2.5423, + "step": 1372 + }, + { + "epoch": 1.5620022753128555, + "grad_norm": 0.9887200593948364, + "learning_rate": 0.0006885096700796359, + "loss": 2.1031, + "step": 1373 + }, + { + "epoch": 1.5631399317406145, + "grad_norm": 1.6233134269714355, + "learning_rate": 0.0006882821387940842, + "loss": 3.366, + "step": 1374 + }, + { + "epoch": 1.5642775881683733, + "grad_norm": 0.6267789006233215, + "learning_rate": 0.0006880546075085324, + "loss": 1.0757, + "step": 1375 + }, + { + "epoch": 1.565415244596132, + "grad_norm": 0.8272294402122498, + "learning_rate": 0.0006878270762229807, + "loss": 1.3076, + "step": 1376 + }, + { + "epoch": 1.5665529010238908, + "grad_norm": 0.7376548051834106, + "learning_rate": 0.000687599544937429, + "loss": 1.4426, + "step": 1377 + }, + { + "epoch": 1.5676905574516495, + "grad_norm": 1.136896014213562, + "learning_rate": 0.0006873720136518772, + "loss": 2.9297, + "step": 1378 + }, + { + "epoch": 1.5688282138794083, + "grad_norm": 0.8306413888931274, + "learning_rate": 0.0006871444823663255, + "loss": 1.9698, + "step": 1379 + }, + { + "epoch": 1.5699658703071673, + "grad_norm": 0.9145554900169373, + "learning_rate": 0.0006869169510807736, + "loss": 1.2156, + "step": 1380 + }, + { + "epoch": 1.571103526734926, + "grad_norm": 1.226146936416626, + "learning_rate": 0.0006866894197952218, + "loss": 2.1672, + "step": 1381 + }, + { + "epoch": 1.572241183162685, + "grad_norm": 0.6552311182022095, + "learning_rate": 0.0006864618885096701, + "loss": 0.9541, + "step": 1382 + }, + { + "epoch": 1.5733788395904438, + "grad_norm": 0.8643866777420044, + "learning_rate": 0.0006862343572241183, + "loss": 1.3602, + "step": 1383 + }, + { + "epoch": 1.5745164960182025, + "grad_norm": 1.0479025840759277, + "learning_rate": 0.0006860068259385665, + "loss": 2.6468, + "step": 1384 + }, + { + "epoch": 1.5756541524459613, + "grad_norm": 1.173642635345459, + "learning_rate": 0.0006857792946530148, + "loss": 2.4588, + "step": 1385 + }, + { + "epoch": 1.57679180887372, + "grad_norm": 0.751677930355072, + "learning_rate": 0.0006855517633674631, + "loss": 0.8411, + "step": 1386 + }, + { + "epoch": 1.5779294653014788, + "grad_norm": 1.1802860498428345, + "learning_rate": 0.0006853242320819113, + "loss": 3.0113, + "step": 1387 + }, + { + "epoch": 1.5790671217292378, + "grad_norm": 0.8296557068824768, + "learning_rate": 0.0006850967007963596, + "loss": 1.7901, + "step": 1388 + }, + { + "epoch": 1.5802047781569966, + "grad_norm": 1.1105976104736328, + "learning_rate": 0.0006848691695108078, + "loss": 1.6191, + "step": 1389 + }, + { + "epoch": 1.5813424345847555, + "grad_norm": 1.2469074726104736, + "learning_rate": 0.0006846416382252559, + "loss": 1.4178, + "step": 1390 + }, + { + "epoch": 1.5824800910125143, + "grad_norm": 1.089394450187683, + "learning_rate": 0.0006844141069397042, + "loss": 1.9584, + "step": 1391 + }, + { + "epoch": 1.583617747440273, + "grad_norm": 0.9064239859580994, + "learning_rate": 0.0006841865756541524, + "loss": 1.5156, + "step": 1392 + }, + { + "epoch": 1.5847554038680318, + "grad_norm": 1.0828756093978882, + "learning_rate": 0.0006839590443686006, + "loss": 2.1486, + "step": 1393 + }, + { + "epoch": 1.5858930602957906, + "grad_norm": 1.159847617149353, + "learning_rate": 0.000683731513083049, + "loss": 2.8738, + "step": 1394 + }, + { + "epoch": 1.5870307167235493, + "grad_norm": 0.6444835066795349, + "learning_rate": 0.0006835039817974972, + "loss": 1.2373, + "step": 1395 + }, + { + "epoch": 1.5881683731513083, + "grad_norm": 0.6985568404197693, + "learning_rate": 0.0006832764505119454, + "loss": 1.828, + "step": 1396 + }, + { + "epoch": 1.589306029579067, + "grad_norm": 0.9549153447151184, + "learning_rate": 0.0006830489192263937, + "loss": 1.5909, + "step": 1397 + }, + { + "epoch": 1.590443686006826, + "grad_norm": 0.720780074596405, + "learning_rate": 0.0006828213879408419, + "loss": 1.4502, + "step": 1398 + }, + { + "epoch": 1.5915813424345848, + "grad_norm": 1.090728998184204, + "learning_rate": 0.0006825938566552902, + "loss": 2.7399, + "step": 1399 + }, + { + "epoch": 1.5927189988623436, + "grad_norm": 1.3747398853302002, + "learning_rate": 0.0006823663253697383, + "loss": 2.4482, + "step": 1400 + }, + { + "epoch": 1.5938566552901023, + "grad_norm": 1.0005912780761719, + "learning_rate": 0.0006821387940841865, + "loss": 1.7441, + "step": 1401 + }, + { + "epoch": 1.594994311717861, + "grad_norm": 1.298248052597046, + "learning_rate": 0.0006819112627986348, + "loss": 2.1579, + "step": 1402 + }, + { + "epoch": 1.5961319681456199, + "grad_norm": 1.0068423748016357, + "learning_rate": 0.0006816837315130831, + "loss": 2.5003, + "step": 1403 + }, + { + "epoch": 1.5972696245733788, + "grad_norm": 1.1366382837295532, + "learning_rate": 0.0006814562002275313, + "loss": 2.3228, + "step": 1404 + }, + { + "epoch": 1.5984072810011376, + "grad_norm": 0.9474295973777771, + "learning_rate": 0.0006812286689419796, + "loss": 1.1316, + "step": 1405 + }, + { + "epoch": 1.5995449374288966, + "grad_norm": 1.3815207481384277, + "learning_rate": 0.0006810011376564278, + "loss": 4.0211, + "step": 1406 + }, + { + "epoch": 1.6006825938566553, + "grad_norm": 0.703368604183197, + "learning_rate": 0.000680773606370876, + "loss": 1.5288, + "step": 1407 + }, + { + "epoch": 1.601820250284414, + "grad_norm": 1.4243957996368408, + "learning_rate": 0.0006805460750853243, + "loss": 2.4914, + "step": 1408 + }, + { + "epoch": 1.6029579067121729, + "grad_norm": 1.1586917638778687, + "learning_rate": 0.0006803185437997725, + "loss": 2.1698, + "step": 1409 + }, + { + "epoch": 1.6040955631399316, + "grad_norm": 0.6247298717498779, + "learning_rate": 0.0006800910125142206, + "loss": 1.3813, + "step": 1410 + }, + { + "epoch": 1.6052332195676906, + "grad_norm": 1.181565284729004, + "learning_rate": 0.000679863481228669, + "loss": 2.3456, + "step": 1411 + }, + { + "epoch": 1.6063708759954494, + "grad_norm": 0.8031807541847229, + "learning_rate": 0.0006796359499431172, + "loss": 1.1918, + "step": 1412 + }, + { + "epoch": 1.6075085324232083, + "grad_norm": 0.920711874961853, + "learning_rate": 0.0006794084186575654, + "loss": 2.328, + "step": 1413 + }, + { + "epoch": 1.608646188850967, + "grad_norm": 1.0009433031082153, + "learning_rate": 0.0006791808873720137, + "loss": 1.3054, + "step": 1414 + }, + { + "epoch": 1.6097838452787259, + "grad_norm": 1.1435019969940186, + "learning_rate": 0.0006789533560864619, + "loss": 1.6864, + "step": 1415 + }, + { + "epoch": 1.6109215017064846, + "grad_norm": 0.9297929406166077, + "learning_rate": 0.0006787258248009101, + "loss": 2.1921, + "step": 1416 + }, + { + "epoch": 1.6120591581342434, + "grad_norm": 0.8632849454879761, + "learning_rate": 0.0006784982935153584, + "loss": 1.8867, + "step": 1417 + }, + { + "epoch": 1.6131968145620021, + "grad_norm": 0.7660686373710632, + "learning_rate": 0.0006782707622298066, + "loss": 1.4099, + "step": 1418 + }, + { + "epoch": 1.6143344709897611, + "grad_norm": 1.1486921310424805, + "learning_rate": 0.0006780432309442548, + "loss": 2.032, + "step": 1419 + }, + { + "epoch": 1.6154721274175199, + "grad_norm": 1.0095970630645752, + "learning_rate": 0.0006778156996587031, + "loss": 2.4319, + "step": 1420 + }, + { + "epoch": 1.6166097838452789, + "grad_norm": 1.0770419836044312, + "learning_rate": 0.0006775881683731513, + "loss": 2.338, + "step": 1421 + }, + { + "epoch": 1.6177474402730376, + "grad_norm": 0.8985020518302917, + "learning_rate": 0.0006773606370875996, + "loss": 1.3659, + "step": 1422 + }, + { + "epoch": 1.6188850967007964, + "grad_norm": 1.1922725439071655, + "learning_rate": 0.0006771331058020478, + "loss": 0.865, + "step": 1423 + }, + { + "epoch": 1.6200227531285551, + "grad_norm": 0.781085193157196, + "learning_rate": 0.000676905574516496, + "loss": 1.6722, + "step": 1424 + }, + { + "epoch": 1.621160409556314, + "grad_norm": 1.0233283042907715, + "learning_rate": 0.0006766780432309443, + "loss": 2.5444, + "step": 1425 + }, + { + "epoch": 1.6222980659840727, + "grad_norm": 0.7940084934234619, + "learning_rate": 0.0006764505119453925, + "loss": 1.4521, + "step": 1426 + }, + { + "epoch": 1.6234357224118316, + "grad_norm": 0.6902288794517517, + "learning_rate": 0.0006762229806598407, + "loss": 1.0446, + "step": 1427 + }, + { + "epoch": 1.6245733788395904, + "grad_norm": 0.9518580436706543, + "learning_rate": 0.0006759954493742891, + "loss": 2.1233, + "step": 1428 + }, + { + "epoch": 1.6257110352673494, + "grad_norm": 1.147662878036499, + "learning_rate": 0.0006757679180887372, + "loss": 3.0784, + "step": 1429 + }, + { + "epoch": 1.6268486916951082, + "grad_norm": 0.7210685610771179, + "learning_rate": 0.0006755403868031854, + "loss": 1.0661, + "step": 1430 + }, + { + "epoch": 1.627986348122867, + "grad_norm": 1.2803034782409668, + "learning_rate": 0.0006753128555176337, + "loss": 1.7098, + "step": 1431 + }, + { + "epoch": 1.6291240045506257, + "grad_norm": 1.3280972242355347, + "learning_rate": 0.0006750853242320819, + "loss": 1.756, + "step": 1432 + }, + { + "epoch": 1.6302616609783844, + "grad_norm": 0.7079578638076782, + "learning_rate": 0.0006748577929465301, + "loss": 1.4689, + "step": 1433 + }, + { + "epoch": 1.6313993174061432, + "grad_norm": 1.2363884449005127, + "learning_rate": 0.0006746302616609784, + "loss": 1.9533, + "step": 1434 + }, + { + "epoch": 1.6325369738339022, + "grad_norm": 1.2230910062789917, + "learning_rate": 0.0006744027303754266, + "loss": 2.7112, + "step": 1435 + }, + { + "epoch": 1.633674630261661, + "grad_norm": 1.5893361568450928, + "learning_rate": 0.0006741751990898749, + "loss": 1.6204, + "step": 1436 + }, + { + "epoch": 1.63481228668942, + "grad_norm": 0.9474936723709106, + "learning_rate": 0.0006739476678043232, + "loss": 2.2779, + "step": 1437 + }, + { + "epoch": 1.6359499431171787, + "grad_norm": 0.6129853129386902, + "learning_rate": 0.0006737201365187714, + "loss": 0.7177, + "step": 1438 + }, + { + "epoch": 1.6370875995449374, + "grad_norm": 0.7504928708076477, + "learning_rate": 0.0006734926052332196, + "loss": 2.2781, + "step": 1439 + }, + { + "epoch": 1.6382252559726962, + "grad_norm": 0.9989508986473083, + "learning_rate": 0.0006732650739476678, + "loss": 1.9803, + "step": 1440 + }, + { + "epoch": 1.639362912400455, + "grad_norm": 1.202582836151123, + "learning_rate": 0.000673037542662116, + "loss": 1.8057, + "step": 1441 + }, + { + "epoch": 1.640500568828214, + "grad_norm": 0.9157416224479675, + "learning_rate": 0.0006728100113765643, + "loss": 2.2069, + "step": 1442 + }, + { + "epoch": 1.6416382252559727, + "grad_norm": 0.6836434602737427, + "learning_rate": 0.0006725824800910125, + "loss": 1.0477, + "step": 1443 + }, + { + "epoch": 1.6427758816837317, + "grad_norm": 0.8547561168670654, + "learning_rate": 0.0006723549488054607, + "loss": 1.8937, + "step": 1444 + }, + { + "epoch": 1.6439135381114904, + "grad_norm": 0.8537775874137878, + "learning_rate": 0.0006721274175199091, + "loss": 1.5731, + "step": 1445 + }, + { + "epoch": 1.6450511945392492, + "grad_norm": 1.0358448028564453, + "learning_rate": 0.0006718998862343573, + "loss": 2.0885, + "step": 1446 + }, + { + "epoch": 1.646188850967008, + "grad_norm": 0.9270764589309692, + "learning_rate": 0.0006716723549488055, + "loss": 1.8622, + "step": 1447 + }, + { + "epoch": 1.6473265073947667, + "grad_norm": 0.6689456701278687, + "learning_rate": 0.0006714448236632537, + "loss": 0.7905, + "step": 1448 + }, + { + "epoch": 1.6484641638225255, + "grad_norm": 0.6682091951370239, + "learning_rate": 0.0006712172923777019, + "loss": 1.0117, + "step": 1449 + }, + { + "epoch": 1.6496018202502845, + "grad_norm": 0.9012134671211243, + "learning_rate": 0.0006709897610921501, + "loss": 1.6618, + "step": 1450 + }, + { + "epoch": 1.6507394766780432, + "grad_norm": 0.7726583480834961, + "learning_rate": 0.0006707622298065984, + "loss": 1.4585, + "step": 1451 + }, + { + "epoch": 1.6518771331058022, + "grad_norm": 1.0777757167816162, + "learning_rate": 0.0006705346985210466, + "loss": 2.1442, + "step": 1452 + }, + { + "epoch": 1.653014789533561, + "grad_norm": 1.3284507989883423, + "learning_rate": 0.0006703071672354949, + "loss": 1.9266, + "step": 1453 + }, + { + "epoch": 1.6541524459613197, + "grad_norm": 1.139455795288086, + "learning_rate": 0.0006700796359499432, + "loss": 2.3044, + "step": 1454 + }, + { + "epoch": 1.6552901023890785, + "grad_norm": 0.736585795879364, + "learning_rate": 0.0006698521046643914, + "loss": 1.3312, + "step": 1455 + }, + { + "epoch": 1.6564277588168372, + "grad_norm": 1.6546106338500977, + "learning_rate": 0.0006696245733788396, + "loss": 1.9867, + "step": 1456 + }, + { + "epoch": 1.657565415244596, + "grad_norm": 0.872257649898529, + "learning_rate": 0.0006693970420932879, + "loss": 2.0105, + "step": 1457 + }, + { + "epoch": 1.658703071672355, + "grad_norm": 0.9059979915618896, + "learning_rate": 0.000669169510807736, + "loss": 2.0219, + "step": 1458 + }, + { + "epoch": 1.6598407281001137, + "grad_norm": 0.6183615326881409, + "learning_rate": 0.0006689419795221842, + "loss": 1.2641, + "step": 1459 + }, + { + "epoch": 1.6609783845278727, + "grad_norm": 0.7358295917510986, + "learning_rate": 0.0006687144482366325, + "loss": 1.3596, + "step": 1460 + }, + { + "epoch": 1.6621160409556315, + "grad_norm": 0.8297770023345947, + "learning_rate": 0.0006684869169510807, + "loss": 1.5614, + "step": 1461 + }, + { + "epoch": 1.6632536973833902, + "grad_norm": 0.6983165144920349, + "learning_rate": 0.0006682593856655291, + "loss": 1.1403, + "step": 1462 + }, + { + "epoch": 1.664391353811149, + "grad_norm": 1.0305124521255493, + "learning_rate": 0.0006680318543799773, + "loss": 2.0436, + "step": 1463 + }, + { + "epoch": 1.6655290102389078, + "grad_norm": 1.53620183467865, + "learning_rate": 0.0006678043230944255, + "loss": 2.6743, + "step": 1464 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.9701448678970337, + "learning_rate": 0.0006675767918088738, + "loss": 2.7797, + "step": 1465 + }, + { + "epoch": 1.6678043230944255, + "grad_norm": 1.1551249027252197, + "learning_rate": 0.000667349260523322, + "loss": 1.9898, + "step": 1466 + }, + { + "epoch": 1.6689419795221843, + "grad_norm": 1.8078079223632812, + "learning_rate": 0.0006671217292377702, + "loss": 3.4769, + "step": 1467 + }, + { + "epoch": 1.6700796359499432, + "grad_norm": 0.9920907020568848, + "learning_rate": 0.0006668941979522184, + "loss": 1.5748, + "step": 1468 + }, + { + "epoch": 1.671217292377702, + "grad_norm": 1.0543971061706543, + "learning_rate": 0.0006666666666666666, + "loss": 2.2308, + "step": 1469 + }, + { + "epoch": 1.6723549488054608, + "grad_norm": 1.3774967193603516, + "learning_rate": 0.0006664391353811149, + "loss": 2.9355, + "step": 1470 + }, + { + "epoch": 1.6734926052332195, + "grad_norm": 0.819430947303772, + "learning_rate": 0.0006662116040955632, + "loss": 1.8651, + "step": 1471 + }, + { + "epoch": 1.6746302616609783, + "grad_norm": 0.9009912014007568, + "learning_rate": 0.0006659840728100114, + "loss": 1.771, + "step": 1472 + }, + { + "epoch": 1.675767918088737, + "grad_norm": 0.9403852224349976, + "learning_rate": 0.0006657565415244596, + "loss": 1.9841, + "step": 1473 + }, + { + "epoch": 1.676905574516496, + "grad_norm": 1.555391550064087, + "learning_rate": 0.0006655290102389079, + "loss": 2.3617, + "step": 1474 + }, + { + "epoch": 1.6780432309442548, + "grad_norm": 1.146911382675171, + "learning_rate": 0.0006653014789533561, + "loss": 2.3203, + "step": 1475 + }, + { + "epoch": 1.6791808873720138, + "grad_norm": 0.853667140007019, + "learning_rate": 0.0006650739476678043, + "loss": 1.2485, + "step": 1476 + }, + { + "epoch": 1.6803185437997725, + "grad_norm": 1.278773307800293, + "learning_rate": 0.0006648464163822526, + "loss": 2.0797, + "step": 1477 + }, + { + "epoch": 1.6814562002275313, + "grad_norm": 1.3791263103485107, + "learning_rate": 0.0006646188850967008, + "loss": 2.859, + "step": 1478 + }, + { + "epoch": 1.68259385665529, + "grad_norm": 0.9324190616607666, + "learning_rate": 0.000664391353811149, + "loss": 2.3866, + "step": 1479 + }, + { + "epoch": 1.6837315130830488, + "grad_norm": 1.0593537092208862, + "learning_rate": 0.0006641638225255973, + "loss": 1.6384, + "step": 1480 + }, + { + "epoch": 1.6848691695108078, + "grad_norm": 1.1699516773223877, + "learning_rate": 0.0006639362912400455, + "loss": 2.8342, + "step": 1481 + }, + { + "epoch": 1.6860068259385665, + "grad_norm": 0.7266961336135864, + "learning_rate": 0.0006637087599544938, + "loss": 1.4487, + "step": 1482 + }, + { + "epoch": 1.6871444823663255, + "grad_norm": 0.8684595227241516, + "learning_rate": 0.000663481228668942, + "loss": 1.6997, + "step": 1483 + }, + { + "epoch": 1.6882821387940843, + "grad_norm": 0.958516538143158, + "learning_rate": 0.0006632536973833902, + "loss": 1.7327, + "step": 1484 + }, + { + "epoch": 1.689419795221843, + "grad_norm": 1.0229347944259644, + "learning_rate": 0.0006630261660978385, + "loss": 3.2928, + "step": 1485 + }, + { + "epoch": 1.6905574516496018, + "grad_norm": 1.314428687095642, + "learning_rate": 0.0006627986348122868, + "loss": 1.7722, + "step": 1486 + }, + { + "epoch": 1.6916951080773606, + "grad_norm": 0.6298933625221252, + "learning_rate": 0.0006625711035267349, + "loss": 1.2597, + "step": 1487 + }, + { + "epoch": 1.6928327645051193, + "grad_norm": 0.8204244375228882, + "learning_rate": 0.0006623435722411832, + "loss": 1.4012, + "step": 1488 + }, + { + "epoch": 1.6939704209328783, + "grad_norm": 0.6752439737319946, + "learning_rate": 0.0006621160409556314, + "loss": 1.3025, + "step": 1489 + }, + { + "epoch": 1.695108077360637, + "grad_norm": 0.7786659002304077, + "learning_rate": 0.0006618885096700796, + "loss": 1.601, + "step": 1490 + }, + { + "epoch": 1.696245733788396, + "grad_norm": 0.9201928377151489, + "learning_rate": 0.0006616609783845279, + "loss": 2.8528, + "step": 1491 + }, + { + "epoch": 1.6973833902161548, + "grad_norm": 0.9727663993835449, + "learning_rate": 0.0006614334470989761, + "loss": 2.2852, + "step": 1492 + }, + { + "epoch": 1.6985210466439136, + "grad_norm": 1.015677571296692, + "learning_rate": 0.0006612059158134243, + "loss": 2.5611, + "step": 1493 + }, + { + "epoch": 1.6996587030716723, + "grad_norm": 0.6832883954048157, + "learning_rate": 0.0006609783845278727, + "loss": 0.8426, + "step": 1494 + }, + { + "epoch": 1.700796359499431, + "grad_norm": 1.015763759613037, + "learning_rate": 0.0006607508532423209, + "loss": 2.1979, + "step": 1495 + }, + { + "epoch": 1.7019340159271898, + "grad_norm": 1.0312163829803467, + "learning_rate": 0.0006605233219567691, + "loss": 1.6475, + "step": 1496 + }, + { + "epoch": 1.7030716723549488, + "grad_norm": 1.1301902532577515, + "learning_rate": 0.0006602957906712173, + "loss": 1.6602, + "step": 1497 + }, + { + "epoch": 1.7042093287827076, + "grad_norm": 1.635214924812317, + "learning_rate": 0.0006600682593856655, + "loss": 2.0394, + "step": 1498 + }, + { + "epoch": 1.7053469852104666, + "grad_norm": 1.2907978296279907, + "learning_rate": 0.0006598407281001137, + "loss": 2.3087, + "step": 1499 + }, + { + "epoch": 1.7064846416382253, + "grad_norm": 0.8886736035346985, + "learning_rate": 0.000659613196814562, + "loss": 2.1899, + "step": 1500 + }, + { + "epoch": 1.707622298065984, + "grad_norm": 0.7392778992652893, + "learning_rate": 0.0006593856655290102, + "loss": 1.1168, + "step": 1501 + }, + { + "epoch": 1.7087599544937428, + "grad_norm": 0.9279195070266724, + "learning_rate": 0.0006591581342434585, + "loss": 3.2731, + "step": 1502 + }, + { + "epoch": 1.7098976109215016, + "grad_norm": 0.7852345705032349, + "learning_rate": 0.0006589306029579068, + "loss": 1.6468, + "step": 1503 + }, + { + "epoch": 1.7110352673492604, + "grad_norm": 0.8137550354003906, + "learning_rate": 0.000658703071672355, + "loss": 1.7085, + "step": 1504 + }, + { + "epoch": 1.7121729237770194, + "grad_norm": 1.1284794807434082, + "learning_rate": 0.0006584755403868033, + "loss": 1.3615, + "step": 1505 + }, + { + "epoch": 1.713310580204778, + "grad_norm": 0.9071482419967651, + "learning_rate": 0.0006582480091012515, + "loss": 1.9902, + "step": 1506 + }, + { + "epoch": 1.714448236632537, + "grad_norm": 0.6611661314964294, + "learning_rate": 0.0006580204778156996, + "loss": 1.4362, + "step": 1507 + }, + { + "epoch": 1.7155858930602959, + "grad_norm": 0.6886146664619446, + "learning_rate": 0.0006577929465301479, + "loss": 1.5251, + "step": 1508 + }, + { + "epoch": 1.7167235494880546, + "grad_norm": 0.859805166721344, + "learning_rate": 0.0006575654152445961, + "loss": 1.6648, + "step": 1509 + }, + { + "epoch": 1.7178612059158134, + "grad_norm": 0.8424084782600403, + "learning_rate": 0.0006573378839590443, + "loss": 1.7239, + "step": 1510 + }, + { + "epoch": 1.7189988623435721, + "grad_norm": 0.5435293316841125, + "learning_rate": 0.0006571103526734927, + "loss": 1.1297, + "step": 1511 + }, + { + "epoch": 1.7201365187713311, + "grad_norm": 1.4670809507369995, + "learning_rate": 0.0006568828213879409, + "loss": 3.3636, + "step": 1512 + }, + { + "epoch": 1.7212741751990899, + "grad_norm": 0.8184078931808472, + "learning_rate": 0.0006566552901023891, + "loss": 1.8129, + "step": 1513 + }, + { + "epoch": 1.7224118316268489, + "grad_norm": 0.8720480799674988, + "learning_rate": 0.0006564277588168374, + "loss": 1.7082, + "step": 1514 + }, + { + "epoch": 1.7235494880546076, + "grad_norm": 1.2240320444107056, + "learning_rate": 0.0006562002275312856, + "loss": 1.6893, + "step": 1515 + }, + { + "epoch": 1.7246871444823664, + "grad_norm": 1.4103710651397705, + "learning_rate": 0.0006559726962457337, + "loss": 1.4781, + "step": 1516 + }, + { + "epoch": 1.7258248009101251, + "grad_norm": 0.7082839608192444, + "learning_rate": 0.000655745164960182, + "loss": 1.6487, + "step": 1517 + }, + { + "epoch": 1.726962457337884, + "grad_norm": 1.3092166185379028, + "learning_rate": 0.0006555176336746302, + "loss": 3.0383, + "step": 1518 + }, + { + "epoch": 1.7281001137656427, + "grad_norm": 0.6530581116676331, + "learning_rate": 0.0006552901023890784, + "loss": 1.1405, + "step": 1519 + }, + { + "epoch": 1.7292377701934016, + "grad_norm": 0.5259117484092712, + "learning_rate": 0.0006550625711035268, + "loss": 0.9486, + "step": 1520 + }, + { + "epoch": 1.7303754266211604, + "grad_norm": 0.8423599004745483, + "learning_rate": 0.000654835039817975, + "loss": 1.7263, + "step": 1521 + }, + { + "epoch": 1.7315130830489194, + "grad_norm": 0.9277529716491699, + "learning_rate": 0.0006546075085324233, + "loss": 1.3625, + "step": 1522 + }, + { + "epoch": 1.7326507394766781, + "grad_norm": 0.8496603965759277, + "learning_rate": 0.0006543799772468715, + "loss": 2.406, + "step": 1523 + }, + { + "epoch": 1.733788395904437, + "grad_norm": 1.1598669290542603, + "learning_rate": 0.0006541524459613197, + "loss": 2.4768, + "step": 1524 + }, + { + "epoch": 1.7349260523321957, + "grad_norm": 0.9473751783370972, + "learning_rate": 0.000653924914675768, + "loss": 2.243, + "step": 1525 + }, + { + "epoch": 1.7360637087599544, + "grad_norm": 0.8064972758293152, + "learning_rate": 0.0006536973833902161, + "loss": 1.6925, + "step": 1526 + }, + { + "epoch": 1.7372013651877132, + "grad_norm": 1.018776297569275, + "learning_rate": 0.0006534698521046643, + "loss": 1.6876, + "step": 1527 + }, + { + "epoch": 1.7383390216154722, + "grad_norm": 1.1636130809783936, + "learning_rate": 0.0006532423208191127, + "loss": 2.0573, + "step": 1528 + }, + { + "epoch": 1.739476678043231, + "grad_norm": 0.890720784664154, + "learning_rate": 0.0006530147895335609, + "loss": 0.8197, + "step": 1529 + }, + { + "epoch": 1.74061433447099, + "grad_norm": 0.8947886228561401, + "learning_rate": 0.0006527872582480091, + "loss": 1.5267, + "step": 1530 + }, + { + "epoch": 1.7417519908987487, + "grad_norm": 1.0752261877059937, + "learning_rate": 0.0006525597269624574, + "loss": 2.324, + "step": 1531 + }, + { + "epoch": 1.7428896473265074, + "grad_norm": 1.107813835144043, + "learning_rate": 0.0006523321956769056, + "loss": 2.1986, + "step": 1532 + }, + { + "epoch": 1.7440273037542662, + "grad_norm": 0.8949710726737976, + "learning_rate": 0.0006521046643913538, + "loss": 1.3886, + "step": 1533 + }, + { + "epoch": 1.745164960182025, + "grad_norm": 1.6787731647491455, + "learning_rate": 0.0006518771331058021, + "loss": 3.7587, + "step": 1534 + }, + { + "epoch": 1.7463026166097837, + "grad_norm": 1.2499511241912842, + "learning_rate": 0.0006516496018202503, + "loss": 2.327, + "step": 1535 + }, + { + "epoch": 1.7474402730375427, + "grad_norm": 0.7595018148422241, + "learning_rate": 0.0006514220705346984, + "loss": 1.3373, + "step": 1536 + }, + { + "epoch": 1.7485779294653014, + "grad_norm": 1.2214561700820923, + "learning_rate": 0.0006511945392491468, + "loss": 2.4152, + "step": 1537 + }, + { + "epoch": 1.7497155858930604, + "grad_norm": 0.727344274520874, + "learning_rate": 0.000650967007963595, + "loss": 1.4908, + "step": 1538 + }, + { + "epoch": 1.7508532423208192, + "grad_norm": 0.9511622786521912, + "learning_rate": 0.0006507394766780432, + "loss": 1.6895, + "step": 1539 + }, + { + "epoch": 1.751990898748578, + "grad_norm": 0.8098679184913635, + "learning_rate": 0.0006505119453924915, + "loss": 1.125, + "step": 1540 + }, + { + "epoch": 1.7531285551763367, + "grad_norm": 0.9225071668624878, + "learning_rate": 0.0006502844141069397, + "loss": 1.8387, + "step": 1541 + }, + { + "epoch": 1.7542662116040955, + "grad_norm": 0.999563992023468, + "learning_rate": 0.000650056882821388, + "loss": 2.7922, + "step": 1542 + }, + { + "epoch": 1.7554038680318542, + "grad_norm": 0.722228467464447, + "learning_rate": 0.0006498293515358362, + "loss": 1.7187, + "step": 1543 + }, + { + "epoch": 1.7565415244596132, + "grad_norm": 0.6238258481025696, + "learning_rate": 0.0006496018202502844, + "loss": 0.8305, + "step": 1544 + }, + { + "epoch": 1.757679180887372, + "grad_norm": 1.207566499710083, + "learning_rate": 0.0006493742889647328, + "loss": 2.9552, + "step": 1545 + }, + { + "epoch": 1.758816837315131, + "grad_norm": 1.0733917951583862, + "learning_rate": 0.0006491467576791809, + "loss": 1.9941, + "step": 1546 + }, + { + "epoch": 1.7599544937428897, + "grad_norm": 0.8945872783660889, + "learning_rate": 0.0006489192263936291, + "loss": 1.6187, + "step": 1547 + }, + { + "epoch": 1.7610921501706485, + "grad_norm": 1.0715452432632446, + "learning_rate": 0.0006486916951080774, + "loss": 3.1526, + "step": 1548 + }, + { + "epoch": 1.7622298065984072, + "grad_norm": 0.7034804821014404, + "learning_rate": 0.0006484641638225256, + "loss": 1.6216, + "step": 1549 + }, + { + "epoch": 1.763367463026166, + "grad_norm": 1.182071328163147, + "learning_rate": 0.0006482366325369738, + "loss": 3.9126, + "step": 1550 + }, + { + "epoch": 1.764505119453925, + "grad_norm": 1.0914125442504883, + "learning_rate": 0.0006480091012514221, + "loss": 1.7731, + "step": 1551 + }, + { + "epoch": 1.7656427758816837, + "grad_norm": 0.7337900996208191, + "learning_rate": 0.0006477815699658703, + "loss": 0.7843, + "step": 1552 + }, + { + "epoch": 1.7667804323094427, + "grad_norm": 0.9463424682617188, + "learning_rate": 0.0006475540386803186, + "loss": 2.3052, + "step": 1553 + }, + { + "epoch": 1.7679180887372015, + "grad_norm": 0.798876166343689, + "learning_rate": 0.0006473265073947669, + "loss": 1.7428, + "step": 1554 + }, + { + "epoch": 1.7690557451649602, + "grad_norm": 1.0317094326019287, + "learning_rate": 0.000647098976109215, + "loss": 1.5478, + "step": 1555 + }, + { + "epoch": 1.770193401592719, + "grad_norm": 1.0436172485351562, + "learning_rate": 0.0006468714448236632, + "loss": 2.3285, + "step": 1556 + }, + { + "epoch": 1.7713310580204777, + "grad_norm": 1.389779806137085, + "learning_rate": 0.0006466439135381115, + "loss": 3.1217, + "step": 1557 + }, + { + "epoch": 1.7724687144482365, + "grad_norm": 0.9066593050956726, + "learning_rate": 0.0006464163822525597, + "loss": 1.5202, + "step": 1558 + }, + { + "epoch": 1.7736063708759955, + "grad_norm": 0.7045938968658447, + "learning_rate": 0.0006461888509670079, + "loss": 2.0027, + "step": 1559 + }, + { + "epoch": 1.7747440273037542, + "grad_norm": 1.222909688949585, + "learning_rate": 0.0006459613196814562, + "loss": 2.5047, + "step": 1560 + }, + { + "epoch": 1.7758816837315132, + "grad_norm": 0.9253849983215332, + "learning_rate": 0.0006457337883959044, + "loss": 1.8513, + "step": 1561 + }, + { + "epoch": 1.777019340159272, + "grad_norm": 1.5743485689163208, + "learning_rate": 0.0006455062571103528, + "loss": 3.3089, + "step": 1562 + }, + { + "epoch": 1.7781569965870307, + "grad_norm": 1.3808788061141968, + "learning_rate": 0.000645278725824801, + "loss": 2.1207, + "step": 1563 + }, + { + "epoch": 1.7792946530147895, + "grad_norm": 0.5420240163803101, + "learning_rate": 0.0006450511945392492, + "loss": 0.9326, + "step": 1564 + }, + { + "epoch": 1.7804323094425483, + "grad_norm": 1.006123661994934, + "learning_rate": 0.0006448236632536974, + "loss": 1.3682, + "step": 1565 + }, + { + "epoch": 1.781569965870307, + "grad_norm": 1.0188605785369873, + "learning_rate": 0.0006445961319681456, + "loss": 3.1302, + "step": 1566 + }, + { + "epoch": 1.782707622298066, + "grad_norm": 1.3520559072494507, + "learning_rate": 0.0006443686006825938, + "loss": 1.9661, + "step": 1567 + }, + { + "epoch": 1.7838452787258248, + "grad_norm": 1.2583212852478027, + "learning_rate": 0.0006441410693970421, + "loss": 1.7267, + "step": 1568 + }, + { + "epoch": 1.7849829351535837, + "grad_norm": 0.6001319885253906, + "learning_rate": 0.0006439135381114903, + "loss": 1.2674, + "step": 1569 + }, + { + "epoch": 1.7861205915813425, + "grad_norm": 0.5909472703933716, + "learning_rate": 0.0006436860068259386, + "loss": 1.1398, + "step": 1570 + }, + { + "epoch": 1.7872582480091013, + "grad_norm": 1.0126385688781738, + "learning_rate": 0.0006434584755403869, + "loss": 1.4477, + "step": 1571 + }, + { + "epoch": 1.78839590443686, + "grad_norm": 0.760152280330658, + "learning_rate": 0.0006432309442548351, + "loss": 1.0436, + "step": 1572 + }, + { + "epoch": 1.7895335608646188, + "grad_norm": 1.2938870191574097, + "learning_rate": 0.0006430034129692833, + "loss": 2.4826, + "step": 1573 + }, + { + "epoch": 1.7906712172923775, + "grad_norm": 0.8041395545005798, + "learning_rate": 0.0006427758816837316, + "loss": 1.7244, + "step": 1574 + }, + { + "epoch": 1.7918088737201365, + "grad_norm": 0.9691146016120911, + "learning_rate": 0.0006425483503981797, + "loss": 1.7926, + "step": 1575 + }, + { + "epoch": 1.7929465301478953, + "grad_norm": 1.1277107000350952, + "learning_rate": 0.0006423208191126279, + "loss": 2.1719, + "step": 1576 + }, + { + "epoch": 1.7940841865756543, + "grad_norm": 0.8621721267700195, + "learning_rate": 0.0006420932878270762, + "loss": 2.341, + "step": 1577 + }, + { + "epoch": 1.795221843003413, + "grad_norm": 1.3702713251113892, + "learning_rate": 0.0006418657565415244, + "loss": 2.7296, + "step": 1578 + }, + { + "epoch": 1.7963594994311718, + "grad_norm": 0.6883115768432617, + "learning_rate": 0.0006416382252559727, + "loss": 1.762, + "step": 1579 + }, + { + "epoch": 1.7974971558589306, + "grad_norm": 1.0266193151474, + "learning_rate": 0.000641410693970421, + "loss": 2.1009, + "step": 1580 + }, + { + "epoch": 1.7986348122866893, + "grad_norm": 0.908869743347168, + "learning_rate": 0.0006411831626848692, + "loss": 1.7309, + "step": 1581 + }, + { + "epoch": 1.799772468714448, + "grad_norm": 0.8608745336532593, + "learning_rate": 0.0006409556313993174, + "loss": 1.7538, + "step": 1582 + }, + { + "epoch": 1.800910125142207, + "grad_norm": 1.5852781534194946, + "learning_rate": 0.0006407281001137657, + "loss": 2.7645, + "step": 1583 + }, + { + "epoch": 1.802047781569966, + "grad_norm": 0.8057767152786255, + "learning_rate": 0.0006405005688282139, + "loss": 1.7418, + "step": 1584 + }, + { + "epoch": 1.8031854379977248, + "grad_norm": 1.129563570022583, + "learning_rate": 0.0006402730375426621, + "loss": 2.454, + "step": 1585 + }, + { + "epoch": 1.8043230944254836, + "grad_norm": 0.9921088218688965, + "learning_rate": 0.0006400455062571103, + "loss": 1.8042, + "step": 1586 + }, + { + "epoch": 1.8054607508532423, + "grad_norm": 1.2487443685531616, + "learning_rate": 0.0006398179749715586, + "loss": 2.1645, + "step": 1587 + }, + { + "epoch": 1.806598407281001, + "grad_norm": 1.0499515533447266, + "learning_rate": 0.0006395904436860069, + "loss": 2.1733, + "step": 1588 + }, + { + "epoch": 1.8077360637087598, + "grad_norm": 1.092044472694397, + "learning_rate": 0.0006393629124004551, + "loss": 2.5157, + "step": 1589 + }, + { + "epoch": 1.8088737201365188, + "grad_norm": 0.9797663688659668, + "learning_rate": 0.0006391353811149033, + "loss": 1.3745, + "step": 1590 + }, + { + "epoch": 1.8100113765642776, + "grad_norm": 1.0946043729782104, + "learning_rate": 0.0006389078498293516, + "loss": 2.1346, + "step": 1591 + }, + { + "epoch": 1.8111490329920366, + "grad_norm": 1.3804898262023926, + "learning_rate": 0.0006386803185437998, + "loss": 2.088, + "step": 1592 + }, + { + "epoch": 1.8122866894197953, + "grad_norm": 0.9063937067985535, + "learning_rate": 0.000638452787258248, + "loss": 2.0188, + "step": 1593 + }, + { + "epoch": 1.813424345847554, + "grad_norm": 0.7004676461219788, + "learning_rate": 0.0006382252559726962, + "loss": 1.5326, + "step": 1594 + }, + { + "epoch": 1.8145620022753128, + "grad_norm": 0.7816855907440186, + "learning_rate": 0.0006379977246871445, + "loss": 1.5907, + "step": 1595 + }, + { + "epoch": 1.8156996587030716, + "grad_norm": 1.2419103384017944, + "learning_rate": 0.0006377701934015927, + "loss": 2.683, + "step": 1596 + }, + { + "epoch": 1.8168373151308304, + "grad_norm": 1.7206608057022095, + "learning_rate": 0.000637542662116041, + "loss": 3.0181, + "step": 1597 + }, + { + "epoch": 1.8179749715585893, + "grad_norm": 0.7101848125457764, + "learning_rate": 0.0006373151308304892, + "loss": 2.2839, + "step": 1598 + }, + { + "epoch": 1.819112627986348, + "grad_norm": 0.9703013896942139, + "learning_rate": 0.0006370875995449374, + "loss": 1.6638, + "step": 1599 + }, + { + "epoch": 1.820250284414107, + "grad_norm": 0.7560202479362488, + "learning_rate": 0.0006368600682593857, + "loss": 1.285, + "step": 1600 + }, + { + "epoch": 1.8213879408418658, + "grad_norm": 0.8459346294403076, + "learning_rate": 0.0006366325369738339, + "loss": 2.0187, + "step": 1601 + }, + { + "epoch": 1.8225255972696246, + "grad_norm": 1.0976618528366089, + "learning_rate": 0.0006364050056882821, + "loss": 2.2865, + "step": 1602 + }, + { + "epoch": 1.8236632536973834, + "grad_norm": 0.7999181747436523, + "learning_rate": 0.0006361774744027305, + "loss": 2.0923, + "step": 1603 + }, + { + "epoch": 1.8248009101251421, + "grad_norm": 0.6172679662704468, + "learning_rate": 0.0006359499431171786, + "loss": 0.9479, + "step": 1604 + }, + { + "epoch": 1.8259385665529009, + "grad_norm": 0.8358675241470337, + "learning_rate": 0.0006357224118316269, + "loss": 2.4008, + "step": 1605 + }, + { + "epoch": 1.8270762229806599, + "grad_norm": 0.7997340559959412, + "learning_rate": 0.0006354948805460751, + "loss": 2.0383, + "step": 1606 + }, + { + "epoch": 1.8282138794084186, + "grad_norm": 1.1405185461044312, + "learning_rate": 0.0006352673492605233, + "loss": 1.925, + "step": 1607 + }, + { + "epoch": 1.8293515358361776, + "grad_norm": 0.7813712358474731, + "learning_rate": 0.0006350398179749716, + "loss": 1.7278, + "step": 1608 + }, + { + "epoch": 1.8304891922639364, + "grad_norm": 1.3376038074493408, + "learning_rate": 0.0006348122866894198, + "loss": 2.3325, + "step": 1609 + }, + { + "epoch": 1.8316268486916951, + "grad_norm": 1.1971509456634521, + "learning_rate": 0.000634584755403868, + "loss": 2.1967, + "step": 1610 + }, + { + "epoch": 1.8327645051194539, + "grad_norm": 1.8342376947402954, + "learning_rate": 0.0006343572241183164, + "loss": 3.0377, + "step": 1611 + }, + { + "epoch": 1.8339021615472126, + "grad_norm": 0.983214795589447, + "learning_rate": 0.0006341296928327646, + "loss": 1.5361, + "step": 1612 + }, + { + "epoch": 1.8350398179749714, + "grad_norm": 0.7843421697616577, + "learning_rate": 0.0006339021615472128, + "loss": 1.4898, + "step": 1613 + }, + { + "epoch": 1.8361774744027304, + "grad_norm": 0.8079589605331421, + "learning_rate": 0.000633674630261661, + "loss": 1.42, + "step": 1614 + }, + { + "epoch": 1.8373151308304891, + "grad_norm": 0.9361008405685425, + "learning_rate": 0.0006334470989761092, + "loss": 2.3352, + "step": 1615 + }, + { + "epoch": 1.8384527872582481, + "grad_norm": 0.8050186038017273, + "learning_rate": 0.0006332195676905574, + "loss": 1.2832, + "step": 1616 + }, + { + "epoch": 1.8395904436860069, + "grad_norm": 0.6007594466209412, + "learning_rate": 0.0006329920364050057, + "loss": 0.7322, + "step": 1617 + }, + { + "epoch": 1.8407281001137656, + "grad_norm": 1.09076988697052, + "learning_rate": 0.0006327645051194539, + "loss": 3.1086, + "step": 1618 + }, + { + "epoch": 1.8418657565415244, + "grad_norm": 1.1623958349227905, + "learning_rate": 0.0006325369738339021, + "loss": 2.5158, + "step": 1619 + }, + { + "epoch": 1.8430034129692832, + "grad_norm": 0.967048704624176, + "learning_rate": 0.0006323094425483505, + "loss": 1.5403, + "step": 1620 + }, + { + "epoch": 1.8441410693970421, + "grad_norm": 0.7205291390419006, + "learning_rate": 0.0006320819112627987, + "loss": 0.9221, + "step": 1621 + }, + { + "epoch": 1.845278725824801, + "grad_norm": 0.8706921339035034, + "learning_rate": 0.0006318543799772469, + "loss": 2.1171, + "step": 1622 + }, + { + "epoch": 1.8464163822525599, + "grad_norm": 0.8545218110084534, + "learning_rate": 0.0006316268486916951, + "loss": 1.3462, + "step": 1623 + }, + { + "epoch": 1.8475540386803186, + "grad_norm": 1.0250085592269897, + "learning_rate": 0.0006313993174061433, + "loss": 1.8658, + "step": 1624 + }, + { + "epoch": 1.8486916951080774, + "grad_norm": 0.8345952033996582, + "learning_rate": 0.0006311717861205916, + "loss": 2.4092, + "step": 1625 + }, + { + "epoch": 1.8498293515358362, + "grad_norm": 0.9249033331871033, + "learning_rate": 0.0006309442548350398, + "loss": 1.7311, + "step": 1626 + }, + { + "epoch": 1.850967007963595, + "grad_norm": 1.0323681831359863, + "learning_rate": 0.000630716723549488, + "loss": 2.3136, + "step": 1627 + }, + { + "epoch": 1.8521046643913537, + "grad_norm": 0.6797596216201782, + "learning_rate": 0.0006304891922639364, + "loss": 1.4115, + "step": 1628 + }, + { + "epoch": 1.8532423208191127, + "grad_norm": 0.6315971612930298, + "learning_rate": 0.0006302616609783846, + "loss": 1.6303, + "step": 1629 + }, + { + "epoch": 1.8543799772468714, + "grad_norm": 1.357260823249817, + "learning_rate": 0.0006300341296928328, + "loss": 3.4739, + "step": 1630 + }, + { + "epoch": 1.8555176336746304, + "grad_norm": 0.8246727585792542, + "learning_rate": 0.0006298065984072811, + "loss": 1.4725, + "step": 1631 + }, + { + "epoch": 1.8566552901023892, + "grad_norm": 1.0035685300827026, + "learning_rate": 0.0006295790671217293, + "loss": 1.9371, + "step": 1632 + }, + { + "epoch": 1.857792946530148, + "grad_norm": 0.8692108392715454, + "learning_rate": 0.0006293515358361774, + "loss": 1.5302, + "step": 1633 + }, + { + "epoch": 1.8589306029579067, + "grad_norm": 0.7576583623886108, + "learning_rate": 0.0006291240045506257, + "loss": 1.3123, + "step": 1634 + }, + { + "epoch": 1.8600682593856654, + "grad_norm": 1.0412949323654175, + "learning_rate": 0.0006288964732650739, + "loss": 1.8658, + "step": 1635 + }, + { + "epoch": 1.8612059158134242, + "grad_norm": 0.6774725317955017, + "learning_rate": 0.0006286689419795221, + "loss": 1.499, + "step": 1636 + }, + { + "epoch": 1.8623435722411832, + "grad_norm": 0.7042348980903625, + "learning_rate": 0.0006284414106939705, + "loss": 1.423, + "step": 1637 + }, + { + "epoch": 1.863481228668942, + "grad_norm": 0.8518065810203552, + "learning_rate": 0.0006282138794084187, + "loss": 1.1174, + "step": 1638 + }, + { + "epoch": 1.864618885096701, + "grad_norm": 1.1529499292373657, + "learning_rate": 0.0006279863481228669, + "loss": 2.3384, + "step": 1639 + }, + { + "epoch": 1.8657565415244597, + "grad_norm": 0.8600414991378784, + "learning_rate": 0.0006277588168373152, + "loss": 1.7475, + "step": 1640 + }, + { + "epoch": 1.8668941979522184, + "grad_norm": 1.1277893781661987, + "learning_rate": 0.0006275312855517634, + "loss": 3.2803, + "step": 1641 + }, + { + "epoch": 1.8680318543799772, + "grad_norm": 1.1876293420791626, + "learning_rate": 0.0006273037542662116, + "loss": 2.062, + "step": 1642 + }, + { + "epoch": 1.869169510807736, + "grad_norm": 0.8083938360214233, + "learning_rate": 0.0006270762229806598, + "loss": 1.4945, + "step": 1643 + }, + { + "epoch": 1.8703071672354947, + "grad_norm": 1.136566400527954, + "learning_rate": 0.000626848691695108, + "loss": 2.2583, + "step": 1644 + }, + { + "epoch": 1.8714448236632537, + "grad_norm": 0.8216238617897034, + "learning_rate": 0.0006266211604095564, + "loss": 1.1567, + "step": 1645 + }, + { + "epoch": 1.8725824800910125, + "grad_norm": 0.8726761341094971, + "learning_rate": 0.0006263936291240046, + "loss": 1.4604, + "step": 1646 + }, + { + "epoch": 1.8737201365187715, + "grad_norm": 0.7124577164649963, + "learning_rate": 0.0006261660978384528, + "loss": 1.8203, + "step": 1647 + }, + { + "epoch": 1.8748577929465302, + "grad_norm": 0.7909445762634277, + "learning_rate": 0.0006259385665529011, + "loss": 1.5815, + "step": 1648 + }, + { + "epoch": 1.875995449374289, + "grad_norm": 1.0055829286575317, + "learning_rate": 0.0006257110352673493, + "loss": 1.5233, + "step": 1649 + }, + { + "epoch": 1.8771331058020477, + "grad_norm": 0.6920607089996338, + "learning_rate": 0.0006254835039817975, + "loss": 0.8769, + "step": 1650 + }, + { + "epoch": 1.8782707622298065, + "grad_norm": 0.8368102312088013, + "learning_rate": 0.0006252559726962458, + "loss": 1.9653, + "step": 1651 + }, + { + "epoch": 1.8794084186575652, + "grad_norm": 1.1881768703460693, + "learning_rate": 0.000625028441410694, + "loss": 2.2672, + "step": 1652 + }, + { + "epoch": 1.8805460750853242, + "grad_norm": 0.7409395575523376, + "learning_rate": 0.0006248009101251421, + "loss": 1.6343, + "step": 1653 + }, + { + "epoch": 1.8816837315130832, + "grad_norm": 1.2841352224349976, + "learning_rate": 0.0006245733788395905, + "loss": 1.9493, + "step": 1654 + }, + { + "epoch": 1.882821387940842, + "grad_norm": 0.8569640517234802, + "learning_rate": 0.0006243458475540387, + "loss": 1.9457, + "step": 1655 + }, + { + "epoch": 1.8839590443686007, + "grad_norm": 1.32857346534729, + "learning_rate": 0.0006241183162684869, + "loss": 1.7589, + "step": 1656 + }, + { + "epoch": 1.8850967007963595, + "grad_norm": 0.8553183674812317, + "learning_rate": 0.0006238907849829352, + "loss": 1.2652, + "step": 1657 + }, + { + "epoch": 1.8862343572241183, + "grad_norm": 0.9133054614067078, + "learning_rate": 0.0006236632536973834, + "loss": 1.5618, + "step": 1658 + }, + { + "epoch": 1.887372013651877, + "grad_norm": 1.1053630113601685, + "learning_rate": 0.0006234357224118316, + "loss": 1.4316, + "step": 1659 + }, + { + "epoch": 1.888509670079636, + "grad_norm": 1.3917018175125122, + "learning_rate": 0.0006232081911262799, + "loss": 3.7291, + "step": 1660 + }, + { + "epoch": 1.8896473265073948, + "grad_norm": 0.6863465905189514, + "learning_rate": 0.0006229806598407281, + "loss": 1.2208, + "step": 1661 + }, + { + "epoch": 1.8907849829351537, + "grad_norm": 1.6601629257202148, + "learning_rate": 0.0006227531285551762, + "loss": 3.3405, + "step": 1662 + }, + { + "epoch": 1.8919226393629125, + "grad_norm": 0.6873295903205872, + "learning_rate": 0.0006225255972696246, + "loss": 1.66, + "step": 1663 + }, + { + "epoch": 1.8930602957906713, + "grad_norm": 1.2501400709152222, + "learning_rate": 0.0006222980659840728, + "loss": 2.2344, + "step": 1664 + }, + { + "epoch": 1.89419795221843, + "grad_norm": 1.2724485397338867, + "learning_rate": 0.000622070534698521, + "loss": 1.8736, + "step": 1665 + }, + { + "epoch": 1.8953356086461888, + "grad_norm": 1.0667139291763306, + "learning_rate": 0.0006218430034129693, + "loss": 3.2257, + "step": 1666 + }, + { + "epoch": 1.8964732650739475, + "grad_norm": 0.7864385843276978, + "learning_rate": 0.0006216154721274175, + "loss": 1.8004, + "step": 1667 + }, + { + "epoch": 1.8976109215017065, + "grad_norm": 0.8352164030075073, + "learning_rate": 0.0006213879408418658, + "loss": 1.4674, + "step": 1668 + }, + { + "epoch": 1.8987485779294653, + "grad_norm": 0.8850026726722717, + "learning_rate": 0.000621160409556314, + "loss": 1.7806, + "step": 1669 + }, + { + "epoch": 1.8998862343572243, + "grad_norm": 0.7337632775306702, + "learning_rate": 0.0006209328782707623, + "loss": 1.2096, + "step": 1670 + }, + { + "epoch": 1.901023890784983, + "grad_norm": 0.9589748978614807, + "learning_rate": 0.0006207053469852106, + "loss": 2.1104, + "step": 1671 + }, + { + "epoch": 1.9021615472127418, + "grad_norm": 1.047741174697876, + "learning_rate": 0.0006204778156996587, + "loss": 2.5989, + "step": 1672 + }, + { + "epoch": 1.9032992036405005, + "grad_norm": 1.0804181098937988, + "learning_rate": 0.0006202502844141069, + "loss": 1.5729, + "step": 1673 + }, + { + "epoch": 1.9044368600682593, + "grad_norm": 0.99363774061203, + "learning_rate": 0.0006200227531285552, + "loss": 2.5205, + "step": 1674 + }, + { + "epoch": 1.905574516496018, + "grad_norm": 1.1013110876083374, + "learning_rate": 0.0006197952218430034, + "loss": 1.4929, + "step": 1675 + }, + { + "epoch": 1.906712172923777, + "grad_norm": 1.1436536312103271, + "learning_rate": 0.0006195676905574516, + "loss": 1.503, + "step": 1676 + }, + { + "epoch": 1.9078498293515358, + "grad_norm": 0.9888268113136292, + "learning_rate": 0.0006193401592718999, + "loss": 1.8105, + "step": 1677 + }, + { + "epoch": 1.9089874857792948, + "grad_norm": 0.5987884998321533, + "learning_rate": 0.0006191126279863481, + "loss": 0.6153, + "step": 1678 + }, + { + "epoch": 1.9101251422070535, + "grad_norm": 1.2569278478622437, + "learning_rate": 0.0006188850967007964, + "loss": 2.58, + "step": 1679 + }, + { + "epoch": 1.9112627986348123, + "grad_norm": 1.7216945886611938, + "learning_rate": 0.0006186575654152447, + "loss": 3.3363, + "step": 1680 + }, + { + "epoch": 1.912400455062571, + "grad_norm": 0.8490168452262878, + "learning_rate": 0.0006184300341296929, + "loss": 1.6139, + "step": 1681 + }, + { + "epoch": 1.9135381114903298, + "grad_norm": 0.8663358688354492, + "learning_rate": 0.000618202502844141, + "loss": 1.0861, + "step": 1682 + }, + { + "epoch": 1.9146757679180886, + "grad_norm": 1.2262353897094727, + "learning_rate": 0.0006179749715585893, + "loss": 2.4656, + "step": 1683 + }, + { + "epoch": 1.9158134243458476, + "grad_norm": 0.8106787800788879, + "learning_rate": 0.0006177474402730375, + "loss": 0.7479, + "step": 1684 + }, + { + "epoch": 1.9169510807736063, + "grad_norm": 0.8843865394592285, + "learning_rate": 0.0006175199089874857, + "loss": 1.8057, + "step": 1685 + }, + { + "epoch": 1.9180887372013653, + "grad_norm": 1.0910826921463013, + "learning_rate": 0.000617292377701934, + "loss": 2.1269, + "step": 1686 + }, + { + "epoch": 1.919226393629124, + "grad_norm": 0.8909353613853455, + "learning_rate": 0.0006170648464163823, + "loss": 1.4374, + "step": 1687 + }, + { + "epoch": 1.9203640500568828, + "grad_norm": 0.838852047920227, + "learning_rate": 0.0006168373151308306, + "loss": 1.455, + "step": 1688 + }, + { + "epoch": 1.9215017064846416, + "grad_norm": 1.186137080192566, + "learning_rate": 0.0006166097838452788, + "loss": 2.9913, + "step": 1689 + }, + { + "epoch": 1.9226393629124003, + "grad_norm": 0.947559654712677, + "learning_rate": 0.000616382252559727, + "loss": 2.5864, + "step": 1690 + }, + { + "epoch": 1.9237770193401593, + "grad_norm": 1.1154701709747314, + "learning_rate": 0.0006161547212741753, + "loss": 1.7531, + "step": 1691 + }, + { + "epoch": 1.924914675767918, + "grad_norm": 0.9917047023773193, + "learning_rate": 0.0006159271899886234, + "loss": 1.847, + "step": 1692 + }, + { + "epoch": 1.926052332195677, + "grad_norm": 0.8470612168312073, + "learning_rate": 0.0006156996587030716, + "loss": 1.9226, + "step": 1693 + }, + { + "epoch": 1.9271899886234358, + "grad_norm": 1.0250667333602905, + "learning_rate": 0.0006154721274175199, + "loss": 1.3656, + "step": 1694 + }, + { + "epoch": 1.9283276450511946, + "grad_norm": 1.122154951095581, + "learning_rate": 0.0006152445961319682, + "loss": 1.7061, + "step": 1695 + }, + { + "epoch": 1.9294653014789533, + "grad_norm": 0.7498093843460083, + "learning_rate": 0.0006150170648464164, + "loss": 1.86, + "step": 1696 + }, + { + "epoch": 1.930602957906712, + "grad_norm": 1.526132583618164, + "learning_rate": 0.0006147895335608647, + "loss": 1.9259, + "step": 1697 + }, + { + "epoch": 1.9317406143344709, + "grad_norm": 0.7942246794700623, + "learning_rate": 0.0006145620022753129, + "loss": 2.1063, + "step": 1698 + }, + { + "epoch": 1.9328782707622298, + "grad_norm": 0.7092151045799255, + "learning_rate": 0.0006143344709897611, + "loss": 1.3752, + "step": 1699 + }, + { + "epoch": 1.9340159271899886, + "grad_norm": 1.0814862251281738, + "learning_rate": 0.0006141069397042094, + "loss": 1.3951, + "step": 1700 + }, + { + "epoch": 1.9351535836177476, + "grad_norm": 0.8918007016181946, + "learning_rate": 0.0006138794084186575, + "loss": 1.4203, + "step": 1701 + }, + { + "epoch": 1.9362912400455063, + "grad_norm": 1.2497609853744507, + "learning_rate": 0.0006136518771331057, + "loss": 2.2451, + "step": 1702 + }, + { + "epoch": 1.937428896473265, + "grad_norm": 1.0966827869415283, + "learning_rate": 0.000613424345847554, + "loss": 2.617, + "step": 1703 + }, + { + "epoch": 1.9385665529010239, + "grad_norm": 0.6311344504356384, + "learning_rate": 0.0006131968145620023, + "loss": 0.9732, + "step": 1704 + }, + { + "epoch": 1.9397042093287826, + "grad_norm": 0.7735137939453125, + "learning_rate": 0.0006129692832764505, + "loss": 1.6126, + "step": 1705 + }, + { + "epoch": 1.9408418657565414, + "grad_norm": 0.8205373287200928, + "learning_rate": 0.0006127417519908988, + "loss": 0.8851, + "step": 1706 + }, + { + "epoch": 1.9419795221843004, + "grad_norm": 1.8471014499664307, + "learning_rate": 0.000612514220705347, + "loss": 3.0294, + "step": 1707 + }, + { + "epoch": 1.9431171786120591, + "grad_norm": 0.8841797113418579, + "learning_rate": 0.0006122866894197953, + "loss": 2.4339, + "step": 1708 + }, + { + "epoch": 1.944254835039818, + "grad_norm": 1.112204670906067, + "learning_rate": 0.0006120591581342435, + "loss": 3.4836, + "step": 1709 + }, + { + "epoch": 1.9453924914675769, + "grad_norm": 0.7821568250656128, + "learning_rate": 0.0006118316268486917, + "loss": 1.1741, + "step": 1710 + }, + { + "epoch": 1.9465301478953356, + "grad_norm": 0.9497551918029785, + "learning_rate": 0.0006116040955631399, + "loss": 1.5836, + "step": 1711 + }, + { + "epoch": 1.9476678043230944, + "grad_norm": 0.9662843942642212, + "learning_rate": 0.0006113765642775882, + "loss": 2.4538, + "step": 1712 + }, + { + "epoch": 1.9488054607508531, + "grad_norm": 0.8117369413375854, + "learning_rate": 0.0006111490329920364, + "loss": 2.783, + "step": 1713 + }, + { + "epoch": 1.949943117178612, + "grad_norm": 0.7812523245811462, + "learning_rate": 0.0006109215017064847, + "loss": 1.5216, + "step": 1714 + }, + { + "epoch": 1.9510807736063709, + "grad_norm": 0.7647354006767273, + "learning_rate": 0.0006106939704209329, + "loss": 1.8356, + "step": 1715 + }, + { + "epoch": 1.9522184300341296, + "grad_norm": 1.348557949066162, + "learning_rate": 0.0006104664391353811, + "loss": 2.3718, + "step": 1716 + }, + { + "epoch": 1.9533560864618886, + "grad_norm": 0.9883414506912231, + "learning_rate": 0.0006102389078498294, + "loss": 1.9512, + "step": 1717 + }, + { + "epoch": 1.9544937428896474, + "grad_norm": 1.112888216972351, + "learning_rate": 0.0006100113765642776, + "loss": 1.9945, + "step": 1718 + }, + { + "epoch": 1.9556313993174061, + "grad_norm": 1.1321780681610107, + "learning_rate": 0.0006097838452787258, + "loss": 1.7481, + "step": 1719 + }, + { + "epoch": 1.956769055745165, + "grad_norm": 1.408277153968811, + "learning_rate": 0.0006095563139931742, + "loss": 2.5749, + "step": 1720 + }, + { + "epoch": 1.9579067121729237, + "grad_norm": 0.9839853644371033, + "learning_rate": 0.0006093287827076223, + "loss": 1.2224, + "step": 1721 + }, + { + "epoch": 1.9590443686006824, + "grad_norm": 0.9178591966629028, + "learning_rate": 0.0006091012514220705, + "loss": 1.5087, + "step": 1722 + }, + { + "epoch": 1.9601820250284414, + "grad_norm": 1.2959963083267212, + "learning_rate": 0.0006088737201365188, + "loss": 3.2079, + "step": 1723 + }, + { + "epoch": 1.9613196814562004, + "grad_norm": 0.9269609451293945, + "learning_rate": 0.000608646188850967, + "loss": 1.5689, + "step": 1724 + }, + { + "epoch": 1.9624573378839592, + "grad_norm": 0.8281601071357727, + "learning_rate": 0.0006084186575654152, + "loss": 1.2933, + "step": 1725 + }, + { + "epoch": 1.963594994311718, + "grad_norm": 1.0037777423858643, + "learning_rate": 0.0006081911262798635, + "loss": 1.7339, + "step": 1726 + }, + { + "epoch": 1.9647326507394767, + "grad_norm": 1.0925790071487427, + "learning_rate": 0.0006079635949943117, + "loss": 2.8919, + "step": 1727 + }, + { + "epoch": 1.9658703071672354, + "grad_norm": 0.9971732497215271, + "learning_rate": 0.00060773606370876, + "loss": 2.1831, + "step": 1728 + }, + { + "epoch": 1.9670079635949942, + "grad_norm": 1.161362648010254, + "learning_rate": 0.0006075085324232083, + "loss": 3.2354, + "step": 1729 + }, + { + "epoch": 1.9681456200227532, + "grad_norm": 1.0122507810592651, + "learning_rate": 0.0006072810011376564, + "loss": 2.0288, + "step": 1730 + }, + { + "epoch": 1.969283276450512, + "grad_norm": 0.47560903429985046, + "learning_rate": 0.0006070534698521047, + "loss": 0.5176, + "step": 1731 + }, + { + "epoch": 1.970420932878271, + "grad_norm": 0.6199741363525391, + "learning_rate": 0.0006068259385665529, + "loss": 1.2013, + "step": 1732 + }, + { + "epoch": 1.9715585893060297, + "grad_norm": 1.1044777631759644, + "learning_rate": 0.0006065984072810011, + "loss": 1.7702, + "step": 1733 + }, + { + "epoch": 1.9726962457337884, + "grad_norm": 0.9448471069335938, + "learning_rate": 0.0006063708759954494, + "loss": 1.9484, + "step": 1734 + }, + { + "epoch": 1.9738339021615472, + "grad_norm": 0.9100543856620789, + "learning_rate": 0.0006061433447098976, + "loss": 2.0936, + "step": 1735 + }, + { + "epoch": 1.974971558589306, + "grad_norm": 0.6803526282310486, + "learning_rate": 0.0006059158134243458, + "loss": 1.1829, + "step": 1736 + }, + { + "epoch": 1.9761092150170647, + "grad_norm": 1.1807910203933716, + "learning_rate": 0.0006056882821387942, + "loss": 2.4637, + "step": 1737 + }, + { + "epoch": 1.9772468714448237, + "grad_norm": 0.9801465272903442, + "learning_rate": 0.0006054607508532424, + "loss": 1.9998, + "step": 1738 + }, + { + "epoch": 1.9783845278725825, + "grad_norm": 0.987866222858429, + "learning_rate": 0.0006052332195676906, + "loss": 1.2759, + "step": 1739 + }, + { + "epoch": 1.9795221843003414, + "grad_norm": 1.0122973918914795, + "learning_rate": 0.0006050056882821388, + "loss": 0.9901, + "step": 1740 + }, + { + "epoch": 1.9806598407281002, + "grad_norm": 0.9722836017608643, + "learning_rate": 0.000604778156996587, + "loss": 2.178, + "step": 1741 + }, + { + "epoch": 1.981797497155859, + "grad_norm": 0.9257890582084656, + "learning_rate": 0.0006045506257110352, + "loss": 2.1119, + "step": 1742 + }, + { + "epoch": 1.9829351535836177, + "grad_norm": 0.6407757997512817, + "learning_rate": 0.0006043230944254835, + "loss": 1.2141, + "step": 1743 + }, + { + "epoch": 1.9840728100113765, + "grad_norm": 0.7565338015556335, + "learning_rate": 0.0006040955631399317, + "loss": 1.7172, + "step": 1744 + }, + { + "epoch": 1.9852104664391352, + "grad_norm": 0.7070271372795105, + "learning_rate": 0.0006038680318543799, + "loss": 0.856, + "step": 1745 + }, + { + "epoch": 1.9863481228668942, + "grad_norm": 1.3683280944824219, + "learning_rate": 0.0006036405005688283, + "loss": 3.7194, + "step": 1746 + }, + { + "epoch": 1.987485779294653, + "grad_norm": 0.8713628649711609, + "learning_rate": 0.0006034129692832765, + "loss": 2.4061, + "step": 1747 + }, + { + "epoch": 1.988623435722412, + "grad_norm": 1.6695564985275269, + "learning_rate": 0.0006031854379977248, + "loss": 3.2265, + "step": 1748 + }, + { + "epoch": 1.9897610921501707, + "grad_norm": 1.007358431816101, + "learning_rate": 0.000602957906712173, + "loss": 2.1454, + "step": 1749 + }, + { + "epoch": 1.9908987485779295, + "grad_norm": 1.256252408027649, + "learning_rate": 0.0006027303754266211, + "loss": 2.2461, + "step": 1750 + }, + { + "epoch": 1.9920364050056882, + "grad_norm": 1.4839099645614624, + "learning_rate": 0.0006025028441410694, + "loss": 3.5124, + "step": 1751 + }, + { + "epoch": 1.993174061433447, + "grad_norm": 0.8481181263923645, + "learning_rate": 0.0006022753128555176, + "loss": 1.5822, + "step": 1752 + }, + { + "epoch": 1.9943117178612058, + "grad_norm": 1.6936454772949219, + "learning_rate": 0.0006020477815699658, + "loss": 2.7639, + "step": 1753 + }, + { + "epoch": 1.9954493742889647, + "grad_norm": 1.1564828157424927, + "learning_rate": 0.0006018202502844142, + "loss": 2.4716, + "step": 1754 + }, + { + "epoch": 1.9965870307167235, + "grad_norm": 1.2504764795303345, + "learning_rate": 0.0006015927189988624, + "loss": 1.9542, + "step": 1755 + }, + { + "epoch": 1.9977246871444825, + "grad_norm": 0.9221636652946472, + "learning_rate": 0.0006013651877133106, + "loss": 2.1488, + "step": 1756 + }, + { + "epoch": 1.9988623435722412, + "grad_norm": 1.1251505613327026, + "learning_rate": 0.0006011376564277589, + "loss": 1.9403, + "step": 1757 + }, + { + "epoch": 2.0, + "grad_norm": 0.7302814722061157, + "learning_rate": 0.0006009101251422071, + "loss": 1.4557, + "step": 1758 + }, + { + "epoch": 2.0, + "eval_f1": 0.8901, + "eval_gen_len": 49.6091, + "eval_loss": 1.861061930656433, + "eval_precision": 0.8889, + "eval_recall": 0.8915, + "eval_rouge1": 0.4491, + "eval_rouge2": 0.2031, + "eval_rougeL": 0.3721, + "eval_rougeLsum": 0.4148, + "eval_runtime": 28.0457, + "eval_samples_per_second": 3.922, + "eval_steps_per_second": 0.499, + "step": 1758 + }, + { + "epoch": 2.0011376564277588, + "grad_norm": 0.7474573850631714, + "learning_rate": 0.0006006825938566553, + "loss": 0.9735, + "step": 1759 + }, + { + "epoch": 2.0022753128555175, + "grad_norm": 1.003105640411377, + "learning_rate": 0.0006004550625711035, + "loss": 1.6829, + "step": 1760 + }, + { + "epoch": 2.0034129692832763, + "grad_norm": 0.7681793570518494, + "learning_rate": 0.0006002275312855517, + "loss": 1.6531, + "step": 1761 + }, + { + "epoch": 2.0045506257110355, + "grad_norm": 1.0925558805465698, + "learning_rate": 0.0006, + "loss": 2.754, + "step": 1762 + }, + { + "epoch": 2.0056882821387942, + "grad_norm": 0.8437767028808594, + "learning_rate": 0.0005997724687144483, + "loss": 1.3487, + "step": 1763 + }, + { + "epoch": 2.006825938566553, + "grad_norm": 0.6217951774597168, + "learning_rate": 0.0005995449374288965, + "loss": 1.3183, + "step": 1764 + }, + { + "epoch": 2.0079635949943118, + "grad_norm": 0.9381899237632751, + "learning_rate": 0.0005993174061433447, + "loss": 1.5027, + "step": 1765 + }, + { + "epoch": 2.0091012514220705, + "grad_norm": 0.801108181476593, + "learning_rate": 0.000599089874857793, + "loss": 1.207, + "step": 1766 + }, + { + "epoch": 2.0102389078498293, + "grad_norm": 0.9824976325035095, + "learning_rate": 0.0005988623435722412, + "loss": 2.4301, + "step": 1767 + }, + { + "epoch": 2.011376564277588, + "grad_norm": 0.7496545910835266, + "learning_rate": 0.0005986348122866895, + "loss": 1.25, + "step": 1768 + }, + { + "epoch": 2.012514220705347, + "grad_norm": 0.8144867420196533, + "learning_rate": 0.0005984072810011376, + "loss": 1.9912, + "step": 1769 + }, + { + "epoch": 2.013651877133106, + "grad_norm": 1.2619274854660034, + "learning_rate": 0.0005981797497155858, + "loss": 2.0556, + "step": 1770 + }, + { + "epoch": 2.0147895335608648, + "grad_norm": 0.8984837532043457, + "learning_rate": 0.0005979522184300342, + "loss": 1.3548, + "step": 1771 + }, + { + "epoch": 2.0159271899886235, + "grad_norm": 0.9129440784454346, + "learning_rate": 0.0005977246871444824, + "loss": 2.3292, + "step": 1772 + }, + { + "epoch": 2.0170648464163823, + "grad_norm": 0.7606117129325867, + "learning_rate": 0.0005974971558589306, + "loss": 1.3862, + "step": 1773 + }, + { + "epoch": 2.018202502844141, + "grad_norm": 1.030834674835205, + "learning_rate": 0.0005972696245733789, + "loss": 1.4272, + "step": 1774 + }, + { + "epoch": 2.0193401592719, + "grad_norm": 0.6545241475105286, + "learning_rate": 0.0005970420932878271, + "loss": 1.1461, + "step": 1775 + }, + { + "epoch": 2.0204778156996586, + "grad_norm": 0.8098762631416321, + "learning_rate": 0.0005968145620022753, + "loss": 1.6309, + "step": 1776 + }, + { + "epoch": 2.0216154721274173, + "grad_norm": 1.2966359853744507, + "learning_rate": 0.0005965870307167236, + "loss": 2.2563, + "step": 1777 + }, + { + "epoch": 2.0227531285551765, + "grad_norm": 1.0538915395736694, + "learning_rate": 0.0005963594994311718, + "loss": 2.8999, + "step": 1778 + }, + { + "epoch": 2.0238907849829353, + "grad_norm": 0.788987934589386, + "learning_rate": 0.00059613196814562, + "loss": 1.5423, + "step": 1779 + }, + { + "epoch": 2.025028441410694, + "grad_norm": 0.9566778540611267, + "learning_rate": 0.0005959044368600683, + "loss": 2.4709, + "step": 1780 + }, + { + "epoch": 2.026166097838453, + "grad_norm": 1.1112850904464722, + "learning_rate": 0.0005956769055745165, + "loss": 2.3085, + "step": 1781 + }, + { + "epoch": 2.0273037542662116, + "grad_norm": 1.074602484703064, + "learning_rate": 0.0005954493742889647, + "loss": 1.9328, + "step": 1782 + }, + { + "epoch": 2.0284414106939703, + "grad_norm": 1.017351508140564, + "learning_rate": 0.000595221843003413, + "loss": 2.1152, + "step": 1783 + }, + { + "epoch": 2.029579067121729, + "grad_norm": 1.069661021232605, + "learning_rate": 0.0005949943117178612, + "loss": 1.8133, + "step": 1784 + }, + { + "epoch": 2.030716723549488, + "grad_norm": 1.099387764930725, + "learning_rate": 0.0005947667804323094, + "loss": 2.207, + "step": 1785 + }, + { + "epoch": 2.031854379977247, + "grad_norm": 1.02996027469635, + "learning_rate": 0.0005945392491467577, + "loss": 1.9207, + "step": 1786 + }, + { + "epoch": 2.032992036405006, + "grad_norm": 0.7691861391067505, + "learning_rate": 0.000594311717861206, + "loss": 1.2884, + "step": 1787 + }, + { + "epoch": 2.0341296928327646, + "grad_norm": 0.9716812968254089, + "learning_rate": 0.0005940841865756542, + "loss": 1.4235, + "step": 1788 + }, + { + "epoch": 2.0352673492605233, + "grad_norm": 0.9133804440498352, + "learning_rate": 0.0005938566552901024, + "loss": 1.654, + "step": 1789 + }, + { + "epoch": 2.036405005688282, + "grad_norm": 1.1487979888916016, + "learning_rate": 0.0005936291240045506, + "loss": 2.4215, + "step": 1790 + }, + { + "epoch": 2.037542662116041, + "grad_norm": 0.8201186060905457, + "learning_rate": 0.0005934015927189989, + "loss": 1.775, + "step": 1791 + }, + { + "epoch": 2.0386803185437996, + "grad_norm": 1.0376436710357666, + "learning_rate": 0.0005931740614334471, + "loss": 2.3957, + "step": 1792 + }, + { + "epoch": 2.039817974971559, + "grad_norm": 0.9257051944732666, + "learning_rate": 0.0005929465301478953, + "loss": 2.02, + "step": 1793 + }, + { + "epoch": 2.0409556313993176, + "grad_norm": 1.158453106880188, + "learning_rate": 0.0005927189988623436, + "loss": 2.3353, + "step": 1794 + }, + { + "epoch": 2.0420932878270763, + "grad_norm": 0.9140053987503052, + "learning_rate": 0.0005924914675767918, + "loss": 1.6694, + "step": 1795 + }, + { + "epoch": 2.043230944254835, + "grad_norm": 0.8473706245422363, + "learning_rate": 0.0005922639362912401, + "loss": 1.4332, + "step": 1796 + }, + { + "epoch": 2.044368600682594, + "grad_norm": 1.5226987600326538, + "learning_rate": 0.0005920364050056884, + "loss": 2.6597, + "step": 1797 + }, + { + "epoch": 2.0455062571103526, + "grad_norm": 1.0220167636871338, + "learning_rate": 0.0005918088737201365, + "loss": 1.9809, + "step": 1798 + }, + { + "epoch": 2.0466439135381114, + "grad_norm": 0.8180732727050781, + "learning_rate": 0.0005915813424345847, + "loss": 1.5031, + "step": 1799 + }, + { + "epoch": 2.04778156996587, + "grad_norm": 1.1611244678497314, + "learning_rate": 0.000591353811149033, + "loss": 1.2286, + "step": 1800 + }, + { + "epoch": 2.0489192263936293, + "grad_norm": 1.1958653926849365, + "learning_rate": 0.0005911262798634812, + "loss": 2.6489, + "step": 1801 + }, + { + "epoch": 2.050056882821388, + "grad_norm": 1.252683401107788, + "learning_rate": 0.0005908987485779294, + "loss": 2.6139, + "step": 1802 + }, + { + "epoch": 2.051194539249147, + "grad_norm": 1.3542306423187256, + "learning_rate": 0.0005906712172923777, + "loss": 2.6136, + "step": 1803 + }, + { + "epoch": 2.0523321956769056, + "grad_norm": 1.3260592222213745, + "learning_rate": 0.000590443686006826, + "loss": 3.5016, + "step": 1804 + }, + { + "epoch": 2.0534698521046644, + "grad_norm": 1.2958813905715942, + "learning_rate": 0.0005902161547212742, + "loss": 3.2267, + "step": 1805 + }, + { + "epoch": 2.054607508532423, + "grad_norm": 0.963817834854126, + "learning_rate": 0.0005899886234357225, + "loss": 2.2298, + "step": 1806 + }, + { + "epoch": 2.055745164960182, + "grad_norm": 0.8188225626945496, + "learning_rate": 0.0005897610921501707, + "loss": 1.3832, + "step": 1807 + }, + { + "epoch": 2.0568828213879407, + "grad_norm": 1.3529433012008667, + "learning_rate": 0.0005895335608646188, + "loss": 1.5602, + "step": 1808 + }, + { + "epoch": 2.0580204778157, + "grad_norm": 1.1197400093078613, + "learning_rate": 0.0005893060295790671, + "loss": 2.5839, + "step": 1809 + }, + { + "epoch": 2.0591581342434586, + "grad_norm": 0.9456532001495361, + "learning_rate": 0.0005890784982935153, + "loss": 2.6459, + "step": 1810 + }, + { + "epoch": 2.0602957906712174, + "grad_norm": 0.923581063747406, + "learning_rate": 0.0005888509670079636, + "loss": 1.9121, + "step": 1811 + }, + { + "epoch": 2.061433447098976, + "grad_norm": 1.202931523323059, + "learning_rate": 0.0005886234357224119, + "loss": 1.8609, + "step": 1812 + }, + { + "epoch": 2.062571103526735, + "grad_norm": 1.0954371690750122, + "learning_rate": 0.0005883959044368601, + "loss": 1.6039, + "step": 1813 + }, + { + "epoch": 2.0637087599544937, + "grad_norm": 0.8433867692947388, + "learning_rate": 0.0005881683731513084, + "loss": 1.2853, + "step": 1814 + }, + { + "epoch": 2.0648464163822524, + "grad_norm": 0.7211450934410095, + "learning_rate": 0.0005879408418657566, + "loss": 0.9241, + "step": 1815 + }, + { + "epoch": 2.065984072810011, + "grad_norm": 1.0169728994369507, + "learning_rate": 0.0005877133105802048, + "loss": 3.0501, + "step": 1816 + }, + { + "epoch": 2.0671217292377704, + "grad_norm": 1.1246399879455566, + "learning_rate": 0.0005874857792946531, + "loss": 2.3138, + "step": 1817 + }, + { + "epoch": 2.068259385665529, + "grad_norm": 0.945504367351532, + "learning_rate": 0.0005872582480091012, + "loss": 1.3534, + "step": 1818 + }, + { + "epoch": 2.069397042093288, + "grad_norm": 0.8048107624053955, + "learning_rate": 0.0005870307167235494, + "loss": 2.2219, + "step": 1819 + }, + { + "epoch": 2.0705346985210467, + "grad_norm": 0.7122802734375, + "learning_rate": 0.0005868031854379977, + "loss": 1.239, + "step": 1820 + }, + { + "epoch": 2.0716723549488054, + "grad_norm": 0.5502296686172485, + "learning_rate": 0.000586575654152446, + "loss": 1.1367, + "step": 1821 + }, + { + "epoch": 2.072810011376564, + "grad_norm": 0.9720338582992554, + "learning_rate": 0.0005863481228668942, + "loss": 2.4818, + "step": 1822 + }, + { + "epoch": 2.073947667804323, + "grad_norm": 1.1907950639724731, + "learning_rate": 0.0005861205915813425, + "loss": 3.3499, + "step": 1823 + }, + { + "epoch": 2.0750853242320817, + "grad_norm": 0.7875828146934509, + "learning_rate": 0.0005858930602957907, + "loss": 1.2593, + "step": 1824 + }, + { + "epoch": 2.076222980659841, + "grad_norm": 1.0819391012191772, + "learning_rate": 0.0005856655290102389, + "loss": 2.0526, + "step": 1825 + }, + { + "epoch": 2.0773606370875997, + "grad_norm": 0.7829857468605042, + "learning_rate": 0.0005854379977246872, + "loss": 1.7224, + "step": 1826 + }, + { + "epoch": 2.0784982935153584, + "grad_norm": 0.6612402200698853, + "learning_rate": 0.0005852104664391354, + "loss": 1.3579, + "step": 1827 + }, + { + "epoch": 2.079635949943117, + "grad_norm": 0.6340951323509216, + "learning_rate": 0.0005849829351535835, + "loss": 1.2102, + "step": 1828 + }, + { + "epoch": 2.080773606370876, + "grad_norm": 0.6889867186546326, + "learning_rate": 0.0005847554038680319, + "loss": 1.0524, + "step": 1829 + }, + { + "epoch": 2.0819112627986347, + "grad_norm": 1.2241733074188232, + "learning_rate": 0.0005845278725824801, + "loss": 1.7329, + "step": 1830 + }, + { + "epoch": 2.0830489192263935, + "grad_norm": 0.9656223654747009, + "learning_rate": 0.0005843003412969284, + "loss": 1.5006, + "step": 1831 + }, + { + "epoch": 2.0841865756541527, + "grad_norm": 0.9914301037788391, + "learning_rate": 0.0005840728100113766, + "loss": 1.72, + "step": 1832 + }, + { + "epoch": 2.0853242320819114, + "grad_norm": 0.6369336843490601, + "learning_rate": 0.0005838452787258248, + "loss": 1.4671, + "step": 1833 + }, + { + "epoch": 2.08646188850967, + "grad_norm": 1.0386950969696045, + "learning_rate": 0.0005836177474402731, + "loss": 1.8412, + "step": 1834 + }, + { + "epoch": 2.087599544937429, + "grad_norm": 0.9129205346107483, + "learning_rate": 0.0005833902161547213, + "loss": 1.8206, + "step": 1835 + }, + { + "epoch": 2.0887372013651877, + "grad_norm": 0.8238604664802551, + "learning_rate": 0.0005831626848691695, + "loss": 1.8884, + "step": 1836 + }, + { + "epoch": 2.0898748577929465, + "grad_norm": 1.2888492345809937, + "learning_rate": 0.0005829351535836177, + "loss": 2.121, + "step": 1837 + }, + { + "epoch": 2.091012514220705, + "grad_norm": 1.0287847518920898, + "learning_rate": 0.000582707622298066, + "loss": 1.878, + "step": 1838 + }, + { + "epoch": 2.092150170648464, + "grad_norm": 1.340661883354187, + "learning_rate": 0.0005824800910125142, + "loss": 2.5264, + "step": 1839 + }, + { + "epoch": 2.093287827076223, + "grad_norm": 0.8161119818687439, + "learning_rate": 0.0005822525597269625, + "loss": 1.8105, + "step": 1840 + }, + { + "epoch": 2.094425483503982, + "grad_norm": 1.0750125646591187, + "learning_rate": 0.0005820250284414107, + "loss": 2.3553, + "step": 1841 + }, + { + "epoch": 2.0955631399317407, + "grad_norm": 1.0218100547790527, + "learning_rate": 0.0005817974971558589, + "loss": 1.8317, + "step": 1842 + }, + { + "epoch": 2.0967007963594995, + "grad_norm": 0.5259703397750854, + "learning_rate": 0.0005815699658703072, + "loss": 0.7726, + "step": 1843 + }, + { + "epoch": 2.0978384527872582, + "grad_norm": 0.9108942151069641, + "learning_rate": 0.0005813424345847554, + "loss": 1.7108, + "step": 1844 + }, + { + "epoch": 2.098976109215017, + "grad_norm": 1.2494343519210815, + "learning_rate": 0.0005811149032992036, + "loss": 2.0056, + "step": 1845 + }, + { + "epoch": 2.1001137656427757, + "grad_norm": 0.6544501185417175, + "learning_rate": 0.000580887372013652, + "loss": 1.1255, + "step": 1846 + }, + { + "epoch": 2.1012514220705345, + "grad_norm": 1.4156732559204102, + "learning_rate": 0.0005806598407281001, + "loss": 2.7314, + "step": 1847 + }, + { + "epoch": 2.1023890784982937, + "grad_norm": 0.6912670731544495, + "learning_rate": 0.0005804323094425483, + "loss": 1.3614, + "step": 1848 + }, + { + "epoch": 2.1035267349260525, + "grad_norm": 0.7118052840232849, + "learning_rate": 0.0005802047781569966, + "loss": 1.9744, + "step": 1849 + }, + { + "epoch": 2.1046643913538112, + "grad_norm": 0.9591397047042847, + "learning_rate": 0.0005799772468714448, + "loss": 0.8785, + "step": 1850 + }, + { + "epoch": 2.10580204778157, + "grad_norm": 0.7862758040428162, + "learning_rate": 0.0005797497155858931, + "loss": 1.5104, + "step": 1851 + }, + { + "epoch": 2.1069397042093287, + "grad_norm": 0.8719160556793213, + "learning_rate": 0.0005795221843003413, + "loss": 1.3436, + "step": 1852 + }, + { + "epoch": 2.1080773606370875, + "grad_norm": 0.7223889231681824, + "learning_rate": 0.0005792946530147895, + "loss": 1.116, + "step": 1853 + }, + { + "epoch": 2.1092150170648463, + "grad_norm": 1.0977421998977661, + "learning_rate": 0.0005790671217292379, + "loss": 1.5358, + "step": 1854 + }, + { + "epoch": 2.110352673492605, + "grad_norm": 1.2012536525726318, + "learning_rate": 0.0005788395904436861, + "loss": 1.9212, + "step": 1855 + }, + { + "epoch": 2.1114903299203642, + "grad_norm": 1.2642079591751099, + "learning_rate": 0.0005786120591581343, + "loss": 2.3365, + "step": 1856 + }, + { + "epoch": 2.112627986348123, + "grad_norm": 0.7523776888847351, + "learning_rate": 0.0005783845278725825, + "loss": 1.2154, + "step": 1857 + }, + { + "epoch": 2.1137656427758817, + "grad_norm": 0.4518072307109833, + "learning_rate": 0.0005781569965870307, + "loss": 0.5069, + "step": 1858 + }, + { + "epoch": 2.1149032992036405, + "grad_norm": 0.6998793482780457, + "learning_rate": 0.0005779294653014789, + "loss": 1.7925, + "step": 1859 + }, + { + "epoch": 2.1160409556313993, + "grad_norm": 1.0769786834716797, + "learning_rate": 0.0005777019340159272, + "loss": 2.0462, + "step": 1860 + }, + { + "epoch": 2.117178612059158, + "grad_norm": 1.269291877746582, + "learning_rate": 0.0005774744027303754, + "loss": 1.5268, + "step": 1861 + }, + { + "epoch": 2.118316268486917, + "grad_norm": 1.7893264293670654, + "learning_rate": 0.0005772468714448236, + "loss": 3.6841, + "step": 1862 + }, + { + "epoch": 2.1194539249146755, + "grad_norm": 1.0378164052963257, + "learning_rate": 0.000577019340159272, + "loss": 1.8547, + "step": 1863 + }, + { + "epoch": 2.1205915813424348, + "grad_norm": 0.983869731426239, + "learning_rate": 0.0005767918088737202, + "loss": 2.1582, + "step": 1864 + }, + { + "epoch": 2.1217292377701935, + "grad_norm": 0.6995578408241272, + "learning_rate": 0.0005765642775881684, + "loss": 1.7941, + "step": 1865 + }, + { + "epoch": 2.1228668941979523, + "grad_norm": 0.8206987380981445, + "learning_rate": 0.0005763367463026167, + "loss": 1.4095, + "step": 1866 + }, + { + "epoch": 2.124004550625711, + "grad_norm": 0.9261349439620972, + "learning_rate": 0.0005761092150170648, + "loss": 1.2816, + "step": 1867 + }, + { + "epoch": 2.12514220705347, + "grad_norm": 0.971121609210968, + "learning_rate": 0.000575881683731513, + "loss": 1.9692, + "step": 1868 + }, + { + "epoch": 2.1262798634812285, + "grad_norm": 0.8004586100578308, + "learning_rate": 0.0005756541524459613, + "loss": 1.4504, + "step": 1869 + }, + { + "epoch": 2.1274175199089873, + "grad_norm": 1.1579177379608154, + "learning_rate": 0.0005754266211604095, + "loss": 1.8514, + "step": 1870 + }, + { + "epoch": 2.1285551763367465, + "grad_norm": 0.6660380959510803, + "learning_rate": 0.0005751990898748578, + "loss": 1.2882, + "step": 1871 + }, + { + "epoch": 2.1296928327645053, + "grad_norm": 1.0082716941833496, + "learning_rate": 0.0005749715585893061, + "loss": 1.9409, + "step": 1872 + }, + { + "epoch": 2.130830489192264, + "grad_norm": 0.6915638446807861, + "learning_rate": 0.0005747440273037543, + "loss": 1.6384, + "step": 1873 + }, + { + "epoch": 2.131968145620023, + "grad_norm": 0.91794753074646, + "learning_rate": 0.0005745164960182026, + "loss": 2.066, + "step": 1874 + }, + { + "epoch": 2.1331058020477816, + "grad_norm": 0.7902770638465881, + "learning_rate": 0.0005742889647326508, + "loss": 1.3539, + "step": 1875 + }, + { + "epoch": 2.1342434584755403, + "grad_norm": 0.8815358281135559, + "learning_rate": 0.0005740614334470989, + "loss": 1.9951, + "step": 1876 + }, + { + "epoch": 2.135381114903299, + "grad_norm": 2.684619665145874, + "learning_rate": 0.0005738339021615472, + "loss": 2.3747, + "step": 1877 + }, + { + "epoch": 2.136518771331058, + "grad_norm": 0.864995539188385, + "learning_rate": 0.0005736063708759954, + "loss": 1.2905, + "step": 1878 + }, + { + "epoch": 2.137656427758817, + "grad_norm": 1.144092082977295, + "learning_rate": 0.0005733788395904436, + "loss": 3.1334, + "step": 1879 + }, + { + "epoch": 2.138794084186576, + "grad_norm": 0.8507954478263855, + "learning_rate": 0.000573151308304892, + "loss": 2.1402, + "step": 1880 + }, + { + "epoch": 2.1399317406143346, + "grad_norm": 0.9215604662895203, + "learning_rate": 0.0005729237770193402, + "loss": 1.339, + "step": 1881 + }, + { + "epoch": 2.1410693970420933, + "grad_norm": 1.1672780513763428, + "learning_rate": 0.0005726962457337884, + "loss": 1.6453, + "step": 1882 + }, + { + "epoch": 2.142207053469852, + "grad_norm": 1.1848803758621216, + "learning_rate": 0.0005724687144482367, + "loss": 2.445, + "step": 1883 + }, + { + "epoch": 2.143344709897611, + "grad_norm": 1.0215684175491333, + "learning_rate": 0.0005722411831626849, + "loss": 1.3356, + "step": 1884 + }, + { + "epoch": 2.1444823663253696, + "grad_norm": 0.9936044216156006, + "learning_rate": 0.0005720136518771331, + "loss": 1.9894, + "step": 1885 + }, + { + "epoch": 2.1456200227531284, + "grad_norm": 1.2094396352767944, + "learning_rate": 0.0005717861205915813, + "loss": 1.5998, + "step": 1886 + }, + { + "epoch": 2.1467576791808876, + "grad_norm": 1.1404001712799072, + "learning_rate": 0.0005715585893060295, + "loss": 2.1319, + "step": 1887 + }, + { + "epoch": 2.1478953356086463, + "grad_norm": 0.656340479850769, + "learning_rate": 0.0005713310580204778, + "loss": 1.1715, + "step": 1888 + }, + { + "epoch": 2.149032992036405, + "grad_norm": 1.03688645362854, + "learning_rate": 0.0005711035267349261, + "loss": 2.4935, + "step": 1889 + }, + { + "epoch": 2.150170648464164, + "grad_norm": 0.8806471228599548, + "learning_rate": 0.0005708759954493743, + "loss": 1.9498, + "step": 1890 + }, + { + "epoch": 2.1513083048919226, + "grad_norm": 0.9819498062133789, + "learning_rate": 0.0005706484641638225, + "loss": 1.7589, + "step": 1891 + }, + { + "epoch": 2.1524459613196814, + "grad_norm": 0.9967643618583679, + "learning_rate": 0.0005704209328782708, + "loss": 1.479, + "step": 1892 + }, + { + "epoch": 2.15358361774744, + "grad_norm": 0.8531262278556824, + "learning_rate": 0.000570193401592719, + "loss": 2.3837, + "step": 1893 + }, + { + "epoch": 2.1547212741751993, + "grad_norm": 0.7077489495277405, + "learning_rate": 0.0005699658703071673, + "loss": 0.9478, + "step": 1894 + }, + { + "epoch": 2.155858930602958, + "grad_norm": 0.4791664183139801, + "learning_rate": 0.0005697383390216155, + "loss": 0.6543, + "step": 1895 + }, + { + "epoch": 2.156996587030717, + "grad_norm": 1.3306792974472046, + "learning_rate": 0.0005695108077360637, + "loss": 2.6389, + "step": 1896 + }, + { + "epoch": 2.1581342434584756, + "grad_norm": 0.8933519721031189, + "learning_rate": 0.000569283276450512, + "loss": 1.1383, + "step": 1897 + }, + { + "epoch": 2.1592718998862344, + "grad_norm": 1.3570899963378906, + "learning_rate": 0.0005690557451649602, + "loss": 1.7026, + "step": 1898 + }, + { + "epoch": 2.160409556313993, + "grad_norm": 0.8122515678405762, + "learning_rate": 0.0005688282138794084, + "loss": 1.8717, + "step": 1899 + }, + { + "epoch": 2.161547212741752, + "grad_norm": 1.0075249671936035, + "learning_rate": 0.0005686006825938567, + "loss": 1.9019, + "step": 1900 + }, + { + "epoch": 2.1626848691695106, + "grad_norm": 1.1624641418457031, + "learning_rate": 0.0005683731513083049, + "loss": 1.4538, + "step": 1901 + }, + { + "epoch": 2.1638225255972694, + "grad_norm": 1.6518383026123047, + "learning_rate": 0.0005681456200227531, + "loss": 3.6701, + "step": 1902 + }, + { + "epoch": 2.1649601820250286, + "grad_norm": 0.6541008353233337, + "learning_rate": 0.0005679180887372014, + "loss": 1.3688, + "step": 1903 + }, + { + "epoch": 2.1660978384527874, + "grad_norm": 1.05472993850708, + "learning_rate": 0.0005676905574516497, + "loss": 2.1368, + "step": 1904 + }, + { + "epoch": 2.167235494880546, + "grad_norm": 1.1374725103378296, + "learning_rate": 0.0005674630261660978, + "loss": 3.3545, + "step": 1905 + }, + { + "epoch": 2.168373151308305, + "grad_norm": 1.0366812944412231, + "learning_rate": 0.0005672354948805461, + "loss": 2.4839, + "step": 1906 + }, + { + "epoch": 2.1695108077360636, + "grad_norm": 0.8595036268234253, + "learning_rate": 0.0005670079635949943, + "loss": 1.784, + "step": 1907 + }, + { + "epoch": 2.1706484641638224, + "grad_norm": 1.023034930229187, + "learning_rate": 0.0005667804323094425, + "loss": 1.3478, + "step": 1908 + }, + { + "epoch": 2.171786120591581, + "grad_norm": 0.741868257522583, + "learning_rate": 0.0005665529010238908, + "loss": 1.0789, + "step": 1909 + }, + { + "epoch": 2.1729237770193404, + "grad_norm": 1.1272695064544678, + "learning_rate": 0.000566325369738339, + "loss": 2.037, + "step": 1910 + }, + { + "epoch": 2.174061433447099, + "grad_norm": 1.0772629976272583, + "learning_rate": 0.0005660978384527872, + "loss": 2.3381, + "step": 1911 + }, + { + "epoch": 2.175199089874858, + "grad_norm": 1.0794868469238281, + "learning_rate": 0.0005658703071672356, + "loss": 2.9036, + "step": 1912 + }, + { + "epoch": 2.1763367463026166, + "grad_norm": 0.9306978583335876, + "learning_rate": 0.0005656427758816838, + "loss": 2.0821, + "step": 1913 + }, + { + "epoch": 2.1774744027303754, + "grad_norm": 0.88816237449646, + "learning_rate": 0.0005654152445961321, + "loss": 1.7599, + "step": 1914 + }, + { + "epoch": 2.178612059158134, + "grad_norm": 1.169001817703247, + "learning_rate": 0.0005651877133105802, + "loss": 2.1915, + "step": 1915 + }, + { + "epoch": 2.179749715585893, + "grad_norm": 1.0751813650131226, + "learning_rate": 0.0005649601820250284, + "loss": 1.6451, + "step": 1916 + }, + { + "epoch": 2.1808873720136517, + "grad_norm": 1.062515377998352, + "learning_rate": 0.0005647326507394767, + "loss": 2.4742, + "step": 1917 + }, + { + "epoch": 2.182025028441411, + "grad_norm": 0.9888359308242798, + "learning_rate": 0.0005645051194539249, + "loss": 3.1331, + "step": 1918 + }, + { + "epoch": 2.1831626848691696, + "grad_norm": 0.8473877310752869, + "learning_rate": 0.0005642775881683731, + "loss": 1.4678, + "step": 1919 + }, + { + "epoch": 2.1843003412969284, + "grad_norm": 0.85085129737854, + "learning_rate": 0.0005640500568828214, + "loss": 2.127, + "step": 1920 + }, + { + "epoch": 2.185437997724687, + "grad_norm": 1.2935888767242432, + "learning_rate": 0.0005638225255972697, + "loss": 3.5023, + "step": 1921 + }, + { + "epoch": 2.186575654152446, + "grad_norm": 0.8884962201118469, + "learning_rate": 0.0005635949943117179, + "loss": 0.8726, + "step": 1922 + }, + { + "epoch": 2.1877133105802047, + "grad_norm": 0.9087686538696289, + "learning_rate": 0.0005633674630261662, + "loss": 1.3675, + "step": 1923 + }, + { + "epoch": 2.1888509670079634, + "grad_norm": 0.6281200647354126, + "learning_rate": 0.0005631399317406144, + "loss": 0.7575, + "step": 1924 + }, + { + "epoch": 2.189988623435722, + "grad_norm": 1.0639797449111938, + "learning_rate": 0.0005629124004550625, + "loss": 2.3185, + "step": 1925 + }, + { + "epoch": 2.1911262798634814, + "grad_norm": 0.596128523349762, + "learning_rate": 0.0005626848691695108, + "loss": 0.8454, + "step": 1926 + }, + { + "epoch": 2.19226393629124, + "grad_norm": 0.8989168405532837, + "learning_rate": 0.000562457337883959, + "loss": 0.7302, + "step": 1927 + }, + { + "epoch": 2.193401592718999, + "grad_norm": 0.7336655855178833, + "learning_rate": 0.0005622298065984072, + "loss": 1.2145, + "step": 1928 + }, + { + "epoch": 2.1945392491467577, + "grad_norm": 1.077013611793518, + "learning_rate": 0.0005620022753128556, + "loss": 2.4185, + "step": 1929 + }, + { + "epoch": 2.1956769055745164, + "grad_norm": 0.8258651494979858, + "learning_rate": 0.0005617747440273038, + "loss": 1.4536, + "step": 1930 + }, + { + "epoch": 2.196814562002275, + "grad_norm": 0.569733202457428, + "learning_rate": 0.000561547212741752, + "loss": 0.7994, + "step": 1931 + }, + { + "epoch": 2.197952218430034, + "grad_norm": 0.843547523021698, + "learning_rate": 0.0005613196814562003, + "loss": 1.3272, + "step": 1932 + }, + { + "epoch": 2.199089874857793, + "grad_norm": 0.8367027044296265, + "learning_rate": 0.0005610921501706485, + "loss": 1.625, + "step": 1933 + }, + { + "epoch": 2.200227531285552, + "grad_norm": 1.0637445449829102, + "learning_rate": 0.0005608646188850968, + "loss": 1.3352, + "step": 1934 + }, + { + "epoch": 2.2013651877133107, + "grad_norm": 1.4886406660079956, + "learning_rate": 0.0005606370875995449, + "loss": 2.3982, + "step": 1935 + }, + { + "epoch": 2.2025028441410694, + "grad_norm": 1.015264630317688, + "learning_rate": 0.0005604095563139931, + "loss": 2.7429, + "step": 1936 + }, + { + "epoch": 2.203640500568828, + "grad_norm": 1.2239853143692017, + "learning_rate": 0.0005601820250284414, + "loss": 1.8034, + "step": 1937 + }, + { + "epoch": 2.204778156996587, + "grad_norm": 1.1284427642822266, + "learning_rate": 0.0005599544937428897, + "loss": 2.232, + "step": 1938 + }, + { + "epoch": 2.2059158134243457, + "grad_norm": 1.118551254272461, + "learning_rate": 0.0005597269624573379, + "loss": 1.9726, + "step": 1939 + }, + { + "epoch": 2.2070534698521045, + "grad_norm": 1.115544080734253, + "learning_rate": 0.0005594994311717862, + "loss": 2.3797, + "step": 1940 + }, + { + "epoch": 2.2081911262798632, + "grad_norm": 0.8640685677528381, + "learning_rate": 0.0005592718998862344, + "loss": 2.5765, + "step": 1941 + }, + { + "epoch": 2.2093287827076225, + "grad_norm": 1.0405633449554443, + "learning_rate": 0.0005590443686006826, + "loss": 1.6865, + "step": 1942 + }, + { + "epoch": 2.210466439135381, + "grad_norm": 1.6807098388671875, + "learning_rate": 0.0005588168373151309, + "loss": 2.5395, + "step": 1943 + }, + { + "epoch": 2.21160409556314, + "grad_norm": 0.9843060374259949, + "learning_rate": 0.000558589306029579, + "loss": 2.0961, + "step": 1944 + }, + { + "epoch": 2.2127417519908987, + "grad_norm": 0.995924174785614, + "learning_rate": 0.0005583617747440272, + "loss": 2.0971, + "step": 1945 + }, + { + "epoch": 2.2138794084186575, + "grad_norm": 0.8533014059066772, + "learning_rate": 0.0005581342434584756, + "loss": 1.5151, + "step": 1946 + }, + { + "epoch": 2.2150170648464163, + "grad_norm": 0.9610128402709961, + "learning_rate": 0.0005579067121729238, + "loss": 2.239, + "step": 1947 + }, + { + "epoch": 2.216154721274175, + "grad_norm": 0.799690306186676, + "learning_rate": 0.000557679180887372, + "loss": 1.5567, + "step": 1948 + }, + { + "epoch": 2.217292377701934, + "grad_norm": 0.8226889371871948, + "learning_rate": 0.0005574516496018203, + "loss": 2.6057, + "step": 1949 + }, + { + "epoch": 2.218430034129693, + "grad_norm": 1.314974308013916, + "learning_rate": 0.0005572241183162685, + "loss": 2.2834, + "step": 1950 + }, + { + "epoch": 2.2195676905574517, + "grad_norm": 0.9259127974510193, + "learning_rate": 0.0005569965870307167, + "loss": 1.9402, + "step": 1951 + }, + { + "epoch": 2.2207053469852105, + "grad_norm": 1.1179683208465576, + "learning_rate": 0.000556769055745165, + "loss": 2.6231, + "step": 1952 + }, + { + "epoch": 2.2218430034129693, + "grad_norm": 1.0641530752182007, + "learning_rate": 0.0005565415244596132, + "loss": 2.055, + "step": 1953 + }, + { + "epoch": 2.222980659840728, + "grad_norm": 0.7080785036087036, + "learning_rate": 0.0005563139931740613, + "loss": 0.7283, + "step": 1954 + }, + { + "epoch": 2.2241183162684868, + "grad_norm": 1.3224704265594482, + "learning_rate": 0.0005560864618885097, + "loss": 2.189, + "step": 1955 + }, + { + "epoch": 2.2252559726962455, + "grad_norm": 0.9138402938842773, + "learning_rate": 0.0005558589306029579, + "loss": 1.9759, + "step": 1956 + }, + { + "epoch": 2.2263936291240047, + "grad_norm": 1.1304935216903687, + "learning_rate": 0.0005556313993174062, + "loss": 2.2992, + "step": 1957 + }, + { + "epoch": 2.2275312855517635, + "grad_norm": 0.7692627906799316, + "learning_rate": 0.0005554038680318544, + "loss": 1.195, + "step": 1958 + }, + { + "epoch": 2.2286689419795223, + "grad_norm": 0.9814881086349487, + "learning_rate": 0.0005551763367463026, + "loss": 1.7418, + "step": 1959 + }, + { + "epoch": 2.229806598407281, + "grad_norm": 1.449527621269226, + "learning_rate": 0.0005549488054607509, + "loss": 2.8254, + "step": 1960 + }, + { + "epoch": 2.2309442548350398, + "grad_norm": 1.177649974822998, + "learning_rate": 0.0005547212741751991, + "loss": 2.1793, + "step": 1961 + }, + { + "epoch": 2.2320819112627985, + "grad_norm": 1.1293201446533203, + "learning_rate": 0.0005544937428896473, + "loss": 3.0604, + "step": 1962 + }, + { + "epoch": 2.2332195676905573, + "grad_norm": 0.9934133887290955, + "learning_rate": 0.0005542662116040957, + "loss": 1.8564, + "step": 1963 + }, + { + "epoch": 2.234357224118316, + "grad_norm": 0.8571220636367798, + "learning_rate": 0.0005540386803185438, + "loss": 1.4268, + "step": 1964 + }, + { + "epoch": 2.2354948805460753, + "grad_norm": 0.84453284740448, + "learning_rate": 0.000553811149032992, + "loss": 1.4253, + "step": 1965 + }, + { + "epoch": 2.236632536973834, + "grad_norm": 0.842588484287262, + "learning_rate": 0.0005535836177474403, + "loss": 2.4052, + "step": 1966 + }, + { + "epoch": 2.2377701934015928, + "grad_norm": 1.4743871688842773, + "learning_rate": 0.0005533560864618885, + "loss": 2.9152, + "step": 1967 + }, + { + "epoch": 2.2389078498293515, + "grad_norm": 0.6935710310935974, + "learning_rate": 0.0005531285551763367, + "loss": 1.2375, + "step": 1968 + }, + { + "epoch": 2.2400455062571103, + "grad_norm": 1.2612910270690918, + "learning_rate": 0.000552901023890785, + "loss": 1.9998, + "step": 1969 + }, + { + "epoch": 2.241183162684869, + "grad_norm": 0.8181163668632507, + "learning_rate": 0.0005526734926052332, + "loss": 1.3744, + "step": 1970 + }, + { + "epoch": 2.242320819112628, + "grad_norm": 1.1267778873443604, + "learning_rate": 0.0005524459613196815, + "loss": 2.3809, + "step": 1971 + }, + { + "epoch": 2.243458475540387, + "grad_norm": 1.200745701789856, + "learning_rate": 0.0005522184300341298, + "loss": 2.4499, + "step": 1972 + }, + { + "epoch": 2.244596131968146, + "grad_norm": 1.04634690284729, + "learning_rate": 0.0005519908987485779, + "loss": 1.1126, + "step": 1973 + }, + { + "epoch": 2.2457337883959045, + "grad_norm": 0.9528573751449585, + "learning_rate": 0.0005517633674630261, + "loss": 2.4237, + "step": 1974 + }, + { + "epoch": 2.2468714448236633, + "grad_norm": 1.3052282333374023, + "learning_rate": 0.0005515358361774744, + "loss": 2.5489, + "step": 1975 + }, + { + "epoch": 2.248009101251422, + "grad_norm": 1.095739722251892, + "learning_rate": 0.0005513083048919226, + "loss": 1.1754, + "step": 1976 + }, + { + "epoch": 2.249146757679181, + "grad_norm": 1.1007702350616455, + "learning_rate": 0.0005510807736063709, + "loss": 1.7626, + "step": 1977 + }, + { + "epoch": 2.2502844141069396, + "grad_norm": 0.6495097279548645, + "learning_rate": 0.0005508532423208191, + "loss": 1.7669, + "step": 1978 + }, + { + "epoch": 2.2514220705346983, + "grad_norm": 0.9295183420181274, + "learning_rate": 0.0005506257110352673, + "loss": 1.9161, + "step": 1979 + }, + { + "epoch": 2.252559726962457, + "grad_norm": 1.0629956722259521, + "learning_rate": 0.0005503981797497157, + "loss": 2.2528, + "step": 1980 + }, + { + "epoch": 2.2536973833902163, + "grad_norm": 0.6436116695404053, + "learning_rate": 0.0005501706484641639, + "loss": 1.2159, + "step": 1981 + }, + { + "epoch": 2.254835039817975, + "grad_norm": 1.0467777252197266, + "learning_rate": 0.0005499431171786121, + "loss": 2.0886, + "step": 1982 + }, + { + "epoch": 2.255972696245734, + "grad_norm": 0.7404427528381348, + "learning_rate": 0.0005497155858930603, + "loss": 1.4913, + "step": 1983 + }, + { + "epoch": 2.2571103526734926, + "grad_norm": 1.0175822973251343, + "learning_rate": 0.0005494880546075085, + "loss": 1.9654, + "step": 1984 + }, + { + "epoch": 2.2582480091012513, + "grad_norm": 0.930240273475647, + "learning_rate": 0.0005492605233219567, + "loss": 1.7043, + "step": 1985 + }, + { + "epoch": 2.25938566552901, + "grad_norm": 2.17866587638855, + "learning_rate": 0.000549032992036405, + "loss": 3.0881, + "step": 1986 + }, + { + "epoch": 2.260523321956769, + "grad_norm": 1.5989547967910767, + "learning_rate": 0.0005488054607508532, + "loss": 3.0951, + "step": 1987 + }, + { + "epoch": 2.261660978384528, + "grad_norm": 1.6310112476348877, + "learning_rate": 0.0005485779294653015, + "loss": 3.687, + "step": 1988 + }, + { + "epoch": 2.262798634812287, + "grad_norm": 0.682499885559082, + "learning_rate": 0.0005483503981797498, + "loss": 1.5835, + "step": 1989 + }, + { + "epoch": 2.2639362912400456, + "grad_norm": 1.0656623840332031, + "learning_rate": 0.000548122866894198, + "loss": 2.0308, + "step": 1990 + }, + { + "epoch": 2.2650739476678043, + "grad_norm": 0.7635136842727661, + "learning_rate": 0.0005478953356086462, + "loss": 2.4743, + "step": 1991 + }, + { + "epoch": 2.266211604095563, + "grad_norm": 0.7663008570671082, + "learning_rate": 0.0005476678043230945, + "loss": 2.0004, + "step": 1992 + }, + { + "epoch": 2.267349260523322, + "grad_norm": 0.5305643677711487, + "learning_rate": 0.0005474402730375426, + "loss": 0.7431, + "step": 1993 + }, + { + "epoch": 2.2684869169510806, + "grad_norm": 1.2221653461456299, + "learning_rate": 0.0005472127417519908, + "loss": 2.3999, + "step": 1994 + }, + { + "epoch": 2.26962457337884, + "grad_norm": 1.0990016460418701, + "learning_rate": 0.0005469852104664391, + "loss": 3.3276, + "step": 1995 + }, + { + "epoch": 2.2707622298065986, + "grad_norm": 1.074623703956604, + "learning_rate": 0.0005467576791808873, + "loss": 2.2739, + "step": 1996 + }, + { + "epoch": 2.2718998862343573, + "grad_norm": 0.7581028342247009, + "learning_rate": 0.0005465301478953357, + "loss": 1.9877, + "step": 1997 + }, + { + "epoch": 2.273037542662116, + "grad_norm": 1.2376879453659058, + "learning_rate": 0.0005463026166097839, + "loss": 2.5045, + "step": 1998 + }, + { + "epoch": 2.274175199089875, + "grad_norm": 0.8391635417938232, + "learning_rate": 0.0005460750853242321, + "loss": 1.414, + "step": 1999 + }, + { + "epoch": 2.2753128555176336, + "grad_norm": 0.917779803276062, + "learning_rate": 0.0005458475540386804, + "loss": 1.5652, + "step": 2000 + }, + { + "epoch": 2.2764505119453924, + "grad_norm": 0.9148494005203247, + "learning_rate": 0.0005456200227531286, + "loss": 0.9553, + "step": 2001 + }, + { + "epoch": 2.277588168373151, + "grad_norm": 1.6205681562423706, + "learning_rate": 0.0005453924914675768, + "loss": 3.2662, + "step": 2002 + }, + { + "epoch": 2.27872582480091, + "grad_norm": 0.5104256272315979, + "learning_rate": 0.000545164960182025, + "loss": 1.1526, + "step": 2003 + }, + { + "epoch": 2.279863481228669, + "grad_norm": 0.9608463644981384, + "learning_rate": 0.0005449374288964732, + "loss": 1.3495, + "step": 2004 + }, + { + "epoch": 2.281001137656428, + "grad_norm": 0.8729520440101624, + "learning_rate": 0.0005447098976109215, + "loss": 1.6019, + "step": 2005 + }, + { + "epoch": 2.2821387940841866, + "grad_norm": 0.7411619424819946, + "learning_rate": 0.0005444823663253698, + "loss": 1.5559, + "step": 2006 + }, + { + "epoch": 2.2832764505119454, + "grad_norm": 0.7767372727394104, + "learning_rate": 0.000544254835039818, + "loss": 2.0135, + "step": 2007 + }, + { + "epoch": 2.284414106939704, + "grad_norm": 0.5355758666992188, + "learning_rate": 0.0005440273037542662, + "loss": 1.1862, + "step": 2008 + }, + { + "epoch": 2.285551763367463, + "grad_norm": 0.719421923160553, + "learning_rate": 0.0005437997724687145, + "loss": 1.1892, + "step": 2009 + }, + { + "epoch": 2.2866894197952217, + "grad_norm": 0.7264088988304138, + "learning_rate": 0.0005435722411831627, + "loss": 1.5477, + "step": 2010 + }, + { + "epoch": 2.287827076222981, + "grad_norm": 0.8898872137069702, + "learning_rate": 0.0005433447098976109, + "loss": 1.5906, + "step": 2011 + }, + { + "epoch": 2.2889647326507396, + "grad_norm": 0.6888718605041504, + "learning_rate": 0.0005431171786120591, + "loss": 1.1981, + "step": 2012 + }, + { + "epoch": 2.2901023890784984, + "grad_norm": 0.6405702829360962, + "learning_rate": 0.0005428896473265074, + "loss": 0.9176, + "step": 2013 + }, + { + "epoch": 2.291240045506257, + "grad_norm": 1.125874638557434, + "learning_rate": 0.0005426621160409556, + "loss": 1.9476, + "step": 2014 + }, + { + "epoch": 2.292377701934016, + "grad_norm": 0.9443691372871399, + "learning_rate": 0.0005424345847554039, + "loss": 1.1386, + "step": 2015 + }, + { + "epoch": 2.2935153583617747, + "grad_norm": 1.1550638675689697, + "learning_rate": 0.0005422070534698521, + "loss": 2.695, + "step": 2016 + }, + { + "epoch": 2.2946530147895334, + "grad_norm": 0.9223167896270752, + "learning_rate": 0.0005419795221843004, + "loss": 1.962, + "step": 2017 + }, + { + "epoch": 2.295790671217292, + "grad_norm": 0.9546693563461304, + "learning_rate": 0.0005417519908987486, + "loss": 1.9184, + "step": 2018 + }, + { + "epoch": 2.296928327645051, + "grad_norm": 1.0364398956298828, + "learning_rate": 0.0005415244596131968, + "loss": 2.3646, + "step": 2019 + }, + { + "epoch": 2.29806598407281, + "grad_norm": 0.7954057455062866, + "learning_rate": 0.0005412969283276451, + "loss": 1.0092, + "step": 2020 + }, + { + "epoch": 2.299203640500569, + "grad_norm": 0.9007298946380615, + "learning_rate": 0.0005410693970420934, + "loss": 1.6872, + "step": 2021 + }, + { + "epoch": 2.3003412969283277, + "grad_norm": 0.9581443071365356, + "learning_rate": 0.0005408418657565415, + "loss": 2.0143, + "step": 2022 + }, + { + "epoch": 2.3014789533560864, + "grad_norm": 0.9659209251403809, + "learning_rate": 0.0005406143344709898, + "loss": 1.2962, + "step": 2023 + }, + { + "epoch": 2.302616609783845, + "grad_norm": 0.974143385887146, + "learning_rate": 0.000540386803185438, + "loss": 2.7885, + "step": 2024 + }, + { + "epoch": 2.303754266211604, + "grad_norm": 1.0462844371795654, + "learning_rate": 0.0005401592718998862, + "loss": 2.1409, + "step": 2025 + }, + { + "epoch": 2.3048919226393627, + "grad_norm": 0.8965214490890503, + "learning_rate": 0.0005399317406143345, + "loss": 2.2281, + "step": 2026 + }, + { + "epoch": 2.306029579067122, + "grad_norm": 1.1970421075820923, + "learning_rate": 0.0005397042093287827, + "loss": 2.378, + "step": 2027 + }, + { + "epoch": 2.3071672354948807, + "grad_norm": 1.012830138206482, + "learning_rate": 0.0005394766780432309, + "loss": 1.4127, + "step": 2028 + }, + { + "epoch": 2.3083048919226394, + "grad_norm": 0.8547312021255493, + "learning_rate": 0.0005392491467576793, + "loss": 2.5707, + "step": 2029 + }, + { + "epoch": 2.309442548350398, + "grad_norm": 1.0248013734817505, + "learning_rate": 0.0005390216154721275, + "loss": 1.1376, + "step": 2030 + }, + { + "epoch": 2.310580204778157, + "grad_norm": 1.2407397031784058, + "learning_rate": 0.0005387940841865757, + "loss": 2.5036, + "step": 2031 + }, + { + "epoch": 2.3117178612059157, + "grad_norm": 1.169358730316162, + "learning_rate": 0.0005385665529010239, + "loss": 2.4767, + "step": 2032 + }, + { + "epoch": 2.3128555176336745, + "grad_norm": 0.552113950252533, + "learning_rate": 0.0005383390216154721, + "loss": 0.8825, + "step": 2033 + }, + { + "epoch": 2.3139931740614337, + "grad_norm": 1.2916892766952515, + "learning_rate": 0.0005381114903299203, + "loss": 2.3583, + "step": 2034 + }, + { + "epoch": 2.3151308304891924, + "grad_norm": 0.9264313578605652, + "learning_rate": 0.0005378839590443686, + "loss": 1.945, + "step": 2035 + }, + { + "epoch": 2.316268486916951, + "grad_norm": 0.6379786729812622, + "learning_rate": 0.0005376564277588168, + "loss": 0.7919, + "step": 2036 + }, + { + "epoch": 2.31740614334471, + "grad_norm": 1.199393630027771, + "learning_rate": 0.0005374288964732651, + "loss": 2.3252, + "step": 2037 + }, + { + "epoch": 2.3185437997724687, + "grad_norm": 1.0265052318572998, + "learning_rate": 0.0005372013651877134, + "loss": 1.6382, + "step": 2038 + }, + { + "epoch": 2.3196814562002275, + "grad_norm": 1.808610439300537, + "learning_rate": 0.0005369738339021616, + "loss": 3.1174, + "step": 2039 + }, + { + "epoch": 2.3208191126279862, + "grad_norm": 0.7785828113555908, + "learning_rate": 0.0005367463026166099, + "loss": 1.7553, + "step": 2040 + }, + { + "epoch": 2.321956769055745, + "grad_norm": 1.1902951002120972, + "learning_rate": 0.0005365187713310581, + "loss": 1.9243, + "step": 2041 + }, + { + "epoch": 2.3230944254835038, + "grad_norm": 0.8798537254333496, + "learning_rate": 0.0005362912400455062, + "loss": 1.8127, + "step": 2042 + }, + { + "epoch": 2.324232081911263, + "grad_norm": 1.0299949645996094, + "learning_rate": 0.0005360637087599545, + "loss": 2.0079, + "step": 2043 + }, + { + "epoch": 2.3253697383390217, + "grad_norm": 0.8465110063552856, + "learning_rate": 0.0005358361774744027, + "loss": 1.6008, + "step": 2044 + }, + { + "epoch": 2.3265073947667805, + "grad_norm": 1.031044363975525, + "learning_rate": 0.0005356086461888509, + "loss": 1.6429, + "step": 2045 + }, + { + "epoch": 2.3276450511945392, + "grad_norm": 0.9194596409797668, + "learning_rate": 0.0005353811149032993, + "loss": 1.5493, + "step": 2046 + }, + { + "epoch": 2.328782707622298, + "grad_norm": 0.8609010577201843, + "learning_rate": 0.0005351535836177475, + "loss": 1.4473, + "step": 2047 + }, + { + "epoch": 2.3299203640500568, + "grad_norm": 1.3532382249832153, + "learning_rate": 0.0005349260523321957, + "loss": 4.2047, + "step": 2048 + }, + { + "epoch": 2.3310580204778155, + "grad_norm": 0.9629133939743042, + "learning_rate": 0.000534698521046644, + "loss": 1.1077, + "step": 2049 + }, + { + "epoch": 2.3321956769055747, + "grad_norm": 0.8413902521133423, + "learning_rate": 0.0005344709897610922, + "loss": 1.5364, + "step": 2050 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 1.0402729511260986, + "learning_rate": 0.0005342434584755403, + "loss": 2.7655, + "step": 2051 + }, + { + "epoch": 2.3344709897610922, + "grad_norm": 0.9589785933494568, + "learning_rate": 0.0005340159271899886, + "loss": 1.5259, + "step": 2052 + }, + { + "epoch": 2.335608646188851, + "grad_norm": 0.6738478541374207, + "learning_rate": 0.0005337883959044368, + "loss": 1.5741, + "step": 2053 + }, + { + "epoch": 2.3367463026166098, + "grad_norm": 1.0174530744552612, + "learning_rate": 0.000533560864618885, + "loss": 3.3126, + "step": 2054 + }, + { + "epoch": 2.3378839590443685, + "grad_norm": 0.9035469889640808, + "learning_rate": 0.0005333333333333334, + "loss": 1.6059, + "step": 2055 + }, + { + "epoch": 2.3390216154721273, + "grad_norm": 1.1631959676742554, + "learning_rate": 0.0005331058020477816, + "loss": 1.7973, + "step": 2056 + }, + { + "epoch": 2.3401592718998865, + "grad_norm": 0.8147276043891907, + "learning_rate": 0.0005328782707622299, + "loss": 1.3416, + "step": 2057 + }, + { + "epoch": 2.3412969283276452, + "grad_norm": 1.1491146087646484, + "learning_rate": 0.0005326507394766781, + "loss": 3.0711, + "step": 2058 + }, + { + "epoch": 2.342434584755404, + "grad_norm": 0.7472861409187317, + "learning_rate": 0.0005324232081911263, + "loss": 1.4432, + "step": 2059 + }, + { + "epoch": 2.3435722411831628, + "grad_norm": 1.2172683477401733, + "learning_rate": 0.0005321956769055746, + "loss": 1.5058, + "step": 2060 + }, + { + "epoch": 2.3447098976109215, + "grad_norm": 1.0745410919189453, + "learning_rate": 0.0005319681456200227, + "loss": 1.6888, + "step": 2061 + }, + { + "epoch": 2.3458475540386803, + "grad_norm": 0.8925970792770386, + "learning_rate": 0.0005317406143344709, + "loss": 1.4089, + "step": 2062 + }, + { + "epoch": 2.346985210466439, + "grad_norm": 0.8472179770469666, + "learning_rate": 0.0005315130830489193, + "loss": 1.4942, + "step": 2063 + }, + { + "epoch": 2.348122866894198, + "grad_norm": 0.4184499979019165, + "learning_rate": 0.0005312855517633675, + "loss": 0.6314, + "step": 2064 + }, + { + "epoch": 2.3492605233219566, + "grad_norm": 1.262465238571167, + "learning_rate": 0.0005310580204778157, + "loss": 2.0469, + "step": 2065 + }, + { + "epoch": 2.3503981797497158, + "grad_norm": 1.097849726676941, + "learning_rate": 0.000530830489192264, + "loss": 2.8422, + "step": 2066 + }, + { + "epoch": 2.3515358361774745, + "grad_norm": 1.6128443479537964, + "learning_rate": 0.0005306029579067122, + "loss": 1.9792, + "step": 2067 + }, + { + "epoch": 2.3526734926052333, + "grad_norm": 0.8861138820648193, + "learning_rate": 0.0005303754266211604, + "loss": 1.7751, + "step": 2068 + }, + { + "epoch": 2.353811149032992, + "grad_norm": 0.8302369713783264, + "learning_rate": 0.0005301478953356087, + "loss": 1.8749, + "step": 2069 + }, + { + "epoch": 2.354948805460751, + "grad_norm": 0.9765578508377075, + "learning_rate": 0.0005299203640500569, + "loss": 2.7383, + "step": 2070 + }, + { + "epoch": 2.3560864618885096, + "grad_norm": 1.0690255165100098, + "learning_rate": 0.000529692832764505, + "loss": 1.6116, + "step": 2071 + }, + { + "epoch": 2.3572241183162683, + "grad_norm": 0.96379154920578, + "learning_rate": 0.0005294653014789534, + "loss": 1.7744, + "step": 2072 + }, + { + "epoch": 2.3583617747440275, + "grad_norm": 0.5584749579429626, + "learning_rate": 0.0005292377701934016, + "loss": 1.0452, + "step": 2073 + }, + { + "epoch": 2.3594994311717863, + "grad_norm": 1.5071784257888794, + "learning_rate": 0.0005290102389078498, + "loss": 3.7007, + "step": 2074 + }, + { + "epoch": 2.360637087599545, + "grad_norm": 0.7578591108322144, + "learning_rate": 0.0005287827076222981, + "loss": 0.9569, + "step": 2075 + }, + { + "epoch": 2.361774744027304, + "grad_norm": 0.8119309544563293, + "learning_rate": 0.0005285551763367463, + "loss": 1.6168, + "step": 2076 + }, + { + "epoch": 2.3629124004550626, + "grad_norm": 0.9481661319732666, + "learning_rate": 0.0005283276450511945, + "loss": 2.9747, + "step": 2077 + }, + { + "epoch": 2.3640500568828213, + "grad_norm": 0.9913211464881897, + "learning_rate": 0.0005281001137656428, + "loss": 2.35, + "step": 2078 + }, + { + "epoch": 2.36518771331058, + "grad_norm": 1.1122124195098877, + "learning_rate": 0.000527872582480091, + "loss": 2.0505, + "step": 2079 + }, + { + "epoch": 2.366325369738339, + "grad_norm": 0.8721759915351868, + "learning_rate": 0.0005276450511945393, + "loss": 1.6231, + "step": 2080 + }, + { + "epoch": 2.3674630261660976, + "grad_norm": 0.8457727432250977, + "learning_rate": 0.0005274175199089875, + "loss": 1.6846, + "step": 2081 + }, + { + "epoch": 2.368600682593857, + "grad_norm": 1.0846689939498901, + "learning_rate": 0.0005271899886234357, + "loss": 1.7606, + "step": 2082 + }, + { + "epoch": 2.3697383390216156, + "grad_norm": 0.9165617823600769, + "learning_rate": 0.000526962457337884, + "loss": 1.3047, + "step": 2083 + }, + { + "epoch": 2.3708759954493743, + "grad_norm": 1.1322617530822754, + "learning_rate": 0.0005267349260523322, + "loss": 1.1403, + "step": 2084 + }, + { + "epoch": 2.372013651877133, + "grad_norm": 1.6939765214920044, + "learning_rate": 0.0005265073947667804, + "loss": 3.1601, + "step": 2085 + }, + { + "epoch": 2.373151308304892, + "grad_norm": 1.0793713331222534, + "learning_rate": 0.0005262798634812287, + "loss": 2.0837, + "step": 2086 + }, + { + "epoch": 2.3742889647326506, + "grad_norm": 0.8238543272018433, + "learning_rate": 0.0005260523321956769, + "loss": 1.438, + "step": 2087 + }, + { + "epoch": 2.3754266211604094, + "grad_norm": 0.932790994644165, + "learning_rate": 0.0005258248009101252, + "loss": 1.9309, + "step": 2088 + }, + { + "epoch": 2.3765642775881686, + "grad_norm": 0.6618528366088867, + "learning_rate": 0.0005255972696245735, + "loss": 1.4155, + "step": 2089 + }, + { + "epoch": 2.3777019340159273, + "grad_norm": 0.7635231018066406, + "learning_rate": 0.0005253697383390216, + "loss": 1.3532, + "step": 2090 + }, + { + "epoch": 2.378839590443686, + "grad_norm": 0.9490572810173035, + "learning_rate": 0.0005251422070534698, + "loss": 2.3154, + "step": 2091 + }, + { + "epoch": 2.379977246871445, + "grad_norm": 1.2259362936019897, + "learning_rate": 0.0005249146757679181, + "loss": 2.4494, + "step": 2092 + }, + { + "epoch": 2.3811149032992036, + "grad_norm": 0.7659657001495361, + "learning_rate": 0.0005246871444823663, + "loss": 1.1013, + "step": 2093 + }, + { + "epoch": 2.3822525597269624, + "grad_norm": 1.1086747646331787, + "learning_rate": 0.0005244596131968145, + "loss": 2.263, + "step": 2094 + }, + { + "epoch": 2.383390216154721, + "grad_norm": 0.9705541729927063, + "learning_rate": 0.0005242320819112628, + "loss": 1.4336, + "step": 2095 + }, + { + "epoch": 2.3845278725824803, + "grad_norm": 0.5771173238754272, + "learning_rate": 0.000524004550625711, + "loss": 0.8406, + "step": 2096 + }, + { + "epoch": 2.385665529010239, + "grad_norm": 0.424938440322876, + "learning_rate": 0.0005237770193401593, + "loss": 0.5156, + "step": 2097 + }, + { + "epoch": 2.386803185437998, + "grad_norm": 0.7281388640403748, + "learning_rate": 0.0005235494880546076, + "loss": 0.8201, + "step": 2098 + }, + { + "epoch": 2.3879408418657566, + "grad_norm": 1.5582374334335327, + "learning_rate": 0.0005233219567690558, + "loss": 3.5289, + "step": 2099 + }, + { + "epoch": 2.3890784982935154, + "grad_norm": 1.2791839838027954, + "learning_rate": 0.000523094425483504, + "loss": 1.6449, + "step": 2100 + }, + { + "epoch": 2.390216154721274, + "grad_norm": 1.1435104608535767, + "learning_rate": 0.0005228668941979522, + "loss": 2.1264, + "step": 2101 + }, + { + "epoch": 2.391353811149033, + "grad_norm": 1.234621286392212, + "learning_rate": 0.0005226393629124004, + "loss": 1.0853, + "step": 2102 + }, + { + "epoch": 2.3924914675767917, + "grad_norm": 0.9681219458580017, + "learning_rate": 0.0005224118316268487, + "loss": 2.4709, + "step": 2103 + }, + { + "epoch": 2.3936291240045504, + "grad_norm": 1.064215064048767, + "learning_rate": 0.0005221843003412969, + "loss": 2.7476, + "step": 2104 + }, + { + "epoch": 2.3947667804323096, + "grad_norm": 1.3225258588790894, + "learning_rate": 0.0005219567690557452, + "loss": 2.7113, + "step": 2105 + }, + { + "epoch": 2.3959044368600684, + "grad_norm": 1.2326604127883911, + "learning_rate": 0.0005217292377701935, + "loss": 1.7622, + "step": 2106 + }, + { + "epoch": 2.397042093287827, + "grad_norm": 0.7115911841392517, + "learning_rate": 0.0005215017064846417, + "loss": 1.8257, + "step": 2107 + }, + { + "epoch": 2.398179749715586, + "grad_norm": 0.762967050075531, + "learning_rate": 0.0005212741751990899, + "loss": 1.5422, + "step": 2108 + }, + { + "epoch": 2.3993174061433447, + "grad_norm": 1.2912230491638184, + "learning_rate": 0.0005210466439135382, + "loss": 2.3056, + "step": 2109 + }, + { + "epoch": 2.4004550625711034, + "grad_norm": 0.8842760920524597, + "learning_rate": 0.0005208191126279863, + "loss": 1.9213, + "step": 2110 + }, + { + "epoch": 2.401592718998862, + "grad_norm": 1.1768587827682495, + "learning_rate": 0.0005205915813424345, + "loss": 2.635, + "step": 2111 + }, + { + "epoch": 2.4027303754266214, + "grad_norm": 0.7537290453910828, + "learning_rate": 0.0005203640500568828, + "loss": 1.4866, + "step": 2112 + }, + { + "epoch": 2.40386803185438, + "grad_norm": 0.5235282182693481, + "learning_rate": 0.000520136518771331, + "loss": 1.3188, + "step": 2113 + }, + { + "epoch": 2.405005688282139, + "grad_norm": 1.8466582298278809, + "learning_rate": 0.0005199089874857793, + "loss": 3.5333, + "step": 2114 + }, + { + "epoch": 2.4061433447098977, + "grad_norm": 2.3424501419067383, + "learning_rate": 0.0005196814562002276, + "loss": 1.6052, + "step": 2115 + }, + { + "epoch": 2.4072810011376564, + "grad_norm": 1.0736968517303467, + "learning_rate": 0.0005194539249146758, + "loss": 1.6249, + "step": 2116 + }, + { + "epoch": 2.408418657565415, + "grad_norm": 1.0152912139892578, + "learning_rate": 0.000519226393629124, + "loss": 2.1684, + "step": 2117 + }, + { + "epoch": 2.409556313993174, + "grad_norm": 0.896634578704834, + "learning_rate": 0.0005189988623435723, + "loss": 1.4528, + "step": 2118 + }, + { + "epoch": 2.4106939704209327, + "grad_norm": 0.5919451713562012, + "learning_rate": 0.0005187713310580204, + "loss": 1.2616, + "step": 2119 + }, + { + "epoch": 2.4118316268486915, + "grad_norm": 0.9945755004882812, + "learning_rate": 0.0005185437997724687, + "loss": 1.9749, + "step": 2120 + }, + { + "epoch": 2.4129692832764507, + "grad_norm": 0.6944431066513062, + "learning_rate": 0.000518316268486917, + "loss": 1.538, + "step": 2121 + }, + { + "epoch": 2.4141069397042094, + "grad_norm": 1.0082086324691772, + "learning_rate": 0.0005180887372013652, + "loss": 1.2937, + "step": 2122 + }, + { + "epoch": 2.415244596131968, + "grad_norm": 0.8779463768005371, + "learning_rate": 0.0005178612059158135, + "loss": 2.8401, + "step": 2123 + }, + { + "epoch": 2.416382252559727, + "grad_norm": 0.729810357093811, + "learning_rate": 0.0005176336746302617, + "loss": 2.1615, + "step": 2124 + }, + { + "epoch": 2.4175199089874857, + "grad_norm": 0.5567348003387451, + "learning_rate": 0.0005174061433447099, + "loss": 1.0664, + "step": 2125 + }, + { + "epoch": 2.4186575654152445, + "grad_norm": 0.7598175406455994, + "learning_rate": 0.0005171786120591582, + "loss": 1.6606, + "step": 2126 + }, + { + "epoch": 2.419795221843003, + "grad_norm": 0.9565114378929138, + "learning_rate": 0.0005169510807736064, + "loss": 1.97, + "step": 2127 + }, + { + "epoch": 2.4209328782707624, + "grad_norm": 1.6986292600631714, + "learning_rate": 0.0005167235494880546, + "loss": 3.6617, + "step": 2128 + }, + { + "epoch": 2.422070534698521, + "grad_norm": 0.8568170666694641, + "learning_rate": 0.0005164960182025028, + "loss": 2.0927, + "step": 2129 + }, + { + "epoch": 2.42320819112628, + "grad_norm": 1.218289852142334, + "learning_rate": 0.000516268486916951, + "loss": 2.0588, + "step": 2130 + }, + { + "epoch": 2.4243458475540387, + "grad_norm": 1.114538550376892, + "learning_rate": 0.0005160409556313993, + "loss": 2.6901, + "step": 2131 + }, + { + "epoch": 2.4254835039817975, + "grad_norm": 0.9539201855659485, + "learning_rate": 0.0005158134243458476, + "loss": 1.3669, + "step": 2132 + }, + { + "epoch": 2.426621160409556, + "grad_norm": 1.0585428476333618, + "learning_rate": 0.0005155858930602958, + "loss": 2.5264, + "step": 2133 + }, + { + "epoch": 2.427758816837315, + "grad_norm": 0.5750988125801086, + "learning_rate": 0.000515358361774744, + "loss": 1.1382, + "step": 2134 + }, + { + "epoch": 2.428896473265074, + "grad_norm": 1.5301157236099243, + "learning_rate": 0.0005151308304891923, + "loss": 2.8573, + "step": 2135 + }, + { + "epoch": 2.430034129692833, + "grad_norm": 0.6881375312805176, + "learning_rate": 0.0005149032992036405, + "loss": 1.4282, + "step": 2136 + }, + { + "epoch": 2.4311717861205917, + "grad_norm": 0.642068088054657, + "learning_rate": 0.0005146757679180887, + "loss": 0.8719, + "step": 2137 + }, + { + "epoch": 2.4323094425483505, + "grad_norm": 0.9515458941459656, + "learning_rate": 0.0005144482366325371, + "loss": 1.7866, + "step": 2138 + }, + { + "epoch": 2.4334470989761092, + "grad_norm": 1.4139736890792847, + "learning_rate": 0.0005142207053469852, + "loss": 2.3178, + "step": 2139 + }, + { + "epoch": 2.434584755403868, + "grad_norm": 0.7378571629524231, + "learning_rate": 0.0005139931740614334, + "loss": 1.5993, + "step": 2140 + }, + { + "epoch": 2.4357224118316267, + "grad_norm": 1.243595004081726, + "learning_rate": 0.0005137656427758817, + "loss": 2.0773, + "step": 2141 + }, + { + "epoch": 2.4368600682593855, + "grad_norm": 0.7528473734855652, + "learning_rate": 0.0005135381114903299, + "loss": 1.8747, + "step": 2142 + }, + { + "epoch": 2.4379977246871443, + "grad_norm": 0.9805837273597717, + "learning_rate": 0.0005133105802047782, + "loss": 2.3093, + "step": 2143 + }, + { + "epoch": 2.4391353811149035, + "grad_norm": 1.0196789503097534, + "learning_rate": 0.0005130830489192264, + "loss": 1.6322, + "step": 2144 + }, + { + "epoch": 2.4402730375426622, + "grad_norm": 1.2920022010803223, + "learning_rate": 0.0005128555176336746, + "loss": 1.0212, + "step": 2145 + }, + { + "epoch": 2.441410693970421, + "grad_norm": 1.036919116973877, + "learning_rate": 0.000512627986348123, + "loss": 2.4244, + "step": 2146 + }, + { + "epoch": 2.4425483503981797, + "grad_norm": 0.7487674951553345, + "learning_rate": 0.0005124004550625712, + "loss": 1.2761, + "step": 2147 + }, + { + "epoch": 2.4436860068259385, + "grad_norm": 0.9197911024093628, + "learning_rate": 0.0005121729237770194, + "loss": 2.1063, + "step": 2148 + }, + { + "epoch": 2.4448236632536973, + "grad_norm": 0.9425987005233765, + "learning_rate": 0.0005119453924914676, + "loss": 1.6781, + "step": 2149 + }, + { + "epoch": 2.445961319681456, + "grad_norm": 1.1719930171966553, + "learning_rate": 0.0005117178612059158, + "loss": 2.0511, + "step": 2150 + }, + { + "epoch": 2.4470989761092152, + "grad_norm": 0.8875211477279663, + "learning_rate": 0.000511490329920364, + "loss": 1.6565, + "step": 2151 + }, + { + "epoch": 2.448236632536974, + "grad_norm": 0.61222243309021, + "learning_rate": 0.0005112627986348123, + "loss": 1.1201, + "step": 2152 + }, + { + "epoch": 2.4493742889647327, + "grad_norm": 1.5771725177764893, + "learning_rate": 0.0005110352673492605, + "loss": 3.2926, + "step": 2153 + }, + { + "epoch": 2.4505119453924915, + "grad_norm": 0.8051725029945374, + "learning_rate": 0.0005108077360637087, + "loss": 1.0753, + "step": 2154 + }, + { + "epoch": 2.4516496018202503, + "grad_norm": 0.7018311619758606, + "learning_rate": 0.0005105802047781571, + "loss": 0.9978, + "step": 2155 + }, + { + "epoch": 2.452787258248009, + "grad_norm": 1.0658284425735474, + "learning_rate": 0.0005103526734926053, + "loss": 3.1038, + "step": 2156 + }, + { + "epoch": 2.453924914675768, + "grad_norm": 0.890017032623291, + "learning_rate": 0.0005101251422070535, + "loss": 1.3806, + "step": 2157 + }, + { + "epoch": 2.4550625711035265, + "grad_norm": 1.4623593091964722, + "learning_rate": 0.0005098976109215017, + "loss": 3.3608, + "step": 2158 + }, + { + "epoch": 2.4562002275312853, + "grad_norm": 1.3333830833435059, + "learning_rate": 0.0005096700796359499, + "loss": 1.9364, + "step": 2159 + }, + { + "epoch": 2.4573378839590445, + "grad_norm": 1.0337042808532715, + "learning_rate": 0.0005094425483503981, + "loss": 1.6893, + "step": 2160 + }, + { + "epoch": 2.4584755403868033, + "grad_norm": 1.2401093244552612, + "learning_rate": 0.0005092150170648464, + "loss": 1.8306, + "step": 2161 + }, + { + "epoch": 2.459613196814562, + "grad_norm": 0.6069979071617126, + "learning_rate": 0.0005089874857792946, + "loss": 1.5413, + "step": 2162 + }, + { + "epoch": 2.460750853242321, + "grad_norm": 1.1139168739318848, + "learning_rate": 0.000508759954493743, + "loss": 2.6418, + "step": 2163 + }, + { + "epoch": 2.4618885096700796, + "grad_norm": 0.7371734976768494, + "learning_rate": 0.0005085324232081912, + "loss": 1.3743, + "step": 2164 + }, + { + "epoch": 2.4630261660978383, + "grad_norm": 1.3759161233901978, + "learning_rate": 0.0005083048919226394, + "loss": 2.3257, + "step": 2165 + }, + { + "epoch": 2.464163822525597, + "grad_norm": 1.0357613563537598, + "learning_rate": 0.0005080773606370877, + "loss": 1.6746, + "step": 2166 + }, + { + "epoch": 2.4653014789533563, + "grad_norm": 0.5313310623168945, + "learning_rate": 0.0005078498293515359, + "loss": 0.8946, + "step": 2167 + }, + { + "epoch": 2.466439135381115, + "grad_norm": 0.9679710268974304, + "learning_rate": 0.000507622298065984, + "loss": 1.906, + "step": 2168 + }, + { + "epoch": 2.467576791808874, + "grad_norm": 0.7682181596755981, + "learning_rate": 0.0005073947667804323, + "loss": 1.9641, + "step": 2169 + }, + { + "epoch": 2.4687144482366326, + "grad_norm": 0.6322567462921143, + "learning_rate": 0.0005071672354948805, + "loss": 0.5872, + "step": 2170 + }, + { + "epoch": 2.4698521046643913, + "grad_norm": 0.6293168663978577, + "learning_rate": 0.0005069397042093287, + "loss": 1.1534, + "step": 2171 + }, + { + "epoch": 2.47098976109215, + "grad_norm": 0.9249051213264465, + "learning_rate": 0.0005067121729237771, + "loss": 2.3496, + "step": 2172 + }, + { + "epoch": 2.472127417519909, + "grad_norm": 0.8873506784439087, + "learning_rate": 0.0005064846416382253, + "loss": 1.5146, + "step": 2173 + }, + { + "epoch": 2.473265073947668, + "grad_norm": 0.8709667325019836, + "learning_rate": 0.0005062571103526735, + "loss": 1.8368, + "step": 2174 + }, + { + "epoch": 2.474402730375427, + "grad_norm": 0.8933395147323608, + "learning_rate": 0.0005060295790671218, + "loss": 1.5841, + "step": 2175 + }, + { + "epoch": 2.4755403868031856, + "grad_norm": 0.8149117231369019, + "learning_rate": 0.00050580204778157, + "loss": 1.6464, + "step": 2176 + }, + { + "epoch": 2.4766780432309443, + "grad_norm": 0.8341697454452515, + "learning_rate": 0.0005055745164960182, + "loss": 1.1955, + "step": 2177 + }, + { + "epoch": 2.477815699658703, + "grad_norm": 1.2008378505706787, + "learning_rate": 0.0005053469852104664, + "loss": 2.0201, + "step": 2178 + }, + { + "epoch": 2.478953356086462, + "grad_norm": 1.0396666526794434, + "learning_rate": 0.0005051194539249146, + "loss": 1.9521, + "step": 2179 + }, + { + "epoch": 2.4800910125142206, + "grad_norm": 0.7898449897766113, + "learning_rate": 0.0005048919226393628, + "loss": 1.1885, + "step": 2180 + }, + { + "epoch": 2.4812286689419794, + "grad_norm": 0.9955923557281494, + "learning_rate": 0.0005046643913538112, + "loss": 1.8145, + "step": 2181 + }, + { + "epoch": 2.482366325369738, + "grad_norm": 1.1414570808410645, + "learning_rate": 0.0005044368600682594, + "loss": 2.2897, + "step": 2182 + }, + { + "epoch": 2.4835039817974973, + "grad_norm": 1.0973477363586426, + "learning_rate": 0.0005042093287827077, + "loss": 2.2097, + "step": 2183 + }, + { + "epoch": 2.484641638225256, + "grad_norm": 0.955952525138855, + "learning_rate": 0.0005039817974971559, + "loss": 1.6976, + "step": 2184 + }, + { + "epoch": 2.485779294653015, + "grad_norm": 0.9474056959152222, + "learning_rate": 0.0005037542662116041, + "loss": 1.5724, + "step": 2185 + }, + { + "epoch": 2.4869169510807736, + "grad_norm": 0.7748664021492004, + "learning_rate": 0.0005035267349260524, + "loss": 2.1302, + "step": 2186 + }, + { + "epoch": 2.4880546075085324, + "grad_norm": 1.1004105806350708, + "learning_rate": 0.0005032992036405005, + "loss": 1.9072, + "step": 2187 + }, + { + "epoch": 2.489192263936291, + "grad_norm": 0.8701795935630798, + "learning_rate": 0.0005030716723549487, + "loss": 1.2504, + "step": 2188 + }, + { + "epoch": 2.49032992036405, + "grad_norm": 1.0033451318740845, + "learning_rate": 0.0005028441410693971, + "loss": 2.6241, + "step": 2189 + }, + { + "epoch": 2.491467576791809, + "grad_norm": 0.9840807318687439, + "learning_rate": 0.0005026166097838453, + "loss": 1.5594, + "step": 2190 + }, + { + "epoch": 2.492605233219568, + "grad_norm": 1.0026658773422241, + "learning_rate": 0.0005023890784982935, + "loss": 2.1102, + "step": 2191 + }, + { + "epoch": 2.4937428896473266, + "grad_norm": 0.8774160146713257, + "learning_rate": 0.0005021615472127418, + "loss": 2.5087, + "step": 2192 + }, + { + "epoch": 2.4948805460750854, + "grad_norm": 0.7475762963294983, + "learning_rate": 0.00050193401592719, + "loss": 0.8467, + "step": 2193 + }, + { + "epoch": 2.496018202502844, + "grad_norm": 1.0985372066497803, + "learning_rate": 0.0005017064846416382, + "loss": 2.0641, + "step": 2194 + }, + { + "epoch": 2.497155858930603, + "grad_norm": 0.8731891512870789, + "learning_rate": 0.0005014789533560865, + "loss": 2.1272, + "step": 2195 + }, + { + "epoch": 2.4982935153583616, + "grad_norm": 0.9103354215621948, + "learning_rate": 0.0005012514220705347, + "loss": 2.0572, + "step": 2196 + }, + { + "epoch": 2.4994311717861204, + "grad_norm": 1.041029691696167, + "learning_rate": 0.0005010238907849828, + "loss": 1.9749, + "step": 2197 + }, + { + "epoch": 2.500568828213879, + "grad_norm": 1.2097538709640503, + "learning_rate": 0.0005007963594994312, + "loss": 1.671, + "step": 2198 + }, + { + "epoch": 2.5017064846416384, + "grad_norm": 0.9088473916053772, + "learning_rate": 0.0005005688282138794, + "loss": 2.0683, + "step": 2199 + }, + { + "epoch": 2.502844141069397, + "grad_norm": 0.728599488735199, + "learning_rate": 0.0005003412969283276, + "loss": 1.4432, + "step": 2200 + }, + { + "epoch": 2.503981797497156, + "grad_norm": 0.7432947754859924, + "learning_rate": 0.0005001137656427759, + "loss": 1.3989, + "step": 2201 + }, + { + "epoch": 2.5051194539249146, + "grad_norm": 0.9259990453720093, + "learning_rate": 0.0004998862343572241, + "loss": 2.1398, + "step": 2202 + }, + { + "epoch": 2.5062571103526734, + "grad_norm": 1.316038966178894, + "learning_rate": 0.0004996587030716724, + "loss": 2.2743, + "step": 2203 + }, + { + "epoch": 2.507394766780432, + "grad_norm": 0.8661054372787476, + "learning_rate": 0.0004994311717861205, + "loss": 1.706, + "step": 2204 + }, + { + "epoch": 2.508532423208191, + "grad_norm": 0.8994172215461731, + "learning_rate": 0.0004992036405005689, + "loss": 1.6767, + "step": 2205 + }, + { + "epoch": 2.50967007963595, + "grad_norm": 0.683313250541687, + "learning_rate": 0.0004989761092150171, + "loss": 0.8057, + "step": 2206 + }, + { + "epoch": 2.510807736063709, + "grad_norm": 1.4154108762741089, + "learning_rate": 0.0004987485779294653, + "loss": 1.5263, + "step": 2207 + }, + { + "epoch": 2.5119453924914676, + "grad_norm": 1.0941131114959717, + "learning_rate": 0.0004985210466439136, + "loss": 1.7871, + "step": 2208 + }, + { + "epoch": 2.5130830489192264, + "grad_norm": 0.8073767423629761, + "learning_rate": 0.0004982935153583618, + "loss": 1.7233, + "step": 2209 + }, + { + "epoch": 2.514220705346985, + "grad_norm": 1.2724852561950684, + "learning_rate": 0.00049806598407281, + "loss": 2.4039, + "step": 2210 + }, + { + "epoch": 2.515358361774744, + "grad_norm": 0.6580117344856262, + "learning_rate": 0.0004978384527872582, + "loss": 1.5502, + "step": 2211 + }, + { + "epoch": 2.5164960182025027, + "grad_norm": 0.9353324174880981, + "learning_rate": 0.0004976109215017065, + "loss": 1.7699, + "step": 2212 + }, + { + "epoch": 2.517633674630262, + "grad_norm": 0.7029836177825928, + "learning_rate": 0.0004973833902161547, + "loss": 1.4815, + "step": 2213 + }, + { + "epoch": 2.51877133105802, + "grad_norm": 0.8808972239494324, + "learning_rate": 0.000497155858930603, + "loss": 1.9639, + "step": 2214 + }, + { + "epoch": 2.5199089874857794, + "grad_norm": 0.7552292943000793, + "learning_rate": 0.0004969283276450512, + "loss": 0.9553, + "step": 2215 + }, + { + "epoch": 2.521046643913538, + "grad_norm": 0.8695642948150635, + "learning_rate": 0.0004967007963594995, + "loss": 1.8042, + "step": 2216 + }, + { + "epoch": 2.522184300341297, + "grad_norm": 1.5711946487426758, + "learning_rate": 0.0004964732650739477, + "loss": 2.2919, + "step": 2217 + }, + { + "epoch": 2.5233219567690557, + "grad_norm": 1.2966930866241455, + "learning_rate": 0.0004962457337883959, + "loss": 2.9262, + "step": 2218 + }, + { + "epoch": 2.5244596131968144, + "grad_norm": 0.7041467428207397, + "learning_rate": 0.0004960182025028441, + "loss": 1.2383, + "step": 2219 + }, + { + "epoch": 2.5255972696245736, + "grad_norm": 0.8301080465316772, + "learning_rate": 0.0004957906712172923, + "loss": 1.9915, + "step": 2220 + }, + { + "epoch": 2.526734926052332, + "grad_norm": 0.8538893461227417, + "learning_rate": 0.0004955631399317406, + "loss": 2.2926, + "step": 2221 + }, + { + "epoch": 2.527872582480091, + "grad_norm": 1.1304670572280884, + "learning_rate": 0.0004953356086461889, + "loss": 2.2345, + "step": 2222 + }, + { + "epoch": 2.52901023890785, + "grad_norm": 1.1222370862960815, + "learning_rate": 0.0004951080773606372, + "loss": 1.9092, + "step": 2223 + }, + { + "epoch": 2.5301478953356087, + "grad_norm": 1.1593031883239746, + "learning_rate": 0.0004948805460750853, + "loss": 2.5001, + "step": 2224 + }, + { + "epoch": 2.5312855517633674, + "grad_norm": 0.9046427011489868, + "learning_rate": 0.0004946530147895336, + "loss": 1.8159, + "step": 2225 + }, + { + "epoch": 2.532423208191126, + "grad_norm": 0.9191851615905762, + "learning_rate": 0.0004944254835039818, + "loss": 2.3244, + "step": 2226 + }, + { + "epoch": 2.533560864618885, + "grad_norm": 0.8399704098701477, + "learning_rate": 0.00049419795221843, + "loss": 2.1157, + "step": 2227 + }, + { + "epoch": 2.5346985210466437, + "grad_norm": 1.3385767936706543, + "learning_rate": 0.0004939704209328783, + "loss": 2.495, + "step": 2228 + }, + { + "epoch": 2.535836177474403, + "grad_norm": 0.9084081053733826, + "learning_rate": 0.0004937428896473265, + "loss": 1.222, + "step": 2229 + }, + { + "epoch": 2.5369738339021617, + "grad_norm": 1.0885717868804932, + "learning_rate": 0.0004935153583617748, + "loss": 2.0583, + "step": 2230 + }, + { + "epoch": 2.5381114903299204, + "grad_norm": 0.6400699019432068, + "learning_rate": 0.000493287827076223, + "loss": 0.7095, + "step": 2231 + }, + { + "epoch": 2.539249146757679, + "grad_norm": 0.8817832469940186, + "learning_rate": 0.0004930602957906713, + "loss": 2.0175, + "step": 2232 + }, + { + "epoch": 2.540386803185438, + "grad_norm": 0.5931545495986938, + "learning_rate": 0.0004928327645051195, + "loss": 1.6168, + "step": 2233 + }, + { + "epoch": 2.5415244596131967, + "grad_norm": 1.0301717519760132, + "learning_rate": 0.0004926052332195677, + "loss": 1.5534, + "step": 2234 + }, + { + "epoch": 2.5426621160409555, + "grad_norm": 0.6240630149841309, + "learning_rate": 0.0004923777019340159, + "loss": 0.7599, + "step": 2235 + }, + { + "epoch": 2.5437997724687147, + "grad_norm": 0.7773973345756531, + "learning_rate": 0.0004921501706484642, + "loss": 2.37, + "step": 2236 + }, + { + "epoch": 2.544937428896473, + "grad_norm": 0.7759304046630859, + "learning_rate": 0.0004919226393629124, + "loss": 1.0116, + "step": 2237 + }, + { + "epoch": 2.546075085324232, + "grad_norm": 1.2322027683258057, + "learning_rate": 0.0004916951080773606, + "loss": 3.2444, + "step": 2238 + }, + { + "epoch": 2.547212741751991, + "grad_norm": 0.8914145231246948, + "learning_rate": 0.0004914675767918089, + "loss": 1.5857, + "step": 2239 + }, + { + "epoch": 2.5483503981797497, + "grad_norm": 0.9285094738006592, + "learning_rate": 0.0004912400455062571, + "loss": 1.8327, + "step": 2240 + }, + { + "epoch": 2.5494880546075085, + "grad_norm": 1.1583962440490723, + "learning_rate": 0.0004910125142207054, + "loss": 2.0139, + "step": 2241 + }, + { + "epoch": 2.5506257110352673, + "grad_norm": 1.8207440376281738, + "learning_rate": 0.0004907849829351536, + "loss": 1.8943, + "step": 2242 + }, + { + "epoch": 2.551763367463026, + "grad_norm": 1.2174469232559204, + "learning_rate": 0.0004905574516496018, + "loss": 2.1719, + "step": 2243 + }, + { + "epoch": 2.5529010238907848, + "grad_norm": 1.4703840017318726, + "learning_rate": 0.00049032992036405, + "loss": 3.2219, + "step": 2244 + }, + { + "epoch": 2.554038680318544, + "grad_norm": 0.9278658628463745, + "learning_rate": 0.0004901023890784983, + "loss": 1.4814, + "step": 2245 + }, + { + "epoch": 2.5551763367463027, + "grad_norm": 0.5682125687599182, + "learning_rate": 0.0004898748577929465, + "loss": 0.9685, + "step": 2246 + }, + { + "epoch": 2.5563139931740615, + "grad_norm": 0.5616304278373718, + "learning_rate": 0.0004896473265073948, + "loss": 1.0696, + "step": 2247 + }, + { + "epoch": 2.5574516496018203, + "grad_norm": 0.9043198227882385, + "learning_rate": 0.000489419795221843, + "loss": 1.2706, + "step": 2248 + }, + { + "epoch": 2.558589306029579, + "grad_norm": 1.1037359237670898, + "learning_rate": 0.0004891922639362913, + "loss": 1.8091, + "step": 2249 + }, + { + "epoch": 2.5597269624573378, + "grad_norm": 1.0690373182296753, + "learning_rate": 0.0004889647326507395, + "loss": 1.6337, + "step": 2250 + }, + { + "epoch": 2.5608646188850965, + "grad_norm": 0.9125576019287109, + "learning_rate": 0.0004887372013651877, + "loss": 2.0998, + "step": 2251 + }, + { + "epoch": 2.5620022753128557, + "grad_norm": 1.3516008853912354, + "learning_rate": 0.000488509670079636, + "loss": 2.317, + "step": 2252 + }, + { + "epoch": 2.5631399317406145, + "grad_norm": 1.6707299947738647, + "learning_rate": 0.0004882821387940841, + "loss": 2.8554, + "step": 2253 + }, + { + "epoch": 2.5642775881683733, + "grad_norm": 0.7115156054496765, + "learning_rate": 0.00048805460750853244, + "loss": 0.7969, + "step": 2254 + }, + { + "epoch": 2.565415244596132, + "grad_norm": 0.7895395755767822, + "learning_rate": 0.00048782707622298065, + "loss": 1.3132, + "step": 2255 + }, + { + "epoch": 2.5665529010238908, + "grad_norm": 1.3991349935531616, + "learning_rate": 0.0004875995449374289, + "loss": 2.6118, + "step": 2256 + }, + { + "epoch": 2.5676905574516495, + "grad_norm": 1.0328587293624878, + "learning_rate": 0.0004873720136518772, + "loss": 1.8474, + "step": 2257 + }, + { + "epoch": 2.5688282138794083, + "grad_norm": 0.8418103456497192, + "learning_rate": 0.00048714448236632533, + "loss": 1.9257, + "step": 2258 + }, + { + "epoch": 2.5699658703071675, + "grad_norm": 1.1641095876693726, + "learning_rate": 0.0004869169510807736, + "loss": 1.6591, + "step": 2259 + }, + { + "epoch": 2.571103526734926, + "grad_norm": 1.1286218166351318, + "learning_rate": 0.00048668941979522186, + "loss": 1.7781, + "step": 2260 + }, + { + "epoch": 2.572241183162685, + "grad_norm": 0.9085434675216675, + "learning_rate": 0.0004864618885096701, + "loss": 1.7569, + "step": 2261 + }, + { + "epoch": 2.573378839590444, + "grad_norm": 0.6849757432937622, + "learning_rate": 0.00048623435722411833, + "loss": 1.2772, + "step": 2262 + }, + { + "epoch": 2.5745164960182025, + "grad_norm": 0.6630122661590576, + "learning_rate": 0.00048600682593856654, + "loss": 1.5381, + "step": 2263 + }, + { + "epoch": 2.5756541524459613, + "grad_norm": 0.7607198357582092, + "learning_rate": 0.0004857792946530148, + "loss": 1.3695, + "step": 2264 + }, + { + "epoch": 2.57679180887372, + "grad_norm": 1.6203835010528564, + "learning_rate": 0.000485551763367463, + "loss": 2.5078, + "step": 2265 + }, + { + "epoch": 2.577929465301479, + "grad_norm": 1.9112815856933594, + "learning_rate": 0.0004853242320819113, + "loss": 1.0513, + "step": 2266 + }, + { + "epoch": 2.5790671217292376, + "grad_norm": 1.3533493280410767, + "learning_rate": 0.00048509670079635955, + "loss": 3.0494, + "step": 2267 + }, + { + "epoch": 2.580204778156997, + "grad_norm": 0.7832985520362854, + "learning_rate": 0.0004848691695108077, + "loss": 1.0865, + "step": 2268 + }, + { + "epoch": 2.5813424345847555, + "grad_norm": 0.8799665570259094, + "learning_rate": 0.00048464163822525597, + "loss": 2.473, + "step": 2269 + }, + { + "epoch": 2.5824800910125143, + "grad_norm": 1.135883092880249, + "learning_rate": 0.00048441410693970423, + "loss": 2.009, + "step": 2270 + }, + { + "epoch": 2.583617747440273, + "grad_norm": 0.740875780582428, + "learning_rate": 0.0004841865756541525, + "loss": 1.5996, + "step": 2271 + }, + { + "epoch": 2.584755403868032, + "grad_norm": 1.1426676511764526, + "learning_rate": 0.00048395904436860065, + "loss": 1.7545, + "step": 2272 + }, + { + "epoch": 2.5858930602957906, + "grad_norm": 1.1677348613739014, + "learning_rate": 0.0004837315130830489, + "loss": 3.6223, + "step": 2273 + }, + { + "epoch": 2.5870307167235493, + "grad_norm": 1.1222909688949585, + "learning_rate": 0.0004835039817974972, + "loss": 2.0058, + "step": 2274 + }, + { + "epoch": 2.5881683731513085, + "grad_norm": 0.7786708474159241, + "learning_rate": 0.0004832764505119454, + "loss": 1.3458, + "step": 2275 + }, + { + "epoch": 2.589306029579067, + "grad_norm": 0.6702552437782288, + "learning_rate": 0.00048304891922639365, + "loss": 1.3789, + "step": 2276 + }, + { + "epoch": 2.590443686006826, + "grad_norm": 1.0909732580184937, + "learning_rate": 0.00048282138794084186, + "loss": 2.0698, + "step": 2277 + }, + { + "epoch": 2.591581342434585, + "grad_norm": 0.9485477209091187, + "learning_rate": 0.00048259385665529007, + "loss": 1.8122, + "step": 2278 + }, + { + "epoch": 2.5927189988623436, + "grad_norm": 1.065976619720459, + "learning_rate": 0.00048236632536973834, + "loss": 2.2232, + "step": 2279 + }, + { + "epoch": 2.5938566552901023, + "grad_norm": 0.547852635383606, + "learning_rate": 0.0004821387940841866, + "loss": 1.5199, + "step": 2280 + }, + { + "epoch": 2.594994311717861, + "grad_norm": 0.7793285250663757, + "learning_rate": 0.00048191126279863486, + "loss": 1.6383, + "step": 2281 + }, + { + "epoch": 2.59613196814562, + "grad_norm": 1.2793906927108765, + "learning_rate": 0.000481683731513083, + "loss": 1.7647, + "step": 2282 + }, + { + "epoch": 2.5972696245733786, + "grad_norm": 0.7028161883354187, + "learning_rate": 0.0004814562002275313, + "loss": 0.9584, + "step": 2283 + }, + { + "epoch": 2.598407281001138, + "grad_norm": 0.725344717502594, + "learning_rate": 0.00048122866894197955, + "loss": 1.2296, + "step": 2284 + }, + { + "epoch": 2.5995449374288966, + "grad_norm": 0.9674602746963501, + "learning_rate": 0.00048100113765642776, + "loss": 2.275, + "step": 2285 + }, + { + "epoch": 2.6006825938566553, + "grad_norm": 0.7748560905456543, + "learning_rate": 0.000480773606370876, + "loss": 1.7323, + "step": 2286 + }, + { + "epoch": 2.601820250284414, + "grad_norm": 1.1546324491500854, + "learning_rate": 0.00048054607508532423, + "loss": 2.054, + "step": 2287 + }, + { + "epoch": 2.602957906712173, + "grad_norm": 0.7889047861099243, + "learning_rate": 0.00048031854379977244, + "loss": 1.787, + "step": 2288 + }, + { + "epoch": 2.6040955631399316, + "grad_norm": 1.0750888586044312, + "learning_rate": 0.0004800910125142207, + "loss": 2.1864, + "step": 2289 + }, + { + "epoch": 2.6052332195676904, + "grad_norm": 0.9969229698181152, + "learning_rate": 0.00047986348122866897, + "loss": 1.7515, + "step": 2290 + }, + { + "epoch": 2.6063708759954496, + "grad_norm": 0.8853392004966736, + "learning_rate": 0.00047963594994311723, + "loss": 1.1181, + "step": 2291 + }, + { + "epoch": 2.6075085324232083, + "grad_norm": 0.7410064935684204, + "learning_rate": 0.0004794084186575654, + "loss": 1.8175, + "step": 2292 + }, + { + "epoch": 2.608646188850967, + "grad_norm": 1.1709070205688477, + "learning_rate": 0.00047918088737201365, + "loss": 1.8607, + "step": 2293 + }, + { + "epoch": 2.609783845278726, + "grad_norm": 1.0198490619659424, + "learning_rate": 0.0004789533560864619, + "loss": 1.6995, + "step": 2294 + }, + { + "epoch": 2.6109215017064846, + "grad_norm": 1.765076994895935, + "learning_rate": 0.00047872582480091013, + "loss": 3.519, + "step": 2295 + }, + { + "epoch": 2.6120591581342434, + "grad_norm": 1.3096511363983154, + "learning_rate": 0.0004784982935153584, + "loss": 2.7398, + "step": 2296 + }, + { + "epoch": 2.613196814562002, + "grad_norm": 2.6314849853515625, + "learning_rate": 0.0004782707622298066, + "loss": 5.0637, + "step": 2297 + }, + { + "epoch": 2.6143344709897613, + "grad_norm": 0.6920953392982483, + "learning_rate": 0.0004780432309442548, + "loss": 1.1274, + "step": 2298 + }, + { + "epoch": 2.6154721274175197, + "grad_norm": 1.1631900072097778, + "learning_rate": 0.0004778156996587031, + "loss": 1.7851, + "step": 2299 + }, + { + "epoch": 2.616609783845279, + "grad_norm": 0.7960054874420166, + "learning_rate": 0.00047758816837315134, + "loss": 1.2316, + "step": 2300 + }, + { + "epoch": 2.6177474402730376, + "grad_norm": 1.066870927810669, + "learning_rate": 0.0004773606370875996, + "loss": 2.3944, + "step": 2301 + }, + { + "epoch": 2.6188850967007964, + "grad_norm": 0.7467948198318481, + "learning_rate": 0.00047713310580204776, + "loss": 1.1561, + "step": 2302 + }, + { + "epoch": 2.620022753128555, + "grad_norm": 0.7254666686058044, + "learning_rate": 0.000476905574516496, + "loss": 1.0797, + "step": 2303 + }, + { + "epoch": 2.621160409556314, + "grad_norm": 0.6046991944313049, + "learning_rate": 0.0004766780432309443, + "loss": 0.9486, + "step": 2304 + }, + { + "epoch": 2.6222980659840727, + "grad_norm": 1.0409255027770996, + "learning_rate": 0.0004764505119453925, + "loss": 1.9647, + "step": 2305 + }, + { + "epoch": 2.6234357224118314, + "grad_norm": 1.2402981519699097, + "learning_rate": 0.00047622298065984076, + "loss": 2.1702, + "step": 2306 + }, + { + "epoch": 2.6245733788395906, + "grad_norm": 1.274269700050354, + "learning_rate": 0.00047599544937428897, + "loss": 2.9181, + "step": 2307 + }, + { + "epoch": 2.6257110352673494, + "grad_norm": 0.724886953830719, + "learning_rate": 0.0004757679180887372, + "loss": 1.357, + "step": 2308 + }, + { + "epoch": 2.626848691695108, + "grad_norm": 0.9972879886627197, + "learning_rate": 0.00047554038680318545, + "loss": 2.0593, + "step": 2309 + }, + { + "epoch": 2.627986348122867, + "grad_norm": 0.8259227275848389, + "learning_rate": 0.0004753128555176337, + "loss": 1.7083, + "step": 2310 + }, + { + "epoch": 2.6291240045506257, + "grad_norm": 1.0254158973693848, + "learning_rate": 0.0004750853242320819, + "loss": 2.4944, + "step": 2311 + }, + { + "epoch": 2.6302616609783844, + "grad_norm": 0.8054444193840027, + "learning_rate": 0.00047485779294653013, + "loss": 1.1276, + "step": 2312 + }, + { + "epoch": 2.631399317406143, + "grad_norm": 0.7392967939376831, + "learning_rate": 0.0004746302616609784, + "loss": 1.3377, + "step": 2313 + }, + { + "epoch": 2.6325369738339024, + "grad_norm": 0.9780520796775818, + "learning_rate": 0.00047440273037542666, + "loss": 2.8929, + "step": 2314 + }, + { + "epoch": 2.6336746302616607, + "grad_norm": 1.0950572490692139, + "learning_rate": 0.00047417519908987487, + "loss": 2.1541, + "step": 2315 + }, + { + "epoch": 2.63481228668942, + "grad_norm": 0.9937504529953003, + "learning_rate": 0.0004739476678043231, + "loss": 1.8749, + "step": 2316 + }, + { + "epoch": 2.6359499431171787, + "grad_norm": 0.6954947710037231, + "learning_rate": 0.00047372013651877134, + "loss": 1.8055, + "step": 2317 + }, + { + "epoch": 2.6370875995449374, + "grad_norm": 0.8226844668388367, + "learning_rate": 0.00047349260523321955, + "loss": 1.5377, + "step": 2318 + }, + { + "epoch": 2.638225255972696, + "grad_norm": 1.9866377115249634, + "learning_rate": 0.0004732650739476678, + "loss": 4.2124, + "step": 2319 + }, + { + "epoch": 2.639362912400455, + "grad_norm": 1.1584763526916504, + "learning_rate": 0.0004730375426621161, + "loss": 1.5185, + "step": 2320 + }, + { + "epoch": 2.640500568828214, + "grad_norm": 0.9889481067657471, + "learning_rate": 0.0004728100113765643, + "loss": 2.2838, + "step": 2321 + }, + { + "epoch": 2.6416382252559725, + "grad_norm": 0.9309085011482239, + "learning_rate": 0.0004725824800910125, + "loss": 1.152, + "step": 2322 + }, + { + "epoch": 2.6427758816837317, + "grad_norm": 1.0950933694839478, + "learning_rate": 0.00047235494880546076, + "loss": 2.3803, + "step": 2323 + }, + { + "epoch": 2.6439135381114904, + "grad_norm": 0.8817667365074158, + "learning_rate": 0.00047212741751990903, + "loss": 1.4906, + "step": 2324 + }, + { + "epoch": 2.645051194539249, + "grad_norm": 0.8397241830825806, + "learning_rate": 0.00047189988623435724, + "loss": 1.8036, + "step": 2325 + }, + { + "epoch": 2.646188850967008, + "grad_norm": 1.4048362970352173, + "learning_rate": 0.00047167235494880545, + "loss": 2.0251, + "step": 2326 + }, + { + "epoch": 2.6473265073947667, + "grad_norm": 1.4513121843338013, + "learning_rate": 0.0004714448236632537, + "loss": 1.8078, + "step": 2327 + }, + { + "epoch": 2.6484641638225255, + "grad_norm": 0.9570140838623047, + "learning_rate": 0.0004712172923777019, + "loss": 1.8617, + "step": 2328 + }, + { + "epoch": 2.6496018202502842, + "grad_norm": 0.8819069862365723, + "learning_rate": 0.0004709897610921502, + "loss": 1.8745, + "step": 2329 + }, + { + "epoch": 2.6507394766780434, + "grad_norm": 1.094341516494751, + "learning_rate": 0.00047076222980659845, + "loss": 1.788, + "step": 2330 + }, + { + "epoch": 2.651877133105802, + "grad_norm": 0.832409143447876, + "learning_rate": 0.00047053469852104666, + "loss": 2.2568, + "step": 2331 + }, + { + "epoch": 2.653014789533561, + "grad_norm": 0.9308044910430908, + "learning_rate": 0.00047030716723549487, + "loss": 2.3719, + "step": 2332 + }, + { + "epoch": 2.6541524459613197, + "grad_norm": 1.2896952629089355, + "learning_rate": 0.00047007963594994313, + "loss": 3.7022, + "step": 2333 + }, + { + "epoch": 2.6552901023890785, + "grad_norm": 1.0395236015319824, + "learning_rate": 0.0004698521046643914, + "loss": 2.8605, + "step": 2334 + }, + { + "epoch": 2.6564277588168372, + "grad_norm": 0.8033049702644348, + "learning_rate": 0.0004696245733788396, + "loss": 1.5591, + "step": 2335 + }, + { + "epoch": 2.657565415244596, + "grad_norm": 0.798548698425293, + "learning_rate": 0.0004693970420932878, + "loss": 1.158, + "step": 2336 + }, + { + "epoch": 2.658703071672355, + "grad_norm": 0.7860295176506042, + "learning_rate": 0.0004691695108077361, + "loss": 2.112, + "step": 2337 + }, + { + "epoch": 2.6598407281001135, + "grad_norm": 0.8449381589889526, + "learning_rate": 0.0004689419795221843, + "loss": 1.5287, + "step": 2338 + }, + { + "epoch": 2.6609783845278727, + "grad_norm": 1.0243216753005981, + "learning_rate": 0.00046871444823663256, + "loss": 1.6321, + "step": 2339 + }, + { + "epoch": 2.6621160409556315, + "grad_norm": 0.9676476716995239, + "learning_rate": 0.0004684869169510808, + "loss": 2.4557, + "step": 2340 + }, + { + "epoch": 2.6632536973833902, + "grad_norm": 0.9129576683044434, + "learning_rate": 0.000468259385665529, + "loss": 1.8287, + "step": 2341 + }, + { + "epoch": 2.664391353811149, + "grad_norm": 1.1413075923919678, + "learning_rate": 0.00046803185437997724, + "loss": 2.0987, + "step": 2342 + }, + { + "epoch": 2.6655290102389078, + "grad_norm": 0.7088634967803955, + "learning_rate": 0.0004678043230944255, + "loss": 1.6769, + "step": 2343 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.8204455971717834, + "learning_rate": 0.00046757679180887377, + "loss": 2.0439, + "step": 2344 + }, + { + "epoch": 2.6678043230944253, + "grad_norm": 0.7846645712852478, + "learning_rate": 0.0004673492605233219, + "loss": 1.4144, + "step": 2345 + }, + { + "epoch": 2.6689419795221845, + "grad_norm": 0.8308724761009216, + "learning_rate": 0.0004671217292377702, + "loss": 1.2093, + "step": 2346 + }, + { + "epoch": 2.6700796359499432, + "grad_norm": 1.0567412376403809, + "learning_rate": 0.00046689419795221845, + "loss": 1.8646, + "step": 2347 + }, + { + "epoch": 2.671217292377702, + "grad_norm": 0.986167311668396, + "learning_rate": 0.00046666666666666666, + "loss": 2.4904, + "step": 2348 + }, + { + "epoch": 2.6723549488054608, + "grad_norm": 1.8298397064208984, + "learning_rate": 0.0004664391353811149, + "loss": 2.9794, + "step": 2349 + }, + { + "epoch": 2.6734926052332195, + "grad_norm": 0.6814413070678711, + "learning_rate": 0.00046621160409556314, + "loss": 1.7104, + "step": 2350 + }, + { + "epoch": 2.6746302616609783, + "grad_norm": 1.0616414546966553, + "learning_rate": 0.00046598407281001135, + "loss": 2.269, + "step": 2351 + }, + { + "epoch": 2.675767918088737, + "grad_norm": 1.0702776908874512, + "learning_rate": 0.0004657565415244596, + "loss": 1.9667, + "step": 2352 + }, + { + "epoch": 2.6769055745164962, + "grad_norm": 1.4715807437896729, + "learning_rate": 0.0004655290102389079, + "loss": 3.1964, + "step": 2353 + }, + { + "epoch": 2.6780432309442546, + "grad_norm": 0.9601611495018005, + "learning_rate": 0.00046530147895335614, + "loss": 2.5208, + "step": 2354 + }, + { + "epoch": 2.6791808873720138, + "grad_norm": 1.083862543106079, + "learning_rate": 0.0004650739476678043, + "loss": 1.8693, + "step": 2355 + }, + { + "epoch": 2.6803185437997725, + "grad_norm": 1.272933006286621, + "learning_rate": 0.00046484641638225256, + "loss": 3.148, + "step": 2356 + }, + { + "epoch": 2.6814562002275313, + "grad_norm": 1.0518128871917725, + "learning_rate": 0.0004646188850967008, + "loss": 1.2337, + "step": 2357 + }, + { + "epoch": 2.68259385665529, + "grad_norm": 0.9924670457839966, + "learning_rate": 0.00046439135381114903, + "loss": 1.4861, + "step": 2358 + }, + { + "epoch": 2.683731513083049, + "grad_norm": 1.0956393480300903, + "learning_rate": 0.0004641638225255973, + "loss": 1.7977, + "step": 2359 + }, + { + "epoch": 2.684869169510808, + "grad_norm": 1.3490567207336426, + "learning_rate": 0.0004639362912400455, + "loss": 2.5556, + "step": 2360 + }, + { + "epoch": 2.6860068259385663, + "grad_norm": 0.8191376328468323, + "learning_rate": 0.0004637087599544937, + "loss": 1.7643, + "step": 2361 + }, + { + "epoch": 2.6871444823663255, + "grad_norm": 0.9714041948318481, + "learning_rate": 0.000463481228668942, + "loss": 1.9565, + "step": 2362 + }, + { + "epoch": 2.6882821387940843, + "grad_norm": 1.045387625694275, + "learning_rate": 0.00046325369738339024, + "loss": 2.746, + "step": 2363 + }, + { + "epoch": 2.689419795221843, + "grad_norm": 0.934880793094635, + "learning_rate": 0.0004630261660978385, + "loss": 2.6093, + "step": 2364 + }, + { + "epoch": 2.690557451649602, + "grad_norm": 0.7764321565628052, + "learning_rate": 0.00046279863481228666, + "loss": 1.0081, + "step": 2365 + }, + { + "epoch": 2.6916951080773606, + "grad_norm": 0.7609637379646301, + "learning_rate": 0.00046257110352673493, + "loss": 1.7661, + "step": 2366 + }, + { + "epoch": 2.6928327645051193, + "grad_norm": 0.8447152376174927, + "learning_rate": 0.0004623435722411832, + "loss": 1.5554, + "step": 2367 + }, + { + "epoch": 2.693970420932878, + "grad_norm": 0.76449054479599, + "learning_rate": 0.0004621160409556314, + "loss": 1.6179, + "step": 2368 + }, + { + "epoch": 2.6951080773606373, + "grad_norm": 0.7846193313598633, + "learning_rate": 0.00046188850967007967, + "loss": 1.0255, + "step": 2369 + }, + { + "epoch": 2.696245733788396, + "grad_norm": 0.8696131706237793, + "learning_rate": 0.0004616609783845279, + "loss": 2.0406, + "step": 2370 + }, + { + "epoch": 2.697383390216155, + "grad_norm": 0.9892042279243469, + "learning_rate": 0.0004614334470989761, + "loss": 2.7178, + "step": 2371 + }, + { + "epoch": 2.6985210466439136, + "grad_norm": 1.1036131381988525, + "learning_rate": 0.00046120591581342435, + "loss": 1.5833, + "step": 2372 + }, + { + "epoch": 2.6996587030716723, + "grad_norm": 0.9095990061759949, + "learning_rate": 0.0004609783845278726, + "loss": 2.3233, + "step": 2373 + }, + { + "epoch": 2.700796359499431, + "grad_norm": 1.0550446510314941, + "learning_rate": 0.0004607508532423209, + "loss": 1.5132, + "step": 2374 + }, + { + "epoch": 2.70193401592719, + "grad_norm": 0.984180748462677, + "learning_rate": 0.00046052332195676903, + "loss": 2.2976, + "step": 2375 + }, + { + "epoch": 2.703071672354949, + "grad_norm": 0.9732262492179871, + "learning_rate": 0.0004602957906712173, + "loss": 1.767, + "step": 2376 + }, + { + "epoch": 2.7042093287827074, + "grad_norm": 0.858201265335083, + "learning_rate": 0.00046006825938566556, + "loss": 1.3972, + "step": 2377 + }, + { + "epoch": 2.7053469852104666, + "grad_norm": 0.9151699542999268, + "learning_rate": 0.00045984072810011377, + "loss": 1.821, + "step": 2378 + }, + { + "epoch": 2.7064846416382253, + "grad_norm": 1.1489654779434204, + "learning_rate": 0.000459613196814562, + "loss": 1.3073, + "step": 2379 + }, + { + "epoch": 2.707622298065984, + "grad_norm": 1.2800509929656982, + "learning_rate": 0.00045938566552901025, + "loss": 1.9937, + "step": 2380 + }, + { + "epoch": 2.708759954493743, + "grad_norm": 0.6855766177177429, + "learning_rate": 0.00045915813424345846, + "loss": 0.7626, + "step": 2381 + }, + { + "epoch": 2.7098976109215016, + "grad_norm": 0.8592435717582703, + "learning_rate": 0.0004589306029579067, + "loss": 1.4657, + "step": 2382 + }, + { + "epoch": 2.7110352673492604, + "grad_norm": 0.6695002913475037, + "learning_rate": 0.000458703071672355, + "loss": 0.964, + "step": 2383 + }, + { + "epoch": 2.712172923777019, + "grad_norm": 0.6979312300682068, + "learning_rate": 0.0004584755403868032, + "loss": 1.4597, + "step": 2384 + }, + { + "epoch": 2.7133105802047783, + "grad_norm": 1.0145376920700073, + "learning_rate": 0.0004582480091012514, + "loss": 2.5069, + "step": 2385 + }, + { + "epoch": 2.714448236632537, + "grad_norm": 0.6209183931350708, + "learning_rate": 0.00045802047781569967, + "loss": 1.0564, + "step": 2386 + }, + { + "epoch": 2.715585893060296, + "grad_norm": 0.9510672688484192, + "learning_rate": 0.00045779294653014793, + "loss": 1.5424, + "step": 2387 + }, + { + "epoch": 2.7167235494880546, + "grad_norm": 0.911859393119812, + "learning_rate": 0.00045756541524459614, + "loss": 1.6315, + "step": 2388 + }, + { + "epoch": 2.7178612059158134, + "grad_norm": 0.7897265553474426, + "learning_rate": 0.00045733788395904435, + "loss": 1.3671, + "step": 2389 + }, + { + "epoch": 2.718998862343572, + "grad_norm": 0.8800843358039856, + "learning_rate": 0.0004571103526734926, + "loss": 2.1943, + "step": 2390 + }, + { + "epoch": 2.720136518771331, + "grad_norm": 1.3472900390625, + "learning_rate": 0.0004568828213879408, + "loss": 2.8666, + "step": 2391 + }, + { + "epoch": 2.72127417519909, + "grad_norm": 1.0880565643310547, + "learning_rate": 0.0004566552901023891, + "loss": 1.939, + "step": 2392 + }, + { + "epoch": 2.722411831626849, + "grad_norm": 0.8288098573684692, + "learning_rate": 0.00045642775881683735, + "loss": 2.334, + "step": 2393 + }, + { + "epoch": 2.7235494880546076, + "grad_norm": 1.0421987771987915, + "learning_rate": 0.00045620022753128556, + "loss": 2.0377, + "step": 2394 + }, + { + "epoch": 2.7246871444823664, + "grad_norm": 1.3530818223953247, + "learning_rate": 0.0004559726962457338, + "loss": 3.6316, + "step": 2395 + }, + { + "epoch": 2.725824800910125, + "grad_norm": 0.9477719068527222, + "learning_rate": 0.00045574516496018204, + "loss": 2.0223, + "step": 2396 + }, + { + "epoch": 2.726962457337884, + "grad_norm": 1.1411749124526978, + "learning_rate": 0.0004555176336746303, + "loss": 2.1665, + "step": 2397 + }, + { + "epoch": 2.7281001137656427, + "grad_norm": 1.3905211687088013, + "learning_rate": 0.0004552901023890785, + "loss": 1.8378, + "step": 2398 + }, + { + "epoch": 2.729237770193402, + "grad_norm": 0.7103641629219055, + "learning_rate": 0.0004550625711035267, + "loss": 1.6213, + "step": 2399 + }, + { + "epoch": 2.73037542662116, + "grad_norm": 0.7716681361198425, + "learning_rate": 0.000454835039817975, + "loss": 1.0399, + "step": 2400 + }, + { + "epoch": 2.7315130830489194, + "grad_norm": 0.6330393552780151, + "learning_rate": 0.0004546075085324232, + "loss": 1.2756, + "step": 2401 + }, + { + "epoch": 2.732650739476678, + "grad_norm": 1.137176513671875, + "learning_rate": 0.00045437997724687146, + "loss": 3.3, + "step": 2402 + }, + { + "epoch": 2.733788395904437, + "grad_norm": 0.9734787940979004, + "learning_rate": 0.0004541524459613197, + "loss": 1.4292, + "step": 2403 + }, + { + "epoch": 2.7349260523321957, + "grad_norm": 0.896617591381073, + "learning_rate": 0.00045392491467576793, + "loss": 0.7397, + "step": 2404 + }, + { + "epoch": 2.7360637087599544, + "grad_norm": 1.0114847421646118, + "learning_rate": 0.00045369738339021614, + "loss": 1.7089, + "step": 2405 + }, + { + "epoch": 2.737201365187713, + "grad_norm": 0.9761648774147034, + "learning_rate": 0.0004534698521046644, + "loss": 1.9788, + "step": 2406 + }, + { + "epoch": 2.738339021615472, + "grad_norm": 1.1160939931869507, + "learning_rate": 0.00045324232081911267, + "loss": 2.4602, + "step": 2407 + }, + { + "epoch": 2.739476678043231, + "grad_norm": 0.9416034817695618, + "learning_rate": 0.0004530147895335609, + "loss": 1.6489, + "step": 2408 + }, + { + "epoch": 2.74061433447099, + "grad_norm": 0.9150708913803101, + "learning_rate": 0.0004527872582480091, + "loss": 1.4331, + "step": 2409 + }, + { + "epoch": 2.7417519908987487, + "grad_norm": 1.14388108253479, + "learning_rate": 0.00045255972696245736, + "loss": 1.9662, + "step": 2410 + }, + { + "epoch": 2.7428896473265074, + "grad_norm": 0.8412722945213318, + "learning_rate": 0.00045233219567690557, + "loss": 1.2538, + "step": 2411 + }, + { + "epoch": 2.744027303754266, + "grad_norm": 1.3321762084960938, + "learning_rate": 0.00045210466439135383, + "loss": 1.693, + "step": 2412 + }, + { + "epoch": 2.745164960182025, + "grad_norm": 0.9153639674186707, + "learning_rate": 0.00045187713310580204, + "loss": 1.625, + "step": 2413 + }, + { + "epoch": 2.7463026166097837, + "grad_norm": 0.8170965313911438, + "learning_rate": 0.0004516496018202503, + "loss": 1.6253, + "step": 2414 + }, + { + "epoch": 2.747440273037543, + "grad_norm": 0.7869139909744263, + "learning_rate": 0.0004514220705346985, + "loss": 1.4511, + "step": 2415 + }, + { + "epoch": 2.748577929465301, + "grad_norm": 1.0545377731323242, + "learning_rate": 0.0004511945392491468, + "loss": 1.6514, + "step": 2416 + }, + { + "epoch": 2.7497155858930604, + "grad_norm": 1.045305848121643, + "learning_rate": 0.00045096700796359504, + "loss": 1.2496, + "step": 2417 + }, + { + "epoch": 2.750853242320819, + "grad_norm": 1.186232566833496, + "learning_rate": 0.0004507394766780432, + "loss": 2.2593, + "step": 2418 + }, + { + "epoch": 2.751990898748578, + "grad_norm": 0.8955073356628418, + "learning_rate": 0.00045051194539249146, + "loss": 1.1633, + "step": 2419 + }, + { + "epoch": 2.7531285551763367, + "grad_norm": 1.0408909320831299, + "learning_rate": 0.0004502844141069397, + "loss": 1.6361, + "step": 2420 + }, + { + "epoch": 2.7542662116040955, + "grad_norm": 1.312453031539917, + "learning_rate": 0.00045005688282138794, + "loss": 1.7388, + "step": 2421 + }, + { + "epoch": 2.755403868031854, + "grad_norm": 1.0657882690429688, + "learning_rate": 0.0004498293515358362, + "loss": 2.0169, + "step": 2422 + }, + { + "epoch": 2.756541524459613, + "grad_norm": 1.0726020336151123, + "learning_rate": 0.0004496018202502844, + "loss": 1.5911, + "step": 2423 + }, + { + "epoch": 2.757679180887372, + "grad_norm": 0.9591920375823975, + "learning_rate": 0.0004493742889647327, + "loss": 1.6334, + "step": 2424 + }, + { + "epoch": 2.758816837315131, + "grad_norm": 1.259514570236206, + "learning_rate": 0.0004491467576791809, + "loss": 2.5346, + "step": 2425 + }, + { + "epoch": 2.7599544937428897, + "grad_norm": 0.8274584412574768, + "learning_rate": 0.00044891922639362915, + "loss": 1.5429, + "step": 2426 + }, + { + "epoch": 2.7610921501706485, + "grad_norm": 1.121607780456543, + "learning_rate": 0.0004486916951080774, + "loss": 1.9823, + "step": 2427 + }, + { + "epoch": 2.7622298065984072, + "grad_norm": 1.2291669845581055, + "learning_rate": 0.00044846416382252557, + "loss": 1.9587, + "step": 2428 + }, + { + "epoch": 2.763367463026166, + "grad_norm": 0.8511943817138672, + "learning_rate": 0.00044823663253697383, + "loss": 1.1893, + "step": 2429 + }, + { + "epoch": 2.7645051194539247, + "grad_norm": 0.9760825037956238, + "learning_rate": 0.0004480091012514221, + "loss": 1.1563, + "step": 2430 + }, + { + "epoch": 2.765642775881684, + "grad_norm": 0.7539849877357483, + "learning_rate": 0.0004477815699658703, + "loss": 1.6624, + "step": 2431 + }, + { + "epoch": 2.7667804323094427, + "grad_norm": 1.0250943899154663, + "learning_rate": 0.00044755403868031857, + "loss": 1.4936, + "step": 2432 + }, + { + "epoch": 2.7679180887372015, + "grad_norm": 0.6538174152374268, + "learning_rate": 0.0004473265073947668, + "loss": 1.3009, + "step": 2433 + }, + { + "epoch": 2.7690557451649602, + "grad_norm": 0.6914688944816589, + "learning_rate": 0.00044709897610921504, + "loss": 1.0585, + "step": 2434 + }, + { + "epoch": 2.770193401592719, + "grad_norm": 0.9097804427146912, + "learning_rate": 0.00044687144482366325, + "loss": 1.5146, + "step": 2435 + }, + { + "epoch": 2.7713310580204777, + "grad_norm": 1.8249222040176392, + "learning_rate": 0.0004466439135381115, + "loss": 4.1139, + "step": 2436 + }, + { + "epoch": 2.7724687144482365, + "grad_norm": 0.8782042860984802, + "learning_rate": 0.0004464163822525598, + "loss": 1.4278, + "step": 2437 + }, + { + "epoch": 2.7736063708759957, + "grad_norm": 0.8963881134986877, + "learning_rate": 0.00044618885096700794, + "loss": 0.8318, + "step": 2438 + }, + { + "epoch": 2.774744027303754, + "grad_norm": 0.768720805644989, + "learning_rate": 0.0004459613196814562, + "loss": 1.2816, + "step": 2439 + }, + { + "epoch": 2.7758816837315132, + "grad_norm": 1.2364832162857056, + "learning_rate": 0.00044573378839590447, + "loss": 2.7189, + "step": 2440 + }, + { + "epoch": 2.777019340159272, + "grad_norm": 0.8980220556259155, + "learning_rate": 0.0004455062571103527, + "loss": 2.1147, + "step": 2441 + }, + { + "epoch": 2.7781569965870307, + "grad_norm": 1.1556415557861328, + "learning_rate": 0.00044527872582480094, + "loss": 2.5734, + "step": 2442 + }, + { + "epoch": 2.7792946530147895, + "grad_norm": 1.0572619438171387, + "learning_rate": 0.00044505119453924915, + "loss": 1.7883, + "step": 2443 + }, + { + "epoch": 2.7804323094425483, + "grad_norm": 1.2387428283691406, + "learning_rate": 0.00044482366325369736, + "loss": 1.9794, + "step": 2444 + }, + { + "epoch": 2.781569965870307, + "grad_norm": 0.6518829464912415, + "learning_rate": 0.0004445961319681456, + "loss": 1.567, + "step": 2445 + }, + { + "epoch": 2.782707622298066, + "grad_norm": 0.9689300060272217, + "learning_rate": 0.0004443686006825939, + "loss": 1.8925, + "step": 2446 + }, + { + "epoch": 2.783845278725825, + "grad_norm": 1.2153396606445312, + "learning_rate": 0.00044414106939704215, + "loss": 2.3077, + "step": 2447 + }, + { + "epoch": 2.7849829351535837, + "grad_norm": 0.9673851728439331, + "learning_rate": 0.0004439135381114903, + "loss": 1.3106, + "step": 2448 + }, + { + "epoch": 2.7861205915813425, + "grad_norm": 1.2174962759017944, + "learning_rate": 0.00044368600682593857, + "loss": 2.8822, + "step": 2449 + }, + { + "epoch": 2.7872582480091013, + "grad_norm": 1.0069944858551025, + "learning_rate": 0.00044345847554038684, + "loss": 1.4753, + "step": 2450 + }, + { + "epoch": 2.78839590443686, + "grad_norm": 1.794924020767212, + "learning_rate": 0.00044323094425483504, + "loss": 3.3568, + "step": 2451 + }, + { + "epoch": 2.789533560864619, + "grad_norm": 1.3522320985794067, + "learning_rate": 0.00044300341296928325, + "loss": 2.2246, + "step": 2452 + }, + { + "epoch": 2.7906712172923775, + "grad_norm": 1.126507043838501, + "learning_rate": 0.0004427758816837315, + "loss": 2.1909, + "step": 2453 + }, + { + "epoch": 2.7918088737201368, + "grad_norm": 0.840438961982727, + "learning_rate": 0.00044254835039817973, + "loss": 1.315, + "step": 2454 + }, + { + "epoch": 2.792946530147895, + "grad_norm": 7.848939418792725, + "learning_rate": 0.000442320819112628, + "loss": 2.2406, + "step": 2455 + }, + { + "epoch": 2.7940841865756543, + "grad_norm": 1.4858578443527222, + "learning_rate": 0.00044209328782707626, + "loss": 3.2399, + "step": 2456 + }, + { + "epoch": 2.795221843003413, + "grad_norm": 0.7680813670158386, + "learning_rate": 0.00044186575654152447, + "loss": 1.4417, + "step": 2457 + }, + { + "epoch": 2.796359499431172, + "grad_norm": 1.2522087097167969, + "learning_rate": 0.0004416382252559727, + "loss": 2.4295, + "step": 2458 + }, + { + "epoch": 2.7974971558589306, + "grad_norm": 1.1698752641677856, + "learning_rate": 0.00044141069397042094, + "loss": 1.9101, + "step": 2459 + }, + { + "epoch": 2.7986348122866893, + "grad_norm": 0.5076554417610168, + "learning_rate": 0.0004411831626848692, + "loss": 0.9025, + "step": 2460 + }, + { + "epoch": 2.799772468714448, + "grad_norm": 0.7442627549171448, + "learning_rate": 0.0004409556313993174, + "loss": 1.3259, + "step": 2461 + }, + { + "epoch": 2.800910125142207, + "grad_norm": 1.1652323007583618, + "learning_rate": 0.0004407281001137656, + "loss": 1.3523, + "step": 2462 + }, + { + "epoch": 2.802047781569966, + "grad_norm": 1.1334046125411987, + "learning_rate": 0.0004405005688282139, + "loss": 2.4571, + "step": 2463 + }, + { + "epoch": 2.803185437997725, + "grad_norm": 1.446699857711792, + "learning_rate": 0.0004402730375426621, + "loss": 3.8358, + "step": 2464 + }, + { + "epoch": 2.8043230944254836, + "grad_norm": 0.8421689867973328, + "learning_rate": 0.00044004550625711036, + "loss": 2.1174, + "step": 2465 + }, + { + "epoch": 2.8054607508532423, + "grad_norm": 1.1578481197357178, + "learning_rate": 0.0004398179749715586, + "loss": 2.3561, + "step": 2466 + }, + { + "epoch": 2.806598407281001, + "grad_norm": 1.4153037071228027, + "learning_rate": 0.00043959044368600684, + "loss": 3.3699, + "step": 2467 + }, + { + "epoch": 2.80773606370876, + "grad_norm": 0.9994482398033142, + "learning_rate": 0.00043936291240045505, + "loss": 2.3389, + "step": 2468 + }, + { + "epoch": 2.8088737201365186, + "grad_norm": 1.0240802764892578, + "learning_rate": 0.0004391353811149033, + "loss": 1.4459, + "step": 2469 + }, + { + "epoch": 2.810011376564278, + "grad_norm": 0.505115807056427, + "learning_rate": 0.0004389078498293516, + "loss": 0.5616, + "step": 2470 + }, + { + "epoch": 2.8111490329920366, + "grad_norm": 1.041799545288086, + "learning_rate": 0.0004386803185437998, + "loss": 1.6856, + "step": 2471 + }, + { + "epoch": 2.8122866894197953, + "grad_norm": 0.7291197776794434, + "learning_rate": 0.000438452787258248, + "loss": 1.8971, + "step": 2472 + }, + { + "epoch": 2.813424345847554, + "grad_norm": 0.7052469253540039, + "learning_rate": 0.00043822525597269626, + "loss": 1.6314, + "step": 2473 + }, + { + "epoch": 2.814562002275313, + "grad_norm": 1.3250932693481445, + "learning_rate": 0.00043799772468714447, + "loss": 1.6973, + "step": 2474 + }, + { + "epoch": 2.8156996587030716, + "grad_norm": 0.8435229659080505, + "learning_rate": 0.00043777019340159273, + "loss": 1.387, + "step": 2475 + }, + { + "epoch": 2.8168373151308304, + "grad_norm": 0.8370683193206787, + "learning_rate": 0.000437542662116041, + "loss": 1.1863, + "step": 2476 + }, + { + "epoch": 2.8179749715585896, + "grad_norm": 0.940768301486969, + "learning_rate": 0.0004373151308304892, + "loss": 1.4706, + "step": 2477 + }, + { + "epoch": 2.819112627986348, + "grad_norm": 0.8866887092590332, + "learning_rate": 0.0004370875995449374, + "loss": 2.093, + "step": 2478 + }, + { + "epoch": 2.820250284414107, + "grad_norm": 0.6502683162689209, + "learning_rate": 0.0004368600682593857, + "loss": 1.17, + "step": 2479 + }, + { + "epoch": 2.821387940841866, + "grad_norm": 1.3090890645980835, + "learning_rate": 0.00043663253697383394, + "loss": 2.7351, + "step": 2480 + }, + { + "epoch": 2.8225255972696246, + "grad_norm": 0.6929295063018799, + "learning_rate": 0.00043640500568828215, + "loss": 1.0882, + "step": 2481 + }, + { + "epoch": 2.8236632536973834, + "grad_norm": 1.296984314918518, + "learning_rate": 0.00043617747440273036, + "loss": 2.5549, + "step": 2482 + }, + { + "epoch": 2.824800910125142, + "grad_norm": 1.1640862226486206, + "learning_rate": 0.00043594994311717863, + "loss": 2.2164, + "step": 2483 + }, + { + "epoch": 2.825938566552901, + "grad_norm": 0.8431540727615356, + "learning_rate": 0.00043572241183162684, + "loss": 1.8785, + "step": 2484 + }, + { + "epoch": 2.8270762229806596, + "grad_norm": 1.0226455926895142, + "learning_rate": 0.0004354948805460751, + "loss": 2.7847, + "step": 2485 + }, + { + "epoch": 2.828213879408419, + "grad_norm": 0.9944515824317932, + "learning_rate": 0.0004352673492605233, + "loss": 3.0653, + "step": 2486 + }, + { + "epoch": 2.8293515358361776, + "grad_norm": 0.989396333694458, + "learning_rate": 0.0004350398179749716, + "loss": 1.5644, + "step": 2487 + }, + { + "epoch": 2.8304891922639364, + "grad_norm": 1.1219333410263062, + "learning_rate": 0.0004348122866894198, + "loss": 3.0597, + "step": 2488 + }, + { + "epoch": 2.831626848691695, + "grad_norm": 0.9987985491752625, + "learning_rate": 0.00043458475540386805, + "loss": 1.918, + "step": 2489 + }, + { + "epoch": 2.832764505119454, + "grad_norm": 1.3687235116958618, + "learning_rate": 0.0004343572241183163, + "loss": 2.0344, + "step": 2490 + }, + { + "epoch": 2.8339021615472126, + "grad_norm": 0.5967879891395569, + "learning_rate": 0.00043412969283276447, + "loss": 1.1413, + "step": 2491 + }, + { + "epoch": 2.8350398179749714, + "grad_norm": 0.7929881811141968, + "learning_rate": 0.00043390216154721273, + "loss": 1.3015, + "step": 2492 + }, + { + "epoch": 2.8361774744027306, + "grad_norm": 1.2926918268203735, + "learning_rate": 0.000433674630261661, + "loss": 2.332, + "step": 2493 + }, + { + "epoch": 2.837315130830489, + "grad_norm": 0.7639207243919373, + "learning_rate": 0.0004334470989761092, + "loss": 1.7233, + "step": 2494 + }, + { + "epoch": 2.838452787258248, + "grad_norm": 0.7680888772010803, + "learning_rate": 0.00043321956769055747, + "loss": 0.8933, + "step": 2495 + }, + { + "epoch": 2.839590443686007, + "grad_norm": 0.9917737245559692, + "learning_rate": 0.0004329920364050057, + "loss": 1.8064, + "step": 2496 + }, + { + "epoch": 2.8407281001137656, + "grad_norm": 0.8714781403541565, + "learning_rate": 0.00043276450511945395, + "loss": 1.688, + "step": 2497 + }, + { + "epoch": 2.8418657565415244, + "grad_norm": 1.283627986907959, + "learning_rate": 0.00043253697383390216, + "loss": 1.9438, + "step": 2498 + }, + { + "epoch": 2.843003412969283, + "grad_norm": 0.9072063565254211, + "learning_rate": 0.0004323094425483504, + "loss": 1.5175, + "step": 2499 + }, + { + "epoch": 2.8441410693970424, + "grad_norm": 0.9820964336395264, + "learning_rate": 0.0004320819112627987, + "loss": 3.0533, + "step": 2500 + }, + { + "epoch": 2.8452787258248007, + "grad_norm": 1.6198686361312866, + "learning_rate": 0.00043185437997724684, + "loss": 2.8954, + "step": 2501 + }, + { + "epoch": 2.84641638225256, + "grad_norm": 1.0898877382278442, + "learning_rate": 0.0004316268486916951, + "loss": 1.7376, + "step": 2502 + }, + { + "epoch": 2.8475540386803186, + "grad_norm": 0.8086861968040466, + "learning_rate": 0.00043139931740614337, + "loss": 1.8419, + "step": 2503 + }, + { + "epoch": 2.8486916951080774, + "grad_norm": 0.9705652594566345, + "learning_rate": 0.0004311717861205916, + "loss": 1.2042, + "step": 2504 + }, + { + "epoch": 2.849829351535836, + "grad_norm": 0.7838239669799805, + "learning_rate": 0.00043094425483503984, + "loss": 0.9004, + "step": 2505 + }, + { + "epoch": 2.850967007963595, + "grad_norm": 3.2648749351501465, + "learning_rate": 0.00043071672354948805, + "loss": 1.1682, + "step": 2506 + }, + { + "epoch": 2.8521046643913537, + "grad_norm": 1.287819743156433, + "learning_rate": 0.0004304891922639363, + "loss": 2.8992, + "step": 2507 + }, + { + "epoch": 2.8532423208191124, + "grad_norm": 1.7003014087677002, + "learning_rate": 0.0004302616609783845, + "loss": 1.2319, + "step": 2508 + }, + { + "epoch": 2.8543799772468716, + "grad_norm": 0.8558884859085083, + "learning_rate": 0.0004300341296928328, + "loss": 2.2637, + "step": 2509 + }, + { + "epoch": 2.8555176336746304, + "grad_norm": 0.9409870505332947, + "learning_rate": 0.00042980659840728105, + "loss": 2.3024, + "step": 2510 + }, + { + "epoch": 2.856655290102389, + "grad_norm": 0.627686619758606, + "learning_rate": 0.0004295790671217292, + "loss": 1.1552, + "step": 2511 + }, + { + "epoch": 2.857792946530148, + "grad_norm": 1.0397640466690063, + "learning_rate": 0.0004293515358361775, + "loss": 1.6213, + "step": 2512 + }, + { + "epoch": 2.8589306029579067, + "grad_norm": 1.128299593925476, + "learning_rate": 0.00042912400455062574, + "loss": 2.0099, + "step": 2513 + }, + { + "epoch": 2.8600682593856654, + "grad_norm": 1.486132025718689, + "learning_rate": 0.00042889647326507395, + "loss": 2.9052, + "step": 2514 + }, + { + "epoch": 2.861205915813424, + "grad_norm": 1.4445245265960693, + "learning_rate": 0.0004286689419795222, + "loss": 2.075, + "step": 2515 + }, + { + "epoch": 2.8623435722411834, + "grad_norm": 1.2654812335968018, + "learning_rate": 0.0004284414106939704, + "loss": 0.5957, + "step": 2516 + }, + { + "epoch": 2.8634812286689417, + "grad_norm": 0.7981293201446533, + "learning_rate": 0.0004282138794084187, + "loss": 1.9943, + "step": 2517 + }, + { + "epoch": 2.864618885096701, + "grad_norm": 0.5515188574790955, + "learning_rate": 0.0004279863481228669, + "loss": 0.8496, + "step": 2518 + }, + { + "epoch": 2.8657565415244597, + "grad_norm": 1.4269306659698486, + "learning_rate": 0.00042775881683731516, + "loss": 4.1156, + "step": 2519 + }, + { + "epoch": 2.8668941979522184, + "grad_norm": 0.7809433341026306, + "learning_rate": 0.00042753128555176337, + "loss": 1.3538, + "step": 2520 + }, + { + "epoch": 2.868031854379977, + "grad_norm": 0.7941915988922119, + "learning_rate": 0.0004273037542662116, + "loss": 1.7591, + "step": 2521 + }, + { + "epoch": 2.869169510807736, + "grad_norm": 1.2019193172454834, + "learning_rate": 0.00042707622298065984, + "loss": 2.0824, + "step": 2522 + }, + { + "epoch": 2.8703071672354947, + "grad_norm": 0.727832555770874, + "learning_rate": 0.0004268486916951081, + "loss": 1.8108, + "step": 2523 + }, + { + "epoch": 2.8714448236632535, + "grad_norm": 0.7144151926040649, + "learning_rate": 0.0004266211604095563, + "loss": 1.6019, + "step": 2524 + }, + { + "epoch": 2.8725824800910127, + "grad_norm": 1.0367114543914795, + "learning_rate": 0.00042639362912400453, + "loss": 1.4144, + "step": 2525 + }, + { + "epoch": 2.8737201365187715, + "grad_norm": 0.9273185729980469, + "learning_rate": 0.0004261660978384528, + "loss": 1.5532, + "step": 2526 + }, + { + "epoch": 2.87485779294653, + "grad_norm": 0.8667803406715393, + "learning_rate": 0.00042593856655290106, + "loss": 2.2255, + "step": 2527 + }, + { + "epoch": 2.875995449374289, + "grad_norm": 1.1647021770477295, + "learning_rate": 0.00042571103526734927, + "loss": 1.6445, + "step": 2528 + }, + { + "epoch": 2.8771331058020477, + "grad_norm": 0.8458011150360107, + "learning_rate": 0.00042548350398179753, + "loss": 1.6222, + "step": 2529 + }, + { + "epoch": 2.8782707622298065, + "grad_norm": 0.6948020458221436, + "learning_rate": 0.00042525597269624574, + "loss": 0.8429, + "step": 2530 + }, + { + "epoch": 2.8794084186575652, + "grad_norm": 1.5468206405639648, + "learning_rate": 0.00042502844141069395, + "loss": 2.3449, + "step": 2531 + }, + { + "epoch": 2.8805460750853245, + "grad_norm": 1.0919640064239502, + "learning_rate": 0.0004248009101251422, + "loss": 2.7566, + "step": 2532 + }, + { + "epoch": 2.881683731513083, + "grad_norm": 0.9144538044929504, + "learning_rate": 0.0004245733788395905, + "loss": 1.4095, + "step": 2533 + }, + { + "epoch": 2.882821387940842, + "grad_norm": 0.6683197617530823, + "learning_rate": 0.0004243458475540387, + "loss": 1.396, + "step": 2534 + }, + { + "epoch": 2.8839590443686007, + "grad_norm": 1.1698358058929443, + "learning_rate": 0.0004241183162684869, + "loss": 2.6583, + "step": 2535 + }, + { + "epoch": 2.8850967007963595, + "grad_norm": 0.9804813265800476, + "learning_rate": 0.00042389078498293516, + "loss": 1.8973, + "step": 2536 + }, + { + "epoch": 2.8862343572241183, + "grad_norm": 0.8138146996498108, + "learning_rate": 0.0004236632536973834, + "loss": 1.2295, + "step": 2537 + }, + { + "epoch": 2.887372013651877, + "grad_norm": 0.9173576831817627, + "learning_rate": 0.00042343572241183164, + "loss": 1.904, + "step": 2538 + }, + { + "epoch": 2.888509670079636, + "grad_norm": 0.9176083207130432, + "learning_rate": 0.0004232081911262799, + "loss": 3.0666, + "step": 2539 + }, + { + "epoch": 2.8896473265073945, + "grad_norm": 0.901013195514679, + "learning_rate": 0.0004229806598407281, + "loss": 2.0642, + "step": 2540 + }, + { + "epoch": 2.8907849829351537, + "grad_norm": 0.8971734642982483, + "learning_rate": 0.0004227531285551763, + "loss": 1.586, + "step": 2541 + }, + { + "epoch": 2.8919226393629125, + "grad_norm": 0.9048395752906799, + "learning_rate": 0.0004225255972696246, + "loss": 2.0343, + "step": 2542 + }, + { + "epoch": 2.8930602957906713, + "grad_norm": 1.1656171083450317, + "learning_rate": 0.00042229806598407285, + "loss": 2.5376, + "step": 2543 + }, + { + "epoch": 2.89419795221843, + "grad_norm": 0.655851423740387, + "learning_rate": 0.00042207053469852106, + "loss": 1.4965, + "step": 2544 + }, + { + "epoch": 2.8953356086461888, + "grad_norm": 0.8759918212890625, + "learning_rate": 0.00042184300341296927, + "loss": 2.1066, + "step": 2545 + }, + { + "epoch": 2.8964732650739475, + "grad_norm": 0.8949549794197083, + "learning_rate": 0.00042161547212741753, + "loss": 1.6719, + "step": 2546 + }, + { + "epoch": 2.8976109215017063, + "grad_norm": 1.0389626026153564, + "learning_rate": 0.00042138794084186574, + "loss": 2.4444, + "step": 2547 + }, + { + "epoch": 2.8987485779294655, + "grad_norm": 1.5342673063278198, + "learning_rate": 0.000421160409556314, + "loss": 2.5499, + "step": 2548 + }, + { + "epoch": 2.8998862343572243, + "grad_norm": 1.0685155391693115, + "learning_rate": 0.00042093287827076227, + "loss": 2.8287, + "step": 2549 + }, + { + "epoch": 2.901023890784983, + "grad_norm": 1.428017020225525, + "learning_rate": 0.0004207053469852105, + "loss": 3.5992, + "step": 2550 + }, + { + "epoch": 2.9021615472127418, + "grad_norm": 0.7409979701042175, + "learning_rate": 0.0004204778156996587, + "loss": 1.7332, + "step": 2551 + }, + { + "epoch": 2.9032992036405005, + "grad_norm": 1.2421164512634277, + "learning_rate": 0.00042025028441410695, + "loss": 1.5578, + "step": 2552 + }, + { + "epoch": 2.9044368600682593, + "grad_norm": 0.9279178380966187, + "learning_rate": 0.0004200227531285552, + "loss": 1.1623, + "step": 2553 + }, + { + "epoch": 2.905574516496018, + "grad_norm": 0.8222157955169678, + "learning_rate": 0.0004197952218430034, + "loss": 1.5473, + "step": 2554 + }, + { + "epoch": 2.9067121729237773, + "grad_norm": 0.7585321664810181, + "learning_rate": 0.00041956769055745164, + "loss": 1.3996, + "step": 2555 + }, + { + "epoch": 2.9078498293515356, + "grad_norm": 0.9835622310638428, + "learning_rate": 0.0004193401592718999, + "loss": 1.7122, + "step": 2556 + }, + { + "epoch": 2.908987485779295, + "grad_norm": 0.6884040236473083, + "learning_rate": 0.0004191126279863481, + "loss": 1.533, + "step": 2557 + }, + { + "epoch": 2.9101251422070535, + "grad_norm": 1.0069630146026611, + "learning_rate": 0.0004188850967007964, + "loss": 1.7265, + "step": 2558 + }, + { + "epoch": 2.9112627986348123, + "grad_norm": 0.984389066696167, + "learning_rate": 0.0004186575654152446, + "loss": 1.6568, + "step": 2559 + }, + { + "epoch": 2.912400455062571, + "grad_norm": 0.7369997501373291, + "learning_rate": 0.00041843003412969285, + "loss": 1.8428, + "step": 2560 + }, + { + "epoch": 2.91353811149033, + "grad_norm": 1.4680728912353516, + "learning_rate": 0.00041820250284414106, + "loss": 3.2181, + "step": 2561 + }, + { + "epoch": 2.9146757679180886, + "grad_norm": 0.7019348740577698, + "learning_rate": 0.0004179749715585893, + "loss": 1.529, + "step": 2562 + }, + { + "epoch": 2.9158134243458473, + "grad_norm": 0.6902445554733276, + "learning_rate": 0.0004177474402730376, + "loss": 0.6651, + "step": 2563 + }, + { + "epoch": 2.9169510807736065, + "grad_norm": 1.133726716041565, + "learning_rate": 0.00041751990898748574, + "loss": 3.7021, + "step": 2564 + }, + { + "epoch": 2.9180887372013653, + "grad_norm": 0.9320973753929138, + "learning_rate": 0.000417292377701934, + "loss": 2.1056, + "step": 2565 + }, + { + "epoch": 2.919226393629124, + "grad_norm": 0.9788163900375366, + "learning_rate": 0.00041706484641638227, + "loss": 1.8839, + "step": 2566 + }, + { + "epoch": 2.920364050056883, + "grad_norm": 1.0894557237625122, + "learning_rate": 0.0004168373151308305, + "loss": 2.4257, + "step": 2567 + }, + { + "epoch": 2.9215017064846416, + "grad_norm": 0.7440232038497925, + "learning_rate": 0.00041660978384527875, + "loss": 1.6213, + "step": 2568 + }, + { + "epoch": 2.9226393629124003, + "grad_norm": 1.0889848470687866, + "learning_rate": 0.00041638225255972696, + "loss": 1.632, + "step": 2569 + }, + { + "epoch": 2.923777019340159, + "grad_norm": 0.9316622018814087, + "learning_rate": 0.0004161547212741752, + "loss": 1.5941, + "step": 2570 + }, + { + "epoch": 2.9249146757679183, + "grad_norm": 1.1574180126190186, + "learning_rate": 0.00041592718998862343, + "loss": 2.5757, + "step": 2571 + }, + { + "epoch": 2.926052332195677, + "grad_norm": 0.629701554775238, + "learning_rate": 0.0004156996587030717, + "loss": 1.1369, + "step": 2572 + }, + { + "epoch": 2.927189988623436, + "grad_norm": 0.4679393172264099, + "learning_rate": 0.00041547212741751996, + "loss": 0.8413, + "step": 2573 + }, + { + "epoch": 2.9283276450511946, + "grad_norm": 0.8145558834075928, + "learning_rate": 0.0004152445961319681, + "loss": 1.3481, + "step": 2574 + }, + { + "epoch": 2.9294653014789533, + "grad_norm": 0.5542713403701782, + "learning_rate": 0.0004150170648464164, + "loss": 1.2693, + "step": 2575 + }, + { + "epoch": 2.930602957906712, + "grad_norm": 1.2317445278167725, + "learning_rate": 0.00041478953356086464, + "loss": 2.8908, + "step": 2576 + }, + { + "epoch": 2.931740614334471, + "grad_norm": 0.9596227407455444, + "learning_rate": 0.00041456200227531285, + "loss": 1.3738, + "step": 2577 + }, + { + "epoch": 2.93287827076223, + "grad_norm": 0.988106369972229, + "learning_rate": 0.0004143344709897611, + "loss": 2.4601, + "step": 2578 + }, + { + "epoch": 2.9340159271899884, + "grad_norm": 0.9745345115661621, + "learning_rate": 0.0004141069397042093, + "loss": 1.9739, + "step": 2579 + }, + { + "epoch": 2.9351535836177476, + "grad_norm": 0.8387298583984375, + "learning_rate": 0.0004138794084186576, + "loss": 1.6605, + "step": 2580 + }, + { + "epoch": 2.9362912400455063, + "grad_norm": 0.811417818069458, + "learning_rate": 0.0004136518771331058, + "loss": 0.7618, + "step": 2581 + }, + { + "epoch": 2.937428896473265, + "grad_norm": 0.8037904500961304, + "learning_rate": 0.00041342434584755406, + "loss": 1.1208, + "step": 2582 + }, + { + "epoch": 2.938566552901024, + "grad_norm": 1.0455831289291382, + "learning_rate": 0.00041319681456200233, + "loss": 2.6639, + "step": 2583 + }, + { + "epoch": 2.9397042093287826, + "grad_norm": 0.9639081358909607, + "learning_rate": 0.0004129692832764505, + "loss": 1.7122, + "step": 2584 + }, + { + "epoch": 2.9408418657565414, + "grad_norm": 0.9406579732894897, + "learning_rate": 0.00041274175199089875, + "loss": 1.7419, + "step": 2585 + }, + { + "epoch": 2.9419795221843, + "grad_norm": 1.0303080081939697, + "learning_rate": 0.000412514220705347, + "loss": 1.1925, + "step": 2586 + }, + { + "epoch": 2.9431171786120593, + "grad_norm": 1.262969970703125, + "learning_rate": 0.0004122866894197952, + "loss": 3.077, + "step": 2587 + }, + { + "epoch": 2.944254835039818, + "grad_norm": 1.7101789712905884, + "learning_rate": 0.0004120591581342435, + "loss": 3.1926, + "step": 2588 + }, + { + "epoch": 2.945392491467577, + "grad_norm": 0.583219587802887, + "learning_rate": 0.0004118316268486917, + "loss": 1.4564, + "step": 2589 + }, + { + "epoch": 2.9465301478953356, + "grad_norm": 0.916775107383728, + "learning_rate": 0.00041160409556313996, + "loss": 1.9524, + "step": 2590 + }, + { + "epoch": 2.9476678043230944, + "grad_norm": 0.6460449695587158, + "learning_rate": 0.00041137656427758817, + "loss": 0.8583, + "step": 2591 + }, + { + "epoch": 2.948805460750853, + "grad_norm": 0.9714045524597168, + "learning_rate": 0.00041114903299203643, + "loss": 1.8316, + "step": 2592 + }, + { + "epoch": 2.949943117178612, + "grad_norm": 0.7467246651649475, + "learning_rate": 0.00041092150170648464, + "loss": 1.1244, + "step": 2593 + }, + { + "epoch": 2.951080773606371, + "grad_norm": 1.166478157043457, + "learning_rate": 0.00041069397042093285, + "loss": 2.589, + "step": 2594 + }, + { + "epoch": 2.9522184300341294, + "grad_norm": 1.1131619215011597, + "learning_rate": 0.0004104664391353811, + "loss": 2.0701, + "step": 2595 + }, + { + "epoch": 2.9533560864618886, + "grad_norm": 0.7802785038948059, + "learning_rate": 0.0004102389078498294, + "loss": 1.5037, + "step": 2596 + }, + { + "epoch": 2.9544937428896474, + "grad_norm": 1.3113210201263428, + "learning_rate": 0.0004100113765642776, + "loss": 3.5825, + "step": 2597 + }, + { + "epoch": 2.955631399317406, + "grad_norm": 0.9257882833480835, + "learning_rate": 0.0004097838452787258, + "loss": 2.0172, + "step": 2598 + }, + { + "epoch": 2.956769055745165, + "grad_norm": 0.6490365862846375, + "learning_rate": 0.00040955631399317407, + "loss": 1.0444, + "step": 2599 + }, + { + "epoch": 2.9579067121729237, + "grad_norm": 0.7082270979881287, + "learning_rate": 0.00040932878270762233, + "loss": 1.6777, + "step": 2600 + }, + { + "epoch": 2.9590443686006824, + "grad_norm": 1.1460232734680176, + "learning_rate": 0.00040910125142207054, + "loss": 2.843, + "step": 2601 + }, + { + "epoch": 2.960182025028441, + "grad_norm": 1.1202770471572876, + "learning_rate": 0.0004088737201365188, + "loss": 1.8192, + "step": 2602 + }, + { + "epoch": 2.9613196814562004, + "grad_norm": 1.5068840980529785, + "learning_rate": 0.000408646188850967, + "loss": 3.6918, + "step": 2603 + }, + { + "epoch": 2.962457337883959, + "grad_norm": 0.9092023968696594, + "learning_rate": 0.0004084186575654152, + "loss": 1.8649, + "step": 2604 + }, + { + "epoch": 2.963594994311718, + "grad_norm": 1.0940500497817993, + "learning_rate": 0.0004081911262798635, + "loss": 2.9138, + "step": 2605 + }, + { + "epoch": 2.9647326507394767, + "grad_norm": 1.2195347547531128, + "learning_rate": 0.00040796359499431175, + "loss": 2.0279, + "step": 2606 + }, + { + "epoch": 2.9658703071672354, + "grad_norm": 0.7400806546211243, + "learning_rate": 0.00040773606370875996, + "loss": 1.3149, + "step": 2607 + }, + { + "epoch": 2.967007963594994, + "grad_norm": 1.0603524446487427, + "learning_rate": 0.00040750853242320817, + "loss": 2.151, + "step": 2608 + }, + { + "epoch": 2.968145620022753, + "grad_norm": 0.9565656185150146, + "learning_rate": 0.00040728100113765644, + "loss": 1.4545, + "step": 2609 + }, + { + "epoch": 2.969283276450512, + "grad_norm": 0.9198881387710571, + "learning_rate": 0.0004070534698521047, + "loss": 1.7001, + "step": 2610 + }, + { + "epoch": 2.970420932878271, + "grad_norm": 0.9140946865081787, + "learning_rate": 0.0004068259385665529, + "loss": 1.5544, + "step": 2611 + }, + { + "epoch": 2.9715585893060297, + "grad_norm": 0.6245988607406616, + "learning_rate": 0.0004065984072810012, + "loss": 1.3737, + "step": 2612 + }, + { + "epoch": 2.9726962457337884, + "grad_norm": 0.9493893980979919, + "learning_rate": 0.0004063708759954494, + "loss": 1.7512, + "step": 2613 + }, + { + "epoch": 2.973833902161547, + "grad_norm": 1.0149247646331787, + "learning_rate": 0.0004061433447098976, + "loss": 1.3255, + "step": 2614 + }, + { + "epoch": 2.974971558589306, + "grad_norm": 1.1951128244400024, + "learning_rate": 0.00040591581342434586, + "loss": 2.2548, + "step": 2615 + }, + { + "epoch": 2.9761092150170647, + "grad_norm": 1.160248875617981, + "learning_rate": 0.0004056882821387941, + "loss": 1.9538, + "step": 2616 + }, + { + "epoch": 2.977246871444824, + "grad_norm": 1.113005518913269, + "learning_rate": 0.00040546075085324233, + "loss": 1.813, + "step": 2617 + }, + { + "epoch": 2.9783845278725822, + "grad_norm": 0.6714508533477783, + "learning_rate": 0.00040523321956769054, + "loss": 1.2078, + "step": 2618 + }, + { + "epoch": 2.9795221843003414, + "grad_norm": 0.9738563299179077, + "learning_rate": 0.0004050056882821388, + "loss": 1.5833, + "step": 2619 + }, + { + "epoch": 2.9806598407281, + "grad_norm": 0.7636871933937073, + "learning_rate": 0.00040477815699658707, + "loss": 1.7431, + "step": 2620 + }, + { + "epoch": 2.981797497155859, + "grad_norm": 0.6886274814605713, + "learning_rate": 0.0004045506257110353, + "loss": 1.346, + "step": 2621 + }, + { + "epoch": 2.9829351535836177, + "grad_norm": 0.7128561735153198, + "learning_rate": 0.00040432309442548354, + "loss": 1.2031, + "step": 2622 + }, + { + "epoch": 2.9840728100113765, + "grad_norm": 1.0730477571487427, + "learning_rate": 0.00040409556313993175, + "loss": 1.9424, + "step": 2623 + }, + { + "epoch": 2.9852104664391352, + "grad_norm": 0.5753147006034851, + "learning_rate": 0.00040386803185437996, + "loss": 0.5761, + "step": 2624 + }, + { + "epoch": 2.986348122866894, + "grad_norm": 1.2759993076324463, + "learning_rate": 0.00040364050056882823, + "loss": 2.9886, + "step": 2625 + }, + { + "epoch": 2.987485779294653, + "grad_norm": 0.916204571723938, + "learning_rate": 0.0004034129692832765, + "loss": 1.5574, + "step": 2626 + }, + { + "epoch": 2.988623435722412, + "grad_norm": 0.940258264541626, + "learning_rate": 0.00040318543799772465, + "loss": 1.8689, + "step": 2627 + }, + { + "epoch": 2.9897610921501707, + "grad_norm": 0.5174016952514648, + "learning_rate": 0.0004029579067121729, + "loss": 0.8162, + "step": 2628 + }, + { + "epoch": 2.9908987485779295, + "grad_norm": 0.779630720615387, + "learning_rate": 0.0004027303754266212, + "loss": 1.2215, + "step": 2629 + }, + { + "epoch": 2.9920364050056882, + "grad_norm": 1.0173307657241821, + "learning_rate": 0.00040250284414106944, + "loss": 1.6225, + "step": 2630 + }, + { + "epoch": 2.993174061433447, + "grad_norm": 0.9133301377296448, + "learning_rate": 0.00040227531285551765, + "loss": 2.143, + "step": 2631 + }, + { + "epoch": 2.9943117178612058, + "grad_norm": 0.818585991859436, + "learning_rate": 0.00040204778156996586, + "loss": 2.071, + "step": 2632 + }, + { + "epoch": 2.995449374288965, + "grad_norm": 1.6560077667236328, + "learning_rate": 0.0004018202502844141, + "loss": 2.403, + "step": 2633 + }, + { + "epoch": 2.9965870307167233, + "grad_norm": 0.7060096859931946, + "learning_rate": 0.00040159271899886233, + "loss": 0.9656, + "step": 2634 + }, + { + "epoch": 2.9977246871444825, + "grad_norm": 0.7223090529441833, + "learning_rate": 0.0004013651877133106, + "loss": 1.7089, + "step": 2635 + }, + { + "epoch": 2.9988623435722412, + "grad_norm": 0.6294040679931641, + "learning_rate": 0.00040113765642775886, + "loss": 1.2626, + "step": 2636 + }, + { + "epoch": 3.0, + "grad_norm": 0.931922972202301, + "learning_rate": 0.000400910125142207, + "loss": 1.8149, + "step": 2637 + }, + { + "epoch": 3.0, + "eval_f1": 0.8905, + "eval_gen_len": 49.5636, + "eval_loss": 1.8385756015777588, + "eval_precision": 0.889, + "eval_recall": 0.8923, + "eval_rouge1": 0.4436, + "eval_rouge2": 0.2001, + "eval_rougeL": 0.3707, + "eval_rougeLsum": 0.4092, + "eval_runtime": 28.0556, + "eval_samples_per_second": 3.921, + "eval_steps_per_second": 0.499, + "step": 2637 + }, + { + "epoch": 3.0011376564277588, + "grad_norm": 0.7378818988800049, + "learning_rate": 0.0004006825938566553, + "loss": 1.5501, + "step": 2638 + }, + { + "epoch": 3.0022753128555175, + "grad_norm": 0.9810494780540466, + "learning_rate": 0.00040045506257110355, + "loss": 2.0296, + "step": 2639 + }, + { + "epoch": 3.0034129692832763, + "grad_norm": 1.0457974672317505, + "learning_rate": 0.0004002275312855518, + "loss": 1.9824, + "step": 2640 + }, + { + "epoch": 3.0045506257110355, + "grad_norm": 1.2151912450790405, + "learning_rate": 0.0004, + "loss": 2.0357, + "step": 2641 + }, + { + "epoch": 3.0056882821387942, + "grad_norm": 0.9331043362617493, + "learning_rate": 0.00039977246871444823, + "loss": 1.1641, + "step": 2642 + }, + { + "epoch": 3.006825938566553, + "grad_norm": 0.8802271485328674, + "learning_rate": 0.0003995449374288965, + "loss": 1.4499, + "step": 2643 + }, + { + "epoch": 3.0079635949943118, + "grad_norm": 0.8039838075637817, + "learning_rate": 0.0003993174061433447, + "loss": 1.3322, + "step": 2644 + }, + { + "epoch": 3.0091012514220705, + "grad_norm": 1.716158390045166, + "learning_rate": 0.00039908987485779297, + "loss": 1.9503, + "step": 2645 + }, + { + "epoch": 3.0102389078498293, + "grad_norm": 0.589878261089325, + "learning_rate": 0.00039886234357224123, + "loss": 1.2121, + "step": 2646 + }, + { + "epoch": 3.011376564277588, + "grad_norm": 0.6693535447120667, + "learning_rate": 0.0003986348122866894, + "loss": 1.5157, + "step": 2647 + }, + { + "epoch": 3.012514220705347, + "grad_norm": 1.0818902254104614, + "learning_rate": 0.00039840728100113765, + "loss": 2.445, + "step": 2648 + }, + { + "epoch": 3.013651877133106, + "grad_norm": 1.072434663772583, + "learning_rate": 0.0003981797497155859, + "loss": 3.0144, + "step": 2649 + }, + { + "epoch": 3.0147895335608648, + "grad_norm": 1.0067790746688843, + "learning_rate": 0.0003979522184300341, + "loss": 1.1798, + "step": 2650 + }, + { + "epoch": 3.0159271899886235, + "grad_norm": 0.8552153706550598, + "learning_rate": 0.0003977246871444824, + "loss": 1.7862, + "step": 2651 + }, + { + "epoch": 3.0170648464163823, + "grad_norm": 0.9946853518486023, + "learning_rate": 0.0003974971558589306, + "loss": 2.5064, + "step": 2652 + }, + { + "epoch": 3.018202502844141, + "grad_norm": 0.8925816416740417, + "learning_rate": 0.00039726962457337886, + "loss": 1.3693, + "step": 2653 + }, + { + "epoch": 3.0193401592719, + "grad_norm": 1.815544605255127, + "learning_rate": 0.0003970420932878271, + "loss": 3.0686, + "step": 2654 + }, + { + "epoch": 3.0204778156996586, + "grad_norm": 1.122719407081604, + "learning_rate": 0.00039681456200227534, + "loss": 2.3256, + "step": 2655 + }, + { + "epoch": 3.0216154721274173, + "grad_norm": 1.784497618675232, + "learning_rate": 0.0003965870307167236, + "loss": 3.1539, + "step": 2656 + }, + { + "epoch": 3.0227531285551765, + "grad_norm": 1.0257576704025269, + "learning_rate": 0.00039635949943117176, + "loss": 1.8041, + "step": 2657 + }, + { + "epoch": 3.0238907849829353, + "grad_norm": 0.9659282565116882, + "learning_rate": 0.00039613196814562, + "loss": 1.4202, + "step": 2658 + }, + { + "epoch": 3.025028441410694, + "grad_norm": 1.3800609111785889, + "learning_rate": 0.0003959044368600683, + "loss": 1.9181, + "step": 2659 + }, + { + "epoch": 3.026166097838453, + "grad_norm": 1.056583285331726, + "learning_rate": 0.0003956769055745165, + "loss": 1.6186, + "step": 2660 + }, + { + "epoch": 3.0273037542662116, + "grad_norm": 0.7123137712478638, + "learning_rate": 0.0003954493742889647, + "loss": 1.2721, + "step": 2661 + }, + { + "epoch": 3.0284414106939703, + "grad_norm": 0.8692189455032349, + "learning_rate": 0.00039522184300341297, + "loss": 1.8621, + "step": 2662 + }, + { + "epoch": 3.029579067121729, + "grad_norm": 0.8391653895378113, + "learning_rate": 0.00039499431171786123, + "loss": 1.0275, + "step": 2663 + }, + { + "epoch": 3.030716723549488, + "grad_norm": 1.1637401580810547, + "learning_rate": 0.00039476678043230944, + "loss": 2.4589, + "step": 2664 + }, + { + "epoch": 3.031854379977247, + "grad_norm": 0.9150156378746033, + "learning_rate": 0.0003945392491467577, + "loss": 1.5759, + "step": 2665 + }, + { + "epoch": 3.032992036405006, + "grad_norm": 1.1568349599838257, + "learning_rate": 0.0003943117178612059, + "loss": 2.138, + "step": 2666 + }, + { + "epoch": 3.0341296928327646, + "grad_norm": 1.1453518867492676, + "learning_rate": 0.0003940841865756541, + "loss": 2.3239, + "step": 2667 + }, + { + "epoch": 3.0352673492605233, + "grad_norm": 0.8770859837532043, + "learning_rate": 0.0003938566552901024, + "loss": 1.523, + "step": 2668 + }, + { + "epoch": 3.036405005688282, + "grad_norm": 1.0244057178497314, + "learning_rate": 0.00039362912400455065, + "loss": 1.7684, + "step": 2669 + }, + { + "epoch": 3.037542662116041, + "grad_norm": 1.0480972528457642, + "learning_rate": 0.00039340159271899886, + "loss": 1.5302, + "step": 2670 + }, + { + "epoch": 3.0386803185437996, + "grad_norm": 1.1314648389816284, + "learning_rate": 0.0003931740614334471, + "loss": 1.6721, + "step": 2671 + }, + { + "epoch": 3.039817974971559, + "grad_norm": 0.7867646813392639, + "learning_rate": 0.00039294653014789534, + "loss": 1.3925, + "step": 2672 + }, + { + "epoch": 3.0409556313993176, + "grad_norm": 1.450878381729126, + "learning_rate": 0.0003927189988623436, + "loss": 2.2478, + "step": 2673 + }, + { + "epoch": 3.0420932878270763, + "grad_norm": 0.7771217226982117, + "learning_rate": 0.0003924914675767918, + "loss": 1.3815, + "step": 2674 + }, + { + "epoch": 3.043230944254835, + "grad_norm": 1.2466049194335938, + "learning_rate": 0.0003922639362912401, + "loss": 3.2891, + "step": 2675 + }, + { + "epoch": 3.044368600682594, + "grad_norm": 1.7478383779525757, + "learning_rate": 0.0003920364050056883, + "loss": 2.8603, + "step": 2676 + }, + { + "epoch": 3.0455062571103526, + "grad_norm": 0.8281832933425903, + "learning_rate": 0.0003918088737201365, + "loss": 1.5477, + "step": 2677 + }, + { + "epoch": 3.0466439135381114, + "grad_norm": 0.695612370967865, + "learning_rate": 0.00039158134243458476, + "loss": 1.4521, + "step": 2678 + }, + { + "epoch": 3.04778156996587, + "grad_norm": 0.9778900742530823, + "learning_rate": 0.000391353811149033, + "loss": 1.341, + "step": 2679 + }, + { + "epoch": 3.0489192263936293, + "grad_norm": 1.0386836528778076, + "learning_rate": 0.00039112627986348123, + "loss": 1.7095, + "step": 2680 + }, + { + "epoch": 3.050056882821388, + "grad_norm": 0.9944807291030884, + "learning_rate": 0.00039089874857792944, + "loss": 1.9036, + "step": 2681 + }, + { + "epoch": 3.051194539249147, + "grad_norm": 0.8040305972099304, + "learning_rate": 0.0003906712172923777, + "loss": 1.1083, + "step": 2682 + }, + { + "epoch": 3.0523321956769056, + "grad_norm": 1.0549193620681763, + "learning_rate": 0.00039044368600682597, + "loss": 2.558, + "step": 2683 + }, + { + "epoch": 3.0534698521046644, + "grad_norm": 0.7308775782585144, + "learning_rate": 0.0003902161547212742, + "loss": 1.073, + "step": 2684 + }, + { + "epoch": 3.054607508532423, + "grad_norm": 0.532346785068512, + "learning_rate": 0.00038998862343572245, + "loss": 0.8228, + "step": 2685 + }, + { + "epoch": 3.055745164960182, + "grad_norm": 0.9749463200569153, + "learning_rate": 0.00038976109215017066, + "loss": 1.6924, + "step": 2686 + }, + { + "epoch": 3.0568828213879407, + "grad_norm": 0.9695119857788086, + "learning_rate": 0.00038953356086461887, + "loss": 1.8108, + "step": 2687 + }, + { + "epoch": 3.0580204778157, + "grad_norm": 0.9351180195808411, + "learning_rate": 0.00038930602957906713, + "loss": 2.2291, + "step": 2688 + }, + { + "epoch": 3.0591581342434586, + "grad_norm": 0.8033020496368408, + "learning_rate": 0.0003890784982935154, + "loss": 1.8916, + "step": 2689 + }, + { + "epoch": 3.0602957906712174, + "grad_norm": 1.3790737390518188, + "learning_rate": 0.0003888509670079636, + "loss": 1.9922, + "step": 2690 + }, + { + "epoch": 3.061433447098976, + "grad_norm": 1.2332922220230103, + "learning_rate": 0.0003886234357224118, + "loss": 2.345, + "step": 2691 + }, + { + "epoch": 3.062571103526735, + "grad_norm": 1.38813316822052, + "learning_rate": 0.0003883959044368601, + "loss": 2.8993, + "step": 2692 + }, + { + "epoch": 3.0637087599544937, + "grad_norm": 0.952480137348175, + "learning_rate": 0.00038816837315130834, + "loss": 1.8394, + "step": 2693 + }, + { + "epoch": 3.0648464163822524, + "grad_norm": 0.8586364984512329, + "learning_rate": 0.00038794084186575655, + "loss": 1.1771, + "step": 2694 + }, + { + "epoch": 3.065984072810011, + "grad_norm": 0.9498509764671326, + "learning_rate": 0.00038771331058020476, + "loss": 1.4771, + "step": 2695 + }, + { + "epoch": 3.0671217292377704, + "grad_norm": 0.9688740968704224, + "learning_rate": 0.000387485779294653, + "loss": 2.1815, + "step": 2696 + }, + { + "epoch": 3.068259385665529, + "grad_norm": 2.0778191089630127, + "learning_rate": 0.00038725824800910124, + "loss": 2.46, + "step": 2697 + }, + { + "epoch": 3.069397042093288, + "grad_norm": 1.1978789567947388, + "learning_rate": 0.0003870307167235495, + "loss": 2.347, + "step": 2698 + }, + { + "epoch": 3.0705346985210467, + "grad_norm": 1.4718050956726074, + "learning_rate": 0.00038680318543799776, + "loss": 2.3554, + "step": 2699 + }, + { + "epoch": 3.0716723549488054, + "grad_norm": 0.9731530547142029, + "learning_rate": 0.0003865756541524459, + "loss": 1.4802, + "step": 2700 + }, + { + "epoch": 3.072810011376564, + "grad_norm": 1.0412006378173828, + "learning_rate": 0.0003863481228668942, + "loss": 1.8537, + "step": 2701 + }, + { + "epoch": 3.073947667804323, + "grad_norm": 0.7110769748687744, + "learning_rate": 0.00038612059158134245, + "loss": 1.6169, + "step": 2702 + }, + { + "epoch": 3.0750853242320817, + "grad_norm": 1.7731373310089111, + "learning_rate": 0.0003858930602957907, + "loss": 3.4582, + "step": 2703 + }, + { + "epoch": 3.076222980659841, + "grad_norm": 1.136021614074707, + "learning_rate": 0.0003856655290102389, + "loss": 2.161, + "step": 2704 + }, + { + "epoch": 3.0773606370875997, + "grad_norm": 1.2377517223358154, + "learning_rate": 0.00038543799772468713, + "loss": 1.9528, + "step": 2705 + }, + { + "epoch": 3.0784982935153584, + "grad_norm": 0.7984117865562439, + "learning_rate": 0.0003852104664391354, + "loss": 1.0635, + "step": 2706 + }, + { + "epoch": 3.079635949943117, + "grad_norm": 1.1283124685287476, + "learning_rate": 0.0003849829351535836, + "loss": 2.1958, + "step": 2707 + }, + { + "epoch": 3.080773606370876, + "grad_norm": 1.1983591318130493, + "learning_rate": 0.00038475540386803187, + "loss": 2.5032, + "step": 2708 + }, + { + "epoch": 3.0819112627986347, + "grad_norm": 0.8128067255020142, + "learning_rate": 0.00038452787258248013, + "loss": 1.566, + "step": 2709 + }, + { + "epoch": 3.0830489192263935, + "grad_norm": 1.4617652893066406, + "learning_rate": 0.0003843003412969283, + "loss": 2.3352, + "step": 2710 + }, + { + "epoch": 3.0841865756541527, + "grad_norm": 0.9528759717941284, + "learning_rate": 0.00038407281001137655, + "loss": 1.7667, + "step": 2711 + }, + { + "epoch": 3.0853242320819114, + "grad_norm": 1.0481518507003784, + "learning_rate": 0.0003838452787258248, + "loss": 1.9046, + "step": 2712 + }, + { + "epoch": 3.08646188850967, + "grad_norm": 0.9186555743217468, + "learning_rate": 0.0003836177474402731, + "loss": 1.8621, + "step": 2713 + }, + { + "epoch": 3.087599544937429, + "grad_norm": 0.6513910293579102, + "learning_rate": 0.0003833902161547213, + "loss": 1.2062, + "step": 2714 + }, + { + "epoch": 3.0887372013651877, + "grad_norm": 1.1539534330368042, + "learning_rate": 0.0003831626848691695, + "loss": 2.0368, + "step": 2715 + }, + { + "epoch": 3.0898748577929465, + "grad_norm": 1.2739286422729492, + "learning_rate": 0.00038293515358361777, + "loss": 2.7124, + "step": 2716 + }, + { + "epoch": 3.091012514220705, + "grad_norm": 0.796596348285675, + "learning_rate": 0.000382707622298066, + "loss": 1.5446, + "step": 2717 + }, + { + "epoch": 3.092150170648464, + "grad_norm": 1.144640564918518, + "learning_rate": 0.00038248009101251424, + "loss": 1.3855, + "step": 2718 + }, + { + "epoch": 3.093287827076223, + "grad_norm": 0.6495816111564636, + "learning_rate": 0.0003822525597269625, + "loss": 1.5521, + "step": 2719 + }, + { + "epoch": 3.094425483503982, + "grad_norm": 1.0980159044265747, + "learning_rate": 0.00038202502844141066, + "loss": 1.5025, + "step": 2720 + }, + { + "epoch": 3.0955631399317407, + "grad_norm": 0.8496591448783875, + "learning_rate": 0.0003817974971558589, + "loss": 2.1075, + "step": 2721 + }, + { + "epoch": 3.0967007963594995, + "grad_norm": 0.9507794380187988, + "learning_rate": 0.0003815699658703072, + "loss": 1.8178, + "step": 2722 + }, + { + "epoch": 3.0978384527872582, + "grad_norm": 1.0450421571731567, + "learning_rate": 0.00038134243458475545, + "loss": 2.2859, + "step": 2723 + }, + { + "epoch": 3.098976109215017, + "grad_norm": 0.8103886246681213, + "learning_rate": 0.00038111490329920366, + "loss": 1.7053, + "step": 2724 + }, + { + "epoch": 3.1001137656427757, + "grad_norm": 1.1811316013336182, + "learning_rate": 0.00038088737201365187, + "loss": 1.5576, + "step": 2725 + }, + { + "epoch": 3.1012514220705345, + "grad_norm": 1.117814064025879, + "learning_rate": 0.00038065984072810014, + "loss": 2.484, + "step": 2726 + }, + { + "epoch": 3.1023890784982937, + "grad_norm": 1.1703466176986694, + "learning_rate": 0.00038043230944254835, + "loss": 2.0287, + "step": 2727 + }, + { + "epoch": 3.1035267349260525, + "grad_norm": 1.159627914428711, + "learning_rate": 0.0003802047781569966, + "loss": 2.4316, + "step": 2728 + }, + { + "epoch": 3.1046643913538112, + "grad_norm": 0.6235855221748352, + "learning_rate": 0.0003799772468714448, + "loss": 1.3779, + "step": 2729 + }, + { + "epoch": 3.10580204778157, + "grad_norm": 0.8967525959014893, + "learning_rate": 0.00037974971558589303, + "loss": 1.8938, + "step": 2730 + }, + { + "epoch": 3.1069397042093287, + "grad_norm": 0.7541645169258118, + "learning_rate": 0.0003795221843003413, + "loss": 1.6706, + "step": 2731 + }, + { + "epoch": 3.1080773606370875, + "grad_norm": 0.7087587714195251, + "learning_rate": 0.00037929465301478956, + "loss": 1.185, + "step": 2732 + }, + { + "epoch": 3.1092150170648463, + "grad_norm": 0.4740487337112427, + "learning_rate": 0.0003790671217292378, + "loss": 0.5638, + "step": 2733 + }, + { + "epoch": 3.110352673492605, + "grad_norm": 0.6750523447990417, + "learning_rate": 0.000378839590443686, + "loss": 1.064, + "step": 2734 + }, + { + "epoch": 3.1114903299203642, + "grad_norm": 0.6977342367172241, + "learning_rate": 0.00037861205915813424, + "loss": 1.721, + "step": 2735 + }, + { + "epoch": 3.112627986348123, + "grad_norm": 0.5952358841896057, + "learning_rate": 0.0003783845278725825, + "loss": 0.7465, + "step": 2736 + }, + { + "epoch": 3.1137656427758817, + "grad_norm": 1.1384859085083008, + "learning_rate": 0.0003781569965870307, + "loss": 2.1374, + "step": 2737 + }, + { + "epoch": 3.1149032992036405, + "grad_norm": 1.135901927947998, + "learning_rate": 0.000377929465301479, + "loss": 3.0381, + "step": 2738 + }, + { + "epoch": 3.1160409556313993, + "grad_norm": 1.3090107440948486, + "learning_rate": 0.0003777019340159272, + "loss": 3.5599, + "step": 2739 + }, + { + "epoch": 3.117178612059158, + "grad_norm": 0.7160171866416931, + "learning_rate": 0.0003774744027303754, + "loss": 0.7931, + "step": 2740 + }, + { + "epoch": 3.118316268486917, + "grad_norm": 0.9017439484596252, + "learning_rate": 0.00037724687144482366, + "loss": 1.3222, + "step": 2741 + }, + { + "epoch": 3.1194539249146755, + "grad_norm": 0.7213937640190125, + "learning_rate": 0.00037701934015927193, + "loss": 1.8957, + "step": 2742 + }, + { + "epoch": 3.1205915813424348, + "grad_norm": 1.6694979667663574, + "learning_rate": 0.0003767918088737202, + "loss": 1.9735, + "step": 2743 + }, + { + "epoch": 3.1217292377701935, + "grad_norm": 1.073201298713684, + "learning_rate": 0.00037656427758816835, + "loss": 1.2988, + "step": 2744 + }, + { + "epoch": 3.1228668941979523, + "grad_norm": 0.7072137594223022, + "learning_rate": 0.0003763367463026166, + "loss": 1.4237, + "step": 2745 + }, + { + "epoch": 3.124004550625711, + "grad_norm": 1.140438437461853, + "learning_rate": 0.0003761092150170649, + "loss": 2.3338, + "step": 2746 + }, + { + "epoch": 3.12514220705347, + "grad_norm": 0.9373031258583069, + "learning_rate": 0.0003758816837315131, + "loss": 1.7434, + "step": 2747 + }, + { + "epoch": 3.1262798634812285, + "grad_norm": 1.1408966779708862, + "learning_rate": 0.00037565415244596135, + "loss": 2.5215, + "step": 2748 + }, + { + "epoch": 3.1274175199089873, + "grad_norm": 0.9613400101661682, + "learning_rate": 0.00037542662116040956, + "loss": 1.7157, + "step": 2749 + }, + { + "epoch": 3.1285551763367465, + "grad_norm": 1.0001013278961182, + "learning_rate": 0.00037519908987485777, + "loss": 1.3756, + "step": 2750 + }, + { + "epoch": 3.1296928327645053, + "grad_norm": 1.740805983543396, + "learning_rate": 0.00037497155858930603, + "loss": 2.2071, + "step": 2751 + }, + { + "epoch": 3.130830489192264, + "grad_norm": 0.9777475595474243, + "learning_rate": 0.0003747440273037543, + "loss": 1.8485, + "step": 2752 + }, + { + "epoch": 3.131968145620023, + "grad_norm": 1.1642980575561523, + "learning_rate": 0.0003745164960182025, + "loss": 1.2435, + "step": 2753 + }, + { + "epoch": 3.1331058020477816, + "grad_norm": 1.1870895624160767, + "learning_rate": 0.0003742889647326507, + "loss": 1.3834, + "step": 2754 + }, + { + "epoch": 3.1342434584755403, + "grad_norm": 0.9124128818511963, + "learning_rate": 0.000374061433447099, + "loss": 1.534, + "step": 2755 + }, + { + "epoch": 3.135381114903299, + "grad_norm": 0.6941938400268555, + "learning_rate": 0.00037383390216154725, + "loss": 1.5597, + "step": 2756 + }, + { + "epoch": 3.136518771331058, + "grad_norm": 0.9864102602005005, + "learning_rate": 0.00037360637087599546, + "loss": 1.5537, + "step": 2757 + }, + { + "epoch": 3.137656427758817, + "grad_norm": 0.6157810688018799, + "learning_rate": 0.0003733788395904437, + "loss": 1.1024, + "step": 2758 + }, + { + "epoch": 3.138794084186576, + "grad_norm": 0.9289652109146118, + "learning_rate": 0.00037315130830489193, + "loss": 1.2505, + "step": 2759 + }, + { + "epoch": 3.1399317406143346, + "grad_norm": 0.6562715768814087, + "learning_rate": 0.00037292377701934014, + "loss": 1.1522, + "step": 2760 + }, + { + "epoch": 3.1410693970420933, + "grad_norm": 0.7173290252685547, + "learning_rate": 0.0003726962457337884, + "loss": 1.3983, + "step": 2761 + }, + { + "epoch": 3.142207053469852, + "grad_norm": 0.8549534678459167, + "learning_rate": 0.00037246871444823667, + "loss": 1.4353, + "step": 2762 + }, + { + "epoch": 3.143344709897611, + "grad_norm": 1.8511520624160767, + "learning_rate": 0.0003722411831626849, + "loss": 2.9588, + "step": 2763 + }, + { + "epoch": 3.1444823663253696, + "grad_norm": 1.1187710762023926, + "learning_rate": 0.0003720136518771331, + "loss": 1.8577, + "step": 2764 + }, + { + "epoch": 3.1456200227531284, + "grad_norm": 0.9971904158592224, + "learning_rate": 0.00037178612059158135, + "loss": 2.0944, + "step": 2765 + }, + { + "epoch": 3.1467576791808876, + "grad_norm": 1.0542211532592773, + "learning_rate": 0.0003715585893060296, + "loss": 1.5842, + "step": 2766 + }, + { + "epoch": 3.1478953356086463, + "grad_norm": 0.8797990083694458, + "learning_rate": 0.0003713310580204778, + "loss": 2.6677, + "step": 2767 + }, + { + "epoch": 3.149032992036405, + "grad_norm": 1.0683292150497437, + "learning_rate": 0.00037110352673492604, + "loss": 1.4515, + "step": 2768 + }, + { + "epoch": 3.150170648464164, + "grad_norm": 0.9214059114456177, + "learning_rate": 0.0003708759954493743, + "loss": 2.1592, + "step": 2769 + }, + { + "epoch": 3.1513083048919226, + "grad_norm": 1.1661478281021118, + "learning_rate": 0.0003706484641638225, + "loss": 2.3451, + "step": 2770 + }, + { + "epoch": 3.1524459613196814, + "grad_norm": 0.8435385823249817, + "learning_rate": 0.0003704209328782708, + "loss": 1.3745, + "step": 2771 + }, + { + "epoch": 3.15358361774744, + "grad_norm": 0.646821916103363, + "learning_rate": 0.00037019340159271904, + "loss": 1.2736, + "step": 2772 + }, + { + "epoch": 3.1547212741751993, + "grad_norm": 1.5238386392593384, + "learning_rate": 0.0003699658703071672, + "loss": 2.0629, + "step": 2773 + }, + { + "epoch": 3.155858930602958, + "grad_norm": 0.9329938292503357, + "learning_rate": 0.00036973833902161546, + "loss": 2.6737, + "step": 2774 + }, + { + "epoch": 3.156996587030717, + "grad_norm": 0.9424443244934082, + "learning_rate": 0.0003695108077360637, + "loss": 1.814, + "step": 2775 + }, + { + "epoch": 3.1581342434584756, + "grad_norm": 1.060135006904602, + "learning_rate": 0.000369283276450512, + "loss": 2.2409, + "step": 2776 + }, + { + "epoch": 3.1592718998862344, + "grad_norm": 1.1120651960372925, + "learning_rate": 0.0003690557451649602, + "loss": 2.7252, + "step": 2777 + }, + { + "epoch": 3.160409556313993, + "grad_norm": 1.0458694696426392, + "learning_rate": 0.0003688282138794084, + "loss": 2.2002, + "step": 2778 + }, + { + "epoch": 3.161547212741752, + "grad_norm": 0.7672396898269653, + "learning_rate": 0.00036860068259385667, + "loss": 1.5965, + "step": 2779 + }, + { + "epoch": 3.1626848691695106, + "grad_norm": 0.9745705127716064, + "learning_rate": 0.0003683731513083049, + "loss": 2.8014, + "step": 2780 + }, + { + "epoch": 3.1638225255972694, + "grad_norm": 1.4451996088027954, + "learning_rate": 0.00036814562002275314, + "loss": 3.0917, + "step": 2781 + }, + { + "epoch": 3.1649601820250286, + "grad_norm": 0.7620997428894043, + "learning_rate": 0.0003679180887372014, + "loss": 1.4155, + "step": 2782 + }, + { + "epoch": 3.1660978384527874, + "grad_norm": 0.8162498474121094, + "learning_rate": 0.00036769055745164956, + "loss": 2.5782, + "step": 2783 + }, + { + "epoch": 3.167235494880546, + "grad_norm": 0.8843737244606018, + "learning_rate": 0.00036746302616609783, + "loss": 1.6138, + "step": 2784 + }, + { + "epoch": 3.168373151308305, + "grad_norm": 0.6327600479125977, + "learning_rate": 0.0003672354948805461, + "loss": 1.0368, + "step": 2785 + }, + { + "epoch": 3.1695108077360636, + "grad_norm": 1.3573901653289795, + "learning_rate": 0.00036700796359499436, + "loss": 3.1572, + "step": 2786 + }, + { + "epoch": 3.1706484641638224, + "grad_norm": 0.9209976196289062, + "learning_rate": 0.00036678043230944257, + "loss": 1.9979, + "step": 2787 + }, + { + "epoch": 3.171786120591581, + "grad_norm": 1.0654752254486084, + "learning_rate": 0.0003665529010238908, + "loss": 1.1564, + "step": 2788 + }, + { + "epoch": 3.1729237770193404, + "grad_norm": 0.8304941654205322, + "learning_rate": 0.00036632536973833904, + "loss": 0.804, + "step": 2789 + }, + { + "epoch": 3.174061433447099, + "grad_norm": 0.7033611536026001, + "learning_rate": 0.00036609783845278725, + "loss": 1.2548, + "step": 2790 + }, + { + "epoch": 3.175199089874858, + "grad_norm": 0.7177807688713074, + "learning_rate": 0.0003658703071672355, + "loss": 0.925, + "step": 2791 + }, + { + "epoch": 3.1763367463026166, + "grad_norm": 0.7976208925247192, + "learning_rate": 0.0003656427758816838, + "loss": 1.8367, + "step": 2792 + }, + { + "epoch": 3.1774744027303754, + "grad_norm": 1.0865761041641235, + "learning_rate": 0.00036541524459613193, + "loss": 2.0069, + "step": 2793 + }, + { + "epoch": 3.178612059158134, + "grad_norm": 1.0845462083816528, + "learning_rate": 0.0003651877133105802, + "loss": 2.4428, + "step": 2794 + }, + { + "epoch": 3.179749715585893, + "grad_norm": 1.635365605354309, + "learning_rate": 0.00036496018202502846, + "loss": 2.376, + "step": 2795 + }, + { + "epoch": 3.1808873720136517, + "grad_norm": 0.9851645231246948, + "learning_rate": 0.0003647326507394767, + "loss": 1.7787, + "step": 2796 + }, + { + "epoch": 3.182025028441411, + "grad_norm": 0.8619524836540222, + "learning_rate": 0.00036450511945392494, + "loss": 0.8745, + "step": 2797 + }, + { + "epoch": 3.1831626848691696, + "grad_norm": 0.8011617064476013, + "learning_rate": 0.00036427758816837315, + "loss": 1.2395, + "step": 2798 + }, + { + "epoch": 3.1843003412969284, + "grad_norm": 1.1883291006088257, + "learning_rate": 0.0003640500568828214, + "loss": 2.4134, + "step": 2799 + }, + { + "epoch": 3.185437997724687, + "grad_norm": 0.9594371914863586, + "learning_rate": 0.0003638225255972696, + "loss": 1.8051, + "step": 2800 + }, + { + "epoch": 3.186575654152446, + "grad_norm": 0.9812491536140442, + "learning_rate": 0.0003635949943117179, + "loss": 1.9984, + "step": 2801 + }, + { + "epoch": 3.1877133105802047, + "grad_norm": 1.5427114963531494, + "learning_rate": 0.0003633674630261661, + "loss": 2.8443, + "step": 2802 + }, + { + "epoch": 3.1888509670079634, + "grad_norm": 1.0131993293762207, + "learning_rate": 0.0003631399317406143, + "loss": 1.4077, + "step": 2803 + }, + { + "epoch": 3.189988623435722, + "grad_norm": 1.21848464012146, + "learning_rate": 0.00036291240045506257, + "loss": 2.6364, + "step": 2804 + }, + { + "epoch": 3.1911262798634814, + "grad_norm": 0.8747435212135315, + "learning_rate": 0.00036268486916951083, + "loss": 1.4387, + "step": 2805 + }, + { + "epoch": 3.19226393629124, + "grad_norm": 1.2929614782333374, + "learning_rate": 0.0003624573378839591, + "loss": 2.2714, + "step": 2806 + }, + { + "epoch": 3.193401592718999, + "grad_norm": 0.6196789145469666, + "learning_rate": 0.00036222980659840725, + "loss": 1.4344, + "step": 2807 + }, + { + "epoch": 3.1945392491467577, + "grad_norm": 1.1467669010162354, + "learning_rate": 0.0003620022753128555, + "loss": 2.1119, + "step": 2808 + }, + { + "epoch": 3.1956769055745164, + "grad_norm": 0.781956136226654, + "learning_rate": 0.0003617747440273038, + "loss": 1.4887, + "step": 2809 + }, + { + "epoch": 3.196814562002275, + "grad_norm": 1.177976369857788, + "learning_rate": 0.000361547212741752, + "loss": 3.2043, + "step": 2810 + }, + { + "epoch": 3.197952218430034, + "grad_norm": 0.8673222064971924, + "learning_rate": 0.00036131968145620025, + "loss": 1.0446, + "step": 2811 + }, + { + "epoch": 3.199089874857793, + "grad_norm": 0.8764348030090332, + "learning_rate": 0.00036109215017064846, + "loss": 2.2418, + "step": 2812 + }, + { + "epoch": 3.200227531285552, + "grad_norm": 1.8115644454956055, + "learning_rate": 0.0003608646188850967, + "loss": 1.0493, + "step": 2813 + }, + { + "epoch": 3.2013651877133107, + "grad_norm": 1.0345537662506104, + "learning_rate": 0.00036063708759954494, + "loss": 2.0083, + "step": 2814 + }, + { + "epoch": 3.2025028441410694, + "grad_norm": 0.7956832647323608, + "learning_rate": 0.0003604095563139932, + "loss": 1.377, + "step": 2815 + }, + { + "epoch": 3.203640500568828, + "grad_norm": 0.7763450145721436, + "learning_rate": 0.00036018202502844147, + "loss": 1.4905, + "step": 2816 + }, + { + "epoch": 3.204778156996587, + "grad_norm": 1.0623788833618164, + "learning_rate": 0.0003599544937428896, + "loss": 1.4396, + "step": 2817 + }, + { + "epoch": 3.2059158134243457, + "grad_norm": 0.8401424288749695, + "learning_rate": 0.0003597269624573379, + "loss": 1.7506, + "step": 2818 + }, + { + "epoch": 3.2070534698521045, + "grad_norm": 1.2517905235290527, + "learning_rate": 0.00035949943117178615, + "loss": 1.8421, + "step": 2819 + }, + { + "epoch": 3.2081911262798632, + "grad_norm": 0.9095095992088318, + "learning_rate": 0.00035927189988623436, + "loss": 1.069, + "step": 2820 + }, + { + "epoch": 3.2093287827076225, + "grad_norm": 1.1191394329071045, + "learning_rate": 0.0003590443686006826, + "loss": 2.1787, + "step": 2821 + }, + { + "epoch": 3.210466439135381, + "grad_norm": 0.9273049235343933, + "learning_rate": 0.00035881683731513083, + "loss": 2.2313, + "step": 2822 + }, + { + "epoch": 3.21160409556314, + "grad_norm": 1.0636650323867798, + "learning_rate": 0.00035858930602957904, + "loss": 1.5293, + "step": 2823 + }, + { + "epoch": 3.2127417519908987, + "grad_norm": 1.2871456146240234, + "learning_rate": 0.0003583617747440273, + "loss": 3.0545, + "step": 2824 + }, + { + "epoch": 3.2138794084186575, + "grad_norm": 1.2373828887939453, + "learning_rate": 0.00035813424345847557, + "loss": 2.0834, + "step": 2825 + }, + { + "epoch": 3.2150170648464163, + "grad_norm": 0.7762901186943054, + "learning_rate": 0.00035790671217292384, + "loss": 1.4895, + "step": 2826 + }, + { + "epoch": 3.216154721274175, + "grad_norm": 1.2859349250793457, + "learning_rate": 0.000357679180887372, + "loss": 1.9091, + "step": 2827 + }, + { + "epoch": 3.217292377701934, + "grad_norm": 1.2489643096923828, + "learning_rate": 0.00035745164960182026, + "loss": 1.831, + "step": 2828 + }, + { + "epoch": 3.218430034129693, + "grad_norm": 0.9940024018287659, + "learning_rate": 0.0003572241183162685, + "loss": 2.157, + "step": 2829 + }, + { + "epoch": 3.2195676905574517, + "grad_norm": 1.1232733726501465, + "learning_rate": 0.00035699658703071673, + "loss": 2.9384, + "step": 2830 + }, + { + "epoch": 3.2207053469852105, + "grad_norm": 0.7718859910964966, + "learning_rate": 0.000356769055745165, + "loss": 1.8196, + "step": 2831 + }, + { + "epoch": 3.2218430034129693, + "grad_norm": 1.089641809463501, + "learning_rate": 0.0003565415244596132, + "loss": 1.8792, + "step": 2832 + }, + { + "epoch": 3.222980659840728, + "grad_norm": 1.2740263938903809, + "learning_rate": 0.0003563139931740614, + "loss": 2.5414, + "step": 2833 + }, + { + "epoch": 3.2241183162684868, + "grad_norm": 0.9137701392173767, + "learning_rate": 0.0003560864618885097, + "loss": 1.3666, + "step": 2834 + }, + { + "epoch": 3.2252559726962455, + "grad_norm": 0.6575736403465271, + "learning_rate": 0.00035585893060295794, + "loss": 1.2568, + "step": 2835 + }, + { + "epoch": 3.2263936291240047, + "grad_norm": 0.9433609843254089, + "learning_rate": 0.0003556313993174061, + "loss": 1.4612, + "step": 2836 + }, + { + "epoch": 3.2275312855517635, + "grad_norm": 0.8380080461502075, + "learning_rate": 0.00035540386803185436, + "loss": 1.1819, + "step": 2837 + }, + { + "epoch": 3.2286689419795223, + "grad_norm": 0.7855201363563538, + "learning_rate": 0.0003551763367463026, + "loss": 1.8921, + "step": 2838 + }, + { + "epoch": 3.229806598407281, + "grad_norm": 1.5746235847473145, + "learning_rate": 0.0003549488054607509, + "loss": 3.0391, + "step": 2839 + }, + { + "epoch": 3.2309442548350398, + "grad_norm": 1.1520813703536987, + "learning_rate": 0.0003547212741751991, + "loss": 2.1142, + "step": 2840 + }, + { + "epoch": 3.2320819112627985, + "grad_norm": 0.8569504618644714, + "learning_rate": 0.0003544937428896473, + "loss": 2.0138, + "step": 2841 + }, + { + "epoch": 3.2332195676905573, + "grad_norm": 1.0433871746063232, + "learning_rate": 0.0003542662116040956, + "loss": 1.673, + "step": 2842 + }, + { + "epoch": 3.234357224118316, + "grad_norm": 1.01398766040802, + "learning_rate": 0.0003540386803185438, + "loss": 1.9582, + "step": 2843 + }, + { + "epoch": 3.2354948805460753, + "grad_norm": 1.0465679168701172, + "learning_rate": 0.00035381114903299205, + "loss": 1.9819, + "step": 2844 + }, + { + "epoch": 3.236632536973834, + "grad_norm": 0.787959635257721, + "learning_rate": 0.0003535836177474403, + "loss": 1.639, + "step": 2845 + }, + { + "epoch": 3.2377701934015928, + "grad_norm": 1.155023455619812, + "learning_rate": 0.00035335608646188847, + "loss": 1.8541, + "step": 2846 + }, + { + "epoch": 3.2389078498293515, + "grad_norm": 1.1908761262893677, + "learning_rate": 0.00035312855517633673, + "loss": 1.9333, + "step": 2847 + }, + { + "epoch": 3.2400455062571103, + "grad_norm": 0.9262527227401733, + "learning_rate": 0.000352901023890785, + "loss": 1.3209, + "step": 2848 + }, + { + "epoch": 3.241183162684869, + "grad_norm": 0.8731938004493713, + "learning_rate": 0.00035267349260523326, + "loss": 2.2036, + "step": 2849 + }, + { + "epoch": 3.242320819112628, + "grad_norm": 0.6273211240768433, + "learning_rate": 0.00035244596131968147, + "loss": 0.9153, + "step": 2850 + }, + { + "epoch": 3.243458475540387, + "grad_norm": 1.0732612609863281, + "learning_rate": 0.0003522184300341297, + "loss": 2.286, + "step": 2851 + }, + { + "epoch": 3.244596131968146, + "grad_norm": 1.1554096937179565, + "learning_rate": 0.00035199089874857794, + "loss": 2.0289, + "step": 2852 + }, + { + "epoch": 3.2457337883959045, + "grad_norm": 0.8074318170547485, + "learning_rate": 0.00035176336746302615, + "loss": 1.8181, + "step": 2853 + }, + { + "epoch": 3.2468714448236633, + "grad_norm": 1.392298936843872, + "learning_rate": 0.0003515358361774744, + "loss": 3.4711, + "step": 2854 + }, + { + "epoch": 3.248009101251422, + "grad_norm": 0.9060192108154297, + "learning_rate": 0.0003513083048919227, + "loss": 2.0728, + "step": 2855 + }, + { + "epoch": 3.249146757679181, + "grad_norm": 1.2438005208969116, + "learning_rate": 0.00035108077360637084, + "loss": 1.4773, + "step": 2856 + }, + { + "epoch": 3.2502844141069396, + "grad_norm": 1.0764005184173584, + "learning_rate": 0.0003508532423208191, + "loss": 1.5244, + "step": 2857 + }, + { + "epoch": 3.2514220705346983, + "grad_norm": 0.8813822269439697, + "learning_rate": 0.00035062571103526737, + "loss": 1.2567, + "step": 2858 + }, + { + "epoch": 3.252559726962457, + "grad_norm": 0.6739155054092407, + "learning_rate": 0.00035039817974971563, + "loss": 0.682, + "step": 2859 + }, + { + "epoch": 3.2536973833902163, + "grad_norm": 1.2803453207015991, + "learning_rate": 0.00035017064846416384, + "loss": 1.5126, + "step": 2860 + }, + { + "epoch": 3.254835039817975, + "grad_norm": 1.1614502668380737, + "learning_rate": 0.00034994311717861205, + "loss": 3.0873, + "step": 2861 + }, + { + "epoch": 3.255972696245734, + "grad_norm": 0.8287785053253174, + "learning_rate": 0.0003497155858930603, + "loss": 1.3949, + "step": 2862 + }, + { + "epoch": 3.2571103526734926, + "grad_norm": 0.8668496608734131, + "learning_rate": 0.0003494880546075085, + "loss": 1.5943, + "step": 2863 + }, + { + "epoch": 3.2582480091012513, + "grad_norm": 0.8574039936065674, + "learning_rate": 0.0003492605233219568, + "loss": 1.8113, + "step": 2864 + }, + { + "epoch": 3.25938566552901, + "grad_norm": 0.8691564202308655, + "learning_rate": 0.00034903299203640505, + "loss": 1.5579, + "step": 2865 + }, + { + "epoch": 3.260523321956769, + "grad_norm": 0.948115885257721, + "learning_rate": 0.0003488054607508532, + "loss": 1.4459, + "step": 2866 + }, + { + "epoch": 3.261660978384528, + "grad_norm": 1.410564661026001, + "learning_rate": 0.00034857792946530147, + "loss": 3.3305, + "step": 2867 + }, + { + "epoch": 3.262798634812287, + "grad_norm": 0.904012143611908, + "learning_rate": 0.00034835039817974973, + "loss": 1.7655, + "step": 2868 + }, + { + "epoch": 3.2639362912400456, + "grad_norm": 0.9044657945632935, + "learning_rate": 0.000348122866894198, + "loss": 1.2666, + "step": 2869 + }, + { + "epoch": 3.2650739476678043, + "grad_norm": 0.8898612856864929, + "learning_rate": 0.00034789533560864615, + "loss": 2.0107, + "step": 2870 + }, + { + "epoch": 3.266211604095563, + "grad_norm": 0.8412598967552185, + "learning_rate": 0.0003476678043230944, + "loss": 1.4021, + "step": 2871 + }, + { + "epoch": 3.267349260523322, + "grad_norm": 1.2648974657058716, + "learning_rate": 0.0003474402730375427, + "loss": 3.4206, + "step": 2872 + }, + { + "epoch": 3.2684869169510806, + "grad_norm": 0.9399024248123169, + "learning_rate": 0.0003472127417519909, + "loss": 2.8845, + "step": 2873 + }, + { + "epoch": 3.26962457337884, + "grad_norm": 0.9316064119338989, + "learning_rate": 0.00034698521046643916, + "loss": 2.0801, + "step": 2874 + }, + { + "epoch": 3.2707622298065986, + "grad_norm": 0.9166231155395508, + "learning_rate": 0.00034675767918088737, + "loss": 1.661, + "step": 2875 + }, + { + "epoch": 3.2718998862343573, + "grad_norm": 1.0802944898605347, + "learning_rate": 0.0003465301478953356, + "loss": 3.3163, + "step": 2876 + }, + { + "epoch": 3.273037542662116, + "grad_norm": 1.2445770502090454, + "learning_rate": 0.00034630261660978384, + "loss": 2.9616, + "step": 2877 + }, + { + "epoch": 3.274175199089875, + "grad_norm": 0.8100591897964478, + "learning_rate": 0.0003460750853242321, + "loss": 1.1971, + "step": 2878 + }, + { + "epoch": 3.2753128555176336, + "grad_norm": 0.9066864252090454, + "learning_rate": 0.00034584755403868037, + "loss": 1.7813, + "step": 2879 + }, + { + "epoch": 3.2764505119453924, + "grad_norm": 1.0590927600860596, + "learning_rate": 0.0003456200227531285, + "loss": 2.0441, + "step": 2880 + }, + { + "epoch": 3.277588168373151, + "grad_norm": 0.6991727352142334, + "learning_rate": 0.0003453924914675768, + "loss": 1.2761, + "step": 2881 + }, + { + "epoch": 3.27872582480091, + "grad_norm": 0.6339994668960571, + "learning_rate": 0.00034516496018202505, + "loss": 1.0168, + "step": 2882 + }, + { + "epoch": 3.279863481228669, + "grad_norm": 0.630440890789032, + "learning_rate": 0.00034493742889647326, + "loss": 1.3744, + "step": 2883 + }, + { + "epoch": 3.281001137656428, + "grad_norm": 0.8100972771644592, + "learning_rate": 0.0003447098976109215, + "loss": 1.9117, + "step": 2884 + }, + { + "epoch": 3.2821387940841866, + "grad_norm": 0.9163875579833984, + "learning_rate": 0.00034448236632536974, + "loss": 1.1392, + "step": 2885 + }, + { + "epoch": 3.2832764505119454, + "grad_norm": 0.9157561659812927, + "learning_rate": 0.00034425483503981795, + "loss": 1.9776, + "step": 2886 + }, + { + "epoch": 3.284414106939704, + "grad_norm": 0.7969180345535278, + "learning_rate": 0.0003440273037542662, + "loss": 1.4908, + "step": 2887 + }, + { + "epoch": 3.285551763367463, + "grad_norm": 0.7929701209068298, + "learning_rate": 0.0003437997724687145, + "loss": 1.6476, + "step": 2888 + }, + { + "epoch": 3.2866894197952217, + "grad_norm": 1.0109394788742065, + "learning_rate": 0.00034357224118316274, + "loss": 2.3243, + "step": 2889 + }, + { + "epoch": 3.287827076222981, + "grad_norm": 1.095586895942688, + "learning_rate": 0.0003433447098976109, + "loss": 2.0614, + "step": 2890 + }, + { + "epoch": 3.2889647326507396, + "grad_norm": 0.8181962966918945, + "learning_rate": 0.00034311717861205916, + "loss": 1.0926, + "step": 2891 + }, + { + "epoch": 3.2901023890784984, + "grad_norm": 0.5682795643806458, + "learning_rate": 0.0003428896473265074, + "loss": 0.5625, + "step": 2892 + }, + { + "epoch": 3.291240045506257, + "grad_norm": 0.8557625412940979, + "learning_rate": 0.00034266211604095563, + "loss": 1.8372, + "step": 2893 + }, + { + "epoch": 3.292377701934016, + "grad_norm": 0.6423472166061401, + "learning_rate": 0.0003424345847554039, + "loss": 1.3594, + "step": 2894 + }, + { + "epoch": 3.2935153583617747, + "grad_norm": 1.044715404510498, + "learning_rate": 0.0003422070534698521, + "loss": 2.7601, + "step": 2895 + }, + { + "epoch": 3.2946530147895334, + "grad_norm": 0.7515947222709656, + "learning_rate": 0.0003419795221843003, + "loss": 1.1865, + "step": 2896 + }, + { + "epoch": 3.295790671217292, + "grad_norm": 0.9097882509231567, + "learning_rate": 0.0003417519908987486, + "loss": 1.4461, + "step": 2897 + }, + { + "epoch": 3.296928327645051, + "grad_norm": 0.7172821760177612, + "learning_rate": 0.00034152445961319684, + "loss": 2.2885, + "step": 2898 + }, + { + "epoch": 3.29806598407281, + "grad_norm": 1.1029298305511475, + "learning_rate": 0.0003412969283276451, + "loss": 1.5583, + "step": 2899 + }, + { + "epoch": 3.299203640500569, + "grad_norm": 0.853276252746582, + "learning_rate": 0.00034106939704209326, + "loss": 1.4709, + "step": 2900 + }, + { + "epoch": 3.3003412969283277, + "grad_norm": 0.8551457524299622, + "learning_rate": 0.00034084186575654153, + "loss": 2.4956, + "step": 2901 + }, + { + "epoch": 3.3014789533560864, + "grad_norm": 0.909735381603241, + "learning_rate": 0.0003406143344709898, + "loss": 2.0525, + "step": 2902 + }, + { + "epoch": 3.302616609783845, + "grad_norm": 1.0006370544433594, + "learning_rate": 0.000340386803185438, + "loss": 1.6306, + "step": 2903 + }, + { + "epoch": 3.303754266211604, + "grad_norm": 0.5882608294487, + "learning_rate": 0.00034015927189988627, + "loss": 1.3989, + "step": 2904 + }, + { + "epoch": 3.3048919226393627, + "grad_norm": 1.2735271453857422, + "learning_rate": 0.0003399317406143345, + "loss": 2.394, + "step": 2905 + }, + { + "epoch": 3.306029579067122, + "grad_norm": 0.9738203287124634, + "learning_rate": 0.0003397042093287827, + "loss": 1.951, + "step": 2906 + }, + { + "epoch": 3.3071672354948807, + "grad_norm": 0.5416101217269897, + "learning_rate": 0.00033947667804323095, + "loss": 1.0747, + "step": 2907 + }, + { + "epoch": 3.3083048919226394, + "grad_norm": 0.9188492298126221, + "learning_rate": 0.0003392491467576792, + "loss": 1.4549, + "step": 2908 + }, + { + "epoch": 3.309442548350398, + "grad_norm": 1.4744441509246826, + "learning_rate": 0.0003390216154721274, + "loss": 2.4909, + "step": 2909 + }, + { + "epoch": 3.310580204778157, + "grad_norm": 0.6633999347686768, + "learning_rate": 0.00033879408418657563, + "loss": 1.4089, + "step": 2910 + }, + { + "epoch": 3.3117178612059157, + "grad_norm": 0.6373212933540344, + "learning_rate": 0.0003385665529010239, + "loss": 1.4588, + "step": 2911 + }, + { + "epoch": 3.3128555176336745, + "grad_norm": 0.5806794166564941, + "learning_rate": 0.00033833902161547216, + "loss": 1.3158, + "step": 2912 + }, + { + "epoch": 3.3139931740614337, + "grad_norm": 1.0303038358688354, + "learning_rate": 0.00033811149032992037, + "loss": 1.6245, + "step": 2913 + }, + { + "epoch": 3.3151308304891924, + "grad_norm": 1.0633960962295532, + "learning_rate": 0.0003378839590443686, + "loss": 1.9088, + "step": 2914 + }, + { + "epoch": 3.316268486916951, + "grad_norm": 0.9028981328010559, + "learning_rate": 0.00033765642775881685, + "loss": 1.64, + "step": 2915 + }, + { + "epoch": 3.31740614334471, + "grad_norm": 0.886370062828064, + "learning_rate": 0.00033742889647326506, + "loss": 1.292, + "step": 2916 + }, + { + "epoch": 3.3185437997724687, + "grad_norm": 1.6007177829742432, + "learning_rate": 0.0003372013651877133, + "loss": 3.1382, + "step": 2917 + }, + { + "epoch": 3.3196814562002275, + "grad_norm": 1.1357500553131104, + "learning_rate": 0.0003369738339021616, + "loss": 2.1361, + "step": 2918 + }, + { + "epoch": 3.3208191126279862, + "grad_norm": 0.9156873822212219, + "learning_rate": 0.0003367463026166098, + "loss": 1.3616, + "step": 2919 + }, + { + "epoch": 3.321956769055745, + "grad_norm": 1.0556998252868652, + "learning_rate": 0.000336518771331058, + "loss": 2.6077, + "step": 2920 + }, + { + "epoch": 3.3230944254835038, + "grad_norm": 1.1073150634765625, + "learning_rate": 0.00033629124004550627, + "loss": 1.6265, + "step": 2921 + }, + { + "epoch": 3.324232081911263, + "grad_norm": 0.6052758097648621, + "learning_rate": 0.00033606370875995453, + "loss": 1.0629, + "step": 2922 + }, + { + "epoch": 3.3253697383390217, + "grad_norm": 0.9209448099136353, + "learning_rate": 0.00033583617747440274, + "loss": 1.3336, + "step": 2923 + }, + { + "epoch": 3.3265073947667805, + "grad_norm": 0.5461276769638062, + "learning_rate": 0.00033560864618885095, + "loss": 1.1149, + "step": 2924 + }, + { + "epoch": 3.3276450511945392, + "grad_norm": 1.0375051498413086, + "learning_rate": 0.0003353811149032992, + "loss": 1.4975, + "step": 2925 + }, + { + "epoch": 3.328782707622298, + "grad_norm": 0.9948595762252808, + "learning_rate": 0.0003351535836177474, + "loss": 1.7277, + "step": 2926 + }, + { + "epoch": 3.3299203640500568, + "grad_norm": 1.711958408355713, + "learning_rate": 0.0003349260523321957, + "loss": 2.7451, + "step": 2927 + }, + { + "epoch": 3.3310580204778155, + "grad_norm": 0.9110103845596313, + "learning_rate": 0.00033469852104664395, + "loss": 1.8131, + "step": 2928 + }, + { + "epoch": 3.3321956769055747, + "grad_norm": 0.9176352620124817, + "learning_rate": 0.0003344709897610921, + "loss": 1.4978, + "step": 2929 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.8127102255821228, + "learning_rate": 0.0003342434584755404, + "loss": 1.8899, + "step": 2930 + }, + { + "epoch": 3.3344709897610922, + "grad_norm": 1.1340645551681519, + "learning_rate": 0.00033401592718998864, + "loss": 1.3567, + "step": 2931 + }, + { + "epoch": 3.335608646188851, + "grad_norm": 1.366903305053711, + "learning_rate": 0.0003337883959044369, + "loss": 3.0076, + "step": 2932 + }, + { + "epoch": 3.3367463026166098, + "grad_norm": 0.8700219392776489, + "learning_rate": 0.0003335608646188851, + "loss": 1.2597, + "step": 2933 + }, + { + "epoch": 3.3378839590443685, + "grad_norm": 0.9277002215385437, + "learning_rate": 0.0003333333333333333, + "loss": 2.267, + "step": 2934 + }, + { + "epoch": 3.3390216154721273, + "grad_norm": 0.5930815935134888, + "learning_rate": 0.0003331058020477816, + "loss": 1.3837, + "step": 2935 + }, + { + "epoch": 3.3401592718998865, + "grad_norm": 1.0349963903427124, + "learning_rate": 0.0003328782707622298, + "loss": 2.4195, + "step": 2936 + }, + { + "epoch": 3.3412969283276452, + "grad_norm": 1.2994656562805176, + "learning_rate": 0.00033265073947667806, + "loss": 1.4095, + "step": 2937 + }, + { + "epoch": 3.342434584755404, + "grad_norm": 0.7547191381454468, + "learning_rate": 0.0003324232081911263, + "loss": 1.6748, + "step": 2938 + }, + { + "epoch": 3.3435722411831628, + "grad_norm": 0.9032368659973145, + "learning_rate": 0.0003321956769055745, + "loss": 2.3666, + "step": 2939 + }, + { + "epoch": 3.3447098976109215, + "grad_norm": 0.8295652866363525, + "learning_rate": 0.00033196814562002274, + "loss": 1.4128, + "step": 2940 + }, + { + "epoch": 3.3458475540386803, + "grad_norm": 0.6656214594841003, + "learning_rate": 0.000331740614334471, + "loss": 0.8634, + "step": 2941 + }, + { + "epoch": 3.346985210466439, + "grad_norm": 0.9207725524902344, + "learning_rate": 0.00033151308304891927, + "loss": 2.3504, + "step": 2942 + }, + { + "epoch": 3.348122866894198, + "grad_norm": 0.7400831580162048, + "learning_rate": 0.00033128555176336743, + "loss": 0.9548, + "step": 2943 + }, + { + "epoch": 3.3492605233219566, + "grad_norm": 0.9420154690742493, + "learning_rate": 0.0003310580204778157, + "loss": 2.347, + "step": 2944 + }, + { + "epoch": 3.3503981797497158, + "grad_norm": 0.9879665970802307, + "learning_rate": 0.00033083048919226396, + "loss": 1.6713, + "step": 2945 + }, + { + "epoch": 3.3515358361774745, + "grad_norm": 0.8822195529937744, + "learning_rate": 0.00033060295790671217, + "loss": 0.9356, + "step": 2946 + }, + { + "epoch": 3.3526734926052333, + "grad_norm": 0.959632933139801, + "learning_rate": 0.00033037542662116043, + "loss": 2.099, + "step": 2947 + }, + { + "epoch": 3.353811149032992, + "grad_norm": 0.8753674626350403, + "learning_rate": 0.00033014789533560864, + "loss": 1.0478, + "step": 2948 + }, + { + "epoch": 3.354948805460751, + "grad_norm": 0.9367252588272095, + "learning_rate": 0.00032992036405005685, + "loss": 2.0897, + "step": 2949 + }, + { + "epoch": 3.3560864618885096, + "grad_norm": 1.059227705001831, + "learning_rate": 0.0003296928327645051, + "loss": 2.1907, + "step": 2950 + }, + { + "epoch": 3.3572241183162683, + "grad_norm": 1.2451213598251343, + "learning_rate": 0.0003294653014789534, + "loss": 2.9248, + "step": 2951 + }, + { + "epoch": 3.3583617747440275, + "grad_norm": 1.4228962659835815, + "learning_rate": 0.00032923777019340164, + "loss": 4.301, + "step": 2952 + }, + { + "epoch": 3.3594994311717863, + "grad_norm": 1.0747414827346802, + "learning_rate": 0.0003290102389078498, + "loss": 1.7888, + "step": 2953 + }, + { + "epoch": 3.360637087599545, + "grad_norm": 1.38980233669281, + "learning_rate": 0.00032878270762229806, + "loss": 0.5853, + "step": 2954 + }, + { + "epoch": 3.361774744027304, + "grad_norm": 0.793297290802002, + "learning_rate": 0.0003285551763367463, + "loss": 1.1242, + "step": 2955 + }, + { + "epoch": 3.3629124004550626, + "grad_norm": 2.7201013565063477, + "learning_rate": 0.00032832764505119454, + "loss": 4.6065, + "step": 2956 + }, + { + "epoch": 3.3640500568828213, + "grad_norm": 1.3290942907333374, + "learning_rate": 0.0003281001137656428, + "loss": 2.6127, + "step": 2957 + }, + { + "epoch": 3.36518771331058, + "grad_norm": 1.104690670967102, + "learning_rate": 0.000327872582480091, + "loss": 1.6713, + "step": 2958 + }, + { + "epoch": 3.366325369738339, + "grad_norm": 0.6908164620399475, + "learning_rate": 0.0003276450511945392, + "loss": 1.0, + "step": 2959 + }, + { + "epoch": 3.3674630261660976, + "grad_norm": 0.8555911779403687, + "learning_rate": 0.0003274175199089875, + "loss": 1.6721, + "step": 2960 + }, + { + "epoch": 3.368600682593857, + "grad_norm": 0.8147174119949341, + "learning_rate": 0.00032718998862343575, + "loss": 2.1911, + "step": 2961 + }, + { + "epoch": 3.3697383390216156, + "grad_norm": 1.1406668424606323, + "learning_rate": 0.000326962457337884, + "loss": 2.4286, + "step": 2962 + }, + { + "epoch": 3.3708759954493743, + "grad_norm": 0.8895596861839294, + "learning_rate": 0.00032673492605233217, + "loss": 1.8801, + "step": 2963 + }, + { + "epoch": 3.372013651877133, + "grad_norm": 0.6777999401092529, + "learning_rate": 0.00032650739476678043, + "loss": 0.8933, + "step": 2964 + }, + { + "epoch": 3.373151308304892, + "grad_norm": 0.8110511898994446, + "learning_rate": 0.0003262798634812287, + "loss": 1.6234, + "step": 2965 + }, + { + "epoch": 3.3742889647326506, + "grad_norm": 1.3193098306655884, + "learning_rate": 0.0003260523321956769, + "loss": 2.0755, + "step": 2966 + }, + { + "epoch": 3.3754266211604094, + "grad_norm": 1.0191433429718018, + "learning_rate": 0.00032582480091012517, + "loss": 2.6676, + "step": 2967 + }, + { + "epoch": 3.3765642775881686, + "grad_norm": 0.5699923038482666, + "learning_rate": 0.0003255972696245734, + "loss": 1.0331, + "step": 2968 + }, + { + "epoch": 3.3777019340159273, + "grad_norm": 1.2073493003845215, + "learning_rate": 0.0003253697383390216, + "loss": 3.2836, + "step": 2969 + }, + { + "epoch": 3.378839590443686, + "grad_norm": 0.5660185217857361, + "learning_rate": 0.00032514220705346985, + "loss": 0.8925, + "step": 2970 + }, + { + "epoch": 3.379977246871445, + "grad_norm": 1.1539686918258667, + "learning_rate": 0.0003249146757679181, + "loss": 1.6701, + "step": 2971 + }, + { + "epoch": 3.3811149032992036, + "grad_norm": 1.223471999168396, + "learning_rate": 0.0003246871444823664, + "loss": 2.0681, + "step": 2972 + }, + { + "epoch": 3.3822525597269624, + "grad_norm": 0.7714402675628662, + "learning_rate": 0.00032445961319681454, + "loss": 1.9618, + "step": 2973 + }, + { + "epoch": 3.383390216154721, + "grad_norm": 1.1004797220230103, + "learning_rate": 0.0003242320819112628, + "loss": 1.4624, + "step": 2974 + }, + { + "epoch": 3.3845278725824803, + "grad_norm": 0.7591771483421326, + "learning_rate": 0.00032400455062571107, + "loss": 1.1837, + "step": 2975 + }, + { + "epoch": 3.385665529010239, + "grad_norm": 0.9769864082336426, + "learning_rate": 0.0003237770193401593, + "loss": 1.6725, + "step": 2976 + }, + { + "epoch": 3.386803185437998, + "grad_norm": 1.0868338346481323, + "learning_rate": 0.0003235494880546075, + "loss": 2.3017, + "step": 2977 + }, + { + "epoch": 3.3879408418657566, + "grad_norm": 0.943532407283783, + "learning_rate": 0.00032332195676905575, + "loss": 2.1185, + "step": 2978 + }, + { + "epoch": 3.3890784982935154, + "grad_norm": 1.0216169357299805, + "learning_rate": 0.00032309442548350396, + "loss": 1.9361, + "step": 2979 + }, + { + "epoch": 3.390216154721274, + "grad_norm": 0.9502822756767273, + "learning_rate": 0.0003228668941979522, + "loss": 0.9628, + "step": 2980 + }, + { + "epoch": 3.391353811149033, + "grad_norm": 0.6996874213218689, + "learning_rate": 0.0003226393629124005, + "loss": 1.223, + "step": 2981 + }, + { + "epoch": 3.3924914675767917, + "grad_norm": 0.9376258254051208, + "learning_rate": 0.0003224118316268487, + "loss": 1.4156, + "step": 2982 + }, + { + "epoch": 3.3936291240045504, + "grad_norm": 1.000174880027771, + "learning_rate": 0.0003221843003412969, + "loss": 2.2908, + "step": 2983 + }, + { + "epoch": 3.3947667804323096, + "grad_norm": 1.1017203330993652, + "learning_rate": 0.00032195676905574517, + "loss": 2.837, + "step": 2984 + }, + { + "epoch": 3.3959044368600684, + "grad_norm": 0.7534080147743225, + "learning_rate": 0.00032172923777019344, + "loss": 1.6282, + "step": 2985 + }, + { + "epoch": 3.397042093287827, + "grad_norm": 0.8684095144271851, + "learning_rate": 0.00032150170648464165, + "loss": 2.2301, + "step": 2986 + }, + { + "epoch": 3.398179749715586, + "grad_norm": 1.0283501148223877, + "learning_rate": 0.00032127417519908986, + "loss": 2.0298, + "step": 2987 + }, + { + "epoch": 3.3993174061433447, + "grad_norm": 0.8715803623199463, + "learning_rate": 0.0003210466439135381, + "loss": 1.5656, + "step": 2988 + }, + { + "epoch": 3.4004550625711034, + "grad_norm": 1.0342568159103394, + "learning_rate": 0.00032081911262798633, + "loss": 1.8496, + "step": 2989 + }, + { + "epoch": 3.401592718998862, + "grad_norm": 1.0820322036743164, + "learning_rate": 0.0003205915813424346, + "loss": 1.4579, + "step": 2990 + }, + { + "epoch": 3.4027303754266214, + "grad_norm": 0.9718009233474731, + "learning_rate": 0.00032036405005688286, + "loss": 1.8414, + "step": 2991 + }, + { + "epoch": 3.40386803185438, + "grad_norm": 0.6825365424156189, + "learning_rate": 0.00032013651877133107, + "loss": 1.4311, + "step": 2992 + }, + { + "epoch": 3.405005688282139, + "grad_norm": 0.7811647057533264, + "learning_rate": 0.0003199089874857793, + "loss": 0.7493, + "step": 2993 + }, + { + "epoch": 3.4061433447098977, + "grad_norm": 0.8294082283973694, + "learning_rate": 0.00031968145620022754, + "loss": 1.5319, + "step": 2994 + }, + { + "epoch": 3.4072810011376564, + "grad_norm": 1.365054965019226, + "learning_rate": 0.0003194539249146758, + "loss": 3.0346, + "step": 2995 + }, + { + "epoch": 3.408418657565415, + "grad_norm": 1.0830689668655396, + "learning_rate": 0.000319226393629124, + "loss": 2.3795, + "step": 2996 + }, + { + "epoch": 3.409556313993174, + "grad_norm": 1.158379077911377, + "learning_rate": 0.0003189988623435722, + "loss": 1.6544, + "step": 2997 + }, + { + "epoch": 3.4106939704209327, + "grad_norm": 0.8690700531005859, + "learning_rate": 0.0003187713310580205, + "loss": 1.473, + "step": 2998 + }, + { + "epoch": 3.4118316268486915, + "grad_norm": 0.7095939517021179, + "learning_rate": 0.0003185437997724687, + "loss": 1.4625, + "step": 2999 + }, + { + "epoch": 3.4129692832764507, + "grad_norm": 1.2548210620880127, + "learning_rate": 0.00031831626848691696, + "loss": 1.8051, + "step": 3000 + }, + { + "epoch": 3.4141069397042094, + "grad_norm": 1.4674617052078247, + "learning_rate": 0.00031808873720136523, + "loss": 1.9178, + "step": 3001 + }, + { + "epoch": 3.415244596131968, + "grad_norm": 0.9363518953323364, + "learning_rate": 0.00031786120591581344, + "loss": 2.3612, + "step": 3002 + }, + { + "epoch": 3.416382252559727, + "grad_norm": 1.0488542318344116, + "learning_rate": 0.00031763367463026165, + "loss": 2.0528, + "step": 3003 + }, + { + "epoch": 3.4175199089874857, + "grad_norm": 1.0611392259597778, + "learning_rate": 0.0003174061433447099, + "loss": 1.2209, + "step": 3004 + }, + { + "epoch": 3.4186575654152445, + "grad_norm": 0.7108457088470459, + "learning_rate": 0.0003171786120591582, + "loss": 2.2781, + "step": 3005 + }, + { + "epoch": 3.419795221843003, + "grad_norm": 0.7717037200927734, + "learning_rate": 0.0003169510807736064, + "loss": 1.8069, + "step": 3006 + }, + { + "epoch": 3.4209328782707624, + "grad_norm": 1.1115652322769165, + "learning_rate": 0.0003167235494880546, + "loss": 2.1257, + "step": 3007 + }, + { + "epoch": 3.422070534698521, + "grad_norm": 1.1573059558868408, + "learning_rate": 0.00031649601820250286, + "loss": 1.5691, + "step": 3008 + }, + { + "epoch": 3.42320819112628, + "grad_norm": 0.7534975409507751, + "learning_rate": 0.00031626848691695107, + "loss": 1.6702, + "step": 3009 + }, + { + "epoch": 3.4243458475540387, + "grad_norm": 0.8538755178451538, + "learning_rate": 0.00031604095563139933, + "loss": 1.8829, + "step": 3010 + }, + { + "epoch": 3.4254835039817975, + "grad_norm": 1.041550636291504, + "learning_rate": 0.00031581342434584754, + "loss": 1.8357, + "step": 3011 + }, + { + "epoch": 3.426621160409556, + "grad_norm": 0.9445088505744934, + "learning_rate": 0.0003155858930602958, + "loss": 2.0357, + "step": 3012 + }, + { + "epoch": 3.427758816837315, + "grad_norm": 0.8078857064247131, + "learning_rate": 0.000315358361774744, + "loss": 1.6423, + "step": 3013 + }, + { + "epoch": 3.428896473265074, + "grad_norm": 1.6331647634506226, + "learning_rate": 0.0003151308304891923, + "loss": 2.6264, + "step": 3014 + }, + { + "epoch": 3.430034129692833, + "grad_norm": 1.0118826627731323, + "learning_rate": 0.00031490329920364055, + "loss": 2.5419, + "step": 3015 + }, + { + "epoch": 3.4311717861205917, + "grad_norm": 1.1630852222442627, + "learning_rate": 0.0003146757679180887, + "loss": 2.345, + "step": 3016 + }, + { + "epoch": 3.4323094425483505, + "grad_norm": 1.082171082496643, + "learning_rate": 0.00031444823663253697, + "loss": 1.6889, + "step": 3017 + }, + { + "epoch": 3.4334470989761092, + "grad_norm": 0.9755992889404297, + "learning_rate": 0.00031422070534698523, + "loss": 1.599, + "step": 3018 + }, + { + "epoch": 3.434584755403868, + "grad_norm": 1.2539499998092651, + "learning_rate": 0.00031399317406143344, + "loss": 3.2968, + "step": 3019 + }, + { + "epoch": 3.4357224118316267, + "grad_norm": 0.8818562030792236, + "learning_rate": 0.0003137656427758817, + "loss": 2.6973, + "step": 3020 + }, + { + "epoch": 3.4368600682593855, + "grad_norm": 0.900875449180603, + "learning_rate": 0.0003135381114903299, + "loss": 1.6828, + "step": 3021 + }, + { + "epoch": 3.4379977246871443, + "grad_norm": 0.7886406779289246, + "learning_rate": 0.0003133105802047782, + "loss": 2.1497, + "step": 3022 + }, + { + "epoch": 3.4391353811149035, + "grad_norm": 1.0653672218322754, + "learning_rate": 0.0003130830489192264, + "loss": 1.2112, + "step": 3023 + }, + { + "epoch": 3.4402730375426622, + "grad_norm": 0.8921288251876831, + "learning_rate": 0.00031285551763367465, + "loss": 1.5596, + "step": 3024 + }, + { + "epoch": 3.441410693970421, + "grad_norm": 1.0506072044372559, + "learning_rate": 0.0003126279863481229, + "loss": 1.9131, + "step": 3025 + }, + { + "epoch": 3.4425483503981797, + "grad_norm": 0.9992786645889282, + "learning_rate": 0.00031240045506257107, + "loss": 2.2033, + "step": 3026 + }, + { + "epoch": 3.4436860068259385, + "grad_norm": 1.0742168426513672, + "learning_rate": 0.00031217292377701934, + "loss": 1.793, + "step": 3027 + }, + { + "epoch": 3.4448236632536973, + "grad_norm": 1.2853068113327026, + "learning_rate": 0.0003119453924914676, + "loss": 2.5013, + "step": 3028 + }, + { + "epoch": 3.445961319681456, + "grad_norm": 1.1094430685043335, + "learning_rate": 0.0003117178612059158, + "loss": 2.518, + "step": 3029 + }, + { + "epoch": 3.4470989761092152, + "grad_norm": 1.2212568521499634, + "learning_rate": 0.0003114903299203641, + "loss": 1.1286, + "step": 3030 + }, + { + "epoch": 3.448236632536974, + "grad_norm": 0.8297713398933411, + "learning_rate": 0.0003112627986348123, + "loss": 2.0985, + "step": 3031 + }, + { + "epoch": 3.4493742889647327, + "grad_norm": 1.1644102334976196, + "learning_rate": 0.0003110352673492605, + "loss": 2.4079, + "step": 3032 + }, + { + "epoch": 3.4505119453924915, + "grad_norm": 0.8777130842208862, + "learning_rate": 0.00031080773606370876, + "loss": 3.0359, + "step": 3033 + }, + { + "epoch": 3.4516496018202503, + "grad_norm": 1.0054441690444946, + "learning_rate": 0.000310580204778157, + "loss": 1.4005, + "step": 3034 + }, + { + "epoch": 3.452787258248009, + "grad_norm": 0.7273943424224854, + "learning_rate": 0.0003103526734926053, + "loss": 0.9642, + "step": 3035 + }, + { + "epoch": 3.453924914675768, + "grad_norm": 0.9213597774505615, + "learning_rate": 0.00031012514220705344, + "loss": 1.603, + "step": 3036 + }, + { + "epoch": 3.4550625711035265, + "grad_norm": 0.8535187840461731, + "learning_rate": 0.0003098976109215017, + "loss": 1.8995, + "step": 3037 + }, + { + "epoch": 3.4562002275312853, + "grad_norm": 0.6558015942573547, + "learning_rate": 0.00030967007963594997, + "loss": 1.7263, + "step": 3038 + }, + { + "epoch": 3.4573378839590445, + "grad_norm": 0.8745822310447693, + "learning_rate": 0.0003094425483503982, + "loss": 2.1463, + "step": 3039 + }, + { + "epoch": 3.4584755403868033, + "grad_norm": 0.829898476600647, + "learning_rate": 0.00030921501706484644, + "loss": 1.1866, + "step": 3040 + }, + { + "epoch": 3.459613196814562, + "grad_norm": 0.7422157526016235, + "learning_rate": 0.00030898748577929465, + "loss": 1.6347, + "step": 3041 + }, + { + "epoch": 3.460750853242321, + "grad_norm": 1.017354965209961, + "learning_rate": 0.00030875995449374286, + "loss": 2.092, + "step": 3042 + }, + { + "epoch": 3.4618885096700796, + "grad_norm": 0.9957495331764221, + "learning_rate": 0.00030853242320819113, + "loss": 1.6618, + "step": 3043 + }, + { + "epoch": 3.4630261660978383, + "grad_norm": 0.9194496273994446, + "learning_rate": 0.0003083048919226394, + "loss": 1.3234, + "step": 3044 + }, + { + "epoch": 3.464163822525597, + "grad_norm": 0.8070917129516602, + "learning_rate": 0.00030807736063708766, + "loss": 1.0518, + "step": 3045 + }, + { + "epoch": 3.4653014789533563, + "grad_norm": 0.9527956247329712, + "learning_rate": 0.0003078498293515358, + "loss": 1.8434, + "step": 3046 + }, + { + "epoch": 3.466439135381115, + "grad_norm": 0.9919708967208862, + "learning_rate": 0.0003076222980659841, + "loss": 1.8574, + "step": 3047 + }, + { + "epoch": 3.467576791808874, + "grad_norm": 0.7566163539886475, + "learning_rate": 0.00030739476678043234, + "loss": 2.2815, + "step": 3048 + }, + { + "epoch": 3.4687144482366326, + "grad_norm": 0.9245063066482544, + "learning_rate": 0.00030716723549488055, + "loss": 1.8061, + "step": 3049 + }, + { + "epoch": 3.4698521046643913, + "grad_norm": 0.5763617753982544, + "learning_rate": 0.00030693970420932876, + "loss": 0.8955, + "step": 3050 + }, + { + "epoch": 3.47098976109215, + "grad_norm": 0.6619337797164917, + "learning_rate": 0.000306712172923777, + "loss": 0.8753, + "step": 3051 + }, + { + "epoch": 3.472127417519909, + "grad_norm": 0.9321338534355164, + "learning_rate": 0.00030648464163822523, + "loss": 1.3895, + "step": 3052 + }, + { + "epoch": 3.473265073947668, + "grad_norm": 1.0468645095825195, + "learning_rate": 0.0003062571103526735, + "loss": 1.9941, + "step": 3053 + }, + { + "epoch": 3.474402730375427, + "grad_norm": 1.2254489660263062, + "learning_rate": 0.00030602957906712176, + "loss": 2.2438, + "step": 3054 + }, + { + "epoch": 3.4755403868031856, + "grad_norm": 0.965639591217041, + "learning_rate": 0.00030580204778156997, + "loss": 1.6008, + "step": 3055 + }, + { + "epoch": 3.4766780432309443, + "grad_norm": 0.7972604632377625, + "learning_rate": 0.0003055745164960182, + "loss": 1.2332, + "step": 3056 + }, + { + "epoch": 3.477815699658703, + "grad_norm": 0.8752893209457397, + "learning_rate": 0.00030534698521046645, + "loss": 1.3047, + "step": 3057 + }, + { + "epoch": 3.478953356086462, + "grad_norm": 0.7717936038970947, + "learning_rate": 0.0003051194539249147, + "loss": 1.4136, + "step": 3058 + }, + { + "epoch": 3.4800910125142206, + "grad_norm": 0.8218960165977478, + "learning_rate": 0.0003048919226393629, + "loss": 1.5268, + "step": 3059 + }, + { + "epoch": 3.4812286689419794, + "grad_norm": 0.8942006230354309, + "learning_rate": 0.00030466439135381113, + "loss": 1.9143, + "step": 3060 + }, + { + "epoch": 3.482366325369738, + "grad_norm": 0.8847683668136597, + "learning_rate": 0.0003044368600682594, + "loss": 2.8478, + "step": 3061 + }, + { + "epoch": 3.4835039817974973, + "grad_norm": 0.8269094228744507, + "learning_rate": 0.0003042093287827076, + "loss": 2.0533, + "step": 3062 + }, + { + "epoch": 3.484641638225256, + "grad_norm": 0.7389246225357056, + "learning_rate": 0.00030398179749715587, + "loss": 1.5518, + "step": 3063 + }, + { + "epoch": 3.485779294653015, + "grad_norm": 0.7460439205169678, + "learning_rate": 0.00030375426621160413, + "loss": 0.9565, + "step": 3064 + }, + { + "epoch": 3.4869169510807736, + "grad_norm": 0.8218055367469788, + "learning_rate": 0.00030352673492605234, + "loss": 1.8419, + "step": 3065 + }, + { + "epoch": 3.4880546075085324, + "grad_norm": 1.2303011417388916, + "learning_rate": 0.00030329920364050055, + "loss": 2.0404, + "step": 3066 + }, + { + "epoch": 3.489192263936291, + "grad_norm": 0.7625948190689087, + "learning_rate": 0.0003030716723549488, + "loss": 1.502, + "step": 3067 + }, + { + "epoch": 3.49032992036405, + "grad_norm": 0.9459888339042664, + "learning_rate": 0.0003028441410693971, + "loss": 1.4141, + "step": 3068 + }, + { + "epoch": 3.491467576791809, + "grad_norm": 0.9511308670043945, + "learning_rate": 0.0003026166097838453, + "loss": 2.9199, + "step": 3069 + }, + { + "epoch": 3.492605233219568, + "grad_norm": 0.9939665198326111, + "learning_rate": 0.0003023890784982935, + "loss": 1.7376, + "step": 3070 + }, + { + "epoch": 3.4937428896473266, + "grad_norm": 1.1104254722595215, + "learning_rate": 0.00030216154721274176, + "loss": 2.1018, + "step": 3071 + }, + { + "epoch": 3.4948805460750854, + "grad_norm": 0.8199077248573303, + "learning_rate": 0.00030193401592718997, + "loss": 2.0834, + "step": 3072 + }, + { + "epoch": 3.496018202502844, + "grad_norm": 0.7726981043815613, + "learning_rate": 0.00030170648464163824, + "loss": 1.2873, + "step": 3073 + }, + { + "epoch": 3.497155858930603, + "grad_norm": 1.0992076396942139, + "learning_rate": 0.0003014789533560865, + "loss": 2.3819, + "step": 3074 + }, + { + "epoch": 3.4982935153583616, + "grad_norm": 0.7880586385726929, + "learning_rate": 0.0003012514220705347, + "loss": 2.0231, + "step": 3075 + }, + { + "epoch": 3.4994311717861204, + "grad_norm": 0.5555304884910583, + "learning_rate": 0.0003010238907849829, + "loss": 1.0044, + "step": 3076 + }, + { + "epoch": 3.500568828213879, + "grad_norm": 1.2383909225463867, + "learning_rate": 0.0003007963594994312, + "loss": 1.2884, + "step": 3077 + }, + { + "epoch": 3.5017064846416384, + "grad_norm": 1.214747667312622, + "learning_rate": 0.00030056882821387945, + "loss": 1.4196, + "step": 3078 + }, + { + "epoch": 3.502844141069397, + "grad_norm": 0.7822383046150208, + "learning_rate": 0.00030034129692832766, + "loss": 1.3068, + "step": 3079 + }, + { + "epoch": 3.503981797497156, + "grad_norm": 1.200726866722107, + "learning_rate": 0.00030011376564277587, + "loss": 2.0791, + "step": 3080 + }, + { + "epoch": 3.5051194539249146, + "grad_norm": 0.7341413497924805, + "learning_rate": 0.00029988623435722413, + "loss": 0.9889, + "step": 3081 + }, + { + "epoch": 3.5062571103526734, + "grad_norm": 1.4545503854751587, + "learning_rate": 0.00029965870307167234, + "loss": 2.6975, + "step": 3082 + }, + { + "epoch": 3.507394766780432, + "grad_norm": 0.8442320227622986, + "learning_rate": 0.0002994311717861206, + "loss": 1.6271, + "step": 3083 + }, + { + "epoch": 3.508532423208191, + "grad_norm": 0.7989565134048462, + "learning_rate": 0.0002992036405005688, + "loss": 0.9025, + "step": 3084 + }, + { + "epoch": 3.50967007963595, + "grad_norm": 0.9094103574752808, + "learning_rate": 0.0002989761092150171, + "loss": 1.4245, + "step": 3085 + }, + { + "epoch": 3.510807736063709, + "grad_norm": 1.1836727857589722, + "learning_rate": 0.0002987485779294653, + "loss": 2.3158, + "step": 3086 + }, + { + "epoch": 3.5119453924914676, + "grad_norm": 0.9379547238349915, + "learning_rate": 0.00029852104664391355, + "loss": 1.3348, + "step": 3087 + }, + { + "epoch": 3.5130830489192264, + "grad_norm": 1.4176782369613647, + "learning_rate": 0.0002982935153583618, + "loss": 2.056, + "step": 3088 + }, + { + "epoch": 3.514220705346985, + "grad_norm": 0.8050400018692017, + "learning_rate": 0.00029806598407281, + "loss": 1.7547, + "step": 3089 + }, + { + "epoch": 3.515358361774744, + "grad_norm": 0.6850399971008301, + "learning_rate": 0.00029783845278725824, + "loss": 0.8674, + "step": 3090 + }, + { + "epoch": 3.5164960182025027, + "grad_norm": 1.1448007822036743, + "learning_rate": 0.0002976109215017065, + "loss": 2.3783, + "step": 3091 + }, + { + "epoch": 3.517633674630262, + "grad_norm": 0.645057737827301, + "learning_rate": 0.0002973833902161547, + "loss": 0.9246, + "step": 3092 + }, + { + "epoch": 3.51877133105802, + "grad_norm": 1.0810428857803345, + "learning_rate": 0.000297155858930603, + "loss": 1.9262, + "step": 3093 + }, + { + "epoch": 3.5199089874857794, + "grad_norm": 0.7463341951370239, + "learning_rate": 0.0002969283276450512, + "loss": 1.8343, + "step": 3094 + }, + { + "epoch": 3.521046643913538, + "grad_norm": 1.1941145658493042, + "learning_rate": 0.00029670079635949945, + "loss": 3.0099, + "step": 3095 + }, + { + "epoch": 3.522184300341297, + "grad_norm": 1.1003155708312988, + "learning_rate": 0.00029647326507394766, + "loss": 1.9862, + "step": 3096 + }, + { + "epoch": 3.5233219567690557, + "grad_norm": 0.7779751420021057, + "learning_rate": 0.0002962457337883959, + "loss": 1.5242, + "step": 3097 + }, + { + "epoch": 3.5244596131968144, + "grad_norm": 0.7989513874053955, + "learning_rate": 0.0002960182025028442, + "loss": 1.2492, + "step": 3098 + }, + { + "epoch": 3.5255972696245736, + "grad_norm": 0.9354263544082642, + "learning_rate": 0.00029579067121729234, + "loss": 1.1991, + "step": 3099 + }, + { + "epoch": 3.526734926052332, + "grad_norm": 1.2096282243728638, + "learning_rate": 0.0002955631399317406, + "loss": 2.2768, + "step": 3100 + }, + { + "epoch": 3.527872582480091, + "grad_norm": 1.2972865104675293, + "learning_rate": 0.00029533560864618887, + "loss": 2.3998, + "step": 3101 + }, + { + "epoch": 3.52901023890785, + "grad_norm": 0.9327741861343384, + "learning_rate": 0.0002951080773606371, + "loss": 2.3759, + "step": 3102 + }, + { + "epoch": 3.5301478953356087, + "grad_norm": 1.7685364484786987, + "learning_rate": 0.00029488054607508535, + "loss": 2.9593, + "step": 3103 + }, + { + "epoch": 3.5312855517633674, + "grad_norm": 1.1170707941055298, + "learning_rate": 0.00029465301478953356, + "loss": 1.7477, + "step": 3104 + }, + { + "epoch": 3.532423208191126, + "grad_norm": 0.635349452495575, + "learning_rate": 0.0002944254835039818, + "loss": 0.4989, + "step": 3105 + }, + { + "epoch": 3.533560864618885, + "grad_norm": 1.165709376335144, + "learning_rate": 0.00029419795221843003, + "loss": 1.5922, + "step": 3106 + }, + { + "epoch": 3.5346985210466437, + "grad_norm": 0.8228427171707153, + "learning_rate": 0.0002939704209328783, + "loss": 1.6231, + "step": 3107 + }, + { + "epoch": 3.535836177474403, + "grad_norm": 0.7035273313522339, + "learning_rate": 0.00029374288964732656, + "loss": 1.2547, + "step": 3108 + }, + { + "epoch": 3.5369738339021617, + "grad_norm": 0.9411433935165405, + "learning_rate": 0.0002935153583617747, + "loss": 1.4946, + "step": 3109 + }, + { + "epoch": 3.5381114903299204, + "grad_norm": 0.8050945401191711, + "learning_rate": 0.000293287827076223, + "loss": 1.2655, + "step": 3110 + }, + { + "epoch": 3.539249146757679, + "grad_norm": 0.5025121569633484, + "learning_rate": 0.00029306029579067124, + "loss": 1.1542, + "step": 3111 + }, + { + "epoch": 3.540386803185438, + "grad_norm": 0.6222087740898132, + "learning_rate": 0.00029283276450511945, + "loss": 0.8863, + "step": 3112 + }, + { + "epoch": 3.5415244596131967, + "grad_norm": 1.4298913478851318, + "learning_rate": 0.0002926052332195677, + "loss": 2.4176, + "step": 3113 + }, + { + "epoch": 3.5426621160409555, + "grad_norm": 0.5780576467514038, + "learning_rate": 0.0002923777019340159, + "loss": 0.5984, + "step": 3114 + }, + { + "epoch": 3.5437997724687147, + "grad_norm": 0.6261698007583618, + "learning_rate": 0.0002921501706484642, + "loss": 0.7561, + "step": 3115 + }, + { + "epoch": 3.544937428896473, + "grad_norm": 0.7118838429450989, + "learning_rate": 0.0002919226393629124, + "loss": 0.7812, + "step": 3116 + }, + { + "epoch": 3.546075085324232, + "grad_norm": 0.8261246681213379, + "learning_rate": 0.00029169510807736066, + "loss": 1.0061, + "step": 3117 + }, + { + "epoch": 3.547212741751991, + "grad_norm": 1.191845178604126, + "learning_rate": 0.0002914675767918089, + "loss": 3.8113, + "step": 3118 + }, + { + "epoch": 3.5483503981797497, + "grad_norm": 1.1474705934524536, + "learning_rate": 0.0002912400455062571, + "loss": 2.7153, + "step": 3119 + }, + { + "epoch": 3.5494880546075085, + "grad_norm": 0.9370527267456055, + "learning_rate": 0.00029101251422070535, + "loss": 1.5725, + "step": 3120 + }, + { + "epoch": 3.5506257110352673, + "grad_norm": 0.9595047235488892, + "learning_rate": 0.0002907849829351536, + "loss": 1.8677, + "step": 3121 + }, + { + "epoch": 3.551763367463026, + "grad_norm": 1.2102956771850586, + "learning_rate": 0.0002905574516496018, + "loss": 1.2953, + "step": 3122 + }, + { + "epoch": 3.5529010238907848, + "grad_norm": 0.595016360282898, + "learning_rate": 0.00029032992036405003, + "loss": 0.6797, + "step": 3123 + }, + { + "epoch": 3.554038680318544, + "grad_norm": 0.8877463936805725, + "learning_rate": 0.0002901023890784983, + "loss": 1.4732, + "step": 3124 + }, + { + "epoch": 3.5551763367463027, + "grad_norm": 0.9120120406150818, + "learning_rate": 0.00028987485779294656, + "loss": 2.0458, + "step": 3125 + }, + { + "epoch": 3.5563139931740615, + "grad_norm": 0.845203697681427, + "learning_rate": 0.00028964732650739477, + "loss": 1.9619, + "step": 3126 + }, + { + "epoch": 3.5574516496018203, + "grad_norm": 1.032363772392273, + "learning_rate": 0.00028941979522184303, + "loss": 1.3608, + "step": 3127 + }, + { + "epoch": 3.558589306029579, + "grad_norm": 0.7119994759559631, + "learning_rate": 0.00028919226393629124, + "loss": 1.3793, + "step": 3128 + }, + { + "epoch": 3.5597269624573378, + "grad_norm": 2.150618553161621, + "learning_rate": 0.00028896473265073945, + "loss": 4.2388, + "step": 3129 + }, + { + "epoch": 3.5608646188850965, + "grad_norm": 0.7926281094551086, + "learning_rate": 0.0002887372013651877, + "loss": 1.7266, + "step": 3130 + }, + { + "epoch": 3.5620022753128557, + "grad_norm": 1.1008011102676392, + "learning_rate": 0.000288509670079636, + "loss": 2.8486, + "step": 3131 + }, + { + "epoch": 3.5631399317406145, + "grad_norm": 0.8936535716056824, + "learning_rate": 0.0002882821387940842, + "loss": 2.1958, + "step": 3132 + }, + { + "epoch": 3.5642775881683733, + "grad_norm": 0.7171263694763184, + "learning_rate": 0.0002880546075085324, + "loss": 0.8248, + "step": 3133 + }, + { + "epoch": 3.565415244596132, + "grad_norm": 1.07510244846344, + "learning_rate": 0.00028782707622298067, + "loss": 1.7625, + "step": 3134 + }, + { + "epoch": 3.5665529010238908, + "grad_norm": 0.6037983298301697, + "learning_rate": 0.0002875995449374289, + "loss": 0.8, + "step": 3135 + }, + { + "epoch": 3.5676905574516495, + "grad_norm": 0.737299919128418, + "learning_rate": 0.00028737201365187714, + "loss": 1.5646, + "step": 3136 + }, + { + "epoch": 3.5688282138794083, + "grad_norm": 0.8927704095840454, + "learning_rate": 0.0002871444823663254, + "loss": 1.8448, + "step": 3137 + }, + { + "epoch": 3.5699658703071675, + "grad_norm": 0.7696796655654907, + "learning_rate": 0.0002869169510807736, + "loss": 1.8304, + "step": 3138 + }, + { + "epoch": 3.571103526734926, + "grad_norm": 1.4837048053741455, + "learning_rate": 0.0002866894197952218, + "loss": 3.0218, + "step": 3139 + }, + { + "epoch": 3.572241183162685, + "grad_norm": 0.9347735643386841, + "learning_rate": 0.0002864618885096701, + "loss": 1.3186, + "step": 3140 + }, + { + "epoch": 3.573378839590444, + "grad_norm": 0.8641846179962158, + "learning_rate": 0.00028623435722411835, + "loss": 1.4623, + "step": 3141 + }, + { + "epoch": 3.5745164960182025, + "grad_norm": 1.1673868894577026, + "learning_rate": 0.00028600682593856656, + "loss": 2.0682, + "step": 3142 + }, + { + "epoch": 3.5756541524459613, + "grad_norm": 1.1850996017456055, + "learning_rate": 0.00028577929465301477, + "loss": 2.0922, + "step": 3143 + }, + { + "epoch": 3.57679180887372, + "grad_norm": 0.799368679523468, + "learning_rate": 0.00028555176336746304, + "loss": 2.0306, + "step": 3144 + }, + { + "epoch": 3.577929465301479, + "grad_norm": 0.7206624150276184, + "learning_rate": 0.00028532423208191125, + "loss": 1.3149, + "step": 3145 + }, + { + "epoch": 3.5790671217292376, + "grad_norm": 1.1149860620498657, + "learning_rate": 0.0002850967007963595, + "loss": 2.1369, + "step": 3146 + }, + { + "epoch": 3.580204778156997, + "grad_norm": 1.0406419038772583, + "learning_rate": 0.0002848691695108078, + "loss": 1.652, + "step": 3147 + }, + { + "epoch": 3.5813424345847555, + "grad_norm": 1.1601135730743408, + "learning_rate": 0.000284641638225256, + "loss": 1.6278, + "step": 3148 + }, + { + "epoch": 3.5824800910125143, + "grad_norm": 1.0218907594680786, + "learning_rate": 0.0002844141069397042, + "loss": 1.2107, + "step": 3149 + }, + { + "epoch": 3.583617747440273, + "grad_norm": 1.2365930080413818, + "learning_rate": 0.00028418657565415246, + "loss": 2.1005, + "step": 3150 + }, + { + "epoch": 3.584755403868032, + "grad_norm": 0.7121724486351013, + "learning_rate": 0.0002839590443686007, + "loss": 1.5354, + "step": 3151 + }, + { + "epoch": 3.5858930602957906, + "grad_norm": 0.9739391207695007, + "learning_rate": 0.0002837315130830489, + "loss": 2.6124, + "step": 3152 + }, + { + "epoch": 3.5870307167235493, + "grad_norm": 1.4687827825546265, + "learning_rate": 0.00028350398179749714, + "loss": 2.4509, + "step": 3153 + }, + { + "epoch": 3.5881683731513085, + "grad_norm": 1.106471061706543, + "learning_rate": 0.0002832764505119454, + "loss": 1.6431, + "step": 3154 + }, + { + "epoch": 3.589306029579067, + "grad_norm": 1.566792607307434, + "learning_rate": 0.0002830489192263936, + "loss": 2.5289, + "step": 3155 + }, + { + "epoch": 3.590443686006826, + "grad_norm": 0.8209744095802307, + "learning_rate": 0.0002828213879408419, + "loss": 1.251, + "step": 3156 + }, + { + "epoch": 3.591581342434585, + "grad_norm": 1.5616912841796875, + "learning_rate": 0.0002825938566552901, + "loss": 2.958, + "step": 3157 + }, + { + "epoch": 3.5927189988623436, + "grad_norm": 0.8948552012443542, + "learning_rate": 0.00028236632536973835, + "loss": 1.6619, + "step": 3158 + }, + { + "epoch": 3.5938566552901023, + "grad_norm": 1.0657151937484741, + "learning_rate": 0.00028213879408418656, + "loss": 1.8547, + "step": 3159 + }, + { + "epoch": 3.594994311717861, + "grad_norm": 0.7531653046607971, + "learning_rate": 0.00028191126279863483, + "loss": 1.5972, + "step": 3160 + }, + { + "epoch": 3.59613196814562, + "grad_norm": 0.9963746070861816, + "learning_rate": 0.0002816837315130831, + "loss": 2.1391, + "step": 3161 + }, + { + "epoch": 3.5972696245733786, + "grad_norm": 1.1848362684249878, + "learning_rate": 0.00028145620022753125, + "loss": 1.8631, + "step": 3162 + }, + { + "epoch": 3.598407281001138, + "grad_norm": 1.2183339595794678, + "learning_rate": 0.0002812286689419795, + "loss": 2.2846, + "step": 3163 + }, + { + "epoch": 3.5995449374288966, + "grad_norm": 0.654116153717041, + "learning_rate": 0.0002810011376564278, + "loss": 1.3402, + "step": 3164 + }, + { + "epoch": 3.6006825938566553, + "grad_norm": 1.1731293201446533, + "learning_rate": 0.000280773606370876, + "loss": 2.3222, + "step": 3165 + }, + { + "epoch": 3.601820250284414, + "grad_norm": 0.6737352013587952, + "learning_rate": 0.00028054607508532425, + "loss": 1.3616, + "step": 3166 + }, + { + "epoch": 3.602957906712173, + "grad_norm": 0.6698241233825684, + "learning_rate": 0.00028031854379977246, + "loss": 1.0889, + "step": 3167 + }, + { + "epoch": 3.6040955631399316, + "grad_norm": 1.0666381120681763, + "learning_rate": 0.0002800910125142207, + "loss": 1.6506, + "step": 3168 + }, + { + "epoch": 3.6052332195676904, + "grad_norm": 0.7355189323425293, + "learning_rate": 0.00027986348122866893, + "loss": 1.2037, + "step": 3169 + }, + { + "epoch": 3.6063708759954496, + "grad_norm": 1.5804355144500732, + "learning_rate": 0.0002796359499431172, + "loss": 3.5022, + "step": 3170 + }, + { + "epoch": 3.6075085324232083, + "grad_norm": 0.602844774723053, + "learning_rate": 0.00027940841865756546, + "loss": 0.6136, + "step": 3171 + }, + { + "epoch": 3.608646188850967, + "grad_norm": 0.6072266101837158, + "learning_rate": 0.0002791808873720136, + "loss": 0.932, + "step": 3172 + }, + { + "epoch": 3.609783845278726, + "grad_norm": 0.9740130305290222, + "learning_rate": 0.0002789533560864619, + "loss": 1.3407, + "step": 3173 + }, + { + "epoch": 3.6109215017064846, + "grad_norm": 0.99945068359375, + "learning_rate": 0.00027872582480091015, + "loss": 3.0923, + "step": 3174 + }, + { + "epoch": 3.6120591581342434, + "grad_norm": 0.8721227645874023, + "learning_rate": 0.00027849829351535836, + "loss": 1.1302, + "step": 3175 + }, + { + "epoch": 3.613196814562002, + "grad_norm": 0.9758340716362, + "learning_rate": 0.0002782707622298066, + "loss": 1.2336, + "step": 3176 + }, + { + "epoch": 3.6143344709897613, + "grad_norm": 0.9425876140594482, + "learning_rate": 0.00027804323094425483, + "loss": 1.7407, + "step": 3177 + }, + { + "epoch": 3.6154721274175197, + "grad_norm": 1.0528191328048706, + "learning_rate": 0.0002778156996587031, + "loss": 1.7902, + "step": 3178 + }, + { + "epoch": 3.616609783845279, + "grad_norm": 0.9743272662162781, + "learning_rate": 0.0002775881683731513, + "loss": 1.5341, + "step": 3179 + }, + { + "epoch": 3.6177474402730376, + "grad_norm": 1.2372887134552002, + "learning_rate": 0.00027736063708759957, + "loss": 2.7279, + "step": 3180 + }, + { + "epoch": 3.6188850967007964, + "grad_norm": 1.1999444961547852, + "learning_rate": 0.00027713310580204783, + "loss": 2.0523, + "step": 3181 + }, + { + "epoch": 3.620022753128555, + "grad_norm": 0.903556227684021, + "learning_rate": 0.000276905574516496, + "loss": 1.3633, + "step": 3182 + }, + { + "epoch": 3.621160409556314, + "grad_norm": 0.7182222008705139, + "learning_rate": 0.00027667804323094425, + "loss": 1.5335, + "step": 3183 + }, + { + "epoch": 3.6222980659840727, + "grad_norm": 0.9772674441337585, + "learning_rate": 0.0002764505119453925, + "loss": 2.0408, + "step": 3184 + }, + { + "epoch": 3.6234357224118314, + "grad_norm": 0.6779370903968811, + "learning_rate": 0.0002762229806598407, + "loss": 0.963, + "step": 3185 + }, + { + "epoch": 3.6245733788395906, + "grad_norm": 1.3481802940368652, + "learning_rate": 0.00027599544937428894, + "loss": 1.9255, + "step": 3186 + }, + { + "epoch": 3.6257110352673494, + "grad_norm": 0.7263208031654358, + "learning_rate": 0.0002757679180887372, + "loss": 1.388, + "step": 3187 + }, + { + "epoch": 3.626848691695108, + "grad_norm": 0.8536869287490845, + "learning_rate": 0.00027554038680318546, + "loss": 1.5152, + "step": 3188 + }, + { + "epoch": 3.627986348122867, + "grad_norm": 0.7139433026313782, + "learning_rate": 0.0002753128555176337, + "loss": 1.4358, + "step": 3189 + }, + { + "epoch": 3.6291240045506257, + "grad_norm": 1.031976580619812, + "learning_rate": 0.00027508532423208194, + "loss": 2.6185, + "step": 3190 + }, + { + "epoch": 3.6302616609783844, + "grad_norm": 0.670224130153656, + "learning_rate": 0.00027485779294653015, + "loss": 1.2865, + "step": 3191 + }, + { + "epoch": 3.631399317406143, + "grad_norm": 0.8890179991722107, + "learning_rate": 0.00027463026166097836, + "loss": 1.5146, + "step": 3192 + }, + { + "epoch": 3.6325369738339024, + "grad_norm": 1.02622652053833, + "learning_rate": 0.0002744027303754266, + "loss": 1.8741, + "step": 3193 + }, + { + "epoch": 3.6336746302616607, + "grad_norm": 0.7721883654594421, + "learning_rate": 0.0002741751990898749, + "loss": 1.5905, + "step": 3194 + }, + { + "epoch": 3.63481228668942, + "grad_norm": 1.2086652517318726, + "learning_rate": 0.0002739476678043231, + "loss": 2.4846, + "step": 3195 + }, + { + "epoch": 3.6359499431171787, + "grad_norm": 1.0432493686676025, + "learning_rate": 0.0002737201365187713, + "loss": 1.6268, + "step": 3196 + }, + { + "epoch": 3.6370875995449374, + "grad_norm": 0.746393084526062, + "learning_rate": 0.00027349260523321957, + "loss": 1.5132, + "step": 3197 + }, + { + "epoch": 3.638225255972696, + "grad_norm": 0.9152166843414307, + "learning_rate": 0.00027326507394766783, + "loss": 2.676, + "step": 3198 + }, + { + "epoch": 3.639362912400455, + "grad_norm": 0.8825878500938416, + "learning_rate": 0.00027303754266211604, + "loss": 1.5822, + "step": 3199 + }, + { + "epoch": 3.640500568828214, + "grad_norm": 1.0070838928222656, + "learning_rate": 0.0002728100113765643, + "loss": 2.4247, + "step": 3200 + }, + { + "epoch": 3.6416382252559725, + "grad_norm": 0.7467294931411743, + "learning_rate": 0.0002725824800910125, + "loss": 1.0876, + "step": 3201 + }, + { + "epoch": 3.6427758816837317, + "grad_norm": 0.6660275459289551, + "learning_rate": 0.00027235494880546073, + "loss": 1.7321, + "step": 3202 + }, + { + "epoch": 3.6439135381114904, + "grad_norm": 0.9477649927139282, + "learning_rate": 0.000272127417519909, + "loss": 2.3247, + "step": 3203 + }, + { + "epoch": 3.645051194539249, + "grad_norm": 0.8924974203109741, + "learning_rate": 0.00027189988623435726, + "loss": 2.3003, + "step": 3204 + }, + { + "epoch": 3.646188850967008, + "grad_norm": 1.107689619064331, + "learning_rate": 0.00027167235494880547, + "loss": 2.079, + "step": 3205 + }, + { + "epoch": 3.6473265073947667, + "grad_norm": 0.7451488971710205, + "learning_rate": 0.0002714448236632537, + "loss": 1.4819, + "step": 3206 + }, + { + "epoch": 3.6484641638225255, + "grad_norm": 0.881750226020813, + "learning_rate": 0.00027121729237770194, + "loss": 1.4403, + "step": 3207 + }, + { + "epoch": 3.6496018202502842, + "grad_norm": 1.023970365524292, + "learning_rate": 0.0002709897610921502, + "loss": 1.5968, + "step": 3208 + }, + { + "epoch": 3.6507394766780434, + "grad_norm": 0.9700276255607605, + "learning_rate": 0.0002707622298065984, + "loss": 2.5977, + "step": 3209 + }, + { + "epoch": 3.651877133105802, + "grad_norm": 1.010783076286316, + "learning_rate": 0.0002705346985210467, + "loss": 2.2983, + "step": 3210 + }, + { + "epoch": 3.653014789533561, + "grad_norm": 1.0232635736465454, + "learning_rate": 0.0002703071672354949, + "loss": 1.461, + "step": 3211 + }, + { + "epoch": 3.6541524459613197, + "grad_norm": 0.6770385503768921, + "learning_rate": 0.0002700796359499431, + "loss": 0.8051, + "step": 3212 + }, + { + "epoch": 3.6552901023890785, + "grad_norm": 0.5668506026268005, + "learning_rate": 0.00026985210466439136, + "loss": 1.0955, + "step": 3213 + }, + { + "epoch": 3.6564277588168372, + "grad_norm": 1.4066870212554932, + "learning_rate": 0.0002696245733788396, + "loss": 3.6724, + "step": 3214 + }, + { + "epoch": 3.657565415244596, + "grad_norm": 1.153571605682373, + "learning_rate": 0.00026939704209328784, + "loss": 1.989, + "step": 3215 + }, + { + "epoch": 3.658703071672355, + "grad_norm": 0.728625476360321, + "learning_rate": 0.00026916951080773605, + "loss": 0.9875, + "step": 3216 + }, + { + "epoch": 3.6598407281001135, + "grad_norm": 0.9735997915267944, + "learning_rate": 0.0002689419795221843, + "loss": 2.0278, + "step": 3217 + }, + { + "epoch": 3.6609783845278727, + "grad_norm": 0.7140381932258606, + "learning_rate": 0.0002687144482366326, + "loss": 1.2533, + "step": 3218 + }, + { + "epoch": 3.6621160409556315, + "grad_norm": 0.6878055930137634, + "learning_rate": 0.0002684869169510808, + "loss": 1.1472, + "step": 3219 + }, + { + "epoch": 3.6632536973833902, + "grad_norm": 0.9914765954017639, + "learning_rate": 0.00026825938566552905, + "loss": 2.0081, + "step": 3220 + }, + { + "epoch": 3.664391353811149, + "grad_norm": 1.0459471940994263, + "learning_rate": 0.00026803185437997726, + "loss": 2.5561, + "step": 3221 + }, + { + "epoch": 3.6655290102389078, + "grad_norm": 0.8058758974075317, + "learning_rate": 0.00026780432309442547, + "loss": 1.2284, + "step": 3222 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 1.2225570678710938, + "learning_rate": 0.00026757679180887373, + "loss": 1.9648, + "step": 3223 + }, + { + "epoch": 3.6678043230944253, + "grad_norm": 1.3064903020858765, + "learning_rate": 0.000267349260523322, + "loss": 2.1073, + "step": 3224 + }, + { + "epoch": 3.6689419795221845, + "grad_norm": 0.7548709511756897, + "learning_rate": 0.00026712172923777015, + "loss": 1.5539, + "step": 3225 + }, + { + "epoch": 3.6700796359499432, + "grad_norm": 1.034284234046936, + "learning_rate": 0.0002668941979522184, + "loss": 2.125, + "step": 3226 + }, + { + "epoch": 3.671217292377702, + "grad_norm": 1.494303822517395, + "learning_rate": 0.0002666666666666667, + "loss": 2.775, + "step": 3227 + }, + { + "epoch": 3.6723549488054608, + "grad_norm": 1.1169606447219849, + "learning_rate": 0.00026643913538111494, + "loss": 1.798, + "step": 3228 + }, + { + "epoch": 3.6734926052332195, + "grad_norm": 0.8246114253997803, + "learning_rate": 0.00026621160409556315, + "loss": 2.3232, + "step": 3229 + }, + { + "epoch": 3.6746302616609783, + "grad_norm": 1.181199312210083, + "learning_rate": 0.00026598407281001136, + "loss": 2.0188, + "step": 3230 + }, + { + "epoch": 3.675767918088737, + "grad_norm": 0.7052603960037231, + "learning_rate": 0.00026575654152445963, + "loss": 0.8771, + "step": 3231 + }, + { + "epoch": 3.6769055745164962, + "grad_norm": 0.5976938605308533, + "learning_rate": 0.00026552901023890784, + "loss": 0.9885, + "step": 3232 + }, + { + "epoch": 3.6780432309442546, + "grad_norm": 0.6874876022338867, + "learning_rate": 0.0002653014789533561, + "loss": 1.0909, + "step": 3233 + }, + { + "epoch": 3.6791808873720138, + "grad_norm": 0.9620742797851562, + "learning_rate": 0.00026507394766780437, + "loss": 2.0523, + "step": 3234 + }, + { + "epoch": 3.6803185437997725, + "grad_norm": 1.0894056558609009, + "learning_rate": 0.0002648464163822525, + "loss": 2.1868, + "step": 3235 + }, + { + "epoch": 3.6814562002275313, + "grad_norm": 0.8352320194244385, + "learning_rate": 0.0002646188850967008, + "loss": 1.5531, + "step": 3236 + }, + { + "epoch": 3.68259385665529, + "grad_norm": 0.7866209149360657, + "learning_rate": 0.00026439135381114905, + "loss": 1.2869, + "step": 3237 + }, + { + "epoch": 3.683731513083049, + "grad_norm": 0.8318602442741394, + "learning_rate": 0.00026416382252559726, + "loss": 1.4013, + "step": 3238 + }, + { + "epoch": 3.684869169510808, + "grad_norm": 0.6473860144615173, + "learning_rate": 0.0002639362912400455, + "loss": 1.0885, + "step": 3239 + }, + { + "epoch": 3.6860068259385663, + "grad_norm": 0.8395644426345825, + "learning_rate": 0.00026370875995449373, + "loss": 1.8404, + "step": 3240 + }, + { + "epoch": 3.6871444823663255, + "grad_norm": 1.3969433307647705, + "learning_rate": 0.000263481228668942, + "loss": 2.3692, + "step": 3241 + }, + { + "epoch": 3.6882821387940843, + "grad_norm": 0.7492949366569519, + "learning_rate": 0.0002632536973833902, + "loss": 1.4541, + "step": 3242 + }, + { + "epoch": 3.689419795221843, + "grad_norm": 0.7749677300453186, + "learning_rate": 0.00026302616609783847, + "loss": 1.3882, + "step": 3243 + }, + { + "epoch": 3.690557451649602, + "grad_norm": 0.507800817489624, + "learning_rate": 0.00026279863481228674, + "loss": 0.6856, + "step": 3244 + }, + { + "epoch": 3.6916951080773606, + "grad_norm": 0.7074844837188721, + "learning_rate": 0.0002625711035267349, + "loss": 1.4151, + "step": 3245 + }, + { + "epoch": 3.6928327645051193, + "grad_norm": 1.3755199909210205, + "learning_rate": 0.00026234357224118316, + "loss": 2.833, + "step": 3246 + }, + { + "epoch": 3.693970420932878, + "grad_norm": 0.7762560248374939, + "learning_rate": 0.0002621160409556314, + "loss": 1.0901, + "step": 3247 + }, + { + "epoch": 3.6951080773606373, + "grad_norm": 1.1140830516815186, + "learning_rate": 0.00026188850967007963, + "loss": 3.3667, + "step": 3248 + }, + { + "epoch": 3.696245733788396, + "grad_norm": 1.8301548957824707, + "learning_rate": 0.0002616609783845279, + "loss": 2.7103, + "step": 3249 + }, + { + "epoch": 3.697383390216155, + "grad_norm": 0.9659634828567505, + "learning_rate": 0.0002614334470989761, + "loss": 2.4665, + "step": 3250 + }, + { + "epoch": 3.6985210466439136, + "grad_norm": 1.0916699171066284, + "learning_rate": 0.00026120591581342437, + "loss": 1.5426, + "step": 3251 + }, + { + "epoch": 3.6996587030716723, + "grad_norm": 1.2825108766555786, + "learning_rate": 0.0002609783845278726, + "loss": 1.7881, + "step": 3252 + }, + { + "epoch": 3.700796359499431, + "grad_norm": 1.1956276893615723, + "learning_rate": 0.00026075085324232084, + "loss": 1.8214, + "step": 3253 + }, + { + "epoch": 3.70193401592719, + "grad_norm": 1.0599949359893799, + "learning_rate": 0.0002605233219567691, + "loss": 2.1982, + "step": 3254 + }, + { + "epoch": 3.703071672354949, + "grad_norm": 0.9849620461463928, + "learning_rate": 0.00026029579067121726, + "loss": 1.133, + "step": 3255 + }, + { + "epoch": 3.7042093287827074, + "grad_norm": 1.0015318393707275, + "learning_rate": 0.0002600682593856655, + "loss": 1.0853, + "step": 3256 + }, + { + "epoch": 3.7053469852104666, + "grad_norm": 1.1830706596374512, + "learning_rate": 0.0002598407281001138, + "loss": 2.3646, + "step": 3257 + }, + { + "epoch": 3.7064846416382253, + "grad_norm": 0.695976734161377, + "learning_rate": 0.000259613196814562, + "loss": 1.4282, + "step": 3258 + }, + { + "epoch": 3.707622298065984, + "grad_norm": 0.9679135680198669, + "learning_rate": 0.0002593856655290102, + "loss": 1.3835, + "step": 3259 + }, + { + "epoch": 3.708759954493743, + "grad_norm": 1.6105592250823975, + "learning_rate": 0.0002591581342434585, + "loss": 1.4964, + "step": 3260 + }, + { + "epoch": 3.7098976109215016, + "grad_norm": 1.0834482908248901, + "learning_rate": 0.00025893060295790674, + "loss": 1.6271, + "step": 3261 + }, + { + "epoch": 3.7110352673492604, + "grad_norm": 1.945980429649353, + "learning_rate": 0.00025870307167235495, + "loss": 1.3425, + "step": 3262 + }, + { + "epoch": 3.712172923777019, + "grad_norm": 0.9838660359382629, + "learning_rate": 0.0002584755403868032, + "loss": 1.7534, + "step": 3263 + }, + { + "epoch": 3.7133105802047783, + "grad_norm": 0.8386029005050659, + "learning_rate": 0.0002582480091012514, + "loss": 1.7482, + "step": 3264 + }, + { + "epoch": 3.714448236632537, + "grad_norm": 0.7002511620521545, + "learning_rate": 0.00025802047781569963, + "loss": 1.1395, + "step": 3265 + }, + { + "epoch": 3.715585893060296, + "grad_norm": 0.9224847555160522, + "learning_rate": 0.0002577929465301479, + "loss": 1.4517, + "step": 3266 + }, + { + "epoch": 3.7167235494880546, + "grad_norm": 1.0433372259140015, + "learning_rate": 0.00025756541524459616, + "loss": 2.0467, + "step": 3267 + }, + { + "epoch": 3.7178612059158134, + "grad_norm": 1.5209763050079346, + "learning_rate": 0.00025733788395904437, + "loss": 2.7371, + "step": 3268 + }, + { + "epoch": 3.718998862343572, + "grad_norm": 0.7279452085494995, + "learning_rate": 0.0002571103526734926, + "loss": 1.0362, + "step": 3269 + }, + { + "epoch": 3.720136518771331, + "grad_norm": 0.7574169635772705, + "learning_rate": 0.00025688282138794084, + "loss": 1.2043, + "step": 3270 + }, + { + "epoch": 3.72127417519909, + "grad_norm": 1.3098034858703613, + "learning_rate": 0.0002566552901023891, + "loss": 2.5448, + "step": 3271 + }, + { + "epoch": 3.722411831626849, + "grad_norm": 0.8687241673469543, + "learning_rate": 0.0002564277588168373, + "loss": 1.4687, + "step": 3272 + }, + { + "epoch": 3.7235494880546076, + "grad_norm": 0.9659656882286072, + "learning_rate": 0.0002562002275312856, + "loss": 2.4018, + "step": 3273 + }, + { + "epoch": 3.7246871444823664, + "grad_norm": 1.438889503479004, + "learning_rate": 0.0002559726962457338, + "loss": 3.2661, + "step": 3274 + }, + { + "epoch": 3.725824800910125, + "grad_norm": 1.4293714761734009, + "learning_rate": 0.000255745164960182, + "loss": 3.2473, + "step": 3275 + }, + { + "epoch": 3.726962457337884, + "grad_norm": 1.1246088743209839, + "learning_rate": 0.00025551763367463026, + "loss": 2.274, + "step": 3276 + }, + { + "epoch": 3.7281001137656427, + "grad_norm": 0.6158525943756104, + "learning_rate": 0.00025529010238907853, + "loss": 1.1515, + "step": 3277 + }, + { + "epoch": 3.729237770193402, + "grad_norm": 1.0718598365783691, + "learning_rate": 0.00025506257110352674, + "loss": 1.6976, + "step": 3278 + }, + { + "epoch": 3.73037542662116, + "grad_norm": 0.9811798334121704, + "learning_rate": 0.00025483503981797495, + "loss": 1.6137, + "step": 3279 + }, + { + "epoch": 3.7315130830489194, + "grad_norm": 1.0085113048553467, + "learning_rate": 0.0002546075085324232, + "loss": 2.4258, + "step": 3280 + }, + { + "epoch": 3.732650739476678, + "grad_norm": 1.0862224102020264, + "learning_rate": 0.0002543799772468715, + "loss": 1.6301, + "step": 3281 + }, + { + "epoch": 3.733788395904437, + "grad_norm": 1.057241678237915, + "learning_rate": 0.0002541524459613197, + "loss": 2.4249, + "step": 3282 + }, + { + "epoch": 3.7349260523321957, + "grad_norm": 0.9607272148132324, + "learning_rate": 0.00025392491467576795, + "loss": 2.4097, + "step": 3283 + }, + { + "epoch": 3.7360637087599544, + "grad_norm": 0.769008994102478, + "learning_rate": 0.00025369738339021616, + "loss": 2.0803, + "step": 3284 + }, + { + "epoch": 3.737201365187713, + "grad_norm": 1.317842721939087, + "learning_rate": 0.00025346985210466437, + "loss": 3.1254, + "step": 3285 + }, + { + "epoch": 3.738339021615472, + "grad_norm": 0.8753394484519958, + "learning_rate": 0.00025324232081911263, + "loss": 0.9374, + "step": 3286 + }, + { + "epoch": 3.739476678043231, + "grad_norm": 0.7274285554885864, + "learning_rate": 0.0002530147895335609, + "loss": 1.4164, + "step": 3287 + }, + { + "epoch": 3.74061433447099, + "grad_norm": 0.7005650997161865, + "learning_rate": 0.0002527872582480091, + "loss": 1.3413, + "step": 3288 + }, + { + "epoch": 3.7417519908987487, + "grad_norm": 1.0076242685317993, + "learning_rate": 0.0002525597269624573, + "loss": 2.4045, + "step": 3289 + }, + { + "epoch": 3.7428896473265074, + "grad_norm": 1.4078526496887207, + "learning_rate": 0.0002523321956769056, + "loss": 2.6869, + "step": 3290 + }, + { + "epoch": 3.744027303754266, + "grad_norm": 1.1299002170562744, + "learning_rate": 0.00025210466439135385, + "loss": 2.3044, + "step": 3291 + }, + { + "epoch": 3.745164960182025, + "grad_norm": 0.8027879595756531, + "learning_rate": 0.00025187713310580206, + "loss": 1.8643, + "step": 3292 + }, + { + "epoch": 3.7463026166097837, + "grad_norm": 1.083298921585083, + "learning_rate": 0.00025164960182025027, + "loss": 1.4689, + "step": 3293 + }, + { + "epoch": 3.747440273037543, + "grad_norm": 0.9221919178962708, + "learning_rate": 0.00025142207053469853, + "loss": 1.6231, + "step": 3294 + }, + { + "epoch": 3.748577929465301, + "grad_norm": 0.8968214392662048, + "learning_rate": 0.00025119453924914674, + "loss": 2.3925, + "step": 3295 + }, + { + "epoch": 3.7497155858930604, + "grad_norm": 0.8066171407699585, + "learning_rate": 0.000250967007963595, + "loss": 1.804, + "step": 3296 + }, + { + "epoch": 3.750853242320819, + "grad_norm": 1.2196367979049683, + "learning_rate": 0.00025073947667804327, + "loss": 3.5051, + "step": 3297 + }, + { + "epoch": 3.751990898748578, + "grad_norm": 1.0004627704620361, + "learning_rate": 0.0002505119453924914, + "loss": 1.7358, + "step": 3298 + }, + { + "epoch": 3.7531285551763367, + "grad_norm": 1.1003127098083496, + "learning_rate": 0.0002502844141069397, + "loss": 1.6266, + "step": 3299 + }, + { + "epoch": 3.7542662116040955, + "grad_norm": 0.8804592490196228, + "learning_rate": 0.00025005688282138795, + "loss": 1.6458, + "step": 3300 + }, + { + "epoch": 3.755403868031854, + "grad_norm": 0.6198902130126953, + "learning_rate": 0.0002498293515358362, + "loss": 1.0018, + "step": 3301 + }, + { + "epoch": 3.756541524459613, + "grad_norm": 0.9782140851020813, + "learning_rate": 0.0002496018202502844, + "loss": 2.0916, + "step": 3302 + }, + { + "epoch": 3.757679180887372, + "grad_norm": 1.2002933025360107, + "learning_rate": 0.00024937428896473264, + "loss": 1.7946, + "step": 3303 + }, + { + "epoch": 3.758816837315131, + "grad_norm": 0.9678136110305786, + "learning_rate": 0.0002491467576791809, + "loss": 1.476, + "step": 3304 + }, + { + "epoch": 3.7599544937428897, + "grad_norm": 0.7648789286613464, + "learning_rate": 0.0002489192263936291, + "loss": 1.3207, + "step": 3305 + }, + { + "epoch": 3.7610921501706485, + "grad_norm": 1.2675551176071167, + "learning_rate": 0.0002486916951080774, + "loss": 2.6937, + "step": 3306 + }, + { + "epoch": 3.7622298065984072, + "grad_norm": 0.7276213765144348, + "learning_rate": 0.0002484641638225256, + "loss": 0.9917, + "step": 3307 + }, + { + "epoch": 3.763367463026166, + "grad_norm": 1.3405811786651611, + "learning_rate": 0.00024823663253697385, + "loss": 1.6142, + "step": 3308 + }, + { + "epoch": 3.7645051194539247, + "grad_norm": 0.7004644274711609, + "learning_rate": 0.00024800910125142206, + "loss": 0.9751, + "step": 3309 + }, + { + "epoch": 3.765642775881684, + "grad_norm": 0.8266583681106567, + "learning_rate": 0.0002477815699658703, + "loss": 1.7234, + "step": 3310 + }, + { + "epoch": 3.7667804323094427, + "grad_norm": 0.827187180519104, + "learning_rate": 0.0002475540386803186, + "loss": 1.5155, + "step": 3311 + }, + { + "epoch": 3.7679180887372015, + "grad_norm": 0.8548292517662048, + "learning_rate": 0.0002473265073947668, + "loss": 1.7327, + "step": 3312 + }, + { + "epoch": 3.7690557451649602, + "grad_norm": 0.884003221988678, + "learning_rate": 0.000247098976109215, + "loss": 1.4591, + "step": 3313 + }, + { + "epoch": 3.770193401592719, + "grad_norm": 0.7317343950271606, + "learning_rate": 0.00024687144482366327, + "loss": 1.0964, + "step": 3314 + }, + { + "epoch": 3.7713310580204777, + "grad_norm": 0.8710649013519287, + "learning_rate": 0.0002466439135381115, + "loss": 1.5287, + "step": 3315 + }, + { + "epoch": 3.7724687144482365, + "grad_norm": 1.5312626361846924, + "learning_rate": 0.00024641638225255974, + "loss": 3.5776, + "step": 3316 + }, + { + "epoch": 3.7736063708759957, + "grad_norm": 0.6833708882331848, + "learning_rate": 0.00024618885096700795, + "loss": 1.2004, + "step": 3317 + }, + { + "epoch": 3.774744027303754, + "grad_norm": 1.1912914514541626, + "learning_rate": 0.0002459613196814562, + "loss": 2.4428, + "step": 3318 + }, + { + "epoch": 3.7758816837315132, + "grad_norm": 1.0153708457946777, + "learning_rate": 0.00024573378839590443, + "loss": 1.9598, + "step": 3319 + }, + { + "epoch": 3.777019340159272, + "grad_norm": 1.0456105470657349, + "learning_rate": 0.0002455062571103527, + "loss": 1.7178, + "step": 3320 + }, + { + "epoch": 3.7781569965870307, + "grad_norm": 0.9774885177612305, + "learning_rate": 0.0002452787258248009, + "loss": 1.9783, + "step": 3321 + }, + { + "epoch": 3.7792946530147895, + "grad_norm": 1.089418649673462, + "learning_rate": 0.00024505119453924917, + "loss": 2.1403, + "step": 3322 + }, + { + "epoch": 3.7804323094425483, + "grad_norm": 1.1333575248718262, + "learning_rate": 0.0002448236632536974, + "loss": 1.3416, + "step": 3323 + }, + { + "epoch": 3.781569965870307, + "grad_norm": 0.8414488434791565, + "learning_rate": 0.00024459613196814564, + "loss": 1.5206, + "step": 3324 + }, + { + "epoch": 3.782707622298066, + "grad_norm": 1.0938860177993774, + "learning_rate": 0.00024436860068259385, + "loss": 2.2782, + "step": 3325 + }, + { + "epoch": 3.783845278725825, + "grad_norm": 1.1637766361236572, + "learning_rate": 0.00024414106939704206, + "loss": 1.9136, + "step": 3326 + }, + { + "epoch": 3.7849829351535837, + "grad_norm": 0.893345057964325, + "learning_rate": 0.00024391353811149032, + "loss": 1.1827, + "step": 3327 + }, + { + "epoch": 3.7861205915813425, + "grad_norm": 0.8583236336708069, + "learning_rate": 0.0002436860068259386, + "loss": 1.831, + "step": 3328 + }, + { + "epoch": 3.7872582480091013, + "grad_norm": 1.1517513990402222, + "learning_rate": 0.0002434584755403868, + "loss": 1.955, + "step": 3329 + }, + { + "epoch": 3.78839590443686, + "grad_norm": 0.8208685517311096, + "learning_rate": 0.00024323094425483506, + "loss": 1.3271, + "step": 3330 + }, + { + "epoch": 3.789533560864619, + "grad_norm": 0.6080604195594788, + "learning_rate": 0.00024300341296928327, + "loss": 1.1819, + "step": 3331 + }, + { + "epoch": 3.7906712172923775, + "grad_norm": 0.7177555561065674, + "learning_rate": 0.0002427758816837315, + "loss": 1.8782, + "step": 3332 + }, + { + "epoch": 3.7918088737201368, + "grad_norm": 0.7688407301902771, + "learning_rate": 0.00024254835039817977, + "loss": 2.0169, + "step": 3333 + }, + { + "epoch": 3.792946530147895, + "grad_norm": 1.3213694095611572, + "learning_rate": 0.00024232081911262798, + "loss": 1.6509, + "step": 3334 + }, + { + "epoch": 3.7940841865756543, + "grad_norm": 1.1717759370803833, + "learning_rate": 0.00024209328782707625, + "loss": 2.5551, + "step": 3335 + }, + { + "epoch": 3.795221843003413, + "grad_norm": 1.1276628971099854, + "learning_rate": 0.00024186575654152446, + "loss": 1.9977, + "step": 3336 + }, + { + "epoch": 3.796359499431172, + "grad_norm": 0.9729685187339783, + "learning_rate": 0.0002416382252559727, + "loss": 2.0503, + "step": 3337 + }, + { + "epoch": 3.7974971558589306, + "grad_norm": 1.1408803462982178, + "learning_rate": 0.00024141069397042093, + "loss": 1.7418, + "step": 3338 + }, + { + "epoch": 3.7986348122866893, + "grad_norm": 0.7034707069396973, + "learning_rate": 0.00024118316268486917, + "loss": 1.622, + "step": 3339 + }, + { + "epoch": 3.799772468714448, + "grad_norm": 1.2321475744247437, + "learning_rate": 0.00024095563139931743, + "loss": 2.719, + "step": 3340 + }, + { + "epoch": 3.800910125142207, + "grad_norm": 0.9584794640541077, + "learning_rate": 0.00024072810011376564, + "loss": 1.7564, + "step": 3341 + }, + { + "epoch": 3.802047781569966, + "grad_norm": 0.508548378944397, + "learning_rate": 0.00024050056882821388, + "loss": 0.6892, + "step": 3342 + }, + { + "epoch": 3.803185437997725, + "grad_norm": 1.1406333446502686, + "learning_rate": 0.00024027303754266212, + "loss": 2.4856, + "step": 3343 + }, + { + "epoch": 3.8043230944254836, + "grad_norm": 1.3431506156921387, + "learning_rate": 0.00024004550625711035, + "loss": 2.4605, + "step": 3344 + }, + { + "epoch": 3.8054607508532423, + "grad_norm": 1.2452622652053833, + "learning_rate": 0.00023981797497155862, + "loss": 2.2794, + "step": 3345 + }, + { + "epoch": 3.806598407281001, + "grad_norm": 0.8428059816360474, + "learning_rate": 0.00023959044368600683, + "loss": 1.701, + "step": 3346 + }, + { + "epoch": 3.80773606370876, + "grad_norm": 1.1248981952667236, + "learning_rate": 0.00023936291240045506, + "loss": 2.0512, + "step": 3347 + }, + { + "epoch": 3.8088737201365186, + "grad_norm": 0.9891200661659241, + "learning_rate": 0.0002391353811149033, + "loss": 2.4231, + "step": 3348 + }, + { + "epoch": 3.810011376564278, + "grad_norm": 1.2691020965576172, + "learning_rate": 0.00023890784982935154, + "loss": 1.9103, + "step": 3349 + }, + { + "epoch": 3.8111490329920366, + "grad_norm": 1.2526246309280396, + "learning_rate": 0.0002386803185437998, + "loss": 2.4644, + "step": 3350 + }, + { + "epoch": 3.8122866894197953, + "grad_norm": 1.0178083181381226, + "learning_rate": 0.000238452787258248, + "loss": 1.6648, + "step": 3351 + }, + { + "epoch": 3.813424345847554, + "grad_norm": 0.7109336256980896, + "learning_rate": 0.00023822525597269625, + "loss": 1.1645, + "step": 3352 + }, + { + "epoch": 3.814562002275313, + "grad_norm": 1.2275941371917725, + "learning_rate": 0.00023799772468714449, + "loss": 1.998, + "step": 3353 + }, + { + "epoch": 3.8156996587030716, + "grad_norm": 1.1337257623672485, + "learning_rate": 0.00023777019340159272, + "loss": 3.3256, + "step": 3354 + }, + { + "epoch": 3.8168373151308304, + "grad_norm": 1.2316417694091797, + "learning_rate": 0.00023754266211604096, + "loss": 2.1339, + "step": 3355 + }, + { + "epoch": 3.8179749715585896, + "grad_norm": 2.0374646186828613, + "learning_rate": 0.0002373151308304892, + "loss": 3.4686, + "step": 3356 + }, + { + "epoch": 3.819112627986348, + "grad_norm": 1.059798002243042, + "learning_rate": 0.00023708759954493743, + "loss": 1.6049, + "step": 3357 + }, + { + "epoch": 3.820250284414107, + "grad_norm": 0.8509323596954346, + "learning_rate": 0.00023686006825938567, + "loss": 1.2672, + "step": 3358 + }, + { + "epoch": 3.821387940841866, + "grad_norm": 0.603013277053833, + "learning_rate": 0.0002366325369738339, + "loss": 1.0302, + "step": 3359 + }, + { + "epoch": 3.8225255972696246, + "grad_norm": 1.5710378885269165, + "learning_rate": 0.00023640500568828215, + "loss": 2.2583, + "step": 3360 + }, + { + "epoch": 3.8236632536973834, + "grad_norm": 1.1296995878219604, + "learning_rate": 0.00023617747440273038, + "loss": 2.1769, + "step": 3361 + }, + { + "epoch": 3.824800910125142, + "grad_norm": 1.1707825660705566, + "learning_rate": 0.00023594994311717862, + "loss": 2.6797, + "step": 3362 + }, + { + "epoch": 3.825938566552901, + "grad_norm": 1.0534515380859375, + "learning_rate": 0.00023572241183162686, + "loss": 0.8871, + "step": 3363 + }, + { + "epoch": 3.8270762229806596, + "grad_norm": 0.9066932797431946, + "learning_rate": 0.0002354948805460751, + "loss": 1.6269, + "step": 3364 + }, + { + "epoch": 3.828213879408419, + "grad_norm": 0.8703908324241638, + "learning_rate": 0.00023526734926052333, + "loss": 1.5106, + "step": 3365 + }, + { + "epoch": 3.8293515358361776, + "grad_norm": 0.8020843267440796, + "learning_rate": 0.00023503981797497157, + "loss": 2.0528, + "step": 3366 + }, + { + "epoch": 3.8304891922639364, + "grad_norm": 0.9272792935371399, + "learning_rate": 0.0002348122866894198, + "loss": 1.6559, + "step": 3367 + }, + { + "epoch": 3.831626848691695, + "grad_norm": 0.5369601845741272, + "learning_rate": 0.00023458475540386804, + "loss": 1.0992, + "step": 3368 + }, + { + "epoch": 3.832764505119454, + "grad_norm": 0.6304998397827148, + "learning_rate": 0.00023435722411831628, + "loss": 0.8554, + "step": 3369 + }, + { + "epoch": 3.8339021615472126, + "grad_norm": 0.7873075604438782, + "learning_rate": 0.0002341296928327645, + "loss": 2.2207, + "step": 3370 + }, + { + "epoch": 3.8350398179749714, + "grad_norm": 0.6331322193145752, + "learning_rate": 0.00023390216154721275, + "loss": 1.4673, + "step": 3371 + }, + { + "epoch": 3.8361774744027306, + "grad_norm": 0.5962525010108948, + "learning_rate": 0.00023367463026166096, + "loss": 1.5718, + "step": 3372 + }, + { + "epoch": 3.837315130830489, + "grad_norm": 1.0758075714111328, + "learning_rate": 0.00023344709897610923, + "loss": 2.0164, + "step": 3373 + }, + { + "epoch": 3.838452787258248, + "grad_norm": 0.6574288010597229, + "learning_rate": 0.00023321956769055746, + "loss": 1.0371, + "step": 3374 + }, + { + "epoch": 3.839590443686007, + "grad_norm": 0.984384298324585, + "learning_rate": 0.00023299203640500567, + "loss": 2.0911, + "step": 3375 + }, + { + "epoch": 3.8407281001137656, + "grad_norm": 1.0093640089035034, + "learning_rate": 0.00023276450511945394, + "loss": 1.3854, + "step": 3376 + }, + { + "epoch": 3.8418657565415244, + "grad_norm": 1.2866005897521973, + "learning_rate": 0.00023253697383390215, + "loss": 2.0335, + "step": 3377 + }, + { + "epoch": 3.843003412969283, + "grad_norm": 1.504915475845337, + "learning_rate": 0.0002323094425483504, + "loss": 2.1961, + "step": 3378 + }, + { + "epoch": 3.8441410693970424, + "grad_norm": 0.6888614296913147, + "learning_rate": 0.00023208191126279865, + "loss": 1.6042, + "step": 3379 + }, + { + "epoch": 3.8452787258248007, + "grad_norm": 0.6555305123329163, + "learning_rate": 0.00023185437997724686, + "loss": 0.8984, + "step": 3380 + }, + { + "epoch": 3.84641638225256, + "grad_norm": 0.7594724893569946, + "learning_rate": 0.00023162684869169512, + "loss": 1.2783, + "step": 3381 + }, + { + "epoch": 3.8475540386803186, + "grad_norm": 0.9284168481826782, + "learning_rate": 0.00023139931740614333, + "loss": 1.9291, + "step": 3382 + }, + { + "epoch": 3.8486916951080774, + "grad_norm": 0.5923244953155518, + "learning_rate": 0.0002311717861205916, + "loss": 0.7256, + "step": 3383 + }, + { + "epoch": 3.849829351535836, + "grad_norm": 0.9779441952705383, + "learning_rate": 0.00023094425483503983, + "loss": 1.9968, + "step": 3384 + }, + { + "epoch": 3.850967007963595, + "grad_norm": 1.1208325624465942, + "learning_rate": 0.00023071672354948804, + "loss": 2.5118, + "step": 3385 + }, + { + "epoch": 3.8521046643913537, + "grad_norm": 0.8016112446784973, + "learning_rate": 0.0002304891922639363, + "loss": 0.7837, + "step": 3386 + }, + { + "epoch": 3.8532423208191124, + "grad_norm": 0.9915149807929993, + "learning_rate": 0.00023026166097838452, + "loss": 2.0725, + "step": 3387 + }, + { + "epoch": 3.8543799772468716, + "grad_norm": 0.5172927379608154, + "learning_rate": 0.00023003412969283278, + "loss": 0.6931, + "step": 3388 + }, + { + "epoch": 3.8555176336746304, + "grad_norm": 0.6758817434310913, + "learning_rate": 0.000229806598407281, + "loss": 1.3358, + "step": 3389 + }, + { + "epoch": 3.856655290102389, + "grad_norm": 0.9508668780326843, + "learning_rate": 0.00022957906712172923, + "loss": 2.2465, + "step": 3390 + }, + { + "epoch": 3.857792946530148, + "grad_norm": 0.6855764389038086, + "learning_rate": 0.0002293515358361775, + "loss": 1.5222, + "step": 3391 + }, + { + "epoch": 3.8589306029579067, + "grad_norm": 0.8102338314056396, + "learning_rate": 0.0002291240045506257, + "loss": 1.9551, + "step": 3392 + }, + { + "epoch": 3.8600682593856654, + "grad_norm": 0.8812468647956848, + "learning_rate": 0.00022889647326507397, + "loss": 1.9685, + "step": 3393 + }, + { + "epoch": 3.861205915813424, + "grad_norm": 0.6355249881744385, + "learning_rate": 0.00022866894197952218, + "loss": 1.231, + "step": 3394 + }, + { + "epoch": 3.8623435722411834, + "grad_norm": 0.5649857521057129, + "learning_rate": 0.0002284414106939704, + "loss": 0.5657, + "step": 3395 + }, + { + "epoch": 3.8634812286689417, + "grad_norm": 0.7780776619911194, + "learning_rate": 0.00022821387940841868, + "loss": 0.8215, + "step": 3396 + }, + { + "epoch": 3.864618885096701, + "grad_norm": 0.7705486416816711, + "learning_rate": 0.0002279863481228669, + "loss": 1.7517, + "step": 3397 + }, + { + "epoch": 3.8657565415244597, + "grad_norm": 0.7325091361999512, + "learning_rate": 0.00022775881683731515, + "loss": 1.9095, + "step": 3398 + }, + { + "epoch": 3.8668941979522184, + "grad_norm": 0.6553176045417786, + "learning_rate": 0.00022753128555176336, + "loss": 1.3486, + "step": 3399 + }, + { + "epoch": 3.868031854379977, + "grad_norm": 1.1297094821929932, + "learning_rate": 0.0002273037542662116, + "loss": 1.3066, + "step": 3400 + }, + { + "epoch": 3.869169510807736, + "grad_norm": 0.6546614170074463, + "learning_rate": 0.00022707622298065986, + "loss": 1.5045, + "step": 3401 + }, + { + "epoch": 3.8703071672354947, + "grad_norm": 0.6554498076438904, + "learning_rate": 0.00022684869169510807, + "loss": 1.1808, + "step": 3402 + }, + { + "epoch": 3.8714448236632535, + "grad_norm": 0.9071396589279175, + "learning_rate": 0.00022662116040955634, + "loss": 1.5988, + "step": 3403 + }, + { + "epoch": 3.8725824800910127, + "grad_norm": 1.2640100717544556, + "learning_rate": 0.00022639362912400455, + "loss": 2.3752, + "step": 3404 + }, + { + "epoch": 3.8737201365187715, + "grad_norm": 1.221620798110962, + "learning_rate": 0.00022616609783845278, + "loss": 2.2681, + "step": 3405 + }, + { + "epoch": 3.87485779294653, + "grad_norm": 0.8574184775352478, + "learning_rate": 0.00022593856655290102, + "loss": 1.9581, + "step": 3406 + }, + { + "epoch": 3.875995449374289, + "grad_norm": 0.9591223001480103, + "learning_rate": 0.00022571103526734926, + "loss": 2.0804, + "step": 3407 + }, + { + "epoch": 3.8771331058020477, + "grad_norm": 0.7188199758529663, + "learning_rate": 0.00022548350398179752, + "loss": 1.6862, + "step": 3408 + }, + { + "epoch": 3.8782707622298065, + "grad_norm": 1.205315351486206, + "learning_rate": 0.00022525597269624573, + "loss": 2.6322, + "step": 3409 + }, + { + "epoch": 3.8794084186575652, + "grad_norm": 0.6590704321861267, + "learning_rate": 0.00022502844141069397, + "loss": 0.9434, + "step": 3410 + }, + { + "epoch": 3.8805460750853245, + "grad_norm": 1.0419220924377441, + "learning_rate": 0.0002248009101251422, + "loss": 1.7335, + "step": 3411 + }, + { + "epoch": 3.881683731513083, + "grad_norm": 0.9429000616073608, + "learning_rate": 0.00022457337883959044, + "loss": 1.6675, + "step": 3412 + }, + { + "epoch": 3.882821387940842, + "grad_norm": 0.8173443675041199, + "learning_rate": 0.0002243458475540387, + "loss": 1.5351, + "step": 3413 + }, + { + "epoch": 3.8839590443686007, + "grad_norm": 1.2141562700271606, + "learning_rate": 0.00022411831626848692, + "loss": 2.0883, + "step": 3414 + }, + { + "epoch": 3.8850967007963595, + "grad_norm": 0.9572274088859558, + "learning_rate": 0.00022389078498293515, + "loss": 2.0156, + "step": 3415 + }, + { + "epoch": 3.8862343572241183, + "grad_norm": 0.7675147652626038, + "learning_rate": 0.0002236632536973834, + "loss": 1.4844, + "step": 3416 + }, + { + "epoch": 3.887372013651877, + "grad_norm": 0.8228021264076233, + "learning_rate": 0.00022343572241183163, + "loss": 2.0502, + "step": 3417 + }, + { + "epoch": 3.888509670079636, + "grad_norm": 1.1497551202774048, + "learning_rate": 0.0002232081911262799, + "loss": 1.1353, + "step": 3418 + }, + { + "epoch": 3.8896473265073945, + "grad_norm": 0.7159321308135986, + "learning_rate": 0.0002229806598407281, + "loss": 1.627, + "step": 3419 + }, + { + "epoch": 3.8907849829351537, + "grad_norm": 1.681058645248413, + "learning_rate": 0.00022275312855517634, + "loss": 3.7341, + "step": 3420 + }, + { + "epoch": 3.8919226393629125, + "grad_norm": 0.4869871437549591, + "learning_rate": 0.00022252559726962457, + "loss": 0.7067, + "step": 3421 + }, + { + "epoch": 3.8930602957906713, + "grad_norm": 0.6947728395462036, + "learning_rate": 0.0002222980659840728, + "loss": 1.0103, + "step": 3422 + }, + { + "epoch": 3.89419795221843, + "grad_norm": 0.7470018267631531, + "learning_rate": 0.00022207053469852108, + "loss": 0.5964, + "step": 3423 + }, + { + "epoch": 3.8953356086461888, + "grad_norm": 1.2415233850479126, + "learning_rate": 0.00022184300341296929, + "loss": 2.5444, + "step": 3424 + }, + { + "epoch": 3.8964732650739475, + "grad_norm": 0.9338128566741943, + "learning_rate": 0.00022161547212741752, + "loss": 1.5243, + "step": 3425 + }, + { + "epoch": 3.8976109215017063, + "grad_norm": 0.6499525904655457, + "learning_rate": 0.00022138794084186576, + "loss": 1.009, + "step": 3426 + }, + { + "epoch": 3.8987485779294655, + "grad_norm": 1.1649621725082397, + "learning_rate": 0.000221160409556314, + "loss": 2.3154, + "step": 3427 + }, + { + "epoch": 3.8998862343572243, + "grad_norm": 1.090055227279663, + "learning_rate": 0.00022093287827076223, + "loss": 2.3174, + "step": 3428 + }, + { + "epoch": 3.901023890784983, + "grad_norm": 0.9271589517593384, + "learning_rate": 0.00022070534698521047, + "loss": 1.8594, + "step": 3429 + }, + { + "epoch": 3.9021615472127418, + "grad_norm": 0.8594130277633667, + "learning_rate": 0.0002204778156996587, + "loss": 1.9265, + "step": 3430 + }, + { + "epoch": 3.9032992036405005, + "grad_norm": 0.860653817653656, + "learning_rate": 0.00022025028441410694, + "loss": 1.9662, + "step": 3431 + }, + { + "epoch": 3.9044368600682593, + "grad_norm": 0.8960944414138794, + "learning_rate": 0.00022002275312855518, + "loss": 1.6532, + "step": 3432 + }, + { + "epoch": 3.905574516496018, + "grad_norm": 0.753968358039856, + "learning_rate": 0.00021979522184300342, + "loss": 0.8833, + "step": 3433 + }, + { + "epoch": 3.9067121729237773, + "grad_norm": 0.7025309205055237, + "learning_rate": 0.00021956769055745166, + "loss": 0.9119, + "step": 3434 + }, + { + "epoch": 3.9078498293515356, + "grad_norm": 0.9567241668701172, + "learning_rate": 0.0002193401592718999, + "loss": 1.3435, + "step": 3435 + }, + { + "epoch": 3.908987485779295, + "grad_norm": 0.8972539901733398, + "learning_rate": 0.00021911262798634813, + "loss": 1.5827, + "step": 3436 + }, + { + "epoch": 3.9101251422070535, + "grad_norm": 0.8140578269958496, + "learning_rate": 0.00021888509670079637, + "loss": 1.5984, + "step": 3437 + }, + { + "epoch": 3.9112627986348123, + "grad_norm": 1.117913007736206, + "learning_rate": 0.0002186575654152446, + "loss": 2.3435, + "step": 3438 + }, + { + "epoch": 3.912400455062571, + "grad_norm": 1.821190357208252, + "learning_rate": 0.00021843003412969284, + "loss": 3.6124, + "step": 3439 + }, + { + "epoch": 3.91353811149033, + "grad_norm": 1.1013044118881226, + "learning_rate": 0.00021820250284414108, + "loss": 2.9663, + "step": 3440 + }, + { + "epoch": 3.9146757679180886, + "grad_norm": 0.9850253462791443, + "learning_rate": 0.00021797497155858931, + "loss": 1.6718, + "step": 3441 + }, + { + "epoch": 3.9158134243458473, + "grad_norm": 0.7211486101150513, + "learning_rate": 0.00021774744027303755, + "loss": 1.7512, + "step": 3442 + }, + { + "epoch": 3.9169510807736065, + "grad_norm": 1.0833756923675537, + "learning_rate": 0.0002175199089874858, + "loss": 2.52, + "step": 3443 + }, + { + "epoch": 3.9180887372013653, + "grad_norm": 1.1811580657958984, + "learning_rate": 0.00021729237770193403, + "loss": 2.2193, + "step": 3444 + }, + { + "epoch": 3.919226393629124, + "grad_norm": 1.0001695156097412, + "learning_rate": 0.00021706484641638224, + "loss": 1.7069, + "step": 3445 + }, + { + "epoch": 3.920364050056883, + "grad_norm": 0.64976966381073, + "learning_rate": 0.0002168373151308305, + "loss": 1.7047, + "step": 3446 + }, + { + "epoch": 3.9215017064846416, + "grad_norm": 0.8505579233169556, + "learning_rate": 0.00021660978384527874, + "loss": 1.1988, + "step": 3447 + }, + { + "epoch": 3.9226393629124003, + "grad_norm": 1.036294937133789, + "learning_rate": 0.00021638225255972697, + "loss": 1.8824, + "step": 3448 + }, + { + "epoch": 3.923777019340159, + "grad_norm": 1.0031839609146118, + "learning_rate": 0.0002161547212741752, + "loss": 1.9741, + "step": 3449 + }, + { + "epoch": 3.9249146757679183, + "grad_norm": 0.8414771556854248, + "learning_rate": 0.00021592718998862342, + "loss": 1.6352, + "step": 3450 + }, + { + "epoch": 3.926052332195677, + "grad_norm": 1.0658117532730103, + "learning_rate": 0.00021569965870307168, + "loss": 1.2177, + "step": 3451 + }, + { + "epoch": 3.927189988623436, + "grad_norm": 1.3510658740997314, + "learning_rate": 0.00021547212741751992, + "loss": 2.6798, + "step": 3452 + }, + { + "epoch": 3.9283276450511946, + "grad_norm": 1.097466230392456, + "learning_rate": 0.00021524459613196816, + "loss": 1.955, + "step": 3453 + }, + { + "epoch": 3.9294653014789533, + "grad_norm": 0.5628653168678284, + "learning_rate": 0.0002150170648464164, + "loss": 1.148, + "step": 3454 + }, + { + "epoch": 3.930602957906712, + "grad_norm": 0.7025729417800903, + "learning_rate": 0.0002147895335608646, + "loss": 1.5446, + "step": 3455 + }, + { + "epoch": 3.931740614334471, + "grad_norm": 1.8733407258987427, + "learning_rate": 0.00021456200227531287, + "loss": 4.242, + "step": 3456 + }, + { + "epoch": 3.93287827076223, + "grad_norm": 1.2189491987228394, + "learning_rate": 0.0002143344709897611, + "loss": 2.2309, + "step": 3457 + }, + { + "epoch": 3.9340159271899884, + "grad_norm": 1.0556656122207642, + "learning_rate": 0.00021410693970420934, + "loss": 1.535, + "step": 3458 + }, + { + "epoch": 3.9351535836177476, + "grad_norm": 1.2942211627960205, + "learning_rate": 0.00021387940841865758, + "loss": 1.7564, + "step": 3459 + }, + { + "epoch": 3.9362912400455063, + "grad_norm": 1.3906549215316772, + "learning_rate": 0.0002136518771331058, + "loss": 2.4698, + "step": 3460 + }, + { + "epoch": 3.937428896473265, + "grad_norm": 1.5894346237182617, + "learning_rate": 0.00021342434584755405, + "loss": 3.9154, + "step": 3461 + }, + { + "epoch": 3.938566552901024, + "grad_norm": 1.111507534980774, + "learning_rate": 0.00021319681456200226, + "loss": 2.2502, + "step": 3462 + }, + { + "epoch": 3.9397042093287826, + "grad_norm": 0.9626011848449707, + "learning_rate": 0.00021296928327645053, + "loss": 2.6883, + "step": 3463 + }, + { + "epoch": 3.9408418657565414, + "grad_norm": 0.8860620856285095, + "learning_rate": 0.00021274175199089877, + "loss": 1.697, + "step": 3464 + }, + { + "epoch": 3.9419795221843, + "grad_norm": 0.9714303612709045, + "learning_rate": 0.00021251422070534698, + "loss": 3.0761, + "step": 3465 + }, + { + "epoch": 3.9431171786120593, + "grad_norm": 0.7767373323440552, + "learning_rate": 0.00021228668941979524, + "loss": 2.555, + "step": 3466 + }, + { + "epoch": 3.944254835039818, + "grad_norm": 1.177393913269043, + "learning_rate": 0.00021205915813424345, + "loss": 2.1534, + "step": 3467 + }, + { + "epoch": 3.945392491467577, + "grad_norm": 0.7708033919334412, + "learning_rate": 0.0002118316268486917, + "loss": 1.6853, + "step": 3468 + }, + { + "epoch": 3.9465301478953356, + "grad_norm": 1.034865140914917, + "learning_rate": 0.00021160409556313995, + "loss": 1.7026, + "step": 3469 + }, + { + "epoch": 3.9476678043230944, + "grad_norm": 0.769185483455658, + "learning_rate": 0.00021137656427758816, + "loss": 1.1644, + "step": 3470 + }, + { + "epoch": 3.948805460750853, + "grad_norm": 1.4568527936935425, + "learning_rate": 0.00021114903299203642, + "loss": 2.0562, + "step": 3471 + }, + { + "epoch": 3.949943117178612, + "grad_norm": 0.8477489352226257, + "learning_rate": 0.00021092150170648463, + "loss": 1.6948, + "step": 3472 + }, + { + "epoch": 3.951080773606371, + "grad_norm": 0.6970076560974121, + "learning_rate": 0.00021069397042093287, + "loss": 1.3396, + "step": 3473 + }, + { + "epoch": 3.9522184300341294, + "grad_norm": 0.5849756598472595, + "learning_rate": 0.00021046643913538114, + "loss": 1.2495, + "step": 3474 + }, + { + "epoch": 3.9533560864618886, + "grad_norm": 0.9044439196586609, + "learning_rate": 0.00021023890784982934, + "loss": 2.0495, + "step": 3475 + }, + { + "epoch": 3.9544937428896474, + "grad_norm": 1.0596073865890503, + "learning_rate": 0.0002100113765642776, + "loss": 2.21, + "step": 3476 + }, + { + "epoch": 3.955631399317406, + "grad_norm": 1.0529900789260864, + "learning_rate": 0.00020978384527872582, + "loss": 2.0394, + "step": 3477 + }, + { + "epoch": 3.956769055745165, + "grad_norm": 1.1782349348068237, + "learning_rate": 0.00020955631399317406, + "loss": 2.4599, + "step": 3478 + }, + { + "epoch": 3.9579067121729237, + "grad_norm": 0.6079378128051758, + "learning_rate": 0.0002093287827076223, + "loss": 1.3727, + "step": 3479 + }, + { + "epoch": 3.9590443686006824, + "grad_norm": 0.9474525451660156, + "learning_rate": 0.00020910125142207053, + "loss": 2.3577, + "step": 3480 + }, + { + "epoch": 3.960182025028441, + "grad_norm": 0.8396434783935547, + "learning_rate": 0.0002088737201365188, + "loss": 1.6286, + "step": 3481 + }, + { + "epoch": 3.9613196814562004, + "grad_norm": 0.784811794757843, + "learning_rate": 0.000208646188850967, + "loss": 1.852, + "step": 3482 + }, + { + "epoch": 3.962457337883959, + "grad_norm": 1.2220088243484497, + "learning_rate": 0.00020841865756541524, + "loss": 2.4076, + "step": 3483 + }, + { + "epoch": 3.963594994311718, + "grad_norm": 0.8224531412124634, + "learning_rate": 0.00020819112627986348, + "loss": 2.0785, + "step": 3484 + }, + { + "epoch": 3.9647326507394767, + "grad_norm": 1.1083797216415405, + "learning_rate": 0.00020796359499431171, + "loss": 2.125, + "step": 3485 + }, + { + "epoch": 3.9658703071672354, + "grad_norm": 0.8573135137557983, + "learning_rate": 0.00020773606370875998, + "loss": 1.063, + "step": 3486 + }, + { + "epoch": 3.967007963594994, + "grad_norm": 1.380513072013855, + "learning_rate": 0.0002075085324232082, + "loss": 3.7957, + "step": 3487 + }, + { + "epoch": 3.968145620022753, + "grad_norm": 0.7844088673591614, + "learning_rate": 0.00020728100113765643, + "loss": 1.1714, + "step": 3488 + }, + { + "epoch": 3.969283276450512, + "grad_norm": 0.8056669235229492, + "learning_rate": 0.00020705346985210466, + "loss": 1.4436, + "step": 3489 + }, + { + "epoch": 3.970420932878271, + "grad_norm": 1.3273180723190308, + "learning_rate": 0.0002068259385665529, + "loss": 3.3308, + "step": 3490 + }, + { + "epoch": 3.9715585893060297, + "grad_norm": 0.7416298389434814, + "learning_rate": 0.00020659840728100116, + "loss": 1.7255, + "step": 3491 + }, + { + "epoch": 3.9726962457337884, + "grad_norm": 1.28645658493042, + "learning_rate": 0.00020637087599544937, + "loss": 2.2977, + "step": 3492 + }, + { + "epoch": 3.973833902161547, + "grad_norm": 0.637898862361908, + "learning_rate": 0.0002061433447098976, + "loss": 1.4102, + "step": 3493 + }, + { + "epoch": 3.974971558589306, + "grad_norm": 0.5494396686553955, + "learning_rate": 0.00020591581342434585, + "loss": 1.001, + "step": 3494 + }, + { + "epoch": 3.9761092150170647, + "grad_norm": 1.295616626739502, + "learning_rate": 0.00020568828213879408, + "loss": 3.9937, + "step": 3495 + }, + { + "epoch": 3.977246871444824, + "grad_norm": 1.2073827981948853, + "learning_rate": 0.00020546075085324232, + "loss": 2.9927, + "step": 3496 + }, + { + "epoch": 3.9783845278725822, + "grad_norm": 0.5181747674942017, + "learning_rate": 0.00020523321956769056, + "loss": 0.6257, + "step": 3497 + }, + { + "epoch": 3.9795221843003414, + "grad_norm": 1.6364872455596924, + "learning_rate": 0.0002050056882821388, + "loss": 3.7021, + "step": 3498 + }, + { + "epoch": 3.9806598407281, + "grad_norm": 0.7388929724693298, + "learning_rate": 0.00020477815699658703, + "loss": 1.1443, + "step": 3499 + }, + { + "epoch": 3.981797497155859, + "grad_norm": 0.8518296480178833, + "learning_rate": 0.00020455062571103527, + "loss": 1.7027, + "step": 3500 + }, + { + "epoch": 3.9829351535836177, + "grad_norm": 0.885672390460968, + "learning_rate": 0.0002043230944254835, + "loss": 1.5378, + "step": 3501 + }, + { + "epoch": 3.9840728100113765, + "grad_norm": 0.7524988055229187, + "learning_rate": 0.00020409556313993174, + "loss": 1.4504, + "step": 3502 + }, + { + "epoch": 3.9852104664391352, + "grad_norm": 0.8979845643043518, + "learning_rate": 0.00020386803185437998, + "loss": 1.747, + "step": 3503 + }, + { + "epoch": 3.986348122866894, + "grad_norm": 0.9114024639129639, + "learning_rate": 0.00020364050056882822, + "loss": 2.2745, + "step": 3504 + }, + { + "epoch": 3.987485779294653, + "grad_norm": 1.3460856676101685, + "learning_rate": 0.00020341296928327645, + "loss": 2.4317, + "step": 3505 + }, + { + "epoch": 3.988623435722412, + "grad_norm": 0.6706535816192627, + "learning_rate": 0.0002031854379977247, + "loss": 1.5125, + "step": 3506 + }, + { + "epoch": 3.9897610921501707, + "grad_norm": 0.6658063530921936, + "learning_rate": 0.00020295790671217293, + "loss": 0.6909, + "step": 3507 + }, + { + "epoch": 3.9908987485779295, + "grad_norm": 0.7595200538635254, + "learning_rate": 0.00020273037542662117, + "loss": 1.3346, + "step": 3508 + }, + { + "epoch": 3.9920364050056882, + "grad_norm": 1.1703503131866455, + "learning_rate": 0.0002025028441410694, + "loss": 1.7334, + "step": 3509 + }, + { + "epoch": 3.993174061433447, + "grad_norm": 1.095198154449463, + "learning_rate": 0.00020227531285551764, + "loss": 2.9393, + "step": 3510 + }, + { + "epoch": 3.9943117178612058, + "grad_norm": 1.123388409614563, + "learning_rate": 0.00020204778156996588, + "loss": 1.6582, + "step": 3511 + }, + { + "epoch": 3.995449374288965, + "grad_norm": 0.6648460626602173, + "learning_rate": 0.00020182025028441411, + "loss": 1.0768, + "step": 3512 + }, + { + "epoch": 3.9965870307167233, + "grad_norm": 1.1515564918518066, + "learning_rate": 0.00020159271899886232, + "loss": 3.5262, + "step": 3513 + }, + { + "epoch": 3.9977246871444825, + "grad_norm": 0.7331502437591553, + "learning_rate": 0.0002013651877133106, + "loss": 1.3639, + "step": 3514 + }, + { + "epoch": 3.9988623435722412, + "grad_norm": 1.1474095582962036, + "learning_rate": 0.00020113765642775882, + "loss": 1.7056, + "step": 3515 + }, + { + "epoch": 4.0, + "grad_norm": 1.1696233749389648, + "learning_rate": 0.00020091012514220706, + "loss": 2.7192, + "step": 3516 + }, + { + "epoch": 4.0, + "eval_f1": 0.8897, + "eval_gen_len": 49.6091, + "eval_loss": 1.8271052837371826, + "eval_precision": 0.8878, + "eval_recall": 0.8917, + "eval_rouge1": 0.4366, + "eval_rouge2": 0.1966, + "eval_rougeL": 0.3643, + "eval_rougeLsum": 0.4041, + "eval_runtime": 28.1666, + "eval_samples_per_second": 3.905, + "eval_steps_per_second": 0.497, + "step": 3516 + }, + { + "epoch": 4.001137656427759, + "grad_norm": 0.8035992383956909, + "learning_rate": 0.0002006825938566553, + "loss": 2.5708, + "step": 3517 + }, + { + "epoch": 4.0022753128555175, + "grad_norm": 0.5220395922660828, + "learning_rate": 0.0002004550625711035, + "loss": 0.9217, + "step": 3518 + }, + { + "epoch": 4.003412969283277, + "grad_norm": 0.8917536735534668, + "learning_rate": 0.00020022753128555177, + "loss": 1.7236, + "step": 3519 + }, + { + "epoch": 4.004550625711035, + "grad_norm": 1.0690449476242065, + "learning_rate": 0.0002, + "loss": 2.2547, + "step": 3520 + }, + { + "epoch": 4.005688282138794, + "grad_norm": 1.0998704433441162, + "learning_rate": 0.00019977246871444825, + "loss": 1.9161, + "step": 3521 + }, + { + "epoch": 4.006825938566553, + "grad_norm": 0.6901816129684448, + "learning_rate": 0.00019954493742889648, + "loss": 0.828, + "step": 3522 + }, + { + "epoch": 4.007963594994312, + "grad_norm": 1.0238609313964844, + "learning_rate": 0.0001993174061433447, + "loss": 1.0532, + "step": 3523 + }, + { + "epoch": 4.009101251422071, + "grad_norm": 0.8400004506111145, + "learning_rate": 0.00019908987485779296, + "loss": 1.5558, + "step": 3524 + }, + { + "epoch": 4.010238907849829, + "grad_norm": 0.8628343343734741, + "learning_rate": 0.0001988623435722412, + "loss": 1.6871, + "step": 3525 + }, + { + "epoch": 4.0113765642775885, + "grad_norm": 1.2406960725784302, + "learning_rate": 0.00019863481228668943, + "loss": 1.9634, + "step": 3526 + }, + { + "epoch": 4.012514220705347, + "grad_norm": 1.2915693521499634, + "learning_rate": 0.00019840728100113767, + "loss": 2.4605, + "step": 3527 + }, + { + "epoch": 4.013651877133106, + "grad_norm": 1.227972149848938, + "learning_rate": 0.00019817974971558588, + "loss": 2.6666, + "step": 3528 + }, + { + "epoch": 4.014789533560864, + "grad_norm": 1.1289844512939453, + "learning_rate": 0.00019795221843003414, + "loss": 1.9807, + "step": 3529 + }, + { + "epoch": 4.0159271899886235, + "grad_norm": 0.8085461258888245, + "learning_rate": 0.00019772468714448235, + "loss": 0.9486, + "step": 3530 + }, + { + "epoch": 4.017064846416382, + "grad_norm": 0.6080414652824402, + "learning_rate": 0.00019749715585893062, + "loss": 0.995, + "step": 3531 + }, + { + "epoch": 4.018202502844141, + "grad_norm": 0.7787399291992188, + "learning_rate": 0.00019726962457337885, + "loss": 1.6868, + "step": 3532 + }, + { + "epoch": 4.0193401592719, + "grad_norm": 0.7362167239189148, + "learning_rate": 0.00019704209328782706, + "loss": 1.096, + "step": 3533 + }, + { + "epoch": 4.020477815699659, + "grad_norm": 1.1093764305114746, + "learning_rate": 0.00019681456200227533, + "loss": 1.16, + "step": 3534 + }, + { + "epoch": 4.021615472127418, + "grad_norm": 1.1228350400924683, + "learning_rate": 0.00019658703071672354, + "loss": 2.6241, + "step": 3535 + }, + { + "epoch": 4.022753128555176, + "grad_norm": 0.7320429086685181, + "learning_rate": 0.0001963594994311718, + "loss": 1.7036, + "step": 3536 + }, + { + "epoch": 4.023890784982935, + "grad_norm": 0.8620187640190125, + "learning_rate": 0.00019613196814562004, + "loss": 1.7516, + "step": 3537 + }, + { + "epoch": 4.025028441410694, + "grad_norm": 0.8271589875221252, + "learning_rate": 0.00019590443686006825, + "loss": 1.4492, + "step": 3538 + }, + { + "epoch": 4.026166097838453, + "grad_norm": 1.1758748292922974, + "learning_rate": 0.0001956769055745165, + "loss": 2.0319, + "step": 3539 + }, + { + "epoch": 4.027303754266212, + "grad_norm": 1.373871088027954, + "learning_rate": 0.00019544937428896472, + "loss": 1.6359, + "step": 3540 + }, + { + "epoch": 4.02844141069397, + "grad_norm": 0.7867767214775085, + "learning_rate": 0.00019522184300341299, + "loss": 1.3931, + "step": 3541 + }, + { + "epoch": 4.0295790671217295, + "grad_norm": 0.9068583846092224, + "learning_rate": 0.00019499431171786122, + "loss": 1.5971, + "step": 3542 + }, + { + "epoch": 4.030716723549488, + "grad_norm": 1.1074446439743042, + "learning_rate": 0.00019476678043230943, + "loss": 2.0918, + "step": 3543 + }, + { + "epoch": 4.031854379977247, + "grad_norm": 0.8541181087493896, + "learning_rate": 0.0001945392491467577, + "loss": 2.3143, + "step": 3544 + }, + { + "epoch": 4.032992036405005, + "grad_norm": 1.084362268447876, + "learning_rate": 0.0001943117178612059, + "loss": 1.7493, + "step": 3545 + }, + { + "epoch": 4.034129692832765, + "grad_norm": 1.041742205619812, + "learning_rate": 0.00019408418657565417, + "loss": 1.8627, + "step": 3546 + }, + { + "epoch": 4.035267349260523, + "grad_norm": 0.7245301604270935, + "learning_rate": 0.00019385665529010238, + "loss": 0.726, + "step": 3547 + }, + { + "epoch": 4.036405005688282, + "grad_norm": 0.8224193453788757, + "learning_rate": 0.00019362912400455062, + "loss": 1.5722, + "step": 3548 + }, + { + "epoch": 4.037542662116041, + "grad_norm": 1.1694352626800537, + "learning_rate": 0.00019340159271899888, + "loss": 1.0027, + "step": 3549 + }, + { + "epoch": 4.0386803185438, + "grad_norm": 0.9688643217086792, + "learning_rate": 0.0001931740614334471, + "loss": 2.2008, + "step": 3550 + }, + { + "epoch": 4.039817974971559, + "grad_norm": 0.6902188062667847, + "learning_rate": 0.00019294653014789536, + "loss": 1.4584, + "step": 3551 + }, + { + "epoch": 4.040955631399317, + "grad_norm": 1.1102920770645142, + "learning_rate": 0.00019271899886234357, + "loss": 2.3507, + "step": 3552 + }, + { + "epoch": 4.042093287827076, + "grad_norm": 0.5606656074523926, + "learning_rate": 0.0001924914675767918, + "loss": 1.1701, + "step": 3553 + }, + { + "epoch": 4.043230944254835, + "grad_norm": 0.5422776937484741, + "learning_rate": 0.00019226393629124007, + "loss": 1.1612, + "step": 3554 + }, + { + "epoch": 4.044368600682594, + "grad_norm": 1.0809518098831177, + "learning_rate": 0.00019203640500568828, + "loss": 2.5527, + "step": 3555 + }, + { + "epoch": 4.045506257110353, + "grad_norm": 0.9332743287086487, + "learning_rate": 0.00019180887372013654, + "loss": 1.4009, + "step": 3556 + }, + { + "epoch": 4.046643913538111, + "grad_norm": 0.721682608127594, + "learning_rate": 0.00019158134243458475, + "loss": 0.9377, + "step": 3557 + }, + { + "epoch": 4.047781569965871, + "grad_norm": 0.8674675226211548, + "learning_rate": 0.000191353811149033, + "loss": 1.783, + "step": 3558 + }, + { + "epoch": 4.048919226393629, + "grad_norm": 0.8048381209373474, + "learning_rate": 0.00019112627986348125, + "loss": 1.9681, + "step": 3559 + }, + { + "epoch": 4.050056882821388, + "grad_norm": 1.0859109163284302, + "learning_rate": 0.00019089874857792946, + "loss": 1.4804, + "step": 3560 + }, + { + "epoch": 4.051194539249146, + "grad_norm": 0.8473076820373535, + "learning_rate": 0.00019067121729237773, + "loss": 1.2142, + "step": 3561 + }, + { + "epoch": 4.052332195676906, + "grad_norm": 0.7596136331558228, + "learning_rate": 0.00019044368600682594, + "loss": 1.2756, + "step": 3562 + }, + { + "epoch": 4.053469852104665, + "grad_norm": 0.7931225895881653, + "learning_rate": 0.00019021615472127417, + "loss": 1.4067, + "step": 3563 + }, + { + "epoch": 4.054607508532423, + "grad_norm": 0.8623807430267334, + "learning_rate": 0.0001899886234357224, + "loss": 1.6287, + "step": 3564 + }, + { + "epoch": 4.055745164960182, + "grad_norm": 1.0621230602264404, + "learning_rate": 0.00018976109215017065, + "loss": 2.8103, + "step": 3565 + }, + { + "epoch": 4.056882821387941, + "grad_norm": 0.8181642293930054, + "learning_rate": 0.0001895335608646189, + "loss": 1.1884, + "step": 3566 + }, + { + "epoch": 4.0580204778157, + "grad_norm": 0.9738771319389343, + "learning_rate": 0.00018930602957906712, + "loss": 1.9755, + "step": 3567 + }, + { + "epoch": 4.059158134243458, + "grad_norm": 0.9224938154220581, + "learning_rate": 0.00018907849829351536, + "loss": 1.288, + "step": 3568 + }, + { + "epoch": 4.060295790671217, + "grad_norm": 0.8679994940757751, + "learning_rate": 0.0001888509670079636, + "loss": 1.4742, + "step": 3569 + }, + { + "epoch": 4.061433447098976, + "grad_norm": 1.6250495910644531, + "learning_rate": 0.00018862343572241183, + "loss": 3.5226, + "step": 3570 + }, + { + "epoch": 4.062571103526735, + "grad_norm": NaN, + "learning_rate": 0.00018862343572241183, + "loss": 1.1473, + "step": 3571 + }, + { + "epoch": 4.063708759954494, + "grad_norm": 1.0141960382461548, + "learning_rate": 0.0001883959044368601, + "loss": 2.936, + "step": 3572 + }, + { + "epoch": 4.064846416382252, + "grad_norm": 1.2739934921264648, + "learning_rate": 0.0001881683731513083, + "loss": 2.1047, + "step": 3573 + }, + { + "epoch": 4.065984072810012, + "grad_norm": 1.6481419801712036, + "learning_rate": 0.00018794084186575654, + "loss": 3.6404, + "step": 3574 + }, + { + "epoch": 4.06712172923777, + "grad_norm": 0.9871388673782349, + "learning_rate": 0.00018771331058020478, + "loss": 2.5576, + "step": 3575 + }, + { + "epoch": 4.068259385665529, + "grad_norm": 0.8460843563079834, + "learning_rate": 0.00018748577929465302, + "loss": 2.5027, + "step": 3576 + }, + { + "epoch": 4.0693970420932875, + "grad_norm": 0.9054187536239624, + "learning_rate": 0.00018725824800910125, + "loss": 1.5227, + "step": 3577 + }, + { + "epoch": 4.070534698521047, + "grad_norm": 1.1341551542282104, + "learning_rate": 0.0001870307167235495, + "loss": 1.8531, + "step": 3578 + }, + { + "epoch": 4.071672354948806, + "grad_norm": 0.7718223929405212, + "learning_rate": 0.00018680318543799773, + "loss": 1.1577, + "step": 3579 + }, + { + "epoch": 4.072810011376564, + "grad_norm": 0.7591809034347534, + "learning_rate": 0.00018657565415244596, + "loss": 1.069, + "step": 3580 + }, + { + "epoch": 4.073947667804323, + "grad_norm": 1.5402214527130127, + "learning_rate": 0.0001863481228668942, + "loss": 3.2458, + "step": 3581 + }, + { + "epoch": 4.075085324232082, + "grad_norm": 1.6219172477722168, + "learning_rate": 0.00018612059158134244, + "loss": 2.8352, + "step": 3582 + }, + { + "epoch": 4.076222980659841, + "grad_norm": 0.7609260082244873, + "learning_rate": 0.00018589306029579068, + "loss": 1.5297, + "step": 3583 + }, + { + "epoch": 4.077360637087599, + "grad_norm": 0.9463198184967041, + "learning_rate": 0.0001856655290102389, + "loss": 1.4272, + "step": 3584 + }, + { + "epoch": 4.078498293515358, + "grad_norm": 1.0422461032867432, + "learning_rate": 0.00018543799772468715, + "loss": 2.3036, + "step": 3585 + }, + { + "epoch": 4.079635949943118, + "grad_norm": 1.6263346672058105, + "learning_rate": 0.0001852104664391354, + "loss": 2.6899, + "step": 3586 + }, + { + "epoch": 4.080773606370876, + "grad_norm": 0.8726171851158142, + "learning_rate": 0.0001849829351535836, + "loss": 1.9618, + "step": 3587 + }, + { + "epoch": 4.081911262798635, + "grad_norm": 1.0334999561309814, + "learning_rate": 0.00018475540386803186, + "loss": 1.9655, + "step": 3588 + }, + { + "epoch": 4.0830489192263935, + "grad_norm": 1.2761929035186768, + "learning_rate": 0.0001845278725824801, + "loss": 1.1134, + "step": 3589 + }, + { + "epoch": 4.084186575654153, + "grad_norm": 1.0597755908966064, + "learning_rate": 0.00018430034129692833, + "loss": 1.6925, + "step": 3590 + }, + { + "epoch": 4.085324232081911, + "grad_norm": 1.0300536155700684, + "learning_rate": 0.00018407281001137657, + "loss": 1.877, + "step": 3591 + }, + { + "epoch": 4.08646188850967, + "grad_norm": 1.1007896661758423, + "learning_rate": 0.00018384527872582478, + "loss": 1.8541, + "step": 3592 + }, + { + "epoch": 4.0875995449374285, + "grad_norm": 0.9704376459121704, + "learning_rate": 0.00018361774744027305, + "loss": 1.1683, + "step": 3593 + }, + { + "epoch": 4.088737201365188, + "grad_norm": 1.4303745031356812, + "learning_rate": 0.00018339021615472128, + "loss": 1.9568, + "step": 3594 + }, + { + "epoch": 4.089874857792947, + "grad_norm": 0.7820960879325867, + "learning_rate": 0.00018316268486916952, + "loss": 1.3428, + "step": 3595 + }, + { + "epoch": 4.091012514220705, + "grad_norm": 0.9371392726898193, + "learning_rate": 0.00018293515358361776, + "loss": 1.4801, + "step": 3596 + }, + { + "epoch": 4.092150170648464, + "grad_norm": 0.9383748173713684, + "learning_rate": 0.00018270762229806597, + "loss": 1.8923, + "step": 3597 + }, + { + "epoch": 4.093287827076223, + "grad_norm": 1.0831904411315918, + "learning_rate": 0.00018248009101251423, + "loss": 2.4795, + "step": 3598 + }, + { + "epoch": 4.094425483503982, + "grad_norm": 0.872491180896759, + "learning_rate": 0.00018225255972696247, + "loss": 1.6777, + "step": 3599 + }, + { + "epoch": 4.09556313993174, + "grad_norm": 1.4505406618118286, + "learning_rate": 0.0001820250284414107, + "loss": 2.7823, + "step": 3600 + }, + { + "epoch": 4.0967007963594995, + "grad_norm": 1.2872380018234253, + "learning_rate": 0.00018179749715585894, + "loss": 2.3393, + "step": 3601 + }, + { + "epoch": 4.097838452787259, + "grad_norm": 1.019313097000122, + "learning_rate": 0.00018156996587030715, + "loss": 2.0507, + "step": 3602 + }, + { + "epoch": 4.098976109215017, + "grad_norm": 1.1505863666534424, + "learning_rate": 0.00018134243458475542, + "loss": 1.7178, + "step": 3603 + }, + { + "epoch": 4.100113765642776, + "grad_norm": 0.6773290038108826, + "learning_rate": 0.00018111490329920363, + "loss": 1.6069, + "step": 3604 + }, + { + "epoch": 4.1012514220705345, + "grad_norm": 0.7799268960952759, + "learning_rate": 0.0001808873720136519, + "loss": 1.6404, + "step": 3605 + }, + { + "epoch": 4.102389078498294, + "grad_norm": 1.3413171768188477, + "learning_rate": 0.00018065984072810013, + "loss": 4.3731, + "step": 3606 + }, + { + "epoch": 4.103526734926052, + "grad_norm": 1.068517804145813, + "learning_rate": 0.00018043230944254834, + "loss": 1.7495, + "step": 3607 + }, + { + "epoch": 4.104664391353811, + "grad_norm": 0.895536482334137, + "learning_rate": 0.0001802047781569966, + "loss": 2.339, + "step": 3608 + }, + { + "epoch": 4.1058020477815695, + "grad_norm": 1.09737229347229, + "learning_rate": 0.0001799772468714448, + "loss": 1.8625, + "step": 3609 + }, + { + "epoch": 4.106939704209329, + "grad_norm": 1.0622013807296753, + "learning_rate": 0.00017974971558589307, + "loss": 1.309, + "step": 3610 + }, + { + "epoch": 4.108077360637088, + "grad_norm": 1.0724921226501465, + "learning_rate": 0.0001795221843003413, + "loss": 2.1258, + "step": 3611 + }, + { + "epoch": 4.109215017064846, + "grad_norm": 0.6296405792236328, + "learning_rate": 0.00017929465301478952, + "loss": 1.1405, + "step": 3612 + }, + { + "epoch": 4.1103526734926055, + "grad_norm": 0.6233872771263123, + "learning_rate": 0.00017906712172923779, + "loss": 1.4618, + "step": 3613 + }, + { + "epoch": 4.111490329920364, + "grad_norm": 1.0284563302993774, + "learning_rate": 0.000178839590443686, + "loss": 2.1088, + "step": 3614 + }, + { + "epoch": 4.112627986348123, + "grad_norm": 0.5234109163284302, + "learning_rate": 0.00017861205915813426, + "loss": 0.6546, + "step": 3615 + }, + { + "epoch": 4.113765642775881, + "grad_norm": 1.1184611320495605, + "learning_rate": 0.0001783845278725825, + "loss": 2.2601, + "step": 3616 + }, + { + "epoch": 4.1149032992036405, + "grad_norm": 0.5588635802268982, + "learning_rate": 0.0001781569965870307, + "loss": 1.2117, + "step": 3617 + }, + { + "epoch": 4.1160409556314, + "grad_norm": 1.029264211654663, + "learning_rate": 0.00017792946530147897, + "loss": 2.4449, + "step": 3618 + }, + { + "epoch": 4.117178612059158, + "grad_norm": 0.6462013125419617, + "learning_rate": 0.00017770193401592718, + "loss": 1.0404, + "step": 3619 + }, + { + "epoch": 4.118316268486917, + "grad_norm": 0.8388407826423645, + "learning_rate": 0.00017747440273037544, + "loss": 1.1833, + "step": 3620 + }, + { + "epoch": 4.1194539249146755, + "grad_norm": 0.6785479187965393, + "learning_rate": 0.00017724687144482365, + "loss": 1.404, + "step": 3621 + }, + { + "epoch": 4.120591581342435, + "grad_norm": 0.742090106010437, + "learning_rate": 0.0001770193401592719, + "loss": 1.844, + "step": 3622 + }, + { + "epoch": 4.121729237770193, + "grad_norm": 0.5642427206039429, + "learning_rate": 0.00017679180887372016, + "loss": 0.8566, + "step": 3623 + }, + { + "epoch": 4.122866894197952, + "grad_norm": 1.279248595237732, + "learning_rate": 0.00017656427758816837, + "loss": 2.091, + "step": 3624 + }, + { + "epoch": 4.1240045506257115, + "grad_norm": 0.8221580386161804, + "learning_rate": 0.00017633674630261663, + "loss": 1.0874, + "step": 3625 + }, + { + "epoch": 4.12514220705347, + "grad_norm": 1.105421543121338, + "learning_rate": 0.00017610921501706484, + "loss": 1.8684, + "step": 3626 + }, + { + "epoch": 4.126279863481229, + "grad_norm": 0.6208410859107971, + "learning_rate": 0.00017588168373151308, + "loss": 1.1682, + "step": 3627 + }, + { + "epoch": 4.127417519908987, + "grad_norm": 1.1576206684112549, + "learning_rate": 0.00017565415244596134, + "loss": 2.1256, + "step": 3628 + }, + { + "epoch": 4.1285551763367465, + "grad_norm": 0.9814143180847168, + "learning_rate": 0.00017542662116040955, + "loss": 2.8486, + "step": 3629 + }, + { + "epoch": 4.129692832764505, + "grad_norm": 1.1456234455108643, + "learning_rate": 0.00017519908987485781, + "loss": 1.3626, + "step": 3630 + }, + { + "epoch": 4.130830489192264, + "grad_norm": 0.8578894734382629, + "learning_rate": 0.00017497155858930602, + "loss": 1.7545, + "step": 3631 + }, + { + "epoch": 4.131968145620022, + "grad_norm": 0.7834432721138, + "learning_rate": 0.00017474402730375426, + "loss": 1.7061, + "step": 3632 + }, + { + "epoch": 4.1331058020477816, + "grad_norm": 1.3007863759994507, + "learning_rate": 0.00017451649601820253, + "loss": 1.5035, + "step": 3633 + }, + { + "epoch": 4.134243458475541, + "grad_norm": 1.0654963254928589, + "learning_rate": 0.00017428896473265074, + "loss": 2.8058, + "step": 3634 + }, + { + "epoch": 4.135381114903299, + "grad_norm": 1.042611002922058, + "learning_rate": 0.000174061433447099, + "loss": 1.5158, + "step": 3635 + }, + { + "epoch": 4.136518771331058, + "grad_norm": 0.8858871459960938, + "learning_rate": 0.0001738339021615472, + "loss": 2.2742, + "step": 3636 + }, + { + "epoch": 4.137656427758817, + "grad_norm": 1.099360466003418, + "learning_rate": 0.00017360637087599545, + "loss": 2.7292, + "step": 3637 + }, + { + "epoch": 4.138794084186576, + "grad_norm": 0.520845890045166, + "learning_rate": 0.00017337883959044368, + "loss": 0.512, + "step": 3638 + }, + { + "epoch": 4.139931740614334, + "grad_norm": 1.025590181350708, + "learning_rate": 0.00017315130830489192, + "loss": 2.4849, + "step": 3639 + }, + { + "epoch": 4.141069397042093, + "grad_norm": 2.0368833541870117, + "learning_rate": 0.00017292377701934018, + "loss": 2.6333, + "step": 3640 + }, + { + "epoch": 4.1422070534698525, + "grad_norm": 0.952337920665741, + "learning_rate": 0.0001726962457337884, + "loss": 0.9957, + "step": 3641 + }, + { + "epoch": 4.143344709897611, + "grad_norm": 0.782004177570343, + "learning_rate": 0.00017246871444823663, + "loss": 1.4577, + "step": 3642 + }, + { + "epoch": 4.14448236632537, + "grad_norm": 0.8281341195106506, + "learning_rate": 0.00017224118316268487, + "loss": 1.6821, + "step": 3643 + }, + { + "epoch": 4.145620022753128, + "grad_norm": 0.8365357518196106, + "learning_rate": 0.0001720136518771331, + "loss": 2.0037, + "step": 3644 + }, + { + "epoch": 4.146757679180888, + "grad_norm": 0.7121614813804626, + "learning_rate": 0.00017178612059158137, + "loss": 1.2755, + "step": 3645 + }, + { + "epoch": 4.147895335608646, + "grad_norm": 0.890727162361145, + "learning_rate": 0.00017155858930602958, + "loss": 1.4112, + "step": 3646 + }, + { + "epoch": 4.149032992036405, + "grad_norm": 0.9248006343841553, + "learning_rate": 0.00017133105802047782, + "loss": 1.4232, + "step": 3647 + }, + { + "epoch": 4.150170648464163, + "grad_norm": 0.775360107421875, + "learning_rate": 0.00017110352673492605, + "loss": 1.675, + "step": 3648 + }, + { + "epoch": 4.151308304891923, + "grad_norm": 0.8243028521537781, + "learning_rate": 0.0001708759954493743, + "loss": 1.8722, + "step": 3649 + }, + { + "epoch": 4.152445961319682, + "grad_norm": 1.020837426185608, + "learning_rate": 0.00017064846416382255, + "loss": 2.4388, + "step": 3650 + }, + { + "epoch": 4.15358361774744, + "grad_norm": 1.433733344078064, + "learning_rate": 0.00017042093287827076, + "loss": 3.276, + "step": 3651 + }, + { + "epoch": 4.154721274175199, + "grad_norm": 0.7089780569076538, + "learning_rate": 0.000170193401592719, + "loss": 1.5784, + "step": 3652 + }, + { + "epoch": 4.155858930602958, + "grad_norm": 1.084784984588623, + "learning_rate": 0.00016996587030716724, + "loss": 1.4955, + "step": 3653 + }, + { + "epoch": 4.156996587030717, + "grad_norm": 0.6325297951698303, + "learning_rate": 0.00016973833902161548, + "loss": 0.6437, + "step": 3654 + }, + { + "epoch": 4.158134243458475, + "grad_norm": 0.7360290884971619, + "learning_rate": 0.0001695108077360637, + "loss": 0.833, + "step": 3655 + }, + { + "epoch": 4.159271899886234, + "grad_norm": 0.983505129814148, + "learning_rate": 0.00016928327645051195, + "loss": 2.5266, + "step": 3656 + }, + { + "epoch": 4.160409556313994, + "grad_norm": 1.1645041704177856, + "learning_rate": 0.00016905574516496019, + "loss": 1.9561, + "step": 3657 + }, + { + "epoch": 4.161547212741752, + "grad_norm": 0.8973751068115234, + "learning_rate": 0.00016882821387940842, + "loss": 0.9499, + "step": 3658 + }, + { + "epoch": 4.162684869169511, + "grad_norm": 0.748742401599884, + "learning_rate": 0.00016860068259385666, + "loss": 1.542, + "step": 3659 + }, + { + "epoch": 4.163822525597269, + "grad_norm": 0.6661348342895508, + "learning_rate": 0.0001683731513083049, + "loss": 1.4095, + "step": 3660 + }, + { + "epoch": 4.164960182025029, + "grad_norm": 1.0042301416397095, + "learning_rate": 0.00016814562002275313, + "loss": 1.6135, + "step": 3661 + }, + { + "epoch": 4.166097838452787, + "grad_norm": 1.056707739830017, + "learning_rate": 0.00016791808873720137, + "loss": 1.8746, + "step": 3662 + }, + { + "epoch": 4.167235494880546, + "grad_norm": 1.1747101545333862, + "learning_rate": 0.0001676905574516496, + "loss": 1.8581, + "step": 3663 + }, + { + "epoch": 4.168373151308305, + "grad_norm": 0.9841439723968506, + "learning_rate": 0.00016746302616609785, + "loss": 2.1848, + "step": 3664 + }, + { + "epoch": 4.169510807736064, + "grad_norm": 1.0959367752075195, + "learning_rate": 0.00016723549488054606, + "loss": 2.9336, + "step": 3665 + }, + { + "epoch": 4.170648464163823, + "grad_norm": 1.0343743562698364, + "learning_rate": 0.00016700796359499432, + "loss": 1.8865, + "step": 3666 + }, + { + "epoch": 4.171786120591581, + "grad_norm": 1.1250258684158325, + "learning_rate": 0.00016678043230944256, + "loss": 2.206, + "step": 3667 + }, + { + "epoch": 4.17292377701934, + "grad_norm": 0.8602643609046936, + "learning_rate": 0.0001665529010238908, + "loss": 1.6804, + "step": 3668 + }, + { + "epoch": 4.174061433447099, + "grad_norm": 1.1515132188796997, + "learning_rate": 0.00016632536973833903, + "loss": 2.0477, + "step": 3669 + }, + { + "epoch": 4.175199089874858, + "grad_norm": 1.0483062267303467, + "learning_rate": 0.00016609783845278724, + "loss": 1.4297, + "step": 3670 + }, + { + "epoch": 4.176336746302616, + "grad_norm": 1.0967140197753906, + "learning_rate": 0.0001658703071672355, + "loss": 2.4236, + "step": 3671 + }, + { + "epoch": 4.177474402730375, + "grad_norm": 0.9350886940956116, + "learning_rate": 0.00016564277588168371, + "loss": 2.0317, + "step": 3672 + }, + { + "epoch": 4.178612059158135, + "grad_norm": 0.6748828291893005, + "learning_rate": 0.00016541524459613198, + "loss": 1.6643, + "step": 3673 + }, + { + "epoch": 4.179749715585893, + "grad_norm": 1.0818042755126953, + "learning_rate": 0.00016518771331058022, + "loss": 1.7651, + "step": 3674 + }, + { + "epoch": 4.180887372013652, + "grad_norm": 0.8341732621192932, + "learning_rate": 0.00016496018202502842, + "loss": 1.7182, + "step": 3675 + }, + { + "epoch": 4.18202502844141, + "grad_norm": 0.7781969904899597, + "learning_rate": 0.0001647326507394767, + "loss": 1.5752, + "step": 3676 + }, + { + "epoch": 4.18316268486917, + "grad_norm": 0.9930599927902222, + "learning_rate": 0.0001645051194539249, + "loss": 1.7508, + "step": 3677 + }, + { + "epoch": 4.184300341296928, + "grad_norm": 1.0659433603286743, + "learning_rate": 0.00016427758816837316, + "loss": 2.5191, + "step": 3678 + }, + { + "epoch": 4.185437997724687, + "grad_norm": 0.7574253082275391, + "learning_rate": 0.0001640500568828214, + "loss": 1.2169, + "step": 3679 + }, + { + "epoch": 4.186575654152446, + "grad_norm": 0.7488005757331848, + "learning_rate": 0.0001638225255972696, + "loss": 1.2127, + "step": 3680 + }, + { + "epoch": 4.187713310580205, + "grad_norm": 0.8474146127700806, + "learning_rate": 0.00016359499431171787, + "loss": 1.2748, + "step": 3681 + }, + { + "epoch": 4.188850967007964, + "grad_norm": 0.9467219710350037, + "learning_rate": 0.00016336746302616608, + "loss": 2.0191, + "step": 3682 + }, + { + "epoch": 4.189988623435722, + "grad_norm": 0.7899359464645386, + "learning_rate": 0.00016313993174061435, + "loss": 0.7179, + "step": 3683 + }, + { + "epoch": 4.191126279863481, + "grad_norm": 1.5700042247772217, + "learning_rate": 0.00016291240045506259, + "loss": 2.2769, + "step": 3684 + }, + { + "epoch": 4.19226393629124, + "grad_norm": 0.9096818566322327, + "learning_rate": 0.0001626848691695108, + "loss": 1.828, + "step": 3685 + }, + { + "epoch": 4.193401592718999, + "grad_norm": 0.9237043857574463, + "learning_rate": 0.00016245733788395906, + "loss": 1.4392, + "step": 3686 + }, + { + "epoch": 4.194539249146757, + "grad_norm": 1.1367732286453247, + "learning_rate": 0.00016222980659840727, + "loss": 3.0341, + "step": 3687 + }, + { + "epoch": 4.1956769055745164, + "grad_norm": 0.9193979501724243, + "learning_rate": 0.00016200227531285553, + "loss": 1.32, + "step": 3688 + }, + { + "epoch": 4.196814562002276, + "grad_norm": 1.5365267992019653, + "learning_rate": 0.00016177474402730374, + "loss": 3.1212, + "step": 3689 + }, + { + "epoch": 4.197952218430034, + "grad_norm": 0.8413816094398499, + "learning_rate": 0.00016154721274175198, + "loss": 1.7901, + "step": 3690 + }, + { + "epoch": 4.199089874857793, + "grad_norm": 0.6427741050720215, + "learning_rate": 0.00016131968145620024, + "loss": 0.5028, + "step": 3691 + }, + { + "epoch": 4.2002275312855515, + "grad_norm": 0.9684869647026062, + "learning_rate": 0.00016109215017064845, + "loss": 2.0259, + "step": 3692 + }, + { + "epoch": 4.201365187713311, + "grad_norm": 0.6879338622093201, + "learning_rate": 0.00016086461888509672, + "loss": 1.3484, + "step": 3693 + }, + { + "epoch": 4.202502844141069, + "grad_norm": 0.9934858679771423, + "learning_rate": 0.00016063708759954493, + "loss": 1.5266, + "step": 3694 + }, + { + "epoch": 4.203640500568828, + "grad_norm": 1.18158757686615, + "learning_rate": 0.00016040955631399316, + "loss": 2.0914, + "step": 3695 + }, + { + "epoch": 4.204778156996587, + "grad_norm": 0.776360809803009, + "learning_rate": 0.00016018202502844143, + "loss": 1.8111, + "step": 3696 + }, + { + "epoch": 4.205915813424346, + "grad_norm": 0.5315882563591003, + "learning_rate": 0.00015995449374288964, + "loss": 0.8625, + "step": 3697 + }, + { + "epoch": 4.207053469852105, + "grad_norm": 0.7266376614570618, + "learning_rate": 0.0001597269624573379, + "loss": 0.9552, + "step": 3698 + }, + { + "epoch": 4.208191126279863, + "grad_norm": 0.8702644109725952, + "learning_rate": 0.0001594994311717861, + "loss": 1.4781, + "step": 3699 + }, + { + "epoch": 4.2093287827076225, + "grad_norm": 0.6491544246673584, + "learning_rate": 0.00015927189988623435, + "loss": 0.9899, + "step": 3700 + }, + { + "epoch": 4.210466439135381, + "grad_norm": 1.0166810750961304, + "learning_rate": 0.00015904436860068261, + "loss": 1.7634, + "step": 3701 + }, + { + "epoch": 4.21160409556314, + "grad_norm": 1.268738031387329, + "learning_rate": 0.00015881683731513082, + "loss": 2.6249, + "step": 3702 + }, + { + "epoch": 4.212741751990899, + "grad_norm": 1.180303692817688, + "learning_rate": 0.0001585893060295791, + "loss": 3.388, + "step": 3703 + }, + { + "epoch": 4.2138794084186575, + "grad_norm": 0.736415684223175, + "learning_rate": 0.0001583617747440273, + "loss": 1.1843, + "step": 3704 + }, + { + "epoch": 4.215017064846417, + "grad_norm": 1.1527396440505981, + "learning_rate": 0.00015813424345847553, + "loss": 2.0684, + "step": 3705 + }, + { + "epoch": 4.216154721274175, + "grad_norm": 1.2436633110046387, + "learning_rate": 0.00015790671217292377, + "loss": 3.2018, + "step": 3706 + }, + { + "epoch": 4.217292377701934, + "grad_norm": 0.5855286717414856, + "learning_rate": 0.000157679180887372, + "loss": 1.0727, + "step": 3707 + }, + { + "epoch": 4.2184300341296925, + "grad_norm": 1.469887375831604, + "learning_rate": 0.00015745164960182027, + "loss": 3.1109, + "step": 3708 + }, + { + "epoch": 4.219567690557452, + "grad_norm": 0.6294847130775452, + "learning_rate": 0.00015722411831626848, + "loss": 1.3746, + "step": 3709 + }, + { + "epoch": 4.22070534698521, + "grad_norm": 0.7737325429916382, + "learning_rate": 0.00015699658703071672, + "loss": 1.4918, + "step": 3710 + }, + { + "epoch": 4.221843003412969, + "grad_norm": 0.7594680786132812, + "learning_rate": 0.00015676905574516496, + "loss": 2.4034, + "step": 3711 + }, + { + "epoch": 4.2229806598407285, + "grad_norm": 0.9010804295539856, + "learning_rate": 0.0001565415244596132, + "loss": 1.1525, + "step": 3712 + }, + { + "epoch": 4.224118316268487, + "grad_norm": 1.6272693872451782, + "learning_rate": 0.00015631399317406146, + "loss": 3.6835, + "step": 3713 + }, + { + "epoch": 4.225255972696246, + "grad_norm": 0.7173957228660583, + "learning_rate": 0.00015608646188850967, + "loss": 1.3503, + "step": 3714 + }, + { + "epoch": 4.226393629124004, + "grad_norm": 0.8401118516921997, + "learning_rate": 0.0001558589306029579, + "loss": 1.1993, + "step": 3715 + }, + { + "epoch": 4.2275312855517635, + "grad_norm": 0.8311299681663513, + "learning_rate": 0.00015563139931740614, + "loss": 1.3891, + "step": 3716 + }, + { + "epoch": 4.228668941979522, + "grad_norm": 2.7499144077301025, + "learning_rate": 0.00015540386803185438, + "loss": 4.453, + "step": 3717 + }, + { + "epoch": 4.229806598407281, + "grad_norm": 0.736031174659729, + "learning_rate": 0.00015517633674630264, + "loss": 1.815, + "step": 3718 + }, + { + "epoch": 4.23094425483504, + "grad_norm": 0.7462246417999268, + "learning_rate": 0.00015494880546075085, + "loss": 1.0605, + "step": 3719 + }, + { + "epoch": 4.2320819112627985, + "grad_norm": 1.0690699815750122, + "learning_rate": 0.0001547212741751991, + "loss": 1.1685, + "step": 3720 + }, + { + "epoch": 4.233219567690558, + "grad_norm": 0.781626284122467, + "learning_rate": 0.00015449374288964733, + "loss": 0.8943, + "step": 3721 + }, + { + "epoch": 4.234357224118316, + "grad_norm": 0.9446958899497986, + "learning_rate": 0.00015426621160409556, + "loss": 1.8685, + "step": 3722 + }, + { + "epoch": 4.235494880546075, + "grad_norm": 1.0105640888214111, + "learning_rate": 0.00015403868031854383, + "loss": 2.961, + "step": 3723 + }, + { + "epoch": 4.236632536973834, + "grad_norm": 0.6021206974983215, + "learning_rate": 0.00015381114903299204, + "loss": 1.5929, + "step": 3724 + }, + { + "epoch": 4.237770193401593, + "grad_norm": 0.7067481279373169, + "learning_rate": 0.00015358361774744027, + "loss": 0.7998, + "step": 3725 + }, + { + "epoch": 4.238907849829351, + "grad_norm": 0.968774676322937, + "learning_rate": 0.0001533560864618885, + "loss": 1.5134, + "step": 3726 + }, + { + "epoch": 4.24004550625711, + "grad_norm": 0.7576475739479065, + "learning_rate": 0.00015312855517633675, + "loss": 1.3597, + "step": 3727 + }, + { + "epoch": 4.2411831626848695, + "grad_norm": 0.8560457229614258, + "learning_rate": 0.00015290102389078499, + "loss": 1.7398, + "step": 3728 + }, + { + "epoch": 4.242320819112628, + "grad_norm": 1.2698020935058594, + "learning_rate": 0.00015267349260523322, + "loss": 1.4396, + "step": 3729 + }, + { + "epoch": 4.243458475540387, + "grad_norm": 0.9575531482696533, + "learning_rate": 0.00015244596131968146, + "loss": 1.3904, + "step": 3730 + }, + { + "epoch": 4.244596131968145, + "grad_norm": 1.038975715637207, + "learning_rate": 0.0001522184300341297, + "loss": 1.6743, + "step": 3731 + }, + { + "epoch": 4.2457337883959045, + "grad_norm": 1.3596773147583008, + "learning_rate": 0.00015199089874857793, + "loss": 3.709, + "step": 3732 + }, + { + "epoch": 4.246871444823663, + "grad_norm": 0.5811980962753296, + "learning_rate": 0.00015176336746302617, + "loss": 1.1902, + "step": 3733 + }, + { + "epoch": 4.248009101251422, + "grad_norm": 0.7279687523841858, + "learning_rate": 0.0001515358361774744, + "loss": 1.28, + "step": 3734 + }, + { + "epoch": 4.249146757679181, + "grad_norm": 0.8053296208381653, + "learning_rate": 0.00015130830489192264, + "loss": 1.5716, + "step": 3735 + }, + { + "epoch": 4.25028441410694, + "grad_norm": 1.2876355648040771, + "learning_rate": 0.00015108077360637088, + "loss": 2.0535, + "step": 3736 + }, + { + "epoch": 4.251422070534699, + "grad_norm": 1.4081701040267944, + "learning_rate": 0.00015085324232081912, + "loss": 2.8273, + "step": 3737 + }, + { + "epoch": 4.252559726962457, + "grad_norm": 1.2183709144592285, + "learning_rate": 0.00015062571103526736, + "loss": 2.7009, + "step": 3738 + }, + { + "epoch": 4.253697383390216, + "grad_norm": 1.22077476978302, + "learning_rate": 0.0001503981797497156, + "loss": 2.3349, + "step": 3739 + }, + { + "epoch": 4.254835039817975, + "grad_norm": 0.965623140335083, + "learning_rate": 0.00015017064846416383, + "loss": 2.194, + "step": 3740 + }, + { + "epoch": 4.255972696245734, + "grad_norm": 0.9789025187492371, + "learning_rate": 0.00014994311717861207, + "loss": 1.3343, + "step": 3741 + }, + { + "epoch": 4.257110352673493, + "grad_norm": 0.7638621926307678, + "learning_rate": 0.0001497155858930603, + "loss": 1.2936, + "step": 3742 + }, + { + "epoch": 4.258248009101251, + "grad_norm": 0.8102353811264038, + "learning_rate": 0.00014948805460750854, + "loss": 1.6045, + "step": 3743 + }, + { + "epoch": 4.2593856655290105, + "grad_norm": 1.2423808574676514, + "learning_rate": 0.00014926052332195678, + "loss": 2.631, + "step": 3744 + }, + { + "epoch": 4.260523321956769, + "grad_norm": 1.3112363815307617, + "learning_rate": 0.000149032992036405, + "loss": 2.9128, + "step": 3745 + }, + { + "epoch": 4.261660978384528, + "grad_norm": 1.219910740852356, + "learning_rate": 0.00014880546075085325, + "loss": 2.0505, + "step": 3746 + }, + { + "epoch": 4.262798634812286, + "grad_norm": 0.9391239881515503, + "learning_rate": 0.0001485779294653015, + "loss": 1.2113, + "step": 3747 + }, + { + "epoch": 4.263936291240046, + "grad_norm": 1.231005072593689, + "learning_rate": 0.00014835039817974973, + "loss": 1.5756, + "step": 3748 + }, + { + "epoch": 4.265073947667805, + "grad_norm": 1.1906815767288208, + "learning_rate": 0.00014812286689419796, + "loss": 2.0473, + "step": 3749 + }, + { + "epoch": 4.266211604095563, + "grad_norm": 0.9939584136009216, + "learning_rate": 0.00014789533560864617, + "loss": 1.4565, + "step": 3750 + }, + { + "epoch": 4.267349260523322, + "grad_norm": 0.7417742609977722, + "learning_rate": 0.00014766780432309444, + "loss": 1.7072, + "step": 3751 + }, + { + "epoch": 4.268486916951081, + "grad_norm": 0.9343417882919312, + "learning_rate": 0.00014744027303754267, + "loss": 1.6886, + "step": 3752 + }, + { + "epoch": 4.26962457337884, + "grad_norm": 1.7899564504623413, + "learning_rate": 0.0001472127417519909, + "loss": 3.9775, + "step": 3753 + }, + { + "epoch": 4.270762229806598, + "grad_norm": 0.759251058101654, + "learning_rate": 0.00014698521046643915, + "loss": 1.273, + "step": 3754 + }, + { + "epoch": 4.271899886234357, + "grad_norm": 1.0627682209014893, + "learning_rate": 0.00014675767918088736, + "loss": 1.5295, + "step": 3755 + }, + { + "epoch": 4.273037542662116, + "grad_norm": 0.836782693862915, + "learning_rate": 0.00014653014789533562, + "loss": 1.7053, + "step": 3756 + }, + { + "epoch": 4.274175199089875, + "grad_norm": 0.8767544031143188, + "learning_rate": 0.00014630261660978386, + "loss": 1.7385, + "step": 3757 + }, + { + "epoch": 4.275312855517634, + "grad_norm": 1.1457442045211792, + "learning_rate": 0.0001460750853242321, + "loss": 2.2233, + "step": 3758 + }, + { + "epoch": 4.276450511945392, + "grad_norm": 1.1655712127685547, + "learning_rate": 0.00014584755403868033, + "loss": 1.6773, + "step": 3759 + }, + { + "epoch": 4.277588168373152, + "grad_norm": 0.9990490674972534, + "learning_rate": 0.00014562002275312854, + "loss": 1.7472, + "step": 3760 + }, + { + "epoch": 4.27872582480091, + "grad_norm": 1.2598174810409546, + "learning_rate": 0.0001453924914675768, + "loss": 2.318, + "step": 3761 + }, + { + "epoch": 4.279863481228669, + "grad_norm": 1.3087960481643677, + "learning_rate": 0.00014516496018202502, + "loss": 2.8482, + "step": 3762 + }, + { + "epoch": 4.281001137656427, + "grad_norm": 1.1006321907043457, + "learning_rate": 0.00014493742889647328, + "loss": 2.0884, + "step": 3763 + }, + { + "epoch": 4.282138794084187, + "grad_norm": 0.648513674736023, + "learning_rate": 0.00014470989761092152, + "loss": 1.5729, + "step": 3764 + }, + { + "epoch": 4.283276450511945, + "grad_norm": 0.9822235107421875, + "learning_rate": 0.00014448236632536973, + "loss": 1.5298, + "step": 3765 + }, + { + "epoch": 4.284414106939704, + "grad_norm": 0.9084975123405457, + "learning_rate": 0.000144254835039818, + "loss": 2.1044, + "step": 3766 + }, + { + "epoch": 4.285551763367463, + "grad_norm": 1.9358890056610107, + "learning_rate": 0.0001440273037542662, + "loss": 3.062, + "step": 3767 + }, + { + "epoch": 4.286689419795222, + "grad_norm": 0.8079110980033875, + "learning_rate": 0.00014379977246871444, + "loss": 1.7288, + "step": 3768 + }, + { + "epoch": 4.287827076222981, + "grad_norm": 0.6145922541618347, + "learning_rate": 0.0001435722411831627, + "loss": 1.0093, + "step": 3769 + }, + { + "epoch": 4.288964732650739, + "grad_norm": 0.9286468625068665, + "learning_rate": 0.0001433447098976109, + "loss": 1.4367, + "step": 3770 + }, + { + "epoch": 4.290102389078498, + "grad_norm": 0.7882009148597717, + "learning_rate": 0.00014311717861205918, + "loss": 1.2809, + "step": 3771 + }, + { + "epoch": 4.291240045506257, + "grad_norm": 1.0285731554031372, + "learning_rate": 0.00014288964732650739, + "loss": 2.5163, + "step": 3772 + }, + { + "epoch": 4.292377701934016, + "grad_norm": 1.0870704650878906, + "learning_rate": 0.00014266211604095562, + "loss": 1.9514, + "step": 3773 + }, + { + "epoch": 4.293515358361775, + "grad_norm": 0.9731798768043518, + "learning_rate": 0.0001424345847554039, + "loss": 1.2031, + "step": 3774 + }, + { + "epoch": 4.294653014789533, + "grad_norm": 0.6875901222229004, + "learning_rate": 0.0001422070534698521, + "loss": 0.9267, + "step": 3775 + }, + { + "epoch": 4.295790671217293, + "grad_norm": 0.746791422367096, + "learning_rate": 0.00014197952218430036, + "loss": 1.5462, + "step": 3776 + }, + { + "epoch": 4.296928327645051, + "grad_norm": 0.7123384475708008, + "learning_rate": 0.00014175199089874857, + "loss": 1.3801, + "step": 3777 + }, + { + "epoch": 4.29806598407281, + "grad_norm": 0.554326593875885, + "learning_rate": 0.0001415244596131968, + "loss": 0.6866, + "step": 3778 + }, + { + "epoch": 4.2992036405005685, + "grad_norm": 0.9146575331687927, + "learning_rate": 0.00014129692832764505, + "loss": 1.5966, + "step": 3779 + }, + { + "epoch": 4.300341296928328, + "grad_norm": 0.6338298916816711, + "learning_rate": 0.00014106939704209328, + "loss": 1.0133, + "step": 3780 + }, + { + "epoch": 4.301478953356087, + "grad_norm": 0.6664391756057739, + "learning_rate": 0.00014084186575654155, + "loss": 1.1051, + "step": 3781 + }, + { + "epoch": 4.302616609783845, + "grad_norm": 0.8975256085395813, + "learning_rate": 0.00014061433447098976, + "loss": 1.273, + "step": 3782 + }, + { + "epoch": 4.303754266211604, + "grad_norm": 1.1492512226104736, + "learning_rate": 0.000140386803185438, + "loss": 2.3168, + "step": 3783 + }, + { + "epoch": 4.304891922639363, + "grad_norm": 1.3415923118591309, + "learning_rate": 0.00014015927189988623, + "loss": 2.4386, + "step": 3784 + }, + { + "epoch": 4.306029579067122, + "grad_norm": 1.2044072151184082, + "learning_rate": 0.00013993174061433447, + "loss": 3.6453, + "step": 3785 + }, + { + "epoch": 4.30716723549488, + "grad_norm": 0.7362768650054932, + "learning_rate": 0.00013970420932878273, + "loss": 1.5149, + "step": 3786 + }, + { + "epoch": 4.308304891922639, + "grad_norm": 0.8360229134559631, + "learning_rate": 0.00013947667804323094, + "loss": 2.0411, + "step": 3787 + }, + { + "epoch": 4.309442548350399, + "grad_norm": 0.8958834409713745, + "learning_rate": 0.00013924914675767918, + "loss": 1.5186, + "step": 3788 + }, + { + "epoch": 4.310580204778157, + "grad_norm": 0.7361008524894714, + "learning_rate": 0.00013902161547212741, + "loss": 2.1691, + "step": 3789 + }, + { + "epoch": 4.311717861205916, + "grad_norm": 0.9051206111907959, + "learning_rate": 0.00013879408418657565, + "loss": 1.7265, + "step": 3790 + }, + { + "epoch": 4.3128555176336745, + "grad_norm": 1.077871561050415, + "learning_rate": 0.00013856655290102392, + "loss": 3.1665, + "step": 3791 + }, + { + "epoch": 4.313993174061434, + "grad_norm": 0.9962257742881775, + "learning_rate": 0.00013833902161547213, + "loss": 0.8876, + "step": 3792 + }, + { + "epoch": 4.315130830489192, + "grad_norm": 0.7965003848075867, + "learning_rate": 0.00013811149032992036, + "loss": 0.8467, + "step": 3793 + }, + { + "epoch": 4.316268486916951, + "grad_norm": 1.0827739238739014, + "learning_rate": 0.0001378839590443686, + "loss": 2.0453, + "step": 3794 + }, + { + "epoch": 4.3174061433447095, + "grad_norm": 1.0761308670043945, + "learning_rate": 0.00013765642775881684, + "loss": 2.2351, + "step": 3795 + }, + { + "epoch": 4.318543799772469, + "grad_norm": 0.7478647232055664, + "learning_rate": 0.00013742889647326507, + "loss": 1.662, + "step": 3796 + }, + { + "epoch": 4.319681456200228, + "grad_norm": 1.300010085105896, + "learning_rate": 0.0001372013651877133, + "loss": 2.644, + "step": 3797 + }, + { + "epoch": 4.320819112627986, + "grad_norm": 1.0593494176864624, + "learning_rate": 0.00013697383390216155, + "loss": 2.2775, + "step": 3798 + }, + { + "epoch": 4.321956769055745, + "grad_norm": 1.0908184051513672, + "learning_rate": 0.00013674630261660978, + "loss": 1.6699, + "step": 3799 + }, + { + "epoch": 4.323094425483504, + "grad_norm": 1.0574840307235718, + "learning_rate": 0.00013651877133105802, + "loss": 2.6921, + "step": 3800 + }, + { + "epoch": 4.324232081911263, + "grad_norm": 0.6852555871009827, + "learning_rate": 0.00013629124004550626, + "loss": 1.0971, + "step": 3801 + }, + { + "epoch": 4.325369738339021, + "grad_norm": 1.1705669164657593, + "learning_rate": 0.0001360637087599545, + "loss": 2.0903, + "step": 3802 + }, + { + "epoch": 4.3265073947667805, + "grad_norm": 0.9083330035209656, + "learning_rate": 0.00013583617747440273, + "loss": 2.8104, + "step": 3803 + }, + { + "epoch": 4.327645051194539, + "grad_norm": 1.045404076576233, + "learning_rate": 0.00013560864618885097, + "loss": 1.8789, + "step": 3804 + }, + { + "epoch": 4.328782707622298, + "grad_norm": 1.4782066345214844, + "learning_rate": 0.0001353811149032992, + "loss": 3.033, + "step": 3805 + }, + { + "epoch": 4.329920364050057, + "grad_norm": 0.8685378432273865, + "learning_rate": 0.00013515358361774744, + "loss": 1.8752, + "step": 3806 + }, + { + "epoch": 4.3310580204778155, + "grad_norm": 0.6213506460189819, + "learning_rate": 0.00013492605233219568, + "loss": 1.4381, + "step": 3807 + }, + { + "epoch": 4.332195676905575, + "grad_norm": 1.271410346031189, + "learning_rate": 0.00013469852104664392, + "loss": 2.2921, + "step": 3808 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 1.084229826927185, + "learning_rate": 0.00013447098976109215, + "loss": 2.1447, + "step": 3809 + }, + { + "epoch": 4.334470989761092, + "grad_norm": 1.2566276788711548, + "learning_rate": 0.0001342434584755404, + "loss": 2.3514, + "step": 3810 + }, + { + "epoch": 4.335608646188851, + "grad_norm": 0.6356386542320251, + "learning_rate": 0.00013401592718998863, + "loss": 0.815, + "step": 3811 + }, + { + "epoch": 4.33674630261661, + "grad_norm": 0.9741693139076233, + "learning_rate": 0.00013378839590443687, + "loss": 2.4285, + "step": 3812 + }, + { + "epoch": 4.337883959044369, + "grad_norm": 0.9355404376983643, + "learning_rate": 0.00013356086461888508, + "loss": 2.015, + "step": 3813 + }, + { + "epoch": 4.339021615472127, + "grad_norm": 0.9928659796714783, + "learning_rate": 0.00013333333333333334, + "loss": 1.7225, + "step": 3814 + }, + { + "epoch": 4.3401592718998865, + "grad_norm": 0.7346398830413818, + "learning_rate": 0.00013310580204778158, + "loss": 0.8244, + "step": 3815 + }, + { + "epoch": 4.341296928327645, + "grad_norm": 0.8423483967781067, + "learning_rate": 0.00013287827076222981, + "loss": 1.5708, + "step": 3816 + }, + { + "epoch": 4.342434584755404, + "grad_norm": 1.3044344186782837, + "learning_rate": 0.00013265073947667805, + "loss": 2.0567, + "step": 3817 + }, + { + "epoch": 4.343572241183162, + "grad_norm": 0.7337809801101685, + "learning_rate": 0.00013242320819112626, + "loss": 0.9073, + "step": 3818 + }, + { + "epoch": 4.3447098976109215, + "grad_norm": 1.2432464361190796, + "learning_rate": 0.00013219567690557452, + "loss": 2.3843, + "step": 3819 + }, + { + "epoch": 4.345847554038681, + "grad_norm": 0.8972126245498657, + "learning_rate": 0.00013196814562002276, + "loss": 1.5984, + "step": 3820 + }, + { + "epoch": 4.346985210466439, + "grad_norm": 0.7023994326591492, + "learning_rate": 0.000131740614334471, + "loss": 1.7012, + "step": 3821 + }, + { + "epoch": 4.348122866894198, + "grad_norm": 0.857025146484375, + "learning_rate": 0.00013151308304891924, + "loss": 1.8499, + "step": 3822 + }, + { + "epoch": 4.349260523321957, + "grad_norm": 0.8133417963981628, + "learning_rate": 0.00013128555176336745, + "loss": 1.8394, + "step": 3823 + }, + { + "epoch": 4.350398179749716, + "grad_norm": 1.043709635734558, + "learning_rate": 0.0001310580204778157, + "loss": 1.6459, + "step": 3824 + }, + { + "epoch": 4.351535836177474, + "grad_norm": 0.9391498565673828, + "learning_rate": 0.00013083048919226395, + "loss": 1.4526, + "step": 3825 + }, + { + "epoch": 4.352673492605233, + "grad_norm": 1.0450690984725952, + "learning_rate": 0.00013060295790671218, + "loss": 1.5307, + "step": 3826 + }, + { + "epoch": 4.3538111490329925, + "grad_norm": 1.0160549879074097, + "learning_rate": 0.00013037542662116042, + "loss": 2.3228, + "step": 3827 + }, + { + "epoch": 4.354948805460751, + "grad_norm": 1.861527919769287, + "learning_rate": 0.00013014789533560863, + "loss": 2.6769, + "step": 3828 + }, + { + "epoch": 4.35608646188851, + "grad_norm": 0.8127307295799255, + "learning_rate": 0.0001299203640500569, + "loss": 1.3464, + "step": 3829 + }, + { + "epoch": 4.357224118316268, + "grad_norm": 0.8189122080802917, + "learning_rate": 0.0001296928327645051, + "loss": 1.2768, + "step": 3830 + }, + { + "epoch": 4.3583617747440275, + "grad_norm": 0.8693903088569641, + "learning_rate": 0.00012946530147895337, + "loss": 2.2173, + "step": 3831 + }, + { + "epoch": 4.359499431171786, + "grad_norm": 1.17019522190094, + "learning_rate": 0.0001292377701934016, + "loss": 2.2724, + "step": 3832 + }, + { + "epoch": 4.360637087599545, + "grad_norm": 0.7961968183517456, + "learning_rate": 0.00012901023890784982, + "loss": 0.8773, + "step": 3833 + }, + { + "epoch": 4.361774744027303, + "grad_norm": 0.9709134697914124, + "learning_rate": 0.00012878270762229808, + "loss": 1.8771, + "step": 3834 + }, + { + "epoch": 4.362912400455063, + "grad_norm": 1.0111744403839111, + "learning_rate": 0.0001285551763367463, + "loss": 1.7679, + "step": 3835 + }, + { + "epoch": 4.364050056882822, + "grad_norm": 1.0463035106658936, + "learning_rate": 0.00012832764505119455, + "loss": 1.8787, + "step": 3836 + }, + { + "epoch": 4.36518771331058, + "grad_norm": 0.9706122875213623, + "learning_rate": 0.0001281001137656428, + "loss": 1.8892, + "step": 3837 + }, + { + "epoch": 4.366325369738339, + "grad_norm": 1.487461805343628, + "learning_rate": 0.000127872582480091, + "loss": 3.0566, + "step": 3838 + }, + { + "epoch": 4.367463026166098, + "grad_norm": 0.578607976436615, + "learning_rate": 0.00012764505119453926, + "loss": 1.1034, + "step": 3839 + }, + { + "epoch": 4.368600682593857, + "grad_norm": 0.8049798607826233, + "learning_rate": 0.00012741751990898747, + "loss": 1.4093, + "step": 3840 + }, + { + "epoch": 4.369738339021615, + "grad_norm": 0.9491667747497559, + "learning_rate": 0.00012718998862343574, + "loss": 1.2235, + "step": 3841 + }, + { + "epoch": 4.370875995449374, + "grad_norm": 0.6280384063720703, + "learning_rate": 0.00012696245733788398, + "loss": 1.5624, + "step": 3842 + }, + { + "epoch": 4.372013651877133, + "grad_norm": 1.2607163190841675, + "learning_rate": 0.00012673492605233219, + "loss": 2.3174, + "step": 3843 + }, + { + "epoch": 4.373151308304892, + "grad_norm": 0.9251709580421448, + "learning_rate": 0.00012650739476678045, + "loss": 1.8042, + "step": 3844 + }, + { + "epoch": 4.374288964732651, + "grad_norm": 1.0482347011566162, + "learning_rate": 0.00012627986348122866, + "loss": 1.7866, + "step": 3845 + }, + { + "epoch": 4.375426621160409, + "grad_norm": 1.0927660465240479, + "learning_rate": 0.00012605233219567692, + "loss": 2.0026, + "step": 3846 + }, + { + "epoch": 4.376564277588169, + "grad_norm": 0.9109655618667603, + "learning_rate": 0.00012582480091012513, + "loss": 1.6597, + "step": 3847 + }, + { + "epoch": 4.377701934015927, + "grad_norm": 1.1354659795761108, + "learning_rate": 0.00012559726962457337, + "loss": 2.0034, + "step": 3848 + }, + { + "epoch": 4.378839590443686, + "grad_norm": 0.624963641166687, + "learning_rate": 0.00012536973833902163, + "loss": 1.1846, + "step": 3849 + }, + { + "epoch": 4.379977246871444, + "grad_norm": 0.7862791419029236, + "learning_rate": 0.00012514220705346984, + "loss": 1.0518, + "step": 3850 + }, + { + "epoch": 4.381114903299204, + "grad_norm": 1.0999188423156738, + "learning_rate": 0.0001249146757679181, + "loss": 3.3598, + "step": 3851 + }, + { + "epoch": 4.382252559726963, + "grad_norm": 1.0978202819824219, + "learning_rate": 0.00012468714448236632, + "loss": 1.4192, + "step": 3852 + }, + { + "epoch": 4.383390216154721, + "grad_norm": 0.8978061079978943, + "learning_rate": 0.00012445961319681456, + "loss": 1.5036, + "step": 3853 + }, + { + "epoch": 4.38452787258248, + "grad_norm": 0.7772718071937561, + "learning_rate": 0.0001242320819112628, + "loss": 1.9559, + "step": 3854 + }, + { + "epoch": 4.385665529010239, + "grad_norm": 0.6393569111824036, + "learning_rate": 0.00012400455062571103, + "loss": 1.2368, + "step": 3855 + }, + { + "epoch": 4.386803185437998, + "grad_norm": 1.3224518299102783, + "learning_rate": 0.0001237770193401593, + "loss": 2.4815, + "step": 3856 + }, + { + "epoch": 4.387940841865756, + "grad_norm": 0.8100547194480896, + "learning_rate": 0.0001235494880546075, + "loss": 2.0509, + "step": 3857 + }, + { + "epoch": 4.389078498293515, + "grad_norm": 1.1932857036590576, + "learning_rate": 0.00012332195676905574, + "loss": 2.3144, + "step": 3858 + }, + { + "epoch": 4.390216154721275, + "grad_norm": 1.004632592201233, + "learning_rate": 0.00012309442548350398, + "loss": 1.9511, + "step": 3859 + }, + { + "epoch": 4.391353811149033, + "grad_norm": 0.73143070936203, + "learning_rate": 0.00012286689419795221, + "loss": 1.5956, + "step": 3860 + }, + { + "epoch": 4.392491467576792, + "grad_norm": 0.8859034180641174, + "learning_rate": 0.00012263936291240045, + "loss": 1.6946, + "step": 3861 + }, + { + "epoch": 4.39362912400455, + "grad_norm": 1.6607998609542847, + "learning_rate": 0.0001224118316268487, + "loss": 3.6334, + "step": 3862 + }, + { + "epoch": 4.39476678043231, + "grad_norm": 0.5776720643043518, + "learning_rate": 0.00012218430034129693, + "loss": 0.8281, + "step": 3863 + }, + { + "epoch": 4.395904436860068, + "grad_norm": 0.8924266695976257, + "learning_rate": 0.00012195676905574516, + "loss": 1.7758, + "step": 3864 + }, + { + "epoch": 4.397042093287827, + "grad_norm": 0.5048301815986633, + "learning_rate": 0.0001217292377701934, + "loss": 0.5899, + "step": 3865 + }, + { + "epoch": 4.398179749715586, + "grad_norm": 0.9036781191825867, + "learning_rate": 0.00012150170648464164, + "loss": 1.7569, + "step": 3866 + }, + { + "epoch": 4.399317406143345, + "grad_norm": 1.0674494504928589, + "learning_rate": 0.00012127417519908989, + "loss": 2.3646, + "step": 3867 + }, + { + "epoch": 4.400455062571104, + "grad_norm": 1.6074708700180054, + "learning_rate": 0.00012104664391353812, + "loss": 2.0718, + "step": 3868 + }, + { + "epoch": 4.401592718998862, + "grad_norm": 1.289448618888855, + "learning_rate": 0.00012081911262798635, + "loss": 2.4961, + "step": 3869 + }, + { + "epoch": 4.402730375426621, + "grad_norm": 1.5797637701034546, + "learning_rate": 0.00012059158134243458, + "loss": 2.0542, + "step": 3870 + }, + { + "epoch": 4.40386803185438, + "grad_norm": 0.8559139370918274, + "learning_rate": 0.00012036405005688282, + "loss": 1.8854, + "step": 3871 + }, + { + "epoch": 4.405005688282139, + "grad_norm": 0.749015748500824, + "learning_rate": 0.00012013651877133106, + "loss": 1.3702, + "step": 3872 + }, + { + "epoch": 4.406143344709897, + "grad_norm": 0.95188969373703, + "learning_rate": 0.00011990898748577931, + "loss": 1.6222, + "step": 3873 + }, + { + "epoch": 4.407281001137656, + "grad_norm": 0.8665216565132141, + "learning_rate": 0.00011968145620022753, + "loss": 1.9693, + "step": 3874 + }, + { + "epoch": 4.408418657565416, + "grad_norm": 0.7548373937606812, + "learning_rate": 0.00011945392491467577, + "loss": 2.2095, + "step": 3875 + }, + { + "epoch": 4.409556313993174, + "grad_norm": 0.7718438506126404, + "learning_rate": 0.000119226393629124, + "loss": 1.0631, + "step": 3876 + }, + { + "epoch": 4.410693970420933, + "grad_norm": 0.7795398235321045, + "learning_rate": 0.00011899886234357224, + "loss": 1.5037, + "step": 3877 + }, + { + "epoch": 4.4118316268486915, + "grad_norm": 0.9634506702423096, + "learning_rate": 0.00011877133105802048, + "loss": 1.2737, + "step": 3878 + }, + { + "epoch": 4.412969283276451, + "grad_norm": 1.0782710313796997, + "learning_rate": 0.00011854379977246872, + "loss": 2.0432, + "step": 3879 + }, + { + "epoch": 4.414106939704209, + "grad_norm": 0.8432207107543945, + "learning_rate": 0.00011831626848691695, + "loss": 1.4583, + "step": 3880 + }, + { + "epoch": 4.415244596131968, + "grad_norm": 1.0028852224349976, + "learning_rate": 0.00011808873720136519, + "loss": 1.3358, + "step": 3881 + }, + { + "epoch": 4.4163822525597265, + "grad_norm": 0.9383002519607544, + "learning_rate": 0.00011786120591581343, + "loss": 1.9518, + "step": 3882 + }, + { + "epoch": 4.417519908987486, + "grad_norm": 0.7450307607650757, + "learning_rate": 0.00011763367463026167, + "loss": 0.7314, + "step": 3883 + }, + { + "epoch": 4.418657565415245, + "grad_norm": 0.8783142566680908, + "learning_rate": 0.0001174061433447099, + "loss": 1.8254, + "step": 3884 + }, + { + "epoch": 4.419795221843003, + "grad_norm": 0.7310676574707031, + "learning_rate": 0.00011717861205915814, + "loss": 1.7363, + "step": 3885 + }, + { + "epoch": 4.420932878270762, + "grad_norm": 0.4681646525859833, + "learning_rate": 0.00011695108077360638, + "loss": 1.0317, + "step": 3886 + }, + { + "epoch": 4.422070534698521, + "grad_norm": 0.9998228549957275, + "learning_rate": 0.00011672354948805461, + "loss": 1.5986, + "step": 3887 + }, + { + "epoch": 4.42320819112628, + "grad_norm": 0.6129392981529236, + "learning_rate": 0.00011649601820250284, + "loss": 0.952, + "step": 3888 + }, + { + "epoch": 4.424345847554038, + "grad_norm": 0.8074744343757629, + "learning_rate": 0.00011626848691695107, + "loss": 1.1051, + "step": 3889 + }, + { + "epoch": 4.4254835039817975, + "grad_norm": 0.9240376949310303, + "learning_rate": 0.00011604095563139932, + "loss": 1.1525, + "step": 3890 + }, + { + "epoch": 4.426621160409557, + "grad_norm": 0.9832790493965149, + "learning_rate": 0.00011581342434584756, + "loss": 1.538, + "step": 3891 + }, + { + "epoch": 4.427758816837315, + "grad_norm": 1.5793112516403198, + "learning_rate": 0.0001155858930602958, + "loss": 2.8963, + "step": 3892 + }, + { + "epoch": 4.428896473265074, + "grad_norm": 0.6335533261299133, + "learning_rate": 0.00011535836177474402, + "loss": 0.7007, + "step": 3893 + }, + { + "epoch": 4.4300341296928325, + "grad_norm": 0.8912391066551208, + "learning_rate": 0.00011513083048919226, + "loss": 1.6569, + "step": 3894 + }, + { + "epoch": 4.431171786120592, + "grad_norm": 1.0746057033538818, + "learning_rate": 0.0001149032992036405, + "loss": 1.9166, + "step": 3895 + }, + { + "epoch": 4.43230944254835, + "grad_norm": 0.8969407677650452, + "learning_rate": 0.00011467576791808875, + "loss": 1.823, + "step": 3896 + }, + { + "epoch": 4.433447098976109, + "grad_norm": 1.103092074394226, + "learning_rate": 0.00011444823663253698, + "loss": 2.2962, + "step": 3897 + }, + { + "epoch": 4.434584755403868, + "grad_norm": 0.9207330942153931, + "learning_rate": 0.0001142207053469852, + "loss": 1.9893, + "step": 3898 + }, + { + "epoch": 4.435722411831627, + "grad_norm": 0.7149428725242615, + "learning_rate": 0.00011399317406143344, + "loss": 1.9576, + "step": 3899 + }, + { + "epoch": 4.436860068259386, + "grad_norm": 1.1259676218032837, + "learning_rate": 0.00011376564277588168, + "loss": 1.7732, + "step": 3900 + }, + { + "epoch": 4.437997724687144, + "grad_norm": 1.0312747955322266, + "learning_rate": 0.00011353811149032993, + "loss": 2.2845, + "step": 3901 + }, + { + "epoch": 4.4391353811149035, + "grad_norm": 1.1275362968444824, + "learning_rate": 0.00011331058020477817, + "loss": 2.7354, + "step": 3902 + }, + { + "epoch": 4.440273037542662, + "grad_norm": 0.8789055943489075, + "learning_rate": 0.00011308304891922639, + "loss": 2.0587, + "step": 3903 + }, + { + "epoch": 4.441410693970421, + "grad_norm": 2.0354743003845215, + "learning_rate": 0.00011285551763367463, + "loss": 2.5204, + "step": 3904 + }, + { + "epoch": 4.44254835039818, + "grad_norm": 1.1655513048171997, + "learning_rate": 0.00011262798634812287, + "loss": 1.7625, + "step": 3905 + }, + { + "epoch": 4.4436860068259385, + "grad_norm": 0.7838327288627625, + "learning_rate": 0.0001124004550625711, + "loss": 1.7538, + "step": 3906 + }, + { + "epoch": 4.444823663253698, + "grad_norm": 1.042612075805664, + "learning_rate": 0.00011217292377701935, + "loss": 1.9573, + "step": 3907 + }, + { + "epoch": 4.445961319681456, + "grad_norm": 0.8811992406845093, + "learning_rate": 0.00011194539249146758, + "loss": 1.237, + "step": 3908 + }, + { + "epoch": 4.447098976109215, + "grad_norm": 0.8946145176887512, + "learning_rate": 0.00011171786120591581, + "loss": 1.7659, + "step": 3909 + }, + { + "epoch": 4.4482366325369735, + "grad_norm": 0.6377555727958679, + "learning_rate": 0.00011149032992036405, + "loss": 1.1502, + "step": 3910 + }, + { + "epoch": 4.449374288964733, + "grad_norm": 1.0547488927841187, + "learning_rate": 0.00011126279863481229, + "loss": 1.3714, + "step": 3911 + }, + { + "epoch": 4.450511945392491, + "grad_norm": 0.7710375785827637, + "learning_rate": 0.00011103526734926054, + "loss": 1.2559, + "step": 3912 + }, + { + "epoch": 4.45164960182025, + "grad_norm": 0.5346420407295227, + "learning_rate": 0.00011080773606370876, + "loss": 0.5416, + "step": 3913 + }, + { + "epoch": 4.4527872582480095, + "grad_norm": 0.8821654915809631, + "learning_rate": 0.000110580204778157, + "loss": 1.1061, + "step": 3914 + }, + { + "epoch": 4.453924914675768, + "grad_norm": 2.041219472885132, + "learning_rate": 0.00011035267349260524, + "loss": 3.7444, + "step": 3915 + }, + { + "epoch": 4.455062571103527, + "grad_norm": 0.9593986868858337, + "learning_rate": 0.00011012514220705347, + "loss": 1.6756, + "step": 3916 + }, + { + "epoch": 4.456200227531285, + "grad_norm": 0.8779603838920593, + "learning_rate": 0.00010989761092150171, + "loss": 2.0283, + "step": 3917 + }, + { + "epoch": 4.4573378839590445, + "grad_norm": 1.146952748298645, + "learning_rate": 0.00010967007963594995, + "loss": 2.2362, + "step": 3918 + }, + { + "epoch": 4.458475540386803, + "grad_norm": 0.8655639290809631, + "learning_rate": 0.00010944254835039818, + "loss": 1.9992, + "step": 3919 + }, + { + "epoch": 4.459613196814562, + "grad_norm": 1.1940038204193115, + "learning_rate": 0.00010921501706484642, + "loss": 1.7449, + "step": 3920 + }, + { + "epoch": 4.460750853242321, + "grad_norm": 0.7749353647232056, + "learning_rate": 0.00010898748577929466, + "loss": 1.1885, + "step": 3921 + }, + { + "epoch": 4.4618885096700796, + "grad_norm": 0.8721646666526794, + "learning_rate": 0.0001087599544937429, + "loss": 1.8104, + "step": 3922 + }, + { + "epoch": 4.463026166097839, + "grad_norm": 0.6543747782707214, + "learning_rate": 0.00010853242320819112, + "loss": 1.2385, + "step": 3923 + }, + { + "epoch": 4.464163822525597, + "grad_norm": 0.9697179198265076, + "learning_rate": 0.00010830489192263937, + "loss": 1.6976, + "step": 3924 + }, + { + "epoch": 4.465301478953356, + "grad_norm": 0.9224042892456055, + "learning_rate": 0.0001080773606370876, + "loss": 1.214, + "step": 3925 + }, + { + "epoch": 4.466439135381115, + "grad_norm": 0.9176324605941772, + "learning_rate": 0.00010784982935153584, + "loss": 1.2962, + "step": 3926 + }, + { + "epoch": 4.467576791808874, + "grad_norm": 0.9372009038925171, + "learning_rate": 0.00010762229806598408, + "loss": 1.9831, + "step": 3927 + }, + { + "epoch": 4.468714448236632, + "grad_norm": 1.1387556791305542, + "learning_rate": 0.0001073947667804323, + "loss": 1.9173, + "step": 3928 + }, + { + "epoch": 4.469852104664391, + "grad_norm": 1.1000502109527588, + "learning_rate": 0.00010716723549488055, + "loss": 1.9951, + "step": 3929 + }, + { + "epoch": 4.4709897610921505, + "grad_norm": 0.8819757699966431, + "learning_rate": 0.00010693970420932879, + "loss": 1.3953, + "step": 3930 + }, + { + "epoch": 4.472127417519909, + "grad_norm": 0.9281446933746338, + "learning_rate": 0.00010671217292377703, + "loss": 1.3608, + "step": 3931 + }, + { + "epoch": 4.473265073947668, + "grad_norm": 1.0403672456741333, + "learning_rate": 0.00010648464163822526, + "loss": 2.4958, + "step": 3932 + }, + { + "epoch": 4.474402730375426, + "grad_norm": 0.6432840824127197, + "learning_rate": 0.00010625711035267349, + "loss": 1.0788, + "step": 3933 + }, + { + "epoch": 4.4755403868031856, + "grad_norm": 0.8006333112716675, + "learning_rate": 0.00010602957906712172, + "loss": 1.5623, + "step": 3934 + }, + { + "epoch": 4.476678043230944, + "grad_norm": 0.8013043403625488, + "learning_rate": 0.00010580204778156998, + "loss": 1.5541, + "step": 3935 + }, + { + "epoch": 4.477815699658703, + "grad_norm": 1.071979284286499, + "learning_rate": 0.00010557451649601821, + "loss": 3.0613, + "step": 3936 + }, + { + "epoch": 4.478953356086462, + "grad_norm": 0.6907356381416321, + "learning_rate": 0.00010534698521046644, + "loss": 1.8117, + "step": 3937 + }, + { + "epoch": 4.480091012514221, + "grad_norm": 0.7573530077934265, + "learning_rate": 0.00010511945392491467, + "loss": 1.4187, + "step": 3938 + }, + { + "epoch": 4.48122866894198, + "grad_norm": 0.7063055038452148, + "learning_rate": 0.00010489192263936291, + "loss": 0.7842, + "step": 3939 + }, + { + "epoch": 4.482366325369738, + "grad_norm": 0.9287145733833313, + "learning_rate": 0.00010466439135381115, + "loss": 1.7274, + "step": 3940 + }, + { + "epoch": 4.483503981797497, + "grad_norm": 0.7556503415107727, + "learning_rate": 0.0001044368600682594, + "loss": 1.4371, + "step": 3941 + }, + { + "epoch": 4.484641638225256, + "grad_norm": 1.12169349193573, + "learning_rate": 0.00010420932878270762, + "loss": 1.4631, + "step": 3942 + }, + { + "epoch": 4.485779294653015, + "grad_norm": 1.066739797592163, + "learning_rate": 0.00010398179749715586, + "loss": 2.0575, + "step": 3943 + }, + { + "epoch": 4.486916951080774, + "grad_norm": 0.6979597210884094, + "learning_rate": 0.0001037542662116041, + "loss": 1.2034, + "step": 3944 + }, + { + "epoch": 4.488054607508532, + "grad_norm": 0.9167845249176025, + "learning_rate": 0.00010352673492605233, + "loss": 1.3631, + "step": 3945 + }, + { + "epoch": 4.489192263936292, + "grad_norm": 1.0211360454559326, + "learning_rate": 0.00010329920364050058, + "loss": 2.8476, + "step": 3946 + }, + { + "epoch": 4.49032992036405, + "grad_norm": 1.0867899656295776, + "learning_rate": 0.0001030716723549488, + "loss": 2.0269, + "step": 3947 + }, + { + "epoch": 4.491467576791809, + "grad_norm": 0.963070809841156, + "learning_rate": 0.00010284414106939704, + "loss": 1.9714, + "step": 3948 + }, + { + "epoch": 4.492605233219567, + "grad_norm": 1.0459845066070557, + "learning_rate": 0.00010261660978384528, + "loss": 1.8135, + "step": 3949 + }, + { + "epoch": 4.493742889647327, + "grad_norm": 1.2111842632293701, + "learning_rate": 0.00010238907849829352, + "loss": 2.337, + "step": 3950 + }, + { + "epoch": 4.494880546075085, + "grad_norm": 0.9683245420455933, + "learning_rate": 0.00010216154721274175, + "loss": 1.2889, + "step": 3951 + }, + { + "epoch": 4.496018202502844, + "grad_norm": 0.9261424541473389, + "learning_rate": 0.00010193401592718999, + "loss": 1.5083, + "step": 3952 + }, + { + "epoch": 4.497155858930603, + "grad_norm": 0.6647104620933533, + "learning_rate": 0.00010170648464163823, + "loss": 1.3438, + "step": 3953 + }, + { + "epoch": 4.498293515358362, + "grad_norm": 0.7070716023445129, + "learning_rate": 0.00010147895335608646, + "loss": 1.2005, + "step": 3954 + }, + { + "epoch": 4.499431171786121, + "grad_norm": 0.6603410840034485, + "learning_rate": 0.0001012514220705347, + "loss": 0.869, + "step": 3955 + }, + { + "epoch": 4.500568828213879, + "grad_norm": 1.0688494443893433, + "learning_rate": 0.00010102389078498294, + "loss": 1.8926, + "step": 3956 + }, + { + "epoch": 4.501706484641638, + "grad_norm": 0.7224915027618408, + "learning_rate": 0.00010079635949943116, + "loss": 1.9618, + "step": 3957 + }, + { + "epoch": 4.502844141069397, + "grad_norm": 1.0792325735092163, + "learning_rate": 0.00010056882821387941, + "loss": 1.9267, + "step": 3958 + }, + { + "epoch": 4.503981797497156, + "grad_norm": 0.7390668392181396, + "learning_rate": 0.00010034129692832765, + "loss": 1.4238, + "step": 3959 + }, + { + "epoch": 4.505119453924914, + "grad_norm": 1.3947380781173706, + "learning_rate": 0.00010011376564277589, + "loss": 2.3192, + "step": 3960 + }, + { + "epoch": 4.506257110352673, + "grad_norm": 0.9918806552886963, + "learning_rate": 9.988623435722412e-05, + "loss": 1.6336, + "step": 3961 + }, + { + "epoch": 4.507394766780433, + "grad_norm": 1.1548150777816772, + "learning_rate": 9.965870307167235e-05, + "loss": 1.2732, + "step": 3962 + }, + { + "epoch": 4.508532423208191, + "grad_norm": 1.110249400138855, + "learning_rate": 9.94311717861206e-05, + "loss": 1.8542, + "step": 3963 + }, + { + "epoch": 4.50967007963595, + "grad_norm": 0.9127480387687683, + "learning_rate": 9.920364050056883e-05, + "loss": 1.477, + "step": 3964 + }, + { + "epoch": 4.510807736063708, + "grad_norm": 1.161213755607605, + "learning_rate": 9.897610921501707e-05, + "loss": 1.8094, + "step": 3965 + }, + { + "epoch": 4.511945392491468, + "grad_norm": 1.106597661972046, + "learning_rate": 9.874857792946531e-05, + "loss": 1.9764, + "step": 3966 + }, + { + "epoch": 4.513083048919226, + "grad_norm": 0.7442317008972168, + "learning_rate": 9.852104664391353e-05, + "loss": 1.4526, + "step": 3967 + }, + { + "epoch": 4.514220705346985, + "grad_norm": 0.7829383015632629, + "learning_rate": 9.829351535836177e-05, + "loss": 2.0345, + "step": 3968 + }, + { + "epoch": 4.515358361774744, + "grad_norm": 1.1125015020370483, + "learning_rate": 9.806598407281002e-05, + "loss": 2.6904, + "step": 3969 + }, + { + "epoch": 4.516496018202503, + "grad_norm": 0.7292245626449585, + "learning_rate": 9.783845278725826e-05, + "loss": 1.323, + "step": 3970 + }, + { + "epoch": 4.517633674630262, + "grad_norm": 1.535749912261963, + "learning_rate": 9.761092150170649e-05, + "loss": 3.5423, + "step": 3971 + }, + { + "epoch": 4.51877133105802, + "grad_norm": 0.9439961314201355, + "learning_rate": 9.738339021615472e-05, + "loss": 3.0287, + "step": 3972 + }, + { + "epoch": 4.519908987485779, + "grad_norm": 0.8544148206710815, + "learning_rate": 9.715585893060295e-05, + "loss": 1.7422, + "step": 3973 + }, + { + "epoch": 4.521046643913538, + "grad_norm": 0.7354198694229126, + "learning_rate": 9.692832764505119e-05, + "loss": 1.4524, + "step": 3974 + }, + { + "epoch": 4.522184300341297, + "grad_norm": 0.7296625375747681, + "learning_rate": 9.670079635949944e-05, + "loss": 1.3034, + "step": 3975 + }, + { + "epoch": 4.523321956769056, + "grad_norm": 1.0089813470840454, + "learning_rate": 9.647326507394768e-05, + "loss": 1.5669, + "step": 3976 + }, + { + "epoch": 4.5244596131968144, + "grad_norm": 1.005476474761963, + "learning_rate": 9.62457337883959e-05, + "loss": 1.4951, + "step": 3977 + }, + { + "epoch": 4.525597269624574, + "grad_norm": 0.7817225456237793, + "learning_rate": 9.601820250284414e-05, + "loss": 1.6755, + "step": 3978 + }, + { + "epoch": 4.526734926052332, + "grad_norm": 0.814610481262207, + "learning_rate": 9.579067121729238e-05, + "loss": 1.0051, + "step": 3979 + }, + { + "epoch": 4.527872582480091, + "grad_norm": 0.869544267654419, + "learning_rate": 9.556313993174063e-05, + "loss": 1.5352, + "step": 3980 + }, + { + "epoch": 4.5290102389078495, + "grad_norm": 0.805802047252655, + "learning_rate": 9.533560864618886e-05, + "loss": 1.9839, + "step": 3981 + }, + { + "epoch": 4.530147895335609, + "grad_norm": 0.7213151454925537, + "learning_rate": 9.510807736063709e-05, + "loss": 2.1419, + "step": 3982 + }, + { + "epoch": 4.531285551763368, + "grad_norm": 0.743634045124054, + "learning_rate": 9.488054607508532e-05, + "loss": 1.2667, + "step": 3983 + }, + { + "epoch": 4.532423208191126, + "grad_norm": 0.8204901218414307, + "learning_rate": 9.465301478953356e-05, + "loss": 1.5102, + "step": 3984 + }, + { + "epoch": 4.533560864618885, + "grad_norm": 0.902324378490448, + "learning_rate": 9.44254835039818e-05, + "loss": 1.2913, + "step": 3985 + }, + { + "epoch": 4.534698521046644, + "grad_norm": 1.2690314054489136, + "learning_rate": 9.419795221843005e-05, + "loss": 3.7403, + "step": 3986 + }, + { + "epoch": 4.535836177474403, + "grad_norm": 0.6763675212860107, + "learning_rate": 9.397042093287827e-05, + "loss": 0.8402, + "step": 3987 + }, + { + "epoch": 4.536973833902161, + "grad_norm": 0.6812355518341064, + "learning_rate": 9.374288964732651e-05, + "loss": 0.8881, + "step": 3988 + }, + { + "epoch": 4.5381114903299204, + "grad_norm": 0.7160151600837708, + "learning_rate": 9.351535836177475e-05, + "loss": 1.3187, + "step": 3989 + }, + { + "epoch": 4.53924914675768, + "grad_norm": 0.6161930561065674, + "learning_rate": 9.328782707622298e-05, + "loss": 1.0509, + "step": 3990 + }, + { + "epoch": 4.540386803185438, + "grad_norm": 0.9480435848236084, + "learning_rate": 9.306029579067122e-05, + "loss": 1.898, + "step": 3991 + }, + { + "epoch": 4.541524459613197, + "grad_norm": 1.0816932916641235, + "learning_rate": 9.283276450511946e-05, + "loss": 2.1839, + "step": 3992 + }, + { + "epoch": 4.5426621160409555, + "grad_norm": 0.9521051049232483, + "learning_rate": 9.26052332195677e-05, + "loss": 1.5058, + "step": 3993 + }, + { + "epoch": 4.543799772468715, + "grad_norm": 0.5779545903205872, + "learning_rate": 9.237770193401593e-05, + "loss": 1.2215, + "step": 3994 + }, + { + "epoch": 4.544937428896473, + "grad_norm": 0.7004631161689758, + "learning_rate": 9.215017064846417e-05, + "loss": 0.8524, + "step": 3995 + }, + { + "epoch": 4.546075085324232, + "grad_norm": 1.5310842990875244, + "learning_rate": 9.192263936291239e-05, + "loss": 2.2935, + "step": 3996 + }, + { + "epoch": 4.5472127417519905, + "grad_norm": 1.0182299613952637, + "learning_rate": 9.169510807736064e-05, + "loss": 1.4897, + "step": 3997 + }, + { + "epoch": 4.54835039817975, + "grad_norm": 1.2902427911758423, + "learning_rate": 9.146757679180888e-05, + "loss": 2.0195, + "step": 3998 + }, + { + "epoch": 4.549488054607508, + "grad_norm": 0.9700065851211548, + "learning_rate": 9.124004550625712e-05, + "loss": 1.4082, + "step": 3999 + }, + { + "epoch": 4.550625711035267, + "grad_norm": 1.0845290422439575, + "learning_rate": 9.101251422070535e-05, + "loss": 2.3722, + "step": 4000 + }, + { + "epoch": 4.5517633674630265, + "grad_norm": 1.14189612865448, + "learning_rate": 9.078498293515358e-05, + "loss": 2.0645, + "step": 4001 + }, + { + "epoch": 4.552901023890785, + "grad_norm": 1.1628812551498413, + "learning_rate": 9.055745164960181e-05, + "loss": 2.7279, + "step": 4002 + }, + { + "epoch": 4.554038680318544, + "grad_norm": 0.8541833758354187, + "learning_rate": 9.032992036405006e-05, + "loss": 1.8064, + "step": 4003 + }, + { + "epoch": 4.555176336746302, + "grad_norm": 0.7190375924110413, + "learning_rate": 9.01023890784983e-05, + "loss": 1.7207, + "step": 4004 + }, + { + "epoch": 4.5563139931740615, + "grad_norm": 0.6545979976654053, + "learning_rate": 8.987485779294654e-05, + "loss": 1.4735, + "step": 4005 + }, + { + "epoch": 4.55745164960182, + "grad_norm": 1.12898588180542, + "learning_rate": 8.964732650739476e-05, + "loss": 1.8204, + "step": 4006 + }, + { + "epoch": 4.558589306029579, + "grad_norm": 1.265575885772705, + "learning_rate": 8.9419795221843e-05, + "loss": 2.1016, + "step": 4007 + }, + { + "epoch": 4.559726962457338, + "grad_norm": 1.3039684295654297, + "learning_rate": 8.919226393629125e-05, + "loss": 2.1831, + "step": 4008 + }, + { + "epoch": 4.5608646188850965, + "grad_norm": 0.6052323579788208, + "learning_rate": 8.896473265073949e-05, + "loss": 1.5035, + "step": 4009 + }, + { + "epoch": 4.562002275312856, + "grad_norm": 1.2771997451782227, + "learning_rate": 8.873720136518772e-05, + "loss": 2.3026, + "step": 4010 + }, + { + "epoch": 4.563139931740614, + "grad_norm": 1.577376365661621, + "learning_rate": 8.850967007963595e-05, + "loss": 2.7743, + "step": 4011 + }, + { + "epoch": 4.564277588168373, + "grad_norm": 0.78837651014328, + "learning_rate": 8.828213879408418e-05, + "loss": 1.4329, + "step": 4012 + }, + { + "epoch": 4.565415244596132, + "grad_norm": 1.392586350440979, + "learning_rate": 8.805460750853242e-05, + "loss": 1.9271, + "step": 4013 + }, + { + "epoch": 4.566552901023891, + "grad_norm": 0.6456694602966309, + "learning_rate": 8.782707622298067e-05, + "loss": 1.1636, + "step": 4014 + }, + { + "epoch": 4.56769055745165, + "grad_norm": 0.8244945406913757, + "learning_rate": 8.759954493742891e-05, + "loss": 1.3006, + "step": 4015 + }, + { + "epoch": 4.568828213879408, + "grad_norm": 1.0723069906234741, + "learning_rate": 8.737201365187713e-05, + "loss": 1.835, + "step": 4016 + }, + { + "epoch": 4.5699658703071675, + "grad_norm": 1.0027074813842773, + "learning_rate": 8.714448236632537e-05, + "loss": 1.806, + "step": 4017 + }, + { + "epoch": 4.571103526734926, + "grad_norm": 1.1087925434112549, + "learning_rate": 8.69169510807736e-05, + "loss": 1.7219, + "step": 4018 + }, + { + "epoch": 4.572241183162685, + "grad_norm": 0.5315782427787781, + "learning_rate": 8.668941979522184e-05, + "loss": 0.856, + "step": 4019 + }, + { + "epoch": 4.573378839590443, + "grad_norm": 0.8060416579246521, + "learning_rate": 8.646188850967009e-05, + "loss": 1.3456, + "step": 4020 + }, + { + "epoch": 4.5745164960182025, + "grad_norm": 0.8221978545188904, + "learning_rate": 8.623435722411832e-05, + "loss": 1.7365, + "step": 4021 + }, + { + "epoch": 4.575654152445962, + "grad_norm": 0.8591867685317993, + "learning_rate": 8.600682593856655e-05, + "loss": 1.5206, + "step": 4022 + }, + { + "epoch": 4.57679180887372, + "grad_norm": 0.694990336894989, + "learning_rate": 8.577929465301479e-05, + "loss": 0.6024, + "step": 4023 + }, + { + "epoch": 4.577929465301479, + "grad_norm": 1.0141761302947998, + "learning_rate": 8.555176336746303e-05, + "loss": 1.8285, + "step": 4024 + }, + { + "epoch": 4.579067121729238, + "grad_norm": 0.9870526194572449, + "learning_rate": 8.532423208191128e-05, + "loss": 2.191, + "step": 4025 + }, + { + "epoch": 4.580204778156997, + "grad_norm": 1.3243286609649658, + "learning_rate": 8.50967007963595e-05, + "loss": 2.1655, + "step": 4026 + }, + { + "epoch": 4.581342434584755, + "grad_norm": 0.9310430884361267, + "learning_rate": 8.486916951080774e-05, + "loss": 1.5505, + "step": 4027 + }, + { + "epoch": 4.582480091012514, + "grad_norm": 0.8326849937438965, + "learning_rate": 8.464163822525597e-05, + "loss": 1.2742, + "step": 4028 + }, + { + "epoch": 4.5836177474402735, + "grad_norm": 1.1547021865844727, + "learning_rate": 8.441410693970421e-05, + "loss": 1.895, + "step": 4029 + }, + { + "epoch": 4.584755403868032, + "grad_norm": 4.82136869430542, + "learning_rate": 8.418657565415245e-05, + "loss": 1.7264, + "step": 4030 + }, + { + "epoch": 4.585893060295791, + "grad_norm": 2.2376370429992676, + "learning_rate": 8.395904436860069e-05, + "loss": 3.246, + "step": 4031 + }, + { + "epoch": 4.587030716723549, + "grad_norm": 1.1774529218673706, + "learning_rate": 8.373151308304892e-05, + "loss": 2.3837, + "step": 4032 + }, + { + "epoch": 4.5881683731513085, + "grad_norm": 0.8420902490615845, + "learning_rate": 8.350398179749716e-05, + "loss": 1.535, + "step": 4033 + }, + { + "epoch": 4.589306029579067, + "grad_norm": 1.1322485208511353, + "learning_rate": 8.32764505119454e-05, + "loss": 3.0796, + "step": 4034 + }, + { + "epoch": 4.590443686006826, + "grad_norm": 0.7916383743286133, + "learning_rate": 8.304891922639362e-05, + "loss": 1.7278, + "step": 4035 + }, + { + "epoch": 4.591581342434584, + "grad_norm": 0.9946612119674683, + "learning_rate": 8.282138794084186e-05, + "loss": 1.3464, + "step": 4036 + }, + { + "epoch": 4.592718998862344, + "grad_norm": 0.9056809544563293, + "learning_rate": 8.259385665529011e-05, + "loss": 1.2729, + "step": 4037 + }, + { + "epoch": 4.593856655290102, + "grad_norm": 1.1132405996322632, + "learning_rate": 8.236632536973834e-05, + "loss": 1.6577, + "step": 4038 + }, + { + "epoch": 4.594994311717861, + "grad_norm": 1.0267456769943237, + "learning_rate": 8.213879408418658e-05, + "loss": 2.0504, + "step": 4039 + }, + { + "epoch": 4.59613196814562, + "grad_norm": 0.6499386429786682, + "learning_rate": 8.19112627986348e-05, + "loss": 0.6439, + "step": 4040 + }, + { + "epoch": 4.597269624573379, + "grad_norm": 0.9048442244529724, + "learning_rate": 8.168373151308304e-05, + "loss": 1.5015, + "step": 4041 + }, + { + "epoch": 4.598407281001138, + "grad_norm": 1.0397703647613525, + "learning_rate": 8.145620022753129e-05, + "loss": 1.5219, + "step": 4042 + }, + { + "epoch": 4.599544937428896, + "grad_norm": 0.7324240803718567, + "learning_rate": 8.122866894197953e-05, + "loss": 1.4209, + "step": 4043 + }, + { + "epoch": 4.600682593856655, + "grad_norm": 0.9085008502006531, + "learning_rate": 8.100113765642777e-05, + "loss": 1.7762, + "step": 4044 + }, + { + "epoch": 4.601820250284414, + "grad_norm": 1.1294033527374268, + "learning_rate": 8.077360637087599e-05, + "loss": 2.8995, + "step": 4045 + }, + { + "epoch": 4.602957906712173, + "grad_norm": 0.8178098201751709, + "learning_rate": 8.054607508532423e-05, + "loss": 2.0034, + "step": 4046 + }, + { + "epoch": 4.604095563139932, + "grad_norm": 1.7326245307922363, + "learning_rate": 8.031854379977246e-05, + "loss": 2.8204, + "step": 4047 + }, + { + "epoch": 4.60523321956769, + "grad_norm": 1.0977067947387695, + "learning_rate": 8.009101251422071e-05, + "loss": 1.5176, + "step": 4048 + }, + { + "epoch": 4.60637087599545, + "grad_norm": 0.7887221574783325, + "learning_rate": 7.986348122866895e-05, + "loss": 1.7636, + "step": 4049 + }, + { + "epoch": 4.607508532423208, + "grad_norm": 1.0273473262786865, + "learning_rate": 7.963594994311717e-05, + "loss": 1.5681, + "step": 4050 + }, + { + "epoch": 4.608646188850967, + "grad_norm": 0.9911322593688965, + "learning_rate": 7.940841865756541e-05, + "loss": 1.8449, + "step": 4051 + }, + { + "epoch": 4.609783845278725, + "grad_norm": 0.6486635804176331, + "learning_rate": 7.918088737201365e-05, + "loss": 1.4309, + "step": 4052 + }, + { + "epoch": 4.610921501706485, + "grad_norm": 1.0114645957946777, + "learning_rate": 7.895335608646189e-05, + "loss": 2.1855, + "step": 4053 + }, + { + "epoch": 4.612059158134244, + "grad_norm": 1.1461702585220337, + "learning_rate": 7.872582480091014e-05, + "loss": 2.1323, + "step": 4054 + }, + { + "epoch": 4.613196814562002, + "grad_norm": 0.8668122291564941, + "learning_rate": 7.849829351535836e-05, + "loss": 1.418, + "step": 4055 + }, + { + "epoch": 4.614334470989761, + "grad_norm": 0.5558659434318542, + "learning_rate": 7.82707622298066e-05, + "loss": 0.7871, + "step": 4056 + }, + { + "epoch": 4.61547212741752, + "grad_norm": 1.008002758026123, + "learning_rate": 7.804323094425483e-05, + "loss": 1.9257, + "step": 4057 + }, + { + "epoch": 4.616609783845279, + "grad_norm": 1.01564621925354, + "learning_rate": 7.781569965870307e-05, + "loss": 2.2797, + "step": 4058 + }, + { + "epoch": 4.617747440273037, + "grad_norm": 0.8626942038536072, + "learning_rate": 7.758816837315132e-05, + "loss": 1.7334, + "step": 4059 + }, + { + "epoch": 4.618885096700796, + "grad_norm": 0.9426528811454773, + "learning_rate": 7.736063708759954e-05, + "loss": 2.4934, + "step": 4060 + }, + { + "epoch": 4.620022753128556, + "grad_norm": 0.8705387711524963, + "learning_rate": 7.713310580204778e-05, + "loss": 1.5291, + "step": 4061 + }, + { + "epoch": 4.621160409556314, + "grad_norm": 0.6598314046859741, + "learning_rate": 7.690557451649602e-05, + "loss": 1.3372, + "step": 4062 + }, + { + "epoch": 4.622298065984073, + "grad_norm": 1.0638949871063232, + "learning_rate": 7.667804323094426e-05, + "loss": 2.271, + "step": 4063 + }, + { + "epoch": 4.623435722411831, + "grad_norm": 1.2114907503128052, + "learning_rate": 7.645051194539249e-05, + "loss": 1.9116, + "step": 4064 + }, + { + "epoch": 4.624573378839591, + "grad_norm": 0.896415650844574, + "learning_rate": 7.622298065984073e-05, + "loss": 1.5814, + "step": 4065 + }, + { + "epoch": 4.625711035267349, + "grad_norm": 0.9475829005241394, + "learning_rate": 7.599544937428897e-05, + "loss": 1.3713, + "step": 4066 + }, + { + "epoch": 4.626848691695108, + "grad_norm": 1.0576077699661255, + "learning_rate": 7.57679180887372e-05, + "loss": 1.3842, + "step": 4067 + }, + { + "epoch": 4.627986348122867, + "grad_norm": 1.035212755203247, + "learning_rate": 7.554038680318544e-05, + "loss": 1.5382, + "step": 4068 + }, + { + "epoch": 4.629124004550626, + "grad_norm": 1.0625888109207153, + "learning_rate": 7.531285551763368e-05, + "loss": 1.9357, + "step": 4069 + }, + { + "epoch": 4.630261660978385, + "grad_norm": 0.7255961894989014, + "learning_rate": 7.508532423208191e-05, + "loss": 1.8392, + "step": 4070 + }, + { + "epoch": 4.631399317406143, + "grad_norm": 0.6510151624679565, + "learning_rate": 7.485779294653015e-05, + "loss": 1.6608, + "step": 4071 + }, + { + "epoch": 4.632536973833902, + "grad_norm": 0.7432663440704346, + "learning_rate": 7.463026166097839e-05, + "loss": 1.4151, + "step": 4072 + }, + { + "epoch": 4.633674630261661, + "grad_norm": 1.0191010236740112, + "learning_rate": 7.440273037542663e-05, + "loss": 2.6943, + "step": 4073 + }, + { + "epoch": 4.63481228668942, + "grad_norm": 0.7630909085273743, + "learning_rate": 7.417519908987486e-05, + "loss": 1.6592, + "step": 4074 + }, + { + "epoch": 4.635949943117178, + "grad_norm": 0.8889945149421692, + "learning_rate": 7.394766780432309e-05, + "loss": 1.8455, + "step": 4075 + }, + { + "epoch": 4.637087599544937, + "grad_norm": 1.1400561332702637, + "learning_rate": 7.372013651877134e-05, + "loss": 2.2122, + "step": 4076 + }, + { + "epoch": 4.638225255972696, + "grad_norm": 0.7421183586120605, + "learning_rate": 7.349260523321957e-05, + "loss": 1.1382, + "step": 4077 + }, + { + "epoch": 4.639362912400455, + "grad_norm": 0.8412690162658691, + "learning_rate": 7.326507394766781e-05, + "loss": 1.8067, + "step": 4078 + }, + { + "epoch": 4.640500568828214, + "grad_norm": 0.6977559924125671, + "learning_rate": 7.303754266211605e-05, + "loss": 0.9484, + "step": 4079 + }, + { + "epoch": 4.6416382252559725, + "grad_norm": 1.04801607131958, + "learning_rate": 7.281001137656427e-05, + "loss": 3.1128, + "step": 4080 + }, + { + "epoch": 4.642775881683732, + "grad_norm": 0.740193247795105, + "learning_rate": 7.258248009101251e-05, + "loss": 1.061, + "step": 4081 + }, + { + "epoch": 4.64391353811149, + "grad_norm": 0.5949002504348755, + "learning_rate": 7.235494880546076e-05, + "loss": 0.9576, + "step": 4082 + }, + { + "epoch": 4.645051194539249, + "grad_norm": 0.8039756417274475, + "learning_rate": 7.2127417519909e-05, + "loss": 1.3286, + "step": 4083 + }, + { + "epoch": 4.6461888509670075, + "grad_norm": 1.1032826900482178, + "learning_rate": 7.189988623435722e-05, + "loss": 2.7052, + "step": 4084 + }, + { + "epoch": 4.647326507394767, + "grad_norm": 1.0323725938796997, + "learning_rate": 7.167235494880546e-05, + "loss": 1.7848, + "step": 4085 + }, + { + "epoch": 4.648464163822526, + "grad_norm": 0.5838832259178162, + "learning_rate": 7.144482366325369e-05, + "loss": 1.4307, + "step": 4086 + }, + { + "epoch": 4.649601820250284, + "grad_norm": 1.3185147047042847, + "learning_rate": 7.121729237770194e-05, + "loss": 1.7205, + "step": 4087 + }, + { + "epoch": 4.650739476678043, + "grad_norm": 0.5053433179855347, + "learning_rate": 7.098976109215018e-05, + "loss": 0.4623, + "step": 4088 + }, + { + "epoch": 4.651877133105802, + "grad_norm": 0.9480729699134827, + "learning_rate": 7.07622298065984e-05, + "loss": 1.9962, + "step": 4089 + }, + { + "epoch": 4.653014789533561, + "grad_norm": 0.9530803561210632, + "learning_rate": 7.053469852104664e-05, + "loss": 2.4195, + "step": 4090 + }, + { + "epoch": 4.654152445961319, + "grad_norm": 0.7480978965759277, + "learning_rate": 7.030716723549488e-05, + "loss": 0.7856, + "step": 4091 + }, + { + "epoch": 4.6552901023890785, + "grad_norm": 0.8531357049942017, + "learning_rate": 7.007963594994311e-05, + "loss": 1.9666, + "step": 4092 + }, + { + "epoch": 4.656427758816838, + "grad_norm": 0.8261699080467224, + "learning_rate": 6.985210466439137e-05, + "loss": 1.2341, + "step": 4093 + }, + { + "epoch": 4.657565415244596, + "grad_norm": 1.0157686471939087, + "learning_rate": 6.962457337883959e-05, + "loss": 1.1924, + "step": 4094 + }, + { + "epoch": 4.658703071672355, + "grad_norm": 0.8260225057601929, + "learning_rate": 6.939704209328783e-05, + "loss": 1.092, + "step": 4095 + }, + { + "epoch": 4.6598407281001135, + "grad_norm": 0.836290717124939, + "learning_rate": 6.916951080773606e-05, + "loss": 0.9278, + "step": 4096 + }, + { + "epoch": 4.660978384527873, + "grad_norm": 1.124277949333191, + "learning_rate": 6.89419795221843e-05, + "loss": 1.9577, + "step": 4097 + }, + { + "epoch": 4.662116040955631, + "grad_norm": 1.066027045249939, + "learning_rate": 6.871444823663254e-05, + "loss": 2.6585, + "step": 4098 + }, + { + "epoch": 4.66325369738339, + "grad_norm": 0.5382254123687744, + "learning_rate": 6.848691695108077e-05, + "loss": 1.025, + "step": 4099 + }, + { + "epoch": 4.664391353811149, + "grad_norm": 0.6736378073692322, + "learning_rate": 6.825938566552901e-05, + "loss": 1.2211, + "step": 4100 + }, + { + "epoch": 4.665529010238908, + "grad_norm": 1.0047065019607544, + "learning_rate": 6.803185437997725e-05, + "loss": 1.528, + "step": 4101 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 1.521615743637085, + "learning_rate": 6.780432309442548e-05, + "loss": 3.3803, + "step": 4102 + }, + { + "epoch": 4.667804323094425, + "grad_norm": 0.9080568552017212, + "learning_rate": 6.757679180887372e-05, + "loss": 1.778, + "step": 4103 + }, + { + "epoch": 4.6689419795221845, + "grad_norm": 0.7825086116790771, + "learning_rate": 6.734926052332196e-05, + "loss": 1.6855, + "step": 4104 + }, + { + "epoch": 4.670079635949943, + "grad_norm": 0.905820906162262, + "learning_rate": 6.71217292377702e-05, + "loss": 2.29, + "step": 4105 + }, + { + "epoch": 4.671217292377702, + "grad_norm": 0.7313637137413025, + "learning_rate": 6.689419795221843e-05, + "loss": 1.6036, + "step": 4106 + }, + { + "epoch": 4.672354948805461, + "grad_norm": 0.683791995048523, + "learning_rate": 6.666666666666667e-05, + "loss": 1.4341, + "step": 4107 + }, + { + "epoch": 4.6734926052332195, + "grad_norm": 1.0041923522949219, + "learning_rate": 6.643913538111491e-05, + "loss": 1.4274, + "step": 4108 + }, + { + "epoch": 4.674630261660979, + "grad_norm": 0.662774920463562, + "learning_rate": 6.621160409556313e-05, + "loss": 1.4739, + "step": 4109 + }, + { + "epoch": 4.675767918088737, + "grad_norm": 1.1887277364730835, + "learning_rate": 6.598407281001138e-05, + "loss": 1.7699, + "step": 4110 + }, + { + "epoch": 4.676905574516496, + "grad_norm": 1.0310107469558716, + "learning_rate": 6.575654152445962e-05, + "loss": 1.565, + "step": 4111 + }, + { + "epoch": 4.678043230944255, + "grad_norm": 0.9942086338996887, + "learning_rate": 6.552901023890785e-05, + "loss": 1.6227, + "step": 4112 + }, + { + "epoch": 4.679180887372014, + "grad_norm": 1.4443100690841675, + "learning_rate": 6.530147895335609e-05, + "loss": 3.6737, + "step": 4113 + }, + { + "epoch": 4.680318543799773, + "grad_norm": 0.9084301590919495, + "learning_rate": 6.507394766780432e-05, + "loss": 1.8005, + "step": 4114 + }, + { + "epoch": 4.681456200227531, + "grad_norm": 1.0044715404510498, + "learning_rate": 6.484641638225255e-05, + "loss": 2.3198, + "step": 4115 + }, + { + "epoch": 4.6825938566552905, + "grad_norm": 0.6797364950180054, + "learning_rate": 6.46188850967008e-05, + "loss": 1.0637, + "step": 4116 + }, + { + "epoch": 4.683731513083049, + "grad_norm": 1.2862342596054077, + "learning_rate": 6.439135381114904e-05, + "loss": 2.6791, + "step": 4117 + }, + { + "epoch": 4.684869169510808, + "grad_norm": 0.9734237790107727, + "learning_rate": 6.416382252559728e-05, + "loss": 1.7895, + "step": 4118 + }, + { + "epoch": 4.686006825938566, + "grad_norm": 1.044211506843567, + "learning_rate": 6.39362912400455e-05, + "loss": 3.2597, + "step": 4119 + }, + { + "epoch": 4.6871444823663255, + "grad_norm": 1.082377552986145, + "learning_rate": 6.370875995449374e-05, + "loss": 1.8831, + "step": 4120 + }, + { + "epoch": 4.688282138794084, + "grad_norm": 1.1628836393356323, + "learning_rate": 6.348122866894199e-05, + "loss": 2.3543, + "step": 4121 + }, + { + "epoch": 4.689419795221843, + "grad_norm": 0.9119383692741394, + "learning_rate": 6.325369738339022e-05, + "loss": 1.5563, + "step": 4122 + }, + { + "epoch": 4.690557451649601, + "grad_norm": 1.1674100160598755, + "learning_rate": 6.302616609783846e-05, + "loss": 1.7306, + "step": 4123 + }, + { + "epoch": 4.691695108077361, + "grad_norm": 0.7952550053596497, + "learning_rate": 6.279863481228669e-05, + "loss": 1.5032, + "step": 4124 + }, + { + "epoch": 4.69283276450512, + "grad_norm": 0.9031153917312622, + "learning_rate": 6.257110352673492e-05, + "loss": 2.8563, + "step": 4125 + }, + { + "epoch": 4.693970420932878, + "grad_norm": 1.1440151929855347, + "learning_rate": 6.234357224118316e-05, + "loss": 1.1994, + "step": 4126 + }, + { + "epoch": 4.695108077360637, + "grad_norm": 0.8565515279769897, + "learning_rate": 6.21160409556314e-05, + "loss": 0.904, + "step": 4127 + }, + { + "epoch": 4.696245733788396, + "grad_norm": 1.0120102167129517, + "learning_rate": 6.188850967007965e-05, + "loss": 1.6285, + "step": 4128 + }, + { + "epoch": 4.697383390216155, + "grad_norm": 0.95466148853302, + "learning_rate": 6.166097838452787e-05, + "loss": 1.4088, + "step": 4129 + }, + { + "epoch": 4.698521046643913, + "grad_norm": 0.9938770532608032, + "learning_rate": 6.143344709897611e-05, + "loss": 1.9033, + "step": 4130 + }, + { + "epoch": 4.699658703071672, + "grad_norm": 0.8543733954429626, + "learning_rate": 6.120591581342434e-05, + "loss": 2.0826, + "step": 4131 + }, + { + "epoch": 4.7007963594994315, + "grad_norm": 1.0578027963638306, + "learning_rate": 6.097838452787258e-05, + "loss": 1.5184, + "step": 4132 + }, + { + "epoch": 4.70193401592719, + "grad_norm": 1.0492652654647827, + "learning_rate": 6.075085324232082e-05, + "loss": 2.2705, + "step": 4133 + }, + { + "epoch": 4.703071672354949, + "grad_norm": 0.7921819686889648, + "learning_rate": 6.052332195676906e-05, + "loss": 1.2564, + "step": 4134 + }, + { + "epoch": 4.704209328782707, + "grad_norm": 0.9057871103286743, + "learning_rate": 6.029579067121729e-05, + "loss": 1.3608, + "step": 4135 + }, + { + "epoch": 4.705346985210467, + "grad_norm": 1.0830860137939453, + "learning_rate": 6.006825938566553e-05, + "loss": 2.1617, + "step": 4136 + }, + { + "epoch": 4.706484641638225, + "grad_norm": 0.9428957104682922, + "learning_rate": 5.9840728100113766e-05, + "loss": 2.4494, + "step": 4137 + }, + { + "epoch": 4.707622298065984, + "grad_norm": 0.7517849206924438, + "learning_rate": 5.9613196814562e-05, + "loss": 1.2648, + "step": 4138 + }, + { + "epoch": 4.708759954493743, + "grad_norm": 1.1423892974853516, + "learning_rate": 5.938566552901024e-05, + "loss": 1.8682, + "step": 4139 + }, + { + "epoch": 4.709897610921502, + "grad_norm": 0.8202763795852661, + "learning_rate": 5.915813424345848e-05, + "loss": 2.3286, + "step": 4140 + }, + { + "epoch": 4.711035267349261, + "grad_norm": 0.7963849306106567, + "learning_rate": 5.8930602957906714e-05, + "loss": 1.5263, + "step": 4141 + }, + { + "epoch": 4.712172923777019, + "grad_norm": 0.8499445915222168, + "learning_rate": 5.870307167235495e-05, + "loss": 1.4715, + "step": 4142 + }, + { + "epoch": 4.713310580204778, + "grad_norm": 0.9363263249397278, + "learning_rate": 5.847554038680319e-05, + "loss": 2.0218, + "step": 4143 + }, + { + "epoch": 4.714448236632537, + "grad_norm": 0.9191851615905762, + "learning_rate": 5.824800910125142e-05, + "loss": 2.695, + "step": 4144 + }, + { + "epoch": 4.715585893060296, + "grad_norm": 1.0178710222244263, + "learning_rate": 5.802047781569966e-05, + "loss": 2.0507, + "step": 4145 + }, + { + "epoch": 4.716723549488055, + "grad_norm": 0.9625295400619507, + "learning_rate": 5.77929465301479e-05, + "loss": 1.7354, + "step": 4146 + }, + { + "epoch": 4.717861205915813, + "grad_norm": 1.0893375873565674, + "learning_rate": 5.756541524459613e-05, + "loss": 2.4767, + "step": 4147 + }, + { + "epoch": 4.718998862343573, + "grad_norm": 0.6518070101737976, + "learning_rate": 5.733788395904437e-05, + "loss": 1.2152, + "step": 4148 + }, + { + "epoch": 4.720136518771331, + "grad_norm": 1.2176522016525269, + "learning_rate": 5.71103526734926e-05, + "loss": 1.7852, + "step": 4149 + }, + { + "epoch": 4.72127417519909, + "grad_norm": 1.0597165822982788, + "learning_rate": 5.688282138794084e-05, + "loss": 2.6517, + "step": 4150 + }, + { + "epoch": 4.722411831626848, + "grad_norm": 0.9489892721176147, + "learning_rate": 5.6655290102389084e-05, + "loss": 2.0657, + "step": 4151 + }, + { + "epoch": 4.723549488054608, + "grad_norm": 0.9729322195053101, + "learning_rate": 5.6427758816837314e-05, + "loss": 1.7927, + "step": 4152 + }, + { + "epoch": 4.724687144482367, + "grad_norm": 0.832109808921814, + "learning_rate": 5.620022753128555e-05, + "loss": 1.1887, + "step": 4153 + }, + { + "epoch": 4.725824800910125, + "grad_norm": 1.0068272352218628, + "learning_rate": 5.597269624573379e-05, + "loss": 1.3375, + "step": 4154 + }, + { + "epoch": 4.726962457337884, + "grad_norm": 0.7367716431617737, + "learning_rate": 5.5745164960182025e-05, + "loss": 1.3822, + "step": 4155 + }, + { + "epoch": 4.728100113765643, + "grad_norm": 1.1299673318862915, + "learning_rate": 5.551763367463027e-05, + "loss": 2.0459, + "step": 4156 + }, + { + "epoch": 4.729237770193402, + "grad_norm": 0.8980764150619507, + "learning_rate": 5.52901023890785e-05, + "loss": 1.6, + "step": 4157 + }, + { + "epoch": 4.73037542662116, + "grad_norm": 1.1862162351608276, + "learning_rate": 5.5062571103526736e-05, + "loss": 2.6499, + "step": 4158 + }, + { + "epoch": 4.731513083048919, + "grad_norm": 0.6605071425437927, + "learning_rate": 5.483503981797497e-05, + "loss": 0.782, + "step": 4159 + }, + { + "epoch": 4.732650739476678, + "grad_norm": 1.0915684700012207, + "learning_rate": 5.460750853242321e-05, + "loss": 2.1564, + "step": 4160 + }, + { + "epoch": 4.733788395904437, + "grad_norm": 0.7124598026275635, + "learning_rate": 5.437997724687145e-05, + "loss": 1.3858, + "step": 4161 + }, + { + "epoch": 4.734926052332195, + "grad_norm": 0.8076978325843811, + "learning_rate": 5.4152445961319684e-05, + "loss": 1.6923, + "step": 4162 + }, + { + "epoch": 4.736063708759954, + "grad_norm": 1.0069531202316284, + "learning_rate": 5.392491467576792e-05, + "loss": 2.5924, + "step": 4163 + }, + { + "epoch": 4.737201365187714, + "grad_norm": 0.7821694016456604, + "learning_rate": 5.369738339021615e-05, + "loss": 1.5226, + "step": 4164 + }, + { + "epoch": 4.738339021615472, + "grad_norm": 1.0969029664993286, + "learning_rate": 5.3469852104664395e-05, + "loss": 1.791, + "step": 4165 + }, + { + "epoch": 4.739476678043231, + "grad_norm": 0.6663936972618103, + "learning_rate": 5.324232081911263e-05, + "loss": 1.298, + "step": 4166 + }, + { + "epoch": 4.7406143344709895, + "grad_norm": 0.6106514930725098, + "learning_rate": 5.301478953356086e-05, + "loss": 1.0094, + "step": 4167 + }, + { + "epoch": 4.741751990898749, + "grad_norm": 0.8213837742805481, + "learning_rate": 5.2787258248009106e-05, + "loss": 1.5493, + "step": 4168 + }, + { + "epoch": 4.742889647326507, + "grad_norm": 1.1655770540237427, + "learning_rate": 5.2559726962457336e-05, + "loss": 2.0271, + "step": 4169 + }, + { + "epoch": 4.744027303754266, + "grad_norm": 0.9248787760734558, + "learning_rate": 5.233219567690557e-05, + "loss": 2.3724, + "step": 4170 + }, + { + "epoch": 4.745164960182025, + "grad_norm": 0.7159304022789001, + "learning_rate": 5.210466439135381e-05, + "loss": 1.3412, + "step": 4171 + }, + { + "epoch": 4.746302616609784, + "grad_norm": 0.773129403591156, + "learning_rate": 5.187713310580205e-05, + "loss": 2.2651, + "step": 4172 + }, + { + "epoch": 4.747440273037543, + "grad_norm": 1.6007509231567383, + "learning_rate": 5.164960182025029e-05, + "loss": 2.185, + "step": 4173 + }, + { + "epoch": 4.748577929465301, + "grad_norm": 0.9146437048912048, + "learning_rate": 5.142207053469852e-05, + "loss": 1.8578, + "step": 4174 + }, + { + "epoch": 4.74971558589306, + "grad_norm": 1.0452239513397217, + "learning_rate": 5.119453924914676e-05, + "loss": 2.7301, + "step": 4175 + }, + { + "epoch": 4.750853242320819, + "grad_norm": 1.1946684122085571, + "learning_rate": 5.0967007963594995e-05, + "loss": 2.3748, + "step": 4176 + }, + { + "epoch": 4.751990898748578, + "grad_norm": 1.1966066360473633, + "learning_rate": 5.073947667804323e-05, + "loss": 1.9953, + "step": 4177 + }, + { + "epoch": 4.753128555176337, + "grad_norm": 0.8742679357528687, + "learning_rate": 5.051194539249147e-05, + "loss": 1.7049, + "step": 4178 + }, + { + "epoch": 4.7542662116040955, + "grad_norm": 0.7945109605789185, + "learning_rate": 5.0284414106939706e-05, + "loss": 1.4822, + "step": 4179 + }, + { + "epoch": 4.755403868031855, + "grad_norm": 0.7654008269309998, + "learning_rate": 5.005688282138794e-05, + "loss": 1.0475, + "step": 4180 + }, + { + "epoch": 4.756541524459613, + "grad_norm": 1.0608311891555786, + "learning_rate": 4.982935153583617e-05, + "loss": 1.5947, + "step": 4181 + }, + { + "epoch": 4.757679180887372, + "grad_norm": 1.2881845235824585, + "learning_rate": 4.960182025028442e-05, + "loss": 3.6129, + "step": 4182 + }, + { + "epoch": 4.7588168373151305, + "grad_norm": 1.1011303663253784, + "learning_rate": 4.9374288964732654e-05, + "loss": 1.1276, + "step": 4183 + }, + { + "epoch": 4.75995449374289, + "grad_norm": 1.142633318901062, + "learning_rate": 4.9146757679180884e-05, + "loss": 2.4236, + "step": 4184 + }, + { + "epoch": 4.761092150170649, + "grad_norm": 0.940455436706543, + "learning_rate": 4.891922639362913e-05, + "loss": 2.3076, + "step": 4185 + }, + { + "epoch": 4.762229806598407, + "grad_norm": 0.8096868991851807, + "learning_rate": 4.869169510807736e-05, + "loss": 0.983, + "step": 4186 + }, + { + "epoch": 4.763367463026166, + "grad_norm": 1.020517349243164, + "learning_rate": 4.8464163822525595e-05, + "loss": 2.3447, + "step": 4187 + }, + { + "epoch": 4.764505119453925, + "grad_norm": 1.09331214427948, + "learning_rate": 4.823663253697384e-05, + "loss": 2.6737, + "step": 4188 + }, + { + "epoch": 4.765642775881684, + "grad_norm": 1.5871385335922241, + "learning_rate": 4.800910125142207e-05, + "loss": 3.2077, + "step": 4189 + }, + { + "epoch": 4.766780432309442, + "grad_norm": 0.6802913546562195, + "learning_rate": 4.778156996587031e-05, + "loss": 1.2683, + "step": 4190 + }, + { + "epoch": 4.7679180887372015, + "grad_norm": 0.8317682147026062, + "learning_rate": 4.755403868031854e-05, + "loss": 1.9439, + "step": 4191 + }, + { + "epoch": 4.769055745164961, + "grad_norm": 0.8209235668182373, + "learning_rate": 4.732650739476678e-05, + "loss": 1.3119, + "step": 4192 + }, + { + "epoch": 4.770193401592719, + "grad_norm": 1.44149649143219, + "learning_rate": 4.7098976109215024e-05, + "loss": 2.4981, + "step": 4193 + }, + { + "epoch": 4.771331058020478, + "grad_norm": 1.0103590488433838, + "learning_rate": 4.6871444823663254e-05, + "loss": 1.6604, + "step": 4194 + }, + { + "epoch": 4.7724687144482365, + "grad_norm": 1.4098118543624878, + "learning_rate": 4.664391353811149e-05, + "loss": 3.3837, + "step": 4195 + }, + { + "epoch": 4.773606370875996, + "grad_norm": 0.829317569732666, + "learning_rate": 4.641638225255973e-05, + "loss": 1.5868, + "step": 4196 + }, + { + "epoch": 4.774744027303754, + "grad_norm": 0.9138002991676331, + "learning_rate": 4.6188850967007965e-05, + "loss": 2.3734, + "step": 4197 + }, + { + "epoch": 4.775881683731513, + "grad_norm": 0.6845062971115112, + "learning_rate": 4.5961319681456195e-05, + "loss": 1.1712, + "step": 4198 + }, + { + "epoch": 4.7770193401592715, + "grad_norm": 0.592144787311554, + "learning_rate": 4.573378839590444e-05, + "loss": 1.0386, + "step": 4199 + }, + { + "epoch": 4.778156996587031, + "grad_norm": 1.318644404411316, + "learning_rate": 4.5506257110352676e-05, + "loss": 2.859, + "step": 4200 + }, + { + "epoch": 4.779294653014789, + "grad_norm": 1.0490177869796753, + "learning_rate": 4.5278725824800906e-05, + "loss": 2.2225, + "step": 4201 + }, + { + "epoch": 4.780432309442548, + "grad_norm": 0.9637972712516785, + "learning_rate": 4.505119453924915e-05, + "loss": 1.6452, + "step": 4202 + }, + { + "epoch": 4.7815699658703075, + "grad_norm": 1.3395143747329712, + "learning_rate": 4.482366325369738e-05, + "loss": 2.982, + "step": 4203 + }, + { + "epoch": 4.782707622298066, + "grad_norm": 0.9925627708435059, + "learning_rate": 4.4596131968145624e-05, + "loss": 2.2197, + "step": 4204 + }, + { + "epoch": 4.783845278725825, + "grad_norm": 0.715560257434845, + "learning_rate": 4.436860068259386e-05, + "loss": 1.6095, + "step": 4205 + }, + { + "epoch": 4.784982935153583, + "grad_norm": 0.7981997132301331, + "learning_rate": 4.414106939704209e-05, + "loss": 1.859, + "step": 4206 + }, + { + "epoch": 4.7861205915813425, + "grad_norm": 0.894432783126831, + "learning_rate": 4.3913538111490335e-05, + "loss": 1.4185, + "step": 4207 + }, + { + "epoch": 4.787258248009101, + "grad_norm": 1.0646681785583496, + "learning_rate": 4.3686006825938565e-05, + "loss": 2.6534, + "step": 4208 + }, + { + "epoch": 4.78839590443686, + "grad_norm": 1.0470125675201416, + "learning_rate": 4.34584755403868e-05, + "loss": 2.4396, + "step": 4209 + }, + { + "epoch": 4.789533560864619, + "grad_norm": 1.2576868534088135, + "learning_rate": 4.3230944254835046e-05, + "loss": 1.9555, + "step": 4210 + }, + { + "epoch": 4.7906712172923775, + "grad_norm": 0.7670148611068726, + "learning_rate": 4.3003412969283276e-05, + "loss": 1.1409, + "step": 4211 + }, + { + "epoch": 4.791808873720137, + "grad_norm": 0.9157593250274658, + "learning_rate": 4.277588168373151e-05, + "loss": 1.9202, + "step": 4212 + }, + { + "epoch": 4.792946530147895, + "grad_norm": 0.4483858048915863, + "learning_rate": 4.254835039817975e-05, + "loss": 0.7922, + "step": 4213 + }, + { + "epoch": 4.794084186575654, + "grad_norm": 1.031166911125183, + "learning_rate": 4.232081911262799e-05, + "loss": 2.3445, + "step": 4214 + }, + { + "epoch": 4.795221843003413, + "grad_norm": 1.042548418045044, + "learning_rate": 4.2093287827076224e-05, + "loss": 1.7547, + "step": 4215 + }, + { + "epoch": 4.796359499431172, + "grad_norm": 1.197379231452942, + "learning_rate": 4.186575654152446e-05, + "loss": 1.7015, + "step": 4216 + }, + { + "epoch": 4.797497155858931, + "grad_norm": 0.9254120588302612, + "learning_rate": 4.16382252559727e-05, + "loss": 1.1992, + "step": 4217 + }, + { + "epoch": 4.798634812286689, + "grad_norm": 0.6915335655212402, + "learning_rate": 4.141069397042093e-05, + "loss": 0.8966, + "step": 4218 + }, + { + "epoch": 4.7997724687144485, + "grad_norm": 1.2596569061279297, + "learning_rate": 4.118316268486917e-05, + "loss": 1.7151, + "step": 4219 + }, + { + "epoch": 4.800910125142207, + "grad_norm": 0.873573362827301, + "learning_rate": 4.09556313993174e-05, + "loss": 2.261, + "step": 4220 + }, + { + "epoch": 4.802047781569966, + "grad_norm": 0.6213791370391846, + "learning_rate": 4.0728100113765646e-05, + "loss": 1.1226, + "step": 4221 + }, + { + "epoch": 4.803185437997724, + "grad_norm": 0.9243321418762207, + "learning_rate": 4.050056882821388e-05, + "loss": 2.3551, + "step": 4222 + }, + { + "epoch": 4.8043230944254836, + "grad_norm": 1.7408545017242432, + "learning_rate": 4.0273037542662113e-05, + "loss": 3.208, + "step": 4223 + }, + { + "epoch": 4.805460750853243, + "grad_norm": 0.5592201352119446, + "learning_rate": 4.004550625711036e-05, + "loss": 0.7587, + "step": 4224 + }, + { + "epoch": 4.806598407281001, + "grad_norm": 0.7832121849060059, + "learning_rate": 3.981797497155859e-05, + "loss": 1.2192, + "step": 4225 + }, + { + "epoch": 4.80773606370876, + "grad_norm": 0.9754132628440857, + "learning_rate": 3.9590443686006824e-05, + "loss": 1.2067, + "step": 4226 + }, + { + "epoch": 4.808873720136519, + "grad_norm": 0.9849193096160889, + "learning_rate": 3.936291240045507e-05, + "loss": 2.3535, + "step": 4227 + }, + { + "epoch": 4.810011376564278, + "grad_norm": 0.9449824690818787, + "learning_rate": 3.91353811149033e-05, + "loss": 1.7503, + "step": 4228 + }, + { + "epoch": 4.811149032992036, + "grad_norm": 1.1584771871566772, + "learning_rate": 3.8907849829351535e-05, + "loss": 2.9749, + "step": 4229 + }, + { + "epoch": 4.812286689419795, + "grad_norm": 0.9928334355354309, + "learning_rate": 3.868031854379977e-05, + "loss": 2.3772, + "step": 4230 + }, + { + "epoch": 4.8134243458475545, + "grad_norm": 1.044175386428833, + "learning_rate": 3.845278725824801e-05, + "loss": 2.2084, + "step": 4231 + }, + { + "epoch": 4.814562002275313, + "grad_norm": 1.501725673675537, + "learning_rate": 3.8225255972696246e-05, + "loss": 3.0545, + "step": 4232 + }, + { + "epoch": 4.815699658703072, + "grad_norm": 0.6129480004310608, + "learning_rate": 3.799772468714448e-05, + "loss": 1.1711, + "step": 4233 + }, + { + "epoch": 4.81683731513083, + "grad_norm": 0.6920023560523987, + "learning_rate": 3.777019340159272e-05, + "loss": 1.006, + "step": 4234 + }, + { + "epoch": 4.81797497155859, + "grad_norm": 1.1781179904937744, + "learning_rate": 3.754266211604096e-05, + "loss": 2.6292, + "step": 4235 + }, + { + "epoch": 4.819112627986348, + "grad_norm": 0.9663881659507751, + "learning_rate": 3.7315130830489194e-05, + "loss": 1.689, + "step": 4236 + }, + { + "epoch": 4.820250284414107, + "grad_norm": 1.1518983840942383, + "learning_rate": 3.708759954493743e-05, + "loss": 2.804, + "step": 4237 + }, + { + "epoch": 4.821387940841865, + "grad_norm": 1.6885265111923218, + "learning_rate": 3.686006825938567e-05, + "loss": 2.9798, + "step": 4238 + }, + { + "epoch": 4.822525597269625, + "grad_norm": 1.2374640703201294, + "learning_rate": 3.6632536973833905e-05, + "loss": 1.4952, + "step": 4239 + }, + { + "epoch": 4.823663253697383, + "grad_norm": 0.8825571537017822, + "learning_rate": 3.6405005688282136e-05, + "loss": 1.5783, + "step": 4240 + }, + { + "epoch": 4.824800910125142, + "grad_norm": 1.4535282850265503, + "learning_rate": 3.617747440273038e-05, + "loss": 2.3041, + "step": 4241 + }, + { + "epoch": 4.825938566552901, + "grad_norm": 0.9175541400909424, + "learning_rate": 3.594994311717861e-05, + "loss": 1.5203, + "step": 4242 + }, + { + "epoch": 4.82707622298066, + "grad_norm": 0.984664797782898, + "learning_rate": 3.5722411831626847e-05, + "loss": 2.5147, + "step": 4243 + }, + { + "epoch": 4.828213879408419, + "grad_norm": 0.7770801782608032, + "learning_rate": 3.549488054607509e-05, + "loss": 2.0094, + "step": 4244 + }, + { + "epoch": 4.829351535836177, + "grad_norm": 0.9367466568946838, + "learning_rate": 3.526734926052332e-05, + "loss": 2.4352, + "step": 4245 + }, + { + "epoch": 4.830489192263936, + "grad_norm": 0.7870509028434753, + "learning_rate": 3.503981797497156e-05, + "loss": 2.0384, + "step": 4246 + }, + { + "epoch": 4.831626848691695, + "grad_norm": 0.8838803172111511, + "learning_rate": 3.4812286689419794e-05, + "loss": 1.4871, + "step": 4247 + }, + { + "epoch": 4.832764505119454, + "grad_norm": 0.8027423024177551, + "learning_rate": 3.458475540386803e-05, + "loss": 1.8367, + "step": 4248 + }, + { + "epoch": 4.833902161547213, + "grad_norm": 0.9481743574142456, + "learning_rate": 3.435722411831627e-05, + "loss": 0.9802, + "step": 4249 + }, + { + "epoch": 4.835039817974971, + "grad_norm": 0.6448602676391602, + "learning_rate": 3.4129692832764505e-05, + "loss": 1.3029, + "step": 4250 + }, + { + "epoch": 4.836177474402731, + "grad_norm": 0.8399932980537415, + "learning_rate": 3.390216154721274e-05, + "loss": 2.0227, + "step": 4251 + }, + { + "epoch": 4.837315130830489, + "grad_norm": 1.314458966255188, + "learning_rate": 3.367463026166098e-05, + "loss": 2.1751, + "step": 4252 + }, + { + "epoch": 4.838452787258248, + "grad_norm": 0.5397687554359436, + "learning_rate": 3.3447098976109216e-05, + "loss": 1.035, + "step": 4253 + }, + { + "epoch": 4.839590443686006, + "grad_norm": 0.7466420531272888, + "learning_rate": 3.3219567690557453e-05, + "loss": 1.8664, + "step": 4254 + }, + { + "epoch": 4.840728100113766, + "grad_norm": 1.525064468383789, + "learning_rate": 3.299203640500569e-05, + "loss": 1.8065, + "step": 4255 + }, + { + "epoch": 4.841865756541525, + "grad_norm": 1.5727421045303345, + "learning_rate": 3.276450511945393e-05, + "loss": 1.9101, + "step": 4256 + }, + { + "epoch": 4.843003412969283, + "grad_norm": 0.8878625631332397, + "learning_rate": 3.253697383390216e-05, + "loss": 2.3298, + "step": 4257 + }, + { + "epoch": 4.844141069397042, + "grad_norm": 0.9675851464271545, + "learning_rate": 3.23094425483504e-05, + "loss": 1.6869, + "step": 4258 + }, + { + "epoch": 4.845278725824801, + "grad_norm": 1.704845666885376, + "learning_rate": 3.208191126279864e-05, + "loss": 3.9697, + "step": 4259 + }, + { + "epoch": 4.84641638225256, + "grad_norm": 0.6775122284889221, + "learning_rate": 3.185437997724687e-05, + "loss": 1.1285, + "step": 4260 + }, + { + "epoch": 4.847554038680318, + "grad_norm": 0.7678422331809998, + "learning_rate": 3.162684869169511e-05, + "loss": 1.6568, + "step": 4261 + }, + { + "epoch": 4.848691695108077, + "grad_norm": 0.8857121467590332, + "learning_rate": 3.139931740614334e-05, + "loss": 1.3846, + "step": 4262 + }, + { + "epoch": 4.849829351535837, + "grad_norm": 0.8548673987388611, + "learning_rate": 3.117178612059158e-05, + "loss": 1.6368, + "step": 4263 + }, + { + "epoch": 4.850967007963595, + "grad_norm": 0.9651059508323669, + "learning_rate": 3.094425483503982e-05, + "loss": 2.2593, + "step": 4264 + }, + { + "epoch": 4.852104664391354, + "grad_norm": 0.593999445438385, + "learning_rate": 3.0716723549488054e-05, + "loss": 0.7888, + "step": 4265 + }, + { + "epoch": 4.853242320819112, + "grad_norm": 1.1292147636413574, + "learning_rate": 3.048919226393629e-05, + "loss": 1.7061, + "step": 4266 + }, + { + "epoch": 4.854379977246872, + "grad_norm": 1.285089135169983, + "learning_rate": 3.026166097838453e-05, + "loss": 2.3986, + "step": 4267 + }, + { + "epoch": 4.85551763367463, + "grad_norm": 0.9155679941177368, + "learning_rate": 3.0034129692832765e-05, + "loss": 1.9914, + "step": 4268 + }, + { + "epoch": 4.856655290102389, + "grad_norm": 1.3086957931518555, + "learning_rate": 2.9806598407281e-05, + "loss": 2.3898, + "step": 4269 + }, + { + "epoch": 4.857792946530148, + "grad_norm": 1.1196303367614746, + "learning_rate": 2.957906712172924e-05, + "loss": 3.3304, + "step": 4270 + }, + { + "epoch": 4.858930602957907, + "grad_norm": 1.0274990797042847, + "learning_rate": 2.9351535836177476e-05, + "loss": 1.8834, + "step": 4271 + }, + { + "epoch": 4.860068259385666, + "grad_norm": 0.9945041537284851, + "learning_rate": 2.912400455062571e-05, + "loss": 2.645, + "step": 4272 + }, + { + "epoch": 4.861205915813424, + "grad_norm": 0.88654625415802, + "learning_rate": 2.889647326507395e-05, + "loss": 1.7848, + "step": 4273 + }, + { + "epoch": 4.862343572241183, + "grad_norm": 0.6680393815040588, + "learning_rate": 2.8668941979522186e-05, + "loss": 1.1586, + "step": 4274 + }, + { + "epoch": 4.863481228668942, + "grad_norm": 1.0366647243499756, + "learning_rate": 2.844141069397042e-05, + "loss": 1.2493, + "step": 4275 + }, + { + "epoch": 4.864618885096701, + "grad_norm": 1.4107547998428345, + "learning_rate": 2.8213879408418657e-05, + "loss": 3.2325, + "step": 4276 + }, + { + "epoch": 4.865756541524459, + "grad_norm": 0.627693772315979, + "learning_rate": 2.7986348122866894e-05, + "loss": 0.5508, + "step": 4277 + }, + { + "epoch": 4.8668941979522184, + "grad_norm": 0.7929319739341736, + "learning_rate": 2.7758816837315134e-05, + "loss": 1.5375, + "step": 4278 + }, + { + "epoch": 4.868031854379977, + "grad_norm": 1.2130954265594482, + "learning_rate": 2.7531285551763368e-05, + "loss": 2.0214, + "step": 4279 + }, + { + "epoch": 4.869169510807736, + "grad_norm": 0.9208192229270935, + "learning_rate": 2.7303754266211605e-05, + "loss": 1.7519, + "step": 4280 + }, + { + "epoch": 4.870307167235495, + "grad_norm": 1.1531082391738892, + "learning_rate": 2.7076222980659842e-05, + "loss": 2.057, + "step": 4281 + }, + { + "epoch": 4.8714448236632535, + "grad_norm": 1.157972812652588, + "learning_rate": 2.6848691695108076e-05, + "loss": 2.9342, + "step": 4282 + }, + { + "epoch": 4.872582480091013, + "grad_norm": 0.9264168739318848, + "learning_rate": 2.6621160409556316e-05, + "loss": 1.2696, + "step": 4283 + }, + { + "epoch": 4.873720136518771, + "grad_norm": 1.2955060005187988, + "learning_rate": 2.6393629124004553e-05, + "loss": 3.8855, + "step": 4284 + }, + { + "epoch": 4.87485779294653, + "grad_norm": 0.8939307332038879, + "learning_rate": 2.6166097838452787e-05, + "loss": 1.0266, + "step": 4285 + }, + { + "epoch": 4.8759954493742885, + "grad_norm": 0.6583961844444275, + "learning_rate": 2.5938566552901024e-05, + "loss": 1.4206, + "step": 4286 + }, + { + "epoch": 4.877133105802048, + "grad_norm": 0.642296552658081, + "learning_rate": 2.571103526734926e-05, + "loss": 0.9931, + "step": 4287 + }, + { + "epoch": 4.878270762229807, + "grad_norm": 0.9714062809944153, + "learning_rate": 2.5483503981797498e-05, + "loss": 1.8689, + "step": 4288 + }, + { + "epoch": 4.879408418657565, + "grad_norm": 1.0246009826660156, + "learning_rate": 2.5255972696245735e-05, + "loss": 2.1948, + "step": 4289 + }, + { + "epoch": 4.8805460750853245, + "grad_norm": 0.6404163837432861, + "learning_rate": 2.502844141069397e-05, + "loss": 0.7135, + "step": 4290 + }, + { + "epoch": 4.881683731513083, + "grad_norm": 0.6843558549880981, + "learning_rate": 2.480091012514221e-05, + "loss": 1.1422, + "step": 4291 + }, + { + "epoch": 4.882821387940842, + "grad_norm": 0.5428027510643005, + "learning_rate": 2.4573378839590442e-05, + "loss": 1.0285, + "step": 4292 + }, + { + "epoch": 4.8839590443686, + "grad_norm": 1.4773718118667603, + "learning_rate": 2.434584755403868e-05, + "loss": 1.9624, + "step": 4293 + }, + { + "epoch": 4.8850967007963595, + "grad_norm": 1.062320590019226, + "learning_rate": 2.411831626848692e-05, + "loss": 2.0422, + "step": 4294 + }, + { + "epoch": 4.886234357224119, + "grad_norm": 0.9401637315750122, + "learning_rate": 2.3890784982935157e-05, + "loss": 2.0883, + "step": 4295 + }, + { + "epoch": 4.887372013651877, + "grad_norm": 0.9944307804107666, + "learning_rate": 2.366325369738339e-05, + "loss": 2.0369, + "step": 4296 + }, + { + "epoch": 4.888509670079636, + "grad_norm": 1.2259140014648438, + "learning_rate": 2.3435722411831627e-05, + "loss": 2.3347, + "step": 4297 + }, + { + "epoch": 4.8896473265073945, + "grad_norm": 0.7835159301757812, + "learning_rate": 2.3208191126279864e-05, + "loss": 1.4377, + "step": 4298 + }, + { + "epoch": 4.890784982935154, + "grad_norm": 1.1913021802902222, + "learning_rate": 2.2980659840728098e-05, + "loss": 1.6073, + "step": 4299 + }, + { + "epoch": 4.891922639362912, + "grad_norm": 0.95366370677948, + "learning_rate": 2.2753128555176338e-05, + "loss": 1.8692, + "step": 4300 + }, + { + "epoch": 4.893060295790671, + "grad_norm": 0.8128577470779419, + "learning_rate": 2.2525597269624575e-05, + "loss": 1.3592, + "step": 4301 + }, + { + "epoch": 4.8941979522184305, + "grad_norm": 1.3554686307907104, + "learning_rate": 2.2298065984072812e-05, + "loss": 2.6948, + "step": 4302 + }, + { + "epoch": 4.895335608646189, + "grad_norm": 0.8826941847801208, + "learning_rate": 2.2070534698521046e-05, + "loss": 1.4915, + "step": 4303 + }, + { + "epoch": 4.896473265073948, + "grad_norm": 1.1675195693969727, + "learning_rate": 2.1843003412969283e-05, + "loss": 1.9757, + "step": 4304 + }, + { + "epoch": 4.897610921501706, + "grad_norm": 0.7045602798461914, + "learning_rate": 2.1615472127417523e-05, + "loss": 1.5306, + "step": 4305 + }, + { + "epoch": 4.8987485779294655, + "grad_norm": 1.0397857427597046, + "learning_rate": 2.1387940841865757e-05, + "loss": 1.9285, + "step": 4306 + }, + { + "epoch": 4.899886234357224, + "grad_norm": 0.5089128613471985, + "learning_rate": 2.1160409556313994e-05, + "loss": 0.8025, + "step": 4307 + }, + { + "epoch": 4.901023890784983, + "grad_norm": 0.6715580821037292, + "learning_rate": 2.093287827076223e-05, + "loss": 1.3201, + "step": 4308 + }, + { + "epoch": 4.902161547212742, + "grad_norm": 1.0820637941360474, + "learning_rate": 2.0705346985210464e-05, + "loss": 1.5423, + "step": 4309 + }, + { + "epoch": 4.9032992036405005, + "grad_norm": 1.0669132471084595, + "learning_rate": 2.04778156996587e-05, + "loss": 1.2283, + "step": 4310 + }, + { + "epoch": 4.90443686006826, + "grad_norm": 0.9476402401924133, + "learning_rate": 2.025028441410694e-05, + "loss": 1.5601, + "step": 4311 + }, + { + "epoch": 4.905574516496018, + "grad_norm": 0.8250419497489929, + "learning_rate": 2.002275312855518e-05, + "loss": 1.2876, + "step": 4312 + }, + { + "epoch": 4.906712172923777, + "grad_norm": 0.8450213074684143, + "learning_rate": 1.9795221843003412e-05, + "loss": 1.3217, + "step": 4313 + }, + { + "epoch": 4.907849829351536, + "grad_norm": 0.6894076466560364, + "learning_rate": 1.956769055745165e-05, + "loss": 0.8804, + "step": 4314 + }, + { + "epoch": 4.908987485779295, + "grad_norm": 0.8378640413284302, + "learning_rate": 1.9340159271899886e-05, + "loss": 1.5368, + "step": 4315 + }, + { + "epoch": 4.910125142207053, + "grad_norm": 0.8414782285690308, + "learning_rate": 1.9112627986348123e-05, + "loss": 2.0629, + "step": 4316 + }, + { + "epoch": 4.911262798634812, + "grad_norm": 0.7455840706825256, + "learning_rate": 1.888509670079636e-05, + "loss": 2.0014, + "step": 4317 + }, + { + "epoch": 4.912400455062571, + "grad_norm": 0.9223942756652832, + "learning_rate": 1.8657565415244597e-05, + "loss": 2.2794, + "step": 4318 + }, + { + "epoch": 4.91353811149033, + "grad_norm": 0.8256419897079468, + "learning_rate": 1.8430034129692834e-05, + "loss": 0.9545, + "step": 4319 + }, + { + "epoch": 4.914675767918089, + "grad_norm": 0.9742276668548584, + "learning_rate": 1.8202502844141068e-05, + "loss": 1.5479, + "step": 4320 + }, + { + "epoch": 4.915813424345847, + "grad_norm": 0.6871644854545593, + "learning_rate": 1.7974971558589305e-05, + "loss": 1.025, + "step": 4321 + }, + { + "epoch": 4.9169510807736065, + "grad_norm": 0.43966150283813477, + "learning_rate": 1.7747440273037545e-05, + "loss": 0.4528, + "step": 4322 + }, + { + "epoch": 4.918088737201365, + "grad_norm": 0.7050820589065552, + "learning_rate": 1.751990898748578e-05, + "loss": 1.4931, + "step": 4323 + }, + { + "epoch": 4.919226393629124, + "grad_norm": 0.9527431130409241, + "learning_rate": 1.7292377701934016e-05, + "loss": 1.7784, + "step": 4324 + }, + { + "epoch": 4.920364050056882, + "grad_norm": 0.806577742099762, + "learning_rate": 1.7064846416382253e-05, + "loss": 1.7357, + "step": 4325 + }, + { + "epoch": 4.921501706484642, + "grad_norm": 0.8829598426818848, + "learning_rate": 1.683731513083049e-05, + "loss": 0.9894, + "step": 4326 + }, + { + "epoch": 4.922639362912401, + "grad_norm": 0.9835923910140991, + "learning_rate": 1.6609783845278727e-05, + "loss": 2.0375, + "step": 4327 + }, + { + "epoch": 4.923777019340159, + "grad_norm": 0.6324472427368164, + "learning_rate": 1.6382252559726964e-05, + "loss": 1.0425, + "step": 4328 + }, + { + "epoch": 4.924914675767918, + "grad_norm": 1.1519763469696045, + "learning_rate": 1.61547212741752e-05, + "loss": 1.2597, + "step": 4329 + }, + { + "epoch": 4.926052332195677, + "grad_norm": 0.9644741415977478, + "learning_rate": 1.5927189988623434e-05, + "loss": 1.327, + "step": 4330 + }, + { + "epoch": 4.927189988623436, + "grad_norm": 1.3414863348007202, + "learning_rate": 1.569965870307167e-05, + "loss": 1.7992, + "step": 4331 + }, + { + "epoch": 4.928327645051194, + "grad_norm": 1.1579735279083252, + "learning_rate": 1.547212741751991e-05, + "loss": 2.0279, + "step": 4332 + }, + { + "epoch": 4.929465301478953, + "grad_norm": 0.6621242761611938, + "learning_rate": 1.5244596131968145e-05, + "loss": 1.217, + "step": 4333 + }, + { + "epoch": 4.9306029579067125, + "grad_norm": 1.626521348953247, + "learning_rate": 1.5017064846416382e-05, + "loss": 1.5727, + "step": 4334 + }, + { + "epoch": 4.931740614334471, + "grad_norm": 1.1708447933197021, + "learning_rate": 1.478953356086462e-05, + "loss": 1.681, + "step": 4335 + }, + { + "epoch": 4.93287827076223, + "grad_norm": 0.654159426689148, + "learning_rate": 1.4562002275312855e-05, + "loss": 1.478, + "step": 4336 + }, + { + "epoch": 4.934015927189988, + "grad_norm": 0.9996544718742371, + "learning_rate": 1.4334470989761093e-05, + "loss": 2.0139, + "step": 4337 + }, + { + "epoch": 4.935153583617748, + "grad_norm": 0.6572261452674866, + "learning_rate": 1.4106939704209329e-05, + "loss": 1.5581, + "step": 4338 + }, + { + "epoch": 4.936291240045506, + "grad_norm": 0.7390673160552979, + "learning_rate": 1.3879408418657567e-05, + "loss": 1.218, + "step": 4339 + }, + { + "epoch": 4.937428896473265, + "grad_norm": 0.8499306440353394, + "learning_rate": 1.3651877133105803e-05, + "loss": 2.0041, + "step": 4340 + }, + { + "epoch": 4.938566552901024, + "grad_norm": 0.7346335053443909, + "learning_rate": 1.3424345847554038e-05, + "loss": 1.2743, + "step": 4341 + }, + { + "epoch": 4.939704209328783, + "grad_norm": 0.9406227469444275, + "learning_rate": 1.3196814562002277e-05, + "loss": 1.9248, + "step": 4342 + }, + { + "epoch": 4.940841865756542, + "grad_norm": 0.9315640330314636, + "learning_rate": 1.2969283276450512e-05, + "loss": 1.8989, + "step": 4343 + }, + { + "epoch": 4.9419795221843, + "grad_norm": 0.8979254961013794, + "learning_rate": 1.2741751990898749e-05, + "loss": 1.645, + "step": 4344 + }, + { + "epoch": 4.943117178612059, + "grad_norm": 1.1764620542526245, + "learning_rate": 1.2514220705346986e-05, + "loss": 2.31, + "step": 4345 + }, + { + "epoch": 4.944254835039818, + "grad_norm": 0.8805619478225708, + "learning_rate": 1.2286689419795221e-05, + "loss": 2.1559, + "step": 4346 + }, + { + "epoch": 4.945392491467577, + "grad_norm": 2.1326658725738525, + "learning_rate": 1.205915813424346e-05, + "loss": 4.3989, + "step": 4347 + }, + { + "epoch": 4.946530147895336, + "grad_norm": 0.6682014465332031, + "learning_rate": 1.1831626848691695e-05, + "loss": 1.1768, + "step": 4348 + }, + { + "epoch": 4.947667804323094, + "grad_norm": 0.696792721748352, + "learning_rate": 1.1604095563139932e-05, + "loss": 0.8609, + "step": 4349 + }, + { + "epoch": 4.948805460750854, + "grad_norm": 1.4207789897918701, + "learning_rate": 1.1376564277588169e-05, + "loss": 2.1457, + "step": 4350 + }, + { + "epoch": 4.949943117178612, + "grad_norm": 1.3908188343048096, + "learning_rate": 1.1149032992036406e-05, + "loss": 1.7755, + "step": 4351 + }, + { + "epoch": 4.951080773606371, + "grad_norm": 1.002634882926941, + "learning_rate": 1.0921501706484641e-05, + "loss": 2.3076, + "step": 4352 + }, + { + "epoch": 4.952218430034129, + "grad_norm": 1.1484334468841553, + "learning_rate": 1.0693970420932878e-05, + "loss": 2.1246, + "step": 4353 + }, + { + "epoch": 4.953356086461889, + "grad_norm": 1.0049216747283936, + "learning_rate": 1.0466439135381115e-05, + "loss": 2.0208, + "step": 4354 + }, + { + "epoch": 4.954493742889647, + "grad_norm": 0.7231261134147644, + "learning_rate": 1.023890784982935e-05, + "loss": 0.9476, + "step": 4355 + }, + { + "epoch": 4.955631399317406, + "grad_norm": 0.9415931105613708, + "learning_rate": 1.001137656427759e-05, + "loss": 1.935, + "step": 4356 + }, + { + "epoch": 4.9567690557451645, + "grad_norm": 1.4136885404586792, + "learning_rate": 9.783845278725825e-06, + "loss": 2.5718, + "step": 4357 + }, + { + "epoch": 4.957906712172924, + "grad_norm": 0.6967998147010803, + "learning_rate": 9.556313993174062e-06, + "loss": 0.6904, + "step": 4358 + }, + { + "epoch": 4.959044368600683, + "grad_norm": 0.5910859107971191, + "learning_rate": 9.328782707622299e-06, + "loss": 0.8882, + "step": 4359 + }, + { + "epoch": 4.960182025028441, + "grad_norm": 0.9837796688079834, + "learning_rate": 9.101251422070534e-06, + "loss": 0.7616, + "step": 4360 + }, + { + "epoch": 4.9613196814562, + "grad_norm": 0.983348548412323, + "learning_rate": 8.873720136518773e-06, + "loss": 1.538, + "step": 4361 + }, + { + "epoch": 4.962457337883959, + "grad_norm": 0.8036980628967285, + "learning_rate": 8.646188850967008e-06, + "loss": 1.886, + "step": 4362 + }, + { + "epoch": 4.963594994311718, + "grad_norm": 1.007638692855835, + "learning_rate": 8.418657565415245e-06, + "loss": 2.004, + "step": 4363 + }, + { + "epoch": 4.964732650739476, + "grad_norm": 0.9947067499160767, + "learning_rate": 8.191126279863482e-06, + "loss": 1.8577, + "step": 4364 + }, + { + "epoch": 4.965870307167235, + "grad_norm": 1.1273043155670166, + "learning_rate": 7.963594994311717e-06, + "loss": 1.8335, + "step": 4365 + }, + { + "epoch": 4.967007963594995, + "grad_norm": 1.045556664466858, + "learning_rate": 7.736063708759956e-06, + "loss": 1.9307, + "step": 4366 + }, + { + "epoch": 4.968145620022753, + "grad_norm": 1.058106541633606, + "learning_rate": 7.508532423208191e-06, + "loss": 1.9808, + "step": 4367 + }, + { + "epoch": 4.969283276450512, + "grad_norm": 0.9825536608695984, + "learning_rate": 7.281001137656427e-06, + "loss": 1.0482, + "step": 4368 + }, + { + "epoch": 4.9704209328782705, + "grad_norm": 1.0794082880020142, + "learning_rate": 7.053469852104664e-06, + "loss": 1.8367, + "step": 4369 + }, + { + "epoch": 4.97155858930603, + "grad_norm": 1.2737934589385986, + "learning_rate": 6.825938566552901e-06, + "loss": 2.6611, + "step": 4370 + }, + { + "epoch": 4.972696245733788, + "grad_norm": 1.13683021068573, + "learning_rate": 6.598407281001138e-06, + "loss": 1.9974, + "step": 4371 + }, + { + "epoch": 4.973833902161547, + "grad_norm": 1.0530338287353516, + "learning_rate": 6.370875995449374e-06, + "loss": 1.9576, + "step": 4372 + }, + { + "epoch": 4.974971558589306, + "grad_norm": 1.3305796384811401, + "learning_rate": 6.1433447098976105e-06, + "loss": 3.5862, + "step": 4373 + }, + { + "epoch": 4.976109215017065, + "grad_norm": 1.4939963817596436, + "learning_rate": 5.9158134243458475e-06, + "loss": 2.9359, + "step": 4374 + }, + { + "epoch": 4.977246871444824, + "grad_norm": 1.0199118852615356, + "learning_rate": 5.6882821387940845e-06, + "loss": 1.8068, + "step": 4375 + }, + { + "epoch": 4.978384527872582, + "grad_norm": 1.2236491441726685, + "learning_rate": 5.460750853242321e-06, + "loss": 2.3517, + "step": 4376 + }, + { + "epoch": 4.979522184300341, + "grad_norm": 0.7806452512741089, + "learning_rate": 5.233219567690558e-06, + "loss": 1.238, + "step": 4377 + }, + { + "epoch": 4.9806598407281, + "grad_norm": 1.1601351499557495, + "learning_rate": 5.005688282138795e-06, + "loss": 1.9707, + "step": 4378 + }, + { + "epoch": 4.981797497155859, + "grad_norm": 1.6477166414260864, + "learning_rate": 4.778156996587031e-06, + "loss": 3.1835, + "step": 4379 + }, + { + "epoch": 4.982935153583618, + "grad_norm": 0.8821132779121399, + "learning_rate": 4.550625711035267e-06, + "loss": 1.4507, + "step": 4380 + }, + { + "epoch": 4.9840728100113765, + "grad_norm": 0.922042965888977, + "learning_rate": 4.323094425483504e-06, + "loss": 1.9507, + "step": 4381 + }, + { + "epoch": 4.985210466439136, + "grad_norm": 0.9775927662849426, + "learning_rate": 4.095563139931741e-06, + "loss": 1.4578, + "step": 4382 + }, + { + "epoch": 4.986348122866894, + "grad_norm": 1.6098992824554443, + "learning_rate": 3.868031854379978e-06, + "loss": 2.923, + "step": 4383 + }, + { + "epoch": 4.987485779294653, + "grad_norm": 0.938714325428009, + "learning_rate": 3.6405005688282136e-06, + "loss": 1.6787, + "step": 4384 + }, + { + "epoch": 4.9886234357224115, + "grad_norm": 0.8386014103889465, + "learning_rate": 3.4129692832764506e-06, + "loss": 2.1241, + "step": 4385 + }, + { + "epoch": 4.989761092150171, + "grad_norm": 1.0702085494995117, + "learning_rate": 3.185437997724687e-06, + "loss": 2.7462, + "step": 4386 + }, + { + "epoch": 4.99089874857793, + "grad_norm": 0.7873029708862305, + "learning_rate": 2.9579067121729238e-06, + "loss": 1.3806, + "step": 4387 + }, + { + "epoch": 4.992036405005688, + "grad_norm": 0.7936583757400513, + "learning_rate": 2.7303754266211603e-06, + "loss": 1.4501, + "step": 4388 + }, + { + "epoch": 4.993174061433447, + "grad_norm": 1.116550087928772, + "learning_rate": 2.5028441410693973e-06, + "loss": 3.2445, + "step": 4389 + }, + { + "epoch": 4.994311717861206, + "grad_norm": 0.6965498328208923, + "learning_rate": 2.2753128555176335e-06, + "loss": 1.3281, + "step": 4390 + }, + { + "epoch": 4.995449374288965, + "grad_norm": 1.0656460523605347, + "learning_rate": 2.0477815699658705e-06, + "loss": 2.3715, + "step": 4391 + }, + { + "epoch": 4.996587030716723, + "grad_norm": 1.0514278411865234, + "learning_rate": 1.8202502844141068e-06, + "loss": 1.2846, + "step": 4392 + }, + { + "epoch": 4.9977246871444825, + "grad_norm": 1.341645359992981, + "learning_rate": 1.5927189988623436e-06, + "loss": 1.663, + "step": 4393 + }, + { + "epoch": 4.998862343572241, + "grad_norm": 0.9820802807807922, + "learning_rate": 1.3651877133105802e-06, + "loss": 2.5961, + "step": 4394 + }, + { + "epoch": 5.0, + "grad_norm": 0.8708580136299133, + "learning_rate": 1.1376564277588167e-06, + "loss": 1.7838, + "step": 4395 + }, + { + "epoch": 5.0, + "eval_f1": 0.8906, + "eval_gen_len": 49.5727, + "eval_loss": 1.8275777101516724, + "eval_precision": 0.8894, + "eval_recall": 0.8921, + "eval_rouge1": 0.4407, + "eval_rouge2": 0.1997, + "eval_rougeL": 0.3672, + "eval_rougeLsum": 0.4075, + "eval_runtime": 28.8614, + "eval_samples_per_second": 3.811, + "eval_steps_per_second": 0.485, + "step": 4395 + }, + { + "epoch": 5.0, + "step": 4395, + "total_flos": 4146070071306240.0, + "train_loss": 1.9703198982044563, + "train_runtime": 950.7032, + "train_samples_per_second": 4.623, + "train_steps_per_second": 4.623 + } + ], + "logging_steps": 1, + "max_steps": 4395, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4146070071306240.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}