{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 4395, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011376564277588168, "grad_norm": 0.9736748337745667, "learning_rate": 0.0009997724687144482, "loss": 2.1831, "step": 1 }, { "epoch": 0.0022753128555176336, "grad_norm": null, "learning_rate": 0.0009997724687144482, "loss": 3.1748, "step": 2 }, { "epoch": 0.0034129692832764505, "grad_norm": null, "learning_rate": 0.0009997724687144482, "loss": 4.6431, "step": 3 }, { "epoch": 0.004550625711035267, "grad_norm": 1.4586130380630493, "learning_rate": 0.0009995449374288964, "loss": 3.0014, "step": 4 }, { "epoch": 0.005688282138794084, "grad_norm": 1.1064860820770264, "learning_rate": 0.0009993174061433449, "loss": 3.0825, "step": 5 }, { "epoch": 0.006825938566552901, "grad_norm": 1.7668497562408447, "learning_rate": 0.000999089874857793, "loss": 4.1777, "step": 6 }, { "epoch": 0.007963594994311717, "grad_norm": 1.0811728239059448, "learning_rate": 0.000998862343572241, "loss": 2.9395, "step": 7 }, { "epoch": 0.009101251422070534, "grad_norm": 1.3404239416122437, "learning_rate": 0.0009986348122866895, "loss": 3.0641, "step": 8 }, { "epoch": 0.010238907849829351, "grad_norm": 1.7664979696273804, "learning_rate": 0.0009984072810011377, "loss": 4.9861, "step": 9 }, { "epoch": 0.011376564277588168, "grad_norm": 1.3834271430969238, "learning_rate": 0.000998179749715586, "loss": 3.3572, "step": 10 }, { "epoch": 0.012514220705346985, "grad_norm": 1.6286646127700806, "learning_rate": 0.0009979522184300341, "loss": 4.866, "step": 11 }, { "epoch": 0.013651877133105802, "grad_norm": 0.9623754620552063, "learning_rate": 0.0009977246871444823, "loss": 2.6571, "step": 12 }, { "epoch": 0.01478953356086462, "grad_norm": 1.4946273565292358, "learning_rate": 0.0009974971558589305, "loss": 3.2579, "step": 13 }, { "epoch": 0.015927189988623434, "grad_norm": 
1.5644868612289429, "learning_rate": 0.000997269624573379, "loss": 4.3634, "step": 14 }, { "epoch": 0.017064846416382253, "grad_norm": 1.1862064599990845, "learning_rate": 0.0009970420932878272, "loss": 2.5753, "step": 15 }, { "epoch": 0.01820250284414107, "grad_norm": 1.8198087215423584, "learning_rate": 0.0009968145620022754, "loss": 2.7812, "step": 16 }, { "epoch": 0.019340159271899887, "grad_norm": 1.6003659963607788, "learning_rate": 0.0009965870307167236, "loss": 3.8034, "step": 17 }, { "epoch": 0.020477815699658702, "grad_norm": 0.6375657916069031, "learning_rate": 0.0009963594994311718, "loss": 1.8391, "step": 18 }, { "epoch": 0.02161547212741752, "grad_norm": 1.5585576295852661, "learning_rate": 0.00099613196814562, "loss": 3.863, "step": 19 }, { "epoch": 0.022753128555176336, "grad_norm": 1.4096543788909912, "learning_rate": 0.0009959044368600682, "loss": 2.7089, "step": 20 }, { "epoch": 0.023890784982935155, "grad_norm": 1.2902367115020752, "learning_rate": 0.0009956769055745164, "loss": 3.508, "step": 21 }, { "epoch": 0.02502844141069397, "grad_norm": 1.1524360179901123, "learning_rate": 0.0009954493742889647, "loss": 3.5223, "step": 22 }, { "epoch": 0.026166097838452786, "grad_norm": 1.2636401653289795, "learning_rate": 0.000995221843003413, "loss": 2.9982, "step": 23 }, { "epoch": 0.027303754266211604, "grad_norm": 1.3827288150787354, "learning_rate": 0.0009949943117178613, "loss": 4.1671, "step": 24 }, { "epoch": 0.02844141069397042, "grad_norm": 1.1216269731521606, "learning_rate": 0.0009947667804323095, "loss": 2.0255, "step": 25 }, { "epoch": 0.02957906712172924, "grad_norm": 1.6343213319778442, "learning_rate": 0.0009945392491467577, "loss": 3.2769, "step": 26 }, { "epoch": 0.030716723549488054, "grad_norm": 1.988418698310852, "learning_rate": 0.000994311717861206, "loss": 4.1173, "step": 27 }, { "epoch": 0.03185437997724687, "grad_norm": 1.2403652667999268, "learning_rate": 0.0009940841865756541, "loss": 2.7539, "step": 28 }, { "epoch": 
0.03299203640500569, "grad_norm": 1.1057771444320679, "learning_rate": 0.0009938566552901023, "loss": 3.5235, "step": 29 }, { "epoch": 0.034129692832764506, "grad_norm": 1.1171739101409912, "learning_rate": 0.0009936291240045505, "loss": 2.8045, "step": 30 }, { "epoch": 0.03526734926052332, "grad_norm": 1.4165009260177612, "learning_rate": 0.000993401592718999, "loss": 4.1957, "step": 31 }, { "epoch": 0.03640500568828214, "grad_norm": 1.4927332401275635, "learning_rate": 0.0009931740614334472, "loss": 3.5043, "step": 32 }, { "epoch": 0.03754266211604096, "grad_norm": 1.4980137348175049, "learning_rate": 0.0009929465301478954, "loss": 4.4631, "step": 33 }, { "epoch": 0.038680318543799774, "grad_norm": 1.375657320022583, "learning_rate": 0.0009927189988623436, "loss": 3.3355, "step": 34 }, { "epoch": 0.03981797497155859, "grad_norm": 1.5445199012756348, "learning_rate": 0.0009924914675767918, "loss": 4.7957, "step": 35 }, { "epoch": 0.040955631399317405, "grad_norm": 1.644087314605713, "learning_rate": 0.00099226393629124, "loss": 3.2537, "step": 36 }, { "epoch": 0.04209328782707622, "grad_norm": 1.307708978652954, "learning_rate": 0.0009920364050056882, "loss": 3.6423, "step": 37 }, { "epoch": 0.04323094425483504, "grad_norm": 1.1098673343658447, "learning_rate": 0.0009918088737201364, "loss": 2.7579, "step": 38 }, { "epoch": 0.04436860068259386, "grad_norm": 1.436892032623291, "learning_rate": 0.0009915813424345847, "loss": 2.6732, "step": 39 }, { "epoch": 0.04550625711035267, "grad_norm": 0.9877327680587769, "learning_rate": 0.000991353811149033, "loss": 2.4214, "step": 40 }, { "epoch": 0.04664391353811149, "grad_norm": 1.6885147094726562, "learning_rate": 0.0009911262798634813, "loss": 3.0437, "step": 41 }, { "epoch": 0.04778156996587031, "grad_norm": 1.2590283155441284, "learning_rate": 0.0009908987485779295, "loss": 3.1304, "step": 42 }, { "epoch": 0.048919226393629126, "grad_norm": 1.3581849336624146, "learning_rate": 0.0009906712172923777, "loss": 3.5613, 
"step": 43 }, { "epoch": 0.05005688282138794, "grad_norm": 1.5315910577774048, "learning_rate": 0.000990443686006826, "loss": 5.662, "step": 44 }, { "epoch": 0.051194539249146756, "grad_norm": 0.9183916449546814, "learning_rate": 0.0009902161547212743, "loss": 2.6213, "step": 45 }, { "epoch": 0.05233219567690557, "grad_norm": 1.1212267875671387, "learning_rate": 0.0009899886234357223, "loss": 2.613, "step": 46 }, { "epoch": 0.053469852104664393, "grad_norm": 0.9141287803649902, "learning_rate": 0.0009897610921501705, "loss": 2.2044, "step": 47 }, { "epoch": 0.05460750853242321, "grad_norm": 1.1506439447402954, "learning_rate": 0.000989533560864619, "loss": 3.5604, "step": 48 }, { "epoch": 0.055745164960182024, "grad_norm": 0.9535030722618103, "learning_rate": 0.0009893060295790672, "loss": 2.5689, "step": 49 }, { "epoch": 0.05688282138794084, "grad_norm": 0.8492175936698914, "learning_rate": 0.0009890784982935154, "loss": 2.4686, "step": 50 }, { "epoch": 0.05802047781569966, "grad_norm": 1.176944613456726, "learning_rate": 0.0009888509670079636, "loss": 2.9579, "step": 51 }, { "epoch": 0.05915813424345848, "grad_norm": 1.0339919328689575, "learning_rate": 0.0009886234357224118, "loss": 2.6745, "step": 52 }, { "epoch": 0.06029579067121729, "grad_norm": 0.9942947626113892, "learning_rate": 0.00098839590443686, "loss": 1.9818, "step": 53 }, { "epoch": 0.06143344709897611, "grad_norm": 0.8406325578689575, "learning_rate": 0.0009881683731513085, "loss": 1.4172, "step": 54 }, { "epoch": 0.06257110352673492, "grad_norm": 1.1649565696716309, "learning_rate": 0.0009879408418657567, "loss": 3.7562, "step": 55 }, { "epoch": 0.06370875995449374, "grad_norm": 1.0086359977722168, "learning_rate": 0.0009877133105802047, "loss": 1.6131, "step": 56 }, { "epoch": 0.06484641638225255, "grad_norm": 1.4940587282180786, "learning_rate": 0.000987485779294653, "loss": 2.0438, "step": 57 }, { "epoch": 0.06598407281001138, "grad_norm": 1.365527868270874, "learning_rate": 
0.0009872582480091013, "loss": 3.1455, "step": 58 }, { "epoch": 0.0671217292377702, "grad_norm": 1.9875714778900146, "learning_rate": 0.0009870307167235495, "loss": 4.7546, "step": 59 }, { "epoch": 0.06825938566552901, "grad_norm": 1.2569215297698975, "learning_rate": 0.0009868031854379977, "loss": 2.9149, "step": 60 }, { "epoch": 0.06939704209328783, "grad_norm": 1.7797069549560547, "learning_rate": 0.000986575654152446, "loss": 4.0154, "step": 61 }, { "epoch": 0.07053469852104664, "grad_norm": 1.348248839378357, "learning_rate": 0.0009863481228668941, "loss": 4.3475, "step": 62 }, { "epoch": 0.07167235494880546, "grad_norm": 1.334631323814392, "learning_rate": 0.0009861205915813426, "loss": 4.094, "step": 63 }, { "epoch": 0.07281001137656427, "grad_norm": 1.1636334657669067, "learning_rate": 0.0009858930602957908, "loss": 3.4968, "step": 64 }, { "epoch": 0.07394766780432309, "grad_norm": 1.045125126838684, "learning_rate": 0.000985665529010239, "loss": 2.4535, "step": 65 }, { "epoch": 0.07508532423208192, "grad_norm": 0.9353369474411011, "learning_rate": 0.0009854379977246872, "loss": 2.0262, "step": 66 }, { "epoch": 0.07622298065984073, "grad_norm": 1.0836305618286133, "learning_rate": 0.0009852104664391354, "loss": 3.03, "step": 67 }, { "epoch": 0.07736063708759955, "grad_norm": 0.8448026776313782, "learning_rate": 0.0009849829351535836, "loss": 2.0041, "step": 68 }, { "epoch": 0.07849829351535836, "grad_norm": 1.4605052471160889, "learning_rate": 0.0009847554038680318, "loss": 3.9341, "step": 69 }, { "epoch": 0.07963594994311718, "grad_norm": 1.040484070777893, "learning_rate": 0.00098452787258248, "loss": 3.1491, "step": 70 }, { "epoch": 0.080773606370876, "grad_norm": 1.9364869594573975, "learning_rate": 0.0009843003412969285, "loss": 4.5523, "step": 71 }, { "epoch": 0.08191126279863481, "grad_norm": 1.1543667316436768, "learning_rate": 0.0009840728100113767, "loss": 3.0408, "step": 72 }, { "epoch": 0.08304891922639362, "grad_norm": 1.410697340965271, 
"learning_rate": 0.0009838452787258249, "loss": 3.5993, "step": 73 }, { "epoch": 0.08418657565415244, "grad_norm": 1.4086410999298096, "learning_rate": 0.000983617747440273, "loss": 3.7115, "step": 74 }, { "epoch": 0.08532423208191127, "grad_norm": 1.3084092140197754, "learning_rate": 0.0009833902161547213, "loss": 2.0049, "step": 75 }, { "epoch": 0.08646188850967008, "grad_norm": 1.598218321800232, "learning_rate": 0.0009831626848691695, "loss": 3.0724, "step": 76 }, { "epoch": 0.0875995449374289, "grad_norm": 0.9294804334640503, "learning_rate": 0.0009829351535836177, "loss": 2.953, "step": 77 }, { "epoch": 0.08873720136518772, "grad_norm": 0.882921576499939, "learning_rate": 0.000982707622298066, "loss": 1.8918, "step": 78 }, { "epoch": 0.08987485779294653, "grad_norm": 0.8168361186981201, "learning_rate": 0.0009824800910125141, "loss": 1.9626, "step": 79 }, { "epoch": 0.09101251422070535, "grad_norm": 1.6279023885726929, "learning_rate": 0.0009822525597269626, "loss": 4.2579, "step": 80 }, { "epoch": 0.09215017064846416, "grad_norm": 1.3332301378250122, "learning_rate": 0.0009820250284414108, "loss": 2.8298, "step": 81 }, { "epoch": 0.09328782707622298, "grad_norm": 1.2498598098754883, "learning_rate": 0.000981797497155859, "loss": 2.9971, "step": 82 }, { "epoch": 0.09442548350398179, "grad_norm": 0.8949465751647949, "learning_rate": 0.0009815699658703072, "loss": 1.7321, "step": 83 }, { "epoch": 0.09556313993174062, "grad_norm": 0.8322932124137878, "learning_rate": 0.0009813424345847554, "loss": 1.9678, "step": 84 }, { "epoch": 0.09670079635949944, "grad_norm": 1.2313084602355957, "learning_rate": 0.0009811149032992036, "loss": 2.7896, "step": 85 }, { "epoch": 0.09783845278725825, "grad_norm": 0.749366283416748, "learning_rate": 0.0009808873720136518, "loss": 1.7082, "step": 86 }, { "epoch": 0.09897610921501707, "grad_norm": 1.3737764358520508, "learning_rate": 0.0009806598407281, "loss": 4.0502, "step": 87 }, { "epoch": 0.10011376564277588, "grad_norm": 
1.0343174934387207, "learning_rate": 0.0009804323094425485, "loss": 4.1111, "step": 88 }, { "epoch": 0.1012514220705347, "grad_norm": 1.1936379671096802, "learning_rate": 0.0009802047781569967, "loss": 3.3507, "step": 89 }, { "epoch": 0.10238907849829351, "grad_norm": 1.1276384592056274, "learning_rate": 0.0009799772468714449, "loss": 2.6818, "step": 90 }, { "epoch": 0.10352673492605233, "grad_norm": 2.1285316944122314, "learning_rate": 0.000979749715585893, "loss": 5.7983, "step": 91 }, { "epoch": 0.10466439135381114, "grad_norm": 1.4918886423110962, "learning_rate": 0.0009795221843003413, "loss": 2.4772, "step": 92 }, { "epoch": 0.10580204778156997, "grad_norm": 1.229989767074585, "learning_rate": 0.0009792946530147895, "loss": 3.1748, "step": 93 }, { "epoch": 0.10693970420932879, "grad_norm": 1.2130271196365356, "learning_rate": 0.000979067121729238, "loss": 1.964, "step": 94 }, { "epoch": 0.1080773606370876, "grad_norm": 1.5376529693603516, "learning_rate": 0.000978839590443686, "loss": 3.2146, "step": 95 }, { "epoch": 0.10921501706484642, "grad_norm": 1.0450708866119385, "learning_rate": 0.0009786120591581341, "loss": 2.9779, "step": 96 }, { "epoch": 0.11035267349260523, "grad_norm": 0.8169401288032532, "learning_rate": 0.0009783845278725826, "loss": 2.9477, "step": 97 }, { "epoch": 0.11149032992036405, "grad_norm": 1.361669898033142, "learning_rate": 0.0009781569965870308, "loss": 2.842, "step": 98 }, { "epoch": 0.11262798634812286, "grad_norm": 0.8985223174095154, "learning_rate": 0.000977929465301479, "loss": 2.3365, "step": 99 }, { "epoch": 0.11376564277588168, "grad_norm": 0.9749724864959717, "learning_rate": 0.0009777019340159272, "loss": 1.5105, "step": 100 }, { "epoch": 0.1149032992036405, "grad_norm": 1.0484577417373657, "learning_rate": 0.0009774744027303754, "loss": 1.8364, "step": 101 }, { "epoch": 0.11604095563139932, "grad_norm": 0.9952252507209778, "learning_rate": 0.0009772468714448236, "loss": 2.8899, "step": 102 }, { "epoch": 
0.11717861205915814, "grad_norm": 1.3469035625457764, "learning_rate": 0.000977019340159272, "loss": 3.7179, "step": 103 }, { "epoch": 0.11831626848691695, "grad_norm": 0.9715613722801208, "learning_rate": 0.0009767918088737202, "loss": 1.8968, "step": 104 }, { "epoch": 0.11945392491467577, "grad_norm": 0.7201526165008545, "learning_rate": 0.0009765642775881682, "loss": 1.6332, "step": 105 }, { "epoch": 0.12059158134243458, "grad_norm": 0.9688479900360107, "learning_rate": 0.0009763367463026166, "loss": 2.8186, "step": 106 }, { "epoch": 0.1217292377701934, "grad_norm": 1.2450870275497437, "learning_rate": 0.0009761092150170649, "loss": 3.0069, "step": 107 }, { "epoch": 0.12286689419795221, "grad_norm": 1.3213499784469604, "learning_rate": 0.0009758816837315131, "loss": 3.8223, "step": 108 }, { "epoch": 0.12400455062571103, "grad_norm": 1.4514771699905396, "learning_rate": 0.0009756541524459613, "loss": 4.2365, "step": 109 }, { "epoch": 0.12514220705346984, "grad_norm": 0.8881714344024658, "learning_rate": 0.0009754266211604096, "loss": 2.1791, "step": 110 }, { "epoch": 0.12627986348122866, "grad_norm": 1.0673104524612427, "learning_rate": 0.0009751990898748578, "loss": 2.073, "step": 111 }, { "epoch": 0.12741751990898748, "grad_norm": 1.6061453819274902, "learning_rate": 0.000974971558589306, "loss": 3.0393, "step": 112 }, { "epoch": 0.1285551763367463, "grad_norm": 0.9326847791671753, "learning_rate": 0.0009747440273037544, "loss": 1.549, "step": 113 }, { "epoch": 0.1296928327645051, "grad_norm": 0.8536593914031982, "learning_rate": 0.0009745164960182025, "loss": 1.7561, "step": 114 }, { "epoch": 0.13083048919226395, "grad_norm": 1.1350816488265991, "learning_rate": 0.0009742889647326507, "loss": 4.0099, "step": 115 }, { "epoch": 0.13196814562002276, "grad_norm": 1.5794169902801514, "learning_rate": 0.000974061433447099, "loss": 4.4439, "step": 116 }, { "epoch": 0.13310580204778158, "grad_norm": 1.0470244884490967, "learning_rate": 0.0009738339021615472, "loss": 
2.459, "step": 117 }, { "epoch": 0.1342434584755404, "grad_norm": 0.9671889543533325, "learning_rate": 0.0009736063708759955, "loss": 2.7996, "step": 118 }, { "epoch": 0.1353811149032992, "grad_norm": 1.0588785409927368, "learning_rate": 0.0009733788395904437, "loss": 2.4111, "step": 119 }, { "epoch": 0.13651877133105803, "grad_norm": 1.3822710514068604, "learning_rate": 0.0009731513083048919, "loss": 3.7897, "step": 120 }, { "epoch": 0.13765642775881684, "grad_norm": 1.1157410144805908, "learning_rate": 0.0009729237770193402, "loss": 2.8744, "step": 121 }, { "epoch": 0.13879408418657566, "grad_norm": 1.1391868591308594, "learning_rate": 0.0009726962457337885, "loss": 3.3561, "step": 122 }, { "epoch": 0.13993174061433447, "grad_norm": 0.7412275671958923, "learning_rate": 0.0009724687144482367, "loss": 2.0783, "step": 123 }, { "epoch": 0.1410693970420933, "grad_norm": 1.821694254875183, "learning_rate": 0.0009722411831626849, "loss": 4.1648, "step": 124 }, { "epoch": 0.1422070534698521, "grad_norm": 0.9905460476875305, "learning_rate": 0.0009720136518771331, "loss": 2.3263, "step": 125 }, { "epoch": 0.14334470989761092, "grad_norm": null, "learning_rate": 0.0009720136518771331, "loss": 3.7223, "step": 126 }, { "epoch": 0.14448236632536973, "grad_norm": 0.9033792614936829, "learning_rate": 0.0009717861205915813, "loss": 2.0929, "step": 127 }, { "epoch": 0.14562002275312855, "grad_norm": 0.9408984184265137, "learning_rate": 0.0009715585893060296, "loss": 2.4161, "step": 128 }, { "epoch": 0.14675767918088736, "grad_norm": 0.9367227554321289, "learning_rate": 0.0009713310580204778, "loss": 2.1832, "step": 129 }, { "epoch": 0.14789533560864618, "grad_norm": 1.7745789289474487, "learning_rate": 0.000971103526734926, "loss": 3.0164, "step": 130 }, { "epoch": 0.149032992036405, "grad_norm": 1.1147022247314453, "learning_rate": 0.0009708759954493744, "loss": 2.4758, "step": 131 }, { "epoch": 0.15017064846416384, "grad_norm": 1.2068356275558472, "learning_rate": 
0.0009706484641638226, "loss": 2.1346, "step": 132 }, { "epoch": 0.15130830489192265, "grad_norm": 1.1658344268798828, "learning_rate": 0.0009704209328782708, "loss": 2.3267, "step": 133 }, { "epoch": 0.15244596131968147, "grad_norm": 1.0015108585357666, "learning_rate": 0.0009701934015927191, "loss": 2.6728, "step": 134 }, { "epoch": 0.15358361774744028, "grad_norm": 0.8626338243484497, "learning_rate": 0.0009699658703071672, "loss": 2.2007, "step": 135 }, { "epoch": 0.1547212741751991, "grad_norm": 1.3096460103988647, "learning_rate": 0.0009697383390216154, "loss": 2.7757, "step": 136 }, { "epoch": 0.1558589306029579, "grad_norm": 0.5792064070701599, "learning_rate": 0.0009695108077360637, "loss": 1.4733, "step": 137 }, { "epoch": 0.15699658703071673, "grad_norm": 1.103218913078308, "learning_rate": 0.0009692832764505119, "loss": 3.3553, "step": 138 }, { "epoch": 0.15813424345847554, "grad_norm": 1.1538619995117188, "learning_rate": 0.0009690557451649603, "loss": 2.6128, "step": 139 }, { "epoch": 0.15927189988623436, "grad_norm": 0.8585355877876282, "learning_rate": 0.0009688282138794085, "loss": 1.995, "step": 140 }, { "epoch": 0.16040955631399317, "grad_norm": 0.9618576765060425, "learning_rate": 0.0009686006825938567, "loss": 2.7825, "step": 141 }, { "epoch": 0.161547212741752, "grad_norm": 1.1649035215377808, "learning_rate": 0.000968373151308305, "loss": 3.4463, "step": 142 }, { "epoch": 0.1626848691695108, "grad_norm": 0.7993972301483154, "learning_rate": 0.0009681456200227532, "loss": 1.343, "step": 143 }, { "epoch": 0.16382252559726962, "grad_norm": 1.05124032497406, "learning_rate": 0.0009679180887372013, "loss": 3.3284, "step": 144 }, { "epoch": 0.16496018202502843, "grad_norm": 1.1311984062194824, "learning_rate": 0.0009676905574516496, "loss": 2.7092, "step": 145 }, { "epoch": 0.16609783845278725, "grad_norm": 1.1213219165802002, "learning_rate": 0.0009674630261660978, "loss": 2.905, "step": 146 }, { "epoch": 0.16723549488054607, "grad_norm": 
1.0415153503417969, "learning_rate": 0.000967235494880546, "loss": 2.5957, "step": 147 }, { "epoch": 0.16837315130830488, "grad_norm": 1.1598129272460938, "learning_rate": 0.0009670079635949944, "loss": 2.9467, "step": 148 }, { "epoch": 0.1695108077360637, "grad_norm": 1.0593596696853638, "learning_rate": 0.0009667804323094426, "loss": 3.2002, "step": 149 }, { "epoch": 0.17064846416382254, "grad_norm": 1.003199815750122, "learning_rate": 0.0009665529010238908, "loss": 2.107, "step": 150 }, { "epoch": 0.17178612059158135, "grad_norm": 1.135500431060791, "learning_rate": 0.0009663253697383391, "loss": 2.8572, "step": 151 }, { "epoch": 0.17292377701934017, "grad_norm": 3.164407730102539, "learning_rate": 0.0009660978384527873, "loss": 2.5331, "step": 152 }, { "epoch": 0.17406143344709898, "grad_norm": 1.1366970539093018, "learning_rate": 0.0009658703071672355, "loss": 2.8843, "step": 153 }, { "epoch": 0.1751990898748578, "grad_norm": 1.3237171173095703, "learning_rate": 0.0009656427758816837, "loss": 3.0479, "step": 154 }, { "epoch": 0.17633674630261661, "grad_norm": 0.7948922514915466, "learning_rate": 0.0009654152445961319, "loss": 2.3427, "step": 155 }, { "epoch": 0.17747440273037543, "grad_norm": 1.0152616500854492, "learning_rate": 0.0009651877133105801, "loss": 2.1661, "step": 156 }, { "epoch": 0.17861205915813425, "grad_norm": 0.6189094185829163, "learning_rate": 0.0009649601820250285, "loss": 1.7452, "step": 157 }, { "epoch": 0.17974971558589306, "grad_norm": 1.024207353591919, "learning_rate": 0.0009647326507394767, "loss": 3.1021, "step": 158 }, { "epoch": 0.18088737201365188, "grad_norm": 0.9709880352020264, "learning_rate": 0.000964505119453925, "loss": 2.5928, "step": 159 }, { "epoch": 0.1820250284414107, "grad_norm": 0.9320815801620483, "learning_rate": 0.0009642775881683732, "loss": 2.5435, "step": 160 }, { "epoch": 0.1831626848691695, "grad_norm": 1.1968194246292114, "learning_rate": 0.0009640500568828214, "loss": 2.7312, "step": 161 }, { "epoch": 
0.18430034129692832, "grad_norm": 0.6706811189651489, "learning_rate": 0.0009638225255972697, "loss": 1.6675, "step": 162 }, { "epoch": 0.18543799772468714, "grad_norm": 0.9634714722633362, "learning_rate": 0.0009635949943117179, "loss": 1.9415, "step": 163 }, { "epoch": 0.18657565415244595, "grad_norm": 1.2071101665496826, "learning_rate": 0.000963367463026166, "loss": 2.3851, "step": 164 }, { "epoch": 0.18771331058020477, "grad_norm": 1.1103309392929077, "learning_rate": 0.0009631399317406144, "loss": 2.4283, "step": 165 }, { "epoch": 0.18885096700796358, "grad_norm": 0.7859079241752625, "learning_rate": 0.0009629124004550626, "loss": 1.6424, "step": 166 }, { "epoch": 0.1899886234357224, "grad_norm": 1.008284568786621, "learning_rate": 0.0009626848691695108, "loss": 2.4152, "step": 167 }, { "epoch": 0.19112627986348124, "grad_norm": 1.1314949989318848, "learning_rate": 0.0009624573378839591, "loss": 3.8599, "step": 168 }, { "epoch": 0.19226393629124006, "grad_norm": 1.0891621112823486, "learning_rate": 0.0009622298065984073, "loss": 2.8165, "step": 169 }, { "epoch": 0.19340159271899887, "grad_norm": 1.5934785604476929, "learning_rate": 0.0009620022753128555, "loss": 2.2033, "step": 170 }, { "epoch": 0.1945392491467577, "grad_norm": 0.9060195684432983, "learning_rate": 0.0009617747440273038, "loss": 1.9928, "step": 171 }, { "epoch": 0.1956769055745165, "grad_norm": 0.8390949964523315, "learning_rate": 0.000961547212741752, "loss": 2.1069, "step": 172 }, { "epoch": 0.19681456200227532, "grad_norm": 1.1905990839004517, "learning_rate": 0.0009613196814562003, "loss": 2.5079, "step": 173 }, { "epoch": 0.19795221843003413, "grad_norm": 1.3425909280776978, "learning_rate": 0.0009610921501706485, "loss": 3.3445, "step": 174 }, { "epoch": 0.19908987485779295, "grad_norm": 0.9901446104049683, "learning_rate": 0.0009608646188850967, "loss": 3.0084, "step": 175 }, { "epoch": 0.20022753128555176, "grad_norm": 1.7754806280136108, "learning_rate": 0.0009606370875995449, "loss": 
2.8587, "step": 176 }, { "epoch": 0.20136518771331058, "grad_norm": 1.0438529253005981, "learning_rate": 0.0009604095563139932, "loss": 2.5869, "step": 177 }, { "epoch": 0.2025028441410694, "grad_norm": 1.0804096460342407, "learning_rate": 0.0009601820250284414, "loss": 3.065, "step": 178 }, { "epoch": 0.2036405005688282, "grad_norm": 1.2588906288146973, "learning_rate": 0.0009599544937428897, "loss": 1.9953, "step": 179 }, { "epoch": 0.20477815699658702, "grad_norm": 1.1029516458511353, "learning_rate": 0.0009597269624573379, "loss": 2.2572, "step": 180 }, { "epoch": 0.20591581342434584, "grad_norm": 1.1141575574874878, "learning_rate": 0.0009594994311717862, "loss": 2.3295, "step": 181 }, { "epoch": 0.20705346985210465, "grad_norm": 1.2331255674362183, "learning_rate": 0.0009592718998862345, "loss": 2.8308, "step": 182 }, { "epoch": 0.20819112627986347, "grad_norm": 1.3520818948745728, "learning_rate": 0.0009590443686006826, "loss": 3.12, "step": 183 }, { "epoch": 0.20932878270762229, "grad_norm": 1.2728475332260132, "learning_rate": 0.0009588168373151308, "loss": 2.8458, "step": 184 }, { "epoch": 0.21046643913538113, "grad_norm": 0.779961884021759, "learning_rate": 0.0009585893060295791, "loss": 1.7038, "step": 185 }, { "epoch": 0.21160409556313994, "grad_norm": 0.7108963131904602, "learning_rate": 0.0009583617747440273, "loss": 1.3587, "step": 186 }, { "epoch": 0.21274175199089876, "grad_norm": 0.7882399559020996, "learning_rate": 0.0009581342434584755, "loss": 1.7968, "step": 187 }, { "epoch": 0.21387940841865757, "grad_norm": 1.3022332191467285, "learning_rate": 0.0009579067121729238, "loss": 3.4653, "step": 188 }, { "epoch": 0.2150170648464164, "grad_norm": 0.9861379265785217, "learning_rate": 0.000957679180887372, "loss": 2.5134, "step": 189 }, { "epoch": 0.2161547212741752, "grad_norm": 1.3816951513290405, "learning_rate": 0.0009574516496018203, "loss": 1.7213, "step": 190 }, { "epoch": 0.21729237770193402, "grad_norm": 0.7945815920829773, "learning_rate": 
0.0009572241183162686, "loss": 1.533, "step": 191 }, { "epoch": 0.21843003412969283, "grad_norm": 0.8966066837310791, "learning_rate": 0.0009569965870307168, "loss": 1.9149, "step": 192 }, { "epoch": 0.21956769055745165, "grad_norm": 1.2404614686965942, "learning_rate": 0.0009567690557451649, "loss": 2.8121, "step": 193 }, { "epoch": 0.22070534698521047, "grad_norm": 1.0486642122268677, "learning_rate": 0.0009565415244596132, "loss": 2.5131, "step": 194 }, { "epoch": 0.22184300341296928, "grad_norm": 0.9867016673088074, "learning_rate": 0.0009563139931740614, "loss": 2.9784, "step": 195 }, { "epoch": 0.2229806598407281, "grad_norm": 1.2857524156570435, "learning_rate": 0.0009560864618885096, "loss": 2.7056, "step": 196 }, { "epoch": 0.2241183162684869, "grad_norm": 0.8611189126968384, "learning_rate": 0.0009558589306029579, "loss": 1.6405, "step": 197 }, { "epoch": 0.22525597269624573, "grad_norm": 1.1097594499588013, "learning_rate": 0.0009556313993174062, "loss": 2.3999, "step": 198 }, { "epoch": 0.22639362912400454, "grad_norm": 1.3522695302963257, "learning_rate": 0.0009554038680318545, "loss": 2.4307, "step": 199 }, { "epoch": 0.22753128555176336, "grad_norm": 0.7342440485954285, "learning_rate": 0.0009551763367463027, "loss": 1.4805, "step": 200 }, { "epoch": 0.22866894197952217, "grad_norm": 1.099187970161438, "learning_rate": 0.0009549488054607509, "loss": 2.4419, "step": 201 }, { "epoch": 0.229806598407281, "grad_norm": 0.7036212682723999, "learning_rate": 0.0009547212741751992, "loss": 1.5371, "step": 202 }, { "epoch": 0.23094425483503983, "grad_norm": 0.7153437733650208, "learning_rate": 0.0009544937428896473, "loss": 2.0429, "step": 203 }, { "epoch": 0.23208191126279865, "grad_norm": 0.9110562801361084, "learning_rate": 0.0009542662116040955, "loss": 2.2197, "step": 204 }, { "epoch": 0.23321956769055746, "grad_norm": 0.9033525586128235, "learning_rate": 0.0009540386803185438, "loss": 2.0181, "step": 205 }, { "epoch": 0.23435722411831628, "grad_norm": 
1.1249139308929443, "learning_rate": 0.000953811149032992, "loss": 2.6982, "step": 206 }, { "epoch": 0.2354948805460751, "grad_norm": 2.2283267974853516, "learning_rate": 0.0009535836177474403, "loss": 3.7574, "step": 207 }, { "epoch": 0.2366325369738339, "grad_norm": 1.0475785732269287, "learning_rate": 0.0009533560864618886, "loss": 2.3081, "step": 208 }, { "epoch": 0.23777019340159272, "grad_norm": 0.9977370500564575, "learning_rate": 0.0009531285551763368, "loss": 2.5508, "step": 209 }, { "epoch": 0.23890784982935154, "grad_norm": 1.2504466772079468, "learning_rate": 0.000952901023890785, "loss": 2.9425, "step": 210 }, { "epoch": 0.24004550625711035, "grad_norm": 1.4991356134414673, "learning_rate": 0.0009526734926052333, "loss": 2.6766, "step": 211 }, { "epoch": 0.24118316268486917, "grad_norm": 0.952013373374939, "learning_rate": 0.0009524459613196815, "loss": 1.3622, "step": 212 }, { "epoch": 0.24232081911262798, "grad_norm": 0.9106293320655823, "learning_rate": 0.0009522184300341296, "loss": 2.2534, "step": 213 }, { "epoch": 0.2434584755403868, "grad_norm": 1.6993825435638428, "learning_rate": 0.0009519908987485779, "loss": 3.5046, "step": 214 }, { "epoch": 0.2445961319681456, "grad_norm": 1.4656329154968262, "learning_rate": 0.0009517633674630262, "loss": 3.9916, "step": 215 }, { "epoch": 0.24573378839590443, "grad_norm": 0.9613212943077087, "learning_rate": 0.0009515358361774744, "loss": 2.7701, "step": 216 }, { "epoch": 0.24687144482366324, "grad_norm": 1.357133388519287, "learning_rate": 0.0009513083048919227, "loss": 3.3809, "step": 217 }, { "epoch": 0.24800910125142206, "grad_norm": 1.2995182275772095, "learning_rate": 0.0009510807736063709, "loss": 2.889, "step": 218 }, { "epoch": 0.24914675767918087, "grad_norm": 1.4708192348480225, "learning_rate": 0.0009508532423208191, "loss": 2.6981, "step": 219 }, { "epoch": 0.2502844141069397, "grad_norm": 1.0964555740356445, "learning_rate": 0.0009506257110352674, "loss": 3.448, "step": 220 }, { "epoch": 
0.2514220705346985, "grad_norm": 0.9647961854934692, "learning_rate": 0.0009503981797497156, "loss": 1.367, "step": 221 }, { "epoch": 0.2525597269624573, "grad_norm": 1.0223067998886108, "learning_rate": 0.0009501706484641638, "loss": 2.653, "step": 222 }, { "epoch": 0.25369738339021614, "grad_norm": 1.2166616916656494, "learning_rate": 0.000949943117178612, "loss": 3.8313, "step": 223 }, { "epoch": 0.25483503981797495, "grad_norm": 1.2261899709701538, "learning_rate": 0.0009497155858930603, "loss": 3.3107, "step": 224 }, { "epoch": 0.25597269624573377, "grad_norm": 1.2194271087646484, "learning_rate": 0.0009494880546075086, "loss": 2.7319, "step": 225 }, { "epoch": 0.2571103526734926, "grad_norm": 1.196639895439148, "learning_rate": 0.0009492605233219568, "loss": 3.1096, "step": 226 }, { "epoch": 0.2582480091012514, "grad_norm": 0.9778220057487488, "learning_rate": 0.000949032992036405, "loss": 2.5783, "step": 227 }, { "epoch": 0.2593856655290102, "grad_norm": 1.0489749908447266, "learning_rate": 0.0009488054607508533, "loss": 2.1872, "step": 228 }, { "epoch": 0.2605233219567691, "grad_norm": 2.25301194190979, "learning_rate": 0.0009485779294653015, "loss": 2.4844, "step": 229 }, { "epoch": 0.2616609783845279, "grad_norm": 1.065746784210205, "learning_rate": 0.0009483503981797497, "loss": 3.7665, "step": 230 }, { "epoch": 0.2627986348122867, "grad_norm": 1.1636378765106201, "learning_rate": 0.0009481228668941981, "loss": 2.3635, "step": 231 }, { "epoch": 0.26393629124004553, "grad_norm": 0.915281355381012, "learning_rate": 0.0009478953356086462, "loss": 2.5689, "step": 232 }, { "epoch": 0.26507394766780434, "grad_norm": 1.088327407836914, "learning_rate": 0.0009476678043230944, "loss": 2.3029, "step": 233 }, { "epoch": 0.26621160409556316, "grad_norm": 0.8353843092918396, "learning_rate": 0.0009474402730375427, "loss": 2.1205, "step": 234 }, { "epoch": 0.267349260523322, "grad_norm": 0.8425211906433105, "learning_rate": 0.0009472127417519909, "loss": 1.6575, 
"step": 235 }, { "epoch": 0.2684869169510808, "grad_norm": 1.309173822402954, "learning_rate": 0.0009469852104664391, "loss": 3.7625, "step": 236 }, { "epoch": 0.2696245733788396, "grad_norm": 1.0323517322540283, "learning_rate": 0.0009467576791808874, "loss": 2.7422, "step": 237 }, { "epoch": 0.2707622298065984, "grad_norm": 1.0105620622634888, "learning_rate": 0.0009465301478953356, "loss": 2.0942, "step": 238 }, { "epoch": 0.27189988623435724, "grad_norm": 1.2887792587280273, "learning_rate": 0.0009463026166097838, "loss": 3.3921, "step": 239 }, { "epoch": 0.27303754266211605, "grad_norm": 0.8635759949684143, "learning_rate": 0.0009460750853242322, "loss": 1.6201, "step": 240 }, { "epoch": 0.27417519908987487, "grad_norm": 1.0267283916473389, "learning_rate": 0.0009458475540386804, "loss": 2.6196, "step": 241 }, { "epoch": 0.2753128555176337, "grad_norm": 1.3228164911270142, "learning_rate": 0.0009456200227531286, "loss": 2.9269, "step": 242 }, { "epoch": 0.2764505119453925, "grad_norm": 1.1229983568191528, "learning_rate": 0.0009453924914675768, "loss": 2.0913, "step": 243 }, { "epoch": 0.2775881683731513, "grad_norm": 1.4177569150924683, "learning_rate": 0.000945164960182025, "loss": 2.1312, "step": 244 }, { "epoch": 0.2787258248009101, "grad_norm": 0.8409637212753296, "learning_rate": 0.0009449374288964733, "loss": 2.4469, "step": 245 }, { "epoch": 0.27986348122866894, "grad_norm": 0.9386504292488098, "learning_rate": 0.0009447098976109215, "loss": 1.9967, "step": 246 }, { "epoch": 0.28100113765642776, "grad_norm": 1.1676746606826782, "learning_rate": 0.0009444823663253697, "loss": 2.1986, "step": 247 }, { "epoch": 0.2821387940841866, "grad_norm": 0.7538396716117859, "learning_rate": 0.0009442548350398181, "loss": 1.3639, "step": 248 }, { "epoch": 0.2832764505119454, "grad_norm": 0.9919435381889343, "learning_rate": 0.0009440273037542663, "loss": 2.5573, "step": 249 }, { "epoch": 0.2844141069397042, "grad_norm": 0.7212353944778442, "learning_rate": 
0.0009437997724687145, "loss": 2.7537, "step": 250 }, { "epoch": 0.285551763367463, "grad_norm": 1.0949044227600098, "learning_rate": 0.0009435722411831627, "loss": 1.8613, "step": 251 }, { "epoch": 0.28668941979522183, "grad_norm": 1.1615073680877686, "learning_rate": 0.0009433447098976109, "loss": 2.3952, "step": 252 }, { "epoch": 0.28782707622298065, "grad_norm": 1.1122605800628662, "learning_rate": 0.0009431171786120591, "loss": 2.5904, "step": 253 }, { "epoch": 0.28896473265073946, "grad_norm": 1.0439378023147583, "learning_rate": 0.0009428896473265074, "loss": 2.2653, "step": 254 }, { "epoch": 0.2901023890784983, "grad_norm": 2.413464069366455, "learning_rate": 0.0009426621160409556, "loss": 4.6422, "step": 255 }, { "epoch": 0.2912400455062571, "grad_norm": 0.9542593955993652, "learning_rate": 0.0009424345847554038, "loss": 2.3552, "step": 256 }, { "epoch": 0.2923777019340159, "grad_norm": 1.229703426361084, "learning_rate": 0.0009422070534698522, "loss": 2.993, "step": 257 }, { "epoch": 0.2935153583617747, "grad_norm": 1.0281733274459839, "learning_rate": 0.0009419795221843004, "loss": 2.3383, "step": 258 }, { "epoch": 0.29465301478953354, "grad_norm": 0.8514539003372192, "learning_rate": 0.0009417519908987486, "loss": 1.9407, "step": 259 }, { "epoch": 0.29579067121729236, "grad_norm": 0.6966246962547302, "learning_rate": 0.0009415244596131969, "loss": 1.6663, "step": 260 }, { "epoch": 0.29692832764505117, "grad_norm": 1.4110140800476074, "learning_rate": 0.000941296928327645, "loss": 4.4839, "step": 261 }, { "epoch": 0.29806598407281, "grad_norm": 1.094144582748413, "learning_rate": 0.0009410693970420933, "loss": 2.2603, "step": 262 }, { "epoch": 0.2992036405005688, "grad_norm": 1.0117661952972412, "learning_rate": 0.0009408418657565415, "loss": 1.9057, "step": 263 }, { "epoch": 0.3003412969283277, "grad_norm": 1.1968410015106201, "learning_rate": 0.0009406143344709897, "loss": 2.5717, "step": 264 }, { "epoch": 0.3014789533560865, "grad_norm": 
1.324204444885254, "learning_rate": 0.0009403868031854381, "loss": 2.4079, "step": 265 }, { "epoch": 0.3026166097838453, "grad_norm": 1.3995249271392822, "learning_rate": 0.0009401592718998863, "loss": 3.3401, "step": 266 }, { "epoch": 0.3037542662116041, "grad_norm": 1.0412198305130005, "learning_rate": 0.0009399317406143345, "loss": 3.0768, "step": 267 }, { "epoch": 0.30489192263936293, "grad_norm": 0.6889500617980957, "learning_rate": 0.0009397042093287828, "loss": 1.8718, "step": 268 }, { "epoch": 0.30602957906712175, "grad_norm": 1.1345181465148926, "learning_rate": 0.000939476678043231, "loss": 3.4683, "step": 269 }, { "epoch": 0.30716723549488056, "grad_norm": 0.8793913722038269, "learning_rate": 0.0009392491467576792, "loss": 2.2562, "step": 270 }, { "epoch": 0.3083048919226394, "grad_norm": 1.0166738033294678, "learning_rate": 0.0009390216154721274, "loss": 1.328, "step": 271 }, { "epoch": 0.3094425483503982, "grad_norm": 0.817034900188446, "learning_rate": 0.0009387940841865756, "loss": 2.4001, "step": 272 }, { "epoch": 0.310580204778157, "grad_norm": 0.9647735357284546, "learning_rate": 0.0009385665529010238, "loss": 2.2161, "step": 273 }, { "epoch": 0.3117178612059158, "grad_norm": 0.9152204394340515, "learning_rate": 0.0009383390216154722, "loss": 2.5515, "step": 274 }, { "epoch": 0.31285551763367464, "grad_norm": 1.4312481880187988, "learning_rate": 0.0009381114903299204, "loss": 4.4949, "step": 275 }, { "epoch": 0.31399317406143346, "grad_norm": 0.9626976847648621, "learning_rate": 0.0009378839590443686, "loss": 1.8928, "step": 276 }, { "epoch": 0.31513083048919227, "grad_norm": 2.2159972190856934, "learning_rate": 0.0009376564277588169, "loss": 4.2681, "step": 277 }, { "epoch": 0.3162684869169511, "grad_norm": 1.171950101852417, "learning_rate": 0.0009374288964732651, "loss": 2.1355, "step": 278 }, { "epoch": 0.3174061433447099, "grad_norm": 0.6437592506408691, "learning_rate": 0.0009372013651877133, "loss": 1.5357, "step": 279 }, { "epoch": 
0.3185437997724687, "grad_norm": 1.0927355289459229, "learning_rate": 0.0009369738339021616, "loss": 2.1487, "step": 280 }, { "epoch": 0.31968145620022753, "grad_norm": 1.1274104118347168, "learning_rate": 0.0009367463026166097, "loss": 2.6737, "step": 281 }, { "epoch": 0.32081911262798635, "grad_norm": 0.9172748327255249, "learning_rate": 0.000936518771331058, "loss": 1.8367, "step": 282 }, { "epoch": 0.32195676905574516, "grad_norm": 1.2308052778244019, "learning_rate": 0.0009362912400455063, "loss": 2.4757, "step": 283 }, { "epoch": 0.323094425483504, "grad_norm": 0.8640644550323486, "learning_rate": 0.0009360637087599545, "loss": 2.0472, "step": 284 }, { "epoch": 0.3242320819112628, "grad_norm": 1.4058482646942139, "learning_rate": 0.0009358361774744028, "loss": 2.8178, "step": 285 }, { "epoch": 0.3253697383390216, "grad_norm": 1.1872564554214478, "learning_rate": 0.000935608646188851, "loss": 2.3008, "step": 286 }, { "epoch": 0.3265073947667804, "grad_norm": 0.7326186299324036, "learning_rate": 0.0009353811149032992, "loss": 1.173, "step": 287 }, { "epoch": 0.32764505119453924, "grad_norm": 1.229687213897705, "learning_rate": 0.0009351535836177475, "loss": 1.8621, "step": 288 }, { "epoch": 0.32878270762229805, "grad_norm": 1.0105575323104858, "learning_rate": 0.0009349260523321957, "loss": 2.6121, "step": 289 }, { "epoch": 0.32992036405005687, "grad_norm": 1.0747400522232056, "learning_rate": 0.0009346985210466438, "loss": 2.3038, "step": 290 }, { "epoch": 0.3310580204778157, "grad_norm": 1.3244582414627075, "learning_rate": 0.0009344709897610922, "loss": 1.9476, "step": 291 }, { "epoch": 0.3321956769055745, "grad_norm": 0.8411991000175476, "learning_rate": 0.0009342434584755404, "loss": 1.8068, "step": 292 }, { "epoch": 0.3333333333333333, "grad_norm": 0.9534935355186462, "learning_rate": 0.0009340159271899886, "loss": 2.0956, "step": 293 }, { "epoch": 0.33447098976109213, "grad_norm": 1.0585649013519287, "learning_rate": 0.0009337883959044369, "loss": 
2.7111, "step": 294 }, { "epoch": 0.33560864618885095, "grad_norm": 0.7823183536529541, "learning_rate": 0.0009335608646188851, "loss": 1.9446, "step": 295 }, { "epoch": 0.33674630261660976, "grad_norm": 0.8460807204246521, "learning_rate": 0.0009333333333333333, "loss": 2.6191, "step": 296 }, { "epoch": 0.3378839590443686, "grad_norm": 1.1409828662872314, "learning_rate": 0.0009331058020477816, "loss": 3.1742, "step": 297 }, { "epoch": 0.3390216154721274, "grad_norm": 1.1377147436141968, "learning_rate": 0.0009328782707622299, "loss": 2.7242, "step": 298 }, { "epoch": 0.34015927189988626, "grad_norm": 1.4707083702087402, "learning_rate": 0.0009326507394766781, "loss": 3.7897, "step": 299 }, { "epoch": 0.3412969283276451, "grad_norm": 0.964156448841095, "learning_rate": 0.0009324232081911263, "loss": 2.0425, "step": 300 }, { "epoch": 0.3424345847554039, "grad_norm": 1.323971152305603, "learning_rate": 0.0009321956769055745, "loss": 3.1034, "step": 301 }, { "epoch": 0.3435722411831627, "grad_norm": 1.3859295845031738, "learning_rate": 0.0009319681456200227, "loss": 2.9186, "step": 302 }, { "epoch": 0.3447098976109215, "grad_norm": 0.8150478601455688, "learning_rate": 0.000931740614334471, "loss": 2.3752, "step": 303 }, { "epoch": 0.34584755403868034, "grad_norm": 0.8035518527030945, "learning_rate": 0.0009315130830489192, "loss": 2.252, "step": 304 }, { "epoch": 0.34698521046643915, "grad_norm": 1.2278333902359009, "learning_rate": 0.0009312855517633675, "loss": 2.3787, "step": 305 }, { "epoch": 0.34812286689419797, "grad_norm": 1.116999626159668, "learning_rate": 0.0009310580204778157, "loss": 1.4018, "step": 306 }, { "epoch": 0.3492605233219568, "grad_norm": 1.0040169954299927, "learning_rate": 0.000930830489192264, "loss": 2.0511, "step": 307 }, { "epoch": 0.3503981797497156, "grad_norm": 0.901432991027832, "learning_rate": 0.0009306029579067123, "loss": 1.8721, "step": 308 }, { "epoch": 0.3515358361774744, "grad_norm": 1.451587438583374, "learning_rate": 
0.0009303754266211605, "loss": 3.0643, "step": 309 }, { "epoch": 0.35267349260523323, "grad_norm": 1.2055668830871582, "learning_rate": 0.0009301478953356086, "loss": 3.2057, "step": 310 }, { "epoch": 0.35381114903299204, "grad_norm": 0.8358264565467834, "learning_rate": 0.0009299203640500569, "loss": 2.0797, "step": 311 }, { "epoch": 0.35494880546075086, "grad_norm": 0.7835274338722229, "learning_rate": 0.0009296928327645051, "loss": 1.2359, "step": 312 }, { "epoch": 0.3560864618885097, "grad_norm": 0.9940115809440613, "learning_rate": 0.0009294653014789533, "loss": 2.2001, "step": 313 }, { "epoch": 0.3572241183162685, "grad_norm": 1.147113561630249, "learning_rate": 0.0009292377701934016, "loss": 2.0408, "step": 314 }, { "epoch": 0.3583617747440273, "grad_norm": 0.4681651294231415, "learning_rate": 0.0009290102389078499, "loss": 0.7199, "step": 315 }, { "epoch": 0.3594994311717861, "grad_norm": 1.2197333574295044, "learning_rate": 0.0009287827076222981, "loss": 3.6619, "step": 316 }, { "epoch": 0.36063708759954494, "grad_norm": 0.9935864806175232, "learning_rate": 0.0009285551763367464, "loss": 2.0533, "step": 317 }, { "epoch": 0.36177474402730375, "grad_norm": 0.9229695200920105, "learning_rate": 0.0009283276450511946, "loss": 1.4724, "step": 318 }, { "epoch": 0.36291240045506257, "grad_norm": 1.3133660554885864, "learning_rate": 0.0009281001137656428, "loss": 2.9495, "step": 319 }, { "epoch": 0.3640500568828214, "grad_norm": 0.9246838092803955, "learning_rate": 0.000927872582480091, "loss": 1.6493, "step": 320 }, { "epoch": 0.3651877133105802, "grad_norm": 1.4214354753494263, "learning_rate": 0.0009276450511945392, "loss": 2.2151, "step": 321 }, { "epoch": 0.366325369738339, "grad_norm": 1.167389988899231, "learning_rate": 0.0009274175199089874, "loss": 2.1775, "step": 322 }, { "epoch": 0.36746302616609783, "grad_norm": 1.3376412391662598, "learning_rate": 0.0009271899886234357, "loss": 3.2754, "step": 323 }, { "epoch": 0.36860068259385664, "grad_norm": 
0.9739036560058594, "learning_rate": 0.000926962457337884, "loss": 2.151, "step": 324 }, { "epoch": 0.36973833902161546, "grad_norm": 0.9827941656112671, "learning_rate": 0.0009267349260523323, "loss": 3.0028, "step": 325 }, { "epoch": 0.3708759954493743, "grad_norm": 0.811549723148346, "learning_rate": 0.0009265073947667805, "loss": 2.1614, "step": 326 }, { "epoch": 0.3720136518771331, "grad_norm": 0.9825149178504944, "learning_rate": 0.0009262798634812287, "loss": 1.7451, "step": 327 }, { "epoch": 0.3731513083048919, "grad_norm": 1.237622857093811, "learning_rate": 0.000926052332195677, "loss": 3.3102, "step": 328 }, { "epoch": 0.3742889647326507, "grad_norm": 0.9079214334487915, "learning_rate": 0.0009258248009101251, "loss": 2.0276, "step": 329 }, { "epoch": 0.37542662116040953, "grad_norm": 0.7722564935684204, "learning_rate": 0.0009255972696245733, "loss": 2.1105, "step": 330 }, { "epoch": 0.37656427758816835, "grad_norm": 0.9066492319107056, "learning_rate": 0.0009253697383390216, "loss": 3.4384, "step": 331 }, { "epoch": 0.37770193401592717, "grad_norm": 0.8562538623809814, "learning_rate": 0.0009251422070534699, "loss": 2.9233, "step": 332 }, { "epoch": 0.378839590443686, "grad_norm": 1.1646368503570557, "learning_rate": 0.0009249146757679181, "loss": 2.551, "step": 333 }, { "epoch": 0.3799772468714448, "grad_norm": 0.9063814878463745, "learning_rate": 0.0009246871444823664, "loss": 2.3003, "step": 334 }, { "epoch": 0.38111490329920367, "grad_norm": 0.993689775466919, "learning_rate": 0.0009244596131968146, "loss": 3.3941, "step": 335 }, { "epoch": 0.3822525597269625, "grad_norm": 1.2754982709884644, "learning_rate": 0.0009242320819112628, "loss": 2.5285, "step": 336 }, { "epoch": 0.3833902161547213, "grad_norm": 1.0740411281585693, "learning_rate": 0.0009240045506257111, "loss": 1.4935, "step": 337 }, { "epoch": 0.3845278725824801, "grad_norm": 1.089651107788086, "learning_rate": 0.0009237770193401593, "loss": 2.136, "step": 338 }, { "epoch": 
0.3856655290102389, "grad_norm": 0.8579985499382019, "learning_rate": 0.0009235494880546074, "loss": 1.9851, "step": 339 }, { "epoch": 0.38680318543799774, "grad_norm": 1.0594276189804077, "learning_rate": 0.0009233219567690558, "loss": 2.244, "step": 340 }, { "epoch": 0.38794084186575656, "grad_norm": 0.9024211168289185, "learning_rate": 0.000923094425483504, "loss": 2.5911, "step": 341 }, { "epoch": 0.3890784982935154, "grad_norm": 1.2810311317443848, "learning_rate": 0.0009228668941979522, "loss": 3.0762, "step": 342 }, { "epoch": 0.3902161547212742, "grad_norm": 0.9511512517929077, "learning_rate": 0.0009226393629124005, "loss": 2.871, "step": 343 }, { "epoch": 0.391353811149033, "grad_norm": 1.0560070276260376, "learning_rate": 0.0009224118316268487, "loss": 1.4818, "step": 344 }, { "epoch": 0.3924914675767918, "grad_norm": 0.7268106341362, "learning_rate": 0.000922184300341297, "loss": 0.9631, "step": 345 }, { "epoch": 0.39362912400455063, "grad_norm": 1.2521812915802002, "learning_rate": 0.0009219567690557452, "loss": 2.0506, "step": 346 }, { "epoch": 0.39476678043230945, "grad_norm": 0.8288787603378296, "learning_rate": 0.0009217292377701934, "loss": 1.7319, "step": 347 }, { "epoch": 0.39590443686006827, "grad_norm": 1.5332680940628052, "learning_rate": 0.0009215017064846418, "loss": 1.8226, "step": 348 }, { "epoch": 0.3970420932878271, "grad_norm": 0.918202817440033, "learning_rate": 0.0009212741751990899, "loss": 1.9544, "step": 349 }, { "epoch": 0.3981797497155859, "grad_norm": 1.5337828397750854, "learning_rate": 0.0009210466439135381, "loss": 1.7653, "step": 350 }, { "epoch": 0.3993174061433447, "grad_norm": 0.8244525194168091, "learning_rate": 0.0009208191126279864, "loss": 1.0721, "step": 351 }, { "epoch": 0.4004550625711035, "grad_norm": 1.727785587310791, "learning_rate": 0.0009205915813424346, "loss": 3.8503, "step": 352 }, { "epoch": 0.40159271899886234, "grad_norm": 0.8593099117279053, "learning_rate": 0.0009203640500568828, "loss": 1.5773, 
"step": 353 }, { "epoch": 0.40273037542662116, "grad_norm": 0.929568886756897, "learning_rate": 0.0009201365187713311, "loss": 2.0465, "step": 354 }, { "epoch": 0.40386803185437997, "grad_norm": 0.8598810434341431, "learning_rate": 0.0009199089874857793, "loss": 2.3678, "step": 355 }, { "epoch": 0.4050056882821388, "grad_norm": 1.19975745677948, "learning_rate": 0.0009196814562002275, "loss": 1.9968, "step": 356 }, { "epoch": 0.4061433447098976, "grad_norm": 0.8773411512374878, "learning_rate": 0.0009194539249146759, "loss": 2.3507, "step": 357 }, { "epoch": 0.4072810011376564, "grad_norm": NaN, "learning_rate": 0.0009194539249146759, "loss": 3.3585, "step": 358 }, { "epoch": 0.40841865756541523, "grad_norm": 1.068331241607666, "learning_rate": 0.000919226393629124, "loss": 2.2658, "step": 359 }, { "epoch": 0.40955631399317405, "grad_norm": 0.8418660163879395, "learning_rate": 0.0009189988623435722, "loss": 1.1336, "step": 360 }, { "epoch": 0.41069397042093286, "grad_norm": 1.0328933000564575, "learning_rate": 0.0009187713310580205, "loss": 2.308, "step": 361 }, { "epoch": 0.4118316268486917, "grad_norm": 0.7511950135231018, "learning_rate": 0.0009185437997724687, "loss": 2.0096, "step": 362 }, { "epoch": 0.4129692832764505, "grad_norm": 1.0027238130569458, "learning_rate": 0.0009183162684869169, "loss": 2.4037, "step": 363 }, { "epoch": 0.4141069397042093, "grad_norm": 0.7929302453994751, "learning_rate": 0.0009180887372013652, "loss": 1.7147, "step": 364 }, { "epoch": 0.4152445961319681, "grad_norm": 0.9058247804641724, "learning_rate": 0.0009178612059158134, "loss": 1.2271, "step": 365 }, { "epoch": 0.41638225255972694, "grad_norm": 1.107584834098816, "learning_rate": 0.0009176336746302618, "loss": 1.976, "step": 366 }, { "epoch": 0.41751990898748575, "grad_norm": 0.9150335788726807, "learning_rate": 0.00091740614334471, "loss": 1.5669, "step": 367 }, { "epoch": 0.41865756541524457, "grad_norm": 1.2605892419815063, "learning_rate": 0.0009171786120591582, "loss": 
3.2092, "step": 368 }, { "epoch": 0.4197952218430034, "grad_norm": 1.8941680192947388, "learning_rate": 0.0009169510807736064, "loss": 2.2166, "step": 369 }, { "epoch": 0.42093287827076226, "grad_norm": 1.0579288005828857, "learning_rate": 0.0009167235494880546, "loss": 2.2364, "step": 370 }, { "epoch": 0.42207053469852107, "grad_norm": 0.8153421878814697, "learning_rate": 0.0009164960182025028, "loss": 2.2531, "step": 371 }, { "epoch": 0.4232081911262799, "grad_norm": 0.9382014274597168, "learning_rate": 0.0009162684869169511, "loss": 1.7155, "step": 372 }, { "epoch": 0.4243458475540387, "grad_norm": 0.8863131403923035, "learning_rate": 0.0009160409556313993, "loss": 2.0033, "step": 373 }, { "epoch": 0.4254835039817975, "grad_norm": 1.0132858753204346, "learning_rate": 0.0009158134243458475, "loss": 2.7736, "step": 374 }, { "epoch": 0.42662116040955633, "grad_norm": 1.237510085105896, "learning_rate": 0.0009155858930602959, "loss": 2.3289, "step": 375 }, { "epoch": 0.42775881683731515, "grad_norm": 1.4978190660476685, "learning_rate": 0.0009153583617747441, "loss": 3.452, "step": 376 }, { "epoch": 0.42889647326507396, "grad_norm": 0.9788629412651062, "learning_rate": 0.0009151308304891923, "loss": 2.536, "step": 377 }, { "epoch": 0.4300341296928328, "grad_norm": 1.0312438011169434, "learning_rate": 0.0009149032992036406, "loss": 2.1322, "step": 378 }, { "epoch": 0.4311717861205916, "grad_norm": 0.7808533906936646, "learning_rate": 0.0009146757679180887, "loss": 1.7681, "step": 379 }, { "epoch": 0.4323094425483504, "grad_norm": 1.5941463708877563, "learning_rate": 0.0009144482366325369, "loss": 3.6207, "step": 380 }, { "epoch": 0.4334470989761092, "grad_norm": 0.9317173361778259, "learning_rate": 0.0009142207053469852, "loss": 1.5295, "step": 381 }, { "epoch": 0.43458475540386804, "grad_norm": 1.0690711736679077, "learning_rate": 0.0009139931740614334, "loss": 2.6297, "step": 382 }, { "epoch": 0.43572241183162685, "grad_norm": 0.9129312634468079, "learning_rate": 
0.0009137656427758817, "loss": 1.3539, "step": 383 }, { "epoch": 0.43686006825938567, "grad_norm": 1.0281989574432373, "learning_rate": 0.00091353811149033, "loss": 2.2572, "step": 384 }, { "epoch": 0.4379977246871445, "grad_norm": 0.9232156872749329, "learning_rate": 0.0009133105802047782, "loss": 2.7774, "step": 385 }, { "epoch": 0.4391353811149033, "grad_norm": 0.8758882880210876, "learning_rate": 0.0009130830489192265, "loss": 1.8537, "step": 386 }, { "epoch": 0.4402730375426621, "grad_norm": 0.6057271361351013, "learning_rate": 0.0009128555176336747, "loss": 1.1669, "step": 387 }, { "epoch": 0.44141069397042093, "grad_norm": 0.764013409614563, "learning_rate": 0.0009126279863481229, "loss": 1.5494, "step": 388 }, { "epoch": 0.44254835039817975, "grad_norm": 0.7935448288917542, "learning_rate": 0.0009124004550625711, "loss": 1.5411, "step": 389 }, { "epoch": 0.44368600682593856, "grad_norm": 0.9698971509933472, "learning_rate": 0.0009121729237770193, "loss": 2.1653, "step": 390 }, { "epoch": 0.4448236632536974, "grad_norm": 0.9462388753890991, "learning_rate": 0.0009119453924914675, "loss": 3.2917, "step": 391 }, { "epoch": 0.4459613196814562, "grad_norm": 1.1189157962799072, "learning_rate": 0.0009117178612059159, "loss": 1.8733, "step": 392 }, { "epoch": 0.447098976109215, "grad_norm": 1.251373529434204, "learning_rate": 0.0009114903299203641, "loss": 2.239, "step": 393 }, { "epoch": 0.4482366325369738, "grad_norm": 1.385899305343628, "learning_rate": 0.0009112627986348123, "loss": 3.8287, "step": 394 }, { "epoch": 0.44937428896473264, "grad_norm": 0.8532758951187134, "learning_rate": 0.0009110352673492606, "loss": 1.5912, "step": 395 }, { "epoch": 0.45051194539249145, "grad_norm": 0.9151597023010254, "learning_rate": 0.0009108077360637088, "loss": 1.841, "step": 396 }, { "epoch": 0.45164960182025027, "grad_norm": 1.054795265197754, "learning_rate": 0.000910580204778157, "loss": 2.7137, "step": 397 }, { "epoch": 0.4527872582480091, "grad_norm": 
0.7954731583595276, "learning_rate": 0.0009103526734926052, "loss": 1.7309, "step": 398 }, { "epoch": 0.4539249146757679, "grad_norm": 1.1954761743545532, "learning_rate": 0.0009101251422070534, "loss": 3.0152, "step": 399 }, { "epoch": 0.4550625711035267, "grad_norm": 1.0547521114349365, "learning_rate": 0.0009098976109215017, "loss": 3.1303, "step": 400 }, { "epoch": 0.45620022753128553, "grad_norm": 1.2710171937942505, "learning_rate": 0.00090967007963595, "loss": 3.1468, "step": 401 }, { "epoch": 0.45733788395904434, "grad_norm": 0.8694390058517456, "learning_rate": 0.0009094425483503982, "loss": 1.815, "step": 402 }, { "epoch": 0.45847554038680316, "grad_norm": 0.9537279605865479, "learning_rate": 0.0009092150170648464, "loss": 1.9914, "step": 403 }, { "epoch": 0.459613196814562, "grad_norm": 0.9804282188415527, "learning_rate": 0.0009089874857792947, "loss": 2.4996, "step": 404 }, { "epoch": 0.46075085324232085, "grad_norm": 1.2874373197555542, "learning_rate": 0.0009087599544937429, "loss": 2.2957, "step": 405 }, { "epoch": 0.46188850967007966, "grad_norm": 0.8987186551094055, "learning_rate": 0.0009085324232081912, "loss": 2.2726, "step": 406 }, { "epoch": 0.4630261660978385, "grad_norm": 1.0590325593948364, "learning_rate": 0.0009083048919226394, "loss": 2.5264, "step": 407 }, { "epoch": 0.4641638225255973, "grad_norm": 1.0790257453918457, "learning_rate": 0.0009080773606370875, "loss": 2.7717, "step": 408 }, { "epoch": 0.4653014789533561, "grad_norm": 1.1287912130355835, "learning_rate": 0.0009078498293515359, "loss": 1.9665, "step": 409 }, { "epoch": 0.4664391353811149, "grad_norm": 0.8903608918190002, "learning_rate": 0.0009076222980659841, "loss": 3.0162, "step": 410 }, { "epoch": 0.46757679180887374, "grad_norm": 0.7123409509658813, "learning_rate": 0.0009073947667804323, "loss": 1.5362, "step": 411 }, { "epoch": 0.46871444823663255, "grad_norm": 0.9984010457992554, "learning_rate": 0.0009071672354948806, "loss": 2.6421, "step": 412 }, { "epoch": 
0.46985210466439137, "grad_norm": 0.9573709964752197, "learning_rate": 0.0009069397042093288, "loss": 3.0399, "step": 413 }, { "epoch": 0.4709897610921502, "grad_norm": 1.0717376470565796, "learning_rate": 0.000906712172923777, "loss": 2.5299, "step": 414 }, { "epoch": 0.472127417519909, "grad_norm": 1.3975024223327637, "learning_rate": 0.0009064846416382253, "loss": 2.6824, "step": 415 }, { "epoch": 0.4732650739476678, "grad_norm": 0.9038397073745728, "learning_rate": 0.0009062571103526736, "loss": 2.0442, "step": 416 }, { "epoch": 0.47440273037542663, "grad_norm": 1.1075880527496338, "learning_rate": 0.0009060295790671218, "loss": 2.1013, "step": 417 }, { "epoch": 0.47554038680318544, "grad_norm": 0.8074356913566589, "learning_rate": 0.00090580204778157, "loss": 2.9299, "step": 418 }, { "epoch": 0.47667804323094426, "grad_norm": 0.9493515491485596, "learning_rate": 0.0009055745164960182, "loss": 2.0034, "step": 419 }, { "epoch": 0.4778156996587031, "grad_norm": 1.3625547885894775, "learning_rate": 0.0009053469852104664, "loss": 2.4718, "step": 420 }, { "epoch": 0.4789533560864619, "grad_norm": 1.2692298889160156, "learning_rate": 0.0009051194539249147, "loss": 2.676, "step": 421 }, { "epoch": 0.4800910125142207, "grad_norm": 1.1094417572021484, "learning_rate": 0.0009048919226393629, "loss": 1.5969, "step": 422 }, { "epoch": 0.4812286689419795, "grad_norm": 0.8246015310287476, "learning_rate": 0.0009046643913538111, "loss": 1.9902, "step": 423 }, { "epoch": 0.48236632536973834, "grad_norm": 1.2059593200683594, "learning_rate": 0.0009044368600682594, "loss": 2.2454, "step": 424 }, { "epoch": 0.48350398179749715, "grad_norm": 1.2407282590866089, "learning_rate": 0.0009042093287827077, "loss": 1.9269, "step": 425 }, { "epoch": 0.48464163822525597, "grad_norm": 1.407323956489563, "learning_rate": 0.0009039817974971559, "loss": 3.2682, "step": 426 }, { "epoch": 0.4857792946530148, "grad_norm": 1.0163967609405518, "learning_rate": 0.0009037542662116041, "loss": 2.3068, 
"step": 427 }, { "epoch": 0.4869169510807736, "grad_norm": 0.9030061364173889, "learning_rate": 0.0009035267349260523, "loss": 1.8157, "step": 428 }, { "epoch": 0.4880546075085324, "grad_norm": 1.6989500522613525, "learning_rate": 0.0009032992036405006, "loss": 3.7374, "step": 429 }, { "epoch": 0.4891922639362912, "grad_norm": 1.1923679113388062, "learning_rate": 0.0009030716723549488, "loss": 2.565, "step": 430 }, { "epoch": 0.49032992036405004, "grad_norm": 0.7808157205581665, "learning_rate": 0.000902844141069397, "loss": 2.05, "step": 431 }, { "epoch": 0.49146757679180886, "grad_norm": 1.2099822759628296, "learning_rate": 0.0009026166097838453, "loss": 2.8765, "step": 432 }, { "epoch": 0.4926052332195677, "grad_norm": 0.628551185131073, "learning_rate": 0.0009023890784982936, "loss": 1.3558, "step": 433 }, { "epoch": 0.4937428896473265, "grad_norm": 1.0964410305023193, "learning_rate": 0.0009021615472127418, "loss": 3.4868, "step": 434 }, { "epoch": 0.4948805460750853, "grad_norm": 1.1500060558319092, "learning_rate": 0.0009019340159271901, "loss": 2.6184, "step": 435 }, { "epoch": 0.4960182025028441, "grad_norm": 0.9226061701774597, "learning_rate": 0.0009017064846416383, "loss": 2.2053, "step": 436 }, { "epoch": 0.49715585893060293, "grad_norm": 0.9873738288879395, "learning_rate": 0.0009014789533560864, "loss": 2.4479, "step": 437 }, { "epoch": 0.49829351535836175, "grad_norm": 0.6207978129386902, "learning_rate": 0.0009012514220705347, "loss": 1.1146, "step": 438 }, { "epoch": 0.49943117178612056, "grad_norm": 0.7853182554244995, "learning_rate": 0.0009010238907849829, "loss": 2.4183, "step": 439 }, { "epoch": 0.5005688282138794, "grad_norm": 1.1462852954864502, "learning_rate": 0.0009007963594994311, "loss": 2.3492, "step": 440 }, { "epoch": 0.5017064846416383, "grad_norm": 0.7885423302650452, "learning_rate": 0.0009005688282138795, "loss": 1.2548, "step": 441 }, { "epoch": 0.502844141069397, "grad_norm": 1.071851134300232, "learning_rate": 
0.0009003412969283277, "loss": 2.1613, "step": 442 }, { "epoch": 0.5039817974971559, "grad_norm": 0.934515655040741, "learning_rate": 0.0009001137656427759, "loss": 2.1865, "step": 443 }, { "epoch": 0.5051194539249146, "grad_norm": 0.8959553241729736, "learning_rate": 0.0008998862343572242, "loss": 1.3645, "step": 444 }, { "epoch": 0.5062571103526735, "grad_norm": 0.6154829263687134, "learning_rate": 0.0008996587030716724, "loss": 0.9074, "step": 445 }, { "epoch": 0.5073947667804323, "grad_norm": 0.8270671367645264, "learning_rate": 0.0008994311717861206, "loss": 2.3498, "step": 446 }, { "epoch": 0.5085324232081911, "grad_norm": 1.0123733282089233, "learning_rate": 0.0008992036405005688, "loss": 2.0715, "step": 447 }, { "epoch": 0.5096700796359499, "grad_norm": 1.2498971223831177, "learning_rate": 0.000898976109215017, "loss": 3.0001, "step": 448 }, { "epoch": 0.5108077360637088, "grad_norm": 1.0939654111862183, "learning_rate": 0.0008987485779294653, "loss": 1.7905, "step": 449 }, { "epoch": 0.5119453924914675, "grad_norm": 1.0476197004318237, "learning_rate": 0.0008985210466439136, "loss": 2.68, "step": 450 }, { "epoch": 0.5130830489192264, "grad_norm": 1.1398872137069702, "learning_rate": 0.0008982935153583618, "loss": 3.1602, "step": 451 }, { "epoch": 0.5142207053469852, "grad_norm": 1.092517614364624, "learning_rate": 0.0008980659840728101, "loss": 1.772, "step": 452 }, { "epoch": 0.515358361774744, "grad_norm": 1.1467770338058472, "learning_rate": 0.0008978384527872583, "loss": 3.0966, "step": 453 }, { "epoch": 0.5164960182025028, "grad_norm": 0.9609680771827698, "learning_rate": 0.0008976109215017065, "loss": 2.1103, "step": 454 }, { "epoch": 0.5176336746302617, "grad_norm": 1.0481035709381104, "learning_rate": 0.0008973833902161548, "loss": 1.9659, "step": 455 }, { "epoch": 0.5187713310580204, "grad_norm": 0.8882219791412354, "learning_rate": 0.000897155858930603, "loss": 1.6778, "step": 456 }, { "epoch": 0.5199089874857793, "grad_norm": 0.8644529581069946, 
"learning_rate": 0.0008969283276450511, "loss": 2.0891, "step": 457 }, { "epoch": 0.5210466439135382, "grad_norm": 0.9775826930999756, "learning_rate": 0.0008967007963594995, "loss": 2.6407, "step": 458 }, { "epoch": 0.5221843003412969, "grad_norm": 1.4607579708099365, "learning_rate": 0.0008964732650739477, "loss": 2.8334, "step": 459 }, { "epoch": 0.5233219567690558, "grad_norm": 1.4416364431381226, "learning_rate": 0.0008962457337883959, "loss": 2.8912, "step": 460 }, { "epoch": 0.5244596131968146, "grad_norm": 0.7647553086280823, "learning_rate": 0.0008960182025028442, "loss": 2.1203, "step": 461 }, { "epoch": 0.5255972696245734, "grad_norm": 0.9002684950828552, "learning_rate": 0.0008957906712172924, "loss": 1.9081, "step": 462 }, { "epoch": 0.5267349260523322, "grad_norm": 0.6581302285194397, "learning_rate": 0.0008955631399317406, "loss": 2.001, "step": 463 }, { "epoch": 0.5278725824800911, "grad_norm": 1.0612741708755493, "learning_rate": 0.0008953356086461889, "loss": 1.9066, "step": 464 }, { "epoch": 0.5290102389078498, "grad_norm": 1.4963552951812744, "learning_rate": 0.0008951080773606371, "loss": 2.1809, "step": 465 }, { "epoch": 0.5301478953356087, "grad_norm": 1.1061559915542603, "learning_rate": 0.0008948805460750852, "loss": 3.068, "step": 466 }, { "epoch": 0.5312855517633674, "grad_norm": 0.9316163659095764, "learning_rate": 0.0008946530147895336, "loss": 1.9232, "step": 467 }, { "epoch": 0.5324232081911263, "grad_norm": 0.8861683011054993, "learning_rate": 0.0008944254835039818, "loss": 2.1368, "step": 468 }, { "epoch": 0.5335608646188851, "grad_norm": 0.835658848285675, "learning_rate": 0.0008941979522184301, "loss": 1.8731, "step": 469 }, { "epoch": 0.534698521046644, "grad_norm": 1.1577521562576294, "learning_rate": 0.0008939704209328783, "loss": 3.0726, "step": 470 }, { "epoch": 0.5358361774744027, "grad_norm": 1.3069539070129395, "learning_rate": 0.0008937428896473265, "loss": 2.8827, "step": 471 }, { "epoch": 0.5369738339021616, 
"grad_norm": 0.884005606174469, "learning_rate": 0.0008935153583617748, "loss": 2.2167, "step": 472 }, { "epoch": 0.5381114903299203, "grad_norm": 1.0972729921340942, "learning_rate": 0.000893287827076223, "loss": 2.8096, "step": 473 }, { "epoch": 0.5392491467576792, "grad_norm": 0.6620914340019226, "learning_rate": 0.0008930602957906712, "loss": 1.8139, "step": 474 }, { "epoch": 0.540386803185438, "grad_norm": 1.0320155620574951, "learning_rate": 0.0008928327645051196, "loss": 2.7753, "step": 475 }, { "epoch": 0.5415244596131968, "grad_norm": 1.195923924446106, "learning_rate": 0.0008926052332195677, "loss": 2.8982, "step": 476 }, { "epoch": 0.5426621160409556, "grad_norm": 0.6206729412078857, "learning_rate": 0.0008923777019340159, "loss": 1.6572, "step": 477 }, { "epoch": 0.5437997724687145, "grad_norm": 0.867581844329834, "learning_rate": 0.0008921501706484642, "loss": 1.3072, "step": 478 }, { "epoch": 0.5449374288964732, "grad_norm": 1.066758632659912, "learning_rate": 0.0008919226393629124, "loss": 2.2283, "step": 479 }, { "epoch": 0.5460750853242321, "grad_norm": 0.9295621514320374, "learning_rate": 0.0008916951080773606, "loss": 1.8157, "step": 480 }, { "epoch": 0.5472127417519909, "grad_norm": 1.0595327615737915, "learning_rate": 0.0008914675767918089, "loss": 2.486, "step": 481 }, { "epoch": 0.5483503981797497, "grad_norm": 1.1606998443603516, "learning_rate": 0.0008912400455062571, "loss": 1.9862, "step": 482 }, { "epoch": 0.5494880546075085, "grad_norm": 1.3078361749649048, "learning_rate": 0.0008910125142207054, "loss": 2.824, "step": 483 }, { "epoch": 0.5506257110352674, "grad_norm": 1.1636848449707031, "learning_rate": 0.0008907849829351537, "loss": 2.9846, "step": 484 }, { "epoch": 0.5517633674630261, "grad_norm": 1.6126782894134521, "learning_rate": 0.0008905574516496019, "loss": 3.7502, "step": 485 }, { "epoch": 0.552901023890785, "grad_norm": 0.9118810296058655, "learning_rate": 0.00089032992036405, "loss": 2.7482, "step": 486 }, { "epoch": 
0.5540386803185438, "grad_norm": 0.9121948480606079, "learning_rate": 0.0008901023890784983, "loss": 1.8244, "step": 487 }, { "epoch": 0.5551763367463026, "grad_norm": 1.749747633934021, "learning_rate": 0.0008898748577929465, "loss": 3.4315, "step": 488 }, { "epoch": 0.5563139931740614, "grad_norm": 0.9559519290924072, "learning_rate": 0.0008896473265073947, "loss": 1.2761, "step": 489 }, { "epoch": 0.5574516496018203, "grad_norm": 1.3748880624771118, "learning_rate": 0.000889419795221843, "loss": 2.4184, "step": 490 }, { "epoch": 0.558589306029579, "grad_norm": 0.9325023889541626, "learning_rate": 0.0008891922639362912, "loss": 1.6256, "step": 491 }, { "epoch": 0.5597269624573379, "grad_norm": 1.2054123878479004, "learning_rate": 0.0008889647326507396, "loss": 2.4543, "step": 492 }, { "epoch": 0.5608646188850968, "grad_norm": 1.1576976776123047, "learning_rate": 0.0008887372013651878, "loss": 1.7206, "step": 493 }, { "epoch": 0.5620022753128555, "grad_norm": 0.8814551830291748, "learning_rate": 0.000888509670079636, "loss": 1.8812, "step": 494 }, { "epoch": 0.5631399317406144, "grad_norm": 0.8528147339820862, "learning_rate": 0.0008882821387940843, "loss": 2.411, "step": 495 }, { "epoch": 0.5642775881683731, "grad_norm": 1.0148645639419556, "learning_rate": 0.0008880546075085324, "loss": 2.1765, "step": 496 }, { "epoch": 0.565415244596132, "grad_norm": 0.7151885032653809, "learning_rate": 0.0008878270762229806, "loss": 1.2428, "step": 497 }, { "epoch": 0.5665529010238908, "grad_norm": 1.7494755983352661, "learning_rate": 0.0008875995449374289, "loss": 4.1417, "step": 498 }, { "epoch": 0.5676905574516496, "grad_norm": 0.8421460390090942, "learning_rate": 0.0008873720136518771, "loss": 1.7912, "step": 499 }, { "epoch": 0.5688282138794084, "grad_norm": 0.9811123013496399, "learning_rate": 0.0008871444823663254, "loss": 2.1765, "step": 500 }, { "epoch": 0.5699658703071673, "grad_norm": 0.9422712922096252, "learning_rate": 0.0008869169510807737, "loss": 1.7157, 
"step": 501 }, { "epoch": 0.571103526734926, "grad_norm": 0.8885846138000488, "learning_rate": 0.0008866894197952219, "loss": 2.2024, "step": 502 }, { "epoch": 0.5722411831626849, "grad_norm": 1.6120095252990723, "learning_rate": 0.0008864618885096701, "loss": 3.8523, "step": 503 }, { "epoch": 0.5733788395904437, "grad_norm": 1.2259505987167358, "learning_rate": 0.0008862343572241184, "loss": 2.3551, "step": 504 }, { "epoch": 0.5745164960182025, "grad_norm": 0.9865310788154602, "learning_rate": 0.0008860068259385665, "loss": 1.0719, "step": 505 }, { "epoch": 0.5756541524459613, "grad_norm": 0.945303201675415, "learning_rate": 0.0008857792946530147, "loss": 1.8286, "step": 506 }, { "epoch": 0.5767918088737202, "grad_norm": 1.099501371383667, "learning_rate": 0.000885551763367463, "loss": 2.4123, "step": 507 }, { "epoch": 0.5779294653014789, "grad_norm": 1.3092284202575684, "learning_rate": 0.0008853242320819112, "loss": 2.8365, "step": 508 }, { "epoch": 0.5790671217292378, "grad_norm": 0.9153149127960205, "learning_rate": 0.0008850967007963595, "loss": 1.4962, "step": 509 }, { "epoch": 0.5802047781569966, "grad_norm": 1.0560232400894165, "learning_rate": 0.0008848691695108078, "loss": 2.0615, "step": 510 }, { "epoch": 0.5813424345847554, "grad_norm": 0.6506679058074951, "learning_rate": 0.000884641638225256, "loss": 1.7931, "step": 511 }, { "epoch": 0.5824800910125142, "grad_norm": 0.6726229786872864, "learning_rate": 0.0008844141069397043, "loss": 1.5145, "step": 512 }, { "epoch": 0.5836177474402731, "grad_norm": 0.8806062340736389, "learning_rate": 0.0008841865756541525, "loss": 1.3302, "step": 513 }, { "epoch": 0.5847554038680318, "grad_norm": 0.8398321270942688, "learning_rate": 0.0008839590443686007, "loss": 1.0688, "step": 514 }, { "epoch": 0.5858930602957907, "grad_norm": 1.0214747190475464, "learning_rate": 0.0008837315130830489, "loss": 2.7142, "step": 515 }, { "epoch": 0.5870307167235495, "grad_norm": 1.1030226945877075, "learning_rate": 
0.0008835039817974971, "loss": 1.7378, "step": 516 }, { "epoch": 0.5881683731513083, "grad_norm": 0.9792185425758362, "learning_rate": 0.0008832764505119454, "loss": 1.9917, "step": 517 }, { "epoch": 0.5893060295790671, "grad_norm": 0.7293819785118103, "learning_rate": 0.0008830489192263937, "loss": 1.7711, "step": 518 }, { "epoch": 0.590443686006826, "grad_norm": 0.9090821743011475, "learning_rate": 0.0008828213879408419, "loss": 2.3188, "step": 519 }, { "epoch": 0.5915813424345847, "grad_norm": 0.9316034317016602, "learning_rate": 0.0008825938566552901, "loss": 1.9596, "step": 520 }, { "epoch": 0.5927189988623436, "grad_norm": 1.022985577583313, "learning_rate": 0.0008823663253697384, "loss": 1.7448, "step": 521 }, { "epoch": 0.5938566552901023, "grad_norm": 1.1398788690567017, "learning_rate": 0.0008821387940841866, "loss": 2.932, "step": 522 }, { "epoch": 0.5949943117178612, "grad_norm": 1.1560657024383545, "learning_rate": 0.0008819112627986348, "loss": 2.2647, "step": 523 }, { "epoch": 0.59613196814562, "grad_norm": 1.1563608646392822, "learning_rate": 0.0008816837315130831, "loss": 2.6389, "step": 524 }, { "epoch": 0.5972696245733788, "grad_norm": 0.6971971988677979, "learning_rate": 0.0008814562002275312, "loss": 1.6904, "step": 525 }, { "epoch": 0.5984072810011376, "grad_norm": 1.1338340044021606, "learning_rate": 0.0008812286689419795, "loss": 2.7721, "step": 526 }, { "epoch": 0.5995449374288965, "grad_norm": 0.9443331360816956, "learning_rate": 0.0008810011376564278, "loss": 1.673, "step": 527 }, { "epoch": 0.6006825938566553, "grad_norm": 0.7245666980743408, "learning_rate": 0.000880773606370876, "loss": 1.716, "step": 528 }, { "epoch": 0.6018202502844141, "grad_norm": 0.9884563684463501, "learning_rate": 0.0008805460750853242, "loss": 3.0318, "step": 529 }, { "epoch": 0.602957906712173, "grad_norm": 1.2986050844192505, "learning_rate": 0.0008803185437997725, "loss": 2.7565, "step": 530 }, { "epoch": 0.6040955631399317, "grad_norm": 0.9133585691452026, 
"learning_rate": 0.0008800910125142207, "loss": 2.1352, "step": 531 }, { "epoch": 0.6052332195676906, "grad_norm": 0.7802772521972656, "learning_rate": 0.000879863481228669, "loss": 1.487, "step": 532 }, { "epoch": 0.6063708759954494, "grad_norm": 1.488508701324463, "learning_rate": 0.0008796359499431173, "loss": 3.6104, "step": 533 }, { "epoch": 0.6075085324232082, "grad_norm": 1.4523831605911255, "learning_rate": 0.0008794084186575654, "loss": 4.1854, "step": 534 }, { "epoch": 0.608646188850967, "grad_norm": 0.9844598174095154, "learning_rate": 0.0008791808873720137, "loss": 2.2759, "step": 535 }, { "epoch": 0.6097838452787259, "grad_norm": 1.0784645080566406, "learning_rate": 0.0008789533560864619, "loss": 2.6678, "step": 536 }, { "epoch": 0.6109215017064846, "grad_norm": 1.2045968770980835, "learning_rate": 0.0008787258248009101, "loss": 2.4359, "step": 537 }, { "epoch": 0.6120591581342435, "grad_norm": 1.1759347915649414, "learning_rate": 0.0008784982935153584, "loss": 2.6307, "step": 538 }, { "epoch": 0.6131968145620023, "grad_norm": 1.0141427516937256, "learning_rate": 0.0008782707622298066, "loss": 2.5533, "step": 539 }, { "epoch": 0.6143344709897611, "grad_norm": 0.986733615398407, "learning_rate": 0.0008780432309442548, "loss": 1.1977, "step": 540 }, { "epoch": 0.6154721274175199, "grad_norm": 1.7550513744354248, "learning_rate": 0.0008778156996587031, "loss": 1.977, "step": 541 }, { "epoch": 0.6166097838452788, "grad_norm": 1.0284448862075806, "learning_rate": 0.0008775881683731514, "loss": 2.2885, "step": 542 }, { "epoch": 0.6177474402730375, "grad_norm": 0.8970025181770325, "learning_rate": 0.0008773606370875996, "loss": 1.519, "step": 543 }, { "epoch": 0.6188850967007964, "grad_norm": 1.1675552129745483, "learning_rate": 0.0008771331058020478, "loss": 2.5995, "step": 544 }, { "epoch": 0.6200227531285551, "grad_norm": 0.7909214496612549, "learning_rate": 0.000876905574516496, "loss": 2.4494, "step": 545 }, { "epoch": 0.621160409556314, "grad_norm": 
1.1816651821136475, "learning_rate": 0.0008766780432309442, "loss": 2.0798, "step": 546 }, { "epoch": 0.6222980659840728, "grad_norm": 1.2361869812011719, "learning_rate": 0.0008764505119453925, "loss": 2.1153, "step": 547 }, { "epoch": 0.6234357224118316, "grad_norm": 1.498818278312683, "learning_rate": 0.0008762229806598407, "loss": 3.6819, "step": 548 }, { "epoch": 0.6245733788395904, "grad_norm": 1.2410242557525635, "learning_rate": 0.0008759954493742889, "loss": 2.3829, "step": 549 }, { "epoch": 0.6257110352673493, "grad_norm": 1.1705360412597656, "learning_rate": 0.0008757679180887373, "loss": 1.718, "step": 550 }, { "epoch": 0.626848691695108, "grad_norm": 0.9694558382034302, "learning_rate": 0.0008755403868031855, "loss": 2.1928, "step": 551 }, { "epoch": 0.6279863481228669, "grad_norm": 0.6821228265762329, "learning_rate": 0.0008753128555176338, "loss": 1.406, "step": 552 }, { "epoch": 0.6291240045506257, "grad_norm": 0.8572499752044678, "learning_rate": 0.000875085324232082, "loss": 1.6783, "step": 553 }, { "epoch": 0.6302616609783845, "grad_norm": 0.8171662092208862, "learning_rate": 0.0008748577929465301, "loss": 2.4399, "step": 554 }, { "epoch": 0.6313993174061433, "grad_norm": 1.320075273513794, "learning_rate": 0.0008746302616609784, "loss": 1.4817, "step": 555 }, { "epoch": 0.6325369738339022, "grad_norm": 0.7951433658599854, "learning_rate": 0.0008744027303754266, "loss": 1.7655, "step": 556 }, { "epoch": 0.6336746302616609, "grad_norm": 0.7013391256332397, "learning_rate": 0.0008741751990898748, "loss": 1.3957, "step": 557 }, { "epoch": 0.6348122866894198, "grad_norm": 0.8427788019180298, "learning_rate": 0.0008739476678043232, "loss": 1.5612, "step": 558 }, { "epoch": 0.6359499431171786, "grad_norm": 1.0773825645446777, "learning_rate": 0.0008737201365187714, "loss": 1.8739, "step": 559 }, { "epoch": 0.6370875995449374, "grad_norm": 0.7293964624404907, "learning_rate": 0.0008734926052332196, "loss": 1.1344, "step": 560 }, { "epoch": 
0.6382252559726962, "grad_norm": 0.8789951205253601, "learning_rate": 0.0008732650739476679, "loss": 2.1382, "step": 561 }, { "epoch": 0.6393629124004551, "grad_norm": 1.86644446849823, "learning_rate": 0.0008730375426621161, "loss": 4.1822, "step": 562 }, { "epoch": 0.6405005688282139, "grad_norm": 1.1930569410324097, "learning_rate": 0.0008728100113765643, "loss": 2.7447, "step": 563 }, { "epoch": 0.6416382252559727, "grad_norm": 0.9682884812355042, "learning_rate": 0.0008725824800910125, "loss": 2.0626, "step": 564 }, { "epoch": 0.6427758816837316, "grad_norm": 0.7793911695480347, "learning_rate": 0.0008723549488054607, "loss": 1.643, "step": 565 }, { "epoch": 0.6439135381114903, "grad_norm": 1.0574642419815063, "learning_rate": 0.0008721274175199089, "loss": 2.1083, "step": 566 }, { "epoch": 0.6450511945392492, "grad_norm": 0.8339124917984009, "learning_rate": 0.0008718998862343573, "loss": 1.7013, "step": 567 }, { "epoch": 0.646188850967008, "grad_norm": 0.7775769829750061, "learning_rate": 0.0008716723549488055, "loss": 1.3428, "step": 568 }, { "epoch": 0.6473265073947668, "grad_norm": 1.089309573173523, "learning_rate": 0.0008714448236632537, "loss": 2.1378, "step": 569 }, { "epoch": 0.6484641638225256, "grad_norm": 1.043135166168213, "learning_rate": 0.000871217292377702, "loss": 3.3488, "step": 570 }, { "epoch": 0.6496018202502845, "grad_norm": 0.9596251249313354, "learning_rate": 0.0008709897610921502, "loss": 2.3173, "step": 571 }, { "epoch": 0.6507394766780432, "grad_norm": 0.8774250745773315, "learning_rate": 0.0008707622298065985, "loss": 2.1654, "step": 572 }, { "epoch": 0.6518771331058021, "grad_norm": 0.7768117189407349, "learning_rate": 0.0008705346985210466, "loss": 1.56, "step": 573 }, { "epoch": 0.6530147895335608, "grad_norm": 0.9584445953369141, "learning_rate": 0.0008703071672354948, "loss": 1.8768, "step": 574 }, { "epoch": 0.6541524459613197, "grad_norm": 0.7490730285644531, "learning_rate": 0.0008700796359499432, "loss": 1.5192, "step": 
575 }, { "epoch": 0.6552901023890785, "grad_norm": 0.779798686504364, "learning_rate": 0.0008698521046643914, "loss": 1.6678, "step": 576 }, { "epoch": 0.6564277588168373, "grad_norm": 1.2081419229507446, "learning_rate": 0.0008696245733788396, "loss": 2.5276, "step": 577 }, { "epoch": 0.6575654152445961, "grad_norm": 1.5270092487335205, "learning_rate": 0.0008693970420932879, "loss": 3.8846, "step": 578 }, { "epoch": 0.658703071672355, "grad_norm": 0.9504697918891907, "learning_rate": 0.0008691695108077361, "loss": 1.8834, "step": 579 }, { "epoch": 0.6598407281001137, "grad_norm": 0.8998041749000549, "learning_rate": 0.0008689419795221843, "loss": 2.0045, "step": 580 }, { "epoch": 0.6609783845278726, "grad_norm": 0.8134379386901855, "learning_rate": 0.0008687144482366326, "loss": 0.9556, "step": 581 }, { "epoch": 0.6621160409556314, "grad_norm": 0.7257466316223145, "learning_rate": 0.0008684869169510808, "loss": 1.2292, "step": 582 }, { "epoch": 0.6632536973833902, "grad_norm": 1.3256492614746094, "learning_rate": 0.0008682593856655289, "loss": 3.8102, "step": 583 }, { "epoch": 0.664391353811149, "grad_norm": 0.7932450771331787, "learning_rate": 0.0008680318543799773, "loss": 2.2484, "step": 584 }, { "epoch": 0.6655290102389079, "grad_norm": 1.0340895652770996, "learning_rate": 0.0008678043230944255, "loss": 2.1459, "step": 585 }, { "epoch": 0.6666666666666666, "grad_norm": 1.0674877166748047, "learning_rate": 0.0008675767918088737, "loss": 2.7635, "step": 586 }, { "epoch": 0.6678043230944255, "grad_norm": 1.0885896682739258, "learning_rate": 0.000867349260523322, "loss": 2.525, "step": 587 }, { "epoch": 0.6689419795221843, "grad_norm": 0.6129403710365295, "learning_rate": 0.0008671217292377702, "loss": 1.225, "step": 588 }, { "epoch": 0.6700796359499431, "grad_norm": 0.9944592118263245, "learning_rate": 0.0008668941979522184, "loss": 1.9925, "step": 589 }, { "epoch": 0.6712172923777019, "grad_norm": 1.1834667921066284, "learning_rate": 0.0008666666666666667, 
"loss": 2.3584, "step": 590 }, { "epoch": 0.6723549488054608, "grad_norm": 1.3296507596969604, "learning_rate": 0.0008664391353811149, "loss": 2.7873, "step": 591 }, { "epoch": 0.6734926052332195, "grad_norm": 0.7346708178520203, "learning_rate": 0.0008662116040955633, "loss": 1.5522, "step": 592 }, { "epoch": 0.6746302616609784, "grad_norm": 1.405709147453308, "learning_rate": 0.0008659840728100114, "loss": 3.4668, "step": 593 }, { "epoch": 0.6757679180887372, "grad_norm": 0.925556480884552, "learning_rate": 0.0008657565415244596, "loss": 2.5614, "step": 594 }, { "epoch": 0.676905574516496, "grad_norm": 0.6485567688941956, "learning_rate": 0.0008655290102389079, "loss": 1.3796, "step": 595 }, { "epoch": 0.6780432309442548, "grad_norm": 1.2317935228347778, "learning_rate": 0.0008653014789533561, "loss": 2.7619, "step": 596 }, { "epoch": 0.6791808873720137, "grad_norm": 1.8749306201934814, "learning_rate": 0.0008650739476678043, "loss": 2.0424, "step": 597 }, { "epoch": 0.6803185437997725, "grad_norm": 1.0405480861663818, "learning_rate": 0.0008648464163822526, "loss": 1.6496, "step": 598 }, { "epoch": 0.6814562002275313, "grad_norm": 0.7675598859786987, "learning_rate": 0.0008646188850967008, "loss": 1.5025, "step": 599 }, { "epoch": 0.6825938566552902, "grad_norm": 0.9507393836975098, "learning_rate": 0.000864391353811149, "loss": 2.1572, "step": 600 }, { "epoch": 0.6837315130830489, "grad_norm": 0.7208441495895386, "learning_rate": 0.0008641638225255974, "loss": 1.8724, "step": 601 }, { "epoch": 0.6848691695108078, "grad_norm": 1.0471227169036865, "learning_rate": 0.0008639362912400455, "loss": 1.5183, "step": 602 }, { "epoch": 0.6860068259385665, "grad_norm": 0.7617486119270325, "learning_rate": 0.0008637087599544937, "loss": 1.9664, "step": 603 }, { "epoch": 0.6871444823663254, "grad_norm": 1.7303751707077026, "learning_rate": 0.000863481228668942, "loss": 3.4134, "step": 604 }, { "epoch": 0.6882821387940842, "grad_norm": 1.1979167461395264, "learning_rate": 
0.0008632536973833902, "loss": 2.1287, "step": 605 }, { "epoch": 0.689419795221843, "grad_norm": 0.8797517418861389, "learning_rate": 0.0008630261660978384, "loss": 1.4359, "step": 606 }, { "epoch": 0.6905574516496018, "grad_norm": 1.3012560606002808, "learning_rate": 0.0008627986348122867, "loss": 1.9451, "step": 607 }, { "epoch": 0.6916951080773607, "grad_norm": 0.8899447917938232, "learning_rate": 0.000862571103526735, "loss": 1.9974, "step": 608 }, { "epoch": 0.6928327645051194, "grad_norm": 1.449118733406067, "learning_rate": 0.0008623435722411832, "loss": 3.0255, "step": 609 }, { "epoch": 0.6939704209328783, "grad_norm": 1.4355418682098389, "learning_rate": 0.0008621160409556315, "loss": 3.6942, "step": 610 }, { "epoch": 0.6951080773606371, "grad_norm": 0.9907065033912659, "learning_rate": 0.0008618885096700797, "loss": 1.8645, "step": 611 }, { "epoch": 0.6962457337883959, "grad_norm": 0.8287332057952881, "learning_rate": 0.0008616609783845278, "loss": 1.909, "step": 612 }, { "epoch": 0.6973833902161547, "grad_norm": 1.207824468612671, "learning_rate": 0.0008614334470989761, "loss": 2.1948, "step": 613 }, { "epoch": 0.6985210466439136, "grad_norm": 0.6670779585838318, "learning_rate": 0.0008612059158134243, "loss": 1.2597, "step": 614 }, { "epoch": 0.6996587030716723, "grad_norm": 0.6023557186126709, "learning_rate": 0.0008609783845278726, "loss": 1.2117, "step": 615 }, { "epoch": 0.7007963594994312, "grad_norm": 1.5673305988311768, "learning_rate": 0.0008607508532423208, "loss": 3.1698, "step": 616 }, { "epoch": 0.70193401592719, "grad_norm": 0.9000980257987976, "learning_rate": 0.000860523321956769, "loss": 1.9187, "step": 617 }, { "epoch": 0.7030716723549488, "grad_norm": 0.9901409149169922, "learning_rate": 0.0008602957906712174, "loss": 2.3486, "step": 618 }, { "epoch": 0.7042093287827076, "grad_norm": 0.7944636940956116, "learning_rate": 0.0008600682593856656, "loss": 2.0677, "step": 619 }, { "epoch": 0.7053469852104665, "grad_norm": 0.9612780213356018, 
"learning_rate": 0.0008598407281001138, "loss": 1.693, "step": 620 }, { "epoch": 0.7064846416382252, "grad_norm": 1.2384001016616821, "learning_rate": 0.0008596131968145621, "loss": 2.9955, "step": 621 }, { "epoch": 0.7076222980659841, "grad_norm": 1.5061485767364502, "learning_rate": 0.0008593856655290102, "loss": 1.945, "step": 622 }, { "epoch": 0.7087599544937428, "grad_norm": 1.0285781621932983, "learning_rate": 0.0008591581342434584, "loss": 2.5533, "step": 623 }, { "epoch": 0.7098976109215017, "grad_norm": 0.783734917640686, "learning_rate": 0.0008589306029579067, "loss": 1.6898, "step": 624 }, { "epoch": 0.7110352673492605, "grad_norm": 0.7737388610839844, "learning_rate": 0.000858703071672355, "loss": 2.4584, "step": 625 }, { "epoch": 0.7121729237770194, "grad_norm": 0.6664589047431946, "learning_rate": 0.0008584755403868032, "loss": 1.7401, "step": 626 }, { "epoch": 0.7133105802047781, "grad_norm": 0.7452929019927979, "learning_rate": 0.0008582480091012515, "loss": 1.3864, "step": 627 }, { "epoch": 0.714448236632537, "grad_norm": 0.8343956470489502, "learning_rate": 0.0008580204778156997, "loss": 1.4364, "step": 628 }, { "epoch": 0.7155858930602957, "grad_norm": 0.8422945737838745, "learning_rate": 0.0008577929465301479, "loss": 2.3053, "step": 629 }, { "epoch": 0.7167235494880546, "grad_norm": 1.2495871782302856, "learning_rate": 0.0008575654152445962, "loss": 3.8026, "step": 630 }, { "epoch": 0.7178612059158134, "grad_norm": 1.0053642988204956, "learning_rate": 0.0008573378839590444, "loss": 2.2322, "step": 631 }, { "epoch": 0.7189988623435722, "grad_norm": 1.342475414276123, "learning_rate": 0.0008571103526734925, "loss": 2.2204, "step": 632 }, { "epoch": 0.7201365187713311, "grad_norm": 1.1478005647659302, "learning_rate": 0.0008568828213879408, "loss": 2.0024, "step": 633 }, { "epoch": 0.7212741751990899, "grad_norm": 1.3100638389587402, "learning_rate": 0.0008566552901023891, "loss": 1.7865, "step": 634 }, { "epoch": 0.7224118316268487, "grad_norm": 
0.7397316694259644, "learning_rate": 0.0008564277588168374, "loss": 0.9055, "step": 635 }, { "epoch": 0.7235494880546075, "grad_norm": 1.5261900424957275, "learning_rate": 0.0008562002275312856, "loss": 2.518, "step": 636 }, { "epoch": 0.7246871444823664, "grad_norm": 1.1781413555145264, "learning_rate": 0.0008559726962457338, "loss": 2.4799, "step": 637 }, { "epoch": 0.7258248009101251, "grad_norm": 0.9748155474662781, "learning_rate": 0.0008557451649601821, "loss": 1.9048, "step": 638 }, { "epoch": 0.726962457337884, "grad_norm": 0.9338977336883545, "learning_rate": 0.0008555176336746303, "loss": 2.0054, "step": 639 }, { "epoch": 0.7281001137656428, "grad_norm": 1.5872938632965088, "learning_rate": 0.0008552901023890785, "loss": 2.7386, "step": 640 }, { "epoch": 0.7292377701934016, "grad_norm": 0.899350643157959, "learning_rate": 0.0008550625711035267, "loss": 1.694, "step": 641 }, { "epoch": 0.7303754266211604, "grad_norm": 1.3371248245239258, "learning_rate": 0.000854835039817975, "loss": 2.4103, "step": 642 }, { "epoch": 0.7315130830489193, "grad_norm": 0.9004554152488708, "learning_rate": 0.0008546075085324232, "loss": 1.7984, "step": 643 }, { "epoch": 0.732650739476678, "grad_norm": 1.229750394821167, "learning_rate": 0.0008543799772468715, "loss": 2.8663, "step": 644 }, { "epoch": 0.7337883959044369, "grad_norm": 0.8719218373298645, "learning_rate": 0.0008541524459613197, "loss": 1.5535, "step": 645 }, { "epoch": 0.7349260523321957, "grad_norm": 1.2239844799041748, "learning_rate": 0.0008539249146757679, "loss": 2.5728, "step": 646 }, { "epoch": 0.7360637087599545, "grad_norm": 0.9655494093894958, "learning_rate": 0.0008536973833902162, "loss": 1.7532, "step": 647 }, { "epoch": 0.7372013651877133, "grad_norm": 0.8480125665664673, "learning_rate": 0.0008534698521046644, "loss": 1.3947, "step": 648 }, { "epoch": 0.7383390216154722, "grad_norm": 1.0782033205032349, "learning_rate": 0.0008532423208191126, "loss": 2.2726, "step": 649 }, { "epoch": 
0.7394766780432309, "grad_norm": 0.8517910838127136, "learning_rate": 0.000853014789533561, "loss": 1.7146, "step": 650 }, { "epoch": 0.7406143344709898, "grad_norm": 0.6702635288238525, "learning_rate": 0.0008527872582480091, "loss": 1.4064, "step": 651 }, { "epoch": 0.7417519908987485, "grad_norm": 0.807049572467804, "learning_rate": 0.0008525597269624573, "loss": 1.5396, "step": 652 }, { "epoch": 0.7428896473265074, "grad_norm": 1.3272011280059814, "learning_rate": 0.0008523321956769056, "loss": 2.6018, "step": 653 }, { "epoch": 0.7440273037542662, "grad_norm": 0.8511213064193726, "learning_rate": 0.0008521046643913538, "loss": 2.0945, "step": 654 }, { "epoch": 0.745164960182025, "grad_norm": 1.0640743970870972, "learning_rate": 0.0008518771331058021, "loss": 3.3749, "step": 655 }, { "epoch": 0.7463026166097838, "grad_norm": 0.9631751179695129, "learning_rate": 0.0008516496018202503, "loss": 2.4991, "step": 656 }, { "epoch": 0.7474402730375427, "grad_norm": 1.0921177864074707, "learning_rate": 0.0008514220705346985, "loss": 1.9951, "step": 657 }, { "epoch": 0.7485779294653014, "grad_norm": 1.032777190208435, "learning_rate": 0.0008511945392491469, "loss": 1.9648, "step": 658 }, { "epoch": 0.7497155858930603, "grad_norm": 0.7291549444198608, "learning_rate": 0.0008509670079635951, "loss": 2.0568, "step": 659 }, { "epoch": 0.7508532423208191, "grad_norm": 1.0137559175491333, "learning_rate": 0.0008507394766780433, "loss": 2.5676, "step": 660 }, { "epoch": 0.7519908987485779, "grad_norm": 0.9185921549797058, "learning_rate": 0.0008505119453924915, "loss": 1.6855, "step": 661 }, { "epoch": 0.7531285551763367, "grad_norm": 1.0048507452011108, "learning_rate": 0.0008502844141069397, "loss": 1.5001, "step": 662 }, { "epoch": 0.7542662116040956, "grad_norm": 1.2506744861602783, "learning_rate": 0.0008500568828213879, "loss": 2.4675, "step": 663 }, { "epoch": 0.7554038680318543, "grad_norm": 1.7534509897232056, "learning_rate": 0.0008498293515358362, "loss": 4.192, 
"step": 664 }, { "epoch": 0.7565415244596132, "grad_norm": 1.0792453289031982, "learning_rate": 0.0008496018202502844, "loss": 2.2308, "step": 665 }, { "epoch": 0.757679180887372, "grad_norm": 0.9935582280158997, "learning_rate": 0.0008493742889647326, "loss": 1.499, "step": 666 }, { "epoch": 0.7588168373151308, "grad_norm": 0.9015896916389465, "learning_rate": 0.000849146757679181, "loss": 1.6236, "step": 667 }, { "epoch": 0.7599544937428896, "grad_norm": 1.0141130685806274, "learning_rate": 0.0008489192263936292, "loss": 1.9243, "step": 668 }, { "epoch": 0.7610921501706485, "grad_norm": 0.5651462078094482, "learning_rate": 0.0008486916951080774, "loss": 0.778, "step": 669 }, { "epoch": 0.7622298065984073, "grad_norm": 1.0191128253936768, "learning_rate": 0.0008484641638225257, "loss": 2.0027, "step": 670 }, { "epoch": 0.7633674630261661, "grad_norm": 1.0855798721313477, "learning_rate": 0.0008482366325369738, "loss": 2.4567, "step": 671 }, { "epoch": 0.764505119453925, "grad_norm": 1.7190563678741455, "learning_rate": 0.000848009101251422, "loss": 2.7192, "step": 672 }, { "epoch": 0.7656427758816837, "grad_norm": 1.0108932256698608, "learning_rate": 0.0008477815699658703, "loss": 1.2952, "step": 673 }, { "epoch": 0.7667804323094426, "grad_norm": 2.3063066005706787, "learning_rate": 0.0008475540386803185, "loss": 1.726, "step": 674 }, { "epoch": 0.7679180887372014, "grad_norm": 0.5806533098220825, "learning_rate": 0.0008473265073947669, "loss": 1.3231, "step": 675 }, { "epoch": 0.7690557451649602, "grad_norm": 0.9485315680503845, "learning_rate": 0.0008470989761092151, "loss": 1.7927, "step": 676 }, { "epoch": 0.770193401592719, "grad_norm": 0.7798807621002197, "learning_rate": 0.0008468714448236633, "loss": 1.4712, "step": 677 }, { "epoch": 0.7713310580204779, "grad_norm": 1.0551048517227173, "learning_rate": 0.0008466439135381116, "loss": 1.9981, "step": 678 }, { "epoch": 0.7724687144482366, "grad_norm": 1.0884678363800049, "learning_rate": 
0.0008464163822525598, "loss": 2.0978, "step": 679 }, { "epoch": 0.7736063708759955, "grad_norm": 0.9386458992958069, "learning_rate": 0.0008461888509670079, "loss": 2.5401, "step": 680 }, { "epoch": 0.7747440273037542, "grad_norm": 1.1224387884140015, "learning_rate": 0.0008459613196814562, "loss": 2.5177, "step": 681 }, { "epoch": 0.7758816837315131, "grad_norm": 0.7325891852378845, "learning_rate": 0.0008457337883959044, "loss": 1.9845, "step": 682 }, { "epoch": 0.7770193401592719, "grad_norm": 0.8216614723205566, "learning_rate": 0.0008455062571103526, "loss": 1.7352, "step": 683 }, { "epoch": 0.7781569965870307, "grad_norm": 0.8514799475669861, "learning_rate": 0.000845278725824801, "loss": 1.6026, "step": 684 }, { "epoch": 0.7792946530147895, "grad_norm": 1.0461024045944214, "learning_rate": 0.0008450511945392492, "loss": 2.7851, "step": 685 }, { "epoch": 0.7804323094425484, "grad_norm": 1.0738078355789185, "learning_rate": 0.0008448236632536974, "loss": 2.2409, "step": 686 }, { "epoch": 0.7815699658703071, "grad_norm": 1.5895496606826782, "learning_rate": 0.0008445961319681457, "loss": 2.6157, "step": 687 }, { "epoch": 0.782707622298066, "grad_norm": 1.095568299293518, "learning_rate": 0.0008443686006825939, "loss": 3.4401, "step": 688 }, { "epoch": 0.7838452787258248, "grad_norm": 1.2721914052963257, "learning_rate": 0.0008441410693970421, "loss": 3.7557, "step": 689 }, { "epoch": 0.7849829351535836, "grad_norm": 0.8100789785385132, "learning_rate": 0.0008439135381114903, "loss": 1.8537, "step": 690 }, { "epoch": 0.7861205915813424, "grad_norm": 0.8364444375038147, "learning_rate": 0.0008436860068259385, "loss": 1.9646, "step": 691 }, { "epoch": 0.7872582480091013, "grad_norm": 0.8454108834266663, "learning_rate": 0.0008434584755403867, "loss": 1.3103, "step": 692 }, { "epoch": 0.78839590443686, "grad_norm": 1.0426613092422485, "learning_rate": 0.0008432309442548351, "loss": 2.1801, "step": 693 }, { "epoch": 0.7895335608646189, "grad_norm": 
0.6206464171409607, "learning_rate": 0.0008430034129692833, "loss": 0.6764, "step": 694 }, { "epoch": 0.7906712172923777, "grad_norm": 1.0742131471633911, "learning_rate": 0.0008427758816837315, "loss": 2.0438, "step": 695 }, { "epoch": 0.7918088737201365, "grad_norm": 1.2915891408920288, "learning_rate": 0.0008425483503981798, "loss": 2.3515, "step": 696 }, { "epoch": 0.7929465301478953, "grad_norm": 1.545964002609253, "learning_rate": 0.000842320819112628, "loss": 3.3463, "step": 697 }, { "epoch": 0.7940841865756542, "grad_norm": 0.8047581911087036, "learning_rate": 0.0008420932878270763, "loss": 2.392, "step": 698 }, { "epoch": 0.7952218430034129, "grad_norm": 1.0554293394088745, "learning_rate": 0.0008418657565415245, "loss": 1.8506, "step": 699 }, { "epoch": 0.7963594994311718, "grad_norm": 1.518147587776184, "learning_rate": 0.0008416382252559726, "loss": 4.3646, "step": 700 }, { "epoch": 0.7974971558589306, "grad_norm": 0.942899227142334, "learning_rate": 0.000841410693970421, "loss": 1.8301, "step": 701 }, { "epoch": 0.7986348122866894, "grad_norm": 1.1088321208953857, "learning_rate": 0.0008411831626848692, "loss": 3.0843, "step": 702 }, { "epoch": 0.7997724687144482, "grad_norm": 1.0423277616500854, "learning_rate": 0.0008409556313993174, "loss": 2.6175, "step": 703 }, { "epoch": 0.800910125142207, "grad_norm": 0.7414242625236511, "learning_rate": 0.0008407281001137657, "loss": 1.399, "step": 704 }, { "epoch": 0.8020477815699659, "grad_norm": 1.0195127725601196, "learning_rate": 0.0008405005688282139, "loss": 2.1314, "step": 705 }, { "epoch": 0.8031854379977247, "grad_norm": 1.4709278345108032, "learning_rate": 0.0008402730375426621, "loss": 3.2922, "step": 706 }, { "epoch": 0.8043230944254836, "grad_norm": 1.656906247138977, "learning_rate": 0.0008400455062571104, "loss": 3.844, "step": 707 }, { "epoch": 0.8054607508532423, "grad_norm": 0.985885500907898, "learning_rate": 0.0008398179749715586, "loss": 2.9415, "step": 708 }, { "epoch": 
0.8065984072810012, "grad_norm": 1.5020736455917358, "learning_rate": 0.0008395904436860067, "loss": 2.183, "step": 709 }, { "epoch": 0.8077360637087599, "grad_norm": 0.7547074556350708, "learning_rate": 0.0008393629124004551, "loss": 1.8464, "step": 710 }, { "epoch": 0.8088737201365188, "grad_norm": 1.1752091646194458, "learning_rate": 0.0008391353811149033, "loss": 2.5145, "step": 711 }, { "epoch": 0.8100113765642776, "grad_norm": 0.7590292096138, "learning_rate": 0.0008389078498293515, "loss": 1.4564, "step": 712 }, { "epoch": 0.8111490329920364, "grad_norm": 0.8058563470840454, "learning_rate": 0.0008386803185437998, "loss": 1.7101, "step": 713 }, { "epoch": 0.8122866894197952, "grad_norm": 0.971361517906189, "learning_rate": 0.000838452787258248, "loss": 1.8707, "step": 714 }, { "epoch": 0.8134243458475541, "grad_norm": 1.3266187906265259, "learning_rate": 0.0008382252559726962, "loss": 2.8622, "step": 715 }, { "epoch": 0.8145620022753128, "grad_norm": 1.2026985883712769, "learning_rate": 0.0008379977246871445, "loss": 2.386, "step": 716 }, { "epoch": 0.8156996587030717, "grad_norm": 1.041385531425476, "learning_rate": 0.0008377701934015928, "loss": 2.1941, "step": 717 }, { "epoch": 0.8168373151308305, "grad_norm": 0.9578999280929565, "learning_rate": 0.0008375426621160411, "loss": 2.4792, "step": 718 }, { "epoch": 0.8179749715585893, "grad_norm": 1.569138765335083, "learning_rate": 0.0008373151308304892, "loss": 2.5056, "step": 719 }, { "epoch": 0.8191126279863481, "grad_norm": 1.2005671262741089, "learning_rate": 0.0008370875995449374, "loss": 2.8101, "step": 720 }, { "epoch": 0.820250284414107, "grad_norm": 0.8075481057167053, "learning_rate": 0.0008368600682593857, "loss": 1.1596, "step": 721 }, { "epoch": 0.8213879408418657, "grad_norm": 1.131332278251648, "learning_rate": 0.0008366325369738339, "loss": 2.2506, "step": 722 }, { "epoch": 0.8225255972696246, "grad_norm": 1.0563104152679443, "learning_rate": 0.0008364050056882821, "loss": 1.9322, "step": 723 
}, { "epoch": 0.8236632536973834, "grad_norm": 0.7609984874725342, "learning_rate": 0.0008361774744027304, "loss": 1.8229, "step": 724 }, { "epoch": 0.8248009101251422, "grad_norm": 0.8261270523071289, "learning_rate": 0.0008359499431171786, "loss": 2.785, "step": 725 }, { "epoch": 0.825938566552901, "grad_norm": 0.5597876310348511, "learning_rate": 0.0008357224118316269, "loss": 1.0098, "step": 726 }, { "epoch": 0.8270762229806599, "grad_norm": 1.219977617263794, "learning_rate": 0.0008354948805460752, "loss": 3.3632, "step": 727 }, { "epoch": 0.8282138794084186, "grad_norm": 1.1048645973205566, "learning_rate": 0.0008352673492605234, "loss": 3.235, "step": 728 }, { "epoch": 0.8293515358361775, "grad_norm": 1.0460500717163086, "learning_rate": 0.0008350398179749715, "loss": 1.7857, "step": 729 }, { "epoch": 0.8304891922639362, "grad_norm": 1.144655466079712, "learning_rate": 0.0008348122866894198, "loss": 2.25, "step": 730 }, { "epoch": 0.8316268486916951, "grad_norm": 0.9807350039482117, "learning_rate": 0.000834584755403868, "loss": 2.4989, "step": 731 }, { "epoch": 0.8327645051194539, "grad_norm": 0.9099501371383667, "learning_rate": 0.0008343572241183162, "loss": 2.3824, "step": 732 }, { "epoch": 0.8339021615472128, "grad_norm": 0.6631197333335876, "learning_rate": 0.0008341296928327645, "loss": 1.6735, "step": 733 }, { "epoch": 0.8350398179749715, "grad_norm": 0.6548817157745361, "learning_rate": 0.0008339021615472128, "loss": 1.5283, "step": 734 }, { "epoch": 0.8361774744027304, "grad_norm": 0.6837170720100403, "learning_rate": 0.000833674630261661, "loss": 1.3024, "step": 735 }, { "epoch": 0.8373151308304891, "grad_norm": 0.8492507934570312, "learning_rate": 0.0008334470989761093, "loss": 1.6662, "step": 736 }, { "epoch": 0.838452787258248, "grad_norm": 1.2175747156143188, "learning_rate": 0.0008332195676905575, "loss": 2.804, "step": 737 }, { "epoch": 0.8395904436860068, "grad_norm": 1.0982182025909424, "learning_rate": 0.0008329920364050058, "loss": 
2.1759, "step": 738 }, { "epoch": 0.8407281001137656, "grad_norm": 1.106663465499878, "learning_rate": 0.0008327645051194539, "loss": 2.0419, "step": 739 }, { "epoch": 0.8418657565415245, "grad_norm": 0.7106451988220215, "learning_rate": 0.0008325369738339021, "loss": 1.2006, "step": 740 }, { "epoch": 0.8430034129692833, "grad_norm": 1.1868878602981567, "learning_rate": 0.0008323094425483504, "loss": 2.0938, "step": 741 }, { "epoch": 0.8441410693970421, "grad_norm": 1.006433367729187, "learning_rate": 0.0008320819112627986, "loss": 1.7796, "step": 742 }, { "epoch": 0.8452787258248009, "grad_norm": 1.0175524950027466, "learning_rate": 0.0008318543799772469, "loss": 2.5526, "step": 743 }, { "epoch": 0.8464163822525598, "grad_norm": 0.9404505491256714, "learning_rate": 0.0008316268486916952, "loss": 1.9948, "step": 744 }, { "epoch": 0.8475540386803185, "grad_norm": 0.9086321592330933, "learning_rate": 0.0008313993174061434, "loss": 1.9277, "step": 745 }, { "epoch": 0.8486916951080774, "grad_norm": 0.7922766208648682, "learning_rate": 0.0008311717861205916, "loss": 2.3355, "step": 746 }, { "epoch": 0.8498293515358362, "grad_norm": 0.8049002289772034, "learning_rate": 0.0008309442548350399, "loss": 1.7969, "step": 747 }, { "epoch": 0.850967007963595, "grad_norm": 0.8303267955780029, "learning_rate": 0.000830716723549488, "loss": 1.4973, "step": 748 }, { "epoch": 0.8521046643913538, "grad_norm": 0.7163656949996948, "learning_rate": 0.0008304891922639362, "loss": 1.7677, "step": 749 }, { "epoch": 0.8532423208191127, "grad_norm": 1.022519588470459, "learning_rate": 0.0008302616609783845, "loss": 1.6188, "step": 750 }, { "epoch": 0.8543799772468714, "grad_norm": 1.2985116243362427, "learning_rate": 0.0008300341296928328, "loss": 2.3702, "step": 751 }, { "epoch": 0.8555176336746303, "grad_norm": 0.7469916939735413, "learning_rate": 0.000829806598407281, "loss": 1.1039, "step": 752 }, { "epoch": 0.856655290102389, "grad_norm": 0.8801395297050476, "learning_rate": 
0.0008295790671217293, "loss": 1.615, "step": 753 }, { "epoch": 0.8577929465301479, "grad_norm": 0.6937797665596008, "learning_rate": 0.0008293515358361775, "loss": 1.366, "step": 754 }, { "epoch": 0.8589306029579067, "grad_norm": 0.92084801197052, "learning_rate": 0.0008291240045506257, "loss": 1.1217, "step": 755 }, { "epoch": 0.8600682593856656, "grad_norm": 1.1955760717391968, "learning_rate": 0.000828896473265074, "loss": 2.3874, "step": 756 }, { "epoch": 0.8612059158134243, "grad_norm": 1.270263433456421, "learning_rate": 0.0008286689419795222, "loss": 0.7639, "step": 757 }, { "epoch": 0.8623435722411832, "grad_norm": 1.0862644910812378, "learning_rate": 0.0008284414106939704, "loss": 1.666, "step": 758 }, { "epoch": 0.863481228668942, "grad_norm": 1.0658801794052124, "learning_rate": 0.0008282138794084187, "loss": 3.0639, "step": 759 }, { "epoch": 0.8646188850967008, "grad_norm": 0.7104091644287109, "learning_rate": 0.0008279863481228669, "loss": 1.7136, "step": 760 }, { "epoch": 0.8657565415244596, "grad_norm": 1.0300426483154297, "learning_rate": 0.0008277588168373152, "loss": 1.9242, "step": 761 }, { "epoch": 0.8668941979522184, "grad_norm": 0.9637190699577332, "learning_rate": 0.0008275312855517634, "loss": 2.0804, "step": 762 }, { "epoch": 0.8680318543799772, "grad_norm": 0.9905063509941101, "learning_rate": 0.0008273037542662116, "loss": 1.7953, "step": 763 }, { "epoch": 0.8691695108077361, "grad_norm": 1.0103565454483032, "learning_rate": 0.0008270762229806599, "loss": 2.3104, "step": 764 }, { "epoch": 0.8703071672354948, "grad_norm": 0.6847909688949585, "learning_rate": 0.0008268486916951081, "loss": 0.9037, "step": 765 }, { "epoch": 0.8714448236632537, "grad_norm": 0.958070695400238, "learning_rate": 0.0008266211604095563, "loss": 1.3769, "step": 766 }, { "epoch": 0.8725824800910125, "grad_norm": 1.081380009651184, "learning_rate": 0.0008263936291240047, "loss": 2.6554, "step": 767 }, { "epoch": 0.8737201365187713, "grad_norm": 1.003764271736145, 
"learning_rate": 0.0008261660978384528, "loss": 2.3287, "step": 768 }, { "epoch": 0.8748577929465301, "grad_norm": 0.9018321633338928, "learning_rate": 0.000825938566552901, "loss": 1.1058, "step": 769 }, { "epoch": 0.875995449374289, "grad_norm": 1.702228307723999, "learning_rate": 0.0008257110352673493, "loss": 2.4671, "step": 770 }, { "epoch": 0.8771331058020477, "grad_norm": 0.7024636268615723, "learning_rate": 0.0008254835039817975, "loss": 1.3952, "step": 771 }, { "epoch": 0.8782707622298066, "grad_norm": 0.8887207508087158, "learning_rate": 0.0008252559726962457, "loss": 1.8992, "step": 772 }, { "epoch": 0.8794084186575654, "grad_norm": 0.8344920873641968, "learning_rate": 0.000825028441410694, "loss": 1.003, "step": 773 }, { "epoch": 0.8805460750853242, "grad_norm": 1.174970030784607, "learning_rate": 0.0008248009101251422, "loss": 2.1063, "step": 774 }, { "epoch": 0.8816837315130831, "grad_norm": 1.4693725109100342, "learning_rate": 0.0008245733788395904, "loss": 2.349, "step": 775 }, { "epoch": 0.8828213879408419, "grad_norm": 0.8909391164779663, "learning_rate": 0.0008243458475540388, "loss": 2.3083, "step": 776 }, { "epoch": 0.8839590443686007, "grad_norm": 1.178287148475647, "learning_rate": 0.000824118316268487, "loss": 2.2453, "step": 777 }, { "epoch": 0.8850967007963595, "grad_norm": 1.1147403717041016, "learning_rate": 0.0008238907849829351, "loss": 3.2198, "step": 778 }, { "epoch": 0.8862343572241184, "grad_norm": 0.8934231996536255, "learning_rate": 0.0008236632536973834, "loss": 2.1325, "step": 779 }, { "epoch": 0.8873720136518771, "grad_norm": 0.7500196695327759, "learning_rate": 0.0008234357224118316, "loss": 1.062, "step": 780 }, { "epoch": 0.888509670079636, "grad_norm": 1.1905717849731445, "learning_rate": 0.0008232081911262799, "loss": 2.6013, "step": 781 }, { "epoch": 0.8896473265073948, "grad_norm": 0.7128300070762634, "learning_rate": 0.0008229806598407281, "loss": 1.4425, "step": 782 }, { "epoch": 0.8907849829351536, "grad_norm": 
0.9810397028923035, "learning_rate": 0.0008227531285551763, "loss": 2.2742, "step": 783 }, { "epoch": 0.8919226393629124, "grad_norm": 1.1497716903686523, "learning_rate": 0.0008225255972696247, "loss": 2.6226, "step": 784 }, { "epoch": 0.8930602957906713, "grad_norm": 1.3112133741378784, "learning_rate": 0.0008222980659840729, "loss": 1.8846, "step": 785 }, { "epoch": 0.89419795221843, "grad_norm": 1.030337929725647, "learning_rate": 0.0008220705346985211, "loss": 2.6641, "step": 786 }, { "epoch": 0.8953356086461889, "grad_norm": 1.1316418647766113, "learning_rate": 0.0008218430034129693, "loss": 1.8058, "step": 787 }, { "epoch": 0.8964732650739476, "grad_norm": 1.315293788909912, "learning_rate": 0.0008216154721274175, "loss": 1.6608, "step": 788 }, { "epoch": 0.8976109215017065, "grad_norm": 1.0237807035446167, "learning_rate": 0.0008213879408418657, "loss": 2.9869, "step": 789 }, { "epoch": 0.8987485779294653, "grad_norm": 1.0724749565124512, "learning_rate": 0.000821160409556314, "loss": 1.7659, "step": 790 }, { "epoch": 0.8998862343572241, "grad_norm": 0.8942409753799438, "learning_rate": 0.0008209328782707622, "loss": 1.5329, "step": 791 }, { "epoch": 0.9010238907849829, "grad_norm": 1.5801607370376587, "learning_rate": 0.0008207053469852104, "loss": 1.8717, "step": 792 }, { "epoch": 0.9021615472127418, "grad_norm": 0.8906893730163574, "learning_rate": 0.0008204778156996588, "loss": 1.5824, "step": 793 }, { "epoch": 0.9032992036405005, "grad_norm": 0.6598983407020569, "learning_rate": 0.000820250284414107, "loss": 1.4168, "step": 794 }, { "epoch": 0.9044368600682594, "grad_norm": 0.7640515565872192, "learning_rate": 0.0008200227531285552, "loss": 1.7068, "step": 795 }, { "epoch": 0.9055745164960182, "grad_norm": 1.3205150365829468, "learning_rate": 0.0008197952218430035, "loss": 2.5249, "step": 796 }, { "epoch": 0.906712172923777, "grad_norm": 1.2995262145996094, "learning_rate": 0.0008195676905574516, "loss": 2.9522, "step": 797 }, { "epoch": 
0.9078498293515358, "grad_norm": 0.8465941548347473, "learning_rate": 0.0008193401592718998, "loss": 2.0508, "step": 798 }, { "epoch": 0.9089874857792947, "grad_norm": 1.2404921054840088, "learning_rate": 0.0008191126279863481, "loss": 2.4488, "step": 799 }, { "epoch": 0.9101251422070534, "grad_norm": 1.00394868850708, "learning_rate": 0.0008188850967007963, "loss": 1.665, "step": 800 }, { "epoch": 0.9112627986348123, "grad_norm": 1.0288304090499878, "learning_rate": 0.0008186575654152447, "loss": 2.2679, "step": 801 }, { "epoch": 0.9124004550625711, "grad_norm": 1.0812597274780273, "learning_rate": 0.0008184300341296929, "loss": 2.092, "step": 802 }, { "epoch": 0.9135381114903299, "grad_norm": 0.9165686964988708, "learning_rate": 0.0008182025028441411, "loss": 1.6572, "step": 803 }, { "epoch": 0.9146757679180887, "grad_norm": 1.254876971244812, "learning_rate": 0.0008179749715585894, "loss": 2.2742, "step": 804 }, { "epoch": 0.9158134243458476, "grad_norm": 0.8111211657524109, "learning_rate": 0.0008177474402730376, "loss": 1.4599, "step": 805 }, { "epoch": 0.9169510807736063, "grad_norm": 1.2796216011047363, "learning_rate": 0.0008175199089874858, "loss": 2.8677, "step": 806 }, { "epoch": 0.9180887372013652, "grad_norm": 0.5638169050216675, "learning_rate": 0.000817292377701934, "loss": 1.0711, "step": 807 }, { "epoch": 0.919226393629124, "grad_norm": 1.0181145668029785, "learning_rate": 0.0008170648464163822, "loss": 2.546, "step": 808 }, { "epoch": 0.9203640500568828, "grad_norm": 1.0140022039413452, "learning_rate": 0.0008168373151308304, "loss": 1.6735, "step": 809 }, { "epoch": 0.9215017064846417, "grad_norm": 0.6265180706977844, "learning_rate": 0.0008166097838452788, "loss": 1.581, "step": 810 }, { "epoch": 0.9226393629124005, "grad_norm": 0.9092225432395935, "learning_rate": 0.000816382252559727, "loss": 2.0735, "step": 811 }, { "epoch": 0.9237770193401593, "grad_norm": 1.3408722877502441, "learning_rate": 0.0008161547212741752, "loss": 3.3588, "step": 
812 }, { "epoch": 0.9249146757679181, "grad_norm": 1.0309351682662964, "learning_rate": 0.0008159271899886235, "loss": 2.2884, "step": 813 }, { "epoch": 0.926052332195677, "grad_norm": 1.130393147468567, "learning_rate": 0.0008156996587030717, "loss": 1.994, "step": 814 }, { "epoch": 0.9271899886234357, "grad_norm": 0.9755035042762756, "learning_rate": 0.0008154721274175199, "loss": 2.4471, "step": 815 }, { "epoch": 0.9283276450511946, "grad_norm": 0.7888014912605286, "learning_rate": 0.0008152445961319681, "loss": 1.7634, "step": 816 }, { "epoch": 0.9294653014789533, "grad_norm": 1.2425562143325806, "learning_rate": 0.0008150170648464163, "loss": 2.6725, "step": 817 }, { "epoch": 0.9306029579067122, "grad_norm": 1.112234354019165, "learning_rate": 0.0008147895335608646, "loss": 2.0245, "step": 818 }, { "epoch": 0.931740614334471, "grad_norm": 1.8099498748779297, "learning_rate": 0.0008145620022753129, "loss": 3.6121, "step": 819 }, { "epoch": 0.9328782707622298, "grad_norm": 0.594555139541626, "learning_rate": 0.0008143344709897611, "loss": 1.1207, "step": 820 }, { "epoch": 0.9340159271899886, "grad_norm": 1.2875663042068481, "learning_rate": 0.0008141069397042094, "loss": 2.6515, "step": 821 }, { "epoch": 0.9351535836177475, "grad_norm": 1.2231221199035645, "learning_rate": 0.0008138794084186576, "loss": 2.1126, "step": 822 }, { "epoch": 0.9362912400455062, "grad_norm": 0.6779431700706482, "learning_rate": 0.0008136518771331058, "loss": 1.0446, "step": 823 }, { "epoch": 0.9374288964732651, "grad_norm": 0.8786545991897583, "learning_rate": 0.0008134243458475541, "loss": 1.8822, "step": 824 }, { "epoch": 0.9385665529010239, "grad_norm": 0.7241141200065613, "learning_rate": 0.0008131968145620023, "loss": 1.3241, "step": 825 }, { "epoch": 0.9397042093287827, "grad_norm": 1.032473087310791, "learning_rate": 0.0008129692832764504, "loss": 2.3931, "step": 826 }, { "epoch": 0.9408418657565415, "grad_norm": 0.8265206217765808, "learning_rate": 0.0008127417519908988, 
"loss": 1.842, "step": 827 }, { "epoch": 0.9419795221843004, "grad_norm": 0.9826211929321289, "learning_rate": 0.000812514220705347, "loss": 2.6482, "step": 828 }, { "epoch": 0.9431171786120591, "grad_norm": 1.065434455871582, "learning_rate": 0.0008122866894197952, "loss": 2.2127, "step": 829 }, { "epoch": 0.944254835039818, "grad_norm": 0.6911349892616272, "learning_rate": 0.0008120591581342435, "loss": 1.7043, "step": 830 }, { "epoch": 0.9453924914675768, "grad_norm": 1.031029224395752, "learning_rate": 0.0008118316268486917, "loss": 1.971, "step": 831 }, { "epoch": 0.9465301478953356, "grad_norm": 0.9776269197463989, "learning_rate": 0.0008116040955631399, "loss": 1.7573, "step": 832 }, { "epoch": 0.9476678043230944, "grad_norm": 1.006766438484192, "learning_rate": 0.0008113765642775882, "loss": 2.1867, "step": 833 }, { "epoch": 0.9488054607508533, "grad_norm": 1.4474374055862427, "learning_rate": 0.0008111490329920365, "loss": 1.8203, "step": 834 }, { "epoch": 0.949943117178612, "grad_norm": 1.0589663982391357, "learning_rate": 0.0008109215017064847, "loss": 2.376, "step": 835 }, { "epoch": 0.9510807736063709, "grad_norm": 0.8817835450172424, "learning_rate": 0.0008106939704209329, "loss": 2.1685, "step": 836 }, { "epoch": 0.9522184300341296, "grad_norm": 1.0257823467254639, "learning_rate": 0.0008104664391353811, "loss": 2.7541, "step": 837 }, { "epoch": 0.9533560864618885, "grad_norm": 0.6072942018508911, "learning_rate": 0.0008102389078498293, "loss": 0.9657, "step": 838 }, { "epoch": 0.9544937428896473, "grad_norm": 0.6976664066314697, "learning_rate": 0.0008100113765642776, "loss": 1.7921, "step": 839 }, { "epoch": 0.9556313993174061, "grad_norm": 0.7202094197273254, "learning_rate": 0.0008097838452787258, "loss": 1.6851, "step": 840 }, { "epoch": 0.9567690557451649, "grad_norm": 0.7134101390838623, "learning_rate": 0.0008095563139931741, "loss": 1.2245, "step": 841 }, { "epoch": 0.9579067121729238, "grad_norm": 0.9187147617340088, "learning_rate": 
0.0008093287827076223, "loss": 1.9654, "step": 842 }, { "epoch": 0.9590443686006825, "grad_norm": 1.6910814046859741, "learning_rate": 0.0008091012514220706, "loss": 3.1648, "step": 843 }, { "epoch": 0.9601820250284414, "grad_norm": 1.1383386850357056, "learning_rate": 0.0008088737201365189, "loss": 1.5833, "step": 844 }, { "epoch": 0.9613196814562003, "grad_norm": 1.1125869750976562, "learning_rate": 0.0008086461888509671, "loss": 2.4418, "step": 845 }, { "epoch": 0.962457337883959, "grad_norm": 1.096062183380127, "learning_rate": 0.0008084186575654152, "loss": 1.5872, "step": 846 }, { "epoch": 0.9635949943117179, "grad_norm": 1.1159650087356567, "learning_rate": 0.0008081911262798635, "loss": 0.9669, "step": 847 }, { "epoch": 0.9647326507394767, "grad_norm": 0.8109821081161499, "learning_rate": 0.0008079635949943117, "loss": 2.3282, "step": 848 }, { "epoch": 0.9658703071672355, "grad_norm": 0.6839150786399841, "learning_rate": 0.0008077360637087599, "loss": 1.5686, "step": 849 }, { "epoch": 0.9670079635949943, "grad_norm": 1.0192281007766724, "learning_rate": 0.0008075085324232082, "loss": 2.3769, "step": 850 }, { "epoch": 0.9681456200227532, "grad_norm": 1.0800739526748657, "learning_rate": 0.0008072810011376565, "loss": 1.9699, "step": 851 }, { "epoch": 0.9692832764505119, "grad_norm": 1.4787782430648804, "learning_rate": 0.0008070534698521047, "loss": 2.1332, "step": 852 }, { "epoch": 0.9704209328782708, "grad_norm": 0.8719042539596558, "learning_rate": 0.000806825938566553, "loss": 1.9816, "step": 853 }, { "epoch": 0.9715585893060296, "grad_norm": 0.4718307554721832, "learning_rate": 0.0008065984072810012, "loss": 0.907, "step": 854 }, { "epoch": 0.9726962457337884, "grad_norm": 0.7292342185974121, "learning_rate": 0.0008063708759954493, "loss": 1.3498, "step": 855 }, { "epoch": 0.9738339021615472, "grad_norm": 1.0442382097244263, "learning_rate": 0.0008061433447098976, "loss": 0.8467, "step": 856 }, { "epoch": 0.9749715585893061, "grad_norm": 
1.4007940292358398, "learning_rate": 0.0008059158134243458, "loss": 2.5427, "step": 857 }, { "epoch": 0.9761092150170648, "grad_norm": 0.6884713172912598, "learning_rate": 0.000805688282138794, "loss": 1.4121, "step": 858 }, { "epoch": 0.9772468714448237, "grad_norm": 0.8210100531578064, "learning_rate": 0.0008054607508532424, "loss": 1.4311, "step": 859 }, { "epoch": 0.9783845278725825, "grad_norm": 1.017969012260437, "learning_rate": 0.0008052332195676906, "loss": 2.1577, "step": 860 }, { "epoch": 0.9795221843003413, "grad_norm": 0.6463725566864014, "learning_rate": 0.0008050056882821389, "loss": 1.1913, "step": 861 }, { "epoch": 0.9806598407281001, "grad_norm": 0.609213650226593, "learning_rate": 0.0008047781569965871, "loss": 1.4609, "step": 862 }, { "epoch": 0.981797497155859, "grad_norm": 1.2324273586273193, "learning_rate": 0.0008045506257110353, "loss": 1.5182, "step": 863 }, { "epoch": 0.9829351535836177, "grad_norm": 1.6558321714401245, "learning_rate": 0.0008043230944254836, "loss": 2.2892, "step": 864 }, { "epoch": 0.9840728100113766, "grad_norm": 1.1968799829483032, "learning_rate": 0.0008040955631399317, "loss": 2.6779, "step": 865 }, { "epoch": 0.9852104664391353, "grad_norm": 0.6827085614204407, "learning_rate": 0.0008038680318543799, "loss": 1.8453, "step": 866 }, { "epoch": 0.9863481228668942, "grad_norm": 0.70023113489151, "learning_rate": 0.0008036405005688282, "loss": 1.5166, "step": 867 }, { "epoch": 0.987485779294653, "grad_norm": 1.1971696615219116, "learning_rate": 0.0008034129692832765, "loss": 3.0315, "step": 868 }, { "epoch": 0.9886234357224118, "grad_norm": 1.1876801252365112, "learning_rate": 0.0008031854379977247, "loss": 2.263, "step": 869 }, { "epoch": 0.9897610921501706, "grad_norm": 1.1676205396652222, "learning_rate": 0.000802957906712173, "loss": 1.9647, "step": 870 }, { "epoch": 0.9908987485779295, "grad_norm": 1.2230088710784912, "learning_rate": 0.0008027303754266212, "loss": 3.6692, "step": 871 }, { "epoch": 
0.9920364050056882, "grad_norm": 0.9202735424041748, "learning_rate": 0.0008025028441410694, "loss": 1.9178, "step": 872 }, { "epoch": 0.9931740614334471, "grad_norm": 0.685429036617279, "learning_rate": 0.0008022753128555177, "loss": 1.6819, "step": 873 }, { "epoch": 0.9943117178612059, "grad_norm": 0.9149478077888489, "learning_rate": 0.0008020477815699659, "loss": 1.7829, "step": 874 }, { "epoch": 0.9954493742889647, "grad_norm": 0.9612428545951843, "learning_rate": 0.000801820250284414, "loss": 2.3288, "step": 875 }, { "epoch": 0.9965870307167235, "grad_norm": 1.1229710578918457, "learning_rate": 0.0008015927189988624, "loss": 1.8854, "step": 876 }, { "epoch": 0.9977246871444824, "grad_norm": 1.3799995183944702, "learning_rate": 0.0008013651877133106, "loss": 3.6562, "step": 877 }, { "epoch": 0.9988623435722411, "grad_norm": 0.7298029661178589, "learning_rate": 0.0008011376564277588, "loss": 1.7173, "step": 878 }, { "epoch": 1.0, "grad_norm": 1.1531485319137573, "learning_rate": 0.0008009101251422071, "loss": 2.2511, "step": 879 }, { "epoch": 1.0, "eval_f1": 0.8883, "eval_gen_len": 49.5182, "eval_loss": 1.9363936185836792, "eval_precision": 0.8875, "eval_recall": 0.8892, "eval_rouge1": 0.4398, "eval_rouge2": 0.1855, "eval_rougeL": 0.3668, "eval_rougeLsum": 0.411, "eval_runtime": 28.6515, "eval_samples_per_second": 3.839, "eval_steps_per_second": 0.489, "step": 879 }, { "epoch": 1.0011376564277588, "grad_norm": 0.6251806616783142, "learning_rate": 0.0008006825938566553, "loss": 1.0413, "step": 880 }, { "epoch": 1.0022753128555177, "grad_norm": 0.9199168086051941, "learning_rate": 0.0008004550625711036, "loss": 1.691, "step": 881 }, { "epoch": 1.0034129692832765, "grad_norm": 1.4455974102020264, "learning_rate": 0.0008002275312855518, "loss": 4.0842, "step": 882 }, { "epoch": 1.0045506257110353, "grad_norm": 1.1115312576293945, "learning_rate": 0.0008, "loss": 2.4961, "step": 883 }, { "epoch": 1.005688282138794, "grad_norm": 0.952612042427063, "learning_rate": 
0.0007997724687144482, "loss": 1.8239, "step": 884 }, { "epoch": 1.006825938566553, "grad_norm": 0.9800905585289001, "learning_rate": 0.0007995449374288965, "loss": 1.981, "step": 885 }, { "epoch": 1.0079635949943118, "grad_norm": 1.0975840091705322, "learning_rate": 0.0007993174061433447, "loss": 2.4143, "step": 886 }, { "epoch": 1.0091012514220705, "grad_norm": 0.936336874961853, "learning_rate": 0.000799089874857793, "loss": 1.6641, "step": 887 }, { "epoch": 1.0102389078498293, "grad_norm": 0.5330931544303894, "learning_rate": 0.0007988623435722412, "loss": 1.2285, "step": 888 }, { "epoch": 1.0113765642775883, "grad_norm": 1.2912349700927734, "learning_rate": 0.0007986348122866894, "loss": 1.594, "step": 889 }, { "epoch": 1.012514220705347, "grad_norm": 0.8078871369361877, "learning_rate": 0.0007984072810011377, "loss": 1.0581, "step": 890 }, { "epoch": 1.0136518771331058, "grad_norm": 1.071419358253479, "learning_rate": 0.0007981797497155859, "loss": 1.7577, "step": 891 }, { "epoch": 1.0147895335608645, "grad_norm": 1.0178149938583374, "learning_rate": 0.0007979522184300341, "loss": 2.4778, "step": 892 }, { "epoch": 1.0159271899886235, "grad_norm": 0.9748836159706116, "learning_rate": 0.0007977246871444825, "loss": 1.8737, "step": 893 }, { "epoch": 1.0170648464163823, "grad_norm": 0.7067264914512634, "learning_rate": 0.0007974971558589306, "loss": 1.3954, "step": 894 }, { "epoch": 1.018202502844141, "grad_norm": 1.2109535932540894, "learning_rate": 0.0007972696245733788, "loss": 2.3141, "step": 895 }, { "epoch": 1.0193401592718998, "grad_norm": 0.8363299369812012, "learning_rate": 0.0007970420932878271, "loss": 2.0917, "step": 896 }, { "epoch": 1.0204778156996588, "grad_norm": 0.7036440968513489, "learning_rate": 0.0007968145620022753, "loss": 1.5766, "step": 897 }, { "epoch": 1.0216154721274175, "grad_norm": 0.6679667830467224, "learning_rate": 0.0007965870307167235, "loss": 1.8018, "step": 898 }, { "epoch": 1.0227531285551763, "grad_norm": 1.0804336071014404, 
"learning_rate": 0.0007963594994311718, "loss": 2.0411, "step": 899 }, { "epoch": 1.023890784982935, "grad_norm": 0.9264572262763977, "learning_rate": 0.00079613196814562, "loss": 2.1497, "step": 900 }, { "epoch": 1.025028441410694, "grad_norm": 1.2212860584259033, "learning_rate": 0.0007959044368600682, "loss": 2.7216, "step": 901 }, { "epoch": 1.0261660978384528, "grad_norm": 1.1044024229049683, "learning_rate": 0.0007956769055745166, "loss": 2.3266, "step": 902 }, { "epoch": 1.0273037542662116, "grad_norm": 0.7855750918388367, "learning_rate": 0.0007954493742889648, "loss": 1.4662, "step": 903 }, { "epoch": 1.0284414106939703, "grad_norm": 1.0114825963974, "learning_rate": 0.000795221843003413, "loss": 2.0779, "step": 904 }, { "epoch": 1.0295790671217293, "grad_norm": 0.9704268574714661, "learning_rate": 0.0007949943117178612, "loss": 1.6038, "step": 905 }, { "epoch": 1.030716723549488, "grad_norm": 1.155097246170044, "learning_rate": 0.0007947667804323094, "loss": 2.13, "step": 906 }, { "epoch": 1.0318543799772468, "grad_norm": 1.3842484951019287, "learning_rate": 0.0007945392491467577, "loss": 3.5606, "step": 907 }, { "epoch": 1.0329920364050056, "grad_norm": 1.134261131286621, "learning_rate": 0.0007943117178612059, "loss": 2.9482, "step": 908 }, { "epoch": 1.0341296928327646, "grad_norm": 1.0498160123825073, "learning_rate": 0.0007940841865756541, "loss": 2.3777, "step": 909 }, { "epoch": 1.0352673492605233, "grad_norm": 0.7719895243644714, "learning_rate": 0.0007938566552901025, "loss": 2.2765, "step": 910 }, { "epoch": 1.036405005688282, "grad_norm": 1.402180790901184, "learning_rate": 0.0007936291240045507, "loss": 3.0459, "step": 911 }, { "epoch": 1.0375426621160408, "grad_norm": 0.8468472361564636, "learning_rate": 0.0007934015927189989, "loss": 0.922, "step": 912 }, { "epoch": 1.0386803185437998, "grad_norm": 0.8631690740585327, "learning_rate": 0.0007931740614334472, "loss": 1.2322, "step": 913 }, { "epoch": 1.0398179749715586, "grad_norm": 
1.3747214078903198, "learning_rate": 0.0007929465301478953, "loss": 2.485, "step": 914 }, { "epoch": 1.0409556313993173, "grad_norm": 0.7905521988868713, "learning_rate": 0.0007927189988623435, "loss": 2.1488, "step": 915 }, { "epoch": 1.0420932878270763, "grad_norm": 1.0372867584228516, "learning_rate": 0.0007924914675767918, "loss": 2.3263, "step": 916 }, { "epoch": 1.043230944254835, "grad_norm": 0.964745819568634, "learning_rate": 0.00079226393629124, "loss": 1.4245, "step": 917 }, { "epoch": 1.0443686006825939, "grad_norm": 1.2599427700042725, "learning_rate": 0.0007920364050056883, "loss": 2.8828, "step": 918 }, { "epoch": 1.0455062571103526, "grad_norm": 0.5311685800552368, "learning_rate": 0.0007918088737201366, "loss": 1.2969, "step": 919 }, { "epoch": 1.0466439135381116, "grad_norm": 1.0544791221618652, "learning_rate": 0.0007915813424345848, "loss": 2.3014, "step": 920 }, { "epoch": 1.0477815699658704, "grad_norm": 0.7876049280166626, "learning_rate": 0.000791353811149033, "loss": 2.2945, "step": 921 }, { "epoch": 1.0489192263936291, "grad_norm": 0.7309691309928894, "learning_rate": 0.0007911262798634813, "loss": 1.2384, "step": 922 }, { "epoch": 1.0500568828213879, "grad_norm": 0.8504576086997986, "learning_rate": 0.0007908987485779294, "loss": 1.719, "step": 923 }, { "epoch": 1.0511945392491469, "grad_norm": 1.7143735885620117, "learning_rate": 0.0007906712172923777, "loss": 3.9275, "step": 924 }, { "epoch": 1.0523321956769056, "grad_norm": 1.8625571727752686, "learning_rate": 0.0007904436860068259, "loss": 4.4724, "step": 925 }, { "epoch": 1.0534698521046644, "grad_norm": 1.0929653644561768, "learning_rate": 0.0007902161547212741, "loss": 1.9623, "step": 926 }, { "epoch": 1.0546075085324231, "grad_norm": 1.0899792909622192, "learning_rate": 0.0007899886234357225, "loss": 1.6986, "step": 927 }, { "epoch": 1.0557451649601821, "grad_norm": 1.1779440641403198, "learning_rate": 0.0007897610921501707, "loss": 1.905, "step": 928 }, { "epoch": 
1.0568828213879409, "grad_norm": 0.6737589240074158, "learning_rate": 0.0007895335608646189, "loss": 1.734, "step": 929 }, { "epoch": 1.0580204778156996, "grad_norm": 1.0531206130981445, "learning_rate": 0.0007893060295790672, "loss": 1.4693, "step": 930 }, { "epoch": 1.0591581342434584, "grad_norm": 0.5907047390937805, "learning_rate": 0.0007890784982935154, "loss": 1.4901, "step": 931 }, { "epoch": 1.0602957906712174, "grad_norm": 0.8621008396148682, "learning_rate": 0.0007888509670079636, "loss": 1.5857, "step": 932 }, { "epoch": 1.0614334470989761, "grad_norm": 0.9032453298568726, "learning_rate": 0.0007886234357224118, "loss": 1.8026, "step": 933 }, { "epoch": 1.062571103526735, "grad_norm": 1.4888174533843994, "learning_rate": 0.00078839590443686, "loss": 1.8748, "step": 934 }, { "epoch": 1.0637087599544937, "grad_norm": 0.6122904419898987, "learning_rate": 0.0007881683731513083, "loss": 0.7945, "step": 935 }, { "epoch": 1.0648464163822526, "grad_norm": 1.013519048690796, "learning_rate": 0.0007879408418657566, "loss": 1.9398, "step": 936 }, { "epoch": 1.0659840728100114, "grad_norm": 0.7138906717300415, "learning_rate": 0.0007877133105802048, "loss": 1.6637, "step": 937 }, { "epoch": 1.0671217292377702, "grad_norm": 1.3058874607086182, "learning_rate": 0.000787485779294653, "loss": 3.3544, "step": 938 }, { "epoch": 1.068259385665529, "grad_norm": 0.9075451493263245, "learning_rate": 0.0007872582480091013, "loss": 1.7222, "step": 939 }, { "epoch": 1.069397042093288, "grad_norm": 0.9286370873451233, "learning_rate": 0.0007870307167235495, "loss": 2.0651, "step": 940 }, { "epoch": 1.0705346985210467, "grad_norm": 0.6360165476799011, "learning_rate": 0.0007868031854379977, "loss": 0.89, "step": 941 }, { "epoch": 1.0716723549488054, "grad_norm": 1.2726821899414062, "learning_rate": 0.000786575654152446, "loss": 2.7395, "step": 942 }, { "epoch": 1.0728100113765642, "grad_norm": 1.024568796157837, "learning_rate": 0.0007863481228668941, "loss": 2.1989, "step": 943 
}, { "epoch": 1.0739476678043232, "grad_norm": 1.1119788885116577, "learning_rate": 0.0007861205915813425, "loss": 3.14, "step": 944 }, { "epoch": 1.075085324232082, "grad_norm": 1.0225321054458618, "learning_rate": 0.0007858930602957907, "loss": 2.8281, "step": 945 }, { "epoch": 1.0762229806598407, "grad_norm": 0.8482196927070618, "learning_rate": 0.0007856655290102389, "loss": 1.4616, "step": 946 }, { "epoch": 1.0773606370875997, "grad_norm": 0.9475064277648926, "learning_rate": 0.0007854379977246872, "loss": 2.9296, "step": 947 }, { "epoch": 1.0784982935153584, "grad_norm": 0.5353087186813354, "learning_rate": 0.0007852104664391354, "loss": 0.9475, "step": 948 }, { "epoch": 1.0796359499431172, "grad_norm": 1.2120368480682373, "learning_rate": 0.0007849829351535836, "loss": 3.095, "step": 949 }, { "epoch": 1.080773606370876, "grad_norm": 1.1500520706176758, "learning_rate": 0.0007847554038680319, "loss": 2.4183, "step": 950 }, { "epoch": 1.0819112627986347, "grad_norm": 0.629219651222229, "learning_rate": 0.0007845278725824802, "loss": 1.1091, "step": 951 }, { "epoch": 1.0830489192263937, "grad_norm": 0.6308894753456116, "learning_rate": 0.0007843003412969284, "loss": 1.0829, "step": 952 }, { "epoch": 1.0841865756541524, "grad_norm": 1.4992165565490723, "learning_rate": 0.0007840728100113766, "loss": 3.7885, "step": 953 }, { "epoch": 1.0853242320819112, "grad_norm": 1.0505317449569702, "learning_rate": 0.0007838452787258248, "loss": 2.2605, "step": 954 }, { "epoch": 1.0864618885096702, "grad_norm": 0.898767352104187, "learning_rate": 0.000783617747440273, "loss": 3.0967, "step": 955 }, { "epoch": 1.087599544937429, "grad_norm": 1.2503669261932373, "learning_rate": 0.0007833902161547213, "loss": 2.2391, "step": 956 }, { "epoch": 1.0887372013651877, "grad_norm": 1.2384415864944458, "learning_rate": 0.0007831626848691695, "loss": 3.554, "step": 957 }, { "epoch": 1.0898748577929465, "grad_norm": 1.1613341569900513, "learning_rate": 0.0007829351535836177, "loss": 
1.7405, "step": 958 }, { "epoch": 1.0910125142207054, "grad_norm": 0.6541109681129456, "learning_rate": 0.000782707622298066, "loss": 1.5068, "step": 959 }, { "epoch": 1.0921501706484642, "grad_norm": 1.1372532844543457, "learning_rate": 0.0007824800910125143, "loss": 2.6415, "step": 960 }, { "epoch": 1.093287827076223, "grad_norm": 0.944439709186554, "learning_rate": 0.0007822525597269625, "loss": 2.2019, "step": 961 }, { "epoch": 1.0944254835039817, "grad_norm": 1.1232166290283203, "learning_rate": 0.0007820250284414107, "loss": 2.172, "step": 962 }, { "epoch": 1.0955631399317407, "grad_norm": 1.3538726568222046, "learning_rate": 0.0007817974971558589, "loss": 2.0305, "step": 963 }, { "epoch": 1.0967007963594995, "grad_norm": 0.7923217415809631, "learning_rate": 0.0007815699658703072, "loss": 1.2107, "step": 964 }, { "epoch": 1.0978384527872582, "grad_norm": 0.9362553358078003, "learning_rate": 0.0007813424345847554, "loss": 1.9665, "step": 965 }, { "epoch": 1.098976109215017, "grad_norm": 1.2658346891403198, "learning_rate": 0.0007811149032992036, "loss": 2.9011, "step": 966 }, { "epoch": 1.100113765642776, "grad_norm": 1.526066780090332, "learning_rate": 0.0007808873720136519, "loss": 1.7315, "step": 967 }, { "epoch": 1.1012514220705347, "grad_norm": 1.3018304109573364, "learning_rate": 0.0007806598407281002, "loss": 3.2345, "step": 968 }, { "epoch": 1.1023890784982935, "grad_norm": 0.8176997900009155, "learning_rate": 0.0007804323094425484, "loss": 1.467, "step": 969 }, { "epoch": 1.1035267349260522, "grad_norm": 0.8336053490638733, "learning_rate": 0.0007802047781569967, "loss": 1.4161, "step": 970 }, { "epoch": 1.1046643913538112, "grad_norm": 0.7302894592285156, "learning_rate": 0.0007799772468714449, "loss": 1.3151, "step": 971 }, { "epoch": 1.10580204778157, "grad_norm": 0.9276258945465088, "learning_rate": 0.000779749715585893, "loss": 2.3525, "step": 972 }, { "epoch": 1.1069397042093287, "grad_norm": 0.8648287057876587, "learning_rate": 
0.0007795221843003413, "loss": 1.7804, "step": 973 }, { "epoch": 1.1080773606370875, "grad_norm": 1.4959267377853394, "learning_rate": 0.0007792946530147895, "loss": 2.9281, "step": 974 }, { "epoch": 1.1092150170648465, "grad_norm": 0.7826813459396362, "learning_rate": 0.0007790671217292377, "loss": 0.9164, "step": 975 }, { "epoch": 1.1103526734926052, "grad_norm": 1.157572865486145, "learning_rate": 0.000778839590443686, "loss": 1.9721, "step": 976 }, { "epoch": 1.111490329920364, "grad_norm": 1.137681484222412, "learning_rate": 0.0007786120591581343, "loss": 1.5874, "step": 977 }, { "epoch": 1.1126279863481228, "grad_norm": 0.9332581162452698, "learning_rate": 0.0007783845278725825, "loss": 2.5314, "step": 978 }, { "epoch": 1.1137656427758817, "grad_norm": 1.1413997411727905, "learning_rate": 0.0007781569965870308, "loss": 3.0732, "step": 979 }, { "epoch": 1.1149032992036405, "grad_norm": 0.7028293609619141, "learning_rate": 0.000777929465301479, "loss": 0.8924, "step": 980 }, { "epoch": 1.1160409556313993, "grad_norm": 1.0388150215148926, "learning_rate": 0.0007777019340159272, "loss": 1.9138, "step": 981 }, { "epoch": 1.117178612059158, "grad_norm": 0.9347336292266846, "learning_rate": 0.0007774744027303754, "loss": 1.5226, "step": 982 }, { "epoch": 1.118316268486917, "grad_norm": 0.9350568056106567, "learning_rate": 0.0007772468714448236, "loss": 2.6978, "step": 983 }, { "epoch": 1.1194539249146758, "grad_norm": 1.407080888748169, "learning_rate": 0.0007770193401592718, "loss": 1.3464, "step": 984 }, { "epoch": 1.1205915813424345, "grad_norm": 1.077770471572876, "learning_rate": 0.0007767918088737202, "loss": 2.123, "step": 985 }, { "epoch": 1.1217292377701935, "grad_norm": 1.041999101638794, "learning_rate": 0.0007765642775881684, "loss": 2.5526, "step": 986 }, { "epoch": 1.1228668941979523, "grad_norm": 1.0291063785552979, "learning_rate": 0.0007763367463026167, "loss": 2.1948, "step": 987 }, { "epoch": 1.124004550625711, "grad_norm": 0.9935250282287598, 
"learning_rate": 0.0007761092150170649, "loss": 3.3882, "step": 988 }, { "epoch": 1.1251422070534698, "grad_norm": 1.035112977027893, "learning_rate": 0.0007758816837315131, "loss": 2.0542, "step": 989 }, { "epoch": 1.1262798634812285, "grad_norm": 1.0226819515228271, "learning_rate": 0.0007756541524459614, "loss": 1.8742, "step": 990 }, { "epoch": 1.1274175199089875, "grad_norm": 1.0159075260162354, "learning_rate": 0.0007754266211604095, "loss": 1.3203, "step": 991 }, { "epoch": 1.1285551763367463, "grad_norm": 2.0587501525878906, "learning_rate": 0.0007751990898748577, "loss": 3.9035, "step": 992 }, { "epoch": 1.129692832764505, "grad_norm": 0.9951366186141968, "learning_rate": 0.000774971558589306, "loss": 2.5265, "step": 993 }, { "epoch": 1.130830489192264, "grad_norm": 0.7112089395523071, "learning_rate": 0.0007747440273037543, "loss": 1.3928, "step": 994 }, { "epoch": 1.1319681456200228, "grad_norm": 1.0504612922668457, "learning_rate": 0.0007745164960182025, "loss": 2.0178, "step": 995 }, { "epoch": 1.1331058020477816, "grad_norm": 0.7228176593780518, "learning_rate": 0.0007742889647326508, "loss": 1.4623, "step": 996 }, { "epoch": 1.1342434584755403, "grad_norm": 1.1857068538665771, "learning_rate": 0.000774061433447099, "loss": 2.5628, "step": 997 }, { "epoch": 1.1353811149032993, "grad_norm": 0.6408101916313171, "learning_rate": 0.0007738339021615472, "loss": 1.3296, "step": 998 }, { "epoch": 1.136518771331058, "grad_norm": 0.9753408432006836, "learning_rate": 0.0007736063708759955, "loss": 2.0656, "step": 999 }, { "epoch": 1.1376564277588168, "grad_norm": 1.139440894126892, "learning_rate": 0.0007733788395904437, "loss": 2.2056, "step": 1000 }, { "epoch": 1.1387940841865756, "grad_norm": 0.6815441846847534, "learning_rate": 0.0007731513083048918, "loss": 1.2574, "step": 1001 }, { "epoch": 1.1399317406143346, "grad_norm": 1.032520055770874, "learning_rate": 0.0007729237770193402, "loss": 2.0944, "step": 1002 }, { "epoch": 1.1410693970420933, "grad_norm": 
0.8344050049781799, "learning_rate": 0.0007726962457337884, "loss": 1.5627, "step": 1003 }, { "epoch": 1.142207053469852, "grad_norm": 1.2502180337905884, "learning_rate": 0.0007724687144482366, "loss": 2.0755, "step": 1004 }, { "epoch": 1.1433447098976108, "grad_norm": 1.008817195892334, "learning_rate": 0.0007722411831626849, "loss": 2.4644, "step": 1005 }, { "epoch": 1.1444823663253698, "grad_norm": 1.0237065553665161, "learning_rate": 0.0007720136518771331, "loss": 1.4251, "step": 1006 }, { "epoch": 1.1456200227531286, "grad_norm": 0.9443050026893616, "learning_rate": 0.0007717861205915814, "loss": 2.5625, "step": 1007 }, { "epoch": 1.1467576791808873, "grad_norm": 0.8654250502586365, "learning_rate": 0.0007715585893060296, "loss": 1.999, "step": 1008 }, { "epoch": 1.147895335608646, "grad_norm": 1.082979679107666, "learning_rate": 0.0007713310580204778, "loss": 1.2456, "step": 1009 }, { "epoch": 1.149032992036405, "grad_norm": 1.07938814163208, "learning_rate": 0.0007711035267349262, "loss": 2.0854, "step": 1010 }, { "epoch": 1.1501706484641638, "grad_norm": 0.6003649234771729, "learning_rate": 0.0007708759954493743, "loss": 1.4027, "step": 1011 }, { "epoch": 1.1513083048919226, "grad_norm": 1.157963514328003, "learning_rate": 0.0007706484641638225, "loss": 3.3311, "step": 1012 }, { "epoch": 1.1524459613196814, "grad_norm": 0.5869401097297668, "learning_rate": 0.0007704209328782708, "loss": 1.4386, "step": 1013 }, { "epoch": 1.1535836177474403, "grad_norm": 0.9778612852096558, "learning_rate": 0.000770193401592719, "loss": 2.9109, "step": 1014 }, { "epoch": 1.154721274175199, "grad_norm": 0.9791373610496521, "learning_rate": 0.0007699658703071672, "loss": 1.5482, "step": 1015 }, { "epoch": 1.1558589306029579, "grad_norm": 1.0005147457122803, "learning_rate": 0.0007697383390216155, "loss": 2.9281, "step": 1016 }, { "epoch": 1.1569965870307168, "grad_norm": 0.6844426989555359, "learning_rate": 0.0007695108077360637, "loss": 0.9119, "step": 1017 }, { "epoch": 
1.1581342434584756, "grad_norm": 0.9615586400032043, "learning_rate": 0.000769283276450512, "loss": 2.3095, "step": 1018 }, { "epoch": 1.1592718998862344, "grad_norm": 1.1434069871902466, "learning_rate": 0.0007690557451649603, "loss": 2.035, "step": 1019 }, { "epoch": 1.1604095563139931, "grad_norm": 1.1195374727249146, "learning_rate": 0.0007688282138794085, "loss": 2.8448, "step": 1020 }, { "epoch": 1.1615472127417519, "grad_norm": 0.9364283084869385, "learning_rate": 0.0007686006825938566, "loss": 2.0258, "step": 1021 }, { "epoch": 1.1626848691695109, "grad_norm": 0.37525808811187744, "learning_rate": 0.0007683731513083049, "loss": 0.439, "step": 1022 }, { "epoch": 1.1638225255972696, "grad_norm": 1.8962267637252808, "learning_rate": 0.0007681456200227531, "loss": 2.1773, "step": 1023 }, { "epoch": 1.1649601820250284, "grad_norm": 1.661497712135315, "learning_rate": 0.0007679180887372013, "loss": 2.577, "step": 1024 }, { "epoch": 1.1660978384527874, "grad_norm": 1.1258926391601562, "learning_rate": 0.0007676905574516496, "loss": 2.3722, "step": 1025 }, { "epoch": 1.1672354948805461, "grad_norm": 0.9760286808013916, "learning_rate": 0.0007674630261660978, "loss": 2.135, "step": 1026 }, { "epoch": 1.1683731513083049, "grad_norm": 0.89606112241745, "learning_rate": 0.0007672354948805462, "loss": 1.8733, "step": 1027 }, { "epoch": 1.1695108077360636, "grad_norm": 1.0964630842208862, "learning_rate": 0.0007670079635949944, "loss": 1.9628, "step": 1028 }, { "epoch": 1.1706484641638226, "grad_norm": 1.214050054550171, "learning_rate": 0.0007667804323094426, "loss": 2.3792, "step": 1029 }, { "epoch": 1.1717861205915814, "grad_norm": 1.242618203163147, "learning_rate": 0.0007665529010238908, "loss": 3.0571, "step": 1030 }, { "epoch": 1.1729237770193401, "grad_norm": 0.7527234554290771, "learning_rate": 0.000766325369738339, "loss": 1.7859, "step": 1031 }, { "epoch": 1.174061433447099, "grad_norm": 0.9902629256248474, "learning_rate": 0.0007660978384527872, "loss": 
2.1558, "step": 1032 }, { "epoch": 1.1751990898748579, "grad_norm": 1.109411597251892, "learning_rate": 0.0007658703071672355, "loss": 2.4196, "step": 1033 }, { "epoch": 1.1763367463026166, "grad_norm": 0.9211401343345642, "learning_rate": 0.0007656427758816837, "loss": 1.8503, "step": 1034 }, { "epoch": 1.1774744027303754, "grad_norm": 0.9370328187942505, "learning_rate": 0.000765415244596132, "loss": 2.7057, "step": 1035 }, { "epoch": 1.1786120591581342, "grad_norm": 0.8330713510513306, "learning_rate": 0.0007651877133105803, "loss": 1.6611, "step": 1036 }, { "epoch": 1.1797497155858931, "grad_norm": 0.7571448087692261, "learning_rate": 0.0007649601820250285, "loss": 1.2492, "step": 1037 }, { "epoch": 1.180887372013652, "grad_norm": 1.6156978607177734, "learning_rate": 0.0007647326507394767, "loss": 2.814, "step": 1038 }, { "epoch": 1.1820250284414107, "grad_norm": 1.8085129261016846, "learning_rate": 0.000764505119453925, "loss": 3.3276, "step": 1039 }, { "epoch": 1.1831626848691694, "grad_norm": 1.0824229717254639, "learning_rate": 0.0007642775881683731, "loss": 2.5017, "step": 1040 }, { "epoch": 1.1843003412969284, "grad_norm": 0.6034184098243713, "learning_rate": 0.0007640500568828213, "loss": 1.304, "step": 1041 }, { "epoch": 1.1854379977246872, "grad_norm": 0.71455317735672, "learning_rate": 0.0007638225255972696, "loss": 1.8677, "step": 1042 }, { "epoch": 1.186575654152446, "grad_norm": 0.7810432314872742, "learning_rate": 0.0007635949943117178, "loss": 2.6322, "step": 1043 }, { "epoch": 1.1877133105802047, "grad_norm": 0.7499434351921082, "learning_rate": 0.0007633674630261661, "loss": 1.8316, "step": 1044 }, { "epoch": 1.1888509670079637, "grad_norm": 0.9468162059783936, "learning_rate": 0.0007631399317406144, "loss": 1.5508, "step": 1045 }, { "epoch": 1.1899886234357224, "grad_norm": 1.3421801328659058, "learning_rate": 0.0007629124004550626, "loss": 2.2196, "step": 1046 }, { "epoch": 1.1911262798634812, "grad_norm": 0.9094765782356262, "learning_rate": 
0.0007626848691695109, "loss": 2.1863, "step": 1047 }, { "epoch": 1.1922639362912402, "grad_norm": 0.902521014213562, "learning_rate": 0.0007624573378839591, "loss": 1.4415, "step": 1048 }, { "epoch": 1.193401592718999, "grad_norm": 0.8511042594909668, "learning_rate": 0.0007622298065984073, "loss": 1.5148, "step": 1049 }, { "epoch": 1.1945392491467577, "grad_norm": 0.836769163608551, "learning_rate": 0.0007620022753128555, "loss": 1.819, "step": 1050 }, { "epoch": 1.1956769055745164, "grad_norm": 0.7466940879821777, "learning_rate": 0.0007617747440273037, "loss": 2.2535, "step": 1051 }, { "epoch": 1.1968145620022752, "grad_norm": 0.8963870406150818, "learning_rate": 0.000761547212741752, "loss": 1.8274, "step": 1052 }, { "epoch": 1.1979522184300342, "grad_norm": 1.2822279930114746, "learning_rate": 0.0007613196814562003, "loss": 2.1197, "step": 1053 }, { "epoch": 1.199089874857793, "grad_norm": 1.0114504098892212, "learning_rate": 0.0007610921501706485, "loss": 3.1561, "step": 1054 }, { "epoch": 1.2002275312855517, "grad_norm": 0.7050504088401794, "learning_rate": 0.0007608646188850967, "loss": 0.8309, "step": 1055 }, { "epoch": 1.2013651877133107, "grad_norm": 1.2295100688934326, "learning_rate": 0.000760637087599545, "loss": 2.0593, "step": 1056 }, { "epoch": 1.2025028441410694, "grad_norm": 1.2942835092544556, "learning_rate": 0.0007604095563139932, "loss": 2.06, "step": 1057 }, { "epoch": 1.2036405005688282, "grad_norm": 0.652757465839386, "learning_rate": 0.0007601820250284414, "loss": 1.2173, "step": 1058 }, { "epoch": 1.204778156996587, "grad_norm": 0.7156775593757629, "learning_rate": 0.0007599544937428896, "loss": 1.6357, "step": 1059 }, { "epoch": 1.2059158134243457, "grad_norm": 0.8230730891227722, "learning_rate": 0.0007597269624573379, "loss": 1.0645, "step": 1060 }, { "epoch": 1.2070534698521047, "grad_norm": 1.35136079788208, "learning_rate": 0.0007594994311717861, "loss": 3.901, "step": 1061 }, { "epoch": 1.2081911262798635, "grad_norm": 
0.6722981333732605, "learning_rate": 0.0007592718998862344, "loss": 1.5358, "step": 1062 }, { "epoch": 1.2093287827076222, "grad_norm": 1.035335659980774, "learning_rate": 0.0007590443686006826, "loss": 1.7922, "step": 1063 }, { "epoch": 1.2104664391353812, "grad_norm": 1.9629613161087036, "learning_rate": 0.0007588168373151308, "loss": 3.3534, "step": 1064 }, { "epoch": 1.21160409556314, "grad_norm": 0.9386662244796753, "learning_rate": 0.0007585893060295791, "loss": 1.7383, "step": 1065 }, { "epoch": 1.2127417519908987, "grad_norm": 1.02280592918396, "learning_rate": 0.0007583617747440273, "loss": 2.3867, "step": 1066 }, { "epoch": 1.2138794084186575, "grad_norm": 0.9958323836326599, "learning_rate": 0.0007581342434584756, "loss": 2.2671, "step": 1067 }, { "epoch": 1.2150170648464165, "grad_norm": 0.9132734537124634, "learning_rate": 0.0007579067121729239, "loss": 1.5166, "step": 1068 }, { "epoch": 1.2161547212741752, "grad_norm": 0.9900468587875366, "learning_rate": 0.000757679180887372, "loss": 1.9609, "step": 1069 }, { "epoch": 1.217292377701934, "grad_norm": 0.7868292331695557, "learning_rate": 0.0007574516496018203, "loss": 1.2568, "step": 1070 }, { "epoch": 1.2184300341296928, "grad_norm": 0.7199138402938843, "learning_rate": 0.0007572241183162685, "loss": 1.6196, "step": 1071 }, { "epoch": 1.2195676905574517, "grad_norm": 1.0280647277832031, "learning_rate": 0.0007569965870307167, "loss": 2.3159, "step": 1072 }, { "epoch": 1.2207053469852105, "grad_norm": 1.435230016708374, "learning_rate": 0.000756769055745165, "loss": 1.9331, "step": 1073 }, { "epoch": 1.2218430034129693, "grad_norm": 0.982181966304779, "learning_rate": 0.0007565415244596132, "loss": 1.9982, "step": 1074 }, { "epoch": 1.222980659840728, "grad_norm": 1.21809720993042, "learning_rate": 0.0007563139931740614, "loss": 3.0575, "step": 1075 }, { "epoch": 1.224118316268487, "grad_norm": 1.0095391273498535, "learning_rate": 0.0007560864618885098, "loss": 2.1901, "step": 1076 }, { "epoch": 
1.2252559726962458, "grad_norm": 0.8512126207351685, "learning_rate": 0.000755858930602958, "loss": 1.5792, "step": 1077 }, { "epoch": 1.2263936291240045, "grad_norm": 1.3197203874588013, "learning_rate": 0.0007556313993174062, "loss": 2.7239, "step": 1078 }, { "epoch": 1.2275312855517633, "grad_norm": 1.1738183498382568, "learning_rate": 0.0007554038680318544, "loss": 2.4295, "step": 1079 }, { "epoch": 1.2286689419795223, "grad_norm": 0.743088960647583, "learning_rate": 0.0007551763367463026, "loss": 1.134, "step": 1080 }, { "epoch": 1.229806598407281, "grad_norm": 1.0175997018814087, "learning_rate": 0.0007549488054607508, "loss": 3.1313, "step": 1081 }, { "epoch": 1.2309442548350398, "grad_norm": 1.1242859363555908, "learning_rate": 0.0007547212741751991, "loss": 2.3403, "step": 1082 }, { "epoch": 1.2320819112627985, "grad_norm": 1.0247671604156494, "learning_rate": 0.0007544937428896473, "loss": 1.6092, "step": 1083 }, { "epoch": 1.2332195676905575, "grad_norm": 0.7052936553955078, "learning_rate": 0.0007542662116040955, "loss": 1.915, "step": 1084 }, { "epoch": 1.2343572241183163, "grad_norm": 1.157301425933838, "learning_rate": 0.0007540386803185439, "loss": 1.9053, "step": 1085 }, { "epoch": 1.235494880546075, "grad_norm": 0.7313311100006104, "learning_rate": 0.0007538111490329921, "loss": 2.1036, "step": 1086 }, { "epoch": 1.236632536973834, "grad_norm": 0.998532772064209, "learning_rate": 0.0007535836177474404, "loss": 1.6923, "step": 1087 }, { "epoch": 1.2377701934015928, "grad_norm": 0.8534971475601196, "learning_rate": 0.0007533560864618886, "loss": 2.4127, "step": 1088 }, { "epoch": 1.2389078498293515, "grad_norm": 0.6717026233673096, "learning_rate": 0.0007531285551763367, "loss": 1.4987, "step": 1089 }, { "epoch": 1.2400455062571103, "grad_norm": 0.9222111105918884, "learning_rate": 0.000752901023890785, "loss": 1.3934, "step": 1090 }, { "epoch": 1.241183162684869, "grad_norm": 0.8172008395195007, "learning_rate": 0.0007526734926052332, "loss": 
1.4544, "step": 1091 }, { "epoch": 1.242320819112628, "grad_norm": 0.8321571946144104, "learning_rate": 0.0007524459613196814, "loss": 1.4483, "step": 1092 }, { "epoch": 1.2434584755403868, "grad_norm": 0.663096010684967, "learning_rate": 0.0007522184300341298, "loss": 1.1409, "step": 1093 }, { "epoch": 1.2445961319681456, "grad_norm": 0.871487021446228, "learning_rate": 0.000751990898748578, "loss": 1.6311, "step": 1094 }, { "epoch": 1.2457337883959045, "grad_norm": 0.6653394103050232, "learning_rate": 0.0007517633674630262, "loss": 0.9919, "step": 1095 }, { "epoch": 1.2468714448236633, "grad_norm": 1.093170404434204, "learning_rate": 0.0007515358361774745, "loss": 1.4075, "step": 1096 }, { "epoch": 1.248009101251422, "grad_norm": 0.6205571293830872, "learning_rate": 0.0007513083048919227, "loss": 1.814, "step": 1097 }, { "epoch": 1.2491467576791808, "grad_norm": 1.6945722103118896, "learning_rate": 0.0007510807736063708, "loss": 2.4636, "step": 1098 }, { "epoch": 1.2502844141069396, "grad_norm": 0.8336583375930786, "learning_rate": 0.0007508532423208191, "loss": 1.9484, "step": 1099 }, { "epoch": 1.2514220705346986, "grad_norm": 0.9842813014984131, "learning_rate": 0.0007506257110352673, "loss": 2.2475, "step": 1100 }, { "epoch": 1.2525597269624573, "grad_norm": 0.6157675981521606, "learning_rate": 0.0007503981797497155, "loss": 1.1768, "step": 1101 }, { "epoch": 1.253697383390216, "grad_norm": 1.6868700981140137, "learning_rate": 0.0007501706484641639, "loss": 1.702, "step": 1102 }, { "epoch": 1.254835039817975, "grad_norm": 0.7272259593009949, "learning_rate": 0.0007499431171786121, "loss": 1.5946, "step": 1103 }, { "epoch": 1.2559726962457338, "grad_norm": 1.0236316919326782, "learning_rate": 0.0007497155858930603, "loss": 1.6014, "step": 1104 }, { "epoch": 1.2571103526734926, "grad_norm": 0.9424517750740051, "learning_rate": 0.0007494880546075086, "loss": 2.589, "step": 1105 }, { "epoch": 1.2582480091012513, "grad_norm": 0.8940749168395996, "learning_rate": 
0.0007492605233219568, "loss": 2.1612, "step": 1106 }, { "epoch": 1.25938566552901, "grad_norm": 1.2272729873657227, "learning_rate": 0.000749032992036405, "loss": 2.1587, "step": 1107 }, { "epoch": 1.260523321956769, "grad_norm": 0.6935333609580994, "learning_rate": 0.0007488054607508532, "loss": 1.8994, "step": 1108 }, { "epoch": 1.2616609783845278, "grad_norm": 0.9777292013168335, "learning_rate": 0.0007485779294653014, "loss": 1.6954, "step": 1109 }, { "epoch": 1.2627986348122868, "grad_norm": 0.7491214275360107, "learning_rate": 0.0007483503981797498, "loss": 0.9805, "step": 1110 }, { "epoch": 1.2639362912400456, "grad_norm": 1.3111625909805298, "learning_rate": 0.000748122866894198, "loss": 3.0255, "step": 1111 }, { "epoch": 1.2650739476678043, "grad_norm": 0.9766296744346619, "learning_rate": 0.0007478953356086462, "loss": 1.5356, "step": 1112 }, { "epoch": 1.266211604095563, "grad_norm": 0.731914758682251, "learning_rate": 0.0007476678043230945, "loss": 1.7081, "step": 1113 }, { "epoch": 1.2673492605233219, "grad_norm": 2.02380633354187, "learning_rate": 0.0007474402730375427, "loss": 4.2285, "step": 1114 }, { "epoch": 1.2684869169510808, "grad_norm": 1.3702422380447388, "learning_rate": 0.0007472127417519909, "loss": 1.6421, "step": 1115 }, { "epoch": 1.2696245733788396, "grad_norm": 1.1319351196289062, "learning_rate": 0.0007469852104664392, "loss": 2.3609, "step": 1116 }, { "epoch": 1.2707622298065984, "grad_norm": 0.7127083539962769, "learning_rate": 0.0007467576791808874, "loss": 1.7728, "step": 1117 }, { "epoch": 1.2718998862343573, "grad_norm": 1.783090353012085, "learning_rate": 0.0007465301478953355, "loss": 3.132, "step": 1118 }, { "epoch": 1.273037542662116, "grad_norm": 0.9179248809814453, "learning_rate": 0.0007463026166097839, "loss": 2.0857, "step": 1119 }, { "epoch": 1.2741751990898749, "grad_norm": 0.9802193641662598, "learning_rate": 0.0007460750853242321, "loss": 2.608, "step": 1120 }, { "epoch": 1.2753128555176336, "grad_norm": 
0.855622410774231, "learning_rate": 0.0007458475540386803, "loss": 1.9717, "step": 1121 }, { "epoch": 1.2764505119453924, "grad_norm": 1.1232682466506958, "learning_rate": 0.0007456200227531286, "loss": 2.113, "step": 1122 }, { "epoch": 1.2775881683731514, "grad_norm": 0.699447512626648, "learning_rate": 0.0007453924914675768, "loss": 1.2087, "step": 1123 }, { "epoch": 1.2787258248009101, "grad_norm": 1.3320637941360474, "learning_rate": 0.000745164960182025, "loss": 1.3834, "step": 1124 }, { "epoch": 1.2798634812286689, "grad_norm": 1.0277353525161743, "learning_rate": 0.0007449374288964733, "loss": 2.2678, "step": 1125 }, { "epoch": 1.2810011376564279, "grad_norm": 0.8935559391975403, "learning_rate": 0.0007447098976109215, "loss": 1.816, "step": 1126 }, { "epoch": 1.2821387940841866, "grad_norm": 1.2181432247161865, "learning_rate": 0.0007444823663253698, "loss": 2.2487, "step": 1127 }, { "epoch": 1.2832764505119454, "grad_norm": 1.007584810256958, "learning_rate": 0.000744254835039818, "loss": 2.1345, "step": 1128 }, { "epoch": 1.2844141069397041, "grad_norm": 1.0149571895599365, "learning_rate": 0.0007440273037542662, "loss": 2.4568, "step": 1129 }, { "epoch": 1.285551763367463, "grad_norm": 0.6311346888542175, "learning_rate": 0.0007437997724687145, "loss": 1.5611, "step": 1130 }, { "epoch": 1.286689419795222, "grad_norm": 1.3615020513534546, "learning_rate": 0.0007435722411831627, "loss": 2.5493, "step": 1131 }, { "epoch": 1.2878270762229806, "grad_norm": 0.7496081590652466, "learning_rate": 0.0007433447098976109, "loss": 1.5885, "step": 1132 }, { "epoch": 1.2889647326507394, "grad_norm": 0.9829562306404114, "learning_rate": 0.0007431171786120592, "loss": 1.8779, "step": 1133 }, { "epoch": 1.2901023890784984, "grad_norm": 1.2002413272857666, "learning_rate": 0.0007428896473265074, "loss": 1.9528, "step": 1134 }, { "epoch": 1.2912400455062572, "grad_norm": 0.7884618639945984, "learning_rate": 0.0007426621160409557, "loss": 0.8373, "step": 1135 }, { "epoch": 
1.292377701934016, "grad_norm": 0.734190821647644, "learning_rate": 0.000742434584755404, "loss": 1.6223, "step": 1136 }, { "epoch": 1.2935153583617747, "grad_norm": 1.1885126829147339, "learning_rate": 0.0007422070534698521, "loss": 2.1086, "step": 1137 }, { "epoch": 1.2946530147895334, "grad_norm": 1.3277819156646729, "learning_rate": 0.0007419795221843003, "loss": 2.6113, "step": 1138 }, { "epoch": 1.2957906712172924, "grad_norm": 1.1494975090026855, "learning_rate": 0.0007417519908987486, "loss": 2.57, "step": 1139 }, { "epoch": 1.2969283276450512, "grad_norm": 0.7995481491088867, "learning_rate": 0.0007415244596131968, "loss": 2.041, "step": 1140 }, { "epoch": 1.29806598407281, "grad_norm": 1.1029703617095947, "learning_rate": 0.000741296928327645, "loss": 2.4597, "step": 1141 }, { "epoch": 1.299203640500569, "grad_norm": 0.808023989200592, "learning_rate": 0.0007410693970420933, "loss": 1.195, "step": 1142 }, { "epoch": 1.3003412969283277, "grad_norm": 1.3540676832199097, "learning_rate": 0.0007408418657565415, "loss": 2.8982, "step": 1143 }, { "epoch": 1.3014789533560864, "grad_norm": 0.6362115740776062, "learning_rate": 0.0007406143344709898, "loss": 0.9607, "step": 1144 }, { "epoch": 1.3026166097838452, "grad_norm": 0.5855313539505005, "learning_rate": 0.0007403868031854381, "loss": 1.1911, "step": 1145 }, { "epoch": 1.3037542662116042, "grad_norm": 0.902195930480957, "learning_rate": 0.0007401592718998863, "loss": 1.8625, "step": 1146 }, { "epoch": 1.304891922639363, "grad_norm": 0.9085184335708618, "learning_rate": 0.0007399317406143344, "loss": 2.2494, "step": 1147 }, { "epoch": 1.3060295790671217, "grad_norm": 0.8404464721679688, "learning_rate": 0.0007397042093287827, "loss": 1.7344, "step": 1148 }, { "epoch": 1.3071672354948807, "grad_norm": 1.5872386693954468, "learning_rate": 0.0007394766780432309, "loss": 2.6294, "step": 1149 }, { "epoch": 1.3083048919226394, "grad_norm": 0.81778484582901, "learning_rate": 0.0007392491467576792, "loss": 1.716, 
"step": 1150 }, { "epoch": 1.3094425483503982, "grad_norm": 0.9477559924125671, "learning_rate": 0.0007390216154721274, "loss": 1.7293, "step": 1151 }, { "epoch": 1.310580204778157, "grad_norm": 1.0724713802337646, "learning_rate": 0.0007387940841865757, "loss": 2.1255, "step": 1152 }, { "epoch": 1.3117178612059157, "grad_norm": 0.7965221405029297, "learning_rate": 0.000738566552901024, "loss": 1.1067, "step": 1153 }, { "epoch": 1.3128555176336747, "grad_norm": 0.9288459420204163, "learning_rate": 0.0007383390216154722, "loss": 2.1202, "step": 1154 }, { "epoch": 1.3139931740614335, "grad_norm": 1.221725344657898, "learning_rate": 0.0007381114903299204, "loss": 2.4878, "step": 1155 }, { "epoch": 1.3151308304891922, "grad_norm": 1.0420432090759277, "learning_rate": 0.0007378839590443687, "loss": 1.9784, "step": 1156 }, { "epoch": 1.3162684869169512, "grad_norm": 1.3146973848342896, "learning_rate": 0.0007376564277588168, "loss": 2.385, "step": 1157 }, { "epoch": 1.31740614334471, "grad_norm": 1.1637381315231323, "learning_rate": 0.000737428896473265, "loss": 1.8719, "step": 1158 }, { "epoch": 1.3185437997724687, "grad_norm": 1.2052867412567139, "learning_rate": 0.0007372013651877133, "loss": 3.4914, "step": 1159 }, { "epoch": 1.3196814562002275, "grad_norm": 0.9300926327705383, "learning_rate": 0.0007369738339021615, "loss": 2.7666, "step": 1160 }, { "epoch": 1.3208191126279862, "grad_norm": 1.0949327945709229, "learning_rate": 0.0007367463026166098, "loss": 2.3889, "step": 1161 }, { "epoch": 1.3219567690557452, "grad_norm": 1.1416770219802856, "learning_rate": 0.0007365187713310581, "loss": 2.082, "step": 1162 }, { "epoch": 1.323094425483504, "grad_norm": 0.6161019802093506, "learning_rate": 0.0007362912400455063, "loss": 1.1583, "step": 1163 }, { "epoch": 1.3242320819112627, "grad_norm": 1.192044734954834, "learning_rate": 0.0007360637087599545, "loss": 3.2384, "step": 1164 }, { "epoch": 1.3253697383390217, "grad_norm": 0.7260434031486511, "learning_rate": 
0.0007358361774744028, "loss": 1.4132, "step": 1165 }, { "epoch": 1.3265073947667805, "grad_norm": 0.8180050253868103, "learning_rate": 0.0007356086461888509, "loss": 1.8317, "step": 1166 }, { "epoch": 1.3276450511945392, "grad_norm": 0.7380133867263794, "learning_rate": 0.0007353811149032991, "loss": 1.4301, "step": 1167 }, { "epoch": 1.328782707622298, "grad_norm": 0.9578267335891724, "learning_rate": 0.0007351535836177474, "loss": 1.9568, "step": 1168 }, { "epoch": 1.3299203640500568, "grad_norm": 0.9573442339897156, "learning_rate": 0.0007349260523321957, "loss": 2.6368, "step": 1169 }, { "epoch": 1.3310580204778157, "grad_norm": 0.8377273678779602, "learning_rate": 0.000734698521046644, "loss": 1.9496, "step": 1170 }, { "epoch": 1.3321956769055745, "grad_norm": 1.150334358215332, "learning_rate": 0.0007344709897610922, "loss": 1.8031, "step": 1171 }, { "epoch": 1.3333333333333333, "grad_norm": 1.1262184381484985, "learning_rate": 0.0007342434584755404, "loss": 2.4094, "step": 1172 }, { "epoch": 1.3344709897610922, "grad_norm": 0.8950188159942627, "learning_rate": 0.0007340159271899887, "loss": 1.557, "step": 1173 }, { "epoch": 1.335608646188851, "grad_norm": 1.0164450407028198, "learning_rate": 0.0007337883959044369, "loss": 2.5082, "step": 1174 }, { "epoch": 1.3367463026166098, "grad_norm": 0.6207723021507263, "learning_rate": 0.0007335608646188851, "loss": 1.6945, "step": 1175 }, { "epoch": 1.3378839590443685, "grad_norm": 1.3576558828353882, "learning_rate": 0.0007333333333333333, "loss": 2.383, "step": 1176 }, { "epoch": 1.3390216154721273, "grad_norm": 1.040291428565979, "learning_rate": 0.0007331058020477816, "loss": 1.887, "step": 1177 }, { "epoch": 1.3401592718998863, "grad_norm": 1.2558809518814087, "learning_rate": 0.0007328782707622298, "loss": 2.6059, "step": 1178 }, { "epoch": 1.341296928327645, "grad_norm": 1.1690802574157715, "learning_rate": 0.0007326507394766781, "loss": 2.0956, "step": 1179 }, { "epoch": 1.342434584755404, "grad_norm": 
0.6343255043029785, "learning_rate": 0.0007324232081911263, "loss": 1.1936, "step": 1180 }, { "epoch": 1.3435722411831628, "grad_norm": 1.210610270500183, "learning_rate": 0.0007321956769055745, "loss": 2.7752, "step": 1181 }, { "epoch": 1.3447098976109215, "grad_norm": 1.1297094821929932, "learning_rate": 0.0007319681456200228, "loss": 1.6202, "step": 1182 }, { "epoch": 1.3458475540386803, "grad_norm": 1.0680432319641113, "learning_rate": 0.000731740614334471, "loss": 2.5643, "step": 1183 }, { "epoch": 1.346985210466439, "grad_norm": 1.117632508277893, "learning_rate": 0.0007315130830489192, "loss": 2.2324, "step": 1184 }, { "epoch": 1.348122866894198, "grad_norm": 1.2003607749938965, "learning_rate": 0.0007312855517633676, "loss": 1.7135, "step": 1185 }, { "epoch": 1.3492605233219568, "grad_norm": 1.0429855585098267, "learning_rate": 0.0007310580204778157, "loss": 1.7824, "step": 1186 }, { "epoch": 1.3503981797497155, "grad_norm": 1.0069884061813354, "learning_rate": 0.0007308304891922639, "loss": 1.6408, "step": 1187 }, { "epoch": 1.3515358361774745, "grad_norm": 0.7664169669151306, "learning_rate": 0.0007306029579067122, "loss": 1.3137, "step": 1188 }, { "epoch": 1.3526734926052333, "grad_norm": 0.9586772322654724, "learning_rate": 0.0007303754266211604, "loss": 1.861, "step": 1189 }, { "epoch": 1.353811149032992, "grad_norm": 0.6649326086044312, "learning_rate": 0.0007301478953356086, "loss": 1.0684, "step": 1190 }, { "epoch": 1.3549488054607508, "grad_norm": 1.4369438886642456, "learning_rate": 0.0007299203640500569, "loss": 3.2283, "step": 1191 }, { "epoch": 1.3560864618885096, "grad_norm": 1.1447542905807495, "learning_rate": 0.0007296928327645051, "loss": 2.1419, "step": 1192 }, { "epoch": 1.3572241183162685, "grad_norm": 0.9492262601852417, "learning_rate": 0.0007294653014789535, "loss": 1.9544, "step": 1193 }, { "epoch": 1.3583617747440273, "grad_norm": 0.8491740822792053, "learning_rate": 0.0007292377701934017, "loss": 1.5818, "step": 1194 }, { "epoch": 
1.359499431171786, "grad_norm": 1.0647927522659302, "learning_rate": 0.0007290102389078499, "loss": 2.6932, "step": 1195 }, { "epoch": 1.360637087599545, "grad_norm": 1.270956039428711, "learning_rate": 0.0007287827076222981, "loss": 1.9941, "step": 1196 }, { "epoch": 1.3617747440273038, "grad_norm": 0.9028446674346924, "learning_rate": 0.0007285551763367463, "loss": 2.2445, "step": 1197 }, { "epoch": 1.3629124004550626, "grad_norm": 0.8218393921852112, "learning_rate": 0.0007283276450511945, "loss": 1.9432, "step": 1198 }, { "epoch": 1.3640500568828213, "grad_norm": 0.7149525284767151, "learning_rate": 0.0007281001137656428, "loss": 0.8669, "step": 1199 }, { "epoch": 1.36518771331058, "grad_norm": 0.9978352189064026, "learning_rate": 0.000727872582480091, "loss": 1.4849, "step": 1200 }, { "epoch": 1.366325369738339, "grad_norm": 0.9793040752410889, "learning_rate": 0.0007276450511945392, "loss": 1.7734, "step": 1201 }, { "epoch": 1.3674630261660978, "grad_norm": 0.8100583553314209, "learning_rate": 0.0007274175199089876, "loss": 1.7183, "step": 1202 }, { "epoch": 1.3686006825938566, "grad_norm": 1.051924467086792, "learning_rate": 0.0007271899886234358, "loss": 2.0836, "step": 1203 }, { "epoch": 1.3697383390216156, "grad_norm": 1.057503342628479, "learning_rate": 0.000726962457337884, "loss": 2.1088, "step": 1204 }, { "epoch": 1.3708759954493743, "grad_norm": 1.1179509162902832, "learning_rate": 0.0007267349260523322, "loss": 1.452, "step": 1205 }, { "epoch": 1.372013651877133, "grad_norm": 1.2062307596206665, "learning_rate": 0.0007265073947667804, "loss": 2.4706, "step": 1206 }, { "epoch": 1.3731513083048918, "grad_norm": 1.5062495470046997, "learning_rate": 0.0007262798634812286, "loss": 3.7995, "step": 1207 }, { "epoch": 1.3742889647326506, "grad_norm": 1.0101311206817627, "learning_rate": 0.0007260523321956769, "loss": 1.889, "step": 1208 }, { "epoch": 1.3754266211604096, "grad_norm": 0.5304610729217529, "learning_rate": 0.0007258248009101251, "loss": 0.8576, 
"step": 1209 }, { "epoch": 1.3765642775881684, "grad_norm": 0.82547926902771, "learning_rate": 0.0007255972696245733, "loss": 1.8134, "step": 1210 }, { "epoch": 1.377701934015927, "grad_norm": 1.220291256904602, "learning_rate": 0.0007253697383390217, "loss": 2.1109, "step": 1211 }, { "epoch": 1.378839590443686, "grad_norm": 0.7384538650512695, "learning_rate": 0.0007251422070534699, "loss": 1.9597, "step": 1212 }, { "epoch": 1.3799772468714449, "grad_norm": 1.0133798122406006, "learning_rate": 0.0007249146757679182, "loss": 2.7964, "step": 1213 }, { "epoch": 1.3811149032992036, "grad_norm": 1.00083327293396, "learning_rate": 0.0007246871444823664, "loss": 2.6457, "step": 1214 }, { "epoch": 1.3822525597269624, "grad_norm": 0.9185119271278381, "learning_rate": 0.0007244596131968145, "loss": 1.5573, "step": 1215 }, { "epoch": 1.3833902161547214, "grad_norm": 0.8959344029426575, "learning_rate": 0.0007242320819112628, "loss": 1.5507, "step": 1216 }, { "epoch": 1.3845278725824801, "grad_norm": 0.7419568300247192, "learning_rate": 0.000724004550625711, "loss": 1.599, "step": 1217 }, { "epoch": 1.3856655290102389, "grad_norm": 1.2727500200271606, "learning_rate": 0.0007237770193401592, "loss": 1.8304, "step": 1218 }, { "epoch": 1.3868031854379979, "grad_norm": 0.6277416944503784, "learning_rate": 0.0007235494880546076, "loss": 1.4432, "step": 1219 }, { "epoch": 1.3879408418657566, "grad_norm": 1.2073726654052734, "learning_rate": 0.0007233219567690558, "loss": 2.4847, "step": 1220 }, { "epoch": 1.3890784982935154, "grad_norm": 1.075276255607605, "learning_rate": 0.000723094425483504, "loss": 2.4881, "step": 1221 }, { "epoch": 1.3902161547212741, "grad_norm": 1.0907241106033325, "learning_rate": 0.0007228668941979523, "loss": 2.7326, "step": 1222 }, { "epoch": 1.391353811149033, "grad_norm": 0.9791719317436218, "learning_rate": 0.0007226393629124005, "loss": 1.8307, "step": 1223 }, { "epoch": 1.3924914675767919, "grad_norm": 1.1889147758483887, "learning_rate": 
0.0007224118316268487, "loss": 3.306, "step": 1224 }, { "epoch": 1.3936291240045506, "grad_norm": 0.9719458818435669, "learning_rate": 0.0007221843003412969, "loss": 1.7917, "step": 1225 }, { "epoch": 1.3947667804323094, "grad_norm": 1.136434555053711, "learning_rate": 0.0007219567690557451, "loss": 2.1016, "step": 1226 }, { "epoch": 1.3959044368600684, "grad_norm": 0.9792470335960388, "learning_rate": 0.0007217292377701933, "loss": 1.5415, "step": 1227 }, { "epoch": 1.3970420932878271, "grad_norm": 0.5439932346343994, "learning_rate": 0.0007215017064846417, "loss": 0.6703, "step": 1228 }, { "epoch": 1.398179749715586, "grad_norm": 1.0243198871612549, "learning_rate": 0.0007212741751990899, "loss": 1.7786, "step": 1229 }, { "epoch": 1.3993174061433447, "grad_norm": 1.2160857915878296, "learning_rate": 0.0007210466439135381, "loss": 2.0586, "step": 1230 }, { "epoch": 1.4004550625711034, "grad_norm": 1.0249682664871216, "learning_rate": 0.0007208191126279864, "loss": 2.3755, "step": 1231 }, { "epoch": 1.4015927189988624, "grad_norm": 1.3977047204971313, "learning_rate": 0.0007205915813424346, "loss": 2.1492, "step": 1232 }, { "epoch": 1.4027303754266212, "grad_norm": 0.7847321033477783, "learning_rate": 0.0007203640500568829, "loss": 1.2055, "step": 1233 }, { "epoch": 1.40386803185438, "grad_norm": 1.104148030281067, "learning_rate": 0.0007201365187713311, "loss": 3.1498, "step": 1234 }, { "epoch": 1.405005688282139, "grad_norm": 0.87827068567276, "learning_rate": 0.0007199089874857792, "loss": 2.2748, "step": 1235 }, { "epoch": 1.4061433447098977, "grad_norm": 1.0710291862487793, "learning_rate": 0.0007196814562002276, "loss": 1.4276, "step": 1236 }, { "epoch": 1.4072810011376564, "grad_norm": 0.950809121131897, "learning_rate": 0.0007194539249146758, "loss": 1.8939, "step": 1237 }, { "epoch": 1.4084186575654152, "grad_norm": 1.010000228881836, "learning_rate": 0.000719226393629124, "loss": 1.6589, "step": 1238 }, { "epoch": 1.409556313993174, "grad_norm": 
1.1753206253051758, "learning_rate": 0.0007189988623435723, "loss": 2.3059, "step": 1239 }, { "epoch": 1.410693970420933, "grad_norm": 1.1671147346496582, "learning_rate": 0.0007187713310580205, "loss": 2.4946, "step": 1240 }, { "epoch": 1.4118316268486917, "grad_norm": 0.8529374599456787, "learning_rate": 0.0007185437997724687, "loss": 2.0972, "step": 1241 }, { "epoch": 1.4129692832764504, "grad_norm": 1.1962100267410278, "learning_rate": 0.000718316268486917, "loss": 2.4916, "step": 1242 }, { "epoch": 1.4141069397042094, "grad_norm": 0.8587897419929504, "learning_rate": 0.0007180887372013652, "loss": 0.9147, "step": 1243 }, { "epoch": 1.4152445961319682, "grad_norm": 0.9558615684509277, "learning_rate": 0.0007178612059158133, "loss": 1.8377, "step": 1244 }, { "epoch": 1.416382252559727, "grad_norm": 0.694108784198761, "learning_rate": 0.0007176336746302617, "loss": 1.4134, "step": 1245 }, { "epoch": 1.4175199089874857, "grad_norm": 0.855204164981842, "learning_rate": 0.0007174061433447099, "loss": 2.012, "step": 1246 }, { "epoch": 1.4186575654152445, "grad_norm": 1.4311326742172241, "learning_rate": 0.0007171786120591581, "loss": 3.41, "step": 1247 }, { "epoch": 1.4197952218430034, "grad_norm": 0.9610998630523682, "learning_rate": 0.0007169510807736064, "loss": 1.6547, "step": 1248 }, { "epoch": 1.4209328782707622, "grad_norm": 0.7780663371086121, "learning_rate": 0.0007167235494880546, "loss": 1.9053, "step": 1249 }, { "epoch": 1.4220705346985212, "grad_norm": 1.3516772985458374, "learning_rate": 0.0007164960182025028, "loss": 0.701, "step": 1250 }, { "epoch": 1.42320819112628, "grad_norm": 1.596235990524292, "learning_rate": 0.0007162684869169511, "loss": 2.6895, "step": 1251 }, { "epoch": 1.4243458475540387, "grad_norm": 0.9624771475791931, "learning_rate": 0.0007160409556313994, "loss": 1.341, "step": 1252 }, { "epoch": 1.4254835039817975, "grad_norm": 0.8453911542892456, "learning_rate": 0.0007158134243458477, "loss": 2.0326, "step": 1253 }, { "epoch": 
1.4266211604095562, "grad_norm": 1.0875787734985352, "learning_rate": 0.0007155858930602958, "loss": 1.4671, "step": 1254 }, { "epoch": 1.4277588168373152, "grad_norm": 1.0011261701583862, "learning_rate": 0.000715358361774744, "loss": 1.4844, "step": 1255 }, { "epoch": 1.428896473265074, "grad_norm": 0.7726243734359741, "learning_rate": 0.0007151308304891923, "loss": 1.7404, "step": 1256 }, { "epoch": 1.4300341296928327, "grad_norm": 0.7656590342521667, "learning_rate": 0.0007149032992036405, "loss": 1.6766, "step": 1257 }, { "epoch": 1.4311717861205917, "grad_norm": 0.8595698475837708, "learning_rate": 0.0007146757679180887, "loss": 2.1565, "step": 1258 }, { "epoch": 1.4323094425483505, "grad_norm": 1.003932237625122, "learning_rate": 0.000714448236632537, "loss": 1.721, "step": 1259 }, { "epoch": 1.4334470989761092, "grad_norm": 0.8126673102378845, "learning_rate": 0.0007142207053469852, "loss": 2.0854, "step": 1260 }, { "epoch": 1.434584755403868, "grad_norm": 0.9045354723930359, "learning_rate": 0.0007139931740614335, "loss": 1.5096, "step": 1261 }, { "epoch": 1.4357224118316267, "grad_norm": 0.923866331577301, "learning_rate": 0.0007137656427758818, "loss": 2.2521, "step": 1262 }, { "epoch": 1.4368600682593857, "grad_norm": 0.8187153339385986, "learning_rate": 0.00071353811149033, "loss": 2.3217, "step": 1263 }, { "epoch": 1.4379977246871445, "grad_norm": 1.0635052919387817, "learning_rate": 0.0007133105802047781, "loss": 1.8729, "step": 1264 }, { "epoch": 1.4391353811149032, "grad_norm": 0.8792582750320435, "learning_rate": 0.0007130830489192264, "loss": 1.9955, "step": 1265 }, { "epoch": 1.4402730375426622, "grad_norm": 1.3968185186386108, "learning_rate": 0.0007128555176336746, "loss": 4.4219, "step": 1266 }, { "epoch": 1.441410693970421, "grad_norm": 0.956078827381134, "learning_rate": 0.0007126279863481228, "loss": 1.5945, "step": 1267 }, { "epoch": 1.4425483503981797, "grad_norm": 0.8696045875549316, "learning_rate": 0.0007124004550625711, "loss": 
1.1966, "step": 1268 }, { "epoch": 1.4436860068259385, "grad_norm": 0.7060182690620422, "learning_rate": 0.0007121729237770194, "loss": 1.3995, "step": 1269 }, { "epoch": 1.4448236632536973, "grad_norm": 0.8254784941673279, "learning_rate": 0.0007119453924914676, "loss": 1.0873, "step": 1270 }, { "epoch": 1.4459613196814562, "grad_norm": 1.0359618663787842, "learning_rate": 0.0007117178612059159, "loss": 2.2489, "step": 1271 }, { "epoch": 1.447098976109215, "grad_norm": 0.9475293159484863, "learning_rate": 0.0007114903299203641, "loss": 1.8545, "step": 1272 }, { "epoch": 1.4482366325369738, "grad_norm": 0.7588447332382202, "learning_rate": 0.0007112627986348122, "loss": 1.5502, "step": 1273 }, { "epoch": 1.4493742889647327, "grad_norm": 1.24519681930542, "learning_rate": 0.0007110352673492605, "loss": 2.6479, "step": 1274 }, { "epoch": 1.4505119453924915, "grad_norm": 0.6365463137626648, "learning_rate": 0.0007108077360637087, "loss": 1.404, "step": 1275 }, { "epoch": 1.4516496018202503, "grad_norm": 1.4220560789108276, "learning_rate": 0.000710580204778157, "loss": 2.4303, "step": 1276 }, { "epoch": 1.452787258248009, "grad_norm": 0.965912938117981, "learning_rate": 0.0007103526734926053, "loss": 2.1538, "step": 1277 }, { "epoch": 1.4539249146757678, "grad_norm": 1.103785514831543, "learning_rate": 0.0007101251422070535, "loss": 2.8817, "step": 1278 }, { "epoch": 1.4550625711035268, "grad_norm": 1.0863354206085205, "learning_rate": 0.0007098976109215018, "loss": 3.1132, "step": 1279 }, { "epoch": 1.4562002275312855, "grad_norm": 0.988666296005249, "learning_rate": 0.00070967007963595, "loss": 1.915, "step": 1280 }, { "epoch": 1.4573378839590443, "grad_norm": 1.0589752197265625, "learning_rate": 0.0007094425483503982, "loss": 1.6478, "step": 1281 }, { "epoch": 1.4584755403868033, "grad_norm": 0.9732983708381653, "learning_rate": 0.0007092150170648465, "loss": 1.834, "step": 1282 }, { "epoch": 1.459613196814562, "grad_norm": 0.932620644569397, "learning_rate": 
0.0007089874857792946, "loss": 2.184, "step": 1283 }, { "epoch": 1.4607508532423208, "grad_norm": 1.0213489532470703, "learning_rate": 0.0007087599544937428, "loss": 1.9008, "step": 1284 }, { "epoch": 1.4618885096700796, "grad_norm": 0.8882502317428589, "learning_rate": 0.0007085324232081911, "loss": 1.6468, "step": 1285 }, { "epoch": 1.4630261660978385, "grad_norm": 0.794425904750824, "learning_rate": 0.0007083048919226394, "loss": 2.0429, "step": 1286 }, { "epoch": 1.4641638225255973, "grad_norm": 1.1039729118347168, "learning_rate": 0.0007080773606370876, "loss": 2.3133, "step": 1287 }, { "epoch": 1.465301478953356, "grad_norm": 1.3886933326721191, "learning_rate": 0.0007078498293515359, "loss": 2.0589, "step": 1288 }, { "epoch": 1.466439135381115, "grad_norm": 0.6157169938087463, "learning_rate": 0.0007076222980659841, "loss": 1.0264, "step": 1289 }, { "epoch": 1.4675767918088738, "grad_norm": 1.0444914102554321, "learning_rate": 0.0007073947667804323, "loss": 2.8021, "step": 1290 }, { "epoch": 1.4687144482366326, "grad_norm": 0.9021384716033936, "learning_rate": 0.0007071672354948806, "loss": 1.5786, "step": 1291 }, { "epoch": 1.4698521046643913, "grad_norm": 0.9910659790039062, "learning_rate": 0.0007069397042093288, "loss": 1.3046, "step": 1292 }, { "epoch": 1.47098976109215, "grad_norm": 0.8417410254478455, "learning_rate": 0.0007067121729237769, "loss": 1.388, "step": 1293 }, { "epoch": 1.472127417519909, "grad_norm": 0.8313772082328796, "learning_rate": 0.0007064846416382253, "loss": 1.5113, "step": 1294 }, { "epoch": 1.4732650739476678, "grad_norm": 0.9730493426322937, "learning_rate": 0.0007062571103526735, "loss": 1.3389, "step": 1295 }, { "epoch": 1.4744027303754266, "grad_norm": 0.9900069832801819, "learning_rate": 0.0007060295790671218, "loss": 2.1483, "step": 1296 }, { "epoch": 1.4755403868031856, "grad_norm": 0.8475415110588074, "learning_rate": 0.00070580204778157, "loss": 1.397, "step": 1297 }, { "epoch": 1.4766780432309443, "grad_norm": 
1.0743128061294556, "learning_rate": 0.0007055745164960182, "loss": 1.8463, "step": 1298 }, { "epoch": 1.477815699658703, "grad_norm": 1.1737279891967773, "learning_rate": 0.0007053469852104665, "loss": 3.4954, "step": 1299 }, { "epoch": 1.4789533560864618, "grad_norm": 1.0930012464523315, "learning_rate": 0.0007051194539249147, "loss": 2.65, "step": 1300 }, { "epoch": 1.4800910125142206, "grad_norm": 0.6050748229026794, "learning_rate": 0.0007048919226393629, "loss": 1.1043, "step": 1301 }, { "epoch": 1.4812286689419796, "grad_norm": 2.5956709384918213, "learning_rate": 0.0007046643913538113, "loss": 4.9918, "step": 1302 }, { "epoch": 1.4823663253697383, "grad_norm": 1.5760893821716309, "learning_rate": 0.0007044368600682594, "loss": 3.41, "step": 1303 }, { "epoch": 1.483503981797497, "grad_norm": 1.2234876155853271, "learning_rate": 0.0007042093287827076, "loss": 2.0168, "step": 1304 }, { "epoch": 1.484641638225256, "grad_norm": 1.5653973817825317, "learning_rate": 0.0007039817974971559, "loss": 3.4326, "step": 1305 }, { "epoch": 1.4857792946530148, "grad_norm": 0.9075149297714233, "learning_rate": 0.0007037542662116041, "loss": 1.5127, "step": 1306 }, { "epoch": 1.4869169510807736, "grad_norm": 0.8964717984199524, "learning_rate": 0.0007035267349260523, "loss": 1.9019, "step": 1307 }, { "epoch": 1.4880546075085324, "grad_norm": 0.8409413695335388, "learning_rate": 0.0007032992036405006, "loss": 1.4982, "step": 1308 }, { "epoch": 1.4891922639362911, "grad_norm": 0.7918345928192139, "learning_rate": 0.0007030716723549488, "loss": 1.5671, "step": 1309 }, { "epoch": 1.49032992036405, "grad_norm": 0.9930965900421143, "learning_rate": 0.000702844141069397, "loss": 1.7916, "step": 1310 }, { "epoch": 1.4914675767918089, "grad_norm": 0.7483389377593994, "learning_rate": 0.0007026166097838454, "loss": 1.2254, "step": 1311 }, { "epoch": 1.4926052332195676, "grad_norm": 0.5501680374145508, "learning_rate": 0.0007023890784982935, "loss": 1.4911, "step": 1312 }, { "epoch": 
1.4937428896473266, "grad_norm": 0.6416218280792236, "learning_rate": 0.0007021615472127417, "loss": 1.1327, "step": 1313 }, { "epoch": 1.4948805460750854, "grad_norm": 1.3638542890548706, "learning_rate": 0.00070193401592719, "loss": 2.1457, "step": 1314 }, { "epoch": 1.4960182025028441, "grad_norm": 0.625977635383606, "learning_rate": 0.0007017064846416382, "loss": 0.9247, "step": 1315 }, { "epoch": 1.4971558589306029, "grad_norm": 0.9826652407646179, "learning_rate": 0.0007014789533560865, "loss": 2.6433, "step": 1316 }, { "epoch": 1.4982935153583616, "grad_norm": 0.8866605162620544, "learning_rate": 0.0007012514220705347, "loss": 1.3826, "step": 1317 }, { "epoch": 1.4994311717861206, "grad_norm": 0.7106355428695679, "learning_rate": 0.0007010238907849829, "loss": 1.3616, "step": 1318 }, { "epoch": 1.5005688282138794, "grad_norm": 0.9405243992805481, "learning_rate": 0.0007007963594994313, "loss": 1.6128, "step": 1319 }, { "epoch": 1.5017064846416384, "grad_norm": 0.947182834148407, "learning_rate": 0.0007005688282138795, "loss": 3.0259, "step": 1320 }, { "epoch": 1.5028441410693971, "grad_norm": 0.89002925157547, "learning_rate": 0.0007003412969283277, "loss": 2.0293, "step": 1321 }, { "epoch": 1.5039817974971559, "grad_norm": 0.582564115524292, "learning_rate": 0.0007001137656427759, "loss": 0.5041, "step": 1322 }, { "epoch": 1.5051194539249146, "grad_norm": 0.961336612701416, "learning_rate": 0.0006998862343572241, "loss": 2.0847, "step": 1323 }, { "epoch": 1.5062571103526734, "grad_norm": 1.3698042631149292, "learning_rate": 0.0006996587030716723, "loss": 2.7407, "step": 1324 }, { "epoch": 1.5073947667804322, "grad_norm": 1.4105736017227173, "learning_rate": 0.0006994311717861206, "loss": 2.8289, "step": 1325 }, { "epoch": 1.5085324232081911, "grad_norm": 0.9374104142189026, "learning_rate": 0.0006992036405005688, "loss": 1.7586, "step": 1326 }, { "epoch": 1.50967007963595, "grad_norm": 0.7621793746948242, "learning_rate": 0.000698976109215017, "loss": 
1.3067, "step": 1327 }, { "epoch": 1.5108077360637089, "grad_norm": 1.1107509136199951, "learning_rate": 0.0006987485779294654, "loss": 2.591, "step": 1328 }, { "epoch": 1.5119453924914676, "grad_norm": 0.9593746662139893, "learning_rate": 0.0006985210466439136, "loss": 2.4804, "step": 1329 }, { "epoch": 1.5130830489192264, "grad_norm": 0.7596848011016846, "learning_rate": 0.0006982935153583618, "loss": 0.9857, "step": 1330 }, { "epoch": 1.5142207053469852, "grad_norm": 0.8829966187477112, "learning_rate": 0.0006980659840728101, "loss": 1.1982, "step": 1331 }, { "epoch": 1.515358361774744, "grad_norm": 0.8279595375061035, "learning_rate": 0.0006978384527872582, "loss": 2.2226, "step": 1332 }, { "epoch": 1.5164960182025027, "grad_norm": 1.1019062995910645, "learning_rate": 0.0006976109215017064, "loss": 1.7367, "step": 1333 }, { "epoch": 1.5176336746302617, "grad_norm": 1.7979804277420044, "learning_rate": 0.0006973833902161547, "loss": 1.959, "step": 1334 }, { "epoch": 1.5187713310580204, "grad_norm": 0.8705268502235413, "learning_rate": 0.0006971558589306029, "loss": 2.1202, "step": 1335 }, { "epoch": 1.5199089874857794, "grad_norm": 1.0724592208862305, "learning_rate": 0.0006969283276450513, "loss": 1.7139, "step": 1336 }, { "epoch": 1.5210466439135382, "grad_norm": 0.9775965809822083, "learning_rate": 0.0006967007963594995, "loss": 2.0016, "step": 1337 }, { "epoch": 1.522184300341297, "grad_norm": 0.9448524713516235, "learning_rate": 0.0006964732650739477, "loss": 1.6153, "step": 1338 }, { "epoch": 1.5233219567690557, "grad_norm": 1.5415128469467163, "learning_rate": 0.000696245733788396, "loss": 3.1016, "step": 1339 }, { "epoch": 1.5244596131968144, "grad_norm": 0.9991744160652161, "learning_rate": 0.0006960182025028442, "loss": 1.5704, "step": 1340 }, { "epoch": 1.5255972696245734, "grad_norm": 0.5757598280906677, "learning_rate": 0.0006957906712172923, "loss": 1.0319, "step": 1341 }, { "epoch": 1.5267349260523322, "grad_norm": 0.7392444014549255, 
"learning_rate": 0.0006955631399317406, "loss": 1.6469, "step": 1342 }, { "epoch": 1.5278725824800912, "grad_norm": 0.8269877433776855, "learning_rate": 0.0006953356086461888, "loss": 1.5095, "step": 1343 }, { "epoch": 1.52901023890785, "grad_norm": 0.8942914605140686, "learning_rate": 0.000695108077360637, "loss": 1.7443, "step": 1344 }, { "epoch": 1.5301478953356087, "grad_norm": 0.9986234307289124, "learning_rate": 0.0006948805460750854, "loss": 2.1091, "step": 1345 }, { "epoch": 1.5312855517633674, "grad_norm": 1.5111842155456543, "learning_rate": 0.0006946530147895336, "loss": 2.9399, "step": 1346 }, { "epoch": 1.5324232081911262, "grad_norm": 0.8107113242149353, "learning_rate": 0.0006944254835039818, "loss": 1.5483, "step": 1347 }, { "epoch": 1.533560864618885, "grad_norm": 0.9013845324516296, "learning_rate": 0.0006941979522184301, "loss": 2.0372, "step": 1348 }, { "epoch": 1.534698521046644, "grad_norm": 0.812639057636261, "learning_rate": 0.0006939704209328783, "loss": 1.7029, "step": 1349 }, { "epoch": 1.5358361774744027, "grad_norm": 1.5611118078231812, "learning_rate": 0.0006937428896473265, "loss": 3.1895, "step": 1350 }, { "epoch": 1.5369738339021617, "grad_norm": 0.7234958410263062, "learning_rate": 0.0006935153583617747, "loss": 1.7994, "step": 1351 }, { "epoch": 1.5381114903299204, "grad_norm": 0.5042601227760315, "learning_rate": 0.0006932878270762229, "loss": 0.6559, "step": 1352 }, { "epoch": 1.5392491467576792, "grad_norm": 0.7862391471862793, "learning_rate": 0.0006930602957906712, "loss": 1.2773, "step": 1353 }, { "epoch": 1.540386803185438, "grad_norm": 0.8332534432411194, "learning_rate": 0.0006928327645051195, "loss": 1.7685, "step": 1354 }, { "epoch": 1.5415244596131967, "grad_norm": 0.7258904576301575, "learning_rate": 0.0006926052332195677, "loss": 0.8505, "step": 1355 }, { "epoch": 1.5426621160409555, "grad_norm": 0.6030761003494263, "learning_rate": 0.000692377701934016, "loss": 1.2053, "step": 1356 }, { "epoch": 1.5437997724687145, 
"grad_norm": 1.0888185501098633, "learning_rate": 0.0006921501706484642, "loss": 2.0252, "step": 1357 }, { "epoch": 1.5449374288964732, "grad_norm": 1.2503432035446167, "learning_rate": 0.0006919226393629124, "loss": 3.6095, "step": 1358 }, { "epoch": 1.5460750853242322, "grad_norm": 0.6536726951599121, "learning_rate": 0.0006916951080773607, "loss": 1.171, "step": 1359 }, { "epoch": 1.547212741751991, "grad_norm": 0.8668822646141052, "learning_rate": 0.000691467576791809, "loss": 2.234, "step": 1360 }, { "epoch": 1.5483503981797497, "grad_norm": 1.2817611694335938, "learning_rate": 0.000691240045506257, "loss": 1.7949, "step": 1361 }, { "epoch": 1.5494880546075085, "grad_norm": 0.9210306406021118, "learning_rate": 0.0006910125142207054, "loss": 0.6781, "step": 1362 }, { "epoch": 1.5506257110352673, "grad_norm": 1.0318769216537476, "learning_rate": 0.0006907849829351536, "loss": 2.4866, "step": 1363 }, { "epoch": 1.551763367463026, "grad_norm": 1.0677587985992432, "learning_rate": 0.0006905574516496018, "loss": 2.1298, "step": 1364 }, { "epoch": 1.552901023890785, "grad_norm": 0.6409241557121277, "learning_rate": 0.0006903299203640501, "loss": 1.4516, "step": 1365 }, { "epoch": 1.5540386803185438, "grad_norm": 1.06160569190979, "learning_rate": 0.0006901023890784983, "loss": 2.8236, "step": 1366 }, { "epoch": 1.5551763367463027, "grad_norm": 1.5889126062393188, "learning_rate": 0.0006898748577929465, "loss": 2.9584, "step": 1367 }, { "epoch": 1.5563139931740615, "grad_norm": 0.5789588093757629, "learning_rate": 0.0006896473265073948, "loss": 1.2789, "step": 1368 }, { "epoch": 1.5574516496018203, "grad_norm": 1.5852688550949097, "learning_rate": 0.000689419795221843, "loss": 3.2372, "step": 1369 }, { "epoch": 1.558589306029579, "grad_norm": 0.8930657505989075, "learning_rate": 0.0006891922639362913, "loss": 1.2431, "step": 1370 }, { "epoch": 1.5597269624573378, "grad_norm": 1.1231560707092285, "learning_rate": 0.0006889647326507395, "loss": 2.3293, "step": 1371 }, { 
"epoch": 1.5608646188850968, "grad_norm": 1.0928031206130981, "learning_rate": 0.0006887372013651877, "loss": 2.5423, "step": 1372 }, { "epoch": 1.5620022753128555, "grad_norm": 0.9887200593948364, "learning_rate": 0.0006885096700796359, "loss": 2.1031, "step": 1373 }, { "epoch": 1.5631399317406145, "grad_norm": 1.6233134269714355, "learning_rate": 0.0006882821387940842, "loss": 3.366, "step": 1374 }, { "epoch": 1.5642775881683733, "grad_norm": 0.6267789006233215, "learning_rate": 0.0006880546075085324, "loss": 1.0757, "step": 1375 }, { "epoch": 1.565415244596132, "grad_norm": 0.8272294402122498, "learning_rate": 0.0006878270762229807, "loss": 1.3076, "step": 1376 }, { "epoch": 1.5665529010238908, "grad_norm": 0.7376548051834106, "learning_rate": 0.000687599544937429, "loss": 1.4426, "step": 1377 }, { "epoch": 1.5676905574516495, "grad_norm": 1.136896014213562, "learning_rate": 0.0006873720136518772, "loss": 2.9297, "step": 1378 }, { "epoch": 1.5688282138794083, "grad_norm": 0.8306413888931274, "learning_rate": 0.0006871444823663255, "loss": 1.9698, "step": 1379 }, { "epoch": 1.5699658703071673, "grad_norm": 0.9145554900169373, "learning_rate": 0.0006869169510807736, "loss": 1.2156, "step": 1380 }, { "epoch": 1.571103526734926, "grad_norm": 1.226146936416626, "learning_rate": 0.0006866894197952218, "loss": 2.1672, "step": 1381 }, { "epoch": 1.572241183162685, "grad_norm": 0.6552311182022095, "learning_rate": 0.0006864618885096701, "loss": 0.9541, "step": 1382 }, { "epoch": 1.5733788395904438, "grad_norm": 0.8643866777420044, "learning_rate": 0.0006862343572241183, "loss": 1.3602, "step": 1383 }, { "epoch": 1.5745164960182025, "grad_norm": 1.0479025840759277, "learning_rate": 0.0006860068259385665, "loss": 2.6468, "step": 1384 }, { "epoch": 1.5756541524459613, "grad_norm": 1.173642635345459, "learning_rate": 0.0006857792946530148, "loss": 2.4588, "step": 1385 }, { "epoch": 1.57679180887372, "grad_norm": 0.751677930355072, "learning_rate": 0.0006855517633674631, 
"loss": 0.8411, "step": 1386 }, { "epoch": 1.5779294653014788, "grad_norm": 1.1802860498428345, "learning_rate": 0.0006853242320819113, "loss": 3.0113, "step": 1387 }, { "epoch": 1.5790671217292378, "grad_norm": 0.8296557068824768, "learning_rate": 0.0006850967007963596, "loss": 1.7901, "step": 1388 }, { "epoch": 1.5802047781569966, "grad_norm": 1.1105976104736328, "learning_rate": 0.0006848691695108078, "loss": 1.6191, "step": 1389 }, { "epoch": 1.5813424345847555, "grad_norm": 1.2469074726104736, "learning_rate": 0.0006846416382252559, "loss": 1.4178, "step": 1390 }, { "epoch": 1.5824800910125143, "grad_norm": 1.089394450187683, "learning_rate": 0.0006844141069397042, "loss": 1.9584, "step": 1391 }, { "epoch": 1.583617747440273, "grad_norm": 0.9064239859580994, "learning_rate": 0.0006841865756541524, "loss": 1.5156, "step": 1392 }, { "epoch": 1.5847554038680318, "grad_norm": 1.0828756093978882, "learning_rate": 0.0006839590443686006, "loss": 2.1486, "step": 1393 }, { "epoch": 1.5858930602957906, "grad_norm": 1.159847617149353, "learning_rate": 0.000683731513083049, "loss": 2.8738, "step": 1394 }, { "epoch": 1.5870307167235493, "grad_norm": 0.6444835066795349, "learning_rate": 0.0006835039817974972, "loss": 1.2373, "step": 1395 }, { "epoch": 1.5881683731513083, "grad_norm": 0.6985568404197693, "learning_rate": 0.0006832764505119454, "loss": 1.828, "step": 1396 }, { "epoch": 1.589306029579067, "grad_norm": 0.9549153447151184, "learning_rate": 0.0006830489192263937, "loss": 1.5909, "step": 1397 }, { "epoch": 1.590443686006826, "grad_norm": 0.720780074596405, "learning_rate": 0.0006828213879408419, "loss": 1.4502, "step": 1398 }, { "epoch": 1.5915813424345848, "grad_norm": 1.090728998184204, "learning_rate": 0.0006825938566552902, "loss": 2.7399, "step": 1399 }, { "epoch": 1.5927189988623436, "grad_norm": 1.3747398853302002, "learning_rate": 0.0006823663253697383, "loss": 2.4482, "step": 1400 }, { "epoch": 1.5938566552901023, "grad_norm": 1.0005912780761719, 
"learning_rate": 0.0006821387940841865, "loss": 1.7441, "step": 1401 }, { "epoch": 1.594994311717861, "grad_norm": 1.298248052597046, "learning_rate": 0.0006819112627986348, "loss": 2.1579, "step": 1402 }, { "epoch": 1.5961319681456199, "grad_norm": 1.0068423748016357, "learning_rate": 0.0006816837315130831, "loss": 2.5003, "step": 1403 }, { "epoch": 1.5972696245733788, "grad_norm": 1.1366382837295532, "learning_rate": 0.0006814562002275313, "loss": 2.3228, "step": 1404 }, { "epoch": 1.5984072810011376, "grad_norm": 0.9474295973777771, "learning_rate": 0.0006812286689419796, "loss": 1.1316, "step": 1405 }, { "epoch": 1.5995449374288966, "grad_norm": 1.3815207481384277, "learning_rate": 0.0006810011376564278, "loss": 4.0211, "step": 1406 }, { "epoch": 1.6006825938566553, "grad_norm": 0.703368604183197, "learning_rate": 0.000680773606370876, "loss": 1.5288, "step": 1407 }, { "epoch": 1.601820250284414, "grad_norm": 1.4243957996368408, "learning_rate": 0.0006805460750853243, "loss": 2.4914, "step": 1408 }, { "epoch": 1.6029579067121729, "grad_norm": 1.1586917638778687, "learning_rate": 0.0006803185437997725, "loss": 2.1698, "step": 1409 }, { "epoch": 1.6040955631399316, "grad_norm": 0.6247298717498779, "learning_rate": 0.0006800910125142206, "loss": 1.3813, "step": 1410 }, { "epoch": 1.6052332195676906, "grad_norm": 1.181565284729004, "learning_rate": 0.000679863481228669, "loss": 2.3456, "step": 1411 }, { "epoch": 1.6063708759954494, "grad_norm": 0.8031807541847229, "learning_rate": 0.0006796359499431172, "loss": 1.1918, "step": 1412 }, { "epoch": 1.6075085324232083, "grad_norm": 0.920711874961853, "learning_rate": 0.0006794084186575654, "loss": 2.328, "step": 1413 }, { "epoch": 1.608646188850967, "grad_norm": 1.0009433031082153, "learning_rate": 0.0006791808873720137, "loss": 1.3054, "step": 1414 }, { "epoch": 1.6097838452787259, "grad_norm": 1.1435019969940186, "learning_rate": 0.0006789533560864619, "loss": 1.6864, "step": 1415 }, { "epoch": 1.6109215017064846, 
"grad_norm": 0.9297929406166077, "learning_rate": 0.0006787258248009101, "loss": 2.1921, "step": 1416 }, { "epoch": 1.6120591581342434, "grad_norm": 0.8632849454879761, "learning_rate": 0.0006784982935153584, "loss": 1.8867, "step": 1417 }, { "epoch": 1.6131968145620021, "grad_norm": 0.7660686373710632, "learning_rate": 0.0006782707622298066, "loss": 1.4099, "step": 1418 }, { "epoch": 1.6143344709897611, "grad_norm": 1.1486921310424805, "learning_rate": 0.0006780432309442548, "loss": 2.032, "step": 1419 }, { "epoch": 1.6154721274175199, "grad_norm": 1.0095970630645752, "learning_rate": 0.0006778156996587031, "loss": 2.4319, "step": 1420 }, { "epoch": 1.6166097838452789, "grad_norm": 1.0770419836044312, "learning_rate": 0.0006775881683731513, "loss": 2.338, "step": 1421 }, { "epoch": 1.6177474402730376, "grad_norm": 0.8985020518302917, "learning_rate": 0.0006773606370875996, "loss": 1.3659, "step": 1422 }, { "epoch": 1.6188850967007964, "grad_norm": 1.1922725439071655, "learning_rate": 0.0006771331058020478, "loss": 0.865, "step": 1423 }, { "epoch": 1.6200227531285551, "grad_norm": 0.781085193157196, "learning_rate": 0.000676905574516496, "loss": 1.6722, "step": 1424 }, { "epoch": 1.621160409556314, "grad_norm": 1.0233283042907715, "learning_rate": 0.0006766780432309443, "loss": 2.5444, "step": 1425 }, { "epoch": 1.6222980659840727, "grad_norm": 0.7940084934234619, "learning_rate": 0.0006764505119453925, "loss": 1.4521, "step": 1426 }, { "epoch": 1.6234357224118316, "grad_norm": 0.6902288794517517, "learning_rate": 0.0006762229806598407, "loss": 1.0446, "step": 1427 }, { "epoch": 1.6245733788395904, "grad_norm": 0.9518580436706543, "learning_rate": 0.0006759954493742891, "loss": 2.1233, "step": 1428 }, { "epoch": 1.6257110352673494, "grad_norm": 1.147662878036499, "learning_rate": 0.0006757679180887372, "loss": 3.0784, "step": 1429 }, { "epoch": 1.6268486916951082, "grad_norm": 0.7210685610771179, "learning_rate": 0.0006755403868031854, "loss": 1.0661, "step": 1430 
}, { "epoch": 1.627986348122867, "grad_norm": 1.2803034782409668, "learning_rate": 0.0006753128555176337, "loss": 1.7098, "step": 1431 }, { "epoch": 1.6291240045506257, "grad_norm": 1.3280972242355347, "learning_rate": 0.0006750853242320819, "loss": 1.756, "step": 1432 }, { "epoch": 1.6302616609783844, "grad_norm": 0.7079578638076782, "learning_rate": 0.0006748577929465301, "loss": 1.4689, "step": 1433 }, { "epoch": 1.6313993174061432, "grad_norm": 1.2363884449005127, "learning_rate": 0.0006746302616609784, "loss": 1.9533, "step": 1434 }, { "epoch": 1.6325369738339022, "grad_norm": 1.2230910062789917, "learning_rate": 0.0006744027303754266, "loss": 2.7112, "step": 1435 }, { "epoch": 1.633674630261661, "grad_norm": 1.5893361568450928, "learning_rate": 0.0006741751990898749, "loss": 1.6204, "step": 1436 }, { "epoch": 1.63481228668942, "grad_norm": 0.9474936723709106, "learning_rate": 0.0006739476678043232, "loss": 2.2779, "step": 1437 }, { "epoch": 1.6359499431171787, "grad_norm": 0.6129853129386902, "learning_rate": 0.0006737201365187714, "loss": 0.7177, "step": 1438 }, { "epoch": 1.6370875995449374, "grad_norm": 0.7504928708076477, "learning_rate": 0.0006734926052332196, "loss": 2.2781, "step": 1439 }, { "epoch": 1.6382252559726962, "grad_norm": 0.9989508986473083, "learning_rate": 0.0006732650739476678, "loss": 1.9803, "step": 1440 }, { "epoch": 1.639362912400455, "grad_norm": 1.202582836151123, "learning_rate": 0.000673037542662116, "loss": 1.8057, "step": 1441 }, { "epoch": 1.640500568828214, "grad_norm": 0.9157416224479675, "learning_rate": 0.0006728100113765643, "loss": 2.2069, "step": 1442 }, { "epoch": 1.6416382252559727, "grad_norm": 0.6836434602737427, "learning_rate": 0.0006725824800910125, "loss": 1.0477, "step": 1443 }, { "epoch": 1.6427758816837317, "grad_norm": 0.8547561168670654, "learning_rate": 0.0006723549488054607, "loss": 1.8937, "step": 1444 }, { "epoch": 1.6439135381114904, "grad_norm": 0.8537775874137878, "learning_rate": 
0.0006721274175199091, "loss": 1.5731, "step": 1445 }, { "epoch": 1.6450511945392492, "grad_norm": 1.0358448028564453, "learning_rate": 0.0006718998862343573, "loss": 2.0885, "step": 1446 }, { "epoch": 1.646188850967008, "grad_norm": 0.9270764589309692, "learning_rate": 0.0006716723549488055, "loss": 1.8622, "step": 1447 }, { "epoch": 1.6473265073947667, "grad_norm": 0.6689456701278687, "learning_rate": 0.0006714448236632537, "loss": 0.7905, "step": 1448 }, { "epoch": 1.6484641638225255, "grad_norm": 0.6682091951370239, "learning_rate": 0.0006712172923777019, "loss": 1.0117, "step": 1449 }, { "epoch": 1.6496018202502845, "grad_norm": 0.9012134671211243, "learning_rate": 0.0006709897610921501, "loss": 1.6618, "step": 1450 }, { "epoch": 1.6507394766780432, "grad_norm": 0.7726583480834961, "learning_rate": 0.0006707622298065984, "loss": 1.4585, "step": 1451 }, { "epoch": 1.6518771331058022, "grad_norm": 1.0777757167816162, "learning_rate": 0.0006705346985210466, "loss": 2.1442, "step": 1452 }, { "epoch": 1.653014789533561, "grad_norm": 1.3284507989883423, "learning_rate": 0.0006703071672354949, "loss": 1.9266, "step": 1453 }, { "epoch": 1.6541524459613197, "grad_norm": 1.139455795288086, "learning_rate": 0.0006700796359499432, "loss": 2.3044, "step": 1454 }, { "epoch": 1.6552901023890785, "grad_norm": 0.736585795879364, "learning_rate": 0.0006698521046643914, "loss": 1.3312, "step": 1455 }, { "epoch": 1.6564277588168372, "grad_norm": 1.6546106338500977, "learning_rate": 0.0006696245733788396, "loss": 1.9867, "step": 1456 }, { "epoch": 1.657565415244596, "grad_norm": 0.872257649898529, "learning_rate": 0.0006693970420932879, "loss": 2.0105, "step": 1457 }, { "epoch": 1.658703071672355, "grad_norm": 0.9059979915618896, "learning_rate": 0.000669169510807736, "loss": 2.0219, "step": 1458 }, { "epoch": 1.6598407281001137, "grad_norm": 0.6183615326881409, "learning_rate": 0.0006689419795221842, "loss": 1.2641, "step": 1459 }, { "epoch": 1.6609783845278727, "grad_norm": 
0.7358295917510986, "learning_rate": 0.0006687144482366325, "loss": 1.3596, "step": 1460 }, { "epoch": 1.6621160409556315, "grad_norm": 0.8297770023345947, "learning_rate": 0.0006684869169510807, "loss": 1.5614, "step": 1461 }, { "epoch": 1.6632536973833902, "grad_norm": 0.6983165144920349, "learning_rate": 0.0006682593856655291, "loss": 1.1403, "step": 1462 }, { "epoch": 1.664391353811149, "grad_norm": 1.0305124521255493, "learning_rate": 0.0006680318543799773, "loss": 2.0436, "step": 1463 }, { "epoch": 1.6655290102389078, "grad_norm": 1.53620183467865, "learning_rate": 0.0006678043230944255, "loss": 2.6743, "step": 1464 }, { "epoch": 1.6666666666666665, "grad_norm": 0.9701448678970337, "learning_rate": 0.0006675767918088738, "loss": 2.7797, "step": 1465 }, { "epoch": 1.6678043230944255, "grad_norm": 1.1551249027252197, "learning_rate": 0.000667349260523322, "loss": 1.9898, "step": 1466 }, { "epoch": 1.6689419795221843, "grad_norm": 1.8078079223632812, "learning_rate": 0.0006671217292377702, "loss": 3.4769, "step": 1467 }, { "epoch": 1.6700796359499432, "grad_norm": 0.9920907020568848, "learning_rate": 0.0006668941979522184, "loss": 1.5748, "step": 1468 }, { "epoch": 1.671217292377702, "grad_norm": 1.0543971061706543, "learning_rate": 0.0006666666666666666, "loss": 2.2308, "step": 1469 }, { "epoch": 1.6723549488054608, "grad_norm": 1.3774967193603516, "learning_rate": 0.0006664391353811149, "loss": 2.9355, "step": 1470 }, { "epoch": 1.6734926052332195, "grad_norm": 0.819430947303772, "learning_rate": 0.0006662116040955632, "loss": 1.8651, "step": 1471 }, { "epoch": 1.6746302616609783, "grad_norm": 0.9009912014007568, "learning_rate": 0.0006659840728100114, "loss": 1.771, "step": 1472 }, { "epoch": 1.675767918088737, "grad_norm": 0.9403852224349976, "learning_rate": 0.0006657565415244596, "loss": 1.9841, "step": 1473 }, { "epoch": 1.676905574516496, "grad_norm": 1.555391550064087, "learning_rate": 0.0006655290102389079, "loss": 2.3617, "step": 1474 }, { "epoch": 
1.6780432309442548, "grad_norm": 1.146911382675171, "learning_rate": 0.0006653014789533561, "loss": 2.3203, "step": 1475 }, { "epoch": 1.6791808873720138, "grad_norm": 0.853667140007019, "learning_rate": 0.0006650739476678043, "loss": 1.2485, "step": 1476 }, { "epoch": 1.6803185437997725, "grad_norm": 1.278773307800293, "learning_rate": 0.0006648464163822526, "loss": 2.0797, "step": 1477 }, { "epoch": 1.6814562002275313, "grad_norm": 1.3791263103485107, "learning_rate": 0.0006646188850967008, "loss": 2.859, "step": 1478 }, { "epoch": 1.68259385665529, "grad_norm": 0.9324190616607666, "learning_rate": 0.000664391353811149, "loss": 2.3866, "step": 1479 }, { "epoch": 1.6837315130830488, "grad_norm": 1.0593537092208862, "learning_rate": 0.0006641638225255973, "loss": 1.6384, "step": 1480 }, { "epoch": 1.6848691695108078, "grad_norm": 1.1699516773223877, "learning_rate": 0.0006639362912400455, "loss": 2.8342, "step": 1481 }, { "epoch": 1.6860068259385665, "grad_norm": 0.7266961336135864, "learning_rate": 0.0006637087599544938, "loss": 1.4487, "step": 1482 }, { "epoch": 1.6871444823663255, "grad_norm": 0.8684595227241516, "learning_rate": 0.000663481228668942, "loss": 1.6997, "step": 1483 }, { "epoch": 1.6882821387940843, "grad_norm": 0.958516538143158, "learning_rate": 0.0006632536973833902, "loss": 1.7327, "step": 1484 }, { "epoch": 1.689419795221843, "grad_norm": 1.0229347944259644, "learning_rate": 0.0006630261660978385, "loss": 3.2928, "step": 1485 }, { "epoch": 1.6905574516496018, "grad_norm": 1.314428687095642, "learning_rate": 0.0006627986348122868, "loss": 1.7722, "step": 1486 }, { "epoch": 1.6916951080773606, "grad_norm": 0.6298933625221252, "learning_rate": 0.0006625711035267349, "loss": 1.2597, "step": 1487 }, { "epoch": 1.6928327645051193, "grad_norm": 0.8204244375228882, "learning_rate": 0.0006623435722411832, "loss": 1.4012, "step": 1488 }, { "epoch": 1.6939704209328783, "grad_norm": 0.6752439737319946, "learning_rate": 0.0006621160409556314, "loss": 
1.3025, "step": 1489 }, { "epoch": 1.695108077360637, "grad_norm": 0.7786659002304077, "learning_rate": 0.0006618885096700796, "loss": 1.601, "step": 1490 }, { "epoch": 1.696245733788396, "grad_norm": 0.9201928377151489, "learning_rate": 0.0006616609783845279, "loss": 2.8528, "step": 1491 }, { "epoch": 1.6973833902161548, "grad_norm": 0.9727663993835449, "learning_rate": 0.0006614334470989761, "loss": 2.2852, "step": 1492 }, { "epoch": 1.6985210466439136, "grad_norm": 1.015677571296692, "learning_rate": 0.0006612059158134243, "loss": 2.5611, "step": 1493 }, { "epoch": 1.6996587030716723, "grad_norm": 0.6832883954048157, "learning_rate": 0.0006609783845278727, "loss": 0.8426, "step": 1494 }, { "epoch": 1.700796359499431, "grad_norm": 1.015763759613037, "learning_rate": 0.0006607508532423209, "loss": 2.1979, "step": 1495 }, { "epoch": 1.7019340159271898, "grad_norm": 1.0312163829803467, "learning_rate": 0.0006605233219567691, "loss": 1.6475, "step": 1496 }, { "epoch": 1.7030716723549488, "grad_norm": 1.1301902532577515, "learning_rate": 0.0006602957906712173, "loss": 1.6602, "step": 1497 }, { "epoch": 1.7042093287827076, "grad_norm": 1.635214924812317, "learning_rate": 0.0006600682593856655, "loss": 2.0394, "step": 1498 }, { "epoch": 1.7053469852104666, "grad_norm": 1.2907978296279907, "learning_rate": 0.0006598407281001137, "loss": 2.3087, "step": 1499 }, { "epoch": 1.7064846416382253, "grad_norm": 0.8886736035346985, "learning_rate": 0.000659613196814562, "loss": 2.1899, "step": 1500 }, { "epoch": 1.707622298065984, "grad_norm": 0.7392778992652893, "learning_rate": 0.0006593856655290102, "loss": 1.1168, "step": 1501 }, { "epoch": 1.7087599544937428, "grad_norm": 0.9279195070266724, "learning_rate": 0.0006591581342434585, "loss": 3.2731, "step": 1502 }, { "epoch": 1.7098976109215016, "grad_norm": 0.7852345705032349, "learning_rate": 0.0006589306029579068, "loss": 1.6468, "step": 1503 }, { "epoch": 1.7110352673492604, "grad_norm": 0.8137550354003906, "learning_rate": 
0.000658703071672355, "loss": 1.7085, "step": 1504 }, { "epoch": 1.7121729237770194, "grad_norm": 1.1284794807434082, "learning_rate": 0.0006584755403868033, "loss": 1.3615, "step": 1505 }, { "epoch": 1.713310580204778, "grad_norm": 0.9071482419967651, "learning_rate": 0.0006582480091012515, "loss": 1.9902, "step": 1506 }, { "epoch": 1.714448236632537, "grad_norm": 0.6611661314964294, "learning_rate": 0.0006580204778156996, "loss": 1.4362, "step": 1507 }, { "epoch": 1.7155858930602959, "grad_norm": 0.6886146664619446, "learning_rate": 0.0006577929465301479, "loss": 1.5251, "step": 1508 }, { "epoch": 1.7167235494880546, "grad_norm": 0.859805166721344, "learning_rate": 0.0006575654152445961, "loss": 1.6648, "step": 1509 }, { "epoch": 1.7178612059158134, "grad_norm": 0.8424084782600403, "learning_rate": 0.0006573378839590443, "loss": 1.7239, "step": 1510 }, { "epoch": 1.7189988623435721, "grad_norm": 0.5435293316841125, "learning_rate": 0.0006571103526734927, "loss": 1.1297, "step": 1511 }, { "epoch": 1.7201365187713311, "grad_norm": 1.4670809507369995, "learning_rate": 0.0006568828213879409, "loss": 3.3636, "step": 1512 }, { "epoch": 1.7212741751990899, "grad_norm": 0.8184078931808472, "learning_rate": 0.0006566552901023891, "loss": 1.8129, "step": 1513 }, { "epoch": 1.7224118316268489, "grad_norm": 0.8720480799674988, "learning_rate": 0.0006564277588168374, "loss": 1.7082, "step": 1514 }, { "epoch": 1.7235494880546076, "grad_norm": 1.2240320444107056, "learning_rate": 0.0006562002275312856, "loss": 1.6893, "step": 1515 }, { "epoch": 1.7246871444823664, "grad_norm": 1.4103710651397705, "learning_rate": 0.0006559726962457337, "loss": 1.4781, "step": 1516 }, { "epoch": 1.7258248009101251, "grad_norm": 0.7082839608192444, "learning_rate": 0.000655745164960182, "loss": 1.6487, "step": 1517 }, { "epoch": 1.726962457337884, "grad_norm": 1.3092166185379028, "learning_rate": 0.0006555176336746302, "loss": 3.0383, "step": 1518 }, { "epoch": 1.7281001137656427, "grad_norm": 
0.6530581116676331, "learning_rate": 0.0006552901023890784, "loss": 1.1405, "step": 1519 }, { "epoch": 1.7292377701934016, "grad_norm": 0.5259117484092712, "learning_rate": 0.0006550625711035268, "loss": 0.9486, "step": 1520 }, { "epoch": 1.7303754266211604, "grad_norm": 0.8423599004745483, "learning_rate": 0.000654835039817975, "loss": 1.7263, "step": 1521 }, { "epoch": 1.7315130830489194, "grad_norm": 0.9277529716491699, "learning_rate": 0.0006546075085324233, "loss": 1.3625, "step": 1522 }, { "epoch": 1.7326507394766781, "grad_norm": 0.8496603965759277, "learning_rate": 0.0006543799772468715, "loss": 2.406, "step": 1523 }, { "epoch": 1.733788395904437, "grad_norm": 1.1598669290542603, "learning_rate": 0.0006541524459613197, "loss": 2.4768, "step": 1524 }, { "epoch": 1.7349260523321957, "grad_norm": 0.9473751783370972, "learning_rate": 0.000653924914675768, "loss": 2.243, "step": 1525 }, { "epoch": 1.7360637087599544, "grad_norm": 0.8064972758293152, "learning_rate": 0.0006536973833902161, "loss": 1.6925, "step": 1526 }, { "epoch": 1.7372013651877132, "grad_norm": 1.018776297569275, "learning_rate": 0.0006534698521046643, "loss": 1.6876, "step": 1527 }, { "epoch": 1.7383390216154722, "grad_norm": 1.1636130809783936, "learning_rate": 0.0006532423208191127, "loss": 2.0573, "step": 1528 }, { "epoch": 1.739476678043231, "grad_norm": 0.890720784664154, "learning_rate": 0.0006530147895335609, "loss": 0.8197, "step": 1529 }, { "epoch": 1.74061433447099, "grad_norm": 0.8947886228561401, "learning_rate": 0.0006527872582480091, "loss": 1.5267, "step": 1530 }, { "epoch": 1.7417519908987487, "grad_norm": 1.0752261877059937, "learning_rate": 0.0006525597269624574, "loss": 2.324, "step": 1531 }, { "epoch": 1.7428896473265074, "grad_norm": 1.107813835144043, "learning_rate": 0.0006523321956769056, "loss": 2.1986, "step": 1532 }, { "epoch": 1.7440273037542662, "grad_norm": 0.8949710726737976, "learning_rate": 0.0006521046643913538, "loss": 1.3886, "step": 1533 }, { "epoch": 
1.745164960182025, "grad_norm": 1.6787731647491455, "learning_rate": 0.0006518771331058021, "loss": 3.7587, "step": 1534 }, { "epoch": 1.7463026166097837, "grad_norm": 1.2499511241912842, "learning_rate": 0.0006516496018202503, "loss": 2.327, "step": 1535 }, { "epoch": 1.7474402730375427, "grad_norm": 0.7595018148422241, "learning_rate": 0.0006514220705346984, "loss": 1.3373, "step": 1536 }, { "epoch": 1.7485779294653014, "grad_norm": 1.2214561700820923, "learning_rate": 0.0006511945392491468, "loss": 2.4152, "step": 1537 }, { "epoch": 1.7497155858930604, "grad_norm": 0.727344274520874, "learning_rate": 0.000650967007963595, "loss": 1.4908, "step": 1538 }, { "epoch": 1.7508532423208192, "grad_norm": 0.9511622786521912, "learning_rate": 0.0006507394766780432, "loss": 1.6895, "step": 1539 }, { "epoch": 1.751990898748578, "grad_norm": 0.8098679184913635, "learning_rate": 0.0006505119453924915, "loss": 1.125, "step": 1540 }, { "epoch": 1.7531285551763367, "grad_norm": 0.9225071668624878, "learning_rate": 0.0006502844141069397, "loss": 1.8387, "step": 1541 }, { "epoch": 1.7542662116040955, "grad_norm": 0.999563992023468, "learning_rate": 0.000650056882821388, "loss": 2.7922, "step": 1542 }, { "epoch": 1.7554038680318542, "grad_norm": 0.722228467464447, "learning_rate": 0.0006498293515358362, "loss": 1.7187, "step": 1543 }, { "epoch": 1.7565415244596132, "grad_norm": 0.6238258481025696, "learning_rate": 0.0006496018202502844, "loss": 0.8305, "step": 1544 }, { "epoch": 1.757679180887372, "grad_norm": 1.207566499710083, "learning_rate": 0.0006493742889647328, "loss": 2.9552, "step": 1545 }, { "epoch": 1.758816837315131, "grad_norm": 1.0733917951583862, "learning_rate": 0.0006491467576791809, "loss": 1.9941, "step": 1546 }, { "epoch": 1.7599544937428897, "grad_norm": 0.8945872783660889, "learning_rate": 0.0006489192263936291, "loss": 1.6187, "step": 1547 }, { "epoch": 1.7610921501706485, "grad_norm": 1.0715452432632446, "learning_rate": 0.0006486916951080774, "loss": 
3.1526, "step": 1548 }, { "epoch": 1.7622298065984072, "grad_norm": 0.7034804821014404, "learning_rate": 0.0006484641638225256, "loss": 1.6216, "step": 1549 }, { "epoch": 1.763367463026166, "grad_norm": 1.182071328163147, "learning_rate": 0.0006482366325369738, "loss": 3.9126, "step": 1550 }, { "epoch": 1.764505119453925, "grad_norm": 1.0914125442504883, "learning_rate": 0.0006480091012514221, "loss": 1.7731, "step": 1551 }, { "epoch": 1.7656427758816837, "grad_norm": 0.7337900996208191, "learning_rate": 0.0006477815699658703, "loss": 0.7843, "step": 1552 }, { "epoch": 1.7667804323094427, "grad_norm": 0.9463424682617188, "learning_rate": 0.0006475540386803186, "loss": 2.3052, "step": 1553 }, { "epoch": 1.7679180887372015, "grad_norm": 0.798876166343689, "learning_rate": 0.0006473265073947669, "loss": 1.7428, "step": 1554 }, { "epoch": 1.7690557451649602, "grad_norm": 1.0317094326019287, "learning_rate": 0.000647098976109215, "loss": 1.5478, "step": 1555 }, { "epoch": 1.770193401592719, "grad_norm": 1.0436172485351562, "learning_rate": 0.0006468714448236632, "loss": 2.3285, "step": 1556 }, { "epoch": 1.7713310580204777, "grad_norm": 1.389779806137085, "learning_rate": 0.0006466439135381115, "loss": 3.1217, "step": 1557 }, { "epoch": 1.7724687144482365, "grad_norm": 0.9066593050956726, "learning_rate": 0.0006464163822525597, "loss": 1.5202, "step": 1558 }, { "epoch": 1.7736063708759955, "grad_norm": 0.7045938968658447, "learning_rate": 0.0006461888509670079, "loss": 2.0027, "step": 1559 }, { "epoch": 1.7747440273037542, "grad_norm": 1.222909688949585, "learning_rate": 0.0006459613196814562, "loss": 2.5047, "step": 1560 }, { "epoch": 1.7758816837315132, "grad_norm": 0.9253849983215332, "learning_rate": 0.0006457337883959044, "loss": 1.8513, "step": 1561 }, { "epoch": 1.777019340159272, "grad_norm": 1.5743485689163208, "learning_rate": 0.0006455062571103528, "loss": 3.3089, "step": 1562 }, { "epoch": 1.7781569965870307, "grad_norm": 1.3808788061141968, "learning_rate": 
0.000645278725824801, "loss": 2.1207, "step": 1563 }, { "epoch": 1.7792946530147895, "grad_norm": 0.5420240163803101, "learning_rate": 0.0006450511945392492, "loss": 0.9326, "step": 1564 }, { "epoch": 1.7804323094425483, "grad_norm": 1.006123661994934, "learning_rate": 0.0006448236632536974, "loss": 1.3682, "step": 1565 }, { "epoch": 1.781569965870307, "grad_norm": 1.0188605785369873, "learning_rate": 0.0006445961319681456, "loss": 3.1302, "step": 1566 }, { "epoch": 1.782707622298066, "grad_norm": 1.3520559072494507, "learning_rate": 0.0006443686006825938, "loss": 1.9661, "step": 1567 }, { "epoch": 1.7838452787258248, "grad_norm": 1.2583212852478027, "learning_rate": 0.0006441410693970421, "loss": 1.7267, "step": 1568 }, { "epoch": 1.7849829351535837, "grad_norm": 0.6001319885253906, "learning_rate": 0.0006439135381114903, "loss": 1.2674, "step": 1569 }, { "epoch": 1.7861205915813425, "grad_norm": 0.5909472703933716, "learning_rate": 0.0006436860068259386, "loss": 1.1398, "step": 1570 }, { "epoch": 1.7872582480091013, "grad_norm": 1.0126385688781738, "learning_rate": 0.0006434584755403869, "loss": 1.4477, "step": 1571 }, { "epoch": 1.78839590443686, "grad_norm": 0.760152280330658, "learning_rate": 0.0006432309442548351, "loss": 1.0436, "step": 1572 }, { "epoch": 1.7895335608646188, "grad_norm": 1.2938870191574097, "learning_rate": 0.0006430034129692833, "loss": 2.4826, "step": 1573 }, { "epoch": 1.7906712172923775, "grad_norm": 0.8041395545005798, "learning_rate": 0.0006427758816837316, "loss": 1.7244, "step": 1574 }, { "epoch": 1.7918088737201365, "grad_norm": 0.9691146016120911, "learning_rate": 0.0006425483503981797, "loss": 1.7926, "step": 1575 }, { "epoch": 1.7929465301478953, "grad_norm": 1.1277107000350952, "learning_rate": 0.0006423208191126279, "loss": 2.1719, "step": 1576 }, { "epoch": 1.7940841865756543, "grad_norm": 0.8621721267700195, "learning_rate": 0.0006420932878270762, "loss": 2.341, "step": 1577 }, { "epoch": 1.795221843003413, "grad_norm": 
1.3702713251113892, "learning_rate": 0.0006418657565415244, "loss": 2.7296, "step": 1578 }, { "epoch": 1.7963594994311718, "grad_norm": 0.6883115768432617, "learning_rate": 0.0006416382252559727, "loss": 1.762, "step": 1579 }, { "epoch": 1.7974971558589306, "grad_norm": 1.0266193151474, "learning_rate": 0.000641410693970421, "loss": 2.1009, "step": 1580 }, { "epoch": 1.7986348122866893, "grad_norm": 0.908869743347168, "learning_rate": 0.0006411831626848692, "loss": 1.7309, "step": 1581 }, { "epoch": 1.799772468714448, "grad_norm": 0.8608745336532593, "learning_rate": 0.0006409556313993174, "loss": 1.7538, "step": 1582 }, { "epoch": 1.800910125142207, "grad_norm": 1.5852781534194946, "learning_rate": 0.0006407281001137657, "loss": 2.7645, "step": 1583 }, { "epoch": 1.802047781569966, "grad_norm": 0.8057767152786255, "learning_rate": 0.0006405005688282139, "loss": 1.7418, "step": 1584 }, { "epoch": 1.8031854379977248, "grad_norm": 1.129563570022583, "learning_rate": 0.0006402730375426621, "loss": 2.454, "step": 1585 }, { "epoch": 1.8043230944254836, "grad_norm": 0.9921088218688965, "learning_rate": 0.0006400455062571103, "loss": 1.8042, "step": 1586 }, { "epoch": 1.8054607508532423, "grad_norm": 1.2487443685531616, "learning_rate": 0.0006398179749715586, "loss": 2.1645, "step": 1587 }, { "epoch": 1.806598407281001, "grad_norm": 1.0499515533447266, "learning_rate": 0.0006395904436860069, "loss": 2.1733, "step": 1588 }, { "epoch": 1.8077360637087598, "grad_norm": 1.092044472694397, "learning_rate": 0.0006393629124004551, "loss": 2.5157, "step": 1589 }, { "epoch": 1.8088737201365188, "grad_norm": 0.9797663688659668, "learning_rate": 0.0006391353811149033, "loss": 1.3745, "step": 1590 }, { "epoch": 1.8100113765642776, "grad_norm": 1.0946043729782104, "learning_rate": 0.0006389078498293516, "loss": 2.1346, "step": 1591 }, { "epoch": 1.8111490329920366, "grad_norm": 1.3804898262023926, "learning_rate": 0.0006386803185437998, "loss": 2.088, "step": 1592 }, { "epoch": 
1.8122866894197953, "grad_norm": 0.9063937067985535, "learning_rate": 0.000638452787258248, "loss": 2.0188, "step": 1593 }, { "epoch": 1.813424345847554, "grad_norm": 0.7004676461219788, "learning_rate": 0.0006382252559726962, "loss": 1.5326, "step": 1594 }, { "epoch": 1.8145620022753128, "grad_norm": 0.7816855907440186, "learning_rate": 0.0006379977246871445, "loss": 1.5907, "step": 1595 }, { "epoch": 1.8156996587030716, "grad_norm": 1.2419103384017944, "learning_rate": 0.0006377701934015927, "loss": 2.683, "step": 1596 }, { "epoch": 1.8168373151308304, "grad_norm": 1.7206608057022095, "learning_rate": 0.000637542662116041, "loss": 3.0181, "step": 1597 }, { "epoch": 1.8179749715585893, "grad_norm": 0.7101848125457764, "learning_rate": 0.0006373151308304892, "loss": 2.2839, "step": 1598 }, { "epoch": 1.819112627986348, "grad_norm": 0.9703013896942139, "learning_rate": 0.0006370875995449374, "loss": 1.6638, "step": 1599 }, { "epoch": 1.820250284414107, "grad_norm": 0.7560202479362488, "learning_rate": 0.0006368600682593857, "loss": 1.285, "step": 1600 }, { "epoch": 1.8213879408418658, "grad_norm": 0.8459346294403076, "learning_rate": 0.0006366325369738339, "loss": 2.0187, "step": 1601 }, { "epoch": 1.8225255972696246, "grad_norm": 1.0976618528366089, "learning_rate": 0.0006364050056882821, "loss": 2.2865, "step": 1602 }, { "epoch": 1.8236632536973834, "grad_norm": 0.7999181747436523, "learning_rate": 0.0006361774744027305, "loss": 2.0923, "step": 1603 }, { "epoch": 1.8248009101251421, "grad_norm": 0.6172679662704468, "learning_rate": 0.0006359499431171786, "loss": 0.9479, "step": 1604 }, { "epoch": 1.8259385665529009, "grad_norm": 0.8358675241470337, "learning_rate": 0.0006357224118316269, "loss": 2.4008, "step": 1605 }, { "epoch": 1.8270762229806599, "grad_norm": 0.7997340559959412, "learning_rate": 0.0006354948805460751, "loss": 2.0383, "step": 1606 }, { "epoch": 1.8282138794084186, "grad_norm": 1.1405185461044312, "learning_rate": 0.0006352673492605233, "loss": 
1.925, "step": 1607 }, { "epoch": 1.8293515358361776, "grad_norm": 0.7813712358474731, "learning_rate": 0.0006350398179749716, "loss": 1.7278, "step": 1608 }, { "epoch": 1.8304891922639364, "grad_norm": 1.3376038074493408, "learning_rate": 0.0006348122866894198, "loss": 2.3325, "step": 1609 }, { "epoch": 1.8316268486916951, "grad_norm": 1.1971509456634521, "learning_rate": 0.000634584755403868, "loss": 2.1967, "step": 1610 }, { "epoch": 1.8327645051194539, "grad_norm": 1.8342376947402954, "learning_rate": 0.0006343572241183164, "loss": 3.0377, "step": 1611 }, { "epoch": 1.8339021615472126, "grad_norm": 0.983214795589447, "learning_rate": 0.0006341296928327646, "loss": 1.5361, "step": 1612 }, { "epoch": 1.8350398179749714, "grad_norm": 0.7843421697616577, "learning_rate": 0.0006339021615472128, "loss": 1.4898, "step": 1613 }, { "epoch": 1.8361774744027304, "grad_norm": 0.8079589605331421, "learning_rate": 0.000633674630261661, "loss": 1.42, "step": 1614 }, { "epoch": 1.8373151308304891, "grad_norm": 0.9361008405685425, "learning_rate": 0.0006334470989761092, "loss": 2.3352, "step": 1615 }, { "epoch": 1.8384527872582481, "grad_norm": 0.8050186038017273, "learning_rate": 0.0006332195676905574, "loss": 1.2832, "step": 1616 }, { "epoch": 1.8395904436860069, "grad_norm": 0.6007594466209412, "learning_rate": 0.0006329920364050057, "loss": 0.7322, "step": 1617 }, { "epoch": 1.8407281001137656, "grad_norm": 1.09076988697052, "learning_rate": 0.0006327645051194539, "loss": 3.1086, "step": 1618 }, { "epoch": 1.8418657565415244, "grad_norm": 1.1623958349227905, "learning_rate": 0.0006325369738339021, "loss": 2.5158, "step": 1619 }, { "epoch": 1.8430034129692832, "grad_norm": 0.967048704624176, "learning_rate": 0.0006323094425483505, "loss": 1.5403, "step": 1620 }, { "epoch": 1.8441410693970421, "grad_norm": 0.7205291390419006, "learning_rate": 0.0006320819112627987, "loss": 0.9221, "step": 1621 }, { "epoch": 1.845278725824801, "grad_norm": 0.8706921339035034, "learning_rate": 
0.0006318543799772469, "loss": 2.1171, "step": 1622 }, { "epoch": 1.8464163822525599, "grad_norm": 0.8545218110084534, "learning_rate": 0.0006316268486916951, "loss": 1.3462, "step": 1623 }, { "epoch": 1.8475540386803186, "grad_norm": 1.0250085592269897, "learning_rate": 0.0006313993174061433, "loss": 1.8658, "step": 1624 }, { "epoch": 1.8486916951080774, "grad_norm": 0.8345952033996582, "learning_rate": 0.0006311717861205916, "loss": 2.4092, "step": 1625 }, { "epoch": 1.8498293515358362, "grad_norm": 0.9249033331871033, "learning_rate": 0.0006309442548350398, "loss": 1.7311, "step": 1626 }, { "epoch": 1.850967007963595, "grad_norm": 1.0323681831359863, "learning_rate": 0.000630716723549488, "loss": 2.3136, "step": 1627 }, { "epoch": 1.8521046643913537, "grad_norm": 0.6797596216201782, "learning_rate": 0.0006304891922639364, "loss": 1.4115, "step": 1628 }, { "epoch": 1.8532423208191127, "grad_norm": 0.6315971612930298, "learning_rate": 0.0006302616609783846, "loss": 1.6303, "step": 1629 }, { "epoch": 1.8543799772468714, "grad_norm": 1.357260823249817, "learning_rate": 0.0006300341296928328, "loss": 3.4739, "step": 1630 }, { "epoch": 1.8555176336746304, "grad_norm": 0.8246727585792542, "learning_rate": 0.0006298065984072811, "loss": 1.4725, "step": 1631 }, { "epoch": 1.8566552901023892, "grad_norm": 1.0035685300827026, "learning_rate": 0.0006295790671217293, "loss": 1.9371, "step": 1632 }, { "epoch": 1.857792946530148, "grad_norm": 0.8692108392715454, "learning_rate": 0.0006293515358361774, "loss": 1.5302, "step": 1633 }, { "epoch": 1.8589306029579067, "grad_norm": 0.7576583623886108, "learning_rate": 0.0006291240045506257, "loss": 1.3123, "step": 1634 }, { "epoch": 1.8600682593856654, "grad_norm": 1.0412949323654175, "learning_rate": 0.0006288964732650739, "loss": 1.8658, "step": 1635 }, { "epoch": 1.8612059158134242, "grad_norm": 0.6774725317955017, "learning_rate": 0.0006286689419795221, "loss": 1.499, "step": 1636 }, { "epoch": 1.8623435722411832, "grad_norm": 
0.7042348980903625, "learning_rate": 0.0006284414106939705, "loss": 1.423, "step": 1637 }, { "epoch": 1.863481228668942, "grad_norm": 0.8518065810203552, "learning_rate": 0.0006282138794084187, "loss": 1.1174, "step": 1638 }, { "epoch": 1.864618885096701, "grad_norm": 1.1529499292373657, "learning_rate": 0.0006279863481228669, "loss": 2.3384, "step": 1639 }, { "epoch": 1.8657565415244597, "grad_norm": 0.8600414991378784, "learning_rate": 0.0006277588168373152, "loss": 1.7475, "step": 1640 }, { "epoch": 1.8668941979522184, "grad_norm": 1.1277893781661987, "learning_rate": 0.0006275312855517634, "loss": 3.2803, "step": 1641 }, { "epoch": 1.8680318543799772, "grad_norm": 1.1876293420791626, "learning_rate": 0.0006273037542662116, "loss": 2.062, "step": 1642 }, { "epoch": 1.869169510807736, "grad_norm": 0.8083938360214233, "learning_rate": 0.0006270762229806598, "loss": 1.4945, "step": 1643 }, { "epoch": 1.8703071672354947, "grad_norm": 1.136566400527954, "learning_rate": 0.000626848691695108, "loss": 2.2583, "step": 1644 }, { "epoch": 1.8714448236632537, "grad_norm": 0.8216238617897034, "learning_rate": 0.0006266211604095564, "loss": 1.1567, "step": 1645 }, { "epoch": 1.8725824800910125, "grad_norm": 0.8726761341094971, "learning_rate": 0.0006263936291240046, "loss": 1.4604, "step": 1646 }, { "epoch": 1.8737201365187715, "grad_norm": 0.7124577164649963, "learning_rate": 0.0006261660978384528, "loss": 1.8203, "step": 1647 }, { "epoch": 1.8748577929465302, "grad_norm": 0.7909445762634277, "learning_rate": 0.0006259385665529011, "loss": 1.5815, "step": 1648 }, { "epoch": 1.875995449374289, "grad_norm": 1.0055829286575317, "learning_rate": 0.0006257110352673493, "loss": 1.5233, "step": 1649 }, { "epoch": 1.8771331058020477, "grad_norm": 0.6920607089996338, "learning_rate": 0.0006254835039817975, "loss": 0.8769, "step": 1650 }, { "epoch": 1.8782707622298065, "grad_norm": 0.8368102312088013, "learning_rate": 0.0006252559726962458, "loss": 1.9653, "step": 1651 }, { "epoch": 
1.8794084186575652, "grad_norm": 1.1881768703460693, "learning_rate": 0.000625028441410694, "loss": 2.2672, "step": 1652 }, { "epoch": 1.8805460750853242, "grad_norm": 0.7409395575523376, "learning_rate": 0.0006248009101251421, "loss": 1.6343, "step": 1653 }, { "epoch": 1.8816837315130832, "grad_norm": 1.2841352224349976, "learning_rate": 0.0006245733788395905, "loss": 1.9493, "step": 1654 }, { "epoch": 1.882821387940842, "grad_norm": 0.8569640517234802, "learning_rate": 0.0006243458475540387, "loss": 1.9457, "step": 1655 }, { "epoch": 1.8839590443686007, "grad_norm": 1.32857346534729, "learning_rate": 0.0006241183162684869, "loss": 1.7589, "step": 1656 }, { "epoch": 1.8850967007963595, "grad_norm": 0.8553183674812317, "learning_rate": 0.0006238907849829352, "loss": 1.2652, "step": 1657 }, { "epoch": 1.8862343572241183, "grad_norm": 0.9133054614067078, "learning_rate": 0.0006236632536973834, "loss": 1.5618, "step": 1658 }, { "epoch": 1.887372013651877, "grad_norm": 1.1053630113601685, "learning_rate": 0.0006234357224118316, "loss": 1.4316, "step": 1659 }, { "epoch": 1.888509670079636, "grad_norm": 1.3917018175125122, "learning_rate": 0.0006232081911262799, "loss": 3.7291, "step": 1660 }, { "epoch": 1.8896473265073948, "grad_norm": 0.6863465905189514, "learning_rate": 0.0006229806598407281, "loss": 1.2208, "step": 1661 }, { "epoch": 1.8907849829351537, "grad_norm": 1.6601629257202148, "learning_rate": 0.0006227531285551762, "loss": 3.3405, "step": 1662 }, { "epoch": 1.8919226393629125, "grad_norm": 0.6873295903205872, "learning_rate": 0.0006225255972696246, "loss": 1.66, "step": 1663 }, { "epoch": 1.8930602957906713, "grad_norm": 1.2501400709152222, "learning_rate": 0.0006222980659840728, "loss": 2.2344, "step": 1664 }, { "epoch": 1.89419795221843, "grad_norm": 1.2724485397338867, "learning_rate": 0.000622070534698521, "loss": 1.8736, "step": 1665 }, { "epoch": 1.8953356086461888, "grad_norm": 1.0667139291763306, "learning_rate": 0.0006218430034129693, "loss": 
3.2257, "step": 1666 }, { "epoch": 1.8964732650739475, "grad_norm": 0.7864385843276978, "learning_rate": 0.0006216154721274175, "loss": 1.8004, "step": 1667 }, { "epoch": 1.8976109215017065, "grad_norm": 0.8352164030075073, "learning_rate": 0.0006213879408418658, "loss": 1.4674, "step": 1668 }, { "epoch": 1.8987485779294653, "grad_norm": 0.8850026726722717, "learning_rate": 0.000621160409556314, "loss": 1.7806, "step": 1669 }, { "epoch": 1.8998862343572243, "grad_norm": 0.7337632775306702, "learning_rate": 0.0006209328782707623, "loss": 1.2096, "step": 1670 }, { "epoch": 1.901023890784983, "grad_norm": 0.9589748978614807, "learning_rate": 0.0006207053469852106, "loss": 2.1104, "step": 1671 }, { "epoch": 1.9021615472127418, "grad_norm": 1.047741174697876, "learning_rate": 0.0006204778156996587, "loss": 2.5989, "step": 1672 }, { "epoch": 1.9032992036405005, "grad_norm": 1.0804181098937988, "learning_rate": 0.0006202502844141069, "loss": 1.5729, "step": 1673 }, { "epoch": 1.9044368600682593, "grad_norm": 0.99363774061203, "learning_rate": 0.0006200227531285552, "loss": 2.5205, "step": 1674 }, { "epoch": 1.905574516496018, "grad_norm": 1.1013110876083374, "learning_rate": 0.0006197952218430034, "loss": 1.4929, "step": 1675 }, { "epoch": 1.906712172923777, "grad_norm": 1.1436536312103271, "learning_rate": 0.0006195676905574516, "loss": 1.503, "step": 1676 }, { "epoch": 1.9078498293515358, "grad_norm": 0.9888268113136292, "learning_rate": 0.0006193401592718999, "loss": 1.8105, "step": 1677 }, { "epoch": 1.9089874857792948, "grad_norm": 0.5987884998321533, "learning_rate": 0.0006191126279863481, "loss": 0.6153, "step": 1678 }, { "epoch": 1.9101251422070535, "grad_norm": 1.2569278478622437, "learning_rate": 0.0006188850967007964, "loss": 2.58, "step": 1679 }, { "epoch": 1.9112627986348123, "grad_norm": 1.7216945886611938, "learning_rate": 0.0006186575654152447, "loss": 3.3363, "step": 1680 }, { "epoch": 1.912400455062571, "grad_norm": 0.8490168452262878, "learning_rate": 
0.0006184300341296929, "loss": 1.6139, "step": 1681 }, { "epoch": 1.9135381114903298, "grad_norm": 0.8663358688354492, "learning_rate": 0.000618202502844141, "loss": 1.0861, "step": 1682 }, { "epoch": 1.9146757679180886, "grad_norm": 1.2262353897094727, "learning_rate": 0.0006179749715585893, "loss": 2.4656, "step": 1683 }, { "epoch": 1.9158134243458476, "grad_norm": 0.8106787800788879, "learning_rate": 0.0006177474402730375, "loss": 0.7479, "step": 1684 }, { "epoch": 1.9169510807736063, "grad_norm": 0.8843865394592285, "learning_rate": 0.0006175199089874857, "loss": 1.8057, "step": 1685 }, { "epoch": 1.9180887372013653, "grad_norm": 1.0910826921463013, "learning_rate": 0.000617292377701934, "loss": 2.1269, "step": 1686 }, { "epoch": 1.919226393629124, "grad_norm": 0.8909353613853455, "learning_rate": 0.0006170648464163823, "loss": 1.4374, "step": 1687 }, { "epoch": 1.9203640500568828, "grad_norm": 0.838852047920227, "learning_rate": 0.0006168373151308306, "loss": 1.455, "step": 1688 }, { "epoch": 1.9215017064846416, "grad_norm": 1.186137080192566, "learning_rate": 0.0006166097838452788, "loss": 2.9913, "step": 1689 }, { "epoch": 1.9226393629124003, "grad_norm": 0.947559654712677, "learning_rate": 0.000616382252559727, "loss": 2.5864, "step": 1690 }, { "epoch": 1.9237770193401593, "grad_norm": 1.1154701709747314, "learning_rate": 0.0006161547212741753, "loss": 1.7531, "step": 1691 }, { "epoch": 1.924914675767918, "grad_norm": 0.9917047023773193, "learning_rate": 0.0006159271899886234, "loss": 1.847, "step": 1692 }, { "epoch": 1.926052332195677, "grad_norm": 0.8470612168312073, "learning_rate": 0.0006156996587030716, "loss": 1.9226, "step": 1693 }, { "epoch": 1.9271899886234358, "grad_norm": 1.0250667333602905, "learning_rate": 0.0006154721274175199, "loss": 1.3656, "step": 1694 }, { "epoch": 1.9283276450511946, "grad_norm": 1.122154951095581, "learning_rate": 0.0006152445961319682, "loss": 1.7061, "step": 1695 }, { "epoch": 1.9294653014789533, "grad_norm": 
0.7498093843460083, "learning_rate": 0.0006150170648464164, "loss": 1.86, "step": 1696 }, { "epoch": 1.930602957906712, "grad_norm": 1.526132583618164, "learning_rate": 0.0006147895335608647, "loss": 1.9259, "step": 1697 }, { "epoch": 1.9317406143344709, "grad_norm": 0.7942246794700623, "learning_rate": 0.0006145620022753129, "loss": 2.1063, "step": 1698 }, { "epoch": 1.9328782707622298, "grad_norm": 0.7092151045799255, "learning_rate": 0.0006143344709897611, "loss": 1.3752, "step": 1699 }, { "epoch": 1.9340159271899886, "grad_norm": 1.0814862251281738, "learning_rate": 0.0006141069397042094, "loss": 1.3951, "step": 1700 }, { "epoch": 1.9351535836177476, "grad_norm": 0.8918007016181946, "learning_rate": 0.0006138794084186575, "loss": 1.4203, "step": 1701 }, { "epoch": 1.9362912400455063, "grad_norm": 1.2497609853744507, "learning_rate": 0.0006136518771331057, "loss": 2.2451, "step": 1702 }, { "epoch": 1.937428896473265, "grad_norm": 1.0966827869415283, "learning_rate": 0.000613424345847554, "loss": 2.617, "step": 1703 }, { "epoch": 1.9385665529010239, "grad_norm": 0.6311344504356384, "learning_rate": 0.0006131968145620023, "loss": 0.9732, "step": 1704 }, { "epoch": 1.9397042093287826, "grad_norm": 0.7735137939453125, "learning_rate": 0.0006129692832764505, "loss": 1.6126, "step": 1705 }, { "epoch": 1.9408418657565414, "grad_norm": 0.8205373287200928, "learning_rate": 0.0006127417519908988, "loss": 0.8851, "step": 1706 }, { "epoch": 1.9419795221843004, "grad_norm": 1.8471014499664307, "learning_rate": 0.000612514220705347, "loss": 3.0294, "step": 1707 }, { "epoch": 1.9431171786120591, "grad_norm": 0.8841797113418579, "learning_rate": 0.0006122866894197953, "loss": 2.4339, "step": 1708 }, { "epoch": 1.944254835039818, "grad_norm": 1.112204670906067, "learning_rate": 0.0006120591581342435, "loss": 3.4836, "step": 1709 }, { "epoch": 1.9453924914675769, "grad_norm": 0.7821568250656128, "learning_rate": 0.0006118316268486917, "loss": 1.1741, "step": 1710 }, { "epoch": 
1.9465301478953356, "grad_norm": 0.9497551918029785, "learning_rate": 0.0006116040955631399, "loss": 1.5836, "step": 1711 }, { "epoch": 1.9476678043230944, "grad_norm": 0.9662843942642212, "learning_rate": 0.0006113765642775882, "loss": 2.4538, "step": 1712 }, { "epoch": 1.9488054607508531, "grad_norm": 0.8117369413375854, "learning_rate": 0.0006111490329920364, "loss": 2.783, "step": 1713 }, { "epoch": 1.949943117178612, "grad_norm": 0.7812523245811462, "learning_rate": 0.0006109215017064847, "loss": 1.5216, "step": 1714 }, { "epoch": 1.9510807736063709, "grad_norm": 0.7647354006767273, "learning_rate": 0.0006106939704209329, "loss": 1.8356, "step": 1715 }, { "epoch": 1.9522184300341296, "grad_norm": 1.348557949066162, "learning_rate": 0.0006104664391353811, "loss": 2.3718, "step": 1716 }, { "epoch": 1.9533560864618886, "grad_norm": 0.9883414506912231, "learning_rate": 0.0006102389078498294, "loss": 1.9512, "step": 1717 }, { "epoch": 1.9544937428896474, "grad_norm": 1.112888216972351, "learning_rate": 0.0006100113765642776, "loss": 1.9945, "step": 1718 }, { "epoch": 1.9556313993174061, "grad_norm": 1.1321780681610107, "learning_rate": 0.0006097838452787258, "loss": 1.7481, "step": 1719 }, { "epoch": 1.956769055745165, "grad_norm": 1.408277153968811, "learning_rate": 0.0006095563139931742, "loss": 2.5749, "step": 1720 }, { "epoch": 1.9579067121729237, "grad_norm": 0.9839853644371033, "learning_rate": 0.0006093287827076223, "loss": 1.2224, "step": 1721 }, { "epoch": 1.9590443686006824, "grad_norm": 0.9178591966629028, "learning_rate": 0.0006091012514220705, "loss": 1.5087, "step": 1722 }, { "epoch": 1.9601820250284414, "grad_norm": 1.2959963083267212, "learning_rate": 0.0006088737201365188, "loss": 3.2079, "step": 1723 }, { "epoch": 1.9613196814562004, "grad_norm": 0.9269609451293945, "learning_rate": 0.000608646188850967, "loss": 1.5689, "step": 1724 }, { "epoch": 1.9624573378839592, "grad_norm": 0.8281601071357727, "learning_rate": 0.0006084186575654152, "loss": 
1.2933, "step": 1725 }, { "epoch": 1.963594994311718, "grad_norm": 1.0037777423858643, "learning_rate": 0.0006081911262798635, "loss": 1.7339, "step": 1726 }, { "epoch": 1.9647326507394767, "grad_norm": 1.0925790071487427, "learning_rate": 0.0006079635949943117, "loss": 2.8919, "step": 1727 }, { "epoch": 1.9658703071672354, "grad_norm": 0.9971732497215271, "learning_rate": 0.00060773606370876, "loss": 2.1831, "step": 1728 }, { "epoch": 1.9670079635949942, "grad_norm": 1.161362648010254, "learning_rate": 0.0006075085324232083, "loss": 3.2354, "step": 1729 }, { "epoch": 1.9681456200227532, "grad_norm": 1.0122507810592651, "learning_rate": 0.0006072810011376564, "loss": 2.0288, "step": 1730 }, { "epoch": 1.969283276450512, "grad_norm": 0.47560903429985046, "learning_rate": 0.0006070534698521047, "loss": 0.5176, "step": 1731 }, { "epoch": 1.970420932878271, "grad_norm": 0.6199741363525391, "learning_rate": 0.0006068259385665529, "loss": 1.2013, "step": 1732 }, { "epoch": 1.9715585893060297, "grad_norm": 1.1044777631759644, "learning_rate": 0.0006065984072810011, "loss": 1.7702, "step": 1733 }, { "epoch": 1.9726962457337884, "grad_norm": 0.9448471069335938, "learning_rate": 0.0006063708759954494, "loss": 1.9484, "step": 1734 }, { "epoch": 1.9738339021615472, "grad_norm": 0.9100543856620789, "learning_rate": 0.0006061433447098976, "loss": 2.0936, "step": 1735 }, { "epoch": 1.974971558589306, "grad_norm": 0.6803526282310486, "learning_rate": 0.0006059158134243458, "loss": 1.1829, "step": 1736 }, { "epoch": 1.9761092150170647, "grad_norm": 1.1807910203933716, "learning_rate": 0.0006056882821387942, "loss": 2.4637, "step": 1737 }, { "epoch": 1.9772468714448237, "grad_norm": 0.9801465272903442, "learning_rate": 0.0006054607508532424, "loss": 1.9998, "step": 1738 }, { "epoch": 1.9783845278725825, "grad_norm": 0.987866222858429, "learning_rate": 0.0006052332195676906, "loss": 1.2759, "step": 1739 }, { "epoch": 1.9795221843003414, "grad_norm": 1.0122973918914795, 
"learning_rate": 0.0006050056882821388, "loss": 0.9901, "step": 1740 }, { "epoch": 1.9806598407281002, "grad_norm": 0.9722836017608643, "learning_rate": 0.000604778156996587, "loss": 2.178, "step": 1741 }, { "epoch": 1.981797497155859, "grad_norm": 0.9257890582084656, "learning_rate": 0.0006045506257110352, "loss": 2.1119, "step": 1742 }, { "epoch": 1.9829351535836177, "grad_norm": 0.6407757997512817, "learning_rate": 0.0006043230944254835, "loss": 1.2141, "step": 1743 }, { "epoch": 1.9840728100113765, "grad_norm": 0.7565338015556335, "learning_rate": 0.0006040955631399317, "loss": 1.7172, "step": 1744 }, { "epoch": 1.9852104664391352, "grad_norm": 0.7070271372795105, "learning_rate": 0.0006038680318543799, "loss": 0.856, "step": 1745 }, { "epoch": 1.9863481228668942, "grad_norm": 1.3683280944824219, "learning_rate": 0.0006036405005688283, "loss": 3.7194, "step": 1746 }, { "epoch": 1.987485779294653, "grad_norm": 0.8713628649711609, "learning_rate": 0.0006034129692832765, "loss": 2.4061, "step": 1747 }, { "epoch": 1.988623435722412, "grad_norm": 1.6695564985275269, "learning_rate": 0.0006031854379977248, "loss": 3.2265, "step": 1748 }, { "epoch": 1.9897610921501707, "grad_norm": 1.007358431816101, "learning_rate": 0.000602957906712173, "loss": 2.1454, "step": 1749 }, { "epoch": 1.9908987485779295, "grad_norm": 1.256252408027649, "learning_rate": 0.0006027303754266211, "loss": 2.2461, "step": 1750 }, { "epoch": 1.9920364050056882, "grad_norm": 1.4839099645614624, "learning_rate": 0.0006025028441410694, "loss": 3.5124, "step": 1751 }, { "epoch": 1.993174061433447, "grad_norm": 0.8481181263923645, "learning_rate": 0.0006022753128555176, "loss": 1.5822, "step": 1752 }, { "epoch": 1.9943117178612058, "grad_norm": 1.6936454772949219, "learning_rate": 0.0006020477815699658, "loss": 2.7639, "step": 1753 }, { "epoch": 1.9954493742889647, "grad_norm": 1.1564828157424927, "learning_rate": 0.0006018202502844142, "loss": 2.4716, "step": 1754 }, { "epoch": 1.9965870307167235, 
"grad_norm": 1.2504764795303345, "learning_rate": 0.0006015927189988624, "loss": 1.9542, "step": 1755 }, { "epoch": 1.9977246871444825, "grad_norm": 0.9221636652946472, "learning_rate": 0.0006013651877133106, "loss": 2.1488, "step": 1756 }, { "epoch": 1.9988623435722412, "grad_norm": 1.1251505613327026, "learning_rate": 0.0006011376564277589, "loss": 1.9403, "step": 1757 }, { "epoch": 2.0, "grad_norm": 0.7302814722061157, "learning_rate": 0.0006009101251422071, "loss": 1.4557, "step": 1758 }, { "epoch": 2.0, "eval_f1": 0.8901, "eval_gen_len": 49.6091, "eval_loss": 1.861061930656433, "eval_precision": 0.8889, "eval_recall": 0.8915, "eval_rouge1": 0.4491, "eval_rouge2": 0.2031, "eval_rougeL": 0.3721, "eval_rougeLsum": 0.4148, "eval_runtime": 28.0457, "eval_samples_per_second": 3.922, "eval_steps_per_second": 0.499, "step": 1758 }, { "epoch": 2.0011376564277588, "grad_norm": 0.7474573850631714, "learning_rate": 0.0006006825938566553, "loss": 0.9735, "step": 1759 }, { "epoch": 2.0022753128555175, "grad_norm": 1.003105640411377, "learning_rate": 0.0006004550625711035, "loss": 1.6829, "step": 1760 }, { "epoch": 2.0034129692832763, "grad_norm": 0.7681793570518494, "learning_rate": 0.0006002275312855517, "loss": 1.6531, "step": 1761 }, { "epoch": 2.0045506257110355, "grad_norm": 1.0925558805465698, "learning_rate": 0.0006, "loss": 2.754, "step": 1762 }, { "epoch": 2.0056882821387942, "grad_norm": 0.8437767028808594, "learning_rate": 0.0005997724687144483, "loss": 1.3487, "step": 1763 }, { "epoch": 2.006825938566553, "grad_norm": 0.6217951774597168, "learning_rate": 0.0005995449374288965, "loss": 1.3183, "step": 1764 }, { "epoch": 2.0079635949943118, "grad_norm": 0.9381899237632751, "learning_rate": 0.0005993174061433447, "loss": 1.5027, "step": 1765 }, { "epoch": 2.0091012514220705, "grad_norm": 0.801108181476593, "learning_rate": 0.000599089874857793, "loss": 1.207, "step": 1766 }, { "epoch": 2.0102389078498293, "grad_norm": 0.9824976325035095, "learning_rate": 
0.0005988623435722412, "loss": 2.4301, "step": 1767 }, { "epoch": 2.011376564277588, "grad_norm": 0.7496545910835266, "learning_rate": 0.0005986348122866895, "loss": 1.25, "step": 1768 }, { "epoch": 2.012514220705347, "grad_norm": 0.8144867420196533, "learning_rate": 0.0005984072810011376, "loss": 1.9912, "step": 1769 }, { "epoch": 2.013651877133106, "grad_norm": 1.2619274854660034, "learning_rate": 0.0005981797497155858, "loss": 2.0556, "step": 1770 }, { "epoch": 2.0147895335608648, "grad_norm": 0.8984837532043457, "learning_rate": 0.0005979522184300342, "loss": 1.3548, "step": 1771 }, { "epoch": 2.0159271899886235, "grad_norm": 0.9129440784454346, "learning_rate": 0.0005977246871444824, "loss": 2.3292, "step": 1772 }, { "epoch": 2.0170648464163823, "grad_norm": 0.7606117129325867, "learning_rate": 0.0005974971558589306, "loss": 1.3862, "step": 1773 }, { "epoch": 2.018202502844141, "grad_norm": 1.030834674835205, "learning_rate": 0.0005972696245733789, "loss": 1.4272, "step": 1774 }, { "epoch": 2.0193401592719, "grad_norm": 0.6545241475105286, "learning_rate": 0.0005970420932878271, "loss": 1.1461, "step": 1775 }, { "epoch": 2.0204778156996586, "grad_norm": 0.8098762631416321, "learning_rate": 0.0005968145620022753, "loss": 1.6309, "step": 1776 }, { "epoch": 2.0216154721274173, "grad_norm": 1.2966359853744507, "learning_rate": 0.0005965870307167236, "loss": 2.2563, "step": 1777 }, { "epoch": 2.0227531285551765, "grad_norm": 1.0538915395736694, "learning_rate": 0.0005963594994311718, "loss": 2.8999, "step": 1778 }, { "epoch": 2.0238907849829353, "grad_norm": 0.788987934589386, "learning_rate": 0.00059613196814562, "loss": 1.5423, "step": 1779 }, { "epoch": 2.025028441410694, "grad_norm": 0.9566778540611267, "learning_rate": 0.0005959044368600683, "loss": 2.4709, "step": 1780 }, { "epoch": 2.026166097838453, "grad_norm": 1.1112850904464722, "learning_rate": 0.0005956769055745165, "loss": 2.3085, "step": 1781 }, { "epoch": 2.0273037542662116, "grad_norm": 
1.074602484703064, "learning_rate": 0.0005954493742889647, "loss": 1.9328, "step": 1782 }, { "epoch": 2.0284414106939703, "grad_norm": 1.017351508140564, "learning_rate": 0.000595221843003413, "loss": 2.1152, "step": 1783 }, { "epoch": 2.029579067121729, "grad_norm": 1.069661021232605, "learning_rate": 0.0005949943117178612, "loss": 1.8133, "step": 1784 }, { "epoch": 2.030716723549488, "grad_norm": 1.099387764930725, "learning_rate": 0.0005947667804323094, "loss": 2.207, "step": 1785 }, { "epoch": 2.031854379977247, "grad_norm": 1.02996027469635, "learning_rate": 0.0005945392491467577, "loss": 1.9207, "step": 1786 }, { "epoch": 2.032992036405006, "grad_norm": 0.7691861391067505, "learning_rate": 0.000594311717861206, "loss": 1.2884, "step": 1787 }, { "epoch": 2.0341296928327646, "grad_norm": 0.9716812968254089, "learning_rate": 0.0005940841865756542, "loss": 1.4235, "step": 1788 }, { "epoch": 2.0352673492605233, "grad_norm": 0.9133804440498352, "learning_rate": 0.0005938566552901024, "loss": 1.654, "step": 1789 }, { "epoch": 2.036405005688282, "grad_norm": 1.1487979888916016, "learning_rate": 0.0005936291240045506, "loss": 2.4215, "step": 1790 }, { "epoch": 2.037542662116041, "grad_norm": 0.8201186060905457, "learning_rate": 0.0005934015927189989, "loss": 1.775, "step": 1791 }, { "epoch": 2.0386803185437996, "grad_norm": 1.0376436710357666, "learning_rate": 0.0005931740614334471, "loss": 2.3957, "step": 1792 }, { "epoch": 2.039817974971559, "grad_norm": 0.9257051944732666, "learning_rate": 0.0005929465301478953, "loss": 2.02, "step": 1793 }, { "epoch": 2.0409556313993176, "grad_norm": 1.158453106880188, "learning_rate": 0.0005927189988623436, "loss": 2.3353, "step": 1794 }, { "epoch": 2.0420932878270763, "grad_norm": 0.9140053987503052, "learning_rate": 0.0005924914675767918, "loss": 1.6694, "step": 1795 }, { "epoch": 2.043230944254835, "grad_norm": 0.8473706245422363, "learning_rate": 0.0005922639362912401, "loss": 1.4332, "step": 1796 }, { "epoch": 
2.044368600682594, "grad_norm": 1.5226987600326538, "learning_rate": 0.0005920364050056884, "loss": 2.6597, "step": 1797 }, { "epoch": 2.0455062571103526, "grad_norm": 1.0220167636871338, "learning_rate": 0.0005918088737201365, "loss": 1.9809, "step": 1798 }, { "epoch": 2.0466439135381114, "grad_norm": 0.8180732727050781, "learning_rate": 0.0005915813424345847, "loss": 1.5031, "step": 1799 }, { "epoch": 2.04778156996587, "grad_norm": 1.1611244678497314, "learning_rate": 0.000591353811149033, "loss": 1.2286, "step": 1800 }, { "epoch": 2.0489192263936293, "grad_norm": 1.1958653926849365, "learning_rate": 0.0005911262798634812, "loss": 2.6489, "step": 1801 }, { "epoch": 2.050056882821388, "grad_norm": 1.252683401107788, "learning_rate": 0.0005908987485779294, "loss": 2.6139, "step": 1802 }, { "epoch": 2.051194539249147, "grad_norm": 1.3542306423187256, "learning_rate": 0.0005906712172923777, "loss": 2.6136, "step": 1803 }, { "epoch": 2.0523321956769056, "grad_norm": 1.3260592222213745, "learning_rate": 0.000590443686006826, "loss": 3.5016, "step": 1804 }, { "epoch": 2.0534698521046644, "grad_norm": 1.2958813905715942, "learning_rate": 0.0005902161547212742, "loss": 3.2267, "step": 1805 }, { "epoch": 2.054607508532423, "grad_norm": 0.963817834854126, "learning_rate": 0.0005899886234357225, "loss": 2.2298, "step": 1806 }, { "epoch": 2.055745164960182, "grad_norm": 0.8188225626945496, "learning_rate": 0.0005897610921501707, "loss": 1.3832, "step": 1807 }, { "epoch": 2.0568828213879407, "grad_norm": 1.3529433012008667, "learning_rate": 0.0005895335608646188, "loss": 1.5602, "step": 1808 }, { "epoch": 2.0580204778157, "grad_norm": 1.1197400093078613, "learning_rate": 0.0005893060295790671, "loss": 2.5839, "step": 1809 }, { "epoch": 2.0591581342434586, "grad_norm": 0.9456532001495361, "learning_rate": 0.0005890784982935153, "loss": 2.6459, "step": 1810 }, { "epoch": 2.0602957906712174, "grad_norm": 0.923581063747406, "learning_rate": 0.0005888509670079636, "loss": 1.9121, 
"step": 1811 }, { "epoch": 2.061433447098976, "grad_norm": 1.202931523323059, "learning_rate": 0.0005886234357224119, "loss": 1.8609, "step": 1812 }, { "epoch": 2.062571103526735, "grad_norm": 1.0954371690750122, "learning_rate": 0.0005883959044368601, "loss": 1.6039, "step": 1813 }, { "epoch": 2.0637087599544937, "grad_norm": 0.8433867692947388, "learning_rate": 0.0005881683731513084, "loss": 1.2853, "step": 1814 }, { "epoch": 2.0648464163822524, "grad_norm": 0.7211450934410095, "learning_rate": 0.0005879408418657566, "loss": 0.9241, "step": 1815 }, { "epoch": 2.065984072810011, "grad_norm": 1.0169728994369507, "learning_rate": 0.0005877133105802048, "loss": 3.0501, "step": 1816 }, { "epoch": 2.0671217292377704, "grad_norm": 1.1246399879455566, "learning_rate": 0.0005874857792946531, "loss": 2.3138, "step": 1817 }, { "epoch": 2.068259385665529, "grad_norm": 0.945504367351532, "learning_rate": 0.0005872582480091012, "loss": 1.3534, "step": 1818 }, { "epoch": 2.069397042093288, "grad_norm": 0.8048107624053955, "learning_rate": 0.0005870307167235494, "loss": 2.2219, "step": 1819 }, { "epoch": 2.0705346985210467, "grad_norm": 0.7122802734375, "learning_rate": 0.0005868031854379977, "loss": 1.239, "step": 1820 }, { "epoch": 2.0716723549488054, "grad_norm": 0.5502296686172485, "learning_rate": 0.000586575654152446, "loss": 1.1367, "step": 1821 }, { "epoch": 2.072810011376564, "grad_norm": 0.9720338582992554, "learning_rate": 0.0005863481228668942, "loss": 2.4818, "step": 1822 }, { "epoch": 2.073947667804323, "grad_norm": 1.1907950639724731, "learning_rate": 0.0005861205915813425, "loss": 3.3499, "step": 1823 }, { "epoch": 2.0750853242320817, "grad_norm": 0.7875828146934509, "learning_rate": 0.0005858930602957907, "loss": 1.2593, "step": 1824 }, { "epoch": 2.076222980659841, "grad_norm": 1.0819391012191772, "learning_rate": 0.0005856655290102389, "loss": 2.0526, "step": 1825 }, { "epoch": 2.0773606370875997, "grad_norm": 0.7829857468605042, "learning_rate": 
0.0005854379977246872, "loss": 1.7224, "step": 1826 }, { "epoch": 2.0784982935153584, "grad_norm": 0.6612402200698853, "learning_rate": 0.0005852104664391354, "loss": 1.3579, "step": 1827 }, { "epoch": 2.079635949943117, "grad_norm": 0.6340951323509216, "learning_rate": 0.0005849829351535835, "loss": 1.2102, "step": 1828 }, { "epoch": 2.080773606370876, "grad_norm": 0.6889867186546326, "learning_rate": 0.0005847554038680319, "loss": 1.0524, "step": 1829 }, { "epoch": 2.0819112627986347, "grad_norm": 1.2241733074188232, "learning_rate": 0.0005845278725824801, "loss": 1.7329, "step": 1830 }, { "epoch": 2.0830489192263935, "grad_norm": 0.9656223654747009, "learning_rate": 0.0005843003412969284, "loss": 1.5006, "step": 1831 }, { "epoch": 2.0841865756541527, "grad_norm": 0.9914301037788391, "learning_rate": 0.0005840728100113766, "loss": 1.72, "step": 1832 }, { "epoch": 2.0853242320819114, "grad_norm": 0.6369336843490601, "learning_rate": 0.0005838452787258248, "loss": 1.4671, "step": 1833 }, { "epoch": 2.08646188850967, "grad_norm": 1.0386950969696045, "learning_rate": 0.0005836177474402731, "loss": 1.8412, "step": 1834 }, { "epoch": 2.087599544937429, "grad_norm": 0.9129205346107483, "learning_rate": 0.0005833902161547213, "loss": 1.8206, "step": 1835 }, { "epoch": 2.0887372013651877, "grad_norm": 0.8238604664802551, "learning_rate": 0.0005831626848691695, "loss": 1.8884, "step": 1836 }, { "epoch": 2.0898748577929465, "grad_norm": 1.2888492345809937, "learning_rate": 0.0005829351535836177, "loss": 2.121, "step": 1837 }, { "epoch": 2.091012514220705, "grad_norm": 1.0287847518920898, "learning_rate": 0.000582707622298066, "loss": 1.878, "step": 1838 }, { "epoch": 2.092150170648464, "grad_norm": 1.340661883354187, "learning_rate": 0.0005824800910125142, "loss": 2.5264, "step": 1839 }, { "epoch": 2.093287827076223, "grad_norm": 0.8161119818687439, "learning_rate": 0.0005822525597269625, "loss": 1.8105, "step": 1840 }, { "epoch": 2.094425483503982, "grad_norm": 
1.0750125646591187, "learning_rate": 0.0005820250284414107, "loss": 2.3553, "step": 1841 }, { "epoch": 2.0955631399317407, "grad_norm": 1.0218100547790527, "learning_rate": 0.0005817974971558589, "loss": 1.8317, "step": 1842 }, { "epoch": 2.0967007963594995, "grad_norm": 0.5259703397750854, "learning_rate": 0.0005815699658703072, "loss": 0.7726, "step": 1843 }, { "epoch": 2.0978384527872582, "grad_norm": 0.9108942151069641, "learning_rate": 0.0005813424345847554, "loss": 1.7108, "step": 1844 }, { "epoch": 2.098976109215017, "grad_norm": 1.2494343519210815, "learning_rate": 0.0005811149032992036, "loss": 2.0056, "step": 1845 }, { "epoch": 2.1001137656427757, "grad_norm": 0.6544501185417175, "learning_rate": 0.000580887372013652, "loss": 1.1255, "step": 1846 }, { "epoch": 2.1012514220705345, "grad_norm": 1.4156732559204102, "learning_rate": 0.0005806598407281001, "loss": 2.7314, "step": 1847 }, { "epoch": 2.1023890784982937, "grad_norm": 0.6912670731544495, "learning_rate": 0.0005804323094425483, "loss": 1.3614, "step": 1848 }, { "epoch": 2.1035267349260525, "grad_norm": 0.7118052840232849, "learning_rate": 0.0005802047781569966, "loss": 1.9744, "step": 1849 }, { "epoch": 2.1046643913538112, "grad_norm": 0.9591397047042847, "learning_rate": 0.0005799772468714448, "loss": 0.8785, "step": 1850 }, { "epoch": 2.10580204778157, "grad_norm": 0.7862758040428162, "learning_rate": 0.0005797497155858931, "loss": 1.5104, "step": 1851 }, { "epoch": 2.1069397042093287, "grad_norm": 0.8719160556793213, "learning_rate": 0.0005795221843003413, "loss": 1.3436, "step": 1852 }, { "epoch": 2.1080773606370875, "grad_norm": 0.7223889231681824, "learning_rate": 0.0005792946530147895, "loss": 1.116, "step": 1853 }, { "epoch": 2.1092150170648463, "grad_norm": 1.0977421998977661, "learning_rate": 0.0005790671217292379, "loss": 1.5358, "step": 1854 }, { "epoch": 2.110352673492605, "grad_norm": 1.2012536525726318, "learning_rate": 0.0005788395904436861, "loss": 1.9212, "step": 1855 }, { 
"epoch": 2.1114903299203642, "grad_norm": 1.2642079591751099, "learning_rate": 0.0005786120591581343, "loss": 2.3365, "step": 1856 }, { "epoch": 2.112627986348123, "grad_norm": 0.7523776888847351, "learning_rate": 0.0005783845278725825, "loss": 1.2154, "step": 1857 }, { "epoch": 2.1137656427758817, "grad_norm": 0.4518072307109833, "learning_rate": 0.0005781569965870307, "loss": 0.5069, "step": 1858 }, { "epoch": 2.1149032992036405, "grad_norm": 0.6998793482780457, "learning_rate": 0.0005779294653014789, "loss": 1.7925, "step": 1859 }, { "epoch": 2.1160409556313993, "grad_norm": 1.0769786834716797, "learning_rate": 0.0005777019340159272, "loss": 2.0462, "step": 1860 }, { "epoch": 2.117178612059158, "grad_norm": 1.269291877746582, "learning_rate": 0.0005774744027303754, "loss": 1.5268, "step": 1861 }, { "epoch": 2.118316268486917, "grad_norm": 1.7893264293670654, "learning_rate": 0.0005772468714448236, "loss": 3.6841, "step": 1862 }, { "epoch": 2.1194539249146755, "grad_norm": 1.0378164052963257, "learning_rate": 0.000577019340159272, "loss": 1.8547, "step": 1863 }, { "epoch": 2.1205915813424348, "grad_norm": 0.983869731426239, "learning_rate": 0.0005767918088737202, "loss": 2.1582, "step": 1864 }, { "epoch": 2.1217292377701935, "grad_norm": 0.6995578408241272, "learning_rate": 0.0005765642775881684, "loss": 1.7941, "step": 1865 }, { "epoch": 2.1228668941979523, "grad_norm": 0.8206987380981445, "learning_rate": 0.0005763367463026167, "loss": 1.4095, "step": 1866 }, { "epoch": 2.124004550625711, "grad_norm": 0.9261349439620972, "learning_rate": 0.0005761092150170648, "loss": 1.2816, "step": 1867 }, { "epoch": 2.12514220705347, "grad_norm": 0.971121609210968, "learning_rate": 0.000575881683731513, "loss": 1.9692, "step": 1868 }, { "epoch": 2.1262798634812285, "grad_norm": 0.8004586100578308, "learning_rate": 0.0005756541524459613, "loss": 1.4504, "step": 1869 }, { "epoch": 2.1274175199089873, "grad_norm": 1.1579177379608154, "learning_rate": 0.0005754266211604095, 
"loss": 1.8514, "step": 1870 }, { "epoch": 2.1285551763367465, "grad_norm": 0.6660380959510803, "learning_rate": 0.0005751990898748578, "loss": 1.2882, "step": 1871 }, { "epoch": 2.1296928327645053, "grad_norm": 1.0082716941833496, "learning_rate": 0.0005749715585893061, "loss": 1.9409, "step": 1872 }, { "epoch": 2.130830489192264, "grad_norm": 0.6915638446807861, "learning_rate": 0.0005747440273037543, "loss": 1.6384, "step": 1873 }, { "epoch": 2.131968145620023, "grad_norm": 0.91794753074646, "learning_rate": 0.0005745164960182026, "loss": 2.066, "step": 1874 }, { "epoch": 2.1331058020477816, "grad_norm": 0.7902770638465881, "learning_rate": 0.0005742889647326508, "loss": 1.3539, "step": 1875 }, { "epoch": 2.1342434584755403, "grad_norm": 0.8815358281135559, "learning_rate": 0.0005740614334470989, "loss": 1.9951, "step": 1876 }, { "epoch": 2.135381114903299, "grad_norm": 2.684619665145874, "learning_rate": 0.0005738339021615472, "loss": 2.3747, "step": 1877 }, { "epoch": 2.136518771331058, "grad_norm": 0.864995539188385, "learning_rate": 0.0005736063708759954, "loss": 1.2905, "step": 1878 }, { "epoch": 2.137656427758817, "grad_norm": 1.144092082977295, "learning_rate": 0.0005733788395904436, "loss": 3.1334, "step": 1879 }, { "epoch": 2.138794084186576, "grad_norm": 0.8507954478263855, "learning_rate": 0.000573151308304892, "loss": 2.1402, "step": 1880 }, { "epoch": 2.1399317406143346, "grad_norm": 0.9215604662895203, "learning_rate": 0.0005729237770193402, "loss": 1.339, "step": 1881 }, { "epoch": 2.1410693970420933, "grad_norm": 1.1672780513763428, "learning_rate": 0.0005726962457337884, "loss": 1.6453, "step": 1882 }, { "epoch": 2.142207053469852, "grad_norm": 1.1848803758621216, "learning_rate": 0.0005724687144482367, "loss": 2.445, "step": 1883 }, { "epoch": 2.143344709897611, "grad_norm": 1.0215684175491333, "learning_rate": 0.0005722411831626849, "loss": 1.3356, "step": 1884 }, { "epoch": 2.1444823663253696, "grad_norm": 0.9936044216156006, "learning_rate": 
0.0005720136518771331, "loss": 1.9894, "step": 1885 }, { "epoch": 2.1456200227531284, "grad_norm": 1.2094396352767944, "learning_rate": 0.0005717861205915813, "loss": 1.5998, "step": 1886 }, { "epoch": 2.1467576791808876, "grad_norm": 1.1404001712799072, "learning_rate": 0.0005715585893060295, "loss": 2.1319, "step": 1887 }, { "epoch": 2.1478953356086463, "grad_norm": 0.656340479850769, "learning_rate": 0.0005713310580204778, "loss": 1.1715, "step": 1888 }, { "epoch": 2.149032992036405, "grad_norm": 1.03688645362854, "learning_rate": 0.0005711035267349261, "loss": 2.4935, "step": 1889 }, { "epoch": 2.150170648464164, "grad_norm": 0.8806471228599548, "learning_rate": 0.0005708759954493743, "loss": 1.9498, "step": 1890 }, { "epoch": 2.1513083048919226, "grad_norm": 0.9819498062133789, "learning_rate": 0.0005706484641638225, "loss": 1.7589, "step": 1891 }, { "epoch": 2.1524459613196814, "grad_norm": 0.9967643618583679, "learning_rate": 0.0005704209328782708, "loss": 1.479, "step": 1892 }, { "epoch": 2.15358361774744, "grad_norm": 0.8531262278556824, "learning_rate": 0.000570193401592719, "loss": 2.3837, "step": 1893 }, { "epoch": 2.1547212741751993, "grad_norm": 0.7077489495277405, "learning_rate": 0.0005699658703071673, "loss": 0.9478, "step": 1894 }, { "epoch": 2.155858930602958, "grad_norm": 0.4791664183139801, "learning_rate": 0.0005697383390216155, "loss": 0.6543, "step": 1895 }, { "epoch": 2.156996587030717, "grad_norm": 1.3306792974472046, "learning_rate": 0.0005695108077360637, "loss": 2.6389, "step": 1896 }, { "epoch": 2.1581342434584756, "grad_norm": 0.8933519721031189, "learning_rate": 0.000569283276450512, "loss": 1.1383, "step": 1897 }, { "epoch": 2.1592718998862344, "grad_norm": 1.3570899963378906, "learning_rate": 0.0005690557451649602, "loss": 1.7026, "step": 1898 }, { "epoch": 2.160409556313993, "grad_norm": 0.8122515678405762, "learning_rate": 0.0005688282138794084, "loss": 1.8717, "step": 1899 }, { "epoch": 2.161547212741752, "grad_norm": 
1.0075249671936035, "learning_rate": 0.0005686006825938567, "loss": 1.9019, "step": 1900 }, { "epoch": 2.1626848691695106, "grad_norm": 1.1624641418457031, "learning_rate": 0.0005683731513083049, "loss": 1.4538, "step": 1901 }, { "epoch": 2.1638225255972694, "grad_norm": 1.6518383026123047, "learning_rate": 0.0005681456200227531, "loss": 3.6701, "step": 1902 }, { "epoch": 2.1649601820250286, "grad_norm": 0.6541008353233337, "learning_rate": 0.0005679180887372014, "loss": 1.3688, "step": 1903 }, { "epoch": 2.1660978384527874, "grad_norm": 1.05472993850708, "learning_rate": 0.0005676905574516497, "loss": 2.1368, "step": 1904 }, { "epoch": 2.167235494880546, "grad_norm": 1.1374725103378296, "learning_rate": 0.0005674630261660978, "loss": 3.3545, "step": 1905 }, { "epoch": 2.168373151308305, "grad_norm": 1.0366812944412231, "learning_rate": 0.0005672354948805461, "loss": 2.4839, "step": 1906 }, { "epoch": 2.1695108077360636, "grad_norm": 0.8595036268234253, "learning_rate": 0.0005670079635949943, "loss": 1.784, "step": 1907 }, { "epoch": 2.1706484641638224, "grad_norm": 1.023034930229187, "learning_rate": 0.0005667804323094425, "loss": 1.3478, "step": 1908 }, { "epoch": 2.171786120591581, "grad_norm": 0.741868257522583, "learning_rate": 0.0005665529010238908, "loss": 1.0789, "step": 1909 }, { "epoch": 2.1729237770193404, "grad_norm": 1.1272695064544678, "learning_rate": 0.000566325369738339, "loss": 2.037, "step": 1910 }, { "epoch": 2.174061433447099, "grad_norm": 1.0772629976272583, "learning_rate": 0.0005660978384527872, "loss": 2.3381, "step": 1911 }, { "epoch": 2.175199089874858, "grad_norm": 1.0794868469238281, "learning_rate": 0.0005658703071672356, "loss": 2.9036, "step": 1912 }, { "epoch": 2.1763367463026166, "grad_norm": 0.9306978583335876, "learning_rate": 0.0005656427758816838, "loss": 2.0821, "step": 1913 }, { "epoch": 2.1774744027303754, "grad_norm": 0.88816237449646, "learning_rate": 0.0005654152445961321, "loss": 1.7599, "step": 1914 }, { "epoch": 
2.178612059158134, "grad_norm": 1.169001817703247, "learning_rate": 0.0005651877133105802, "loss": 2.1915, "step": 1915 }, { "epoch": 2.179749715585893, "grad_norm": 1.0751813650131226, "learning_rate": 0.0005649601820250284, "loss": 1.6451, "step": 1916 }, { "epoch": 2.1808873720136517, "grad_norm": 1.062515377998352, "learning_rate": 0.0005647326507394767, "loss": 2.4742, "step": 1917 }, { "epoch": 2.182025028441411, "grad_norm": 0.9888359308242798, "learning_rate": 0.0005645051194539249, "loss": 3.1331, "step": 1918 }, { "epoch": 2.1831626848691696, "grad_norm": 0.8473877310752869, "learning_rate": 0.0005642775881683731, "loss": 1.4678, "step": 1919 }, { "epoch": 2.1843003412969284, "grad_norm": 0.85085129737854, "learning_rate": 0.0005640500568828214, "loss": 2.127, "step": 1920 }, { "epoch": 2.185437997724687, "grad_norm": 1.2935888767242432, "learning_rate": 0.0005638225255972697, "loss": 3.5023, "step": 1921 }, { "epoch": 2.186575654152446, "grad_norm": 0.8884962201118469, "learning_rate": 0.0005635949943117179, "loss": 0.8726, "step": 1922 }, { "epoch": 2.1877133105802047, "grad_norm": 0.9087686538696289, "learning_rate": 0.0005633674630261662, "loss": 1.3675, "step": 1923 }, { "epoch": 2.1888509670079634, "grad_norm": 0.6281200647354126, "learning_rate": 0.0005631399317406144, "loss": 0.7575, "step": 1924 }, { "epoch": 2.189988623435722, "grad_norm": 1.0639797449111938, "learning_rate": 0.0005629124004550625, "loss": 2.3185, "step": 1925 }, { "epoch": 2.1911262798634814, "grad_norm": 0.596128523349762, "learning_rate": 0.0005626848691695108, "loss": 0.8454, "step": 1926 }, { "epoch": 2.19226393629124, "grad_norm": 0.8989168405532837, "learning_rate": 0.000562457337883959, "loss": 0.7302, "step": 1927 }, { "epoch": 2.193401592718999, "grad_norm": 0.7336655855178833, "learning_rate": 0.0005622298065984072, "loss": 1.2145, "step": 1928 }, { "epoch": 2.1945392491467577, "grad_norm": 1.077013611793518, "learning_rate": 0.0005620022753128556, "loss": 2.4185, 
"step": 1929 }, { "epoch": 2.1956769055745164, "grad_norm": 0.8258651494979858, "learning_rate": 0.0005617747440273038, "loss": 1.4536, "step": 1930 }, { "epoch": 2.196814562002275, "grad_norm": 0.569733202457428, "learning_rate": 0.000561547212741752, "loss": 0.7994, "step": 1931 }, { "epoch": 2.197952218430034, "grad_norm": 0.843547523021698, "learning_rate": 0.0005613196814562003, "loss": 1.3272, "step": 1932 }, { "epoch": 2.199089874857793, "grad_norm": 0.8367027044296265, "learning_rate": 0.0005610921501706485, "loss": 1.625, "step": 1933 }, { "epoch": 2.200227531285552, "grad_norm": 1.0637445449829102, "learning_rate": 0.0005608646188850968, "loss": 1.3352, "step": 1934 }, { "epoch": 2.2013651877133107, "grad_norm": 1.4886406660079956, "learning_rate": 0.0005606370875995449, "loss": 2.3982, "step": 1935 }, { "epoch": 2.2025028441410694, "grad_norm": 1.015264630317688, "learning_rate": 0.0005604095563139931, "loss": 2.7429, "step": 1936 }, { "epoch": 2.203640500568828, "grad_norm": 1.2239853143692017, "learning_rate": 0.0005601820250284414, "loss": 1.8034, "step": 1937 }, { "epoch": 2.204778156996587, "grad_norm": 1.1284427642822266, "learning_rate": 0.0005599544937428897, "loss": 2.232, "step": 1938 }, { "epoch": 2.2059158134243457, "grad_norm": 1.118551254272461, "learning_rate": 0.0005597269624573379, "loss": 1.9726, "step": 1939 }, { "epoch": 2.2070534698521045, "grad_norm": 1.115544080734253, "learning_rate": 0.0005594994311717862, "loss": 2.3797, "step": 1940 }, { "epoch": 2.2081911262798632, "grad_norm": 0.8640685677528381, "learning_rate": 0.0005592718998862344, "loss": 2.5765, "step": 1941 }, { "epoch": 2.2093287827076225, "grad_norm": 1.0405633449554443, "learning_rate": 0.0005590443686006826, "loss": 1.6865, "step": 1942 }, { "epoch": 2.210466439135381, "grad_norm": 1.6807098388671875, "learning_rate": 0.0005588168373151309, "loss": 2.5395, "step": 1943 }, { "epoch": 2.21160409556314, "grad_norm": 0.9843060374259949, "learning_rate": 
0.000558589306029579, "loss": 2.0961, "step": 1944 }, { "epoch": 2.2127417519908987, "grad_norm": 0.995924174785614, "learning_rate": 0.0005583617747440272, "loss": 2.0971, "step": 1945 }, { "epoch": 2.2138794084186575, "grad_norm": 0.8533014059066772, "learning_rate": 0.0005581342434584756, "loss": 1.5151, "step": 1946 }, { "epoch": 2.2150170648464163, "grad_norm": 0.9610128402709961, "learning_rate": 0.0005579067121729238, "loss": 2.239, "step": 1947 }, { "epoch": 2.216154721274175, "grad_norm": 0.799690306186676, "learning_rate": 0.000557679180887372, "loss": 1.5567, "step": 1948 }, { "epoch": 2.217292377701934, "grad_norm": 0.8226889371871948, "learning_rate": 0.0005574516496018203, "loss": 2.6057, "step": 1949 }, { "epoch": 2.218430034129693, "grad_norm": 1.314974308013916, "learning_rate": 0.0005572241183162685, "loss": 2.2834, "step": 1950 }, { "epoch": 2.2195676905574517, "grad_norm": 0.9259127974510193, "learning_rate": 0.0005569965870307167, "loss": 1.9402, "step": 1951 }, { "epoch": 2.2207053469852105, "grad_norm": 1.1179683208465576, "learning_rate": 0.000556769055745165, "loss": 2.6231, "step": 1952 }, { "epoch": 2.2218430034129693, "grad_norm": 1.0641530752182007, "learning_rate": 0.0005565415244596132, "loss": 2.055, "step": 1953 }, { "epoch": 2.222980659840728, "grad_norm": 0.7080785036087036, "learning_rate": 0.0005563139931740613, "loss": 0.7283, "step": 1954 }, { "epoch": 2.2241183162684868, "grad_norm": 1.3224704265594482, "learning_rate": 0.0005560864618885097, "loss": 2.189, "step": 1955 }, { "epoch": 2.2252559726962455, "grad_norm": 0.9138402938842773, "learning_rate": 0.0005558589306029579, "loss": 1.9759, "step": 1956 }, { "epoch": 2.2263936291240047, "grad_norm": 1.1304935216903687, "learning_rate": 0.0005556313993174062, "loss": 2.2992, "step": 1957 }, { "epoch": 2.2275312855517635, "grad_norm": 0.7692627906799316, "learning_rate": 0.0005554038680318544, "loss": 1.195, "step": 1958 }, { "epoch": 2.2286689419795223, "grad_norm": 
0.9814881086349487, "learning_rate": 0.0005551763367463026, "loss": 1.7418, "step": 1959 }, { "epoch": 2.229806598407281, "grad_norm": 1.449527621269226, "learning_rate": 0.0005549488054607509, "loss": 2.8254, "step": 1960 }, { "epoch": 2.2309442548350398, "grad_norm": 1.177649974822998, "learning_rate": 0.0005547212741751991, "loss": 2.1793, "step": 1961 }, { "epoch": 2.2320819112627985, "grad_norm": 1.1293201446533203, "learning_rate": 0.0005544937428896473, "loss": 3.0604, "step": 1962 }, { "epoch": 2.2332195676905573, "grad_norm": 0.9934133887290955, "learning_rate": 0.0005542662116040957, "loss": 1.8564, "step": 1963 }, { "epoch": 2.234357224118316, "grad_norm": 0.8571220636367798, "learning_rate": 0.0005540386803185438, "loss": 1.4268, "step": 1964 }, { "epoch": 2.2354948805460753, "grad_norm": 0.84453284740448, "learning_rate": 0.000553811149032992, "loss": 1.4253, "step": 1965 }, { "epoch": 2.236632536973834, "grad_norm": 0.842588484287262, "learning_rate": 0.0005535836177474403, "loss": 2.4052, "step": 1966 }, { "epoch": 2.2377701934015928, "grad_norm": 1.4743871688842773, "learning_rate": 0.0005533560864618885, "loss": 2.9152, "step": 1967 }, { "epoch": 2.2389078498293515, "grad_norm": 0.6935710310935974, "learning_rate": 0.0005531285551763367, "loss": 1.2375, "step": 1968 }, { "epoch": 2.2400455062571103, "grad_norm": 1.2612910270690918, "learning_rate": 0.000552901023890785, "loss": 1.9998, "step": 1969 }, { "epoch": 2.241183162684869, "grad_norm": 0.8181163668632507, "learning_rate": 0.0005526734926052332, "loss": 1.3744, "step": 1970 }, { "epoch": 2.242320819112628, "grad_norm": 1.1267778873443604, "learning_rate": 0.0005524459613196815, "loss": 2.3809, "step": 1971 }, { "epoch": 2.243458475540387, "grad_norm": 1.200745701789856, "learning_rate": 0.0005522184300341298, "loss": 2.4499, "step": 1972 }, { "epoch": 2.244596131968146, "grad_norm": 1.04634690284729, "learning_rate": 0.0005519908987485779, "loss": 1.1126, "step": 1973 }, { "epoch": 
2.2457337883959045, "grad_norm": 0.9528573751449585, "learning_rate": 0.0005517633674630261, "loss": 2.4237, "step": 1974 }, { "epoch": 2.2468714448236633, "grad_norm": 1.3052282333374023, "learning_rate": 0.0005515358361774744, "loss": 2.5489, "step": 1975 }, { "epoch": 2.248009101251422, "grad_norm": 1.095739722251892, "learning_rate": 0.0005513083048919226, "loss": 1.1754, "step": 1976 }, { "epoch": 2.249146757679181, "grad_norm": 1.1007702350616455, "learning_rate": 0.0005510807736063709, "loss": 1.7626, "step": 1977 }, { "epoch": 2.2502844141069396, "grad_norm": 0.6495097279548645, "learning_rate": 0.0005508532423208191, "loss": 1.7669, "step": 1978 }, { "epoch": 2.2514220705346983, "grad_norm": 0.9295183420181274, "learning_rate": 0.0005506257110352673, "loss": 1.9161, "step": 1979 }, { "epoch": 2.252559726962457, "grad_norm": 1.0629956722259521, "learning_rate": 0.0005503981797497157, "loss": 2.2528, "step": 1980 }, { "epoch": 2.2536973833902163, "grad_norm": 0.6436116695404053, "learning_rate": 0.0005501706484641639, "loss": 1.2159, "step": 1981 }, { "epoch": 2.254835039817975, "grad_norm": 1.0467777252197266, "learning_rate": 0.0005499431171786121, "loss": 2.0886, "step": 1982 }, { "epoch": 2.255972696245734, "grad_norm": 0.7404427528381348, "learning_rate": 0.0005497155858930603, "loss": 1.4913, "step": 1983 }, { "epoch": 2.2571103526734926, "grad_norm": 1.0175822973251343, "learning_rate": 0.0005494880546075085, "loss": 1.9654, "step": 1984 }, { "epoch": 2.2582480091012513, "grad_norm": 0.930240273475647, "learning_rate": 0.0005492605233219567, "loss": 1.7043, "step": 1985 }, { "epoch": 2.25938566552901, "grad_norm": 2.17866587638855, "learning_rate": 0.000549032992036405, "loss": 3.0881, "step": 1986 }, { "epoch": 2.260523321956769, "grad_norm": 1.5989547967910767, "learning_rate": 0.0005488054607508532, "loss": 3.0951, "step": 1987 }, { "epoch": 2.261660978384528, "grad_norm": 1.6310112476348877, "learning_rate": 0.0005485779294653015, "loss": 3.687, 
"step": 1988 }, { "epoch": 2.262798634812287, "grad_norm": 0.682499885559082, "learning_rate": 0.0005483503981797498, "loss": 1.5835, "step": 1989 }, { "epoch": 2.2639362912400456, "grad_norm": 1.0656623840332031, "learning_rate": 0.000548122866894198, "loss": 2.0308, "step": 1990 }, { "epoch": 2.2650739476678043, "grad_norm": 0.7635136842727661, "learning_rate": 0.0005478953356086462, "loss": 2.4743, "step": 1991 }, { "epoch": 2.266211604095563, "grad_norm": 0.7663008570671082, "learning_rate": 0.0005476678043230945, "loss": 2.0004, "step": 1992 }, { "epoch": 2.267349260523322, "grad_norm": 0.5305643677711487, "learning_rate": 0.0005474402730375426, "loss": 0.7431, "step": 1993 }, { "epoch": 2.2684869169510806, "grad_norm": 1.2221653461456299, "learning_rate": 0.0005472127417519908, "loss": 2.3999, "step": 1994 }, { "epoch": 2.26962457337884, "grad_norm": 1.0990016460418701, "learning_rate": 0.0005469852104664391, "loss": 3.3276, "step": 1995 }, { "epoch": 2.2707622298065986, "grad_norm": 1.074623703956604, "learning_rate": 0.0005467576791808873, "loss": 2.2739, "step": 1996 }, { "epoch": 2.2718998862343573, "grad_norm": 0.7581028342247009, "learning_rate": 0.0005465301478953357, "loss": 1.9877, "step": 1997 }, { "epoch": 2.273037542662116, "grad_norm": 1.2376879453659058, "learning_rate": 0.0005463026166097839, "loss": 2.5045, "step": 1998 }, { "epoch": 2.274175199089875, "grad_norm": 0.8391635417938232, "learning_rate": 0.0005460750853242321, "loss": 1.414, "step": 1999 }, { "epoch": 2.2753128555176336, "grad_norm": 0.917779803276062, "learning_rate": 0.0005458475540386804, "loss": 1.5652, "step": 2000 }, { "epoch": 2.2764505119453924, "grad_norm": 0.9148494005203247, "learning_rate": 0.0005456200227531286, "loss": 0.9553, "step": 2001 }, { "epoch": 2.277588168373151, "grad_norm": 1.6205681562423706, "learning_rate": 0.0005453924914675768, "loss": 3.2662, "step": 2002 }, { "epoch": 2.27872582480091, "grad_norm": 0.5104256272315979, "learning_rate": 
0.000545164960182025, "loss": 1.1526, "step": 2003 }, { "epoch": 2.279863481228669, "grad_norm": 0.9608463644981384, "learning_rate": 0.0005449374288964732, "loss": 1.3495, "step": 2004 }, { "epoch": 2.281001137656428, "grad_norm": 0.8729520440101624, "learning_rate": 0.0005447098976109215, "loss": 1.6019, "step": 2005 }, { "epoch": 2.2821387940841866, "grad_norm": 0.7411619424819946, "learning_rate": 0.0005444823663253698, "loss": 1.5559, "step": 2006 }, { "epoch": 2.2832764505119454, "grad_norm": 0.7767372727394104, "learning_rate": 0.000544254835039818, "loss": 2.0135, "step": 2007 }, { "epoch": 2.284414106939704, "grad_norm": 0.5355758666992188, "learning_rate": 0.0005440273037542662, "loss": 1.1862, "step": 2008 }, { "epoch": 2.285551763367463, "grad_norm": 0.719421923160553, "learning_rate": 0.0005437997724687145, "loss": 1.1892, "step": 2009 }, { "epoch": 2.2866894197952217, "grad_norm": 0.7264088988304138, "learning_rate": 0.0005435722411831627, "loss": 1.5477, "step": 2010 }, { "epoch": 2.287827076222981, "grad_norm": 0.8898872137069702, "learning_rate": 0.0005433447098976109, "loss": 1.5906, "step": 2011 }, { "epoch": 2.2889647326507396, "grad_norm": 0.6888718605041504, "learning_rate": 0.0005431171786120591, "loss": 1.1981, "step": 2012 }, { "epoch": 2.2901023890784984, "grad_norm": 0.6405702829360962, "learning_rate": 0.0005428896473265074, "loss": 0.9176, "step": 2013 }, { "epoch": 2.291240045506257, "grad_norm": 1.125874638557434, "learning_rate": 0.0005426621160409556, "loss": 1.9476, "step": 2014 }, { "epoch": 2.292377701934016, "grad_norm": 0.9443691372871399, "learning_rate": 0.0005424345847554039, "loss": 1.1386, "step": 2015 }, { "epoch": 2.2935153583617747, "grad_norm": 1.1550638675689697, "learning_rate": 0.0005422070534698521, "loss": 2.695, "step": 2016 }, { "epoch": 2.2946530147895334, "grad_norm": 0.9223167896270752, "learning_rate": 0.0005419795221843004, "loss": 1.962, "step": 2017 }, { "epoch": 2.295790671217292, "grad_norm": 
0.9546693563461304, "learning_rate": 0.0005417519908987486, "loss": 1.9184, "step": 2018 }, { "epoch": 2.296928327645051, "grad_norm": 1.0364398956298828, "learning_rate": 0.0005415244596131968, "loss": 2.3646, "step": 2019 }, { "epoch": 2.29806598407281, "grad_norm": 0.7954057455062866, "learning_rate": 0.0005412969283276451, "loss": 1.0092, "step": 2020 }, { "epoch": 2.299203640500569, "grad_norm": 0.9007298946380615, "learning_rate": 0.0005410693970420934, "loss": 1.6872, "step": 2021 }, { "epoch": 2.3003412969283277, "grad_norm": 0.9581443071365356, "learning_rate": 0.0005408418657565415, "loss": 2.0143, "step": 2022 }, { "epoch": 2.3014789533560864, "grad_norm": 0.9659209251403809, "learning_rate": 0.0005406143344709898, "loss": 1.2962, "step": 2023 }, { "epoch": 2.302616609783845, "grad_norm": 0.974143385887146, "learning_rate": 0.000540386803185438, "loss": 2.7885, "step": 2024 }, { "epoch": 2.303754266211604, "grad_norm": 1.0462844371795654, "learning_rate": 0.0005401592718998862, "loss": 2.1409, "step": 2025 }, { "epoch": 2.3048919226393627, "grad_norm": 0.8965214490890503, "learning_rate": 0.0005399317406143345, "loss": 2.2281, "step": 2026 }, { "epoch": 2.306029579067122, "grad_norm": 1.1970421075820923, "learning_rate": 0.0005397042093287827, "loss": 2.378, "step": 2027 }, { "epoch": 2.3071672354948807, "grad_norm": 1.012830138206482, "learning_rate": 0.0005394766780432309, "loss": 1.4127, "step": 2028 }, { "epoch": 2.3083048919226394, "grad_norm": 0.8547312021255493, "learning_rate": 0.0005392491467576793, "loss": 2.5707, "step": 2029 }, { "epoch": 2.309442548350398, "grad_norm": 1.0248013734817505, "learning_rate": 0.0005390216154721275, "loss": 1.1376, "step": 2030 }, { "epoch": 2.310580204778157, "grad_norm": 1.2407397031784058, "learning_rate": 0.0005387940841865757, "loss": 2.5036, "step": 2031 }, { "epoch": 2.3117178612059157, "grad_norm": 1.169358730316162, "learning_rate": 0.0005385665529010239, "loss": 2.4767, "step": 2032 }, { "epoch": 
2.3128555176336745, "grad_norm": 0.552113950252533, "learning_rate": 0.0005383390216154721, "loss": 0.8825, "step": 2033 }, { "epoch": 2.3139931740614337, "grad_norm": 1.2916892766952515, "learning_rate": 0.0005381114903299203, "loss": 2.3583, "step": 2034 }, { "epoch": 2.3151308304891924, "grad_norm": 0.9264313578605652, "learning_rate": 0.0005378839590443686, "loss": 1.945, "step": 2035 }, { "epoch": 2.316268486916951, "grad_norm": 0.6379786729812622, "learning_rate": 0.0005376564277588168, "loss": 0.7919, "step": 2036 }, { "epoch": 2.31740614334471, "grad_norm": 1.199393630027771, "learning_rate": 0.0005374288964732651, "loss": 2.3252, "step": 2037 }, { "epoch": 2.3185437997724687, "grad_norm": 1.0265052318572998, "learning_rate": 0.0005372013651877134, "loss": 1.6382, "step": 2038 }, { "epoch": 2.3196814562002275, "grad_norm": 1.808610439300537, "learning_rate": 0.0005369738339021616, "loss": 3.1174, "step": 2039 }, { "epoch": 2.3208191126279862, "grad_norm": 0.7785828113555908, "learning_rate": 0.0005367463026166099, "loss": 1.7553, "step": 2040 }, { "epoch": 2.321956769055745, "grad_norm": 1.1902951002120972, "learning_rate": 0.0005365187713310581, "loss": 1.9243, "step": 2041 }, { "epoch": 2.3230944254835038, "grad_norm": 0.8798537254333496, "learning_rate": 0.0005362912400455062, "loss": 1.8127, "step": 2042 }, { "epoch": 2.324232081911263, "grad_norm": 1.0299949645996094, "learning_rate": 0.0005360637087599545, "loss": 2.0079, "step": 2043 }, { "epoch": 2.3253697383390217, "grad_norm": 0.8465110063552856, "learning_rate": 0.0005358361774744027, "loss": 1.6008, "step": 2044 }, { "epoch": 2.3265073947667805, "grad_norm": 1.031044363975525, "learning_rate": 0.0005356086461888509, "loss": 1.6429, "step": 2045 }, { "epoch": 2.3276450511945392, "grad_norm": 0.9194596409797668, "learning_rate": 0.0005353811149032993, "loss": 1.5493, "step": 2046 }, { "epoch": 2.328782707622298, "grad_norm": 0.8609010577201843, "learning_rate": 0.0005351535836177475, "loss": 
1.4473, "step": 2047 }, { "epoch": 2.3299203640500568, "grad_norm": 1.3532382249832153, "learning_rate": 0.0005349260523321957, "loss": 4.2047, "step": 2048 }, { "epoch": 2.3310580204778155, "grad_norm": 0.9629133939743042, "learning_rate": 0.000534698521046644, "loss": 1.1077, "step": 2049 }, { "epoch": 2.3321956769055747, "grad_norm": 0.8413902521133423, "learning_rate": 0.0005344709897610922, "loss": 1.5364, "step": 2050 }, { "epoch": 2.3333333333333335, "grad_norm": 1.0402729511260986, "learning_rate": 0.0005342434584755403, "loss": 2.7655, "step": 2051 }, { "epoch": 2.3344709897610922, "grad_norm": 0.9589785933494568, "learning_rate": 0.0005340159271899886, "loss": 1.5259, "step": 2052 }, { "epoch": 2.335608646188851, "grad_norm": 0.6738478541374207, "learning_rate": 0.0005337883959044368, "loss": 1.5741, "step": 2053 }, { "epoch": 2.3367463026166098, "grad_norm": 1.0174530744552612, "learning_rate": 0.000533560864618885, "loss": 3.3126, "step": 2054 }, { "epoch": 2.3378839590443685, "grad_norm": 0.9035469889640808, "learning_rate": 0.0005333333333333334, "loss": 1.6059, "step": 2055 }, { "epoch": 2.3390216154721273, "grad_norm": 1.1631959676742554, "learning_rate": 0.0005331058020477816, "loss": 1.7973, "step": 2056 }, { "epoch": 2.3401592718998865, "grad_norm": 0.8147276043891907, "learning_rate": 0.0005328782707622299, "loss": 1.3416, "step": 2057 }, { "epoch": 2.3412969283276452, "grad_norm": 1.1491146087646484, "learning_rate": 0.0005326507394766781, "loss": 3.0711, "step": 2058 }, { "epoch": 2.342434584755404, "grad_norm": 0.7472861409187317, "learning_rate": 0.0005324232081911263, "loss": 1.4432, "step": 2059 }, { "epoch": 2.3435722411831628, "grad_norm": 1.2172683477401733, "learning_rate": 0.0005321956769055746, "loss": 1.5058, "step": 2060 }, { "epoch": 2.3447098976109215, "grad_norm": 1.0745410919189453, "learning_rate": 0.0005319681456200227, "loss": 1.6888, "step": 2061 }, { "epoch": 2.3458475540386803, "grad_norm": 0.8925970792770386, 
"learning_rate": 0.0005317406143344709, "loss": 1.4089, "step": 2062 }, { "epoch": 2.346985210466439, "grad_norm": 0.8472179770469666, "learning_rate": 0.0005315130830489193, "loss": 1.4942, "step": 2063 }, { "epoch": 2.348122866894198, "grad_norm": 0.4184499979019165, "learning_rate": 0.0005312855517633675, "loss": 0.6314, "step": 2064 }, { "epoch": 2.3492605233219566, "grad_norm": 1.262465238571167, "learning_rate": 0.0005310580204778157, "loss": 2.0469, "step": 2065 }, { "epoch": 2.3503981797497158, "grad_norm": 1.097849726676941, "learning_rate": 0.000530830489192264, "loss": 2.8422, "step": 2066 }, { "epoch": 2.3515358361774745, "grad_norm": 1.6128443479537964, "learning_rate": 0.0005306029579067122, "loss": 1.9792, "step": 2067 }, { "epoch": 2.3526734926052333, "grad_norm": 0.8861138820648193, "learning_rate": 0.0005303754266211604, "loss": 1.7751, "step": 2068 }, { "epoch": 2.353811149032992, "grad_norm": 0.8302369713783264, "learning_rate": 0.0005301478953356087, "loss": 1.8749, "step": 2069 }, { "epoch": 2.354948805460751, "grad_norm": 0.9765578508377075, "learning_rate": 0.0005299203640500569, "loss": 2.7383, "step": 2070 }, { "epoch": 2.3560864618885096, "grad_norm": 1.0690255165100098, "learning_rate": 0.000529692832764505, "loss": 1.6116, "step": 2071 }, { "epoch": 2.3572241183162683, "grad_norm": 0.96379154920578, "learning_rate": 0.0005294653014789534, "loss": 1.7744, "step": 2072 }, { "epoch": 2.3583617747440275, "grad_norm": 0.5584749579429626, "learning_rate": 0.0005292377701934016, "loss": 1.0452, "step": 2073 }, { "epoch": 2.3594994311717863, "grad_norm": 1.5071784257888794, "learning_rate": 0.0005290102389078498, "loss": 3.7007, "step": 2074 }, { "epoch": 2.360637087599545, "grad_norm": 0.7578591108322144, "learning_rate": 0.0005287827076222981, "loss": 0.9569, "step": 2075 }, { "epoch": 2.361774744027304, "grad_norm": 0.8119309544563293, "learning_rate": 0.0005285551763367463, "loss": 1.6168, "step": 2076 }, { "epoch": 2.3629124004550626, 
"grad_norm": 0.9481661319732666, "learning_rate": 0.0005283276450511945, "loss": 2.9747, "step": 2077 }, { "epoch": 2.3640500568828213, "grad_norm": 0.9913211464881897, "learning_rate": 0.0005281001137656428, "loss": 2.35, "step": 2078 }, { "epoch": 2.36518771331058, "grad_norm": 1.1122124195098877, "learning_rate": 0.000527872582480091, "loss": 2.0505, "step": 2079 }, { "epoch": 2.366325369738339, "grad_norm": 0.8721759915351868, "learning_rate": 0.0005276450511945393, "loss": 1.6231, "step": 2080 }, { "epoch": 2.3674630261660976, "grad_norm": 0.8457727432250977, "learning_rate": 0.0005274175199089875, "loss": 1.6846, "step": 2081 }, { "epoch": 2.368600682593857, "grad_norm": 1.0846689939498901, "learning_rate": 0.0005271899886234357, "loss": 1.7606, "step": 2082 }, { "epoch": 2.3697383390216156, "grad_norm": 0.9165617823600769, "learning_rate": 0.000526962457337884, "loss": 1.3047, "step": 2083 }, { "epoch": 2.3708759954493743, "grad_norm": 1.1322617530822754, "learning_rate": 0.0005267349260523322, "loss": 1.1403, "step": 2084 }, { "epoch": 2.372013651877133, "grad_norm": 1.6939765214920044, "learning_rate": 0.0005265073947667804, "loss": 3.1601, "step": 2085 }, { "epoch": 2.373151308304892, "grad_norm": 1.0793713331222534, "learning_rate": 0.0005262798634812287, "loss": 2.0837, "step": 2086 }, { "epoch": 2.3742889647326506, "grad_norm": 0.8238543272018433, "learning_rate": 0.0005260523321956769, "loss": 1.438, "step": 2087 }, { "epoch": 2.3754266211604094, "grad_norm": 0.932790994644165, "learning_rate": 0.0005258248009101252, "loss": 1.9309, "step": 2088 }, { "epoch": 2.3765642775881686, "grad_norm": 0.6618528366088867, "learning_rate": 0.0005255972696245735, "loss": 1.4155, "step": 2089 }, { "epoch": 2.3777019340159273, "grad_norm": 0.7635231018066406, "learning_rate": 0.0005253697383390216, "loss": 1.3532, "step": 2090 }, { "epoch": 2.378839590443686, "grad_norm": 0.9490572810173035, "learning_rate": 0.0005251422070534698, "loss": 2.3154, "step": 2091 }, { 
"epoch": 2.379977246871445, "grad_norm": 1.2259362936019897, "learning_rate": 0.0005249146757679181, "loss": 2.4494, "step": 2092 }, { "epoch": 2.3811149032992036, "grad_norm": 0.7659657001495361, "learning_rate": 0.0005246871444823663, "loss": 1.1013, "step": 2093 }, { "epoch": 2.3822525597269624, "grad_norm": 1.1086747646331787, "learning_rate": 0.0005244596131968145, "loss": 2.263, "step": 2094 }, { "epoch": 2.383390216154721, "grad_norm": 0.9705541729927063, "learning_rate": 0.0005242320819112628, "loss": 1.4336, "step": 2095 }, { "epoch": 2.3845278725824803, "grad_norm": 0.5771173238754272, "learning_rate": 0.000524004550625711, "loss": 0.8406, "step": 2096 }, { "epoch": 2.385665529010239, "grad_norm": 0.424938440322876, "learning_rate": 0.0005237770193401593, "loss": 0.5156, "step": 2097 }, { "epoch": 2.386803185437998, "grad_norm": 0.7281388640403748, "learning_rate": 0.0005235494880546076, "loss": 0.8201, "step": 2098 }, { "epoch": 2.3879408418657566, "grad_norm": 1.5582374334335327, "learning_rate": 0.0005233219567690558, "loss": 3.5289, "step": 2099 }, { "epoch": 2.3890784982935154, "grad_norm": 1.2791839838027954, "learning_rate": 0.000523094425483504, "loss": 1.6449, "step": 2100 }, { "epoch": 2.390216154721274, "grad_norm": 1.1435104608535767, "learning_rate": 0.0005228668941979522, "loss": 2.1264, "step": 2101 }, { "epoch": 2.391353811149033, "grad_norm": 1.234621286392212, "learning_rate": 0.0005226393629124004, "loss": 1.0853, "step": 2102 }, { "epoch": 2.3924914675767917, "grad_norm": 0.9681219458580017, "learning_rate": 0.0005224118316268487, "loss": 2.4709, "step": 2103 }, { "epoch": 2.3936291240045504, "grad_norm": 1.064215064048767, "learning_rate": 0.0005221843003412969, "loss": 2.7476, "step": 2104 }, { "epoch": 2.3947667804323096, "grad_norm": 1.3225258588790894, "learning_rate": 0.0005219567690557452, "loss": 2.7113, "step": 2105 }, { "epoch": 2.3959044368600684, "grad_norm": 1.2326604127883911, "learning_rate": 0.0005217292377701935, 
"loss": 1.7622, "step": 2106 }, { "epoch": 2.397042093287827, "grad_norm": 0.7115911841392517, "learning_rate": 0.0005215017064846417, "loss": 1.8257, "step": 2107 }, { "epoch": 2.398179749715586, "grad_norm": 0.762967050075531, "learning_rate": 0.0005212741751990899, "loss": 1.5422, "step": 2108 }, { "epoch": 2.3993174061433447, "grad_norm": 1.2912230491638184, "learning_rate": 0.0005210466439135382, "loss": 2.3056, "step": 2109 }, { "epoch": 2.4004550625711034, "grad_norm": 0.8842760920524597, "learning_rate": 0.0005208191126279863, "loss": 1.9213, "step": 2110 }, { "epoch": 2.401592718998862, "grad_norm": 1.1768587827682495, "learning_rate": 0.0005205915813424345, "loss": 2.635, "step": 2111 }, { "epoch": 2.4027303754266214, "grad_norm": 0.7537290453910828, "learning_rate": 0.0005203640500568828, "loss": 1.4866, "step": 2112 }, { "epoch": 2.40386803185438, "grad_norm": 0.5235282182693481, "learning_rate": 0.000520136518771331, "loss": 1.3188, "step": 2113 }, { "epoch": 2.405005688282139, "grad_norm": 1.8466582298278809, "learning_rate": 0.0005199089874857793, "loss": 3.5333, "step": 2114 }, { "epoch": 2.4061433447098977, "grad_norm": 2.3424501419067383, "learning_rate": 0.0005196814562002276, "loss": 1.6052, "step": 2115 }, { "epoch": 2.4072810011376564, "grad_norm": 1.0736968517303467, "learning_rate": 0.0005194539249146758, "loss": 1.6249, "step": 2116 }, { "epoch": 2.408418657565415, "grad_norm": 1.0152912139892578, "learning_rate": 0.000519226393629124, "loss": 2.1684, "step": 2117 }, { "epoch": 2.409556313993174, "grad_norm": 0.896634578704834, "learning_rate": 0.0005189988623435723, "loss": 1.4528, "step": 2118 }, { "epoch": 2.4106939704209327, "grad_norm": 0.5919451713562012, "learning_rate": 0.0005187713310580204, "loss": 1.2616, "step": 2119 }, { "epoch": 2.4118316268486915, "grad_norm": 0.9945755004882812, "learning_rate": 0.0005185437997724687, "loss": 1.9749, "step": 2120 }, { "epoch": 2.4129692832764507, "grad_norm": 0.6944431066513062, 
"learning_rate": 0.000518316268486917, "loss": 1.538, "step": 2121 }, { "epoch": 2.4141069397042094, "grad_norm": 1.0082086324691772, "learning_rate": 0.0005180887372013652, "loss": 1.2937, "step": 2122 }, { "epoch": 2.415244596131968, "grad_norm": 0.8779463768005371, "learning_rate": 0.0005178612059158135, "loss": 2.8401, "step": 2123 }, { "epoch": 2.416382252559727, "grad_norm": 0.729810357093811, "learning_rate": 0.0005176336746302617, "loss": 2.1615, "step": 2124 }, { "epoch": 2.4175199089874857, "grad_norm": 0.5567348003387451, "learning_rate": 0.0005174061433447099, "loss": 1.0664, "step": 2125 }, { "epoch": 2.4186575654152445, "grad_norm": 0.7598175406455994, "learning_rate": 0.0005171786120591582, "loss": 1.6606, "step": 2126 }, { "epoch": 2.419795221843003, "grad_norm": 0.9565114378929138, "learning_rate": 0.0005169510807736064, "loss": 1.97, "step": 2127 }, { "epoch": 2.4209328782707624, "grad_norm": 1.6986292600631714, "learning_rate": 0.0005167235494880546, "loss": 3.6617, "step": 2128 }, { "epoch": 2.422070534698521, "grad_norm": 0.8568170666694641, "learning_rate": 0.0005164960182025028, "loss": 2.0927, "step": 2129 }, { "epoch": 2.42320819112628, "grad_norm": 1.218289852142334, "learning_rate": 0.000516268486916951, "loss": 2.0588, "step": 2130 }, { "epoch": 2.4243458475540387, "grad_norm": 1.114538550376892, "learning_rate": 0.0005160409556313993, "loss": 2.6901, "step": 2131 }, { "epoch": 2.4254835039817975, "grad_norm": 0.9539201855659485, "learning_rate": 0.0005158134243458476, "loss": 1.3669, "step": 2132 }, { "epoch": 2.426621160409556, "grad_norm": 1.0585428476333618, "learning_rate": 0.0005155858930602958, "loss": 2.5264, "step": 2133 }, { "epoch": 2.427758816837315, "grad_norm": 0.5750988125801086, "learning_rate": 0.000515358361774744, "loss": 1.1382, "step": 2134 }, { "epoch": 2.428896473265074, "grad_norm": 1.5301157236099243, "learning_rate": 0.0005151308304891923, "loss": 2.8573, "step": 2135 }, { "epoch": 2.430034129692833, 
"grad_norm": 0.6881375312805176, "learning_rate": 0.0005149032992036405, "loss": 1.4282, "step": 2136 }, { "epoch": 2.4311717861205917, "grad_norm": 0.642068088054657, "learning_rate": 0.0005146757679180887, "loss": 0.8719, "step": 2137 }, { "epoch": 2.4323094425483505, "grad_norm": 0.9515458941459656, "learning_rate": 0.0005144482366325371, "loss": 1.7866, "step": 2138 }, { "epoch": 2.4334470989761092, "grad_norm": 1.4139736890792847, "learning_rate": 0.0005142207053469852, "loss": 2.3178, "step": 2139 }, { "epoch": 2.434584755403868, "grad_norm": 0.7378571629524231, "learning_rate": 0.0005139931740614334, "loss": 1.5993, "step": 2140 }, { "epoch": 2.4357224118316267, "grad_norm": 1.243595004081726, "learning_rate": 0.0005137656427758817, "loss": 2.0773, "step": 2141 }, { "epoch": 2.4368600682593855, "grad_norm": 0.7528473734855652, "learning_rate": 0.0005135381114903299, "loss": 1.8747, "step": 2142 }, { "epoch": 2.4379977246871443, "grad_norm": 0.9805837273597717, "learning_rate": 0.0005133105802047782, "loss": 2.3093, "step": 2143 }, { "epoch": 2.4391353811149035, "grad_norm": 1.0196789503097534, "learning_rate": 0.0005130830489192264, "loss": 1.6322, "step": 2144 }, { "epoch": 2.4402730375426622, "grad_norm": 1.2920022010803223, "learning_rate": 0.0005128555176336746, "loss": 1.0212, "step": 2145 }, { "epoch": 2.441410693970421, "grad_norm": 1.036919116973877, "learning_rate": 0.000512627986348123, "loss": 2.4244, "step": 2146 }, { "epoch": 2.4425483503981797, "grad_norm": 0.7487674951553345, "learning_rate": 0.0005124004550625712, "loss": 1.2761, "step": 2147 }, { "epoch": 2.4436860068259385, "grad_norm": 0.9197911024093628, "learning_rate": 0.0005121729237770194, "loss": 2.1063, "step": 2148 }, { "epoch": 2.4448236632536973, "grad_norm": 0.9425987005233765, "learning_rate": 0.0005119453924914676, "loss": 1.6781, "step": 2149 }, { "epoch": 2.445961319681456, "grad_norm": 1.1719930171966553, "learning_rate": 0.0005117178612059158, "loss": 2.0511, "step": 2150 
}, { "epoch": 2.4470989761092152, "grad_norm": 0.8875211477279663, "learning_rate": 0.000511490329920364, "loss": 1.6565, "step": 2151 }, { "epoch": 2.448236632536974, "grad_norm": 0.61222243309021, "learning_rate": 0.0005112627986348123, "loss": 1.1201, "step": 2152 }, { "epoch": 2.4493742889647327, "grad_norm": 1.5771725177764893, "learning_rate": 0.0005110352673492605, "loss": 3.2926, "step": 2153 }, { "epoch": 2.4505119453924915, "grad_norm": 0.8051725029945374, "learning_rate": 0.0005108077360637087, "loss": 1.0753, "step": 2154 }, { "epoch": 2.4516496018202503, "grad_norm": 0.7018311619758606, "learning_rate": 0.0005105802047781571, "loss": 0.9978, "step": 2155 }, { "epoch": 2.452787258248009, "grad_norm": 1.0658284425735474, "learning_rate": 0.0005103526734926053, "loss": 3.1038, "step": 2156 }, { "epoch": 2.453924914675768, "grad_norm": 0.890017032623291, "learning_rate": 0.0005101251422070535, "loss": 1.3806, "step": 2157 }, { "epoch": 2.4550625711035265, "grad_norm": 1.4623593091964722, "learning_rate": 0.0005098976109215017, "loss": 3.3608, "step": 2158 }, { "epoch": 2.4562002275312853, "grad_norm": 1.3333830833435059, "learning_rate": 0.0005096700796359499, "loss": 1.9364, "step": 2159 }, { "epoch": 2.4573378839590445, "grad_norm": 1.0337042808532715, "learning_rate": 0.0005094425483503981, "loss": 1.6893, "step": 2160 }, { "epoch": 2.4584755403868033, "grad_norm": 1.2401093244552612, "learning_rate": 0.0005092150170648464, "loss": 1.8306, "step": 2161 }, { "epoch": 2.459613196814562, "grad_norm": 0.6069979071617126, "learning_rate": 0.0005089874857792946, "loss": 1.5413, "step": 2162 }, { "epoch": 2.460750853242321, "grad_norm": 1.1139168739318848, "learning_rate": 0.000508759954493743, "loss": 2.6418, "step": 2163 }, { "epoch": 2.4618885096700796, "grad_norm": 0.7371734976768494, "learning_rate": 0.0005085324232081912, "loss": 1.3743, "step": 2164 }, { "epoch": 2.4630261660978383, "grad_norm": 1.3759161233901978, "learning_rate": 
0.0005083048919226394, "loss": 2.3257, "step": 2165 }, { "epoch": 2.464163822525597, "grad_norm": 1.0357613563537598, "learning_rate": 0.0005080773606370877, "loss": 1.6746, "step": 2166 }, { "epoch": 2.4653014789533563, "grad_norm": 0.5313310623168945, "learning_rate": 0.0005078498293515359, "loss": 0.8946, "step": 2167 }, { "epoch": 2.466439135381115, "grad_norm": 0.9679710268974304, "learning_rate": 0.000507622298065984, "loss": 1.906, "step": 2168 }, { "epoch": 2.467576791808874, "grad_norm": 0.7682181596755981, "learning_rate": 0.0005073947667804323, "loss": 1.9641, "step": 2169 }, { "epoch": 2.4687144482366326, "grad_norm": 0.6322567462921143, "learning_rate": 0.0005071672354948805, "loss": 0.5872, "step": 2170 }, { "epoch": 2.4698521046643913, "grad_norm": 0.6293168663978577, "learning_rate": 0.0005069397042093287, "loss": 1.1534, "step": 2171 }, { "epoch": 2.47098976109215, "grad_norm": 0.9249051213264465, "learning_rate": 0.0005067121729237771, "loss": 2.3496, "step": 2172 }, { "epoch": 2.472127417519909, "grad_norm": 0.8873506784439087, "learning_rate": 0.0005064846416382253, "loss": 1.5146, "step": 2173 }, { "epoch": 2.473265073947668, "grad_norm": 0.8709667325019836, "learning_rate": 0.0005062571103526735, "loss": 1.8368, "step": 2174 }, { "epoch": 2.474402730375427, "grad_norm": 0.8933395147323608, "learning_rate": 0.0005060295790671218, "loss": 1.5841, "step": 2175 }, { "epoch": 2.4755403868031856, "grad_norm": 0.8149117231369019, "learning_rate": 0.00050580204778157, "loss": 1.6464, "step": 2176 }, { "epoch": 2.4766780432309443, "grad_norm": 0.8341697454452515, "learning_rate": 0.0005055745164960182, "loss": 1.1955, "step": 2177 }, { "epoch": 2.477815699658703, "grad_norm": 1.2008378505706787, "learning_rate": 0.0005053469852104664, "loss": 2.0201, "step": 2178 }, { "epoch": 2.478953356086462, "grad_norm": 1.0396666526794434, "learning_rate": 0.0005051194539249146, "loss": 1.9521, "step": 2179 }, { "epoch": 2.4800910125142206, "grad_norm": 
0.7898449897766113, "learning_rate": 0.0005048919226393628, "loss": 1.1885, "step": 2180 }, { "epoch": 2.4812286689419794, "grad_norm": 0.9955923557281494, "learning_rate": 0.0005046643913538112, "loss": 1.8145, "step": 2181 }, { "epoch": 2.482366325369738, "grad_norm": 1.1414570808410645, "learning_rate": 0.0005044368600682594, "loss": 2.2897, "step": 2182 }, { "epoch": 2.4835039817974973, "grad_norm": 1.0973477363586426, "learning_rate": 0.0005042093287827077, "loss": 2.2097, "step": 2183 }, { "epoch": 2.484641638225256, "grad_norm": 0.955952525138855, "learning_rate": 0.0005039817974971559, "loss": 1.6976, "step": 2184 }, { "epoch": 2.485779294653015, "grad_norm": 0.9474056959152222, "learning_rate": 0.0005037542662116041, "loss": 1.5724, "step": 2185 }, { "epoch": 2.4869169510807736, "grad_norm": 0.7748664021492004, "learning_rate": 0.0005035267349260524, "loss": 2.1302, "step": 2186 }, { "epoch": 2.4880546075085324, "grad_norm": 1.1004105806350708, "learning_rate": 0.0005032992036405005, "loss": 1.9072, "step": 2187 }, { "epoch": 2.489192263936291, "grad_norm": 0.8701795935630798, "learning_rate": 0.0005030716723549487, "loss": 1.2504, "step": 2188 }, { "epoch": 2.49032992036405, "grad_norm": 1.0033451318740845, "learning_rate": 0.0005028441410693971, "loss": 2.6241, "step": 2189 }, { "epoch": 2.491467576791809, "grad_norm": 0.9840807318687439, "learning_rate": 0.0005026166097838453, "loss": 1.5594, "step": 2190 }, { "epoch": 2.492605233219568, "grad_norm": 1.0026658773422241, "learning_rate": 0.0005023890784982935, "loss": 2.1102, "step": 2191 }, { "epoch": 2.4937428896473266, "grad_norm": 0.8774160146713257, "learning_rate": 0.0005021615472127418, "loss": 2.5087, "step": 2192 }, { "epoch": 2.4948805460750854, "grad_norm": 0.7475762963294983, "learning_rate": 0.00050193401592719, "loss": 0.8467, "step": 2193 }, { "epoch": 2.496018202502844, "grad_norm": 1.0985372066497803, "learning_rate": 0.0005017064846416382, "loss": 2.0641, "step": 2194 }, { "epoch": 
2.497155858930603, "grad_norm": 0.8731891512870789, "learning_rate": 0.0005014789533560865, "loss": 2.1272, "step": 2195 }, { "epoch": 2.4982935153583616, "grad_norm": 0.9103354215621948, "learning_rate": 0.0005012514220705347, "loss": 2.0572, "step": 2196 }, { "epoch": 2.4994311717861204, "grad_norm": 1.041029691696167, "learning_rate": 0.0005010238907849828, "loss": 1.9749, "step": 2197 }, { "epoch": 2.500568828213879, "grad_norm": 1.2097538709640503, "learning_rate": 0.0005007963594994312, "loss": 1.671, "step": 2198 }, { "epoch": 2.5017064846416384, "grad_norm": 0.9088473916053772, "learning_rate": 0.0005005688282138794, "loss": 2.0683, "step": 2199 }, { "epoch": 2.502844141069397, "grad_norm": 0.728599488735199, "learning_rate": 0.0005003412969283276, "loss": 1.4432, "step": 2200 }, { "epoch": 2.503981797497156, "grad_norm": 0.7432947754859924, "learning_rate": 0.0005001137656427759, "loss": 1.3989, "step": 2201 }, { "epoch": 2.5051194539249146, "grad_norm": 0.9259990453720093, "learning_rate": 0.0004998862343572241, "loss": 2.1398, "step": 2202 }, { "epoch": 2.5062571103526734, "grad_norm": 1.316038966178894, "learning_rate": 0.0004996587030716724, "loss": 2.2743, "step": 2203 }, { "epoch": 2.507394766780432, "grad_norm": 0.8661054372787476, "learning_rate": 0.0004994311717861205, "loss": 1.706, "step": 2204 }, { "epoch": 2.508532423208191, "grad_norm": 0.8994172215461731, "learning_rate": 0.0004992036405005689, "loss": 1.6767, "step": 2205 }, { "epoch": 2.50967007963595, "grad_norm": 0.683313250541687, "learning_rate": 0.0004989761092150171, "loss": 0.8057, "step": 2206 }, { "epoch": 2.510807736063709, "grad_norm": 1.4154108762741089, "learning_rate": 0.0004987485779294653, "loss": 1.5263, "step": 2207 }, { "epoch": 2.5119453924914676, "grad_norm": 1.0941131114959717, "learning_rate": 0.0004985210466439136, "loss": 1.7871, "step": 2208 }, { "epoch": 2.5130830489192264, "grad_norm": 0.8073767423629761, "learning_rate": 0.0004982935153583618, "loss": 1.7233, 
"step": 2209 }, { "epoch": 2.514220705346985, "grad_norm": 1.2724852561950684, "learning_rate": 0.00049806598407281, "loss": 2.4039, "step": 2210 }, { "epoch": 2.515358361774744, "grad_norm": 0.6580117344856262, "learning_rate": 0.0004978384527872582, "loss": 1.5502, "step": 2211 }, { "epoch": 2.5164960182025027, "grad_norm": 0.9353324174880981, "learning_rate": 0.0004976109215017065, "loss": 1.7699, "step": 2212 }, { "epoch": 2.517633674630262, "grad_norm": 0.7029836177825928, "learning_rate": 0.0004973833902161547, "loss": 1.4815, "step": 2213 }, { "epoch": 2.51877133105802, "grad_norm": 0.8808972239494324, "learning_rate": 0.000497155858930603, "loss": 1.9639, "step": 2214 }, { "epoch": 2.5199089874857794, "grad_norm": 0.7552292943000793, "learning_rate": 0.0004969283276450512, "loss": 0.9553, "step": 2215 }, { "epoch": 2.521046643913538, "grad_norm": 0.8695642948150635, "learning_rate": 0.0004967007963594995, "loss": 1.8042, "step": 2216 }, { "epoch": 2.522184300341297, "grad_norm": 1.5711946487426758, "learning_rate": 0.0004964732650739477, "loss": 2.2919, "step": 2217 }, { "epoch": 2.5233219567690557, "grad_norm": 1.2966930866241455, "learning_rate": 0.0004962457337883959, "loss": 2.9262, "step": 2218 }, { "epoch": 2.5244596131968144, "grad_norm": 0.7041467428207397, "learning_rate": 0.0004960182025028441, "loss": 1.2383, "step": 2219 }, { "epoch": 2.5255972696245736, "grad_norm": 0.8301080465316772, "learning_rate": 0.0004957906712172923, "loss": 1.9915, "step": 2220 }, { "epoch": 2.526734926052332, "grad_norm": 0.8538893461227417, "learning_rate": 0.0004955631399317406, "loss": 2.2926, "step": 2221 }, { "epoch": 2.527872582480091, "grad_norm": 1.1304670572280884, "learning_rate": 0.0004953356086461889, "loss": 2.2345, "step": 2222 }, { "epoch": 2.52901023890785, "grad_norm": 1.1222370862960815, "learning_rate": 0.0004951080773606372, "loss": 1.9092, "step": 2223 }, { "epoch": 2.5301478953356087, "grad_norm": 1.1593031883239746, "learning_rate": 
0.0004948805460750853, "loss": 2.5001, "step": 2224 }, { "epoch": 2.5312855517633674, "grad_norm": 0.9046427011489868, "learning_rate": 0.0004946530147895336, "loss": 1.8159, "step": 2225 }, { "epoch": 2.532423208191126, "grad_norm": 0.9191851615905762, "learning_rate": 0.0004944254835039818, "loss": 2.3244, "step": 2226 }, { "epoch": 2.533560864618885, "grad_norm": 0.8399704098701477, "learning_rate": 0.00049419795221843, "loss": 2.1157, "step": 2227 }, { "epoch": 2.5346985210466437, "grad_norm": 1.3385767936706543, "learning_rate": 0.0004939704209328783, "loss": 2.495, "step": 2228 }, { "epoch": 2.535836177474403, "grad_norm": 0.9084081053733826, "learning_rate": 0.0004937428896473265, "loss": 1.222, "step": 2229 }, { "epoch": 2.5369738339021617, "grad_norm": 1.0885717868804932, "learning_rate": 0.0004935153583617748, "loss": 2.0583, "step": 2230 }, { "epoch": 2.5381114903299204, "grad_norm": 0.6400699019432068, "learning_rate": 0.000493287827076223, "loss": 0.7095, "step": 2231 }, { "epoch": 2.539249146757679, "grad_norm": 0.8817832469940186, "learning_rate": 0.0004930602957906713, "loss": 2.0175, "step": 2232 }, { "epoch": 2.540386803185438, "grad_norm": 0.5931545495986938, "learning_rate": 0.0004928327645051195, "loss": 1.6168, "step": 2233 }, { "epoch": 2.5415244596131967, "grad_norm": 1.0301717519760132, "learning_rate": 0.0004926052332195677, "loss": 1.5534, "step": 2234 }, { "epoch": 2.5426621160409555, "grad_norm": 0.6240630149841309, "learning_rate": 0.0004923777019340159, "loss": 0.7599, "step": 2235 }, { "epoch": 2.5437997724687147, "grad_norm": 0.7773973345756531, "learning_rate": 0.0004921501706484642, "loss": 2.37, "step": 2236 }, { "epoch": 2.544937428896473, "grad_norm": 0.7759304046630859, "learning_rate": 0.0004919226393629124, "loss": 1.0116, "step": 2237 }, { "epoch": 2.546075085324232, "grad_norm": 1.2322027683258057, "learning_rate": 0.0004916951080773606, "loss": 3.2444, "step": 2238 }, { "epoch": 2.547212741751991, "grad_norm": 
0.8914145231246948, "learning_rate": 0.0004914675767918089, "loss": 1.5857, "step": 2239 }, { "epoch": 2.5483503981797497, "grad_norm": 0.9285094738006592, "learning_rate": 0.0004912400455062571, "loss": 1.8327, "step": 2240 }, { "epoch": 2.5494880546075085, "grad_norm": 1.1583962440490723, "learning_rate": 0.0004910125142207054, "loss": 2.0139, "step": 2241 }, { "epoch": 2.5506257110352673, "grad_norm": 1.8207440376281738, "learning_rate": 0.0004907849829351536, "loss": 1.8943, "step": 2242 }, { "epoch": 2.551763367463026, "grad_norm": 1.2174469232559204, "learning_rate": 0.0004905574516496018, "loss": 2.1719, "step": 2243 }, { "epoch": 2.5529010238907848, "grad_norm": 1.4703840017318726, "learning_rate": 0.00049032992036405, "loss": 3.2219, "step": 2244 }, { "epoch": 2.554038680318544, "grad_norm": 0.9278658628463745, "learning_rate": 0.0004901023890784983, "loss": 1.4814, "step": 2245 }, { "epoch": 2.5551763367463027, "grad_norm": 0.5682125687599182, "learning_rate": 0.0004898748577929465, "loss": 0.9685, "step": 2246 }, { "epoch": 2.5563139931740615, "grad_norm": 0.5616304278373718, "learning_rate": 0.0004896473265073948, "loss": 1.0696, "step": 2247 }, { "epoch": 2.5574516496018203, "grad_norm": 0.9043198227882385, "learning_rate": 0.000489419795221843, "loss": 1.2706, "step": 2248 }, { "epoch": 2.558589306029579, "grad_norm": 1.1037359237670898, "learning_rate": 0.0004891922639362913, "loss": 1.8091, "step": 2249 }, { "epoch": 2.5597269624573378, "grad_norm": 1.0690373182296753, "learning_rate": 0.0004889647326507395, "loss": 1.6337, "step": 2250 }, { "epoch": 2.5608646188850965, "grad_norm": 0.9125576019287109, "learning_rate": 0.0004887372013651877, "loss": 2.0998, "step": 2251 }, { "epoch": 2.5620022753128557, "grad_norm": 1.3516008853912354, "learning_rate": 0.000488509670079636, "loss": 2.317, "step": 2252 }, { "epoch": 2.5631399317406145, "grad_norm": 1.6707299947738647, "learning_rate": 0.0004882821387940841, "loss": 2.8554, "step": 2253 }, { "epoch": 
2.5642775881683733, "grad_norm": 0.7115156054496765, "learning_rate": 0.00048805460750853244, "loss": 0.7969, "step": 2254 }, { "epoch": 2.565415244596132, "grad_norm": 0.7895395755767822, "learning_rate": 0.00048782707622298065, "loss": 1.3132, "step": 2255 }, { "epoch": 2.5665529010238908, "grad_norm": 1.3991349935531616, "learning_rate": 0.0004875995449374289, "loss": 2.6118, "step": 2256 }, { "epoch": 2.5676905574516495, "grad_norm": 1.0328587293624878, "learning_rate": 0.0004873720136518772, "loss": 1.8474, "step": 2257 }, { "epoch": 2.5688282138794083, "grad_norm": 0.8418103456497192, "learning_rate": 0.00048714448236632533, "loss": 1.9257, "step": 2258 }, { "epoch": 2.5699658703071675, "grad_norm": 1.1641095876693726, "learning_rate": 0.0004869169510807736, "loss": 1.6591, "step": 2259 }, { "epoch": 2.571103526734926, "grad_norm": 1.1286218166351318, "learning_rate": 0.00048668941979522186, "loss": 1.7781, "step": 2260 }, { "epoch": 2.572241183162685, "grad_norm": 0.9085434675216675, "learning_rate": 0.0004864618885096701, "loss": 1.7569, "step": 2261 }, { "epoch": 2.573378839590444, "grad_norm": 0.6849757432937622, "learning_rate": 0.00048623435722411833, "loss": 1.2772, "step": 2262 }, { "epoch": 2.5745164960182025, "grad_norm": 0.6630122661590576, "learning_rate": 0.00048600682593856654, "loss": 1.5381, "step": 2263 }, { "epoch": 2.5756541524459613, "grad_norm": 0.7607198357582092, "learning_rate": 0.0004857792946530148, "loss": 1.3695, "step": 2264 }, { "epoch": 2.57679180887372, "grad_norm": 1.6203835010528564, "learning_rate": 0.000485551763367463, "loss": 2.5078, "step": 2265 }, { "epoch": 2.577929465301479, "grad_norm": 1.9112815856933594, "learning_rate": 0.0004853242320819113, "loss": 1.0513, "step": 2266 }, { "epoch": 2.5790671217292376, "grad_norm": 1.3533493280410767, "learning_rate": 0.00048509670079635955, "loss": 3.0494, "step": 2267 }, { "epoch": 2.580204778156997, "grad_norm": 0.7832985520362854, "learning_rate": 0.0004848691695108077, 
"loss": 1.0865, "step": 2268 }, { "epoch": 2.5813424345847555, "grad_norm": 0.8799665570259094, "learning_rate": 0.00048464163822525597, "loss": 2.473, "step": 2269 }, { "epoch": 2.5824800910125143, "grad_norm": 1.135883092880249, "learning_rate": 0.00048441410693970423, "loss": 2.009, "step": 2270 }, { "epoch": 2.583617747440273, "grad_norm": 0.740875780582428, "learning_rate": 0.0004841865756541525, "loss": 1.5996, "step": 2271 }, { "epoch": 2.584755403868032, "grad_norm": 1.1426676511764526, "learning_rate": 0.00048395904436860065, "loss": 1.7545, "step": 2272 }, { "epoch": 2.5858930602957906, "grad_norm": 1.1677348613739014, "learning_rate": 0.0004837315130830489, "loss": 3.6223, "step": 2273 }, { "epoch": 2.5870307167235493, "grad_norm": 1.1222909688949585, "learning_rate": 0.0004835039817974972, "loss": 2.0058, "step": 2274 }, { "epoch": 2.5881683731513085, "grad_norm": 0.7786708474159241, "learning_rate": 0.0004832764505119454, "loss": 1.3458, "step": 2275 }, { "epoch": 2.589306029579067, "grad_norm": 0.6702552437782288, "learning_rate": 0.00048304891922639365, "loss": 1.3789, "step": 2276 }, { "epoch": 2.590443686006826, "grad_norm": 1.0909732580184937, "learning_rate": 0.00048282138794084186, "loss": 2.0698, "step": 2277 }, { "epoch": 2.591581342434585, "grad_norm": 0.9485477209091187, "learning_rate": 0.00048259385665529007, "loss": 1.8122, "step": 2278 }, { "epoch": 2.5927189988623436, "grad_norm": 1.065976619720459, "learning_rate": 0.00048236632536973834, "loss": 2.2232, "step": 2279 }, { "epoch": 2.5938566552901023, "grad_norm": 0.547852635383606, "learning_rate": 0.0004821387940841866, "loss": 1.5199, "step": 2280 }, { "epoch": 2.594994311717861, "grad_norm": 0.7793285250663757, "learning_rate": 0.00048191126279863486, "loss": 1.6383, "step": 2281 }, { "epoch": 2.59613196814562, "grad_norm": 1.2793906927108765, "learning_rate": 0.000481683731513083, "loss": 1.7647, "step": 2282 }, { "epoch": 2.5972696245733786, "grad_norm": 0.7028161883354187, 
"learning_rate": 0.0004814562002275313, "loss": 0.9584, "step": 2283 }, { "epoch": 2.598407281001138, "grad_norm": 0.725344717502594, "learning_rate": 0.00048122866894197955, "loss": 1.2296, "step": 2284 }, { "epoch": 2.5995449374288966, "grad_norm": 0.9674602746963501, "learning_rate": 0.00048100113765642776, "loss": 2.275, "step": 2285 }, { "epoch": 2.6006825938566553, "grad_norm": 0.7748560905456543, "learning_rate": 0.000480773606370876, "loss": 1.7323, "step": 2286 }, { "epoch": 2.601820250284414, "grad_norm": 1.1546324491500854, "learning_rate": 0.00048054607508532423, "loss": 2.054, "step": 2287 }, { "epoch": 2.602957906712173, "grad_norm": 0.7889047861099243, "learning_rate": 0.00048031854379977244, "loss": 1.787, "step": 2288 }, { "epoch": 2.6040955631399316, "grad_norm": 1.0750888586044312, "learning_rate": 0.0004800910125142207, "loss": 2.1864, "step": 2289 }, { "epoch": 2.6052332195676904, "grad_norm": 0.9969229698181152, "learning_rate": 0.00047986348122866897, "loss": 1.7515, "step": 2290 }, { "epoch": 2.6063708759954496, "grad_norm": 0.8853392004966736, "learning_rate": 0.00047963594994311723, "loss": 1.1181, "step": 2291 }, { "epoch": 2.6075085324232083, "grad_norm": 0.7410064935684204, "learning_rate": 0.0004794084186575654, "loss": 1.8175, "step": 2292 }, { "epoch": 2.608646188850967, "grad_norm": 1.1709070205688477, "learning_rate": 0.00047918088737201365, "loss": 1.8607, "step": 2293 }, { "epoch": 2.609783845278726, "grad_norm": 1.0198490619659424, "learning_rate": 0.0004789533560864619, "loss": 1.6995, "step": 2294 }, { "epoch": 2.6109215017064846, "grad_norm": 1.765076994895935, "learning_rate": 0.00047872582480091013, "loss": 3.519, "step": 2295 }, { "epoch": 2.6120591581342434, "grad_norm": 1.3096511363983154, "learning_rate": 0.0004784982935153584, "loss": 2.7398, "step": 2296 }, { "epoch": 2.613196814562002, "grad_norm": 2.6314849853515625, "learning_rate": 0.0004782707622298066, "loss": 5.0637, "step": 2297 }, { "epoch": 
2.6143344709897613, "grad_norm": 0.6920953392982483, "learning_rate": 0.0004780432309442548, "loss": 1.1274, "step": 2298 }, { "epoch": 2.6154721274175197, "grad_norm": 1.1631900072097778, "learning_rate": 0.0004778156996587031, "loss": 1.7851, "step": 2299 }, { "epoch": 2.616609783845279, "grad_norm": 0.7960054874420166, "learning_rate": 0.00047758816837315134, "loss": 1.2316, "step": 2300 }, { "epoch": 2.6177474402730376, "grad_norm": 1.066870927810669, "learning_rate": 0.0004773606370875996, "loss": 2.3944, "step": 2301 }, { "epoch": 2.6188850967007964, "grad_norm": 0.7467948198318481, "learning_rate": 0.00047713310580204776, "loss": 1.1561, "step": 2302 }, { "epoch": 2.620022753128555, "grad_norm": 0.7254666686058044, "learning_rate": 0.000476905574516496, "loss": 1.0797, "step": 2303 }, { "epoch": 2.621160409556314, "grad_norm": 0.6046991944313049, "learning_rate": 0.0004766780432309443, "loss": 0.9486, "step": 2304 }, { "epoch": 2.6222980659840727, "grad_norm": 1.0409255027770996, "learning_rate": 0.0004764505119453925, "loss": 1.9647, "step": 2305 }, { "epoch": 2.6234357224118314, "grad_norm": 1.2402981519699097, "learning_rate": 0.00047622298065984076, "loss": 2.1702, "step": 2306 }, { "epoch": 2.6245733788395906, "grad_norm": 1.274269700050354, "learning_rate": 0.00047599544937428897, "loss": 2.9181, "step": 2307 }, { "epoch": 2.6257110352673494, "grad_norm": 0.724886953830719, "learning_rate": 0.0004757679180887372, "loss": 1.357, "step": 2308 }, { "epoch": 2.626848691695108, "grad_norm": 0.9972879886627197, "learning_rate": 0.00047554038680318545, "loss": 2.0593, "step": 2309 }, { "epoch": 2.627986348122867, "grad_norm": 0.8259227275848389, "learning_rate": 0.0004753128555176337, "loss": 1.7083, "step": 2310 }, { "epoch": 2.6291240045506257, "grad_norm": 1.0254158973693848, "learning_rate": 0.0004750853242320819, "loss": 2.4944, "step": 2311 }, { "epoch": 2.6302616609783844, "grad_norm": 0.8054444193840027, "learning_rate": 0.00047485779294653013, 
"loss": 1.1276, "step": 2312 }, { "epoch": 2.631399317406143, "grad_norm": 0.7392967939376831, "learning_rate": 0.0004746302616609784, "loss": 1.3377, "step": 2313 }, { "epoch": 2.6325369738339024, "grad_norm": 0.9780520796775818, "learning_rate": 0.00047440273037542666, "loss": 2.8929, "step": 2314 }, { "epoch": 2.6336746302616607, "grad_norm": 1.0950572490692139, "learning_rate": 0.00047417519908987487, "loss": 2.1541, "step": 2315 }, { "epoch": 2.63481228668942, "grad_norm": 0.9937504529953003, "learning_rate": 0.0004739476678043231, "loss": 1.8749, "step": 2316 }, { "epoch": 2.6359499431171787, "grad_norm": 0.6954947710037231, "learning_rate": 0.00047372013651877134, "loss": 1.8055, "step": 2317 }, { "epoch": 2.6370875995449374, "grad_norm": 0.8226844668388367, "learning_rate": 0.00047349260523321955, "loss": 1.5377, "step": 2318 }, { "epoch": 2.638225255972696, "grad_norm": 1.9866377115249634, "learning_rate": 0.0004732650739476678, "loss": 4.2124, "step": 2319 }, { "epoch": 2.639362912400455, "grad_norm": 1.1584763526916504, "learning_rate": 0.0004730375426621161, "loss": 1.5185, "step": 2320 }, { "epoch": 2.640500568828214, "grad_norm": 0.9889481067657471, "learning_rate": 0.0004728100113765643, "loss": 2.2838, "step": 2321 }, { "epoch": 2.6416382252559725, "grad_norm": 0.9309085011482239, "learning_rate": 0.0004725824800910125, "loss": 1.152, "step": 2322 }, { "epoch": 2.6427758816837317, "grad_norm": 1.0950933694839478, "learning_rate": 0.00047235494880546076, "loss": 2.3803, "step": 2323 }, { "epoch": 2.6439135381114904, "grad_norm": 0.8817667365074158, "learning_rate": 0.00047212741751990903, "loss": 1.4906, "step": 2324 }, { "epoch": 2.645051194539249, "grad_norm": 0.8397241830825806, "learning_rate": 0.00047189988623435724, "loss": 1.8036, "step": 2325 }, { "epoch": 2.646188850967008, "grad_norm": 1.4048362970352173, "learning_rate": 0.00047167235494880545, "loss": 2.0251, "step": 2326 }, { "epoch": 2.6473265073947667, "grad_norm": 1.4513121843338013, 
"learning_rate": 0.0004714448236632537, "loss": 1.8078, "step": 2327 }, { "epoch": 2.6484641638225255, "grad_norm": 0.9570140838623047, "learning_rate": 0.0004712172923777019, "loss": 1.8617, "step": 2328 }, { "epoch": 2.6496018202502842, "grad_norm": 0.8819069862365723, "learning_rate": 0.0004709897610921502, "loss": 1.8745, "step": 2329 }, { "epoch": 2.6507394766780434, "grad_norm": 1.094341516494751, "learning_rate": 0.00047076222980659845, "loss": 1.788, "step": 2330 }, { "epoch": 2.651877133105802, "grad_norm": 0.832409143447876, "learning_rate": 0.00047053469852104666, "loss": 2.2568, "step": 2331 }, { "epoch": 2.653014789533561, "grad_norm": 0.9308044910430908, "learning_rate": 0.00047030716723549487, "loss": 2.3719, "step": 2332 }, { "epoch": 2.6541524459613197, "grad_norm": 1.2896952629089355, "learning_rate": 0.00047007963594994313, "loss": 3.7022, "step": 2333 }, { "epoch": 2.6552901023890785, "grad_norm": 1.0395236015319824, "learning_rate": 0.0004698521046643914, "loss": 2.8605, "step": 2334 }, { "epoch": 2.6564277588168372, "grad_norm": 0.8033049702644348, "learning_rate": 0.0004696245733788396, "loss": 1.5591, "step": 2335 }, { "epoch": 2.657565415244596, "grad_norm": 0.798548698425293, "learning_rate": 0.0004693970420932878, "loss": 1.158, "step": 2336 }, { "epoch": 2.658703071672355, "grad_norm": 0.7860295176506042, "learning_rate": 0.0004691695108077361, "loss": 2.112, "step": 2337 }, { "epoch": 2.6598407281001135, "grad_norm": 0.8449381589889526, "learning_rate": 0.0004689419795221843, "loss": 1.5287, "step": 2338 }, { "epoch": 2.6609783845278727, "grad_norm": 1.0243216753005981, "learning_rate": 0.00046871444823663256, "loss": 1.6321, "step": 2339 }, { "epoch": 2.6621160409556315, "grad_norm": 0.9676476716995239, "learning_rate": 0.0004684869169510808, "loss": 2.4557, "step": 2340 }, { "epoch": 2.6632536973833902, "grad_norm": 0.9129576683044434, "learning_rate": 0.000468259385665529, "loss": 1.8287, "step": 2341 }, { "epoch": 2.664391353811149, 
"grad_norm": 1.1413075923919678, "learning_rate": 0.00046803185437997724, "loss": 2.0987, "step": 2342 }, { "epoch": 2.6655290102389078, "grad_norm": 0.7088634967803955, "learning_rate": 0.0004678043230944255, "loss": 1.6769, "step": 2343 }, { "epoch": 2.6666666666666665, "grad_norm": 0.8204455971717834, "learning_rate": 0.00046757679180887377, "loss": 2.0439, "step": 2344 }, { "epoch": 2.6678043230944253, "grad_norm": 0.7846645712852478, "learning_rate": 0.0004673492605233219, "loss": 1.4144, "step": 2345 }, { "epoch": 2.6689419795221845, "grad_norm": 0.8308724761009216, "learning_rate": 0.0004671217292377702, "loss": 1.2093, "step": 2346 }, { "epoch": 2.6700796359499432, "grad_norm": 1.0567412376403809, "learning_rate": 0.00046689419795221845, "loss": 1.8646, "step": 2347 }, { "epoch": 2.671217292377702, "grad_norm": 0.986167311668396, "learning_rate": 0.00046666666666666666, "loss": 2.4904, "step": 2348 }, { "epoch": 2.6723549488054608, "grad_norm": 1.8298397064208984, "learning_rate": 0.0004664391353811149, "loss": 2.9794, "step": 2349 }, { "epoch": 2.6734926052332195, "grad_norm": 0.6814413070678711, "learning_rate": 0.00046621160409556314, "loss": 1.7104, "step": 2350 }, { "epoch": 2.6746302616609783, "grad_norm": 1.0616414546966553, "learning_rate": 0.00046598407281001135, "loss": 2.269, "step": 2351 }, { "epoch": 2.675767918088737, "grad_norm": 1.0702776908874512, "learning_rate": 0.0004657565415244596, "loss": 1.9667, "step": 2352 }, { "epoch": 2.6769055745164962, "grad_norm": 1.4715807437896729, "learning_rate": 0.0004655290102389079, "loss": 3.1964, "step": 2353 }, { "epoch": 2.6780432309442546, "grad_norm": 0.9601611495018005, "learning_rate": 0.00046530147895335614, "loss": 2.5208, "step": 2354 }, { "epoch": 2.6791808873720138, "grad_norm": 1.083862543106079, "learning_rate": 0.0004650739476678043, "loss": 1.8693, "step": 2355 }, { "epoch": 2.6803185437997725, "grad_norm": 1.272933006286621, "learning_rate": 0.00046484641638225256, "loss": 3.148, 
"step": 2356 }, { "epoch": 2.6814562002275313, "grad_norm": 1.0518128871917725, "learning_rate": 0.0004646188850967008, "loss": 1.2337, "step": 2357 }, { "epoch": 2.68259385665529, "grad_norm": 0.9924670457839966, "learning_rate": 0.00046439135381114903, "loss": 1.4861, "step": 2358 }, { "epoch": 2.683731513083049, "grad_norm": 1.0956393480300903, "learning_rate": 0.0004641638225255973, "loss": 1.7977, "step": 2359 }, { "epoch": 2.684869169510808, "grad_norm": 1.3490567207336426, "learning_rate": 0.0004639362912400455, "loss": 2.5556, "step": 2360 }, { "epoch": 2.6860068259385663, "grad_norm": 0.8191376328468323, "learning_rate": 0.0004637087599544937, "loss": 1.7643, "step": 2361 }, { "epoch": 2.6871444823663255, "grad_norm": 0.9714041948318481, "learning_rate": 0.000463481228668942, "loss": 1.9565, "step": 2362 }, { "epoch": 2.6882821387940843, "grad_norm": 1.045387625694275, "learning_rate": 0.00046325369738339024, "loss": 2.746, "step": 2363 }, { "epoch": 2.689419795221843, "grad_norm": 0.934880793094635, "learning_rate": 0.0004630261660978385, "loss": 2.6093, "step": 2364 }, { "epoch": 2.690557451649602, "grad_norm": 0.7764321565628052, "learning_rate": 0.00046279863481228666, "loss": 1.0081, "step": 2365 }, { "epoch": 2.6916951080773606, "grad_norm": 0.7609637379646301, "learning_rate": 0.00046257110352673493, "loss": 1.7661, "step": 2366 }, { "epoch": 2.6928327645051193, "grad_norm": 0.8447152376174927, "learning_rate": 0.0004623435722411832, "loss": 1.5554, "step": 2367 }, { "epoch": 2.693970420932878, "grad_norm": 0.76449054479599, "learning_rate": 0.0004621160409556314, "loss": 1.6179, "step": 2368 }, { "epoch": 2.6951080773606373, "grad_norm": 0.7846193313598633, "learning_rate": 0.00046188850967007967, "loss": 1.0255, "step": 2369 }, { "epoch": 2.696245733788396, "grad_norm": 0.8696131706237793, "learning_rate": 0.0004616609783845279, "loss": 2.0406, "step": 2370 }, { "epoch": 2.697383390216155, "grad_norm": 0.9892042279243469, "learning_rate": 
0.0004614334470989761, "loss": 2.7178, "step": 2371 }, { "epoch": 2.6985210466439136, "grad_norm": 1.1036131381988525, "learning_rate": 0.00046120591581342435, "loss": 1.5833, "step": 2372 }, { "epoch": 2.6996587030716723, "grad_norm": 0.9095990061759949, "learning_rate": 0.0004609783845278726, "loss": 2.3233, "step": 2373 }, { "epoch": 2.700796359499431, "grad_norm": 1.0550446510314941, "learning_rate": 0.0004607508532423209, "loss": 1.5132, "step": 2374 }, { "epoch": 2.70193401592719, "grad_norm": 0.984180748462677, "learning_rate": 0.00046052332195676903, "loss": 2.2976, "step": 2375 }, { "epoch": 2.703071672354949, "grad_norm": 0.9732262492179871, "learning_rate": 0.0004602957906712173, "loss": 1.767, "step": 2376 }, { "epoch": 2.7042093287827074, "grad_norm": 0.858201265335083, "learning_rate": 0.00046006825938566556, "loss": 1.3972, "step": 2377 }, { "epoch": 2.7053469852104666, "grad_norm": 0.9151699542999268, "learning_rate": 0.00045984072810011377, "loss": 1.821, "step": 2378 }, { "epoch": 2.7064846416382253, "grad_norm": 1.1489654779434204, "learning_rate": 0.000459613196814562, "loss": 1.3073, "step": 2379 }, { "epoch": 2.707622298065984, "grad_norm": 1.2800509929656982, "learning_rate": 0.00045938566552901025, "loss": 1.9937, "step": 2380 }, { "epoch": 2.708759954493743, "grad_norm": 0.6855766177177429, "learning_rate": 0.00045915813424345846, "loss": 0.7626, "step": 2381 }, { "epoch": 2.7098976109215016, "grad_norm": 0.8592435717582703, "learning_rate": 0.0004589306029579067, "loss": 1.4657, "step": 2382 }, { "epoch": 2.7110352673492604, "grad_norm": 0.6695002913475037, "learning_rate": 0.000458703071672355, "loss": 0.964, "step": 2383 }, { "epoch": 2.712172923777019, "grad_norm": 0.6979312300682068, "learning_rate": 0.0004584755403868032, "loss": 1.4597, "step": 2384 }, { "epoch": 2.7133105802047783, "grad_norm": 1.0145376920700073, "learning_rate": 0.0004582480091012514, "loss": 2.5069, "step": 2385 }, { "epoch": 2.714448236632537, "grad_norm": 
0.6209183931350708, "learning_rate": 0.00045802047781569967, "loss": 1.0564, "step": 2386 }, { "epoch": 2.715585893060296, "grad_norm": 0.9510672688484192, "learning_rate": 0.00045779294653014793, "loss": 1.5424, "step": 2387 }, { "epoch": 2.7167235494880546, "grad_norm": 0.911859393119812, "learning_rate": 0.00045756541524459614, "loss": 1.6315, "step": 2388 }, { "epoch": 2.7178612059158134, "grad_norm": 0.7897265553474426, "learning_rate": 0.00045733788395904435, "loss": 1.3671, "step": 2389 }, { "epoch": 2.718998862343572, "grad_norm": 0.8800843358039856, "learning_rate": 0.0004571103526734926, "loss": 2.1943, "step": 2390 }, { "epoch": 2.720136518771331, "grad_norm": 1.3472900390625, "learning_rate": 0.0004568828213879408, "loss": 2.8666, "step": 2391 }, { "epoch": 2.72127417519909, "grad_norm": 1.0880565643310547, "learning_rate": 0.0004566552901023891, "loss": 1.939, "step": 2392 }, { "epoch": 2.722411831626849, "grad_norm": 0.8288098573684692, "learning_rate": 0.00045642775881683735, "loss": 2.334, "step": 2393 }, { "epoch": 2.7235494880546076, "grad_norm": 1.0421987771987915, "learning_rate": 0.00045620022753128556, "loss": 2.0377, "step": 2394 }, { "epoch": 2.7246871444823664, "grad_norm": 1.3530818223953247, "learning_rate": 0.0004559726962457338, "loss": 3.6316, "step": 2395 }, { "epoch": 2.725824800910125, "grad_norm": 0.9477719068527222, "learning_rate": 0.00045574516496018204, "loss": 2.0223, "step": 2396 }, { "epoch": 2.726962457337884, "grad_norm": 1.1411749124526978, "learning_rate": 0.0004555176336746303, "loss": 2.1665, "step": 2397 }, { "epoch": 2.7281001137656427, "grad_norm": 1.3905211687088013, "learning_rate": 0.0004552901023890785, "loss": 1.8378, "step": 2398 }, { "epoch": 2.729237770193402, "grad_norm": 0.7103641629219055, "learning_rate": 0.0004550625711035267, "loss": 1.6213, "step": 2399 }, { "epoch": 2.73037542662116, "grad_norm": 0.7716681361198425, "learning_rate": 0.000454835039817975, "loss": 1.0399, "step": 2400 }, { "epoch": 
2.7315130830489194, "grad_norm": 0.6330393552780151, "learning_rate": 0.0004546075085324232, "loss": 1.2756, "step": 2401 }, { "epoch": 2.732650739476678, "grad_norm": 1.137176513671875, "learning_rate": 0.00045437997724687146, "loss": 3.3, "step": 2402 }, { "epoch": 2.733788395904437, "grad_norm": 0.9734787940979004, "learning_rate": 0.0004541524459613197, "loss": 1.4292, "step": 2403 }, { "epoch": 2.7349260523321957, "grad_norm": 0.896617591381073, "learning_rate": 0.00045392491467576793, "loss": 0.7397, "step": 2404 }, { "epoch": 2.7360637087599544, "grad_norm": 1.0114847421646118, "learning_rate": 0.00045369738339021614, "loss": 1.7089, "step": 2405 }, { "epoch": 2.737201365187713, "grad_norm": 0.9761648774147034, "learning_rate": 0.0004534698521046644, "loss": 1.9788, "step": 2406 }, { "epoch": 2.738339021615472, "grad_norm": 1.1160939931869507, "learning_rate": 0.00045324232081911267, "loss": 2.4602, "step": 2407 }, { "epoch": 2.739476678043231, "grad_norm": 0.9416034817695618, "learning_rate": 0.0004530147895335609, "loss": 1.6489, "step": 2408 }, { "epoch": 2.74061433447099, "grad_norm": 0.9150708913803101, "learning_rate": 0.0004527872582480091, "loss": 1.4331, "step": 2409 }, { "epoch": 2.7417519908987487, "grad_norm": 1.14388108253479, "learning_rate": 0.00045255972696245736, "loss": 1.9662, "step": 2410 }, { "epoch": 2.7428896473265074, "grad_norm": 0.8412722945213318, "learning_rate": 0.00045233219567690557, "loss": 1.2538, "step": 2411 }, { "epoch": 2.744027303754266, "grad_norm": 1.3321762084960938, "learning_rate": 0.00045210466439135383, "loss": 1.693, "step": 2412 }, { "epoch": 2.745164960182025, "grad_norm": 0.9153639674186707, "learning_rate": 0.00045187713310580204, "loss": 1.625, "step": 2413 }, { "epoch": 2.7463026166097837, "grad_norm": 0.8170965313911438, "learning_rate": 0.0004516496018202503, "loss": 1.6253, "step": 2414 }, { "epoch": 2.747440273037543, "grad_norm": 0.7869139909744263, "learning_rate": 0.0004514220705346985, "loss": 
1.4511, "step": 2415 }, { "epoch": 2.748577929465301, "grad_norm": 1.0545377731323242, "learning_rate": 0.0004511945392491468, "loss": 1.6514, "step": 2416 }, { "epoch": 2.7497155858930604, "grad_norm": 1.045305848121643, "learning_rate": 0.00045096700796359504, "loss": 1.2496, "step": 2417 }, { "epoch": 2.750853242320819, "grad_norm": 1.186232566833496, "learning_rate": 0.0004507394766780432, "loss": 2.2593, "step": 2418 }, { "epoch": 2.751990898748578, "grad_norm": 0.8955073356628418, "learning_rate": 0.00045051194539249146, "loss": 1.1633, "step": 2419 }, { "epoch": 2.7531285551763367, "grad_norm": 1.0408909320831299, "learning_rate": 0.0004502844141069397, "loss": 1.6361, "step": 2420 }, { "epoch": 2.7542662116040955, "grad_norm": 1.312453031539917, "learning_rate": 0.00045005688282138794, "loss": 1.7388, "step": 2421 }, { "epoch": 2.755403868031854, "grad_norm": 1.0657882690429688, "learning_rate": 0.0004498293515358362, "loss": 2.0169, "step": 2422 }, { "epoch": 2.756541524459613, "grad_norm": 1.0726020336151123, "learning_rate": 0.0004496018202502844, "loss": 1.5911, "step": 2423 }, { "epoch": 2.757679180887372, "grad_norm": 0.9591920375823975, "learning_rate": 0.0004493742889647327, "loss": 1.6334, "step": 2424 }, { "epoch": 2.758816837315131, "grad_norm": 1.259514570236206, "learning_rate": 0.0004491467576791809, "loss": 2.5346, "step": 2425 }, { "epoch": 2.7599544937428897, "grad_norm": 0.8274584412574768, "learning_rate": 0.00044891922639362915, "loss": 1.5429, "step": 2426 }, { "epoch": 2.7610921501706485, "grad_norm": 1.121607780456543, "learning_rate": 0.0004486916951080774, "loss": 1.9823, "step": 2427 }, { "epoch": 2.7622298065984072, "grad_norm": 1.2291669845581055, "learning_rate": 0.00044846416382252557, "loss": 1.9587, "step": 2428 }, { "epoch": 2.763367463026166, "grad_norm": 0.8511943817138672, "learning_rate": 0.00044823663253697383, "loss": 1.1893, "step": 2429 }, { "epoch": 2.7645051194539247, "grad_norm": 0.9760825037956238, 
"learning_rate": 0.0004480091012514221, "loss": 1.1563, "step": 2430 }, { "epoch": 2.765642775881684, "grad_norm": 0.7539849877357483, "learning_rate": 0.0004477815699658703, "loss": 1.6624, "step": 2431 }, { "epoch": 2.7667804323094427, "grad_norm": 1.0250943899154663, "learning_rate": 0.00044755403868031857, "loss": 1.4936, "step": 2432 }, { "epoch": 2.7679180887372015, "grad_norm": 0.6538174152374268, "learning_rate": 0.0004473265073947668, "loss": 1.3009, "step": 2433 }, { "epoch": 2.7690557451649602, "grad_norm": 0.6914688944816589, "learning_rate": 0.00044709897610921504, "loss": 1.0585, "step": 2434 }, { "epoch": 2.770193401592719, "grad_norm": 0.9097804427146912, "learning_rate": 0.00044687144482366325, "loss": 1.5146, "step": 2435 }, { "epoch": 2.7713310580204777, "grad_norm": 1.8249222040176392, "learning_rate": 0.0004466439135381115, "loss": 4.1139, "step": 2436 }, { "epoch": 2.7724687144482365, "grad_norm": 0.8782042860984802, "learning_rate": 0.0004464163822525598, "loss": 1.4278, "step": 2437 }, { "epoch": 2.7736063708759957, "grad_norm": 0.8963881134986877, "learning_rate": 0.00044618885096700794, "loss": 0.8318, "step": 2438 }, { "epoch": 2.774744027303754, "grad_norm": 0.768720805644989, "learning_rate": 0.0004459613196814562, "loss": 1.2816, "step": 2439 }, { "epoch": 2.7758816837315132, "grad_norm": 1.2364832162857056, "learning_rate": 0.00044573378839590447, "loss": 2.7189, "step": 2440 }, { "epoch": 2.777019340159272, "grad_norm": 0.8980220556259155, "learning_rate": 0.0004455062571103527, "loss": 2.1147, "step": 2441 }, { "epoch": 2.7781569965870307, "grad_norm": 1.1556415557861328, "learning_rate": 0.00044527872582480094, "loss": 2.5734, "step": 2442 }, { "epoch": 2.7792946530147895, "grad_norm": 1.0572619438171387, "learning_rate": 0.00044505119453924915, "loss": 1.7883, "step": 2443 }, { "epoch": 2.7804323094425483, "grad_norm": 1.2387428283691406, "learning_rate": 0.00044482366325369736, "loss": 1.9794, "step": 2444 }, { "epoch": 
2.781569965870307, "grad_norm": 0.6518829464912415, "learning_rate": 0.0004445961319681456, "loss": 1.567, "step": 2445 }, { "epoch": 2.782707622298066, "grad_norm": 0.9689300060272217, "learning_rate": 0.0004443686006825939, "loss": 1.8925, "step": 2446 }, { "epoch": 2.783845278725825, "grad_norm": 1.2153396606445312, "learning_rate": 0.00044414106939704215, "loss": 2.3077, "step": 2447 }, { "epoch": 2.7849829351535837, "grad_norm": 0.9673851728439331, "learning_rate": 0.0004439135381114903, "loss": 1.3106, "step": 2448 }, { "epoch": 2.7861205915813425, "grad_norm": 1.2174962759017944, "learning_rate": 0.00044368600682593857, "loss": 2.8822, "step": 2449 }, { "epoch": 2.7872582480091013, "grad_norm": 1.0069944858551025, "learning_rate": 0.00044345847554038684, "loss": 1.4753, "step": 2450 }, { "epoch": 2.78839590443686, "grad_norm": 1.794924020767212, "learning_rate": 0.00044323094425483504, "loss": 3.3568, "step": 2451 }, { "epoch": 2.789533560864619, "grad_norm": 1.3522320985794067, "learning_rate": 0.00044300341296928325, "loss": 2.2246, "step": 2452 }, { "epoch": 2.7906712172923775, "grad_norm": 1.126507043838501, "learning_rate": 0.0004427758816837315, "loss": 2.1909, "step": 2453 }, { "epoch": 2.7918088737201368, "grad_norm": 0.840438961982727, "learning_rate": 0.00044254835039817973, "loss": 1.315, "step": 2454 }, { "epoch": 2.792946530147895, "grad_norm": 7.848939418792725, "learning_rate": 0.000442320819112628, "loss": 2.2406, "step": 2455 }, { "epoch": 2.7940841865756543, "grad_norm": 1.4858578443527222, "learning_rate": 0.00044209328782707626, "loss": 3.2399, "step": 2456 }, { "epoch": 2.795221843003413, "grad_norm": 0.7680813670158386, "learning_rate": 0.00044186575654152447, "loss": 1.4417, "step": 2457 }, { "epoch": 2.796359499431172, "grad_norm": 1.2522087097167969, "learning_rate": 0.0004416382252559727, "loss": 2.4295, "step": 2458 }, { "epoch": 2.7974971558589306, "grad_norm": 1.1698752641677856, "learning_rate": 0.00044141069397042094, "loss": 
1.9101, "step": 2459 }, { "epoch": 2.7986348122866893, "grad_norm": 0.5076554417610168, "learning_rate": 0.0004411831626848692, "loss": 0.9025, "step": 2460 }, { "epoch": 2.799772468714448, "grad_norm": 0.7442627549171448, "learning_rate": 0.0004409556313993174, "loss": 1.3259, "step": 2461 }, { "epoch": 2.800910125142207, "grad_norm": 1.1652323007583618, "learning_rate": 0.0004407281001137656, "loss": 1.3523, "step": 2462 }, { "epoch": 2.802047781569966, "grad_norm": 1.1334046125411987, "learning_rate": 0.0004405005688282139, "loss": 2.4571, "step": 2463 }, { "epoch": 2.803185437997725, "grad_norm": 1.446699857711792, "learning_rate": 0.0004402730375426621, "loss": 3.8358, "step": 2464 }, { "epoch": 2.8043230944254836, "grad_norm": 0.8421689867973328, "learning_rate": 0.00044004550625711036, "loss": 2.1174, "step": 2465 }, { "epoch": 2.8054607508532423, "grad_norm": 1.1578481197357178, "learning_rate": 0.0004398179749715586, "loss": 2.3561, "step": 2466 }, { "epoch": 2.806598407281001, "grad_norm": 1.4153037071228027, "learning_rate": 0.00043959044368600684, "loss": 3.3699, "step": 2467 }, { "epoch": 2.80773606370876, "grad_norm": 0.9994482398033142, "learning_rate": 0.00043936291240045505, "loss": 2.3389, "step": 2468 }, { "epoch": 2.8088737201365186, "grad_norm": 1.0240802764892578, "learning_rate": 0.0004391353811149033, "loss": 1.4459, "step": 2469 }, { "epoch": 2.810011376564278, "grad_norm": 0.505115807056427, "learning_rate": 0.0004389078498293516, "loss": 0.5616, "step": 2470 }, { "epoch": 2.8111490329920366, "grad_norm": 1.041799545288086, "learning_rate": 0.0004386803185437998, "loss": 1.6856, "step": 2471 }, { "epoch": 2.8122866894197953, "grad_norm": 0.7291197776794434, "learning_rate": 0.000438452787258248, "loss": 1.8971, "step": 2472 }, { "epoch": 2.813424345847554, "grad_norm": 0.7052469253540039, "learning_rate": 0.00043822525597269626, "loss": 1.6314, "step": 2473 }, { "epoch": 2.814562002275313, "grad_norm": 1.3250932693481445, "learning_rate": 
0.00043799772468714447, "loss": 1.6973, "step": 2474 }, { "epoch": 2.8156996587030716, "grad_norm": 0.8435229659080505, "learning_rate": 0.00043777019340159273, "loss": 1.387, "step": 2475 }, { "epoch": 2.8168373151308304, "grad_norm": 0.8370683193206787, "learning_rate": 0.000437542662116041, "loss": 1.1863, "step": 2476 }, { "epoch": 2.8179749715585896, "grad_norm": 0.940768301486969, "learning_rate": 0.0004373151308304892, "loss": 1.4706, "step": 2477 }, { "epoch": 2.819112627986348, "grad_norm": 0.8866887092590332, "learning_rate": 0.0004370875995449374, "loss": 2.093, "step": 2478 }, { "epoch": 2.820250284414107, "grad_norm": 0.6502683162689209, "learning_rate": 0.0004368600682593857, "loss": 1.17, "step": 2479 }, { "epoch": 2.821387940841866, "grad_norm": 1.3090890645980835, "learning_rate": 0.00043663253697383394, "loss": 2.7351, "step": 2480 }, { "epoch": 2.8225255972696246, "grad_norm": 0.6929295063018799, "learning_rate": 0.00043640500568828215, "loss": 1.0882, "step": 2481 }, { "epoch": 2.8236632536973834, "grad_norm": 1.296984314918518, "learning_rate": 0.00043617747440273036, "loss": 2.5549, "step": 2482 }, { "epoch": 2.824800910125142, "grad_norm": 1.1640862226486206, "learning_rate": 0.00043594994311717863, "loss": 2.2164, "step": 2483 }, { "epoch": 2.825938566552901, "grad_norm": 0.8431540727615356, "learning_rate": 0.00043572241183162684, "loss": 1.8785, "step": 2484 }, { "epoch": 2.8270762229806596, "grad_norm": 1.0226455926895142, "learning_rate": 0.0004354948805460751, "loss": 2.7847, "step": 2485 }, { "epoch": 2.828213879408419, "grad_norm": 0.9944515824317932, "learning_rate": 0.0004352673492605233, "loss": 3.0653, "step": 2486 }, { "epoch": 2.8293515358361776, "grad_norm": 0.989396333694458, "learning_rate": 0.0004350398179749716, "loss": 1.5644, "step": 2487 }, { "epoch": 2.8304891922639364, "grad_norm": 1.1219333410263062, "learning_rate": 0.0004348122866894198, "loss": 3.0597, "step": 2488 }, { "epoch": 2.831626848691695, "grad_norm": 
0.9987985491752625, "learning_rate": 0.00043458475540386805, "loss": 1.918, "step": 2489 }, { "epoch": 2.832764505119454, "grad_norm": 1.3687235116958618, "learning_rate": 0.0004343572241183163, "loss": 2.0344, "step": 2490 }, { "epoch": 2.8339021615472126, "grad_norm": 0.5967879891395569, "learning_rate": 0.00043412969283276447, "loss": 1.1413, "step": 2491 }, { "epoch": 2.8350398179749714, "grad_norm": 0.7929881811141968, "learning_rate": 0.00043390216154721273, "loss": 1.3015, "step": 2492 }, { "epoch": 2.8361774744027306, "grad_norm": 1.2926918268203735, "learning_rate": 0.000433674630261661, "loss": 2.332, "step": 2493 }, { "epoch": 2.837315130830489, "grad_norm": 0.7639207243919373, "learning_rate": 0.0004334470989761092, "loss": 1.7233, "step": 2494 }, { "epoch": 2.838452787258248, "grad_norm": 0.7680888772010803, "learning_rate": 0.00043321956769055747, "loss": 0.8933, "step": 2495 }, { "epoch": 2.839590443686007, "grad_norm": 0.9917737245559692, "learning_rate": 0.0004329920364050057, "loss": 1.8064, "step": 2496 }, { "epoch": 2.8407281001137656, "grad_norm": 0.8714781403541565, "learning_rate": 0.00043276450511945395, "loss": 1.688, "step": 2497 }, { "epoch": 2.8418657565415244, "grad_norm": 1.283627986907959, "learning_rate": 0.00043253697383390216, "loss": 1.9438, "step": 2498 }, { "epoch": 2.843003412969283, "grad_norm": 0.9072063565254211, "learning_rate": 0.0004323094425483504, "loss": 1.5175, "step": 2499 }, { "epoch": 2.8441410693970424, "grad_norm": 0.9820964336395264, "learning_rate": 0.0004320819112627987, "loss": 3.0533, "step": 2500 }, { "epoch": 2.8452787258248007, "grad_norm": 1.6198686361312866, "learning_rate": 0.00043185437997724684, "loss": 2.8954, "step": 2501 }, { "epoch": 2.84641638225256, "grad_norm": 1.0898877382278442, "learning_rate": 0.0004316268486916951, "loss": 1.7376, "step": 2502 }, { "epoch": 2.8475540386803186, "grad_norm": 0.8086861968040466, "learning_rate": 0.00043139931740614337, "loss": 1.8419, "step": 2503 }, { 
"epoch": 2.8486916951080774, "grad_norm": 0.9705652594566345, "learning_rate": 0.0004311717861205916, "loss": 1.2042, "step": 2504 }, { "epoch": 2.849829351535836, "grad_norm": 0.7838239669799805, "learning_rate": 0.00043094425483503984, "loss": 0.9004, "step": 2505 }, { "epoch": 2.850967007963595, "grad_norm": 3.2648749351501465, "learning_rate": 0.00043071672354948805, "loss": 1.1682, "step": 2506 }, { "epoch": 2.8521046643913537, "grad_norm": 1.287819743156433, "learning_rate": 0.0004304891922639363, "loss": 2.8992, "step": 2507 }, { "epoch": 2.8532423208191124, "grad_norm": 1.7003014087677002, "learning_rate": 0.0004302616609783845, "loss": 1.2319, "step": 2508 }, { "epoch": 2.8543799772468716, "grad_norm": 0.8558884859085083, "learning_rate": 0.0004300341296928328, "loss": 2.2637, "step": 2509 }, { "epoch": 2.8555176336746304, "grad_norm": 0.9409870505332947, "learning_rate": 0.00042980659840728105, "loss": 2.3024, "step": 2510 }, { "epoch": 2.856655290102389, "grad_norm": 0.627686619758606, "learning_rate": 0.0004295790671217292, "loss": 1.1552, "step": 2511 }, { "epoch": 2.857792946530148, "grad_norm": 1.0397640466690063, "learning_rate": 0.0004293515358361775, "loss": 1.6213, "step": 2512 }, { "epoch": 2.8589306029579067, "grad_norm": 1.128299593925476, "learning_rate": 0.00042912400455062574, "loss": 2.0099, "step": 2513 }, { "epoch": 2.8600682593856654, "grad_norm": 1.486132025718689, "learning_rate": 0.00042889647326507395, "loss": 2.9052, "step": 2514 }, { "epoch": 2.861205915813424, "grad_norm": 1.4445245265960693, "learning_rate": 0.0004286689419795222, "loss": 2.075, "step": 2515 }, { "epoch": 2.8623435722411834, "grad_norm": 1.2654812335968018, "learning_rate": 0.0004284414106939704, "loss": 0.5957, "step": 2516 }, { "epoch": 2.8634812286689417, "grad_norm": 0.7981293201446533, "learning_rate": 0.0004282138794084187, "loss": 1.9943, "step": 2517 }, { "epoch": 2.864618885096701, "grad_norm": 0.5515188574790955, "learning_rate": 0.0004279863481228669, 
"loss": 0.8496, "step": 2518 }, { "epoch": 2.8657565415244597, "grad_norm": 1.4269306659698486, "learning_rate": 0.00042775881683731516, "loss": 4.1156, "step": 2519 }, { "epoch": 2.8668941979522184, "grad_norm": 0.7809433341026306, "learning_rate": 0.00042753128555176337, "loss": 1.3538, "step": 2520 }, { "epoch": 2.868031854379977, "grad_norm": 0.7941915988922119, "learning_rate": 0.0004273037542662116, "loss": 1.7591, "step": 2521 }, { "epoch": 2.869169510807736, "grad_norm": 1.2019193172454834, "learning_rate": 0.00042707622298065984, "loss": 2.0824, "step": 2522 }, { "epoch": 2.8703071672354947, "grad_norm": 0.727832555770874, "learning_rate": 0.0004268486916951081, "loss": 1.8108, "step": 2523 }, { "epoch": 2.8714448236632535, "grad_norm": 0.7144151926040649, "learning_rate": 0.0004266211604095563, "loss": 1.6019, "step": 2524 }, { "epoch": 2.8725824800910127, "grad_norm": 1.0367114543914795, "learning_rate": 0.00042639362912400453, "loss": 1.4144, "step": 2525 }, { "epoch": 2.8737201365187715, "grad_norm": 0.9273185729980469, "learning_rate": 0.0004261660978384528, "loss": 1.5532, "step": 2526 }, { "epoch": 2.87485779294653, "grad_norm": 0.8667803406715393, "learning_rate": 0.00042593856655290106, "loss": 2.2255, "step": 2527 }, { "epoch": 2.875995449374289, "grad_norm": 1.1647021770477295, "learning_rate": 0.00042571103526734927, "loss": 1.6445, "step": 2528 }, { "epoch": 2.8771331058020477, "grad_norm": 0.8458011150360107, "learning_rate": 0.00042548350398179753, "loss": 1.6222, "step": 2529 }, { "epoch": 2.8782707622298065, "grad_norm": 0.6948020458221436, "learning_rate": 0.00042525597269624574, "loss": 0.8429, "step": 2530 }, { "epoch": 2.8794084186575652, "grad_norm": 1.5468206405639648, "learning_rate": 0.00042502844141069395, "loss": 2.3449, "step": 2531 }, { "epoch": 2.8805460750853245, "grad_norm": 1.0919640064239502, "learning_rate": 0.0004248009101251422, "loss": 2.7566, "step": 2532 }, { "epoch": 2.881683731513083, "grad_norm": 
0.9144538044929504, "learning_rate": 0.0004245733788395905, "loss": 1.4095, "step": 2533 }, { "epoch": 2.882821387940842, "grad_norm": 0.6683197617530823, "learning_rate": 0.0004243458475540387, "loss": 1.396, "step": 2534 }, { "epoch": 2.8839590443686007, "grad_norm": 1.1698358058929443, "learning_rate": 0.0004241183162684869, "loss": 2.6583, "step": 2535 }, { "epoch": 2.8850967007963595, "grad_norm": 0.9804813265800476, "learning_rate": 0.00042389078498293516, "loss": 1.8973, "step": 2536 }, { "epoch": 2.8862343572241183, "grad_norm": 0.8138146996498108, "learning_rate": 0.0004236632536973834, "loss": 1.2295, "step": 2537 }, { "epoch": 2.887372013651877, "grad_norm": 0.9173576831817627, "learning_rate": 0.00042343572241183164, "loss": 1.904, "step": 2538 }, { "epoch": 2.888509670079636, "grad_norm": 0.9176083207130432, "learning_rate": 0.0004232081911262799, "loss": 3.0666, "step": 2539 }, { "epoch": 2.8896473265073945, "grad_norm": 0.901013195514679, "learning_rate": 0.0004229806598407281, "loss": 2.0642, "step": 2540 }, { "epoch": 2.8907849829351537, "grad_norm": 0.8971734642982483, "learning_rate": 0.0004227531285551763, "loss": 1.586, "step": 2541 }, { "epoch": 2.8919226393629125, "grad_norm": 0.9048395752906799, "learning_rate": 0.0004225255972696246, "loss": 2.0343, "step": 2542 }, { "epoch": 2.8930602957906713, "grad_norm": 1.1656171083450317, "learning_rate": 0.00042229806598407285, "loss": 2.5376, "step": 2543 }, { "epoch": 2.89419795221843, "grad_norm": 0.655851423740387, "learning_rate": 0.00042207053469852106, "loss": 1.4965, "step": 2544 }, { "epoch": 2.8953356086461888, "grad_norm": 0.8759918212890625, "learning_rate": 0.00042184300341296927, "loss": 2.1066, "step": 2545 }, { "epoch": 2.8964732650739475, "grad_norm": 0.8949549794197083, "learning_rate": 0.00042161547212741753, "loss": 1.6719, "step": 2546 }, { "epoch": 2.8976109215017063, "grad_norm": 1.0389626026153564, "learning_rate": 0.00042138794084186574, "loss": 2.4444, "step": 2547 }, { 
"epoch": 2.8987485779294655, "grad_norm": 1.5342673063278198, "learning_rate": 0.000421160409556314, "loss": 2.5499, "step": 2548 }, { "epoch": 2.8998862343572243, "grad_norm": 1.0685155391693115, "learning_rate": 0.00042093287827076227, "loss": 2.8287, "step": 2549 }, { "epoch": 2.901023890784983, "grad_norm": 1.428017020225525, "learning_rate": 0.0004207053469852105, "loss": 3.5992, "step": 2550 }, { "epoch": 2.9021615472127418, "grad_norm": 0.7409979701042175, "learning_rate": 0.0004204778156996587, "loss": 1.7332, "step": 2551 }, { "epoch": 2.9032992036405005, "grad_norm": 1.2421164512634277, "learning_rate": 0.00042025028441410695, "loss": 1.5578, "step": 2552 }, { "epoch": 2.9044368600682593, "grad_norm": 0.9279178380966187, "learning_rate": 0.0004200227531285552, "loss": 1.1623, "step": 2553 }, { "epoch": 2.905574516496018, "grad_norm": 0.8222157955169678, "learning_rate": 0.0004197952218430034, "loss": 1.5473, "step": 2554 }, { "epoch": 2.9067121729237773, "grad_norm": 0.7585321664810181, "learning_rate": 0.00041956769055745164, "loss": 1.3996, "step": 2555 }, { "epoch": 2.9078498293515356, "grad_norm": 0.9835622310638428, "learning_rate": 0.0004193401592718999, "loss": 1.7122, "step": 2556 }, { "epoch": 2.908987485779295, "grad_norm": 0.6884040236473083, "learning_rate": 0.0004191126279863481, "loss": 1.533, "step": 2557 }, { "epoch": 2.9101251422070535, "grad_norm": 1.0069630146026611, "learning_rate": 0.0004188850967007964, "loss": 1.7265, "step": 2558 }, { "epoch": 2.9112627986348123, "grad_norm": 0.984389066696167, "learning_rate": 0.0004186575654152446, "loss": 1.6568, "step": 2559 }, { "epoch": 2.912400455062571, "grad_norm": 0.7369997501373291, "learning_rate": 0.00041843003412969285, "loss": 1.8428, "step": 2560 }, { "epoch": 2.91353811149033, "grad_norm": 1.4680728912353516, "learning_rate": 0.00041820250284414106, "loss": 3.2181, "step": 2561 }, { "epoch": 2.9146757679180886, "grad_norm": 0.7019348740577698, "learning_rate": 
0.0004179749715585893, "loss": 1.529, "step": 2562 }, { "epoch": 2.9158134243458473, "grad_norm": 0.6902445554733276, "learning_rate": 0.0004177474402730376, "loss": 0.6651, "step": 2563 }, { "epoch": 2.9169510807736065, "grad_norm": 1.133726716041565, "learning_rate": 0.00041751990898748574, "loss": 3.7021, "step": 2564 }, { "epoch": 2.9180887372013653, "grad_norm": 0.9320973753929138, "learning_rate": 0.000417292377701934, "loss": 2.1056, "step": 2565 }, { "epoch": 2.919226393629124, "grad_norm": 0.9788163900375366, "learning_rate": 0.00041706484641638227, "loss": 1.8839, "step": 2566 }, { "epoch": 2.920364050056883, "grad_norm": 1.0894557237625122, "learning_rate": 0.0004168373151308305, "loss": 2.4257, "step": 2567 }, { "epoch": 2.9215017064846416, "grad_norm": 0.7440232038497925, "learning_rate": 0.00041660978384527875, "loss": 1.6213, "step": 2568 }, { "epoch": 2.9226393629124003, "grad_norm": 1.0889848470687866, "learning_rate": 0.00041638225255972696, "loss": 1.632, "step": 2569 }, { "epoch": 2.923777019340159, "grad_norm": 0.9316622018814087, "learning_rate": 0.0004161547212741752, "loss": 1.5941, "step": 2570 }, { "epoch": 2.9249146757679183, "grad_norm": 1.1574180126190186, "learning_rate": 0.00041592718998862343, "loss": 2.5757, "step": 2571 }, { "epoch": 2.926052332195677, "grad_norm": 0.629701554775238, "learning_rate": 0.0004156996587030717, "loss": 1.1369, "step": 2572 }, { "epoch": 2.927189988623436, "grad_norm": 0.4679393172264099, "learning_rate": 0.00041547212741751996, "loss": 0.8413, "step": 2573 }, { "epoch": 2.9283276450511946, "grad_norm": 0.8145558834075928, "learning_rate": 0.0004152445961319681, "loss": 1.3481, "step": 2574 }, { "epoch": 2.9294653014789533, "grad_norm": 0.5542713403701782, "learning_rate": 0.0004150170648464164, "loss": 1.2693, "step": 2575 }, { "epoch": 2.930602957906712, "grad_norm": 1.2317445278167725, "learning_rate": 0.00041478953356086464, "loss": 2.8908, "step": 2576 }, { "epoch": 2.931740614334471, "grad_norm": 
0.9596227407455444, "learning_rate": 0.00041456200227531285, "loss": 1.3738, "step": 2577 }, { "epoch": 2.93287827076223, "grad_norm": 0.988106369972229, "learning_rate": 0.0004143344709897611, "loss": 2.4601, "step": 2578 }, { "epoch": 2.9340159271899884, "grad_norm": 0.9745345115661621, "learning_rate": 0.0004141069397042093, "loss": 1.9739, "step": 2579 }, { "epoch": 2.9351535836177476, "grad_norm": 0.8387298583984375, "learning_rate": 0.0004138794084186576, "loss": 1.6605, "step": 2580 }, { "epoch": 2.9362912400455063, "grad_norm": 0.811417818069458, "learning_rate": 0.0004136518771331058, "loss": 0.7618, "step": 2581 }, { "epoch": 2.937428896473265, "grad_norm": 0.8037904500961304, "learning_rate": 0.00041342434584755406, "loss": 1.1208, "step": 2582 }, { "epoch": 2.938566552901024, "grad_norm": 1.0455831289291382, "learning_rate": 0.00041319681456200233, "loss": 2.6639, "step": 2583 }, { "epoch": 2.9397042093287826, "grad_norm": 0.9639081358909607, "learning_rate": 0.0004129692832764505, "loss": 1.7122, "step": 2584 }, { "epoch": 2.9408418657565414, "grad_norm": 0.9406579732894897, "learning_rate": 0.00041274175199089875, "loss": 1.7419, "step": 2585 }, { "epoch": 2.9419795221843, "grad_norm": 1.0303080081939697, "learning_rate": 0.000412514220705347, "loss": 1.1925, "step": 2586 }, { "epoch": 2.9431171786120593, "grad_norm": 1.262969970703125, "learning_rate": 0.0004122866894197952, "loss": 3.077, "step": 2587 }, { "epoch": 2.944254835039818, "grad_norm": 1.7101789712905884, "learning_rate": 0.0004120591581342435, "loss": 3.1926, "step": 2588 }, { "epoch": 2.945392491467577, "grad_norm": 0.583219587802887, "learning_rate": 0.0004118316268486917, "loss": 1.4564, "step": 2589 }, { "epoch": 2.9465301478953356, "grad_norm": 0.916775107383728, "learning_rate": 0.00041160409556313996, "loss": 1.9524, "step": 2590 }, { "epoch": 2.9476678043230944, "grad_norm": 0.6460449695587158, "learning_rate": 0.00041137656427758817, "loss": 0.8583, "step": 2591 }, { "epoch": 
2.948805460750853, "grad_norm": 0.9714045524597168, "learning_rate": 0.00041114903299203643, "loss": 1.8316, "step": 2592 }, { "epoch": 2.949943117178612, "grad_norm": 0.7467246651649475, "learning_rate": 0.00041092150170648464, "loss": 1.1244, "step": 2593 }, { "epoch": 2.951080773606371, "grad_norm": 1.166478157043457, "learning_rate": 0.00041069397042093285, "loss": 2.589, "step": 2594 }, { "epoch": 2.9522184300341294, "grad_norm": 1.1131619215011597, "learning_rate": 0.0004104664391353811, "loss": 2.0701, "step": 2595 }, { "epoch": 2.9533560864618886, "grad_norm": 0.7802785038948059, "learning_rate": 0.0004102389078498294, "loss": 1.5037, "step": 2596 }, { "epoch": 2.9544937428896474, "grad_norm": 1.3113210201263428, "learning_rate": 0.0004100113765642776, "loss": 3.5825, "step": 2597 }, { "epoch": 2.955631399317406, "grad_norm": 0.9257882833480835, "learning_rate": 0.0004097838452787258, "loss": 2.0172, "step": 2598 }, { "epoch": 2.956769055745165, "grad_norm": 0.6490365862846375, "learning_rate": 0.00040955631399317407, "loss": 1.0444, "step": 2599 }, { "epoch": 2.9579067121729237, "grad_norm": 0.7082270979881287, "learning_rate": 0.00040932878270762233, "loss": 1.6777, "step": 2600 }, { "epoch": 2.9590443686006824, "grad_norm": 1.1460232734680176, "learning_rate": 0.00040910125142207054, "loss": 2.843, "step": 2601 }, { "epoch": 2.960182025028441, "grad_norm": 1.1202770471572876, "learning_rate": 0.0004088737201365188, "loss": 1.8192, "step": 2602 }, { "epoch": 2.9613196814562004, "grad_norm": 1.5068840980529785, "learning_rate": 0.000408646188850967, "loss": 3.6918, "step": 2603 }, { "epoch": 2.962457337883959, "grad_norm": 0.9092023968696594, "learning_rate": 0.0004084186575654152, "loss": 1.8649, "step": 2604 }, { "epoch": 2.963594994311718, "grad_norm": 1.0940500497817993, "learning_rate": 0.0004081911262798635, "loss": 2.9138, "step": 2605 }, { "epoch": 2.9647326507394767, "grad_norm": 1.2195347547531128, "learning_rate": 0.00040796359499431175, "loss": 
2.0279, "step": 2606 }, { "epoch": 2.9658703071672354, "grad_norm": 0.7400806546211243, "learning_rate": 0.00040773606370875996, "loss": 1.3149, "step": 2607 }, { "epoch": 2.967007963594994, "grad_norm": 1.0603524446487427, "learning_rate": 0.00040750853242320817, "loss": 2.151, "step": 2608 }, { "epoch": 2.968145620022753, "grad_norm": 0.9565656185150146, "learning_rate": 0.00040728100113765644, "loss": 1.4545, "step": 2609 }, { "epoch": 2.969283276450512, "grad_norm": 0.9198881387710571, "learning_rate": 0.0004070534698521047, "loss": 1.7001, "step": 2610 }, { "epoch": 2.970420932878271, "grad_norm": 0.9140946865081787, "learning_rate": 0.0004068259385665529, "loss": 1.5544, "step": 2611 }, { "epoch": 2.9715585893060297, "grad_norm": 0.6245988607406616, "learning_rate": 0.0004065984072810012, "loss": 1.3737, "step": 2612 }, { "epoch": 2.9726962457337884, "grad_norm": 0.9493893980979919, "learning_rate": 0.0004063708759954494, "loss": 1.7512, "step": 2613 }, { "epoch": 2.973833902161547, "grad_norm": 1.0149247646331787, "learning_rate": 0.0004061433447098976, "loss": 1.3255, "step": 2614 }, { "epoch": 2.974971558589306, "grad_norm": 1.1951128244400024, "learning_rate": 0.00040591581342434586, "loss": 2.2548, "step": 2615 }, { "epoch": 2.9761092150170647, "grad_norm": 1.160248875617981, "learning_rate": 0.0004056882821387941, "loss": 1.9538, "step": 2616 }, { "epoch": 2.977246871444824, "grad_norm": 1.113005518913269, "learning_rate": 0.00040546075085324233, "loss": 1.813, "step": 2617 }, { "epoch": 2.9783845278725822, "grad_norm": 0.6714508533477783, "learning_rate": 0.00040523321956769054, "loss": 1.2078, "step": 2618 }, { "epoch": 2.9795221843003414, "grad_norm": 0.9738563299179077, "learning_rate": 0.0004050056882821388, "loss": 1.5833, "step": 2619 }, { "epoch": 2.9806598407281, "grad_norm": 0.7636871933937073, "learning_rate": 0.00040477815699658707, "loss": 1.7431, "step": 2620 }, { "epoch": 2.981797497155859, "grad_norm": 0.6886274814605713, 
"learning_rate": 0.0004045506257110353, "loss": 1.346, "step": 2621 }, { "epoch": 2.9829351535836177, "grad_norm": 0.7128561735153198, "learning_rate": 0.00040432309442548354, "loss": 1.2031, "step": 2622 }, { "epoch": 2.9840728100113765, "grad_norm": 1.0730477571487427, "learning_rate": 0.00040409556313993175, "loss": 1.9424, "step": 2623 }, { "epoch": 2.9852104664391352, "grad_norm": 0.5753147006034851, "learning_rate": 0.00040386803185437996, "loss": 0.5761, "step": 2624 }, { "epoch": 2.986348122866894, "grad_norm": 1.2759993076324463, "learning_rate": 0.00040364050056882823, "loss": 2.9886, "step": 2625 }, { "epoch": 2.987485779294653, "grad_norm": 0.916204571723938, "learning_rate": 0.0004034129692832765, "loss": 1.5574, "step": 2626 }, { "epoch": 2.988623435722412, "grad_norm": 0.940258264541626, "learning_rate": 0.00040318543799772465, "loss": 1.8689, "step": 2627 }, { "epoch": 2.9897610921501707, "grad_norm": 0.5174016952514648, "learning_rate": 0.0004029579067121729, "loss": 0.8162, "step": 2628 }, { "epoch": 2.9908987485779295, "grad_norm": 0.779630720615387, "learning_rate": 0.0004027303754266212, "loss": 1.2215, "step": 2629 }, { "epoch": 2.9920364050056882, "grad_norm": 1.0173307657241821, "learning_rate": 0.00040250284414106944, "loss": 1.6225, "step": 2630 }, { "epoch": 2.993174061433447, "grad_norm": 0.9133301377296448, "learning_rate": 0.00040227531285551765, "loss": 2.143, "step": 2631 }, { "epoch": 2.9943117178612058, "grad_norm": 0.818585991859436, "learning_rate": 0.00040204778156996586, "loss": 2.071, "step": 2632 }, { "epoch": 2.995449374288965, "grad_norm": 1.6560077667236328, "learning_rate": 0.0004018202502844141, "loss": 2.403, "step": 2633 }, { "epoch": 2.9965870307167233, "grad_norm": 0.7060096859931946, "learning_rate": 0.00040159271899886233, "loss": 0.9656, "step": 2634 }, { "epoch": 2.9977246871444825, "grad_norm": 0.7223090529441833, "learning_rate": 0.0004013651877133106, "loss": 1.7089, "step": 2635 }, { "epoch": 
2.9988623435722412, "grad_norm": 0.6294040679931641, "learning_rate": 0.00040113765642775886, "loss": 1.2626, "step": 2636 }, { "epoch": 3.0, "grad_norm": 0.931922972202301, "learning_rate": 0.000400910125142207, "loss": 1.8149, "step": 2637 }, { "epoch": 3.0, "eval_f1": 0.8905, "eval_gen_len": 49.5636, "eval_loss": 1.8385756015777588, "eval_precision": 0.889, "eval_recall": 0.8923, "eval_rouge1": 0.4436, "eval_rouge2": 0.2001, "eval_rougeL": 0.3707, "eval_rougeLsum": 0.4092, "eval_runtime": 28.0556, "eval_samples_per_second": 3.921, "eval_steps_per_second": 0.499, "step": 2637 }, { "epoch": 3.0011376564277588, "grad_norm": 0.7378818988800049, "learning_rate": 0.0004006825938566553, "loss": 1.5501, "step": 2638 }, { "epoch": 3.0022753128555175, "grad_norm": 0.9810494780540466, "learning_rate": 0.00040045506257110355, "loss": 2.0296, "step": 2639 }, { "epoch": 3.0034129692832763, "grad_norm": 1.0457974672317505, "learning_rate": 0.0004002275312855518, "loss": 1.9824, "step": 2640 }, { "epoch": 3.0045506257110355, "grad_norm": 1.2151912450790405, "learning_rate": 0.0004, "loss": 2.0357, "step": 2641 }, { "epoch": 3.0056882821387942, "grad_norm": 0.9331043362617493, "learning_rate": 0.00039977246871444823, "loss": 1.1641, "step": 2642 }, { "epoch": 3.006825938566553, "grad_norm": 0.8802271485328674, "learning_rate": 0.0003995449374288965, "loss": 1.4499, "step": 2643 }, { "epoch": 3.0079635949943118, "grad_norm": 0.8039838075637817, "learning_rate": 0.0003993174061433447, "loss": 1.3322, "step": 2644 }, { "epoch": 3.0091012514220705, "grad_norm": 1.716158390045166, "learning_rate": 0.00039908987485779297, "loss": 1.9503, "step": 2645 }, { "epoch": 3.0102389078498293, "grad_norm": 0.589878261089325, "learning_rate": 0.00039886234357224123, "loss": 1.2121, "step": 2646 }, { "epoch": 3.011376564277588, "grad_norm": 0.6693535447120667, "learning_rate": 0.0003986348122866894, "loss": 1.5157, "step": 2647 }, { "epoch": 3.012514220705347, "grad_norm": 1.0818902254104614, 
"learning_rate": 0.00039840728100113765, "loss": 2.445, "step": 2648 }, { "epoch": 3.013651877133106, "grad_norm": 1.072434663772583, "learning_rate": 0.0003981797497155859, "loss": 3.0144, "step": 2649 }, { "epoch": 3.0147895335608648, "grad_norm": 1.0067790746688843, "learning_rate": 0.0003979522184300341, "loss": 1.1798, "step": 2650 }, { "epoch": 3.0159271899886235, "grad_norm": 0.8552153706550598, "learning_rate": 0.0003977246871444824, "loss": 1.7862, "step": 2651 }, { "epoch": 3.0170648464163823, "grad_norm": 0.9946853518486023, "learning_rate": 0.0003974971558589306, "loss": 2.5064, "step": 2652 }, { "epoch": 3.018202502844141, "grad_norm": 0.8925816416740417, "learning_rate": 0.00039726962457337886, "loss": 1.3693, "step": 2653 }, { "epoch": 3.0193401592719, "grad_norm": 1.815544605255127, "learning_rate": 0.0003970420932878271, "loss": 3.0686, "step": 2654 }, { "epoch": 3.0204778156996586, "grad_norm": 1.122719407081604, "learning_rate": 0.00039681456200227534, "loss": 2.3256, "step": 2655 }, { "epoch": 3.0216154721274173, "grad_norm": 1.784497618675232, "learning_rate": 0.0003965870307167236, "loss": 3.1539, "step": 2656 }, { "epoch": 3.0227531285551765, "grad_norm": 1.0257576704025269, "learning_rate": 0.00039635949943117176, "loss": 1.8041, "step": 2657 }, { "epoch": 3.0238907849829353, "grad_norm": 0.9659282565116882, "learning_rate": 0.00039613196814562, "loss": 1.4202, "step": 2658 }, { "epoch": 3.025028441410694, "grad_norm": 1.3800609111785889, "learning_rate": 0.0003959044368600683, "loss": 1.9181, "step": 2659 }, { "epoch": 3.026166097838453, "grad_norm": 1.056583285331726, "learning_rate": 0.0003956769055745165, "loss": 1.6186, "step": 2660 }, { "epoch": 3.0273037542662116, "grad_norm": 0.7123137712478638, "learning_rate": 0.0003954493742889647, "loss": 1.2721, "step": 2661 }, { "epoch": 3.0284414106939703, "grad_norm": 0.8692189455032349, "learning_rate": 0.00039522184300341297, "loss": 1.8621, "step": 2662 }, { "epoch": 3.029579067121729, 
"grad_norm": 0.8391653895378113, "learning_rate": 0.00039499431171786123, "loss": 1.0275, "step": 2663 }, { "epoch": 3.030716723549488, "grad_norm": 1.1637401580810547, "learning_rate": 0.00039476678043230944, "loss": 2.4589, "step": 2664 }, { "epoch": 3.031854379977247, "grad_norm": 0.9150156378746033, "learning_rate": 0.0003945392491467577, "loss": 1.5759, "step": 2665 }, { "epoch": 3.032992036405006, "grad_norm": 1.1568349599838257, "learning_rate": 0.0003943117178612059, "loss": 2.138, "step": 2666 }, { "epoch": 3.0341296928327646, "grad_norm": 1.1453518867492676, "learning_rate": 0.0003940841865756541, "loss": 2.3239, "step": 2667 }, { "epoch": 3.0352673492605233, "grad_norm": 0.8770859837532043, "learning_rate": 0.0003938566552901024, "loss": 1.523, "step": 2668 }, { "epoch": 3.036405005688282, "grad_norm": 1.0244057178497314, "learning_rate": 0.00039362912400455065, "loss": 1.7684, "step": 2669 }, { "epoch": 3.037542662116041, "grad_norm": 1.0480972528457642, "learning_rate": 0.00039340159271899886, "loss": 1.5302, "step": 2670 }, { "epoch": 3.0386803185437996, "grad_norm": 1.1314648389816284, "learning_rate": 0.0003931740614334471, "loss": 1.6721, "step": 2671 }, { "epoch": 3.039817974971559, "grad_norm": 0.7867646813392639, "learning_rate": 0.00039294653014789534, "loss": 1.3925, "step": 2672 }, { "epoch": 3.0409556313993176, "grad_norm": 1.450878381729126, "learning_rate": 0.0003927189988623436, "loss": 2.2478, "step": 2673 }, { "epoch": 3.0420932878270763, "grad_norm": 0.7771217226982117, "learning_rate": 0.0003924914675767918, "loss": 1.3815, "step": 2674 }, { "epoch": 3.043230944254835, "grad_norm": 1.2466049194335938, "learning_rate": 0.0003922639362912401, "loss": 3.2891, "step": 2675 }, { "epoch": 3.044368600682594, "grad_norm": 1.7478383779525757, "learning_rate": 0.0003920364050056883, "loss": 2.8603, "step": 2676 }, { "epoch": 3.0455062571103526, "grad_norm": 0.8281832933425903, "learning_rate": 0.0003918088737201365, "loss": 1.5477, "step": 2677 
}, { "epoch": 3.0466439135381114, "grad_norm": 0.695612370967865, "learning_rate": 0.00039158134243458476, "loss": 1.4521, "step": 2678 }, { "epoch": 3.04778156996587, "grad_norm": 0.9778900742530823, "learning_rate": 0.000391353811149033, "loss": 1.341, "step": 2679 }, { "epoch": 3.0489192263936293, "grad_norm": 1.0386836528778076, "learning_rate": 0.00039112627986348123, "loss": 1.7095, "step": 2680 }, { "epoch": 3.050056882821388, "grad_norm": 0.9944807291030884, "learning_rate": 0.00039089874857792944, "loss": 1.9036, "step": 2681 }, { "epoch": 3.051194539249147, "grad_norm": 0.8040305972099304, "learning_rate": 0.0003906712172923777, "loss": 1.1083, "step": 2682 }, { "epoch": 3.0523321956769056, "grad_norm": 1.0549193620681763, "learning_rate": 0.00039044368600682597, "loss": 2.558, "step": 2683 }, { "epoch": 3.0534698521046644, "grad_norm": 0.7308775782585144, "learning_rate": 0.0003902161547212742, "loss": 1.073, "step": 2684 }, { "epoch": 3.054607508532423, "grad_norm": 0.532346785068512, "learning_rate": 0.00038998862343572245, "loss": 0.8228, "step": 2685 }, { "epoch": 3.055745164960182, "grad_norm": 0.9749463200569153, "learning_rate": 0.00038976109215017066, "loss": 1.6924, "step": 2686 }, { "epoch": 3.0568828213879407, "grad_norm": 0.9695119857788086, "learning_rate": 0.00038953356086461887, "loss": 1.8108, "step": 2687 }, { "epoch": 3.0580204778157, "grad_norm": 0.9351180195808411, "learning_rate": 0.00038930602957906713, "loss": 2.2291, "step": 2688 }, { "epoch": 3.0591581342434586, "grad_norm": 0.8033020496368408, "learning_rate": 0.0003890784982935154, "loss": 1.8916, "step": 2689 }, { "epoch": 3.0602957906712174, "grad_norm": 1.3790737390518188, "learning_rate": 0.0003888509670079636, "loss": 1.9922, "step": 2690 }, { "epoch": 3.061433447098976, "grad_norm": 1.2332922220230103, "learning_rate": 0.0003886234357224118, "loss": 2.345, "step": 2691 }, { "epoch": 3.062571103526735, "grad_norm": 1.38813316822052, "learning_rate": 0.0003883959044368601, 
"loss": 2.8993, "step": 2692 }, { "epoch": 3.0637087599544937, "grad_norm": 0.952480137348175, "learning_rate": 0.00038816837315130834, "loss": 1.8394, "step": 2693 }, { "epoch": 3.0648464163822524, "grad_norm": 0.8586364984512329, "learning_rate": 0.00038794084186575655, "loss": 1.1771, "step": 2694 }, { "epoch": 3.065984072810011, "grad_norm": 0.9498509764671326, "learning_rate": 0.00038771331058020476, "loss": 1.4771, "step": 2695 }, { "epoch": 3.0671217292377704, "grad_norm": 0.9688740968704224, "learning_rate": 0.000387485779294653, "loss": 2.1815, "step": 2696 }, { "epoch": 3.068259385665529, "grad_norm": 2.0778191089630127, "learning_rate": 0.00038725824800910124, "loss": 2.46, "step": 2697 }, { "epoch": 3.069397042093288, "grad_norm": 1.1978789567947388, "learning_rate": 0.0003870307167235495, "loss": 2.347, "step": 2698 }, { "epoch": 3.0705346985210467, "grad_norm": 1.4718050956726074, "learning_rate": 0.00038680318543799776, "loss": 2.3554, "step": 2699 }, { "epoch": 3.0716723549488054, "grad_norm": 0.9731530547142029, "learning_rate": 0.0003865756541524459, "loss": 1.4802, "step": 2700 }, { "epoch": 3.072810011376564, "grad_norm": 1.0412006378173828, "learning_rate": 0.0003863481228668942, "loss": 1.8537, "step": 2701 }, { "epoch": 3.073947667804323, "grad_norm": 0.7110769748687744, "learning_rate": 0.00038612059158134245, "loss": 1.6169, "step": 2702 }, { "epoch": 3.0750853242320817, "grad_norm": 1.7731373310089111, "learning_rate": 0.0003858930602957907, "loss": 3.4582, "step": 2703 }, { "epoch": 3.076222980659841, "grad_norm": 1.136021614074707, "learning_rate": 0.0003856655290102389, "loss": 2.161, "step": 2704 }, { "epoch": 3.0773606370875997, "grad_norm": 1.2377517223358154, "learning_rate": 0.00038543799772468713, "loss": 1.9528, "step": 2705 }, { "epoch": 3.0784982935153584, "grad_norm": 0.7984117865562439, "learning_rate": 0.0003852104664391354, "loss": 1.0635, "step": 2706 }, { "epoch": 3.079635949943117, "grad_norm": 1.1283124685287476, 
"learning_rate": 0.0003849829351535836, "loss": 2.1958, "step": 2707 }, { "epoch": 3.080773606370876, "grad_norm": 1.1983591318130493, "learning_rate": 0.00038475540386803187, "loss": 2.5032, "step": 2708 }, { "epoch": 3.0819112627986347, "grad_norm": 0.8128067255020142, "learning_rate": 0.00038452787258248013, "loss": 1.566, "step": 2709 }, { "epoch": 3.0830489192263935, "grad_norm": 1.4617652893066406, "learning_rate": 0.0003843003412969283, "loss": 2.3352, "step": 2710 }, { "epoch": 3.0841865756541527, "grad_norm": 0.9528759717941284, "learning_rate": 0.00038407281001137655, "loss": 1.7667, "step": 2711 }, { "epoch": 3.0853242320819114, "grad_norm": 1.0481518507003784, "learning_rate": 0.0003838452787258248, "loss": 1.9046, "step": 2712 }, { "epoch": 3.08646188850967, "grad_norm": 0.9186555743217468, "learning_rate": 0.0003836177474402731, "loss": 1.8621, "step": 2713 }, { "epoch": 3.087599544937429, "grad_norm": 0.6513910293579102, "learning_rate": 0.0003833902161547213, "loss": 1.2062, "step": 2714 }, { "epoch": 3.0887372013651877, "grad_norm": 1.1539534330368042, "learning_rate": 0.0003831626848691695, "loss": 2.0368, "step": 2715 }, { "epoch": 3.0898748577929465, "grad_norm": 1.2739286422729492, "learning_rate": 0.00038293515358361777, "loss": 2.7124, "step": 2716 }, { "epoch": 3.091012514220705, "grad_norm": 0.796596348285675, "learning_rate": 0.000382707622298066, "loss": 1.5446, "step": 2717 }, { "epoch": 3.092150170648464, "grad_norm": 1.144640564918518, "learning_rate": 0.00038248009101251424, "loss": 1.3855, "step": 2718 }, { "epoch": 3.093287827076223, "grad_norm": 0.6495816111564636, "learning_rate": 0.0003822525597269625, "loss": 1.5521, "step": 2719 }, { "epoch": 3.094425483503982, "grad_norm": 1.0980159044265747, "learning_rate": 0.00038202502844141066, "loss": 1.5025, "step": 2720 }, { "epoch": 3.0955631399317407, "grad_norm": 0.8496591448783875, "learning_rate": 0.0003817974971558589, "loss": 2.1075, "step": 2721 }, { "epoch": 
3.0967007963594995, "grad_norm": 0.9507794380187988, "learning_rate": 0.0003815699658703072, "loss": 1.8178, "step": 2722 }, { "epoch": 3.0978384527872582, "grad_norm": 1.0450421571731567, "learning_rate": 0.00038134243458475545, "loss": 2.2859, "step": 2723 }, { "epoch": 3.098976109215017, "grad_norm": 0.8103886246681213, "learning_rate": 0.00038111490329920366, "loss": 1.7053, "step": 2724 }, { "epoch": 3.1001137656427757, "grad_norm": 1.1811316013336182, "learning_rate": 0.00038088737201365187, "loss": 1.5576, "step": 2725 }, { "epoch": 3.1012514220705345, "grad_norm": 1.117814064025879, "learning_rate": 0.00038065984072810014, "loss": 2.484, "step": 2726 }, { "epoch": 3.1023890784982937, "grad_norm": 1.1703466176986694, "learning_rate": 0.00038043230944254835, "loss": 2.0287, "step": 2727 }, { "epoch": 3.1035267349260525, "grad_norm": 1.159627914428711, "learning_rate": 0.0003802047781569966, "loss": 2.4316, "step": 2728 }, { "epoch": 3.1046643913538112, "grad_norm": 0.6235855221748352, "learning_rate": 0.0003799772468714448, "loss": 1.3779, "step": 2729 }, { "epoch": 3.10580204778157, "grad_norm": 0.8967525959014893, "learning_rate": 0.00037974971558589303, "loss": 1.8938, "step": 2730 }, { "epoch": 3.1069397042093287, "grad_norm": 0.7541645169258118, "learning_rate": 0.0003795221843003413, "loss": 1.6706, "step": 2731 }, { "epoch": 3.1080773606370875, "grad_norm": 0.7087587714195251, "learning_rate": 0.00037929465301478956, "loss": 1.185, "step": 2732 }, { "epoch": 3.1092150170648463, "grad_norm": 0.4740487337112427, "learning_rate": 0.0003790671217292378, "loss": 0.5638, "step": 2733 }, { "epoch": 3.110352673492605, "grad_norm": 0.6750523447990417, "learning_rate": 0.000378839590443686, "loss": 1.064, "step": 2734 }, { "epoch": 3.1114903299203642, "grad_norm": 0.6977342367172241, "learning_rate": 0.00037861205915813424, "loss": 1.721, "step": 2735 }, { "epoch": 3.112627986348123, "grad_norm": 0.5952358841896057, "learning_rate": 0.0003783845278725825, 
"loss": 0.7465, "step": 2736 }, { "epoch": 3.1137656427758817, "grad_norm": 1.1384859085083008, "learning_rate": 0.0003781569965870307, "loss": 2.1374, "step": 2737 }, { "epoch": 3.1149032992036405, "grad_norm": 1.135901927947998, "learning_rate": 0.000377929465301479, "loss": 3.0381, "step": 2738 }, { "epoch": 3.1160409556313993, "grad_norm": 1.3090107440948486, "learning_rate": 0.0003777019340159272, "loss": 3.5599, "step": 2739 }, { "epoch": 3.117178612059158, "grad_norm": 0.7160171866416931, "learning_rate": 0.0003774744027303754, "loss": 0.7931, "step": 2740 }, { "epoch": 3.118316268486917, "grad_norm": 0.9017439484596252, "learning_rate": 0.00037724687144482366, "loss": 1.3222, "step": 2741 }, { "epoch": 3.1194539249146755, "grad_norm": 0.7213937640190125, "learning_rate": 0.00037701934015927193, "loss": 1.8957, "step": 2742 }, { "epoch": 3.1205915813424348, "grad_norm": 1.6694979667663574, "learning_rate": 0.0003767918088737202, "loss": 1.9735, "step": 2743 }, { "epoch": 3.1217292377701935, "grad_norm": 1.073201298713684, "learning_rate": 0.00037656427758816835, "loss": 1.2988, "step": 2744 }, { "epoch": 3.1228668941979523, "grad_norm": 0.7072137594223022, "learning_rate": 0.0003763367463026166, "loss": 1.4237, "step": 2745 }, { "epoch": 3.124004550625711, "grad_norm": 1.140438437461853, "learning_rate": 0.0003761092150170649, "loss": 2.3338, "step": 2746 }, { "epoch": 3.12514220705347, "grad_norm": 0.9373031258583069, "learning_rate": 0.0003758816837315131, "loss": 1.7434, "step": 2747 }, { "epoch": 3.1262798634812285, "grad_norm": 1.1408966779708862, "learning_rate": 0.00037565415244596135, "loss": 2.5215, "step": 2748 }, { "epoch": 3.1274175199089873, "grad_norm": 0.9613400101661682, "learning_rate": 0.00037542662116040956, "loss": 1.7157, "step": 2749 }, { "epoch": 3.1285551763367465, "grad_norm": 1.0001013278961182, "learning_rate": 0.00037519908987485777, "loss": 1.3756, "step": 2750 }, { "epoch": 3.1296928327645053, "grad_norm": 1.740805983543396, 
"learning_rate": 0.00037497155858930603, "loss": 2.2071, "step": 2751 }, { "epoch": 3.130830489192264, "grad_norm": 0.9777475595474243, "learning_rate": 0.0003747440273037543, "loss": 1.8485, "step": 2752 }, { "epoch": 3.131968145620023, "grad_norm": 1.1642980575561523, "learning_rate": 0.0003745164960182025, "loss": 1.2435, "step": 2753 }, { "epoch": 3.1331058020477816, "grad_norm": 1.1870895624160767, "learning_rate": 0.0003742889647326507, "loss": 1.3834, "step": 2754 }, { "epoch": 3.1342434584755403, "grad_norm": 0.9124128818511963, "learning_rate": 0.000374061433447099, "loss": 1.534, "step": 2755 }, { "epoch": 3.135381114903299, "grad_norm": 0.6941938400268555, "learning_rate": 0.00037383390216154725, "loss": 1.5597, "step": 2756 }, { "epoch": 3.136518771331058, "grad_norm": 0.9864102602005005, "learning_rate": 0.00037360637087599546, "loss": 1.5537, "step": 2757 }, { "epoch": 3.137656427758817, "grad_norm": 0.6157810688018799, "learning_rate": 0.0003733788395904437, "loss": 1.1024, "step": 2758 }, { "epoch": 3.138794084186576, "grad_norm": 0.9289652109146118, "learning_rate": 0.00037315130830489193, "loss": 1.2505, "step": 2759 }, { "epoch": 3.1399317406143346, "grad_norm": 0.6562715768814087, "learning_rate": 0.00037292377701934014, "loss": 1.1522, "step": 2760 }, { "epoch": 3.1410693970420933, "grad_norm": 0.7173290252685547, "learning_rate": 0.0003726962457337884, "loss": 1.3983, "step": 2761 }, { "epoch": 3.142207053469852, "grad_norm": 0.8549534678459167, "learning_rate": 0.00037246871444823667, "loss": 1.4353, "step": 2762 }, { "epoch": 3.143344709897611, "grad_norm": 1.8511520624160767, "learning_rate": 0.0003722411831626849, "loss": 2.9588, "step": 2763 }, { "epoch": 3.1444823663253696, "grad_norm": 1.1187710762023926, "learning_rate": 0.0003720136518771331, "loss": 1.8577, "step": 2764 }, { "epoch": 3.1456200227531284, "grad_norm": 0.9971904158592224, "learning_rate": 0.00037178612059158135, "loss": 2.0944, "step": 2765 }, { "epoch": 
3.1467576791808876, "grad_norm": 1.0542211532592773, "learning_rate": 0.0003715585893060296, "loss": 1.5842, "step": 2766 }, { "epoch": 3.1478953356086463, "grad_norm": 0.8797990083694458, "learning_rate": 0.0003713310580204778, "loss": 2.6677, "step": 2767 }, { "epoch": 3.149032992036405, "grad_norm": 1.0683292150497437, "learning_rate": 0.00037110352673492604, "loss": 1.4515, "step": 2768 }, { "epoch": 3.150170648464164, "grad_norm": 0.9214059114456177, "learning_rate": 0.0003708759954493743, "loss": 2.1592, "step": 2769 }, { "epoch": 3.1513083048919226, "grad_norm": 1.1661478281021118, "learning_rate": 0.0003706484641638225, "loss": 2.3451, "step": 2770 }, { "epoch": 3.1524459613196814, "grad_norm": 0.8435385823249817, "learning_rate": 0.0003704209328782708, "loss": 1.3745, "step": 2771 }, { "epoch": 3.15358361774744, "grad_norm": 0.646821916103363, "learning_rate": 0.00037019340159271904, "loss": 1.2736, "step": 2772 }, { "epoch": 3.1547212741751993, "grad_norm": 1.5238386392593384, "learning_rate": 0.0003699658703071672, "loss": 2.0629, "step": 2773 }, { "epoch": 3.155858930602958, "grad_norm": 0.9329938292503357, "learning_rate": 0.00036973833902161546, "loss": 2.6737, "step": 2774 }, { "epoch": 3.156996587030717, "grad_norm": 0.9424443244934082, "learning_rate": 0.0003695108077360637, "loss": 1.814, "step": 2775 }, { "epoch": 3.1581342434584756, "grad_norm": 1.060135006904602, "learning_rate": 0.000369283276450512, "loss": 2.2409, "step": 2776 }, { "epoch": 3.1592718998862344, "grad_norm": 1.1120651960372925, "learning_rate": 0.0003690557451649602, "loss": 2.7252, "step": 2777 }, { "epoch": 3.160409556313993, "grad_norm": 1.0458694696426392, "learning_rate": 0.0003688282138794084, "loss": 2.2002, "step": 2778 }, { "epoch": 3.161547212741752, "grad_norm": 0.7672396898269653, "learning_rate": 0.00036860068259385667, "loss": 1.5965, "step": 2779 }, { "epoch": 3.1626848691695106, "grad_norm": 0.9745705127716064, "learning_rate": 0.0003683731513083049, "loss": 
2.8014, "step": 2780 }, { "epoch": 3.1638225255972694, "grad_norm": 1.4451996088027954, "learning_rate": 0.00036814562002275314, "loss": 3.0917, "step": 2781 }, { "epoch": 3.1649601820250286, "grad_norm": 0.7620997428894043, "learning_rate": 0.0003679180887372014, "loss": 1.4155, "step": 2782 }, { "epoch": 3.1660978384527874, "grad_norm": 0.8162498474121094, "learning_rate": 0.00036769055745164956, "loss": 2.5782, "step": 2783 }, { "epoch": 3.167235494880546, "grad_norm": 0.8843737244606018, "learning_rate": 0.00036746302616609783, "loss": 1.6138, "step": 2784 }, { "epoch": 3.168373151308305, "grad_norm": 0.6327600479125977, "learning_rate": 0.0003672354948805461, "loss": 1.0368, "step": 2785 }, { "epoch": 3.1695108077360636, "grad_norm": 1.3573901653289795, "learning_rate": 0.00036700796359499436, "loss": 3.1572, "step": 2786 }, { "epoch": 3.1706484641638224, "grad_norm": 0.9209976196289062, "learning_rate": 0.00036678043230944257, "loss": 1.9979, "step": 2787 }, { "epoch": 3.171786120591581, "grad_norm": 1.0654752254486084, "learning_rate": 0.0003665529010238908, "loss": 1.1564, "step": 2788 }, { "epoch": 3.1729237770193404, "grad_norm": 0.8304941654205322, "learning_rate": 0.00036632536973833904, "loss": 0.804, "step": 2789 }, { "epoch": 3.174061433447099, "grad_norm": 0.7033611536026001, "learning_rate": 0.00036609783845278725, "loss": 1.2548, "step": 2790 }, { "epoch": 3.175199089874858, "grad_norm": 0.7177807688713074, "learning_rate": 0.0003658703071672355, "loss": 0.925, "step": 2791 }, { "epoch": 3.1763367463026166, "grad_norm": 0.7976208925247192, "learning_rate": 0.0003656427758816838, "loss": 1.8367, "step": 2792 }, { "epoch": 3.1774744027303754, "grad_norm": 1.0865761041641235, "learning_rate": 0.00036541524459613193, "loss": 2.0069, "step": 2793 }, { "epoch": 3.178612059158134, "grad_norm": 1.0845462083816528, "learning_rate": 0.0003651877133105802, "loss": 2.4428, "step": 2794 }, { "epoch": 3.179749715585893, "grad_norm": 1.635365605354309, 
"learning_rate": 0.00036496018202502846, "loss": 2.376, "step": 2795 }, { "epoch": 3.1808873720136517, "grad_norm": 0.9851645231246948, "learning_rate": 0.0003647326507394767, "loss": 1.7787, "step": 2796 }, { "epoch": 3.182025028441411, "grad_norm": 0.8619524836540222, "learning_rate": 0.00036450511945392494, "loss": 0.8745, "step": 2797 }, { "epoch": 3.1831626848691696, "grad_norm": 0.8011617064476013, "learning_rate": 0.00036427758816837315, "loss": 1.2395, "step": 2798 }, { "epoch": 3.1843003412969284, "grad_norm": 1.1883291006088257, "learning_rate": 0.0003640500568828214, "loss": 2.4134, "step": 2799 }, { "epoch": 3.185437997724687, "grad_norm": 0.9594371914863586, "learning_rate": 0.0003638225255972696, "loss": 1.8051, "step": 2800 }, { "epoch": 3.186575654152446, "grad_norm": 0.9812491536140442, "learning_rate": 0.0003635949943117179, "loss": 1.9984, "step": 2801 }, { "epoch": 3.1877133105802047, "grad_norm": 1.5427114963531494, "learning_rate": 0.0003633674630261661, "loss": 2.8443, "step": 2802 }, { "epoch": 3.1888509670079634, "grad_norm": 1.0131993293762207, "learning_rate": 0.0003631399317406143, "loss": 1.4077, "step": 2803 }, { "epoch": 3.189988623435722, "grad_norm": 1.21848464012146, "learning_rate": 0.00036291240045506257, "loss": 2.6364, "step": 2804 }, { "epoch": 3.1911262798634814, "grad_norm": 0.8747435212135315, "learning_rate": 0.00036268486916951083, "loss": 1.4387, "step": 2805 }, { "epoch": 3.19226393629124, "grad_norm": 1.2929614782333374, "learning_rate": 0.0003624573378839591, "loss": 2.2714, "step": 2806 }, { "epoch": 3.193401592718999, "grad_norm": 0.6196789145469666, "learning_rate": 0.00036222980659840725, "loss": 1.4344, "step": 2807 }, { "epoch": 3.1945392491467577, "grad_norm": 1.1467669010162354, "learning_rate": 0.0003620022753128555, "loss": 2.1119, "step": 2808 }, { "epoch": 3.1956769055745164, "grad_norm": 0.781956136226654, "learning_rate": 0.0003617747440273038, "loss": 1.4887, "step": 2809 }, { "epoch": 
3.196814562002275, "grad_norm": 1.177976369857788, "learning_rate": 0.000361547212741752, "loss": 3.2043, "step": 2810 }, { "epoch": 3.197952218430034, "grad_norm": 0.8673222064971924, "learning_rate": 0.00036131968145620025, "loss": 1.0446, "step": 2811 }, { "epoch": 3.199089874857793, "grad_norm": 0.8764348030090332, "learning_rate": 0.00036109215017064846, "loss": 2.2418, "step": 2812 }, { "epoch": 3.200227531285552, "grad_norm": 1.8115644454956055, "learning_rate": 0.0003608646188850967, "loss": 1.0493, "step": 2813 }, { "epoch": 3.2013651877133107, "grad_norm": 1.0345537662506104, "learning_rate": 0.00036063708759954494, "loss": 2.0083, "step": 2814 }, { "epoch": 3.2025028441410694, "grad_norm": 0.7956832647323608, "learning_rate": 0.0003604095563139932, "loss": 1.377, "step": 2815 }, { "epoch": 3.203640500568828, "grad_norm": 0.7763450145721436, "learning_rate": 0.00036018202502844147, "loss": 1.4905, "step": 2816 }, { "epoch": 3.204778156996587, "grad_norm": 1.0623788833618164, "learning_rate": 0.0003599544937428896, "loss": 1.4396, "step": 2817 }, { "epoch": 3.2059158134243457, "grad_norm": 0.8401424288749695, "learning_rate": 0.0003597269624573379, "loss": 1.7506, "step": 2818 }, { "epoch": 3.2070534698521045, "grad_norm": 1.2517905235290527, "learning_rate": 0.00035949943117178615, "loss": 1.8421, "step": 2819 }, { "epoch": 3.2081911262798632, "grad_norm": 0.9095095992088318, "learning_rate": 0.00035927189988623436, "loss": 1.069, "step": 2820 }, { "epoch": 3.2093287827076225, "grad_norm": 1.1191394329071045, "learning_rate": 0.0003590443686006826, "loss": 2.1787, "step": 2821 }, { "epoch": 3.210466439135381, "grad_norm": 0.9273049235343933, "learning_rate": 0.00035881683731513083, "loss": 2.2313, "step": 2822 }, { "epoch": 3.21160409556314, "grad_norm": 1.0636650323867798, "learning_rate": 0.00035858930602957904, "loss": 1.5293, "step": 2823 }, { "epoch": 3.2127417519908987, "grad_norm": 1.2871456146240234, "learning_rate": 0.0003583617747440273, "loss": 
3.0545, "step": 2824 }, { "epoch": 3.2138794084186575, "grad_norm": 1.2373828887939453, "learning_rate": 0.00035813424345847557, "loss": 2.0834, "step": 2825 }, { "epoch": 3.2150170648464163, "grad_norm": 0.7762901186943054, "learning_rate": 0.00035790671217292384, "loss": 1.4895, "step": 2826 }, { "epoch": 3.216154721274175, "grad_norm": 1.2859349250793457, "learning_rate": 0.000357679180887372, "loss": 1.9091, "step": 2827 }, { "epoch": 3.217292377701934, "grad_norm": 1.2489643096923828, "learning_rate": 0.00035745164960182026, "loss": 1.831, "step": 2828 }, { "epoch": 3.218430034129693, "grad_norm": 0.9940024018287659, "learning_rate": 0.0003572241183162685, "loss": 2.157, "step": 2829 }, { "epoch": 3.2195676905574517, "grad_norm": 1.1232733726501465, "learning_rate": 0.00035699658703071673, "loss": 2.9384, "step": 2830 }, { "epoch": 3.2207053469852105, "grad_norm": 0.7718859910964966, "learning_rate": 0.000356769055745165, "loss": 1.8196, "step": 2831 }, { "epoch": 3.2218430034129693, "grad_norm": 1.089641809463501, "learning_rate": 0.0003565415244596132, "loss": 1.8792, "step": 2832 }, { "epoch": 3.222980659840728, "grad_norm": 1.2740263938903809, "learning_rate": 0.0003563139931740614, "loss": 2.5414, "step": 2833 }, { "epoch": 3.2241183162684868, "grad_norm": 0.9137701392173767, "learning_rate": 0.0003560864618885097, "loss": 1.3666, "step": 2834 }, { "epoch": 3.2252559726962455, "grad_norm": 0.6575736403465271, "learning_rate": 0.00035585893060295794, "loss": 1.2568, "step": 2835 }, { "epoch": 3.2263936291240047, "grad_norm": 0.9433609843254089, "learning_rate": 0.0003556313993174061, "loss": 1.4612, "step": 2836 }, { "epoch": 3.2275312855517635, "grad_norm": 0.8380080461502075, "learning_rate": 0.00035540386803185436, "loss": 1.1819, "step": 2837 }, { "epoch": 3.2286689419795223, "grad_norm": 0.7855201363563538, "learning_rate": 0.0003551763367463026, "loss": 1.8921, "step": 2838 }, { "epoch": 3.229806598407281, "grad_norm": 1.5746235847473145, 
"learning_rate": 0.0003549488054607509, "loss": 3.0391, "step": 2839 }, { "epoch": 3.2309442548350398, "grad_norm": 1.1520813703536987, "learning_rate": 0.0003547212741751991, "loss": 2.1142, "step": 2840 }, { "epoch": 3.2320819112627985, "grad_norm": 0.8569504618644714, "learning_rate": 0.0003544937428896473, "loss": 2.0138, "step": 2841 }, { "epoch": 3.2332195676905573, "grad_norm": 1.0433871746063232, "learning_rate": 0.0003542662116040956, "loss": 1.673, "step": 2842 }, { "epoch": 3.234357224118316, "grad_norm": 1.01398766040802, "learning_rate": 0.0003540386803185438, "loss": 1.9582, "step": 2843 }, { "epoch": 3.2354948805460753, "grad_norm": 1.0465679168701172, "learning_rate": 0.00035381114903299205, "loss": 1.9819, "step": 2844 }, { "epoch": 3.236632536973834, "grad_norm": 0.787959635257721, "learning_rate": 0.0003535836177474403, "loss": 1.639, "step": 2845 }, { "epoch": 3.2377701934015928, "grad_norm": 1.155023455619812, "learning_rate": 0.00035335608646188847, "loss": 1.8541, "step": 2846 }, { "epoch": 3.2389078498293515, "grad_norm": 1.1908761262893677, "learning_rate": 0.00035312855517633673, "loss": 1.9333, "step": 2847 }, { "epoch": 3.2400455062571103, "grad_norm": 0.9262527227401733, "learning_rate": 0.000352901023890785, "loss": 1.3209, "step": 2848 }, { "epoch": 3.241183162684869, "grad_norm": 0.8731938004493713, "learning_rate": 0.00035267349260523326, "loss": 2.2036, "step": 2849 }, { "epoch": 3.242320819112628, "grad_norm": 0.6273211240768433, "learning_rate": 0.00035244596131968147, "loss": 0.9153, "step": 2850 }, { "epoch": 3.243458475540387, "grad_norm": 1.0732612609863281, "learning_rate": 0.0003522184300341297, "loss": 2.286, "step": 2851 }, { "epoch": 3.244596131968146, "grad_norm": 1.1554096937179565, "learning_rate": 0.00035199089874857794, "loss": 2.0289, "step": 2852 }, { "epoch": 3.2457337883959045, "grad_norm": 0.8074318170547485, "learning_rate": 0.00035176336746302615, "loss": 1.8181, "step": 2853 }, { "epoch": 3.2468714448236633, 
"grad_norm": 1.392298936843872, "learning_rate": 0.0003515358361774744, "loss": 3.4711, "step": 2854 }, { "epoch": 3.248009101251422, "grad_norm": 0.9060192108154297, "learning_rate": 0.0003513083048919227, "loss": 2.0728, "step": 2855 }, { "epoch": 3.249146757679181, "grad_norm": 1.2438005208969116, "learning_rate": 0.00035108077360637084, "loss": 1.4773, "step": 2856 }, { "epoch": 3.2502844141069396, "grad_norm": 1.0764005184173584, "learning_rate": 0.0003508532423208191, "loss": 1.5244, "step": 2857 }, { "epoch": 3.2514220705346983, "grad_norm": 0.8813822269439697, "learning_rate": 0.00035062571103526737, "loss": 1.2567, "step": 2858 }, { "epoch": 3.252559726962457, "grad_norm": 0.6739155054092407, "learning_rate": 0.00035039817974971563, "loss": 0.682, "step": 2859 }, { "epoch": 3.2536973833902163, "grad_norm": 1.2803453207015991, "learning_rate": 0.00035017064846416384, "loss": 1.5126, "step": 2860 }, { "epoch": 3.254835039817975, "grad_norm": 1.1614502668380737, "learning_rate": 0.00034994311717861205, "loss": 3.0873, "step": 2861 }, { "epoch": 3.255972696245734, "grad_norm": 0.8287785053253174, "learning_rate": 0.0003497155858930603, "loss": 1.3949, "step": 2862 }, { "epoch": 3.2571103526734926, "grad_norm": 0.8668496608734131, "learning_rate": 0.0003494880546075085, "loss": 1.5943, "step": 2863 }, { "epoch": 3.2582480091012513, "grad_norm": 0.8574039936065674, "learning_rate": 0.0003492605233219568, "loss": 1.8113, "step": 2864 }, { "epoch": 3.25938566552901, "grad_norm": 0.8691564202308655, "learning_rate": 0.00034903299203640505, "loss": 1.5579, "step": 2865 }, { "epoch": 3.260523321956769, "grad_norm": 0.948115885257721, "learning_rate": 0.0003488054607508532, "loss": 1.4459, "step": 2866 }, { "epoch": 3.261660978384528, "grad_norm": 1.410564661026001, "learning_rate": 0.00034857792946530147, "loss": 3.3305, "step": 2867 }, { "epoch": 3.262798634812287, "grad_norm": 0.904012143611908, "learning_rate": 0.00034835039817974973, "loss": 1.7655, "step": 2868 
}, { "epoch": 3.2639362912400456, "grad_norm": 0.9044657945632935, "learning_rate": 0.000348122866894198, "loss": 1.2666, "step": 2869 }, { "epoch": 3.2650739476678043, "grad_norm": 0.8898612856864929, "learning_rate": 0.00034789533560864615, "loss": 2.0107, "step": 2870 }, { "epoch": 3.266211604095563, "grad_norm": 0.8412598967552185, "learning_rate": 0.0003476678043230944, "loss": 1.4021, "step": 2871 }, { "epoch": 3.267349260523322, "grad_norm": 1.2648974657058716, "learning_rate": 0.0003474402730375427, "loss": 3.4206, "step": 2872 }, { "epoch": 3.2684869169510806, "grad_norm": 0.9399024248123169, "learning_rate": 0.0003472127417519909, "loss": 2.8845, "step": 2873 }, { "epoch": 3.26962457337884, "grad_norm": 0.9316064119338989, "learning_rate": 0.00034698521046643916, "loss": 2.0801, "step": 2874 }, { "epoch": 3.2707622298065986, "grad_norm": 0.9166231155395508, "learning_rate": 0.00034675767918088737, "loss": 1.661, "step": 2875 }, { "epoch": 3.2718998862343573, "grad_norm": 1.0802944898605347, "learning_rate": 0.0003465301478953356, "loss": 3.3163, "step": 2876 }, { "epoch": 3.273037542662116, "grad_norm": 1.2445770502090454, "learning_rate": 0.00034630261660978384, "loss": 2.9616, "step": 2877 }, { "epoch": 3.274175199089875, "grad_norm": 0.8100591897964478, "learning_rate": 0.0003460750853242321, "loss": 1.1971, "step": 2878 }, { "epoch": 3.2753128555176336, "grad_norm": 0.9066864252090454, "learning_rate": 0.00034584755403868037, "loss": 1.7813, "step": 2879 }, { "epoch": 3.2764505119453924, "grad_norm": 1.0590927600860596, "learning_rate": 0.0003456200227531285, "loss": 2.0441, "step": 2880 }, { "epoch": 3.277588168373151, "grad_norm": 0.6991727352142334, "learning_rate": 0.0003453924914675768, "loss": 1.2761, "step": 2881 }, { "epoch": 3.27872582480091, "grad_norm": 0.6339994668960571, "learning_rate": 0.00034516496018202505, "loss": 1.0168, "step": 2882 }, { "epoch": 3.279863481228669, "grad_norm": 0.630440890789032, "learning_rate": 
0.00034493742889647326, "loss": 1.3744, "step": 2883 }, { "epoch": 3.281001137656428, "grad_norm": 0.8100972771644592, "learning_rate": 0.0003447098976109215, "loss": 1.9117, "step": 2884 }, { "epoch": 3.2821387940841866, "grad_norm": 0.9163875579833984, "learning_rate": 0.00034448236632536974, "loss": 1.1392, "step": 2885 }, { "epoch": 3.2832764505119454, "grad_norm": 0.9157561659812927, "learning_rate": 0.00034425483503981795, "loss": 1.9776, "step": 2886 }, { "epoch": 3.284414106939704, "grad_norm": 0.7969180345535278, "learning_rate": 0.0003440273037542662, "loss": 1.4908, "step": 2887 }, { "epoch": 3.285551763367463, "grad_norm": 0.7929701209068298, "learning_rate": 0.0003437997724687145, "loss": 1.6476, "step": 2888 }, { "epoch": 3.2866894197952217, "grad_norm": 1.0109394788742065, "learning_rate": 0.00034357224118316274, "loss": 2.3243, "step": 2889 }, { "epoch": 3.287827076222981, "grad_norm": 1.095586895942688, "learning_rate": 0.0003433447098976109, "loss": 2.0614, "step": 2890 }, { "epoch": 3.2889647326507396, "grad_norm": 0.8181962966918945, "learning_rate": 0.00034311717861205916, "loss": 1.0926, "step": 2891 }, { "epoch": 3.2901023890784984, "grad_norm": 0.5682795643806458, "learning_rate": 0.0003428896473265074, "loss": 0.5625, "step": 2892 }, { "epoch": 3.291240045506257, "grad_norm": 0.8557625412940979, "learning_rate": 0.00034266211604095563, "loss": 1.8372, "step": 2893 }, { "epoch": 3.292377701934016, "grad_norm": 0.6423472166061401, "learning_rate": 0.0003424345847554039, "loss": 1.3594, "step": 2894 }, { "epoch": 3.2935153583617747, "grad_norm": 1.044715404510498, "learning_rate": 0.0003422070534698521, "loss": 2.7601, "step": 2895 }, { "epoch": 3.2946530147895334, "grad_norm": 0.7515947222709656, "learning_rate": 0.0003419795221843003, "loss": 1.1865, "step": 2896 }, { "epoch": 3.295790671217292, "grad_norm": 0.9097882509231567, "learning_rate": 0.0003417519908987486, "loss": 1.4461, "step": 2897 }, { "epoch": 3.296928327645051, "grad_norm": 
0.7172821760177612, "learning_rate": 0.00034152445961319684, "loss": 2.2885, "step": 2898 }, { "epoch": 3.29806598407281, "grad_norm": 1.1029298305511475, "learning_rate": 0.0003412969283276451, "loss": 1.5583, "step": 2899 }, { "epoch": 3.299203640500569, "grad_norm": 0.853276252746582, "learning_rate": 0.00034106939704209326, "loss": 1.4709, "step": 2900 }, { "epoch": 3.3003412969283277, "grad_norm": 0.8551457524299622, "learning_rate": 0.00034084186575654153, "loss": 2.4956, "step": 2901 }, { "epoch": 3.3014789533560864, "grad_norm": 0.909735381603241, "learning_rate": 0.0003406143344709898, "loss": 2.0525, "step": 2902 }, { "epoch": 3.302616609783845, "grad_norm": 1.0006370544433594, "learning_rate": 0.000340386803185438, "loss": 1.6306, "step": 2903 }, { "epoch": 3.303754266211604, "grad_norm": 0.5882608294487, "learning_rate": 0.00034015927189988627, "loss": 1.3989, "step": 2904 }, { "epoch": 3.3048919226393627, "grad_norm": 1.2735271453857422, "learning_rate": 0.0003399317406143345, "loss": 2.394, "step": 2905 }, { "epoch": 3.306029579067122, "grad_norm": 0.9738203287124634, "learning_rate": 0.0003397042093287827, "loss": 1.951, "step": 2906 }, { "epoch": 3.3071672354948807, "grad_norm": 0.5416101217269897, "learning_rate": 0.00033947667804323095, "loss": 1.0747, "step": 2907 }, { "epoch": 3.3083048919226394, "grad_norm": 0.9188492298126221, "learning_rate": 0.0003392491467576792, "loss": 1.4549, "step": 2908 }, { "epoch": 3.309442548350398, "grad_norm": 1.4744441509246826, "learning_rate": 0.0003390216154721274, "loss": 2.4909, "step": 2909 }, { "epoch": 3.310580204778157, "grad_norm": 0.6633999347686768, "learning_rate": 0.00033879408418657563, "loss": 1.4089, "step": 2910 }, { "epoch": 3.3117178612059157, "grad_norm": 0.6373212933540344, "learning_rate": 0.0003385665529010239, "loss": 1.4588, "step": 2911 }, { "epoch": 3.3128555176336745, "grad_norm": 0.5806794166564941, "learning_rate": 0.00033833902161547216, "loss": 1.3158, "step": 2912 }, { "epoch": 
3.3139931740614337, "grad_norm": 1.0303038358688354, "learning_rate": 0.00033811149032992037, "loss": 1.6245, "step": 2913 }, { "epoch": 3.3151308304891924, "grad_norm": 1.0633960962295532, "learning_rate": 0.0003378839590443686, "loss": 1.9088, "step": 2914 }, { "epoch": 3.316268486916951, "grad_norm": 0.9028981328010559, "learning_rate": 0.00033765642775881685, "loss": 1.64, "step": 2915 }, { "epoch": 3.31740614334471, "grad_norm": 0.886370062828064, "learning_rate": 0.00033742889647326506, "loss": 1.292, "step": 2916 }, { "epoch": 3.3185437997724687, "grad_norm": 1.6007177829742432, "learning_rate": 0.0003372013651877133, "loss": 3.1382, "step": 2917 }, { "epoch": 3.3196814562002275, "grad_norm": 1.1357500553131104, "learning_rate": 0.0003369738339021616, "loss": 2.1361, "step": 2918 }, { "epoch": 3.3208191126279862, "grad_norm": 0.9156873822212219, "learning_rate": 0.0003367463026166098, "loss": 1.3616, "step": 2919 }, { "epoch": 3.321956769055745, "grad_norm": 1.0556998252868652, "learning_rate": 0.000336518771331058, "loss": 2.6077, "step": 2920 }, { "epoch": 3.3230944254835038, "grad_norm": 1.1073150634765625, "learning_rate": 0.00033629124004550627, "loss": 1.6265, "step": 2921 }, { "epoch": 3.324232081911263, "grad_norm": 0.6052758097648621, "learning_rate": 0.00033606370875995453, "loss": 1.0629, "step": 2922 }, { "epoch": 3.3253697383390217, "grad_norm": 0.9209448099136353, "learning_rate": 0.00033583617747440274, "loss": 1.3336, "step": 2923 }, { "epoch": 3.3265073947667805, "grad_norm": 0.5461276769638062, "learning_rate": 0.00033560864618885095, "loss": 1.1149, "step": 2924 }, { "epoch": 3.3276450511945392, "grad_norm": 1.0375051498413086, "learning_rate": 0.0003353811149032992, "loss": 1.4975, "step": 2925 }, { "epoch": 3.328782707622298, "grad_norm": 0.9948595762252808, "learning_rate": 0.0003351535836177474, "loss": 1.7277, "step": 2926 }, { "epoch": 3.3299203640500568, "grad_norm": 1.711958408355713, "learning_rate": 0.0003349260523321957, "loss": 
2.7451, "step": 2927 }, { "epoch": 3.3310580204778155, "grad_norm": 0.9110103845596313, "learning_rate": 0.00033469852104664395, "loss": 1.8131, "step": 2928 }, { "epoch": 3.3321956769055747, "grad_norm": 0.9176352620124817, "learning_rate": 0.0003344709897610921, "loss": 1.4978, "step": 2929 }, { "epoch": 3.3333333333333335, "grad_norm": 0.8127102255821228, "learning_rate": 0.0003342434584755404, "loss": 1.8899, "step": 2930 }, { "epoch": 3.3344709897610922, "grad_norm": 1.1340645551681519, "learning_rate": 0.00033401592718998864, "loss": 1.3567, "step": 2931 }, { "epoch": 3.335608646188851, "grad_norm": 1.366903305053711, "learning_rate": 0.0003337883959044369, "loss": 3.0076, "step": 2932 }, { "epoch": 3.3367463026166098, "grad_norm": 0.8700219392776489, "learning_rate": 0.0003335608646188851, "loss": 1.2597, "step": 2933 }, { "epoch": 3.3378839590443685, "grad_norm": 0.9277002215385437, "learning_rate": 0.0003333333333333333, "loss": 2.267, "step": 2934 }, { "epoch": 3.3390216154721273, "grad_norm": 0.5930815935134888, "learning_rate": 0.0003331058020477816, "loss": 1.3837, "step": 2935 }, { "epoch": 3.3401592718998865, "grad_norm": 1.0349963903427124, "learning_rate": 0.0003328782707622298, "loss": 2.4195, "step": 2936 }, { "epoch": 3.3412969283276452, "grad_norm": 1.2994656562805176, "learning_rate": 0.00033265073947667806, "loss": 1.4095, "step": 2937 }, { "epoch": 3.342434584755404, "grad_norm": 0.7547191381454468, "learning_rate": 0.0003324232081911263, "loss": 1.6748, "step": 2938 }, { "epoch": 3.3435722411831628, "grad_norm": 0.9032368659973145, "learning_rate": 0.0003321956769055745, "loss": 2.3666, "step": 2939 }, { "epoch": 3.3447098976109215, "grad_norm": 0.8295652866363525, "learning_rate": 0.00033196814562002274, "loss": 1.4128, "step": 2940 }, { "epoch": 3.3458475540386803, "grad_norm": 0.6656214594841003, "learning_rate": 0.000331740614334471, "loss": 0.8634, "step": 2941 }, { "epoch": 3.346985210466439, "grad_norm": 0.9207725524902344, 
"learning_rate": 0.00033151308304891927, "loss": 2.3504, "step": 2942 }, { "epoch": 3.348122866894198, "grad_norm": 0.7400831580162048, "learning_rate": 0.00033128555176336743, "loss": 0.9548, "step": 2943 }, { "epoch": 3.3492605233219566, "grad_norm": 0.9420154690742493, "learning_rate": 0.0003310580204778157, "loss": 2.347, "step": 2944 }, { "epoch": 3.3503981797497158, "grad_norm": 0.9879665970802307, "learning_rate": 0.00033083048919226396, "loss": 1.6713, "step": 2945 }, { "epoch": 3.3515358361774745, "grad_norm": 0.8822195529937744, "learning_rate": 0.00033060295790671217, "loss": 0.9356, "step": 2946 }, { "epoch": 3.3526734926052333, "grad_norm": 0.959632933139801, "learning_rate": 0.00033037542662116043, "loss": 2.099, "step": 2947 }, { "epoch": 3.353811149032992, "grad_norm": 0.8753674626350403, "learning_rate": 0.00033014789533560864, "loss": 1.0478, "step": 2948 }, { "epoch": 3.354948805460751, "grad_norm": 0.9367252588272095, "learning_rate": 0.00032992036405005685, "loss": 2.0897, "step": 2949 }, { "epoch": 3.3560864618885096, "grad_norm": 1.059227705001831, "learning_rate": 0.0003296928327645051, "loss": 2.1907, "step": 2950 }, { "epoch": 3.3572241183162683, "grad_norm": 1.2451213598251343, "learning_rate": 0.0003294653014789534, "loss": 2.9248, "step": 2951 }, { "epoch": 3.3583617747440275, "grad_norm": 1.4228962659835815, "learning_rate": 0.00032923777019340164, "loss": 4.301, "step": 2952 }, { "epoch": 3.3594994311717863, "grad_norm": 1.0747414827346802, "learning_rate": 0.0003290102389078498, "loss": 1.7888, "step": 2953 }, { "epoch": 3.360637087599545, "grad_norm": 1.38980233669281, "learning_rate": 0.00032878270762229806, "loss": 0.5853, "step": 2954 }, { "epoch": 3.361774744027304, "grad_norm": 0.793297290802002, "learning_rate": 0.0003285551763367463, "loss": 1.1242, "step": 2955 }, { "epoch": 3.3629124004550626, "grad_norm": 2.7201013565063477, "learning_rate": 0.00032832764505119454, "loss": 4.6065, "step": 2956 }, { "epoch": 
3.3640500568828213, "grad_norm": 1.3290942907333374, "learning_rate": 0.0003281001137656428, "loss": 2.6127, "step": 2957 }, { "epoch": 3.36518771331058, "grad_norm": 1.104690670967102, "learning_rate": 0.000327872582480091, "loss": 1.6713, "step": 2958 }, { "epoch": 3.366325369738339, "grad_norm": 0.6908164620399475, "learning_rate": 0.0003276450511945392, "loss": 1.0, "step": 2959 }, { "epoch": 3.3674630261660976, "grad_norm": 0.8555911779403687, "learning_rate": 0.0003274175199089875, "loss": 1.6721, "step": 2960 }, { "epoch": 3.368600682593857, "grad_norm": 0.8147174119949341, "learning_rate": 0.00032718998862343575, "loss": 2.1911, "step": 2961 }, { "epoch": 3.3697383390216156, "grad_norm": 1.1406668424606323, "learning_rate": 0.000326962457337884, "loss": 2.4286, "step": 2962 }, { "epoch": 3.3708759954493743, "grad_norm": 0.8895596861839294, "learning_rate": 0.00032673492605233217, "loss": 1.8801, "step": 2963 }, { "epoch": 3.372013651877133, "grad_norm": 0.6777999401092529, "learning_rate": 0.00032650739476678043, "loss": 0.8933, "step": 2964 }, { "epoch": 3.373151308304892, "grad_norm": 0.8110511898994446, "learning_rate": 0.0003262798634812287, "loss": 1.6234, "step": 2965 }, { "epoch": 3.3742889647326506, "grad_norm": 1.3193098306655884, "learning_rate": 0.0003260523321956769, "loss": 2.0755, "step": 2966 }, { "epoch": 3.3754266211604094, "grad_norm": 1.0191433429718018, "learning_rate": 0.00032582480091012517, "loss": 2.6676, "step": 2967 }, { "epoch": 3.3765642775881686, "grad_norm": 0.5699923038482666, "learning_rate": 0.0003255972696245734, "loss": 1.0331, "step": 2968 }, { "epoch": 3.3777019340159273, "grad_norm": 1.2073493003845215, "learning_rate": 0.0003253697383390216, "loss": 3.2836, "step": 2969 }, { "epoch": 3.378839590443686, "grad_norm": 0.5660185217857361, "learning_rate": 0.00032514220705346985, "loss": 0.8925, "step": 2970 }, { "epoch": 3.379977246871445, "grad_norm": 1.1539686918258667, "learning_rate": 0.0003249146757679181, "loss": 
1.6701, "step": 2971 }, { "epoch": 3.3811149032992036, "grad_norm": 1.223471999168396, "learning_rate": 0.0003246871444823664, "loss": 2.0681, "step": 2972 }, { "epoch": 3.3822525597269624, "grad_norm": 0.7714402675628662, "learning_rate": 0.00032445961319681454, "loss": 1.9618, "step": 2973 }, { "epoch": 3.383390216154721, "grad_norm": 1.1004797220230103, "learning_rate": 0.0003242320819112628, "loss": 1.4624, "step": 2974 }, { "epoch": 3.3845278725824803, "grad_norm": 0.7591771483421326, "learning_rate": 0.00032400455062571107, "loss": 1.1837, "step": 2975 }, { "epoch": 3.385665529010239, "grad_norm": 0.9769864082336426, "learning_rate": 0.0003237770193401593, "loss": 1.6725, "step": 2976 }, { "epoch": 3.386803185437998, "grad_norm": 1.0868338346481323, "learning_rate": 0.0003235494880546075, "loss": 2.3017, "step": 2977 }, { "epoch": 3.3879408418657566, "grad_norm": 0.943532407283783, "learning_rate": 0.00032332195676905575, "loss": 2.1185, "step": 2978 }, { "epoch": 3.3890784982935154, "grad_norm": 1.0216169357299805, "learning_rate": 0.00032309442548350396, "loss": 1.9361, "step": 2979 }, { "epoch": 3.390216154721274, "grad_norm": 0.9502822756767273, "learning_rate": 0.0003228668941979522, "loss": 0.9628, "step": 2980 }, { "epoch": 3.391353811149033, "grad_norm": 0.6996874213218689, "learning_rate": 0.0003226393629124005, "loss": 1.223, "step": 2981 }, { "epoch": 3.3924914675767917, "grad_norm": 0.9376258254051208, "learning_rate": 0.0003224118316268487, "loss": 1.4156, "step": 2982 }, { "epoch": 3.3936291240045504, "grad_norm": 1.000174880027771, "learning_rate": 0.0003221843003412969, "loss": 2.2908, "step": 2983 }, { "epoch": 3.3947667804323096, "grad_norm": 1.1017203330993652, "learning_rate": 0.00032195676905574517, "loss": 2.837, "step": 2984 }, { "epoch": 3.3959044368600684, "grad_norm": 0.7534080147743225, "learning_rate": 0.00032172923777019344, "loss": 1.6282, "step": 2985 }, { "epoch": 3.397042093287827, "grad_norm": 0.8684095144271851, 
"learning_rate": 0.00032150170648464165, "loss": 2.2301, "step": 2986 }, { "epoch": 3.398179749715586, "grad_norm": 1.0283501148223877, "learning_rate": 0.00032127417519908986, "loss": 2.0298, "step": 2987 }, { "epoch": 3.3993174061433447, "grad_norm": 0.8715803623199463, "learning_rate": 0.0003210466439135381, "loss": 1.5656, "step": 2988 }, { "epoch": 3.4004550625711034, "grad_norm": 1.0342568159103394, "learning_rate": 0.00032081911262798633, "loss": 1.8496, "step": 2989 }, { "epoch": 3.401592718998862, "grad_norm": 1.0820322036743164, "learning_rate": 0.0003205915813424346, "loss": 1.4579, "step": 2990 }, { "epoch": 3.4027303754266214, "grad_norm": 0.9718009233474731, "learning_rate": 0.00032036405005688286, "loss": 1.8414, "step": 2991 }, { "epoch": 3.40386803185438, "grad_norm": 0.6825365424156189, "learning_rate": 0.00032013651877133107, "loss": 1.4311, "step": 2992 }, { "epoch": 3.405005688282139, "grad_norm": 0.7811647057533264, "learning_rate": 0.0003199089874857793, "loss": 0.7493, "step": 2993 }, { "epoch": 3.4061433447098977, "grad_norm": 0.8294082283973694, "learning_rate": 0.00031968145620022754, "loss": 1.5319, "step": 2994 }, { "epoch": 3.4072810011376564, "grad_norm": 1.365054965019226, "learning_rate": 0.0003194539249146758, "loss": 3.0346, "step": 2995 }, { "epoch": 3.408418657565415, "grad_norm": 1.0830689668655396, "learning_rate": 0.000319226393629124, "loss": 2.3795, "step": 2996 }, { "epoch": 3.409556313993174, "grad_norm": 1.158379077911377, "learning_rate": 0.0003189988623435722, "loss": 1.6544, "step": 2997 }, { "epoch": 3.4106939704209327, "grad_norm": 0.8690700531005859, "learning_rate": 0.0003187713310580205, "loss": 1.473, "step": 2998 }, { "epoch": 3.4118316268486915, "grad_norm": 0.7095939517021179, "learning_rate": 0.0003185437997724687, "loss": 1.4625, "step": 2999 }, { "epoch": 3.4129692832764507, "grad_norm": 1.2548210620880127, "learning_rate": 0.00031831626848691696, "loss": 1.8051, "step": 3000 }, { "epoch": 
3.4141069397042094, "grad_norm": 1.4674617052078247, "learning_rate": 0.00031808873720136523, "loss": 1.9178, "step": 3001 }, { "epoch": 3.415244596131968, "grad_norm": 0.9363518953323364, "learning_rate": 0.00031786120591581344, "loss": 2.3612, "step": 3002 }, { "epoch": 3.416382252559727, "grad_norm": 1.0488542318344116, "learning_rate": 0.00031763367463026165, "loss": 2.0528, "step": 3003 }, { "epoch": 3.4175199089874857, "grad_norm": 1.0611392259597778, "learning_rate": 0.0003174061433447099, "loss": 1.2209, "step": 3004 }, { "epoch": 3.4186575654152445, "grad_norm": 0.7108457088470459, "learning_rate": 0.0003171786120591582, "loss": 2.2781, "step": 3005 }, { "epoch": 3.419795221843003, "grad_norm": 0.7717037200927734, "learning_rate": 0.0003169510807736064, "loss": 1.8069, "step": 3006 }, { "epoch": 3.4209328782707624, "grad_norm": 1.1115652322769165, "learning_rate": 0.0003167235494880546, "loss": 2.1257, "step": 3007 }, { "epoch": 3.422070534698521, "grad_norm": 1.1573059558868408, "learning_rate": 0.00031649601820250286, "loss": 1.5691, "step": 3008 }, { "epoch": 3.42320819112628, "grad_norm": 0.7534975409507751, "learning_rate": 0.00031626848691695107, "loss": 1.6702, "step": 3009 }, { "epoch": 3.4243458475540387, "grad_norm": 0.8538755178451538, "learning_rate": 0.00031604095563139933, "loss": 1.8829, "step": 3010 }, { "epoch": 3.4254835039817975, "grad_norm": 1.041550636291504, "learning_rate": 0.00031581342434584754, "loss": 1.8357, "step": 3011 }, { "epoch": 3.426621160409556, "grad_norm": 0.9445088505744934, "learning_rate": 0.0003155858930602958, "loss": 2.0357, "step": 3012 }, { "epoch": 3.427758816837315, "grad_norm": 0.8078857064247131, "learning_rate": 0.000315358361774744, "loss": 1.6423, "step": 3013 }, { "epoch": 3.428896473265074, "grad_norm": 1.6331647634506226, "learning_rate": 0.0003151308304891923, "loss": 2.6264, "step": 3014 }, { "epoch": 3.430034129692833, "grad_norm": 1.0118826627731323, "learning_rate": 0.00031490329920364055, 
"loss": 2.5419, "step": 3015 }, { "epoch": 3.4311717861205917, "grad_norm": 1.1630852222442627, "learning_rate": 0.0003146757679180887, "loss": 2.345, "step": 3016 }, { "epoch": 3.4323094425483505, "grad_norm": 1.082171082496643, "learning_rate": 0.00031444823663253697, "loss": 1.6889, "step": 3017 }, { "epoch": 3.4334470989761092, "grad_norm": 0.9755992889404297, "learning_rate": 0.00031422070534698523, "loss": 1.599, "step": 3018 }, { "epoch": 3.434584755403868, "grad_norm": 1.2539499998092651, "learning_rate": 0.00031399317406143344, "loss": 3.2968, "step": 3019 }, { "epoch": 3.4357224118316267, "grad_norm": 0.8818562030792236, "learning_rate": 0.0003137656427758817, "loss": 2.6973, "step": 3020 }, { "epoch": 3.4368600682593855, "grad_norm": 0.900875449180603, "learning_rate": 0.0003135381114903299, "loss": 1.6828, "step": 3021 }, { "epoch": 3.4379977246871443, "grad_norm": 0.7886406779289246, "learning_rate": 0.0003133105802047782, "loss": 2.1497, "step": 3022 }, { "epoch": 3.4391353811149035, "grad_norm": 1.0653672218322754, "learning_rate": 0.0003130830489192264, "loss": 1.2112, "step": 3023 }, { "epoch": 3.4402730375426622, "grad_norm": 0.8921288251876831, "learning_rate": 0.00031285551763367465, "loss": 1.5596, "step": 3024 }, { "epoch": 3.441410693970421, "grad_norm": 1.0506072044372559, "learning_rate": 0.0003126279863481229, "loss": 1.9131, "step": 3025 }, { "epoch": 3.4425483503981797, "grad_norm": 0.9992786645889282, "learning_rate": 0.00031240045506257107, "loss": 2.2033, "step": 3026 }, { "epoch": 3.4436860068259385, "grad_norm": 1.0742168426513672, "learning_rate": 0.00031217292377701934, "loss": 1.793, "step": 3027 }, { "epoch": 3.4448236632536973, "grad_norm": 1.2853068113327026, "learning_rate": 0.0003119453924914676, "loss": 2.5013, "step": 3028 }, { "epoch": 3.445961319681456, "grad_norm": 1.1094430685043335, "learning_rate": 0.0003117178612059158, "loss": 2.518, "step": 3029 }, { "epoch": 3.4470989761092152, "grad_norm": 1.2212568521499634, 
"learning_rate": 0.0003114903299203641, "loss": 1.1286, "step": 3030 }, { "epoch": 3.448236632536974, "grad_norm": 0.8297713398933411, "learning_rate": 0.0003112627986348123, "loss": 2.0985, "step": 3031 }, { "epoch": 3.4493742889647327, "grad_norm": 1.1644102334976196, "learning_rate": 0.0003110352673492605, "loss": 2.4079, "step": 3032 }, { "epoch": 3.4505119453924915, "grad_norm": 0.8777130842208862, "learning_rate": 0.00031080773606370876, "loss": 3.0359, "step": 3033 }, { "epoch": 3.4516496018202503, "grad_norm": 1.0054441690444946, "learning_rate": 0.000310580204778157, "loss": 1.4005, "step": 3034 }, { "epoch": 3.452787258248009, "grad_norm": 0.7273943424224854, "learning_rate": 0.0003103526734926053, "loss": 0.9642, "step": 3035 }, { "epoch": 3.453924914675768, "grad_norm": 0.9213597774505615, "learning_rate": 0.00031012514220705344, "loss": 1.603, "step": 3036 }, { "epoch": 3.4550625711035265, "grad_norm": 0.8535187840461731, "learning_rate": 0.0003098976109215017, "loss": 1.8995, "step": 3037 }, { "epoch": 3.4562002275312853, "grad_norm": 0.6558015942573547, "learning_rate": 0.00030967007963594997, "loss": 1.7263, "step": 3038 }, { "epoch": 3.4573378839590445, "grad_norm": 0.8745822310447693, "learning_rate": 0.0003094425483503982, "loss": 2.1463, "step": 3039 }, { "epoch": 3.4584755403868033, "grad_norm": 0.829898476600647, "learning_rate": 0.00030921501706484644, "loss": 1.1866, "step": 3040 }, { "epoch": 3.459613196814562, "grad_norm": 0.7422157526016235, "learning_rate": 0.00030898748577929465, "loss": 1.6347, "step": 3041 }, { "epoch": 3.460750853242321, "grad_norm": 1.017354965209961, "learning_rate": 0.00030875995449374286, "loss": 2.092, "step": 3042 }, { "epoch": 3.4618885096700796, "grad_norm": 0.9957495331764221, "learning_rate": 0.00030853242320819113, "loss": 1.6618, "step": 3043 }, { "epoch": 3.4630261660978383, "grad_norm": 0.9194496273994446, "learning_rate": 0.0003083048919226394, "loss": 1.3234, "step": 3044 }, { "epoch": 
3.464163822525597, "grad_norm": 0.8070917129516602, "learning_rate": 0.00030807736063708766, "loss": 1.0518, "step": 3045 }, { "epoch": 3.4653014789533563, "grad_norm": 0.9527956247329712, "learning_rate": 0.0003078498293515358, "loss": 1.8434, "step": 3046 }, { "epoch": 3.466439135381115, "grad_norm": 0.9919708967208862, "learning_rate": 0.0003076222980659841, "loss": 1.8574, "step": 3047 }, { "epoch": 3.467576791808874, "grad_norm": 0.7566163539886475, "learning_rate": 0.00030739476678043234, "loss": 2.2815, "step": 3048 }, { "epoch": 3.4687144482366326, "grad_norm": 0.9245063066482544, "learning_rate": 0.00030716723549488055, "loss": 1.8061, "step": 3049 }, { "epoch": 3.4698521046643913, "grad_norm": 0.5763617753982544, "learning_rate": 0.00030693970420932876, "loss": 0.8955, "step": 3050 }, { "epoch": 3.47098976109215, "grad_norm": 0.6619337797164917, "learning_rate": 0.000306712172923777, "loss": 0.8753, "step": 3051 }, { "epoch": 3.472127417519909, "grad_norm": 0.9321338534355164, "learning_rate": 0.00030648464163822523, "loss": 1.3895, "step": 3052 }, { "epoch": 3.473265073947668, "grad_norm": 1.0468645095825195, "learning_rate": 0.0003062571103526735, "loss": 1.9941, "step": 3053 }, { "epoch": 3.474402730375427, "grad_norm": 1.2254489660263062, "learning_rate": 0.00030602957906712176, "loss": 2.2438, "step": 3054 }, { "epoch": 3.4755403868031856, "grad_norm": 0.965639591217041, "learning_rate": 0.00030580204778156997, "loss": 1.6008, "step": 3055 }, { "epoch": 3.4766780432309443, "grad_norm": 0.7972604632377625, "learning_rate": 0.0003055745164960182, "loss": 1.2332, "step": 3056 }, { "epoch": 3.477815699658703, "grad_norm": 0.8752893209457397, "learning_rate": 0.00030534698521046645, "loss": 1.3047, "step": 3057 }, { "epoch": 3.478953356086462, "grad_norm": 0.7717936038970947, "learning_rate": 0.0003051194539249147, "loss": 1.4136, "step": 3058 }, { "epoch": 3.4800910125142206, "grad_norm": 0.8218960165977478, "learning_rate": 0.0003048919226393629, 
"loss": 1.5268, "step": 3059 }, { "epoch": 3.4812286689419794, "grad_norm": 0.8942006230354309, "learning_rate": 0.00030466439135381113, "loss": 1.9143, "step": 3060 }, { "epoch": 3.482366325369738, "grad_norm": 0.8847683668136597, "learning_rate": 0.0003044368600682594, "loss": 2.8478, "step": 3061 }, { "epoch": 3.4835039817974973, "grad_norm": 0.8269094228744507, "learning_rate": 0.0003042093287827076, "loss": 2.0533, "step": 3062 }, { "epoch": 3.484641638225256, "grad_norm": 0.7389246225357056, "learning_rate": 0.00030398179749715587, "loss": 1.5518, "step": 3063 }, { "epoch": 3.485779294653015, "grad_norm": 0.7460439205169678, "learning_rate": 0.00030375426621160413, "loss": 0.9565, "step": 3064 }, { "epoch": 3.4869169510807736, "grad_norm": 0.8218055367469788, "learning_rate": 0.00030352673492605234, "loss": 1.8419, "step": 3065 }, { "epoch": 3.4880546075085324, "grad_norm": 1.2303011417388916, "learning_rate": 0.00030329920364050055, "loss": 2.0404, "step": 3066 }, { "epoch": 3.489192263936291, "grad_norm": 0.7625948190689087, "learning_rate": 0.0003030716723549488, "loss": 1.502, "step": 3067 }, { "epoch": 3.49032992036405, "grad_norm": 0.9459888339042664, "learning_rate": 0.0003028441410693971, "loss": 1.4141, "step": 3068 }, { "epoch": 3.491467576791809, "grad_norm": 0.9511308670043945, "learning_rate": 0.0003026166097838453, "loss": 2.9199, "step": 3069 }, { "epoch": 3.492605233219568, "grad_norm": 0.9939665198326111, "learning_rate": 0.0003023890784982935, "loss": 1.7376, "step": 3070 }, { "epoch": 3.4937428896473266, "grad_norm": 1.1104254722595215, "learning_rate": 0.00030216154721274176, "loss": 2.1018, "step": 3071 }, { "epoch": 3.4948805460750854, "grad_norm": 0.8199077248573303, "learning_rate": 0.00030193401592718997, "loss": 2.0834, "step": 3072 }, { "epoch": 3.496018202502844, "grad_norm": 0.7726981043815613, "learning_rate": 0.00030170648464163824, "loss": 1.2873, "step": 3073 }, { "epoch": 3.497155858930603, "grad_norm": 1.0992076396942139, 
"learning_rate": 0.0003014789533560865, "loss": 2.3819, "step": 3074 }, { "epoch": 3.4982935153583616, "grad_norm": 0.7880586385726929, "learning_rate": 0.0003012514220705347, "loss": 2.0231, "step": 3075 }, { "epoch": 3.4994311717861204, "grad_norm": 0.5555304884910583, "learning_rate": 0.0003010238907849829, "loss": 1.0044, "step": 3076 }, { "epoch": 3.500568828213879, "grad_norm": 1.2383909225463867, "learning_rate": 0.0003007963594994312, "loss": 1.2884, "step": 3077 }, { "epoch": 3.5017064846416384, "grad_norm": 1.214747667312622, "learning_rate": 0.00030056882821387945, "loss": 1.4196, "step": 3078 }, { "epoch": 3.502844141069397, "grad_norm": 0.7822383046150208, "learning_rate": 0.00030034129692832766, "loss": 1.3068, "step": 3079 }, { "epoch": 3.503981797497156, "grad_norm": 1.200726866722107, "learning_rate": 0.00030011376564277587, "loss": 2.0791, "step": 3080 }, { "epoch": 3.5051194539249146, "grad_norm": 0.7341413497924805, "learning_rate": 0.00029988623435722413, "loss": 0.9889, "step": 3081 }, { "epoch": 3.5062571103526734, "grad_norm": 1.4545503854751587, "learning_rate": 0.00029965870307167234, "loss": 2.6975, "step": 3082 }, { "epoch": 3.507394766780432, "grad_norm": 0.8442320227622986, "learning_rate": 0.0002994311717861206, "loss": 1.6271, "step": 3083 }, { "epoch": 3.508532423208191, "grad_norm": 0.7989565134048462, "learning_rate": 0.0002992036405005688, "loss": 0.9025, "step": 3084 }, { "epoch": 3.50967007963595, "grad_norm": 0.9094103574752808, "learning_rate": 0.0002989761092150171, "loss": 1.4245, "step": 3085 }, { "epoch": 3.510807736063709, "grad_norm": 1.1836727857589722, "learning_rate": 0.0002987485779294653, "loss": 2.3158, "step": 3086 }, { "epoch": 3.5119453924914676, "grad_norm": 0.9379547238349915, "learning_rate": 0.00029852104664391355, "loss": 1.3348, "step": 3087 }, { "epoch": 3.5130830489192264, "grad_norm": 1.4176782369613647, "learning_rate": 0.0002982935153583618, "loss": 2.056, "step": 3088 }, { "epoch": 
3.514220705346985, "grad_norm": 0.8050400018692017, "learning_rate": 0.00029806598407281, "loss": 1.7547, "step": 3089 }, { "epoch": 3.515358361774744, "grad_norm": 0.6850399971008301, "learning_rate": 0.00029783845278725824, "loss": 0.8674, "step": 3090 }, { "epoch": 3.5164960182025027, "grad_norm": 1.1448007822036743, "learning_rate": 0.0002976109215017065, "loss": 2.3783, "step": 3091 }, { "epoch": 3.517633674630262, "grad_norm": 0.645057737827301, "learning_rate": 0.0002973833902161547, "loss": 0.9246, "step": 3092 }, { "epoch": 3.51877133105802, "grad_norm": 1.0810428857803345, "learning_rate": 0.000297155858930603, "loss": 1.9262, "step": 3093 }, { "epoch": 3.5199089874857794, "grad_norm": 0.7463341951370239, "learning_rate": 0.0002969283276450512, "loss": 1.8343, "step": 3094 }, { "epoch": 3.521046643913538, "grad_norm": 1.1941145658493042, "learning_rate": 0.00029670079635949945, "loss": 3.0099, "step": 3095 }, { "epoch": 3.522184300341297, "grad_norm": 1.1003155708312988, "learning_rate": 0.00029647326507394766, "loss": 1.9862, "step": 3096 }, { "epoch": 3.5233219567690557, "grad_norm": 0.7779751420021057, "learning_rate": 0.0002962457337883959, "loss": 1.5242, "step": 3097 }, { "epoch": 3.5244596131968144, "grad_norm": 0.7989513874053955, "learning_rate": 0.0002960182025028442, "loss": 1.2492, "step": 3098 }, { "epoch": 3.5255972696245736, "grad_norm": 0.9354263544082642, "learning_rate": 0.00029579067121729234, "loss": 1.1991, "step": 3099 }, { "epoch": 3.526734926052332, "grad_norm": 1.2096282243728638, "learning_rate": 0.0002955631399317406, "loss": 2.2768, "step": 3100 }, { "epoch": 3.527872582480091, "grad_norm": 1.2972865104675293, "learning_rate": 0.00029533560864618887, "loss": 2.3998, "step": 3101 }, { "epoch": 3.52901023890785, "grad_norm": 0.9327741861343384, "learning_rate": 0.0002951080773606371, "loss": 2.3759, "step": 3102 }, { "epoch": 3.5301478953356087, "grad_norm": 1.7685364484786987, "learning_rate": 0.00029488054607508535, "loss": 
2.9593, "step": 3103 }, { "epoch": 3.5312855517633674, "grad_norm": 1.1170707941055298, "learning_rate": 0.00029465301478953356, "loss": 1.7477, "step": 3104 }, { "epoch": 3.532423208191126, "grad_norm": 0.635349452495575, "learning_rate": 0.0002944254835039818, "loss": 0.4989, "step": 3105 }, { "epoch": 3.533560864618885, "grad_norm": 1.165709376335144, "learning_rate": 0.00029419795221843003, "loss": 1.5922, "step": 3106 }, { "epoch": 3.5346985210466437, "grad_norm": 0.8228427171707153, "learning_rate": 0.0002939704209328783, "loss": 1.6231, "step": 3107 }, { "epoch": 3.535836177474403, "grad_norm": 0.7035273313522339, "learning_rate": 0.00029374288964732656, "loss": 1.2547, "step": 3108 }, { "epoch": 3.5369738339021617, "grad_norm": 0.9411433935165405, "learning_rate": 0.0002935153583617747, "loss": 1.4946, "step": 3109 }, { "epoch": 3.5381114903299204, "grad_norm": 0.8050945401191711, "learning_rate": 0.000293287827076223, "loss": 1.2655, "step": 3110 }, { "epoch": 3.539249146757679, "grad_norm": 0.5025121569633484, "learning_rate": 0.00029306029579067124, "loss": 1.1542, "step": 3111 }, { "epoch": 3.540386803185438, "grad_norm": 0.6222087740898132, "learning_rate": 0.00029283276450511945, "loss": 0.8863, "step": 3112 }, { "epoch": 3.5415244596131967, "grad_norm": 1.4298913478851318, "learning_rate": 0.0002926052332195677, "loss": 2.4176, "step": 3113 }, { "epoch": 3.5426621160409555, "grad_norm": 0.5780576467514038, "learning_rate": 0.0002923777019340159, "loss": 0.5984, "step": 3114 }, { "epoch": 3.5437997724687147, "grad_norm": 0.6261698007583618, "learning_rate": 0.0002921501706484642, "loss": 0.7561, "step": 3115 }, { "epoch": 3.544937428896473, "grad_norm": 0.7118838429450989, "learning_rate": 0.0002919226393629124, "loss": 0.7812, "step": 3116 }, { "epoch": 3.546075085324232, "grad_norm": 0.8261246681213379, "learning_rate": 0.00029169510807736066, "loss": 1.0061, "step": 3117 }, { "epoch": 3.547212741751991, "grad_norm": 1.191845178604126, 
"learning_rate": 0.0002914675767918089, "loss": 3.8113, "step": 3118 }, { "epoch": 3.5483503981797497, "grad_norm": 1.1474705934524536, "learning_rate": 0.0002912400455062571, "loss": 2.7153, "step": 3119 }, { "epoch": 3.5494880546075085, "grad_norm": 0.9370527267456055, "learning_rate": 0.00029101251422070535, "loss": 1.5725, "step": 3120 }, { "epoch": 3.5506257110352673, "grad_norm": 0.9595047235488892, "learning_rate": 0.0002907849829351536, "loss": 1.8677, "step": 3121 }, { "epoch": 3.551763367463026, "grad_norm": 1.2102956771850586, "learning_rate": 0.0002905574516496018, "loss": 1.2953, "step": 3122 }, { "epoch": 3.5529010238907848, "grad_norm": 0.595016360282898, "learning_rate": 0.00029032992036405003, "loss": 0.6797, "step": 3123 }, { "epoch": 3.554038680318544, "grad_norm": 0.8877463936805725, "learning_rate": 0.0002901023890784983, "loss": 1.4732, "step": 3124 }, { "epoch": 3.5551763367463027, "grad_norm": 0.9120120406150818, "learning_rate": 0.00028987485779294656, "loss": 2.0458, "step": 3125 }, { "epoch": 3.5563139931740615, "grad_norm": 0.845203697681427, "learning_rate": 0.00028964732650739477, "loss": 1.9619, "step": 3126 }, { "epoch": 3.5574516496018203, "grad_norm": 1.032363772392273, "learning_rate": 0.00028941979522184303, "loss": 1.3608, "step": 3127 }, { "epoch": 3.558589306029579, "grad_norm": 0.7119994759559631, "learning_rate": 0.00028919226393629124, "loss": 1.3793, "step": 3128 }, { "epoch": 3.5597269624573378, "grad_norm": 2.150618553161621, "learning_rate": 0.00028896473265073945, "loss": 4.2388, "step": 3129 }, { "epoch": 3.5608646188850965, "grad_norm": 0.7926281094551086, "learning_rate": 0.0002887372013651877, "loss": 1.7266, "step": 3130 }, { "epoch": 3.5620022753128557, "grad_norm": 1.1008011102676392, "learning_rate": 0.000288509670079636, "loss": 2.8486, "step": 3131 }, { "epoch": 3.5631399317406145, "grad_norm": 0.8936535716056824, "learning_rate": 0.0002882821387940842, "loss": 2.1958, "step": 3132 }, { "epoch": 
3.5642775881683733, "grad_norm": 0.7171263694763184, "learning_rate": 0.0002880546075085324, "loss": 0.8248, "step": 3133 }, { "epoch": 3.565415244596132, "grad_norm": 1.07510244846344, "learning_rate": 0.00028782707622298067, "loss": 1.7625, "step": 3134 }, { "epoch": 3.5665529010238908, "grad_norm": 0.6037983298301697, "learning_rate": 0.0002875995449374289, "loss": 0.8, "step": 3135 }, { "epoch": 3.5676905574516495, "grad_norm": 0.737299919128418, "learning_rate": 0.00028737201365187714, "loss": 1.5646, "step": 3136 }, { "epoch": 3.5688282138794083, "grad_norm": 0.8927704095840454, "learning_rate": 0.0002871444823663254, "loss": 1.8448, "step": 3137 }, { "epoch": 3.5699658703071675, "grad_norm": 0.7696796655654907, "learning_rate": 0.0002869169510807736, "loss": 1.8304, "step": 3138 }, { "epoch": 3.571103526734926, "grad_norm": 1.4837048053741455, "learning_rate": 0.0002866894197952218, "loss": 3.0218, "step": 3139 }, { "epoch": 3.572241183162685, "grad_norm": 0.9347735643386841, "learning_rate": 0.0002864618885096701, "loss": 1.3186, "step": 3140 }, { "epoch": 3.573378839590444, "grad_norm": 0.8641846179962158, "learning_rate": 0.00028623435722411835, "loss": 1.4623, "step": 3141 }, { "epoch": 3.5745164960182025, "grad_norm": 1.1673868894577026, "learning_rate": 0.00028600682593856656, "loss": 2.0682, "step": 3142 }, { "epoch": 3.5756541524459613, "grad_norm": 1.1850996017456055, "learning_rate": 0.00028577929465301477, "loss": 2.0922, "step": 3143 }, { "epoch": 3.57679180887372, "grad_norm": 0.799368679523468, "learning_rate": 0.00028555176336746304, "loss": 2.0306, "step": 3144 }, { "epoch": 3.577929465301479, "grad_norm": 0.7206624150276184, "learning_rate": 0.00028532423208191125, "loss": 1.3149, "step": 3145 }, { "epoch": 3.5790671217292376, "grad_norm": 1.1149860620498657, "learning_rate": 0.0002850967007963595, "loss": 2.1369, "step": 3146 }, { "epoch": 3.580204778156997, "grad_norm": 1.0406419038772583, "learning_rate": 0.0002848691695108078, "loss": 
1.652, "step": 3147 }, { "epoch": 3.5813424345847555, "grad_norm": 1.1601135730743408, "learning_rate": 0.000284641638225256, "loss": 1.6278, "step": 3148 }, { "epoch": 3.5824800910125143, "grad_norm": 1.0218907594680786, "learning_rate": 0.0002844141069397042, "loss": 1.2107, "step": 3149 }, { "epoch": 3.583617747440273, "grad_norm": 1.2365930080413818, "learning_rate": 0.00028418657565415246, "loss": 2.1005, "step": 3150 }, { "epoch": 3.584755403868032, "grad_norm": 0.7121724486351013, "learning_rate": 0.0002839590443686007, "loss": 1.5354, "step": 3151 }, { "epoch": 3.5858930602957906, "grad_norm": 0.9739391207695007, "learning_rate": 0.0002837315130830489, "loss": 2.6124, "step": 3152 }, { "epoch": 3.5870307167235493, "grad_norm": 1.4687827825546265, "learning_rate": 0.00028350398179749714, "loss": 2.4509, "step": 3153 }, { "epoch": 3.5881683731513085, "grad_norm": 1.106471061706543, "learning_rate": 0.0002832764505119454, "loss": 1.6431, "step": 3154 }, { "epoch": 3.589306029579067, "grad_norm": 1.566792607307434, "learning_rate": 0.0002830489192263936, "loss": 2.5289, "step": 3155 }, { "epoch": 3.590443686006826, "grad_norm": 0.8209744095802307, "learning_rate": 0.0002828213879408419, "loss": 1.251, "step": 3156 }, { "epoch": 3.591581342434585, "grad_norm": 1.5616912841796875, "learning_rate": 0.0002825938566552901, "loss": 2.958, "step": 3157 }, { "epoch": 3.5927189988623436, "grad_norm": 0.8948552012443542, "learning_rate": 0.00028236632536973835, "loss": 1.6619, "step": 3158 }, { "epoch": 3.5938566552901023, "grad_norm": 1.0657151937484741, "learning_rate": 0.00028213879408418656, "loss": 1.8547, "step": 3159 }, { "epoch": 3.594994311717861, "grad_norm": 0.7531653046607971, "learning_rate": 0.00028191126279863483, "loss": 1.5972, "step": 3160 }, { "epoch": 3.59613196814562, "grad_norm": 0.9963746070861816, "learning_rate": 0.0002816837315130831, "loss": 2.1391, "step": 3161 }, { "epoch": 3.5972696245733786, "grad_norm": 1.1848362684249878, "learning_rate": 
0.00028145620022753125, "loss": 1.8631, "step": 3162 }, { "epoch": 3.598407281001138, "grad_norm": 1.2183339595794678, "learning_rate": 0.0002812286689419795, "loss": 2.2846, "step": 3163 }, { "epoch": 3.5995449374288966, "grad_norm": 0.654116153717041, "learning_rate": 0.0002810011376564278, "loss": 1.3402, "step": 3164 }, { "epoch": 3.6006825938566553, "grad_norm": 1.1731293201446533, "learning_rate": 0.000280773606370876, "loss": 2.3222, "step": 3165 }, { "epoch": 3.601820250284414, "grad_norm": 0.6737352013587952, "learning_rate": 0.00028054607508532425, "loss": 1.3616, "step": 3166 }, { "epoch": 3.602957906712173, "grad_norm": 0.6698241233825684, "learning_rate": 0.00028031854379977246, "loss": 1.0889, "step": 3167 }, { "epoch": 3.6040955631399316, "grad_norm": 1.0666381120681763, "learning_rate": 0.0002800910125142207, "loss": 1.6506, "step": 3168 }, { "epoch": 3.6052332195676904, "grad_norm": 0.7355189323425293, "learning_rate": 0.00027986348122866893, "loss": 1.2037, "step": 3169 }, { "epoch": 3.6063708759954496, "grad_norm": 1.5804355144500732, "learning_rate": 0.0002796359499431172, "loss": 3.5022, "step": 3170 }, { "epoch": 3.6075085324232083, "grad_norm": 0.602844774723053, "learning_rate": 0.00027940841865756546, "loss": 0.6136, "step": 3171 }, { "epoch": 3.608646188850967, "grad_norm": 0.6072266101837158, "learning_rate": 0.0002791808873720136, "loss": 0.932, "step": 3172 }, { "epoch": 3.609783845278726, "grad_norm": 0.9740130305290222, "learning_rate": 0.0002789533560864619, "loss": 1.3407, "step": 3173 }, { "epoch": 3.6109215017064846, "grad_norm": 0.99945068359375, "learning_rate": 0.00027872582480091015, "loss": 3.0923, "step": 3174 }, { "epoch": 3.6120591581342434, "grad_norm": 0.8721227645874023, "learning_rate": 0.00027849829351535836, "loss": 1.1302, "step": 3175 }, { "epoch": 3.613196814562002, "grad_norm": 0.9758340716362, "learning_rate": 0.0002782707622298066, "loss": 1.2336, "step": 3176 }, { "epoch": 3.6143344709897613, "grad_norm": 
0.9425876140594482, "learning_rate": 0.00027804323094425483, "loss": 1.7407, "step": 3177 }, { "epoch": 3.6154721274175197, "grad_norm": 1.0528191328048706, "learning_rate": 0.0002778156996587031, "loss": 1.7902, "step": 3178 }, { "epoch": 3.616609783845279, "grad_norm": 0.9743272662162781, "learning_rate": 0.0002775881683731513, "loss": 1.5341, "step": 3179 }, { "epoch": 3.6177474402730376, "grad_norm": 1.2372887134552002, "learning_rate": 0.00027736063708759957, "loss": 2.7279, "step": 3180 }, { "epoch": 3.6188850967007964, "grad_norm": 1.1999444961547852, "learning_rate": 0.00027713310580204783, "loss": 2.0523, "step": 3181 }, { "epoch": 3.620022753128555, "grad_norm": 0.903556227684021, "learning_rate": 0.000276905574516496, "loss": 1.3633, "step": 3182 }, { "epoch": 3.621160409556314, "grad_norm": 0.7182222008705139, "learning_rate": 0.00027667804323094425, "loss": 1.5335, "step": 3183 }, { "epoch": 3.6222980659840727, "grad_norm": 0.9772674441337585, "learning_rate": 0.0002764505119453925, "loss": 2.0408, "step": 3184 }, { "epoch": 3.6234357224118314, "grad_norm": 0.6779370903968811, "learning_rate": 0.0002762229806598407, "loss": 0.963, "step": 3185 }, { "epoch": 3.6245733788395906, "grad_norm": 1.3481802940368652, "learning_rate": 0.00027599544937428894, "loss": 1.9255, "step": 3186 }, { "epoch": 3.6257110352673494, "grad_norm": 0.7263208031654358, "learning_rate": 0.0002757679180887372, "loss": 1.388, "step": 3187 }, { "epoch": 3.626848691695108, "grad_norm": 0.8536869287490845, "learning_rate": 0.00027554038680318546, "loss": 1.5152, "step": 3188 }, { "epoch": 3.627986348122867, "grad_norm": 0.7139433026313782, "learning_rate": 0.0002753128555176337, "loss": 1.4358, "step": 3189 }, { "epoch": 3.6291240045506257, "grad_norm": 1.031976580619812, "learning_rate": 0.00027508532423208194, "loss": 2.6185, "step": 3190 }, { "epoch": 3.6302616609783844, "grad_norm": 0.670224130153656, "learning_rate": 0.00027485779294653015, "loss": 1.2865, "step": 3191 }, { 
"epoch": 3.631399317406143, "grad_norm": 0.8890179991722107, "learning_rate": 0.00027463026166097836, "loss": 1.5146, "step": 3192 }, { "epoch": 3.6325369738339024, "grad_norm": 1.02622652053833, "learning_rate": 0.0002744027303754266, "loss": 1.8741, "step": 3193 }, { "epoch": 3.6336746302616607, "grad_norm": 0.7721883654594421, "learning_rate": 0.0002741751990898749, "loss": 1.5905, "step": 3194 }, { "epoch": 3.63481228668942, "grad_norm": 1.2086652517318726, "learning_rate": 0.0002739476678043231, "loss": 2.4846, "step": 3195 }, { "epoch": 3.6359499431171787, "grad_norm": 1.0432493686676025, "learning_rate": 0.0002737201365187713, "loss": 1.6268, "step": 3196 }, { "epoch": 3.6370875995449374, "grad_norm": 0.746393084526062, "learning_rate": 0.00027349260523321957, "loss": 1.5132, "step": 3197 }, { "epoch": 3.638225255972696, "grad_norm": 0.9152166843414307, "learning_rate": 0.00027326507394766783, "loss": 2.676, "step": 3198 }, { "epoch": 3.639362912400455, "grad_norm": 0.8825878500938416, "learning_rate": 0.00027303754266211604, "loss": 1.5822, "step": 3199 }, { "epoch": 3.640500568828214, "grad_norm": 1.0070838928222656, "learning_rate": 0.0002728100113765643, "loss": 2.4247, "step": 3200 }, { "epoch": 3.6416382252559725, "grad_norm": 0.7467294931411743, "learning_rate": 0.0002725824800910125, "loss": 1.0876, "step": 3201 }, { "epoch": 3.6427758816837317, "grad_norm": 0.6660275459289551, "learning_rate": 0.00027235494880546073, "loss": 1.7321, "step": 3202 }, { "epoch": 3.6439135381114904, "grad_norm": 0.9477649927139282, "learning_rate": 0.000272127417519909, "loss": 2.3247, "step": 3203 }, { "epoch": 3.645051194539249, "grad_norm": 0.8924974203109741, "learning_rate": 0.00027189988623435726, "loss": 2.3003, "step": 3204 }, { "epoch": 3.646188850967008, "grad_norm": 1.107689619064331, "learning_rate": 0.00027167235494880547, "loss": 2.079, "step": 3205 }, { "epoch": 3.6473265073947667, "grad_norm": 0.7451488971710205, "learning_rate": 0.0002714448236632537, 
"loss": 1.4819, "step": 3206 }, { "epoch": 3.6484641638225255, "grad_norm": 0.881750226020813, "learning_rate": 0.00027121729237770194, "loss": 1.4403, "step": 3207 }, { "epoch": 3.6496018202502842, "grad_norm": 1.023970365524292, "learning_rate": 0.0002709897610921502, "loss": 1.5968, "step": 3208 }, { "epoch": 3.6507394766780434, "grad_norm": 0.9700276255607605, "learning_rate": 0.0002707622298065984, "loss": 2.5977, "step": 3209 }, { "epoch": 3.651877133105802, "grad_norm": 1.010783076286316, "learning_rate": 0.0002705346985210467, "loss": 2.2983, "step": 3210 }, { "epoch": 3.653014789533561, "grad_norm": 1.0232635736465454, "learning_rate": 0.0002703071672354949, "loss": 1.461, "step": 3211 }, { "epoch": 3.6541524459613197, "grad_norm": 0.6770385503768921, "learning_rate": 0.0002700796359499431, "loss": 0.8051, "step": 3212 }, { "epoch": 3.6552901023890785, "grad_norm": 0.5668506026268005, "learning_rate": 0.00026985210466439136, "loss": 1.0955, "step": 3213 }, { "epoch": 3.6564277588168372, "grad_norm": 1.4066870212554932, "learning_rate": 0.0002696245733788396, "loss": 3.6724, "step": 3214 }, { "epoch": 3.657565415244596, "grad_norm": 1.153571605682373, "learning_rate": 0.00026939704209328784, "loss": 1.989, "step": 3215 }, { "epoch": 3.658703071672355, "grad_norm": 0.728625476360321, "learning_rate": 0.00026916951080773605, "loss": 0.9875, "step": 3216 }, { "epoch": 3.6598407281001135, "grad_norm": 0.9735997915267944, "learning_rate": 0.0002689419795221843, "loss": 2.0278, "step": 3217 }, { "epoch": 3.6609783845278727, "grad_norm": 0.7140381932258606, "learning_rate": 0.0002687144482366326, "loss": 1.2533, "step": 3218 }, { "epoch": 3.6621160409556315, "grad_norm": 0.6878055930137634, "learning_rate": 0.0002684869169510808, "loss": 1.1472, "step": 3219 }, { "epoch": 3.6632536973833902, "grad_norm": 0.9914765954017639, "learning_rate": 0.00026825938566552905, "loss": 2.0081, "step": 3220 }, { "epoch": 3.664391353811149, "grad_norm": 1.0459471940994263, 
"learning_rate": 0.00026803185437997726, "loss": 2.5561, "step": 3221 }, { "epoch": 3.6655290102389078, "grad_norm": 0.8058758974075317, "learning_rate": 0.00026780432309442547, "loss": 1.2284, "step": 3222 }, { "epoch": 3.6666666666666665, "grad_norm": 1.2225570678710938, "learning_rate": 0.00026757679180887373, "loss": 1.9648, "step": 3223 }, { "epoch": 3.6678043230944253, "grad_norm": 1.3064903020858765, "learning_rate": 0.000267349260523322, "loss": 2.1073, "step": 3224 }, { "epoch": 3.6689419795221845, "grad_norm": 0.7548709511756897, "learning_rate": 0.00026712172923777015, "loss": 1.5539, "step": 3225 }, { "epoch": 3.6700796359499432, "grad_norm": 1.034284234046936, "learning_rate": 0.0002668941979522184, "loss": 2.125, "step": 3226 }, { "epoch": 3.671217292377702, "grad_norm": 1.494303822517395, "learning_rate": 0.0002666666666666667, "loss": 2.775, "step": 3227 }, { "epoch": 3.6723549488054608, "grad_norm": 1.1169606447219849, "learning_rate": 0.00026643913538111494, "loss": 1.798, "step": 3228 }, { "epoch": 3.6734926052332195, "grad_norm": 0.8246114253997803, "learning_rate": 0.00026621160409556315, "loss": 2.3232, "step": 3229 }, { "epoch": 3.6746302616609783, "grad_norm": 1.181199312210083, "learning_rate": 0.00026598407281001136, "loss": 2.0188, "step": 3230 }, { "epoch": 3.675767918088737, "grad_norm": 0.7052603960037231, "learning_rate": 0.00026575654152445963, "loss": 0.8771, "step": 3231 }, { "epoch": 3.6769055745164962, "grad_norm": 0.5976938605308533, "learning_rate": 0.00026552901023890784, "loss": 0.9885, "step": 3232 }, { "epoch": 3.6780432309442546, "grad_norm": 0.6874876022338867, "learning_rate": 0.0002653014789533561, "loss": 1.0909, "step": 3233 }, { "epoch": 3.6791808873720138, "grad_norm": 0.9620742797851562, "learning_rate": 0.00026507394766780437, "loss": 2.0523, "step": 3234 }, { "epoch": 3.6803185437997725, "grad_norm": 1.0894056558609009, "learning_rate": 0.0002648464163822525, "loss": 2.1868, "step": 3235 }, { "epoch": 
3.6814562002275313, "grad_norm": 0.8352320194244385, "learning_rate": 0.0002646188850967008, "loss": 1.5531, "step": 3236 }, { "epoch": 3.68259385665529, "grad_norm": 0.7866209149360657, "learning_rate": 0.00026439135381114905, "loss": 1.2869, "step": 3237 }, { "epoch": 3.683731513083049, "grad_norm": 0.8318602442741394, "learning_rate": 0.00026416382252559726, "loss": 1.4013, "step": 3238 }, { "epoch": 3.684869169510808, "grad_norm": 0.6473860144615173, "learning_rate": 0.0002639362912400455, "loss": 1.0885, "step": 3239 }, { "epoch": 3.6860068259385663, "grad_norm": 0.8395644426345825, "learning_rate": 0.00026370875995449373, "loss": 1.8404, "step": 3240 }, { "epoch": 3.6871444823663255, "grad_norm": 1.3969433307647705, "learning_rate": 0.000263481228668942, "loss": 2.3692, "step": 3241 }, { "epoch": 3.6882821387940843, "grad_norm": 0.7492949366569519, "learning_rate": 0.0002632536973833902, "loss": 1.4541, "step": 3242 }, { "epoch": 3.689419795221843, "grad_norm": 0.7749677300453186, "learning_rate": 0.00026302616609783847, "loss": 1.3882, "step": 3243 }, { "epoch": 3.690557451649602, "grad_norm": 0.507800817489624, "learning_rate": 0.00026279863481228674, "loss": 0.6856, "step": 3244 }, { "epoch": 3.6916951080773606, "grad_norm": 0.7074844837188721, "learning_rate": 0.0002625711035267349, "loss": 1.4151, "step": 3245 }, { "epoch": 3.6928327645051193, "grad_norm": 1.3755199909210205, "learning_rate": 0.00026234357224118316, "loss": 2.833, "step": 3246 }, { "epoch": 3.693970420932878, "grad_norm": 0.7762560248374939, "learning_rate": 0.0002621160409556314, "loss": 1.0901, "step": 3247 }, { "epoch": 3.6951080773606373, "grad_norm": 1.1140830516815186, "learning_rate": 0.00026188850967007963, "loss": 3.3667, "step": 3248 }, { "epoch": 3.696245733788396, "grad_norm": 1.8301548957824707, "learning_rate": 0.0002616609783845279, "loss": 2.7103, "step": 3249 }, { "epoch": 3.697383390216155, "grad_norm": 0.9659634828567505, "learning_rate": 0.0002614334470989761, "loss": 
2.4665, "step": 3250 }, { "epoch": 3.6985210466439136, "grad_norm": 1.0916699171066284, "learning_rate": 0.00026120591581342437, "loss": 1.5426, "step": 3251 }, { "epoch": 3.6996587030716723, "grad_norm": 1.2825108766555786, "learning_rate": 0.0002609783845278726, "loss": 1.7881, "step": 3252 }, { "epoch": 3.700796359499431, "grad_norm": 1.1956276893615723, "learning_rate": 0.00026075085324232084, "loss": 1.8214, "step": 3253 }, { "epoch": 3.70193401592719, "grad_norm": 1.0599949359893799, "learning_rate": 0.0002605233219567691, "loss": 2.1982, "step": 3254 }, { "epoch": 3.703071672354949, "grad_norm": 0.9849620461463928, "learning_rate": 0.00026029579067121726, "loss": 1.133, "step": 3255 }, { "epoch": 3.7042093287827074, "grad_norm": 1.0015318393707275, "learning_rate": 0.0002600682593856655, "loss": 1.0853, "step": 3256 }, { "epoch": 3.7053469852104666, "grad_norm": 1.1830706596374512, "learning_rate": 0.0002598407281001138, "loss": 2.3646, "step": 3257 }, { "epoch": 3.7064846416382253, "grad_norm": 0.695976734161377, "learning_rate": 0.000259613196814562, "loss": 1.4282, "step": 3258 }, { "epoch": 3.707622298065984, "grad_norm": 0.9679135680198669, "learning_rate": 0.0002593856655290102, "loss": 1.3835, "step": 3259 }, { "epoch": 3.708759954493743, "grad_norm": 1.6105592250823975, "learning_rate": 0.0002591581342434585, "loss": 1.4964, "step": 3260 }, { "epoch": 3.7098976109215016, "grad_norm": 1.0834482908248901, "learning_rate": 0.00025893060295790674, "loss": 1.6271, "step": 3261 }, { "epoch": 3.7110352673492604, "grad_norm": 1.945980429649353, "learning_rate": 0.00025870307167235495, "loss": 1.3425, "step": 3262 }, { "epoch": 3.712172923777019, "grad_norm": 0.9838660359382629, "learning_rate": 0.0002584755403868032, "loss": 1.7534, "step": 3263 }, { "epoch": 3.7133105802047783, "grad_norm": 0.8386029005050659, "learning_rate": 0.0002582480091012514, "loss": 1.7482, "step": 3264 }, { "epoch": 3.714448236632537, "grad_norm": 0.7002511620521545, 
"learning_rate": 0.00025802047781569963, "loss": 1.1395, "step": 3265 }, { "epoch": 3.715585893060296, "grad_norm": 0.9224847555160522, "learning_rate": 0.0002577929465301479, "loss": 1.4517, "step": 3266 }, { "epoch": 3.7167235494880546, "grad_norm": 1.0433372259140015, "learning_rate": 0.00025756541524459616, "loss": 2.0467, "step": 3267 }, { "epoch": 3.7178612059158134, "grad_norm": 1.5209763050079346, "learning_rate": 0.00025733788395904437, "loss": 2.7371, "step": 3268 }, { "epoch": 3.718998862343572, "grad_norm": 0.7279452085494995, "learning_rate": 0.0002571103526734926, "loss": 1.0362, "step": 3269 }, { "epoch": 3.720136518771331, "grad_norm": 0.7574169635772705, "learning_rate": 0.00025688282138794084, "loss": 1.2043, "step": 3270 }, { "epoch": 3.72127417519909, "grad_norm": 1.3098034858703613, "learning_rate": 0.0002566552901023891, "loss": 2.5448, "step": 3271 }, { "epoch": 3.722411831626849, "grad_norm": 0.8687241673469543, "learning_rate": 0.0002564277588168373, "loss": 1.4687, "step": 3272 }, { "epoch": 3.7235494880546076, "grad_norm": 0.9659656882286072, "learning_rate": 0.0002562002275312856, "loss": 2.4018, "step": 3273 }, { "epoch": 3.7246871444823664, "grad_norm": 1.438889503479004, "learning_rate": 0.0002559726962457338, "loss": 3.2661, "step": 3274 }, { "epoch": 3.725824800910125, "grad_norm": 1.4293714761734009, "learning_rate": 0.000255745164960182, "loss": 3.2473, "step": 3275 }, { "epoch": 3.726962457337884, "grad_norm": 1.1246088743209839, "learning_rate": 0.00025551763367463026, "loss": 2.274, "step": 3276 }, { "epoch": 3.7281001137656427, "grad_norm": 0.6158525943756104, "learning_rate": 0.00025529010238907853, "loss": 1.1515, "step": 3277 }, { "epoch": 3.729237770193402, "grad_norm": 1.0718598365783691, "learning_rate": 0.00025506257110352674, "loss": 1.6976, "step": 3278 }, { "epoch": 3.73037542662116, "grad_norm": 0.9811798334121704, "learning_rate": 0.00025483503981797495, "loss": 1.6137, "step": 3279 }, { "epoch": 
3.7315130830489194, "grad_norm": 1.0085113048553467, "learning_rate": 0.0002546075085324232, "loss": 2.4258, "step": 3280 }, { "epoch": 3.732650739476678, "grad_norm": 1.0862224102020264, "learning_rate": 0.0002543799772468715, "loss": 1.6301, "step": 3281 }, { "epoch": 3.733788395904437, "grad_norm": 1.057241678237915, "learning_rate": 0.0002541524459613197, "loss": 2.4249, "step": 3282 }, { "epoch": 3.7349260523321957, "grad_norm": 0.9607272148132324, "learning_rate": 0.00025392491467576795, "loss": 2.4097, "step": 3283 }, { "epoch": 3.7360637087599544, "grad_norm": 0.769008994102478, "learning_rate": 0.00025369738339021616, "loss": 2.0803, "step": 3284 }, { "epoch": 3.737201365187713, "grad_norm": 1.317842721939087, "learning_rate": 0.00025346985210466437, "loss": 3.1254, "step": 3285 }, { "epoch": 3.738339021615472, "grad_norm": 0.8753394484519958, "learning_rate": 0.00025324232081911263, "loss": 0.9374, "step": 3286 }, { "epoch": 3.739476678043231, "grad_norm": 0.7274285554885864, "learning_rate": 0.0002530147895335609, "loss": 1.4164, "step": 3287 }, { "epoch": 3.74061433447099, "grad_norm": 0.7005650997161865, "learning_rate": 0.0002527872582480091, "loss": 1.3413, "step": 3288 }, { "epoch": 3.7417519908987487, "grad_norm": 1.0076242685317993, "learning_rate": 0.0002525597269624573, "loss": 2.4045, "step": 3289 }, { "epoch": 3.7428896473265074, "grad_norm": 1.4078526496887207, "learning_rate": 0.0002523321956769056, "loss": 2.6869, "step": 3290 }, { "epoch": 3.744027303754266, "grad_norm": 1.1299002170562744, "learning_rate": 0.00025210466439135385, "loss": 2.3044, "step": 3291 }, { "epoch": 3.745164960182025, "grad_norm": 0.8027879595756531, "learning_rate": 0.00025187713310580206, "loss": 1.8643, "step": 3292 }, { "epoch": 3.7463026166097837, "grad_norm": 1.083298921585083, "learning_rate": 0.00025164960182025027, "loss": 1.4689, "step": 3293 }, { "epoch": 3.747440273037543, "grad_norm": 0.9221919178962708, "learning_rate": 0.00025142207053469853, "loss": 
1.6231, "step": 3294 }, { "epoch": 3.748577929465301, "grad_norm": 0.8968214392662048, "learning_rate": 0.00025119453924914674, "loss": 2.3925, "step": 3295 }, { "epoch": 3.7497155858930604, "grad_norm": 0.8066171407699585, "learning_rate": 0.000250967007963595, "loss": 1.804, "step": 3296 }, { "epoch": 3.750853242320819, "grad_norm": 1.2196367979049683, "learning_rate": 0.00025073947667804327, "loss": 3.5051, "step": 3297 }, { "epoch": 3.751990898748578, "grad_norm": 1.0004627704620361, "learning_rate": 0.0002505119453924914, "loss": 1.7358, "step": 3298 }, { "epoch": 3.7531285551763367, "grad_norm": 1.1003127098083496, "learning_rate": 0.0002502844141069397, "loss": 1.6266, "step": 3299 }, { "epoch": 3.7542662116040955, "grad_norm": 0.8804592490196228, "learning_rate": 0.00025005688282138795, "loss": 1.6458, "step": 3300 }, { "epoch": 3.755403868031854, "grad_norm": 0.6198902130126953, "learning_rate": 0.0002498293515358362, "loss": 1.0018, "step": 3301 }, { "epoch": 3.756541524459613, "grad_norm": 0.9782140851020813, "learning_rate": 0.0002496018202502844, "loss": 2.0916, "step": 3302 }, { "epoch": 3.757679180887372, "grad_norm": 1.2002933025360107, "learning_rate": 0.00024937428896473264, "loss": 1.7946, "step": 3303 }, { "epoch": 3.758816837315131, "grad_norm": 0.9678136110305786, "learning_rate": 0.0002491467576791809, "loss": 1.476, "step": 3304 }, { "epoch": 3.7599544937428897, "grad_norm": 0.7648789286613464, "learning_rate": 0.0002489192263936291, "loss": 1.3207, "step": 3305 }, { "epoch": 3.7610921501706485, "grad_norm": 1.2675551176071167, "learning_rate": 0.0002486916951080774, "loss": 2.6937, "step": 3306 }, { "epoch": 3.7622298065984072, "grad_norm": 0.7276213765144348, "learning_rate": 0.0002484641638225256, "loss": 0.9917, "step": 3307 }, { "epoch": 3.763367463026166, "grad_norm": 1.3405811786651611, "learning_rate": 0.00024823663253697385, "loss": 1.6142, "step": 3308 }, { "epoch": 3.7645051194539247, "grad_norm": 0.7004644274711609, 
"learning_rate": 0.00024800910125142206, "loss": 0.9751, "step": 3309 }, { "epoch": 3.765642775881684, "grad_norm": 0.8266583681106567, "learning_rate": 0.0002477815699658703, "loss": 1.7234, "step": 3310 }, { "epoch": 3.7667804323094427, "grad_norm": 0.827187180519104, "learning_rate": 0.0002475540386803186, "loss": 1.5155, "step": 3311 }, { "epoch": 3.7679180887372015, "grad_norm": 0.8548292517662048, "learning_rate": 0.0002473265073947668, "loss": 1.7327, "step": 3312 }, { "epoch": 3.7690557451649602, "grad_norm": 0.884003221988678, "learning_rate": 0.000247098976109215, "loss": 1.4591, "step": 3313 }, { "epoch": 3.770193401592719, "grad_norm": 0.7317343950271606, "learning_rate": 0.00024687144482366327, "loss": 1.0964, "step": 3314 }, { "epoch": 3.7713310580204777, "grad_norm": 0.8710649013519287, "learning_rate": 0.0002466439135381115, "loss": 1.5287, "step": 3315 }, { "epoch": 3.7724687144482365, "grad_norm": 1.5312626361846924, "learning_rate": 0.00024641638225255974, "loss": 3.5776, "step": 3316 }, { "epoch": 3.7736063708759957, "grad_norm": 0.6833708882331848, "learning_rate": 0.00024618885096700795, "loss": 1.2004, "step": 3317 }, { "epoch": 3.774744027303754, "grad_norm": 1.1912914514541626, "learning_rate": 0.0002459613196814562, "loss": 2.4428, "step": 3318 }, { "epoch": 3.7758816837315132, "grad_norm": 1.0153708457946777, "learning_rate": 0.00024573378839590443, "loss": 1.9598, "step": 3319 }, { "epoch": 3.777019340159272, "grad_norm": 1.0456105470657349, "learning_rate": 0.0002455062571103527, "loss": 1.7178, "step": 3320 }, { "epoch": 3.7781569965870307, "grad_norm": 0.9774885177612305, "learning_rate": 0.0002452787258248009, "loss": 1.9783, "step": 3321 }, { "epoch": 3.7792946530147895, "grad_norm": 1.089418649673462, "learning_rate": 0.00024505119453924917, "loss": 2.1403, "step": 3322 }, { "epoch": 3.7804323094425483, "grad_norm": 1.1333575248718262, "learning_rate": 0.0002448236632536974, "loss": 1.3416, "step": 3323 }, { "epoch": 
3.781569965870307, "grad_norm": 0.8414488434791565, "learning_rate": 0.00024459613196814564, "loss": 1.5206, "step": 3324 }, { "epoch": 3.782707622298066, "grad_norm": 1.0938860177993774, "learning_rate": 0.00024436860068259385, "loss": 2.2782, "step": 3325 }, { "epoch": 3.783845278725825, "grad_norm": 1.1637766361236572, "learning_rate": 0.00024414106939704206, "loss": 1.9136, "step": 3326 }, { "epoch": 3.7849829351535837, "grad_norm": 0.893345057964325, "learning_rate": 0.00024391353811149032, "loss": 1.1827, "step": 3327 }, { "epoch": 3.7861205915813425, "grad_norm": 0.8583236336708069, "learning_rate": 0.0002436860068259386, "loss": 1.831, "step": 3328 }, { "epoch": 3.7872582480091013, "grad_norm": 1.1517513990402222, "learning_rate": 0.0002434584755403868, "loss": 1.955, "step": 3329 }, { "epoch": 3.78839590443686, "grad_norm": 0.8208685517311096, "learning_rate": 0.00024323094425483506, "loss": 1.3271, "step": 3330 }, { "epoch": 3.789533560864619, "grad_norm": 0.6080604195594788, "learning_rate": 0.00024300341296928327, "loss": 1.1819, "step": 3331 }, { "epoch": 3.7906712172923775, "grad_norm": 0.7177555561065674, "learning_rate": 0.0002427758816837315, "loss": 1.8782, "step": 3332 }, { "epoch": 3.7918088737201368, "grad_norm": 0.7688407301902771, "learning_rate": 0.00024254835039817977, "loss": 2.0169, "step": 3333 }, { "epoch": 3.792946530147895, "grad_norm": 1.3213694095611572, "learning_rate": 0.00024232081911262798, "loss": 1.6509, "step": 3334 }, { "epoch": 3.7940841865756543, "grad_norm": 1.1717759370803833, "learning_rate": 0.00024209328782707625, "loss": 2.5551, "step": 3335 }, { "epoch": 3.795221843003413, "grad_norm": 1.1276628971099854, "learning_rate": 0.00024186575654152446, "loss": 1.9977, "step": 3336 }, { "epoch": 3.796359499431172, "grad_norm": 0.9729685187339783, "learning_rate": 0.0002416382252559727, "loss": 2.0503, "step": 3337 }, { "epoch": 3.7974971558589306, "grad_norm": 1.1408803462982178, "learning_rate": 0.00024141069397042093, 
"loss": 1.7418, "step": 3338 }, { "epoch": 3.7986348122866893, "grad_norm": 0.7034707069396973, "learning_rate": 0.00024118316268486917, "loss": 1.622, "step": 3339 }, { "epoch": 3.799772468714448, "grad_norm": 1.2321475744247437, "learning_rate": 0.00024095563139931743, "loss": 2.719, "step": 3340 }, { "epoch": 3.800910125142207, "grad_norm": 0.9584794640541077, "learning_rate": 0.00024072810011376564, "loss": 1.7564, "step": 3341 }, { "epoch": 3.802047781569966, "grad_norm": 0.508548378944397, "learning_rate": 0.00024050056882821388, "loss": 0.6892, "step": 3342 }, { "epoch": 3.803185437997725, "grad_norm": 1.1406333446502686, "learning_rate": 0.00024027303754266212, "loss": 2.4856, "step": 3343 }, { "epoch": 3.8043230944254836, "grad_norm": 1.3431506156921387, "learning_rate": 0.00024004550625711035, "loss": 2.4605, "step": 3344 }, { "epoch": 3.8054607508532423, "grad_norm": 1.2452622652053833, "learning_rate": 0.00023981797497155862, "loss": 2.2794, "step": 3345 }, { "epoch": 3.806598407281001, "grad_norm": 0.8428059816360474, "learning_rate": 0.00023959044368600683, "loss": 1.701, "step": 3346 }, { "epoch": 3.80773606370876, "grad_norm": 1.1248981952667236, "learning_rate": 0.00023936291240045506, "loss": 2.0512, "step": 3347 }, { "epoch": 3.8088737201365186, "grad_norm": 0.9891200661659241, "learning_rate": 0.0002391353811149033, "loss": 2.4231, "step": 3348 }, { "epoch": 3.810011376564278, "grad_norm": 1.2691020965576172, "learning_rate": 0.00023890784982935154, "loss": 1.9103, "step": 3349 }, { "epoch": 3.8111490329920366, "grad_norm": 1.2526246309280396, "learning_rate": 0.0002386803185437998, "loss": 2.4644, "step": 3350 }, { "epoch": 3.8122866894197953, "grad_norm": 1.0178083181381226, "learning_rate": 0.000238452787258248, "loss": 1.6648, "step": 3351 }, { "epoch": 3.813424345847554, "grad_norm": 0.7109336256980896, "learning_rate": 0.00023822525597269625, "loss": 1.1645, "step": 3352 }, { "epoch": 3.814562002275313, "grad_norm": 1.2275941371917725, 
"learning_rate": 0.00023799772468714449, "loss": 1.998, "step": 3353 }, { "epoch": 3.8156996587030716, "grad_norm": 1.1337257623672485, "learning_rate": 0.00023777019340159272, "loss": 3.3256, "step": 3354 }, { "epoch": 3.8168373151308304, "grad_norm": 1.2316417694091797, "learning_rate": 0.00023754266211604096, "loss": 2.1339, "step": 3355 }, { "epoch": 3.8179749715585896, "grad_norm": 2.0374646186828613, "learning_rate": 0.0002373151308304892, "loss": 3.4686, "step": 3356 }, { "epoch": 3.819112627986348, "grad_norm": 1.059798002243042, "learning_rate": 0.00023708759954493743, "loss": 1.6049, "step": 3357 }, { "epoch": 3.820250284414107, "grad_norm": 0.8509323596954346, "learning_rate": 0.00023686006825938567, "loss": 1.2672, "step": 3358 }, { "epoch": 3.821387940841866, "grad_norm": 0.603013277053833, "learning_rate": 0.0002366325369738339, "loss": 1.0302, "step": 3359 }, { "epoch": 3.8225255972696246, "grad_norm": 1.5710378885269165, "learning_rate": 0.00023640500568828215, "loss": 2.2583, "step": 3360 }, { "epoch": 3.8236632536973834, "grad_norm": 1.1296995878219604, "learning_rate": 0.00023617747440273038, "loss": 2.1769, "step": 3361 }, { "epoch": 3.824800910125142, "grad_norm": 1.1707825660705566, "learning_rate": 0.00023594994311717862, "loss": 2.6797, "step": 3362 }, { "epoch": 3.825938566552901, "grad_norm": 1.0534515380859375, "learning_rate": 0.00023572241183162686, "loss": 0.8871, "step": 3363 }, { "epoch": 3.8270762229806596, "grad_norm": 0.9066932797431946, "learning_rate": 0.0002354948805460751, "loss": 1.6269, "step": 3364 }, { "epoch": 3.828213879408419, "grad_norm": 0.8703908324241638, "learning_rate": 0.00023526734926052333, "loss": 1.5106, "step": 3365 }, { "epoch": 3.8293515358361776, "grad_norm": 0.8020843267440796, "learning_rate": 0.00023503981797497157, "loss": 2.0528, "step": 3366 }, { "epoch": 3.8304891922639364, "grad_norm": 0.9272792935371399, "learning_rate": 0.0002348122866894198, "loss": 1.6559, "step": 3367 }, { "epoch": 
3.831626848691695, "grad_norm": 0.5369601845741272, "learning_rate": 0.00023458475540386804, "loss": 1.0992, "step": 3368 }, { "epoch": 3.832764505119454, "grad_norm": 0.6304998397827148, "learning_rate": 0.00023435722411831628, "loss": 0.8554, "step": 3369 }, { "epoch": 3.8339021615472126, "grad_norm": 0.7873075604438782, "learning_rate": 0.0002341296928327645, "loss": 2.2207, "step": 3370 }, { "epoch": 3.8350398179749714, "grad_norm": 0.6331322193145752, "learning_rate": 0.00023390216154721275, "loss": 1.4673, "step": 3371 }, { "epoch": 3.8361774744027306, "grad_norm": 0.5962525010108948, "learning_rate": 0.00023367463026166096, "loss": 1.5718, "step": 3372 }, { "epoch": 3.837315130830489, "grad_norm": 1.0758075714111328, "learning_rate": 0.00023344709897610923, "loss": 2.0164, "step": 3373 }, { "epoch": 3.838452787258248, "grad_norm": 0.6574288010597229, "learning_rate": 0.00023321956769055746, "loss": 1.0371, "step": 3374 }, { "epoch": 3.839590443686007, "grad_norm": 0.984384298324585, "learning_rate": 0.00023299203640500567, "loss": 2.0911, "step": 3375 }, { "epoch": 3.8407281001137656, "grad_norm": 1.0093640089035034, "learning_rate": 0.00023276450511945394, "loss": 1.3854, "step": 3376 }, { "epoch": 3.8418657565415244, "grad_norm": 1.2866005897521973, "learning_rate": 0.00023253697383390215, "loss": 2.0335, "step": 3377 }, { "epoch": 3.843003412969283, "grad_norm": 1.504915475845337, "learning_rate": 0.0002323094425483504, "loss": 2.1961, "step": 3378 }, { "epoch": 3.8441410693970424, "grad_norm": 0.6888614296913147, "learning_rate": 0.00023208191126279865, "loss": 1.6042, "step": 3379 }, { "epoch": 3.8452787258248007, "grad_norm": 0.6555305123329163, "learning_rate": 0.00023185437997724686, "loss": 0.8984, "step": 3380 }, { "epoch": 3.84641638225256, "grad_norm": 0.7594724893569946, "learning_rate": 0.00023162684869169512, "loss": 1.2783, "step": 3381 }, { "epoch": 3.8475540386803186, "grad_norm": 0.9284168481826782, "learning_rate": 0.00023139931740614333, 
"loss": 1.9291, "step": 3382 }, { "epoch": 3.8486916951080774, "grad_norm": 0.5923244953155518, "learning_rate": 0.0002311717861205916, "loss": 0.7256, "step": 3383 }, { "epoch": 3.849829351535836, "grad_norm": 0.9779441952705383, "learning_rate": 0.00023094425483503983, "loss": 1.9968, "step": 3384 }, { "epoch": 3.850967007963595, "grad_norm": 1.1208325624465942, "learning_rate": 0.00023071672354948804, "loss": 2.5118, "step": 3385 }, { "epoch": 3.8521046643913537, "grad_norm": 0.8016112446784973, "learning_rate": 0.0002304891922639363, "loss": 0.7837, "step": 3386 }, { "epoch": 3.8532423208191124, "grad_norm": 0.9915149807929993, "learning_rate": 0.00023026166097838452, "loss": 2.0725, "step": 3387 }, { "epoch": 3.8543799772468716, "grad_norm": 0.5172927379608154, "learning_rate": 0.00023003412969283278, "loss": 0.6931, "step": 3388 }, { "epoch": 3.8555176336746304, "grad_norm": 0.6758817434310913, "learning_rate": 0.000229806598407281, "loss": 1.3358, "step": 3389 }, { "epoch": 3.856655290102389, "grad_norm": 0.9508668780326843, "learning_rate": 0.00022957906712172923, "loss": 2.2465, "step": 3390 }, { "epoch": 3.857792946530148, "grad_norm": 0.6855764389038086, "learning_rate": 0.0002293515358361775, "loss": 1.5222, "step": 3391 }, { "epoch": 3.8589306029579067, "grad_norm": 0.8102338314056396, "learning_rate": 0.0002291240045506257, "loss": 1.9551, "step": 3392 }, { "epoch": 3.8600682593856654, "grad_norm": 0.8812468647956848, "learning_rate": 0.00022889647326507397, "loss": 1.9685, "step": 3393 }, { "epoch": 3.861205915813424, "grad_norm": 0.6355249881744385, "learning_rate": 0.00022866894197952218, "loss": 1.231, "step": 3394 }, { "epoch": 3.8623435722411834, "grad_norm": 0.5649857521057129, "learning_rate": 0.0002284414106939704, "loss": 0.5657, "step": 3395 }, { "epoch": 3.8634812286689417, "grad_norm": 0.7780776619911194, "learning_rate": 0.00022821387940841868, "loss": 0.8215, "step": 3396 }, { "epoch": 3.864618885096701, "grad_norm": 0.7705486416816711, 
"learning_rate": 0.0002279863481228669, "loss": 1.7517, "step": 3397 }, { "epoch": 3.8657565415244597, "grad_norm": 0.7325091361999512, "learning_rate": 0.00022775881683731515, "loss": 1.9095, "step": 3398 }, { "epoch": 3.8668941979522184, "grad_norm": 0.6553176045417786, "learning_rate": 0.00022753128555176336, "loss": 1.3486, "step": 3399 }, { "epoch": 3.868031854379977, "grad_norm": 1.1297094821929932, "learning_rate": 0.0002273037542662116, "loss": 1.3066, "step": 3400 }, { "epoch": 3.869169510807736, "grad_norm": 0.6546614170074463, "learning_rate": 0.00022707622298065986, "loss": 1.5045, "step": 3401 }, { "epoch": 3.8703071672354947, "grad_norm": 0.6554498076438904, "learning_rate": 0.00022684869169510807, "loss": 1.1808, "step": 3402 }, { "epoch": 3.8714448236632535, "grad_norm": 0.9071396589279175, "learning_rate": 0.00022662116040955634, "loss": 1.5988, "step": 3403 }, { "epoch": 3.8725824800910127, "grad_norm": 1.2640100717544556, "learning_rate": 0.00022639362912400455, "loss": 2.3752, "step": 3404 }, { "epoch": 3.8737201365187715, "grad_norm": 1.221620798110962, "learning_rate": 0.00022616609783845278, "loss": 2.2681, "step": 3405 }, { "epoch": 3.87485779294653, "grad_norm": 0.8574184775352478, "learning_rate": 0.00022593856655290102, "loss": 1.9581, "step": 3406 }, { "epoch": 3.875995449374289, "grad_norm": 0.9591223001480103, "learning_rate": 0.00022571103526734926, "loss": 2.0804, "step": 3407 }, { "epoch": 3.8771331058020477, "grad_norm": 0.7188199758529663, "learning_rate": 0.00022548350398179752, "loss": 1.6862, "step": 3408 }, { "epoch": 3.8782707622298065, "grad_norm": 1.205315351486206, "learning_rate": 0.00022525597269624573, "loss": 2.6322, "step": 3409 }, { "epoch": 3.8794084186575652, "grad_norm": 0.6590704321861267, "learning_rate": 0.00022502844141069397, "loss": 0.9434, "step": 3410 }, { "epoch": 3.8805460750853245, "grad_norm": 1.0419220924377441, "learning_rate": 0.0002248009101251422, "loss": 1.7335, "step": 3411 }, { "epoch": 
3.881683731513083, "grad_norm": 0.9429000616073608, "learning_rate": 0.00022457337883959044, "loss": 1.6675, "step": 3412 }, { "epoch": 3.882821387940842, "grad_norm": 0.8173443675041199, "learning_rate": 0.0002243458475540387, "loss": 1.5351, "step": 3413 }, { "epoch": 3.8839590443686007, "grad_norm": 1.2141562700271606, "learning_rate": 0.00022411831626848692, "loss": 2.0883, "step": 3414 }, { "epoch": 3.8850967007963595, "grad_norm": 0.9572274088859558, "learning_rate": 0.00022389078498293515, "loss": 2.0156, "step": 3415 }, { "epoch": 3.8862343572241183, "grad_norm": 0.7675147652626038, "learning_rate": 0.0002236632536973834, "loss": 1.4844, "step": 3416 }, { "epoch": 3.887372013651877, "grad_norm": 0.8228021264076233, "learning_rate": 0.00022343572241183163, "loss": 2.0502, "step": 3417 }, { "epoch": 3.888509670079636, "grad_norm": 1.1497551202774048, "learning_rate": 0.0002232081911262799, "loss": 1.1353, "step": 3418 }, { "epoch": 3.8896473265073945, "grad_norm": 0.7159321308135986, "learning_rate": 0.0002229806598407281, "loss": 1.627, "step": 3419 }, { "epoch": 3.8907849829351537, "grad_norm": 1.681058645248413, "learning_rate": 0.00022275312855517634, "loss": 3.7341, "step": 3420 }, { "epoch": 3.8919226393629125, "grad_norm": 0.4869871437549591, "learning_rate": 0.00022252559726962457, "loss": 0.7067, "step": 3421 }, { "epoch": 3.8930602957906713, "grad_norm": 0.6947728395462036, "learning_rate": 0.0002222980659840728, "loss": 1.0103, "step": 3422 }, { "epoch": 3.89419795221843, "grad_norm": 0.7470018267631531, "learning_rate": 0.00022207053469852108, "loss": 0.5964, "step": 3423 }, { "epoch": 3.8953356086461888, "grad_norm": 1.2415233850479126, "learning_rate": 0.00022184300341296929, "loss": 2.5444, "step": 3424 }, { "epoch": 3.8964732650739475, "grad_norm": 0.9338128566741943, "learning_rate": 0.00022161547212741752, "loss": 1.5243, "step": 3425 }, { "epoch": 3.8976109215017063, "grad_norm": 0.6499525904655457, "learning_rate": 0.00022138794084186576, 
"loss": 1.009, "step": 3426 }, { "epoch": 3.8987485779294655, "grad_norm": 1.1649621725082397, "learning_rate": 0.000221160409556314, "loss": 2.3154, "step": 3427 }, { "epoch": 3.8998862343572243, "grad_norm": 1.090055227279663, "learning_rate": 0.00022093287827076223, "loss": 2.3174, "step": 3428 }, { "epoch": 3.901023890784983, "grad_norm": 0.9271589517593384, "learning_rate": 0.00022070534698521047, "loss": 1.8594, "step": 3429 }, { "epoch": 3.9021615472127418, "grad_norm": 0.8594130277633667, "learning_rate": 0.0002204778156996587, "loss": 1.9265, "step": 3430 }, { "epoch": 3.9032992036405005, "grad_norm": 0.860653817653656, "learning_rate": 0.00022025028441410694, "loss": 1.9662, "step": 3431 }, { "epoch": 3.9044368600682593, "grad_norm": 0.8960944414138794, "learning_rate": 0.00022002275312855518, "loss": 1.6532, "step": 3432 }, { "epoch": 3.905574516496018, "grad_norm": 0.753968358039856, "learning_rate": 0.00021979522184300342, "loss": 0.8833, "step": 3433 }, { "epoch": 3.9067121729237773, "grad_norm": 0.7025309205055237, "learning_rate": 0.00021956769055745166, "loss": 0.9119, "step": 3434 }, { "epoch": 3.9078498293515356, "grad_norm": 0.9567241668701172, "learning_rate": 0.0002193401592718999, "loss": 1.3435, "step": 3435 }, { "epoch": 3.908987485779295, "grad_norm": 0.8972539901733398, "learning_rate": 0.00021911262798634813, "loss": 1.5827, "step": 3436 }, { "epoch": 3.9101251422070535, "grad_norm": 0.8140578269958496, "learning_rate": 0.00021888509670079637, "loss": 1.5984, "step": 3437 }, { "epoch": 3.9112627986348123, "grad_norm": 1.117913007736206, "learning_rate": 0.0002186575654152446, "loss": 2.3435, "step": 3438 }, { "epoch": 3.912400455062571, "grad_norm": 1.821190357208252, "learning_rate": 0.00021843003412969284, "loss": 3.6124, "step": 3439 }, { "epoch": 3.91353811149033, "grad_norm": 1.1013044118881226, "learning_rate": 0.00021820250284414108, "loss": 2.9663, "step": 3440 }, { "epoch": 3.9146757679180886, "grad_norm": 0.9850253462791443, 
"learning_rate": 0.00021797497155858931, "loss": 1.6718, "step": 3441 }, { "epoch": 3.9158134243458473, "grad_norm": 0.7211486101150513, "learning_rate": 0.00021774744027303755, "loss": 1.7512, "step": 3442 }, { "epoch": 3.9169510807736065, "grad_norm": 1.0833756923675537, "learning_rate": 0.0002175199089874858, "loss": 2.52, "step": 3443 }, { "epoch": 3.9180887372013653, "grad_norm": 1.1811580657958984, "learning_rate": 0.00021729237770193403, "loss": 2.2193, "step": 3444 }, { "epoch": 3.919226393629124, "grad_norm": 1.0001695156097412, "learning_rate": 0.00021706484641638224, "loss": 1.7069, "step": 3445 }, { "epoch": 3.920364050056883, "grad_norm": 0.64976966381073, "learning_rate": 0.0002168373151308305, "loss": 1.7047, "step": 3446 }, { "epoch": 3.9215017064846416, "grad_norm": 0.8505579233169556, "learning_rate": 0.00021660978384527874, "loss": 1.1988, "step": 3447 }, { "epoch": 3.9226393629124003, "grad_norm": 1.036294937133789, "learning_rate": 0.00021638225255972697, "loss": 1.8824, "step": 3448 }, { "epoch": 3.923777019340159, "grad_norm": 1.0031839609146118, "learning_rate": 0.0002161547212741752, "loss": 1.9741, "step": 3449 }, { "epoch": 3.9249146757679183, "grad_norm": 0.8414771556854248, "learning_rate": 0.00021592718998862342, "loss": 1.6352, "step": 3450 }, { "epoch": 3.926052332195677, "grad_norm": 1.0658117532730103, "learning_rate": 0.00021569965870307168, "loss": 1.2177, "step": 3451 }, { "epoch": 3.927189988623436, "grad_norm": 1.3510658740997314, "learning_rate": 0.00021547212741751992, "loss": 2.6798, "step": 3452 }, { "epoch": 3.9283276450511946, "grad_norm": 1.097466230392456, "learning_rate": 0.00021524459613196816, "loss": 1.955, "step": 3453 }, { "epoch": 3.9294653014789533, "grad_norm": 0.5628653168678284, "learning_rate": 0.0002150170648464164, "loss": 1.148, "step": 3454 }, { "epoch": 3.930602957906712, "grad_norm": 0.7025729417800903, "learning_rate": 0.0002147895335608646, "loss": 1.5446, "step": 3455 }, { "epoch": 
3.931740614334471, "grad_norm": 1.8733407258987427, "learning_rate": 0.00021456200227531287, "loss": 4.242, "step": 3456 }, { "epoch": 3.93287827076223, "grad_norm": 1.2189491987228394, "learning_rate": 0.0002143344709897611, "loss": 2.2309, "step": 3457 }, { "epoch": 3.9340159271899884, "grad_norm": 1.0556656122207642, "learning_rate": 0.00021410693970420934, "loss": 1.535, "step": 3458 }, { "epoch": 3.9351535836177476, "grad_norm": 1.2942211627960205, "learning_rate": 0.00021387940841865758, "loss": 1.7564, "step": 3459 }, { "epoch": 3.9362912400455063, "grad_norm": 1.3906549215316772, "learning_rate": 0.0002136518771331058, "loss": 2.4698, "step": 3460 }, { "epoch": 3.937428896473265, "grad_norm": 1.5894346237182617, "learning_rate": 0.00021342434584755405, "loss": 3.9154, "step": 3461 }, { "epoch": 3.938566552901024, "grad_norm": 1.111507534980774, "learning_rate": 0.00021319681456200226, "loss": 2.2502, "step": 3462 }, { "epoch": 3.9397042093287826, "grad_norm": 0.9626011848449707, "learning_rate": 0.00021296928327645053, "loss": 2.6883, "step": 3463 }, { "epoch": 3.9408418657565414, "grad_norm": 0.8860620856285095, "learning_rate": 0.00021274175199089877, "loss": 1.697, "step": 3464 }, { "epoch": 3.9419795221843, "grad_norm": 0.9714303612709045, "learning_rate": 0.00021251422070534698, "loss": 3.0761, "step": 3465 }, { "epoch": 3.9431171786120593, "grad_norm": 0.7767373323440552, "learning_rate": 0.00021228668941979524, "loss": 2.555, "step": 3466 }, { "epoch": 3.944254835039818, "grad_norm": 1.177393913269043, "learning_rate": 0.00021205915813424345, "loss": 2.1534, "step": 3467 }, { "epoch": 3.945392491467577, "grad_norm": 0.7708033919334412, "learning_rate": 0.0002118316268486917, "loss": 1.6853, "step": 3468 }, { "epoch": 3.9465301478953356, "grad_norm": 1.034865140914917, "learning_rate": 0.00021160409556313995, "loss": 1.7026, "step": 3469 }, { "epoch": 3.9476678043230944, "grad_norm": 0.769185483455658, "learning_rate": 0.00021137656427758816, "loss": 
1.1644, "step": 3470 }, { "epoch": 3.948805460750853, "grad_norm": 1.4568527936935425, "learning_rate": 0.00021114903299203642, "loss": 2.0562, "step": 3471 }, { "epoch": 3.949943117178612, "grad_norm": 0.8477489352226257, "learning_rate": 0.00021092150170648463, "loss": 1.6948, "step": 3472 }, { "epoch": 3.951080773606371, "grad_norm": 0.6970076560974121, "learning_rate": 0.00021069397042093287, "loss": 1.3396, "step": 3473 }, { "epoch": 3.9522184300341294, "grad_norm": 0.5849756598472595, "learning_rate": 0.00021046643913538114, "loss": 1.2495, "step": 3474 }, { "epoch": 3.9533560864618886, "grad_norm": 0.9044439196586609, "learning_rate": 0.00021023890784982934, "loss": 2.0495, "step": 3475 }, { "epoch": 3.9544937428896474, "grad_norm": 1.0596073865890503, "learning_rate": 0.0002100113765642776, "loss": 2.21, "step": 3476 }, { "epoch": 3.955631399317406, "grad_norm": 1.0529900789260864, "learning_rate": 0.00020978384527872582, "loss": 2.0394, "step": 3477 }, { "epoch": 3.956769055745165, "grad_norm": 1.1782349348068237, "learning_rate": 0.00020955631399317406, "loss": 2.4599, "step": 3478 }, { "epoch": 3.9579067121729237, "grad_norm": 0.6079378128051758, "learning_rate": 0.0002093287827076223, "loss": 1.3727, "step": 3479 }, { "epoch": 3.9590443686006824, "grad_norm": 0.9474525451660156, "learning_rate": 0.00020910125142207053, "loss": 2.3577, "step": 3480 }, { "epoch": 3.960182025028441, "grad_norm": 0.8396434783935547, "learning_rate": 0.0002088737201365188, "loss": 1.6286, "step": 3481 }, { "epoch": 3.9613196814562004, "grad_norm": 0.784811794757843, "learning_rate": 0.000208646188850967, "loss": 1.852, "step": 3482 }, { "epoch": 3.962457337883959, "grad_norm": 1.2220088243484497, "learning_rate": 0.00020841865756541524, "loss": 2.4076, "step": 3483 }, { "epoch": 3.963594994311718, "grad_norm": 0.8224531412124634, "learning_rate": 0.00020819112627986348, "loss": 2.0785, "step": 3484 }, { "epoch": 3.9647326507394767, "grad_norm": 1.1083797216415405, 
"learning_rate": 0.00020796359499431171, "loss": 2.125, "step": 3485 }, { "epoch": 3.9658703071672354, "grad_norm": 0.8573135137557983, "learning_rate": 0.00020773606370875998, "loss": 1.063, "step": 3486 }, { "epoch": 3.967007963594994, "grad_norm": 1.380513072013855, "learning_rate": 0.0002075085324232082, "loss": 3.7957, "step": 3487 }, { "epoch": 3.968145620022753, "grad_norm": 0.7844088673591614, "learning_rate": 0.00020728100113765643, "loss": 1.1714, "step": 3488 }, { "epoch": 3.969283276450512, "grad_norm": 0.8056669235229492, "learning_rate": 0.00020705346985210466, "loss": 1.4436, "step": 3489 }, { "epoch": 3.970420932878271, "grad_norm": 1.3273180723190308, "learning_rate": 0.0002068259385665529, "loss": 3.3308, "step": 3490 }, { "epoch": 3.9715585893060297, "grad_norm": 0.7416298389434814, "learning_rate": 0.00020659840728100116, "loss": 1.7255, "step": 3491 }, { "epoch": 3.9726962457337884, "grad_norm": 1.28645658493042, "learning_rate": 0.00020637087599544937, "loss": 2.2977, "step": 3492 }, { "epoch": 3.973833902161547, "grad_norm": 0.637898862361908, "learning_rate": 0.0002061433447098976, "loss": 1.4102, "step": 3493 }, { "epoch": 3.974971558589306, "grad_norm": 0.5494396686553955, "learning_rate": 0.00020591581342434585, "loss": 1.001, "step": 3494 }, { "epoch": 3.9761092150170647, "grad_norm": 1.295616626739502, "learning_rate": 0.00020568828213879408, "loss": 3.9937, "step": 3495 }, { "epoch": 3.977246871444824, "grad_norm": 1.2073827981948853, "learning_rate": 0.00020546075085324232, "loss": 2.9927, "step": 3496 }, { "epoch": 3.9783845278725822, "grad_norm": 0.5181747674942017, "learning_rate": 0.00020523321956769056, "loss": 0.6257, "step": 3497 }, { "epoch": 3.9795221843003414, "grad_norm": 1.6364872455596924, "learning_rate": 0.0002050056882821388, "loss": 3.7021, "step": 3498 }, { "epoch": 3.9806598407281, "grad_norm": 0.7388929724693298, "learning_rate": 0.00020477815699658703, "loss": 1.1443, "step": 3499 }, { "epoch": 3.981797497155859, 
"grad_norm": 0.8518296480178833, "learning_rate": 0.00020455062571103527, "loss": 1.7027, "step": 3500 }, { "epoch": 3.9829351535836177, "grad_norm": 0.885672390460968, "learning_rate": 0.0002043230944254835, "loss": 1.5378, "step": 3501 }, { "epoch": 3.9840728100113765, "grad_norm": 0.7524988055229187, "learning_rate": 0.00020409556313993174, "loss": 1.4504, "step": 3502 }, { "epoch": 3.9852104664391352, "grad_norm": 0.8979845643043518, "learning_rate": 0.00020386803185437998, "loss": 1.747, "step": 3503 }, { "epoch": 3.986348122866894, "grad_norm": 0.9114024639129639, "learning_rate": 0.00020364050056882822, "loss": 2.2745, "step": 3504 }, { "epoch": 3.987485779294653, "grad_norm": 1.3460856676101685, "learning_rate": 0.00020341296928327645, "loss": 2.4317, "step": 3505 }, { "epoch": 3.988623435722412, "grad_norm": 0.6706535816192627, "learning_rate": 0.0002031854379977247, "loss": 1.5125, "step": 3506 }, { "epoch": 3.9897610921501707, "grad_norm": 0.6658063530921936, "learning_rate": 0.00020295790671217293, "loss": 0.6909, "step": 3507 }, { "epoch": 3.9908987485779295, "grad_norm": 0.7595200538635254, "learning_rate": 0.00020273037542662117, "loss": 1.3346, "step": 3508 }, { "epoch": 3.9920364050056882, "grad_norm": 1.1703503131866455, "learning_rate": 0.0002025028441410694, "loss": 1.7334, "step": 3509 }, { "epoch": 3.993174061433447, "grad_norm": 1.095198154449463, "learning_rate": 0.00020227531285551764, "loss": 2.9393, "step": 3510 }, { "epoch": 3.9943117178612058, "grad_norm": 1.123388409614563, "learning_rate": 0.00020204778156996588, "loss": 1.6582, "step": 3511 }, { "epoch": 3.995449374288965, "grad_norm": 0.6648460626602173, "learning_rate": 0.00020182025028441411, "loss": 1.0768, "step": 3512 }, { "epoch": 3.9965870307167233, "grad_norm": 1.1515564918518066, "learning_rate": 0.00020159271899886232, "loss": 3.5262, "step": 3513 }, { "epoch": 3.9977246871444825, "grad_norm": 0.7331502437591553, "learning_rate": 0.0002013651877133106, "loss": 1.3639, 
"step": 3514 }, { "epoch": 3.9988623435722412, "grad_norm": 1.1474095582962036, "learning_rate": 0.00020113765642775882, "loss": 1.7056, "step": 3515 }, { "epoch": 4.0, "grad_norm": 1.1696233749389648, "learning_rate": 0.00020091012514220706, "loss": 2.7192, "step": 3516 }, { "epoch": 4.0, "eval_f1": 0.8897, "eval_gen_len": 49.6091, "eval_loss": 1.8271052837371826, "eval_precision": 0.8878, "eval_recall": 0.8917, "eval_rouge1": 0.4366, "eval_rouge2": 0.1966, "eval_rougeL": 0.3643, "eval_rougeLsum": 0.4041, "eval_runtime": 28.1666, "eval_samples_per_second": 3.905, "eval_steps_per_second": 0.497, "step": 3516 }, { "epoch": 4.001137656427759, "grad_norm": 0.8035992383956909, "learning_rate": 0.0002006825938566553, "loss": 2.5708, "step": 3517 }, { "epoch": 4.0022753128555175, "grad_norm": 0.5220395922660828, "learning_rate": 0.0002004550625711035, "loss": 0.9217, "step": 3518 }, { "epoch": 4.003412969283277, "grad_norm": 0.8917536735534668, "learning_rate": 0.00020022753128555177, "loss": 1.7236, "step": 3519 }, { "epoch": 4.004550625711035, "grad_norm": 1.0690449476242065, "learning_rate": 0.0002, "loss": 2.2547, "step": 3520 }, { "epoch": 4.005688282138794, "grad_norm": 1.0998704433441162, "learning_rate": 0.00019977246871444825, "loss": 1.9161, "step": 3521 }, { "epoch": 4.006825938566553, "grad_norm": 0.6901816129684448, "learning_rate": 0.00019954493742889648, "loss": 0.828, "step": 3522 }, { "epoch": 4.007963594994312, "grad_norm": 1.0238609313964844, "learning_rate": 0.0001993174061433447, "loss": 1.0532, "step": 3523 }, { "epoch": 4.009101251422071, "grad_norm": 0.8400004506111145, "learning_rate": 0.00019908987485779296, "loss": 1.5558, "step": 3524 }, { "epoch": 4.010238907849829, "grad_norm": 0.8628343343734741, "learning_rate": 0.0001988623435722412, "loss": 1.6871, "step": 3525 }, { "epoch": 4.0113765642775885, "grad_norm": 1.2406960725784302, "learning_rate": 0.00019863481228668943, "loss": 1.9634, "step": 3526 }, { "epoch": 4.012514220705347, 
"grad_norm": 1.2915693521499634, "learning_rate": 0.00019840728100113767, "loss": 2.4605, "step": 3527 }, { "epoch": 4.013651877133106, "grad_norm": 1.227972149848938, "learning_rate": 0.00019817974971558588, "loss": 2.6666, "step": 3528 }, { "epoch": 4.014789533560864, "grad_norm": 1.1289844512939453, "learning_rate": 0.00019795221843003414, "loss": 1.9807, "step": 3529 }, { "epoch": 4.0159271899886235, "grad_norm": 0.8085461258888245, "learning_rate": 0.00019772468714448235, "loss": 0.9486, "step": 3530 }, { "epoch": 4.017064846416382, "grad_norm": 0.6080414652824402, "learning_rate": 0.00019749715585893062, "loss": 0.995, "step": 3531 }, { "epoch": 4.018202502844141, "grad_norm": 0.7787399291992188, "learning_rate": 0.00019726962457337885, "loss": 1.6868, "step": 3532 }, { "epoch": 4.0193401592719, "grad_norm": 0.7362167239189148, "learning_rate": 0.00019704209328782706, "loss": 1.096, "step": 3533 }, { "epoch": 4.020477815699659, "grad_norm": 1.1093764305114746, "learning_rate": 0.00019681456200227533, "loss": 1.16, "step": 3534 }, { "epoch": 4.021615472127418, "grad_norm": 1.1228350400924683, "learning_rate": 0.00019658703071672354, "loss": 2.6241, "step": 3535 }, { "epoch": 4.022753128555176, "grad_norm": 0.7320429086685181, "learning_rate": 0.0001963594994311718, "loss": 1.7036, "step": 3536 }, { "epoch": 4.023890784982935, "grad_norm": 0.8620187640190125, "learning_rate": 0.00019613196814562004, "loss": 1.7516, "step": 3537 }, { "epoch": 4.025028441410694, "grad_norm": 0.8271589875221252, "learning_rate": 0.00019590443686006825, "loss": 1.4492, "step": 3538 }, { "epoch": 4.026166097838453, "grad_norm": 1.1758748292922974, "learning_rate": 0.0001956769055745165, "loss": 2.0319, "step": 3539 }, { "epoch": 4.027303754266212, "grad_norm": 1.373871088027954, "learning_rate": 0.00019544937428896472, "loss": 1.6359, "step": 3540 }, { "epoch": 4.02844141069397, "grad_norm": 0.7867767214775085, "learning_rate": 0.00019522184300341299, "loss": 1.3931, "step": 3541 }, 
{ "epoch": 4.0295790671217295, "grad_norm": 0.9068583846092224, "learning_rate": 0.00019499431171786122, "loss": 1.5971, "step": 3542 }, { "epoch": 4.030716723549488, "grad_norm": 1.1074446439743042, "learning_rate": 0.00019476678043230943, "loss": 2.0918, "step": 3543 }, { "epoch": 4.031854379977247, "grad_norm": 0.8541181087493896, "learning_rate": 0.0001945392491467577, "loss": 2.3143, "step": 3544 }, { "epoch": 4.032992036405005, "grad_norm": 1.084362268447876, "learning_rate": 0.0001943117178612059, "loss": 1.7493, "step": 3545 }, { "epoch": 4.034129692832765, "grad_norm": 1.041742205619812, "learning_rate": 0.00019408418657565417, "loss": 1.8627, "step": 3546 }, { "epoch": 4.035267349260523, "grad_norm": 0.7245301604270935, "learning_rate": 0.00019385665529010238, "loss": 0.726, "step": 3547 }, { "epoch": 4.036405005688282, "grad_norm": 0.8224193453788757, "learning_rate": 0.00019362912400455062, "loss": 1.5722, "step": 3548 }, { "epoch": 4.037542662116041, "grad_norm": 1.1694352626800537, "learning_rate": 0.00019340159271899888, "loss": 1.0027, "step": 3549 }, { "epoch": 4.0386803185438, "grad_norm": 0.9688643217086792, "learning_rate": 0.0001931740614334471, "loss": 2.2008, "step": 3550 }, { "epoch": 4.039817974971559, "grad_norm": 0.6902188062667847, "learning_rate": 0.00019294653014789536, "loss": 1.4584, "step": 3551 }, { "epoch": 4.040955631399317, "grad_norm": 1.1102920770645142, "learning_rate": 0.00019271899886234357, "loss": 2.3507, "step": 3552 }, { "epoch": 4.042093287827076, "grad_norm": 0.5606656074523926, "learning_rate": 0.0001924914675767918, "loss": 1.1701, "step": 3553 }, { "epoch": 4.043230944254835, "grad_norm": 0.5422776937484741, "learning_rate": 0.00019226393629124007, "loss": 1.1612, "step": 3554 }, { "epoch": 4.044368600682594, "grad_norm": 1.0809518098831177, "learning_rate": 0.00019203640500568828, "loss": 2.5527, "step": 3555 }, { "epoch": 4.045506257110353, "grad_norm": 0.9332743287086487, "learning_rate": 0.00019180887372013654, 
"loss": 1.4009, "step": 3556 }, { "epoch": 4.046643913538111, "grad_norm": 0.721682608127594, "learning_rate": 0.00019158134243458475, "loss": 0.9377, "step": 3557 }, { "epoch": 4.047781569965871, "grad_norm": 0.8674675226211548, "learning_rate": 0.000191353811149033, "loss": 1.783, "step": 3558 }, { "epoch": 4.048919226393629, "grad_norm": 0.8048381209373474, "learning_rate": 0.00019112627986348125, "loss": 1.9681, "step": 3559 }, { "epoch": 4.050056882821388, "grad_norm": 1.0859109163284302, "learning_rate": 0.00019089874857792946, "loss": 1.4804, "step": 3560 }, { "epoch": 4.051194539249146, "grad_norm": 0.8473076820373535, "learning_rate": 0.00019067121729237773, "loss": 1.2142, "step": 3561 }, { "epoch": 4.052332195676906, "grad_norm": 0.7596136331558228, "learning_rate": 0.00019044368600682594, "loss": 1.2756, "step": 3562 }, { "epoch": 4.053469852104665, "grad_norm": 0.7931225895881653, "learning_rate": 0.00019021615472127417, "loss": 1.4067, "step": 3563 }, { "epoch": 4.054607508532423, "grad_norm": 0.8623807430267334, "learning_rate": 0.0001899886234357224, "loss": 1.6287, "step": 3564 }, { "epoch": 4.055745164960182, "grad_norm": 1.0621230602264404, "learning_rate": 0.00018976109215017065, "loss": 2.8103, "step": 3565 }, { "epoch": 4.056882821387941, "grad_norm": 0.8181642293930054, "learning_rate": 0.0001895335608646189, "loss": 1.1884, "step": 3566 }, { "epoch": 4.0580204778157, "grad_norm": 0.9738771319389343, "learning_rate": 0.00018930602957906712, "loss": 1.9755, "step": 3567 }, { "epoch": 4.059158134243458, "grad_norm": 0.9224938154220581, "learning_rate": 0.00018907849829351536, "loss": 1.288, "step": 3568 }, { "epoch": 4.060295790671217, "grad_norm": 0.8679994940757751, "learning_rate": 0.0001888509670079636, "loss": 1.4742, "step": 3569 }, { "epoch": 4.061433447098976, "grad_norm": 1.6250495910644531, "learning_rate": 0.00018862343572241183, "loss": 3.5226, "step": 3570 }, { "epoch": 4.062571103526735, "grad_norm": null, "learning_rate": 
0.00018862343572241183, "loss": 1.1473, "step": 3571 }, { "epoch": 4.063708759954494, "grad_norm": 1.0141960382461548, "learning_rate": 0.0001883959044368601, "loss": 2.936, "step": 3572 }, { "epoch": 4.064846416382252, "grad_norm": 1.2739934921264648, "learning_rate": 0.0001881683731513083, "loss": 2.1047, "step": 3573 }, { "epoch": 4.065984072810012, "grad_norm": 1.6481419801712036, "learning_rate": 0.00018794084186575654, "loss": 3.6404, "step": 3574 }, { "epoch": 4.06712172923777, "grad_norm": 0.9871388673782349, "learning_rate": 0.00018771331058020478, "loss": 2.5576, "step": 3575 }, { "epoch": 4.068259385665529, "grad_norm": 0.8460843563079834, "learning_rate": 0.00018748577929465302, "loss": 2.5027, "step": 3576 }, { "epoch": 4.0693970420932875, "grad_norm": 0.9054187536239624, "learning_rate": 0.00018725824800910125, "loss": 1.5227, "step": 3577 }, { "epoch": 4.070534698521047, "grad_norm": 1.1341551542282104, "learning_rate": 0.0001870307167235495, "loss": 1.8531, "step": 3578 }, { "epoch": 4.071672354948806, "grad_norm": 0.7718223929405212, "learning_rate": 0.00018680318543799773, "loss": 1.1577, "step": 3579 }, { "epoch": 4.072810011376564, "grad_norm": 0.7591809034347534, "learning_rate": 0.00018657565415244596, "loss": 1.069, "step": 3580 }, { "epoch": 4.073947667804323, "grad_norm": 1.5402214527130127, "learning_rate": 0.0001863481228668942, "loss": 3.2458, "step": 3581 }, { "epoch": 4.075085324232082, "grad_norm": 1.6219172477722168, "learning_rate": 0.00018612059158134244, "loss": 2.8352, "step": 3582 }, { "epoch": 4.076222980659841, "grad_norm": 0.7609260082244873, "learning_rate": 0.00018589306029579068, "loss": 1.5297, "step": 3583 }, { "epoch": 4.077360637087599, "grad_norm": 0.9463198184967041, "learning_rate": 0.0001856655290102389, "loss": 1.4272, "step": 3584 }, { "epoch": 4.078498293515358, "grad_norm": 1.0422461032867432, "learning_rate": 0.00018543799772468715, "loss": 2.3036, "step": 3585 }, { "epoch": 4.079635949943118, "grad_norm": 
1.6263346672058105, "learning_rate": 0.0001852104664391354, "loss": 2.6899, "step": 3586 }, { "epoch": 4.080773606370876, "grad_norm": 0.8726171851158142, "learning_rate": 0.0001849829351535836, "loss": 1.9618, "step": 3587 }, { "epoch": 4.081911262798635, "grad_norm": 1.0334999561309814, "learning_rate": 0.00018475540386803186, "loss": 1.9655, "step": 3588 }, { "epoch": 4.0830489192263935, "grad_norm": 1.2761929035186768, "learning_rate": 0.0001845278725824801, "loss": 1.1134, "step": 3589 }, { "epoch": 4.084186575654153, "grad_norm": 1.0597755908966064, "learning_rate": 0.00018430034129692833, "loss": 1.6925, "step": 3590 }, { "epoch": 4.085324232081911, "grad_norm": 1.0300536155700684, "learning_rate": 0.00018407281001137657, "loss": 1.877, "step": 3591 }, { "epoch": 4.08646188850967, "grad_norm": 1.1007896661758423, "learning_rate": 0.00018384527872582478, "loss": 1.8541, "step": 3592 }, { "epoch": 4.0875995449374285, "grad_norm": 0.9704376459121704, "learning_rate": 0.00018361774744027305, "loss": 1.1683, "step": 3593 }, { "epoch": 4.088737201365188, "grad_norm": 1.4303745031356812, "learning_rate": 0.00018339021615472128, "loss": 1.9568, "step": 3594 }, { "epoch": 4.089874857792947, "grad_norm": 0.7820960879325867, "learning_rate": 0.00018316268486916952, "loss": 1.3428, "step": 3595 }, { "epoch": 4.091012514220705, "grad_norm": 0.9371392726898193, "learning_rate": 0.00018293515358361776, "loss": 1.4801, "step": 3596 }, { "epoch": 4.092150170648464, "grad_norm": 0.9383748173713684, "learning_rate": 0.00018270762229806597, "loss": 1.8923, "step": 3597 }, { "epoch": 4.093287827076223, "grad_norm": 1.0831904411315918, "learning_rate": 0.00018248009101251423, "loss": 2.4795, "step": 3598 }, { "epoch": 4.094425483503982, "grad_norm": 0.872491180896759, "learning_rate": 0.00018225255972696247, "loss": 1.6777, "step": 3599 }, { "epoch": 4.09556313993174, "grad_norm": 1.4505406618118286, "learning_rate": 0.0001820250284414107, "loss": 2.7823, "step": 3600 }, { 
"epoch": 4.0967007963594995, "grad_norm": 1.2872380018234253, "learning_rate": 0.00018179749715585894, "loss": 2.3393, "step": 3601 }, { "epoch": 4.097838452787259, "grad_norm": 1.019313097000122, "learning_rate": 0.00018156996587030715, "loss": 2.0507, "step": 3602 }, { "epoch": 4.098976109215017, "grad_norm": 1.1505863666534424, "learning_rate": 0.00018134243458475542, "loss": 1.7178, "step": 3603 }, { "epoch": 4.100113765642776, "grad_norm": 0.6773290038108826, "learning_rate": 0.00018111490329920363, "loss": 1.6069, "step": 3604 }, { "epoch": 4.1012514220705345, "grad_norm": 0.7799268960952759, "learning_rate": 0.0001808873720136519, "loss": 1.6404, "step": 3605 }, { "epoch": 4.102389078498294, "grad_norm": 1.3413171768188477, "learning_rate": 0.00018065984072810013, "loss": 4.3731, "step": 3606 }, { "epoch": 4.103526734926052, "grad_norm": 1.068517804145813, "learning_rate": 0.00018043230944254834, "loss": 1.7495, "step": 3607 }, { "epoch": 4.104664391353811, "grad_norm": 0.895536482334137, "learning_rate": 0.0001802047781569966, "loss": 2.339, "step": 3608 }, { "epoch": 4.1058020477815695, "grad_norm": 1.09737229347229, "learning_rate": 0.0001799772468714448, "loss": 1.8625, "step": 3609 }, { "epoch": 4.106939704209329, "grad_norm": 1.0622013807296753, "learning_rate": 0.00017974971558589307, "loss": 1.309, "step": 3610 }, { "epoch": 4.108077360637088, "grad_norm": 1.0724921226501465, "learning_rate": 0.0001795221843003413, "loss": 2.1258, "step": 3611 }, { "epoch": 4.109215017064846, "grad_norm": 0.6296405792236328, "learning_rate": 0.00017929465301478952, "loss": 1.1405, "step": 3612 }, { "epoch": 4.1103526734926055, "grad_norm": 0.6233872771263123, "learning_rate": 0.00017906712172923779, "loss": 1.4618, "step": 3613 }, { "epoch": 4.111490329920364, "grad_norm": 1.0284563302993774, "learning_rate": 0.000178839590443686, "loss": 2.1088, "step": 3614 }, { "epoch": 4.112627986348123, "grad_norm": 0.5234109163284302, "learning_rate": 0.00017861205915813426, 
"loss": 0.6546, "step": 3615 }, { "epoch": 4.113765642775881, "grad_norm": 1.1184611320495605, "learning_rate": 0.0001783845278725825, "loss": 2.2601, "step": 3616 }, { "epoch": 4.1149032992036405, "grad_norm": 0.5588635802268982, "learning_rate": 0.0001781569965870307, "loss": 1.2117, "step": 3617 }, { "epoch": 4.1160409556314, "grad_norm": 1.029264211654663, "learning_rate": 0.00017792946530147897, "loss": 2.4449, "step": 3618 }, { "epoch": 4.117178612059158, "grad_norm": 0.6462013125419617, "learning_rate": 0.00017770193401592718, "loss": 1.0404, "step": 3619 }, { "epoch": 4.118316268486917, "grad_norm": 0.8388407826423645, "learning_rate": 0.00017747440273037544, "loss": 1.1833, "step": 3620 }, { "epoch": 4.1194539249146755, "grad_norm": 0.6785479187965393, "learning_rate": 0.00017724687144482365, "loss": 1.404, "step": 3621 }, { "epoch": 4.120591581342435, "grad_norm": 0.742090106010437, "learning_rate": 0.0001770193401592719, "loss": 1.844, "step": 3622 }, { "epoch": 4.121729237770193, "grad_norm": 0.5642427206039429, "learning_rate": 0.00017679180887372016, "loss": 0.8566, "step": 3623 }, { "epoch": 4.122866894197952, "grad_norm": 1.279248595237732, "learning_rate": 0.00017656427758816837, "loss": 2.091, "step": 3624 }, { "epoch": 4.1240045506257115, "grad_norm": 0.8221580386161804, "learning_rate": 0.00017633674630261663, "loss": 1.0874, "step": 3625 }, { "epoch": 4.12514220705347, "grad_norm": 1.105421543121338, "learning_rate": 0.00017610921501706484, "loss": 1.8684, "step": 3626 }, { "epoch": 4.126279863481229, "grad_norm": 0.6208410859107971, "learning_rate": 0.00017588168373151308, "loss": 1.1682, "step": 3627 }, { "epoch": 4.127417519908987, "grad_norm": 1.1576206684112549, "learning_rate": 0.00017565415244596134, "loss": 2.1256, "step": 3628 }, { "epoch": 4.1285551763367465, "grad_norm": 0.9814143180847168, "learning_rate": 0.00017542662116040955, "loss": 2.8486, "step": 3629 }, { "epoch": 4.129692832764505, "grad_norm": 1.1456234455108643, 
"learning_rate": 0.00017519908987485781, "loss": 1.3626, "step": 3630 }, { "epoch": 4.130830489192264, "grad_norm": 0.8578894734382629, "learning_rate": 0.00017497155858930602, "loss": 1.7545, "step": 3631 }, { "epoch": 4.131968145620022, "grad_norm": 0.7834432721138, "learning_rate": 0.00017474402730375426, "loss": 1.7061, "step": 3632 }, { "epoch": 4.1331058020477816, "grad_norm": 1.3007863759994507, "learning_rate": 0.00017451649601820253, "loss": 1.5035, "step": 3633 }, { "epoch": 4.134243458475541, "grad_norm": 1.0654963254928589, "learning_rate": 0.00017428896473265074, "loss": 2.8058, "step": 3634 }, { "epoch": 4.135381114903299, "grad_norm": 1.042611002922058, "learning_rate": 0.000174061433447099, "loss": 1.5158, "step": 3635 }, { "epoch": 4.136518771331058, "grad_norm": 0.8858871459960938, "learning_rate": 0.0001738339021615472, "loss": 2.2742, "step": 3636 }, { "epoch": 4.137656427758817, "grad_norm": 1.099360466003418, "learning_rate": 0.00017360637087599545, "loss": 2.7292, "step": 3637 }, { "epoch": 4.138794084186576, "grad_norm": 0.520845890045166, "learning_rate": 0.00017337883959044368, "loss": 0.512, "step": 3638 }, { "epoch": 4.139931740614334, "grad_norm": 1.025590181350708, "learning_rate": 0.00017315130830489192, "loss": 2.4849, "step": 3639 }, { "epoch": 4.141069397042093, "grad_norm": 2.0368833541870117, "learning_rate": 0.00017292377701934018, "loss": 2.6333, "step": 3640 }, { "epoch": 4.1422070534698525, "grad_norm": 0.952337920665741, "learning_rate": 0.0001726962457337884, "loss": 0.9957, "step": 3641 }, { "epoch": 4.143344709897611, "grad_norm": 0.782004177570343, "learning_rate": 0.00017246871444823663, "loss": 1.4577, "step": 3642 }, { "epoch": 4.14448236632537, "grad_norm": 0.8281341195106506, "learning_rate": 0.00017224118316268487, "loss": 1.6821, "step": 3643 }, { "epoch": 4.145620022753128, "grad_norm": 0.8365357518196106, "learning_rate": 0.0001720136518771331, "loss": 2.0037, "step": 3644 }, { "epoch": 4.146757679180888, 
"grad_norm": 0.7121614813804626, "learning_rate": 0.00017178612059158137, "loss": 1.2755, "step": 3645 }, { "epoch": 4.147895335608646, "grad_norm": 0.890727162361145, "learning_rate": 0.00017155858930602958, "loss": 1.4112, "step": 3646 }, { "epoch": 4.149032992036405, "grad_norm": 0.9248006343841553, "learning_rate": 0.00017133105802047782, "loss": 1.4232, "step": 3647 }, { "epoch": 4.150170648464163, "grad_norm": 0.775360107421875, "learning_rate": 0.00017110352673492605, "loss": 1.675, "step": 3648 }, { "epoch": 4.151308304891923, "grad_norm": 0.8243028521537781, "learning_rate": 0.0001708759954493743, "loss": 1.8722, "step": 3649 }, { "epoch": 4.152445961319682, "grad_norm": 1.020837426185608, "learning_rate": 0.00017064846416382255, "loss": 2.4388, "step": 3650 }, { "epoch": 4.15358361774744, "grad_norm": 1.433733344078064, "learning_rate": 0.00017042093287827076, "loss": 3.276, "step": 3651 }, { "epoch": 4.154721274175199, "grad_norm": 0.7089780569076538, "learning_rate": 0.000170193401592719, "loss": 1.5784, "step": 3652 }, { "epoch": 4.155858930602958, "grad_norm": 1.084784984588623, "learning_rate": 0.00016996587030716724, "loss": 1.4955, "step": 3653 }, { "epoch": 4.156996587030717, "grad_norm": 0.6325297951698303, "learning_rate": 0.00016973833902161548, "loss": 0.6437, "step": 3654 }, { "epoch": 4.158134243458475, "grad_norm": 0.7360290884971619, "learning_rate": 0.0001695108077360637, "loss": 0.833, "step": 3655 }, { "epoch": 4.159271899886234, "grad_norm": 0.983505129814148, "learning_rate": 0.00016928327645051195, "loss": 2.5266, "step": 3656 }, { "epoch": 4.160409556313994, "grad_norm": 1.1645041704177856, "learning_rate": 0.00016905574516496019, "loss": 1.9561, "step": 3657 }, { "epoch": 4.161547212741752, "grad_norm": 0.8973751068115234, "learning_rate": 0.00016882821387940842, "loss": 0.9499, "step": 3658 }, { "epoch": 4.162684869169511, "grad_norm": 0.748742401599884, "learning_rate": 0.00016860068259385666, "loss": 1.542, "step": 3659 }, { 
"epoch": 4.163822525597269, "grad_norm": 0.6661348342895508, "learning_rate": 0.0001683731513083049, "loss": 1.4095, "step": 3660 }, { "epoch": 4.164960182025029, "grad_norm": 1.0042301416397095, "learning_rate": 0.00016814562002275313, "loss": 1.6135, "step": 3661 }, { "epoch": 4.166097838452787, "grad_norm": 1.056707739830017, "learning_rate": 0.00016791808873720137, "loss": 1.8746, "step": 3662 }, { "epoch": 4.167235494880546, "grad_norm": 1.1747101545333862, "learning_rate": 0.0001676905574516496, "loss": 1.8581, "step": 3663 }, { "epoch": 4.168373151308305, "grad_norm": 0.9841439723968506, "learning_rate": 0.00016746302616609785, "loss": 2.1848, "step": 3664 }, { "epoch": 4.169510807736064, "grad_norm": 1.0959367752075195, "learning_rate": 0.00016723549488054606, "loss": 2.9336, "step": 3665 }, { "epoch": 4.170648464163823, "grad_norm": 1.0343743562698364, "learning_rate": 0.00016700796359499432, "loss": 1.8865, "step": 3666 }, { "epoch": 4.171786120591581, "grad_norm": 1.1250258684158325, "learning_rate": 0.00016678043230944256, "loss": 2.206, "step": 3667 }, { "epoch": 4.17292377701934, "grad_norm": 0.8602643609046936, "learning_rate": 0.0001665529010238908, "loss": 1.6804, "step": 3668 }, { "epoch": 4.174061433447099, "grad_norm": 1.1515132188796997, "learning_rate": 0.00016632536973833903, "loss": 2.0477, "step": 3669 }, { "epoch": 4.175199089874858, "grad_norm": 1.0483062267303467, "learning_rate": 0.00016609783845278724, "loss": 1.4297, "step": 3670 }, { "epoch": 4.176336746302616, "grad_norm": 1.0967140197753906, "learning_rate": 0.0001658703071672355, "loss": 2.4236, "step": 3671 }, { "epoch": 4.177474402730375, "grad_norm": 0.9350886940956116, "learning_rate": 0.00016564277588168371, "loss": 2.0317, "step": 3672 }, { "epoch": 4.178612059158135, "grad_norm": 0.6748828291893005, "learning_rate": 0.00016541524459613198, "loss": 1.6643, "step": 3673 }, { "epoch": 4.179749715585893, "grad_norm": 1.0818042755126953, "learning_rate": 0.00016518771331058022, 
"loss": 1.7651, "step": 3674 }, { "epoch": 4.180887372013652, "grad_norm": 0.8341732621192932, "learning_rate": 0.00016496018202502842, "loss": 1.7182, "step": 3675 }, { "epoch": 4.18202502844141, "grad_norm": 0.7781969904899597, "learning_rate": 0.0001647326507394767, "loss": 1.5752, "step": 3676 }, { "epoch": 4.18316268486917, "grad_norm": 0.9930599927902222, "learning_rate": 0.0001645051194539249, "loss": 1.7508, "step": 3677 }, { "epoch": 4.184300341296928, "grad_norm": 1.0659433603286743, "learning_rate": 0.00016427758816837316, "loss": 2.5191, "step": 3678 }, { "epoch": 4.185437997724687, "grad_norm": 0.7574253082275391, "learning_rate": 0.0001640500568828214, "loss": 1.2169, "step": 3679 }, { "epoch": 4.186575654152446, "grad_norm": 0.7488005757331848, "learning_rate": 0.0001638225255972696, "loss": 1.2127, "step": 3680 }, { "epoch": 4.187713310580205, "grad_norm": 0.8474146127700806, "learning_rate": 0.00016359499431171787, "loss": 1.2748, "step": 3681 }, { "epoch": 4.188850967007964, "grad_norm": 0.9467219710350037, "learning_rate": 0.00016336746302616608, "loss": 2.0191, "step": 3682 }, { "epoch": 4.189988623435722, "grad_norm": 0.7899359464645386, "learning_rate": 0.00016313993174061435, "loss": 0.7179, "step": 3683 }, { "epoch": 4.191126279863481, "grad_norm": 1.5700042247772217, "learning_rate": 0.00016291240045506259, "loss": 2.2769, "step": 3684 }, { "epoch": 4.19226393629124, "grad_norm": 0.9096818566322327, "learning_rate": 0.0001626848691695108, "loss": 1.828, "step": 3685 }, { "epoch": 4.193401592718999, "grad_norm": 0.9237043857574463, "learning_rate": 0.00016245733788395906, "loss": 1.4392, "step": 3686 }, { "epoch": 4.194539249146757, "grad_norm": 1.1367732286453247, "learning_rate": 0.00016222980659840727, "loss": 3.0341, "step": 3687 }, { "epoch": 4.1956769055745164, "grad_norm": 0.9193979501724243, "learning_rate": 0.00016200227531285553, "loss": 1.32, "step": 3688 }, { "epoch": 4.196814562002276, "grad_norm": 1.5365267992019653, 
"learning_rate": 0.00016177474402730374, "loss": 3.1212, "step": 3689 }, { "epoch": 4.197952218430034, "grad_norm": 0.8413816094398499, "learning_rate": 0.00016154721274175198, "loss": 1.7901, "step": 3690 }, { "epoch": 4.199089874857793, "grad_norm": 0.6427741050720215, "learning_rate": 0.00016131968145620024, "loss": 0.5028, "step": 3691 }, { "epoch": 4.2002275312855515, "grad_norm": 0.9684869647026062, "learning_rate": 0.00016109215017064845, "loss": 2.0259, "step": 3692 }, { "epoch": 4.201365187713311, "grad_norm": 0.6879338622093201, "learning_rate": 0.00016086461888509672, "loss": 1.3484, "step": 3693 }, { "epoch": 4.202502844141069, "grad_norm": 0.9934858679771423, "learning_rate": 0.00016063708759954493, "loss": 1.5266, "step": 3694 }, { "epoch": 4.203640500568828, "grad_norm": 1.18158757686615, "learning_rate": 0.00016040955631399316, "loss": 2.0914, "step": 3695 }, { "epoch": 4.204778156996587, "grad_norm": 0.776360809803009, "learning_rate": 0.00016018202502844143, "loss": 1.8111, "step": 3696 }, { "epoch": 4.205915813424346, "grad_norm": 0.5315882563591003, "learning_rate": 0.00015995449374288964, "loss": 0.8625, "step": 3697 }, { "epoch": 4.207053469852105, "grad_norm": 0.7266376614570618, "learning_rate": 0.0001597269624573379, "loss": 0.9552, "step": 3698 }, { "epoch": 4.208191126279863, "grad_norm": 0.8702644109725952, "learning_rate": 0.0001594994311717861, "loss": 1.4781, "step": 3699 }, { "epoch": 4.2093287827076225, "grad_norm": 0.6491544246673584, "learning_rate": 0.00015927189988623435, "loss": 0.9899, "step": 3700 }, { "epoch": 4.210466439135381, "grad_norm": 1.0166810750961304, "learning_rate": 0.00015904436860068261, "loss": 1.7634, "step": 3701 }, { "epoch": 4.21160409556314, "grad_norm": 1.268738031387329, "learning_rate": 0.00015881683731513082, "loss": 2.6249, "step": 3702 }, { "epoch": 4.212741751990899, "grad_norm": 1.180303692817688, "learning_rate": 0.0001585893060295791, "loss": 3.388, "step": 3703 }, { "epoch": 4.2138794084186575, 
"grad_norm": 0.736415684223175, "learning_rate": 0.0001583617747440273, "loss": 1.1843, "step": 3704 }, { "epoch": 4.215017064846417, "grad_norm": 1.1527396440505981, "learning_rate": 0.00015813424345847553, "loss": 2.0684, "step": 3705 }, { "epoch": 4.216154721274175, "grad_norm": 1.2436633110046387, "learning_rate": 0.00015790671217292377, "loss": 3.2018, "step": 3706 }, { "epoch": 4.217292377701934, "grad_norm": 0.5855286717414856, "learning_rate": 0.000157679180887372, "loss": 1.0727, "step": 3707 }, { "epoch": 4.2184300341296925, "grad_norm": 1.469887375831604, "learning_rate": 0.00015745164960182027, "loss": 3.1109, "step": 3708 }, { "epoch": 4.219567690557452, "grad_norm": 0.6294847130775452, "learning_rate": 0.00015722411831626848, "loss": 1.3746, "step": 3709 }, { "epoch": 4.22070534698521, "grad_norm": 0.7737325429916382, "learning_rate": 0.00015699658703071672, "loss": 1.4918, "step": 3710 }, { "epoch": 4.221843003412969, "grad_norm": 0.7594680786132812, "learning_rate": 0.00015676905574516496, "loss": 2.4034, "step": 3711 }, { "epoch": 4.2229806598407285, "grad_norm": 0.9010804295539856, "learning_rate": 0.0001565415244596132, "loss": 1.1525, "step": 3712 }, { "epoch": 4.224118316268487, "grad_norm": 1.6272693872451782, "learning_rate": 0.00015631399317406146, "loss": 3.6835, "step": 3713 }, { "epoch": 4.225255972696246, "grad_norm": 0.7173957228660583, "learning_rate": 0.00015608646188850967, "loss": 1.3503, "step": 3714 }, { "epoch": 4.226393629124004, "grad_norm": 0.8401118516921997, "learning_rate": 0.0001558589306029579, "loss": 1.1993, "step": 3715 }, { "epoch": 4.2275312855517635, "grad_norm": 0.8311299681663513, "learning_rate": 0.00015563139931740614, "loss": 1.3891, "step": 3716 }, { "epoch": 4.228668941979522, "grad_norm": 2.7499144077301025, "learning_rate": 0.00015540386803185438, "loss": 4.453, "step": 3717 }, { "epoch": 4.229806598407281, "grad_norm": 0.736031174659729, "learning_rate": 0.00015517633674630264, "loss": 1.815, "step": 3718 
}, { "epoch": 4.23094425483504, "grad_norm": 0.7462246417999268, "learning_rate": 0.00015494880546075085, "loss": 1.0605, "step": 3719 }, { "epoch": 4.2320819112627985, "grad_norm": 1.0690699815750122, "learning_rate": 0.0001547212741751991, "loss": 1.1685, "step": 3720 }, { "epoch": 4.233219567690558, "grad_norm": 0.781626284122467, "learning_rate": 0.00015449374288964733, "loss": 0.8943, "step": 3721 }, { "epoch": 4.234357224118316, "grad_norm": 0.9446958899497986, "learning_rate": 0.00015426621160409556, "loss": 1.8685, "step": 3722 }, { "epoch": 4.235494880546075, "grad_norm": 1.0105640888214111, "learning_rate": 0.00015403868031854383, "loss": 2.961, "step": 3723 }, { "epoch": 4.236632536973834, "grad_norm": 0.6021206974983215, "learning_rate": 0.00015381114903299204, "loss": 1.5929, "step": 3724 }, { "epoch": 4.237770193401593, "grad_norm": 0.7067481279373169, "learning_rate": 0.00015358361774744027, "loss": 0.7998, "step": 3725 }, { "epoch": 4.238907849829351, "grad_norm": 0.968774676322937, "learning_rate": 0.0001533560864618885, "loss": 1.5134, "step": 3726 }, { "epoch": 4.24004550625711, "grad_norm": 0.7576475739479065, "learning_rate": 0.00015312855517633675, "loss": 1.3597, "step": 3727 }, { "epoch": 4.2411831626848695, "grad_norm": 0.8560457229614258, "learning_rate": 0.00015290102389078499, "loss": 1.7398, "step": 3728 }, { "epoch": 4.242320819112628, "grad_norm": 1.2698020935058594, "learning_rate": 0.00015267349260523322, "loss": 1.4396, "step": 3729 }, { "epoch": 4.243458475540387, "grad_norm": 0.9575531482696533, "learning_rate": 0.00015244596131968146, "loss": 1.3904, "step": 3730 }, { "epoch": 4.244596131968145, "grad_norm": 1.038975715637207, "learning_rate": 0.0001522184300341297, "loss": 1.6743, "step": 3731 }, { "epoch": 4.2457337883959045, "grad_norm": 1.3596773147583008, "learning_rate": 0.00015199089874857793, "loss": 3.709, "step": 3732 }, { "epoch": 4.246871444823663, "grad_norm": 0.5811980962753296, "learning_rate": 
0.00015176336746302617, "loss": 1.1902, "step": 3733 }, { "epoch": 4.248009101251422, "grad_norm": 0.7279687523841858, "learning_rate": 0.0001515358361774744, "loss": 1.28, "step": 3734 }, { "epoch": 4.249146757679181, "grad_norm": 0.8053296208381653, "learning_rate": 0.00015130830489192264, "loss": 1.5716, "step": 3735 }, { "epoch": 4.25028441410694, "grad_norm": 1.2876355648040771, "learning_rate": 0.00015108077360637088, "loss": 2.0535, "step": 3736 }, { "epoch": 4.251422070534699, "grad_norm": 1.4081701040267944, "learning_rate": 0.00015085324232081912, "loss": 2.8273, "step": 3737 }, { "epoch": 4.252559726962457, "grad_norm": 1.2183709144592285, "learning_rate": 0.00015062571103526736, "loss": 2.7009, "step": 3738 }, { "epoch": 4.253697383390216, "grad_norm": 1.22077476978302, "learning_rate": 0.0001503981797497156, "loss": 2.3349, "step": 3739 }, { "epoch": 4.254835039817975, "grad_norm": 0.965623140335083, "learning_rate": 0.00015017064846416383, "loss": 2.194, "step": 3740 }, { "epoch": 4.255972696245734, "grad_norm": 0.9789025187492371, "learning_rate": 0.00014994311717861207, "loss": 1.3343, "step": 3741 }, { "epoch": 4.257110352673493, "grad_norm": 0.7638621926307678, "learning_rate": 0.0001497155858930603, "loss": 1.2936, "step": 3742 }, { "epoch": 4.258248009101251, "grad_norm": 0.8102353811264038, "learning_rate": 0.00014948805460750854, "loss": 1.6045, "step": 3743 }, { "epoch": 4.2593856655290105, "grad_norm": 1.2423808574676514, "learning_rate": 0.00014926052332195678, "loss": 2.631, "step": 3744 }, { "epoch": 4.260523321956769, "grad_norm": 1.3112363815307617, "learning_rate": 0.000149032992036405, "loss": 2.9128, "step": 3745 }, { "epoch": 4.261660978384528, "grad_norm": 1.219910740852356, "learning_rate": 0.00014880546075085325, "loss": 2.0505, "step": 3746 }, { "epoch": 4.262798634812286, "grad_norm": 0.9391239881515503, "learning_rate": 0.0001485779294653015, "loss": 1.2113, "step": 3747 }, { "epoch": 4.263936291240046, "grad_norm": 
1.231005072593689, "learning_rate": 0.00014835039817974973, "loss": 1.5756, "step": 3748 }, { "epoch": 4.265073947667805, "grad_norm": 1.1906815767288208, "learning_rate": 0.00014812286689419796, "loss": 2.0473, "step": 3749 }, { "epoch": 4.266211604095563, "grad_norm": 0.9939584136009216, "learning_rate": 0.00014789533560864617, "loss": 1.4565, "step": 3750 }, { "epoch": 4.267349260523322, "grad_norm": 0.7417742609977722, "learning_rate": 0.00014766780432309444, "loss": 1.7072, "step": 3751 }, { "epoch": 4.268486916951081, "grad_norm": 0.9343417882919312, "learning_rate": 0.00014744027303754267, "loss": 1.6886, "step": 3752 }, { "epoch": 4.26962457337884, "grad_norm": 1.7899564504623413, "learning_rate": 0.0001472127417519909, "loss": 3.9775, "step": 3753 }, { "epoch": 4.270762229806598, "grad_norm": 0.759251058101654, "learning_rate": 0.00014698521046643915, "loss": 1.273, "step": 3754 }, { "epoch": 4.271899886234357, "grad_norm": 1.0627682209014893, "learning_rate": 0.00014675767918088736, "loss": 1.5295, "step": 3755 }, { "epoch": 4.273037542662116, "grad_norm": 0.836782693862915, "learning_rate": 0.00014653014789533562, "loss": 1.7053, "step": 3756 }, { "epoch": 4.274175199089875, "grad_norm": 0.8767544031143188, "learning_rate": 0.00014630261660978386, "loss": 1.7385, "step": 3757 }, { "epoch": 4.275312855517634, "grad_norm": 1.1457442045211792, "learning_rate": 0.0001460750853242321, "loss": 2.2233, "step": 3758 }, { "epoch": 4.276450511945392, "grad_norm": 1.1655712127685547, "learning_rate": 0.00014584755403868033, "loss": 1.6773, "step": 3759 }, { "epoch": 4.277588168373152, "grad_norm": 0.9990490674972534, "learning_rate": 0.00014562002275312854, "loss": 1.7472, "step": 3760 }, { "epoch": 4.27872582480091, "grad_norm": 1.2598174810409546, "learning_rate": 0.0001453924914675768, "loss": 2.318, "step": 3761 }, { "epoch": 4.279863481228669, "grad_norm": 1.3087960481643677, "learning_rate": 0.00014516496018202502, "loss": 2.8482, "step": 3762 }, { "epoch": 
4.281001137656427, "grad_norm": 1.1006321907043457, "learning_rate": 0.00014493742889647328, "loss": 2.0884, "step": 3763 }, { "epoch": 4.282138794084187, "grad_norm": 0.648513674736023, "learning_rate": 0.00014470989761092152, "loss": 1.5729, "step": 3764 }, { "epoch": 4.283276450511945, "grad_norm": 0.9822235107421875, "learning_rate": 0.00014448236632536973, "loss": 1.5298, "step": 3765 }, { "epoch": 4.284414106939704, "grad_norm": 0.9084975123405457, "learning_rate": 0.000144254835039818, "loss": 2.1044, "step": 3766 }, { "epoch": 4.285551763367463, "grad_norm": 1.9358890056610107, "learning_rate": 0.0001440273037542662, "loss": 3.062, "step": 3767 }, { "epoch": 4.286689419795222, "grad_norm": 0.8079110980033875, "learning_rate": 0.00014379977246871444, "loss": 1.7288, "step": 3768 }, { "epoch": 4.287827076222981, "grad_norm": 0.6145922541618347, "learning_rate": 0.0001435722411831627, "loss": 1.0093, "step": 3769 }, { "epoch": 4.288964732650739, "grad_norm": 0.9286468625068665, "learning_rate": 0.0001433447098976109, "loss": 1.4367, "step": 3770 }, { "epoch": 4.290102389078498, "grad_norm": 0.7882009148597717, "learning_rate": 0.00014311717861205918, "loss": 1.2809, "step": 3771 }, { "epoch": 4.291240045506257, "grad_norm": 1.0285731554031372, "learning_rate": 0.00014288964732650739, "loss": 2.5163, "step": 3772 }, { "epoch": 4.292377701934016, "grad_norm": 1.0870704650878906, "learning_rate": 0.00014266211604095562, "loss": 1.9514, "step": 3773 }, { "epoch": 4.293515358361775, "grad_norm": 0.9731798768043518, "learning_rate": 0.0001424345847554039, "loss": 1.2031, "step": 3774 }, { "epoch": 4.294653014789533, "grad_norm": 0.6875901222229004, "learning_rate": 0.0001422070534698521, "loss": 0.9267, "step": 3775 }, { "epoch": 4.295790671217293, "grad_norm": 0.746791422367096, "learning_rate": 0.00014197952218430036, "loss": 1.5462, "step": 3776 }, { "epoch": 4.296928327645051, "grad_norm": 0.7123384475708008, "learning_rate": 0.00014175199089874857, "loss": 
1.3801, "step": 3777 }, { "epoch": 4.29806598407281, "grad_norm": 0.554326593875885, "learning_rate": 0.0001415244596131968, "loss": 0.6866, "step": 3778 }, { "epoch": 4.2992036405005685, "grad_norm": 0.9146575331687927, "learning_rate": 0.00014129692832764505, "loss": 1.5966, "step": 3779 }, { "epoch": 4.300341296928328, "grad_norm": 0.6338298916816711, "learning_rate": 0.00014106939704209328, "loss": 1.0133, "step": 3780 }, { "epoch": 4.301478953356087, "grad_norm": 0.6664391756057739, "learning_rate": 0.00014084186575654155, "loss": 1.1051, "step": 3781 }, { "epoch": 4.302616609783845, "grad_norm": 0.8975256085395813, "learning_rate": 0.00014061433447098976, "loss": 1.273, "step": 3782 }, { "epoch": 4.303754266211604, "grad_norm": 1.1492512226104736, "learning_rate": 0.000140386803185438, "loss": 2.3168, "step": 3783 }, { "epoch": 4.304891922639363, "grad_norm": 1.3415923118591309, "learning_rate": 0.00014015927189988623, "loss": 2.4386, "step": 3784 }, { "epoch": 4.306029579067122, "grad_norm": 1.2044072151184082, "learning_rate": 0.00013993174061433447, "loss": 3.6453, "step": 3785 }, { "epoch": 4.30716723549488, "grad_norm": 0.7362768650054932, "learning_rate": 0.00013970420932878273, "loss": 1.5149, "step": 3786 }, { "epoch": 4.308304891922639, "grad_norm": 0.8360229134559631, "learning_rate": 0.00013947667804323094, "loss": 2.0411, "step": 3787 }, { "epoch": 4.309442548350399, "grad_norm": 0.8958834409713745, "learning_rate": 0.00013924914675767918, "loss": 1.5186, "step": 3788 }, { "epoch": 4.310580204778157, "grad_norm": 0.7361008524894714, "learning_rate": 0.00013902161547212741, "loss": 2.1691, "step": 3789 }, { "epoch": 4.311717861205916, "grad_norm": 0.9051206111907959, "learning_rate": 0.00013879408418657565, "loss": 1.7265, "step": 3790 }, { "epoch": 4.3128555176336745, "grad_norm": 1.077871561050415, "learning_rate": 0.00013856655290102392, "loss": 3.1665, "step": 3791 }, { "epoch": 4.313993174061434, "grad_norm": 0.9962257742881775, 
"learning_rate": 0.00013833902161547213, "loss": 0.8876, "step": 3792 }, { "epoch": 4.315130830489192, "grad_norm": 0.7965003848075867, "learning_rate": 0.00013811149032992036, "loss": 0.8467, "step": 3793 }, { "epoch": 4.316268486916951, "grad_norm": 1.0827739238739014, "learning_rate": 0.0001378839590443686, "loss": 2.0453, "step": 3794 }, { "epoch": 4.3174061433447095, "grad_norm": 1.0761308670043945, "learning_rate": 0.00013765642775881684, "loss": 2.2351, "step": 3795 }, { "epoch": 4.318543799772469, "grad_norm": 0.7478647232055664, "learning_rate": 0.00013742889647326507, "loss": 1.662, "step": 3796 }, { "epoch": 4.319681456200228, "grad_norm": 1.300010085105896, "learning_rate": 0.0001372013651877133, "loss": 2.644, "step": 3797 }, { "epoch": 4.320819112627986, "grad_norm": 1.0593494176864624, "learning_rate": 0.00013697383390216155, "loss": 2.2775, "step": 3798 }, { "epoch": 4.321956769055745, "grad_norm": 1.0908184051513672, "learning_rate": 0.00013674630261660978, "loss": 1.6699, "step": 3799 }, { "epoch": 4.323094425483504, "grad_norm": 1.0574840307235718, "learning_rate": 0.00013651877133105802, "loss": 2.6921, "step": 3800 }, { "epoch": 4.324232081911263, "grad_norm": 0.6852555871009827, "learning_rate": 0.00013629124004550626, "loss": 1.0971, "step": 3801 }, { "epoch": 4.325369738339021, "grad_norm": 1.1705669164657593, "learning_rate": 0.0001360637087599545, "loss": 2.0903, "step": 3802 }, { "epoch": 4.3265073947667805, "grad_norm": 0.9083330035209656, "learning_rate": 0.00013583617747440273, "loss": 2.8104, "step": 3803 }, { "epoch": 4.327645051194539, "grad_norm": 1.045404076576233, "learning_rate": 0.00013560864618885097, "loss": 1.8789, "step": 3804 }, { "epoch": 4.328782707622298, "grad_norm": 1.4782066345214844, "learning_rate": 0.0001353811149032992, "loss": 3.033, "step": 3805 }, { "epoch": 4.329920364050057, "grad_norm": 0.8685378432273865, "learning_rate": 0.00013515358361774744, "loss": 1.8752, "step": 3806 }, { "epoch": 
4.3310580204778155, "grad_norm": 0.6213506460189819, "learning_rate": 0.00013492605233219568, "loss": 1.4381, "step": 3807 }, { "epoch": 4.332195676905575, "grad_norm": 1.271410346031189, "learning_rate": 0.00013469852104664392, "loss": 2.2921, "step": 3808 }, { "epoch": 4.333333333333333, "grad_norm": 1.084229826927185, "learning_rate": 0.00013447098976109215, "loss": 2.1447, "step": 3809 }, { "epoch": 4.334470989761092, "grad_norm": 1.2566276788711548, "learning_rate": 0.0001342434584755404, "loss": 2.3514, "step": 3810 }, { "epoch": 4.335608646188851, "grad_norm": 0.6356386542320251, "learning_rate": 0.00013401592718998863, "loss": 0.815, "step": 3811 }, { "epoch": 4.33674630261661, "grad_norm": 0.9741693139076233, "learning_rate": 0.00013378839590443687, "loss": 2.4285, "step": 3812 }, { "epoch": 4.337883959044369, "grad_norm": 0.9355404376983643, "learning_rate": 0.00013356086461888508, "loss": 2.015, "step": 3813 }, { "epoch": 4.339021615472127, "grad_norm": 0.9928659796714783, "learning_rate": 0.00013333333333333334, "loss": 1.7225, "step": 3814 }, { "epoch": 4.3401592718998865, "grad_norm": 0.7346398830413818, "learning_rate": 0.00013310580204778158, "loss": 0.8244, "step": 3815 }, { "epoch": 4.341296928327645, "grad_norm": 0.8423483967781067, "learning_rate": 0.00013287827076222981, "loss": 1.5708, "step": 3816 }, { "epoch": 4.342434584755404, "grad_norm": 1.3044344186782837, "learning_rate": 0.00013265073947667805, "loss": 2.0567, "step": 3817 }, { "epoch": 4.343572241183162, "grad_norm": 0.7337809801101685, "learning_rate": 0.00013242320819112626, "loss": 0.9073, "step": 3818 }, { "epoch": 4.3447098976109215, "grad_norm": 1.2432464361190796, "learning_rate": 0.00013219567690557452, "loss": 2.3843, "step": 3819 }, { "epoch": 4.345847554038681, "grad_norm": 0.8972126245498657, "learning_rate": 0.00013196814562002276, "loss": 1.5984, "step": 3820 }, { "epoch": 4.346985210466439, "grad_norm": 0.7023994326591492, "learning_rate": 0.000131740614334471, "loss": 
1.7012, "step": 3821 }, { "epoch": 4.348122866894198, "grad_norm": 0.857025146484375, "learning_rate": 0.00013151308304891924, "loss": 1.8499, "step": 3822 }, { "epoch": 4.349260523321957, "grad_norm": 0.8133417963981628, "learning_rate": 0.00013128555176336745, "loss": 1.8394, "step": 3823 }, { "epoch": 4.350398179749716, "grad_norm": 1.043709635734558, "learning_rate": 0.0001310580204778157, "loss": 1.6459, "step": 3824 }, { "epoch": 4.351535836177474, "grad_norm": 0.9391498565673828, "learning_rate": 0.00013083048919226395, "loss": 1.4526, "step": 3825 }, { "epoch": 4.352673492605233, "grad_norm": 1.0450690984725952, "learning_rate": 0.00013060295790671218, "loss": 1.5307, "step": 3826 }, { "epoch": 4.3538111490329925, "grad_norm": 1.0160549879074097, "learning_rate": 0.00013037542662116042, "loss": 2.3228, "step": 3827 }, { "epoch": 4.354948805460751, "grad_norm": 1.861527919769287, "learning_rate": 0.00013014789533560863, "loss": 2.6769, "step": 3828 }, { "epoch": 4.35608646188851, "grad_norm": 0.8127307295799255, "learning_rate": 0.0001299203640500569, "loss": 1.3464, "step": 3829 }, { "epoch": 4.357224118316268, "grad_norm": 0.8189122080802917, "learning_rate": 0.0001296928327645051, "loss": 1.2768, "step": 3830 }, { "epoch": 4.3583617747440275, "grad_norm": 0.8693903088569641, "learning_rate": 0.00012946530147895337, "loss": 2.2173, "step": 3831 }, { "epoch": 4.359499431171786, "grad_norm": 1.17019522190094, "learning_rate": 0.0001292377701934016, "loss": 2.2724, "step": 3832 }, { "epoch": 4.360637087599545, "grad_norm": 0.7961968183517456, "learning_rate": 0.00012901023890784982, "loss": 0.8773, "step": 3833 }, { "epoch": 4.361774744027303, "grad_norm": 0.9709134697914124, "learning_rate": 0.00012878270762229808, "loss": 1.8771, "step": 3834 }, { "epoch": 4.362912400455063, "grad_norm": 1.0111744403839111, "learning_rate": 0.0001285551763367463, "loss": 1.7679, "step": 3835 }, { "epoch": 4.364050056882822, "grad_norm": 1.0463035106658936, "learning_rate": 
0.00012832764505119455, "loss": 1.8787, "step": 3836 }, { "epoch": 4.36518771331058, "grad_norm": 0.9706122875213623, "learning_rate": 0.0001281001137656428, "loss": 1.8892, "step": 3837 }, { "epoch": 4.366325369738339, "grad_norm": 1.487461805343628, "learning_rate": 0.000127872582480091, "loss": 3.0566, "step": 3838 }, { "epoch": 4.367463026166098, "grad_norm": 0.578607976436615, "learning_rate": 0.00012764505119453926, "loss": 1.1034, "step": 3839 }, { "epoch": 4.368600682593857, "grad_norm": 0.8049798607826233, "learning_rate": 0.00012741751990898747, "loss": 1.4093, "step": 3840 }, { "epoch": 4.369738339021615, "grad_norm": 0.9491667747497559, "learning_rate": 0.00012718998862343574, "loss": 1.2235, "step": 3841 }, { "epoch": 4.370875995449374, "grad_norm": 0.6280384063720703, "learning_rate": 0.00012696245733788398, "loss": 1.5624, "step": 3842 }, { "epoch": 4.372013651877133, "grad_norm": 1.2607163190841675, "learning_rate": 0.00012673492605233219, "loss": 2.3174, "step": 3843 }, { "epoch": 4.373151308304892, "grad_norm": 0.9251709580421448, "learning_rate": 0.00012650739476678045, "loss": 1.8042, "step": 3844 }, { "epoch": 4.374288964732651, "grad_norm": 1.0482347011566162, "learning_rate": 0.00012627986348122866, "loss": 1.7866, "step": 3845 }, { "epoch": 4.375426621160409, "grad_norm": 1.0927660465240479, "learning_rate": 0.00012605233219567692, "loss": 2.0026, "step": 3846 }, { "epoch": 4.376564277588169, "grad_norm": 0.9109655618667603, "learning_rate": 0.00012582480091012513, "loss": 1.6597, "step": 3847 }, { "epoch": 4.377701934015927, "grad_norm": 1.1354659795761108, "learning_rate": 0.00012559726962457337, "loss": 2.0034, "step": 3848 }, { "epoch": 4.378839590443686, "grad_norm": 0.624963641166687, "learning_rate": 0.00012536973833902163, "loss": 1.1846, "step": 3849 }, { "epoch": 4.379977246871444, "grad_norm": 0.7862791419029236, "learning_rate": 0.00012514220705346984, "loss": 1.0518, "step": 3850 }, { "epoch": 4.381114903299204, "grad_norm": 
1.0999188423156738, "learning_rate": 0.0001249146757679181, "loss": 3.3598, "step": 3851 }, { "epoch": 4.382252559726963, "grad_norm": 1.0978202819824219, "learning_rate": 0.00012468714448236632, "loss": 1.4192, "step": 3852 }, { "epoch": 4.383390216154721, "grad_norm": 0.8978061079978943, "learning_rate": 0.00012445961319681456, "loss": 1.5036, "step": 3853 }, { "epoch": 4.38452787258248, "grad_norm": 0.7772718071937561, "learning_rate": 0.0001242320819112628, "loss": 1.9559, "step": 3854 }, { "epoch": 4.385665529010239, "grad_norm": 0.6393569111824036, "learning_rate": 0.00012400455062571103, "loss": 1.2368, "step": 3855 }, { "epoch": 4.386803185437998, "grad_norm": 1.3224518299102783, "learning_rate": 0.0001237770193401593, "loss": 2.4815, "step": 3856 }, { "epoch": 4.387940841865756, "grad_norm": 0.8100547194480896, "learning_rate": 0.0001235494880546075, "loss": 2.0509, "step": 3857 }, { "epoch": 4.389078498293515, "grad_norm": 1.1932857036590576, "learning_rate": 0.00012332195676905574, "loss": 2.3144, "step": 3858 }, { "epoch": 4.390216154721275, "grad_norm": 1.004632592201233, "learning_rate": 0.00012309442548350398, "loss": 1.9511, "step": 3859 }, { "epoch": 4.391353811149033, "grad_norm": 0.73143070936203, "learning_rate": 0.00012286689419795221, "loss": 1.5956, "step": 3860 }, { "epoch": 4.392491467576792, "grad_norm": 0.8859034180641174, "learning_rate": 0.00012263936291240045, "loss": 1.6946, "step": 3861 }, { "epoch": 4.39362912400455, "grad_norm": 1.6607998609542847, "learning_rate": 0.0001224118316268487, "loss": 3.6334, "step": 3862 }, { "epoch": 4.39476678043231, "grad_norm": 0.5776720643043518, "learning_rate": 0.00012218430034129693, "loss": 0.8281, "step": 3863 }, { "epoch": 4.395904436860068, "grad_norm": 0.8924266695976257, "learning_rate": 0.00012195676905574516, "loss": 1.7758, "step": 3864 }, { "epoch": 4.397042093287827, "grad_norm": 0.5048301815986633, "learning_rate": 0.0001217292377701934, "loss": 0.5899, "step": 3865 }, { "epoch": 
4.398179749715586, "grad_norm": 0.9036781191825867, "learning_rate": 0.00012150170648464164, "loss": 1.7569, "step": 3866 }, { "epoch": 4.399317406143345, "grad_norm": 1.0674494504928589, "learning_rate": 0.00012127417519908989, "loss": 2.3646, "step": 3867 }, { "epoch": 4.400455062571104, "grad_norm": 1.6074708700180054, "learning_rate": 0.00012104664391353812, "loss": 2.0718, "step": 3868 }, { "epoch": 4.401592718998862, "grad_norm": 1.289448618888855, "learning_rate": 0.00012081911262798635, "loss": 2.4961, "step": 3869 }, { "epoch": 4.402730375426621, "grad_norm": 1.5797637701034546, "learning_rate": 0.00012059158134243458, "loss": 2.0542, "step": 3870 }, { "epoch": 4.40386803185438, "grad_norm": 0.8559139370918274, "learning_rate": 0.00012036405005688282, "loss": 1.8854, "step": 3871 }, { "epoch": 4.405005688282139, "grad_norm": 0.749015748500824, "learning_rate": 0.00012013651877133106, "loss": 1.3702, "step": 3872 }, { "epoch": 4.406143344709897, "grad_norm": 0.95188969373703, "learning_rate": 0.00011990898748577931, "loss": 1.6222, "step": 3873 }, { "epoch": 4.407281001137656, "grad_norm": 0.8665216565132141, "learning_rate": 0.00011968145620022753, "loss": 1.9693, "step": 3874 }, { "epoch": 4.408418657565416, "grad_norm": 0.7548373937606812, "learning_rate": 0.00011945392491467577, "loss": 2.2095, "step": 3875 }, { "epoch": 4.409556313993174, "grad_norm": 0.7718438506126404, "learning_rate": 0.000119226393629124, "loss": 1.0631, "step": 3876 }, { "epoch": 4.410693970420933, "grad_norm": 0.7795398235321045, "learning_rate": 0.00011899886234357224, "loss": 1.5037, "step": 3877 }, { "epoch": 4.4118316268486915, "grad_norm": 0.9634506702423096, "learning_rate": 0.00011877133105802048, "loss": 1.2737, "step": 3878 }, { "epoch": 4.412969283276451, "grad_norm": 1.0782710313796997, "learning_rate": 0.00011854379977246872, "loss": 2.0432, "step": 3879 }, { "epoch": 4.414106939704209, "grad_norm": 0.8432207107543945, "learning_rate": 0.00011831626848691695, "loss": 
1.4583, "step": 3880 }, { "epoch": 4.415244596131968, "grad_norm": 1.0028852224349976, "learning_rate": 0.00011808873720136519, "loss": 1.3358, "step": 3881 }, { "epoch": 4.4163822525597265, "grad_norm": 0.9383002519607544, "learning_rate": 0.00011786120591581343, "loss": 1.9518, "step": 3882 }, { "epoch": 4.417519908987486, "grad_norm": 0.7450307607650757, "learning_rate": 0.00011763367463026167, "loss": 0.7314, "step": 3883 }, { "epoch": 4.418657565415245, "grad_norm": 0.8783142566680908, "learning_rate": 0.0001174061433447099, "loss": 1.8254, "step": 3884 }, { "epoch": 4.419795221843003, "grad_norm": 0.7310676574707031, "learning_rate": 0.00011717861205915814, "loss": 1.7363, "step": 3885 }, { "epoch": 4.420932878270762, "grad_norm": 0.4681646525859833, "learning_rate": 0.00011695108077360638, "loss": 1.0317, "step": 3886 }, { "epoch": 4.422070534698521, "grad_norm": 0.9998228549957275, "learning_rate": 0.00011672354948805461, "loss": 1.5986, "step": 3887 }, { "epoch": 4.42320819112628, "grad_norm": 0.6129392981529236, "learning_rate": 0.00011649601820250284, "loss": 0.952, "step": 3888 }, { "epoch": 4.424345847554038, "grad_norm": 0.8074744343757629, "learning_rate": 0.00011626848691695107, "loss": 1.1051, "step": 3889 }, { "epoch": 4.4254835039817975, "grad_norm": 0.9240376949310303, "learning_rate": 0.00011604095563139932, "loss": 1.1525, "step": 3890 }, { "epoch": 4.426621160409557, "grad_norm": 0.9832790493965149, "learning_rate": 0.00011581342434584756, "loss": 1.538, "step": 3891 }, { "epoch": 4.427758816837315, "grad_norm": 1.5793112516403198, "learning_rate": 0.0001155858930602958, "loss": 2.8963, "step": 3892 }, { "epoch": 4.428896473265074, "grad_norm": 0.6335533261299133, "learning_rate": 0.00011535836177474402, "loss": 0.7007, "step": 3893 }, { "epoch": 4.4300341296928325, "grad_norm": 0.8912391066551208, "learning_rate": 0.00011513083048919226, "loss": 1.6569, "step": 3894 }, { "epoch": 4.431171786120592, "grad_norm": 1.0746057033538818, 
"learning_rate": 0.0001149032992036405, "loss": 1.9166, "step": 3895 }, { "epoch": 4.43230944254835, "grad_norm": 0.8969407677650452, "learning_rate": 0.00011467576791808875, "loss": 1.823, "step": 3896 }, { "epoch": 4.433447098976109, "grad_norm": 1.103092074394226, "learning_rate": 0.00011444823663253698, "loss": 2.2962, "step": 3897 }, { "epoch": 4.434584755403868, "grad_norm": 0.9207330942153931, "learning_rate": 0.0001142207053469852, "loss": 1.9893, "step": 3898 }, { "epoch": 4.435722411831627, "grad_norm": 0.7149428725242615, "learning_rate": 0.00011399317406143344, "loss": 1.9576, "step": 3899 }, { "epoch": 4.436860068259386, "grad_norm": 1.1259676218032837, "learning_rate": 0.00011376564277588168, "loss": 1.7732, "step": 3900 }, { "epoch": 4.437997724687144, "grad_norm": 1.0312747955322266, "learning_rate": 0.00011353811149032993, "loss": 2.2845, "step": 3901 }, { "epoch": 4.4391353811149035, "grad_norm": 1.1275362968444824, "learning_rate": 0.00011331058020477817, "loss": 2.7354, "step": 3902 }, { "epoch": 4.440273037542662, "grad_norm": 0.8789055943489075, "learning_rate": 0.00011308304891922639, "loss": 2.0587, "step": 3903 }, { "epoch": 4.441410693970421, "grad_norm": 2.0354743003845215, "learning_rate": 0.00011285551763367463, "loss": 2.5204, "step": 3904 }, { "epoch": 4.44254835039818, "grad_norm": 1.1655513048171997, "learning_rate": 0.00011262798634812287, "loss": 1.7625, "step": 3905 }, { "epoch": 4.4436860068259385, "grad_norm": 0.7838327288627625, "learning_rate": 0.0001124004550625711, "loss": 1.7538, "step": 3906 }, { "epoch": 4.444823663253698, "grad_norm": 1.042612075805664, "learning_rate": 0.00011217292377701935, "loss": 1.9573, "step": 3907 }, { "epoch": 4.445961319681456, "grad_norm": 0.8811992406845093, "learning_rate": 0.00011194539249146758, "loss": 1.237, "step": 3908 }, { "epoch": 4.447098976109215, "grad_norm": 0.8946145176887512, "learning_rate": 0.00011171786120591581, "loss": 1.7659, "step": 3909 }, { "epoch": 
4.4482366325369735, "grad_norm": 0.6377555727958679, "learning_rate": 0.00011149032992036405, "loss": 1.1502, "step": 3910 }, { "epoch": 4.449374288964733, "grad_norm": 1.0547488927841187, "learning_rate": 0.00011126279863481229, "loss": 1.3714, "step": 3911 }, { "epoch": 4.450511945392491, "grad_norm": 0.7710375785827637, "learning_rate": 0.00011103526734926054, "loss": 1.2559, "step": 3912 }, { "epoch": 4.45164960182025, "grad_norm": 0.5346420407295227, "learning_rate": 0.00011080773606370876, "loss": 0.5416, "step": 3913 }, { "epoch": 4.4527872582480095, "grad_norm": 0.8821654915809631, "learning_rate": 0.000110580204778157, "loss": 1.1061, "step": 3914 }, { "epoch": 4.453924914675768, "grad_norm": 2.041219472885132, "learning_rate": 0.00011035267349260524, "loss": 3.7444, "step": 3915 }, { "epoch": 4.455062571103527, "grad_norm": 0.9593986868858337, "learning_rate": 0.00011012514220705347, "loss": 1.6756, "step": 3916 }, { "epoch": 4.456200227531285, "grad_norm": 0.8779603838920593, "learning_rate": 0.00010989761092150171, "loss": 2.0283, "step": 3917 }, { "epoch": 4.4573378839590445, "grad_norm": 1.146952748298645, "learning_rate": 0.00010967007963594995, "loss": 2.2362, "step": 3918 }, { "epoch": 4.458475540386803, "grad_norm": 0.8655639290809631, "learning_rate": 0.00010944254835039818, "loss": 1.9992, "step": 3919 }, { "epoch": 4.459613196814562, "grad_norm": 1.1940038204193115, "learning_rate": 0.00010921501706484642, "loss": 1.7449, "step": 3920 }, { "epoch": 4.460750853242321, "grad_norm": 0.7749353647232056, "learning_rate": 0.00010898748577929466, "loss": 1.1885, "step": 3921 }, { "epoch": 4.4618885096700796, "grad_norm": 0.8721646666526794, "learning_rate": 0.0001087599544937429, "loss": 1.8104, "step": 3922 }, { "epoch": 4.463026166097839, "grad_norm": 0.6543747782707214, "learning_rate": 0.00010853242320819112, "loss": 1.2385, "step": 3923 }, { "epoch": 4.464163822525597, "grad_norm": 0.9697179198265076, "learning_rate": 0.00010830489192263937, 
"loss": 1.6976, "step": 3924 }, { "epoch": 4.465301478953356, "grad_norm": 0.9224042892456055, "learning_rate": 0.0001080773606370876, "loss": 1.214, "step": 3925 }, { "epoch": 4.466439135381115, "grad_norm": 0.9176324605941772, "learning_rate": 0.00010784982935153584, "loss": 1.2962, "step": 3926 }, { "epoch": 4.467576791808874, "grad_norm": 0.9372009038925171, "learning_rate": 0.00010762229806598408, "loss": 1.9831, "step": 3927 }, { "epoch": 4.468714448236632, "grad_norm": 1.1387556791305542, "learning_rate": 0.0001073947667804323, "loss": 1.9173, "step": 3928 }, { "epoch": 4.469852104664391, "grad_norm": 1.1000502109527588, "learning_rate": 0.00010716723549488055, "loss": 1.9951, "step": 3929 }, { "epoch": 4.4709897610921505, "grad_norm": 0.8819757699966431, "learning_rate": 0.00010693970420932879, "loss": 1.3953, "step": 3930 }, { "epoch": 4.472127417519909, "grad_norm": 0.9281446933746338, "learning_rate": 0.00010671217292377703, "loss": 1.3608, "step": 3931 }, { "epoch": 4.473265073947668, "grad_norm": 1.0403672456741333, "learning_rate": 0.00010648464163822526, "loss": 2.4958, "step": 3932 }, { "epoch": 4.474402730375426, "grad_norm": 0.6432840824127197, "learning_rate": 0.00010625711035267349, "loss": 1.0788, "step": 3933 }, { "epoch": 4.4755403868031856, "grad_norm": 0.8006333112716675, "learning_rate": 0.00010602957906712172, "loss": 1.5623, "step": 3934 }, { "epoch": 4.476678043230944, "grad_norm": 0.8013043403625488, "learning_rate": 0.00010580204778156998, "loss": 1.5541, "step": 3935 }, { "epoch": 4.477815699658703, "grad_norm": 1.071979284286499, "learning_rate": 0.00010557451649601821, "loss": 3.0613, "step": 3936 }, { "epoch": 4.478953356086462, "grad_norm": 0.6907356381416321, "learning_rate": 0.00010534698521046644, "loss": 1.8117, "step": 3937 }, { "epoch": 4.480091012514221, "grad_norm": 0.7573530077934265, "learning_rate": 0.00010511945392491467, "loss": 1.4187, "step": 3938 }, { "epoch": 4.48122866894198, "grad_norm": 0.7063055038452148, 
"learning_rate": 0.00010489192263936291, "loss": 0.7842, "step": 3939 }, { "epoch": 4.482366325369738, "grad_norm": 0.9287145733833313, "learning_rate": 0.00010466439135381115, "loss": 1.7274, "step": 3940 }, { "epoch": 4.483503981797497, "grad_norm": 0.7556503415107727, "learning_rate": 0.0001044368600682594, "loss": 1.4371, "step": 3941 }, { "epoch": 4.484641638225256, "grad_norm": 1.12169349193573, "learning_rate": 0.00010420932878270762, "loss": 1.4631, "step": 3942 }, { "epoch": 4.485779294653015, "grad_norm": 1.066739797592163, "learning_rate": 0.00010398179749715586, "loss": 2.0575, "step": 3943 }, { "epoch": 4.486916951080774, "grad_norm": 0.6979597210884094, "learning_rate": 0.0001037542662116041, "loss": 1.2034, "step": 3944 }, { "epoch": 4.488054607508532, "grad_norm": 0.9167845249176025, "learning_rate": 0.00010352673492605233, "loss": 1.3631, "step": 3945 }, { "epoch": 4.489192263936292, "grad_norm": 1.0211360454559326, "learning_rate": 0.00010329920364050058, "loss": 2.8476, "step": 3946 }, { "epoch": 4.49032992036405, "grad_norm": 1.0867899656295776, "learning_rate": 0.0001030716723549488, "loss": 2.0269, "step": 3947 }, { "epoch": 4.491467576791809, "grad_norm": 0.963070809841156, "learning_rate": 0.00010284414106939704, "loss": 1.9714, "step": 3948 }, { "epoch": 4.492605233219567, "grad_norm": 1.0459845066070557, "learning_rate": 0.00010261660978384528, "loss": 1.8135, "step": 3949 }, { "epoch": 4.493742889647327, "grad_norm": 1.2111842632293701, "learning_rate": 0.00010238907849829352, "loss": 2.337, "step": 3950 }, { "epoch": 4.494880546075085, "grad_norm": 0.9683245420455933, "learning_rate": 0.00010216154721274175, "loss": 1.2889, "step": 3951 }, { "epoch": 4.496018202502844, "grad_norm": 0.9261424541473389, "learning_rate": 0.00010193401592718999, "loss": 1.5083, "step": 3952 }, { "epoch": 4.497155858930603, "grad_norm": 0.6647104620933533, "learning_rate": 0.00010170648464163823, "loss": 1.3438, "step": 3953 }, { "epoch": 4.498293515358362, 
"grad_norm": 0.7070716023445129, "learning_rate": 0.00010147895335608646, "loss": 1.2005, "step": 3954 }, { "epoch": 4.499431171786121, "grad_norm": 0.6603410840034485, "learning_rate": 0.0001012514220705347, "loss": 0.869, "step": 3955 }, { "epoch": 4.500568828213879, "grad_norm": 1.0688494443893433, "learning_rate": 0.00010102389078498294, "loss": 1.8926, "step": 3956 }, { "epoch": 4.501706484641638, "grad_norm": 0.7224915027618408, "learning_rate": 0.00010079635949943116, "loss": 1.9618, "step": 3957 }, { "epoch": 4.502844141069397, "grad_norm": 1.0792325735092163, "learning_rate": 0.00010056882821387941, "loss": 1.9267, "step": 3958 }, { "epoch": 4.503981797497156, "grad_norm": 0.7390668392181396, "learning_rate": 0.00010034129692832765, "loss": 1.4238, "step": 3959 }, { "epoch": 4.505119453924914, "grad_norm": 1.3947380781173706, "learning_rate": 0.00010011376564277589, "loss": 2.3192, "step": 3960 }, { "epoch": 4.506257110352673, "grad_norm": 0.9918806552886963, "learning_rate": 9.988623435722412e-05, "loss": 1.6336, "step": 3961 }, { "epoch": 4.507394766780433, "grad_norm": 1.1548150777816772, "learning_rate": 9.965870307167235e-05, "loss": 1.2732, "step": 3962 }, { "epoch": 4.508532423208191, "grad_norm": 1.110249400138855, "learning_rate": 9.94311717861206e-05, "loss": 1.8542, "step": 3963 }, { "epoch": 4.50967007963595, "grad_norm": 0.9127480387687683, "learning_rate": 9.920364050056883e-05, "loss": 1.477, "step": 3964 }, { "epoch": 4.510807736063708, "grad_norm": 1.161213755607605, "learning_rate": 9.897610921501707e-05, "loss": 1.8094, "step": 3965 }, { "epoch": 4.511945392491468, "grad_norm": 1.106597661972046, "learning_rate": 9.874857792946531e-05, "loss": 1.9764, "step": 3966 }, { "epoch": 4.513083048919226, "grad_norm": 0.7442317008972168, "learning_rate": 9.852104664391353e-05, "loss": 1.4526, "step": 3967 }, { "epoch": 4.514220705346985, "grad_norm": 0.7829383015632629, "learning_rate": 9.829351535836177e-05, "loss": 2.0345, "step": 3968 }, { 
"epoch": 4.515358361774744, "grad_norm": 1.1125015020370483, "learning_rate": 9.806598407281002e-05, "loss": 2.6904, "step": 3969 }, { "epoch": 4.516496018202503, "grad_norm": 0.7292245626449585, "learning_rate": 9.783845278725826e-05, "loss": 1.323, "step": 3970 }, { "epoch": 4.517633674630262, "grad_norm": 1.535749912261963, "learning_rate": 9.761092150170649e-05, "loss": 3.5423, "step": 3971 }, { "epoch": 4.51877133105802, "grad_norm": 0.9439961314201355, "learning_rate": 9.738339021615472e-05, "loss": 3.0287, "step": 3972 }, { "epoch": 4.519908987485779, "grad_norm": 0.8544148206710815, "learning_rate": 9.715585893060295e-05, "loss": 1.7422, "step": 3973 }, { "epoch": 4.521046643913538, "grad_norm": 0.7354198694229126, "learning_rate": 9.692832764505119e-05, "loss": 1.4524, "step": 3974 }, { "epoch": 4.522184300341297, "grad_norm": 0.7296625375747681, "learning_rate": 9.670079635949944e-05, "loss": 1.3034, "step": 3975 }, { "epoch": 4.523321956769056, "grad_norm": 1.0089813470840454, "learning_rate": 9.647326507394768e-05, "loss": 1.5669, "step": 3976 }, { "epoch": 4.5244596131968144, "grad_norm": 1.005476474761963, "learning_rate": 9.62457337883959e-05, "loss": 1.4951, "step": 3977 }, { "epoch": 4.525597269624574, "grad_norm": 0.7817225456237793, "learning_rate": 9.601820250284414e-05, "loss": 1.6755, "step": 3978 }, { "epoch": 4.526734926052332, "grad_norm": 0.814610481262207, "learning_rate": 9.579067121729238e-05, "loss": 1.0051, "step": 3979 }, { "epoch": 4.527872582480091, "grad_norm": 0.869544267654419, "learning_rate": 9.556313993174063e-05, "loss": 1.5352, "step": 3980 }, { "epoch": 4.5290102389078495, "grad_norm": 0.805802047252655, "learning_rate": 9.533560864618886e-05, "loss": 1.9839, "step": 3981 }, { "epoch": 4.530147895335609, "grad_norm": 0.7213151454925537, "learning_rate": 9.510807736063709e-05, "loss": 2.1419, "step": 3982 }, { "epoch": 4.531285551763368, "grad_norm": 0.743634045124054, "learning_rate": 9.488054607508532e-05, "loss": 1.2667, 
"step": 3983 }, { "epoch": 4.532423208191126, "grad_norm": 0.8204901218414307, "learning_rate": 9.465301478953356e-05, "loss": 1.5102, "step": 3984 }, { "epoch": 4.533560864618885, "grad_norm": 0.902324378490448, "learning_rate": 9.44254835039818e-05, "loss": 1.2913, "step": 3985 }, { "epoch": 4.534698521046644, "grad_norm": 1.2690314054489136, "learning_rate": 9.419795221843005e-05, "loss": 3.7403, "step": 3986 }, { "epoch": 4.535836177474403, "grad_norm": 0.6763675212860107, "learning_rate": 9.397042093287827e-05, "loss": 0.8402, "step": 3987 }, { "epoch": 4.536973833902161, "grad_norm": 0.6812355518341064, "learning_rate": 9.374288964732651e-05, "loss": 0.8881, "step": 3988 }, { "epoch": 4.5381114903299204, "grad_norm": 0.7160151600837708, "learning_rate": 9.351535836177475e-05, "loss": 1.3187, "step": 3989 }, { "epoch": 4.53924914675768, "grad_norm": 0.6161930561065674, "learning_rate": 9.328782707622298e-05, "loss": 1.0509, "step": 3990 }, { "epoch": 4.540386803185438, "grad_norm": 0.9480435848236084, "learning_rate": 9.306029579067122e-05, "loss": 1.898, "step": 3991 }, { "epoch": 4.541524459613197, "grad_norm": 1.0816932916641235, "learning_rate": 9.283276450511946e-05, "loss": 2.1839, "step": 3992 }, { "epoch": 4.5426621160409555, "grad_norm": 0.9521051049232483, "learning_rate": 9.26052332195677e-05, "loss": 1.5058, "step": 3993 }, { "epoch": 4.543799772468715, "grad_norm": 0.5779545903205872, "learning_rate": 9.237770193401593e-05, "loss": 1.2215, "step": 3994 }, { "epoch": 4.544937428896473, "grad_norm": 0.7004631161689758, "learning_rate": 9.215017064846417e-05, "loss": 0.8524, "step": 3995 }, { "epoch": 4.546075085324232, "grad_norm": 1.5310842990875244, "learning_rate": 9.192263936291239e-05, "loss": 2.2935, "step": 3996 }, { "epoch": 4.5472127417519905, "grad_norm": 1.0182299613952637, "learning_rate": 9.169510807736064e-05, "loss": 1.4897, "step": 3997 }, { "epoch": 4.54835039817975, "grad_norm": 1.2902427911758423, "learning_rate": 
9.146757679180888e-05, "loss": 2.0195, "step": 3998 }, { "epoch": 4.549488054607508, "grad_norm": 0.9700065851211548, "learning_rate": 9.124004550625712e-05, "loss": 1.4082, "step": 3999 }, { "epoch": 4.550625711035267, "grad_norm": 1.0845290422439575, "learning_rate": 9.101251422070535e-05, "loss": 2.3722, "step": 4000 }, { "epoch": 4.5517633674630265, "grad_norm": 1.14189612865448, "learning_rate": 9.078498293515358e-05, "loss": 2.0645, "step": 4001 }, { "epoch": 4.552901023890785, "grad_norm": 1.1628812551498413, "learning_rate": 9.055745164960181e-05, "loss": 2.7279, "step": 4002 }, { "epoch": 4.554038680318544, "grad_norm": 0.8541833758354187, "learning_rate": 9.032992036405006e-05, "loss": 1.8064, "step": 4003 }, { "epoch": 4.555176336746302, "grad_norm": 0.7190375924110413, "learning_rate": 9.01023890784983e-05, "loss": 1.7207, "step": 4004 }, { "epoch": 4.5563139931740615, "grad_norm": 0.6545979976654053, "learning_rate": 8.987485779294654e-05, "loss": 1.4735, "step": 4005 }, { "epoch": 4.55745164960182, "grad_norm": 1.12898588180542, "learning_rate": 8.964732650739476e-05, "loss": 1.8204, "step": 4006 }, { "epoch": 4.558589306029579, "grad_norm": 1.265575885772705, "learning_rate": 8.9419795221843e-05, "loss": 2.1016, "step": 4007 }, { "epoch": 4.559726962457338, "grad_norm": 1.3039684295654297, "learning_rate": 8.919226393629125e-05, "loss": 2.1831, "step": 4008 }, { "epoch": 4.5608646188850965, "grad_norm": 0.6052323579788208, "learning_rate": 8.896473265073949e-05, "loss": 1.5035, "step": 4009 }, { "epoch": 4.562002275312856, "grad_norm": 1.2771997451782227, "learning_rate": 8.873720136518772e-05, "loss": 2.3026, "step": 4010 }, { "epoch": 4.563139931740614, "grad_norm": 1.577376365661621, "learning_rate": 8.850967007963595e-05, "loss": 2.7743, "step": 4011 }, { "epoch": 4.564277588168373, "grad_norm": 0.78837651014328, "learning_rate": 8.828213879408418e-05, "loss": 1.4329, "step": 4012 }, { "epoch": 4.565415244596132, "grad_norm": 1.392586350440979, 
"learning_rate": 8.805460750853242e-05, "loss": 1.9271, "step": 4013 }, { "epoch": 4.566552901023891, "grad_norm": 0.6456694602966309, "learning_rate": 8.782707622298067e-05, "loss": 1.1636, "step": 4014 }, { "epoch": 4.56769055745165, "grad_norm": 0.8244945406913757, "learning_rate": 8.759954493742891e-05, "loss": 1.3006, "step": 4015 }, { "epoch": 4.568828213879408, "grad_norm": 1.0723069906234741, "learning_rate": 8.737201365187713e-05, "loss": 1.835, "step": 4016 }, { "epoch": 4.5699658703071675, "grad_norm": 1.0027074813842773, "learning_rate": 8.714448236632537e-05, "loss": 1.806, "step": 4017 }, { "epoch": 4.571103526734926, "grad_norm": 1.1087925434112549, "learning_rate": 8.69169510807736e-05, "loss": 1.7219, "step": 4018 }, { "epoch": 4.572241183162685, "grad_norm": 0.5315782427787781, "learning_rate": 8.668941979522184e-05, "loss": 0.856, "step": 4019 }, { "epoch": 4.573378839590443, "grad_norm": 0.8060416579246521, "learning_rate": 8.646188850967009e-05, "loss": 1.3456, "step": 4020 }, { "epoch": 4.5745164960182025, "grad_norm": 0.8221978545188904, "learning_rate": 8.623435722411832e-05, "loss": 1.7365, "step": 4021 }, { "epoch": 4.575654152445962, "grad_norm": 0.8591867685317993, "learning_rate": 8.600682593856655e-05, "loss": 1.5206, "step": 4022 }, { "epoch": 4.57679180887372, "grad_norm": 0.694990336894989, "learning_rate": 8.577929465301479e-05, "loss": 0.6024, "step": 4023 }, { "epoch": 4.577929465301479, "grad_norm": 1.0141761302947998, "learning_rate": 8.555176336746303e-05, "loss": 1.8285, "step": 4024 }, { "epoch": 4.579067121729238, "grad_norm": 0.9870526194572449, "learning_rate": 8.532423208191128e-05, "loss": 2.191, "step": 4025 }, { "epoch": 4.580204778156997, "grad_norm": 1.3243286609649658, "learning_rate": 8.50967007963595e-05, "loss": 2.1655, "step": 4026 }, { "epoch": 4.581342434584755, "grad_norm": 0.9310430884361267, "learning_rate": 8.486916951080774e-05, "loss": 1.5505, "step": 4027 }, { "epoch": 4.582480091012514, "grad_norm": 
0.8326849937438965, "learning_rate": 8.464163822525597e-05, "loss": 1.2742, "step": 4028 }, { "epoch": 4.5836177474402735, "grad_norm": 1.1547021865844727, "learning_rate": 8.441410693970421e-05, "loss": 1.895, "step": 4029 }, { "epoch": 4.584755403868032, "grad_norm": 4.82136869430542, "learning_rate": 8.418657565415245e-05, "loss": 1.7264, "step": 4030 }, { "epoch": 4.585893060295791, "grad_norm": 2.2376370429992676, "learning_rate": 8.395904436860069e-05, "loss": 3.246, "step": 4031 }, { "epoch": 4.587030716723549, "grad_norm": 1.1774529218673706, "learning_rate": 8.373151308304892e-05, "loss": 2.3837, "step": 4032 }, { "epoch": 4.5881683731513085, "grad_norm": 0.8420902490615845, "learning_rate": 8.350398179749716e-05, "loss": 1.535, "step": 4033 }, { "epoch": 4.589306029579067, "grad_norm": 1.1322485208511353, "learning_rate": 8.32764505119454e-05, "loss": 3.0796, "step": 4034 }, { "epoch": 4.590443686006826, "grad_norm": 0.7916383743286133, "learning_rate": 8.304891922639362e-05, "loss": 1.7278, "step": 4035 }, { "epoch": 4.591581342434584, "grad_norm": 0.9946612119674683, "learning_rate": 8.282138794084186e-05, "loss": 1.3464, "step": 4036 }, { "epoch": 4.592718998862344, "grad_norm": 0.9056809544563293, "learning_rate": 8.259385665529011e-05, "loss": 1.2729, "step": 4037 }, { "epoch": 4.593856655290102, "grad_norm": 1.1132405996322632, "learning_rate": 8.236632536973834e-05, "loss": 1.6577, "step": 4038 }, { "epoch": 4.594994311717861, "grad_norm": 1.0267456769943237, "learning_rate": 8.213879408418658e-05, "loss": 2.0504, "step": 4039 }, { "epoch": 4.59613196814562, "grad_norm": 0.6499386429786682, "learning_rate": 8.19112627986348e-05, "loss": 0.6439, "step": 4040 }, { "epoch": 4.597269624573379, "grad_norm": 0.9048442244529724, "learning_rate": 8.168373151308304e-05, "loss": 1.5015, "step": 4041 }, { "epoch": 4.598407281001138, "grad_norm": 1.0397703647613525, "learning_rate": 8.145620022753129e-05, "loss": 1.5219, "step": 4042 }, { "epoch": 
4.599544937428896, "grad_norm": 0.7324240803718567, "learning_rate": 8.122866894197953e-05, "loss": 1.4209, "step": 4043 }, { "epoch": 4.600682593856655, "grad_norm": 0.9085008502006531, "learning_rate": 8.100113765642777e-05, "loss": 1.7762, "step": 4044 }, { "epoch": 4.601820250284414, "grad_norm": 1.1294033527374268, "learning_rate": 8.077360637087599e-05, "loss": 2.8995, "step": 4045 }, { "epoch": 4.602957906712173, "grad_norm": 0.8178098201751709, "learning_rate": 8.054607508532423e-05, "loss": 2.0034, "step": 4046 }, { "epoch": 4.604095563139932, "grad_norm": 1.7326245307922363, "learning_rate": 8.031854379977246e-05, "loss": 2.8204, "step": 4047 }, { "epoch": 4.60523321956769, "grad_norm": 1.0977067947387695, "learning_rate": 8.009101251422071e-05, "loss": 1.5176, "step": 4048 }, { "epoch": 4.60637087599545, "grad_norm": 0.7887221574783325, "learning_rate": 7.986348122866895e-05, "loss": 1.7636, "step": 4049 }, { "epoch": 4.607508532423208, "grad_norm": 1.0273473262786865, "learning_rate": 7.963594994311717e-05, "loss": 1.5681, "step": 4050 }, { "epoch": 4.608646188850967, "grad_norm": 0.9911322593688965, "learning_rate": 7.940841865756541e-05, "loss": 1.8449, "step": 4051 }, { "epoch": 4.609783845278725, "grad_norm": 0.6486635804176331, "learning_rate": 7.918088737201365e-05, "loss": 1.4309, "step": 4052 }, { "epoch": 4.610921501706485, "grad_norm": 1.0114645957946777, "learning_rate": 7.895335608646189e-05, "loss": 2.1855, "step": 4053 }, { "epoch": 4.612059158134244, "grad_norm": 1.1461702585220337, "learning_rate": 7.872582480091014e-05, "loss": 2.1323, "step": 4054 }, { "epoch": 4.613196814562002, "grad_norm": 0.8668122291564941, "learning_rate": 7.849829351535836e-05, "loss": 1.418, "step": 4055 }, { "epoch": 4.614334470989761, "grad_norm": 0.5558659434318542, "learning_rate": 7.82707622298066e-05, "loss": 0.7871, "step": 4056 }, { "epoch": 4.61547212741752, "grad_norm": 1.008002758026123, "learning_rate": 7.804323094425483e-05, "loss": 1.9257, "step": 
4057 }, { "epoch": 4.616609783845279, "grad_norm": 1.01564621925354, "learning_rate": 7.781569965870307e-05, "loss": 2.2797, "step": 4058 }, { "epoch": 4.617747440273037, "grad_norm": 0.8626942038536072, "learning_rate": 7.758816837315132e-05, "loss": 1.7334, "step": 4059 }, { "epoch": 4.618885096700796, "grad_norm": 0.9426528811454773, "learning_rate": 7.736063708759954e-05, "loss": 2.4934, "step": 4060 }, { "epoch": 4.620022753128556, "grad_norm": 0.8705387711524963, "learning_rate": 7.713310580204778e-05, "loss": 1.5291, "step": 4061 }, { "epoch": 4.621160409556314, "grad_norm": 0.6598314046859741, "learning_rate": 7.690557451649602e-05, "loss": 1.3372, "step": 4062 }, { "epoch": 4.622298065984073, "grad_norm": 1.0638949871063232, "learning_rate": 7.667804323094426e-05, "loss": 2.271, "step": 4063 }, { "epoch": 4.623435722411831, "grad_norm": 1.2114907503128052, "learning_rate": 7.645051194539249e-05, "loss": 1.9116, "step": 4064 }, { "epoch": 4.624573378839591, "grad_norm": 0.896415650844574, "learning_rate": 7.622298065984073e-05, "loss": 1.5814, "step": 4065 }, { "epoch": 4.625711035267349, "grad_norm": 0.9475829005241394, "learning_rate": 7.599544937428897e-05, "loss": 1.3713, "step": 4066 }, { "epoch": 4.626848691695108, "grad_norm": 1.0576077699661255, "learning_rate": 7.57679180887372e-05, "loss": 1.3842, "step": 4067 }, { "epoch": 4.627986348122867, "grad_norm": 1.035212755203247, "learning_rate": 7.554038680318544e-05, "loss": 1.5382, "step": 4068 }, { "epoch": 4.629124004550626, "grad_norm": 1.0625888109207153, "learning_rate": 7.531285551763368e-05, "loss": 1.9357, "step": 4069 }, { "epoch": 4.630261660978385, "grad_norm": 0.7255961894989014, "learning_rate": 7.508532423208191e-05, "loss": 1.8392, "step": 4070 }, { "epoch": 4.631399317406143, "grad_norm": 0.6510151624679565, "learning_rate": 7.485779294653015e-05, "loss": 1.6608, "step": 4071 }, { "epoch": 4.632536973833902, "grad_norm": 0.7432663440704346, "learning_rate": 7.463026166097839e-05, 
"loss": 1.4151, "step": 4072 }, { "epoch": 4.633674630261661, "grad_norm": 1.0191010236740112, "learning_rate": 7.440273037542663e-05, "loss": 2.6943, "step": 4073 }, { "epoch": 4.63481228668942, "grad_norm": 0.7630909085273743, "learning_rate": 7.417519908987486e-05, "loss": 1.6592, "step": 4074 }, { "epoch": 4.635949943117178, "grad_norm": 0.8889945149421692, "learning_rate": 7.394766780432309e-05, "loss": 1.8455, "step": 4075 }, { "epoch": 4.637087599544937, "grad_norm": 1.1400561332702637, "learning_rate": 7.372013651877134e-05, "loss": 2.2122, "step": 4076 }, { "epoch": 4.638225255972696, "grad_norm": 0.7421183586120605, "learning_rate": 7.349260523321957e-05, "loss": 1.1382, "step": 4077 }, { "epoch": 4.639362912400455, "grad_norm": 0.8412690162658691, "learning_rate": 7.326507394766781e-05, "loss": 1.8067, "step": 4078 }, { "epoch": 4.640500568828214, "grad_norm": 0.6977559924125671, "learning_rate": 7.303754266211605e-05, "loss": 0.9484, "step": 4079 }, { "epoch": 4.6416382252559725, "grad_norm": 1.04801607131958, "learning_rate": 7.281001137656427e-05, "loss": 3.1128, "step": 4080 }, { "epoch": 4.642775881683732, "grad_norm": 0.740193247795105, "learning_rate": 7.258248009101251e-05, "loss": 1.061, "step": 4081 }, { "epoch": 4.64391353811149, "grad_norm": 0.5949002504348755, "learning_rate": 7.235494880546076e-05, "loss": 0.9576, "step": 4082 }, { "epoch": 4.645051194539249, "grad_norm": 0.8039756417274475, "learning_rate": 7.2127417519909e-05, "loss": 1.3286, "step": 4083 }, { "epoch": 4.6461888509670075, "grad_norm": 1.1032826900482178, "learning_rate": 7.189988623435722e-05, "loss": 2.7052, "step": 4084 }, { "epoch": 4.647326507394767, "grad_norm": 1.0323725938796997, "learning_rate": 7.167235494880546e-05, "loss": 1.7848, "step": 4085 }, { "epoch": 4.648464163822526, "grad_norm": 0.5838832259178162, "learning_rate": 7.144482366325369e-05, "loss": 1.4307, "step": 4086 }, { "epoch": 4.649601820250284, "grad_norm": 1.3185147047042847, "learning_rate": 
7.121729237770194e-05, "loss": 1.7205, "step": 4087 }, { "epoch": 4.650739476678043, "grad_norm": 0.5053433179855347, "learning_rate": 7.098976109215018e-05, "loss": 0.4623, "step": 4088 }, { "epoch": 4.651877133105802, "grad_norm": 0.9480729699134827, "learning_rate": 7.07622298065984e-05, "loss": 1.9962, "step": 4089 }, { "epoch": 4.653014789533561, "grad_norm": 0.9530803561210632, "learning_rate": 7.053469852104664e-05, "loss": 2.4195, "step": 4090 }, { "epoch": 4.654152445961319, "grad_norm": 0.7480978965759277, "learning_rate": 7.030716723549488e-05, "loss": 0.7856, "step": 4091 }, { "epoch": 4.6552901023890785, "grad_norm": 0.8531357049942017, "learning_rate": 7.007963594994311e-05, "loss": 1.9666, "step": 4092 }, { "epoch": 4.656427758816838, "grad_norm": 0.8261699080467224, "learning_rate": 6.985210466439137e-05, "loss": 1.2341, "step": 4093 }, { "epoch": 4.657565415244596, "grad_norm": 1.0157686471939087, "learning_rate": 6.962457337883959e-05, "loss": 1.1924, "step": 4094 }, { "epoch": 4.658703071672355, "grad_norm": 0.8260225057601929, "learning_rate": 6.939704209328783e-05, "loss": 1.092, "step": 4095 }, { "epoch": 4.6598407281001135, "grad_norm": 0.836290717124939, "learning_rate": 6.916951080773606e-05, "loss": 0.9278, "step": 4096 }, { "epoch": 4.660978384527873, "grad_norm": 1.124277949333191, "learning_rate": 6.89419795221843e-05, "loss": 1.9577, "step": 4097 }, { "epoch": 4.662116040955631, "grad_norm": 1.066027045249939, "learning_rate": 6.871444823663254e-05, "loss": 2.6585, "step": 4098 }, { "epoch": 4.66325369738339, "grad_norm": 0.5382254123687744, "learning_rate": 6.848691695108077e-05, "loss": 1.025, "step": 4099 }, { "epoch": 4.664391353811149, "grad_norm": 0.6736378073692322, "learning_rate": 6.825938566552901e-05, "loss": 1.2211, "step": 4100 }, { "epoch": 4.665529010238908, "grad_norm": 1.0047065019607544, "learning_rate": 6.803185437997725e-05, "loss": 1.528, "step": 4101 }, { "epoch": 4.666666666666667, "grad_norm": 1.521615743637085, 
"learning_rate": 6.780432309442548e-05, "loss": 3.3803, "step": 4102 }, { "epoch": 4.667804323094425, "grad_norm": 0.9080568552017212, "learning_rate": 6.757679180887372e-05, "loss": 1.778, "step": 4103 }, { "epoch": 4.6689419795221845, "grad_norm": 0.7825086116790771, "learning_rate": 6.734926052332196e-05, "loss": 1.6855, "step": 4104 }, { "epoch": 4.670079635949943, "grad_norm": 0.905820906162262, "learning_rate": 6.71217292377702e-05, "loss": 2.29, "step": 4105 }, { "epoch": 4.671217292377702, "grad_norm": 0.7313637137413025, "learning_rate": 6.689419795221843e-05, "loss": 1.6036, "step": 4106 }, { "epoch": 4.672354948805461, "grad_norm": 0.683791995048523, "learning_rate": 6.666666666666667e-05, "loss": 1.4341, "step": 4107 }, { "epoch": 4.6734926052332195, "grad_norm": 1.0041923522949219, "learning_rate": 6.643913538111491e-05, "loss": 1.4274, "step": 4108 }, { "epoch": 4.674630261660979, "grad_norm": 0.662774920463562, "learning_rate": 6.621160409556313e-05, "loss": 1.4739, "step": 4109 }, { "epoch": 4.675767918088737, "grad_norm": 1.1887277364730835, "learning_rate": 6.598407281001138e-05, "loss": 1.7699, "step": 4110 }, { "epoch": 4.676905574516496, "grad_norm": 1.0310107469558716, "learning_rate": 6.575654152445962e-05, "loss": 1.565, "step": 4111 }, { "epoch": 4.678043230944255, "grad_norm": 0.9942086338996887, "learning_rate": 6.552901023890785e-05, "loss": 1.6227, "step": 4112 }, { "epoch": 4.679180887372014, "grad_norm": 1.4443100690841675, "learning_rate": 6.530147895335609e-05, "loss": 3.6737, "step": 4113 }, { "epoch": 4.680318543799773, "grad_norm": 0.9084301590919495, "learning_rate": 6.507394766780432e-05, "loss": 1.8005, "step": 4114 }, { "epoch": 4.681456200227531, "grad_norm": 1.0044715404510498, "learning_rate": 6.484641638225255e-05, "loss": 2.3198, "step": 4115 }, { "epoch": 4.6825938566552905, "grad_norm": 0.6797364950180054, "learning_rate": 6.46188850967008e-05, "loss": 1.0637, "step": 4116 }, { "epoch": 4.683731513083049, "grad_norm": 
1.2862342596054077, "learning_rate": 6.439135381114904e-05, "loss": 2.6791, "step": 4117 }, { "epoch": 4.684869169510808, "grad_norm": 0.9734237790107727, "learning_rate": 6.416382252559728e-05, "loss": 1.7895, "step": 4118 }, { "epoch": 4.686006825938566, "grad_norm": 1.044211506843567, "learning_rate": 6.39362912400455e-05, "loss": 3.2597, "step": 4119 }, { "epoch": 4.6871444823663255, "grad_norm": 1.082377552986145, "learning_rate": 6.370875995449374e-05, "loss": 1.8831, "step": 4120 }, { "epoch": 4.688282138794084, "grad_norm": 1.1628836393356323, "learning_rate": 6.348122866894199e-05, "loss": 2.3543, "step": 4121 }, { "epoch": 4.689419795221843, "grad_norm": 0.9119383692741394, "learning_rate": 6.325369738339022e-05, "loss": 1.5563, "step": 4122 }, { "epoch": 4.690557451649601, "grad_norm": 1.1674100160598755, "learning_rate": 6.302616609783846e-05, "loss": 1.7306, "step": 4123 }, { "epoch": 4.691695108077361, "grad_norm": 0.7952550053596497, "learning_rate": 6.279863481228669e-05, "loss": 1.5032, "step": 4124 }, { "epoch": 4.69283276450512, "grad_norm": 0.9031153917312622, "learning_rate": 6.257110352673492e-05, "loss": 2.8563, "step": 4125 }, { "epoch": 4.693970420932878, "grad_norm": 1.1440151929855347, "learning_rate": 6.234357224118316e-05, "loss": 1.1994, "step": 4126 }, { "epoch": 4.695108077360637, "grad_norm": 0.8565515279769897, "learning_rate": 6.21160409556314e-05, "loss": 0.904, "step": 4127 }, { "epoch": 4.696245733788396, "grad_norm": 1.0120102167129517, "learning_rate": 6.188850967007965e-05, "loss": 1.6285, "step": 4128 }, { "epoch": 4.697383390216155, "grad_norm": 0.95466148853302, "learning_rate": 6.166097838452787e-05, "loss": 1.4088, "step": 4129 }, { "epoch": 4.698521046643913, "grad_norm": 0.9938770532608032, "learning_rate": 6.143344709897611e-05, "loss": 1.9033, "step": 4130 }, { "epoch": 4.699658703071672, "grad_norm": 0.8543733954429626, "learning_rate": 6.120591581342434e-05, "loss": 2.0826, "step": 4131 }, { "epoch": 
4.7007963594994315, "grad_norm": 1.0578027963638306, "learning_rate": 6.097838452787258e-05, "loss": 1.5184, "step": 4132 }, { "epoch": 4.70193401592719, "grad_norm": 1.0492652654647827, "learning_rate": 6.075085324232082e-05, "loss": 2.2705, "step": 4133 }, { "epoch": 4.703071672354949, "grad_norm": 0.7921819686889648, "learning_rate": 6.052332195676906e-05, "loss": 1.2564, "step": 4134 }, { "epoch": 4.704209328782707, "grad_norm": 0.9057871103286743, "learning_rate": 6.029579067121729e-05, "loss": 1.3608, "step": 4135 }, { "epoch": 4.705346985210467, "grad_norm": 1.0830860137939453, "learning_rate": 6.006825938566553e-05, "loss": 2.1617, "step": 4136 }, { "epoch": 4.706484641638225, "grad_norm": 0.9428957104682922, "learning_rate": 5.9840728100113766e-05, "loss": 2.4494, "step": 4137 }, { "epoch": 4.707622298065984, "grad_norm": 0.7517849206924438, "learning_rate": 5.9613196814562e-05, "loss": 1.2648, "step": 4138 }, { "epoch": 4.708759954493743, "grad_norm": 1.1423892974853516, "learning_rate": 5.938566552901024e-05, "loss": 1.8682, "step": 4139 }, { "epoch": 4.709897610921502, "grad_norm": 0.8202763795852661, "learning_rate": 5.915813424345848e-05, "loss": 2.3286, "step": 4140 }, { "epoch": 4.711035267349261, "grad_norm": 0.7963849306106567, "learning_rate": 5.8930602957906714e-05, "loss": 1.5263, "step": 4141 }, { "epoch": 4.712172923777019, "grad_norm": 0.8499445915222168, "learning_rate": 5.870307167235495e-05, "loss": 1.4715, "step": 4142 }, { "epoch": 4.713310580204778, "grad_norm": 0.9363263249397278, "learning_rate": 5.847554038680319e-05, "loss": 2.0218, "step": 4143 }, { "epoch": 4.714448236632537, "grad_norm": 0.9191851615905762, "learning_rate": 5.824800910125142e-05, "loss": 2.695, "step": 4144 }, { "epoch": 4.715585893060296, "grad_norm": 1.0178710222244263, "learning_rate": 5.802047781569966e-05, "loss": 2.0507, "step": 4145 }, { "epoch": 4.716723549488055, "grad_norm": 0.9625295400619507, "learning_rate": 5.77929465301479e-05, "loss": 1.7354, 
"step": 4146 }, { "epoch": 4.717861205915813, "grad_norm": 1.0893375873565674, "learning_rate": 5.756541524459613e-05, "loss": 2.4767, "step": 4147 }, { "epoch": 4.718998862343573, "grad_norm": 0.6518070101737976, "learning_rate": 5.733788395904437e-05, "loss": 1.2152, "step": 4148 }, { "epoch": 4.720136518771331, "grad_norm": 1.2176522016525269, "learning_rate": 5.71103526734926e-05, "loss": 1.7852, "step": 4149 }, { "epoch": 4.72127417519909, "grad_norm": 1.0597165822982788, "learning_rate": 5.688282138794084e-05, "loss": 2.6517, "step": 4150 }, { "epoch": 4.722411831626848, "grad_norm": 0.9489892721176147, "learning_rate": 5.6655290102389084e-05, "loss": 2.0657, "step": 4151 }, { "epoch": 4.723549488054608, "grad_norm": 0.9729322195053101, "learning_rate": 5.6427758816837314e-05, "loss": 1.7927, "step": 4152 }, { "epoch": 4.724687144482367, "grad_norm": 0.832109808921814, "learning_rate": 5.620022753128555e-05, "loss": 1.1887, "step": 4153 }, { "epoch": 4.725824800910125, "grad_norm": 1.0068272352218628, "learning_rate": 5.597269624573379e-05, "loss": 1.3375, "step": 4154 }, { "epoch": 4.726962457337884, "grad_norm": 0.7367716431617737, "learning_rate": 5.5745164960182025e-05, "loss": 1.3822, "step": 4155 }, { "epoch": 4.728100113765643, "grad_norm": 1.1299673318862915, "learning_rate": 5.551763367463027e-05, "loss": 2.0459, "step": 4156 }, { "epoch": 4.729237770193402, "grad_norm": 0.8980764150619507, "learning_rate": 5.52901023890785e-05, "loss": 1.6, "step": 4157 }, { "epoch": 4.73037542662116, "grad_norm": 1.1862162351608276, "learning_rate": 5.5062571103526736e-05, "loss": 2.6499, "step": 4158 }, { "epoch": 4.731513083048919, "grad_norm": 0.6605071425437927, "learning_rate": 5.483503981797497e-05, "loss": 0.782, "step": 4159 }, { "epoch": 4.732650739476678, "grad_norm": 1.0915684700012207, "learning_rate": 5.460750853242321e-05, "loss": 2.1564, "step": 4160 }, { "epoch": 4.733788395904437, "grad_norm": 0.7124598026275635, "learning_rate": 
5.437997724687145e-05, "loss": 1.3858, "step": 4161 }, { "epoch": 4.734926052332195, "grad_norm": 0.8076978325843811, "learning_rate": 5.4152445961319684e-05, "loss": 1.6923, "step": 4162 }, { "epoch": 4.736063708759954, "grad_norm": 1.0069531202316284, "learning_rate": 5.392491467576792e-05, "loss": 2.5924, "step": 4163 }, { "epoch": 4.737201365187714, "grad_norm": 0.7821694016456604, "learning_rate": 5.369738339021615e-05, "loss": 1.5226, "step": 4164 }, { "epoch": 4.738339021615472, "grad_norm": 1.0969029664993286, "learning_rate": 5.3469852104664395e-05, "loss": 1.791, "step": 4165 }, { "epoch": 4.739476678043231, "grad_norm": 0.6663936972618103, "learning_rate": 5.324232081911263e-05, "loss": 1.298, "step": 4166 }, { "epoch": 4.7406143344709895, "grad_norm": 0.6106514930725098, "learning_rate": 5.301478953356086e-05, "loss": 1.0094, "step": 4167 }, { "epoch": 4.741751990898749, "grad_norm": 0.8213837742805481, "learning_rate": 5.2787258248009106e-05, "loss": 1.5493, "step": 4168 }, { "epoch": 4.742889647326507, "grad_norm": 1.1655770540237427, "learning_rate": 5.2559726962457336e-05, "loss": 2.0271, "step": 4169 }, { "epoch": 4.744027303754266, "grad_norm": 0.9248787760734558, "learning_rate": 5.233219567690557e-05, "loss": 2.3724, "step": 4170 }, { "epoch": 4.745164960182025, "grad_norm": 0.7159304022789001, "learning_rate": 5.210466439135381e-05, "loss": 1.3412, "step": 4171 }, { "epoch": 4.746302616609784, "grad_norm": 0.773129403591156, "learning_rate": 5.187713310580205e-05, "loss": 2.2651, "step": 4172 }, { "epoch": 4.747440273037543, "grad_norm": 1.6007509231567383, "learning_rate": 5.164960182025029e-05, "loss": 2.185, "step": 4173 }, { "epoch": 4.748577929465301, "grad_norm": 0.9146437048912048, "learning_rate": 5.142207053469852e-05, "loss": 1.8578, "step": 4174 }, { "epoch": 4.74971558589306, "grad_norm": 1.0452239513397217, "learning_rate": 5.119453924914676e-05, "loss": 2.7301, "step": 4175 }, { "epoch": 4.750853242320819, "grad_norm": 
1.1946684122085571, "learning_rate": 5.0967007963594995e-05, "loss": 2.3748, "step": 4176 }, { "epoch": 4.751990898748578, "grad_norm": 1.1966066360473633, "learning_rate": 5.073947667804323e-05, "loss": 1.9953, "step": 4177 }, { "epoch": 4.753128555176337, "grad_norm": 0.8742679357528687, "learning_rate": 5.051194539249147e-05, "loss": 1.7049, "step": 4178 }, { "epoch": 4.7542662116040955, "grad_norm": 0.7945109605789185, "learning_rate": 5.0284414106939706e-05, "loss": 1.4822, "step": 4179 }, { "epoch": 4.755403868031855, "grad_norm": 0.7654008269309998, "learning_rate": 5.005688282138794e-05, "loss": 1.0475, "step": 4180 }, { "epoch": 4.756541524459613, "grad_norm": 1.0608311891555786, "learning_rate": 4.982935153583617e-05, "loss": 1.5947, "step": 4181 }, { "epoch": 4.757679180887372, "grad_norm": 1.2881845235824585, "learning_rate": 4.960182025028442e-05, "loss": 3.6129, "step": 4182 }, { "epoch": 4.7588168373151305, "grad_norm": 1.1011303663253784, "learning_rate": 4.9374288964732654e-05, "loss": 1.1276, "step": 4183 }, { "epoch": 4.75995449374289, "grad_norm": 1.142633318901062, "learning_rate": 4.9146757679180884e-05, "loss": 2.4236, "step": 4184 }, { "epoch": 4.761092150170649, "grad_norm": 0.940455436706543, "learning_rate": 4.891922639362913e-05, "loss": 2.3076, "step": 4185 }, { "epoch": 4.762229806598407, "grad_norm": 0.8096868991851807, "learning_rate": 4.869169510807736e-05, "loss": 0.983, "step": 4186 }, { "epoch": 4.763367463026166, "grad_norm": 1.020517349243164, "learning_rate": 4.8464163822525595e-05, "loss": 2.3447, "step": 4187 }, { "epoch": 4.764505119453925, "grad_norm": 1.09331214427948, "learning_rate": 4.823663253697384e-05, "loss": 2.6737, "step": 4188 }, { "epoch": 4.765642775881684, "grad_norm": 1.5871385335922241, "learning_rate": 4.800910125142207e-05, "loss": 3.2077, "step": 4189 }, { "epoch": 4.766780432309442, "grad_norm": 0.6802913546562195, "learning_rate": 4.778156996587031e-05, "loss": 1.2683, "step": 4190 }, { "epoch": 
4.7679180887372015, "grad_norm": 0.8317682147026062, "learning_rate": 4.755403868031854e-05, "loss": 1.9439, "step": 4191 }, { "epoch": 4.769055745164961, "grad_norm": 0.8209235668182373, "learning_rate": 4.732650739476678e-05, "loss": 1.3119, "step": 4192 }, { "epoch": 4.770193401592719, "grad_norm": 1.44149649143219, "learning_rate": 4.7098976109215024e-05, "loss": 2.4981, "step": 4193 }, { "epoch": 4.771331058020478, "grad_norm": 1.0103590488433838, "learning_rate": 4.6871444823663254e-05, "loss": 1.6604, "step": 4194 }, { "epoch": 4.7724687144482365, "grad_norm": 1.4098118543624878, "learning_rate": 4.664391353811149e-05, "loss": 3.3837, "step": 4195 }, { "epoch": 4.773606370875996, "grad_norm": 0.829317569732666, "learning_rate": 4.641638225255973e-05, "loss": 1.5868, "step": 4196 }, { "epoch": 4.774744027303754, "grad_norm": 0.9138002991676331, "learning_rate": 4.6188850967007965e-05, "loss": 2.3734, "step": 4197 }, { "epoch": 4.775881683731513, "grad_norm": 0.6845062971115112, "learning_rate": 4.5961319681456195e-05, "loss": 1.1712, "step": 4198 }, { "epoch": 4.7770193401592715, "grad_norm": 0.592144787311554, "learning_rate": 4.573378839590444e-05, "loss": 1.0386, "step": 4199 }, { "epoch": 4.778156996587031, "grad_norm": 1.318644404411316, "learning_rate": 4.5506257110352676e-05, "loss": 2.859, "step": 4200 }, { "epoch": 4.779294653014789, "grad_norm": 1.0490177869796753, "learning_rate": 4.5278725824800906e-05, "loss": 2.2225, "step": 4201 }, { "epoch": 4.780432309442548, "grad_norm": 0.9637972712516785, "learning_rate": 4.505119453924915e-05, "loss": 1.6452, "step": 4202 }, { "epoch": 4.7815699658703075, "grad_norm": 1.3395143747329712, "learning_rate": 4.482366325369738e-05, "loss": 2.982, "step": 4203 }, { "epoch": 4.782707622298066, "grad_norm": 0.9925627708435059, "learning_rate": 4.4596131968145624e-05, "loss": 2.2197, "step": 4204 }, { "epoch": 4.783845278725825, "grad_norm": 0.715560257434845, "learning_rate": 4.436860068259386e-05, "loss": 
1.6095, "step": 4205 }, { "epoch": 4.784982935153583, "grad_norm": 0.7981997132301331, "learning_rate": 4.414106939704209e-05, "loss": 1.859, "step": 4206 }, { "epoch": 4.7861205915813425, "grad_norm": 0.894432783126831, "learning_rate": 4.3913538111490335e-05, "loss": 1.4185, "step": 4207 }, { "epoch": 4.787258248009101, "grad_norm": 1.0646681785583496, "learning_rate": 4.3686006825938565e-05, "loss": 2.6534, "step": 4208 }, { "epoch": 4.78839590443686, "grad_norm": 1.0470125675201416, "learning_rate": 4.34584755403868e-05, "loss": 2.4396, "step": 4209 }, { "epoch": 4.789533560864619, "grad_norm": 1.2576868534088135, "learning_rate": 4.3230944254835046e-05, "loss": 1.9555, "step": 4210 }, { "epoch": 4.7906712172923775, "grad_norm": 0.7670148611068726, "learning_rate": 4.3003412969283276e-05, "loss": 1.1409, "step": 4211 }, { "epoch": 4.791808873720137, "grad_norm": 0.9157593250274658, "learning_rate": 4.277588168373151e-05, "loss": 1.9202, "step": 4212 }, { "epoch": 4.792946530147895, "grad_norm": 0.4483858048915863, "learning_rate": 4.254835039817975e-05, "loss": 0.7922, "step": 4213 }, { "epoch": 4.794084186575654, "grad_norm": 1.031166911125183, "learning_rate": 4.232081911262799e-05, "loss": 2.3445, "step": 4214 }, { "epoch": 4.795221843003413, "grad_norm": 1.042548418045044, "learning_rate": 4.2093287827076224e-05, "loss": 1.7547, "step": 4215 }, { "epoch": 4.796359499431172, "grad_norm": 1.197379231452942, "learning_rate": 4.186575654152446e-05, "loss": 1.7015, "step": 4216 }, { "epoch": 4.797497155858931, "grad_norm": 0.9254120588302612, "learning_rate": 4.16382252559727e-05, "loss": 1.1992, "step": 4217 }, { "epoch": 4.798634812286689, "grad_norm": 0.6915335655212402, "learning_rate": 4.141069397042093e-05, "loss": 0.8966, "step": 4218 }, { "epoch": 4.7997724687144485, "grad_norm": 1.2596569061279297, "learning_rate": 4.118316268486917e-05, "loss": 1.7151, "step": 4219 }, { "epoch": 4.800910125142207, "grad_norm": 0.873573362827301, "learning_rate": 
4.09556313993174e-05, "loss": 2.261, "step": 4220 }, { "epoch": 4.802047781569966, "grad_norm": 0.6213791370391846, "learning_rate": 4.0728100113765646e-05, "loss": 1.1226, "step": 4221 }, { "epoch": 4.803185437997724, "grad_norm": 0.9243321418762207, "learning_rate": 4.050056882821388e-05, "loss": 2.3551, "step": 4222 }, { "epoch": 4.8043230944254836, "grad_norm": 1.7408545017242432, "learning_rate": 4.0273037542662113e-05, "loss": 3.208, "step": 4223 }, { "epoch": 4.805460750853243, "grad_norm": 0.5592201352119446, "learning_rate": 4.004550625711036e-05, "loss": 0.7587, "step": 4224 }, { "epoch": 4.806598407281001, "grad_norm": 0.7832121849060059, "learning_rate": 3.981797497155859e-05, "loss": 1.2192, "step": 4225 }, { "epoch": 4.80773606370876, "grad_norm": 0.9754132628440857, "learning_rate": 3.9590443686006824e-05, "loss": 1.2067, "step": 4226 }, { "epoch": 4.808873720136519, "grad_norm": 0.9849193096160889, "learning_rate": 3.936291240045507e-05, "loss": 2.3535, "step": 4227 }, { "epoch": 4.810011376564278, "grad_norm": 0.9449824690818787, "learning_rate": 3.91353811149033e-05, "loss": 1.7503, "step": 4228 }, { "epoch": 4.811149032992036, "grad_norm": 1.1584771871566772, "learning_rate": 3.8907849829351535e-05, "loss": 2.9749, "step": 4229 }, { "epoch": 4.812286689419795, "grad_norm": 0.9928334355354309, "learning_rate": 3.868031854379977e-05, "loss": 2.3772, "step": 4230 }, { "epoch": 4.8134243458475545, "grad_norm": 1.044175386428833, "learning_rate": 3.845278725824801e-05, "loss": 2.2084, "step": 4231 }, { "epoch": 4.814562002275313, "grad_norm": 1.501725673675537, "learning_rate": 3.8225255972696246e-05, "loss": 3.0545, "step": 4232 }, { "epoch": 4.815699658703072, "grad_norm": 0.6129480004310608, "learning_rate": 3.799772468714448e-05, "loss": 1.1711, "step": 4233 }, { "epoch": 4.81683731513083, "grad_norm": 0.6920023560523987, "learning_rate": 3.777019340159272e-05, "loss": 1.006, "step": 4234 }, { "epoch": 4.81797497155859, "grad_norm": 
1.1781179904937744, "learning_rate": 3.754266211604096e-05, "loss": 2.6292, "step": 4235 }, { "epoch": 4.819112627986348, "grad_norm": 0.9663881659507751, "learning_rate": 3.7315130830489194e-05, "loss": 1.689, "step": 4236 }, { "epoch": 4.820250284414107, "grad_norm": 1.1518983840942383, "learning_rate": 3.708759954493743e-05, "loss": 2.804, "step": 4237 }, { "epoch": 4.821387940841865, "grad_norm": 1.6885265111923218, "learning_rate": 3.686006825938567e-05, "loss": 2.9798, "step": 4238 }, { "epoch": 4.822525597269625, "grad_norm": 1.2374640703201294, "learning_rate": 3.6632536973833905e-05, "loss": 1.4952, "step": 4239 }, { "epoch": 4.823663253697383, "grad_norm": 0.8825571537017822, "learning_rate": 3.6405005688282136e-05, "loss": 1.5783, "step": 4240 }, { "epoch": 4.824800910125142, "grad_norm": 1.4535282850265503, "learning_rate": 3.617747440273038e-05, "loss": 2.3041, "step": 4241 }, { "epoch": 4.825938566552901, "grad_norm": 0.9175541400909424, "learning_rate": 3.594994311717861e-05, "loss": 1.5203, "step": 4242 }, { "epoch": 4.82707622298066, "grad_norm": 0.984664797782898, "learning_rate": 3.5722411831626847e-05, "loss": 2.5147, "step": 4243 }, { "epoch": 4.828213879408419, "grad_norm": 0.7770801782608032, "learning_rate": 3.549488054607509e-05, "loss": 2.0094, "step": 4244 }, { "epoch": 4.829351535836177, "grad_norm": 0.9367466568946838, "learning_rate": 3.526734926052332e-05, "loss": 2.4352, "step": 4245 }, { "epoch": 4.830489192263936, "grad_norm": 0.7870509028434753, "learning_rate": 3.503981797497156e-05, "loss": 2.0384, "step": 4246 }, { "epoch": 4.831626848691695, "grad_norm": 0.8838803172111511, "learning_rate": 3.4812286689419794e-05, "loss": 1.4871, "step": 4247 }, { "epoch": 4.832764505119454, "grad_norm": 0.8027423024177551, "learning_rate": 3.458475540386803e-05, "loss": 1.8367, "step": 4248 }, { "epoch": 4.833902161547213, "grad_norm": 0.9481743574142456, "learning_rate": 3.435722411831627e-05, "loss": 0.9802, "step": 4249 }, { "epoch": 
4.835039817974971, "grad_norm": 0.6448602676391602, "learning_rate": 3.4129692832764505e-05, "loss": 1.3029, "step": 4250 }, { "epoch": 4.836177474402731, "grad_norm": 0.8399932980537415, "learning_rate": 3.390216154721274e-05, "loss": 2.0227, "step": 4251 }, { "epoch": 4.837315130830489, "grad_norm": 1.314458966255188, "learning_rate": 3.367463026166098e-05, "loss": 2.1751, "step": 4252 }, { "epoch": 4.838452787258248, "grad_norm": 0.5397687554359436, "learning_rate": 3.3447098976109216e-05, "loss": 1.035, "step": 4253 }, { "epoch": 4.839590443686006, "grad_norm": 0.7466420531272888, "learning_rate": 3.3219567690557453e-05, "loss": 1.8664, "step": 4254 }, { "epoch": 4.840728100113766, "grad_norm": 1.525064468383789, "learning_rate": 3.299203640500569e-05, "loss": 1.8065, "step": 4255 }, { "epoch": 4.841865756541525, "grad_norm": 1.5727421045303345, "learning_rate": 3.276450511945393e-05, "loss": 1.9101, "step": 4256 }, { "epoch": 4.843003412969283, "grad_norm": 0.8878625631332397, "learning_rate": 3.253697383390216e-05, "loss": 2.3298, "step": 4257 }, { "epoch": 4.844141069397042, "grad_norm": 0.9675851464271545, "learning_rate": 3.23094425483504e-05, "loss": 1.6869, "step": 4258 }, { "epoch": 4.845278725824801, "grad_norm": 1.704845666885376, "learning_rate": 3.208191126279864e-05, "loss": 3.9697, "step": 4259 }, { "epoch": 4.84641638225256, "grad_norm": 0.6775122284889221, "learning_rate": 3.185437997724687e-05, "loss": 1.1285, "step": 4260 }, { "epoch": 4.847554038680318, "grad_norm": 0.7678422331809998, "learning_rate": 3.162684869169511e-05, "loss": 1.6568, "step": 4261 }, { "epoch": 4.848691695108077, "grad_norm": 0.8857121467590332, "learning_rate": 3.139931740614334e-05, "loss": 1.3846, "step": 4262 }, { "epoch": 4.849829351535837, "grad_norm": 0.8548673987388611, "learning_rate": 3.117178612059158e-05, "loss": 1.6368, "step": 4263 }, { "epoch": 4.850967007963595, "grad_norm": 0.9651059508323669, "learning_rate": 3.094425483503982e-05, "loss": 2.2593, 
"step": 4264 }, { "epoch": 4.852104664391354, "grad_norm": 0.593999445438385, "learning_rate": 3.0716723549488054e-05, "loss": 0.7888, "step": 4265 }, { "epoch": 4.853242320819112, "grad_norm": 1.1292147636413574, "learning_rate": 3.048919226393629e-05, "loss": 1.7061, "step": 4266 }, { "epoch": 4.854379977246872, "grad_norm": 1.285089135169983, "learning_rate": 3.026166097838453e-05, "loss": 2.3986, "step": 4267 }, { "epoch": 4.85551763367463, "grad_norm": 0.9155679941177368, "learning_rate": 3.0034129692832765e-05, "loss": 1.9914, "step": 4268 }, { "epoch": 4.856655290102389, "grad_norm": 1.3086957931518555, "learning_rate": 2.9806598407281e-05, "loss": 2.3898, "step": 4269 }, { "epoch": 4.857792946530148, "grad_norm": 1.1196303367614746, "learning_rate": 2.957906712172924e-05, "loss": 3.3304, "step": 4270 }, { "epoch": 4.858930602957907, "grad_norm": 1.0274990797042847, "learning_rate": 2.9351535836177476e-05, "loss": 1.8834, "step": 4271 }, { "epoch": 4.860068259385666, "grad_norm": 0.9945041537284851, "learning_rate": 2.912400455062571e-05, "loss": 2.645, "step": 4272 }, { "epoch": 4.861205915813424, "grad_norm": 0.88654625415802, "learning_rate": 2.889647326507395e-05, "loss": 1.7848, "step": 4273 }, { "epoch": 4.862343572241183, "grad_norm": 0.6680393815040588, "learning_rate": 2.8668941979522186e-05, "loss": 1.1586, "step": 4274 }, { "epoch": 4.863481228668942, "grad_norm": 1.0366647243499756, "learning_rate": 2.844141069397042e-05, "loss": 1.2493, "step": 4275 }, { "epoch": 4.864618885096701, "grad_norm": 1.4107547998428345, "learning_rate": 2.8213879408418657e-05, "loss": 3.2325, "step": 4276 }, { "epoch": 4.865756541524459, "grad_norm": 0.627693772315979, "learning_rate": 2.7986348122866894e-05, "loss": 0.5508, "step": 4277 }, { "epoch": 4.8668941979522184, "grad_norm": 0.7929319739341736, "learning_rate": 2.7758816837315134e-05, "loss": 1.5375, "step": 4278 }, { "epoch": 4.868031854379977, "grad_norm": 1.2130954265594482, "learning_rate": 
2.7531285551763368e-05, "loss": 2.0214, "step": 4279 }, { "epoch": 4.869169510807736, "grad_norm": 0.9208192229270935, "learning_rate": 2.7303754266211605e-05, "loss": 1.7519, "step": 4280 }, { "epoch": 4.870307167235495, "grad_norm": 1.1531082391738892, "learning_rate": 2.7076222980659842e-05, "loss": 2.057, "step": 4281 }, { "epoch": 4.8714448236632535, "grad_norm": 1.157972812652588, "learning_rate": 2.6848691695108076e-05, "loss": 2.9342, "step": 4282 }, { "epoch": 4.872582480091013, "grad_norm": 0.9264168739318848, "learning_rate": 2.6621160409556316e-05, "loss": 1.2696, "step": 4283 }, { "epoch": 4.873720136518771, "grad_norm": 1.2955060005187988, "learning_rate": 2.6393629124004553e-05, "loss": 3.8855, "step": 4284 }, { "epoch": 4.87485779294653, "grad_norm": 0.8939307332038879, "learning_rate": 2.6166097838452787e-05, "loss": 1.0266, "step": 4285 }, { "epoch": 4.8759954493742885, "grad_norm": 0.6583961844444275, "learning_rate": 2.5938566552901024e-05, "loss": 1.4206, "step": 4286 }, { "epoch": 4.877133105802048, "grad_norm": 0.642296552658081, "learning_rate": 2.571103526734926e-05, "loss": 0.9931, "step": 4287 }, { "epoch": 4.878270762229807, "grad_norm": 0.9714062809944153, "learning_rate": 2.5483503981797498e-05, "loss": 1.8689, "step": 4288 }, { "epoch": 4.879408418657565, "grad_norm": 1.0246009826660156, "learning_rate": 2.5255972696245735e-05, "loss": 2.1948, "step": 4289 }, { "epoch": 4.8805460750853245, "grad_norm": 0.6404163837432861, "learning_rate": 2.502844141069397e-05, "loss": 0.7135, "step": 4290 }, { "epoch": 4.881683731513083, "grad_norm": 0.6843558549880981, "learning_rate": 2.480091012514221e-05, "loss": 1.1422, "step": 4291 }, { "epoch": 4.882821387940842, "grad_norm": 0.5428027510643005, "learning_rate": 2.4573378839590442e-05, "loss": 1.0285, "step": 4292 }, { "epoch": 4.8839590443686, "grad_norm": 1.4773718118667603, "learning_rate": 2.434584755403868e-05, "loss": 1.9624, "step": 4293 }, { "epoch": 4.8850967007963595, "grad_norm": 
1.062320590019226, "learning_rate": 2.411831626848692e-05, "loss": 2.0422, "step": 4294 }, { "epoch": 4.886234357224119, "grad_norm": 0.9401637315750122, "learning_rate": 2.3890784982935157e-05, "loss": 2.0883, "step": 4295 }, { "epoch": 4.887372013651877, "grad_norm": 0.9944307804107666, "learning_rate": 2.366325369738339e-05, "loss": 2.0369, "step": 4296 }, { "epoch": 4.888509670079636, "grad_norm": 1.2259140014648438, "learning_rate": 2.3435722411831627e-05, "loss": 2.3347, "step": 4297 }, { "epoch": 4.8896473265073945, "grad_norm": 0.7835159301757812, "learning_rate": 2.3208191126279864e-05, "loss": 1.4377, "step": 4298 }, { "epoch": 4.890784982935154, "grad_norm": 1.1913021802902222, "learning_rate": 2.2980659840728098e-05, "loss": 1.6073, "step": 4299 }, { "epoch": 4.891922639362912, "grad_norm": 0.95366370677948, "learning_rate": 2.2753128555176338e-05, "loss": 1.8692, "step": 4300 }, { "epoch": 4.893060295790671, "grad_norm": 0.8128577470779419, "learning_rate": 2.2525597269624575e-05, "loss": 1.3592, "step": 4301 }, { "epoch": 4.8941979522184305, "grad_norm": 1.3554686307907104, "learning_rate": 2.2298065984072812e-05, "loss": 2.6948, "step": 4302 }, { "epoch": 4.895335608646189, "grad_norm": 0.8826941847801208, "learning_rate": 2.2070534698521046e-05, "loss": 1.4915, "step": 4303 }, { "epoch": 4.896473265073948, "grad_norm": 1.1675195693969727, "learning_rate": 2.1843003412969283e-05, "loss": 1.9757, "step": 4304 }, { "epoch": 4.897610921501706, "grad_norm": 0.7045602798461914, "learning_rate": 2.1615472127417523e-05, "loss": 1.5306, "step": 4305 }, { "epoch": 4.8987485779294655, "grad_norm": 1.0397857427597046, "learning_rate": 2.1387940841865757e-05, "loss": 1.9285, "step": 4306 }, { "epoch": 4.899886234357224, "grad_norm": 0.5089128613471985, "learning_rate": 2.1160409556313994e-05, "loss": 0.8025, "step": 4307 }, { "epoch": 4.901023890784983, "grad_norm": 0.6715580821037292, "learning_rate": 2.093287827076223e-05, "loss": 1.3201, "step": 4308 }, { 
"epoch": 4.902161547212742, "grad_norm": 1.0820637941360474, "learning_rate": 2.0705346985210464e-05, "loss": 1.5423, "step": 4309 }, { "epoch": 4.9032992036405005, "grad_norm": 1.0669132471084595, "learning_rate": 2.04778156996587e-05, "loss": 1.2283, "step": 4310 }, { "epoch": 4.90443686006826, "grad_norm": 0.9476402401924133, "learning_rate": 2.025028441410694e-05, "loss": 1.5601, "step": 4311 }, { "epoch": 4.905574516496018, "grad_norm": 0.8250419497489929, "learning_rate": 2.002275312855518e-05, "loss": 1.2876, "step": 4312 }, { "epoch": 4.906712172923777, "grad_norm": 0.8450213074684143, "learning_rate": 1.9795221843003412e-05, "loss": 1.3217, "step": 4313 }, { "epoch": 4.907849829351536, "grad_norm": 0.6894076466560364, "learning_rate": 1.956769055745165e-05, "loss": 0.8804, "step": 4314 }, { "epoch": 4.908987485779295, "grad_norm": 0.8378640413284302, "learning_rate": 1.9340159271899886e-05, "loss": 1.5368, "step": 4315 }, { "epoch": 4.910125142207053, "grad_norm": 0.8414782285690308, "learning_rate": 1.9112627986348123e-05, "loss": 2.0629, "step": 4316 }, { "epoch": 4.911262798634812, "grad_norm": 0.7455840706825256, "learning_rate": 1.888509670079636e-05, "loss": 2.0014, "step": 4317 }, { "epoch": 4.912400455062571, "grad_norm": 0.9223942756652832, "learning_rate": 1.8657565415244597e-05, "loss": 2.2794, "step": 4318 }, { "epoch": 4.91353811149033, "grad_norm": 0.8256419897079468, "learning_rate": 1.8430034129692834e-05, "loss": 0.9545, "step": 4319 }, { "epoch": 4.914675767918089, "grad_norm": 0.9742276668548584, "learning_rate": 1.8202502844141068e-05, "loss": 1.5479, "step": 4320 }, { "epoch": 4.915813424345847, "grad_norm": 0.6871644854545593, "learning_rate": 1.7974971558589305e-05, "loss": 1.025, "step": 4321 }, { "epoch": 4.9169510807736065, "grad_norm": 0.43966150283813477, "learning_rate": 1.7747440273037545e-05, "loss": 0.4528, "step": 4322 }, { "epoch": 4.918088737201365, "grad_norm": 0.7050820589065552, "learning_rate": 1.751990898748578e-05, 
"loss": 1.4931, "step": 4323 }, { "epoch": 4.919226393629124, "grad_norm": 0.9527431130409241, "learning_rate": 1.7292377701934016e-05, "loss": 1.7784, "step": 4324 }, { "epoch": 4.920364050056882, "grad_norm": 0.806577742099762, "learning_rate": 1.7064846416382253e-05, "loss": 1.7357, "step": 4325 }, { "epoch": 4.921501706484642, "grad_norm": 0.8829598426818848, "learning_rate": 1.683731513083049e-05, "loss": 0.9894, "step": 4326 }, { "epoch": 4.922639362912401, "grad_norm": 0.9835923910140991, "learning_rate": 1.6609783845278727e-05, "loss": 2.0375, "step": 4327 }, { "epoch": 4.923777019340159, "grad_norm": 0.6324472427368164, "learning_rate": 1.6382252559726964e-05, "loss": 1.0425, "step": 4328 }, { "epoch": 4.924914675767918, "grad_norm": 1.1519763469696045, "learning_rate": 1.61547212741752e-05, "loss": 1.2597, "step": 4329 }, { "epoch": 4.926052332195677, "grad_norm": 0.9644741415977478, "learning_rate": 1.5927189988623434e-05, "loss": 1.327, "step": 4330 }, { "epoch": 4.927189988623436, "grad_norm": 1.3414863348007202, "learning_rate": 1.569965870307167e-05, "loss": 1.7992, "step": 4331 }, { "epoch": 4.928327645051194, "grad_norm": 1.1579735279083252, "learning_rate": 1.547212741751991e-05, "loss": 2.0279, "step": 4332 }, { "epoch": 4.929465301478953, "grad_norm": 0.6621242761611938, "learning_rate": 1.5244596131968145e-05, "loss": 1.217, "step": 4333 }, { "epoch": 4.9306029579067125, "grad_norm": 1.626521348953247, "learning_rate": 1.5017064846416382e-05, "loss": 1.5727, "step": 4334 }, { "epoch": 4.931740614334471, "grad_norm": 1.1708447933197021, "learning_rate": 1.478953356086462e-05, "loss": 1.681, "step": 4335 }, { "epoch": 4.93287827076223, "grad_norm": 0.654159426689148, "learning_rate": 1.4562002275312855e-05, "loss": 1.478, "step": 4336 }, { "epoch": 4.934015927189988, "grad_norm": 0.9996544718742371, "learning_rate": 1.4334470989761093e-05, "loss": 2.0139, "step": 4337 }, { "epoch": 4.935153583617748, "grad_norm": 0.6572261452674866, 
"learning_rate": 1.4106939704209329e-05, "loss": 1.5581, "step": 4338 }, { "epoch": 4.936291240045506, "grad_norm": 0.7390673160552979, "learning_rate": 1.3879408418657567e-05, "loss": 1.218, "step": 4339 }, { "epoch": 4.937428896473265, "grad_norm": 0.8499306440353394, "learning_rate": 1.3651877133105803e-05, "loss": 2.0041, "step": 4340 }, { "epoch": 4.938566552901024, "grad_norm": 0.7346335053443909, "learning_rate": 1.3424345847554038e-05, "loss": 1.2743, "step": 4341 }, { "epoch": 4.939704209328783, "grad_norm": 0.9406227469444275, "learning_rate": 1.3196814562002277e-05, "loss": 1.9248, "step": 4342 }, { "epoch": 4.940841865756542, "grad_norm": 0.9315640330314636, "learning_rate": 1.2969283276450512e-05, "loss": 1.8989, "step": 4343 }, { "epoch": 4.9419795221843, "grad_norm": 0.8979254961013794, "learning_rate": 1.2741751990898749e-05, "loss": 1.645, "step": 4344 }, { "epoch": 4.943117178612059, "grad_norm": 1.1764620542526245, "learning_rate": 1.2514220705346986e-05, "loss": 2.31, "step": 4345 }, { "epoch": 4.944254835039818, "grad_norm": 0.8805619478225708, "learning_rate": 1.2286689419795221e-05, "loss": 2.1559, "step": 4346 }, { "epoch": 4.945392491467577, "grad_norm": 2.1326658725738525, "learning_rate": 1.205915813424346e-05, "loss": 4.3989, "step": 4347 }, { "epoch": 4.946530147895336, "grad_norm": 0.6682014465332031, "learning_rate": 1.1831626848691695e-05, "loss": 1.1768, "step": 4348 }, { "epoch": 4.947667804323094, "grad_norm": 0.696792721748352, "learning_rate": 1.1604095563139932e-05, "loss": 0.8609, "step": 4349 }, { "epoch": 4.948805460750854, "grad_norm": 1.4207789897918701, "learning_rate": 1.1376564277588169e-05, "loss": 2.1457, "step": 4350 }, { "epoch": 4.949943117178612, "grad_norm": 1.3908188343048096, "learning_rate": 1.1149032992036406e-05, "loss": 1.7755, "step": 4351 }, { "epoch": 4.951080773606371, "grad_norm": 1.002634882926941, "learning_rate": 1.0921501706484641e-05, "loss": 2.3076, "step": 4352 }, { "epoch": 4.952218430034129, 
"grad_norm": 1.1484334468841553, "learning_rate": 1.0693970420932878e-05, "loss": 2.1246, "step": 4353 }, { "epoch": 4.953356086461889, "grad_norm": 1.0049216747283936, "learning_rate": 1.0466439135381115e-05, "loss": 2.0208, "step": 4354 }, { "epoch": 4.954493742889647, "grad_norm": 0.7231261134147644, "learning_rate": 1.023890784982935e-05, "loss": 0.9476, "step": 4355 }, { "epoch": 4.955631399317406, "grad_norm": 0.9415931105613708, "learning_rate": 1.001137656427759e-05, "loss": 1.935, "step": 4356 }, { "epoch": 4.9567690557451645, "grad_norm": 1.4136885404586792, "learning_rate": 9.783845278725825e-06, "loss": 2.5718, "step": 4357 }, { "epoch": 4.957906712172924, "grad_norm": 0.6967998147010803, "learning_rate": 9.556313993174062e-06, "loss": 0.6904, "step": 4358 }, { "epoch": 4.959044368600683, "grad_norm": 0.5910859107971191, "learning_rate": 9.328782707622299e-06, "loss": 0.8882, "step": 4359 }, { "epoch": 4.960182025028441, "grad_norm": 0.9837796688079834, "learning_rate": 9.101251422070534e-06, "loss": 0.7616, "step": 4360 }, { "epoch": 4.9613196814562, "grad_norm": 0.983348548412323, "learning_rate": 8.873720136518773e-06, "loss": 1.538, "step": 4361 }, { "epoch": 4.962457337883959, "grad_norm": 0.8036980628967285, "learning_rate": 8.646188850967008e-06, "loss": 1.886, "step": 4362 }, { "epoch": 4.963594994311718, "grad_norm": 1.007638692855835, "learning_rate": 8.418657565415245e-06, "loss": 2.004, "step": 4363 }, { "epoch": 4.964732650739476, "grad_norm": 0.9947067499160767, "learning_rate": 8.191126279863482e-06, "loss": 1.8577, "step": 4364 }, { "epoch": 4.965870307167235, "grad_norm": 1.1273043155670166, "learning_rate": 7.963594994311717e-06, "loss": 1.8335, "step": 4365 }, { "epoch": 4.967007963594995, "grad_norm": 1.045556664466858, "learning_rate": 7.736063708759956e-06, "loss": 1.9307, "step": 4366 }, { "epoch": 4.968145620022753, "grad_norm": 1.058106541633606, "learning_rate": 7.508532423208191e-06, "loss": 1.9808, "step": 4367 }, { "epoch": 
4.969283276450512, "grad_norm": 0.9825536608695984, "learning_rate": 7.281001137656427e-06, "loss": 1.0482, "step": 4368 }, { "epoch": 4.9704209328782705, "grad_norm": 1.0794082880020142, "learning_rate": 7.053469852104664e-06, "loss": 1.8367, "step": 4369 }, { "epoch": 4.97155858930603, "grad_norm": 1.2737934589385986, "learning_rate": 6.825938566552901e-06, "loss": 2.6611, "step": 4370 }, { "epoch": 4.972696245733788, "grad_norm": 1.13683021068573, "learning_rate": 6.598407281001138e-06, "loss": 1.9974, "step": 4371 }, { "epoch": 4.973833902161547, "grad_norm": 1.0530338287353516, "learning_rate": 6.370875995449374e-06, "loss": 1.9576, "step": 4372 }, { "epoch": 4.974971558589306, "grad_norm": 1.3305796384811401, "learning_rate": 6.1433447098976105e-06, "loss": 3.5862, "step": 4373 }, { "epoch": 4.976109215017065, "grad_norm": 1.4939963817596436, "learning_rate": 5.9158134243458475e-06, "loss": 2.9359, "step": 4374 }, { "epoch": 4.977246871444824, "grad_norm": 1.0199118852615356, "learning_rate": 5.6882821387940845e-06, "loss": 1.8068, "step": 4375 }, { "epoch": 4.978384527872582, "grad_norm": 1.2236491441726685, "learning_rate": 5.460750853242321e-06, "loss": 2.3517, "step": 4376 }, { "epoch": 4.979522184300341, "grad_norm": 0.7806452512741089, "learning_rate": 5.233219567690558e-06, "loss": 1.238, "step": 4377 }, { "epoch": 4.9806598407281, "grad_norm": 1.1601351499557495, "learning_rate": 5.005688282138795e-06, "loss": 1.9707, "step": 4378 }, { "epoch": 4.981797497155859, "grad_norm": 1.6477166414260864, "learning_rate": 4.778156996587031e-06, "loss": 3.1835, "step": 4379 }, { "epoch": 4.982935153583618, "grad_norm": 0.8821132779121399, "learning_rate": 4.550625711035267e-06, "loss": 1.4507, "step": 4380 }, { "epoch": 4.9840728100113765, "grad_norm": 0.922042965888977, "learning_rate": 4.323094425483504e-06, "loss": 1.9507, "step": 4381 }, { "epoch": 4.985210466439136, "grad_norm": 0.9775927662849426, "learning_rate": 4.095563139931741e-06, "loss": 1.4578, 
"step": 4382 }, { "epoch": 4.986348122866894, "grad_norm": 1.6098992824554443, "learning_rate": 3.868031854379978e-06, "loss": 2.923, "step": 4383 }, { "epoch": 4.987485779294653, "grad_norm": 0.938714325428009, "learning_rate": 3.6405005688282136e-06, "loss": 1.6787, "step": 4384 }, { "epoch": 4.9886234357224115, "grad_norm": 0.8386014103889465, "learning_rate": 3.4129692832764506e-06, "loss": 2.1241, "step": 4385 }, { "epoch": 4.989761092150171, "grad_norm": 1.0702085494995117, "learning_rate": 3.185437997724687e-06, "loss": 2.7462, "step": 4386 }, { "epoch": 4.99089874857793, "grad_norm": 0.7873029708862305, "learning_rate": 2.9579067121729238e-06, "loss": 1.3806, "step": 4387 }, { "epoch": 4.992036405005688, "grad_norm": 0.7936583757400513, "learning_rate": 2.7303754266211603e-06, "loss": 1.4501, "step": 4388 }, { "epoch": 4.993174061433447, "grad_norm": 1.116550087928772, "learning_rate": 2.5028441410693973e-06, "loss": 3.2445, "step": 4389 }, { "epoch": 4.994311717861206, "grad_norm": 0.6965498328208923, "learning_rate": 2.2753128555176335e-06, "loss": 1.3281, "step": 4390 }, { "epoch": 4.995449374288965, "grad_norm": 1.0656460523605347, "learning_rate": 2.0477815699658705e-06, "loss": 2.3715, "step": 4391 }, { "epoch": 4.996587030716723, "grad_norm": 1.0514278411865234, "learning_rate": 1.8202502844141068e-06, "loss": 1.2846, "step": 4392 }, { "epoch": 4.9977246871444825, "grad_norm": 1.341645359992981, "learning_rate": 1.5927189988623436e-06, "loss": 1.663, "step": 4393 }, { "epoch": 4.998862343572241, "grad_norm": 0.9820802807807922, "learning_rate": 1.3651877133105802e-06, "loss": 2.5961, "step": 4394 }, { "epoch": 5.0, "grad_norm": 0.8708580136299133, "learning_rate": 1.1376564277588167e-06, "loss": 1.7838, "step": 4395 }, { "epoch": 5.0, "eval_f1": 0.8906, "eval_gen_len": 49.5727, "eval_loss": 1.8275777101516724, "eval_precision": 0.8894, "eval_recall": 0.8921, "eval_rouge1": 0.4407, "eval_rouge2": 0.1997, "eval_rougeL": 0.3672, "eval_rougeLsum": 
0.4075, "eval_runtime": 28.8614, "eval_samples_per_second": 3.811, "eval_steps_per_second": 0.485, "step": 4395 }, { "epoch": 5.0, "step": 4395, "total_flos": 4146070071306240.0, "train_loss": 1.9703198982044563, "train_runtime": 950.7032, "train_samples_per_second": 4.623, "train_steps_per_second": 4.623 } ], "logging_steps": 1, "max_steps": 4395, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4146070071306240.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }