{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9974842767295597, "eval_steps": 20, "global_step": 794, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005031446540880503, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8316, "step": 2 }, { "epoch": 0.010062893081761006, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8423, "step": 4 }, { "epoch": 0.01509433962264151, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8389, "step": 6 }, { "epoch": 0.02012578616352201, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8302, "step": 8 }, { "epoch": 0.025157232704402517, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8634, "step": 10 }, { "epoch": 0.03018867924528302, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 0.8553, "step": 12 }, { "epoch": 0.03522012578616352, "grad_norm": 0.7739448547363281, "learning_rate": 1.6666666666666667e-06, "loss": 0.8423, "step": 14 }, { "epoch": 0.04025157232704402, "grad_norm": 0.7387983202934265, "learning_rate": 5e-06, "loss": 0.8385, "step": 16 }, { "epoch": 0.045283018867924525, "grad_norm": 0.34386318922042847, "learning_rate": 8.333333333333334e-06, "loss": 0.8163, "step": 18 }, { "epoch": 0.050314465408805034, "grad_norm": 0.32477569580078125, "learning_rate": 1e-05, "loss": 0.7541, "step": 20 }, { "epoch": 0.050314465408805034, "eval_loss": 0.7434237599372864, "eval_runtime": 512.9045, "eval_samples_per_second": 20.873, "eval_steps_per_second": 0.164, "step": 20 }, { "epoch": 0.055345911949685536, "grad_norm": 0.3248128294944763, "learning_rate": 1.3333333333333333e-05, "loss": 0.7494, "step": 22 }, { "epoch": 0.06037735849056604, "grad_norm": 0.25117382407188416, "learning_rate": 1.6666666666666667e-05, "loss": 0.7204, "step": 24 }, { "epoch": 0.06540880503144654, "grad_norm": 0.23494713008403778, "learning_rate": 2e-05, "loss": 0.6902, "step": 26 }, { "epoch": 0.07044025157232704, "grad_norm": 0.19120900332927704, "learning_rate": 2.3333333333333336e-05, "loss": 0.6971, "step": 28 }, { "epoch": 0.07547169811320754, "grad_norm": 0.16960154473781586, "learning_rate": 2.6666666666666667e-05, "loss": 0.677, "step": 30 }, { "epoch": 0.08050314465408805, "grad_norm": 0.16359932720661163, "learning_rate": 3e-05, "loss": 0.6863, "step": 32 }, { "epoch": 0.08553459119496855, "grad_norm": 0.1389608234167099, "learning_rate": 3.3333333333333335e-05, "loss": 0.6786, "step": 34 }, { "epoch": 0.09056603773584905, "grad_norm": 0.14994372427463531, "learning_rate": 3.6666666666666666e-05, "loss": 0.6692, "step": 36 }, { "epoch": 0.09559748427672957, "grad_norm": 0.13334757089614868, "learning_rate": 4e-05, "loss": 0.666, "step": 38 }, { "epoch": 0.10062893081761007, "grad_norm": 0.14932414889335632, "learning_rate": 4.3333333333333334e-05, "loss": 0.6771, "step": 40 }, { "epoch": 0.10062893081761007, "eval_loss": 0.6487388610839844, "eval_runtime": 477.079, "eval_samples_per_second": 22.441, "eval_steps_per_second": 0.176, "step": 40 }, { "epoch": 0.10566037735849057, "grad_norm": 0.1352750062942505, "learning_rate": 4.666666666666667e-05, "loss": 0.668, "step": 42 }, { "epoch": 0.11069182389937107, "grad_norm": 0.11694569140672684, "learning_rate": 5e-05, "loss": 0.6517, "step": 44 }, { "epoch": 0.11572327044025157, "grad_norm": 0.12432057410478592, "learning_rate": 4.9999239107866414e-05, "loss": 0.6353, "step": 46 }, { "epoch": 0.12075471698113208, "grad_norm": 0.121449314057827, "learning_rate": 4.9996956482928485e-05, "loss": 0.6348, "step": 48 }, { "epoch": 0.12578616352201258, "grad_norm": 0.119928739964962, "learning_rate": 4.999315227957123e-05, "loss": 0.6402, "step": 50 }, { "epoch": 0.13081761006289308, "grad_norm": 0.12270842492580414, "learning_rate": 4.998782675509138e-05, "loss": 0.6294, "step": 52 }, { "epoch": 0.13584905660377358, "grad_norm": 0.1132158562541008, "learning_rate": 4.998098026968003e-05, "loss": 0.6417, "step": 54 }, { "epoch": 0.14088050314465408, "grad_norm": 0.13522745668888092, "learning_rate": 4.997261328639824e-05, "loss": 0.6366, "step": 56 }, { "epoch": 0.1459119496855346, "grad_norm": 0.11879543960094452, "learning_rate": 4.996272637114571e-05, "loss": 0.6335, "step": 58 }, { "epoch": 0.1509433962264151, "grad_norm": 0.1235610768198967, "learning_rate": 4.995132019262254e-05, "loss": 0.6483, "step": 60 }, { "epoch": 0.1509433962264151, "eval_loss": 0.6241472363471985, "eval_runtime": 445.3592, "eval_samples_per_second": 24.039, "eval_steps_per_second": 0.189, "step": 60 }, { "epoch": 0.1559748427672956, "grad_norm": 0.11320329457521439, "learning_rate": 4.993839552228398e-05, "loss": 0.6242, "step": 62 }, { "epoch": 0.1610062893081761, "grad_norm": 0.12192777544260025, "learning_rate": 4.992395323428824e-05, "loss": 0.6214, "step": 64 }, { "epoch": 0.1660377358490566, "grad_norm": 0.12313053756952286, "learning_rate": 4.9907994305437405e-05, "loss": 0.6331, "step": 66 }, { "epoch": 0.1710691823899371, "grad_norm": 0.12243402749300003, "learning_rate": 4.989051981511133e-05, "loss": 0.6222, "step": 68 }, { "epoch": 0.1761006289308176, "grad_norm": 0.12040385603904724, "learning_rate": 4.9871530945194654e-05, "loss": 0.6394, "step": 70 }, { "epoch": 0.1811320754716981, "grad_norm": 0.11485008895397186, "learning_rate": 4.985102897999687e-05, "loss": 0.5755, "step": 72 }, { "epoch": 0.1861635220125786, "grad_norm": 0.1206885501742363, "learning_rate": 4.982901530616545e-05, "loss": 0.6354, "step": 74 }, { "epoch": 0.19119496855345913, "grad_norm": 0.12174931168556213, "learning_rate": 4.980549141259205e-05, "loss": 0.6362, "step": 76 }, { "epoch": 0.19622641509433963, "grad_norm": 0.13587461411952972, "learning_rate": 4.9780458890311846e-05, "loss": 0.616, "step": 78 }, { "epoch": 0.20125786163522014, "grad_norm": 0.12933531403541565, "learning_rate": 4.9753919432395876e-05, "loss": 0.6363, "step": 80 }, { "epoch": 0.20125786163522014, "eval_loss": 0.6132378578186035, "eval_runtime": 463.9686, "eval_samples_per_second": 23.075, "eval_steps_per_second": 0.181, "step": 80 }, { "epoch": 0.20628930817610064, "grad_norm": 0.11656603217124939, "learning_rate": 4.9725874833836574e-05, "loss": 0.62, "step": 82 }, { "epoch": 0.21132075471698114, "grad_norm": 0.11784744262695312, "learning_rate": 4.969632699142632e-05, "loss": 0.6094, "step": 84 }, { "epoch": 0.21635220125786164, "grad_norm": 0.11344680190086365, "learning_rate": 4.966527790362919e-05, "loss": 0.5968, "step": 86 }, { "epoch": 0.22138364779874214, "grad_norm": 0.11833988130092621, "learning_rate": 4.963272967044579e-05, "loss": 0.6306, "step": 88 }, { "epoch": 0.22641509433962265, "grad_norm": 0.11473698914051056, "learning_rate": 4.959868449327119e-05, "loss": 0.6152, "step": 90 }, { "epoch": 0.23144654088050315, "grad_norm": 0.10996667295694351, "learning_rate": 4.9563144674746046e-05, "loss": 0.595, "step": 92 }, { "epoch": 0.23647798742138365, "grad_norm": 0.11677283048629761, "learning_rate": 4.952611261860089e-05, "loss": 0.5967, "step": 94 }, { "epoch": 0.24150943396226415, "grad_norm": 0.1206679567694664, "learning_rate": 4.9487590829493514e-05, "loss": 0.604, "step": 96 }, { "epoch": 0.24654088050314465, "grad_norm": 0.12233300507068634, "learning_rate": 4.944758191283959e-05, "loss": 0.6043, "step": 98 }, { "epoch": 0.25157232704402516, "grad_norm": 0.11701351404190063, "learning_rate": 4.940608857463644e-05, "loss": 0.6, "step": 100 }, { "epoch": 0.25157232704402516, "eval_loss": 0.6065506339073181, "eval_runtime": 422.5948, "eval_samples_per_second": 25.334, "eval_steps_per_second": 0.199, "step": 100 }, { "epoch": 0.25660377358490566, "grad_norm": 0.1179809644818306, "learning_rate": 4.9363113621280036e-05, "loss": 0.6406, "step": 102 }, { "epoch": 0.26163522012578616, "grad_norm": 0.12534746527671814, "learning_rate": 4.931865995937519e-05, "loss": 0.6227, "step": 104 }, { "epoch": 0.26666666666666666, "grad_norm": 0.12294790893793106, "learning_rate": 4.927273059553892e-05, "loss": 0.5957, "step": 106 }, { "epoch": 0.27169811320754716, "grad_norm": 0.12756744027137756, "learning_rate": 4.9225328636197144e-05, "loss": 0.6226, "step": 108 }, { "epoch": 0.27672955974842767, "grad_norm": 0.12662391364574432, "learning_rate": 4.9176457287374584e-05, "loss": 0.5899, "step": 110 }, { "epoch": 0.28176100628930817, "grad_norm": 0.12516862154006958, "learning_rate": 4.912611985447789e-05, "loss": 0.6238, "step": 112 }, { "epoch": 0.28679245283018867, "grad_norm": 0.116559699177742, "learning_rate": 4.907431974207211e-05, "loss": 0.6112, "step": 114 }, { "epoch": 0.2918238993710692, "grad_norm": 0.11833405494689941, "learning_rate": 4.90210604536504e-05, "loss": 0.6087, "step": 116 }, { "epoch": 0.2968553459119497, "grad_norm": 0.1188972070813179, "learning_rate": 4.896634559139707e-05, "loss": 0.594, "step": 118 }, { "epoch": 0.3018867924528302, "grad_norm": 0.11520268023014069, "learning_rate": 4.891017885594399e-05, "loss": 0.6059, "step": 120 }, { "epoch": 0.3018867924528302, "eval_loss": 0.6016330718994141, "eval_runtime": 415.9012, "eval_samples_per_second": 25.742, "eval_steps_per_second": 0.202, "step": 120 }, { "epoch": 0.3069182389937107, "grad_norm": 0.11097536981105804, "learning_rate": 4.885256404612022e-05, "loss": 0.5963, "step": 122 }, { "epoch": 0.3119496855345912, "grad_norm": 0.1210559606552124, "learning_rate": 4.8793505058695155e-05, "loss": 0.6367, "step": 124 }, { "epoch": 0.3169811320754717, "grad_norm": 0.12011069059371948, "learning_rate": 4.8733005888114915e-05, "loss": 0.5889, "step": 126 }, { "epoch": 0.3220125786163522, "grad_norm": 0.12133902311325073, "learning_rate": 4.867107062623223e-05, "loss": 0.5812, "step": 128 }, { "epoch": 0.3270440251572327, "grad_norm": 0.11452817916870117, "learning_rate": 4.860770346202962e-05, "loss": 0.607, "step": 130 }, { "epoch": 0.3320754716981132, "grad_norm": 0.1322205364704132, "learning_rate": 4.854290868133614e-05, "loss": 0.5789, "step": 132 }, { "epoch": 0.3371069182389937, "grad_norm": 0.1177845150232315, "learning_rate": 4.847669066653746e-05, "loss": 0.5834, "step": 134 }, { "epoch": 0.3421383647798742, "grad_norm": 0.12132906913757324, "learning_rate": 4.840905389627951e-05, "loss": 0.6178, "step": 136 }, { "epoch": 0.3471698113207547, "grad_norm": 0.11665117740631104, "learning_rate": 4.834000294516552e-05, "loss": 0.5999, "step": 138 }, { "epoch": 0.3522012578616352, "grad_norm": 0.12204349786043167, "learning_rate": 4.8269542483446654e-05, "loss": 0.602, "step": 140 }, { "epoch": 0.3522012578616352, "eval_loss": 0.5978492498397827, "eval_runtime": 462.6047, "eval_samples_per_second": 23.143, "eval_steps_per_second": 0.182, "step": 140 }, { "epoch": 0.3572327044025157, "grad_norm": 0.1177980899810791, "learning_rate": 4.819767727670612e-05, "loss": 0.6035, "step": 142 }, { "epoch": 0.3622641509433962, "grad_norm": 0.12166167795658112, "learning_rate": 4.812441218553683e-05, "loss": 0.5909, "step": 144 }, { "epoch": 0.3672955974842767, "grad_norm": 0.13141223788261414, "learning_rate": 4.804975216521272e-05, "loss": 0.5985, "step": 146 }, { "epoch": 0.3723270440251572, "grad_norm": 0.12138612568378448, "learning_rate": 4.797370226535353e-05, "loss": 0.5866, "step": 148 }, { "epoch": 0.37735849056603776, "grad_norm": 0.12571364641189575, "learning_rate": 4.789626762958331e-05, "loss": 0.5789, "step": 150 }, { "epoch": 0.38238993710691827, "grad_norm": 0.1183118149638176, "learning_rate": 4.781745349518252e-05, "loss": 0.5958, "step": 152 }, { "epoch": 0.38742138364779877, "grad_norm": 0.11182838678359985, "learning_rate": 4.7737265192733815e-05, "loss": 0.5768, "step": 154 }, { "epoch": 0.39245283018867927, "grad_norm": 0.12711752951145172, "learning_rate": 4.765570814576153e-05, "loss": 0.5951, "step": 156 }, { "epoch": 0.39748427672955977, "grad_norm": 0.12628626823425293, "learning_rate": 4.757278787036479e-05, "loss": 0.5907, "step": 158 }, { "epoch": 0.4025157232704403, "grad_norm": 0.11637621372938156, "learning_rate": 4.748850997484452e-05, "loss": 0.6115, "step": 160 }, { "epoch": 0.4025157232704403, "eval_loss": 0.5944415926933289, "eval_runtime": 416.2458, "eval_samples_per_second": 25.72, "eval_steps_per_second": 0.202, "step": 160 }, { "epoch": 0.4075471698113208, "grad_norm": 0.11336452513933182, "learning_rate": 4.7402880159324084e-05, "loss": 0.5685, "step": 162 }, { "epoch": 0.4125786163522013, "grad_norm": 0.12172479182481766, "learning_rate": 4.7315904215363734e-05, "loss": 0.5765, "step": 164 }, { "epoch": 0.4176100628930818, "grad_norm": 0.11980731040239334, "learning_rate": 4.722758802556896e-05, "loss": 0.5948, "step": 166 }, { "epoch": 0.4226415094339623, "grad_norm": 0.11705721169710159, "learning_rate": 4.7137937563192555e-05, "loss": 0.5749, "step": 168 }, { "epoch": 0.4276729559748428, "grad_norm": 0.1204642727971077, "learning_rate": 4.704695889173066e-05, "loss": 0.6069, "step": 170 }, { "epoch": 0.4327044025157233, "grad_norm": 0.11957567930221558, "learning_rate": 4.695465816451266e-05, "loss": 0.5724, "step": 172 }, { "epoch": 0.4377358490566038, "grad_norm": 0.12370068579912186, "learning_rate": 4.686104162428497e-05, "loss": 0.6164, "step": 174 }, { "epoch": 0.4427672955974843, "grad_norm": 0.14216069877147675, "learning_rate": 4.676611560278884e-05, "loss": 0.6018, "step": 176 }, { "epoch": 0.4477987421383648, "grad_norm": 0.12077496945858002, "learning_rate": 4.66698865203321e-05, "loss": 0.5919, "step": 178 }, { "epoch": 0.4528301886792453, "grad_norm": 0.12382370233535767, "learning_rate": 4.6572360885354905e-05, "loss": 0.6304, "step": 180 }, { "epoch": 0.4528301886792453, "eval_loss": 0.5917236804962158, "eval_runtime": 416.6125, "eval_samples_per_second": 25.698, "eval_steps_per_second": 0.202, "step": 180 }, { "epoch": 0.4578616352201258, "grad_norm": 0.11310654878616333, "learning_rate": 4.647354529398957e-05, "loss": 0.5984, "step": 182 }, { "epoch": 0.4628930817610063, "grad_norm": 0.11125874519348145, "learning_rate": 4.637344642961442e-05, "loss": 0.5993, "step": 184 }, { "epoch": 0.4679245283018868, "grad_norm": 0.10250482708215714, "learning_rate": 4.627207106240176e-05, "loss": 0.5958, "step": 186 }, { "epoch": 0.4729559748427673, "grad_norm": 0.10660769045352936, "learning_rate": 4.6169426048859994e-05, "loss": 0.5602, "step": 188 }, { "epoch": 0.4779874213836478, "grad_norm": 0.10431323200464249, "learning_rate": 4.606551833136985e-05, "loss": 0.5903, "step": 190 }, { "epoch": 0.4830188679245283, "grad_norm": 0.11766020953655243, "learning_rate": 4.596035493771488e-05, "loss": 0.6004, "step": 192 }, { "epoch": 0.4880503144654088, "grad_norm": 0.1166047751903534, "learning_rate": 4.585394298060611e-05, "loss": 0.5977, "step": 194 }, { "epoch": 0.4930817610062893, "grad_norm": 0.1147083193063736, "learning_rate": 4.574628965720097e-05, "loss": 0.5897, "step": 196 }, { "epoch": 0.4981132075471698, "grad_norm": 0.10733990371227264, "learning_rate": 4.5637402248616506e-05, "loss": 0.5978, "step": 198 }, { "epoch": 0.5031446540880503, "grad_norm": 0.10964643210172653, "learning_rate": 4.552728811943696e-05, "loss": 0.602, "step": 200 }, { "epoch": 0.5031446540880503, "eval_loss": 0.5892359018325806, "eval_runtime": 416.8902, "eval_samples_per_second": 25.681, "eval_steps_per_second": 0.201, "step": 200 }, { "epoch": 0.5081761006289308, "grad_norm": 0.11032500118017197, "learning_rate": 4.54717733587572e-05, "loss": 0.5817, "step": 202 }, { "epoch": 0.5132075471698113, "grad_norm": 0.1092258021235466, "learning_rate": 4.5359833138637734e-05, "loss": 0.5982, "step": 204 }, { "epoch": 0.5182389937106918, "grad_norm": 0.10684628039598465, "learning_rate": 4.524668497127006e-05, "loss": 0.5923, "step": 206 }, { "epoch": 0.5232704402515723, "grad_norm": 0.11228887736797333, "learning_rate": 4.513233650941422e-05, "loss": 0.5742, "step": 208 }, { "epoch": 0.5283018867924528, "grad_norm": 0.1152106523513794, "learning_rate": 4.501679548701201e-05, "loss": 0.5955, "step": 210 }, { "epoch": 0.5333333333333333, "grad_norm": 0.11588042229413986, "learning_rate": 4.490006971866385e-05, "loss": 0.5936, "step": 212 }, { "epoch": 0.5383647798742138, "grad_norm": 0.11394830793142319, "learning_rate": 4.478216709910035e-05, "loss": 0.5937, "step": 214 }, { "epoch": 0.5433962264150943, "grad_norm": 0.11031538993120193, "learning_rate": 4.466309560264822e-05, "loss": 0.5973, "step": 216 }, { "epoch": 0.5484276729559748, "grad_norm": 0.11161042749881744, "learning_rate": 4.4542863282691014e-05, "loss": 0.5701, "step": 218 }, { "epoch": 0.5534591194968553, "grad_norm": 0.11783146113157272, "learning_rate": 4.4421478271124426e-05, "loss": 0.603, "step": 220 }, { "epoch": 0.5534591194968553, "eval_loss": 0.5872239470481873, "eval_runtime": 412.7815, "eval_samples_per_second": 25.936, "eval_steps_per_second": 0.203, "step": 220 }, { "epoch": 0.5584905660377358, "grad_norm": 0.10747622698545456, "learning_rate": 4.429894877780627e-05, "loss": 0.6089, "step": 222 }, { "epoch": 0.5635220125786163, "grad_norm": 0.1075616180896759, "learning_rate": 4.4175283090001225e-05, "loss": 0.6042, "step": 224 }, { "epoch": 0.5685534591194968, "grad_norm": 0.11763885617256165, "learning_rate": 4.4050489571820306e-05, "loss": 0.5854, "step": 226 }, { "epoch": 0.5735849056603773, "grad_norm": 0.1110839769244194, "learning_rate": 4.392457666365519e-05, "loss": 0.5731, "step": 228 }, { "epoch": 0.5786163522012578, "grad_norm": 0.1211901530623436, "learning_rate": 4.379755288160733e-05, "loss": 0.5571, "step": 230 }, { "epoch": 0.5836477987421383, "grad_norm": 0.10990186035633087, "learning_rate": 4.3669426816911985e-05, "loss": 0.5919, "step": 232 }, { "epoch": 0.5886792452830188, "grad_norm": 0.1129634901881218, "learning_rate": 4.354020713535711e-05, "loss": 0.5853, "step": 234 }, { "epoch": 0.5937106918238994, "grad_norm": 0.11691266298294067, "learning_rate": 4.340990257669732e-05, "loss": 0.5878, "step": 236 }, { "epoch": 0.5987421383647799, "grad_norm": 0.11645273119211197, "learning_rate": 4.327852195406271e-05, "loss": 0.5946, "step": 238 }, { "epoch": 0.6037735849056604, "grad_norm": 0.1155095249414444, "learning_rate": 4.314607415336281e-05, "loss": 0.6004, "step": 240 }, { "epoch": 0.6037735849056604, "eval_loss": 0.5851526856422424, "eval_runtime": 413.8117, "eval_samples_per_second": 25.872, "eval_steps_per_second": 0.203, "step": 240 }, { "epoch": 0.6088050314465409, "grad_norm": 0.11004958301782608, "learning_rate": 4.301256813268559e-05, "loss": 0.5846, "step": 242 }, { "epoch": 0.6138364779874214, "grad_norm": 0.10904593765735626, "learning_rate": 4.287801292169159e-05, "loss": 0.5871, "step": 244 }, { "epoch": 0.6188679245283019, "grad_norm": 0.11857830733060837, "learning_rate": 4.274241762100315e-05, "loss": 0.5826, "step": 246 }, { "epoch": 0.6238993710691824, "grad_norm": 0.11342642456293106, "learning_rate": 4.260579140158898e-05, "loss": 0.5807, "step": 248 }, { "epoch": 0.6289308176100629, "grad_norm": 0.10923154652118683, "learning_rate": 4.246814350414377e-05, "loss": 0.5732, "step": 250 }, { "epoch": 0.6339622641509434, "grad_norm": 0.11175252497196198, "learning_rate": 4.2329483238463304e-05, "loss": 0.5649, "step": 252 }, { "epoch": 0.6389937106918239, "grad_norm": 0.10550232976675034, "learning_rate": 4.218981998281471e-05, "loss": 0.5853, "step": 254 }, { "epoch": 0.6440251572327044, "grad_norm": 0.10999017208814621, "learning_rate": 4.204916318330225e-05, "loss": 0.5864, "step": 256 }, { "epoch": 0.6490566037735849, "grad_norm": 0.13525208830833435, "learning_rate": 4.190752235322832e-05, "loss": 0.5842, "step": 258 }, { "epoch": 0.6540880503144654, "grad_norm": 0.11547064781188965, "learning_rate": 4.176490707245011e-05, "loss": 0.5891, "step": 260 }, { "epoch": 0.6540880503144654, "eval_loss": 0.5836161971092224, "eval_runtime": 414.2177, "eval_samples_per_second": 25.846, "eval_steps_per_second": 0.203, "step": 260 }, { "epoch": 0.6591194968553459, "grad_norm": 0.10233239829540253, "learning_rate": 4.162132698673167e-05, "loss": 0.5708, "step": 262 }, { "epoch": 0.6641509433962264, "grad_norm": 0.11854418367147446, "learning_rate": 4.1476791807091445e-05, "loss": 0.6074, "step": 264 }, { "epoch": 0.6691823899371069, "grad_norm": 0.13011477887630463, "learning_rate": 4.133131130914555e-05, "loss": 0.597, "step": 266 }, { "epoch": 0.6742138364779874, "grad_norm": 0.11256586760282516, "learning_rate": 4.118489533244655e-05, "loss": 0.5895, "step": 268 }, { "epoch": 0.6792452830188679, "grad_norm": 0.10651887208223343, "learning_rate": 4.1037553779818016e-05, "loss": 0.5934, "step": 270 }, { "epoch": 0.6842767295597484, "grad_norm": 0.11151424050331116, "learning_rate": 4.088929661668468e-05, "loss": 0.6028, "step": 272 }, { "epoch": 0.6893081761006289, "grad_norm": 0.10524857044219971, "learning_rate": 4.0740133870398456e-05, "loss": 0.608, "step": 274 }, { "epoch": 0.6943396226415094, "grad_norm": 0.12080366164445877, "learning_rate": 4.059007562956027e-05, "loss": 0.6175, "step": 276 }, { "epoch": 0.6993710691823899, "grad_norm": 0.1079036071896553, "learning_rate": 4.0439132043337666e-05, "loss": 0.5938, "step": 278 }, { "epoch": 0.7044025157232704, "grad_norm": 0.11142345517873764, "learning_rate": 4.028731332077843e-05, "loss": 0.5752, "step": 280 }, { "epoch": 0.7044025157232704, "eval_loss": 0.5814208984375, "eval_runtime": 413.672, "eval_samples_per_second": 25.88, "eval_steps_per_second": 0.203, "step": 280 }, { "epoch": 0.7094339622641509, "grad_norm": 0.11195844411849976, "learning_rate": 4.0134629730120045e-05, "loss": 0.583, "step": 282 }, { "epoch": 0.7144654088050314, "grad_norm": 0.10488727688789368, "learning_rate": 3.9981091598095213e-05, "loss": 0.5593, "step": 284 }, { "epoch": 0.7194968553459119, "grad_norm": 0.10145172476768494, "learning_rate": 3.9826709309233454e-05, "loss": 0.5839, "step": 286 }, { "epoch": 0.7245283018867924, "grad_norm": 0.1036810651421547, "learning_rate": 3.967149330515867e-05, "loss": 0.5796, "step": 288 }, { "epoch": 0.7295597484276729, "grad_norm": 0.1167021244764328, "learning_rate": 3.951545408388301e-05, "loss": 0.6006, "step": 290 }, { "epoch": 0.7345911949685534, "grad_norm": 0.10975436121225357, "learning_rate": 3.935860219909679e-05, "loss": 0.5802, "step": 292 }, { "epoch": 0.7396226415094339, "grad_norm": 0.11112164705991745, "learning_rate": 3.920094825945468e-05, "loss": 0.5851, "step": 294 }, { "epoch": 0.7446540880503144, "grad_norm": 0.10697871446609497, "learning_rate": 3.904250292785825e-05, "loss": 0.5855, "step": 296 }, { "epoch": 0.7496855345911949, "grad_norm": 0.10061946511268616, "learning_rate": 3.8883276920734736e-05, "loss": 0.5941, "step": 298 }, { "epoch": 0.7547169811320755, "grad_norm": 0.102944515645504, "learning_rate": 3.8723281007312256e-05, "loss": 0.5732, "step": 300 }, { "epoch": 0.7547169811320755, "eval_loss": 0.5801501870155334, "eval_runtime": 412.1928, "eval_samples_per_second": 25.973, "eval_steps_per_second": 0.204, "step": 300 }, { "epoch": 0.759748427672956, "grad_norm": 0.10275246948003769, "learning_rate": 3.856252600889143e-05, "loss": 0.5803, "step": 302 }, { "epoch": 0.7647798742138365, "grad_norm": 0.10686468333005905, "learning_rate": 3.840102279811345e-05, "loss": 0.585, "step": 304 }, { "epoch": 0.769811320754717, "grad_norm": 0.11284485459327698, "learning_rate": 3.82387822982248e-05, "loss": 0.5937, "step": 306 }, { "epoch": 0.7748427672955975, "grad_norm": 0.10271965712308884, "learning_rate": 3.807581548233837e-05, "loss": 0.5777, "step": 308 }, { "epoch": 0.779874213836478, "grad_norm": 0.10338881611824036, "learning_rate": 3.791213337269134e-05, "loss": 0.5888, "step": 310 }, { "epoch": 0.7849056603773585, "grad_norm": 0.11098296195268631, "learning_rate": 3.7747747039899676e-05, "loss": 0.5764, "step": 312 }, { "epoch": 0.789937106918239, "grad_norm": 0.10431266576051712, "learning_rate": 3.758266760220937e-05, "loss": 0.5985, "step": 314 }, { "epoch": 0.7949685534591195, "grad_norm": 0.10191661864519119, "learning_rate": 3.741690622474449e-05, "loss": 0.5626, "step": 316 }, { "epoch": 0.8, "grad_norm": 0.11264406889677048, "learning_rate": 3.7250474118751974e-05, "loss": 0.6094, "step": 318 }, { "epoch": 0.8050314465408805, "grad_norm": 0.10798995941877365, "learning_rate": 3.708338254084339e-05, "loss": 0.5714, "step": 320 }, { "epoch": 0.8050314465408805, "eval_loss": 0.5783895254135132, "eval_runtime": 411.3396, "eval_samples_per_second": 26.027, "eval_steps_per_second": 0.204, "step": 320 }, { "epoch": 0.810062893081761, "grad_norm": 0.10546964406967163, "learning_rate": 3.69156427922336e-05, "loss": 0.5682, "step": 322 }, { "epoch": 0.8150943396226416, "grad_norm": 0.11086619645357132, "learning_rate": 3.6747266217976414e-05, "loss": 0.5682, "step": 324 }, { "epoch": 0.820125786163522, "grad_norm": 0.11166223883628845, "learning_rate": 3.6578264206197245e-05, "loss": 0.5798, "step": 326 }, { "epoch": 0.8251572327044026, "grad_norm": 0.1012871265411377, "learning_rate": 3.6408648187322854e-05, "loss": 0.5694, "step": 328 }, { "epoch": 0.8301886792452831, "grad_norm": 0.11129719018936157, "learning_rate": 3.623842963330832e-05, "loss": 0.5827, "step": 330 }, { "epoch": 0.8352201257861636, "grad_norm": 0.10445208847522736, "learning_rate": 3.6067620056861086e-05, "loss": 0.5684, "step": 332 }, { "epoch": 0.8402515723270441, "grad_norm": 0.09836594015359879, "learning_rate": 3.589623101066232e-05, "loss": 0.5755, "step": 334 }, { "epoch": 0.8452830188679246, "grad_norm": 0.10354923456907272, "learning_rate": 3.572427408658552e-05, "loss": 0.5782, "step": 336 }, { "epoch": 0.8503144654088051, "grad_norm": 0.10813874751329422, "learning_rate": 3.5551760914912546e-05, "loss": 0.5939, "step": 338 }, { "epoch": 0.8553459119496856, "grad_norm": 0.10268381237983704, "learning_rate": 3.537870316354699e-05, "loss": 0.5697, "step": 340 }, { "epoch": 0.8553459119496856, "eval_loss": 0.5766698122024536, "eval_runtime": 412.4072, "eval_samples_per_second": 25.96, "eval_steps_per_second": 0.204, "step": 340 }, { "epoch": 0.8603773584905661, "grad_norm": 0.10261879861354828, "learning_rate": 3.5205112537224974e-05, "loss": 0.5694, "step": 342 }, { "epoch": 0.8654088050314466, "grad_norm": 0.10596223920583725, "learning_rate": 3.50310007767236e-05, "loss": 0.6091, "step": 344 }, { "epoch": 0.8704402515723271, "grad_norm": 0.09846257418394089, "learning_rate": 3.485637965806674e-05, "loss": 0.5808, "step": 346 }, { "epoch": 0.8754716981132076, "grad_norm": 0.10605087131261826, "learning_rate": 3.4681260991728685e-05, "loss": 0.5737, "step": 348 }, { "epoch": 0.8805031446540881, "grad_norm": 0.11082804203033447, "learning_rate": 3.450565662183527e-05, "loss": 0.5826, "step": 350 }, { "epoch": 0.8855345911949686, "grad_norm": 0.10169358551502228, "learning_rate": 3.432957842536282e-05, "loss": 0.5724, "step": 352 }, { "epoch": 0.8905660377358491, "grad_norm": 0.10837408900260925, "learning_rate": 3.415303831133485e-05, "loss": 0.5736, "step": 354 }, { "epoch": 0.8955974842767296, "grad_norm": 0.11302381008863449, "learning_rate": 3.3976048220016604e-05, "loss": 0.589, "step": 356 }, { "epoch": 0.9006289308176101, "grad_norm": 0.10759381949901581, "learning_rate": 3.37986201221075e-05, "loss": 0.5769, "step": 358 }, { "epoch": 0.9056603773584906, "grad_norm": 0.11644274741411209, "learning_rate": 3.362076601793142e-05, "loss": 0.5603, "step": 360 }, { "epoch": 0.9056603773584906, "eval_loss": 0.5751825571060181, "eval_runtime": 411.2176, "eval_samples_per_second": 26.035, "eval_steps_per_second": 0.204, "step": 360 }, { "epoch": 0.9106918238993711, "grad_norm": 0.10910682380199432, "learning_rate": 3.344249793662514e-05, "loss": 0.5632, "step": 362 }, { "epoch": 0.9157232704402516, "grad_norm": 0.10915949195623398, "learning_rate": 3.326382793532476e-05, "loss": 0.5903, "step": 364 }, { "epoch": 0.9207547169811321, "grad_norm": 0.11586213856935501, "learning_rate": 3.308476809835013e-05, "loss": 0.594, "step": 366 }, { "epoch": 0.9257861635220126, "grad_norm": 0.11010655015707016, "learning_rate": 3.290533053638759e-05, "loss": 0.5723, "step": 368 }, { "epoch": 0.9308176100628931, "grad_norm": 0.11362364888191223, "learning_rate": 3.272552738567086e-05, "loss": 0.5825, "step": 370 }, { "epoch": 0.9358490566037736, "grad_norm": 0.10686285048723221, "learning_rate": 3.254537080716021e-05, "loss": 0.586, "step": 372 }, { "epoch": 0.9408805031446541, "grad_norm": 0.11016613245010376, "learning_rate": 3.236487298571996e-05, "loss": 0.5813, "step": 374 }, { "epoch": 0.9459119496855346, "grad_norm": 0.10999295115470886, "learning_rate": 3.2184046129294295e-05, "loss": 0.5716, "step": 376 }, { "epoch": 0.9509433962264151, "grad_norm": 0.1100740134716034, "learning_rate": 3.20029024680817e-05, "loss": 0.5424, "step": 378 }, { "epoch": 0.9559748427672956, "grad_norm": 0.105403371155262, "learning_rate": 3.1821454253707646e-05, "loss": 0.5963, "step": 380 }, { "epoch": 0.9559748427672956, "eval_loss": 0.5737633109092712, "eval_runtime": 412.702, "eval_samples_per_second": 25.941, "eval_steps_per_second": 0.204, "step": 380 }, { "epoch": 0.9610062893081761, "grad_norm": 0.10489460825920105, "learning_rate": 3.1639713758396055e-05, "loss": 0.567, "step": 382 }, { "epoch": 0.9660377358490566, "grad_norm": 0.10349351167678833, "learning_rate": 3.145769327413922e-05, "loss": 0.5721, "step": 384 }, { "epoch": 0.9710691823899371, "grad_norm": 0.10395889729261398, "learning_rate": 3.127540511186643e-05, "loss": 0.569, "step": 386 }, { "epoch": 0.9761006289308176, "grad_norm": 0.10146255046129227, "learning_rate": 3.109286160061136e-05, "loss": 0.5699, "step": 388 }, { "epoch": 0.9811320754716981, "grad_norm": 0.10547154396772385, "learning_rate": 3.091007508667814e-05, "loss": 0.5686, "step": 390 }, { "epoch": 0.9861635220125786, "grad_norm": 0.10124453902244568, "learning_rate": 3.072705793280642e-05, "loss": 0.5983, "step": 392 }, { "epoch": 0.9911949685534591, "grad_norm": 0.10909611731767654, "learning_rate": 3.054382251733507e-05, "loss": 0.5881, "step": 394 }, { "epoch": 0.9962264150943396, "grad_norm": 0.10168620944023132, "learning_rate": 3.0360381233365105e-05, "loss": 0.5978, "step": 396 }, { "epoch": 1.0012578616352201, "grad_norm": 0.1318611353635788, "learning_rate": 3.0176746487921404e-05, "loss": 0.5694, "step": 398 }, { "epoch": 1.0062893081761006, "grad_norm": 0.12533968687057495, "learning_rate": 2.9992930701113586e-05, "loss": 0.5082, "step": 400 }, { "epoch": 1.0062893081761006, "eval_loss": 0.5776596665382385, "eval_runtime": 410.8135, "eval_samples_per_second": 26.06, "eval_steps_per_second": 0.204, "step": 400 }, { "epoch": 1.0113207547169811, "grad_norm": 0.12798835337162018, "learning_rate": 2.9808946305295988e-05, "loss": 0.4912, "step": 402 }, { "epoch": 1.0163522012578616, "grad_norm": 0.13782812654972076, "learning_rate": 2.962480574422678e-05, "loss": 0.5288, "step": 404 }, { "epoch": 1.0213836477987421, "grad_norm": 0.11341769248247147, "learning_rate": 2.9440521472226368e-05, "loss": 0.5032, "step": 406 }, { "epoch": 1.0264150943396226, "grad_norm": 0.1318856179714203, "learning_rate": 2.9256105953334982e-05, "loss": 0.5038, "step": 408 }, { "epoch": 1.0314465408805031, "grad_norm": 0.11981856822967529, "learning_rate": 2.9071571660469775e-05, "loss": 0.4965, "step": 410 }, { "epoch": 1.0364779874213836, "grad_norm": 0.11475057154893875, "learning_rate": 2.888693107458111e-05, "loss": 0.4912, "step": 412 }, { "epoch": 1.0415094339622641, "grad_norm": 0.13232818245887756, "learning_rate": 2.8702196683808496e-05, "loss": 0.5065, "step": 414 }, { "epoch": 1.0465408805031446, "grad_norm": 0.1348014920949936, "learning_rate": 2.8517380982635906e-05, "loss": 0.5293, "step": 416 }, { "epoch": 1.0515723270440251, "grad_norm": 0.11755809187889099, "learning_rate": 2.8332496471046737e-05, "loss": 0.486, "step": 418 }, { "epoch": 1.0566037735849056, "grad_norm": 0.1252366006374359, "learning_rate": 2.8147555653678353e-05, "loss": 0.4975, "step": 420 }, { "epoch": 1.0566037735849056, "eval_loss": 0.5800932049751282, "eval_runtime": 411.2332, "eval_samples_per_second": 26.034, "eval_steps_per_second": 0.204, "step": 420 }, { "epoch": 1.0616352201257861, "grad_norm": 0.11638514697551727, "learning_rate": 2.7962571038976376e-05, "loss": 0.5021, "step": 422 }, { "epoch": 1.0666666666666667, "grad_norm": 0.11196921765804291, "learning_rate": 2.777755513834865e-05, "loss": 0.5081, "step": 424 }, { "epoch": 1.0716981132075472, "grad_norm": 0.12703673541545868, "learning_rate": 2.7592520465319012e-05, "loss": 0.5186, "step": 426 }, { "epoch": 1.0767295597484277, "grad_norm": 0.12006295472383499, "learning_rate": 2.7407479534680997e-05, "loss": 0.5123, "step": 428 }, { "epoch": 1.0817610062893082, "grad_norm": 0.11968886107206345, "learning_rate": 2.722244486165136e-05, "loss": 0.5135, "step": 430 }, { "epoch": 1.0867924528301887, "grad_norm": 0.11439641565084457, "learning_rate": 2.7037428961023632e-05, "loss": 0.4964, "step": 432 }, { "epoch": 1.0918238993710692, "grad_norm": 0.12513814866542816, "learning_rate": 2.685244434632166e-05, "loss": 0.5103, "step": 434 }, { "epoch": 1.0968553459119497, "grad_norm": 0.11105407029390335, "learning_rate": 2.6667503528953275e-05, "loss": 0.4915, "step": 436 }, { "epoch": 1.1018867924528302, "grad_norm": 0.1270296275615692, "learning_rate": 2.6482619017364096e-05, "loss": 0.5197, "step": 438 }, { "epoch": 1.1069182389937107, "grad_norm": 0.12292016297578812, "learning_rate": 2.629780331619151e-05, "loss": 0.5131, "step": 440 }, { "epoch": 1.1069182389937107, "eval_loss": 0.5787190198898315, "eval_runtime": 410.8629, "eval_samples_per_second": 26.057, "eval_steps_per_second": 0.204, "step": 440 }, { "epoch": 1.1119496855345912, "grad_norm": 0.10833011567592621, "learning_rate": 2.6113068925418892e-05, "loss": 0.4747, "step": 442 }, { "epoch": 1.1169811320754717, "grad_norm": 0.11480649560689926, "learning_rate": 2.592842833953023e-05, "loss": 0.5033, "step": 444 }, { "epoch": 1.1220125786163522, "grad_norm": 0.11621647328138351, "learning_rate": 2.5743894046665013e-05, "loss": 0.5041, "step": 446 }, { "epoch": 1.1270440251572327, "grad_norm": 0.11848396062850952, "learning_rate": 2.555947852777364e-05, "loss": 0.4868, "step": 448 }, { "epoch": 1.1320754716981132, "grad_norm": 0.11532817780971527, "learning_rate": 2.537519425577322e-05, "loss": 0.485, "step": 450 }, { "epoch": 1.1371069182389937, "grad_norm": 0.11470374464988708, "learning_rate": 2.519105369470402e-05, "loss": 0.5009, "step": 452 }, { "epoch": 1.1421383647798742, "grad_norm": 0.11918400973081589, "learning_rate": 2.5007069298886416e-05, "loss": 0.492, "step": 454 }, { "epoch": 1.1471698113207547, "grad_norm": 0.11399897933006287, "learning_rate": 2.4823253512078605e-05, "loss": 0.5227, "step": 456 }, { "epoch": 1.1522012578616352, "grad_norm": 0.1135721504688263, "learning_rate": 2.4639618766634904e-05, "loss": 0.4948, "step": 458 }, { "epoch": 1.1572327044025157, "grad_norm": 0.11768464744091034, "learning_rate": 2.4456177482664932e-05, "loss": 0.5069, "step": 460 }, { "epoch": 1.1572327044025157, "eval_loss": 0.5783973932266235, "eval_runtime": 410.9352, "eval_samples_per_second": 26.053, "eval_steps_per_second": 0.204, "step": 460 }, { "epoch": 1.1622641509433962, "grad_norm": 0.11413212865591049, "learning_rate": 2.4272942067193593e-05, "loss": 0.4919, "step": 462 }, { "epoch": 1.1672955974842767, "grad_norm": 0.11099102348089218, "learning_rate": 2.4089924913321854e-05, "loss": 0.5131, "step": 464 }, { "epoch": 1.1723270440251572, "grad_norm": 0.12233982235193253, "learning_rate": 2.3907138399388656e-05, "loss": 0.5152, "step": 466 }, { "epoch": 1.1773584905660377, "grad_norm": 0.109150230884552, "learning_rate": 2.3724594888133578e-05, "loss": 0.4942, "step": 468 }, { "epoch": 1.1823899371069182, "grad_norm": 0.11547227948904037, "learning_rate": 2.354230672586079e-05, "loss": 0.5087, "step": 470 }, { "epoch": 1.1874213836477987, "grad_norm": 0.11096334457397461, "learning_rate": 2.3360286241603947e-05, "loss": 0.528, "step": 472 }, { "epoch": 1.1924528301886792, "grad_norm": 0.11583118885755539, "learning_rate": 2.3178545746292363e-05, "loss": 0.5053, "step": 474 }, { "epoch": 1.1974842767295597, "grad_norm": 0.12394651025533676, "learning_rate": 2.299709753191831e-05, "loss": 0.519, "step": 476 }, { "epoch": 1.2025157232704402, "grad_norm": 0.10962869971990585, "learning_rate": 2.281595387070571e-05, "loss": 0.5119, "step": 478 }, { "epoch": 1.2075471698113207, "grad_norm": 0.11548969149589539, "learning_rate": 2.263512701428005e-05, "loss": 0.5053, "step": 480 }, { "epoch": 1.2075471698113207, "eval_loss": 0.5777500867843628, "eval_runtime": 410.7907, "eval_samples_per_second": 26.062, "eval_steps_per_second": 0.204, "step": 480 }, { "epoch": 1.2125786163522012, "grad_norm": 0.11964194476604462, "learning_rate": 2.2454629192839782e-05, "loss": 0.5067, "step": 482 }, { "epoch": 1.2176100628930817, "grad_norm": 0.11280932277441025, "learning_rate": 2.2274472614329146e-05, "loss": 0.5097, "step": 484 }, { "epoch": 1.2226415094339622, "grad_norm": 0.11903873831033707, "learning_rate": 2.2094669463612417e-05, "loss": 0.4973, "step": 486 }, { "epoch": 1.2276729559748427, "grad_norm": 0.11274790018796921, "learning_rate": 2.191523190164988e-05, "loss": 0.4802, "step": 488 }, { "epoch": 1.2327044025157232, "grad_norm": 0.12155181169509888, "learning_rate": 2.1736172064675242e-05, "loss": 0.5039, "step": 490 }, { "epoch": 1.2377358490566037, "grad_norm": 0.1149788573384285, "learning_rate": 2.1557502063374863e-05, "loss": 0.5018, "step": 492 }, { "epoch": 1.2427672955974842, "grad_norm": 0.11619096249341965, "learning_rate": 2.1379233982068597e-05, "loss": 0.5001, "step": 494 }, { "epoch": 1.2477987421383647, "grad_norm": 0.11911306530237198, "learning_rate": 2.120137987789252e-05, "loss": 0.5257, "step": 496 }, { "epoch": 1.2528301886792452, "grad_norm": 0.11577396094799042, "learning_rate": 2.1023951779983408e-05, "loss": 0.5156, "step": 498 }, { "epoch": 1.2578616352201257, "grad_norm": 0.11911614239215851, "learning_rate": 2.0846961688665158e-05, "loss": 0.5189, "step": 500 }, { "epoch": 1.2578616352201257, "eval_loss": 0.5771186947822571, "eval_runtime": 410.8096, "eval_samples_per_second": 26.061, "eval_steps_per_second": 0.204, "step": 500 }, { "epoch": 1.2628930817610062, "grad_norm": 0.12026111036539078, "learning_rate": 2.0670421574637182e-05, "loss": 0.4965, "step": 502 }, { "epoch": 1.2679245283018867, "grad_norm": 0.11090611666440964, "learning_rate": 2.0494343378164736e-05, "loss": 0.4924, "step": 504 }, { "epoch": 1.2729559748427672, "grad_norm": 0.11498820036649704, "learning_rate": 2.0318739008271327e-05, "loss": 0.5069, "step": 506 }, { "epoch": 1.2779874213836477, "grad_norm": 0.11681642383337021, "learning_rate": 2.014362034193326e-05, "loss": 0.5208, "step": 508 }, { "epoch": 1.2830188679245282, "grad_norm": 0.11289618164300919, "learning_rate": 1.9968999223276406e-05, "loss": 0.497, "step": 510 }, { "epoch": 1.2880503144654087, "grad_norm": 0.11700893938541412, "learning_rate": 1.979488746277503e-05, "loss": 0.4872, "step": 512 }, { "epoch": 1.2930817610062892, "grad_norm": 0.11669134348630905, "learning_rate": 1.9621296836453025e-05, "loss": 0.5117, "step": 514 }, { "epoch": 1.2981132075471697, "grad_norm": 0.11578242480754852, "learning_rate": 1.944823908508745e-05, "loss": 0.5046, "step": 516 }, { "epoch": 1.3031446540880502, "grad_norm": 0.11336881667375565, "learning_rate": 1.9275725913414483e-05, "loss": 0.4828, "step": 518 }, { "epoch": 1.3081761006289307, "grad_norm": 0.1218356043100357, "learning_rate": 1.910376898933769e-05, "loss": 0.5173, "step": 520 }, { "epoch": 1.3081761006289307, "eval_loss": 0.5762000679969788, "eval_runtime": 410.9091, "eval_samples_per_second": 26.054, "eval_steps_per_second": 0.204, "step": 520 }, { "epoch": 1.3132075471698113, "grad_norm": 0.11644181609153748, "learning_rate": 1.8932379943138916e-05, "loss": 0.5002, "step": 522 }, { "epoch": 1.3182389937106918, "grad_norm": 0.11215106397867203, "learning_rate": 1.8761570366691684e-05, "loss": 0.4808, "step": 524 }, { "epoch": 1.3232704402515723, "grad_norm": 0.11506900936365128, "learning_rate": 1.8591351812677144e-05, "loss": 0.4915, "step": 526 }, { "epoch": 1.3283018867924528, "grad_norm": 0.11646901071071625, "learning_rate": 1.8421735793802763e-05, "loss": 0.5067, "step": 528 }, { "epoch": 1.3333333333333333, "grad_norm": 0.1217241957783699, "learning_rate": 1.8252733782023584e-05, "loss": 0.5105, "step": 530 }, { "epoch": 1.3383647798742138, "grad_norm": 0.12330880761146545, "learning_rate": 1.8084357207766406e-05, "loss": 0.5107, "step": 532 }, { "epoch": 1.3433962264150943, "grad_norm": 0.10948923975229263, "learning_rate": 1.7916617459156615e-05, "loss": 0.4929, "step": 534 }, { "epoch": 1.3484276729559748, "grad_norm": 0.11415420472621918, "learning_rate": 1.7749525881248035e-05, "loss": 0.5123, "step": 536 }, { "epoch": 1.3534591194968553, "grad_norm": 0.11750365048646927, "learning_rate": 1.7583093775255516e-05, "loss": 0.5082, "step": 538 }, { "epoch": 1.3584905660377358, "grad_norm": 0.11664094030857086, "learning_rate": 1.741733239779063e-05, "loss": 0.5048, "step": 540 }, { "epoch": 1.3584905660377358, "eval_loss": 0.5755621194839478, "eval_runtime": 410.7838, "eval_samples_per_second": 26.062, "eval_steps_per_second": 0.204, "step": 540 }, { "epoch": 1.3635220125786163, "grad_norm": 0.11655986309051514, "learning_rate": 1.725225296010034e-05, "loss": 0.4923, "step": 542 }, { "epoch": 1.3685534591194968, "grad_norm": 0.11432712525129318, "learning_rate": 1.7087866627308664e-05, "loss": 0.4976, "step": 544 }, { "epoch": 1.3735849056603773, "grad_norm": 0.11400482058525085, "learning_rate": 1.692418451766163e-05, "loss": 0.5026, "step": 546 }, { "epoch": 1.378616352201258, "grad_norm": 0.11588790267705917, "learning_rate": 1.6761217701775207e-05, "loss": 0.5031, "step": 548 }, { "epoch": 1.3836477987421385, "grad_norm": 0.11426915228366852, "learning_rate": 1.6598977201886558e-05, "loss": 0.5001, "step": 550 }, { "epoch": 1.388679245283019, "grad_norm": 0.11552328616380692, "learning_rate": 1.6437473991108585e-05, "loss": 0.4928, "step": 552 }, { "epoch": 1.3937106918238995, "grad_norm": 0.11312104761600494, "learning_rate": 1.6276718992687746e-05, "loss": 0.4977, "step": 554 }, { "epoch": 1.39874213836478, "grad_norm": 0.11197475343942642, "learning_rate": 1.6116723079265263e-05, "loss": 0.489, "step": 556 }, { "epoch": 1.4037735849056605, "grad_norm": 0.11652438342571259, "learning_rate": 1.5957497072141758e-05, "loss": 0.4971, "step": 558 }, { "epoch": 1.408805031446541, "grad_norm": 0.1163628101348877, "learning_rate": 1.579905174054533e-05, "loss": 0.4986, "step": 560 }, { "epoch": 1.408805031446541, "eval_loss": 0.5742356777191162, "eval_runtime": 410.5998, "eval_samples_per_second": 26.074, "eval_steps_per_second": 0.205, "step": 560 }, { "epoch": 1.4138364779874215, "grad_norm": 0.1128329187631607, "learning_rate": 1.5641397800903222e-05, "loss": 0.5068, "step": 562 }, { "epoch": 1.418867924528302, "grad_norm": 0.11648018658161163, "learning_rate": 1.5484545916116995e-05, "loss": 0.4958, "step": 564 }, { "epoch": 1.4238993710691825, "grad_norm": 0.1150885596871376, "learning_rate": 1.5328506694841334e-05, "loss": 0.4855, "step": 566 }, { "epoch": 1.428930817610063, "grad_norm": 0.11181043833494186, "learning_rate": 1.5173290690766553e-05, "loss": 0.5114, "step": 568 }, { "epoch": 1.4339622641509435, "grad_norm": 0.11899517476558685, "learning_rate": 1.5018908401904785e-05, "loss": 0.5048, "step": 570 }, { "epoch": 1.438993710691824, "grad_norm": 0.11897191405296326, "learning_rate": 1.4865370269879955e-05, "loss": 0.5308, "step": 572 }, { "epoch": 1.4440251572327045, "grad_norm": 0.11142674088478088, "learning_rate": 1.471268667922157e-05, "loss": 0.4958, "step": 574 }, { "epoch": 1.449056603773585, "grad_norm": 0.1150866225361824, "learning_rate": 1.4560867956662336e-05, "loss": 0.4939, "step": 576 }, { "epoch": 1.4540880503144655, "grad_norm": 0.11816877871751785, "learning_rate": 1.4409924370439737e-05, "loss": 0.4913, "step": 578 }, { "epoch": 1.459119496855346, "grad_norm": 0.11381349712610245, "learning_rate": 1.425986612960155e-05, "loss": 0.5039, "step": 580 }, { "epoch": 1.459119496855346, "eval_loss": 0.573836624622345, "eval_runtime": 410.5985, "eval_samples_per_second": 26.074, "eval_steps_per_second": 0.205, "step": 580 }, { "epoch": 1.4641509433962265, "grad_norm": 0.1094905436038971, "learning_rate": 1.4110703383315326e-05, "loss": 0.4901, "step": 582 }, { "epoch": 1.469182389937107, "grad_norm": 0.11396130174398422, "learning_rate": 1.396244622018199e-05, "loss": 0.5081, "step": 584 }, { "epoch": 1.4742138364779875, "grad_norm": 0.1160426139831543, "learning_rate": 1.3815104667553452e-05, "loss": 0.4869, "step": 586 }, { "epoch": 1.479245283018868, "grad_norm": 0.11492225527763367, "learning_rate": 1.3668688690854453e-05, "loss": 0.4888, "step": 588 }, { "epoch": 1.4842767295597485, "grad_norm": 0.11282163113355637, "learning_rate": 1.3523208192908562e-05, "loss": 0.4983, "step": 590 }, { "epoch": 1.489308176100629, "grad_norm": 0.11276757717132568, "learning_rate": 1.3378673013268336e-05, "loss": 0.517, "step": 592 }, { "epoch": 1.4943396226415095, "grad_norm": 0.11005326360464096, "learning_rate": 1.3235092927549888e-05, "loss": 0.4933, "step": 594 }, { "epoch": 1.49937106918239, "grad_norm": 0.11391846090555191, "learning_rate": 1.3092477646771686e-05, "loss": 0.5047, "step": 596 }, { "epoch": 1.5044025157232706, "grad_norm": 0.112746462225914, "learning_rate": 1.2950836816697753e-05, "loss": 0.4933, "step": 598 }, { "epoch": 1.509433962264151, "grad_norm": 0.11274772882461548, "learning_rate": 1.2810180017185286e-05, "loss": 0.4928, "step": 600 }, { "epoch": 1.509433962264151, "eval_loss": 0.5733875632286072, "eval_runtime": 410.9812, "eval_samples_per_second": 26.05, "eval_steps_per_second": 0.204, "step": 600 }, { "epoch": 1.5144654088050316, "grad_norm": 0.11344069242477417, "learning_rate": 1.2670516761536705e-05, "loss": 0.5083, "step": 602 }, { "epoch": 1.519496855345912, "grad_norm": 0.1206919476389885, "learning_rate": 1.2531856495856234e-05, "loss": 0.4931, "step": 604 }, { "epoch": 1.5245283018867926, "grad_norm": 0.11568621546030045, "learning_rate": 1.2394208598411026e-05, "loss": 0.4961, "step": 606 }, { "epoch": 1.529559748427673, "grad_norm": 0.11288320273160934, "learning_rate": 1.2257582378996846e-05, "loss": 0.493, "step": 608 }, { "epoch": 1.5345911949685536, "grad_norm": 0.11447000503540039, "learning_rate": 1.2121987078308414e-05, "loss": 0.487, "step": 610 }, { "epoch": 1.539622641509434, "grad_norm": 0.1167483702301979, "learning_rate": 1.1987431867314417e-05, "loss": 0.5078, "step": 612 }, { "epoch": 1.5446540880503146, "grad_norm": 0.11339499801397324, "learning_rate": 1.1853925846637192e-05, "loss": 0.5101, "step": 614 }, { "epoch": 1.549685534591195, "grad_norm": 0.11238376796245575, "learning_rate": 1.1721478045937298e-05, "loss": 0.5075, "step": 616 }, { "epoch": 1.5547169811320756, "grad_norm": 0.11750028282403946, "learning_rate": 1.1590097423302684e-05, "loss": 0.5223, "step": 618 }, { "epoch": 1.559748427672956, "grad_norm": 0.11243315786123276, "learning_rate": 1.1459792864642889e-05, "loss": 0.5014, "step": 620 }, { "epoch": 1.559748427672956, "eval_loss": 0.5725140571594238, "eval_runtime": 411.304, "eval_samples_per_second": 26.029, "eval_steps_per_second": 0.204, "step": 620 }, { "epoch": 1.5647798742138366, "grad_norm": 0.10880452394485474, "learning_rate": 1.1330573183088027e-05, "loss": 0.4946, "step": 622 }, { "epoch": 1.569811320754717, "grad_norm": 0.11715767532587051, "learning_rate": 1.1202447118392666e-05, "loss": 0.4934, "step": 624 }, { "epoch": 1.5748427672955976, "grad_norm": 0.1085837110877037, "learning_rate": 1.1075423336344815e-05, "loss": 0.4918, "step": 626 }, { "epoch": 1.579874213836478, "grad_norm": 0.11400571465492249, "learning_rate": 1.0949510428179703e-05, "loss": 0.4907, "step": 628 }, { "epoch": 1.5849056603773586, "grad_norm": 0.11114535480737686, "learning_rate": 1.0824716909998783e-05, "loss": 0.504, "step": 630 }, { "epoch": 1.589937106918239, "grad_norm": 0.10678807646036148, "learning_rate": 1.0701051222193734e-05, "loss": 0.4757, "step": 632 }, { "epoch": 1.5949685534591196, "grad_norm": 0.11523126810789108, "learning_rate": 1.0578521728875578e-05, "loss": 0.5019, "step": 634 }, { "epoch": 1.6, "grad_norm": 0.11389489471912384, "learning_rate": 1.0457136717308988e-05, "loss": 0.5162, "step": 636 }, { "epoch": 1.6050314465408806, "grad_norm": 0.11754269152879715, "learning_rate": 1.0336904397351794e-05, "loss": 0.4991, "step": 638 }, { "epoch": 1.610062893081761, "grad_norm": 0.11521238088607788, "learning_rate": 1.021783290089966e-05, "loss": 0.5041, "step": 640 }, { "epoch": 1.610062893081761, "eval_loss": 0.57233065366745, "eval_runtime": 417.7875, "eval_samples_per_second": 25.625, "eval_steps_per_second": 0.201, "step": 640 }, { "epoch": 1.6150943396226416, "grad_norm": 0.1117156520485878, "learning_rate": 1.009993028133615e-05, "loss": 0.4919, "step": 642 }, { "epoch": 1.620125786163522, "grad_norm": 0.11549975723028183, "learning_rate": 9.983204512988004e-06, "loss": 0.4988, "step": 644 }, { "epoch": 1.6251572327044026, "grad_norm": 0.11243242025375366, "learning_rate": 9.867663490585783e-06, "loss": 0.5128, "step": 646 }, { "epoch": 1.630188679245283, "grad_norm": 0.11129079759120941, "learning_rate": 9.753315028729948e-06, "loss": 0.4893, "step": 648 }, { "epoch": 1.6352201257861636, "grad_norm": 0.11360695213079453, "learning_rate": 9.640166861362268e-06, "loss": 0.503, "step": 650 }, { "epoch": 1.640251572327044, "grad_norm": 0.11027677357196808, "learning_rate": 9.528226641242804e-06, "loss": 0.4933, "step": 652 }, { "epoch": 1.6452830188679246, "grad_norm": 0.11328162252902985, "learning_rate": 9.417501939432257e-06, "loss": 0.4969, "step": 654 }, { "epoch": 1.650314465408805, "grad_norm": 0.111870676279068, "learning_rate": 9.308000244779918e-06, "loss": 0.5009, "step": 656 }, { "epoch": 1.6553459119496856, "grad_norm": 0.11578749120235443, "learning_rate": 9.19972896341717e-06, "loss": 0.5226, "step": 658 }, { "epoch": 1.6603773584905661, "grad_norm": 0.11840783059597015, "learning_rate": 9.09269541825658e-06, "loss": 0.4876, "step": 660 }, { "epoch": 1.6603773584905661, "eval_loss": 0.5716937184333801, "eval_runtime": 411.8982, "eval_samples_per_second": 25.992, "eval_steps_per_second": 0.204, "step": 660 }, { "epoch": 1.6654088050314466, "grad_norm": 0.10933776944875717, "learning_rate": 8.98690684849659e-06, "loss": 0.5217, "step": 662 }, { "epoch": 1.6704402515723271, "grad_norm": 0.11809239536523819, "learning_rate": 8.882370409131924e-06, "loss": 0.5182, "step": 664 }, { "epoch": 1.6754716981132076, "grad_norm": 0.1144753098487854, "learning_rate": 8.779093170469629e-06, "loss": 0.4999, "step": 666 }, { "epoch": 1.6805031446540881, "grad_norm": 0.11396916210651398, "learning_rate": 8.677082117650906e-06, "loss": 0.507, "step": 668 }, { "epoch": 1.6855345911949686, "grad_norm": 0.11068397760391235, "learning_rate": 8.576344150178653e-06, "loss": 0.5136, "step": 670 }, { "epoch": 1.6905660377358491, "grad_norm": 0.1053067147731781, "learning_rate": 8.47688608145083e-06, "loss": 0.4907, "step": 672 }, { "epoch": 1.6955974842767296, "grad_norm": 0.1102994978427887, "learning_rate": 8.378714638299628e-06, "loss": 0.4881, "step": 674 }, { "epoch": 1.7006289308176101, "grad_norm": 0.10971739888191223, "learning_rate": 8.28183646053649e-06, "loss": 0.5176, "step": 676 }, { "epoch": 1.7056603773584906, "grad_norm": 0.11169516295194626, "learning_rate": 8.186258100503058e-06, "loss": 0.5102, "step": 678 }, { "epoch": 1.7106918238993711, "grad_norm": 0.1112278550863266, "learning_rate": 8.091986022627978e-06, "loss": 0.5272, "step": 680 }, { "epoch": 1.7106918238993711, "eval_loss": 0.5712010860443115, "eval_runtime": 411.4247, "eval_samples_per_second": 26.022, "eval_steps_per_second": 0.204, "step": 680 }, { "epoch": 1.7157232704402516, "grad_norm": 0.11466188728809357, "learning_rate": 7.999026602989687e-06, "loss": 0.4974, "step": 682 }, { "epoch": 1.7207547169811321, "grad_norm": 0.12257977575063705, "learning_rate": 7.907386128885182e-06, "loss": 0.4946, "step": 684 }, { "epoch": 1.7257861635220126, "grad_norm": 0.12135323882102966, "learning_rate": 7.817070798404755e-06, "loss": 0.5374, "step": 686 }, { "epoch": 1.7308176100628931, "grad_norm": 0.11566798388957977, "learning_rate": 7.728086720012813e-06, "loss": 0.5048, "step": 688 }, { "epoch": 1.7358490566037736, "grad_norm": 0.11244137585163116, "learning_rate": 7.640439912134711e-06, "loss": 0.5169, "step": 690 }, { "epoch": 1.7408805031446541, "grad_norm": 0.1125202625989914, "learning_rate": 7.554136302749705e-06, "loss": 0.5076, "step": 692 }, { "epoch": 1.7459119496855346, "grad_norm": 0.1143079325556755, "learning_rate": 7.469181728990013e-06, "loss": 0.4961, "step": 694 }, { "epoch": 1.7509433962264151, "grad_norm": 0.11676593869924545, "learning_rate": 7.385581936746035e-06, "loss": 0.5003, "step": 696 }, { "epoch": 1.7559748427672957, "grad_norm": 0.1095028892159462, "learning_rate": 7.303342580277696e-06, "loss": 0.4755, "step": 698 }, { "epoch": 1.7610062893081762, "grad_norm": 0.11439331620931625, "learning_rate": 7.222469221832061e-06, "loss": 0.5057, "step": 700 }, { "epoch": 1.7610062893081762, "eval_loss": 0.5707039833068848, "eval_runtime": 411.7854, "eval_samples_per_second": 25.999, "eval_steps_per_second": 0.204, "step": 700 }, { "epoch": 1.7660377358490567, "grad_norm": 0.11388733237981796, "learning_rate": 7.142967331267113e-06, "loss": 0.4748, "step": 702 }, { "epoch": 1.7710691823899372, "grad_norm": 0.11583738774061203, "learning_rate": 7.064842285681781e-06, "loss": 0.494, "step": 704 }, { "epoch": 1.7761006289308177, "grad_norm": 0.11597929149866104, "learning_rate": 6.988099369052318e-06, "loss": 0.5106, "step": 706 }, { "epoch": 1.7811320754716982, "grad_norm": 0.1117326021194458, "learning_rate": 6.9127437718748465e-06, "loss": 0.4844, "step": 708 }, { "epoch": 1.7861635220125787, "grad_norm": 0.11276806890964508, "learning_rate": 6.838780590814366e-06, "loss": 0.5221, "step": 710 }, { "epoch": 1.7911949685534592, "grad_norm": 0.11557289958000183, "learning_rate": 6.7662148283599955e-06, "loss": 0.5021, "step": 712 }, { "epoch": 1.7962264150943397, "grad_norm": 0.11254438757896423, "learning_rate": 6.695051392486652e-06, "loss": 0.4999, "step": 714 }, { "epoch": 1.8012578616352202, "grad_norm": 0.11114822328090668, "learning_rate": 6.625295096323097e-06, "loss": 0.4849, "step": 716 }, { "epoch": 1.8062893081761007, "grad_norm": 0.11419788002967834, "learning_rate": 6.556950657826405e-06, "loss": 0.5227, "step": 718 }, { "epoch": 1.8113207547169812, "grad_norm": 0.11834213882684708, "learning_rate": 6.490022699462844e-06, "loss": 0.5043, "step": 720 }, { "epoch": 1.8113207547169812, "eval_loss": 0.5703166723251343, "eval_runtime": 411.34, "eval_samples_per_second": 26.027, "eval_steps_per_second": 0.204, "step": 720 }, { "epoch": 1.8163522012578617, "grad_norm": 0.11034736037254333, "learning_rate": 6.424515747895265e-06, "loss": 0.48, "step": 722 }, { "epoch": 1.8213836477987422, "grad_norm": 0.11254922300577164, "learning_rate": 6.360434233676926e-06, "loss": 0.4864, "step": 724 }, { "epoch": 1.8264150943396227, "grad_norm": 0.10830461978912354, "learning_rate": 6.297782490951833e-06, "loss": 0.4943, "step": 726 }, { "epoch": 1.8314465408805032, "grad_norm": 0.1077997013926506, "learning_rate": 6.236564757161608e-06, "loss": 0.4865, "step": 728 }, { "epoch": 1.8364779874213837, "grad_norm": 0.11505385488271713, "learning_rate": 6.176785172758871e-06, "loss": 0.5039, "step": 730 }, { "epoch": 1.8415094339622642, "grad_norm": 0.12043328583240509, "learning_rate": 6.118447780927233e-06, "loss": 0.4909, "step": 732 }, { "epoch": 1.8465408805031447, "grad_norm": 0.11934798955917358, "learning_rate": 6.0615565273078025e-06, "loss": 0.4978, "step": 734 }, { "epoch": 1.8515723270440252, "grad_norm": 0.11153744161128998, "learning_rate": 6.006115259732345e-06, "loss": 0.4924, "step": 736 }, { "epoch": 1.8566037735849057, "grad_norm": 0.11369643360376358, "learning_rate": 5.952127727963029e-06, "loss": 0.4938, "step": 738 }, { "epoch": 1.8616352201257862, "grad_norm": 0.1063813716173172, "learning_rate": 5.899597583438808e-06, "loss": 0.5059, "step": 740 }, { "epoch": 1.8616352201257862, "eval_loss": 0.5707431435585022, "eval_runtime": 411.3909, "eval_samples_per_second": 26.024, "eval_steps_per_second": 0.204, "step": 740 }, { "epoch": 1.8666666666666667, "grad_norm": 0.12777185440063477, "learning_rate": 5.848528379028456e-06, "loss": 0.5138, "step": 742 }, { "epoch": 1.8716981132075472, "grad_norm": 0.11385183781385422, "learning_rate": 5.798923568790283e-06, "loss": 0.5101, "step": 744 }, { "epoch": 1.8767295597484277, "grad_norm": 0.11212068051099777, "learning_rate": 5.750786507738497e-06, "loss": 0.4845, "step": 746 }, { "epoch": 1.8817610062893082, "grad_norm": 0.11255759000778198, "learning_rate": 5.704120451616305e-06, "loss": 0.5019, "step": 748 }, { "epoch": 1.8867924528301887, "grad_norm": 0.11542278528213501, "learning_rate": 5.6589285566757095e-06, "loss": 0.5014, "step": 750 }, { "epoch": 1.8918238993710692, "grad_norm": 0.11562332510948181, "learning_rate": 5.61521387946403e-06, "loss": 0.5077, "step": 752 }, { "epoch": 1.8968553459119497, "grad_norm": 0.10956519842147827, "learning_rate": 5.572979376617183e-06, "loss": 0.4842, "step": 754 }, { "epoch": 1.9018867924528302, "grad_norm": 0.11526080965995789, "learning_rate": 5.532227904659695e-06, "loss": 0.488, "step": 756 }, { "epoch": 1.9069182389937107, "grad_norm": 0.11320438235998154, "learning_rate": 5.49296221981152e-06, "loss": 0.5021, "step": 758 }, { "epoch": 1.9119496855345912, "grad_norm": 0.11186771839857101, "learning_rate": 5.455184977801612e-06, "loss": 0.4996, "step": 760 }, { "epoch": 1.9119496855345912, "eval_loss": 0.5699969530105591, "eval_runtime": 411.4136, "eval_samples_per_second": 26.022, "eval_steps_per_second": 0.204, "step": 760 }, { "epoch": 1.9169811320754717, "grad_norm": 0.1088699921965599, "learning_rate": 5.418898733688302e-06, "loss": 0.4933, "step": 762 }, { "epoch": 1.9220125786163522, "grad_norm": 0.11521943658590317, "learning_rate": 5.384105941686499e-06, "loss": 0.5018, "step": 764 }, { "epoch": 1.9270440251572327, "grad_norm": 0.1104261726140976, "learning_rate": 5.350808955001693e-06, "loss": 0.4945, "step": 766 }, { "epoch": 1.9320754716981132, "grad_norm": 0.11762084811925888, "learning_rate": 5.3190100256707905e-06, "loss": 0.4892, "step": 768 }, { "epoch": 1.9371069182389937, "grad_norm": 0.1136779636144638, "learning_rate": 5.288711304409814e-06, "loss": 0.5075, "step": 770 }, { "epoch": 1.9421383647798742, "grad_norm": 0.11609724909067154, "learning_rate": 5.259914840468416e-06, "loss": 0.5052, "step": 772 }, { "epoch": 1.9471698113207547, "grad_norm": 0.11565674096345901, "learning_rate": 5.2326225814913e-06, "loss": 0.5054, "step": 774 }, { "epoch": 1.9522012578616352, "grad_norm": 0.10971004515886307, "learning_rate": 5.206836373386482e-06, "loss": 0.4711, "step": 776 }, { "epoch": 1.9572327044025157, "grad_norm": 0.11187240481376648, "learning_rate": 5.182557960200441e-06, "loss": 0.4946, "step": 778 }, { "epoch": 1.9622641509433962, "grad_norm": 0.11195476353168488, "learning_rate": 5.1597889840001635e-06, "loss": 0.4975, "step": 780 }, { "epoch": 1.9622641509433962, "eval_loss": 0.5697274208068848, "eval_runtime": 411.2337, "eval_samples_per_second": 26.034, "eval_steps_per_second": 0.204, "step": 780 }, { "epoch": 1.9672955974842767, "grad_norm": 0.11138684302568436, "learning_rate": 5.138530984762087e-06, "loss": 0.4915, "step": 782 }, { "epoch": 1.9723270440251572, "grad_norm": 0.10923836380243301, "learning_rate": 5.118785400267929e-06, "loss": 0.4855, "step": 784 }, { "epoch": 1.9773584905660377, "grad_norm": 0.10922150313854218, "learning_rate": 5.100553566007467e-06, "loss": 0.4794, "step": 786 }, { "epoch": 1.9823899371069182, "grad_norm": 0.10965920239686966, "learning_rate": 5.083836715088188e-06, "loss": 0.4836, "step": 788 }, { "epoch": 1.9874213836477987, "grad_norm": 0.11357378959655762, "learning_rate": 5.068635978151901e-06, "loss": 0.4942, "step": 790 }, { "epoch": 1.9924528301886792, "grad_norm": 0.11112317442893982, "learning_rate": 5.0549523832982645e-06, "loss": 0.4939, "step": 792 }, { "epoch": 1.9974842767295597, "grad_norm": 0.11381904780864716, "learning_rate": 5.042786856015253e-06, "loss": 0.5013, "step": 794 }, { "epoch": 1.9974842767295597, "step": 794, "total_flos": 946984181301248.0, "train_loss": 0.5561736259862818, "train_runtime": 112418.4712, "train_samples_per_second": 3.619, "train_steps_per_second": 0.007 } ], "logging_steps": 2, "max_steps": 794, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 946984181301248.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }