|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9974842767295597, |
|
"eval_steps": 20, |
|
"global_step": 794, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005031446540880503, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.8316, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.010062893081761006, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.8423, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.01509433962264151, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.8389, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.02012578616352201, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.8302, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.025157232704402517, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.8634, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03018867924528302, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.8553, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03522012578616352, |
|
"grad_norm": 0.7739448547363281, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.8423, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04025157232704402, |
|
"grad_norm": 0.7387983202934265, |
|
"learning_rate": 5e-06, |
|
"loss": 0.8385, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.045283018867924525, |
|
"grad_norm": 0.34386318922042847, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 0.8163, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.050314465408805034, |
|
"grad_norm": 0.32477569580078125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.7541, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.050314465408805034, |
|
"eval_loss": 0.7434237599372864, |
|
"eval_runtime": 512.9045, |
|
"eval_samples_per_second": 20.873, |
|
"eval_steps_per_second": 0.164, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.055345911949685536, |
|
"grad_norm": 0.3248128294944763, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 0.7494, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.06037735849056604, |
|
"grad_norm": 0.25117382407188416, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.7204, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06540880503144654, |
|
"grad_norm": 0.23494713008403778, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6902, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.07044025157232704, |
|
"grad_norm": 0.19120900332927704, |
|
"learning_rate": 2.3333333333333336e-05, |
|
"loss": 0.6971, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07547169811320754, |
|
"grad_norm": 0.16960154473781586, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 0.677, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.08050314465408805, |
|
"grad_norm": 0.16359932720661163, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6863, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08553459119496855, |
|
"grad_norm": 0.1389608234167099, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.6786, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.09056603773584905, |
|
"grad_norm": 0.14994372427463531, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 0.6692, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09559748427672957, |
|
"grad_norm": 0.13334757089614868, |
|
"learning_rate": 4e-05, |
|
"loss": 0.666, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.10062893081761007, |
|
"grad_norm": 0.14932414889335632, |
|
"learning_rate": 4.3333333333333334e-05, |
|
"loss": 0.6771, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10062893081761007, |
|
"eval_loss": 0.6487388610839844, |
|
"eval_runtime": 477.079, |
|
"eval_samples_per_second": 22.441, |
|
"eval_steps_per_second": 0.176, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10566037735849057, |
|
"grad_norm": 0.1352750062942505, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 0.668, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.11069182389937107, |
|
"grad_norm": 0.11694569140672684, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6517, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11572327044025157, |
|
"grad_norm": 0.12432057410478592, |
|
"learning_rate": 4.9999239107866414e-05, |
|
"loss": 0.6353, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.12075471698113208, |
|
"grad_norm": 0.121449314057827, |
|
"learning_rate": 4.9996956482928485e-05, |
|
"loss": 0.6348, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12578616352201258, |
|
"grad_norm": 0.119928739964962, |
|
"learning_rate": 4.999315227957123e-05, |
|
"loss": 0.6402, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.13081761006289308, |
|
"grad_norm": 0.12270842492580414, |
|
"learning_rate": 4.998782675509138e-05, |
|
"loss": 0.6294, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13584905660377358, |
|
"grad_norm": 0.1132158562541008, |
|
"learning_rate": 4.998098026968003e-05, |
|
"loss": 0.6417, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.14088050314465408, |
|
"grad_norm": 0.13522745668888092, |
|
"learning_rate": 4.997261328639824e-05, |
|
"loss": 0.6366, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1459119496855346, |
|
"grad_norm": 0.11879543960094452, |
|
"learning_rate": 4.996272637114571e-05, |
|
"loss": 0.6335, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1509433962264151, |
|
"grad_norm": 0.1235610768198967, |
|
"learning_rate": 4.995132019262254e-05, |
|
"loss": 0.6483, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1509433962264151, |
|
"eval_loss": 0.6241472363471985, |
|
"eval_runtime": 445.3592, |
|
"eval_samples_per_second": 24.039, |
|
"eval_steps_per_second": 0.189, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1559748427672956, |
|
"grad_norm": 0.11320329457521439, |
|
"learning_rate": 4.993839552228398e-05, |
|
"loss": 0.6242, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.1610062893081761, |
|
"grad_norm": 0.12192777544260025, |
|
"learning_rate": 4.992395323428824e-05, |
|
"loss": 0.6214, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.1660377358490566, |
|
"grad_norm": 0.12313053756952286, |
|
"learning_rate": 4.9907994305437405e-05, |
|
"loss": 0.6331, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.1710691823899371, |
|
"grad_norm": 0.12243402749300003, |
|
"learning_rate": 4.989051981511133e-05, |
|
"loss": 0.6222, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.1761006289308176, |
|
"grad_norm": 0.12040385603904724, |
|
"learning_rate": 4.9871530945194654e-05, |
|
"loss": 0.6394, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1811320754716981, |
|
"grad_norm": 0.11485008895397186, |
|
"learning_rate": 4.985102897999687e-05, |
|
"loss": 0.5755, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1861635220125786, |
|
"grad_norm": 0.1206885501742363, |
|
"learning_rate": 4.982901530616545e-05, |
|
"loss": 0.6354, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.19119496855345913, |
|
"grad_norm": 0.12174931168556213, |
|
"learning_rate": 4.980549141259205e-05, |
|
"loss": 0.6362, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.19622641509433963, |
|
"grad_norm": 0.13587461411952972, |
|
"learning_rate": 4.9780458890311846e-05, |
|
"loss": 0.616, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.20125786163522014, |
|
"grad_norm": 0.12933531403541565, |
|
"learning_rate": 4.9753919432395876e-05, |
|
"loss": 0.6363, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.20125786163522014, |
|
"eval_loss": 0.6132378578186035, |
|
"eval_runtime": 463.9686, |
|
"eval_samples_per_second": 23.075, |
|
"eval_steps_per_second": 0.181, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.20628930817610064, |
|
"grad_norm": 0.11656603217124939, |
|
"learning_rate": 4.9725874833836574e-05, |
|
"loss": 0.62, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.21132075471698114, |
|
"grad_norm": 0.11784744262695312, |
|
"learning_rate": 4.969632699142632e-05, |
|
"loss": 0.6094, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.21635220125786164, |
|
"grad_norm": 0.11344680190086365, |
|
"learning_rate": 4.966527790362919e-05, |
|
"loss": 0.5968, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.22138364779874214, |
|
"grad_norm": 0.11833988130092621, |
|
"learning_rate": 4.963272967044579e-05, |
|
"loss": 0.6306, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.22641509433962265, |
|
"grad_norm": 0.11473698914051056, |
|
"learning_rate": 4.959868449327119e-05, |
|
"loss": 0.6152, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.23144654088050315, |
|
"grad_norm": 0.10996667295694351, |
|
"learning_rate": 4.9563144674746046e-05, |
|
"loss": 0.595, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.23647798742138365, |
|
"grad_norm": 0.11677283048629761, |
|
"learning_rate": 4.952611261860089e-05, |
|
"loss": 0.5967, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.24150943396226415, |
|
"grad_norm": 0.1206679567694664, |
|
"learning_rate": 4.9487590829493514e-05, |
|
"loss": 0.604, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.24654088050314465, |
|
"grad_norm": 0.12233300507068634, |
|
"learning_rate": 4.944758191283959e-05, |
|
"loss": 0.6043, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.25157232704402516, |
|
"grad_norm": 0.11701351404190063, |
|
"learning_rate": 4.940608857463644e-05, |
|
"loss": 0.6, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25157232704402516, |
|
"eval_loss": 0.6065506339073181, |
|
"eval_runtime": 422.5948, |
|
"eval_samples_per_second": 25.334, |
|
"eval_steps_per_second": 0.199, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25660377358490566, |
|
"grad_norm": 0.1179809644818306, |
|
"learning_rate": 4.9363113621280036e-05, |
|
"loss": 0.6406, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.26163522012578616, |
|
"grad_norm": 0.12534746527671814, |
|
"learning_rate": 4.931865995937519e-05, |
|
"loss": 0.6227, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.12294790893793106, |
|
"learning_rate": 4.927273059553892e-05, |
|
"loss": 0.5957, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.27169811320754716, |
|
"grad_norm": 0.12756744027137756, |
|
"learning_rate": 4.9225328636197144e-05, |
|
"loss": 0.6226, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.27672955974842767, |
|
"grad_norm": 0.12662391364574432, |
|
"learning_rate": 4.9176457287374584e-05, |
|
"loss": 0.5899, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.28176100628930817, |
|
"grad_norm": 0.12516862154006958, |
|
"learning_rate": 4.912611985447789e-05, |
|
"loss": 0.6238, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.28679245283018867, |
|
"grad_norm": 0.116559699177742, |
|
"learning_rate": 4.907431974207211e-05, |
|
"loss": 0.6112, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2918238993710692, |
|
"grad_norm": 0.11833405494689941, |
|
"learning_rate": 4.90210604536504e-05, |
|
"loss": 0.6087, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2968553459119497, |
|
"grad_norm": 0.1188972070813179, |
|
"learning_rate": 4.896634559139707e-05, |
|
"loss": 0.594, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.3018867924528302, |
|
"grad_norm": 0.11520268023014069, |
|
"learning_rate": 4.891017885594399e-05, |
|
"loss": 0.6059, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3018867924528302, |
|
"eval_loss": 0.6016330718994141, |
|
"eval_runtime": 415.9012, |
|
"eval_samples_per_second": 25.742, |
|
"eval_steps_per_second": 0.202, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3069182389937107, |
|
"grad_norm": 0.11097536981105804, |
|
"learning_rate": 4.885256404612022e-05, |
|
"loss": 0.5963, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.3119496855345912, |
|
"grad_norm": 0.1210559606552124, |
|
"learning_rate": 4.8793505058695155e-05, |
|
"loss": 0.6367, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3169811320754717, |
|
"grad_norm": 0.12011069059371948, |
|
"learning_rate": 4.8733005888114915e-05, |
|
"loss": 0.5889, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3220125786163522, |
|
"grad_norm": 0.12133902311325073, |
|
"learning_rate": 4.867107062623223e-05, |
|
"loss": 0.5812, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3270440251572327, |
|
"grad_norm": 0.11452817916870117, |
|
"learning_rate": 4.860770346202962e-05, |
|
"loss": 0.607, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3320754716981132, |
|
"grad_norm": 0.1322205364704132, |
|
"learning_rate": 4.854290868133614e-05, |
|
"loss": 0.5789, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3371069182389937, |
|
"grad_norm": 0.1177845150232315, |
|
"learning_rate": 4.847669066653746e-05, |
|
"loss": 0.5834, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.3421383647798742, |
|
"grad_norm": 0.12132906913757324, |
|
"learning_rate": 4.840905389627951e-05, |
|
"loss": 0.6178, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3471698113207547, |
|
"grad_norm": 0.11665117740631104, |
|
"learning_rate": 4.834000294516552e-05, |
|
"loss": 0.5999, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3522012578616352, |
|
"grad_norm": 0.12204349786043167, |
|
"learning_rate": 4.8269542483446654e-05, |
|
"loss": 0.602, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3522012578616352, |
|
"eval_loss": 0.5978492498397827, |
|
"eval_runtime": 462.6047, |
|
"eval_samples_per_second": 23.143, |
|
"eval_steps_per_second": 0.182, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3572327044025157, |
|
"grad_norm": 0.1177980899810791, |
|
"learning_rate": 4.819767727670612e-05, |
|
"loss": 0.6035, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3622641509433962, |
|
"grad_norm": 0.12166167795658112, |
|
"learning_rate": 4.812441218553683e-05, |
|
"loss": 0.5909, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3672955974842767, |
|
"grad_norm": 0.13141223788261414, |
|
"learning_rate": 4.804975216521272e-05, |
|
"loss": 0.5985, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3723270440251572, |
|
"grad_norm": 0.12138612568378448, |
|
"learning_rate": 4.797370226535353e-05, |
|
"loss": 0.5866, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.37735849056603776, |
|
"grad_norm": 0.12571364641189575, |
|
"learning_rate": 4.789626762958331e-05, |
|
"loss": 0.5789, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.38238993710691827, |
|
"grad_norm": 0.1183118149638176, |
|
"learning_rate": 4.781745349518252e-05, |
|
"loss": 0.5958, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.38742138364779877, |
|
"grad_norm": 0.11182838678359985, |
|
"learning_rate": 4.7737265192733815e-05, |
|
"loss": 0.5768, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.39245283018867927, |
|
"grad_norm": 0.12711752951145172, |
|
"learning_rate": 4.765570814576153e-05, |
|
"loss": 0.5951, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.39748427672955977, |
|
"grad_norm": 0.12628626823425293, |
|
"learning_rate": 4.757278787036479e-05, |
|
"loss": 0.5907, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.4025157232704403, |
|
"grad_norm": 0.11637621372938156, |
|
"learning_rate": 4.748850997484452e-05, |
|
"loss": 0.6115, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4025157232704403, |
|
"eval_loss": 0.5944415926933289, |
|
"eval_runtime": 416.2458, |
|
"eval_samples_per_second": 25.72, |
|
"eval_steps_per_second": 0.202, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4075471698113208, |
|
"grad_norm": 0.11336452513933182, |
|
"learning_rate": 4.7402880159324084e-05, |
|
"loss": 0.5685, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4125786163522013, |
|
"grad_norm": 0.12172479182481766, |
|
"learning_rate": 4.7315904215363734e-05, |
|
"loss": 0.5765, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4176100628930818, |
|
"grad_norm": 0.11980731040239334, |
|
"learning_rate": 4.722758802556896e-05, |
|
"loss": 0.5948, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.4226415094339623, |
|
"grad_norm": 0.11705721169710159, |
|
"learning_rate": 4.7137937563192555e-05, |
|
"loss": 0.5749, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4276729559748428, |
|
"grad_norm": 0.1204642727971077, |
|
"learning_rate": 4.704695889173066e-05, |
|
"loss": 0.6069, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4327044025157233, |
|
"grad_norm": 0.11957567930221558, |
|
"learning_rate": 4.695465816451266e-05, |
|
"loss": 0.5724, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4377358490566038, |
|
"grad_norm": 0.12370068579912186, |
|
"learning_rate": 4.686104162428497e-05, |
|
"loss": 0.6164, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4427672955974843, |
|
"grad_norm": 0.14216069877147675, |
|
"learning_rate": 4.676611560278884e-05, |
|
"loss": 0.6018, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4477987421383648, |
|
"grad_norm": 0.12077496945858002, |
|
"learning_rate": 4.66698865203321e-05, |
|
"loss": 0.5919, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4528301886792453, |
|
"grad_norm": 0.12382370233535767, |
|
"learning_rate": 4.6572360885354905e-05, |
|
"loss": 0.6304, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4528301886792453, |
|
"eval_loss": 0.5917236804962158, |
|
"eval_runtime": 416.6125, |
|
"eval_samples_per_second": 25.698, |
|
"eval_steps_per_second": 0.202, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4578616352201258, |
|
"grad_norm": 0.11310654878616333, |
|
"learning_rate": 4.647354529398957e-05, |
|
"loss": 0.5984, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.4628930817610063, |
|
"grad_norm": 0.11125874519348145, |
|
"learning_rate": 4.637344642961442e-05, |
|
"loss": 0.5993, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.4679245283018868, |
|
"grad_norm": 0.10250482708215714, |
|
"learning_rate": 4.627207106240176e-05, |
|
"loss": 0.5958, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4729559748427673, |
|
"grad_norm": 0.10660769045352936, |
|
"learning_rate": 4.6169426048859994e-05, |
|
"loss": 0.5602, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.4779874213836478, |
|
"grad_norm": 0.10431323200464249, |
|
"learning_rate": 4.606551833136985e-05, |
|
"loss": 0.5903, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4830188679245283, |
|
"grad_norm": 0.11766020953655243, |
|
"learning_rate": 4.596035493771488e-05, |
|
"loss": 0.6004, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4880503144654088, |
|
"grad_norm": 0.1166047751903534, |
|
"learning_rate": 4.585394298060611e-05, |
|
"loss": 0.5977, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.4930817610062893, |
|
"grad_norm": 0.1147083193063736, |
|
"learning_rate": 4.574628965720097e-05, |
|
"loss": 0.5897, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4981132075471698, |
|
"grad_norm": 0.10733990371227264, |
|
"learning_rate": 4.5637402248616506e-05, |
|
"loss": 0.5978, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.5031446540880503, |
|
"grad_norm": 0.10964643210172653, |
|
"learning_rate": 4.552728811943696e-05, |
|
"loss": 0.602, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5031446540880503, |
|
"eval_loss": 0.5892359018325806, |
|
"eval_runtime": 416.8902, |
|
"eval_samples_per_second": 25.681, |
|
"eval_steps_per_second": 0.201, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5081761006289308, |
|
"grad_norm": 0.11032500118017197, |
|
"learning_rate": 4.54717733587572e-05, |
|
"loss": 0.5817, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5132075471698113, |
|
"grad_norm": 0.1092258021235466, |
|
"learning_rate": 4.5359833138637734e-05, |
|
"loss": 0.5982, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5182389937106918, |
|
"grad_norm": 0.10684628039598465, |
|
"learning_rate": 4.524668497127006e-05, |
|
"loss": 0.5923, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5232704402515723, |
|
"grad_norm": 0.11228887736797333, |
|
"learning_rate": 4.513233650941422e-05, |
|
"loss": 0.5742, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5283018867924528, |
|
"grad_norm": 0.1152106523513794, |
|
"learning_rate": 4.501679548701201e-05, |
|
"loss": 0.5955, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.11588042229413986, |
|
"learning_rate": 4.490006971866385e-05, |
|
"loss": 0.5936, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5383647798742138, |
|
"grad_norm": 0.11394830793142319, |
|
"learning_rate": 4.478216709910035e-05, |
|
"loss": 0.5937, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5433962264150943, |
|
"grad_norm": 0.11031538993120193, |
|
"learning_rate": 4.466309560264822e-05, |
|
"loss": 0.5973, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5484276729559748, |
|
"grad_norm": 0.11161042749881744, |
|
"learning_rate": 4.4542863282691014e-05, |
|
"loss": 0.5701, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5534591194968553, |
|
"grad_norm": 0.11783146113157272, |
|
"learning_rate": 4.4421478271124426e-05, |
|
"loss": 0.603, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5534591194968553, |
|
"eval_loss": 0.5872239470481873, |
|
"eval_runtime": 412.7815, |
|
"eval_samples_per_second": 25.936, |
|
"eval_steps_per_second": 0.203, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5584905660377358, |
|
"grad_norm": 0.10747622698545456, |
|
"learning_rate": 4.429894877780627e-05, |
|
"loss": 0.6089, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5635220125786163, |
|
"grad_norm": 0.1075616180896759, |
|
"learning_rate": 4.4175283090001225e-05, |
|
"loss": 0.6042, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5685534591194968, |
|
"grad_norm": 0.11763885617256165, |
|
"learning_rate": 4.4050489571820306e-05, |
|
"loss": 0.5854, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5735849056603773, |
|
"grad_norm": 0.1110839769244194, |
|
"learning_rate": 4.392457666365519e-05, |
|
"loss": 0.5731, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5786163522012578, |
|
"grad_norm": 0.1211901530623436, |
|
"learning_rate": 4.379755288160733e-05, |
|
"loss": 0.5571, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5836477987421383, |
|
"grad_norm": 0.10990186035633087, |
|
"learning_rate": 4.3669426816911985e-05, |
|
"loss": 0.5919, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5886792452830188, |
|
"grad_norm": 0.1129634901881218, |
|
"learning_rate": 4.354020713535711e-05, |
|
"loss": 0.5853, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5937106918238994, |
|
"grad_norm": 0.11691266298294067, |
|
"learning_rate": 4.340990257669732e-05, |
|
"loss": 0.5878, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5987421383647799, |
|
"grad_norm": 0.11645273119211197, |
|
"learning_rate": 4.327852195406271e-05, |
|
"loss": 0.5946, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.6037735849056604, |
|
"grad_norm": 0.1155095249414444, |
|
"learning_rate": 4.314607415336281e-05, |
|
"loss": 0.6004, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6037735849056604, |
|
"eval_loss": 0.5851526856422424, |
|
"eval_runtime": 413.8117, |
|
"eval_samples_per_second": 25.872, |
|
"eval_steps_per_second": 0.203, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6088050314465409, |
|
"grad_norm": 0.11004958301782608, |
|
"learning_rate": 4.301256813268559e-05, |
|
"loss": 0.5846, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6138364779874214, |
|
"grad_norm": 0.10904593765735626, |
|
"learning_rate": 4.287801292169159e-05, |
|
"loss": 0.5871, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6188679245283019, |
|
"grad_norm": 0.11857830733060837, |
|
"learning_rate": 4.274241762100315e-05, |
|
"loss": 0.5826, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6238993710691824, |
|
"grad_norm": 0.11342642456293106, |
|
"learning_rate": 4.260579140158898e-05, |
|
"loss": 0.5807, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6289308176100629, |
|
"grad_norm": 0.10923154652118683, |
|
"learning_rate": 4.246814350414377e-05, |
|
"loss": 0.5732, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6339622641509434, |
|
"grad_norm": 0.11175252497196198, |
|
"learning_rate": 4.2329483238463304e-05, |
|
"loss": 0.5649, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6389937106918239, |
|
"grad_norm": 0.10550232976675034, |
|
"learning_rate": 4.218981998281471e-05, |
|
"loss": 0.5853, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6440251572327044, |
|
"grad_norm": 0.10999017208814621, |
|
"learning_rate": 4.204916318330225e-05, |
|
"loss": 0.5864, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6490566037735849, |
|
"grad_norm": 0.13525208830833435, |
|
"learning_rate": 4.190752235322832e-05, |
|
"loss": 0.5842, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6540880503144654, |
|
"grad_norm": 0.11547064781188965, |
|
"learning_rate": 4.176490707245011e-05, |
|
"loss": 0.5891, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6540880503144654, |
|
"eval_loss": 0.5836161971092224, |
|
"eval_runtime": 414.2177, |
|
"eval_samples_per_second": 25.846, |
|
"eval_steps_per_second": 0.203, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6591194968553459, |
|
"grad_norm": 0.10233239829540253, |
|
"learning_rate": 4.162132698673167e-05, |
|
"loss": 0.5708, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6641509433962264, |
|
"grad_norm": 0.11854418367147446, |
|
"learning_rate": 4.1476791807091445e-05, |
|
"loss": 0.6074, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6691823899371069, |
|
"grad_norm": 0.13011477887630463, |
|
"learning_rate": 4.133131130914555e-05, |
|
"loss": 0.597, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6742138364779874, |
|
"grad_norm": 0.11256586760282516, |
|
"learning_rate": 4.118489533244655e-05, |
|
"loss": 0.5895, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6792452830188679, |
|
"grad_norm": 0.10651887208223343, |
|
"learning_rate": 4.1037553779818016e-05, |
|
"loss": 0.5934, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6842767295597484, |
|
"grad_norm": 0.11151424050331116, |
|
"learning_rate": 4.088929661668468e-05, |
|
"loss": 0.6028, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6893081761006289, |
|
"grad_norm": 0.10524857044219971, |
|
"learning_rate": 4.0740133870398456e-05, |
|
"loss": 0.608, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6943396226415094, |
|
"grad_norm": 0.12080366164445877, |
|
"learning_rate": 4.059007562956027e-05, |
|
"loss": 0.6175, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6993710691823899, |
|
"grad_norm": 0.1079036071896553, |
|
"learning_rate": 4.0439132043337666e-05, |
|
"loss": 0.5938, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.7044025157232704, |
|
"grad_norm": 0.11142345517873764, |
|
"learning_rate": 4.028731332077843e-05, |
|
"loss": 0.5752, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7044025157232704, |
|
"eval_loss": 0.5814208984375, |
|
"eval_runtime": 413.672, |
|
"eval_samples_per_second": 25.88, |
|
"eval_steps_per_second": 0.203, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7094339622641509, |
|
"grad_norm": 0.11195844411849976, |
|
"learning_rate": 4.0134629730120045e-05, |
|
"loss": 0.583, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7144654088050314, |
|
"grad_norm": 0.10488727688789368, |
|
"learning_rate": 3.9981091598095213e-05, |
|
"loss": 0.5593, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7194968553459119, |
|
"grad_norm": 0.10145172476768494, |
|
"learning_rate": 3.9826709309233454e-05, |
|
"loss": 0.5839, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7245283018867924, |
|
"grad_norm": 0.1036810651421547, |
|
"learning_rate": 3.967149330515867e-05, |
|
"loss": 0.5796, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7295597484276729, |
|
"grad_norm": 0.1167021244764328, |
|
"learning_rate": 3.951545408388301e-05, |
|
"loss": 0.6006, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7345911949685534, |
|
"grad_norm": 0.10975436121225357, |
|
"learning_rate": 3.935860219909679e-05, |
|
"loss": 0.5802, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7396226415094339, |
|
"grad_norm": 0.11112164705991745, |
|
"learning_rate": 3.920094825945468e-05, |
|
"loss": 0.5851, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7446540880503144, |
|
"grad_norm": 0.10697871446609497, |
|
"learning_rate": 3.904250292785825e-05, |
|
"loss": 0.5855, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7496855345911949, |
|
"grad_norm": 0.10061946511268616, |
|
"learning_rate": 3.8883276920734736e-05, |
|
"loss": 0.5941, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"grad_norm": 0.102944515645504, |
|
"learning_rate": 3.8723281007312256e-05, |
|
"loss": 0.5732, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7547169811320755, |
|
"eval_loss": 0.5801501870155334, |
|
"eval_runtime": 412.1928, |
|
"eval_samples_per_second": 25.973, |
|
"eval_steps_per_second": 0.204, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.759748427672956, |
|
"grad_norm": 0.10275246948003769, |
|
"learning_rate": 3.856252600889143e-05, |
|
"loss": 0.5803, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7647798742138365, |
|
"grad_norm": 0.10686468333005905, |
|
"learning_rate": 3.840102279811345e-05, |
|
"loss": 0.585, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.769811320754717, |
|
"grad_norm": 0.11284485459327698, |
|
"learning_rate": 3.82387822982248e-05, |
|
"loss": 0.5937, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7748427672955975, |
|
"grad_norm": 0.10271965712308884, |
|
"learning_rate": 3.807581548233837e-05, |
|
"loss": 0.5777, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.779874213836478, |
|
"grad_norm": 0.10338881611824036, |
|
"learning_rate": 3.791213337269134e-05, |
|
"loss": 0.5888, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7849056603773585, |
|
"grad_norm": 0.11098296195268631, |
|
"learning_rate": 3.7747747039899676e-05, |
|
"loss": 0.5764, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.789937106918239, |
|
"grad_norm": 0.10431266576051712, |
|
"learning_rate": 3.758266760220937e-05, |
|
"loss": 0.5985, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7949685534591195, |
|
"grad_norm": 0.10191661864519119, |
|
"learning_rate": 3.741690622474449e-05, |
|
"loss": 0.5626, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.11264406889677048, |
|
"learning_rate": 3.7250474118751974e-05, |
|
"loss": 0.6094, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.8050314465408805, |
|
"grad_norm": 0.10798995941877365, |
|
"learning_rate": 3.708338254084339e-05, |
|
"loss": 0.5714, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8050314465408805, |
|
"eval_loss": 0.5783895254135132, |
|
"eval_runtime": 411.3396, |
|
"eval_samples_per_second": 26.027, |
|
"eval_steps_per_second": 0.204, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.810062893081761, |
|
"grad_norm": 0.10546964406967163, |
|
"learning_rate": 3.69156427922336e-05, |
|
"loss": 0.5682, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8150943396226416, |
|
"grad_norm": 0.11086619645357132, |
|
"learning_rate": 3.6747266217976414e-05, |
|
"loss": 0.5682, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.820125786163522, |
|
"grad_norm": 0.11166223883628845, |
|
"learning_rate": 3.6578264206197245e-05, |
|
"loss": 0.5798, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8251572327044026, |
|
"grad_norm": 0.1012871265411377, |
|
"learning_rate": 3.6408648187322854e-05, |
|
"loss": 0.5694, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8301886792452831, |
|
"grad_norm": 0.11129719018936157, |
|
"learning_rate": 3.623842963330832e-05, |
|
"loss": 0.5827, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8352201257861636, |
|
"grad_norm": 0.10445208847522736, |
|
"learning_rate": 3.6067620056861086e-05, |
|
"loss": 0.5684, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8402515723270441, |
|
"grad_norm": 0.09836594015359879, |
|
"learning_rate": 3.589623101066232e-05, |
|
"loss": 0.5755, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8452830188679246, |
|
"grad_norm": 0.10354923456907272, |
|
"learning_rate": 3.572427408658552e-05, |
|
"loss": 0.5782, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8503144654088051, |
|
"grad_norm": 0.10813874751329422, |
|
"learning_rate": 3.5551760914912546e-05, |
|
"loss": 0.5939, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8553459119496856, |
|
"grad_norm": 0.10268381237983704, |
|
"learning_rate": 3.537870316354699e-05, |
|
"loss": 0.5697, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8553459119496856, |
|
"eval_loss": 0.5766698122024536, |
|
"eval_runtime": 412.4072, |
|
"eval_samples_per_second": 25.96, |
|
"eval_steps_per_second": 0.204, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8603773584905661, |
|
"grad_norm": 0.10261879861354828, |
|
"learning_rate": 3.5205112537224974e-05, |
|
"loss": 0.5694, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8654088050314466, |
|
"grad_norm": 0.10596223920583725, |
|
"learning_rate": 3.50310007767236e-05, |
|
"loss": 0.6091, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8704402515723271, |
|
"grad_norm": 0.09846257418394089, |
|
"learning_rate": 3.485637965806674e-05, |
|
"loss": 0.5808, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8754716981132076, |
|
"grad_norm": 0.10605087131261826, |
|
"learning_rate": 3.4681260991728685e-05, |
|
"loss": 0.5737, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8805031446540881, |
|
"grad_norm": 0.11082804203033447, |
|
"learning_rate": 3.450565662183527e-05, |
|
"loss": 0.5826, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8855345911949686, |
|
"grad_norm": 0.10169358551502228, |
|
"learning_rate": 3.432957842536282e-05, |
|
"loss": 0.5724, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8905660377358491, |
|
"grad_norm": 0.10837408900260925, |
|
"learning_rate": 3.415303831133485e-05, |
|
"loss": 0.5736, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8955974842767296, |
|
"grad_norm": 0.11302381008863449, |
|
"learning_rate": 3.3976048220016604e-05, |
|
"loss": 0.589, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.9006289308176101, |
|
"grad_norm": 0.10759381949901581, |
|
"learning_rate": 3.37986201221075e-05, |
|
"loss": 0.5769, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.9056603773584906, |
|
"grad_norm": 0.11644274741411209, |
|
"learning_rate": 3.362076601793142e-05, |
|
"loss": 0.5603, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9056603773584906, |
|
"eval_loss": 0.5751825571060181, |
|
"eval_runtime": 411.2176, |
|
"eval_samples_per_second": 26.035, |
|
"eval_steps_per_second": 0.204, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9106918238993711, |
|
"grad_norm": 0.10910682380199432, |
|
"learning_rate": 3.344249793662514e-05, |
|
"loss": 0.5632, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9157232704402516, |
|
"grad_norm": 0.10915949195623398, |
|
"learning_rate": 3.326382793532476e-05, |
|
"loss": 0.5903, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9207547169811321, |
|
"grad_norm": 0.11586213856935501, |
|
"learning_rate": 3.308476809835013e-05, |
|
"loss": 0.594, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9257861635220126, |
|
"grad_norm": 0.11010655015707016, |
|
"learning_rate": 3.290533053638759e-05, |
|
"loss": 0.5723, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9308176100628931, |
|
"grad_norm": 0.11362364888191223, |
|
"learning_rate": 3.272552738567086e-05, |
|
"loss": 0.5825, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9358490566037736, |
|
"grad_norm": 0.10686285048723221, |
|
"learning_rate": 3.254537080716021e-05, |
|
"loss": 0.586, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9408805031446541, |
|
"grad_norm": 0.11016613245010376, |
|
"learning_rate": 3.236487298571996e-05, |
|
"loss": 0.5813, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9459119496855346, |
|
"grad_norm": 0.10999295115470886, |
|
"learning_rate": 3.2184046129294295e-05, |
|
"loss": 0.5716, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9509433962264151, |
|
"grad_norm": 0.1100740134716034, |
|
"learning_rate": 3.20029024680817e-05, |
|
"loss": 0.5424, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9559748427672956, |
|
"grad_norm": 0.105403371155262, |
|
"learning_rate": 3.1821454253707646e-05, |
|
"loss": 0.5963, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9559748427672956, |
|
"eval_loss": 0.5737633109092712, |
|
"eval_runtime": 412.702, |
|
"eval_samples_per_second": 25.941, |
|
"eval_steps_per_second": 0.204, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9610062893081761, |
|
"grad_norm": 0.10489460825920105, |
|
"learning_rate": 3.1639713758396055e-05, |
|
"loss": 0.567, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9660377358490566, |
|
"grad_norm": 0.10349351167678833, |
|
"learning_rate": 3.145769327413922e-05, |
|
"loss": 0.5721, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9710691823899371, |
|
"grad_norm": 0.10395889729261398, |
|
"learning_rate": 3.127540511186643e-05, |
|
"loss": 0.569, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9761006289308176, |
|
"grad_norm": 0.10146255046129227, |
|
"learning_rate": 3.109286160061136e-05, |
|
"loss": 0.5699, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9811320754716981, |
|
"grad_norm": 0.10547154396772385, |
|
"learning_rate": 3.091007508667814e-05, |
|
"loss": 0.5686, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9861635220125786, |
|
"grad_norm": 0.10124453902244568, |
|
"learning_rate": 3.072705793280642e-05, |
|
"loss": 0.5983, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9911949685534591, |
|
"grad_norm": 0.10909611731767654, |
|
"learning_rate": 3.054382251733507e-05, |
|
"loss": 0.5881, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9962264150943396, |
|
"grad_norm": 0.10168620944023132, |
|
"learning_rate": 3.0360381233365105e-05, |
|
"loss": 0.5978, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.0012578616352201, |
|
"grad_norm": 0.1318611353635788, |
|
"learning_rate": 3.0176746487921404e-05, |
|
"loss": 0.5694, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.0062893081761006, |
|
"grad_norm": 0.12533968687057495, |
|
"learning_rate": 2.9992930701113586e-05, |
|
"loss": 0.5082, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0062893081761006, |
|
"eval_loss": 0.5776596665382385, |
|
"eval_runtime": 410.8135, |
|
"eval_samples_per_second": 26.06, |
|
"eval_steps_per_second": 0.204, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0113207547169811, |
|
"grad_norm": 0.12798835337162018, |
|
"learning_rate": 2.9808946305295988e-05, |
|
"loss": 0.4912, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.0163522012578616, |
|
"grad_norm": 0.13782812654972076, |
|
"learning_rate": 2.962480574422678e-05, |
|
"loss": 0.5288, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.0213836477987421, |
|
"grad_norm": 0.11341769248247147, |
|
"learning_rate": 2.9440521472226368e-05, |
|
"loss": 0.5032, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.0264150943396226, |
|
"grad_norm": 0.1318856179714203, |
|
"learning_rate": 2.9256105953334982e-05, |
|
"loss": 0.5038, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.0314465408805031, |
|
"grad_norm": 0.11981856822967529, |
|
"learning_rate": 2.9071571660469775e-05, |
|
"loss": 0.4965, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.0364779874213836, |
|
"grad_norm": 0.11475057154893875, |
|
"learning_rate": 2.888693107458111e-05, |
|
"loss": 0.4912, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.0415094339622641, |
|
"grad_norm": 0.13232818245887756, |
|
"learning_rate": 2.8702196683808496e-05, |
|
"loss": 0.5065, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.0465408805031446, |
|
"grad_norm": 0.1348014920949936, |
|
"learning_rate": 2.8517380982635906e-05, |
|
"loss": 0.5293, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.0515723270440251, |
|
"grad_norm": 0.11755809187889099, |
|
"learning_rate": 2.8332496471046737e-05, |
|
"loss": 0.486, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.0566037735849056, |
|
"grad_norm": 0.1252366006374359, |
|
"learning_rate": 2.8147555653678353e-05, |
|
"loss": 0.4975, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0566037735849056, |
|
"eval_loss": 0.5800932049751282, |
|
"eval_runtime": 411.2332, |
|
"eval_samples_per_second": 26.034, |
|
"eval_steps_per_second": 0.204, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.0616352201257861, |
|
"grad_norm": 0.11638514697551727, |
|
"learning_rate": 2.7962571038976376e-05, |
|
"loss": 0.5021, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.11196921765804291, |
|
"learning_rate": 2.777755513834865e-05, |
|
"loss": 0.5081, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.0716981132075472, |
|
"grad_norm": 0.12703673541545868, |
|
"learning_rate": 2.7592520465319012e-05, |
|
"loss": 0.5186, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.0767295597484277, |
|
"grad_norm": 0.12006295472383499, |
|
"learning_rate": 2.7407479534680997e-05, |
|
"loss": 0.5123, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.0817610062893082, |
|
"grad_norm": 0.11968886107206345, |
|
"learning_rate": 2.722244486165136e-05, |
|
"loss": 0.5135, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.0867924528301887, |
|
"grad_norm": 0.11439641565084457, |
|
"learning_rate": 2.7037428961023632e-05, |
|
"loss": 0.4964, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.0918238993710692, |
|
"grad_norm": 0.12513814866542816, |
|
"learning_rate": 2.685244434632166e-05, |
|
"loss": 0.5103, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.0968553459119497, |
|
"grad_norm": 0.11105407029390335, |
|
"learning_rate": 2.6667503528953275e-05, |
|
"loss": 0.4915, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.1018867924528302, |
|
"grad_norm": 0.1270296275615692, |
|
"learning_rate": 2.6482619017364096e-05, |
|
"loss": 0.5197, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.1069182389937107, |
|
"grad_norm": 0.12292016297578812, |
|
"learning_rate": 2.629780331619151e-05, |
|
"loss": 0.5131, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1069182389937107, |
|
"eval_loss": 0.5787190198898315, |
|
"eval_runtime": 410.8629, |
|
"eval_samples_per_second": 26.057, |
|
"eval_steps_per_second": 0.204, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.1119496855345912, |
|
"grad_norm": 0.10833011567592621, |
|
"learning_rate": 2.6113068925418892e-05, |
|
"loss": 0.4747, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.1169811320754717, |
|
"grad_norm": 0.11480649560689926, |
|
"learning_rate": 2.592842833953023e-05, |
|
"loss": 0.5033, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.1220125786163522, |
|
"grad_norm": 0.11621647328138351, |
|
"learning_rate": 2.5743894046665013e-05, |
|
"loss": 0.5041, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.1270440251572327, |
|
"grad_norm": 0.11848396062850952, |
|
"learning_rate": 2.555947852777364e-05, |
|
"loss": 0.4868, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.1320754716981132, |
|
"grad_norm": 0.11532817780971527, |
|
"learning_rate": 2.537519425577322e-05, |
|
"loss": 0.485, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.1371069182389937, |
|
"grad_norm": 0.11470374464988708, |
|
"learning_rate": 2.519105369470402e-05, |
|
"loss": 0.5009, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.1421383647798742, |
|
"grad_norm": 0.11918400973081589, |
|
"learning_rate": 2.5007069298886416e-05, |
|
"loss": 0.492, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.1471698113207547, |
|
"grad_norm": 0.11399897933006287, |
|
"learning_rate": 2.4823253512078605e-05, |
|
"loss": 0.5227, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.1522012578616352, |
|
"grad_norm": 0.1135721504688263, |
|
"learning_rate": 2.4639618766634904e-05, |
|
"loss": 0.4948, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.1572327044025157, |
|
"grad_norm": 0.11768464744091034, |
|
"learning_rate": 2.4456177482664932e-05, |
|
"loss": 0.5069, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1572327044025157, |
|
"eval_loss": 0.5783973932266235, |
|
"eval_runtime": 410.9352, |
|
"eval_samples_per_second": 26.053, |
|
"eval_steps_per_second": 0.204, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.1622641509433962, |
|
"grad_norm": 0.11413212865591049, |
|
"learning_rate": 2.4272942067193593e-05, |
|
"loss": 0.4919, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.1672955974842767, |
|
"grad_norm": 0.11099102348089218, |
|
"learning_rate": 2.4089924913321854e-05, |
|
"loss": 0.5131, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.1723270440251572, |
|
"grad_norm": 0.12233982235193253, |
|
"learning_rate": 2.3907138399388656e-05, |
|
"loss": 0.5152, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.1773584905660377, |
|
"grad_norm": 0.109150230884552, |
|
"learning_rate": 2.3724594888133578e-05, |
|
"loss": 0.4942, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.1823899371069182, |
|
"grad_norm": 0.11547227948904037, |
|
"learning_rate": 2.354230672586079e-05, |
|
"loss": 0.5087, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.1874213836477987, |
|
"grad_norm": 0.11096334457397461, |
|
"learning_rate": 2.3360286241603947e-05, |
|
"loss": 0.528, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.1924528301886792, |
|
"grad_norm": 0.11583118885755539, |
|
"learning_rate": 2.3178545746292363e-05, |
|
"loss": 0.5053, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.1974842767295597, |
|
"grad_norm": 0.12394651025533676, |
|
"learning_rate": 2.299709753191831e-05, |
|
"loss": 0.519, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.2025157232704402, |
|
"grad_norm": 0.10962869971990585, |
|
"learning_rate": 2.281595387070571e-05, |
|
"loss": 0.5119, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.2075471698113207, |
|
"grad_norm": 0.11548969149589539, |
|
"learning_rate": 2.263512701428005e-05, |
|
"loss": 0.5053, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2075471698113207, |
|
"eval_loss": 0.5777500867843628, |
|
"eval_runtime": 410.7907, |
|
"eval_samples_per_second": 26.062, |
|
"eval_steps_per_second": 0.204, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.2125786163522012, |
|
"grad_norm": 0.11964194476604462, |
|
"learning_rate": 2.2454629192839782e-05, |
|
"loss": 0.5067, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.2176100628930817, |
|
"grad_norm": 0.11280932277441025, |
|
"learning_rate": 2.2274472614329146e-05, |
|
"loss": 0.5097, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.2226415094339622, |
|
"grad_norm": 0.11903873831033707, |
|
"learning_rate": 2.2094669463612417e-05, |
|
"loss": 0.4973, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.2276729559748427, |
|
"grad_norm": 0.11274790018796921, |
|
"learning_rate": 2.191523190164988e-05, |
|
"loss": 0.4802, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.2327044025157232, |
|
"grad_norm": 0.12155181169509888, |
|
"learning_rate": 2.1736172064675242e-05, |
|
"loss": 0.5039, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.2377358490566037, |
|
"grad_norm": 0.1149788573384285, |
|
"learning_rate": 2.1557502063374863e-05, |
|
"loss": 0.5018, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.2427672955974842, |
|
"grad_norm": 0.11619096249341965, |
|
"learning_rate": 2.1379233982068597e-05, |
|
"loss": 0.5001, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.2477987421383647, |
|
"grad_norm": 0.11911306530237198, |
|
"learning_rate": 2.120137987789252e-05, |
|
"loss": 0.5257, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.2528301886792452, |
|
"grad_norm": 0.11577396094799042, |
|
"learning_rate": 2.1023951779983408e-05, |
|
"loss": 0.5156, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.2578616352201257, |
|
"grad_norm": 0.11911614239215851, |
|
"learning_rate": 2.0846961688665158e-05, |
|
"loss": 0.5189, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2578616352201257, |
|
"eval_loss": 0.5771186947822571, |
|
"eval_runtime": 410.8096, |
|
"eval_samples_per_second": 26.061, |
|
"eval_steps_per_second": 0.204, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.2628930817610062, |
|
"grad_norm": 0.12026111036539078, |
|
"learning_rate": 2.0670421574637182e-05, |
|
"loss": 0.4965, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.2679245283018867, |
|
"grad_norm": 0.11090611666440964, |
|
"learning_rate": 2.0494343378164736e-05, |
|
"loss": 0.4924, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.2729559748427672, |
|
"grad_norm": 0.11498820036649704, |
|
"learning_rate": 2.0318739008271327e-05, |
|
"loss": 0.5069, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.2779874213836477, |
|
"grad_norm": 0.11681642383337021, |
|
"learning_rate": 2.014362034193326e-05, |
|
"loss": 0.5208, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.2830188679245282, |
|
"grad_norm": 0.11289618164300919, |
|
"learning_rate": 1.9968999223276406e-05, |
|
"loss": 0.497, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.2880503144654087, |
|
"grad_norm": 0.11700893938541412, |
|
"learning_rate": 1.979488746277503e-05, |
|
"loss": 0.4872, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.2930817610062892, |
|
"grad_norm": 0.11669134348630905, |
|
"learning_rate": 1.9621296836453025e-05, |
|
"loss": 0.5117, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.2981132075471697, |
|
"grad_norm": 0.11578242480754852, |
|
"learning_rate": 1.944823908508745e-05, |
|
"loss": 0.5046, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.3031446540880502, |
|
"grad_norm": 0.11336881667375565, |
|
"learning_rate": 1.9275725913414483e-05, |
|
"loss": 0.4828, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.3081761006289307, |
|
"grad_norm": 0.1218356043100357, |
|
"learning_rate": 1.910376898933769e-05, |
|
"loss": 0.5173, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3081761006289307, |
|
"eval_loss": 0.5762000679969788, |
|
"eval_runtime": 410.9091, |
|
"eval_samples_per_second": 26.054, |
|
"eval_steps_per_second": 0.204, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.3132075471698113, |
|
"grad_norm": 0.11644181609153748, |
|
"learning_rate": 1.8932379943138916e-05, |
|
"loss": 0.5002, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.3182389937106918, |
|
"grad_norm": 0.11215106397867203, |
|
"learning_rate": 1.8761570366691684e-05, |
|
"loss": 0.4808, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.3232704402515723, |
|
"grad_norm": 0.11506900936365128, |
|
"learning_rate": 1.8591351812677144e-05, |
|
"loss": 0.4915, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.3283018867924528, |
|
"grad_norm": 0.11646901071071625, |
|
"learning_rate": 1.8421735793802763e-05, |
|
"loss": 0.5067, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.1217241957783699, |
|
"learning_rate": 1.8252733782023584e-05, |
|
"loss": 0.5105, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.3383647798742138, |
|
"grad_norm": 0.12330880761146545, |
|
"learning_rate": 1.8084357207766406e-05, |
|
"loss": 0.5107, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.3433962264150943, |
|
"grad_norm": 0.10948923975229263, |
|
"learning_rate": 1.7916617459156615e-05, |
|
"loss": 0.4929, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.3484276729559748, |
|
"grad_norm": 0.11415420472621918, |
|
"learning_rate": 1.7749525881248035e-05, |
|
"loss": 0.5123, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.3534591194968553, |
|
"grad_norm": 0.11750365048646927, |
|
"learning_rate": 1.7583093775255516e-05, |
|
"loss": 0.5082, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.3584905660377358, |
|
"grad_norm": 0.11664094030857086, |
|
"learning_rate": 1.741733239779063e-05, |
|
"loss": 0.5048, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3584905660377358, |
|
"eval_loss": 0.5755621194839478, |
|
"eval_runtime": 410.7838, |
|
"eval_samples_per_second": 26.062, |
|
"eval_steps_per_second": 0.204, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.3635220125786163, |
|
"grad_norm": 0.11655986309051514, |
|
"learning_rate": 1.725225296010034e-05, |
|
"loss": 0.4923, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.3685534591194968, |
|
"grad_norm": 0.11432712525129318, |
|
"learning_rate": 1.7087866627308664e-05, |
|
"loss": 0.4976, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.3735849056603773, |
|
"grad_norm": 0.11400482058525085, |
|
"learning_rate": 1.692418451766163e-05, |
|
"loss": 0.5026, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.378616352201258, |
|
"grad_norm": 0.11588790267705917, |
|
"learning_rate": 1.6761217701775207e-05, |
|
"loss": 0.5031, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.3836477987421385, |
|
"grad_norm": 0.11426915228366852, |
|
"learning_rate": 1.6598977201886558e-05, |
|
"loss": 0.5001, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.388679245283019, |
|
"grad_norm": 0.11552328616380692, |
|
"learning_rate": 1.6437473991108585e-05, |
|
"loss": 0.4928, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.3937106918238995, |
|
"grad_norm": 0.11312104761600494, |
|
"learning_rate": 1.6276718992687746e-05, |
|
"loss": 0.4977, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.39874213836478, |
|
"grad_norm": 0.11197475343942642, |
|
"learning_rate": 1.6116723079265263e-05, |
|
"loss": 0.489, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.4037735849056605, |
|
"grad_norm": 0.11652438342571259, |
|
"learning_rate": 1.5957497072141758e-05, |
|
"loss": 0.4971, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.408805031446541, |
|
"grad_norm": 0.1163628101348877, |
|
"learning_rate": 1.579905174054533e-05, |
|
"loss": 0.4986, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.408805031446541, |
|
"eval_loss": 0.5742356777191162, |
|
"eval_runtime": 410.5998, |
|
"eval_samples_per_second": 26.074, |
|
"eval_steps_per_second": 0.205, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.4138364779874215, |
|
"grad_norm": 0.1128329187631607, |
|
"learning_rate": 1.5641397800903222e-05, |
|
"loss": 0.5068, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.418867924528302, |
|
"grad_norm": 0.11648018658161163, |
|
"learning_rate": 1.5484545916116995e-05, |
|
"loss": 0.4958, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.4238993710691825, |
|
"grad_norm": 0.1150885596871376, |
|
"learning_rate": 1.5328506694841334e-05, |
|
"loss": 0.4855, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.428930817610063, |
|
"grad_norm": 0.11181043833494186, |
|
"learning_rate": 1.5173290690766553e-05, |
|
"loss": 0.5114, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.4339622641509435, |
|
"grad_norm": 0.11899517476558685, |
|
"learning_rate": 1.5018908401904785e-05, |
|
"loss": 0.5048, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.438993710691824, |
|
"grad_norm": 0.11897191405296326, |
|
"learning_rate": 1.4865370269879955e-05, |
|
"loss": 0.5308, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.4440251572327045, |
|
"grad_norm": 0.11142674088478088, |
|
"learning_rate": 1.471268667922157e-05, |
|
"loss": 0.4958, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.449056603773585, |
|
"grad_norm": 0.1150866225361824, |
|
"learning_rate": 1.4560867956662336e-05, |
|
"loss": 0.4939, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.4540880503144655, |
|
"grad_norm": 0.11816877871751785, |
|
"learning_rate": 1.4409924370439737e-05, |
|
"loss": 0.4913, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.459119496855346, |
|
"grad_norm": 0.11381349712610245, |
|
"learning_rate": 1.425986612960155e-05, |
|
"loss": 0.5039, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.459119496855346, |
|
"eval_loss": 0.573836624622345, |
|
"eval_runtime": 410.5985, |
|
"eval_samples_per_second": 26.074, |
|
"eval_steps_per_second": 0.205, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.4641509433962265, |
|
"grad_norm": 0.1094905436038971, |
|
"learning_rate": 1.4110703383315326e-05, |
|
"loss": 0.4901, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.469182389937107, |
|
"grad_norm": 0.11396130174398422, |
|
"learning_rate": 1.396244622018199e-05, |
|
"loss": 0.5081, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.4742138364779875, |
|
"grad_norm": 0.1160426139831543, |
|
"learning_rate": 1.3815104667553452e-05, |
|
"loss": 0.4869, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.479245283018868, |
|
"grad_norm": 0.11492225527763367, |
|
"learning_rate": 1.3668688690854453e-05, |
|
"loss": 0.4888, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.4842767295597485, |
|
"grad_norm": 0.11282163113355637, |
|
"learning_rate": 1.3523208192908562e-05, |
|
"loss": 0.4983, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.489308176100629, |
|
"grad_norm": 0.11276757717132568, |
|
"learning_rate": 1.3378673013268336e-05, |
|
"loss": 0.517, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.4943396226415095, |
|
"grad_norm": 0.11005326360464096, |
|
"learning_rate": 1.3235092927549888e-05, |
|
"loss": 0.4933, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.49937106918239, |
|
"grad_norm": 0.11391846090555191, |
|
"learning_rate": 1.3092477646771686e-05, |
|
"loss": 0.5047, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.5044025157232706, |
|
"grad_norm": 0.112746462225914, |
|
"learning_rate": 1.2950836816697753e-05, |
|
"loss": 0.4933, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.509433962264151, |
|
"grad_norm": 0.11274772882461548, |
|
"learning_rate": 1.2810180017185286e-05, |
|
"loss": 0.4928, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.509433962264151, |
|
"eval_loss": 0.5733875632286072, |
|
"eval_runtime": 410.9812, |
|
"eval_samples_per_second": 26.05, |
|
"eval_steps_per_second": 0.204, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.5144654088050316, |
|
"grad_norm": 0.11344069242477417, |
|
"learning_rate": 1.2670516761536705e-05, |
|
"loss": 0.5083, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.519496855345912, |
|
"grad_norm": 0.1206919476389885, |
|
"learning_rate": 1.2531856495856234e-05, |
|
"loss": 0.4931, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.5245283018867926, |
|
"grad_norm": 0.11568621546030045, |
|
"learning_rate": 1.2394208598411026e-05, |
|
"loss": 0.4961, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.529559748427673, |
|
"grad_norm": 0.11288320273160934, |
|
"learning_rate": 1.2257582378996846e-05, |
|
"loss": 0.493, |
|
"step": 608 |
|
}, |
|
{ |
|
"epoch": 1.5345911949685536, |
|
"grad_norm": 0.11447000503540039, |
|
"learning_rate": 1.2121987078308414e-05, |
|
"loss": 0.487, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.539622641509434, |
|
"grad_norm": 0.1167483702301979, |
|
"learning_rate": 1.1987431867314417e-05, |
|
"loss": 0.5078, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 1.5446540880503146, |
|
"grad_norm": 0.11339499801397324, |
|
"learning_rate": 1.1853925846637192e-05, |
|
"loss": 0.5101, |
|
"step": 614 |
|
}, |
|
{ |
|
"epoch": 1.549685534591195, |
|
"grad_norm": 0.11238376796245575, |
|
"learning_rate": 1.1721478045937298e-05, |
|
"loss": 0.5075, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 1.5547169811320756, |
|
"grad_norm": 0.11750028282403946, |
|
"learning_rate": 1.1590097423302684e-05, |
|
"loss": 0.5223, |
|
"step": 618 |
|
}, |
|
{ |
|
"epoch": 1.559748427672956, |
|
"grad_norm": 0.11243315786123276, |
|
"learning_rate": 1.1459792864642889e-05, |
|
"loss": 0.5014, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.559748427672956, |
|
"eval_loss": 0.5725140571594238, |
|
"eval_runtime": 411.304, |
|
"eval_samples_per_second": 26.029, |
|
"eval_steps_per_second": 0.204, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.5647798742138366, |
|
"grad_norm": 0.10880452394485474, |
|
"learning_rate": 1.1330573183088027e-05, |
|
"loss": 0.4946, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 1.569811320754717, |
|
"grad_norm": 0.11715767532587051, |
|
"learning_rate": 1.1202447118392666e-05, |
|
"loss": 0.4934, |
|
"step": 624 |
|
}, |
|
{ |
|
"epoch": 1.5748427672955976, |
|
"grad_norm": 0.1085837110877037, |
|
"learning_rate": 1.1075423336344815e-05, |
|
"loss": 0.4918, |
|
"step": 626 |
|
}, |
|
{ |
|
"epoch": 1.579874213836478, |
|
"grad_norm": 0.11400571465492249, |
|
"learning_rate": 1.0949510428179703e-05, |
|
"loss": 0.4907, |
|
"step": 628 |
|
}, |
|
{ |
|
"epoch": 1.5849056603773586, |
|
"grad_norm": 0.11114535480737686, |
|
"learning_rate": 1.0824716909998783e-05, |
|
"loss": 0.504, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.589937106918239, |
|
"grad_norm": 0.10678807646036148, |
|
"learning_rate": 1.0701051222193734e-05, |
|
"loss": 0.4757, |
|
"step": 632 |
|
}, |
|
{ |
|
"epoch": 1.5949685534591196, |
|
"grad_norm": 0.11523126810789108, |
|
"learning_rate": 1.0578521728875578e-05, |
|
"loss": 0.5019, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.11389489471912384, |
|
"learning_rate": 1.0457136717308988e-05, |
|
"loss": 0.5162, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 1.6050314465408806, |
|
"grad_norm": 0.11754269152879715, |
|
"learning_rate": 1.0336904397351794e-05, |
|
"loss": 0.4991, |
|
"step": 638 |
|
}, |
|
{ |
|
"epoch": 1.610062893081761, |
|
"grad_norm": 0.11521238088607788, |
|
"learning_rate": 1.021783290089966e-05, |
|
"loss": 0.5041, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.610062893081761, |
|
"eval_loss": 0.57233065366745, |
|
"eval_runtime": 417.7875, |
|
"eval_samples_per_second": 25.625, |
|
"eval_steps_per_second": 0.201, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.6150943396226416, |
|
"grad_norm": 0.1117156520485878, |
|
"learning_rate": 1.009993028133615e-05, |
|
"loss": 0.4919, |
|
"step": 642 |
|
}, |
|
{ |
|
"epoch": 1.620125786163522, |
|
"grad_norm": 0.11549975723028183, |
|
"learning_rate": 9.983204512988004e-06, |
|
"loss": 0.4988, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 1.6251572327044026, |
|
"grad_norm": 0.11243242025375366, |
|
"learning_rate": 9.867663490585783e-06, |
|
"loss": 0.5128, |
|
"step": 646 |
|
}, |
|
{ |
|
"epoch": 1.630188679245283, |
|
"grad_norm": 0.11129079759120941, |
|
"learning_rate": 9.753315028729948e-06, |
|
"loss": 0.4893, |
|
"step": 648 |
|
}, |
|
{ |
|
"epoch": 1.6352201257861636, |
|
"grad_norm": 0.11360695213079453, |
|
"learning_rate": 9.640166861362268e-06, |
|
"loss": 0.503, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.640251572327044, |
|
"grad_norm": 0.11027677357196808, |
|
"learning_rate": 9.528226641242804e-06, |
|
"loss": 0.4933, |
|
"step": 652 |
|
}, |
|
{ |
|
"epoch": 1.6452830188679246, |
|
"grad_norm": 0.11328162252902985, |
|
"learning_rate": 9.417501939432257e-06, |
|
"loss": 0.4969, |
|
"step": 654 |
|
}, |
|
{ |
|
"epoch": 1.650314465408805, |
|
"grad_norm": 0.111870676279068, |
|
"learning_rate": 9.308000244779918e-06, |
|
"loss": 0.5009, |
|
"step": 656 |
|
}, |
|
{ |
|
"epoch": 1.6553459119496856, |
|
"grad_norm": 0.11578749120235443, |
|
"learning_rate": 9.19972896341717e-06, |
|
"loss": 0.5226, |
|
"step": 658 |
|
}, |
|
{ |
|
"epoch": 1.6603773584905661, |
|
"grad_norm": 0.11840783059597015, |
|
"learning_rate": 9.09269541825658e-06, |
|
"loss": 0.4876, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.6603773584905661, |
|
"eval_loss": 0.5716937184333801, |
|
"eval_runtime": 411.8982, |
|
"eval_samples_per_second": 25.992, |
|
"eval_steps_per_second": 0.204, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.6654088050314466, |
|
"grad_norm": 0.10933776944875717, |
|
"learning_rate": 8.98690684849659e-06, |
|
"loss": 0.5217, |
|
"step": 662 |
|
}, |
|
{ |
|
"epoch": 1.6704402515723271, |
|
"grad_norm": 0.11809239536523819, |
|
"learning_rate": 8.882370409131924e-06, |
|
"loss": 0.5182, |
|
"step": 664 |
|
}, |
|
{ |
|
"epoch": 1.6754716981132076, |
|
"grad_norm": 0.1144753098487854, |
|
"learning_rate": 8.779093170469629e-06, |
|
"loss": 0.4999, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 1.6805031446540881, |
|
"grad_norm": 0.11396916210651398, |
|
"learning_rate": 8.677082117650906e-06, |
|
"loss": 0.507, |
|
"step": 668 |
|
}, |
|
{ |
|
"epoch": 1.6855345911949686, |
|
"grad_norm": 0.11068397760391235, |
|
"learning_rate": 8.576344150178653e-06, |
|
"loss": 0.5136, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.6905660377358491, |
|
"grad_norm": 0.1053067147731781, |
|
"learning_rate": 8.47688608145083e-06, |
|
"loss": 0.4907, |
|
"step": 672 |
|
}, |
|
{ |
|
"epoch": 1.6955974842767296, |
|
"grad_norm": 0.1102994978427887, |
|
"learning_rate": 8.378714638299628e-06, |
|
"loss": 0.4881, |
|
"step": 674 |
|
}, |
|
{ |
|
"epoch": 1.7006289308176101, |
|
"grad_norm": 0.10971739888191223, |
|
"learning_rate": 8.28183646053649e-06, |
|
"loss": 0.5176, |
|
"step": 676 |
|
}, |
|
{ |
|
"epoch": 1.7056603773584906, |
|
"grad_norm": 0.11169516295194626, |
|
"learning_rate": 8.186258100503058e-06, |
|
"loss": 0.5102, |
|
"step": 678 |
|
}, |
|
{ |
|
"epoch": 1.7106918238993711, |
|
"grad_norm": 0.1112278550863266, |
|
"learning_rate": 8.091986022627978e-06, |
|
"loss": 0.5272, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.7106918238993711, |
|
"eval_loss": 0.5712010860443115, |
|
"eval_runtime": 411.4247, |
|
"eval_samples_per_second": 26.022, |
|
"eval_steps_per_second": 0.204, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.7157232704402516, |
|
"grad_norm": 0.11466188728809357, |
|
"learning_rate": 7.999026602989687e-06, |
|
"loss": 0.4974, |
|
"step": 682 |
|
}, |
|
{ |
|
"epoch": 1.7207547169811321, |
|
"grad_norm": 0.12257977575063705, |
|
"learning_rate": 7.907386128885182e-06, |
|
"loss": 0.4946, |
|
"step": 684 |
|
}, |
|
{ |
|
"epoch": 1.7257861635220126, |
|
"grad_norm": 0.12135323882102966, |
|
"learning_rate": 7.817070798404755e-06, |
|
"loss": 0.5374, |
|
"step": 686 |
|
}, |
|
{ |
|
"epoch": 1.7308176100628931, |
|
"grad_norm": 0.11566798388957977, |
|
"learning_rate": 7.728086720012813e-06, |
|
"loss": 0.5048, |
|
"step": 688 |
|
}, |
|
{ |
|
"epoch": 1.7358490566037736, |
|
"grad_norm": 0.11244137585163116, |
|
"learning_rate": 7.640439912134711e-06, |
|
"loss": 0.5169, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.7408805031446541, |
|
"grad_norm": 0.1125202625989914, |
|
"learning_rate": 7.554136302749705e-06, |
|
"loss": 0.5076, |
|
"step": 692 |
|
}, |
|
{ |
|
"epoch": 1.7459119496855346, |
|
"grad_norm": 0.1143079325556755, |
|
"learning_rate": 7.469181728990013e-06, |
|
"loss": 0.4961, |
|
"step": 694 |
|
}, |
|
{ |
|
"epoch": 1.7509433962264151, |
|
"grad_norm": 0.11676593869924545, |
|
"learning_rate": 7.385581936746035e-06, |
|
"loss": 0.5003, |
|
"step": 696 |
|
}, |
|
{ |
|
"epoch": 1.7559748427672957, |
|
"grad_norm": 0.1095028892159462, |
|
"learning_rate": 7.303342580277696e-06, |
|
"loss": 0.4755, |
|
"step": 698 |
|
}, |
|
{ |
|
"epoch": 1.7610062893081762, |
|
"grad_norm": 0.11439331620931625, |
|
"learning_rate": 7.222469221832061e-06, |
|
"loss": 0.5057, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7610062893081762, |
|
"eval_loss": 0.5707039833068848, |
|
"eval_runtime": 411.7854, |
|
"eval_samples_per_second": 25.999, |
|
"eval_steps_per_second": 0.204, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7660377358490567, |
|
"grad_norm": 0.11388733237981796, |
|
"learning_rate": 7.142967331267113e-06, |
|
"loss": 0.4748, |
|
"step": 702 |
|
}, |
|
{ |
|
"epoch": 1.7710691823899372, |
|
"grad_norm": 0.11583738774061203, |
|
"learning_rate": 7.064842285681781e-06, |
|
"loss": 0.494, |
|
"step": 704 |
|
}, |
|
{ |
|
"epoch": 1.7761006289308177, |
|
"grad_norm": 0.11597929149866104, |
|
"learning_rate": 6.988099369052318e-06, |
|
"loss": 0.5106, |
|
"step": 706 |
|
}, |
|
{ |
|
"epoch": 1.7811320754716982, |
|
"grad_norm": 0.1117326021194458, |
|
"learning_rate": 6.9127437718748465e-06, |
|
"loss": 0.4844, |
|
"step": 708 |
|
}, |
|
{ |
|
"epoch": 1.7861635220125787, |
|
"grad_norm": 0.11276806890964508, |
|
"learning_rate": 6.838780590814366e-06, |
|
"loss": 0.5221, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.7911949685534592, |
|
"grad_norm": 0.11557289958000183, |
|
"learning_rate": 6.7662148283599955e-06, |
|
"loss": 0.5021, |
|
"step": 712 |
|
}, |
|
{ |
|
"epoch": 1.7962264150943397, |
|
"grad_norm": 0.11254438757896423, |
|
"learning_rate": 6.695051392486652e-06, |
|
"loss": 0.4999, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 1.8012578616352202, |
|
"grad_norm": 0.11114822328090668, |
|
"learning_rate": 6.625295096323097e-06, |
|
"loss": 0.4849, |
|
"step": 716 |
|
}, |
|
{ |
|
"epoch": 1.8062893081761007, |
|
"grad_norm": 0.11419788002967834, |
|
"learning_rate": 6.556950657826405e-06, |
|
"loss": 0.5227, |
|
"step": 718 |
|
}, |
|
{ |
|
"epoch": 1.8113207547169812, |
|
"grad_norm": 0.11834213882684708, |
|
"learning_rate": 6.490022699462844e-06, |
|
"loss": 0.5043, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.8113207547169812, |
|
"eval_loss": 0.5703166723251343, |
|
"eval_runtime": 411.34, |
|
"eval_samples_per_second": 26.027, |
|
"eval_steps_per_second": 0.204, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.8163522012578617, |
|
"grad_norm": 0.11034736037254333, |
|
"learning_rate": 6.424515747895265e-06, |
|
"loss": 0.48, |
|
"step": 722 |
|
}, |
|
{ |
|
"epoch": 1.8213836477987422, |
|
"grad_norm": 0.11254922300577164, |
|
"learning_rate": 6.360434233676926e-06, |
|
"loss": 0.4864, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 1.8264150943396227, |
|
"grad_norm": 0.10830461978912354, |
|
"learning_rate": 6.297782490951833e-06, |
|
"loss": 0.4943, |
|
"step": 726 |
|
}, |
|
{ |
|
"epoch": 1.8314465408805032, |
|
"grad_norm": 0.1077997013926506, |
|
"learning_rate": 6.236564757161608e-06, |
|
"loss": 0.4865, |
|
"step": 728 |
|
}, |
|
{ |
|
"epoch": 1.8364779874213837, |
|
"grad_norm": 0.11505385488271713, |
|
"learning_rate": 6.176785172758871e-06, |
|
"loss": 0.5039, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.8415094339622642, |
|
"grad_norm": 0.12043328583240509, |
|
"learning_rate": 6.118447780927233e-06, |
|
"loss": 0.4909, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 1.8465408805031447, |
|
"grad_norm": 0.11934798955917358, |
|
"learning_rate": 6.0615565273078025e-06, |
|
"loss": 0.4978, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 1.8515723270440252, |
|
"grad_norm": 0.11153744161128998, |
|
"learning_rate": 6.006115259732345e-06, |
|
"loss": 0.4924, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 1.8566037735849057, |
|
"grad_norm": 0.11369643360376358, |
|
"learning_rate": 5.952127727963029e-06, |
|
"loss": 0.4938, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 1.8616352201257862, |
|
"grad_norm": 0.1063813716173172, |
|
"learning_rate": 5.899597583438808e-06, |
|
"loss": 0.5059, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.8616352201257862, |
|
"eval_loss": 0.5707431435585022, |
|
"eval_runtime": 411.3909, |
|
"eval_samples_per_second": 26.024, |
|
"eval_steps_per_second": 0.204, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.8666666666666667, |
|
"grad_norm": 0.12777185440063477, |
|
"learning_rate": 5.848528379028456e-06, |
|
"loss": 0.5138, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 1.8716981132075472, |
|
"grad_norm": 0.11385183781385422, |
|
"learning_rate": 5.798923568790283e-06, |
|
"loss": 0.5101, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 1.8767295597484277, |
|
"grad_norm": 0.11212068051099777, |
|
"learning_rate": 5.750786507738497e-06, |
|
"loss": 0.4845, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 1.8817610062893082, |
|
"grad_norm": 0.11255759000778198, |
|
"learning_rate": 5.704120451616305e-06, |
|
"loss": 0.5019, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 1.8867924528301887, |
|
"grad_norm": 0.11542278528213501, |
|
"learning_rate": 5.6589285566757095e-06, |
|
"loss": 0.5014, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.8918238993710692, |
|
"grad_norm": 0.11562332510948181, |
|
"learning_rate": 5.61521387946403e-06, |
|
"loss": 0.5077, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 1.8968553459119497, |
|
"grad_norm": 0.10956519842147827, |
|
"learning_rate": 5.572979376617183e-06, |
|
"loss": 0.4842, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 1.9018867924528302, |
|
"grad_norm": 0.11526080965995789, |
|
"learning_rate": 5.532227904659695e-06, |
|
"loss": 0.488, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 1.9069182389937107, |
|
"grad_norm": 0.11320438235998154, |
|
"learning_rate": 5.49296221981152e-06, |
|
"loss": 0.5021, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 1.9119496855345912, |
|
"grad_norm": 0.11186771839857101, |
|
"learning_rate": 5.455184977801612e-06, |
|
"loss": 0.4996, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.9119496855345912, |
|
"eval_loss": 0.5699969530105591, |
|
"eval_runtime": 411.4136, |
|
"eval_samples_per_second": 26.022, |
|
"eval_steps_per_second": 0.204, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.9169811320754717, |
|
"grad_norm": 0.1088699921965599, |
|
"learning_rate": 5.418898733688302e-06, |
|
"loss": 0.4933, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 1.9220125786163522, |
|
"grad_norm": 0.11521943658590317, |
|
"learning_rate": 5.384105941686499e-06, |
|
"loss": 0.5018, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 1.9270440251572327, |
|
"grad_norm": 0.1104261726140976, |
|
"learning_rate": 5.350808955001693e-06, |
|
"loss": 0.4945, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 1.9320754716981132, |
|
"grad_norm": 0.11762084811925888, |
|
"learning_rate": 5.3190100256707905e-06, |
|
"loss": 0.4892, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 1.9371069182389937, |
|
"grad_norm": 0.1136779636144638, |
|
"learning_rate": 5.288711304409814e-06, |
|
"loss": 0.5075, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.9421383647798742, |
|
"grad_norm": 0.11609724909067154, |
|
"learning_rate": 5.259914840468416e-06, |
|
"loss": 0.5052, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 1.9471698113207547, |
|
"grad_norm": 0.11565674096345901, |
|
"learning_rate": 5.2326225814913e-06, |
|
"loss": 0.5054, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 1.9522012578616352, |
|
"grad_norm": 0.10971004515886307, |
|
"learning_rate": 5.206836373386482e-06, |
|
"loss": 0.4711, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 1.9572327044025157, |
|
"grad_norm": 0.11187240481376648, |
|
"learning_rate": 5.182557960200441e-06, |
|
"loss": 0.4946, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 1.9622641509433962, |
|
"grad_norm": 0.11195476353168488, |
|
"learning_rate": 5.1597889840001635e-06, |
|
"loss": 0.4975, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.9622641509433962, |
|
"eval_loss": 0.5697274208068848, |
|
"eval_runtime": 411.2337, |
|
"eval_samples_per_second": 26.034, |
|
"eval_steps_per_second": 0.204, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.9672955974842767, |
|
"grad_norm": 0.11138684302568436, |
|
"learning_rate": 5.138530984762087e-06, |
|
"loss": 0.4915, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 1.9723270440251572, |
|
"grad_norm": 0.10923836380243301, |
|
"learning_rate": 5.118785400267929e-06, |
|
"loss": 0.4855, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.9773584905660377, |
|
"grad_norm": 0.10922150313854218, |
|
"learning_rate": 5.100553566007467e-06, |
|
"loss": 0.4794, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 1.9823899371069182, |
|
"grad_norm": 0.10965920239686966, |
|
"learning_rate": 5.083836715088188e-06, |
|
"loss": 0.4836, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 1.9874213836477987, |
|
"grad_norm": 0.11357378959655762, |
|
"learning_rate": 5.068635978151901e-06, |
|
"loss": 0.4942, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.9924528301886792, |
|
"grad_norm": 0.11112317442893982, |
|
"learning_rate": 5.0549523832982645e-06, |
|
"loss": 0.4939, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 1.9974842767295597, |
|
"grad_norm": 0.11381904780864716, |
|
"learning_rate": 5.042786856015253e-06, |
|
"loss": 0.5013, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 1.9974842767295597, |
|
"step": 794, |
|
"total_flos": 946984181301248.0, |
|
"train_loss": 0.5561736259862818, |
|
"train_runtime": 112418.4712, |
|
"train_samples_per_second": 3.619, |
|
"train_steps_per_second": 0.007 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 794, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 946984181301248.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |