|
{ |
|
"best_metric": 0.4903908967971802, |
|
"best_model_checkpoint": "./beans_outputs/checkpoint-621", |
|
"epoch": 50.0, |
|
"eval_steps": 500, |
|
"global_step": 1150, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 3.492309093475342, |
|
"learning_rate": 1.9826086956521742e-05, |
|
"loss": 0.7305, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 4.003854751586914, |
|
"learning_rate": 1.965217391304348e-05, |
|
"loss": 0.6794, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.6062992125984252, |
|
"eval_loss": 0.6559741497039795, |
|
"eval_runtime": 4.6669, |
|
"eval_samples_per_second": 54.425, |
|
"eval_steps_per_second": 0.857, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 4.675185680389404, |
|
"learning_rate": 1.947826086956522e-05, |
|
"loss": 0.6399, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 4.350035667419434, |
|
"learning_rate": 1.9304347826086957e-05, |
|
"loss": 0.6215, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.7362204724409449, |
|
"eval_loss": 0.5833372473716736, |
|
"eval_runtime": 3.7107, |
|
"eval_samples_per_second": 68.451, |
|
"eval_steps_per_second": 1.078, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"grad_norm": 3.180147886276245, |
|
"learning_rate": 1.9130434782608697e-05, |
|
"loss": 0.5964, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 3.246190309524536, |
|
"learning_rate": 1.8956521739130434e-05, |
|
"loss": 0.5784, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.5489528179168701, |
|
"eval_runtime": 4.3517, |
|
"eval_samples_per_second": 58.367, |
|
"eval_steps_per_second": 0.919, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 3.0434782608695654, |
|
"grad_norm": 2.3400914669036865, |
|
"learning_rate": 1.8782608695652175e-05, |
|
"loss": 0.5412, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 3.5264837741851807, |
|
"learning_rate": 1.8608695652173912e-05, |
|
"loss": 0.5659, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.9130434782608696, |
|
"grad_norm": 4.993140697479248, |
|
"learning_rate": 1.8434782608695653e-05, |
|
"loss": 0.5347, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.5305963754653931, |
|
"eval_runtime": 3.9321, |
|
"eval_samples_per_second": 64.596, |
|
"eval_steps_per_second": 1.017, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 2.20806622505188, |
|
"learning_rate": 1.8260869565217393e-05, |
|
"loss": 0.5086, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.782608695652174, |
|
"grad_norm": 4.256261825561523, |
|
"learning_rate": 1.808695652173913e-05, |
|
"loss": 0.5307, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.5235078930854797, |
|
"eval_runtime": 3.7141, |
|
"eval_samples_per_second": 68.389, |
|
"eval_steps_per_second": 1.077, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.217391304347826, |
|
"grad_norm": 2.6543545722961426, |
|
"learning_rate": 1.791304347826087e-05, |
|
"loss": 0.5085, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.6521739130434785, |
|
"grad_norm": 4.274487495422363, |
|
"learning_rate": 1.773913043478261e-05, |
|
"loss": 0.5391, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.5090441703796387, |
|
"eval_runtime": 4.3438, |
|
"eval_samples_per_second": 58.475, |
|
"eval_steps_per_second": 0.921, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 6.086956521739131, |
|
"grad_norm": 3.147414445877075, |
|
"learning_rate": 1.756521739130435e-05, |
|
"loss": 0.4977, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 6.521739130434782, |
|
"grad_norm": 4.254673004150391, |
|
"learning_rate": 1.739130434782609e-05, |
|
"loss": 0.5297, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 6.956521739130435, |
|
"grad_norm": 2.083784818649292, |
|
"learning_rate": 1.721739130434783e-05, |
|
"loss": 0.48, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5108471512794495, |
|
"eval_runtime": 4.1877, |
|
"eval_samples_per_second": 60.653, |
|
"eval_steps_per_second": 0.955, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 7.391304347826087, |
|
"grad_norm": 4.193545818328857, |
|
"learning_rate": 1.7043478260869566e-05, |
|
"loss": 0.4826, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 7.826086956521739, |
|
"grad_norm": 2.05076003074646, |
|
"learning_rate": 1.6869565217391307e-05, |
|
"loss": 0.473, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.5028324127197266, |
|
"eval_runtime": 3.7021, |
|
"eval_samples_per_second": 68.61, |
|
"eval_steps_per_second": 1.08, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 8.26086956521739, |
|
"grad_norm": 3.007233142852783, |
|
"learning_rate": 1.6695652173913044e-05, |
|
"loss": 0.5255, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 2.196945905685425, |
|
"learning_rate": 1.6521739130434785e-05, |
|
"loss": 0.5014, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5054498314857483, |
|
"eval_runtime": 4.225, |
|
"eval_samples_per_second": 60.119, |
|
"eval_steps_per_second": 0.947, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 9.130434782608695, |
|
"grad_norm": 2.184353828430176, |
|
"learning_rate": 1.6347826086956525e-05, |
|
"loss": 0.5044, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 9.565217391304348, |
|
"grad_norm": 4.106619358062744, |
|
"learning_rate": 1.6173913043478262e-05, |
|
"loss": 0.4822, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 4.000082969665527, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.496, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5039955973625183, |
|
"eval_runtime": 5.3498, |
|
"eval_samples_per_second": 47.478, |
|
"eval_steps_per_second": 0.748, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.434782608695652, |
|
"grad_norm": 5.726933002471924, |
|
"learning_rate": 1.582608695652174e-05, |
|
"loss": 0.5101, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 10.869565217391305, |
|
"grad_norm": 4.100568771362305, |
|
"learning_rate": 1.565217391304348e-05, |
|
"loss": 0.4688, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.4972316324710846, |
|
"eval_runtime": 3.7607, |
|
"eval_samples_per_second": 67.54, |
|
"eval_steps_per_second": 1.064, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 11.304347826086957, |
|
"grad_norm": 2.6119587421417236, |
|
"learning_rate": 1.5478260869565217e-05, |
|
"loss": 0.485, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 11.73913043478261, |
|
"grad_norm": 3.003861427307129, |
|
"learning_rate": 1.5304347826086958e-05, |
|
"loss": 0.4943, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.49771231412887573, |
|
"eval_runtime": 4.9203, |
|
"eval_samples_per_second": 51.622, |
|
"eval_steps_per_second": 0.813, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 12.173913043478262, |
|
"grad_norm": 2.9490270614624023, |
|
"learning_rate": 1.5130434782608697e-05, |
|
"loss": 0.4505, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 12.608695652173914, |
|
"grad_norm": 2.8131847381591797, |
|
"learning_rate": 1.4956521739130436e-05, |
|
"loss": 0.5012, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5057242512702942, |
|
"eval_runtime": 3.7024, |
|
"eval_samples_per_second": 68.605, |
|
"eval_steps_per_second": 1.08, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 17.65978240966797, |
|
"learning_rate": 1.4782608695652174e-05, |
|
"loss": 0.4768, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 13.478260869565217, |
|
"grad_norm": 2.085587978363037, |
|
"learning_rate": 1.4608695652173915e-05, |
|
"loss": 0.4729, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 13.91304347826087, |
|
"grad_norm": 4.59744119644165, |
|
"learning_rate": 1.4434782608695654e-05, |
|
"loss": 0.4639, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5010089874267578, |
|
"eval_runtime": 3.7018, |
|
"eval_samples_per_second": 68.616, |
|
"eval_steps_per_second": 1.081, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 14.347826086956522, |
|
"grad_norm": 2.4057395458221436, |
|
"learning_rate": 1.4260869565217392e-05, |
|
"loss": 0.4751, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 14.782608695652174, |
|
"grad_norm": 3.549567222595215, |
|
"learning_rate": 1.4086956521739133e-05, |
|
"loss": 0.4709, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.4948899447917938, |
|
"eval_runtime": 4.9714, |
|
"eval_samples_per_second": 51.092, |
|
"eval_steps_per_second": 0.805, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 15.217391304347826, |
|
"grad_norm": 6.705427646636963, |
|
"learning_rate": 1.391304347826087e-05, |
|
"loss": 0.4379, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 15.652173913043478, |
|
"grad_norm": 2.444533348083496, |
|
"learning_rate": 1.373913043478261e-05, |
|
"loss": 0.4888, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.49550917744636536, |
|
"eval_runtime": 3.6768, |
|
"eval_samples_per_second": 69.081, |
|
"eval_steps_per_second": 1.088, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 16.08695652173913, |
|
"grad_norm": 5.470461845397949, |
|
"learning_rate": 1.3565217391304348e-05, |
|
"loss": 0.4952, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 16.52173913043478, |
|
"grad_norm": 2.0678608417510986, |
|
"learning_rate": 1.3391304347826088e-05, |
|
"loss": 0.4784, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 16.956521739130434, |
|
"grad_norm": 6.63480806350708, |
|
"learning_rate": 1.3217391304347827e-05, |
|
"loss": 0.4594, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.49856194853782654, |
|
"eval_runtime": 3.7219, |
|
"eval_samples_per_second": 68.245, |
|
"eval_steps_per_second": 1.075, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 4.448991298675537, |
|
"learning_rate": 1.3043478260869566e-05, |
|
"loss": 0.4607, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 17.82608695652174, |
|
"grad_norm": 2.716780424118042, |
|
"learning_rate": 1.2869565217391305e-05, |
|
"loss": 0.4745, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.501070499420166, |
|
"eval_runtime": 4.5054, |
|
"eval_samples_per_second": 56.377, |
|
"eval_steps_per_second": 0.888, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 18.26086956521739, |
|
"grad_norm": 2.406355857849121, |
|
"learning_rate": 1.2695652173913045e-05, |
|
"loss": 0.4639, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 18.695652173913043, |
|
"grad_norm": 5.627669811248779, |
|
"learning_rate": 1.2521739130434784e-05, |
|
"loss": 0.4667, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.4928434491157532, |
|
"eval_runtime": 4.0475, |
|
"eval_samples_per_second": 62.756, |
|
"eval_steps_per_second": 0.988, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 19.130434782608695, |
|
"grad_norm": 4.074652671813965, |
|
"learning_rate": 1.2347826086956523e-05, |
|
"loss": 0.4671, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 19.565217391304348, |
|
"grad_norm": 5.88148832321167, |
|
"learning_rate": 1.2173913043478263e-05, |
|
"loss": 0.4442, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 3.00347900390625, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.4551, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5055357217788696, |
|
"eval_runtime": 3.6885, |
|
"eval_samples_per_second": 68.862, |
|
"eval_steps_per_second": 1.084, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 20.434782608695652, |
|
"grad_norm": 10.164237976074219, |
|
"learning_rate": 1.182608695652174e-05, |
|
"loss": 0.4657, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 20.869565217391305, |
|
"grad_norm": 2.1962711811065674, |
|
"learning_rate": 1.1652173913043478e-05, |
|
"loss": 0.4657, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.4928124248981476, |
|
"eval_runtime": 4.4478, |
|
"eval_samples_per_second": 57.107, |
|
"eval_steps_per_second": 0.899, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 21.304347826086957, |
|
"grad_norm": 5.0302228927612305, |
|
"learning_rate": 1.1478260869565218e-05, |
|
"loss": 0.4564, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 3.5275819301605225, |
|
"learning_rate": 1.1304347826086957e-05, |
|
"loss": 0.4818, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.5001721978187561, |
|
"eval_runtime": 4.0355, |
|
"eval_samples_per_second": 62.942, |
|
"eval_steps_per_second": 0.991, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 22.17391304347826, |
|
"grad_norm": 6.920666694641113, |
|
"learning_rate": 1.1130434782608696e-05, |
|
"loss": 0.4608, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 22.608695652173914, |
|
"grad_norm": 2.2840707302093506, |
|
"learning_rate": 1.0956521739130435e-05, |
|
"loss": 0.4633, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.49459317326545715, |
|
"eval_runtime": 3.7179, |
|
"eval_samples_per_second": 68.319, |
|
"eval_steps_per_second": 1.076, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 23.043478260869566, |
|
"grad_norm": 6.509201526641846, |
|
"learning_rate": 1.0782608695652175e-05, |
|
"loss": 0.4694, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 23.47826086956522, |
|
"grad_norm": 2.403275489807129, |
|
"learning_rate": 1.0608695652173914e-05, |
|
"loss": 0.4874, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 23.91304347826087, |
|
"grad_norm": 2.1320598125457764, |
|
"learning_rate": 1.0434782608695653e-05, |
|
"loss": 0.4779, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.49417200684547424, |
|
"eval_runtime": 4.3215, |
|
"eval_samples_per_second": 58.776, |
|
"eval_steps_per_second": 0.926, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 24.347826086956523, |
|
"grad_norm": 3.7421488761901855, |
|
"learning_rate": 1.0260869565217393e-05, |
|
"loss": 0.4579, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 24.782608695652176, |
|
"grad_norm": 3.07060170173645, |
|
"learning_rate": 1.008695652173913e-05, |
|
"loss": 0.4718, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.49625155329704285, |
|
"eval_runtime": 5.5612, |
|
"eval_samples_per_second": 45.674, |
|
"eval_steps_per_second": 0.719, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 25.217391304347824, |
|
"grad_norm": 4.446998596191406, |
|
"learning_rate": 9.913043478260871e-06, |
|
"loss": 0.443, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 25.652173913043477, |
|
"grad_norm": 2.4786624908447266, |
|
"learning_rate": 9.73913043478261e-06, |
|
"loss": 0.4511, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5011107325553894, |
|
"eval_runtime": 3.7637, |
|
"eval_samples_per_second": 67.487, |
|
"eval_steps_per_second": 1.063, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 26.08695652173913, |
|
"grad_norm": 5.552999019622803, |
|
"learning_rate": 9.565217391304349e-06, |
|
"loss": 0.4631, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 26.52173913043478, |
|
"grad_norm": 5.050811290740967, |
|
"learning_rate": 9.391304347826087e-06, |
|
"loss": 0.4564, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 26.956521739130434, |
|
"grad_norm": 2.0711512565612793, |
|
"learning_rate": 9.217391304347826e-06, |
|
"loss": 0.4798, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.7874015748031497, |
|
"eval_loss": 0.4903908967971802, |
|
"eval_runtime": 4.9056, |
|
"eval_samples_per_second": 51.777, |
|
"eval_steps_per_second": 0.815, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 27.391304347826086, |
|
"grad_norm": 4.117509365081787, |
|
"learning_rate": 9.043478260869565e-06, |
|
"loss": 0.4411, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 27.82608695652174, |
|
"grad_norm": 4.448685646057129, |
|
"learning_rate": 8.869565217391306e-06, |
|
"loss": 0.4868, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.4982087016105652, |
|
"eval_runtime": 3.7322, |
|
"eval_samples_per_second": 68.057, |
|
"eval_steps_per_second": 1.072, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 28.26086956521739, |
|
"grad_norm": 3.0993807315826416, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 0.4414, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 28.695652173913043, |
|
"grad_norm": 4.982347011566162, |
|
"learning_rate": 8.521739130434783e-06, |
|
"loss": 0.4653, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.7874015748031497, |
|
"eval_loss": 0.498798668384552, |
|
"eval_runtime": 3.7347, |
|
"eval_samples_per_second": 68.012, |
|
"eval_steps_per_second": 1.071, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 29.130434782608695, |
|
"grad_norm": 3.081833600997925, |
|
"learning_rate": 8.347826086956522e-06, |
|
"loss": 0.4503, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 29.565217391304348, |
|
"grad_norm": 4.352429389953613, |
|
"learning_rate": 8.173913043478263e-06, |
|
"loss": 0.4674, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 5.281393051147461, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.4613, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.49851593375205994, |
|
"eval_runtime": 4.8766, |
|
"eval_samples_per_second": 52.085, |
|
"eval_steps_per_second": 0.82, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 30.434782608695652, |
|
"grad_norm": 2.2079997062683105, |
|
"learning_rate": 7.82608695652174e-06, |
|
"loss": 0.4574, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 30.869565217391305, |
|
"grad_norm": 4.6935858726501465, |
|
"learning_rate": 7.652173913043479e-06, |
|
"loss": 0.4675, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5060083270072937, |
|
"eval_runtime": 3.7305, |
|
"eval_samples_per_second": 68.087, |
|
"eval_steps_per_second": 1.072, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 31.304347826086957, |
|
"grad_norm": 4.8790602684021, |
|
"learning_rate": 7.478260869565218e-06, |
|
"loss": 0.4802, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 31.73913043478261, |
|
"grad_norm": 5.6365485191345215, |
|
"learning_rate": 7.304347826086957e-06, |
|
"loss": 0.4587, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5059147477149963, |
|
"eval_runtime": 3.7699, |
|
"eval_samples_per_second": 67.376, |
|
"eval_steps_per_second": 1.061, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 32.17391304347826, |
|
"grad_norm": 5.480165004730225, |
|
"learning_rate": 7.130434782608696e-06, |
|
"loss": 0.4541, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 32.608695652173914, |
|
"grad_norm": 2.053098440170288, |
|
"learning_rate": 6.956521739130435e-06, |
|
"loss": 0.464, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5041583180427551, |
|
"eval_runtime": 4.9981, |
|
"eval_samples_per_second": 50.82, |
|
"eval_steps_per_second": 0.8, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 33.04347826086956, |
|
"grad_norm": 3.6429481506347656, |
|
"learning_rate": 6.782608695652174e-06, |
|
"loss": 0.454, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 33.47826086956522, |
|
"grad_norm": 2.436143636703491, |
|
"learning_rate": 6.6086956521739135e-06, |
|
"loss": 0.4612, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 33.91304347826087, |
|
"grad_norm": 2.5793776512145996, |
|
"learning_rate": 6.434782608695652e-06, |
|
"loss": 0.4374, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.5063456296920776, |
|
"eval_runtime": 3.7117, |
|
"eval_samples_per_second": 68.432, |
|
"eval_steps_per_second": 1.078, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 34.34782608695652, |
|
"grad_norm": 3.71374773979187, |
|
"learning_rate": 6.260869565217392e-06, |
|
"loss": 0.4667, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 34.78260869565217, |
|
"grad_norm": 4.282368183135986, |
|
"learning_rate": 6.086956521739132e-06, |
|
"loss": 0.4864, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.5039507150650024, |
|
"eval_runtime": 3.6837, |
|
"eval_samples_per_second": 68.952, |
|
"eval_steps_per_second": 1.086, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 35.21739130434783, |
|
"grad_norm": 2.896638870239258, |
|
"learning_rate": 5.91304347826087e-06, |
|
"loss": 0.4922, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 35.65217391304348, |
|
"grad_norm": 2.2342097759246826, |
|
"learning_rate": 5.739130434782609e-06, |
|
"loss": 0.4354, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5108994841575623, |
|
"eval_runtime": 4.9899, |
|
"eval_samples_per_second": 50.902, |
|
"eval_steps_per_second": 0.802, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 36.08695652173913, |
|
"grad_norm": 8.385408401489258, |
|
"learning_rate": 5.565217391304348e-06, |
|
"loss": 0.4585, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 36.52173913043478, |
|
"grad_norm": 2.839411497116089, |
|
"learning_rate": 5.391304347826088e-06, |
|
"loss": 0.4497, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 36.95652173913044, |
|
"grad_norm": 2.479076623916626, |
|
"learning_rate": 5.2173913043478265e-06, |
|
"loss": 0.4655, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.510716438293457, |
|
"eval_runtime": 3.6997, |
|
"eval_samples_per_second": 68.653, |
|
"eval_steps_per_second": 1.081, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 37.391304347826086, |
|
"grad_norm": 2.271686553955078, |
|
"learning_rate": 5.043478260869565e-06, |
|
"loss": 0.4462, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 37.82608695652174, |
|
"grad_norm": 3.4210402965545654, |
|
"learning_rate": 4.869565217391305e-06, |
|
"loss": 0.4691, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.5093376636505127, |
|
"eval_runtime": 3.7287, |
|
"eval_samples_per_second": 68.119, |
|
"eval_steps_per_second": 1.073, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 38.26086956521739, |
|
"grad_norm": 5.694761276245117, |
|
"learning_rate": 4.695652173913044e-06, |
|
"loss": 0.4592, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 38.69565217391305, |
|
"grad_norm": 2.2949883937835693, |
|
"learning_rate": 4.5217391304347826e-06, |
|
"loss": 0.4826, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5044277906417847, |
|
"eval_runtime": 4.9781, |
|
"eval_samples_per_second": 51.024, |
|
"eval_steps_per_second": 0.804, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 39.130434782608695, |
|
"grad_norm": 3.4144210815429688, |
|
"learning_rate": 4.347826086956522e-06, |
|
"loss": 0.4407, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 39.56521739130435, |
|
"grad_norm": 2.22868013381958, |
|
"learning_rate": 4.173913043478261e-06, |
|
"loss": 0.4482, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 3.2193689346313477, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.4577, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.4999626874923706, |
|
"eval_runtime": 3.6952, |
|
"eval_samples_per_second": 68.738, |
|
"eval_steps_per_second": 1.082, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 40.43478260869565, |
|
"grad_norm": 4.500718593597412, |
|
"learning_rate": 3.8260869565217395e-06, |
|
"loss": 0.4585, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 40.869565217391305, |
|
"grad_norm": 1.9281222820281982, |
|
"learning_rate": 3.6521739130434787e-06, |
|
"loss": 0.4636, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.4962589144706726, |
|
"eval_runtime": 3.6977, |
|
"eval_samples_per_second": 68.69, |
|
"eval_steps_per_second": 1.082, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 41.30434782608695, |
|
"grad_norm": 2.193452835083008, |
|
"learning_rate": 3.4782608695652175e-06, |
|
"loss": 0.4306, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 41.73913043478261, |
|
"grad_norm": 2.2370336055755615, |
|
"learning_rate": 3.3043478260869567e-06, |
|
"loss": 0.4361, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.4958040118217468, |
|
"eval_runtime": 4.9548, |
|
"eval_samples_per_second": 51.264, |
|
"eval_steps_per_second": 0.807, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 42.17391304347826, |
|
"grad_norm": 3.6354355812072754, |
|
"learning_rate": 3.130434782608696e-06, |
|
"loss": 0.4514, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 42.608695652173914, |
|
"grad_norm": 1.8955118656158447, |
|
"learning_rate": 2.956521739130435e-06, |
|
"loss": 0.4534, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5007808208465576, |
|
"eval_runtime": 3.7121, |
|
"eval_samples_per_second": 68.424, |
|
"eval_steps_per_second": 1.078, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 43.04347826086956, |
|
"grad_norm": 2.2034902572631836, |
|
"learning_rate": 2.782608695652174e-06, |
|
"loss": 0.4176, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 43.47826086956522, |
|
"grad_norm": 4.387076377868652, |
|
"learning_rate": 2.6086956521739132e-06, |
|
"loss": 0.4748, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 43.91304347826087, |
|
"grad_norm": 5.444644927978516, |
|
"learning_rate": 2.4347826086956525e-06, |
|
"loss": 0.4559, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5025174021720886, |
|
"eval_runtime": 3.7093, |
|
"eval_samples_per_second": 68.476, |
|
"eval_steps_per_second": 1.078, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 44.34782608695652, |
|
"grad_norm": 2.2067017555236816, |
|
"learning_rate": 2.2608695652173913e-06, |
|
"loss": 0.4882, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 44.78260869565217, |
|
"grad_norm": 3.562736988067627, |
|
"learning_rate": 2.0869565217391305e-06, |
|
"loss": 0.4189, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.5014046430587769, |
|
"eval_runtime": 4.9992, |
|
"eval_samples_per_second": 50.808, |
|
"eval_steps_per_second": 0.8, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 45.21739130434783, |
|
"grad_norm": 10.402663230895996, |
|
"learning_rate": 1.9130434782608697e-06, |
|
"loss": 0.4432, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 45.65217391304348, |
|
"grad_norm": 4.949878215789795, |
|
"learning_rate": 1.7391304347826088e-06, |
|
"loss": 0.4861, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.5003762245178223, |
|
"eval_runtime": 3.7019, |
|
"eval_samples_per_second": 68.614, |
|
"eval_steps_per_second": 1.081, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 46.08695652173913, |
|
"grad_norm": 1.938593864440918, |
|
"learning_rate": 1.565217391304348e-06, |
|
"loss": 0.4326, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 46.52173913043478, |
|
"grad_norm": 3.236699342727661, |
|
"learning_rate": 1.391304347826087e-06, |
|
"loss": 0.4726, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 46.95652173913044, |
|
"grad_norm": 3.047184944152832, |
|
"learning_rate": 1.2173913043478262e-06, |
|
"loss": 0.4709, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5004997849464417, |
|
"eval_runtime": 3.7143, |
|
"eval_samples_per_second": 68.384, |
|
"eval_steps_per_second": 1.077, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 47.391304347826086, |
|
"grad_norm": 2.8639461994171143, |
|
"learning_rate": 1.0434782608695653e-06, |
|
"loss": 0.4649, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 47.82608695652174, |
|
"grad_norm": 3.7704715728759766, |
|
"learning_rate": 8.695652173913044e-07, |
|
"loss": 0.4726, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.5007592439651489, |
|
"eval_runtime": 4.8498, |
|
"eval_samples_per_second": 52.373, |
|
"eval_steps_per_second": 0.825, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 48.26086956521739, |
|
"grad_norm": 4.941337585449219, |
|
"learning_rate": 6.956521739130435e-07, |
|
"loss": 0.4314, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 48.69565217391305, |
|
"grad_norm": 3.2265655994415283, |
|
"learning_rate": 5.217391304347826e-07, |
|
"loss": 0.4441, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.4987953305244446, |
|
"eval_runtime": 3.6681, |
|
"eval_samples_per_second": 69.246, |
|
"eval_steps_per_second": 1.09, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 49.130434782608695, |
|
"grad_norm": 3.7678611278533936, |
|
"learning_rate": 3.4782608695652175e-07, |
|
"loss": 0.4571, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 49.56521739130435, |
|
"grad_norm": 3.657460927963257, |
|
"learning_rate": 1.7391304347826088e-07, |
|
"loss": 0.4558, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 3.096832513809204, |
|
"learning_rate": 0.0, |
|
"loss": 0.4579, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.499985009431839, |
|
"eval_runtime": 3.8189, |
|
"eval_samples_per_second": 66.512, |
|
"eval_steps_per_second": 1.047, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"step": 1150, |
|
"total_flos": 2.72467378584576e+17, |
|
"train_loss": 0.4791690407628598, |
|
"train_runtime": 1616.3187, |
|
"train_samples_per_second": 44.391, |
|
"train_steps_per_second": 0.711 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1150, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.72467378584576e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|