{ "best_metric": 0.4903908967971802, "best_model_checkpoint": "./beans_outputs/checkpoint-621", "epoch": 50.0, "eval_steps": 500, "global_step": 1150, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.43478260869565216, "grad_norm": 3.492309093475342, "learning_rate": 1.9826086956521742e-05, "loss": 0.7305, "step": 10 }, { "epoch": 0.8695652173913043, "grad_norm": 4.003854751586914, "learning_rate": 1.965217391304348e-05, "loss": 0.6794, "step": 20 }, { "epoch": 1.0, "eval_accuracy": 0.6062992125984252, "eval_loss": 0.6559741497039795, "eval_runtime": 4.6669, "eval_samples_per_second": 54.425, "eval_steps_per_second": 0.857, "step": 23 }, { "epoch": 1.3043478260869565, "grad_norm": 4.675185680389404, "learning_rate": 1.947826086956522e-05, "loss": 0.6399, "step": 30 }, { "epoch": 1.7391304347826086, "grad_norm": 4.350035667419434, "learning_rate": 1.9304347826086957e-05, "loss": 0.6215, "step": 40 }, { "epoch": 2.0, "eval_accuracy": 0.7362204724409449, "eval_loss": 0.5833372473716736, "eval_runtime": 3.7107, "eval_samples_per_second": 68.451, "eval_steps_per_second": 1.078, "step": 46 }, { "epoch": 2.1739130434782608, "grad_norm": 3.180147886276245, "learning_rate": 1.9130434782608697e-05, "loss": 0.5964, "step": 50 }, { "epoch": 2.608695652173913, "grad_norm": 3.246190309524536, "learning_rate": 1.8956521739130434e-05, "loss": 0.5784, "step": 60 }, { "epoch": 3.0, "eval_accuracy": 0.7598425196850394, "eval_loss": 0.5489528179168701, "eval_runtime": 4.3517, "eval_samples_per_second": 58.367, "eval_steps_per_second": 0.919, "step": 69 }, { "epoch": 3.0434782608695654, "grad_norm": 2.3400914669036865, "learning_rate": 1.8782608695652175e-05, "loss": 0.5412, "step": 70 }, { "epoch": 3.4782608695652173, "grad_norm": 3.5264837741851807, "learning_rate": 1.8608695652173912e-05, "loss": 0.5659, "step": 80 }, { "epoch": 3.9130434782608696, "grad_norm": 4.993140697479248, "learning_rate": 1.8434782608695653e-05, "loss": 0.5347, "step": 90 }, { "epoch": 4.0, "eval_accuracy": 0.7637795275590551, "eval_loss": 0.5305963754653931, "eval_runtime": 3.9321, "eval_samples_per_second": 64.596, "eval_steps_per_second": 1.017, "step": 92 }, { "epoch": 4.3478260869565215, "grad_norm": 2.20806622505188, "learning_rate": 1.8260869565217393e-05, "loss": 0.5086, "step": 100 }, { "epoch": 4.782608695652174, "grad_norm": 4.256261825561523, "learning_rate": 1.808695652173913e-05, "loss": 0.5307, "step": 110 }, { "epoch": 5.0, "eval_accuracy": 0.7637795275590551, "eval_loss": 0.5235078930854797, "eval_runtime": 3.7141, "eval_samples_per_second": 68.389, "eval_steps_per_second": 1.077, "step": 115 }, { "epoch": 5.217391304347826, "grad_norm": 2.6543545722961426, "learning_rate": 1.791304347826087e-05, "loss": 0.5085, "step": 120 }, { "epoch": 5.6521739130434785, "grad_norm": 4.274487495422363, "learning_rate": 1.773913043478261e-05, "loss": 0.5391, "step": 130 }, { "epoch": 6.0, "eval_accuracy": 0.7677165354330708, "eval_loss": 0.5090441703796387, "eval_runtime": 4.3438, "eval_samples_per_second": 58.475, "eval_steps_per_second": 0.921, "step": 138 }, { "epoch": 6.086956521739131, "grad_norm": 3.147414445877075, "learning_rate": 1.756521739130435e-05, "loss": 0.4977, "step": 140 }, { "epoch": 6.521739130434782, "grad_norm": 4.254673004150391, "learning_rate": 1.739130434782609e-05, "loss": 0.5297, "step": 150 }, { "epoch": 6.956521739130435, "grad_norm": 2.083784818649292, "learning_rate": 1.721739130434783e-05, "loss": 0.48, "step": 160 }, { "epoch": 7.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5108471512794495, "eval_runtime": 4.1877, "eval_samples_per_second": 60.653, "eval_steps_per_second": 0.955, "step": 161 }, { "epoch": 7.391304347826087, "grad_norm": 4.193545818328857, "learning_rate": 1.7043478260869566e-05, "loss": 0.4826, "step": 170 }, { "epoch": 7.826086956521739, "grad_norm": 2.05076003074646, "learning_rate": 1.6869565217391307e-05, "loss": 0.473, "step": 180 }, { "epoch": 8.0, "eval_accuracy": 0.7755905511811023, "eval_loss": 0.5028324127197266, "eval_runtime": 3.7021, "eval_samples_per_second": 68.61, "eval_steps_per_second": 1.08, "step": 184 }, { "epoch": 8.26086956521739, "grad_norm": 3.007233142852783, "learning_rate": 1.6695652173913044e-05, "loss": 0.5255, "step": 190 }, { "epoch": 8.695652173913043, "grad_norm": 2.196945905685425, "learning_rate": 1.6521739130434785e-05, "loss": 0.5014, "step": 200 }, { "epoch": 9.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5054498314857483, "eval_runtime": 4.225, "eval_samples_per_second": 60.119, "eval_steps_per_second": 0.947, "step": 207 }, { "epoch": 9.130434782608695, "grad_norm": 2.184353828430176, "learning_rate": 1.6347826086956525e-05, "loss": 0.5044, "step": 210 }, { "epoch": 9.565217391304348, "grad_norm": 4.106619358062744, "learning_rate": 1.6173913043478262e-05, "loss": 0.4822, "step": 220 }, { "epoch": 10.0, "grad_norm": 4.000082969665527, "learning_rate": 1.6000000000000003e-05, "loss": 0.496, "step": 230 }, { "epoch": 10.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5039955973625183, "eval_runtime": 5.3498, "eval_samples_per_second": 47.478, "eval_steps_per_second": 0.748, "step": 230 }, { "epoch": 10.434782608695652, "grad_norm": 5.726933002471924, "learning_rate": 1.582608695652174e-05, "loss": 0.5101, "step": 240 }, { "epoch": 10.869565217391305, "grad_norm": 4.100568771362305, "learning_rate": 1.565217391304348e-05, "loss": 0.4688, "step": 250 }, { "epoch": 11.0, "eval_accuracy": 0.7677165354330708, "eval_loss": 0.4972316324710846, "eval_runtime": 3.7607, "eval_samples_per_second": 67.54, "eval_steps_per_second": 1.064, "step": 253 }, { "epoch": 11.304347826086957, "grad_norm": 2.6119587421417236, "learning_rate": 1.5478260869565217e-05, "loss": 0.485, "step": 260 }, { "epoch": 11.73913043478261, "grad_norm": 3.003861427307129, "learning_rate": 1.5304347826086958e-05, "loss": 0.4943, "step": 270 }, { "epoch": 12.0, "eval_accuracy": 0.7637795275590551, "eval_loss": 0.49771231412887573, "eval_runtime": 4.9203, "eval_samples_per_second": 51.622, "eval_steps_per_second": 0.813, "step": 276 }, { "epoch": 12.173913043478262, "grad_norm": 2.9490270614624023, "learning_rate": 1.5130434782608697e-05, "loss": 0.4505, "step": 280 }, { "epoch": 12.608695652173914, "grad_norm": 2.8131847381591797, "learning_rate": 1.4956521739130436e-05, "loss": 0.5012, "step": 290 }, { "epoch": 13.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5057242512702942, "eval_runtime": 3.7024, "eval_samples_per_second": 68.605, "eval_steps_per_second": 1.08, "step": 299 }, { "epoch": 13.043478260869565, "grad_norm": 17.65978240966797, "learning_rate": 1.4782608695652174e-05, "loss": 0.4768, "step": 300 }, { "epoch": 13.478260869565217, "grad_norm": 2.085587978363037, "learning_rate": 1.4608695652173915e-05, "loss": 0.4729, "step": 310 }, { "epoch": 13.91304347826087, "grad_norm": 4.59744119644165, "learning_rate": 1.4434782608695654e-05, "loss": 0.4639, "step": 320 }, { "epoch": 14.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5010089874267578, "eval_runtime": 3.7018, "eval_samples_per_second": 68.616, "eval_steps_per_second": 1.081, "step": 322 }, { "epoch": 14.347826086956522, "grad_norm": 2.4057395458221436, "learning_rate": 1.4260869565217392e-05, "loss": 0.4751, "step": 330 }, { "epoch": 14.782608695652174, "grad_norm": 3.549567222595215, "learning_rate": 1.4086956521739133e-05, "loss": 0.4709, "step": 340 }, { "epoch": 15.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.4948899447917938, "eval_runtime": 4.9714, "eval_samples_per_second": 51.092, "eval_steps_per_second": 0.805, "step": 345 }, { "epoch": 15.217391304347826, "grad_norm": 6.705427646636963, "learning_rate": 1.391304347826087e-05, "loss": 0.4379, "step": 350 }, { "epoch": 15.652173913043478, "grad_norm": 2.444533348083496, "learning_rate": 1.373913043478261e-05, "loss": 0.4888, "step": 360 }, { "epoch": 16.0, "eval_accuracy": 0.7834645669291339, "eval_loss": 0.49550917744636536, "eval_runtime": 3.6768, "eval_samples_per_second": 69.081, "eval_steps_per_second": 1.088, "step": 368 }, { "epoch": 16.08695652173913, "grad_norm": 5.470461845397949, "learning_rate": 1.3565217391304348e-05, "loss": 0.4952, "step": 370 }, { "epoch": 16.52173913043478, "grad_norm": 2.0678608417510986, "learning_rate": 1.3391304347826088e-05, "loss": 0.4784, "step": 380 }, { "epoch": 16.956521739130434, "grad_norm": 6.63480806350708, "learning_rate": 1.3217391304347827e-05, "loss": 0.4594, "step": 390 }, { "epoch": 17.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.49856194853782654, "eval_runtime": 3.7219, "eval_samples_per_second": 68.245, "eval_steps_per_second": 1.075, "step": 391 }, { "epoch": 17.391304347826086, "grad_norm": 4.448991298675537, "learning_rate": 1.3043478260869566e-05, "loss": 0.4607, "step": 400 }, { "epoch": 17.82608695652174, "grad_norm": 2.716780424118042, "learning_rate": 1.2869565217391305e-05, "loss": 0.4745, "step": 410 }, { "epoch": 18.0, "eval_accuracy": 0.7677165354330708, "eval_loss": 0.501070499420166, "eval_runtime": 4.5054, "eval_samples_per_second": 56.377, "eval_steps_per_second": 0.888, "step": 414 }, { "epoch": 18.26086956521739, "grad_norm": 2.406355857849121, "learning_rate": 1.2695652173913045e-05, "loss": 0.4639, "step": 420 }, { "epoch": 18.695652173913043, "grad_norm": 5.627669811248779, "learning_rate": 1.2521739130434784e-05, "loss": 0.4667, "step": 430 }, { "epoch": 19.0, "eval_accuracy": 0.7755905511811023, "eval_loss": 0.4928434491157532, "eval_runtime": 4.0475, "eval_samples_per_second": 62.756, "eval_steps_per_second": 0.988, "step": 437 }, { "epoch": 19.130434782608695, "grad_norm": 4.074652671813965, "learning_rate": 1.2347826086956523e-05, "loss": 0.4671, "step": 440 }, { "epoch": 19.565217391304348, "grad_norm": 5.88148832321167, "learning_rate": 1.2173913043478263e-05, "loss": 0.4442, "step": 450 }, { "epoch": 20.0, "grad_norm": 3.00347900390625, "learning_rate": 1.2e-05, "loss": 0.4551, "step": 460 }, { "epoch": 20.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.5055357217788696, "eval_runtime": 3.6885, "eval_samples_per_second": 68.862, "eval_steps_per_second": 1.084, "step": 460 }, { "epoch": 20.434782608695652, "grad_norm": 10.164237976074219, "learning_rate": 1.182608695652174e-05, "loss": 0.4657, "step": 470 }, { "epoch": 20.869565217391305, "grad_norm": 2.1962711811065674, "learning_rate": 1.1652173913043478e-05, "loss": 0.4657, "step": 480 }, { "epoch": 21.0, "eval_accuracy": 0.7755905511811023, "eval_loss": 0.4928124248981476, "eval_runtime": 4.4478, "eval_samples_per_second": 57.107, "eval_steps_per_second": 0.899, "step": 483 }, { "epoch": 21.304347826086957, "grad_norm": 5.0302228927612305, "learning_rate": 1.1478260869565218e-05, "loss": 0.4564, "step": 490 }, { "epoch": 21.73913043478261, "grad_norm": 3.5275819301605225, "learning_rate": 1.1304347826086957e-05, "loss": 0.4818, "step": 500 }, { "epoch": 22.0, "eval_accuracy": 0.7755905511811023, "eval_loss": 0.5001721978187561, "eval_runtime": 4.0355, "eval_samples_per_second": 62.942, "eval_steps_per_second": 0.991, "step": 506 }, { "epoch": 22.17391304347826, "grad_norm": 6.920666694641113, "learning_rate": 1.1130434782608696e-05, "loss": 0.4608, "step": 510 }, { "epoch": 22.608695652173914, "grad_norm": 2.2840707302093506, "learning_rate": 1.0956521739130435e-05, "loss": 0.4633, "step": 520 }, { "epoch": 23.0, "eval_accuracy": 0.7834645669291339, "eval_loss": 0.49459317326545715, "eval_runtime": 3.7179, "eval_samples_per_second": 68.319, "eval_steps_per_second": 1.076, "step": 529 }, { "epoch": 23.043478260869566, "grad_norm": 6.509201526641846, "learning_rate": 1.0782608695652175e-05, "loss": 0.4694, "step": 530 }, { "epoch": 23.47826086956522, "grad_norm": 2.403275489807129, "learning_rate": 1.0608695652173914e-05, "loss": 0.4874, "step": 540 }, { "epoch": 23.91304347826087, "grad_norm": 2.1320598125457764, "learning_rate": 1.0434782608695653e-05, "loss": 0.4779, "step": 550 }, { "epoch": 24.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.49417200684547424, "eval_runtime": 4.3215, "eval_samples_per_second": 58.776, "eval_steps_per_second": 0.926, "step": 552 }, { "epoch": 24.347826086956523, "grad_norm": 3.7421488761901855, "learning_rate": 1.0260869565217393e-05, "loss": 0.4579, "step": 560 }, { "epoch": 24.782608695652176, "grad_norm": 3.07060170173645, "learning_rate": 1.008695652173913e-05, "loss": 0.4718, "step": 570 }, { "epoch": 25.0, "eval_accuracy": 0.7834645669291339, "eval_loss": 0.49625155329704285, "eval_runtime": 5.5612, "eval_samples_per_second": 45.674, "eval_steps_per_second": 0.719, "step": 575 }, { "epoch": 25.217391304347824, "grad_norm": 4.446998596191406, "learning_rate": 9.913043478260871e-06, "loss": 0.443, "step": 580 }, { "epoch": 25.652173913043477, "grad_norm": 2.4786624908447266, "learning_rate": 9.73913043478261e-06, "loss": 0.4511, "step": 590 }, { "epoch": 26.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5011107325553894, "eval_runtime": 3.7637, "eval_samples_per_second": 67.487, "eval_steps_per_second": 1.063, "step": 598 }, { "epoch": 26.08695652173913, "grad_norm": 5.552999019622803, "learning_rate": 9.565217391304349e-06, "loss": 0.4631, "step": 600 }, { "epoch": 26.52173913043478, "grad_norm": 5.050811290740967, "learning_rate": 9.391304347826087e-06, "loss": 0.4564, "step": 610 }, { "epoch": 26.956521739130434, "grad_norm": 2.0711512565612793, "learning_rate": 9.217391304347826e-06, "loss": 0.4798, "step": 620 }, { "epoch": 27.0, "eval_accuracy": 0.7874015748031497, "eval_loss": 0.4903908967971802, "eval_runtime": 4.9056, "eval_samples_per_second": 51.777, "eval_steps_per_second": 0.815, "step": 621 }, { "epoch": 27.391304347826086, "grad_norm": 4.117509365081787, "learning_rate": 9.043478260869565e-06, "loss": 0.4411, "step": 630 }, { "epoch": 27.82608695652174, "grad_norm": 4.448685646057129, "learning_rate": 8.869565217391306e-06, "loss": 0.4868, "step": 640 }, { "epoch": 28.0, "eval_accuracy": 0.7834645669291339, "eval_loss": 0.4982087016105652, "eval_runtime": 3.7322, "eval_samples_per_second": 68.057, "eval_steps_per_second": 1.072, "step": 644 }, { "epoch": 28.26086956521739, "grad_norm": 3.0993807315826416, "learning_rate": 8.695652173913044e-06, "loss": 0.4414, "step": 650 }, { "epoch": 28.695652173913043, "grad_norm": 4.982347011566162, "learning_rate": 8.521739130434783e-06, "loss": 0.4653, "step": 660 }, { "epoch": 29.0, "eval_accuracy": 0.7874015748031497, "eval_loss": 0.498798668384552, "eval_runtime": 3.7347, "eval_samples_per_second": 68.012, "eval_steps_per_second": 1.071, "step": 667 }, { "epoch": 29.130434782608695, "grad_norm": 3.081833600997925, "learning_rate": 8.347826086956522e-06, "loss": 0.4503, "step": 670 }, { "epoch": 29.565217391304348, "grad_norm": 4.352429389953613, "learning_rate": 8.173913043478263e-06, "loss": 0.4674, "step": 680 }, { "epoch": 30.0, "grad_norm": 5.281393051147461, "learning_rate": 8.000000000000001e-06, "loss": 0.4613, "step": 690 }, { "epoch": 30.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.49851593375205994, "eval_runtime": 4.8766, "eval_samples_per_second": 52.085, "eval_steps_per_second": 0.82, "step": 690 }, { "epoch": 30.434782608695652, "grad_norm": 2.2079997062683105, "learning_rate": 7.82608695652174e-06, "loss": 0.4574, "step": 700 }, { "epoch": 30.869565217391305, "grad_norm": 4.6935858726501465, "learning_rate": 7.652173913043479e-06, "loss": 0.4675, "step": 710 }, { "epoch": 31.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5060083270072937, "eval_runtime": 3.7305, "eval_samples_per_second": 68.087, "eval_steps_per_second": 1.072, "step": 713 }, { "epoch": 31.304347826086957, "grad_norm": 4.8790602684021, "learning_rate": 7.478260869565218e-06, "loss": 0.4802, "step": 720 }, { "epoch": 31.73913043478261, "grad_norm": 5.6365485191345215, "learning_rate": 7.304347826086957e-06, "loss": 0.4587, "step": 730 }, { "epoch": 32.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5059147477149963, "eval_runtime": 3.7699, "eval_samples_per_second": 67.376, "eval_steps_per_second": 1.061, "step": 736 }, { "epoch": 32.17391304347826, "grad_norm": 5.480165004730225, "learning_rate": 7.130434782608696e-06, "loss": 0.4541, "step": 740 }, { "epoch": 32.608695652173914, "grad_norm": 2.053098440170288, "learning_rate": 6.956521739130435e-06, "loss": 0.464, "step": 750 }, { "epoch": 33.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.5041583180427551, "eval_runtime": 4.9981, "eval_samples_per_second": 50.82, "eval_steps_per_second": 0.8, "step": 759 }, { "epoch": 33.04347826086956, "grad_norm": 3.6429481506347656, "learning_rate": 6.782608695652174e-06, "loss": 0.454, "step": 760 }, { "epoch": 33.47826086956522, "grad_norm": 2.436143636703491, "learning_rate": 6.6086956521739135e-06, "loss": 0.4612, "step": 770 }, { "epoch": 33.91304347826087, "grad_norm": 2.5793776512145996, "learning_rate": 6.434782608695652e-06, "loss": 0.4374, "step": 780 }, { "epoch": 34.0, "eval_accuracy": 0.7677165354330708, "eval_loss": 0.5063456296920776, "eval_runtime": 3.7117, "eval_samples_per_second": 68.432, "eval_steps_per_second": 1.078, "step": 782 }, { "epoch": 34.34782608695652, "grad_norm": 3.71374773979187, "learning_rate": 6.260869565217392e-06, "loss": 0.4667, "step": 790 }, { "epoch": 34.78260869565217, "grad_norm": 4.282368183135986, "learning_rate": 6.086956521739132e-06, "loss": 0.4864, "step": 800 }, { "epoch": 35.0, "eval_accuracy": 0.7677165354330708, "eval_loss": 0.5039507150650024, "eval_runtime": 3.6837, "eval_samples_per_second": 68.952, "eval_steps_per_second": 1.086, "step": 805 }, { "epoch": 35.21739130434783, "grad_norm": 2.896638870239258, "learning_rate": 5.91304347826087e-06, "loss": 0.4922, "step": 810 }, { "epoch": 35.65217391304348, "grad_norm": 2.2342097759246826, "learning_rate": 5.739130434782609e-06, "loss": 0.4354, "step": 820 }, { "epoch": 36.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5108994841575623, "eval_runtime": 4.9899, "eval_samples_per_second": 50.902, "eval_steps_per_second": 0.802, "step": 828 }, { "epoch": 36.08695652173913, "grad_norm": 8.385408401489258, "learning_rate": 5.565217391304348e-06, "loss": 0.4585, "step": 830 }, { "epoch": 36.52173913043478, "grad_norm": 2.839411497116089, "learning_rate": 5.391304347826088e-06, "loss": 0.4497, "step": 840 }, { "epoch": 36.95652173913044, "grad_norm": 2.479076623916626, "learning_rate": 5.2173913043478265e-06, "loss": 0.4655, "step": 850 }, { "epoch": 37.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.510716438293457, "eval_runtime": 3.6997, "eval_samples_per_second": 68.653, "eval_steps_per_second": 1.081, "step": 851 }, { "epoch": 37.391304347826086, "grad_norm": 2.271686553955078, "learning_rate": 5.043478260869565e-06, "loss": 0.4462, "step": 860 }, { "epoch": 37.82608695652174, "grad_norm": 3.4210402965545654, "learning_rate": 4.869565217391305e-06, "loss": 0.4691, "step": 870 }, { "epoch": 38.0, "eval_accuracy": 0.7677165354330708, "eval_loss": 0.5093376636505127, "eval_runtime": 3.7287, "eval_samples_per_second": 68.119, "eval_steps_per_second": 1.073, "step": 874 }, { "epoch": 38.26086956521739, "grad_norm": 5.694761276245117, "learning_rate": 4.695652173913044e-06, "loss": 0.4592, "step": 880 }, { "epoch": 38.69565217391305, "grad_norm": 2.2949883937835693, "learning_rate": 4.5217391304347826e-06, "loss": 0.4826, "step": 890 }, { "epoch": 39.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5044277906417847, "eval_runtime": 4.9781, "eval_samples_per_second": 51.024, "eval_steps_per_second": 0.804, "step": 897 }, { "epoch": 39.130434782608695, "grad_norm": 3.4144210815429688, "learning_rate": 4.347826086956522e-06, "loss": 0.4407, "step": 900 }, { "epoch": 39.56521739130435, "grad_norm": 2.22868013381958, "learning_rate": 4.173913043478261e-06, "loss": 0.4482, "step": 910 }, { "epoch": 40.0, "grad_norm": 3.2193689346313477, "learning_rate": 4.000000000000001e-06, "loss": 0.4577, "step": 920 }, { "epoch": 40.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.4999626874923706, "eval_runtime": 3.6952, "eval_samples_per_second": 68.738, "eval_steps_per_second": 1.082, "step": 920 }, { "epoch": 40.43478260869565, "grad_norm": 4.500718593597412, "learning_rate": 3.8260869565217395e-06, "loss": 0.4585, "step": 930 }, { "epoch": 40.869565217391305, "grad_norm": 1.9281222820281982, "learning_rate": 3.6521739130434787e-06, "loss": 0.4636, "step": 940 }, { "epoch": 41.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.4962589144706726, "eval_runtime": 3.6977, "eval_samples_per_second": 68.69, "eval_steps_per_second": 1.082, "step": 943 }, { "epoch": 41.30434782608695, "grad_norm": 2.193452835083008, "learning_rate": 3.4782608695652175e-06, "loss": 0.4306, "step": 950 }, { "epoch": 41.73913043478261, "grad_norm": 2.2370336055755615, "learning_rate": 3.3043478260869567e-06, "loss": 0.4361, "step": 960 }, { "epoch": 42.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.4958040118217468, "eval_runtime": 4.9548, "eval_samples_per_second": 51.264, "eval_steps_per_second": 0.807, "step": 966 }, { "epoch": 42.17391304347826, "grad_norm": 3.6354355812072754, "learning_rate": 3.130434782608696e-06, "loss": 0.4514, "step": 970 }, { "epoch": 42.608695652173914, "grad_norm": 1.8955118656158447, "learning_rate": 2.956521739130435e-06, "loss": 0.4534, "step": 980 }, { "epoch": 43.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.5007808208465576, "eval_runtime": 3.7121, "eval_samples_per_second": 68.424, "eval_steps_per_second": 1.078, "step": 989 }, { "epoch": 43.04347826086956, "grad_norm": 2.2034902572631836, "learning_rate": 2.782608695652174e-06, "loss": 0.4176, "step": 990 }, { "epoch": 43.47826086956522, "grad_norm": 4.387076377868652, "learning_rate": 2.6086956521739132e-06, "loss": 0.4748, "step": 1000 }, { "epoch": 43.91304347826087, "grad_norm": 5.444644927978516, "learning_rate": 2.4347826086956525e-06, "loss": 0.4559, "step": 1010 }, { "epoch": 44.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.5025174021720886, "eval_runtime": 3.7093, "eval_samples_per_second": 68.476, "eval_steps_per_second": 1.078, "step": 1012 }, { "epoch": 44.34782608695652, "grad_norm": 2.2067017555236816, "learning_rate": 2.2608695652173913e-06, "loss": 0.4882, "step": 1020 }, { "epoch": 44.78260869565217, "grad_norm": 3.562736988067627, "learning_rate": 2.0869565217391305e-06, "loss": 0.4189, "step": 1030 }, { "epoch": 45.0, "eval_accuracy": 0.7755905511811023, "eval_loss": 0.5014046430587769, "eval_runtime": 4.9992, "eval_samples_per_second": 50.808, "eval_steps_per_second": 0.8, "step": 1035 }, { "epoch": 45.21739130434783, "grad_norm": 10.402663230895996, "learning_rate": 1.9130434782608697e-06, "loss": 0.4432, "step": 1040 }, { "epoch": 45.65217391304348, "grad_norm": 4.949878215789795, "learning_rate": 1.7391304347826088e-06, "loss": 0.4861, "step": 1050 }, { "epoch": 46.0, "eval_accuracy": 0.7677165354330708, "eval_loss": 0.5003762245178223, "eval_runtime": 3.7019, "eval_samples_per_second": 68.614, "eval_steps_per_second": 1.081, "step": 1058 }, { "epoch": 46.08695652173913, "grad_norm": 1.938593864440918, "learning_rate": 1.565217391304348e-06, "loss": 0.4326, "step": 1060 }, { "epoch": 46.52173913043478, "grad_norm": 3.236699342727661, "learning_rate": 1.391304347826087e-06, "loss": 0.4726, "step": 1070 }, { "epoch": 46.95652173913044, "grad_norm": 3.047184944152832, "learning_rate": 1.2173913043478262e-06, "loss": 0.4709, "step": 1080 }, { "epoch": 47.0, "eval_accuracy": 0.7795275590551181, "eval_loss": 0.5004997849464417, "eval_runtime": 3.7143, "eval_samples_per_second": 68.384, "eval_steps_per_second": 1.077, "step": 1081 }, { "epoch": 47.391304347826086, "grad_norm": 2.8639461994171143, "learning_rate": 1.0434782608695653e-06, "loss": 0.4649, "step": 1090 }, { "epoch": 47.82608695652174, "grad_norm": 3.7704715728759766, "learning_rate": 8.695652173913044e-07, "loss": 0.4726, "step": 1100 }, { "epoch": 48.0, "eval_accuracy": 0.7716535433070866, "eval_loss": 0.5007592439651489, "eval_runtime": 4.8498, "eval_samples_per_second": 52.373, "eval_steps_per_second": 0.825, "step": 1104 }, { "epoch": 48.26086956521739, "grad_norm": 4.941337585449219, "learning_rate": 6.956521739130435e-07, "loss": 0.4314, "step": 1110 }, { "epoch": 48.69565217391305, "grad_norm": 3.2265655994415283, "learning_rate": 5.217391304347826e-07, "loss": 0.4441, "step": 1120 }, { "epoch": 49.0, "eval_accuracy": 0.7755905511811023, "eval_loss": 0.4987953305244446, "eval_runtime": 3.6681, "eval_samples_per_second": 69.246, "eval_steps_per_second": 1.09, "step": 1127 }, { "epoch": 49.130434782608695, "grad_norm": 3.7678611278533936, "learning_rate": 3.4782608695652175e-07, "loss": 0.4571, "step": 1130 }, { "epoch": 49.56521739130435, "grad_norm": 3.657460927963257, "learning_rate": 1.7391304347826088e-07, "loss": 0.4558, "step": 1140 }, { "epoch": 50.0, "grad_norm": 3.096832513809204, "learning_rate": 0.0, "loss": 0.4579, "step": 1150 }, { "epoch": 50.0, "eval_accuracy": 0.7755905511811023, "eval_loss": 0.499985009431839, "eval_runtime": 3.8189, "eval_samples_per_second": 66.512, "eval_steps_per_second": 1.047, "step": 1150 }, { "epoch": 50.0, "step": 1150, "total_flos": 2.72467378584576e+17, "train_loss": 0.4791690407628598, "train_runtime": 1616.3187, "train_samples_per_second": 44.391, "train_steps_per_second": 0.711 } ], "logging_steps": 10, "max_steps": 1150, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.72467378584576e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }