test-timm / trainer_state.json
davanstrien's picture
davanstrien HF staff
End of training
bab8b6f verified
raw
history blame
129 kB
{
"best_metric": 0.4896911084651947,
"best_model_checkpoint": "./beans_outputs/checkpoint-1495",
"epoch": 200.0,
"eval_steps": 500,
"global_step": 4600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.43478260869565216,
"grad_norm": 3.492309093475342,
"learning_rate": 1.9826086956521742e-05,
"loss": 0.7305,
"step": 10
},
{
"epoch": 0.8695652173913043,
"grad_norm": 4.003854751586914,
"learning_rate": 1.965217391304348e-05,
"loss": 0.6794,
"step": 20
},
{
"epoch": 1.0,
"eval_accuracy": 0.6062992125984252,
"eval_loss": 0.6559741497039795,
"eval_runtime": 4.6669,
"eval_samples_per_second": 54.425,
"eval_steps_per_second": 0.857,
"step": 23
},
{
"epoch": 1.3043478260869565,
"grad_norm": 4.675185680389404,
"learning_rate": 1.947826086956522e-05,
"loss": 0.6399,
"step": 30
},
{
"epoch": 1.7391304347826086,
"grad_norm": 4.350035667419434,
"learning_rate": 1.9304347826086957e-05,
"loss": 0.6215,
"step": 40
},
{
"epoch": 2.0,
"eval_accuracy": 0.7362204724409449,
"eval_loss": 0.5833372473716736,
"eval_runtime": 3.7107,
"eval_samples_per_second": 68.451,
"eval_steps_per_second": 1.078,
"step": 46
},
{
"epoch": 2.1739130434782608,
"grad_norm": 3.180147886276245,
"learning_rate": 1.9130434782608697e-05,
"loss": 0.5964,
"step": 50
},
{
"epoch": 2.608695652173913,
"grad_norm": 3.246190309524536,
"learning_rate": 1.8956521739130434e-05,
"loss": 0.5784,
"step": 60
},
{
"epoch": 3.0,
"eval_accuracy": 0.7598425196850394,
"eval_loss": 0.5489528179168701,
"eval_runtime": 4.3517,
"eval_samples_per_second": 58.367,
"eval_steps_per_second": 0.919,
"step": 69
},
{
"epoch": 3.0434782608695654,
"grad_norm": 2.3400914669036865,
"learning_rate": 1.8782608695652175e-05,
"loss": 0.5412,
"step": 70
},
{
"epoch": 3.4782608695652173,
"grad_norm": 3.5264837741851807,
"learning_rate": 1.8608695652173912e-05,
"loss": 0.5659,
"step": 80
},
{
"epoch": 3.9130434782608696,
"grad_norm": 4.993140697479248,
"learning_rate": 1.8434782608695653e-05,
"loss": 0.5347,
"step": 90
},
{
"epoch": 4.0,
"eval_accuracy": 0.7637795275590551,
"eval_loss": 0.5305963754653931,
"eval_runtime": 3.9321,
"eval_samples_per_second": 64.596,
"eval_steps_per_second": 1.017,
"step": 92
},
{
"epoch": 4.3478260869565215,
"grad_norm": 2.20806622505188,
"learning_rate": 1.8260869565217393e-05,
"loss": 0.5086,
"step": 100
},
{
"epoch": 4.782608695652174,
"grad_norm": 4.256261825561523,
"learning_rate": 1.808695652173913e-05,
"loss": 0.5307,
"step": 110
},
{
"epoch": 5.0,
"eval_accuracy": 0.7637795275590551,
"eval_loss": 0.5235078930854797,
"eval_runtime": 3.7141,
"eval_samples_per_second": 68.389,
"eval_steps_per_second": 1.077,
"step": 115
},
{
"epoch": 5.217391304347826,
"grad_norm": 2.6543545722961426,
"learning_rate": 1.791304347826087e-05,
"loss": 0.5085,
"step": 120
},
{
"epoch": 5.6521739130434785,
"grad_norm": 4.274487495422363,
"learning_rate": 1.773913043478261e-05,
"loss": 0.5391,
"step": 130
},
{
"epoch": 6.0,
"eval_accuracy": 0.7677165354330708,
"eval_loss": 0.5090441703796387,
"eval_runtime": 4.3438,
"eval_samples_per_second": 58.475,
"eval_steps_per_second": 0.921,
"step": 138
},
{
"epoch": 6.086956521739131,
"grad_norm": 3.147414445877075,
"learning_rate": 1.756521739130435e-05,
"loss": 0.4977,
"step": 140
},
{
"epoch": 6.521739130434782,
"grad_norm": 4.254673004150391,
"learning_rate": 1.739130434782609e-05,
"loss": 0.5297,
"step": 150
},
{
"epoch": 6.956521739130435,
"grad_norm": 2.083784818649292,
"learning_rate": 1.721739130434783e-05,
"loss": 0.48,
"step": 160
},
{
"epoch": 7.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5108471512794495,
"eval_runtime": 4.1877,
"eval_samples_per_second": 60.653,
"eval_steps_per_second": 0.955,
"step": 161
},
{
"epoch": 7.391304347826087,
"grad_norm": 4.193545818328857,
"learning_rate": 1.7043478260869566e-05,
"loss": 0.4826,
"step": 170
},
{
"epoch": 7.826086956521739,
"grad_norm": 2.05076003074646,
"learning_rate": 1.6869565217391307e-05,
"loss": 0.473,
"step": 180
},
{
"epoch": 8.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.5028324127197266,
"eval_runtime": 3.7021,
"eval_samples_per_second": 68.61,
"eval_steps_per_second": 1.08,
"step": 184
},
{
"epoch": 8.26086956521739,
"grad_norm": 3.007233142852783,
"learning_rate": 1.6695652173913044e-05,
"loss": 0.5255,
"step": 190
},
{
"epoch": 8.695652173913043,
"grad_norm": 2.196945905685425,
"learning_rate": 1.6521739130434785e-05,
"loss": 0.5014,
"step": 200
},
{
"epoch": 9.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5054498314857483,
"eval_runtime": 4.225,
"eval_samples_per_second": 60.119,
"eval_steps_per_second": 0.947,
"step": 207
},
{
"epoch": 9.130434782608695,
"grad_norm": 2.184353828430176,
"learning_rate": 1.6347826086956525e-05,
"loss": 0.5044,
"step": 210
},
{
"epoch": 9.565217391304348,
"grad_norm": 4.106619358062744,
"learning_rate": 1.6173913043478262e-05,
"loss": 0.4822,
"step": 220
},
{
"epoch": 10.0,
"grad_norm": 4.000082969665527,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.496,
"step": 230
},
{
"epoch": 10.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5039955973625183,
"eval_runtime": 5.3498,
"eval_samples_per_second": 47.478,
"eval_steps_per_second": 0.748,
"step": 230
},
{
"epoch": 10.434782608695652,
"grad_norm": 5.726933002471924,
"learning_rate": 1.582608695652174e-05,
"loss": 0.5101,
"step": 240
},
{
"epoch": 10.869565217391305,
"grad_norm": 4.100568771362305,
"learning_rate": 1.565217391304348e-05,
"loss": 0.4688,
"step": 250
},
{
"epoch": 11.0,
"eval_accuracy": 0.7677165354330708,
"eval_loss": 0.4972316324710846,
"eval_runtime": 3.7607,
"eval_samples_per_second": 67.54,
"eval_steps_per_second": 1.064,
"step": 253
},
{
"epoch": 11.304347826086957,
"grad_norm": 2.6119587421417236,
"learning_rate": 1.5478260869565217e-05,
"loss": 0.485,
"step": 260
},
{
"epoch": 11.73913043478261,
"grad_norm": 3.003861427307129,
"learning_rate": 1.5304347826086958e-05,
"loss": 0.4943,
"step": 270
},
{
"epoch": 12.0,
"eval_accuracy": 0.7637795275590551,
"eval_loss": 0.49771231412887573,
"eval_runtime": 4.9203,
"eval_samples_per_second": 51.622,
"eval_steps_per_second": 0.813,
"step": 276
},
{
"epoch": 12.173913043478262,
"grad_norm": 2.9490270614624023,
"learning_rate": 1.5130434782608697e-05,
"loss": 0.4505,
"step": 280
},
{
"epoch": 12.608695652173914,
"grad_norm": 2.8131847381591797,
"learning_rate": 1.4956521739130436e-05,
"loss": 0.5012,
"step": 290
},
{
"epoch": 13.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5057242512702942,
"eval_runtime": 3.7024,
"eval_samples_per_second": 68.605,
"eval_steps_per_second": 1.08,
"step": 299
},
{
"epoch": 13.043478260869565,
"grad_norm": 17.65978240966797,
"learning_rate": 1.4782608695652174e-05,
"loss": 0.4768,
"step": 300
},
{
"epoch": 13.478260869565217,
"grad_norm": 2.085587978363037,
"learning_rate": 1.4608695652173915e-05,
"loss": 0.4729,
"step": 310
},
{
"epoch": 13.91304347826087,
"grad_norm": 4.59744119644165,
"learning_rate": 1.4434782608695654e-05,
"loss": 0.4639,
"step": 320
},
{
"epoch": 14.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5010089874267578,
"eval_runtime": 3.7018,
"eval_samples_per_second": 68.616,
"eval_steps_per_second": 1.081,
"step": 322
},
{
"epoch": 14.347826086956522,
"grad_norm": 2.4057395458221436,
"learning_rate": 1.4260869565217392e-05,
"loss": 0.4751,
"step": 330
},
{
"epoch": 14.782608695652174,
"grad_norm": 3.549567222595215,
"learning_rate": 1.4086956521739133e-05,
"loss": 0.4709,
"step": 340
},
{
"epoch": 15.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.4948899447917938,
"eval_runtime": 4.9714,
"eval_samples_per_second": 51.092,
"eval_steps_per_second": 0.805,
"step": 345
},
{
"epoch": 15.217391304347826,
"grad_norm": 6.705427646636963,
"learning_rate": 1.391304347826087e-05,
"loss": 0.4379,
"step": 350
},
{
"epoch": 15.652173913043478,
"grad_norm": 2.444533348083496,
"learning_rate": 1.373913043478261e-05,
"loss": 0.4888,
"step": 360
},
{
"epoch": 16.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.49550917744636536,
"eval_runtime": 3.6768,
"eval_samples_per_second": 69.081,
"eval_steps_per_second": 1.088,
"step": 368
},
{
"epoch": 16.08695652173913,
"grad_norm": 5.470461845397949,
"learning_rate": 1.3565217391304348e-05,
"loss": 0.4952,
"step": 370
},
{
"epoch": 16.52173913043478,
"grad_norm": 2.0678608417510986,
"learning_rate": 1.3391304347826088e-05,
"loss": 0.4784,
"step": 380
},
{
"epoch": 16.956521739130434,
"grad_norm": 6.63480806350708,
"learning_rate": 1.3217391304347827e-05,
"loss": 0.4594,
"step": 390
},
{
"epoch": 17.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.49856194853782654,
"eval_runtime": 3.7219,
"eval_samples_per_second": 68.245,
"eval_steps_per_second": 1.075,
"step": 391
},
{
"epoch": 17.391304347826086,
"grad_norm": 4.448991298675537,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.4607,
"step": 400
},
{
"epoch": 17.82608695652174,
"grad_norm": 2.716780424118042,
"learning_rate": 1.2869565217391305e-05,
"loss": 0.4745,
"step": 410
},
{
"epoch": 18.0,
"eval_accuracy": 0.7677165354330708,
"eval_loss": 0.501070499420166,
"eval_runtime": 4.5054,
"eval_samples_per_second": 56.377,
"eval_steps_per_second": 0.888,
"step": 414
},
{
"epoch": 18.26086956521739,
"grad_norm": 2.406355857849121,
"learning_rate": 1.2695652173913045e-05,
"loss": 0.4639,
"step": 420
},
{
"epoch": 18.695652173913043,
"grad_norm": 5.627669811248779,
"learning_rate": 1.2521739130434784e-05,
"loss": 0.4667,
"step": 430
},
{
"epoch": 19.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.4928434491157532,
"eval_runtime": 4.0475,
"eval_samples_per_second": 62.756,
"eval_steps_per_second": 0.988,
"step": 437
},
{
"epoch": 19.130434782608695,
"grad_norm": 4.074652671813965,
"learning_rate": 1.2347826086956523e-05,
"loss": 0.4671,
"step": 440
},
{
"epoch": 19.565217391304348,
"grad_norm": 5.88148832321167,
"learning_rate": 1.2173913043478263e-05,
"loss": 0.4442,
"step": 450
},
{
"epoch": 20.0,
"grad_norm": 3.00347900390625,
"learning_rate": 1.2e-05,
"loss": 0.4551,
"step": 460
},
{
"epoch": 20.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5055357217788696,
"eval_runtime": 3.6885,
"eval_samples_per_second": 68.862,
"eval_steps_per_second": 1.084,
"step": 460
},
{
"epoch": 20.434782608695652,
"grad_norm": 10.164237976074219,
"learning_rate": 1.182608695652174e-05,
"loss": 0.4657,
"step": 470
},
{
"epoch": 20.869565217391305,
"grad_norm": 2.1962711811065674,
"learning_rate": 1.1652173913043478e-05,
"loss": 0.4657,
"step": 480
},
{
"epoch": 21.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.4928124248981476,
"eval_runtime": 4.4478,
"eval_samples_per_second": 57.107,
"eval_steps_per_second": 0.899,
"step": 483
},
{
"epoch": 21.304347826086957,
"grad_norm": 5.0302228927612305,
"learning_rate": 1.1478260869565218e-05,
"loss": 0.4564,
"step": 490
},
{
"epoch": 21.73913043478261,
"grad_norm": 3.5275819301605225,
"learning_rate": 1.1304347826086957e-05,
"loss": 0.4818,
"step": 500
},
{
"epoch": 22.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.5001721978187561,
"eval_runtime": 4.0355,
"eval_samples_per_second": 62.942,
"eval_steps_per_second": 0.991,
"step": 506
},
{
"epoch": 22.17391304347826,
"grad_norm": 6.920666694641113,
"learning_rate": 1.1130434782608696e-05,
"loss": 0.4608,
"step": 510
},
{
"epoch": 22.608695652173914,
"grad_norm": 2.2840707302093506,
"learning_rate": 1.0956521739130435e-05,
"loss": 0.4633,
"step": 520
},
{
"epoch": 23.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.49459317326545715,
"eval_runtime": 3.7179,
"eval_samples_per_second": 68.319,
"eval_steps_per_second": 1.076,
"step": 529
},
{
"epoch": 23.043478260869566,
"grad_norm": 6.509201526641846,
"learning_rate": 1.0782608695652175e-05,
"loss": 0.4694,
"step": 530
},
{
"epoch": 23.47826086956522,
"grad_norm": 2.403275489807129,
"learning_rate": 1.0608695652173914e-05,
"loss": 0.4874,
"step": 540
},
{
"epoch": 23.91304347826087,
"grad_norm": 2.1320598125457764,
"learning_rate": 1.0434782608695653e-05,
"loss": 0.4779,
"step": 550
},
{
"epoch": 24.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.49417200684547424,
"eval_runtime": 4.3215,
"eval_samples_per_second": 58.776,
"eval_steps_per_second": 0.926,
"step": 552
},
{
"epoch": 24.347826086956523,
"grad_norm": 3.7421488761901855,
"learning_rate": 1.0260869565217393e-05,
"loss": 0.4579,
"step": 560
},
{
"epoch": 24.782608695652176,
"grad_norm": 3.07060170173645,
"learning_rate": 1.008695652173913e-05,
"loss": 0.4718,
"step": 570
},
{
"epoch": 25.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.49625155329704285,
"eval_runtime": 5.5612,
"eval_samples_per_second": 45.674,
"eval_steps_per_second": 0.719,
"step": 575
},
{
"epoch": 25.217391304347824,
"grad_norm": 4.446998596191406,
"learning_rate": 9.913043478260871e-06,
"loss": 0.443,
"step": 580
},
{
"epoch": 25.652173913043477,
"grad_norm": 2.4786624908447266,
"learning_rate": 9.73913043478261e-06,
"loss": 0.4511,
"step": 590
},
{
"epoch": 26.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5011107325553894,
"eval_runtime": 3.7637,
"eval_samples_per_second": 67.487,
"eval_steps_per_second": 1.063,
"step": 598
},
{
"epoch": 26.08695652173913,
"grad_norm": 5.552999019622803,
"learning_rate": 9.565217391304349e-06,
"loss": 0.4631,
"step": 600
},
{
"epoch": 26.52173913043478,
"grad_norm": 5.050811290740967,
"learning_rate": 9.391304347826087e-06,
"loss": 0.4564,
"step": 610
},
{
"epoch": 26.956521739130434,
"grad_norm": 2.0711512565612793,
"learning_rate": 9.217391304347826e-06,
"loss": 0.4798,
"step": 620
},
{
"epoch": 27.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.4903908967971802,
"eval_runtime": 4.9056,
"eval_samples_per_second": 51.777,
"eval_steps_per_second": 0.815,
"step": 621
},
{
"epoch": 27.391304347826086,
"grad_norm": 4.117509365081787,
"learning_rate": 9.043478260869565e-06,
"loss": 0.4411,
"step": 630
},
{
"epoch": 27.82608695652174,
"grad_norm": 4.448685646057129,
"learning_rate": 8.869565217391306e-06,
"loss": 0.4868,
"step": 640
},
{
"epoch": 28.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.4982087016105652,
"eval_runtime": 3.7322,
"eval_samples_per_second": 68.057,
"eval_steps_per_second": 1.072,
"step": 644
},
{
"epoch": 28.26086956521739,
"grad_norm": 3.0993807315826416,
"learning_rate": 8.695652173913044e-06,
"loss": 0.4414,
"step": 650
},
{
"epoch": 28.695652173913043,
"grad_norm": 4.982347011566162,
"learning_rate": 8.521739130434783e-06,
"loss": 0.4653,
"step": 660
},
{
"epoch": 29.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.498798668384552,
"eval_runtime": 3.7347,
"eval_samples_per_second": 68.012,
"eval_steps_per_second": 1.071,
"step": 667
},
{
"epoch": 29.130434782608695,
"grad_norm": 3.081833600997925,
"learning_rate": 8.347826086956522e-06,
"loss": 0.4503,
"step": 670
},
{
"epoch": 29.565217391304348,
"grad_norm": 4.352429389953613,
"learning_rate": 8.173913043478263e-06,
"loss": 0.4674,
"step": 680
},
{
"epoch": 30.0,
"grad_norm": 5.281393051147461,
"learning_rate": 8.000000000000001e-06,
"loss": 0.4613,
"step": 690
},
{
"epoch": 30.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.49851593375205994,
"eval_runtime": 4.8766,
"eval_samples_per_second": 52.085,
"eval_steps_per_second": 0.82,
"step": 690
},
{
"epoch": 30.434782608695652,
"grad_norm": 2.2079997062683105,
"learning_rate": 7.82608695652174e-06,
"loss": 0.4574,
"step": 700
},
{
"epoch": 30.869565217391305,
"grad_norm": 4.6935858726501465,
"learning_rate": 7.652173913043479e-06,
"loss": 0.4675,
"step": 710
},
{
"epoch": 31.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5060083270072937,
"eval_runtime": 3.7305,
"eval_samples_per_second": 68.087,
"eval_steps_per_second": 1.072,
"step": 713
},
{
"epoch": 31.304347826086957,
"grad_norm": 4.8790602684021,
"learning_rate": 7.478260869565218e-06,
"loss": 0.4802,
"step": 720
},
{
"epoch": 31.73913043478261,
"grad_norm": 5.6365485191345215,
"learning_rate": 7.304347826086957e-06,
"loss": 0.4587,
"step": 730
},
{
"epoch": 32.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5059147477149963,
"eval_runtime": 3.7699,
"eval_samples_per_second": 67.376,
"eval_steps_per_second": 1.061,
"step": 736
},
{
"epoch": 32.17391304347826,
"grad_norm": 5.480165004730225,
"learning_rate": 7.130434782608696e-06,
"loss": 0.4541,
"step": 740
},
{
"epoch": 32.608695652173914,
"grad_norm": 2.053098440170288,
"learning_rate": 6.956521739130435e-06,
"loss": 0.464,
"step": 750
},
{
"epoch": 33.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5041583180427551,
"eval_runtime": 4.9981,
"eval_samples_per_second": 50.82,
"eval_steps_per_second": 0.8,
"step": 759
},
{
"epoch": 33.04347826086956,
"grad_norm": 3.6429481506347656,
"learning_rate": 6.782608695652174e-06,
"loss": 0.454,
"step": 760
},
{
"epoch": 33.47826086956522,
"grad_norm": 2.436143636703491,
"learning_rate": 6.6086956521739135e-06,
"loss": 0.4612,
"step": 770
},
{
"epoch": 33.91304347826087,
"grad_norm": 2.5793776512145996,
"learning_rate": 6.434782608695652e-06,
"loss": 0.4374,
"step": 780
},
{
"epoch": 34.0,
"eval_accuracy": 0.7677165354330708,
"eval_loss": 0.5063456296920776,
"eval_runtime": 3.7117,
"eval_samples_per_second": 68.432,
"eval_steps_per_second": 1.078,
"step": 782
},
{
"epoch": 34.34782608695652,
"grad_norm": 3.71374773979187,
"learning_rate": 6.260869565217392e-06,
"loss": 0.4667,
"step": 790
},
{
"epoch": 34.78260869565217,
"grad_norm": 4.282368183135986,
"learning_rate": 6.086956521739132e-06,
"loss": 0.4864,
"step": 800
},
{
"epoch": 35.0,
"eval_accuracy": 0.7677165354330708,
"eval_loss": 0.5039507150650024,
"eval_runtime": 3.6837,
"eval_samples_per_second": 68.952,
"eval_steps_per_second": 1.086,
"step": 805
},
{
"epoch": 35.21739130434783,
"grad_norm": 2.896638870239258,
"learning_rate": 5.91304347826087e-06,
"loss": 0.4922,
"step": 810
},
{
"epoch": 35.65217391304348,
"grad_norm": 2.2342097759246826,
"learning_rate": 5.739130434782609e-06,
"loss": 0.4354,
"step": 820
},
{
"epoch": 36.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5108994841575623,
"eval_runtime": 4.9899,
"eval_samples_per_second": 50.902,
"eval_steps_per_second": 0.802,
"step": 828
},
{
"epoch": 36.08695652173913,
"grad_norm": 8.385408401489258,
"learning_rate": 5.565217391304348e-06,
"loss": 0.4585,
"step": 830
},
{
"epoch": 36.52173913043478,
"grad_norm": 2.839411497116089,
"learning_rate": 5.391304347826088e-06,
"loss": 0.4497,
"step": 840
},
{
"epoch": 36.95652173913044,
"grad_norm": 2.479076623916626,
"learning_rate": 5.2173913043478265e-06,
"loss": 0.4655,
"step": 850
},
{
"epoch": 37.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.510716438293457,
"eval_runtime": 3.6997,
"eval_samples_per_second": 68.653,
"eval_steps_per_second": 1.081,
"step": 851
},
{
"epoch": 37.391304347826086,
"grad_norm": 2.271686553955078,
"learning_rate": 5.043478260869565e-06,
"loss": 0.4462,
"step": 860
},
{
"epoch": 37.82608695652174,
"grad_norm": 3.4210402965545654,
"learning_rate": 4.869565217391305e-06,
"loss": 0.4691,
"step": 870
},
{
"epoch": 38.0,
"eval_accuracy": 0.7677165354330708,
"eval_loss": 0.5093376636505127,
"eval_runtime": 3.7287,
"eval_samples_per_second": 68.119,
"eval_steps_per_second": 1.073,
"step": 874
},
{
"epoch": 38.26086956521739,
"grad_norm": 5.694761276245117,
"learning_rate": 4.695652173913044e-06,
"loss": 0.4592,
"step": 880
},
{
"epoch": 38.69565217391305,
"grad_norm": 2.2949883937835693,
"learning_rate": 4.5217391304347826e-06,
"loss": 0.4826,
"step": 890
},
{
"epoch": 39.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5044277906417847,
"eval_runtime": 4.9781,
"eval_samples_per_second": 51.024,
"eval_steps_per_second": 0.804,
"step": 897
},
{
"epoch": 39.130434782608695,
"grad_norm": 3.4144210815429688,
"learning_rate": 4.347826086956522e-06,
"loss": 0.4407,
"step": 900
},
{
"epoch": 39.56521739130435,
"grad_norm": 2.22868013381958,
"learning_rate": 4.173913043478261e-06,
"loss": 0.4482,
"step": 910
},
{
"epoch": 40.0,
"grad_norm": 3.2193689346313477,
"learning_rate": 4.000000000000001e-06,
"loss": 0.4577,
"step": 920
},
{
"epoch": 40.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.4999626874923706,
"eval_runtime": 3.6952,
"eval_samples_per_second": 68.738,
"eval_steps_per_second": 1.082,
"step": 920
},
{
"epoch": 40.43478260869565,
"grad_norm": 4.500718593597412,
"learning_rate": 3.8260869565217395e-06,
"loss": 0.4585,
"step": 930
},
{
"epoch": 40.869565217391305,
"grad_norm": 1.9281222820281982,
"learning_rate": 3.6521739130434787e-06,
"loss": 0.4636,
"step": 940
},
{
"epoch": 41.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.4962589144706726,
"eval_runtime": 3.6977,
"eval_samples_per_second": 68.69,
"eval_steps_per_second": 1.082,
"step": 943
},
{
"epoch": 41.30434782608695,
"grad_norm": 2.193452835083008,
"learning_rate": 3.4782608695652175e-06,
"loss": 0.4306,
"step": 950
},
{
"epoch": 41.73913043478261,
"grad_norm": 2.2370336055755615,
"learning_rate": 3.3043478260869567e-06,
"loss": 0.4361,
"step": 960
},
{
"epoch": 42.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.4958040118217468,
"eval_runtime": 4.9548,
"eval_samples_per_second": 51.264,
"eval_steps_per_second": 0.807,
"step": 966
},
{
"epoch": 42.17391304347826,
"grad_norm": 3.6354355812072754,
"learning_rate": 3.130434782608696e-06,
"loss": 0.4514,
"step": 970
},
{
"epoch": 42.608695652173914,
"grad_norm": 1.8955118656158447,
"learning_rate": 2.956521739130435e-06,
"loss": 0.4534,
"step": 980
},
{
"epoch": 43.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5007808208465576,
"eval_runtime": 3.7121,
"eval_samples_per_second": 68.424,
"eval_steps_per_second": 1.078,
"step": 989
},
{
"epoch": 43.04347826086956,
"grad_norm": 2.2034902572631836,
"learning_rate": 2.782608695652174e-06,
"loss": 0.4176,
"step": 990
},
{
"epoch": 43.47826086956522,
"grad_norm": 4.387076377868652,
"learning_rate": 2.6086956521739132e-06,
"loss": 0.4748,
"step": 1000
},
{
"epoch": 43.91304347826087,
"grad_norm": 5.444644927978516,
"learning_rate": 2.4347826086956525e-06,
"loss": 0.4559,
"step": 1010
},
{
"epoch": 44.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5025174021720886,
"eval_runtime": 3.7093,
"eval_samples_per_second": 68.476,
"eval_steps_per_second": 1.078,
"step": 1012
},
{
"epoch": 44.34782608695652,
"grad_norm": 2.2067017555236816,
"learning_rate": 2.2608695652173913e-06,
"loss": 0.4882,
"step": 1020
},
{
"epoch": 44.78260869565217,
"grad_norm": 3.562736988067627,
"learning_rate": 2.0869565217391305e-06,
"loss": 0.4189,
"step": 1030
},
{
"epoch": 45.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.5014046430587769,
"eval_runtime": 4.9992,
"eval_samples_per_second": 50.808,
"eval_steps_per_second": 0.8,
"step": 1035
},
{
"epoch": 45.21739130434783,
"grad_norm": 10.402663230895996,
"learning_rate": 1.9130434782608697e-06,
"loss": 0.4432,
"step": 1040
},
{
"epoch": 45.65217391304348,
"grad_norm": 4.949878215789795,
"learning_rate": 1.7391304347826088e-06,
"loss": 0.4861,
"step": 1050
},
{
"epoch": 46.0,
"eval_accuracy": 0.7677165354330708,
"eval_loss": 0.5003762245178223,
"eval_runtime": 3.7019,
"eval_samples_per_second": 68.614,
"eval_steps_per_second": 1.081,
"step": 1058
},
{
"epoch": 46.08695652173913,
"grad_norm": 1.938593864440918,
"learning_rate": 1.565217391304348e-06,
"loss": 0.4326,
"step": 1060
},
{
"epoch": 46.52173913043478,
"grad_norm": 3.236699342727661,
"learning_rate": 1.391304347826087e-06,
"loss": 0.4726,
"step": 1070
},
{
"epoch": 46.95652173913044,
"grad_norm": 3.047184944152832,
"learning_rate": 1.2173913043478262e-06,
"loss": 0.4709,
"step": 1080
},
{
"epoch": 47.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5004997849464417,
"eval_runtime": 3.7143,
"eval_samples_per_second": 68.384,
"eval_steps_per_second": 1.077,
"step": 1081
},
{
"epoch": 47.391304347826086,
"grad_norm": 2.8639461994171143,
"learning_rate": 1.0434782608695653e-06,
"loss": 0.4649,
"step": 1090
},
{
"epoch": 47.82608695652174,
"grad_norm": 3.7704715728759766,
"learning_rate": 8.695652173913044e-07,
"loss": 0.4726,
"step": 1100
},
{
"epoch": 48.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.5007592439651489,
"eval_runtime": 4.8498,
"eval_samples_per_second": 52.373,
"eval_steps_per_second": 0.825,
"step": 1104
},
{
"epoch": 48.26086956521739,
"grad_norm": 4.941337585449219,
"learning_rate": 6.956521739130435e-07,
"loss": 0.4314,
"step": 1110
},
{
"epoch": 48.69565217391305,
"grad_norm": 3.2265655994415283,
"learning_rate": 5.217391304347826e-07,
"loss": 0.4441,
"step": 1120
},
{
"epoch": 49.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.4987953305244446,
"eval_runtime": 3.6681,
"eval_samples_per_second": 69.246,
"eval_steps_per_second": 1.09,
"step": 1127
},
{
"epoch": 49.130434782608695,
"grad_norm": 3.7678611278533936,
"learning_rate": 3.4782608695652175e-07,
"loss": 0.4571,
"step": 1130
},
{
"epoch": 49.56521739130435,
"grad_norm": 3.657460927963257,
"learning_rate": 1.7391304347826088e-07,
"loss": 0.4558,
"step": 1140
},
{
"epoch": 50.0,
"grad_norm": 3.096832513809204,
"learning_rate": 0.0,
"loss": 0.4579,
"step": 1150
},
{
"epoch": 50.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.499985009431839,
"eval_runtime": 3.8189,
"eval_samples_per_second": 66.512,
"eval_steps_per_second": 1.047,
"step": 1150
},
{
"epoch": 50.43478260869565,
"grad_norm": 2.7469470500946045,
"learning_rate": 1.4956521739130436e-05,
"loss": 0.4877,
"step": 1160
},
{
"epoch": 50.869565217391305,
"grad_norm": 2.5254504680633545,
"learning_rate": 1.491304347826087e-05,
"loss": 0.4366,
"step": 1170
},
{
"epoch": 51.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.497986376285553,
"eval_runtime": 3.9676,
"eval_samples_per_second": 64.019,
"eval_steps_per_second": 0.504,
"step": 1173
},
{
"epoch": 51.30434782608695,
"grad_norm": 2.032457113265991,
"learning_rate": 1.4869565217391306e-05,
"loss": 0.4663,
"step": 1180
},
{
"epoch": 51.73913043478261,
"grad_norm": 2.802882432937622,
"learning_rate": 1.4826086956521741e-05,
"loss": 0.4467,
"step": 1190
},
{
"epoch": 52.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.49471431970596313,
"eval_runtime": 4.8494,
"eval_samples_per_second": 52.378,
"eval_steps_per_second": 0.412,
"step": 1196
},
{
"epoch": 52.17391304347826,
"grad_norm": 2.2359466552734375,
"learning_rate": 1.4782608695652174e-05,
"loss": 0.4424,
"step": 1200
},
{
"epoch": 52.608695652173914,
"grad_norm": 3.219308376312256,
"learning_rate": 1.473913043478261e-05,
"loss": 0.4797,
"step": 1210
},
{
"epoch": 53.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.4950390160083771,
"eval_runtime": 3.8997,
"eval_samples_per_second": 65.133,
"eval_steps_per_second": 0.513,
"step": 1219
},
{
"epoch": 53.04347826086956,
"grad_norm": 2.6939969062805176,
"learning_rate": 1.4695652173913045e-05,
"loss": 0.4256,
"step": 1220
},
{
"epoch": 53.47826086956522,
"grad_norm": 2.6343085765838623,
"learning_rate": 1.465217391304348e-05,
"loss": 0.4192,
"step": 1230
},
{
"epoch": 53.91304347826087,
"grad_norm": 10.655885696411133,
"learning_rate": 1.4608695652173915e-05,
"loss": 0.4544,
"step": 1240
},
{
"epoch": 54.0,
"eval_accuracy": 0.7716535433070866,
"eval_loss": 0.49978330731391907,
"eval_runtime": 3.7833,
"eval_samples_per_second": 67.137,
"eval_steps_per_second": 0.529,
"step": 1242
},
{
"epoch": 54.34782608695652,
"grad_norm": 4.838284969329834,
"learning_rate": 1.456521739130435e-05,
"loss": 0.4361,
"step": 1250
},
{
"epoch": 54.78260869565217,
"grad_norm": 3.1171820163726807,
"learning_rate": 1.4521739130434785e-05,
"loss": 0.4466,
"step": 1260
},
{
"epoch": 55.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.49803978204727173,
"eval_runtime": 3.7735,
"eval_samples_per_second": 67.311,
"eval_steps_per_second": 0.53,
"step": 1265
},
{
"epoch": 55.21739130434783,
"grad_norm": 2.5744450092315674,
"learning_rate": 1.447826086956522e-05,
"loss": 0.4511,
"step": 1270
},
{
"epoch": 55.65217391304348,
"grad_norm": 7.211576461791992,
"learning_rate": 1.4434782608695654e-05,
"loss": 0.4599,
"step": 1280
},
{
"epoch": 56.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.4962967336177826,
"eval_runtime": 4.8613,
"eval_samples_per_second": 52.249,
"eval_steps_per_second": 0.411,
"step": 1288
},
{
"epoch": 56.08695652173913,
"grad_norm": 1.932460069656372,
"learning_rate": 1.4391304347826087e-05,
"loss": 0.4168,
"step": 1290
},
{
"epoch": 56.52173913043478,
"grad_norm": 5.841196537017822,
"learning_rate": 1.4347826086956522e-05,
"loss": 0.4622,
"step": 1300
},
{
"epoch": 56.95652173913044,
"grad_norm": 1.948188066482544,
"learning_rate": 1.4304347826086957e-05,
"loss": 0.4458,
"step": 1310
},
{
"epoch": 57.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.49557480216026306,
"eval_runtime": 3.6949,
"eval_samples_per_second": 68.744,
"eval_steps_per_second": 0.541,
"step": 1311
},
{
"epoch": 57.391304347826086,
"grad_norm": 4.304020881652832,
"learning_rate": 1.4260869565217392e-05,
"loss": 0.4378,
"step": 1320
},
{
"epoch": 57.82608695652174,
"grad_norm": 2.710130453109741,
"learning_rate": 1.4217391304347828e-05,
"loss": 0.4296,
"step": 1330
},
{
"epoch": 58.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.49939388036727905,
"eval_runtime": 3.7356,
"eval_samples_per_second": 67.995,
"eval_steps_per_second": 0.535,
"step": 1334
},
{
"epoch": 58.26086956521739,
"grad_norm": 3.730140209197998,
"learning_rate": 1.4173913043478263e-05,
"loss": 0.4664,
"step": 1340
},
{
"epoch": 58.69565217391305,
"grad_norm": 9.71405029296875,
"learning_rate": 1.4130434782608698e-05,
"loss": 0.4415,
"step": 1350
},
{
"epoch": 59.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.4997561573982239,
"eval_runtime": 4.9859,
"eval_samples_per_second": 50.943,
"eval_steps_per_second": 0.401,
"step": 1357
},
{
"epoch": 59.130434782608695,
"grad_norm": 2.7752935886383057,
"learning_rate": 1.4086956521739133e-05,
"loss": 0.4436,
"step": 1360
},
{
"epoch": 59.56521739130435,
"grad_norm": 4.0491251945495605,
"learning_rate": 1.4043478260869568e-05,
"loss": 0.4442,
"step": 1370
},
{
"epoch": 60.0,
"grad_norm": 3.6015145778656006,
"learning_rate": 1.4e-05,
"loss": 0.4036,
"step": 1380
},
{
"epoch": 60.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.4996122717857361,
"eval_runtime": 3.8039,
"eval_samples_per_second": 66.774,
"eval_steps_per_second": 0.526,
"step": 1380
},
{
"epoch": 60.43478260869565,
"grad_norm": 2.5297908782958984,
"learning_rate": 1.3956521739130435e-05,
"loss": 0.4364,
"step": 1390
},
{
"epoch": 60.869565217391305,
"grad_norm": 2.8682429790496826,
"learning_rate": 1.391304347826087e-05,
"loss": 0.4406,
"step": 1400
},
{
"epoch": 61.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5022182464599609,
"eval_runtime": 3.7333,
"eval_samples_per_second": 68.036,
"eval_steps_per_second": 0.536,
"step": 1403
},
{
"epoch": 61.30434782608695,
"grad_norm": 3.3014872074127197,
"learning_rate": 1.3869565217391305e-05,
"loss": 0.4346,
"step": 1410
},
{
"epoch": 61.73913043478261,
"grad_norm": 3.4654860496520996,
"learning_rate": 1.382608695652174e-05,
"loss": 0.4235,
"step": 1420
},
{
"epoch": 62.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5018435120582581,
"eval_runtime": 4.9797,
"eval_samples_per_second": 51.007,
"eval_steps_per_second": 0.402,
"step": 1426
},
{
"epoch": 62.17391304347826,
"grad_norm": 6.609365940093994,
"learning_rate": 1.3782608695652175e-05,
"loss": 0.4884,
"step": 1430
},
{
"epoch": 62.608695652173914,
"grad_norm": 3.425076484680176,
"learning_rate": 1.373913043478261e-05,
"loss": 0.4492,
"step": 1440
},
{
"epoch": 63.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.4963783323764801,
"eval_runtime": 3.7677,
"eval_samples_per_second": 67.416,
"eval_steps_per_second": 0.531,
"step": 1449
},
{
"epoch": 63.04347826086956,
"grad_norm": 4.069096088409424,
"learning_rate": 1.3695652173913046e-05,
"loss": 0.4119,
"step": 1450
},
{
"epoch": 63.47826086956522,
"grad_norm": 2.3584377765655518,
"learning_rate": 1.3652173913043479e-05,
"loss": 0.4356,
"step": 1460
},
{
"epoch": 63.91304347826087,
"grad_norm": 12.776151657104492,
"learning_rate": 1.3608695652173913e-05,
"loss": 0.4065,
"step": 1470
},
{
"epoch": 64.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.4952879250049591,
"eval_runtime": 3.7751,
"eval_samples_per_second": 67.284,
"eval_steps_per_second": 0.53,
"step": 1472
},
{
"epoch": 64.34782608695652,
"grad_norm": 2.501909017562866,
"learning_rate": 1.3565217391304348e-05,
"loss": 0.4356,
"step": 1480
},
{
"epoch": 64.78260869565217,
"grad_norm": 2.650075674057007,
"learning_rate": 1.3521739130434783e-05,
"loss": 0.4474,
"step": 1490
},
{
"epoch": 65.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.4896911084651947,
"eval_runtime": 5.1494,
"eval_samples_per_second": 49.326,
"eval_steps_per_second": 0.388,
"step": 1495
},
{
"epoch": 65.21739130434783,
"grad_norm": 4.465973854064941,
"learning_rate": 1.3478260869565218e-05,
"loss": 0.4246,
"step": 1500
},
{
"epoch": 65.65217391304348,
"grad_norm": 2.057035207748413,
"learning_rate": 1.3434782608695653e-05,
"loss": 0.4605,
"step": 1510
},
{
"epoch": 66.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5039426684379578,
"eval_runtime": 3.7424,
"eval_samples_per_second": 67.871,
"eval_steps_per_second": 0.534,
"step": 1518
},
{
"epoch": 66.08695652173913,
"grad_norm": 3.064012050628662,
"learning_rate": 1.3391304347826088e-05,
"loss": 0.4157,
"step": 1520
},
{
"epoch": 66.52173913043478,
"grad_norm": 3.7584011554718018,
"learning_rate": 1.3347826086956523e-05,
"loss": 0.4553,
"step": 1530
},
{
"epoch": 66.95652173913044,
"grad_norm": 2.945054054260254,
"learning_rate": 1.3304347826086958e-05,
"loss": 0.436,
"step": 1540
},
{
"epoch": 67.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.5024412274360657,
"eval_runtime": 3.7621,
"eval_samples_per_second": 67.516,
"eval_steps_per_second": 0.532,
"step": 1541
},
{
"epoch": 67.3913043478261,
"grad_norm": 3.1257166862487793,
"learning_rate": 1.3260869565217392e-05,
"loss": 0.4173,
"step": 1550
},
{
"epoch": 67.82608695652173,
"grad_norm": 5.225259304046631,
"learning_rate": 1.3217391304347827e-05,
"loss": 0.4746,
"step": 1560
},
{
"epoch": 68.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5006521344184875,
"eval_runtime": 5.0155,
"eval_samples_per_second": 50.643,
"eval_steps_per_second": 0.399,
"step": 1564
},
{
"epoch": 68.26086956521739,
"grad_norm": 3.3438003063201904,
"learning_rate": 1.3173913043478262e-05,
"loss": 0.3957,
"step": 1570
},
{
"epoch": 68.69565217391305,
"grad_norm": 2.6640641689300537,
"learning_rate": 1.3130434782608697e-05,
"loss": 0.4555,
"step": 1580
},
{
"epoch": 69.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5053796172142029,
"eval_runtime": 3.7207,
"eval_samples_per_second": 68.266,
"eval_steps_per_second": 0.538,
"step": 1587
},
{
"epoch": 69.1304347826087,
"grad_norm": 6.726771831512451,
"learning_rate": 1.308695652173913e-05,
"loss": 0.4322,
"step": 1590
},
{
"epoch": 69.56521739130434,
"grad_norm": 3.231029748916626,
"learning_rate": 1.3043478260869566e-05,
"loss": 0.3949,
"step": 1600
},
{
"epoch": 70.0,
"grad_norm": 6.560612201690674,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.433,
"step": 1610
},
{
"epoch": 70.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.49738696217536926,
"eval_runtime": 3.716,
"eval_samples_per_second": 68.353,
"eval_steps_per_second": 0.538,
"step": 1610
},
{
"epoch": 70.43478260869566,
"grad_norm": 2.6361474990844727,
"learning_rate": 1.2956521739130436e-05,
"loss": 0.4158,
"step": 1620
},
{
"epoch": 70.8695652173913,
"grad_norm": 2.7182960510253906,
"learning_rate": 1.2913043478260871e-05,
"loss": 0.4503,
"step": 1630
},
{
"epoch": 71.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.509588897228241,
"eval_runtime": 5.0488,
"eval_samples_per_second": 50.309,
"eval_steps_per_second": 0.396,
"step": 1633
},
{
"epoch": 71.30434782608695,
"grad_norm": 1.938330888748169,
"learning_rate": 1.2869565217391305e-05,
"loss": 0.4285,
"step": 1640
},
{
"epoch": 71.73913043478261,
"grad_norm": 3.1797378063201904,
"learning_rate": 1.282608695652174e-05,
"loss": 0.4424,
"step": 1650
},
{
"epoch": 72.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.5040333867073059,
"eval_runtime": 3.7598,
"eval_samples_per_second": 67.557,
"eval_steps_per_second": 0.532,
"step": 1656
},
{
"epoch": 72.17391304347827,
"grad_norm": 3.028841257095337,
"learning_rate": 1.2782608695652175e-05,
"loss": 0.4278,
"step": 1660
},
{
"epoch": 72.6086956521739,
"grad_norm": 3.0137178897857666,
"learning_rate": 1.273913043478261e-05,
"loss": 0.4331,
"step": 1670
},
{
"epoch": 73.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.505591869354248,
"eval_runtime": 3.7202,
"eval_samples_per_second": 68.277,
"eval_steps_per_second": 0.538,
"step": 1679
},
{
"epoch": 73.04347826086956,
"grad_norm": 3.519934892654419,
"learning_rate": 1.2695652173913045e-05,
"loss": 0.4523,
"step": 1680
},
{
"epoch": 73.47826086956522,
"grad_norm": 2.6839394569396973,
"learning_rate": 1.265217391304348e-05,
"loss": 0.4143,
"step": 1690
},
{
"epoch": 73.91304347826087,
"grad_norm": 4.223355770111084,
"learning_rate": 1.2608695652173915e-05,
"loss": 0.4263,
"step": 1700
},
{
"epoch": 74.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5025500059127808,
"eval_runtime": 5.0354,
"eval_samples_per_second": 50.442,
"eval_steps_per_second": 0.397,
"step": 1702
},
{
"epoch": 74.34782608695652,
"grad_norm": 2.633610248565674,
"learning_rate": 1.2565217391304349e-05,
"loss": 0.4451,
"step": 1710
},
{
"epoch": 74.78260869565217,
"grad_norm": 4.227041721343994,
"learning_rate": 1.2521739130434784e-05,
"loss": 0.4305,
"step": 1720
},
{
"epoch": 75.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5032832026481628,
"eval_runtime": 3.7074,
"eval_samples_per_second": 68.512,
"eval_steps_per_second": 0.539,
"step": 1725
},
{
"epoch": 75.21739130434783,
"grad_norm": 3.885732412338257,
"learning_rate": 1.2478260869565217e-05,
"loss": 0.4177,
"step": 1730
},
{
"epoch": 75.65217391304348,
"grad_norm": 6.669870853424072,
"learning_rate": 1.2434782608695652e-05,
"loss": 0.4271,
"step": 1740
},
{
"epoch": 76.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5014809966087341,
"eval_runtime": 3.6911,
"eval_samples_per_second": 68.814,
"eval_steps_per_second": 0.542,
"step": 1748
},
{
"epoch": 76.08695652173913,
"grad_norm": 1.820388913154602,
"learning_rate": 1.2391304347826088e-05,
"loss": 0.4457,
"step": 1750
},
{
"epoch": 76.52173913043478,
"grad_norm": 2.142805337905884,
"learning_rate": 1.2347826086956523e-05,
"loss": 0.3962,
"step": 1760
},
{
"epoch": 76.95652173913044,
"grad_norm": 3.5151073932647705,
"learning_rate": 1.2304347826086958e-05,
"loss": 0.4635,
"step": 1770
},
{
"epoch": 77.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.49884113669395447,
"eval_runtime": 4.3354,
"eval_samples_per_second": 58.587,
"eval_steps_per_second": 0.461,
"step": 1771
},
{
"epoch": 77.3913043478261,
"grad_norm": 3.867955207824707,
"learning_rate": 1.2260869565217393e-05,
"loss": 0.4616,
"step": 1780
},
{
"epoch": 77.82608695652173,
"grad_norm": 2.6050870418548584,
"learning_rate": 1.2217391304347828e-05,
"loss": 0.4212,
"step": 1790
},
{
"epoch": 78.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.4993511438369751,
"eval_runtime": 4.267,
"eval_samples_per_second": 59.527,
"eval_steps_per_second": 0.469,
"step": 1794
},
{
"epoch": 78.26086956521739,
"grad_norm": 2.1961538791656494,
"learning_rate": 1.2173913043478263e-05,
"loss": 0.4191,
"step": 1800
},
{
"epoch": 78.69565217391305,
"grad_norm": 6.02454948425293,
"learning_rate": 1.2130434782608698e-05,
"loss": 0.4154,
"step": 1810
},
{
"epoch": 79.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5044043660163879,
"eval_runtime": 3.7036,
"eval_samples_per_second": 68.581,
"eval_steps_per_second": 0.54,
"step": 1817
},
{
"epoch": 79.1304347826087,
"grad_norm": 2.1048858165740967,
"learning_rate": 1.208695652173913e-05,
"loss": 0.4196,
"step": 1820
},
{
"epoch": 79.56521739130434,
"grad_norm": 2.8622193336486816,
"learning_rate": 1.2043478260869565e-05,
"loss": 0.4314,
"step": 1830
},
{
"epoch": 80.0,
"grad_norm": 6.1558427810668945,
"learning_rate": 1.2e-05,
"loss": 0.4288,
"step": 1840
},
{
"epoch": 80.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5033003687858582,
"eval_runtime": 3.7575,
"eval_samples_per_second": 67.598,
"eval_steps_per_second": 0.532,
"step": 1840
},
{
"epoch": 80.43478260869566,
"grad_norm": 3.3254945278167725,
"learning_rate": 1.1956521739130435e-05,
"loss": 0.4297,
"step": 1850
},
{
"epoch": 80.8695652173913,
"grad_norm": 2.2818620204925537,
"learning_rate": 1.191304347826087e-05,
"loss": 0.4211,
"step": 1860
},
{
"epoch": 81.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5050157904624939,
"eval_runtime": 5.0113,
"eval_samples_per_second": 50.685,
"eval_steps_per_second": 0.399,
"step": 1863
},
{
"epoch": 81.30434782608695,
"grad_norm": 4.174459934234619,
"learning_rate": 1.1869565217391306e-05,
"loss": 0.4229,
"step": 1870
},
{
"epoch": 81.73913043478261,
"grad_norm": 2.87514066696167,
"learning_rate": 1.182608695652174e-05,
"loss": 0.4022,
"step": 1880
},
{
"epoch": 82.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5021248459815979,
"eval_runtime": 3.7629,
"eval_samples_per_second": 67.5,
"eval_steps_per_second": 0.531,
"step": 1886
},
{
"epoch": 82.17391304347827,
"grad_norm": 5.307149410247803,
"learning_rate": 1.1782608695652176e-05,
"loss": 0.4564,
"step": 1890
},
{
"epoch": 82.6086956521739,
"grad_norm": 4.411511421203613,
"learning_rate": 1.1739130434782611e-05,
"loss": 0.4477,
"step": 1900
},
{
"epoch": 83.0,
"eval_accuracy": 0.7755905511811023,
"eval_loss": 0.509568452835083,
"eval_runtime": 5.3605,
"eval_samples_per_second": 47.384,
"eval_steps_per_second": 0.373,
"step": 1909
},
{
"epoch": 83.04347826086956,
"grad_norm": 2.478482246398926,
"learning_rate": 1.1695652173913043e-05,
"loss": 0.4118,
"step": 1910
},
{
"epoch": 83.47826086956522,
"grad_norm": 2.000185012817383,
"learning_rate": 1.1652173913043478e-05,
"loss": 0.4486,
"step": 1920
},
{
"epoch": 83.91304347826087,
"grad_norm": 4.231175422668457,
"learning_rate": 1.1608695652173913e-05,
"loss": 0.4091,
"step": 1930
},
{
"epoch": 84.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.501672625541687,
"eval_runtime": 4.6714,
"eval_samples_per_second": 54.374,
"eval_steps_per_second": 0.428,
"step": 1932
},
{
"epoch": 84.34782608695652,
"grad_norm": 8.062799453735352,
"learning_rate": 1.1565217391304348e-05,
"loss": 0.4108,
"step": 1940
},
{
"epoch": 84.78260869565217,
"grad_norm": 3.525912046432495,
"learning_rate": 1.1521739130434783e-05,
"loss": 0.4284,
"step": 1950
},
{
"epoch": 85.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5094006061553955,
"eval_runtime": 3.7057,
"eval_samples_per_second": 68.544,
"eval_steps_per_second": 0.54,
"step": 1955
},
{
"epoch": 85.21739130434783,
"grad_norm": 2.8294172286987305,
"learning_rate": 1.1478260869565218e-05,
"loss": 0.4341,
"step": 1960
},
{
"epoch": 85.65217391304348,
"grad_norm": 2.6164603233337402,
"learning_rate": 1.1434782608695654e-05,
"loss": 0.4317,
"step": 1970
},
{
"epoch": 86.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5055702328681946,
"eval_runtime": 3.78,
"eval_samples_per_second": 67.195,
"eval_steps_per_second": 0.529,
"step": 1978
},
{
"epoch": 86.08695652173913,
"grad_norm": 5.29531717300415,
"learning_rate": 1.1391304347826089e-05,
"loss": 0.3842,
"step": 1980
},
{
"epoch": 86.52173913043478,
"grad_norm": 3.8016159534454346,
"learning_rate": 1.1347826086956524e-05,
"loss": 0.4294,
"step": 1990
},
{
"epoch": 86.95652173913044,
"grad_norm": 2.229055643081665,
"learning_rate": 1.1304347826086957e-05,
"loss": 0.4011,
"step": 2000
},
{
"epoch": 87.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.4991566836833954,
"eval_runtime": 4.9616,
"eval_samples_per_second": 51.194,
"eval_steps_per_second": 0.403,
"step": 2001
},
{
"epoch": 87.3913043478261,
"grad_norm": 4.449975490570068,
"learning_rate": 1.1260869565217392e-05,
"loss": 0.4413,
"step": 2010
},
{
"epoch": 87.82608695652173,
"grad_norm": 3.4843342304229736,
"learning_rate": 1.1217391304347827e-05,
"loss": 0.4043,
"step": 2020
},
{
"epoch": 88.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5106358528137207,
"eval_runtime": 3.759,
"eval_samples_per_second": 67.571,
"eval_steps_per_second": 0.532,
"step": 2024
},
{
"epoch": 88.26086956521739,
"grad_norm": 3.2311477661132812,
"learning_rate": 1.1173913043478261e-05,
"loss": 0.4127,
"step": 2030
},
{
"epoch": 88.69565217391305,
"grad_norm": 3.511033058166504,
"learning_rate": 1.1130434782608696e-05,
"loss": 0.4233,
"step": 2040
},
{
"epoch": 89.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5082967877388,
"eval_runtime": 3.7279,
"eval_samples_per_second": 68.135,
"eval_steps_per_second": 0.536,
"step": 2047
},
{
"epoch": 89.1304347826087,
"grad_norm": 3.1737847328186035,
"learning_rate": 1.1086956521739131e-05,
"loss": 0.4449,
"step": 2050
},
{
"epoch": 89.56521739130434,
"grad_norm": 3.3332552909851074,
"learning_rate": 1.1043478260869566e-05,
"loss": 0.4148,
"step": 2060
},
{
"epoch": 90.0,
"grad_norm": 5.011209487915039,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.4383,
"step": 2070
},
{
"epoch": 90.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5015798211097717,
"eval_runtime": 5.0126,
"eval_samples_per_second": 50.672,
"eval_steps_per_second": 0.399,
"step": 2070
},
{
"epoch": 90.43478260869566,
"grad_norm": 2.4368808269500732,
"learning_rate": 1.0956521739130435e-05,
"loss": 0.4133,
"step": 2080
},
{
"epoch": 90.8695652173913,
"grad_norm": 5.885110378265381,
"learning_rate": 1.091304347826087e-05,
"loss": 0.4328,
"step": 2090
},
{
"epoch": 91.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5062097311019897,
"eval_runtime": 3.7299,
"eval_samples_per_second": 68.098,
"eval_steps_per_second": 0.536,
"step": 2093
},
{
"epoch": 91.30434782608695,
"grad_norm": 2.2072901725769043,
"learning_rate": 1.0869565217391305e-05,
"loss": 0.4137,
"step": 2100
},
{
"epoch": 91.73913043478261,
"grad_norm": 1.9848076105117798,
"learning_rate": 1.082608695652174e-05,
"loss": 0.3978,
"step": 2110
},
{
"epoch": 92.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5026075839996338,
"eval_runtime": 3.7759,
"eval_samples_per_second": 67.268,
"eval_steps_per_second": 0.53,
"step": 2116
},
{
"epoch": 92.17391304347827,
"grad_norm": 3.738398313522339,
"learning_rate": 1.0782608695652175e-05,
"loss": 0.4459,
"step": 2120
},
{
"epoch": 92.6086956521739,
"grad_norm": 3.0096168518066406,
"learning_rate": 1.073913043478261e-05,
"loss": 0.4052,
"step": 2130
},
{
"epoch": 93.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.49642127752304077,
"eval_runtime": 5.0236,
"eval_samples_per_second": 50.562,
"eval_steps_per_second": 0.398,
"step": 2139
},
{
"epoch": 93.04347826086956,
"grad_norm": 3.7452170848846436,
"learning_rate": 1.0695652173913046e-05,
"loss": 0.4205,
"step": 2140
},
{
"epoch": 93.47826086956522,
"grad_norm": 3.8985049724578857,
"learning_rate": 1.0652173913043479e-05,
"loss": 0.4171,
"step": 2150
},
{
"epoch": 93.91304347826087,
"grad_norm": 2.283020496368408,
"learning_rate": 1.0608695652173914e-05,
"loss": 0.3938,
"step": 2160
},
{
"epoch": 94.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5036487579345703,
"eval_runtime": 3.7298,
"eval_samples_per_second": 68.101,
"eval_steps_per_second": 0.536,
"step": 2162
},
{
"epoch": 94.34782608695652,
"grad_norm": 7.054046630859375,
"learning_rate": 1.0565217391304348e-05,
"loss": 0.4336,
"step": 2170
},
{
"epoch": 94.78260869565217,
"grad_norm": 3.131002902984619,
"learning_rate": 1.0521739130434783e-05,
"loss": 0.393,
"step": 2180
},
{
"epoch": 95.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5102458596229553,
"eval_runtime": 3.6839,
"eval_samples_per_second": 68.949,
"eval_steps_per_second": 0.543,
"step": 2185
},
{
"epoch": 95.21739130434783,
"grad_norm": 2.4622268676757812,
"learning_rate": 1.0478260869565218e-05,
"loss": 0.3997,
"step": 2190
},
{
"epoch": 95.65217391304348,
"grad_norm": 3.815375566482544,
"learning_rate": 1.0434782608695653e-05,
"loss": 0.4294,
"step": 2200
},
{
"epoch": 96.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5002910494804382,
"eval_runtime": 4.8997,
"eval_samples_per_second": 51.84,
"eval_steps_per_second": 0.408,
"step": 2208
},
{
"epoch": 96.08695652173913,
"grad_norm": 8.787290573120117,
"learning_rate": 1.0391304347826088e-05,
"loss": 0.4155,
"step": 2210
},
{
"epoch": 96.52173913043478,
"grad_norm": 2.8499906063079834,
"learning_rate": 1.0347826086956523e-05,
"loss": 0.4095,
"step": 2220
},
{
"epoch": 96.95652173913044,
"grad_norm": 6.26355504989624,
"learning_rate": 1.0304347826086958e-05,
"loss": 0.4122,
"step": 2230
},
{
"epoch": 97.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5013226270675659,
"eval_runtime": 3.7744,
"eval_samples_per_second": 67.296,
"eval_steps_per_second": 0.53,
"step": 2231
},
{
"epoch": 97.3913043478261,
"grad_norm": 3.257772445678711,
"learning_rate": 1.0260869565217393e-05,
"loss": 0.3522,
"step": 2240
},
{
"epoch": 97.82608695652173,
"grad_norm": 2.788611888885498,
"learning_rate": 1.0217391304347829e-05,
"loss": 0.4207,
"step": 2250
},
{
"epoch": 98.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.507587730884552,
"eval_runtime": 3.8534,
"eval_samples_per_second": 65.915,
"eval_steps_per_second": 0.519,
"step": 2254
},
{
"epoch": 98.26086956521739,
"grad_norm": 2.974043846130371,
"learning_rate": 1.017391304347826e-05,
"loss": 0.4352,
"step": 2260
},
{
"epoch": 98.69565217391305,
"grad_norm": 3.231869697570801,
"learning_rate": 1.0130434782608695e-05,
"loss": 0.4127,
"step": 2270
},
{
"epoch": 99.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.503979504108429,
"eval_runtime": 5.0998,
"eval_samples_per_second": 49.806,
"eval_steps_per_second": 0.392,
"step": 2277
},
{
"epoch": 99.1304347826087,
"grad_norm": 2.597999095916748,
"learning_rate": 1.008695652173913e-05,
"loss": 0.3888,
"step": 2280
},
{
"epoch": 99.56521739130434,
"grad_norm": 4.4219889640808105,
"learning_rate": 1.0043478260869566e-05,
"loss": 0.3921,
"step": 2290
},
{
"epoch": 100.0,
"grad_norm": 4.641758441925049,
"learning_rate": 1e-05,
"loss": 0.441,
"step": 2300
},
{
"epoch": 100.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.502194881439209,
"eval_runtime": 3.7077,
"eval_samples_per_second": 68.506,
"eval_steps_per_second": 0.539,
"step": 2300
},
{
"epoch": 100.43478260869566,
"grad_norm": 2.948529005050659,
"learning_rate": 9.956521739130436e-06,
"loss": 0.4324,
"step": 2310
},
{
"epoch": 100.8695652173913,
"grad_norm": 2.4855594635009766,
"learning_rate": 9.913043478260871e-06,
"loss": 0.3938,
"step": 2320
},
{
"epoch": 101.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.4974897503852844,
"eval_runtime": 3.7364,
"eval_samples_per_second": 67.98,
"eval_steps_per_second": 0.535,
"step": 2323
},
{
"epoch": 101.30434782608695,
"grad_norm": 4.753269195556641,
"learning_rate": 9.869565217391304e-06,
"loss": 0.3918,
"step": 2330
},
{
"epoch": 101.73913043478261,
"grad_norm": 5.000470161437988,
"learning_rate": 9.82608695652174e-06,
"loss": 0.4109,
"step": 2340
},
{
"epoch": 102.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5018798112869263,
"eval_runtime": 4.8425,
"eval_samples_per_second": 52.452,
"eval_steps_per_second": 0.413,
"step": 2346
},
{
"epoch": 102.17391304347827,
"grad_norm": 2.8584697246551514,
"learning_rate": 9.782608695652175e-06,
"loss": 0.4199,
"step": 2350
},
{
"epoch": 102.6086956521739,
"grad_norm": 2.773083448410034,
"learning_rate": 9.73913043478261e-06,
"loss": 0.4299,
"step": 2360
},
{
"epoch": 103.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5060404539108276,
"eval_runtime": 3.7179,
"eval_samples_per_second": 68.318,
"eval_steps_per_second": 0.538,
"step": 2369
},
{
"epoch": 103.04347826086956,
"grad_norm": 1.847158670425415,
"learning_rate": 9.695652173913043e-06,
"loss": 0.3834,
"step": 2370
},
{
"epoch": 103.47826086956522,
"grad_norm": 4.114128112792969,
"learning_rate": 9.652173913043478e-06,
"loss": 0.4061,
"step": 2380
},
{
"epoch": 103.91304347826087,
"grad_norm": 5.080406665802002,
"learning_rate": 9.608695652173914e-06,
"loss": 0.4148,
"step": 2390
},
{
"epoch": 104.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5038026571273804,
"eval_runtime": 3.7535,
"eval_samples_per_second": 67.671,
"eval_steps_per_second": 0.533,
"step": 2392
},
{
"epoch": 104.34782608695652,
"grad_norm": 3.291896104812622,
"learning_rate": 9.565217391304349e-06,
"loss": 0.4272,
"step": 2400
},
{
"epoch": 104.78260869565217,
"grad_norm": 2.7959041595458984,
"learning_rate": 9.521739130434784e-06,
"loss": 0.4179,
"step": 2410
},
{
"epoch": 105.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5064316391944885,
"eval_runtime": 4.8627,
"eval_samples_per_second": 52.235,
"eval_steps_per_second": 0.411,
"step": 2415
},
{
"epoch": 105.21739130434783,
"grad_norm": 5.880518913269043,
"learning_rate": 9.478260869565217e-06,
"loss": 0.4155,
"step": 2420
},
{
"epoch": 105.65217391304348,
"grad_norm": 2.2435200214385986,
"learning_rate": 9.434782608695652e-06,
"loss": 0.4352,
"step": 2430
},
{
"epoch": 106.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5059410929679871,
"eval_runtime": 3.7149,
"eval_samples_per_second": 68.373,
"eval_steps_per_second": 0.538,
"step": 2438
},
{
"epoch": 106.08695652173913,
"grad_norm": 3.1865811347961426,
"learning_rate": 9.391304347826087e-06,
"loss": 0.3997,
"step": 2440
},
{
"epoch": 106.52173913043478,
"grad_norm": 4.0479936599731445,
"learning_rate": 9.347826086956523e-06,
"loss": 0.401,
"step": 2450
},
{
"epoch": 106.95652173913044,
"grad_norm": 2.87663197517395,
"learning_rate": 9.304347826086956e-06,
"loss": 0.4027,
"step": 2460
},
{
"epoch": 107.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5025486350059509,
"eval_runtime": 3.7614,
"eval_samples_per_second": 67.528,
"eval_steps_per_second": 0.532,
"step": 2461
},
{
"epoch": 107.3913043478261,
"grad_norm": 2.630986452102661,
"learning_rate": 9.260869565217391e-06,
"loss": 0.3828,
"step": 2470
},
{
"epoch": 107.82608695652173,
"grad_norm": 2.9700822830200195,
"learning_rate": 9.217391304347826e-06,
"loss": 0.4002,
"step": 2480
},
{
"epoch": 108.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5020495653152466,
"eval_runtime": 4.6331,
"eval_samples_per_second": 54.823,
"eval_steps_per_second": 0.432,
"step": 2484
},
{
"epoch": 108.26086956521739,
"grad_norm": 4.361221790313721,
"learning_rate": 9.173913043478261e-06,
"loss": 0.405,
"step": 2490
},
{
"epoch": 108.69565217391305,
"grad_norm": 2.9328296184539795,
"learning_rate": 9.130434782608697e-06,
"loss": 0.3988,
"step": 2500
},
{
"epoch": 109.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5063354969024658,
"eval_runtime": 3.8012,
"eval_samples_per_second": 66.821,
"eval_steps_per_second": 0.526,
"step": 2507
},
{
"epoch": 109.1304347826087,
"grad_norm": 2.3236513137817383,
"learning_rate": 9.086956521739132e-06,
"loss": 0.3894,
"step": 2510
},
{
"epoch": 109.56521739130434,
"grad_norm": 3.4379804134368896,
"learning_rate": 9.043478260869565e-06,
"loss": 0.4023,
"step": 2520
},
{
"epoch": 110.0,
"grad_norm": 4.300137042999268,
"learning_rate": 9e-06,
"loss": 0.4095,
"step": 2530
},
{
"epoch": 110.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5034452676773071,
"eval_runtime": 3.7021,
"eval_samples_per_second": 68.61,
"eval_steps_per_second": 0.54,
"step": 2530
},
{
"epoch": 110.43478260869566,
"grad_norm": 2.190524101257324,
"learning_rate": 8.956521739130435e-06,
"loss": 0.4072,
"step": 2540
},
{
"epoch": 110.8695652173913,
"grad_norm": 2.2291879653930664,
"learning_rate": 8.91304347826087e-06,
"loss": 0.4001,
"step": 2550
},
{
"epoch": 111.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.505436360836029,
"eval_runtime": 4.2919,
"eval_samples_per_second": 59.182,
"eval_steps_per_second": 0.466,
"step": 2553
},
{
"epoch": 111.30434782608695,
"grad_norm": 3.1182541847229004,
"learning_rate": 8.869565217391306e-06,
"loss": 0.3904,
"step": 2560
},
{
"epoch": 111.73913043478261,
"grad_norm": 3.8375625610351562,
"learning_rate": 8.82608695652174e-06,
"loss": 0.4201,
"step": 2570
},
{
"epoch": 112.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5076125860214233,
"eval_runtime": 4.2691,
"eval_samples_per_second": 59.497,
"eval_steps_per_second": 0.468,
"step": 2576
},
{
"epoch": 112.17391304347827,
"grad_norm": 2.4231808185577393,
"learning_rate": 8.782608695652174e-06,
"loss": 0.3925,
"step": 2580
},
{
"epoch": 112.6086956521739,
"grad_norm": 4.854309558868408,
"learning_rate": 8.73913043478261e-06,
"loss": 0.4134,
"step": 2590
},
{
"epoch": 113.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5069688558578491,
"eval_runtime": 3.7367,
"eval_samples_per_second": 67.974,
"eval_steps_per_second": 0.535,
"step": 2599
},
{
"epoch": 113.04347826086956,
"grad_norm": 4.327704429626465,
"learning_rate": 8.695652173913044e-06,
"loss": 0.3959,
"step": 2600
},
{
"epoch": 113.47826086956522,
"grad_norm": 2.8718910217285156,
"learning_rate": 8.65217391304348e-06,
"loss": 0.3806,
"step": 2610
},
{
"epoch": 113.91304347826087,
"grad_norm": 5.400497913360596,
"learning_rate": 8.608695652173915e-06,
"loss": 0.3614,
"step": 2620
},
{
"epoch": 114.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5032684206962585,
"eval_runtime": 4.4717,
"eval_samples_per_second": 56.802,
"eval_steps_per_second": 0.447,
"step": 2622
},
{
"epoch": 114.34782608695652,
"grad_norm": 2.7276597023010254,
"learning_rate": 8.56521739130435e-06,
"loss": 0.3956,
"step": 2630
},
{
"epoch": 114.78260869565217,
"grad_norm": 3.339860200881958,
"learning_rate": 8.521739130434783e-06,
"loss": 0.3928,
"step": 2640
},
{
"epoch": 115.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5042973160743713,
"eval_runtime": 4.1216,
"eval_samples_per_second": 61.627,
"eval_steps_per_second": 0.485,
"step": 2645
},
{
"epoch": 115.21739130434783,
"grad_norm": 2.435579538345337,
"learning_rate": 8.478260869565218e-06,
"loss": 0.4149,
"step": 2650
},
{
"epoch": 115.65217391304348,
"grad_norm": 3.9001612663269043,
"learning_rate": 8.434782608695653e-06,
"loss": 0.435,
"step": 2660
},
{
"epoch": 116.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.4998602271080017,
"eval_runtime": 3.7703,
"eval_samples_per_second": 67.369,
"eval_steps_per_second": 0.53,
"step": 2668
},
{
"epoch": 116.08695652173913,
"grad_norm": 4.031954288482666,
"learning_rate": 8.391304347826089e-06,
"loss": 0.3575,
"step": 2670
},
{
"epoch": 116.52173913043478,
"grad_norm": 3.1172120571136475,
"learning_rate": 8.347826086956522e-06,
"loss": 0.4062,
"step": 2680
},
{
"epoch": 116.95652173913044,
"grad_norm": 2.6061761379241943,
"learning_rate": 8.304347826086957e-06,
"loss": 0.4162,
"step": 2690
},
{
"epoch": 117.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5132189393043518,
"eval_runtime": 4.4279,
"eval_samples_per_second": 57.363,
"eval_steps_per_second": 0.452,
"step": 2691
},
{
"epoch": 117.3913043478261,
"grad_norm": 1.7457960844039917,
"learning_rate": 8.260869565217392e-06,
"loss": 0.3887,
"step": 2700
},
{
"epoch": 117.82608695652173,
"grad_norm": 5.013397216796875,
"learning_rate": 8.217391304347827e-06,
"loss": 0.4078,
"step": 2710
},
{
"epoch": 118.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5088200569152832,
"eval_runtime": 4.1897,
"eval_samples_per_second": 60.625,
"eval_steps_per_second": 0.477,
"step": 2714
},
{
"epoch": 118.26086956521739,
"grad_norm": 3.4758872985839844,
"learning_rate": 8.173913043478263e-06,
"loss": 0.4251,
"step": 2720
},
{
"epoch": 118.69565217391305,
"grad_norm": 1.8225319385528564,
"learning_rate": 8.130434782608696e-06,
"loss": 0.4025,
"step": 2730
},
{
"epoch": 119.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.507527768611908,
"eval_runtime": 3.721,
"eval_samples_per_second": 68.261,
"eval_steps_per_second": 0.537,
"step": 2737
},
{
"epoch": 119.1304347826087,
"grad_norm": 4.636626720428467,
"learning_rate": 8.086956521739131e-06,
"loss": 0.4024,
"step": 2740
},
{
"epoch": 119.56521739130434,
"grad_norm": 2.249758720397949,
"learning_rate": 8.043478260869566e-06,
"loss": 0.3917,
"step": 2750
},
{
"epoch": 120.0,
"grad_norm": 6.408204555511475,
"learning_rate": 8.000000000000001e-06,
"loss": 0.4096,
"step": 2760
},
{
"epoch": 120.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.502310574054718,
"eval_runtime": 4.469,
"eval_samples_per_second": 56.836,
"eval_steps_per_second": 0.448,
"step": 2760
},
{
"epoch": 120.43478260869566,
"grad_norm": 2.495302200317383,
"learning_rate": 7.956521739130435e-06,
"loss": 0.3791,
"step": 2770
},
{
"epoch": 120.8695652173913,
"grad_norm": 2.840449571609497,
"learning_rate": 7.91304347826087e-06,
"loss": 0.3879,
"step": 2780
},
{
"epoch": 121.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5062641501426697,
"eval_runtime": 3.9315,
"eval_samples_per_second": 64.606,
"eval_steps_per_second": 0.509,
"step": 2783
},
{
"epoch": 121.30434782608695,
"grad_norm": 4.82555627822876,
"learning_rate": 7.869565217391305e-06,
"loss": 0.4232,
"step": 2790
},
{
"epoch": 121.73913043478261,
"grad_norm": 3.220736503601074,
"learning_rate": 7.82608695652174e-06,
"loss": 0.4033,
"step": 2800
},
{
"epoch": 122.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5001329183578491,
"eval_runtime": 3.7903,
"eval_samples_per_second": 67.012,
"eval_steps_per_second": 0.528,
"step": 2806
},
{
"epoch": 122.17391304347827,
"grad_norm": 4.516547203063965,
"learning_rate": 7.782608695652174e-06,
"loss": 0.4144,
"step": 2810
},
{
"epoch": 122.6086956521739,
"grad_norm": 2.559272289276123,
"learning_rate": 7.739130434782609e-06,
"loss": 0.3927,
"step": 2820
},
{
"epoch": 123.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5087068676948547,
"eval_runtime": 4.404,
"eval_samples_per_second": 57.675,
"eval_steps_per_second": 0.454,
"step": 2829
},
{
"epoch": 123.04347826086956,
"grad_norm": 3.344332695007324,
"learning_rate": 7.695652173913044e-06,
"loss": 0.4016,
"step": 2830
},
{
"epoch": 123.47826086956522,
"grad_norm": 2.610856533050537,
"learning_rate": 7.652173913043479e-06,
"loss": 0.3925,
"step": 2840
},
{
"epoch": 123.91304347826087,
"grad_norm": 3.501596689224243,
"learning_rate": 7.608695652173914e-06,
"loss": 0.3803,
"step": 2850
},
{
"epoch": 124.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5149940848350525,
"eval_runtime": 4.2503,
"eval_samples_per_second": 59.76,
"eval_steps_per_second": 0.471,
"step": 2852
},
{
"epoch": 124.34782608695652,
"grad_norm": 4.040353298187256,
"learning_rate": 7.565217391304348e-06,
"loss": 0.4101,
"step": 2860
},
{
"epoch": 124.78260869565217,
"grad_norm": 3.1806752681732178,
"learning_rate": 7.5217391304347835e-06,
"loss": 0.4248,
"step": 2870
},
{
"epoch": 125.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.515027642250061,
"eval_runtime": 3.7006,
"eval_samples_per_second": 68.638,
"eval_steps_per_second": 0.54,
"step": 2875
},
{
"epoch": 125.21739130434783,
"grad_norm": 2.976123332977295,
"learning_rate": 7.478260869565218e-06,
"loss": 0.3806,
"step": 2880
},
{
"epoch": 125.65217391304348,
"grad_norm": 4.0399250984191895,
"learning_rate": 7.434782608695653e-06,
"loss": 0.3874,
"step": 2890
},
{
"epoch": 126.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5157892107963562,
"eval_runtime": 3.8292,
"eval_samples_per_second": 66.332,
"eval_steps_per_second": 0.522,
"step": 2898
},
{
"epoch": 126.08695652173913,
"grad_norm": 2.8186984062194824,
"learning_rate": 7.391304347826087e-06,
"loss": 0.4068,
"step": 2900
},
{
"epoch": 126.52173913043478,
"grad_norm": 1.7811031341552734,
"learning_rate": 7.347826086956522e-06,
"loss": 0.4188,
"step": 2910
},
{
"epoch": 126.95652173913044,
"grad_norm": 2.591479539871216,
"learning_rate": 7.304347826086957e-06,
"loss": 0.3646,
"step": 2920
},
{
"epoch": 127.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.4979710578918457,
"eval_runtime": 4.9476,
"eval_samples_per_second": 51.338,
"eval_steps_per_second": 0.404,
"step": 2921
},
{
"epoch": 127.3913043478261,
"grad_norm": 3.097064733505249,
"learning_rate": 7.2608695652173925e-06,
"loss": 0.3809,
"step": 2930
},
{
"epoch": 127.82608695652173,
"grad_norm": 4.5358805656433105,
"learning_rate": 7.217391304347827e-06,
"loss": 0.4115,
"step": 2940
},
{
"epoch": 128.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.507692813873291,
"eval_runtime": 3.7086,
"eval_samples_per_second": 68.489,
"eval_steps_per_second": 0.539,
"step": 2944
},
{
"epoch": 128.2608695652174,
"grad_norm": 4.192093372344971,
"learning_rate": 7.173913043478261e-06,
"loss": 0.3931,
"step": 2950
},
{
"epoch": 128.69565217391303,
"grad_norm": 2.4763779640197754,
"learning_rate": 7.130434782608696e-06,
"loss": 0.385,
"step": 2960
},
{
"epoch": 129.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5153175592422485,
"eval_runtime": 3.726,
"eval_samples_per_second": 68.17,
"eval_steps_per_second": 0.537,
"step": 2967
},
{
"epoch": 129.1304347826087,
"grad_norm": 2.906510353088379,
"learning_rate": 7.086956521739131e-06,
"loss": 0.4009,
"step": 2970
},
{
"epoch": 129.56521739130434,
"grad_norm": 5.497567653656006,
"learning_rate": 7.0434782608695665e-06,
"loss": 0.4091,
"step": 2980
},
{
"epoch": 130.0,
"grad_norm": 4.277368068695068,
"learning_rate": 7e-06,
"loss": 0.4064,
"step": 2990
},
{
"epoch": 130.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.511443018913269,
"eval_runtime": 5.0723,
"eval_samples_per_second": 50.076,
"eval_steps_per_second": 0.394,
"step": 2990
},
{
"epoch": 130.43478260869566,
"grad_norm": 2.3368613719940186,
"learning_rate": 6.956521739130435e-06,
"loss": 0.3762,
"step": 3000
},
{
"epoch": 130.8695652173913,
"grad_norm": 2.983280897140503,
"learning_rate": 6.91304347826087e-06,
"loss": 0.4168,
"step": 3010
},
{
"epoch": 131.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5056832432746887,
"eval_runtime": 3.7472,
"eval_samples_per_second": 67.785,
"eval_steps_per_second": 0.534,
"step": 3013
},
{
"epoch": 131.30434782608697,
"grad_norm": 5.6472978591918945,
"learning_rate": 6.869565217391305e-06,
"loss": 0.3454,
"step": 3020
},
{
"epoch": 131.7391304347826,
"grad_norm": 2.710934638977051,
"learning_rate": 6.8260869565217395e-06,
"loss": 0.4319,
"step": 3030
},
{
"epoch": 132.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5041180849075317,
"eval_runtime": 3.7165,
"eval_samples_per_second": 68.344,
"eval_steps_per_second": 0.538,
"step": 3036
},
{
"epoch": 132.17391304347825,
"grad_norm": 2.8998305797576904,
"learning_rate": 6.782608695652174e-06,
"loss": 0.3769,
"step": 3040
},
{
"epoch": 132.6086956521739,
"grad_norm": 3.503068208694458,
"learning_rate": 6.739130434782609e-06,
"loss": 0.4234,
"step": 3050
},
{
"epoch": 133.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5119389891624451,
"eval_runtime": 4.8197,
"eval_samples_per_second": 52.701,
"eval_steps_per_second": 0.415,
"step": 3059
},
{
"epoch": 133.04347826086956,
"grad_norm": 2.628817319869995,
"learning_rate": 6.695652173913044e-06,
"loss": 0.3984,
"step": 3060
},
{
"epoch": 133.47826086956522,
"grad_norm": 3.1060750484466553,
"learning_rate": 6.652173913043479e-06,
"loss": 0.4147,
"step": 3070
},
{
"epoch": 133.91304347826087,
"grad_norm": 6.7668328285217285,
"learning_rate": 6.6086956521739135e-06,
"loss": 0.3721,
"step": 3080
},
{
"epoch": 134.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.51175457239151,
"eval_runtime": 3.7909,
"eval_samples_per_second": 67.003,
"eval_steps_per_second": 0.528,
"step": 3082
},
{
"epoch": 134.34782608695653,
"grad_norm": 6.763729572296143,
"learning_rate": 6.565217391304349e-06,
"loss": 0.386,
"step": 3090
},
{
"epoch": 134.7826086956522,
"grad_norm": 4.876804828643799,
"learning_rate": 6.521739130434783e-06,
"loss": 0.3709,
"step": 3100
},
{
"epoch": 135.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5078221559524536,
"eval_runtime": 3.7684,
"eval_samples_per_second": 67.402,
"eval_steps_per_second": 0.531,
"step": 3105
},
{
"epoch": 135.2173913043478,
"grad_norm": 3.7445313930511475,
"learning_rate": 6.478260869565218e-06,
"loss": 0.3592,
"step": 3110
},
{
"epoch": 135.65217391304347,
"grad_norm": 5.715231418609619,
"learning_rate": 6.434782608695652e-06,
"loss": 0.4149,
"step": 3120
},
{
"epoch": 136.0,
"eval_accuracy": 0.7795275590551181,
"eval_loss": 0.5163589715957642,
"eval_runtime": 4.6746,
"eval_samples_per_second": 54.336,
"eval_steps_per_second": 0.428,
"step": 3128
},
{
"epoch": 136.08695652173913,
"grad_norm": 3.3850629329681396,
"learning_rate": 6.391304347826087e-06,
"loss": 0.3681,
"step": 3130
},
{
"epoch": 136.52173913043478,
"grad_norm": 5.502380847930908,
"learning_rate": 6.3478260869565225e-06,
"loss": 0.3629,
"step": 3140
},
{
"epoch": 136.95652173913044,
"grad_norm": 4.158088684082031,
"learning_rate": 6.304347826086958e-06,
"loss": 0.416,
"step": 3150
},
{
"epoch": 137.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5123007297515869,
"eval_runtime": 3.789,
"eval_samples_per_second": 67.036,
"eval_steps_per_second": 0.528,
"step": 3151
},
{
"epoch": 137.3913043478261,
"grad_norm": 2.241478681564331,
"learning_rate": 6.260869565217392e-06,
"loss": 0.4089,
"step": 3160
},
{
"epoch": 137.82608695652175,
"grad_norm": 4.336514472961426,
"learning_rate": 6.217391304347826e-06,
"loss": 0.406,
"step": 3170
},
{
"epoch": 138.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5115824937820435,
"eval_runtime": 3.7195,
"eval_samples_per_second": 68.288,
"eval_steps_per_second": 0.538,
"step": 3174
},
{
"epoch": 138.2608695652174,
"grad_norm": 2.154179334640503,
"learning_rate": 6.173913043478261e-06,
"loss": 0.4018,
"step": 3180
},
{
"epoch": 138.69565217391303,
"grad_norm": 3.2215845584869385,
"learning_rate": 6.1304347826086965e-06,
"loss": 0.3613,
"step": 3190
},
{
"epoch": 139.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5169662237167358,
"eval_runtime": 4.4593,
"eval_samples_per_second": 56.959,
"eval_steps_per_second": 0.448,
"step": 3197
},
{
"epoch": 139.1304347826087,
"grad_norm": 2.800915241241455,
"learning_rate": 6.086956521739132e-06,
"loss": 0.3863,
"step": 3200
},
{
"epoch": 139.56521739130434,
"grad_norm": 7.433578014373779,
"learning_rate": 6.043478260869565e-06,
"loss": 0.4278,
"step": 3210
},
{
"epoch": 140.0,
"grad_norm": 3.887300968170166,
"learning_rate": 6e-06,
"loss": 0.3786,
"step": 3220
},
{
"epoch": 140.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.5098868608474731,
"eval_runtime": 4.1343,
"eval_samples_per_second": 61.437,
"eval_steps_per_second": 0.484,
"step": 3220
},
{
"epoch": 140.43478260869566,
"grad_norm": 3.3379013538360596,
"learning_rate": 5.956521739130435e-06,
"loss": 0.405,
"step": 3230
},
{
"epoch": 140.8695652173913,
"grad_norm": 3.2763419151306152,
"learning_rate": 5.91304347826087e-06,
"loss": 0.3976,
"step": 3240
},
{
"epoch": 141.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5111474394798279,
"eval_runtime": 3.8029,
"eval_samples_per_second": 66.792,
"eval_steps_per_second": 0.526,
"step": 3243
},
{
"epoch": 141.30434782608697,
"grad_norm": 3.1908023357391357,
"learning_rate": 5.8695652173913055e-06,
"loss": 0.3856,
"step": 3250
},
{
"epoch": 141.7391304347826,
"grad_norm": 3.875778913497925,
"learning_rate": 5.826086956521739e-06,
"loss": 0.371,
"step": 3260
},
{
"epoch": 142.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5081124901771545,
"eval_runtime": 4.5605,
"eval_samples_per_second": 55.696,
"eval_steps_per_second": 0.439,
"step": 3266
},
{
"epoch": 142.17391304347825,
"grad_norm": 2.925506353378296,
"learning_rate": 5.782608695652174e-06,
"loss": 0.4169,
"step": 3270
},
{
"epoch": 142.6086956521739,
"grad_norm": 9.266388893127441,
"learning_rate": 5.739130434782609e-06,
"loss": 0.4056,
"step": 3280
},
{
"epoch": 143.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5098369717597961,
"eval_runtime": 3.9928,
"eval_samples_per_second": 63.615,
"eval_steps_per_second": 0.501,
"step": 3289
},
{
"epoch": 143.04347826086956,
"grad_norm": 25.856365203857422,
"learning_rate": 5.695652173913044e-06,
"loss": 0.3757,
"step": 3290
},
{
"epoch": 143.47826086956522,
"grad_norm": 2.937258720397949,
"learning_rate": 5.652173913043479e-06,
"loss": 0.3745,
"step": 3300
},
{
"epoch": 143.91304347826087,
"grad_norm": 3.236806631088257,
"learning_rate": 5.608695652173914e-06,
"loss": 0.4214,
"step": 3310
},
{
"epoch": 144.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5085259675979614,
"eval_runtime": 3.7621,
"eval_samples_per_second": 67.516,
"eval_steps_per_second": 0.532,
"step": 3312
},
{
"epoch": 144.34782608695653,
"grad_norm": 3.6454241275787354,
"learning_rate": 5.565217391304348e-06,
"loss": 0.3659,
"step": 3320
},
{
"epoch": 144.7826086956522,
"grad_norm": 3.4510464668273926,
"learning_rate": 5.521739130434783e-06,
"loss": 0.3832,
"step": 3330
},
{
"epoch": 145.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5084368586540222,
"eval_runtime": 4.2307,
"eval_samples_per_second": 60.037,
"eval_steps_per_second": 0.473,
"step": 3335
},
{
"epoch": 145.2173913043478,
"grad_norm": 2.4478542804718018,
"learning_rate": 5.478260869565217e-06,
"loss": 0.3522,
"step": 3340
},
{
"epoch": 145.65217391304347,
"grad_norm": 4.097745895385742,
"learning_rate": 5.4347826086956525e-06,
"loss": 0.3762,
"step": 3350
},
{
"epoch": 146.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5060733556747437,
"eval_runtime": 4.203,
"eval_samples_per_second": 60.433,
"eval_steps_per_second": 0.476,
"step": 3358
},
{
"epoch": 146.08695652173913,
"grad_norm": 3.384960651397705,
"learning_rate": 5.391304347826088e-06,
"loss": 0.3761,
"step": 3360
},
{
"epoch": 146.52173913043478,
"grad_norm": 2.909395217895508,
"learning_rate": 5.347826086956523e-06,
"loss": 0.3902,
"step": 3370
},
{
"epoch": 146.95652173913044,
"grad_norm": 2.538163900375366,
"learning_rate": 5.304347826086957e-06,
"loss": 0.4118,
"step": 3380
},
{
"epoch": 147.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5111083984375,
"eval_runtime": 3.8334,
"eval_samples_per_second": 66.26,
"eval_steps_per_second": 0.522,
"step": 3381
},
{
"epoch": 147.3913043478261,
"grad_norm": 2.9644970893859863,
"learning_rate": 5.260869565217391e-06,
"loss": 0.3802,
"step": 3390
},
{
"epoch": 147.82608695652175,
"grad_norm": 3.0972464084625244,
"learning_rate": 5.2173913043478265e-06,
"loss": 0.3866,
"step": 3400
},
{
"epoch": 148.0,
"eval_accuracy": 0.8070866141732284,
"eval_loss": 0.5092455148696899,
"eval_runtime": 5.6174,
"eval_samples_per_second": 45.216,
"eval_steps_per_second": 0.356,
"step": 3404
},
{
"epoch": 148.2608695652174,
"grad_norm": 1.625214695930481,
"learning_rate": 5.173913043478262e-06,
"loss": 0.3584,
"step": 3410
},
{
"epoch": 148.69565217391303,
"grad_norm": 15.01403522491455,
"learning_rate": 5.130434782608697e-06,
"loss": 0.3869,
"step": 3420
},
{
"epoch": 149.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.512187659740448,
"eval_runtime": 3.8515,
"eval_samples_per_second": 65.948,
"eval_steps_per_second": 0.519,
"step": 3427
},
{
"epoch": 149.1304347826087,
"grad_norm": 1.9776344299316406,
"learning_rate": 5.08695652173913e-06,
"loss": 0.3921,
"step": 3430
},
{
"epoch": 149.56521739130434,
"grad_norm": 2.336129665374756,
"learning_rate": 5.043478260869565e-06,
"loss": 0.4048,
"step": 3440
},
{
"epoch": 150.0,
"grad_norm": 3.6398816108703613,
"learning_rate": 5e-06,
"loss": 0.3734,
"step": 3450
},
{
"epoch": 150.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5116916298866272,
"eval_runtime": 3.7475,
"eval_samples_per_second": 67.779,
"eval_steps_per_second": 0.534,
"step": 3450
},
{
"epoch": 150.43478260869566,
"grad_norm": 2.299021005630493,
"learning_rate": 4.9565217391304355e-06,
"loss": 0.3734,
"step": 3460
},
{
"epoch": 150.8695652173913,
"grad_norm": 3.107494831085205,
"learning_rate": 4.91304347826087e-06,
"loss": 0.4061,
"step": 3470
},
{
"epoch": 151.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5094764232635498,
"eval_runtime": 4.4075,
"eval_samples_per_second": 57.629,
"eval_steps_per_second": 0.454,
"step": 3473
},
{
"epoch": 151.30434782608697,
"grad_norm": 2.319066286087036,
"learning_rate": 4.869565217391305e-06,
"loss": 0.3681,
"step": 3480
},
{
"epoch": 151.7391304347826,
"grad_norm": 2.7603538036346436,
"learning_rate": 4.826086956521739e-06,
"loss": 0.3705,
"step": 3490
},
{
"epoch": 152.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5171404480934143,
"eval_runtime": 4.362,
"eval_samples_per_second": 58.23,
"eval_steps_per_second": 0.459,
"step": 3496
},
{
"epoch": 152.17391304347825,
"grad_norm": 2.0375826358795166,
"learning_rate": 4.782608695652174e-06,
"loss": 0.3882,
"step": 3500
},
{
"epoch": 152.6086956521739,
"grad_norm": 2.8498833179473877,
"learning_rate": 4.739130434782609e-06,
"loss": 0.3873,
"step": 3510
},
{
"epoch": 153.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5179200768470764,
"eval_runtime": 3.7588,
"eval_samples_per_second": 67.575,
"eval_steps_per_second": 0.532,
"step": 3519
},
{
"epoch": 153.04347826086956,
"grad_norm": 2.707977533340454,
"learning_rate": 4.695652173913044e-06,
"loss": 0.3979,
"step": 3520
},
{
"epoch": 153.47826086956522,
"grad_norm": 3.5183486938476562,
"learning_rate": 4.652173913043478e-06,
"loss": 0.4025,
"step": 3530
},
{
"epoch": 153.91304347826087,
"grad_norm": 2.90291166305542,
"learning_rate": 4.608695652173913e-06,
"loss": 0.3927,
"step": 3540
},
{
"epoch": 154.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5117496252059937,
"eval_runtime": 3.7541,
"eval_samples_per_second": 67.659,
"eval_steps_per_second": 0.533,
"step": 3542
},
{
"epoch": 154.34782608695653,
"grad_norm": 4.005958080291748,
"learning_rate": 4.565217391304348e-06,
"loss": 0.4011,
"step": 3550
},
{
"epoch": 154.7826086956522,
"grad_norm": 2.469202995300293,
"learning_rate": 4.5217391304347826e-06,
"loss": 0.3807,
"step": 3560
},
{
"epoch": 155.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5133464932441711,
"eval_runtime": 5.8154,
"eval_samples_per_second": 43.677,
"eval_steps_per_second": 0.344,
"step": 3565
},
{
"epoch": 155.2173913043478,
"grad_norm": 3.2248237133026123,
"learning_rate": 4.478260869565218e-06,
"loss": 0.4498,
"step": 3570
},
{
"epoch": 155.65217391304347,
"grad_norm": 3.463270425796509,
"learning_rate": 4.434782608695653e-06,
"loss": 0.3761,
"step": 3580
},
{
"epoch": 156.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5140319466590881,
"eval_runtime": 3.7668,
"eval_samples_per_second": 67.432,
"eval_steps_per_second": 0.531,
"step": 3588
},
{
"epoch": 156.08695652173913,
"grad_norm": 3.640611171722412,
"learning_rate": 4.391304347826087e-06,
"loss": 0.3609,
"step": 3590
},
{
"epoch": 156.52173913043478,
"grad_norm": 4.198793888092041,
"learning_rate": 4.347826086956522e-06,
"loss": 0.3984,
"step": 3600
},
{
"epoch": 156.95652173913044,
"grad_norm": 2.9035775661468506,
"learning_rate": 4.304347826086957e-06,
"loss": 0.3964,
"step": 3610
},
{
"epoch": 157.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5117691159248352,
"eval_runtime": 3.7832,
"eval_samples_per_second": 67.138,
"eval_steps_per_second": 0.529,
"step": 3611
},
{
"epoch": 157.3913043478261,
"grad_norm": 5.13762092590332,
"learning_rate": 4.260869565217392e-06,
"loss": 0.3818,
"step": 3620
},
{
"epoch": 157.82608695652175,
"grad_norm": 8.948963165283203,
"learning_rate": 4.217391304347827e-06,
"loss": 0.39,
"step": 3630
},
{
"epoch": 158.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.5122236609458923,
"eval_runtime": 4.6309,
"eval_samples_per_second": 54.849,
"eval_steps_per_second": 0.432,
"step": 3634
},
{
"epoch": 158.2608695652174,
"grad_norm": 2.4759654998779297,
"learning_rate": 4.173913043478261e-06,
"loss": 0.3784,
"step": 3640
},
{
"epoch": 158.69565217391303,
"grad_norm": 2.407663106918335,
"learning_rate": 4.130434782608696e-06,
"loss": 0.3943,
"step": 3650
},
{
"epoch": 159.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.5125917196273804,
"eval_runtime": 3.7278,
"eval_samples_per_second": 68.138,
"eval_steps_per_second": 0.537,
"step": 3657
},
{
"epoch": 159.1304347826087,
"grad_norm": 2.2464840412139893,
"learning_rate": 4.086956521739131e-06,
"loss": 0.3675,
"step": 3660
},
{
"epoch": 159.56521739130434,
"grad_norm": 3.0186944007873535,
"learning_rate": 4.0434782608695655e-06,
"loss": 0.355,
"step": 3670
},
{
"epoch": 160.0,
"grad_norm": 9.606362342834473,
"learning_rate": 4.000000000000001e-06,
"loss": 0.3417,
"step": 3680
},
{
"epoch": 160.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5096677541732788,
"eval_runtime": 3.7505,
"eval_samples_per_second": 67.724,
"eval_steps_per_second": 0.533,
"step": 3680
},
{
"epoch": 160.43478260869566,
"grad_norm": 3.155024766921997,
"learning_rate": 3.956521739130435e-06,
"loss": 0.3951,
"step": 3690
},
{
"epoch": 160.8695652173913,
"grad_norm": 2.3195645809173584,
"learning_rate": 3.91304347826087e-06,
"loss": 0.3996,
"step": 3700
},
{
"epoch": 161.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5048008561134338,
"eval_runtime": 4.9463,
"eval_samples_per_second": 51.351,
"eval_steps_per_second": 0.404,
"step": 3703
},
{
"epoch": 161.30434782608697,
"grad_norm": 16.818618774414062,
"learning_rate": 3.869565217391304e-06,
"loss": 0.3613,
"step": 3710
},
{
"epoch": 161.7391304347826,
"grad_norm": 5.290389060974121,
"learning_rate": 3.8260869565217395e-06,
"loss": 0.4,
"step": 3720
},
{
"epoch": 162.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5148473978042603,
"eval_runtime": 3.7348,
"eval_samples_per_second": 68.008,
"eval_steps_per_second": 0.535,
"step": 3726
},
{
"epoch": 162.17391304347825,
"grad_norm": 4.7519330978393555,
"learning_rate": 3.782608695652174e-06,
"loss": 0.3983,
"step": 3730
},
{
"epoch": 162.6086956521739,
"grad_norm": 2.433164358139038,
"learning_rate": 3.739130434782609e-06,
"loss": 0.4051,
"step": 3740
},
{
"epoch": 163.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5150399804115295,
"eval_runtime": 3.7013,
"eval_samples_per_second": 68.625,
"eval_steps_per_second": 0.54,
"step": 3749
},
{
"epoch": 163.04347826086956,
"grad_norm": 2.870962381362915,
"learning_rate": 3.6956521739130436e-06,
"loss": 0.3903,
"step": 3750
},
{
"epoch": 163.47826086956522,
"grad_norm": 3.3795669078826904,
"learning_rate": 3.6521739130434787e-06,
"loss": 0.3981,
"step": 3760
},
{
"epoch": 163.91304347826087,
"grad_norm": 4.447073936462402,
"learning_rate": 3.6086956521739134e-06,
"loss": 0.3973,
"step": 3770
},
{
"epoch": 164.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.5036624073982239,
"eval_runtime": 4.8343,
"eval_samples_per_second": 52.541,
"eval_steps_per_second": 0.414,
"step": 3772
},
{
"epoch": 164.34782608695653,
"grad_norm": 2.5403716564178467,
"learning_rate": 3.565217391304348e-06,
"loss": 0.3586,
"step": 3780
},
{
"epoch": 164.7826086956522,
"grad_norm": 2.5216853618621826,
"learning_rate": 3.5217391304347832e-06,
"loss": 0.3963,
"step": 3790
},
{
"epoch": 165.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5048288702964783,
"eval_runtime": 3.7404,
"eval_samples_per_second": 67.907,
"eval_steps_per_second": 0.535,
"step": 3795
},
{
"epoch": 165.2173913043478,
"grad_norm": 3.382376194000244,
"learning_rate": 3.4782608695652175e-06,
"loss": 0.4012,
"step": 3800
},
{
"epoch": 165.65217391304347,
"grad_norm": 3.0021872520446777,
"learning_rate": 3.4347826086956526e-06,
"loss": 0.3568,
"step": 3810
},
{
"epoch": 166.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5167564749717712,
"eval_runtime": 3.6895,
"eval_samples_per_second": 68.845,
"eval_steps_per_second": 0.542,
"step": 3818
},
{
"epoch": 166.08695652173913,
"grad_norm": 4.209798812866211,
"learning_rate": 3.391304347826087e-06,
"loss": 0.4217,
"step": 3820
},
{
"epoch": 166.52173913043478,
"grad_norm": 2.3605332374572754,
"learning_rate": 3.347826086956522e-06,
"loss": 0.3897,
"step": 3830
},
{
"epoch": 166.95652173913044,
"grad_norm": 7.9494733810424805,
"learning_rate": 3.3043478260869567e-06,
"loss": 0.3995,
"step": 3840
},
{
"epoch": 167.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5096150636672974,
"eval_runtime": 4.9956,
"eval_samples_per_second": 50.845,
"eval_steps_per_second": 0.4,
"step": 3841
},
{
"epoch": 167.3913043478261,
"grad_norm": 3.431043863296509,
"learning_rate": 3.2608695652173914e-06,
"loss": 0.3765,
"step": 3850
},
{
"epoch": 167.82608695652175,
"grad_norm": 3.4384922981262207,
"learning_rate": 3.217391304347826e-06,
"loss": 0.3628,
"step": 3860
},
{
"epoch": 168.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5101594924926758,
"eval_runtime": 3.7705,
"eval_samples_per_second": 67.365,
"eval_steps_per_second": 0.53,
"step": 3864
},
{
"epoch": 168.2608695652174,
"grad_norm": 8.502880096435547,
"learning_rate": 3.1739130434782613e-06,
"loss": 0.3857,
"step": 3870
},
{
"epoch": 168.69565217391303,
"grad_norm": 2.5634241104125977,
"learning_rate": 3.130434782608696e-06,
"loss": 0.3836,
"step": 3880
},
{
"epoch": 169.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5133307576179504,
"eval_runtime": 3.7532,
"eval_samples_per_second": 67.676,
"eval_steps_per_second": 0.533,
"step": 3887
},
{
"epoch": 169.1304347826087,
"grad_norm": 3.617677927017212,
"learning_rate": 3.0869565217391307e-06,
"loss": 0.4251,
"step": 3890
},
{
"epoch": 169.56521739130434,
"grad_norm": 3.9091439247131348,
"learning_rate": 3.043478260869566e-06,
"loss": 0.3747,
"step": 3900
},
{
"epoch": 170.0,
"grad_norm": 12.626005172729492,
"learning_rate": 3e-06,
"loss": 0.3646,
"step": 3910
},
{
"epoch": 170.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.5099019408226013,
"eval_runtime": 4.85,
"eval_samples_per_second": 52.372,
"eval_steps_per_second": 0.412,
"step": 3910
},
{
"epoch": 170.43478260869566,
"grad_norm": 5.3712263107299805,
"learning_rate": 2.956521739130435e-06,
"loss": 0.3732,
"step": 3920
},
{
"epoch": 170.8695652173913,
"grad_norm": 2.489645481109619,
"learning_rate": 2.9130434782608695e-06,
"loss": 0.3789,
"step": 3930
},
{
"epoch": 171.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5151440501213074,
"eval_runtime": 3.7265,
"eval_samples_per_second": 68.161,
"eval_steps_per_second": 0.537,
"step": 3933
},
{
"epoch": 171.30434782608697,
"grad_norm": 60.279747009277344,
"learning_rate": 2.8695652173913046e-06,
"loss": 0.3372,
"step": 3940
},
{
"epoch": 171.7391304347826,
"grad_norm": 5.177385330200195,
"learning_rate": 2.8260869565217393e-06,
"loss": 0.3832,
"step": 3950
},
{
"epoch": 172.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.5148643255233765,
"eval_runtime": 3.7835,
"eval_samples_per_second": 67.134,
"eval_steps_per_second": 0.529,
"step": 3956
},
{
"epoch": 172.17391304347825,
"grad_norm": 2.5841851234436035,
"learning_rate": 2.782608695652174e-06,
"loss": 0.405,
"step": 3960
},
{
"epoch": 172.6086956521739,
"grad_norm": 2.6472222805023193,
"learning_rate": 2.7391304347826087e-06,
"loss": 0.3476,
"step": 3970
},
{
"epoch": 173.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5178123116493225,
"eval_runtime": 5.0055,
"eval_samples_per_second": 50.745,
"eval_steps_per_second": 0.4,
"step": 3979
},
{
"epoch": 173.04347826086956,
"grad_norm": 2.3995625972747803,
"learning_rate": 2.695652173913044e-06,
"loss": 0.4347,
"step": 3980
},
{
"epoch": 173.47826086956522,
"grad_norm": 4.958439826965332,
"learning_rate": 2.6521739130434785e-06,
"loss": 0.3886,
"step": 3990
},
{
"epoch": 173.91304347826087,
"grad_norm": 4.661713600158691,
"learning_rate": 2.6086956521739132e-06,
"loss": 0.3806,
"step": 4000
},
{
"epoch": 174.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5080812573432922,
"eval_runtime": 3.6838,
"eval_samples_per_second": 68.951,
"eval_steps_per_second": 0.543,
"step": 4002
},
{
"epoch": 174.34782608695653,
"grad_norm": 2.979862928390503,
"learning_rate": 2.5652173913043484e-06,
"loss": 0.3429,
"step": 4010
},
{
"epoch": 174.7826086956522,
"grad_norm": 1.8571139574050903,
"learning_rate": 2.5217391304347826e-06,
"loss": 0.4053,
"step": 4020
},
{
"epoch": 175.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5099707245826721,
"eval_runtime": 3.7194,
"eval_samples_per_second": 68.291,
"eval_steps_per_second": 0.538,
"step": 4025
},
{
"epoch": 175.2173913043478,
"grad_norm": 2.364047050476074,
"learning_rate": 2.4782608695652178e-06,
"loss": 0.3774,
"step": 4030
},
{
"epoch": 175.65217391304347,
"grad_norm": 4.220658779144287,
"learning_rate": 2.4347826086956525e-06,
"loss": 0.3986,
"step": 4040
},
{
"epoch": 176.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5189133286476135,
"eval_runtime": 5.0478,
"eval_samples_per_second": 50.319,
"eval_steps_per_second": 0.396,
"step": 4048
},
{
"epoch": 176.08695652173913,
"grad_norm": 2.9689295291900635,
"learning_rate": 2.391304347826087e-06,
"loss": 0.4225,
"step": 4050
},
{
"epoch": 176.52173913043478,
"grad_norm": 3.78476881980896,
"learning_rate": 2.347826086956522e-06,
"loss": 0.3798,
"step": 4060
},
{
"epoch": 176.95652173913044,
"grad_norm": 2.3258774280548096,
"learning_rate": 2.3043478260869566e-06,
"loss": 0.3827,
"step": 4070
},
{
"epoch": 177.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5128843784332275,
"eval_runtime": 3.7539,
"eval_samples_per_second": 67.663,
"eval_steps_per_second": 0.533,
"step": 4071
},
{
"epoch": 177.3913043478261,
"grad_norm": 2.329585313796997,
"learning_rate": 2.2608695652173913e-06,
"loss": 0.329,
"step": 4080
},
{
"epoch": 177.82608695652175,
"grad_norm": 3.0889029502868652,
"learning_rate": 2.2173913043478264e-06,
"loss": 0.3892,
"step": 4090
},
{
"epoch": 178.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5099364519119263,
"eval_runtime": 3.764,
"eval_samples_per_second": 67.482,
"eval_steps_per_second": 0.531,
"step": 4094
},
{
"epoch": 178.2608695652174,
"grad_norm": 3.167226791381836,
"learning_rate": 2.173913043478261e-06,
"loss": 0.3801,
"step": 4100
},
{
"epoch": 178.69565217391303,
"grad_norm": 2.857957601547241,
"learning_rate": 2.130434782608696e-06,
"loss": 0.3955,
"step": 4110
},
{
"epoch": 179.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5212357640266418,
"eval_runtime": 4.8308,
"eval_samples_per_second": 52.579,
"eval_steps_per_second": 0.414,
"step": 4117
},
{
"epoch": 179.1304347826087,
"grad_norm": 8.153979301452637,
"learning_rate": 2.0869565217391305e-06,
"loss": 0.4062,
"step": 4120
},
{
"epoch": 179.56521739130434,
"grad_norm": 3.2647910118103027,
"learning_rate": 2.0434782608695656e-06,
"loss": 0.3603,
"step": 4130
},
{
"epoch": 180.0,
"grad_norm": 4.87031364440918,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.4077,
"step": 4140
},
{
"epoch": 180.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5101702213287354,
"eval_runtime": 3.8052,
"eval_samples_per_second": 66.75,
"eval_steps_per_second": 0.526,
"step": 4140
},
{
"epoch": 180.43478260869566,
"grad_norm": 3.3625569343566895,
"learning_rate": 1.956521739130435e-06,
"loss": 0.3881,
"step": 4150
},
{
"epoch": 180.8695652173913,
"grad_norm": 3.717646360397339,
"learning_rate": 1.9130434782608697e-06,
"loss": 0.3579,
"step": 4160
},
{
"epoch": 181.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5099858641624451,
"eval_runtime": 3.707,
"eval_samples_per_second": 68.52,
"eval_steps_per_second": 0.54,
"step": 4163
},
{
"epoch": 181.30434782608697,
"grad_norm": 2.5178964138031006,
"learning_rate": 1.8695652173913044e-06,
"loss": 0.3828,
"step": 4170
},
{
"epoch": 181.7391304347826,
"grad_norm": 3.244948625564575,
"learning_rate": 1.8260869565217394e-06,
"loss": 0.3666,
"step": 4180
},
{
"epoch": 182.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5247715711593628,
"eval_runtime": 4.2228,
"eval_samples_per_second": 60.149,
"eval_steps_per_second": 0.474,
"step": 4186
},
{
"epoch": 182.17391304347825,
"grad_norm": 3.418851613998413,
"learning_rate": 1.782608695652174e-06,
"loss": 0.4,
"step": 4190
},
{
"epoch": 182.6086956521739,
"grad_norm": 2.247349262237549,
"learning_rate": 1.7391304347826088e-06,
"loss": 0.3746,
"step": 4200
},
{
"epoch": 183.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5220462083816528,
"eval_runtime": 4.3605,
"eval_samples_per_second": 58.25,
"eval_steps_per_second": 0.459,
"step": 4209
},
{
"epoch": 183.04347826086956,
"grad_norm": 5.591789245605469,
"learning_rate": 1.6956521739130435e-06,
"loss": 0.3971,
"step": 4210
},
{
"epoch": 183.47826086956522,
"grad_norm": 2.8663575649261475,
"learning_rate": 1.6521739130434784e-06,
"loss": 0.3516,
"step": 4220
},
{
"epoch": 183.91304347826087,
"grad_norm": 5.791408061981201,
"learning_rate": 1.608695652173913e-06,
"loss": 0.3867,
"step": 4230
},
{
"epoch": 184.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5172824859619141,
"eval_runtime": 3.8331,
"eval_samples_per_second": 66.265,
"eval_steps_per_second": 0.522,
"step": 4232
},
{
"epoch": 184.34782608695653,
"grad_norm": 3.3605191707611084,
"learning_rate": 1.565217391304348e-06,
"loss": 0.3911,
"step": 4240
},
{
"epoch": 184.7826086956522,
"grad_norm": 3.4683103561401367,
"learning_rate": 1.521739130434783e-06,
"loss": 0.4024,
"step": 4250
},
{
"epoch": 185.0,
"eval_accuracy": 0.7874015748031497,
"eval_loss": 0.5248106122016907,
"eval_runtime": 4.5705,
"eval_samples_per_second": 55.574,
"eval_steps_per_second": 0.438,
"step": 4255
},
{
"epoch": 185.2173913043478,
"grad_norm": 4.495180130004883,
"learning_rate": 1.4782608695652176e-06,
"loss": 0.3931,
"step": 4260
},
{
"epoch": 185.65217391304347,
"grad_norm": 4.51051139831543,
"learning_rate": 1.4347826086956523e-06,
"loss": 0.4014,
"step": 4270
},
{
"epoch": 186.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5084752440452576,
"eval_runtime": 4.1594,
"eval_samples_per_second": 61.066,
"eval_steps_per_second": 0.481,
"step": 4278
},
{
"epoch": 186.08695652173913,
"grad_norm": 6.847979545593262,
"learning_rate": 1.391304347826087e-06,
"loss": 0.3887,
"step": 4280
},
{
"epoch": 186.52173913043478,
"grad_norm": 8.414494514465332,
"learning_rate": 1.347826086956522e-06,
"loss": 0.3876,
"step": 4290
},
{
"epoch": 186.95652173913044,
"grad_norm": 2.0459609031677246,
"learning_rate": 1.3043478260869566e-06,
"loss": 0.3445,
"step": 4300
},
{
"epoch": 187.0,
"eval_accuracy": 0.8031496062992126,
"eval_loss": 0.5136986970901489,
"eval_runtime": 3.7104,
"eval_samples_per_second": 68.456,
"eval_steps_per_second": 0.539,
"step": 4301
},
{
"epoch": 187.3913043478261,
"grad_norm": 2.7707877159118652,
"learning_rate": 1.2608695652173913e-06,
"loss": 0.4067,
"step": 4310
},
{
"epoch": 187.82608695652175,
"grad_norm": 2.2277884483337402,
"learning_rate": 1.2173913043478262e-06,
"loss": 0.382,
"step": 4320
},
{
"epoch": 188.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.521314799785614,
"eval_runtime": 4.1528,
"eval_samples_per_second": 61.164,
"eval_steps_per_second": 0.482,
"step": 4324
},
{
"epoch": 188.2608695652174,
"grad_norm": 4.299314498901367,
"learning_rate": 1.173913043478261e-06,
"loss": 0.3717,
"step": 4330
},
{
"epoch": 188.69565217391303,
"grad_norm": 2.479510545730591,
"learning_rate": 1.1304347826086956e-06,
"loss": 0.3673,
"step": 4340
},
{
"epoch": 189.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5241702795028687,
"eval_runtime": 4.1853,
"eval_samples_per_second": 60.689,
"eval_steps_per_second": 0.478,
"step": 4347
},
{
"epoch": 189.1304347826087,
"grad_norm": 3.9942944049835205,
"learning_rate": 1.0869565217391306e-06,
"loss": 0.4158,
"step": 4350
},
{
"epoch": 189.56521739130434,
"grad_norm": 2.8651175498962402,
"learning_rate": 1.0434782608695653e-06,
"loss": 0.3919,
"step": 4360
},
{
"epoch": 190.0,
"grad_norm": 3.1065168380737305,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.3631,
"step": 4370
},
{
"epoch": 190.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5146118402481079,
"eval_runtime": 3.7356,
"eval_samples_per_second": 67.995,
"eval_steps_per_second": 0.535,
"step": 4370
},
{
"epoch": 190.43478260869566,
"grad_norm": 3.8796093463897705,
"learning_rate": 9.565217391304349e-07,
"loss": 0.3893,
"step": 4380
},
{
"epoch": 190.8695652173913,
"grad_norm": 3.2894842624664307,
"learning_rate": 9.130434782608697e-07,
"loss": 0.393,
"step": 4390
},
{
"epoch": 191.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5097819566726685,
"eval_runtime": 3.7404,
"eval_samples_per_second": 67.908,
"eval_steps_per_second": 0.535,
"step": 4393
},
{
"epoch": 191.30434782608697,
"grad_norm": 2.4112348556518555,
"learning_rate": 8.695652173913044e-07,
"loss": 0.4037,
"step": 4400
},
{
"epoch": 191.7391304347826,
"grad_norm": 2.4510791301727295,
"learning_rate": 8.260869565217392e-07,
"loss": 0.3806,
"step": 4410
},
{
"epoch": 192.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5133717656135559,
"eval_runtime": 5.016,
"eval_samples_per_second": 50.638,
"eval_steps_per_second": 0.399,
"step": 4416
},
{
"epoch": 192.17391304347825,
"grad_norm": 3.1017332077026367,
"learning_rate": 7.82608695652174e-07,
"loss": 0.3598,
"step": 4420
},
{
"epoch": 192.6086956521739,
"grad_norm": 3.5164568424224854,
"learning_rate": 7.391304347826088e-07,
"loss": 0.3789,
"step": 4430
},
{
"epoch": 193.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5127285718917847,
"eval_runtime": 3.736,
"eval_samples_per_second": 67.988,
"eval_steps_per_second": 0.535,
"step": 4439
},
{
"epoch": 193.04347826086956,
"grad_norm": 86.44344329833984,
"learning_rate": 6.956521739130435e-07,
"loss": 0.3858,
"step": 4440
},
{
"epoch": 193.47826086956522,
"grad_norm": 2.892185688018799,
"learning_rate": 6.521739130434783e-07,
"loss": 0.3894,
"step": 4450
},
{
"epoch": 193.91304347826087,
"grad_norm": 2.0254733562469482,
"learning_rate": 6.086956521739131e-07,
"loss": 0.3717,
"step": 4460
},
{
"epoch": 194.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5183544158935547,
"eval_runtime": 3.7197,
"eval_samples_per_second": 68.285,
"eval_steps_per_second": 0.538,
"step": 4462
},
{
"epoch": 194.34782608695653,
"grad_norm": 4.124297618865967,
"learning_rate": 5.652173913043478e-07,
"loss": 0.4098,
"step": 4470
},
{
"epoch": 194.7826086956522,
"grad_norm": 4.1497955322265625,
"learning_rate": 5.217391304347826e-07,
"loss": 0.361,
"step": 4480
},
{
"epoch": 195.0,
"eval_accuracy": 0.7834645669291339,
"eval_loss": 0.5185708999633789,
"eval_runtime": 4.9741,
"eval_samples_per_second": 51.064,
"eval_steps_per_second": 0.402,
"step": 4485
},
{
"epoch": 195.2173913043478,
"grad_norm": 11.268845558166504,
"learning_rate": 4.782608695652174e-07,
"loss": 0.3786,
"step": 4490
},
{
"epoch": 195.65217391304347,
"grad_norm": 3.9937920570373535,
"learning_rate": 4.347826086956522e-07,
"loss": 0.3722,
"step": 4500
},
{
"epoch": 196.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5107359886169434,
"eval_runtime": 3.7506,
"eval_samples_per_second": 67.723,
"eval_steps_per_second": 0.533,
"step": 4508
},
{
"epoch": 196.08695652173913,
"grad_norm": 2.869596004486084,
"learning_rate": 3.91304347826087e-07,
"loss": 0.3985,
"step": 4510
},
{
"epoch": 196.52173913043478,
"grad_norm": 6.21280574798584,
"learning_rate": 3.4782608695652175e-07,
"loss": 0.4019,
"step": 4520
},
{
"epoch": 196.95652173913044,
"grad_norm": 2.2324206829071045,
"learning_rate": 3.0434782608695656e-07,
"loss": 0.3551,
"step": 4530
},
{
"epoch": 197.0,
"eval_accuracy": 0.7952755905511811,
"eval_loss": 0.5174936056137085,
"eval_runtime": 3.6975,
"eval_samples_per_second": 68.695,
"eval_steps_per_second": 0.541,
"step": 4531
},
{
"epoch": 197.3913043478261,
"grad_norm": 2.6415905952453613,
"learning_rate": 2.608695652173913e-07,
"loss": 0.3919,
"step": 4540
},
{
"epoch": 197.82608695652175,
"grad_norm": 5.146513938903809,
"learning_rate": 2.173913043478261e-07,
"loss": 0.3649,
"step": 4550
},
{
"epoch": 198.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5135703682899475,
"eval_runtime": 4.9875,
"eval_samples_per_second": 50.928,
"eval_steps_per_second": 0.401,
"step": 4554
},
{
"epoch": 198.2608695652174,
"grad_norm": 3.1943461894989014,
"learning_rate": 1.7391304347826088e-07,
"loss": 0.3763,
"step": 4560
},
{
"epoch": 198.69565217391303,
"grad_norm": 2.8955743312835693,
"learning_rate": 1.3043478260869566e-07,
"loss": 0.3749,
"step": 4570
},
{
"epoch": 199.0,
"eval_accuracy": 0.7913385826771654,
"eval_loss": 0.5192672610282898,
"eval_runtime": 3.6944,
"eval_samples_per_second": 68.753,
"eval_steps_per_second": 0.541,
"step": 4577
},
{
"epoch": 199.1304347826087,
"grad_norm": 12.166488647460938,
"learning_rate": 8.695652173913044e-08,
"loss": 0.3869,
"step": 4580
},
{
"epoch": 199.56521739130434,
"grad_norm": 2.9687561988830566,
"learning_rate": 4.347826086956522e-08,
"loss": 0.3926,
"step": 4590
},
{
"epoch": 200.0,
"grad_norm": 4.834624290466309,
"learning_rate": 0.0,
"loss": 0.3782,
"step": 4600
},
{
"epoch": 200.0,
"eval_accuracy": 0.7992125984251969,
"eval_loss": 0.5181651711463928,
"eval_runtime": 3.7789,
"eval_samples_per_second": 67.216,
"eval_steps_per_second": 0.529,
"step": 4600
},
{
"epoch": 200.0,
"step": 4600,
"total_flos": 1.089869514338304e+18,
"train_loss": 0.30267460563908455,
"train_runtime": 4787.5341,
"train_samples_per_second": 59.947,
"train_steps_per_second": 0.961
}
],
"logging_steps": 10,
"max_steps": 4600,
"num_input_tokens_seen": 0,
"num_train_epochs": 200,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.089869514338304e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}