|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 20.0, |
|
"eval_steps": 500, |
|
"global_step": 1100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"learning_rate": 4.545454545454545e-07, |
|
"loss": 2.5611, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 9.090909090909091e-06, |
|
"loss": 2.5692, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 1.8181818181818182e-05, |
|
"loss": 2.644, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.304738759994507, |
|
"eval_runtime": 1.217, |
|
"eval_samples_per_second": 18.899, |
|
"eval_steps_per_second": 4.93, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"learning_rate": 2.7272727272727273e-05, |
|
"loss": 2.3827, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 2.0781, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 1.9548, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.9419394731521606, |
|
"eval_runtime": 1.2171, |
|
"eval_samples_per_second": 18.898, |
|
"eval_steps_per_second": 4.93, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"learning_rate": 4.9987413559579636e-05, |
|
"loss": 1.8022, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"learning_rate": 4.988679806432712e-05, |
|
"loss": 1.8295, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"learning_rate": 4.968597221690986e-05, |
|
"loss": 1.788, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.9134601354599, |
|
"eval_runtime": 1.2132, |
|
"eval_samples_per_second": 18.958, |
|
"eval_steps_per_second": 4.946, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 3.27, |
|
"learning_rate": 4.938574467213518e-05, |
|
"loss": 1.6784, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"learning_rate": 4.898732434036244e-05, |
|
"loss": 1.6528, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"learning_rate": 4.849231551964771e-05, |
|
"loss": 1.6342, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.9498955011367798, |
|
"eval_runtime": 1.2502, |
|
"eval_samples_per_second": 18.397, |
|
"eval_steps_per_second": 4.799, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"learning_rate": 4.790271143580174e-05, |
|
"loss": 1.3818, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"learning_rate": 4.722088621637309e-05, |
|
"loss": 1.3781, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.1321451663970947, |
|
"eval_runtime": 1.2118, |
|
"eval_samples_per_second": 18.98, |
|
"eval_steps_per_second": 4.951, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 5.09, |
|
"learning_rate": 4.644958533087443e-05, |
|
"loss": 1.2512, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.45, |
|
"learning_rate": 4.559191453574582e-05, |
|
"loss": 1.0475, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"learning_rate": 4.465132736856969e-05, |
|
"loss": 1.0617, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.3518364429473877, |
|
"eval_runtime": 1.216, |
|
"eval_samples_per_second": 18.914, |
|
"eval_steps_per_second": 4.934, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.18, |
|
"learning_rate": 4.3631611241893874e-05, |
|
"loss": 0.9003, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"learning_rate": 4.2536872192658036e-05, |
|
"loss": 0.7805, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 6.91, |
|
"learning_rate": 4.137151834863213e-05, |
|
"loss": 0.8104, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.609004259109497, |
|
"eval_runtime": 1.2167, |
|
"eval_samples_per_second": 18.904, |
|
"eval_steps_per_second": 4.932, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 7.27, |
|
"learning_rate": 4.014024217844167e-05, |
|
"loss": 0.6542, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"learning_rate": 3.884800159665276e-05, |
|
"loss": 0.5753, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.5864, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.8889544010162354, |
|
"eval_runtime": 1.2434, |
|
"eval_samples_per_second": 18.498, |
|
"eval_steps_per_second": 4.825, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.36, |
|
"learning_rate": 3.610166531514436e-05, |
|
"loss": 0.4181, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 8.73, |
|
"learning_rate": 3.465862814232822e-05, |
|
"loss": 0.4159, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 3.1356287002563477, |
|
"eval_runtime": 1.2192, |
|
"eval_samples_per_second": 18.865, |
|
"eval_steps_per_second": 4.921, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 9.09, |
|
"learning_rate": 3.3176699082935545e-05, |
|
"loss": 0.4188, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.45, |
|
"learning_rate": 3.166184534225087e-05, |
|
"loss": 0.3131, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 9.82, |
|
"learning_rate": 3.012016670162977e-05, |
|
"loss": 0.3344, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 3.3189520835876465, |
|
"eval_runtime": 1.2131, |
|
"eval_samples_per_second": 18.96, |
|
"eval_steps_per_second": 4.946, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 10.18, |
|
"learning_rate": 2.8557870956832132e-05, |
|
"loss": 0.3005, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 10.55, |
|
"learning_rate": 2.698124892141971e-05, |
|
"loss": 0.2527, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 10.91, |
|
"learning_rate": 2.5396649095870202e-05, |
|
"loss": 0.2446, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 3.5470495223999023, |
|
"eval_runtime": 1.216, |
|
"eval_samples_per_second": 18.914, |
|
"eval_steps_per_second": 4.934, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 11.27, |
|
"learning_rate": 2.3810452104406444e-05, |
|
"loss": 0.1855, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 11.64, |
|
"learning_rate": 2.222904500247473e-05, |
|
"loss": 0.1745, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"learning_rate": 2.0658795558326743e-05, |
|
"loss": 0.199, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 3.68398118019104, |
|
"eval_runtime": 1.2409, |
|
"eval_samples_per_second": 18.535, |
|
"eval_steps_per_second": 4.835, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 12.36, |
|
"learning_rate": 1.9106026612264316e-05, |
|
"loss": 0.1455, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 12.73, |
|
"learning_rate": 1.7576990616793137e-05, |
|
"loss": 0.1245, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 3.765277147293091, |
|
"eval_runtime": 1.2162, |
|
"eval_samples_per_second": 18.911, |
|
"eval_steps_per_second": 4.933, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 13.09, |
|
"learning_rate": 1.6077844460203206e-05, |
|
"loss": 0.1351, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 13.45, |
|
"learning_rate": 1.4614624674952842e-05, |
|
"loss": 0.0967, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"learning_rate": 1.3193223130682936e-05, |
|
"loss": 0.1208, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 3.8721702098846436, |
|
"eval_runtime": 1.2167, |
|
"eval_samples_per_second": 18.903, |
|
"eval_steps_per_second": 4.931, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 14.18, |
|
"learning_rate": 1.181936330973744e-05, |
|
"loss": 0.0853, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 14.55, |
|
"learning_rate": 1.049857726072005e-05, |
|
"loss": 0.0854, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 14.91, |
|
"learning_rate": 9.236183322886945e-06, |
|
"loss": 0.1003, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 3.9574601650238037, |
|
"eval_runtime": 1.2205, |
|
"eval_samples_per_second": 18.844, |
|
"eval_steps_per_second": 4.916, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 15.27, |
|
"learning_rate": 8.0372647110717e-06, |
|
"loss": 0.0753, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 15.64, |
|
"learning_rate": 6.906649047373246e-06, |
|
"loss": 0.0928, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"learning_rate": 5.848888922025553e-06, |
|
"loss": 0.0767, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 3.9671382904052734, |
|
"eval_runtime": 1.2561, |
|
"eval_samples_per_second": 18.311, |
|
"eval_steps_per_second": 4.777, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 16.36, |
|
"learning_rate": 4.868243561723535e-06, |
|
"loss": 0.0702, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 16.73, |
|
"learning_rate": 3.968661679220468e-06, |
|
"loss": 0.0913, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 3.9921278953552246, |
|
"eval_runtime": 1.2166, |
|
"eval_samples_per_second": 18.905, |
|
"eval_steps_per_second": 4.932, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 17.09, |
|
"learning_rate": 3.1537655732553768e-06, |
|
"loss": 0.0698, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 17.45, |
|
"learning_rate": 2.4268365428344736e-06, |
|
"loss": 0.0661, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 17.82, |
|
"learning_rate": 1.790801674598186e-06, |
|
"loss": 0.0895, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 3.9939558506011963, |
|
"eval_runtime": 1.2161, |
|
"eval_samples_per_second": 18.913, |
|
"eval_steps_per_second": 4.934, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 18.18, |
|
"learning_rate": 1.248222056476367e-06, |
|
"loss": 0.0695, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 18.55, |
|
"learning_rate": 8.012824650910938e-07, |
|
"loss": 0.086, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 18.91, |
|
"learning_rate": 4.517825684323324e-07, |
|
"loss": 0.0671, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 3.9915316104888916, |
|
"eval_runtime": 1.2143, |
|
"eval_samples_per_second": 18.941, |
|
"eval_steps_per_second": 4.941, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 19.27, |
|
"learning_rate": 2.011296792301165e-07, |
|
"loss": 0.0681, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 19.64, |
|
"learning_rate": 5.033308820289184e-08, |
|
"loss": 0.09, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"learning_rate": 0.0, |
|
"loss": 0.0671, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 3.990852117538452, |
|
"eval_runtime": 1.2506, |
|
"eval_samples_per_second": 18.391, |
|
"eval_steps_per_second": 4.798, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"step": 1100, |
|
"total_flos": 3.807078373542298e+16, |
|
"train_loss": 0.6849299516461113, |
|
"train_runtime": 917.6848, |
|
"train_samples_per_second": 4.729, |
|
"train_steps_per_second": 1.199 |
|
} |
|
], |
|
"logging_steps": 20, |
|
"max_steps": 1100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 20, |
|
"total_flos": 3.807078373542298e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|