|
{ |
|
"best_metric": 2.4261486530303955, |
|
"best_model_checkpoint": "./output/training_results/C016_Meta-Llama-3-8B_pretrain_20240721_092214/checkpoint-11088", |
|
"epoch": 4.0, |
|
"eval_steps": 1232, |
|
"global_step": 12316, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0003247807729782397, |
|
"grad_norm": 0.0, |
|
"learning_rate": 0.0, |
|
"loss": 2.6721, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.20006495615459566, |
|
"grad_norm": 1.9328745806142353, |
|
"learning_rate": 1.9805194805194805e-06, |
|
"loss": 2.592, |
|
"step": 616 |
|
}, |
|
{ |
|
"epoch": 0.4001299123091913, |
|
"grad_norm": 2.0226924668887496, |
|
"learning_rate": 2.245175689219919e-06, |
|
"loss": 2.5057, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.4001299123091913, |
|
"eval_loss": 2.4827427864074707, |
|
"eval_runtime": 252.9711, |
|
"eval_samples_per_second": 86.543, |
|
"eval_steps_per_second": 0.68, |
|
"step": 1232 |
|
}, |
|
{ |
|
"epoch": 0.6001948684637869, |
|
"grad_norm": 1.8512197610266496, |
|
"learning_rate": 1.2232016471327423e-06, |
|
"loss": 2.4683, |
|
"step": 1848 |
|
}, |
|
{ |
|
"epoch": 0.8002598246183826, |
|
"grad_norm": 1.989781044650789, |
|
"learning_rate": 6.53049308175953e-07, |
|
"loss": 2.444, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 0.8002598246183826, |
|
"eval_loss": 2.440027952194214, |
|
"eval_runtime": 251.3004, |
|
"eval_samples_per_second": 87.119, |
|
"eval_steps_per_second": 0.684, |
|
"step": 2464 |
|
}, |
|
{ |
|
"epoch": 1.0003247807729783, |
|
"grad_norm": 1.8956736763179278, |
|
"learning_rate": 3.466185770829244e-07, |
|
"loss": 2.4358, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 1.200389736927574, |
|
"grad_norm": 1.9111043304334017, |
|
"learning_rate": 1.8910445197889315e-07, |
|
"loss": 2.3648, |
|
"step": 3696 |
|
}, |
|
{ |
|
"epoch": 1.200389736927574, |
|
"eval_loss": 2.431915760040283, |
|
"eval_runtime": 251.1577, |
|
"eval_samples_per_second": 87.168, |
|
"eval_steps_per_second": 0.685, |
|
"step": 3696 |
|
}, |
|
{ |
|
"epoch": 1.4004546930821695, |
|
"grad_norm": 1.884967762986231, |
|
"learning_rate": 1.1168237259086467e-07, |
|
"loss": 2.3663, |
|
"step": 4312 |
|
}, |
|
{ |
|
"epoch": 1.600519649236765, |
|
"grad_norm": 1.9873340014256546, |
|
"learning_rate": 7.563133304849047e-08, |
|
"loss": 2.372, |
|
"step": 4928 |
|
}, |
|
{ |
|
"epoch": 1.600519649236765, |
|
"eval_loss": 2.4293837547302246, |
|
"eval_runtime": 251.386, |
|
"eval_samples_per_second": 87.089, |
|
"eval_steps_per_second": 0.684, |
|
"step": 4928 |
|
}, |
|
{ |
|
"epoch": 1.8005846053913608, |
|
"grad_norm": 1.9124289441200373, |
|
"learning_rate": 5.98689431836726e-08, |
|
"loss": 2.3684, |
|
"step": 5544 |
|
}, |
|
{ |
|
"epoch": 2.0006495615459565, |
|
"grad_norm": 1.9479679838446426, |
|
"learning_rate": 5.346405476547749e-08, |
|
"loss": 2.3667, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.0006495615459565, |
|
"eval_loss": 2.4281327724456787, |
|
"eval_runtime": 251.4248, |
|
"eval_samples_per_second": 87.076, |
|
"eval_steps_per_second": 0.684, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 2.2007145177005523, |
|
"grad_norm": 1.9022764488275758, |
|
"learning_rate": 5.109115615383696e-08, |
|
"loss": 2.3568, |
|
"step": 6776 |
|
}, |
|
{ |
|
"epoch": 2.400779473855148, |
|
"grad_norm": 1.885664698080457, |
|
"learning_rate": 5.0300090028337e-08, |
|
"loss": 2.3573, |
|
"step": 7392 |
|
}, |
|
{ |
|
"epoch": 2.400779473855148, |
|
"eval_loss": 2.4281272888183594, |
|
"eval_runtime": 250.732, |
|
"eval_samples_per_second": 87.316, |
|
"eval_steps_per_second": 0.686, |
|
"step": 7392 |
|
}, |
|
{ |
|
"epoch": 2.6008444300097433, |
|
"grad_norm": 1.9790266878041507, |
|
"learning_rate": 5.006932020966859e-08, |
|
"loss": 2.3533, |
|
"step": 8008 |
|
}, |
|
{ |
|
"epoch": 2.800909386164339, |
|
"grad_norm": 1.9987792342132904, |
|
"learning_rate": 5.0012816199435985e-08, |
|
"loss": 2.3603, |
|
"step": 8624 |
|
}, |
|
{ |
|
"epoch": 2.800909386164339, |
|
"eval_loss": 2.4273290634155273, |
|
"eval_runtime": 251.237, |
|
"eval_samples_per_second": 87.141, |
|
"eval_steps_per_second": 0.685, |
|
"step": 8624 |
|
}, |
|
{ |
|
"epoch": 3.000974342318935, |
|
"grad_norm": 1.892342522891005, |
|
"learning_rate": 5.0001737227175665e-08, |
|
"loss": 2.3575, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 3.2010392984735305, |
|
"grad_norm": 1.945849916471603, |
|
"learning_rate": 5.0000151498505686e-08, |
|
"loss": 2.3522, |
|
"step": 9856 |
|
}, |
|
{ |
|
"epoch": 3.2010392984735305, |
|
"eval_loss": 2.4268627166748047, |
|
"eval_runtime": 251.3708, |
|
"eval_samples_per_second": 87.094, |
|
"eval_steps_per_second": 0.684, |
|
"step": 9856 |
|
}, |
|
{ |
|
"epoch": 3.401104254628126, |
|
"grad_norm": 1.9950133710544578, |
|
"learning_rate": 5.0000006522774745e-08, |
|
"loss": 2.3532, |
|
"step": 10472 |
|
}, |
|
{ |
|
"epoch": 3.6011692107827216, |
|
"grad_norm": 1.9873010505050475, |
|
"learning_rate": 5.000000007975414e-08, |
|
"loss": 2.353, |
|
"step": 11088 |
|
}, |
|
{ |
|
"epoch": 3.6011692107827216, |
|
"eval_loss": 2.4261486530303955, |
|
"eval_runtime": 252.6429, |
|
"eval_samples_per_second": 86.656, |
|
"eval_steps_per_second": 0.681, |
|
"step": 11088 |
|
}, |
|
{ |
|
"epoch": 3.8012341669373173, |
|
"grad_norm": 1.998167327050962, |
|
"learning_rate": 5.0000000000044414e-08, |
|
"loss": 2.3517, |
|
"step": 11704 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"step": 12316, |
|
"total_flos": 1287266123120640.0, |
|
"train_loss": 2.391189053524001, |
|
"train_runtime": 40307.7885, |
|
"train_samples_per_second": 19.553, |
|
"train_steps_per_second": 0.306 |
|
} |
|
], |
|
"logging_steps": 616, |
|
"max_steps": 12316, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 1232, |
|
"total_flos": 1287266123120640.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|