TianyiQ's picture
Upload ./trainer_state.json with huggingface_hub
de37d86 verified
{
"best_metric": 2.4261486530303955,
"best_model_checkpoint": "./output/training_results/C016_Meta-Llama-3-8B_pretrain_20240721_092214/checkpoint-11088",
"epoch": 4.0,
"eval_steps": 1232,
"global_step": 12316,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003247807729782397,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 2.6721,
"step": 1
},
{
"epoch": 0.20006495615459566,
"grad_norm": 1.9328745806142353,
"learning_rate": 1.9805194805194805e-06,
"loss": 2.592,
"step": 616
},
{
"epoch": 0.4001299123091913,
"grad_norm": 2.0226924668887496,
"learning_rate": 2.245175689219919e-06,
"loss": 2.5057,
"step": 1232
},
{
"epoch": 0.4001299123091913,
"eval_loss": 2.4827427864074707,
"eval_runtime": 252.9711,
"eval_samples_per_second": 86.543,
"eval_steps_per_second": 0.68,
"step": 1232
},
{
"epoch": 0.6001948684637869,
"grad_norm": 1.8512197610266496,
"learning_rate": 1.2232016471327423e-06,
"loss": 2.4683,
"step": 1848
},
{
"epoch": 0.8002598246183826,
"grad_norm": 1.989781044650789,
"learning_rate": 6.53049308175953e-07,
"loss": 2.444,
"step": 2464
},
{
"epoch": 0.8002598246183826,
"eval_loss": 2.440027952194214,
"eval_runtime": 251.3004,
"eval_samples_per_second": 87.119,
"eval_steps_per_second": 0.684,
"step": 2464
},
{
"epoch": 1.0003247807729783,
"grad_norm": 1.8956736763179278,
"learning_rate": 3.466185770829244e-07,
"loss": 2.4358,
"step": 3080
},
{
"epoch": 1.200389736927574,
"grad_norm": 1.9111043304334017,
"learning_rate": 1.8910445197889315e-07,
"loss": 2.3648,
"step": 3696
},
{
"epoch": 1.200389736927574,
"eval_loss": 2.431915760040283,
"eval_runtime": 251.1577,
"eval_samples_per_second": 87.168,
"eval_steps_per_second": 0.685,
"step": 3696
},
{
"epoch": 1.4004546930821695,
"grad_norm": 1.884967762986231,
"learning_rate": 1.1168237259086467e-07,
"loss": 2.3663,
"step": 4312
},
{
"epoch": 1.600519649236765,
"grad_norm": 1.9873340014256546,
"learning_rate": 7.563133304849047e-08,
"loss": 2.372,
"step": 4928
},
{
"epoch": 1.600519649236765,
"eval_loss": 2.4293837547302246,
"eval_runtime": 251.386,
"eval_samples_per_second": 87.089,
"eval_steps_per_second": 0.684,
"step": 4928
},
{
"epoch": 1.8005846053913608,
"grad_norm": 1.9124289441200373,
"learning_rate": 5.98689431836726e-08,
"loss": 2.3684,
"step": 5544
},
{
"epoch": 2.0006495615459565,
"grad_norm": 1.9479679838446426,
"learning_rate": 5.346405476547749e-08,
"loss": 2.3667,
"step": 6160
},
{
"epoch": 2.0006495615459565,
"eval_loss": 2.4281327724456787,
"eval_runtime": 251.4248,
"eval_samples_per_second": 87.076,
"eval_steps_per_second": 0.684,
"step": 6160
},
{
"epoch": 2.2007145177005523,
"grad_norm": 1.9022764488275758,
"learning_rate": 5.109115615383696e-08,
"loss": 2.3568,
"step": 6776
},
{
"epoch": 2.400779473855148,
"grad_norm": 1.885664698080457,
"learning_rate": 5.0300090028337e-08,
"loss": 2.3573,
"step": 7392
},
{
"epoch": 2.400779473855148,
"eval_loss": 2.4281272888183594,
"eval_runtime": 250.732,
"eval_samples_per_second": 87.316,
"eval_steps_per_second": 0.686,
"step": 7392
},
{
"epoch": 2.6008444300097433,
"grad_norm": 1.9790266878041507,
"learning_rate": 5.006932020966859e-08,
"loss": 2.3533,
"step": 8008
},
{
"epoch": 2.800909386164339,
"grad_norm": 1.9987792342132904,
"learning_rate": 5.0012816199435985e-08,
"loss": 2.3603,
"step": 8624
},
{
"epoch": 2.800909386164339,
"eval_loss": 2.4273290634155273,
"eval_runtime": 251.237,
"eval_samples_per_second": 87.141,
"eval_steps_per_second": 0.685,
"step": 8624
},
{
"epoch": 3.000974342318935,
"grad_norm": 1.892342522891005,
"learning_rate": 5.0001737227175665e-08,
"loss": 2.3575,
"step": 9240
},
{
"epoch": 3.2010392984735305,
"grad_norm": 1.945849916471603,
"learning_rate": 5.0000151498505686e-08,
"loss": 2.3522,
"step": 9856
},
{
"epoch": 3.2010392984735305,
"eval_loss": 2.4268627166748047,
"eval_runtime": 251.3708,
"eval_samples_per_second": 87.094,
"eval_steps_per_second": 0.684,
"step": 9856
},
{
"epoch": 3.401104254628126,
"grad_norm": 1.9950133710544578,
"learning_rate": 5.0000006522774745e-08,
"loss": 2.3532,
"step": 10472
},
{
"epoch": 3.6011692107827216,
"grad_norm": 1.9873010505050475,
"learning_rate": 5.000000007975414e-08,
"loss": 2.353,
"step": 11088
},
{
"epoch": 3.6011692107827216,
"eval_loss": 2.4261486530303955,
"eval_runtime": 252.6429,
"eval_samples_per_second": 86.656,
"eval_steps_per_second": 0.681,
"step": 11088
},
{
"epoch": 3.8012341669373173,
"grad_norm": 1.998167327050962,
"learning_rate": 5.0000000000044414e-08,
"loss": 2.3517,
"step": 11704
},
{
"epoch": 4.0,
"step": 12316,
"total_flos": 1287266123120640.0,
"train_loss": 2.391189053524001,
"train_runtime": 40307.7885,
"train_samples_per_second": 19.553,
"train_steps_per_second": 0.306
}
],
"logging_steps": 616,
"max_steps": 12316,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1232,
"total_flos": 1287266123120640.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}