{ "best_metric": 2.4261486530303955, "best_model_checkpoint": "./output/training_results/C016_Meta-Llama-3-8B_pretrain_20240721_092214/checkpoint-11088", "epoch": 4.0, "eval_steps": 1232, "global_step": 12316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003247807729782397, "grad_norm": 0.0, "learning_rate": 0.0, "loss": 2.6721, "step": 1 }, { "epoch": 0.20006495615459566, "grad_norm": 1.9328745806142353, "learning_rate": 1.9805194805194805e-06, "loss": 2.592, "step": 616 }, { "epoch": 0.4001299123091913, "grad_norm": 2.0226924668887496, "learning_rate": 2.245175689219919e-06, "loss": 2.5057, "step": 1232 }, { "epoch": 0.4001299123091913, "eval_loss": 2.4827427864074707, "eval_runtime": 252.9711, "eval_samples_per_second": 86.543, "eval_steps_per_second": 0.68, "step": 1232 }, { "epoch": 0.6001948684637869, "grad_norm": 1.8512197610266496, "learning_rate": 1.2232016471327423e-06, "loss": 2.4683, "step": 1848 }, { "epoch": 0.8002598246183826, "grad_norm": 1.989781044650789, "learning_rate": 6.53049308175953e-07, "loss": 2.444, "step": 2464 }, { "epoch": 0.8002598246183826, "eval_loss": 2.440027952194214, "eval_runtime": 251.3004, "eval_samples_per_second": 87.119, "eval_steps_per_second": 0.684, "step": 2464 }, { "epoch": 1.0003247807729783, "grad_norm": 1.8956736763179278, "learning_rate": 3.466185770829244e-07, "loss": 2.4358, "step": 3080 }, { "epoch": 1.200389736927574, "grad_norm": 1.9111043304334017, "learning_rate": 1.8910445197889315e-07, "loss": 2.3648, "step": 3696 }, { "epoch": 1.200389736927574, "eval_loss": 2.431915760040283, "eval_runtime": 251.1577, "eval_samples_per_second": 87.168, "eval_steps_per_second": 0.685, "step": 3696 }, { "epoch": 1.4004546930821695, "grad_norm": 1.884967762986231, "learning_rate": 1.1168237259086467e-07, "loss": 2.3663, "step": 4312 }, { "epoch": 1.600519649236765, "grad_norm": 1.9873340014256546, "learning_rate": 7.563133304849047e-08, "loss": 2.372, "step": 4928 }, { "epoch": 1.600519649236765, "eval_loss": 2.4293837547302246, "eval_runtime": 251.386, "eval_samples_per_second": 87.089, "eval_steps_per_second": 0.684, "step": 4928 }, { "epoch": 1.8005846053913608, "grad_norm": 1.9124289441200373, "learning_rate": 5.98689431836726e-08, "loss": 2.3684, "step": 5544 }, { "epoch": 2.0006495615459565, "grad_norm": 1.9479679838446426, "learning_rate": 5.346405476547749e-08, "loss": 2.3667, "step": 6160 }, { "epoch": 2.0006495615459565, "eval_loss": 2.4281327724456787, "eval_runtime": 251.4248, "eval_samples_per_second": 87.076, "eval_steps_per_second": 0.684, "step": 6160 }, { "epoch": 2.2007145177005523, "grad_norm": 1.9022764488275758, "learning_rate": 5.109115615383696e-08, "loss": 2.3568, "step": 6776 }, { "epoch": 2.400779473855148, "grad_norm": 1.885664698080457, "learning_rate": 5.0300090028337e-08, "loss": 2.3573, "step": 7392 }, { "epoch": 2.400779473855148, "eval_loss": 2.4281272888183594, "eval_runtime": 250.732, "eval_samples_per_second": 87.316, "eval_steps_per_second": 0.686, "step": 7392 }, { "epoch": 2.6008444300097433, "grad_norm": 1.9790266878041507, "learning_rate": 5.006932020966859e-08, "loss": 2.3533, "step": 8008 }, { "epoch": 2.800909386164339, "grad_norm": 1.9987792342132904, "learning_rate": 5.0012816199435985e-08, "loss": 2.3603, "step": 8624 }, { "epoch": 2.800909386164339, "eval_loss": 2.4273290634155273, "eval_runtime": 251.237, "eval_samples_per_second": 87.141, "eval_steps_per_second": 0.685, "step": 8624 }, { "epoch": 3.000974342318935, "grad_norm": 1.892342522891005, "learning_rate": 5.0001737227175665e-08, "loss": 2.3575, "step": 9240 }, { "epoch": 3.2010392984735305, "grad_norm": 1.945849916471603, "learning_rate": 5.0000151498505686e-08, "loss": 2.3522, "step": 9856 }, { "epoch": 3.2010392984735305, "eval_loss": 2.4268627166748047, "eval_runtime": 251.3708, "eval_samples_per_second": 87.094, "eval_steps_per_second": 0.684, "step": 9856 }, { "epoch": 3.401104254628126, "grad_norm": 1.9950133710544578, "learning_rate": 5.0000006522774745e-08, "loss": 2.3532, "step": 10472 }, { "epoch": 3.6011692107827216, "grad_norm": 1.9873010505050475, "learning_rate": 5.000000007975414e-08, "loss": 2.353, "step": 11088 }, { "epoch": 3.6011692107827216, "eval_loss": 2.4261486530303955, "eval_runtime": 252.6429, "eval_samples_per_second": 86.656, "eval_steps_per_second": 0.681, "step": 11088 }, { "epoch": 3.8012341669373173, "grad_norm": 1.998167327050962, "learning_rate": 5.0000000000044414e-08, "loss": 2.3517, "step": 11704 }, { "epoch": 4.0, "step": 12316, "total_flos": 1287266123120640.0, "train_loss": 2.391189053524001, "train_runtime": 40307.7885, "train_samples_per_second": 19.553, "train_steps_per_second": 0.306 } ], "logging_steps": 616, "max_steps": 12316, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 1232, "total_flos": 1287266123120640.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }