{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7545638945233266, "eval_steps": 31, "global_step": 93, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008113590263691683, "eval_loss": 5.708795547485352, "eval_runtime": 34.9917, "eval_samples_per_second": 5.944, "eval_steps_per_second": 0.743, "step": 1 }, { "epoch": 0.02434077079107505, "grad_norm": 27.649152755737305, "learning_rate": 3e-05, "loss": 22.4374, "step": 3 }, { "epoch": 0.0486815415821501, "grad_norm": 24.12411880493164, "learning_rate": 6e-05, "loss": 20.3477, "step": 6 }, { "epoch": 0.07302231237322515, "grad_norm": 22.957651138305664, "learning_rate": 9e-05, "loss": 13.9237, "step": 9 }, { "epoch": 0.0973630831643002, "grad_norm": 22.14281463623047, "learning_rate": 9.999238475781957e-05, "loss": 8.2192, "step": 12 }, { "epoch": 0.12170385395537525, "grad_norm": 15.811911582946777, "learning_rate": 9.99524110790929e-05, "loss": 5.4619, "step": 15 }, { "epoch": 0.1460446247464503, "grad_norm": 17.17688751220703, "learning_rate": 9.987820251299122e-05, "loss": 4.1998, "step": 18 }, { "epoch": 0.17038539553752535, "grad_norm": 9.06302547454834, "learning_rate": 9.976980991835894e-05, "loss": 3.5102, "step": 21 }, { "epoch": 0.1947261663286004, "grad_norm": 9.071863174438477, "learning_rate": 9.962730758206611e-05, "loss": 2.9501, "step": 24 }, { "epoch": 0.21906693711967545, "grad_norm": 10.715943336486816, "learning_rate": 9.945079316809585e-05, "loss": 2.8262, "step": 27 }, { "epoch": 0.2434077079107505, "grad_norm": 6.811004161834717, "learning_rate": 9.924038765061042e-05, "loss": 2.6217, "step": 30 }, { "epoch": 0.2515212981744422, "eval_loss": 0.6682190299034119, "eval_runtime": 35.2448, "eval_samples_per_second": 5.902, "eval_steps_per_second": 0.738, "step": 31 }, { "epoch": 0.26774847870182555, "grad_norm": 5.427604675292969, "learning_rate": 9.899623523104149e-05, "loss": 2.4006, "step": 33 }, { "epoch": 0.2920892494929006, "grad_norm": 7.537416458129883, "learning_rate": 9.871850323926177e-05, "loss": 2.7397, "step": 36 }, { "epoch": 0.31643002028397565, "grad_norm": 8.674805641174316, "learning_rate": 9.84073820189054e-05, "loss": 2.4828, "step": 39 }, { "epoch": 0.3407707910750507, "grad_norm": 8.167730331420898, "learning_rate": 9.806308479691595e-05, "loss": 2.5302, "step": 42 }, { "epoch": 0.36511156186612576, "grad_norm": 10.937932014465332, "learning_rate": 9.768584753741134e-05, "loss": 2.3383, "step": 45 }, { "epoch": 0.3894523326572008, "grad_norm": 3.8246636390686035, "learning_rate": 9.727592877996585e-05, "loss": 2.212, "step": 48 }, { "epoch": 0.41379310344827586, "grad_norm": 4.667800426483154, "learning_rate": 9.683360946241989e-05, "loss": 2.3763, "step": 51 }, { "epoch": 0.4381338742393509, "grad_norm": 4.990477561950684, "learning_rate": 9.635919272833938e-05, "loss": 2.3995, "step": 54 }, { "epoch": 0.46247464503042596, "grad_norm": 9.09072494506836, "learning_rate": 9.58530037192562e-05, "loss": 2.545, "step": 57 }, { "epoch": 0.486815415821501, "grad_norm": 5.338695049285889, "learning_rate": 9.53153893518325e-05, "loss": 2.3183, "step": 60 }, { "epoch": 0.5030425963488844, "eval_loss": 0.5645254850387573, "eval_runtime": 13.4155, "eval_samples_per_second": 15.504, "eval_steps_per_second": 1.938, "step": 62 }, { "epoch": 0.5111561866125761, "grad_norm": 3.9001667499542236, "learning_rate": 9.474671808010126e-05, "loss": 2.3843, "step": 63 }, { "epoch": 0.5354969574036511, "grad_norm": 6.379414081573486, "learning_rate": 9.414737964294636e-05, "loss": 2.2963, "step": 66 }, { "epoch": 0.5598377281947262, "grad_norm": 4.97390079498291, "learning_rate": 9.351778479699499e-05, "loss": 2.2517, "step": 69 }, { "epoch": 0.5841784989858012, "grad_norm": 4.063937187194824, "learning_rate": 9.285836503510562e-05, "loss": 2.2437, "step": 72 }, { "epoch": 0.6085192697768763, "grad_norm": 3.602717161178589, "learning_rate": 9.21695722906443e-05, "loss": 2.1695, "step": 75 }, { "epoch": 0.6328600405679513, "grad_norm": 4.555128574371338, "learning_rate": 9.145187862775209e-05, "loss": 2.1648, "step": 78 }, { "epoch": 0.6572008113590264, "grad_norm": 5.0544867515563965, "learning_rate": 9.070577591781597e-05, "loss": 2.0922, "step": 81 }, { "epoch": 0.6815415821501014, "grad_norm": 4.92878532409668, "learning_rate": 8.993177550236464e-05, "loss": 2.141, "step": 84 }, { "epoch": 0.7058823529411765, "grad_norm": 3.6681647300720215, "learning_rate": 8.91304078426207e-05, "loss": 2.1638, "step": 87 }, { "epoch": 0.7302231237322515, "grad_norm": 4.451002597808838, "learning_rate": 8.83022221559489e-05, "loss": 2.1766, "step": 90 }, { "epoch": 0.7545638945233266, "grad_norm": 5.902685165405273, "learning_rate": 8.744778603945011e-05, "loss": 2.1005, "step": 93 }, { "epoch": 0.7545638945233266, "eval_loss": 0.5568718314170837, "eval_runtime": 13.4611, "eval_samples_per_second": 15.452, "eval_steps_per_second": 1.931, "step": 93 } ], "logging_steps": 3, "max_steps": 370, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 31, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2373610700629606e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }