{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5384615384615383, "eval_steps": 9, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03076923076923077, "grad_norm": 0.19627095758914948, "learning_rate": 1e-05, "loss": 10.3769, "step": 1 }, { "epoch": 0.03076923076923077, "eval_loss": 10.386632919311523, "eval_runtime": 0.0801, "eval_samples_per_second": 1361.483, "eval_steps_per_second": 49.963, "step": 1 }, { "epoch": 0.06153846153846154, "grad_norm": 0.20535489916801453, "learning_rate": 2e-05, "loss": 10.3764, "step": 2 }, { "epoch": 0.09230769230769231, "grad_norm": 0.1900486946105957, "learning_rate": 3e-05, "loss": 10.382, "step": 3 }, { "epoch": 0.12307692307692308, "grad_norm": 0.2189124971628189, "learning_rate": 4e-05, "loss": 10.3854, "step": 4 }, { "epoch": 0.15384615384615385, "grad_norm": 0.19613224267959595, "learning_rate": 5e-05, "loss": 10.3846, "step": 5 }, { "epoch": 0.18461538461538463, "grad_norm": 0.21051953732967377, "learning_rate": 6e-05, "loss": 10.3951, "step": 6 }, { "epoch": 0.2153846153846154, "grad_norm": 0.193317711353302, "learning_rate": 7e-05, "loss": 10.4099, "step": 7 }, { "epoch": 0.24615384615384617, "grad_norm": 0.22925445437431335, "learning_rate": 8e-05, "loss": 10.4001, "step": 8 }, { "epoch": 0.27692307692307694, "grad_norm": 0.2118426263332367, "learning_rate": 9e-05, "loss": 10.4226, "step": 9 }, { "epoch": 0.27692307692307694, "eval_loss": 10.384289741516113, "eval_runtime": 0.075, "eval_samples_per_second": 1452.898, "eval_steps_per_second": 53.317, "step": 9 }, { "epoch": 0.3076923076923077, "grad_norm": 0.22410708665847778, "learning_rate": 0.0001, "loss": 10.3974, "step": 10 }, { "epoch": 0.3384615384615385, "grad_norm": 0.28219085931777954, "learning_rate": 9.996740476948385e-05, "loss": 10.3593, "step": 11 }, { "epoch": 0.36923076923076925, "grad_norm": 0.2661738991737366, "learning_rate": 9.98696615758975e-05, "loss": 10.3826, "step": 12 }, { "epoch": 0.4, "grad_norm": 0.26590314507484436, "learning_rate": 9.970689785771798e-05, "loss": 10.3853, "step": 13 }, { "epoch": 0.4307692307692308, "grad_norm": 0.23882359266281128, "learning_rate": 9.947932582778188e-05, "loss": 10.3944, "step": 14 }, { "epoch": 0.46153846153846156, "grad_norm": 0.2391405999660492, "learning_rate": 9.918724219660013e-05, "loss": 10.357, "step": 15 }, { "epoch": 0.49230769230769234, "grad_norm": 0.2403474599123001, "learning_rate": 9.883102778550434e-05, "loss": 10.3803, "step": 16 }, { "epoch": 0.5230769230769231, "grad_norm": 0.22196514904499054, "learning_rate": 9.841114703012817e-05, "loss": 10.3643, "step": 17 }, { "epoch": 0.5538461538461539, "grad_norm": 0.245796337723732, "learning_rate": 9.792814737487207e-05, "loss": 10.4181, "step": 18 }, { "epoch": 0.5538461538461539, "eval_loss": 10.378185272216797, "eval_runtime": 0.0762, "eval_samples_per_second": 1429.641, "eval_steps_per_second": 52.464, "step": 18 }, { "epoch": 0.5846153846153846, "grad_norm": 0.2934595048427582, "learning_rate": 9.738265855914013e-05, "loss": 10.3524, "step": 19 }, { "epoch": 0.6153846153846154, "grad_norm": 0.29529669880867004, "learning_rate": 9.677539179628005e-05, "loss": 10.3852, "step": 20 }, { "epoch": 0.6461538461538462, "grad_norm": 0.2764834761619568, "learning_rate": 9.610713884629666e-05, "loss": 10.3627, "step": 21 }, { "epoch": 0.676923076923077, "grad_norm": 0.270579993724823, "learning_rate": 9.537877098354786e-05, "loss": 10.3802, "step": 22 }, { "epoch": 0.7076923076923077, "grad_norm": 0.28736695647239685, "learning_rate": 9.459123786076912e-05, "loss": 10.3475, "step": 23 }, { "epoch": 0.7384615384615385, "grad_norm": 0.27069252729415894, "learning_rate": 9.374556627090749e-05, "loss": 10.3726, "step": 24 }, { "epoch": 0.7692307692307693, "grad_norm": 0.2778293192386627, "learning_rate": 9.284285880837946e-05, "loss": 10.3683, "step": 25 }, { "epoch": 0.8, "grad_norm": 0.3016285002231598, "learning_rate": 9.188429243149824e-05, "loss": 10.3652, "step": 26 }, { "epoch": 0.8307692307692308, "grad_norm": 0.3102306127548218, "learning_rate": 9.087111692794459e-05, "loss": 10.3612, "step": 27 }, { "epoch": 0.8307692307692308, "eval_loss": 10.371162414550781, "eval_runtime": 0.0804, "eval_samples_per_second": 1355.392, "eval_steps_per_second": 49.739, "step": 27 }, { "epoch": 0.8615384615384616, "grad_norm": 0.2973518967628479, "learning_rate": 8.980465328528219e-05, "loss": 10.3562, "step": 28 }, { "epoch": 0.8923076923076924, "grad_norm": 0.27339646220207214, "learning_rate": 8.868629196864182e-05, "loss": 10.3745, "step": 29 }, { "epoch": 0.9230769230769231, "grad_norm": 0.30518829822540283, "learning_rate": 8.751749110782012e-05, "loss": 10.3788, "step": 30 }, { "epoch": 0.9538461538461539, "grad_norm": 0.30676740407943726, "learning_rate": 8.629977459615655e-05, "loss": 10.3631, "step": 31 }, { "epoch": 0.9846153846153847, "grad_norm": 0.3426137864589691, "learning_rate": 8.503473010366713e-05, "loss": 10.3782, "step": 32 }, { "epoch": 1.0153846153846153, "grad_norm": 0.43814149498939514, "learning_rate": 8.37240070070257e-05, "loss": 14.7399, "step": 33 }, { "epoch": 1.0461538461538462, "grad_norm": 0.39517074823379517, "learning_rate": 8.236931423909138e-05, "loss": 11.6581, "step": 34 }, { "epoch": 1.0769230769230769, "grad_norm": 0.286582887172699, "learning_rate": 8.097241806078615e-05, "loss": 9.8718, "step": 35 }, { "epoch": 1.1076923076923078, "grad_norm": 0.3339255154132843, "learning_rate": 7.953513975822755e-05, "loss": 9.8388, "step": 36 }, { "epoch": 1.1076923076923078, "eval_loss": 10.362679481506348, "eval_runtime": 0.0735, "eval_samples_per_second": 1482.077, "eval_steps_per_second": 54.388, "step": 36 }, { "epoch": 1.1384615384615384, "grad_norm": 0.38969117403030396, "learning_rate": 7.805935326811912e-05, "loss": 9.6292, "step": 37 }, { "epoch": 1.1692307692307693, "grad_norm": 0.4563569724559784, "learning_rate": 7.654698273449435e-05, "loss": 11.4989, "step": 38 }, { "epoch": 1.2, "grad_norm": 0.5030809044837952, "learning_rate": 7.500000000000001e-05, "loss": 11.8672, "step": 39 }, { "epoch": 1.2307692307692308, "grad_norm": 0.32794782519340515, "learning_rate": 7.342042203498951e-05, "loss": 9.4949, "step": 40 }, { "epoch": 1.2615384615384615, "grad_norm": 0.3771244287490845, "learning_rate": 7.181030830777837e-05, "loss": 8.6339, "step": 41 }, { "epoch": 1.2923076923076924, "grad_norm": 0.37634986639022827, "learning_rate": 7.017175809949044e-05, "loss": 9.5719, "step": 42 }, { "epoch": 1.323076923076923, "grad_norm": 0.5357686877250671, "learning_rate": 6.850690776699573e-05, "loss": 13.4886, "step": 43 }, { "epoch": 1.353846153846154, "grad_norm": 0.41375601291656494, "learning_rate": 6.681792795750875e-05, "loss": 10.2368, "step": 44 }, { "epoch": 1.3846153846153846, "grad_norm": 0.3804188370704651, "learning_rate": 6.510702077847863e-05, "loss": 8.42, "step": 45 }, { "epoch": 1.3846153846153846, "eval_loss": 10.35329532623291, "eval_runtime": 0.0829, "eval_samples_per_second": 1314.084, "eval_steps_per_second": 48.223, "step": 45 }, { "epoch": 1.4153846153846155, "grad_norm": 0.5498846173286438, "learning_rate": 6.337641692646106e-05, "loss": 10.7262, "step": 46 }, { "epoch": 1.4461538461538461, "grad_norm": 0.4699338972568512, "learning_rate": 6.162837277871553e-05, "loss": 10.9246, "step": 47 }, { "epoch": 1.476923076923077, "grad_norm": 0.47309648990631104, "learning_rate": 5.9865167451320005e-05, "loss": 10.4618, "step": 48 }, { "epoch": 1.5076923076923077, "grad_norm": 0.44090601801872253, "learning_rate": 5.808909982763825e-05, "loss": 9.5822, "step": 49 }, { "epoch": 1.5384615384615383, "grad_norm": 0.6003273129463196, "learning_rate": 5.6302485561014475e-05, "loss": 12.5208, "step": 50 } ], "logging_steps": 1, "max_steps": 97, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 10460489318400.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }