{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.6716417910447765, "eval_steps": 500, "global_step": 65, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07462686567164178, "grad_norm": 11.831602096557617, "learning_rate": 0.00019692307692307696, "loss": 9.5753, "step": 1 }, { "epoch": 0.14925373134328357, "grad_norm": 10.040313720703125, "learning_rate": 0.00019384615384615385, "loss": 9.3879, "step": 2 }, { "epoch": 0.22388059701492538, "grad_norm": 11.504229545593262, "learning_rate": 0.0001907692307692308, "loss": 9.2222, "step": 3 }, { "epoch": 0.29850746268656714, "grad_norm": 13.777145385742188, "learning_rate": 0.0001876923076923077, "loss": 9.0187, "step": 4 }, { "epoch": 0.373134328358209, "grad_norm": 16.53196907043457, "learning_rate": 0.00018461538461538463, "loss": 8.8251, "step": 5 }, { "epoch": 0.44776119402985076, "grad_norm": 23.02943229675293, "learning_rate": 0.00018153846153846155, "loss": 8.4652, "step": 6 }, { "epoch": 0.5223880597014925, "grad_norm": 20.641206741333008, "learning_rate": 0.00017846153846153847, "loss": 8.1413, "step": 7 }, { "epoch": 0.5970149253731343, "grad_norm": 16.40256690979004, "learning_rate": 0.0001753846153846154, "loss": 7.9036, "step": 8 }, { "epoch": 0.6716417910447762, "grad_norm": 19.79137420654297, "learning_rate": 0.00017230769230769234, "loss": 7.8958, "step": 9 }, { "epoch": 0.746268656716418, "grad_norm": 10.174325942993164, "learning_rate": 0.00016923076923076923, "loss": 7.6606, "step": 10 }, { "epoch": 0.8208955223880597, "grad_norm": 11.052704811096191, "learning_rate": 0.00016615384615384617, "loss": 7.6561, "step": 11 }, { "epoch": 0.8955223880597015, "grad_norm": 12.233122825622559, "learning_rate": 0.0001630769230769231, "loss": 7.5328, "step": 12 }, { "epoch": 0.9701492537313433, "grad_norm": 10.275497436523438, "learning_rate": 0.00016, "loss": 7.4978, "step": 13 }, { "epoch": 1.0, "grad_norm": 
7.3966779708862305, "learning_rate": 0.00015692307692307693, "loss": 3.0154, "step": 14 }, { "epoch": 1.0746268656716418, "grad_norm": 13.597101211547852, "learning_rate": 0.00015384615384615385, "loss": 7.3022, "step": 15 }, { "epoch": 1.1492537313432836, "grad_norm": 12.541572570800781, "learning_rate": 0.00015076923076923077, "loss": 7.4243, "step": 16 }, { "epoch": 1.2238805970149254, "grad_norm": 7.609414577484131, "learning_rate": 0.00014769230769230772, "loss": 7.3352, "step": 17 }, { "epoch": 1.2985074626865671, "grad_norm": 8.79019546508789, "learning_rate": 0.0001446153846153846, "loss": 7.3403, "step": 18 }, { "epoch": 1.373134328358209, "grad_norm": 7.513161659240723, "learning_rate": 0.00014153846153846156, "loss": 7.3048, "step": 19 }, { "epoch": 1.4477611940298507, "grad_norm": null, "learning_rate": 0.00014153846153846156, "loss": 7.3934, "step": 20 }, { "epoch": 1.5223880597014925, "grad_norm": 11.386406898498535, "learning_rate": 0.00013846153846153847, "loss": 7.2708, "step": 21 }, { "epoch": 1.5970149253731343, "grad_norm": null, "learning_rate": 0.00013846153846153847, "loss": 7.2549, "step": 22 }, { "epoch": 1.671641791044776, "grad_norm": 9.923229217529297, "learning_rate": 0.0001353846153846154, "loss": 7.3161, "step": 23 }, { "epoch": 1.7462686567164178, "grad_norm": 7.624272346496582, "learning_rate": 0.0001323076923076923, "loss": 7.2844, "step": 24 }, { "epoch": 1.8208955223880596, "grad_norm": 6.796629905700684, "learning_rate": 0.00012923076923076923, "loss": 7.2594, "step": 25 }, { "epoch": 1.8955223880597014, "grad_norm": 5.520105838775635, "learning_rate": 0.00012615384615384615, "loss": 7.2639, "step": 26 }, { "epoch": 1.9701492537313432, "grad_norm": 5.706660270690918, "learning_rate": 0.0001230769230769231, "loss": 7.2271, "step": 27 }, { "epoch": 2.0, "grad_norm": 4.667247295379639, "learning_rate": 0.00012, "loss": 2.942, "step": 28 }, { "epoch": 2.074626865671642, "grad_norm": 8.385282516479492, "learning_rate": 
0.00011692307692307694, "loss": 7.1126, "step": 29 }, { "epoch": 2.1492537313432836, "grad_norm": 4.17704963684082, "learning_rate": 0.00011384615384615384, "loss": 7.2177, "step": 30 }, { "epoch": 2.2238805970149254, "grad_norm": 7.29147481918335, "learning_rate": 0.00011076923076923077, "loss": 7.2731, "step": 31 }, { "epoch": 2.298507462686567, "grad_norm": 5.086247444152832, "learning_rate": 0.0001076923076923077, "loss": 7.1494, "step": 32 }, { "epoch": 2.373134328358209, "grad_norm": 4.92710542678833, "learning_rate": 0.00010461538461538463, "loss": 7.1479, "step": 33 }, { "epoch": 2.4477611940298507, "grad_norm": 5.310170650482178, "learning_rate": 0.00010153846153846153, "loss": 7.1172, "step": 34 }, { "epoch": 2.5223880597014925, "grad_norm": 5.681138515472412, "learning_rate": 9.846153846153848e-05, "loss": 7.1665, "step": 35 }, { "epoch": 2.5970149253731343, "grad_norm": 6.48416805267334, "learning_rate": 9.53846153846154e-05, "loss": 7.229, "step": 36 }, { "epoch": 2.671641791044776, "grad_norm": 7.22155237197876, "learning_rate": 9.230769230769232e-05, "loss": 7.2443, "step": 37 }, { "epoch": 2.746268656716418, "grad_norm": 5.244325160980225, "learning_rate": 8.923076923076924e-05, "loss": 7.1864, "step": 38 }, { "epoch": 2.8208955223880596, "grad_norm": 6.491042613983154, "learning_rate": 8.615384615384617e-05, "loss": 7.1403, "step": 39 }, { "epoch": 2.8955223880597014, "grad_norm": 4.748079299926758, "learning_rate": 8.307692307692309e-05, "loss": 7.2464, "step": 40 }, { "epoch": 2.970149253731343, "grad_norm": 4.683705806732178, "learning_rate": 8e-05, "loss": 7.1566, "step": 41 }, { "epoch": 3.0, "grad_norm": 8.315367698669434, "learning_rate": 7.692307692307693e-05, "loss": 2.7335, "step": 42 }, { "epoch": 3.074626865671642, "grad_norm": 8.71499252319336, "learning_rate": 7.384615384615386e-05, "loss": 7.0353, "step": 43 }, { "epoch": 3.1492537313432836, "grad_norm": 4.421390056610107, "learning_rate": 7.076923076923078e-05, "loss": 7.1684, 
"step": 44 }, { "epoch": 3.2238805970149254, "grad_norm": 4.962438106536865, "learning_rate": 6.76923076923077e-05, "loss": 7.0518, "step": 45 }, { "epoch": 3.298507462686567, "grad_norm": 4.061994552612305, "learning_rate": 6.461538461538462e-05, "loss": 7.0753, "step": 46 }, { "epoch": 3.373134328358209, "grad_norm": 10.23737621307373, "learning_rate": 6.153846153846155e-05, "loss": 7.2593, "step": 47 }, { "epoch": 3.4477611940298507, "grad_norm": 5.402864456176758, "learning_rate": 5.846153846153847e-05, "loss": 7.1255, "step": 48 }, { "epoch": 3.5223880597014925, "grad_norm": 7.442513465881348, "learning_rate": 5.538461538461539e-05, "loss": 6.9086, "step": 49 }, { "epoch": 3.5970149253731343, "grad_norm": 12.119452476501465, "learning_rate": 5.230769230769231e-05, "loss": 7.2655, "step": 50 }, { "epoch": 3.671641791044776, "grad_norm": 4.1253156661987305, "learning_rate": 4.923076923076924e-05, "loss": 7.129, "step": 51 }, { "epoch": 3.746268656716418, "grad_norm": 7.582332611083984, "learning_rate": 4.615384615384616e-05, "loss": 7.1866, "step": 52 }, { "epoch": 3.8208955223880596, "grad_norm": 9.213349342346191, "learning_rate": 4.3076923076923084e-05, "loss": 7.2262, "step": 53 }, { "epoch": 3.8955223880597014, "grad_norm": 4.230329513549805, "learning_rate": 4e-05, "loss": 7.0612, "step": 54 }, { "epoch": 3.970149253731343, "grad_norm": 3.959320545196533, "learning_rate": 3.692307692307693e-05, "loss": 7.095, "step": 55 }, { "epoch": 4.0, "grad_norm": 2.782017469406128, "learning_rate": 3.384615384615385e-05, "loss": 2.8985, "step": 56 }, { "epoch": 4.074626865671641, "grad_norm": 8.111074447631836, "learning_rate": 3.0769230769230774e-05, "loss": 7.0008, "step": 57 }, { "epoch": 4.149253731343284, "grad_norm": 3.9246532917022705, "learning_rate": 2.7692307692307694e-05, "loss": 7.1253, "step": 58 }, { "epoch": 4.223880597014926, "grad_norm": 11.857646942138672, "learning_rate": 2.461538461538462e-05, "loss": 6.9411, "step": 59 }, { "epoch": 
4.298507462686567, "grad_norm": 2.8319931030273438, "learning_rate": 2.1538461538461542e-05, "loss": 7.1616, "step": 60 }, { "epoch": 4.373134328358209, "grad_norm": 3.061171293258667, "learning_rate": 1.8461538461538465e-05, "loss": 7.1291, "step": 61 }, { "epoch": 4.447761194029851, "grad_norm": 5.19327449798584, "learning_rate": 1.5384615384615387e-05, "loss": 7.1867, "step": 62 }, { "epoch": 4.522388059701493, "grad_norm": 3.0203020572662354, "learning_rate": 1.230769230769231e-05, "loss": 7.1364, "step": 63 }, { "epoch": 4.597014925373134, "grad_norm": 3.957735300064087, "learning_rate": 9.230769230769232e-06, "loss": 7.139, "step": 64 }, { "epoch": 4.6716417910447765, "grad_norm": 4.099056243896484, "learning_rate": 6.153846153846155e-06, "loss": 7.1104, "step": 65 }, { "epoch": 4.6716417910447765, "step": 65, "total_flos": 46267156615872.0, "train_loss": 7.157758657748882, "train_runtime": 2311.3066, "train_samples_per_second": 0.58, "train_steps_per_second": 0.028 } ], "logging_steps": 1.0, "max_steps": 65, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 46267156615872.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }