{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7167381974248928, "eval_steps": 4, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.034334763948497854, "grad_norm": 19.15982437133789, "learning_rate": 2e-05, "loss": 13.5674, "step": 1 }, { "epoch": 0.034334763948497854, "eval_loss": 1.6727509498596191, "eval_runtime": 8.0648, "eval_samples_per_second": 6.076, "eval_steps_per_second": 1.612, "step": 1 }, { "epoch": 0.06866952789699571, "grad_norm": 16.63048553466797, "learning_rate": 4e-05, "loss": 14.7517, "step": 2 }, { "epoch": 0.10300429184549356, "grad_norm": 15.25049877166748, "learning_rate": 6e-05, "loss": 13.1345, "step": 3 }, { "epoch": 0.13733905579399142, "grad_norm": 17.173583984375, "learning_rate": 8e-05, "loss": 11.7391, "step": 4 }, { "epoch": 0.13733905579399142, "eval_loss": 1.4845433235168457, "eval_runtime": 8.0677, "eval_samples_per_second": 6.074, "eval_steps_per_second": 1.611, "step": 4 }, { "epoch": 0.17167381974248927, "grad_norm": 15.846421241760254, "learning_rate": 0.0001, "loss": 13.1035, "step": 5 }, { "epoch": 0.20600858369098712, "grad_norm": 13.740381240844727, "learning_rate": 0.00012, "loss": 10.3992, "step": 6 }, { "epoch": 0.24034334763948498, "grad_norm": 10.496570587158203, "learning_rate": 0.00014, "loss": 9.9128, "step": 7 }, { "epoch": 0.27467811158798283, "grad_norm": 8.85119342803955, "learning_rate": 0.00016, "loss": 7.9807, "step": 8 }, { "epoch": 0.27467811158798283, "eval_loss": 0.9227344989776611, "eval_runtime": 8.0856, "eval_samples_per_second": 6.06, "eval_steps_per_second": 1.608, "step": 8 }, { "epoch": 0.3090128755364807, "grad_norm": 8.351367950439453, "learning_rate": 0.00018, "loss": 6.646, "step": 9 }, { "epoch": 0.34334763948497854, "grad_norm": 11.750950813293457, "learning_rate": 0.0002, "loss": 7.369, "step": 10 }, { "epoch": 0.3776824034334764, "grad_norm": 12.794264793395996, "learning_rate": 0.0001996917333733128, "loss": 5.8694, "step": 11 }, { "epoch": 0.41201716738197425, "grad_norm": 21.229198455810547, "learning_rate": 0.00019876883405951377, "loss": 6.5926, "step": 12 }, { "epoch": 0.41201716738197425, "eval_loss": 0.6635634899139404, "eval_runtime": 8.081, "eval_samples_per_second": 6.064, "eval_steps_per_second": 1.609, "step": 12 }, { "epoch": 0.44635193133047213, "grad_norm": 15.183062553405762, "learning_rate": 0.00019723699203976766, "loss": 5.3242, "step": 13 }, { "epoch": 0.48068669527896996, "grad_norm": 11.120061874389648, "learning_rate": 0.00019510565162951537, "loss": 5.0979, "step": 14 }, { "epoch": 0.5150214592274678, "grad_norm": 9.379684448242188, "learning_rate": 0.0001923879532511287, "loss": 4.2185, "step": 15 }, { "epoch": 0.5493562231759657, "grad_norm": 9.187313079833984, "learning_rate": 0.0001891006524188368, "loss": 3.9636, "step": 16 }, { "epoch": 0.5493562231759657, "eval_loss": 0.5391715168952942, "eval_runtime": 8.0725, "eval_samples_per_second": 6.07, "eval_steps_per_second": 1.61, "step": 16 }, { "epoch": 0.5836909871244635, "grad_norm": 7.920268535614014, "learning_rate": 0.00018526401643540922, "loss": 3.4412, "step": 17 }, { "epoch": 0.6180257510729614, "grad_norm": 8.719185829162598, "learning_rate": 0.00018090169943749476, "loss": 4.7588, "step": 18 }, { "epoch": 0.6523605150214592, "grad_norm": 7.354686260223389, "learning_rate": 0.0001760405965600031, "loss": 3.7329, "step": 19 }, { "epoch": 0.6866952789699571, "grad_norm": 6.449216842651367, "learning_rate": 0.00017071067811865476, "loss": 3.0476, "step": 20 }, { "epoch": 0.6866952789699571, "eval_loss": 0.4746493995189667, "eval_runtime": 8.0715, "eval_samples_per_second": 6.071, "eval_steps_per_second": 1.611, "step": 20 }, { "epoch": 0.721030042918455, "grad_norm": 10.40072250366211, "learning_rate": 0.00016494480483301836, "loss": 3.6035, "step": 21 }, { "epoch": 0.7553648068669528, "grad_norm": 9.310944557189941, "learning_rate": 0.00015877852522924732, "loss": 3.1352, "step": 22 }, { "epoch": 0.7896995708154506, "grad_norm": 9.59798526763916, "learning_rate": 0.0001522498564715949, "loss": 3.9863, "step": 23 }, { "epoch": 0.8240343347639485, "grad_norm": 7.370415210723877, "learning_rate": 0.00014539904997395468, "loss": 3.0432, "step": 24 }, { "epoch": 0.8240343347639485, "eval_loss": 0.4330078065395355, "eval_runtime": 8.0682, "eval_samples_per_second": 6.073, "eval_steps_per_second": 1.611, "step": 24 }, { "epoch": 0.8583690987124464, "grad_norm": 9.128814697265625, "learning_rate": 0.000138268343236509, "loss": 3.4839, "step": 25 }, { "epoch": 0.8927038626609443, "grad_norm": 5.970047473907471, "learning_rate": 0.00013090169943749476, "loss": 3.2252, "step": 26 }, { "epoch": 0.927038626609442, "grad_norm": 6.384853839874268, "learning_rate": 0.00012334453638559057, "loss": 3.4018, "step": 27 }, { "epoch": 0.9613733905579399, "grad_norm": 6.741840839385986, "learning_rate": 0.0001156434465040231, "loss": 3.4371, "step": 28 }, { "epoch": 0.9613733905579399, "eval_loss": 0.4020407199859619, "eval_runtime": 8.0743, "eval_samples_per_second": 6.069, "eval_steps_per_second": 1.61, "step": 28 }, { "epoch": 0.9957081545064378, "grad_norm": 5.92246675491333, "learning_rate": 0.0001078459095727845, "loss": 2.4705, "step": 29 }, { "epoch": 1.0300429184549356, "grad_norm": 6.4705681800842285, "learning_rate": 0.0001, "loss": 2.5391, "step": 30 }, { "epoch": 1.0643776824034334, "grad_norm": 7.236270427703857, "learning_rate": 9.215409042721552e-05, "loss": 3.4479, "step": 31 }, { "epoch": 1.0987124463519313, "grad_norm": 5.484566688537598, "learning_rate": 8.435655349597689e-05, "loss": 2.1396, "step": 32 }, { "epoch": 1.0987124463519313, "eval_loss": 0.3587982654571533, "eval_runtime": 8.0769, "eval_samples_per_second": 6.067, "eval_steps_per_second": 1.61, "step": 32 }, { "epoch": 1.1330472103004292, "grad_norm": 6.789762020111084, "learning_rate": 7.66554636144095e-05, "loss": 2.4319, "step": 33 }, { "epoch": 1.167381974248927, "grad_norm": 5.139922142028809, "learning_rate": 6.909830056250527e-05, "loss": 2.5539, "step": 34 }, { "epoch": 1.201716738197425, "grad_norm": 5.432690143585205, "learning_rate": 6.173165676349103e-05, "loss": 1.9336, "step": 35 }, { "epoch": 1.2360515021459229, "grad_norm": 4.705223560333252, "learning_rate": 5.4600950026045326e-05, "loss": 1.6073, "step": 36 }, { "epoch": 1.2360515021459229, "eval_loss": 0.3393373191356659, "eval_runtime": 8.0747, "eval_samples_per_second": 6.068, "eval_steps_per_second": 1.61, "step": 36 }, { "epoch": 1.2703862660944205, "grad_norm": 8.144052505493164, "learning_rate": 4.7750143528405126e-05, "loss": 2.4886, "step": 37 }, { "epoch": 1.3047210300429184, "grad_norm": 5.885042667388916, "learning_rate": 4.12214747707527e-05, "loss": 2.1173, "step": 38 }, { "epoch": 1.3390557939914163, "grad_norm": 5.44212532043457, "learning_rate": 3.5055195166981645e-05, "loss": 1.5208, "step": 39 }, { "epoch": 1.3733905579399142, "grad_norm": 8.440479278564453, "learning_rate": 2.9289321881345254e-05, "loss": 3.2822, "step": 40 }, { "epoch": 1.3733905579399142, "eval_loss": 0.3231399357318878, "eval_runtime": 8.0758, "eval_samples_per_second": 6.068, "eval_steps_per_second": 1.61, "step": 40 }, { "epoch": 1.407725321888412, "grad_norm": 6.7061991691589355, "learning_rate": 2.3959403439996907e-05, "loss": 2.4418, "step": 41 }, { "epoch": 1.44206008583691, "grad_norm": 5.881109714508057, "learning_rate": 1.9098300562505266e-05, "loss": 2.252, "step": 42 }, { "epoch": 1.4763948497854078, "grad_norm": 5.9437479972839355, "learning_rate": 1.4735983564590783e-05, "loss": 1.8867, "step": 43 }, { "epoch": 1.5107296137339055, "grad_norm": 5.992803573608398, "learning_rate": 1.0899347581163221e-05, "loss": 1.7314, "step": 44 }, { "epoch": 1.5107296137339055, "eval_loss": 0.31934094429016113, "eval_runtime": 8.0874, "eval_samples_per_second": 6.059, "eval_steps_per_second": 1.607, "step": 44 }, { "epoch": 1.5450643776824036, "grad_norm": 6.900696754455566, "learning_rate": 7.612046748871327e-06, "loss": 2.8172, "step": 45 }, { "epoch": 1.5793991416309012, "grad_norm": 4.783636569976807, "learning_rate": 4.8943483704846475e-06, "loss": 1.6444, "step": 46 }, { "epoch": 1.613733905579399, "grad_norm": 7.204476356506348, "learning_rate": 2.7630079602323442e-06, "loss": 2.3282, "step": 47 }, { "epoch": 1.648068669527897, "grad_norm": 5.65119743347168, "learning_rate": 1.231165940486234e-06, "loss": 1.7266, "step": 48 }, { "epoch": 1.648068669527897, "eval_loss": 0.3161769509315491, "eval_runtime": 8.1044, "eval_samples_per_second": 6.046, "eval_steps_per_second": 1.604, "step": 48 }, { "epoch": 1.6824034334763949, "grad_norm": 5.6475605964660645, "learning_rate": 3.0826662668720364e-07, "loss": 1.6549, "step": 49 }, { "epoch": 1.7167381974248928, "grad_norm": 9.571765899658203, "learning_rate": 0.0, "loss": 3.696, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6089318710981427e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }