{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.021105951878429716, "eval_steps": 5, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00042211903756859433, "grad_norm": 9.35902214050293, "learning_rate": 2e-05, "loss": 10.0563, "step": 1 }, { "epoch": 0.00042211903756859433, "eval_loss": 2.3652446269989014, "eval_runtime": 134.5359, "eval_samples_per_second": 7.418, "eval_steps_per_second": 3.709, "step": 1 }, { "epoch": 0.0008442380751371887, "grad_norm": 7.56370735168457, "learning_rate": 4e-05, "loss": 7.42, "step": 2 }, { "epoch": 0.001266357112705783, "grad_norm": 9.696252822875977, "learning_rate": 6e-05, "loss": 10.9588, "step": 3 }, { "epoch": 0.0016884761502743773, "grad_norm": 8.566594123840332, "learning_rate": 8e-05, "loss": 10.168, "step": 4 }, { "epoch": 0.002110595187842972, "grad_norm": 7.462744235992432, "learning_rate": 0.0001, "loss": 8.2985, "step": 5 }, { "epoch": 0.002110595187842972, "eval_loss": 2.258758783340454, "eval_runtime": 135.8864, "eval_samples_per_second": 7.344, "eval_steps_per_second": 3.672, "step": 5 }, { "epoch": 0.002532714225411566, "grad_norm": 11.242934226989746, "learning_rate": 0.00012, "loss": 8.9401, "step": 6 }, { "epoch": 0.0029548332629801602, "grad_norm": 13.016233444213867, "learning_rate": 0.00014, "loss": 8.0107, "step": 7 }, { "epoch": 0.0033769523005487546, "grad_norm": 16.885713577270508, "learning_rate": 0.00016, "loss": 7.0748, "step": 8 }, { "epoch": 0.003799071338117349, "grad_norm": 12.32886791229248, "learning_rate": 0.00018, "loss": 6.1376, "step": 9 }, { "epoch": 0.004221190375685944, "grad_norm": 19.14803123474121, "learning_rate": 0.0002, "loss": 6.8045, "step": 10 }, { "epoch": 0.004221190375685944, "eval_loss": 1.6286500692367554, "eval_runtime": 135.9287, "eval_samples_per_second": 7.342, "eval_steps_per_second": 3.671, "step": 10 }, { "epoch": 0.004643309413254538, "grad_norm": 18.249591827392578, "learning_rate": 0.0001996917333733128, "loss": 5.6408, "step": 11 }, { "epoch": 0.005065428450823132, "grad_norm": 8.969521522521973, "learning_rate": 0.00019876883405951377, "loss": 3.3644, "step": 12 }, { "epoch": 0.005487547488391726, "grad_norm": 9.032382011413574, "learning_rate": 0.00019723699203976766, "loss": 5.2804, "step": 13 }, { "epoch": 0.0059096665259603205, "grad_norm": 11.984638214111328, "learning_rate": 0.00019510565162951537, "loss": 4.695, "step": 14 }, { "epoch": 0.006331785563528915, "grad_norm": 14.346260070800781, "learning_rate": 0.0001923879532511287, "loss": 6.4391, "step": 15 }, { "epoch": 0.006331785563528915, "eval_loss": 1.3743129968643188, "eval_runtime": 135.8966, "eval_samples_per_second": 7.344, "eval_steps_per_second": 3.672, "step": 15 }, { "epoch": 0.006753904601097509, "grad_norm": 11.886094093322754, "learning_rate": 0.0001891006524188368, "loss": 5.4466, "step": 16 }, { "epoch": 0.007176023638666104, "grad_norm": 12.336444854736328, "learning_rate": 0.00018526401643540922, "loss": 4.6035, "step": 17 }, { "epoch": 0.007598142676234698, "grad_norm": 19.94431495666504, "learning_rate": 0.00018090169943749476, "loss": 5.2473, "step": 18 }, { "epoch": 0.008020261713803293, "grad_norm": 9.52234172821045, "learning_rate": 0.0001760405965600031, "loss": 3.7112, "step": 19 }, { "epoch": 0.008442380751371888, "grad_norm": 11.920819282531738, "learning_rate": 0.00017071067811865476, "loss": 3.9923, "step": 20 }, { "epoch": 0.008442380751371888, "eval_loss": 1.2984857559204102, "eval_runtime": 135.8981, "eval_samples_per_second": 7.344, "eval_steps_per_second": 3.672, "step": 20 }, { "epoch": 0.008864499788940482, "grad_norm": 11.649659156799316, "learning_rate": 0.00016494480483301836, "loss": 5.2017, "step": 21 }, { "epoch": 0.009286618826509076, "grad_norm": 10.896583557128906, "learning_rate": 0.00015877852522924732, "loss": 5.5399, "step": 22 }, { "epoch": 0.009708737864077669, "grad_norm": 10.63659381866455, "learning_rate": 0.0001522498564715949, "loss": 5.016, "step": 23 }, { "epoch": 0.010130856901646263, "grad_norm": 11.454913139343262, "learning_rate": 0.00014539904997395468, "loss": 5.6765, "step": 24 }, { "epoch": 0.010552975939214858, "grad_norm": 10.400849342346191, "learning_rate": 0.000138268343236509, "loss": 6.1088, "step": 25 }, { "epoch": 0.010552975939214858, "eval_loss": 1.2727545499801636, "eval_runtime": 135.9196, "eval_samples_per_second": 7.343, "eval_steps_per_second": 3.671, "step": 25 }, { "epoch": 0.010975094976783452, "grad_norm": 12.741352081298828, "learning_rate": 0.00013090169943749476, "loss": 5.4666, "step": 26 }, { "epoch": 0.011397214014352047, "grad_norm": 17.090585708618164, "learning_rate": 0.00012334453638559057, "loss": 4.8389, "step": 27 }, { "epoch": 0.011819333051920641, "grad_norm": 12.481046676635742, "learning_rate": 0.0001156434465040231, "loss": 4.6741, "step": 28 }, { "epoch": 0.012241452089489235, "grad_norm": 10.3117094039917, "learning_rate": 0.0001078459095727845, "loss": 5.4316, "step": 29 }, { "epoch": 0.01266357112705783, "grad_norm": 7.214309215545654, "learning_rate": 0.0001, "loss": 3.7213, "step": 30 }, { "epoch": 0.01266357112705783, "eval_loss": 1.2609025239944458, "eval_runtime": 135.9083, "eval_samples_per_second": 7.343, "eval_steps_per_second": 3.672, "step": 30 }, { "epoch": 0.013085690164626424, "grad_norm": 18.718671798706055, "learning_rate": 9.215409042721552e-05, "loss": 5.4732, "step": 31 }, { "epoch": 0.013507809202195019, "grad_norm": 11.435547828674316, "learning_rate": 8.435655349597689e-05, "loss": 4.9203, "step": 32 }, { "epoch": 0.013929928239763613, "grad_norm": 10.921402931213379, "learning_rate": 7.66554636144095e-05, "loss": 4.5804, "step": 33 }, { "epoch": 0.014352047277332207, "grad_norm": 19.59139633178711, "learning_rate": 6.909830056250527e-05, "loss": 5.5784, "step": 34 }, { "epoch": 0.014774166314900802, "grad_norm": 7.975940704345703, "learning_rate": 6.173165676349103e-05, "loss": 5.5038, "step": 35 }, { "epoch": 0.014774166314900802, "eval_loss": 1.2495229244232178, "eval_runtime": 135.8846, "eval_samples_per_second": 7.344, "eval_steps_per_second": 3.672, "step": 35 }, { "epoch": 0.015196285352469396, "grad_norm": 9.815423965454102, "learning_rate": 5.4600950026045326e-05, "loss": 4.815, "step": 36 }, { "epoch": 0.01561840439003799, "grad_norm": 10.220743179321289, "learning_rate": 4.7750143528405126e-05, "loss": 4.7406, "step": 37 }, { "epoch": 0.016040523427606587, "grad_norm": 10.354463577270508, "learning_rate": 4.12214747707527e-05, "loss": 4.5756, "step": 38 }, { "epoch": 0.01646264246517518, "grad_norm": 11.346120834350586, "learning_rate": 3.5055195166981645e-05, "loss": 6.2784, "step": 39 }, { "epoch": 0.016884761502743775, "grad_norm": 9.713720321655273, "learning_rate": 2.9289321881345254e-05, "loss": 5.7159, "step": 40 }, { "epoch": 0.016884761502743775, "eval_loss": 1.2519896030426025, "eval_runtime": 136.0325, "eval_samples_per_second": 7.336, "eval_steps_per_second": 3.668, "step": 40 }, { "epoch": 0.017306880540312368, "grad_norm": 13.019607543945312, "learning_rate": 2.3959403439996907e-05, "loss": 4.0586, "step": 41 }, { "epoch": 0.017728999577880964, "grad_norm": 9.759472846984863, "learning_rate": 1.9098300562505266e-05, "loss": 5.9911, "step": 42 }, { "epoch": 0.018151118615449557, "grad_norm": 9.983514785766602, "learning_rate": 1.4735983564590783e-05, "loss": 4.5841, "step": 43 }, { "epoch": 0.018573237653018153, "grad_norm": 11.719050407409668, "learning_rate": 1.0899347581163221e-05, "loss": 6.0421, "step": 44 }, { "epoch": 0.018995356690586745, "grad_norm": 11.842796325683594, "learning_rate": 7.612046748871327e-06, "loss": 5.632, "step": 45 }, { "epoch": 0.018995356690586745, "eval_loss": 1.2501074075698853, "eval_runtime": 136.0353, "eval_samples_per_second": 7.336, "eval_steps_per_second": 3.668, "step": 45 }, { "epoch": 0.019417475728155338, "grad_norm": 13.858030319213867, "learning_rate": 4.8943483704846475e-06, "loss": 4.1389, "step": 46 }, { "epoch": 0.019839594765723934, "grad_norm": 12.51378345489502, "learning_rate": 2.7630079602323442e-06, "loss": 4.6694, "step": 47 }, { "epoch": 0.020261713803292527, "grad_norm": 14.40710163116455, "learning_rate": 1.231165940486234e-06, "loss": 4.5467, "step": 48 }, { "epoch": 0.020683832840861123, "grad_norm": 12.680715560913086, "learning_rate": 3.0826662668720364e-07, "loss": 4.3332, "step": 49 }, { "epoch": 0.021105951878429716, "grad_norm": 10.62983512878418, "learning_rate": 0.0, "loss": 5.3688, "step": 50 }, { "epoch": 0.021105951878429716, "eval_loss": 1.2478138208389282, "eval_runtime": 135.9744, "eval_samples_per_second": 7.34, "eval_steps_per_second": 3.67, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.25746627182592e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }