|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.021105951878429716, |
|
"eval_steps": 5, |
|
"global_step": 50, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00042211903756859433, |
|
"grad_norm": 9.35902214050293, |
|
"learning_rate": 2e-05, |
|
"loss": 10.0563, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00042211903756859433, |
|
"eval_loss": 2.3652446269989014, |
|
"eval_runtime": 134.5359, |
|
"eval_samples_per_second": 7.418, |
|
"eval_steps_per_second": 3.709, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0008442380751371887, |
|
"grad_norm": 7.56370735168457, |
|
"learning_rate": 4e-05, |
|
"loss": 7.42, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.001266357112705783, |
|
"grad_norm": 9.696252822875977, |
|
"learning_rate": 6e-05, |
|
"loss": 10.9588, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.0016884761502743773, |
|
"grad_norm": 8.566594123840332, |
|
"learning_rate": 8e-05, |
|
"loss": 10.168, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.002110595187842972, |
|
"grad_norm": 7.462744235992432, |
|
"learning_rate": 0.0001, |
|
"loss": 8.2985, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.002110595187842972, |
|
"eval_loss": 2.258758783340454, |
|
"eval_runtime": 135.8864, |
|
"eval_samples_per_second": 7.344, |
|
"eval_steps_per_second": 3.672, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.002532714225411566, |
|
"grad_norm": 11.242934226989746, |
|
"learning_rate": 0.00012, |
|
"loss": 8.9401, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0029548332629801602, |
|
"grad_norm": 13.016233444213867, |
|
"learning_rate": 0.00014, |
|
"loss": 8.0107, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0033769523005487546, |
|
"grad_norm": 16.885713577270508, |
|
"learning_rate": 0.00016, |
|
"loss": 7.0748, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.003799071338117349, |
|
"grad_norm": 12.32886791229248, |
|
"learning_rate": 0.00018, |
|
"loss": 6.1376, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.004221190375685944, |
|
"grad_norm": 19.14803123474121, |
|
"learning_rate": 0.0002, |
|
"loss": 6.8045, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004221190375685944, |
|
"eval_loss": 1.6286500692367554, |
|
"eval_runtime": 135.9287, |
|
"eval_samples_per_second": 7.342, |
|
"eval_steps_per_second": 3.671, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.004643309413254538, |
|
"grad_norm": 18.249591827392578, |
|
"learning_rate": 0.0001996917333733128, |
|
"loss": 5.6408, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.005065428450823132, |
|
"grad_norm": 8.969521522521973, |
|
"learning_rate": 0.00019876883405951377, |
|
"loss": 3.3644, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.005487547488391726, |
|
"grad_norm": 9.032382011413574, |
|
"learning_rate": 0.00019723699203976766, |
|
"loss": 5.2804, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0059096665259603205, |
|
"grad_norm": 11.984638214111328, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 4.695, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.006331785563528915, |
|
"grad_norm": 14.346260070800781, |
|
"learning_rate": 0.0001923879532511287, |
|
"loss": 6.4391, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.006331785563528915, |
|
"eval_loss": 1.3743129968643188, |
|
"eval_runtime": 135.8966, |
|
"eval_samples_per_second": 7.344, |
|
"eval_steps_per_second": 3.672, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.006753904601097509, |
|
"grad_norm": 11.886094093322754, |
|
"learning_rate": 0.0001891006524188368, |
|
"loss": 5.4466, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.007176023638666104, |
|
"grad_norm": 12.336444854736328, |
|
"learning_rate": 0.00018526401643540922, |
|
"loss": 4.6035, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.007598142676234698, |
|
"grad_norm": 19.94431495666504, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 5.2473, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.008020261713803293, |
|
"grad_norm": 9.52234172821045, |
|
"learning_rate": 0.0001760405965600031, |
|
"loss": 3.7112, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.008442380751371888, |
|
"grad_norm": 11.920819282531738, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 3.9923, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008442380751371888, |
|
"eval_loss": 1.2984857559204102, |
|
"eval_runtime": 135.8981, |
|
"eval_samples_per_second": 7.344, |
|
"eval_steps_per_second": 3.672, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008864499788940482, |
|
"grad_norm": 11.649659156799316, |
|
"learning_rate": 0.00016494480483301836, |
|
"loss": 5.2017, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.009286618826509076, |
|
"grad_norm": 10.896583557128906, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 5.5399, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.009708737864077669, |
|
"grad_norm": 10.63659381866455, |
|
"learning_rate": 0.0001522498564715949, |
|
"loss": 5.016, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.010130856901646263, |
|
"grad_norm": 11.454913139343262, |
|
"learning_rate": 0.00014539904997395468, |
|
"loss": 5.6765, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.010552975939214858, |
|
"grad_norm": 10.400849342346191, |
|
"learning_rate": 0.000138268343236509, |
|
"loss": 6.1088, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.010552975939214858, |
|
"eval_loss": 1.2727545499801636, |
|
"eval_runtime": 135.9196, |
|
"eval_samples_per_second": 7.343, |
|
"eval_steps_per_second": 3.671, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.010975094976783452, |
|
"grad_norm": 12.741352081298828, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 5.4666, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.011397214014352047, |
|
"grad_norm": 17.090585708618164, |
|
"learning_rate": 0.00012334453638559057, |
|
"loss": 4.8389, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.011819333051920641, |
|
"grad_norm": 12.481046676635742, |
|
"learning_rate": 0.0001156434465040231, |
|
"loss": 4.6741, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.012241452089489235, |
|
"grad_norm": 10.3117094039917, |
|
"learning_rate": 0.0001078459095727845, |
|
"loss": 5.4316, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.01266357112705783, |
|
"grad_norm": 7.214309215545654, |
|
"learning_rate": 0.0001, |
|
"loss": 3.7213, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.01266357112705783, |
|
"eval_loss": 1.2609025239944458, |
|
"eval_runtime": 135.9083, |
|
"eval_samples_per_second": 7.343, |
|
"eval_steps_per_second": 3.672, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.013085690164626424, |
|
"grad_norm": 18.718671798706055, |
|
"learning_rate": 9.215409042721552e-05, |
|
"loss": 5.4732, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.013507809202195019, |
|
"grad_norm": 11.435547828674316, |
|
"learning_rate": 8.435655349597689e-05, |
|
"loss": 4.9203, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.013929928239763613, |
|
"grad_norm": 10.921402931213379, |
|
"learning_rate": 7.66554636144095e-05, |
|
"loss": 4.5804, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.014352047277332207, |
|
"grad_norm": 19.59139633178711, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 5.5784, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.014774166314900802, |
|
"grad_norm": 7.975940704345703, |
|
"learning_rate": 6.173165676349103e-05, |
|
"loss": 5.5038, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.014774166314900802, |
|
"eval_loss": 1.2495229244232178, |
|
"eval_runtime": 135.8846, |
|
"eval_samples_per_second": 7.344, |
|
"eval_steps_per_second": 3.672, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.015196285352469396, |
|
"grad_norm": 9.815423965454102, |
|
"learning_rate": 5.4600950026045326e-05, |
|
"loss": 4.815, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.01561840439003799, |
|
"grad_norm": 10.220743179321289, |
|
"learning_rate": 4.7750143528405126e-05, |
|
"loss": 4.7406, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.016040523427606587, |
|
"grad_norm": 10.354463577270508, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 4.5756, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.01646264246517518, |
|
"grad_norm": 11.346120834350586, |
|
"learning_rate": 3.5055195166981645e-05, |
|
"loss": 6.2784, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.016884761502743775, |
|
"grad_norm": 9.713720321655273, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 5.7159, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.016884761502743775, |
|
"eval_loss": 1.2519896030426025, |
|
"eval_runtime": 136.0325, |
|
"eval_samples_per_second": 7.336, |
|
"eval_steps_per_second": 3.668, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.017306880540312368, |
|
"grad_norm": 13.019607543945312, |
|
"learning_rate": 2.3959403439996907e-05, |
|
"loss": 4.0586, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.017728999577880964, |
|
"grad_norm": 9.759472846984863, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 5.9911, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.018151118615449557, |
|
"grad_norm": 9.983514785766602, |
|
"learning_rate": 1.4735983564590783e-05, |
|
"loss": 4.5841, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.018573237653018153, |
|
"grad_norm": 11.719050407409668, |
|
"learning_rate": 1.0899347581163221e-05, |
|
"loss": 6.0421, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.018995356690586745, |
|
"grad_norm": 11.842796325683594, |
|
"learning_rate": 7.612046748871327e-06, |
|
"loss": 5.632, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.018995356690586745, |
|
"eval_loss": 1.2501074075698853, |
|
"eval_runtime": 136.0353, |
|
"eval_samples_per_second": 7.336, |
|
"eval_steps_per_second": 3.668, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.019417475728155338, |
|
"grad_norm": 13.858030319213867, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 4.1389, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.019839594765723934, |
|
"grad_norm": 12.51378345489502, |
|
"learning_rate": 2.7630079602323442e-06, |
|
"loss": 4.6694, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.020261713803292527, |
|
"grad_norm": 14.40710163116455, |
|
"learning_rate": 1.231165940486234e-06, |
|
"loss": 4.5467, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.020683832840861123, |
|
"grad_norm": 12.680715560913086, |
|
"learning_rate": 3.0826662668720364e-07, |
|
"loss": 4.3332, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.021105951878429716, |
|
"grad_norm": 10.62983512878418, |
|
"learning_rate": 0.0, |
|
"loss": 5.3688, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.021105951878429716, |
|
"eval_loss": 1.2478138208389282, |
|
"eval_runtime": 135.9744, |
|
"eval_samples_per_second": 7.34, |
|
"eval_steps_per_second": 3.67, |
|
"step": 50 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 50, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 80, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.25746627182592e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|