{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.021276595744681,
  "eval_steps": 25,
  "global_step": 71,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0425531914893617,
      "grad_norm": 2.0995755195617676,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 3.1063,
      "step": 1
    },
    {
      "epoch": 0.0425531914893617,
      "eval_loss": 3.8875327110290527,
      "eval_runtime": 0.6786,
      "eval_samples_per_second": 29.471,
      "eval_steps_per_second": 14.735,
      "step": 1
    },
    {
      "epoch": 0.0851063829787234,
      "grad_norm": 3.263718605041504,
      "learning_rate": 6.666666666666667e-05,
      "loss": 3.4322,
      "step": 2
    },
    {
      "epoch": 0.1276595744680851,
      "grad_norm": 3.820951223373413,
      "learning_rate": 0.0001,
      "loss": 3.645,
      "step": 3
    },
    {
      "epoch": 0.1702127659574468,
      "grad_norm": 5.33889627456665,
      "learning_rate": 9.994664874011863e-05,
      "loss": 4.513,
      "step": 4
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 7.883951663970947,
      "learning_rate": 9.978670881475172e-05,
      "loss": 5.0264,
      "step": 5
    },
    {
      "epoch": 0.2553191489361702,
      "grad_norm": 2.583144187927246,
      "learning_rate": 9.952052154376026e-05,
      "loss": 3.26,
      "step": 6
    },
    {
      "epoch": 0.2978723404255319,
      "grad_norm": 3.3901255130767822,
      "learning_rate": 9.91486549841951e-05,
      "loss": 3.2384,
      "step": 7
    },
    {
      "epoch": 0.3404255319148936,
      "grad_norm": 4.286656856536865,
      "learning_rate": 9.867190271803465e-05,
      "loss": 3.4462,
      "step": 8
    },
    {
      "epoch": 0.3829787234042553,
      "grad_norm": 4.722428321838379,
      "learning_rate": 9.809128215864097e-05,
      "loss": 3.7113,
      "step": 9
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 6.825965404510498,
      "learning_rate": 9.74080323795483e-05,
      "loss": 4.5429,
      "step": 10
    },
    {
      "epoch": 0.46808510638297873,
      "grad_norm": 3.327223777770996,
      "learning_rate": 9.662361147021779e-05,
      "loss": 3.0301,
      "step": 11
    },
    {
      "epoch": 0.5106382978723404,
      "grad_norm": 3.6547610759735107,
      "learning_rate": 9.573969342440106e-05,
      "loss": 2.9293,
      "step": 12
    },
    {
      "epoch": 0.5531914893617021,
      "grad_norm": 4.506838798522949,
      "learning_rate": 9.475816456775313e-05,
      "loss": 3.4406,
      "step": 13
    },
    {
      "epoch": 0.5957446808510638,
      "grad_norm": 5.095069408416748,
      "learning_rate": 9.368111953231848e-05,
      "loss": 3.5664,
      "step": 14
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 5.679275989532471,
      "learning_rate": 9.251085678648072e-05,
      "loss": 3.443,
      "step": 15
    },
    {
      "epoch": 0.6808510638297872,
      "grad_norm": 2.384618043899536,
      "learning_rate": 9.124987372991511e-05,
      "loss": 3.0016,
      "step": 16
    },
    {
      "epoch": 0.723404255319149,
      "grad_norm": 3.2170469760894775,
      "learning_rate": 8.9900861364012e-05,
      "loss": 2.9207,
      "step": 17
    },
    {
      "epoch": 0.7659574468085106,
      "grad_norm": 4.741713523864746,
      "learning_rate": 8.846669854914396e-05,
      "loss": 3.2638,
      "step": 18
    },
    {
      "epoch": 0.8085106382978723,
      "grad_norm": 4.597310543060303,
      "learning_rate": 8.695044586103296e-05,
      "loss": 3.119,
      "step": 19
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 6.106940746307373,
      "learning_rate": 8.535533905932738e-05,
      "loss": 3.196,
      "step": 20
    },
    {
      "epoch": 0.8936170212765957,
      "grad_norm": 3.007261037826538,
      "learning_rate": 8.368478218232787e-05,
      "loss": 2.7263,
      "step": 21
    },
    {
      "epoch": 0.9361702127659575,
      "grad_norm": 3.8679449558258057,
      "learning_rate": 8.194234028259806e-05,
      "loss": 3.2128,
      "step": 22
    },
    {
      "epoch": 0.9787234042553191,
      "grad_norm": 5.008350372314453,
      "learning_rate": 8.013173181896283e-05,
      "loss": 3.1513,
      "step": 23
    },
    {
      "epoch": 1.0212765957446808,
      "grad_norm": 8.066261291503906,
      "learning_rate": 7.82568207211296e-05,
      "loss": 5.1511,
      "step": 24
    },
    {
      "epoch": 1.0638297872340425,
      "grad_norm": 2.237522840499878,
      "learning_rate": 7.63216081438678e-05,
      "loss": 2.0345,
      "step": 25
    },
    {
      "epoch": 1.0638297872340425,
      "eval_loss": 2.8090415000915527,
      "eval_runtime": 0.6742,
      "eval_samples_per_second": 29.666,
      "eval_steps_per_second": 14.833,
      "step": 25
    },
    {
      "epoch": 1.1063829787234043,
      "grad_norm": 3.5807905197143555,
      "learning_rate": 7.433022392834282e-05,
      "loss": 2.552,
      "step": 26
    },
    {
      "epoch": 1.148936170212766,
      "grad_norm": 3.804152250289917,
      "learning_rate": 7.228691778882693e-05,
      "loss": 2.7359,
      "step": 27
    },
    {
      "epoch": 1.1914893617021276,
      "grad_norm": 3.969632625579834,
      "learning_rate": 7.019605024359474e-05,
      "loss": 2.6821,
      "step": 28
    },
    {
      "epoch": 1.2340425531914894,
      "grad_norm": 3.872260808944702,
      "learning_rate": 6.806208330935766e-05,
      "loss": 2.7963,
      "step": 29
    },
    {
      "epoch": 1.2765957446808511,
      "grad_norm": 2.6370015144348145,
      "learning_rate": 6.588957097909508e-05,
      "loss": 2.4097,
      "step": 30
    },
    {
      "epoch": 1.3191489361702127,
      "grad_norm": 3.32016658782959,
      "learning_rate": 6.368314950360415e-05,
      "loss": 2.4904,
      "step": 31
    },
    {
      "epoch": 1.3617021276595744,
      "grad_norm": 4.197410583496094,
      "learning_rate": 6.14475274975067e-05,
      "loss": 2.5286,
      "step": 32
    },
    {
      "epoch": 1.4042553191489362,
      "grad_norm": 5.017968654632568,
      "learning_rate": 5.918747589082853e-05,
      "loss": 2.9064,
      "step": 33
    },
    {
      "epoch": 1.4468085106382977,
      "grad_norm": 3.416367292404175,
      "learning_rate": 5.6907817747594116e-05,
      "loss": 2.7989,
      "step": 34
    },
    {
      "epoch": 1.4893617021276595,
      "grad_norm": 2.9843363761901855,
      "learning_rate": 5.4613417973165106e-05,
      "loss": 2.2761,
      "step": 35
    },
    {
      "epoch": 1.5319148936170213,
      "grad_norm": 3.322655200958252,
      "learning_rate": 5.230917293228699e-05,
      "loss": 2.5173,
      "step": 36
    },
    {
      "epoch": 1.574468085106383,
      "grad_norm": 4.156716823577881,
      "learning_rate": 5e-05,
      "loss": 2.6914,
      "step": 37
    },
    {
      "epoch": 1.6170212765957448,
      "grad_norm": 5.216977596282959,
      "learning_rate": 4.7690827067713035e-05,
      "loss": 3.1488,
      "step": 38
    },
    {
      "epoch": 1.6595744680851063,
      "grad_norm": 3.87528133392334,
      "learning_rate": 4.5386582026834906e-05,
      "loss": 2.8607,
      "step": 39
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 2.6391236782073975,
      "learning_rate": 4.30921822524059e-05,
      "loss": 2.315,
      "step": 40
    },
    {
      "epoch": 1.7446808510638299,
      "grad_norm": 3.2835886478424072,
      "learning_rate": 4.0812524109171476e-05,
      "loss": 2.296,
      "step": 41
    },
    {
      "epoch": 1.7872340425531914,
      "grad_norm": 4.525584697723389,
      "learning_rate": 3.855247250249331e-05,
      "loss": 2.9909,
      "step": 42
    },
    {
      "epoch": 1.8297872340425532,
      "grad_norm": 5.08241605758667,
      "learning_rate": 3.631685049639586e-05,
      "loss": 2.5649,
      "step": 43
    },
    {
      "epoch": 1.872340425531915,
      "grad_norm": 3.8431763648986816,
      "learning_rate": 3.411042902090492e-05,
      "loss": 2.9495,
      "step": 44
    },
    {
      "epoch": 1.9148936170212765,
      "grad_norm": 2.8185856342315674,
      "learning_rate": 3.1937916690642356e-05,
      "loss": 2.326,
      "step": 45
    },
    {
      "epoch": 1.9574468085106385,
      "grad_norm": 3.852102518081665,
      "learning_rate": 2.980394975640526e-05,
      "loss": 2.4781,
      "step": 46
    },
    {
      "epoch": 2.0,
      "grad_norm": 9.235259056091309,
      "learning_rate": 2.771308221117309e-05,
      "loss": 4.1493,
      "step": 47
    },
    {
      "epoch": 2.0425531914893615,
      "grad_norm": 2.182204484939575,
      "learning_rate": 2.5669776071657192e-05,
      "loss": 2.431,
      "step": 48
    },
    {
      "epoch": 2.0851063829787235,
      "grad_norm": 2.5565974712371826,
      "learning_rate": 2.3678391856132204e-05,
      "loss": 2.2611,
      "step": 49
    },
    {
      "epoch": 2.127659574468085,
      "grad_norm": 3.7425453662872314,
      "learning_rate": 2.1743179278870407e-05,
      "loss": 2.5037,
      "step": 50
    },
    {
      "epoch": 2.127659574468085,
      "eval_loss": 2.8088538646698,
      "eval_runtime": 0.6893,
      "eval_samples_per_second": 29.014,
      "eval_steps_per_second": 14.507,
      "step": 50
    },
    {
      "epoch": 2.1702127659574466,
      "grad_norm": 4.252004623413086,
      "learning_rate": 1.9868268181037185e-05,
      "loss": 2.5753,
      "step": 51
    },
    {
      "epoch": 2.2127659574468086,
      "grad_norm": 4.096494197845459,
      "learning_rate": 1.8057659717401947e-05,
      "loss": 2.25,
      "step": 52
    },
    {
      "epoch": 2.25531914893617,
      "grad_norm": 2.2319579124450684,
      "learning_rate": 1.631521781767214e-05,
      "loss": 2.3128,
      "step": 53
    },
    {
      "epoch": 2.297872340425532,
      "grad_norm": 2.539842367172241,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 2.2327,
      "step": 54
    },
    {
      "epoch": 2.3404255319148937,
      "grad_norm": 3.147387981414795,
      "learning_rate": 1.3049554138967051e-05,
      "loss": 2.1849,
      "step": 55
    },
    {
      "epoch": 2.382978723404255,
      "grad_norm": 3.685150146484375,
      "learning_rate": 1.1533301450856054e-05,
      "loss": 2.4277,
      "step": 56
    },
    {
      "epoch": 2.425531914893617,
      "grad_norm": 4.715639591217041,
      "learning_rate": 1.0099138635988026e-05,
      "loss": 2.5731,
      "step": 57
    },
    {
      "epoch": 2.4680851063829787,
      "grad_norm": 2.3089892864227295,
      "learning_rate": 8.75012627008489e-06,
      "loss": 2.4525,
      "step": 58
    },
    {
      "epoch": 2.5106382978723403,
      "grad_norm": 2.630906105041504,
      "learning_rate": 7.489143213519301e-06,
      "loss": 2.1467,
      "step": 59
    },
    {
      "epoch": 2.5531914893617023,
      "grad_norm": 3.5077598094940186,
      "learning_rate": 6.318880467681526e-06,
      "loss": 2.4428,
      "step": 60
    },
    {
      "epoch": 2.595744680851064,
      "grad_norm": 3.928353786468506,
      "learning_rate": 5.241835432246889e-06,
      "loss": 2.2675,
      "step": 61
    },
    {
      "epoch": 2.6382978723404253,
      "grad_norm": 4.511261940002441,
      "learning_rate": 4.260306575598949e-06,
      "loss": 2.2792,
      "step": 62
    },
    {
      "epoch": 2.6808510638297873,
      "grad_norm": 2.4075710773468018,
      "learning_rate": 3.376388529782215e-06,
      "loss": 2.3584,
      "step": 63
    },
    {
      "epoch": 2.723404255319149,
      "grad_norm": 2.83492112159729,
      "learning_rate": 2.591967620451707e-06,
      "loss": 1.9989,
      "step": 64
    },
    {
      "epoch": 2.7659574468085104,
      "grad_norm": 3.5723989009857178,
      "learning_rate": 1.908717841359048e-06,
      "loss": 2.2559,
      "step": 65
    },
    {
      "epoch": 2.8085106382978724,
      "grad_norm": 3.7944796085357666,
      "learning_rate": 1.328097281965357e-06,
      "loss": 2.2629,
      "step": 66
    },
    {
      "epoch": 2.851063829787234,
      "grad_norm": 4.501901626586914,
      "learning_rate": 8.513450158049108e-07,
      "loss": 2.3413,
      "step": 67
    },
    {
      "epoch": 2.8936170212765955,
      "grad_norm": 2.496269464492798,
      "learning_rate": 4.794784562397458e-07,
      "loss": 2.2531,
      "step": 68
    },
    {
      "epoch": 2.9361702127659575,
      "grad_norm": 2.9496850967407227,
      "learning_rate": 2.1329118524827662e-07,
      "loss": 2.1338,
      "step": 69
    },
    {
      "epoch": 2.978723404255319,
      "grad_norm": 4.252143383026123,
      "learning_rate": 5.3351259881379014e-08,
      "loss": 2.4166,
      "step": 70
    },
    {
      "epoch": 3.021276595744681,
      "grad_norm": 7.078902721405029,
      "learning_rate": 0.0,
      "loss": 4.2177,
      "step": 71
    }
  ],
  "logging_steps": 1,
  "max_steps": 71,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.3899010909667328e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}