{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.21986075485525833,
  "eval_steps": 25,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0029314767314034445,
      "grad_norm": 4.896785259246826,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.4996,
      "step": 1
    },
    {
      "epoch": 0.0029314767314034445,
      "eval_loss": 1.0614820718765259,
      "eval_runtime": 20.3602,
      "eval_samples_per_second": 14.145,
      "eval_steps_per_second": 7.073,
      "step": 1
    },
    {
      "epoch": 0.005862953462806889,
      "grad_norm": 2.8189775943756104,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.1143,
      "step": 2
    },
    {
      "epoch": 0.008794430194210334,
      "grad_norm": 2.35616135597229,
      "learning_rate": 0.0001,
      "loss": 0.8383,
      "step": 3
    },
    {
      "epoch": 0.011725906925613778,
      "grad_norm": 1.8003149032592773,
      "learning_rate": 9.99524110790929e-05,
      "loss": 0.6748,
      "step": 4
    },
    {
      "epoch": 0.014657383657017223,
      "grad_norm": 2.207160472869873,
      "learning_rate": 9.980973490458728e-05,
      "loss": 0.7224,
      "step": 5
    },
    {
      "epoch": 0.01758886038842067,
      "grad_norm": 2.059014320373535,
      "learning_rate": 9.957224306869053e-05,
      "loss": 0.7348,
      "step": 6
    },
    {
      "epoch": 0.020520337119824112,
      "grad_norm": 1.301095962524414,
      "learning_rate": 9.924038765061042e-05,
      "loss": 0.4525,
      "step": 7
    },
    {
      "epoch": 0.023451813851227556,
      "grad_norm": 1.3723230361938477,
      "learning_rate": 9.881480035599667e-05,
      "loss": 0.4029,
      "step": 8
    },
    {
      "epoch": 0.026383290582631,
      "grad_norm": 1.4173338413238525,
      "learning_rate": 9.829629131445342e-05,
      "loss": 0.3076,
      "step": 9
    },
    {
      "epoch": 0.029314767314034446,
      "grad_norm": 1.3965848684310913,
      "learning_rate": 9.768584753741134e-05,
      "loss": 0.2697,
      "step": 10
    },
    {
      "epoch": 0.03224624404543789,
      "grad_norm": 1.271812081336975,
      "learning_rate": 9.698463103929542e-05,
      "loss": 0.2067,
      "step": 11
    },
    {
      "epoch": 0.03517772077684134,
      "grad_norm": 1.1979728937149048,
      "learning_rate": 9.619397662556435e-05,
      "loss": 0.2204,
      "step": 12
    },
    {
      "epoch": 0.03810919750824478,
      "grad_norm": 1.1956003904342651,
      "learning_rate": 9.53153893518325e-05,
      "loss": 0.1251,
      "step": 13
    },
    {
      "epoch": 0.041040674239648224,
      "grad_norm": 0.7809640765190125,
      "learning_rate": 9.435054165891109e-05,
      "loss": 0.0761,
      "step": 14
    },
    {
      "epoch": 0.043972150971051664,
      "grad_norm": 0.9528048634529114,
      "learning_rate": 9.330127018922194e-05,
      "loss": 0.1389,
      "step": 15
    },
    {
      "epoch": 0.04690362770245511,
      "grad_norm": 0.7418190240859985,
      "learning_rate": 9.21695722906443e-05,
      "loss": 0.0984,
      "step": 16
    },
    {
      "epoch": 0.04983510443385856,
      "grad_norm": 0.8109403252601624,
      "learning_rate": 9.09576022144496e-05,
      "loss": 0.1611,
      "step": 17
    },
    {
      "epoch": 0.052766581165262,
      "grad_norm": 0.6052976846694946,
      "learning_rate": 8.966766701456177e-05,
      "loss": 0.0726,
      "step": 18
    },
    {
      "epoch": 0.055698057896665445,
      "grad_norm": 0.6859976053237915,
      "learning_rate": 8.83022221559489e-05,
      "loss": 0.0853,
      "step": 19
    },
    {
      "epoch": 0.05862953462806889,
      "grad_norm": 0.8153113126754761,
      "learning_rate": 8.68638668405062e-05,
      "loss": 0.133,
      "step": 20
    },
    {
      "epoch": 0.06156101135947233,
      "grad_norm": 0.6893975734710693,
      "learning_rate": 8.535533905932738e-05,
      "loss": 0.0523,
      "step": 21
    },
    {
      "epoch": 0.06449248809087578,
      "grad_norm": 0.7115774750709534,
      "learning_rate": 8.377951038078302e-05,
      "loss": 0.0751,
      "step": 22
    },
    {
      "epoch": 0.06742396482227922,
      "grad_norm": 0.7574334740638733,
      "learning_rate": 8.213938048432697e-05,
      "loss": 0.0751,
      "step": 23
    },
    {
      "epoch": 0.07035544155368267,
      "grad_norm": 0.7631270289421082,
      "learning_rate": 8.043807145043604e-05,
      "loss": 0.091,
      "step": 24
    },
    {
      "epoch": 0.07328691828508611,
      "grad_norm": 0.4982626438140869,
      "learning_rate": 7.86788218175523e-05,
      "loss": 0.0477,
      "step": 25
    },
    {
      "epoch": 0.07328691828508611,
      "eval_loss": 0.08081666380167007,
      "eval_runtime": 20.8896,
      "eval_samples_per_second": 13.787,
      "eval_steps_per_second": 6.893,
      "step": 25
    },
    {
      "epoch": 0.07621839501648955,
      "grad_norm": 0.8068152666091919,
      "learning_rate": 7.68649804173412e-05,
      "loss": 0.1082,
      "step": 26
    },
    {
      "epoch": 0.079149871747893,
      "grad_norm": 0.3823889493942261,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.0364,
      "step": 27
    },
    {
      "epoch": 0.08208134847929645,
      "grad_norm": 0.9048523306846619,
      "learning_rate": 7.308743066175172e-05,
      "loss": 0.1366,
      "step": 28
    },
    {
      "epoch": 0.08501282521069989,
      "grad_norm": 0.8256965279579163,
      "learning_rate": 7.113091308703498e-05,
      "loss": 0.107,
      "step": 29
    },
    {
      "epoch": 0.08794430194210333,
      "grad_norm": 0.5980184078216553,
      "learning_rate": 6.91341716182545e-05,
      "loss": 0.0635,
      "step": 30
    },
    {
      "epoch": 0.09087577867350678,
      "grad_norm": 0.5219781398773193,
      "learning_rate": 6.710100716628344e-05,
      "loss": 0.0253,
      "step": 31
    },
    {
      "epoch": 0.09380725540491022,
      "grad_norm": 0.6775377988815308,
      "learning_rate": 6.503528997521366e-05,
      "loss": 0.1005,
      "step": 32
    },
    {
      "epoch": 0.09673873213631366,
      "grad_norm": 0.5344719886779785,
      "learning_rate": 6.294095225512603e-05,
      "loss": 0.0726,
      "step": 33
    },
    {
      "epoch": 0.09967020886771712,
      "grad_norm": 0.684424877166748,
      "learning_rate": 6.0821980696905146e-05,
      "loss": 0.0762,
      "step": 34
    },
    {
      "epoch": 0.10260168559912056,
      "grad_norm": 0.47931015491485596,
      "learning_rate": 5.868240888334653e-05,
      "loss": 0.053,
      "step": 35
    },
    {
      "epoch": 0.105533162330524,
      "grad_norm": 0.48746219277381897,
      "learning_rate": 5.6526309611002594e-05,
      "loss": 0.0769,
      "step": 36
    },
    {
      "epoch": 0.10846463906192745,
      "grad_norm": 0.41026896238327026,
      "learning_rate": 5.435778713738292e-05,
      "loss": 0.0623,
      "step": 37
    },
    {
      "epoch": 0.11139611579333089,
      "grad_norm": 0.6301044821739197,
      "learning_rate": 5.218096936826681e-05,
      "loss": 0.0854,
      "step": 38
    },
    {
      "epoch": 0.11432759252473433,
      "grad_norm": 0.43072110414505005,
      "learning_rate": 5e-05,
      "loss": 0.0488,
      "step": 39
    },
    {
      "epoch": 0.11725906925613779,
      "grad_norm": 0.49057018756866455,
      "learning_rate": 4.781903063173321e-05,
      "loss": 0.0468,
      "step": 40
    },
    {
      "epoch": 0.12019054598754123,
      "grad_norm": 0.541214108467102,
      "learning_rate": 4.564221286261709e-05,
      "loss": 0.1181,
      "step": 41
    },
    {
      "epoch": 0.12312202271894467,
      "grad_norm": 0.4960033893585205,
      "learning_rate": 4.347369038899744e-05,
      "loss": 0.0367,
      "step": 42
    },
    {
      "epoch": 0.1260534994503481,
      "grad_norm": 0.46519333124160767,
      "learning_rate": 4.131759111665349e-05,
      "loss": 0.0634,
      "step": 43
    },
    {
      "epoch": 0.12898497618175156,
      "grad_norm": 0.5953696966171265,
      "learning_rate": 3.917801930309486e-05,
      "loss": 0.0357,
      "step": 44
    },
    {
      "epoch": 0.131916452913155,
      "grad_norm": 0.4260966181755066,
      "learning_rate": 3.705904774487396e-05,
      "loss": 0.0504,
      "step": 45
    },
    {
      "epoch": 0.13484792964455844,
      "grad_norm": 0.35495659708976746,
      "learning_rate": 3.4964710024786354e-05,
      "loss": 0.0461,
      "step": 46
    },
    {
      "epoch": 0.1377794063759619,
      "grad_norm": 0.492147296667099,
      "learning_rate": 3.289899283371657e-05,
      "loss": 0.0833,
      "step": 47
    },
    {
      "epoch": 0.14071088310736535,
      "grad_norm": 0.47650235891342163,
      "learning_rate": 3.086582838174551e-05,
      "loss": 0.0771,
      "step": 48
    },
    {
      "epoch": 0.14364235983876877,
      "grad_norm": 0.8814830780029297,
      "learning_rate": 2.886908691296504e-05,
      "loss": 0.1376,
      "step": 49
    },
    {
      "epoch": 0.14657383657017223,
      "grad_norm": 0.7530894875526428,
      "learning_rate": 2.6912569338248315e-05,
      "loss": 0.0946,
      "step": 50
    },
    {
      "epoch": 0.14657383657017223,
      "eval_loss": 0.06695578992366791,
      "eval_runtime": 20.9098,
      "eval_samples_per_second": 13.773,
      "eval_steps_per_second": 6.887,
      "step": 50
    },
    {
      "epoch": 0.14950531330157568,
      "grad_norm": 1.3906869888305664,
      "learning_rate": 2.500000000000001e-05,
      "loss": 0.2653,
      "step": 51
    },
    {
      "epoch": 0.1524367900329791,
      "grad_norm": 0.8645505309104919,
      "learning_rate": 2.3135019582658802e-05,
      "loss": 0.0972,
      "step": 52
    },
    {
      "epoch": 0.15536826676438256,
      "grad_norm": 0.3462808430194855,
      "learning_rate": 2.132117818244771e-05,
      "loss": 0.0178,
      "step": 53
    },
    {
      "epoch": 0.158299743495786,
      "grad_norm": 0.8373314142227173,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 0.1511,
      "step": 54
    },
    {
      "epoch": 0.16123122022718944,
      "grad_norm": 0.5848434567451477,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 0.0587,
      "step": 55
    },
    {
      "epoch": 0.1641626969585929,
      "grad_norm": 0.7180687785148621,
      "learning_rate": 1.622048961921699e-05,
      "loss": 0.1289,
      "step": 56
    },
    {
      "epoch": 0.16709417368999632,
      "grad_norm": 0.4832279086112976,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 0.0529,
      "step": 57
    },
    {
      "epoch": 0.17002565042139978,
      "grad_norm": 0.5206469893455505,
      "learning_rate": 1.3136133159493802e-05,
      "loss": 0.054,
      "step": 58
    },
    {
      "epoch": 0.17295712715280323,
      "grad_norm": 0.3522478938102722,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 0.0454,
      "step": 59
    },
    {
      "epoch": 0.17588860388420666,
      "grad_norm": 0.6369943022727966,
      "learning_rate": 1.0332332985438248e-05,
      "loss": 0.0807,
      "step": 60
    },
    {
      "epoch": 0.1788200806156101,
      "grad_norm": 0.4056720435619354,
      "learning_rate": 9.042397785550405e-06,
      "loss": 0.0568,
      "step": 61
    },
    {
      "epoch": 0.18175155734701356,
      "grad_norm": 0.4317684471607208,
      "learning_rate": 7.830427709355725e-06,
      "loss": 0.0559,
      "step": 62
    },
    {
      "epoch": 0.184683034078417,
      "grad_norm": 0.5188791751861572,
      "learning_rate": 6.698729810778065e-06,
      "loss": 0.0894,
      "step": 63
    },
    {
      "epoch": 0.18761451080982045,
      "grad_norm": 0.4226022958755493,
      "learning_rate": 5.649458341088915e-06,
      "loss": 0.0641,
      "step": 64
    },
    {
      "epoch": 0.1905459875412239,
      "grad_norm": 0.6744760870933533,
      "learning_rate": 4.684610648167503e-06,
      "loss": 0.0995,
      "step": 65
    },
    {
      "epoch": 0.19347746427262733,
      "grad_norm": 0.36501771211624146,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 0.0554,
      "step": 66
    },
    {
      "epoch": 0.19640894100403078,
      "grad_norm": 0.4740172326564789,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 0.0403,
      "step": 67
    },
    {
      "epoch": 0.19934041773543423,
      "grad_norm": 0.37339267134666443,
      "learning_rate": 2.314152462588659e-06,
      "loss": 0.0429,
      "step": 68
    },
    {
      "epoch": 0.20227189446683766,
      "grad_norm": 0.46301230788230896,
      "learning_rate": 1.70370868554659e-06,
      "loss": 0.0245,
      "step": 69
    },
    {
      "epoch": 0.2052033711982411,
      "grad_norm": 0.37362566590309143,
      "learning_rate": 1.1851996440033319e-06,
      "loss": 0.0502,
      "step": 70
    },
    {
      "epoch": 0.20813484792964457,
      "grad_norm": 0.3411478102207184,
      "learning_rate": 7.596123493895991e-07,
      "loss": 0.0327,
      "step": 71
    },
    {
      "epoch": 0.211066324661048,
      "grad_norm": 0.6511849761009216,
      "learning_rate": 4.277569313094809e-07,
      "loss": 0.0708,
      "step": 72
    },
    {
      "epoch": 0.21399780139245145,
      "grad_norm": 0.4536079168319702,
      "learning_rate": 1.9026509541272275e-07,
      "loss": 0.05,
      "step": 73
    },
    {
      "epoch": 0.2169292781238549,
      "grad_norm": 0.5318543910980225,
      "learning_rate": 4.7588920907110094e-08,
      "loss": 0.0867,
      "step": 74
    },
    {
      "epoch": 0.21986075485525833,
      "grad_norm": 0.3312048017978668,
      "learning_rate": 0.0,
      "loss": 0.0279,
      "step": 75
    },
    {
      "epoch": 0.21986075485525833,
      "eval_loss": 0.06166088581085205,
      "eval_runtime": 20.9069,
      "eval_samples_per_second": 13.775,
      "eval_steps_per_second": 6.888,
      "step": 75
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.5860955711799296e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}