{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.009448818897637795,
  "eval_steps": 25,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00012598425196850394,
      "grad_norm": 4.49857759475708,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 1.1742,
      "step": 1
    },
    {
      "epoch": 0.00012598425196850394,
      "eval_loss": 1.215790033340454,
      "eval_runtime": 1680.5898,
      "eval_samples_per_second": 3.978,
      "eval_steps_per_second": 1.989,
      "step": 1
    },
    {
      "epoch": 0.0002519685039370079,
      "grad_norm": 4.2249956130981445,
      "learning_rate": 6.666666666666667e-05,
      "loss": 1.118,
      "step": 2
    },
    {
      "epoch": 0.0003779527559055118,
      "grad_norm": 3.5851938724517822,
      "learning_rate": 0.0001,
      "loss": 1.0507,
      "step": 3
    },
    {
      "epoch": 0.0005039370078740158,
      "grad_norm": 3.1795814037323,
      "learning_rate": 9.99524110790929e-05,
      "loss": 0.7279,
      "step": 4
    },
    {
      "epoch": 0.0006299212598425197,
      "grad_norm": 4.160511493682861,
      "learning_rate": 9.980973490458728e-05,
      "loss": 0.6767,
      "step": 5
    },
    {
      "epoch": 0.0007559055118110237,
      "grad_norm": 2.9607021808624268,
      "learning_rate": 9.957224306869053e-05,
      "loss": 0.5793,
      "step": 6
    },
    {
      "epoch": 0.0008818897637795275,
      "grad_norm": 1.9980815649032593,
      "learning_rate": 9.924038765061042e-05,
      "loss": 0.4986,
      "step": 7
    },
    {
      "epoch": 0.0010078740157480315,
      "grad_norm": 1.6180553436279297,
      "learning_rate": 9.881480035599667e-05,
      "loss": 0.4405,
      "step": 8
    },
    {
      "epoch": 0.0011338582677165355,
      "grad_norm": 1.7591142654418945,
      "learning_rate": 9.829629131445342e-05,
      "loss": 0.5017,
      "step": 9
    },
    {
      "epoch": 0.0012598425196850393,
      "grad_norm": 2.138441562652588,
      "learning_rate": 9.768584753741134e-05,
      "loss": 0.4375,
      "step": 10
    },
    {
      "epoch": 0.0013858267716535433,
      "grad_norm": 1.852003574371338,
      "learning_rate": 9.698463103929542e-05,
      "loss": 0.4284,
      "step": 11
    },
    {
      "epoch": 0.0015118110236220473,
      "grad_norm": 1.5829808712005615,
      "learning_rate": 9.619397662556435e-05,
      "loss": 0.5171,
      "step": 12
    },
    {
      "epoch": 0.001637795275590551,
      "grad_norm": 1.6981840133666992,
      "learning_rate": 9.53153893518325e-05,
      "loss": 0.492,
      "step": 13
    },
    {
      "epoch": 0.001763779527559055,
      "grad_norm": 1.1615180969238281,
      "learning_rate": 9.435054165891109e-05,
      "loss": 0.4131,
      "step": 14
    },
    {
      "epoch": 0.001889763779527559,
      "grad_norm": 1.3211286067962646,
      "learning_rate": 9.330127018922194e-05,
      "loss": 0.4482,
      "step": 15
    },
    {
      "epoch": 0.002015748031496063,
      "grad_norm": 1.4025088548660278,
      "learning_rate": 9.21695722906443e-05,
      "loss": 0.4023,
      "step": 16
    },
    {
      "epoch": 0.002141732283464567,
      "grad_norm": 1.4087685346603394,
      "learning_rate": 9.09576022144496e-05,
      "loss": 0.4651,
      "step": 17
    },
    {
      "epoch": 0.002267716535433071,
      "grad_norm": 1.3379063606262207,
      "learning_rate": 8.966766701456177e-05,
      "loss": 0.4122,
      "step": 18
    },
    {
      "epoch": 0.002393700787401575,
      "grad_norm": 1.4114474058151245,
      "learning_rate": 8.83022221559489e-05,
      "loss": 0.4775,
      "step": 19
    },
    {
      "epoch": 0.0025196850393700786,
      "grad_norm": 1.4185856580734253,
      "learning_rate": 8.68638668405062e-05,
      "loss": 0.4664,
      "step": 20
    },
    {
      "epoch": 0.002645669291338583,
      "grad_norm": 1.5691876411437988,
      "learning_rate": 8.535533905932738e-05,
      "loss": 0.5104,
      "step": 21
    },
    {
      "epoch": 0.0027716535433070866,
      "grad_norm": 1.127351999282837,
      "learning_rate": 8.377951038078302e-05,
      "loss": 0.43,
      "step": 22
    },
    {
      "epoch": 0.0028976377952755904,
      "grad_norm": 1.2459053993225098,
      "learning_rate": 8.213938048432697e-05,
      "loss": 0.4195,
      "step": 23
    },
    {
      "epoch": 0.0030236220472440946,
      "grad_norm": 1.2657814025878906,
      "learning_rate": 8.043807145043604e-05,
      "loss": 0.4536,
      "step": 24
    },
    {
      "epoch": 0.0031496062992125984,
      "grad_norm": 1.5992778539657593,
      "learning_rate": 7.86788218175523e-05,
      "loss": 0.4185,
      "step": 25
    },
    {
      "epoch": 0.0031496062992125984,
      "eval_loss": 0.48135948181152344,
      "eval_runtime": 1691.0237,
      "eval_samples_per_second": 3.953,
      "eval_steps_per_second": 1.977,
      "step": 25
    },
    {
      "epoch": 0.003275590551181102,
      "grad_norm": 1.4434717893600464,
      "learning_rate": 7.68649804173412e-05,
      "loss": 0.4383,
      "step": 26
    },
    {
      "epoch": 0.0034015748031496064,
      "grad_norm": 1.187625765800476,
      "learning_rate": 7.500000000000001e-05,
      "loss": 0.4064,
      "step": 27
    },
    {
      "epoch": 0.00352755905511811,
      "grad_norm": 1.288070797920227,
      "learning_rate": 7.308743066175172e-05,
      "loss": 0.3957,
      "step": 28
    },
    {
      "epoch": 0.0036535433070866144,
      "grad_norm": 1.3888908624649048,
      "learning_rate": 7.113091308703498e-05,
      "loss": 0.4349,
      "step": 29
    },
    {
      "epoch": 0.003779527559055118,
      "grad_norm": 1.2192840576171875,
      "learning_rate": 6.91341716182545e-05,
      "loss": 0.3789,
      "step": 30
    },
    {
      "epoch": 0.003905511811023622,
      "grad_norm": 1.7539172172546387,
      "learning_rate": 6.710100716628344e-05,
      "loss": 0.4327,
      "step": 31
    },
    {
      "epoch": 0.004031496062992126,
      "grad_norm": 1.2005337476730347,
      "learning_rate": 6.503528997521366e-05,
      "loss": 0.416,
      "step": 32
    },
    {
      "epoch": 0.00415748031496063,
      "grad_norm": 1.3382314443588257,
      "learning_rate": 6.294095225512603e-05,
      "loss": 0.4181,
      "step": 33
    },
    {
      "epoch": 0.004283464566929134,
      "grad_norm": 1.445595622062683,
      "learning_rate": 6.0821980696905146e-05,
      "loss": 0.4458,
      "step": 34
    },
    {
      "epoch": 0.004409448818897638,
      "grad_norm": 1.3078209161758423,
      "learning_rate": 5.868240888334653e-05,
      "loss": 0.3524,
      "step": 35
    },
    {
      "epoch": 0.004535433070866142,
      "grad_norm": 1.3248423337936401,
      "learning_rate": 5.6526309611002594e-05,
      "loss": 0.4059,
      "step": 36
    },
    {
      "epoch": 0.0046614173228346455,
      "grad_norm": 1.2014515399932861,
      "learning_rate": 5.435778713738292e-05,
      "loss": 0.3862,
      "step": 37
    },
    {
      "epoch": 0.00478740157480315,
      "grad_norm": 1.251442313194275,
      "learning_rate": 5.218096936826681e-05,
      "loss": 0.4217,
      "step": 38
    },
    {
      "epoch": 0.004913385826771654,
      "grad_norm": 1.549546480178833,
      "learning_rate": 5e-05,
      "loss": 0.3972,
      "step": 39
    },
    {
      "epoch": 0.005039370078740157,
      "grad_norm": 1.5308916568756104,
      "learning_rate": 4.781903063173321e-05,
      "loss": 0.4102,
      "step": 40
    },
    {
      "epoch": 0.0051653543307086614,
      "grad_norm": 1.7657283544540405,
      "learning_rate": 4.564221286261709e-05,
      "loss": 0.4259,
      "step": 41
    },
    {
      "epoch": 0.005291338582677166,
      "grad_norm": 1.9394474029541016,
      "learning_rate": 4.347369038899744e-05,
      "loss": 0.384,
      "step": 42
    },
    {
      "epoch": 0.005417322834645669,
      "grad_norm": 1.7663687467575073,
      "learning_rate": 4.131759111665349e-05,
      "loss": 0.3748,
      "step": 43
    },
    {
      "epoch": 0.005543307086614173,
      "grad_norm": 1.4035406112670898,
      "learning_rate": 3.917801930309486e-05,
      "loss": 0.3698,
      "step": 44
    },
    {
      "epoch": 0.005669291338582677,
      "grad_norm": 1.6968581676483154,
      "learning_rate": 3.705904774487396e-05,
      "loss": 0.4055,
      "step": 45
    },
    {
      "epoch": 0.005795275590551181,
      "grad_norm": 1.6501572132110596,
      "learning_rate": 3.4964710024786354e-05,
      "loss": 0.376,
      "step": 46
    },
    {
      "epoch": 0.005921259842519685,
      "grad_norm": 1.4387761354446411,
      "learning_rate": 3.289899283371657e-05,
      "loss": 0.3797,
      "step": 47
    },
    {
      "epoch": 0.006047244094488189,
      "grad_norm": 2.231095552444458,
      "learning_rate": 3.086582838174551e-05,
      "loss": 0.409,
      "step": 48
    },
    {
      "epoch": 0.0061732283464566926,
      "grad_norm": 2.0894861221313477,
      "learning_rate": 2.886908691296504e-05,
      "loss": 0.4164,
      "step": 49
    },
    {
      "epoch": 0.006299212598425197,
      "grad_norm": 2.8405890464782715,
      "learning_rate": 2.6912569338248315e-05,
      "loss": 0.4957,
      "step": 50
    },
    {
      "epoch": 0.006299212598425197,
      "eval_loss": 0.46602168679237366,
      "eval_runtime": 1691.0057,
      "eval_samples_per_second": 3.953,
      "eval_steps_per_second": 1.977,
      "step": 50
    },
    {
      "epoch": 0.006425196850393701,
      "grad_norm": 3.830735683441162,
      "learning_rate": 2.500000000000001e-05,
      "loss": 0.5254,
      "step": 51
    },
    {
      "epoch": 0.006551181102362204,
      "grad_norm": 4.045533657073975,
      "learning_rate": 2.3135019582658802e-05,
      "loss": 0.5314,
      "step": 52
    },
    {
      "epoch": 0.0066771653543307085,
      "grad_norm": 2.6203715801239014,
      "learning_rate": 2.132117818244771e-05,
      "loss": 0.4299,
      "step": 53
    },
    {
      "epoch": 0.006803149606299213,
      "grad_norm": 2.2109479904174805,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 0.4133,
      "step": 54
    },
    {
      "epoch": 0.006929133858267717,
      "grad_norm": 1.6112823486328125,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 0.3646,
      "step": 55
    },
    {
      "epoch": 0.00705511811023622,
      "grad_norm": 1.4806196689605713,
      "learning_rate": 1.622048961921699e-05,
      "loss": 0.3811,
      "step": 56
    },
    {
      "epoch": 0.0071811023622047245,
      "grad_norm": 1.619550108909607,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 0.3262,
      "step": 57
    },
    {
      "epoch": 0.007307086614173229,
      "grad_norm": 1.5869569778442383,
      "learning_rate": 1.3136133159493802e-05,
      "loss": 0.4286,
      "step": 58
    },
    {
      "epoch": 0.007433070866141732,
      "grad_norm": 1.411007046699524,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 0.3441,
      "step": 59
    },
    {
      "epoch": 0.007559055118110236,
      "grad_norm": 1.3262393474578857,
      "learning_rate": 1.0332332985438248e-05,
      "loss": 0.3322,
      "step": 60
    },
    {
      "epoch": 0.0076850393700787405,
      "grad_norm": 1.3001681566238403,
      "learning_rate": 9.042397785550405e-06,
      "loss": 0.3168,
      "step": 61
    },
    {
      "epoch": 0.007811023622047244,
      "grad_norm": 1.0694575309753418,
      "learning_rate": 7.830427709355725e-06,
      "loss": 0.352,
      "step": 62
    },
    {
      "epoch": 0.007937007874015748,
      "grad_norm": 1.2116413116455078,
      "learning_rate": 6.698729810778065e-06,
      "loss": 0.3719,
      "step": 63
    },
    {
      "epoch": 0.008062992125984252,
      "grad_norm": 1.154064655303955,
      "learning_rate": 5.649458341088915e-06,
      "loss": 0.3342,
      "step": 64
    },
    {
      "epoch": 0.008188976377952756,
      "grad_norm": 0.9787663817405701,
      "learning_rate": 4.684610648167503e-06,
      "loss": 0.3416,
      "step": 65
    },
    {
      "epoch": 0.00831496062992126,
      "grad_norm": 1.04434335231781,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 0.3214,
      "step": 66
    },
    {
      "epoch": 0.008440944881889763,
      "grad_norm": 1.0488498210906982,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 0.32,
      "step": 67
    },
    {
      "epoch": 0.008566929133858267,
      "grad_norm": 1.2639235258102417,
      "learning_rate": 2.314152462588659e-06,
      "loss": 0.3507,
      "step": 68
    },
    {
      "epoch": 0.008692913385826772,
      "grad_norm": 1.2547246217727661,
      "learning_rate": 1.70370868554659e-06,
      "loss": 0.3217,
      "step": 69
    },
    {
      "epoch": 0.008818897637795276,
      "grad_norm": 1.1020681858062744,
      "learning_rate": 1.1851996440033319e-06,
      "loss": 0.328,
      "step": 70
    },
    {
      "epoch": 0.00894488188976378,
      "grad_norm": 1.242504596710205,
      "learning_rate": 7.596123493895991e-07,
      "loss": 0.3537,
      "step": 71
    },
    {
      "epoch": 0.009070866141732284,
      "grad_norm": 1.1716359853744507,
      "learning_rate": 4.277569313094809e-07,
      "loss": 0.3599,
      "step": 72
    },
    {
      "epoch": 0.009196850393700787,
      "grad_norm": 0.999187171459198,
      "learning_rate": 1.9026509541272275e-07,
      "loss": 0.3442,
      "step": 73
    },
    {
      "epoch": 0.009322834645669291,
      "grad_norm": 1.0128037929534912,
      "learning_rate": 4.7588920907110094e-08,
      "loss": 0.3309,
      "step": 74
    },
    {
      "epoch": 0.009448818897637795,
      "grad_norm": 1.1647950410842896,
      "learning_rate": 0.0,
      "loss": 0.347,
      "step": 75
    },
    {
      "epoch": 0.009448818897637795,
      "eval_loss": 0.3494592607021332,
      "eval_runtime": 1690.2407,
      "eval_samples_per_second": 3.955,
      "eval_steps_per_second": 1.978,
      "step": 75
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.119015678246912e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}