{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009448818897637795, "eval_steps": 25, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012598425196850394, "grad_norm": 4.49857759475708, "learning_rate": 3.3333333333333335e-05, "loss": 1.1742, "step": 1 }, { "epoch": 0.00012598425196850394, "eval_loss": 1.215790033340454, "eval_runtime": 1680.5898, "eval_samples_per_second": 3.978, "eval_steps_per_second": 1.989, "step": 1 }, { "epoch": 0.0002519685039370079, "grad_norm": 4.2249956130981445, "learning_rate": 6.666666666666667e-05, "loss": 1.118, "step": 2 }, { "epoch": 0.0003779527559055118, "grad_norm": 3.5851938724517822, "learning_rate": 0.0001, "loss": 1.0507, "step": 3 }, { "epoch": 0.0005039370078740158, "grad_norm": 3.1795814037323, "learning_rate": 9.99524110790929e-05, "loss": 0.7279, "step": 4 }, { "epoch": 0.0006299212598425197, "grad_norm": 4.160511493682861, "learning_rate": 9.980973490458728e-05, "loss": 0.6767, "step": 5 }, { "epoch": 0.0007559055118110237, "grad_norm": 2.9607021808624268, "learning_rate": 9.957224306869053e-05, "loss": 0.5793, "step": 6 }, { "epoch": 0.0008818897637795275, "grad_norm": 1.9980815649032593, "learning_rate": 9.924038765061042e-05, "loss": 0.4986, "step": 7 }, { "epoch": 0.0010078740157480315, "grad_norm": 1.6180553436279297, "learning_rate": 9.881480035599667e-05, "loss": 0.4405, "step": 8 }, { "epoch": 0.0011338582677165355, "grad_norm": 1.7591142654418945, "learning_rate": 9.829629131445342e-05, "loss": 0.5017, "step": 9 }, { "epoch": 0.0012598425196850393, "grad_norm": 2.138441562652588, "learning_rate": 9.768584753741134e-05, "loss": 0.4375, "step": 10 }, { "epoch": 0.0013858267716535433, "grad_norm": 1.852003574371338, "learning_rate": 9.698463103929542e-05, "loss": 0.4284, "step": 11 }, { "epoch": 0.0015118110236220473, "grad_norm": 1.5829808712005615, "learning_rate": 9.619397662556435e-05, "loss": 0.5171, "step": 12 }, { "epoch": 0.001637795275590551, "grad_norm": 1.6981840133666992, "learning_rate": 9.53153893518325e-05, "loss": 0.492, "step": 13 }, { "epoch": 0.001763779527559055, "grad_norm": 1.1615180969238281, "learning_rate": 9.435054165891109e-05, "loss": 0.4131, "step": 14 }, { "epoch": 0.001889763779527559, "grad_norm": 1.3211286067962646, "learning_rate": 9.330127018922194e-05, "loss": 0.4482, "step": 15 }, { "epoch": 0.002015748031496063, "grad_norm": 1.4025088548660278, "learning_rate": 9.21695722906443e-05, "loss": 0.4023, "step": 16 }, { "epoch": 0.002141732283464567, "grad_norm": 1.4087685346603394, "learning_rate": 9.09576022144496e-05, "loss": 0.4651, "step": 17 }, { "epoch": 0.002267716535433071, "grad_norm": 1.3379063606262207, "learning_rate": 8.966766701456177e-05, "loss": 0.4122, "step": 18 }, { "epoch": 0.002393700787401575, "grad_norm": 1.4114474058151245, "learning_rate": 8.83022221559489e-05, "loss": 0.4775, "step": 19 }, { "epoch": 0.0025196850393700786, "grad_norm": 1.4185856580734253, "learning_rate": 8.68638668405062e-05, "loss": 0.4664, "step": 20 }, { "epoch": 0.002645669291338583, "grad_norm": 1.5691876411437988, "learning_rate": 8.535533905932738e-05, "loss": 0.5104, "step": 21 }, { "epoch": 0.0027716535433070866, "grad_norm": 1.127351999282837, "learning_rate": 8.377951038078302e-05, "loss": 0.43, "step": 22 }, { "epoch": 0.0028976377952755904, "grad_norm": 1.2459053993225098, "learning_rate": 8.213938048432697e-05, "loss": 0.4195, "step": 23 }, { "epoch": 0.0030236220472440946, "grad_norm": 
1.2657814025878906, "learning_rate": 8.043807145043604e-05, "loss": 0.4536, "step": 24 }, { "epoch": 0.0031496062992125984, "grad_norm": 1.5992778539657593, "learning_rate": 7.86788218175523e-05, "loss": 0.4185, "step": 25 }, { "epoch": 0.0031496062992125984, "eval_loss": 0.48135948181152344, "eval_runtime": 1691.0237, "eval_samples_per_second": 3.953, "eval_steps_per_second": 1.977, "step": 25 }, { "epoch": 0.003275590551181102, "grad_norm": 1.4434717893600464, "learning_rate": 7.68649804173412e-05, "loss": 0.4383, "step": 26 }, { "epoch": 0.0034015748031496064, "grad_norm": 1.187625765800476, "learning_rate": 7.500000000000001e-05, "loss": 0.4064, "step": 27 }, { "epoch": 0.00352755905511811, "grad_norm": 1.288070797920227, "learning_rate": 7.308743066175172e-05, "loss": 0.3957, "step": 28 }, { "epoch": 0.0036535433070866144, "grad_norm": 1.3888908624649048, "learning_rate": 7.113091308703498e-05, "loss": 0.4349, "step": 29 }, { "epoch": 0.003779527559055118, "grad_norm": 1.2192840576171875, "learning_rate": 6.91341716182545e-05, "loss": 0.3789, "step": 30 }, { "epoch": 0.003905511811023622, "grad_norm": 1.7539172172546387, "learning_rate": 6.710100716628344e-05, "loss": 0.4327, "step": 31 }, { "epoch": 0.004031496062992126, "grad_norm": 1.2005337476730347, "learning_rate": 6.503528997521366e-05, "loss": 0.416, "step": 32 }, { "epoch": 0.00415748031496063, "grad_norm": 1.3382314443588257, "learning_rate": 6.294095225512603e-05, "loss": 0.4181, "step": 33 }, { "epoch": 0.004283464566929134, "grad_norm": 1.445595622062683, "learning_rate": 6.0821980696905146e-05, "loss": 0.4458, "step": 34 }, { "epoch": 0.004409448818897638, "grad_norm": 1.3078209161758423, "learning_rate": 5.868240888334653e-05, "loss": 0.3524, "step": 35 }, { "epoch": 0.004535433070866142, "grad_norm": 1.3248423337936401, "learning_rate": 5.6526309611002594e-05, "loss": 0.4059, "step": 36 }, { "epoch": 0.0046614173228346455, "grad_norm": 1.2014515399932861, "learning_rate": 5.435778713738292e-05, "loss": 0.3862, "step": 37 }, { "epoch": 0.00478740157480315, "grad_norm": 1.251442313194275, "learning_rate": 5.218096936826681e-05, "loss": 0.4217, "step": 38 }, { "epoch": 0.004913385826771654, "grad_norm": 1.549546480178833, "learning_rate": 5e-05, "loss": 0.3972, "step": 39 }, { "epoch": 0.005039370078740157, "grad_norm": 1.5308916568756104, "learning_rate": 4.781903063173321e-05, "loss": 0.4102, "step": 40 }, { "epoch": 0.0051653543307086614, "grad_norm": 1.7657283544540405, "learning_rate": 4.564221286261709e-05, "loss": 0.4259, "step": 41 }, { "epoch": 0.005291338582677166, "grad_norm": 1.9394474029541016, "learning_rate": 4.347369038899744e-05, "loss": 0.384, "step": 42 }, { "epoch": 0.005417322834645669, "grad_norm": 1.7663687467575073, "learning_rate": 4.131759111665349e-05, "loss": 0.3748, "step": 43 }, { "epoch": 0.005543307086614173, "grad_norm": 1.4035406112670898, "learning_rate": 3.917801930309486e-05, "loss": 0.3698, "step": 44 }, { "epoch": 0.005669291338582677, "grad_norm": 1.6968581676483154, "learning_rate": 3.705904774487396e-05, "loss": 0.4055, "step": 45 }, { "epoch": 0.005795275590551181, "grad_norm": 1.6501572132110596, "learning_rate": 3.4964710024786354e-05, "loss": 0.376, "step": 46 }, { "epoch": 0.005921259842519685, "grad_norm": 1.4387761354446411, "learning_rate": 3.289899283371657e-05, "loss": 0.3797, "step": 47 }, { "epoch": 0.006047244094488189, "grad_norm": 2.231095552444458, "learning_rate": 3.086582838174551e-05, "loss": 0.409, "step": 48 }, { "epoch": 0.0061732283464566926, "grad_norm": 
2.0894861221313477, "learning_rate": 2.886908691296504e-05, "loss": 0.4164, "step": 49 }, { "epoch": 0.006299212598425197, "grad_norm": 2.8405890464782715, "learning_rate": 2.6912569338248315e-05, "loss": 0.4957, "step": 50 }, { "epoch": 0.006299212598425197, "eval_loss": 0.46602168679237366, "eval_runtime": 1691.0057, "eval_samples_per_second": 3.953, "eval_steps_per_second": 1.977, "step": 50 }, { "epoch": 0.006425196850393701, "grad_norm": 3.830735683441162, "learning_rate": 2.500000000000001e-05, "loss": 0.5254, "step": 51 }, { "epoch": 0.006551181102362204, "grad_norm": 4.045533657073975, "learning_rate": 2.3135019582658802e-05, "loss": 0.5314, "step": 52 }, { "epoch": 0.0066771653543307085, "grad_norm": 2.6203715801239014, "learning_rate": 2.132117818244771e-05, "loss": 0.4299, "step": 53 }, { "epoch": 0.006803149606299213, "grad_norm": 2.2109479904174805, "learning_rate": 1.9561928549563968e-05, "loss": 0.4133, "step": 54 }, { "epoch": 0.006929133858267717, "grad_norm": 1.6112823486328125, "learning_rate": 1.7860619515673033e-05, "loss": 0.3646, "step": 55 }, { "epoch": 0.00705511811023622, "grad_norm": 1.4806196689605713, "learning_rate": 1.622048961921699e-05, "loss": 0.3811, "step": 56 }, { "epoch": 0.0071811023622047245, "grad_norm": 1.619550108909607, "learning_rate": 1.4644660940672627e-05, "loss": 0.3262, "step": 57 }, { "epoch": 0.007307086614173229, "grad_norm": 1.5869569778442383, "learning_rate": 1.3136133159493802e-05, "loss": 0.4286, "step": 58 }, { "epoch": 0.007433070866141732, "grad_norm": 1.411007046699524, "learning_rate": 1.1697777844051105e-05, "loss": 0.3441, "step": 59 }, { "epoch": 0.007559055118110236, "grad_norm": 1.3262393474578857, "learning_rate": 1.0332332985438248e-05, "loss": 0.3322, "step": 60 }, { "epoch": 0.0076850393700787405, "grad_norm": 1.3001681566238403, "learning_rate": 9.042397785550405e-06, "loss": 0.3168, "step": 61 }, { "epoch": 0.007811023622047244, "grad_norm": 1.0694575309753418, "learning_rate": 7.830427709355725e-06, "loss": 0.352, "step": 62 }, { "epoch": 0.007937007874015748, "grad_norm": 1.2116413116455078, "learning_rate": 6.698729810778065e-06, "loss": 0.3719, "step": 63 }, { "epoch": 0.008062992125984252, "grad_norm": 1.154064655303955, "learning_rate": 5.649458341088915e-06, "loss": 0.3342, "step": 64 }, { "epoch": 0.008188976377952756, "grad_norm": 0.9787663817405701, "learning_rate": 4.684610648167503e-06, "loss": 0.3416, "step": 65 }, { "epoch": 0.00831496062992126, "grad_norm": 1.04434335231781, "learning_rate": 3.8060233744356633e-06, "loss": 0.3214, "step": 66 }, { "epoch": 0.008440944881889763, "grad_norm": 1.0488498210906982, "learning_rate": 3.0153689607045845e-06, "loss": 0.32, "step": 67 }, { "epoch": 0.008566929133858267, "grad_norm": 1.2639235258102417, "learning_rate": 2.314152462588659e-06, "loss": 0.3507, "step": 68 }, { "epoch": 0.008692913385826772, "grad_norm": 1.2547246217727661, "learning_rate": 1.70370868554659e-06, "loss": 0.3217, "step": 69 }, { "epoch": 0.008818897637795276, "grad_norm": 1.1020681858062744, "learning_rate": 1.1851996440033319e-06, "loss": 0.328, "step": 70 }, { "epoch": 0.00894488188976378, "grad_norm": 1.242504596710205, "learning_rate": 7.596123493895991e-07, "loss": 0.3537, "step": 71 }, { "epoch": 0.009070866141732284, "grad_norm": 1.1716359853744507, "learning_rate": 4.277569313094809e-07, "loss": 0.3599, "step": 72 }, { "epoch": 0.009196850393700787, "grad_norm": 0.999187171459198, "learning_rate": 1.9026509541272275e-07, "loss": 0.3442, "step": 73 }, { "epoch": 
0.009322834645669291, "grad_norm": 1.0128037929534912, "learning_rate": 4.7588920907110094e-08, "loss": 0.3309, "step": 74 }, { "epoch": 0.009448818897637795, "grad_norm": 1.1647950410842896, "learning_rate": 0.0, "loss": 0.347, "step": 75 }, { "epoch": 0.009448818897637795, "eval_loss": 0.3494592607021332, "eval_runtime": 1690.2407, "eval_samples_per_second": 3.955, "eval_steps_per_second": 1.978, "step": 75 } ], "logging_steps": 1, "max_steps": 75, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.119015678246912e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }