{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.017543859649123, "eval_steps": 8, "global_step": 86, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03508771929824561, "grad_norm": 0.03587839752435684, "learning_rate": 1e-05, "loss": 10.3701, "step": 1 }, { "epoch": 0.03508771929824561, "eval_loss": 10.366876602172852, "eval_runtime": 0.0516, "eval_samples_per_second": 1858.963, "eval_steps_per_second": 58.093, "step": 1 }, { "epoch": 0.07017543859649122, "grad_norm": 0.03268265724182129, "learning_rate": 2e-05, "loss": 10.3724, "step": 2 }, { "epoch": 0.10526315789473684, "grad_norm": 0.03806426748633385, "learning_rate": 3e-05, "loss": 10.372, "step": 3 }, { "epoch": 0.14035087719298245, "grad_norm": 0.03297482430934906, "learning_rate": 4e-05, "loss": 10.3723, "step": 4 }, { "epoch": 0.17543859649122806, "grad_norm": 0.03713623434305191, "learning_rate": 5e-05, "loss": 10.3692, "step": 5 }, { "epoch": 0.21052631578947367, "grad_norm": 0.03266875818371773, "learning_rate": 6e-05, "loss": 10.3731, "step": 6 }, { "epoch": 0.24561403508771928, "grad_norm": 0.0323638878762722, "learning_rate": 7e-05, "loss": 10.3711, "step": 7 }, { "epoch": 0.2807017543859649, "grad_norm": 0.0312843956053257, "learning_rate": 8e-05, "loss": 10.3713, "step": 8 }, { "epoch": 0.2807017543859649, "eval_loss": 10.366599082946777, "eval_runtime": 0.0477, "eval_samples_per_second": 2012.954, "eval_steps_per_second": 62.905, "step": 8 }, { "epoch": 0.3157894736842105, "grad_norm": 0.03661629185080528, "learning_rate": 9e-05, "loss": 10.3736, "step": 9 }, { "epoch": 0.3508771929824561, "grad_norm": 0.03309982642531395, "learning_rate": 0.0001, "loss": 10.3698, "step": 10 }, { "epoch": 0.38596491228070173, "grad_norm": 0.032276641577482224, "learning_rate": 9.995728791936504e-05, "loss": 10.3686, "step": 11 }, { "epoch": 0.42105263157894735, "grad_norm": 0.03594186529517174, "learning_rate": 9.98292246503335e-05, "loss": 10.3717, "step": 12 }, { "epoch": 0.45614035087719296, "grad_norm": 0.03432200476527214, "learning_rate": 9.961602898685226e-05, "loss": 10.3709, "step": 13 }, { "epoch": 0.49122807017543857, "grad_norm": 0.0328349806368351, "learning_rate": 9.931806517013612e-05, "loss": 10.3691, "step": 14 }, { "epoch": 0.5263157894736842, "grad_norm": 0.03265753388404846, "learning_rate": 9.893584226636772e-05, "loss": 10.3697, "step": 15 }, { "epoch": 0.5614035087719298, "grad_norm": 0.03280927613377571, "learning_rate": 9.847001329696653e-05, "loss": 10.3705, "step": 16 }, { "epoch": 0.5614035087719298, "eval_loss": 10.365866661071777, "eval_runtime": 0.0486, "eval_samples_per_second": 1976.309, "eval_steps_per_second": 61.76, "step": 16 }, { "epoch": 0.5964912280701754, "grad_norm": 0.03726591914892197, "learning_rate": 9.792137412291265e-05, "loss": 10.3695, "step": 17 }, { "epoch": 0.631578947368421, "grad_norm": 0.0326569639146328, "learning_rate": 9.729086208503174e-05, "loss": 10.3712, "step": 18 }, { "epoch": 0.6666666666666666, "grad_norm": 0.031912319362163544, "learning_rate": 9.657955440256395e-05, "loss": 10.372, "step": 19 }, { "epoch": 0.7017543859649122, "grad_norm": 0.03385849669575691, "learning_rate": 9.578866633275288e-05, "loss": 10.3681, "step": 20 }, { "epoch": 0.7368421052631579, "grad_norm": 0.031885888427495956, "learning_rate": 9.491954909459895e-05, "loss": 10.373, "step": 21 }, { "epoch": 0.7719298245614035, "grad_norm": 0.030738165602087975, "learning_rate": 9.397368756032445e-05, "loss": 
10.3672, "step": 22 }, { "epoch": 0.8070175438596491, "grad_norm": 0.03675466775894165, "learning_rate": 9.295269771849427e-05, "loss": 10.3687, "step": 23 }, { "epoch": 0.8421052631578947, "grad_norm": 0.0321744829416275, "learning_rate": 9.185832391312644e-05, "loss": 10.37, "step": 24 }, { "epoch": 0.8421052631578947, "eval_loss": 10.365135192871094, "eval_runtime": 0.0476, "eval_samples_per_second": 2015.937, "eval_steps_per_second": 62.998, "step": 24 }, { "epoch": 0.8771929824561403, "grad_norm": 0.03129459172487259, "learning_rate": 9.069243586350975e-05, "loss": 10.3678, "step": 25 }, { "epoch": 0.9122807017543859, "grad_norm": 0.034831687808036804, "learning_rate": 8.945702546981969e-05, "loss": 10.3703, "step": 26 }, { "epoch": 0.9473684210526315, "grad_norm": 0.03272359073162079, "learning_rate": 8.815420340999033e-05, "loss": 10.3704, "step": 27 }, { "epoch": 0.9824561403508771, "grad_norm": 0.034819431602954865, "learning_rate": 8.678619553365659e-05, "loss": 10.3677, "step": 28 }, { "epoch": 1.0175438596491229, "grad_norm": 0.054919809103012085, "learning_rate": 8.535533905932738e-05, "loss": 15.7459, "step": 29 }, { "epoch": 1.0526315789473684, "grad_norm": 0.03318790718913078, "learning_rate": 8.386407858128706e-05, "loss": 10.3955, "step": 30 }, { "epoch": 1.087719298245614, "grad_norm": 0.03139394149184227, "learning_rate": 8.231496189304704e-05, "loss": 9.9669, "step": 31 }, { "epoch": 1.1228070175438596, "grad_norm": 0.03303258866071701, "learning_rate": 8.07106356344834e-05, "loss": 10.405, "step": 32 }, { "epoch": 1.1228070175438596, "eval_loss": 10.364384651184082, "eval_runtime": 0.0513, "eval_samples_per_second": 1872.727, "eval_steps_per_second": 58.523, "step": 32 }, { "epoch": 1.1578947368421053, "grad_norm": 0.03839896619319916, "learning_rate": 7.905384077009693e-05, "loss": 10.6439, "step": 33 }, { "epoch": 1.1929824561403508, "grad_norm": 0.03467192500829697, "learning_rate": 7.734740790612136e-05, "loss": 10.345, "step": 34 }, { "epoch": 1.2280701754385965, "grad_norm": 0.040621548891067505, "learning_rate": 7.559425245448006e-05, "loss": 10.649, "step": 35 }, { "epoch": 1.263157894736842, "grad_norm": 0.03688690438866615, "learning_rate": 7.379736965185368e-05, "loss": 10.0362, "step": 36 }, { "epoch": 1.2982456140350878, "grad_norm": 0.03830627351999283, "learning_rate": 7.195982944236851e-05, "loss": 10.4229, "step": 37 }, { "epoch": 1.3333333333333333, "grad_norm": 0.04245205968618393, "learning_rate": 7.008477123264848e-05, "loss": 10.2895, "step": 38 }, { "epoch": 1.368421052631579, "grad_norm": 0.031211622059345245, "learning_rate": 6.817539852819149e-05, "loss": 9.5719, "step": 39 }, { "epoch": 1.4035087719298245, "grad_norm": 0.040859777480363846, "learning_rate": 6.623497346023418e-05, "loss": 11.2269, "step": 40 }, { "epoch": 1.4035087719298245, "eval_loss": 10.363655090332031, "eval_runtime": 0.0473, "eval_samples_per_second": 2029.707, "eval_steps_per_second": 63.428, "step": 40 }, { "epoch": 1.4385964912280702, "grad_norm": 0.03956427797675133, "learning_rate": 6.426681121245527e-05, "loss": 10.0621, "step": 41 }, { "epoch": 1.4736842105263157, "grad_norm": 0.041367191821336746, "learning_rate": 6.227427435703997e-05, "loss": 10.698, "step": 42 }, { "epoch": 1.5087719298245614, "grad_norm": 0.03702181950211525, "learning_rate": 6.026076710978171e-05, "loss": 10.6238, "step": 43 }, { "epoch": 1.543859649122807, "grad_norm": 0.03523598238825798, "learning_rate": 5.8229729514036705e-05, "loss": 9.934, "step": 44 }, { "epoch": 1.5789473684210527, 
"grad_norm": 0.045091357082128525, "learning_rate": 5.618463156346739e-05, "loss": 10.334, "step": 45 }, { "epoch": 1.6140350877192984, "grad_norm": 0.0438874252140522, "learning_rate": 5.4128967273616625e-05, "loss": 10.5987, "step": 46 }, { "epoch": 1.6491228070175439, "grad_norm": 0.04235617071390152, "learning_rate": 5.2066248712440656e-05, "loss": 10.2434, "step": 47 }, { "epoch": 1.6842105263157894, "grad_norm": 0.04486287012696266, "learning_rate": 5e-05, "loss": 10.619, "step": 48 }, { "epoch": 1.6842105263157894, "eval_loss": 10.362959861755371, "eval_runtime": 0.0514, "eval_samples_per_second": 1868.46, "eval_steps_per_second": 58.389, "step": 48 }, { "epoch": 1.719298245614035, "grad_norm": 0.04507676884531975, "learning_rate": 4.7933751287559335e-05, "loss": 10.0827, "step": 49 }, { "epoch": 1.7543859649122808, "grad_norm": 0.04561790078878403, "learning_rate": 4.5871032726383386e-05, "loss": 10.4802, "step": 50 }, { "epoch": 1.7894736842105263, "grad_norm": 0.03883575648069382, "learning_rate": 4.381536843653262e-05, "loss": 10.0867, "step": 51 }, { "epoch": 1.8245614035087718, "grad_norm": 0.04744650423526764, "learning_rate": 4.17702704859633e-05, "loss": 10.9924, "step": 52 }, { "epoch": 1.8596491228070176, "grad_norm": 0.046235229820013046, "learning_rate": 3.973923289021829e-05, "loss": 9.8877, "step": 53 }, { "epoch": 1.8947368421052633, "grad_norm": 0.05214040353894234, "learning_rate": 3.772572564296005e-05, "loss": 10.369, "step": 54 }, { "epoch": 1.9298245614035088, "grad_norm": 0.041679393500089645, "learning_rate": 3.5733188787544745e-05, "loss": 11.1291, "step": 55 }, { "epoch": 1.9649122807017543, "grad_norm": 0.04238257557153702, "learning_rate": 3.3765026539765834e-05, "loss": 9.99, "step": 56 }, { "epoch": 1.9649122807017543, "eval_loss": 10.36238956451416, "eval_runtime": 0.0482, "eval_samples_per_second": 1990.632, "eval_steps_per_second": 62.207, "step": 56 }, { "epoch": 2.0, "grad_norm": 0.07189306616783142, "learning_rate": 3.18246014718085e-05, "loss": 15.2056, "step": 57 }, { "epoch": 2.0350877192982457, "grad_norm": 0.041363537311553955, "learning_rate": 2.991522876735154e-05, "loss": 10.3649, "step": 58 }, { "epoch": 2.0701754385964914, "grad_norm": 0.04258178174495697, "learning_rate": 2.804017055763149e-05, "loss": 10.3674, "step": 59 }, { "epoch": 2.1052631578947367, "grad_norm": 0.0516861192882061, "learning_rate": 2.6202630348146324e-05, "loss": 10.3673, "step": 60 }, { "epoch": 2.1403508771929824, "grad_norm": 0.0398215688765049, "learning_rate": 2.4405747545519963e-05, "loss": 10.3627, "step": 61 }, { "epoch": 2.175438596491228, "grad_norm": 0.052411239594221115, "learning_rate": 2.2652592093878666e-05, "loss": 10.3681, "step": 62 }, { "epoch": 2.2105263157894735, "grad_norm": 0.04860823601484299, "learning_rate": 2.094615922990309e-05, "loss": 10.3668, "step": 63 }, { "epoch": 2.245614035087719, "grad_norm": 0.04707632586359978, "learning_rate": 1.928936436551661e-05, "loss": 10.3688, "step": 64 }, { "epoch": 2.245614035087719, "eval_loss": 10.361990928649902, "eval_runtime": 0.0503, "eval_samples_per_second": 1907.875, "eval_steps_per_second": 59.621, "step": 64 }, { "epoch": 2.280701754385965, "grad_norm": 0.047561973333358765, "learning_rate": 1.768503810695295e-05, "loss": 10.3625, "step": 65 }, { "epoch": 2.3157894736842106, "grad_norm": 0.0470132939517498, "learning_rate": 1.6135921418712956e-05, "loss": 10.3681, "step": 66 }, { "epoch": 2.3508771929824563, "grad_norm": 0.0447855144739151, "learning_rate": 1.4644660940672627e-05, "loss": 
10.3667, "step": 67 }, { "epoch": 2.3859649122807016, "grad_norm": 0.04761236160993576, "learning_rate": 1.3213804466343421e-05, "loss": 10.3676, "step": 68 }, { "epoch": 2.4210526315789473, "grad_norm": 0.04890201613306999, "learning_rate": 1.1845796590009683e-05, "loss": 10.3666, "step": 69 }, { "epoch": 2.456140350877193, "grad_norm": 0.055010754615068436, "learning_rate": 1.0542974530180327e-05, "loss": 10.3671, "step": 70 }, { "epoch": 2.4912280701754383, "grad_norm": 0.04802040010690689, "learning_rate": 9.307564136490254e-06, "loss": 10.3659, "step": 71 }, { "epoch": 2.526315789473684, "grad_norm": 0.0498974546790123, "learning_rate": 8.141676086873572e-06, "loss": 10.3678, "step": 72 }, { "epoch": 2.526315789473684, "eval_loss": 10.361777305603027, "eval_runtime": 0.0479, "eval_samples_per_second": 2002.244, "eval_steps_per_second": 62.57, "step": 72 }, { "epoch": 2.56140350877193, "grad_norm": 0.052232056856155396, "learning_rate": 7.047302281505736e-06, "loss": 10.3665, "step": 73 }, { "epoch": 2.5964912280701755, "grad_norm": 0.045570556074380875, "learning_rate": 6.026312439675552e-06, "loss": 10.3654, "step": 74 }, { "epoch": 2.6315789473684212, "grad_norm": 0.050531383603811264, "learning_rate": 5.080450905401057e-06, "loss": 10.364, "step": 75 }, { "epoch": 2.6666666666666665, "grad_norm": 0.04788883775472641, "learning_rate": 4.2113336672471245e-06, "loss": 10.3642, "step": 76 }, { "epoch": 2.7017543859649122, "grad_norm": 0.05305058881640434, "learning_rate": 3.420445597436056e-06, "loss": 10.3684, "step": 77 }, { "epoch": 2.736842105263158, "grad_norm": 0.04230741783976555, "learning_rate": 2.7091379149682685e-06, "loss": 10.3681, "step": 78 }, { "epoch": 2.7719298245614032, "grad_norm": 0.04892972111701965, "learning_rate": 2.0786258770873647e-06, "loss": 10.3648, "step": 79 }, { "epoch": 2.807017543859649, "grad_norm": 0.05191851034760475, "learning_rate": 1.5299867030334814e-06, "loss": 10.3678, "step": 80 }, { "epoch": 2.807017543859649, "eval_loss": 10.36169147491455, "eval_runtime": 0.0484, "eval_samples_per_second": 1983.67, "eval_steps_per_second": 61.99, "step": 80 }, { "epoch": 2.8421052631578947, "grad_norm": 0.05835256725549698, "learning_rate": 1.064157733632276e-06, "loss": 10.3622, "step": 81 }, { "epoch": 2.8771929824561404, "grad_norm": 0.051547639071941376, "learning_rate": 6.819348298638839e-07, "loss": 10.3642, "step": 82 }, { "epoch": 2.912280701754386, "grad_norm": 0.05024786293506622, "learning_rate": 3.839710131477492e-07, "loss": 10.3686, "step": 83 }, { "epoch": 2.9473684210526314, "grad_norm": 0.05746513977646828, "learning_rate": 1.7077534966650766e-07, "loss": 10.3635, "step": 84 }, { "epoch": 2.982456140350877, "grad_norm": 0.05468269810080528, "learning_rate": 4.2712080634949024e-08, "loss": 10.3656, "step": 85 }, { "epoch": 3.017543859649123, "grad_norm": 0.0812983587384224, "learning_rate": 0.0, "loss": 15.6424, "step": 86 } ], "logging_steps": 1, "max_steps": 86, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 17992041627648.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }