{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 17799, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08427439743805831, "grad_norm": 2.7859835624694824, "learning_rate": 4.85954267093657e-05, "loss": 1.5305, "step": 500 }, { "epoch": 0.16854879487611663, "grad_norm": 2.8764379024505615, "learning_rate": 4.719085341873139e-05, "loss": 1.392, "step": 1000 }, { "epoch": 0.25282319231417494, "grad_norm": 2.9515156745910645, "learning_rate": 4.578628012809709e-05, "loss": 1.3312, "step": 1500 }, { "epoch": 0.33709758975223325, "grad_norm": 2.6709630489349365, "learning_rate": 4.438170683746278e-05, "loss": 1.294, "step": 2000 }, { "epoch": 0.42137198719029156, "grad_norm": 2.601968765258789, "learning_rate": 4.2977133546828475e-05, "loss": 1.2561, "step": 2500 }, { "epoch": 0.5056463846283499, "grad_norm": 2.6964409351348877, "learning_rate": 4.157256025619417e-05, "loss": 1.2364, "step": 3000 }, { "epoch": 0.5899207820664082, "grad_norm": 2.479663610458374, "learning_rate": 4.016798696555987e-05, "loss": 1.215, "step": 3500 }, { "epoch": 0.6741951795044665, "grad_norm": 2.6350433826446533, "learning_rate": 3.876341367492556e-05, "loss": 1.1925, "step": 4000 }, { "epoch": 0.7584695769425248, "grad_norm": 2.7347450256347656, "learning_rate": 3.7358840384291254e-05, "loss": 1.1831, "step": 4500 }, { "epoch": 0.8427439743805831, "grad_norm": 2.7404720783233643, "learning_rate": 3.5954267093656953e-05, "loss": 1.1657, "step": 5000 }, { "epoch": 0.9270183718186416, "grad_norm": 2.71294903755188, "learning_rate": 3.454969380302264e-05, "loss": 1.1629, "step": 5500 }, { "epoch": 1.0112927692566998, "grad_norm": 2.6705377101898193, "learning_rate": 3.314512051238834e-05, "loss": 1.1435, "step": 6000 }, { "epoch": 1.0955671666947582, "grad_norm": 2.524048089981079, "learning_rate": 3.174054722175403e-05, "loss": 1.1304, "step": 6500 }, { "epoch": 1.1798415641328164, "grad_norm": 2.6250593662261963, "learning_rate": 3.0335973931119726e-05, "loss": 1.1167, "step": 7000 }, { "epoch": 1.2641159615708748, "grad_norm": 2.3546817302703857, "learning_rate": 2.8931400640485422e-05, "loss": 1.1121, "step": 7500 }, { "epoch": 1.348390359008933, "grad_norm": 2.7167067527770996, "learning_rate": 2.7526827349851115e-05, "loss": 1.1065, "step": 8000 }, { "epoch": 1.4326647564469914, "grad_norm": 2.4628195762634277, "learning_rate": 2.6122254059216812e-05, "loss": 1.0953, "step": 8500 }, { "epoch": 1.5169391538850499, "grad_norm": 2.47514009475708, "learning_rate": 2.4717680768582505e-05, "loss": 1.0898, "step": 9000 }, { "epoch": 1.601213551323108, "grad_norm": 2.240252733230591, "learning_rate": 2.33131074779482e-05, "loss": 1.0734, "step": 9500 }, { "epoch": 1.6854879487611663, "grad_norm": 2.6827552318573, "learning_rate": 2.1908534187313894e-05, "loss": 1.0771, "step": 10000 }, { "epoch": 1.7697623461992247, "grad_norm": 2.446568012237549, "learning_rate": 2.0503960896679587e-05, "loss": 1.0727, "step": 10500 }, { "epoch": 1.854036743637283, "grad_norm": 2.437731981277466, "learning_rate": 1.9099387606045284e-05, "loss": 1.0573, "step": 11000 }, { "epoch": 1.9383111410753413, "grad_norm": 2.6201038360595703, "learning_rate": 1.769481431541098e-05, "loss": 1.0561, "step": 11500 }, { "epoch": 2.0225855385133995, "grad_norm": 2.4613335132598877, "learning_rate": 1.6290241024776673e-05, "loss": 1.0531, "step": 12000 }, { "epoch": 2.106859935951458, "grad_norm": 2.459319591522217, 
"learning_rate": 1.488566773414237e-05, "loss": 1.0409, "step": 12500 }, { "epoch": 2.1911343333895164, "grad_norm": 2.536198616027832, "learning_rate": 1.3481094443508063e-05, "loss": 1.0388, "step": 13000 }, { "epoch": 2.2754087308275746, "grad_norm": 2.503535270690918, "learning_rate": 1.2076521152873758e-05, "loss": 1.0326, "step": 13500 }, { "epoch": 2.3596831282656328, "grad_norm": 2.47710919380188, "learning_rate": 1.0671947862239452e-05, "loss": 1.0246, "step": 14000 }, { "epoch": 2.4439575257036914, "grad_norm": 2.4275453090667725, "learning_rate": 9.267374571605147e-06, "loss": 1.0228, "step": 14500 }, { "epoch": 2.5282319231417496, "grad_norm": 2.4777779579162598, "learning_rate": 7.862801280970842e-06, "loss": 1.029, "step": 15000 }, { "epoch": 2.612506320579808, "grad_norm": 2.574960470199585, "learning_rate": 6.458227990336537e-06, "loss": 1.0215, "step": 15500 }, { "epoch": 2.696780718017866, "grad_norm": 2.5446507930755615, "learning_rate": 5.053654699702231e-06, "loss": 1.0206, "step": 16000 }, { "epoch": 2.7810551154559247, "grad_norm": 2.7360856533050537, "learning_rate": 3.6490814090679254e-06, "loss": 1.0215, "step": 16500 }, { "epoch": 2.865329512893983, "grad_norm": 2.4522464275360107, "learning_rate": 2.2445081184336198e-06, "loss": 1.017, "step": 17000 }, { "epoch": 2.949603910332041, "grad_norm": 2.5468955039978027, "learning_rate": 8.399348277993145e-07, "loss": 1.0105, "step": 17500 } ], "logging_steps": 500, "max_steps": 17799, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4989245699264307e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }