{ "best_metric": 5.093143939971924, "best_model_checkpoint": "2024-12-03-roberta-evacun/checkpoint-16456", "epoch": 17.0, "eval_steps": 500, "global_step": 16456, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.5167958656330749, "grad_norm": 5.255120277404785, "learning_rate": 4.974146845915202e-05, "loss": 5.2027, "step": 500 }, { "epoch": 1.0, "eval_accuracy": 0.21748133776447606, "eval_loss": 5.119329929351807, "eval_runtime": 571.3565, "eval_samples_per_second": 54.166, "eval_steps_per_second": 3.387, "step": 968 }, { "epoch": 1.0330749354005169, "grad_norm": 5.84419584274292, "learning_rate": 4.948293691830403e-05, "loss": 5.0886, "step": 1000 }, { "epoch": 1.5498708010335918, "grad_norm": 4.444571495056152, "learning_rate": 4.922440537745605e-05, "loss": 5.0616, "step": 1500 }, { "epoch": 2.0, "eval_accuracy": 0.2174891710390481, "eval_loss": 5.119706153869629, "eval_runtime": 570.6419, "eval_samples_per_second": 54.234, "eval_steps_per_second": 3.391, "step": 1936 }, { "epoch": 2.0661498708010337, "grad_norm": 3.8420157432556152, "learning_rate": 4.896587383660807e-05, "loss": 5.0698, "step": 2000 }, { "epoch": 2.5829457364341084, "grad_norm": 6.509148120880127, "learning_rate": 4.870734229576008e-05, "loss": 5.0702, "step": 2500 }, { "epoch": 3.0, "eval_accuracy": 0.21680529124469344, "eval_loss": 5.109970569610596, "eval_runtime": 570.6566, "eval_samples_per_second": 54.232, "eval_steps_per_second": 3.391, "step": 2904 }, { "epoch": 3.0992248062015504, "grad_norm": 6.749443531036377, "learning_rate": 4.84488107549121e-05, "loss": 5.0508, "step": 3000 }, { "epoch": 3.616020671834625, "grad_norm": 3.7069902420043945, "learning_rate": 4.819027921406412e-05, "loss": 5.0692, "step": 3500 }, { "epoch": 4.0, "eval_accuracy": 0.21822890079972776, "eval_loss": 5.109030246734619, "eval_runtime": 570.6551, "eval_samples_per_second": 54.232, "eval_steps_per_second": 3.391, "step": 3872 }, { "epoch": 4.1322997416020675, "grad_norm": 4.267265319824219, "learning_rate": 4.793174767321613e-05, "loss": 5.0638, "step": 4000 }, { "epoch": 4.649095607235142, "grad_norm": 4.44468879699707, "learning_rate": 4.7673216132368156e-05, "loss": 5.0761, "step": 4500 }, { "epoch": 5.0, "eval_accuracy": 0.21750397926781473, "eval_loss": 5.103179931640625, "eval_runtime": 570.6875, "eval_samples_per_second": 54.229, "eval_steps_per_second": 3.391, "step": 4840 }, { "epoch": 5.165374677002584, "grad_norm": 4.793745994567871, "learning_rate": 4.741468459152017e-05, "loss": 5.0508, "step": 5000 }, { "epoch": 5.682170542635659, "grad_norm": 4.304242134094238, "learning_rate": 4.7156153050672187e-05, "loss": 5.053, "step": 5500 }, { "epoch": 6.0, "eval_accuracy": 0.21806737555634184, "eval_loss": 5.103706359863281, "eval_runtime": 570.7022, "eval_samples_per_second": 54.228, "eval_steps_per_second": 3.391, "step": 5808 }, { "epoch": 6.198449612403101, "grad_norm": 5.248096466064453, "learning_rate": 4.6897621509824205e-05, "loss": 5.0451, "step": 6000 }, { "epoch": 6.715245478036175, "grad_norm": 4.274606227874756, "learning_rate": 4.663908996897622e-05, "loss": 5.0631, "step": 6500 }, { "epoch": 7.0, "eval_accuracy": 0.21671510548702227, "eval_loss": 5.111293315887451, "eval_runtime": 570.6785, "eval_samples_per_second": 54.23, "eval_steps_per_second": 3.391, "step": 6776 }, { "epoch": 7.231524547803618, "grad_norm": 4.097196102142334, "learning_rate": 4.6380558428128236e-05, "loss": 5.0515, "step": 7000 }, { "epoch": 7.7483204134366925, "grad_norm": 5.150557994842529, "learning_rate": 4.6122026887280254e-05, "loss": 5.0643, "step": 7500 }, { "epoch": 8.0, "eval_accuracy": 0.21643664071910304, "eval_loss": 5.104173183441162, "eval_runtime": 570.7404, "eval_samples_per_second": 54.224, "eval_steps_per_second": 3.39, "step": 7744 }, { "epoch": 8.264599483204135, "grad_norm": 4.464865684509277, "learning_rate": 4.5863495346432266e-05, "loss": 5.0575, "step": 8000 }, { "epoch": 8.78139534883721, "grad_norm": 4.509471893310547, "learning_rate": 4.5604963805584284e-05, "loss": 5.0641, "step": 8500 }, { "epoch": 9.0, "eval_accuracy": 0.2164407022306587, "eval_loss": 5.1050801277160645, "eval_runtime": 570.7609, "eval_samples_per_second": 54.222, "eval_steps_per_second": 3.39, "step": 8712 }, { "epoch": 9.29767441860465, "grad_norm": 4.612584114074707, "learning_rate": 4.5346432264736296e-05, "loss": 5.0507, "step": 9000 }, { "epoch": 9.814470284237727, "grad_norm": 5.6399335861206055, "learning_rate": 4.5087900723888315e-05, "loss": 5.0477, "step": 9500 }, { "epoch": 10.0, "eval_accuracy": 0.2173244230825529, "eval_loss": 5.097902297973633, "eval_runtime": 570.8859, "eval_samples_per_second": 54.21, "eval_steps_per_second": 3.389, "step": 9680 }, { "epoch": 10.330749354005167, "grad_norm": 4.067675590515137, "learning_rate": 4.4829369183040333e-05, "loss": 5.0538, "step": 10000 }, { "epoch": 10.847545219638242, "grad_norm": 3.6515378952026367, "learning_rate": 4.4570837642192345e-05, "loss": 5.0516, "step": 10500 }, { "epoch": 11.0, "eval_accuracy": 0.21694436479870868, "eval_loss": 5.105097770690918, "eval_runtime": 570.8353, "eval_samples_per_second": 54.215, "eval_steps_per_second": 3.39, "step": 10648 }, { "epoch": 11.363824289405684, "grad_norm": 3.8045644760131836, "learning_rate": 4.4312306101344364e-05, "loss": 5.0503, "step": 11000 }, { "epoch": 11.88062015503876, "grad_norm": 6.31251859664917, "learning_rate": 4.405377456049638e-05, "loss": 5.0535, "step": 11500 }, { "epoch": 12.0, "eval_accuracy": 0.21655864853378665, "eval_loss": 5.102573394775391, "eval_runtime": 570.862, "eval_samples_per_second": 54.213, "eval_steps_per_second": 3.39, "step": 11616 }, { "epoch": 12.396899224806202, "grad_norm": 7.145920276641846, "learning_rate": 4.3795243019648394e-05, "loss": 5.0599, "step": 12000 }, { "epoch": 12.913695090439276, "grad_norm": 4.022646427154541, "learning_rate": 4.353671147880042e-05, "loss": 5.0491, "step": 12500 }, { "epoch": 13.0, "eval_accuracy": 0.21710994508587633, "eval_loss": 5.104992866516113, "eval_runtime": 570.798, "eval_samples_per_second": 54.219, "eval_steps_per_second": 3.39, "step": 12584 }, { "epoch": 13.429974160206719, "grad_norm": 3.64132022857666, "learning_rate": 4.327817993795243e-05, "loss": 5.0419, "step": 13000 }, { "epoch": 13.946770025839793, "grad_norm": 4.630414009094238, "learning_rate": 4.301964839710445e-05, "loss": 5.0547, "step": 13500 }, { "epoch": 14.0, "eval_accuracy": 0.21658861456412434, "eval_loss": 5.10886287689209, "eval_runtime": 570.8383, "eval_samples_per_second": 54.215, "eval_steps_per_second": 3.39, "step": 13552 }, { "epoch": 14.463049095607236, "grad_norm": 4.112677097320557, "learning_rate": 4.276111685625647e-05, "loss": 5.0602, "step": 14000 }, { "epoch": 14.97984496124031, "grad_norm": 5.0369873046875, "learning_rate": 4.250258531540848e-05, "loss": 5.055, "step": 14500 }, { "epoch": 15.0, "eval_accuracy": 0.21731427174975562, "eval_loss": 5.101281642913818, "eval_runtime": 570.8524, "eval_samples_per_second": 54.214, "eval_steps_per_second": 3.39, "step": 14520 }, { "epoch": 15.496124031007753, "grad_norm": 5.0534281730651855, "learning_rate": 4.22440537745605e-05, "loss": 5.0547, "step": 15000 }, { "epoch": 16.0, "eval_accuracy": 0.21727949673638433, "eval_loss": 5.1018967628479, "eval_runtime": 570.8378, "eval_samples_per_second": 54.215, "eval_steps_per_second": 3.39, "step": 15488 }, { "epoch": 16.012403100775195, "grad_norm": 5.0263752937316895, "learning_rate": 4.198552223371252e-05, "loss": 5.0508, "step": 15500 }, { "epoch": 16.52919896640827, "grad_norm": 3.9026286602020264, "learning_rate": 4.172699069286453e-05, "loss": 5.0532, "step": 16000 }, { "epoch": 17.0, "eval_accuracy": 0.21736250405690846, "eval_loss": 5.093143939971924, "eval_runtime": 570.8413, "eval_samples_per_second": 54.215, "eval_steps_per_second": 3.39, "step": 16456 } ], "logging_steps": 500, "max_steps": 96700, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.31244216577348e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }