{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.7346053772766696, "eval_steps": 200, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03469210754553339, "eval_loss": 1.8023786544799805, "eval_runtime": 572.3832, "eval_samples_per_second": 10.074, "eval_steps_per_second": 1.26, "step": 200 }, { "epoch": 0.06938421509106678, "eval_loss": 1.7305923700332642, "eval_runtime": 572.203, "eval_samples_per_second": 10.077, "eval_steps_per_second": 1.26, "step": 400 }, { "epoch": 0.08673026886383348, "grad_norm": 6.57196569442749, "learning_rate": 2.4566348655680836e-05, "loss": 1.8068, "step": 500 }, { "epoch": 0.10407632263660017, "eval_loss": 1.7001726627349854, "eval_runtime": 572.0907, "eval_samples_per_second": 10.079, "eval_steps_per_second": 1.26, "step": 600 }, { "epoch": 0.13876843018213356, "eval_loss": 1.6665369272232056, "eval_runtime": 572.5138, "eval_samples_per_second": 10.071, "eval_steps_per_second": 1.259, "step": 800 }, { "epoch": 0.17346053772766695, "grad_norm": 6.407934665679932, "learning_rate": 2.4132697311361666e-05, "loss": 1.6773, "step": 1000 }, { "epoch": 0.17346053772766695, "eval_loss": 1.645666480064392, "eval_runtime": 572.4669, "eval_samples_per_second": 10.072, "eval_steps_per_second": 1.259, "step": 1000 }, { "epoch": 0.20815264527320035, "eval_loss": 1.6295970678329468, "eval_runtime": 572.6705, "eval_samples_per_second": 10.069, "eval_steps_per_second": 1.259, "step": 1200 }, { "epoch": 0.24284475281873374, "eval_loss": 1.6119849681854248, "eval_runtime": 572.6602, "eval_samples_per_second": 10.069, "eval_steps_per_second": 1.259, "step": 1400 }, { "epoch": 0.26019080659150046, "grad_norm": 6.23416805267334, "learning_rate": 2.36990459670425e-05, "loss": 1.6291, "step": 1500 }, { "epoch": 0.2775368603642671, "eval_loss": 1.5977734327316284, "eval_runtime": 572.802, "eval_samples_per_second": 10.066, "eval_steps_per_second": 1.259, "step": 1600 }, { "epoch": 0.31222896790980054, "eval_loss": 1.5906885862350464, "eval_runtime": 572.7238, "eval_samples_per_second": 10.068, "eval_steps_per_second": 1.259, "step": 1800 }, { "epoch": 0.3469210754553339, "grad_norm": 5.846036434173584, "learning_rate": 2.326539462272333e-05, "loss": 1.6032, "step": 2000 }, { "epoch": 0.3469210754553339, "eval_loss": 1.5792902708053589, "eval_runtime": 572.6421, "eval_samples_per_second": 10.069, "eval_steps_per_second": 1.259, "step": 2000 }, { "epoch": 0.38161318300086733, "eval_loss": 1.5674443244934082, "eval_runtime": 572.94, "eval_samples_per_second": 10.064, "eval_steps_per_second": 1.258, "step": 2200 }, { "epoch": 0.4163052905464007, "eval_loss": 1.5650794506072998, "eval_runtime": 573.1561, "eval_samples_per_second": 10.06, "eval_steps_per_second": 1.258, "step": 2400 }, { "epoch": 0.4336513443191674, "grad_norm": 6.9578962326049805, "learning_rate": 2.2831743278404163e-05, "loss": 1.5699, "step": 2500 }, { "epoch": 0.45099739809193407, "eval_loss": 1.5550028085708618, "eval_runtime": 572.973, "eval_samples_per_second": 10.063, "eval_steps_per_second": 1.258, "step": 2600 }, { "epoch": 0.4856895056374675, "eval_loss": 1.539338231086731, "eval_runtime": 573.1731, "eval_samples_per_second": 10.06, "eval_steps_per_second": 1.258, "step": 2800 }, { "epoch": 0.5203816131830009, "grad_norm": 6.062795639038086, "learning_rate": 2.2398091934084997e-05, "loss": 1.5555, "step": 3000 }, { "epoch": 0.5203816131830009, "eval_loss": 1.533992886543274, "eval_runtime": 573.3108, "eval_samples_per_second": 10.057, "eval_steps_per_second": 1.258, "step": 3000 }, { "epoch": 0.5550737207285342, "eval_loss": 1.5279603004455566, "eval_runtime": 573.3234, "eval_samples_per_second": 10.057, "eval_steps_per_second": 1.258, "step": 3200 }, { "epoch": 0.5897658282740676, "eval_loss": 1.5221937894821167, "eval_runtime": 573.2462, "eval_samples_per_second": 10.059, "eval_steps_per_second": 1.258, "step": 3400 }, { "epoch": 0.6071118820468343, "grad_norm": 5.474059581756592, "learning_rate": 2.196444058976583e-05, "loss": 1.5258, "step": 3500 }, { "epoch": 0.6244579358196011, "eval_loss": 1.5145606994628906, "eval_runtime": 573.1527, "eval_samples_per_second": 10.06, "eval_steps_per_second": 1.258, "step": 3600 }, { "epoch": 0.6591500433651344, "eval_loss": 1.5087436437606812, "eval_runtime": 573.3236, "eval_samples_per_second": 10.057, "eval_steps_per_second": 1.258, "step": 3800 }, { "epoch": 0.6938421509106678, "grad_norm": 4.400829315185547, "learning_rate": 2.1530789245446662e-05, "loss": 1.5145, "step": 4000 }, { "epoch": 0.6938421509106678, "eval_loss": 1.501986026763916, "eval_runtime": 572.9788, "eval_samples_per_second": 10.063, "eval_steps_per_second": 1.258, "step": 4000 }, { "epoch": 0.7285342584562012, "eval_loss": 1.4961259365081787, "eval_runtime": 572.9318, "eval_samples_per_second": 10.064, "eval_steps_per_second": 1.258, "step": 4200 }, { "epoch": 0.7632263660017347, "eval_loss": 1.4921443462371826, "eval_runtime": 573.2197, "eval_samples_per_second": 10.059, "eval_steps_per_second": 1.258, "step": 4400 }, { "epoch": 0.7805724197745013, "grad_norm": 5.124959945678711, "learning_rate": 2.1097137901127496e-05, "loss": 1.4981, "step": 4500 }, { "epoch": 0.797918473547268, "eval_loss": 1.48764967918396, "eval_runtime": 573.3463, "eval_samples_per_second": 10.057, "eval_steps_per_second": 1.258, "step": 4600 }, { "epoch": 0.8326105810928014, "eval_loss": 1.4827669858932495, "eval_runtime": 573.3276, "eval_samples_per_second": 10.057, "eval_steps_per_second": 1.258, "step": 4800 }, { "epoch": 0.8673026886383348, "grad_norm": 5.631836414337158, "learning_rate": 2.0663486556808327e-05, "loss": 1.4758, "step": 5000 }, { "epoch": 0.8673026886383348, "eval_loss": 1.4766356945037842, "eval_runtime": 573.3049, "eval_samples_per_second": 10.057, "eval_steps_per_second": 1.258, "step": 5000 }, { "epoch": 0.9019947961838681, "eval_loss": 1.4708250761032104, "eval_runtime": 573.3902, "eval_samples_per_second": 10.056, "eval_steps_per_second": 1.257, "step": 5200 }, { "epoch": 0.9366869037294016, "eval_loss": 1.4667783975601196, "eval_runtime": 573.338, "eval_samples_per_second": 10.057, "eval_steps_per_second": 1.258, "step": 5400 }, { "epoch": 0.9540329575021682, "grad_norm": 4.832674980163574, "learning_rate": 2.0229835212489158e-05, "loss": 1.4818, "step": 5500 }, { "epoch": 0.971379011274935, "eval_loss": 1.4649358987808228, "eval_runtime": 573.5907, "eval_samples_per_second": 10.052, "eval_steps_per_second": 1.257, "step": 5600 }, { "epoch": 1.0060711188204683, "eval_loss": 1.4911904335021973, "eval_runtime": 573.8034, "eval_samples_per_second": 10.049, "eval_steps_per_second": 1.257, "step": 5800 }, { "epoch": 1.0407632263660018, "grad_norm": 6.181447982788086, "learning_rate": 1.9796183868169993e-05, "loss": 1.3108, "step": 6000 }, { "epoch": 1.0407632263660018, "eval_loss": 1.5114498138427734, "eval_runtime": 573.9439, "eval_samples_per_second": 10.046, "eval_steps_per_second": 1.256, "step": 6000 }, { "epoch": 1.0754553339115351, "eval_loss": 1.5078836679458618, "eval_runtime": 573.7341, "eval_samples_per_second": 10.05, "eval_steps_per_second": 1.257, "step": 6200 }, { "epoch": 1.1101474414570685, "eval_loss": 1.512686848640442, "eval_runtime": 573.5532, "eval_samples_per_second": 10.053, "eval_steps_per_second": 1.257, "step": 6400 }, { "epoch": 1.1274934952298352, "grad_norm": 6.276436805725098, "learning_rate": 1.9362532523850823e-05, "loss": 1.1338, "step": 6500 }, { "epoch": 1.144839549002602, "eval_loss": 1.5086950063705444, "eval_runtime": 573.502, "eval_samples_per_second": 10.054, "eval_steps_per_second": 1.257, "step": 6600 }, { "epoch": 1.1795316565481353, "eval_loss": 1.5138036012649536, "eval_runtime": 573.4778, "eval_samples_per_second": 10.054, "eval_steps_per_second": 1.257, "step": 6800 }, { "epoch": 1.2142237640936686, "grad_norm": 5.294378280639648, "learning_rate": 1.8928881179531658e-05, "loss": 1.1411, "step": 7000 }, { "epoch": 1.2142237640936686, "eval_loss": 1.5119119882583618, "eval_runtime": 573.2773, "eval_samples_per_second": 10.058, "eval_steps_per_second": 1.258, "step": 7000 }, { "epoch": 1.2489158716392021, "eval_loss": 1.5059071779251099, "eval_runtime": 573.2436, "eval_samples_per_second": 10.059, "eval_steps_per_second": 1.258, "step": 7200 }, { "epoch": 1.2836079791847355, "eval_loss": 1.4931423664093018, "eval_runtime": 573.2431, "eval_samples_per_second": 10.059, "eval_steps_per_second": 1.258, "step": 7400 }, { "epoch": 1.3009540329575022, "grad_norm": 5.875624179840088, "learning_rate": 1.8495229835212492e-05, "loss": 1.1482, "step": 7500 }, { "epoch": 1.318300086730269, "eval_loss": 1.4929821491241455, "eval_runtime": 572.8059, "eval_samples_per_second": 10.066, "eval_steps_per_second": 1.259, "step": 7600 }, { "epoch": 1.3529921942758023, "eval_loss": 1.490503191947937, "eval_runtime": 572.7436, "eval_samples_per_second": 10.067, "eval_steps_per_second": 1.259, "step": 7800 }, { "epoch": 1.3876843018213356, "grad_norm": 5.962628364562988, "learning_rate": 1.8061578490893323e-05, "loss": 1.1534, "step": 8000 }, { "epoch": 1.3876843018213356, "eval_loss": 1.4796279668807983, "eval_runtime": 572.5741, "eval_samples_per_second": 10.07, "eval_steps_per_second": 1.259, "step": 8000 }, { "epoch": 1.4223764093668692, "eval_loss": 1.4942739009857178, "eval_runtime": 572.7895, "eval_samples_per_second": 10.067, "eval_steps_per_second": 1.259, "step": 8200 }, { "epoch": 1.4570685169124025, "eval_loss": 1.478100299835205, "eval_runtime": 574.02, "eval_samples_per_second": 10.045, "eval_steps_per_second": 1.256, "step": 8400 }, { "epoch": 1.4744145706851692, "grad_norm": 5.818081855773926, "learning_rate": 1.7627927146574154e-05, "loss": 1.1493, "step": 8500 }, { "epoch": 1.4917606244579358, "eval_loss": 1.4706262350082397, "eval_runtime": 573.645, "eval_samples_per_second": 10.052, "eval_steps_per_second": 1.257, "step": 8600 }, { "epoch": 1.5264527320034693, "eval_loss": 1.4702831506729126, "eval_runtime": 573.6402, "eval_samples_per_second": 10.052, "eval_steps_per_second": 1.257, "step": 8800 }, { "epoch": 1.5611448395490026, "grad_norm": 6.020638465881348, "learning_rate": 1.7194275802254988e-05, "loss": 1.1517, "step": 9000 }, { "epoch": 1.5611448395490026, "eval_loss": 1.4639151096343994, "eval_runtime": 573.5071, "eval_samples_per_second": 10.054, "eval_steps_per_second": 1.257, "step": 9000 }, { "epoch": 1.595836947094536, "eval_loss": 1.4722236394882202, "eval_runtime": 573.4545, "eval_samples_per_second": 10.055, "eval_steps_per_second": 1.257, "step": 9200 }, { "epoch": 1.6305290546400695, "eval_loss": 1.4613826274871826, "eval_runtime": 573.2765, "eval_samples_per_second": 10.058, "eval_steps_per_second": 1.258, "step": 9400 }, { "epoch": 1.647875108412836, "grad_norm": 5.535754680633545, "learning_rate": 1.676062445793582e-05, "loss": 1.1428, "step": 9500 }, { "epoch": 1.6652211621856028, "eval_loss": 1.4539824724197388, "eval_runtime": 573.1598, "eval_samples_per_second": 10.06, "eval_steps_per_second": 1.258, "step": 9600 }, { "epoch": 1.699913269731136, "eval_loss": 1.457112431526184, "eval_runtime": 573.1778, "eval_samples_per_second": 10.06, "eval_steps_per_second": 1.258, "step": 9800 }, { "epoch": 1.7346053772766696, "grad_norm": 6.019700527191162, "learning_rate": 1.6326973113616653e-05, "loss": 1.1466, "step": 10000 }, { "epoch": 1.7346053772766696, "eval_loss": 1.4443352222442627, "eval_runtime": 573.1133, "eval_samples_per_second": 10.061, "eval_steps_per_second": 1.258, "step": 10000 } ], "logging_steps": 500, "max_steps": 28825, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.74751582519296e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }