|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.7346053772766696, |
|
"eval_steps": 200, |
|
"global_step": 10000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.03469210754553339, |
|
"eval_loss": 1.8023786544799805, |
|
"eval_runtime": 572.3832, |
|
"eval_samples_per_second": 10.074, |
|
"eval_steps_per_second": 1.26, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06938421509106678, |
|
"eval_loss": 1.7305923700332642, |
|
"eval_runtime": 572.203, |
|
"eval_samples_per_second": 10.077, |
|
"eval_steps_per_second": 1.26, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08673026886383348, |
|
"grad_norm": 6.57196569442749, |
|
"learning_rate": 2.4566348655680836e-05, |
|
"loss": 1.8068, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10407632263660017, |
|
"eval_loss": 1.7001726627349854, |
|
"eval_runtime": 572.0907, |
|
"eval_samples_per_second": 10.079, |
|
"eval_steps_per_second": 1.26, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.13876843018213356, |
|
"eval_loss": 1.6665369272232056, |
|
"eval_runtime": 572.5138, |
|
"eval_samples_per_second": 10.071, |
|
"eval_steps_per_second": 1.259, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.17346053772766695, |
|
"grad_norm": 6.407934665679932, |
|
"learning_rate": 2.4132697311361666e-05, |
|
"loss": 1.6773, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.17346053772766695, |
|
"eval_loss": 1.645666480064392, |
|
"eval_runtime": 572.4669, |
|
"eval_samples_per_second": 10.072, |
|
"eval_steps_per_second": 1.259, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.20815264527320035, |
|
"eval_loss": 1.6295970678329468, |
|
"eval_runtime": 572.6705, |
|
"eval_samples_per_second": 10.069, |
|
"eval_steps_per_second": 1.259, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.24284475281873374, |
|
"eval_loss": 1.6119849681854248, |
|
"eval_runtime": 572.6602, |
|
"eval_samples_per_second": 10.069, |
|
"eval_steps_per_second": 1.259, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.26019080659150046, |
|
"grad_norm": 6.23416805267334, |
|
"learning_rate": 2.36990459670425e-05, |
|
"loss": 1.6291, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.2775368603642671, |
|
"eval_loss": 1.5977734327316284, |
|
"eval_runtime": 572.802, |
|
"eval_samples_per_second": 10.066, |
|
"eval_steps_per_second": 1.259, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.31222896790980054, |
|
"eval_loss": 1.5906885862350464, |
|
"eval_runtime": 572.7238, |
|
"eval_samples_per_second": 10.068, |
|
"eval_steps_per_second": 1.259, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.3469210754553339, |
|
"grad_norm": 5.846036434173584, |
|
"learning_rate": 2.326539462272333e-05, |
|
"loss": 1.6032, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3469210754553339, |
|
"eval_loss": 1.5792902708053589, |
|
"eval_runtime": 572.6421, |
|
"eval_samples_per_second": 10.069, |
|
"eval_steps_per_second": 1.259, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.38161318300086733, |
|
"eval_loss": 1.5674443244934082, |
|
"eval_runtime": 572.94, |
|
"eval_samples_per_second": 10.064, |
|
"eval_steps_per_second": 1.258, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.4163052905464007, |
|
"eval_loss": 1.5650794506072998, |
|
"eval_runtime": 573.1561, |
|
"eval_samples_per_second": 10.06, |
|
"eval_steps_per_second": 1.258, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4336513443191674, |
|
"grad_norm": 6.9578962326049805, |
|
"learning_rate": 2.2831743278404163e-05, |
|
"loss": 1.5699, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.45099739809193407, |
|
"eval_loss": 1.5550028085708618, |
|
"eval_runtime": 572.973, |
|
"eval_samples_per_second": 10.063, |
|
"eval_steps_per_second": 1.258, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.4856895056374675, |
|
"eval_loss": 1.539338231086731, |
|
"eval_runtime": 573.1731, |
|
"eval_samples_per_second": 10.06, |
|
"eval_steps_per_second": 1.258, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.5203816131830009, |
|
"grad_norm": 6.062795639038086, |
|
"learning_rate": 2.2398091934084997e-05, |
|
"loss": 1.5555, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5203816131830009, |
|
"eval_loss": 1.533992886543274, |
|
"eval_runtime": 573.3108, |
|
"eval_samples_per_second": 10.057, |
|
"eval_steps_per_second": 1.258, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5550737207285342, |
|
"eval_loss": 1.5279603004455566, |
|
"eval_runtime": 573.3234, |
|
"eval_samples_per_second": 10.057, |
|
"eval_steps_per_second": 1.258, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5897658282740676, |
|
"eval_loss": 1.5221937894821167, |
|
"eval_runtime": 573.2462, |
|
"eval_samples_per_second": 10.059, |
|
"eval_steps_per_second": 1.258, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.6071118820468343, |
|
"grad_norm": 5.474059581756592, |
|
"learning_rate": 2.196444058976583e-05, |
|
"loss": 1.5258, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.6244579358196011, |
|
"eval_loss": 1.5145606994628906, |
|
"eval_runtime": 573.1527, |
|
"eval_samples_per_second": 10.06, |
|
"eval_steps_per_second": 1.258, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.6591500433651344, |
|
"eval_loss": 1.5087436437606812, |
|
"eval_runtime": 573.3236, |
|
"eval_samples_per_second": 10.057, |
|
"eval_steps_per_second": 1.258, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6938421509106678, |
|
"grad_norm": 4.400829315185547, |
|
"learning_rate": 2.1530789245446662e-05, |
|
"loss": 1.5145, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6938421509106678, |
|
"eval_loss": 1.501986026763916, |
|
"eval_runtime": 572.9788, |
|
"eval_samples_per_second": 10.063, |
|
"eval_steps_per_second": 1.258, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.7285342584562012, |
|
"eval_loss": 1.4961259365081787, |
|
"eval_runtime": 572.9318, |
|
"eval_samples_per_second": 10.064, |
|
"eval_steps_per_second": 1.258, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.7632263660017347, |
|
"eval_loss": 1.4921443462371826, |
|
"eval_runtime": 573.2197, |
|
"eval_samples_per_second": 10.059, |
|
"eval_steps_per_second": 1.258, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7805724197745013, |
|
"grad_norm": 5.124959945678711, |
|
"learning_rate": 2.1097137901127496e-05, |
|
"loss": 1.4981, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.797918473547268, |
|
"eval_loss": 1.48764967918396, |
|
"eval_runtime": 573.3463, |
|
"eval_samples_per_second": 10.057, |
|
"eval_steps_per_second": 1.258, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.8326105810928014, |
|
"eval_loss": 1.4827669858932495, |
|
"eval_runtime": 573.3276, |
|
"eval_samples_per_second": 10.057, |
|
"eval_steps_per_second": 1.258, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.8673026886383348, |
|
"grad_norm": 5.631836414337158, |
|
"learning_rate": 2.0663486556808327e-05, |
|
"loss": 1.4758, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8673026886383348, |
|
"eval_loss": 1.4766356945037842, |
|
"eval_runtime": 573.3049, |
|
"eval_samples_per_second": 10.057, |
|
"eval_steps_per_second": 1.258, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.9019947961838681, |
|
"eval_loss": 1.4708250761032104, |
|
"eval_runtime": 573.3902, |
|
"eval_samples_per_second": 10.056, |
|
"eval_steps_per_second": 1.257, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.9366869037294016, |
|
"eval_loss": 1.4667783975601196, |
|
"eval_runtime": 573.338, |
|
"eval_samples_per_second": 10.057, |
|
"eval_steps_per_second": 1.258, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.9540329575021682, |
|
"grad_norm": 4.832674980163574, |
|
"learning_rate": 2.0229835212489158e-05, |
|
"loss": 1.4818, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.971379011274935, |
|
"eval_loss": 1.4649358987808228, |
|
"eval_runtime": 573.5907, |
|
"eval_samples_per_second": 10.052, |
|
"eval_steps_per_second": 1.257, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.0060711188204683, |
|
"eval_loss": 1.4911904335021973, |
|
"eval_runtime": 573.8034, |
|
"eval_samples_per_second": 10.049, |
|
"eval_steps_per_second": 1.257, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.0407632263660018, |
|
"grad_norm": 6.181447982788086, |
|
"learning_rate": 1.9796183868169993e-05, |
|
"loss": 1.3108, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.0407632263660018, |
|
"eval_loss": 1.5114498138427734, |
|
"eval_runtime": 573.9439, |
|
"eval_samples_per_second": 10.046, |
|
"eval_steps_per_second": 1.256, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.0754553339115351, |
|
"eval_loss": 1.5078836679458618, |
|
"eval_runtime": 573.7341, |
|
"eval_samples_per_second": 10.05, |
|
"eval_steps_per_second": 1.257, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.1101474414570685, |
|
"eval_loss": 1.512686848640442, |
|
"eval_runtime": 573.5532, |
|
"eval_samples_per_second": 10.053, |
|
"eval_steps_per_second": 1.257, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.1274934952298352, |
|
"grad_norm": 6.276436805725098, |
|
"learning_rate": 1.9362532523850823e-05, |
|
"loss": 1.1338, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.144839549002602, |
|
"eval_loss": 1.5086950063705444, |
|
"eval_runtime": 573.502, |
|
"eval_samples_per_second": 10.054, |
|
"eval_steps_per_second": 1.257, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.1795316565481353, |
|
"eval_loss": 1.5138036012649536, |
|
"eval_runtime": 573.4778, |
|
"eval_samples_per_second": 10.054, |
|
"eval_steps_per_second": 1.257, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.2142237640936686, |
|
"grad_norm": 5.294378280639648, |
|
"learning_rate": 1.8928881179531658e-05, |
|
"loss": 1.1411, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.2142237640936686, |
|
"eval_loss": 1.5119119882583618, |
|
"eval_runtime": 573.2773, |
|
"eval_samples_per_second": 10.058, |
|
"eval_steps_per_second": 1.258, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.2489158716392021, |
|
"eval_loss": 1.5059071779251099, |
|
"eval_runtime": 573.2436, |
|
"eval_samples_per_second": 10.059, |
|
"eval_steps_per_second": 1.258, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.2836079791847355, |
|
"eval_loss": 1.4931423664093018, |
|
"eval_runtime": 573.2431, |
|
"eval_samples_per_second": 10.059, |
|
"eval_steps_per_second": 1.258, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.3009540329575022, |
|
"grad_norm": 5.875624179840088, |
|
"learning_rate": 1.8495229835212492e-05, |
|
"loss": 1.1482, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.318300086730269, |
|
"eval_loss": 1.4929821491241455, |
|
"eval_runtime": 572.8059, |
|
"eval_samples_per_second": 10.066, |
|
"eval_steps_per_second": 1.259, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.3529921942758023, |
|
"eval_loss": 1.490503191947937, |
|
"eval_runtime": 572.7436, |
|
"eval_samples_per_second": 10.067, |
|
"eval_steps_per_second": 1.259, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.3876843018213356, |
|
"grad_norm": 5.962628364562988, |
|
"learning_rate": 1.8061578490893323e-05, |
|
"loss": 1.1534, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.3876843018213356, |
|
"eval_loss": 1.4796279668807983, |
|
"eval_runtime": 572.5741, |
|
"eval_samples_per_second": 10.07, |
|
"eval_steps_per_second": 1.259, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.4223764093668692, |
|
"eval_loss": 1.4942739009857178, |
|
"eval_runtime": 572.7895, |
|
"eval_samples_per_second": 10.067, |
|
"eval_steps_per_second": 1.259, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.4570685169124025, |
|
"eval_loss": 1.478100299835205, |
|
"eval_runtime": 574.02, |
|
"eval_samples_per_second": 10.045, |
|
"eval_steps_per_second": 1.256, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.4744145706851692, |
|
"grad_norm": 5.818081855773926, |
|
"learning_rate": 1.7627927146574154e-05, |
|
"loss": 1.1493, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.4917606244579358, |
|
"eval_loss": 1.4706262350082397, |
|
"eval_runtime": 573.645, |
|
"eval_samples_per_second": 10.052, |
|
"eval_steps_per_second": 1.257, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.5264527320034693, |
|
"eval_loss": 1.4702831506729126, |
|
"eval_runtime": 573.6402, |
|
"eval_samples_per_second": 10.052, |
|
"eval_steps_per_second": 1.257, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.5611448395490026, |
|
"grad_norm": 6.020638465881348, |
|
"learning_rate": 1.7194275802254988e-05, |
|
"loss": 1.1517, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.5611448395490026, |
|
"eval_loss": 1.4639151096343994, |
|
"eval_runtime": 573.5071, |
|
"eval_samples_per_second": 10.054, |
|
"eval_steps_per_second": 1.257, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.595836947094536, |
|
"eval_loss": 1.4722236394882202, |
|
"eval_runtime": 573.4545, |
|
"eval_samples_per_second": 10.055, |
|
"eval_steps_per_second": 1.257, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.6305290546400695, |
|
"eval_loss": 1.4613826274871826, |
|
"eval_runtime": 573.2765, |
|
"eval_samples_per_second": 10.058, |
|
"eval_steps_per_second": 1.258, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.647875108412836, |
|
"grad_norm": 5.535754680633545, |
|
"learning_rate": 1.676062445793582e-05, |
|
"loss": 1.1428, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.6652211621856028, |
|
"eval_loss": 1.4539824724197388, |
|
"eval_runtime": 573.1598, |
|
"eval_samples_per_second": 10.06, |
|
"eval_steps_per_second": 1.258, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.699913269731136, |
|
"eval_loss": 1.457112431526184, |
|
"eval_runtime": 573.1778, |
|
"eval_samples_per_second": 10.06, |
|
"eval_steps_per_second": 1.258, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.7346053772766696, |
|
"grad_norm": 6.019700527191162, |
|
"learning_rate": 1.6326973113616653e-05, |
|
"loss": 1.1466, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.7346053772766696, |
|
"eval_loss": 1.4443352222442627, |
|
"eval_runtime": 573.1133, |
|
"eval_samples_per_second": 10.061, |
|
"eval_steps_per_second": 1.258, |
|
"step": 10000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 28825, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.74751582519296e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|