{
"best_metric": 0.8246700763702393,
"best_model_checkpoint": "data/Llama-31-8B_task-1_60-samples_config-1_full_auto/checkpoint-46",
"epoch": 14.956521739130435,
"eval_steps": 500,
"global_step": 86,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.17391304347826086,
"grad_norm": 1.8825360536575317,
"learning_rate": 4.000000000000001e-06,
"loss": 2.134,
"step": 1
},
{
"epoch": 0.34782608695652173,
"grad_norm": 1.797432780265808,
"learning_rate": 8.000000000000001e-06,
"loss": 2.2794,
"step": 2
},
{
"epoch": 0.6956521739130435,
"grad_norm": 1.3481346368789673,
"learning_rate": 1.6000000000000003e-05,
"loss": 2.1899,
"step": 4
},
{
"epoch": 0.8695652173913043,
"eval_loss": 2.0615599155426025,
"eval_runtime": 5.0274,
"eval_samples_per_second": 2.387,
"eval_steps_per_second": 2.387,
"step": 5
},
{
"epoch": 1.0434782608695652,
"grad_norm": 1.4031779766082764,
"learning_rate": 2.4e-05,
"loss": 2.1487,
"step": 6
},
{
"epoch": 1.391304347826087,
"grad_norm": 1.6137045621871948,
"learning_rate": 3.2000000000000005e-05,
"loss": 2.0567,
"step": 8
},
{
"epoch": 1.7391304347826086,
"grad_norm": 1.0302890539169312,
"learning_rate": 4e-05,
"loss": 1.9473,
"step": 10
},
{
"epoch": 1.9130434782608696,
"eval_loss": 1.7736454010009766,
"eval_runtime": 5.0277,
"eval_samples_per_second": 2.387,
"eval_steps_per_second": 2.387,
"step": 11
},
{
"epoch": 2.0869565217391304,
"grad_norm": 1.3873261213302612,
"learning_rate": 4.8e-05,
"loss": 1.9107,
"step": 12
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.8510915637016296,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.7363,
"step": 14
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.6782408356666565,
"learning_rate": 6.400000000000001e-05,
"loss": 1.6049,
"step": 16
},
{
"epoch": 2.9565217391304346,
"eval_loss": 1.4598970413208008,
"eval_runtime": 5.0449,
"eval_samples_per_second": 2.379,
"eval_steps_per_second": 2.379,
"step": 17
},
{
"epoch": 3.130434782608696,
"grad_norm": 0.7629269361495972,
"learning_rate": 7.2e-05,
"loss": 1.5614,
"step": 18
},
{
"epoch": 3.4782608695652173,
"grad_norm": 0.9661106467247009,
"learning_rate": 8e-05,
"loss": 1.3718,
"step": 20
},
{
"epoch": 3.8260869565217392,
"grad_norm": 0.9902026653289795,
"learning_rate": 8.800000000000001e-05,
"loss": 1.1934,
"step": 22
},
{
"epoch": 4.0,
"eval_loss": 1.0349758863449097,
"eval_runtime": 5.0242,
"eval_samples_per_second": 2.388,
"eval_steps_per_second": 2.388,
"step": 23
},
{
"epoch": 4.173913043478261,
"grad_norm": 0.8432597517967224,
"learning_rate": 9.6e-05,
"loss": 1.0786,
"step": 24
},
{
"epoch": 4.521739130434782,
"grad_norm": 0.5363503694534302,
"learning_rate": 9.999512620046522e-05,
"loss": 0.9666,
"step": 26
},
{
"epoch": 4.869565217391305,
"grad_norm": 0.5397949814796448,
"learning_rate": 9.995614150494293e-05,
"loss": 0.8711,
"step": 28
},
{
"epoch": 4.869565217391305,
"eval_loss": 0.906165361404419,
"eval_runtime": 5.0276,
"eval_samples_per_second": 2.387,
"eval_steps_per_second": 2.387,
"step": 28
},
{
"epoch": 5.217391304347826,
"grad_norm": 0.4831950068473816,
"learning_rate": 9.987820251299122e-05,
"loss": 0.8453,
"step": 30
},
{
"epoch": 5.565217391304348,
"grad_norm": 0.45016416907310486,
"learning_rate": 9.976136999909156e-05,
"loss": 0.8762,
"step": 32
},
{
"epoch": 5.913043478260869,
"grad_norm": 0.41508305072784424,
"learning_rate": 9.96057350657239e-05,
"loss": 0.8035,
"step": 34
},
{
"epoch": 5.913043478260869,
"eval_loss": 0.8637191653251648,
"eval_runtime": 5.0295,
"eval_samples_per_second": 2.386,
"eval_steps_per_second": 2.386,
"step": 34
},
{
"epoch": 6.260869565217392,
"grad_norm": 0.4248453378677368,
"learning_rate": 9.941141907232765e-05,
"loss": 0.7614,
"step": 36
},
{
"epoch": 6.608695652173913,
"grad_norm": 0.4545738995075226,
"learning_rate": 9.917857354066931e-05,
"loss": 0.7764,
"step": 38
},
{
"epoch": 6.956521739130435,
"grad_norm": 0.4439631700515747,
"learning_rate": 9.890738003669029e-05,
"loss": 0.7366,
"step": 40
},
{
"epoch": 6.956521739130435,
"eval_loss": 0.8405813574790955,
"eval_runtime": 5.0307,
"eval_samples_per_second": 2.385,
"eval_steps_per_second": 2.385,
"step": 40
},
{
"epoch": 7.304347826086957,
"grad_norm": 0.5056247115135193,
"learning_rate": 9.859805002892732e-05,
"loss": 0.7489,
"step": 42
},
{
"epoch": 7.6521739130434785,
"grad_norm": 0.44373199343681335,
"learning_rate": 9.825082472361557e-05,
"loss": 0.6803,
"step": 44
},
{
"epoch": 8.0,
"grad_norm": 0.5359618067741394,
"learning_rate": 9.786597487660337e-05,
"loss": 0.6995,
"step": 46
},
{
"epoch": 8.0,
"eval_loss": 0.8246700763702393,
"eval_runtime": 5.0242,
"eval_samples_per_second": 2.388,
"eval_steps_per_second": 2.388,
"step": 46
},
{
"epoch": 8.347826086956522,
"grad_norm": 0.5545627474784851,
"learning_rate": 9.744380058222483e-05,
"loss": 0.6248,
"step": 48
},
{
"epoch": 8.695652173913043,
"grad_norm": 0.6266648173332214,
"learning_rate": 9.698463103929542e-05,
"loss": 0.6613,
"step": 50
},
{
"epoch": 8.869565217391305,
"eval_loss": 0.8258533477783203,
"eval_runtime": 5.0277,
"eval_samples_per_second": 2.387,
"eval_steps_per_second": 2.387,
"step": 51
},
{
"epoch": 9.043478260869565,
"grad_norm": 0.7139117121696472,
"learning_rate": 9.648882429441257e-05,
"loss": 0.6253,
"step": 52
},
{
"epoch": 9.391304347826088,
"grad_norm": 0.7528420686721802,
"learning_rate": 9.595676696276172e-05,
"loss": 0.5488,
"step": 54
},
{
"epoch": 9.73913043478261,
"grad_norm": 0.7976906895637512,
"learning_rate": 9.538887392664544e-05,
"loss": 0.5531,
"step": 56
},
{
"epoch": 9.91304347826087,
"eval_loss": 0.8317675590515137,
"eval_runtime": 5.0285,
"eval_samples_per_second": 2.386,
"eval_steps_per_second": 2.386,
"step": 57
},
{
"epoch": 10.08695652173913,
"grad_norm": 0.7176584005355835,
"learning_rate": 9.478558801197065e-05,
"loss": 0.5712,
"step": 58
},
{
"epoch": 10.434782608695652,
"grad_norm": 0.6586626768112183,
"learning_rate": 9.414737964294636e-05,
"loss": 0.4544,
"step": 60
},
{
"epoch": 10.782608695652174,
"grad_norm": 0.8539354205131531,
"learning_rate": 9.347474647526095e-05,
"loss": 0.5061,
"step": 62
},
{
"epoch": 10.956521739130435,
"eval_loss": 0.8597629070281982,
"eval_runtime": 5.029,
"eval_samples_per_second": 2.386,
"eval_steps_per_second": 2.386,
"step": 63
},
{
"epoch": 11.130434782608695,
"grad_norm": 0.6799076199531555,
"learning_rate": 9.276821300802534e-05,
"loss": 0.4456,
"step": 64
},
{
"epoch": 11.478260869565217,
"grad_norm": 0.6904007792472839,
"learning_rate": 9.202833017478422e-05,
"loss": 0.3692,
"step": 66
},
{
"epoch": 11.826086956521738,
"grad_norm": 0.926986813545227,
"learning_rate": 9.125567491391476e-05,
"loss": 0.3776,
"step": 68
},
{
"epoch": 12.0,
"eval_loss": 0.9167086482048035,
"eval_runtime": 5.0287,
"eval_samples_per_second": 2.386,
"eval_steps_per_second": 2.386,
"step": 69
},
{
"epoch": 12.173913043478262,
"grad_norm": 0.8241488337516785,
"learning_rate": 9.045084971874738e-05,
"loss": 0.3262,
"step": 70
},
{
"epoch": 12.521739130434783,
"grad_norm": 0.8996691703796387,
"learning_rate": 8.961448216775954e-05,
"loss": 0.3029,
"step": 72
},
{
"epoch": 12.869565217391305,
"grad_norm": 1.148189663887024,
"learning_rate": 8.874722443520899e-05,
"loss": 0.264,
"step": 74
},
{
"epoch": 12.869565217391305,
"eval_loss": 1.0157562494277954,
"eval_runtime": 5.0275,
"eval_samples_per_second": 2.387,
"eval_steps_per_second": 2.387,
"step": 74
},
{
"epoch": 13.217391304347826,
"grad_norm": 0.9193633794784546,
"learning_rate": 8.784975278258783e-05,
"loss": 0.2267,
"step": 76
},
{
"epoch": 13.565217391304348,
"grad_norm": 1.1552317142486572,
"learning_rate": 8.692276703129421e-05,
"loss": 0.1833,
"step": 78
},
{
"epoch": 13.91304347826087,
"grad_norm": 0.9492861032485962,
"learning_rate": 8.596699001693255e-05,
"loss": 0.1878,
"step": 80
},
{
"epoch": 13.91304347826087,
"eval_loss": 1.1250226497650146,
"eval_runtime": 5.0348,
"eval_samples_per_second": 2.383,
"eval_steps_per_second": 2.383,
"step": 80
},
{
"epoch": 14.26086956521739,
"grad_norm": 1.2470932006835938,
"learning_rate": 8.498316702566828e-05,
"loss": 0.1453,
"step": 82
},
{
"epoch": 14.608695652173914,
"grad_norm": 1.0260671377182007,
"learning_rate": 8.397206521307584e-05,
"loss": 0.126,
"step": 84
},
{
"epoch": 14.956521739130435,
"grad_norm": 1.2034170627593994,
"learning_rate": 8.293447300593402e-05,
"loss": 0.1417,
"step": 86
},
{
"epoch": 14.956521739130435,
"eval_loss": 1.203776240348816,
"eval_runtime": 5.0321,
"eval_samples_per_second": 2.385,
"eval_steps_per_second": 2.385,
"step": 86
},
{
"epoch": 14.956521739130435,
"step": 86,
"total_flos": 1.1330036965572608e+16,
"train_loss": 0.8706800944150187,
"train_runtime": 834.1495,
"train_samples_per_second": 2.757,
"train_steps_per_second": 0.3
}
],
"logging_steps": 2,
"max_steps": 250,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 7,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1330036965572608e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}