{
  "best_metric": 0.8202670216560364,
  "best_model_checkpoint": "not_included_mt5-base/checkpoint-5000",
  "epoch": 5.9880239520958085,
  "eval_steps": 500,
  "global_step": 5000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11976047904191617,
      "grad_norm": 91.10150146484375,
      "learning_rate": 0.0001976047904191617,
      "loss": 10.8476,
      "step": 100
    },
    {
      "epoch": 0.23952095808383234,
      "grad_norm": 3.626096248626709,
      "learning_rate": 0.00019520958083832338,
      "loss": 1.901,
      "step": 200
    },
    {
      "epoch": 0.3592814371257485,
      "grad_norm": 1.472233772277832,
      "learning_rate": 0.00019281437125748504,
      "loss": 1.5652,
      "step": 300
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 1.3357157707214355,
      "learning_rate": 0.0001904191616766467,
      "loss": 1.3415,
      "step": 400
    },
    {
      "epoch": 0.5988023952095808,
      "grad_norm": 0.947651207447052,
      "learning_rate": 0.0001880239520958084,
      "loss": 1.233,
      "step": 500
    },
    {
      "epoch": 0.5988023952095808,
      "eval_Accuracy": 0.062,
      "eval_loss": 1.1174869537353516,
      "eval_runtime": 22.1894,
      "eval_samples_per_second": 225.333,
      "eval_steps_per_second": 7.075,
      "step": 500
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 0.8490985035896301,
      "learning_rate": 0.00018562874251497007,
      "loss": 1.1601,
      "step": 600
    },
    {
      "epoch": 0.8383233532934131,
      "grad_norm": 0.8438432812690735,
      "learning_rate": 0.00018323353293413173,
      "loss": 1.1129,
      "step": 700
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 0.7390198707580566,
      "learning_rate": 0.00018083832335329342,
      "loss": 1.0631,
      "step": 800
    },
    {
      "epoch": 1.0778443113772456,
      "grad_norm": 0.8146846890449524,
      "learning_rate": 0.0001784431137724551,
      "loss": 1.0243,
      "step": 900
    },
    {
      "epoch": 1.1976047904191618,
      "grad_norm": 0.7685344815254211,
      "learning_rate": 0.0001760479041916168,
      "loss": 0.9898,
      "step": 1000
    },
    {
      "epoch": 1.1976047904191618,
      "eval_Accuracy": 0.0998,
      "eval_loss": 0.9793121814727783,
      "eval_runtime": 22.9009,
      "eval_samples_per_second": 218.332,
      "eval_steps_per_second": 6.856,
      "step": 1000
    },
    {
      "epoch": 1.3173652694610778,
      "grad_norm": 0.7814993262290955,
      "learning_rate": 0.00017365269461077845,
      "loss": 0.9735,
      "step": 1100
    },
    {
      "epoch": 1.437125748502994,
      "grad_norm": 0.7339932322502136,
      "learning_rate": 0.0001712574850299401,
      "loss": 0.9723,
      "step": 1200
    },
    {
      "epoch": 1.55688622754491,
      "grad_norm": 0.7049471735954285,
      "learning_rate": 0.0001688622754491018,
      "loss": 0.9567,
      "step": 1300
    },
    {
      "epoch": 1.6766467065868262,
      "grad_norm": 0.7287872433662415,
      "learning_rate": 0.00016646706586826348,
      "loss": 0.946,
      "step": 1400
    },
    {
      "epoch": 1.7964071856287425,
      "grad_norm": 0.6998888254165649,
      "learning_rate": 0.00016407185628742517,
      "loss": 0.9204,
      "step": 1500
    },
    {
      "epoch": 1.7964071856287425,
      "eval_Accuracy": 0.1246,
      "eval_loss": 0.9095961451530457,
      "eval_runtime": 23.1954,
      "eval_samples_per_second": 215.56,
      "eval_steps_per_second": 6.769,
      "step": 1500
    },
    {
      "epoch": 1.9161676646706587,
      "grad_norm": 0.9079245328903198,
      "learning_rate": 0.00016167664670658683,
      "loss": 0.9237,
      "step": 1600
    },
    {
      "epoch": 2.035928143712575,
      "grad_norm": 0.6102410554885864,
      "learning_rate": 0.0001592814371257485,
      "loss": 0.9064,
      "step": 1700
    },
    {
      "epoch": 2.155688622754491,
      "grad_norm": 0.6484932899475098,
      "learning_rate": 0.00015688622754491018,
      "loss": 0.8548,
      "step": 1800
    },
    {
      "epoch": 2.2754491017964074,
      "grad_norm": 0.6946207880973816,
      "learning_rate": 0.00015449101796407186,
      "loss": 0.8632,
      "step": 1900
    },
    {
      "epoch": 2.3952095808383236,
      "grad_norm": 0.706642210483551,
      "learning_rate": 0.00015209580838323355,
      "loss": 0.8405,
      "step": 2000
    },
    {
      "epoch": 2.3952095808383236,
      "eval_Accuracy": 0.1388,
      "eval_loss": 0.8839116096496582,
      "eval_runtime": 22.9368,
      "eval_samples_per_second": 217.99,
      "eval_steps_per_second": 6.845,
      "step": 2000
    },
    {
      "epoch": 2.5149700598802394,
      "grad_norm": 0.7250691056251526,
      "learning_rate": 0.0001497005988023952,
      "loss": 0.8546,
      "step": 2100
    },
    {
      "epoch": 2.6347305389221556,
      "grad_norm": 0.7567930817604065,
      "learning_rate": 0.0001473053892215569,
      "loss": 0.8508,
      "step": 2200
    },
    {
      "epoch": 2.754491017964072,
      "grad_norm": 0.8484090566635132,
      "learning_rate": 0.00014491017964071858,
      "loss": 0.851,
      "step": 2300
    },
    {
      "epoch": 2.874251497005988,
      "grad_norm": 0.8208755254745483,
      "learning_rate": 0.00014251497005988024,
      "loss": 0.843,
      "step": 2400
    },
    {
      "epoch": 2.9940119760479043,
      "grad_norm": 0.6463755965232849,
      "learning_rate": 0.00014011976047904193,
      "loss": 0.8338,
      "step": 2500
    },
    {
      "epoch": 2.9940119760479043,
      "eval_Accuracy": 0.1382,
      "eval_loss": 0.852607786655426,
      "eval_runtime": 23.4307,
      "eval_samples_per_second": 213.396,
      "eval_steps_per_second": 6.701,
      "step": 2500
    },
    {
      "epoch": 3.1137724550898205,
      "grad_norm": 0.708572506904602,
      "learning_rate": 0.00013772455089820359,
      "loss": 0.7838,
      "step": 2600
    },
    {
      "epoch": 3.2335329341317367,
      "grad_norm": 0.758407711982727,
      "learning_rate": 0.00013532934131736527,
      "loss": 0.7792,
      "step": 2700
    },
    {
      "epoch": 3.3532934131736525,
      "grad_norm": 0.8018171191215515,
      "learning_rate": 0.00013293413173652696,
      "loss": 0.7873,
      "step": 2800
    },
    {
      "epoch": 3.4730538922155687,
      "grad_norm": 0.6300553679466248,
      "learning_rate": 0.00013053892215568865,
      "loss": 0.774,
      "step": 2900
    },
    {
      "epoch": 3.592814371257485,
      "grad_norm": 0.5713552832603455,
      "learning_rate": 0.0001281437125748503,
      "loss": 0.783,
      "step": 3000
    },
    {
      "epoch": 3.592814371257485,
      "eval_Accuracy": 0.1508,
      "eval_loss": 0.8408400416374207,
      "eval_runtime": 23.0015,
      "eval_samples_per_second": 217.377,
      "eval_steps_per_second": 6.826,
      "step": 3000
    },
    {
      "epoch": 3.712574850299401,
      "grad_norm": 0.6296516060829163,
      "learning_rate": 0.00012574850299401196,
      "loss": 0.7694,
      "step": 3100
    },
    {
      "epoch": 3.8323353293413174,
      "grad_norm": 0.5926547050476074,
      "learning_rate": 0.00012335329341317365,
      "loss": 0.7878,
      "step": 3200
    },
    {
      "epoch": 3.9520958083832336,
      "grad_norm": 0.8825701475143433,
      "learning_rate": 0.00012095808383233534,
      "loss": 0.7683,
      "step": 3300
    },
    {
      "epoch": 4.07185628742515,
      "grad_norm": 0.7714102268218994,
      "learning_rate": 0.00011856287425149701,
      "loss": 0.7411,
      "step": 3400
    },
    {
      "epoch": 4.191616766467066,
      "grad_norm": 0.7947181463241577,
      "learning_rate": 0.0001161676646706587,
      "loss": 0.7288,
      "step": 3500
    },
    {
      "epoch": 4.191616766467066,
      "eval_Accuracy": 0.1602,
      "eval_loss": 0.8330999612808228,
      "eval_runtime": 23.0702,
      "eval_samples_per_second": 216.73,
      "eval_steps_per_second": 6.805,
      "step": 3500
    },
    {
      "epoch": 4.311377245508982,
      "grad_norm": 0.7584651112556458,
      "learning_rate": 0.00011377245508982037,
      "loss": 0.7258,
      "step": 3600
    },
    {
      "epoch": 4.431137724550898,
      "grad_norm": 0.6492884159088135,
      "learning_rate": 0.00011137724550898203,
      "loss": 0.7222,
      "step": 3700
    },
    {
      "epoch": 4.550898203592815,
      "grad_norm": 0.7623780965805054,
      "learning_rate": 0.00010898203592814371,
      "loss": 0.7168,
      "step": 3800
    },
    {
      "epoch": 4.6706586826347305,
      "grad_norm": 0.8104670643806458,
      "learning_rate": 0.00010658682634730539,
      "loss": 0.7272,
      "step": 3900
    },
    {
      "epoch": 4.790419161676647,
      "grad_norm": 0.6871969699859619,
      "learning_rate": 0.00010419161676646707,
      "loss": 0.7204,
      "step": 4000
    },
    {
      "epoch": 4.790419161676647,
      "eval_Accuracy": 0.159,
      "eval_loss": 0.8263267874717712,
      "eval_runtime": 22.9773,
      "eval_samples_per_second": 217.606,
      "eval_steps_per_second": 6.833,
      "step": 4000
    },
    {
      "epoch": 4.910179640718563,
      "grad_norm": 0.7578293085098267,
      "learning_rate": 0.00010179640718562875,
      "loss": 0.727,
      "step": 4100
    },
    {
      "epoch": 5.029940119760479,
      "grad_norm": 0.71566241979599,
      "learning_rate": 9.940119760479042e-05,
      "loss": 0.7111,
      "step": 4200
    },
    {
      "epoch": 5.149700598802395,
      "grad_norm": 0.7744144201278687,
      "learning_rate": 9.700598802395209e-05,
      "loss": 0.6699,
      "step": 4300
    },
    {
      "epoch": 5.269461077844311,
      "grad_norm": 0.7886649370193481,
      "learning_rate": 9.461077844311378e-05,
      "loss": 0.6791,
      "step": 4400
    },
    {
      "epoch": 5.389221556886228,
      "grad_norm": 0.7046491503715515,
      "learning_rate": 9.221556886227547e-05,
      "loss": 0.6533,
      "step": 4500
    },
    {
      "epoch": 5.389221556886228,
      "eval_Accuracy": 0.1644,
      "eval_loss": 0.8296283483505249,
      "eval_runtime": 23.2437,
      "eval_samples_per_second": 215.112,
      "eval_steps_per_second": 6.755,
      "step": 4500
    },
    {
      "epoch": 5.508982035928144,
      "grad_norm": 0.6836217641830444,
      "learning_rate": 8.982035928143712e-05,
      "loss": 0.6798,
      "step": 4600
    },
    {
      "epoch": 5.62874251497006,
      "grad_norm": 0.8316423296928406,
      "learning_rate": 8.742514970059881e-05,
      "loss": 0.6813,
      "step": 4700
    },
    {
      "epoch": 5.748502994011976,
      "grad_norm": 0.806846022605896,
      "learning_rate": 8.502994011976048e-05,
      "loss": 0.6797,
      "step": 4800
    },
    {
      "epoch": 5.868263473053892,
      "grad_norm": 0.7254876494407654,
      "learning_rate": 8.263473053892216e-05,
      "loss": 0.6732,
      "step": 4900
    },
    {
      "epoch": 5.9880239520958085,
      "grad_norm": 1.0394260883331299,
      "learning_rate": 8.023952095808383e-05,
      "loss": 0.6955,
      "step": 5000
    },
    {
      "epoch": 5.9880239520958085,
      "eval_Accuracy": 0.1616,
      "eval_loss": 0.8202670216560364,
      "eval_runtime": 23.0559,
      "eval_samples_per_second": 216.864,
      "eval_steps_per_second": 6.81,
      "step": 5000
    }
  ],
  "logging_steps": 100,
  "max_steps": 8350,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.01
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4755250684394496.0,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}