mt5-base-kirakira-names / trainer_state.json
umisato's picture
Upload 12 files
5f77eed verified
{
"best_metric": 0.8202670216560364,
"best_model_checkpoint": "not_included_mt5-base/checkpoint-5000",
"epoch": 5.9880239520958085,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.11976047904191617,
"grad_norm": 91.10150146484375,
"learning_rate": 0.0001976047904191617,
"loss": 10.8476,
"step": 100
},
{
"epoch": 0.23952095808383234,
"grad_norm": 3.626096248626709,
"learning_rate": 0.00019520958083832338,
"loss": 1.901,
"step": 200
},
{
"epoch": 0.3592814371257485,
"grad_norm": 1.472233772277832,
"learning_rate": 0.00019281437125748504,
"loss": 1.5652,
"step": 300
},
{
"epoch": 0.47904191616766467,
"grad_norm": 1.3357157707214355,
"learning_rate": 0.0001904191616766467,
"loss": 1.3415,
"step": 400
},
{
"epoch": 0.5988023952095808,
"grad_norm": 0.947651207447052,
"learning_rate": 0.0001880239520958084,
"loss": 1.233,
"step": 500
},
{
"epoch": 0.5988023952095808,
"eval_Accuracy": 0.062,
"eval_loss": 1.1174869537353516,
"eval_runtime": 22.1894,
"eval_samples_per_second": 225.333,
"eval_steps_per_second": 7.075,
"step": 500
},
{
"epoch": 0.718562874251497,
"grad_norm": 0.8490985035896301,
"learning_rate": 0.00018562874251497007,
"loss": 1.1601,
"step": 600
},
{
"epoch": 0.8383233532934131,
"grad_norm": 0.8438432812690735,
"learning_rate": 0.00018323353293413173,
"loss": 1.1129,
"step": 700
},
{
"epoch": 0.9580838323353293,
"grad_norm": 0.7390198707580566,
"learning_rate": 0.00018083832335329342,
"loss": 1.0631,
"step": 800
},
{
"epoch": 1.0778443113772456,
"grad_norm": 0.8146846890449524,
"learning_rate": 0.0001784431137724551,
"loss": 1.0243,
"step": 900
},
{
"epoch": 1.1976047904191618,
"grad_norm": 0.7685344815254211,
"learning_rate": 0.0001760479041916168,
"loss": 0.9898,
"step": 1000
},
{
"epoch": 1.1976047904191618,
"eval_Accuracy": 0.0998,
"eval_loss": 0.9793121814727783,
"eval_runtime": 22.9009,
"eval_samples_per_second": 218.332,
"eval_steps_per_second": 6.856,
"step": 1000
},
{
"epoch": 1.3173652694610778,
"grad_norm": 0.7814993262290955,
"learning_rate": 0.00017365269461077845,
"loss": 0.9735,
"step": 1100
},
{
"epoch": 1.437125748502994,
"grad_norm": 0.7339932322502136,
"learning_rate": 0.0001712574850299401,
"loss": 0.9723,
"step": 1200
},
{
"epoch": 1.55688622754491,
"grad_norm": 0.7049471735954285,
"learning_rate": 0.0001688622754491018,
"loss": 0.9567,
"step": 1300
},
{
"epoch": 1.6766467065868262,
"grad_norm": 0.7287872433662415,
"learning_rate": 0.00016646706586826348,
"loss": 0.946,
"step": 1400
},
{
"epoch": 1.7964071856287425,
"grad_norm": 0.6998888254165649,
"learning_rate": 0.00016407185628742517,
"loss": 0.9204,
"step": 1500
},
{
"epoch": 1.7964071856287425,
"eval_Accuracy": 0.1246,
"eval_loss": 0.9095961451530457,
"eval_runtime": 23.1954,
"eval_samples_per_second": 215.56,
"eval_steps_per_second": 6.769,
"step": 1500
},
{
"epoch": 1.9161676646706587,
"grad_norm": 0.9079245328903198,
"learning_rate": 0.00016167664670658683,
"loss": 0.9237,
"step": 1600
},
{
"epoch": 2.035928143712575,
"grad_norm": 0.6102410554885864,
"learning_rate": 0.0001592814371257485,
"loss": 0.9064,
"step": 1700
},
{
"epoch": 2.155688622754491,
"grad_norm": 0.6484932899475098,
"learning_rate": 0.00015688622754491018,
"loss": 0.8548,
"step": 1800
},
{
"epoch": 2.2754491017964074,
"grad_norm": 0.6946207880973816,
"learning_rate": 0.00015449101796407186,
"loss": 0.8632,
"step": 1900
},
{
"epoch": 2.3952095808383236,
"grad_norm": 0.706642210483551,
"learning_rate": 0.00015209580838323355,
"loss": 0.8405,
"step": 2000
},
{
"epoch": 2.3952095808383236,
"eval_Accuracy": 0.1388,
"eval_loss": 0.8839116096496582,
"eval_runtime": 22.9368,
"eval_samples_per_second": 217.99,
"eval_steps_per_second": 6.845,
"step": 2000
},
{
"epoch": 2.5149700598802394,
"grad_norm": 0.7250691056251526,
"learning_rate": 0.0001497005988023952,
"loss": 0.8546,
"step": 2100
},
{
"epoch": 2.6347305389221556,
"grad_norm": 0.7567930817604065,
"learning_rate": 0.0001473053892215569,
"loss": 0.8508,
"step": 2200
},
{
"epoch": 2.754491017964072,
"grad_norm": 0.8484090566635132,
"learning_rate": 0.00014491017964071858,
"loss": 0.851,
"step": 2300
},
{
"epoch": 2.874251497005988,
"grad_norm": 0.8208755254745483,
"learning_rate": 0.00014251497005988024,
"loss": 0.843,
"step": 2400
},
{
"epoch": 2.9940119760479043,
"grad_norm": 0.6463755965232849,
"learning_rate": 0.00014011976047904193,
"loss": 0.8338,
"step": 2500
},
{
"epoch": 2.9940119760479043,
"eval_Accuracy": 0.1382,
"eval_loss": 0.852607786655426,
"eval_runtime": 23.4307,
"eval_samples_per_second": 213.396,
"eval_steps_per_second": 6.701,
"step": 2500
},
{
"epoch": 3.1137724550898205,
"grad_norm": 0.708572506904602,
"learning_rate": 0.00013772455089820359,
"loss": 0.7838,
"step": 2600
},
{
"epoch": 3.2335329341317367,
"grad_norm": 0.758407711982727,
"learning_rate": 0.00013532934131736527,
"loss": 0.7792,
"step": 2700
},
{
"epoch": 3.3532934131736525,
"grad_norm": 0.8018171191215515,
"learning_rate": 0.00013293413173652696,
"loss": 0.7873,
"step": 2800
},
{
"epoch": 3.4730538922155687,
"grad_norm": 0.6300553679466248,
"learning_rate": 0.00013053892215568865,
"loss": 0.774,
"step": 2900
},
{
"epoch": 3.592814371257485,
"grad_norm": 0.5713552832603455,
"learning_rate": 0.0001281437125748503,
"loss": 0.783,
"step": 3000
},
{
"epoch": 3.592814371257485,
"eval_Accuracy": 0.1508,
"eval_loss": 0.8408400416374207,
"eval_runtime": 23.0015,
"eval_samples_per_second": 217.377,
"eval_steps_per_second": 6.826,
"step": 3000
},
{
"epoch": 3.712574850299401,
"grad_norm": 0.6296516060829163,
"learning_rate": 0.00012574850299401196,
"loss": 0.7694,
"step": 3100
},
{
"epoch": 3.8323353293413174,
"grad_norm": 0.5926547050476074,
"learning_rate": 0.00012335329341317365,
"loss": 0.7878,
"step": 3200
},
{
"epoch": 3.9520958083832336,
"grad_norm": 0.8825701475143433,
"learning_rate": 0.00012095808383233534,
"loss": 0.7683,
"step": 3300
},
{
"epoch": 4.07185628742515,
"grad_norm": 0.7714102268218994,
"learning_rate": 0.00011856287425149701,
"loss": 0.7411,
"step": 3400
},
{
"epoch": 4.191616766467066,
"grad_norm": 0.7947181463241577,
"learning_rate": 0.0001161676646706587,
"loss": 0.7288,
"step": 3500
},
{
"epoch": 4.191616766467066,
"eval_Accuracy": 0.1602,
"eval_loss": 0.8330999612808228,
"eval_runtime": 23.0702,
"eval_samples_per_second": 216.73,
"eval_steps_per_second": 6.805,
"step": 3500
},
{
"epoch": 4.311377245508982,
"grad_norm": 0.7584651112556458,
"learning_rate": 0.00011377245508982037,
"loss": 0.7258,
"step": 3600
},
{
"epoch": 4.431137724550898,
"grad_norm": 0.6492884159088135,
"learning_rate": 0.00011137724550898203,
"loss": 0.7222,
"step": 3700
},
{
"epoch": 4.550898203592815,
"grad_norm": 0.7623780965805054,
"learning_rate": 0.00010898203592814371,
"loss": 0.7168,
"step": 3800
},
{
"epoch": 4.6706586826347305,
"grad_norm": 0.8104670643806458,
"learning_rate": 0.00010658682634730539,
"loss": 0.7272,
"step": 3900
},
{
"epoch": 4.790419161676647,
"grad_norm": 0.6871969699859619,
"learning_rate": 0.00010419161676646707,
"loss": 0.7204,
"step": 4000
},
{
"epoch": 4.790419161676647,
"eval_Accuracy": 0.159,
"eval_loss": 0.8263267874717712,
"eval_runtime": 22.9773,
"eval_samples_per_second": 217.606,
"eval_steps_per_second": 6.833,
"step": 4000
},
{
"epoch": 4.910179640718563,
"grad_norm": 0.7578293085098267,
"learning_rate": 0.00010179640718562875,
"loss": 0.727,
"step": 4100
},
{
"epoch": 5.029940119760479,
"grad_norm": 0.71566241979599,
"learning_rate": 9.940119760479042e-05,
"loss": 0.7111,
"step": 4200
},
{
"epoch": 5.149700598802395,
"grad_norm": 0.7744144201278687,
"learning_rate": 9.700598802395209e-05,
"loss": 0.6699,
"step": 4300
},
{
"epoch": 5.269461077844311,
"grad_norm": 0.7886649370193481,
"learning_rate": 9.461077844311378e-05,
"loss": 0.6791,
"step": 4400
},
{
"epoch": 5.389221556886228,
"grad_norm": 0.7046491503715515,
"learning_rate": 9.221556886227547e-05,
"loss": 0.6533,
"step": 4500
},
{
"epoch": 5.389221556886228,
"eval_Accuracy": 0.1644,
"eval_loss": 0.8296283483505249,
"eval_runtime": 23.2437,
"eval_samples_per_second": 215.112,
"eval_steps_per_second": 6.755,
"step": 4500
},
{
"epoch": 5.508982035928144,
"grad_norm": 0.6836217641830444,
"learning_rate": 8.982035928143712e-05,
"loss": 0.6798,
"step": 4600
},
{
"epoch": 5.62874251497006,
"grad_norm": 0.8316423296928406,
"learning_rate": 8.742514970059881e-05,
"loss": 0.6813,
"step": 4700
},
{
"epoch": 5.748502994011976,
"grad_norm": 0.806846022605896,
"learning_rate": 8.502994011976048e-05,
"loss": 0.6797,
"step": 4800
},
{
"epoch": 5.868263473053892,
"grad_norm": 0.7254876494407654,
"learning_rate": 8.263473053892216e-05,
"loss": 0.6732,
"step": 4900
},
{
"epoch": 5.9880239520958085,
"grad_norm": 1.0394260883331299,
"learning_rate": 8.023952095808383e-05,
"loss": 0.6955,
"step": 5000
},
{
"epoch": 5.9880239520958085,
"eval_Accuracy": 0.1616,
"eval_loss": 0.8202670216560364,
"eval_runtime": 23.0559,
"eval_samples_per_second": 216.864,
"eval_steps_per_second": 6.81,
"step": 5000
}
],
"logging_steps": 100,
"max_steps": 8350,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.01
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4755250684394496.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}