{ "best_metric": 0.8202670216560364, "best_model_checkpoint": "not_included_mt5-base/checkpoint-5000", "epoch": 5.9880239520958085, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11976047904191617, "grad_norm": 91.10150146484375, "learning_rate": 0.0001976047904191617, "loss": 10.8476, "step": 100 }, { "epoch": 0.23952095808383234, "grad_norm": 3.626096248626709, "learning_rate": 0.00019520958083832338, "loss": 1.901, "step": 200 }, { "epoch": 0.3592814371257485, "grad_norm": 1.472233772277832, "learning_rate": 0.00019281437125748504, "loss": 1.5652, "step": 300 }, { "epoch": 0.47904191616766467, "grad_norm": 1.3357157707214355, "learning_rate": 0.0001904191616766467, "loss": 1.3415, "step": 400 }, { "epoch": 0.5988023952095808, "grad_norm": 0.947651207447052, "learning_rate": 0.0001880239520958084, "loss": 1.233, "step": 500 }, { "epoch": 0.5988023952095808, "eval_Accuracy": 0.062, "eval_loss": 1.1174869537353516, "eval_runtime": 22.1894, "eval_samples_per_second": 225.333, "eval_steps_per_second": 7.075, "step": 500 }, { "epoch": 0.718562874251497, "grad_norm": 0.8490985035896301, "learning_rate": 0.00018562874251497007, "loss": 1.1601, "step": 600 }, { "epoch": 0.8383233532934131, "grad_norm": 0.8438432812690735, "learning_rate": 0.00018323353293413173, "loss": 1.1129, "step": 700 }, { "epoch": 0.9580838323353293, "grad_norm": 0.7390198707580566, "learning_rate": 0.00018083832335329342, "loss": 1.0631, "step": 800 }, { "epoch": 1.0778443113772456, "grad_norm": 0.8146846890449524, "learning_rate": 0.0001784431137724551, "loss": 1.0243, "step": 900 }, { "epoch": 1.1976047904191618, "grad_norm": 0.7685344815254211, "learning_rate": 0.0001760479041916168, "loss": 0.9898, "step": 1000 }, { "epoch": 1.1976047904191618, "eval_Accuracy": 0.0998, "eval_loss": 0.9793121814727783, "eval_runtime": 22.9009, "eval_samples_per_second": 218.332, "eval_steps_per_second": 6.856, "step": 1000 }, { "epoch": 1.3173652694610778, "grad_norm": 0.7814993262290955, "learning_rate": 0.00017365269461077845, "loss": 0.9735, "step": 1100 }, { "epoch": 1.437125748502994, "grad_norm": 0.7339932322502136, "learning_rate": 0.0001712574850299401, "loss": 0.9723, "step": 1200 }, { "epoch": 1.55688622754491, "grad_norm": 0.7049471735954285, "learning_rate": 0.0001688622754491018, "loss": 0.9567, "step": 1300 }, { "epoch": 1.6766467065868262, "grad_norm": 0.7287872433662415, "learning_rate": 0.00016646706586826348, "loss": 0.946, "step": 1400 }, { "epoch": 1.7964071856287425, "grad_norm": 0.6998888254165649, "learning_rate": 0.00016407185628742517, "loss": 0.9204, "step": 1500 }, { "epoch": 1.7964071856287425, "eval_Accuracy": 0.1246, "eval_loss": 0.9095961451530457, "eval_runtime": 23.1954, "eval_samples_per_second": 215.56, "eval_steps_per_second": 6.769, "step": 1500 }, { "epoch": 1.9161676646706587, "grad_norm": 0.9079245328903198, "learning_rate": 0.00016167664670658683, "loss": 0.9237, "step": 1600 }, { "epoch": 2.035928143712575, "grad_norm": 0.6102410554885864, "learning_rate": 0.0001592814371257485, "loss": 0.9064, "step": 1700 }, { "epoch": 2.155688622754491, "grad_norm": 0.6484932899475098, "learning_rate": 0.00015688622754491018, "loss": 0.8548, "step": 1800 }, { "epoch": 2.2754491017964074, "grad_norm": 0.6946207880973816, "learning_rate": 0.00015449101796407186, "loss": 0.8632, "step": 1900 }, { "epoch": 2.3952095808383236, "grad_norm": 0.706642210483551, "learning_rate": 0.00015209580838323355, "loss": 0.8405, "step": 2000 }, { "epoch": 2.3952095808383236, "eval_Accuracy": 0.1388, "eval_loss": 0.8839116096496582, "eval_runtime": 22.9368, "eval_samples_per_second": 217.99, "eval_steps_per_second": 6.845, "step": 2000 }, { "epoch": 2.5149700598802394, "grad_norm": 0.7250691056251526, "learning_rate": 0.0001497005988023952, "loss": 0.8546, "step": 2100 }, { "epoch": 2.6347305389221556, "grad_norm": 0.7567930817604065, "learning_rate": 0.0001473053892215569, "loss": 0.8508, "step": 2200 }, { "epoch": 2.754491017964072, "grad_norm": 0.8484090566635132, "learning_rate": 0.00014491017964071858, "loss": 0.851, "step": 2300 }, { "epoch": 2.874251497005988, "grad_norm": 0.8208755254745483, "learning_rate": 0.00014251497005988024, "loss": 0.843, "step": 2400 }, { "epoch": 2.9940119760479043, "grad_norm": 0.6463755965232849, "learning_rate": 0.00014011976047904193, "loss": 0.8338, "step": 2500 }, { "epoch": 2.9940119760479043, "eval_Accuracy": 0.1382, "eval_loss": 0.852607786655426, "eval_runtime": 23.4307, "eval_samples_per_second": 213.396, "eval_steps_per_second": 6.701, "step": 2500 }, { "epoch": 3.1137724550898205, "grad_norm": 0.708572506904602, "learning_rate": 0.00013772455089820359, "loss": 0.7838, "step": 2600 }, { "epoch": 3.2335329341317367, "grad_norm": 0.758407711982727, "learning_rate": 0.00013532934131736527, "loss": 0.7792, "step": 2700 }, { "epoch": 3.3532934131736525, "grad_norm": 0.8018171191215515, "learning_rate": 0.00013293413173652696, "loss": 0.7873, "step": 2800 }, { "epoch": 3.4730538922155687, "grad_norm": 0.6300553679466248, "learning_rate": 0.00013053892215568865, "loss": 0.774, "step": 2900 }, { "epoch": 3.592814371257485, "grad_norm": 0.5713552832603455, "learning_rate": 0.0001281437125748503, "loss": 0.783, "step": 3000 }, { "epoch": 3.592814371257485, "eval_Accuracy": 0.1508, "eval_loss": 0.8408400416374207, "eval_runtime": 23.0015, "eval_samples_per_second": 217.377, "eval_steps_per_second": 6.826, "step": 3000 }, { "epoch": 3.712574850299401, "grad_norm": 0.6296516060829163, "learning_rate": 0.00012574850299401196, "loss": 0.7694, "step": 3100 }, { "epoch": 3.8323353293413174, "grad_norm": 0.5926547050476074, "learning_rate": 0.00012335329341317365, "loss": 0.7878, "step": 3200 }, { "epoch": 3.9520958083832336, "grad_norm": 0.8825701475143433, "learning_rate": 0.00012095808383233534, "loss": 0.7683, "step": 3300 }, { "epoch": 4.07185628742515, "grad_norm": 0.7714102268218994, "learning_rate": 0.00011856287425149701, "loss": 0.7411, "step": 3400 }, { "epoch": 4.191616766467066, "grad_norm": 0.7947181463241577, "learning_rate": 0.0001161676646706587, "loss": 0.7288, "step": 3500 }, { "epoch": 4.191616766467066, "eval_Accuracy": 0.1602, "eval_loss": 0.8330999612808228, "eval_runtime": 23.0702, "eval_samples_per_second": 216.73, "eval_steps_per_second": 6.805, "step": 3500 }, { "epoch": 4.311377245508982, "grad_norm": 0.7584651112556458, "learning_rate": 0.00011377245508982037, "loss": 0.7258, "step": 3600 }, { "epoch": 4.431137724550898, "grad_norm": 0.6492884159088135, "learning_rate": 0.00011137724550898203, "loss": 0.7222, "step": 3700 }, { "epoch": 4.550898203592815, "grad_norm": 0.7623780965805054, "learning_rate": 0.00010898203592814371, "loss": 0.7168, "step": 3800 }, { "epoch": 4.6706586826347305, "grad_norm": 0.8104670643806458, "learning_rate": 0.00010658682634730539, "loss": 0.7272, "step": 3900 }, { "epoch": 4.790419161676647, "grad_norm": 0.6871969699859619, "learning_rate": 0.00010419161676646707, "loss": 0.7204, "step": 4000 }, { "epoch": 4.790419161676647, "eval_Accuracy": 0.159, "eval_loss": 0.8263267874717712, "eval_runtime": 22.9773, "eval_samples_per_second": 217.606, "eval_steps_per_second": 6.833, "step": 4000 }, { "epoch": 4.910179640718563, "grad_norm": 0.7578293085098267, "learning_rate": 0.00010179640718562875, "loss": 0.727, "step": 4100 }, { "epoch": 5.029940119760479, "grad_norm": 0.71566241979599, "learning_rate": 9.940119760479042e-05, "loss": 0.7111, "step": 4200 }, { "epoch": 5.149700598802395, "grad_norm": 0.7744144201278687, "learning_rate": 9.700598802395209e-05, "loss": 0.6699, "step": 4300 }, { "epoch": 5.269461077844311, "grad_norm": 0.7886649370193481, "learning_rate": 9.461077844311378e-05, "loss": 0.6791, "step": 4400 }, { "epoch": 5.389221556886228, "grad_norm": 0.7046491503715515, "learning_rate": 9.221556886227547e-05, "loss": 0.6533, "step": 4500 }, { "epoch": 5.389221556886228, "eval_Accuracy": 0.1644, "eval_loss": 0.8296283483505249, "eval_runtime": 23.2437, "eval_samples_per_second": 215.112, "eval_steps_per_second": 6.755, "step": 4500 }, { "epoch": 5.508982035928144, "grad_norm": 0.6836217641830444, "learning_rate": 8.982035928143712e-05, "loss": 0.6798, "step": 4600 }, { "epoch": 5.62874251497006, "grad_norm": 0.8316423296928406, "learning_rate": 8.742514970059881e-05, "loss": 0.6813, "step": 4700 }, { "epoch": 5.748502994011976, "grad_norm": 0.806846022605896, "learning_rate": 8.502994011976048e-05, "loss": 0.6797, "step": 4800 }, { "epoch": 5.868263473053892, "grad_norm": 0.7254876494407654, "learning_rate": 8.263473053892216e-05, "loss": 0.6732, "step": 4900 }, { "epoch": 5.9880239520958085, "grad_norm": 1.0394260883331299, "learning_rate": 8.023952095808383e-05, "loss": 0.6955, "step": 5000 }, { "epoch": 5.9880239520958085, "eval_Accuracy": 0.1616, "eval_loss": 0.8202670216560364, "eval_runtime": 23.0559, "eval_samples_per_second": 216.864, "eval_steps_per_second": 6.81, "step": 5000 } ], "logging_steps": 100, "max_steps": 8350, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4755250684394496.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }