{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2825445684237526, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03206361421059382, "grad_norm": 9.577472686767578, "learning_rate": 0.0002465, "loss": 4.7126, "step": 500 }, { "epoch": 0.03206361421059382, "eval_loss": 1.7046922445297241, "eval_runtime": 189.0412, "eval_samples_per_second": 37.145, "eval_steps_per_second": 0.582, "eval_wer": 0.9345593218086873, "step": 500 }, { "epoch": 0.06412722842118763, "grad_norm": 6.515851020812988, "learning_rate": 0.0002939226804123711, "loss": 1.0533, "step": 1000 }, { "epoch": 0.06412722842118763, "eval_loss": 1.1487088203430176, "eval_runtime": 189.0728, "eval_samples_per_second": 37.139, "eval_steps_per_second": 0.582, "eval_wer": 0.7906695544718904, "step": 1000 }, { "epoch": 0.09619084263178146, "grad_norm": 3.676572799682617, "learning_rate": 0.0002861907216494845, "loss": 0.8268, "step": 1500 }, { "epoch": 0.09619084263178146, "eval_loss": 1.060188889503479, "eval_runtime": 190.9733, "eval_samples_per_second": 36.77, "eval_steps_per_second": 0.576, "eval_wer": 0.7815012158014324, "step": 1500 }, { "epoch": 0.12825445684237527, "grad_norm": 9.430739402770996, "learning_rate": 0.00027845876288659795, "loss": 0.7188, "step": 2000 }, { "epoch": 0.12825445684237527, "eval_loss": 0.9336337447166443, "eval_runtime": 191.0896, "eval_samples_per_second": 36.747, "eval_steps_per_second": 0.576, "eval_wer": 0.671746900702906, "step": 2000 }, { "epoch": 0.16031807105296908, "grad_norm": 2.9828200340270996, "learning_rate": 0.00027072680412371135, "loss": 0.6725, "step": 2500 }, { "epoch": 0.16031807105296908, "eval_loss": 0.9303568005561829, "eval_runtime": 191.2157, "eval_samples_per_second": 36.723, "eval_steps_per_second": 0.575, "eval_wer": 0.6560677128316879, "step": 2500 }, { "epoch": 0.19238168526356292, "grad_norm": 4.710850238800049, "learning_rate": 0.0002629948453608247, "loss": 0.6295, "step": 3000 }, { "epoch": 0.19238168526356292, "eval_loss": 0.8600214719772339, "eval_runtime": 191.6797, "eval_samples_per_second": 36.634, "eval_steps_per_second": 0.574, "eval_wer": 0.6257324705350855, "step": 3000 }, { "epoch": 0.22444529947415673, "grad_norm": 4.912868976593018, "learning_rate": 0.0002552628865979381, "loss": 0.6003, "step": 3500 }, { "epoch": 0.22444529947415673, "eval_loss": 0.8395254611968994, "eval_runtime": 191.3108, "eval_samples_per_second": 36.705, "eval_steps_per_second": 0.575, "eval_wer": 0.6113288776093224, "step": 3500 }, { "epoch": 0.25650891368475054, "grad_norm": 4.513955116271973, "learning_rate": 0.00024754639175257734, "loss": 0.5847, "step": 4000 }, { "epoch": 0.25650891368475054, "eval_loss": 0.7883865833282471, "eval_runtime": 192.8783, "eval_samples_per_second": 36.406, "eval_steps_per_second": 0.57, "eval_wer": 0.5861491648839341, "step": 4000 }, { "epoch": 0.2885725278953444, "grad_norm": 16.630624771118164, "learning_rate": 0.00023984536082474227, "loss": 0.5521, "step": 4500 }, { "epoch": 0.2885725278953444, "eval_loss": 0.7741186618804932, "eval_runtime": 189.6516, "eval_samples_per_second": 37.026, "eval_steps_per_second": 0.58, "eval_wer": 0.5686628841733214, "step": 4500 }, { "epoch": 0.32063614210593816, "grad_norm": 7.58245325088501, "learning_rate": 0.00023211340206185567, "loss": 0.5477, "step": 5000 }, { "epoch": 0.32063614210593816, "eval_loss": 0.7594121098518372, "eval_runtime": 190.5466, "eval_samples_per_second": 36.852, "eval_steps_per_second": 0.577, "eval_wer": 0.5535550565380885, "step": 5000 }, { "epoch": 0.352699756316532, "grad_norm": 5.051167011260986, "learning_rate": 0.00022438144329896904, "loss": 0.5346, "step": 5500 }, { "epoch": 0.352699756316532, "eval_loss": 0.7481973767280579, "eval_runtime": 190.5699, "eval_samples_per_second": 36.847, "eval_steps_per_second": 0.577, "eval_wer": 0.5394039251119468, "step": 5500 }, { "epoch": 0.38476337052712584, "grad_norm": 4.212076187133789, "learning_rate": 0.00021666494845360825, "loss": 0.5154, "step": 6000 }, { "epoch": 0.38476337052712584, "eval_loss": 0.7294158935546875, "eval_runtime": 189.7232, "eval_samples_per_second": 37.012, "eval_steps_per_second": 0.58, "eval_wer": 0.53515194196043, "step": 6000 }, { "epoch": 0.4168269847377196, "grad_norm": 5.682095527648926, "learning_rate": 0.00020893298969072165, "loss": 0.492, "step": 6500 }, { "epoch": 0.4168269847377196, "eval_loss": 0.7247592806816101, "eval_runtime": 190.6553, "eval_samples_per_second": 36.831, "eval_steps_per_second": 0.577, "eval_wer": 0.5492632110445262, "step": 6500 }, { "epoch": 0.44889059894831346, "grad_norm": 8.364203453063965, "learning_rate": 0.0002012164948453608, "loss": 0.4759, "step": 7000 }, { "epoch": 0.44889059894831346, "eval_loss": 0.7076719403266907, "eval_runtime": 189.5572, "eval_samples_per_second": 37.044, "eval_steps_per_second": 0.58, "eval_wer": 0.5134402529929976, "step": 7000 }, { "epoch": 0.4809542131589073, "grad_norm": 4.447290897369385, "learning_rate": 0.0001934845360824742, "loss": 0.4655, "step": 7500 }, { "epoch": 0.4809542131589073, "eval_loss": 0.673875629901886, "eval_runtime": 190.3324, "eval_samples_per_second": 36.893, "eval_steps_per_second": 0.578, "eval_wer": 0.5063979058982979, "step": 7500 }, { "epoch": 0.5130178273695011, "grad_norm": 12.618865013122559, "learning_rate": 0.0001857680412371134, "loss": 0.4594, "step": 8000 }, { "epoch": 0.5130178273695011, "eval_loss": 0.6574720144271851, "eval_runtime": 190.8303, "eval_samples_per_second": 36.797, "eval_steps_per_second": 0.576, "eval_wer": 0.5067300920820101, "step": 8000 }, { "epoch": 0.5450814415800949, "grad_norm": 2.756011962890625, "learning_rate": 0.0001780360824742268, "loss": 0.4538, "step": 8500 }, { "epoch": 0.5450814415800949, "eval_loss": 0.6492609977722168, "eval_runtime": 189.5472, "eval_samples_per_second": 37.046, "eval_steps_per_second": 0.58, "eval_wer": 0.500325542460038, "step": 8500 }, { "epoch": 0.5771450557906888, "grad_norm": 7.6861491203308105, "learning_rate": 0.0001703041237113402, "loss": 0.4739, "step": 9000 }, { "epoch": 0.5771450557906888, "eval_loss": 0.7676782608032227, "eval_runtime": 189.9033, "eval_samples_per_second": 36.977, "eval_steps_per_second": 0.579, "eval_wer": 0.5238576117142136, "step": 9000 }, { "epoch": 0.6092086700012825, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.695, "step": 9500 }, { "epoch": 0.6092086700012825, "eval_loss": NaN, "eval_runtime": 186.9158, "eval_samples_per_second": 37.568, "eval_steps_per_second": 0.589, "eval_wer": 1.0, "step": 9500 }, { "epoch": 0.6412722842118763, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 10000 }, { "epoch": 0.6412722842118763, "eval_loss": NaN, "eval_runtime": 186.7613, "eval_samples_per_second": 37.599, "eval_steps_per_second": 0.589, "eval_wer": 1.0, "step": 10000 }, { "epoch": 0.6733358984224702, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 10500 }, { "epoch": 0.6733358984224702, "eval_loss": NaN, "eval_runtime": 185.9746, "eval_samples_per_second": 37.758, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 10500 }, { "epoch": 0.705399512633064, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 11000 }, { "epoch": 0.705399512633064, "eval_loss": NaN, "eval_runtime": 186.0588, "eval_samples_per_second": 37.741, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 11000 }, { "epoch": 0.7374631268436578, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 11500 }, { "epoch": 0.7374631268436578, "eval_loss": NaN, "eval_runtime": 187.0825, "eval_samples_per_second": 37.534, "eval_steps_per_second": 0.588, "eval_wer": 1.0, "step": 11500 }, { "epoch": 0.7695267410542517, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 12000 }, { "epoch": 0.7695267410542517, "eval_loss": NaN, "eval_runtime": 186.5183, "eval_samples_per_second": 37.648, "eval_steps_per_second": 0.59, "eval_wer": 1.0, "step": 12000 }, { "epoch": 0.8015903552648455, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 12500 }, { "epoch": 0.8015903552648455, "eval_loss": NaN, "eval_runtime": 186.3281, "eval_samples_per_second": 37.686, "eval_steps_per_second": 0.59, "eval_wer": 1.0, "step": 12500 }, { "epoch": 0.8336539694754392, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 13000 }, { "epoch": 0.8336539694754392, "eval_loss": NaN, "eval_runtime": 185.5922, "eval_samples_per_second": 37.836, "eval_steps_per_second": 0.593, "eval_wer": 1.0, "step": 13000 }, { "epoch": 0.8657175836860331, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 13500 }, { "epoch": 0.8657175836860331, "eval_loss": NaN, "eval_runtime": 185.7237, "eval_samples_per_second": 37.809, "eval_steps_per_second": 0.592, "eval_wer": 1.0, "step": 13500 }, { "epoch": 0.8977811978966269, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 14000 }, { "epoch": 0.8977811978966269, "eval_loss": NaN, "eval_runtime": 186.6259, "eval_samples_per_second": 37.626, "eval_steps_per_second": 0.589, "eval_wer": 1.0, "step": 14000 }, { "epoch": 0.9298448121072207, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 14500 }, { "epoch": 0.9298448121072207, "eval_loss": NaN, "eval_runtime": 186.1517, "eval_samples_per_second": 37.722, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 14500 }, { "epoch": 0.9619084263178146, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 15000 }, { "epoch": 0.9619084263178146, "eval_loss": NaN, "eval_runtime": 186.7927, "eval_samples_per_second": 37.592, "eval_steps_per_second": 0.589, "eval_wer": 1.0, "step": 15000 }, { "epoch": 0.9939720405284084, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 15500 }, { "epoch": 0.9939720405284084, "eval_loss": NaN, "eval_runtime": 186.1708, "eval_samples_per_second": 37.718, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 15500 }, { "epoch": 1.0260356547390022, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 16000 }, { "epoch": 1.0260356547390022, "eval_loss": NaN, "eval_runtime": 186.1341, "eval_samples_per_second": 37.725, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 16000 }, { "epoch": 1.058099268949596, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 16500 }, { "epoch": 1.058099268949596, "eval_loss": NaN, "eval_runtime": 186.4575, "eval_samples_per_second": 37.66, "eval_steps_per_second": 0.59, "eval_wer": 1.0, "step": 16500 }, { "epoch": 1.0901628831601897, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 17000 }, { "epoch": 1.0901628831601897, "eval_loss": NaN, "eval_runtime": 185.4444, "eval_samples_per_second": 37.866, "eval_steps_per_second": 0.593, "eval_wer": 1.0, "step": 17000 }, { "epoch": 1.1222264973707836, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 17500 }, { "epoch": 1.1222264973707836, "eval_loss": NaN, "eval_runtime": 186.15, "eval_samples_per_second": 37.722, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 17500 }, { "epoch": 1.1542901115813775, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 18000 }, { "epoch": 1.1542901115813775, "eval_loss": NaN, "eval_runtime": 186.0027, "eval_samples_per_second": 37.752, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 18000 }, { "epoch": 1.1863537257919712, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 18500 }, { "epoch": 1.1863537257919712, "eval_loss": NaN, "eval_runtime": 185.6149, "eval_samples_per_second": 37.831, "eval_steps_per_second": 0.593, "eval_wer": 1.0, "step": 18500 }, { "epoch": 1.218417340002565, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 19000 }, { "epoch": 1.218417340002565, "eval_loss": NaN, "eval_runtime": 186.7557, "eval_samples_per_second": 37.6, "eval_steps_per_second": 0.589, "eval_wer": 1.0, "step": 19000 }, { "epoch": 1.250480954213159, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 19500 }, { "epoch": 1.250480954213159, "eval_loss": NaN, "eval_runtime": 186.7166, "eval_samples_per_second": 37.608, "eval_steps_per_second": 0.589, "eval_wer": 1.0, "step": 19500 }, { "epoch": 1.2825445684237526, "grad_norm": NaN, "learning_rate": 0.00016787628865979378, "loss": 0.0, "step": 20000 }, { "epoch": 1.2825445684237526, "eval_loss": NaN, "eval_runtime": 186.0546, "eval_samples_per_second": 37.742, "eval_steps_per_second": 0.591, "eval_wer": 1.0, "step": 20000 }, { "epoch": 1.2825445684237526, "step": 20000, "total_flos": 2.2824984432894013e+19, "train_loss": 0.38660173568725587, "train_runtime": 15166.2226, "train_samples_per_second": 10.55, "train_steps_per_second": 1.319 } ], "logging_steps": 500, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.2824984432894013e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }