{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.2825445684237526,
  "eval_steps": 500,
  "global_step": 20000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03206361421059382,
      "grad_norm": 9.577472686767578,
      "learning_rate": 0.0002465,
      "loss": 4.7126,
      "step": 500
    },
    {
      "epoch": 0.03206361421059382,
      "eval_loss": 1.7046922445297241,
      "eval_runtime": 189.0412,
      "eval_samples_per_second": 37.145,
      "eval_steps_per_second": 0.582,
      "eval_wer": 0.9345593218086873,
      "step": 500
    },
    {
      "epoch": 0.06412722842118763,
      "grad_norm": 6.515851020812988,
      "learning_rate": 0.0002939226804123711,
      "loss": 1.0533,
      "step": 1000
    },
    {
      "epoch": 0.06412722842118763,
      "eval_loss": 1.1487088203430176,
      "eval_runtime": 189.0728,
      "eval_samples_per_second": 37.139,
      "eval_steps_per_second": 0.582,
      "eval_wer": 0.7906695544718904,
      "step": 1000
    },
    {
      "epoch": 0.09619084263178146,
      "grad_norm": 3.676572799682617,
      "learning_rate": 0.0002861907216494845,
      "loss": 0.8268,
      "step": 1500
    },
    {
      "epoch": 0.09619084263178146,
      "eval_loss": 1.060188889503479,
      "eval_runtime": 190.9733,
      "eval_samples_per_second": 36.77,
      "eval_steps_per_second": 0.576,
      "eval_wer": 0.7815012158014324,
      "step": 1500
    },
    {
      "epoch": 0.12825445684237527,
      "grad_norm": 9.430739402770996,
      "learning_rate": 0.00027845876288659795,
      "loss": 0.7188,
      "step": 2000
    },
    {
      "epoch": 0.12825445684237527,
      "eval_loss": 0.9336337447166443,
      "eval_runtime": 191.0896,
      "eval_samples_per_second": 36.747,
      "eval_steps_per_second": 0.576,
      "eval_wer": 0.671746900702906,
      "step": 2000
    },
    {
      "epoch": 0.16031807105296908,
      "grad_norm": 2.9828200340270996,
      "learning_rate": 0.00027072680412371135,
      "loss": 0.6725,
      "step": 2500
    },
    {
      "epoch": 0.16031807105296908,
      "eval_loss": 0.9303568005561829,
      "eval_runtime": 191.2157,
      "eval_samples_per_second": 36.723,
      "eval_steps_per_second": 0.575,
      "eval_wer": 0.6560677128316879,
      "step": 2500
    },
    {
      "epoch": 0.19238168526356292,
      "grad_norm": 4.710850238800049,
      "learning_rate": 0.0002629948453608247,
      "loss": 0.6295,
      "step": 3000
    },
    {
      "epoch": 0.19238168526356292,
      "eval_loss": 0.8600214719772339,
      "eval_runtime": 191.6797,
      "eval_samples_per_second": 36.634,
      "eval_steps_per_second": 0.574,
      "eval_wer": 0.6257324705350855,
      "step": 3000
    },
    {
      "epoch": 0.22444529947415673,
      "grad_norm": 4.912868976593018,
      "learning_rate": 0.0002552628865979381,
      "loss": 0.6003,
      "step": 3500
    },
    {
      "epoch": 0.22444529947415673,
      "eval_loss": 0.8395254611968994,
      "eval_runtime": 191.3108,
      "eval_samples_per_second": 36.705,
      "eval_steps_per_second": 0.575,
      "eval_wer": 0.6113288776093224,
      "step": 3500
    },
    {
      "epoch": 0.25650891368475054,
      "grad_norm": 4.513955116271973,
      "learning_rate": 0.00024754639175257734,
      "loss": 0.5847,
      "step": 4000
    },
    {
      "epoch": 0.25650891368475054,
      "eval_loss": 0.7883865833282471,
      "eval_runtime": 192.8783,
      "eval_samples_per_second": 36.406,
      "eval_steps_per_second": 0.57,
      "eval_wer": 0.5861491648839341,
      "step": 4000
    },
    {
      "epoch": 0.2885725278953444,
      "grad_norm": 16.630624771118164,
      "learning_rate": 0.00023984536082474227,
      "loss": 0.5521,
      "step": 4500
    },
    {
      "epoch": 0.2885725278953444,
      "eval_loss": 0.7741186618804932,
      "eval_runtime": 189.6516,
      "eval_samples_per_second": 37.026,
      "eval_steps_per_second": 0.58,
      "eval_wer": 0.5686628841733214,
      "step": 4500
    },
    {
      "epoch": 0.32063614210593816,
      "grad_norm": 7.58245325088501,
      "learning_rate": 0.00023211340206185567,
      "loss": 0.5477,
      "step": 5000
    },
    {
      "epoch": 0.32063614210593816,
      "eval_loss": 0.7594121098518372,
      "eval_runtime": 190.5466,
      "eval_samples_per_second": 36.852,
      "eval_steps_per_second": 0.577,
      "eval_wer": 0.5535550565380885,
      "step": 5000
    },
    {
      "epoch": 0.352699756316532,
      "grad_norm": 5.051167011260986,
      "learning_rate": 0.00022438144329896904,
      "loss": 0.5346,
      "step": 5500
    },
    {
      "epoch": 0.352699756316532,
      "eval_loss": 0.7481973767280579,
      "eval_runtime": 190.5699,
      "eval_samples_per_second": 36.847,
      "eval_steps_per_second": 0.577,
      "eval_wer": 0.5394039251119468,
      "step": 5500
    },
    {
      "epoch": 0.38476337052712584,
      "grad_norm": 4.212076187133789,
      "learning_rate": 0.00021666494845360825,
      "loss": 0.5154,
      "step": 6000
    },
    {
      "epoch": 0.38476337052712584,
      "eval_loss": 0.7294158935546875,
      "eval_runtime": 189.7232,
      "eval_samples_per_second": 37.012,
      "eval_steps_per_second": 0.58,
      "eval_wer": 0.53515194196043,
      "step": 6000
    },
    {
      "epoch": 0.4168269847377196,
      "grad_norm": 5.682095527648926,
      "learning_rate": 0.00020893298969072165,
      "loss": 0.492,
      "step": 6500
    },
    {
      "epoch": 0.4168269847377196,
      "eval_loss": 0.7247592806816101,
      "eval_runtime": 190.6553,
      "eval_samples_per_second": 36.831,
      "eval_steps_per_second": 0.577,
      "eval_wer": 0.5492632110445262,
      "step": 6500
    },
    {
      "epoch": 0.44889059894831346,
      "grad_norm": 8.364203453063965,
      "learning_rate": 0.0002012164948453608,
      "loss": 0.4759,
      "step": 7000
    },
    {
      "epoch": 0.44889059894831346,
      "eval_loss": 0.7076719403266907,
      "eval_runtime": 189.5572,
      "eval_samples_per_second": 37.044,
      "eval_steps_per_second": 0.58,
      "eval_wer": 0.5134402529929976,
      "step": 7000
    },
    {
      "epoch": 0.4809542131589073,
      "grad_norm": 4.447290897369385,
      "learning_rate": 0.0001934845360824742,
      "loss": 0.4655,
      "step": 7500
    },
    {
      "epoch": 0.4809542131589073,
      "eval_loss": 0.673875629901886,
      "eval_runtime": 190.3324,
      "eval_samples_per_second": 36.893,
      "eval_steps_per_second": 0.578,
      "eval_wer": 0.5063979058982979,
      "step": 7500
    },
    {
      "epoch": 0.5130178273695011,
      "grad_norm": 12.618865013122559,
      "learning_rate": 0.0001857680412371134,
      "loss": 0.4594,
      "step": 8000
    },
    {
      "epoch": 0.5130178273695011,
      "eval_loss": 0.6574720144271851,
      "eval_runtime": 190.8303,
      "eval_samples_per_second": 36.797,
      "eval_steps_per_second": 0.576,
      "eval_wer": 0.5067300920820101,
      "step": 8000
    },
    {
      "epoch": 0.5450814415800949,
      "grad_norm": 2.756011962890625,
      "learning_rate": 0.0001780360824742268,
      "loss": 0.4538,
      "step": 8500
    },
    {
      "epoch": 0.5450814415800949,
      "eval_loss": 0.6492609977722168,
      "eval_runtime": 189.5472,
      "eval_samples_per_second": 37.046,
      "eval_steps_per_second": 0.58,
      "eval_wer": 0.500325542460038,
      "step": 8500
    },
    {
      "epoch": 0.5771450557906888,
      "grad_norm": 7.6861491203308105,
      "learning_rate": 0.0001703041237113402,
      "loss": 0.4739,
      "step": 9000
    },
    {
      "epoch": 0.5771450557906888,
      "eval_loss": 0.7676782608032227,
      "eval_runtime": 189.9033,
      "eval_samples_per_second": 36.977,
      "eval_steps_per_second": 0.579,
      "eval_wer": 0.5238576117142136,
      "step": 9000
    },
    {
      "epoch": 0.6092086700012825,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.695,
      "step": 9500
    },
    {
      "epoch": 0.6092086700012825,
      "eval_loss": NaN,
      "eval_runtime": 186.9158,
      "eval_samples_per_second": 37.568,
      "eval_steps_per_second": 0.589,
      "eval_wer": 1.0,
      "step": 9500
    },
    {
      "epoch": 0.6412722842118763,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 10000
    },
    {
      "epoch": 0.6412722842118763,
      "eval_loss": NaN,
      "eval_runtime": 186.7613,
      "eval_samples_per_second": 37.599,
      "eval_steps_per_second": 0.589,
      "eval_wer": 1.0,
      "step": 10000
    },
    {
      "epoch": 0.6733358984224702,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 10500
    },
    {
      "epoch": 0.6733358984224702,
      "eval_loss": NaN,
      "eval_runtime": 185.9746,
      "eval_samples_per_second": 37.758,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 10500
    },
    {
      "epoch": 0.705399512633064,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 11000
    },
    {
      "epoch": 0.705399512633064,
      "eval_loss": NaN,
      "eval_runtime": 186.0588,
      "eval_samples_per_second": 37.741,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 11000
    },
    {
      "epoch": 0.7374631268436578,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 11500
    },
    {
      "epoch": 0.7374631268436578,
      "eval_loss": NaN,
      "eval_runtime": 187.0825,
      "eval_samples_per_second": 37.534,
      "eval_steps_per_second": 0.588,
      "eval_wer": 1.0,
      "step": 11500
    },
    {
      "epoch": 0.7695267410542517,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 12000
    },
    {
      "epoch": 0.7695267410542517,
      "eval_loss": NaN,
      "eval_runtime": 186.5183,
      "eval_samples_per_second": 37.648,
      "eval_steps_per_second": 0.59,
      "eval_wer": 1.0,
      "step": 12000
    },
    {
      "epoch": 0.8015903552648455,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 12500
    },
    {
      "epoch": 0.8015903552648455,
      "eval_loss": NaN,
      "eval_runtime": 186.3281,
      "eval_samples_per_second": 37.686,
      "eval_steps_per_second": 0.59,
      "eval_wer": 1.0,
      "step": 12500
    },
    {
      "epoch": 0.8336539694754392,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 13000
    },
    {
      "epoch": 0.8336539694754392,
      "eval_loss": NaN,
      "eval_runtime": 185.5922,
      "eval_samples_per_second": 37.836,
      "eval_steps_per_second": 0.593,
      "eval_wer": 1.0,
      "step": 13000
    },
    {
      "epoch": 0.8657175836860331,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 13500
    },
    {
      "epoch": 0.8657175836860331,
      "eval_loss": NaN,
      "eval_runtime": 185.7237,
      "eval_samples_per_second": 37.809,
      "eval_steps_per_second": 0.592,
      "eval_wer": 1.0,
      "step": 13500
    },
    {
      "epoch": 0.8977811978966269,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 14000
    },
    {
      "epoch": 0.8977811978966269,
      "eval_loss": NaN,
      "eval_runtime": 186.6259,
      "eval_samples_per_second": 37.626,
      "eval_steps_per_second": 0.589,
      "eval_wer": 1.0,
      "step": 14000
    },
    {
      "epoch": 0.9298448121072207,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 14500
    },
    {
      "epoch": 0.9298448121072207,
      "eval_loss": NaN,
      "eval_runtime": 186.1517,
      "eval_samples_per_second": 37.722,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 14500
    },
    {
      "epoch": 0.9619084263178146,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 15000
    },
    {
      "epoch": 0.9619084263178146,
      "eval_loss": NaN,
      "eval_runtime": 186.7927,
      "eval_samples_per_second": 37.592,
      "eval_steps_per_second": 0.589,
      "eval_wer": 1.0,
      "step": 15000
    },
    {
      "epoch": 0.9939720405284084,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 15500
    },
    {
      "epoch": 0.9939720405284084,
      "eval_loss": NaN,
      "eval_runtime": 186.1708,
      "eval_samples_per_second": 37.718,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 15500
    },
    {
      "epoch": 1.0260356547390022,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 16000
    },
    {
      "epoch": 1.0260356547390022,
      "eval_loss": NaN,
      "eval_runtime": 186.1341,
      "eval_samples_per_second": 37.725,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 16000
    },
    {
      "epoch": 1.058099268949596,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 16500
    },
    {
      "epoch": 1.058099268949596,
      "eval_loss": NaN,
      "eval_runtime": 186.4575,
      "eval_samples_per_second": 37.66,
      "eval_steps_per_second": 0.59,
      "eval_wer": 1.0,
      "step": 16500
    },
    {
      "epoch": 1.0901628831601897,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 17000
    },
    {
      "epoch": 1.0901628831601897,
      "eval_loss": NaN,
      "eval_runtime": 185.4444,
      "eval_samples_per_second": 37.866,
      "eval_steps_per_second": 0.593,
      "eval_wer": 1.0,
      "step": 17000
    },
    {
      "epoch": 1.1222264973707836,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 17500
    },
    {
      "epoch": 1.1222264973707836,
      "eval_loss": NaN,
      "eval_runtime": 186.15,
      "eval_samples_per_second": 37.722,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 17500
    },
    {
      "epoch": 1.1542901115813775,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 18000
    },
    {
      "epoch": 1.1542901115813775,
      "eval_loss": NaN,
      "eval_runtime": 186.0027,
      "eval_samples_per_second": 37.752,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 18000
    },
    {
      "epoch": 1.1863537257919712,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 18500
    },
    {
      "epoch": 1.1863537257919712,
      "eval_loss": NaN,
      "eval_runtime": 185.6149,
      "eval_samples_per_second": 37.831,
      "eval_steps_per_second": 0.593,
      "eval_wer": 1.0,
      "step": 18500
    },
    {
      "epoch": 1.218417340002565,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 19000
    },
    {
      "epoch": 1.218417340002565,
      "eval_loss": NaN,
      "eval_runtime": 186.7557,
      "eval_samples_per_second": 37.6,
      "eval_steps_per_second": 0.589,
      "eval_wer": 1.0,
      "step": 19000
    },
    {
      "epoch": 1.250480954213159,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 19500
    },
    {
      "epoch": 1.250480954213159,
      "eval_loss": NaN,
      "eval_runtime": 186.7166,
      "eval_samples_per_second": 37.608,
      "eval_steps_per_second": 0.589,
      "eval_wer": 1.0,
      "step": 19500
    },
    {
      "epoch": 1.2825445684237526,
      "grad_norm": NaN,
      "learning_rate": 0.00016787628865979378,
      "loss": 0.0,
      "step": 20000
    },
    {
      "epoch": 1.2825445684237526,
      "eval_loss": NaN,
      "eval_runtime": 186.0546,
      "eval_samples_per_second": 37.742,
      "eval_steps_per_second": 0.591,
      "eval_wer": 1.0,
      "step": 20000
    },
    {
      "epoch": 1.2825445684237526,
      "step": 20000,
      "total_flos": 2.2824984432894013e+19,
      "train_loss": 0.38660173568725587,
      "train_runtime": 15166.2226,
      "train_samples_per_second": 10.55,
      "train_steps_per_second": 1.319
    }
  ],
  "logging_steps": 500,
  "max_steps": 20000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.2824984432894013e+19,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}