{ "best_metric": 0.5565288662910461, "best_model_checkpoint": "/scratch/skscla001/results/mms-1b-all-bem-natbed-combined/checkpoint-3800", "epoch": 5.506883604505632, "eval_steps": 100, "global_step": 4400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1251564455569462, "grad_norm": 4.582801342010498, "learning_rate": 0.00029099999999999997, "loss": 8.2239, "step": 100 }, { "epoch": 0.1251564455569462, "eval_loss": 1.3301187753677368, "eval_runtime": 95.4493, "eval_samples_per_second": 14.227, "eval_steps_per_second": 1.781, "eval_wer": 0.8959954139710098, "step": 100 }, { "epoch": 0.2503128911138924, "grad_norm": 2.8414461612701416, "learning_rate": 0.000298780896522832, "loss": 0.8992, "step": 200 }, { "epoch": 0.2503128911138924, "eval_loss": 0.7774138450622559, "eval_runtime": 95.7815, "eval_samples_per_second": 14.178, "eval_steps_per_second": 1.775, "eval_wer": 0.6657112439603636, "step": 200 }, { "epoch": 0.37546933667083854, "grad_norm": 1.6420665979385376, "learning_rate": 0.00029752408881441136, "loss": 0.8263, "step": 300 }, { "epoch": 0.37546933667083854, "eval_loss": 0.7549673914909363, "eval_runtime": 95.0731, "eval_samples_per_second": 14.284, "eval_steps_per_second": 1.788, "eval_wer": 0.5891409384980755, "step": 300 }, { "epoch": 0.5006257822277848, "grad_norm": 1.838030219078064, "learning_rate": 0.000296279849183075, "loss": 0.7846, "step": 400 }, { "epoch": 0.5006257822277848, "eval_loss": 0.6995427012443542, "eval_runtime": 94.8499, "eval_samples_per_second": 14.317, "eval_steps_per_second": 1.792, "eval_wer": 0.5665383670461059, "step": 400 }, { "epoch": 0.6257822277847309, "grad_norm": 3.311655282974243, "learning_rate": 0.00029502304147465433, "loss": 0.9046, "step": 500 }, { "epoch": 0.6257822277847309, "eval_loss": 0.6796234846115112, "eval_runtime": 96.1056, "eval_samples_per_second": 14.13, "eval_steps_per_second": 1.769, "eval_wer": 0.5545819343215134, "step": 500 }, { "epoch": 0.7509386733416771, "grad_norm": 1.0524609088897705, "learning_rate": 0.00029376623376623374, "loss": 0.7688, "step": 600 }, { "epoch": 0.7509386733416771, "eval_loss": 0.6881176829338074, "eval_runtime": 95.4601, "eval_samples_per_second": 14.226, "eval_steps_per_second": 1.781, "eval_wer": 0.5436082220948325, "step": 600 }, { "epoch": 0.8760951188986232, "grad_norm": 1.534467339515686, "learning_rate": 0.00029250942605781315, "loss": 0.7139, "step": 700 }, { "epoch": 0.8760951188986232, "eval_loss": 0.6998778581619263, "eval_runtime": 96.7901, "eval_samples_per_second": 14.03, "eval_steps_per_second": 1.756, "eval_wer": 0.5557284415690771, "step": 700 }, { "epoch": 1.0012515644555695, "grad_norm": 4.051555633544922, "learning_rate": 0.0002912526183493925, "loss": 0.7922, "step": 800 }, { "epoch": 1.0012515644555695, "eval_loss": 0.6810752153396606, "eval_runtime": 96.2586, "eval_samples_per_second": 14.108, "eval_steps_per_second": 1.766, "eval_wer": 0.542134141347965, "step": 800 }, { "epoch": 1.1264080100125156, "grad_norm": 3.281071186065674, "learning_rate": 0.0002899958106409719, "loss": 0.7929, "step": 900 }, { "epoch": 1.1264080100125156, "eval_loss": 0.6760120391845703, "eval_runtime": 97.2734, "eval_samples_per_second": 13.961, "eval_steps_per_second": 1.748, "eval_wer": 0.5433625419703546, "step": 900 }, { "epoch": 1.2515644555694618, "grad_norm": 6.815141201019287, "learning_rate": 0.0002887515710096355, "loss": 0.7508, "step": 1000 }, { "epoch": 1.2515644555694618, "eval_loss": 0.6555261015892029, "eval_runtime": 96.3987, "eval_samples_per_second": 14.087, "eval_steps_per_second": 1.764, "eval_wer": 0.5660470067971501, "step": 1000 }, { "epoch": 1.3767209011264079, "grad_norm": 2.6557281017303467, "learning_rate": 0.0002874947633012149, "loss": 0.7534, "step": 1100 }, { "epoch": 1.3767209011264079, "eval_loss": 0.6411116719245911, "eval_runtime": 96.0443, "eval_samples_per_second": 14.139, "eval_steps_per_second": 1.77, "eval_wer": 0.5372205388584064, "step": 1100 }, { "epoch": 1.5018773466833542, "grad_norm": 19.327106475830078, "learning_rate": 0.00028623795559279424, "loss": 0.7316, "step": 1200 }, { "epoch": 1.5018773466833542, "eval_loss": 0.6420451998710632, "eval_runtime": 97.2234, "eval_samples_per_second": 13.968, "eval_steps_per_second": 1.749, "eval_wer": 0.5319793628695438, "step": 1200 }, { "epoch": 1.6270337922403004, "grad_norm": 4.125315189361572, "learning_rate": 0.00028498114788437365, "loss": 0.7147, "step": 1300 }, { "epoch": 1.6270337922403004, "eval_loss": 0.6725718975067139, "eval_runtime": 97.3227, "eval_samples_per_second": 13.954, "eval_steps_per_second": 1.747, "eval_wer": 0.5293587748751126, "step": 1300 }, { "epoch": 1.7521902377972465, "grad_norm": 10.938685417175293, "learning_rate": 0.00028372434017595306, "loss": 0.6734, "step": 1400 }, { "epoch": 1.7521902377972465, "eval_loss": 0.6308336853981018, "eval_runtime": 96.6381, "eval_samples_per_second": 14.052, "eval_steps_per_second": 1.759, "eval_wer": 0.5252641061338138, "step": 1400 }, { "epoch": 1.8773466833541927, "grad_norm": 7.951139450073242, "learning_rate": 0.00028246753246753247, "loss": 0.7084, "step": 1500 }, { "epoch": 1.8773466833541927, "eval_loss": 0.6205306649208069, "eval_runtime": 97.9744, "eval_samples_per_second": 13.861, "eval_steps_per_second": 1.735, "eval_wer": 0.5438539022193104, "step": 1500 }, { "epoch": 2.002503128911139, "grad_norm": 2.1400046348571777, "learning_rate": 0.00028121072475911183, "loss": 0.6714, "step": 1600 }, { "epoch": 2.002503128911139, "eval_loss": 0.6118720769882202, "eval_runtime": 97.507, "eval_samples_per_second": 13.927, "eval_steps_per_second": 1.743, "eval_wer": 0.5231348783883384, "step": 1600 }, { "epoch": 2.127659574468085, "grad_norm": 8.502281188964844, "learning_rate": 0.00027995391705069124, "loss": 0.6888, "step": 1700 }, { "epoch": 2.127659574468085, "eval_loss": 0.6349774599075317, "eval_runtime": 96.997, "eval_samples_per_second": 14.0, "eval_steps_per_second": 1.753, "eval_wer": 0.5166653017770862, "step": 1700 }, { "epoch": 2.252816020025031, "grad_norm": 1.059962272644043, "learning_rate": 0.0002786971093422706, "loss": 0.6871, "step": 1800 }, { "epoch": 2.252816020025031, "eval_loss": 0.6183043718338013, "eval_runtime": 97.927, "eval_samples_per_second": 13.867, "eval_steps_per_second": 1.736, "eval_wer": 0.5118335926623536, "step": 1800 }, { "epoch": 2.3779724655819776, "grad_norm": 1.3032373189926147, "learning_rate": 0.00027744030163385, "loss": 0.6882, "step": 1900 }, { "epoch": 2.3779724655819776, "eval_loss": 0.5973983407020569, "eval_runtime": 97.1298, "eval_samples_per_second": 13.981, "eval_steps_per_second": 1.75, "eval_wer": 0.5332896568667594, "step": 1900 }, { "epoch": 2.5031289111389237, "grad_norm": 1.9106544256210327, "learning_rate": 0.00027618349392542936, "loss": 0.6769, "step": 2000 }, { "epoch": 2.5031289111389237, "eval_loss": 0.5994674563407898, "eval_runtime": 96.7136, "eval_samples_per_second": 14.041, "eval_steps_per_second": 1.758, "eval_wer": 0.5300958152485464, "step": 2000 }, { "epoch": 2.6282853566958697, "grad_norm": 1.2871947288513184, "learning_rate": 0.00027492668621700877, "loss": 0.6801, "step": 2100 }, { "epoch": 2.6282853566958697, "eval_loss": 0.5880154371261597, "eval_runtime": 97.9989, "eval_samples_per_second": 13.857, "eval_steps_per_second": 1.735, "eval_wer": 0.5377937924821882, "step": 2100 }, { "epoch": 2.7534418022528158, "grad_norm": 0.619396448135376, "learning_rate": 0.0002736698785085881, "loss": 0.6695, "step": 2200 }, { "epoch": 2.7534418022528158, "eval_loss": 0.5972831845283508, "eval_runtime": 97.3411, "eval_samples_per_second": 13.951, "eval_steps_per_second": 1.746, "eval_wer": 0.5051183359266236, "step": 2200 }, { "epoch": 2.8785982478097623, "grad_norm": 1.0727863311767578, "learning_rate": 0.0002724130708001676, "loss": 0.6557, "step": 2300 }, { "epoch": 2.8785982478097623, "eval_loss": 0.6026735901832581, "eval_runtime": 97.4796, "eval_samples_per_second": 13.931, "eval_steps_per_second": 1.744, "eval_wer": 0.5056096961755794, "step": 2300 }, { "epoch": 3.0037546933667083, "grad_norm": 0.626075029373169, "learning_rate": 0.00027115626309174695, "loss": 0.6525, "step": 2400 }, { "epoch": 3.0037546933667083, "eval_loss": 0.59461510181427, "eval_runtime": 97.9487, "eval_samples_per_second": 13.864, "eval_steps_per_second": 1.736, "eval_wer": 0.4996314798132831, "step": 2400 }, { "epoch": 3.1289111389236544, "grad_norm": 4.777908802032471, "learning_rate": 0.00026989945538332636, "loss": 0.6829, "step": 2500 }, { "epoch": 3.1289111389236544, "eval_loss": 0.5881961584091187, "eval_runtime": 97.8698, "eval_samples_per_second": 13.876, "eval_steps_per_second": 1.737, "eval_wer": 0.49979526656293505, "step": 2500 }, { "epoch": 3.254067584480601, "grad_norm": 5.189509391784668, "learning_rate": 0.0002686426476749057, "loss": 0.6627, "step": 2600 }, { "epoch": 3.254067584480601, "eval_loss": 0.6009677052497864, "eval_runtime": 95.9556, "eval_samples_per_second": 14.152, "eval_steps_per_second": 1.772, "eval_wer": 0.49848497256571944, "step": 2600 }, { "epoch": 3.379224030037547, "grad_norm": 6.410580158233643, "learning_rate": 0.0002673858399664851, "loss": 0.6146, "step": 2700 }, { "epoch": 3.379224030037547, "eval_loss": 0.5770368576049805, "eval_runtime": 95.4343, "eval_samples_per_second": 14.23, "eval_steps_per_second": 1.781, "eval_wer": 0.5009417738104988, "step": 2700 }, { "epoch": 3.504380475594493, "grad_norm": 1.4566117525100708, "learning_rate": 0.0002661290322580645, "loss": 0.6205, "step": 2800 }, { "epoch": 3.504380475594493, "eval_loss": 0.5738538503646851, "eval_runtime": 95.9905, "eval_samples_per_second": 14.147, "eval_steps_per_second": 1.771, "eval_wer": 0.5020882810580624, "step": 2800 }, { "epoch": 3.6295369211514394, "grad_norm": 5.905794143676758, "learning_rate": 0.0002648722245496439, "loss": 0.7025, "step": 2900 }, { "epoch": 3.6295369211514394, "eval_loss": 0.5806447863578796, "eval_runtime": 96.9069, "eval_samples_per_second": 14.013, "eval_steps_per_second": 1.754, "eval_wer": 0.5223978380149046, "step": 2900 }, { "epoch": 3.7546933667083855, "grad_norm": 7.341220855712891, "learning_rate": 0.00026361541684122324, "loss": 0.6379, "step": 3000 }, { "epoch": 3.7546933667083855, "eval_loss": 0.6210225820541382, "eval_runtime": 96.7041, "eval_samples_per_second": 14.043, "eval_steps_per_second": 1.758, "eval_wer": 0.5064286299238392, "step": 3000 }, { "epoch": 3.8798498122653315, "grad_norm": 2.2515931129455566, "learning_rate": 0.00026235860913280265, "loss": 0.6104, "step": 3100 }, { "epoch": 3.8798498122653315, "eval_loss": 0.5702349543571472, "eval_runtime": 95.9116, "eval_samples_per_second": 14.159, "eval_steps_per_second": 1.772, "eval_wer": 0.5033985750552781, "step": 3100 }, { "epoch": 4.005006257822278, "grad_norm": 0.5629591941833496, "learning_rate": 0.00026110180142438206, "loss": 0.6607, "step": 3200 }, { "epoch": 4.005006257822278, "eval_loss": 0.5755676627159119, "eval_runtime": 96.4548, "eval_samples_per_second": 14.079, "eval_steps_per_second": 1.762, "eval_wer": 0.4891491278355581, "step": 3200 }, { "epoch": 4.130162703379224, "grad_norm": 6.9959259033203125, "learning_rate": 0.0002598449937159615, "loss": 0.6776, "step": 3300 }, { "epoch": 4.130162703379224, "eval_loss": 0.5678644180297852, "eval_runtime": 95.8409, "eval_samples_per_second": 14.169, "eval_steps_per_second": 1.774, "eval_wer": 0.4885758742117763, "step": 3300 }, { "epoch": 4.25531914893617, "grad_norm": 4.9695658683776855, "learning_rate": 0.00025860075408462504, "loss": 0.6343, "step": 3400 }, { "epoch": 4.25531914893617, "eval_loss": 0.5597887635231018, "eval_runtime": 95.5822, "eval_samples_per_second": 14.208, "eval_steps_per_second": 1.779, "eval_wer": 0.4898861682089919, "step": 3400 }, { "epoch": 4.380475594493117, "grad_norm": 1.5676358938217163, "learning_rate": 0.0002573439463762044, "loss": 0.5818, "step": 3500 }, { "epoch": 4.380475594493117, "eval_loss": 0.5806618928909302, "eval_runtime": 95.8037, "eval_samples_per_second": 14.175, "eval_steps_per_second": 1.774, "eval_wer": 0.49643763819507003, "step": 3500 }, { "epoch": 4.505632040050062, "grad_norm": 1.323522925376892, "learning_rate": 0.0002560871386677838, "loss": 0.6085, "step": 3600 }, { "epoch": 4.505632040050062, "eval_loss": 0.5931637287139893, "eval_runtime": 95.2763, "eval_samples_per_second": 14.253, "eval_steps_per_second": 1.784, "eval_wer": 0.4915240357055114, "step": 3600 }, { "epoch": 4.630788485607009, "grad_norm": 1.9220880270004272, "learning_rate": 0.0002548303309593632, "loss": 0.6648, "step": 3700 }, { "epoch": 4.630788485607009, "eval_loss": 0.5579622983932495, "eval_runtime": 96.2129, "eval_samples_per_second": 14.115, "eval_steps_per_second": 1.767, "eval_wer": 0.486037179592171, "step": 3700 }, { "epoch": 4.755944931163955, "grad_norm": 0.6678237915039062, "learning_rate": 0.00025357352325094257, "loss": 0.6359, "step": 3800 }, { "epoch": 4.755944931163955, "eval_loss": 0.5565288662910461, "eval_runtime": 95.7617, "eval_samples_per_second": 14.181, "eval_steps_per_second": 1.775, "eval_wer": 0.4959462779461142, "step": 3800 }, { "epoch": 4.881101376720901, "grad_norm": 0.8339139819145203, "learning_rate": 0.000252316715542522, "loss": 0.6139, "step": 3900 }, { "epoch": 4.881101376720901, "eval_loss": 0.5604941248893738, "eval_runtime": 95.8364, "eval_samples_per_second": 14.17, "eval_steps_per_second": 1.774, "eval_wer": 0.4885758742117763, "step": 3900 }, { "epoch": 5.006257822277847, "grad_norm": 0.6905695199966431, "learning_rate": 0.00025105990783410133, "loss": 0.5995, "step": 4000 }, { "epoch": 5.006257822277847, "eval_loss": 0.5719751715660095, "eval_runtime": 95.3867, "eval_samples_per_second": 14.237, "eval_steps_per_second": 1.782, "eval_wer": 0.48030464335435263, "step": 4000 }, { "epoch": 5.131414267834794, "grad_norm": 1.2827470302581787, "learning_rate": 0.00024980310012568074, "loss": 0.6349, "step": 4100 }, { "epoch": 5.131414267834794, "eval_loss": 0.5505784749984741, "eval_runtime": 96.3073, "eval_samples_per_second": 14.101, "eval_steps_per_second": 1.765, "eval_wer": 0.5011874539349767, "step": 4100 }, { "epoch": 5.256570713391739, "grad_norm": 1.4041779041290283, "learning_rate": 0.00024854629241726015, "loss": 0.6134, "step": 4200 }, { "epoch": 5.256570713391739, "eval_loss": 0.5602907538414001, "eval_runtime": 96.7539, "eval_samples_per_second": 14.036, "eval_steps_per_second": 1.757, "eval_wer": 0.4784210957333552, "step": 4200 }, { "epoch": 5.381727158948686, "grad_norm": 16.846094131469727, "learning_rate": 0.0002472894847088395, "loss": 0.5989, "step": 4300 }, { "epoch": 5.381727158948686, "eval_loss": 0.5714141726493835, "eval_runtime": 96.2578, "eval_samples_per_second": 14.108, "eval_steps_per_second": 1.766, "eval_wer": 0.48439931209565146, "step": 4300 }, { "epoch": 5.506883604505632, "grad_norm": 1.1656825542449951, "learning_rate": 0.0002460326770004189, "loss": 0.6083, "step": 4400 }, { "epoch": 5.506883604505632, "eval_loss": 0.5697967410087585, "eval_runtime": 96.7222, "eval_samples_per_second": 14.04, "eval_steps_per_second": 1.758, "eval_wer": 0.4758005077389239, "step": 4400 }, { "epoch": 5.506883604505632, "step": 4400, "total_flos": 2.278783878958534e+19, "train_loss": 0.858475610559637, "train_runtime": 9525.4326, "train_samples_per_second": 20.116, "train_steps_per_second": 2.516 } ], "logging_steps": 100, "max_steps": 23970, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.278783878958534e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }