diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2327 +1,4627 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 3.9294274824158117, + "epoch": 7.858854964831624, "eval_steps": 1000, - "global_step": 100000, + "global_step": 200000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01964713741207906, - "grad_norm": 2.5734992027282715, + "grad_norm": 3.3487088680267334, "learning_rate": 0.0002982, - "loss": 4.7324, + "loss": 4.7396, "step": 500 }, { "epoch": 0.03929427482415812, - "grad_norm": 3.079489231109619, - "learning_rate": 0.0002985015075376884, - "loss": 1.043, + "grad_norm": 4.3209547996521, + "learning_rate": 0.00029925263157894736, + "loss": 1.092, "step": 1000 }, { "epoch": 0.03929427482415812, - "eval_loss": 1.1073596477508545, - "eval_runtime": 144.3869, - "eval_samples_per_second": 39.173, - "eval_steps_per_second": 4.897, - "eval_wer": 0.7791240711912825, + "eval_loss": 1.1751947402954102, + "eval_runtime": 144.9265, + "eval_samples_per_second": 39.027, + "eval_steps_per_second": 4.878, + "eval_wer": 0.8087496589687214, "step": 1000 }, { "epoch": 0.05894141223623718, - "grad_norm": 2.484443187713623, - "learning_rate": 0.0002969939698492462, - "loss": 0.8123, + "grad_norm": 2.453352451324463, + "learning_rate": 0.0002985007518796992, + "loss": 0.8198, "step": 1500 }, { "epoch": 0.07858854964831624, - "grad_norm": 2.6904137134552, - "learning_rate": 0.00029548643216080397, - "loss": 0.716, + "grad_norm": 4.910336494445801, + "learning_rate": 0.0002977488721804511, + "loss": 0.7307, "step": 2000 }, { "epoch": 0.07858854964831624, - "eval_loss": 0.9458802342414856, - "eval_runtime": 143.0757, - "eval_samples_per_second": 39.532, - "eval_steps_per_second": 4.941, - "eval_wer": 0.7145287348943204, + "eval_loss": 0.9363270998001099, + "eval_runtime": 143.2813, + "eval_samples_per_second": 39.475, + "eval_steps_per_second": 4.934, + "eval_wer": 0.6985123011988252, "step": 2000 }, { "epoch": 0.0982356870603953, - "grad_norm": 2.3412747383117676, - "learning_rate": 0.00029397889447236176, - "loss": 0.6822, + "grad_norm": 2.127075672149658, + "learning_rate": 0.00029699699248120296, + "loss": 0.6921, "step": 2500 }, { "epoch": 0.11788282447247436, - "grad_norm": 2.9600541591644287, - "learning_rate": 0.00029247135678391956, - "loss": 0.6667, + "grad_norm": 2.57724928855896, + "learning_rate": 0.0002962451127819549, + "loss": 0.6633, "step": 3000 }, { "epoch": 0.11788282447247436, - "eval_loss": 0.8828235268592834, - "eval_runtime": 143.1736, - "eval_samples_per_second": 39.504, - "eval_steps_per_second": 4.938, - "eval_wer": 0.6616006804577041, + "eval_loss": 0.8888041377067566, + "eval_runtime": 144.2307, + "eval_samples_per_second": 39.215, + "eval_steps_per_second": 4.902, + "eval_wer": 0.6624352040570686, "step": 3000 }, { "epoch": 0.1375299618845534, - "grad_norm": 2.9464612007141113, - "learning_rate": 0.00029096381909547736, - "loss": 0.6284, + "grad_norm": 3.0028977394104004, + "learning_rate": 0.00029549323308270675, + "loss": 0.6395, "step": 3500 }, { "epoch": 0.15717709929663248, - "grad_norm": 1.9161436557769775, - "learning_rate": 0.00028945628140703515, - "loss": 0.6238, + "grad_norm": 1.864983320236206, + "learning_rate": 0.0002947413533834586, + "loss": 0.6273, "step": 4000 }, { "epoch": 0.15717709929663248, - "eval_loss": 0.8481988906860352, - "eval_runtime": 144.1673, - "eval_samples_per_second": 39.232, - "eval_steps_per_second": 4.904, - "eval_wer": 0.6633178732487041, + "eval_loss": 0.8302789330482483, + "eval_runtime": 143.3729, + "eval_samples_per_second": 39.45, + "eval_steps_per_second": 4.931, + "eval_wer": 0.640432668389209, "step": 4000 }, { "epoch": 0.17682423670871153, - "grad_norm": 2.5864381790161133, - "learning_rate": 0.0002879517587939698, - "loss": 0.6139, + "grad_norm": 1.865403175354004, + "learning_rate": 0.000293990977443609, + "loss": 0.6196, "step": 4500 }, { "epoch": 0.1964713741207906, - "grad_norm": 5.355952739715576, - "learning_rate": 0.0002864442211055276, - "loss": 0.591, + "grad_norm": 2.015836000442505, + "learning_rate": 0.0002932390977443609, + "loss": 0.6031, "step": 5000 }, { "epoch": 0.1964713741207906, - "eval_loss": 0.8120559453964233, - "eval_runtime": 144.0897, - "eval_samples_per_second": 39.253, - "eval_steps_per_second": 4.907, - "eval_wer": 0.6214954020959381, + "eval_loss": 0.7927883267402649, + "eval_runtime": 142.89, + "eval_samples_per_second": 39.583, + "eval_steps_per_second": 4.948, + "eval_wer": 0.6134229911251625, "step": 5000 }, { "epoch": 0.21611851153286965, - "grad_norm": 2.9285130500793457, - "learning_rate": 0.0002849366834170854, - "loss": 0.5983, + "grad_norm": 2.023449420928955, + "learning_rate": 0.00029248721804511275, + "loss": 0.6089, "step": 5500 }, { "epoch": 0.23576564894494872, - "grad_norm": 2.175675630569458, - "learning_rate": 0.00028342914572864316, - "loss": 0.588, + "grad_norm": 2.1012802124023438, + "learning_rate": 0.00029173533834586467, + "loss": 0.6007, "step": 6000 }, { "epoch": 0.23576564894494872, - "eval_loss": 0.7926327586174011, - "eval_runtime": 144.2057, - "eval_samples_per_second": 39.222, - "eval_steps_per_second": 4.903, - "eval_wer": 0.5952079087159571, + "eval_loss": 0.771953821182251, + "eval_runtime": 144.3238, + "eval_samples_per_second": 39.19, + "eval_steps_per_second": 4.899, + "eval_wer": 0.5832357047712282, "step": 6000 }, { "epoch": 0.2554127863570278, - "grad_norm": 1.7357851266860962, - "learning_rate": 0.0002819246231155779, - "loss": 0.5695, + "grad_norm": 1.6933408975601196, + "learning_rate": 0.0002909834586466165, + "loss": 0.5693, "step": 6500 }, { "epoch": 0.2750599237691068, - "grad_norm": 3.1059882640838623, - "learning_rate": 0.00028041708542713567, - "loss": 0.5732, + "grad_norm": 1.8497469425201416, + "learning_rate": 0.0002902315789473684, + "loss": 0.5739, "step": 7000 }, { "epoch": 0.2750599237691068, - "eval_loss": 0.7581684589385986, - "eval_runtime": 144.8221, - "eval_samples_per_second": 39.055, - "eval_steps_per_second": 4.882, - "eval_wer": 0.570685753719247, + "eval_loss": 0.7533236145973206, + "eval_runtime": 144.3945, + "eval_samples_per_second": 39.17, + "eval_steps_per_second": 4.896, + "eval_wer": 0.5685673476593218, "step": 7000 }, { "epoch": 0.2947070611811859, - "grad_norm": 2.637263059616089, - "learning_rate": 0.0002789095477386934, - "loss": 0.571, + "grad_norm": 2.9493396282196045, + "learning_rate": 0.0002894796992481203, + "loss": 0.57, "step": 7500 }, { "epoch": 0.31435419859326497, - "grad_norm": 10.5114107131958, - "learning_rate": 0.00027740201005025127, - "loss": 0.5557, + "grad_norm": 3.0881130695343018, + "learning_rate": 0.00028872781954887214, + "loss": 0.5655, "step": 8000 }, { "epoch": 0.31435419859326497, - "eval_loss": 0.759090781211853, - "eval_runtime": 144.7625, - "eval_samples_per_second": 39.071, - "eval_steps_per_second": 4.884, - "eval_wer": 0.5600295292965929, + "eval_loss": 0.7523130178451538, + "eval_runtime": 143.9033, + "eval_samples_per_second": 39.304, + "eval_steps_per_second": 4.913, + "eval_wer": 0.5594517821893406, "step": 8000 }, { "epoch": 0.33400133600534404, - "grad_norm": 3.085407257080078, - "learning_rate": 0.000275894472361809, - "loss": 0.5434, + "grad_norm": 2.8726112842559814, + "learning_rate": 0.000287975939849624, + "loss": 0.5541, "step": 8500 }, { "epoch": 0.35364847341742306, - "grad_norm": 2.6379830837249756, - "learning_rate": 0.0002743899497487437, - "loss": 0.5587, + "grad_norm": 2.6348774433135986, + "learning_rate": 0.0002872255639097744, + "loss": 0.5584, "step": 9000 }, { "epoch": 0.35364847341742306, - "eval_loss": 0.724540650844574, - "eval_runtime": 145.5811, - "eval_samples_per_second": 38.851, - "eval_steps_per_second": 4.856, - "eval_wer": 0.574120139301247, + "eval_loss": 0.7173847556114197, + "eval_runtime": 144.396, + "eval_samples_per_second": 39.17, + "eval_steps_per_second": 4.896, + "eval_wer": 0.5668020092760507, "step": 9000 }, { "epoch": 0.37329561082950213, - "grad_norm": 3.1712732315063477, - "learning_rate": 0.00027288241206030147, - "loss": 0.5465, + "grad_norm": 3.332054853439331, + "learning_rate": 0.00028647368421052627, + "loss": 0.5485, "step": 9500 }, { "epoch": 0.3929427482415812, - "grad_norm": 3.744135618209839, - "learning_rate": 0.00027137487437185927, - "loss": 0.531, + "grad_norm": 2.823880434036255, + "learning_rate": 0.0002857218045112782, + "loss": 0.5454, "step": 10000 }, { "epoch": 0.3929427482415812, - "eval_loss": 0.7107329964637756, - "eval_runtime": 145.1162, - "eval_samples_per_second": 38.976, - "eval_steps_per_second": 4.872, - "eval_wer": 0.5469499767296304, + "eval_loss": 0.7537470459938049, + "eval_runtime": 144.0918, + "eval_samples_per_second": 39.253, + "eval_steps_per_second": 4.907, + "eval_wer": 0.5798815618430133, "step": 10000 }, { "epoch": 0.4125898856536603, - "grad_norm": 2.3687188625335693, - "learning_rate": 0.00026986733668341706, - "loss": 0.5346, + "grad_norm": 2.8141980171203613, + "learning_rate": 0.00028496992481203006, + "loss": 0.5527, "step": 10500 }, { "epoch": 0.4322370230657393, - "grad_norm": 3.6938724517822266, - "learning_rate": 0.00026835979899497486, - "loss": 0.5275, + "grad_norm": 4.174062728881836, + "learning_rate": 0.0002842180451127819, + "loss": 0.5322, "step": 11000 }, { "epoch": 0.4322370230657393, - "eval_loss": 0.7101730704307556, - "eval_runtime": 144.5782, - "eval_samples_per_second": 39.121, - "eval_steps_per_second": 4.89, - "eval_wer": 0.5447513280159202, + "eval_loss": 0.7155322432518005, + "eval_runtime": 144.2272, + "eval_samples_per_second": 39.216, + "eval_steps_per_second": 4.902, + "eval_wer": 0.5613615573494246, "step": 11000 }, { "epoch": 0.4518841604778184, - "grad_norm": 2.88944935798645, - "learning_rate": 0.00026685226130653266, - "loss": 0.5258, + "grad_norm": 2.2155849933624268, + "learning_rate": 0.0002834676691729323, + "loss": 0.5373, "step": 11500 }, { "epoch": 0.47153129788989745, - "grad_norm": 3.0208890438079834, - "learning_rate": 0.00026534472361809046, - "loss": 0.5101, + "grad_norm": 2.1979432106018066, + "learning_rate": 0.0002827157894736842, + "loss": 0.5206, "step": 12000 }, { "epoch": 0.47153129788989745, - "eval_loss": 0.690454363822937, - "eval_runtime": 144.959, - "eval_samples_per_second": 39.018, - "eval_steps_per_second": 4.877, - "eval_wer": 0.5404342732422847, + "eval_loss": 0.7130174040794373, + "eval_runtime": 144.2801, + "eval_samples_per_second": 39.202, + "eval_steps_per_second": 4.9, + "eval_wer": 0.5746336922854712, "step": 12000 }, { "epoch": 0.4911784353019765, - "grad_norm": 2.797706127166748, - "learning_rate": 0.00026384020100502506, - "loss": 0.5052, + "grad_norm": 3.059553861618042, + "learning_rate": 0.00028196390977443605, + "loss": 0.5303, "step": 12500 }, { "epoch": 0.5108255727140556, - "grad_norm": 2.1760404109954834, - "learning_rate": 0.0002623326633165829, - "loss": 0.5215, + "grad_norm": 2.1422946453094482, + "learning_rate": 0.0002812120300751879, + "loss": 0.5304, "step": 13000 }, { "epoch": 0.5108255727140556, - "eval_loss": 0.682437002658844, - "eval_runtime": 144.6862, - "eval_samples_per_second": 39.091, - "eval_steps_per_second": 4.886, - "eval_wer": 0.5254770425767521, + "eval_loss": 0.6817054748535156, + "eval_runtime": 144.6048, + "eval_samples_per_second": 39.114, + "eval_steps_per_second": 4.889, + "eval_wer": 0.5390220025356679, "step": 13000 }, { "epoch": 0.5304727101261346, - "grad_norm": 2.243823289871216, - "learning_rate": 0.00026082512562814066, - "loss": 0.5048, + "grad_norm": 3.0140066146850586, + "learning_rate": 0.00028046015037593984, + "loss": 0.5156, "step": 13500 }, { "epoch": 0.5501198475382136, - "grad_norm": 2.9280102252960205, - "learning_rate": 0.00025931758793969846, - "loss": 0.5293, + "grad_norm": 2.334425449371338, + "learning_rate": 0.0002797082706766917, + "loss": 0.55, "step": 14000 }, { "epoch": 0.5501198475382136, - "eval_loss": 0.6682488322257996, - "eval_runtime": 144.9037, - "eval_samples_per_second": 39.033, - "eval_steps_per_second": 4.879, - "eval_wer": 0.5163133315144999, + "eval_loss": 0.6902604699134827, + "eval_runtime": 144.3591, + "eval_samples_per_second": 39.18, + "eval_steps_per_second": 4.898, + "eval_wer": 0.534014860939481, "step": 14000 }, { "epoch": 0.5697669849502928, - "grad_norm": 2.742039918899536, - "learning_rate": 0.00025781005025125625, - "loss": 0.5211, + "grad_norm": 1.8595026731491089, + "learning_rate": 0.0002789563909774436, + "loss": 0.5296, "step": 14500 }, { "epoch": 0.5894141223623718, - "grad_norm": 6.8413472175598145, - "learning_rate": 0.00025630251256281405, - "loss": 0.4981, + "grad_norm": 1.83585524559021, + "learning_rate": 0.00027820451127819545, + "loss": 0.5115, "step": 15000 }, { "epoch": 0.5894141223623718, - "eval_loss": 0.6615394949913025, - "eval_runtime": 144.3823, - "eval_samples_per_second": 39.174, - "eval_steps_per_second": 4.897, - "eval_wer": 0.5139702460239765, + "eval_loss": 0.6973890662193298, + "eval_runtime": 143.7301, + "eval_samples_per_second": 39.352, + "eval_steps_per_second": 4.919, + "eval_wer": 0.5437081735167145, "step": 15000 }, { "epoch": 0.6090612597744509, - "grad_norm": 19.166942596435547, - "learning_rate": 0.00025479497487437185, - "loss": 0.4884, + "grad_norm": 3.633268117904663, + "learning_rate": 0.00027745413533834584, + "loss": 0.5083, "step": 15500 }, { "epoch": 0.6287083971865299, - "grad_norm": 2.290081024169922, - "learning_rate": 0.00025328743718592965, - "loss": 0.4891, + "grad_norm": 1.6785953044891357, + "learning_rate": 0.0002767022556390977, + "loss": 0.5097, "step": 16000 }, { "epoch": 0.6287083971865299, - "eval_loss": 0.6634296178817749, - "eval_runtime": 145.228, - "eval_samples_per_second": 38.946, - "eval_steps_per_second": 4.868, - "eval_wer": 0.5249474410617708, + "eval_loss": 0.6785907745361328, + "eval_runtime": 144.7589, + "eval_samples_per_second": 39.072, + "eval_steps_per_second": 4.884, + "eval_wer": 0.5198119112195279, "step": 16000 }, { "epoch": 0.648355534598609, - "grad_norm": 2.9746391773223877, - "learning_rate": 0.0002517829145728643, - "loss": 0.4809, + "grad_norm": 3.022169589996338, + "learning_rate": 0.00027595037593984963, + "loss": 0.5025, "step": 16500 }, { "epoch": 0.6680026720106881, - "grad_norm": 3.1741456985473633, - "learning_rate": 0.0002502753768844221, - "loss": 0.4813, + "grad_norm": 3.9664957523345947, + "learning_rate": 0.00027519999999999997, + "loss": 0.504, "step": 17000 }, { "epoch": 0.6680026720106881, - "eval_loss": 0.6469489336013794, - "eval_runtime": 146.1886, - "eval_samples_per_second": 38.69, - "eval_steps_per_second": 4.836, - "eval_wer": 0.5105198119112195, + "eval_loss": 0.6679931282997131, + "eval_runtime": 144.4325, + "eval_samples_per_second": 39.16, + "eval_steps_per_second": 4.895, + "eval_wer": 0.5067163101218084, "step": 17000 }, { "epoch": 0.6876498094227671, - "grad_norm": 2.78464937210083, - "learning_rate": 0.0002487678391959799, - "loss": 0.4788, + "grad_norm": 1.8392106294631958, + "learning_rate": 0.00027444812030075184, + "loss": 0.4996, "step": 17500 }, { "epoch": 0.7072969468348461, - "grad_norm": 2.7327282428741455, - "learning_rate": 0.00024726331658291457, - "loss": 0.4799, + "grad_norm": 22.285568237304688, + "learning_rate": 0.0002736977443609022, + "loss": 0.4951, "step": 18000 }, { "epoch": 0.7072969468348461, - "eval_loss": 0.6420897245407104, - "eval_runtime": 145.2793, - "eval_samples_per_second": 38.932, - "eval_steps_per_second": 4.866, - "eval_wer": 0.5014202949719954, + "eval_loss": 0.6599805951118469, + "eval_runtime": 144.0873, + "eval_samples_per_second": 39.254, + "eval_steps_per_second": 4.907, + "eval_wer": 0.5222191908330792, "step": 18000 }, { "epoch": 0.7269440842469252, - "grad_norm": 1.9746254682540894, - "learning_rate": 0.00024575577889447236, - "loss": 0.4698, + "grad_norm": 1.8236407041549683, + "learning_rate": 0.0002729458646616541, + "loss": 0.5008, "step": 18500 }, { "epoch": 0.7465912216590043, - "grad_norm": 1.833760142326355, - "learning_rate": 0.0002442482412060301, - "loss": 0.4799, + "grad_norm": 3.337568998336792, + "learning_rate": 0.000272193984962406, + "loss": 0.4982, "step": 19000 }, { "epoch": 0.7465912216590043, - "eval_loss": 0.614359438419342, - "eval_runtime": 144.72, - "eval_samples_per_second": 39.082, - "eval_steps_per_second": 4.885, - "eval_wer": 0.48193737863298614, + "eval_loss": 0.6371914744377136, + "eval_runtime": 144.7615, + "eval_samples_per_second": 39.071, + "eval_steps_per_second": 4.884, + "eval_wer": 0.5010993243568551, "step": 19000 }, { "epoch": 0.7662383590710834, - "grad_norm": 2.3315391540527344, - "learning_rate": 0.0002427407035175879, - "loss": 0.4773, + "grad_norm": 5.579843997955322, + "learning_rate": 0.0002714421052631579, + "loss": 0.4966, "step": 19500 }, { "epoch": 0.7858854964831624, - "grad_norm": 1.5900750160217285, - "learning_rate": 0.0002412331658291457, - "loss": 0.471, + "grad_norm": 1.800836443901062, + "learning_rate": 0.00027069022556390975, + "loss": 0.493, "step": 20000 }, { "epoch": 0.7858854964831624, - "eval_loss": 0.6183646321296692, - "eval_runtime": 144.8701, - "eval_samples_per_second": 39.042, - "eval_steps_per_second": 4.88, - "eval_wer": 0.4914220603103786, + "eval_loss": 0.6563202738761902, + "eval_runtime": 145.0143, + "eval_samples_per_second": 39.003, + "eval_steps_per_second": 4.875, + "eval_wer": 0.523583315947425, "step": 20000 }, { "epoch": 0.8055326338952414, - "grad_norm": 9.128410339355469, - "learning_rate": 0.00023972864321608037, - "loss": 0.4721, + "grad_norm": 2.785470724105835, + "learning_rate": 0.0002699383458646616, + "loss": 0.5046, "step": 20500 }, { "epoch": 0.8251797713073206, - "grad_norm": 2.2207324504852295, - "learning_rate": 0.00023822110552763816, - "loss": 0.4644, + "grad_norm": 2.336695909500122, + "learning_rate": 0.00026918646616541354, + "loss": 0.4928, "step": 21000 }, { "epoch": 0.8251797713073206, - "eval_loss": 0.619006872177124, - "eval_runtime": 146.3215, - "eval_samples_per_second": 38.655, - "eval_steps_per_second": 4.832, - "eval_wer": 0.4983068800051355, + "eval_loss": 0.6477507948875427, + "eval_runtime": 146.5574, + "eval_samples_per_second": 38.592, + "eval_steps_per_second": 4.824, + "eval_wer": 0.5030090995169392, "step": 21000 }, { "epoch": 0.8448269087193996, - "grad_norm": 2.999115467071533, - "learning_rate": 0.00023671356783919596, - "loss": 0.4649, + "grad_norm": 1.3052055835723877, + "learning_rate": 0.0002684360902255639, + "loss": 0.4878, "step": 21500 }, { "epoch": 0.8644740461314786, - "grad_norm": 1.8293167352676392, - "learning_rate": 0.00023520904522613062, - "loss": 0.4645, + "grad_norm": 2.2889890670776367, + "learning_rate": 0.00026768571428571427, + "loss": 0.4964, "step": 22000 }, { "epoch": 0.8644740461314786, - "eval_loss": 0.6084673404693604, - "eval_runtime": 146.1674, - "eval_samples_per_second": 38.695, - "eval_steps_per_second": 4.837, - "eval_wer": 0.47840670186644413, + "eval_loss": 0.6431675553321838, + "eval_runtime": 146.4682, + "eval_samples_per_second": 38.616, + "eval_steps_per_second": 4.827, + "eval_wer": 0.5103272295421354, "step": 22000 }, { "epoch": 0.8841211835435577, - "grad_norm": 6.625216484069824, - "learning_rate": 0.00023370150753768845, - "loss": 0.4539, + "grad_norm": 3.50435733795166, + "learning_rate": 0.00026693383458646614, + "loss": 0.4796, "step": 22500 }, { "epoch": 0.9037683209556368, - "grad_norm": 2.523075819015503, - "learning_rate": 0.00023219396984924622, - "loss": 0.4506, + "grad_norm": 1.364182472229004, + "learning_rate": 0.000266181954887218, + "loss": 0.4818, "step": 23000 }, { "epoch": 0.9037683209556368, - "eval_loss": 0.6067140698432922, - "eval_runtime": 150.1662, - "eval_samples_per_second": 37.665, - "eval_steps_per_second": 4.708, - "eval_wer": 0.4699972717497713, + "eval_loss": 0.6235994100570679, + "eval_runtime": 146.6113, + "eval_samples_per_second": 38.578, + "eval_steps_per_second": 4.822, + "eval_wer": 0.48960857633483656, "step": 23000 }, { "epoch": 0.9234154583677158, - "grad_norm": 2.416762590408325, - "learning_rate": 0.000230686432160804, - "loss": 0.4405, + "grad_norm": 2.1214494705200195, + "learning_rate": 0.00026543007518796993, + "loss": 0.4688, "step": 23500 }, { "epoch": 0.9430625957797949, - "grad_norm": 2.332602024078369, - "learning_rate": 0.00022917889447236179, - "loss": 0.4439, + "grad_norm": 2.6261446475982666, + "learning_rate": 0.00026467819548872174, + "loss": 0.4752, "step": 24000 }, { "epoch": 0.9430625957797949, - "eval_loss": 0.5994084477424622, - "eval_runtime": 145.8786, - "eval_samples_per_second": 38.772, + "eval_loss": 0.6326233744621277, + "eval_runtime": 145.9082, + "eval_samples_per_second": 38.764, "eval_steps_per_second": 4.846, - "eval_wer": 0.47725120765193946, + "eval_wer": 0.500810450803229, "step": 24000 }, { "epoch": 0.9627097331918739, - "grad_norm": 1.971414566040039, - "learning_rate": 0.00022767135678391956, - "loss": 0.4407, + "grad_norm": 4.142037391662598, + "learning_rate": 0.00026392631578947367, + "loss": 0.4698, "step": 24500 }, { "epoch": 0.982356870603953, - "grad_norm": 2.5181193351745605, - "learning_rate": 0.00022616381909547738, - "loss": 0.4476, + "grad_norm": 2.097465991973877, + "learning_rate": 0.00026317443609022553, + "loss": 0.4736, "step": 25000 }, { "epoch": 0.982356870603953, - "eval_loss": 0.5945668816566467, - "eval_runtime": 146.0094, - "eval_samples_per_second": 38.737, - "eval_steps_per_second": 4.842, - "eval_wer": 0.4709601835951919, + "eval_loss": 0.6309667229652405, + "eval_runtime": 145.598, + "eval_samples_per_second": 38.847, + "eval_steps_per_second": 4.856, + "eval_wer": 0.5081446293591821, "step": 25000 }, { "epoch": 1.002004008016032, - "grad_norm": 0.8755355477333069, - "learning_rate": 0.00022465628140703518, - "loss": 0.4328, + "grad_norm": 1.872559666633606, + "learning_rate": 0.0002624225563909774, + "loss": 0.461, "step": 25500 }, { "epoch": 1.0216511454281112, - "grad_norm": 1.2238690853118896, - "learning_rate": 0.00022314874371859295, - "loss": 0.3905, + "grad_norm": 0.9968547821044922, + "learning_rate": 0.00026167067669172927, + "loss": 0.4241, "step": 26000 }, { "epoch": 1.0216511454281112, - "eval_loss": 0.5904036164283752, - "eval_runtime": 146.0538, - "eval_samples_per_second": 38.725, - "eval_steps_per_second": 4.841, - "eval_wer": 0.4539326924620051, + "eval_loss": 0.6126999258995056, + "eval_runtime": 145.6037, + "eval_samples_per_second": 38.845, + "eval_steps_per_second": 4.856, + "eval_wer": 0.47134534833336006, "step": 26000 }, { "epoch": 1.0412982828401902, - "grad_norm": 0.8446155786514282, - "learning_rate": 0.00022164120603015072, - "loss": 0.3785, + "grad_norm": 0.840844988822937, + "learning_rate": 0.00026092030075187966, + "loss": 0.4146, "step": 26500 }, { "epoch": 1.0609454202522692, - "grad_norm": 1.0839017629623413, - "learning_rate": 0.00022013366834170852, - "loss": 0.3807, + "grad_norm": 0.8603857159614563, + "learning_rate": 0.00026016842105263153, + "loss": 0.4196, "step": 27000 }, { "epoch": 1.0609454202522692, - "eval_loss": 0.5839076042175293, - "eval_runtime": 146.3357, - "eval_samples_per_second": 38.651, - "eval_steps_per_second": 4.831, - "eval_wer": 0.45728683539022, + "eval_loss": 0.6066301465034485, + "eval_runtime": 144.6102, + "eval_samples_per_second": 39.112, + "eval_steps_per_second": 4.889, + "eval_wer": 0.46828007895877133, "step": 27000 }, { "epoch": 1.0805925576643483, - "grad_norm": 1.0781913995742798, - "learning_rate": 0.0002186291457286432, - "loss": 0.3873, + "grad_norm": 0.8338613510131836, + "learning_rate": 0.00025941654135338345, + "loss": 0.4213, "step": 27500 }, { "epoch": 1.1002396950764273, - "grad_norm": 1.6534680128097534, - "learning_rate": 0.00021712160804020098, - "loss": 0.3782, + "grad_norm": 1.0736676454544067, + "learning_rate": 0.0002586646616541353, + "loss": 0.4177, "step": 28000 }, { "epoch": 1.1002396950764273, - "eval_loss": 0.5693557262420654, - "eval_runtime": 145.969, - "eval_samples_per_second": 38.748, - "eval_steps_per_second": 4.843, - "eval_wer": 0.44768981399752855, + "eval_loss": 0.5958611965179443, + "eval_runtime": 145.1774, + "eval_samples_per_second": 38.959, + "eval_steps_per_second": 4.87, + "eval_wer": 0.47737959589799556, "step": 28000 }, { "epoch": 1.1198868324885065, - "grad_norm": 0.9230701923370361, - "learning_rate": 0.00021561407035175877, - "loss": 0.3879, + "grad_norm": 0.7483401894569397, + "learning_rate": 0.0002579127819548872, + "loss": 0.4204, "step": 28500 }, { "epoch": 1.1395339699005855, - "grad_norm": 0.8729119896888733, - "learning_rate": 0.00021410954773869344, - "loss": 0.3777, + "grad_norm": 1.1266822814941406, + "learning_rate": 0.0002571624060150376, + "loss": 0.4204, "step": 29000 }, { "epoch": 1.1395339699005855, - "eval_loss": 0.5712409615516663, - "eval_runtime": 145.9912, - "eval_samples_per_second": 38.742, - "eval_steps_per_second": 4.843, - "eval_wer": 0.4580892619280705, + "eval_loss": 0.607071042060852, + "eval_runtime": 145.6021, + "eval_samples_per_second": 38.846, + "eval_steps_per_second": 4.856, + "eval_wer": 0.4893517998427244, "step": 29000 }, { "epoch": 1.1591811073126645, - "grad_norm": 4.197005271911621, - "learning_rate": 0.00021260201005025123, - "loss": 0.3763, + "grad_norm": 2.852926731109619, + "learning_rate": 0.00025641052631578945, + "loss": 0.4255, "step": 29500 }, { "epoch": 1.1788282447247436, - "grad_norm": 1.8312333822250366, - "learning_rate": 0.0002110974874371859, - "loss": 0.3879, + "grad_norm": 2.134554862976074, + "learning_rate": 0.00025566015037593984, + "loss": 0.4238, "step": 30000 }, { "epoch": 1.1788282447247436, - "eval_loss": 0.5694305300712585, - "eval_runtime": 145.4398, - "eval_samples_per_second": 38.889, - "eval_steps_per_second": 4.861, - "eval_wer": 0.45385244980822004, + "eval_loss": 0.600638747215271, + "eval_runtime": 144.9068, + "eval_samples_per_second": 39.032, + "eval_steps_per_second": 4.879, + "eval_wer": 0.476432732583332, "step": 30000 }, { "epoch": 1.1984753821368226, - "grad_norm": 1.415966510772705, - "learning_rate": 0.0002095899497487437, - "loss": 0.3746, + "grad_norm": 1.6866382360458374, + "learning_rate": 0.0002549082706766917, + "loss": 0.4156, "step": 30500 }, { "epoch": 1.2181225195489018, - "grad_norm": 0.7083045244216919, - "learning_rate": 0.00020808241206030147, - "loss": 0.3817, + "grad_norm": 0.7769395709037781, + "learning_rate": 0.0002541563909774436, + "loss": 0.4253, "step": 31000 }, { "epoch": 1.2181225195489018, - "eval_loss": 0.555791974067688, - "eval_runtime": 146.2218, - "eval_samples_per_second": 38.681, - "eval_steps_per_second": 4.835, - "eval_wer": 0.4413987899407809, + "eval_loss": 0.5803025960922241, + "eval_runtime": 144.9773, + "eval_samples_per_second": 39.013, + "eval_steps_per_second": 4.877, + "eval_wer": 0.4623421225786779, "step": 31000 }, { "epoch": 1.2377696569609808, - "grad_norm": 3.3128950595855713, - "learning_rate": 0.0002065748743718593, - "loss": 0.379, + "grad_norm": 1.305306077003479, + "learning_rate": 0.00025340451127819544, + "loss": 0.4257, "step": 31500 }, { "epoch": 1.2574167943730599, - "grad_norm": 1.0298165082931519, - "learning_rate": 0.00020506733668341706, - "loss": 0.3755, + "grad_norm": 0.9590096473693848, + "learning_rate": 0.00025265263157894736, + "loss": 0.4156, "step": 32000 }, { "epoch": 1.2574167943730599, - "eval_loss": 0.5634162425994873, - "eval_runtime": 144.9179, - "eval_samples_per_second": 39.029, - "eval_steps_per_second": 4.879, - "eval_wer": 0.43432138787693986, + "eval_loss": 0.5940248966217041, + "eval_runtime": 145.3704, + "eval_samples_per_second": 38.908, + "eval_steps_per_second": 4.863, + "eval_wer": 0.45728683539022, "step": 32000 }, { "epoch": 1.2770639317851389, - "grad_norm": 1.2316133975982666, - "learning_rate": 0.00020355979899497486, - "loss": 0.3656, + "grad_norm": 1.598183274269104, + "learning_rate": 0.00025190075187969923, + "loss": 0.4103, "step": 32500 }, { "epoch": 1.296711069197218, - "grad_norm": 1.7103346586227417, - "learning_rate": 0.00020205226130653263, - "loss": 0.3629, + "grad_norm": 1.9396251440048218, + "learning_rate": 0.0002511488721804511, + "loss": 0.4058, "step": 33000 }, { "epoch": 1.296711069197218, - "eval_loss": 0.5455352663993835, - "eval_runtime": 144.6962, - "eval_samples_per_second": 39.089, - "eval_steps_per_second": 4.886, - "eval_wer": 0.4341929996308838, + "eval_loss": 0.5802159905433655, + "eval_runtime": 146.1677, + "eval_samples_per_second": 38.695, + "eval_steps_per_second": 4.837, + "eval_wer": 0.4614594533870424, "step": 33000 }, { "epoch": 1.316358206609297, - "grad_norm": 1.2906711101531982, - "learning_rate": 0.00020054472361809043, - "loss": 0.3786, + "grad_norm": 1.7467131614685059, + "learning_rate": 0.00025039699248120297, + "loss": 0.417, "step": 33500 }, { "epoch": 1.3360053440213762, - "grad_norm": 1.3450685739517212, - "learning_rate": 0.00019903718592964822, - "loss": 0.3636, + "grad_norm": 0.9871892333030701, + "learning_rate": 0.0002496451127819549, + "loss": 0.404, "step": 34000 }, { "epoch": 1.3360053440213762, - "eval_loss": 0.5471683740615845, - "eval_runtime": 145.1506, - "eval_samples_per_second": 38.966, - "eval_steps_per_second": 4.871, - "eval_wer": 0.434562115838295, + "eval_loss": 0.5882492065429688, + "eval_runtime": 145.2339, + "eval_samples_per_second": 38.944, + "eval_steps_per_second": 4.868, + "eval_wer": 0.46015952239572466, "step": 34000 }, { "epoch": 1.3556524814334552, - "grad_norm": 1.1486555337905884, - "learning_rate": 0.00019752964824120602, - "loss": 0.3553, + "grad_norm": 1.0927810668945312, + "learning_rate": 0.00024889323308270676, + "loss": 0.3955, "step": 34500 }, { "epoch": 1.3752996188455342, - "grad_norm": 1.4870734214782715, - "learning_rate": 0.0001960221105527638, - "loss": 0.3566, + "grad_norm": 1.8038376569747925, + "learning_rate": 0.0002481413533834586, + "loss": 0.3995, "step": 35000 }, { "epoch": 1.3752996188455342, - "eval_loss": 0.5466644167900085, - "eval_runtime": 145.1639, - "eval_samples_per_second": 38.963, - "eval_steps_per_second": 4.87, - "eval_wer": 0.4321548362247436, + "eval_loss": 0.5840802788734436, + "eval_runtime": 144.2215, + "eval_samples_per_second": 39.217, + "eval_steps_per_second": 4.902, + "eval_wer": 0.46152364751007047, "step": 35000 }, { "epoch": 1.3949467562576132, - "grad_norm": 2.2068591117858887, - "learning_rate": 0.0001945145728643216, - "loss": 0.3592, + "grad_norm": 1.3720190525054932, + "learning_rate": 0.0002473894736842105, + "loss": 0.4077, "step": 35500 }, { "epoch": 1.4145938936696925, - "grad_norm": 1.370450735092163, - "learning_rate": 0.00019300703517587936, - "loss": 0.3683, + "grad_norm": 1.4488073587417603, + "learning_rate": 0.0002466375939849624, + "loss": 0.4049, "step": 36000 }, { "epoch": 1.4145938936696925, - "eval_loss": 0.5440675020217896, - "eval_runtime": 144.9148, - "eval_samples_per_second": 39.03, - "eval_steps_per_second": 4.879, - "eval_wer": 0.4324597583091268, + "eval_loss": 0.5853234529495239, + "eval_runtime": 144.437, + "eval_samples_per_second": 39.159, + "eval_steps_per_second": 4.895, + "eval_wer": 0.4635618109162106, "step": 36000 }, { "epoch": 1.4342410310817715, - "grad_norm": 1.2991498708724976, - "learning_rate": 0.00019149949748743718, - "loss": 0.372, + "grad_norm": 2.5115835666656494, + "learning_rate": 0.00024588571428571423, + "loss": 0.4202, "step": 36500 }, { "epoch": 1.4538881684938505, - "grad_norm": 0.8429681062698364, - "learning_rate": 0.00018999195979899495, - "loss": 0.3581, + "grad_norm": 0.7861095070838928, + "learning_rate": 0.0002451353383458647, + "loss": 0.4018, "step": 37000 }, { "epoch": 1.4538881684938505, - "eval_loss": 0.5278629660606384, - "eval_runtime": 144.4965, - "eval_samples_per_second": 39.143, - "eval_steps_per_second": 4.893, - "eval_wer": 0.4191876233730802, + "eval_loss": 0.5737255215644836, + "eval_runtime": 143.969, + "eval_samples_per_second": 39.286, + "eval_steps_per_second": 4.911, + "eval_wer": 0.4532747027009677, "step": 37000 }, { "epoch": 1.4735353059059295, - "grad_norm": 1.4340102672576904, - "learning_rate": 0.00018848743718592964, - "loss": 0.3522, + "grad_norm": 1.29477858543396, + "learning_rate": 0.0002443834586466165, + "loss": 0.4028, "step": 37500 }, { "epoch": 1.4931824433180085, - "grad_norm": 3.3566503524780273, - "learning_rate": 0.00018697989949748744, - "loss": 0.3448, + "grad_norm": 1.5367540121078491, + "learning_rate": 0.0002436315789473684, + "loss": 0.3906, "step": 38000 }, { "epoch": 1.4931824433180085, - "eval_loss": 0.5341060161590576, - "eval_runtime": 144.4725, - "eval_samples_per_second": 39.149, - "eval_steps_per_second": 4.894, - "eval_wer": 0.41947649692670635, + "eval_loss": 0.5848459005355835, + "eval_runtime": 144.7268, + "eval_samples_per_second": 39.081, + "eval_steps_per_second": 4.885, + "eval_wer": 0.4637222962237807, "step": 38000 }, { "epoch": 1.5128295807300876, - "grad_norm": 5.104796886444092, - "learning_rate": 0.0001854723618090452, - "loss": 0.3658, + "grad_norm": 1.7903566360473633, + "learning_rate": 0.00024288120300751878, + "loss": 0.4147, "step": 38500 }, { "epoch": 1.5324767181421666, - "grad_norm": 0.7495909929275513, - "learning_rate": 0.000183964824120603, - "loss": 0.3558, + "grad_norm": 0.7342734336853027, + "learning_rate": 0.00024212932330827064, + "loss": 0.3932, "step": 39000 }, { "epoch": 1.5324767181421666, - "eval_loss": 0.5193748474121094, - "eval_runtime": 144.086, - "eval_samples_per_second": 39.254, - "eval_steps_per_second": 4.907, - "eval_wer": 0.4212418353099774, + "eval_loss": 0.551567792892456, + "eval_runtime": 144.7104, + "eval_samples_per_second": 39.085, + "eval_steps_per_second": 4.886, + "eval_wer": 0.44000256776492114, "step": 39000 }, { "epoch": 1.5521238555542456, - "grad_norm": 1.2974355220794678, - "learning_rate": 0.00018246030150753767, - "loss": 0.3487, + "grad_norm": 1.3719693422317505, + "learning_rate": 0.00024137894736842104, + "loss": 0.3984, "step": 39500 }, { "epoch": 1.5717709929663248, - "grad_norm": 1.052008867263794, - "learning_rate": 0.00018095276381909547, - "loss": 0.3492, + "grad_norm": 3.398484706878662, + "learning_rate": 0.0002406270676691729, + "loss": 0.4026, "step": 40000 }, { "epoch": 1.5717709929663248, - "eval_loss": 0.5243468284606934, - "eval_runtime": 144.7366, - "eval_samples_per_second": 39.078, - "eval_steps_per_second": 4.885, - "eval_wer": 0.4139076567540242, + "eval_loss": 0.5641522407531738, + "eval_runtime": 145.5525, + "eval_samples_per_second": 38.859, + "eval_steps_per_second": 4.857, + "eval_wer": 0.44844409494310794, "step": 40000 }, { "epoch": 1.5914181303784039, - "grad_norm": 1.5572227239608765, - "learning_rate": 0.00017944522613065324, - "loss": 0.3581, + "grad_norm": 1.1821295022964478, + "learning_rate": 0.0002398751879699248, + "loss": 0.4086, "step": 40500 }, { "epoch": 1.611065267790483, - "grad_norm": 1.3091472387313843, - "learning_rate": 0.00017793768844221104, - "loss": 0.3461, + "grad_norm": 1.3344157934188843, + "learning_rate": 0.00023912330827067667, + "loss": 0.396, "step": 41000 }, { "epoch": 1.611065267790483, - "eval_loss": 0.5143682360649109, - "eval_runtime": 145.6844, - "eval_samples_per_second": 38.824, - "eval_steps_per_second": 4.853, - "eval_wer": 0.40458346038420184, + "eval_loss": 0.5584043264389038, + "eval_runtime": 145.5026, + "eval_samples_per_second": 38.872, + "eval_steps_per_second": 4.859, + "eval_wer": 0.4512044422333135, "step": 41000 }, { "epoch": 1.630712405202562, - "grad_norm": 0.52028888463974, - "learning_rate": 0.00017643015075376883, - "loss": 0.34, + "grad_norm": 0.6132605671882629, + "learning_rate": 0.00023837142857142856, + "loss": 0.3863, "step": 41500 }, { "epoch": 1.6503595426146411, - "grad_norm": 1.1602140665054321, - "learning_rate": 0.00017492261306532663, - "loss": 0.3412, + "grad_norm": 2.896801710128784, + "learning_rate": 0.00023761954887218043, + "loss": 0.3976, "step": 42000 }, { "epoch": 1.6503595426146411, - "eval_loss": 0.5344811081886292, - "eval_runtime": 146.2652, - "eval_samples_per_second": 38.669, - "eval_steps_per_second": 4.834, - "eval_wer": 0.42369726051579976, + "eval_loss": 0.5537524819374084, + "eval_runtime": 145.3562, + "eval_samples_per_second": 38.911, + "eval_steps_per_second": 4.864, + "eval_wer": 0.4436455842467622, "step": 42000 }, { "epoch": 1.6700066800267201, - "grad_norm": 0.4986182749271393, - "learning_rate": 0.0001734150753768844, - "loss": 0.3501, + "grad_norm": 0.4839102029800415, + "learning_rate": 0.00023686766917293232, + "loss": 0.3977, "step": 42500 }, { "epoch": 1.6896538174387992, - "grad_norm": 1.2773915529251099, - "learning_rate": 0.0001719075376884422, - "loss": 0.3424, + "grad_norm": 0.7648475170135498, + "learning_rate": 0.0002361157894736842, + "loss": 0.3936, "step": 43000 }, { "epoch": 1.6896538174387992, - "eval_loss": 0.5192400813102722, - "eval_runtime": 144.477, - "eval_samples_per_second": 39.148, - "eval_steps_per_second": 4.894, - "eval_wer": 0.40917334018070645, + "eval_loss": 0.551811158657074, + "eval_runtime": 144.7074, + "eval_samples_per_second": 39.086, + "eval_steps_per_second": 4.886, + "eval_wer": 0.4412222561024538, "step": 43000 }, { "epoch": 1.7093009548508782, - "grad_norm": 1.184127926826477, - "learning_rate": 0.00017040301507537686, - "loss": 0.3353, + "grad_norm": 1.953736662864685, + "learning_rate": 0.00023536541353383458, + "loss": 0.3865, "step": 43500 }, { "epoch": 1.7289480922629572, - "grad_norm": 1.3090980052947998, - "learning_rate": 0.00016889547738693466, - "loss": 0.341, + "grad_norm": 1.4531214237213135, + "learning_rate": 0.00023461353383458645, + "loss": 0.3879, "step": 44000 }, { "epoch": 1.7289480922629572, - "eval_loss": 0.5130703449249268, - "eval_runtime": 144.4345, - "eval_samples_per_second": 39.16, - "eval_steps_per_second": 4.895, - "eval_wer": 0.40559451782189343, + "eval_loss": 0.5469211935997009, + "eval_runtime": 145.1619, + "eval_samples_per_second": 38.963, + "eval_steps_per_second": 4.87, + "eval_wer": 0.42974755661119224, "step": 44000 }, { "epoch": 1.7485952296750362, - "grad_norm": 1.4495439529418945, - "learning_rate": 0.00016738793969849243, - "loss": 0.3448, + "grad_norm": 1.0637991428375244, + "learning_rate": 0.00023386165413533835, + "loss": 0.3942, "step": 44500 }, { "epoch": 1.7682423670871152, - "grad_norm": 2.0205602645874023, - "learning_rate": 0.00016588040201005025, - "loss": 0.3428, + "grad_norm": 1.0606558322906494, + "learning_rate": 0.00023310977443609021, + "loss": 0.3939, "step": 45000 }, { "epoch": 1.7682423670871152, - "eval_loss": 0.5109943151473999, - "eval_runtime": 145.377, - "eval_samples_per_second": 38.906, - "eval_steps_per_second": 4.863, - "eval_wer": 0.4030107043700149, + "eval_loss": 0.5502393245697021, + "eval_runtime": 144.8654, + "eval_samples_per_second": 39.043, + "eval_steps_per_second": 4.88, + "eval_wer": 0.44024329572627624, "step": 45000 }, { "epoch": 1.7878895044991945, - "grad_norm": 0.9400632381439209, - "learning_rate": 0.00016437286432160802, - "loss": 0.3419, + "grad_norm": 0.7499143481254578, + "learning_rate": 0.0002323578947368421, + "loss": 0.3926, "step": 45500 }, { "epoch": 1.8075366419112735, - "grad_norm": 1.3378790616989136, - "learning_rate": 0.00016286834170854271, - "loss": 0.3337, + "grad_norm": 1.3015657663345337, + "learning_rate": 0.00023160601503759395, + "loss": 0.386, "step": 46000 }, { "epoch": 1.8075366419112735, - "eval_loss": 0.5063343644142151, - "eval_runtime": 144.676, - "eval_samples_per_second": 39.094, - "eval_steps_per_second": 4.887, - "eval_wer": 0.4051130618991831, + "eval_loss": 0.5626779198646545, + "eval_runtime": 145.6466, + "eval_samples_per_second": 38.834, + "eval_steps_per_second": 4.854, + "eval_wer": 0.4409012854873136, "step": 46000 }, { "epoch": 1.8271837793233527, - "grad_norm": 1.5063832998275757, - "learning_rate": 0.0001613608040201005, - "loss": 0.3298, + "grad_norm": 1.1235235929489136, + "learning_rate": 0.00023085413533834585, + "loss": 0.3833, "step": 46500 }, { "epoch": 1.8468309167354318, - "grad_norm": 0.6801648736000061, - "learning_rate": 0.00015985326633165828, - "loss": 0.3286, + "grad_norm": 0.8004291653633118, + "learning_rate": 0.0002301022556390977, + "loss": 0.3823, "step": 47000 }, { "epoch": 1.8468309167354318, - "eval_loss": 0.5045257210731506, - "eval_runtime": 144.5535, - "eval_samples_per_second": 39.127, - "eval_steps_per_second": 4.891, - "eval_wer": 0.39761839803565985, + "eval_loss": 0.5602549910545349, + "eval_runtime": 145.4825, + "eval_samples_per_second": 38.878, + "eval_steps_per_second": 4.86, + "eval_wer": 0.43724222047471556, "step": 47000 }, { "epoch": 1.8664780541475108, - "grad_norm": 1.5206536054611206, - "learning_rate": 0.00015834572864321605, - "loss": 0.3303, + "grad_norm": 2.6733715534210205, + "learning_rate": 0.0002293503759398496, + "loss": 0.3868, "step": 47500 }, { "epoch": 1.8861251915595898, - "grad_norm": 2.1600515842437744, - "learning_rate": 0.00015683819095477385, - "loss": 0.3422, + "grad_norm": 1.501878261566162, + "learning_rate": 0.00022859849624060148, + "loss": 0.3955, "step": 48000 }, { "epoch": 1.8861251915595898, - "eval_loss": 0.4938177168369293, - "eval_runtime": 145.0534, - "eval_samples_per_second": 38.993, - "eval_steps_per_second": 4.874, - "eval_wer": 0.40270578228563175, + "eval_loss": 0.534982442855835, + "eval_runtime": 145.013, + "eval_samples_per_second": 39.003, + "eval_steps_per_second": 4.875, + "eval_wer": 0.4308228081719119, "step": 48000 }, { "epoch": 1.9057723289716688, - "grad_norm": 0.6919649243354797, - "learning_rate": 0.00015533065326633162, - "loss": 0.3262, + "grad_norm": 1.785569667816162, + "learning_rate": 0.00022784661654135337, + "loss": 0.3766, "step": 48500 }, { "epoch": 1.9254194663837478, - "grad_norm": 1.5237656831741333, - "learning_rate": 0.00015382311557788944, - "loss": 0.3271, + "grad_norm": 0.956910252571106, + "learning_rate": 0.00022709473684210524, + "loss": 0.3808, "step": 49000 }, { "epoch": 1.9254194663837478, - "eval_loss": 0.4978640377521515, - "eval_runtime": 145.4834, - "eval_samples_per_second": 38.877, + "eval_loss": 0.550835132598877, + "eval_runtime": 145.4747, + "eval_samples_per_second": 38.88, "eval_steps_per_second": 4.86, - "eval_wer": 0.39097430630225805, + "eval_wer": 0.44476898139975285, "step": 49000 }, { "epoch": 1.9450666037958269, - "grad_norm": 0.9466130137443542, - "learning_rate": 0.0001523155778894472, - "loss": 0.3243, + "grad_norm": 1.0630252361297607, + "learning_rate": 0.00022634285714285713, + "loss": 0.3794, "step": 49500 }, { "epoch": 1.9647137412079059, - "grad_norm": 0.8876349925994873, - "learning_rate": 0.0001508110552763819, - "loss": 0.3313, + "grad_norm": 0.9397912621498108, + "learning_rate": 0.000225593984962406, + "loss": 0.3871, "step": 50000 }, { "epoch": 1.9647137412079059, - "eval_loss": 0.490749329328537, - "eval_runtime": 144.969, - "eval_samples_per_second": 39.015, - "eval_steps_per_second": 4.877, - "eval_wer": 0.3973776700743047, + "eval_loss": 0.5386993885040283, + "eval_runtime": 144.7105, + "eval_samples_per_second": 39.085, + "eval_steps_per_second": 4.886, + "eval_wer": 0.43197830238641655, "step": 50000 }, { "epoch": 1.9843608786199851, - "grad_norm": 1.058655023574829, - "learning_rate": 0.00014930653266331657, - "loss": 0.3138, + "grad_norm": 1.227219581604004, + "learning_rate": 0.00022484360902255636, + "loss": 0.371, "step": 50500 }, { "epoch": 2.004008016032064, - "grad_norm": 0.8039568066596985, - "learning_rate": 0.00014779899497487437, - "loss": 0.3069, + "grad_norm": 1.0407652854919434, + "learning_rate": 0.00022409172932330825, + "loss": 0.3668, "step": 51000 }, { "epoch": 2.004008016032064, - "eval_loss": 0.4898728132247925, - "eval_runtime": 145.2853, - "eval_samples_per_second": 38.93, - "eval_steps_per_second": 4.866, - "eval_wer": 0.38524498082200576, + "eval_loss": 0.5476531982421875, + "eval_runtime": 144.735, + "eval_samples_per_second": 39.078, + "eval_steps_per_second": 4.885, + "eval_wer": 0.4207443308565101, "step": 51000 }, { "epoch": 2.0236551534441434, - "grad_norm": 1.5901250839233398, - "learning_rate": 0.00014629145728643214, - "loss": 0.2734, + "grad_norm": 3.713465929031372, + "learning_rate": 0.00022333984962406012, + "loss": 0.3303, "step": 51500 }, { "epoch": 2.0433022908562224, - "grad_norm": 0.6160286664962769, - "learning_rate": 0.00014478391959798993, - "loss": 0.2771, + "grad_norm": 1.074621319770813, + "learning_rate": 0.00022258796992481202, + "loss": 0.3324, "step": 52000 }, { "epoch": 2.0433022908562224, - "eval_loss": 0.4836235046386719, - "eval_runtime": 145.1471, - "eval_samples_per_second": 38.967, - "eval_steps_per_second": 4.871, - "eval_wer": 0.3845067484071833, + "eval_loss": 0.5283042788505554, + "eval_runtime": 144.7457, + "eval_samples_per_second": 39.075, + "eval_steps_per_second": 4.884, + "eval_wer": 0.4227985427934073, "step": 52000 }, { "epoch": 2.0629494282683014, - "grad_norm": 1.2271616458892822, - "learning_rate": 0.00014327638190954773, - "loss": 0.2705, + "grad_norm": 1.2761338949203491, + "learning_rate": 0.00022183759398496238, + "loss": 0.3299, "step": 52500 }, { "epoch": 2.0825965656803804, - "grad_norm": 1.0109779834747314, - "learning_rate": 0.00014176884422110553, - "loss": 0.2705, + "grad_norm": 0.9065299034118652, + "learning_rate": 0.00022108571428571425, + "loss": 0.3327, "step": 53000 }, { "epoch": 2.0825965656803804, - "eval_loss": 0.4929197132587433, - "eval_runtime": 150.0346, - "eval_samples_per_second": 37.698, - "eval_steps_per_second": 4.712, - "eval_wer": 0.38250068206255716, + "eval_loss": 0.5217949151992798, + "eval_runtime": 145.0885, + "eval_samples_per_second": 38.983, + "eval_steps_per_second": 4.873, + "eval_wer": 0.41557670395275315, "step": 53000 }, { "epoch": 2.1022437030924594, - "grad_norm": 0.9157425165176392, - "learning_rate": 0.0001402613065326633, - "loss": 0.2748, + "grad_norm": 1.1222054958343506, + "learning_rate": 0.00022033383458646615, + "loss": 0.3347, "step": 53500 }, { "epoch": 2.1218908405045385, - "grad_norm": 1.036487102508545, - "learning_rate": 0.0001387537688442211, - "loss": 0.2654, + "grad_norm": 3.639472484588623, + "learning_rate": 0.000219581954887218, + "loss": 0.3251, "step": 54000 }, { "epoch": 2.1218908405045385, - "eval_loss": 0.4842771887779236, - "eval_runtime": 149.1221, - "eval_samples_per_second": 37.929, - "eval_steps_per_second": 4.741, - "eval_wer": 0.3813451878480525, + "eval_loss": 0.5330758094787598, + "eval_runtime": 144.669, + "eval_samples_per_second": 39.096, + "eval_steps_per_second": 4.887, + "eval_wer": 0.41357063760812696, "step": 54000 }, { "epoch": 2.1415379779166175, - "grad_norm": 0.9291256070137024, - "learning_rate": 0.00013724924623115576, - "loss": 0.2719, + "grad_norm": 2.337876558303833, + "learning_rate": 0.0002188300751879699, + "loss": 0.3368, "step": 54500 }, { "epoch": 2.1611851153286965, - "grad_norm": 1.0071407556533813, - "learning_rate": 0.00013574170854271356, - "loss": 0.2794, + "grad_norm": 0.8467469811439514, + "learning_rate": 0.00021807819548872178, + "loss": 0.3466, "step": 55000 }, { "epoch": 2.1611851153286965, - "eval_loss": 0.4819933772087097, - "eval_runtime": 150.0521, - "eval_samples_per_second": 37.694, - "eval_steps_per_second": 4.712, - "eval_wer": 0.3780712875736226, + "eval_loss": 0.5276508927345276, + "eval_runtime": 145.3583, + "eval_samples_per_second": 38.911, + "eval_steps_per_second": 4.864, + "eval_wer": 0.4141002391231083, "step": 55000 }, { "epoch": 2.1808322527407755, - "grad_norm": 0.558372974395752, - "learning_rate": 0.00013423417085427135, - "loss": 0.2699, + "grad_norm": 0.7859643697738647, + "learning_rate": 0.00021732781954887217, + "loss": 0.3337, "step": 55500 }, { "epoch": 2.2004793901528545, - "grad_norm": 0.5932472944259644, - "learning_rate": 0.00013272663316582912, - "loss": 0.2644, + "grad_norm": 0.8069686889648438, + "learning_rate": 0.00021657593984962404, + "loss": 0.3259, "step": 56000 }, { "epoch": 2.2004793901528545, - "eval_loss": 0.47418466210365295, - "eval_runtime": 145.6714, - "eval_samples_per_second": 38.827, - "eval_steps_per_second": 4.853, - "eval_wer": 0.3755195711832582, + "eval_loss": 0.522844672203064, + "eval_runtime": 145.284, + "eval_samples_per_second": 38.931, + "eval_steps_per_second": 4.866, + "eval_wer": 0.40875607838102423, "step": 56000 }, { "epoch": 2.2201265275649336, - "grad_norm": 1.2624306678771973, - "learning_rate": 0.00013121909547738692, - "loss": 0.2767, + "grad_norm": 0.5765830278396606, + "learning_rate": 0.00021582406015037593, + "loss": 0.337, "step": 56500 }, { "epoch": 2.239773664977013, - "grad_norm": 1.0963220596313477, - "learning_rate": 0.00012971758793969848, - "loss": 0.2624, + "grad_norm": 0.9564582109451294, + "learning_rate": 0.0002150736842105263, + "loss": 0.3292, "step": 57000 }, { "epoch": 2.239773664977013, - "eval_loss": 0.46851301193237305, - "eval_runtime": 145.312, - "eval_samples_per_second": 38.923, - "eval_steps_per_second": 4.865, - "eval_wer": 0.3680570043812489, + "eval_loss": 0.5119462013244629, + "eval_runtime": 145.6008, + "eval_samples_per_second": 38.846, + "eval_steps_per_second": 4.856, + "eval_wer": 0.4132657155237438, "step": 57000 }, { "epoch": 2.259420802389092, - "grad_norm": 0.6824894547462463, - "learning_rate": 0.00012821608040201004, - "loss": 0.263, + "grad_norm": 0.7495951056480408, + "learning_rate": 0.00021432330827067666, + "loss": 0.3259, "step": 57500 }, { "epoch": 2.279067939801171, - "grad_norm": 0.6939342021942139, - "learning_rate": 0.00012670854271356783, - "loss": 0.2689, + "grad_norm": 0.825587272644043, + "learning_rate": 0.00021357142857142855, + "loss": 0.3323, "step": 58000 }, { "epoch": 2.279067939801171, - "eval_loss": 0.4649806618690491, - "eval_runtime": 145.0546, - "eval_samples_per_second": 38.992, - "eval_steps_per_second": 4.874, - "eval_wer": 0.366532393959333, + "eval_loss": 0.5191282033920288, + "eval_runtime": 145.5654, + "eval_samples_per_second": 38.855, + "eval_steps_per_second": 4.857, + "eval_wer": 0.40739195326667843, "step": 58000 }, { "epoch": 2.29871507721325, - "grad_norm": 3.4403460025787354, - "learning_rate": 0.0001252010050251256, - "loss": 0.2634, + "grad_norm": 0.9213058948516846, + "learning_rate": 0.00021281954887218042, + "loss": 0.3292, "step": 58500 }, { "epoch": 2.318362214625329, - "grad_norm": 0.7877224683761597, - "learning_rate": 0.0001236934673366834, - "loss": 0.2584, + "grad_norm": 6.7399773597717285, + "learning_rate": 0.00021206766917293232, + "loss": 0.3228, "step": 59000 }, { "epoch": 2.318362214625329, - "eval_loss": 0.4691295921802521, - "eval_runtime": 145.6624, - "eval_samples_per_second": 38.83, - "eval_steps_per_second": 4.854, - "eval_wer": 0.3657620644829966, + "eval_loss": 0.5073339939117432, + "eval_runtime": 145.4974, + "eval_samples_per_second": 38.874, + "eval_steps_per_second": 4.859, + "eval_wer": 0.3955802346295197, "step": 59000 }, { "epoch": 2.338009352037408, - "grad_norm": 0.8857819437980652, - "learning_rate": 0.0001221859296482412, - "loss": 0.2579, + "grad_norm": 0.9246654510498047, + "learning_rate": 0.00021131578947368419, + "loss": 0.3311, "step": 59500 }, { "epoch": 2.357656489449487, - "grad_norm": 2.5980610847473145, - "learning_rate": 0.000120678391959799, - "loss": 0.2535, + "grad_norm": 0.8129465579986572, + "learning_rate": 0.00021056390977443608, + "loss": 0.3172, "step": 60000 }, { "epoch": 2.357656489449487, - "eval_loss": 0.46268430352211, - "eval_runtime": 145.2629, - "eval_samples_per_second": 38.936, + "eval_loss": 0.5084324479103088, + "eval_runtime": 145.2717, + "eval_samples_per_second": 38.934, "eval_steps_per_second": 4.867, - "eval_wer": 0.37131485612492177, + "eval_wer": 0.4045353147919308, "step": 60000 }, { "epoch": 2.377303626861566, - "grad_norm": 0.38764873147010803, - "learning_rate": 0.00011917085427135678, - "loss": 0.2536, + "grad_norm": 0.6280909776687622, + "learning_rate": 0.00020981203007518795, + "loss": 0.3235, "step": 60500 }, { "epoch": 2.396950764273645, - "grad_norm": 0.6207642555236816, - "learning_rate": 0.00011766331658291456, - "loss": 0.2623, + "grad_norm": 3.0157957077026367, + "learning_rate": 0.00020906015037593984, + "loss": 0.332, "step": 61000 }, { "epoch": 2.396950764273645, - "eval_loss": 0.4667174816131592, - "eval_runtime": 145.1487, - "eval_samples_per_second": 38.967, - "eval_steps_per_second": 4.871, - "eval_wer": 0.3669817528205293, + "eval_loss": 0.512955367565155, + "eval_runtime": 145.4819, + "eval_samples_per_second": 38.878, + "eval_steps_per_second": 4.86, + "eval_wer": 0.40151819100961306, "step": 61000 }, { "epoch": 2.4165979016857246, - "grad_norm": 0.9032436609268188, - "learning_rate": 0.00011615577889447236, - "loss": 0.2631, + "grad_norm": 1.2728731632232666, + "learning_rate": 0.0002083082706766917, + "loss": 0.3298, "step": 61500 }, { "epoch": 2.4362450390978037, - "grad_norm": 1.0269511938095093, - "learning_rate": 0.00011464824120603014, - "loss": 0.2502, + "grad_norm": 6.008030891418457, + "learning_rate": 0.0002075563909774436, + "loss": 0.3218, "step": 62000 }, { "epoch": 2.4362450390978037, - "eval_loss": 0.4592076241970062, - "eval_runtime": 144.6818, - "eval_samples_per_second": 39.093, - "eval_steps_per_second": 4.887, - "eval_wer": 0.3680570043812489, + "eval_loss": 0.5102687478065491, + "eval_runtime": 145.1668, + "eval_samples_per_second": 38.962, + "eval_steps_per_second": 4.87, + "eval_wer": 0.39972075556482806, "step": 62000 }, { "epoch": 2.4558921765098827, - "grad_norm": 0.4968740940093994, - "learning_rate": 0.00011314070351758793, - "loss": 0.2514, + "grad_norm": 1.1027765274047852, + "learning_rate": 0.00020680451127819547, + "loss": 0.3207, "step": 62500 }, { "epoch": 2.4755393139219617, - "grad_norm": 0.9083556532859802, - "learning_rate": 0.00011163316582914572, - "loss": 0.2593, + "grad_norm": 0.9439337849617004, + "learning_rate": 0.00020605263157894737, + "loss": 0.3317, "step": 63000 }, { "epoch": 2.4755393139219617, - "eval_loss": 0.4568769037723541, - "eval_runtime": 144.6454, - "eval_samples_per_second": 39.103, - "eval_steps_per_second": 4.888, - "eval_wer": 0.3675755484585386, + "eval_loss": 0.5019811391830444, + "eval_runtime": 145.2988, + "eval_samples_per_second": 38.927, + "eval_steps_per_second": 4.866, + "eval_wer": 0.40500072218388405, "step": 63000 }, { "epoch": 2.4951864513340407, - "grad_norm": 0.7074434757232666, - "learning_rate": 0.00011012562814070351, - "loss": 0.2551, + "grad_norm": 1.4857794046401978, + "learning_rate": 0.0002053007518796992, + "loss": 0.3272, "step": 63500 }, { "epoch": 2.5148335887461197, - "grad_norm": 1.97989821434021, - "learning_rate": 0.0001086180904522613, - "loss": 0.2521, + "grad_norm": 0.9404523968696594, + "learning_rate": 0.0002045488721804511, + "loss": 0.3222, "step": 64000 }, { "epoch": 2.5148335887461197, - "eval_loss": 0.4576202929019928, - "eval_runtime": 143.8418, - "eval_samples_per_second": 39.321, - "eval_steps_per_second": 4.915, - "eval_wer": 0.35895748744202466, + "eval_loss": 0.5072047114372253, + "eval_runtime": 145.1717, + "eval_samples_per_second": 38.961, + "eval_steps_per_second": 4.87, + "eval_wer": 0.39964051291104297, "step": 64000 }, { "epoch": 2.5344807261581987, - "grad_norm": 0.7055521607398987, - "learning_rate": 0.00010711055276381909, - "loss": 0.2545, + "grad_norm": 1.1841472387313843, + "learning_rate": 0.00020379699248120297, + "loss": 0.3261, "step": 64500 }, { "epoch": 2.5541278635702778, - "grad_norm": 0.7131598591804504, - "learning_rate": 0.00010560603015075377, - "loss": 0.2415, + "grad_norm": 0.7141321301460266, + "learning_rate": 0.0002030466165413534, + "loss": 0.3138, "step": 65000 }, { "epoch": 2.5541278635702778, - "eval_loss": 0.4510234296321869, - "eval_runtime": 144.8155, - "eval_samples_per_second": 39.057, - "eval_steps_per_second": 4.882, - "eval_wer": 0.3541589767456789, + "eval_loss": 0.5098404884338379, + "eval_runtime": 144.9074, + "eval_samples_per_second": 39.032, + "eval_steps_per_second": 4.879, + "eval_wer": 0.40357240294651026, "step": 65000 }, { "epoch": 2.573775000982357, - "grad_norm": 2.750293254852295, - "learning_rate": 0.00010409849246231155, - "loss": 0.251, + "grad_norm": 0.8695092797279358, + "learning_rate": 0.00020229473684210523, + "loss": 0.3198, "step": 65500 }, { "epoch": 2.593422138394436, - "grad_norm": 2.111311435699463, - "learning_rate": 0.00010259095477386933, - "loss": 0.2349, + "grad_norm": 1.1630802154541016, + "learning_rate": 0.00020154285714285713, + "loss": 0.3074, "step": 66000 }, { "epoch": 2.593422138394436, - "eval_loss": 0.445352166891098, - "eval_runtime": 144.573, - "eval_samples_per_second": 39.122, - "eval_steps_per_second": 4.89, - "eval_wer": 0.35335655020782847, + "eval_loss": 0.5026105046272278, + "eval_runtime": 145.0532, + "eval_samples_per_second": 38.993, + "eval_steps_per_second": 4.874, + "eval_wer": 0.3981159024891271, "step": 66000 }, { "epoch": 2.613069275806515, - "grad_norm": 2.4952309131622314, - "learning_rate": 0.00010108643216080401, - "loss": 0.2467, + "grad_norm": 1.2439523935317993, + "learning_rate": 0.000200790977443609, + "loss": 0.3234, "step": 66500 }, { "epoch": 2.632716413218594, - "grad_norm": 0.967322587966919, - "learning_rate": 9.95788944723618e-05, - "loss": 0.2482, + "grad_norm": 0.7216903567314148, + "learning_rate": 0.0002000390977443609, + "loss": 0.3261, "step": 67000 }, { "epoch": 2.632716413218594, - "eval_loss": 0.4530712068080902, - "eval_runtime": 144.4196, - "eval_samples_per_second": 39.164, - "eval_steps_per_second": 4.895, - "eval_wer": 0.35855627417309943, + "eval_loss": 0.5030384063720703, + "eval_runtime": 145.8235, + "eval_samples_per_second": 38.787, + "eval_steps_per_second": 4.848, + "eval_wer": 0.39349392563110847, "step": 67000 }, { "epoch": 2.652363550630673, - "grad_norm": 2.702293634414673, - "learning_rate": 9.807135678391959e-05, - "loss": 0.2382, + "grad_norm": 1.0616713762283325, + "learning_rate": 0.00019928721804511276, + "loss": 0.3147, "step": 67500 }, { "epoch": 2.6720106880427523, - "grad_norm": 7.968295097351074, - "learning_rate": 9.656381909547737e-05, - "loss": 0.2527, + "grad_norm": 10.39274787902832, + "learning_rate": 0.00019853533834586465, + "loss": 0.3257, "step": 68000 }, { "epoch": 2.6720106880427523, - "eval_loss": 0.4417664110660553, - "eval_runtime": 144.8639, - "eval_samples_per_second": 39.044, - "eval_steps_per_second": 4.88, - "eval_wer": 0.3521529104010528, + "eval_loss": 0.500296413898468, + "eval_runtime": 144.7526, + "eval_samples_per_second": 39.074, + "eval_steps_per_second": 4.884, + "eval_wer": 0.39028421947970665, "step": 68000 }, { "epoch": 2.6916578254548313, - "grad_norm": 0.7025226950645447, - "learning_rate": 9.505628140703516e-05, - "loss": 0.2471, + "grad_norm": 0.7911710739135742, + "learning_rate": 0.00019778345864661652, + "loss": 0.3274, "step": 68500 }, { "epoch": 2.7113049628669104, - "grad_norm": 0.968565821647644, - "learning_rate": 9.354874371859296e-05, - "loss": 0.2473, + "grad_norm": 0.6936825513839722, + "learning_rate": 0.00019703157894736842, + "loss": 0.3179, "step": 69000 }, { "epoch": 2.7113049628669104, - "eval_loss": 0.44372445344924927, - "eval_runtime": 144.5528, - "eval_samples_per_second": 39.128, - "eval_steps_per_second": 4.891, - "eval_wer": 0.35826740061947326, + "eval_loss": 0.5139185786247253, + "eval_runtime": 145.2074, + "eval_samples_per_second": 38.951, + "eval_steps_per_second": 4.869, + "eval_wer": 0.4003947938566224, "step": 69000 }, { "epoch": 2.7309521002789894, - "grad_norm": 0.7599900364875793, - "learning_rate": 9.204422110552763e-05, - "loss": 0.2348, + "grad_norm": 0.5706244111061096, + "learning_rate": 0.00019627969924812028, + "loss": 0.3147, "step": 69500 }, { "epoch": 2.7505992376910684, - "grad_norm": 2.434638261795044, - "learning_rate": 9.053668341708543e-05, - "loss": 0.2334, + "grad_norm": 0.9982422590255737, + "learning_rate": 0.00019552932330827065, + "loss": 0.3154, "step": 70000 }, { "epoch": 2.7505992376910684, - "eval_loss": 0.43379753828048706, - "eval_runtime": 145.1622, - "eval_samples_per_second": 38.963, - "eval_steps_per_second": 4.87, - "eval_wer": 0.34566930397522105, + "eval_loss": 0.5041365027427673, + "eval_runtime": 144.6141, + "eval_samples_per_second": 39.111, + "eval_steps_per_second": 4.889, + "eval_wer": 0.39455312866107106, "step": 70000 }, { "epoch": 2.7702463751031474, - "grad_norm": 0.5358386635780334, - "learning_rate": 8.902914572864321e-05, - "loss": 0.2355, + "grad_norm": 0.6709697842597961, + "learning_rate": 0.00019477894736842104, + "loss": 0.3116, "step": 70500 }, { "epoch": 2.7898935125152264, - "grad_norm": 1.4655603170394897, - "learning_rate": 8.7521608040201e-05, - "loss": 0.2314, + "grad_norm": 1.2155810594558716, + "learning_rate": 0.0001940270676691729, + "loss": 0.3119, "step": 71000 }, { "epoch": 2.7898935125152264, - "eval_loss": 0.4286041557788849, - "eval_runtime": 144.4601, - "eval_samples_per_second": 39.153, - "eval_steps_per_second": 4.894, - "eval_wer": 0.34557301279067903, + "eval_loss": 0.49135103821754456, + "eval_runtime": 144.8705, + "eval_samples_per_second": 39.042, + "eval_steps_per_second": 4.88, + "eval_wer": 0.3940877212691178, "step": 71000 }, { "epoch": 2.8095406499273055, - "grad_norm": 1.992447018623352, - "learning_rate": 8.60140703517588e-05, - "loss": 0.2391, + "grad_norm": 9.424253463745117, + "learning_rate": 0.0001932766917293233, + "loss": 0.3177, "step": 71500 }, { "epoch": 2.829187787339385, - "grad_norm": 1.4909594058990479, - "learning_rate": 8.450653266331658e-05, - "loss": 0.2318, + "grad_norm": 0.6820365786552429, + "learning_rate": 0.00019252481203007517, + "loss": 0.3128, "step": 72000 }, { "epoch": 2.829187787339385, - "eval_loss": 0.42746907472610474, - "eval_runtime": 144.6523, - "eval_samples_per_second": 39.101, - "eval_steps_per_second": 4.888, - "eval_wer": 0.3371154370817352, + "eval_loss": 0.4867289066314697, + "eval_runtime": 144.797, + "eval_samples_per_second": 39.062, + "eval_steps_per_second": 4.883, + "eval_wer": 0.38309447770056654, "step": 72000 }, { "epoch": 2.848834924751464, - "grad_norm": 0.7300348281860352, - "learning_rate": 8.300201005025126e-05, - "loss": 0.2328, + "grad_norm": 0.6361156702041626, + "learning_rate": 0.00019177443609022553, + "loss": 0.3127, "step": 72500 }, { "epoch": 2.868482062163543, - "grad_norm": 0.5728158354759216, - "learning_rate": 8.149447236180904e-05, - "loss": 0.2347, + "grad_norm": 1.142830491065979, + "learning_rate": 0.00019102255639097743, + "loss": 0.3105, "step": 73000 }, { "epoch": 2.868482062163543, - "eval_loss": 0.42658358812332153, - "eval_runtime": 144.7899, - "eval_samples_per_second": 39.064, - "eval_steps_per_second": 4.883, - "eval_wer": 0.3408386962173613, + "eval_loss": 0.4870510995388031, + "eval_runtime": 145.1289, + "eval_samples_per_second": 38.972, + "eval_steps_per_second": 4.872, + "eval_wer": 0.3817945467092488, "step": 73000 }, { "epoch": 2.888129199575622, - "grad_norm": 0.7156108021736145, - "learning_rate": 7.998693467336682e-05, - "loss": 0.2421, + "grad_norm": 0.8262931704521179, + "learning_rate": 0.0001902706766917293, + "loss": 0.3234, "step": 73500 }, { "epoch": 2.907776336987701, - "grad_norm": 1.1177480220794678, - "learning_rate": 7.847939698492462e-05, - "loss": 0.2313, + "grad_norm": 1.1251935958862305, + "learning_rate": 0.0001895187969924812, + "loss": 0.309, "step": 74000 }, { "epoch": 2.907776336987701, - "eval_loss": 0.42381560802459717, - "eval_runtime": 144.7411, - "eval_samples_per_second": 39.077, - "eval_steps_per_second": 4.885, - "eval_wer": 0.33544638988300624, + "eval_loss": 0.48873621225357056, + "eval_runtime": 144.4047, + "eval_samples_per_second": 39.168, + "eval_steps_per_second": 4.896, + "eval_wer": 0.39885413490394955, "step": 74000 }, { "epoch": 2.92742347439978, - "grad_norm": 0.5893663763999939, - "learning_rate": 7.69718592964824e-05, - "loss": 0.2393, + "grad_norm": 0.825520396232605, + "learning_rate": 0.00018876691729323306, + "loss": 0.3204, "step": 74500 }, { "epoch": 2.947070611811859, - "grad_norm": 0.8663122057914734, - "learning_rate": 7.546733668341708e-05, - "loss": 0.2253, + "grad_norm": 0.5379465818405151, + "learning_rate": 0.00018801654135338345, + "loss": 0.3073, "step": 75000 }, { "epoch": 2.947070611811859, - "eval_loss": 0.41986486315727234, - "eval_runtime": 144.4042, - "eval_samples_per_second": 39.168, - "eval_steps_per_second": 4.896, - "eval_wer": 0.33164288809359505, + "eval_loss": 0.48394420742988586, + "eval_runtime": 145.2237, + "eval_samples_per_second": 38.947, + "eval_steps_per_second": 4.868, + "eval_wer": 0.3903644621334917, "step": 75000 }, { "epoch": 2.966717749223938, - "grad_norm": 0.48322752118110657, - "learning_rate": 7.395979899497487e-05, - "loss": 0.2222, + "grad_norm": 0.4377336800098419, + "learning_rate": 0.00018726466165413532, + "loss": 0.3044, "step": 75500 }, { "epoch": 2.986364886636017, - "grad_norm": 0.6252803206443787, - "learning_rate": 7.245226130653266e-05, - "loss": 0.217, + "grad_norm": 0.8088381290435791, + "learning_rate": 0.0001865127819548872, + "loss": 0.3023, "step": 76000 }, { "epoch": 2.986364886636017, - "eval_loss": 0.4222155809402466, - "eval_runtime": 144.6071, - "eval_samples_per_second": 39.113, - "eval_steps_per_second": 4.889, - "eval_wer": 0.33327983823081, + "eval_loss": 0.48391589522361755, + "eval_runtime": 145.3642, + "eval_samples_per_second": 38.909, + "eval_steps_per_second": 4.864, + "eval_wer": 0.38049461571793103, "step": 76000 }, { "epoch": 3.006012024048096, - "grad_norm": 0.9049218893051147, - "learning_rate": 7.094472361809045e-05, - "loss": 0.2258, + "grad_norm": 0.6517421007156372, + "learning_rate": 0.00018576090225563908, + "loss": 0.3049, "step": 76500 }, { "epoch": 3.025659161460175, - "grad_norm": 0.6731761693954468, - "learning_rate": 6.943718592964823e-05, - "loss": 0.194, + "grad_norm": 1.4399667978286743, + "learning_rate": 0.00018500902255639098, + "loss": 0.2715, "step": 77000 }, { "epoch": 3.025659161460175, - "eval_loss": 0.4252253770828247, - "eval_runtime": 145.6762, - "eval_samples_per_second": 38.826, - "eval_steps_per_second": 4.853, - "eval_wer": 0.3342267015454735, + "eval_loss": 0.48158180713653564, + "eval_runtime": 147.38, + "eval_samples_per_second": 38.377, + "eval_steps_per_second": 4.797, + "eval_wer": 0.38766830896631416, "step": 77000 }, { "epoch": 3.045306298872254, - "grad_norm": 3.186687707901001, - "learning_rate": 6.792964824120603e-05, - "loss": 0.1942, + "grad_norm": 1.0456621646881104, + "learning_rate": 0.00018425714285714284, + "loss": 0.2762, "step": 77500 }, { "epoch": 3.064953436284333, - "grad_norm": 1.0603581666946411, - "learning_rate": 6.642512562814069e-05, - "loss": 0.181, + "grad_norm": 0.6409999132156372, + "learning_rate": 0.00018350526315789474, + "loss": 0.2565, "step": 78000 }, { "epoch": 3.064953436284333, - "eval_loss": 0.42276450991630554, - "eval_runtime": 144.274, - "eval_samples_per_second": 39.203, - "eval_steps_per_second": 4.9, - "eval_wer": 0.33640930172842676, + "eval_loss": 0.49935096502304077, + "eval_runtime": 144.0075, + "eval_samples_per_second": 39.276, + "eval_steps_per_second": 4.909, + "eval_wer": 0.3811044598866974, "step": 78000 }, { "epoch": 3.0846005736964126, - "grad_norm": 0.5516805052757263, - "learning_rate": 6.492060301507537e-05, - "loss": 0.1871, + "grad_norm": 0.8721100687980652, + "learning_rate": 0.0001827548872180451, + "loss": 0.2681, "step": 78500 }, { "epoch": 3.1042477111084916, - "grad_norm": 0.4451453685760498, - "learning_rate": 6.341306532663317e-05, - "loss": 0.187, + "grad_norm": 1.0572487115859985, + "learning_rate": 0.00018200300751879697, + "loss": 0.2697, "step": 79000 }, { "epoch": 3.1042477111084916, - "eval_loss": 0.4196587800979614, - "eval_runtime": 144.9483, - "eval_samples_per_second": 39.021, - "eval_steps_per_second": 4.878, - "eval_wer": 0.3355908266598193, + "eval_loss": 0.48027363419532776, + "eval_runtime": 143.6893, + "eval_samples_per_second": 39.363, + "eval_steps_per_second": 4.92, + "eval_wer": 0.3813291393172955, "step": 79000 }, { "epoch": 3.1238948485205706, - "grad_norm": 0.48886096477508545, - "learning_rate": 6.190552763819095e-05, - "loss": 0.1894, + "grad_norm": 0.5640320777893066, + "learning_rate": 0.00018125112781954887, + "loss": 0.274, "step": 79500 }, { "epoch": 3.1435419859326497, - "grad_norm": 0.874115526676178, - "learning_rate": 6.039798994974873e-05, - "loss": 0.1855, + "grad_norm": 0.740835964679718, + "learning_rate": 0.00018049924812030073, + "loss": 0.2717, "step": 80000 }, { "epoch": 3.1435419859326497, - "eval_loss": 0.42146381735801697, - "eval_runtime": 144.3542, - "eval_samples_per_second": 39.181, - "eval_steps_per_second": 4.898, - "eval_wer": 0.3409831329941744, + "eval_loss": 0.48425012826919556, + "eval_runtime": 144.7996, + "eval_samples_per_second": 39.061, + "eval_steps_per_second": 4.883, + "eval_wer": 0.37988477154916467, "step": 80000 }, { "epoch": 3.1631891233447287, - "grad_norm": 0.5143038630485535, - "learning_rate": 5.8890452261306523e-05, - "loss": 0.1841, + "grad_norm": 0.4206051528453827, + "learning_rate": 0.00017974736842105263, + "loss": 0.2751, "step": 80500 }, { "epoch": 3.1828362607568077, - "grad_norm": 0.7534670829772949, - "learning_rate": 5.7382914572864314e-05, - "loss": 0.1886, + "grad_norm": 1.7560110092163086, + "learning_rate": 0.00017899548872180447, + "loss": 0.2738, "step": 81000 }, { "epoch": 3.1828362607568077, - "eval_loss": 0.4176616668701172, - "eval_runtime": 145.008, - "eval_samples_per_second": 39.005, - "eval_steps_per_second": 4.876, - "eval_wer": 0.33186756752419316, + "eval_loss": 0.4904831647872925, + "eval_runtime": 145.8617, + "eval_samples_per_second": 38.776, + "eval_steps_per_second": 4.847, + "eval_wer": 0.37967614064932353, "step": 81000 }, { "epoch": 3.2024833981688867, - "grad_norm": 0.7065553665161133, - "learning_rate": 5.5875376884422104e-05, - "loss": 0.1806, + "grad_norm": 0.5435498952865601, + "learning_rate": 0.00017824360902255637, + "loss": 0.2671, "step": 81500 }, { "epoch": 3.2221305355809657, - "grad_norm": 0.4848778247833252, - "learning_rate": 5.437085427135678e-05, - "loss": 0.1821, + "grad_norm": 0.8769587278366089, + "learning_rate": 0.00017749323308270673, + "loss": 0.2617, "step": 82000 }, { "epoch": 3.2221305355809657, - "eval_loss": 0.4128184914588928, - "eval_runtime": 145.0877, - "eval_samples_per_second": 38.983, - "eval_steps_per_second": 4.873, - "eval_wer": 0.32925165701080067, + "eval_loss": 0.4753645956516266, + "eval_runtime": 144.9825, + "eval_samples_per_second": 39.012, + "eval_steps_per_second": 4.876, + "eval_wer": 0.37282341801608065, "step": 82000 }, { "epoch": 3.2417776729930448, - "grad_norm": 0.9558472037315369, - "learning_rate": 5.286331658291457e-05, - "loss": 0.1802, + "grad_norm": 0.5920813083648682, + "learning_rate": 0.00017674135338345865, + "loss": 0.27, "step": 82500 }, { "epoch": 3.261424810405124, - "grad_norm": 0.5446112751960754, - "learning_rate": 5.1355778894472356e-05, - "loss": 0.1786, + "grad_norm": 0.4276420474052429, + "learning_rate": 0.0001759894736842105, + "loss": 0.2634, "step": 83000 }, { "epoch": 3.261424810405124, - "eval_loss": 0.41023358702659607, - "eval_runtime": 144.3156, - "eval_samples_per_second": 39.192, - "eval_steps_per_second": 4.899, - "eval_wer": 0.32260756527739887, + "eval_loss": 0.4729759693145752, + "eval_runtime": 144.8389, + "eval_samples_per_second": 39.05, + "eval_steps_per_second": 4.881, + "eval_wer": 0.3668052189822022, "step": 83000 }, { "epoch": 3.2810719478172032, - "grad_norm": 0.3519670367240906, - "learning_rate": 4.9848241206030146e-05, - "loss": 0.1877, + "grad_norm": 2.3023736476898193, + "learning_rate": 0.0001752375939849624, + "loss": 0.2771, "step": 83500 }, { "epoch": 3.3007190852292823, - "grad_norm": 0.7854357957839966, - "learning_rate": 4.834070351758794e-05, - "loss": 0.1758, + "grad_norm": 1.1624869108200073, + "learning_rate": 0.00017448571428571426, + "loss": 0.2648, "step": 84000 }, { "epoch": 3.3007190852292823, - "eval_loss": 0.4147268831729889, - "eval_runtime": 144.3652, - "eval_samples_per_second": 39.178, - "eval_steps_per_second": 4.897, - "eval_wer": 0.32637897000529603, + "eval_loss": 0.4768010377883911, + "eval_runtime": 144.7404, + "eval_samples_per_second": 39.077, + "eval_steps_per_second": 4.885, + "eval_wer": 0.3690520132881835, "step": 84000 }, { "epoch": 3.3203662226413613, - "grad_norm": 0.5795627236366272, - "learning_rate": 4.683316582914573e-05, - "loss": 0.185, + "grad_norm": 0.64561527967453, + "learning_rate": 0.00017373383458646615, + "loss": 0.2745, "step": 84500 }, { "epoch": 3.3400133600534403, - "grad_norm": 1.2406518459320068, - "learning_rate": 4.532562814070351e-05, - "loss": 0.171, + "grad_norm": 0.5857324600219727, + "learning_rate": 0.00017298195488721802, + "loss": 0.2567, "step": 85000 }, { "epoch": 3.3400133600534403, - "eval_loss": 0.41314879059791565, - "eval_runtime": 145.1379, - "eval_samples_per_second": 38.97, - "eval_steps_per_second": 4.871, - "eval_wer": 0.31997560623324933, + "eval_loss": 0.4812460243701935, + "eval_runtime": 145.4739, + "eval_samples_per_second": 38.88, + "eval_steps_per_second": 4.86, + "eval_wer": 0.37410730047664137, "step": 85000 }, { "epoch": 3.3596604974655193, - "grad_norm": 9.228507995605469, - "learning_rate": 4.38180904522613e-05, - "loss": 0.1754, + "grad_norm": 0.97500079870224, + "learning_rate": 0.00017223007518796991, + "loss": 0.2686, "step": 85500 }, { "epoch": 3.3793076348775983, - "grad_norm": 0.7298714518547058, - "learning_rate": 4.231356783919597e-05, - "loss": 0.1767, + "grad_norm": 0.6413397789001465, + "learning_rate": 0.00017147969924812028, + "loss": 0.2687, "step": 86000 }, { "epoch": 3.3793076348775983, - "eval_loss": 0.4097689688205719, - "eval_runtime": 145.5087, - "eval_samples_per_second": 38.871, - "eval_steps_per_second": 4.859, - "eval_wer": 0.31719921041228677, + "eval_loss": 0.46830272674560547, + "eval_runtime": 144.9613, + "eval_samples_per_second": 39.017, + "eval_steps_per_second": 4.877, + "eval_wer": 0.37160372967854793, "step": 86000 }, { "epoch": 3.3989547722896774, - "grad_norm": 1.8685766458511353, - "learning_rate": 4.080603015075376e-05, - "loss": 0.1738, + "grad_norm": 1.0018800497055054, + "learning_rate": 0.00017072781954887217, + "loss": 0.2595, "step": 86500 }, { "epoch": 3.4186019097017564, - "grad_norm": 0.5031083822250366, - "learning_rate": 3.929849246231156e-05, - "loss": 0.1804, + "grad_norm": 1.0536398887634277, + "learning_rate": 0.00016997593984962404, + "loss": 0.2757, "step": 87000 }, { "epoch": 3.4186019097017564, - "eval_loss": 0.40913575887680054, - "eval_runtime": 145.0065, - "eval_samples_per_second": 39.005, - "eval_steps_per_second": 4.876, - "eval_wer": 0.3209064210171559, + "eval_loss": 0.46901389956474304, + "eval_runtime": 144.4366, + "eval_samples_per_second": 39.159, + "eval_steps_per_second": 4.895, + "eval_wer": 0.37320858275424884, "step": 87000 }, { "epoch": 3.4382490471138354, - "grad_norm": 0.30902644991874695, - "learning_rate": 3.779095477386935e-05, - "loss": 0.178, + "grad_norm": 0.4486633837223053, + "learning_rate": 0.00016922406015037594, + "loss": 0.2655, "step": 87500 }, { "epoch": 3.4578961845259144, - "grad_norm": 0.7432768940925598, - "learning_rate": 3.6283417085427134e-05, - "loss": 0.1699, + "grad_norm": 0.6999643445014954, + "learning_rate": 0.0001684721804511278, + "loss": 0.2596, "step": 88000 }, { "epoch": 3.4578961845259144, - "eval_loss": 0.40437594056129456, - "eval_runtime": 144.7637, - "eval_samples_per_second": 39.071, - "eval_steps_per_second": 4.884, - "eval_wer": 0.3179374428271092, + "eval_loss": 0.47534072399139404, + "eval_runtime": 145.0031, + "eval_samples_per_second": 39.006, + "eval_steps_per_second": 4.876, + "eval_wer": 0.37824782141194974, "step": 88000 }, { "epoch": 3.4775433219379934, - "grad_norm": 0.6833881139755249, - "learning_rate": 3.477587939698492e-05, - "loss": 0.174, + "grad_norm": 0.9858837723731995, + "learning_rate": 0.0001677203007518797, + "loss": 0.2614, "step": 88500 }, { "epoch": 3.497190459350073, - "grad_norm": 0.38148313760757446, - "learning_rate": 3.3268341708542715e-05, - "loss": 0.1645, + "grad_norm": 0.5992431640625, + "learning_rate": 0.00016696842105263157, + "loss": 0.2589, "step": 89000 }, { "epoch": 3.497190459350073, - "eval_loss": 0.40410056710243225, - "eval_runtime": 145.5748, - "eval_samples_per_second": 38.853, - "eval_steps_per_second": 4.857, - "eval_wer": 0.3167498515510905, + "eval_loss": 0.4645041823387146, + "eval_runtime": 146.1245, + "eval_samples_per_second": 38.707, + "eval_steps_per_second": 4.838, + "eval_wer": 0.3691483044727255, "step": 89000 }, { "epoch": 3.516837596762152, - "grad_norm": 0.7338330149650574, - "learning_rate": 3.1763819095477385e-05, - "loss": 0.1765, + "grad_norm": 2.5524730682373047, + "learning_rate": 0.00016621804511278193, + "loss": 0.2724, "step": 89500 }, { "epoch": 3.536484734174231, - "grad_norm": 0.8941044211387634, - "learning_rate": 3.0256281407035173e-05, - "loss": 0.1707, + "grad_norm": 0.42577388882637024, + "learning_rate": 0.00016546616541353383, + "loss": 0.2627, "step": 90000 }, { "epoch": 3.536484734174231, - "eval_loss": 0.40083417296409607, - "eval_runtime": 145.469, - "eval_samples_per_second": 38.881, - "eval_steps_per_second": 4.86, - "eval_wer": 0.3202484312561185, + "eval_loss": 0.4689880907535553, + "eval_runtime": 146.114, + "eval_samples_per_second": 38.71, + "eval_steps_per_second": 4.839, + "eval_wer": 0.3675274028662676, "step": 90000 }, { "epoch": 3.55613187158631, - "grad_norm": 0.5399070382118225, - "learning_rate": 2.8748743718592963e-05, - "loss": 0.1767, + "grad_norm": 0.530576765537262, + "learning_rate": 0.0001647142857142857, + "loss": 0.2692, "step": 90500 }, { "epoch": 3.575779008998389, - "grad_norm": 0.8689693212509155, - "learning_rate": 2.724120603015075e-05, - "loss": 0.1838, + "grad_norm": 1.5638034343719482, + "learning_rate": 0.0001639624060150376, + "loss": 0.2804, "step": 91000 }, { "epoch": 3.575779008998389, - "eval_loss": 0.3981110751628876, - "eval_runtime": 144.9174, - "eval_samples_per_second": 39.029, - "eval_steps_per_second": 4.879, - "eval_wer": 0.3165412206512494, + "eval_loss": 0.46749356389045715, + "eval_runtime": 146.7363, + "eval_samples_per_second": 38.545, + "eval_steps_per_second": 4.818, + "eval_wer": 0.37420359166118344, "step": 91000 }, { "epoch": 3.595426146410468, - "grad_norm": 0.4749615788459778, - "learning_rate": 2.573366834170854e-05, - "loss": 0.1697, + "grad_norm": 0.7981226444244385, + "learning_rate": 0.00016321052631578946, + "loss": 0.2658, "step": 91500 }, { "epoch": 3.615073283822547, - "grad_norm": 0.7744316458702087, - "learning_rate": 2.4226130653266328e-05, - "loss": 0.1653, + "grad_norm": 0.4092627167701721, + "learning_rate": 0.00016245864661654135, + "loss": 0.2587, "step": 92000 }, { "epoch": 3.615073283822547, - "eval_loss": 0.3987283408641815, - "eval_runtime": 145.3191, - "eval_samples_per_second": 38.921, - "eval_steps_per_second": 4.865, - "eval_wer": 0.3132352233153055, + "eval_loss": 0.46739462018013, + "eval_runtime": 145.0739, + "eval_samples_per_second": 38.987, + "eval_steps_per_second": 4.873, + "eval_wer": 0.3593747492417069, "step": 92000 }, { "epoch": 3.634720421234626, - "grad_norm": 0.5036156177520752, - "learning_rate": 2.2718592964824118e-05, - "loss": 0.1708, + "grad_norm": 0.4350492060184479, + "learning_rate": 0.0001617067669172932, + "loss": 0.2664, "step": 92500 }, { "epoch": 3.654367558646705, - "grad_norm": 0.6102810502052307, - "learning_rate": 2.121105527638191e-05, - "loss": 0.1679, + "grad_norm": 0.7081959247589111, + "learning_rate": 0.00016095488721804512, + "loss": 0.2615, "step": 93000 }, { "epoch": 3.654367558646705, - "eval_loss": 0.39818885922431946, - "eval_runtime": 145.5258, - "eval_samples_per_second": 38.866, - "eval_steps_per_second": 4.858, - "eval_wer": 0.3110044775400812, + "eval_loss": 0.46574193239212036, + "eval_runtime": 145.5779, + "eval_samples_per_second": 38.852, + "eval_steps_per_second": 4.857, + "eval_wer": 0.36321034809263214, "step": 93000 }, { "epoch": 3.6740146960587845, - "grad_norm": 0.39582985639572144, - "learning_rate": 1.9703517587939696e-05, - "loss": 0.1726, + "grad_norm": 1.1376652717590332, + "learning_rate": 0.00016020300751879696, + "loss": 0.2664, "step": 93500 }, { "epoch": 3.6936618334708635, - "grad_norm": 0.4703328609466553, - "learning_rate": 1.8195979899497486e-05, - "loss": 0.1631, + "grad_norm": 0.8354430794715881, + "learning_rate": 0.00015945263157894738, + "loss": 0.2531, "step": 94000 }, { "epoch": 3.6936618334708635, - "eval_loss": 0.39044293761253357, - "eval_runtime": 145.0128, - "eval_samples_per_second": 39.003, - "eval_steps_per_second": 4.875, - "eval_wer": 0.3073775095889971, + "eval_loss": 0.45889467000961304, + "eval_runtime": 145.207, + "eval_samples_per_second": 38.951, + "eval_steps_per_second": 4.869, + "eval_wer": 0.3668373160437162, "step": 94000 }, { "epoch": 3.7133089708829425, - "grad_norm": 0.5020835399627686, - "learning_rate": 1.669145728643216e-05, - "loss": 0.1687, + "grad_norm": 0.7989226579666138, + "learning_rate": 0.00015870075187969922, + "loss": 0.2621, "step": 94500 }, { "epoch": 3.7329561082950216, - "grad_norm": 1.21161687374115, - "learning_rate": 1.5186934673366832e-05, - "loss": 0.1561, + "grad_norm": 1.2648522853851318, + "learning_rate": 0.00015794887218045114, + "loss": 0.2466, "step": 95000 }, { "epoch": 3.7329561082950216, - "eval_loss": 0.39342302083969116, - "eval_runtime": 148.1879, - "eval_samples_per_second": 38.168, - "eval_steps_per_second": 4.771, - "eval_wer": 0.30911075091075413, + "eval_loss": 0.46178776025772095, + "eval_runtime": 145.1044, + "eval_samples_per_second": 38.979, + "eval_steps_per_second": 4.872, + "eval_wer": 0.3691001588804545, "step": 95000 }, { "epoch": 3.7526032457071006, - "grad_norm": 0.49361735582351685, - "learning_rate": 1.3679396984924621e-05, - "loss": 0.1757, + "grad_norm": 0.6409050226211548, + "learning_rate": 0.00015719699248120298, + "loss": 0.2732, "step": 95500 }, { "epoch": 3.7722503831191796, - "grad_norm": 0.9406358003616333, - "learning_rate": 1.217185929648241e-05, - "loss": 0.1699, + "grad_norm": 0.8056377172470093, + "learning_rate": 0.00015644511278195487, + "loss": 0.2653, "step": 96000 }, { "epoch": 3.7722503831191796, - "eval_loss": 0.3916533589363098, - "eval_runtime": 147.0052, - "eval_samples_per_second": 38.475, - "eval_steps_per_second": 4.809, - "eval_wer": 0.3068158110125018, + "eval_loss": 0.46144935488700867, + "eval_runtime": 145.0964, + "eval_samples_per_second": 38.981, + "eval_steps_per_second": 4.873, + "eval_wer": 0.3774774919356133, "step": 96000 }, { "epoch": 3.7918975205312586, - "grad_norm": 0.6055164337158203, - "learning_rate": 1.0667336683417085e-05, - "loss": 0.1668, + "grad_norm": 0.6420221924781799, + "learning_rate": 0.00015569473684210524, + "loss": 0.267, "step": 96500 }, { "epoch": 3.8115446579433376, - "grad_norm": 0.946466863155365, - "learning_rate": 9.159798994974874e-06, - "loss": 0.1591, + "grad_norm": 1.7600951194763184, + "learning_rate": 0.0001549428571428571, + "loss": 0.2542, "step": 97000 }, { "epoch": 3.8115446579433376, - "eval_loss": 0.3917677402496338, - "eval_runtime": 142.9501, - "eval_samples_per_second": 39.566, - "eval_steps_per_second": 4.946, - "eval_wer": 0.30567636532875414, + "eval_loss": 0.4600285291671753, + "eval_runtime": 145.9311, + "eval_samples_per_second": 38.758, + "eval_steps_per_second": 4.845, + "eval_wer": 0.3726308356469965, "step": 97000 }, { "epoch": 3.8311917953554167, - "grad_norm": 9.30636215209961, - "learning_rate": 7.652261306532663e-06, - "loss": 0.1649, + "grad_norm": 7.725172519683838, + "learning_rate": 0.000154190977443609, + "loss": 0.2648, "step": 97500 }, { "epoch": 3.8508389327674957, - "grad_norm": 1.0511385202407837, - "learning_rate": 6.144723618090452e-06, - "loss": 0.1609, + "grad_norm": 0.9012848734855652, + "learning_rate": 0.00015343909774436087, + "loss": 0.2616, "step": 98000 }, { "epoch": 3.8508389327674957, - "eval_loss": 0.39084428548812866, - "eval_runtime": 143.2016, - "eval_samples_per_second": 39.497, - "eval_steps_per_second": 4.937, - "eval_wer": 0.3049541814446887, + "eval_loss": 0.4511352777481079, + "eval_runtime": 146.3391, + "eval_samples_per_second": 38.65, + "eval_steps_per_second": 4.831, + "eval_wer": 0.3660348895058657, "step": 98000 }, { "epoch": 3.8704860701795747, - "grad_norm": 0.48396241664886475, - "learning_rate": 4.637185929648241e-06, - "loss": 0.1486, + "grad_norm": 0.4818692207336426, + "learning_rate": 0.00015268721804511276, + "loss": 0.2429, "step": 98500 }, { "epoch": 3.8901332075916537, - "grad_norm": 0.4520857632160187, - "learning_rate": 3.12964824120603e-06, - "loss": 0.1675, + "grad_norm": 1.495732307434082, + "learning_rate": 0.00015193533834586463, + "loss": 0.2625, "step": 99000 }, { "epoch": 3.8901332075916537, - "eval_loss": 0.3901500105857849, - "eval_runtime": 143.3337, - "eval_samples_per_second": 39.46, - "eval_steps_per_second": 4.933, - "eval_wer": 0.3058849962285953, + "eval_loss": 0.4607318639755249, + "eval_runtime": 145.4992, + "eval_samples_per_second": 38.873, + "eval_steps_per_second": 4.859, + "eval_wer": 0.36436584230713676, "step": 99000 }, { "epoch": 3.9097803450037327, - "grad_norm": 0.444196879863739, - "learning_rate": 1.622110552763819e-06, - "loss": 0.1709, + "grad_norm": 0.5369844436645508, + "learning_rate": 0.00015118345864661653, + "loss": 0.2693, "step": 99500 }, { "epoch": 3.9294274824158117, - "grad_norm": 0.38674989342689514, - "learning_rate": 1.1457286432160803e-07, - "loss": 0.1666, + "grad_norm": 2.374652862548828, + "learning_rate": 0.0001504315789473684, + "loss": 0.2627, "step": 100000 }, { "epoch": 3.9294274824158117, - "eval_loss": 0.3890285789966583, - "eval_runtime": 143.9516, - "eval_samples_per_second": 39.291, - "eval_steps_per_second": 4.911, - "eval_wer": 0.3055961226749691, + "eval_loss": 0.4455793499946594, + "eval_runtime": 146.0128, + "eval_samples_per_second": 38.736, + "eval_steps_per_second": 4.842, + "eval_wer": 0.36256840686235176, "step": 100000 }, { - "epoch": 3.9294274824158117, - "step": 100000, - "total_flos": 1.24400740487767e+20, - "train_loss": 0.35405309791564943, - "train_runtime": 51737.6152, - "train_samples_per_second": 15.463, - "train_steps_per_second": 1.933 + "epoch": 3.949074619827891, + "grad_norm": 1.0198256969451904, + "learning_rate": 0.0001496812030075188, + "loss": 0.2569, + "step": 100500 + }, + { + "epoch": 3.9687217572399702, + "grad_norm": 0.7093910574913025, + "learning_rate": 0.00014892932330827068, + "loss": 0.252, + "step": 101000 + }, + { + "epoch": 3.9687217572399702, + "eval_loss": 0.4579247534275055, + "eval_runtime": 145.1833, + "eval_samples_per_second": 38.958, + "eval_steps_per_second": 4.87, + "eval_wer": 0.36651634542857603, + "step": 101000 + }, + { + "epoch": 3.9883688946520492, + "grad_norm": 0.7897918820381165, + "learning_rate": 0.00014817744360902255, + "loss": 0.2528, + "step": 101500 + }, + { + "epoch": 4.008016032064128, + "grad_norm": 0.773681640625, + "learning_rate": 0.00014742556390977442, + "loss": 0.2489, + "step": 102000 + }, + { + "epoch": 4.008016032064128, + "eval_loss": 0.45104366540908813, + "eval_runtime": 145.9991, + "eval_samples_per_second": 38.74, + "eval_steps_per_second": 4.842, + "eval_wer": 0.36203880534737043, + "step": 102000 + }, + { + "epoch": 4.027663169476208, + "grad_norm": 0.6559247970581055, + "learning_rate": 0.0001466736842105263, + "loss": 0.222, + "step": 102500 + }, + { + "epoch": 4.047310306888287, + "grad_norm": 1.860120415687561, + "learning_rate": 0.00014592481203007517, + "loss": 0.2218, + "step": 103000 + }, + { + "epoch": 4.047310306888287, + "eval_loss": 0.4418700039386749, + "eval_runtime": 149.8176, + "eval_samples_per_second": 37.753, + "eval_steps_per_second": 4.719, + "eval_wer": 0.35350098698464155, + "step": 103000 + }, + { + "epoch": 4.066957444300366, + "grad_norm": 0.8769797682762146, + "learning_rate": 0.00014517293233082707, + "loss": 0.2218, + "step": 103500 + }, + { + "epoch": 4.086604581712445, + "grad_norm": 1.1328709125518799, + "learning_rate": 0.00014442105263157894, + "loss": 0.2211, + "step": 104000 + }, + { + "epoch": 4.086604581712445, + "eval_loss": 0.449856162071228, + "eval_runtime": 144.4159, + "eval_samples_per_second": 39.165, + "eval_steps_per_second": 4.896, + "eval_wer": 0.3771404727897161, + "step": 104000 + }, + { + "epoch": 4.106251719124524, + "grad_norm": 0.8746039271354675, + "learning_rate": 0.0001436706766917293, + "loss": 0.2188, + "step": 104500 + }, + { + "epoch": 4.125898856536603, + "grad_norm": 0.55832839012146, + "learning_rate": 0.0001429203007518797, + "loss": 0.2186, + "step": 105000 + }, + { + "epoch": 4.125898856536603, + "eval_loss": 0.4546278417110443, + "eval_runtime": 144.5427, + "eval_samples_per_second": 39.13, + "eval_steps_per_second": 4.891, + "eval_wer": 0.36601884097510873, + "step": 105000 + }, + { + "epoch": 4.145545993948682, + "grad_norm": 0.7782666087150574, + "learning_rate": 0.00014216842105263156, + "loss": 0.2184, + "step": 105500 + }, + { + "epoch": 4.165193131360761, + "grad_norm": 0.768484890460968, + "learning_rate": 0.00014141654135338346, + "loss": 0.2199, + "step": 106000 + }, + { + "epoch": 4.165193131360761, + "eval_loss": 0.4395730495452881, + "eval_runtime": 144.834, + "eval_samples_per_second": 39.052, + "eval_steps_per_second": 4.881, + "eval_wer": 0.35423921939946396, + "step": 106000 + }, + { + "epoch": 4.18484026877284, + "grad_norm": 0.5472589135169983, + "learning_rate": 0.00014066466165413532, + "loss": 0.2206, + "step": 106500 + }, + { + "epoch": 4.204487406184919, + "grad_norm": 0.4021967947483063, + "learning_rate": 0.0001399127819548872, + "loss": 0.2227, + "step": 107000 + }, + { + "epoch": 4.204487406184919, + "eval_loss": 0.4468631446361542, + "eval_runtime": 144.5271, + "eval_samples_per_second": 39.135, + "eval_steps_per_second": 4.892, + "eval_wer": 0.35748102261237985, + "step": 107000 + }, + { + "epoch": 4.224134543596998, + "grad_norm": 1.0301532745361328, + "learning_rate": 0.0001391609022556391, + "loss": 0.2292, + "step": 107500 + }, + { + "epoch": 4.243781681009077, + "grad_norm": 0.6561172008514404, + "learning_rate": 0.00013840902255639095, + "loss": 0.2212, + "step": 108000 + }, + { + "epoch": 4.243781681009077, + "eval_loss": 0.44032466411590576, + "eval_runtime": 144.5017, + "eval_samples_per_second": 39.141, + "eval_steps_per_second": 4.893, + "eval_wer": 0.3500826499333986, + "step": 108000 + }, + { + "epoch": 4.263428818421156, + "grad_norm": 0.7782973647117615, + "learning_rate": 0.00013765714285714285, + "loss": 0.218, + "step": 108500 + }, + { + "epoch": 4.283075955833235, + "grad_norm": 0.5677826404571533, + "learning_rate": 0.00013690526315789472, + "loss": 0.2182, + "step": 109000 + }, + { + "epoch": 4.283075955833235, + "eval_loss": 0.4507006108760834, + "eval_runtime": 144.3123, + "eval_samples_per_second": 39.193, + "eval_steps_per_second": 4.899, + "eval_wer": 0.3599364478182022, + "step": 109000 + }, + { + "epoch": 4.302723093245314, + "grad_norm": 0.48135581612586975, + "learning_rate": 0.0001361533834586466, + "loss": 0.2191, + "step": 109500 + }, + { + "epoch": 4.322370230657393, + "grad_norm": 0.686140775680542, + "learning_rate": 0.00013540150375939848, + "loss": 0.2212, + "step": 110000 + }, + { + "epoch": 4.322370230657393, + "eval_loss": 0.4435155391693115, + "eval_runtime": 144.5051, + "eval_samples_per_second": 39.14, + "eval_steps_per_second": 4.893, + "eval_wer": 0.3575612652661649, + "step": 110000 + }, + { + "epoch": 4.342017368069472, + "grad_norm": 2.3186769485473633, + "learning_rate": 0.00013464962406015038, + "loss": 0.2213, + "step": 110500 + }, + { + "epoch": 4.361664505481551, + "grad_norm": 2.254951238632202, + "learning_rate": 0.00013389774436090224, + "loss": 0.2211, + "step": 111000 + }, + { + "epoch": 4.361664505481551, + "eval_loss": 0.45138731598854065, + "eval_runtime": 144.5221, + "eval_samples_per_second": 39.136, + "eval_steps_per_second": 4.892, + "eval_wer": 0.36893967357288443, + "step": 111000 + }, + { + "epoch": 4.38131164289363, + "grad_norm": 0.5208560228347778, + "learning_rate": 0.0001331458646616541, + "loss": 0.2042, + "step": 111500 + }, + { + "epoch": 4.400958780305709, + "grad_norm": 0.7651325464248657, + "learning_rate": 0.0001323954887218045, + "loss": 0.2116, + "step": 112000 + }, + { + "epoch": 4.400958780305709, + "eval_loss": 0.44426095485687256, + "eval_runtime": 144.753, + "eval_samples_per_second": 39.073, + "eval_steps_per_second": 4.884, + "eval_wer": 0.35077273675595, + "step": 112000 + }, + { + "epoch": 4.420605917717788, + "grad_norm": 0.7976289987564087, + "learning_rate": 0.00013164360902255637, + "loss": 0.2213, + "step": 112500 + }, + { + "epoch": 4.440253055129867, + "grad_norm": 0.7153854966163635, + "learning_rate": 0.00013089172932330827, + "loss": 0.2218, + "step": 113000 + }, + { + "epoch": 4.440253055129867, + "eval_loss": 0.44099488854408264, + "eval_runtime": 145.5781, + "eval_samples_per_second": 38.852, + "eval_steps_per_second": 4.856, + "eval_wer": 0.3471618173356229, + "step": 113000 + }, + { + "epoch": 4.459900192541947, + "grad_norm": 0.8848706483840942, + "learning_rate": 0.00013014135338345863, + "loss": 0.2213, + "step": 113500 + }, + { + "epoch": 4.479547329954026, + "grad_norm": 0.590100109577179, + "learning_rate": 0.0001293894736842105, + "loss": 0.2152, + "step": 114000 + }, + { + "epoch": 4.479547329954026, + "eval_loss": 0.446841299533844, + "eval_runtime": 146.3889, + "eval_samples_per_second": 38.637, + "eval_steps_per_second": 4.83, + "eval_wer": 0.35348493845388457, + "step": 114000 + }, + { + "epoch": 4.499194467366105, + "grad_norm": 2.4068264961242676, + "learning_rate": 0.0001286375939849624, + "loss": 0.2149, + "step": 114500 + }, + { + "epoch": 4.518841604778184, + "grad_norm": 0.6972984671592712, + "learning_rate": 0.00012788571428571426, + "loss": 0.2174, + "step": 115000 + }, + { + "epoch": 4.518841604778184, + "eval_loss": 0.4498594105243683, + "eval_runtime": 145.1426, + "eval_samples_per_second": 38.969, + "eval_steps_per_second": 4.871, + "eval_wer": 0.3469692349665388, + "step": 115000 + }, + { + "epoch": 4.538488742190263, + "grad_norm": 0.6739790439605713, + "learning_rate": 0.00012713383458646616, + "loss": 0.2148, + "step": 115500 + }, + { + "epoch": 4.558135879602342, + "grad_norm": 4.946841716766357, + "learning_rate": 0.00012638195488721802, + "loss": 0.212, + "step": 116000 + }, + { + "epoch": 4.558135879602342, + "eval_loss": 0.4453933835029602, + "eval_runtime": 145.1072, + "eval_samples_per_second": 38.978, + "eval_steps_per_second": 4.872, + "eval_wer": 0.34401630530724914, + "step": 116000 + }, + { + "epoch": 4.577783017014421, + "grad_norm": 0.5079777240753174, + "learning_rate": 0.00012563007518796992, + "loss": 0.2097, + "step": 116500 + }, + { + "epoch": 4.5974301544265, + "grad_norm": 1.189431071281433, + "learning_rate": 0.0001248781954887218, + "loss": 0.2039, + "step": 117000 + }, + { + "epoch": 4.5974301544265, + "eval_loss": 0.4423506259918213, + "eval_runtime": 144.2129, + "eval_samples_per_second": 39.22, + "eval_steps_per_second": 4.902, + "eval_wer": 0.34892715571889393, + "step": 117000 + }, + { + "epoch": 4.617077291838579, + "grad_norm": 0.5739697813987732, + "learning_rate": 0.00012412781954887218, + "loss": 0.2137, + "step": 117500 + }, + { + "epoch": 4.636724429250658, + "grad_norm": 1.8628792762756348, + "learning_rate": 0.00012337593984962405, + "loss": 0.2073, + "step": 118000 + }, + { + "epoch": 4.636724429250658, + "eval_loss": 0.44371461868286133, + "eval_runtime": 144.8897, + "eval_samples_per_second": 39.037, + "eval_steps_per_second": 4.88, + "eval_wer": 0.3466161672898846, + "step": 118000 + }, + { + "epoch": 4.656371566662737, + "grad_norm": 0.6919093728065491, + "learning_rate": 0.00012262406015037594, + "loss": 0.2111, + "step": 118500 + }, + { + "epoch": 4.676018704074816, + "grad_norm": 0.6628223061561584, + "learning_rate": 0.00012187218045112781, + "loss": 0.2177, + "step": 119000 + }, + { + "epoch": 4.676018704074816, + "eval_loss": 0.43920648097991943, + "eval_runtime": 144.6466, + "eval_samples_per_second": 39.102, + "eval_steps_per_second": 4.888, + "eval_wer": 0.34218677280095006, + "step": 119000 + }, + { + "epoch": 4.695665841486895, + "grad_norm": 0.7294492721557617, + "learning_rate": 0.00012112030075187969, + "loss": 0.2154, + "step": 119500 + }, + { + "epoch": 4.715312978898974, + "grad_norm": 1.2088764905929565, + "learning_rate": 0.00012036842105263157, + "loss": 0.2121, + "step": 120000 + }, + { + "epoch": 4.715312978898974, + "eval_loss": 0.44427990913391113, + "eval_runtime": 144.8984, + "eval_samples_per_second": 39.034, + "eval_steps_per_second": 4.879, + "eval_wer": 0.34372743175362297, + "step": 120000 + }, + { + "epoch": 4.734960116311053, + "grad_norm": 0.3588174283504486, + "learning_rate": 0.00011961654135338345, + "loss": 0.2103, + "step": 120500 + }, + { + "epoch": 4.754607253723132, + "grad_norm": 0.5091924667358398, + "learning_rate": 0.00011886466165413532, + "loss": 0.2072, + "step": 121000 + }, + { + "epoch": 4.754607253723132, + "eval_loss": 0.42684319615364075, + "eval_runtime": 143.2476, + "eval_samples_per_second": 39.484, + "eval_steps_per_second": 4.936, + "eval_wer": 0.34615075989793137, + "step": 121000 + }, + { + "epoch": 4.774254391135211, + "grad_norm": 0.49059540033340454, + "learning_rate": 0.0001181127819548872, + "loss": 0.204, + "step": 121500 + }, + { + "epoch": 4.79390152854729, + "grad_norm": 0.3562159836292267, + "learning_rate": 0.00011736090225563909, + "loss": 0.2138, + "step": 122000 + }, + { + "epoch": 4.79390152854729, + "eval_loss": 0.4271770417690277, + "eval_runtime": 142.8263, + "eval_samples_per_second": 39.601, + "eval_steps_per_second": 4.95, + "eval_wer": 0.34318178170788466, + "step": 122000 + }, + { + "epoch": 4.813548665959369, + "grad_norm": 1.027219295501709, + "learning_rate": 0.00011660902255639097, + "loss": 0.1947, + "step": 122500 + }, + { + "epoch": 4.833195803371449, + "grad_norm": 0.5677986145019531, + "learning_rate": 0.00011585714285714285, + "loss": 0.2145, + "step": 123000 + }, + { + "epoch": 4.833195803371449, + "eval_loss": 0.43315112590789795, + "eval_runtime": 143.3445, + "eval_samples_per_second": 39.457, + "eval_steps_per_second": 4.932, + "eval_wer": 0.3453964789523519, + "step": 123000 + }, + { + "epoch": 4.852842940783528, + "grad_norm": 0.7301272749900818, + "learning_rate": 0.00011510676691729323, + "loss": 0.2019, + "step": 123500 + }, + { + "epoch": 4.872490078195607, + "grad_norm": 16.804716110229492, + "learning_rate": 0.0001143578947368421, + "loss": 0.2217, + "step": 124000 + }, + { + "epoch": 4.872490078195607, + "eval_loss": 0.42095693945884705, + "eval_runtime": 143.7819, + "eval_samples_per_second": 39.337, + "eval_steps_per_second": 4.917, + "eval_wer": 0.3391215034263613, + "step": 124000 + }, + { + "epoch": 4.892137215607686, + "grad_norm": 0.4827280640602112, + "learning_rate": 0.00011360601503759398, + "loss": 0.1994, + "step": 124500 + }, + { + "epoch": 4.911784353019765, + "grad_norm": 0.6648825407028198, + "learning_rate": 0.00011285413533834586, + "loss": 0.2069, + "step": 125000 + }, + { + "epoch": 4.911784353019765, + "eval_loss": 0.427772581577301, + "eval_runtime": 144.7524, + "eval_samples_per_second": 39.074, + "eval_steps_per_second": 4.884, + "eval_wer": 0.3376289900659595, + "step": 125000 + }, + { + "epoch": 4.931431490431844, + "grad_norm": 0.3194764256477356, + "learning_rate": 0.00011210225563909773, + "loss": 0.1946, + "step": 125500 + }, + { + "epoch": 4.951078627843923, + "grad_norm": 0.9185254573822021, + "learning_rate": 0.00011135187969924811, + "loss": 0.2068, + "step": 126000 + }, + { + "epoch": 4.951078627843923, + "eval_loss": 0.4216279685497284, + "eval_runtime": 143.9237, + "eval_samples_per_second": 39.299, + "eval_steps_per_second": 4.912, + "eval_wer": 0.33867214456516503, + "step": 126000 + }, + { + "epoch": 4.970725765256002, + "grad_norm": 0.4608317017555237, + "learning_rate": 0.00011059999999999998, + "loss": 0.2098, + "step": 126500 + }, + { + "epoch": 4.990372902668081, + "grad_norm": 0.7766122221946716, + "learning_rate": 0.00010984812030075186, + "loss": 0.2129, + "step": 127000 + }, + { + "epoch": 4.990372902668081, + "eval_loss": 0.42103302478790283, + "eval_runtime": 144.3261, + "eval_samples_per_second": 39.189, + "eval_steps_per_second": 4.899, + "eval_wer": 0.3361525252363146, + "step": 127000 + }, + { + "epoch": 5.01002004008016, + "grad_norm": 0.7110891342163086, + "learning_rate": 0.00010909624060150374, + "loss": 0.1932, + "step": 127500 + }, + { + "epoch": 5.0296671774922395, + "grad_norm": 0.5839011073112488, + "learning_rate": 0.00010834436090225562, + "loss": 0.1774, + "step": 128000 + }, + { + "epoch": 5.0296671774922395, + "eval_loss": 0.4340197741985321, + "eval_runtime": 144.3949, + "eval_samples_per_second": 39.17, + "eval_steps_per_second": 4.896, + "eval_wer": 0.3303590056330343, + "step": 128000 + }, + { + "epoch": 5.0493143149043185, + "grad_norm": 0.4871758222579956, + "learning_rate": 0.000107593984962406, + "loss": 0.1764, + "step": 128500 + }, + { + "epoch": 5.0689614523163975, + "grad_norm": 1.1092002391815186, + "learning_rate": 0.00010684210526315788, + "loss": 0.1705, + "step": 129000 + }, + { + "epoch": 5.0689614523163975, + "eval_loss": 0.44219356775283813, + "eval_runtime": 144.6388, + "eval_samples_per_second": 39.104, + "eval_steps_per_second": 4.888, + "eval_wer": 0.329877549710324, + "step": 129000 + }, + { + "epoch": 5.0886085897284765, + "grad_norm": 1.4170928001403809, + "learning_rate": 0.00010609022556390976, + "loss": 0.1799, + "step": 129500 + }, + { + "epoch": 5.1082557271405555, + "grad_norm": 0.5609749555587769, + "learning_rate": 0.00010533834586466164, + "loss": 0.1746, + "step": 130000 + }, + { + "epoch": 5.1082557271405555, + "eval_loss": 0.43062400817871094, + "eval_runtime": 144.8052, + "eval_samples_per_second": 39.059, + "eval_steps_per_second": 4.882, + "eval_wer": 0.3363451076053987, + "step": 130000 + }, + { + "epoch": 5.1279028645526346, + "grad_norm": 0.7241942882537842, + "learning_rate": 0.00010458646616541353, + "loss": 0.1719, + "step": 130500 + }, + { + "epoch": 5.147550001964714, + "grad_norm": 7.793860912322998, + "learning_rate": 0.00010383458646616541, + "loss": 0.1813, + "step": 131000 + }, + { + "epoch": 5.147550001964714, + "eval_loss": 0.41806095838546753, + "eval_runtime": 144.8895, + "eval_samples_per_second": 39.037, + "eval_steps_per_second": 4.88, + "eval_wer": 0.33342427500762306, + "step": 131000 + }, + { + "epoch": 5.167197139376793, + "grad_norm": 0.5914771556854248, + "learning_rate": 0.00010308270676691729, + "loss": 0.1799, + "step": 131500 + }, + { + "epoch": 5.186844276788872, + "grad_norm": 1.6738320589065552, + "learning_rate": 0.00010233082706766916, + "loss": 0.1729, + "step": 132000 + }, + { + "epoch": 5.186844276788872, + "eval_loss": 0.4319230020046234, + "eval_runtime": 144.7317, + "eval_samples_per_second": 39.079, + "eval_steps_per_second": 4.885, + "eval_wer": 0.336858660589623, + "step": 132000 + }, + { + "epoch": 5.206491414200951, + "grad_norm": 0.6387330889701843, + "learning_rate": 0.00010157894736842104, + "loss": 0.1682, + "step": 132500 + }, + { + "epoch": 5.22613855161303, + "grad_norm": 0.5514143705368042, + "learning_rate": 0.00010082706766917292, + "loss": 0.1777, + "step": 133000 + }, + { + "epoch": 5.22613855161303, + "eval_loss": 0.4189823567867279, + "eval_runtime": 145.1159, + "eval_samples_per_second": 38.976, + "eval_steps_per_second": 4.872, + "eval_wer": 0.33265394553128663, + "step": 133000 + }, + { + "epoch": 5.245785689025109, + "grad_norm": 0.49433717131614685, + "learning_rate": 0.00010007669172932331, + "loss": 0.1757, + "step": 133500 + }, + { + "epoch": 5.265432826437188, + "grad_norm": 14.663381576538086, + "learning_rate": 9.932481203007518e-05, + "loss": 0.18, + "step": 134000 + }, + { + "epoch": 5.265432826437188, + "eval_loss": 0.42281797528266907, + "eval_runtime": 145.0781, + "eval_samples_per_second": 38.986, + "eval_steps_per_second": 4.873, + "eval_wer": 0.33376129415352024, + "step": 134000 + }, + { + "epoch": 5.285079963849267, + "grad_norm": 0.3960479497909546, + "learning_rate": 9.857293233082706e-05, + "loss": 0.1773, + "step": 134500 + }, + { + "epoch": 5.304727101261347, + "grad_norm": 0.48836782574653625, + "learning_rate": 9.782105263157894e-05, + "loss": 0.1747, + "step": 135000 + }, + { + "epoch": 5.304727101261347, + "eval_loss": 0.4267714023590088, + "eval_runtime": 144.4858, + "eval_samples_per_second": 39.146, + "eval_steps_per_second": 4.893, + "eval_wer": 0.3322687807931184, + "step": 135000 + }, + { + "epoch": 5.324374238673426, + "grad_norm": 0.7414509654045105, + "learning_rate": 9.706917293233082e-05, + "loss": 0.1804, + "step": 135500 + }, + { + "epoch": 5.344021376085505, + "grad_norm": 0.3100612461566925, + "learning_rate": 9.63172932330827e-05, + "loss": 0.1737, + "step": 136000 + }, + { + "epoch": 5.344021376085505, + "eval_loss": 0.41930150985717773, + "eval_runtime": 145.0977, + "eval_samples_per_second": 38.981, + "eval_steps_per_second": 4.873, + "eval_wer": 0.3324774116929595, + "step": 136000 + }, + { + "epoch": 5.363668513497584, + "grad_norm": 2.2844786643981934, + "learning_rate": 9.556541353383459e-05, + "loss": 0.1779, + "step": 136500 + }, + { + "epoch": 5.383315650909663, + "grad_norm": 0.7908081412315369, + "learning_rate": 9.481503759398495e-05, + "loss": 0.1709, + "step": 137000 + }, + { + "epoch": 5.383315650909663, + "eval_loss": 0.4228932559490204, + "eval_runtime": 145.0454, + "eval_samples_per_second": 38.995, + "eval_steps_per_second": 4.874, + "eval_wer": 0.3278714833656979, + "step": 137000 + }, + { + "epoch": 5.402962788321742, + "grad_norm": 1.6749204397201538, + "learning_rate": 9.406315789473683e-05, + "loss": 0.1745, + "step": 137500 + }, + { + "epoch": 5.422609925733821, + "grad_norm": 0.25723955035209656, + "learning_rate": 9.331127819548871e-05, + "loss": 0.1726, + "step": 138000 + }, + { + "epoch": 5.422609925733821, + "eval_loss": 0.4178549647331238, + "eval_runtime": 145.1876, + "eval_samples_per_second": 38.957, + "eval_steps_per_second": 4.87, + "eval_wer": 0.32714929948163246, + "step": 138000 + }, + { + "epoch": 5.4422570631459, + "grad_norm": 0.43192166090011597, + "learning_rate": 9.255939849624058e-05, + "loss": 0.1699, + "step": 138500 + }, + { + "epoch": 5.461904200557979, + "grad_norm": 0.4252433776855469, + "learning_rate": 9.180751879699246e-05, + "loss": 0.1741, + "step": 139000 + }, + { + "epoch": 5.461904200557979, + "eval_loss": 0.42049652338027954, + "eval_runtime": 145.3425, + "eval_samples_per_second": 38.915, + "eval_steps_per_second": 4.864, + "eval_wer": 0.3254963008136605, + "step": 139000 + }, + { + "epoch": 5.481551337970058, + "grad_norm": 0.6398211717605591, + "learning_rate": 9.105563909774435e-05, + "loss": 0.1675, + "step": 139500 + }, + { + "epoch": 5.501198475382137, + "grad_norm": 2.678009510040283, + "learning_rate": 9.030375939849623e-05, + "loss": 0.1723, + "step": 140000 + }, + { + "epoch": 5.501198475382137, + "eval_loss": 0.4140247702598572, + "eval_runtime": 145.8316, + "eval_samples_per_second": 38.784, + "eval_steps_per_second": 4.848, + "eval_wer": 0.32944423937988476, + "step": 140000 + }, + { + "epoch": 5.520845612794216, + "grad_norm": 0.42189884185791016, + "learning_rate": 8.955187969924811e-05, + "loss": 0.167, + "step": 140500 + }, + { + "epoch": 5.540492750206295, + "grad_norm": 0.6850213408470154, + "learning_rate": 8.8803007518797e-05, + "loss": 0.1676, + "step": 141000 + }, + { + "epoch": 5.540492750206295, + "eval_loss": 0.42560333013534546, + "eval_runtime": 145.0938, + "eval_samples_per_second": 38.982, + "eval_steps_per_second": 4.873, + "eval_wer": 0.32540000962911847, + "step": 141000 + }, + { + "epoch": 5.560139887618374, + "grad_norm": 0.46668741106987, + "learning_rate": 8.805112781954888e-05, + "loss": 0.1674, + "step": 141500 + }, + { + "epoch": 5.579787025030453, + "grad_norm": 0.38750043511390686, + "learning_rate": 8.729924812030075e-05, + "loss": 0.1769, + "step": 142000 + }, + { + "epoch": 5.579787025030453, + "eval_loss": 0.41800424456596375, + "eval_runtime": 144.6116, + "eval_samples_per_second": 39.112, + "eval_steps_per_second": 4.889, + "eval_wer": 0.3279196289579689, + "step": 142000 + }, + { + "epoch": 5.599434162442532, + "grad_norm": 0.47452759742736816, + "learning_rate": 8.654887218045112e-05, + "loss": 0.1704, + "step": 142500 + }, + { + "epoch": 5.619081299854611, + "grad_norm": 1.3760634660720825, + "learning_rate": 8.579699248120299e-05, + "loss": 0.1718, + "step": 143000 + }, + { + "epoch": 5.619081299854611, + "eval_loss": 0.4158097207546234, + "eval_runtime": 144.8323, + "eval_samples_per_second": 39.052, + "eval_steps_per_second": 4.882, + "eval_wer": 0.3203928680329316, + "step": 143000 + }, + { + "epoch": 5.63872843726669, + "grad_norm": 1.2168941497802734, + "learning_rate": 8.504511278195487e-05, + "loss": 0.1763, + "step": 143500 + }, + { + "epoch": 5.658375574678769, + "grad_norm": 0.6660623550415039, + "learning_rate": 8.429323308270675e-05, + "loss": 0.1735, + "step": 144000 + }, + { + "epoch": 5.658375574678769, + "eval_loss": 0.41737955808639526, + "eval_runtime": 145.161, + "eval_samples_per_second": 38.964, + "eval_steps_per_second": 4.87, + "eval_wer": 0.3209385180786699, + "step": 144000 + }, + { + "epoch": 5.678022712090849, + "grad_norm": 0.7844908237457275, + "learning_rate": 8.354135338345864e-05, + "loss": 0.1696, + "step": 144500 + }, + { + "epoch": 5.697669849502928, + "grad_norm": 1.7285536527633667, + "learning_rate": 8.278947368421052e-05, + "loss": 0.1693, + "step": 145000 + }, + { + "epoch": 5.697669849502928, + "eval_loss": 0.416604220867157, + "eval_runtime": 143.979, + "eval_samples_per_second": 39.284, + "eval_steps_per_second": 4.91, + "eval_wer": 0.3197669753334082, + "step": 145000 + }, + { + "epoch": 5.717316986915007, + "grad_norm": 0.3506734073162079, + "learning_rate": 8.20375939849624e-05, + "loss": 0.1811, + "step": 145500 + }, + { + "epoch": 5.736964124327086, + "grad_norm": 0.9915302395820618, + "learning_rate": 8.128571428571428e-05, + "loss": 0.1745, + "step": 146000 + }, + { + "epoch": 5.736964124327086, + "eval_loss": 0.41646912693977356, + "eval_runtime": 143.6976, + "eval_samples_per_second": 39.36, + "eval_steps_per_second": 4.92, + "eval_wer": 0.32445314631445493, + "step": 146000 + }, + { + "epoch": 5.756611261739165, + "grad_norm": 0.4368499219417572, + "learning_rate": 8.053383458646616e-05, + "loss": 0.1757, + "step": 146500 + }, + { + "epoch": 5.776258399151244, + "grad_norm": 0.8709374070167542, + "learning_rate": 7.978195488721803e-05, + "loss": 0.1692, + "step": 147000 + }, + { + "epoch": 5.776258399151244, + "eval_loss": 0.4147648215293884, + "eval_runtime": 144.5484, + "eval_samples_per_second": 39.129, + "eval_steps_per_second": 4.891, + "eval_wer": 0.3230408756078381, + "step": 147000 + }, + { + "epoch": 5.795905536563323, + "grad_norm": 16.672887802124023, + "learning_rate": 7.903007518796991e-05, + "loss": 0.1633, + "step": 147500 + }, + { + "epoch": 5.815552673975402, + "grad_norm": 0.7690948247909546, + "learning_rate": 7.82781954887218e-05, + "loss": 0.1641, + "step": 148000 + }, + { + "epoch": 5.815552673975402, + "eval_loss": 0.4115670621395111, + "eval_runtime": 145.0143, + "eval_samples_per_second": 39.003, + "eval_steps_per_second": 4.875, + "eval_wer": 0.3216446534319783, + "step": 148000 + }, + { + "epoch": 5.835199811387481, + "grad_norm": 1.9833319187164307, + "learning_rate": 7.752781954887217e-05, + "loss": 0.1646, + "step": 148500 + }, + { + "epoch": 5.85484694879956, + "grad_norm": 0.38222184777259827, + "learning_rate": 7.677593984962405e-05, + "loss": 0.173, + "step": 149000 + }, + { + "epoch": 5.85484694879956, + "eval_loss": 0.40414321422576904, + "eval_runtime": 148.2393, + "eval_samples_per_second": 38.155, + "eval_steps_per_second": 4.769, + "eval_wer": 0.32366676830736146, + "step": 149000 + }, + { + "epoch": 5.874494086211639, + "grad_norm": 2.3978090286254883, + "learning_rate": 7.602556390977442e-05, + "loss": 0.1669, + "step": 149500 + }, + { + "epoch": 5.894141223623718, + "grad_norm": 0.7286165952682495, + "learning_rate": 7.52736842105263e-05, + "loss": 0.1664, + "step": 150000 + }, + { + "epoch": 5.894141223623718, + "eval_loss": 0.4038516581058502, + "eval_runtime": 145.7264, + "eval_samples_per_second": 38.812, + "eval_steps_per_second": 4.852, + "eval_wer": 0.3184349472805765, + "step": 150000 + }, + { + "epoch": 5.913788361035797, + "grad_norm": 0.6666128635406494, + "learning_rate": 7.45218045112782e-05, + "loss": 0.1631, + "step": 150500 + }, + { + "epoch": 5.933435498447876, + "grad_norm": 3.139840841293335, + "learning_rate": 7.376992481203008e-05, + "loss": 0.1648, + "step": 151000 + }, + { + "epoch": 5.933435498447876, + "eval_loss": 0.4072332978248596, + "eval_runtime": 144.1568, + "eval_samples_per_second": 39.235, + "eval_steps_per_second": 4.904, + "eval_wer": 0.31657331771276337, + "step": 151000 + }, + { + "epoch": 5.953082635859955, + "grad_norm": 0.2758707106113434, + "learning_rate": 7.301804511278196e-05, + "loss": 0.1616, + "step": 151500 + }, + { + "epoch": 5.972729773272034, + "grad_norm": 0.5328942537307739, + "learning_rate": 7.226616541353382e-05, + "loss": 0.1709, + "step": 152000 + }, + { + "epoch": 5.972729773272034, + "eval_loss": 0.40219077467918396, + "eval_runtime": 144.786, + "eval_samples_per_second": 39.065, + "eval_steps_per_second": 4.883, + "eval_wer": 0.3205854504020157, + "step": 152000 + }, + { + "epoch": 5.992376910684113, + "grad_norm": 0.5073242783546448, + "learning_rate": 7.15142857142857e-05, + "loss": 0.1651, + "step": 152500 + }, + { + "epoch": 6.012024048096192, + "grad_norm": 0.4045845866203308, + "learning_rate": 7.076390977443608e-05, + "loss": 0.151, + "step": 153000 + }, + { + "epoch": 6.012024048096192, + "eval_loss": 0.4034076929092407, + "eval_runtime": 144.9751, + "eval_samples_per_second": 39.014, + "eval_steps_per_second": 4.877, + "eval_wer": 0.31882011201874466, + "step": 153000 + }, + { + "epoch": 6.031671185508271, + "grad_norm": 1.1703969240188599, + "learning_rate": 7.001203007518797e-05, + "loss": 0.1397, + "step": 153500 + }, + { + "epoch": 6.05131832292035, + "grad_norm": 0.3152583837509155, + "learning_rate": 6.926015037593985e-05, + "loss": 0.1353, + "step": 154000 + }, + { + "epoch": 6.05131832292035, + "eval_loss": 0.41277533769607544, + "eval_runtime": 144.9149, + "eval_samples_per_second": 39.03, + "eval_steps_per_second": 4.879, + "eval_wer": 0.32572098024425866, + "step": 154000 + }, + { + "epoch": 6.070965460332429, + "grad_norm": 0.5021807551383972, + "learning_rate": 6.850827067669173e-05, + "loss": 0.1429, + "step": 154500 + }, + { + "epoch": 6.090612597744508, + "grad_norm": 0.4375011622905731, + "learning_rate": 6.77563909774436e-05, + "loss": 0.1476, + "step": 155000 + }, + { + "epoch": 6.090612597744508, + "eval_loss": 0.4197489619255066, + "eval_runtime": 145.056, + "eval_samples_per_second": 38.992, + "eval_steps_per_second": 4.874, + "eval_wer": 0.3200398003562774, + "step": 155000 + }, + { + "epoch": 6.110259735156587, + "grad_norm": 0.4859500527381897, + "learning_rate": 6.700451127819548e-05, + "loss": 0.1456, + "step": 155500 + }, + { + "epoch": 6.129906872568666, + "grad_norm": 0.4906657636165619, + "learning_rate": 6.625263157894736e-05, + "loss": 0.1465, + "step": 156000 + }, + { + "epoch": 6.129906872568666, + "eval_loss": 0.40734121203422546, + "eval_runtime": 144.5712, + "eval_samples_per_second": 39.123, + "eval_steps_per_second": 4.89, + "eval_wer": 0.3167338030203335, + "step": 156000 + }, + { + "epoch": 6.149554009980746, + "grad_norm": 0.7306200861930847, + "learning_rate": 6.550075187969924e-05, + "loss": 0.1414, + "step": 156500 + }, + { + "epoch": 6.169201147392825, + "grad_norm": 0.35837283730506897, + "learning_rate": 6.474887218045112e-05, + "loss": 0.139, + "step": 157000 + }, + { + "epoch": 6.169201147392825, + "eval_loss": 0.42275404930114746, + "eval_runtime": 144.9153, + "eval_samples_per_second": 39.03, + "eval_steps_per_second": 4.879, + "eval_wer": 0.321179246040025, + "step": 157000 + }, + { + "epoch": 6.188848284804904, + "grad_norm": 0.5820499658584595, + "learning_rate": 6.39984962406015e-05, + "loss": 0.1408, + "step": 157500 + }, + { + "epoch": 6.208495422216983, + "grad_norm": 0.2785002291202545, + "learning_rate": 6.324812030075188e-05, + "loss": 0.1404, + "step": 158000 + }, + { + "epoch": 6.208495422216983, + "eval_loss": 0.4117072522640228, + "eval_runtime": 144.7738, + "eval_samples_per_second": 39.068, + "eval_steps_per_second": 4.883, + "eval_wer": 0.3244691948452119, + "step": 158000 + }, + { + "epoch": 6.228142559629062, + "grad_norm": 0.9491069912910461, + "learning_rate": 6.249624060150375e-05, + "loss": 0.1443, + "step": 158500 + }, + { + "epoch": 6.247789697041141, + "grad_norm": 0.6151573657989502, + "learning_rate": 6.174436090225563e-05, + "loss": 0.1338, + "step": 159000 + }, + { + "epoch": 6.247789697041141, + "eval_loss": 0.41795113682746887, + "eval_runtime": 144.7948, + "eval_samples_per_second": 39.062, + "eval_steps_per_second": 4.883, + "eval_wer": 0.3153054837829597, + "step": 159000 + }, + { + "epoch": 6.26743683445322, + "grad_norm": 1.4104067087173462, + "learning_rate": 6.099248120300751e-05, + "loss": 0.1458, + "step": 159500 + }, + { + "epoch": 6.287083971865299, + "grad_norm": 0.4986151158809662, + "learning_rate": 6.024060150375939e-05, + "loss": 0.1436, + "step": 160000 + }, + { + "epoch": 6.287083971865299, + "eval_loss": 0.42644599080085754, + "eval_runtime": 145.4284, + "eval_samples_per_second": 38.892, + "eval_steps_per_second": 4.861, + "eval_wer": 0.31670170595881947, + "step": 160000 + }, + { + "epoch": 6.306731109277378, + "grad_norm": 1.0388261079788208, + "learning_rate": 5.9488721804511266e-05, + "loss": 0.1382, + "step": 160500 + }, + { + "epoch": 6.326378246689457, + "grad_norm": 1.0425645112991333, + "learning_rate": 5.873834586466165e-05, + "loss": 0.1317, + "step": 161000 + }, + { + "epoch": 6.326378246689457, + "eval_loss": 0.4117776155471802, + "eval_runtime": 144.9416, + "eval_samples_per_second": 39.023, + "eval_steps_per_second": 4.878, + "eval_wer": 0.31524128965993164, + "step": 161000 + }, + { + "epoch": 6.346025384101536, + "grad_norm": 0.47523021697998047, + "learning_rate": 5.798646616541353e-05, + "loss": 0.1386, + "step": 161500 + }, + { + "epoch": 6.365672521513615, + "grad_norm": 0.27745115756988525, + "learning_rate": 5.7234586466165414e-05, + "loss": 0.1395, + "step": 162000 + }, + { + "epoch": 6.365672521513615, + "eval_loss": 0.42685896158218384, + "eval_runtime": 145.206, + "eval_samples_per_second": 38.952, + "eval_steps_per_second": 4.869, + "eval_wer": 0.3118390011394457, + "step": 162000 + }, + { + "epoch": 6.385319658925694, + "grad_norm": 0.3224891126155853, + "learning_rate": 5.6484210526315785e-05, + "loss": 0.1335, + "step": 162500 + }, + { + "epoch": 6.404966796337773, + "grad_norm": 0.2714509665966034, + "learning_rate": 5.5732330827067666e-05, + "loss": 0.1267, + "step": 163000 + }, + { + "epoch": 6.404966796337773, + "eval_loss": 0.4240754544734955, + "eval_runtime": 144.9066, + "eval_samples_per_second": 39.032, + "eval_steps_per_second": 4.879, + "eval_wer": 0.31345990274590363, + "step": 163000 + }, + { + "epoch": 6.4246139337498525, + "grad_norm": 0.3742597997188568, + "learning_rate": 5.498045112781954e-05, + "loss": 0.1438, + "step": 163500 + }, + { + "epoch": 6.4442610711619315, + "grad_norm": 1.6135519742965698, + "learning_rate": 5.422857142857142e-05, + "loss": 0.1334, + "step": 164000 + }, + { + "epoch": 6.4442610711619315, + "eval_loss": 0.40579110383987427, + "eval_runtime": 144.6174, + "eval_samples_per_second": 39.11, + "eval_steps_per_second": 4.889, + "eval_wer": 0.31686219126638954, + "step": 164000 + }, + { + "epoch": 6.4639082085740105, + "grad_norm": 0.7605300545692444, + "learning_rate": 5.3476691729323304e-05, + "loss": 0.1371, + "step": 164500 + }, + { + "epoch": 6.4835553459860895, + "grad_norm": 0.44126906991004944, + "learning_rate": 5.2724812030075185e-05, + "loss": 0.1369, + "step": 165000 + }, + { + "epoch": 6.4835553459860895, + "eval_loss": 0.40502265095710754, + "eval_runtime": 145.4039, + "eval_samples_per_second": 38.899, + "eval_steps_per_second": 4.862, + "eval_wer": 0.31296239829243633, + "step": 165000 + }, + { + "epoch": 6.5032024833981685, + "grad_norm": 0.32450059056282043, + "learning_rate": 5.197293233082706e-05, + "loss": 0.1352, + "step": 165500 + }, + { + "epoch": 6.522849620810248, + "grad_norm": 1.38713538646698, + "learning_rate": 5.122105263157894e-05, + "loss": 0.1322, + "step": 166000 + }, + { + "epoch": 6.522849620810248, + "eval_loss": 0.40965744853019714, + "eval_runtime": 144.8647, + "eval_samples_per_second": 39.043, + "eval_steps_per_second": 4.88, + "eval_wer": 0.31403764985315596, + "step": 166000 + }, + { + "epoch": 6.5424967582223275, + "grad_norm": 0.7151561379432678, + "learning_rate": 5.046917293233082e-05, + "loss": 0.1385, + "step": 166500 + }, + { + "epoch": 6.5621438956344065, + "grad_norm": 0.46481749415397644, + "learning_rate": 4.9717293233082705e-05, + "loss": 0.1358, + "step": 167000 + }, + { + "epoch": 6.5621438956344065, + "eval_loss": 0.41421449184417725, + "eval_runtime": 144.9831, + "eval_samples_per_second": 39.011, + "eval_steps_per_second": 4.876, + "eval_wer": 0.3129142527001653, + "step": 167000 + }, + { + "epoch": 6.5817910330464855, + "grad_norm": 0.4189301133155823, + "learning_rate": 4.896541353383458e-05, + "loss": 0.1359, + "step": 167500 + }, + { + "epoch": 6.6014381704585645, + "grad_norm": 0.7608076333999634, + "learning_rate": 4.821353383458646e-05, + "loss": 0.1345, + "step": 168000 + }, + { + "epoch": 6.6014381704585645, + "eval_loss": 0.40090152621269226, + "eval_runtime": 144.6628, + "eval_samples_per_second": 39.098, + "eval_steps_per_second": 4.887, + "eval_wer": 0.31230440853139896, + "step": 168000 + }, + { + "epoch": 6.6210853078706435, + "grad_norm": 0.23644275963306427, + "learning_rate": 4.746165413533834e-05, + "loss": 0.1329, + "step": 168500 + }, + { + "epoch": 6.6407324452827226, + "grad_norm": 0.5338233709335327, + "learning_rate": 4.6711278195488714e-05, + "loss": 0.1321, + "step": 169000 + }, + { + "epoch": 6.6407324452827226, + "eval_loss": 0.4004514813423157, + "eval_runtime": 144.9848, + "eval_samples_per_second": 39.011, + "eval_steps_per_second": 4.876, + "eval_wer": 0.3092712362183242, + "step": 169000 + }, + { + "epoch": 6.660379582694802, + "grad_norm": 0.5386209487915039, + "learning_rate": 4.5959398496240595e-05, + "loss": 0.1324, + "step": 169500 + }, + { + "epoch": 6.680026720106881, + "grad_norm": 0.7969732880592346, + "learning_rate": 4.5207518796992477e-05, + "loss": 0.1299, + "step": 170000 + }, + { + "epoch": 6.680026720106881, + "eval_loss": 0.39957067370414734, + "eval_runtime": 144.9466, + "eval_samples_per_second": 39.021, + "eval_steps_per_second": 4.878, + "eval_wer": 0.305387491775128, + "step": 170000 + }, + { + "epoch": 6.69967385751896, + "grad_norm": 0.7069671154022217, + "learning_rate": 4.445563909774436e-05, + "loss": 0.1381, + "step": 170500 + }, + { + "epoch": 6.719320994931039, + "grad_norm": 0.8022767305374146, + "learning_rate": 4.370375939849623e-05, + "loss": 0.1345, + "step": 171000 + }, + { + "epoch": 6.719320994931039, + "eval_loss": 0.40409377217292786, + "eval_runtime": 145.3133, + "eval_samples_per_second": 38.923, + "eval_steps_per_second": 4.865, + "eval_wer": 0.30705653897385693, + "step": 171000 + }, + { + "epoch": 6.738968132343118, + "grad_norm": 0.9058027863502502, + "learning_rate": 4.2951879699248114e-05, + "loss": 0.1314, + "step": 171500 + }, + { + "epoch": 6.758615269755197, + "grad_norm": 0.4458518326282501, + "learning_rate": 4.2199999999999996e-05, + "loss": 0.1328, + "step": 172000 + }, + { + "epoch": 6.758615269755197, + "eval_loss": 0.3997325003147125, + "eval_runtime": 145.3079, + "eval_samples_per_second": 38.924, + "eval_steps_per_second": 4.866, + "eval_wer": 0.3069762963200719, + "step": 172000 + }, + { + "epoch": 6.778262407167276, + "grad_norm": 0.5749480128288269, + "learning_rate": 4.144812030075188e-05, + "loss": 0.135, + "step": 172500 + }, + { + "epoch": 6.797909544579355, + "grad_norm": 0.3367716073989868, + "learning_rate": 4.069624060150375e-05, + "loss": 0.1245, + "step": 173000 + }, + { + "epoch": 6.797909544579355, + "eval_loss": 0.3974212110042572, + "eval_runtime": 145.9176, + "eval_samples_per_second": 38.762, + "eval_steps_per_second": 4.845, + "eval_wer": 0.3044566769912215, + "step": 173000 + }, + { + "epoch": 6.817556681991434, + "grad_norm": 0.546405553817749, + "learning_rate": 3.9944360902255633e-05, + "loss": 0.1312, + "step": 173500 + }, + { + "epoch": 6.837203819403513, + "grad_norm": 0.38214609026908875, + "learning_rate": 3.9192481203007515e-05, + "loss": 0.1356, + "step": 174000 + }, + { + "epoch": 6.837203819403513, + "eval_loss": 0.39992156624794006, + "eval_runtime": 144.9546, + "eval_samples_per_second": 39.019, + "eval_steps_per_second": 4.877, + "eval_wer": 0.3008939031631654, + "step": 174000 + }, + { + "epoch": 6.856850956815592, + "grad_norm": 0.21237680315971375, + "learning_rate": 3.8442105263157886e-05, + "loss": 0.1335, + "step": 174500 + }, + { + "epoch": 6.876498094227671, + "grad_norm": 0.4656332731246948, + "learning_rate": 3.769022556390977e-05, + "loss": 0.1208, + "step": 175000 + }, + { + "epoch": 6.876498094227671, + "eval_loss": 0.39532560110092163, + "eval_runtime": 145.4346, + "eval_samples_per_second": 38.89, + "eval_steps_per_second": 4.861, + "eval_wer": 0.301921009131614, + "step": 175000 + }, + { + "epoch": 6.89614523163975, + "grad_norm": 0.6751464605331421, + "learning_rate": 3.693834586466165e-05, + "loss": 0.1282, + "step": 175500 + }, + { + "epoch": 6.915792369051829, + "grad_norm": 1.1535145044326782, + "learning_rate": 3.618646616541353e-05, + "loss": 0.1316, + "step": 176000 + }, + { + "epoch": 6.915792369051829, + "eval_loss": 0.39738306403160095, + "eval_runtime": 146.048, + "eval_samples_per_second": 38.727, + "eval_steps_per_second": 4.841, + "eval_wer": 0.3056442682672401, + "step": 176000 + }, + { + "epoch": 6.935439506463908, + "grad_norm": 0.8314586877822876, + "learning_rate": 3.543458646616541e-05, + "loss": 0.1271, + "step": 176500 + }, + { + "epoch": 6.955086643875987, + "grad_norm": 0.7973750233650208, + "learning_rate": 3.4682706766917294e-05, + "loss": 0.1232, + "step": 177000 + }, + { + "epoch": 6.955086643875987, + "eval_loss": 0.39205384254455566, + "eval_runtime": 146.1083, + "eval_samples_per_second": 38.711, + "eval_steps_per_second": 4.839, + "eval_wer": 0.30333327983823083, + "step": 177000 + }, + { + "epoch": 6.974733781288066, + "grad_norm": 0.3950521647930145, + "learning_rate": 3.393082706766917e-05, + "loss": 0.1344, + "step": 177500 + }, + { + "epoch": 6.994380918700146, + "grad_norm": 0.4100574851036072, + "learning_rate": 3.317894736842105e-05, + "loss": 0.1261, + "step": 178000 + }, + { + "epoch": 6.994380918700146, + "eval_loss": 0.39850306510925293, + "eval_runtime": 145.9873, + "eval_samples_per_second": 38.743, + "eval_steps_per_second": 4.843, + "eval_wer": 0.3034616680842869, + "step": 178000 + }, + { + "epoch": 7.014028056112225, + "grad_norm": 0.5746680498123169, + "learning_rate": 3.242857142857143e-05, + "loss": 0.1105, + "step": 178500 + }, + { + "epoch": 7.033675193524304, + "grad_norm": 0.5409959554672241, + "learning_rate": 3.16766917293233e-05, + "loss": 0.1184, + "step": 179000 + }, + { + "epoch": 7.033675193524304, + "eval_loss": 0.40056413412094116, + "eval_runtime": 145.9106, + "eval_samples_per_second": 38.763, + "eval_steps_per_second": 4.845, + "eval_wer": 0.3061096756591934, + "step": 179000 + }, + { + "epoch": 7.053322330936383, + "grad_norm": 0.7439139485359192, + "learning_rate": 3.092631578947368e-05, + "loss": 0.1132, + "step": 179500 + }, + { + "epoch": 7.072969468348462, + "grad_norm": 1.1852874755859375, + "learning_rate": 3.0174436090225562e-05, + "loss": 0.1115, + "step": 180000 + }, + { + "epoch": 7.072969468348462, + "eval_loss": 0.4096328318119049, + "eval_runtime": 145.6721, + "eval_samples_per_second": 38.827, + "eval_steps_per_second": 4.853, + "eval_wer": 0.3049541814446887, + "step": 180000 + }, + { + "epoch": 7.092616605760541, + "grad_norm": 0.6158276796340942, + "learning_rate": 2.9422556390977444e-05, + "loss": 0.1032, + "step": 180500 + }, + { + "epoch": 7.11226374317262, + "grad_norm": 1.272557258605957, + "learning_rate": 2.867067669172932e-05, + "loss": 0.1109, + "step": 181000 + }, + { + "epoch": 7.11226374317262, + "eval_loss": 0.41377753019332886, + "eval_runtime": 146.2393, + "eval_samples_per_second": 38.676, + "eval_steps_per_second": 4.835, + "eval_wer": 0.3038147357609411, + "step": 181000 + }, + { + "epoch": 7.131910880584699, + "grad_norm": 0.4577464163303375, + "learning_rate": 2.7918796992481203e-05, + "loss": 0.1157, + "step": 181500 + }, + { + "epoch": 7.151558017996778, + "grad_norm": 1.748535394668579, + "learning_rate": 2.716691729323308e-05, + "loss": 0.1113, + "step": 182000 + }, + { + "epoch": 7.151558017996778, + "eval_loss": 0.41194456815719604, + "eval_runtime": 146.2502, + "eval_samples_per_second": 38.673, + "eval_steps_per_second": 4.834, + "eval_wer": 0.3052270064675579, + "step": 182000 + }, + { + "epoch": 7.171205155408857, + "grad_norm": 0.8288829326629639, + "learning_rate": 2.6416541353383456e-05, + "loss": 0.1114, + "step": 182500 + }, + { + "epoch": 7.190852292820936, + "grad_norm": 0.5038288235664368, + "learning_rate": 2.5664661654135334e-05, + "loss": 0.1075, + "step": 183000 + }, + { + "epoch": 7.190852292820936, + "eval_loss": 0.41699934005737305, + "eval_runtime": 145.6145, + "eval_samples_per_second": 38.842, + "eval_steps_per_second": 4.855, + "eval_wer": 0.30066922373256727, + "step": 183000 + }, + { + "epoch": 7.210499430233015, + "grad_norm": 0.41699087619781494, + "learning_rate": 2.4912781954887215e-05, + "loss": 0.1155, + "step": 183500 + }, + { + "epoch": 7.230146567645094, + "grad_norm": 0.9346128702163696, + "learning_rate": 2.4162406015037593e-05, + "loss": 0.1081, + "step": 184000 + }, + { + "epoch": 7.230146567645094, + "eval_loss": 0.4134830832481384, + "eval_runtime": 145.6714, + "eval_samples_per_second": 38.827, + "eval_steps_per_second": 4.853, + "eval_wer": 0.3031246489383897, + "step": 184000 + }, + { + "epoch": 7.249793705057173, + "grad_norm": 1.0166319608688354, + "learning_rate": 2.341052631578947e-05, + "loss": 0.1173, + "step": 184500 + }, + { + "epoch": 7.269440842469252, + "grad_norm": 0.8515588045120239, + "learning_rate": 2.2658646616541353e-05, + "loss": 0.1108, + "step": 185000 + }, + { + "epoch": 7.269440842469252, + "eval_loss": 0.41293400526046753, + "eval_runtime": 146.3235, + "eval_samples_per_second": 38.654, + "eval_steps_per_second": 4.832, + "eval_wer": 0.3003161560559131, + "step": 185000 + }, + { + "epoch": 7.289087979881331, + "grad_norm": 0.5291551351547241, + "learning_rate": 2.190676691729323e-05, + "loss": 0.1064, + "step": 185500 + }, + { + "epoch": 7.30873511729341, + "grad_norm": 1.0743286609649658, + "learning_rate": 2.1154887218045113e-05, + "loss": 0.1044, + "step": 186000 + }, + { + "epoch": 7.30873511729341, + "eval_loss": 0.41300591826438904, + "eval_runtime": 145.5862, + "eval_samples_per_second": 38.85, + "eval_steps_per_second": 4.856, + "eval_wer": 0.3022740768082682, + "step": 186000 + }, + { + "epoch": 7.328382254705489, + "grad_norm": 0.4959530532360077, + "learning_rate": 2.040300751879699e-05, + "loss": 0.1063, + "step": 186500 + }, + { + "epoch": 7.348029392117568, + "grad_norm": 0.6196532845497131, + "learning_rate": 1.9651127819548872e-05, + "loss": 0.1121, + "step": 187000 + }, + { + "epoch": 7.348029392117568, + "eval_loss": 0.40789899230003357, + "eval_runtime": 145.8295, + "eval_samples_per_second": 38.785, + "eval_steps_per_second": 4.848, + "eval_wer": 0.2992890500874645, + "step": 187000 + }, + { + "epoch": 7.367676529529647, + "grad_norm": 1.7419555187225342, + "learning_rate": 1.889924812030075e-05, + "loss": 0.1092, + "step": 187500 + }, + { + "epoch": 7.387323666941727, + "grad_norm": 0.931948721408844, + "learning_rate": 1.814736842105263e-05, + "loss": 0.1052, + "step": 188000 + }, + { + "epoch": 7.387323666941727, + "eval_loss": 0.40476053953170776, + "eval_runtime": 145.5337, + "eval_samples_per_second": 38.864, + "eval_steps_per_second": 4.858, + "eval_wer": 0.301904960600857, + "step": 188000 + }, + { + "epoch": 7.406970804353806, + "grad_norm": 0.3558327853679657, + "learning_rate": 1.739548872180451e-05, + "loss": 0.112, + "step": 188500 + }, + { + "epoch": 7.426617941765885, + "grad_norm": 0.48914971947669983, + "learning_rate": 1.6643609022556388e-05, + "loss": 0.103, + "step": 189000 + }, + { + "epoch": 7.426617941765885, + "eval_loss": 0.415385365486145, + "eval_runtime": 145.9476, + "eval_samples_per_second": 38.754, + "eval_steps_per_second": 4.844, + "eval_wer": 0.3015197958626888, + "step": 189000 + }, + { + "epoch": 7.446265079177964, + "grad_norm": 0.5291373133659363, + "learning_rate": 1.5893233082706766e-05, + "loss": 0.1073, + "step": 189500 + }, + { + "epoch": 7.465912216590043, + "grad_norm": 0.6397764086723328, + "learning_rate": 1.514285714285714e-05, + "loss": 0.1105, + "step": 190000 + }, + { + "epoch": 7.465912216590043, + "eval_loss": 0.4119686484336853, + "eval_runtime": 145.6307, + "eval_samples_per_second": 38.838, + "eval_steps_per_second": 4.855, + "eval_wer": 0.30187286353934295, + "step": 190000 + }, + { + "epoch": 7.485559354002122, + "grad_norm": 0.45867177844047546, + "learning_rate": 1.439097744360902e-05, + "loss": 0.1079, + "step": 190500 + }, + { + "epoch": 7.505206491414201, + "grad_norm": 1.0139355659484863, + "learning_rate": 1.36390977443609e-05, + "loss": 0.1093, + "step": 191000 + }, + { + "epoch": 7.505206491414201, + "eval_loss": 0.4104667901992798, + "eval_runtime": 146.3292, + "eval_samples_per_second": 38.653, + "eval_steps_per_second": 4.832, + "eval_wer": 0.3007494663863523, + "step": 191000 + }, + { + "epoch": 7.52485362882628, + "grad_norm": 0.35021767020225525, + "learning_rate": 1.288721804511278e-05, + "loss": 0.1108, + "step": 191500 + }, + { + "epoch": 7.544500766238359, + "grad_norm": 0.7307072281837463, + "learning_rate": 1.2136842105263156e-05, + "loss": 0.1058, + "step": 192000 + }, + { + "epoch": 7.544500766238359, + "eval_loss": 0.41022607684135437, + "eval_runtime": 146.2108, + "eval_samples_per_second": 38.684, + "eval_steps_per_second": 4.835, + "eval_wer": 0.3011025340630065, + "step": 192000 + }, + { + "epoch": 7.564147903650438, + "grad_norm": 0.46207743883132935, + "learning_rate": 1.1384962406015036e-05, + "loss": 0.1053, + "step": 192500 + }, + { + "epoch": 7.583795041062517, + "grad_norm": 0.47636836767196655, + "learning_rate": 1.0633082706766916e-05, + "loss": 0.1043, + "step": 193000 + }, + { + "epoch": 7.583795041062517, + "eval_loss": 0.41014641523361206, + "eval_runtime": 145.8628, + "eval_samples_per_second": 38.776, + "eval_steps_per_second": 4.847, + "eval_wer": 0.2994495353950346, + "step": 193000 + }, + { + "epoch": 7.603442178474596, + "grad_norm": 1.0540902614593506, + "learning_rate": 9.881203007518796e-06, + "loss": 0.1072, + "step": 193500 + }, + { + "epoch": 7.623089315886675, + "grad_norm": 0.8974863290786743, + "learning_rate": 9.129323308270676e-06, + "loss": 0.1098, + "step": 194000 + }, + { + "epoch": 7.623089315886675, + "eval_loss": 0.408490389585495, + "eval_runtime": 146.1703, + "eval_samples_per_second": 38.695, + "eval_steps_per_second": 4.837, + "eval_wer": 0.29980260307168877, + "step": 194000 + }, + { + "epoch": 7.642736453298754, + "grad_norm": 0.49042123556137085, + "learning_rate": 8.377443609022555e-06, + "loss": 0.1035, + "step": 194500 + }, + { + "epoch": 7.662383590710833, + "grad_norm": 0.7251204252243042, + "learning_rate": 7.625563909774436e-06, + "loss": 0.1057, + "step": 195000 + }, + { + "epoch": 7.662383590710833, + "eval_loss": 0.40715456008911133, + "eval_runtime": 146.2248, + "eval_samples_per_second": 38.68, + "eval_steps_per_second": 4.835, + "eval_wer": 0.2982137985267449, + "step": 195000 + }, + { + "epoch": 7.682030728122912, + "grad_norm": 0.9783725142478943, + "learning_rate": 6.8751879699248115e-06, + "loss": 0.1078, + "step": 195500 + }, + { + "epoch": 7.701677865534991, + "grad_norm": 0.66826331615448, + "learning_rate": 6.123308270676691e-06, + "loss": 0.1021, + "step": 196000 + }, + { + "epoch": 7.701677865534991, + "eval_loss": 0.4079470634460449, + "eval_runtime": 146.5661, + "eval_samples_per_second": 38.59, + "eval_steps_per_second": 4.824, + "eval_wer": 0.2973792749273804, + "step": 196000 + }, + { + "epoch": 7.72132500294707, + "grad_norm": 0.34865960478782654, + "learning_rate": 5.371428571428571e-06, + "loss": 0.108, + "step": 196500 + }, + { + "epoch": 7.740972140359149, + "grad_norm": 0.6881831884384155, + "learning_rate": 4.619548872180451e-06, + "loss": 0.0994, + "step": 197000 + }, + { + "epoch": 7.740972140359149, + "eval_loss": 0.4088830053806305, + "eval_runtime": 145.6213, + "eval_samples_per_second": 38.84, + "eval_steps_per_second": 4.855, + "eval_wer": 0.29871130298021215, + "step": 197000 + }, + { + "epoch": 7.760619277771228, + "grad_norm": 0.7812435030937195, + "learning_rate": 3.867669172932331e-06, + "loss": 0.1017, + "step": 197500 + }, + { + "epoch": 7.780266415183307, + "grad_norm": 0.23445354402065277, + "learning_rate": 3.118796992481203e-06, + "loss": 0.1065, + "step": 198000 + }, + { + "epoch": 7.780266415183307, + "eval_loss": 0.4065949022769928, + "eval_runtime": 146.17, + "eval_samples_per_second": 38.695, + "eval_steps_per_second": 4.837, + "eval_wer": 0.2973792749273804, + "step": 198000 + }, + { + "epoch": 7.799913552595386, + "grad_norm": 0.4873931407928467, + "learning_rate": 2.366917293233083e-06, + "loss": 0.1052, + "step": 198500 + }, + { + "epoch": 7.8195606900074655, + "grad_norm": 0.24652531743049622, + "learning_rate": 1.6150375939849622e-06, + "loss": 0.1111, + "step": 199000 + }, + { + "epoch": 7.8195606900074655, + "eval_loss": 0.40712785720825195, + "eval_runtime": 145.6053, + "eval_samples_per_second": 38.845, + "eval_steps_per_second": 4.856, + "eval_wer": 0.2981817014652309, + "step": 199000 + }, + { + "epoch": 7.839207827419545, + "grad_norm": 0.5532709956169128, + "learning_rate": 8.631578947368421e-07, + "loss": 0.106, + "step": 199500 + }, + { + "epoch": 7.858854964831624, + "grad_norm": 0.4496346116065979, + "learning_rate": 1.1127819548872179e-07, + "loss": 0.1065, + "step": 200000 + }, + { + "epoch": 7.858854964831624, + "eval_loss": 0.4064280092716217, + "eval_runtime": 146.2071, + "eval_samples_per_second": 38.685, + "eval_steps_per_second": 4.836, + "eval_wer": 0.298422429426586, + "step": 200000 + }, + { + "epoch": 7.858854964831624, + "step": 200000, + "total_flos": 2.4880981924796708e+20, + "train_loss": 0.2853552089881897, + "train_runtime": 103513.8909, + "train_samples_per_second": 15.457, + "train_steps_per_second": 1.932 } ], "logging_steps": 500, - "max_steps": 100000, + "max_steps": 200000, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 8, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { @@ -2335,7 +4635,7 @@ "attributes": {} } }, - "total_flos": 1.24400740487767e+20, + "total_flos": 2.4880981924796708e+20, "train_batch_size": 8, "trial_name": null, "trial_params": null