diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15833 @@ +{ + "best_metric": 2.7007529735565186, + "best_model_checkpoint": "fat5-fr-small_v1/checkpoint-200000", + "epoch": 2.6882443076426785, + "eval_steps": 1000, + "global_step": 200000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0013441221538213392, + "grad_norm": 0.44315826892852783, + "learning_rate": 0.002512499999999994, + "loss": 7.5626, + "step": 100 + }, + { + "epoch": 0.0026882443076426785, + "grad_norm": 0.5437139272689819, + "learning_rate": 0.0025249999999999904, + "loss": 6.2859, + "step": 200 + }, + { + "epoch": 0.004032366461464018, + "grad_norm": 0.7252405881881714, + "learning_rate": 0.00253749999999999, + "loss": 5.8614, + "step": 300 + }, + { + "epoch": 0.005376488615285357, + "grad_norm": 1.1030696630477905, + "learning_rate": 0.0025499999999999885, + "loss": 5.6073, + "step": 400 + }, + { + "epoch": 0.006720610769106696, + "grad_norm": 0.8097702264785767, + "learning_rate": 0.0025624999999999862, + "loss": 5.4185, + "step": 500 + }, + { + "epoch": 0.008064732922928036, + "grad_norm": 0.9873973727226257, + "learning_rate": 0.0025749999999999827, + "loss": 5.2865, + "step": 600 + }, + { + "epoch": 0.009408855076749375, + "grad_norm": 1.1101303100585938, + "learning_rate": 0.0025874999999999813, + "loss": 5.1929, + "step": 700 + }, + { + "epoch": 0.010752977230570714, + "grad_norm": 1.1394418478012085, + "learning_rate": 0.0025999999999999795, + "loss": 5.1122, + "step": 800 + }, + { + "epoch": 0.012097099384392053, + "grad_norm": 1.2768878936767578, + "learning_rate": 0.0026124999999999764, + "loss": 5.0262, + "step": 900 + }, + { + "epoch": 0.013441221538213392, + "grad_norm": 1.238966464996338, + "learning_rate": 0.00262499999999997, + "loss": 4.9743, + "step": 1000 + }, + { + "epoch": 0.013441221538213392, + "eval_MaskedAccuracy": 0.19784819583436258, + "eval_loss": 5.1476359367370605, + "eval_runtime": 157.0766, + "eval_samples_per_second": 404.109, + "eval_steps_per_second": 1.579, + "step": 1000 + }, + { + "epoch": 0.014785343692034733, + "grad_norm": 1.1027828454971313, + "learning_rate": 0.002637499999999968, + "loss": 4.9126, + "step": 1100 + }, + { + "epoch": 0.016129465845856072, + "grad_norm": 1.021935224533081, + "learning_rate": 0.002649999999999965, + "loss": 4.855, + "step": 1200 + }, + { + "epoch": 0.01747358799967741, + "grad_norm": 0.9193494319915771, + "learning_rate": 0.0026624999999999635, + "loss": 4.8046, + "step": 1300 + }, + { + "epoch": 0.01881771015349875, + "grad_norm": 1.0063543319702148, + "learning_rate": 0.002674999999999961, + "loss": 4.7762, + "step": 1400 + }, + { + "epoch": 0.02016183230732009, + "grad_norm": 1.034637689590454, + "learning_rate": 0.0026874999999999564, + "loss": 4.7222, + "step": 1500 + }, + { + "epoch": 0.021505954461141428, + "grad_norm": 0.9538853764533997, + "learning_rate": 0.002699999999999955, + "loss": 4.6804, + "step": 1600 + }, + { + "epoch": 0.02285007661496277, + "grad_norm": 0.8716675043106079, + "learning_rate": 0.002712499999999955, + "loss": 4.6452, + "step": 1700 + }, + { + "epoch": 0.024194198768784106, + "grad_norm": 1.194484829902649, + "learning_rate": 0.0027249999999999514, + "loss": 4.6026, + "step": 1800 + }, + { + "epoch": 0.025538320922605447, + "grad_norm": 0.9794184565544128, + "learning_rate": 0.002737499999999953, + "loss": 4.5661, + "step": 1900 + }, + { + "epoch": 0.026882443076426784, + "grad_norm": 0.8896113038063049, + "learning_rate": 0.0027499999999999495, + "loss": 4.528, + "step": 2000 + }, + { + "epoch": 0.026882443076426784, + "eval_MaskedAccuracy": 0.22670220534969857, + "eval_loss": 4.742874622344971, + "eval_runtime": 155.1957, + "eval_samples_per_second": 409.006, + "eval_steps_per_second": 1.598, + "step": 2000 + }, + { + "epoch": 0.028226565230248125, + "grad_norm": 0.9724192023277283, + "learning_rate": 0.0027624999999999425, + "loss": 4.503, + "step": 2100 + }, + { + "epoch": 0.029570687384069465, + "grad_norm": 0.9325888752937317, + "learning_rate": 0.0027749999999999394, + "loss": 4.4762, + "step": 2200 + }, + { + "epoch": 0.030914809537890803, + "grad_norm": 0.8246894478797913, + "learning_rate": 0.0027874999999999363, + "loss": 4.4439, + "step": 2300 + }, + { + "epoch": 0.032258931691712144, + "grad_norm": 0.9386630654335022, + "learning_rate": 0.0027999999999999345, + "loss": 4.4196, + "step": 2400 + }, + { + "epoch": 0.033603053845533484, + "grad_norm": 0.7635268568992615, + "learning_rate": 0.0028124999999999322, + "loss": 4.3982, + "step": 2500 + }, + { + "epoch": 0.03494717599935482, + "grad_norm": 0.905320405960083, + "learning_rate": 0.00282499999999993, + "loss": 4.3657, + "step": 2600 + }, + { + "epoch": 0.03629129815317616, + "grad_norm": 0.8763816952705383, + "learning_rate": 0.0028374999999999286, + "loss": 4.3476, + "step": 2700 + }, + { + "epoch": 0.0376354203069975, + "grad_norm": 1.0410003662109375, + "learning_rate": 0.002849999999999924, + "loss": 4.3162, + "step": 2800 + }, + { + "epoch": 0.03897954246081884, + "grad_norm": 0.8011109232902527, + "learning_rate": 0.0028624999999999224, + "loss": 4.2953, + "step": 2900 + }, + { + "epoch": 0.04032366461464018, + "grad_norm": 0.9530174136161804, + "learning_rate": 0.002874999999999916, + "loss": 4.2607, + "step": 3000 + }, + { + "epoch": 0.04032366461464018, + "eval_MaskedAccuracy": 0.24486576013786512, + "eval_loss": 4.528601169586182, + "eval_runtime": 159.1854, + "eval_samples_per_second": 398.755, + "eval_steps_per_second": 1.558, + "step": 3000 + }, + { + "epoch": 0.041667786768461515, + "grad_norm": 0.9223951101303101, + "learning_rate": 0.0028874999999999123, + "loss": 4.2462, + "step": 3100 + }, + { + "epoch": 0.043011908922282856, + "grad_norm": 0.76118403673172, + "learning_rate": 0.0028999999999999113, + "loss": 4.2295, + "step": 3200 + }, + { + "epoch": 0.044356031076104196, + "grad_norm": 1.0855016708374023, + "learning_rate": 0.002912499999999909, + "loss": 4.1973, + "step": 3300 + }, + { + "epoch": 0.04570015322992554, + "grad_norm": 1.12669038772583, + "learning_rate": 0.0029249999999999042, + "loss": 4.1689, + "step": 3400 + }, + { + "epoch": 0.04704427538374688, + "grad_norm": 0.8598240613937378, + "learning_rate": 0.002937499999999899, + "loss": 4.1503, + "step": 3500 + }, + { + "epoch": 0.04838839753756821, + "grad_norm": 0.9049589037895203, + "learning_rate": 0.002949999999999894, + "loss": 4.1266, + "step": 3600 + }, + { + "epoch": 0.04973251969138955, + "grad_norm": 0.9112903475761414, + "learning_rate": 0.002962499999999894, + "loss": 4.0996, + "step": 3700 + }, + { + "epoch": 0.05107664184521089, + "grad_norm": 0.8487406969070435, + "learning_rate": 0.002974999999999889, + "loss": 4.0884, + "step": 3800 + }, + { + "epoch": 0.052420763999032234, + "grad_norm": 0.9482388496398926, + "learning_rate": 0.002987499999999884, + "loss": 4.0599, + "step": 3900 + }, + { + "epoch": 0.05376488615285357, + "grad_norm": 1.0483149290084839, + "learning_rate": 0.002999999999999876, + "loss": 4.039, + "step": 4000 + }, + { + "epoch": 0.05376488615285357, + "eval_MaskedAccuracy": 0.2591465652834645, + "eval_loss": 4.372995376586914, + "eval_runtime": 156.5453, + "eval_samples_per_second": 405.48, + "eval_steps_per_second": 1.584, + "step": 4000 + }, + { + "epoch": 0.05510900830667491, + "grad_norm": 0.885144054889679, + "learning_rate": 0.003012499999999872, + "loss": 4.0297, + "step": 4100 + }, + { + "epoch": 0.05645313046049625, + "grad_norm": 1.0170485973358154, + "learning_rate": 0.0030249999999998685, + "loss": 4.0089, + "step": 4200 + }, + { + "epoch": 0.05779725261431759, + "grad_norm": 0.9548616409301758, + "learning_rate": 0.003037499999999864, + "loss": 3.9864, + "step": 4300 + }, + { + "epoch": 0.05914137476813893, + "grad_norm": 1.0004748106002808, + "learning_rate": 0.0030499999999998627, + "loss": 3.9803, + "step": 4400 + }, + { + "epoch": 0.060485496921960265, + "grad_norm": 0.8399825096130371, + "learning_rate": 0.003062499999999858, + "loss": 3.9609, + "step": 4500 + }, + { + "epoch": 0.061829619075781606, + "grad_norm": 0.9576703906059265, + "learning_rate": 0.0030749999999998565, + "loss": 3.9492, + "step": 4600 + }, + { + "epoch": 0.06317374122960295, + "grad_norm": 0.8874163031578064, + "learning_rate": 0.0030874999999998525, + "loss": 3.9307, + "step": 4700 + }, + { + "epoch": 0.06451786338342429, + "grad_norm": 0.8954266309738159, + "learning_rate": 0.003099999999999849, + "loss": 3.917, + "step": 4800 + }, + { + "epoch": 0.06586198553724562, + "grad_norm": 1.2326369285583496, + "learning_rate": 0.0031124999999998476, + "loss": 3.9057, + "step": 4900 + }, + { + "epoch": 0.06720610769106697, + "grad_norm": 0.9583144783973694, + "learning_rate": 0.003124999999999845, + "loss": 3.8926, + "step": 5000 + }, + { + "epoch": 0.06720610769106697, + "eval_MaskedAccuracy": 0.272405235858602, + "eval_loss": 4.243067741394043, + "eval_runtime": 160.7599, + "eval_samples_per_second": 394.85, + "eval_steps_per_second": 1.543, + "step": 5000 + }, + { + "epoch": 0.0685502298448883, + "grad_norm": 1.063835620880127, + "learning_rate": 0.003137499999999841, + "loss": 3.8825, + "step": 5100 + }, + { + "epoch": 0.06989435199870964, + "grad_norm": 0.8451062440872192, + "learning_rate": 0.003149999999999838, + "loss": 3.8725, + "step": 5200 + }, + { + "epoch": 0.07123847415253098, + "grad_norm": 0.8295220732688904, + "learning_rate": 0.003162499999999835, + "loss": 3.8622, + "step": 5300 + }, + { + "epoch": 0.07258259630635232, + "grad_norm": 0.8434109091758728, + "learning_rate": 0.003174999999999833, + "loss": 3.8418, + "step": 5400 + }, + { + "epoch": 0.07392671846017367, + "grad_norm": 0.9849230051040649, + "learning_rate": 0.0031874999999998307, + "loss": 3.8338, + "step": 5500 + }, + { + "epoch": 0.075270840613995, + "grad_norm": 1.2400304079055786, + "learning_rate": 0.003199999999999828, + "loss": 3.8234, + "step": 5600 + }, + { + "epoch": 0.07661496276781633, + "grad_norm": 0.8637450933456421, + "learning_rate": 0.0032124999999998288, + "loss": 3.8194, + "step": 5700 + }, + { + "epoch": 0.07795908492163768, + "grad_norm": 0.7561401128768921, + "learning_rate": 0.0032249999999998287, + "loss": 3.805, + "step": 5800 + }, + { + "epoch": 0.07930320707545901, + "grad_norm": 0.822270929813385, + "learning_rate": 0.003237499999999828, + "loss": 3.7957, + "step": 5900 + }, + { + "epoch": 0.08064732922928036, + "grad_norm": 0.9090524315834045, + "learning_rate": 0.0032499999999998216, + "loss": 3.7777, + "step": 6000 + }, + { + "epoch": 0.08064732922928036, + "eval_MaskedAccuracy": 0.2802241909468907, + "eval_loss": 4.147753715515137, + "eval_runtime": 154.4313, + "eval_samples_per_second": 411.031, + "eval_steps_per_second": 1.606, + "step": 6000 + }, + { + "epoch": 0.0819914513831017, + "grad_norm": 1.0061832666397095, + "learning_rate": 0.0032624999999998194, + "loss": 3.7827, + "step": 6100 + }, + { + "epoch": 0.08333557353692303, + "grad_norm": 1.0628759860992432, + "learning_rate": 0.0032749999999998154, + "loss": 3.7667, + "step": 6200 + }, + { + "epoch": 0.08467969569074438, + "grad_norm": 1.1841171979904175, + "learning_rate": 0.003287499999999813, + "loss": 3.7672, + "step": 6300 + }, + { + "epoch": 0.08602381784456571, + "grad_norm": 0.9175704121589661, + "learning_rate": 0.003299999999999811, + "loss": 3.7556, + "step": 6400 + }, + { + "epoch": 0.08736793999838706, + "grad_norm": 0.9184996485710144, + "learning_rate": 0.0033124999999998082, + "loss": 3.7411, + "step": 6500 + }, + { + "epoch": 0.08871206215220839, + "grad_norm": 0.8513078093528748, + "learning_rate": 0.0033249999999998042, + "loss": 3.7332, + "step": 6600 + }, + { + "epoch": 0.09005618430602973, + "grad_norm": 0.9670765399932861, + "learning_rate": 0.0033374999999997994, + "loss": 3.7274, + "step": 6700 + }, + { + "epoch": 0.09140030645985107, + "grad_norm": 0.9749046564102173, + "learning_rate": 0.0033499999999997932, + "loss": 3.7223, + "step": 6800 + }, + { + "epoch": 0.09274442861367241, + "grad_norm": 0.8485523462295532, + "learning_rate": 0.003362499999999787, + "loss": 3.7125, + "step": 6900 + }, + { + "epoch": 0.09408855076749376, + "grad_norm": 0.84412682056427, + "learning_rate": 0.0033749999999997853, + "loss": 3.71, + "step": 7000 + }, + { + "epoch": 0.09408855076749376, + "eval_MaskedAccuracy": 0.2868874563852213, + "eval_loss": 4.067524433135986, + "eval_runtime": 154.2299, + "eval_samples_per_second": 411.567, + "eval_steps_per_second": 1.608, + "step": 7000 + }, + { + "epoch": 0.09543267292131509, + "grad_norm": 0.8195430040359497, + "learning_rate": 0.0033874999999997804, + "loss": 3.708, + "step": 7100 + }, + { + "epoch": 0.09677679507513642, + "grad_norm": 0.8684709668159485, + "learning_rate": 0.0033999999999997713, + "loss": 3.7003, + "step": 7200 + }, + { + "epoch": 0.09812091722895777, + "grad_norm": 0.8237408399581909, + "learning_rate": 0.0034124999999997647, + "loss": 3.676, + "step": 7300 + }, + { + "epoch": 0.0994650393827791, + "grad_norm": 0.8418594598770142, + "learning_rate": 0.003424999999999764, + "loss": 3.6748, + "step": 7400 + }, + { + "epoch": 0.10080916153660045, + "grad_norm": 0.900364100933075, + "learning_rate": 0.0034374999999997606, + "loss": 3.6822, + "step": 7500 + }, + { + "epoch": 0.10215328369042179, + "grad_norm": 0.9300949573516846, + "learning_rate": 0.003449999999999758, + "loss": 3.6657, + "step": 7600 + }, + { + "epoch": 0.10349740584424312, + "grad_norm": 0.8330152034759521, + "learning_rate": 0.003462499999999754, + "loss": 3.6635, + "step": 7700 + }, + { + "epoch": 0.10484152799806447, + "grad_norm": 0.8555970191955566, + "learning_rate": 0.003474999999999749, + "loss": 3.6461, + "step": 7800 + }, + { + "epoch": 0.1061856501518858, + "grad_norm": 0.9717864394187927, + "learning_rate": 0.003487499999999749, + "loss": 3.6402, + "step": 7900 + }, + { + "epoch": 0.10752977230570714, + "grad_norm": 1.3664884567260742, + "learning_rate": 0.003499999999999745, + "loss": 3.6341, + "step": 8000 + }, + { + "epoch": 0.10752977230570714, + "eval_MaskedAccuracy": 0.2957456999836483, + "eval_loss": 3.9853811264038086, + "eval_runtime": 154.2803, + "eval_samples_per_second": 411.433, + "eval_steps_per_second": 1.607, + "step": 8000 + }, + { + "epoch": 0.10887389445952848, + "grad_norm": 1.9192525148391724, + "learning_rate": 0.0035124999999997415, + "loss": 3.6234, + "step": 8100 + }, + { + "epoch": 0.11021801661334982, + "grad_norm": 0.875035285949707, + "learning_rate": 0.003524999999999741, + "loss": 3.6126, + "step": 8200 + }, + { + "epoch": 0.11156213876717117, + "grad_norm": 1.4383764266967773, + "learning_rate": 0.0035374999999997353, + "loss": 3.5893, + "step": 8300 + }, + { + "epoch": 0.1129062609209925, + "grad_norm": 1.0789637565612793, + "learning_rate": 0.0035499999999997313, + "loss": 3.5687, + "step": 8400 + }, + { + "epoch": 0.11425038307481383, + "grad_norm": 1.7937785387039185, + "learning_rate": 0.00356249999999973, + "loss": 3.5473, + "step": 8500 + }, + { + "epoch": 0.11559450522863518, + "grad_norm": 1.0099656581878662, + "learning_rate": 0.0035749999999997255, + "loss": 3.5311, + "step": 8600 + }, + { + "epoch": 0.11693862738245651, + "grad_norm": 0.876243531703949, + "learning_rate": 0.0035874999999997237, + "loss": 3.5129, + "step": 8700 + }, + { + "epoch": 0.11828274953627786, + "grad_norm": 1.4001803398132324, + "learning_rate": 0.0035999999999997236, + "loss": 3.5052, + "step": 8800 + }, + { + "epoch": 0.1196268716900992, + "grad_norm": 1.103613018989563, + "learning_rate": 0.003612499999999721, + "loss": 3.4943, + "step": 8900 + }, + { + "epoch": 0.12097099384392053, + "grad_norm": 0.8662921190261841, + "learning_rate": 0.00362499999999972, + "loss": 3.4864, + "step": 9000 + }, + { + "epoch": 0.12097099384392053, + "eval_MaskedAccuracy": 0.308084678333672, + "eval_loss": 3.829771041870117, + "eval_runtime": 161.1285, + "eval_samples_per_second": 393.946, + "eval_steps_per_second": 1.539, + "step": 9000 + }, + { + "epoch": 0.12231511599774188, + "grad_norm": 0.9641631841659546, + "learning_rate": 0.0036374999999997113, + "loss": 3.4636, + "step": 9100 + }, + { + "epoch": 0.12365923815156321, + "grad_norm": 0.8773326277732849, + "learning_rate": 0.0036499999999997073, + "loss": 3.4694, + "step": 9200 + }, + { + "epoch": 0.12500336030538456, + "grad_norm": 1.132733941078186, + "learning_rate": 0.003662499999999711, + "loss": 3.4452, + "step": 9300 + }, + { + "epoch": 0.1263474824592059, + "grad_norm": 1.1771531105041504, + "learning_rate": 0.0036749999999997097, + "loss": 3.4436, + "step": 9400 + }, + { + "epoch": 0.12769160461302723, + "grad_norm": 0.7729461193084717, + "learning_rate": 0.0036874999999997097, + "loss": 3.4241, + "step": 9500 + }, + { + "epoch": 0.12903572676684857, + "grad_norm": 1.1024218797683716, + "learning_rate": 0.0036999999999997118, + "loss": 3.4175, + "step": 9600 + }, + { + "epoch": 0.13037984892066992, + "grad_norm": 0.7504922747612, + "learning_rate": 0.0037124999999997074, + "loss": 3.3946, + "step": 9700 + }, + { + "epoch": 0.13172397107449124, + "grad_norm": 0.7903368473052979, + "learning_rate": 0.0037249999999997034, + "loss": 3.3848, + "step": 9800 + }, + { + "epoch": 0.1330680932283126, + "grad_norm": 0.8163586258888245, + "learning_rate": 0.0037374999999996972, + "loss": 3.3669, + "step": 9900 + }, + { + "epoch": 0.13441221538213394, + "grad_norm": 0.839239776134491, + "learning_rate": 0.003749999999999697, + "loss": 3.3447, + "step": 10000 + }, + { + "epoch": 0.13441221538213394, + "eval_MaskedAccuracy": 0.32656299271612027, + "eval_loss": 3.7040650844573975, + "eval_runtime": 161.5088, + "eval_samples_per_second": 393.019, + "eval_steps_per_second": 1.536, + "step": 10000 + }, + { + "epoch": 0.13575633753595526, + "grad_norm": 1.061814546585083, + "learning_rate": 0.003762499999999692, + "loss": 3.3364, + "step": 10100 + }, + { + "epoch": 0.1371004596897766, + "grad_norm": 0.7809873223304749, + "learning_rate": 0.0037749999999996914, + "loss": 3.3223, + "step": 10200 + }, + { + "epoch": 0.13844458184359795, + "grad_norm": 0.8472948670387268, + "learning_rate": 0.003787499999999695, + "loss": 3.3116, + "step": 10300 + }, + { + "epoch": 0.13978870399741927, + "grad_norm": 0.7124361991882324, + "learning_rate": 0.0037999999999996873, + "loss": 3.2889, + "step": 10400 + }, + { + "epoch": 0.14113282615124062, + "grad_norm": 1.7072255611419678, + "learning_rate": 0.003812499999999687, + "loss": 3.2982, + "step": 10500 + }, + { + "epoch": 0.14247694830506197, + "grad_norm": 2.472754716873169, + "learning_rate": 0.0038249999999996833, + "loss": 3.2711, + "step": 10600 + }, + { + "epoch": 0.14382107045888332, + "grad_norm": 0.9063215255737305, + "learning_rate": 0.0038374999999996823, + "loss": 3.2667, + "step": 10700 + }, + { + "epoch": 0.14516519261270464, + "grad_norm": 0.8283532857894897, + "learning_rate": 0.0038499999999996775, + "loss": 3.2545, + "step": 10800 + }, + { + "epoch": 0.14650931476652598, + "grad_norm": 0.8591430187225342, + "learning_rate": 0.0038624999999996735, + "loss": 3.248, + "step": 10900 + }, + { + "epoch": 0.14785343692034733, + "grad_norm": 0.971303403377533, + "learning_rate": 0.0038749999999996712, + "loss": 3.2447, + "step": 11000 + }, + { + "epoch": 0.14785343692034733, + "eval_MaskedAccuracy": 0.34290362452711665, + "eval_loss": 3.601362705230713, + "eval_runtime": 162.0306, + "eval_samples_per_second": 391.753, + "eval_steps_per_second": 1.531, + "step": 11000 + }, + { + "epoch": 0.14919755907416865, + "grad_norm": 1.0778330564498901, + "learning_rate": 0.0038874999999996686, + "loss": 3.2298, + "step": 11100 + }, + { + "epoch": 0.15054168122799, + "grad_norm": 0.7707659602165222, + "learning_rate": 0.003899999999999665, + "loss": 3.2199, + "step": 11200 + }, + { + "epoch": 0.15188580338181135, + "grad_norm": 1.2470498085021973, + "learning_rate": 0.003912499999999662, + "loss": 3.2137, + "step": 11300 + }, + { + "epoch": 0.15322992553563267, + "grad_norm": 0.8463084101676941, + "learning_rate": 0.003924999999999659, + "loss": 3.2123, + "step": 11400 + }, + { + "epoch": 0.154574047689454, + "grad_norm": 0.8666761517524719, + "learning_rate": 0.0039374999999996566, + "loss": 3.1978, + "step": 11500 + }, + { + "epoch": 0.15591816984327536, + "grad_norm": 0.899533748626709, + "learning_rate": 0.0039499999999996595, + "loss": 3.1932, + "step": 11600 + }, + { + "epoch": 0.1572622919970967, + "grad_norm": 0.8859838843345642, + "learning_rate": 0.00396249999999966, + "loss": 3.1799, + "step": 11700 + }, + { + "epoch": 0.15860641415091803, + "grad_norm": 1.3646000623703003, + "learning_rate": 0.003974999999999655, + "loss": 3.1739, + "step": 11800 + }, + { + "epoch": 0.15995053630473938, + "grad_norm": 0.7711676359176636, + "learning_rate": 0.003987499999999656, + "loss": 3.1746, + "step": 11900 + }, + { + "epoch": 0.16129465845856072, + "grad_norm": 0.9850155711174011, + "learning_rate": 0.003999999999999657, + "loss": 3.1749, + "step": 12000 + }, + { + "epoch": 0.16129465845856072, + "eval_MaskedAccuracy": 0.3521379038689249, + "eval_loss": 3.5329132080078125, + "eval_runtime": 154.2202, + "eval_samples_per_second": 411.593, + "eval_steps_per_second": 1.608, + "step": 12000 + }, + { + "epoch": 0.16263878061238204, + "grad_norm": 2.1732335090637207, + "learning_rate": 0.004012499999999649, + "loss": 3.1557, + "step": 12100 + }, + { + "epoch": 0.1639829027662034, + "grad_norm": 0.6864789724349976, + "learning_rate": 0.00402499999999965, + "loss": 3.1457, + "step": 12200 + }, + { + "epoch": 0.16532702492002474, + "grad_norm": 0.7963569760322571, + "learning_rate": 0.004037499999999652, + "loss": 3.1462, + "step": 12300 + }, + { + "epoch": 0.16667114707384606, + "grad_norm": 0.7270441651344299, + "learning_rate": 0.00404999999999965, + "loss": 3.1424, + "step": 12400 + }, + { + "epoch": 0.1680152692276674, + "grad_norm": 1.0645862817764282, + "learning_rate": 0.004062499999999646, + "loss": 3.1301, + "step": 12500 + }, + { + "epoch": 0.16935939138148876, + "grad_norm": 0.7641903758049011, + "learning_rate": 0.004074999999999641, + "loss": 3.1309, + "step": 12600 + }, + { + "epoch": 0.17070351353531008, + "grad_norm": 1.2100406885147095, + "learning_rate": 0.004087499999999641, + "loss": 3.1348, + "step": 12700 + }, + { + "epoch": 0.17204763568913142, + "grad_norm": 0.7301545143127441, + "learning_rate": 0.004099999999999645, + "loss": 3.1175, + "step": 12800 + }, + { + "epoch": 0.17339175784295277, + "grad_norm": 1.3868557214736938, + "learning_rate": 0.004112499999999646, + "loss": 3.1196, + "step": 12900 + }, + { + "epoch": 0.17473587999677412, + "grad_norm": 1.3212717771530151, + "learning_rate": 0.004124999999999645, + "loss": 3.1098, + "step": 13000 + }, + { + "epoch": 0.17473587999677412, + "eval_MaskedAccuracy": 0.35983040074292133, + "eval_loss": 3.4708173274993896, + "eval_runtime": 161.2353, + "eval_samples_per_second": 393.685, + "eval_steps_per_second": 1.538, + "step": 13000 + }, + { + "epoch": 0.17608000215059544, + "grad_norm": 0.6421147584915161, + "learning_rate": 0.0041374999999996415, + "loss": 3.1074, + "step": 13100 + }, + { + "epoch": 0.17742412430441679, + "grad_norm": 1.0816092491149902, + "learning_rate": 0.004149999999999641, + "loss": 3.0902, + "step": 13200 + }, + { + "epoch": 0.17876824645823813, + "grad_norm": 0.8088307976722717, + "learning_rate": 0.004162499999999636, + "loss": 3.0973, + "step": 13300 + }, + { + "epoch": 0.18011236861205945, + "grad_norm": 0.9084308743476868, + "learning_rate": 0.004174999999999633, + "loss": 3.0835, + "step": 13400 + }, + { + "epoch": 0.1814564907658808, + "grad_norm": 1.0027776956558228, + "learning_rate": 0.004187499999999628, + "loss": 3.0876, + "step": 13500 + }, + { + "epoch": 0.18280061291970215, + "grad_norm": 1.0761163234710693, + "learning_rate": 0.004199999999999622, + "loss": 3.0867, + "step": 13600 + }, + { + "epoch": 0.18414473507352347, + "grad_norm": 0.7274723052978516, + "learning_rate": 0.004212499999999617, + "loss": 3.0793, + "step": 13700 + }, + { + "epoch": 0.18548885722734482, + "grad_norm": 0.9493256211280823, + "learning_rate": 0.004224999999999617, + "loss": 3.0753, + "step": 13800 + }, + { + "epoch": 0.18683297938116616, + "grad_norm": 1.288206696510315, + "learning_rate": 0.004237499999999612, + "loss": 3.0625, + "step": 13900 + }, + { + "epoch": 0.1881771015349875, + "grad_norm": 0.8344607353210449, + "learning_rate": 0.004249999999999605, + "loss": 3.0665, + "step": 14000 + }, + { + "epoch": 0.1881771015349875, + "eval_MaskedAccuracy": 0.36507595159603434, + "eval_loss": 3.4270057678222656, + "eval_runtime": 161.5547, + "eval_samples_per_second": 392.907, + "eval_steps_per_second": 1.535, + "step": 14000 + }, + { + "epoch": 0.18952122368880883, + "grad_norm": 0.7024100422859192, + "learning_rate": 0.004262499999999602, + "loss": 3.0688, + "step": 14100 + }, + { + "epoch": 0.19086534584263018, + "grad_norm": 0.7407594323158264, + "learning_rate": 0.004274999999999597, + "loss": 3.0539, + "step": 14200 + }, + { + "epoch": 0.19220946799645153, + "grad_norm": 0.8240285515785217, + "learning_rate": 0.004287499999999591, + "loss": 3.0552, + "step": 14300 + }, + { + "epoch": 0.19355359015027285, + "grad_norm": 0.7373182773590088, + "learning_rate": 0.004299999999999582, + "loss": 3.0529, + "step": 14400 + }, + { + "epoch": 0.1948977123040942, + "grad_norm": 0.6955730319023132, + "learning_rate": 0.004312499999999577, + "loss": 3.0478, + "step": 14500 + }, + { + "epoch": 0.19624183445791554, + "grad_norm": 1.1683614253997803, + "learning_rate": 0.004324999999999574, + "loss": 3.0314, + "step": 14600 + }, + { + "epoch": 0.19758595661173686, + "grad_norm": 1.4355748891830444, + "learning_rate": 0.004337499999999567, + "loss": 3.0478, + "step": 14700 + }, + { + "epoch": 0.1989300787655582, + "grad_norm": 0.7114657163619995, + "learning_rate": 0.004349999999999561, + "loss": 3.0324, + "step": 14800 + }, + { + "epoch": 0.20027420091937956, + "grad_norm": 1.0332584381103516, + "learning_rate": 0.004362499999999559, + "loss": 3.0353, + "step": 14900 + }, + { + "epoch": 0.2016183230732009, + "grad_norm": 0.6959288716316223, + "learning_rate": 0.0043749999999995555, + "loss": 3.0278, + "step": 15000 + }, + { + "epoch": 0.2016183230732009, + "eval_MaskedAccuracy": 0.37069473714124845, + "eval_loss": 3.379894256591797, + "eval_runtime": 161.4047, + "eval_samples_per_second": 393.272, + "eval_steps_per_second": 1.537, + "step": 15000 + }, + { + "epoch": 0.20296244522702223, + "grad_norm": 1.1070080995559692, + "learning_rate": 0.004387499999999551, + "loss": 3.0205, + "step": 15100 + }, + { + "epoch": 0.20430656738084357, + "grad_norm": 0.8838359117507935, + "learning_rate": 0.004399999999999546, + "loss": 3.0299, + "step": 15200 + }, + { + "epoch": 0.20565068953466492, + "grad_norm": 0.8130790591239929, + "learning_rate": 0.004412499999999543, + "loss": 3.0231, + "step": 15300 + }, + { + "epoch": 0.20699481168848624, + "grad_norm": 0.6710375547409058, + "learning_rate": 0.004424999999999538, + "loss": 3.0161, + "step": 15400 + }, + { + "epoch": 0.2083389338423076, + "grad_norm": 0.7571612000465393, + "learning_rate": 0.004437499999999531, + "loss": 3.0166, + "step": 15500 + }, + { + "epoch": 0.20968305599612894, + "grad_norm": 0.658545196056366, + "learning_rate": 0.004449999999999525, + "loss": 3.0011, + "step": 15600 + }, + { + "epoch": 0.21102717814995026, + "grad_norm": 0.7161542177200317, + "learning_rate": 0.004462499999999522, + "loss": 3.0085, + "step": 15700 + }, + { + "epoch": 0.2123713003037716, + "grad_norm": 0.7213286757469177, + "learning_rate": 0.004474999999999518, + "loss": 3.0037, + "step": 15800 + }, + { + "epoch": 0.21371542245759295, + "grad_norm": 1.3048126697540283, + "learning_rate": 0.004487499999999511, + "loss": 2.9974, + "step": 15900 + }, + { + "epoch": 0.21505954461141427, + "grad_norm": 0.999495804309845, + "learning_rate": 0.00449999999999951, + "loss": 2.9973, + "step": 16000 + }, + { + "epoch": 0.21505954461141427, + "eval_MaskedAccuracy": 0.3738960824804252, + "eval_loss": 3.3454906940460205, + "eval_runtime": 157.1677, + "eval_samples_per_second": 403.874, + "eval_steps_per_second": 1.578, + "step": 16000 + }, + { + "epoch": 0.21640366676523562, + "grad_norm": 2.9935622215270996, + "learning_rate": 0.004512499999999503, + "loss": 3.0009, + "step": 16100 + }, + { + "epoch": 0.21774778891905697, + "grad_norm": 1.100763201713562, + "learning_rate": 0.004524999999999504, + "loss": 2.9812, + "step": 16200 + }, + { + "epoch": 0.21909191107287831, + "grad_norm": 0.7473815083503723, + "learning_rate": 0.004537499999999495, + "loss": 2.9833, + "step": 16300 + }, + { + "epoch": 0.22043603322669963, + "grad_norm": 0.8163867592811584, + "learning_rate": 0.004549999999999481, + "loss": 2.9867, + "step": 16400 + }, + { + "epoch": 0.22178015538052098, + "grad_norm": 4.292899131774902, + "learning_rate": 0.0045624999999994715, + "loss": 2.982, + "step": 16500 + }, + { + "epoch": 0.22312427753434233, + "grad_norm": 9.086414337158203, + "learning_rate": 0.0045749999999994675, + "loss": 2.9867, + "step": 16600 + }, + { + "epoch": 0.22446839968816365, + "grad_norm": 0.6632740497589111, + "learning_rate": 0.004587499999999466, + "loss": 2.972, + "step": 16700 + }, + { + "epoch": 0.225812521841985, + "grad_norm": 3.345919132232666, + "learning_rate": 0.004599999999999462, + "loss": 2.972, + "step": 16800 + }, + { + "epoch": 0.22715664399580635, + "grad_norm": 0.6550186276435852, + "learning_rate": 0.0046124999999994564, + "loss": 2.9672, + "step": 16900 + }, + { + "epoch": 0.22850076614962767, + "grad_norm": 0.6624706983566284, + "learning_rate": 0.004624999999999449, + "loss": 2.9731, + "step": 17000 + }, + { + "epoch": 0.22850076614962767, + "eval_MaskedAccuracy": 0.3775072955122549, + "eval_loss": 3.3128609657287598, + "eval_runtime": 154.9603, + "eval_samples_per_second": 409.628, + "eval_steps_per_second": 1.6, + "step": 17000 + }, + { + "epoch": 0.229844888303449, + "grad_norm": 1.0394245386123657, + "learning_rate": 0.004637499999999442, + "loss": 2.9749, + "step": 17100 + }, + { + "epoch": 0.23118901045727036, + "grad_norm": 0.6592782139778137, + "learning_rate": 0.00464999999999944, + "loss": 2.967, + "step": 17200 + }, + { + "epoch": 0.2325331326110917, + "grad_norm": 0.923537015914917, + "learning_rate": 0.00466249999999944, + "loss": 2.9607, + "step": 17300 + }, + { + "epoch": 0.23387725476491303, + "grad_norm": 0.878057599067688, + "learning_rate": 0.004674999999999435, + "loss": 2.9609, + "step": 17400 + }, + { + "epoch": 0.23522137691873438, + "grad_norm": 1.1436913013458252, + "learning_rate": 0.004687499999999431, + "loss": 2.9521, + "step": 17500 + }, + { + "epoch": 0.23656549907255572, + "grad_norm": 0.8196259140968323, + "learning_rate": 0.004699999999999424, + "loss": 2.9523, + "step": 17600 + }, + { + "epoch": 0.23790962122637704, + "grad_norm": 0.625901460647583, + "learning_rate": 0.004712499999999414, + "loss": 2.955, + "step": 17700 + }, + { + "epoch": 0.2392537433801984, + "grad_norm": 0.7023401856422424, + "learning_rate": 0.004724999999999409, + "loss": 2.949, + "step": 17800 + }, + { + "epoch": 0.24059786553401974, + "grad_norm": 0.7306541204452515, + "learning_rate": 0.0047374999999993984, + "loss": 2.9478, + "step": 17900 + }, + { + "epoch": 0.24194198768784106, + "grad_norm": 0.6469770073890686, + "learning_rate": 0.004749999999999393, + "loss": 2.9395, + "step": 18000 + }, + { + "epoch": 0.24194198768784106, + "eval_MaskedAccuracy": 0.3805818810631038, + "eval_loss": 3.2859790325164795, + "eval_runtime": 159.4762, + "eval_samples_per_second": 398.028, + "eval_steps_per_second": 1.555, + "step": 18000 + }, + { + "epoch": 0.2432861098416624, + "grad_norm": 0.6748953461647034, + "learning_rate": 0.004762499999999384, + "loss": 2.9299, + "step": 18100 + }, + { + "epoch": 0.24463023199548375, + "grad_norm": 1.1163654327392578, + "learning_rate": 0.004774999999999384, + "loss": 2.9415, + "step": 18200 + }, + { + "epoch": 0.2459743541493051, + "grad_norm": 1.0123517513275146, + "learning_rate": 0.00478749999999938, + "loss": 2.9301, + "step": 18300 + }, + { + "epoch": 0.24731847630312642, + "grad_norm": 3.0257534980773926, + "learning_rate": 0.004799999999999372, + "loss": 2.9342, + "step": 18400 + }, + { + "epoch": 0.24866259845694777, + "grad_norm": 1.3270440101623535, + "learning_rate": 0.004812499999999362, + "loss": 2.9325, + "step": 18500 + }, + { + "epoch": 0.2500067206107691, + "grad_norm": 1.0727241039276123, + "learning_rate": 0.004824999999999358, + "loss": 2.9221, + "step": 18600 + }, + { + "epoch": 0.25135084276459047, + "grad_norm": 0.7316584587097168, + "learning_rate": 0.004837499999999354, + "loss": 2.9235, + "step": 18700 + }, + { + "epoch": 0.2526949649184118, + "grad_norm": 2.4931344985961914, + "learning_rate": 0.00484999999999935, + "loss": 2.9264, + "step": 18800 + }, + { + "epoch": 0.2540390870722331, + "grad_norm": 2.006314277648926, + "learning_rate": 0.004862499999999345, + "loss": 2.9229, + "step": 18900 + }, + { + "epoch": 0.25538320922605445, + "grad_norm": 1.1321533918380737, + "learning_rate": 0.004874999999999335, + "loss": 2.917, + "step": 19000 + }, + { + "epoch": 0.25538320922605445, + "eval_MaskedAccuracy": 0.38268668848805826, + "eval_loss": 3.2650303840637207, + "eval_runtime": 154.9489, + "eval_samples_per_second": 409.658, + "eval_steps_per_second": 1.601, + "step": 19000 + }, + { + "epoch": 0.2567273313798758, + "grad_norm": 0.637344241142273, + "learning_rate": 0.004887499999999324, + "loss": 2.9099, + "step": 19100 + }, + { + "epoch": 0.25807145353369715, + "grad_norm": 3.686405897140503, + "learning_rate": 0.004899999999999312, + "loss": 2.9115, + "step": 19200 + }, + { + "epoch": 0.2594155756875185, + "grad_norm": 0.734024703502655, + "learning_rate": 0.0049124999999993054, + "loss": 2.9133, + "step": 19300 + }, + { + "epoch": 0.26075969784133984, + "grad_norm": 0.9141765236854553, + "learning_rate": 0.004924999999999296, + "loss": 2.9188, + "step": 19400 + }, + { + "epoch": 0.26210381999516114, + "grad_norm": 0.9035413265228271, + "learning_rate": 0.004937499999999295, + "loss": 2.9223, + "step": 19500 + }, + { + "epoch": 0.2634479421489825, + "grad_norm": 0.9160522222518921, + "learning_rate": 0.004949999999999293, + "loss": 2.9057, + "step": 19600 + }, + { + "epoch": 0.26479206430280383, + "grad_norm": 0.6431342363357544, + "learning_rate": 0.004962499999999282, + "loss": 2.8985, + "step": 19700 + }, + { + "epoch": 0.2661361864566252, + "grad_norm": 0.6499119997024536, + "learning_rate": 0.004974999999999273, + "loss": 2.8946, + "step": 19800 + }, + { + "epoch": 0.2674803086104465, + "grad_norm": 0.7949116826057434, + "learning_rate": 0.004987499999999261, + "loss": 2.9091, + "step": 19900 + }, + { + "epoch": 0.2688244307642679, + "grad_norm": 2.123286008834839, + "learning_rate": 0.005, + "loss": 2.8978, + "step": 20000 + }, + { + "epoch": 0.2688244307642679, + "eval_MaskedAccuracy": 0.3850105547815323, + "eval_loss": 3.2462847232818604, + "eval_runtime": 158.4819, + "eval_samples_per_second": 400.525, + "eval_steps_per_second": 1.565, + "step": 20000 + }, + { + "epoch": 0.2701685529180892, + "grad_norm": 0.6592645645141602, + "learning_rate": 0.004999999943789577, + "loss": 2.896, + "step": 20100 + }, + { + "epoch": 0.2715126750719105, + "grad_norm": 1.3373818397521973, + "learning_rate": 0.0049999997751583045, + "loss": 2.8877, + "step": 20200 + }, + { + "epoch": 0.27285679722573186, + "grad_norm": 0.5964152812957764, + "learning_rate": 0.004999999494106201, + "loss": 2.8969, + "step": 20300 + }, + { + "epoch": 0.2742009193795532, + "grad_norm": 1.4561213254928589, + "learning_rate": 0.004999999100633266, + "loss": 2.8942, + "step": 20400 + }, + { + "epoch": 0.27554504153337456, + "grad_norm": 1.0474812984466553, + "learning_rate": 0.004999998594739517, + "loss": 2.9026, + "step": 20500 + }, + { + "epoch": 0.2768891636871959, + "grad_norm": 0.5717761516571045, + "learning_rate": 0.004999997976424982, + "loss": 2.8946, + "step": 20600 + }, + { + "epoch": 0.27823328584101725, + "grad_norm": 1.6426117420196533, + "learning_rate": 0.004999997245689687, + "loss": 2.881, + "step": 20700 + }, + { + "epoch": 0.27957740799483854, + "grad_norm": 2.5096325874328613, + "learning_rate": 0.004999996402533671, + "loss": 2.8809, + "step": 20800 + }, + { + "epoch": 0.2809215301486599, + "grad_norm": 1.026366949081421, + "learning_rate": 0.0049999954469569655, + "loss": 2.8773, + "step": 20900 + }, + { + "epoch": 0.28226565230248124, + "grad_norm": 1.2996127605438232, + "learning_rate": 0.0049999943789596135, + "loss": 2.871, + "step": 21000 + }, + { + "epoch": 0.28226565230248124, + "eval_MaskedAccuracy": 0.3880521470318861, + "eval_loss": 3.224468469619751, + "eval_runtime": 161.317, + "eval_samples_per_second": 393.486, + "eval_steps_per_second": 1.537, + "step": 21000 + }, + { + "epoch": 0.2836097744563026, + "grad_norm": 0.6892908215522766, + "learning_rate": 0.004999993198541667, + "loss": 2.8798, + "step": 21100 + }, + { + "epoch": 0.28495389661012394, + "grad_norm": 0.8045306205749512, + "learning_rate": 0.00499999190570318, + "loss": 2.8639, + "step": 21200 + }, + { + "epoch": 0.2862980187639453, + "grad_norm": 1.282244324684143, + "learning_rate": 0.004999990500444207, + "loss": 2.8754, + "step": 21300 + }, + { + "epoch": 0.28764214091776663, + "grad_norm": 0.9008962512016296, + "learning_rate": 0.00499998898276481, + "loss": 2.8698, + "step": 21400 + }, + { + "epoch": 0.2889862630715879, + "grad_norm": 0.8007209897041321, + "learning_rate": 0.00499998735266506, + "loss": 2.8647, + "step": 21500 + }, + { + "epoch": 0.29033038522540927, + "grad_norm": 1.014004111289978, + "learning_rate": 0.004999985610145036, + "loss": 2.8576, + "step": 21600 + }, + { + "epoch": 0.2916745073792306, + "grad_norm": 0.891608476638794, + "learning_rate": 0.0049999837552048105, + "loss": 2.8551, + "step": 21700 + }, + { + "epoch": 0.29301862953305197, + "grad_norm": 1.4745222330093384, + "learning_rate": 0.004999981787844464, + "loss": 2.8651, + "step": 21800 + }, + { + "epoch": 0.2943627516868733, + "grad_norm": 1.727281928062439, + "learning_rate": 0.004999979708064089, + "loss": 2.8577, + "step": 21900 + }, + { + "epoch": 0.29570687384069466, + "grad_norm": 0.5889946818351746, + "learning_rate": 0.0049999775158637745, + "loss": 2.8501, + "step": 22000 + }, + { + "epoch": 0.29570687384069466, + "eval_MaskedAccuracy": 0.3906472199993126, + "eval_loss": 3.1981027126312256, + "eval_runtime": 157.5475, + "eval_samples_per_second": 402.901, + "eval_steps_per_second": 1.574, + "step": 22000 + }, + { + "epoch": 0.29705099599451595, + "grad_norm": 1.3106350898742676, + "learning_rate": 0.00499997521124363, + "loss": 2.8498, + "step": 22100 + }, + { + "epoch": 0.2983951181483373, + "grad_norm": 0.670660674571991, + "learning_rate": 0.0049999727942037515, + "loss": 2.8471, + "step": 22200 + }, + { + "epoch": 0.29973924030215865, + "grad_norm": 0.6043007969856262, + "learning_rate": 0.004999970264744249, + "loss": 2.8529, + "step": 22300 + }, + { + "epoch": 0.30108336245598, + "grad_norm": 0.654592752456665, + "learning_rate": 0.004999967622865233, + "loss": 2.855, + "step": 22400 + }, + { + "epoch": 0.30242748460980134, + "grad_norm": 0.7757537364959717, + "learning_rate": 0.004999964868566834, + "loss": 2.8442, + "step": 22500 + }, + { + "epoch": 0.3037716067636227, + "grad_norm": 0.6157627105712891, + "learning_rate": 0.004999962001849164, + "loss": 2.8504, + "step": 22600 + }, + { + "epoch": 0.30511572891744404, + "grad_norm": 0.5535680055618286, + "learning_rate": 0.004999959022712354, + "loss": 2.8465, + "step": 22700 + }, + { + "epoch": 0.30645985107126533, + "grad_norm": 1.0820826292037964, + "learning_rate": 0.0049999559311565385, + "loss": 2.8407, + "step": 22800 + }, + { + "epoch": 0.3078039732250867, + "grad_norm": 0.898114800453186, + "learning_rate": 0.004999952727181866, + "loss": 2.8461, + "step": 22900 + }, + { + "epoch": 0.309148095378908, + "grad_norm": 0.787267804145813, + "learning_rate": 0.0049999494107884745, + "loss": 2.8352, + "step": 23000 + }, + { + "epoch": 0.309148095378908, + "eval_MaskedAccuracy": 0.3932447713059814, + "eval_loss": 3.172161340713501, + "eval_runtime": 157.8175, + "eval_samples_per_second": 402.212, + "eval_steps_per_second": 1.571, + "step": 23000 + }, + { + "epoch": 0.3104922175327294, + "grad_norm": 0.5774521827697754, + "learning_rate": 0.004999945981976511, + "loss": 2.8263, + "step": 23100 + }, + { + "epoch": 0.3118363396865507, + "grad_norm": 0.8543257117271423, + "learning_rate": 0.004999942440746131, + "loss": 2.8307, + "step": 23200 + }, + { + "epoch": 0.31318046184037207, + "grad_norm": 2.0594680309295654, + "learning_rate": 0.004999938787097496, + "loss": 2.8367, + "step": 23300 + }, + { + "epoch": 0.3145245839941934, + "grad_norm": 0.874093770980835, + "learning_rate": 0.00499993502103077, + "loss": 2.8386, + "step": 23400 + }, + { + "epoch": 0.3158687061480147, + "grad_norm": 0.5475397706031799, + "learning_rate": 0.004999931142546117, + "loss": 2.8264, + "step": 23500 + }, + { + "epoch": 0.31721282830183606, + "grad_norm": 0.8593809008598328, + "learning_rate": 0.004999927151643723, + "loss": 2.829, + "step": 23600 + }, + { + "epoch": 0.3185569504556574, + "grad_norm": 0.8034321665763855, + "learning_rate": 0.0049999230483237615, + "loss": 2.8212, + "step": 23700 + }, + { + "epoch": 0.31990107260947875, + "grad_norm": 0.6283460855484009, + "learning_rate": 0.004999918832586416, + "loss": 2.8213, + "step": 23800 + }, + { + "epoch": 0.3212451947633001, + "grad_norm": 0.7373483180999756, + "learning_rate": 0.004999914504431884, + "loss": 2.8307, + "step": 23900 + }, + { + "epoch": 0.32258931691712145, + "grad_norm": 0.5492293834686279, + "learning_rate": 0.004999910063860352, + "loss": 2.8248, + "step": 24000 + }, + { + "epoch": 0.32258931691712145, + "eval_MaskedAccuracy": 0.39530929398224096, + "eval_loss": 3.154477596282959, + "eval_runtime": 154.5504, + "eval_samples_per_second": 410.714, + "eval_steps_per_second": 1.605, + "step": 24000 + }, + { + "epoch": 0.32393343907094274, + "grad_norm": 0.599720299243927, + "learning_rate": 0.004999905510872023, + "loss": 2.8143, + "step": 24100 + }, + { + "epoch": 0.3252775612247641, + "grad_norm": 1.1359796524047852, + "learning_rate": 0.004999900845467094, + "loss": 2.8142, + "step": 24200 + }, + { + "epoch": 0.32662168337858544, + "grad_norm": 1.4440075159072876, + "learning_rate": 0.004999896067645785, + "loss": 2.8057, + "step": 24300 + }, + { + "epoch": 0.3279658055324068, + "grad_norm": 1.1459801197052002, + "learning_rate": 0.004999891177408309, + "loss": 2.8099, + "step": 24400 + }, + { + "epoch": 0.32930992768622813, + "grad_norm": 1.4969666004180908, + "learning_rate": 0.004999886174754891, + "loss": 2.8065, + "step": 24500 + }, + { + "epoch": 0.3306540498400495, + "grad_norm": 5.83732271194458, + "learning_rate": 0.004999881059685752, + "loss": 2.8102, + "step": 24600 + }, + { + "epoch": 0.3319981719938708, + "grad_norm": 1.80928373336792, + "learning_rate": 0.004999875832201121, + "loss": 2.8057, + "step": 24700 + }, + { + "epoch": 0.3333422941476921, + "grad_norm": 0.7706397771835327, + "learning_rate": 0.004999870492301236, + "loss": 2.8147, + "step": 24800 + }, + { + "epoch": 0.33468641630151347, + "grad_norm": 1.311485767364502, + "learning_rate": 0.0049998650399863355, + "loss": 2.8115, + "step": 24900 + }, + { + "epoch": 0.3360305384553348, + "grad_norm": 2.0457215309143066, + "learning_rate": 0.004999859475256669, + "loss": 2.8031, + "step": 25000 + }, + { + "epoch": 0.3360305384553348, + "eval_MaskedAccuracy": 0.39694718273561536, + "eval_loss": 3.1437790393829346, + "eval_runtime": 153.8578, + "eval_samples_per_second": 412.563, + "eval_steps_per_second": 1.612, + "step": 25000 + }, + { + "epoch": 0.33737466060915616, + "grad_norm": 1.1708168983459473, + "learning_rate": 0.004999853798112488, + "loss": 2.8014, + "step": 25100 + }, + { + "epoch": 0.3387187827629775, + "grad_norm": 0.6717488169670105, + "learning_rate": 0.004999848008554047, + "loss": 2.8061, + "step": 25200 + }, + { + "epoch": 0.34006290491679886, + "grad_norm": 1.743415355682373, + "learning_rate": 0.004999842106581601, + "loss": 2.804, + "step": 25300 + }, + { + "epoch": 0.34140702707062015, + "grad_norm": 1.1857333183288574, + "learning_rate": 0.004999836092195418, + "loss": 2.8049, + "step": 25400 + }, + { + "epoch": 0.3427511492244415, + "grad_norm": 0.7652652263641357, + "learning_rate": 0.004999829965395771, + "loss": 2.7976, + "step": 25500 + }, + { + "epoch": 0.34409527137826285, + "grad_norm": 0.8573779463768005, + "learning_rate": 0.004999823726182935, + "loss": 2.7952, + "step": 25600 + }, + { + "epoch": 0.3454393935320842, + "grad_norm": 1.1807929277420044, + "learning_rate": 0.004999817374557191, + "loss": 2.8, + "step": 25700 + }, + { + "epoch": 0.34678351568590554, + "grad_norm": 0.968813419342041, + "learning_rate": 0.004999810910518828, + "loss": 2.7862, + "step": 25800 + }, + { + "epoch": 0.3481276378397269, + "grad_norm": 1.2410879135131836, + "learning_rate": 0.004999804334068139, + "loss": 2.792, + "step": 25900 + }, + { + "epoch": 0.34947175999354824, + "grad_norm": 1.0421373844146729, + "learning_rate": 0.004999797645205414, + "loss": 2.7928, + "step": 26000 + }, + { + "epoch": 0.34947175999354824, + "eval_MaskedAccuracy": 0.40042939663715493, + "eval_loss": 3.116644859313965, + "eval_runtime": 159.2355, + "eval_samples_per_second": 398.63, + "eval_steps_per_second": 1.557, + "step": 26000 + }, + { + "epoch": 0.35081588214736953, + "grad_norm": 1.1622250080108643, + "learning_rate": 0.004999790843930961, + "loss": 2.7914, + "step": 26100 + }, + { + "epoch": 0.3521600043011909, + "grad_norm": 0.9314205050468445, + "learning_rate": 0.004999783930245077, + "loss": 2.7895, + "step": 26200 + }, + { + "epoch": 0.3535041264550122, + "grad_norm": 2.6555447578430176, + "learning_rate": 0.004999776904148078, + "loss": 2.7872, + "step": 26300 + }, + { + "epoch": 0.35484824860883357, + "grad_norm": 1.0436155796051025, + "learning_rate": 0.004999769765640281, + "loss": 2.7887, + "step": 26400 + }, + { + "epoch": 0.3561923707626549, + "grad_norm": 0.901914656162262, + "learning_rate": 0.004999762514722008, + "loss": 2.788, + "step": 26500 + }, + { + "epoch": 0.35753649291647627, + "grad_norm": 1.7726536989212036, + "learning_rate": 0.004999755151393587, + "loss": 2.7863, + "step": 26600 + }, + { + "epoch": 0.3588806150702976, + "grad_norm": 0.90220046043396, + "learning_rate": 0.004999747675655358, + "loss": 2.7789, + "step": 26700 + }, + { + "epoch": 0.3602247372241189, + "grad_norm": 1.074455976486206, + "learning_rate": 0.004999740087507646, + "loss": 2.7786, + "step": 26800 + }, + { + "epoch": 0.36156885937794025, + "grad_norm": 1.2537977695465088, + "learning_rate": 0.004999732386950793, + "loss": 2.7792, + "step": 26900 + }, + { + "epoch": 0.3629129815317616, + "grad_norm": 1.4169613122940063, + "learning_rate": 0.004999724573985147, + "loss": 2.7773, + "step": 27000 + }, + { + "epoch": 0.3629129815317616, + "eval_MaskedAccuracy": 0.4007810600247028, + "eval_loss": 3.110098361968994, + "eval_runtime": 157.997, + "eval_samples_per_second": 401.754, + "eval_steps_per_second": 1.57, + "step": 27000 + }, + { + "epoch": 0.36425710368558295, + "grad_norm": 0.6324722170829773, + "learning_rate": 0.004999716648611063, + "loss": 2.7757, + "step": 27100 + }, + { + "epoch": 0.3656012258394043, + "grad_norm": 5.492055416107178, + "learning_rate": 0.004999708610828894, + "loss": 2.7697, + "step": 27200 + }, + { + "epoch": 0.36694534799322565, + "grad_norm": 0.8834023475646973, + "learning_rate": 0.004999700460639006, + "loss": 2.7714, + "step": 27300 + }, + { + "epoch": 0.36828947014704694, + "grad_norm": 0.8748111724853516, + "learning_rate": 0.004999692198041765, + "loss": 2.7794, + "step": 27400 + }, + { + "epoch": 0.3696335923008683, + "grad_norm": 1.2332714796066284, + "learning_rate": 0.004999683823037545, + "loss": 2.7656, + "step": 27500 + }, + { + "epoch": 0.37097771445468963, + "grad_norm": 0.7697482705116272, + "learning_rate": 0.0049996753356267255, + "loss": 2.7677, + "step": 27600 + }, + { + "epoch": 0.372321836608511, + "grad_norm": 0.9096680283546448, + "learning_rate": 0.004999666735809681, + "loss": 2.7641, + "step": 27700 + }, + { + "epoch": 0.37366595876233233, + "grad_norm": 0.5890783667564392, + "learning_rate": 0.004999658023586803, + "loss": 2.7621, + "step": 27800 + }, + { + "epoch": 0.3750100809161537, + "grad_norm": 0.7764729261398315, + "learning_rate": 0.0049996491989584835, + "loss": 2.7697, + "step": 27900 + }, + { + "epoch": 0.376354203069975, + "grad_norm": 2.1192266941070557, + "learning_rate": 0.004999640261925122, + "loss": 2.7622, + "step": 28000 + }, + { + "epoch": 0.376354203069975, + "eval_MaskedAccuracy": 0.4028772828245879, + "eval_loss": 3.092843532562256, + "eval_runtime": 159.6753, + "eval_samples_per_second": 397.532, + "eval_steps_per_second": 1.553, + "step": 28000 + }, + { + "epoch": 0.3776983252237963, + "grad_norm": 1.4101366996765137, + "learning_rate": 0.00499963121248712, + "loss": 2.7642, + "step": 28100 + }, + { + "epoch": 0.37904244737761766, + "grad_norm": 1.5181620121002197, + "learning_rate": 0.0049996220506448846, + "loss": 2.7512, + "step": 28200 + }, + { + "epoch": 0.380386569531439, + "grad_norm": 1.365559697151184, + "learning_rate": 0.004999612776398833, + "loss": 2.7688, + "step": 28300 + }, + { + "epoch": 0.38173069168526036, + "grad_norm": 1.040536880493164, + "learning_rate": 0.0049996033897493795, + "loss": 2.7537, + "step": 28400 + }, + { + "epoch": 0.3830748138390817, + "grad_norm": 1.0811822414398193, + "learning_rate": 0.004999593890696948, + "loss": 2.7639, + "step": 28500 + }, + { + "epoch": 0.38441893599290305, + "grad_norm": 1.0675724744796753, + "learning_rate": 0.004999584279241965, + "loss": 2.7556, + "step": 28600 + }, + { + "epoch": 0.38576305814672435, + "grad_norm": 0.6620686650276184, + "learning_rate": 0.0049995745553848645, + "loss": 2.7583, + "step": 28700 + }, + { + "epoch": 0.3871071803005457, + "grad_norm": 1.1735743284225464, + "learning_rate": 0.004999564719126081, + "loss": 2.7482, + "step": 28800 + }, + { + "epoch": 0.38845130245436704, + "grad_norm": 0.5812009572982788, + "learning_rate": 0.004999554770466058, + "loss": 2.7443, + "step": 28900 + }, + { + "epoch": 0.3897954246081884, + "grad_norm": 1.6214362382888794, + "learning_rate": 0.0049995447094052506, + "loss": 2.7488, + "step": 29000 + }, + { + "epoch": 0.3897954246081884, + "eval_MaskedAccuracy": 0.403793286520234, + "eval_loss": 3.0827648639678955, + "eval_runtime": 160.6528, + "eval_samples_per_second": 395.113, + "eval_steps_per_second": 1.544, + "step": 29000 + }, + { + "epoch": 0.39113954676200974, + "grad_norm": 1.382157564163208, + "learning_rate": 0.004999534535944106, + "loss": 2.753, + "step": 29100 + }, + { + "epoch": 0.3924836689158311, + "grad_norm": 1.1490193605422974, + "learning_rate": 0.00499952425008309, + "loss": 2.7411, + "step": 29200 + }, + { + "epoch": 0.39382779106965243, + "grad_norm": 0.5846394300460815, + "learning_rate": 0.00499951385182266, + "loss": 2.7502, + "step": 29300 + }, + { + "epoch": 0.3951719132234737, + "grad_norm": 1.563256025314331, + "learning_rate": 0.0049995033411632805, + "loss": 2.7484, + "step": 29400 + }, + { + "epoch": 0.3965160353772951, + "grad_norm": 0.510226309299469, + "learning_rate": 0.004999492718105435, + "loss": 2.7418, + "step": 29500 + }, + { + "epoch": 0.3978601575311164, + "grad_norm": 0.7409377694129944, + "learning_rate": 0.0049994819826496015, + "loss": 2.7455, + "step": 29600 + }, + { + "epoch": 0.39920427968493777, + "grad_norm": 1.132575511932373, + "learning_rate": 0.004999471134796253, + "loss": 2.7337, + "step": 29700 + }, + { + "epoch": 0.4005484018387591, + "grad_norm": 0.8493718504905701, + "learning_rate": 0.004999460174545891, + "loss": 2.7374, + "step": 29800 + }, + { + "epoch": 0.40189252399258046, + "grad_norm": 1.6549121141433716, + "learning_rate": 0.004999449101898994, + "loss": 2.7345, + "step": 29900 + }, + { + "epoch": 0.4032366461464018, + "grad_norm": 1.0378655195236206, + "learning_rate": 0.004999437916856077, + "loss": 2.7411, + "step": 30000 + }, + { + "epoch": 0.4032366461464018, + "eval_MaskedAccuracy": 0.4059693369791507, + "eval_loss": 3.067436456680298, + "eval_runtime": 349.401, + "eval_samples_per_second": 181.671, + "eval_steps_per_second": 0.71, + "step": 30000 + }, + { + "epoch": 0.4045807683002231, + "grad_norm": 3.7114546298980713, + "learning_rate": 0.004999426619417629, + "loss": 2.7291, + "step": 30100 + }, + { + "epoch": 0.40592489045404445, + "grad_norm": 0.8062450289726257, + "learning_rate": 0.0049994152095841704, + "loss": 2.7369, + "step": 30200 + }, + { + "epoch": 0.4072690126078658, + "grad_norm": 0.82357257604599, + "learning_rate": 0.004999403687356217, + "loss": 2.7361, + "step": 30300 + }, + { + "epoch": 0.40861313476168715, + "grad_norm": 0.6285329461097717, + "learning_rate": 0.004999392052734278, + "loss": 2.7343, + "step": 30400 + }, + { + "epoch": 0.4099572569155085, + "grad_norm": 0.9262251853942871, + "learning_rate": 0.0049993803057188785, + "loss": 2.7323, + "step": 30500 + }, + { + "epoch": 0.41130137906932984, + "grad_norm": 1.2343696355819702, + "learning_rate": 0.004999368446310558, + "loss": 2.7286, + "step": 30600 + }, + { + "epoch": 0.41264550122315113, + "grad_norm": 0.9753904342651367, + "learning_rate": 0.004999356474509844, + "loss": 2.7363, + "step": 30700 + }, + { + "epoch": 0.4139896233769725, + "grad_norm": 0.7958431839942932, + "learning_rate": 0.004999344390317274, + "loss": 2.7298, + "step": 30800 + }, + { + "epoch": 0.41533374553079383, + "grad_norm": 4.113492965698242, + "learning_rate": 0.004999332193733396, + "loss": 2.732, + "step": 30900 + }, + { + "epoch": 0.4166778676846152, + "grad_norm": 1.0215173959732056, + "learning_rate": 0.004999319884758765, + "loss": 2.7333, + "step": 31000 + }, + { + "epoch": 0.4166778676846152, + "eval_MaskedAccuracy": 0.40677681485969963, + "eval_loss": 3.0563271045684814, + "eval_runtime": 159.104, + "eval_samples_per_second": 398.959, + "eval_steps_per_second": 1.559, + "step": 31000 + }, + { + "epoch": 0.4180219898384365, + "grad_norm": 1.9819655418395996, + "learning_rate": 0.004999307463393925, + "loss": 2.7266, + "step": 31100 + }, + { + "epoch": 0.4193661119922579, + "grad_norm": 1.4452855587005615, + "learning_rate": 0.004999294929639439, + "loss": 2.7248, + "step": 31200 + }, + { + "epoch": 0.4207102341460792, + "grad_norm": 0.7133325934410095, + "learning_rate": 0.004999282283495872, + "loss": 2.7337, + "step": 31300 + }, + { + "epoch": 0.4220543562999005, + "grad_norm": 0.930253267288208, + "learning_rate": 0.004999269524963801, + "loss": 2.7284, + "step": 31400 + }, + { + "epoch": 0.42339847845372186, + "grad_norm": 2.160118818283081, + "learning_rate": 0.004999256654043792, + "loss": 2.7228, + "step": 31500 + }, + { + "epoch": 0.4247426006075432, + "grad_norm": 1.322357416152954, + "learning_rate": 0.004999243670736428, + "loss": 2.729, + "step": 31600 + }, + { + "epoch": 0.42608672276136456, + "grad_norm": 1.2306387424468994, + "learning_rate": 0.00499923057504229, + "loss": 2.7278, + "step": 31700 + }, + { + "epoch": 0.4274308449151859, + "grad_norm": 2.193079710006714, + "learning_rate": 0.004999217366961968, + "loss": 2.7169, + "step": 31800 + }, + { + "epoch": 0.42877496706900725, + "grad_norm": 0.698665976524353, + "learning_rate": 0.0049992040464960615, + "loss": 2.717, + "step": 31900 + }, + { + "epoch": 0.43011908922282854, + "grad_norm": 0.765285313129425, + "learning_rate": 0.004999190613645174, + "loss": 2.7152, + "step": 32000 + }, + { + "epoch": 0.43011908922282854, + "eval_MaskedAccuracy": 0.40871540481758456, + "eval_loss": 3.042783260345459, + "eval_runtime": 159.0761, + "eval_samples_per_second": 399.029, + "eval_steps_per_second": 1.559, + "step": 32000 + }, + { + "epoch": 0.4314632113766499, + "grad_norm": 0.7268723845481873, + "learning_rate": 0.004999177068409907, + "loss": 2.7235, + "step": 32100 + }, + { + "epoch": 0.43280733353047124, + "grad_norm": 0.5239226818084717, + "learning_rate": 0.004999163410790871, + "loss": 2.7151, + "step": 32200 + }, + { + "epoch": 0.4341514556842926, + "grad_norm": 1.3244593143463135, + "learning_rate": 0.004999149640788679, + "loss": 2.7141, + "step": 32300 + }, + { + "epoch": 0.43549557783811393, + "grad_norm": 0.9290921688079834, + "learning_rate": 0.004999135758403953, + "loss": 2.7156, + "step": 32400 + }, + { + "epoch": 0.4368396999919353, + "grad_norm": 0.5451181530952454, + "learning_rate": 0.004999121763637319, + "loss": 2.7102, + "step": 32500 + }, + { + "epoch": 0.43818382214575663, + "grad_norm": 0.9701553583145142, + "learning_rate": 0.004999107656489411, + "loss": 2.7198, + "step": 32600 + }, + { + "epoch": 0.4395279442995779, + "grad_norm": 5.558290004730225, + "learning_rate": 0.004999093436960863, + "loss": 2.7124, + "step": 32700 + }, + { + "epoch": 0.44087206645339927, + "grad_norm": 0.5010631084442139, + "learning_rate": 0.004999079105052311, + "loss": 2.7112, + "step": 32800 + }, + { + "epoch": 0.4422161886072206, + "grad_norm": 0.5793175101280212, + "learning_rate": 0.004999064660764402, + "loss": 2.7111, + "step": 32900 + }, + { + "epoch": 0.44356031076104196, + "grad_norm": 0.9704194664955139, + "learning_rate": 0.004999050104097792, + "loss": 2.7093, + "step": 33000 + }, + { + "epoch": 0.44356031076104196, + "eval_MaskedAccuracy": 0.4106415839014644, + "eval_loss": 3.032620906829834, + "eval_runtime": 161.401, + "eval_samples_per_second": 393.281, + "eval_steps_per_second": 1.537, + "step": 33000 + }, + { + "epoch": 0.4449044329148633, + "grad_norm": 2.713127613067627, + "learning_rate": 0.004999035435053133, + "loss": 2.7105, + "step": 33100 + }, + { + "epoch": 0.44624855506868466, + "grad_norm": 0.9831237196922302, + "learning_rate": 0.0049990206536310805, + "loss": 2.7106, + "step": 33200 + }, + { + "epoch": 0.447592677222506, + "grad_norm": 1.5991238355636597, + "learning_rate": 0.004999005759832306, + "loss": 2.6998, + "step": 33300 + }, + { + "epoch": 0.4489367993763273, + "grad_norm": 1.0113264322280884, + "learning_rate": 0.004998990753657483, + "loss": 2.7157, + "step": 33400 + }, + { + "epoch": 0.45028092153014865, + "grad_norm": 1.1059205532073975, + "learning_rate": 0.0049989756351072855, + "loss": 2.7127, + "step": 33500 + }, + { + "epoch": 0.45162504368397, + "grad_norm": 0.8544152975082397, + "learning_rate": 0.0049989604041823935, + "loss": 2.7107, + "step": 33600 + }, + { + "epoch": 0.45296916583779134, + "grad_norm": 0.6586983799934387, + "learning_rate": 0.004998945060883495, + "loss": 2.7205, + "step": 33700 + }, + { + "epoch": 0.4543132879916127, + "grad_norm": 0.7723863124847412, + "learning_rate": 0.00499892960521128, + "loss": 2.694, + "step": 33800 + }, + { + "epoch": 0.45565741014543404, + "grad_norm": 1.5881659984588623, + "learning_rate": 0.00499891403716645, + "loss": 2.7052, + "step": 33900 + }, + { + "epoch": 0.45700153229925533, + "grad_norm": 1.2848504781723022, + "learning_rate": 0.004998898356749702, + "loss": 2.703, + "step": 34000 + }, + { + "epoch": 0.45700153229925533, + "eval_MaskedAccuracy": 0.4106395221691058, + "eval_loss": 3.0283474922180176, + "eval_runtime": 258.5416, + "eval_samples_per_second": 245.516, + "eval_steps_per_second": 0.959, + "step": 34000 + }, + { + "epoch": 0.4583456544530767, + "grad_norm": 0.7230987548828125, + "learning_rate": 0.0049988825639617415, + "loss": 2.6976, + "step": 34100 + }, + { + "epoch": 0.459689776606898, + "grad_norm": 0.671057403087616, + "learning_rate": 0.004998866658803282, + "loss": 2.6931, + "step": 34200 + }, + { + "epoch": 0.4610338987607194, + "grad_norm": 1.4508076906204224, + "learning_rate": 0.004998850641275041, + "loss": 2.7036, + "step": 34300 + }, + { + "epoch": 0.4623780209145407, + "grad_norm": 1.325042724609375, + "learning_rate": 0.004998834511377738, + "loss": 2.6982, + "step": 34400 + }, + { + "epoch": 0.46372214306836207, + "grad_norm": 0.49313464760780334, + "learning_rate": 0.004998818269112104, + "loss": 2.6976, + "step": 34500 + }, + { + "epoch": 0.4650662652221834, + "grad_norm": 0.7200739979743958, + "learning_rate": 0.00499880191447886, + "loss": 2.6979, + "step": 34600 + }, + { + "epoch": 0.4664103873760047, + "grad_norm": 1.495227336883545, + "learning_rate": 0.004998785447478756, + "loss": 2.6831, + "step": 34700 + }, + { + "epoch": 0.46775450952982606, + "grad_norm": 0.5236303210258484, + "learning_rate": 0.00499876886811252, + "loss": 2.6932, + "step": 34800 + }, + { + "epoch": 0.4690986316836474, + "grad_norm": 1.4335426092147827, + "learning_rate": 0.004998752176380915, + "loss": 2.6969, + "step": 34900 + }, + { + "epoch": 0.47044275383746875, + "grad_norm": 2.9952592849731445, + "learning_rate": 0.004998735372284684, + "loss": 2.7029, + "step": 35000 + }, + { + "epoch": 0.47044275383746875, + "eval_MaskedAccuracy": 0.4116071260461691, + "eval_loss": 3.016385078430176, + "eval_runtime": 156.2897, + "eval_samples_per_second": 406.143, + "eval_steps_per_second": 1.587, + "step": 35000 + }, + { + "epoch": 0.4717868759912901, + "grad_norm": 0.6854963302612305, + "learning_rate": 0.004998718455824586, + "loss": 2.6894, + "step": 35100 + }, + { + "epoch": 0.47313099814511145, + "grad_norm": 2.010087251663208, + "learning_rate": 0.004998701427001374, + "loss": 2.691, + "step": 35200 + }, + { + "epoch": 0.47447512029893274, + "grad_norm": 0.6356353163719177, + "learning_rate": 0.004998684285815833, + "loss": 2.69, + "step": 35300 + }, + { + "epoch": 0.4758192424527541, + "grad_norm": 0.5063838958740234, + "learning_rate": 0.004998667032268725, + "loss": 2.6895, + "step": 35400 + }, + { + "epoch": 0.47716336460657544, + "grad_norm": 0.8183920979499817, + "learning_rate": 0.0049986496663608285, + "loss": 2.6951, + "step": 35500 + }, + { + "epoch": 0.4785074867603968, + "grad_norm": 0.9003168940544128, + "learning_rate": 0.004998632188092928, + "loss": 2.6822, + "step": 35600 + }, + { + "epoch": 0.47985160891421813, + "grad_norm": 0.8852120041847229, + "learning_rate": 0.004998614597465808, + "loss": 2.6806, + "step": 35700 + }, + { + "epoch": 0.4811957310680395, + "grad_norm": 0.7671552896499634, + "learning_rate": 0.004998596894480259, + "loss": 2.6789, + "step": 35800 + }, + { + "epoch": 0.4825398532218608, + "grad_norm": 1.260359287261963, + "learning_rate": 0.004998579079137085, + "loss": 2.679, + "step": 35900 + }, + { + "epoch": 0.4838839753756821, + "grad_norm": 0.5925015807151794, + "learning_rate": 0.004998561151437081, + "loss": 2.69, + "step": 36000 + }, + { + "epoch": 0.4838839753756821, + "eval_MaskedAccuracy": 0.41254465747964864, + "eval_loss": 3.0127551555633545, + "eval_runtime": 154.2797, + "eval_samples_per_second": 411.434, + "eval_steps_per_second": 1.607, + "step": 36000 + }, + { + "epoch": 0.48522809752950347, + "grad_norm": 0.7644244432449341, + "learning_rate": 0.004998543111381064, + "loss": 2.6828, + "step": 36100 + }, + { + "epoch": 0.4865722196833248, + "grad_norm": 0.8469159007072449, + "learning_rate": 0.004998524958969839, + "loss": 2.6878, + "step": 36200 + }, + { + "epoch": 0.48791634183714616, + "grad_norm": 0.847205638885498, + "learning_rate": 0.004998506694204228, + "loss": 2.6818, + "step": 36300 + }, + { + "epoch": 0.4892604639909675, + "grad_norm": 0.581739604473114, + "learning_rate": 0.004998488317085059, + "loss": 2.6741, + "step": 36400 + }, + { + "epoch": 0.49060458614478886, + "grad_norm": 2.133725881576538, + "learning_rate": 0.004998469827613156, + "loss": 2.6786, + "step": 36500 + }, + { + "epoch": 0.4919487082986102, + "grad_norm": 1.3259893655776978, + "learning_rate": 0.00499845122578935, + "loss": 2.6819, + "step": 36600 + }, + { + "epoch": 0.4932928304524315, + "grad_norm": 1.4917380809783936, + "learning_rate": 0.004998432511614469, + "loss": 2.6802, + "step": 36700 + }, + { + "epoch": 0.49463695260625284, + "grad_norm": 2.2140398025512695, + "learning_rate": 0.004998413685089377, + "loss": 2.6767, + "step": 36800 + }, + { + "epoch": 0.4959810747600742, + "grad_norm": 0.7825784683227539, + "learning_rate": 0.004998394746214914, + "loss": 2.6801, + "step": 36900 + }, + { + "epoch": 0.49732519691389554, + "grad_norm": 2.3250296115875244, + "learning_rate": 0.00499837569499193, + "loss": 2.6736, + "step": 37000 + }, + { + "epoch": 0.49732519691389554, + "eval_MaskedAccuracy": 0.413791745700555, + "eval_loss": 3.001739740371704, + "eval_runtime": 155.081, + "eval_samples_per_second": 409.309, + "eval_steps_per_second": 1.599, + "step": 37000 + }, + { + "epoch": 0.4986693190677169, + "grad_norm": 0.9509634971618652, + "learning_rate": 0.004998356531421287, + "loss": 2.6815, + "step": 37100 + }, + { + "epoch": 0.5000134412215382, + "grad_norm": 0.5249235033988953, + "learning_rate": 0.004998337255503845, + "loss": 2.6761, + "step": 37200 + }, + { + "epoch": 0.5013575633753595, + "grad_norm": 0.492450088262558, + "learning_rate": 0.004998317867240472, + "loss": 2.6713, + "step": 37300 + }, + { + "epoch": 0.5027016855291809, + "grad_norm": 0.5649069547653198, + "learning_rate": 0.00499829836663204, + "loss": 2.6718, + "step": 37400 + }, + { + "epoch": 0.5040458076830022, + "grad_norm": 1.028743028640747, + "learning_rate": 0.004998278753679443, + "loss": 2.6786, + "step": 37500 + }, + { + "epoch": 0.5053899298368236, + "grad_norm": 1.2656359672546387, + "learning_rate": 0.004998259028383549, + "loss": 2.666, + "step": 37600 + }, + { + "epoch": 0.5067340519906449, + "grad_norm": 1.0689085721969604, + "learning_rate": 0.004998239190745246, + "loss": 2.6705, + "step": 37700 + }, + { + "epoch": 0.5080781741444662, + "grad_norm": 1.323631763458252, + "learning_rate": 0.00499821924076543, + "loss": 2.6748, + "step": 37800 + }, + { + "epoch": 0.5094222962982876, + "grad_norm": 1.0238298177719116, + "learning_rate": 0.004998199178445008, + "loss": 2.6692, + "step": 37900 + }, + { + "epoch": 0.5107664184521089, + "grad_norm": 1.5579122304916382, + "learning_rate": 0.0049981790037848806, + "loss": 2.6692, + "step": 38000 + }, + { + "epoch": 0.5107664184521089, + "eval_MaskedAccuracy": 0.41454185677096544, + "eval_loss": 2.9944686889648438, + "eval_runtime": 154.5873, + "eval_samples_per_second": 410.616, + "eval_steps_per_second": 1.604, + "step": 38000 + }, + { + "epoch": 0.5121105406059303, + "grad_norm": 0.9145413041114807, + "learning_rate": 0.004998158716785951, + "loss": 2.6664, + "step": 38100 + }, + { + "epoch": 0.5134546627597516, + "grad_norm": 1.3004742860794067, + "learning_rate": 0.0049981383174491365, + "loss": 2.6749, + "step": 38200 + }, + { + "epoch": 0.5147987849135729, + "grad_norm": 1.8650057315826416, + "learning_rate": 0.0049981178057753595, + "loss": 2.6799, + "step": 38300 + }, + { + "epoch": 0.5161429070673943, + "grad_norm": 1.2665841579437256, + "learning_rate": 0.004998097181765546, + "loss": 2.6638, + "step": 38400 + }, + { + "epoch": 0.5174870292212156, + "grad_norm": 0.8943463563919067, + "learning_rate": 0.004998076445420617, + "loss": 2.6612, + "step": 38500 + }, + { + "epoch": 0.518831151375037, + "grad_norm": 1.6985851526260376, + "learning_rate": 0.004998055596741511, + "loss": 2.6653, + "step": 38600 + }, + { + "epoch": 0.5201752735288583, + "grad_norm": 2.1343910694122314, + "learning_rate": 0.004998034635729168, + "loss": 2.6642, + "step": 38700 + }, + { + "epoch": 0.5215193956826797, + "grad_norm": 1.4548419713974, + "learning_rate": 0.004998013562384535, + "loss": 2.6524, + "step": 38800 + }, + { + "epoch": 0.522863517836501, + "grad_norm": 2.0018160343170166, + "learning_rate": 0.004997992376708557, + "loss": 2.6574, + "step": 38900 + }, + { + "epoch": 0.5242076399903223, + "grad_norm": 1.4500235319137573, + "learning_rate": 0.004997971078702187, + "loss": 2.6614, + "step": 39000 + }, + { + "epoch": 0.5242076399903223, + "eval_MaskedAccuracy": 0.4153363544702115, + "eval_loss": 2.9864044189453125, + "eval_runtime": 154.3717, + "eval_samples_per_second": 411.189, + "eval_steps_per_second": 1.607, + "step": 39000 + }, + { + "epoch": 0.5255517621441437, + "grad_norm": 1.6406347751617432, + "learning_rate": 0.0049979496683663855, + "loss": 2.6609, + "step": 39100 + }, + { + "epoch": 0.526895884297965, + "grad_norm": 1.1125140190124512, + "learning_rate": 0.004997928145702119, + "loss": 2.6553, + "step": 39200 + }, + { + "epoch": 0.5282400064517864, + "grad_norm": 4.977079391479492, + "learning_rate": 0.004997906510710358, + "loss": 2.6627, + "step": 39300 + }, + { + "epoch": 0.5295841286056077, + "grad_norm": 0.5258601903915405, + "learning_rate": 0.004997884763392075, + "loss": 2.6664, + "step": 39400 + }, + { + "epoch": 0.5309282507594291, + "grad_norm": 0.5638339519500732, + "learning_rate": 0.004997862903748257, + "loss": 2.6565, + "step": 39500 + }, + { + "epoch": 0.5322723729132504, + "grad_norm": 0.5276058912277222, + "learning_rate": 0.00499784093177988, + "loss": 2.6583, + "step": 39600 + }, + { + "epoch": 0.5336164950670717, + "grad_norm": 0.7912238836288452, + "learning_rate": 0.004997818847487944, + "loss": 2.6595, + "step": 39700 + }, + { + "epoch": 0.534960617220893, + "grad_norm": 0.949805736541748, + "learning_rate": 0.004997796650873429, + "loss": 2.6698, + "step": 39800 + }, + { + "epoch": 0.5363047393747143, + "grad_norm": 1.7956501245498657, + "learning_rate": 0.004997774341937344, + "loss": 2.6617, + "step": 39900 + }, + { + "epoch": 0.5376488615285357, + "grad_norm": 0.5093005895614624, + "learning_rate": 0.004997751920680701, + "loss": 2.6498, + "step": 40000 + }, + { + "epoch": 0.5376488615285357, + "eval_MaskedAccuracy": 0.4163031669419539, + "eval_loss": 2.9814693927764893, + "eval_runtime": 153.8302, + "eval_samples_per_second": 412.637, + "eval_steps_per_second": 1.612, + "step": 40000 + }, + { + "epoch": 0.538992983682357, + "grad_norm": 1.8199607133865356, + "learning_rate": 0.004997729387104499, + "loss": 2.6542, + "step": 40100 + }, + { + "epoch": 0.5403371058361784, + "grad_norm": 1.2168132066726685, + "learning_rate": 0.004997706741209757, + "loss": 2.6484, + "step": 40200 + }, + { + "epoch": 0.5416812279899997, + "grad_norm": 1.862183928489685, + "learning_rate": 0.004997683982997499, + "loss": 2.6529, + "step": 40300 + }, + { + "epoch": 0.543025350143821, + "grad_norm": 1.591640830039978, + "learning_rate": 0.004997661112468743, + "loss": 2.6541, + "step": 40400 + }, + { + "epoch": 0.5443694722976424, + "grad_norm": 3.0422515869140625, + "learning_rate": 0.004997638129624528, + "loss": 2.6493, + "step": 40500 + }, + { + "epoch": 0.5457135944514637, + "grad_norm": 0.5764662027359009, + "learning_rate": 0.004997615034465888, + "loss": 2.6454, + "step": 40600 + }, + { + "epoch": 0.5470577166052851, + "grad_norm": 0.7897889018058777, + "learning_rate": 0.004997591826993861, + "loss": 2.6526, + "step": 40700 + }, + { + "epoch": 0.5484018387591064, + "grad_norm": 0.5583525896072388, + "learning_rate": 0.004997568507209492, + "loss": 2.6488, + "step": 40800 + }, + { + "epoch": 0.5497459609129277, + "grad_norm": 1.2979474067687988, + "learning_rate": 0.004997545075113834, + "loss": 2.6439, + "step": 40900 + }, + { + "epoch": 0.5510900830667491, + "grad_norm": 2.09676456451416, + "learning_rate": 0.0049975215307079355, + "loss": 2.656, + "step": 41000 + }, + { + "epoch": 0.5510900830667491, + "eval_MaskedAccuracy": 0.4157399235251504, + "eval_loss": 2.981074571609497, + "eval_runtime": 154.1369, + "eval_samples_per_second": 411.816, + "eval_steps_per_second": 1.609, + "step": 41000 + }, + { + "epoch": 0.5524342052205704, + "grad_norm": 3.9577839374542236, + "learning_rate": 0.0049974978739928694, + "loss": 2.6505, + "step": 41100 + }, + { + "epoch": 0.5537783273743918, + "grad_norm": 0.5468003153800964, + "learning_rate": 0.004997474104969696, + "loss": 2.6442, + "step": 41200 + }, + { + "epoch": 0.5551224495282131, + "grad_norm": 0.5739485621452332, + "learning_rate": 0.004997450223639483, + "loss": 2.6446, + "step": 41300 + }, + { + "epoch": 0.5564665716820345, + "grad_norm": 1.8400816917419434, + "learning_rate": 0.00499742623000331, + "loss": 2.6448, + "step": 41400 + }, + { + "epoch": 0.5578106938358558, + "grad_norm": 1.2535688877105713, + "learning_rate": 0.004997402124062256, + "loss": 2.6482, + "step": 41500 + }, + { + "epoch": 0.5591548159896771, + "grad_norm": 1.228569746017456, + "learning_rate": 0.0049973779058174106, + "loss": 2.6498, + "step": 41600 + }, + { + "epoch": 0.5604989381434985, + "grad_norm": 1.4011684656143188, + "learning_rate": 0.004997353575269862, + "loss": 2.643, + "step": 41700 + }, + { + "epoch": 0.5618430602973198, + "grad_norm": 0.5663438439369202, + "learning_rate": 0.004997329132420707, + "loss": 2.6515, + "step": 41800 + }, + { + "epoch": 0.5631871824511412, + "grad_norm": 0.5198083519935608, + "learning_rate": 0.004997304577271046, + "loss": 2.6529, + "step": 41900 + }, + { + "epoch": 0.5645313046049625, + "grad_norm": 3.9535000324249268, + "learning_rate": 0.004997279909821991, + "loss": 2.6393, + "step": 42000 + }, + { + "epoch": 0.5645313046049625, + "eval_MaskedAccuracy": 0.4173549594387652, + "eval_loss": 2.971799850463867, + "eval_runtime": 155.0862, + "eval_samples_per_second": 409.295, + "eval_steps_per_second": 1.599, + "step": 42000 + }, + { + "epoch": 0.5658754267587839, + "grad_norm": 2.1890416145324707, + "learning_rate": 0.004997255130074655, + "loss": 2.6495, + "step": 42100 + }, + { + "epoch": 0.5672195489126052, + "grad_norm": 1.686848521232605, + "learning_rate": 0.004997230238030147, + "loss": 2.6365, + "step": 42200 + }, + { + "epoch": 0.5685636710664265, + "grad_norm": 1.8088412284851074, + "learning_rate": 0.004997205233689578, + "loss": 2.6421, + "step": 42300 + }, + { + "epoch": 0.5699077932202479, + "grad_norm": 1.7008030414581299, + "learning_rate": 0.004997180117054094, + "loss": 2.6371, + "step": 42400 + }, + { + "epoch": 0.5712519153740692, + "grad_norm": 1.355776309967041, + "learning_rate": 0.0049971548881248195, + "loss": 2.6357, + "step": 42500 + }, + { + "epoch": 0.5725960375278906, + "grad_norm": 1.5508376359939575, + "learning_rate": 0.004997129546902893, + "loss": 2.6438, + "step": 42600 + }, + { + "epoch": 0.5739401596817119, + "grad_norm": 0.7327139377593994, + "learning_rate": 0.004997104093389453, + "loss": 2.6318, + "step": 42700 + }, + { + "epoch": 0.5752842818355333, + "grad_norm": 0.8198758959770203, + "learning_rate": 0.004997078527585654, + "loss": 2.6407, + "step": 42800 + }, + { + "epoch": 0.5766284039893546, + "grad_norm": 1.41826331615448, + "learning_rate": 0.004997052849492641, + "loss": 2.6364, + "step": 42900 + }, + { + "epoch": 0.5779725261431758, + "grad_norm": 0.8289880156517029, + "learning_rate": 0.0049970270591115735, + "loss": 2.6494, + "step": 43000 + }, + { + "epoch": 0.5779725261431758, + "eval_MaskedAccuracy": 0.41913729394217447, + "eval_loss": 2.955897092819214, + "eval_runtime": 154.7124, + "eval_samples_per_second": 410.284, + "eval_steps_per_second": 1.603, + "step": 43000 + }, + { + "epoch": 0.5793166482969972, + "grad_norm": 1.136483907699585, + "learning_rate": 0.0049970011564436055, + "loss": 2.6364, + "step": 43100 + }, + { + "epoch": 0.5806607704508185, + "grad_norm": 1.627422571182251, + "learning_rate": 0.004996975141489916, + "loss": 2.6424, + "step": 43200 + }, + { + "epoch": 0.58200489260464, + "grad_norm": 1.4853564500808716, + "learning_rate": 0.004996949014251676, + "loss": 2.6462, + "step": 43300 + }, + { + "epoch": 0.5833490147584612, + "grad_norm": 1.7882176637649536, + "learning_rate": 0.0049969227747300535, + "loss": 2.6278, + "step": 43400 + }, + { + "epoch": 0.5846931369122826, + "grad_norm": 2.825515031814575, + "learning_rate": 0.004996896422926242, + "loss": 2.6412, + "step": 43500 + }, + { + "epoch": 0.5860372590661039, + "grad_norm": 0.4745466709136963, + "learning_rate": 0.004996869958841418, + "loss": 2.6332, + "step": 43600 + }, + { + "epoch": 0.5873813812199252, + "grad_norm": 1.2661569118499756, + "learning_rate": 0.00499684338247678, + "loss": 2.6325, + "step": 43700 + }, + { + "epoch": 0.5887255033737466, + "grad_norm": 1.2799876928329468, + "learning_rate": 0.004996816693833525, + "loss": 2.6413, + "step": 43800 + }, + { + "epoch": 0.5900696255275679, + "grad_norm": 0.7999758720397949, + "learning_rate": 0.004996789892912855, + "loss": 2.6256, + "step": 43900 + }, + { + "epoch": 0.5914137476813893, + "grad_norm": 0.7396803498268127, + "learning_rate": 0.004996762979715977, + "loss": 2.6313, + "step": 44000 + }, + { + "epoch": 0.5914137476813893, + "eval_MaskedAccuracy": 0.41881557303141803, + "eval_loss": 2.9574387073516846, + "eval_runtime": 156.1034, + "eval_samples_per_second": 406.628, + "eval_steps_per_second": 1.589, + "step": 44000 + }, + { + "epoch": 0.5927578698352106, + "grad_norm": 1.06534743309021, + "learning_rate": 0.00499673595424411, + "loss": 2.6311, + "step": 44100 + }, + { + "epoch": 0.5941019919890319, + "grad_norm": 1.120554804801941, + "learning_rate": 0.004996708816498461, + "loss": 2.6289, + "step": 44200 + }, + { + "epoch": 0.5954461141428533, + "grad_norm": 0.5264089703559875, + "learning_rate": 0.004996681566480261, + "loss": 2.6324, + "step": 44300 + }, + { + "epoch": 0.5967902362966746, + "grad_norm": 2.0513694286346436, + "learning_rate": 0.004996654204190736, + "loss": 2.6346, + "step": 44400 + }, + { + "epoch": 0.598134358450496, + "grad_norm": 1.3531913757324219, + "learning_rate": 0.004996626729631115, + "loss": 2.6351, + "step": 44500 + }, + { + "epoch": 0.5994784806043173, + "grad_norm": 0.9898929595947266, + "learning_rate": 0.004996599142802638, + "loss": 2.6232, + "step": 44600 + }, + { + "epoch": 0.6008226027581387, + "grad_norm": 0.674278736114502, + "learning_rate": 0.00499657144370655, + "loss": 2.6227, + "step": 44700 + }, + { + "epoch": 0.60216672491196, + "grad_norm": 0.5685105323791504, + "learning_rate": 0.004996543632344098, + "loss": 2.6306, + "step": 44800 + }, + { + "epoch": 0.6035108470657813, + "grad_norm": 0.5396636724472046, + "learning_rate": 0.004996515708716534, + "loss": 2.6351, + "step": 44900 + }, + { + "epoch": 0.6048549692196027, + "grad_norm": 1.0353196859359741, + "learning_rate": 0.004996487672825116, + "loss": 2.6332, + "step": 45000 + }, + { + "epoch": 0.6048549692196027, + "eval_MaskedAccuracy": 0.4208831607484112, + "eval_loss": 2.9444289207458496, + "eval_runtime": 157.2902, + "eval_samples_per_second": 403.56, + "eval_steps_per_second": 1.577, + "step": 45000 + }, + { + "epoch": 0.606199091373424, + "grad_norm": 0.9090419411659241, + "learning_rate": 0.004996459524671107, + "loss": 2.6303, + "step": 45100 + }, + { + "epoch": 0.6075432135272454, + "grad_norm": 0.5557056069374084, + "learning_rate": 0.004996431264255778, + "loss": 2.6253, + "step": 45200 + }, + { + "epoch": 0.6088873356810667, + "grad_norm": 1.1961573362350464, + "learning_rate": 0.004996402891580404, + "loss": 2.6249, + "step": 45300 + }, + { + "epoch": 0.6102314578348881, + "grad_norm": 0.7062890529632568, + "learning_rate": 0.00499637440664626, + "loss": 2.6264, + "step": 45400 + }, + { + "epoch": 0.6115755799887094, + "grad_norm": 0.7188198566436768, + "learning_rate": 0.004996345809454634, + "loss": 2.63, + "step": 45500 + }, + { + "epoch": 0.6129197021425307, + "grad_norm": 1.2873228788375854, + "learning_rate": 0.0049963171000068085, + "loss": 2.6234, + "step": 45600 + }, + { + "epoch": 0.6142638242963521, + "grad_norm": 1.0937819480895996, + "learning_rate": 0.004996288278304078, + "loss": 2.6309, + "step": 45700 + }, + { + "epoch": 0.6156079464501734, + "grad_norm": 0.9512178897857666, + "learning_rate": 0.004996259344347743, + "loss": 2.6194, + "step": 45800 + }, + { + "epoch": 0.6169520686039948, + "grad_norm": 0.4616401791572571, + "learning_rate": 0.004996230298139107, + "loss": 2.6233, + "step": 45900 + }, + { + "epoch": 0.618296190757816, + "grad_norm": 0.6116599440574646, + "learning_rate": 0.004996201139679473, + "loss": 2.6205, + "step": 46000 + }, + { + "epoch": 0.618296190757816, + "eval_MaskedAccuracy": 0.4190083845851312, + "eval_loss": 2.9517083168029785, + "eval_runtime": 142.8409, + "eval_samples_per_second": 444.382, + "eval_steps_per_second": 1.736, + "step": 46000 + }, + { + "epoch": 0.6196403129116375, + "grad_norm": 1.2493271827697754, + "learning_rate": 0.004996171868970165, + "loss": 2.6293, + "step": 46100 + }, + { + "epoch": 0.6209844350654588, + "grad_norm": 1.9406765699386597, + "learning_rate": 0.004996142486012493, + "loss": 2.626, + "step": 46200 + }, + { + "epoch": 0.62232855721928, + "grad_norm": 0.6907238960266113, + "learning_rate": 0.004996112990807786, + "loss": 2.6141, + "step": 46300 + }, + { + "epoch": 0.6236726793731014, + "grad_norm": 0.5618690848350525, + "learning_rate": 0.0049960833833573755, + "loss": 2.6151, + "step": 46400 + }, + { + "epoch": 0.6250168015269227, + "grad_norm": 1.8456848859786987, + "learning_rate": 0.004996053663662593, + "loss": 2.6158, + "step": 46500 + }, + { + "epoch": 0.6263609236807441, + "grad_norm": 0.4822538495063782, + "learning_rate": 0.004996023831724768, + "loss": 2.6219, + "step": 46600 + }, + { + "epoch": 0.6277050458345654, + "grad_norm": 0.4805866777896881, + "learning_rate": 0.004995993887545261, + "loss": 2.6134, + "step": 46700 + }, + { + "epoch": 0.6290491679883868, + "grad_norm": 1.5017101764678955, + "learning_rate": 0.004995963831125411, + "loss": 2.6146, + "step": 46800 + }, + { + "epoch": 0.6303932901422081, + "grad_norm": 1.2550396919250488, + "learning_rate": 0.00499593366246658, + "loss": 2.6176, + "step": 46900 + }, + { + "epoch": 0.6317374122960294, + "grad_norm": 1.6678540706634521, + "learning_rate": 0.004995903381570121, + "loss": 2.6095, + "step": 47000 + }, + { + "epoch": 0.6317374122960294, + "eval_MaskedAccuracy": 0.42101418324632395, + "eval_loss": 2.939000129699707, + "eval_runtime": 157.4312, + "eval_samples_per_second": 403.198, + "eval_steps_per_second": 1.575, + "step": 47000 + }, + { + "epoch": 0.6330815344498508, + "grad_norm": 0.7519531846046448, + "learning_rate": 0.004995872988437397, + "loss": 2.6146, + "step": 47100 + }, + { + "epoch": 0.6344256566036721, + "grad_norm": 0.5543345808982849, + "learning_rate": 0.004995842483069782, + "loss": 2.6165, + "step": 47200 + }, + { + "epoch": 0.6357697787574935, + "grad_norm": 0.8774918913841248, + "learning_rate": 0.004995811865468641, + "loss": 2.617, + "step": 47300 + }, + { + "epoch": 0.6371139009113148, + "grad_norm": 0.5010842680931091, + "learning_rate": 0.004995781135635373, + "loss": 2.622, + "step": 47400 + }, + { + "epoch": 0.6384580230651361, + "grad_norm": 0.537598192691803, + "learning_rate": 0.0049957502935713445, + "loss": 2.6228, + "step": 47500 + }, + { + "epoch": 0.6398021452189575, + "grad_norm": 0.65438312292099, + "learning_rate": 0.004995719339277949, + "loss": 2.6154, + "step": 47600 + }, + { + "epoch": 0.6411462673727788, + "grad_norm": 0.726901113986969, + "learning_rate": 0.004995688272756588, + "loss": 2.6126, + "step": 47700 + }, + { + "epoch": 0.6424903895266002, + "grad_norm": 1.2524980306625366, + "learning_rate": 0.004995657094008657, + "loss": 2.6075, + "step": 47800 + }, + { + "epoch": 0.6438345116804215, + "grad_norm": 2.223083257675171, + "learning_rate": 0.004995625803035562, + "loss": 2.6113, + "step": 47900 + }, + { + "epoch": 0.6451786338342429, + "grad_norm": 0.7421071529388428, + "learning_rate": 0.004995594399838711, + "loss": 2.6088, + "step": 48000 + }, + { + "epoch": 0.6451786338342429, + "eval_MaskedAccuracy": 0.42257422566160907, + "eval_loss": 2.9292173385620117, + "eval_runtime": 154.814, + "eval_samples_per_second": 410.015, + "eval_steps_per_second": 1.602, + "step": 48000 + }, + { + "epoch": 0.6465227559880642, + "grad_norm": 0.6506995558738708, + "learning_rate": 0.004995562884419525, + "loss": 2.6105, + "step": 48100 + }, + { + "epoch": 0.6478668781418855, + "grad_norm": 1.0766329765319824, + "learning_rate": 0.004995531256779415, + "loss": 2.6139, + "step": 48200 + }, + { + "epoch": 0.6492110002957069, + "grad_norm": 1.1762914657592773, + "learning_rate": 0.00499549951691981, + "loss": 2.6078, + "step": 48300 + }, + { + "epoch": 0.6505551224495282, + "grad_norm": 2.2142832279205322, + "learning_rate": 0.004995467664842145, + "loss": 2.6093, + "step": 48400 + }, + { + "epoch": 0.6518992446033496, + "grad_norm": 1.1189358234405518, + "learning_rate": 0.00499543570054785, + "loss": 2.6141, + "step": 48500 + }, + { + "epoch": 0.6532433667571709, + "grad_norm": 0.7989232540130615, + "learning_rate": 0.004995403624038362, + "loss": 2.6155, + "step": 48600 + }, + { + "epoch": 0.6545874889109923, + "grad_norm": 1.5602754354476929, + "learning_rate": 0.004995371435315131, + "loss": 2.6137, + "step": 48700 + }, + { + "epoch": 0.6559316110648136, + "grad_norm": 0.7082908749580383, + "learning_rate": 0.004995339134379602, + "loss": 2.6138, + "step": 48800 + }, + { + "epoch": 0.6572757332186349, + "grad_norm": 1.3697164058685303, + "learning_rate": 0.004995306721233229, + "loss": 2.6094, + "step": 48900 + }, + { + "epoch": 0.6586198553724563, + "grad_norm": 0.5310668349266052, + "learning_rate": 0.004995274195877492, + "loss": 2.6069, + "step": 49000 + }, + { + "epoch": 0.6586198553724563, + "eval_MaskedAccuracy": 0.42236824728278766, + "eval_loss": 2.928412437438965, + "eval_runtime": 155.9626, + "eval_samples_per_second": 406.995, + "eval_steps_per_second": 1.59, + "step": 49000 + }, + { + "epoch": 0.6599639775262776, + "grad_norm": 1.284600019454956, + "learning_rate": 0.00499524155831384, + "loss": 2.6058, + "step": 49100 + }, + { + "epoch": 0.661308099680099, + "grad_norm": 0.44148510694503784, + "learning_rate": 0.004995208808543738, + "loss": 2.6046, + "step": 49200 + }, + { + "epoch": 0.6626522218339203, + "grad_norm": 1.5299873352050781, + "learning_rate": 0.004995175946568674, + "loss": 2.6057, + "step": 49300 + }, + { + "epoch": 0.6639963439877417, + "grad_norm": 0.8762273192405701, + "learning_rate": 0.00499514297239013, + "loss": 2.6049, + "step": 49400 + }, + { + "epoch": 0.665340466141563, + "grad_norm": 0.6929760575294495, + "learning_rate": 0.004995109886009586, + "loss": 2.6117, + "step": 49500 + }, + { + "epoch": 0.6666845882953842, + "grad_norm": 0.9134759306907654, + "learning_rate": 0.004995076687428528, + "loss": 2.6008, + "step": 49600 + }, + { + "epoch": 0.6680287104492056, + "grad_norm": 1.4550007581710815, + "learning_rate": 0.004995043376648459, + "loss": 2.6087, + "step": 49700 + }, + { + "epoch": 0.6693728326030269, + "grad_norm": 1.1609247922897339, + "learning_rate": 0.004995009953670882, + "loss": 2.6043, + "step": 49800 + }, + { + "epoch": 0.6707169547568483, + "grad_norm": 0.5234310030937195, + "learning_rate": 0.004994976418497298, + "loss": 2.6068, + "step": 49900 + }, + { + "epoch": 0.6720610769106696, + "grad_norm": 1.2967578172683716, + "learning_rate": 0.004994942771129218, + "loss": 2.6029, + "step": 50000 + }, + { + "epoch": 0.6720610769106696, + "eval_MaskedAccuracy": 0.4233113243940233, + "eval_loss": 2.9228169918060303, + "eval_runtime": 161.4288, + "eval_samples_per_second": 393.213, + "eval_steps_per_second": 1.536, + "step": 50000 + }, + { + "epoch": 0.673405199064491, + "grad_norm": 1.6317380666732788, + "learning_rate": 0.00499490901156816, + "loss": 2.6123, + "step": 50100 + }, + { + "epoch": 0.6747493212183123, + "grad_norm": 0.8617536425590515, + "learning_rate": 0.004994875139815648, + "loss": 2.5964, + "step": 50200 + }, + { + "epoch": 0.6760934433721336, + "grad_norm": 1.1220712661743164, + "learning_rate": 0.004994841155873204, + "loss": 2.5989, + "step": 50300 + }, + { + "epoch": 0.677437565525955, + "grad_norm": 0.5247998237609863, + "learning_rate": 0.004994807059742355, + "loss": 2.598, + "step": 50400 + }, + { + "epoch": 0.6787816876797763, + "grad_norm": 1.532121181488037, + "learning_rate": 0.004994772851424643, + "loss": 2.5946, + "step": 50500 + }, + { + "epoch": 0.6801258098335977, + "grad_norm": 1.0279977321624756, + "learning_rate": 0.004994738530921611, + "loss": 2.6086, + "step": 50600 + }, + { + "epoch": 0.681469931987419, + "grad_norm": 0.8515674471855164, + "learning_rate": 0.0049947040982348005, + "loss": 2.5934, + "step": 50700 + }, + { + "epoch": 0.6828140541412403, + "grad_norm": 0.958196222782135, + "learning_rate": 0.004994669553365766, + "loss": 2.597, + "step": 50800 + }, + { + "epoch": 0.6841581762950617, + "grad_norm": 0.795566737651825, + "learning_rate": 0.0049946348963160634, + "loss": 2.6028, + "step": 50900 + }, + { + "epoch": 0.685502298448883, + "grad_norm": 1.396265983581543, + "learning_rate": 0.0049946001270872575, + "loss": 2.5972, + "step": 51000 + }, + { + "epoch": 0.685502298448883, + "eval_MaskedAccuracy": 0.42380398111438183, + "eval_loss": 2.9168331623077393, + "eval_runtime": 155.1031, + "eval_samples_per_second": 409.25, + "eval_steps_per_second": 1.599, + "step": 51000 + }, + { + "epoch": 0.6868464206027044, + "grad_norm": 1.0055732727050781, + "learning_rate": 0.004994565245680906, + "loss": 2.5967, + "step": 51100 + }, + { + "epoch": 0.6881905427565257, + "grad_norm": 1.37259840965271, + "learning_rate": 0.004994530252098587, + "loss": 2.6036, + "step": 51200 + }, + { + "epoch": 0.6895346649103471, + "grad_norm": 0.4619656801223755, + "learning_rate": 0.00499449514634188, + "loss": 2.6019, + "step": 51300 + }, + { + "epoch": 0.6908787870641684, + "grad_norm": 0.7385687828063965, + "learning_rate": 0.004994459928412364, + "loss": 2.6004, + "step": 51400 + }, + { + "epoch": 0.6922229092179897, + "grad_norm": 1.73374342918396, + "learning_rate": 0.004994424598311631, + "loss": 2.592, + "step": 51500 + }, + { + "epoch": 0.6935670313718111, + "grad_norm": 1.9898829460144043, + "learning_rate": 0.00499438915604126, + "loss": 2.5961, + "step": 51600 + }, + { + "epoch": 0.6949111535256324, + "grad_norm": 1.4267821311950684, + "learning_rate": 0.00499435360160286, + "loss": 2.5896, + "step": 51700 + }, + { + "epoch": 0.6962552756794538, + "grad_norm": 1.4741652011871338, + "learning_rate": 0.004994317934998031, + "loss": 2.5957, + "step": 51800 + }, + { + "epoch": 0.6975993978332751, + "grad_norm": 1.60403573513031, + "learning_rate": 0.004994282156228371, + "loss": 2.5936, + "step": 51900 + }, + { + "epoch": 0.6989435199870965, + "grad_norm": 1.172230839729309, + "learning_rate": 0.004994246265295497, + "loss": 2.5973, + "step": 52000 + }, + { + "epoch": 0.6989435199870965, + "eval_MaskedAccuracy": 0.42523802061668947, + "eval_loss": 2.911010265350342, + "eval_runtime": 154.888, + "eval_samples_per_second": 409.819, + "eval_steps_per_second": 1.601, + "step": 52000 + }, + { + "epoch": 0.7002876421409178, + "grad_norm": 1.7461953163146973, + "learning_rate": 0.004994210262201032, + "loss": 2.5981, + "step": 52100 + }, + { + "epoch": 0.7016317642947391, + "grad_norm": 1.0520185232162476, + "learning_rate": 0.004994174146946593, + "loss": 2.5931, + "step": 52200 + }, + { + "epoch": 0.7029758864485605, + "grad_norm": 0.6437071561813354, + "learning_rate": 0.0049941379195338085, + "loss": 2.5871, + "step": 52300 + }, + { + "epoch": 0.7043200086023818, + "grad_norm": 0.5879480838775635, + "learning_rate": 0.004994101579964312, + "loss": 2.5895, + "step": 52400 + }, + { + "epoch": 0.7056641307562032, + "grad_norm": 0.4580605626106262, + "learning_rate": 0.00499406512823974, + "loss": 2.5958, + "step": 52500 + }, + { + "epoch": 0.7070082529100244, + "grad_norm": 1.544168472290039, + "learning_rate": 0.004994028564361737, + "loss": 2.5952, + "step": 52600 + }, + { + "epoch": 0.7083523750638459, + "grad_norm": 2.1680054664611816, + "learning_rate": 0.004993991888331947, + "loss": 2.5939, + "step": 52700 + }, + { + "epoch": 0.7096964972176671, + "grad_norm": 0.9118250608444214, + "learning_rate": 0.004993955100152014, + "loss": 2.5873, + "step": 52800 + }, + { + "epoch": 0.7110406193714884, + "grad_norm": 0.6249663233757019, + "learning_rate": 0.004993918199823611, + "loss": 2.5945, + "step": 52900 + }, + { + "epoch": 0.7123847415253098, + "grad_norm": 0.5641465783119202, + "learning_rate": 0.004993881187348401, + "loss": 2.5875, + "step": 53000 + }, + { + "epoch": 0.7123847415253098, + "eval_MaskedAccuracy": 0.42511639864491074, + "eval_loss": 2.9080851078033447, + "eval_runtime": 155.0598, + "eval_samples_per_second": 409.365, + "eval_steps_per_second": 1.599, + "step": 53000 + }, + { + "epoch": 0.7137288636791311, + "grad_norm": 1.5355523824691772, + "learning_rate": 0.004993844062728042, + "loss": 2.586, + "step": 53100 + }, + { + "epoch": 0.7150729858329525, + "grad_norm": 0.9230164885520935, + "learning_rate": 0.004993806825964211, + "loss": 2.5855, + "step": 53200 + }, + { + "epoch": 0.7164171079867738, + "grad_norm": 1.638444185256958, + "learning_rate": 0.004993769477058582, + "loss": 2.5989, + "step": 53300 + }, + { + "epoch": 0.7177612301405952, + "grad_norm": 0.5577353835105896, + "learning_rate": 0.004993732016012843, + "loss": 2.5967, + "step": 53400 + }, + { + "epoch": 0.7191053522944165, + "grad_norm": 0.5781148672103882, + "learning_rate": 0.0049936944428286825, + "loss": 2.5923, + "step": 53500 + }, + { + "epoch": 0.7204494744482378, + "grad_norm": 1.6455879211425781, + "learning_rate": 0.00499365675750779, + "loss": 2.5868, + "step": 53600 + }, + { + "epoch": 0.7217935966020592, + "grad_norm": 0.6370722651481628, + "learning_rate": 0.004993618960051859, + "loss": 2.5879, + "step": 53700 + }, + { + "epoch": 0.7231377187558805, + "grad_norm": 1.1286998987197876, + "learning_rate": 0.004993581050462601, + "loss": 2.6016, + "step": 53800 + }, + { + "epoch": 0.7244818409097019, + "grad_norm": 1.282611608505249, + "learning_rate": 0.004993543028741726, + "loss": 2.5869, + "step": 53900 + }, + { + "epoch": 0.7258259630635232, + "grad_norm": 0.51445072889328, + "learning_rate": 0.004993504894890944, + "loss": 2.5856, + "step": 54000 + }, + { + "epoch": 0.7258259630635232, + "eval_MaskedAccuracy": 0.4248532700853216, + "eval_loss": 2.909213066101074, + "eval_runtime": 154.7862, + "eval_samples_per_second": 410.088, + "eval_steps_per_second": 1.602, + "step": 54000 + }, + { + "epoch": 0.7271700852173445, + "grad_norm": 1.3245320320129395, + "learning_rate": 0.004993466648911963, + "loss": 2.5915, + "step": 54100 + }, + { + "epoch": 0.7285142073711659, + "grad_norm": 1.215162754058838, + "learning_rate": 0.004993428290806523, + "loss": 2.5772, + "step": 54200 + }, + { + "epoch": 0.7298583295249872, + "grad_norm": 1.2432647943496704, + "learning_rate": 0.004993389820576344, + "loss": 2.5902, + "step": 54300 + }, + { + "epoch": 0.7312024516788086, + "grad_norm": 1.581764817237854, + "learning_rate": 0.0049933512382231565, + "loss": 2.5795, + "step": 54400 + }, + { + "epoch": 0.7325465738326299, + "grad_norm": 0.9531721472740173, + "learning_rate": 0.004993312543748701, + "loss": 2.5764, + "step": 54500 + }, + { + "epoch": 0.7338906959864513, + "grad_norm": 2.677894115447998, + "learning_rate": 0.004993273737154721, + "loss": 2.5922, + "step": 54600 + }, + { + "epoch": 0.7352348181402726, + "grad_norm": 0.793094277381897, + "learning_rate": 0.00499323481844297, + "loss": 2.5841, + "step": 54700 + }, + { + "epoch": 0.7365789402940939, + "grad_norm": 1.0504934787750244, + "learning_rate": 0.0049931957876151924, + "loss": 2.5831, + "step": 54800 + }, + { + "epoch": 0.7379230624479153, + "grad_norm": 1.0326224565505981, + "learning_rate": 0.004993156644673158, + "loss": 2.5774, + "step": 54900 + }, + { + "epoch": 0.7392671846017366, + "grad_norm": 0.7911434173583984, + "learning_rate": 0.004993117389618626, + "loss": 2.5821, + "step": 55000 + }, + { + "epoch": 0.7392671846017366, + "eval_MaskedAccuracy": 0.42632109483323904, + "eval_loss": 2.8994028568267822, + "eval_runtime": 160.7388, + "eval_samples_per_second": 394.902, + "eval_steps_per_second": 1.543, + "step": 55000 + }, + { + "epoch": 0.740611306755558, + "grad_norm": 0.5436133742332458, + "learning_rate": 0.004993078022453364, + "loss": 2.5816, + "step": 55100 + }, + { + "epoch": 0.7419554289093793, + "grad_norm": 0.8503451943397522, + "learning_rate": 0.004993038543179143, + "loss": 2.5722, + "step": 55200 + }, + { + "epoch": 0.7432995510632007, + "grad_norm": 1.585143804550171, + "learning_rate": 0.004992998951797748, + "loss": 2.5707, + "step": 55300 + }, + { + "epoch": 0.744643673217022, + "grad_norm": 2.5457680225372314, + "learning_rate": 0.0049929592483109585, + "loss": 2.5768, + "step": 55400 + }, + { + "epoch": 0.7459877953708433, + "grad_norm": 1.12667977809906, + "learning_rate": 0.004992919432720564, + "loss": 2.5813, + "step": 55500 + }, + { + "epoch": 0.7473319175246647, + "grad_norm": 0.685434103012085, + "learning_rate": 0.00499287950502836, + "loss": 2.5824, + "step": 55600 + }, + { + "epoch": 0.748676039678486, + "grad_norm": 0.5508724451065063, + "learning_rate": 0.0049928394652361445, + "loss": 2.5788, + "step": 55700 + }, + { + "epoch": 0.7500201618323074, + "grad_norm": 1.0351439714431763, + "learning_rate": 0.004992799313345724, + "loss": 2.5811, + "step": 55800 + }, + { + "epoch": 0.7513642839861286, + "grad_norm": 0.4489949345588684, + "learning_rate": 0.004992759049358903, + "loss": 2.5788, + "step": 55900 + }, + { + "epoch": 0.75270840613995, + "grad_norm": 0.7552504539489746, + "learning_rate": 0.0049927186732775025, + "loss": 2.5738, + "step": 56000 + }, + { + "epoch": 0.75270840613995, + "eval_MaskedAccuracy": 0.42679037653842317, + "eval_loss": 2.8953652381896973, + "eval_runtime": 154.4325, + "eval_samples_per_second": 411.027, + "eval_steps_per_second": 1.606, + "step": 56000 + }, + { + "epoch": 0.7540525282937713, + "grad_norm": 1.503426194190979, + "learning_rate": 0.0049926781851033376, + "loss": 2.5817, + "step": 56100 + }, + { + "epoch": 0.7553966504475926, + "grad_norm": 0.7938827872276306, + "learning_rate": 0.004992637584838231, + "loss": 2.5747, + "step": 56200 + }, + { + "epoch": 0.756740772601414, + "grad_norm": 0.6982958316802979, + "learning_rate": 0.004992596872484014, + "loss": 2.5803, + "step": 56300 + }, + { + "epoch": 0.7580848947552353, + "grad_norm": 0.5538859963417053, + "learning_rate": 0.004992556048042525, + "loss": 2.5873, + "step": 56400 + }, + { + "epoch": 0.7594290169090567, + "grad_norm": 1.1309832334518433, + "learning_rate": 0.004992515111515598, + "loss": 2.5732, + "step": 56500 + }, + { + "epoch": 0.760773139062878, + "grad_norm": 2.124687671661377, + "learning_rate": 0.00499247406290508, + "loss": 2.582, + "step": 56600 + }, + { + "epoch": 0.7621172612166994, + "grad_norm": 1.6379998922348022, + "learning_rate": 0.004992432902212811, + "loss": 2.5806, + "step": 56700 + }, + { + "epoch": 0.7634613833705207, + "grad_norm": 2.21696400642395, + "learning_rate": 0.004992391629440661, + "loss": 2.5809, + "step": 56800 + }, + { + "epoch": 0.764805505524342, + "grad_norm": 0.6755267977714539, + "learning_rate": 0.0049923502445904785, + "loss": 2.5759, + "step": 56900 + }, + { + "epoch": 0.7661496276781634, + "grad_norm": 0.5627455711364746, + "learning_rate": 0.004992308747664136, + "loss": 2.5784, + "step": 57000 + }, + { + "epoch": 0.7661496276781634, + "eval_MaskedAccuracy": 0.42715101212392975, + "eval_loss": 2.889655351638794, + "eval_runtime": 156.8451, + "eval_samples_per_second": 404.705, + "eval_steps_per_second": 1.581, + "step": 57000 + }, + { + "epoch": 0.7674937498319847, + "grad_norm": 0.4193578064441681, + "learning_rate": 0.004992267138663497, + "loss": 2.5768, + "step": 57100 + }, + { + "epoch": 0.7688378719858061, + "grad_norm": 0.4133063554763794, + "learning_rate": 0.004992225417590446, + "loss": 2.5712, + "step": 57200 + }, + { + "epoch": 0.7701819941396274, + "grad_norm": 1.6166553497314453, + "learning_rate": 0.004992183584446851, + "loss": 2.5751, + "step": 57300 + }, + { + "epoch": 0.7715261162934487, + "grad_norm": 1.9402251243591309, + "learning_rate": 0.0049921416392346, + "loss": 2.573, + "step": 57400 + }, + { + "epoch": 0.7728702384472701, + "grad_norm": 1.0368021726608276, + "learning_rate": 0.004992099581955586, + "loss": 2.5801, + "step": 57500 + }, + { + "epoch": 0.7742143606010914, + "grad_norm": 0.431902676820755, + "learning_rate": 0.004992057412611701, + "loss": 2.5776, + "step": 57600 + }, + { + "epoch": 0.7755584827549128, + "grad_norm": 0.4904190003871918, + "learning_rate": 0.004992015131204851, + "loss": 2.5792, + "step": 57700 + }, + { + "epoch": 0.7769026049087341, + "grad_norm": 1.3063716888427734, + "learning_rate": 0.004991972737736936, + "loss": 2.5654, + "step": 57800 + }, + { + "epoch": 0.7782467270625555, + "grad_norm": 1.1166818141937256, + "learning_rate": 0.004991930232209869, + "loss": 2.5694, + "step": 57900 + }, + { + "epoch": 0.7795908492163768, + "grad_norm": 0.6093880534172058, + "learning_rate": 0.004991887614625561, + "loss": 2.5768, + "step": 58000 + }, + { + "epoch": 0.7795908492163768, + "eval_MaskedAccuracy": 0.42766976413111096, + "eval_loss": 2.886188507080078, + "eval_runtime": 155.0432, + "eval_samples_per_second": 409.408, + "eval_steps_per_second": 1.6, + "step": 58000 + }, + { + "epoch": 0.7809349713701981, + "grad_norm": 1.4043593406677246, + "learning_rate": 0.00499184488498594, + "loss": 2.5692, + "step": 58100 + }, + { + "epoch": 0.7822790935240195, + "grad_norm": 1.9828555583953857, + "learning_rate": 0.0049918020432929274, + "loss": 2.5707, + "step": 58200 + }, + { + "epoch": 0.7836232156778408, + "grad_norm": 1.0599697828292847, + "learning_rate": 0.0049917590895484495, + "loss": 2.5715, + "step": 58300 + }, + { + "epoch": 0.7849673378316622, + "grad_norm": 0.5353257656097412, + "learning_rate": 0.00499171602375444, + "loss": 2.5601, + "step": 58400 + }, + { + "epoch": 0.7863114599854835, + "grad_norm": 0.45401671528816223, + "learning_rate": 0.004991672845912851, + "loss": 2.5764, + "step": 58500 + }, + { + "epoch": 0.7876555821393049, + "grad_norm": 1.2403732538223267, + "learning_rate": 0.00499162955602562, + "loss": 2.5637, + "step": 58600 + }, + { + "epoch": 0.7889997042931262, + "grad_norm": 1.7403950691223145, + "learning_rate": 0.004991586154094695, + "loss": 2.5634, + "step": 58700 + }, + { + "epoch": 0.7903438264469475, + "grad_norm": 0.8085100650787354, + "learning_rate": 0.004991542640122036, + "loss": 2.564, + "step": 58800 + }, + { + "epoch": 0.7916879486007689, + "grad_norm": 0.4583624005317688, + "learning_rate": 0.004991499014109603, + "loss": 2.5618, + "step": 58900 + }, + { + "epoch": 0.7930320707545901, + "grad_norm": 1.7312582731246948, + "learning_rate": 0.004991455276059368, + "loss": 2.5712, + "step": 59000 + }, + { + "epoch": 0.7930320707545901, + "eval_MaskedAccuracy": 0.4281439150410401, + "eval_loss": 2.884404420852661, + "eval_runtime": 154.3415, + "eval_samples_per_second": 411.27, + "eval_steps_per_second": 1.607, + "step": 59000 + }, + { + "epoch": 0.7943761929084115, + "grad_norm": 1.1742795705795288, + "learning_rate": 0.004991411425973289, + "loss": 2.5578, + "step": 59100 + }, + { + "epoch": 0.7957203150622328, + "grad_norm": 0.8881747722625732, + "learning_rate": 0.004991367463853345, + "loss": 2.5723, + "step": 59200 + }, + { + "epoch": 0.7970644372160542, + "grad_norm": 1.2915089130401611, + "learning_rate": 0.004991323389701527, + "loss": 2.565, + "step": 59300 + }, + { + "epoch": 0.7984085593698755, + "grad_norm": 0.6965966820716858, + "learning_rate": 0.004991279203519816, + "loss": 2.5619, + "step": 59400 + }, + { + "epoch": 0.7997526815236968, + "grad_norm": 2.0180699825286865, + "learning_rate": 0.004991234905310197, + "loss": 2.5639, + "step": 59500 + }, + { + "epoch": 0.8010968036775182, + "grad_norm": 0.8435118794441223, + "learning_rate": 0.004991190495074672, + "loss": 2.5578, + "step": 59600 + }, + { + "epoch": 0.8024409258313395, + "grad_norm": 1.3183516263961792, + "learning_rate": 0.004991145972815238, + "loss": 2.5658, + "step": 59700 + }, + { + "epoch": 0.8037850479851609, + "grad_norm": 0.4824967384338379, + "learning_rate": 0.004991101338533903, + "loss": 2.5641, + "step": 59800 + }, + { + "epoch": 0.8051291701389822, + "grad_norm": 0.4352557361125946, + "learning_rate": 0.004991056592232678, + "loss": 2.5634, + "step": 59900 + }, + { + "epoch": 0.8064732922928036, + "grad_norm": 0.6158259510993958, + "learning_rate": 0.004991011733913582, + "loss": 2.5596, + "step": 60000 + }, + { + "epoch": 0.8064732922928036, + "eval_MaskedAccuracy": 0.42826808328062316, + "eval_loss": 2.881223440170288, + "eval_runtime": 157.4024, + "eval_samples_per_second": 403.272, + "eval_steps_per_second": 1.576, + "step": 60000 + }, + { + "epoch": 0.8078174144466249, + "grad_norm": 1.2049239873886108, + "learning_rate": 0.004990966763578632, + "loss": 2.5647, + "step": 60100 + }, + { + "epoch": 0.8091615366004462, + "grad_norm": 1.9667131900787354, + "learning_rate": 0.0049909216812298604, + "loss": 2.5715, + "step": 60200 + }, + { + "epoch": 0.8105056587542676, + "grad_norm": 1.7864271402359009, + "learning_rate": 0.004990876486869296, + "loss": 2.5616, + "step": 60300 + }, + { + "epoch": 0.8118497809080889, + "grad_norm": 0.9698309302330017, + "learning_rate": 0.004990831180498973, + "loss": 2.5691, + "step": 60400 + }, + { + "epoch": 0.8131939030619103, + "grad_norm": 0.7363952398300171, + "learning_rate": 0.0049907857621209264, + "loss": 2.5709, + "step": 60500 + }, + { + "epoch": 0.8145380252157316, + "grad_norm": 0.45567846298217773, + "learning_rate": 0.004990740231737212, + "loss": 2.5688, + "step": 60600 + }, + { + "epoch": 0.8158821473695529, + "grad_norm": 1.8314270973205566, + "learning_rate": 0.004990694589349877, + "loss": 2.5694, + "step": 60700 + }, + { + "epoch": 0.8172262695233743, + "grad_norm": 0.7680712342262268, + "learning_rate": 0.004990648834960976, + "loss": 2.5678, + "step": 60800 + }, + { + "epoch": 0.8185703916771956, + "grad_norm": 0.8235045075416565, + "learning_rate": 0.004990602968572579, + "loss": 2.5641, + "step": 60900 + }, + { + "epoch": 0.819914513831017, + "grad_norm": 2.173553228378296, + "learning_rate": 0.004990556990186752, + "loss": 2.5631, + "step": 61000 + }, + { + "epoch": 0.819914513831017, + "eval_MaskedAccuracy": 0.42752038773182294, + "eval_loss": 2.8879265785217285, + "eval_runtime": 154.4174, + "eval_samples_per_second": 411.068, + "eval_steps_per_second": 1.606, + "step": 61000 + }, + { + "epoch": 0.8212586359848383, + "grad_norm": 1.7964410781860352, + "learning_rate": 0.004990510899805561, + "loss": 2.5645, + "step": 61100 + }, + { + "epoch": 0.8226027581386597, + "grad_norm": 0.6248696446418762, + "learning_rate": 0.004990464697431084, + "loss": 2.5554, + "step": 61200 + }, + { + "epoch": 0.823946880292481, + "grad_norm": 1.9335159063339233, + "learning_rate": 0.004990418383065405, + "loss": 2.5484, + "step": 61300 + }, + { + "epoch": 0.8252910024463023, + "grad_norm": 1.169259786605835, + "learning_rate": 0.004990371956710612, + "loss": 2.5604, + "step": 61400 + }, + { + "epoch": 0.8266351246001237, + "grad_norm": 1.7953951358795166, + "learning_rate": 0.004990325418368784, + "loss": 2.5482, + "step": 61500 + }, + { + "epoch": 0.827979246753945, + "grad_norm": 0.47882091999053955, + "learning_rate": 0.004990278768042037, + "loss": 2.5603, + "step": 61600 + }, + { + "epoch": 0.8293233689077664, + "grad_norm": 0.668745219707489, + "learning_rate": 0.004990232005732461, + "loss": 2.5587, + "step": 61700 + }, + { + "epoch": 0.8306674910615877, + "grad_norm": 2.7591805458068848, + "learning_rate": 0.0049901851314421665, + "loss": 2.5525, + "step": 61800 + }, + { + "epoch": 0.8320116132154091, + "grad_norm": 0.5808740258216858, + "learning_rate": 0.004990138145173262, + "loss": 2.5544, + "step": 61900 + }, + { + "epoch": 0.8333557353692304, + "grad_norm": 1.9806839227676392, + "learning_rate": 0.00499009104692787, + "loss": 2.5569, + "step": 62000 + }, + { + "epoch": 0.8333557353692304, + "eval_MaskedAccuracy": 0.4281775187262364, + "eval_loss": 2.8783788681030273, + "eval_runtime": 157.1845, + "eval_samples_per_second": 403.831, + "eval_steps_per_second": 1.578, + "step": 62000 + }, + { + "epoch": 0.8346998575230516, + "grad_norm": 0.4611744284629822, + "learning_rate": 0.004990043836708112, + "loss": 2.5569, + "step": 62100 + }, + { + "epoch": 0.836043979676873, + "grad_norm": 0.9632844924926758, + "learning_rate": 0.00498999651451611, + "loss": 2.549, + "step": 62200 + }, + { + "epoch": 0.8373881018306943, + "grad_norm": 0.49269410967826843, + "learning_rate": 0.004989949080353999, + "loss": 2.5543, + "step": 62300 + }, + { + "epoch": 0.8387322239845157, + "grad_norm": 1.0680642127990723, + "learning_rate": 0.0049899015342239166, + "loss": 2.558, + "step": 62400 + }, + { + "epoch": 0.840076346138337, + "grad_norm": 1.5859142541885376, + "learning_rate": 0.004989853876128006, + "loss": 2.5603, + "step": 62500 + }, + { + "epoch": 0.8414204682921584, + "grad_norm": 1.1798615455627441, + "learning_rate": 0.0049898061060684165, + "loss": 2.5592, + "step": 62600 + }, + { + "epoch": 0.8427645904459797, + "grad_norm": 1.3057469129562378, + "learning_rate": 0.004989758224047301, + "loss": 2.5618, + "step": 62700 + }, + { + "epoch": 0.844108712599801, + "grad_norm": 0.3981241285800934, + "learning_rate": 0.004989710230066817, + "loss": 2.5625, + "step": 62800 + }, + { + "epoch": 0.8454528347536224, + "grad_norm": 1.141242504119873, + "learning_rate": 0.00498966212412912, + "loss": 2.5575, + "step": 62900 + }, + { + "epoch": 0.8467969569074437, + "grad_norm": 1.986693024635315, + "learning_rate": 0.004989613906236384, + "loss": 2.5557, + "step": 63000 + }, + { + "epoch": 0.8467969569074437, + "eval_MaskedAccuracy": 0.4287118227276102, + "eval_loss": 2.8758063316345215, + "eval_runtime": 157.2196, + "eval_samples_per_second": 403.741, + "eval_steps_per_second": 1.577, + "step": 63000 + }, + { + "epoch": 0.8481410790612651, + "grad_norm": 0.42742684483528137, + "learning_rate": 0.0049895655763907785, + "loss": 2.5609, + "step": 63100 + }, + { + "epoch": 0.8494852012150864, + "grad_norm": 0.6005121469497681, + "learning_rate": 0.004989517134594481, + "loss": 2.5627, + "step": 63200 + }, + { + "epoch": 0.8508293233689078, + "grad_norm": 1.2548339366912842, + "learning_rate": 0.004989468580849676, + "loss": 2.5541, + "step": 63300 + }, + { + "epoch": 0.8521734455227291, + "grad_norm": 1.1956408023834229, + "learning_rate": 0.004989419915158556, + "loss": 2.5494, + "step": 63400 + }, + { + "epoch": 0.8535175676765504, + "grad_norm": 1.7097578048706055, + "learning_rate": 0.004989371137523307, + "loss": 2.5559, + "step": 63500 + }, + { + "epoch": 0.8548616898303718, + "grad_norm": 0.8344634175300598, + "learning_rate": 0.004989322247946124, + "loss": 2.5594, + "step": 63600 + }, + { + "epoch": 0.8562058119841931, + "grad_norm": 1.0530003309249878, + "learning_rate": 0.004989273246429215, + "loss": 2.548, + "step": 63700 + }, + { + "epoch": 0.8575499341380145, + "grad_norm": 1.2484129667282104, + "learning_rate": 0.004989224132974791, + "loss": 2.5514, + "step": 63800 + }, + { + "epoch": 0.8588940562918358, + "grad_norm": 0.48474428057670593, + "learning_rate": 0.004989174907585054, + "loss": 2.5607, + "step": 63900 + }, + { + "epoch": 0.8602381784456571, + "grad_norm": 0.7260443568229675, + "learning_rate": 0.004989125570262237, + "loss": 2.5607, + "step": 64000 + }, + { + "epoch": 0.8602381784456571, + "eval_MaskedAccuracy": 0.4298587031625389, + "eval_loss": 2.8685121536254883, + "eval_runtime": 156.7219, + "eval_samples_per_second": 405.023, + "eval_steps_per_second": 1.582, + "step": 64000 + }, + { + "epoch": 0.8615823005994785, + "grad_norm": 1.7127766609191895, + "learning_rate": 0.0049890761210085515, + "loss": 2.5457, + "step": 64100 + }, + { + "epoch": 0.8629264227532998, + "grad_norm": 0.9210508465766907, + "learning_rate": 0.004989026559826237, + "loss": 2.5567, + "step": 64200 + }, + { + "epoch": 0.8642705449071212, + "grad_norm": 0.5926476120948792, + "learning_rate": 0.004988976886717514, + "loss": 2.5502, + "step": 64300 + }, + { + "epoch": 0.8656146670609425, + "grad_norm": 0.8887967467308044, + "learning_rate": 0.0049889271016846275, + "loss": 2.5511, + "step": 64400 + }, + { + "epoch": 0.8669587892147639, + "grad_norm": 0.6354972124099731, + "learning_rate": 0.0049888772047298165, + "loss": 2.5549, + "step": 64500 + }, + { + "epoch": 0.8683029113685852, + "grad_norm": 0.45211437344551086, + "learning_rate": 0.004988827195855339, + "loss": 2.5541, + "step": 64600 + }, + { + "epoch": 0.8696470335224065, + "grad_norm": 1.1961520910263062, + "learning_rate": 0.004988777075063434, + "loss": 2.549, + "step": 64700 + }, + { + "epoch": 0.8709911556762279, + "grad_norm": 0.7773739099502563, + "learning_rate": 0.004988726842356369, + "loss": 2.5453, + "step": 64800 + }, + { + "epoch": 0.8723352778300492, + "grad_norm": 1.6650779247283936, + "learning_rate": 0.004988676497736411, + "loss": 2.5591, + "step": 64900 + }, + { + "epoch": 0.8736793999838706, + "grad_norm": 3.7934086322784424, + "learning_rate": 0.004988626041205816, + "loss": 2.5477, + "step": 65000 + }, + { + "epoch": 0.8736793999838706, + "eval_MaskedAccuracy": 0.42935040461530544, + "eval_loss": 2.8740768432617188, + "eval_runtime": 157.0461, + "eval_samples_per_second": 404.187, + "eval_steps_per_second": 1.579, + "step": 65000 + }, + { + "epoch": 0.8750235221376919, + "grad_norm": 0.37112095952033997, + "learning_rate": 0.004988575472766864, + "loss": 2.5442, + "step": 65100 + }, + { + "epoch": 0.8763676442915133, + "grad_norm": 0.4250340461730957, + "learning_rate": 0.0049885247924218385, + "loss": 2.5397, + "step": 65200 + }, + { + "epoch": 0.8777117664453346, + "grad_norm": 0.42517802119255066, + "learning_rate": 0.004988474000173024, + "loss": 2.5453, + "step": 65300 + }, + { + "epoch": 0.8790558885991558, + "grad_norm": 2.0652735233306885, + "learning_rate": 0.0049884230960227, + "loss": 2.5497, + "step": 65400 + }, + { + "epoch": 0.8804000107529772, + "grad_norm": 0.6342276930809021, + "learning_rate": 0.0049883720799731605, + "loss": 2.5378, + "step": 65500 + }, + { + "epoch": 0.8817441329067985, + "grad_norm": 1.0406826734542847, + "learning_rate": 0.0049883209520267075, + "loss": 2.5453, + "step": 65600 + }, + { + "epoch": 0.8830882550606199, + "grad_norm": 1.4055812358856201, + "learning_rate": 0.00498826971218565, + "loss": 2.5437, + "step": 65700 + }, + { + "epoch": 0.8844323772144412, + "grad_norm": 2.7914650440216064, + "learning_rate": 0.004988218360452281, + "loss": 2.5512, + "step": 65800 + }, + { + "epoch": 0.8857764993682626, + "grad_norm": 0.9103912711143494, + "learning_rate": 0.004988166896828934, + "loss": 2.5466, + "step": 65900 + }, + { + "epoch": 0.8871206215220839, + "grad_norm": 1.0117913484573364, + "learning_rate": 0.004988115321317916, + "loss": 2.5452, + "step": 66000 + }, + { + "epoch": 0.8871206215220839, + "eval_MaskedAccuracy": 0.4316935610840671, + "eval_loss": 2.8568239212036133, + "eval_runtime": 155.5022, + "eval_samples_per_second": 408.2, + "eval_steps_per_second": 1.595, + "step": 66000 + }, + { + "epoch": 0.8884647436759052, + "grad_norm": 0.934119701385498, + "learning_rate": 0.004988063633921554, + "loss": 2.5407, + "step": 66100 + }, + { + "epoch": 0.8898088658297266, + "grad_norm": 0.5075318217277527, + "learning_rate": 0.004988011834642176, + "loss": 2.5387, + "step": 66200 + }, + { + "epoch": 0.8911529879835479, + "grad_norm": 2.349472761154175, + "learning_rate": 0.004987959923482121, + "loss": 2.5453, + "step": 66300 + }, + { + "epoch": 0.8924971101373693, + "grad_norm": 1.2346925735473633, + "learning_rate": 0.004987907900443723, + "loss": 2.5433, + "step": 66400 + }, + { + "epoch": 0.8938412322911906, + "grad_norm": 0.9349079132080078, + "learning_rate": 0.004987855765529321, + "loss": 2.5386, + "step": 66500 + }, + { + "epoch": 0.895185354445012, + "grad_norm": 2.3298845291137695, + "learning_rate": 0.00498780351874127, + "loss": 2.5579, + "step": 66600 + }, + { + "epoch": 0.8965294765988333, + "grad_norm": 0.7276595830917358, + "learning_rate": 0.0049877511600819255, + "loss": 2.547, + "step": 66700 + }, + { + "epoch": 0.8978735987526546, + "grad_norm": 0.9799304604530334, + "learning_rate": 0.004987698689553648, + "loss": 2.5479, + "step": 66800 + }, + { + "epoch": 0.899217720906476, + "grad_norm": 4.606154918670654, + "learning_rate": 0.0049876461071588, + "loss": 2.5486, + "step": 66900 + }, + { + "epoch": 0.9005618430602973, + "grad_norm": 0.8625683784484863, + "learning_rate": 0.004987593412899746, + "loss": 2.5448, + "step": 67000 + }, + { + "epoch": 0.9005618430602973, + "eval_MaskedAccuracy": 0.431047719967855, + "eval_loss": 2.8602442741394043, + "eval_runtime": 154.394, + "eval_samples_per_second": 411.13, + "eval_steps_per_second": 1.606, + "step": 67000 + }, + { + "epoch": 0.9019059652141187, + "grad_norm": 0.6021907925605774, + "learning_rate": 0.0049875406067788645, + "loss": 2.5473, + "step": 67100 + }, + { + "epoch": 0.90325008736794, + "grad_norm": 1.0424331426620483, + "learning_rate": 0.004987487688798529, + "loss": 2.5401, + "step": 67200 + }, + { + "epoch": 0.9045942095217613, + "grad_norm": 0.7000874280929565, + "learning_rate": 0.004987434658961136, + "loss": 2.5403, + "step": 67300 + }, + { + "epoch": 0.9059383316755827, + "grad_norm": 0.437077134847641, + "learning_rate": 0.004987381517269073, + "loss": 2.5457, + "step": 67400 + }, + { + "epoch": 0.907282453829404, + "grad_norm": 0.5734995007514954, + "learning_rate": 0.004987328263724719, + "loss": 2.5496, + "step": 67500 + }, + { + "epoch": 0.9086265759832254, + "grad_norm": 0.9793594479560852, + "learning_rate": 0.00498727489833049, + "loss": 2.5374, + "step": 67600 + }, + { + "epoch": 0.9099706981370467, + "grad_norm": 1.336166262626648, + "learning_rate": 0.004987221421088781, + "loss": 2.5348, + "step": 67700 + }, + { + "epoch": 0.9113148202908681, + "grad_norm": 0.8772963881492615, + "learning_rate": 0.00498716783200201, + "loss": 2.5453, + "step": 67800 + }, + { + "epoch": 0.9126589424446894, + "grad_norm": 1.0549404621124268, + "learning_rate": 0.0049871141310725816, + "loss": 2.5394, + "step": 67900 + }, + { + "epoch": 0.9140030645985107, + "grad_norm": 0.6671266555786133, + "learning_rate": 0.004987060318302928, + "loss": 2.5408, + "step": 68000 + }, + { + "epoch": 0.9140030645985107, + "eval_MaskedAccuracy": 0.43042410070670717, + "eval_loss": 2.864036798477173, + "eval_runtime": 158.2908, + "eval_samples_per_second": 401.009, + "eval_steps_per_second": 1.567, + "step": 68000 + }, + { + "epoch": 0.9153471867523321, + "grad_norm": 1.8076815605163574, + "learning_rate": 0.004987006393695469, + "loss": 2.5349, + "step": 68100 + }, + { + "epoch": 0.9166913089061534, + "grad_norm": 0.9832874536514282, + "learning_rate": 0.004986952357252627, + "loss": 2.5372, + "step": 68200 + }, + { + "epoch": 0.9180354310599748, + "grad_norm": 0.6802740693092346, + "learning_rate": 0.004986898208976844, + "loss": 2.538, + "step": 68300 + }, + { + "epoch": 0.919379553213796, + "grad_norm": 1.3755671977996826, + "learning_rate": 0.004986843948870563, + "loss": 2.5427, + "step": 68400 + }, + { + "epoch": 0.9207236753676175, + "grad_norm": 0.6860373020172119, + "learning_rate": 0.0049867895769362194, + "loss": 2.5456, + "step": 68500 + }, + { + "epoch": 0.9220677975214387, + "grad_norm": 0.37920355796813965, + "learning_rate": 0.004986735093176264, + "loss": 2.5388, + "step": 68600 + }, + { + "epoch": 0.92341191967526, + "grad_norm": 0.48161619901657104, + "learning_rate": 0.004986680497593163, + "loss": 2.5482, + "step": 68700 + }, + { + "epoch": 0.9247560418290814, + "grad_norm": 1.5831481218338013, + "learning_rate": 0.0049866257901893625, + "loss": 2.533, + "step": 68800 + }, + { + "epoch": 0.9261001639829027, + "grad_norm": 1.874329924583435, + "learning_rate": 0.004986570970967331, + "loss": 2.5432, + "step": 68900 + }, + { + "epoch": 0.9274442861367241, + "grad_norm": 0.6668677926063538, + "learning_rate": 0.004986516039929541, + "loss": 2.5458, + "step": 69000 + }, + { + "epoch": 0.9274442861367241, + "eval_MaskedAccuracy": 0.4320338287315044, + "eval_loss": 2.851020336151123, + "eval_runtime": 154.6645, + "eval_samples_per_second": 410.411, + "eval_steps_per_second": 1.603, + "step": 69000 + }, + { + "epoch": 0.9287884082905454, + "grad_norm": 0.850222647190094, + "learning_rate": 0.004986460997078469, + "loss": 2.5411, + "step": 69100 + }, + { + "epoch": 0.9301325304443668, + "grad_norm": 1.386954665184021, + "learning_rate": 0.004986405842416592, + "loss": 2.5424, + "step": 69200 + }, + { + "epoch": 0.9314766525981881, + "grad_norm": 0.7696434259414673, + "learning_rate": 0.0049863505759464, + "loss": 2.5336, + "step": 69300 + }, + { + "epoch": 0.9328207747520094, + "grad_norm": 0.8896027207374573, + "learning_rate": 0.004986295197670379, + "loss": 2.5325, + "step": 69400 + }, + { + "epoch": 0.9341648969058308, + "grad_norm": 1.7881613969802856, + "learning_rate": 0.004986239707591028, + "loss": 2.5402, + "step": 69500 + }, + { + "epoch": 0.9355090190596521, + "grad_norm": 0.5906102061271667, + "learning_rate": 0.0049861841057108405, + "loss": 2.542, + "step": 69600 + }, + { + "epoch": 0.9368531412134735, + "grad_norm": 0.7546370625495911, + "learning_rate": 0.004986128392032327, + "loss": 2.5392, + "step": 69700 + }, + { + "epoch": 0.9381972633672948, + "grad_norm": 0.7366216778755188, + "learning_rate": 0.004986072566557999, + "loss": 2.5316, + "step": 69800 + }, + { + "epoch": 0.9395413855211162, + "grad_norm": 1.4292287826538086, + "learning_rate": 0.00498601662929037, + "loss": 2.5371, + "step": 69900 + }, + { + "epoch": 0.9408855076749375, + "grad_norm": 0.9893214106559753, + "learning_rate": 0.00498596058023197, + "loss": 2.5313, + "step": 70000 + }, + { + "epoch": 0.9408855076749375, + "eval_MaskedAccuracy": 0.4319902861219465, + "eval_loss": 2.853639602661133, + "eval_runtime": 157.3354, + "eval_samples_per_second": 403.444, + "eval_steps_per_second": 1.576, + "step": 70000 + }, + { + "epoch": 0.9422296298287588, + "grad_norm": 1.4988006353378296, + "learning_rate": 0.004985904419385302, + "loss": 2.5369, + "step": 70100 + }, + { + "epoch": 0.9435737519825802, + "grad_norm": 1.3903696537017822, + "learning_rate": 0.004985848146752917, + "loss": 2.5312, + "step": 70200 + }, + { + "epoch": 0.9449178741364015, + "grad_norm": 1.741083025932312, + "learning_rate": 0.004985791762337335, + "loss": 2.5343, + "step": 70300 + }, + { + "epoch": 0.9462619962902229, + "grad_norm": 0.43283969163894653, + "learning_rate": 0.00498573526614111, + "loss": 2.5308, + "step": 70400 + }, + { + "epoch": 0.9476061184440442, + "grad_norm": 0.6915990114212036, + "learning_rate": 0.004985678658166784, + "loss": 2.5316, + "step": 70500 + }, + { + "epoch": 0.9489502405978655, + "grad_norm": 3.1753523349761963, + "learning_rate": 0.004985621938416906, + "loss": 2.5376, + "step": 70600 + }, + { + "epoch": 0.9502943627516869, + "grad_norm": 1.0071120262145996, + "learning_rate": 0.0049855651068940335, + "loss": 2.5395, + "step": 70700 + }, + { + "epoch": 0.9516384849055082, + "grad_norm": 3.022740125656128, + "learning_rate": 0.004985508163600716, + "loss": 2.5267, + "step": 70800 + }, + { + "epoch": 0.9529826070593296, + "grad_norm": 2.2657601833343506, + "learning_rate": 0.004985451108539533, + "loss": 2.5279, + "step": 70900 + }, + { + "epoch": 0.9543267292131509, + "grad_norm": 1.0798835754394531, + "learning_rate": 0.004985393941713058, + "loss": 2.5346, + "step": 71000 + }, + { + "epoch": 0.9543267292131509, + "eval_MaskedAccuracy": 0.43291381575543303, + "eval_loss": 2.847073554992676, + "eval_runtime": 157.3815, + "eval_samples_per_second": 403.326, + "eval_steps_per_second": 1.576, + "step": 71000 + }, + { + "epoch": 0.9556708513669723, + "grad_norm": 0.8452098369598389, + "learning_rate": 0.004985336663123854, + "loss": 2.5344, + "step": 71100 + }, + { + "epoch": 0.9570149735207936, + "grad_norm": 1.136425256729126, + "learning_rate": 0.0049852792727745135, + "loss": 2.5373, + "step": 71200 + }, + { + "epoch": 0.9583590956746149, + "grad_norm": 0.421425461769104, + "learning_rate": 0.004985221770667615, + "loss": 2.5365, + "step": 71300 + }, + { + "epoch": 0.9597032178284363, + "grad_norm": 1.0452524423599243, + "learning_rate": 0.004985164156805749, + "loss": 2.5405, + "step": 71400 + }, + { + "epoch": 0.9610473399822576, + "grad_norm": 0.6131758689880371, + "learning_rate": 0.004985106431191513, + "loss": 2.5379, + "step": 71500 + }, + { + "epoch": 0.962391462136079, + "grad_norm": 0.5693802833557129, + "learning_rate": 0.004985048593827507, + "loss": 2.5415, + "step": 71600 + }, + { + "epoch": 0.9637355842899002, + "grad_norm": 0.576070249080658, + "learning_rate": 0.004984990644716343, + "loss": 2.5258, + "step": 71700 + }, + { + "epoch": 0.9650797064437217, + "grad_norm": 0.8983374834060669, + "learning_rate": 0.004984932583860627, + "loss": 2.5222, + "step": 71800 + }, + { + "epoch": 0.9664238285975429, + "grad_norm": 1.3723649978637695, + "learning_rate": 0.004984874411262972, + "loss": 2.5328, + "step": 71900 + }, + { + "epoch": 0.9677679507513642, + "grad_norm": 0.638632595539093, + "learning_rate": 0.004984816126926011, + "loss": 2.5291, + "step": 72000 + }, + { + "epoch": 0.9677679507513642, + "eval_MaskedAccuracy": 0.4332407392153184, + "eval_loss": 2.8442609310150146, + "eval_runtime": 157.5641, + "eval_samples_per_second": 402.858, + "eval_steps_per_second": 1.574, + "step": 72000 + }, + { + "epoch": 0.9691120729051856, + "grad_norm": 0.999791145324707, + "learning_rate": 0.0049847577308523554, + "loss": 2.5289, + "step": 72100 + }, + { + "epoch": 0.9704561950590069, + "grad_norm": 0.9468411803245544, + "learning_rate": 0.0049846992230446465, + "loss": 2.5244, + "step": 72200 + }, + { + "epoch": 0.9718003172128283, + "grad_norm": 1.2006036043167114, + "learning_rate": 0.0049846406035055114, + "loss": 2.5294, + "step": 72300 + }, + { + "epoch": 0.9731444393666496, + "grad_norm": 1.1461031436920166, + "learning_rate": 0.0049845818722376025, + "loss": 2.5251, + "step": 72400 + }, + { + "epoch": 0.974488561520471, + "grad_norm": 1.3583283424377441, + "learning_rate": 0.004984523029243558, + "loss": 2.5401, + "step": 72500 + }, + { + "epoch": 0.9758326836742923, + "grad_norm": 0.8848266005516052, + "learning_rate": 0.004984464074526038, + "loss": 2.534, + "step": 72600 + }, + { + "epoch": 0.9771768058281136, + "grad_norm": 0.8786698579788208, + "learning_rate": 0.004984405008087696, + "loss": 2.5268, + "step": 72700 + }, + { + "epoch": 0.978520927981935, + "grad_norm": 1.2354623079299927, + "learning_rate": 0.004984345829931181, + "loss": 2.5235, + "step": 72800 + }, + { + "epoch": 0.9798650501357563, + "grad_norm": 0.44590994715690613, + "learning_rate": 0.004984286540059167, + "loss": 2.5249, + "step": 72900 + }, + { + "epoch": 0.9812091722895777, + "grad_norm": 1.5958350896835327, + "learning_rate": 0.004984227138474335, + "loss": 2.5252, + "step": 73000 + }, + { + "epoch": 0.9812091722895777, + "eval_MaskedAccuracy": 0.43342773457995387, + "eval_loss": 2.8439483642578125, + "eval_runtime": 156.2562, + "eval_samples_per_second": 406.23, + "eval_steps_per_second": 1.587, + "step": 73000 + }, + { + "epoch": 0.982553294443399, + "grad_norm": 0.9275982975959778, + "learning_rate": 0.004984167625179348, + "loss": 2.5338, + "step": 73100 + }, + { + "epoch": 0.9838974165972204, + "grad_norm": 1.3247750997543335, + "learning_rate": 0.0049841080001768985, + "loss": 2.5258, + "step": 73200 + }, + { + "epoch": 0.9852415387510417, + "grad_norm": 0.8629142642021179, + "learning_rate": 0.004984048263469671, + "loss": 2.5277, + "step": 73300 + }, + { + "epoch": 0.986585660904863, + "grad_norm": 1.2178809642791748, + "learning_rate": 0.004983988415060354, + "loss": 2.5345, + "step": 73400 + }, + { + "epoch": 0.9879297830586844, + "grad_norm": 1.3900634050369263, + "learning_rate": 0.004983928454951644, + "loss": 2.5164, + "step": 73500 + }, + { + "epoch": 0.9892739052125057, + "grad_norm": 1.1817967891693115, + "learning_rate": 0.00498386838314624, + "loss": 2.5267, + "step": 73600 + }, + { + "epoch": 0.9906180273663271, + "grad_norm": 0.8701839447021484, + "learning_rate": 0.004983808199646853, + "loss": 2.5238, + "step": 73700 + }, + { + "epoch": 0.9919621495201484, + "grad_norm": 1.1527352333068848, + "learning_rate": 0.0049837479044561985, + "loss": 2.5293, + "step": 73800 + }, + { + "epoch": 0.9933062716739697, + "grad_norm": 1.1712994575500488, + "learning_rate": 0.004983687497576983, + "loss": 2.5292, + "step": 73900 + }, + { + "epoch": 0.9946503938277911, + "grad_norm": 1.6270473003387451, + "learning_rate": 0.004983626979011935, + "loss": 2.5288, + "step": 74000 + }, + { + "epoch": 0.9946503938277911, + "eval_MaskedAccuracy": 0.432422431765529, + "eval_loss": 2.8484365940093994, + "eval_runtime": 154.8637, + "eval_samples_per_second": 409.883, + "eval_steps_per_second": 1.601, + "step": 74000 + }, + { + "epoch": 0.9959945159816124, + "grad_norm": 0.4700554609298706, + "learning_rate": 0.0049835663487637835, + "loss": 2.5225, + "step": 74100 + }, + { + "epoch": 0.9973386381354338, + "grad_norm": 5.065925121307373, + "learning_rate": 0.004983505606835256, + "loss": 2.5303, + "step": 74200 + }, + { + "epoch": 0.9986827602892551, + "grad_norm": 0.3857990503311157, + "learning_rate": 0.004983444753229089, + "loss": 2.5351, + "step": 74300 + }, + { + "epoch": 1.0000268824430765, + "grad_norm": 3.9997522830963135, + "learning_rate": 0.00498338378794803, + "loss": 2.5212, + "step": 74400 + }, + { + "epoch": 1.0013710045968978, + "grad_norm": 1.400756597518921, + "learning_rate": 0.00498332271099482, + "loss": 2.52, + "step": 74500 + }, + { + "epoch": 1.002715126750719, + "grad_norm": 0.6085138916969299, + "learning_rate": 0.004983261522372212, + "loss": 2.526, + "step": 74600 + }, + { + "epoch": 1.0040592489045403, + "grad_norm": 1.8259930610656738, + "learning_rate": 0.004983200222082968, + "loss": 2.5176, + "step": 74700 + }, + { + "epoch": 1.0054033710583619, + "grad_norm": 2.1580281257629395, + "learning_rate": 0.004983138810129848, + "loss": 2.5273, + "step": 74800 + }, + { + "epoch": 1.0067474932121832, + "grad_norm": 0.8419142365455627, + "learning_rate": 0.004983077286515609, + "loss": 2.5232, + "step": 74900 + }, + { + "epoch": 1.0080916153660044, + "grad_norm": 1.1031572818756104, + "learning_rate": 0.004983015651243041, + "loss": 2.5381, + "step": 75000 + }, + { + "epoch": 1.0080916153660044, + "eval_MaskedAccuracy": 0.43352066355967, + "eval_loss": 2.838632822036743, + "eval_runtime": 154.6683, + "eval_samples_per_second": 410.401, + "eval_steps_per_second": 1.603, + "step": 75000 + }, + { + "epoch": 1.0094357375198257, + "grad_norm": 0.4656729996204376, + "learning_rate": 0.004982953904314914, + "loss": 2.5295, + "step": 75100 + }, + { + "epoch": 1.0107798596736473, + "grad_norm": 0.7429810762405396, + "learning_rate": 0.004982892045734002, + "loss": 2.5279, + "step": 75200 + }, + { + "epoch": 1.0121239818274685, + "grad_norm": 0.36424189805984497, + "learning_rate": 0.004982830075503102, + "loss": 2.5211, + "step": 75300 + }, + { + "epoch": 1.0134681039812898, + "grad_norm": 0.45235106348991394, + "learning_rate": 0.004982767993625003, + "loss": 2.5254, + "step": 75400 + }, + { + "epoch": 1.0148122261351111, + "grad_norm": 0.6336104273796082, + "learning_rate": 0.0049827058001024995, + "loss": 2.5318, + "step": 75500 + }, + { + "epoch": 1.0161563482889324, + "grad_norm": 0.5706799030303955, + "learning_rate": 0.004982643494938397, + "loss": 2.5267, + "step": 75600 + }, + { + "epoch": 1.017500470442754, + "grad_norm": 0.5101211667060852, + "learning_rate": 0.004982581078135498, + "loss": 2.5239, + "step": 75700 + }, + { + "epoch": 1.0188445925965752, + "grad_norm": 0.6169567108154297, + "learning_rate": 0.004982518549696623, + "loss": 2.5309, + "step": 75800 + }, + { + "epoch": 1.0201887147503965, + "grad_norm": 1.594337821006775, + "learning_rate": 0.004982455909624589, + "loss": 2.5253, + "step": 75900 + }, + { + "epoch": 1.0215328369042178, + "grad_norm": 1.7339344024658203, + "learning_rate": 0.004982393157922211, + "loss": 2.5248, + "step": 76000 + }, + { + "epoch": 1.0215328369042178, + "eval_MaskedAccuracy": 0.43333633554622425, + "eval_loss": 2.8395068645477295, + "eval_runtime": 155.9239, + "eval_samples_per_second": 407.096, + "eval_steps_per_second": 1.591, + "step": 76000 + }, + { + "epoch": 1.022876959058039, + "grad_norm": 0.3945859968662262, + "learning_rate": 0.00498233029459232, + "loss": 2.52, + "step": 76100 + }, + { + "epoch": 1.0242210812118606, + "grad_norm": 0.9986801743507385, + "learning_rate": 0.00498226731963775, + "loss": 2.5088, + "step": 76200 + }, + { + "epoch": 1.025565203365682, + "grad_norm": 0.9021244049072266, + "learning_rate": 0.0049822042330613416, + "loss": 2.5183, + "step": 76300 + }, + { + "epoch": 1.0269093255195032, + "grad_norm": 0.3702966272830963, + "learning_rate": 0.004982141034865935, + "loss": 2.5185, + "step": 76400 + }, + { + "epoch": 1.0282534476733245, + "grad_norm": 1.010032057762146, + "learning_rate": 0.004982077725054371, + "loss": 2.5323, + "step": 76500 + }, + { + "epoch": 1.0295975698271458, + "grad_norm": 3.7542576789855957, + "learning_rate": 0.004982014303629503, + "loss": 2.5153, + "step": 76600 + }, + { + "epoch": 1.0309416919809673, + "grad_norm": 0.729006826877594, + "learning_rate": 0.0049819507705941984, + "loss": 2.5259, + "step": 76700 + }, + { + "epoch": 1.0322858141347886, + "grad_norm": 0.475752592086792, + "learning_rate": 0.0049818871259513136, + "loss": 2.5191, + "step": 76800 + }, + { + "epoch": 1.0336299362886099, + "grad_norm": 1.1220066547393799, + "learning_rate": 0.004981823369703716, + "loss": 2.5177, + "step": 76900 + }, + { + "epoch": 1.0349740584424312, + "grad_norm": 0.5770747661590576, + "learning_rate": 0.004981759501854277, + "loss": 2.5298, + "step": 77000 + }, + { + "epoch": 1.0349740584424312, + "eval_MaskedAccuracy": 0.4340570336735209, + "eval_loss": 2.8367176055908203, + "eval_runtime": 156.2265, + "eval_samples_per_second": 406.307, + "eval_steps_per_second": 1.587, + "step": 77000 + }, + { + "epoch": 1.0363181805962527, + "grad_norm": 0.4791591763496399, + "learning_rate": 0.004981695522405884, + "loss": 2.5189, + "step": 77100 + }, + { + "epoch": 1.037662302750074, + "grad_norm": 1.2197085618972778, + "learning_rate": 0.00498163143136141, + "loss": 2.5181, + "step": 77200 + }, + { + "epoch": 1.0390064249038953, + "grad_norm": 1.1560014486312866, + "learning_rate": 0.004981567228723745, + "loss": 2.517, + "step": 77300 + }, + { + "epoch": 1.0403505470577166, + "grad_norm": 0.5016676783561707, + "learning_rate": 0.004981502914495785, + "loss": 2.5196, + "step": 77400 + }, + { + "epoch": 1.0416946692115379, + "grad_norm": 1.4357668161392212, + "learning_rate": 0.004981438488680427, + "loss": 2.5121, + "step": 77500 + }, + { + "epoch": 1.0430387913653594, + "grad_norm": 1.0016825199127197, + "learning_rate": 0.004981373951280573, + "loss": 2.5197, + "step": 77600 + }, + { + "epoch": 1.0443829135191807, + "grad_norm": 5.918942451477051, + "learning_rate": 0.004981309302299129, + "loss": 2.5243, + "step": 77700 + }, + { + "epoch": 1.045727035673002, + "grad_norm": 1.207068681716919, + "learning_rate": 0.004981244541739009, + "loss": 2.5257, + "step": 77800 + }, + { + "epoch": 1.0470711578268233, + "grad_norm": 0.649711012840271, + "learning_rate": 0.004981179669603129, + "loss": 2.5274, + "step": 77900 + }, + { + "epoch": 1.0484152799806445, + "grad_norm": 1.0089892148971558, + "learning_rate": 0.004981114685894418, + "loss": 2.5166, + "step": 78000 + }, + { + "epoch": 1.0484152799806445, + "eval_MaskedAccuracy": 0.4343209398198679, + "eval_loss": 2.835813045501709, + "eval_runtime": 154.6977, + "eval_samples_per_second": 410.323, + "eval_steps_per_second": 1.603, + "step": 78000 + }, + { + "epoch": 1.049759402134466, + "grad_norm": 0.8486099243164062, + "learning_rate": 0.0049810495906158, + "loss": 2.5232, + "step": 78100 + }, + { + "epoch": 1.0511035242882873, + "grad_norm": 0.6109094023704529, + "learning_rate": 0.004980984383770206, + "loss": 2.5207, + "step": 78200 + }, + { + "epoch": 1.0524476464421086, + "grad_norm": 0.463224858045578, + "learning_rate": 0.004980919065360583, + "loss": 2.5198, + "step": 78300 + }, + { + "epoch": 1.05379176859593, + "grad_norm": 0.5557699799537659, + "learning_rate": 0.004980853635389861, + "loss": 2.5235, + "step": 78400 + }, + { + "epoch": 1.0551358907497512, + "grad_norm": 4.43305778503418, + "learning_rate": 0.004980788093860997, + "loss": 2.5133, + "step": 78500 + }, + { + "epoch": 1.0564800129035727, + "grad_norm": 0.5371536612510681, + "learning_rate": 0.0049807224407769435, + "loss": 2.5208, + "step": 78600 + }, + { + "epoch": 1.057824135057394, + "grad_norm": 1.074479341506958, + "learning_rate": 0.004980656676140655, + "loss": 2.5064, + "step": 78700 + }, + { + "epoch": 1.0591682572112153, + "grad_norm": 2.650197744369507, + "learning_rate": 0.004980590799955096, + "loss": 2.523, + "step": 78800 + }, + { + "epoch": 1.0605123793650366, + "grad_norm": 2.4881539344787598, + "learning_rate": 0.004980524812223236, + "loss": 2.5181, + "step": 78900 + }, + { + "epoch": 1.0618565015188581, + "grad_norm": 1.3180015087127686, + "learning_rate": 0.004980458712948045, + "loss": 2.5122, + "step": 79000 + }, + { + "epoch": 1.0618565015188581, + "eval_MaskedAccuracy": 0.4343943324864396, + "eval_loss": 2.8312034606933594, + "eval_runtime": 155.8402, + "eval_samples_per_second": 407.315, + "eval_steps_per_second": 1.591, + "step": 79000 + }, + { + "epoch": 1.0632006236726794, + "grad_norm": 0.38148483633995056, + "learning_rate": 0.004980392502132505, + "loss": 2.5165, + "step": 79100 + }, + { + "epoch": 1.0645447458265007, + "grad_norm": 0.5590552091598511, + "learning_rate": 0.004980326179779604, + "loss": 2.5068, + "step": 79200 + }, + { + "epoch": 1.065888867980322, + "grad_norm": 0.4895831346511841, + "learning_rate": 0.004980259745892328, + "loss": 2.5226, + "step": 79300 + }, + { + "epoch": 1.0672329901341433, + "grad_norm": 0.5289179086685181, + "learning_rate": 0.004980193200473662, + "loss": 2.5117, + "step": 79400 + }, + { + "epoch": 1.0685771122879648, + "grad_norm": 1.0307695865631104, + "learning_rate": 0.004980126543526612, + "loss": 2.5128, + "step": 79500 + }, + { + "epoch": 1.069921234441786, + "grad_norm": 1.9092541933059692, + "learning_rate": 0.004980059775054183, + "loss": 2.5241, + "step": 79600 + }, + { + "epoch": 1.0712653565956074, + "grad_norm": 0.8294230103492737, + "learning_rate": 0.004979992895059378, + "loss": 2.5195, + "step": 79700 + }, + { + "epoch": 1.0726094787494287, + "grad_norm": 3.4225094318389893, + "learning_rate": 0.004979925903545211, + "loss": 2.517, + "step": 79800 + }, + { + "epoch": 1.0739536009032502, + "grad_norm": 0.4758341312408447, + "learning_rate": 0.004979858800514705, + "loss": 2.5176, + "step": 79900 + }, + { + "epoch": 1.0752977230570715, + "grad_norm": 0.5281692147254944, + "learning_rate": 0.00497979158597088, + "loss": 2.522, + "step": 80000 + }, + { + "epoch": 1.0752977230570715, + "eval_MaskedAccuracy": 0.4351065125485484, + "eval_loss": 2.828768491744995, + "eval_runtime": 155.597, + "eval_samples_per_second": 407.951, + "eval_steps_per_second": 1.594, + "step": 80000 + }, + { + "epoch": 1.0766418452108928, + "grad_norm": 0.611584484577179, + "learning_rate": 0.004979724259916765, + "loss": 2.5146, + "step": 80100 + }, + { + "epoch": 1.077985967364714, + "grad_norm": 0.5003161430358887, + "learning_rate": 0.0049796568223553946, + "loss": 2.519, + "step": 80200 + }, + { + "epoch": 1.0793300895185354, + "grad_norm": 0.8842087984085083, + "learning_rate": 0.004979589273289808, + "loss": 2.5081, + "step": 80300 + }, + { + "epoch": 1.0806742116723567, + "grad_norm": 0.86569744348526, + "learning_rate": 0.004979521612723048, + "loss": 2.5062, + "step": 80400 + }, + { + "epoch": 1.0820183338261782, + "grad_norm": 0.7057967782020569, + "learning_rate": 0.004979453840658162, + "loss": 2.5149, + "step": 80500 + }, + { + "epoch": 1.0833624559799995, + "grad_norm": 0.7422287464141846, + "learning_rate": 0.004979385957098205, + "loss": 2.5153, + "step": 80600 + }, + { + "epoch": 1.0847065781338208, + "grad_norm": 0.6432771682739258, + "learning_rate": 0.004979317962046237, + "loss": 2.5157, + "step": 80700 + }, + { + "epoch": 1.086050700287642, + "grad_norm": 0.5962923169136047, + "learning_rate": 0.004979249855505322, + "loss": 2.5068, + "step": 80800 + }, + { + "epoch": 1.0873948224414636, + "grad_norm": 0.939208984375, + "learning_rate": 0.004979181637478523, + "loss": 2.5066, + "step": 80900 + }, + { + "epoch": 1.0887389445952849, + "grad_norm": 0.3996613025665283, + "learning_rate": 0.004979113307968915, + "loss": 2.5086, + "step": 81000 + }, + { + "epoch": 1.0887389445952849, + "eval_MaskedAccuracy": 0.4360474147201799, + "eval_loss": 2.824847936630249, + "eval_runtime": 156.0397, + "eval_samples_per_second": 406.794, + "eval_steps_per_second": 1.589, + "step": 81000 + }, + { + "epoch": 1.0900830667491062, + "grad_norm": 0.3728293478488922, + "learning_rate": 0.004979044866979578, + "loss": 2.5134, + "step": 81100 + }, + { + "epoch": 1.0914271889029274, + "grad_norm": 1.5539531707763672, + "learning_rate": 0.0049789763145136005, + "loss": 2.5235, + "step": 81200 + }, + { + "epoch": 1.0927713110567487, + "grad_norm": 1.7607640027999878, + "learning_rate": 0.004978907650574065, + "loss": 2.5174, + "step": 81300 + }, + { + "epoch": 1.0941154332105703, + "grad_norm": 1.1798290014266968, + "learning_rate": 0.004978838875164072, + "loss": 2.5081, + "step": 81400 + }, + { + "epoch": 1.0954595553643915, + "grad_norm": 0.4024448096752167, + "learning_rate": 0.00497876998828672, + "loss": 2.5172, + "step": 81500 + }, + { + "epoch": 1.0968036775182128, + "grad_norm": 0.5295103788375854, + "learning_rate": 0.004978700989945107, + "loss": 2.5015, + "step": 81600 + }, + { + "epoch": 1.0981477996720341, + "grad_norm": 0.8088193535804749, + "learning_rate": 0.0049786318801423425, + "loss": 2.508, + "step": 81700 + }, + { + "epoch": 1.0994919218258556, + "grad_norm": 0.8756518363952637, + "learning_rate": 0.004978562658881545, + "loss": 2.5063, + "step": 81800 + }, + { + "epoch": 1.100836043979677, + "grad_norm": 4.078980922698975, + "learning_rate": 0.004978493326165831, + "loss": 2.5063, + "step": 81900 + }, + { + "epoch": 1.1021801661334982, + "grad_norm": 1.339497685432434, + "learning_rate": 0.0049784238819983295, + "loss": 2.5074, + "step": 82000 + }, + { + "epoch": 1.1021801661334982, + "eval_MaskedAccuracy": 0.43618356871003144, + "eval_loss": 2.82234525680542, + "eval_runtime": 156.3995, + "eval_samples_per_second": 405.858, + "eval_steps_per_second": 1.586, + "step": 82000 + }, + { + "epoch": 1.1035242882873195, + "grad_norm": 1.6204960346221924, + "learning_rate": 0.004978354326382158, + "loss": 2.5152, + "step": 82100 + }, + { + "epoch": 1.1048684104411408, + "grad_norm": 0.4489379823207855, + "learning_rate": 0.00497828465932046, + "loss": 2.5102, + "step": 82200 + }, + { + "epoch": 1.1062125325949623, + "grad_norm": 1.3593122959136963, + "learning_rate": 0.004978214880816373, + "loss": 2.5025, + "step": 82300 + }, + { + "epoch": 1.1075566547487836, + "grad_norm": 0.4765743017196655, + "learning_rate": 0.004978144990873039, + "loss": 2.5071, + "step": 82400 + }, + { + "epoch": 1.108900776902605, + "grad_norm": 1.4956125020980835, + "learning_rate": 0.004978074989493609, + "loss": 2.5068, + "step": 82500 + }, + { + "epoch": 1.1102448990564262, + "grad_norm": 0.820233941078186, + "learning_rate": 0.004978004876681244, + "loss": 2.5143, + "step": 82600 + }, + { + "epoch": 1.1115890212102475, + "grad_norm": 0.39537087082862854, + "learning_rate": 0.004977934652439093, + "loss": 2.5172, + "step": 82700 + }, + { + "epoch": 1.112933143364069, + "grad_norm": 2.4342026710510254, + "learning_rate": 0.004977864316770319, + "loss": 2.5119, + "step": 82800 + }, + { + "epoch": 1.1142772655178903, + "grad_norm": 1.4095677137374878, + "learning_rate": 0.004977793869678101, + "loss": 2.5082, + "step": 82900 + }, + { + "epoch": 1.1156213876717116, + "grad_norm": 0.6153007745742798, + "learning_rate": 0.004977723311165601, + "loss": 2.5059, + "step": 83000 + }, + { + "epoch": 1.1156213876717116, + "eval_MaskedAccuracy": 0.43704451908857084, + "eval_loss": 2.815913200378418, + "eval_runtime": 155.698, + "eval_samples_per_second": 407.687, + "eval_steps_per_second": 1.593, + "step": 83000 + }, + { + "epoch": 1.1169655098255329, + "grad_norm": 0.7392873764038086, + "learning_rate": 0.004977652641236012, + "loss": 2.5019, + "step": 83100 + }, + { + "epoch": 1.1183096319793542, + "grad_norm": 2.7315380573272705, + "learning_rate": 0.004977581859892506, + "loss": 2.5046, + "step": 83200 + }, + { + "epoch": 1.1196537541331757, + "grad_norm": 0.5851826667785645, + "learning_rate": 0.0049775109671382715, + "loss": 2.5031, + "step": 83300 + }, + { + "epoch": 1.120997876286997, + "grad_norm": 0.34558480978012085, + "learning_rate": 0.004977439962976512, + "loss": 2.5129, + "step": 83400 + }, + { + "epoch": 1.1223419984408183, + "grad_norm": 0.9053849577903748, + "learning_rate": 0.004977368847410425, + "loss": 2.5075, + "step": 83500 + }, + { + "epoch": 1.1236861205946396, + "grad_norm": 1.7211493253707886, + "learning_rate": 0.004977297620443216, + "loss": 2.5035, + "step": 83600 + }, + { + "epoch": 1.125030242748461, + "grad_norm": 0.7423088550567627, + "learning_rate": 0.004977226282078084, + "loss": 2.5118, + "step": 83700 + }, + { + "epoch": 1.1263743649022824, + "grad_norm": 1.4103665351867676, + "learning_rate": 0.004977154832318255, + "loss": 2.5172, + "step": 83800 + }, + { + "epoch": 1.1277184870561037, + "grad_norm": 2.3322620391845703, + "learning_rate": 0.0049770832711669415, + "loss": 2.4982, + "step": 83900 + }, + { + "epoch": 1.129062609209925, + "grad_norm": 0.7170735001564026, + "learning_rate": 0.004977011598627376, + "loss": 2.5056, + "step": 84000 + }, + { + "epoch": 1.129062609209925, + "eval_MaskedAccuracy": 0.4369403158372308, + "eval_loss": 2.8143200874328613, + "eval_runtime": 156.8035, + "eval_samples_per_second": 404.812, + "eval_steps_per_second": 1.582, + "step": 84000 + }, + { + "epoch": 1.1304067313637463, + "grad_norm": 0.4916757643222809, + "learning_rate": 0.004976939814702779, + "loss": 2.5041, + "step": 84100 + }, + { + "epoch": 1.1317508535175678, + "grad_norm": 1.2962486743927002, + "learning_rate": 0.004976867919396393, + "loss": 2.5092, + "step": 84200 + }, + { + "epoch": 1.133094975671389, + "grad_norm": 0.47311854362487793, + "learning_rate": 0.004976795912711456, + "loss": 2.5074, + "step": 84300 + }, + { + "epoch": 1.1344390978252104, + "grad_norm": 0.4036632180213928, + "learning_rate": 0.004976723794651204, + "loss": 2.5089, + "step": 84400 + }, + { + "epoch": 1.1357832199790316, + "grad_norm": 1.4147541522979736, + "learning_rate": 0.004976651565218892, + "loss": 2.5074, + "step": 84500 + }, + { + "epoch": 1.1371273421328532, + "grad_norm": 1.4491769075393677, + "learning_rate": 0.004976579224417779, + "loss": 2.5051, + "step": 84600 + }, + { + "epoch": 1.1384714642866745, + "grad_norm": 1.161087155342102, + "learning_rate": 0.0049765067722511176, + "loss": 2.5125, + "step": 84700 + }, + { + "epoch": 1.1398155864404957, + "grad_norm": 0.37587714195251465, + "learning_rate": 0.004976434208722176, + "loss": 2.5046, + "step": 84800 + }, + { + "epoch": 1.141159708594317, + "grad_norm": 0.6457163095474243, + "learning_rate": 0.004976361533834221, + "loss": 2.5107, + "step": 84900 + }, + { + "epoch": 1.1425038307481383, + "grad_norm": 0.40892913937568665, + "learning_rate": 0.004976288747590531, + "loss": 2.5095, + "step": 85000 + }, + { + "epoch": 1.1425038307481383, + "eval_MaskedAccuracy": 0.4367847754877898, + "eval_loss": 2.817359685897827, + "eval_runtime": 155.4685, + "eval_samples_per_second": 408.288, + "eval_steps_per_second": 1.595, + "step": 85000 + }, + { + "epoch": 1.1438479529019596, + "grad_norm": 0.6328030228614807, + "learning_rate": 0.00497621584999438, + "loss": 2.5027, + "step": 85100 + }, + { + "epoch": 1.1451920750557811, + "grad_norm": 0.8241310119628906, + "learning_rate": 0.004976142841049059, + "loss": 2.5109, + "step": 85200 + }, + { + "epoch": 1.1465361972096024, + "grad_norm": 0.5292043089866638, + "learning_rate": 0.0049760697207578515, + "loss": 2.5051, + "step": 85300 + }, + { + "epoch": 1.1478803193634237, + "grad_norm": 0.6570155620574951, + "learning_rate": 0.004975996489124052, + "loss": 2.4996, + "step": 85400 + }, + { + "epoch": 1.149224441517245, + "grad_norm": 0.3944413959980011, + "learning_rate": 0.0049759231461509726, + "loss": 2.5018, + "step": 85500 + }, + { + "epoch": 1.1505685636710665, + "grad_norm": 0.7858182787895203, + "learning_rate": 0.00497584969184191, + "loss": 2.4994, + "step": 85600 + }, + { + "epoch": 1.1519126858248878, + "grad_norm": 0.354902058839798, + "learning_rate": 0.004975776126200173, + "loss": 2.5109, + "step": 85700 + }, + { + "epoch": 1.153256807978709, + "grad_norm": 1.0036402940750122, + "learning_rate": 0.004975702449229073, + "loss": 2.5043, + "step": 85800 + }, + { + "epoch": 1.1546009301325304, + "grad_norm": 0.9507333636283875, + "learning_rate": 0.004975628660931937, + "loss": 2.502, + "step": 85900 + }, + { + "epoch": 1.1559450522863517, + "grad_norm": 0.5167057514190674, + "learning_rate": 0.004975554761312083, + "loss": 2.5026, + "step": 86000 + }, + { + "epoch": 1.1559450522863517, + "eval_MaskedAccuracy": 0.43768769620070525, + "eval_loss": 2.8092353343963623, + "eval_runtime": 156.668, + "eval_samples_per_second": 405.162, + "eval_steps_per_second": 1.583, + "step": 86000 + }, + { + "epoch": 1.1572891744401732, + "grad_norm": 0.5097525715827942, + "learning_rate": 0.004975480750372851, + "loss": 2.5018, + "step": 86100 + }, + { + "epoch": 1.1586332965939945, + "grad_norm": 0.35351938009262085, + "learning_rate": 0.0049754066281175605, + "loss": 2.501, + "step": 86200 + }, + { + "epoch": 1.1599774187478158, + "grad_norm": 0.8449835181236267, + "learning_rate": 0.004975332394549564, + "loss": 2.5024, + "step": 86300 + }, + { + "epoch": 1.161321540901637, + "grad_norm": 63.61347579956055, + "learning_rate": 0.004975258049672201, + "loss": 2.5018, + "step": 86400 + }, + { + "epoch": 1.1626656630554586, + "grad_norm": 0.5342414975166321, + "learning_rate": 0.004975183593488831, + "loss": 2.6116, + "step": 86500 + }, + { + "epoch": 1.16400978520928, + "grad_norm": 0.47309231758117676, + "learning_rate": 0.0049751090260028, + "loss": 2.5154, + "step": 86600 + }, + { + "epoch": 1.1653539073631012, + "grad_norm": 1.168167233467102, + "learning_rate": 0.004975034347217465, + "loss": 2.5076, + "step": 86700 + }, + { + "epoch": 1.1666980295169225, + "grad_norm": 0.49217191338539124, + "learning_rate": 0.004974959557136195, + "loss": 2.5049, + "step": 86800 + }, + { + "epoch": 1.1680421516707438, + "grad_norm": 2.020880937576294, + "learning_rate": 0.004974884655762357, + "loss": 2.5017, + "step": 86900 + }, + { + "epoch": 1.169386273824565, + "grad_norm": 1.273708701133728, + "learning_rate": 0.004974809643099329, + "loss": 2.503, + "step": 87000 + }, + { + "epoch": 1.169386273824565, + "eval_MaskedAccuracy": 0.4362821121410326, + "eval_loss": 2.820596218109131, + "eval_runtime": 155.9859, + "eval_samples_per_second": 406.934, + "eval_steps_per_second": 1.59, + "step": 87000 + }, + { + "epoch": 1.1707303959783866, + "grad_norm": 0.9947301745414734, + "learning_rate": 0.004974734519150486, + "loss": 2.5085, + "step": 87100 + }, + { + "epoch": 1.1720745181322079, + "grad_norm": 0.36364173889160156, + "learning_rate": 0.004974659283919222, + "loss": 2.52, + "step": 87200 + }, + { + "epoch": 1.1734186402860292, + "grad_norm": 1.1245728731155396, + "learning_rate": 0.0049745839374089225, + "loss": 2.5178, + "step": 87300 + }, + { + "epoch": 1.1747627624398504, + "grad_norm": 0.8172126412391663, + "learning_rate": 0.004974508479622976, + "loss": 2.5063, + "step": 87400 + }, + { + "epoch": 1.176106884593672, + "grad_norm": 0.5378974676132202, + "learning_rate": 0.004974432910564789, + "loss": 2.5085, + "step": 87500 + }, + { + "epoch": 1.1774510067474933, + "grad_norm": 1.354956030845642, + "learning_rate": 0.00497435723023777, + "loss": 2.4993, + "step": 87600 + }, + { + "epoch": 1.1787951289013145, + "grad_norm": 1.2725735902786255, + "learning_rate": 0.00497428143864532, + "loss": 2.4983, + "step": 87700 + }, + { + "epoch": 1.1801392510551358, + "grad_norm": 1.188934326171875, + "learning_rate": 0.004974205535790863, + "loss": 2.4999, + "step": 87800 + }, + { + "epoch": 1.1814833732089571, + "grad_norm": 1.1253455877304077, + "learning_rate": 0.004974129521677812, + "loss": 2.496, + "step": 87900 + }, + { + "epoch": 1.1828274953627786, + "grad_norm": 0.6546825170516968, + "learning_rate": 0.004974053396309598, + "loss": 2.5013, + "step": 88000 + }, + { + "epoch": 1.1828274953627786, + "eval_MaskedAccuracy": 0.43612094406632873, + "eval_loss": 2.817842721939087, + "eval_runtime": 155.3942, + "eval_samples_per_second": 408.484, + "eval_steps_per_second": 1.596, + "step": 88000 + }, + { + "epoch": 1.1841716175166, + "grad_norm": 0.5742254257202148, + "learning_rate": 0.004973977159689648, + "loss": 2.4927, + "step": 88100 + }, + { + "epoch": 1.1855157396704212, + "grad_norm": 0.9239218831062317, + "learning_rate": 0.0049739008118213955, + "loss": 2.4991, + "step": 88200 + }, + { + "epoch": 1.1868598618242425, + "grad_norm": 0.8331120014190674, + "learning_rate": 0.004973824352708291, + "loss": 2.5034, + "step": 88300 + }, + { + "epoch": 1.188203983978064, + "grad_norm": 0.5568376779556274, + "learning_rate": 0.004973747782353763, + "loss": 2.4976, + "step": 88400 + }, + { + "epoch": 1.1895481061318853, + "grad_norm": 0.7204086184501648, + "learning_rate": 0.004973671100761278, + "loss": 2.4977, + "step": 88500 + }, + { + "epoch": 1.1908922282857066, + "grad_norm": 0.5486904382705688, + "learning_rate": 0.004973594307934272, + "loss": 2.5017, + "step": 88600 + }, + { + "epoch": 1.192236350439528, + "grad_norm": 0.8633172512054443, + "learning_rate": 0.0049735174038762235, + "loss": 2.5038, + "step": 88700 + }, + { + "epoch": 1.1935804725933492, + "grad_norm": 0.5683305859565735, + "learning_rate": 0.0049734403885905885, + "loss": 2.5001, + "step": 88800 + }, + { + "epoch": 1.1949245947471705, + "grad_norm": 0.5062243342399597, + "learning_rate": 0.004973363262080839, + "loss": 2.5065, + "step": 88900 + }, + { + "epoch": 1.196268716900992, + "grad_norm": 3.755258321762085, + "learning_rate": 0.004973286024350451, + "loss": 2.491, + "step": 89000 + }, + { + "epoch": 1.196268716900992, + "eval_MaskedAccuracy": 0.4371673629048697, + "eval_loss": 2.814087390899658, + "eval_runtime": 156.1272, + "eval_samples_per_second": 406.566, + "eval_steps_per_second": 1.588, + "step": 89000 + }, + { + "epoch": 1.1976128390548133, + "grad_norm": 0.8349953293800354, + "learning_rate": 0.004973208675402902, + "loss": 2.5118, + "step": 89100 + }, + { + "epoch": 1.1989569612086346, + "grad_norm": 1.188719391822815, + "learning_rate": 0.004973131215241677, + "loss": 2.5103, + "step": 89200 + }, + { + "epoch": 1.200301083362456, + "grad_norm": 1.0278277397155762, + "learning_rate": 0.004973053643870272, + "loss": 2.4956, + "step": 89300 + }, + { + "epoch": 1.2016452055162774, + "grad_norm": 1.174872636795044, + "learning_rate": 0.004972975961292177, + "loss": 2.4981, + "step": 89400 + }, + { + "epoch": 1.2029893276700987, + "grad_norm": 1.0490007400512695, + "learning_rate": 0.004972898167510894, + "loss": 2.5007, + "step": 89500 + }, + { + "epoch": 1.20433344982392, + "grad_norm": 1.1336147785186768, + "learning_rate": 0.004972820262529928, + "loss": 2.5084, + "step": 89600 + }, + { + "epoch": 1.2056775719777413, + "grad_norm": 0.770378589630127, + "learning_rate": 0.004972742246352788, + "loss": 2.4995, + "step": 89700 + }, + { + "epoch": 1.2070216941315626, + "grad_norm": 1.1456156969070435, + "learning_rate": 0.00497266411898299, + "loss": 2.4966, + "step": 89800 + }, + { + "epoch": 1.208365816285384, + "grad_norm": 2.0087265968322754, + "learning_rate": 0.004972585880424056, + "loss": 2.4936, + "step": 89900 + }, + { + "epoch": 1.2097099384392054, + "grad_norm": 1.6110386848449707, + "learning_rate": 0.0049725075306795135, + "loss": 2.4972, + "step": 90000 + }, + { + "epoch": 1.2097099384392054, + "eval_MaskedAccuracy": 0.4370350689102483, + "eval_loss": 2.812059164047241, + "eval_runtime": 155.8421, + "eval_samples_per_second": 407.31, + "eval_steps_per_second": 1.591, + "step": 90000 + }, + { + "epoch": 1.2110540605930267, + "grad_norm": 0.35183417797088623, + "learning_rate": 0.004972429069752887, + "loss": 2.4858, + "step": 90100 + }, + { + "epoch": 1.212398182746848, + "grad_norm": 0.4271332621574402, + "learning_rate": 0.004972350497647718, + "loss": 2.4989, + "step": 90200 + }, + { + "epoch": 1.2137423049006695, + "grad_norm": 3.801957845687866, + "learning_rate": 0.004972271814367547, + "loss": 2.4984, + "step": 90300 + }, + { + "epoch": 1.2150864270544908, + "grad_norm": 2.1145997047424316, + "learning_rate": 0.004972193019915915, + "loss": 2.4964, + "step": 90400 + }, + { + "epoch": 1.216430549208312, + "grad_norm": 0.3601829707622528, + "learning_rate": 0.004972114114296368, + "loss": 2.5001, + "step": 90500 + }, + { + "epoch": 1.2177746713621334, + "grad_norm": 0.31775668263435364, + "learning_rate": 0.004972035097512468, + "loss": 2.4985, + "step": 90600 + }, + { + "epoch": 1.2191187935159546, + "grad_norm": 0.4949300289154053, + "learning_rate": 0.004971955969567773, + "loss": 2.4992, + "step": 90700 + }, + { + "epoch": 1.2204629156697762, + "grad_norm": 0.5731763243675232, + "learning_rate": 0.004971876730465848, + "loss": 2.4985, + "step": 90800 + }, + { + "epoch": 1.2218070378235975, + "grad_norm": 0.6746828556060791, + "learning_rate": 0.004971797380210261, + "loss": 2.4902, + "step": 90900 + }, + { + "epoch": 1.2231511599774187, + "grad_norm": 2.1242833137512207, + "learning_rate": 0.004971717918804597, + "loss": 2.5026, + "step": 91000 + }, + { + "epoch": 1.2231511599774187, + "eval_MaskedAccuracy": 0.4372796066068638, + "eval_loss": 2.8106234073638916, + "eval_runtime": 155.6365, + "eval_samples_per_second": 407.848, + "eval_steps_per_second": 1.593, + "step": 91000 + }, + { + "epoch": 1.22449528213124, + "grad_norm": 1.0051345825195312, + "learning_rate": 0.004971638346252431, + "loss": 2.484, + "step": 91100 + }, + { + "epoch": 1.2258394042850613, + "grad_norm": 0.33799469470977783, + "learning_rate": 0.0049715586625573484, + "loss": 2.4978, + "step": 91200 + }, + { + "epoch": 1.2271835264388828, + "grad_norm": 0.37209373712539673, + "learning_rate": 0.004971478867722932, + "loss": 2.5002, + "step": 91300 + }, + { + "epoch": 1.2285276485927041, + "grad_norm": 0.9765738248825073, + "learning_rate": 0.004971398961752787, + "loss": 2.4962, + "step": 91400 + }, + { + "epoch": 1.2298717707465254, + "grad_norm": 1.2135403156280518, + "learning_rate": 0.004971318944650509, + "loss": 2.4967, + "step": 91500 + }, + { + "epoch": 1.2312158929003467, + "grad_norm": 0.776421070098877, + "learning_rate": 0.004971238816419708, + "loss": 2.4915, + "step": 91600 + }, + { + "epoch": 1.232560015054168, + "grad_norm": 0.8908390998840332, + "learning_rate": 0.004971158577063992, + "loss": 2.5007, + "step": 91700 + }, + { + "epoch": 1.2339041372079895, + "grad_norm": 0.3629126250743866, + "learning_rate": 0.004971078226586978, + "loss": 2.4831, + "step": 91800 + }, + { + "epoch": 1.2352482593618108, + "grad_norm": 0.5708677172660828, + "learning_rate": 0.004970997764992286, + "loss": 2.488, + "step": 91900 + }, + { + "epoch": 1.236592381515632, + "grad_norm": 1.317496418952942, + "learning_rate": 0.004970917192283536, + "loss": 2.4921, + "step": 92000 + }, + { + "epoch": 1.236592381515632, + "eval_MaskedAccuracy": 0.43781475066312364, + "eval_loss": 2.807178020477295, + "eval_runtime": 155.1214, + "eval_samples_per_second": 409.202, + "eval_steps_per_second": 1.599, + "step": 92000 + }, + { + "epoch": 1.2379365036694534, + "grad_norm": 1.5067718029022217, + "learning_rate": 0.004970836508464369, + "loss": 2.4944, + "step": 92100 + }, + { + "epoch": 1.239280625823275, + "grad_norm": 0.6655193567276001, + "learning_rate": 0.004970755713538412, + "loss": 2.4849, + "step": 92200 + }, + { + "epoch": 1.2406247479770962, + "grad_norm": 0.9877846837043762, + "learning_rate": 0.004970674807509303, + "loss": 2.4908, + "step": 92300 + }, + { + "epoch": 1.2419688701309175, + "grad_norm": 0.47736746072769165, + "learning_rate": 0.004970593790380696, + "loss": 2.498, + "step": 92400 + }, + { + "epoch": 1.2433129922847388, + "grad_norm": 0.6927477121353149, + "learning_rate": 0.004970512662156239, + "loss": 2.4849, + "step": 92500 + }, + { + "epoch": 1.24465711443856, + "grad_norm": 0.5318709015846252, + "learning_rate": 0.004970431422839584, + "loss": 2.4965, + "step": 92600 + }, + { + "epoch": 1.2460012365923816, + "grad_norm": 3.21836519241333, + "learning_rate": 0.004970350072434393, + "loss": 2.5004, + "step": 92700 + }, + { + "epoch": 1.247345358746203, + "grad_norm": 0.5743933320045471, + "learning_rate": 0.004970268610944332, + "loss": 2.4935, + "step": 92800 + }, + { + "epoch": 1.2486894809000242, + "grad_norm": 0.4667443335056305, + "learning_rate": 0.004970187038373073, + "loss": 2.4883, + "step": 92900 + }, + { + "epoch": 1.2500336030538455, + "grad_norm": 0.6037049889564514, + "learning_rate": 0.00497010535472429, + "loss": 2.4922, + "step": 93000 + }, + { + "epoch": 1.2500336030538455, + "eval_MaskedAccuracy": 0.4379076740559415, + "eval_loss": 2.804267644882202, + "eval_runtime": 157.0744, + "eval_samples_per_second": 404.114, + "eval_steps_per_second": 1.579, + "step": 93000 + }, + { + "epoch": 1.251377725207667, + "grad_norm": 0.6085057854652405, + "learning_rate": 0.004970023560001663, + "loss": 2.4916, + "step": 93100 + }, + { + "epoch": 1.2527218473614883, + "grad_norm": 0.8414444923400879, + "learning_rate": 0.00496994165420888, + "loss": 2.4915, + "step": 93200 + }, + { + "epoch": 1.2540659695153096, + "grad_norm": 0.962984561920166, + "learning_rate": 0.004969859637349628, + "loss": 2.493, + "step": 93300 + }, + { + "epoch": 1.2554100916691309, + "grad_norm": 0.9756949543952942, + "learning_rate": 0.0049697775094276085, + "loss": 2.496, + "step": 93400 + }, + { + "epoch": 1.2567542138229522, + "grad_norm": 1.2727997303009033, + "learning_rate": 0.004969695270446518, + "loss": 2.4915, + "step": 93500 + }, + { + "epoch": 1.2580983359767735, + "grad_norm": 1.2825567722320557, + "learning_rate": 0.004969612920410054, + "loss": 2.4862, + "step": 93600 + }, + { + "epoch": 1.259442458130595, + "grad_norm": 0.7585506439208984, + "learning_rate": 0.004969530459321946, + "loss": 2.494, + "step": 93700 + }, + { + "epoch": 1.2607865802844163, + "grad_norm": 0.6428466439247131, + "learning_rate": 0.004969447887185897, + "loss": 2.4782, + "step": 93800 + }, + { + "epoch": 1.2621307024382376, + "grad_norm": 0.8756228089332581, + "learning_rate": 0.004969365204005632, + "loss": 2.4977, + "step": 93900 + }, + { + "epoch": 1.263474824592059, + "grad_norm": 0.36692315340042114, + "learning_rate": 0.004969282409784867, + "loss": 2.4931, + "step": 94000 + }, + { + "epoch": 1.263474824592059, + "eval_MaskedAccuracy": 0.43933527822037677, + "eval_loss": 2.797823429107666, + "eval_runtime": 155.1464, + "eval_samples_per_second": 409.136, + "eval_steps_per_second": 1.598, + "step": 94000 + }, + { + "epoch": 1.2648189467458804, + "grad_norm": 2.2874348163604736, + "learning_rate": 0.004969199504527345, + "loss": 2.4854, + "step": 94100 + }, + { + "epoch": 1.2661630688997016, + "grad_norm": 1.3956974744796753, + "learning_rate": 0.0049691164882367895, + "loss": 2.4901, + "step": 94200 + }, + { + "epoch": 1.267507191053523, + "grad_norm": 0.6008788347244263, + "learning_rate": 0.004969033360916954, + "loss": 2.4849, + "step": 94300 + }, + { + "epoch": 1.2688513132073442, + "grad_norm": 1.0495715141296387, + "learning_rate": 0.004968950122571579, + "loss": 2.4919, + "step": 94400 + }, + { + "epoch": 1.2701954353611655, + "grad_norm": 1.057627558708191, + "learning_rate": 0.00496886677320441, + "loss": 2.4909, + "step": 94500 + }, + { + "epoch": 1.271539557514987, + "grad_norm": 0.4978176951408386, + "learning_rate": 0.0049687833128192055, + "loss": 2.4938, + "step": 94600 + }, + { + "epoch": 1.2728836796688083, + "grad_norm": 0.5044616460800171, + "learning_rate": 0.004968699741419729, + "loss": 2.4907, + "step": 94700 + }, + { + "epoch": 1.2742278018226296, + "grad_norm": 0.37560752034187317, + "learning_rate": 0.00496861605900974, + "loss": 2.4896, + "step": 94800 + }, + { + "epoch": 1.275571923976451, + "grad_norm": 0.877223789691925, + "learning_rate": 0.004968532265593022, + "loss": 2.4882, + "step": 94900 + }, + { + "epoch": 1.2769160461302724, + "grad_norm": 1.2958379983901978, + "learning_rate": 0.004968448361173342, + "loss": 2.4856, + "step": 95000 + }, + { + "epoch": 1.2769160461302724, + "eval_MaskedAccuracy": 0.4387117447553094, + "eval_loss": 2.799860954284668, + "eval_runtime": 154.6014, + "eval_samples_per_second": 410.578, + "eval_steps_per_second": 1.604, + "step": 95000 + }, + { + "epoch": 1.2782601682840937, + "grad_norm": 0.3757873773574829, + "learning_rate": 0.004968364345754483, + "loss": 2.4836, + "step": 95100 + }, + { + "epoch": 1.279604290437915, + "grad_norm": 0.9294460415840149, + "learning_rate": 0.004968280219340219, + "loss": 2.4937, + "step": 95200 + }, + { + "epoch": 1.2809484125917363, + "grad_norm": 0.3245985805988312, + "learning_rate": 0.004968195981934356, + "loss": 2.4949, + "step": 95300 + }, + { + "epoch": 1.2822925347455576, + "grad_norm": 0.342230886220932, + "learning_rate": 0.004968111633540681, + "loss": 2.492, + "step": 95400 + }, + { + "epoch": 1.283636656899379, + "grad_norm": 1.0859248638153076, + "learning_rate": 0.004968027174162995, + "loss": 2.4855, + "step": 95500 + }, + { + "epoch": 1.2849807790532004, + "grad_norm": 0.45761579275131226, + "learning_rate": 0.0049679426038051104, + "loss": 2.4863, + "step": 95600 + }, + { + "epoch": 1.2863249012070217, + "grad_norm": 0.49789857864379883, + "learning_rate": 0.004967857922470824, + "loss": 2.4931, + "step": 95700 + }, + { + "epoch": 1.287669023360843, + "grad_norm": 0.39404183626174927, + "learning_rate": 0.004967773130163965, + "loss": 2.4774, + "step": 95800 + }, + { + "epoch": 1.2890131455146645, + "grad_norm": 0.3045353293418884, + "learning_rate": 0.0049676882268883515, + "loss": 2.4826, + "step": 95900 + }, + { + "epoch": 1.2903572676684858, + "grad_norm": 0.34618687629699707, + "learning_rate": 0.004967603212647805, + "loss": 2.4788, + "step": 96000 + }, + { + "epoch": 1.2903572676684858, + "eval_MaskedAccuracy": 0.43972966453745477, + "eval_loss": 2.7956392765045166, + "eval_runtime": 154.9374, + "eval_samples_per_second": 409.688, + "eval_steps_per_second": 1.601, + "step": 96000 + }, + { + "epoch": 1.291701389822307, + "grad_norm": 1.395040512084961, + "learning_rate": 0.004967518087446164, + "loss": 2.4897, + "step": 96100 + }, + { + "epoch": 1.2930455119761284, + "grad_norm": 0.49540796875953674, + "learning_rate": 0.0049674328512872585, + "loss": 2.4829, + "step": 96200 + }, + { + "epoch": 1.2943896341299497, + "grad_norm": 1.1871672868728638, + "learning_rate": 0.00496734750417493, + "loss": 2.4834, + "step": 96300 + }, + { + "epoch": 1.295733756283771, + "grad_norm": 0.9125702977180481, + "learning_rate": 0.004967262046113016, + "loss": 2.4926, + "step": 96400 + }, + { + "epoch": 1.2970778784375925, + "grad_norm": 0.8528000712394714, + "learning_rate": 0.004967176477105385, + "loss": 2.4924, + "step": 96500 + }, + { + "epoch": 1.2984220005914138, + "grad_norm": 0.3273971378803253, + "learning_rate": 0.0049670907971558756, + "loss": 2.481, + "step": 96600 + }, + { + "epoch": 1.299766122745235, + "grad_norm": 1.2552522420883179, + "learning_rate": 0.004967005006268358, + "loss": 2.4869, + "step": 96700 + }, + { + "epoch": 1.3011102448990564, + "grad_norm": 0.8510228991508484, + "learning_rate": 0.004966919104446687, + "loss": 2.4875, + "step": 96800 + }, + { + "epoch": 1.3024543670528779, + "grad_norm": 0.6518368124961853, + "learning_rate": 0.004966833091694741, + "loss": 2.4836, + "step": 96900 + }, + { + "epoch": 1.3037984892066992, + "grad_norm": 1.7670749425888062, + "learning_rate": 0.004966746968016394, + "loss": 2.4831, + "step": 97000 + }, + { + "epoch": 1.3037984892066992, + "eval_MaskedAccuracy": 0.4384087863650507, + "eval_loss": 2.801335573196411, + "eval_runtime": 156.4566, + "eval_samples_per_second": 405.71, + "eval_steps_per_second": 1.585, + "step": 97000 + }, + { + "epoch": 1.3051426113605205, + "grad_norm": 0.37767720222473145, + "learning_rate": 0.004966660733415522, + "loss": 2.4896, + "step": 97100 + }, + { + "epoch": 1.3064867335143417, + "grad_norm": 1.352961540222168, + "learning_rate": 0.004966574387896025, + "loss": 2.4821, + "step": 97200 + }, + { + "epoch": 1.307830855668163, + "grad_norm": 0.5152901411056519, + "learning_rate": 0.004966487931461782, + "loss": 2.4872, + "step": 97300 + }, + { + "epoch": 1.3091749778219843, + "grad_norm": 0.6184568405151367, + "learning_rate": 0.00496640136411669, + "loss": 2.4917, + "step": 97400 + }, + { + "epoch": 1.3105190999758058, + "grad_norm": 0.8382551074028015, + "learning_rate": 0.004966314685864655, + "loss": 2.4785, + "step": 97500 + }, + { + "epoch": 1.3118632221296271, + "grad_norm": 0.9581834077835083, + "learning_rate": 0.004966227896709579, + "loss": 2.4839, + "step": 97600 + }, + { + "epoch": 1.3132073442834484, + "grad_norm": 0.33205175399780273, + "learning_rate": 0.004966140996655376, + "loss": 2.4767, + "step": 97700 + }, + { + "epoch": 1.31455146643727, + "grad_norm": 1.0447673797607422, + "learning_rate": 0.0049660539857059515, + "loss": 2.491, + "step": 97800 + }, + { + "epoch": 1.3158955885910912, + "grad_norm": 1.3554537296295166, + "learning_rate": 0.0049659668638652316, + "loss": 2.4827, + "step": 97900 + }, + { + "epoch": 1.3172397107449125, + "grad_norm": 1.2488070726394653, + "learning_rate": 0.00496587963113714, + "loss": 2.4825, + "step": 98000 + }, + { + "epoch": 1.3172397107449125, + "eval_MaskedAccuracy": 0.439554404426727, + "eval_loss": 2.793813943862915, + "eval_runtime": 158.828, + "eval_samples_per_second": 399.652, + "eval_steps_per_second": 1.561, + "step": 98000 + }, + { + "epoch": 1.3185838328987338, + "grad_norm": 0.7793965935707092, + "learning_rate": 0.004965792287525609, + "loss": 2.4838, + "step": 98100 + }, + { + "epoch": 1.3199279550525551, + "grad_norm": 2.033632755279541, + "learning_rate": 0.004965704833034586, + "loss": 2.4777, + "step": 98200 + }, + { + "epoch": 1.3212720772063764, + "grad_norm": 0.8052381277084351, + "learning_rate": 0.0049656172676679905, + "loss": 2.4794, + "step": 98300 + }, + { + "epoch": 1.322616199360198, + "grad_norm": 0.6361316442489624, + "learning_rate": 0.0049655295914297825, + "loss": 2.4872, + "step": 98400 + }, + { + "epoch": 1.3239603215140192, + "grad_norm": 0.8865070343017578, + "learning_rate": 0.004965441804323918, + "loss": 2.4818, + "step": 98500 + }, + { + "epoch": 1.3253044436678405, + "grad_norm": 0.9113361239433289, + "learning_rate": 0.004965353906354336, + "loss": 2.4834, + "step": 98600 + }, + { + "epoch": 1.3266485658216618, + "grad_norm": 1.3836218118667603, + "learning_rate": 0.004965265897525005, + "loss": 2.4795, + "step": 98700 + }, + { + "epoch": 1.3279926879754833, + "grad_norm": 0.7738713026046753, + "learning_rate": 0.0049651777778398805, + "loss": 2.4869, + "step": 98800 + }, + { + "epoch": 1.3293368101293046, + "grad_norm": 1.1943604946136475, + "learning_rate": 0.004965089547302944, + "loss": 2.489, + "step": 98900 + }, + { + "epoch": 1.330680932283126, + "grad_norm": 0.4339142441749573, + "learning_rate": 0.004965001205918176, + "loss": 2.4801, + "step": 99000 + }, + { + "epoch": 1.330680932283126, + "eval_MaskedAccuracy": 0.4402919161127212, + "eval_loss": 2.789841651916504, + "eval_runtime": 155.2442, + "eval_samples_per_second": 408.878, + "eval_steps_per_second": 1.597, + "step": 99000 + }, + { + "epoch": 1.3320250544369472, + "grad_norm": 0.5905622243881226, + "learning_rate": 0.00496491275368955, + "loss": 2.4818, + "step": 99100 + }, + { + "epoch": 1.3333691765907685, + "grad_norm": 0.8844038844108582, + "learning_rate": 0.004964824190621052, + "loss": 2.4847, + "step": 99200 + }, + { + "epoch": 1.3347132987445898, + "grad_norm": 0.4743984639644623, + "learning_rate": 0.004964735516716671, + "loss": 2.4873, + "step": 99300 + }, + { + "epoch": 1.3360574208984113, + "grad_norm": 0.4455043077468872, + "learning_rate": 0.0049646467319804, + "loss": 2.4894, + "step": 99400 + }, + { + "epoch": 1.3374015430522326, + "grad_norm": 0.33678489923477173, + "learning_rate": 0.004964557836416248, + "loss": 2.4707, + "step": 99500 + }, + { + "epoch": 1.3387456652060539, + "grad_norm": 0.557610809803009, + "learning_rate": 0.004964468830028214, + "loss": 2.4761, + "step": 99600 + }, + { + "epoch": 1.3400897873598754, + "grad_norm": 1.0966076850891113, + "learning_rate": 0.00496437971282031, + "loss": 2.4776, + "step": 99700 + }, + { + "epoch": 1.3414339095136967, + "grad_norm": 1.7193931341171265, + "learning_rate": 0.00496429048479655, + "loss": 2.5095, + "step": 99800 + }, + { + "epoch": 1.342778031667518, + "grad_norm": 1.0982494354248047, + "learning_rate": 0.004964201145960957, + "loss": 2.4835, + "step": 99900 + }, + { + "epoch": 1.3441221538213393, + "grad_norm": 0.7176316976547241, + "learning_rate": 0.004964111696317556, + "loss": 2.4881, + "step": 100000 + }, + { + "epoch": 1.3441221538213393, + "eval_MaskedAccuracy": 0.439665552172269, + "eval_loss": 2.7934842109680176, + "eval_runtime": 156.2627, + "eval_samples_per_second": 406.213, + "eval_steps_per_second": 1.587, + "step": 100000 + }, + { + "epoch": 1.3454662759751606, + "grad_norm": 0.6092787384986877, + "learning_rate": 0.00496402213587037, + "loss": 2.4923, + "step": 100100 + }, + { + "epoch": 1.3468103981289818, + "grad_norm": 0.48341110348701477, + "learning_rate": 0.004963932464623445, + "loss": 2.4847, + "step": 100200 + }, + { + "epoch": 1.3481545202828034, + "grad_norm": 1.8525594472885132, + "learning_rate": 0.004963842682580817, + "loss": 2.4742, + "step": 100300 + }, + { + "epoch": 1.3494986424366247, + "grad_norm": 1.4322495460510254, + "learning_rate": 0.004963752789746537, + "loss": 2.4793, + "step": 100400 + }, + { + "epoch": 1.350842764590446, + "grad_norm": 0.881243884563446, + "learning_rate": 0.004963662786124655, + "loss": 2.4773, + "step": 100500 + }, + { + "epoch": 1.3521868867442675, + "grad_norm": 1.2258617877960205, + "learning_rate": 0.0049635726717192205, + "loss": 2.4833, + "step": 100600 + }, + { + "epoch": 1.3535310088980887, + "grad_norm": 0.9638917446136475, + "learning_rate": 0.004963482446534289, + "loss": 2.4804, + "step": 100700 + }, + { + "epoch": 1.35487513105191, + "grad_norm": 0.9644854068756104, + "learning_rate": 0.004963392110573937, + "loss": 2.4719, + "step": 100800 + }, + { + "epoch": 1.3562192532057313, + "grad_norm": 0.689267635345459, + "learning_rate": 0.0049633016638422275, + "loss": 2.4756, + "step": 100900 + }, + { + "epoch": 1.3575633753595526, + "grad_norm": 1.2867717742919922, + "learning_rate": 0.004963211106343239, + "loss": 2.4812, + "step": 101000 + }, + { + "epoch": 1.3575633753595526, + "eval_MaskedAccuracy": 0.4396971572510662, + "eval_loss": 2.793617010116577, + "eval_runtime": 157.0355, + "eval_samples_per_second": 404.214, + "eval_steps_per_second": 1.579, + "step": 101000 + }, + { + "epoch": 1.358907497513374, + "grad_norm": 1.8802392482757568, + "learning_rate": 0.00496312043808105, + "loss": 2.485, + "step": 101100 + }, + { + "epoch": 1.3602516196671954, + "grad_norm": 0.35604092478752136, + "learning_rate": 0.004963029659059752, + "loss": 2.4812, + "step": 101200 + }, + { + "epoch": 1.3615957418210167, + "grad_norm": 1.0150797367095947, + "learning_rate": 0.004962938769283431, + "loss": 2.4875, + "step": 101300 + }, + { + "epoch": 1.362939863974838, + "grad_norm": 1.3340040445327759, + "learning_rate": 0.004962847768756179, + "loss": 2.4684, + "step": 101400 + }, + { + "epoch": 1.3642839861286593, + "grad_norm": 0.38257646560668945, + "learning_rate": 0.004962756657482102, + "loss": 2.4866, + "step": 101500 + }, + { + "epoch": 1.3656281082824808, + "grad_norm": 0.6464138031005859, + "learning_rate": 0.0049626654354653025, + "loss": 2.4844, + "step": 101600 + }, + { + "epoch": 1.3669722304363021, + "grad_norm": 0.41035962104797363, + "learning_rate": 0.004962574102709892, + "loss": 2.4817, + "step": 101700 + }, + { + "epoch": 1.3683163525901234, + "grad_norm": 0.7176283001899719, + "learning_rate": 0.004962482659219983, + "loss": 2.4751, + "step": 101800 + }, + { + "epoch": 1.3696604747439447, + "grad_norm": 0.7560930848121643, + "learning_rate": 0.004962391104999701, + "loss": 2.48, + "step": 101900 + }, + { + "epoch": 1.371004596897766, + "grad_norm": 0.8050186038017273, + "learning_rate": 0.004962299440053163, + "loss": 2.4691, + "step": 102000 + }, + { + "epoch": 1.371004596897766, + "eval_MaskedAccuracy": 0.44033106657058535, + "eval_loss": 2.7889299392700195, + "eval_runtime": 155.2645, + "eval_samples_per_second": 408.825, + "eval_steps_per_second": 1.597, + "step": 102000 + }, + { + "epoch": 1.3723487190515873, + "grad_norm": 0.39613771438598633, + "learning_rate": 0.004962207664384512, + "loss": 2.4773, + "step": 102100 + }, + { + "epoch": 1.3736928412054088, + "grad_norm": 2.5852887630462646, + "learning_rate": 0.004962115777997869, + "loss": 2.479, + "step": 102200 + }, + { + "epoch": 1.37503696335923, + "grad_norm": 1.9417086839675903, + "learning_rate": 0.004962023780897379, + "loss": 2.479, + "step": 102300 + }, + { + "epoch": 1.3763810855130514, + "grad_norm": 1.185256004333496, + "learning_rate": 0.004961931673087198, + "loss": 2.4713, + "step": 102400 + }, + { + "epoch": 1.377725207666873, + "grad_norm": 0.5902850031852722, + "learning_rate": 0.004961839454571464, + "loss": 2.4805, + "step": 102500 + }, + { + "epoch": 1.3790693298206942, + "grad_norm": 0.61485755443573, + "learning_rate": 0.0049617471253543335, + "loss": 2.478, + "step": 102600 + }, + { + "epoch": 1.3804134519745155, + "grad_norm": 0.3328634798526764, + "learning_rate": 0.004961654685439971, + "loss": 2.4705, + "step": 102700 + }, + { + "epoch": 1.3817575741283368, + "grad_norm": 1.5581071376800537, + "learning_rate": 0.004961562134832535, + "loss": 2.468, + "step": 102800 + }, + { + "epoch": 1.383101696282158, + "grad_norm": 0.8161963820457458, + "learning_rate": 0.004961469473536204, + "loss": 2.4773, + "step": 102900 + }, + { + "epoch": 1.3844458184359794, + "grad_norm": 0.4553765654563904, + "learning_rate": 0.004961376701555151, + "loss": 2.4774, + "step": 103000 + }, + { + "epoch": 1.3844458184359794, + "eval_MaskedAccuracy": 0.44020060249449044, + "eval_loss": 2.7893545627593994, + "eval_runtime": 157.1906, + "eval_samples_per_second": 403.815, + "eval_steps_per_second": 1.578, + "step": 103000 + }, + { + "epoch": 1.3857899405898009, + "grad_norm": 0.7315437197685242, + "learning_rate": 0.004961283818893556, + "loss": 2.4893, + "step": 103100 + }, + { + "epoch": 1.3871340627436222, + "grad_norm": 1.7851052284240723, + "learning_rate": 0.004961190825555604, + "loss": 2.4825, + "step": 103200 + }, + { + "epoch": 1.3884781848974435, + "grad_norm": 0.7662883400917053, + "learning_rate": 0.004961097721545482, + "loss": 2.4694, + "step": 103300 + }, + { + "epoch": 1.3898223070512647, + "grad_norm": 0.3333396315574646, + "learning_rate": 0.0049610045068673855, + "loss": 2.4772, + "step": 103400 + }, + { + "epoch": 1.3911664292050863, + "grad_norm": 0.4882153570652008, + "learning_rate": 0.004960911181525515, + "loss": 2.4764, + "step": 103500 + }, + { + "epoch": 1.3925105513589076, + "grad_norm": 1.4237163066864014, + "learning_rate": 0.0049608177455240775, + "loss": 2.4822, + "step": 103600 + }, + { + "epoch": 1.3938546735127288, + "grad_norm": 0.5641297101974487, + "learning_rate": 0.004960724198867284, + "loss": 2.4779, + "step": 103700 + }, + { + "epoch": 1.3951987956665501, + "grad_norm": 0.6121580600738525, + "learning_rate": 0.00496063054155935, + "loss": 2.4663, + "step": 103800 + }, + { + "epoch": 1.3965429178203714, + "grad_norm": 0.6751658320426941, + "learning_rate": 0.004960536773604492, + "loss": 2.4718, + "step": 103900 + }, + { + "epoch": 1.3978870399741927, + "grad_norm": 0.9351543188095093, + "learning_rate": 0.004960442895006929, + "loss": 2.4855, + "step": 104000 + }, + { + "epoch": 1.3978870399741927, + "eval_MaskedAccuracy": 0.44113983122276385, + "eval_loss": 2.7834458351135254, + "eval_runtime": 156.6243, + "eval_samples_per_second": 405.276, + "eval_steps_per_second": 1.583, + "step": 104000 + }, + { + "epoch": 1.3992311621280142, + "grad_norm": 0.44563743472099304, + "learning_rate": 0.004960348905770906, + "loss": 2.478, + "step": 104100 + }, + { + "epoch": 1.4005752842818355, + "grad_norm": 0.7417739629745483, + "learning_rate": 0.004960254805900649, + "loss": 2.4782, + "step": 104200 + }, + { + "epoch": 1.4019194064356568, + "grad_norm": 0.38673731684684753, + "learning_rate": 0.004960160595400398, + "loss": 2.4703, + "step": 104300 + }, + { + "epoch": 1.4032635285894783, + "grad_norm": 0.41961851716041565, + "learning_rate": 0.004960066274274401, + "loss": 2.4758, + "step": 104400 + }, + { + "epoch": 1.4046076507432996, + "grad_norm": 1.0936461687088013, + "learning_rate": 0.00495997184252691, + "loss": 2.4711, + "step": 104500 + }, + { + "epoch": 1.405951772897121, + "grad_norm": 1.4237301349639893, + "learning_rate": 0.004959877300162172, + "loss": 2.476, + "step": 104600 + }, + { + "epoch": 1.4072958950509422, + "grad_norm": 0.5586555600166321, + "learning_rate": 0.004959782647184455, + "loss": 2.4744, + "step": 104700 + }, + { + "epoch": 1.4086400172047635, + "grad_norm": 0.5998302102088928, + "learning_rate": 0.0049596878835980115, + "loss": 2.481, + "step": 104800 + }, + { + "epoch": 1.4099841393585848, + "grad_norm": 0.6141105890274048, + "learning_rate": 0.004959593009407127, + "loss": 2.4683, + "step": 104900 + }, + { + "epoch": 1.4113282615124063, + "grad_norm": 0.6551980376243591, + "learning_rate": 0.00495949802461607, + "loss": 2.4758, + "step": 105000 + }, + { + "epoch": 1.4113282615124063, + "eval_MaskedAccuracy": 0.4408968621448549, + "eval_loss": 2.78601336479187, + "eval_runtime": 159.2788, + "eval_samples_per_second": 398.521, + "eval_steps_per_second": 1.557, + "step": 105000 + }, + { + "epoch": 1.4126723836662276, + "grad_norm": 0.6265648603439331, + "learning_rate": 0.00495940292922911, + "loss": 2.4772, + "step": 105100 + }, + { + "epoch": 1.414016505820049, + "grad_norm": 1.0922765731811523, + "learning_rate": 0.004959307723250553, + "loss": 2.4649, + "step": 105200 + }, + { + "epoch": 1.4153606279738702, + "grad_norm": 1.4056020975112915, + "learning_rate": 0.004959212406684676, + "loss": 2.4811, + "step": 105300 + }, + { + "epoch": 1.4167047501276917, + "grad_norm": 1.5905888080596924, + "learning_rate": 0.004959116979535776, + "loss": 2.4769, + "step": 105400 + }, + { + "epoch": 1.418048872281513, + "grad_norm": 0.448127418756485, + "learning_rate": 0.004959021441808149, + "loss": 2.4706, + "step": 105500 + }, + { + "epoch": 1.4193929944353343, + "grad_norm": 1.482865571975708, + "learning_rate": 0.004958925793506108, + "loss": 2.4795, + "step": 105600 + }, + { + "epoch": 1.4207371165891556, + "grad_norm": 0.37079110741615295, + "learning_rate": 0.004958830034633952, + "loss": 2.4688, + "step": 105700 + }, + { + "epoch": 1.4220812387429769, + "grad_norm": 0.398387610912323, + "learning_rate": 0.004958734165196007, + "loss": 2.473, + "step": 105800 + }, + { + "epoch": 1.4234253608967982, + "grad_norm": 0.49912792444229126, + "learning_rate": 0.004958638185196587, + "loss": 2.4701, + "step": 105900 + }, + { + "epoch": 1.4247694830506197, + "grad_norm": 0.8730640411376953, + "learning_rate": 0.004958542094640022, + "loss": 2.474, + "step": 106000 + }, + { + "epoch": 1.4247694830506197, + "eval_MaskedAccuracy": 0.44114819868603417, + "eval_loss": 2.7835192680358887, + "eval_runtime": 154.9227, + "eval_samples_per_second": 409.727, + "eval_steps_per_second": 1.601, + "step": 106000 + }, + { + "epoch": 1.426113605204441, + "grad_norm": 0.4877232313156128, + "learning_rate": 0.004958445893530628, + "loss": 2.4685, + "step": 106100 + }, + { + "epoch": 1.4274577273582623, + "grad_norm": 0.6099728345870972, + "learning_rate": 0.004958349581872748, + "loss": 2.4724, + "step": 106200 + }, + { + "epoch": 1.4288018495120838, + "grad_norm": 0.4108321964740753, + "learning_rate": 0.004958253159670722, + "loss": 2.4686, + "step": 106300 + }, + { + "epoch": 1.430145971665905, + "grad_norm": 0.5882393717765808, + "learning_rate": 0.004958156626928895, + "loss": 2.4763, + "step": 106400 + }, + { + "epoch": 1.4314900938197264, + "grad_norm": 0.7170044779777527, + "learning_rate": 0.004958059983651618, + "loss": 2.4681, + "step": 106500 + }, + { + "epoch": 1.4328342159735477, + "grad_norm": 0.8796346783638, + "learning_rate": 0.0049579632298432435, + "loss": 2.4734, + "step": 106600 + }, + { + "epoch": 1.434178338127369, + "grad_norm": 0.3585628271102905, + "learning_rate": 0.0049578663655081385, + "loss": 2.4599, + "step": 106700 + }, + { + "epoch": 1.4355224602811902, + "grad_norm": 1.9920889139175415, + "learning_rate": 0.004957769390650657, + "loss": 2.4726, + "step": 106800 + }, + { + "epoch": 1.4368665824350118, + "grad_norm": 1.0133873224258423, + "learning_rate": 0.004957672305275166, + "loss": 2.4684, + "step": 106900 + }, + { + "epoch": 1.438210704588833, + "grad_norm": 1.567334771156311, + "learning_rate": 0.004957575109386042, + "loss": 2.4643, + "step": 107000 + }, + { + "epoch": 1.438210704588833, + "eval_MaskedAccuracy": 0.4405992607707991, + "eval_loss": 2.786257743835449, + "eval_runtime": 154.0026, + "eval_samples_per_second": 412.175, + "eval_steps_per_second": 1.61, + "step": 107000 + }, + { + "epoch": 1.4395548267426543, + "grad_norm": 1.3851370811462402, + "learning_rate": 0.004957477802987676, + "loss": 2.4719, + "step": 107100 + }, + { + "epoch": 1.4408989488964756, + "grad_norm": 0.3829158842563629, + "learning_rate": 0.0049573803860844465, + "loss": 2.4618, + "step": 107200 + }, + { + "epoch": 1.4422430710502971, + "grad_norm": 0.8415204882621765, + "learning_rate": 0.004957282858680741, + "loss": 2.4691, + "step": 107300 + }, + { + "epoch": 1.4435871932041184, + "grad_norm": 1.198485016822815, + "learning_rate": 0.00495718522078095, + "loss": 2.4661, + "step": 107400 + }, + { + "epoch": 1.4449313153579397, + "grad_norm": 2.3275532722473145, + "learning_rate": 0.00495708747238948, + "loss": 2.4734, + "step": 107500 + }, + { + "epoch": 1.446275437511761, + "grad_norm": 0.5716771483421326, + "learning_rate": 0.0049569896135107355, + "loss": 2.4612, + "step": 107600 + }, + { + "epoch": 1.4476195596655823, + "grad_norm": 1.2711353302001953, + "learning_rate": 0.00495689164414912, + "loss": 2.4738, + "step": 107700 + }, + { + "epoch": 1.4489636818194038, + "grad_norm": 0.5984246730804443, + "learning_rate": 0.004956793564309049, + "loss": 2.4793, + "step": 107800 + }, + { + "epoch": 1.4503078039732251, + "grad_norm": 0.992817223072052, + "learning_rate": 0.004956695373994945, + "loss": 2.4736, + "step": 107900 + }, + { + "epoch": 1.4516519261270464, + "grad_norm": 0.7644572257995605, + "learning_rate": 0.004956597073211232, + "loss": 2.4834, + "step": 108000 + }, + { + "epoch": 1.4516519261270464, + "eval_MaskedAccuracy": 0.4405933158523938, + "eval_loss": 2.7867140769958496, + "eval_runtime": 155.9739, + "eval_samples_per_second": 406.966, + "eval_steps_per_second": 1.59, + "step": 108000 + }, + { + "epoch": 1.4529960482808677, + "grad_norm": 0.39693641662597656, + "learning_rate": 0.004956498661962332, + "loss": 2.4675, + "step": 108100 + }, + { + "epoch": 1.4543401704346892, + "grad_norm": 1.5463731288909912, + "learning_rate": 0.004956400140252694, + "loss": 2.4737, + "step": 108200 + }, + { + "epoch": 1.4556842925885105, + "grad_norm": 1.4359521865844727, + "learning_rate": 0.004956301508086743, + "loss": 2.475, + "step": 108300 + }, + { + "epoch": 1.4570284147423318, + "grad_norm": 1.5656912326812744, + "learning_rate": 0.004956202765468931, + "loss": 2.4734, + "step": 108400 + }, + { + "epoch": 1.458372536896153, + "grad_norm": 0.3067225515842438, + "learning_rate": 0.004956103912403708, + "loss": 2.4703, + "step": 108500 + }, + { + "epoch": 1.4597166590499744, + "grad_norm": 1.173917293548584, + "learning_rate": 0.0049560049488955255, + "loss": 2.4626, + "step": 108600 + }, + { + "epoch": 1.4610607812037957, + "grad_norm": 0.5422664284706116, + "learning_rate": 0.004955905874948838, + "loss": 2.4634, + "step": 108700 + }, + { + "epoch": 1.4624049033576172, + "grad_norm": 1.207939863204956, + "learning_rate": 0.004955806690568119, + "loss": 2.4676, + "step": 108800 + }, + { + "epoch": 1.4637490255114385, + "grad_norm": 1.1675505638122559, + "learning_rate": 0.0049557073957578355, + "loss": 2.4668, + "step": 108900 + }, + { + "epoch": 1.4650931476652598, + "grad_norm": 0.4084634482860565, + "learning_rate": 0.004955607990522452, + "loss": 2.4674, + "step": 109000 + }, + { + "epoch": 1.4650931476652598, + "eval_MaskedAccuracy": 0.44142586982077486, + "eval_loss": 2.781691551208496, + "eval_runtime": 154.5643, + "eval_samples_per_second": 410.677, + "eval_steps_per_second": 1.605, + "step": 109000 + }, + { + "epoch": 1.4664372698190813, + "grad_norm": 0.33248618245124817, + "learning_rate": 0.004955508474866459, + "loss": 2.47, + "step": 109100 + }, + { + "epoch": 1.4677813919729026, + "grad_norm": 1.3263065814971924, + "learning_rate": 0.004955408848794333, + "loss": 2.4666, + "step": 109200 + }, + { + "epoch": 1.4691255141267239, + "grad_norm": 0.5887484550476074, + "learning_rate": 0.004955309112310564, + "loss": 2.4736, + "step": 109300 + }, + { + "epoch": 1.4704696362805452, + "grad_norm": 0.5707719326019287, + "learning_rate": 0.004955209265419648, + "loss": 2.4723, + "step": 109400 + }, + { + "epoch": 1.4718137584343665, + "grad_norm": 1.564012885093689, + "learning_rate": 0.004955109308126084, + "loss": 2.4677, + "step": 109500 + }, + { + "epoch": 1.4731578805881878, + "grad_norm": 0.5303487181663513, + "learning_rate": 0.004955009240434372, + "loss": 2.4748, + "step": 109600 + }, + { + "epoch": 1.4745020027420093, + "grad_norm": 0.8532441258430481, + "learning_rate": 0.0049549090623490265, + "loss": 2.4605, + "step": 109700 + }, + { + "epoch": 1.4758461248958306, + "grad_norm": 1.2264505624771118, + "learning_rate": 0.004954808773874565, + "loss": 2.4663, + "step": 109800 + }, + { + "epoch": 1.4771902470496519, + "grad_norm": 0.4173908233642578, + "learning_rate": 0.0049547083750154965, + "loss": 2.4677, + "step": 109900 + }, + { + "epoch": 1.4785343692034731, + "grad_norm": 1.8136452436447144, + "learning_rate": 0.004954607865776349, + "loss": 2.4652, + "step": 110000 + }, + { + "epoch": 1.4785343692034731, + "eval_MaskedAccuracy": 0.44079433809670326, + "eval_loss": 2.7854974269866943, + "eval_runtime": 155.1144, + "eval_samples_per_second": 409.221, + "eval_steps_per_second": 1.599, + "step": 110000 + }, + { + "epoch": 1.4798784913572947, + "grad_norm": 1.8435359001159668, + "learning_rate": 0.004954507246161656, + "loss": 2.4611, + "step": 110100 + }, + { + "epoch": 1.481222613511116, + "grad_norm": 0.33811017870903015, + "learning_rate": 0.004954406516175938, + "loss": 2.4659, + "step": 110200 + }, + { + "epoch": 1.4825667356649372, + "grad_norm": 1.902899146080017, + "learning_rate": 0.00495430567582375, + "loss": 2.4629, + "step": 110300 + }, + { + "epoch": 1.4839108578187585, + "grad_norm": 2.60919451713562, + "learning_rate": 0.004954204725109627, + "loss": 2.4783, + "step": 110400 + }, + { + "epoch": 1.4852549799725798, + "grad_norm": 0.7571696639060974, + "learning_rate": 0.0049541036640381175, + "loss": 2.4551, + "step": 110500 + }, + { + "epoch": 1.4865991021264011, + "grad_norm": 0.4986100196838379, + "learning_rate": 0.004954002492613781, + "loss": 2.4625, + "step": 110600 + }, + { + "epoch": 1.4879432242802226, + "grad_norm": 1.091294765472412, + "learning_rate": 0.0049539012108411684, + "loss": 2.4686, + "step": 110700 + }, + { + "epoch": 1.489287346434044, + "grad_norm": 0.8807032704353333, + "learning_rate": 0.004953799818724848, + "loss": 2.4764, + "step": 110800 + }, + { + "epoch": 1.4906314685878652, + "grad_norm": 1.00475013256073, + "learning_rate": 0.0049536983162693895, + "loss": 2.4637, + "step": 110900 + }, + { + "epoch": 1.4919755907416867, + "grad_norm": 0.33167359232902527, + "learning_rate": 0.004953596703479359, + "loss": 2.4678, + "step": 111000 + }, + { + "epoch": 1.4919755907416867, + "eval_MaskedAccuracy": 0.44179725878685755, + "eval_loss": 2.776883602142334, + "eval_runtime": 154.4158, + "eval_samples_per_second": 411.072, + "eval_steps_per_second": 1.606, + "step": 111000 + }, + { + "epoch": 1.493319712895508, + "grad_norm": 1.6402376890182495, + "learning_rate": 0.004953494980359343, + "loss": 2.4685, + "step": 111100 + }, + { + "epoch": 1.4946638350493293, + "grad_norm": 0.7305079698562622, + "learning_rate": 0.004953393146913917, + "loss": 2.4643, + "step": 111200 + }, + { + "epoch": 1.4960079572031506, + "grad_norm": 0.560355544090271, + "learning_rate": 0.0049532912031476735, + "loss": 2.4659, + "step": 111300 + }, + { + "epoch": 1.497352079356972, + "grad_norm": 0.9456803202629089, + "learning_rate": 0.004953189149065217, + "loss": 2.4618, + "step": 111400 + }, + { + "epoch": 1.4986962015107932, + "grad_norm": 1.5349628925323486, + "learning_rate": 0.004953086984671131, + "loss": 2.4661, + "step": 111500 + }, + { + "epoch": 1.5000403236646145, + "grad_norm": 1.062753438949585, + "learning_rate": 0.004952984709970025, + "loss": 2.4618, + "step": 111600 + }, + { + "epoch": 1.501384445818436, + "grad_norm": 0.45811429619789124, + "learning_rate": 0.0049528823249665056, + "loss": 2.4586, + "step": 111700 + }, + { + "epoch": 1.5027285679722573, + "grad_norm": 0.5497962832450867, + "learning_rate": 0.004952779829665184, + "loss": 2.4693, + "step": 111800 + }, + { + "epoch": 1.5040726901260788, + "grad_norm": 0.5482380986213684, + "learning_rate": 0.004952677224070681, + "loss": 2.4623, + "step": 111900 + }, + { + "epoch": 1.5054168122799, + "grad_norm": 1.2586387395858765, + "learning_rate": 0.0049525745081876165, + "loss": 2.4669, + "step": 112000 + }, + { + "epoch": 1.5054168122799, + "eval_MaskedAccuracy": 0.44191414103263826, + "eval_loss": 2.776205539703369, + "eval_runtime": 153.7585, + "eval_samples_per_second": 412.829, + "eval_steps_per_second": 1.613, + "step": 112000 + }, + { + "epoch": 1.5067609344337214, + "grad_norm": 2.1461663246154785, + "learning_rate": 0.004952471682020625, + "loss": 2.4569, + "step": 112100 + }, + { + "epoch": 1.5081050565875427, + "grad_norm": 3.052988290786743, + "learning_rate": 0.00495236874557434, + "loss": 2.4583, + "step": 112200 + }, + { + "epoch": 1.509449178741364, + "grad_norm": 0.3190886676311493, + "learning_rate": 0.00495226569885339, + "loss": 2.4577, + "step": 112300 + }, + { + "epoch": 1.5107933008951853, + "grad_norm": 0.390209436416626, + "learning_rate": 0.0049521625418624295, + "loss": 2.4577, + "step": 112400 + }, + { + "epoch": 1.5121374230490066, + "grad_norm": 1.6563820838928223, + "learning_rate": 0.004952059274606098, + "loss": 2.4577, + "step": 112500 + }, + { + "epoch": 1.513481545202828, + "grad_norm": 0.9080681204795837, + "learning_rate": 0.004951955897089055, + "loss": 2.4618, + "step": 112600 + }, + { + "epoch": 1.5148256673566494, + "grad_norm": 0.5969460010528564, + "learning_rate": 0.0049518524093159514, + "loss": 2.472, + "step": 112700 + }, + { + "epoch": 1.5161697895104709, + "grad_norm": 0.358441025018692, + "learning_rate": 0.004951748811291452, + "loss": 2.4619, + "step": 112800 + }, + { + "epoch": 1.5175139116642922, + "grad_norm": 0.6230987906455994, + "learning_rate": 0.004951645103020229, + "loss": 2.4626, + "step": 112900 + }, + { + "epoch": 1.5188580338181135, + "grad_norm": 0.5158119797706604, + "learning_rate": 0.00495154128450696, + "loss": 2.4671, + "step": 113000 + }, + { + "epoch": 1.5188580338181135, + "eval_MaskedAccuracy": 0.4421825141264522, + "eval_loss": 2.7729415893554688, + "eval_runtime": 153.7445, + "eval_samples_per_second": 412.867, + "eval_steps_per_second": 1.613, + "step": 113000 + }, + { + "epoch": 1.5202021559719348, + "grad_norm": 1.2907798290252686, + "learning_rate": 0.004951437355756309, + "loss": 2.4693, + "step": 113100 + }, + { + "epoch": 1.521546278125756, + "grad_norm": 1.6166244745254517, + "learning_rate": 0.004951333316772972, + "loss": 2.4585, + "step": 113200 + }, + { + "epoch": 1.5228904002795773, + "grad_norm": 0.5393241047859192, + "learning_rate": 0.004951229167561628, + "loss": 2.4643, + "step": 113300 + }, + { + "epoch": 1.5242345224333986, + "grad_norm": 0.7464740872383118, + "learning_rate": 0.004951124908126974, + "loss": 2.4576, + "step": 113400 + }, + { + "epoch": 1.52557864458722, + "grad_norm": 0.47540420293807983, + "learning_rate": 0.004951020538473705, + "loss": 2.4567, + "step": 113500 + }, + { + "epoch": 1.5269227667410414, + "grad_norm": 0.3154013454914093, + "learning_rate": 0.00495091605860653, + "loss": 2.4664, + "step": 113600 + }, + { + "epoch": 1.5282668888948627, + "grad_norm": 0.5142553448677063, + "learning_rate": 0.0049508114685301455, + "loss": 2.4582, + "step": 113700 + }, + { + "epoch": 1.5296110110486842, + "grad_norm": 1.193684697151184, + "learning_rate": 0.004950706768249272, + "loss": 2.4598, + "step": 113800 + }, + { + "epoch": 1.5309551332025055, + "grad_norm": 0.3604363203048706, + "learning_rate": 0.0049506019577686316, + "loss": 2.4589, + "step": 113900 + }, + { + "epoch": 1.5322992553563268, + "grad_norm": 0.5039530396461487, + "learning_rate": 0.004950497037092934, + "loss": 2.466, + "step": 114000 + }, + { + "epoch": 1.5322992553563268, + "eval_MaskedAccuracy": 0.4417917832476142, + "eval_loss": 2.775747299194336, + "eval_runtime": 154.7989, + "eval_samples_per_second": 410.055, + "eval_steps_per_second": 1.602, + "step": 114000 + }, + { + "epoch": 1.5336433775101481, + "grad_norm": 0.35583773255348206, + "learning_rate": 0.004950392006226925, + "loss": 2.4673, + "step": 114100 + }, + { + "epoch": 1.5349874996639694, + "grad_norm": 0.35848045349121094, + "learning_rate": 0.0049502868651753185, + "loss": 2.4615, + "step": 114200 + }, + { + "epoch": 1.5363316218177907, + "grad_norm": 1.3079990148544312, + "learning_rate": 0.004950181613942862, + "loss": 2.4687, + "step": 114300 + }, + { + "epoch": 1.537675743971612, + "grad_norm": 1.409440279006958, + "learning_rate": 0.004950076252534293, + "loss": 2.4501, + "step": 114400 + }, + { + "epoch": 1.5390198661254335, + "grad_norm": 2.4961087703704834, + "learning_rate": 0.0049499707809543565, + "loss": 2.461, + "step": 114500 + }, + { + "epoch": 1.5403639882792548, + "grad_norm": 0.69174724817276, + "learning_rate": 0.0049498651992078155, + "loss": 2.4543, + "step": 114600 + }, + { + "epoch": 1.5417081104330763, + "grad_norm": 1.2737400531768799, + "learning_rate": 0.004949759507299418, + "loss": 2.4584, + "step": 114700 + }, + { + "epoch": 1.5430522325868976, + "grad_norm": 1.2695642709732056, + "learning_rate": 0.00494965370523393, + "loss": 2.4593, + "step": 114800 + }, + { + "epoch": 1.544396354740719, + "grad_norm": 0.31750351190567017, + "learning_rate": 0.004949547793016122, + "loss": 2.4551, + "step": 114900 + }, + { + "epoch": 1.5457404768945402, + "grad_norm": 0.38432586193084717, + "learning_rate": 0.004949441770650766, + "loss": 2.4576, + "step": 115000 + }, + { + "epoch": 1.5457404768945402, + "eval_MaskedAccuracy": 0.4425237323528715, + "eval_loss": 2.774029493331909, + "eval_runtime": 156.0215, + "eval_samples_per_second": 406.841, + "eval_steps_per_second": 1.59, + "step": 115000 + }, + { + "epoch": 1.5470845990483615, + "grad_norm": 0.34672319889068604, + "learning_rate": 0.00494933563814263, + "loss": 2.4499, + "step": 115100 + }, + { + "epoch": 1.5484287212021828, + "grad_norm": 0.6219214200973511, + "learning_rate": 0.004949229395496506, + "loss": 2.4494, + "step": 115200 + }, + { + "epoch": 1.549772843356004, + "grad_norm": 3.128021240234375, + "learning_rate": 0.004949123042717173, + "loss": 2.4652, + "step": 115300 + }, + { + "epoch": 1.5511169655098256, + "grad_norm": 0.5915595889091492, + "learning_rate": 0.004949016579809432, + "loss": 2.4696, + "step": 115400 + }, + { + "epoch": 1.5524610876636469, + "grad_norm": 1.1795876026153564, + "learning_rate": 0.004948910006778072, + "loss": 2.4595, + "step": 115500 + }, + { + "epoch": 1.5538052098174682, + "grad_norm": 0.6927655339241028, + "learning_rate": 0.004948803323627903, + "loss": 2.4517, + "step": 115600 + }, + { + "epoch": 1.5551493319712897, + "grad_norm": 0.35303354263305664, + "learning_rate": 0.004948696530363729, + "loss": 2.4593, + "step": 115700 + }, + { + "epoch": 1.556493454125111, + "grad_norm": 2.31588077545166, + "learning_rate": 0.004948589626990361, + "loss": 2.4566, + "step": 115800 + }, + { + "epoch": 1.5578375762789323, + "grad_norm": 0.37357497215270996, + "learning_rate": 0.004948482613512615, + "loss": 2.4553, + "step": 115900 + }, + { + "epoch": 1.5591816984327536, + "grad_norm": 1.7756373882293701, + "learning_rate": 0.0049483754899353125, + "loss": 2.4547, + "step": 116000 + }, + { + "epoch": 1.5591816984327536, + "eval_MaskedAccuracy": 0.4417388896389656, + "eval_loss": 2.7752127647399902, + "eval_runtime": 155.7219, + "eval_samples_per_second": 407.624, + "eval_steps_per_second": 1.593, + "step": 116000 + }, + { + "epoch": 1.5605258205865749, + "grad_norm": 1.2487781047821045, + "learning_rate": 0.004948268256263274, + "loss": 2.4604, + "step": 116100 + }, + { + "epoch": 1.5618699427403961, + "grad_norm": 0.5920694470405579, + "learning_rate": 0.0049481609125013384, + "loss": 2.4585, + "step": 116200 + }, + { + "epoch": 1.5632140648942174, + "grad_norm": 1.373819351196289, + "learning_rate": 0.004948053458654348, + "loss": 2.4526, + "step": 116300 + }, + { + "epoch": 1.564558187048039, + "grad_norm": 1.0958904027938843, + "learning_rate": 0.004947945894727141, + "loss": 2.4591, + "step": 116400 + }, + { + "epoch": 1.5659023092018602, + "grad_norm": 4.045643329620361, + "learning_rate": 0.004947838220724564, + "loss": 2.4613, + "step": 116500 + }, + { + "epoch": 1.5672464313556818, + "grad_norm": 1.2977932691574097, + "learning_rate": 0.004947730436651466, + "loss": 2.4564, + "step": 116600 + }, + { + "epoch": 1.568590553509503, + "grad_norm": 1.1461297273635864, + "learning_rate": 0.004947622542512704, + "loss": 2.4542, + "step": 116700 + }, + { + "epoch": 1.5699346756633243, + "grad_norm": 2.07126522064209, + "learning_rate": 0.004947514538313141, + "loss": 2.4584, + "step": 116800 + }, + { + "epoch": 1.5712787978171456, + "grad_norm": 0.6101120114326477, + "learning_rate": 0.004947406424057641, + "loss": 2.4596, + "step": 116900 + }, + { + "epoch": 1.572622919970967, + "grad_norm": 1.1047863960266113, + "learning_rate": 0.0049472981997510735, + "loss": 2.4563, + "step": 117000 + }, + { + "epoch": 1.572622919970967, + "eval_MaskedAccuracy": 0.44274268801439726, + "eval_loss": 2.769578218460083, + "eval_runtime": 156.3839, + "eval_samples_per_second": 405.899, + "eval_steps_per_second": 1.586, + "step": 117000 + }, + { + "epoch": 1.5739670421247882, + "grad_norm": 0.3559087812900543, + "learning_rate": 0.004947189865398322, + "loss": 2.4558, + "step": 117100 + }, + { + "epoch": 1.5753111642786095, + "grad_norm": 1.209089994430542, + "learning_rate": 0.004947081421004263, + "loss": 2.4592, + "step": 117200 + }, + { + "epoch": 1.576655286432431, + "grad_norm": 1.5720434188842773, + "learning_rate": 0.004946972866573786, + "loss": 2.4598, + "step": 117300 + }, + { + "epoch": 1.5779994085862523, + "grad_norm": 0.33877983689308167, + "learning_rate": 0.004946864202111778, + "loss": 2.4539, + "step": 117400 + }, + { + "epoch": 1.5793435307400736, + "grad_norm": 1.1109495162963867, + "learning_rate": 0.004946755427623137, + "loss": 2.4621, + "step": 117500 + }, + { + "epoch": 1.5806876528938951, + "grad_norm": 0.32806700468063354, + "learning_rate": 0.0049466465431127656, + "loss": 2.4553, + "step": 117600 + }, + { + "epoch": 1.5820317750477164, + "grad_norm": 1.0989540815353394, + "learning_rate": 0.004946537548585571, + "loss": 2.4632, + "step": 117700 + }, + { + "epoch": 1.5833758972015377, + "grad_norm": 0.3961617350578308, + "learning_rate": 0.004946428444046461, + "loss": 2.4529, + "step": 117800 + }, + { + "epoch": 1.584720019355359, + "grad_norm": 2.489666700363159, + "learning_rate": 0.004946319229500352, + "loss": 2.4506, + "step": 117900 + }, + { + "epoch": 1.5860641415091803, + "grad_norm": 0.2874549627304077, + "learning_rate": 0.004946209904952169, + "loss": 2.451, + "step": 118000 + }, + { + "epoch": 1.5860641415091803, + "eval_MaskedAccuracy": 0.44264748991328356, + "eval_loss": 2.770142078399658, + "eval_runtime": 153.5901, + "eval_samples_per_second": 413.282, + "eval_steps_per_second": 1.615, + "step": 118000 + }, + { + "epoch": 1.5874082636630016, + "grad_norm": 2.0315134525299072, + "learning_rate": 0.004946100470406832, + "loss": 2.4472, + "step": 118100 + }, + { + "epoch": 1.5887523858168229, + "grad_norm": 0.7130675911903381, + "learning_rate": 0.004945990925869273, + "loss": 2.4548, + "step": 118200 + }, + { + "epoch": 1.5900965079706444, + "grad_norm": 0.9596794843673706, + "learning_rate": 0.0049458812713444305, + "loss": 2.462, + "step": 118300 + }, + { + "epoch": 1.5914406301244657, + "grad_norm": 1.4742804765701294, + "learning_rate": 0.004945771506837244, + "loss": 2.4549, + "step": 118400 + }, + { + "epoch": 1.5927847522782872, + "grad_norm": 1.3615772724151611, + "learning_rate": 0.004945661632352659, + "loss": 2.4574, + "step": 118500 + }, + { + "epoch": 1.5941288744321085, + "grad_norm": 0.30037587881088257, + "learning_rate": 0.004945551647895632, + "loss": 2.4514, + "step": 118600 + }, + { + "epoch": 1.5954729965859298, + "grad_norm": 1.7664852142333984, + "learning_rate": 0.004945441553471114, + "loss": 2.452, + "step": 118700 + }, + { + "epoch": 1.596817118739751, + "grad_norm": 1.8963963985443115, + "learning_rate": 0.004945331349084064, + "loss": 2.4552, + "step": 118800 + }, + { + "epoch": 1.5981612408935724, + "grad_norm": 0.5631555318832397, + "learning_rate": 0.004945221034739453, + "loss": 2.4525, + "step": 118900 + }, + { + "epoch": 1.5995053630473937, + "grad_norm": 0.28932493925094604, + "learning_rate": 0.00494511061044224, + "loss": 2.4576, + "step": 119000 + }, + { + "epoch": 1.5995053630473937, + "eval_MaskedAccuracy": 0.44307573744928136, + "eval_loss": 2.768948793411255, + "eval_runtime": 153.3984, + "eval_samples_per_second": 413.798, + "eval_steps_per_second": 1.617, + "step": 119000 + }, + { + "epoch": 1.600849485201215, + "grad_norm": 1.037090539932251, + "learning_rate": 0.0049450000761974144, + "loss": 2.4576, + "step": 119100 + }, + { + "epoch": 1.6021936073550365, + "grad_norm": 4.012909412384033, + "learning_rate": 0.004944889432009946, + "loss": 2.4638, + "step": 119200 + }, + { + "epoch": 1.6035377295088578, + "grad_norm": 0.358288437128067, + "learning_rate": 0.004944778677884832, + "loss": 2.4572, + "step": 119300 + }, + { + "epoch": 1.6048818516626793, + "grad_norm": 0.7761825323104858, + "learning_rate": 0.004944667813827054, + "loss": 2.4585, + "step": 119400 + }, + { + "epoch": 1.6062259738165006, + "grad_norm": 1.2888187170028687, + "learning_rate": 0.004944556839841606, + "loss": 2.4593, + "step": 119500 + }, + { + "epoch": 1.6075700959703219, + "grad_norm": 0.6099095344543457, + "learning_rate": 0.0049444457559334925, + "loss": 2.4618, + "step": 119600 + }, + { + "epoch": 1.6089142181241431, + "grad_norm": 0.30497902631759644, + "learning_rate": 0.004944334562107721, + "loss": 2.4596, + "step": 119700 + }, + { + "epoch": 1.6102583402779644, + "grad_norm": 3.6389715671539307, + "learning_rate": 0.004944223258369297, + "loss": 2.4551, + "step": 119800 + }, + { + "epoch": 1.6116024624317857, + "grad_norm": 0.3001348078250885, + "learning_rate": 0.004944111844723238, + "loss": 2.4522, + "step": 119900 + }, + { + "epoch": 1.612946584585607, + "grad_norm": 0.6880463361740112, + "learning_rate": 0.004944000321174563, + "loss": 2.4455, + "step": 120000 + }, + { + "epoch": 1.612946584585607, + "eval_MaskedAccuracy": 0.44322096964619634, + "eval_loss": 2.7688114643096924, + "eval_runtime": 153.4713, + "eval_samples_per_second": 413.602, + "eval_steps_per_second": 1.616, + "step": 120000 + }, + { + "epoch": 1.6142907067394283, + "grad_norm": 0.5405184626579285, + "learning_rate": 0.0049438886877282975, + "loss": 2.4582, + "step": 120100 + }, + { + "epoch": 1.6156348288932498, + "grad_norm": 1.32143235206604, + "learning_rate": 0.004943776944389477, + "loss": 2.4501, + "step": 120200 + }, + { + "epoch": 1.6169789510470711, + "grad_norm": 0.532461941242218, + "learning_rate": 0.004943665091163127, + "loss": 2.4512, + "step": 120300 + }, + { + "epoch": 1.6183230732008926, + "grad_norm": 1.1819446086883545, + "learning_rate": 0.004943553128054288, + "loss": 2.4623, + "step": 120400 + }, + { + "epoch": 1.619667195354714, + "grad_norm": 1.5699783563613892, + "learning_rate": 0.004943441055068009, + "loss": 2.4504, + "step": 120500 + }, + { + "epoch": 1.6210113175085352, + "grad_norm": 1.6754220724105835, + "learning_rate": 0.004943328872209337, + "loss": 2.45, + "step": 120600 + }, + { + "epoch": 1.6223554396623565, + "grad_norm": 0.35783451795578003, + "learning_rate": 0.004943216579483333, + "loss": 2.4411, + "step": 120700 + }, + { + "epoch": 1.6236995618161778, + "grad_norm": 2.1771366596221924, + "learning_rate": 0.004943104176895054, + "loss": 2.4516, + "step": 120800 + }, + { + "epoch": 1.625043683969999, + "grad_norm": 1.1872481107711792, + "learning_rate": 0.0049429916644495656, + "loss": 2.4464, + "step": 120900 + }, + { + "epoch": 1.6263878061238204, + "grad_norm": 1.4642506837844849, + "learning_rate": 0.004942879042151933, + "loss": 2.461, + "step": 121000 + }, + { + "epoch": 1.6263878061238204, + "eval_MaskedAccuracy": 0.4426167755231782, + "eval_loss": 2.7695846557617188, + "eval_runtime": 153.853, + "eval_samples_per_second": 412.576, + "eval_steps_per_second": 1.612, + "step": 121000 + }, + { + "epoch": 1.627731928277642, + "grad_norm": 0.3797900080680847, + "learning_rate": 0.004942766310007232, + "loss": 2.4495, + "step": 121100 + }, + { + "epoch": 1.6290760504314632, + "grad_norm": 0.4284350872039795, + "learning_rate": 0.004942653468020548, + "loss": 2.4434, + "step": 121200 + }, + { + "epoch": 1.6304201725852847, + "grad_norm": 1.1178627014160156, + "learning_rate": 0.004942540516196953, + "loss": 2.4532, + "step": 121300 + }, + { + "epoch": 1.631764294739106, + "grad_norm": 1.3800808191299438, + "learning_rate": 0.0049424274545415425, + "loss": 2.4582, + "step": 121400 + }, + { + "epoch": 1.6331084168929273, + "grad_norm": 0.3553604781627655, + "learning_rate": 0.0049423142830594185, + "loss": 2.4546, + "step": 121500 + }, + { + "epoch": 1.6344525390467486, + "grad_norm": 0.29325976967811584, + "learning_rate": 0.004942201001755672, + "loss": 2.449, + "step": 121600 + }, + { + "epoch": 1.6357966612005699, + "grad_norm": 0.28023549914360046, + "learning_rate": 0.004942087610635415, + "loss": 2.4559, + "step": 121700 + }, + { + "epoch": 1.6371407833543912, + "grad_norm": 0.6937592029571533, + "learning_rate": 0.0049419741097037435, + "loss": 2.458, + "step": 121800 + }, + { + "epoch": 1.6384849055082125, + "grad_norm": 0.3913755416870117, + "learning_rate": 0.004941860498965778, + "loss": 2.4511, + "step": 121900 + }, + { + "epoch": 1.639829027662034, + "grad_norm": 0.817623496055603, + "learning_rate": 0.004941746778426644, + "loss": 2.4551, + "step": 122000 + }, + { + "epoch": 1.639829027662034, + "eval_MaskedAccuracy": 0.44339662189399426, + "eval_loss": 2.764580249786377, + "eval_runtime": 157.5643, + "eval_samples_per_second": 402.858, + "eval_steps_per_second": 1.574, + "step": 122000 + }, + { + "epoch": 1.6411731498158553, + "grad_norm": 0.36455515027046204, + "learning_rate": 0.004941632948091455, + "loss": 2.4556, + "step": 122100 + }, + { + "epoch": 1.6425172719696766, + "grad_norm": 0.5823009014129639, + "learning_rate": 0.004941519007965352, + "loss": 2.4511, + "step": 122200 + }, + { + "epoch": 1.643861394123498, + "grad_norm": 0.7032744884490967, + "learning_rate": 0.004941404958053464, + "loss": 2.4545, + "step": 122300 + }, + { + "epoch": 1.6452055162773194, + "grad_norm": 2.976278781890869, + "learning_rate": 0.004941290798360926, + "loss": 2.45, + "step": 122400 + }, + { + "epoch": 1.6465496384311407, + "grad_norm": 0.37186741828918457, + "learning_rate": 0.004941176528892885, + "loss": 2.4524, + "step": 122500 + }, + { + "epoch": 1.647893760584962, + "grad_norm": 1.0369256734848022, + "learning_rate": 0.004941062149654489, + "loss": 2.4522, + "step": 122600 + }, + { + "epoch": 1.6492378827387832, + "grad_norm": 0.8359701037406921, + "learning_rate": 0.004940947660650888, + "loss": 2.4618, + "step": 122700 + }, + { + "epoch": 1.6505820048926045, + "grad_norm": 0.3227209150791168, + "learning_rate": 0.004940833061887247, + "loss": 2.4419, + "step": 122800 + }, + { + "epoch": 1.6519261270464258, + "grad_norm": 0.44361066818237305, + "learning_rate": 0.00494071835336873, + "loss": 2.4504, + "step": 122900 + }, + { + "epoch": 1.6532702492002473, + "grad_norm": 0.9630382657051086, + "learning_rate": 0.004940603535100503, + "loss": 2.4494, + "step": 123000 + }, + { + "epoch": 1.6532702492002473, + "eval_MaskedAccuracy": 0.4438552018371928, + "eval_loss": 2.762709856033325, + "eval_runtime": 153.7406, + "eval_samples_per_second": 412.877, + "eval_steps_per_second": 1.613, + "step": 123000 + }, + { + "epoch": 1.6546143713540686, + "grad_norm": 1.0358442068099976, + "learning_rate": 0.004940488607087739, + "loss": 2.4472, + "step": 123100 + }, + { + "epoch": 1.6559584935078902, + "grad_norm": 1.2041329145431519, + "learning_rate": 0.004940373569335615, + "loss": 2.4454, + "step": 123200 + }, + { + "epoch": 1.6573026156617114, + "grad_norm": 1.5662907361984253, + "learning_rate": 0.004940258421849318, + "loss": 2.4495, + "step": 123300 + }, + { + "epoch": 1.6586467378155327, + "grad_norm": 0.6521201133728027, + "learning_rate": 0.004940143164634034, + "loss": 2.4478, + "step": 123400 + }, + { + "epoch": 1.659990859969354, + "grad_norm": 0.7364070415496826, + "learning_rate": 0.004940027797694953, + "loss": 2.4472, + "step": 123500 + }, + { + "epoch": 1.6613349821231753, + "grad_norm": 0.5109328031539917, + "learning_rate": 0.004939912321037278, + "loss": 2.4537, + "step": 123600 + }, + { + "epoch": 1.6626791042769966, + "grad_norm": 0.6922867298126221, + "learning_rate": 0.004939796734666213, + "loss": 2.456, + "step": 123700 + }, + { + "epoch": 1.664023226430818, + "grad_norm": 0.8259190320968628, + "learning_rate": 0.004939681038586966, + "loss": 2.4501, + "step": 123800 + }, + { + "epoch": 1.6653673485846394, + "grad_norm": 0.7042433619499207, + "learning_rate": 0.004939565232804747, + "loss": 2.4456, + "step": 123900 + }, + { + "epoch": 1.6667114707384607, + "grad_norm": 1.328395128250122, + "learning_rate": 0.0049394493173247804, + "loss": 2.4599, + "step": 124000 + }, + { + "epoch": 1.6667114707384607, + "eval_MaskedAccuracy": 0.44327534756472914, + "eval_loss": 2.7654707431793213, + "eval_runtime": 156.5562, + "eval_samples_per_second": 405.452, + "eval_steps_per_second": 1.584, + "step": 124000 + }, + { + "epoch": 1.668055592892282, + "grad_norm": 1.040299892425537, + "learning_rate": 0.0049393332921522795, + "loss": 2.4475, + "step": 124100 + }, + { + "epoch": 1.6693997150461035, + "grad_norm": 0.447637677192688, + "learning_rate": 0.004939217157292479, + "loss": 2.4522, + "step": 124200 + }, + { + "epoch": 1.6707438371999248, + "grad_norm": 0.3728015422821045, + "learning_rate": 0.004939100912750606, + "loss": 2.4541, + "step": 124300 + }, + { + "epoch": 1.672087959353746, + "grad_norm": 0.4562872350215912, + "learning_rate": 0.0049389845585319095, + "loss": 2.452, + "step": 124400 + }, + { + "epoch": 1.6734320815075674, + "grad_norm": 1.0070672035217285, + "learning_rate": 0.0049388680946416234, + "loss": 2.4478, + "step": 124500 + }, + { + "epoch": 1.6747762036613887, + "grad_norm": 0.3675072193145752, + "learning_rate": 0.004938751521084995, + "loss": 2.4464, + "step": 124600 + }, + { + "epoch": 1.67612032581521, + "grad_norm": 1.3940047025680542, + "learning_rate": 0.004938634837867282, + "loss": 2.44, + "step": 124700 + }, + { + "epoch": 1.6774644479690313, + "grad_norm": 0.6820449233055115, + "learning_rate": 0.004938518044993735, + "loss": 2.4448, + "step": 124800 + }, + { + "epoch": 1.6788085701228528, + "grad_norm": 0.2972992956638336, + "learning_rate": 0.004938401142469626, + "loss": 2.4407, + "step": 124900 + }, + { + "epoch": 1.680152692276674, + "grad_norm": 1.0912327766418457, + "learning_rate": 0.004938284130300211, + "loss": 2.4543, + "step": 125000 + }, + { + "epoch": 1.680152692276674, + "eval_MaskedAccuracy": 0.4438537899485854, + "eval_loss": 2.7616376876831055, + "eval_runtime": 153.4081, + "eval_samples_per_second": 413.772, + "eval_steps_per_second": 1.617, + "step": 125000 + }, + { + "epoch": 1.6814968144304956, + "grad_norm": 0.8189194798469543, + "learning_rate": 0.004938167008490767, + "loss": 2.4444, + "step": 125100 + }, + { + "epoch": 1.6828409365843169, + "grad_norm": 1.2573928833007812, + "learning_rate": 0.004938049777046575, + "loss": 2.4477, + "step": 125200 + }, + { + "epoch": 1.6841850587381382, + "grad_norm": 0.27490001916885376, + "learning_rate": 0.004937932435972912, + "loss": 2.4494, + "step": 125300 + }, + { + "epoch": 1.6855291808919595, + "grad_norm": 2.5110983848571777, + "learning_rate": 0.0049378149852750694, + "loss": 2.4453, + "step": 125400 + }, + { + "epoch": 1.6868733030457808, + "grad_norm": 0.5286669135093689, + "learning_rate": 0.004937697424958338, + "loss": 2.4499, + "step": 125500 + }, + { + "epoch": 1.688217425199602, + "grad_norm": 1.0157794952392578, + "learning_rate": 0.004937579755028025, + "loss": 2.4499, + "step": 125600 + }, + { + "epoch": 1.6895615473534233, + "grad_norm": 2.4944801330566406, + "learning_rate": 0.004937461975489416, + "loss": 2.4454, + "step": 125700 + }, + { + "epoch": 1.6909056695072449, + "grad_norm": 0.9877827763557434, + "learning_rate": 0.0049373440863478175, + "loss": 2.4435, + "step": 125800 + }, + { + "epoch": 1.6922497916610661, + "grad_norm": 0.28307971358299255, + "learning_rate": 0.004937226087608559, + "loss": 2.4502, + "step": 125900 + }, + { + "epoch": 1.6935939138148874, + "grad_norm": 0.6857683658599854, + "learning_rate": 0.0049371079792769395, + "loss": 2.4459, + "step": 126000 + }, + { + "epoch": 1.6935939138148874, + "eval_MaskedAccuracy": 0.4442026999946834, + "eval_loss": 2.760291576385498, + "eval_runtime": 154.5053, + "eval_samples_per_second": 410.834, + "eval_steps_per_second": 1.605, + "step": 126000 + }, + { + "epoch": 1.694938035968709, + "grad_norm": 0.3602958023548126, + "learning_rate": 0.0049369897613582864, + "loss": 2.4459, + "step": 126100 + }, + { + "epoch": 1.6962821581225302, + "grad_norm": 1.104148507118225, + "learning_rate": 0.004936871433857933, + "loss": 2.452, + "step": 126200 + }, + { + "epoch": 1.6976262802763515, + "grad_norm": 1.409397840499878, + "learning_rate": 0.004936752996781202, + "loss": 2.4469, + "step": 126300 + }, + { + "epoch": 1.6989704024301728, + "grad_norm": 1.9952348470687866, + "learning_rate": 0.004936634450133437, + "loss": 2.4584, + "step": 126400 + }, + { + "epoch": 1.7003145245839941, + "grad_norm": 1.7229875326156616, + "learning_rate": 0.004936515793919977, + "loss": 2.4491, + "step": 126500 + }, + { + "epoch": 1.7016586467378154, + "grad_norm": 0.6407601237297058, + "learning_rate": 0.00493639702814616, + "loss": 2.4525, + "step": 126600 + }, + { + "epoch": 1.7030027688916367, + "grad_norm": 1.1715171337127686, + "learning_rate": 0.004936278152817347, + "loss": 2.4403, + "step": 126700 + }, + { + "epoch": 1.7043468910454582, + "grad_norm": 0.6220793128013611, + "learning_rate": 0.004936159167938901, + "loss": 2.4487, + "step": 126800 + }, + { + "epoch": 1.7056910131992795, + "grad_norm": 0.5494009256362915, + "learning_rate": 0.004936040073516175, + "loss": 2.4331, + "step": 126900 + }, + { + "epoch": 1.707035135353101, + "grad_norm": 1.780874252319336, + "learning_rate": 0.004935920869554534, + "loss": 2.4391, + "step": 127000 + }, + { + "epoch": 1.707035135353101, + "eval_MaskedAccuracy": 0.4440303666034177, + "eval_loss": 2.7617318630218506, + "eval_runtime": 156.1849, + "eval_samples_per_second": 406.416, + "eval_steps_per_second": 1.588, + "step": 127000 + }, + { + "epoch": 1.7083792575069223, + "grad_norm": 1.470224142074585, + "learning_rate": 0.004935801556059347, + "loss": 2.4416, + "step": 127100 + }, + { + "epoch": 1.7097233796607436, + "grad_norm": 0.2865951359272003, + "learning_rate": 0.00493568213303599, + "loss": 2.4505, + "step": 127200 + }, + { + "epoch": 1.711067501814565, + "grad_norm": 1.1740427017211914, + "learning_rate": 0.004935562600489849, + "loss": 2.4505, + "step": 127300 + }, + { + "epoch": 1.7124116239683862, + "grad_norm": 0.6032086610794067, + "learning_rate": 0.0049354429584263105, + "loss": 2.4482, + "step": 127400 + }, + { + "epoch": 1.7137557461222075, + "grad_norm": 1.0448052883148193, + "learning_rate": 0.0049353232068507615, + "loss": 2.45, + "step": 127500 + }, + { + "epoch": 1.7150998682760288, + "grad_norm": 0.433826208114624, + "learning_rate": 0.004935203345768603, + "loss": 2.4493, + "step": 127600 + }, + { + "epoch": 1.7164439904298503, + "grad_norm": 1.4232683181762695, + "learning_rate": 0.004935083375185232, + "loss": 2.4463, + "step": 127700 + }, + { + "epoch": 1.7177881125836716, + "grad_norm": 1.5276943445205688, + "learning_rate": 0.004934963295106055, + "loss": 2.4501, + "step": 127800 + }, + { + "epoch": 1.719132234737493, + "grad_norm": 1.1934447288513184, + "learning_rate": 0.0049348431055364775, + "loss": 2.4529, + "step": 127900 + }, + { + "epoch": 1.7204763568913144, + "grad_norm": 0.4801059663295746, + "learning_rate": 0.004934722806481924, + "loss": 2.4412, + "step": 128000 + }, + { + "epoch": 1.7204763568913144, + "eval_MaskedAccuracy": 0.4440363882564416, + "eval_loss": 2.758544683456421, + "eval_runtime": 156.4492, + "eval_samples_per_second": 405.729, + "eval_steps_per_second": 1.585, + "step": 128000 + }, + { + "epoch": 1.7218204790451357, + "grad_norm": 0.24669547379016876, + "learning_rate": 0.004934602397947807, + "loss": 2.4503, + "step": 128100 + }, + { + "epoch": 1.723164601198957, + "grad_norm": 0.8044118285179138, + "learning_rate": 0.00493448187993956, + "loss": 2.4397, + "step": 128200 + }, + { + "epoch": 1.7245087233527783, + "grad_norm": 0.3931942582130432, + "learning_rate": 0.004934361252462605, + "loss": 2.4423, + "step": 128300 + }, + { + "epoch": 1.7258528455065996, + "grad_norm": 0.3876301944255829, + "learning_rate": 0.004934240515522382, + "loss": 2.4393, + "step": 128400 + }, + { + "epoch": 1.7271969676604209, + "grad_norm": 1.6018636226654053, + "learning_rate": 0.004934119669124325, + "loss": 2.4476, + "step": 128500 + }, + { + "epoch": 1.7285410898142424, + "grad_norm": 2.163212776184082, + "learning_rate": 0.004933998713273891, + "loss": 2.4496, + "step": 128600 + }, + { + "epoch": 1.7298852119680637, + "grad_norm": 0.32517024874687195, + "learning_rate": 0.004933877647976517, + "loss": 2.4492, + "step": 128700 + }, + { + "epoch": 1.731229334121885, + "grad_norm": 0.843076765537262, + "learning_rate": 0.00493375647323767, + "loss": 2.4452, + "step": 128800 + }, + { + "epoch": 1.7325734562757065, + "grad_norm": 0.7287679314613342, + "learning_rate": 0.004933635189062801, + "loss": 2.4411, + "step": 128900 + }, + { + "epoch": 1.7339175784295278, + "grad_norm": 0.561202347278595, + "learning_rate": 0.004933513795457381, + "loss": 2.437, + "step": 129000 + }, + { + "epoch": 1.7339175784295278, + "eval_MaskedAccuracy": 0.4451868773268175, + "eval_loss": 2.7529821395874023, + "eval_runtime": 154.118, + "eval_samples_per_second": 411.866, + "eval_steps_per_second": 1.609, + "step": 129000 + }, + { + "epoch": 1.735261700583349, + "grad_norm": 1.201848030090332, + "learning_rate": 0.0049333922924268804, + "loss": 2.4469, + "step": 129100 + }, + { + "epoch": 1.7366058227371703, + "grad_norm": 0.7610087990760803, + "learning_rate": 0.004933270679976775, + "loss": 2.4427, + "step": 129200 + }, + { + "epoch": 1.7379499448909916, + "grad_norm": 0.4432719051837921, + "learning_rate": 0.004933148958112536, + "loss": 2.4431, + "step": 129300 + }, + { + "epoch": 1.739294067044813, + "grad_norm": 0.4039577841758728, + "learning_rate": 0.004933027126839647, + "loss": 2.4416, + "step": 129400 + }, + { + "epoch": 1.7406381891986342, + "grad_norm": 0.47328197956085205, + "learning_rate": 0.004932905186163602, + "loss": 2.4409, + "step": 129500 + }, + { + "epoch": 1.7419823113524557, + "grad_norm": 0.9691961407661438, + "learning_rate": 0.004932783136089899, + "loss": 2.4415, + "step": 129600 + }, + { + "epoch": 1.743326433506277, + "grad_norm": 0.8540052771568298, + "learning_rate": 0.004932660976624034, + "loss": 2.444, + "step": 129700 + }, + { + "epoch": 1.7446705556600985, + "grad_norm": 1.1006224155426025, + "learning_rate": 0.004932538707771508, + "loss": 2.4509, + "step": 129800 + }, + { + "epoch": 1.7460146778139198, + "grad_norm": 2.2226502895355225, + "learning_rate": 0.004932416329537838, + "loss": 2.4423, + "step": 129900 + }, + { + "epoch": 1.7473587999677411, + "grad_norm": 0.42202070355415344, + "learning_rate": 0.004932293841928537, + "loss": 2.4394, + "step": 130000 + }, + { + "epoch": 1.7473587999677411, + "eval_MaskedAccuracy": 0.44459195138244084, + "eval_loss": 2.756187915802002, + "eval_runtime": 153.5748, + "eval_samples_per_second": 413.323, + "eval_steps_per_second": 1.615, + "step": 130000 + }, + { + "epoch": 1.7487029221215624, + "grad_norm": 0.47480854392051697, + "learning_rate": 0.004932171244949114, + "loss": 2.4463, + "step": 130100 + }, + { + "epoch": 1.7500470442753837, + "grad_norm": 0.9166473150253296, + "learning_rate": 0.004932048538605108, + "loss": 2.4467, + "step": 130200 + }, + { + "epoch": 1.751391166429205, + "grad_norm": 0.4468994438648224, + "learning_rate": 0.0049319257229020344, + "loss": 2.4422, + "step": 130300 + }, + { + "epoch": 1.7527352885830263, + "grad_norm": 2.049225091934204, + "learning_rate": 0.004931802797845434, + "loss": 2.4315, + "step": 130400 + }, + { + "epoch": 1.7540794107368478, + "grad_norm": 0.9961231350898743, + "learning_rate": 0.004931679763440849, + "loss": 2.4394, + "step": 130500 + }, + { + "epoch": 1.755423532890669, + "grad_norm": 0.8406715989112854, + "learning_rate": 0.00493155661969382, + "loss": 2.4436, + "step": 130600 + }, + { + "epoch": 1.7567676550444904, + "grad_norm": 0.6432211995124817, + "learning_rate": 0.004931433366609882, + "loss": 2.4352, + "step": 130700 + }, + { + "epoch": 1.758111777198312, + "grad_norm": 0.34998586773872375, + "learning_rate": 0.004931310004194613, + "loss": 2.4438, + "step": 130800 + }, + { + "epoch": 1.7594558993521332, + "grad_norm": 0.8097212910652161, + "learning_rate": 0.00493118653245356, + "loss": 2.4447, + "step": 130900 + }, + { + "epoch": 1.7608000215059545, + "grad_norm": 0.43594908714294434, + "learning_rate": 0.004931062951392285, + "loss": 2.4391, + "step": 131000 + }, + { + "epoch": 1.7608000215059545, + "eval_MaskedAccuracy": 0.4449512847956205, + "eval_loss": 2.7537848949432373, + "eval_runtime": 153.3786, + "eval_samples_per_second": 413.852, + "eval_steps_per_second": 1.617, + "step": 131000 + }, + { + "epoch": 1.7621441436597758, + "grad_norm": 0.555332362651825, + "learning_rate": 0.004930939261016357, + "loss": 2.439, + "step": 131100 + }, + { + "epoch": 1.763488265813597, + "grad_norm": 0.6408087611198425, + "learning_rate": 0.00493081546133135, + "loss": 2.4447, + "step": 131200 + }, + { + "epoch": 1.7648323879674184, + "grad_norm": 0.2732897996902466, + "learning_rate": 0.0049306915523428425, + "loss": 2.445, + "step": 131300 + }, + { + "epoch": 1.7661765101212397, + "grad_norm": 0.7091703414916992, + "learning_rate": 0.0049305675340564105, + "loss": 2.4377, + "step": 131400 + }, + { + "epoch": 1.7675206322750612, + "grad_norm": 0.3678547739982605, + "learning_rate": 0.004930443406477648, + "loss": 2.4239, + "step": 131500 + }, + { + "epoch": 1.7688647544288825, + "grad_norm": 0.7927549481391907, + "learning_rate": 0.004930319169612157, + "loss": 2.4298, + "step": 131600 + }, + { + "epoch": 1.770208876582704, + "grad_norm": 0.8828220963478088, + "learning_rate": 0.004930194823465527, + "loss": 2.4391, + "step": 131700 + }, + { + "epoch": 1.7715529987365253, + "grad_norm": 0.35567620396614075, + "learning_rate": 0.00493007036804336, + "loss": 2.4399, + "step": 131800 + }, + { + "epoch": 1.7728971208903466, + "grad_norm": 0.5886101722717285, + "learning_rate": 0.0049299458033512655, + "loss": 2.4472, + "step": 131900 + }, + { + "epoch": 1.7742412430441679, + "grad_norm": 0.3277896046638489, + "learning_rate": 0.0049298211293948594, + "loss": 2.4406, + "step": 132000 + }, + { + "epoch": 1.7742412430441679, + "eval_MaskedAccuracy": 0.44539411736195783, + "eval_loss": 2.7525792121887207, + "eval_runtime": 154.9926, + "eval_samples_per_second": 409.542, + "eval_steps_per_second": 1.6, + "step": 132000 + }, + { + "epoch": 1.7755853651979892, + "grad_norm": 0.967954695224762, + "learning_rate": 0.004929696346179746, + "loss": 2.445, + "step": 132100 + }, + { + "epoch": 1.7769294873518104, + "grad_norm": 0.38069549202919006, + "learning_rate": 0.004929571453711563, + "loss": 2.4397, + "step": 132200 + }, + { + "epoch": 1.7782736095056317, + "grad_norm": 0.7181752920150757, + "learning_rate": 0.004929446451995928, + "loss": 2.4411, + "step": 132300 + }, + { + "epoch": 1.7796177316594533, + "grad_norm": 0.7660533785820007, + "learning_rate": 0.004929321341038483, + "loss": 2.4344, + "step": 132400 + }, + { + "epoch": 1.7809618538132745, + "grad_norm": 1.3645398616790771, + "learning_rate": 0.004929196120844856, + "loss": 2.4429, + "step": 132500 + }, + { + "epoch": 1.7823059759670958, + "grad_norm": 0.40024033188819885, + "learning_rate": 0.004929070791420691, + "loss": 2.4525, + "step": 132600 + }, + { + "epoch": 1.7836500981209173, + "grad_norm": 2.2568159103393555, + "learning_rate": 0.004928945352771635, + "loss": 2.443, + "step": 132700 + }, + { + "epoch": 1.7849942202747386, + "grad_norm": 0.7219163775444031, + "learning_rate": 0.004928819804903343, + "loss": 2.4398, + "step": 132800 + }, + { + "epoch": 1.78633834242856, + "grad_norm": 1.4869344234466553, + "learning_rate": 0.004928694147821474, + "loss": 2.4397, + "step": 132900 + }, + { + "epoch": 1.7876824645823812, + "grad_norm": 0.3790728747844696, + "learning_rate": 0.0049285683815316825, + "loss": 2.4328, + "step": 133000 + }, + { + "epoch": 1.7876824645823812, + "eval_MaskedAccuracy": 0.44502501594055116, + "eval_loss": 2.751193046569824, + "eval_runtime": 154.0912, + "eval_samples_per_second": 411.938, + "eval_steps_per_second": 1.609, + "step": 133000 + }, + { + "epoch": 1.7890265867362025, + "grad_norm": 0.836524486541748, + "learning_rate": 0.00492844250603964, + "loss": 2.4433, + "step": 133100 + }, + { + "epoch": 1.7903707088900238, + "grad_norm": 0.29098621010780334, + "learning_rate": 0.004928316521351023, + "loss": 2.4376, + "step": 133200 + }, + { + "epoch": 1.791714831043845, + "grad_norm": 0.4187617003917694, + "learning_rate": 0.0049281904274715025, + "loss": 2.4439, + "step": 133300 + }, + { + "epoch": 1.7930589531976666, + "grad_norm": 1.8081835508346558, + "learning_rate": 0.004928064224406764, + "loss": 2.4372, + "step": 133400 + }, + { + "epoch": 1.794403075351488, + "grad_norm": 2.1965842247009277, + "learning_rate": 0.0049279379121624925, + "loss": 2.4302, + "step": 133500 + }, + { + "epoch": 1.7957471975053094, + "grad_norm": 1.1956915855407715, + "learning_rate": 0.004927811490744374, + "loss": 2.4379, + "step": 133600 + }, + { + "epoch": 1.7970913196591307, + "grad_norm": 0.7906240820884705, + "learning_rate": 0.004927684960158106, + "loss": 2.4423, + "step": 133700 + }, + { + "epoch": 1.798435441812952, + "grad_norm": 0.24601203203201294, + "learning_rate": 0.004927558320409392, + "loss": 2.4497, + "step": 133800 + }, + { + "epoch": 1.7997795639667733, + "grad_norm": 1.7259650230407715, + "learning_rate": 0.004927431571503945, + "loss": 2.4301, + "step": 133900 + }, + { + "epoch": 1.8011236861205946, + "grad_norm": 0.3489820957183838, + "learning_rate": 0.004927304713447465, + "loss": 2.4494, + "step": 134000 + }, + { + "epoch": 1.8011236861205946, + "eval_MaskedAccuracy": 0.4454901993570299, + "eval_loss": 2.751655101776123, + "eval_runtime": 155.8316, + "eval_samples_per_second": 407.337, + "eval_steps_per_second": 1.591, + "step": 134000 + }, + { + "epoch": 1.8024678082744159, + "grad_norm": 0.9807569980621338, + "learning_rate": 0.004927177746245671, + "loss": 2.4397, + "step": 134100 + }, + { + "epoch": 1.8038119304282372, + "grad_norm": 1.5453386306762695, + "learning_rate": 0.004927050669904284, + "loss": 2.4467, + "step": 134200 + }, + { + "epoch": 1.8051560525820587, + "grad_norm": 0.9938260912895203, + "learning_rate": 0.004926923484429032, + "loss": 2.4361, + "step": 134300 + }, + { + "epoch": 1.80650017473588, + "grad_norm": 1.328628420829773, + "learning_rate": 0.004926796189825648, + "loss": 2.4473, + "step": 134400 + }, + { + "epoch": 1.8078442968897015, + "grad_norm": 0.565514087677002, + "learning_rate": 0.004926668786099871, + "loss": 2.4383, + "step": 134500 + }, + { + "epoch": 1.8091884190435228, + "grad_norm": 3.7751681804656982, + "learning_rate": 0.004926541273257434, + "loss": 2.4386, + "step": 134600 + }, + { + "epoch": 1.810532541197344, + "grad_norm": 1.7504175901412964, + "learning_rate": 0.004926413651304083, + "loss": 2.4384, + "step": 134700 + }, + { + "epoch": 1.8118766633511654, + "grad_norm": 0.6130387783050537, + "learning_rate": 0.004926285920245568, + "loss": 2.437, + "step": 134800 + }, + { + "epoch": 1.8132207855049867, + "grad_norm": 0.5119666457176208, + "learning_rate": 0.004926158080087647, + "loss": 2.4349, + "step": 134900 + }, + { + "epoch": 1.814564907658808, + "grad_norm": 1.4035404920578003, + "learning_rate": 0.004926030130836081, + "loss": 2.4344, + "step": 135000 + }, + { + "epoch": 1.814564907658808, + "eval_MaskedAccuracy": 0.44536970526231284, + "eval_loss": 2.7522668838500977, + "eval_runtime": 154.4591, + "eval_samples_per_second": 410.957, + "eval_steps_per_second": 1.606, + "step": 135000 + }, + { + "epoch": 1.8159090298126292, + "grad_norm": 1.2096890211105347, + "learning_rate": 0.00492590207249663, + "loss": 2.4333, + "step": 135100 + }, + { + "epoch": 1.8172531519664508, + "grad_norm": 0.8168133497238159, + "learning_rate": 0.004925773905075064, + "loss": 2.4354, + "step": 135200 + }, + { + "epoch": 1.818597274120272, + "grad_norm": 0.46834301948547363, + "learning_rate": 0.004925645628577168, + "loss": 2.4345, + "step": 135300 + }, + { + "epoch": 1.8199413962740933, + "grad_norm": 0.7720035910606384, + "learning_rate": 0.004925517243008721, + "loss": 2.447, + "step": 135400 + }, + { + "epoch": 1.8212855184279149, + "grad_norm": 1.1966851949691772, + "learning_rate": 0.0049253887483755, + "loss": 2.4282, + "step": 135500 + }, + { + "epoch": 1.8226296405817362, + "grad_norm": 1.4665566682815552, + "learning_rate": 0.004925260144683294, + "loss": 2.4311, + "step": 135600 + }, + { + "epoch": 1.8239737627355574, + "grad_norm": 0.6458213925361633, + "learning_rate": 0.004925131431937897, + "loss": 2.4453, + "step": 135700 + }, + { + "epoch": 1.8253178848893787, + "grad_norm": 0.864764392375946, + "learning_rate": 0.004925002610145116, + "loss": 2.4315, + "step": 135800 + }, + { + "epoch": 1.8266620070432, + "grad_norm": 1.5035104751586914, + "learning_rate": 0.004924873679310756, + "loss": 2.435, + "step": 135900 + }, + { + "epoch": 1.8280061291970213, + "grad_norm": 0.5938438773155212, + "learning_rate": 0.004924744639440619, + "loss": 2.4461, + "step": 136000 + }, + { + "epoch": 1.8280061291970213, + "eval_MaskedAccuracy": 0.44473731371748054, + "eval_loss": 2.752194881439209, + "eval_runtime": 153.2163, + "eval_samples_per_second": 414.29, + "eval_steps_per_second": 1.619, + "step": 136000 + }, + { + "epoch": 1.8293502513508426, + "grad_norm": 1.69126558303833, + "learning_rate": 0.00492461549054052, + "loss": 2.4375, + "step": 136100 + }, + { + "epoch": 1.8306943735046641, + "grad_norm": 1.1451964378356934, + "learning_rate": 0.004924486232616281, + "loss": 2.4311, + "step": 136200 + }, + { + "epoch": 1.8320384956584854, + "grad_norm": 0.33867311477661133, + "learning_rate": 0.004924356865673731, + "loss": 2.4365, + "step": 136300 + }, + { + "epoch": 1.833382617812307, + "grad_norm": 0.7080395221710205, + "learning_rate": 0.0049242273897186955, + "loss": 2.438, + "step": 136400 + }, + { + "epoch": 1.8347267399661282, + "grad_norm": 0.8327525854110718, + "learning_rate": 0.0049240978047570055, + "loss": 2.4409, + "step": 136500 + }, + { + "epoch": 1.8360708621199495, + "grad_norm": 0.296032190322876, + "learning_rate": 0.004923968110794505, + "loss": 2.4377, + "step": 136600 + }, + { + "epoch": 1.8374149842737708, + "grad_norm": 0.8659167289733887, + "learning_rate": 0.00492383830783703, + "loss": 2.4317, + "step": 136700 + }, + { + "epoch": 1.838759106427592, + "grad_norm": 0.5200401544570923, + "learning_rate": 0.004923708395890436, + "loss": 2.445, + "step": 136800 + }, + { + "epoch": 1.8401032285814134, + "grad_norm": 1.0861247777938843, + "learning_rate": 0.004923578374960567, + "loss": 2.4332, + "step": 136900 + }, + { + "epoch": 1.8414473507352347, + "grad_norm": 0.3223403990268707, + "learning_rate": 0.004923448245053295, + "loss": 2.4422, + "step": 137000 + }, + { + "epoch": 1.8414473507352347, + "eval_MaskedAccuracy": 0.4446580648304679, + "eval_loss": 2.7543158531188965, + "eval_runtime": 153.5392, + "eval_samples_per_second": 413.419, + "eval_steps_per_second": 1.615, + "step": 137000 + }, + { + "epoch": 1.8427914728890562, + "grad_norm": 0.5863522887229919, + "learning_rate": 0.004923318006174473, + "loss": 2.4385, + "step": 137100 + }, + { + "epoch": 1.8441355950428775, + "grad_norm": 0.2514398694038391, + "learning_rate": 0.004923187658329979, + "loss": 2.4398, + "step": 137200 + }, + { + "epoch": 1.8454797171966988, + "grad_norm": 1.2516361474990845, + "learning_rate": 0.004923057201525681, + "loss": 2.4374, + "step": 137300 + }, + { + "epoch": 1.8468238393505203, + "grad_norm": 0.6415001153945923, + "learning_rate": 0.004922926635767458, + "loss": 2.4277, + "step": 137400 + }, + { + "epoch": 1.8481679615043416, + "grad_norm": 0.34172287583351135, + "learning_rate": 0.004922795961061186, + "loss": 2.425, + "step": 137500 + }, + { + "epoch": 1.8495120836581629, + "grad_norm": 2.2421274185180664, + "learning_rate": 0.004922665177412763, + "loss": 2.4364, + "step": 137600 + }, + { + "epoch": 1.8508562058119842, + "grad_norm": 0.48479756712913513, + "learning_rate": 0.00492253428482808, + "loss": 2.4314, + "step": 137700 + }, + { + "epoch": 1.8522003279658055, + "grad_norm": 0.8086810111999512, + "learning_rate": 0.00492240328331303, + "loss": 2.4392, + "step": 137800 + }, + { + "epoch": 1.8535444501196268, + "grad_norm": 0.8768898844718933, + "learning_rate": 0.004922272172873519, + "loss": 2.44, + "step": 137900 + }, + { + "epoch": 1.854888572273448, + "grad_norm": 0.7095423936843872, + "learning_rate": 0.004922140953515459, + "loss": 2.4383, + "step": 138000 + }, + { + "epoch": 1.854888572273448, + "eval_MaskedAccuracy": 0.44575168545661686, + "eval_loss": 2.747347116470337, + "eval_runtime": 154.9017, + "eval_samples_per_second": 409.782, + "eval_steps_per_second": 1.601, + "step": 138000 + }, + { + "epoch": 1.8562326944272696, + "grad_norm": 0.3725392818450928, + "learning_rate": 0.004922009625244753, + "loss": 2.4307, + "step": 138100 + }, + { + "epoch": 1.8575768165810909, + "grad_norm": 0.3521710932254791, + "learning_rate": 0.004921878188067327, + "loss": 2.4272, + "step": 138200 + }, + { + "epoch": 1.8589209387349124, + "grad_norm": 1.7538436651229858, + "learning_rate": 0.004921746641989098, + "loss": 2.4312, + "step": 138300 + }, + { + "epoch": 1.8602650608887337, + "grad_norm": 0.7190144658088684, + "learning_rate": 0.004921614987015991, + "loss": 2.4334, + "step": 138400 + }, + { + "epoch": 1.861609183042555, + "grad_norm": 0.3224917948246002, + "learning_rate": 0.004921483223153946, + "loss": 2.4315, + "step": 138500 + }, + { + "epoch": 1.8629533051963763, + "grad_norm": 0.6861767172813416, + "learning_rate": 0.004921351350408901, + "loss": 2.432, + "step": 138600 + }, + { + "epoch": 1.8642974273501975, + "grad_norm": 0.32877910137176514, + "learning_rate": 0.004921219368786787, + "loss": 2.4436, + "step": 138700 + }, + { + "epoch": 1.8656415495040188, + "grad_norm": 0.8731430172920227, + "learning_rate": 0.004921087278293557, + "loss": 2.4346, + "step": 138800 + }, + { + "epoch": 1.8669856716578401, + "grad_norm": 0.31697648763656616, + "learning_rate": 0.0049209550789351625, + "loss": 2.4353, + "step": 138900 + }, + { + "epoch": 1.8683297938116616, + "grad_norm": 0.30971816182136536, + "learning_rate": 0.004920822770717562, + "loss": 2.4242, + "step": 139000 + }, + { + "epoch": 1.8683297938116616, + "eval_MaskedAccuracy": 0.4455395990497761, + "eval_loss": 2.749335289001465, + "eval_runtime": 153.4613, + "eval_samples_per_second": 413.629, + "eval_steps_per_second": 1.616, + "step": 139000 + }, + { + "epoch": 1.869673915965483, + "grad_norm": 0.32947519421577454, + "learning_rate": 0.004920690353646715, + "loss": 2.4308, + "step": 139100 + }, + { + "epoch": 1.8710180381193042, + "grad_norm": 3.411869525909424, + "learning_rate": 0.004920557827728589, + "loss": 2.426, + "step": 139200 + }, + { + "epoch": 1.8723621602731257, + "grad_norm": 0.3665344715118408, + "learning_rate": 0.004920425192969151, + "loss": 2.4347, + "step": 139300 + }, + { + "epoch": 1.873706282426947, + "grad_norm": 0.7894236445426941, + "learning_rate": 0.0049202924493743794, + "loss": 2.435, + "step": 139400 + }, + { + "epoch": 1.8750504045807683, + "grad_norm": 0.37953993678092957, + "learning_rate": 0.00492015959695027, + "loss": 2.4363, + "step": 139500 + }, + { + "epoch": 1.8763945267345896, + "grad_norm": 0.32298392057418823, + "learning_rate": 0.004920026635702793, + "loss": 2.4356, + "step": 139600 + }, + { + "epoch": 1.877738648888411, + "grad_norm": 1.1138973236083984, + "learning_rate": 0.004919893565637941, + "loss": 2.4246, + "step": 139700 + }, + { + "epoch": 1.8790827710422322, + "grad_norm": 0.4021407663822174, + "learning_rate": 0.00491976038676171, + "loss": 2.4395, + "step": 139800 + }, + { + "epoch": 1.8804268931960535, + "grad_norm": 0.37863534688949585, + "learning_rate": 0.004919627099080102, + "loss": 2.4341, + "step": 139900 + }, + { + "epoch": 1.881771015349875, + "grad_norm": 1.6591562032699585, + "learning_rate": 0.004919493702599126, + "loss": 2.4329, + "step": 140000 + }, + { + "epoch": 1.881771015349875, + "eval_MaskedAccuracy": 0.4456517448251654, + "eval_loss": 2.7490234375, + "eval_runtime": 144.1126, + "eval_samples_per_second": 440.461, + "eval_steps_per_second": 1.721, + "step": 140000 + }, + { + "epoch": 1.8831151375036963, + "grad_norm": 0.8508136868476868, + "learning_rate": 0.004919360197324791, + "loss": 2.4389, + "step": 140100 + }, + { + "epoch": 1.8844592596575178, + "grad_norm": 0.9984667301177979, + "learning_rate": 0.004919226583263112, + "loss": 2.4296, + "step": 140200 + }, + { + "epoch": 1.885803381811339, + "grad_norm": 1.3535406589508057, + "learning_rate": 0.004919092860420102, + "loss": 2.4339, + "step": 140300 + }, + { + "epoch": 1.8871475039651604, + "grad_norm": 0.6479485630989075, + "learning_rate": 0.004918959028801795, + "loss": 2.4319, + "step": 140400 + }, + { + "epoch": 1.8884916261189817, + "grad_norm": 0.6749982237815857, + "learning_rate": 0.004918825088414224, + "loss": 2.433, + "step": 140500 + }, + { + "epoch": 1.889835748272803, + "grad_norm": 0.31086307764053345, + "learning_rate": 0.0049186910392634235, + "loss": 2.435, + "step": 140600 + }, + { + "epoch": 1.8911798704266243, + "grad_norm": 0.5695422291755676, + "learning_rate": 0.0049185568813554185, + "loss": 2.432, + "step": 140700 + }, + { + "epoch": 1.8925239925804456, + "grad_norm": 0.7300012707710266, + "learning_rate": 0.004918422614696274, + "loss": 2.4368, + "step": 140800 + }, + { + "epoch": 1.893868114734267, + "grad_norm": 0.415073424577713, + "learning_rate": 0.004918288239292025, + "loss": 2.443, + "step": 140900 + }, + { + "epoch": 1.8952122368880884, + "grad_norm": 0.9157136678695679, + "learning_rate": 0.0049181537551487315, + "loss": 2.4297, + "step": 141000 + }, + { + "epoch": 1.8952122368880884, + "eval_MaskedAccuracy": 0.4457131985564527, + "eval_loss": 2.7454721927642822, + "eval_runtime": 153.2602, + "eval_samples_per_second": 414.172, + "eval_steps_per_second": 1.618, + "step": 141000 + }, + { + "epoch": 1.89655635904191, + "grad_norm": 0.43132588267326355, + "learning_rate": 0.00491801916227246, + "loss": 2.4381, + "step": 141100 + }, + { + "epoch": 1.8979004811957312, + "grad_norm": 1.6390697956085205, + "learning_rate": 0.004917884460669262, + "loss": 2.4192, + "step": 141200 + }, + { + "epoch": 1.8992446033495525, + "grad_norm": 0.5319965481758118, + "learning_rate": 0.004917749650345217, + "loss": 2.4424, + "step": 141300 + }, + { + "epoch": 1.9005887255033738, + "grad_norm": 0.43463683128356934, + "learning_rate": 0.004917614731306398, + "loss": 2.4381, + "step": 141400 + }, + { + "epoch": 1.901932847657195, + "grad_norm": 1.1953731775283813, + "learning_rate": 0.004917479703558886, + "loss": 2.4382, + "step": 141500 + }, + { + "epoch": 1.9032769698110164, + "grad_norm": 3.3171629905700684, + "learning_rate": 0.004917344567108767, + "loss": 2.4285, + "step": 141600 + }, + { + "epoch": 1.9046210919648376, + "grad_norm": 1.354646921157837, + "learning_rate": 0.004917209321962115, + "loss": 2.4329, + "step": 141700 + }, + { + "epoch": 1.9059652141186592, + "grad_norm": 0.46209368109703064, + "learning_rate": 0.004917073968125036, + "loss": 2.4287, + "step": 141800 + }, + { + "epoch": 1.9073093362724804, + "grad_norm": 0.2556077241897583, + "learning_rate": 0.004916938505603623, + "loss": 2.4351, + "step": 141900 + }, + { + "epoch": 1.9086534584263017, + "grad_norm": 0.9437853097915649, + "learning_rate": 0.004916802934403986, + "loss": 2.4264, + "step": 142000 + }, + { + "epoch": 1.9086534584263017, + "eval_MaskedAccuracy": 0.44617851609047027, + "eval_loss": 2.7451670169830322, + "eval_runtime": 156.2618, + "eval_samples_per_second": 406.216, + "eval_steps_per_second": 1.587, + "step": 142000 + }, + { + "epoch": 1.9099975805801233, + "grad_norm": 0.6614341139793396, + "learning_rate": 0.004916667254532234, + "loss": 2.4258, + "step": 142100 + }, + { + "epoch": 1.9113417027339445, + "grad_norm": 2.363129138946533, + "learning_rate": 0.004916531465994474, + "loss": 2.4318, + "step": 142200 + }, + { + "epoch": 1.9126858248877658, + "grad_norm": 0.27388179302215576, + "learning_rate": 0.004916395568796826, + "loss": 2.4305, + "step": 142300 + }, + { + "epoch": 1.9140299470415871, + "grad_norm": 1.5506696701049805, + "learning_rate": 0.004916259562945416, + "loss": 2.4361, + "step": 142400 + }, + { + "epoch": 1.9153740691954084, + "grad_norm": 0.8063324093818665, + "learning_rate": 0.004916123448446375, + "loss": 2.4362, + "step": 142500 + }, + { + "epoch": 1.9167181913492297, + "grad_norm": 0.9321148991584778, + "learning_rate": 0.004915987225305831, + "loss": 2.4293, + "step": 142600 + }, + { + "epoch": 1.918062313503051, + "grad_norm": 0.9221600294113159, + "learning_rate": 0.004915850893529926, + "loss": 2.4282, + "step": 142700 + }, + { + "epoch": 1.9194064356568725, + "grad_norm": 0.31020745635032654, + "learning_rate": 0.004915714453124789, + "loss": 2.4326, + "step": 142800 + }, + { + "epoch": 1.9207505578106938, + "grad_norm": 0.9556496143341064, + "learning_rate": 0.004915577904096585, + "loss": 2.4317, + "step": 142900 + }, + { + "epoch": 1.9220946799645153, + "grad_norm": 0.9458506107330322, + "learning_rate": 0.004915441246451457, + "loss": 2.4328, + "step": 143000 + }, + { + "epoch": 1.9220946799645153, + "eval_MaskedAccuracy": 0.44678351331325233, + "eval_loss": 2.7399027347564697, + "eval_runtime": 154.7107, + "eval_samples_per_second": 410.288, + "eval_steps_per_second": 1.603, + "step": 143000 + }, + { + "epoch": 1.9234388021183366, + "grad_norm": 0.26708322763442993, + "learning_rate": 0.004915304480195574, + "loss": 2.4305, + "step": 143100 + }, + { + "epoch": 1.924782924272158, + "grad_norm": 0.42214831709861755, + "learning_rate": 0.004915167605335087, + "loss": 2.4342, + "step": 143200 + }, + { + "epoch": 1.9261270464259792, + "grad_norm": 1.0620460510253906, + "learning_rate": 0.00491503062187617, + "loss": 2.4318, + "step": 143300 + }, + { + "epoch": 1.9274711685798005, + "grad_norm": 0.45594775676727295, + "learning_rate": 0.004914893529824991, + "loss": 2.4298, + "step": 143400 + }, + { + "epoch": 1.9288152907336218, + "grad_norm": 0.8724369406700134, + "learning_rate": 0.00491475632918773, + "loss": 2.4303, + "step": 143500 + }, + { + "epoch": 1.930159412887443, + "grad_norm": 1.169735074043274, + "learning_rate": 0.004914619019970569, + "loss": 2.4335, + "step": 143600 + }, + { + "epoch": 1.9315035350412646, + "grad_norm": 0.28445443511009216, + "learning_rate": 0.00491448160217969, + "loss": 2.4301, + "step": 143700 + }, + { + "epoch": 1.9328476571950859, + "grad_norm": 1.0240504741668701, + "learning_rate": 0.004914344075821288, + "loss": 2.4395, + "step": 143800 + }, + { + "epoch": 1.9341917793489072, + "grad_norm": 0.2876899242401123, + "learning_rate": 0.004914206440901561, + "loss": 2.4273, + "step": 143900 + }, + { + "epoch": 1.9355359015027287, + "grad_norm": 1.793506383895874, + "learning_rate": 0.004914068697426708, + "loss": 2.4316, + "step": 144000 + }, + { + "epoch": 1.9355359015027287, + "eval_MaskedAccuracy": 0.4459802489403792, + "eval_loss": 2.7441201210021973, + "eval_runtime": 159.5024, + "eval_samples_per_second": 397.963, + "eval_steps_per_second": 1.555, + "step": 144000 + }, + { + "epoch": 1.93688002365655, + "grad_norm": 0.4511992931365967, + "learning_rate": 0.004913930845402937, + "loss": 2.4312, + "step": 144100 + }, + { + "epoch": 1.9382241458103713, + "grad_norm": 1.3304485082626343, + "learning_rate": 0.004913792884836461, + "loss": 2.4234, + "step": 144200 + }, + { + "epoch": 1.9395682679641926, + "grad_norm": 0.7679903507232666, + "learning_rate": 0.004913654815733498, + "loss": 2.4288, + "step": 144300 + }, + { + "epoch": 1.9409123901180139, + "grad_norm": 0.2566910982131958, + "learning_rate": 0.004913516638100267, + "loss": 2.4307, + "step": 144400 + }, + { + "epoch": 1.9422565122718352, + "grad_norm": 0.3400803804397583, + "learning_rate": 0.004913378351942991, + "loss": 2.4325, + "step": 144500 + }, + { + "epoch": 1.9436006344256564, + "grad_norm": 0.5881883502006531, + "learning_rate": 0.0049132399572679045, + "loss": 2.4283, + "step": 144600 + }, + { + "epoch": 1.944944756579478, + "grad_norm": 2.5710055828094482, + "learning_rate": 0.004913101454081242, + "loss": 2.4259, + "step": 144700 + }, + { + "epoch": 1.9462888787332993, + "grad_norm": 0.6169648766517639, + "learning_rate": 0.0049129628423892424, + "loss": 2.4406, + "step": 144800 + }, + { + "epoch": 1.9476330008871208, + "grad_norm": 0.541467010974884, + "learning_rate": 0.004912824122198156, + "loss": 2.4308, + "step": 144900 + }, + { + "epoch": 1.948977123040942, + "grad_norm": 1.3558932542800903, + "learning_rate": 0.004912685293514229, + "loss": 2.426, + "step": 145000 + }, + { + "epoch": 1.948977123040942, + "eval_MaskedAccuracy": 0.4466767450773424, + "eval_loss": 2.741316556930542, + "eval_runtime": 153.255, + "eval_samples_per_second": 414.186, + "eval_steps_per_second": 1.618, + "step": 145000 + }, + { + "epoch": 1.9503212451947634, + "grad_norm": 0.6421293616294861, + "learning_rate": 0.00491254635634372, + "loss": 2.4282, + "step": 145100 + }, + { + "epoch": 1.9516653673485846, + "grad_norm": 0.5589990615844727, + "learning_rate": 0.004912407310692889, + "loss": 2.4237, + "step": 145200 + }, + { + "epoch": 1.953009489502406, + "grad_norm": 0.45161235332489014, + "learning_rate": 0.004912268156567995, + "loss": 2.4336, + "step": 145300 + }, + { + "epoch": 1.9543536116562272, + "grad_norm": 0.6716010570526123, + "learning_rate": 0.004912128893975318, + "loss": 2.4239, + "step": 145400 + }, + { + "epoch": 1.9556977338100485, + "grad_norm": 1.577767014503479, + "learning_rate": 0.004911989522921124, + "loss": 2.4261, + "step": 145500 + }, + { + "epoch": 1.95704185596387, + "grad_norm": 1.3624939918518066, + "learning_rate": 0.004911850043411697, + "loss": 2.4406, + "step": 145600 + }, + { + "epoch": 1.9583859781176913, + "grad_norm": 0.23402433097362518, + "learning_rate": 0.004911710455453321, + "loss": 2.4291, + "step": 145700 + }, + { + "epoch": 1.9597301002715126, + "grad_norm": 0.6562730669975281, + "learning_rate": 0.004911570759052287, + "loss": 2.4325, + "step": 145800 + }, + { + "epoch": 1.9610742224253341, + "grad_norm": 0.2946057915687561, + "learning_rate": 0.004911430954214888, + "loss": 2.4268, + "step": 145900 + }, + { + "epoch": 1.9624183445791554, + "grad_norm": 2.216837167739868, + "learning_rate": 0.0049112910409474225, + "loss": 2.4272, + "step": 146000 + }, + { + "epoch": 1.9624183445791554, + "eval_MaskedAccuracy": 0.44681526480055744, + "eval_loss": 2.739447593688965, + "eval_runtime": 244.5176, + "eval_samples_per_second": 259.597, + "eval_steps_per_second": 1.014, + "step": 146000 + }, + { + "epoch": 1.9637624667329767, + "grad_norm": 0.9857944250106812, + "learning_rate": 0.004911151019256194, + "loss": 2.4263, + "step": 146100 + }, + { + "epoch": 1.965106588886798, + "grad_norm": 0.9716507792472839, + "learning_rate": 0.004911010889147516, + "loss": 2.4287, + "step": 146200 + }, + { + "epoch": 1.9664507110406193, + "grad_norm": 0.6002909541130066, + "learning_rate": 0.0049108706506277005, + "loss": 2.4169, + "step": 146300 + }, + { + "epoch": 1.9677948331944406, + "grad_norm": 0.33834025263786316, + "learning_rate": 0.004910730303703066, + "loss": 2.425, + "step": 146400 + }, + { + "epoch": 1.9691389553482619, + "grad_norm": 0.3823884129524231, + "learning_rate": 0.004910589848379935, + "loss": 2.4274, + "step": 146500 + }, + { + "epoch": 1.9704830775020834, + "grad_norm": 0.7107839584350586, + "learning_rate": 0.004910449284664644, + "loss": 2.4339, + "step": 146600 + }, + { + "epoch": 1.9718271996559047, + "grad_norm": 1.5292255878448486, + "learning_rate": 0.0049103086125635185, + "loss": 2.4211, + "step": 146700 + }, + { + "epoch": 1.9731713218097262, + "grad_norm": 0.4462113678455353, + "learning_rate": 0.0049101678320829035, + "loss": 2.4163, + "step": 146800 + }, + { + "epoch": 1.9745154439635475, + "grad_norm": 0.4614112377166748, + "learning_rate": 0.004910026943229135, + "loss": 2.4249, + "step": 146900 + }, + { + "epoch": 1.9758595661173688, + "grad_norm": 1.6407921314239502, + "learning_rate": 0.004909885946008562, + "loss": 2.4243, + "step": 147000 + }, + { + "epoch": 1.9758595661173688, + "eval_MaskedAccuracy": 0.44691000163355754, + "eval_loss": 2.740039348602295, + "eval_runtime": 160.8268, + "eval_samples_per_second": 394.686, + "eval_steps_per_second": 1.542, + "step": 147000 + }, + { + "epoch": 1.97720368827119, + "grad_norm": 0.3130664527416229, + "learning_rate": 0.004909744840427542, + "loss": 2.4207, + "step": 147100 + }, + { + "epoch": 1.9785478104250114, + "grad_norm": 0.3282364010810852, + "learning_rate": 0.004909603626492431, + "loss": 2.421, + "step": 147200 + }, + { + "epoch": 1.9798919325788327, + "grad_norm": 0.5056743621826172, + "learning_rate": 0.004909462304209587, + "loss": 2.4254, + "step": 147300 + }, + { + "epoch": 1.981236054732654, + "grad_norm": 1.1724507808685303, + "learning_rate": 0.0049093208735853855, + "loss": 2.428, + "step": 147400 + }, + { + "epoch": 1.9825801768864755, + "grad_norm": 0.2965724766254425, + "learning_rate": 0.004909179334626195, + "loss": 2.4272, + "step": 147500 + }, + { + "epoch": 1.9839242990402968, + "grad_norm": 0.37335798144340515, + "learning_rate": 0.004909037687338392, + "loss": 2.4197, + "step": 147600 + }, + { + "epoch": 1.9852684211941183, + "grad_norm": 0.5587185621261597, + "learning_rate": 0.0049088959317283645, + "loss": 2.4206, + "step": 147700 + }, + { + "epoch": 1.9866125433479396, + "grad_norm": 0.5161391496658325, + "learning_rate": 0.004908754067802488, + "loss": 2.4308, + "step": 147800 + }, + { + "epoch": 1.9879566655017609, + "grad_norm": 0.2679111361503601, + "learning_rate": 0.004908612095567173, + "loss": 2.4215, + "step": 147900 + }, + { + "epoch": 1.9893007876555822, + "grad_norm": 0.6613872051239014, + "learning_rate": 0.004908470015028803, + "loss": 2.4228, + "step": 148000 + }, + { + "epoch": 1.9893007876555822, + "eval_MaskedAccuracy": 0.4469897602234249, + "eval_loss": 2.7378077507019043, + "eval_runtime": 157.5436, + "eval_samples_per_second": 402.911, + "eval_steps_per_second": 1.574, + "step": 148000 + }, + { + "epoch": 1.9906449098094035, + "grad_norm": 0.6632961630821228, + "learning_rate": 0.0049083278261937814, + "loss": 2.4307, + "step": 148100 + }, + { + "epoch": 1.9919890319632247, + "grad_norm": 0.31573984026908875, + "learning_rate": 0.0049081855290685145, + "loss": 2.4226, + "step": 148200 + }, + { + "epoch": 1.993333154117046, + "grad_norm": 2.4147422313690186, + "learning_rate": 0.00490804312365942, + "loss": 2.4199, + "step": 148300 + }, + { + "epoch": 1.9946772762708675, + "grad_norm": 2.1030828952789307, + "learning_rate": 0.004907900609972911, + "loss": 2.4279, + "step": 148400 + }, + { + "epoch": 1.9960213984246888, + "grad_norm": 0.395871639251709, + "learning_rate": 0.004907757988015409, + "loss": 2.425, + "step": 148500 + }, + { + "epoch": 1.9973655205785101, + "grad_norm": 0.538977324962616, + "learning_rate": 0.004907615257793332, + "loss": 2.4217, + "step": 148600 + }, + { + "epoch": 1.9987096427323316, + "grad_norm": 1.8294909000396729, + "learning_rate": 0.004907472419313125, + "loss": 2.4302, + "step": 148700 + }, + { + "epoch": 2.000053764886153, + "grad_norm": 0.25931796431541443, + "learning_rate": 0.004907329472581221, + "loss": 2.4247, + "step": 148800 + }, + { + "epoch": 2.0013978870399742, + "grad_norm": 0.34021487832069397, + "learning_rate": 0.0049071864176040555, + "loss": 2.4319, + "step": 148900 + }, + { + "epoch": 2.0027420091937955, + "grad_norm": 0.7299911379814148, + "learning_rate": 0.004907043254388077, + "loss": 2.4311, + "step": 149000 + }, + { + "epoch": 2.0027420091937955, + "eval_MaskedAccuracy": 0.4465608726612502, + "eval_loss": 2.7390899658203125, + "eval_runtime": 595.6963, + "eval_samples_per_second": 106.558, + "eval_steps_per_second": 0.416, + "step": 149000 + }, + { + "epoch": 2.004086131347617, + "grad_norm": 0.7680445313453674, + "learning_rate": 0.004906899982939738, + "loss": 2.4315, + "step": 149100 + }, + { + "epoch": 2.005430253501438, + "grad_norm": 0.603999137878418, + "learning_rate": 0.004906756603265495, + "loss": 2.4207, + "step": 149200 + }, + { + "epoch": 2.0067743756552594, + "grad_norm": 0.995240330696106, + "learning_rate": 0.004906613115371805, + "loss": 2.4293, + "step": 149300 + }, + { + "epoch": 2.0081184978090807, + "grad_norm": 0.858191967010498, + "learning_rate": 0.0049064695192651345, + "loss": 2.4275, + "step": 149400 + }, + { + "epoch": 2.0094626199629024, + "grad_norm": 0.5625921487808228, + "learning_rate": 0.004906325814951952, + "loss": 2.4333, + "step": 149500 + }, + { + "epoch": 2.0108067421167237, + "grad_norm": 0.3029104769229889, + "learning_rate": 0.004906182002438735, + "loss": 2.4288, + "step": 149600 + }, + { + "epoch": 2.012150864270545, + "grad_norm": 0.6609681248664856, + "learning_rate": 0.004906038081731963, + "loss": 2.4128, + "step": 149700 + }, + { + "epoch": 2.0134949864243663, + "grad_norm": 0.48055407404899597, + "learning_rate": 0.004905894052838118, + "loss": 2.4281, + "step": 149800 + }, + { + "epoch": 2.0148391085781876, + "grad_norm": 0.2819332778453827, + "learning_rate": 0.00490574991576369, + "loss": 2.4247, + "step": 149900 + }, + { + "epoch": 2.016183230732009, + "grad_norm": 0.32838475704193115, + "learning_rate": 0.004905605670515174, + "loss": 2.4259, + "step": 150000 + }, + { + "epoch": 2.016183230732009, + "eval_MaskedAccuracy": 0.44722750040117315, + "eval_loss": 2.738093376159668, + "eval_runtime": 142.6909, + "eval_samples_per_second": 444.85, + "eval_steps_per_second": 1.738, + "step": 150000 + }, + { + "epoch": 2.01752735288583, + "grad_norm": 0.35216397047042847, + "learning_rate": 0.004905461317099078, + "loss": 2.432, + "step": 150100 + }, + { + "epoch": 2.0188714750396515, + "grad_norm": 0.7930958271026611, + "learning_rate": 0.004905316855521893, + "loss": 2.43, + "step": 150200 + }, + { + "epoch": 2.0202155971934728, + "grad_norm": 0.6561233401298523, + "learning_rate": 0.004905172285790133, + "loss": 2.432, + "step": 150300 + }, + { + "epoch": 2.0215597193472945, + "grad_norm": 0.8430966138839722, + "learning_rate": 0.0049050276079103195, + "loss": 2.4273, + "step": 150400 + }, + { + "epoch": 2.022903841501116, + "grad_norm": 0.4468148946762085, + "learning_rate": 0.004904882821888969, + "loss": 2.4255, + "step": 150500 + }, + { + "epoch": 2.024247963654937, + "grad_norm": 0.26562803983688354, + "learning_rate": 0.004904737927732603, + "loss": 2.428, + "step": 150600 + }, + { + "epoch": 2.0255920858087584, + "grad_norm": 0.35955631732940674, + "learning_rate": 0.004904592925447746, + "loss": 2.4227, + "step": 150700 + }, + { + "epoch": 2.0269362079625797, + "grad_norm": 1.039691686630249, + "learning_rate": 0.0049044478150409385, + "loss": 2.4229, + "step": 150800 + }, + { + "epoch": 2.028280330116401, + "grad_norm": 2.361480474472046, + "learning_rate": 0.004904302596518712, + "loss": 2.4246, + "step": 150900 + }, + { + "epoch": 2.0296244522702223, + "grad_norm": 0.4659264087677002, + "learning_rate": 0.004904157269887615, + "loss": 2.429, + "step": 151000 + }, + { + "epoch": 2.0296244522702223, + "eval_MaskedAccuracy": 0.44815486516670616, + "eval_loss": 2.7315616607666016, + "eval_runtime": 142.8587, + "eval_samples_per_second": 444.327, + "eval_steps_per_second": 1.736, + "step": 151000 + }, + { + "epoch": 2.0309685744240435, + "grad_norm": 0.3324325680732727, + "learning_rate": 0.004904011835154192, + "loss": 2.429, + "step": 151100 + }, + { + "epoch": 2.032312696577865, + "grad_norm": 0.4291480481624603, + "learning_rate": 0.0049038662923250025, + "loss": 2.4253, + "step": 151200 + }, + { + "epoch": 2.033656818731686, + "grad_norm": 0.5924742817878723, + "learning_rate": 0.004903720641406596, + "loss": 2.4181, + "step": 151300 + }, + { + "epoch": 2.035000940885508, + "grad_norm": 1.0112535953521729, + "learning_rate": 0.004903574882405539, + "loss": 2.422, + "step": 151400 + }, + { + "epoch": 2.036345063039329, + "grad_norm": 0.6629267334938049, + "learning_rate": 0.004903429015328402, + "loss": 2.4134, + "step": 151500 + }, + { + "epoch": 2.0376891851931505, + "grad_norm": 0.5437670350074768, + "learning_rate": 0.004903283040181754, + "loss": 2.4211, + "step": 151600 + }, + { + "epoch": 2.0390333073469717, + "grad_norm": 0.8838233947753906, + "learning_rate": 0.004903136956972172, + "loss": 2.4261, + "step": 151700 + }, + { + "epoch": 2.040377429500793, + "grad_norm": 0.493783563375473, + "learning_rate": 0.00490299076570624, + "loss": 2.4228, + "step": 151800 + }, + { + "epoch": 2.0417215516546143, + "grad_norm": 0.5075348019599915, + "learning_rate": 0.004902844466390544, + "loss": 2.4249, + "step": 151900 + }, + { + "epoch": 2.0430656738084356, + "grad_norm": 0.31101787090301514, + "learning_rate": 0.004902698059031681, + "loss": 2.4234, + "step": 152000 + }, + { + "epoch": 2.0430656738084356, + "eval_MaskedAccuracy": 0.4476862797973037, + "eval_loss": 2.733727216720581, + "eval_runtime": 142.8049, + "eval_samples_per_second": 444.495, + "eval_steps_per_second": 1.737, + "step": 152000 + }, + { + "epoch": 2.044409795962257, + "grad_norm": 0.3274334967136383, + "learning_rate": 0.004902551543636243, + "loss": 2.4265, + "step": 152100 + }, + { + "epoch": 2.045753918116078, + "grad_norm": 0.4391040503978729, + "learning_rate": 0.004902404920210837, + "loss": 2.4252, + "step": 152200 + }, + { + "epoch": 2.0470980402699, + "grad_norm": 0.36864349246025085, + "learning_rate": 0.004902258188762062, + "loss": 2.4191, + "step": 152300 + }, + { + "epoch": 2.0484421624237212, + "grad_norm": 0.32997363805770874, + "learning_rate": 0.004902111349296531, + "loss": 2.4301, + "step": 152400 + }, + { + "epoch": 2.0497862845775425, + "grad_norm": 0.649544358253479, + "learning_rate": 0.004901964401820856, + "loss": 2.4262, + "step": 152500 + }, + { + "epoch": 2.051130406731364, + "grad_norm": 0.5093933939933777, + "learning_rate": 0.004901817346341668, + "loss": 2.4184, + "step": 152600 + }, + { + "epoch": 2.052474528885185, + "grad_norm": 0.5928727984428406, + "learning_rate": 0.004901670182865587, + "loss": 2.4273, + "step": 152700 + }, + { + "epoch": 2.0538186510390064, + "grad_norm": 0.8361156582832336, + "learning_rate": 0.004901522911399248, + "loss": 2.4176, + "step": 152800 + }, + { + "epoch": 2.0551627731928277, + "grad_norm": 1.1973950862884521, + "learning_rate": 0.004901375531949283, + "loss": 2.4253, + "step": 152900 + }, + { + "epoch": 2.056506895346649, + "grad_norm": 0.7281692028045654, + "learning_rate": 0.004901228044522336, + "loss": 2.4213, + "step": 153000 + }, + { + "epoch": 2.056506895346649, + "eval_MaskedAccuracy": 0.4476455974274515, + "eval_loss": 2.7339048385620117, + "eval_runtime": 146.6823, + "eval_samples_per_second": 432.745, + "eval_steps_per_second": 1.691, + "step": 153000 + }, + { + "epoch": 2.0578510175004703, + "grad_norm": 0.908572793006897, + "learning_rate": 0.004901080449125051, + "loss": 2.4233, + "step": 153100 + }, + { + "epoch": 2.0591951396542916, + "grad_norm": 0.3152801990509033, + "learning_rate": 0.004900932745764079, + "loss": 2.4217, + "step": 153200 + }, + { + "epoch": 2.0605392618081133, + "grad_norm": 0.34237778186798096, + "learning_rate": 0.004900784934446075, + "loss": 2.4311, + "step": 153300 + }, + { + "epoch": 2.0618833839619346, + "grad_norm": 0.9181241393089294, + "learning_rate": 0.004900637015177697, + "loss": 2.4211, + "step": 153400 + }, + { + "epoch": 2.063227506115756, + "grad_norm": 1.0392390489578247, + "learning_rate": 0.004900488987965611, + "loss": 2.4163, + "step": 153500 + }, + { + "epoch": 2.064571628269577, + "grad_norm": 1.2790428400039673, + "learning_rate": 0.0049003408528164915, + "loss": 2.4157, + "step": 153600 + }, + { + "epoch": 2.0659157504233985, + "grad_norm": 0.2699611186981201, + "learning_rate": 0.004900192609737011, + "loss": 2.4267, + "step": 153700 + }, + { + "epoch": 2.0672598725772198, + "grad_norm": 1.472973346710205, + "learning_rate": 0.004900044258733849, + "loss": 2.4306, + "step": 153800 + }, + { + "epoch": 2.068603994731041, + "grad_norm": 0.27637550234794617, + "learning_rate": 0.004899895799813685, + "loss": 2.4221, + "step": 153900 + }, + { + "epoch": 2.0699481168848624, + "grad_norm": 1.1613603830337524, + "learning_rate": 0.004899747232983215, + "loss": 2.4191, + "step": 154000 + }, + { + "epoch": 2.0699481168848624, + "eval_MaskedAccuracy": 0.4478018154999935, + "eval_loss": 2.7331624031066895, + "eval_runtime": 142.1229, + "eval_samples_per_second": 446.628, + "eval_steps_per_second": 1.745, + "step": 154000 + }, + { + "epoch": 2.0712922390386836, + "grad_norm": 1.339164137840271, + "learning_rate": 0.004899598558249125, + "loss": 2.4229, + "step": 154100 + }, + { + "epoch": 2.0726363611925054, + "grad_norm": 0.558399498462677, + "learning_rate": 0.004899449775618128, + "loss": 2.4168, + "step": 154200 + }, + { + "epoch": 2.0739804833463267, + "grad_norm": 0.6973606944084167, + "learning_rate": 0.00489930088509692, + "loss": 2.4206, + "step": 154300 + }, + { + "epoch": 2.075324605500148, + "grad_norm": 0.7255196571350098, + "learning_rate": 0.004899151886692205, + "loss": 2.4187, + "step": 154400 + }, + { + "epoch": 2.0766687276539693, + "grad_norm": 1.0322102308273315, + "learning_rate": 0.004899002780410702, + "loss": 2.4304, + "step": 154500 + }, + { + "epoch": 2.0780128498077906, + "grad_norm": 0.8069867491722107, + "learning_rate": 0.0048988535662591245, + "loss": 2.4222, + "step": 154600 + }, + { + "epoch": 2.079356971961612, + "grad_norm": 1.4786266088485718, + "learning_rate": 0.004898704244244206, + "loss": 2.4193, + "step": 154700 + }, + { + "epoch": 2.080701094115433, + "grad_norm": 0.7500434517860413, + "learning_rate": 0.004898554814372665, + "loss": 2.4213, + "step": 154800 + }, + { + "epoch": 2.0820452162692544, + "grad_norm": 1.6421656608581543, + "learning_rate": 0.004898405276651237, + "loss": 2.4169, + "step": 154900 + }, + { + "epoch": 2.0833893384230757, + "grad_norm": 1.4270743131637573, + "learning_rate": 0.0048982556310866576, + "loss": 2.4242, + "step": 155000 + }, + { + "epoch": 2.0833893384230757, + "eval_MaskedAccuracy": 0.44795841587608753, + "eval_loss": 2.7314157485961914, + "eval_runtime": 142.2388, + "eval_samples_per_second": 446.264, + "eval_steps_per_second": 1.744, + "step": 155000 + }, + { + "epoch": 2.0847334605768975, + "grad_norm": 1.0751755237579346, + "learning_rate": 0.004898105877685676, + "loss": 2.4239, + "step": 155100 + }, + { + "epoch": 2.0860775827307187, + "grad_norm": 0.43115946650505066, + "learning_rate": 0.004897956016455043, + "loss": 2.4202, + "step": 155200 + }, + { + "epoch": 2.08742170488454, + "grad_norm": 0.6999830007553101, + "learning_rate": 0.004897806047401499, + "loss": 2.4164, + "step": 155300 + }, + { + "epoch": 2.0887658270383613, + "grad_norm": 0.6027560830116272, + "learning_rate": 0.004897655970531812, + "loss": 2.4118, + "step": 155400 + }, + { + "epoch": 2.0901099491921826, + "grad_norm": 2.5767462253570557, + "learning_rate": 0.004897505785852734, + "loss": 2.4254, + "step": 155500 + }, + { + "epoch": 2.091454071346004, + "grad_norm": 0.6744506359100342, + "learning_rate": 0.004897355493371032, + "loss": 2.4192, + "step": 155600 + }, + { + "epoch": 2.092798193499825, + "grad_norm": 0.6279339790344238, + "learning_rate": 0.004897205093093492, + "loss": 2.4212, + "step": 155700 + }, + { + "epoch": 2.0941423156536465, + "grad_norm": 0.317688912153244, + "learning_rate": 0.00489705458502688, + "loss": 2.4179, + "step": 155800 + }, + { + "epoch": 2.095486437807468, + "grad_norm": 0.5679840445518494, + "learning_rate": 0.004896903969177977, + "loss": 2.4157, + "step": 155900 + }, + { + "epoch": 2.096830559961289, + "grad_norm": 0.6120138764381409, + "learning_rate": 0.0048967532455535725, + "loss": 2.4172, + "step": 156000 + }, + { + "epoch": 2.096830559961289, + "eval_MaskedAccuracy": 0.44820394067035313, + "eval_loss": 2.730539321899414, + "eval_runtime": 143.8277, + "eval_samples_per_second": 441.334, + "eval_steps_per_second": 1.724, + "step": 156000 + }, + { + "epoch": 2.098174682115111, + "grad_norm": 1.3471684455871582, + "learning_rate": 0.004896602414160458, + "loss": 2.412, + "step": 156100 + }, + { + "epoch": 2.099518804268932, + "grad_norm": 2.606961250305176, + "learning_rate": 0.0048964514750054365, + "loss": 2.4199, + "step": 156200 + }, + { + "epoch": 2.1008629264227534, + "grad_norm": 0.44379231333732605, + "learning_rate": 0.004896300428095295, + "loss": 2.4295, + "step": 156300 + }, + { + "epoch": 2.1022070485765747, + "grad_norm": 0.4089781641960144, + "learning_rate": 0.004896149273436849, + "loss": 2.4195, + "step": 156400 + }, + { + "epoch": 2.103551170730396, + "grad_norm": 0.39541006088256836, + "learning_rate": 0.004895998011036907, + "loss": 2.4226, + "step": 156500 + }, + { + "epoch": 2.1048952928842173, + "grad_norm": 0.7441678047180176, + "learning_rate": 0.004895846640902289, + "loss": 2.4146, + "step": 156600 + }, + { + "epoch": 2.1062394150380386, + "grad_norm": 0.2654988765716553, + "learning_rate": 0.004895695163039808, + "loss": 2.4115, + "step": 156700 + }, + { + "epoch": 2.10758353719186, + "grad_norm": 0.23396815359592438, + "learning_rate": 0.004895543577456294, + "loss": 2.422, + "step": 156800 + }, + { + "epoch": 2.108927659345681, + "grad_norm": 0.3832063674926758, + "learning_rate": 0.004895391884158574, + "loss": 2.4195, + "step": 156900 + }, + { + "epoch": 2.1102717814995025, + "grad_norm": 0.6434290409088135, + "learning_rate": 0.0048952400831534845, + "loss": 2.4227, + "step": 157000 + }, + { + "epoch": 2.1102717814995025, + "eval_MaskedAccuracy": 0.448189960789518, + "eval_loss": 2.7293646335601807, + "eval_runtime": 149.7286, + "eval_samples_per_second": 423.94, + "eval_steps_per_second": 1.656, + "step": 157000 + }, + { + "epoch": 2.111615903653324, + "grad_norm": 0.5817483067512512, + "learning_rate": 0.004895088174447862, + "loss": 2.4267, + "step": 157100 + }, + { + "epoch": 2.1129600258071455, + "grad_norm": 0.36821791529655457, + "learning_rate": 0.004894936158048559, + "loss": 2.4268, + "step": 157200 + }, + { + "epoch": 2.1143041479609668, + "grad_norm": 0.5711908340454102, + "learning_rate": 0.004894784033962416, + "loss": 2.4233, + "step": 157300 + }, + { + "epoch": 2.115648270114788, + "grad_norm": 0.4792524576187134, + "learning_rate": 0.004894631802196298, + "loss": 2.4199, + "step": 157400 + }, + { + "epoch": 2.1169923922686094, + "grad_norm": 0.9362654089927673, + "learning_rate": 0.0048944794627570525, + "loss": 2.425, + "step": 157500 + }, + { + "epoch": 2.1183365144224306, + "grad_norm": 0.8550450801849365, + "learning_rate": 0.004894327015651554, + "loss": 2.4171, + "step": 157600 + }, + { + "epoch": 2.119680636576252, + "grad_norm": 0.2275136560201645, + "learning_rate": 0.004894174460886669, + "loss": 2.4112, + "step": 157700 + }, + { + "epoch": 2.1210247587300732, + "grad_norm": 0.8724859356880188, + "learning_rate": 0.004894021798469268, + "loss": 2.4187, + "step": 157800 + }, + { + "epoch": 2.1223688808838945, + "grad_norm": 0.3414950966835022, + "learning_rate": 0.004893869028406233, + "loss": 2.4113, + "step": 157900 + }, + { + "epoch": 2.1237130030377163, + "grad_norm": 0.5261027216911316, + "learning_rate": 0.004893716150704447, + "loss": 2.4249, + "step": 158000 + }, + { + "epoch": 2.1237130030377163, + "eval_MaskedAccuracy": 0.44844991045030136, + "eval_loss": 2.7277510166168213, + "eval_runtime": 152.1905, + "eval_samples_per_second": 417.083, + "eval_steps_per_second": 1.63, + "step": 158000 + }, + { + "epoch": 2.1250571251915376, + "grad_norm": 0.24052013456821442, + "learning_rate": 0.0048935631653708, + "loss": 2.4195, + "step": 158100 + }, + { + "epoch": 2.126401247345359, + "grad_norm": 0.9782668352127075, + "learning_rate": 0.004893410072412176, + "loss": 2.4313, + "step": 158200 + }, + { + "epoch": 2.12774536949918, + "grad_norm": 1.6755679845809937, + "learning_rate": 0.004893256871835483, + "loss": 2.4201, + "step": 158300 + }, + { + "epoch": 2.1290894916530014, + "grad_norm": 0.42762598395347595, + "learning_rate": 0.00489310356364762, + "loss": 2.4158, + "step": 158400 + }, + { + "epoch": 2.1304336138068227, + "grad_norm": 0.2700982093811035, + "learning_rate": 0.0048929501478555005, + "loss": 2.4236, + "step": 158500 + }, + { + "epoch": 2.131777735960644, + "grad_norm": 0.9225497245788574, + "learning_rate": 0.004892796624466027, + "loss": 2.4191, + "step": 158600 + }, + { + "epoch": 2.1331218581144653, + "grad_norm": 0.9907844066619873, + "learning_rate": 0.004892642993486124, + "loss": 2.4111, + "step": 158700 + }, + { + "epoch": 2.1344659802682866, + "grad_norm": 0.24535821378231049, + "learning_rate": 0.004892489254922714, + "loss": 2.4277, + "step": 158800 + }, + { + "epoch": 2.1358101024221083, + "grad_norm": 1.3044700622558594, + "learning_rate": 0.004892335408782724, + "loss": 2.4191, + "step": 158900 + }, + { + "epoch": 2.1371542245759296, + "grad_norm": 1.1276298761367798, + "learning_rate": 0.0048921814550730814, + "loss": 2.4142, + "step": 159000 + }, + { + "epoch": 2.1371542245759296, + "eval_MaskedAccuracy": 0.44877584775466844, + "eval_loss": 2.726320505142212, + "eval_runtime": 144.1741, + "eval_samples_per_second": 440.273, + "eval_steps_per_second": 1.72, + "step": 159000 + }, + { + "epoch": 2.138498346729751, + "grad_norm": 0.3968954086303711, + "learning_rate": 0.004892027393800723, + "loss": 2.4149, + "step": 159100 + }, + { + "epoch": 2.139842468883572, + "grad_norm": 0.3040700852870941, + "learning_rate": 0.004891873224972598, + "loss": 2.4198, + "step": 159200 + }, + { + "epoch": 2.1411865910373935, + "grad_norm": 0.24721510708332062, + "learning_rate": 0.00489171894859565, + "loss": 2.4209, + "step": 159300 + }, + { + "epoch": 2.142530713191215, + "grad_norm": 0.3159930408000946, + "learning_rate": 0.004891564564676828, + "loss": 2.4214, + "step": 159400 + }, + { + "epoch": 2.143874835345036, + "grad_norm": 0.31632721424102783, + "learning_rate": 0.004891410073223092, + "loss": 2.4242, + "step": 159500 + }, + { + "epoch": 2.1452189574988574, + "grad_norm": 0.422014981508255, + "learning_rate": 0.0048912554742414, + "loss": 2.43, + "step": 159600 + }, + { + "epoch": 2.1465630796526787, + "grad_norm": 0.2504478394985199, + "learning_rate": 0.004891100767738721, + "loss": 2.4201, + "step": 159700 + }, + { + "epoch": 2.1479072018065004, + "grad_norm": 0.9411721229553223, + "learning_rate": 0.0048909459537220265, + "loss": 2.4305, + "step": 159800 + }, + { + "epoch": 2.1492513239603217, + "grad_norm": 0.7317768931388855, + "learning_rate": 0.004890791032198286, + "loss": 2.4167, + "step": 159900 + }, + { + "epoch": 2.150595446114143, + "grad_norm": 0.462750643491745, + "learning_rate": 0.004890636003174482, + "loss": 2.4022, + "step": 160000 + }, + { + "epoch": 2.150595446114143, + "eval_MaskedAccuracy": 0.44730266116401834, + "eval_loss": 2.733778953552246, + "eval_runtime": 157.466, + "eval_samples_per_second": 403.109, + "eval_steps_per_second": 1.575, + "step": 160000 + }, + { + "epoch": 2.1519395682679643, + "grad_norm": 1.2427386045455933, + "learning_rate": 0.004890480866657605, + "loss": 2.4187, + "step": 160100 + }, + { + "epoch": 2.1532836904217856, + "grad_norm": 1.1014420986175537, + "learning_rate": 0.004890325622654636, + "loss": 2.4167, + "step": 160200 + }, + { + "epoch": 2.154627812575607, + "grad_norm": 1.7939949035644531, + "learning_rate": 0.004890170271172577, + "loss": 2.413, + "step": 160300 + }, + { + "epoch": 2.155971934729428, + "grad_norm": 0.28519657254219055, + "learning_rate": 0.004890014812218431, + "loss": 2.4248, + "step": 160400 + }, + { + "epoch": 2.1573160568832495, + "grad_norm": 0.3315064609050751, + "learning_rate": 0.0048898592457991994, + "loss": 2.4238, + "step": 160500 + }, + { + "epoch": 2.1586601790370707, + "grad_norm": 0.5888499617576599, + "learning_rate": 0.004889703571921885, + "loss": 2.4134, + "step": 160600 + }, + { + "epoch": 2.160004301190892, + "grad_norm": 0.3281387686729431, + "learning_rate": 0.004889547790593514, + "loss": 2.4204, + "step": 160700 + }, + { + "epoch": 2.1613484233447133, + "grad_norm": 0.3036564588546753, + "learning_rate": 0.004889391901821095, + "loss": 2.4143, + "step": 160800 + }, + { + "epoch": 2.162692545498535, + "grad_norm": 0.5783784985542297, + "learning_rate": 0.004889235905611661, + "loss": 2.4287, + "step": 160900 + }, + { + "epoch": 2.1640366676523564, + "grad_norm": 0.7875732183456421, + "learning_rate": 0.004889079801972228, + "loss": 2.4151, + "step": 161000 + }, + { + "epoch": 2.1640366676523564, + "eval_MaskedAccuracy": 0.4483840000513704, + "eval_loss": 2.7266931533813477, + "eval_runtime": 157.0844, + "eval_samples_per_second": 404.089, + "eval_steps_per_second": 1.579, + "step": 161000 + }, + { + "epoch": 2.1653807898061777, + "grad_norm": 1.2114744186401367, + "learning_rate": 0.0048889235909098374, + "loss": 2.4198, + "step": 161100 + }, + { + "epoch": 2.166724911959999, + "grad_norm": 3.989194631576538, + "learning_rate": 0.0048887672724315335, + "loss": 2.4213, + "step": 161200 + }, + { + "epoch": 2.1680690341138202, + "grad_norm": 1.195610761642456, + "learning_rate": 0.0048886108465443535, + "loss": 2.4187, + "step": 161300 + }, + { + "epoch": 2.1694131562676415, + "grad_norm": 1.6807376146316528, + "learning_rate": 0.004888454313255354, + "loss": 2.4067, + "step": 161400 + }, + { + "epoch": 2.170757278421463, + "grad_norm": 0.3829794228076935, + "learning_rate": 0.0048882976725715815, + "loss": 2.41, + "step": 161500 + }, + { + "epoch": 2.172101400575284, + "grad_norm": 0.2655125558376312, + "learning_rate": 0.00488814092450009, + "loss": 2.416, + "step": 161600 + }, + { + "epoch": 2.1734455227291054, + "grad_norm": 0.6671358942985535, + "learning_rate": 0.004887984069047942, + "loss": 2.4155, + "step": 161700 + }, + { + "epoch": 2.174789644882927, + "grad_norm": 0.49830421805381775, + "learning_rate": 0.004887827106222209, + "loss": 2.4146, + "step": 161800 + }, + { + "epoch": 2.1761337670367484, + "grad_norm": 0.43950071930885315, + "learning_rate": 0.004887670036029963, + "loss": 2.4123, + "step": 161900 + }, + { + "epoch": 2.1774778891905697, + "grad_norm": 0.9079190492630005, + "learning_rate": 0.004887512858478287, + "loss": 2.4123, + "step": 162000 + }, + { + "epoch": 2.1774778891905697, + "eval_MaskedAccuracy": 0.4486591905523086, + "eval_loss": 2.7250816822052, + "eval_runtime": 154.8776, + "eval_samples_per_second": 409.846, + "eval_steps_per_second": 1.601, + "step": 162000 + }, + { + "epoch": 2.178822011344391, + "grad_norm": 1.3062362670898438, + "learning_rate": 0.004887355573574258, + "loss": 2.4201, + "step": 162100 + }, + { + "epoch": 2.1801661334982123, + "grad_norm": 0.9617155194282532, + "learning_rate": 0.004887198181324969, + "loss": 2.4178, + "step": 162200 + }, + { + "epoch": 2.1815102556520336, + "grad_norm": 0.23437173664569855, + "learning_rate": 0.004887040681737504, + "loss": 2.4276, + "step": 162300 + }, + { + "epoch": 2.182854377805855, + "grad_norm": 0.7280550599098206, + "learning_rate": 0.004886883074818961, + "loss": 2.4122, + "step": 162400 + }, + { + "epoch": 2.184198499959676, + "grad_norm": 0.26790088415145874, + "learning_rate": 0.004886725360576443, + "loss": 2.4176, + "step": 162500 + }, + { + "epoch": 2.1855426221134975, + "grad_norm": 0.24585595726966858, + "learning_rate": 0.004886567539017049, + "loss": 2.4256, + "step": 162600 + }, + { + "epoch": 2.186886744267319, + "grad_norm": 0.2857859134674072, + "learning_rate": 0.0048864096101479005, + "loss": 2.4147, + "step": 162700 + }, + { + "epoch": 2.1882308664211405, + "grad_norm": 0.4526360332965851, + "learning_rate": 0.00488625157397611, + "loss": 2.4197, + "step": 162800 + }, + { + "epoch": 2.189574988574962, + "grad_norm": 0.2927253246307373, + "learning_rate": 0.004886093430508798, + "loss": 2.4114, + "step": 162900 + }, + { + "epoch": 2.190919110728783, + "grad_norm": 0.4610482156276703, + "learning_rate": 0.004885935179753095, + "loss": 2.4147, + "step": 163000 + }, + { + "epoch": 2.190919110728783, + "eval_MaskedAccuracy": 0.44903692639620885, + "eval_loss": 2.7242319583892822, + "eval_runtime": 155.5888, + "eval_samples_per_second": 407.973, + "eval_steps_per_second": 1.594, + "step": 163000 + }, + { + "epoch": 2.1922632328826044, + "grad_norm": 0.6389359831809998, + "learning_rate": 0.004885776821716123, + "loss": 2.4158, + "step": 163100 + }, + { + "epoch": 2.1936073550364257, + "grad_norm": 1.6146184206008911, + "learning_rate": 0.004885618356405023, + "loss": 2.4194, + "step": 163200 + }, + { + "epoch": 2.194951477190247, + "grad_norm": 1.686237096786499, + "learning_rate": 0.004885459783826932, + "loss": 2.4109, + "step": 163300 + }, + { + "epoch": 2.1962955993440683, + "grad_norm": 1.0167760848999023, + "learning_rate": 0.004885301103989002, + "loss": 2.4146, + "step": 163400 + }, + { + "epoch": 2.1976397214978896, + "grad_norm": 0.20749007165431976, + "learning_rate": 0.004885142316898374, + "loss": 2.4179, + "step": 163500 + }, + { + "epoch": 2.1989838436517113, + "grad_norm": 0.3694406747817993, + "learning_rate": 0.0048849834225622064, + "loss": 2.411, + "step": 163600 + }, + { + "epoch": 2.2003279658055326, + "grad_norm": 0.6222627758979797, + "learning_rate": 0.004884824420987657, + "loss": 2.4116, + "step": 163700 + }, + { + "epoch": 2.201672087959354, + "grad_norm": 0.637772798538208, + "learning_rate": 0.004884665312181901, + "loss": 2.4217, + "step": 163800 + }, + { + "epoch": 2.203016210113175, + "grad_norm": 1.5972890853881836, + "learning_rate": 0.004884506096152096, + "loss": 2.4098, + "step": 163900 + }, + { + "epoch": 2.2043603322669965, + "grad_norm": 0.2235487997531891, + "learning_rate": 0.004884346772905415, + "loss": 2.4088, + "step": 164000 + }, + { + "epoch": 2.2043603322669965, + "eval_MaskedAccuracy": 0.44855764145601995, + "eval_loss": 2.725599527359009, + "eval_runtime": 153.7593, + "eval_samples_per_second": 412.827, + "eval_steps_per_second": 1.613, + "step": 164000 + }, + { + "epoch": 2.2057044544208178, + "grad_norm": 0.5953556895256042, + "learning_rate": 0.004884187342449037, + "loss": 2.4125, + "step": 164100 + }, + { + "epoch": 2.207048576574639, + "grad_norm": 0.6413670778274536, + "learning_rate": 0.004884027804790152, + "loss": 2.4191, + "step": 164200 + }, + { + "epoch": 2.2083926987284603, + "grad_norm": 0.5590651035308838, + "learning_rate": 0.004883868159935944, + "loss": 2.4153, + "step": 164300 + }, + { + "epoch": 2.2097368208822816, + "grad_norm": 0.6301490664482117, + "learning_rate": 0.0048837084078936135, + "loss": 2.4125, + "step": 164400 + }, + { + "epoch": 2.211080943036103, + "grad_norm": 1.1809839010238647, + "learning_rate": 0.004883548548670345, + "loss": 2.4097, + "step": 164500 + }, + { + "epoch": 2.2124250651899247, + "grad_norm": 0.8796495199203491, + "learning_rate": 0.004883388582273348, + "loss": 2.4151, + "step": 164600 + }, + { + "epoch": 2.213769187343746, + "grad_norm": 0.5694831609725952, + "learning_rate": 0.004883228508709839, + "loss": 2.4123, + "step": 164700 + }, + { + "epoch": 2.2151133094975672, + "grad_norm": 1.4738579988479614, + "learning_rate": 0.0048830683279870194, + "loss": 2.3999, + "step": 164800 + }, + { + "epoch": 2.2164574316513885, + "grad_norm": 0.6466926336288452, + "learning_rate": 0.004882908040112116, + "loss": 2.4129, + "step": 164900 + }, + { + "epoch": 2.21780155380521, + "grad_norm": 0.2502327263355255, + "learning_rate": 0.004882747645092345, + "loss": 2.4139, + "step": 165000 + }, + { + "epoch": 2.21780155380521, + "eval_MaskedAccuracy": 0.44872007353941323, + "eval_loss": 2.723762273788452, + "eval_runtime": 158.1038, + "eval_samples_per_second": 401.483, + "eval_steps_per_second": 1.569, + "step": 165000 + }, + { + "epoch": 2.219145675959031, + "grad_norm": 0.5904813408851624, + "learning_rate": 0.004882587142934935, + "loss": 2.4146, + "step": 165100 + }, + { + "epoch": 2.2204897981128524, + "grad_norm": 0.41806700825691223, + "learning_rate": 0.004882426533647115, + "loss": 2.4111, + "step": 165200 + }, + { + "epoch": 2.2218339202666737, + "grad_norm": 0.30772215127944946, + "learning_rate": 0.00488226581723612, + "loss": 2.4097, + "step": 165300 + }, + { + "epoch": 2.223178042420495, + "grad_norm": 0.6158583760261536, + "learning_rate": 0.0048821049937092026, + "loss": 2.4132, + "step": 165400 + }, + { + "epoch": 2.2245221645743163, + "grad_norm": 0.5486952066421509, + "learning_rate": 0.004881944063073602, + "loss": 2.4082, + "step": 165500 + }, + { + "epoch": 2.225866286728138, + "grad_norm": 0.44802579283714294, + "learning_rate": 0.00488178302533657, + "loss": 2.4052, + "step": 165600 + }, + { + "epoch": 2.2272104088819593, + "grad_norm": 0.6078771948814392, + "learning_rate": 0.004881621880505358, + "loss": 2.4174, + "step": 165700 + }, + { + "epoch": 2.2285545310357806, + "grad_norm": 1.0394670963287354, + "learning_rate": 0.004881460628587237, + "loss": 2.4053, + "step": 165800 + }, + { + "epoch": 2.229898653189602, + "grad_norm": 0.7442373037338257, + "learning_rate": 0.0048812992695894675, + "loss": 2.4099, + "step": 165900 + }, + { + "epoch": 2.231242775343423, + "grad_norm": 0.39984771609306335, + "learning_rate": 0.004881137803519318, + "loss": 2.4191, + "step": 166000 + }, + { + "epoch": 2.231242775343423, + "eval_MaskedAccuracy": 0.44866405937837617, + "eval_loss": 2.7227699756622314, + "eval_runtime": 176.0029, + "eval_samples_per_second": 360.653, + "eval_steps_per_second": 1.409, + "step": 166000 + }, + { + "epoch": 2.2325868974972445, + "grad_norm": 0.40001779794692993, + "learning_rate": 0.004880976230384064, + "loss": 2.4152, + "step": 166100 + }, + { + "epoch": 2.2339310196510658, + "grad_norm": 0.8670006394386292, + "learning_rate": 0.00488081455019099, + "loss": 2.4158, + "step": 166200 + }, + { + "epoch": 2.235275141804887, + "grad_norm": 0.5148455500602722, + "learning_rate": 0.004880652762947384, + "loss": 2.4162, + "step": 166300 + }, + { + "epoch": 2.2366192639587084, + "grad_norm": 1.6046350002288818, + "learning_rate": 0.004880490868660532, + "loss": 2.4131, + "step": 166400 + }, + { + "epoch": 2.23796338611253, + "grad_norm": 0.896712601184845, + "learning_rate": 0.004880328867337728, + "loss": 2.4037, + "step": 166500 + }, + { + "epoch": 2.2393075082663514, + "grad_norm": 0.881832480430603, + "learning_rate": 0.0048801667589862706, + "loss": 2.4167, + "step": 166600 + }, + { + "epoch": 2.2406516304201727, + "grad_norm": 3.5371510982513428, + "learning_rate": 0.00488000454361346, + "loss": 2.411, + "step": 166700 + }, + { + "epoch": 2.241995752573994, + "grad_norm": 0.2387372851371765, + "learning_rate": 0.004879842221226608, + "loss": 2.4134, + "step": 166800 + }, + { + "epoch": 2.2433398747278153, + "grad_norm": 0.2676979899406433, + "learning_rate": 0.00487967979183303, + "loss": 2.4213, + "step": 166900 + }, + { + "epoch": 2.2446839968816366, + "grad_norm": 0.9509516954421997, + "learning_rate": 0.0048795172554400475, + "loss": 2.4127, + "step": 167000 + }, + { + "epoch": 2.2446839968816366, + "eval_MaskedAccuracy": 0.44885926012655697, + "eval_loss": 2.7260682582855225, + "eval_runtime": 166.3086, + "eval_samples_per_second": 381.676, + "eval_steps_per_second": 1.491, + "step": 167000 + }, + { + "epoch": 2.246028119035458, + "grad_norm": 0.3096467852592468, + "learning_rate": 0.004879354612054984, + "loss": 2.4122, + "step": 167100 + }, + { + "epoch": 2.247372241189279, + "grad_norm": 0.6709736585617065, + "learning_rate": 0.004879191861685164, + "loss": 2.4082, + "step": 167200 + }, + { + "epoch": 2.2487163633431004, + "grad_norm": 0.5945542454719543, + "learning_rate": 0.004879029004337925, + "loss": 2.4125, + "step": 167300 + }, + { + "epoch": 2.250060485496922, + "grad_norm": 1.153786063194275, + "learning_rate": 0.004878866040020597, + "loss": 2.4236, + "step": 167400 + }, + { + "epoch": 2.2514046076507435, + "grad_norm": 0.9520905613899231, + "learning_rate": 0.004878702968740533, + "loss": 2.4141, + "step": 167500 + }, + { + "epoch": 2.2527487298045648, + "grad_norm": 0.8178815841674805, + "learning_rate": 0.004878539790505079, + "loss": 2.4127, + "step": 167600 + }, + { + "epoch": 2.254092851958386, + "grad_norm": 0.29091590642929077, + "learning_rate": 0.004878376505321578, + "loss": 2.4077, + "step": 167700 + }, + { + "epoch": 2.2554369741122073, + "grad_norm": 1.5120651721954346, + "learning_rate": 0.004878213113197398, + "loss": 2.4125, + "step": 167800 + }, + { + "epoch": 2.2567810962660286, + "grad_norm": 1.6276648044586182, + "learning_rate": 0.004878049614139895, + "loss": 2.4122, + "step": 167900 + }, + { + "epoch": 2.25812521841985, + "grad_norm": 0.786770761013031, + "learning_rate": 0.004877886008156443, + "loss": 2.405, + "step": 168000 + }, + { + "epoch": 2.25812521841985, + "eval_MaskedAccuracy": 0.4498786265923044, + "eval_loss": 2.719548225402832, + "eval_runtime": 155.0214, + "eval_samples_per_second": 409.466, + "eval_steps_per_second": 1.6, + "step": 168000 + }, + { + "epoch": 2.259469340573671, + "grad_norm": 0.46489110589027405, + "learning_rate": 0.004877722295254405, + "loss": 2.4138, + "step": 168100 + }, + { + "epoch": 2.2608134627274925, + "grad_norm": 1.4252897500991821, + "learning_rate": 0.004877558475441166, + "loss": 2.411, + "step": 168200 + }, + { + "epoch": 2.2621575848813142, + "grad_norm": 0.2540140450000763, + "learning_rate": 0.004877394548724101, + "loss": 2.4093, + "step": 168300 + }, + { + "epoch": 2.2635017070351355, + "grad_norm": 0.3754124045372009, + "learning_rate": 0.004877230515110599, + "loss": 2.4146, + "step": 168400 + }, + { + "epoch": 2.264845829188957, + "grad_norm": 0.6744637489318848, + "learning_rate": 0.0048770663746080505, + "loss": 2.4203, + "step": 168500 + }, + { + "epoch": 2.266189951342778, + "grad_norm": 0.21679449081420898, + "learning_rate": 0.004876902127223849, + "loss": 2.4013, + "step": 168600 + }, + { + "epoch": 2.2675340734965994, + "grad_norm": 0.3662445843219757, + "learning_rate": 0.004876737772965405, + "loss": 2.4108, + "step": 168700 + }, + { + "epoch": 2.2688781956504207, + "grad_norm": 1.2381483316421509, + "learning_rate": 0.004876573311840115, + "loss": 2.4117, + "step": 168800 + }, + { + "epoch": 2.270222317804242, + "grad_norm": 1.7564358711242676, + "learning_rate": 0.0048764087438553875, + "loss": 2.4082, + "step": 168900 + }, + { + "epoch": 2.2715664399580633, + "grad_norm": 0.6631215810775757, + "learning_rate": 0.004876244069018646, + "loss": 2.4133, + "step": 169000 + }, + { + "epoch": 2.2715664399580633, + "eval_MaskedAccuracy": 0.45023057912506814, + "eval_loss": 2.716172695159912, + "eval_runtime": 156.7093, + "eval_samples_per_second": 405.056, + "eval_steps_per_second": 1.583, + "step": 169000 + }, + { + "epoch": 2.2729105621118846, + "grad_norm": 0.8016698360443115, + "learning_rate": 0.004876079287337305, + "loss": 2.4051, + "step": 169100 + }, + { + "epoch": 2.2742546842657063, + "grad_norm": 0.519679844379425, + "learning_rate": 0.0048759143988187925, + "loss": 2.4115, + "step": 169200 + }, + { + "epoch": 2.275598806419527, + "grad_norm": 0.7286385297775269, + "learning_rate": 0.004875749403470536, + "loss": 2.4202, + "step": 169300 + }, + { + "epoch": 2.276942928573349, + "grad_norm": 0.24820998311042786, + "learning_rate": 0.004875584301299965, + "loss": 2.405, + "step": 169400 + }, + { + "epoch": 2.27828705072717, + "grad_norm": 0.25017213821411133, + "learning_rate": 0.004875419092314521, + "loss": 2.4129, + "step": 169500 + }, + { + "epoch": 2.2796311728809915, + "grad_norm": 0.8957297801971436, + "learning_rate": 0.004875253776521664, + "loss": 2.4071, + "step": 169600 + }, + { + "epoch": 2.2809752950348128, + "grad_norm": 0.5003229975700378, + "learning_rate": 0.004875088353928819, + "loss": 2.4099, + "step": 169700 + }, + { + "epoch": 2.282319417188634, + "grad_norm": 0.6370465159416199, + "learning_rate": 0.004874922824543451, + "loss": 2.4257, + "step": 169800 + }, + { + "epoch": 2.2836635393424554, + "grad_norm": 0.30033034086227417, + "learning_rate": 0.004874757188373019, + "loss": 2.4151, + "step": 169900 + }, + { + "epoch": 2.2850076614962767, + "grad_norm": 1.7922015190124512, + "learning_rate": 0.004874591445424987, + "loss": 2.4176, + "step": 170000 + }, + { + "epoch": 2.2850076614962767, + "eval_MaskedAccuracy": 0.4488507191873566, + "eval_loss": 2.722308874130249, + "eval_runtime": 156.2994, + "eval_samples_per_second": 406.118, + "eval_steps_per_second": 1.587, + "step": 170000 + }, + { + "epoch": 2.286351783650098, + "grad_norm": 1.080620288848877, + "learning_rate": 0.004874425595706819, + "loss": 2.4126, + "step": 170100 + }, + { + "epoch": 2.2876959058039192, + "grad_norm": 0.4531891644001007, + "learning_rate": 0.004874259639225995, + "loss": 2.4058, + "step": 170200 + }, + { + "epoch": 2.289040027957741, + "grad_norm": 0.2731966972351074, + "learning_rate": 0.0048740935759899835, + "loss": 2.4072, + "step": 170300 + }, + { + "epoch": 2.2903841501115623, + "grad_norm": 0.5683355927467346, + "learning_rate": 0.004873927406006273, + "loss": 2.4048, + "step": 170400 + }, + { + "epoch": 2.2917282722653836, + "grad_norm": 1.147813081741333, + "learning_rate": 0.004873761129282351, + "loss": 2.403, + "step": 170500 + }, + { + "epoch": 2.293072394419205, + "grad_norm": 0.30079033970832825, + "learning_rate": 0.0048735947458257095, + "loss": 2.4076, + "step": 170600 + }, + { + "epoch": 2.294416516573026, + "grad_norm": 0.29191407561302185, + "learning_rate": 0.00487342825564384, + "loss": 2.4128, + "step": 170700 + }, + { + "epoch": 2.2957606387268474, + "grad_norm": 0.7784181833267212, + "learning_rate": 0.004873261658744253, + "loss": 2.409, + "step": 170800 + }, + { + "epoch": 2.2971047608806687, + "grad_norm": 0.7186245322227478, + "learning_rate": 0.004873094955134448, + "loss": 2.4138, + "step": 170900 + }, + { + "epoch": 2.29844888303449, + "grad_norm": 0.9298619031906128, + "learning_rate": 0.004872928144821944, + "loss": 2.413, + "step": 171000 + }, + { + "epoch": 2.29844888303449, + "eval_MaskedAccuracy": 0.4487523079732683, + "eval_loss": 2.7223293781280518, + "eval_runtime": 154.8012, + "eval_samples_per_second": 410.049, + "eval_steps_per_second": 1.602, + "step": 171000 + }, + { + "epoch": 2.2997930051883113, + "grad_norm": 0.9932366013526917, + "learning_rate": 0.00487276122781425, + "loss": 2.417, + "step": 171100 + }, + { + "epoch": 2.301137127342133, + "grad_norm": 1.6287879943847656, + "learning_rate": 0.0048725942041188885, + "loss": 2.41, + "step": 171200 + }, + { + "epoch": 2.3024812494959543, + "grad_norm": 0.38423383235931396, + "learning_rate": 0.00487242707374338, + "loss": 2.4128, + "step": 171300 + }, + { + "epoch": 2.3038253716497756, + "grad_norm": 1.0105299949645996, + "learning_rate": 0.0048722598366952715, + "loss": 2.4199, + "step": 171400 + }, + { + "epoch": 2.305169493803597, + "grad_norm": 1.1391443014144897, + "learning_rate": 0.004872092492982082, + "loss": 2.4101, + "step": 171500 + }, + { + "epoch": 2.306513615957418, + "grad_norm": 0.2690454125404358, + "learning_rate": 0.004871925042611355, + "loss": 2.4109, + "step": 171600 + }, + { + "epoch": 2.3078577381112395, + "grad_norm": 0.3632848262786865, + "learning_rate": 0.004871757485590642, + "loss": 2.4156, + "step": 171700 + }, + { + "epoch": 2.309201860265061, + "grad_norm": 1.6692334413528442, + "learning_rate": 0.004871589821927492, + "loss": 2.415, + "step": 171800 + }, + { + "epoch": 2.310545982418882, + "grad_norm": 0.8343886137008667, + "learning_rate": 0.004871422051629453, + "loss": 2.4143, + "step": 171900 + }, + { + "epoch": 2.3118901045727034, + "grad_norm": 0.42459598183631897, + "learning_rate": 0.004871254174704084, + "loss": 2.4116, + "step": 172000 + }, + { + "epoch": 2.3118901045727034, + "eval_MaskedAccuracy": 0.44924426931396766, + "eval_loss": 2.720871925354004, + "eval_runtime": 154.4054, + "eval_samples_per_second": 411.099, + "eval_steps_per_second": 1.606, + "step": 172000 + }, + { + "epoch": 2.313234226726525, + "grad_norm": 0.23972781002521515, + "learning_rate": 0.004871086191158964, + "loss": 2.4127, + "step": 172100 + }, + { + "epoch": 2.3145783488803464, + "grad_norm": 0.5230980515480042, + "learning_rate": 0.004870918101001645, + "loss": 2.4101, + "step": 172200 + }, + { + "epoch": 2.3159224710341677, + "grad_norm": 1.2739980220794678, + "learning_rate": 0.004870749904239708, + "loss": 2.4125, + "step": 172300 + }, + { + "epoch": 2.317266593187989, + "grad_norm": 0.2384161502122879, + "learning_rate": 0.004870581600880735, + "loss": 2.4088, + "step": 172400 + }, + { + "epoch": 2.3186107153418103, + "grad_norm": 0.5129702091217041, + "learning_rate": 0.004870413190932297, + "loss": 2.4088, + "step": 172500 + }, + { + "epoch": 2.3199548374956316, + "grad_norm": 0.2832634449005127, + "learning_rate": 0.004870244674402003, + "loss": 2.4016, + "step": 172600 + }, + { + "epoch": 2.321298959649453, + "grad_norm": 0.287016361951828, + "learning_rate": 0.004870076051297425, + "loss": 2.4126, + "step": 172700 + }, + { + "epoch": 2.322643081803274, + "grad_norm": 0.2521541118621826, + "learning_rate": 0.0048699073216261725, + "loss": 2.4122, + "step": 172800 + }, + { + "epoch": 2.3239872039570955, + "grad_norm": 0.29343220591545105, + "learning_rate": 0.004869738485395845, + "loss": 2.3996, + "step": 172900 + }, + { + "epoch": 2.325331326110917, + "grad_norm": 0.2879350185394287, + "learning_rate": 0.004869569542614052, + "loss": 2.4084, + "step": 173000 + }, + { + "epoch": 2.325331326110917, + "eval_MaskedAccuracy": 0.4492615010440258, + "eval_loss": 2.7211360931396484, + "eval_runtime": 156.2801, + "eval_samples_per_second": 406.168, + "eval_steps_per_second": 1.587, + "step": 173000 + }, + { + "epoch": 2.3266754482647385, + "grad_norm": 0.30386149883270264, + "learning_rate": 0.004869400493288407, + "loss": 2.4125, + "step": 173100 + }, + { + "epoch": 2.32801957041856, + "grad_norm": 0.30869200825691223, + "learning_rate": 0.004869231337426524, + "loss": 2.413, + "step": 173200 + }, + { + "epoch": 2.329363692572381, + "grad_norm": 0.6456079483032227, + "learning_rate": 0.0048690620750360255, + "loss": 2.4144, + "step": 173300 + }, + { + "epoch": 2.3307078147262024, + "grad_norm": 0.6145516633987427, + "learning_rate": 0.004868892706124542, + "loss": 2.4062, + "step": 173400 + }, + { + "epoch": 2.3320519368800237, + "grad_norm": 0.5485299229621887, + "learning_rate": 0.004868723230699693, + "loss": 2.401, + "step": 173500 + }, + { + "epoch": 2.333396059033845, + "grad_norm": 1.082629680633545, + "learning_rate": 0.004868553648769127, + "loss": 2.413, + "step": 173600 + }, + { + "epoch": 2.3347401811876662, + "grad_norm": 0.5388498306274414, + "learning_rate": 0.004868383960340481, + "loss": 2.4097, + "step": 173700 + }, + { + "epoch": 2.3360843033414875, + "grad_norm": 0.23575937747955322, + "learning_rate": 0.004868214165421399, + "loss": 2.4054, + "step": 173800 + }, + { + "epoch": 2.337428425495309, + "grad_norm": 0.7409723997116089, + "learning_rate": 0.004868044264019538, + "loss": 2.4085, + "step": 173900 + }, + { + "epoch": 2.33877254764913, + "grad_norm": 0.2735578715801239, + "learning_rate": 0.004867874256142546, + "loss": 2.4156, + "step": 174000 + }, + { + "epoch": 2.33877254764913, + "eval_MaskedAccuracy": 0.4495013367228055, + "eval_loss": 2.7185449600219727, + "eval_runtime": 154.8489, + "eval_samples_per_second": 409.922, + "eval_steps_per_second": 1.602, + "step": 174000 + }, + { + "epoch": 2.340116669802952, + "grad_norm": 0.26877549290657043, + "learning_rate": 0.004867704141798087, + "loss": 2.4045, + "step": 174100 + }, + { + "epoch": 2.341460791956773, + "grad_norm": 0.6106776595115662, + "learning_rate": 0.004867533920993822, + "loss": 2.4046, + "step": 174200 + }, + { + "epoch": 2.3428049141105944, + "grad_norm": 0.2555699348449707, + "learning_rate": 0.00486736359373743, + "loss": 2.4151, + "step": 174300 + }, + { + "epoch": 2.3441490362644157, + "grad_norm": 0.3675636649131775, + "learning_rate": 0.004867193160036579, + "loss": 2.4072, + "step": 174400 + }, + { + "epoch": 2.345493158418237, + "grad_norm": 0.7142601609230042, + "learning_rate": 0.004867022619898949, + "loss": 2.4049, + "step": 174500 + }, + { + "epoch": 2.3468372805720583, + "grad_norm": 1.72738778591156, + "learning_rate": 0.004866851973332225, + "loss": 2.412, + "step": 174600 + }, + { + "epoch": 2.3481814027258796, + "grad_norm": 0.45770561695098877, + "learning_rate": 0.004866681220344099, + "loss": 2.416, + "step": 174700 + }, + { + "epoch": 2.349525524879701, + "grad_norm": 0.788829505443573, + "learning_rate": 0.004866510360942256, + "loss": 2.4055, + "step": 174800 + }, + { + "epoch": 2.350869647033522, + "grad_norm": 0.2863014042377472, + "learning_rate": 0.004866339395134404, + "loss": 2.4019, + "step": 174900 + }, + { + "epoch": 2.352213769187344, + "grad_norm": 0.27914437651634216, + "learning_rate": 0.004866168322928247, + "loss": 2.4036, + "step": 175000 + }, + { + "epoch": 2.352213769187344, + "eval_MaskedAccuracy": 0.4499790247285866, + "eval_loss": 2.7163689136505127, + "eval_runtime": 155.1875, + "eval_samples_per_second": 409.028, + "eval_steps_per_second": 1.598, + "step": 175000 + }, + { + "epoch": 2.353557891341165, + "grad_norm": 0.49107837677001953, + "learning_rate": 0.004865997144331489, + "loss": 2.4051, + "step": 175100 + }, + { + "epoch": 2.3549020134949865, + "grad_norm": 1.704633355140686, + "learning_rate": 0.004865825859351838, + "loss": 2.4091, + "step": 175200 + }, + { + "epoch": 2.356246135648808, + "grad_norm": 0.2347813844680786, + "learning_rate": 0.004865654467997019, + "loss": 2.4098, + "step": 175300 + }, + { + "epoch": 2.357590257802629, + "grad_norm": 0.9905913472175598, + "learning_rate": 0.0048654829702747555, + "loss": 2.4093, + "step": 175400 + }, + { + "epoch": 2.3589343799564504, + "grad_norm": 0.3170483410358429, + "learning_rate": 0.0048653113661927695, + "loss": 2.4048, + "step": 175500 + }, + { + "epoch": 2.3602785021102717, + "grad_norm": 0.8622209429740906, + "learning_rate": 0.004865139655758797, + "loss": 2.4073, + "step": 175600 + }, + { + "epoch": 2.361622624264093, + "grad_norm": 0.3935220241546631, + "learning_rate": 0.004864967838980571, + "loss": 2.4155, + "step": 175700 + }, + { + "epoch": 2.3629667464179143, + "grad_norm": 1.1118823289871216, + "learning_rate": 0.004864795915865833, + "loss": 2.4112, + "step": 175800 + }, + { + "epoch": 2.364310868571736, + "grad_norm": 0.7513082027435303, + "learning_rate": 0.00486462388642233, + "loss": 2.4152, + "step": 175900 + }, + { + "epoch": 2.3656549907255573, + "grad_norm": 1.073836326599121, + "learning_rate": 0.0048644517506578255, + "loss": 2.4023, + "step": 176000 + }, + { + "epoch": 2.3656549907255573, + "eval_MaskedAccuracy": 0.44973931526656014, + "eval_loss": 2.7163290977478027, + "eval_runtime": 156.8073, + "eval_samples_per_second": 404.803, + "eval_steps_per_second": 1.582, + "step": 176000 + }, + { + "epoch": 2.3669991128793786, + "grad_norm": 0.6424139142036438, + "learning_rate": 0.004864279508580062, + "loss": 2.3978, + "step": 176100 + }, + { + "epoch": 2.3683432350332, + "grad_norm": 0.9152630567550659, + "learning_rate": 0.004864107160196811, + "loss": 2.405, + "step": 176200 + }, + { + "epoch": 2.369687357187021, + "grad_norm": 0.44839566946029663, + "learning_rate": 0.004863934705515825, + "loss": 2.4032, + "step": 176300 + }, + { + "epoch": 2.3710314793408425, + "grad_norm": 1.236681342124939, + "learning_rate": 0.0048637621445448845, + "loss": 2.4041, + "step": 176400 + }, + { + "epoch": 2.3723756014946638, + "grad_norm": 0.62148517370224, + "learning_rate": 0.004863589477291759, + "loss": 2.4113, + "step": 176500 + }, + { + "epoch": 2.373719723648485, + "grad_norm": 0.2154432088136673, + "learning_rate": 0.004863416703764239, + "loss": 2.4113, + "step": 176600 + }, + { + "epoch": 2.3750638458023063, + "grad_norm": 0.5083281993865967, + "learning_rate": 0.004863243823970095, + "loss": 2.4058, + "step": 176700 + }, + { + "epoch": 2.376407967956128, + "grad_norm": 0.33949902653694153, + "learning_rate": 0.004863070837917129, + "loss": 2.4015, + "step": 176800 + }, + { + "epoch": 2.3777520901099494, + "grad_norm": 1.1010327339172363, + "learning_rate": 0.004862897745613131, + "loss": 2.4008, + "step": 176900 + }, + { + "epoch": 2.3790962122637707, + "grad_norm": 1.2555139064788818, + "learning_rate": 0.004862724547065901, + "loss": 2.4059, + "step": 177000 + }, + { + "epoch": 2.3790962122637707, + "eval_MaskedAccuracy": 0.4503048499742003, + "eval_loss": 2.716306447982788, + "eval_runtime": 155.5308, + "eval_samples_per_second": 408.125, + "eval_steps_per_second": 1.595, + "step": 177000 + }, + { + "epoch": 2.380440334417592, + "grad_norm": 0.5980218648910522, + "learning_rate": 0.0048625512422832366, + "loss": 2.4036, + "step": 177100 + }, + { + "epoch": 2.3817844565714132, + "grad_norm": 0.252000629901886, + "learning_rate": 0.004862377831272951, + "loss": 2.4125, + "step": 177200 + }, + { + "epoch": 2.3831285787252345, + "grad_norm": 0.6052896976470947, + "learning_rate": 0.004862204314042862, + "loss": 2.415, + "step": 177300 + }, + { + "epoch": 2.384472700879056, + "grad_norm": 0.37761661410331726, + "learning_rate": 0.004862030690600784, + "loss": 2.4009, + "step": 177400 + }, + { + "epoch": 2.385816823032877, + "grad_norm": 0.3634903132915497, + "learning_rate": 0.004861856960954538, + "loss": 2.4027, + "step": 177500 + }, + { + "epoch": 2.3871609451866984, + "grad_norm": 0.24861650168895721, + "learning_rate": 0.004861683125111956, + "loss": 2.4014, + "step": 177600 + }, + { + "epoch": 2.38850506734052, + "grad_norm": 0.893480122089386, + "learning_rate": 0.004861509183080868, + "loss": 2.4062, + "step": 177700 + }, + { + "epoch": 2.389849189494341, + "grad_norm": 0.22390282154083252, + "learning_rate": 0.004861335134869118, + "loss": 2.3922, + "step": 177800 + }, + { + "epoch": 2.3911933116481627, + "grad_norm": 0.6363741159439087, + "learning_rate": 0.004861160980484538, + "loss": 2.402, + "step": 177900 + }, + { + "epoch": 2.392537433801984, + "grad_norm": 0.918346107006073, + "learning_rate": 0.004860986719934979, + "loss": 2.3986, + "step": 178000 + }, + { + "epoch": 2.392537433801984, + "eval_MaskedAccuracy": 0.44992926294273605, + "eval_loss": 2.714897394180298, + "eval_runtime": 156.6614, + "eval_samples_per_second": 405.18, + "eval_steps_per_second": 1.583, + "step": 178000 + }, + { + "epoch": 2.3938815559558053, + "grad_norm": 0.3443443477153778, + "learning_rate": 0.0048608123532282955, + "loss": 2.4041, + "step": 178100 + }, + { + "epoch": 2.3952256781096266, + "grad_norm": 1.1353808641433716, + "learning_rate": 0.004860637880372347, + "loss": 2.4011, + "step": 178200 + }, + { + "epoch": 2.396569800263448, + "grad_norm": 0.813539445400238, + "learning_rate": 0.004860463301374987, + "loss": 2.4029, + "step": 178300 + }, + { + "epoch": 2.397913922417269, + "grad_norm": 1.0734734535217285, + "learning_rate": 0.004860288616244085, + "loss": 2.4092, + "step": 178400 + }, + { + "epoch": 2.3992580445710905, + "grad_norm": 0.3121657073497772, + "learning_rate": 0.004860113824987515, + "loss": 2.3985, + "step": 178500 + }, + { + "epoch": 2.400602166724912, + "grad_norm": 0.5276472568511963, + "learning_rate": 0.004859938927613148, + "loss": 2.4032, + "step": 178600 + }, + { + "epoch": 2.401946288878733, + "grad_norm": 0.21578922867774963, + "learning_rate": 0.004859763924128867, + "loss": 2.4002, + "step": 178700 + }, + { + "epoch": 2.403290411032555, + "grad_norm": 0.36338990926742554, + "learning_rate": 0.004859588814542557, + "loss": 2.4103, + "step": 178800 + }, + { + "epoch": 2.404634533186376, + "grad_norm": 0.7060322165489197, + "learning_rate": 0.004859413598862113, + "loss": 2.4039, + "step": 178900 + }, + { + "epoch": 2.4059786553401974, + "grad_norm": 0.3517087399959564, + "learning_rate": 0.004859238277095424, + "loss": 2.4107, + "step": 179000 + }, + { + "epoch": 2.4059786553401974, + "eval_MaskedAccuracy": 0.4501261935846993, + "eval_loss": 2.7151551246643066, + "eval_runtime": 158.7753, + "eval_samples_per_second": 399.785, + "eval_steps_per_second": 1.562, + "step": 179000 + }, + { + "epoch": 2.4073227774940187, + "grad_norm": 0.850573718547821, + "learning_rate": 0.004859062849250387, + "loss": 2.4033, + "step": 179100 + }, + { + "epoch": 2.40866689964784, + "grad_norm": 0.25686702132225037, + "learning_rate": 0.004858887315334917, + "loss": 2.4033, + "step": 179200 + }, + { + "epoch": 2.4100110218016613, + "grad_norm": 0.580507755279541, + "learning_rate": 0.0048587116753569136, + "loss": 2.4075, + "step": 179300 + }, + { + "epoch": 2.4113551439554826, + "grad_norm": 0.21325333416461945, + "learning_rate": 0.004858535929324301, + "loss": 2.4066, + "step": 179400 + }, + { + "epoch": 2.412699266109304, + "grad_norm": 1.397865891456604, + "learning_rate": 0.004858360077244987, + "loss": 2.4069, + "step": 179500 + }, + { + "epoch": 2.414043388263125, + "grad_norm": 0.7146439552307129, + "learning_rate": 0.004858184119126909, + "loss": 2.4055, + "step": 179600 + }, + { + "epoch": 2.415387510416947, + "grad_norm": 0.6387777328491211, + "learning_rate": 0.004858008054977973, + "loss": 2.3984, + "step": 179700 + }, + { + "epoch": 2.416731632570768, + "grad_norm": 0.6029757261276245, + "learning_rate": 0.004857831884806135, + "loss": 2.4017, + "step": 179800 + }, + { + "epoch": 2.4180757547245895, + "grad_norm": 0.3883086144924164, + "learning_rate": 0.004857655608619315, + "loss": 2.4009, + "step": 179900 + }, + { + "epoch": 2.4194198768784108, + "grad_norm": 0.24268724024295807, + "learning_rate": 0.004857479226425462, + "loss": 2.4127, + "step": 180000 + }, + { + "epoch": 2.4194198768784108, + "eval_MaskedAccuracy": 0.4500598610681297, + "eval_loss": 2.7149128913879395, + "eval_runtime": 160.6745, + "eval_samples_per_second": 395.06, + "eval_steps_per_second": 1.543, + "step": 180000 + }, + { + "epoch": 2.420763999032232, + "grad_norm": 0.2926810085773468, + "learning_rate": 0.004857302738232527, + "loss": 2.4048, + "step": 180100 + }, + { + "epoch": 2.4221081211860533, + "grad_norm": 0.5268754363059998, + "learning_rate": 0.004857126144048464, + "loss": 2.3981, + "step": 180200 + }, + { + "epoch": 2.4234522433398746, + "grad_norm": 0.27951580286026, + "learning_rate": 0.004856949443881227, + "loss": 2.3983, + "step": 180300 + }, + { + "epoch": 2.424796365493696, + "grad_norm": 0.5589247345924377, + "learning_rate": 0.004856772637738778, + "loss": 2.405, + "step": 180400 + }, + { + "epoch": 2.426140487647517, + "grad_norm": 0.2754684090614319, + "learning_rate": 0.004856595725629084, + "loss": 2.4081, + "step": 180500 + }, + { + "epoch": 2.427484609801339, + "grad_norm": 1.014289379119873, + "learning_rate": 0.004856418707560116, + "loss": 2.4028, + "step": 180600 + }, + { + "epoch": 2.4288287319551602, + "grad_norm": 1.224565029144287, + "learning_rate": 0.004856241583539849, + "loss": 2.4041, + "step": 180700 + }, + { + "epoch": 2.4301728541089815, + "grad_norm": 0.2553938329219818, + "learning_rate": 0.004856064353576271, + "loss": 2.4039, + "step": 180800 + }, + { + "epoch": 2.431516976262803, + "grad_norm": 0.36857032775878906, + "learning_rate": 0.004855887017677354, + "loss": 2.4021, + "step": 180900 + }, + { + "epoch": 2.432861098416624, + "grad_norm": 0.3493588864803314, + "learning_rate": 0.004855709575851092, + "loss": 2.4009, + "step": 181000 + }, + { + "epoch": 2.432861098416624, + "eval_MaskedAccuracy": 0.4509061473369637, + "eval_loss": 2.7095935344696045, + "eval_runtime": 159.2973, + "eval_samples_per_second": 398.475, + "eval_steps_per_second": 1.557, + "step": 181000 + }, + { + "epoch": 2.4342052205704454, + "grad_norm": 0.6775467395782471, + "learning_rate": 0.004855532028105485, + "loss": 2.393, + "step": 181100 + }, + { + "epoch": 2.4355493427242667, + "grad_norm": 1.7895002365112305, + "learning_rate": 0.004855354374448533, + "loss": 2.3995, + "step": 181200 + }, + { + "epoch": 2.436893464878088, + "grad_norm": 0.2109525501728058, + "learning_rate": 0.004855176614888239, + "loss": 2.4031, + "step": 181300 + }, + { + "epoch": 2.4382375870319093, + "grad_norm": 0.24894700944423676, + "learning_rate": 0.004854998749432621, + "loss": 2.4049, + "step": 181400 + }, + { + "epoch": 2.439581709185731, + "grad_norm": 0.3863070011138916, + "learning_rate": 0.004854820778089685, + "loss": 2.3983, + "step": 181500 + }, + { + "epoch": 2.4409258313395523, + "grad_norm": 0.4976949095726013, + "learning_rate": 0.0048546427008674484, + "loss": 2.4076, + "step": 181600 + }, + { + "epoch": 2.4422699534933736, + "grad_norm": 1.53170907497406, + "learning_rate": 0.00485446451777394, + "loss": 2.4009, + "step": 181700 + }, + { + "epoch": 2.443614075647195, + "grad_norm": 0.8086992502212524, + "learning_rate": 0.0048542862288171885, + "loss": 2.4014, + "step": 181800 + }, + { + "epoch": 2.444958197801016, + "grad_norm": 0.8978860378265381, + "learning_rate": 0.00485410783400522, + "loss": 2.4002, + "step": 181900 + }, + { + "epoch": 2.4463023199548375, + "grad_norm": 0.40652504563331604, + "learning_rate": 0.004853929333346081, + "loss": 2.3978, + "step": 182000 + }, + { + "epoch": 2.4463023199548375, + "eval_MaskedAccuracy": 0.4505315380239766, + "eval_loss": 2.7120492458343506, + "eval_runtime": 158.1071, + "eval_samples_per_second": 401.475, + "eval_steps_per_second": 1.569, + "step": 182000 + }, + { + "epoch": 2.447646442108659, + "grad_norm": 0.544252336025238, + "learning_rate": 0.004853750726847811, + "loss": 2.3905, + "step": 182100 + }, + { + "epoch": 2.44899056426248, + "grad_norm": 1.0650444030761719, + "learning_rate": 0.0048535720145184615, + "loss": 2.4042, + "step": 182200 + }, + { + "epoch": 2.4503346864163014, + "grad_norm": 0.23677361011505127, + "learning_rate": 0.004853393196366076, + "loss": 2.3987, + "step": 182300 + }, + { + "epoch": 2.4516788085701227, + "grad_norm": 0.5979803204536438, + "learning_rate": 0.004853214272398713, + "loss": 2.4045, + "step": 182400 + }, + { + "epoch": 2.453022930723944, + "grad_norm": 0.37009915709495544, + "learning_rate": 0.0048530352426244434, + "loss": 2.4025, + "step": 182500 + }, + { + "epoch": 2.4543670528777657, + "grad_norm": 0.9107745289802551, + "learning_rate": 0.00485285610705133, + "loss": 2.4047, + "step": 182600 + }, + { + "epoch": 2.455711175031587, + "grad_norm": 0.6059080958366394, + "learning_rate": 0.004852676865687442, + "loss": 2.4036, + "step": 182700 + }, + { + "epoch": 2.4570552971854083, + "grad_norm": 1.2172014713287354, + "learning_rate": 0.004852497518540862, + "loss": 2.4039, + "step": 182800 + }, + { + "epoch": 2.4583994193392296, + "grad_norm": 0.27083879709243774, + "learning_rate": 0.004852318065619665, + "loss": 2.4028, + "step": 182900 + }, + { + "epoch": 2.459743541493051, + "grad_norm": 0.2236872911453247, + "learning_rate": 0.0048521385069319346, + "loss": 2.3937, + "step": 183000 + }, + { + "epoch": 2.459743541493051, + "eval_MaskedAccuracy": 0.4513215347110807, + "eval_loss": 2.7078845500946045, + "eval_runtime": 156.7319, + "eval_samples_per_second": 404.997, + "eval_steps_per_second": 1.582, + "step": 183000 + }, + { + "epoch": 2.461087663646872, + "grad_norm": 0.7614181637763977, + "learning_rate": 0.004851958842485767, + "loss": 2.3939, + "step": 183100 + }, + { + "epoch": 2.4624317858006934, + "grad_norm": 0.20438380539417267, + "learning_rate": 0.004851779072289257, + "loss": 2.4072, + "step": 183200 + }, + { + "epoch": 2.4637759079545147, + "grad_norm": 0.995496392250061, + "learning_rate": 0.004851599196350496, + "loss": 2.3985, + "step": 183300 + }, + { + "epoch": 2.465120030108336, + "grad_norm": 0.5554972290992737, + "learning_rate": 0.004851419214677599, + "loss": 2.3987, + "step": 183400 + }, + { + "epoch": 2.4664641522621578, + "grad_norm": 0.7563191056251526, + "learning_rate": 0.004851239127278679, + "loss": 2.4124, + "step": 183500 + }, + { + "epoch": 2.467808274415979, + "grad_norm": 0.9702069163322449, + "learning_rate": 0.004851058934161847, + "loss": 2.4062, + "step": 183600 + }, + { + "epoch": 2.4691523965698003, + "grad_norm": 0.3375505208969116, + "learning_rate": 0.004850878635335215, + "loss": 2.3993, + "step": 183700 + }, + { + "epoch": 2.4704965187236216, + "grad_norm": 0.3639039993286133, + "learning_rate": 0.004850698230806917, + "loss": 2.3888, + "step": 183800 + }, + { + "epoch": 2.471840640877443, + "grad_norm": 1.2747180461883545, + "learning_rate": 0.004850517720585071, + "loss": 2.3952, + "step": 183900 + }, + { + "epoch": 2.473184763031264, + "grad_norm": 0.2078840732574463, + "learning_rate": 0.004850337104677819, + "loss": 2.3996, + "step": 184000 + }, + { + "epoch": 2.473184763031264, + "eval_MaskedAccuracy": 0.4516848487899179, + "eval_loss": 2.7041962146759033, + "eval_runtime": 156.0607, + "eval_samples_per_second": 406.739, + "eval_steps_per_second": 1.589, + "step": 184000 + }, + { + "epoch": 2.4745288851850855, + "grad_norm": 0.7602828145027161, + "learning_rate": 0.004850156383093299, + "loss": 2.3861, + "step": 184100 + }, + { + "epoch": 2.475873007338907, + "grad_norm": 0.3689172863960266, + "learning_rate": 0.004849975555839646, + "loss": 2.3889, + "step": 184200 + }, + { + "epoch": 2.477217129492728, + "grad_norm": 0.26970091462135315, + "learning_rate": 0.004849794622925014, + "loss": 2.4053, + "step": 184300 + }, + { + "epoch": 2.47856125164655, + "grad_norm": 0.237064927816391, + "learning_rate": 0.004849613584357556, + "loss": 2.4008, + "step": 184400 + }, + { + "epoch": 2.479905373800371, + "grad_norm": 1.7367196083068848, + "learning_rate": 0.004849432440145427, + "loss": 2.4008, + "step": 184500 + }, + { + "epoch": 2.4812494959541924, + "grad_norm": 0.3666498064994812, + "learning_rate": 0.004849251190296789, + "loss": 2.4011, + "step": 184600 + }, + { + "epoch": 2.4825936181080137, + "grad_norm": 0.4193936288356781, + "learning_rate": 0.004849069834819819, + "loss": 2.4085, + "step": 184700 + }, + { + "epoch": 2.483937740261835, + "grad_norm": 0.3407999277114868, + "learning_rate": 0.004848888373722671, + "loss": 2.4006, + "step": 184800 + }, + { + "epoch": 2.4852818624156563, + "grad_norm": 0.3108844459056854, + "learning_rate": 0.0048487068070135345, + "loss": 2.4081, + "step": 184900 + }, + { + "epoch": 2.4866259845694776, + "grad_norm": 0.3092864453792572, + "learning_rate": 0.004848525134700581, + "loss": 2.4082, + "step": 185000 + }, + { + "epoch": 2.4866259845694776, + "eval_MaskedAccuracy": 0.4504291166073478, + "eval_loss": 2.712066411972046, + "eval_runtime": 156.1824, + "eval_samples_per_second": 406.422, + "eval_steps_per_second": 1.588, + "step": 185000 + }, + { + "epoch": 2.487970106723299, + "grad_norm": 0.25049683451652527, + "learning_rate": 0.004848343356792004, + "loss": 2.4032, + "step": 185100 + }, + { + "epoch": 2.48931422887712, + "grad_norm": 0.9619961977005005, + "learning_rate": 0.004848161473295994, + "loss": 2.4099, + "step": 185200 + }, + { + "epoch": 2.490658351030942, + "grad_norm": 1.3118278980255127, + "learning_rate": 0.004847979484220742, + "loss": 2.3951, + "step": 185300 + }, + { + "epoch": 2.492002473184763, + "grad_norm": 0.890388011932373, + "learning_rate": 0.004847797389574453, + "loss": 2.404, + "step": 185400 + }, + { + "epoch": 2.4933465953385845, + "grad_norm": 0.9836694002151489, + "learning_rate": 0.004847615189365324, + "loss": 2.4056, + "step": 185500 + }, + { + "epoch": 2.494690717492406, + "grad_norm": 1.4075289964675903, + "learning_rate": 0.004847432883601571, + "loss": 2.3974, + "step": 185600 + }, + { + "epoch": 2.496034839646227, + "grad_norm": 1.5059231519699097, + "learning_rate": 0.0048472504722914134, + "loss": 2.399, + "step": 185700 + }, + { + "epoch": 2.4973789618000484, + "grad_norm": 0.42734503746032715, + "learning_rate": 0.004847067955443057, + "loss": 2.4063, + "step": 185800 + }, + { + "epoch": 2.4987230839538697, + "grad_norm": 1.4821892976760864, + "learning_rate": 0.004846885333064743, + "loss": 2.3914, + "step": 185900 + }, + { + "epoch": 2.500067206107691, + "grad_norm": 1.791476845741272, + "learning_rate": 0.004846702605164673, + "loss": 2.3974, + "step": 186000 + }, + { + "epoch": 2.500067206107691, + "eval_MaskedAccuracy": 0.4501642596224747, + "eval_loss": 2.7116963863372803, + "eval_runtime": 159.5858, + "eval_samples_per_second": 397.755, + "eval_steps_per_second": 1.554, + "step": 186000 + }, + { + "epoch": 2.5014113282615122, + "grad_norm": 0.5543153285980225, + "learning_rate": 0.004846519771751113, + "loss": 2.4015, + "step": 186100 + }, + { + "epoch": 2.502755450415334, + "grad_norm": 1.6506179571151733, + "learning_rate": 0.004846336832832279, + "loss": 2.3985, + "step": 186200 + }, + { + "epoch": 2.504099572569155, + "grad_norm": 0.9173561334609985, + "learning_rate": 0.004846153788416425, + "loss": 2.3929, + "step": 186300 + }, + { + "epoch": 2.5054436947229766, + "grad_norm": 0.9084792733192444, + "learning_rate": 0.004845970638511788, + "loss": 2.3981, + "step": 186400 + }, + { + "epoch": 2.506787816876798, + "grad_norm": 0.4403022825717926, + "learning_rate": 0.004845787383126632, + "loss": 2.3893, + "step": 186500 + }, + { + "epoch": 2.508131939030619, + "grad_norm": 0.3516016900539398, + "learning_rate": 0.0048456040222692055, + "loss": 2.3998, + "step": 186600 + }, + { + "epoch": 2.5094760611844404, + "grad_norm": 0.5586411952972412, + "learning_rate": 0.0048454205559477735, + "loss": 2.4068, + "step": 186700 + }, + { + "epoch": 2.5108201833382617, + "grad_norm": 0.2955363094806671, + "learning_rate": 0.004845236984170605, + "loss": 2.4005, + "step": 186800 + }, + { + "epoch": 2.512164305492083, + "grad_norm": 0.22038711607456207, + "learning_rate": 0.004845053306945971, + "loss": 2.4009, + "step": 186900 + }, + { + "epoch": 2.5135084276459043, + "grad_norm": 0.7231718301773071, + "learning_rate": 0.004844869524282139, + "loss": 2.3994, + "step": 187000 + }, + { + "epoch": 2.5135084276459043, + "eval_MaskedAccuracy": 0.45075855288065936, + "eval_loss": 2.709667682647705, + "eval_runtime": 159.519, + "eval_samples_per_second": 397.921, + "eval_steps_per_second": 1.555, + "step": 187000 + }, + { + "epoch": 2.514852549799726, + "grad_norm": 1.2093353271484375, + "learning_rate": 0.004844685636187404, + "loss": 2.4022, + "step": 187100 + }, + { + "epoch": 2.516196671953547, + "grad_norm": 0.46012699604034424, + "learning_rate": 0.004844501642670045, + "loss": 2.4025, + "step": 187200 + }, + { + "epoch": 2.5175407941073686, + "grad_norm": 0.5770087838172913, + "learning_rate": 0.00484431754373835, + "loss": 2.3926, + "step": 187300 + }, + { + "epoch": 2.51888491626119, + "grad_norm": 0.8018302321434021, + "learning_rate": 0.004844133339400616, + "loss": 2.403, + "step": 187400 + }, + { + "epoch": 2.5202290384150112, + "grad_norm": 1.2442452907562256, + "learning_rate": 0.004843949029665142, + "loss": 2.402, + "step": 187500 + }, + { + "epoch": 2.5215731605688325, + "grad_norm": 0.22802333533763885, + "learning_rate": 0.0048437646145402385, + "loss": 2.4039, + "step": 187600 + }, + { + "epoch": 2.522917282722654, + "grad_norm": 0.5682822465896606, + "learning_rate": 0.004843580094034207, + "loss": 2.3969, + "step": 187700 + }, + { + "epoch": 2.524261404876475, + "grad_norm": 0.9464827179908752, + "learning_rate": 0.004843395468155368, + "loss": 2.3924, + "step": 187800 + }, + { + "epoch": 2.5256055270302964, + "grad_norm": 0.5253119468688965, + "learning_rate": 0.00484321073691204, + "loss": 2.3929, + "step": 187900 + }, + { + "epoch": 2.526949649184118, + "grad_norm": 0.6649153828620911, + "learning_rate": 0.004843025900312542, + "loss": 2.3994, + "step": 188000 + }, + { + "epoch": 2.526949649184118, + "eval_MaskedAccuracy": 0.45129980447121243, + "eval_loss": 2.707331657409668, + "eval_runtime": 162.5712, + "eval_samples_per_second": 390.45, + "eval_steps_per_second": 1.525, + "step": 188000 + }, + { + "epoch": 2.528293771337939, + "grad_norm": 0.21869996190071106, + "learning_rate": 0.004842840958365209, + "loss": 2.3981, + "step": 188100 + }, + { + "epoch": 2.5296378934917607, + "grad_norm": 0.394054651260376, + "learning_rate": 0.004842655911078364, + "loss": 2.3985, + "step": 188200 + }, + { + "epoch": 2.530982015645582, + "grad_norm": 0.5307244658470154, + "learning_rate": 0.004842470758460362, + "loss": 2.3977, + "step": 188300 + }, + { + "epoch": 2.5323261377994033, + "grad_norm": 0.19963762164115906, + "learning_rate": 0.004842285500519532, + "loss": 2.4024, + "step": 188400 + }, + { + "epoch": 2.5336702599532246, + "grad_norm": 0.7292469143867493, + "learning_rate": 0.004842100137264226, + "loss": 2.3958, + "step": 188500 + }, + { + "epoch": 2.535014382107046, + "grad_norm": 2.722954034805298, + "learning_rate": 0.004841914668702794, + "loss": 2.3943, + "step": 188600 + }, + { + "epoch": 2.536358504260867, + "grad_norm": 0.30424684286117554, + "learning_rate": 0.004841729094843593, + "loss": 2.3951, + "step": 188700 + }, + { + "epoch": 2.5377026264146885, + "grad_norm": 0.6298586130142212, + "learning_rate": 0.004841543415694982, + "loss": 2.3964, + "step": 188800 + }, + { + "epoch": 2.5390467485685098, + "grad_norm": 0.30196303129196167, + "learning_rate": 0.004841357631265328, + "loss": 2.3894, + "step": 188900 + }, + { + "epoch": 2.540390870722331, + "grad_norm": 0.2881263792514801, + "learning_rate": 0.004841171741563008, + "loss": 2.4015, + "step": 189000 + }, + { + "epoch": 2.540390870722331, + "eval_MaskedAccuracy": 0.4518744871539106, + "eval_loss": 2.7033305168151855, + "eval_runtime": 153.1061, + "eval_samples_per_second": 414.588, + "eval_steps_per_second": 1.62, + "step": 189000 + }, + { + "epoch": 2.541734992876153, + "grad_norm": 0.23784345388412476, + "learning_rate": 0.004840985746596394, + "loss": 2.3971, + "step": 189100 + }, + { + "epoch": 2.543079115029974, + "grad_norm": 0.2152082920074463, + "learning_rate": 0.004840799646373867, + "loss": 2.3981, + "step": 189200 + }, + { + "epoch": 2.5444232371837954, + "grad_norm": 0.2263517528772354, + "learning_rate": 0.004840613440903815, + "loss": 2.3943, + "step": 189300 + }, + { + "epoch": 2.5457673593376167, + "grad_norm": 0.6147149801254272, + "learning_rate": 0.00484042713019462, + "loss": 2.3895, + "step": 189400 + }, + { + "epoch": 2.547111481491438, + "grad_norm": 0.3678668439388275, + "learning_rate": 0.0048402407142546875, + "loss": 2.3991, + "step": 189500 + }, + { + "epoch": 2.5484556036452592, + "grad_norm": 0.323637455701828, + "learning_rate": 0.0048400541930924135, + "loss": 2.3957, + "step": 189600 + }, + { + "epoch": 2.5497997257990805, + "grad_norm": 1.3563669919967651, + "learning_rate": 0.004839867566716193, + "loss": 2.3984, + "step": 189700 + }, + { + "epoch": 2.551143847952902, + "grad_norm": 1.2862578630447388, + "learning_rate": 0.004839680835134451, + "loss": 2.4005, + "step": 189800 + }, + { + "epoch": 2.552487970106723, + "grad_norm": 0.30557116866111755, + "learning_rate": 0.00483949399835559, + "loss": 2.4025, + "step": 189900 + }, + { + "epoch": 2.553832092260545, + "grad_norm": 0.46706828474998474, + "learning_rate": 0.004839307056388037, + "loss": 2.3974, + "step": 190000 + }, + { + "epoch": 2.553832092260545, + "eval_MaskedAccuracy": 0.4516189117545165, + "eval_loss": 2.704475164413452, + "eval_runtime": 155.082, + "eval_samples_per_second": 409.306, + "eval_steps_per_second": 1.599, + "step": 190000 + }, + { + "epoch": 2.5551762144143657, + "grad_norm": 0.8386965990066528, + "learning_rate": 0.0048391200092402055, + "loss": 2.3969, + "step": 190100 + }, + { + "epoch": 2.5565203365681874, + "grad_norm": 0.35076892375946045, + "learning_rate": 0.004838932856920533, + "loss": 2.4024, + "step": 190200 + }, + { + "epoch": 2.5578644587220087, + "grad_norm": 0.2803960144519806, + "learning_rate": 0.0048387455994374435, + "loss": 2.389, + "step": 190300 + }, + { + "epoch": 2.55920858087583, + "grad_norm": 0.4562201499938965, + "learning_rate": 0.004838558236799381, + "loss": 2.3991, + "step": 190400 + }, + { + "epoch": 2.5605527030296513, + "grad_norm": 1.0036048889160156, + "learning_rate": 0.004838370769014779, + "loss": 2.3928, + "step": 190500 + }, + { + "epoch": 2.5618968251834726, + "grad_norm": 0.2660607099533081, + "learning_rate": 0.004838183196092097, + "loss": 2.4001, + "step": 190600 + }, + { + "epoch": 2.563240947337294, + "grad_norm": 0.6841756701469421, + "learning_rate": 0.004837995518039784, + "loss": 2.4028, + "step": 190700 + }, + { + "epoch": 2.564585069491115, + "grad_norm": 1.2373573780059814, + "learning_rate": 0.0048378077348662845, + "loss": 2.4022, + "step": 190800 + }, + { + "epoch": 2.565929191644937, + "grad_norm": 0.6526246070861816, + "learning_rate": 0.004837619846580077, + "loss": 2.3894, + "step": 190900 + }, + { + "epoch": 2.567273313798758, + "grad_norm": 0.2829650938510895, + "learning_rate": 0.004837431853189622, + "loss": 2.4013, + "step": 191000 + }, + { + "epoch": 2.567273313798758, + "eval_MaskedAccuracy": 0.4523377197669412, + "eval_loss": 2.701768636703491, + "eval_runtime": 154.0538, + "eval_samples_per_second": 412.038, + "eval_steps_per_second": 1.61, + "step": 191000 + }, + { + "epoch": 2.5686174359525795, + "grad_norm": 0.32170721888542175, + "learning_rate": 0.004837243754703377, + "loss": 2.3919, + "step": 191100 + }, + { + "epoch": 2.569961558106401, + "grad_norm": 0.49978765845298767, + "learning_rate": 0.004837055551129837, + "loss": 2.398, + "step": 191200 + }, + { + "epoch": 2.571305680260222, + "grad_norm": 0.7216925024986267, + "learning_rate": 0.004836867242477465, + "loss": 2.389, + "step": 191300 + }, + { + "epoch": 2.5726498024140434, + "grad_norm": 0.381653368473053, + "learning_rate": 0.004836678828754757, + "loss": 2.4022, + "step": 191400 + }, + { + "epoch": 2.5739939245678647, + "grad_norm": 0.5341126322746277, + "learning_rate": 0.004836490309970204, + "loss": 2.4023, + "step": 191500 + }, + { + "epoch": 2.575338046721686, + "grad_norm": 1.2294304370880127, + "learning_rate": 0.00483630168613229, + "loss": 2.3937, + "step": 191600 + }, + { + "epoch": 2.5766821688755073, + "grad_norm": 0.9563962817192078, + "learning_rate": 0.004836112957249523, + "loss": 2.3978, + "step": 191700 + }, + { + "epoch": 2.578026291029329, + "grad_norm": 0.6789734959602356, + "learning_rate": 0.0048359241233304024, + "loss": 2.3914, + "step": 191800 + }, + { + "epoch": 2.57937041318315, + "grad_norm": 2.394522190093994, + "learning_rate": 0.004835735184383436, + "loss": 2.397, + "step": 191900 + }, + { + "epoch": 2.5807145353369716, + "grad_norm": 0.9763964414596558, + "learning_rate": 0.0048355461404171445, + "loss": 2.3978, + "step": 192000 + }, + { + "epoch": 2.5807145353369716, + "eval_MaskedAccuracy": 0.45172072887484477, + "eval_loss": 2.704576253890991, + "eval_runtime": 152.9163, + "eval_samples_per_second": 415.103, + "eval_steps_per_second": 1.622, + "step": 192000 + }, + { + "epoch": 2.582058657490793, + "grad_norm": 1.0108835697174072, + "learning_rate": 0.004835356991440036, + "loss": 2.3952, + "step": 192100 + }, + { + "epoch": 2.583402779644614, + "grad_norm": 0.4426333010196686, + "learning_rate": 0.004835167737460643, + "loss": 2.3953, + "step": 192200 + }, + { + "epoch": 2.5847469017984355, + "grad_norm": 0.35795265436172485, + "learning_rate": 0.004834978378487477, + "loss": 2.3937, + "step": 192300 + }, + { + "epoch": 2.5860910239522568, + "grad_norm": 0.5657187104225159, + "learning_rate": 0.004834788914529085, + "loss": 2.3972, + "step": 192400 + }, + { + "epoch": 2.587435146106078, + "grad_norm": 1.4484549760818481, + "learning_rate": 0.004834599345594001, + "loss": 2.4053, + "step": 192500 + }, + { + "epoch": 2.5887792682598993, + "grad_norm": 0.5882700085639954, + "learning_rate": 0.004834409671690765, + "loss": 2.3941, + "step": 192600 + }, + { + "epoch": 2.5901233904137206, + "grad_norm": 0.2406308501958847, + "learning_rate": 0.004834219892827928, + "loss": 2.3877, + "step": 192700 + }, + { + "epoch": 2.591467512567542, + "grad_norm": 0.6876055002212524, + "learning_rate": 0.004834030009014036, + "loss": 2.3869, + "step": 192800 + }, + { + "epoch": 2.5928116347213637, + "grad_norm": 2.6464781761169434, + "learning_rate": 0.0048338400202576435, + "loss": 2.3974, + "step": 192900 + }, + { + "epoch": 2.594155756875185, + "grad_norm": 0.5188945531845093, + "learning_rate": 0.004833649926567314, + "loss": 2.3896, + "step": 193000 + }, + { + "epoch": 2.594155756875185, + "eval_MaskedAccuracy": 0.4505340990281368, + "eval_loss": 2.70825457572937, + "eval_runtime": 153.498, + "eval_samples_per_second": 413.53, + "eval_steps_per_second": 1.616, + "step": 193000 + }, + { + "epoch": 2.5954998790290063, + "grad_norm": 0.28811684250831604, + "learning_rate": 0.004833459727951611, + "loss": 2.3915, + "step": 193100 + }, + { + "epoch": 2.5968440011828275, + "grad_norm": 0.47558093070983887, + "learning_rate": 0.004833269424419109, + "loss": 2.39, + "step": 193200 + }, + { + "epoch": 2.598188123336649, + "grad_norm": 0.34942418336868286, + "learning_rate": 0.004833079015978379, + "loss": 2.3917, + "step": 193300 + }, + { + "epoch": 2.59953224549047, + "grad_norm": 0.28590691089630127, + "learning_rate": 0.004832888502638003, + "loss": 2.3953, + "step": 193400 + }, + { + "epoch": 2.6008763676442914, + "grad_norm": 0.4065092206001282, + "learning_rate": 0.004832697884406561, + "loss": 2.3919, + "step": 193500 + }, + { + "epoch": 2.6022204897981127, + "grad_norm": 0.7588686943054199, + "learning_rate": 0.004832507161292647, + "loss": 2.3976, + "step": 193600 + }, + { + "epoch": 2.603564611951934, + "grad_norm": 0.8062926530838013, + "learning_rate": 0.004832316333304851, + "loss": 2.3993, + "step": 193700 + }, + { + "epoch": 2.6049087341057557, + "grad_norm": 0.3884819447994232, + "learning_rate": 0.004832125400451772, + "loss": 2.3946, + "step": 193800 + }, + { + "epoch": 2.606252856259577, + "grad_norm": 0.950999915599823, + "learning_rate": 0.004831934362742015, + "loss": 2.399, + "step": 193900 + }, + { + "epoch": 2.6075969784133983, + "grad_norm": 0.2583158016204834, + "learning_rate": 0.004831743220184189, + "loss": 2.3895, + "step": 194000 + }, + { + "epoch": 2.6075969784133983, + "eval_MaskedAccuracy": 0.45153046662578544, + "eval_loss": 2.70489764213562, + "eval_runtime": 156.1126, + "eval_samples_per_second": 406.604, + "eval_steps_per_second": 1.589, + "step": 194000 + }, + { + "epoch": 2.6089411005672196, + "grad_norm": 0.20159654319286346, + "learning_rate": 0.004831551972786904, + "loss": 2.4009, + "step": 194100 + }, + { + "epoch": 2.610285222721041, + "grad_norm": 0.3360646963119507, + "learning_rate": 0.004831360620558774, + "loss": 2.4018, + "step": 194200 + }, + { + "epoch": 2.611629344874862, + "grad_norm": 1.5870589017868042, + "learning_rate": 0.004831169163508427, + "loss": 2.3897, + "step": 194300 + }, + { + "epoch": 2.6129734670286835, + "grad_norm": 0.9621824622154236, + "learning_rate": 0.004830977601644483, + "loss": 2.3896, + "step": 194400 + }, + { + "epoch": 2.614317589182505, + "grad_norm": 0.5799399018287659, + "learning_rate": 0.004830785934975585, + "loss": 2.3932, + "step": 194500 + }, + { + "epoch": 2.615661711336326, + "grad_norm": 0.3511755168437958, + "learning_rate": 0.004830594163510357, + "loss": 2.3863, + "step": 194600 + }, + { + "epoch": 2.617005833490148, + "grad_norm": 0.6359513401985168, + "learning_rate": 0.004830402287257447, + "loss": 2.3967, + "step": 194700 + }, + { + "epoch": 2.6183499556439687, + "grad_norm": 0.7922179102897644, + "learning_rate": 0.004830210306225503, + "loss": 2.397, + "step": 194800 + }, + { + "epoch": 2.6196940777977904, + "grad_norm": 0.43679991364479065, + "learning_rate": 0.004830018220423168, + "loss": 2.3966, + "step": 194900 + }, + { + "epoch": 2.6210381999516117, + "grad_norm": 0.5114357471466064, + "learning_rate": 0.004829826029859097, + "loss": 2.3997, + "step": 195000 + }, + { + "epoch": 2.6210381999516117, + "eval_MaskedAccuracy": 0.45087460050173545, + "eval_loss": 2.7082619667053223, + "eval_runtime": 153.3827, + "eval_samples_per_second": 413.841, + "eval_steps_per_second": 1.617, + "step": 195000 + }, + { + "epoch": 2.622382322105433, + "grad_norm": 0.5121012330055237, + "learning_rate": 0.004829633734541953, + "loss": 2.3954, + "step": 195100 + }, + { + "epoch": 2.6237264442592543, + "grad_norm": 0.693209171295166, + "learning_rate": 0.0048294413344804, + "loss": 2.3919, + "step": 195200 + }, + { + "epoch": 2.6250705664130756, + "grad_norm": 0.23428793251514435, + "learning_rate": 0.004829248829683105, + "loss": 2.3982, + "step": 195300 + }, + { + "epoch": 2.626414688566897, + "grad_norm": 0.37016478180885315, + "learning_rate": 0.00482905622015875, + "loss": 2.3932, + "step": 195400 + }, + { + "epoch": 2.627758810720718, + "grad_norm": 0.6766939759254456, + "learning_rate": 0.004828863505916004, + "loss": 2.394, + "step": 195500 + }, + { + "epoch": 2.62910293287454, + "grad_norm": 0.5270081758499146, + "learning_rate": 0.004828670686963563, + "loss": 2.3912, + "step": 195600 + }, + { + "epoch": 2.6304470550283607, + "grad_norm": 0.31191486120224, + "learning_rate": 0.004828477763310103, + "loss": 2.4008, + "step": 195700 + }, + { + "epoch": 2.6317911771821825, + "grad_norm": 0.21520403027534485, + "learning_rate": 0.00482828473496432, + "loss": 2.3868, + "step": 195800 + }, + { + "epoch": 2.6331352993360038, + "grad_norm": 0.2524843215942383, + "learning_rate": 0.00482809160193491, + "loss": 2.3957, + "step": 195900 + }, + { + "epoch": 2.634479421489825, + "grad_norm": 0.6567283272743225, + "learning_rate": 0.004827898364230586, + "loss": 2.3915, + "step": 196000 + }, + { + "epoch": 2.634479421489825, + "eval_MaskedAccuracy": 0.4512480287660106, + "eval_loss": 2.7046916484832764, + "eval_runtime": 154.0885, + "eval_samples_per_second": 411.945, + "eval_steps_per_second": 1.609, + "step": 196000 + }, + { + "epoch": 2.6358235436436463, + "grad_norm": 0.542457103729248, + "learning_rate": 0.004827705021860038, + "loss": 2.3947, + "step": 196100 + }, + { + "epoch": 2.6371676657974676, + "grad_norm": 0.44133803248405457, + "learning_rate": 0.004827511574831989, + "loss": 2.3872, + "step": 196200 + }, + { + "epoch": 2.638511787951289, + "grad_norm": 0.9183239936828613, + "learning_rate": 0.004827318023155146, + "loss": 2.3966, + "step": 196300 + }, + { + "epoch": 2.6398559101051102, + "grad_norm": 0.851475715637207, + "learning_rate": 0.0048271243668382485, + "loss": 2.3931, + "step": 196400 + }, + { + "epoch": 2.641200032258932, + "grad_norm": 0.9451562166213989, + "learning_rate": 0.004826930605890004, + "loss": 2.3922, + "step": 196500 + }, + { + "epoch": 2.642544154412753, + "grad_norm": 0.5648373961448669, + "learning_rate": 0.004826736740319149, + "loss": 2.3882, + "step": 196600 + }, + { + "epoch": 2.6438882765665745, + "grad_norm": 0.32394424080848694, + "learning_rate": 0.0048265427701344185, + "loss": 2.3918, + "step": 196700 + }, + { + "epoch": 2.645232398720396, + "grad_norm": 0.5081258416175842, + "learning_rate": 0.0048263486953445565, + "loss": 2.3863, + "step": 196800 + }, + { + "epoch": 2.646576520874217, + "grad_norm": 0.7854281067848206, + "learning_rate": 0.004826154515958302, + "loss": 2.3833, + "step": 196900 + }, + { + "epoch": 2.6479206430280384, + "grad_norm": 0.5658829212188721, + "learning_rate": 0.004825960231984411, + "loss": 2.3879, + "step": 197000 + }, + { + "epoch": 2.6479206430280384, + "eval_MaskedAccuracy": 0.4514419081074835, + "eval_loss": 2.7042078971862793, + "eval_runtime": 172.0922, + "eval_samples_per_second": 368.849, + "eval_steps_per_second": 1.441, + "step": 197000 + }, + { + "epoch": 2.6492647651818597, + "grad_norm": 0.9744159579277039, + "learning_rate": 0.004825765843431632, + "loss": 2.3993, + "step": 197100 + }, + { + "epoch": 2.650608887335681, + "grad_norm": 1.349397897720337, + "learning_rate": 0.004825571350308726, + "loss": 2.3985, + "step": 197200 + }, + { + "epoch": 2.6519530094895023, + "grad_norm": 0.8859503865242004, + "learning_rate": 0.004825376752624455, + "loss": 2.3838, + "step": 197300 + }, + { + "epoch": 2.6532971316433236, + "grad_norm": 0.27515995502471924, + "learning_rate": 0.004825182050387594, + "loss": 2.3907, + "step": 197400 + }, + { + "epoch": 2.654641253797145, + "grad_norm": 0.2807437777519226, + "learning_rate": 0.0048249872436069065, + "loss": 2.3969, + "step": 197500 + }, + { + "epoch": 2.6559853759509666, + "grad_norm": 0.9948381185531616, + "learning_rate": 0.00482479233229117, + "loss": 2.392, + "step": 197600 + }, + { + "epoch": 2.657329498104788, + "grad_norm": 1.0630486011505127, + "learning_rate": 0.004824597316449177, + "loss": 2.3895, + "step": 197700 + }, + { + "epoch": 2.658673620258609, + "grad_norm": 0.24495822191238403, + "learning_rate": 0.004824402196089705, + "loss": 2.3984, + "step": 197800 + }, + { + "epoch": 2.6600177424124305, + "grad_norm": 1.0143710374832153, + "learning_rate": 0.004824206971221553, + "loss": 2.3864, + "step": 197900 + }, + { + "epoch": 2.661361864566252, + "grad_norm": 0.2758951187133789, + "learning_rate": 0.0048240116418535135, + "loss": 2.3837, + "step": 198000 + }, + { + "epoch": 2.661361864566252, + "eval_MaskedAccuracy": 0.4513660960606149, + "eval_loss": 2.70383882522583, + "eval_runtime": 159.3018, + "eval_samples_per_second": 398.464, + "eval_steps_per_second": 1.557, + "step": 198000 + }, + { + "epoch": 2.662705986720073, + "grad_norm": 0.34540656208992004, + "learning_rate": 0.004823816207994382, + "loss": 2.3947, + "step": 198100 + }, + { + "epoch": 2.6640501088738944, + "grad_norm": 0.716076672077179, + "learning_rate": 0.004823620669652978, + "loss": 2.3981, + "step": 198200 + }, + { + "epoch": 2.6653942310277157, + "grad_norm": 0.37496283650398254, + "learning_rate": 0.004823425026838102, + "loss": 2.3902, + "step": 198300 + }, + { + "epoch": 2.666738353181537, + "grad_norm": 2.4844629764556885, + "learning_rate": 0.004823229279558573, + "loss": 2.387, + "step": 198400 + }, + { + "epoch": 2.6680824753353587, + "grad_norm": 0.2198270559310913, + "learning_rate": 0.004823033427823211, + "loss": 2.3895, + "step": 198500 + }, + { + "epoch": 2.6694265974891795, + "grad_norm": 0.23235642910003662, + "learning_rate": 0.004822837471640838, + "loss": 2.3955, + "step": 198600 + }, + { + "epoch": 2.6707707196430013, + "grad_norm": 0.5798563957214355, + "learning_rate": 0.00482264141102029, + "loss": 2.3911, + "step": 198700 + }, + { + "epoch": 2.6721148417968226, + "grad_norm": 0.36171451210975647, + "learning_rate": 0.004822445245970393, + "loss": 2.3919, + "step": 198800 + }, + { + "epoch": 2.673458963950644, + "grad_norm": 0.9405841827392578, + "learning_rate": 0.00482224897649999, + "loss": 2.3965, + "step": 198900 + }, + { + "epoch": 2.674803086104465, + "grad_norm": 0.40147000551223755, + "learning_rate": 0.004822052602617928, + "loss": 2.3928, + "step": 199000 + }, + { + "epoch": 2.674803086104465, + "eval_MaskedAccuracy": 0.451330398429542, + "eval_loss": 2.7040762901306152, + "eval_runtime": 152.7346, + "eval_samples_per_second": 415.597, + "eval_steps_per_second": 1.624, + "step": 199000 + }, + { + "epoch": 2.6761472082582864, + "grad_norm": 0.9061090350151062, + "learning_rate": 0.004821856124333049, + "loss": 2.3848, + "step": 199100 + }, + { + "epoch": 2.6774913304121077, + "grad_norm": 0.8056171536445618, + "learning_rate": 0.004821659541654204, + "loss": 2.3965, + "step": 199200 + }, + { + "epoch": 2.678835452565929, + "grad_norm": 2.8304381370544434, + "learning_rate": 0.004821462854590257, + "loss": 2.3895, + "step": 199300 + }, + { + "epoch": 2.6801795747197508, + "grad_norm": 0.6876057386398315, + "learning_rate": 0.004821266063150074, + "loss": 2.3914, + "step": 199400 + }, + { + "epoch": 2.6815236968735716, + "grad_norm": 1.1147319078445435, + "learning_rate": 0.004821069167342508, + "loss": 2.3884, + "step": 199500 + }, + { + "epoch": 2.6828678190273934, + "grad_norm": 0.563893735408783, + "learning_rate": 0.004820872167176442, + "loss": 2.3824, + "step": 199600 + }, + { + "epoch": 2.6842119411812146, + "grad_norm": 0.9097170829772949, + "learning_rate": 0.004820675062660752, + "loss": 2.3951, + "step": 199700 + }, + { + "epoch": 2.685556063335036, + "grad_norm": 0.5297996997833252, + "learning_rate": 0.004820477853804319, + "loss": 2.3987, + "step": 199800 + }, + { + "epoch": 2.6869001854888572, + "grad_norm": 0.4210087060928345, + "learning_rate": 0.0048202805406160265, + "loss": 2.3966, + "step": 199900 + }, + { + "epoch": 2.6882443076426785, + "grad_norm": 0.9611253142356873, + "learning_rate": 0.004820083123104765, + "loss": 2.3946, + "step": 200000 + }, + { + "epoch": 2.6882443076426785, + "eval_MaskedAccuracy": 0.4521056806971866, + "eval_loss": 2.7007529735565186, + "eval_runtime": 154.1926, + "eval_samples_per_second": 411.667, + "eval_steps_per_second": 1.608, + "step": 200000 + } + ], + "logging_steps": 100, + "max_steps": 1500000, + "num_input_tokens_seen": 0, + "num_train_epochs": 21, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.56886375582715e+18, + "train_batch_size": 384, + "trial_name": null, + "trial_params": null +}