{ "best_metric": 2.7007529735565186, "best_model_checkpoint": "fat5-fr-small_v1/checkpoint-200000", "epoch": 2.6882443076426785, "eval_steps": 1000, "global_step": 200000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0013441221538213392, "grad_norm": 0.44315826892852783, "learning_rate": 0.002512499999999994, "loss": 7.5626, "step": 100 }, { "epoch": 0.0026882443076426785, "grad_norm": 0.5437139272689819, "learning_rate": 0.0025249999999999904, "loss": 6.2859, "step": 200 }, { "epoch": 0.004032366461464018, "grad_norm": 0.7252405881881714, "learning_rate": 0.00253749999999999, "loss": 5.8614, "step": 300 }, { "epoch": 0.005376488615285357, "grad_norm": 1.1030696630477905, "learning_rate": 0.0025499999999999885, "loss": 5.6073, "step": 400 }, { "epoch": 0.006720610769106696, "grad_norm": 0.8097702264785767, "learning_rate": 0.0025624999999999862, "loss": 5.4185, "step": 500 }, { "epoch": 0.008064732922928036, "grad_norm": 0.9873973727226257, "learning_rate": 0.0025749999999999827, "loss": 5.2865, "step": 600 }, { "epoch": 0.009408855076749375, "grad_norm": 1.1101303100585938, "learning_rate": 0.0025874999999999813, "loss": 5.1929, "step": 700 }, { "epoch": 0.010752977230570714, "grad_norm": 1.1394418478012085, "learning_rate": 0.0025999999999999795, "loss": 5.1122, "step": 800 }, { "epoch": 0.012097099384392053, "grad_norm": 1.2768878936767578, "learning_rate": 0.0026124999999999764, "loss": 5.0262, "step": 900 }, { "epoch": 0.013441221538213392, "grad_norm": 1.238966464996338, "learning_rate": 0.00262499999999997, "loss": 4.9743, "step": 1000 }, { "epoch": 0.013441221538213392, "eval_MaskedAccuracy": 0.19784819583436258, "eval_loss": 5.1476359367370605, "eval_runtime": 157.0766, "eval_samples_per_second": 404.109, "eval_steps_per_second": 1.579, "step": 1000 }, { "epoch": 0.014785343692034733, "grad_norm": 1.1027828454971313, "learning_rate": 0.002637499999999968, "loss": 4.9126, "step": 1100 }, { "epoch": 0.016129465845856072, "grad_norm": 1.021935224533081, "learning_rate": 0.002649999999999965, "loss": 4.855, "step": 1200 }, { "epoch": 0.01747358799967741, "grad_norm": 0.9193494319915771, "learning_rate": 0.0026624999999999635, "loss": 4.8046, "step": 1300 }, { "epoch": 0.01881771015349875, "grad_norm": 1.0063543319702148, "learning_rate": 0.002674999999999961, "loss": 4.7762, "step": 1400 }, { "epoch": 0.02016183230732009, "grad_norm": 1.034637689590454, "learning_rate": 0.0026874999999999564, "loss": 4.7222, "step": 1500 }, { "epoch": 0.021505954461141428, "grad_norm": 0.9538853764533997, "learning_rate": 0.002699999999999955, "loss": 4.6804, "step": 1600 }, { "epoch": 0.02285007661496277, "grad_norm": 0.8716675043106079, "learning_rate": 0.002712499999999955, "loss": 4.6452, "step": 1700 }, { "epoch": 0.024194198768784106, "grad_norm": 1.194484829902649, "learning_rate": 0.0027249999999999514, "loss": 4.6026, "step": 1800 }, { "epoch": 0.025538320922605447, "grad_norm": 0.9794184565544128, "learning_rate": 0.002737499999999953, "loss": 4.5661, "step": 1900 }, { "epoch": 0.026882443076426784, "grad_norm": 0.8896113038063049, "learning_rate": 0.0027499999999999495, "loss": 4.528, "step": 2000 }, { "epoch": 0.026882443076426784, "eval_MaskedAccuracy": 0.22670220534969857, "eval_loss": 4.742874622344971, "eval_runtime": 155.1957, "eval_samples_per_second": 409.006, "eval_steps_per_second": 1.598, "step": 2000 }, { "epoch": 0.028226565230248125, "grad_norm": 0.9724192023277283, "learning_rate": 0.0027624999999999425, "loss": 4.503, "step": 2100 }, { "epoch": 0.029570687384069465, "grad_norm": 0.9325888752937317, "learning_rate": 0.0027749999999999394, "loss": 4.4762, "step": 2200 }, { "epoch": 0.030914809537890803, "grad_norm": 0.8246894478797913, "learning_rate": 0.0027874999999999363, "loss": 4.4439, "step": 2300 }, { "epoch": 0.032258931691712144, "grad_norm": 0.9386630654335022, "learning_rate": 0.0027999999999999345, "loss": 4.4196, "step": 2400 }, { "epoch": 0.033603053845533484, "grad_norm": 0.7635268568992615, "learning_rate": 0.0028124999999999322, "loss": 4.3982, "step": 2500 }, { "epoch": 0.03494717599935482, "grad_norm": 0.905320405960083, "learning_rate": 0.00282499999999993, "loss": 4.3657, "step": 2600 }, { "epoch": 0.03629129815317616, "grad_norm": 0.8763816952705383, "learning_rate": 0.0028374999999999286, "loss": 4.3476, "step": 2700 }, { "epoch": 0.0376354203069975, "grad_norm": 1.0410003662109375, "learning_rate": 0.002849999999999924, "loss": 4.3162, "step": 2800 }, { "epoch": 0.03897954246081884, "grad_norm": 0.8011109232902527, "learning_rate": 0.0028624999999999224, "loss": 4.2953, "step": 2900 }, { "epoch": 0.04032366461464018, "grad_norm": 0.9530174136161804, "learning_rate": 0.002874999999999916, "loss": 4.2607, "step": 3000 }, { "epoch": 0.04032366461464018, "eval_MaskedAccuracy": 0.24486576013786512, "eval_loss": 4.528601169586182, "eval_runtime": 159.1854, "eval_samples_per_second": 398.755, "eval_steps_per_second": 1.558, "step": 3000 }, { "epoch": 0.041667786768461515, "grad_norm": 0.9223951101303101, "learning_rate": 0.0028874999999999123, "loss": 4.2462, "step": 3100 }, { "epoch": 0.043011908922282856, "grad_norm": 0.76118403673172, "learning_rate": 0.0028999999999999113, "loss": 4.2295, "step": 3200 }, { "epoch": 0.044356031076104196, "grad_norm": 1.0855016708374023, "learning_rate": 0.002912499999999909, "loss": 4.1973, "step": 3300 }, { "epoch": 0.04570015322992554, "grad_norm": 1.12669038772583, "learning_rate": 0.0029249999999999042, "loss": 4.1689, "step": 3400 }, { "epoch": 0.04704427538374688, "grad_norm": 0.8598240613937378, "learning_rate": 0.002937499999999899, "loss": 4.1503, "step": 3500 }, { "epoch": 0.04838839753756821, "grad_norm": 0.9049589037895203, "learning_rate": 0.002949999999999894, "loss": 4.1266, "step": 3600 }, { "epoch": 0.04973251969138955, "grad_norm": 0.9112903475761414, "learning_rate": 0.002962499999999894, "loss": 4.0996, "step": 3700 }, { "epoch": 0.05107664184521089, "grad_norm": 0.8487406969070435, "learning_rate": 0.002974999999999889, "loss": 4.0884, "step": 3800 }, { "epoch": 0.052420763999032234, "grad_norm": 0.9482388496398926, "learning_rate": 0.002987499999999884, "loss": 4.0599, "step": 3900 }, { "epoch": 0.05376488615285357, "grad_norm": 1.0483149290084839, "learning_rate": 0.002999999999999876, "loss": 4.039, "step": 4000 }, { "epoch": 0.05376488615285357, "eval_MaskedAccuracy": 0.2591465652834645, "eval_loss": 4.372995376586914, "eval_runtime": 156.5453, "eval_samples_per_second": 405.48, "eval_steps_per_second": 1.584, "step": 4000 }, { "epoch": 0.05510900830667491, "grad_norm": 0.885144054889679, "learning_rate": 0.003012499999999872, "loss": 4.0297, "step": 4100 }, { "epoch": 0.05645313046049625, "grad_norm": 1.0170485973358154, "learning_rate": 0.0030249999999998685, "loss": 4.0089, "step": 4200 }, { "epoch": 0.05779725261431759, "grad_norm": 0.9548616409301758, "learning_rate": 0.003037499999999864, "loss": 3.9864, "step": 4300 }, { "epoch": 0.05914137476813893, "grad_norm": 1.0004748106002808, "learning_rate": 0.0030499999999998627, "loss": 3.9803, "step": 4400 }, { "epoch": 0.060485496921960265, "grad_norm": 0.8399825096130371, "learning_rate": 0.003062499999999858, "loss": 3.9609, "step": 4500 }, { "epoch": 0.061829619075781606, "grad_norm": 0.9576703906059265, "learning_rate": 0.0030749999999998565, "loss": 3.9492, "step": 4600 }, { "epoch": 0.06317374122960295, "grad_norm": 0.8874163031578064, "learning_rate": 0.0030874999999998525, "loss": 3.9307, "step": 4700 }, { "epoch": 0.06451786338342429, "grad_norm": 0.8954266309738159, "learning_rate": 0.003099999999999849, "loss": 3.917, "step": 4800 }, { "epoch": 0.06586198553724562, "grad_norm": 1.2326369285583496, "learning_rate": 0.0031124999999998476, "loss": 3.9057, "step": 4900 }, { "epoch": 0.06720610769106697, "grad_norm": 0.9583144783973694, "learning_rate": 0.003124999999999845, "loss": 3.8926, "step": 5000 }, { "epoch": 0.06720610769106697, "eval_MaskedAccuracy": 0.272405235858602, "eval_loss": 4.243067741394043, "eval_runtime": 160.7599, "eval_samples_per_second": 394.85, "eval_steps_per_second": 1.543, "step": 5000 }, { "epoch": 0.0685502298448883, "grad_norm": 1.063835620880127, "learning_rate": 0.003137499999999841, "loss": 3.8825, "step": 5100 }, { "epoch": 0.06989435199870964, "grad_norm": 0.8451062440872192, "learning_rate": 0.003149999999999838, "loss": 3.8725, "step": 5200 }, { "epoch": 0.07123847415253098, "grad_norm": 0.8295220732688904, "learning_rate": 0.003162499999999835, "loss": 3.8622, "step": 5300 }, { "epoch": 0.07258259630635232, "grad_norm": 0.8434109091758728, "learning_rate": 0.003174999999999833, "loss": 3.8418, "step": 5400 }, { "epoch": 0.07392671846017367, "grad_norm": 0.9849230051040649, "learning_rate": 0.0031874999999998307, "loss": 3.8338, "step": 5500 }, { "epoch": 0.075270840613995, "grad_norm": 1.2400304079055786, "learning_rate": 0.003199999999999828, "loss": 3.8234, "step": 5600 }, { "epoch": 0.07661496276781633, "grad_norm": 0.8637450933456421, "learning_rate": 0.0032124999999998288, "loss": 3.8194, "step": 5700 }, { "epoch": 0.07795908492163768, "grad_norm": 0.7561401128768921, "learning_rate": 0.0032249999999998287, "loss": 3.805, "step": 5800 }, { "epoch": 0.07930320707545901, "grad_norm": 0.822270929813385, "learning_rate": 0.003237499999999828, "loss": 3.7957, "step": 5900 }, { "epoch": 0.08064732922928036, "grad_norm": 0.9090524315834045, "learning_rate": 0.0032499999999998216, "loss": 3.7777, "step": 6000 }, { "epoch": 0.08064732922928036, "eval_MaskedAccuracy": 0.2802241909468907, "eval_loss": 4.147753715515137, "eval_runtime": 154.4313, "eval_samples_per_second": 411.031, "eval_steps_per_second": 1.606, "step": 6000 }, { "epoch": 0.0819914513831017, "grad_norm": 1.0061832666397095, "learning_rate": 0.0032624999999998194, "loss": 3.7827, "step": 6100 }, { "epoch": 0.08333557353692303, "grad_norm": 1.0628759860992432, "learning_rate": 0.0032749999999998154, "loss": 3.7667, "step": 6200 }, { "epoch": 0.08467969569074438, "grad_norm": 1.1841171979904175, "learning_rate": 0.003287499999999813, "loss": 3.7672, "step": 6300 }, { "epoch": 0.08602381784456571, "grad_norm": 0.9175704121589661, "learning_rate": 0.003299999999999811, "loss": 3.7556, "step": 6400 }, { "epoch": 0.08736793999838706, "grad_norm": 0.9184996485710144, "learning_rate": 0.0033124999999998082, "loss": 3.7411, "step": 6500 }, { "epoch": 0.08871206215220839, "grad_norm": 0.8513078093528748, "learning_rate": 0.0033249999999998042, "loss": 3.7332, "step": 6600 }, { "epoch": 0.09005618430602973, "grad_norm": 0.9670765399932861, "learning_rate": 0.0033374999999997994, "loss": 3.7274, "step": 6700 }, { "epoch": 0.09140030645985107, "grad_norm": 0.9749046564102173, "learning_rate": 0.0033499999999997932, "loss": 3.7223, "step": 6800 }, { "epoch": 0.09274442861367241, "grad_norm": 0.8485523462295532, "learning_rate": 0.003362499999999787, "loss": 3.7125, "step": 6900 }, { "epoch": 0.09408855076749376, "grad_norm": 0.84412682056427, "learning_rate": 0.0033749999999997853, "loss": 3.71, "step": 7000 }, { "epoch": 0.09408855076749376, "eval_MaskedAccuracy": 0.2868874563852213, "eval_loss": 4.067524433135986, "eval_runtime": 154.2299, "eval_samples_per_second": 411.567, "eval_steps_per_second": 1.608, "step": 7000 }, { "epoch": 0.09543267292131509, "grad_norm": 0.8195430040359497, "learning_rate": 0.0033874999999997804, "loss": 3.708, "step": 7100 }, { "epoch": 0.09677679507513642, "grad_norm": 0.8684709668159485, "learning_rate": 0.0033999999999997713, "loss": 3.7003, "step": 7200 }, { "epoch": 0.09812091722895777, "grad_norm": 0.8237408399581909, "learning_rate": 0.0034124999999997647, "loss": 3.676, "step": 7300 }, { "epoch": 0.0994650393827791, "grad_norm": 0.8418594598770142, "learning_rate": 0.003424999999999764, "loss": 3.6748, "step": 7400 }, { "epoch": 0.10080916153660045, "grad_norm": 0.900364100933075, "learning_rate": 0.0034374999999997606, "loss": 3.6822, "step": 7500 }, { "epoch": 0.10215328369042179, "grad_norm": 0.9300949573516846, "learning_rate": 0.003449999999999758, "loss": 3.6657, "step": 7600 }, { "epoch": 0.10349740584424312, "grad_norm": 0.8330152034759521, "learning_rate": 0.003462499999999754, "loss": 3.6635, "step": 7700 }, { "epoch": 0.10484152799806447, "grad_norm": 0.8555970191955566, "learning_rate": 0.003474999999999749, "loss": 3.6461, "step": 7800 }, { "epoch": 0.1061856501518858, "grad_norm": 0.9717864394187927, "learning_rate": 0.003487499999999749, "loss": 3.6402, "step": 7900 }, { "epoch": 0.10752977230570714, "grad_norm": 1.3664884567260742, "learning_rate": 0.003499999999999745, "loss": 3.6341, "step": 8000 }, { "epoch": 0.10752977230570714, "eval_MaskedAccuracy": 0.2957456999836483, "eval_loss": 3.9853811264038086, "eval_runtime": 154.2803, "eval_samples_per_second": 411.433, "eval_steps_per_second": 1.607, "step": 8000 }, { "epoch": 0.10887389445952848, "grad_norm": 1.9192525148391724, "learning_rate": 0.0035124999999997415, "loss": 3.6234, "step": 8100 }, { "epoch": 0.11021801661334982, "grad_norm": 0.875035285949707, "learning_rate": 0.003524999999999741, "loss": 3.6126, "step": 8200 }, { "epoch": 0.11156213876717117, "grad_norm": 1.4383764266967773, "learning_rate": 0.0035374999999997353, "loss": 3.5893, "step": 8300 }, { "epoch": 0.1129062609209925, "grad_norm": 1.0789637565612793, "learning_rate": 0.0035499999999997313, "loss": 3.5687, "step": 8400 }, { "epoch": 0.11425038307481383, "grad_norm": 1.7937785387039185, "learning_rate": 0.00356249999999973, "loss": 3.5473, "step": 8500 }, { "epoch": 0.11559450522863518, "grad_norm": 1.0099656581878662, "learning_rate": 0.0035749999999997255, "loss": 3.5311, "step": 8600 }, { "epoch": 0.11693862738245651, "grad_norm": 0.876243531703949, "learning_rate": 0.0035874999999997237, "loss": 3.5129, "step": 8700 }, { "epoch": 0.11828274953627786, "grad_norm": 1.4001803398132324, "learning_rate": 0.0035999999999997236, "loss": 3.5052, "step": 8800 }, { "epoch": 0.1196268716900992, "grad_norm": 1.103613018989563, "learning_rate": 0.003612499999999721, "loss": 3.4943, "step": 8900 }, { "epoch": 0.12097099384392053, "grad_norm": 0.8662921190261841, "learning_rate": 0.00362499999999972, "loss": 3.4864, "step": 9000 }, { "epoch": 0.12097099384392053, "eval_MaskedAccuracy": 0.308084678333672, "eval_loss": 3.829771041870117, "eval_runtime": 161.1285, "eval_samples_per_second": 393.946, "eval_steps_per_second": 1.539, "step": 9000 }, { "epoch": 0.12231511599774188, "grad_norm": 0.9641631841659546, "learning_rate": 0.0036374999999997113, "loss": 3.4636, "step": 9100 }, { "epoch": 0.12365923815156321, "grad_norm": 0.8773326277732849, "learning_rate": 0.0036499999999997073, "loss": 3.4694, "step": 9200 }, { "epoch": 0.12500336030538456, "grad_norm": 1.132733941078186, "learning_rate": 0.003662499999999711, "loss": 3.4452, "step": 9300 }, { "epoch": 0.1263474824592059, "grad_norm": 1.1771531105041504, "learning_rate": 0.0036749999999997097, "loss": 3.4436, "step": 9400 }, { "epoch": 0.12769160461302723, "grad_norm": 0.7729461193084717, "learning_rate": 0.0036874999999997097, "loss": 3.4241, "step": 9500 }, { "epoch": 0.12903572676684857, "grad_norm": 1.1024218797683716, "learning_rate": 0.0036999999999997118, "loss": 3.4175, "step": 9600 }, { "epoch": 0.13037984892066992, "grad_norm": 0.7504922747612, "learning_rate": 0.0037124999999997074, "loss": 3.3946, "step": 9700 }, { "epoch": 0.13172397107449124, "grad_norm": 0.7903368473052979, "learning_rate": 0.0037249999999997034, "loss": 3.3848, "step": 9800 }, { "epoch": 0.1330680932283126, "grad_norm": 0.8163586258888245, "learning_rate": 0.0037374999999996972, "loss": 3.3669, "step": 9900 }, { "epoch": 0.13441221538213394, "grad_norm": 0.839239776134491, "learning_rate": 0.003749999999999697, "loss": 3.3447, "step": 10000 }, { "epoch": 0.13441221538213394, "eval_MaskedAccuracy": 0.32656299271612027, "eval_loss": 3.7040650844573975, "eval_runtime": 161.5088, "eval_samples_per_second": 393.019, "eval_steps_per_second": 1.536, "step": 10000 }, { "epoch": 0.13575633753595526, "grad_norm": 1.061814546585083, "learning_rate": 0.003762499999999692, "loss": 3.3364, "step": 10100 }, { "epoch": 0.1371004596897766, "grad_norm": 0.7809873223304749, "learning_rate": 0.0037749999999996914, "loss": 3.3223, "step": 10200 }, { "epoch": 0.13844458184359795, "grad_norm": 0.8472948670387268, "learning_rate": 0.003787499999999695, "loss": 3.3116, "step": 10300 }, { "epoch": 0.13978870399741927, "grad_norm": 0.7124361991882324, "learning_rate": 0.0037999999999996873, "loss": 3.2889, "step": 10400 }, { "epoch": 0.14113282615124062, "grad_norm": 1.7072255611419678, "learning_rate": 0.003812499999999687, "loss": 3.2982, "step": 10500 }, { "epoch": 0.14247694830506197, "grad_norm": 2.472754716873169, "learning_rate": 0.0038249999999996833, "loss": 3.2711, "step": 10600 }, { "epoch": 0.14382107045888332, "grad_norm": 0.9063215255737305, "learning_rate": 0.0038374999999996823, "loss": 3.2667, "step": 10700 }, { "epoch": 0.14516519261270464, "grad_norm": 0.8283532857894897, "learning_rate": 0.0038499999999996775, "loss": 3.2545, "step": 10800 }, { "epoch": 0.14650931476652598, "grad_norm": 0.8591430187225342, "learning_rate": 0.0038624999999996735, "loss": 3.248, "step": 10900 }, { "epoch": 0.14785343692034733, "grad_norm": 0.971303403377533, "learning_rate": 0.0038749999999996712, "loss": 3.2447, "step": 11000 }, { "epoch": 0.14785343692034733, "eval_MaskedAccuracy": 0.34290362452711665, "eval_loss": 3.601362705230713, "eval_runtime": 162.0306, "eval_samples_per_second": 391.753, "eval_steps_per_second": 1.531, "step": 11000 }, { "epoch": 0.14919755907416865, "grad_norm": 1.0778330564498901, "learning_rate": 0.0038874999999996686, "loss": 3.2298, "step": 11100 }, { "epoch": 0.15054168122799, "grad_norm": 0.7707659602165222, "learning_rate": 0.003899999999999665, "loss": 3.2199, "step": 11200 }, { "epoch": 0.15188580338181135, "grad_norm": 1.2470498085021973, "learning_rate": 0.003912499999999662, "loss": 3.2137, "step": 11300 }, { "epoch": 0.15322992553563267, "grad_norm": 0.8463084101676941, "learning_rate": 0.003924999999999659, "loss": 3.2123, "step": 11400 }, { "epoch": 0.154574047689454, "grad_norm": 0.8666761517524719, "learning_rate": 0.0039374999999996566, "loss": 3.1978, "step": 11500 }, { "epoch": 0.15591816984327536, "grad_norm": 0.899533748626709, "learning_rate": 0.0039499999999996595, "loss": 3.1932, "step": 11600 }, { "epoch": 0.1572622919970967, "grad_norm": 0.8859838843345642, "learning_rate": 0.00396249999999966, "loss": 3.1799, "step": 11700 }, { "epoch": 0.15860641415091803, "grad_norm": 1.3646000623703003, "learning_rate": 0.003974999999999655, "loss": 3.1739, "step": 11800 }, { "epoch": 0.15995053630473938, "grad_norm": 0.7711676359176636, "learning_rate": 0.003987499999999656, "loss": 3.1746, "step": 11900 }, { "epoch": 0.16129465845856072, "grad_norm": 0.9850155711174011, "learning_rate": 0.003999999999999657, "loss": 3.1749, "step": 12000 }, { "epoch": 0.16129465845856072, "eval_MaskedAccuracy": 0.3521379038689249, "eval_loss": 3.5329132080078125, "eval_runtime": 154.2202, "eval_samples_per_second": 411.593, "eval_steps_per_second": 1.608, "step": 12000 }, { "epoch": 0.16263878061238204, "grad_norm": 2.1732335090637207, "learning_rate": 0.004012499999999649, "loss": 3.1557, "step": 12100 }, { "epoch": 0.1639829027662034, "grad_norm": 0.6864789724349976, "learning_rate": 0.00402499999999965, "loss": 3.1457, "step": 12200 }, { "epoch": 0.16532702492002474, "grad_norm": 0.7963569760322571, "learning_rate": 0.004037499999999652, "loss": 3.1462, "step": 12300 }, { "epoch": 0.16667114707384606, "grad_norm": 0.7270441651344299, "learning_rate": 0.00404999999999965, "loss": 3.1424, "step": 12400 }, { "epoch": 0.1680152692276674, "grad_norm": 1.0645862817764282, "learning_rate": 0.004062499999999646, "loss": 3.1301, "step": 12500 }, { "epoch": 0.16935939138148876, "grad_norm": 0.7641903758049011, "learning_rate": 0.004074999999999641, "loss": 3.1309, "step": 12600 }, { "epoch": 0.17070351353531008, "grad_norm": 1.2100406885147095, "learning_rate": 0.004087499999999641, "loss": 3.1348, "step": 12700 }, { "epoch": 0.17204763568913142, "grad_norm": 0.7301545143127441, "learning_rate": 0.004099999999999645, "loss": 3.1175, "step": 12800 }, { "epoch": 0.17339175784295277, "grad_norm": 1.3868557214736938, "learning_rate": 0.004112499999999646, "loss": 3.1196, "step": 12900 }, { "epoch": 0.17473587999677412, "grad_norm": 1.3212717771530151, "learning_rate": 0.004124999999999645, "loss": 3.1098, "step": 13000 }, { "epoch": 0.17473587999677412, "eval_MaskedAccuracy": 0.35983040074292133, "eval_loss": 3.4708173274993896, "eval_runtime": 161.2353, "eval_samples_per_second": 393.685, "eval_steps_per_second": 1.538, "step": 13000 }, { "epoch": 0.17608000215059544, "grad_norm": 0.6421147584915161, "learning_rate": 0.0041374999999996415, "loss": 3.1074, "step": 13100 }, { "epoch": 0.17742412430441679, "grad_norm": 1.0816092491149902, "learning_rate": 0.004149999999999641, "loss": 3.0902, "step": 13200 }, { "epoch": 0.17876824645823813, "grad_norm": 0.8088307976722717, "learning_rate": 0.004162499999999636, "loss": 3.0973, "step": 13300 }, { "epoch": 0.18011236861205945, "grad_norm": 0.9084308743476868, "learning_rate": 0.004174999999999633, "loss": 3.0835, "step": 13400 }, { "epoch": 0.1814564907658808, "grad_norm": 1.0027776956558228, "learning_rate": 0.004187499999999628, "loss": 3.0876, "step": 13500 }, { "epoch": 0.18280061291970215, "grad_norm": 1.0761163234710693, "learning_rate": 0.004199999999999622, "loss": 3.0867, "step": 13600 }, { "epoch": 0.18414473507352347, "grad_norm": 0.7274723052978516, "learning_rate": 0.004212499999999617, "loss": 3.0793, "step": 13700 }, { "epoch": 0.18548885722734482, "grad_norm": 0.9493256211280823, "learning_rate": 0.004224999999999617, "loss": 3.0753, "step": 13800 }, { "epoch": 0.18683297938116616, "grad_norm": 1.288206696510315, "learning_rate": 0.004237499999999612, "loss": 3.0625, "step": 13900 }, { "epoch": 0.1881771015349875, "grad_norm": 0.8344607353210449, "learning_rate": 0.004249999999999605, "loss": 3.0665, "step": 14000 }, { "epoch": 0.1881771015349875, "eval_MaskedAccuracy": 0.36507595159603434, "eval_loss": 3.4270057678222656, "eval_runtime": 161.5547, "eval_samples_per_second": 392.907, "eval_steps_per_second": 1.535, "step": 14000 }, { "epoch": 0.18952122368880883, "grad_norm": 0.7024100422859192, "learning_rate": 0.004262499999999602, "loss": 3.0688, "step": 14100 }, { "epoch": 0.19086534584263018, "grad_norm": 0.7407594323158264, "learning_rate": 0.004274999999999597, "loss": 3.0539, "step": 14200 }, { "epoch": 0.19220946799645153, "grad_norm": 0.8240285515785217, "learning_rate": 0.004287499999999591, "loss": 3.0552, "step": 14300 }, { "epoch": 0.19355359015027285, "grad_norm": 0.7373182773590088, "learning_rate": 0.004299999999999582, "loss": 3.0529, "step": 14400 }, { "epoch": 0.1948977123040942, "grad_norm": 0.6955730319023132, "learning_rate": 0.004312499999999577, "loss": 3.0478, "step": 14500 }, { "epoch": 0.19624183445791554, "grad_norm": 1.1683614253997803, "learning_rate": 0.004324999999999574, "loss": 3.0314, "step": 14600 }, { "epoch": 0.19758595661173686, "grad_norm": 1.4355748891830444, "learning_rate": 0.004337499999999567, "loss": 3.0478, "step": 14700 }, { "epoch": 0.1989300787655582, "grad_norm": 0.7114657163619995, "learning_rate": 0.004349999999999561, "loss": 3.0324, "step": 14800 }, { "epoch": 0.20027420091937956, "grad_norm": 1.0332584381103516, "learning_rate": 0.004362499999999559, "loss": 3.0353, "step": 14900 }, { "epoch": 0.2016183230732009, "grad_norm": 0.6959288716316223, "learning_rate": 0.0043749999999995555, "loss": 3.0278, "step": 15000 }, { "epoch": 0.2016183230732009, "eval_MaskedAccuracy": 0.37069473714124845, "eval_loss": 3.379894256591797, "eval_runtime": 161.4047, "eval_samples_per_second": 393.272, "eval_steps_per_second": 1.537, "step": 15000 }, { "epoch": 0.20296244522702223, "grad_norm": 1.1070080995559692, "learning_rate": 0.004387499999999551, "loss": 3.0205, "step": 15100 }, { "epoch": 0.20430656738084357, "grad_norm": 0.8838359117507935, "learning_rate": 0.004399999999999546, "loss": 3.0299, "step": 15200 }, { "epoch": 0.20565068953466492, "grad_norm": 0.8130790591239929, "learning_rate": 0.004412499999999543, "loss": 3.0231, "step": 15300 }, { "epoch": 0.20699481168848624, "grad_norm": 0.6710375547409058, "learning_rate": 0.004424999999999538, "loss": 3.0161, "step": 15400 }, { "epoch": 0.2083389338423076, "grad_norm": 0.7571612000465393, "learning_rate": 0.004437499999999531, "loss": 3.0166, "step": 15500 }, { "epoch": 0.20968305599612894, "grad_norm": 0.658545196056366, "learning_rate": 0.004449999999999525, "loss": 3.0011, "step": 15600 }, { "epoch": 0.21102717814995026, "grad_norm": 0.7161542177200317, "learning_rate": 0.004462499999999522, "loss": 3.0085, "step": 15700 }, { "epoch": 0.2123713003037716, "grad_norm": 0.7213286757469177, "learning_rate": 0.004474999999999518, "loss": 3.0037, "step": 15800 }, { "epoch": 0.21371542245759295, "grad_norm": 1.3048126697540283, "learning_rate": 0.004487499999999511, "loss": 2.9974, "step": 15900 }, { "epoch": 0.21505954461141427, "grad_norm": 0.999495804309845, "learning_rate": 0.00449999999999951, "loss": 2.9973, "step": 16000 }, { "epoch": 0.21505954461141427, "eval_MaskedAccuracy": 0.3738960824804252, "eval_loss": 3.3454906940460205, "eval_runtime": 157.1677, "eval_samples_per_second": 403.874, "eval_steps_per_second": 1.578, "step": 16000 }, { "epoch": 0.21640366676523562, "grad_norm": 2.9935622215270996, "learning_rate": 0.004512499999999503, "loss": 3.0009, "step": 16100 }, { "epoch": 0.21774778891905697, "grad_norm": 1.100763201713562, "learning_rate": 0.004524999999999504, "loss": 2.9812, "step": 16200 }, { "epoch": 0.21909191107287831, "grad_norm": 0.7473815083503723, "learning_rate": 0.004537499999999495, "loss": 2.9833, "step": 16300 }, { "epoch": 0.22043603322669963, "grad_norm": 0.8163867592811584, "learning_rate": 0.004549999999999481, "loss": 2.9867, "step": 16400 }, { "epoch": 0.22178015538052098, "grad_norm": 4.292899131774902, "learning_rate": 0.0045624999999994715, "loss": 2.982, "step": 16500 }, { "epoch": 0.22312427753434233, "grad_norm": 9.086414337158203, "learning_rate": 0.0045749999999994675, "loss": 2.9867, "step": 16600 }, { "epoch": 0.22446839968816365, "grad_norm": 0.6632740497589111, "learning_rate": 0.004587499999999466, "loss": 2.972, "step": 16700 }, { "epoch": 0.225812521841985, "grad_norm": 3.345919132232666, "learning_rate": 0.004599999999999462, "loss": 2.972, "step": 16800 }, { "epoch": 0.22715664399580635, "grad_norm": 0.6550186276435852, "learning_rate": 0.0046124999999994564, "loss": 2.9672, "step": 16900 }, { "epoch": 0.22850076614962767, "grad_norm": 0.6624706983566284, "learning_rate": 0.004624999999999449, "loss": 2.9731, "step": 17000 }, { "epoch": 0.22850076614962767, "eval_MaskedAccuracy": 0.3775072955122549, "eval_loss": 3.3128609657287598, "eval_runtime": 154.9603, "eval_samples_per_second": 409.628, "eval_steps_per_second": 1.6, "step": 17000 }, { "epoch": 0.229844888303449, "grad_norm": 1.0394245386123657, "learning_rate": 0.004637499999999442, "loss": 2.9749, "step": 17100 }, { "epoch": 0.23118901045727036, "grad_norm": 0.6592782139778137, "learning_rate": 0.00464999999999944, "loss": 2.967, "step": 17200 }, { "epoch": 0.2325331326110917, "grad_norm": 0.923537015914917, "learning_rate": 0.00466249999999944, "loss": 2.9607, "step": 17300 }, { "epoch": 0.23387725476491303, "grad_norm": 0.878057599067688, "learning_rate": 0.004674999999999435, "loss": 2.9609, "step": 17400 }, { "epoch": 0.23522137691873438, "grad_norm": 1.1436913013458252, "learning_rate": 0.004687499999999431, "loss": 2.9521, "step": 17500 }, { "epoch": 0.23656549907255572, "grad_norm": 0.8196259140968323, "learning_rate": 0.004699999999999424, "loss": 2.9523, "step": 17600 }, { "epoch": 0.23790962122637704, "grad_norm": 0.625901460647583, "learning_rate": 0.004712499999999414, "loss": 2.955, "step": 17700 }, { "epoch": 0.2392537433801984, "grad_norm": 0.7023401856422424, "learning_rate": 0.004724999999999409, "loss": 2.949, "step": 17800 }, { "epoch": 0.24059786553401974, "grad_norm": 0.7306541204452515, "learning_rate": 0.0047374999999993984, "loss": 2.9478, "step": 17900 }, { "epoch": 0.24194198768784106, "grad_norm": 0.6469770073890686, "learning_rate": 0.004749999999999393, "loss": 2.9395, "step": 18000 }, { "epoch": 0.24194198768784106, "eval_MaskedAccuracy": 0.3805818810631038, "eval_loss": 3.2859790325164795, "eval_runtime": 159.4762, "eval_samples_per_second": 398.028, "eval_steps_per_second": 1.555, "step": 18000 }, { "epoch": 0.2432861098416624, "grad_norm": 0.6748953461647034, "learning_rate": 0.004762499999999384, "loss": 2.9299, "step": 18100 }, { "epoch": 0.24463023199548375, "grad_norm": 1.1163654327392578, "learning_rate": 0.004774999999999384, "loss": 2.9415, "step": 18200 }, { "epoch": 0.2459743541493051, "grad_norm": 1.0123517513275146, "learning_rate": 0.00478749999999938, "loss": 2.9301, "step": 18300 }, { "epoch": 0.24731847630312642, "grad_norm": 3.0257534980773926, "learning_rate": 0.004799999999999372, "loss": 2.9342, "step": 18400 }, { "epoch": 0.24866259845694777, "grad_norm": 1.3270440101623535, "learning_rate": 0.004812499999999362, "loss": 2.9325, "step": 18500 }, { "epoch": 0.2500067206107691, "grad_norm": 1.0727241039276123, "learning_rate": 0.004824999999999358, "loss": 2.9221, "step": 18600 }, { "epoch": 0.25135084276459047, "grad_norm": 0.7316584587097168, "learning_rate": 0.004837499999999354, "loss": 2.9235, "step": 18700 }, { "epoch": 0.2526949649184118, "grad_norm": 2.4931344985961914, "learning_rate": 0.00484999999999935, "loss": 2.9264, "step": 18800 }, { "epoch": 0.2540390870722331, "grad_norm": 2.006314277648926, "learning_rate": 0.004862499999999345, "loss": 2.9229, "step": 18900 }, { "epoch": 0.25538320922605445, "grad_norm": 1.1321533918380737, "learning_rate": 0.004874999999999335, "loss": 2.917, "step": 19000 }, { "epoch": 0.25538320922605445, "eval_MaskedAccuracy": 0.38268668848805826, "eval_loss": 3.2650303840637207, "eval_runtime": 154.9489, "eval_samples_per_second": 409.658, "eval_steps_per_second": 1.601, "step": 19000 }, { "epoch": 0.2567273313798758, "grad_norm": 0.637344241142273, "learning_rate": 0.004887499999999324, "loss": 2.9099, "step": 19100 }, { "epoch": 0.25807145353369715, "grad_norm": 3.686405897140503, "learning_rate": 0.004899999999999312, "loss": 2.9115, "step": 19200 }, { "epoch": 0.2594155756875185, "grad_norm": 0.734024703502655, "learning_rate": 0.0049124999999993054, "loss": 2.9133, "step": 19300 }, { "epoch": 0.26075969784133984, "grad_norm": 0.9141765236854553, "learning_rate": 0.004924999999999296, "loss": 2.9188, "step": 19400 }, { "epoch": 0.26210381999516114, "grad_norm": 0.9035413265228271, "learning_rate": 0.004937499999999295, "loss": 2.9223, "step": 19500 }, { "epoch": 0.2634479421489825, "grad_norm": 0.9160522222518921, "learning_rate": 0.004949999999999293, "loss": 2.9057, "step": 19600 }, { "epoch": 0.26479206430280383, "grad_norm": 0.6431342363357544, "learning_rate": 0.004962499999999282, "loss": 2.8985, "step": 19700 }, { "epoch": 0.2661361864566252, "grad_norm": 0.6499119997024536, "learning_rate": 0.004974999999999273, "loss": 2.8946, "step": 19800 }, { "epoch": 0.2674803086104465, "grad_norm": 0.7949116826057434, "learning_rate": 0.004987499999999261, "loss": 2.9091, "step": 19900 }, { "epoch": 0.2688244307642679, "grad_norm": 2.123286008834839, "learning_rate": 0.005, "loss": 2.8978, "step": 20000 }, { "epoch": 0.2688244307642679, "eval_MaskedAccuracy": 0.3850105547815323, "eval_loss": 3.2462847232818604, "eval_runtime": 158.4819, "eval_samples_per_second": 400.525, "eval_steps_per_second": 1.565, "step": 20000 }, { "epoch": 0.2701685529180892, "grad_norm": 0.6592645645141602, "learning_rate": 0.004999999943789577, "loss": 2.896, "step": 20100 }, { "epoch": 0.2715126750719105, "grad_norm": 1.3373818397521973, "learning_rate": 0.0049999997751583045, "loss": 2.8877, "step": 20200 }, { "epoch": 0.27285679722573186, "grad_norm": 0.5964152812957764, "learning_rate": 0.004999999494106201, "loss": 2.8969, "step": 20300 }, { "epoch": 0.2742009193795532, "grad_norm": 1.4561213254928589, "learning_rate": 0.004999999100633266, "loss": 2.8942, "step": 20400 }, { "epoch": 0.27554504153337456, "grad_norm": 1.0474812984466553, "learning_rate": 0.004999998594739517, "loss": 2.9026, "step": 20500 }, { "epoch": 0.2768891636871959, "grad_norm": 0.5717761516571045, "learning_rate": 0.004999997976424982, "loss": 2.8946, "step": 20600 }, { "epoch": 0.27823328584101725, "grad_norm": 1.6426117420196533, "learning_rate": 0.004999997245689687, "loss": 2.881, "step": 20700 }, { "epoch": 0.27957740799483854, "grad_norm": 2.5096325874328613, "learning_rate": 0.004999996402533671, "loss": 2.8809, "step": 20800 }, { "epoch": 0.2809215301486599, "grad_norm": 1.026366949081421, "learning_rate": 0.0049999954469569655, "loss": 2.8773, "step": 20900 }, { "epoch": 0.28226565230248124, "grad_norm": 1.2996127605438232, "learning_rate": 0.0049999943789596135, "loss": 2.871, "step": 21000 }, { "epoch": 0.28226565230248124, "eval_MaskedAccuracy": 0.3880521470318861, "eval_loss": 3.224468469619751, "eval_runtime": 161.317, "eval_samples_per_second": 393.486, "eval_steps_per_second": 1.537, "step": 21000 }, { "epoch": 0.2836097744563026, "grad_norm": 0.6892908215522766, "learning_rate": 0.004999993198541667, "loss": 2.8798, "step": 21100 }, { "epoch": 0.28495389661012394, "grad_norm": 0.8045306205749512, "learning_rate": 0.00499999190570318, "loss": 2.8639, "step": 21200 }, { "epoch": 0.2862980187639453, "grad_norm": 1.282244324684143, "learning_rate": 0.004999990500444207, "loss": 2.8754, "step": 21300 }, { "epoch": 0.28764214091776663, "grad_norm": 0.9008962512016296, "learning_rate": 0.00499998898276481, "loss": 2.8698, "step": 21400 }, { "epoch": 0.2889862630715879, "grad_norm": 0.8007209897041321, "learning_rate": 0.00499998735266506, "loss": 2.8647, "step": 21500 }, { "epoch": 0.29033038522540927, "grad_norm": 1.014004111289978, "learning_rate": 0.004999985610145036, "loss": 2.8576, "step": 21600 }, { "epoch": 0.2916745073792306, "grad_norm": 0.891608476638794, "learning_rate": 0.0049999837552048105, "loss": 2.8551, "step": 21700 }, { "epoch": 0.29301862953305197, "grad_norm": 1.4745222330093384, "learning_rate": 0.004999981787844464, "loss": 2.8651, "step": 21800 }, { "epoch": 0.2943627516868733, "grad_norm": 1.727281928062439, "learning_rate": 0.004999979708064089, "loss": 2.8577, "step": 21900 }, { "epoch": 0.29570687384069466, "grad_norm": 0.5889946818351746, "learning_rate": 0.0049999775158637745, "loss": 2.8501, "step": 22000 }, { "epoch": 0.29570687384069466, "eval_MaskedAccuracy": 0.3906472199993126, "eval_loss": 3.1981027126312256, "eval_runtime": 157.5475, "eval_samples_per_second": 402.901, "eval_steps_per_second": 1.574, "step": 22000 }, { "epoch": 0.29705099599451595, "grad_norm": 1.3106350898742676, "learning_rate": 0.00499997521124363, "loss": 2.8498, "step": 22100 }, { "epoch": 0.2983951181483373, "grad_norm": 0.670660674571991, "learning_rate": 0.0049999727942037515, "loss": 2.8471, "step": 22200 }, { "epoch": 0.29973924030215865, "grad_norm": 0.6043007969856262, "learning_rate": 0.004999970264744249, "loss": 2.8529, "step": 22300 }, { "epoch": 0.30108336245598, "grad_norm": 0.654592752456665, "learning_rate": 0.004999967622865233, "loss": 2.855, "step": 22400 }, { "epoch": 0.30242748460980134, "grad_norm": 0.7757537364959717, "learning_rate": 0.004999964868566834, "loss": 2.8442, "step": 22500 }, { "epoch": 0.3037716067636227, "grad_norm": 0.6157627105712891, "learning_rate": 0.004999962001849164, "loss": 2.8504, "step": 22600 }, { "epoch": 0.30511572891744404, "grad_norm": 0.5535680055618286, "learning_rate": 0.004999959022712354, "loss": 2.8465, "step": 22700 }, { "epoch": 0.30645985107126533, "grad_norm": 1.0820826292037964, "learning_rate": 0.0049999559311565385, "loss": 2.8407, "step": 22800 }, { "epoch": 0.3078039732250867, "grad_norm": 0.898114800453186, "learning_rate": 0.004999952727181866, "loss": 2.8461, "step": 22900 }, { "epoch": 0.309148095378908, "grad_norm": 0.787267804145813, "learning_rate": 0.0049999494107884745, "loss": 2.8352, "step": 23000 }, { "epoch": 0.309148095378908, "eval_MaskedAccuracy": 0.3932447713059814, "eval_loss": 3.172161340713501, "eval_runtime": 157.8175, "eval_samples_per_second": 402.212, "eval_steps_per_second": 1.571, "step": 23000 }, { "epoch": 0.3104922175327294, "grad_norm": 0.5774521827697754, "learning_rate": 0.004999945981976511, "loss": 2.8263, "step": 23100 }, { "epoch": 0.3118363396865507, "grad_norm": 0.8543257117271423, "learning_rate": 0.004999942440746131, "loss": 2.8307, "step": 23200 }, { "epoch": 0.31318046184037207, "grad_norm": 2.0594680309295654, "learning_rate": 0.004999938787097496, "loss": 2.8367, "step": 23300 }, { "epoch": 0.3145245839941934, "grad_norm": 0.874093770980835, "learning_rate": 0.00499993502103077, "loss": 2.8386, "step": 23400 }, { "epoch": 0.3158687061480147, "grad_norm": 0.5475397706031799, "learning_rate": 0.004999931142546117, "loss": 2.8264, "step": 23500 }, { "epoch": 0.31721282830183606, "grad_norm": 0.8593809008598328, "learning_rate": 0.004999927151643723, "loss": 2.829, "step": 23600 }, { "epoch": 0.3185569504556574, "grad_norm": 0.8034321665763855, "learning_rate": 0.0049999230483237615, "loss": 2.8212, "step": 23700 }, { "epoch": 0.31990107260947875, "grad_norm": 0.6283460855484009, "learning_rate": 0.004999918832586416, "loss": 2.8213, "step": 23800 }, { "epoch": 0.3212451947633001, "grad_norm": 0.7373483180999756, "learning_rate": 0.004999914504431884, "loss": 2.8307, "step": 23900 }, { "epoch": 0.32258931691712145, "grad_norm": 0.5492293834686279, "learning_rate": 0.004999910063860352, "loss": 2.8248, "step": 24000 }, { "epoch": 0.32258931691712145, "eval_MaskedAccuracy": 0.39530929398224096, "eval_loss": 3.154477596282959, "eval_runtime": 154.5504, "eval_samples_per_second": 410.714, "eval_steps_per_second": 1.605, "step": 24000 }, { "epoch": 0.32393343907094274, "grad_norm": 0.599720299243927, "learning_rate": 0.004999905510872023, "loss": 2.8143, "step": 24100 }, { "epoch": 0.3252775612247641, "grad_norm": 1.1359796524047852, "learning_rate": 0.004999900845467094, "loss": 2.8142, "step": 24200 }, { "epoch": 0.32662168337858544, "grad_norm": 1.4440075159072876, "learning_rate": 0.004999896067645785, "loss": 2.8057, "step": 24300 }, { "epoch": 0.3279658055324068, "grad_norm": 1.1459801197052002, "learning_rate": 0.004999891177408309, "loss": 2.8099, "step": 24400 }, { "epoch": 0.32930992768622813, "grad_norm": 1.4969666004180908, "learning_rate": 0.004999886174754891, "loss": 2.8065, "step": 24500 }, { "epoch": 0.3306540498400495, "grad_norm": 5.83732271194458, "learning_rate": 0.004999881059685752, "loss": 2.8102, "step": 24600 }, { "epoch": 0.3319981719938708, "grad_norm": 1.80928373336792, "learning_rate": 0.004999875832201121, "loss": 2.8057, "step": 24700 }, { "epoch": 0.3333422941476921, "grad_norm": 0.7706397771835327, "learning_rate": 0.004999870492301236, "loss": 2.8147, "step": 24800 }, { "epoch": 0.33468641630151347, "grad_norm": 1.311485767364502, "learning_rate": 0.0049998650399863355, "loss": 2.8115, "step": 24900 }, { "epoch": 0.3360305384553348, "grad_norm": 2.0457215309143066, "learning_rate": 0.004999859475256669, "loss": 2.8031, "step": 25000 }, { "epoch": 0.3360305384553348, "eval_MaskedAccuracy": 0.39694718273561536, "eval_loss": 3.1437790393829346, "eval_runtime": 153.8578, "eval_samples_per_second": 412.563, "eval_steps_per_second": 1.612, "step": 25000 }, { "epoch": 0.33737466060915616, "grad_norm": 1.1708168983459473, "learning_rate": 0.004999853798112488, "loss": 2.8014, "step": 25100 }, { "epoch": 0.3387187827629775, "grad_norm": 0.6717488169670105, "learning_rate": 0.004999848008554047, "loss": 2.8061, "step": 25200 }, { "epoch": 0.34006290491679886, "grad_norm": 1.743415355682373, "learning_rate": 0.004999842106581601, "loss": 2.804, "step": 25300 }, { "epoch": 0.34140702707062015, "grad_norm": 1.1857333183288574, "learning_rate": 0.004999836092195418, "loss": 2.8049, "step": 25400 }, { "epoch": 0.3427511492244415, "grad_norm": 0.7652652263641357, "learning_rate": 0.004999829965395771, "loss": 2.7976, "step": 25500 }, { "epoch": 0.34409527137826285, "grad_norm": 0.8573779463768005, "learning_rate": 0.004999823726182935, "loss": 2.7952, "step": 25600 }, { "epoch": 0.3454393935320842, "grad_norm": 1.1807929277420044, "learning_rate": 0.004999817374557191, "loss": 2.8, "step": 25700 }, { "epoch": 0.34678351568590554, "grad_norm": 0.968813419342041, "learning_rate": 0.004999810910518828, "loss": 2.7862, "step": 25800 }, { "epoch": 0.3481276378397269, "grad_norm": 1.2410879135131836, "learning_rate": 0.004999804334068139, "loss": 2.792, "step": 25900 }, { "epoch": 0.34947175999354824, "grad_norm": 1.0421373844146729, "learning_rate": 0.004999797645205414, "loss": 2.7928, "step": 26000 }, { "epoch": 0.34947175999354824, "eval_MaskedAccuracy": 0.40042939663715493, "eval_loss": 3.116644859313965, "eval_runtime": 159.2355, "eval_samples_per_second": 398.63, "eval_steps_per_second": 1.557, "step": 26000 }, { "epoch": 0.35081588214736953, "grad_norm": 1.1622250080108643, "learning_rate": 0.004999790843930961, "loss": 2.7914, "step": 26100 }, { "epoch": 0.3521600043011909, "grad_norm": 0.9314205050468445, "learning_rate": 0.004999783930245077, "loss": 2.7895, "step": 26200 }, { "epoch": 0.3535041264550122, "grad_norm": 2.6555447578430176, "learning_rate": 0.004999776904148078, "loss": 2.7872, "step": 26300 }, { "epoch": 0.35484824860883357, "grad_norm": 1.0436155796051025, "learning_rate": 0.004999769765640281, "loss": 2.7887, "step": 26400 }, { "epoch": 0.3561923707626549, "grad_norm": 0.901914656162262, "learning_rate": 0.004999762514722008, "loss": 2.788, "step": 26500 }, { "epoch": 0.35753649291647627, "grad_norm": 1.7726536989212036, "learning_rate": 0.004999755151393587, "loss": 2.7863, "step": 26600 }, { "epoch": 0.3588806150702976, "grad_norm": 0.90220046043396, "learning_rate": 0.004999747675655358, "loss": 2.7789, "step": 26700 }, { "epoch": 0.3602247372241189, "grad_norm": 1.074455976486206, "learning_rate": 0.004999740087507646, "loss": 2.7786, "step": 26800 }, { "epoch": 0.36156885937794025, "grad_norm": 1.2537977695465088, "learning_rate": 0.004999732386950793, "loss": 2.7792, "step": 26900 }, { "epoch": 0.3629129815317616, "grad_norm": 1.4169613122940063, "learning_rate": 0.004999724573985147, "loss": 2.7773, "step": 27000 }, { "epoch": 0.3629129815317616, "eval_MaskedAccuracy": 0.4007810600247028, "eval_loss": 3.110098361968994, "eval_runtime": 157.997, "eval_samples_per_second": 401.754, "eval_steps_per_second": 1.57, "step": 27000 }, { "epoch": 0.36425710368558295, "grad_norm": 0.6324722170829773, "learning_rate": 0.004999716648611063, "loss": 2.7757, "step": 27100 }, { "epoch": 0.3656012258394043, "grad_norm": 5.492055416107178, "learning_rate": 0.004999708610828894, "loss": 2.7697, "step": 27200 }, { "epoch": 0.36694534799322565, "grad_norm": 0.8834023475646973, "learning_rate": 0.004999700460639006, "loss": 2.7714, "step": 27300 }, { "epoch": 0.36828947014704694, "grad_norm": 0.8748111724853516, "learning_rate": 0.004999692198041765, "loss": 2.7794, "step": 27400 }, { "epoch": 0.3696335923008683, "grad_norm": 1.2332714796066284, "learning_rate": 0.004999683823037545, "loss": 2.7656, "step": 27500 }, { "epoch": 0.37097771445468963, "grad_norm": 0.7697482705116272, "learning_rate": 0.0049996753356267255, "loss": 2.7677, "step": 27600 }, { "epoch": 0.372321836608511, "grad_norm": 0.9096680283546448, "learning_rate": 0.004999666735809681, "loss": 2.7641, "step": 27700 }, { "epoch": 0.37366595876233233, "grad_norm": 0.5890783667564392, "learning_rate": 0.004999658023586803, "loss": 2.7621, "step": 27800 }, { "epoch": 0.3750100809161537, "grad_norm": 0.7764729261398315, "learning_rate": 0.0049996491989584835, "loss": 2.7697, "step": 27900 }, { "epoch": 0.376354203069975, "grad_norm": 2.1192266941070557, "learning_rate": 0.004999640261925122, "loss": 2.7622, "step": 28000 }, { "epoch": 0.376354203069975, "eval_MaskedAccuracy": 0.4028772828245879, "eval_loss": 3.092843532562256, "eval_runtime": 159.6753, "eval_samples_per_second": 397.532, "eval_steps_per_second": 1.553, "step": 28000 }, { "epoch": 0.3776983252237963, "grad_norm": 1.4101366996765137, "learning_rate": 0.00499963121248712, "loss": 2.7642, "step": 28100 }, { "epoch": 0.37904244737761766, "grad_norm": 1.5181620121002197, "learning_rate": 0.0049996220506448846, "loss": 2.7512, "step": 28200 }, { "epoch": 0.380386569531439, "grad_norm": 1.365559697151184, "learning_rate": 0.004999612776398833, "loss": 2.7688, "step": 28300 }, { "epoch": 0.38173069168526036, "grad_norm": 1.040536880493164, "learning_rate": 0.0049996033897493795, "loss": 2.7537, "step": 28400 }, { "epoch": 0.3830748138390817, "grad_norm": 1.0811822414398193, "learning_rate": 0.004999593890696948, "loss": 2.7639, "step": 28500 }, { "epoch": 0.38441893599290305, "grad_norm": 1.0675724744796753, "learning_rate": 0.004999584279241965, "loss": 2.7556, "step": 28600 }, { "epoch": 0.38576305814672435, "grad_norm": 0.6620686650276184, "learning_rate": 0.0049995745553848645, "loss": 2.7583, "step": 28700 }, { "epoch": 0.3871071803005457, "grad_norm": 1.1735743284225464, "learning_rate": 0.004999564719126081, "loss": 2.7482, "step": 28800 }, { "epoch": 0.38845130245436704, "grad_norm": 0.5812009572982788, "learning_rate": 0.004999554770466058, "loss": 2.7443, "step": 28900 }, { "epoch": 0.3897954246081884, "grad_norm": 1.6214362382888794, "learning_rate": 0.0049995447094052506, "loss": 2.7488, "step": 29000 }, { "epoch": 0.3897954246081884, "eval_MaskedAccuracy": 0.403793286520234, "eval_loss": 3.0827648639678955, "eval_runtime": 160.6528, "eval_samples_per_second": 395.113, "eval_steps_per_second": 1.544, "step": 29000 }, { "epoch": 0.39113954676200974, "grad_norm": 1.382157564163208, "learning_rate": 0.004999534535944106, "loss": 2.753, "step": 29100 }, { "epoch": 0.3924836689158311, "grad_norm": 1.1490193605422974, "learning_rate": 0.00499952425008309, "loss": 2.7411, "step": 29200 }, { "epoch": 0.39382779106965243, "grad_norm": 0.5846394300460815, "learning_rate": 0.00499951385182266, "loss": 2.7502, "step": 29300 }, { "epoch": 0.3951719132234737, "grad_norm": 1.563256025314331, "learning_rate": 0.0049995033411632805, "loss": 2.7484, "step": 29400 }, { "epoch": 0.3965160353772951, "grad_norm": 0.510226309299469, "learning_rate": 0.004999492718105435, "loss": 2.7418, "step": 29500 }, { "epoch": 0.3978601575311164, "grad_norm": 0.7409377694129944, "learning_rate": 0.0049994819826496015, "loss": 2.7455, "step": 29600 }, { "epoch": 0.39920427968493777, "grad_norm": 1.132575511932373, "learning_rate": 0.004999471134796253, "loss": 2.7337, "step": 29700 }, { "epoch": 0.4005484018387591, "grad_norm": 0.8493718504905701, "learning_rate": 0.004999460174545891, "loss": 2.7374, "step": 29800 }, { "epoch": 0.40189252399258046, "grad_norm": 1.6549121141433716, "learning_rate": 0.004999449101898994, "loss": 2.7345, "step": 29900 }, { "epoch": 0.4032366461464018, "grad_norm": 1.0378655195236206, "learning_rate": 0.004999437916856077, "loss": 2.7411, "step": 30000 }, { "epoch": 0.4032366461464018, "eval_MaskedAccuracy": 0.4059693369791507, "eval_loss": 3.067436456680298, "eval_runtime": 349.401, "eval_samples_per_second": 181.671, "eval_steps_per_second": 0.71, "step": 30000 }, { "epoch": 0.4045807683002231, "grad_norm": 3.7114546298980713, "learning_rate": 0.004999426619417629, "loss": 2.7291, "step": 30100 }, { "epoch": 0.40592489045404445, "grad_norm": 0.8062450289726257, "learning_rate": 0.0049994152095841704, "loss": 2.7369, "step": 30200 }, { "epoch": 0.4072690126078658, "grad_norm": 0.82357257604599, "learning_rate": 0.004999403687356217, "loss": 2.7361, "step": 30300 }, { "epoch": 0.40861313476168715, "grad_norm": 0.6285329461097717, "learning_rate": 0.004999392052734278, "loss": 2.7343, "step": 30400 }, { "epoch": 0.4099572569155085, "grad_norm": 0.9262251853942871, "learning_rate": 0.0049993803057188785, "loss": 2.7323, "step": 30500 }, { "epoch": 0.41130137906932984, "grad_norm": 1.2343696355819702, "learning_rate": 0.004999368446310558, "loss": 2.7286, "step": 30600 }, { "epoch": 0.41264550122315113, "grad_norm": 0.9753904342651367, "learning_rate": 0.004999356474509844, "loss": 2.7363, "step": 30700 }, { "epoch": 0.4139896233769725, "grad_norm": 0.7958431839942932, "learning_rate": 0.004999344390317274, "loss": 2.7298, "step": 30800 }, { "epoch": 0.41533374553079383, "grad_norm": 4.113492965698242, "learning_rate": 0.004999332193733396, "loss": 2.732, "step": 30900 }, { "epoch": 0.4166778676846152, "grad_norm": 1.0215173959732056, "learning_rate": 0.004999319884758765, "loss": 2.7333, "step": 31000 }, { "epoch": 0.4166778676846152, "eval_MaskedAccuracy": 0.40677681485969963, "eval_loss": 3.0563271045684814, "eval_runtime": 159.104, "eval_samples_per_second": 398.959, "eval_steps_per_second": 1.559, "step": 31000 }, { "epoch": 0.4180219898384365, "grad_norm": 1.9819655418395996, "learning_rate": 0.004999307463393925, "loss": 2.7266, "step": 31100 }, { "epoch": 0.4193661119922579, "grad_norm": 1.4452855587005615, "learning_rate": 0.004999294929639439, "loss": 2.7248, "step": 31200 }, { "epoch": 0.4207102341460792, "grad_norm": 0.7133325934410095, "learning_rate": 0.004999282283495872, "loss": 2.7337, "step": 31300 }, { "epoch": 0.4220543562999005, "grad_norm": 0.930253267288208, "learning_rate": 0.004999269524963801, "loss": 2.7284, "step": 31400 }, { "epoch": 0.42339847845372186, "grad_norm": 2.160118818283081, "learning_rate": 0.004999256654043792, "loss": 2.7228, "step": 31500 }, { "epoch": 0.4247426006075432, "grad_norm": 1.322357416152954, "learning_rate": 0.004999243670736428, "loss": 2.729, "step": 31600 }, { "epoch": 0.42608672276136456, "grad_norm": 1.2306387424468994, "learning_rate": 0.00499923057504229, "loss": 2.7278, "step": 31700 }, { "epoch": 0.4274308449151859, "grad_norm": 2.193079710006714, "learning_rate": 0.004999217366961968, "loss": 2.7169, "step": 31800 }, { "epoch": 0.42877496706900725, "grad_norm": 0.698665976524353, "learning_rate": 0.0049992040464960615, "loss": 2.717, "step": 31900 }, { "epoch": 0.43011908922282854, "grad_norm": 0.765285313129425, "learning_rate": 0.004999190613645174, "loss": 2.7152, "step": 32000 }, { "epoch": 0.43011908922282854, "eval_MaskedAccuracy": 0.40871540481758456, "eval_loss": 3.042783260345459, "eval_runtime": 159.0761, "eval_samples_per_second": 399.029, "eval_steps_per_second": 1.559, "step": 32000 }, { "epoch": 0.4314632113766499, "grad_norm": 0.7268723845481873, "learning_rate": 0.004999177068409907, "loss": 2.7235, "step": 32100 }, { "epoch": 0.43280733353047124, "grad_norm": 0.5239226818084717, "learning_rate": 0.004999163410790871, "loss": 2.7151, "step": 32200 }, { "epoch": 0.4341514556842926, "grad_norm": 1.3244593143463135, "learning_rate": 0.004999149640788679, "loss": 2.7141, "step": 32300 }, { "epoch": 0.43549557783811393, "grad_norm": 0.9290921688079834, "learning_rate": 0.004999135758403953, "loss": 2.7156, "step": 32400 }, { "epoch": 0.4368396999919353, "grad_norm": 0.5451181530952454, "learning_rate": 0.004999121763637319, "loss": 2.7102, "step": 32500 }, { "epoch": 0.43818382214575663, "grad_norm": 0.9701553583145142, "learning_rate": 0.004999107656489411, "loss": 2.7198, "step": 32600 }, { "epoch": 0.4395279442995779, "grad_norm": 5.558290004730225, "learning_rate": 0.004999093436960863, "loss": 2.7124, "step": 32700 }, { "epoch": 0.44087206645339927, "grad_norm": 0.5010631084442139, "learning_rate": 0.004999079105052311, "loss": 2.7112, "step": 32800 }, { "epoch": 0.4422161886072206, "grad_norm": 0.5793175101280212, "learning_rate": 0.004999064660764402, "loss": 2.7111, "step": 32900 }, { "epoch": 0.44356031076104196, "grad_norm": 0.9704194664955139, "learning_rate": 0.004999050104097792, "loss": 2.7093, "step": 33000 }, { "epoch": 0.44356031076104196, "eval_MaskedAccuracy": 0.4106415839014644, "eval_loss": 3.032620906829834, "eval_runtime": 161.401, "eval_samples_per_second": 393.281, "eval_steps_per_second": 1.537, "step": 33000 }, { "epoch": 0.4449044329148633, "grad_norm": 2.713127613067627, "learning_rate": 0.004999035435053133, "loss": 2.7105, "step": 33100 }, { "epoch": 0.44624855506868466, "grad_norm": 0.9831237196922302, "learning_rate": 0.0049990206536310805, "loss": 2.7106, "step": 33200 }, { "epoch": 0.447592677222506, "grad_norm": 1.5991238355636597, "learning_rate": 0.004999005759832306, "loss": 2.6998, "step": 33300 }, { "epoch": 0.4489367993763273, "grad_norm": 1.0113264322280884, "learning_rate": 0.004998990753657483, "loss": 2.7157, "step": 33400 }, { "epoch": 0.45028092153014865, "grad_norm": 1.1059205532073975, "learning_rate": 0.0049989756351072855, "loss": 2.7127, "step": 33500 }, { "epoch": 0.45162504368397, "grad_norm": 0.8544152975082397, "learning_rate": 0.0049989604041823935, "loss": 2.7107, "step": 33600 }, { "epoch": 0.45296916583779134, "grad_norm": 0.6586983799934387, "learning_rate": 0.004998945060883495, "loss": 2.7205, "step": 33700 }, { "epoch": 0.4543132879916127, "grad_norm": 0.7723863124847412, "learning_rate": 0.00499892960521128, "loss": 2.694, "step": 33800 }, { "epoch": 0.45565741014543404, "grad_norm": 1.5881659984588623, "learning_rate": 0.00499891403716645, "loss": 2.7052, "step": 33900 }, { "epoch": 0.45700153229925533, "grad_norm": 1.2848504781723022, "learning_rate": 0.004998898356749702, "loss": 2.703, "step": 34000 }, { "epoch": 0.45700153229925533, "eval_MaskedAccuracy": 0.4106395221691058, "eval_loss": 3.0283474922180176, "eval_runtime": 258.5416, "eval_samples_per_second": 245.516, "eval_steps_per_second": 0.959, "step": 34000 }, { "epoch": 0.4583456544530767, "grad_norm": 0.7230987548828125, "learning_rate": 0.0049988825639617415, "loss": 2.6976, "step": 34100 }, { "epoch": 0.459689776606898, "grad_norm": 0.671057403087616, "learning_rate": 0.004998866658803282, "loss": 2.6931, "step": 34200 }, { "epoch": 0.4610338987607194, "grad_norm": 1.4508076906204224, "learning_rate": 0.004998850641275041, "loss": 2.7036, "step": 34300 }, { "epoch": 0.4623780209145407, "grad_norm": 1.325042724609375, "learning_rate": 0.004998834511377738, "loss": 2.6982, "step": 34400 }, { "epoch": 0.46372214306836207, "grad_norm": 0.49313464760780334, "learning_rate": 0.004998818269112104, "loss": 2.6976, "step": 34500 }, { "epoch": 0.4650662652221834, "grad_norm": 0.7200739979743958, "learning_rate": 0.00499880191447886, "loss": 2.6979, "step": 34600 }, { "epoch": 0.4664103873760047, "grad_norm": 1.495227336883545, "learning_rate": 0.004998785447478756, "loss": 2.6831, "step": 34700 }, { "epoch": 0.46775450952982606, "grad_norm": 0.5236303210258484, "learning_rate": 0.00499876886811252, "loss": 2.6932, "step": 34800 }, { "epoch": 0.4690986316836474, "grad_norm": 1.4335426092147827, "learning_rate": 0.004998752176380915, "loss": 2.6969, "step": 34900 }, { "epoch": 0.47044275383746875, "grad_norm": 2.9952592849731445, "learning_rate": 0.004998735372284684, "loss": 2.7029, "step": 35000 }, { "epoch": 0.47044275383746875, "eval_MaskedAccuracy": 0.4116071260461691, "eval_loss": 3.016385078430176, "eval_runtime": 156.2897, "eval_samples_per_second": 406.143, "eval_steps_per_second": 1.587, "step": 35000 }, { "epoch": 0.4717868759912901, "grad_norm": 0.6854963302612305, "learning_rate": 0.004998718455824586, "loss": 2.6894, "step": 35100 }, { "epoch": 0.47313099814511145, "grad_norm": 2.010087251663208, "learning_rate": 0.004998701427001374, "loss": 2.691, "step": 35200 }, { "epoch": 0.47447512029893274, "grad_norm": 0.6356353163719177, "learning_rate": 0.004998684285815833, "loss": 2.69, "step": 35300 }, { "epoch": 0.4758192424527541, "grad_norm": 0.5063838958740234, "learning_rate": 0.004998667032268725, "loss": 2.6895, "step": 35400 }, { "epoch": 0.47716336460657544, "grad_norm": 0.8183920979499817, "learning_rate": 0.0049986496663608285, "loss": 2.6951, "step": 35500 }, { "epoch": 0.4785074867603968, "grad_norm": 0.9003168940544128, "learning_rate": 0.004998632188092928, "loss": 2.6822, "step": 35600 }, { "epoch": 0.47985160891421813, "grad_norm": 0.8852120041847229, "learning_rate": 0.004998614597465808, "loss": 2.6806, "step": 35700 }, { "epoch": 0.4811957310680395, "grad_norm": 0.7671552896499634, "learning_rate": 0.004998596894480259, "loss": 2.6789, "step": 35800 }, { "epoch": 0.4825398532218608, "grad_norm": 1.260359287261963, "learning_rate": 0.004998579079137085, "loss": 2.679, "step": 35900 }, { "epoch": 0.4838839753756821, "grad_norm": 0.5925015807151794, "learning_rate": 0.004998561151437081, "loss": 2.69, "step": 36000 }, { "epoch": 0.4838839753756821, "eval_MaskedAccuracy": 0.41254465747964864, "eval_loss": 3.0127551555633545, "eval_runtime": 154.2797, "eval_samples_per_second": 411.434, "eval_steps_per_second": 1.607, "step": 36000 }, { "epoch": 0.48522809752950347, "grad_norm": 0.7644244432449341, "learning_rate": 0.004998543111381064, "loss": 2.6828, "step": 36100 }, { "epoch": 0.4865722196833248, "grad_norm": 0.8469159007072449, "learning_rate": 0.004998524958969839, "loss": 2.6878, "step": 36200 }, { "epoch": 0.48791634183714616, "grad_norm": 0.847205638885498, "learning_rate": 0.004998506694204228, "loss": 2.6818, "step": 36300 }, { "epoch": 0.4892604639909675, "grad_norm": 0.581739604473114, "learning_rate": 0.004998488317085059, "loss": 2.6741, "step": 36400 }, { "epoch": 0.49060458614478886, "grad_norm": 2.133725881576538, "learning_rate": 0.004998469827613156, "loss": 2.6786, "step": 36500 }, { "epoch": 0.4919487082986102, "grad_norm": 1.3259893655776978, "learning_rate": 0.00499845122578935, "loss": 2.6819, "step": 36600 }, { "epoch": 0.4932928304524315, "grad_norm": 1.4917380809783936, "learning_rate": 0.004998432511614469, "loss": 2.6802, "step": 36700 }, { "epoch": 0.49463695260625284, "grad_norm": 2.2140398025512695, "learning_rate": 0.004998413685089377, "loss": 2.6767, "step": 36800 }, { "epoch": 0.4959810747600742, "grad_norm": 0.7825784683227539, "learning_rate": 0.004998394746214914, "loss": 2.6801, "step": 36900 }, { "epoch": 0.49732519691389554, "grad_norm": 2.3250296115875244, "learning_rate": 0.00499837569499193, "loss": 2.6736, "step": 37000 }, { "epoch": 0.49732519691389554, "eval_MaskedAccuracy": 0.413791745700555, "eval_loss": 3.001739740371704, "eval_runtime": 155.081, "eval_samples_per_second": 409.309, "eval_steps_per_second": 1.599, "step": 37000 }, { "epoch": 0.4986693190677169, "grad_norm": 0.9509634971618652, "learning_rate": 0.004998356531421287, "loss": 2.6815, "step": 37100 }, { "epoch": 0.5000134412215382, "grad_norm": 0.5249235033988953, "learning_rate": 0.004998337255503845, "loss": 2.6761, "step": 37200 }, { "epoch": 0.5013575633753595, "grad_norm": 0.492450088262558, "learning_rate": 0.004998317867240472, "loss": 2.6713, "step": 37300 }, { "epoch": 0.5027016855291809, "grad_norm": 0.5649069547653198, "learning_rate": 0.00499829836663204, "loss": 2.6718, "step": 37400 }, { "epoch": 0.5040458076830022, "grad_norm": 1.028743028640747, "learning_rate": 0.004998278753679443, "loss": 2.6786, "step": 37500 }, { "epoch": 0.5053899298368236, "grad_norm": 1.2656359672546387, "learning_rate": 0.004998259028383549, "loss": 2.666, "step": 37600 }, { "epoch": 0.5067340519906449, "grad_norm": 1.0689085721969604, "learning_rate": 0.004998239190745246, "loss": 2.6705, "step": 37700 }, { "epoch": 0.5080781741444662, "grad_norm": 1.323631763458252, "learning_rate": 0.00499821924076543, "loss": 2.6748, "step": 37800 }, { "epoch": 0.5094222962982876, "grad_norm": 1.0238298177719116, "learning_rate": 0.004998199178445008, "loss": 2.6692, "step": 37900 }, { "epoch": 0.5107664184521089, "grad_norm": 1.5579122304916382, "learning_rate": 0.0049981790037848806, "loss": 2.6692, "step": 38000 }, { "epoch": 0.5107664184521089, "eval_MaskedAccuracy": 0.41454185677096544, "eval_loss": 2.9944686889648438, "eval_runtime": 154.5873, "eval_samples_per_second": 410.616, "eval_steps_per_second": 1.604, "step": 38000 }, { "epoch": 0.5121105406059303, "grad_norm": 0.9145413041114807, "learning_rate": 0.004998158716785951, "loss": 2.6664, "step": 38100 }, { "epoch": 0.5134546627597516, "grad_norm": 1.3004742860794067, "learning_rate": 0.0049981383174491365, "loss": 2.6749, "step": 38200 }, { "epoch": 0.5147987849135729, "grad_norm": 1.8650057315826416, "learning_rate": 0.0049981178057753595, "loss": 2.6799, "step": 38300 }, { "epoch": 0.5161429070673943, "grad_norm": 1.2665841579437256, "learning_rate": 0.004998097181765546, "loss": 2.6638, "step": 38400 }, { "epoch": 0.5174870292212156, "grad_norm": 0.8943463563919067, "learning_rate": 0.004998076445420617, "loss": 2.6612, "step": 38500 }, { "epoch": 0.518831151375037, "grad_norm": 1.6985851526260376, "learning_rate": 0.004998055596741511, "loss": 2.6653, "step": 38600 }, { "epoch": 0.5201752735288583, "grad_norm": 2.1343910694122314, "learning_rate": 0.004998034635729168, "loss": 2.6642, "step": 38700 }, { "epoch": 0.5215193956826797, "grad_norm": 1.4548419713974, "learning_rate": 0.004998013562384535, "loss": 2.6524, "step": 38800 }, { "epoch": 0.522863517836501, "grad_norm": 2.0018160343170166, "learning_rate": 0.004997992376708557, "loss": 2.6574, "step": 38900 }, { "epoch": 0.5242076399903223, "grad_norm": 1.4500235319137573, "learning_rate": 0.004997971078702187, "loss": 2.6614, "step": 39000 }, { "epoch": 0.5242076399903223, "eval_MaskedAccuracy": 0.4153363544702115, "eval_loss": 2.9864044189453125, "eval_runtime": 154.3717, "eval_samples_per_second": 411.189, "eval_steps_per_second": 1.607, "step": 39000 }, { "epoch": 0.5255517621441437, "grad_norm": 1.6406347751617432, "learning_rate": 0.0049979496683663855, "loss": 2.6609, "step": 39100 }, { "epoch": 0.526895884297965, "grad_norm": 1.1125140190124512, "learning_rate": 0.004997928145702119, "loss": 2.6553, "step": 39200 }, { "epoch": 0.5282400064517864, "grad_norm": 4.977079391479492, "learning_rate": 0.004997906510710358, "loss": 2.6627, "step": 39300 }, { "epoch": 0.5295841286056077, "grad_norm": 0.5258601903915405, "learning_rate": 0.004997884763392075, "loss": 2.6664, "step": 39400 }, { "epoch": 0.5309282507594291, "grad_norm": 0.5638339519500732, "learning_rate": 0.004997862903748257, "loss": 2.6565, "step": 39500 }, { "epoch": 0.5322723729132504, "grad_norm": 0.5276058912277222, "learning_rate": 0.00499784093177988, "loss": 2.6583, "step": 39600 }, { "epoch": 0.5336164950670717, "grad_norm": 0.7912238836288452, "learning_rate": 0.004997818847487944, "loss": 2.6595, "step": 39700 }, { "epoch": 0.534960617220893, "grad_norm": 0.949805736541748, "learning_rate": 0.004997796650873429, "loss": 2.6698, "step": 39800 }, { "epoch": 0.5363047393747143, "grad_norm": 1.7956501245498657, "learning_rate": 0.004997774341937344, "loss": 2.6617, "step": 39900 }, { "epoch": 0.5376488615285357, "grad_norm": 0.5093005895614624, "learning_rate": 0.004997751920680701, "loss": 2.6498, "step": 40000 }, { "epoch": 0.5376488615285357, "eval_MaskedAccuracy": 0.4163031669419539, "eval_loss": 2.9814693927764893, "eval_runtime": 153.8302, "eval_samples_per_second": 412.637, "eval_steps_per_second": 1.612, "step": 40000 }, { "epoch": 0.538992983682357, "grad_norm": 1.8199607133865356, "learning_rate": 0.004997729387104499, "loss": 2.6542, "step": 40100 }, { "epoch": 0.5403371058361784, "grad_norm": 1.2168132066726685, "learning_rate": 0.004997706741209757, "loss": 2.6484, "step": 40200 }, { "epoch": 0.5416812279899997, "grad_norm": 1.862183928489685, "learning_rate": 0.004997683982997499, "loss": 2.6529, "step": 40300 }, { "epoch": 0.543025350143821, "grad_norm": 1.591640830039978, "learning_rate": 0.004997661112468743, "loss": 2.6541, "step": 40400 }, { "epoch": 0.5443694722976424, "grad_norm": 3.0422515869140625, "learning_rate": 0.004997638129624528, "loss": 2.6493, "step": 40500 }, { "epoch": 0.5457135944514637, "grad_norm": 0.5764662027359009, "learning_rate": 0.004997615034465888, "loss": 2.6454, "step": 40600 }, { "epoch": 0.5470577166052851, "grad_norm": 0.7897889018058777, "learning_rate": 0.004997591826993861, "loss": 2.6526, "step": 40700 }, { "epoch": 0.5484018387591064, "grad_norm": 0.5583525896072388, "learning_rate": 0.004997568507209492, "loss": 2.6488, "step": 40800 }, { "epoch": 0.5497459609129277, "grad_norm": 1.2979474067687988, "learning_rate": 0.004997545075113834, "loss": 2.6439, "step": 40900 }, { "epoch": 0.5510900830667491, "grad_norm": 2.09676456451416, "learning_rate": 0.0049975215307079355, "loss": 2.656, "step": 41000 }, { "epoch": 0.5510900830667491, "eval_MaskedAccuracy": 0.4157399235251504, "eval_loss": 2.981074571609497, "eval_runtime": 154.1369, "eval_samples_per_second": 411.816, "eval_steps_per_second": 1.609, "step": 41000 }, { "epoch": 0.5524342052205704, "grad_norm": 3.9577839374542236, "learning_rate": 0.0049974978739928694, "loss": 2.6505, "step": 41100 }, { "epoch": 0.5537783273743918, "grad_norm": 0.5468003153800964, "learning_rate": 0.004997474104969696, "loss": 2.6442, "step": 41200 }, { "epoch": 0.5551224495282131, "grad_norm": 0.5739485621452332, "learning_rate": 0.004997450223639483, "loss": 2.6446, "step": 41300 }, { "epoch": 0.5564665716820345, "grad_norm": 1.8400816917419434, "learning_rate": 0.00499742623000331, "loss": 2.6448, "step": 41400 }, { "epoch": 0.5578106938358558, "grad_norm": 1.2535688877105713, "learning_rate": 0.004997402124062256, "loss": 2.6482, "step": 41500 }, { "epoch": 0.5591548159896771, "grad_norm": 1.228569746017456, "learning_rate": 0.0049973779058174106, "loss": 2.6498, "step": 41600 }, { "epoch": 0.5604989381434985, "grad_norm": 1.4011684656143188, "learning_rate": 0.004997353575269862, "loss": 2.643, "step": 41700 }, { "epoch": 0.5618430602973198, "grad_norm": 0.5663438439369202, "learning_rate": 0.004997329132420707, "loss": 2.6515, "step": 41800 }, { "epoch": 0.5631871824511412, "grad_norm": 0.5198083519935608, "learning_rate": 0.004997304577271046, "loss": 2.6529, "step": 41900 }, { "epoch": 0.5645313046049625, "grad_norm": 3.9535000324249268, "learning_rate": 0.004997279909821991, "loss": 2.6393, "step": 42000 }, { "epoch": 0.5645313046049625, "eval_MaskedAccuracy": 0.4173549594387652, "eval_loss": 2.971799850463867, "eval_runtime": 155.0862, "eval_samples_per_second": 409.295, "eval_steps_per_second": 1.599, "step": 42000 }, { "epoch": 0.5658754267587839, "grad_norm": 2.1890416145324707, "learning_rate": 0.004997255130074655, "loss": 2.6495, "step": 42100 }, { "epoch": 0.5672195489126052, "grad_norm": 1.686848521232605, "learning_rate": 0.004997230238030147, "loss": 2.6365, "step": 42200 }, { "epoch": 0.5685636710664265, "grad_norm": 1.8088412284851074, "learning_rate": 0.004997205233689578, "loss": 2.6421, "step": 42300 }, { "epoch": 0.5699077932202479, "grad_norm": 1.7008030414581299, "learning_rate": 0.004997180117054094, "loss": 2.6371, "step": 42400 }, { "epoch": 0.5712519153740692, "grad_norm": 1.355776309967041, "learning_rate": 0.0049971548881248195, "loss": 2.6357, "step": 42500 }, { "epoch": 0.5725960375278906, "grad_norm": 1.5508376359939575, "learning_rate": 0.004997129546902893, "loss": 2.6438, "step": 42600 }, { "epoch": 0.5739401596817119, "grad_norm": 0.7327139377593994, "learning_rate": 0.004997104093389453, "loss": 2.6318, "step": 42700 }, { "epoch": 0.5752842818355333, "grad_norm": 0.8198758959770203, "learning_rate": 0.004997078527585654, "loss": 2.6407, "step": 42800 }, { "epoch": 0.5766284039893546, "grad_norm": 1.41826331615448, "learning_rate": 0.004997052849492641, "loss": 2.6364, "step": 42900 }, { "epoch": 0.5779725261431758, "grad_norm": 0.8289880156517029, "learning_rate": 0.0049970270591115735, "loss": 2.6494, "step": 43000 }, { "epoch": 0.5779725261431758, "eval_MaskedAccuracy": 0.41913729394217447, "eval_loss": 2.955897092819214, "eval_runtime": 154.7124, "eval_samples_per_second": 410.284, "eval_steps_per_second": 1.603, "step": 43000 }, { "epoch": 0.5793166482969972, "grad_norm": 1.136483907699585, "learning_rate": 0.0049970011564436055, "loss": 2.6364, "step": 43100 }, { "epoch": 0.5806607704508185, "grad_norm": 1.627422571182251, "learning_rate": 0.004996975141489916, "loss": 2.6424, "step": 43200 }, { "epoch": 0.58200489260464, "grad_norm": 1.4853564500808716, "learning_rate": 0.004996949014251676, "loss": 2.6462, "step": 43300 }, { "epoch": 0.5833490147584612, "grad_norm": 1.7882176637649536, "learning_rate": 0.0049969227747300535, "loss": 2.6278, "step": 43400 }, { "epoch": 0.5846931369122826, "grad_norm": 2.825515031814575, "learning_rate": 0.004996896422926242, "loss": 2.6412, "step": 43500 }, { "epoch": 0.5860372590661039, "grad_norm": 0.4745466709136963, "learning_rate": 0.004996869958841418, "loss": 2.6332, "step": 43600 }, { "epoch": 0.5873813812199252, "grad_norm": 1.2661569118499756, "learning_rate": 0.00499684338247678, "loss": 2.6325, "step": 43700 }, { "epoch": 0.5887255033737466, "grad_norm": 1.2799876928329468, "learning_rate": 0.004996816693833525, "loss": 2.6413, "step": 43800 }, { "epoch": 0.5900696255275679, "grad_norm": 0.7999758720397949, "learning_rate": 0.004996789892912855, "loss": 2.6256, "step": 43900 }, { "epoch": 0.5914137476813893, "grad_norm": 0.7396803498268127, "learning_rate": 0.004996762979715977, "loss": 2.6313, "step": 44000 }, { "epoch": 0.5914137476813893, "eval_MaskedAccuracy": 0.41881557303141803, "eval_loss": 2.9574387073516846, "eval_runtime": 156.1034, "eval_samples_per_second": 406.628, "eval_steps_per_second": 1.589, "step": 44000 }, { "epoch": 0.5927578698352106, "grad_norm": 1.06534743309021, "learning_rate": 0.00499673595424411, "loss": 2.6311, "step": 44100 }, { "epoch": 0.5941019919890319, "grad_norm": 1.120554804801941, "learning_rate": 0.004996708816498461, "loss": 2.6289, "step": 44200 }, { "epoch": 0.5954461141428533, "grad_norm": 0.5264089703559875, "learning_rate": 0.004996681566480261, "loss": 2.6324, "step": 44300 }, { "epoch": 0.5967902362966746, "grad_norm": 2.0513694286346436, "learning_rate": 0.004996654204190736, "loss": 2.6346, "step": 44400 }, { "epoch": 0.598134358450496, "grad_norm": 1.3531913757324219, "learning_rate": 0.004996626729631115, "loss": 2.6351, "step": 44500 }, { "epoch": 0.5994784806043173, "grad_norm": 0.9898929595947266, "learning_rate": 0.004996599142802638, "loss": 2.6232, "step": 44600 }, { "epoch": 0.6008226027581387, "grad_norm": 0.674278736114502, "learning_rate": 0.00499657144370655, "loss": 2.6227, "step": 44700 }, { "epoch": 0.60216672491196, "grad_norm": 0.5685105323791504, "learning_rate": 0.004996543632344098, "loss": 2.6306, "step": 44800 }, { "epoch": 0.6035108470657813, "grad_norm": 0.5396636724472046, "learning_rate": 0.004996515708716534, "loss": 2.6351, "step": 44900 }, { "epoch": 0.6048549692196027, "grad_norm": 1.0353196859359741, "learning_rate": 0.004996487672825116, "loss": 2.6332, "step": 45000 }, { "epoch": 0.6048549692196027, "eval_MaskedAccuracy": 0.4208831607484112, "eval_loss": 2.9444289207458496, "eval_runtime": 157.2902, "eval_samples_per_second": 403.56, "eval_steps_per_second": 1.577, "step": 45000 }, { "epoch": 0.606199091373424, "grad_norm": 0.9090419411659241, "learning_rate": 0.004996459524671107, "loss": 2.6303, "step": 45100 }, { "epoch": 0.6075432135272454, "grad_norm": 0.5557056069374084, "learning_rate": 0.004996431264255778, "loss": 2.6253, "step": 45200 }, { "epoch": 0.6088873356810667, "grad_norm": 1.1961573362350464, "learning_rate": 0.004996402891580404, "loss": 2.6249, "step": 45300 }, { "epoch": 0.6102314578348881, "grad_norm": 0.7062890529632568, "learning_rate": 0.00499637440664626, "loss": 2.6264, "step": 45400 }, { "epoch": 0.6115755799887094, "grad_norm": 0.7188198566436768, "learning_rate": 0.004996345809454634, "loss": 2.63, "step": 45500 }, { "epoch": 0.6129197021425307, "grad_norm": 1.2873228788375854, "learning_rate": 0.0049963171000068085, "loss": 2.6234, "step": 45600 }, { "epoch": 0.6142638242963521, "grad_norm": 1.0937819480895996, "learning_rate": 0.004996288278304078, "loss": 2.6309, "step": 45700 }, { "epoch": 0.6156079464501734, "grad_norm": 0.9512178897857666, "learning_rate": 0.004996259344347743, "loss": 2.6194, "step": 45800 }, { "epoch": 0.6169520686039948, "grad_norm": 0.4616401791572571, "learning_rate": 0.004996230298139107, "loss": 2.6233, "step": 45900 }, { "epoch": 0.618296190757816, "grad_norm": 0.6116599440574646, "learning_rate": 0.004996201139679473, "loss": 2.6205, "step": 46000 }, { "epoch": 0.618296190757816, "eval_MaskedAccuracy": 0.4190083845851312, "eval_loss": 2.9517083168029785, "eval_runtime": 142.8409, "eval_samples_per_second": 444.382, "eval_steps_per_second": 1.736, "step": 46000 }, { "epoch": 0.6196403129116375, "grad_norm": 1.2493271827697754, "learning_rate": 0.004996171868970165, "loss": 2.6293, "step": 46100 }, { "epoch": 0.6209844350654588, "grad_norm": 1.9406765699386597, "learning_rate": 0.004996142486012493, "loss": 2.626, "step": 46200 }, { "epoch": 0.62232855721928, "grad_norm": 0.6907238960266113, "learning_rate": 0.004996112990807786, "loss": 2.6141, "step": 46300 }, { "epoch": 0.6236726793731014, "grad_norm": 0.5618690848350525, "learning_rate": 0.0049960833833573755, "loss": 2.6151, "step": 46400 }, { "epoch": 0.6250168015269227, "grad_norm": 1.8456848859786987, "learning_rate": 0.004996053663662593, "loss": 2.6158, "step": 46500 }, { "epoch": 0.6263609236807441, "grad_norm": 0.4822538495063782, "learning_rate": 0.004996023831724768, "loss": 2.6219, "step": 46600 }, { "epoch": 0.6277050458345654, "grad_norm": 0.4805866777896881, "learning_rate": 0.004995993887545261, "loss": 2.6134, "step": 46700 }, { "epoch": 0.6290491679883868, "grad_norm": 1.5017101764678955, "learning_rate": 0.004995963831125411, "loss": 2.6146, "step": 46800 }, { "epoch": 0.6303932901422081, "grad_norm": 1.2550396919250488, "learning_rate": 0.00499593366246658, "loss": 2.6176, "step": 46900 }, { "epoch": 0.6317374122960294, "grad_norm": 1.6678540706634521, "learning_rate": 0.004995903381570121, "loss": 2.6095, "step": 47000 }, { "epoch": 0.6317374122960294, "eval_MaskedAccuracy": 0.42101418324632395, "eval_loss": 2.939000129699707, "eval_runtime": 157.4312, "eval_samples_per_second": 403.198, "eval_steps_per_second": 1.575, "step": 47000 }, { "epoch": 0.6330815344498508, "grad_norm": 0.7519531846046448, "learning_rate": 0.004995872988437397, "loss": 2.6146, "step": 47100 }, { "epoch": 0.6344256566036721, "grad_norm": 0.5543345808982849, "learning_rate": 0.004995842483069782, "loss": 2.6165, "step": 47200 }, { "epoch": 0.6357697787574935, "grad_norm": 0.8774918913841248, "learning_rate": 0.004995811865468641, "loss": 2.617, "step": 47300 }, { "epoch": 0.6371139009113148, "grad_norm": 0.5010842680931091, "learning_rate": 0.004995781135635373, "loss": 2.622, "step": 47400 }, { "epoch": 0.6384580230651361, "grad_norm": 0.537598192691803, "learning_rate": 0.0049957502935713445, "loss": 2.6228, "step": 47500 }, { "epoch": 0.6398021452189575, "grad_norm": 0.65438312292099, "learning_rate": 0.004995719339277949, "loss": 2.6154, "step": 47600 }, { "epoch": 0.6411462673727788, "grad_norm": 0.726901113986969, "learning_rate": 0.004995688272756588, "loss": 2.6126, "step": 47700 }, { "epoch": 0.6424903895266002, "grad_norm": 1.2524980306625366, "learning_rate": 0.004995657094008657, "loss": 2.6075, "step": 47800 }, { "epoch": 0.6438345116804215, "grad_norm": 2.223083257675171, "learning_rate": 0.004995625803035562, "loss": 2.6113, "step": 47900 }, { "epoch": 0.6451786338342429, "grad_norm": 0.7421071529388428, "learning_rate": 0.004995594399838711, "loss": 2.6088, "step": 48000 }, { "epoch": 0.6451786338342429, "eval_MaskedAccuracy": 0.42257422566160907, "eval_loss": 2.9292173385620117, "eval_runtime": 154.814, "eval_samples_per_second": 410.015, "eval_steps_per_second": 1.602, "step": 48000 }, { "epoch": 0.6465227559880642, "grad_norm": 0.6506995558738708, "learning_rate": 0.004995562884419525, "loss": 2.6105, "step": 48100 }, { "epoch": 0.6478668781418855, "grad_norm": 1.0766329765319824, "learning_rate": 0.004995531256779415, "loss": 2.6139, "step": 48200 }, { "epoch": 0.6492110002957069, "grad_norm": 1.1762914657592773, "learning_rate": 0.00499549951691981, "loss": 2.6078, "step": 48300 }, { "epoch": 0.6505551224495282, "grad_norm": 2.2142832279205322, "learning_rate": 0.004995467664842145, "loss": 2.6093, "step": 48400 }, { "epoch": 0.6518992446033496, "grad_norm": 1.1189358234405518, "learning_rate": 0.00499543570054785, "loss": 2.6141, "step": 48500 }, { "epoch": 0.6532433667571709, "grad_norm": 0.7989232540130615, "learning_rate": 0.004995403624038362, "loss": 2.6155, "step": 48600 }, { "epoch": 0.6545874889109923, "grad_norm": 1.5602754354476929, "learning_rate": 0.004995371435315131, "loss": 2.6137, "step": 48700 }, { "epoch": 0.6559316110648136, "grad_norm": 0.7082908749580383, "learning_rate": 0.004995339134379602, "loss": 2.6138, "step": 48800 }, { "epoch": 0.6572757332186349, "grad_norm": 1.3697164058685303, "learning_rate": 0.004995306721233229, "loss": 2.6094, "step": 48900 }, { "epoch": 0.6586198553724563, "grad_norm": 0.5310668349266052, "learning_rate": 0.004995274195877492, "loss": 2.6069, "step": 49000 }, { "epoch": 0.6586198553724563, "eval_MaskedAccuracy": 0.42236824728278766, "eval_loss": 2.928412437438965, "eval_runtime": 155.9626, "eval_samples_per_second": 406.995, "eval_steps_per_second": 1.59, "step": 49000 }, { "epoch": 0.6599639775262776, "grad_norm": 1.284600019454956, "learning_rate": 0.00499524155831384, "loss": 2.6058, "step": 49100 }, { "epoch": 0.661308099680099, "grad_norm": 0.44148510694503784, "learning_rate": 0.004995208808543738, "loss": 2.6046, "step": 49200 }, { "epoch": 0.6626522218339203, "grad_norm": 1.5299873352050781, "learning_rate": 0.004995175946568674, "loss": 2.6057, "step": 49300 }, { "epoch": 0.6639963439877417, "grad_norm": 0.8762273192405701, "learning_rate": 0.00499514297239013, "loss": 2.6049, "step": 49400 }, { "epoch": 0.665340466141563, "grad_norm": 0.6929760575294495, "learning_rate": 0.004995109886009586, "loss": 2.6117, "step": 49500 }, { "epoch": 0.6666845882953842, "grad_norm": 0.9134759306907654, "learning_rate": 0.004995076687428528, "loss": 2.6008, "step": 49600 }, { "epoch": 0.6680287104492056, "grad_norm": 1.4550007581710815, "learning_rate": 0.004995043376648459, "loss": 2.6087, "step": 49700 }, { "epoch": 0.6693728326030269, "grad_norm": 1.1609247922897339, "learning_rate": 0.004995009953670882, "loss": 2.6043, "step": 49800 }, { "epoch": 0.6707169547568483, "grad_norm": 0.5234310030937195, "learning_rate": 0.004994976418497298, "loss": 2.6068, "step": 49900 }, { "epoch": 0.6720610769106696, "grad_norm": 1.2967578172683716, "learning_rate": 0.004994942771129218, "loss": 2.6029, "step": 50000 }, { "epoch": 0.6720610769106696, "eval_MaskedAccuracy": 0.4233113243940233, "eval_loss": 2.9228169918060303, "eval_runtime": 161.4288, "eval_samples_per_second": 393.213, "eval_steps_per_second": 1.536, "step": 50000 }, { "epoch": 0.673405199064491, "grad_norm": 1.6317380666732788, "learning_rate": 0.00499490901156816, "loss": 2.6123, "step": 50100 }, { "epoch": 0.6747493212183123, "grad_norm": 0.8617536425590515, "learning_rate": 0.004994875139815648, "loss": 2.5964, "step": 50200 }, { "epoch": 0.6760934433721336, "grad_norm": 1.1220712661743164, "learning_rate": 0.004994841155873204, "loss": 2.5989, "step": 50300 }, { "epoch": 0.677437565525955, "grad_norm": 0.5247998237609863, "learning_rate": 0.004994807059742355, "loss": 2.598, "step": 50400 }, { "epoch": 0.6787816876797763, "grad_norm": 1.532121181488037, "learning_rate": 0.004994772851424643, "loss": 2.5946, "step": 50500 }, { "epoch": 0.6801258098335977, "grad_norm": 1.0279977321624756, "learning_rate": 0.004994738530921611, "loss": 2.6086, "step": 50600 }, { "epoch": 0.681469931987419, "grad_norm": 0.8515674471855164, "learning_rate": 0.0049947040982348005, "loss": 2.5934, "step": 50700 }, { "epoch": 0.6828140541412403, "grad_norm": 0.958196222782135, "learning_rate": 0.004994669553365766, "loss": 2.597, "step": 50800 }, { "epoch": 0.6841581762950617, "grad_norm": 0.795566737651825, "learning_rate": 0.0049946348963160634, "loss": 2.6028, "step": 50900 }, { "epoch": 0.685502298448883, "grad_norm": 1.396265983581543, "learning_rate": 0.0049946001270872575, "loss": 2.5972, "step": 51000 }, { "epoch": 0.685502298448883, "eval_MaskedAccuracy": 0.42380398111438183, "eval_loss": 2.9168331623077393, "eval_runtime": 155.1031, "eval_samples_per_second": 409.25, "eval_steps_per_second": 1.599, "step": 51000 }, { "epoch": 0.6868464206027044, "grad_norm": 1.0055732727050781, "learning_rate": 0.004994565245680906, "loss": 2.5967, "step": 51100 }, { "epoch": 0.6881905427565257, "grad_norm": 1.37259840965271, "learning_rate": 0.004994530252098587, "loss": 2.6036, "step": 51200 }, { "epoch": 0.6895346649103471, "grad_norm": 0.4619656801223755, "learning_rate": 0.00499449514634188, "loss": 2.6019, "step": 51300 }, { "epoch": 0.6908787870641684, "grad_norm": 0.7385687828063965, "learning_rate": 0.004994459928412364, "loss": 2.6004, "step": 51400 }, { "epoch": 0.6922229092179897, "grad_norm": 1.73374342918396, "learning_rate": 0.004994424598311631, "loss": 2.592, "step": 51500 }, { "epoch": 0.6935670313718111, "grad_norm": 1.9898829460144043, "learning_rate": 0.00499438915604126, "loss": 2.5961, "step": 51600 }, { "epoch": 0.6949111535256324, "grad_norm": 1.4267821311950684, "learning_rate": 0.00499435360160286, "loss": 2.5896, "step": 51700 }, { "epoch": 0.6962552756794538, "grad_norm": 1.4741652011871338, "learning_rate": 0.004994317934998031, "loss": 2.5957, "step": 51800 }, { "epoch": 0.6975993978332751, "grad_norm": 1.60403573513031, "learning_rate": 0.004994282156228371, "loss": 2.5936, "step": 51900 }, { "epoch": 0.6989435199870965, "grad_norm": 1.172230839729309, "learning_rate": 0.004994246265295497, "loss": 2.5973, "step": 52000 }, { "epoch": 0.6989435199870965, "eval_MaskedAccuracy": 0.42523802061668947, "eval_loss": 2.911010265350342, "eval_runtime": 154.888, "eval_samples_per_second": 409.819, "eval_steps_per_second": 1.601, "step": 52000 }, { "epoch": 0.7002876421409178, "grad_norm": 1.7461953163146973, "learning_rate": 0.004994210262201032, "loss": 2.5981, "step": 52100 }, { "epoch": 0.7016317642947391, "grad_norm": 1.0520185232162476, "learning_rate": 0.004994174146946593, "loss": 2.5931, "step": 52200 }, { "epoch": 0.7029758864485605, "grad_norm": 0.6437071561813354, "learning_rate": 0.0049941379195338085, "loss": 2.5871, "step": 52300 }, { "epoch": 0.7043200086023818, "grad_norm": 0.5879480838775635, "learning_rate": 0.004994101579964312, "loss": 2.5895, "step": 52400 }, { "epoch": 0.7056641307562032, "grad_norm": 0.4580605626106262, "learning_rate": 0.00499406512823974, "loss": 2.5958, "step": 52500 }, { "epoch": 0.7070082529100244, "grad_norm": 1.544168472290039, "learning_rate": 0.004994028564361737, "loss": 2.5952, "step": 52600 }, { "epoch": 0.7083523750638459, "grad_norm": 2.1680054664611816, "learning_rate": 0.004993991888331947, "loss": 2.5939, "step": 52700 }, { "epoch": 0.7096964972176671, "grad_norm": 0.9118250608444214, "learning_rate": 0.004993955100152014, "loss": 2.5873, "step": 52800 }, { "epoch": 0.7110406193714884, "grad_norm": 0.6249663233757019, "learning_rate": 0.004993918199823611, "loss": 2.5945, "step": 52900 }, { "epoch": 0.7123847415253098, "grad_norm": 0.5641465783119202, "learning_rate": 0.004993881187348401, "loss": 2.5875, "step": 53000 }, { "epoch": 0.7123847415253098, "eval_MaskedAccuracy": 0.42511639864491074, "eval_loss": 2.9080851078033447, "eval_runtime": 155.0598, "eval_samples_per_second": 409.365, "eval_steps_per_second": 1.599, "step": 53000 }, { "epoch": 0.7137288636791311, "grad_norm": 1.5355523824691772, "learning_rate": 0.004993844062728042, "loss": 2.586, "step": 53100 }, { "epoch": 0.7150729858329525, "grad_norm": 0.9230164885520935, "learning_rate": 0.004993806825964211, "loss": 2.5855, "step": 53200 }, { "epoch": 0.7164171079867738, "grad_norm": 1.638444185256958, "learning_rate": 0.004993769477058582, "loss": 2.5989, "step": 53300 }, { "epoch": 0.7177612301405952, "grad_norm": 0.5577353835105896, "learning_rate": 0.004993732016012843, "loss": 2.5967, "step": 53400 }, { "epoch": 0.7191053522944165, "grad_norm": 0.5781148672103882, "learning_rate": 0.0049936944428286825, "loss": 2.5923, "step": 53500 }, { "epoch": 0.7204494744482378, "grad_norm": 1.6455879211425781, "learning_rate": 0.00499365675750779, "loss": 2.5868, "step": 53600 }, { "epoch": 0.7217935966020592, "grad_norm": 0.6370722651481628, "learning_rate": 0.004993618960051859, "loss": 2.5879, "step": 53700 }, { "epoch": 0.7231377187558805, "grad_norm": 1.1286998987197876, "learning_rate": 0.004993581050462601, "loss": 2.6016, "step": 53800 }, { "epoch": 0.7244818409097019, "grad_norm": 1.282611608505249, "learning_rate": 0.004993543028741726, "loss": 2.5869, "step": 53900 }, { "epoch": 0.7258259630635232, "grad_norm": 0.51445072889328, "learning_rate": 0.004993504894890944, "loss": 2.5856, "step": 54000 }, { "epoch": 0.7258259630635232, "eval_MaskedAccuracy": 0.4248532700853216, "eval_loss": 2.909213066101074, "eval_runtime": 154.7862, "eval_samples_per_second": 410.088, "eval_steps_per_second": 1.602, "step": 54000 }, { "epoch": 0.7271700852173445, "grad_norm": 1.3245320320129395, "learning_rate": 0.004993466648911963, "loss": 2.5915, "step": 54100 }, { "epoch": 0.7285142073711659, "grad_norm": 1.215162754058838, "learning_rate": 0.004993428290806523, "loss": 2.5772, "step": 54200 }, { "epoch": 0.7298583295249872, "grad_norm": 1.2432647943496704, "learning_rate": 0.004993389820576344, "loss": 2.5902, "step": 54300 }, { "epoch": 0.7312024516788086, "grad_norm": 1.581764817237854, "learning_rate": 0.0049933512382231565, "loss": 2.5795, "step": 54400 }, { "epoch": 0.7325465738326299, "grad_norm": 0.9531721472740173, "learning_rate": 0.004993312543748701, "loss": 2.5764, "step": 54500 }, { "epoch": 0.7338906959864513, "grad_norm": 2.677894115447998, "learning_rate": 0.004993273737154721, "loss": 2.5922, "step": 54600 }, { "epoch": 0.7352348181402726, "grad_norm": 0.793094277381897, "learning_rate": 0.00499323481844297, "loss": 2.5841, "step": 54700 }, { "epoch": 0.7365789402940939, "grad_norm": 1.0504934787750244, "learning_rate": 0.0049931957876151924, "loss": 2.5831, "step": 54800 }, { "epoch": 0.7379230624479153, "grad_norm": 1.0326224565505981, "learning_rate": 0.004993156644673158, "loss": 2.5774, "step": 54900 }, { "epoch": 0.7392671846017366, "grad_norm": 0.7911434173583984, "learning_rate": 0.004993117389618626, "loss": 2.5821, "step": 55000 }, { "epoch": 0.7392671846017366, "eval_MaskedAccuracy": 0.42632109483323904, "eval_loss": 2.8994028568267822, "eval_runtime": 160.7388, "eval_samples_per_second": 394.902, "eval_steps_per_second": 1.543, "step": 55000 }, { "epoch": 0.740611306755558, "grad_norm": 0.5436133742332458, "learning_rate": 0.004993078022453364, "loss": 2.5816, "step": 55100 }, { "epoch": 0.7419554289093793, "grad_norm": 0.8503451943397522, "learning_rate": 0.004993038543179143, "loss": 2.5722, "step": 55200 }, { "epoch": 0.7432995510632007, "grad_norm": 1.585143804550171, "learning_rate": 0.004992998951797748, "loss": 2.5707, "step": 55300 }, { "epoch": 0.744643673217022, "grad_norm": 2.5457680225372314, "learning_rate": 0.0049929592483109585, "loss": 2.5768, "step": 55400 }, { "epoch": 0.7459877953708433, "grad_norm": 1.12667977809906, "learning_rate": 0.004992919432720564, "loss": 2.5813, "step": 55500 }, { "epoch": 0.7473319175246647, "grad_norm": 0.685434103012085, "learning_rate": 0.00499287950502836, "loss": 2.5824, "step": 55600 }, { "epoch": 0.748676039678486, "grad_norm": 0.5508724451065063, "learning_rate": 0.0049928394652361445, "loss": 2.5788, "step": 55700 }, { "epoch": 0.7500201618323074, "grad_norm": 1.0351439714431763, "learning_rate": 0.004992799313345724, "loss": 2.5811, "step": 55800 }, { "epoch": 0.7513642839861286, "grad_norm": 0.4489949345588684, "learning_rate": 0.004992759049358903, "loss": 2.5788, "step": 55900 }, { "epoch": 0.75270840613995, "grad_norm": 0.7552504539489746, "learning_rate": 0.0049927186732775025, "loss": 2.5738, "step": 56000 }, { "epoch": 0.75270840613995, "eval_MaskedAccuracy": 0.42679037653842317, "eval_loss": 2.8953652381896973, "eval_runtime": 154.4325, "eval_samples_per_second": 411.027, "eval_steps_per_second": 1.606, "step": 56000 }, { "epoch": 0.7540525282937713, "grad_norm": 1.503426194190979, "learning_rate": 0.0049926781851033376, "loss": 2.5817, "step": 56100 }, { "epoch": 0.7553966504475926, "grad_norm": 0.7938827872276306, "learning_rate": 0.004992637584838231, "loss": 2.5747, "step": 56200 }, { "epoch": 0.756740772601414, "grad_norm": 0.6982958316802979, "learning_rate": 0.004992596872484014, "loss": 2.5803, "step": 56300 }, { "epoch": 0.7580848947552353, "grad_norm": 0.5538859963417053, "learning_rate": 0.004992556048042525, "loss": 2.5873, "step": 56400 }, { "epoch": 0.7594290169090567, "grad_norm": 1.1309832334518433, "learning_rate": 0.004992515111515598, "loss": 2.5732, "step": 56500 }, { "epoch": 0.760773139062878, "grad_norm": 2.124687671661377, "learning_rate": 0.00499247406290508, "loss": 2.582, "step": 56600 }, { "epoch": 0.7621172612166994, "grad_norm": 1.6379998922348022, "learning_rate": 0.004992432902212811, "loss": 2.5806, "step": 56700 }, { "epoch": 0.7634613833705207, "grad_norm": 2.21696400642395, "learning_rate": 0.004992391629440661, "loss": 2.5809, "step": 56800 }, { "epoch": 0.764805505524342, "grad_norm": 0.6755267977714539, "learning_rate": 0.0049923502445904785, "loss": 2.5759, "step": 56900 }, { "epoch": 0.7661496276781634, "grad_norm": 0.5627455711364746, "learning_rate": 0.004992308747664136, "loss": 2.5784, "step": 57000 }, { "epoch": 0.7661496276781634, "eval_MaskedAccuracy": 0.42715101212392975, "eval_loss": 2.889655351638794, "eval_runtime": 156.8451, "eval_samples_per_second": 404.705, "eval_steps_per_second": 1.581, "step": 57000 }, { "epoch": 0.7674937498319847, "grad_norm": 0.4193578064441681, "learning_rate": 0.004992267138663497, "loss": 2.5768, "step": 57100 }, { "epoch": 0.7688378719858061, "grad_norm": 0.4133063554763794, "learning_rate": 0.004992225417590446, "loss": 2.5712, "step": 57200 }, { "epoch": 0.7701819941396274, "grad_norm": 1.6166553497314453, "learning_rate": 0.004992183584446851, "loss": 2.5751, "step": 57300 }, { "epoch": 0.7715261162934487, "grad_norm": 1.9402251243591309, "learning_rate": 0.0049921416392346, "loss": 2.573, "step": 57400 }, { "epoch": 0.7728702384472701, "grad_norm": 1.0368021726608276, "learning_rate": 0.004992099581955586, "loss": 2.5801, "step": 57500 }, { "epoch": 0.7742143606010914, "grad_norm": 0.431902676820755, "learning_rate": 0.004992057412611701, "loss": 2.5776, "step": 57600 }, { "epoch": 0.7755584827549128, "grad_norm": 0.4904190003871918, "learning_rate": 0.004992015131204851, "loss": 2.5792, "step": 57700 }, { "epoch": 0.7769026049087341, "grad_norm": 1.3063716888427734, "learning_rate": 0.004991972737736936, "loss": 2.5654, "step": 57800 }, { "epoch": 0.7782467270625555, "grad_norm": 1.1166818141937256, "learning_rate": 0.004991930232209869, "loss": 2.5694, "step": 57900 }, { "epoch": 0.7795908492163768, "grad_norm": 0.6093880534172058, "learning_rate": 0.004991887614625561, "loss": 2.5768, "step": 58000 }, { "epoch": 0.7795908492163768, "eval_MaskedAccuracy": 0.42766976413111096, "eval_loss": 2.886188507080078, "eval_runtime": 155.0432, "eval_samples_per_second": 409.408, "eval_steps_per_second": 1.6, "step": 58000 }, { "epoch": 0.7809349713701981, "grad_norm": 1.4043593406677246, "learning_rate": 0.00499184488498594, "loss": 2.5692, "step": 58100 }, { "epoch": 0.7822790935240195, "grad_norm": 1.9828555583953857, "learning_rate": 0.0049918020432929274, "loss": 2.5707, "step": 58200 }, { "epoch": 0.7836232156778408, "grad_norm": 1.0599697828292847, "learning_rate": 0.0049917590895484495, "loss": 2.5715, "step": 58300 }, { "epoch": 0.7849673378316622, "grad_norm": 0.5353257656097412, "learning_rate": 0.00499171602375444, "loss": 2.5601, "step": 58400 }, { "epoch": 0.7863114599854835, "grad_norm": 0.45401671528816223, "learning_rate": 0.004991672845912851, "loss": 2.5764, "step": 58500 }, { "epoch": 0.7876555821393049, "grad_norm": 1.2403732538223267, "learning_rate": 0.00499162955602562, "loss": 2.5637, "step": 58600 }, { "epoch": 0.7889997042931262, "grad_norm": 1.7403950691223145, "learning_rate": 0.004991586154094695, "loss": 2.5634, "step": 58700 }, { "epoch": 0.7903438264469475, "grad_norm": 0.8085100650787354, "learning_rate": 0.004991542640122036, "loss": 2.564, "step": 58800 }, { "epoch": 0.7916879486007689, "grad_norm": 0.4583624005317688, "learning_rate": 0.004991499014109603, "loss": 2.5618, "step": 58900 }, { "epoch": 0.7930320707545901, "grad_norm": 1.7312582731246948, "learning_rate": 0.004991455276059368, "loss": 2.5712, "step": 59000 }, { "epoch": 0.7930320707545901, "eval_MaskedAccuracy": 0.4281439150410401, "eval_loss": 2.884404420852661, "eval_runtime": 154.3415, "eval_samples_per_second": 411.27, "eval_steps_per_second": 1.607, "step": 59000 }, { "epoch": 0.7943761929084115, "grad_norm": 1.1742795705795288, "learning_rate": 0.004991411425973289, "loss": 2.5578, "step": 59100 }, { "epoch": 0.7957203150622328, "grad_norm": 0.8881747722625732, "learning_rate": 0.004991367463853345, "loss": 2.5723, "step": 59200 }, { "epoch": 0.7970644372160542, "grad_norm": 1.2915089130401611, "learning_rate": 0.004991323389701527, "loss": 2.565, "step": 59300 }, { "epoch": 0.7984085593698755, "grad_norm": 0.6965966820716858, "learning_rate": 0.004991279203519816, "loss": 2.5619, "step": 59400 }, { "epoch": 0.7997526815236968, "grad_norm": 2.0180699825286865, "learning_rate": 0.004991234905310197, "loss": 2.5639, "step": 59500 }, { "epoch": 0.8010968036775182, "grad_norm": 0.8435118794441223, "learning_rate": 0.004991190495074672, "loss": 2.5578, "step": 59600 }, { "epoch": 0.8024409258313395, "grad_norm": 1.3183516263961792, "learning_rate": 0.004991145972815238, "loss": 2.5658, "step": 59700 }, { "epoch": 0.8037850479851609, "grad_norm": 0.4824967384338379, "learning_rate": 0.004991101338533903, "loss": 2.5641, "step": 59800 }, { "epoch": 0.8051291701389822, "grad_norm": 0.4352557361125946, "learning_rate": 0.004991056592232678, "loss": 2.5634, "step": 59900 }, { "epoch": 0.8064732922928036, "grad_norm": 0.6158259510993958, "learning_rate": 0.004991011733913582, "loss": 2.5596, "step": 60000 }, { "epoch": 0.8064732922928036, "eval_MaskedAccuracy": 0.42826808328062316, "eval_loss": 2.881223440170288, "eval_runtime": 157.4024, "eval_samples_per_second": 403.272, "eval_steps_per_second": 1.576, "step": 60000 }, { "epoch": 0.8078174144466249, "grad_norm": 1.2049239873886108, "learning_rate": 0.004990966763578632, "loss": 2.5647, "step": 60100 }, { "epoch": 0.8091615366004462, "grad_norm": 1.9667131900787354, "learning_rate": 0.0049909216812298604, "loss": 2.5715, "step": 60200 }, { "epoch": 0.8105056587542676, "grad_norm": 1.7864271402359009, "learning_rate": 0.004990876486869296, "loss": 2.5616, "step": 60300 }, { "epoch": 0.8118497809080889, "grad_norm": 0.9698309302330017, "learning_rate": 0.004990831180498973, "loss": 2.5691, "step": 60400 }, { "epoch": 0.8131939030619103, "grad_norm": 0.7363952398300171, "learning_rate": 0.0049907857621209264, "loss": 2.5709, "step": 60500 }, { "epoch": 0.8145380252157316, "grad_norm": 0.45567846298217773, "learning_rate": 0.004990740231737212, "loss": 2.5688, "step": 60600 }, { "epoch": 0.8158821473695529, "grad_norm": 1.8314270973205566, "learning_rate": 0.004990694589349877, "loss": 2.5694, "step": 60700 }, { "epoch": 0.8172262695233743, "grad_norm": 0.7680712342262268, "learning_rate": 0.004990648834960976, "loss": 2.5678, "step": 60800 }, { "epoch": 0.8185703916771956, "grad_norm": 0.8235045075416565, "learning_rate": 0.004990602968572579, "loss": 2.5641, "step": 60900 }, { "epoch": 0.819914513831017, "grad_norm": 2.173553228378296, "learning_rate": 0.004990556990186752, "loss": 2.5631, "step": 61000 }, { "epoch": 0.819914513831017, "eval_MaskedAccuracy": 0.42752038773182294, "eval_loss": 2.8879265785217285, "eval_runtime": 154.4174, "eval_samples_per_second": 411.068, "eval_steps_per_second": 1.606, "step": 61000 }, { "epoch": 0.8212586359848383, "grad_norm": 1.7964410781860352, "learning_rate": 0.004990510899805561, "loss": 2.5645, "step": 61100 }, { "epoch": 0.8226027581386597, "grad_norm": 0.6248696446418762, "learning_rate": 0.004990464697431084, "loss": 2.5554, "step": 61200 }, { "epoch": 0.823946880292481, "grad_norm": 1.9335159063339233, "learning_rate": 0.004990418383065405, "loss": 2.5484, "step": 61300 }, { "epoch": 0.8252910024463023, "grad_norm": 1.169259786605835, "learning_rate": 0.004990371956710612, "loss": 2.5604, "step": 61400 }, { "epoch": 0.8266351246001237, "grad_norm": 1.7953951358795166, "learning_rate": 0.004990325418368784, "loss": 2.5482, "step": 61500 }, { "epoch": 0.827979246753945, "grad_norm": 0.47882091999053955, "learning_rate": 0.004990278768042037, "loss": 2.5603, "step": 61600 }, { "epoch": 0.8293233689077664, "grad_norm": 0.668745219707489, "learning_rate": 0.004990232005732461, "loss": 2.5587, "step": 61700 }, { "epoch": 0.8306674910615877, "grad_norm": 2.7591805458068848, "learning_rate": 0.0049901851314421665, "loss": 2.5525, "step": 61800 }, { "epoch": 0.8320116132154091, "grad_norm": 0.5808740258216858, "learning_rate": 0.004990138145173262, "loss": 2.5544, "step": 61900 }, { "epoch": 0.8333557353692304, "grad_norm": 1.9806839227676392, "learning_rate": 0.00499009104692787, "loss": 2.5569, "step": 62000 }, { "epoch": 0.8333557353692304, "eval_MaskedAccuracy": 0.4281775187262364, "eval_loss": 2.8783788681030273, "eval_runtime": 157.1845, "eval_samples_per_second": 403.831, "eval_steps_per_second": 1.578, "step": 62000 }, { "epoch": 0.8346998575230516, "grad_norm": 0.4611744284629822, "learning_rate": 0.004990043836708112, "loss": 2.5569, "step": 62100 }, { "epoch": 0.836043979676873, "grad_norm": 0.9632844924926758, "learning_rate": 0.00498999651451611, "loss": 2.549, "step": 62200 }, { "epoch": 0.8373881018306943, "grad_norm": 0.49269410967826843, "learning_rate": 0.004989949080353999, "loss": 2.5543, "step": 62300 }, { "epoch": 0.8387322239845157, "grad_norm": 1.0680642127990723, "learning_rate": 0.0049899015342239166, "loss": 2.558, "step": 62400 }, { "epoch": 0.840076346138337, "grad_norm": 1.5859142541885376, "learning_rate": 0.004989853876128006, "loss": 2.5603, "step": 62500 }, { "epoch": 0.8414204682921584, "grad_norm": 1.1798615455627441, "learning_rate": 0.0049898061060684165, "loss": 2.5592, "step": 62600 }, { "epoch": 0.8427645904459797, "grad_norm": 1.3057469129562378, "learning_rate": 0.004989758224047301, "loss": 2.5618, "step": 62700 }, { "epoch": 0.844108712599801, "grad_norm": 0.3981241285800934, "learning_rate": 0.004989710230066817, "loss": 2.5625, "step": 62800 }, { "epoch": 0.8454528347536224, "grad_norm": 1.141242504119873, "learning_rate": 0.00498966212412912, "loss": 2.5575, "step": 62900 }, { "epoch": 0.8467969569074437, "grad_norm": 1.986693024635315, "learning_rate": 0.004989613906236384, "loss": 2.5557, "step": 63000 }, { "epoch": 0.8467969569074437, "eval_MaskedAccuracy": 0.4287118227276102, "eval_loss": 2.8758063316345215, "eval_runtime": 157.2196, "eval_samples_per_second": 403.741, "eval_steps_per_second": 1.577, "step": 63000 }, { "epoch": 0.8481410790612651, "grad_norm": 0.42742684483528137, "learning_rate": 0.0049895655763907785, "loss": 2.5609, "step": 63100 }, { "epoch": 0.8494852012150864, "grad_norm": 0.6005121469497681, "learning_rate": 0.004989517134594481, "loss": 2.5627, "step": 63200 }, { "epoch": 0.8508293233689078, "grad_norm": 1.2548339366912842, "learning_rate": 0.004989468580849676, "loss": 2.5541, "step": 63300 }, { "epoch": 0.8521734455227291, "grad_norm": 1.1956408023834229, "learning_rate": 0.004989419915158556, "loss": 2.5494, "step": 63400 }, { "epoch": 0.8535175676765504, "grad_norm": 1.7097578048706055, "learning_rate": 0.004989371137523307, "loss": 2.5559, "step": 63500 }, { "epoch": 0.8548616898303718, "grad_norm": 0.8344634175300598, "learning_rate": 0.004989322247946124, "loss": 2.5594, "step": 63600 }, { "epoch": 0.8562058119841931, "grad_norm": 1.0530003309249878, "learning_rate": 0.004989273246429215, "loss": 2.548, "step": 63700 }, { "epoch": 0.8575499341380145, "grad_norm": 1.2484129667282104, "learning_rate": 0.004989224132974791, "loss": 2.5514, "step": 63800 }, { "epoch": 0.8588940562918358, "grad_norm": 0.48474428057670593, "learning_rate": 0.004989174907585054, "loss": 2.5607, "step": 63900 }, { "epoch": 0.8602381784456571, "grad_norm": 0.7260443568229675, "learning_rate": 0.004989125570262237, "loss": 2.5607, "step": 64000 }, { "epoch": 0.8602381784456571, "eval_MaskedAccuracy": 0.4298587031625389, "eval_loss": 2.8685121536254883, "eval_runtime": 156.7219, "eval_samples_per_second": 405.023, "eval_steps_per_second": 1.582, "step": 64000 }, { "epoch": 0.8615823005994785, "grad_norm": 1.7127766609191895, "learning_rate": 0.0049890761210085515, "loss": 2.5457, "step": 64100 }, { "epoch": 0.8629264227532998, "grad_norm": 0.9210508465766907, "learning_rate": 0.004989026559826237, "loss": 2.5567, "step": 64200 }, { "epoch": 0.8642705449071212, "grad_norm": 0.5926476120948792, "learning_rate": 0.004988976886717514, "loss": 2.5502, "step": 64300 }, { "epoch": 0.8656146670609425, "grad_norm": 0.8887967467308044, "learning_rate": 0.0049889271016846275, "loss": 2.5511, "step": 64400 }, { "epoch": 0.8669587892147639, "grad_norm": 0.6354972124099731, "learning_rate": 0.0049888772047298165, "loss": 2.5549, "step": 64500 }, { "epoch": 0.8683029113685852, "grad_norm": 0.45211437344551086, "learning_rate": 0.004988827195855339, "loss": 2.5541, "step": 64600 }, { "epoch": 0.8696470335224065, "grad_norm": 1.1961520910263062, "learning_rate": 0.004988777075063434, "loss": 2.549, "step": 64700 }, { "epoch": 0.8709911556762279, "grad_norm": 0.7773739099502563, "learning_rate": 0.004988726842356369, "loss": 2.5453, "step": 64800 }, { "epoch": 0.8723352778300492, "grad_norm": 1.6650779247283936, "learning_rate": 0.004988676497736411, "loss": 2.5591, "step": 64900 }, { "epoch": 0.8736793999838706, "grad_norm": 3.7934086322784424, "learning_rate": 0.004988626041205816, "loss": 2.5477, "step": 65000 }, { "epoch": 0.8736793999838706, "eval_MaskedAccuracy": 0.42935040461530544, "eval_loss": 2.8740768432617188, "eval_runtime": 157.0461, "eval_samples_per_second": 404.187, "eval_steps_per_second": 1.579, "step": 65000 }, { "epoch": 0.8750235221376919, "grad_norm": 0.37112095952033997, "learning_rate": 0.004988575472766864, "loss": 2.5442, "step": 65100 }, { "epoch": 0.8763676442915133, "grad_norm": 0.4250340461730957, "learning_rate": 0.0049885247924218385, "loss": 2.5397, "step": 65200 }, { "epoch": 0.8777117664453346, "grad_norm": 0.42517802119255066, "learning_rate": 0.004988474000173024, "loss": 2.5453, "step": 65300 }, { "epoch": 0.8790558885991558, "grad_norm": 2.0652735233306885, "learning_rate": 0.0049884230960227, "loss": 2.5497, "step": 65400 }, { "epoch": 0.8804000107529772, "grad_norm": 0.6342276930809021, "learning_rate": 0.0049883720799731605, "loss": 2.5378, "step": 65500 }, { "epoch": 0.8817441329067985, "grad_norm": 1.0406826734542847, "learning_rate": 0.0049883209520267075, "loss": 2.5453, "step": 65600 }, { "epoch": 0.8830882550606199, "grad_norm": 1.4055812358856201, "learning_rate": 0.00498826971218565, "loss": 2.5437, "step": 65700 }, { "epoch": 0.8844323772144412, "grad_norm": 2.7914650440216064, "learning_rate": 0.004988218360452281, "loss": 2.5512, "step": 65800 }, { "epoch": 0.8857764993682626, "grad_norm": 0.9103912711143494, "learning_rate": 0.004988166896828934, "loss": 2.5466, "step": 65900 }, { "epoch": 0.8871206215220839, "grad_norm": 1.0117913484573364, "learning_rate": 0.004988115321317916, "loss": 2.5452, "step": 66000 }, { "epoch": 0.8871206215220839, "eval_MaskedAccuracy": 0.4316935610840671, "eval_loss": 2.8568239212036133, "eval_runtime": 155.5022, "eval_samples_per_second": 408.2, "eval_steps_per_second": 1.595, "step": 66000 }, { "epoch": 0.8884647436759052, "grad_norm": 0.934119701385498, "learning_rate": 0.004988063633921554, "loss": 2.5407, "step": 66100 }, { "epoch": 0.8898088658297266, "grad_norm": 0.5075318217277527, "learning_rate": 0.004988011834642176, "loss": 2.5387, "step": 66200 }, { "epoch": 0.8911529879835479, "grad_norm": 2.349472761154175, "learning_rate": 0.004987959923482121, "loss": 2.5453, "step": 66300 }, { "epoch": 0.8924971101373693, "grad_norm": 1.2346925735473633, "learning_rate": 0.004987907900443723, "loss": 2.5433, "step": 66400 }, { "epoch": 0.8938412322911906, "grad_norm": 0.9349079132080078, "learning_rate": 0.004987855765529321, "loss": 2.5386, "step": 66500 }, { "epoch": 0.895185354445012, "grad_norm": 2.3298845291137695, "learning_rate": 0.00498780351874127, "loss": 2.5579, "step": 66600 }, { "epoch": 0.8965294765988333, "grad_norm": 0.7276595830917358, "learning_rate": 0.0049877511600819255, "loss": 2.547, "step": 66700 }, { "epoch": 0.8978735987526546, "grad_norm": 0.9799304604530334, "learning_rate": 0.004987698689553648, "loss": 2.5479, "step": 66800 }, { "epoch": 0.899217720906476, "grad_norm": 4.606154918670654, "learning_rate": 0.0049876461071588, "loss": 2.5486, "step": 66900 }, { "epoch": 0.9005618430602973, "grad_norm": 0.8625683784484863, "learning_rate": 0.004987593412899746, "loss": 2.5448, "step": 67000 }, { "epoch": 0.9005618430602973, "eval_MaskedAccuracy": 0.431047719967855, "eval_loss": 2.8602442741394043, "eval_runtime": 154.394, "eval_samples_per_second": 411.13, "eval_steps_per_second": 1.606, "step": 67000 }, { "epoch": 0.9019059652141187, "grad_norm": 0.6021907925605774, "learning_rate": 0.0049875406067788645, "loss": 2.5473, "step": 67100 }, { "epoch": 0.90325008736794, "grad_norm": 1.0424331426620483, "learning_rate": 0.004987487688798529, "loss": 2.5401, "step": 67200 }, { "epoch": 0.9045942095217613, "grad_norm": 0.7000874280929565, "learning_rate": 0.004987434658961136, "loss": 2.5403, "step": 67300 }, { "epoch": 0.9059383316755827, "grad_norm": 0.437077134847641, "learning_rate": 0.004987381517269073, "loss": 2.5457, "step": 67400 }, { "epoch": 0.907282453829404, "grad_norm": 0.5734995007514954, "learning_rate": 0.004987328263724719, "loss": 2.5496, "step": 67500 }, { "epoch": 0.9086265759832254, "grad_norm": 0.9793594479560852, "learning_rate": 0.00498727489833049, "loss": 2.5374, "step": 67600 }, { "epoch": 0.9099706981370467, "grad_norm": 1.336166262626648, "learning_rate": 0.004987221421088781, "loss": 2.5348, "step": 67700 }, { "epoch": 0.9113148202908681, "grad_norm": 0.8772963881492615, "learning_rate": 0.00498716783200201, "loss": 2.5453, "step": 67800 }, { "epoch": 0.9126589424446894, "grad_norm": 1.0549404621124268, "learning_rate": 0.0049871141310725816, "loss": 2.5394, "step": 67900 }, { "epoch": 0.9140030645985107, "grad_norm": 0.6671266555786133, "learning_rate": 0.004987060318302928, "loss": 2.5408, "step": 68000 }, { "epoch": 0.9140030645985107, "eval_MaskedAccuracy": 0.43042410070670717, "eval_loss": 2.864036798477173, "eval_runtime": 158.2908, "eval_samples_per_second": 401.009, "eval_steps_per_second": 1.567, "step": 68000 }, { "epoch": 0.9153471867523321, "grad_norm": 1.8076815605163574, "learning_rate": 0.004987006393695469, "loss": 2.5349, "step": 68100 }, { "epoch": 0.9166913089061534, "grad_norm": 0.9832874536514282, "learning_rate": 0.004986952357252627, "loss": 2.5372, "step": 68200 }, { "epoch": 0.9180354310599748, "grad_norm": 0.6802740693092346, "learning_rate": 0.004986898208976844, "loss": 2.538, "step": 68300 }, { "epoch": 0.919379553213796, "grad_norm": 1.3755671977996826, "learning_rate": 0.004986843948870563, "loss": 2.5427, "step": 68400 }, { "epoch": 0.9207236753676175, "grad_norm": 0.6860373020172119, "learning_rate": 0.0049867895769362194, "loss": 2.5456, "step": 68500 }, { "epoch": 0.9220677975214387, "grad_norm": 0.37920355796813965, "learning_rate": 0.004986735093176264, "loss": 2.5388, "step": 68600 }, { "epoch": 0.92341191967526, "grad_norm": 0.48161619901657104, "learning_rate": 0.004986680497593163, "loss": 2.5482, "step": 68700 }, { "epoch": 0.9247560418290814, "grad_norm": 1.5831481218338013, "learning_rate": 0.0049866257901893625, "loss": 2.533, "step": 68800 }, { "epoch": 0.9261001639829027, "grad_norm": 1.874329924583435, "learning_rate": 0.004986570970967331, "loss": 2.5432, "step": 68900 }, { "epoch": 0.9274442861367241, "grad_norm": 0.6668677926063538, "learning_rate": 0.004986516039929541, "loss": 2.5458, "step": 69000 }, { "epoch": 0.9274442861367241, "eval_MaskedAccuracy": 0.4320338287315044, "eval_loss": 2.851020336151123, "eval_runtime": 154.6645, "eval_samples_per_second": 410.411, "eval_steps_per_second": 1.603, "step": 69000 }, { "epoch": 0.9287884082905454, "grad_norm": 0.850222647190094, "learning_rate": 0.004986460997078469, "loss": 2.5411, "step": 69100 }, { "epoch": 0.9301325304443668, "grad_norm": 1.386954665184021, "learning_rate": 0.004986405842416592, "loss": 2.5424, "step": 69200 }, { "epoch": 0.9314766525981881, "grad_norm": 0.7696434259414673, "learning_rate": 0.0049863505759464, "loss": 2.5336, "step": 69300 }, { "epoch": 0.9328207747520094, "grad_norm": 0.8896027207374573, "learning_rate": 0.004986295197670379, "loss": 2.5325, "step": 69400 }, { "epoch": 0.9341648969058308, "grad_norm": 1.7881613969802856, "learning_rate": 0.004986239707591028, "loss": 2.5402, "step": 69500 }, { "epoch": 0.9355090190596521, "grad_norm": 0.5906102061271667, "learning_rate": 0.0049861841057108405, "loss": 2.542, "step": 69600 }, { "epoch": 0.9368531412134735, "grad_norm": 0.7546370625495911, "learning_rate": 0.004986128392032327, "loss": 2.5392, "step": 69700 }, { "epoch": 0.9381972633672948, "grad_norm": 0.7366216778755188, "learning_rate": 0.004986072566557999, "loss": 2.5316, "step": 69800 }, { "epoch": 0.9395413855211162, "grad_norm": 1.4292287826538086, "learning_rate": 0.00498601662929037, "loss": 2.5371, "step": 69900 }, { "epoch": 0.9408855076749375, "grad_norm": 0.9893214106559753, "learning_rate": 0.00498596058023197, "loss": 2.5313, "step": 70000 }, { "epoch": 0.9408855076749375, "eval_MaskedAccuracy": 0.4319902861219465, "eval_loss": 2.853639602661133, "eval_runtime": 157.3354, "eval_samples_per_second": 403.444, "eval_steps_per_second": 1.576, "step": 70000 }, { "epoch": 0.9422296298287588, "grad_norm": 1.4988006353378296, "learning_rate": 0.004985904419385302, "loss": 2.5369, "step": 70100 }, { "epoch": 0.9435737519825802, "grad_norm": 1.3903696537017822, "learning_rate": 0.004985848146752917, "loss": 2.5312, "step": 70200 }, { "epoch": 0.9449178741364015, "grad_norm": 1.741083025932312, "learning_rate": 0.004985791762337335, "loss": 2.5343, "step": 70300 }, { "epoch": 0.9462619962902229, "grad_norm": 0.43283969163894653, "learning_rate": 0.00498573526614111, "loss": 2.5308, "step": 70400 }, { "epoch": 0.9476061184440442, "grad_norm": 0.6915990114212036, "learning_rate": 0.004985678658166784, "loss": 2.5316, "step": 70500 }, { "epoch": 0.9489502405978655, "grad_norm": 3.1753523349761963, "learning_rate": 0.004985621938416906, "loss": 2.5376, "step": 70600 }, { "epoch": 0.9502943627516869, "grad_norm": 1.0071120262145996, "learning_rate": 0.0049855651068940335, "loss": 2.5395, "step": 70700 }, { "epoch": 0.9516384849055082, "grad_norm": 3.022740125656128, "learning_rate": 0.004985508163600716, "loss": 2.5267, "step": 70800 }, { "epoch": 0.9529826070593296, "grad_norm": 2.2657601833343506, "learning_rate": 0.004985451108539533, "loss": 2.5279, "step": 70900 }, { "epoch": 0.9543267292131509, "grad_norm": 1.0798835754394531, "learning_rate": 0.004985393941713058, "loss": 2.5346, "step": 71000 }, { "epoch": 0.9543267292131509, "eval_MaskedAccuracy": 0.43291381575543303, "eval_loss": 2.847073554992676, "eval_runtime": 157.3815, "eval_samples_per_second": 403.326, "eval_steps_per_second": 1.576, "step": 71000 }, { "epoch": 0.9556708513669723, "grad_norm": 0.8452098369598389, "learning_rate": 0.004985336663123854, "loss": 2.5344, "step": 71100 }, { "epoch": 0.9570149735207936, "grad_norm": 1.136425256729126, "learning_rate": 0.0049852792727745135, "loss": 2.5373, "step": 71200 }, { "epoch": 0.9583590956746149, "grad_norm": 0.421425461769104, "learning_rate": 0.004985221770667615, "loss": 2.5365, "step": 71300 }, { "epoch": 0.9597032178284363, "grad_norm": 1.0452524423599243, "learning_rate": 0.004985164156805749, "loss": 2.5405, "step": 71400 }, { "epoch": 0.9610473399822576, "grad_norm": 0.6131758689880371, "learning_rate": 0.004985106431191513, "loss": 2.5379, "step": 71500 }, { "epoch": 0.962391462136079, "grad_norm": 0.5693802833557129, "learning_rate": 0.004985048593827507, "loss": 2.5415, "step": 71600 }, { "epoch": 0.9637355842899002, "grad_norm": 0.576070249080658, "learning_rate": 0.004984990644716343, "loss": 2.5258, "step": 71700 }, { "epoch": 0.9650797064437217, "grad_norm": 0.8983374834060669, "learning_rate": 0.004984932583860627, "loss": 2.5222, "step": 71800 }, { "epoch": 0.9664238285975429, "grad_norm": 1.3723649978637695, "learning_rate": 0.004984874411262972, "loss": 2.5328, "step": 71900 }, { "epoch": 0.9677679507513642, "grad_norm": 0.638632595539093, "learning_rate": 0.004984816126926011, "loss": 2.5291, "step": 72000 }, { "epoch": 0.9677679507513642, "eval_MaskedAccuracy": 0.4332407392153184, "eval_loss": 2.8442609310150146, "eval_runtime": 157.5641, "eval_samples_per_second": 402.858, "eval_steps_per_second": 1.574, "step": 72000 }, { "epoch": 0.9691120729051856, "grad_norm": 0.999791145324707, "learning_rate": 0.0049847577308523554, "loss": 2.5289, "step": 72100 }, { "epoch": 0.9704561950590069, "grad_norm": 0.9468411803245544, "learning_rate": 0.0049846992230446465, "loss": 2.5244, "step": 72200 }, { "epoch": 0.9718003172128283, "grad_norm": 1.2006036043167114, "learning_rate": 0.0049846406035055114, "loss": 2.5294, "step": 72300 }, { "epoch": 0.9731444393666496, "grad_norm": 1.1461031436920166, "learning_rate": 0.0049845818722376025, "loss": 2.5251, "step": 72400 }, { "epoch": 0.974488561520471, "grad_norm": 1.3583283424377441, "learning_rate": 0.004984523029243558, "loss": 2.5401, "step": 72500 }, { "epoch": 0.9758326836742923, "grad_norm": 0.8848266005516052, "learning_rate": 0.004984464074526038, "loss": 2.534, "step": 72600 }, { "epoch": 0.9771768058281136, "grad_norm": 0.8786698579788208, "learning_rate": 0.004984405008087696, "loss": 2.5268, "step": 72700 }, { "epoch": 0.978520927981935, "grad_norm": 1.2354623079299927, "learning_rate": 0.004984345829931181, "loss": 2.5235, "step": 72800 }, { "epoch": 0.9798650501357563, "grad_norm": 0.44590994715690613, "learning_rate": 0.004984286540059167, "loss": 2.5249, "step": 72900 }, { "epoch": 0.9812091722895777, "grad_norm": 1.5958350896835327, "learning_rate": 0.004984227138474335, "loss": 2.5252, "step": 73000 }, { "epoch": 0.9812091722895777, "eval_MaskedAccuracy": 0.43342773457995387, "eval_loss": 2.8439483642578125, "eval_runtime": 156.2562, "eval_samples_per_second": 406.23, "eval_steps_per_second": 1.587, "step": 73000 }, { "epoch": 0.982553294443399, "grad_norm": 0.9275982975959778, "learning_rate": 0.004984167625179348, "loss": 2.5338, "step": 73100 }, { "epoch": 0.9838974165972204, "grad_norm": 1.3247750997543335, "learning_rate": 0.0049841080001768985, "loss": 2.5258, "step": 73200 }, { "epoch": 0.9852415387510417, "grad_norm": 0.8629142642021179, "learning_rate": 0.004984048263469671, "loss": 2.5277, "step": 73300 }, { "epoch": 0.986585660904863, "grad_norm": 1.2178809642791748, "learning_rate": 0.004983988415060354, "loss": 2.5345, "step": 73400 }, { "epoch": 0.9879297830586844, "grad_norm": 1.3900634050369263, "learning_rate": 0.004983928454951644, "loss": 2.5164, "step": 73500 }, { "epoch": 0.9892739052125057, "grad_norm": 1.1817967891693115, "learning_rate": 0.00498386838314624, "loss": 2.5267, "step": 73600 }, { "epoch": 0.9906180273663271, "grad_norm": 0.8701839447021484, "learning_rate": 0.004983808199646853, "loss": 2.5238, "step": 73700 }, { "epoch": 0.9919621495201484, "grad_norm": 1.1527352333068848, "learning_rate": 0.0049837479044561985, "loss": 2.5293, "step": 73800 }, { "epoch": 0.9933062716739697, "grad_norm": 1.1712994575500488, "learning_rate": 0.004983687497576983, "loss": 2.5292, "step": 73900 }, { "epoch": 0.9946503938277911, "grad_norm": 1.6270473003387451, "learning_rate": 0.004983626979011935, "loss": 2.5288, "step": 74000 }, { "epoch": 0.9946503938277911, "eval_MaskedAccuracy": 0.432422431765529, "eval_loss": 2.8484365940093994, "eval_runtime": 154.8637, "eval_samples_per_second": 409.883, "eval_steps_per_second": 1.601, "step": 74000 }, { "epoch": 0.9959945159816124, "grad_norm": 0.4700554609298706, "learning_rate": 0.0049835663487637835, "loss": 2.5225, "step": 74100 }, { "epoch": 0.9973386381354338, "grad_norm": 5.065925121307373, "learning_rate": 0.004983505606835256, "loss": 2.5303, "step": 74200 }, { "epoch": 0.9986827602892551, "grad_norm": 0.3857990503311157, "learning_rate": 0.004983444753229089, "loss": 2.5351, "step": 74300 }, { "epoch": 1.0000268824430765, "grad_norm": 3.9997522830963135, "learning_rate": 0.00498338378794803, "loss": 2.5212, "step": 74400 }, { "epoch": 1.0013710045968978, "grad_norm": 1.400756597518921, "learning_rate": 0.00498332271099482, "loss": 2.52, "step": 74500 }, { "epoch": 1.002715126750719, "grad_norm": 0.6085138916969299, "learning_rate": 0.004983261522372212, "loss": 2.526, "step": 74600 }, { "epoch": 1.0040592489045403, "grad_norm": 1.8259930610656738, "learning_rate": 0.004983200222082968, "loss": 2.5176, "step": 74700 }, { "epoch": 1.0054033710583619, "grad_norm": 2.1580281257629395, "learning_rate": 0.004983138810129848, "loss": 2.5273, "step": 74800 }, { "epoch": 1.0067474932121832, "grad_norm": 0.8419142365455627, "learning_rate": 0.004983077286515609, "loss": 2.5232, "step": 74900 }, { "epoch": 1.0080916153660044, "grad_norm": 1.1031572818756104, "learning_rate": 0.004983015651243041, "loss": 2.5381, "step": 75000 }, { "epoch": 1.0080916153660044, "eval_MaskedAccuracy": 0.43352066355967, "eval_loss": 2.838632822036743, "eval_runtime": 154.6683, "eval_samples_per_second": 410.401, "eval_steps_per_second": 1.603, "step": 75000 }, { "epoch": 1.0094357375198257, "grad_norm": 0.4656729996204376, "learning_rate": 0.004982953904314914, "loss": 2.5295, "step": 75100 }, { "epoch": 1.0107798596736473, "grad_norm": 0.7429810762405396, "learning_rate": 0.004982892045734002, "loss": 2.5279, "step": 75200 }, { "epoch": 1.0121239818274685, "grad_norm": 0.36424189805984497, "learning_rate": 0.004982830075503102, "loss": 2.5211, "step": 75300 }, { "epoch": 1.0134681039812898, "grad_norm": 0.45235106348991394, "learning_rate": 0.004982767993625003, "loss": 2.5254, "step": 75400 }, { "epoch": 1.0148122261351111, "grad_norm": 0.6336104273796082, "learning_rate": 0.0049827058001024995, "loss": 2.5318, "step": 75500 }, { "epoch": 1.0161563482889324, "grad_norm": 0.5706799030303955, "learning_rate": 0.004982643494938397, "loss": 2.5267, "step": 75600 }, { "epoch": 1.017500470442754, "grad_norm": 0.5101211667060852, "learning_rate": 0.004982581078135498, "loss": 2.5239, "step": 75700 }, { "epoch": 1.0188445925965752, "grad_norm": 0.6169567108154297, "learning_rate": 0.004982518549696623, "loss": 2.5309, "step": 75800 }, { "epoch": 1.0201887147503965, "grad_norm": 1.594337821006775, "learning_rate": 0.004982455909624589, "loss": 2.5253, "step": 75900 }, { "epoch": 1.0215328369042178, "grad_norm": 1.7339344024658203, "learning_rate": 0.004982393157922211, "loss": 2.5248, "step": 76000 }, { "epoch": 1.0215328369042178, "eval_MaskedAccuracy": 0.43333633554622425, "eval_loss": 2.8395068645477295, "eval_runtime": 155.9239, "eval_samples_per_second": 407.096, "eval_steps_per_second": 1.591, "step": 76000 }, { "epoch": 1.022876959058039, "grad_norm": 0.3945859968662262, "learning_rate": 0.00498233029459232, "loss": 2.52, "step": 76100 }, { "epoch": 1.0242210812118606, "grad_norm": 0.9986801743507385, "learning_rate": 0.00498226731963775, "loss": 2.5088, "step": 76200 }, { "epoch": 1.025565203365682, "grad_norm": 0.9021244049072266, "learning_rate": 0.0049822042330613416, "loss": 2.5183, "step": 76300 }, { "epoch": 1.0269093255195032, "grad_norm": 0.3702966272830963, "learning_rate": 0.004982141034865935, "loss": 2.5185, "step": 76400 }, { "epoch": 1.0282534476733245, "grad_norm": 1.010032057762146, "learning_rate": 0.004982077725054371, "loss": 2.5323, "step": 76500 }, { "epoch": 1.0295975698271458, "grad_norm": 3.7542576789855957, "learning_rate": 0.004982014303629503, "loss": 2.5153, "step": 76600 }, { "epoch": 1.0309416919809673, "grad_norm": 0.729006826877594, "learning_rate": 0.0049819507705941984, "loss": 2.5259, "step": 76700 }, { "epoch": 1.0322858141347886, "grad_norm": 0.475752592086792, "learning_rate": 0.0049818871259513136, "loss": 2.5191, "step": 76800 }, { "epoch": 1.0336299362886099, "grad_norm": 1.1220066547393799, "learning_rate": 0.004981823369703716, "loss": 2.5177, "step": 76900 }, { "epoch": 1.0349740584424312, "grad_norm": 0.5770747661590576, "learning_rate": 0.004981759501854277, "loss": 2.5298, "step": 77000 }, { "epoch": 1.0349740584424312, "eval_MaskedAccuracy": 0.4340570336735209, "eval_loss": 2.8367176055908203, "eval_runtime": 156.2265, "eval_samples_per_second": 406.307, "eval_steps_per_second": 1.587, "step": 77000 }, { "epoch": 1.0363181805962527, "grad_norm": 0.4791591763496399, "learning_rate": 0.004981695522405884, "loss": 2.5189, "step": 77100 }, { "epoch": 1.037662302750074, "grad_norm": 1.2197085618972778, "learning_rate": 0.00498163143136141, "loss": 2.5181, "step": 77200 }, { "epoch": 1.0390064249038953, "grad_norm": 1.1560014486312866, "learning_rate": 0.004981567228723745, "loss": 2.517, "step": 77300 }, { "epoch": 1.0403505470577166, "grad_norm": 0.5016676783561707, "learning_rate": 0.004981502914495785, "loss": 2.5196, "step": 77400 }, { "epoch": 1.0416946692115379, "grad_norm": 1.4357668161392212, "learning_rate": 0.004981438488680427, "loss": 2.5121, "step": 77500 }, { "epoch": 1.0430387913653594, "grad_norm": 1.0016825199127197, "learning_rate": 0.004981373951280573, "loss": 2.5197, "step": 77600 }, { "epoch": 1.0443829135191807, "grad_norm": 5.918942451477051, "learning_rate": 0.004981309302299129, "loss": 2.5243, "step": 77700 }, { "epoch": 1.045727035673002, "grad_norm": 1.207068681716919, "learning_rate": 0.004981244541739009, "loss": 2.5257, "step": 77800 }, { "epoch": 1.0470711578268233, "grad_norm": 0.649711012840271, "learning_rate": 0.004981179669603129, "loss": 2.5274, "step": 77900 }, { "epoch": 1.0484152799806445, "grad_norm": 1.0089892148971558, "learning_rate": 0.004981114685894418, "loss": 2.5166, "step": 78000 }, { "epoch": 1.0484152799806445, "eval_MaskedAccuracy": 0.4343209398198679, "eval_loss": 2.835813045501709, "eval_runtime": 154.6977, "eval_samples_per_second": 410.323, "eval_steps_per_second": 1.603, "step": 78000 }, { "epoch": 1.049759402134466, "grad_norm": 0.8486099243164062, "learning_rate": 0.0049810495906158, "loss": 2.5232, "step": 78100 }, { "epoch": 1.0511035242882873, "grad_norm": 0.6109094023704529, "learning_rate": 0.004980984383770206, "loss": 2.5207, "step": 78200 }, { "epoch": 1.0524476464421086, "grad_norm": 0.463224858045578, "learning_rate": 0.004980919065360583, "loss": 2.5198, "step": 78300 }, { "epoch": 1.05379176859593, "grad_norm": 0.5557699799537659, "learning_rate": 0.004980853635389861, "loss": 2.5235, "step": 78400 }, { "epoch": 1.0551358907497512, "grad_norm": 4.43305778503418, "learning_rate": 0.004980788093860997, "loss": 2.5133, "step": 78500 }, { "epoch": 1.0564800129035727, "grad_norm": 0.5371536612510681, "learning_rate": 0.0049807224407769435, "loss": 2.5208, "step": 78600 }, { "epoch": 1.057824135057394, "grad_norm": 1.074479341506958, "learning_rate": 0.004980656676140655, "loss": 2.5064, "step": 78700 }, { "epoch": 1.0591682572112153, "grad_norm": 2.650197744369507, "learning_rate": 0.004980590799955096, "loss": 2.523, "step": 78800 }, { "epoch": 1.0605123793650366, "grad_norm": 2.4881539344787598, "learning_rate": 0.004980524812223236, "loss": 2.5181, "step": 78900 }, { "epoch": 1.0618565015188581, "grad_norm": 1.3180015087127686, "learning_rate": 0.004980458712948045, "loss": 2.5122, "step": 79000 }, { "epoch": 1.0618565015188581, "eval_MaskedAccuracy": 0.4343943324864396, "eval_loss": 2.8312034606933594, "eval_runtime": 155.8402, "eval_samples_per_second": 407.315, "eval_steps_per_second": 1.591, "step": 79000 }, { "epoch": 1.0632006236726794, "grad_norm": 0.38148483633995056, "learning_rate": 0.004980392502132505, "loss": 2.5165, "step": 79100 }, { "epoch": 1.0645447458265007, "grad_norm": 0.5590552091598511, "learning_rate": 0.004980326179779604, "loss": 2.5068, "step": 79200 }, { "epoch": 1.065888867980322, "grad_norm": 0.4895831346511841, "learning_rate": 0.004980259745892328, "loss": 2.5226, "step": 79300 }, { "epoch": 1.0672329901341433, "grad_norm": 0.5289179086685181, "learning_rate": 0.004980193200473662, "loss": 2.5117, "step": 79400 }, { "epoch": 1.0685771122879648, "grad_norm": 1.0307695865631104, "learning_rate": 0.004980126543526612, "loss": 2.5128, "step": 79500 }, { "epoch": 1.069921234441786, "grad_norm": 1.9092541933059692, "learning_rate": 0.004980059775054183, "loss": 2.5241, "step": 79600 }, { "epoch": 1.0712653565956074, "grad_norm": 0.8294230103492737, "learning_rate": 0.004979992895059378, "loss": 2.5195, "step": 79700 }, { "epoch": 1.0726094787494287, "grad_norm": 3.4225094318389893, "learning_rate": 0.004979925903545211, "loss": 2.517, "step": 79800 }, { "epoch": 1.0739536009032502, "grad_norm": 0.4758341312408447, "learning_rate": 0.004979858800514705, "loss": 2.5176, "step": 79900 }, { "epoch": 1.0752977230570715, "grad_norm": 0.5281692147254944, "learning_rate": 0.00497979158597088, "loss": 2.522, "step": 80000 }, { "epoch": 1.0752977230570715, "eval_MaskedAccuracy": 0.4351065125485484, "eval_loss": 2.828768491744995, "eval_runtime": 155.597, "eval_samples_per_second": 407.951, "eval_steps_per_second": 1.594, "step": 80000 }, { "epoch": 1.0766418452108928, "grad_norm": 0.611584484577179, "learning_rate": 0.004979724259916765, "loss": 2.5146, "step": 80100 }, { "epoch": 1.077985967364714, "grad_norm": 0.5003161430358887, "learning_rate": 0.0049796568223553946, "loss": 2.519, "step": 80200 }, { "epoch": 1.0793300895185354, "grad_norm": 0.8842087984085083, "learning_rate": 0.004979589273289808, "loss": 2.5081, "step": 80300 }, { "epoch": 1.0806742116723567, "grad_norm": 0.86569744348526, "learning_rate": 0.004979521612723048, "loss": 2.5062, "step": 80400 }, { "epoch": 1.0820183338261782, "grad_norm": 0.7057967782020569, "learning_rate": 0.004979453840658162, "loss": 2.5149, "step": 80500 }, { "epoch": 1.0833624559799995, "grad_norm": 0.7422287464141846, "learning_rate": 0.004979385957098205, "loss": 2.5153, "step": 80600 }, { "epoch": 1.0847065781338208, "grad_norm": 0.6432771682739258, "learning_rate": 0.004979317962046237, "loss": 2.5157, "step": 80700 }, { "epoch": 1.086050700287642, "grad_norm": 0.5962923169136047, "learning_rate": 0.004979249855505322, "loss": 2.5068, "step": 80800 }, { "epoch": 1.0873948224414636, "grad_norm": 0.939208984375, "learning_rate": 0.004979181637478523, "loss": 2.5066, "step": 80900 }, { "epoch": 1.0887389445952849, "grad_norm": 0.3996613025665283, "learning_rate": 0.004979113307968915, "loss": 2.5086, "step": 81000 }, { "epoch": 1.0887389445952849, "eval_MaskedAccuracy": 0.4360474147201799, "eval_loss": 2.824847936630249, "eval_runtime": 156.0397, "eval_samples_per_second": 406.794, "eval_steps_per_second": 1.589, "step": 81000 }, { "epoch": 1.0900830667491062, "grad_norm": 0.3728293478488922, "learning_rate": 0.004979044866979578, "loss": 2.5134, "step": 81100 }, { "epoch": 1.0914271889029274, "grad_norm": 1.5539531707763672, "learning_rate": 0.0049789763145136005, "loss": 2.5235, "step": 81200 }, { "epoch": 1.0927713110567487, "grad_norm": 1.7607640027999878, "learning_rate": 0.004978907650574065, "loss": 2.5174, "step": 81300 }, { "epoch": 1.0941154332105703, "grad_norm": 1.1798290014266968, "learning_rate": 0.004978838875164072, "loss": 2.5081, "step": 81400 }, { "epoch": 1.0954595553643915, "grad_norm": 0.4024448096752167, "learning_rate": 0.00497876998828672, "loss": 2.5172, "step": 81500 }, { "epoch": 1.0968036775182128, "grad_norm": 0.5295103788375854, "learning_rate": 0.004978700989945107, "loss": 2.5015, "step": 81600 }, { "epoch": 1.0981477996720341, "grad_norm": 0.8088193535804749, "learning_rate": 0.0049786318801423425, "loss": 2.508, "step": 81700 }, { "epoch": 1.0994919218258556, "grad_norm": 0.8756518363952637, "learning_rate": 0.004978562658881545, "loss": 2.5063, "step": 81800 }, { "epoch": 1.100836043979677, "grad_norm": 4.078980922698975, "learning_rate": 0.004978493326165831, "loss": 2.5063, "step": 81900 }, { "epoch": 1.1021801661334982, "grad_norm": 1.339497685432434, "learning_rate": 0.0049784238819983295, "loss": 2.5074, "step": 82000 }, { "epoch": 1.1021801661334982, "eval_MaskedAccuracy": 0.43618356871003144, "eval_loss": 2.82234525680542, "eval_runtime": 156.3995, "eval_samples_per_second": 405.858, "eval_steps_per_second": 1.586, "step": 82000 }, { "epoch": 1.1035242882873195, "grad_norm": 1.6204960346221924, "learning_rate": 0.004978354326382158, "loss": 2.5152, "step": 82100 }, { "epoch": 1.1048684104411408, "grad_norm": 0.4489379823207855, "learning_rate": 0.00497828465932046, "loss": 2.5102, "step": 82200 }, { "epoch": 1.1062125325949623, "grad_norm": 1.3593122959136963, "learning_rate": 0.004978214880816373, "loss": 2.5025, "step": 82300 }, { "epoch": 1.1075566547487836, "grad_norm": 0.4765743017196655, "learning_rate": 0.004978144990873039, "loss": 2.5071, "step": 82400 }, { "epoch": 1.108900776902605, "grad_norm": 1.4956125020980835, "learning_rate": 0.004978074989493609, "loss": 2.5068, "step": 82500 }, { "epoch": 1.1102448990564262, "grad_norm": 0.820233941078186, "learning_rate": 0.004978004876681244, "loss": 2.5143, "step": 82600 }, { "epoch": 1.1115890212102475, "grad_norm": 0.39537087082862854, "learning_rate": 0.004977934652439093, "loss": 2.5172, "step": 82700 }, { "epoch": 1.112933143364069, "grad_norm": 2.4342026710510254, "learning_rate": 0.004977864316770319, "loss": 2.5119, "step": 82800 }, { "epoch": 1.1142772655178903, "grad_norm": 1.4095677137374878, "learning_rate": 0.004977793869678101, "loss": 2.5082, "step": 82900 }, { "epoch": 1.1156213876717116, "grad_norm": 0.6153007745742798, "learning_rate": 0.004977723311165601, "loss": 2.5059, "step": 83000 }, { "epoch": 1.1156213876717116, "eval_MaskedAccuracy": 0.43704451908857084, "eval_loss": 2.815913200378418, "eval_runtime": 155.698, "eval_samples_per_second": 407.687, "eval_steps_per_second": 1.593, "step": 83000 }, { "epoch": 1.1169655098255329, "grad_norm": 0.7392873764038086, "learning_rate": 0.004977652641236012, "loss": 2.5019, "step": 83100 }, { "epoch": 1.1183096319793542, "grad_norm": 2.7315380573272705, "learning_rate": 0.004977581859892506, "loss": 2.5046, "step": 83200 }, { "epoch": 1.1196537541331757, "grad_norm": 0.5851826667785645, "learning_rate": 0.0049775109671382715, "loss": 2.5031, "step": 83300 }, { "epoch": 1.120997876286997, "grad_norm": 0.34558480978012085, "learning_rate": 0.004977439962976512, "loss": 2.5129, "step": 83400 }, { "epoch": 1.1223419984408183, "grad_norm": 0.9053849577903748, "learning_rate": 0.004977368847410425, "loss": 2.5075, "step": 83500 }, { "epoch": 1.1236861205946396, "grad_norm": 1.7211493253707886, "learning_rate": 0.004977297620443216, "loss": 2.5035, "step": 83600 }, { "epoch": 1.125030242748461, "grad_norm": 0.7423088550567627, "learning_rate": 0.004977226282078084, "loss": 2.5118, "step": 83700 }, { "epoch": 1.1263743649022824, "grad_norm": 1.4103665351867676, "learning_rate": 0.004977154832318255, "loss": 2.5172, "step": 83800 }, { "epoch": 1.1277184870561037, "grad_norm": 2.3322620391845703, "learning_rate": 0.0049770832711669415, "loss": 2.4982, "step": 83900 }, { "epoch": 1.129062609209925, "grad_norm": 0.7170735001564026, "learning_rate": 0.004977011598627376, "loss": 2.5056, "step": 84000 }, { "epoch": 1.129062609209925, "eval_MaskedAccuracy": 0.4369403158372308, "eval_loss": 2.8143200874328613, "eval_runtime": 156.8035, "eval_samples_per_second": 404.812, "eval_steps_per_second": 1.582, "step": 84000 }, { "epoch": 1.1304067313637463, "grad_norm": 0.4916757643222809, "learning_rate": 0.004976939814702779, "loss": 2.5041, "step": 84100 }, { "epoch": 1.1317508535175678, "grad_norm": 1.2962486743927002, "learning_rate": 0.004976867919396393, "loss": 2.5092, "step": 84200 }, { "epoch": 1.133094975671389, "grad_norm": 0.47311854362487793, "learning_rate": 0.004976795912711456, "loss": 2.5074, "step": 84300 }, { "epoch": 1.1344390978252104, "grad_norm": 0.4036632180213928, "learning_rate": 0.004976723794651204, "loss": 2.5089, "step": 84400 }, { "epoch": 1.1357832199790316, "grad_norm": 1.4147541522979736, "learning_rate": 0.004976651565218892, "loss": 2.5074, "step": 84500 }, { "epoch": 1.1371273421328532, "grad_norm": 1.4491769075393677, "learning_rate": 0.004976579224417779, "loss": 2.5051, "step": 84600 }, { "epoch": 1.1384714642866745, "grad_norm": 1.161087155342102, "learning_rate": 0.0049765067722511176, "loss": 2.5125, "step": 84700 }, { "epoch": 1.1398155864404957, "grad_norm": 0.37587714195251465, "learning_rate": 0.004976434208722176, "loss": 2.5046, "step": 84800 }, { "epoch": 1.141159708594317, "grad_norm": 0.6457163095474243, "learning_rate": 0.004976361533834221, "loss": 2.5107, "step": 84900 }, { "epoch": 1.1425038307481383, "grad_norm": 0.40892913937568665, "learning_rate": 0.004976288747590531, "loss": 2.5095, "step": 85000 }, { "epoch": 1.1425038307481383, "eval_MaskedAccuracy": 0.4367847754877898, "eval_loss": 2.817359685897827, "eval_runtime": 155.4685, "eval_samples_per_second": 408.288, "eval_steps_per_second": 1.595, "step": 85000 }, { "epoch": 1.1438479529019596, "grad_norm": 0.6328030228614807, "learning_rate": 0.00497621584999438, "loss": 2.5027, "step": 85100 }, { "epoch": 1.1451920750557811, "grad_norm": 0.8241310119628906, "learning_rate": 0.004976142841049059, "loss": 2.5109, "step": 85200 }, { "epoch": 1.1465361972096024, "grad_norm": 0.5292043089866638, "learning_rate": 0.0049760697207578515, "loss": 2.5051, "step": 85300 }, { "epoch": 1.1478803193634237, "grad_norm": 0.6570155620574951, "learning_rate": 0.004975996489124052, "loss": 2.4996, "step": 85400 }, { "epoch": 1.149224441517245, "grad_norm": 0.3944413959980011, "learning_rate": 0.0049759231461509726, "loss": 2.5018, "step": 85500 }, { "epoch": 1.1505685636710665, "grad_norm": 0.7858182787895203, "learning_rate": 0.00497584969184191, "loss": 2.4994, "step": 85600 }, { "epoch": 1.1519126858248878, "grad_norm": 0.354902058839798, "learning_rate": 0.004975776126200173, "loss": 2.5109, "step": 85700 }, { "epoch": 1.153256807978709, "grad_norm": 1.0036402940750122, "learning_rate": 0.004975702449229073, "loss": 2.5043, "step": 85800 }, { "epoch": 1.1546009301325304, "grad_norm": 0.9507333636283875, "learning_rate": 0.004975628660931937, "loss": 2.502, "step": 85900 }, { "epoch": 1.1559450522863517, "grad_norm": 0.5167057514190674, "learning_rate": 0.004975554761312083, "loss": 2.5026, "step": 86000 }, { "epoch": 1.1559450522863517, "eval_MaskedAccuracy": 0.43768769620070525, "eval_loss": 2.8092353343963623, "eval_runtime": 156.668, "eval_samples_per_second": 405.162, "eval_steps_per_second": 1.583, "step": 86000 }, { "epoch": 1.1572891744401732, "grad_norm": 0.5097525715827942, "learning_rate": 0.004975480750372851, "loss": 2.5018, "step": 86100 }, { "epoch": 1.1586332965939945, "grad_norm": 0.35351938009262085, "learning_rate": 0.0049754066281175605, "loss": 2.501, "step": 86200 }, { "epoch": 1.1599774187478158, "grad_norm": 0.8449835181236267, "learning_rate": 0.004975332394549564, "loss": 2.5024, "step": 86300 }, { "epoch": 1.161321540901637, "grad_norm": 63.61347579956055, "learning_rate": 0.004975258049672201, "loss": 2.5018, "step": 86400 }, { "epoch": 1.1626656630554586, "grad_norm": 0.5342414975166321, "learning_rate": 0.004975183593488831, "loss": 2.6116, "step": 86500 }, { "epoch": 1.16400978520928, "grad_norm": 0.47309231758117676, "learning_rate": 0.0049751090260028, "loss": 2.5154, "step": 86600 }, { "epoch": 1.1653539073631012, "grad_norm": 1.168167233467102, "learning_rate": 0.004975034347217465, "loss": 2.5076, "step": 86700 }, { "epoch": 1.1666980295169225, "grad_norm": 0.49217191338539124, "learning_rate": 0.004974959557136195, "loss": 2.5049, "step": 86800 }, { "epoch": 1.1680421516707438, "grad_norm": 2.020880937576294, "learning_rate": 0.004974884655762357, "loss": 2.5017, "step": 86900 }, { "epoch": 1.169386273824565, "grad_norm": 1.273708701133728, "learning_rate": 0.004974809643099329, "loss": 2.503, "step": 87000 }, { "epoch": 1.169386273824565, "eval_MaskedAccuracy": 0.4362821121410326, "eval_loss": 2.820596218109131, "eval_runtime": 155.9859, "eval_samples_per_second": 406.934, "eval_steps_per_second": 1.59, "step": 87000 }, { "epoch": 1.1707303959783866, "grad_norm": 0.9947301745414734, "learning_rate": 0.004974734519150486, "loss": 2.5085, "step": 87100 }, { "epoch": 1.1720745181322079, "grad_norm": 0.36364173889160156, "learning_rate": 0.004974659283919222, "loss": 2.52, "step": 87200 }, { "epoch": 1.1734186402860292, "grad_norm": 1.1245728731155396, "learning_rate": 0.0049745839374089225, "loss": 2.5178, "step": 87300 }, { "epoch": 1.1747627624398504, "grad_norm": 0.8172126412391663, "learning_rate": 0.004974508479622976, "loss": 2.5063, "step": 87400 }, { "epoch": 1.176106884593672, "grad_norm": 0.5378974676132202, "learning_rate": 0.004974432910564789, "loss": 2.5085, "step": 87500 }, { "epoch": 1.1774510067474933, "grad_norm": 1.354956030845642, "learning_rate": 0.00497435723023777, "loss": 2.4993, "step": 87600 }, { "epoch": 1.1787951289013145, "grad_norm": 1.2725735902786255, "learning_rate": 0.00497428143864532, "loss": 2.4983, "step": 87700 }, { "epoch": 1.1801392510551358, "grad_norm": 1.188934326171875, "learning_rate": 0.004974205535790863, "loss": 2.4999, "step": 87800 }, { "epoch": 1.1814833732089571, "grad_norm": 1.1253455877304077, "learning_rate": 0.004974129521677812, "loss": 2.496, "step": 87900 }, { "epoch": 1.1828274953627786, "grad_norm": 0.6546825170516968, "learning_rate": 0.004974053396309598, "loss": 2.5013, "step": 88000 }, { "epoch": 1.1828274953627786, "eval_MaskedAccuracy": 0.43612094406632873, "eval_loss": 2.817842721939087, "eval_runtime": 155.3942, "eval_samples_per_second": 408.484, "eval_steps_per_second": 1.596, "step": 88000 }, { "epoch": 1.1841716175166, "grad_norm": 0.5742254257202148, "learning_rate": 0.004973977159689648, "loss": 2.4927, "step": 88100 }, { "epoch": 1.1855157396704212, "grad_norm": 0.9239218831062317, "learning_rate": 0.0049739008118213955, "loss": 2.4991, "step": 88200 }, { "epoch": 1.1868598618242425, "grad_norm": 0.8331120014190674, "learning_rate": 0.004973824352708291, "loss": 2.5034, "step": 88300 }, { "epoch": 1.188203983978064, "grad_norm": 0.5568376779556274, "learning_rate": 0.004973747782353763, "loss": 2.4976, "step": 88400 }, { "epoch": 1.1895481061318853, "grad_norm": 0.7204086184501648, "learning_rate": 0.004973671100761278, "loss": 2.4977, "step": 88500 }, { "epoch": 1.1908922282857066, "grad_norm": 0.5486904382705688, "learning_rate": 0.004973594307934272, "loss": 2.5017, "step": 88600 }, { "epoch": 1.192236350439528, "grad_norm": 0.8633172512054443, "learning_rate": 0.0049735174038762235, "loss": 2.5038, "step": 88700 }, { "epoch": 1.1935804725933492, "grad_norm": 0.5683305859565735, "learning_rate": 0.0049734403885905885, "loss": 2.5001, "step": 88800 }, { "epoch": 1.1949245947471705, "grad_norm": 0.5062243342399597, "learning_rate": 0.004973363262080839, "loss": 2.5065, "step": 88900 }, { "epoch": 1.196268716900992, "grad_norm": 3.755258321762085, "learning_rate": 0.004973286024350451, "loss": 2.491, "step": 89000 }, { "epoch": 1.196268716900992, "eval_MaskedAccuracy": 0.4371673629048697, "eval_loss": 2.814087390899658, "eval_runtime": 156.1272, "eval_samples_per_second": 406.566, "eval_steps_per_second": 1.588, "step": 89000 }, { "epoch": 1.1976128390548133, "grad_norm": 0.8349953293800354, "learning_rate": 0.004973208675402902, "loss": 2.5118, "step": 89100 }, { "epoch": 1.1989569612086346, "grad_norm": 1.188719391822815, "learning_rate": 0.004973131215241677, "loss": 2.5103, "step": 89200 }, { "epoch": 1.200301083362456, "grad_norm": 1.0278277397155762, "learning_rate": 0.004973053643870272, "loss": 2.4956, "step": 89300 }, { "epoch": 1.2016452055162774, "grad_norm": 1.174872636795044, "learning_rate": 0.004972975961292177, "loss": 2.4981, "step": 89400 }, { "epoch": 1.2029893276700987, "grad_norm": 1.0490007400512695, "learning_rate": 0.004972898167510894, "loss": 2.5007, "step": 89500 }, { "epoch": 1.20433344982392, "grad_norm": 1.1336147785186768, "learning_rate": 0.004972820262529928, "loss": 2.5084, "step": 89600 }, { "epoch": 1.2056775719777413, "grad_norm": 0.770378589630127, "learning_rate": 0.004972742246352788, "loss": 2.4995, "step": 89700 }, { "epoch": 1.2070216941315626, "grad_norm": 1.1456156969070435, "learning_rate": 0.00497266411898299, "loss": 2.4966, "step": 89800 }, { "epoch": 1.208365816285384, "grad_norm": 2.0087265968322754, "learning_rate": 0.004972585880424056, "loss": 2.4936, "step": 89900 }, { "epoch": 1.2097099384392054, "grad_norm": 1.6110386848449707, "learning_rate": 0.0049725075306795135, "loss": 2.4972, "step": 90000 }, { "epoch": 1.2097099384392054, "eval_MaskedAccuracy": 0.4370350689102483, "eval_loss": 2.812059164047241, "eval_runtime": 155.8421, "eval_samples_per_second": 407.31, "eval_steps_per_second": 1.591, "step": 90000 }, { "epoch": 1.2110540605930267, "grad_norm": 0.35183417797088623, "learning_rate": 0.004972429069752887, "loss": 2.4858, "step": 90100 }, { "epoch": 1.212398182746848, "grad_norm": 0.4271332621574402, "learning_rate": 0.004972350497647718, "loss": 2.4989, "step": 90200 }, { "epoch": 1.2137423049006695, "grad_norm": 3.801957845687866, "learning_rate": 0.004972271814367547, "loss": 2.4984, "step": 90300 }, { "epoch": 1.2150864270544908, "grad_norm": 2.1145997047424316, "learning_rate": 0.004972193019915915, "loss": 2.4964, "step": 90400 }, { "epoch": 1.216430549208312, "grad_norm": 0.3601829707622528, "learning_rate": 0.004972114114296368, "loss": 2.5001, "step": 90500 }, { "epoch": 1.2177746713621334, "grad_norm": 0.31775668263435364, "learning_rate": 0.004972035097512468, "loss": 2.4985, "step": 90600 }, { "epoch": 1.2191187935159546, "grad_norm": 0.4949300289154053, "learning_rate": 0.004971955969567773, "loss": 2.4992, "step": 90700 }, { "epoch": 1.2204629156697762, "grad_norm": 0.5731763243675232, "learning_rate": 0.004971876730465848, "loss": 2.4985, "step": 90800 }, { "epoch": 1.2218070378235975, "grad_norm": 0.6746828556060791, "learning_rate": 0.004971797380210261, "loss": 2.4902, "step": 90900 }, { "epoch": 1.2231511599774187, "grad_norm": 2.1242833137512207, "learning_rate": 0.004971717918804597, "loss": 2.5026, "step": 91000 }, { "epoch": 1.2231511599774187, "eval_MaskedAccuracy": 0.4372796066068638, "eval_loss": 2.8106234073638916, "eval_runtime": 155.6365, "eval_samples_per_second": 407.848, "eval_steps_per_second": 1.593, "step": 91000 }, { "epoch": 1.22449528213124, "grad_norm": 1.0051345825195312, "learning_rate": 0.004971638346252431, "loss": 2.484, "step": 91100 }, { "epoch": 1.2258394042850613, "grad_norm": 0.33799469470977783, "learning_rate": 0.0049715586625573484, "loss": 2.4978, "step": 91200 }, { "epoch": 1.2271835264388828, "grad_norm": 0.37209373712539673, "learning_rate": 0.004971478867722932, "loss": 2.5002, "step": 91300 }, { "epoch": 1.2285276485927041, "grad_norm": 0.9765738248825073, "learning_rate": 0.004971398961752787, "loss": 2.4962, "step": 91400 }, { "epoch": 1.2298717707465254, "grad_norm": 1.2135403156280518, "learning_rate": 0.004971318944650509, "loss": 2.4967, "step": 91500 }, { "epoch": 1.2312158929003467, "grad_norm": 0.776421070098877, "learning_rate": 0.004971238816419708, "loss": 2.4915, "step": 91600 }, { "epoch": 1.232560015054168, "grad_norm": 0.8908390998840332, "learning_rate": 0.004971158577063992, "loss": 2.5007, "step": 91700 }, { "epoch": 1.2339041372079895, "grad_norm": 0.3629126250743866, "learning_rate": 0.004971078226586978, "loss": 2.4831, "step": 91800 }, { "epoch": 1.2352482593618108, "grad_norm": 0.5708677172660828, "learning_rate": 0.004970997764992286, "loss": 2.488, "step": 91900 }, { "epoch": 1.236592381515632, "grad_norm": 1.317496418952942, "learning_rate": 0.004970917192283536, "loss": 2.4921, "step": 92000 }, { "epoch": 1.236592381515632, "eval_MaskedAccuracy": 0.43781475066312364, "eval_loss": 2.807178020477295, "eval_runtime": 155.1214, "eval_samples_per_second": 409.202, "eval_steps_per_second": 1.599, "step": 92000 }, { "epoch": 1.2379365036694534, "grad_norm": 1.5067718029022217, "learning_rate": 0.004970836508464369, "loss": 2.4944, "step": 92100 }, { "epoch": 1.239280625823275, "grad_norm": 0.6655193567276001, "learning_rate": 0.004970755713538412, "loss": 2.4849, "step": 92200 }, { "epoch": 1.2406247479770962, "grad_norm": 0.9877846837043762, "learning_rate": 0.004970674807509303, "loss": 2.4908, "step": 92300 }, { "epoch": 1.2419688701309175, "grad_norm": 0.47736746072769165, "learning_rate": 0.004970593790380696, "loss": 2.498, "step": 92400 }, { "epoch": 1.2433129922847388, "grad_norm": 0.6927477121353149, "learning_rate": 0.004970512662156239, "loss": 2.4849, "step": 92500 }, { "epoch": 1.24465711443856, "grad_norm": 0.5318709015846252, "learning_rate": 0.004970431422839584, "loss": 2.4965, "step": 92600 }, { "epoch": 1.2460012365923816, "grad_norm": 3.21836519241333, "learning_rate": 0.004970350072434393, "loss": 2.5004, "step": 92700 }, { "epoch": 1.247345358746203, "grad_norm": 0.5743933320045471, "learning_rate": 0.004970268610944332, "loss": 2.4935, "step": 92800 }, { "epoch": 1.2486894809000242, "grad_norm": 0.4667443335056305, "learning_rate": 0.004970187038373073, "loss": 2.4883, "step": 92900 }, { "epoch": 1.2500336030538455, "grad_norm": 0.6037049889564514, "learning_rate": 0.00497010535472429, "loss": 2.4922, "step": 93000 }, { "epoch": 1.2500336030538455, "eval_MaskedAccuracy": 0.4379076740559415, "eval_loss": 2.804267644882202, "eval_runtime": 157.0744, "eval_samples_per_second": 404.114, "eval_steps_per_second": 1.579, "step": 93000 }, { "epoch": 1.251377725207667, "grad_norm": 0.6085057854652405, "learning_rate": 0.004970023560001663, "loss": 2.4916, "step": 93100 }, { "epoch": 1.2527218473614883, "grad_norm": 0.8414444923400879, "learning_rate": 0.00496994165420888, "loss": 2.4915, "step": 93200 }, { "epoch": 1.2540659695153096, "grad_norm": 0.962984561920166, "learning_rate": 0.004969859637349628, "loss": 2.493, "step": 93300 }, { "epoch": 1.2554100916691309, "grad_norm": 0.9756949543952942, "learning_rate": 0.0049697775094276085, "loss": 2.496, "step": 93400 }, { "epoch": 1.2567542138229522, "grad_norm": 1.2727997303009033, "learning_rate": 0.004969695270446518, "loss": 2.4915, "step": 93500 }, { "epoch": 1.2580983359767735, "grad_norm": 1.2825567722320557, "learning_rate": 0.004969612920410054, "loss": 2.4862, "step": 93600 }, { "epoch": 1.259442458130595, "grad_norm": 0.7585506439208984, "learning_rate": 0.004969530459321946, "loss": 2.494, "step": 93700 }, { "epoch": 1.2607865802844163, "grad_norm": 0.6428466439247131, "learning_rate": 0.004969447887185897, "loss": 2.4782, "step": 93800 }, { "epoch": 1.2621307024382376, "grad_norm": 0.8756228089332581, "learning_rate": 0.004969365204005632, "loss": 2.4977, "step": 93900 }, { "epoch": 1.263474824592059, "grad_norm": 0.36692315340042114, "learning_rate": 0.004969282409784867, "loss": 2.4931, "step": 94000 }, { "epoch": 1.263474824592059, "eval_MaskedAccuracy": 0.43933527822037677, "eval_loss": 2.797823429107666, "eval_runtime": 155.1464, "eval_samples_per_second": 409.136, "eval_steps_per_second": 1.598, "step": 94000 }, { "epoch": 1.2648189467458804, "grad_norm": 2.2874348163604736, "learning_rate": 0.004969199504527345, "loss": 2.4854, "step": 94100 }, { "epoch": 1.2661630688997016, "grad_norm": 1.3956974744796753, "learning_rate": 0.0049691164882367895, "loss": 2.4901, "step": 94200 }, { "epoch": 1.267507191053523, "grad_norm": 0.6008788347244263, "learning_rate": 0.004969033360916954, "loss": 2.4849, "step": 94300 }, { "epoch": 1.2688513132073442, "grad_norm": 1.0495715141296387, "learning_rate": 0.004968950122571579, "loss": 2.4919, "step": 94400 }, { "epoch": 1.2701954353611655, "grad_norm": 1.057627558708191, "learning_rate": 0.00496886677320441, "loss": 2.4909, "step": 94500 }, { "epoch": 1.271539557514987, "grad_norm": 0.4978176951408386, "learning_rate": 0.0049687833128192055, "loss": 2.4938, "step": 94600 }, { "epoch": 1.2728836796688083, "grad_norm": 0.5044616460800171, "learning_rate": 0.004968699741419729, "loss": 2.4907, "step": 94700 }, { "epoch": 1.2742278018226296, "grad_norm": 0.37560752034187317, "learning_rate": 0.00496861605900974, "loss": 2.4896, "step": 94800 }, { "epoch": 1.275571923976451, "grad_norm": 0.877223789691925, "learning_rate": 0.004968532265593022, "loss": 2.4882, "step": 94900 }, { "epoch": 1.2769160461302724, "grad_norm": 1.2958379983901978, "learning_rate": 0.004968448361173342, "loss": 2.4856, "step": 95000 }, { "epoch": 1.2769160461302724, "eval_MaskedAccuracy": 0.4387117447553094, "eval_loss": 2.799860954284668, "eval_runtime": 154.6014, "eval_samples_per_second": 410.578, "eval_steps_per_second": 1.604, "step": 95000 }, { "epoch": 1.2782601682840937, "grad_norm": 0.3757873773574829, "learning_rate": 0.004968364345754483, "loss": 2.4836, "step": 95100 }, { "epoch": 1.279604290437915, "grad_norm": 0.9294460415840149, "learning_rate": 0.004968280219340219, "loss": 2.4937, "step": 95200 }, { "epoch": 1.2809484125917363, "grad_norm": 0.3245985805988312, "learning_rate": 0.004968195981934356, "loss": 2.4949, "step": 95300 }, { "epoch": 1.2822925347455576, "grad_norm": 0.342230886220932, "learning_rate": 0.004968111633540681, "loss": 2.492, "step": 95400 }, { "epoch": 1.283636656899379, "grad_norm": 1.0859248638153076, "learning_rate": 0.004968027174162995, "loss": 2.4855, "step": 95500 }, { "epoch": 1.2849807790532004, "grad_norm": 0.45761579275131226, "learning_rate": 0.0049679426038051104, "loss": 2.4863, "step": 95600 }, { "epoch": 1.2863249012070217, "grad_norm": 0.49789857864379883, "learning_rate": 0.004967857922470824, "loss": 2.4931, "step": 95700 }, { "epoch": 1.287669023360843, "grad_norm": 0.39404183626174927, "learning_rate": 0.004967773130163965, "loss": 2.4774, "step": 95800 }, { "epoch": 1.2890131455146645, "grad_norm": 0.3045353293418884, "learning_rate": 0.0049676882268883515, "loss": 2.4826, "step": 95900 }, { "epoch": 1.2903572676684858, "grad_norm": 0.34618687629699707, "learning_rate": 0.004967603212647805, "loss": 2.4788, "step": 96000 }, { "epoch": 1.2903572676684858, "eval_MaskedAccuracy": 0.43972966453745477, "eval_loss": 2.7956392765045166, "eval_runtime": 154.9374, "eval_samples_per_second": 409.688, "eval_steps_per_second": 1.601, "step": 96000 }, { "epoch": 1.291701389822307, "grad_norm": 1.395040512084961, "learning_rate": 0.004967518087446164, "loss": 2.4897, "step": 96100 }, { "epoch": 1.2930455119761284, "grad_norm": 0.49540796875953674, "learning_rate": 0.0049674328512872585, "loss": 2.4829, "step": 96200 }, { "epoch": 1.2943896341299497, "grad_norm": 1.1871672868728638, "learning_rate": 0.00496734750417493, "loss": 2.4834, "step": 96300 }, { "epoch": 1.295733756283771, "grad_norm": 0.9125702977180481, "learning_rate": 0.004967262046113016, "loss": 2.4926, "step": 96400 }, { "epoch": 1.2970778784375925, "grad_norm": 0.8528000712394714, "learning_rate": 0.004967176477105385, "loss": 2.4924, "step": 96500 }, { "epoch": 1.2984220005914138, "grad_norm": 0.3273971378803253, "learning_rate": 0.0049670907971558756, "loss": 2.481, "step": 96600 }, { "epoch": 1.299766122745235, "grad_norm": 1.2552522420883179, "learning_rate": 0.004967005006268358, "loss": 2.4869, "step": 96700 }, { "epoch": 1.3011102448990564, "grad_norm": 0.8510228991508484, "learning_rate": 0.004966919104446687, "loss": 2.4875, "step": 96800 }, { "epoch": 1.3024543670528779, "grad_norm": 0.6518368124961853, "learning_rate": 0.004966833091694741, "loss": 2.4836, "step": 96900 }, { "epoch": 1.3037984892066992, "grad_norm": 1.7670749425888062, "learning_rate": 0.004966746968016394, "loss": 2.4831, "step": 97000 }, { "epoch": 1.3037984892066992, "eval_MaskedAccuracy": 0.4384087863650507, "eval_loss": 2.801335573196411, "eval_runtime": 156.4566, "eval_samples_per_second": 405.71, "eval_steps_per_second": 1.585, "step": 97000 }, { "epoch": 1.3051426113605205, "grad_norm": 0.37767720222473145, "learning_rate": 0.004966660733415522, "loss": 2.4896, "step": 97100 }, { "epoch": 1.3064867335143417, "grad_norm": 1.352961540222168, "learning_rate": 0.004966574387896025, "loss": 2.4821, "step": 97200 }, { "epoch": 1.307830855668163, "grad_norm": 0.5152901411056519, "learning_rate": 0.004966487931461782, "loss": 2.4872, "step": 97300 }, { "epoch": 1.3091749778219843, "grad_norm": 0.6184568405151367, "learning_rate": 0.00496640136411669, "loss": 2.4917, "step": 97400 }, { "epoch": 1.3105190999758058, "grad_norm": 0.8382551074028015, "learning_rate": 0.004966314685864655, "loss": 2.4785, "step": 97500 }, { "epoch": 1.3118632221296271, "grad_norm": 0.9581834077835083, "learning_rate": 0.004966227896709579, "loss": 2.4839, "step": 97600 }, { "epoch": 1.3132073442834484, "grad_norm": 0.33205175399780273, "learning_rate": 0.004966140996655376, "loss": 2.4767, "step": 97700 }, { "epoch": 1.31455146643727, "grad_norm": 1.0447673797607422, "learning_rate": 0.0049660539857059515, "loss": 2.491, "step": 97800 }, { "epoch": 1.3158955885910912, "grad_norm": 1.3554537296295166, "learning_rate": 0.0049659668638652316, "loss": 2.4827, "step": 97900 }, { "epoch": 1.3172397107449125, "grad_norm": 1.2488070726394653, "learning_rate": 0.00496587963113714, "loss": 2.4825, "step": 98000 }, { "epoch": 1.3172397107449125, "eval_MaskedAccuracy": 0.439554404426727, "eval_loss": 2.793813943862915, "eval_runtime": 158.828, "eval_samples_per_second": 399.652, "eval_steps_per_second": 1.561, "step": 98000 }, { "epoch": 1.3185838328987338, "grad_norm": 0.7793965935707092, "learning_rate": 0.004965792287525609, "loss": 2.4838, "step": 98100 }, { "epoch": 1.3199279550525551, "grad_norm": 2.033632755279541, "learning_rate": 0.004965704833034586, "loss": 2.4777, "step": 98200 }, { "epoch": 1.3212720772063764, "grad_norm": 0.8052381277084351, "learning_rate": 0.0049656172676679905, "loss": 2.4794, "step": 98300 }, { "epoch": 1.322616199360198, "grad_norm": 0.6361316442489624, "learning_rate": 0.0049655295914297825, "loss": 2.4872, "step": 98400 }, { "epoch": 1.3239603215140192, "grad_norm": 0.8865070343017578, "learning_rate": 0.004965441804323918, "loss": 2.4818, "step": 98500 }, { "epoch": 1.3253044436678405, "grad_norm": 0.9113361239433289, "learning_rate": 0.004965353906354336, "loss": 2.4834, "step": 98600 }, { "epoch": 1.3266485658216618, "grad_norm": 1.3836218118667603, "learning_rate": 0.004965265897525005, "loss": 2.4795, "step": 98700 }, { "epoch": 1.3279926879754833, "grad_norm": 0.7738713026046753, "learning_rate": 0.0049651777778398805, "loss": 2.4869, "step": 98800 }, { "epoch": 1.3293368101293046, "grad_norm": 1.1943604946136475, "learning_rate": 0.004965089547302944, "loss": 2.489, "step": 98900 }, { "epoch": 1.330680932283126, "grad_norm": 0.4339142441749573, "learning_rate": 0.004965001205918176, "loss": 2.4801, "step": 99000 }, { "epoch": 1.330680932283126, "eval_MaskedAccuracy": 0.4402919161127212, "eval_loss": 2.789841651916504, "eval_runtime": 155.2442, "eval_samples_per_second": 408.878, "eval_steps_per_second": 1.597, "step": 99000 }, { "epoch": 1.3320250544369472, "grad_norm": 0.5905622243881226, "learning_rate": 0.00496491275368955, "loss": 2.4818, "step": 99100 }, { "epoch": 1.3333691765907685, "grad_norm": 0.8844038844108582, "learning_rate": 0.004964824190621052, "loss": 2.4847, "step": 99200 }, { "epoch": 1.3347132987445898, "grad_norm": 0.4743984639644623, "learning_rate": 0.004964735516716671, "loss": 2.4873, "step": 99300 }, { "epoch": 1.3360574208984113, "grad_norm": 0.4455043077468872, "learning_rate": 0.0049646467319804, "loss": 2.4894, "step": 99400 }, { "epoch": 1.3374015430522326, "grad_norm": 0.33678489923477173, "learning_rate": 0.004964557836416248, "loss": 2.4707, "step": 99500 }, { "epoch": 1.3387456652060539, "grad_norm": 0.557610809803009, "learning_rate": 0.004964468830028214, "loss": 2.4761, "step": 99600 }, { "epoch": 1.3400897873598754, "grad_norm": 1.0966076850891113, "learning_rate": 0.00496437971282031, "loss": 2.4776, "step": 99700 }, { "epoch": 1.3414339095136967, "grad_norm": 1.7193931341171265, "learning_rate": 0.00496429048479655, "loss": 2.5095, "step": 99800 }, { "epoch": 1.342778031667518, "grad_norm": 1.0982494354248047, "learning_rate": 0.004964201145960957, "loss": 2.4835, "step": 99900 }, { "epoch": 1.3441221538213393, "grad_norm": 0.7176316976547241, "learning_rate": 0.004964111696317556, "loss": 2.4881, "step": 100000 }, { "epoch": 1.3441221538213393, "eval_MaskedAccuracy": 0.439665552172269, "eval_loss": 2.7934842109680176, "eval_runtime": 156.2627, "eval_samples_per_second": 406.213, "eval_steps_per_second": 1.587, "step": 100000 }, { "epoch": 1.3454662759751606, "grad_norm": 0.6092787384986877, "learning_rate": 0.00496402213587037, "loss": 2.4923, "step": 100100 }, { "epoch": 1.3468103981289818, "grad_norm": 0.48341110348701477, "learning_rate": 0.004963932464623445, "loss": 2.4847, "step": 100200 }, { "epoch": 1.3481545202828034, "grad_norm": 1.8525594472885132, "learning_rate": 0.004963842682580817, "loss": 2.4742, "step": 100300 }, { "epoch": 1.3494986424366247, "grad_norm": 1.4322495460510254, "learning_rate": 0.004963752789746537, "loss": 2.4793, "step": 100400 }, { "epoch": 1.350842764590446, "grad_norm": 0.881243884563446, "learning_rate": 0.004963662786124655, "loss": 2.4773, "step": 100500 }, { "epoch": 1.3521868867442675, "grad_norm": 1.2258617877960205, "learning_rate": 0.0049635726717192205, "loss": 2.4833, "step": 100600 }, { "epoch": 1.3535310088980887, "grad_norm": 0.9638917446136475, "learning_rate": 0.004963482446534289, "loss": 2.4804, "step": 100700 }, { "epoch": 1.35487513105191, "grad_norm": 0.9644854068756104, "learning_rate": 0.004963392110573937, "loss": 2.4719, "step": 100800 }, { "epoch": 1.3562192532057313, "grad_norm": 0.689267635345459, "learning_rate": 0.0049633016638422275, "loss": 2.4756, "step": 100900 }, { "epoch": 1.3575633753595526, "grad_norm": 1.2867717742919922, "learning_rate": 0.004963211106343239, "loss": 2.4812, "step": 101000 }, { "epoch": 1.3575633753595526, "eval_MaskedAccuracy": 0.4396971572510662, "eval_loss": 2.793617010116577, "eval_runtime": 157.0355, "eval_samples_per_second": 404.214, "eval_steps_per_second": 1.579, "step": 101000 }, { "epoch": 1.358907497513374, "grad_norm": 1.8802392482757568, "learning_rate": 0.00496312043808105, "loss": 2.485, "step": 101100 }, { "epoch": 1.3602516196671954, "grad_norm": 0.35604092478752136, "learning_rate": 0.004963029659059752, "loss": 2.4812, "step": 101200 }, { "epoch": 1.3615957418210167, "grad_norm": 1.0150797367095947, "learning_rate": 0.004962938769283431, "loss": 2.4875, "step": 101300 }, { "epoch": 1.362939863974838, "grad_norm": 1.3340040445327759, "learning_rate": 0.004962847768756179, "loss": 2.4684, "step": 101400 }, { "epoch": 1.3642839861286593, "grad_norm": 0.38257646560668945, "learning_rate": 0.004962756657482102, "loss": 2.4866, "step": 101500 }, { "epoch": 1.3656281082824808, "grad_norm": 0.6464138031005859, "learning_rate": 0.0049626654354653025, "loss": 2.4844, "step": 101600 }, { "epoch": 1.3669722304363021, "grad_norm": 0.41035962104797363, "learning_rate": 0.004962574102709892, "loss": 2.4817, "step": 101700 }, { "epoch": 1.3683163525901234, "grad_norm": 0.7176283001899719, "learning_rate": 0.004962482659219983, "loss": 2.4751, "step": 101800 }, { "epoch": 1.3696604747439447, "grad_norm": 0.7560930848121643, "learning_rate": 0.004962391104999701, "loss": 2.48, "step": 101900 }, { "epoch": 1.371004596897766, "grad_norm": 0.8050186038017273, "learning_rate": 0.004962299440053163, "loss": 2.4691, "step": 102000 }, { "epoch": 1.371004596897766, "eval_MaskedAccuracy": 0.44033106657058535, "eval_loss": 2.7889299392700195, "eval_runtime": 155.2645, "eval_samples_per_second": 408.825, "eval_steps_per_second": 1.597, "step": 102000 }, { "epoch": 1.3723487190515873, "grad_norm": 0.39613771438598633, "learning_rate": 0.004962207664384512, "loss": 2.4773, "step": 102100 }, { "epoch": 1.3736928412054088, "grad_norm": 2.5852887630462646, "learning_rate": 0.004962115777997869, "loss": 2.479, "step": 102200 }, { "epoch": 1.37503696335923, "grad_norm": 1.9417086839675903, "learning_rate": 0.004962023780897379, "loss": 2.479, "step": 102300 }, { "epoch": 1.3763810855130514, "grad_norm": 1.185256004333496, "learning_rate": 0.004961931673087198, "loss": 2.4713, "step": 102400 }, { "epoch": 1.377725207666873, "grad_norm": 0.5902850031852722, "learning_rate": 0.004961839454571464, "loss": 2.4805, "step": 102500 }, { "epoch": 1.3790693298206942, "grad_norm": 0.61485755443573, "learning_rate": 0.0049617471253543335, "loss": 2.478, "step": 102600 }, { "epoch": 1.3804134519745155, "grad_norm": 0.3328634798526764, "learning_rate": 0.004961654685439971, "loss": 2.4705, "step": 102700 }, { "epoch": 1.3817575741283368, "grad_norm": 1.5581071376800537, "learning_rate": 0.004961562134832535, "loss": 2.468, "step": 102800 }, { "epoch": 1.383101696282158, "grad_norm": 0.8161963820457458, "learning_rate": 0.004961469473536204, "loss": 2.4773, "step": 102900 }, { "epoch": 1.3844458184359794, "grad_norm": 0.4553765654563904, "learning_rate": 0.004961376701555151, "loss": 2.4774, "step": 103000 }, { "epoch": 1.3844458184359794, "eval_MaskedAccuracy": 0.44020060249449044, "eval_loss": 2.7893545627593994, "eval_runtime": 157.1906, "eval_samples_per_second": 403.815, "eval_steps_per_second": 1.578, "step": 103000 }, { "epoch": 1.3857899405898009, "grad_norm": 0.7315437197685242, "learning_rate": 0.004961283818893556, "loss": 2.4893, "step": 103100 }, { "epoch": 1.3871340627436222, "grad_norm": 1.7851052284240723, "learning_rate": 0.004961190825555604, "loss": 2.4825, "step": 103200 }, { "epoch": 1.3884781848974435, "grad_norm": 0.7662883400917053, "learning_rate": 0.004961097721545482, "loss": 2.4694, "step": 103300 }, { "epoch": 1.3898223070512647, "grad_norm": 0.3333396315574646, "learning_rate": 0.0049610045068673855, "loss": 2.4772, "step": 103400 }, { "epoch": 1.3911664292050863, "grad_norm": 0.4882153570652008, "learning_rate": 0.004960911181525515, "loss": 2.4764, "step": 103500 }, { "epoch": 1.3925105513589076, "grad_norm": 1.4237163066864014, "learning_rate": 0.0049608177455240775, "loss": 2.4822, "step": 103600 }, { "epoch": 1.3938546735127288, "grad_norm": 0.5641297101974487, "learning_rate": 0.004960724198867284, "loss": 2.4779, "step": 103700 }, { "epoch": 1.3951987956665501, "grad_norm": 0.6121580600738525, "learning_rate": 0.00496063054155935, "loss": 2.4663, "step": 103800 }, { "epoch": 1.3965429178203714, "grad_norm": 0.6751658320426941, "learning_rate": 0.004960536773604492, "loss": 2.4718, "step": 103900 }, { "epoch": 1.3978870399741927, "grad_norm": 0.9351543188095093, "learning_rate": 0.004960442895006929, "loss": 2.4855, "step": 104000 }, { "epoch": 1.3978870399741927, "eval_MaskedAccuracy": 0.44113983122276385, "eval_loss": 2.7834458351135254, "eval_runtime": 156.6243, "eval_samples_per_second": 405.276, "eval_steps_per_second": 1.583, "step": 104000 }, { "epoch": 1.3992311621280142, "grad_norm": 0.44563743472099304, "learning_rate": 0.004960348905770906, "loss": 2.478, "step": 104100 }, { "epoch": 1.4005752842818355, "grad_norm": 0.7417739629745483, "learning_rate": 0.004960254805900649, "loss": 2.4782, "step": 104200 }, { "epoch": 1.4019194064356568, "grad_norm": 0.38673731684684753, "learning_rate": 0.004960160595400398, "loss": 2.4703, "step": 104300 }, { "epoch": 1.4032635285894783, "grad_norm": 0.41961851716041565, "learning_rate": 0.004960066274274401, "loss": 2.4758, "step": 104400 }, { "epoch": 1.4046076507432996, "grad_norm": 1.0936461687088013, "learning_rate": 0.00495997184252691, "loss": 2.4711, "step": 104500 }, { "epoch": 1.405951772897121, "grad_norm": 1.4237301349639893, "learning_rate": 0.004959877300162172, "loss": 2.476, "step": 104600 }, { "epoch": 1.4072958950509422, "grad_norm": 0.5586555600166321, "learning_rate": 0.004959782647184455, "loss": 2.4744, "step": 104700 }, { "epoch": 1.4086400172047635, "grad_norm": 0.5998302102088928, "learning_rate": 0.0049596878835980115, "loss": 2.481, "step": 104800 }, { "epoch": 1.4099841393585848, "grad_norm": 0.6141105890274048, "learning_rate": 0.004959593009407127, "loss": 2.4683, "step": 104900 }, { "epoch": 1.4113282615124063, "grad_norm": 0.6551980376243591, "learning_rate": 0.00495949802461607, "loss": 2.4758, "step": 105000 }, { "epoch": 1.4113282615124063, "eval_MaskedAccuracy": 0.4408968621448549, "eval_loss": 2.78601336479187, "eval_runtime": 159.2788, "eval_samples_per_second": 398.521, "eval_steps_per_second": 1.557, "step": 105000 }, { "epoch": 1.4126723836662276, "grad_norm": 0.6265648603439331, "learning_rate": 0.00495940292922911, "loss": 2.4772, "step": 105100 }, { "epoch": 1.414016505820049, "grad_norm": 1.0922765731811523, "learning_rate": 0.004959307723250553, "loss": 2.4649, "step": 105200 }, { "epoch": 1.4153606279738702, "grad_norm": 1.4056020975112915, "learning_rate": 0.004959212406684676, "loss": 2.4811, "step": 105300 }, { "epoch": 1.4167047501276917, "grad_norm": 1.5905888080596924, "learning_rate": 0.004959116979535776, "loss": 2.4769, "step": 105400 }, { "epoch": 1.418048872281513, "grad_norm": 0.448127418756485, "learning_rate": 0.004959021441808149, "loss": 2.4706, "step": 105500 }, { "epoch": 1.4193929944353343, "grad_norm": 1.482865571975708, "learning_rate": 0.004958925793506108, "loss": 2.4795, "step": 105600 }, { "epoch": 1.4207371165891556, "grad_norm": 0.37079110741615295, "learning_rate": 0.004958830034633952, "loss": 2.4688, "step": 105700 }, { "epoch": 1.4220812387429769, "grad_norm": 0.398387610912323, "learning_rate": 0.004958734165196007, "loss": 2.473, "step": 105800 }, { "epoch": 1.4234253608967982, "grad_norm": 0.49912792444229126, "learning_rate": 0.004958638185196587, "loss": 2.4701, "step": 105900 }, { "epoch": 1.4247694830506197, "grad_norm": 0.8730640411376953, "learning_rate": 0.004958542094640022, "loss": 2.474, "step": 106000 }, { "epoch": 1.4247694830506197, "eval_MaskedAccuracy": 0.44114819868603417, "eval_loss": 2.7835192680358887, "eval_runtime": 154.9227, "eval_samples_per_second": 409.727, "eval_steps_per_second": 1.601, "step": 106000 }, { "epoch": 1.426113605204441, "grad_norm": 0.4877232313156128, "learning_rate": 0.004958445893530628, "loss": 2.4685, "step": 106100 }, { "epoch": 1.4274577273582623, "grad_norm": 0.6099728345870972, "learning_rate": 0.004958349581872748, "loss": 2.4724, "step": 106200 }, { "epoch": 1.4288018495120838, "grad_norm": 0.4108321964740753, "learning_rate": 0.004958253159670722, "loss": 2.4686, "step": 106300 }, { "epoch": 1.430145971665905, "grad_norm": 0.5882393717765808, "learning_rate": 0.004958156626928895, "loss": 2.4763, "step": 106400 }, { "epoch": 1.4314900938197264, "grad_norm": 0.7170044779777527, "learning_rate": 0.004958059983651618, "loss": 2.4681, "step": 106500 }, { "epoch": 1.4328342159735477, "grad_norm": 0.8796346783638, "learning_rate": 0.0049579632298432435, "loss": 2.4734, "step": 106600 }, { "epoch": 1.434178338127369, "grad_norm": 0.3585628271102905, "learning_rate": 0.0049578663655081385, "loss": 2.4599, "step": 106700 }, { "epoch": 1.4355224602811902, "grad_norm": 1.9920889139175415, "learning_rate": 0.004957769390650657, "loss": 2.4726, "step": 106800 }, { "epoch": 1.4368665824350118, "grad_norm": 1.0133873224258423, "learning_rate": 0.004957672305275166, "loss": 2.4684, "step": 106900 }, { "epoch": 1.438210704588833, "grad_norm": 1.567334771156311, "learning_rate": 0.004957575109386042, "loss": 2.4643, "step": 107000 }, { "epoch": 1.438210704588833, "eval_MaskedAccuracy": 0.4405992607707991, "eval_loss": 2.786257743835449, "eval_runtime": 154.0026, "eval_samples_per_second": 412.175, "eval_steps_per_second": 1.61, "step": 107000 }, { "epoch": 1.4395548267426543, "grad_norm": 1.3851370811462402, "learning_rate": 0.004957477802987676, "loss": 2.4719, "step": 107100 }, { "epoch": 1.4408989488964756, "grad_norm": 0.3829158842563629, "learning_rate": 0.0049573803860844465, "loss": 2.4618, "step": 107200 }, { "epoch": 1.4422430710502971, "grad_norm": 0.8415204882621765, "learning_rate": 0.004957282858680741, "loss": 2.4691, "step": 107300 }, { "epoch": 1.4435871932041184, "grad_norm": 1.198485016822815, "learning_rate": 0.00495718522078095, "loss": 2.4661, "step": 107400 }, { "epoch": 1.4449313153579397, "grad_norm": 2.3275532722473145, "learning_rate": 0.00495708747238948, "loss": 2.4734, "step": 107500 }, { "epoch": 1.446275437511761, "grad_norm": 0.5716771483421326, "learning_rate": 0.0049569896135107355, "loss": 2.4612, "step": 107600 }, { "epoch": 1.4476195596655823, "grad_norm": 1.2711353302001953, "learning_rate": 0.00495689164414912, "loss": 2.4738, "step": 107700 }, { "epoch": 1.4489636818194038, "grad_norm": 0.5984246730804443, "learning_rate": 0.004956793564309049, "loss": 2.4793, "step": 107800 }, { "epoch": 1.4503078039732251, "grad_norm": 0.992817223072052, "learning_rate": 0.004956695373994945, "loss": 2.4736, "step": 107900 }, { "epoch": 1.4516519261270464, "grad_norm": 0.7644572257995605, "learning_rate": 0.004956597073211232, "loss": 2.4834, "step": 108000 }, { "epoch": 1.4516519261270464, "eval_MaskedAccuracy": 0.4405933158523938, "eval_loss": 2.7867140769958496, "eval_runtime": 155.9739, "eval_samples_per_second": 406.966, "eval_steps_per_second": 1.59, "step": 108000 }, { "epoch": 1.4529960482808677, "grad_norm": 0.39693641662597656, "learning_rate": 0.004956498661962332, "loss": 2.4675, "step": 108100 }, { "epoch": 1.4543401704346892, "grad_norm": 1.5463731288909912, "learning_rate": 0.004956400140252694, "loss": 2.4737, "step": 108200 }, { "epoch": 1.4556842925885105, "grad_norm": 1.4359521865844727, "learning_rate": 0.004956301508086743, "loss": 2.475, "step": 108300 }, { "epoch": 1.4570284147423318, "grad_norm": 1.5656912326812744, "learning_rate": 0.004956202765468931, "loss": 2.4734, "step": 108400 }, { "epoch": 1.458372536896153, "grad_norm": 0.3067225515842438, "learning_rate": 0.004956103912403708, "loss": 2.4703, "step": 108500 }, { "epoch": 1.4597166590499744, "grad_norm": 1.173917293548584, "learning_rate": 0.0049560049488955255, "loss": 2.4626, "step": 108600 }, { "epoch": 1.4610607812037957, "grad_norm": 0.5422664284706116, "learning_rate": 0.004955905874948838, "loss": 2.4634, "step": 108700 }, { "epoch": 1.4624049033576172, "grad_norm": 1.207939863204956, "learning_rate": 0.004955806690568119, "loss": 2.4676, "step": 108800 }, { "epoch": 1.4637490255114385, "grad_norm": 1.1675505638122559, "learning_rate": 0.0049557073957578355, "loss": 2.4668, "step": 108900 }, { "epoch": 1.4650931476652598, "grad_norm": 0.4084634482860565, "learning_rate": 0.004955607990522452, "loss": 2.4674, "step": 109000 }, { "epoch": 1.4650931476652598, "eval_MaskedAccuracy": 0.44142586982077486, "eval_loss": 2.781691551208496, "eval_runtime": 154.5643, "eval_samples_per_second": 410.677, "eval_steps_per_second": 1.605, "step": 109000 }, { "epoch": 1.4664372698190813, "grad_norm": 0.33248618245124817, "learning_rate": 0.004955508474866459, "loss": 2.47, "step": 109100 }, { "epoch": 1.4677813919729026, "grad_norm": 1.3263065814971924, "learning_rate": 0.004955408848794333, "loss": 2.4666, "step": 109200 }, { "epoch": 1.4691255141267239, "grad_norm": 0.5887484550476074, "learning_rate": 0.004955309112310564, "loss": 2.4736, "step": 109300 }, { "epoch": 1.4704696362805452, "grad_norm": 0.5707719326019287, "learning_rate": 0.004955209265419648, "loss": 2.4723, "step": 109400 }, { "epoch": 1.4718137584343665, "grad_norm": 1.564012885093689, "learning_rate": 0.004955109308126084, "loss": 2.4677, "step": 109500 }, { "epoch": 1.4731578805881878, "grad_norm": 0.5303487181663513, "learning_rate": 0.004955009240434372, "loss": 2.4748, "step": 109600 }, { "epoch": 1.4745020027420093, "grad_norm": 0.8532441258430481, "learning_rate": 0.0049549090623490265, "loss": 2.4605, "step": 109700 }, { "epoch": 1.4758461248958306, "grad_norm": 1.2264505624771118, "learning_rate": 0.004954808773874565, "loss": 2.4663, "step": 109800 }, { "epoch": 1.4771902470496519, "grad_norm": 0.4173908233642578, "learning_rate": 0.0049547083750154965, "loss": 2.4677, "step": 109900 }, { "epoch": 1.4785343692034731, "grad_norm": 1.8136452436447144, "learning_rate": 0.004954607865776349, "loss": 2.4652, "step": 110000 }, { "epoch": 1.4785343692034731, "eval_MaskedAccuracy": 0.44079433809670326, "eval_loss": 2.7854974269866943, "eval_runtime": 155.1144, "eval_samples_per_second": 409.221, "eval_steps_per_second": 1.599, "step": 110000 }, { "epoch": 1.4798784913572947, "grad_norm": 1.8435359001159668, "learning_rate": 0.004954507246161656, "loss": 2.4611, "step": 110100 }, { "epoch": 1.481222613511116, "grad_norm": 0.33811017870903015, "learning_rate": 0.004954406516175938, "loss": 2.4659, "step": 110200 }, { "epoch": 1.4825667356649372, "grad_norm": 1.902899146080017, "learning_rate": 0.00495430567582375, "loss": 2.4629, "step": 110300 }, { "epoch": 1.4839108578187585, "grad_norm": 2.60919451713562, "learning_rate": 0.004954204725109627, "loss": 2.4783, "step": 110400 }, { "epoch": 1.4852549799725798, "grad_norm": 0.7571696639060974, "learning_rate": 0.0049541036640381175, "loss": 2.4551, "step": 110500 }, { "epoch": 1.4865991021264011, "grad_norm": 0.4986100196838379, "learning_rate": 0.004954002492613781, "loss": 2.4625, "step": 110600 }, { "epoch": 1.4879432242802226, "grad_norm": 1.091294765472412, "learning_rate": 0.0049539012108411684, "loss": 2.4686, "step": 110700 }, { "epoch": 1.489287346434044, "grad_norm": 0.8807032704353333, "learning_rate": 0.004953799818724848, "loss": 2.4764, "step": 110800 }, { "epoch": 1.4906314685878652, "grad_norm": 1.00475013256073, "learning_rate": 0.0049536983162693895, "loss": 2.4637, "step": 110900 }, { "epoch": 1.4919755907416867, "grad_norm": 0.33167359232902527, "learning_rate": 0.004953596703479359, "loss": 2.4678, "step": 111000 }, { "epoch": 1.4919755907416867, "eval_MaskedAccuracy": 0.44179725878685755, "eval_loss": 2.776883602142334, "eval_runtime": 154.4158, "eval_samples_per_second": 411.072, "eval_steps_per_second": 1.606, "step": 111000 }, { "epoch": 1.493319712895508, "grad_norm": 1.6402376890182495, "learning_rate": 0.004953494980359343, "loss": 2.4685, "step": 111100 }, { "epoch": 1.4946638350493293, "grad_norm": 0.7305079698562622, "learning_rate": 0.004953393146913917, "loss": 2.4643, "step": 111200 }, { "epoch": 1.4960079572031506, "grad_norm": 0.560355544090271, "learning_rate": 0.0049532912031476735, "loss": 2.4659, "step": 111300 }, { "epoch": 1.497352079356972, "grad_norm": 0.9456803202629089, "learning_rate": 0.004953189149065217, "loss": 2.4618, "step": 111400 }, { "epoch": 1.4986962015107932, "grad_norm": 1.5349628925323486, "learning_rate": 0.004953086984671131, "loss": 2.4661, "step": 111500 }, { "epoch": 1.5000403236646145, "grad_norm": 1.062753438949585, "learning_rate": 0.004952984709970025, "loss": 2.4618, "step": 111600 }, { "epoch": 1.501384445818436, "grad_norm": 0.45811429619789124, "learning_rate": 0.0049528823249665056, "loss": 2.4586, "step": 111700 }, { "epoch": 1.5027285679722573, "grad_norm": 0.5497962832450867, "learning_rate": 0.004952779829665184, "loss": 2.4693, "step": 111800 }, { "epoch": 1.5040726901260788, "grad_norm": 0.5482380986213684, "learning_rate": 0.004952677224070681, "loss": 2.4623, "step": 111900 }, { "epoch": 1.5054168122799, "grad_norm": 1.2586387395858765, "learning_rate": 0.0049525745081876165, "loss": 2.4669, "step": 112000 }, { "epoch": 1.5054168122799, "eval_MaskedAccuracy": 0.44191414103263826, "eval_loss": 2.776205539703369, "eval_runtime": 153.7585, "eval_samples_per_second": 412.829, "eval_steps_per_second": 1.613, "step": 112000 }, { "epoch": 1.5067609344337214, "grad_norm": 2.1461663246154785, "learning_rate": 0.004952471682020625, "loss": 2.4569, "step": 112100 }, { "epoch": 1.5081050565875427, "grad_norm": 3.052988290786743, "learning_rate": 0.00495236874557434, "loss": 2.4583, "step": 112200 }, { "epoch": 1.509449178741364, "grad_norm": 0.3190886676311493, "learning_rate": 0.00495226569885339, "loss": 2.4577, "step": 112300 }, { "epoch": 1.5107933008951853, "grad_norm": 0.390209436416626, "learning_rate": 0.0049521625418624295, "loss": 2.4577, "step": 112400 }, { "epoch": 1.5121374230490066, "grad_norm": 1.6563820838928223, "learning_rate": 0.004952059274606098, "loss": 2.4577, "step": 112500 }, { "epoch": 1.513481545202828, "grad_norm": 0.9080681204795837, "learning_rate": 0.004951955897089055, "loss": 2.4618, "step": 112600 }, { "epoch": 1.5148256673566494, "grad_norm": 0.5969460010528564, "learning_rate": 0.0049518524093159514, "loss": 2.472, "step": 112700 }, { "epoch": 1.5161697895104709, "grad_norm": 0.358441025018692, "learning_rate": 0.004951748811291452, "loss": 2.4619, "step": 112800 }, { "epoch": 1.5175139116642922, "grad_norm": 0.6230987906455994, "learning_rate": 0.004951645103020229, "loss": 2.4626, "step": 112900 }, { "epoch": 1.5188580338181135, "grad_norm": 0.5158119797706604, "learning_rate": 0.00495154128450696, "loss": 2.4671, "step": 113000 }, { "epoch": 1.5188580338181135, "eval_MaskedAccuracy": 0.4421825141264522, "eval_loss": 2.7729415893554688, "eval_runtime": 153.7445, "eval_samples_per_second": 412.867, "eval_steps_per_second": 1.613, "step": 113000 }, { "epoch": 1.5202021559719348, "grad_norm": 1.2907798290252686, "learning_rate": 0.004951437355756309, "loss": 2.4693, "step": 113100 }, { "epoch": 1.521546278125756, "grad_norm": 1.6166244745254517, "learning_rate": 0.004951333316772972, "loss": 2.4585, "step": 113200 }, { "epoch": 1.5228904002795773, "grad_norm": 0.5393241047859192, "learning_rate": 0.004951229167561628, "loss": 2.4643, "step": 113300 }, { "epoch": 1.5242345224333986, "grad_norm": 0.7464740872383118, "learning_rate": 0.004951124908126974, "loss": 2.4576, "step": 113400 }, { "epoch": 1.52557864458722, "grad_norm": 0.47540420293807983, "learning_rate": 0.004951020538473705, "loss": 2.4567, "step": 113500 }, { "epoch": 1.5269227667410414, "grad_norm": 0.3154013454914093, "learning_rate": 0.00495091605860653, "loss": 2.4664, "step": 113600 }, { "epoch": 1.5282668888948627, "grad_norm": 0.5142553448677063, "learning_rate": 0.0049508114685301455, "loss": 2.4582, "step": 113700 }, { "epoch": 1.5296110110486842, "grad_norm": 1.193684697151184, "learning_rate": 0.004950706768249272, "loss": 2.4598, "step": 113800 }, { "epoch": 1.5309551332025055, "grad_norm": 0.3604363203048706, "learning_rate": 0.0049506019577686316, "loss": 2.4589, "step": 113900 }, { "epoch": 1.5322992553563268, "grad_norm": 0.5039530396461487, "learning_rate": 0.004950497037092934, "loss": 2.466, "step": 114000 }, { "epoch": 1.5322992553563268, "eval_MaskedAccuracy": 0.4417917832476142, "eval_loss": 2.775747299194336, "eval_runtime": 154.7989, "eval_samples_per_second": 410.055, "eval_steps_per_second": 1.602, "step": 114000 }, { "epoch": 1.5336433775101481, "grad_norm": 0.35583773255348206, "learning_rate": 0.004950392006226925, "loss": 2.4673, "step": 114100 }, { "epoch": 1.5349874996639694, "grad_norm": 0.35848045349121094, "learning_rate": 0.0049502868651753185, "loss": 2.4615, "step": 114200 }, { "epoch": 1.5363316218177907, "grad_norm": 1.3079990148544312, "learning_rate": 0.004950181613942862, "loss": 2.4687, "step": 114300 }, { "epoch": 1.537675743971612, "grad_norm": 1.409440279006958, "learning_rate": 0.004950076252534293, "loss": 2.4501, "step": 114400 }, { "epoch": 1.5390198661254335, "grad_norm": 2.4961087703704834, "learning_rate": 0.0049499707809543565, "loss": 2.461, "step": 114500 }, { "epoch": 1.5403639882792548, "grad_norm": 0.69174724817276, "learning_rate": 0.0049498651992078155, "loss": 2.4543, "step": 114600 }, { "epoch": 1.5417081104330763, "grad_norm": 1.2737400531768799, "learning_rate": 0.004949759507299418, "loss": 2.4584, "step": 114700 }, { "epoch": 1.5430522325868976, "grad_norm": 1.2695642709732056, "learning_rate": 0.00494965370523393, "loss": 2.4593, "step": 114800 }, { "epoch": 1.544396354740719, "grad_norm": 0.31750351190567017, "learning_rate": 0.004949547793016122, "loss": 2.4551, "step": 114900 }, { "epoch": 1.5457404768945402, "grad_norm": 0.38432586193084717, "learning_rate": 0.004949441770650766, "loss": 2.4576, "step": 115000 }, { "epoch": 1.5457404768945402, "eval_MaskedAccuracy": 0.4425237323528715, "eval_loss": 2.774029493331909, "eval_runtime": 156.0215, "eval_samples_per_second": 406.841, "eval_steps_per_second": 1.59, "step": 115000 }, { "epoch": 1.5470845990483615, "grad_norm": 0.34672319889068604, "learning_rate": 0.00494933563814263, "loss": 2.4499, "step": 115100 }, { "epoch": 1.5484287212021828, "grad_norm": 0.6219214200973511, "learning_rate": 0.004949229395496506, "loss": 2.4494, "step": 115200 }, { "epoch": 1.549772843356004, "grad_norm": 3.128021240234375, "learning_rate": 0.004949123042717173, "loss": 2.4652, "step": 115300 }, { "epoch": 1.5511169655098256, "grad_norm": 0.5915595889091492, "learning_rate": 0.004949016579809432, "loss": 2.4696, "step": 115400 }, { "epoch": 1.5524610876636469, "grad_norm": 1.1795876026153564, "learning_rate": 0.004948910006778072, "loss": 2.4595, "step": 115500 }, { "epoch": 1.5538052098174682, "grad_norm": 0.6927655339241028, "learning_rate": 0.004948803323627903, "loss": 2.4517, "step": 115600 }, { "epoch": 1.5551493319712897, "grad_norm": 0.35303354263305664, "learning_rate": 0.004948696530363729, "loss": 2.4593, "step": 115700 }, { "epoch": 1.556493454125111, "grad_norm": 2.31588077545166, "learning_rate": 0.004948589626990361, "loss": 2.4566, "step": 115800 }, { "epoch": 1.5578375762789323, "grad_norm": 0.37357497215270996, "learning_rate": 0.004948482613512615, "loss": 2.4553, "step": 115900 }, { "epoch": 1.5591816984327536, "grad_norm": 1.7756373882293701, "learning_rate": 0.0049483754899353125, "loss": 2.4547, "step": 116000 }, { "epoch": 1.5591816984327536, "eval_MaskedAccuracy": 0.4417388896389656, "eval_loss": 2.7752127647399902, "eval_runtime": 155.7219, "eval_samples_per_second": 407.624, "eval_steps_per_second": 1.593, "step": 116000 }, { "epoch": 1.5605258205865749, "grad_norm": 1.2487781047821045, "learning_rate": 0.004948268256263274, "loss": 2.4604, "step": 116100 }, { "epoch": 1.5618699427403961, "grad_norm": 0.5920694470405579, "learning_rate": 0.0049481609125013384, "loss": 2.4585, "step": 116200 }, { "epoch": 1.5632140648942174, "grad_norm": 1.373819351196289, "learning_rate": 0.004948053458654348, "loss": 2.4526, "step": 116300 }, { "epoch": 1.564558187048039, "grad_norm": 1.0958904027938843, "learning_rate": 0.004947945894727141, "loss": 2.4591, "step": 116400 }, { "epoch": 1.5659023092018602, "grad_norm": 4.045643329620361, "learning_rate": 0.004947838220724564, "loss": 2.4613, "step": 116500 }, { "epoch": 1.5672464313556818, "grad_norm": 1.2977932691574097, "learning_rate": 0.004947730436651466, "loss": 2.4564, "step": 116600 }, { "epoch": 1.568590553509503, "grad_norm": 1.1461297273635864, "learning_rate": 0.004947622542512704, "loss": 2.4542, "step": 116700 }, { "epoch": 1.5699346756633243, "grad_norm": 2.07126522064209, "learning_rate": 0.004947514538313141, "loss": 2.4584, "step": 116800 }, { "epoch": 1.5712787978171456, "grad_norm": 0.6101120114326477, "learning_rate": 0.004947406424057641, "loss": 2.4596, "step": 116900 }, { "epoch": 1.572622919970967, "grad_norm": 1.1047863960266113, "learning_rate": 0.0049472981997510735, "loss": 2.4563, "step": 117000 }, { "epoch": 1.572622919970967, "eval_MaskedAccuracy": 0.44274268801439726, "eval_loss": 2.769578218460083, "eval_runtime": 156.3839, "eval_samples_per_second": 405.899, "eval_steps_per_second": 1.586, "step": 117000 }, { "epoch": 1.5739670421247882, "grad_norm": 0.3559087812900543, "learning_rate": 0.004947189865398322, "loss": 2.4558, "step": 117100 }, { "epoch": 1.5753111642786095, "grad_norm": 1.209089994430542, "learning_rate": 0.004947081421004263, "loss": 2.4592, "step": 117200 }, { "epoch": 1.576655286432431, "grad_norm": 1.5720434188842773, "learning_rate": 0.004946972866573786, "loss": 2.4598, "step": 117300 }, { "epoch": 1.5779994085862523, "grad_norm": 0.33877983689308167, "learning_rate": 0.004946864202111778, "loss": 2.4539, "step": 117400 }, { "epoch": 1.5793435307400736, "grad_norm": 1.1109495162963867, "learning_rate": 0.004946755427623137, "loss": 2.4621, "step": 117500 }, { "epoch": 1.5806876528938951, "grad_norm": 0.32806700468063354, "learning_rate": 0.0049466465431127656, "loss": 2.4553, "step": 117600 }, { "epoch": 1.5820317750477164, "grad_norm": 1.0989540815353394, "learning_rate": 0.004946537548585571, "loss": 2.4632, "step": 117700 }, { "epoch": 1.5833758972015377, "grad_norm": 0.3961617350578308, "learning_rate": 0.004946428444046461, "loss": 2.4529, "step": 117800 }, { "epoch": 1.584720019355359, "grad_norm": 2.489666700363159, "learning_rate": 0.004946319229500352, "loss": 2.4506, "step": 117900 }, { "epoch": 1.5860641415091803, "grad_norm": 0.2874549627304077, "learning_rate": 0.004946209904952169, "loss": 2.451, "step": 118000 }, { "epoch": 1.5860641415091803, "eval_MaskedAccuracy": 0.44264748991328356, "eval_loss": 2.770142078399658, "eval_runtime": 153.5901, "eval_samples_per_second": 413.282, "eval_steps_per_second": 1.615, "step": 118000 }, { "epoch": 1.5874082636630016, "grad_norm": 2.0315134525299072, "learning_rate": 0.004946100470406832, "loss": 2.4472, "step": 118100 }, { "epoch": 1.5887523858168229, "grad_norm": 0.7130675911903381, "learning_rate": 0.004945990925869273, "loss": 2.4548, "step": 118200 }, { "epoch": 1.5900965079706444, "grad_norm": 0.9596794843673706, "learning_rate": 0.0049458812713444305, "loss": 2.462, "step": 118300 }, { "epoch": 1.5914406301244657, "grad_norm": 1.4742804765701294, "learning_rate": 0.004945771506837244, "loss": 2.4549, "step": 118400 }, { "epoch": 1.5927847522782872, "grad_norm": 1.3615772724151611, "learning_rate": 0.004945661632352659, "loss": 2.4574, "step": 118500 }, { "epoch": 1.5941288744321085, "grad_norm": 0.30037587881088257, "learning_rate": 0.004945551647895632, "loss": 2.4514, "step": 118600 }, { "epoch": 1.5954729965859298, "grad_norm": 1.7664852142333984, "learning_rate": 0.004945441553471114, "loss": 2.452, "step": 118700 }, { "epoch": 1.596817118739751, "grad_norm": 1.8963963985443115, "learning_rate": 0.004945331349084064, "loss": 2.4552, "step": 118800 }, { "epoch": 1.5981612408935724, "grad_norm": 0.5631555318832397, "learning_rate": 0.004945221034739453, "loss": 2.4525, "step": 118900 }, { "epoch": 1.5995053630473937, "grad_norm": 0.28932493925094604, "learning_rate": 0.00494511061044224, "loss": 2.4576, "step": 119000 }, { "epoch": 1.5995053630473937, "eval_MaskedAccuracy": 0.44307573744928136, "eval_loss": 2.768948793411255, "eval_runtime": 153.3984, "eval_samples_per_second": 413.798, "eval_steps_per_second": 1.617, "step": 119000 }, { "epoch": 1.600849485201215, "grad_norm": 1.037090539932251, "learning_rate": 0.0049450000761974144, "loss": 2.4576, "step": 119100 }, { "epoch": 1.6021936073550365, "grad_norm": 4.012909412384033, "learning_rate": 0.004944889432009946, "loss": 2.4638, "step": 119200 }, { "epoch": 1.6035377295088578, "grad_norm": 0.358288437128067, "learning_rate": 0.004944778677884832, "loss": 2.4572, "step": 119300 }, { "epoch": 1.6048818516626793, "grad_norm": 0.7761825323104858, "learning_rate": 0.004944667813827054, "loss": 2.4585, "step": 119400 }, { "epoch": 1.6062259738165006, "grad_norm": 1.2888187170028687, "learning_rate": 0.004944556839841606, "loss": 2.4593, "step": 119500 }, { "epoch": 1.6075700959703219, "grad_norm": 0.6099095344543457, "learning_rate": 0.0049444457559334925, "loss": 2.4618, "step": 119600 }, { "epoch": 1.6089142181241431, "grad_norm": 0.30497902631759644, "learning_rate": 0.004944334562107721, "loss": 2.4596, "step": 119700 }, { "epoch": 1.6102583402779644, "grad_norm": 3.6389715671539307, "learning_rate": 0.004944223258369297, "loss": 2.4551, "step": 119800 }, { "epoch": 1.6116024624317857, "grad_norm": 0.3001348078250885, "learning_rate": 0.004944111844723238, "loss": 2.4522, "step": 119900 }, { "epoch": 1.612946584585607, "grad_norm": 0.6880463361740112, "learning_rate": 0.004944000321174563, "loss": 2.4455, "step": 120000 }, { "epoch": 1.612946584585607, "eval_MaskedAccuracy": 0.44322096964619634, "eval_loss": 2.7688114643096924, "eval_runtime": 153.4713, "eval_samples_per_second": 413.602, "eval_steps_per_second": 1.616, "step": 120000 }, { "epoch": 1.6142907067394283, "grad_norm": 0.5405184626579285, "learning_rate": 0.0049438886877282975, "loss": 2.4582, "step": 120100 }, { "epoch": 1.6156348288932498, "grad_norm": 1.32143235206604, "learning_rate": 0.004943776944389477, "loss": 2.4501, "step": 120200 }, { "epoch": 1.6169789510470711, "grad_norm": 0.532461941242218, "learning_rate": 0.004943665091163127, "loss": 2.4512, "step": 120300 }, { "epoch": 1.6183230732008926, "grad_norm": 1.1819446086883545, "learning_rate": 0.004943553128054288, "loss": 2.4623, "step": 120400 }, { "epoch": 1.619667195354714, "grad_norm": 1.5699783563613892, "learning_rate": 0.004943441055068009, "loss": 2.4504, "step": 120500 }, { "epoch": 1.6210113175085352, "grad_norm": 1.6754220724105835, "learning_rate": 0.004943328872209337, "loss": 2.45, "step": 120600 }, { "epoch": 1.6223554396623565, "grad_norm": 0.35783451795578003, "learning_rate": 0.004943216579483333, "loss": 2.4411, "step": 120700 }, { "epoch": 1.6236995618161778, "grad_norm": 2.1771366596221924, "learning_rate": 0.004943104176895054, "loss": 2.4516, "step": 120800 }, { "epoch": 1.625043683969999, "grad_norm": 1.1872481107711792, "learning_rate": 0.0049429916644495656, "loss": 2.4464, "step": 120900 }, { "epoch": 1.6263878061238204, "grad_norm": 1.4642506837844849, "learning_rate": 0.004942879042151933, "loss": 2.461, "step": 121000 }, { "epoch": 1.6263878061238204, "eval_MaskedAccuracy": 0.4426167755231782, "eval_loss": 2.7695846557617188, "eval_runtime": 153.853, "eval_samples_per_second": 412.576, "eval_steps_per_second": 1.612, "step": 121000 }, { "epoch": 1.627731928277642, "grad_norm": 0.3797900080680847, "learning_rate": 0.004942766310007232, "loss": 2.4495, "step": 121100 }, { "epoch": 1.6290760504314632, "grad_norm": 0.4284350872039795, "learning_rate": 0.004942653468020548, "loss": 2.4434, "step": 121200 }, { "epoch": 1.6304201725852847, "grad_norm": 1.1178627014160156, "learning_rate": 0.004942540516196953, "loss": 2.4532, "step": 121300 }, { "epoch": 1.631764294739106, "grad_norm": 1.3800808191299438, "learning_rate": 0.0049424274545415425, "loss": 2.4582, "step": 121400 }, { "epoch": 1.6331084168929273, "grad_norm": 0.3553604781627655, "learning_rate": 0.0049423142830594185, "loss": 2.4546, "step": 121500 }, { "epoch": 1.6344525390467486, "grad_norm": 0.29325976967811584, "learning_rate": 0.004942201001755672, "loss": 2.449, "step": 121600 }, { "epoch": 1.6357966612005699, "grad_norm": 0.28023549914360046, "learning_rate": 0.004942087610635415, "loss": 2.4559, "step": 121700 }, { "epoch": 1.6371407833543912, "grad_norm": 0.6937592029571533, "learning_rate": 0.0049419741097037435, "loss": 2.458, "step": 121800 }, { "epoch": 1.6384849055082125, "grad_norm": 0.3913755416870117, "learning_rate": 0.004941860498965778, "loss": 2.4511, "step": 121900 }, { "epoch": 1.639829027662034, "grad_norm": 0.817623496055603, "learning_rate": 0.004941746778426644, "loss": 2.4551, "step": 122000 }, { "epoch": 1.639829027662034, "eval_MaskedAccuracy": 0.44339662189399426, "eval_loss": 2.764580249786377, "eval_runtime": 157.5643, "eval_samples_per_second": 402.858, "eval_steps_per_second": 1.574, "step": 122000 }, { "epoch": 1.6411731498158553, "grad_norm": 0.36455515027046204, "learning_rate": 0.004941632948091455, "loss": 2.4556, "step": 122100 }, { "epoch": 1.6425172719696766, "grad_norm": 0.5823009014129639, "learning_rate": 0.004941519007965352, "loss": 2.4511, "step": 122200 }, { "epoch": 1.643861394123498, "grad_norm": 0.7032744884490967, "learning_rate": 0.004941404958053464, "loss": 2.4545, "step": 122300 }, { "epoch": 1.6452055162773194, "grad_norm": 2.976278781890869, "learning_rate": 0.004941290798360926, "loss": 2.45, "step": 122400 }, { "epoch": 1.6465496384311407, "grad_norm": 0.37186741828918457, "learning_rate": 0.004941176528892885, "loss": 2.4524, "step": 122500 }, { "epoch": 1.647893760584962, "grad_norm": 1.0369256734848022, "learning_rate": 0.004941062149654489, "loss": 2.4522, "step": 122600 }, { "epoch": 1.6492378827387832, "grad_norm": 0.8359701037406921, "learning_rate": 0.004940947660650888, "loss": 2.4618, "step": 122700 }, { "epoch": 1.6505820048926045, "grad_norm": 0.3227209150791168, "learning_rate": 0.004940833061887247, "loss": 2.4419, "step": 122800 }, { "epoch": 1.6519261270464258, "grad_norm": 0.44361066818237305, "learning_rate": 0.00494071835336873, "loss": 2.4504, "step": 122900 }, { "epoch": 1.6532702492002473, "grad_norm": 0.9630382657051086, "learning_rate": 0.004940603535100503, "loss": 2.4494, "step": 123000 }, { "epoch": 1.6532702492002473, "eval_MaskedAccuracy": 0.4438552018371928, "eval_loss": 2.762709856033325, "eval_runtime": 153.7406, "eval_samples_per_second": 412.877, "eval_steps_per_second": 1.613, "step": 123000 }, { "epoch": 1.6546143713540686, "grad_norm": 1.0358442068099976, "learning_rate": 0.004940488607087739, "loss": 2.4472, "step": 123100 }, { "epoch": 1.6559584935078902, "grad_norm": 1.2041329145431519, "learning_rate": 0.004940373569335615, "loss": 2.4454, "step": 123200 }, { "epoch": 1.6573026156617114, "grad_norm": 1.5662907361984253, "learning_rate": 0.004940258421849318, "loss": 2.4495, "step": 123300 }, { "epoch": 1.6586467378155327, "grad_norm": 0.6521201133728027, "learning_rate": 0.004940143164634034, "loss": 2.4478, "step": 123400 }, { "epoch": 1.659990859969354, "grad_norm": 0.7364070415496826, "learning_rate": 0.004940027797694953, "loss": 2.4472, "step": 123500 }, { "epoch": 1.6613349821231753, "grad_norm": 0.5109328031539917, "learning_rate": 0.004939912321037278, "loss": 2.4537, "step": 123600 }, { "epoch": 1.6626791042769966, "grad_norm": 0.6922867298126221, "learning_rate": 0.004939796734666213, "loss": 2.456, "step": 123700 }, { "epoch": 1.664023226430818, "grad_norm": 0.8259190320968628, "learning_rate": 0.004939681038586966, "loss": 2.4501, "step": 123800 }, { "epoch": 1.6653673485846394, "grad_norm": 0.7042433619499207, "learning_rate": 0.004939565232804747, "loss": 2.4456, "step": 123900 }, { "epoch": 1.6667114707384607, "grad_norm": 1.328395128250122, "learning_rate": 0.0049394493173247804, "loss": 2.4599, "step": 124000 }, { "epoch": 1.6667114707384607, "eval_MaskedAccuracy": 0.44327534756472914, "eval_loss": 2.7654707431793213, "eval_runtime": 156.5562, "eval_samples_per_second": 405.452, "eval_steps_per_second": 1.584, "step": 124000 }, { "epoch": 1.668055592892282, "grad_norm": 1.040299892425537, "learning_rate": 0.0049393332921522795, "loss": 2.4475, "step": 124100 }, { "epoch": 1.6693997150461035, "grad_norm": 0.447637677192688, "learning_rate": 0.004939217157292479, "loss": 2.4522, "step": 124200 }, { "epoch": 1.6707438371999248, "grad_norm": 0.3728015422821045, "learning_rate": 0.004939100912750606, "loss": 2.4541, "step": 124300 }, { "epoch": 1.672087959353746, "grad_norm": 0.4562872350215912, "learning_rate": 0.0049389845585319095, "loss": 2.452, "step": 124400 }, { "epoch": 1.6734320815075674, "grad_norm": 1.0070672035217285, "learning_rate": 0.0049388680946416234, "loss": 2.4478, "step": 124500 }, { "epoch": 1.6747762036613887, "grad_norm": 0.3675072193145752, "learning_rate": 0.004938751521084995, "loss": 2.4464, "step": 124600 }, { "epoch": 1.67612032581521, "grad_norm": 1.3940047025680542, "learning_rate": 0.004938634837867282, "loss": 2.44, "step": 124700 }, { "epoch": 1.6774644479690313, "grad_norm": 0.6820449233055115, "learning_rate": 0.004938518044993735, "loss": 2.4448, "step": 124800 }, { "epoch": 1.6788085701228528, "grad_norm": 0.2972992956638336, "learning_rate": 0.004938401142469626, "loss": 2.4407, "step": 124900 }, { "epoch": 1.680152692276674, "grad_norm": 1.0912327766418457, "learning_rate": 0.004938284130300211, "loss": 2.4543, "step": 125000 }, { "epoch": 1.680152692276674, "eval_MaskedAccuracy": 0.4438537899485854, "eval_loss": 2.7616376876831055, "eval_runtime": 153.4081, "eval_samples_per_second": 413.772, "eval_steps_per_second": 1.617, "step": 125000 }, { "epoch": 1.6814968144304956, "grad_norm": 0.8189194798469543, "learning_rate": 0.004938167008490767, "loss": 2.4444, "step": 125100 }, { "epoch": 1.6828409365843169, "grad_norm": 1.2573928833007812, "learning_rate": 0.004938049777046575, "loss": 2.4477, "step": 125200 }, { "epoch": 1.6841850587381382, "grad_norm": 0.27490001916885376, "learning_rate": 0.004937932435972912, "loss": 2.4494, "step": 125300 }, { "epoch": 1.6855291808919595, "grad_norm": 2.5110983848571777, "learning_rate": 0.0049378149852750694, "loss": 2.4453, "step": 125400 }, { "epoch": 1.6868733030457808, "grad_norm": 0.5286669135093689, "learning_rate": 0.004937697424958338, "loss": 2.4499, "step": 125500 }, { "epoch": 1.688217425199602, "grad_norm": 1.0157794952392578, "learning_rate": 0.004937579755028025, "loss": 2.4499, "step": 125600 }, { "epoch": 1.6895615473534233, "grad_norm": 2.4944801330566406, "learning_rate": 0.004937461975489416, "loss": 2.4454, "step": 125700 }, { "epoch": 1.6909056695072449, "grad_norm": 0.9877827763557434, "learning_rate": 0.0049373440863478175, "loss": 2.4435, "step": 125800 }, { "epoch": 1.6922497916610661, "grad_norm": 0.28307971358299255, "learning_rate": 0.004937226087608559, "loss": 2.4502, "step": 125900 }, { "epoch": 1.6935939138148874, "grad_norm": 0.6857683658599854, "learning_rate": 0.0049371079792769395, "loss": 2.4459, "step": 126000 }, { "epoch": 1.6935939138148874, "eval_MaskedAccuracy": 0.4442026999946834, "eval_loss": 2.760291576385498, "eval_runtime": 154.5053, "eval_samples_per_second": 410.834, "eval_steps_per_second": 1.605, "step": 126000 }, { "epoch": 1.694938035968709, "grad_norm": 0.3602958023548126, "learning_rate": 0.0049369897613582864, "loss": 2.4459, "step": 126100 }, { "epoch": 1.6962821581225302, "grad_norm": 1.104148507118225, "learning_rate": 0.004936871433857933, "loss": 2.452, "step": 126200 }, { "epoch": 1.6976262802763515, "grad_norm": 1.409397840499878, "learning_rate": 0.004936752996781202, "loss": 2.4469, "step": 126300 }, { "epoch": 1.6989704024301728, "grad_norm": 1.9952348470687866, "learning_rate": 0.004936634450133437, "loss": 2.4584, "step": 126400 }, { "epoch": 1.7003145245839941, "grad_norm": 1.7229875326156616, "learning_rate": 0.004936515793919977, "loss": 2.4491, "step": 126500 }, { "epoch": 1.7016586467378154, "grad_norm": 0.6407601237297058, "learning_rate": 0.00493639702814616, "loss": 2.4525, "step": 126600 }, { "epoch": 1.7030027688916367, "grad_norm": 1.1715171337127686, "learning_rate": 0.004936278152817347, "loss": 2.4403, "step": 126700 }, { "epoch": 1.7043468910454582, "grad_norm": 0.6220793128013611, "learning_rate": 0.004936159167938901, "loss": 2.4487, "step": 126800 }, { "epoch": 1.7056910131992795, "grad_norm": 0.5494009256362915, "learning_rate": 0.004936040073516175, "loss": 2.4331, "step": 126900 }, { "epoch": 1.707035135353101, "grad_norm": 1.780874252319336, "learning_rate": 0.004935920869554534, "loss": 2.4391, "step": 127000 }, { "epoch": 1.707035135353101, "eval_MaskedAccuracy": 0.4440303666034177, "eval_loss": 2.7617318630218506, "eval_runtime": 156.1849, "eval_samples_per_second": 406.416, "eval_steps_per_second": 1.588, "step": 127000 }, { "epoch": 1.7083792575069223, "grad_norm": 1.470224142074585, "learning_rate": 0.004935801556059347, "loss": 2.4416, "step": 127100 }, { "epoch": 1.7097233796607436, "grad_norm": 0.2865951359272003, "learning_rate": 0.00493568213303599, "loss": 2.4505, "step": 127200 }, { "epoch": 1.711067501814565, "grad_norm": 1.1740427017211914, "learning_rate": 0.004935562600489849, "loss": 2.4505, "step": 127300 }, { "epoch": 1.7124116239683862, "grad_norm": 0.6032086610794067, "learning_rate": 0.0049354429584263105, "loss": 2.4482, "step": 127400 }, { "epoch": 1.7137557461222075, "grad_norm": 1.0448052883148193, "learning_rate": 0.0049353232068507615, "loss": 2.45, "step": 127500 }, { "epoch": 1.7150998682760288, "grad_norm": 0.433826208114624, "learning_rate": 0.004935203345768603, "loss": 2.4493, "step": 127600 }, { "epoch": 1.7164439904298503, "grad_norm": 1.4232683181762695, "learning_rate": 0.004935083375185232, "loss": 2.4463, "step": 127700 }, { "epoch": 1.7177881125836716, "grad_norm": 1.5276943445205688, "learning_rate": 0.004934963295106055, "loss": 2.4501, "step": 127800 }, { "epoch": 1.719132234737493, "grad_norm": 1.1934447288513184, "learning_rate": 0.0049348431055364775, "loss": 2.4529, "step": 127900 }, { "epoch": 1.7204763568913144, "grad_norm": 0.4801059663295746, "learning_rate": 0.004934722806481924, "loss": 2.4412, "step": 128000 }, { "epoch": 1.7204763568913144, "eval_MaskedAccuracy": 0.4440363882564416, "eval_loss": 2.758544683456421, "eval_runtime": 156.4492, "eval_samples_per_second": 405.729, "eval_steps_per_second": 1.585, "step": 128000 }, { "epoch": 1.7218204790451357, "grad_norm": 0.24669547379016876, "learning_rate": 0.004934602397947807, "loss": 2.4503, "step": 128100 }, { "epoch": 1.723164601198957, "grad_norm": 0.8044118285179138, "learning_rate": 0.00493448187993956, "loss": 2.4397, "step": 128200 }, { "epoch": 1.7245087233527783, "grad_norm": 0.3931942582130432, "learning_rate": 0.004934361252462605, "loss": 2.4423, "step": 128300 }, { "epoch": 1.7258528455065996, "grad_norm": 0.3876301944255829, "learning_rate": 0.004934240515522382, "loss": 2.4393, "step": 128400 }, { "epoch": 1.7271969676604209, "grad_norm": 1.6018636226654053, "learning_rate": 0.004934119669124325, "loss": 2.4476, "step": 128500 }, { "epoch": 1.7285410898142424, "grad_norm": 2.163212776184082, "learning_rate": 0.004933998713273891, "loss": 2.4496, "step": 128600 }, { "epoch": 1.7298852119680637, "grad_norm": 0.32517024874687195, "learning_rate": 0.004933877647976517, "loss": 2.4492, "step": 128700 }, { "epoch": 1.731229334121885, "grad_norm": 0.843076765537262, "learning_rate": 0.00493375647323767, "loss": 2.4452, "step": 128800 }, { "epoch": 1.7325734562757065, "grad_norm": 0.7287679314613342, "learning_rate": 0.004933635189062801, "loss": 2.4411, "step": 128900 }, { "epoch": 1.7339175784295278, "grad_norm": 0.561202347278595, "learning_rate": 0.004933513795457381, "loss": 2.437, "step": 129000 }, { "epoch": 1.7339175784295278, "eval_MaskedAccuracy": 0.4451868773268175, "eval_loss": 2.7529821395874023, "eval_runtime": 154.118, "eval_samples_per_second": 411.866, "eval_steps_per_second": 1.609, "step": 129000 }, { "epoch": 1.735261700583349, "grad_norm": 1.201848030090332, "learning_rate": 0.0049333922924268804, "loss": 2.4469, "step": 129100 }, { "epoch": 1.7366058227371703, "grad_norm": 0.7610087990760803, "learning_rate": 0.004933270679976775, "loss": 2.4427, "step": 129200 }, { "epoch": 1.7379499448909916, "grad_norm": 0.4432719051837921, "learning_rate": 0.004933148958112536, "loss": 2.4431, "step": 129300 }, { "epoch": 1.739294067044813, "grad_norm": 0.4039577841758728, "learning_rate": 0.004933027126839647, "loss": 2.4416, "step": 129400 }, { "epoch": 1.7406381891986342, "grad_norm": 0.47328197956085205, "learning_rate": 0.004932905186163602, "loss": 2.4409, "step": 129500 }, { "epoch": 1.7419823113524557, "grad_norm": 0.9691961407661438, "learning_rate": 0.004932783136089899, "loss": 2.4415, "step": 129600 }, { "epoch": 1.743326433506277, "grad_norm": 0.8540052771568298, "learning_rate": 0.004932660976624034, "loss": 2.444, "step": 129700 }, { "epoch": 1.7446705556600985, "grad_norm": 1.1006224155426025, "learning_rate": 0.004932538707771508, "loss": 2.4509, "step": 129800 }, { "epoch": 1.7460146778139198, "grad_norm": 2.2226502895355225, "learning_rate": 0.004932416329537838, "loss": 2.4423, "step": 129900 }, { "epoch": 1.7473587999677411, "grad_norm": 0.42202070355415344, "learning_rate": 0.004932293841928537, "loss": 2.4394, "step": 130000 }, { "epoch": 1.7473587999677411, "eval_MaskedAccuracy": 0.44459195138244084, "eval_loss": 2.756187915802002, "eval_runtime": 153.5748, "eval_samples_per_second": 413.323, "eval_steps_per_second": 1.615, "step": 130000 }, { "epoch": 1.7487029221215624, "grad_norm": 0.47480854392051697, "learning_rate": 0.004932171244949114, "loss": 2.4463, "step": 130100 }, { "epoch": 1.7500470442753837, "grad_norm": 0.9166473150253296, "learning_rate": 0.004932048538605108, "loss": 2.4467, "step": 130200 }, { "epoch": 1.751391166429205, "grad_norm": 0.4468994438648224, "learning_rate": 0.0049319257229020344, "loss": 2.4422, "step": 130300 }, { "epoch": 1.7527352885830263, "grad_norm": 2.049225091934204, "learning_rate": 0.004931802797845434, "loss": 2.4315, "step": 130400 }, { "epoch": 1.7540794107368478, "grad_norm": 0.9961231350898743, "learning_rate": 0.004931679763440849, "loss": 2.4394, "step": 130500 }, { "epoch": 1.755423532890669, "grad_norm": 0.8406715989112854, "learning_rate": 0.00493155661969382, "loss": 2.4436, "step": 130600 }, { "epoch": 1.7567676550444904, "grad_norm": 0.6432211995124817, "learning_rate": 0.004931433366609882, "loss": 2.4352, "step": 130700 }, { "epoch": 1.758111777198312, "grad_norm": 0.34998586773872375, "learning_rate": 0.004931310004194613, "loss": 2.4438, "step": 130800 }, { "epoch": 1.7594558993521332, "grad_norm": 0.8097212910652161, "learning_rate": 0.00493118653245356, "loss": 2.4447, "step": 130900 }, { "epoch": 1.7608000215059545, "grad_norm": 0.43594908714294434, "learning_rate": 0.004931062951392285, "loss": 2.4391, "step": 131000 }, { "epoch": 1.7608000215059545, "eval_MaskedAccuracy": 0.4449512847956205, "eval_loss": 2.7537848949432373, "eval_runtime": 153.3786, "eval_samples_per_second": 413.852, "eval_steps_per_second": 1.617, "step": 131000 }, { "epoch": 1.7621441436597758, "grad_norm": 0.555332362651825, "learning_rate": 0.004930939261016357, "loss": 2.439, "step": 131100 }, { "epoch": 1.763488265813597, "grad_norm": 0.6408087611198425, "learning_rate": 0.00493081546133135, "loss": 2.4447, "step": 131200 }, { "epoch": 1.7648323879674184, "grad_norm": 0.2732897996902466, "learning_rate": 0.0049306915523428425, "loss": 2.445, "step": 131300 }, { "epoch": 1.7661765101212397, "grad_norm": 0.7091703414916992, "learning_rate": 0.0049305675340564105, "loss": 2.4377, "step": 131400 }, { "epoch": 1.7675206322750612, "grad_norm": 0.3678547739982605, "learning_rate": 0.004930443406477648, "loss": 2.4239, "step": 131500 }, { "epoch": 1.7688647544288825, "grad_norm": 0.7927549481391907, "learning_rate": 0.004930319169612157, "loss": 2.4298, "step": 131600 }, { "epoch": 1.770208876582704, "grad_norm": 0.8828220963478088, "learning_rate": 0.004930194823465527, "loss": 2.4391, "step": 131700 }, { "epoch": 1.7715529987365253, "grad_norm": 0.35567620396614075, "learning_rate": 0.00493007036804336, "loss": 2.4399, "step": 131800 }, { "epoch": 1.7728971208903466, "grad_norm": 0.5886101722717285, "learning_rate": 0.0049299458033512655, "loss": 2.4472, "step": 131900 }, { "epoch": 1.7742412430441679, "grad_norm": 0.3277896046638489, "learning_rate": 0.0049298211293948594, "loss": 2.4406, "step": 132000 }, { "epoch": 1.7742412430441679, "eval_MaskedAccuracy": 0.44539411736195783, "eval_loss": 2.7525792121887207, "eval_runtime": 154.9926, "eval_samples_per_second": 409.542, "eval_steps_per_second": 1.6, "step": 132000 }, { "epoch": 1.7755853651979892, "grad_norm": 0.967954695224762, "learning_rate": 0.004929696346179746, "loss": 2.445, "step": 132100 }, { "epoch": 1.7769294873518104, "grad_norm": 0.38069549202919006, "learning_rate": 0.004929571453711563, "loss": 2.4397, "step": 132200 }, { "epoch": 1.7782736095056317, "grad_norm": 0.7181752920150757, "learning_rate": 0.004929446451995928, "loss": 2.4411, "step": 132300 }, { "epoch": 1.7796177316594533, "grad_norm": 0.7660533785820007, "learning_rate": 0.004929321341038483, "loss": 2.4344, "step": 132400 }, { "epoch": 1.7809618538132745, "grad_norm": 1.3645398616790771, "learning_rate": 0.004929196120844856, "loss": 2.4429, "step": 132500 }, { "epoch": 1.7823059759670958, "grad_norm": 0.40024033188819885, "learning_rate": 0.004929070791420691, "loss": 2.4525, "step": 132600 }, { "epoch": 1.7836500981209173, "grad_norm": 2.2568159103393555, "learning_rate": 0.004928945352771635, "loss": 2.443, "step": 132700 }, { "epoch": 1.7849942202747386, "grad_norm": 0.7219163775444031, "learning_rate": 0.004928819804903343, "loss": 2.4398, "step": 132800 }, { "epoch": 1.78633834242856, "grad_norm": 1.4869344234466553, "learning_rate": 0.004928694147821474, "loss": 2.4397, "step": 132900 }, { "epoch": 1.7876824645823812, "grad_norm": 0.3790728747844696, "learning_rate": 0.0049285683815316825, "loss": 2.4328, "step": 133000 }, { "epoch": 1.7876824645823812, "eval_MaskedAccuracy": 0.44502501594055116, "eval_loss": 2.751193046569824, "eval_runtime": 154.0912, "eval_samples_per_second": 411.938, "eval_steps_per_second": 1.609, "step": 133000 }, { "epoch": 1.7890265867362025, "grad_norm": 0.836524486541748, "learning_rate": 0.00492844250603964, "loss": 2.4433, "step": 133100 }, { "epoch": 1.7903707088900238, "grad_norm": 0.29098621010780334, "learning_rate": 0.004928316521351023, "loss": 2.4376, "step": 133200 }, { "epoch": 1.791714831043845, "grad_norm": 0.4187617003917694, "learning_rate": 0.0049281904274715025, "loss": 2.4439, "step": 133300 }, { "epoch": 1.7930589531976666, "grad_norm": 1.8081835508346558, "learning_rate": 0.004928064224406764, "loss": 2.4372, "step": 133400 }, { "epoch": 1.794403075351488, "grad_norm": 2.1965842247009277, "learning_rate": 0.0049279379121624925, "loss": 2.4302, "step": 133500 }, { "epoch": 1.7957471975053094, "grad_norm": 1.1956915855407715, "learning_rate": 0.004927811490744374, "loss": 2.4379, "step": 133600 }, { "epoch": 1.7970913196591307, "grad_norm": 0.7906240820884705, "learning_rate": 0.004927684960158106, "loss": 2.4423, "step": 133700 }, { "epoch": 1.798435441812952, "grad_norm": 0.24601203203201294, "learning_rate": 0.004927558320409392, "loss": 2.4497, "step": 133800 }, { "epoch": 1.7997795639667733, "grad_norm": 1.7259650230407715, "learning_rate": 0.004927431571503945, "loss": 2.4301, "step": 133900 }, { "epoch": 1.8011236861205946, "grad_norm": 0.3489820957183838, "learning_rate": 0.004927304713447465, "loss": 2.4494, "step": 134000 }, { "epoch": 1.8011236861205946, "eval_MaskedAccuracy": 0.4454901993570299, "eval_loss": 2.751655101776123, "eval_runtime": 155.8316, "eval_samples_per_second": 407.337, "eval_steps_per_second": 1.591, "step": 134000 }, { "epoch": 1.8024678082744159, "grad_norm": 0.9807569980621338, "learning_rate": 0.004927177746245671, "loss": 2.4397, "step": 134100 }, { "epoch": 1.8038119304282372, "grad_norm": 1.5453386306762695, "learning_rate": 0.004927050669904284, "loss": 2.4467, "step": 134200 }, { "epoch": 1.8051560525820587, "grad_norm": 0.9938260912895203, "learning_rate": 0.004926923484429032, "loss": 2.4361, "step": 134300 }, { "epoch": 1.80650017473588, "grad_norm": 1.328628420829773, "learning_rate": 0.004926796189825648, "loss": 2.4473, "step": 134400 }, { "epoch": 1.8078442968897015, "grad_norm": 0.565514087677002, "learning_rate": 0.004926668786099871, "loss": 2.4383, "step": 134500 }, { "epoch": 1.8091884190435228, "grad_norm": 3.7751681804656982, "learning_rate": 0.004926541273257434, "loss": 2.4386, "step": 134600 }, { "epoch": 1.810532541197344, "grad_norm": 1.7504175901412964, "learning_rate": 0.004926413651304083, "loss": 2.4384, "step": 134700 }, { "epoch": 1.8118766633511654, "grad_norm": 0.6130387783050537, "learning_rate": 0.004926285920245568, "loss": 2.437, "step": 134800 }, { "epoch": 1.8132207855049867, "grad_norm": 0.5119666457176208, "learning_rate": 0.004926158080087647, "loss": 2.4349, "step": 134900 }, { "epoch": 1.814564907658808, "grad_norm": 1.4035404920578003, "learning_rate": 0.004926030130836081, "loss": 2.4344, "step": 135000 }, { "epoch": 1.814564907658808, "eval_MaskedAccuracy": 0.44536970526231284, "eval_loss": 2.7522668838500977, "eval_runtime": 154.4591, "eval_samples_per_second": 410.957, "eval_steps_per_second": 1.606, "step": 135000 }, { "epoch": 1.8159090298126292, "grad_norm": 1.2096890211105347, "learning_rate": 0.00492590207249663, "loss": 2.4333, "step": 135100 }, { "epoch": 1.8172531519664508, "grad_norm": 0.8168133497238159, "learning_rate": 0.004925773905075064, "loss": 2.4354, "step": 135200 }, { "epoch": 1.818597274120272, "grad_norm": 0.46834301948547363, "learning_rate": 0.004925645628577168, "loss": 2.4345, "step": 135300 }, { "epoch": 1.8199413962740933, "grad_norm": 0.7720035910606384, "learning_rate": 0.004925517243008721, "loss": 2.447, "step": 135400 }, { "epoch": 1.8212855184279149, "grad_norm": 1.1966851949691772, "learning_rate": 0.0049253887483755, "loss": 2.4282, "step": 135500 }, { "epoch": 1.8226296405817362, "grad_norm": 1.4665566682815552, "learning_rate": 0.004925260144683294, "loss": 2.4311, "step": 135600 }, { "epoch": 1.8239737627355574, "grad_norm": 0.6458213925361633, "learning_rate": 0.004925131431937897, "loss": 2.4453, "step": 135700 }, { "epoch": 1.8253178848893787, "grad_norm": 0.864764392375946, "learning_rate": 0.004925002610145116, "loss": 2.4315, "step": 135800 }, { "epoch": 1.8266620070432, "grad_norm": 1.5035104751586914, "learning_rate": 0.004924873679310756, "loss": 2.435, "step": 135900 }, { "epoch": 1.8280061291970213, "grad_norm": 0.5938438773155212, "learning_rate": 0.004924744639440619, "loss": 2.4461, "step": 136000 }, { "epoch": 1.8280061291970213, "eval_MaskedAccuracy": 0.44473731371748054, "eval_loss": 2.752194881439209, "eval_runtime": 153.2163, "eval_samples_per_second": 414.29, "eval_steps_per_second": 1.619, "step": 136000 }, { "epoch": 1.8293502513508426, "grad_norm": 1.69126558303833, "learning_rate": 0.00492461549054052, "loss": 2.4375, "step": 136100 }, { "epoch": 1.8306943735046641, "grad_norm": 1.1451964378356934, "learning_rate": 0.004924486232616281, "loss": 2.4311, "step": 136200 }, { "epoch": 1.8320384956584854, "grad_norm": 0.33867311477661133, "learning_rate": 0.004924356865673731, "loss": 2.4365, "step": 136300 }, { "epoch": 1.833382617812307, "grad_norm": 0.7080395221710205, "learning_rate": 0.0049242273897186955, "loss": 2.438, "step": 136400 }, { "epoch": 1.8347267399661282, "grad_norm": 0.8327525854110718, "learning_rate": 0.0049240978047570055, "loss": 2.4409, "step": 136500 }, { "epoch": 1.8360708621199495, "grad_norm": 0.296032190322876, "learning_rate": 0.004923968110794505, "loss": 2.4377, "step": 136600 }, { "epoch": 1.8374149842737708, "grad_norm": 0.8659167289733887, "learning_rate": 0.00492383830783703, "loss": 2.4317, "step": 136700 }, { "epoch": 1.838759106427592, "grad_norm": 0.5200401544570923, "learning_rate": 0.004923708395890436, "loss": 2.445, "step": 136800 }, { "epoch": 1.8401032285814134, "grad_norm": 1.0861247777938843, "learning_rate": 0.004923578374960567, "loss": 2.4332, "step": 136900 }, { "epoch": 1.8414473507352347, "grad_norm": 0.3223403990268707, "learning_rate": 0.004923448245053295, "loss": 2.4422, "step": 137000 }, { "epoch": 1.8414473507352347, "eval_MaskedAccuracy": 0.4446580648304679, "eval_loss": 2.7543158531188965, "eval_runtime": 153.5392, "eval_samples_per_second": 413.419, "eval_steps_per_second": 1.615, "step": 137000 }, { "epoch": 1.8427914728890562, "grad_norm": 0.5863522887229919, "learning_rate": 0.004923318006174473, "loss": 2.4385, "step": 137100 }, { "epoch": 1.8441355950428775, "grad_norm": 0.2514398694038391, "learning_rate": 0.004923187658329979, "loss": 2.4398, "step": 137200 }, { "epoch": 1.8454797171966988, "grad_norm": 1.2516361474990845, "learning_rate": 0.004923057201525681, "loss": 2.4374, "step": 137300 }, { "epoch": 1.8468238393505203, "grad_norm": 0.6415001153945923, "learning_rate": 0.004922926635767458, "loss": 2.4277, "step": 137400 }, { "epoch": 1.8481679615043416, "grad_norm": 0.34172287583351135, "learning_rate": 0.004922795961061186, "loss": 2.425, "step": 137500 }, { "epoch": 1.8495120836581629, "grad_norm": 2.2421274185180664, "learning_rate": 0.004922665177412763, "loss": 2.4364, "step": 137600 }, { "epoch": 1.8508562058119842, "grad_norm": 0.48479756712913513, "learning_rate": 0.00492253428482808, "loss": 2.4314, "step": 137700 }, { "epoch": 1.8522003279658055, "grad_norm": 0.8086810111999512, "learning_rate": 0.00492240328331303, "loss": 2.4392, "step": 137800 }, { "epoch": 1.8535444501196268, "grad_norm": 0.8768898844718933, "learning_rate": 0.004922272172873519, "loss": 2.44, "step": 137900 }, { "epoch": 1.854888572273448, "grad_norm": 0.7095423936843872, "learning_rate": 0.004922140953515459, "loss": 2.4383, "step": 138000 }, { "epoch": 1.854888572273448, "eval_MaskedAccuracy": 0.44575168545661686, "eval_loss": 2.747347116470337, "eval_runtime": 154.9017, "eval_samples_per_second": 409.782, "eval_steps_per_second": 1.601, "step": 138000 }, { "epoch": 1.8562326944272696, "grad_norm": 0.3725392818450928, "learning_rate": 0.004922009625244753, "loss": 2.4307, "step": 138100 }, { "epoch": 1.8575768165810909, "grad_norm": 0.3521710932254791, "learning_rate": 0.004921878188067327, "loss": 2.4272, "step": 138200 }, { "epoch": 1.8589209387349124, "grad_norm": 1.7538436651229858, "learning_rate": 0.004921746641989098, "loss": 2.4312, "step": 138300 }, { "epoch": 1.8602650608887337, "grad_norm": 0.7190144658088684, "learning_rate": 0.004921614987015991, "loss": 2.4334, "step": 138400 }, { "epoch": 1.861609183042555, "grad_norm": 0.3224917948246002, "learning_rate": 0.004921483223153946, "loss": 2.4315, "step": 138500 }, { "epoch": 1.8629533051963763, "grad_norm": 0.6861767172813416, "learning_rate": 0.004921351350408901, "loss": 2.432, "step": 138600 }, { "epoch": 1.8642974273501975, "grad_norm": 0.32877910137176514, "learning_rate": 0.004921219368786787, "loss": 2.4436, "step": 138700 }, { "epoch": 1.8656415495040188, "grad_norm": 0.8731430172920227, "learning_rate": 0.004921087278293557, "loss": 2.4346, "step": 138800 }, { "epoch": 1.8669856716578401, "grad_norm": 0.31697648763656616, "learning_rate": 0.0049209550789351625, "loss": 2.4353, "step": 138900 }, { "epoch": 1.8683297938116616, "grad_norm": 0.30971816182136536, "learning_rate": 0.004920822770717562, "loss": 2.4242, "step": 139000 }, { "epoch": 1.8683297938116616, "eval_MaskedAccuracy": 0.4455395990497761, "eval_loss": 2.749335289001465, "eval_runtime": 153.4613, "eval_samples_per_second": 413.629, "eval_steps_per_second": 1.616, "step": 139000 }, { "epoch": 1.869673915965483, "grad_norm": 0.32947519421577454, "learning_rate": 0.004920690353646715, "loss": 2.4308, "step": 139100 }, { "epoch": 1.8710180381193042, "grad_norm": 3.411869525909424, "learning_rate": 0.004920557827728589, "loss": 2.426, "step": 139200 }, { "epoch": 1.8723621602731257, "grad_norm": 0.3665344715118408, "learning_rate": 0.004920425192969151, "loss": 2.4347, "step": 139300 }, { "epoch": 1.873706282426947, "grad_norm": 0.7894236445426941, "learning_rate": 0.0049202924493743794, "loss": 2.435, "step": 139400 }, { "epoch": 1.8750504045807683, "grad_norm": 0.37953993678092957, "learning_rate": 0.00492015959695027, "loss": 2.4363, "step": 139500 }, { "epoch": 1.8763945267345896, "grad_norm": 0.32298392057418823, "learning_rate": 0.004920026635702793, "loss": 2.4356, "step": 139600 }, { "epoch": 1.877738648888411, "grad_norm": 1.1138973236083984, "learning_rate": 0.004919893565637941, "loss": 2.4246, "step": 139700 }, { "epoch": 1.8790827710422322, "grad_norm": 0.4021407663822174, "learning_rate": 0.00491976038676171, "loss": 2.4395, "step": 139800 }, { "epoch": 1.8804268931960535, "grad_norm": 0.37863534688949585, "learning_rate": 0.004919627099080102, "loss": 2.4341, "step": 139900 }, { "epoch": 1.881771015349875, "grad_norm": 1.6591562032699585, "learning_rate": 0.004919493702599126, "loss": 2.4329, "step": 140000 }, { "epoch": 1.881771015349875, "eval_MaskedAccuracy": 0.4456517448251654, "eval_loss": 2.7490234375, "eval_runtime": 144.1126, "eval_samples_per_second": 440.461, "eval_steps_per_second": 1.721, "step": 140000 }, { "epoch": 1.8831151375036963, "grad_norm": 0.8508136868476868, "learning_rate": 0.004919360197324791, "loss": 2.4389, "step": 140100 }, { "epoch": 1.8844592596575178, "grad_norm": 0.9984667301177979, "learning_rate": 0.004919226583263112, "loss": 2.4296, "step": 140200 }, { "epoch": 1.885803381811339, "grad_norm": 1.3535406589508057, "learning_rate": 0.004919092860420102, "loss": 2.4339, "step": 140300 }, { "epoch": 1.8871475039651604, "grad_norm": 0.6479485630989075, "learning_rate": 0.004918959028801795, "loss": 2.4319, "step": 140400 }, { "epoch": 1.8884916261189817, "grad_norm": 0.6749982237815857, "learning_rate": 0.004918825088414224, "loss": 2.433, "step": 140500 }, { "epoch": 1.889835748272803, "grad_norm": 0.31086307764053345, "learning_rate": 0.0049186910392634235, "loss": 2.435, "step": 140600 }, { "epoch": 1.8911798704266243, "grad_norm": 0.5695422291755676, "learning_rate": 0.0049185568813554185, "loss": 2.432, "step": 140700 }, { "epoch": 1.8925239925804456, "grad_norm": 0.7300012707710266, "learning_rate": 0.004918422614696274, "loss": 2.4368, "step": 140800 }, { "epoch": 1.893868114734267, "grad_norm": 0.415073424577713, "learning_rate": 0.004918288239292025, "loss": 2.443, "step": 140900 }, { "epoch": 1.8952122368880884, "grad_norm": 0.9157136678695679, "learning_rate": 0.0049181537551487315, "loss": 2.4297, "step": 141000 }, { "epoch": 1.8952122368880884, "eval_MaskedAccuracy": 0.4457131985564527, "eval_loss": 2.7454721927642822, "eval_runtime": 153.2602, "eval_samples_per_second": 414.172, "eval_steps_per_second": 1.618, "step": 141000 }, { "epoch": 1.89655635904191, "grad_norm": 0.43132588267326355, "learning_rate": 0.00491801916227246, "loss": 2.4381, "step": 141100 }, { "epoch": 1.8979004811957312, "grad_norm": 1.6390697956085205, "learning_rate": 0.004917884460669262, "loss": 2.4192, "step": 141200 }, { "epoch": 1.8992446033495525, "grad_norm": 0.5319965481758118, "learning_rate": 0.004917749650345217, "loss": 2.4424, "step": 141300 }, { "epoch": 1.9005887255033738, "grad_norm": 0.43463683128356934, "learning_rate": 0.004917614731306398, "loss": 2.4381, "step": 141400 }, { "epoch": 1.901932847657195, "grad_norm": 1.1953731775283813, "learning_rate": 0.004917479703558886, "loss": 2.4382, "step": 141500 }, { "epoch": 1.9032769698110164, "grad_norm": 3.3171629905700684, "learning_rate": 0.004917344567108767, "loss": 2.4285, "step": 141600 }, { "epoch": 1.9046210919648376, "grad_norm": 1.354646921157837, "learning_rate": 0.004917209321962115, "loss": 2.4329, "step": 141700 }, { "epoch": 1.9059652141186592, "grad_norm": 0.46209368109703064, "learning_rate": 0.004917073968125036, "loss": 2.4287, "step": 141800 }, { "epoch": 1.9073093362724804, "grad_norm": 0.2556077241897583, "learning_rate": 0.004916938505603623, "loss": 2.4351, "step": 141900 }, { "epoch": 1.9086534584263017, "grad_norm": 0.9437853097915649, "learning_rate": 0.004916802934403986, "loss": 2.4264, "step": 142000 }, { "epoch": 1.9086534584263017, "eval_MaskedAccuracy": 0.44617851609047027, "eval_loss": 2.7451670169830322, "eval_runtime": 156.2618, "eval_samples_per_second": 406.216, "eval_steps_per_second": 1.587, "step": 142000 }, { "epoch": 1.9099975805801233, "grad_norm": 0.6614341139793396, "learning_rate": 0.004916667254532234, "loss": 2.4258, "step": 142100 }, { "epoch": 1.9113417027339445, "grad_norm": 2.363129138946533, "learning_rate": 0.004916531465994474, "loss": 2.4318, "step": 142200 }, { "epoch": 1.9126858248877658, "grad_norm": 0.27388179302215576, "learning_rate": 0.004916395568796826, "loss": 2.4305, "step": 142300 }, { "epoch": 1.9140299470415871, "grad_norm": 1.5506696701049805, "learning_rate": 0.004916259562945416, "loss": 2.4361, "step": 142400 }, { "epoch": 1.9153740691954084, "grad_norm": 0.8063324093818665, "learning_rate": 0.004916123448446375, "loss": 2.4362, "step": 142500 }, { "epoch": 1.9167181913492297, "grad_norm": 0.9321148991584778, "learning_rate": 0.004915987225305831, "loss": 2.4293, "step": 142600 }, { "epoch": 1.918062313503051, "grad_norm": 0.9221600294113159, "learning_rate": 0.004915850893529926, "loss": 2.4282, "step": 142700 }, { "epoch": 1.9194064356568725, "grad_norm": 0.31020745635032654, "learning_rate": 0.004915714453124789, "loss": 2.4326, "step": 142800 }, { "epoch": 1.9207505578106938, "grad_norm": 0.9556496143341064, "learning_rate": 0.004915577904096585, "loss": 2.4317, "step": 142900 }, { "epoch": 1.9220946799645153, "grad_norm": 0.9458506107330322, "learning_rate": 0.004915441246451457, "loss": 2.4328, "step": 143000 }, { "epoch": 1.9220946799645153, "eval_MaskedAccuracy": 0.44678351331325233, "eval_loss": 2.7399027347564697, "eval_runtime": 154.7107, "eval_samples_per_second": 410.288, "eval_steps_per_second": 1.603, "step": 143000 }, { "epoch": 1.9234388021183366, "grad_norm": 0.26708322763442993, "learning_rate": 0.004915304480195574, "loss": 2.4305, "step": 143100 }, { "epoch": 1.924782924272158, "grad_norm": 0.42214831709861755, "learning_rate": 0.004915167605335087, "loss": 2.4342, "step": 143200 }, { "epoch": 1.9261270464259792, "grad_norm": 1.0620460510253906, "learning_rate": 0.00491503062187617, "loss": 2.4318, "step": 143300 }, { "epoch": 1.9274711685798005, "grad_norm": 0.45594775676727295, "learning_rate": 0.004914893529824991, "loss": 2.4298, "step": 143400 }, { "epoch": 1.9288152907336218, "grad_norm": 0.8724369406700134, "learning_rate": 0.00491475632918773, "loss": 2.4303, "step": 143500 }, { "epoch": 1.930159412887443, "grad_norm": 1.169735074043274, "learning_rate": 0.004914619019970569, "loss": 2.4335, "step": 143600 }, { "epoch": 1.9315035350412646, "grad_norm": 0.28445443511009216, "learning_rate": 0.00491448160217969, "loss": 2.4301, "step": 143700 }, { "epoch": 1.9328476571950859, "grad_norm": 1.0240504741668701, "learning_rate": 0.004914344075821288, "loss": 2.4395, "step": 143800 }, { "epoch": 1.9341917793489072, "grad_norm": 0.2876899242401123, "learning_rate": 0.004914206440901561, "loss": 2.4273, "step": 143900 }, { "epoch": 1.9355359015027287, "grad_norm": 1.793506383895874, "learning_rate": 0.004914068697426708, "loss": 2.4316, "step": 144000 }, { "epoch": 1.9355359015027287, "eval_MaskedAccuracy": 0.4459802489403792, "eval_loss": 2.7441201210021973, "eval_runtime": 159.5024, "eval_samples_per_second": 397.963, "eval_steps_per_second": 1.555, "step": 144000 }, { "epoch": 1.93688002365655, "grad_norm": 0.4511992931365967, "learning_rate": 0.004913930845402937, "loss": 2.4312, "step": 144100 }, { "epoch": 1.9382241458103713, "grad_norm": 1.3304485082626343, "learning_rate": 0.004913792884836461, "loss": 2.4234, "step": 144200 }, { "epoch": 1.9395682679641926, "grad_norm": 0.7679903507232666, "learning_rate": 0.004913654815733498, "loss": 2.4288, "step": 144300 }, { "epoch": 1.9409123901180139, "grad_norm": 0.2566910982131958, "learning_rate": 0.004913516638100267, "loss": 2.4307, "step": 144400 }, { "epoch": 1.9422565122718352, "grad_norm": 0.3400803804397583, "learning_rate": 0.004913378351942991, "loss": 2.4325, "step": 144500 }, { "epoch": 1.9436006344256564, "grad_norm": 0.5881883502006531, "learning_rate": 0.0049132399572679045, "loss": 2.4283, "step": 144600 }, { "epoch": 1.944944756579478, "grad_norm": 2.5710055828094482, "learning_rate": 0.004913101454081242, "loss": 2.4259, "step": 144700 }, { "epoch": 1.9462888787332993, "grad_norm": 0.6169648766517639, "learning_rate": 0.0049129628423892424, "loss": 2.4406, "step": 144800 }, { "epoch": 1.9476330008871208, "grad_norm": 0.541467010974884, "learning_rate": 0.004912824122198156, "loss": 2.4308, "step": 144900 }, { "epoch": 1.948977123040942, "grad_norm": 1.3558932542800903, "learning_rate": 0.004912685293514229, "loss": 2.426, "step": 145000 }, { "epoch": 1.948977123040942, "eval_MaskedAccuracy": 0.4466767450773424, "eval_loss": 2.741316556930542, "eval_runtime": 153.255, "eval_samples_per_second": 414.186, "eval_steps_per_second": 1.618, "step": 145000 }, { "epoch": 1.9503212451947634, "grad_norm": 0.6421293616294861, "learning_rate": 0.00491254635634372, "loss": 2.4282, "step": 145100 }, { "epoch": 1.9516653673485846, "grad_norm": 0.5589990615844727, "learning_rate": 0.004912407310692889, "loss": 2.4237, "step": 145200 }, { "epoch": 1.953009489502406, "grad_norm": 0.45161235332489014, "learning_rate": 0.004912268156567995, "loss": 2.4336, "step": 145300 }, { "epoch": 1.9543536116562272, "grad_norm": 0.6716010570526123, "learning_rate": 0.004912128893975318, "loss": 2.4239, "step": 145400 }, { "epoch": 1.9556977338100485, "grad_norm": 1.577767014503479, "learning_rate": 0.004911989522921124, "loss": 2.4261, "step": 145500 }, { "epoch": 1.95704185596387, "grad_norm": 1.3624939918518066, "learning_rate": 0.004911850043411697, "loss": 2.4406, "step": 145600 }, { "epoch": 1.9583859781176913, "grad_norm": 0.23402433097362518, "learning_rate": 0.004911710455453321, "loss": 2.4291, "step": 145700 }, { "epoch": 1.9597301002715126, "grad_norm": 0.6562730669975281, "learning_rate": 0.004911570759052287, "loss": 2.4325, "step": 145800 }, { "epoch": 1.9610742224253341, "grad_norm": 0.2946057915687561, "learning_rate": 0.004911430954214888, "loss": 2.4268, "step": 145900 }, { "epoch": 1.9624183445791554, "grad_norm": 2.216837167739868, "learning_rate": 0.0049112910409474225, "loss": 2.4272, "step": 146000 }, { "epoch": 1.9624183445791554, "eval_MaskedAccuracy": 0.44681526480055744, "eval_loss": 2.739447593688965, "eval_runtime": 244.5176, "eval_samples_per_second": 259.597, "eval_steps_per_second": 1.014, "step": 146000 }, { "epoch": 1.9637624667329767, "grad_norm": 0.9857944250106812, "learning_rate": 0.004911151019256194, "loss": 2.4263, "step": 146100 }, { "epoch": 1.965106588886798, "grad_norm": 0.9716507792472839, "learning_rate": 0.004911010889147516, "loss": 2.4287, "step": 146200 }, { "epoch": 1.9664507110406193, "grad_norm": 0.6002909541130066, "learning_rate": 0.0049108706506277005, "loss": 2.4169, "step": 146300 }, { "epoch": 1.9677948331944406, "grad_norm": 0.33834025263786316, "learning_rate": 0.004910730303703066, "loss": 2.425, "step": 146400 }, { "epoch": 1.9691389553482619, "grad_norm": 0.3823884129524231, "learning_rate": 0.004910589848379935, "loss": 2.4274, "step": 146500 }, { "epoch": 1.9704830775020834, "grad_norm": 0.7107839584350586, "learning_rate": 0.004910449284664644, "loss": 2.4339, "step": 146600 }, { "epoch": 1.9718271996559047, "grad_norm": 1.5292255878448486, "learning_rate": 0.0049103086125635185, "loss": 2.4211, "step": 146700 }, { "epoch": 1.9731713218097262, "grad_norm": 0.4462113678455353, "learning_rate": 0.0049101678320829035, "loss": 2.4163, "step": 146800 }, { "epoch": 1.9745154439635475, "grad_norm": 0.4614112377166748, "learning_rate": 0.004910026943229135, "loss": 2.4249, "step": 146900 }, { "epoch": 1.9758595661173688, "grad_norm": 1.6407921314239502, "learning_rate": 0.004909885946008562, "loss": 2.4243, "step": 147000 }, { "epoch": 1.9758595661173688, "eval_MaskedAccuracy": 0.44691000163355754, "eval_loss": 2.740039348602295, "eval_runtime": 160.8268, "eval_samples_per_second": 394.686, "eval_steps_per_second": 1.542, "step": 147000 }, { "epoch": 1.97720368827119, "grad_norm": 0.3130664527416229, "learning_rate": 0.004909744840427542, "loss": 2.4207, "step": 147100 }, { "epoch": 1.9785478104250114, "grad_norm": 0.3282364010810852, "learning_rate": 0.004909603626492431, "loss": 2.421, "step": 147200 }, { "epoch": 1.9798919325788327, "grad_norm": 0.5056743621826172, "learning_rate": 0.004909462304209587, "loss": 2.4254, "step": 147300 }, { "epoch": 1.981236054732654, "grad_norm": 1.1724507808685303, "learning_rate": 0.0049093208735853855, "loss": 2.428, "step": 147400 }, { "epoch": 1.9825801768864755, "grad_norm": 0.2965724766254425, "learning_rate": 0.004909179334626195, "loss": 2.4272, "step": 147500 }, { "epoch": 1.9839242990402968, "grad_norm": 0.37335798144340515, "learning_rate": 0.004909037687338392, "loss": 2.4197, "step": 147600 }, { "epoch": 1.9852684211941183, "grad_norm": 0.5587185621261597, "learning_rate": 0.0049088959317283645, "loss": 2.4206, "step": 147700 }, { "epoch": 1.9866125433479396, "grad_norm": 0.5161391496658325, "learning_rate": 0.004908754067802488, "loss": 2.4308, "step": 147800 }, { "epoch": 1.9879566655017609, "grad_norm": 0.2679111361503601, "learning_rate": 0.004908612095567173, "loss": 2.4215, "step": 147900 }, { "epoch": 1.9893007876555822, "grad_norm": 0.6613872051239014, "learning_rate": 0.004908470015028803, "loss": 2.4228, "step": 148000 }, { "epoch": 1.9893007876555822, "eval_MaskedAccuracy": 0.4469897602234249, "eval_loss": 2.7378077507019043, "eval_runtime": 157.5436, "eval_samples_per_second": 402.911, "eval_steps_per_second": 1.574, "step": 148000 }, { "epoch": 1.9906449098094035, "grad_norm": 0.6632961630821228, "learning_rate": 0.0049083278261937814, "loss": 2.4307, "step": 148100 }, { "epoch": 1.9919890319632247, "grad_norm": 0.31573984026908875, "learning_rate": 0.0049081855290685145, "loss": 2.4226, "step": 148200 }, { "epoch": 1.993333154117046, "grad_norm": 2.4147422313690186, "learning_rate": 0.00490804312365942, "loss": 2.4199, "step": 148300 }, { "epoch": 1.9946772762708675, "grad_norm": 2.1030828952789307, "learning_rate": 0.004907900609972911, "loss": 2.4279, "step": 148400 }, { "epoch": 1.9960213984246888, "grad_norm": 0.395871639251709, "learning_rate": 0.004907757988015409, "loss": 2.425, "step": 148500 }, { "epoch": 1.9973655205785101, "grad_norm": 0.538977324962616, "learning_rate": 0.004907615257793332, "loss": 2.4217, "step": 148600 }, { "epoch": 1.9987096427323316, "grad_norm": 1.8294909000396729, "learning_rate": 0.004907472419313125, "loss": 2.4302, "step": 148700 }, { "epoch": 2.000053764886153, "grad_norm": 0.25931796431541443, "learning_rate": 0.004907329472581221, "loss": 2.4247, "step": 148800 }, { "epoch": 2.0013978870399742, "grad_norm": 0.34021487832069397, "learning_rate": 0.0049071864176040555, "loss": 2.4319, "step": 148900 }, { "epoch": 2.0027420091937955, "grad_norm": 0.7299911379814148, "learning_rate": 0.004907043254388077, "loss": 2.4311, "step": 149000 }, { "epoch": 2.0027420091937955, "eval_MaskedAccuracy": 0.4465608726612502, "eval_loss": 2.7390899658203125, "eval_runtime": 595.6963, "eval_samples_per_second": 106.558, "eval_steps_per_second": 0.416, "step": 149000 }, { "epoch": 2.004086131347617, "grad_norm": 0.7680445313453674, "learning_rate": 0.004906899982939738, "loss": 2.4315, "step": 149100 }, { "epoch": 2.005430253501438, "grad_norm": 0.603999137878418, "learning_rate": 0.004906756603265495, "loss": 2.4207, "step": 149200 }, { "epoch": 2.0067743756552594, "grad_norm": 0.995240330696106, "learning_rate": 0.004906613115371805, "loss": 2.4293, "step": 149300 }, { "epoch": 2.0081184978090807, "grad_norm": 0.858191967010498, "learning_rate": 0.0049064695192651345, "loss": 2.4275, "step": 149400 }, { "epoch": 2.0094626199629024, "grad_norm": 0.5625921487808228, "learning_rate": 0.004906325814951952, "loss": 2.4333, "step": 149500 }, { "epoch": 2.0108067421167237, "grad_norm": 0.3029104769229889, "learning_rate": 0.004906182002438735, "loss": 2.4288, "step": 149600 }, { "epoch": 2.012150864270545, "grad_norm": 0.6609681248664856, "learning_rate": 0.004906038081731963, "loss": 2.4128, "step": 149700 }, { "epoch": 2.0134949864243663, "grad_norm": 0.48055407404899597, "learning_rate": 0.004905894052838118, "loss": 2.4281, "step": 149800 }, { "epoch": 2.0148391085781876, "grad_norm": 0.2819332778453827, "learning_rate": 0.00490574991576369, "loss": 2.4247, "step": 149900 }, { "epoch": 2.016183230732009, "grad_norm": 0.32838475704193115, "learning_rate": 0.004905605670515174, "loss": 2.4259, "step": 150000 }, { "epoch": 2.016183230732009, "eval_MaskedAccuracy": 0.44722750040117315, "eval_loss": 2.738093376159668, "eval_runtime": 142.6909, "eval_samples_per_second": 444.85, "eval_steps_per_second": 1.738, "step": 150000 }, { "epoch": 2.01752735288583, "grad_norm": 0.35216397047042847, "learning_rate": 0.004905461317099078, "loss": 2.432, "step": 150100 }, { "epoch": 2.0188714750396515, "grad_norm": 0.7930958271026611, "learning_rate": 0.004905316855521893, "loss": 2.43, "step": 150200 }, { "epoch": 2.0202155971934728, "grad_norm": 0.6561233401298523, "learning_rate": 0.004905172285790133, "loss": 2.432, "step": 150300 }, { "epoch": 2.0215597193472945, "grad_norm": 0.8430966138839722, "learning_rate": 0.0049050276079103195, "loss": 2.4273, "step": 150400 }, { "epoch": 2.022903841501116, "grad_norm": 0.4468148946762085, "learning_rate": 0.004904882821888969, "loss": 2.4255, "step": 150500 }, { "epoch": 2.024247963654937, "grad_norm": 0.26562803983688354, "learning_rate": 0.004904737927732603, "loss": 2.428, "step": 150600 }, { "epoch": 2.0255920858087584, "grad_norm": 0.35955631732940674, "learning_rate": 0.004904592925447746, "loss": 2.4227, "step": 150700 }, { "epoch": 2.0269362079625797, "grad_norm": 1.039691686630249, "learning_rate": 0.0049044478150409385, "loss": 2.4229, "step": 150800 }, { "epoch": 2.028280330116401, "grad_norm": 2.361480474472046, "learning_rate": 0.004904302596518712, "loss": 2.4246, "step": 150900 }, { "epoch": 2.0296244522702223, "grad_norm": 0.4659264087677002, "learning_rate": 0.004904157269887615, "loss": 2.429, "step": 151000 }, { "epoch": 2.0296244522702223, "eval_MaskedAccuracy": 0.44815486516670616, "eval_loss": 2.7315616607666016, "eval_runtime": 142.8587, "eval_samples_per_second": 444.327, "eval_steps_per_second": 1.736, "step": 151000 }, { "epoch": 2.0309685744240435, "grad_norm": 0.3324325680732727, "learning_rate": 0.004904011835154192, "loss": 2.429, "step": 151100 }, { "epoch": 2.032312696577865, "grad_norm": 0.4291480481624603, "learning_rate": 0.0049038662923250025, "loss": 2.4253, "step": 151200 }, { "epoch": 2.033656818731686, "grad_norm": 0.5924742817878723, "learning_rate": 0.004903720641406596, "loss": 2.4181, "step": 151300 }, { "epoch": 2.035000940885508, "grad_norm": 1.0112535953521729, "learning_rate": 0.004903574882405539, "loss": 2.422, "step": 151400 }, { "epoch": 2.036345063039329, "grad_norm": 0.6629267334938049, "learning_rate": 0.004903429015328402, "loss": 2.4134, "step": 151500 }, { "epoch": 2.0376891851931505, "grad_norm": 0.5437670350074768, "learning_rate": 0.004903283040181754, "loss": 2.4211, "step": 151600 }, { "epoch": 2.0390333073469717, "grad_norm": 0.8838233947753906, "learning_rate": 0.004903136956972172, "loss": 2.4261, "step": 151700 }, { "epoch": 2.040377429500793, "grad_norm": 0.493783563375473, "learning_rate": 0.00490299076570624, "loss": 2.4228, "step": 151800 }, { "epoch": 2.0417215516546143, "grad_norm": 0.5075348019599915, "learning_rate": 0.004902844466390544, "loss": 2.4249, "step": 151900 }, { "epoch": 2.0430656738084356, "grad_norm": 0.31101787090301514, "learning_rate": 0.004902698059031681, "loss": 2.4234, "step": 152000 }, { "epoch": 2.0430656738084356, "eval_MaskedAccuracy": 0.4476862797973037, "eval_loss": 2.733727216720581, "eval_runtime": 142.8049, "eval_samples_per_second": 444.495, "eval_steps_per_second": 1.737, "step": 152000 }, { "epoch": 2.044409795962257, "grad_norm": 0.3274334967136383, "learning_rate": 0.004902551543636243, "loss": 2.4265, "step": 152100 }, { "epoch": 2.045753918116078, "grad_norm": 0.4391040503978729, "learning_rate": 0.004902404920210837, "loss": 2.4252, "step": 152200 }, { "epoch": 2.0470980402699, "grad_norm": 0.36864349246025085, "learning_rate": 0.004902258188762062, "loss": 2.4191, "step": 152300 }, { "epoch": 2.0484421624237212, "grad_norm": 0.32997363805770874, "learning_rate": 0.004902111349296531, "loss": 2.4301, "step": 152400 }, { "epoch": 2.0497862845775425, "grad_norm": 0.649544358253479, "learning_rate": 0.004901964401820856, "loss": 2.4262, "step": 152500 }, { "epoch": 2.051130406731364, "grad_norm": 0.5093933939933777, "learning_rate": 0.004901817346341668, "loss": 2.4184, "step": 152600 }, { "epoch": 2.052474528885185, "grad_norm": 0.5928727984428406, "learning_rate": 0.004901670182865587, "loss": 2.4273, "step": 152700 }, { "epoch": 2.0538186510390064, "grad_norm": 0.8361156582832336, "learning_rate": 0.004901522911399248, "loss": 2.4176, "step": 152800 }, { "epoch": 2.0551627731928277, "grad_norm": 1.1973950862884521, "learning_rate": 0.004901375531949283, "loss": 2.4253, "step": 152900 }, { "epoch": 2.056506895346649, "grad_norm": 0.7281692028045654, "learning_rate": 0.004901228044522336, "loss": 2.4213, "step": 153000 }, { "epoch": 2.056506895346649, "eval_MaskedAccuracy": 0.4476455974274515, "eval_loss": 2.7339048385620117, "eval_runtime": 146.6823, "eval_samples_per_second": 432.745, "eval_steps_per_second": 1.691, "step": 153000 }, { "epoch": 2.0578510175004703, "grad_norm": 0.908572793006897, "learning_rate": 0.004901080449125051, "loss": 2.4233, "step": 153100 }, { "epoch": 2.0591951396542916, "grad_norm": 0.3152801990509033, "learning_rate": 0.004900932745764079, "loss": 2.4217, "step": 153200 }, { "epoch": 2.0605392618081133, "grad_norm": 0.34237778186798096, "learning_rate": 0.004900784934446075, "loss": 2.4311, "step": 153300 }, { "epoch": 2.0618833839619346, "grad_norm": 0.9181241393089294, "learning_rate": 0.004900637015177697, "loss": 2.4211, "step": 153400 }, { "epoch": 2.063227506115756, "grad_norm": 1.0392390489578247, "learning_rate": 0.004900488987965611, "loss": 2.4163, "step": 153500 }, { "epoch": 2.064571628269577, "grad_norm": 1.2790428400039673, "learning_rate": 0.0049003408528164915, "loss": 2.4157, "step": 153600 }, { "epoch": 2.0659157504233985, "grad_norm": 0.2699611186981201, "learning_rate": 0.004900192609737011, "loss": 2.4267, "step": 153700 }, { "epoch": 2.0672598725772198, "grad_norm": 1.472973346710205, "learning_rate": 0.004900044258733849, "loss": 2.4306, "step": 153800 }, { "epoch": 2.068603994731041, "grad_norm": 0.27637550234794617, "learning_rate": 0.004899895799813685, "loss": 2.4221, "step": 153900 }, { "epoch": 2.0699481168848624, "grad_norm": 1.1613603830337524, "learning_rate": 0.004899747232983215, "loss": 2.4191, "step": 154000 }, { "epoch": 2.0699481168848624, "eval_MaskedAccuracy": 0.4478018154999935, "eval_loss": 2.7331624031066895, "eval_runtime": 142.1229, "eval_samples_per_second": 446.628, "eval_steps_per_second": 1.745, "step": 154000 }, { "epoch": 2.0712922390386836, "grad_norm": 1.339164137840271, "learning_rate": 0.004899598558249125, "loss": 2.4229, "step": 154100 }, { "epoch": 2.0726363611925054, "grad_norm": 0.558399498462677, "learning_rate": 0.004899449775618128, "loss": 2.4168, "step": 154200 }, { "epoch": 2.0739804833463267, "grad_norm": 0.6973606944084167, "learning_rate": 0.00489930088509692, "loss": 2.4206, "step": 154300 }, { "epoch": 2.075324605500148, "grad_norm": 0.7255196571350098, "learning_rate": 0.004899151886692205, "loss": 2.4187, "step": 154400 }, { "epoch": 2.0766687276539693, "grad_norm": 1.0322102308273315, "learning_rate": 0.004899002780410702, "loss": 2.4304, "step": 154500 }, { "epoch": 2.0780128498077906, "grad_norm": 0.8069867491722107, "learning_rate": 0.0048988535662591245, "loss": 2.4222, "step": 154600 }, { "epoch": 2.079356971961612, "grad_norm": 1.4786266088485718, "learning_rate": 0.004898704244244206, "loss": 2.4193, "step": 154700 }, { "epoch": 2.080701094115433, "grad_norm": 0.7500434517860413, "learning_rate": 0.004898554814372665, "loss": 2.4213, "step": 154800 }, { "epoch": 2.0820452162692544, "grad_norm": 1.6421656608581543, "learning_rate": 0.004898405276651237, "loss": 2.4169, "step": 154900 }, { "epoch": 2.0833893384230757, "grad_norm": 1.4270743131637573, "learning_rate": 0.0048982556310866576, "loss": 2.4242, "step": 155000 }, { "epoch": 2.0833893384230757, "eval_MaskedAccuracy": 0.44795841587608753, "eval_loss": 2.7314157485961914, "eval_runtime": 142.2388, "eval_samples_per_second": 446.264, "eval_steps_per_second": 1.744, "step": 155000 }, { "epoch": 2.0847334605768975, "grad_norm": 1.0751755237579346, "learning_rate": 0.004898105877685676, "loss": 2.4239, "step": 155100 }, { "epoch": 2.0860775827307187, "grad_norm": 0.43115946650505066, "learning_rate": 0.004897956016455043, "loss": 2.4202, "step": 155200 }, { "epoch": 2.08742170488454, "grad_norm": 0.6999830007553101, "learning_rate": 0.004897806047401499, "loss": 2.4164, "step": 155300 }, { "epoch": 2.0887658270383613, "grad_norm": 0.6027560830116272, "learning_rate": 0.004897655970531812, "loss": 2.4118, "step": 155400 }, { "epoch": 2.0901099491921826, "grad_norm": 2.5767462253570557, "learning_rate": 0.004897505785852734, "loss": 2.4254, "step": 155500 }, { "epoch": 2.091454071346004, "grad_norm": 0.6744506359100342, "learning_rate": 0.004897355493371032, "loss": 2.4192, "step": 155600 }, { "epoch": 2.092798193499825, "grad_norm": 0.6279339790344238, "learning_rate": 0.004897205093093492, "loss": 2.4212, "step": 155700 }, { "epoch": 2.0941423156536465, "grad_norm": 0.317688912153244, "learning_rate": 0.00489705458502688, "loss": 2.4179, "step": 155800 }, { "epoch": 2.095486437807468, "grad_norm": 0.5679840445518494, "learning_rate": 0.004896903969177977, "loss": 2.4157, "step": 155900 }, { "epoch": 2.096830559961289, "grad_norm": 0.6120138764381409, "learning_rate": 0.0048967532455535725, "loss": 2.4172, "step": 156000 }, { "epoch": 2.096830559961289, "eval_MaskedAccuracy": 0.44820394067035313, "eval_loss": 2.730539321899414, "eval_runtime": 143.8277, "eval_samples_per_second": 441.334, "eval_steps_per_second": 1.724, "step": 156000 }, { "epoch": 2.098174682115111, "grad_norm": 1.3471684455871582, "learning_rate": 0.004896602414160458, "loss": 2.412, "step": 156100 }, { "epoch": 2.099518804268932, "grad_norm": 2.606961250305176, "learning_rate": 0.0048964514750054365, "loss": 2.4199, "step": 156200 }, { "epoch": 2.1008629264227534, "grad_norm": 0.44379231333732605, "learning_rate": 0.004896300428095295, "loss": 2.4295, "step": 156300 }, { "epoch": 2.1022070485765747, "grad_norm": 0.4089781641960144, "learning_rate": 0.004896149273436849, "loss": 2.4195, "step": 156400 }, { "epoch": 2.103551170730396, "grad_norm": 0.39541006088256836, "learning_rate": 0.004895998011036907, "loss": 2.4226, "step": 156500 }, { "epoch": 2.1048952928842173, "grad_norm": 0.7441678047180176, "learning_rate": 0.004895846640902289, "loss": 2.4146, "step": 156600 }, { "epoch": 2.1062394150380386, "grad_norm": 0.2654988765716553, "learning_rate": 0.004895695163039808, "loss": 2.4115, "step": 156700 }, { "epoch": 2.10758353719186, "grad_norm": 0.23396815359592438, "learning_rate": 0.004895543577456294, "loss": 2.422, "step": 156800 }, { "epoch": 2.108927659345681, "grad_norm": 0.3832063674926758, "learning_rate": 0.004895391884158574, "loss": 2.4195, "step": 156900 }, { "epoch": 2.1102717814995025, "grad_norm": 0.6434290409088135, "learning_rate": 0.0048952400831534845, "loss": 2.4227, "step": 157000 }, { "epoch": 2.1102717814995025, "eval_MaskedAccuracy": 0.448189960789518, "eval_loss": 2.7293646335601807, "eval_runtime": 149.7286, "eval_samples_per_second": 423.94, "eval_steps_per_second": 1.656, "step": 157000 }, { "epoch": 2.111615903653324, "grad_norm": 0.5817483067512512, "learning_rate": 0.004895088174447862, "loss": 2.4267, "step": 157100 }, { "epoch": 2.1129600258071455, "grad_norm": 0.36821791529655457, "learning_rate": 0.004894936158048559, "loss": 2.4268, "step": 157200 }, { "epoch": 2.1143041479609668, "grad_norm": 0.5711908340454102, "learning_rate": 0.004894784033962416, "loss": 2.4233, "step": 157300 }, { "epoch": 2.115648270114788, "grad_norm": 0.4792524576187134, "learning_rate": 0.004894631802196298, "loss": 2.4199, "step": 157400 }, { "epoch": 2.1169923922686094, "grad_norm": 0.9362654089927673, "learning_rate": 0.0048944794627570525, "loss": 2.425, "step": 157500 }, { "epoch": 2.1183365144224306, "grad_norm": 0.8550450801849365, "learning_rate": 0.004894327015651554, "loss": 2.4171, "step": 157600 }, { "epoch": 2.119680636576252, "grad_norm": 0.2275136560201645, "learning_rate": 0.004894174460886669, "loss": 2.4112, "step": 157700 }, { "epoch": 2.1210247587300732, "grad_norm": 0.8724859356880188, "learning_rate": 0.004894021798469268, "loss": 2.4187, "step": 157800 }, { "epoch": 2.1223688808838945, "grad_norm": 0.3414950966835022, "learning_rate": 0.004893869028406233, "loss": 2.4113, "step": 157900 }, { "epoch": 2.1237130030377163, "grad_norm": 0.5261027216911316, "learning_rate": 0.004893716150704447, "loss": 2.4249, "step": 158000 }, { "epoch": 2.1237130030377163, "eval_MaskedAccuracy": 0.44844991045030136, "eval_loss": 2.7277510166168213, "eval_runtime": 152.1905, "eval_samples_per_second": 417.083, "eval_steps_per_second": 1.63, "step": 158000 }, { "epoch": 2.1250571251915376, "grad_norm": 0.24052013456821442, "learning_rate": 0.0048935631653708, "loss": 2.4195, "step": 158100 }, { "epoch": 2.126401247345359, "grad_norm": 0.9782668352127075, "learning_rate": 0.004893410072412176, "loss": 2.4313, "step": 158200 }, { "epoch": 2.12774536949918, "grad_norm": 1.6755679845809937, "learning_rate": 0.004893256871835483, "loss": 2.4201, "step": 158300 }, { "epoch": 2.1290894916530014, "grad_norm": 0.42762598395347595, "learning_rate": 0.00489310356364762, "loss": 2.4158, "step": 158400 }, { "epoch": 2.1304336138068227, "grad_norm": 0.2700982093811035, "learning_rate": 0.0048929501478555005, "loss": 2.4236, "step": 158500 }, { "epoch": 2.131777735960644, "grad_norm": 0.9225497245788574, "learning_rate": 0.004892796624466027, "loss": 2.4191, "step": 158600 }, { "epoch": 2.1331218581144653, "grad_norm": 0.9907844066619873, "learning_rate": 0.004892642993486124, "loss": 2.4111, "step": 158700 }, { "epoch": 2.1344659802682866, "grad_norm": 0.24535821378231049, "learning_rate": 0.004892489254922714, "loss": 2.4277, "step": 158800 }, { "epoch": 2.1358101024221083, "grad_norm": 1.3044700622558594, "learning_rate": 0.004892335408782724, "loss": 2.4191, "step": 158900 }, { "epoch": 2.1371542245759296, "grad_norm": 1.1276298761367798, "learning_rate": 0.0048921814550730814, "loss": 2.4142, "step": 159000 }, { "epoch": 2.1371542245759296, "eval_MaskedAccuracy": 0.44877584775466844, "eval_loss": 2.726320505142212, "eval_runtime": 144.1741, "eval_samples_per_second": 440.273, "eval_steps_per_second": 1.72, "step": 159000 }, { "epoch": 2.138498346729751, "grad_norm": 0.3968954086303711, "learning_rate": 0.004892027393800723, "loss": 2.4149, "step": 159100 }, { "epoch": 2.139842468883572, "grad_norm": 0.3040700852870941, "learning_rate": 0.004891873224972598, "loss": 2.4198, "step": 159200 }, { "epoch": 2.1411865910373935, "grad_norm": 0.24721510708332062, "learning_rate": 0.00489171894859565, "loss": 2.4209, "step": 159300 }, { "epoch": 2.142530713191215, "grad_norm": 0.3159930408000946, "learning_rate": 0.004891564564676828, "loss": 2.4214, "step": 159400 }, { "epoch": 2.143874835345036, "grad_norm": 0.31632721424102783, "learning_rate": 0.004891410073223092, "loss": 2.4242, "step": 159500 }, { "epoch": 2.1452189574988574, "grad_norm": 0.422014981508255, "learning_rate": 0.0048912554742414, "loss": 2.43, "step": 159600 }, { "epoch": 2.1465630796526787, "grad_norm": 0.2504478394985199, "learning_rate": 0.004891100767738721, "loss": 2.4201, "step": 159700 }, { "epoch": 2.1479072018065004, "grad_norm": 0.9411721229553223, "learning_rate": 0.0048909459537220265, "loss": 2.4305, "step": 159800 }, { "epoch": 2.1492513239603217, "grad_norm": 0.7317768931388855, "learning_rate": 0.004890791032198286, "loss": 2.4167, "step": 159900 }, { "epoch": 2.150595446114143, "grad_norm": 0.462750643491745, "learning_rate": 0.004890636003174482, "loss": 2.4022, "step": 160000 }, { "epoch": 2.150595446114143, "eval_MaskedAccuracy": 0.44730266116401834, "eval_loss": 2.733778953552246, "eval_runtime": 157.466, "eval_samples_per_second": 403.109, "eval_steps_per_second": 1.575, "step": 160000 }, { "epoch": 2.1519395682679643, "grad_norm": 1.2427386045455933, "learning_rate": 0.004890480866657605, "loss": 2.4187, "step": 160100 }, { "epoch": 2.1532836904217856, "grad_norm": 1.1014420986175537, "learning_rate": 0.004890325622654636, "loss": 2.4167, "step": 160200 }, { "epoch": 2.154627812575607, "grad_norm": 1.7939949035644531, "learning_rate": 0.004890170271172577, "loss": 2.413, "step": 160300 }, { "epoch": 2.155971934729428, "grad_norm": 0.28519657254219055, "learning_rate": 0.004890014812218431, "loss": 2.4248, "step": 160400 }, { "epoch": 2.1573160568832495, "grad_norm": 0.3315064609050751, "learning_rate": 0.0048898592457991994, "loss": 2.4238, "step": 160500 }, { "epoch": 2.1586601790370707, "grad_norm": 0.5888499617576599, "learning_rate": 0.004889703571921885, "loss": 2.4134, "step": 160600 }, { "epoch": 2.160004301190892, "grad_norm": 0.3281387686729431, "learning_rate": 0.004889547790593514, "loss": 2.4204, "step": 160700 }, { "epoch": 2.1613484233447133, "grad_norm": 0.3036564588546753, "learning_rate": 0.004889391901821095, "loss": 2.4143, "step": 160800 }, { "epoch": 2.162692545498535, "grad_norm": 0.5783784985542297, "learning_rate": 0.004889235905611661, "loss": 2.4287, "step": 160900 }, { "epoch": 2.1640366676523564, "grad_norm": 0.7875732183456421, "learning_rate": 0.004889079801972228, "loss": 2.4151, "step": 161000 }, { "epoch": 2.1640366676523564, "eval_MaskedAccuracy": 0.4483840000513704, "eval_loss": 2.7266931533813477, "eval_runtime": 157.0844, "eval_samples_per_second": 404.089, "eval_steps_per_second": 1.579, "step": 161000 }, { "epoch": 2.1653807898061777, "grad_norm": 1.2114744186401367, "learning_rate": 0.0048889235909098374, "loss": 2.4198, "step": 161100 }, { "epoch": 2.166724911959999, "grad_norm": 3.989194631576538, "learning_rate": 0.0048887672724315335, "loss": 2.4213, "step": 161200 }, { "epoch": 2.1680690341138202, "grad_norm": 1.195610761642456, "learning_rate": 0.0048886108465443535, "loss": 2.4187, "step": 161300 }, { "epoch": 2.1694131562676415, "grad_norm": 1.6807376146316528, "learning_rate": 0.004888454313255354, "loss": 2.4067, "step": 161400 }, { "epoch": 2.170757278421463, "grad_norm": 0.3829794228076935, "learning_rate": 0.0048882976725715815, "loss": 2.41, "step": 161500 }, { "epoch": 2.172101400575284, "grad_norm": 0.2655125558376312, "learning_rate": 0.00488814092450009, "loss": 2.416, "step": 161600 }, { "epoch": 2.1734455227291054, "grad_norm": 0.6671358942985535, "learning_rate": 0.004887984069047942, "loss": 2.4155, "step": 161700 }, { "epoch": 2.174789644882927, "grad_norm": 0.49830421805381775, "learning_rate": 0.004887827106222209, "loss": 2.4146, "step": 161800 }, { "epoch": 2.1761337670367484, "grad_norm": 0.43950071930885315, "learning_rate": 0.004887670036029963, "loss": 2.4123, "step": 161900 }, { "epoch": 2.1774778891905697, "grad_norm": 0.9079190492630005, "learning_rate": 0.004887512858478287, "loss": 2.4123, "step": 162000 }, { "epoch": 2.1774778891905697, "eval_MaskedAccuracy": 0.4486591905523086, "eval_loss": 2.7250816822052, "eval_runtime": 154.8776, "eval_samples_per_second": 409.846, "eval_steps_per_second": 1.601, "step": 162000 }, { "epoch": 2.178822011344391, "grad_norm": 1.3062362670898438, "learning_rate": 0.004887355573574258, "loss": 2.4201, "step": 162100 }, { "epoch": 2.1801661334982123, "grad_norm": 0.9617155194282532, "learning_rate": 0.004887198181324969, "loss": 2.4178, "step": 162200 }, { "epoch": 2.1815102556520336, "grad_norm": 0.23437173664569855, "learning_rate": 0.004887040681737504, "loss": 2.4276, "step": 162300 }, { "epoch": 2.182854377805855, "grad_norm": 0.7280550599098206, "learning_rate": 0.004886883074818961, "loss": 2.4122, "step": 162400 }, { "epoch": 2.184198499959676, "grad_norm": 0.26790088415145874, "learning_rate": 0.004886725360576443, "loss": 2.4176, "step": 162500 }, { "epoch": 2.1855426221134975, "grad_norm": 0.24585595726966858, "learning_rate": 0.004886567539017049, "loss": 2.4256, "step": 162600 }, { "epoch": 2.186886744267319, "grad_norm": 0.2857859134674072, "learning_rate": 0.0048864096101479005, "loss": 2.4147, "step": 162700 }, { "epoch": 2.1882308664211405, "grad_norm": 0.4526360332965851, "learning_rate": 0.00488625157397611, "loss": 2.4197, "step": 162800 }, { "epoch": 2.189574988574962, "grad_norm": 0.2927253246307373, "learning_rate": 0.004886093430508798, "loss": 2.4114, "step": 162900 }, { "epoch": 2.190919110728783, "grad_norm": 0.4610482156276703, "learning_rate": 0.004885935179753095, "loss": 2.4147, "step": 163000 }, { "epoch": 2.190919110728783, "eval_MaskedAccuracy": 0.44903692639620885, "eval_loss": 2.7242319583892822, "eval_runtime": 155.5888, "eval_samples_per_second": 407.973, "eval_steps_per_second": 1.594, "step": 163000 }, { "epoch": 2.1922632328826044, "grad_norm": 0.6389359831809998, "learning_rate": 0.004885776821716123, "loss": 2.4158, "step": 163100 }, { "epoch": 2.1936073550364257, "grad_norm": 1.6146184206008911, "learning_rate": 0.004885618356405023, "loss": 2.4194, "step": 163200 }, { "epoch": 2.194951477190247, "grad_norm": 1.686237096786499, "learning_rate": 0.004885459783826932, "loss": 2.4109, "step": 163300 }, { "epoch": 2.1962955993440683, "grad_norm": 1.0167760848999023, "learning_rate": 0.004885301103989002, "loss": 2.4146, "step": 163400 }, { "epoch": 2.1976397214978896, "grad_norm": 0.20749007165431976, "learning_rate": 0.004885142316898374, "loss": 2.4179, "step": 163500 }, { "epoch": 2.1989838436517113, "grad_norm": 0.3694406747817993, "learning_rate": 0.0048849834225622064, "loss": 2.411, "step": 163600 }, { "epoch": 2.2003279658055326, "grad_norm": 0.6222627758979797, "learning_rate": 0.004884824420987657, "loss": 2.4116, "step": 163700 }, { "epoch": 2.201672087959354, "grad_norm": 0.637772798538208, "learning_rate": 0.004884665312181901, "loss": 2.4217, "step": 163800 }, { "epoch": 2.203016210113175, "grad_norm": 1.5972890853881836, "learning_rate": 0.004884506096152096, "loss": 2.4098, "step": 163900 }, { "epoch": 2.2043603322669965, "grad_norm": 0.2235487997531891, "learning_rate": 0.004884346772905415, "loss": 2.4088, "step": 164000 }, { "epoch": 2.2043603322669965, "eval_MaskedAccuracy": 0.44855764145601995, "eval_loss": 2.725599527359009, "eval_runtime": 153.7593, "eval_samples_per_second": 412.827, "eval_steps_per_second": 1.613, "step": 164000 }, { "epoch": 2.2057044544208178, "grad_norm": 0.5953556895256042, "learning_rate": 0.004884187342449037, "loss": 2.4125, "step": 164100 }, { "epoch": 2.207048576574639, "grad_norm": 0.6413670778274536, "learning_rate": 0.004884027804790152, "loss": 2.4191, "step": 164200 }, { "epoch": 2.2083926987284603, "grad_norm": 0.5590651035308838, "learning_rate": 0.004883868159935944, "loss": 2.4153, "step": 164300 }, { "epoch": 2.2097368208822816, "grad_norm": 0.6301490664482117, "learning_rate": 0.0048837084078936135, "loss": 2.4125, "step": 164400 }, { "epoch": 2.211080943036103, "grad_norm": 1.1809839010238647, "learning_rate": 0.004883548548670345, "loss": 2.4097, "step": 164500 }, { "epoch": 2.2124250651899247, "grad_norm": 0.8796495199203491, "learning_rate": 0.004883388582273348, "loss": 2.4151, "step": 164600 }, { "epoch": 2.213769187343746, "grad_norm": 0.5694831609725952, "learning_rate": 0.004883228508709839, "loss": 2.4123, "step": 164700 }, { "epoch": 2.2151133094975672, "grad_norm": 1.4738579988479614, "learning_rate": 0.0048830683279870194, "loss": 2.3999, "step": 164800 }, { "epoch": 2.2164574316513885, "grad_norm": 0.6466926336288452, "learning_rate": 0.004882908040112116, "loss": 2.4129, "step": 164900 }, { "epoch": 2.21780155380521, "grad_norm": 0.2502327263355255, "learning_rate": 0.004882747645092345, "loss": 2.4139, "step": 165000 }, { "epoch": 2.21780155380521, "eval_MaskedAccuracy": 0.44872007353941323, "eval_loss": 2.723762273788452, "eval_runtime": 158.1038, "eval_samples_per_second": 401.483, "eval_steps_per_second": 1.569, "step": 165000 }, { "epoch": 2.219145675959031, "grad_norm": 0.5904813408851624, "learning_rate": 0.004882587142934935, "loss": 2.4146, "step": 165100 }, { "epoch": 2.2204897981128524, "grad_norm": 0.41806700825691223, "learning_rate": 0.004882426533647115, "loss": 2.4111, "step": 165200 }, { "epoch": 2.2218339202666737, "grad_norm": 0.30772215127944946, "learning_rate": 0.00488226581723612, "loss": 2.4097, "step": 165300 }, { "epoch": 2.223178042420495, "grad_norm": 0.6158583760261536, "learning_rate": 0.0048821049937092026, "loss": 2.4132, "step": 165400 }, { "epoch": 2.2245221645743163, "grad_norm": 0.5486952066421509, "learning_rate": 0.004881944063073602, "loss": 2.4082, "step": 165500 }, { "epoch": 2.225866286728138, "grad_norm": 0.44802579283714294, "learning_rate": 0.00488178302533657, "loss": 2.4052, "step": 165600 }, { "epoch": 2.2272104088819593, "grad_norm": 0.6078771948814392, "learning_rate": 0.004881621880505358, "loss": 2.4174, "step": 165700 }, { "epoch": 2.2285545310357806, "grad_norm": 1.0394670963287354, "learning_rate": 0.004881460628587237, "loss": 2.4053, "step": 165800 }, { "epoch": 2.229898653189602, "grad_norm": 0.7442373037338257, "learning_rate": 0.0048812992695894675, "loss": 2.4099, "step": 165900 }, { "epoch": 2.231242775343423, "grad_norm": 0.39984771609306335, "learning_rate": 0.004881137803519318, "loss": 2.4191, "step": 166000 }, { "epoch": 2.231242775343423, "eval_MaskedAccuracy": 0.44866405937837617, "eval_loss": 2.7227699756622314, "eval_runtime": 176.0029, "eval_samples_per_second": 360.653, "eval_steps_per_second": 1.409, "step": 166000 }, { "epoch": 2.2325868974972445, "grad_norm": 0.40001779794692993, "learning_rate": 0.004880976230384064, "loss": 2.4152, "step": 166100 }, { "epoch": 2.2339310196510658, "grad_norm": 0.8670006394386292, "learning_rate": 0.00488081455019099, "loss": 2.4158, "step": 166200 }, { "epoch": 2.235275141804887, "grad_norm": 0.5148455500602722, "learning_rate": 0.004880652762947384, "loss": 2.4162, "step": 166300 }, { "epoch": 2.2366192639587084, "grad_norm": 1.6046350002288818, "learning_rate": 0.004880490868660532, "loss": 2.4131, "step": 166400 }, { "epoch": 2.23796338611253, "grad_norm": 0.896712601184845, "learning_rate": 0.004880328867337728, "loss": 2.4037, "step": 166500 }, { "epoch": 2.2393075082663514, "grad_norm": 0.881832480430603, "learning_rate": 0.0048801667589862706, "loss": 2.4167, "step": 166600 }, { "epoch": 2.2406516304201727, "grad_norm": 3.5371510982513428, "learning_rate": 0.00488000454361346, "loss": 2.411, "step": 166700 }, { "epoch": 2.241995752573994, "grad_norm": 0.2387372851371765, "learning_rate": 0.004879842221226608, "loss": 2.4134, "step": 166800 }, { "epoch": 2.2433398747278153, "grad_norm": 0.2676979899406433, "learning_rate": 0.00487967979183303, "loss": 2.4213, "step": 166900 }, { "epoch": 2.2446839968816366, "grad_norm": 0.9509516954421997, "learning_rate": 0.0048795172554400475, "loss": 2.4127, "step": 167000 }, { "epoch": 2.2446839968816366, "eval_MaskedAccuracy": 0.44885926012655697, "eval_loss": 2.7260682582855225, "eval_runtime": 166.3086, "eval_samples_per_second": 381.676, "eval_steps_per_second": 1.491, "step": 167000 }, { "epoch": 2.246028119035458, "grad_norm": 0.3096467852592468, "learning_rate": 0.004879354612054984, "loss": 2.4122, "step": 167100 }, { "epoch": 2.247372241189279, "grad_norm": 0.6709736585617065, "learning_rate": 0.004879191861685164, "loss": 2.4082, "step": 167200 }, { "epoch": 2.2487163633431004, "grad_norm": 0.5945542454719543, "learning_rate": 0.004879029004337925, "loss": 2.4125, "step": 167300 }, { "epoch": 2.250060485496922, "grad_norm": 1.153786063194275, "learning_rate": 0.004878866040020597, "loss": 2.4236, "step": 167400 }, { "epoch": 2.2514046076507435, "grad_norm": 0.9520905613899231, "learning_rate": 0.004878702968740533, "loss": 2.4141, "step": 167500 }, { "epoch": 2.2527487298045648, "grad_norm": 0.8178815841674805, "learning_rate": 0.004878539790505079, "loss": 2.4127, "step": 167600 }, { "epoch": 2.254092851958386, "grad_norm": 0.29091590642929077, "learning_rate": 0.004878376505321578, "loss": 2.4077, "step": 167700 }, { "epoch": 2.2554369741122073, "grad_norm": 1.5120651721954346, "learning_rate": 0.004878213113197398, "loss": 2.4125, "step": 167800 }, { "epoch": 2.2567810962660286, "grad_norm": 1.6276648044586182, "learning_rate": 0.004878049614139895, "loss": 2.4122, "step": 167900 }, { "epoch": 2.25812521841985, "grad_norm": 0.786770761013031, "learning_rate": 0.004877886008156443, "loss": 2.405, "step": 168000 }, { "epoch": 2.25812521841985, "eval_MaskedAccuracy": 0.4498786265923044, "eval_loss": 2.719548225402832, "eval_runtime": 155.0214, "eval_samples_per_second": 409.466, "eval_steps_per_second": 1.6, "step": 168000 }, { "epoch": 2.259469340573671, "grad_norm": 0.46489110589027405, "learning_rate": 0.004877722295254405, "loss": 2.4138, "step": 168100 }, { "epoch": 2.2608134627274925, "grad_norm": 1.4252897500991821, "learning_rate": 0.004877558475441166, "loss": 2.411, "step": 168200 }, { "epoch": 2.2621575848813142, "grad_norm": 0.2540140450000763, "learning_rate": 0.004877394548724101, "loss": 2.4093, "step": 168300 }, { "epoch": 2.2635017070351355, "grad_norm": 0.3754124045372009, "learning_rate": 0.004877230515110599, "loss": 2.4146, "step": 168400 }, { "epoch": 2.264845829188957, "grad_norm": 0.6744637489318848, "learning_rate": 0.0048770663746080505, "loss": 2.4203, "step": 168500 }, { "epoch": 2.266189951342778, "grad_norm": 0.21679449081420898, "learning_rate": 0.004876902127223849, "loss": 2.4013, "step": 168600 }, { "epoch": 2.2675340734965994, "grad_norm": 0.3662445843219757, "learning_rate": 0.004876737772965405, "loss": 2.4108, "step": 168700 }, { "epoch": 2.2688781956504207, "grad_norm": 1.2381483316421509, "learning_rate": 0.004876573311840115, "loss": 2.4117, "step": 168800 }, { "epoch": 2.270222317804242, "grad_norm": 1.7564358711242676, "learning_rate": 0.0048764087438553875, "loss": 2.4082, "step": 168900 }, { "epoch": 2.2715664399580633, "grad_norm": 0.6631215810775757, "learning_rate": 0.004876244069018646, "loss": 2.4133, "step": 169000 }, { "epoch": 2.2715664399580633, "eval_MaskedAccuracy": 0.45023057912506814, "eval_loss": 2.716172695159912, "eval_runtime": 156.7093, "eval_samples_per_second": 405.056, "eval_steps_per_second": 1.583, "step": 169000 }, { "epoch": 2.2729105621118846, "grad_norm": 0.8016698360443115, "learning_rate": 0.004876079287337305, "loss": 2.4051, "step": 169100 }, { "epoch": 2.2742546842657063, "grad_norm": 0.519679844379425, "learning_rate": 0.0048759143988187925, "loss": 2.4115, "step": 169200 }, { "epoch": 2.275598806419527, "grad_norm": 0.7286385297775269, "learning_rate": 0.004875749403470536, "loss": 2.4202, "step": 169300 }, { "epoch": 2.276942928573349, "grad_norm": 0.24820998311042786, "learning_rate": 0.004875584301299965, "loss": 2.405, "step": 169400 }, { "epoch": 2.27828705072717, "grad_norm": 0.25017213821411133, "learning_rate": 0.004875419092314521, "loss": 2.4129, "step": 169500 }, { "epoch": 2.2796311728809915, "grad_norm": 0.8957297801971436, "learning_rate": 0.004875253776521664, "loss": 2.4071, "step": 169600 }, { "epoch": 2.2809752950348128, "grad_norm": 0.5003229975700378, "learning_rate": 0.004875088353928819, "loss": 2.4099, "step": 169700 }, { "epoch": 2.282319417188634, "grad_norm": 0.6370465159416199, "learning_rate": 0.004874922824543451, "loss": 2.4257, "step": 169800 }, { "epoch": 2.2836635393424554, "grad_norm": 0.30033034086227417, "learning_rate": 0.004874757188373019, "loss": 2.4151, "step": 169900 }, { "epoch": 2.2850076614962767, "grad_norm": 1.7922015190124512, "learning_rate": 0.004874591445424987, "loss": 2.4176, "step": 170000 }, { "epoch": 2.2850076614962767, "eval_MaskedAccuracy": 0.4488507191873566, "eval_loss": 2.722308874130249, "eval_runtime": 156.2994, "eval_samples_per_second": 406.118, "eval_steps_per_second": 1.587, "step": 170000 }, { "epoch": 2.286351783650098, "grad_norm": 1.080620288848877, "learning_rate": 0.004874425595706819, "loss": 2.4126, "step": 170100 }, { "epoch": 2.2876959058039192, "grad_norm": 0.4531891644001007, "learning_rate": 0.004874259639225995, "loss": 2.4058, "step": 170200 }, { "epoch": 2.289040027957741, "grad_norm": 0.2731966972351074, "learning_rate": 0.0048740935759899835, "loss": 2.4072, "step": 170300 }, { "epoch": 2.2903841501115623, "grad_norm": 0.5683355927467346, "learning_rate": 0.004873927406006273, "loss": 2.4048, "step": 170400 }, { "epoch": 2.2917282722653836, "grad_norm": 1.147813081741333, "learning_rate": 0.004873761129282351, "loss": 2.403, "step": 170500 }, { "epoch": 2.293072394419205, "grad_norm": 0.30079033970832825, "learning_rate": 0.0048735947458257095, "loss": 2.4076, "step": 170600 }, { "epoch": 2.294416516573026, "grad_norm": 0.29191407561302185, "learning_rate": 0.00487342825564384, "loss": 2.4128, "step": 170700 }, { "epoch": 2.2957606387268474, "grad_norm": 0.7784181833267212, "learning_rate": 0.004873261658744253, "loss": 2.409, "step": 170800 }, { "epoch": 2.2971047608806687, "grad_norm": 0.7186245322227478, "learning_rate": 0.004873094955134448, "loss": 2.4138, "step": 170900 }, { "epoch": 2.29844888303449, "grad_norm": 0.9298619031906128, "learning_rate": 0.004872928144821944, "loss": 2.413, "step": 171000 }, { "epoch": 2.29844888303449, "eval_MaskedAccuracy": 0.4487523079732683, "eval_loss": 2.7223293781280518, "eval_runtime": 154.8012, "eval_samples_per_second": 410.049, "eval_steps_per_second": 1.602, "step": 171000 }, { "epoch": 2.2997930051883113, "grad_norm": 0.9932366013526917, "learning_rate": 0.00487276122781425, "loss": 2.417, "step": 171100 }, { "epoch": 2.301137127342133, "grad_norm": 1.6287879943847656, "learning_rate": 0.0048725942041188885, "loss": 2.41, "step": 171200 }, { "epoch": 2.3024812494959543, "grad_norm": 0.38423383235931396, "learning_rate": 0.00487242707374338, "loss": 2.4128, "step": 171300 }, { "epoch": 2.3038253716497756, "grad_norm": 1.0105299949645996, "learning_rate": 0.0048722598366952715, "loss": 2.4199, "step": 171400 }, { "epoch": 2.305169493803597, "grad_norm": 1.1391443014144897, "learning_rate": 0.004872092492982082, "loss": 2.4101, "step": 171500 }, { "epoch": 2.306513615957418, "grad_norm": 0.2690454125404358, "learning_rate": 0.004871925042611355, "loss": 2.4109, "step": 171600 }, { "epoch": 2.3078577381112395, "grad_norm": 0.3632848262786865, "learning_rate": 0.004871757485590642, "loss": 2.4156, "step": 171700 }, { "epoch": 2.309201860265061, "grad_norm": 1.6692334413528442, "learning_rate": 0.004871589821927492, "loss": 2.415, "step": 171800 }, { "epoch": 2.310545982418882, "grad_norm": 0.8343886137008667, "learning_rate": 0.004871422051629453, "loss": 2.4143, "step": 171900 }, { "epoch": 2.3118901045727034, "grad_norm": 0.42459598183631897, "learning_rate": 0.004871254174704084, "loss": 2.4116, "step": 172000 }, { "epoch": 2.3118901045727034, "eval_MaskedAccuracy": 0.44924426931396766, "eval_loss": 2.720871925354004, "eval_runtime": 154.4054, "eval_samples_per_second": 411.099, "eval_steps_per_second": 1.606, "step": 172000 }, { "epoch": 2.313234226726525, "grad_norm": 0.23972781002521515, "learning_rate": 0.004871086191158964, "loss": 2.4127, "step": 172100 }, { "epoch": 2.3145783488803464, "grad_norm": 0.5230980515480042, "learning_rate": 0.004870918101001645, "loss": 2.4101, "step": 172200 }, { "epoch": 2.3159224710341677, "grad_norm": 1.2739980220794678, "learning_rate": 0.004870749904239708, "loss": 2.4125, "step": 172300 }, { "epoch": 2.317266593187989, "grad_norm": 0.2384161502122879, "learning_rate": 0.004870581600880735, "loss": 2.4088, "step": 172400 }, { "epoch": 2.3186107153418103, "grad_norm": 0.5129702091217041, "learning_rate": 0.004870413190932297, "loss": 2.4088, "step": 172500 }, { "epoch": 2.3199548374956316, "grad_norm": 0.2832634449005127, "learning_rate": 0.004870244674402003, "loss": 2.4016, "step": 172600 }, { "epoch": 2.321298959649453, "grad_norm": 0.287016361951828, "learning_rate": 0.004870076051297425, "loss": 2.4126, "step": 172700 }, { "epoch": 2.322643081803274, "grad_norm": 0.2521541118621826, "learning_rate": 0.0048699073216261725, "loss": 2.4122, "step": 172800 }, { "epoch": 2.3239872039570955, "grad_norm": 0.29343220591545105, "learning_rate": 0.004869738485395845, "loss": 2.3996, "step": 172900 }, { "epoch": 2.325331326110917, "grad_norm": 0.2879350185394287, "learning_rate": 0.004869569542614052, "loss": 2.4084, "step": 173000 }, { "epoch": 2.325331326110917, "eval_MaskedAccuracy": 0.4492615010440258, "eval_loss": 2.7211360931396484, "eval_runtime": 156.2801, "eval_samples_per_second": 406.168, "eval_steps_per_second": 1.587, "step": 173000 }, { "epoch": 2.3266754482647385, "grad_norm": 0.30386149883270264, "learning_rate": 0.004869400493288407, "loss": 2.4125, "step": 173100 }, { "epoch": 2.32801957041856, "grad_norm": 0.30869200825691223, "learning_rate": 0.004869231337426524, "loss": 2.413, "step": 173200 }, { "epoch": 2.329363692572381, "grad_norm": 0.6456079483032227, "learning_rate": 0.0048690620750360255, "loss": 2.4144, "step": 173300 }, { "epoch": 2.3307078147262024, "grad_norm": 0.6145516633987427, "learning_rate": 0.004868892706124542, "loss": 2.4062, "step": 173400 }, { "epoch": 2.3320519368800237, "grad_norm": 0.5485299229621887, "learning_rate": 0.004868723230699693, "loss": 2.401, "step": 173500 }, { "epoch": 2.333396059033845, "grad_norm": 1.082629680633545, "learning_rate": 0.004868553648769127, "loss": 2.413, "step": 173600 }, { "epoch": 2.3347401811876662, "grad_norm": 0.5388498306274414, "learning_rate": 0.004868383960340481, "loss": 2.4097, "step": 173700 }, { "epoch": 2.3360843033414875, "grad_norm": 0.23575937747955322, "learning_rate": 0.004868214165421399, "loss": 2.4054, "step": 173800 }, { "epoch": 2.337428425495309, "grad_norm": 0.7409723997116089, "learning_rate": 0.004868044264019538, "loss": 2.4085, "step": 173900 }, { "epoch": 2.33877254764913, "grad_norm": 0.2735578715801239, "learning_rate": 0.004867874256142546, "loss": 2.4156, "step": 174000 }, { "epoch": 2.33877254764913, "eval_MaskedAccuracy": 0.4495013367228055, "eval_loss": 2.7185449600219727, "eval_runtime": 154.8489, "eval_samples_per_second": 409.922, "eval_steps_per_second": 1.602, "step": 174000 }, { "epoch": 2.340116669802952, "grad_norm": 0.26877549290657043, "learning_rate": 0.004867704141798087, "loss": 2.4045, "step": 174100 }, { "epoch": 2.341460791956773, "grad_norm": 0.6106776595115662, "learning_rate": 0.004867533920993822, "loss": 2.4046, "step": 174200 }, { "epoch": 2.3428049141105944, "grad_norm": 0.2555699348449707, "learning_rate": 0.00486736359373743, "loss": 2.4151, "step": 174300 }, { "epoch": 2.3441490362644157, "grad_norm": 0.3675636649131775, "learning_rate": 0.004867193160036579, "loss": 2.4072, "step": 174400 }, { "epoch": 2.345493158418237, "grad_norm": 0.7142601609230042, "learning_rate": 0.004867022619898949, "loss": 2.4049, "step": 174500 }, { "epoch": 2.3468372805720583, "grad_norm": 1.72738778591156, "learning_rate": 0.004866851973332225, "loss": 2.412, "step": 174600 }, { "epoch": 2.3481814027258796, "grad_norm": 0.45770561695098877, "learning_rate": 0.004866681220344099, "loss": 2.416, "step": 174700 }, { "epoch": 2.349525524879701, "grad_norm": 0.788829505443573, "learning_rate": 0.004866510360942256, "loss": 2.4055, "step": 174800 }, { "epoch": 2.350869647033522, "grad_norm": 0.2863014042377472, "learning_rate": 0.004866339395134404, "loss": 2.4019, "step": 174900 }, { "epoch": 2.352213769187344, "grad_norm": 0.27914437651634216, "learning_rate": 0.004866168322928247, "loss": 2.4036, "step": 175000 }, { "epoch": 2.352213769187344, "eval_MaskedAccuracy": 0.4499790247285866, "eval_loss": 2.7163689136505127, "eval_runtime": 155.1875, "eval_samples_per_second": 409.028, "eval_steps_per_second": 1.598, "step": 175000 }, { "epoch": 2.353557891341165, "grad_norm": 0.49107837677001953, "learning_rate": 0.004865997144331489, "loss": 2.4051, "step": 175100 }, { "epoch": 2.3549020134949865, "grad_norm": 1.704633355140686, "learning_rate": 0.004865825859351838, "loss": 2.4091, "step": 175200 }, { "epoch": 2.356246135648808, "grad_norm": 0.2347813844680786, "learning_rate": 0.004865654467997019, "loss": 2.4098, "step": 175300 }, { "epoch": 2.357590257802629, "grad_norm": 0.9905913472175598, "learning_rate": 0.0048654829702747555, "loss": 2.4093, "step": 175400 }, { "epoch": 2.3589343799564504, "grad_norm": 0.3170483410358429, "learning_rate": 0.0048653113661927695, "loss": 2.4048, "step": 175500 }, { "epoch": 2.3602785021102717, "grad_norm": 0.8622209429740906, "learning_rate": 0.004865139655758797, "loss": 2.4073, "step": 175600 }, { "epoch": 2.361622624264093, "grad_norm": 0.3935220241546631, "learning_rate": 0.004864967838980571, "loss": 2.4155, "step": 175700 }, { "epoch": 2.3629667464179143, "grad_norm": 1.1118823289871216, "learning_rate": 0.004864795915865833, "loss": 2.4112, "step": 175800 }, { "epoch": 2.364310868571736, "grad_norm": 0.7513082027435303, "learning_rate": 0.00486462388642233, "loss": 2.4152, "step": 175900 }, { "epoch": 2.3656549907255573, "grad_norm": 1.073836326599121, "learning_rate": 0.0048644517506578255, "loss": 2.4023, "step": 176000 }, { "epoch": 2.3656549907255573, "eval_MaskedAccuracy": 0.44973931526656014, "eval_loss": 2.7163290977478027, "eval_runtime": 156.8073, "eval_samples_per_second": 404.803, "eval_steps_per_second": 1.582, "step": 176000 }, { "epoch": 2.3669991128793786, "grad_norm": 0.6424139142036438, "learning_rate": 0.004864279508580062, "loss": 2.3978, "step": 176100 }, { "epoch": 2.3683432350332, "grad_norm": 0.9152630567550659, "learning_rate": 0.004864107160196811, "loss": 2.405, "step": 176200 }, { "epoch": 2.369687357187021, "grad_norm": 0.44839566946029663, "learning_rate": 0.004863934705515825, "loss": 2.4032, "step": 176300 }, { "epoch": 2.3710314793408425, "grad_norm": 1.236681342124939, "learning_rate": 0.0048637621445448845, "loss": 2.4041, "step": 176400 }, { "epoch": 2.3723756014946638, "grad_norm": 0.62148517370224, "learning_rate": 0.004863589477291759, "loss": 2.4113, "step": 176500 }, { "epoch": 2.373719723648485, "grad_norm": 0.2154432088136673, "learning_rate": 0.004863416703764239, "loss": 2.4113, "step": 176600 }, { "epoch": 2.3750638458023063, "grad_norm": 0.5083281993865967, "learning_rate": 0.004863243823970095, "loss": 2.4058, "step": 176700 }, { "epoch": 2.376407967956128, "grad_norm": 0.33949902653694153, "learning_rate": 0.004863070837917129, "loss": 2.4015, "step": 176800 }, { "epoch": 2.3777520901099494, "grad_norm": 1.1010327339172363, "learning_rate": 0.004862897745613131, "loss": 2.4008, "step": 176900 }, { "epoch": 2.3790962122637707, "grad_norm": 1.2555139064788818, "learning_rate": 0.004862724547065901, "loss": 2.4059, "step": 177000 }, { "epoch": 2.3790962122637707, "eval_MaskedAccuracy": 0.4503048499742003, "eval_loss": 2.716306447982788, "eval_runtime": 155.5308, "eval_samples_per_second": 408.125, "eval_steps_per_second": 1.595, "step": 177000 }, { "epoch": 2.380440334417592, "grad_norm": 0.5980218648910522, "learning_rate": 0.0048625512422832366, "loss": 2.4036, "step": 177100 }, { "epoch": 2.3817844565714132, "grad_norm": 0.252000629901886, "learning_rate": 0.004862377831272951, "loss": 2.4125, "step": 177200 }, { "epoch": 2.3831285787252345, "grad_norm": 0.6052896976470947, "learning_rate": 0.004862204314042862, "loss": 2.415, "step": 177300 }, { "epoch": 2.384472700879056, "grad_norm": 0.37761661410331726, "learning_rate": 0.004862030690600784, "loss": 2.4009, "step": 177400 }, { "epoch": 2.385816823032877, "grad_norm": 0.3634903132915497, "learning_rate": 0.004861856960954538, "loss": 2.4027, "step": 177500 }, { "epoch": 2.3871609451866984, "grad_norm": 0.24861650168895721, "learning_rate": 0.004861683125111956, "loss": 2.4014, "step": 177600 }, { "epoch": 2.38850506734052, "grad_norm": 0.893480122089386, "learning_rate": 0.004861509183080868, "loss": 2.4062, "step": 177700 }, { "epoch": 2.389849189494341, "grad_norm": 0.22390282154083252, "learning_rate": 0.004861335134869118, "loss": 2.3922, "step": 177800 }, { "epoch": 2.3911933116481627, "grad_norm": 0.6363741159439087, "learning_rate": 0.004861160980484538, "loss": 2.402, "step": 177900 }, { "epoch": 2.392537433801984, "grad_norm": 0.918346107006073, "learning_rate": 0.004860986719934979, "loss": 2.3986, "step": 178000 }, { "epoch": 2.392537433801984, "eval_MaskedAccuracy": 0.44992926294273605, "eval_loss": 2.714897394180298, "eval_runtime": 156.6614, "eval_samples_per_second": 405.18, "eval_steps_per_second": 1.583, "step": 178000 }, { "epoch": 2.3938815559558053, "grad_norm": 0.3443443477153778, "learning_rate": 0.0048608123532282955, "loss": 2.4041, "step": 178100 }, { "epoch": 2.3952256781096266, "grad_norm": 1.1353808641433716, "learning_rate": 0.004860637880372347, "loss": 2.4011, "step": 178200 }, { "epoch": 2.396569800263448, "grad_norm": 0.813539445400238, "learning_rate": 0.004860463301374987, "loss": 2.4029, "step": 178300 }, { "epoch": 2.397913922417269, "grad_norm": 1.0734734535217285, "learning_rate": 0.004860288616244085, "loss": 2.4092, "step": 178400 }, { "epoch": 2.3992580445710905, "grad_norm": 0.3121657073497772, "learning_rate": 0.004860113824987515, "loss": 2.3985, "step": 178500 }, { "epoch": 2.400602166724912, "grad_norm": 0.5276472568511963, "learning_rate": 0.004859938927613148, "loss": 2.4032, "step": 178600 }, { "epoch": 2.401946288878733, "grad_norm": 0.21578922867774963, "learning_rate": 0.004859763924128867, "loss": 2.4002, "step": 178700 }, { "epoch": 2.403290411032555, "grad_norm": 0.36338990926742554, "learning_rate": 0.004859588814542557, "loss": 2.4103, "step": 178800 }, { "epoch": 2.404634533186376, "grad_norm": 0.7060322165489197, "learning_rate": 0.004859413598862113, "loss": 2.4039, "step": 178900 }, { "epoch": 2.4059786553401974, "grad_norm": 0.3517087399959564, "learning_rate": 0.004859238277095424, "loss": 2.4107, "step": 179000 }, { "epoch": 2.4059786553401974, "eval_MaskedAccuracy": 0.4501261935846993, "eval_loss": 2.7151551246643066, "eval_runtime": 158.7753, "eval_samples_per_second": 399.785, "eval_steps_per_second": 1.562, "step": 179000 }, { "epoch": 2.4073227774940187, "grad_norm": 0.850573718547821, "learning_rate": 0.004859062849250387, "loss": 2.4033, "step": 179100 }, { "epoch": 2.40866689964784, "grad_norm": 0.25686702132225037, "learning_rate": 0.004858887315334917, "loss": 2.4033, "step": 179200 }, { "epoch": 2.4100110218016613, "grad_norm": 0.580507755279541, "learning_rate": 0.0048587116753569136, "loss": 2.4075, "step": 179300 }, { "epoch": 2.4113551439554826, "grad_norm": 0.21325333416461945, "learning_rate": 0.004858535929324301, "loss": 2.4066, "step": 179400 }, { "epoch": 2.412699266109304, "grad_norm": 1.397865891456604, "learning_rate": 0.004858360077244987, "loss": 2.4069, "step": 179500 }, { "epoch": 2.414043388263125, "grad_norm": 0.7146439552307129, "learning_rate": 0.004858184119126909, "loss": 2.4055, "step": 179600 }, { "epoch": 2.415387510416947, "grad_norm": 0.6387777328491211, "learning_rate": 0.004858008054977973, "loss": 2.3984, "step": 179700 }, { "epoch": 2.416731632570768, "grad_norm": 0.6029757261276245, "learning_rate": 0.004857831884806135, "loss": 2.4017, "step": 179800 }, { "epoch": 2.4180757547245895, "grad_norm": 0.3883086144924164, "learning_rate": 0.004857655608619315, "loss": 2.4009, "step": 179900 }, { "epoch": 2.4194198768784108, "grad_norm": 0.24268724024295807, "learning_rate": 0.004857479226425462, "loss": 2.4127, "step": 180000 }, { "epoch": 2.4194198768784108, "eval_MaskedAccuracy": 0.4500598610681297, "eval_loss": 2.7149128913879395, "eval_runtime": 160.6745, "eval_samples_per_second": 395.06, "eval_steps_per_second": 1.543, "step": 180000 }, { "epoch": 2.420763999032232, "grad_norm": 0.2926810085773468, "learning_rate": 0.004857302738232527, "loss": 2.4048, "step": 180100 }, { "epoch": 2.4221081211860533, "grad_norm": 0.5268754363059998, "learning_rate": 0.004857126144048464, "loss": 2.3981, "step": 180200 }, { "epoch": 2.4234522433398746, "grad_norm": 0.27951580286026, "learning_rate": 0.004856949443881227, "loss": 2.3983, "step": 180300 }, { "epoch": 2.424796365493696, "grad_norm": 0.5589247345924377, "learning_rate": 0.004856772637738778, "loss": 2.405, "step": 180400 }, { "epoch": 2.426140487647517, "grad_norm": 0.2754684090614319, "learning_rate": 0.004856595725629084, "loss": 2.4081, "step": 180500 }, { "epoch": 2.427484609801339, "grad_norm": 1.014289379119873, "learning_rate": 0.004856418707560116, "loss": 2.4028, "step": 180600 }, { "epoch": 2.4288287319551602, "grad_norm": 1.224565029144287, "learning_rate": 0.004856241583539849, "loss": 2.4041, "step": 180700 }, { "epoch": 2.4301728541089815, "grad_norm": 0.2553938329219818, "learning_rate": 0.004856064353576271, "loss": 2.4039, "step": 180800 }, { "epoch": 2.431516976262803, "grad_norm": 0.36857032775878906, "learning_rate": 0.004855887017677354, "loss": 2.4021, "step": 180900 }, { "epoch": 2.432861098416624, "grad_norm": 0.3493588864803314, "learning_rate": 0.004855709575851092, "loss": 2.4009, "step": 181000 }, { "epoch": 2.432861098416624, "eval_MaskedAccuracy": 0.4509061473369637, "eval_loss": 2.7095935344696045, "eval_runtime": 159.2973, "eval_samples_per_second": 398.475, "eval_steps_per_second": 1.557, "step": 181000 }, { "epoch": 2.4342052205704454, "grad_norm": 0.6775467395782471, "learning_rate": 0.004855532028105485, "loss": 2.393, "step": 181100 }, { "epoch": 2.4355493427242667, "grad_norm": 1.7895002365112305, "learning_rate": 0.004855354374448533, "loss": 2.3995, "step": 181200 }, { "epoch": 2.436893464878088, "grad_norm": 0.2109525501728058, "learning_rate": 0.004855176614888239, "loss": 2.4031, "step": 181300 }, { "epoch": 2.4382375870319093, "grad_norm": 0.24894700944423676, "learning_rate": 0.004854998749432621, "loss": 2.4049, "step": 181400 }, { "epoch": 2.439581709185731, "grad_norm": 0.3863070011138916, "learning_rate": 0.004854820778089685, "loss": 2.3983, "step": 181500 }, { "epoch": 2.4409258313395523, "grad_norm": 0.4976949095726013, "learning_rate": 0.0048546427008674484, "loss": 2.4076, "step": 181600 }, { "epoch": 2.4422699534933736, "grad_norm": 1.53170907497406, "learning_rate": 0.00485446451777394, "loss": 2.4009, "step": 181700 }, { "epoch": 2.443614075647195, "grad_norm": 0.8086992502212524, "learning_rate": 0.0048542862288171885, "loss": 2.4014, "step": 181800 }, { "epoch": 2.444958197801016, "grad_norm": 0.8978860378265381, "learning_rate": 0.00485410783400522, "loss": 2.4002, "step": 181900 }, { "epoch": 2.4463023199548375, "grad_norm": 0.40652504563331604, "learning_rate": 0.004853929333346081, "loss": 2.3978, "step": 182000 }, { "epoch": 2.4463023199548375, "eval_MaskedAccuracy": 0.4505315380239766, "eval_loss": 2.7120492458343506, "eval_runtime": 158.1071, "eval_samples_per_second": 401.475, "eval_steps_per_second": 1.569, "step": 182000 }, { "epoch": 2.447646442108659, "grad_norm": 0.544252336025238, "learning_rate": 0.004853750726847811, "loss": 2.3905, "step": 182100 }, { "epoch": 2.44899056426248, "grad_norm": 1.0650444030761719, "learning_rate": 0.0048535720145184615, "loss": 2.4042, "step": 182200 }, { "epoch": 2.4503346864163014, "grad_norm": 0.23677361011505127, "learning_rate": 0.004853393196366076, "loss": 2.3987, "step": 182300 }, { "epoch": 2.4516788085701227, "grad_norm": 0.5979803204536438, "learning_rate": 0.004853214272398713, "loss": 2.4045, "step": 182400 }, { "epoch": 2.453022930723944, "grad_norm": 0.37009915709495544, "learning_rate": 0.0048530352426244434, "loss": 2.4025, "step": 182500 }, { "epoch": 2.4543670528777657, "grad_norm": 0.9107745289802551, "learning_rate": 0.00485285610705133, "loss": 2.4047, "step": 182600 }, { "epoch": 2.455711175031587, "grad_norm": 0.6059080958366394, "learning_rate": 0.004852676865687442, "loss": 2.4036, "step": 182700 }, { "epoch": 2.4570552971854083, "grad_norm": 1.2172014713287354, "learning_rate": 0.004852497518540862, "loss": 2.4039, "step": 182800 }, { "epoch": 2.4583994193392296, "grad_norm": 0.27083879709243774, "learning_rate": 0.004852318065619665, "loss": 2.4028, "step": 182900 }, { "epoch": 2.459743541493051, "grad_norm": 0.2236872911453247, "learning_rate": 0.0048521385069319346, "loss": 2.3937, "step": 183000 }, { "epoch": 2.459743541493051, "eval_MaskedAccuracy": 0.4513215347110807, "eval_loss": 2.7078845500946045, "eval_runtime": 156.7319, "eval_samples_per_second": 404.997, "eval_steps_per_second": 1.582, "step": 183000 }, { "epoch": 2.461087663646872, "grad_norm": 0.7614181637763977, "learning_rate": 0.004851958842485767, "loss": 2.3939, "step": 183100 }, { "epoch": 2.4624317858006934, "grad_norm": 0.20438380539417267, "learning_rate": 0.004851779072289257, "loss": 2.4072, "step": 183200 }, { "epoch": 2.4637759079545147, "grad_norm": 0.995496392250061, "learning_rate": 0.004851599196350496, "loss": 2.3985, "step": 183300 }, { "epoch": 2.465120030108336, "grad_norm": 0.5554972290992737, "learning_rate": 0.004851419214677599, "loss": 2.3987, "step": 183400 }, { "epoch": 2.4664641522621578, "grad_norm": 0.7563191056251526, "learning_rate": 0.004851239127278679, "loss": 2.4124, "step": 183500 }, { "epoch": 2.467808274415979, "grad_norm": 0.9702069163322449, "learning_rate": 0.004851058934161847, "loss": 2.4062, "step": 183600 }, { "epoch": 2.4691523965698003, "grad_norm": 0.3375505208969116, "learning_rate": 0.004850878635335215, "loss": 2.3993, "step": 183700 }, { "epoch": 2.4704965187236216, "grad_norm": 0.3639039993286133, "learning_rate": 0.004850698230806917, "loss": 2.3888, "step": 183800 }, { "epoch": 2.471840640877443, "grad_norm": 1.2747180461883545, "learning_rate": 0.004850517720585071, "loss": 2.3952, "step": 183900 }, { "epoch": 2.473184763031264, "grad_norm": 0.2078840732574463, "learning_rate": 0.004850337104677819, "loss": 2.3996, "step": 184000 }, { "epoch": 2.473184763031264, "eval_MaskedAccuracy": 0.4516848487899179, "eval_loss": 2.7041962146759033, "eval_runtime": 156.0607, "eval_samples_per_second": 406.739, "eval_steps_per_second": 1.589, "step": 184000 }, { "epoch": 2.4745288851850855, "grad_norm": 0.7602828145027161, "learning_rate": 0.004850156383093299, "loss": 2.3861, "step": 184100 }, { "epoch": 2.475873007338907, "grad_norm": 0.3689172863960266, "learning_rate": 0.004849975555839646, "loss": 2.3889, "step": 184200 }, { "epoch": 2.477217129492728, "grad_norm": 0.26970091462135315, "learning_rate": 0.004849794622925014, "loss": 2.4053, "step": 184300 }, { "epoch": 2.47856125164655, "grad_norm": 0.237064927816391, "learning_rate": 0.004849613584357556, "loss": 2.4008, "step": 184400 }, { "epoch": 2.479905373800371, "grad_norm": 1.7367196083068848, "learning_rate": 0.004849432440145427, "loss": 2.4008, "step": 184500 }, { "epoch": 2.4812494959541924, "grad_norm": 0.3666498064994812, "learning_rate": 0.004849251190296789, "loss": 2.4011, "step": 184600 }, { "epoch": 2.4825936181080137, "grad_norm": 0.4193936288356781, "learning_rate": 0.004849069834819819, "loss": 2.4085, "step": 184700 }, { "epoch": 2.483937740261835, "grad_norm": 0.3407999277114868, "learning_rate": 0.004848888373722671, "loss": 2.4006, "step": 184800 }, { "epoch": 2.4852818624156563, "grad_norm": 0.3108844459056854, "learning_rate": 0.0048487068070135345, "loss": 2.4081, "step": 184900 }, { "epoch": 2.4866259845694776, "grad_norm": 0.3092864453792572, "learning_rate": 0.004848525134700581, "loss": 2.4082, "step": 185000 }, { "epoch": 2.4866259845694776, "eval_MaskedAccuracy": 0.4504291166073478, "eval_loss": 2.712066411972046, "eval_runtime": 156.1824, "eval_samples_per_second": 406.422, "eval_steps_per_second": 1.588, "step": 185000 }, { "epoch": 2.487970106723299, "grad_norm": 0.25049683451652527, "learning_rate": 0.004848343356792004, "loss": 2.4032, "step": 185100 }, { "epoch": 2.48931422887712, "grad_norm": 0.9619961977005005, "learning_rate": 0.004848161473295994, "loss": 2.4099, "step": 185200 }, { "epoch": 2.490658351030942, "grad_norm": 1.3118278980255127, "learning_rate": 0.004847979484220742, "loss": 2.3951, "step": 185300 }, { "epoch": 2.492002473184763, "grad_norm": 0.890388011932373, "learning_rate": 0.004847797389574453, "loss": 2.404, "step": 185400 }, { "epoch": 2.4933465953385845, "grad_norm": 0.9836694002151489, "learning_rate": 0.004847615189365324, "loss": 2.4056, "step": 185500 }, { "epoch": 2.494690717492406, "grad_norm": 1.4075289964675903, "learning_rate": 0.004847432883601571, "loss": 2.3974, "step": 185600 }, { "epoch": 2.496034839646227, "grad_norm": 1.5059231519699097, "learning_rate": 0.0048472504722914134, "loss": 2.399, "step": 185700 }, { "epoch": 2.4973789618000484, "grad_norm": 0.42734503746032715, "learning_rate": 0.004847067955443057, "loss": 2.4063, "step": 185800 }, { "epoch": 2.4987230839538697, "grad_norm": 1.4821892976760864, "learning_rate": 0.004846885333064743, "loss": 2.3914, "step": 185900 }, { "epoch": 2.500067206107691, "grad_norm": 1.791476845741272, "learning_rate": 0.004846702605164673, "loss": 2.3974, "step": 186000 }, { "epoch": 2.500067206107691, "eval_MaskedAccuracy": 0.4501642596224747, "eval_loss": 2.7116963863372803, "eval_runtime": 159.5858, "eval_samples_per_second": 397.755, "eval_steps_per_second": 1.554, "step": 186000 }, { "epoch": 2.5014113282615122, "grad_norm": 0.5543153285980225, "learning_rate": 0.004846519771751113, "loss": 2.4015, "step": 186100 }, { "epoch": 2.502755450415334, "grad_norm": 1.6506179571151733, "learning_rate": 0.004846336832832279, "loss": 2.3985, "step": 186200 }, { "epoch": 2.504099572569155, "grad_norm": 0.9173561334609985, "learning_rate": 0.004846153788416425, "loss": 2.3929, "step": 186300 }, { "epoch": 2.5054436947229766, "grad_norm": 0.9084792733192444, "learning_rate": 0.004845970638511788, "loss": 2.3981, "step": 186400 }, { "epoch": 2.506787816876798, "grad_norm": 0.4403022825717926, "learning_rate": 0.004845787383126632, "loss": 2.3893, "step": 186500 }, { "epoch": 2.508131939030619, "grad_norm": 0.3516016900539398, "learning_rate": 0.0048456040222692055, "loss": 2.3998, "step": 186600 }, { "epoch": 2.5094760611844404, "grad_norm": 0.5586411952972412, "learning_rate": 0.0048454205559477735, "loss": 2.4068, "step": 186700 }, { "epoch": 2.5108201833382617, "grad_norm": 0.2955363094806671, "learning_rate": 0.004845236984170605, "loss": 2.4005, "step": 186800 }, { "epoch": 2.512164305492083, "grad_norm": 0.22038711607456207, "learning_rate": 0.004845053306945971, "loss": 2.4009, "step": 186900 }, { "epoch": 2.5135084276459043, "grad_norm": 0.7231718301773071, "learning_rate": 0.004844869524282139, "loss": 2.3994, "step": 187000 }, { "epoch": 2.5135084276459043, "eval_MaskedAccuracy": 0.45075855288065936, "eval_loss": 2.709667682647705, "eval_runtime": 159.519, "eval_samples_per_second": 397.921, "eval_steps_per_second": 1.555, "step": 187000 }, { "epoch": 2.514852549799726, "grad_norm": 1.2093353271484375, "learning_rate": 0.004844685636187404, "loss": 2.4022, "step": 187100 }, { "epoch": 2.516196671953547, "grad_norm": 0.46012699604034424, "learning_rate": 0.004844501642670045, "loss": 2.4025, "step": 187200 }, { "epoch": 2.5175407941073686, "grad_norm": 0.5770087838172913, "learning_rate": 0.00484431754373835, "loss": 2.3926, "step": 187300 }, { "epoch": 2.51888491626119, "grad_norm": 0.8018302321434021, "learning_rate": 0.004844133339400616, "loss": 2.403, "step": 187400 }, { "epoch": 2.5202290384150112, "grad_norm": 1.2442452907562256, "learning_rate": 0.004843949029665142, "loss": 2.402, "step": 187500 }, { "epoch": 2.5215731605688325, "grad_norm": 0.22802333533763885, "learning_rate": 0.0048437646145402385, "loss": 2.4039, "step": 187600 }, { "epoch": 2.522917282722654, "grad_norm": 0.5682822465896606, "learning_rate": 0.004843580094034207, "loss": 2.3969, "step": 187700 }, { "epoch": 2.524261404876475, "grad_norm": 0.9464827179908752, "learning_rate": 0.004843395468155368, "loss": 2.3924, "step": 187800 }, { "epoch": 2.5256055270302964, "grad_norm": 0.5253119468688965, "learning_rate": 0.00484321073691204, "loss": 2.3929, "step": 187900 }, { "epoch": 2.526949649184118, "grad_norm": 0.6649153828620911, "learning_rate": 0.004843025900312542, "loss": 2.3994, "step": 188000 }, { "epoch": 2.526949649184118, "eval_MaskedAccuracy": 0.45129980447121243, "eval_loss": 2.707331657409668, "eval_runtime": 162.5712, "eval_samples_per_second": 390.45, "eval_steps_per_second": 1.525, "step": 188000 }, { "epoch": 2.528293771337939, "grad_norm": 0.21869996190071106, "learning_rate": 0.004842840958365209, "loss": 2.3981, "step": 188100 }, { "epoch": 2.5296378934917607, "grad_norm": 0.394054651260376, "learning_rate": 0.004842655911078364, "loss": 2.3985, "step": 188200 }, { "epoch": 2.530982015645582, "grad_norm": 0.5307244658470154, "learning_rate": 0.004842470758460362, "loss": 2.3977, "step": 188300 }, { "epoch": 2.5323261377994033, "grad_norm": 0.19963762164115906, "learning_rate": 0.004842285500519532, "loss": 2.4024, "step": 188400 }, { "epoch": 2.5336702599532246, "grad_norm": 0.7292469143867493, "learning_rate": 0.004842100137264226, "loss": 2.3958, "step": 188500 }, { "epoch": 2.535014382107046, "grad_norm": 2.722954034805298, "learning_rate": 0.004841914668702794, "loss": 2.3943, "step": 188600 }, { "epoch": 2.536358504260867, "grad_norm": 0.30424684286117554, "learning_rate": 0.004841729094843593, "loss": 2.3951, "step": 188700 }, { "epoch": 2.5377026264146885, "grad_norm": 0.6298586130142212, "learning_rate": 0.004841543415694982, "loss": 2.3964, "step": 188800 }, { "epoch": 2.5390467485685098, "grad_norm": 0.30196303129196167, "learning_rate": 0.004841357631265328, "loss": 2.3894, "step": 188900 }, { "epoch": 2.540390870722331, "grad_norm": 0.2881263792514801, "learning_rate": 0.004841171741563008, "loss": 2.4015, "step": 189000 }, { "epoch": 2.540390870722331, "eval_MaskedAccuracy": 0.4518744871539106, "eval_loss": 2.7033305168151855, "eval_runtime": 153.1061, "eval_samples_per_second": 414.588, "eval_steps_per_second": 1.62, "step": 189000 }, { "epoch": 2.541734992876153, "grad_norm": 0.23784345388412476, "learning_rate": 0.004840985746596394, "loss": 2.3971, "step": 189100 }, { "epoch": 2.543079115029974, "grad_norm": 0.2152082920074463, "learning_rate": 0.004840799646373867, "loss": 2.3981, "step": 189200 }, { "epoch": 2.5444232371837954, "grad_norm": 0.2263517528772354, "learning_rate": 0.004840613440903815, "loss": 2.3943, "step": 189300 }, { "epoch": 2.5457673593376167, "grad_norm": 0.6147149801254272, "learning_rate": 0.00484042713019462, "loss": 2.3895, "step": 189400 }, { "epoch": 2.547111481491438, "grad_norm": 0.3678668439388275, "learning_rate": 0.0048402407142546875, "loss": 2.3991, "step": 189500 }, { "epoch": 2.5484556036452592, "grad_norm": 0.323637455701828, "learning_rate": 0.0048400541930924135, "loss": 2.3957, "step": 189600 }, { "epoch": 2.5497997257990805, "grad_norm": 1.3563669919967651, "learning_rate": 0.004839867566716193, "loss": 2.3984, "step": 189700 }, { "epoch": 2.551143847952902, "grad_norm": 1.2862578630447388, "learning_rate": 0.004839680835134451, "loss": 2.4005, "step": 189800 }, { "epoch": 2.552487970106723, "grad_norm": 0.30557116866111755, "learning_rate": 0.00483949399835559, "loss": 2.4025, "step": 189900 }, { "epoch": 2.553832092260545, "grad_norm": 0.46706828474998474, "learning_rate": 0.004839307056388037, "loss": 2.3974, "step": 190000 }, { "epoch": 2.553832092260545, "eval_MaskedAccuracy": 0.4516189117545165, "eval_loss": 2.704475164413452, "eval_runtime": 155.082, "eval_samples_per_second": 409.306, "eval_steps_per_second": 1.599, "step": 190000 }, { "epoch": 2.5551762144143657, "grad_norm": 0.8386965990066528, "learning_rate": 0.0048391200092402055, "loss": 2.3969, "step": 190100 }, { "epoch": 2.5565203365681874, "grad_norm": 0.35076892375946045, "learning_rate": 0.004838932856920533, "loss": 2.4024, "step": 190200 }, { "epoch": 2.5578644587220087, "grad_norm": 0.2803960144519806, "learning_rate": 0.0048387455994374435, "loss": 2.389, "step": 190300 }, { "epoch": 2.55920858087583, "grad_norm": 0.4562201499938965, "learning_rate": 0.004838558236799381, "loss": 2.3991, "step": 190400 }, { "epoch": 2.5605527030296513, "grad_norm": 1.0036048889160156, "learning_rate": 0.004838370769014779, "loss": 2.3928, "step": 190500 }, { "epoch": 2.5618968251834726, "grad_norm": 0.2660607099533081, "learning_rate": 0.004838183196092097, "loss": 2.4001, "step": 190600 }, { "epoch": 2.563240947337294, "grad_norm": 0.6841756701469421, "learning_rate": 0.004837995518039784, "loss": 2.4028, "step": 190700 }, { "epoch": 2.564585069491115, "grad_norm": 1.2373573780059814, "learning_rate": 0.0048378077348662845, "loss": 2.4022, "step": 190800 }, { "epoch": 2.565929191644937, "grad_norm": 0.6526246070861816, "learning_rate": 0.004837619846580077, "loss": 2.3894, "step": 190900 }, { "epoch": 2.567273313798758, "grad_norm": 0.2829650938510895, "learning_rate": 0.004837431853189622, "loss": 2.4013, "step": 191000 }, { "epoch": 2.567273313798758, "eval_MaskedAccuracy": 0.4523377197669412, "eval_loss": 2.701768636703491, "eval_runtime": 154.0538, "eval_samples_per_second": 412.038, "eval_steps_per_second": 1.61, "step": 191000 }, { "epoch": 2.5686174359525795, "grad_norm": 0.32170721888542175, "learning_rate": 0.004837243754703377, "loss": 2.3919, "step": 191100 }, { "epoch": 2.569961558106401, "grad_norm": 0.49978765845298767, "learning_rate": 0.004837055551129837, "loss": 2.398, "step": 191200 }, { "epoch": 2.571305680260222, "grad_norm": 0.7216925024986267, "learning_rate": 0.004836867242477465, "loss": 2.389, "step": 191300 }, { "epoch": 2.5726498024140434, "grad_norm": 0.381653368473053, "learning_rate": 0.004836678828754757, "loss": 2.4022, "step": 191400 }, { "epoch": 2.5739939245678647, "grad_norm": 0.5341126322746277, "learning_rate": 0.004836490309970204, "loss": 2.4023, "step": 191500 }, { "epoch": 2.575338046721686, "grad_norm": 1.2294304370880127, "learning_rate": 0.00483630168613229, "loss": 2.3937, "step": 191600 }, { "epoch": 2.5766821688755073, "grad_norm": 0.9563962817192078, "learning_rate": 0.004836112957249523, "loss": 2.3978, "step": 191700 }, { "epoch": 2.578026291029329, "grad_norm": 0.6789734959602356, "learning_rate": 0.0048359241233304024, "loss": 2.3914, "step": 191800 }, { "epoch": 2.57937041318315, "grad_norm": 2.394522190093994, "learning_rate": 0.004835735184383436, "loss": 2.397, "step": 191900 }, { "epoch": 2.5807145353369716, "grad_norm": 0.9763964414596558, "learning_rate": 0.0048355461404171445, "loss": 2.3978, "step": 192000 }, { "epoch": 2.5807145353369716, "eval_MaskedAccuracy": 0.45172072887484477, "eval_loss": 2.704576253890991, "eval_runtime": 152.9163, "eval_samples_per_second": 415.103, "eval_steps_per_second": 1.622, "step": 192000 }, { "epoch": 2.582058657490793, "grad_norm": 1.0108835697174072, "learning_rate": 0.004835356991440036, "loss": 2.3952, "step": 192100 }, { "epoch": 2.583402779644614, "grad_norm": 0.4426333010196686, "learning_rate": 0.004835167737460643, "loss": 2.3953, "step": 192200 }, { "epoch": 2.5847469017984355, "grad_norm": 0.35795265436172485, "learning_rate": 0.004834978378487477, "loss": 2.3937, "step": 192300 }, { "epoch": 2.5860910239522568, "grad_norm": 0.5657187104225159, "learning_rate": 0.004834788914529085, "loss": 2.3972, "step": 192400 }, { "epoch": 2.587435146106078, "grad_norm": 1.4484549760818481, "learning_rate": 0.004834599345594001, "loss": 2.4053, "step": 192500 }, { "epoch": 2.5887792682598993, "grad_norm": 0.5882700085639954, "learning_rate": 0.004834409671690765, "loss": 2.3941, "step": 192600 }, { "epoch": 2.5901233904137206, "grad_norm": 0.2406308501958847, "learning_rate": 0.004834219892827928, "loss": 2.3877, "step": 192700 }, { "epoch": 2.591467512567542, "grad_norm": 0.6876055002212524, "learning_rate": 0.004834030009014036, "loss": 2.3869, "step": 192800 }, { "epoch": 2.5928116347213637, "grad_norm": 2.6464781761169434, "learning_rate": 0.0048338400202576435, "loss": 2.3974, "step": 192900 }, { "epoch": 2.594155756875185, "grad_norm": 0.5188945531845093, "learning_rate": 0.004833649926567314, "loss": 2.3896, "step": 193000 }, { "epoch": 2.594155756875185, "eval_MaskedAccuracy": 0.4505340990281368, "eval_loss": 2.70825457572937, "eval_runtime": 153.498, "eval_samples_per_second": 413.53, "eval_steps_per_second": 1.616, "step": 193000 }, { "epoch": 2.5954998790290063, "grad_norm": 0.28811684250831604, "learning_rate": 0.004833459727951611, "loss": 2.3915, "step": 193100 }, { "epoch": 2.5968440011828275, "grad_norm": 0.47558093070983887, "learning_rate": 0.004833269424419109, "loss": 2.39, "step": 193200 }, { "epoch": 2.598188123336649, "grad_norm": 0.34942418336868286, "learning_rate": 0.004833079015978379, "loss": 2.3917, "step": 193300 }, { "epoch": 2.59953224549047, "grad_norm": 0.28590691089630127, "learning_rate": 0.004832888502638003, "loss": 2.3953, "step": 193400 }, { "epoch": 2.6008763676442914, "grad_norm": 0.4065092206001282, "learning_rate": 0.004832697884406561, "loss": 2.3919, "step": 193500 }, { "epoch": 2.6022204897981127, "grad_norm": 0.7588686943054199, "learning_rate": 0.004832507161292647, "loss": 2.3976, "step": 193600 }, { "epoch": 2.603564611951934, "grad_norm": 0.8062926530838013, "learning_rate": 0.004832316333304851, "loss": 2.3993, "step": 193700 }, { "epoch": 2.6049087341057557, "grad_norm": 0.3884819447994232, "learning_rate": 0.004832125400451772, "loss": 2.3946, "step": 193800 }, { "epoch": 2.606252856259577, "grad_norm": 0.950999915599823, "learning_rate": 0.004831934362742015, "loss": 2.399, "step": 193900 }, { "epoch": 2.6075969784133983, "grad_norm": 0.2583158016204834, "learning_rate": 0.004831743220184189, "loss": 2.3895, "step": 194000 }, { "epoch": 2.6075969784133983, "eval_MaskedAccuracy": 0.45153046662578544, "eval_loss": 2.70489764213562, "eval_runtime": 156.1126, "eval_samples_per_second": 406.604, "eval_steps_per_second": 1.589, "step": 194000 }, { "epoch": 2.6089411005672196, "grad_norm": 0.20159654319286346, "learning_rate": 0.004831551972786904, "loss": 2.4009, "step": 194100 }, { "epoch": 2.610285222721041, "grad_norm": 0.3360646963119507, "learning_rate": 0.004831360620558774, "loss": 2.4018, "step": 194200 }, { "epoch": 2.611629344874862, "grad_norm": 1.5870589017868042, "learning_rate": 0.004831169163508427, "loss": 2.3897, "step": 194300 }, { "epoch": 2.6129734670286835, "grad_norm": 0.9621824622154236, "learning_rate": 0.004830977601644483, "loss": 2.3896, "step": 194400 }, { "epoch": 2.614317589182505, "grad_norm": 0.5799399018287659, "learning_rate": 0.004830785934975585, "loss": 2.3932, "step": 194500 }, { "epoch": 2.615661711336326, "grad_norm": 0.3511755168437958, "learning_rate": 0.004830594163510357, "loss": 2.3863, "step": 194600 }, { "epoch": 2.617005833490148, "grad_norm": 0.6359513401985168, "learning_rate": 0.004830402287257447, "loss": 2.3967, "step": 194700 }, { "epoch": 2.6183499556439687, "grad_norm": 0.7922179102897644, "learning_rate": 0.004830210306225503, "loss": 2.397, "step": 194800 }, { "epoch": 2.6196940777977904, "grad_norm": 0.43679991364479065, "learning_rate": 0.004830018220423168, "loss": 2.3966, "step": 194900 }, { "epoch": 2.6210381999516117, "grad_norm": 0.5114357471466064, "learning_rate": 0.004829826029859097, "loss": 2.3997, "step": 195000 }, { "epoch": 2.6210381999516117, "eval_MaskedAccuracy": 0.45087460050173545, "eval_loss": 2.7082619667053223, "eval_runtime": 153.3827, "eval_samples_per_second": 413.841, "eval_steps_per_second": 1.617, "step": 195000 }, { "epoch": 2.622382322105433, "grad_norm": 0.5121012330055237, "learning_rate": 0.004829633734541953, "loss": 2.3954, "step": 195100 }, { "epoch": 2.6237264442592543, "grad_norm": 0.693209171295166, "learning_rate": 0.0048294413344804, "loss": 2.3919, "step": 195200 }, { "epoch": 2.6250705664130756, "grad_norm": 0.23428793251514435, "learning_rate": 0.004829248829683105, "loss": 2.3982, "step": 195300 }, { "epoch": 2.626414688566897, "grad_norm": 0.37016478180885315, "learning_rate": 0.00482905622015875, "loss": 2.3932, "step": 195400 }, { "epoch": 2.627758810720718, "grad_norm": 0.6766939759254456, "learning_rate": 0.004828863505916004, "loss": 2.394, "step": 195500 }, { "epoch": 2.62910293287454, "grad_norm": 0.5270081758499146, "learning_rate": 0.004828670686963563, "loss": 2.3912, "step": 195600 }, { "epoch": 2.6304470550283607, "grad_norm": 0.31191486120224, "learning_rate": 0.004828477763310103, "loss": 2.4008, "step": 195700 }, { "epoch": 2.6317911771821825, "grad_norm": 0.21520403027534485, "learning_rate": 0.00482828473496432, "loss": 2.3868, "step": 195800 }, { "epoch": 2.6331352993360038, "grad_norm": 0.2524843215942383, "learning_rate": 0.00482809160193491, "loss": 2.3957, "step": 195900 }, { "epoch": 2.634479421489825, "grad_norm": 0.6567283272743225, "learning_rate": 0.004827898364230586, "loss": 2.3915, "step": 196000 }, { "epoch": 2.634479421489825, "eval_MaskedAccuracy": 0.4512480287660106, "eval_loss": 2.7046916484832764, "eval_runtime": 154.0885, "eval_samples_per_second": 411.945, "eval_steps_per_second": 1.609, "step": 196000 }, { "epoch": 2.6358235436436463, "grad_norm": 0.542457103729248, "learning_rate": 0.004827705021860038, "loss": 2.3947, "step": 196100 }, { "epoch": 2.6371676657974676, "grad_norm": 0.44133803248405457, "learning_rate": 0.004827511574831989, "loss": 2.3872, "step": 196200 }, { "epoch": 2.638511787951289, "grad_norm": 0.9183239936828613, "learning_rate": 0.004827318023155146, "loss": 2.3966, "step": 196300 }, { "epoch": 2.6398559101051102, "grad_norm": 0.851475715637207, "learning_rate": 0.0048271243668382485, "loss": 2.3931, "step": 196400 }, { "epoch": 2.641200032258932, "grad_norm": 0.9451562166213989, "learning_rate": 0.004826930605890004, "loss": 2.3922, "step": 196500 }, { "epoch": 2.642544154412753, "grad_norm": 0.5648373961448669, "learning_rate": 0.004826736740319149, "loss": 2.3882, "step": 196600 }, { "epoch": 2.6438882765665745, "grad_norm": 0.32394424080848694, "learning_rate": 0.0048265427701344185, "loss": 2.3918, "step": 196700 }, { "epoch": 2.645232398720396, "grad_norm": 0.5081258416175842, "learning_rate": 0.0048263486953445565, "loss": 2.3863, "step": 196800 }, { "epoch": 2.646576520874217, "grad_norm": 0.7854281067848206, "learning_rate": 0.004826154515958302, "loss": 2.3833, "step": 196900 }, { "epoch": 2.6479206430280384, "grad_norm": 0.5658829212188721, "learning_rate": 0.004825960231984411, "loss": 2.3879, "step": 197000 }, { "epoch": 2.6479206430280384, "eval_MaskedAccuracy": 0.4514419081074835, "eval_loss": 2.7042078971862793, "eval_runtime": 172.0922, "eval_samples_per_second": 368.849, "eval_steps_per_second": 1.441, "step": 197000 }, { "epoch": 2.6492647651818597, "grad_norm": 0.9744159579277039, "learning_rate": 0.004825765843431632, "loss": 2.3993, "step": 197100 }, { "epoch": 2.650608887335681, "grad_norm": 1.349397897720337, "learning_rate": 0.004825571350308726, "loss": 2.3985, "step": 197200 }, { "epoch": 2.6519530094895023, "grad_norm": 0.8859503865242004, "learning_rate": 0.004825376752624455, "loss": 2.3838, "step": 197300 }, { "epoch": 2.6532971316433236, "grad_norm": 0.27515995502471924, "learning_rate": 0.004825182050387594, "loss": 2.3907, "step": 197400 }, { "epoch": 2.654641253797145, "grad_norm": 0.2807437777519226, "learning_rate": 0.0048249872436069065, "loss": 2.3969, "step": 197500 }, { "epoch": 2.6559853759509666, "grad_norm": 0.9948381185531616, "learning_rate": 0.00482479233229117, "loss": 2.392, "step": 197600 }, { "epoch": 2.657329498104788, "grad_norm": 1.0630486011505127, "learning_rate": 0.004824597316449177, "loss": 2.3895, "step": 197700 }, { "epoch": 2.658673620258609, "grad_norm": 0.24495822191238403, "learning_rate": 0.004824402196089705, "loss": 2.3984, "step": 197800 }, { "epoch": 2.6600177424124305, "grad_norm": 1.0143710374832153, "learning_rate": 0.004824206971221553, "loss": 2.3864, "step": 197900 }, { "epoch": 2.661361864566252, "grad_norm": 0.2758951187133789, "learning_rate": 0.0048240116418535135, "loss": 2.3837, "step": 198000 }, { "epoch": 2.661361864566252, "eval_MaskedAccuracy": 0.4513660960606149, "eval_loss": 2.70383882522583, "eval_runtime": 159.3018, "eval_samples_per_second": 398.464, "eval_steps_per_second": 1.557, "step": 198000 }, { "epoch": 2.662705986720073, "grad_norm": 0.34540656208992004, "learning_rate": 0.004823816207994382, "loss": 2.3947, "step": 198100 }, { "epoch": 2.6640501088738944, "grad_norm": 0.716076672077179, "learning_rate": 0.004823620669652978, "loss": 2.3981, "step": 198200 }, { "epoch": 2.6653942310277157, "grad_norm": 0.37496283650398254, "learning_rate": 0.004823425026838102, "loss": 2.3902, "step": 198300 }, { "epoch": 2.666738353181537, "grad_norm": 2.4844629764556885, "learning_rate": 0.004823229279558573, "loss": 2.387, "step": 198400 }, { "epoch": 2.6680824753353587, "grad_norm": 0.2198270559310913, "learning_rate": 0.004823033427823211, "loss": 2.3895, "step": 198500 }, { "epoch": 2.6694265974891795, "grad_norm": 0.23235642910003662, "learning_rate": 0.004822837471640838, "loss": 2.3955, "step": 198600 }, { "epoch": 2.6707707196430013, "grad_norm": 0.5798563957214355, "learning_rate": 0.00482264141102029, "loss": 2.3911, "step": 198700 }, { "epoch": 2.6721148417968226, "grad_norm": 0.36171451210975647, "learning_rate": 0.004822445245970393, "loss": 2.3919, "step": 198800 }, { "epoch": 2.673458963950644, "grad_norm": 0.9405841827392578, "learning_rate": 0.00482224897649999, "loss": 2.3965, "step": 198900 }, { "epoch": 2.674803086104465, "grad_norm": 0.40147000551223755, "learning_rate": 0.004822052602617928, "loss": 2.3928, "step": 199000 }, { "epoch": 2.674803086104465, "eval_MaskedAccuracy": 0.451330398429542, "eval_loss": 2.7040762901306152, "eval_runtime": 152.7346, "eval_samples_per_second": 415.597, "eval_steps_per_second": 1.624, "step": 199000 }, { "epoch": 2.6761472082582864, "grad_norm": 0.9061090350151062, "learning_rate": 0.004821856124333049, "loss": 2.3848, "step": 199100 }, { "epoch": 2.6774913304121077, "grad_norm": 0.8056171536445618, "learning_rate": 0.004821659541654204, "loss": 2.3965, "step": 199200 }, { "epoch": 2.678835452565929, "grad_norm": 2.8304381370544434, "learning_rate": 0.004821462854590257, "loss": 2.3895, "step": 199300 }, { "epoch": 2.6801795747197508, "grad_norm": 0.6876057386398315, "learning_rate": 0.004821266063150074, "loss": 2.3914, "step": 199400 }, { "epoch": 2.6815236968735716, "grad_norm": 1.1147319078445435, "learning_rate": 0.004821069167342508, "loss": 2.3884, "step": 199500 }, { "epoch": 2.6828678190273934, "grad_norm": 0.563893735408783, "learning_rate": 0.004820872167176442, "loss": 2.3824, "step": 199600 }, { "epoch": 2.6842119411812146, "grad_norm": 0.9097170829772949, "learning_rate": 0.004820675062660752, "loss": 2.3951, "step": 199700 }, { "epoch": 2.685556063335036, "grad_norm": 0.5297996997833252, "learning_rate": 0.004820477853804319, "loss": 2.3987, "step": 199800 }, { "epoch": 2.6869001854888572, "grad_norm": 0.4210087060928345, "learning_rate": 0.0048202805406160265, "loss": 2.3966, "step": 199900 }, { "epoch": 2.6882443076426785, "grad_norm": 0.9611253142356873, "learning_rate": 0.004820083123104765, "loss": 2.3946, "step": 200000 }, { "epoch": 2.6882443076426785, "eval_MaskedAccuracy": 0.4521056806971866, "eval_loss": 2.7007529735565186, "eval_runtime": 154.1926, "eval_samples_per_second": 411.667, "eval_steps_per_second": 1.608, "step": 200000 } ], "logging_steps": 100, "max_steps": 1500000, "num_input_tokens_seen": 0, "num_train_epochs": 21, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.56886375582715e+18, "train_batch_size": 384, "trial_name": null, "trial_params": null }