diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26083 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996087515448103, + "eval_steps": 50, + "global_step": 2515, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00039745874812914924, + "grad_norm": 4.6875, + "learning_rate": 4.000000000000001e-06, + "lm_loss": 13.1874, + "loss": 34.0326, + "mask_loss": 8.446, + "step": 1, + "topk_loss": 12.3992 + }, + { + "epoch": 0.0007949174962582985, + "grad_norm": 4.5, + "learning_rate": 8.000000000000001e-06, + "lm_loss": 13.1965, + "loss": 34.0618, + "mask_loss": 8.4655, + "step": 2, + "topk_loss": 12.3999 + }, + { + "epoch": 0.0011923762443874478, + "grad_norm": 4.375, + "learning_rate": 1.2e-05, + "lm_loss": 13.1916, + "loss": 34.0373, + "mask_loss": 8.4476, + "step": 3, + "topk_loss": 12.398 + }, + { + "epoch": 0.001589834992516597, + "grad_norm": 4.5, + "learning_rate": 1.6000000000000003e-05, + "lm_loss": 13.1915, + "loss": 34.0283, + "mask_loss": 8.4445, + "step": 4, + "topk_loss": 12.3922 + }, + { + "epoch": 0.0019872937406457463, + "grad_norm": 4.5625, + "learning_rate": 2e-05, + "lm_loss": 13.181, + "loss": 33.9921, + "mask_loss": 8.4261, + "step": 5, + "topk_loss": 12.385 + }, + { + "epoch": 0.0023847524887748955, + "grad_norm": 4.28125, + "learning_rate": 2.4e-05, + "lm_loss": 13.1677, + "loss": 33.9502, + "mask_loss": 8.4174, + "step": 6, + "topk_loss": 12.365 + }, + { + "epoch": 0.0027822112369040447, + "grad_norm": 4.46875, + "learning_rate": 2.8000000000000003e-05, + "lm_loss": 13.15, + "loss": 33.8695, + "mask_loss": 8.3807, + "step": 7, + "topk_loss": 12.3388 + }, + { + "epoch": 0.003179669985033194, + "grad_norm": 4.34375, + "learning_rate": 3.2000000000000005e-05, + "lm_loss": 13.1204, + "loss": 33.7783, + "mask_loss": 8.3465, + "step": 8, + "topk_loss": 12.3114 + }, + { + "epoch": 0.003577128733162343, + "grad_norm": 4.28125, + "learning_rate": 3.6e-05, + "lm_loss": 13.0919, + "loss": 33.688, + "mask_loss": 8.3141, + "step": 9, + "topk_loss": 12.2821 + }, + { + "epoch": 0.003974587481291493, + "grad_norm": 4.125, + "learning_rate": 4e-05, + "lm_loss": 13.0259, + "loss": 33.4608, + "mask_loss": 8.2243, + "step": 10, + "topk_loss": 12.2106 + }, + { + "epoch": 0.004372046229420641, + "grad_norm": 4.09375, + "learning_rate": 4.4000000000000006e-05, + "lm_loss": 12.9687, + "loss": 33.24, + "mask_loss": 8.1515, + "step": 11, + "topk_loss": 12.1198 + }, + { + "epoch": 0.004769504977549791, + "grad_norm": 3.96875, + "learning_rate": 4.8e-05, + "lm_loss": 12.8958, + "loss": 33.001, + "mask_loss": 8.0599, + "step": 12, + "topk_loss": 12.0454 + }, + { + "epoch": 0.005166963725678941, + "grad_norm": 3.859375, + "learning_rate": 5.2000000000000004e-05, + "lm_loss": 12.8306, + "loss": 32.7889, + "mask_loss": 7.9851, + "step": 13, + "topk_loss": 11.9732 + }, + { + "epoch": 0.005564422473808089, + "grad_norm": 3.9375, + "learning_rate": 5.6000000000000006e-05, + "lm_loss": 12.7439, + "loss": 32.4743, + "mask_loss": 7.8639, + "step": 14, + "topk_loss": 11.8666 + }, + { + "epoch": 0.005961881221937239, + "grad_norm": 3.828125, + "learning_rate": 6e-05, + "lm_loss": 12.663, + "loss": 32.2566, + "mask_loss": 7.7964, + "step": 15, + "topk_loss": 11.7973 + }, + { + "epoch": 0.006359339970066388, + "grad_norm": 3.828125, + "learning_rate": 6.400000000000001e-05, + "lm_loss": 12.6021, + "loss": 32.0481, + "mask_loss": 7.7306, + "step": 16, + "topk_loss": 11.7154 + }, + { + "epoch": 0.006756798718195537, + "grad_norm": 4.1875, + "learning_rate": 6.800000000000001e-05, + "lm_loss": 12.4821, + "loss": 31.6167, + "mask_loss": 7.5745, + "step": 17, + "topk_loss": 11.5601 + }, + { + "epoch": 0.007154257466324686, + "grad_norm": 4.46875, + "learning_rate": 7.2e-05, + "lm_loss": 12.3375, + "loss": 31.0941, + "mask_loss": 7.4022, + "step": 18, + "topk_loss": 11.3544 + }, + { + "epoch": 0.007551716214453836, + "grad_norm": 5.0, + "learning_rate": 7.6e-05, + "lm_loss": 12.1452, + "loss": 30.3741, + "mask_loss": 7.1468, + "step": 19, + "topk_loss": 11.0821 + }, + { + "epoch": 0.007949174962582985, + "grad_norm": 5.375, + "learning_rate": 8e-05, + "lm_loss": 11.9773, + "loss": 29.7729, + "mask_loss": 6.9488, + "step": 20, + "topk_loss": 10.8468 + }, + { + "epoch": 0.008346633710712134, + "grad_norm": 6.03125, + "learning_rate": 8.4e-05, + "lm_loss": 11.777, + "loss": 28.9801, + "mask_loss": 6.6822, + "step": 21, + "topk_loss": 10.5208 + }, + { + "epoch": 0.008744092458841283, + "grad_norm": 6.21875, + "learning_rate": 8.800000000000001e-05, + "lm_loss": 11.6339, + "loss": 28.5436, + "mask_loss": 6.5708, + "step": 22, + "topk_loss": 10.3388 + }, + { + "epoch": 0.009141551206970433, + "grad_norm": 6.5625, + "learning_rate": 9.200000000000001e-05, + "lm_loss": 11.478, + "loss": 27.9254, + "mask_loss": 6.3739, + "step": 23, + "topk_loss": 10.0736 + }, + { + "epoch": 0.009539009955099582, + "grad_norm": 7.28125, + "learning_rate": 9.6e-05, + "lm_loss": 11.2339, + "loss": 26.8817, + "mask_loss": 6.0258, + "step": 24, + "topk_loss": 9.622 + }, + { + "epoch": 0.00993646870322873, + "grad_norm": 7.3125, + "learning_rate": 0.0001, + "lm_loss": 11.0631, + "loss": 26.26, + "mask_loss": 5.8523, + "step": 25, + "topk_loss": 9.3446 + }, + { + "epoch": 0.010333927451357881, + "grad_norm": 7.6875, + "learning_rate": 0.00010400000000000001, + "lm_loss": 10.8143, + "loss": 25.2251, + "mask_loss": 5.5267, + "step": 26, + "topk_loss": 8.8841 + }, + { + "epoch": 0.01073138619948703, + "grad_norm": 7.75, + "learning_rate": 0.00010800000000000001, + "lm_loss": 10.5906, + "loss": 24.3879, + "mask_loss": 5.2855, + "step": 27, + "topk_loss": 8.5119 + }, + { + "epoch": 0.011128844947616179, + "grad_norm": 7.90625, + "learning_rate": 0.00011200000000000001, + "lm_loss": 10.3778, + "loss": 23.3374, + "mask_loss": 4.96, + "step": 28, + "topk_loss": 7.9996 + }, + { + "epoch": 0.011526303695745328, + "grad_norm": 7.53125, + "learning_rate": 0.000116, + "lm_loss": 10.1656, + "loss": 22.4997, + "mask_loss": 4.7425, + "step": 29, + "topk_loss": 7.5915 + }, + { + "epoch": 0.011923762443874478, + "grad_norm": 7.65625, + "learning_rate": 0.00012, + "lm_loss": 9.8681, + "loss": 21.171, + "mask_loss": 4.3273, + "step": 30, + "topk_loss": 6.9756 + }, + { + "epoch": 0.012321221192003627, + "grad_norm": 7.0625, + "learning_rate": 0.000124, + "lm_loss": 9.6834, + "loss": 20.4646, + "mask_loss": 4.1626, + "step": 31, + "topk_loss": 6.6186 + }, + { + "epoch": 0.012718679940132776, + "grad_norm": 6.65625, + "learning_rate": 0.00012800000000000002, + "lm_loss": 9.5271, + "loss": 19.8425, + "mask_loss": 4.0218, + "step": 32, + "topk_loss": 6.2936 + }, + { + "epoch": 0.013116138688261926, + "grad_norm": 7.59375, + "learning_rate": 0.000132, + "lm_loss": 9.1918, + "loss": 17.9938, + "mask_loss": 3.4106, + "step": 33, + "topk_loss": 5.3914 + }, + { + "epoch": 0.013513597436391075, + "grad_norm": 7.4375, + "learning_rate": 0.00013600000000000003, + "lm_loss": 9.1679, + "loss": 18.1523, + "mask_loss": 3.5831, + "step": 34, + "topk_loss": 5.4013 + }, + { + "epoch": 0.013911056184520224, + "grad_norm": 7.46875, + "learning_rate": 0.00014, + "lm_loss": 8.9109, + "loss": 16.9051, + "mask_loss": 3.196, + "step": 35, + "topk_loss": 4.7982 + }, + { + "epoch": 0.014308514932649372, + "grad_norm": 7.1875, + "learning_rate": 0.000144, + "lm_loss": 8.6916, + "loss": 16.0099, + "mask_loss": 2.9918, + "step": 36, + "topk_loss": 4.3265 + }, + { + "epoch": 0.014705973680778523, + "grad_norm": 6.0, + "learning_rate": 0.000148, + "lm_loss": 8.6587, + "loss": 15.9112, + "mask_loss": 3.0149, + "step": 37, + "topk_loss": 4.2376 + }, + { + "epoch": 0.015103432428907672, + "grad_norm": 5.75, + "learning_rate": 0.000152, + "lm_loss": 8.5764, + "loss": 15.4146, + "mask_loss": 2.9393, + "step": 38, + "topk_loss": 3.8989 + }, + { + "epoch": 0.01550089117703682, + "grad_norm": 5.78125, + "learning_rate": 0.00015600000000000002, + "lm_loss": 8.5832, + "loss": 15.6764, + "mask_loss": 3.089, + "step": 39, + "topk_loss": 4.0043 + }, + { + "epoch": 0.01589834992516597, + "grad_norm": 5.78125, + "learning_rate": 0.00016, + "lm_loss": 8.4336, + "loss": 14.6859, + "mask_loss": 2.7272, + "step": 40, + "topk_loss": 3.525 + }, + { + "epoch": 0.01629580867329512, + "grad_norm": 6.0, + "learning_rate": 0.000164, + "lm_loss": 8.4196, + "loss": 14.891, + "mask_loss": 2.8416, + "step": 41, + "topk_loss": 3.6298 + }, + { + "epoch": 0.016693267421424268, + "grad_norm": 5.90625, + "learning_rate": 0.000168, + "lm_loss": 8.2859, + "loss": 13.974, + "mask_loss": 2.506, + "step": 42, + "topk_loss": 3.1822 + }, + { + "epoch": 0.017090726169553417, + "grad_norm": 6.4375, + "learning_rate": 0.000172, + "lm_loss": 8.3544, + "loss": 14.6417, + "mask_loss": 2.8646, + "step": 43, + "topk_loss": 3.4227 + }, + { + "epoch": 0.017488184917682566, + "grad_norm": 6.28125, + "learning_rate": 0.00017600000000000002, + "lm_loss": 8.1956, + "loss": 14.0252, + "mask_loss": 2.6556, + "step": 44, + "topk_loss": 3.174 + }, + { + "epoch": 0.017885643665811718, + "grad_norm": 5.9375, + "learning_rate": 0.00018, + "lm_loss": 8.0939, + "loss": 12.9191, + "mask_loss": 2.2004, + "step": 45, + "topk_loss": 2.6249 + }, + { + "epoch": 0.018283102413940867, + "grad_norm": 6.03125, + "learning_rate": 0.00018400000000000003, + "lm_loss": 8.0049, + "loss": 13.2306, + "mask_loss": 2.4107, + "step": 46, + "topk_loss": 2.815 + }, + { + "epoch": 0.018680561162070015, + "grad_norm": 5.34375, + "learning_rate": 0.000188, + "lm_loss": 7.8148, + "loss": 12.1767, + "mask_loss": 2.0091, + "step": 47, + "topk_loss": 2.3528 + }, + { + "epoch": 0.019078019910199164, + "grad_norm": 5.40625, + "learning_rate": 0.000192, + "lm_loss": 7.7777, + "loss": 12.071, + "mask_loss": 1.9772, + "step": 48, + "topk_loss": 2.3161 + }, + { + "epoch": 0.019475478658328313, + "grad_norm": 5.28125, + "learning_rate": 0.000196, + "lm_loss": 7.7451, + "loss": 12.0437, + "mask_loss": 1.9371, + "step": 49, + "topk_loss": 2.3615 + }, + { + "epoch": 0.01987293740645746, + "grad_norm": 5.6875, + "learning_rate": 0.0002, + "lm_loss": 7.7121, + "loss": 12.1355, + "mask_loss": 2.017, + "step": 50, + "topk_loss": 2.4065 + }, + { + "epoch": 0.01987293740645746, + "eval_lm_loss": 320.9280700683594, + "eval_loss": 324.5454406738281, + "eval_mask_hit_rate": 0.18616949021816254, + "eval_mask_loss": 1.6387584209442139, + "eval_mask_top_10_hit_rate": 0.5981898903846741, + "eval_mask_top_1_hit_rate": 0.719202995300293, + "eval_mask_top_20_hit_rate": 0.5433949828147888, + "eval_mask_top_5_hit_rate": 0.6420404314994812, + "eval_runtime": 147.9637, + "eval_samples_per_second": 13.841, + "eval_steps_per_second": 6.921, + "eval_token_accuracy": 0.17364394664764404, + "eval_top_k_diff": 15844.38671875, + "eval_topk_loss": 1.9786218404769897, + "step": 50 + }, + { + "epoch": 0.02027039615458661, + "grad_norm": 5.375, + "learning_rate": 0.00019999991878507574, + "lm_loss": 7.4662, + "loss": 11.3416, + "mask_loss": 1.7451, + "step": 51, + "topk_loss": 2.1303 + }, + { + "epoch": 0.020667854902715763, + "grad_norm": 4.96875, + "learning_rate": 0.00019999967514043482, + "lm_loss": 7.3426, + "loss": 11.0435, + "mask_loss": 1.6574, + "step": 52, + "topk_loss": 2.0435 + }, + { + "epoch": 0.02106531365084491, + "grad_norm": 4.875, + "learning_rate": 0.000199999269066473, + "lm_loss": 7.27, + "loss": 10.9138, + "mask_loss": 1.6493, + "step": 53, + "topk_loss": 1.9945 + }, + { + "epoch": 0.02146277239897406, + "grad_norm": 3.578125, + "learning_rate": 0.00019999870056384994, + "lm_loss": 6.9719, + "loss": 9.7904, + "mask_loss": 1.2467, + "step": 54, + "topk_loss": 1.5717 + }, + { + "epoch": 0.02186023114710321, + "grad_norm": 4.09375, + "learning_rate": 0.00019999796963348897, + "lm_loss": 7.0271, + "loss": 10.1004, + "mask_loss": 1.3678, + "step": 55, + "topk_loss": 1.7055 + }, + { + "epoch": 0.022257689895232358, + "grad_norm": 3.578125, + "learning_rate": 0.00019999707627657736, + "lm_loss": 6.9151, + "loss": 9.7666, + "mask_loss": 1.2558, + "step": 56, + "topk_loss": 1.5958 + }, + { + "epoch": 0.022655148643361506, + "grad_norm": 3.546875, + "learning_rate": 0.0001999960204945662, + "lm_loss": 6.85, + "loss": 9.5896, + "mask_loss": 1.1859, + "step": 57, + "topk_loss": 1.5536 + }, + { + "epoch": 0.023052607391490655, + "grad_norm": 3.0625, + "learning_rate": 0.0001999948022891704, + "lm_loss": 6.6166, + "loss": 9.1052, + "mask_loss": 1.1085, + "step": 58, + "topk_loss": 1.3801 + }, + { + "epoch": 0.023450066139619807, + "grad_norm": 2.65625, + "learning_rate": 0.00019999342166236868, + "lm_loss": 6.6206, + "loss": 9.1253, + "mask_loss": 1.1119, + "step": 59, + "topk_loss": 1.3928 + }, + { + "epoch": 0.023847524887748956, + "grad_norm": 2.375, + "learning_rate": 0.00019999187861640362, + "lm_loss": 6.6699, + "loss": 8.8312, + "mask_loss": 0.9332, + "step": 60, + "topk_loss": 1.2282 + }, + { + "epoch": 0.024244983635878105, + "grad_norm": 1.8515625, + "learning_rate": 0.00019999017315378153, + "lm_loss": 6.5315, + "loss": 8.4229, + "mask_loss": 0.8263, + "step": 61, + "topk_loss": 1.0651 + }, + { + "epoch": 0.024642442384007254, + "grad_norm": 1.75, + "learning_rate": 0.00019998830527727263, + "lm_loss": 6.5239, + "loss": 8.435, + "mask_loss": 0.8378, + "step": 62, + "topk_loss": 1.0733 + }, + { + "epoch": 0.025039901132136402, + "grad_norm": 1.5859375, + "learning_rate": 0.0001999862749899109, + "lm_loss": 6.2827, + "loss": 7.9781, + "mask_loss": 0.7475, + "step": 63, + "topk_loss": 0.9479 + }, + { + "epoch": 0.02543735988026555, + "grad_norm": 1.5, + "learning_rate": 0.0001999840822949941, + "lm_loss": 6.351, + "loss": 7.812, + "mask_loss": 0.6495, + "step": 64, + "topk_loss": 0.8115 + }, + { + "epoch": 0.0258348186283947, + "grad_norm": 1.4453125, + "learning_rate": 0.0001999817271960839, + "lm_loss": 6.3347, + "loss": 7.8494, + "mask_loss": 0.6792, + "step": 65, + "topk_loss": 0.8355 + }, + { + "epoch": 0.026232277376523852, + "grad_norm": 1.546875, + "learning_rate": 0.0001999792096970056, + "lm_loss": 6.231, + "loss": 7.4894, + "mask_loss": 0.572, + "step": 66, + "topk_loss": 0.6865 + }, + { + "epoch": 0.026629736124653, + "grad_norm": 1.359375, + "learning_rate": 0.00019997652980184843, + "lm_loss": 6.2884, + "loss": 7.6097, + "mask_loss": 0.6205, + "step": 67, + "topk_loss": 0.7008 + }, + { + "epoch": 0.02702719487278215, + "grad_norm": 1.359375, + "learning_rate": 0.00019997368751496528, + "lm_loss": 6.0524, + "loss": 7.2628, + "mask_loss": 0.5931, + "step": 68, + "topk_loss": 0.6173 + }, + { + "epoch": 0.0274246536209113, + "grad_norm": 1.359375, + "learning_rate": 0.00019997068284097295, + "lm_loss": 6.1849, + "loss": 7.393, + "mask_loss": 0.5898, + "step": 69, + "topk_loss": 0.6183 + }, + { + "epoch": 0.027822112369040447, + "grad_norm": 1.1796875, + "learning_rate": 0.00019996751578475186, + "lm_loss": 6.0249, + "loss": 7.0865, + "mask_loss": 0.5251, + "step": 70, + "topk_loss": 0.5364 + }, + { + "epoch": 0.028219571117169596, + "grad_norm": 1.078125, + "learning_rate": 0.0001999641863514463, + "lm_loss": 5.9085, + "loss": 6.8733, + "mask_loss": 0.5183, + "step": 71, + "topk_loss": 0.4465 + }, + { + "epoch": 0.028617029865298745, + "grad_norm": 0.98046875, + "learning_rate": 0.0001999606945464642, + "lm_loss": 5.8834, + "loss": 6.853, + "mask_loss": 0.5229, + "step": 72, + "topk_loss": 0.4467 + }, + { + "epoch": 0.029014488613427897, + "grad_norm": 0.8828125, + "learning_rate": 0.0001999570403754774, + "lm_loss": 5.796, + "loss": 6.6022, + "mask_loss": 0.4409, + "step": 73, + "topk_loss": 0.3653 + }, + { + "epoch": 0.029411947361557046, + "grad_norm": 0.8671875, + "learning_rate": 0.00019995322384442127, + "lm_loss": 5.7359, + "loss": 6.5091, + "mask_loss": 0.4397, + "step": 74, + "topk_loss": 0.3336 + }, + { + "epoch": 0.029809406109686194, + "grad_norm": 0.921875, + "learning_rate": 0.00019994924495949504, + "lm_loss": 5.6994, + "loss": 6.5209, + "mask_loss": 0.46, + "step": 75, + "topk_loss": 0.3615 + }, + { + "epoch": 0.030206864857815343, + "grad_norm": 1.0390625, + "learning_rate": 0.00019994510372716158, + "lm_loss": 5.5887, + "loss": 6.4227, + "mask_loss": 0.4811, + "step": 76, + "topk_loss": 0.353 + }, + { + "epoch": 0.030604323605944492, + "grad_norm": 0.875, + "learning_rate": 0.0001999408001541475, + "lm_loss": 5.472, + "loss": 6.2222, + "mask_loss": 0.445, + "step": 77, + "topk_loss": 0.3052 + }, + { + "epoch": 0.03100178235407364, + "grad_norm": 0.80859375, + "learning_rate": 0.00019993633424744307, + "lm_loss": 5.5286, + "loss": 6.2379, + "mask_loss": 0.4314, + "step": 78, + "topk_loss": 0.2779 + }, + { + "epoch": 0.03139924110220279, + "grad_norm": 0.78515625, + "learning_rate": 0.0001999317060143023, + "lm_loss": 5.5386, + "loss": 6.2345, + "mask_loss": 0.4308, + "step": 79, + "topk_loss": 0.2652 + }, + { + "epoch": 0.03179669985033194, + "grad_norm": 0.7421875, + "learning_rate": 0.00019992691546224278, + "lm_loss": 5.3942, + "loss": 6.0123, + "mask_loss": 0.3858, + "step": 80, + "topk_loss": 0.2323 + }, + { + "epoch": 0.03219415859846109, + "grad_norm": 0.6171875, + "learning_rate": 0.00019992196259904584, + "lm_loss": 5.4596, + "loss": 5.9972, + "mask_loss": 0.3486, + "step": 81, + "topk_loss": 0.189 + }, + { + "epoch": 0.03259161734659024, + "grad_norm": 0.70703125, + "learning_rate": 0.00019991684743275636, + "lm_loss": 5.3695, + "loss": 5.9697, + "mask_loss": 0.3813, + "step": 82, + "topk_loss": 0.219 + }, + { + "epoch": 0.03298907609471939, + "grad_norm": 0.60546875, + "learning_rate": 0.00019991156997168293, + "lm_loss": 5.2743, + "loss": 5.8273, + "mask_loss": 0.3639, + "step": 83, + "topk_loss": 0.1891 + }, + { + "epoch": 0.033386534842848536, + "grad_norm": 0.953125, + "learning_rate": 0.0001999061302243977, + "lm_loss": 5.3097, + "loss": 6.0197, + "mask_loss": 0.4563, + "step": 84, + "topk_loss": 0.2538 + }, + { + "epoch": 0.03378399359097769, + "grad_norm": 0.6953125, + "learning_rate": 0.00019990052819973642, + "lm_loss": 5.1893, + "loss": 5.7923, + "mask_loss": 0.3903, + "step": 85, + "topk_loss": 0.2126 + }, + { + "epoch": 0.034181452339106834, + "grad_norm": 0.5078125, + "learning_rate": 0.00019989476390679854, + "lm_loss": 5.1706, + "loss": 5.6086, + "mask_loss": 0.3068, + "step": 86, + "topk_loss": 0.1312 + }, + { + "epoch": 0.034578911087235986, + "grad_norm": 0.6171875, + "learning_rate": 0.0001998888373549469, + "lm_loss": 5.0552, + "loss": 5.563, + "mask_loss": 0.3447, + "step": 87, + "topk_loss": 0.1631 + }, + { + "epoch": 0.03497636983536513, + "grad_norm": 0.56640625, + "learning_rate": 0.00019988274855380804, + "lm_loss": 5.1613, + "loss": 5.6357, + "mask_loss": 0.32, + "step": 88, + "topk_loss": 0.1545 + }, + { + "epoch": 0.035373828583494284, + "grad_norm": 0.72265625, + "learning_rate": 0.00019987649751327196, + "lm_loss": 5.0626, + "loss": 5.607, + "mask_loss": 0.3565, + "step": 89, + "topk_loss": 0.1879 + }, + { + "epoch": 0.035771287331623436, + "grad_norm": 0.478515625, + "learning_rate": 0.00019987008424349225, + "lm_loss": 4.9735, + "loss": 5.4065, + "mask_loss": 0.3099, + "step": 90, + "topk_loss": 0.1231 + }, + { + "epoch": 0.03616874607975258, + "grad_norm": 0.72265625, + "learning_rate": 0.00019986350875488595, + "lm_loss": 4.9142, + "loss": 5.4443, + "mask_loss": 0.3508, + "step": 91, + "topk_loss": 0.1793 + }, + { + "epoch": 0.03656620482788173, + "grad_norm": 0.6953125, + "learning_rate": 0.00019985677105813362, + "lm_loss": 4.9241, + "loss": 5.4404, + "mask_loss": 0.3437, + "step": 92, + "topk_loss": 0.1725 + }, + { + "epoch": 0.03696366357601088, + "grad_norm": 0.5078125, + "learning_rate": 0.00019984987116417928, + "lm_loss": 4.9039, + "loss": 5.3879, + "mask_loss": 0.3354, + "step": 93, + "topk_loss": 0.1485 + }, + { + "epoch": 0.03736112232414003, + "grad_norm": 0.41796875, + "learning_rate": 0.00019984280908423043, + "lm_loss": 4.7759, + "loss": 5.1349, + "mask_loss": 0.2757, + "step": 94, + "topk_loss": 0.0832 + }, + { + "epoch": 0.037758581072269176, + "grad_norm": 0.453125, + "learning_rate": 0.00019983558482975799, + "lm_loss": 4.8718, + "loss": 5.2221, + "mask_loss": 0.2656, + "step": 95, + "topk_loss": 0.0848 + }, + { + "epoch": 0.03815603982039833, + "grad_norm": 0.59765625, + "learning_rate": 0.00019982819841249632, + "lm_loss": 4.7451, + "loss": 5.2327, + "mask_loss": 0.3364, + "step": 96, + "topk_loss": 0.1513 + }, + { + "epoch": 0.03855349856852748, + "grad_norm": 0.625, + "learning_rate": 0.00019982064984444317, + "lm_loss": 4.7885, + "loss": 5.2653, + "mask_loss": 0.3325, + "step": 97, + "topk_loss": 0.1443 + }, + { + "epoch": 0.038950957316656626, + "grad_norm": 0.48046875, + "learning_rate": 0.00019981293913785963, + "lm_loss": 4.7141, + "loss": 5.13, + "mask_loss": 0.3013, + "step": 98, + "topk_loss": 0.1147 + }, + { + "epoch": 0.03934841606478578, + "grad_norm": 0.40625, + "learning_rate": 0.00019980506630527022, + "lm_loss": 4.5939, + "loss": 4.974, + "mask_loss": 0.2834, + "step": 99, + "topk_loss": 0.0967 + }, + { + "epoch": 0.03974587481291492, + "grad_norm": 0.380859375, + "learning_rate": 0.00019979703135946278, + "lm_loss": 4.7149, + "loss": 5.0904, + "mask_loss": 0.2803, + "step": 100, + "topk_loss": 0.0952 + }, + { + "epoch": 0.03974587481291492, + "eval_lm_loss": 706.3309326171875, + "eval_loss": 706.7132568359375, + "eval_mask_hit_rate": 0.3013326823711395, + "eval_mask_loss": 0.27427732944488525, + "eval_mask_top_10_hit_rate": 0.7967365980148315, + "eval_mask_top_1_hit_rate": 0.8906242847442627, + "eval_mask_top_20_hit_rate": 0.7520042657852173, + "eval_mask_top_5_hit_rate": 0.8314727544784546, + "eval_runtime": 147.1012, + "eval_samples_per_second": 13.922, + "eval_steps_per_second": 6.961, + "eval_token_accuracy": 0.36604347825050354, + "eval_top_k_diff": -257.53240966796875, + "eval_topk_loss": 0.10801860690116882, + "step": 100 + }, + { + "epoch": 0.040143333561044076, + "grad_norm": 0.55859375, + "learning_rate": 0.00019978883431348845, + "lm_loss": 4.6222, + "loss": 5.0187, + "mask_loss": 0.2857, + "step": 101, + "topk_loss": 0.1108 + }, + { + "epoch": 0.04054079230917322, + "grad_norm": 0.4453125, + "learning_rate": 0.00019978047518066165, + "lm_loss": 4.5105, + "loss": 4.9176, + "mask_loss": 0.2925, + "step": 102, + "topk_loss": 0.1146 + }, + { + "epoch": 0.04093825105730237, + "grad_norm": 0.453125, + "learning_rate": 0.0001997719539745601, + "lm_loss": 4.577, + "loss": 4.9669, + "mask_loss": 0.2823, + "step": 103, + "topk_loss": 0.1076 + }, + { + "epoch": 0.041335709805431525, + "grad_norm": 0.443359375, + "learning_rate": 0.0001997632707090249, + "lm_loss": 4.5435, + "loss": 4.9416, + "mask_loss": 0.2884, + "step": 104, + "topk_loss": 0.1098 + }, + { + "epoch": 0.04173316855356067, + "grad_norm": 0.373046875, + "learning_rate": 0.00019975442539816012, + "lm_loss": 4.4278, + "loss": 4.7529, + "mask_loss": 0.2572, + "step": 105, + "topk_loss": 0.0679 + }, + { + "epoch": 0.04213062730168982, + "grad_norm": 0.380859375, + "learning_rate": 0.00019974541805633324, + "lm_loss": 4.4459, + "loss": 4.8022, + "mask_loss": 0.2692, + "step": 106, + "topk_loss": 0.0871 + }, + { + "epoch": 0.04252808604981897, + "grad_norm": 0.33984375, + "learning_rate": 0.0001997362486981749, + "lm_loss": 4.4027, + "loss": 4.7272, + "mask_loss": 0.2551, + "step": 107, + "topk_loss": 0.0694 + }, + { + "epoch": 0.04292554479794812, + "grad_norm": 0.421875, + "learning_rate": 0.00019972691733857883, + "lm_loss": 4.3911, + "loss": 4.756, + "mask_loss": 0.2738, + "step": 108, + "topk_loss": 0.091 + }, + { + "epoch": 0.043323003546077266, + "grad_norm": 0.38671875, + "learning_rate": 0.00019971742399270195, + "lm_loss": 4.3224, + "loss": 4.6388, + "mask_loss": 0.251, + "step": 109, + "topk_loss": 0.0655 + }, + { + "epoch": 0.04372046229420642, + "grad_norm": 0.48046875, + "learning_rate": 0.0001997077686759643, + "lm_loss": 4.3758, + "loss": 4.7341, + "mask_loss": 0.2682, + "step": 110, + "topk_loss": 0.0901 + }, + { + "epoch": 0.04411792104233557, + "grad_norm": 0.3125, + "learning_rate": 0.000199697951404049, + "lm_loss": 4.2691, + "loss": 4.5957, + "mask_loss": 0.2554, + "step": 111, + "topk_loss": 0.0712 + }, + { + "epoch": 0.044515379790464715, + "grad_norm": 0.3671875, + "learning_rate": 0.00019968797219290226, + "lm_loss": 4.1656, + "loss": 4.4926, + "mask_loss": 0.2545, + "step": 112, + "topk_loss": 0.0725 + }, + { + "epoch": 0.04491283853859387, + "grad_norm": 0.50390625, + "learning_rate": 0.00019967783105873324, + "lm_loss": 4.275, + "loss": 4.6403, + "mask_loss": 0.2683, + "step": 113, + "topk_loss": 0.097 + }, + { + "epoch": 0.04531029728672301, + "grad_norm": 0.333984375, + "learning_rate": 0.00019966752801801416, + "lm_loss": 4.2514, + "loss": 4.579, + "mask_loss": 0.2551, + "step": 114, + "topk_loss": 0.0725 + }, + { + "epoch": 0.045707756034852165, + "grad_norm": 0.408203125, + "learning_rate": 0.00019965706308748028, + "lm_loss": 4.3101, + "loss": 4.6724, + "mask_loss": 0.2659, + "step": 115, + "topk_loss": 0.0964 + }, + { + "epoch": 0.04610521478298131, + "grad_norm": 0.396484375, + "learning_rate": 0.0001996464362841298, + "lm_loss": 4.2544, + "loss": 4.5976, + "mask_loss": 0.2575, + "step": 116, + "topk_loss": 0.0857 + }, + { + "epoch": 0.04650267353111046, + "grad_norm": 0.3203125, + "learning_rate": 0.00019963564762522372, + "lm_loss": 4.166, + "loss": 4.4859, + "mask_loss": 0.2479, + "step": 117, + "topk_loss": 0.072 + }, + { + "epoch": 0.046900132279239615, + "grad_norm": 0.328125, + "learning_rate": 0.00019962469712828614, + "lm_loss": 4.1945, + "loss": 4.5042, + "mask_loss": 0.2447, + "step": 118, + "topk_loss": 0.0651 + }, + { + "epoch": 0.04729759102736876, + "grad_norm": 0.3515625, + "learning_rate": 0.00019961358481110385, + "lm_loss": 4.2637, + "loss": 4.5898, + "mask_loss": 0.2511, + "step": 119, + "topk_loss": 0.0751 + }, + { + "epoch": 0.04769504977549791, + "grad_norm": 0.333984375, + "learning_rate": 0.0001996023106917267, + "lm_loss": 4.1716, + "loss": 4.4768, + "mask_loss": 0.2403, + "step": 120, + "topk_loss": 0.065 + }, + { + "epoch": 0.04809250852362706, + "grad_norm": 0.29296875, + "learning_rate": 0.00019959087478846707, + "lm_loss": 4.1371, + "loss": 4.4277, + "mask_loss": 0.2343, + "step": 121, + "topk_loss": 0.0564 + }, + { + "epoch": 0.04848996727175621, + "grad_norm": 0.478515625, + "learning_rate": 0.00019957927711990035, + "lm_loss": 4.1211, + "loss": 4.563, + "mask_loss": 0.3337, + "step": 122, + "topk_loss": 0.1081 + }, + { + "epoch": 0.048887426019885355, + "grad_norm": 0.322265625, + "learning_rate": 0.00019956751770486462, + "lm_loss": 4.0737, + "loss": 4.382, + "mask_loss": 0.2407, + "step": 123, + "topk_loss": 0.0676 + }, + { + "epoch": 0.04928488476801451, + "grad_norm": 0.287109375, + "learning_rate": 0.00019955559656246067, + "lm_loss": 3.9877, + "loss": 4.2723, + "mask_loss": 0.2326, + "step": 124, + "topk_loss": 0.052 + }, + { + "epoch": 0.04968234351614366, + "grad_norm": 0.341796875, + "learning_rate": 0.00019954351371205203, + "lm_loss": 3.9643, + "loss": 4.2797, + "mask_loss": 0.2451, + "step": 125, + "topk_loss": 0.0702 + }, + { + "epoch": 0.050079802264272805, + "grad_norm": 0.306640625, + "learning_rate": 0.00019953126917326478, + "lm_loss": 4.0202, + "loss": 4.3116, + "mask_loss": 0.2359, + "step": 126, + "topk_loss": 0.0555 + }, + { + "epoch": 0.05047726101240196, + "grad_norm": 0.326171875, + "learning_rate": 0.0001995188629659878, + "lm_loss": 4.0063, + "loss": 4.3179, + "mask_loss": 0.245, + "step": 127, + "topk_loss": 0.0665 + }, + { + "epoch": 0.0508747197605311, + "grad_norm": 0.359375, + "learning_rate": 0.00019950629511037237, + "lm_loss": 3.9746, + "loss": 4.2866, + "mask_loss": 0.2411, + "step": 128, + "topk_loss": 0.0708 + }, + { + "epoch": 0.051272178508660254, + "grad_norm": 0.310546875, + "learning_rate": 0.00019949356562683256, + "lm_loss": 3.9167, + "loss": 4.2184, + "mask_loss": 0.2379, + "step": 129, + "topk_loss": 0.0637 + }, + { + "epoch": 0.0516696372567894, + "grad_norm": 0.30078125, + "learning_rate": 0.00019948067453604473, + "lm_loss": 3.9231, + "loss": 4.2166, + "mask_loss": 0.2347, + "step": 130, + "topk_loss": 0.0588 + }, + { + "epoch": 0.05206709600491855, + "grad_norm": 0.310546875, + "learning_rate": 0.00019946762185894793, + "lm_loss": 3.835, + "loss": 4.1341, + "mask_loss": 0.2372, + "step": 131, + "topk_loss": 0.062 + }, + { + "epoch": 0.052464554753047704, + "grad_norm": 0.3125, + "learning_rate": 0.00019945440761674359, + "lm_loss": 3.9297, + "loss": 4.228, + "mask_loss": 0.2332, + "step": 132, + "topk_loss": 0.0652 + }, + { + "epoch": 0.05286201350117685, + "grad_norm": 0.3125, + "learning_rate": 0.00019944103183089558, + "lm_loss": 3.9599, + "loss": 4.2613, + "mask_loss": 0.2367, + "step": 133, + "topk_loss": 0.0647 + }, + { + "epoch": 0.053259472249306, + "grad_norm": 0.2734375, + "learning_rate": 0.00019942749452313017, + "lm_loss": 3.8308, + "loss": 4.1095, + "mask_loss": 0.2253, + "step": 134, + "topk_loss": 0.0534 + }, + { + "epoch": 0.05365693099743515, + "grad_norm": 0.3046875, + "learning_rate": 0.00019941379571543596, + "lm_loss": 3.8412, + "loss": 4.1291, + "mask_loss": 0.2316, + "step": 135, + "topk_loss": 0.0563 + }, + { + "epoch": 0.0540543897455643, + "grad_norm": 0.294921875, + "learning_rate": 0.00019939993543006395, + "lm_loss": 3.8049, + "loss": 4.0803, + "mask_loss": 0.2271, + "step": 136, + "topk_loss": 0.0483 + }, + { + "epoch": 0.054451848493693444, + "grad_norm": 0.3984375, + "learning_rate": 0.0001993859136895274, + "lm_loss": 3.8622, + "loss": 4.1821, + "mask_loss": 0.2446, + "step": 137, + "topk_loss": 0.0753 + }, + { + "epoch": 0.0548493072418226, + "grad_norm": 0.2890625, + "learning_rate": 0.00019937173051660172, + "lm_loss": 3.8942, + "loss": 4.1659, + "mask_loss": 0.2219, + "step": 138, + "topk_loss": 0.0498 + }, + { + "epoch": 0.05524676598995175, + "grad_norm": 0.283203125, + "learning_rate": 0.00019935738593432464, + "lm_loss": 3.7593, + "loss": 4.0358, + "mask_loss": 0.2235, + "step": 139, + "topk_loss": 0.053 + }, + { + "epoch": 0.055644224738080894, + "grad_norm": 0.2890625, + "learning_rate": 0.00019934287996599607, + "lm_loss": 3.775, + "loss": 4.0379, + "mask_loss": 0.2182, + "step": 140, + "topk_loss": 0.0447 + }, + { + "epoch": 0.056041683486210046, + "grad_norm": 0.279296875, + "learning_rate": 0.00019932821263517805, + "lm_loss": 3.7231, + "loss": 3.9992, + "mask_loss": 0.2219, + "step": 141, + "topk_loss": 0.0542 + }, + { + "epoch": 0.05643914223433919, + "grad_norm": 0.28125, + "learning_rate": 0.00019931338396569467, + "lm_loss": 3.7311, + "loss": 3.9954, + "mask_loss": 0.2175, + "step": 142, + "topk_loss": 0.0469 + }, + { + "epoch": 0.056836600982468344, + "grad_norm": 0.3203125, + "learning_rate": 0.00019929839398163214, + "lm_loss": 3.6278, + "loss": 3.9166, + "mask_loss": 0.2331, + "step": 143, + "topk_loss": 0.0557 + }, + { + "epoch": 0.05723405973059749, + "grad_norm": 0.314453125, + "learning_rate": 0.00019928324270733862, + "lm_loss": 3.6726, + "loss": 3.9472, + "mask_loss": 0.2248, + "step": 144, + "topk_loss": 0.0499 + }, + { + "epoch": 0.05763151847872664, + "grad_norm": 0.2890625, + "learning_rate": 0.00019926793016742435, + "lm_loss": 3.6402, + "loss": 3.9064, + "mask_loss": 0.2184, + "step": 145, + "topk_loss": 0.0478 + }, + { + "epoch": 0.058028977226855794, + "grad_norm": 0.283203125, + "learning_rate": 0.00019925245638676144, + "lm_loss": 3.6263, + "loss": 3.8756, + "mask_loss": 0.2114, + "step": 146, + "topk_loss": 0.0378 + }, + { + "epoch": 0.05842643597498494, + "grad_norm": 0.310546875, + "learning_rate": 0.00019923682139048396, + "lm_loss": 3.61, + "loss": 3.8891, + "mask_loss": 0.2262, + "step": 147, + "topk_loss": 0.0528 + }, + { + "epoch": 0.05882389472311409, + "grad_norm": 0.33984375, + "learning_rate": 0.00019922102520398776, + "lm_loss": 3.6744, + "loss": 3.9671, + "mask_loss": 0.2268, + "step": 148, + "topk_loss": 0.066 + }, + { + "epoch": 0.059221353471243236, + "grad_norm": 0.359375, + "learning_rate": 0.0001992050678529306, + "lm_loss": 3.6987, + "loss": 4.0077, + "mask_loss": 0.2327, + "step": 149, + "topk_loss": 0.0763 + }, + { + "epoch": 0.05961881221937239, + "grad_norm": 0.30859375, + "learning_rate": 0.00019918894936323195, + "lm_loss": 3.6849, + "loss": 3.9491, + "mask_loss": 0.2151, + "step": 150, + "topk_loss": 0.0491 + }, + { + "epoch": 0.05961881221937239, + "eval_lm_loss": 722.8411254882812, + "eval_loss": 723.1029663085938, + "eval_mask_hit_rate": 0.3664598762989044, + "eval_mask_loss": 0.20979231595993042, + "eval_mask_top_10_hit_rate": 0.8763496279716492, + "eval_mask_top_1_hit_rate": 0.9443230628967285, + "eval_mask_top_20_hit_rate": 0.8418170809745789, + "eval_mask_top_5_hit_rate": 0.9021064639091492, + "eval_runtime": 144.2832, + "eval_samples_per_second": 14.194, + "eval_steps_per_second": 7.097, + "eval_token_accuracy": 0.44723033905029297, + "eval_top_k_diff": -527.2366943359375, + "eval_topk_loss": 0.05206891894340515, + "step": 150 + }, + { + "epoch": 0.060016270967501534, + "grad_norm": 0.302734375, + "learning_rate": 0.00019917266976107308, + "lm_loss": 3.6228, + "loss": 3.8777, + "mask_loss": 0.2122, + "step": 151, + "topk_loss": 0.0426 + }, + { + "epoch": 0.060413729715630686, + "grad_norm": 0.275390625, + "learning_rate": 0.00019915622907289694, + "lm_loss": 3.5653, + "loss": 3.8254, + "mask_loss": 0.2136, + "step": 152, + "topk_loss": 0.0465 + }, + { + "epoch": 0.06081118846375984, + "grad_norm": 0.322265625, + "learning_rate": 0.00019913962732540807, + "lm_loss": 3.5934, + "loss": 3.8731, + "mask_loss": 0.2234, + "step": 153, + "topk_loss": 0.0563 + }, + { + "epoch": 0.061208647211888983, + "grad_norm": 0.28125, + "learning_rate": 0.00019912286454557267, + "lm_loss": 3.5041, + "loss": 3.7657, + "mask_loss": 0.216, + "step": 154, + "topk_loss": 0.0455 + }, + { + "epoch": 0.061606105960018136, + "grad_norm": 0.263671875, + "learning_rate": 0.00019910594076061853, + "lm_loss": 3.5847, + "loss": 3.8352, + "mask_loss": 0.2104, + "step": 155, + "topk_loss": 0.0401 + }, + { + "epoch": 0.06200356470814728, + "grad_norm": 0.248046875, + "learning_rate": 0.0001990888559980349, + "lm_loss": 3.4814, + "loss": 3.7271, + "mask_loss": 0.2087, + "step": 156, + "topk_loss": 0.037 + }, + { + "epoch": 0.06240102345627643, + "grad_norm": 0.287109375, + "learning_rate": 0.00019907161028557253, + "lm_loss": 3.4867, + "loss": 3.7611, + "mask_loss": 0.2215, + "step": 157, + "topk_loss": 0.0529 + }, + { + "epoch": 0.06279848220440558, + "grad_norm": 0.298828125, + "learning_rate": 0.00019905420365124362, + "lm_loss": 3.4163, + "loss": 3.6892, + "mask_loss": 0.2213, + "step": 158, + "topk_loss": 0.0516 + }, + { + "epoch": 0.06319594095253474, + "grad_norm": 0.3046875, + "learning_rate": 0.00019903663612332175, + "lm_loss": 3.495, + "loss": 3.7624, + "mask_loss": 0.2175, + "step": 159, + "topk_loss": 0.0499 + }, + { + "epoch": 0.06359339970066388, + "grad_norm": 0.26953125, + "learning_rate": 0.0001990189077303418, + "lm_loss": 3.4781, + "loss": 3.7132, + "mask_loss": 0.2029, + "step": 160, + "topk_loss": 0.0321 + }, + { + "epoch": 0.06399085844879303, + "grad_norm": 0.251953125, + "learning_rate": 0.00019900101850109999, + "lm_loss": 3.4428, + "loss": 3.6969, + "mask_loss": 0.2104, + "step": 161, + "topk_loss": 0.0436 + }, + { + "epoch": 0.06438831719692217, + "grad_norm": 0.275390625, + "learning_rate": 0.00019898296846465377, + "lm_loss": 3.3954, + "loss": 3.6293, + "mask_loss": 0.2024, + "step": 162, + "topk_loss": 0.0315 + }, + { + "epoch": 0.06478577594505133, + "grad_norm": 0.251953125, + "learning_rate": 0.00019896475765032175, + "lm_loss": 3.5172, + "loss": 3.7485, + "mask_loss": 0.1987, + "step": 163, + "topk_loss": 0.0325 + }, + { + "epoch": 0.06518323469318048, + "grad_norm": 0.27734375, + "learning_rate": 0.0001989463860876838, + "lm_loss": 3.4891, + "loss": 3.7279, + "mask_loss": 0.2038, + "step": 164, + "topk_loss": 0.035 + }, + { + "epoch": 0.06558069344130962, + "grad_norm": 0.263671875, + "learning_rate": 0.00019892785380658078, + "lm_loss": 3.3805, + "loss": 3.6412, + "mask_loss": 0.2121, + "step": 165, + "topk_loss": 0.0487 + }, + { + "epoch": 0.06597815218943878, + "grad_norm": 0.27734375, + "learning_rate": 0.0001989091608371146, + "lm_loss": 3.4758, + "loss": 3.72, + "mask_loss": 0.2043, + "step": 166, + "topk_loss": 0.0399 + }, + { + "epoch": 0.06637561093756793, + "grad_norm": 0.2890625, + "learning_rate": 0.0001988903072096483, + "lm_loss": 3.4014, + "loss": 3.6362, + "mask_loss": 0.2012, + "step": 167, + "topk_loss": 0.0337 + }, + { + "epoch": 0.06677306968569707, + "grad_norm": 0.2333984375, + "learning_rate": 0.00019887129295480577, + "lm_loss": 3.3535, + "loss": 3.5857, + "mask_loss": 0.2002, + "step": 168, + "topk_loss": 0.0321 + }, + { + "epoch": 0.06717052843382622, + "grad_norm": 0.251953125, + "learning_rate": 0.00019885211810347184, + "lm_loss": 3.348, + "loss": 3.5855, + "mask_loss": 0.2034, + "step": 169, + "topk_loss": 0.0341 + }, + { + "epoch": 0.06756798718195538, + "grad_norm": 0.26171875, + "learning_rate": 0.00019883278268679216, + "lm_loss": 3.326, + "loss": 3.5556, + "mask_loss": 0.1982, + "step": 170, + "topk_loss": 0.0314 + }, + { + "epoch": 0.06796544593008452, + "grad_norm": 0.271484375, + "learning_rate": 0.00019881328673617327, + "lm_loss": 3.3377, + "loss": 3.5755, + "mask_loss": 0.2037, + "step": 171, + "topk_loss": 0.0342 + }, + { + "epoch": 0.06836290467821367, + "grad_norm": 0.318359375, + "learning_rate": 0.00019879363028328236, + "lm_loss": 3.3258, + "loss": 3.5867, + "mask_loss": 0.2129, + "step": 172, + "topk_loss": 0.048 + }, + { + "epoch": 0.06876036342634283, + "grad_norm": 0.27734375, + "learning_rate": 0.0001987738133600474, + "lm_loss": 3.2987, + "loss": 3.5528, + "mask_loss": 0.2083, + "step": 173, + "topk_loss": 0.0458 + }, + { + "epoch": 0.06915782217447197, + "grad_norm": 0.30078125, + "learning_rate": 0.000198753835998657, + "lm_loss": 3.2923, + "loss": 3.5251, + "mask_loss": 0.1979, + "step": 174, + "topk_loss": 0.0349 + }, + { + "epoch": 0.06955528092260112, + "grad_norm": 0.283203125, + "learning_rate": 0.00019873369823156036, + "lm_loss": 3.3049, + "loss": 3.5294, + "mask_loss": 0.1948, + "step": 175, + "topk_loss": 0.0298 + }, + { + "epoch": 0.06995273967073026, + "grad_norm": 0.232421875, + "learning_rate": 0.0001987134000914672, + "lm_loss": 3.2683, + "loss": 3.5029, + "mask_loss": 0.1999, + "step": 176, + "topk_loss": 0.0348 + }, + { + "epoch": 0.07035019841885942, + "grad_norm": 0.26171875, + "learning_rate": 0.00019869294161134777, + "lm_loss": 3.2072, + "loss": 3.4514, + "mask_loss": 0.2033, + "step": 177, + "topk_loss": 0.0409 + }, + { + "epoch": 0.07074765716698857, + "grad_norm": 0.265625, + "learning_rate": 0.00019867232282443277, + "lm_loss": 3.3398, + "loss": 3.5819, + "mask_loss": 0.2012, + "step": 178, + "topk_loss": 0.0409 + }, + { + "epoch": 0.07114511591511771, + "grad_norm": 0.275390625, + "learning_rate": 0.00019865154376421323, + "lm_loss": 3.2698, + "loss": 3.5008, + "mask_loss": 0.1976, + "step": 179, + "topk_loss": 0.0335 + }, + { + "epoch": 0.07154257466324687, + "grad_norm": 0.28125, + "learning_rate": 0.00019863060446444054, + "lm_loss": 3.19, + "loss": 3.4156, + "mask_loss": 0.1943, + "step": 180, + "topk_loss": 0.0312 + }, + { + "epoch": 0.07194003341137602, + "grad_norm": 0.26953125, + "learning_rate": 0.00019860950495912642, + "lm_loss": 3.2296, + "loss": 3.4531, + "mask_loss": 0.1938, + "step": 181, + "topk_loss": 0.0298 + }, + { + "epoch": 0.07233749215950516, + "grad_norm": 0.2734375, + "learning_rate": 0.0001985882452825427, + "lm_loss": 3.2934, + "loss": 3.5372, + "mask_loss": 0.2014, + "step": 182, + "topk_loss": 0.0424 + }, + { + "epoch": 0.07273495090763431, + "grad_norm": 0.279296875, + "learning_rate": 0.00019856682546922155, + "lm_loss": 3.2163, + "loss": 3.4633, + "mask_loss": 0.2032, + "step": 183, + "topk_loss": 0.0438 + }, + { + "epoch": 0.07313240965576347, + "grad_norm": 0.27734375, + "learning_rate": 0.00019854524555395502, + "lm_loss": 3.1944, + "loss": 3.4276, + "mask_loss": 0.1967, + "step": 184, + "topk_loss": 0.0365 + }, + { + "epoch": 0.07352986840389261, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019852350557179538, + "lm_loss": 3.1588, + "loss": 3.3794, + "mask_loss": 0.1912, + "step": 185, + "topk_loss": 0.0295 + }, + { + "epoch": 0.07392732715202176, + "grad_norm": 0.25390625, + "learning_rate": 0.00019850160555805486, + "lm_loss": 3.196, + "loss": 3.4323, + "mask_loss": 0.1969, + "step": 186, + "topk_loss": 0.0393 + }, + { + "epoch": 0.07432478590015092, + "grad_norm": 0.24609375, + "learning_rate": 0.00019847954554830558, + "lm_loss": 3.1731, + "loss": 3.3968, + "mask_loss": 0.1939, + "step": 187, + "topk_loss": 0.0298 + }, + { + "epoch": 0.07472224464828006, + "grad_norm": 0.24609375, + "learning_rate": 0.00019845732557837966, + "lm_loss": 3.186, + "loss": 3.4056, + "mask_loss": 0.1901, + "step": 188, + "topk_loss": 0.0295 + }, + { + "epoch": 0.07511970339640921, + "grad_norm": 0.2431640625, + "learning_rate": 0.0001984349456843689, + "lm_loss": 3.1528, + "loss": 3.3835, + "mask_loss": 0.1955, + "step": 189, + "topk_loss": 0.0352 + }, + { + "epoch": 0.07551716214453835, + "grad_norm": 0.26171875, + "learning_rate": 0.00019841240590262493, + "lm_loss": 3.131, + "loss": 3.3606, + "mask_loss": 0.1956, + "step": 190, + "topk_loss": 0.034 + }, + { + "epoch": 0.07591462089266751, + "grad_norm": 0.234375, + "learning_rate": 0.0001983897062697591, + "lm_loss": 3.213, + "loss": 3.4263, + "mask_loss": 0.1881, + "step": 191, + "topk_loss": 0.0252 + }, + { + "epoch": 0.07631207964079666, + "grad_norm": 0.24609375, + "learning_rate": 0.00019836684682264242, + "lm_loss": 3.1288, + "loss": 3.3551, + "mask_loss": 0.192, + "step": 192, + "topk_loss": 0.0343 + }, + { + "epoch": 0.0767095383889258, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019834382759840538, + "lm_loss": 3.1637, + "loss": 3.3819, + "mask_loss": 0.1889, + "step": 193, + "topk_loss": 0.0293 + }, + { + "epoch": 0.07710699713705496, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001983206486344381, + "lm_loss": 3.0662, + "loss": 3.2871, + "mask_loss": 0.1928, + "step": 194, + "topk_loss": 0.0282 + }, + { + "epoch": 0.0775044558851841, + "grad_norm": 0.251953125, + "learning_rate": 0.0001982973099683902, + "lm_loss": 3.0626, + "loss": 3.2906, + "mask_loss": 0.1945, + "step": 195, + "topk_loss": 0.0336 + }, + { + "epoch": 0.07790191463331325, + "grad_norm": 0.2412109375, + "learning_rate": 0.00019827381163817055, + "lm_loss": 3.0142, + "loss": 3.2344, + "mask_loss": 0.1899, + "step": 196, + "topk_loss": 0.0303 + }, + { + "epoch": 0.0782993733814424, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019825015368194748, + "lm_loss": 3.0554, + "loss": 3.2809, + "mask_loss": 0.1919, + "step": 197, + "topk_loss": 0.0335 + }, + { + "epoch": 0.07869683212957156, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019822633613814862, + "lm_loss": 3.0914, + "loss": 3.304, + "mask_loss": 0.1855, + "step": 198, + "topk_loss": 0.0271 + }, + { + "epoch": 0.0790942908777007, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001982023590454607, + "lm_loss": 3.0515, + "loss": 3.2819, + "mask_loss": 0.1956, + "step": 199, + "topk_loss": 0.0348 + }, + { + "epoch": 0.07949174962582985, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019817822244282973, + "lm_loss": 3.0209, + "loss": 3.2367, + "mask_loss": 0.1891, + "step": 200, + "topk_loss": 0.0267 + }, + { + "epoch": 0.07949174962582985, + "eval_lm_loss": 729.3817749023438, + "eval_loss": 729.6011962890625, + "eval_mask_hit_rate": 0.40478789806365967, + "eval_mask_loss": 0.18504393100738525, + "eval_mask_top_10_hit_rate": 0.9154680371284485, + "eval_mask_top_1_hit_rate": 0.9666690826416016, + "eval_mask_top_20_hit_rate": 0.8877524137496948, + "eval_mask_top_5_hit_rate": 0.9352514743804932, + "eval_runtime": 144.7547, + "eval_samples_per_second": 14.148, + "eval_steps_per_second": 7.074, + "eval_token_accuracy": 0.4949284791946411, + "eval_top_k_diff": -554.7236328125, + "eval_topk_loss": 0.03440730273723602, + "step": 200 + }, + { + "epoch": 0.079889208373959, + "grad_norm": 0.29296875, + "learning_rate": 0.00019815392636946073, + "lm_loss": 3.0973, + "loss": 3.3482, + "mask_loss": 0.2018, + "step": 201, + "topk_loss": 0.0491 + }, + { + "epoch": 0.08028666712208815, + "grad_norm": 0.29296875, + "learning_rate": 0.0001981294708648178, + "lm_loss": 2.9858, + "loss": 3.197, + "mask_loss": 0.1852, + "step": 202, + "topk_loss": 0.026 + }, + { + "epoch": 0.0806841258702173, + "grad_norm": 0.263671875, + "learning_rate": 0.00019810485596862392, + "lm_loss": 3.0715, + "loss": 3.298, + "mask_loss": 0.1895, + "step": 203, + "topk_loss": 0.037 + }, + { + "epoch": 0.08108158461834644, + "grad_norm": 0.232421875, + "learning_rate": 0.0001980800817208611, + "lm_loss": 3.0564, + "loss": 3.2737, + "mask_loss": 0.188, + "step": 204, + "topk_loss": 0.0293 + }, + { + "epoch": 0.0814790433664756, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019805514816177006, + "lm_loss": 3.062, + "loss": 3.2838, + "mask_loss": 0.1892, + "step": 205, + "topk_loss": 0.0326 + }, + { + "epoch": 0.08187650211460475, + "grad_norm": 0.22265625, + "learning_rate": 0.00019803005533185038, + "lm_loss": 3.0045, + "loss": 3.2168, + "mask_loss": 0.1863, + "step": 206, + "topk_loss": 0.0261 + }, + { + "epoch": 0.08227396086273389, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001980048032718603, + "lm_loss": 3.0125, + "loss": 3.2284, + "mask_loss": 0.1866, + "step": 207, + "topk_loss": 0.0293 + }, + { + "epoch": 0.08267141961086305, + "grad_norm": 0.2265625, + "learning_rate": 0.00019797939202281664, + "lm_loss": 2.9925, + "loss": 3.2016, + "mask_loss": 0.1839, + "step": 208, + "topk_loss": 0.0252 + }, + { + "epoch": 0.0830688783589922, + "grad_norm": 0.2275390625, + "learning_rate": 0.00019795382162599495, + "lm_loss": 2.9526, + "loss": 3.1611, + "mask_loss": 0.1821, + "step": 209, + "topk_loss": 0.0264 + }, + { + "epoch": 0.08346633710712134, + "grad_norm": 0.236328125, + "learning_rate": 0.00019792809212292912, + "lm_loss": 3.0184, + "loss": 3.2271, + "mask_loss": 0.1825, + "step": 210, + "topk_loss": 0.0263 + }, + { + "epoch": 0.08386379585525049, + "grad_norm": 0.20703125, + "learning_rate": 0.0001979022035554116, + "lm_loss": 2.987, + "loss": 3.201, + "mask_loss": 0.1875, + "step": 211, + "topk_loss": 0.0264 + }, + { + "epoch": 0.08426125460337965, + "grad_norm": 0.24609375, + "learning_rate": 0.00019787615596549308, + "lm_loss": 2.9766, + "loss": 3.1854, + "mask_loss": 0.183, + "step": 212, + "topk_loss": 0.0257 + }, + { + "epoch": 0.08465871335150879, + "grad_norm": 0.21484375, + "learning_rate": 0.00019784994939548266, + "lm_loss": 2.9765, + "loss": 3.1829, + "mask_loss": 0.1792, + "step": 213, + "topk_loss": 0.0272 + }, + { + "epoch": 0.08505617209963794, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019782358388794767, + "lm_loss": 2.9522, + "loss": 3.1674, + "mask_loss": 0.1847, + "step": 214, + "topk_loss": 0.0305 + }, + { + "epoch": 0.0854536308477671, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019779705948571346, + "lm_loss": 3.0139, + "loss": 3.2169, + "mask_loss": 0.1792, + "step": 215, + "topk_loss": 0.0239 + }, + { + "epoch": 0.08585108959589624, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001977703762318637, + "lm_loss": 2.8918, + "loss": 3.0996, + "mask_loss": 0.1829, + "step": 216, + "topk_loss": 0.0249 + }, + { + "epoch": 0.08624854834402539, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001977435341697399, + "lm_loss": 2.9047, + "loss": 3.1255, + "mask_loss": 0.1885, + "step": 217, + "topk_loss": 0.0323 + }, + { + "epoch": 0.08664600709215453, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019771653334294152, + "lm_loss": 2.9129, + "loss": 3.1139, + "mask_loss": 0.179, + "step": 218, + "topk_loss": 0.022 + }, + { + "epoch": 0.08704346584028369, + "grad_norm": 0.201171875, + "learning_rate": 0.00019768937379532604, + "lm_loss": 2.8712, + "loss": 3.075, + "mask_loss": 0.1812, + "step": 219, + "topk_loss": 0.0225 + }, + { + "epoch": 0.08744092458841284, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019766205557100868, + "lm_loss": 2.9447, + "loss": 3.1567, + "mask_loss": 0.1842, + "step": 220, + "topk_loss": 0.0278 + }, + { + "epoch": 0.08783838333654198, + "grad_norm": 0.224609375, + "learning_rate": 0.00019763457871436235, + "lm_loss": 2.9517, + "loss": 3.1604, + "mask_loss": 0.181, + "step": 221, + "topk_loss": 0.0276 + }, + { + "epoch": 0.08823584208467114, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019760694327001767, + "lm_loss": 2.9037, + "loss": 3.1047, + "mask_loss": 0.179, + "step": 222, + "topk_loss": 0.022 + }, + { + "epoch": 0.08863330083280029, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019757914928286287, + "lm_loss": 2.8908, + "loss": 3.0943, + "mask_loss": 0.1776, + "step": 223, + "topk_loss": 0.0259 + }, + { + "epoch": 0.08903075958092943, + "grad_norm": 0.212890625, + "learning_rate": 0.00019755119679804367, + "lm_loss": 2.8657, + "loss": 3.066, + "mask_loss": 0.1779, + "step": 224, + "topk_loss": 0.0223 + }, + { + "epoch": 0.08942821832905858, + "grad_norm": 0.21484375, + "learning_rate": 0.00019752308586096326, + "lm_loss": 2.9197, + "loss": 3.1269, + "mask_loss": 0.1807, + "step": 225, + "topk_loss": 0.0265 + }, + { + "epoch": 0.08982567707718773, + "grad_norm": 0.203125, + "learning_rate": 0.00019749481651728216, + "lm_loss": 2.9023, + "loss": 3.1059, + "mask_loss": 0.1783, + "step": 226, + "topk_loss": 0.0253 + }, + { + "epoch": 0.09022313582531688, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019746638881291829, + "lm_loss": 2.9175, + "loss": 3.1209, + "mask_loss": 0.1786, + "step": 227, + "topk_loss": 0.0248 + }, + { + "epoch": 0.09062059457344603, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001974378027940466, + "lm_loss": 2.8023, + "loss": 3.0019, + "mask_loss": 0.1784, + "step": 228, + "topk_loss": 0.0212 + }, + { + "epoch": 0.09101805332157518, + "grad_norm": 0.205078125, + "learning_rate": 0.00019740905850709948, + "lm_loss": 2.8686, + "loss": 3.0719, + "mask_loss": 0.1806, + "step": 229, + "topk_loss": 0.0226 + }, + { + "epoch": 0.09141551206970433, + "grad_norm": 0.205078125, + "learning_rate": 0.0001973801559987661, + "lm_loss": 2.8833, + "loss": 3.0931, + "mask_loss": 0.1829, + "step": 230, + "topk_loss": 0.0269 + }, + { + "epoch": 0.09181297081783348, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019735109531599285, + "lm_loss": 2.8533, + "loss": 3.057, + "mask_loss": 0.1784, + "step": 231, + "topk_loss": 0.0253 + }, + { + "epoch": 0.09221042956596262, + "grad_norm": 0.220703125, + "learning_rate": 0.0001973218765059829, + "lm_loss": 2.7831, + "loss": 2.9885, + "mask_loss": 0.1791, + "step": 232, + "topk_loss": 0.0263 + }, + { + "epoch": 0.09260788831409178, + "grad_norm": 0.21875, + "learning_rate": 0.00019729249961619635, + "lm_loss": 2.8501, + "loss": 3.0536, + "mask_loss": 0.1774, + "step": 233, + "topk_loss": 0.0261 + }, + { + "epoch": 0.09300534706222092, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019726296469435, + "lm_loss": 2.9035, + "loss": 3.1098, + "mask_loss": 0.1781, + "step": 234, + "topk_loss": 0.0282 + }, + { + "epoch": 0.09340280581035007, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019723327178841743, + "lm_loss": 2.7768, + "loss": 2.9769, + "mask_loss": 0.1772, + "step": 235, + "topk_loss": 0.0229 + }, + { + "epoch": 0.09380026455847923, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001972034209466287, + "lm_loss": 2.7848, + "loss": 2.9854, + "mask_loss": 0.1756, + "step": 236, + "topk_loss": 0.0249 + }, + { + "epoch": 0.09419772330660837, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019717341221747056, + "lm_loss": 2.9198, + "loss": 3.1178, + "mask_loss": 0.1749, + "step": 237, + "topk_loss": 0.0232 + }, + { + "epoch": 0.09459518205473752, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019714324564968613, + "lm_loss": 2.768, + "loss": 2.9666, + "mask_loss": 0.1745, + "step": 238, + "topk_loss": 0.024 + }, + { + "epoch": 0.09499264080286667, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001971129212922749, + "lm_loss": 2.7249, + "loss": 2.9168, + "mask_loss": 0.1725, + "step": 239, + "topk_loss": 0.0195 + }, + { + "epoch": 0.09539009955099582, + "grad_norm": 0.1982421875, + "learning_rate": 0.0001970824391944927, + "lm_loss": 2.802, + "loss": 2.9983, + "mask_loss": 0.1741, + "step": 240, + "topk_loss": 0.0221 + }, + { + "epoch": 0.09578755829912497, + "grad_norm": 0.244140625, + "learning_rate": 0.00019705179940585155, + "lm_loss": 2.8494, + "loss": 3.0489, + "mask_loss": 0.1753, + "step": 241, + "topk_loss": 0.0241 + }, + { + "epoch": 0.09618501704725411, + "grad_norm": 0.220703125, + "learning_rate": 0.00019702100197611962, + "lm_loss": 2.8158, + "loss": 3.0133, + "mask_loss": 0.174, + "step": 242, + "topk_loss": 0.0234 + }, + { + "epoch": 0.09658247579538327, + "grad_norm": 0.26171875, + "learning_rate": 0.0001969900469553211, + "lm_loss": 2.8639, + "loss": 3.086, + "mask_loss": 0.1847, + "step": 243, + "topk_loss": 0.0374 + }, + { + "epoch": 0.09697993454351242, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019695893439373622, + "lm_loss": 2.8073, + "loss": 3.0075, + "mask_loss": 0.1754, + "step": 244, + "topk_loss": 0.0248 + }, + { + "epoch": 0.09737739329164156, + "grad_norm": 0.24609375, + "learning_rate": 0.00019692766434190105, + "lm_loss": 2.7724, + "loss": 2.9628, + "mask_loss": 0.1706, + "step": 245, + "topk_loss": 0.0199 + }, + { + "epoch": 0.09777485203977071, + "grad_norm": 0.224609375, + "learning_rate": 0.00019689623685060744, + "lm_loss": 2.7779, + "loss": 2.9715, + "mask_loss": 0.1706, + "step": 246, + "topk_loss": 0.0231 + }, + { + "epoch": 0.09817231078789987, + "grad_norm": 0.2373046875, + "learning_rate": 0.0001968646519709031, + "lm_loss": 2.731, + "loss": 2.9322, + "mask_loss": 0.1761, + "step": 247, + "topk_loss": 0.0251 + }, + { + "epoch": 0.09856976953602901, + "grad_norm": 0.29296875, + "learning_rate": 0.00019683290975409126, + "lm_loss": 2.7375, + "loss": 2.9383, + "mask_loss": 0.1772, + "step": 248, + "topk_loss": 0.0236 + }, + { + "epoch": 0.09896722828415816, + "grad_norm": 0.224609375, + "learning_rate": 0.00019680101025173073, + "lm_loss": 2.7414, + "loss": 2.9383, + "mask_loss": 0.1749, + "step": 249, + "topk_loss": 0.022 + }, + { + "epoch": 0.09936468703228732, + "grad_norm": 0.21484375, + "learning_rate": 0.00019676895351563588, + "lm_loss": 2.6959, + "loss": 2.8897, + "mask_loss": 0.1725, + "step": 250, + "topk_loss": 0.0214 + }, + { + "epoch": 0.09936468703228732, + "eval_lm_loss": 725.5792236328125, + "eval_loss": 725.7776489257812, + "eval_mask_hit_rate": 0.42889782786369324, + "eval_mask_loss": 0.1703386902809143, + "eval_mask_top_10_hit_rate": 0.9372355937957764, + "eval_mask_top_1_hit_rate": 0.978151798248291, + "eval_mask_top_20_hit_rate": 0.9138137102127075, + "eval_mask_top_5_hit_rate": 0.9532917141914368, + "eval_runtime": 144.6138, + "eval_samples_per_second": 14.162, + "eval_steps_per_second": 7.081, + "eval_token_accuracy": 0.5259637832641602, + "eval_top_k_diff": -537.6810913085938, + "eval_topk_loss": 0.028092004358768463, + "step": 250 + }, + { + "epoch": 0.09976214578041646, + "grad_norm": 0.2578125, + "learning_rate": 0.00019673673959787639, + "lm_loss": 2.6879, + "loss": 2.9018, + "mask_loss": 0.1815, + "step": 251, + "topk_loss": 0.0324 + }, + { + "epoch": 0.10015960452854561, + "grad_norm": 0.314453125, + "learning_rate": 0.00019670436855077726, + "lm_loss": 2.7854, + "loss": 2.9788, + "mask_loss": 0.1692, + "step": 252, + "topk_loss": 0.0242 + }, + { + "epoch": 0.10055706327667475, + "grad_norm": 0.212890625, + "learning_rate": 0.00019667184042691875, + "lm_loss": 2.764, + "loss": 2.959, + "mask_loss": 0.1714, + "step": 253, + "topk_loss": 0.0235 + }, + { + "epoch": 0.10095452202480391, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019663915527913625, + "lm_loss": 2.787, + "loss": 2.9914, + "mask_loss": 0.1759, + "step": 254, + "topk_loss": 0.0285 + }, + { + "epoch": 0.10135198077293306, + "grad_norm": 0.2578125, + "learning_rate": 0.00019660631316052021, + "lm_loss": 2.7066, + "loss": 2.9002, + "mask_loss": 0.1721, + "step": 255, + "topk_loss": 0.0215 + }, + { + "epoch": 0.1017494395210622, + "grad_norm": 0.25390625, + "learning_rate": 0.00019657331412441598, + "lm_loss": 2.7055, + "loss": 2.901, + "mask_loss": 0.1719, + "step": 256, + "topk_loss": 0.0236 + }, + { + "epoch": 0.10214689826919136, + "grad_norm": 0.23046875, + "learning_rate": 0.0001965401582244239, + "lm_loss": 2.7156, + "loss": 2.9156, + "mask_loss": 0.1733, + "step": 257, + "topk_loss": 0.0267 + }, + { + "epoch": 0.10254435701732051, + "grad_norm": 0.22265625, + "learning_rate": 0.000196506845514399, + "lm_loss": 2.7108, + "loss": 2.902, + "mask_loss": 0.1704, + "step": 258, + "topk_loss": 0.0208 + }, + { + "epoch": 0.10294181576544965, + "grad_norm": 0.26953125, + "learning_rate": 0.00019647337604845107, + "lm_loss": 2.7099, + "loss": 2.9024, + "mask_loss": 0.1696, + "step": 259, + "topk_loss": 0.0229 + }, + { + "epoch": 0.1033392745135788, + "grad_norm": 0.236328125, + "learning_rate": 0.00019643974988094458, + "lm_loss": 2.6189, + "loss": 2.8054, + "mask_loss": 0.1682, + "step": 260, + "topk_loss": 0.0184 + }, + { + "epoch": 0.10373673326170796, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019640596706649841, + "lm_loss": 2.6854, + "loss": 2.884, + "mask_loss": 0.1734, + "step": 261, + "topk_loss": 0.0252 + }, + { + "epoch": 0.1041341920098371, + "grad_norm": 0.265625, + "learning_rate": 0.00019637202765998592, + "lm_loss": 2.6226, + "loss": 2.8346, + "mask_loss": 0.1796, + "step": 262, + "topk_loss": 0.0324 + }, + { + "epoch": 0.10453165075796625, + "grad_norm": 0.2138671875, + "learning_rate": 0.00019633793171653488, + "lm_loss": 2.6852, + "loss": 2.8731, + "mask_loss": 0.1679, + "step": 263, + "topk_loss": 0.02 + }, + { + "epoch": 0.10492910950609541, + "grad_norm": 0.2177734375, + "learning_rate": 0.00019630367929152724, + "lm_loss": 2.6046, + "loss": 2.8038, + "mask_loss": 0.1743, + "step": 264, + "topk_loss": 0.0249 + }, + { + "epoch": 0.10532656825422455, + "grad_norm": 0.2109375, + "learning_rate": 0.00019626927044059914, + "lm_loss": 2.6704, + "loss": 2.8629, + "mask_loss": 0.1709, + "step": 265, + "topk_loss": 0.0216 + }, + { + "epoch": 0.1057240270023537, + "grad_norm": 0.203125, + "learning_rate": 0.00019623470521964092, + "lm_loss": 2.6652, + "loss": 2.8514, + "mask_loss": 0.167, + "step": 266, + "topk_loss": 0.0193 + }, + { + "epoch": 0.10612148575048284, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019619998368479674, + "lm_loss": 2.5714, + "loss": 2.7607, + "mask_loss": 0.17, + "step": 267, + "topk_loss": 0.0192 + }, + { + "epoch": 0.106518944498612, + "grad_norm": 0.1875, + "learning_rate": 0.00019616510589246474, + "lm_loss": 2.6796, + "loss": 2.8766, + "mask_loss": 0.1737, + "step": 268, + "topk_loss": 0.0233 + }, + { + "epoch": 0.10691640324674115, + "grad_norm": 0.2294921875, + "learning_rate": 0.00019613007189929688, + "lm_loss": 2.6008, + "loss": 2.7886, + "mask_loss": 0.1672, + "step": 269, + "topk_loss": 0.0206 + }, + { + "epoch": 0.1073138619948703, + "grad_norm": 0.2421875, + "learning_rate": 0.00019609488176219886, + "lm_loss": 2.6117, + "loss": 2.807, + "mask_loss": 0.1725, + "step": 270, + "topk_loss": 0.0228 + }, + { + "epoch": 0.10771132074299945, + "grad_norm": 0.19921875, + "learning_rate": 0.00019605953553832988, + "lm_loss": 2.6278, + "loss": 2.8085, + "mask_loss": 0.1629, + "step": 271, + "topk_loss": 0.0179 + }, + { + "epoch": 0.1081087794911286, + "grad_norm": 0.20703125, + "learning_rate": 0.0001960240332851028, + "lm_loss": 2.6579, + "loss": 2.8498, + "mask_loss": 0.169, + "step": 272, + "topk_loss": 0.0229 + }, + { + "epoch": 0.10850623823925774, + "grad_norm": 0.2099609375, + "learning_rate": 0.00019598837506018391, + "lm_loss": 2.7048, + "loss": 2.8962, + "mask_loss": 0.1682, + "step": 273, + "topk_loss": 0.0232 + }, + { + "epoch": 0.10890369698738689, + "grad_norm": 0.22265625, + "learning_rate": 0.0001959525609214928, + "lm_loss": 2.6585, + "loss": 2.8469, + "mask_loss": 0.17, + "step": 274, + "topk_loss": 0.0184 + }, + { + "epoch": 0.10930115573551605, + "grad_norm": 0.197265625, + "learning_rate": 0.00019591659092720227, + "lm_loss": 2.6604, + "loss": 2.8479, + "mask_loss": 0.1682, + "step": 275, + "topk_loss": 0.0193 + }, + { + "epoch": 0.1096986144836452, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019588046513573839, + "lm_loss": 2.4789, + "loss": 2.6631, + "mask_loss": 0.167, + "step": 276, + "topk_loss": 0.0172 + }, + { + "epoch": 0.11009607323177434, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019584418360578016, + "lm_loss": 2.592, + "loss": 2.7834, + "mask_loss": 0.168, + "step": 277, + "topk_loss": 0.0234 + }, + { + "epoch": 0.1104935319799035, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019580774639625968, + "lm_loss": 2.586, + "loss": 2.7701, + "mask_loss": 0.1666, + "step": 278, + "topk_loss": 0.0174 + }, + { + "epoch": 0.11089099072803264, + "grad_norm": 0.171875, + "learning_rate": 0.00019577115356636182, + "lm_loss": 2.5704, + "loss": 2.7551, + "mask_loss": 0.1656, + "step": 279, + "topk_loss": 0.019 + }, + { + "epoch": 0.11128844947616179, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019573440517552427, + "lm_loss": 2.6662, + "loss": 2.8529, + "mask_loss": 0.1666, + "step": 280, + "topk_loss": 0.0201 + }, + { + "epoch": 0.11168590822429093, + "grad_norm": 0.205078125, + "learning_rate": 0.0001956975012834374, + "lm_loss": 2.5903, + "loss": 2.7714, + "mask_loss": 0.1629, + "step": 281, + "topk_loss": 0.0182 + }, + { + "epoch": 0.11208336697242009, + "grad_norm": 0.1826171875, + "learning_rate": 0.0001956604419500441, + "lm_loss": 2.5635, + "loss": 2.7467, + "mask_loss": 0.1665, + "step": 282, + "topk_loss": 0.0167 + }, + { + "epoch": 0.11248082572054924, + "grad_norm": 0.1875, + "learning_rate": 0.00019562322723553984, + "lm_loss": 2.5431, + "loss": 2.7258, + "mask_loss": 0.1658, + "step": 283, + "topk_loss": 0.017 + }, + { + "epoch": 0.11287828446867838, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019558585720037236, + "lm_loss": 2.635, + "loss": 2.8242, + "mask_loss": 0.166, + "step": 284, + "topk_loss": 0.0232 + }, + { + "epoch": 0.11327574321680754, + "grad_norm": 0.1875, + "learning_rate": 0.00019554833190524182, + "lm_loss": 2.57, + "loss": 2.757, + "mask_loss": 0.1669, + "step": 285, + "topk_loss": 0.0201 + }, + { + "epoch": 0.11367320196493669, + "grad_norm": 0.181640625, + "learning_rate": 0.00019551065141110047, + "lm_loss": 2.5794, + "loss": 2.7671, + "mask_loss": 0.1669, + "step": 286, + "topk_loss": 0.0208 + }, + { + "epoch": 0.11407066071306583, + "grad_norm": 0.205078125, + "learning_rate": 0.00019547281577915267, + "lm_loss": 2.5847, + "loss": 2.7863, + "mask_loss": 0.1732, + "step": 287, + "topk_loss": 0.0285 + }, + { + "epoch": 0.11446811946119498, + "grad_norm": 0.203125, + "learning_rate": 0.00019543482507085482, + "lm_loss": 2.4931, + "loss": 2.682, + "mask_loss": 0.1675, + "step": 288, + "topk_loss": 0.0213 + }, + { + "epoch": 0.11486557820932414, + "grad_norm": 0.185546875, + "learning_rate": 0.00019539667934791513, + "lm_loss": 2.5702, + "loss": 2.7578, + "mask_loss": 0.168, + "step": 289, + "topk_loss": 0.0197 + }, + { + "epoch": 0.11526303695745328, + "grad_norm": 0.18359375, + "learning_rate": 0.00019535837867229363, + "lm_loss": 2.614, + "loss": 2.7945, + "mask_loss": 0.1625, + "step": 290, + "topk_loss": 0.018 + }, + { + "epoch": 0.11566049570558243, + "grad_norm": 0.2578125, + "learning_rate": 0.0001953199231062021, + "lm_loss": 2.6642, + "loss": 2.8608, + "mask_loss": 0.1715, + "step": 291, + "topk_loss": 0.0251 + }, + { + "epoch": 0.11605795445371159, + "grad_norm": 0.2197265625, + "learning_rate": 0.00019528131271210383, + "lm_loss": 2.534, + "loss": 2.7202, + "mask_loss": 0.1677, + "step": 292, + "topk_loss": 0.0185 + }, + { + "epoch": 0.11645541320184073, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001952425475527136, + "lm_loss": 2.6175, + "loss": 2.7969, + "mask_loss": 0.1626, + "step": 293, + "topk_loss": 0.0168 + }, + { + "epoch": 0.11685287194996988, + "grad_norm": 0.2158203125, + "learning_rate": 0.00019520362769099764, + "lm_loss": 2.5311, + "loss": 2.7126, + "mask_loss": 0.1629, + "step": 294, + "topk_loss": 0.0186 + }, + { + "epoch": 0.11725033069809902, + "grad_norm": 0.251953125, + "learning_rate": 0.0001951645531901734, + "lm_loss": 2.542, + "loss": 2.7468, + "mask_loss": 0.1748, + "step": 295, + "topk_loss": 0.03 + }, + { + "epoch": 0.11764778944622818, + "grad_norm": 0.310546875, + "learning_rate": 0.00019512532411370954, + "lm_loss": 2.5719, + "loss": 2.756, + "mask_loss": 0.1638, + "step": 296, + "topk_loss": 0.0203 + }, + { + "epoch": 0.11804524819435733, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001950859405253258, + "lm_loss": 2.5602, + "loss": 2.7346, + "mask_loss": 0.1598, + "step": 297, + "topk_loss": 0.0146 + }, + { + "epoch": 0.11844270694248647, + "grad_norm": 0.24609375, + "learning_rate": 0.00019504640248899286, + "lm_loss": 2.5456, + "loss": 2.7311, + "mask_loss": 0.1654, + "step": 298, + "topk_loss": 0.0201 + }, + { + "epoch": 0.11884016569061563, + "grad_norm": 0.220703125, + "learning_rate": 0.0001950067100689323, + "lm_loss": 2.5376, + "loss": 2.7266, + "mask_loss": 0.1673, + "step": 299, + "topk_loss": 0.0216 + }, + { + "epoch": 0.11923762443874478, + "grad_norm": 0.201171875, + "learning_rate": 0.00019496686332961646, + "lm_loss": 2.5539, + "loss": 2.7356, + "mask_loss": 0.1625, + "step": 300, + "topk_loss": 0.0192 + }, + { + "epoch": 0.11923762443874478, + "eval_lm_loss": 725.0694580078125, + "eval_loss": 725.2548217773438, + "eval_mask_hit_rate": 0.4468006491661072, + "eval_mask_loss": 0.1619417518377304, + "eval_mask_top_10_hit_rate": 0.9498538970947266, + "eval_mask_top_1_hit_rate": 0.9841432571411133, + "eval_mask_top_20_hit_rate": 0.9292661547660828, + "eval_mask_top_5_hit_rate": 0.9635714888572693, + "eval_runtime": 144.0678, + "eval_samples_per_second": 14.216, + "eval_steps_per_second": 7.108, + "eval_token_accuracy": 0.5460589528083801, + "eval_top_k_diff": -557.624267578125, + "eval_topk_loss": 0.02342473529279232, + "step": 300 + }, + { + "epoch": 0.11963508318687392, + "grad_norm": 0.181640625, + "learning_rate": 0.00019492686233576833, + "lm_loss": 2.5287, + "loss": 2.7131, + "mask_loss": 0.165, + "step": 301, + "topk_loss": 0.0194 + }, + { + "epoch": 0.12003254193500307, + "grad_norm": 0.19140625, + "learning_rate": 0.0001948867071523615, + "lm_loss": 2.5342, + "loss": 2.7148, + "mask_loss": 0.1631, + "step": 302, + "topk_loss": 0.0175 + }, + { + "epoch": 0.12043000068313223, + "grad_norm": 0.203125, + "learning_rate": 0.00019484639784461994, + "lm_loss": 2.4958, + "loss": 2.6775, + "mask_loss": 0.1647, + "step": 303, + "topk_loss": 0.017 + }, + { + "epoch": 0.12082745943126137, + "grad_norm": 0.1826171875, + "learning_rate": 0.00019480593447801799, + "lm_loss": 2.489, + "loss": 2.6758, + "mask_loss": 0.166, + "step": 304, + "topk_loss": 0.0208 + }, + { + "epoch": 0.12122491817939052, + "grad_norm": 0.2216796875, + "learning_rate": 0.00019476531711828027, + "lm_loss": 2.512, + "loss": 2.705, + "mask_loss": 0.1678, + "step": 305, + "topk_loss": 0.0252 + }, + { + "epoch": 0.12162237692751968, + "grad_norm": 0.201171875, + "learning_rate": 0.00019472454583138144, + "lm_loss": 2.5055, + "loss": 2.6908, + "mask_loss": 0.164, + "step": 306, + "topk_loss": 0.0213 + }, + { + "epoch": 0.12201983567564882, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001946836206835463, + "lm_loss": 2.5713, + "loss": 2.7526, + "mask_loss": 0.162, + "step": 307, + "topk_loss": 0.0193 + }, + { + "epoch": 0.12241729442377797, + "grad_norm": 0.220703125, + "learning_rate": 0.0001946425417412495, + "lm_loss": 2.4884, + "loss": 2.6637, + "mask_loss": 0.1589, + "step": 308, + "topk_loss": 0.0164 + }, + { + "epoch": 0.12281475317190711, + "grad_norm": 0.25, + "learning_rate": 0.00019460130907121545, + "lm_loss": 2.5331, + "loss": 2.726, + "mask_loss": 0.1689, + "step": 309, + "topk_loss": 0.024 + }, + { + "epoch": 0.12321221192003627, + "grad_norm": 0.255859375, + "learning_rate": 0.00019455992274041835, + "lm_loss": 2.547, + "loss": 2.733, + "mask_loss": 0.1649, + "step": 310, + "topk_loss": 0.0211 + }, + { + "epoch": 0.12360967066816542, + "grad_norm": 0.203125, + "learning_rate": 0.00019451838281608197, + "lm_loss": 2.536, + "loss": 2.7128, + "mask_loss": 0.1605, + "step": 311, + "topk_loss": 0.0163 + }, + { + "epoch": 0.12400712941629456, + "grad_norm": 0.19140625, + "learning_rate": 0.00019447668936567952, + "lm_loss": 2.4844, + "loss": 2.6624, + "mask_loss": 0.1608, + "step": 312, + "topk_loss": 0.0172 + }, + { + "epoch": 0.12440458816442372, + "grad_norm": 0.205078125, + "learning_rate": 0.0001944348424569336, + "lm_loss": 2.4448, + "loss": 2.6258, + "mask_loss": 0.1618, + "step": 313, + "topk_loss": 0.0192 + }, + { + "epoch": 0.12480204691255287, + "grad_norm": 0.23828125, + "learning_rate": 0.00019439284215781613, + "lm_loss": 2.4238, + "loss": 2.6091, + "mask_loss": 0.1635, + "step": 314, + "topk_loss": 0.0219 + }, + { + "epoch": 0.12519950566068203, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019435068853654807, + "lm_loss": 2.4857, + "loss": 2.667, + "mask_loss": 0.1626, + "step": 315, + "topk_loss": 0.0187 + }, + { + "epoch": 0.12559696440881116, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019430838166159954, + "lm_loss": 2.5535, + "loss": 2.7497, + "mask_loss": 0.1674, + "step": 316, + "topk_loss": 0.0288 + }, + { + "epoch": 0.12599442315694032, + "grad_norm": 0.2158203125, + "learning_rate": 0.0001942659216016895, + "lm_loss": 2.4172, + "loss": 2.5921, + "mask_loss": 0.1587, + "step": 317, + "topk_loss": 0.0162 + }, + { + "epoch": 0.12639188190506948, + "grad_norm": 0.24609375, + "learning_rate": 0.00019422330842578577, + "lm_loss": 2.4766, + "loss": 2.6646, + "mask_loss": 0.1639, + "step": 318, + "topk_loss": 0.0241 + }, + { + "epoch": 0.1267893406531986, + "grad_norm": 0.23046875, + "learning_rate": 0.00019418054220310483, + "lm_loss": 2.5006, + "loss": 2.6787, + "mask_loss": 0.1625, + "step": 319, + "topk_loss": 0.0155 + }, + { + "epoch": 0.12718679940132777, + "grad_norm": 0.1943359375, + "learning_rate": 0.00019413762300311182, + "lm_loss": 2.455, + "loss": 2.6357, + "mask_loss": 0.1624, + "step": 320, + "topk_loss": 0.0183 + }, + { + "epoch": 0.1275842581494569, + "grad_norm": 0.23828125, + "learning_rate": 0.00019409455089552038, + "lm_loss": 2.5089, + "loss": 2.6893, + "mask_loss": 0.16, + "step": 321, + "topk_loss": 0.0204 + }, + { + "epoch": 0.12798171689758606, + "grad_norm": 0.1904296875, + "learning_rate": 0.0001940513259502924, + "lm_loss": 2.4325, + "loss": 2.6111, + "mask_loss": 0.1624, + "step": 322, + "topk_loss": 0.0162 + }, + { + "epoch": 0.12837917564571522, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019400794823763815, + "lm_loss": 2.4237, + "loss": 2.5945, + "mask_loss": 0.1567, + "step": 323, + "topk_loss": 0.0142 + }, + { + "epoch": 0.12877663439384435, + "grad_norm": 0.212890625, + "learning_rate": 0.00019396441782801592, + "lm_loss": 2.5473, + "loss": 2.729, + "mask_loss": 0.1632, + "step": 324, + "topk_loss": 0.0185 + }, + { + "epoch": 0.1291740931419735, + "grad_norm": 0.1953125, + "learning_rate": 0.00019392073479213213, + "lm_loss": 2.4768, + "loss": 2.6561, + "mask_loss": 0.1606, + "step": 325, + "topk_loss": 0.0188 + }, + { + "epoch": 0.12957155189010267, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019387689920094103, + "lm_loss": 2.4692, + "loss": 2.6498, + "mask_loss": 0.1615, + "step": 326, + "topk_loss": 0.019 + }, + { + "epoch": 0.1299690106382318, + "grad_norm": 0.1923828125, + "learning_rate": 0.00019383291112564478, + "lm_loss": 2.4854, + "loss": 2.6632, + "mask_loss": 0.16, + "step": 327, + "topk_loss": 0.0178 + }, + { + "epoch": 0.13036646938636096, + "grad_norm": 0.2060546875, + "learning_rate": 0.00019378877063769309, + "lm_loss": 2.4537, + "loss": 2.6312, + "mask_loss": 0.1587, + "step": 328, + "topk_loss": 0.0188 + }, + { + "epoch": 0.13076392813449011, + "grad_norm": 0.2265625, + "learning_rate": 0.00019374447780878327, + "lm_loss": 2.5265, + "loss": 2.7126, + "mask_loss": 0.1639, + "step": 329, + "topk_loss": 0.0222 + }, + { + "epoch": 0.13116138688261925, + "grad_norm": 0.2177734375, + "learning_rate": 0.0001937000327108601, + "lm_loss": 2.4116, + "loss": 2.5915, + "mask_loss": 0.1613, + "step": 330, + "topk_loss": 0.0185 + }, + { + "epoch": 0.1315588456307484, + "grad_norm": 0.232421875, + "learning_rate": 0.00019365543541611575, + "lm_loss": 2.4401, + "loss": 2.6144, + "mask_loss": 0.1587, + "step": 331, + "topk_loss": 0.0156 + }, + { + "epoch": 0.13195630437887756, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019361068599698945, + "lm_loss": 2.4014, + "loss": 2.5804, + "mask_loss": 0.1605, + "step": 332, + "topk_loss": 0.0185 + }, + { + "epoch": 0.1323537631270067, + "grad_norm": 0.19921875, + "learning_rate": 0.00019356578452616772, + "lm_loss": 2.4467, + "loss": 2.6252, + "mask_loss": 0.1604, + "step": 333, + "topk_loss": 0.0181 + }, + { + "epoch": 0.13275122187513586, + "grad_norm": 0.2255859375, + "learning_rate": 0.00019352073107658385, + "lm_loss": 2.3818, + "loss": 2.5607, + "mask_loss": 0.1606, + "step": 334, + "topk_loss": 0.0183 + }, + { + "epoch": 0.133148680623265, + "grad_norm": 0.177734375, + "learning_rate": 0.0001934755257214181, + "lm_loss": 2.4268, + "loss": 2.6045, + "mask_loss": 0.1592, + "step": 335, + "topk_loss": 0.0184 + }, + { + "epoch": 0.13354613937139415, + "grad_norm": 0.1845703125, + "learning_rate": 0.00019343016853409754, + "lm_loss": 2.3896, + "loss": 2.5674, + "mask_loss": 0.1611, + "step": 336, + "topk_loss": 0.0166 + }, + { + "epoch": 0.1339435981195233, + "grad_norm": 0.2021484375, + "learning_rate": 0.00019338465958829572, + "lm_loss": 2.4972, + "loss": 2.6721, + "mask_loss": 0.1585, + "step": 337, + "topk_loss": 0.0163 + }, + { + "epoch": 0.13434105686765244, + "grad_norm": 0.251953125, + "learning_rate": 0.00019333899895793272, + "lm_loss": 2.4311, + "loss": 2.6161, + "mask_loss": 0.1619, + "step": 338, + "topk_loss": 0.0231 + }, + { + "epoch": 0.1347385156157816, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001932931867171751, + "lm_loss": 2.3507, + "loss": 2.5257, + "mask_loss": 0.1579, + "step": 339, + "topk_loss": 0.0171 + }, + { + "epoch": 0.13513597436391075, + "grad_norm": 0.2353515625, + "learning_rate": 0.00019324722294043558, + "lm_loss": 2.427, + "loss": 2.6035, + "mask_loss": 0.1595, + "step": 340, + "topk_loss": 0.0171 + }, + { + "epoch": 0.13553343311203989, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019320110770237308, + "lm_loss": 2.4311, + "loss": 2.6085, + "mask_loss": 0.1604, + "step": 341, + "topk_loss": 0.017 + }, + { + "epoch": 0.13593089186016905, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019315484107789246, + "lm_loss": 2.3972, + "loss": 2.5739, + "mask_loss": 0.159, + "step": 342, + "topk_loss": 0.0177 + }, + { + "epoch": 0.1363283506082982, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019310842314214458, + "lm_loss": 2.46, + "loss": 2.6425, + "mask_loss": 0.1619, + "step": 343, + "topk_loss": 0.0205 + }, + { + "epoch": 0.13672580935642734, + "grad_norm": 0.224609375, + "learning_rate": 0.000193061853970526, + "lm_loss": 2.2869, + "loss": 2.4596, + "mask_loss": 0.159, + "step": 344, + "topk_loss": 0.0138 + }, + { + "epoch": 0.1371232681045565, + "grad_norm": 0.2119140625, + "learning_rate": 0.00019301513363867895, + "lm_loss": 2.3823, + "loss": 2.5584, + "mask_loss": 0.1607, + "step": 345, + "topk_loss": 0.0154 + }, + { + "epoch": 0.13752072685268565, + "grad_norm": 0.19140625, + "learning_rate": 0.0001929682622224912, + "lm_loss": 2.4612, + "loss": 2.6359, + "mask_loss": 0.1581, + "step": 346, + "topk_loss": 0.0165 + }, + { + "epoch": 0.13791818560081479, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019292123979809594, + "lm_loss": 2.4941, + "loss": 2.6731, + "mask_loss": 0.1604, + "step": 347, + "topk_loss": 0.0187 + }, + { + "epoch": 0.13831564434894394, + "grad_norm": 0.310546875, + "learning_rate": 0.00019287406644187156, + "lm_loss": 2.4155, + "loss": 2.59, + "mask_loss": 0.1558, + "step": 348, + "topk_loss": 0.0188 + }, + { + "epoch": 0.13871310309707308, + "grad_norm": 0.216796875, + "learning_rate": 0.00019282674223044177, + "lm_loss": 2.4682, + "loss": 2.64, + "mask_loss": 0.1555, + "step": 349, + "topk_loss": 0.0163 + }, + { + "epoch": 0.13911056184520224, + "grad_norm": 0.251953125, + "learning_rate": 0.0001927792672406751, + "lm_loss": 2.4956, + "loss": 2.6801, + "mask_loss": 0.1615, + "step": 350, + "topk_loss": 0.0231 + }, + { + "epoch": 0.13911056184520224, + "eval_lm_loss": 721.0567016601562, + "eval_loss": 721.2329711914062, + "eval_mask_hit_rate": 0.4606643319129944, + "eval_mask_loss": 0.1554771065711975, + "eval_mask_top_10_hit_rate": 0.9582056999206543, + "eval_mask_top_1_hit_rate": 0.9878551959991455, + "eval_mask_top_20_hit_rate": 0.9397642612457275, + "eval_mask_top_5_hit_rate": 0.9702033996582031, + "eval_runtime": 145.1773, + "eval_samples_per_second": 14.107, + "eval_steps_per_second": 7.053, + "eval_token_accuracy": 0.5597802400588989, + "eval_top_k_diff": -564.8240356445312, + "eval_topk_loss": 0.02081306278705597, + "step": 350 + }, + { + "epoch": 0.1395080205933314, + "grad_norm": 0.234375, + "learning_rate": 0.00019273164154968522, + "lm_loss": 2.5178, + "loss": 2.6936, + "mask_loss": 0.1571, + "step": 351, + "topk_loss": 0.0187 + }, + { + "epoch": 0.13990547934146053, + "grad_norm": 0.203125, + "learning_rate": 0.00019268386523483037, + "lm_loss": 2.3764, + "loss": 2.5479, + "mask_loss": 0.156, + "step": 352, + "topk_loss": 0.0155 + }, + { + "epoch": 0.14030293808958968, + "grad_norm": 0.1875, + "learning_rate": 0.0001926359383737136, + "lm_loss": 2.3557, + "loss": 2.5295, + "mask_loss": 0.1579, + "step": 353, + "topk_loss": 0.0158 + }, + { + "epoch": 0.14070039683771884, + "grad_norm": 0.1884765625, + "learning_rate": 0.00019258786104418244, + "lm_loss": 2.3476, + "loss": 2.5195, + "mask_loss": 0.1562, + "step": 354, + "topk_loss": 0.0157 + }, + { + "epoch": 0.14109785558584798, + "grad_norm": 0.267578125, + "learning_rate": 0.00019253963332432878, + "lm_loss": 2.4405, + "loss": 2.6102, + "mask_loss": 0.1543, + "step": 355, + "topk_loss": 0.0154 + }, + { + "epoch": 0.14149531433397713, + "grad_norm": 0.21484375, + "learning_rate": 0.0001924912552924889, + "lm_loss": 2.4723, + "loss": 2.6404, + "mask_loss": 0.1524, + "step": 356, + "topk_loss": 0.0157 + }, + { + "epoch": 0.1418927730821063, + "grad_norm": 0.22265625, + "learning_rate": 0.0001924427270272431, + "lm_loss": 2.4031, + "loss": 2.5813, + "mask_loss": 0.1572, + "step": 357, + "topk_loss": 0.021 + }, + { + "epoch": 0.14229023183023543, + "grad_norm": 0.1787109375, + "learning_rate": 0.00019239404860741578, + "lm_loss": 2.2449, + "loss": 2.4173, + "mask_loss": 0.1549, + "step": 358, + "topk_loss": 0.0175 + }, + { + "epoch": 0.14268769057836458, + "grad_norm": 0.17578125, + "learning_rate": 0.00019234522011207528, + "lm_loss": 2.3971, + "loss": 2.5702, + "mask_loss": 0.157, + "step": 359, + "topk_loss": 0.016 + }, + { + "epoch": 0.14308514932649374, + "grad_norm": 0.1982421875, + "learning_rate": 0.00019229624162053356, + "lm_loss": 2.3401, + "loss": 2.5133, + "mask_loss": 0.1566, + "step": 360, + "topk_loss": 0.0167 + }, + { + "epoch": 0.14348260807462287, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001922471132123464, + "lm_loss": 2.3972, + "loss": 2.567, + "mask_loss": 0.1534, + "step": 361, + "topk_loss": 0.0164 + }, + { + "epoch": 0.14388006682275203, + "grad_norm": 0.2080078125, + "learning_rate": 0.00019219783496731292, + "lm_loss": 2.3702, + "loss": 2.5355, + "mask_loss": 0.152, + "step": 362, + "topk_loss": 0.0133 + }, + { + "epoch": 0.14427752557088117, + "grad_norm": 0.193359375, + "learning_rate": 0.00019214840696547575, + "lm_loss": 2.3916, + "loss": 2.5707, + "mask_loss": 0.1599, + "step": 363, + "topk_loss": 0.0192 + }, + { + "epoch": 0.14467498431901032, + "grad_norm": 0.1875, + "learning_rate": 0.0001920988292871207, + "lm_loss": 2.3686, + "loss": 2.5369, + "mask_loss": 0.1522, + "step": 364, + "topk_loss": 0.0161 + }, + { + "epoch": 0.14507244306713948, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019204910201277672, + "lm_loss": 2.3517, + "loss": 2.5237, + "mask_loss": 0.1555, + "step": 365, + "topk_loss": 0.0165 + }, + { + "epoch": 0.14546990181526862, + "grad_norm": 0.1904296875, + "learning_rate": 0.00019199922522321574, + "lm_loss": 2.3763, + "loss": 2.5485, + "mask_loss": 0.1553, + "step": 366, + "topk_loss": 0.0169 + }, + { + "epoch": 0.14586736056339777, + "grad_norm": 0.1953125, + "learning_rate": 0.0001919491989994526, + "lm_loss": 2.4421, + "loss": 2.6105, + "mask_loss": 0.1537, + "step": 367, + "topk_loss": 0.0147 + }, + { + "epoch": 0.14626481931152693, + "grad_norm": 0.181640625, + "learning_rate": 0.00019189902342274471, + "lm_loss": 2.3083, + "loss": 2.4815, + "mask_loss": 0.1568, + "step": 368, + "topk_loss": 0.0164 + }, + { + "epoch": 0.14666227805965606, + "grad_norm": 0.2001953125, + "learning_rate": 0.00019184869857459232, + "lm_loss": 2.3493, + "loss": 2.5219, + "mask_loss": 0.1557, + "step": 369, + "topk_loss": 0.0169 + }, + { + "epoch": 0.14705973680778522, + "grad_norm": 0.23046875, + "learning_rate": 0.0001917982245367379, + "lm_loss": 2.3418, + "loss": 2.5175, + "mask_loss": 0.1564, + "step": 370, + "topk_loss": 0.0193 + }, + { + "epoch": 0.14745719555591438, + "grad_norm": 0.1962890625, + "learning_rate": 0.00019174760139116642, + "lm_loss": 2.4028, + "loss": 2.5704, + "mask_loss": 0.1531, + "step": 371, + "topk_loss": 0.0145 + }, + { + "epoch": 0.14785465430404351, + "grad_norm": 0.2041015625, + "learning_rate": 0.00019169682922010492, + "lm_loss": 2.34, + "loss": 2.5094, + "mask_loss": 0.1543, + "step": 372, + "topk_loss": 0.0151 + }, + { + "epoch": 0.14825211305217267, + "grad_norm": 0.220703125, + "learning_rate": 0.00019164590810602262, + "lm_loss": 2.2568, + "loss": 2.4285, + "mask_loss": 0.1558, + "step": 373, + "topk_loss": 0.016 + }, + { + "epoch": 0.14864957180030183, + "grad_norm": 0.20703125, + "learning_rate": 0.00019159483813163054, + "lm_loss": 2.3312, + "loss": 2.5006, + "mask_loss": 0.1551, + "step": 374, + "topk_loss": 0.0143 + }, + { + "epoch": 0.14904703054843096, + "grad_norm": 0.205078125, + "learning_rate": 0.00019154361937988163, + "lm_loss": 2.3235, + "loss": 2.4962, + "mask_loss": 0.1544, + "step": 375, + "topk_loss": 0.0183 + }, + { + "epoch": 0.14944448929656012, + "grad_norm": 0.197265625, + "learning_rate": 0.00019149225193397043, + "lm_loss": 2.3933, + "loss": 2.5681, + "mask_loss": 0.1555, + "step": 376, + "topk_loss": 0.0193 + }, + { + "epoch": 0.14984194804468925, + "grad_norm": 0.2109375, + "learning_rate": 0.00019144073587733294, + "lm_loss": 2.355, + "loss": 2.5333, + "mask_loss": 0.1579, + "step": 377, + "topk_loss": 0.0205 + }, + { + "epoch": 0.15023940679281841, + "grad_norm": 0.201171875, + "learning_rate": 0.00019138907129364664, + "lm_loss": 2.3831, + "loss": 2.5541, + "mask_loss": 0.1546, + "step": 378, + "topk_loss": 0.0164 + }, + { + "epoch": 0.15063686554094757, + "grad_norm": 0.2197265625, + "learning_rate": 0.0001913372582668303, + "lm_loss": 2.3562, + "loss": 2.5254, + "mask_loss": 0.1544, + "step": 379, + "topk_loss": 0.0148 + }, + { + "epoch": 0.1510343242890767, + "grad_norm": 0.26953125, + "learning_rate": 0.00019128529688104364, + "lm_loss": 2.4145, + "loss": 2.6087, + "mask_loss": 0.1629, + "step": 380, + "topk_loss": 0.0314 + }, + { + "epoch": 0.15143178303720586, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001912331872206875, + "lm_loss": 2.4485, + "loss": 2.6143, + "mask_loss": 0.1507, + "step": 381, + "topk_loss": 0.0151 + }, + { + "epoch": 0.15182924178533502, + "grad_norm": 0.1875, + "learning_rate": 0.00019118092937040352, + "lm_loss": 2.358, + "loss": 2.5224, + "mask_loss": 0.1491, + "step": 382, + "topk_loss": 0.0153 + }, + { + "epoch": 0.15222670053346415, + "grad_norm": 0.2421875, + "learning_rate": 0.0001911285234150741, + "lm_loss": 2.3632, + "loss": 2.5428, + "mask_loss": 0.1573, + "step": 383, + "topk_loss": 0.0223 + }, + { + "epoch": 0.1526241592815933, + "grad_norm": 0.197265625, + "learning_rate": 0.0001910759694398221, + "lm_loss": 2.3099, + "loss": 2.4799, + "mask_loss": 0.1533, + "step": 384, + "topk_loss": 0.0167 + }, + { + "epoch": 0.15302161802972247, + "grad_norm": 0.1708984375, + "learning_rate": 0.00019102326753001086, + "lm_loss": 2.3235, + "loss": 2.4979, + "mask_loss": 0.1567, + "step": 385, + "topk_loss": 0.0177 + }, + { + "epoch": 0.1534190767778516, + "grad_norm": 0.208984375, + "learning_rate": 0.000190970417771244, + "lm_loss": 2.3149, + "loss": 2.4784, + "mask_loss": 0.1492, + "step": 386, + "topk_loss": 0.0143 + }, + { + "epoch": 0.15381653552598076, + "grad_norm": 0.20703125, + "learning_rate": 0.00019091742024936537, + "lm_loss": 2.372, + "loss": 2.5421, + "mask_loss": 0.1525, + "step": 387, + "topk_loss": 0.0175 + }, + { + "epoch": 0.15421399427410992, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001908642750504587, + "lm_loss": 2.2879, + "loss": 2.4548, + "mask_loss": 0.1521, + "step": 388, + "topk_loss": 0.0148 + }, + { + "epoch": 0.15461145302223905, + "grad_norm": 0.201171875, + "learning_rate": 0.0001908109822608477, + "lm_loss": 2.3243, + "loss": 2.5138, + "mask_loss": 0.1627, + "step": 389, + "topk_loss": 0.0268 + }, + { + "epoch": 0.1550089117703682, + "grad_norm": 0.259765625, + "learning_rate": 0.00019075754196709572, + "lm_loss": 2.3223, + "loss": 2.4913, + "mask_loss": 0.1537, + "step": 390, + "topk_loss": 0.0153 + }, + { + "epoch": 0.15540637051849734, + "grad_norm": 0.220703125, + "learning_rate": 0.00019070395425600578, + "lm_loss": 2.341, + "loss": 2.5162, + "mask_loss": 0.155, + "step": 391, + "topk_loss": 0.0202 + }, + { + "epoch": 0.1558038292666265, + "grad_norm": 0.34375, + "learning_rate": 0.0001906502192146203, + "lm_loss": 2.2998, + "loss": 2.5121, + "mask_loss": 0.1694, + "step": 392, + "topk_loss": 0.0429 + }, + { + "epoch": 0.15620128801475566, + "grad_norm": 0.1865234375, + "learning_rate": 0.00019059633693022104, + "lm_loss": 2.387, + "loss": 2.5585, + "mask_loss": 0.1538, + "step": 393, + "topk_loss": 0.0177 + }, + { + "epoch": 0.1565987467628848, + "grad_norm": 0.16015625, + "learning_rate": 0.00019054230749032894, + "lm_loss": 2.3027, + "loss": 2.4735, + "mask_loss": 0.154, + "step": 394, + "topk_loss": 0.0168 + }, + { + "epoch": 0.15699620551101395, + "grad_norm": 0.1650390625, + "learning_rate": 0.00019048813098270387, + "lm_loss": 2.299, + "loss": 2.4641, + "mask_loss": 0.1508, + "step": 395, + "topk_loss": 0.0143 + }, + { + "epoch": 0.1573936642591431, + "grad_norm": 0.1767578125, + "learning_rate": 0.00019043380749534473, + "lm_loss": 2.3667, + "loss": 2.5338, + "mask_loss": 0.1512, + "step": 396, + "topk_loss": 0.0159 + }, + { + "epoch": 0.15779112300727224, + "grad_norm": 0.1669921875, + "learning_rate": 0.000190379337116489, + "lm_loss": 2.2272, + "loss": 2.3972, + "mask_loss": 0.1545, + "step": 397, + "topk_loss": 0.0155 + }, + { + "epoch": 0.1581885817554014, + "grad_norm": 0.166015625, + "learning_rate": 0.0001903247199346129, + "lm_loss": 2.3447, + "loss": 2.5147, + "mask_loss": 0.1537, + "step": 398, + "topk_loss": 0.0164 + }, + { + "epoch": 0.15858604050353056, + "grad_norm": 0.17578125, + "learning_rate": 0.00019026995603843097, + "lm_loss": 2.2948, + "loss": 2.4652, + "mask_loss": 0.1534, + "step": 399, + "topk_loss": 0.0169 + }, + { + "epoch": 0.1589834992516597, + "grad_norm": 0.162109375, + "learning_rate": 0.0001902150455168962, + "lm_loss": 2.3454, + "loss": 2.5136, + "mask_loss": 0.1525, + "step": 400, + "topk_loss": 0.0157 + }, + { + "epoch": 0.1589834992516597, + "eval_lm_loss": 713.7522583007812, + "eval_loss": 713.9226684570312, + "eval_mask_hit_rate": 0.4711623787879944, + "eval_mask_loss": 0.15088751912117004, + "eval_mask_top_10_hit_rate": 0.964043378829956, + "eval_mask_top_1_hit_rate": 0.9903273582458496, + "eval_mask_top_20_hit_rate": 0.9471466541290283, + "eval_mask_top_5_hit_rate": 0.9748400449752808, + "eval_runtime": 148.3333, + "eval_samples_per_second": 13.807, + "eval_steps_per_second": 6.903, + "eval_token_accuracy": 0.5702993869781494, + "eval_top_k_diff": -536.0791015625, + "eval_topk_loss": 0.019542653113603592, + "step": 400 + }, + { + "epoch": 0.15938095799978885, + "grad_norm": 0.173828125, + "learning_rate": 0.0001901599884591996, + "lm_loss": 2.3265, + "loss": 2.4894, + "mask_loss": 0.1491, + "step": 401, + "topk_loss": 0.0138 + }, + { + "epoch": 0.159778416747918, + "grad_norm": 0.177734375, + "learning_rate": 0.0001901047849547703, + "lm_loss": 2.3177, + "loss": 2.4838, + "mask_loss": 0.1497, + "step": 402, + "topk_loss": 0.0164 + }, + { + "epoch": 0.16017587549604714, + "grad_norm": 0.16796875, + "learning_rate": 0.00019004943509327523, + "lm_loss": 2.3049, + "loss": 2.4731, + "mask_loss": 0.1531, + "step": 403, + "topk_loss": 0.0151 + }, + { + "epoch": 0.1605733342441763, + "grad_norm": 0.1904296875, + "learning_rate": 0.00018999393896461917, + "lm_loss": 2.2954, + "loss": 2.4605, + "mask_loss": 0.151, + "step": 404, + "topk_loss": 0.014 + }, + { + "epoch": 0.16097079299230543, + "grad_norm": 0.2109375, + "learning_rate": 0.0001899382966589443, + "lm_loss": 2.3559, + "loss": 2.5239, + "mask_loss": 0.1516, + "step": 405, + "topk_loss": 0.0163 + }, + { + "epoch": 0.1613682517404346, + "grad_norm": 0.16796875, + "learning_rate": 0.0001898825082666304, + "lm_loss": 2.3218, + "loss": 2.4847, + "mask_loss": 0.1495, + "step": 406, + "topk_loss": 0.0134 + }, + { + "epoch": 0.16176571048856375, + "grad_norm": 0.244140625, + "learning_rate": 0.00018982657387829445, + "lm_loss": 2.2732, + "loss": 2.4401, + "mask_loss": 0.1522, + "step": 407, + "topk_loss": 0.0146 + }, + { + "epoch": 0.16216316923669288, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018977049358479057, + "lm_loss": 2.2258, + "loss": 2.3913, + "mask_loss": 0.1505, + "step": 408, + "topk_loss": 0.015 + }, + { + "epoch": 0.16256062798482204, + "grad_norm": 0.185546875, + "learning_rate": 0.00018971426747720993, + "lm_loss": 2.3396, + "loss": 2.5083, + "mask_loss": 0.1517, + "step": 409, + "topk_loss": 0.017 + }, + { + "epoch": 0.1629580867329512, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001896578956468805, + "lm_loss": 2.3099, + "loss": 2.4746, + "mask_loss": 0.1501, + "step": 410, + "topk_loss": 0.0146 + }, + { + "epoch": 0.16335554548108033, + "grad_norm": 0.16796875, + "learning_rate": 0.00018960137818536694, + "lm_loss": 2.3023, + "loss": 2.471, + "mask_loss": 0.1535, + "step": 411, + "topk_loss": 0.0152 + }, + { + "epoch": 0.1637530042292095, + "grad_norm": 0.15625, + "learning_rate": 0.00018954471518447052, + "lm_loss": 2.3396, + "loss": 2.5091, + "mask_loss": 0.1522, + "step": 412, + "topk_loss": 0.0173 + }, + { + "epoch": 0.16415046297733865, + "grad_norm": 0.1923828125, + "learning_rate": 0.00018948790673622884, + "lm_loss": 2.2395, + "loss": 2.4175, + "mask_loss": 0.1569, + "step": 413, + "topk_loss": 0.0211 + }, + { + "epoch": 0.16454792172546778, + "grad_norm": 0.1962890625, + "learning_rate": 0.00018943095293291572, + "lm_loss": 2.217, + "loss": 2.3831, + "mask_loss": 0.1516, + "step": 414, + "topk_loss": 0.0145 + }, + { + "epoch": 0.16494538047359694, + "grad_norm": 0.173828125, + "learning_rate": 0.00018937385386704126, + "lm_loss": 2.3289, + "loss": 2.5003, + "mask_loss": 0.1517, + "step": 415, + "topk_loss": 0.0196 + }, + { + "epoch": 0.1653428392217261, + "grad_norm": 0.1494140625, + "learning_rate": 0.00018931660963135126, + "lm_loss": 2.2105, + "loss": 2.376, + "mask_loss": 0.1511, + "step": 416, + "topk_loss": 0.0144 + }, + { + "epoch": 0.16574029796985523, + "grad_norm": 0.189453125, + "learning_rate": 0.00018925922031882758, + "lm_loss": 2.2829, + "loss": 2.4448, + "mask_loss": 0.1483, + "step": 417, + "topk_loss": 0.0136 + }, + { + "epoch": 0.1661377567179844, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018920168602268748, + "lm_loss": 2.2297, + "loss": 2.3947, + "mask_loss": 0.1514, + "step": 418, + "topk_loss": 0.0136 + }, + { + "epoch": 0.16653521546611352, + "grad_norm": 0.169921875, + "learning_rate": 0.00018914400683638384, + "lm_loss": 2.3595, + "loss": 2.5301, + "mask_loss": 0.1528, + "step": 419, + "topk_loss": 0.0178 + }, + { + "epoch": 0.16693267421424268, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018908618285360494, + "lm_loss": 2.2403, + "loss": 2.4043, + "mask_loss": 0.1505, + "step": 420, + "topk_loss": 0.0135 + }, + { + "epoch": 0.16733013296237184, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018902821416827412, + "lm_loss": 2.2461, + "loss": 2.4134, + "mask_loss": 0.1519, + "step": 421, + "topk_loss": 0.0154 + }, + { + "epoch": 0.16772759171050097, + "grad_norm": 0.16796875, + "learning_rate": 0.00018897010087454987, + "lm_loss": 2.2979, + "loss": 2.4568, + "mask_loss": 0.1468, + "step": 422, + "topk_loss": 0.0121 + }, + { + "epoch": 0.16812505045863013, + "grad_norm": 0.15234375, + "learning_rate": 0.00018891184306682552, + "lm_loss": 2.205, + "loss": 2.3694, + "mask_loss": 0.1502, + "step": 423, + "topk_loss": 0.0142 + }, + { + "epoch": 0.1685225092067593, + "grad_norm": 0.2138671875, + "learning_rate": 0.00018885344083972914, + "lm_loss": 2.2649, + "loss": 2.4345, + "mask_loss": 0.153, + "step": 424, + "topk_loss": 0.0167 + }, + { + "epoch": 0.16891996795488842, + "grad_norm": 0.396484375, + "learning_rate": 0.00018879489428812334, + "lm_loss": 2.2908, + "loss": 2.5763, + "mask_loss": 0.2318, + "step": 425, + "topk_loss": 0.0537 + }, + { + "epoch": 0.16931742670301758, + "grad_norm": 0.212890625, + "learning_rate": 0.00018873620350710527, + "lm_loss": 2.2308, + "loss": 2.3953, + "mask_loss": 0.1496, + "step": 426, + "topk_loss": 0.0149 + }, + { + "epoch": 0.16971488545114674, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001886773685920062, + "lm_loss": 2.2539, + "loss": 2.4188, + "mask_loss": 0.1513, + "step": 427, + "topk_loss": 0.0136 + }, + { + "epoch": 0.17011234419927587, + "grad_norm": 0.16796875, + "learning_rate": 0.00018861838963839164, + "lm_loss": 2.1987, + "loss": 2.3669, + "mask_loss": 0.1523, + "step": 428, + "topk_loss": 0.0158 + }, + { + "epoch": 0.17050980294740503, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018855926674206098, + "lm_loss": 2.2432, + "loss": 2.4087, + "mask_loss": 0.1501, + "step": 429, + "topk_loss": 0.0154 + }, + { + "epoch": 0.1709072616955342, + "grad_norm": 0.3046875, + "learning_rate": 0.0001884999999990475, + "lm_loss": 2.2507, + "loss": 2.4245, + "mask_loss": 0.1537, + "step": 430, + "topk_loss": 0.0202 + }, + { + "epoch": 0.17130472044366332, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018844058950561805, + "lm_loss": 2.2252, + "loss": 2.3899, + "mask_loss": 0.1511, + "step": 431, + "topk_loss": 0.0136 + }, + { + "epoch": 0.17170217919179248, + "grad_norm": 0.234375, + "learning_rate": 0.00018838103535827297, + "lm_loss": 2.2232, + "loss": 2.3882, + "mask_loss": 0.1487, + "step": 432, + "topk_loss": 0.0163 + }, + { + "epoch": 0.1720996379399216, + "grad_norm": 0.189453125, + "learning_rate": 0.00018832133765374606, + "lm_loss": 2.2149, + "loss": 2.3849, + "mask_loss": 0.1534, + "step": 433, + "topk_loss": 0.0166 + }, + { + "epoch": 0.17249709668805077, + "grad_norm": 0.1572265625, + "learning_rate": 0.00018826149648900416, + "lm_loss": 2.2296, + "loss": 2.3944, + "mask_loss": 0.1496, + "step": 434, + "topk_loss": 0.0152 + }, + { + "epoch": 0.17289455543617993, + "grad_norm": 0.158203125, + "learning_rate": 0.00018820151196124717, + "lm_loss": 2.2515, + "loss": 2.418, + "mask_loss": 0.1514, + "step": 435, + "topk_loss": 0.0151 + }, + { + "epoch": 0.17329201418430906, + "grad_norm": 0.16796875, + "learning_rate": 0.00018814138416790787, + "lm_loss": 2.2328, + "loss": 2.3971, + "mask_loss": 0.1501, + "step": 436, + "topk_loss": 0.0142 + }, + { + "epoch": 0.17368947293243822, + "grad_norm": 0.1640625, + "learning_rate": 0.00018808111320665173, + "lm_loss": 2.2423, + "loss": 2.4073, + "mask_loss": 0.1507, + "step": 437, + "topk_loss": 0.0144 + }, + { + "epoch": 0.17408693168056738, + "grad_norm": 0.169921875, + "learning_rate": 0.00018802069917537686, + "lm_loss": 2.241, + "loss": 2.4033, + "mask_loss": 0.1486, + "step": 438, + "topk_loss": 0.0136 + }, + { + "epoch": 0.1744843904286965, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001879601421722136, + "lm_loss": 2.1755, + "loss": 2.3374, + "mask_loss": 0.1476, + "step": 439, + "topk_loss": 0.0143 + }, + { + "epoch": 0.17488184917682567, + "grad_norm": 0.166015625, + "learning_rate": 0.0001878994422955246, + "lm_loss": 2.2717, + "loss": 2.4436, + "mask_loss": 0.1528, + "step": 440, + "topk_loss": 0.0191 + }, + { + "epoch": 0.17527930792495483, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018783859964390464, + "lm_loss": 2.2874, + "loss": 2.4501, + "mask_loss": 0.1482, + "step": 441, + "topk_loss": 0.0146 + }, + { + "epoch": 0.17567676667308396, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001877776143161803, + "lm_loss": 2.2562, + "loss": 2.4221, + "mask_loss": 0.1501, + "step": 442, + "topk_loss": 0.0158 + }, + { + "epoch": 0.17607422542121312, + "grad_norm": 0.1806640625, + "learning_rate": 0.00018771648641140995, + "lm_loss": 2.2138, + "loss": 2.3753, + "mask_loss": 0.1495, + "step": 443, + "topk_loss": 0.012 + }, + { + "epoch": 0.17647168416934228, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001876552160288836, + "lm_loss": 2.2031, + "loss": 2.3683, + "mask_loss": 0.15, + "step": 444, + "topk_loss": 0.0152 + }, + { + "epoch": 0.1768691429174714, + "grad_norm": 0.1875, + "learning_rate": 0.00018759380326812257, + "lm_loss": 2.2839, + "loss": 2.4543, + "mask_loss": 0.1508, + "step": 445, + "topk_loss": 0.0196 + }, + { + "epoch": 0.17726660166560057, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018753224822887958, + "lm_loss": 2.2664, + "loss": 2.4313, + "mask_loss": 0.1495, + "step": 446, + "topk_loss": 0.0155 + }, + { + "epoch": 0.1776640604137297, + "grad_norm": 0.18359375, + "learning_rate": 0.00018747055101113832, + "lm_loss": 2.2679, + "loss": 2.4332, + "mask_loss": 0.1499, + "step": 447, + "topk_loss": 0.0154 + }, + { + "epoch": 0.17806151916185886, + "grad_norm": 0.17578125, + "learning_rate": 0.00018740871171511357, + "lm_loss": 2.255, + "loss": 2.4222, + "mask_loss": 0.1507, + "step": 448, + "topk_loss": 0.0166 + }, + { + "epoch": 0.17845897790998802, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018734673044125072, + "lm_loss": 2.2597, + "loss": 2.4192, + "mask_loss": 0.1447, + "step": 449, + "topk_loss": 0.0148 + }, + { + "epoch": 0.17885643665811715, + "grad_norm": 0.1728515625, + "learning_rate": 0.0001872846072902259, + "lm_loss": 2.201, + "loss": 2.3614, + "mask_loss": 0.1471, + "step": 450, + "topk_loss": 0.0133 + }, + { + "epoch": 0.17885643665811715, + "eval_lm_loss": 708.869384765625, + "eval_loss": 709.0347900390625, + "eval_mask_hit_rate": 0.4796638786792755, + "eval_mask_loss": 0.1468515545129776, + "eval_mask_top_10_hit_rate": 0.9682042598724365, + "eval_mask_top_1_hit_rate": 0.9918677806854248, + "eval_mask_top_20_hit_rate": 0.9525398015975952, + "eval_mask_top_5_hit_rate": 0.978020429611206, + "eval_runtime": 143.8801, + "eval_samples_per_second": 14.234, + "eval_steps_per_second": 7.117, + "eval_token_accuracy": 0.5777865052223206, + "eval_top_k_diff": -531.690185546875, + "eval_topk_loss": 0.018510261550545692, + "step": 450 + }, + { + "epoch": 0.1792538954062463, + "grad_norm": 0.16015625, + "learning_rate": 0.00018722234236294568, + "lm_loss": 2.2775, + "loss": 2.4381, + "mask_loss": 0.1468, + "step": 451, + "topk_loss": 0.0138 + }, + { + "epoch": 0.17965135415437547, + "grad_norm": 0.1640625, + "learning_rate": 0.00018715993576054685, + "lm_loss": 2.1349, + "loss": 2.3018, + "mask_loss": 0.1513, + "step": 452, + "topk_loss": 0.0157 + }, + { + "epoch": 0.1800488129025046, + "grad_norm": 0.17578125, + "learning_rate": 0.00018709738758439635, + "lm_loss": 2.2279, + "loss": 2.3951, + "mask_loss": 0.1484, + "step": 453, + "topk_loss": 0.0189 + }, + { + "epoch": 0.18044627165063376, + "grad_norm": 0.154296875, + "learning_rate": 0.00018703469793609112, + "lm_loss": 2.1836, + "loss": 2.344, + "mask_loss": 0.1476, + "step": 454, + "topk_loss": 0.0128 + }, + { + "epoch": 0.18084373039876292, + "grad_norm": 0.1826171875, + "learning_rate": 0.00018697186691745782, + "lm_loss": 2.2809, + "loss": 2.4444, + "mask_loss": 0.148, + "step": 455, + "topk_loss": 0.0155 + }, + { + "epoch": 0.18124118914689205, + "grad_norm": 0.1806640625, + "learning_rate": 0.00018690889463055283, + "lm_loss": 2.2363, + "loss": 2.4038, + "mask_loss": 0.151, + "step": 456, + "topk_loss": 0.0166 + }, + { + "epoch": 0.1816386478950212, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001868457811776619, + "lm_loss": 2.2923, + "loss": 2.4541, + "mask_loss": 0.1475, + "step": 457, + "topk_loss": 0.0143 + }, + { + "epoch": 0.18203610664315037, + "grad_norm": 0.189453125, + "learning_rate": 0.00018678252666130013, + "lm_loss": 2.2225, + "loss": 2.386, + "mask_loss": 0.1488, + "step": 458, + "topk_loss": 0.0147 + }, + { + "epoch": 0.1824335653912795, + "grad_norm": 0.1748046875, + "learning_rate": 0.00018671913118421175, + "lm_loss": 2.1708, + "loss": 2.3386, + "mask_loss": 0.1522, + "step": 459, + "topk_loss": 0.0155 + }, + { + "epoch": 0.18283102413940866, + "grad_norm": 0.1640625, + "learning_rate": 0.0001866555948493699, + "lm_loss": 2.2273, + "loss": 2.3916, + "mask_loss": 0.1488, + "step": 460, + "topk_loss": 0.0155 + }, + { + "epoch": 0.1832284828875378, + "grad_norm": 0.1650390625, + "learning_rate": 0.0001865919177599766, + "lm_loss": 2.2133, + "loss": 2.3763, + "mask_loss": 0.1481, + "step": 461, + "topk_loss": 0.0149 + }, + { + "epoch": 0.18362594163566695, + "grad_norm": 0.251953125, + "learning_rate": 0.00018652810001946243, + "lm_loss": 2.2823, + "loss": 2.4524, + "mask_loss": 0.149, + "step": 462, + "topk_loss": 0.0211 + }, + { + "epoch": 0.1840234003837961, + "grad_norm": 0.2177734375, + "learning_rate": 0.00018646414173148642, + "lm_loss": 2.1885, + "loss": 2.3479, + "mask_loss": 0.1469, + "step": 463, + "topk_loss": 0.0125 + }, + { + "epoch": 0.18442085913192524, + "grad_norm": 0.166015625, + "learning_rate": 0.00018640004299993597, + "lm_loss": 2.1878, + "loss": 2.3487, + "mask_loss": 0.1478, + "step": 464, + "topk_loss": 0.0131 + }, + { + "epoch": 0.1848183178800544, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018633580392892648, + "lm_loss": 2.1575, + "loss": 2.319, + "mask_loss": 0.1474, + "step": 465, + "topk_loss": 0.0141 + }, + { + "epoch": 0.18521577662818356, + "grad_norm": 0.169921875, + "learning_rate": 0.00018627142462280144, + "lm_loss": 2.2481, + "loss": 2.4132, + "mask_loss": 0.1479, + "step": 466, + "topk_loss": 0.0172 + }, + { + "epoch": 0.1856132353763127, + "grad_norm": 0.1875, + "learning_rate": 0.00018620690518613206, + "lm_loss": 2.2096, + "loss": 2.3743, + "mask_loss": 0.148, + "step": 467, + "topk_loss": 0.0168 + }, + { + "epoch": 0.18601069412444185, + "grad_norm": 0.177734375, + "learning_rate": 0.00018614224572371715, + "lm_loss": 2.2322, + "loss": 2.3952, + "mask_loss": 0.1483, + "step": 468, + "topk_loss": 0.0148 + }, + { + "epoch": 0.186408152872571, + "grad_norm": 0.2041015625, + "learning_rate": 0.00018607744634058294, + "lm_loss": 2.1991, + "loss": 2.3597, + "mask_loss": 0.1448, + "step": 469, + "topk_loss": 0.0159 + }, + { + "epoch": 0.18680561162070014, + "grad_norm": 0.162109375, + "learning_rate": 0.00018601250714198302, + "lm_loss": 2.1219, + "loss": 2.282, + "mask_loss": 0.147, + "step": 470, + "topk_loss": 0.0131 + }, + { + "epoch": 0.1872030703688293, + "grad_norm": 0.15625, + "learning_rate": 0.00018594742823339802, + "lm_loss": 2.1696, + "loss": 2.331, + "mask_loss": 0.1484, + "step": 471, + "topk_loss": 0.013 + }, + { + "epoch": 0.18760052911695846, + "grad_norm": 0.1708984375, + "learning_rate": 0.0001858822097205355, + "lm_loss": 2.2675, + "loss": 2.4326, + "mask_loss": 0.1487, + "step": 472, + "topk_loss": 0.0163 + }, + { + "epoch": 0.1879979878650876, + "grad_norm": 0.1640625, + "learning_rate": 0.0001858168517093298, + "lm_loss": 2.2102, + "loss": 2.3716, + "mask_loss": 0.1471, + "step": 473, + "topk_loss": 0.0143 + }, + { + "epoch": 0.18839544661321675, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018575135430594185, + "lm_loss": 2.2098, + "loss": 2.3708, + "mask_loss": 0.1452, + "step": 474, + "topk_loss": 0.0158 + }, + { + "epoch": 0.1887929053613459, + "grad_norm": 0.173828125, + "learning_rate": 0.00018568571761675893, + "lm_loss": 2.2549, + "loss": 2.415, + "mask_loss": 0.1457, + "step": 475, + "topk_loss": 0.0144 + }, + { + "epoch": 0.18919036410947504, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018561994174839467, + "lm_loss": 2.1715, + "loss": 2.3349, + "mask_loss": 0.1468, + "step": 476, + "topk_loss": 0.0166 + }, + { + "epoch": 0.1895878228576042, + "grad_norm": 0.1630859375, + "learning_rate": 0.0001855540268076887, + "lm_loss": 2.2258, + "loss": 2.3837, + "mask_loss": 0.1443, + "step": 477, + "topk_loss": 0.0136 + }, + { + "epoch": 0.18998528160573333, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001854879729017066, + "lm_loss": 2.1926, + "loss": 2.3549, + "mask_loss": 0.1479, + "step": 478, + "topk_loss": 0.0144 + }, + { + "epoch": 0.1903827403538625, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018542178013773955, + "lm_loss": 2.1665, + "loss": 2.3269, + "mask_loss": 0.1462, + "step": 479, + "topk_loss": 0.0142 + }, + { + "epoch": 0.19078019910199165, + "grad_norm": 0.1767578125, + "learning_rate": 0.00018535544862330436, + "lm_loss": 2.2233, + "loss": 2.3878, + "mask_loss": 0.1466, + "step": 480, + "topk_loss": 0.0178 + }, + { + "epoch": 0.19117765785012078, + "grad_norm": 0.23828125, + "learning_rate": 0.0001852889784661433, + "lm_loss": 2.167, + "loss": 2.3412, + "mask_loss": 0.1538, + "step": 481, + "topk_loss": 0.0204 + }, + { + "epoch": 0.19157511659824994, + "grad_norm": 0.1787109375, + "learning_rate": 0.00018522236977422363, + "lm_loss": 2.1695, + "loss": 2.328, + "mask_loss": 0.1447, + "step": 482, + "topk_loss": 0.0138 + }, + { + "epoch": 0.1919725753463791, + "grad_norm": 0.2216796875, + "learning_rate": 0.00018515562265573784, + "lm_loss": 2.181, + "loss": 2.3646, + "mask_loss": 0.1569, + "step": 483, + "topk_loss": 0.0268 + }, + { + "epoch": 0.19237003409450823, + "grad_norm": 0.25390625, + "learning_rate": 0.00018508873721910315, + "lm_loss": 2.231, + "loss": 2.3881, + "mask_loss": 0.1436, + "step": 484, + "topk_loss": 0.0134 + }, + { + "epoch": 0.1927674928426374, + "grad_norm": 0.166015625, + "learning_rate": 0.00018502171357296144, + "lm_loss": 2.2171, + "loss": 2.3809, + "mask_loss": 0.1497, + "step": 485, + "topk_loss": 0.0141 + }, + { + "epoch": 0.19316495159076655, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018495455182617913, + "lm_loss": 2.2345, + "loss": 2.3997, + "mask_loss": 0.1487, + "step": 486, + "topk_loss": 0.0165 + }, + { + "epoch": 0.19356241033889568, + "grad_norm": 0.216796875, + "learning_rate": 0.00018488725208784694, + "lm_loss": 2.2348, + "loss": 2.3944, + "mask_loss": 0.1462, + "step": 487, + "topk_loss": 0.0133 + }, + { + "epoch": 0.19395986908702484, + "grad_norm": 0.150390625, + "learning_rate": 0.00018481981446727977, + "lm_loss": 2.1731, + "loss": 2.3324, + "mask_loss": 0.1465, + "step": 488, + "topk_loss": 0.0128 + }, + { + "epoch": 0.194357327835154, + "grad_norm": 0.1875, + "learning_rate": 0.00018475223907401638, + "lm_loss": 2.1905, + "loss": 2.3522, + "mask_loss": 0.146, + "step": 489, + "topk_loss": 0.0157 + }, + { + "epoch": 0.19475478658328313, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018468452601781948, + "lm_loss": 2.1728, + "loss": 2.3308, + "mask_loss": 0.1447, + "step": 490, + "topk_loss": 0.0133 + }, + { + "epoch": 0.1951522453314123, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001846166754086752, + "lm_loss": 2.197, + "loss": 2.357, + "mask_loss": 0.1457, + "step": 491, + "topk_loss": 0.0143 + }, + { + "epoch": 0.19554970407954142, + "grad_norm": 0.2275390625, + "learning_rate": 0.0001845486873567932, + "lm_loss": 2.1466, + "loss": 2.3035, + "mask_loss": 0.1441, + "step": 492, + "topk_loss": 0.0128 + }, + { + "epoch": 0.19594716282767058, + "grad_norm": 0.1796875, + "learning_rate": 0.00018448056197260636, + "lm_loss": 2.1935, + "loss": 2.3586, + "mask_loss": 0.149, + "step": 493, + "topk_loss": 0.0161 + }, + { + "epoch": 0.19634462157579974, + "grad_norm": 0.1552734375, + "learning_rate": 0.00018441229936677064, + "lm_loss": 2.149, + "loss": 2.3075, + "mask_loss": 0.1457, + "step": 494, + "topk_loss": 0.0129 + }, + { + "epoch": 0.19674208032392887, + "grad_norm": 0.1611328125, + "learning_rate": 0.00018434389965016495, + "lm_loss": 2.1451, + "loss": 2.3038, + "mask_loss": 0.1457, + "step": 495, + "topk_loss": 0.0129 + }, + { + "epoch": 0.19713953907205803, + "grad_norm": 0.1904296875, + "learning_rate": 0.00018427536293389075, + "lm_loss": 2.1242, + "loss": 2.2799, + "mask_loss": 0.144, + "step": 496, + "topk_loss": 0.0118 + }, + { + "epoch": 0.1975369978201872, + "grad_norm": 0.17578125, + "learning_rate": 0.0001842066893292722, + "lm_loss": 2.206, + "loss": 2.3713, + "mask_loss": 0.1482, + "step": 497, + "topk_loss": 0.0172 + }, + { + "epoch": 0.19793445656831632, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001841378789478557, + "lm_loss": 2.1557, + "loss": 2.3138, + "mask_loss": 0.1442, + "step": 498, + "topk_loss": 0.014 + }, + { + "epoch": 0.19833191531644548, + "grad_norm": 0.197265625, + "learning_rate": 0.0001840689319014098, + "lm_loss": 2.1505, + "loss": 2.3097, + "mask_loss": 0.1457, + "step": 499, + "topk_loss": 0.0135 + }, + { + "epoch": 0.19872937406457464, + "grad_norm": 0.1650390625, + "learning_rate": 0.00018399984830192522, + "lm_loss": 2.199, + "loss": 2.3564, + "mask_loss": 0.1445, + "step": 500, + "topk_loss": 0.0129 + }, + { + "epoch": 0.19872937406457464, + "eval_lm_loss": 705.312744140625, + "eval_loss": 705.47412109375, + "eval_mask_hit_rate": 0.4869436025619507, + "eval_mask_loss": 0.1438208818435669, + "eval_mask_top_10_hit_rate": 0.9714464545249939, + "eval_mask_top_1_hit_rate": 0.9930768013000488, + "eval_mask_top_20_hit_rate": 0.9567796587944031, + "eval_mask_top_5_hit_rate": 0.9805311560630798, + "eval_runtime": 144.86, + "eval_samples_per_second": 14.138, + "eval_steps_per_second": 7.069, + "eval_token_accuracy": 0.5837854146957397, + "eval_top_k_diff": -524.4144287109375, + "eval_topk_loss": 0.017527751624584198, + "step": 500 + }, + { + "epoch": 0.19912683281270377, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018393062826161418, + "lm_loss": 2.174, + "loss": 2.3281, + "mask_loss": 0.1425, + "step": 501, + "topk_loss": 0.0116 + }, + { + "epoch": 0.19952429156083293, + "grad_norm": 0.1875, + "learning_rate": 0.00018386127189291084, + "lm_loss": 2.1827, + "loss": 2.3404, + "mask_loss": 0.1451, + "step": 502, + "topk_loss": 0.0126 + }, + { + "epoch": 0.1999217503089621, + "grad_norm": 0.1591796875, + "learning_rate": 0.0001837917793084705, + "lm_loss": 2.2056, + "loss": 2.363, + "mask_loss": 0.1436, + "step": 503, + "topk_loss": 0.0138 + }, + { + "epoch": 0.20031920905709122, + "grad_norm": 0.18359375, + "learning_rate": 0.00018372215062116998, + "lm_loss": 2.1284, + "loss": 2.2903, + "mask_loss": 0.1454, + "step": 504, + "topk_loss": 0.0165 + }, + { + "epoch": 0.20071666780522038, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018365238594410696, + "lm_loss": 2.0537, + "loss": 2.2151, + "mask_loss": 0.1473, + "step": 505, + "topk_loss": 0.0141 + }, + { + "epoch": 0.2011141265533495, + "grad_norm": 0.166015625, + "learning_rate": 0.00018358248539060017, + "lm_loss": 2.164, + "loss": 2.3251, + "mask_loss": 0.1462, + "step": 506, + "topk_loss": 0.0149 + }, + { + "epoch": 0.20151158530147867, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018351244907418893, + "lm_loss": 2.1705, + "loss": 2.3272, + "mask_loss": 0.1424, + "step": 507, + "topk_loss": 0.0142 + }, + { + "epoch": 0.20190904404960783, + "grad_norm": 0.1669921875, + "learning_rate": 0.00018344227710863316, + "lm_loss": 2.1265, + "loss": 2.2824, + "mask_loss": 0.1426, + "step": 508, + "topk_loss": 0.0133 + }, + { + "epoch": 0.20230650279773696, + "grad_norm": 0.1630859375, + "learning_rate": 0.00018337196960791302, + "lm_loss": 2.109, + "loss": 2.2652, + "mask_loss": 0.1434, + "step": 509, + "topk_loss": 0.0128 + }, + { + "epoch": 0.20270396154586612, + "grad_norm": 0.1982421875, + "learning_rate": 0.00018330152668622892, + "lm_loss": 2.1532, + "loss": 2.3208, + "mask_loss": 0.1476, + "step": 510, + "topk_loss": 0.0201 + }, + { + "epoch": 0.20310142029399528, + "grad_norm": 0.158203125, + "learning_rate": 0.00018323094845800123, + "lm_loss": 2.1261, + "loss": 2.2814, + "mask_loss": 0.1429, + "step": 511, + "topk_loss": 0.0123 + }, + { + "epoch": 0.2034988790421244, + "grad_norm": 0.1787109375, + "learning_rate": 0.00018316023503786997, + "lm_loss": 2.1097, + "loss": 2.2726, + "mask_loss": 0.1463, + "step": 512, + "topk_loss": 0.0165 + }, + { + "epoch": 0.20389633779025357, + "grad_norm": 0.1591796875, + "learning_rate": 0.00018308938654069487, + "lm_loss": 2.2056, + "loss": 2.3602, + "mask_loss": 0.1418, + "step": 513, + "topk_loss": 0.0128 + }, + { + "epoch": 0.20429379653838273, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018301840308155507, + "lm_loss": 2.1918, + "loss": 2.3471, + "mask_loss": 0.1432, + "step": 514, + "topk_loss": 0.0121 + }, + { + "epoch": 0.20469125528651186, + "grad_norm": 0.181640625, + "learning_rate": 0.00018294728477574886, + "lm_loss": 2.2104, + "loss": 2.365, + "mask_loss": 0.1416, + "step": 515, + "topk_loss": 0.013 + }, + { + "epoch": 0.20508871403464102, + "grad_norm": 0.15625, + "learning_rate": 0.00018287603173879364, + "lm_loss": 2.192, + "loss": 2.3463, + "mask_loss": 0.1417, + "step": 516, + "topk_loss": 0.0125 + }, + { + "epoch": 0.20548617278277018, + "grad_norm": 0.15625, + "learning_rate": 0.00018280464408642556, + "lm_loss": 2.1694, + "loss": 2.3299, + "mask_loss": 0.1452, + "step": 517, + "topk_loss": 0.0153 + }, + { + "epoch": 0.2058836315308993, + "grad_norm": 0.171875, + "learning_rate": 0.00018273312193459952, + "lm_loss": 2.2149, + "loss": 2.3693, + "mask_loss": 0.1418, + "step": 518, + "topk_loss": 0.0125 + }, + { + "epoch": 0.20628109027902847, + "grad_norm": 0.169921875, + "learning_rate": 0.00018266146539948878, + "lm_loss": 2.1073, + "loss": 2.267, + "mask_loss": 0.1466, + "step": 519, + "topk_loss": 0.013 + }, + { + "epoch": 0.2066785490271576, + "grad_norm": 0.17578125, + "learning_rate": 0.000182589674597485, + "lm_loss": 2.1863, + "loss": 2.3462, + "mask_loss": 0.1452, + "step": 520, + "topk_loss": 0.0147 + }, + { + "epoch": 0.20707600777528676, + "grad_norm": 0.216796875, + "learning_rate": 0.00018251774964519785, + "lm_loss": 2.174, + "loss": 2.3344, + "mask_loss": 0.1461, + "step": 521, + "topk_loss": 0.0144 + }, + { + "epoch": 0.20747346652341592, + "grad_norm": 0.146484375, + "learning_rate": 0.00018244569065945494, + "lm_loss": 2.1521, + "loss": 2.3099, + "mask_loss": 0.144, + "step": 522, + "topk_loss": 0.0138 + }, + { + "epoch": 0.20787092527154505, + "grad_norm": 0.15625, + "learning_rate": 0.00018237349775730152, + "lm_loss": 2.1465, + "loss": 2.3039, + "mask_loss": 0.1437, + "step": 523, + "topk_loss": 0.0137 + }, + { + "epoch": 0.2082683840196742, + "grad_norm": 0.1435546875, + "learning_rate": 0.00018230117105600047, + "lm_loss": 2.1609, + "loss": 2.3177, + "mask_loss": 0.1445, + "step": 524, + "topk_loss": 0.0123 + }, + { + "epoch": 0.20866584276780337, + "grad_norm": 0.162109375, + "learning_rate": 0.00018222871067303192, + "lm_loss": 2.1812, + "loss": 2.3414, + "mask_loss": 0.1465, + "step": 525, + "topk_loss": 0.0136 + }, + { + "epoch": 0.2090633015159325, + "grad_norm": 0.1455078125, + "learning_rate": 0.00018215611672609317, + "lm_loss": 2.1737, + "loss": 2.3348, + "mask_loss": 0.1469, + "step": 526, + "topk_loss": 0.0143 + }, + { + "epoch": 0.20946076026406166, + "grad_norm": 0.25390625, + "learning_rate": 0.00018208338933309843, + "lm_loss": 2.1444, + "loss": 2.316, + "mask_loss": 0.1503, + "step": 527, + "topk_loss": 0.0213 + }, + { + "epoch": 0.20985821901219082, + "grad_norm": 0.203125, + "learning_rate": 0.0001820105286121787, + "lm_loss": 2.0979, + "loss": 2.2516, + "mask_loss": 0.1413, + "step": 528, + "topk_loss": 0.0124 + }, + { + "epoch": 0.21025567776031995, + "grad_norm": 0.2294921875, + "learning_rate": 0.00018193753468168154, + "lm_loss": 2.0986, + "loss": 2.2581, + "mask_loss": 0.1445, + "step": 529, + "topk_loss": 0.015 + }, + { + "epoch": 0.2106531365084491, + "grad_norm": 0.162109375, + "learning_rate": 0.0001818644076601709, + "lm_loss": 2.2358, + "loss": 2.3915, + "mask_loss": 0.1417, + "step": 530, + "topk_loss": 0.014 + }, + { + "epoch": 0.21105059525657827, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001817911476664269, + "lm_loss": 2.1233, + "loss": 2.2813, + "mask_loss": 0.1459, + "step": 531, + "topk_loss": 0.0121 + }, + { + "epoch": 0.2114480540047074, + "grad_norm": 0.158203125, + "learning_rate": 0.00018171775481944563, + "lm_loss": 2.1246, + "loss": 2.2851, + "mask_loss": 0.1454, + "step": 532, + "topk_loss": 0.0151 + }, + { + "epoch": 0.21184551275283656, + "grad_norm": 0.1865234375, + "learning_rate": 0.00018164422923843893, + "lm_loss": 2.1321, + "loss": 2.291, + "mask_loss": 0.1458, + "step": 533, + "topk_loss": 0.0131 + }, + { + "epoch": 0.2122429715009657, + "grad_norm": 0.169921875, + "learning_rate": 0.00018157057104283431, + "lm_loss": 2.1354, + "loss": 2.2932, + "mask_loss": 0.1449, + "step": 534, + "topk_loss": 0.0129 + }, + { + "epoch": 0.21264043024909485, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018149678035227473, + "lm_loss": 2.0942, + "loss": 2.2499, + "mask_loss": 0.1424, + "step": 535, + "topk_loss": 0.0132 + }, + { + "epoch": 0.213037888997224, + "grad_norm": 0.240234375, + "learning_rate": 0.0001814228572866182, + "lm_loss": 2.1389, + "loss": 2.2964, + "mask_loss": 0.1438, + "step": 536, + "topk_loss": 0.0137 + }, + { + "epoch": 0.21343534774535314, + "grad_norm": 0.1943359375, + "learning_rate": 0.00018134880196593795, + "lm_loss": 2.1755, + "loss": 2.3365, + "mask_loss": 0.1442, + "step": 537, + "topk_loss": 0.0167 + }, + { + "epoch": 0.2138328064934823, + "grad_norm": 0.2060546875, + "learning_rate": 0.00018127461451052183, + "lm_loss": 2.1824, + "loss": 2.3344, + "mask_loss": 0.141, + "step": 538, + "topk_loss": 0.011 + }, + { + "epoch": 0.21423026524161146, + "grad_norm": 0.181640625, + "learning_rate": 0.00018120029504087247, + "lm_loss": 2.1616, + "loss": 2.3154, + "mask_loss": 0.141, + "step": 539, + "topk_loss": 0.0128 + }, + { + "epoch": 0.2146277239897406, + "grad_norm": 0.154296875, + "learning_rate": 0.00018112584367770685, + "lm_loss": 2.1107, + "loss": 2.2671, + "mask_loss": 0.1428, + "step": 540, + "topk_loss": 0.0136 + }, + { + "epoch": 0.21502518273786975, + "grad_norm": 0.2197265625, + "learning_rate": 0.00018105126054195617, + "lm_loss": 2.1106, + "loss": 2.2669, + "mask_loss": 0.1431, + "step": 541, + "topk_loss": 0.0132 + }, + { + "epoch": 0.2154226414859989, + "grad_norm": 0.1689453125, + "learning_rate": 0.00018097654575476577, + "lm_loss": 2.1814, + "loss": 2.3328, + "mask_loss": 0.1399, + "step": 542, + "topk_loss": 0.0115 + }, + { + "epoch": 0.21582010023412804, + "grad_norm": 0.1533203125, + "learning_rate": 0.00018090169943749476, + "lm_loss": 2.1029, + "loss": 2.2617, + "mask_loss": 0.1445, + "step": 543, + "topk_loss": 0.0143 + }, + { + "epoch": 0.2162175589822572, + "grad_norm": 0.1884765625, + "learning_rate": 0.00018082672171171584, + "lm_loss": 2.1275, + "loss": 2.2836, + "mask_loss": 0.1421, + "step": 544, + "topk_loss": 0.014 + }, + { + "epoch": 0.21661501773038636, + "grad_norm": 0.2236328125, + "learning_rate": 0.00018075161269921527, + "lm_loss": 2.1511, + "loss": 2.303, + "mask_loss": 0.1411, + "step": 545, + "topk_loss": 0.0108 + }, + { + "epoch": 0.2170124764785155, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001806763725219925, + "lm_loss": 2.1007, + "loss": 2.2545, + "mask_loss": 0.1413, + "step": 546, + "topk_loss": 0.0125 + }, + { + "epoch": 0.21740993522664465, + "grad_norm": 0.1728515625, + "learning_rate": 0.00018060100130226002, + "lm_loss": 2.1629, + "loss": 2.3212, + "mask_loss": 0.1444, + "step": 547, + "topk_loss": 0.0139 + }, + { + "epoch": 0.21780739397477378, + "grad_norm": 0.1923828125, + "learning_rate": 0.0001805254991624432, + "lm_loss": 2.1271, + "loss": 2.2895, + "mask_loss": 0.1456, + "step": 548, + "topk_loss": 0.0169 + }, + { + "epoch": 0.21820485272290294, + "grad_norm": 0.18359375, + "learning_rate": 0.00018044986622518002, + "lm_loss": 2.1949, + "loss": 2.3501, + "mask_loss": 0.141, + "step": 549, + "topk_loss": 0.0142 + }, + { + "epoch": 0.2186023114710321, + "grad_norm": 0.173828125, + "learning_rate": 0.000180374102613321, + "lm_loss": 2.1611, + "loss": 2.321, + "mask_loss": 0.1446, + "step": 550, + "topk_loss": 0.0153 + }, + { + "epoch": 0.2186023114710321, + "eval_lm_loss": 704.4793701171875, + "eval_loss": 704.6373291015625, + "eval_mask_hit_rate": 0.49326711893081665, + "eval_mask_loss": 0.14139045774936676, + "eval_mask_top_10_hit_rate": 0.9738826751708984, + "eval_mask_top_1_hit_rate": 0.9939601421356201, + "eval_mask_top_20_hit_rate": 0.959991455078125, + "eval_mask_top_5_hit_rate": 0.9823801517486572, + "eval_runtime": 143.9303, + "eval_samples_per_second": 14.229, + "eval_steps_per_second": 7.115, + "eval_token_accuracy": 0.5886777639389038, + "eval_top_k_diff": -542.8375244140625, + "eval_topk_loss": 0.01653169095516205, + "step": 550 + }, + { + "epoch": 0.21899977021916123, + "grad_norm": 0.17578125, + "learning_rate": 0.00018029820844992883, + "lm_loss": 2.1458, + "loss": 2.3007, + "mask_loss": 0.1428, + "step": 551, + "topk_loss": 0.0122 + }, + { + "epoch": 0.2193972289672904, + "grad_norm": 0.1708984375, + "learning_rate": 0.00018022218385827828, + "lm_loss": 2.1524, + "loss": 2.308, + "mask_loss": 0.1433, + "step": 552, + "topk_loss": 0.0124 + }, + { + "epoch": 0.21979468771541955, + "grad_norm": 0.1513671875, + "learning_rate": 0.00018014602896185595, + "lm_loss": 2.1102, + "loss": 2.2644, + "mask_loss": 0.1418, + "step": 553, + "topk_loss": 0.0124 + }, + { + "epoch": 0.22019214646354868, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001800697438843602, + "lm_loss": 2.1335, + "loss": 2.2837, + "mask_loss": 0.1388, + "step": 554, + "topk_loss": 0.0114 + }, + { + "epoch": 0.22058960521167784, + "grad_norm": 0.181640625, + "learning_rate": 0.0001799933287497007, + "lm_loss": 2.1569, + "loss": 2.3133, + "mask_loss": 0.1417, + "step": 555, + "topk_loss": 0.0146 + }, + { + "epoch": 0.220987063959807, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017991678368199846, + "lm_loss": 2.2034, + "loss": 2.3606, + "mask_loss": 0.1401, + "step": 556, + "topk_loss": 0.0171 + }, + { + "epoch": 0.22138452270793613, + "grad_norm": 0.14453125, + "learning_rate": 0.00017984010880558554, + "lm_loss": 2.129, + "loss": 2.2839, + "mask_loss": 0.1419, + "step": 557, + "topk_loss": 0.0131 + }, + { + "epoch": 0.22178198145606529, + "grad_norm": 0.2890625, + "learning_rate": 0.00017976330424500478, + "lm_loss": 2.0838, + "loss": 2.244, + "mask_loss": 0.1447, + "step": 558, + "topk_loss": 0.0155 + }, + { + "epoch": 0.22217944020419444, + "grad_norm": 0.1708984375, + "learning_rate": 0.00017968637012500972, + "lm_loss": 2.0653, + "loss": 2.2237, + "mask_loss": 0.1442, + "step": 559, + "topk_loss": 0.0142 + }, + { + "epoch": 0.22257689895232358, + "grad_norm": 0.1806640625, + "learning_rate": 0.00017960930657056438, + "lm_loss": 2.1554, + "loss": 2.3135, + "mask_loss": 0.1431, + "step": 560, + "topk_loss": 0.015 + }, + { + "epoch": 0.22297435770045274, + "grad_norm": 0.22265625, + "learning_rate": 0.00017953211370684295, + "lm_loss": 2.1087, + "loss": 2.2637, + "mask_loss": 0.1415, + "step": 561, + "topk_loss": 0.0136 + }, + { + "epoch": 0.22337181644858187, + "grad_norm": 0.1640625, + "learning_rate": 0.00017945479165922966, + "lm_loss": 2.141, + "loss": 2.292, + "mask_loss": 0.138, + "step": 562, + "topk_loss": 0.0129 + }, + { + "epoch": 0.22376927519671103, + "grad_norm": 0.181640625, + "learning_rate": 0.0001793773405533186, + "lm_loss": 2.1095, + "loss": 2.2669, + "mask_loss": 0.1423, + "step": 563, + "topk_loss": 0.015 + }, + { + "epoch": 0.22416673394484019, + "grad_norm": 0.58203125, + "learning_rate": 0.0001792997605149135, + "lm_loss": 2.1214, + "loss": 2.3774, + "mask_loss": 0.201, + "step": 564, + "topk_loss": 0.0549 + }, + { + "epoch": 0.22456419269296932, + "grad_norm": 0.2451171875, + "learning_rate": 0.00017922205167002754, + "lm_loss": 2.0831, + "loss": 2.2377, + "mask_loss": 0.1414, + "step": 565, + "topk_loss": 0.0133 + }, + { + "epoch": 0.22496165144109848, + "grad_norm": 0.228515625, + "learning_rate": 0.00017914421414488298, + "lm_loss": 2.1192, + "loss": 2.2754, + "mask_loss": 0.1429, + "step": 566, + "topk_loss": 0.0133 + }, + { + "epoch": 0.22535911018922763, + "grad_norm": 0.1728515625, + "learning_rate": 0.00017906624806591126, + "lm_loss": 2.1005, + "loss": 2.2539, + "mask_loss": 0.1405, + "step": 567, + "topk_loss": 0.0129 + }, + { + "epoch": 0.22575656893735677, + "grad_norm": 0.169921875, + "learning_rate": 0.00017898815355975255, + "lm_loss": 2.11, + "loss": 2.2656, + "mask_loss": 0.1429, + "step": 568, + "topk_loss": 0.0127 + }, + { + "epoch": 0.22615402768548593, + "grad_norm": 0.166015625, + "learning_rate": 0.00017890993075325565, + "lm_loss": 2.1541, + "loss": 2.3053, + "mask_loss": 0.14, + "step": 569, + "topk_loss": 0.0112 + }, + { + "epoch": 0.22655148643361508, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017883157977347774, + "lm_loss": 2.0569, + "loss": 2.2116, + "mask_loss": 0.1428, + "step": 570, + "topk_loss": 0.0119 + }, + { + "epoch": 0.22694894518174422, + "grad_norm": 0.201171875, + "learning_rate": 0.00017875310074768418, + "lm_loss": 2.1304, + "loss": 2.2855, + "mask_loss": 0.1429, + "step": 571, + "topk_loss": 0.0123 + }, + { + "epoch": 0.22734640392987338, + "grad_norm": 0.15234375, + "learning_rate": 0.00017867449380334834, + "lm_loss": 2.1621, + "loss": 2.318, + "mask_loss": 0.1417, + "step": 572, + "topk_loss": 0.0142 + }, + { + "epoch": 0.22774386267800253, + "grad_norm": 0.1689453125, + "learning_rate": 0.00017859575906815139, + "lm_loss": 2.0412, + "loss": 2.1965, + "mask_loss": 0.1427, + "step": 573, + "topk_loss": 0.0126 + }, + { + "epoch": 0.22814132142613167, + "grad_norm": 0.197265625, + "learning_rate": 0.00017851689666998198, + "lm_loss": 2.0992, + "loss": 2.2534, + "mask_loss": 0.1416, + "step": 574, + "topk_loss": 0.0127 + }, + { + "epoch": 0.22853878017426082, + "grad_norm": 0.1640625, + "learning_rate": 0.00017843790673693625, + "lm_loss": 2.1353, + "loss": 2.2892, + "mask_loss": 0.1407, + "step": 575, + "topk_loss": 0.0132 + }, + { + "epoch": 0.22893623892238996, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001783587893973174, + "lm_loss": 2.1064, + "loss": 2.2618, + "mask_loss": 0.141, + "step": 576, + "topk_loss": 0.0143 + }, + { + "epoch": 0.22933369767051912, + "grad_norm": 0.181640625, + "learning_rate": 0.00017827954477963557, + "lm_loss": 2.1688, + "loss": 2.3245, + "mask_loss": 0.1431, + "step": 577, + "topk_loss": 0.0125 + }, + { + "epoch": 0.22973115641864827, + "grad_norm": 0.2138671875, + "learning_rate": 0.00017820017301260776, + "lm_loss": 2.015, + "loss": 2.1693, + "mask_loss": 0.1418, + "step": 578, + "topk_loss": 0.0126 + }, + { + "epoch": 0.2301286151667774, + "grad_norm": 0.16015625, + "learning_rate": 0.00017812067422515732, + "lm_loss": 2.1014, + "loss": 2.2561, + "mask_loss": 0.1435, + "step": 579, + "topk_loss": 0.0113 + }, + { + "epoch": 0.23052607391490657, + "grad_norm": 0.140625, + "learning_rate": 0.00017804104854641408, + "lm_loss": 2.1788, + "loss": 2.333, + "mask_loss": 0.1413, + "step": 580, + "topk_loss": 0.0129 + }, + { + "epoch": 0.23092353266303572, + "grad_norm": 0.251953125, + "learning_rate": 0.00017796129610571384, + "lm_loss": 2.1014, + "loss": 2.258, + "mask_loss": 0.143, + "step": 581, + "topk_loss": 0.0136 + }, + { + "epoch": 0.23132099141116486, + "grad_norm": 0.21875, + "learning_rate": 0.0001778814170325984, + "lm_loss": 2.1588, + "loss": 2.3118, + "mask_loss": 0.1398, + "step": 582, + "topk_loss": 0.0132 + }, + { + "epoch": 0.23171845015929401, + "grad_norm": 0.1435546875, + "learning_rate": 0.0001778014114568153, + "lm_loss": 2.0827, + "loss": 2.2349, + "mask_loss": 0.1409, + "step": 583, + "topk_loss": 0.0113 + }, + { + "epoch": 0.23211590890742317, + "grad_norm": 0.158203125, + "learning_rate": 0.00017772127950831733, + "lm_loss": 2.0314, + "loss": 2.1852, + "mask_loss": 0.1416, + "step": 584, + "topk_loss": 0.0122 + }, + { + "epoch": 0.2325133676555523, + "grad_norm": 0.2314453125, + "learning_rate": 0.0001776410213172628, + "lm_loss": 2.1545, + "loss": 2.3171, + "mask_loss": 0.1438, + "step": 585, + "topk_loss": 0.0188 + }, + { + "epoch": 0.23291082640368146, + "grad_norm": 0.240234375, + "learning_rate": 0.00017756063701401492, + "lm_loss": 2.0725, + "loss": 2.2282, + "mask_loss": 0.1413, + "step": 586, + "topk_loss": 0.0144 + }, + { + "epoch": 0.23330828515181062, + "grad_norm": 0.162109375, + "learning_rate": 0.00017748012672914176, + "lm_loss": 2.0879, + "loss": 2.2427, + "mask_loss": 0.1416, + "step": 587, + "topk_loss": 0.0132 + }, + { + "epoch": 0.23370574389993976, + "grad_norm": 0.1787109375, + "learning_rate": 0.00017739949059341617, + "lm_loss": 2.0445, + "loss": 2.2004, + "mask_loss": 0.142, + "step": 588, + "topk_loss": 0.0139 + }, + { + "epoch": 0.23410320264806891, + "grad_norm": 0.2041015625, + "learning_rate": 0.00017731872873781517, + "lm_loss": 2.097, + "loss": 2.2535, + "mask_loss": 0.1421, + "step": 589, + "topk_loss": 0.0145 + }, + { + "epoch": 0.23450066139619805, + "grad_norm": 0.19921875, + "learning_rate": 0.00017723784129352018, + "lm_loss": 2.1099, + "loss": 2.2612, + "mask_loss": 0.1395, + "step": 590, + "topk_loss": 0.0119 + }, + { + "epoch": 0.2348981201443272, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001771568283919166, + "lm_loss": 2.0986, + "loss": 2.2533, + "mask_loss": 0.1422, + "step": 591, + "topk_loss": 0.0124 + }, + { + "epoch": 0.23529557889245636, + "grad_norm": 0.255859375, + "learning_rate": 0.00017707569016459348, + "lm_loss": 2.1375, + "loss": 2.2998, + "mask_loss": 0.1441, + "step": 592, + "topk_loss": 0.0182 + }, + { + "epoch": 0.2356930376405855, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017699442674334358, + "lm_loss": 2.1091, + "loss": 2.2608, + "mask_loss": 0.1403, + "step": 593, + "topk_loss": 0.0113 + }, + { + "epoch": 0.23609049638871465, + "grad_norm": 0.18359375, + "learning_rate": 0.0001769130382601629, + "lm_loss": 2.1533, + "loss": 2.3117, + "mask_loss": 0.1425, + "step": 594, + "topk_loss": 0.016 + }, + { + "epoch": 0.2364879551368438, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017683152484725066, + "lm_loss": 2.0379, + "loss": 2.1905, + "mask_loss": 0.1403, + "step": 595, + "topk_loss": 0.0122 + }, + { + "epoch": 0.23688541388497295, + "grad_norm": 0.150390625, + "learning_rate": 0.00017674988663700898, + "lm_loss": 2.0963, + "loss": 2.2508, + "mask_loss": 0.1414, + "step": 596, + "topk_loss": 0.0131 + }, + { + "epoch": 0.2372828726331021, + "grad_norm": 0.162109375, + "learning_rate": 0.00017666812376204266, + "lm_loss": 2.0917, + "loss": 2.2451, + "mask_loss": 0.1403, + "step": 597, + "topk_loss": 0.013 + }, + { + "epoch": 0.23768033138123126, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017658623635515906, + "lm_loss": 2.092, + "loss": 2.2446, + "mask_loss": 0.1392, + "step": 598, + "topk_loss": 0.0134 + }, + { + "epoch": 0.2380777901293604, + "grad_norm": 0.1484375, + "learning_rate": 0.00017650422454936772, + "lm_loss": 2.1547, + "loss": 2.3048, + "mask_loss": 0.139, + "step": 599, + "topk_loss": 0.0111 + }, + { + "epoch": 0.23847524887748955, + "grad_norm": 0.169921875, + "learning_rate": 0.00017642208847788032, + "lm_loss": 2.173, + "loss": 2.3238, + "mask_loss": 0.1388, + "step": 600, + "topk_loss": 0.012 + }, + { + "epoch": 0.23847524887748955, + "eval_lm_loss": 699.9920043945312, + "eval_loss": 700.147216796875, + "eval_mask_hit_rate": 0.49874863028526306, + "eval_mask_loss": 0.1390666365623474, + "eval_mask_top_10_hit_rate": 0.975847601890564, + "eval_mask_top_1_hit_rate": 0.9946634769439697, + "eval_mask_top_20_hit_rate": 0.9626063108444214, + "eval_mask_top_5_hit_rate": 0.9838546514511108, + "eval_runtime": 143.9171, + "eval_samples_per_second": 14.23, + "eval_steps_per_second": 7.115, + "eval_token_accuracy": 0.5928311347961426, + "eval_top_k_diff": -522.0046997070312, + "eval_topk_loss": 0.0161855798214674, + "step": 600 + }, + { + "epoch": 0.2388727076256187, + "grad_norm": 0.169921875, + "learning_rate": 0.00017633982827411032, + "lm_loss": 2.0913, + "loss": 2.2444, + "mask_loss": 0.14, + "step": 601, + "topk_loss": 0.0131 + }, + { + "epoch": 0.23927016637374784, + "grad_norm": 0.1494140625, + "learning_rate": 0.00017625744407167288, + "lm_loss": 2.0633, + "loss": 2.2181, + "mask_loss": 0.1417, + "step": 602, + "topk_loss": 0.0131 + }, + { + "epoch": 0.239667625121877, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001761749360043845, + "lm_loss": 2.0286, + "loss": 2.1887, + "mask_loss": 0.1452, + "step": 603, + "topk_loss": 0.0149 + }, + { + "epoch": 0.24006508387000614, + "grad_norm": 0.1904296875, + "learning_rate": 0.00017609230420626297, + "lm_loss": 2.0473, + "loss": 2.1985, + "mask_loss": 0.1397, + "step": 604, + "topk_loss": 0.0116 + }, + { + "epoch": 0.2404625426181353, + "grad_norm": 0.146484375, + "learning_rate": 0.00017600954881152693, + "lm_loss": 2.0933, + "loss": 2.2414, + "mask_loss": 0.1379, + "step": 605, + "topk_loss": 0.0102 + }, + { + "epoch": 0.24086000136626445, + "grad_norm": 0.154296875, + "learning_rate": 0.0001759266699545959, + "lm_loss": 2.1098, + "loss": 2.263, + "mask_loss": 0.1397, + "step": 606, + "topk_loss": 0.0136 + }, + { + "epoch": 0.24125746011439358, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017584366777008984, + "lm_loss": 2.0171, + "loss": 2.1688, + "mask_loss": 0.1391, + "step": 607, + "topk_loss": 0.0126 + }, + { + "epoch": 0.24165491886252274, + "grad_norm": 0.15234375, + "learning_rate": 0.0001757605423928291, + "lm_loss": 2.1159, + "loss": 2.2668, + "mask_loss": 0.138, + "step": 608, + "topk_loss": 0.0128 + }, + { + "epoch": 0.2420523776106519, + "grad_norm": 0.154296875, + "learning_rate": 0.00017567729395783405, + "lm_loss": 2.0309, + "loss": 2.1834, + "mask_loss": 0.1404, + "step": 609, + "topk_loss": 0.012 + }, + { + "epoch": 0.24244983635878103, + "grad_norm": 0.1474609375, + "learning_rate": 0.00017559392260032506, + "lm_loss": 2.072, + "loss": 2.2261, + "mask_loss": 0.1417, + "step": 610, + "topk_loss": 0.0125 + }, + { + "epoch": 0.2428472951069102, + "grad_norm": 0.158203125, + "learning_rate": 0.00017551042845572208, + "lm_loss": 2.0899, + "loss": 2.2414, + "mask_loss": 0.1399, + "step": 611, + "topk_loss": 0.0116 + }, + { + "epoch": 0.24324475385503935, + "grad_norm": 0.1435546875, + "learning_rate": 0.0001754268116596445, + "lm_loss": 2.0962, + "loss": 2.2487, + "mask_loss": 0.1398, + "step": 612, + "topk_loss": 0.0127 + }, + { + "epoch": 0.24364221260316848, + "grad_norm": 0.1640625, + "learning_rate": 0.00017534307234791098, + "lm_loss": 2.1056, + "loss": 2.2622, + "mask_loss": 0.1442, + "step": 613, + "topk_loss": 0.0124 + }, + { + "epoch": 0.24403967135129764, + "grad_norm": 0.1416015625, + "learning_rate": 0.00017525921065653918, + "lm_loss": 2.1073, + "loss": 2.257, + "mask_loss": 0.1376, + "step": 614, + "topk_loss": 0.0121 + }, + { + "epoch": 0.2444371300994268, + "grad_norm": 0.177734375, + "learning_rate": 0.00017517522672174548, + "lm_loss": 2.0809, + "loss": 2.2406, + "mask_loss": 0.1426, + "step": 615, + "topk_loss": 0.0172 + }, + { + "epoch": 0.24483458884755593, + "grad_norm": 0.259765625, + "learning_rate": 0.00017509112067994487, + "lm_loss": 2.1177, + "loss": 2.2708, + "mask_loss": 0.1405, + "step": 616, + "topk_loss": 0.0125 + }, + { + "epoch": 0.2452320475956851, + "grad_norm": 0.138671875, + "learning_rate": 0.00017500689266775063, + "lm_loss": 2.0753, + "loss": 2.2273, + "mask_loss": 0.1397, + "step": 617, + "topk_loss": 0.0123 + }, + { + "epoch": 0.24562950634381422, + "grad_norm": 0.140625, + "learning_rate": 0.00017492254282197424, + "lm_loss": 2.1113, + "loss": 2.2631, + "mask_loss": 0.1406, + "step": 618, + "topk_loss": 0.0113 + }, + { + "epoch": 0.24602696509194338, + "grad_norm": 0.1787109375, + "learning_rate": 0.00017483807127962502, + "lm_loss": 2.0832, + "loss": 2.2345, + "mask_loss": 0.1396, + "step": 619, + "topk_loss": 0.0117 + }, + { + "epoch": 0.24642442384007254, + "grad_norm": 0.15625, + "learning_rate": 0.00017475347817790996, + "lm_loss": 2.096, + "loss": 2.2442, + "mask_loss": 0.1382, + "step": 620, + "topk_loss": 0.0099 + }, + { + "epoch": 0.24682188258820167, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001746687636542335, + "lm_loss": 2.0826, + "loss": 2.2342, + "mask_loss": 0.1389, + "step": 621, + "topk_loss": 0.0127 + }, + { + "epoch": 0.24721934133633083, + "grad_norm": 0.166015625, + "learning_rate": 0.00017458392784619735, + "lm_loss": 2.1393, + "loss": 2.2944, + "mask_loss": 0.1395, + "step": 622, + "topk_loss": 0.0156 + }, + { + "epoch": 0.24761680008446, + "grad_norm": 0.173828125, + "learning_rate": 0.00017449897089160014, + "lm_loss": 2.0229, + "loss": 2.1723, + "mask_loss": 0.1387, + "step": 623, + "topk_loss": 0.0106 + }, + { + "epoch": 0.24801425883258912, + "grad_norm": 0.1767578125, + "learning_rate": 0.00017441389292843733, + "lm_loss": 2.1188, + "loss": 2.2722, + "mask_loss": 0.1396, + "step": 624, + "topk_loss": 0.0137 + }, + { + "epoch": 0.24841171758071828, + "grad_norm": 0.1552734375, + "learning_rate": 0.00017432869409490093, + "lm_loss": 2.148, + "loss": 2.3017, + "mask_loss": 0.1405, + "step": 625, + "topk_loss": 0.0131 + }, + { + "epoch": 0.24880917632884744, + "grad_norm": 0.19921875, + "learning_rate": 0.0001742433745293793, + "lm_loss": 2.1267, + "loss": 2.2854, + "mask_loss": 0.1423, + "step": 626, + "topk_loss": 0.0163 + }, + { + "epoch": 0.24920663507697657, + "grad_norm": 0.2060546875, + "learning_rate": 0.00017415793437045685, + "lm_loss": 2.1103, + "loss": 2.2611, + "mask_loss": 0.1395, + "step": 627, + "topk_loss": 0.0113 + }, + { + "epoch": 0.24960409382510573, + "grad_norm": 0.158203125, + "learning_rate": 0.00017407237375691392, + "lm_loss": 2.0104, + "loss": 2.1611, + "mask_loss": 0.1391, + "step": 628, + "topk_loss": 0.0116 + }, + { + "epoch": 0.25000155257323486, + "grad_norm": 0.19140625, + "learning_rate": 0.00017398669282772645, + "lm_loss": 2.0692, + "loss": 2.2181, + "mask_loss": 0.1372, + "step": 629, + "topk_loss": 0.0117 + }, + { + "epoch": 0.25039901132136405, + "grad_norm": 0.193359375, + "learning_rate": 0.00017390089172206592, + "lm_loss": 2.0427, + "loss": 2.1912, + "mask_loss": 0.1357, + "step": 630, + "topk_loss": 0.0128 + }, + { + "epoch": 0.2507964700694932, + "grad_norm": 0.19140625, + "learning_rate": 0.00017381497057929884, + "lm_loss": 2.0333, + "loss": 2.1853, + "mask_loss": 0.1395, + "step": 631, + "topk_loss": 0.0125 + }, + { + "epoch": 0.2511939288176223, + "grad_norm": 0.1630859375, + "learning_rate": 0.00017372892953898688, + "lm_loss": 2.1063, + "loss": 2.2558, + "mask_loss": 0.1371, + "step": 632, + "topk_loss": 0.0124 + }, + { + "epoch": 0.2515913875657515, + "grad_norm": 0.1953125, + "learning_rate": 0.00017364276874088633, + "lm_loss": 2.0446, + "loss": 2.1945, + "mask_loss": 0.1377, + "step": 633, + "topk_loss": 0.0122 + }, + { + "epoch": 0.25198884631388063, + "grad_norm": 0.205078125, + "learning_rate": 0.00017355648832494803, + "lm_loss": 2.0793, + "loss": 2.2304, + "mask_loss": 0.1373, + "step": 634, + "topk_loss": 0.0138 + }, + { + "epoch": 0.25238630506200976, + "grad_norm": 0.162109375, + "learning_rate": 0.00017347008843131712, + "lm_loss": 2.0977, + "loss": 2.2481, + "mask_loss": 0.1377, + "step": 635, + "topk_loss": 0.0128 + }, + { + "epoch": 0.25278376381013895, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001733835692003329, + "lm_loss": 2.053, + "loss": 2.2053, + "mask_loss": 0.1398, + "step": 636, + "topk_loss": 0.0125 + }, + { + "epoch": 0.2531812225582681, + "grad_norm": 0.16796875, + "learning_rate": 0.0001732969307725283, + "lm_loss": 1.9659, + "loss": 2.1155, + "mask_loss": 0.1381, + "step": 637, + "topk_loss": 0.0115 + }, + { + "epoch": 0.2535786813063972, + "grad_norm": 0.1484375, + "learning_rate": 0.00017321017328863009, + "lm_loss": 2.0408, + "loss": 2.1949, + "mask_loss": 0.1394, + "step": 638, + "topk_loss": 0.0147 + }, + { + "epoch": 0.25397614005452634, + "grad_norm": 0.1484375, + "learning_rate": 0.00017312329688955828, + "lm_loss": 2.0357, + "loss": 2.188, + "mask_loss": 0.1403, + "step": 639, + "topk_loss": 0.012 + }, + { + "epoch": 0.25437359880265553, + "grad_norm": 0.2578125, + "learning_rate": 0.00017303630171642607, + "lm_loss": 2.1687, + "loss": 2.3187, + "mask_loss": 0.1377, + "step": 640, + "topk_loss": 0.0123 + }, + { + "epoch": 0.25477105755078466, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001729491879105396, + "lm_loss": 2.0333, + "loss": 2.1854, + "mask_loss": 0.138, + "step": 641, + "topk_loss": 0.0141 + }, + { + "epoch": 0.2551685162989138, + "grad_norm": 0.2255859375, + "learning_rate": 0.0001728619556133977, + "lm_loss": 2.1377, + "loss": 2.2926, + "mask_loss": 0.1386, + "step": 642, + "topk_loss": 0.0163 + }, + { + "epoch": 0.255565975047043, + "grad_norm": 0.1787109375, + "learning_rate": 0.0001727746049666916, + "lm_loss": 2.0641, + "loss": 2.2196, + "mask_loss": 0.1395, + "step": 643, + "topk_loss": 0.0161 + }, + { + "epoch": 0.2559634337951721, + "grad_norm": 0.21875, + "learning_rate": 0.0001726871361123049, + "lm_loss": 2.0548, + "loss": 2.2032, + "mask_loss": 0.1365, + "step": 644, + "topk_loss": 0.0119 + }, + { + "epoch": 0.25636089254330124, + "grad_norm": 0.169921875, + "learning_rate": 0.0001725995491923131, + "lm_loss": 2.1623, + "loss": 2.3127, + "mask_loss": 0.1378, + "step": 645, + "topk_loss": 0.0126 + }, + { + "epoch": 0.25675835129143043, + "grad_norm": 0.212890625, + "learning_rate": 0.00017251184434898347, + "lm_loss": 2.0753, + "loss": 2.2262, + "mask_loss": 0.1376, + "step": 646, + "topk_loss": 0.0133 + }, + { + "epoch": 0.25715581003955956, + "grad_norm": 0.232421875, + "learning_rate": 0.0001724240217247749, + "lm_loss": 2.1105, + "loss": 2.2604, + "mask_loss": 0.1378, + "step": 647, + "topk_loss": 0.0122 + }, + { + "epoch": 0.2575532687876887, + "grad_norm": 0.1865234375, + "learning_rate": 0.00017233608146233754, + "lm_loss": 2.0786, + "loss": 2.2304, + "mask_loss": 0.1396, + "step": 648, + "topk_loss": 0.0122 + }, + { + "epoch": 0.2579507275358179, + "grad_norm": 0.1455078125, + "learning_rate": 0.00017224802370451262, + "lm_loss": 1.9832, + "loss": 2.1332, + "mask_loss": 0.1381, + "step": 649, + "topk_loss": 0.0119 + }, + { + "epoch": 0.258348186283947, + "grad_norm": 0.2138671875, + "learning_rate": 0.0001721598485943322, + "lm_loss": 2.0282, + "loss": 2.1852, + "mask_loss": 0.1409, + "step": 650, + "topk_loss": 0.016 + }, + { + "epoch": 0.258348186283947, + "eval_lm_loss": 697.0224609375, + "eval_loss": 697.17529296875, + "eval_mask_hit_rate": 0.5033272504806519, + "eval_mask_loss": 0.13698282837867737, + "eval_mask_top_10_hit_rate": 0.9774259328842163, + "eval_mask_top_1_hit_rate": 0.9951572418212891, + "eval_mask_top_20_hit_rate": 0.9647897481918335, + "eval_mask_top_5_hit_rate": 0.9849950075149536, + "eval_runtime": 144.1684, + "eval_samples_per_second": 14.206, + "eval_steps_per_second": 7.103, + "eval_token_accuracy": 0.5961358547210693, + "eval_top_k_diff": -516.7088012695312, + "eval_topk_loss": 0.015864230692386627, + "step": 650 + }, + { + "epoch": 0.25874564503207614, + "grad_norm": 0.1875, + "learning_rate": 0.00017207155627501898, + "lm_loss": 2.0546, + "loss": 2.2045, + "mask_loss": 0.1381, + "step": 651, + "topk_loss": 0.0119 + }, + { + "epoch": 0.25914310378020533, + "grad_norm": 0.138671875, + "learning_rate": 0.00017198314688998608, + "lm_loss": 2.0694, + "loss": 2.2223, + "mask_loss": 0.1398, + "step": 652, + "topk_loss": 0.0131 + }, + { + "epoch": 0.25954056252833446, + "grad_norm": 0.169921875, + "learning_rate": 0.00017189462058283668, + "lm_loss": 2.0493, + "loss": 2.203, + "mask_loss": 0.1399, + "step": 653, + "topk_loss": 0.0137 + }, + { + "epoch": 0.2599380212764636, + "grad_norm": 0.1484375, + "learning_rate": 0.00017180597749736395, + "lm_loss": 2.074, + "loss": 2.2212, + "mask_loss": 0.1358, + "step": 654, + "topk_loss": 0.0114 + }, + { + "epoch": 0.2603354800245928, + "grad_norm": 0.16015625, + "learning_rate": 0.00017171721777755074, + "lm_loss": 2.0825, + "loss": 2.2314, + "mask_loss": 0.1369, + "step": 655, + "topk_loss": 0.0119 + }, + { + "epoch": 0.2607329387727219, + "grad_norm": 0.138671875, + "learning_rate": 0.0001716283415675693, + "lm_loss": 2.019, + "loss": 2.1732, + "mask_loss": 0.1421, + "step": 656, + "topk_loss": 0.012 + }, + { + "epoch": 0.26113039752085104, + "grad_norm": 0.13671875, + "learning_rate": 0.00017153934901178113, + "lm_loss": 2.0657, + "loss": 2.2213, + "mask_loss": 0.1419, + "step": 657, + "topk_loss": 0.0137 + }, + { + "epoch": 0.26152785626898023, + "grad_norm": 0.134765625, + "learning_rate": 0.0001714502402547367, + "lm_loss": 1.9909, + "loss": 2.1457, + "mask_loss": 0.1424, + "step": 658, + "topk_loss": 0.0124 + }, + { + "epoch": 0.26192531501710936, + "grad_norm": 0.1796875, + "learning_rate": 0.00017136101544117525, + "lm_loss": 2.0786, + "loss": 2.2269, + "mask_loss": 0.137, + "step": 659, + "topk_loss": 0.0113 + }, + { + "epoch": 0.2623227737652385, + "grad_norm": 0.146484375, + "learning_rate": 0.00017127167471602447, + "lm_loss": 2.0891, + "loss": 2.2369, + "mask_loss": 0.1367, + "step": 660, + "topk_loss": 0.011 + }, + { + "epoch": 0.2627202325133677, + "grad_norm": 0.158203125, + "learning_rate": 0.0001711822182244004, + "lm_loss": 2.0156, + "loss": 2.1625, + "mask_loss": 0.1363, + "step": 661, + "topk_loss": 0.0107 + }, + { + "epoch": 0.2631176912614968, + "grad_norm": 0.1923828125, + "learning_rate": 0.00017109264611160708, + "lm_loss": 2.0307, + "loss": 2.18, + "mask_loss": 0.1379, + "step": 662, + "topk_loss": 0.0114 + }, + { + "epoch": 0.26351515000962594, + "grad_norm": 0.16015625, + "learning_rate": 0.00017100295852313634, + "lm_loss": 2.0537, + "loss": 2.2058, + "mask_loss": 0.139, + "step": 663, + "topk_loss": 0.0131 + }, + { + "epoch": 0.26391260875775513, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001709131556046676, + "lm_loss": 2.0956, + "loss": 2.2443, + "mask_loss": 0.1375, + "step": 664, + "topk_loss": 0.0113 + }, + { + "epoch": 0.26431006750588426, + "grad_norm": 0.1611328125, + "learning_rate": 0.00017082323750206761, + "lm_loss": 2.0686, + "loss": 2.2177, + "mask_loss": 0.1365, + "step": 665, + "topk_loss": 0.0126 + }, + { + "epoch": 0.2647075262540134, + "grad_norm": 0.1669921875, + "learning_rate": 0.00017073320436139023, + "lm_loss": 2.0032, + "loss": 2.1575, + "mask_loss": 0.1415, + "step": 666, + "topk_loss": 0.0128 + }, + { + "epoch": 0.2651049850021425, + "grad_norm": 0.1435546875, + "learning_rate": 0.0001706430563288761, + "lm_loss": 2.0073, + "loss": 2.1573, + "mask_loss": 0.1382, + "step": 667, + "topk_loss": 0.0118 + }, + { + "epoch": 0.2655024437502717, + "grad_norm": 0.169921875, + "learning_rate": 0.0001705527935509526, + "lm_loss": 2.0847, + "loss": 2.237, + "mask_loss": 0.1385, + "step": 668, + "topk_loss": 0.0139 + }, + { + "epoch": 0.26589990249840084, + "grad_norm": 0.2060546875, + "learning_rate": 0.00017046241617423336, + "lm_loss": 2.0493, + "loss": 2.201, + "mask_loss": 0.1388, + "step": 669, + "topk_loss": 0.0129 + }, + { + "epoch": 0.26629736124653, + "grad_norm": 0.1513671875, + "learning_rate": 0.00017037192434551823, + "lm_loss": 2.0498, + "loss": 2.1971, + "mask_loss": 0.1358, + "step": 670, + "topk_loss": 0.0115 + }, + { + "epoch": 0.26669481999465916, + "grad_norm": 0.1923828125, + "learning_rate": 0.00017028131821179295, + "lm_loss": 2.1096, + "loss": 2.2599, + "mask_loss": 0.1383, + "step": 671, + "topk_loss": 0.012 + }, + { + "epoch": 0.2670922787427883, + "grad_norm": 0.1845703125, + "learning_rate": 0.000170190597920229, + "lm_loss": 2.0352, + "loss": 2.1855, + "mask_loss": 0.1383, + "step": 672, + "topk_loss": 0.0121 + }, + { + "epoch": 0.2674897374909174, + "grad_norm": 0.166015625, + "learning_rate": 0.0001700997636181831, + "lm_loss": 2.0313, + "loss": 2.1832, + "mask_loss": 0.1399, + "step": 673, + "topk_loss": 0.012 + }, + { + "epoch": 0.2678871962390466, + "grad_norm": 0.18359375, + "learning_rate": 0.00017000881545319735, + "lm_loss": 2.0797, + "loss": 2.2309, + "mask_loss": 0.1378, + "step": 674, + "topk_loss": 0.0135 + }, + { + "epoch": 0.26828465498717574, + "grad_norm": 0.203125, + "learning_rate": 0.00016991775357299866, + "lm_loss": 2.0063, + "loss": 2.1545, + "mask_loss": 0.1373, + "step": 675, + "topk_loss": 0.0109 + }, + { + "epoch": 0.2686821137353049, + "grad_norm": 0.208984375, + "learning_rate": 0.00016982657812549874, + "lm_loss": 2.0277, + "loss": 2.1847, + "mask_loss": 0.1407, + "step": 676, + "topk_loss": 0.0163 + }, + { + "epoch": 0.26907957248343406, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016973528925879374, + "lm_loss": 1.9607, + "loss": 2.1099, + "mask_loss": 0.137, + "step": 677, + "topk_loss": 0.0122 + }, + { + "epoch": 0.2694770312315632, + "grad_norm": 0.21875, + "learning_rate": 0.000169643887121164, + "lm_loss": 2.0658, + "loss": 2.2251, + "mask_loss": 0.1411, + "step": 678, + "topk_loss": 0.0182 + }, + { + "epoch": 0.2698744899796923, + "grad_norm": 0.255859375, + "learning_rate": 0.00016955237186107387, + "lm_loss": 2.0325, + "loss": 2.1816, + "mask_loss": 0.1364, + "step": 679, + "topk_loss": 0.0128 + }, + { + "epoch": 0.2702719487278215, + "grad_norm": 0.1875, + "learning_rate": 0.00016946074362717147, + "lm_loss": 2.0734, + "loss": 2.2269, + "mask_loss": 0.1391, + "step": 680, + "topk_loss": 0.0144 + }, + { + "epoch": 0.27066940747595064, + "grad_norm": 0.181640625, + "learning_rate": 0.00016936900256828838, + "lm_loss": 2.0314, + "loss": 2.1812, + "mask_loss": 0.1394, + "step": 681, + "topk_loss": 0.0104 + }, + { + "epoch": 0.27106686622407977, + "grad_norm": 0.18359375, + "learning_rate": 0.00016927714883343948, + "lm_loss": 2.0189, + "loss": 2.1689, + "mask_loss": 0.1388, + "step": 682, + "topk_loss": 0.0111 + }, + { + "epoch": 0.27146432497220896, + "grad_norm": 0.228515625, + "learning_rate": 0.00016918518257182265, + "lm_loss": 2.0225, + "loss": 2.1695, + "mask_loss": 0.136, + "step": 683, + "topk_loss": 0.011 + }, + { + "epoch": 0.2718617837203381, + "grad_norm": 0.154296875, + "learning_rate": 0.00016909310393281856, + "lm_loss": 2.0208, + "loss": 2.1713, + "mask_loss": 0.1388, + "step": 684, + "topk_loss": 0.0117 + }, + { + "epoch": 0.2722592424684672, + "grad_norm": 0.142578125, + "learning_rate": 0.00016900091306599042, + "lm_loss": 2.0877, + "loss": 2.2388, + "mask_loss": 0.1388, + "step": 685, + "topk_loss": 0.0122 + }, + { + "epoch": 0.2726567012165964, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016890861012108365, + "lm_loss": 2.0105, + "loss": 2.1592, + "mask_loss": 0.1366, + "step": 686, + "topk_loss": 0.0122 + }, + { + "epoch": 0.27305415996472554, + "grad_norm": 0.18359375, + "learning_rate": 0.00016881619524802583, + "lm_loss": 2.0659, + "loss": 2.2165, + "mask_loss": 0.1384, + "step": 687, + "topk_loss": 0.0122 + }, + { + "epoch": 0.27345161871285467, + "grad_norm": 0.142578125, + "learning_rate": 0.00016872366859692627, + "lm_loss": 2.042, + "loss": 2.188, + "mask_loss": 0.1355, + "step": 688, + "topk_loss": 0.0105 + }, + { + "epoch": 0.27384907746098386, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001686310303180759, + "lm_loss": 2.0872, + "loss": 2.2323, + "mask_loss": 0.1338, + "step": 689, + "topk_loss": 0.0112 + }, + { + "epoch": 0.274246536209113, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016853828056194697, + "lm_loss": 2.0472, + "loss": 2.1952, + "mask_loss": 0.1368, + "step": 690, + "topk_loss": 0.0112 + }, + { + "epoch": 0.2746439949572421, + "grad_norm": 0.224609375, + "learning_rate": 0.00016844541947919268, + "lm_loss": 2.0788, + "loss": 2.2304, + "mask_loss": 0.1373, + "step": 691, + "topk_loss": 0.0143 + }, + { + "epoch": 0.2750414537053713, + "grad_norm": 0.15234375, + "learning_rate": 0.00016835244722064716, + "lm_loss": 1.9852, + "loss": 2.1361, + "mask_loss": 0.1386, + "step": 692, + "topk_loss": 0.0123 + }, + { + "epoch": 0.27543891245350044, + "grad_norm": 0.2333984375, + "learning_rate": 0.0001682593639373252, + "lm_loss": 2.0409, + "loss": 2.1966, + "mask_loss": 0.1405, + "step": 693, + "topk_loss": 0.0151 + }, + { + "epoch": 0.27583637120162957, + "grad_norm": 0.17578125, + "learning_rate": 0.00016816616978042174, + "lm_loss": 2.042, + "loss": 2.1896, + "mask_loss": 0.1356, + "step": 694, + "topk_loss": 0.012 + }, + { + "epoch": 0.2762338299497587, + "grad_norm": 0.2490234375, + "learning_rate": 0.00016807286490131196, + "lm_loss": 2.0005, + "loss": 2.15, + "mask_loss": 0.1373, + "step": 695, + "topk_loss": 0.0122 + }, + { + "epoch": 0.2766312886978879, + "grad_norm": 0.181640625, + "learning_rate": 0.0001679794494515508, + "lm_loss": 2.0451, + "loss": 2.1984, + "mask_loss": 0.1384, + "step": 696, + "topk_loss": 0.0149 + }, + { + "epoch": 0.277028747446017, + "grad_norm": 0.3671875, + "learning_rate": 0.00016788592358287286, + "lm_loss": 2.0757, + "loss": 2.2443, + "mask_loss": 0.1449, + "step": 697, + "topk_loss": 0.0238 + }, + { + "epoch": 0.27742620619414615, + "grad_norm": 0.19140625, + "learning_rate": 0.00016779228744719205, + "lm_loss": 2.0335, + "loss": 2.1829, + "mask_loss": 0.1376, + "step": 698, + "topk_loss": 0.0117 + }, + { + "epoch": 0.27782366494227534, + "grad_norm": 0.185546875, + "learning_rate": 0.0001676985411966014, + "lm_loss": 2.0207, + "loss": 2.1695, + "mask_loss": 0.1379, + "step": 699, + "topk_loss": 0.011 + }, + { + "epoch": 0.27822112369040447, + "grad_norm": 0.1904296875, + "learning_rate": 0.00016760468498337283, + "lm_loss": 1.9973, + "loss": 2.1477, + "mask_loss": 0.1369, + "step": 700, + "topk_loss": 0.0134 + }, + { + "epoch": 0.27822112369040447, + "eval_lm_loss": 695.3560180664062, + "eval_loss": 695.5069580078125, + "eval_mask_hit_rate": 0.5073809623718262, + "eval_mask_loss": 0.13550084829330444, + "eval_mask_top_10_hit_rate": 0.9787076711654663, + "eval_mask_top_1_hit_rate": 0.995525598526001, + "eval_mask_top_20_hit_rate": 0.9665408134460449, + "eval_mask_top_5_hit_rate": 0.9859187602996826, + "eval_runtime": 144.1276, + "eval_samples_per_second": 14.21, + "eval_steps_per_second": 7.105, + "eval_token_accuracy": 0.5988088846206665, + "eval_top_k_diff": -515.3989868164062, + "eval_topk_loss": 0.015472196973860264, + "step": 700 + }, + { + "epoch": 0.2786185824385336, + "grad_norm": 0.1923828125, + "learning_rate": 0.00016751071895995684, + "lm_loss": 1.9556, + "loss": 2.1084, + "mask_loss": 0.1402, + "step": 701, + "topk_loss": 0.0126 + }, + { + "epoch": 0.2790160411866628, + "grad_norm": 0.2119140625, + "learning_rate": 0.00016741664327898223, + "lm_loss": 2.0031, + "loss": 2.1516, + "mask_loss": 0.1363, + "step": 702, + "topk_loss": 0.0123 + }, + { + "epoch": 0.2794134999347919, + "grad_norm": 0.2109375, + "learning_rate": 0.0001673224580932561, + "lm_loss": 2.0067, + "loss": 2.1557, + "mask_loss": 0.137, + "step": 703, + "topk_loss": 0.012 + }, + { + "epoch": 0.27981095868292105, + "grad_norm": 0.197265625, + "learning_rate": 0.00016722816355576323, + "lm_loss": 2.0194, + "loss": 2.1701, + "mask_loss": 0.1381, + "step": 704, + "topk_loss": 0.0126 + }, + { + "epoch": 0.28020841743105024, + "grad_norm": 0.19921875, + "learning_rate": 0.00016713375981966612, + "lm_loss": 2.0297, + "loss": 2.1771, + "mask_loss": 0.1363, + "step": 705, + "topk_loss": 0.0111 + }, + { + "epoch": 0.28060587617917937, + "grad_norm": 0.177734375, + "learning_rate": 0.0001670392470383046, + "lm_loss": 2.0465, + "loss": 2.1973, + "mask_loss": 0.1373, + "step": 706, + "topk_loss": 0.0134 + }, + { + "epoch": 0.2810033349273085, + "grad_norm": 0.1689453125, + "learning_rate": 0.00016694462536519561, + "lm_loss": 2.0325, + "loss": 2.1811, + "mask_loss": 0.1352, + "step": 707, + "topk_loss": 0.0134 + }, + { + "epoch": 0.2814007936754377, + "grad_norm": 0.16015625, + "learning_rate": 0.00016684989495403308, + "lm_loss": 2.0532, + "loss": 2.2063, + "mask_loss": 0.138, + "step": 708, + "topk_loss": 0.0151 + }, + { + "epoch": 0.2817982524235668, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001667550559586874, + "lm_loss": 2.0983, + "loss": 2.247, + "mask_loss": 0.1364, + "step": 709, + "topk_loss": 0.0122 + }, + { + "epoch": 0.28219571117169595, + "grad_norm": 0.1708984375, + "learning_rate": 0.00016666010853320543, + "lm_loss": 1.977, + "loss": 2.127, + "mask_loss": 0.1376, + "step": 710, + "topk_loss": 0.0124 + }, + { + "epoch": 0.28259316991982514, + "grad_norm": 0.1572265625, + "learning_rate": 0.0001665650528318101, + "lm_loss": 2.0323, + "loss": 2.1819, + "mask_loss": 0.1378, + "step": 711, + "topk_loss": 0.0118 + }, + { + "epoch": 0.28299062866795427, + "grad_norm": 0.173828125, + "learning_rate": 0.0001664698890089003, + "lm_loss": 1.9691, + "loss": 2.12, + "mask_loss": 0.137, + "step": 712, + "topk_loss": 0.0138 + }, + { + "epoch": 0.2833880874160834, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016637461721905045, + "lm_loss": 2.065, + "loss": 2.214, + "mask_loss": 0.1377, + "step": 713, + "topk_loss": 0.0113 + }, + { + "epoch": 0.2837855461642126, + "grad_norm": 0.1748046875, + "learning_rate": 0.00016627923761701038, + "lm_loss": 2.0362, + "loss": 2.1902, + "mask_loss": 0.14, + "step": 714, + "topk_loss": 0.014 + }, + { + "epoch": 0.2841830049123417, + "grad_norm": 0.140625, + "learning_rate": 0.00016618375035770498, + "lm_loss": 2.0468, + "loss": 2.1948, + "mask_loss": 0.1373, + "step": 715, + "topk_loss": 0.0107 + }, + { + "epoch": 0.28458046366047085, + "grad_norm": 0.150390625, + "learning_rate": 0.00016608815559623414, + "lm_loss": 2.0417, + "loss": 2.1922, + "mask_loss": 0.138, + "step": 716, + "topk_loss": 0.0125 + }, + { + "epoch": 0.28497792240860004, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001659924534878723, + "lm_loss": 2.0316, + "loss": 2.1754, + "mask_loss": 0.1327, + "step": 717, + "topk_loss": 0.011 + }, + { + "epoch": 0.28537538115672917, + "grad_norm": 0.1513671875, + "learning_rate": 0.00016589664418806814, + "lm_loss": 2.0287, + "loss": 2.1845, + "mask_loss": 0.14, + "step": 718, + "topk_loss": 0.0158 + }, + { + "epoch": 0.2857728399048583, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016580072785244462, + "lm_loss": 2.0515, + "loss": 2.2143, + "mask_loss": 0.1435, + "step": 719, + "topk_loss": 0.0193 + }, + { + "epoch": 0.2861702986529875, + "grad_norm": 0.166015625, + "learning_rate": 0.00016570470463679856, + "lm_loss": 2.1156, + "loss": 2.2641, + "mask_loss": 0.1365, + "step": 720, + "topk_loss": 0.012 + }, + { + "epoch": 0.2865677574011166, + "grad_norm": 0.1416015625, + "learning_rate": 0.00016560857469710022, + "lm_loss": 2.0524, + "loss": 2.1965, + "mask_loss": 0.1328, + "step": 721, + "topk_loss": 0.0113 + }, + { + "epoch": 0.28696521614924575, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016551233818949337, + "lm_loss": 2.0192, + "loss": 2.1676, + "mask_loss": 0.1359, + "step": 722, + "topk_loss": 0.0124 + }, + { + "epoch": 0.28736267489737494, + "grad_norm": 0.14453125, + "learning_rate": 0.0001654159952702948, + "lm_loss": 2.0376, + "loss": 2.1842, + "mask_loss": 0.1342, + "step": 723, + "topk_loss": 0.0124 + }, + { + "epoch": 0.28776013364550407, + "grad_norm": 0.1494140625, + "learning_rate": 0.0001653195460959942, + "lm_loss": 2.0463, + "loss": 2.1955, + "mask_loss": 0.1367, + "step": 724, + "topk_loss": 0.0126 + }, + { + "epoch": 0.2881575923936332, + "grad_norm": 0.1328125, + "learning_rate": 0.00016522299082325382, + "lm_loss": 2.0002, + "loss": 2.1447, + "mask_loss": 0.1336, + "step": 725, + "topk_loss": 0.0109 + }, + { + "epoch": 0.28855505114176233, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016512632960890823, + "lm_loss": 2.063, + "loss": 2.2123, + "mask_loss": 0.1358, + "step": 726, + "topk_loss": 0.0135 + }, + { + "epoch": 0.2889525098898915, + "grad_norm": 0.16015625, + "learning_rate": 0.00016502956260996404, + "lm_loss": 2.0324, + "loss": 2.181, + "mask_loss": 0.1365, + "step": 727, + "topk_loss": 0.0121 + }, + { + "epoch": 0.28934996863802065, + "grad_norm": 0.158203125, + "learning_rate": 0.00016493268998359986, + "lm_loss": 1.9948, + "loss": 2.1435, + "mask_loss": 0.1368, + "step": 728, + "topk_loss": 0.0119 + }, + { + "epoch": 0.2897474273861498, + "grad_norm": 0.1962890625, + "learning_rate": 0.00016483571188716562, + "lm_loss": 2.0435, + "loss": 2.1896, + "mask_loss": 0.1348, + "step": 729, + "topk_loss": 0.0114 + }, + { + "epoch": 0.29014488613427897, + "grad_norm": 0.16015625, + "learning_rate": 0.00016473862847818277, + "lm_loss": 2.0547, + "loss": 2.2039, + "mask_loss": 0.1362, + "step": 730, + "topk_loss": 0.013 + }, + { + "epoch": 0.2905423448824081, + "grad_norm": 0.1591796875, + "learning_rate": 0.00016464143991434373, + "lm_loss": 1.989, + "loss": 2.1381, + "mask_loss": 0.1372, + "step": 731, + "topk_loss": 0.0118 + }, + { + "epoch": 0.29093980363053723, + "grad_norm": 0.13671875, + "learning_rate": 0.00016454414635351175, + "lm_loss": 2.0332, + "loss": 2.1796, + "mask_loss": 0.1357, + "step": 732, + "topk_loss": 0.0107 + }, + { + "epoch": 0.2913372623786664, + "grad_norm": 0.14453125, + "learning_rate": 0.00016444674795372058, + "lm_loss": 2.0188, + "loss": 2.1681, + "mask_loss": 0.137, + "step": 733, + "topk_loss": 0.0122 + }, + { + "epoch": 0.29173472112679555, + "grad_norm": 0.138671875, + "learning_rate": 0.0001643492448731743, + "lm_loss": 1.9968, + "loss": 2.1447, + "mask_loss": 0.136, + "step": 734, + "topk_loss": 0.0118 + }, + { + "epoch": 0.2921321798749247, + "grad_norm": 0.15234375, + "learning_rate": 0.00016425163727024707, + "lm_loss": 2.0086, + "loss": 2.1536, + "mask_loss": 0.1341, + "step": 735, + "topk_loss": 0.0109 + }, + { + "epoch": 0.29252963862305387, + "grad_norm": 0.146484375, + "learning_rate": 0.0001641539253034827, + "lm_loss": 2.0357, + "loss": 2.1823, + "mask_loss": 0.1349, + "step": 736, + "topk_loss": 0.0118 + }, + { + "epoch": 0.292927097371183, + "grad_norm": 0.271484375, + "learning_rate": 0.00016405610913159465, + "lm_loss": 1.9641, + "loss": 2.1237, + "mask_loss": 0.1402, + "step": 737, + "topk_loss": 0.0194 + }, + { + "epoch": 0.29332455611931213, + "grad_norm": 0.21875, + "learning_rate": 0.0001639581889134655, + "lm_loss": 2.0021, + "loss": 2.1475, + "mask_loss": 0.1339, + "step": 738, + "topk_loss": 0.0116 + }, + { + "epoch": 0.2937220148674413, + "grad_norm": 0.1904296875, + "learning_rate": 0.000163860164808147, + "lm_loss": 1.9545, + "loss": 2.101, + "mask_loss": 0.1357, + "step": 739, + "topk_loss": 0.0109 + }, + { + "epoch": 0.29411947361557045, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001637620369748595, + "lm_loss": 2.0343, + "loss": 2.1775, + "mask_loss": 0.132, + "step": 740, + "topk_loss": 0.0111 + }, + { + "epoch": 0.2945169323636996, + "grad_norm": 0.1484375, + "learning_rate": 0.0001636638055729919, + "lm_loss": 1.9935, + "loss": 2.1405, + "mask_loss": 0.136, + "step": 741, + "topk_loss": 0.0109 + }, + { + "epoch": 0.29491439111182877, + "grad_norm": 0.1943359375, + "learning_rate": 0.00016356547076210135, + "lm_loss": 2.0936, + "loss": 2.2431, + "mask_loss": 0.1348, + "step": 742, + "topk_loss": 0.0148 + }, + { + "epoch": 0.2953118498599579, + "grad_norm": 0.1513671875, + "learning_rate": 0.0001634670327019129, + "lm_loss": 2.0426, + "loss": 2.1898, + "mask_loss": 0.1351, + "step": 743, + "topk_loss": 0.0121 + }, + { + "epoch": 0.29570930860808703, + "grad_norm": 0.1923828125, + "learning_rate": 0.00016336849155231935, + "lm_loss": 1.9431, + "loss": 2.0942, + "mask_loss": 0.1381, + "step": 744, + "topk_loss": 0.013 + }, + { + "epoch": 0.2961067673562162, + "grad_norm": 0.12890625, + "learning_rate": 0.00016326984747338095, + "lm_loss": 1.9955, + "loss": 2.1404, + "mask_loss": 0.1338, + "step": 745, + "topk_loss": 0.0112 + }, + { + "epoch": 0.29650422610434535, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001631711006253251, + "lm_loss": 1.9625, + "loss": 2.1121, + "mask_loss": 0.1382, + "step": 746, + "topk_loss": 0.0114 + }, + { + "epoch": 0.2969016848524745, + "grad_norm": 0.1484375, + "learning_rate": 0.00016307225116854622, + "lm_loss": 2.021, + "loss": 2.1701, + "mask_loss": 0.1363, + "step": 747, + "topk_loss": 0.0129 + }, + { + "epoch": 0.29729914360060367, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016297329926360523, + "lm_loss": 1.9784, + "loss": 2.1291, + "mask_loss": 0.138, + "step": 748, + "topk_loss": 0.0127 + }, + { + "epoch": 0.2976966023487328, + "grad_norm": 0.130859375, + "learning_rate": 0.00016287424507122964, + "lm_loss": 1.9802, + "loss": 2.1269, + "mask_loss": 0.1356, + "step": 749, + "topk_loss": 0.0111 + }, + { + "epoch": 0.29809406109686193, + "grad_norm": 0.1494140625, + "learning_rate": 0.00016277508875231302, + "lm_loss": 2.0315, + "loss": 2.1765, + "mask_loss": 0.1338, + "step": 750, + "topk_loss": 0.0112 + }, + { + "epoch": 0.29809406109686193, + "eval_lm_loss": 694.0836181640625, + "eval_loss": 694.2323608398438, + "eval_mask_hit_rate": 0.5113680362701416, + "eval_mask_loss": 0.133888840675354, + "eval_mask_top_10_hit_rate": 0.9798060059547424, + "eval_mask_top_1_hit_rate": 0.9958555698394775, + "eval_mask_top_20_hit_rate": 0.9680401086807251, + "eval_mask_top_5_hit_rate": 0.986741304397583, + "eval_runtime": 143.6233, + "eval_samples_per_second": 14.26, + "eval_steps_per_second": 7.13, + "eval_token_accuracy": 0.6011416912078857, + "eval_top_k_diff": -520.20947265625, + "eval_topk_loss": 0.01484230998903513, + "step": 750 + }, + { + "epoch": 0.2984915198449911, + "grad_norm": 0.15234375, + "learning_rate": 0.0001626758304679148, + "lm_loss": 2.0184, + "loss": 2.1698, + "mask_loss": 0.1377, + "step": 751, + "topk_loss": 0.0137 + }, + { + "epoch": 0.29888897859312025, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016257647037926006, + "lm_loss": 1.9852, + "loss": 2.1327, + "mask_loss": 0.1355, + "step": 752, + "topk_loss": 0.012 + }, + { + "epoch": 0.2992864373412494, + "grad_norm": 0.142578125, + "learning_rate": 0.00016247700864773927, + "lm_loss": 2.0101, + "loss": 2.158, + "mask_loss": 0.135, + "step": 753, + "topk_loss": 0.0129 + }, + { + "epoch": 0.2996838960893785, + "grad_norm": 0.1435546875, + "learning_rate": 0.00016237744543490796, + "lm_loss": 1.9929, + "loss": 2.1393, + "mask_loss": 0.1342, + "step": 754, + "topk_loss": 0.0121 + }, + { + "epoch": 0.3000813548375077, + "grad_norm": 0.1474609375, + "learning_rate": 0.00016227778090248648, + "lm_loss": 2.0294, + "loss": 2.1785, + "mask_loss": 0.1378, + "step": 755, + "topk_loss": 0.0113 + }, + { + "epoch": 0.30047881358563683, + "grad_norm": 0.171875, + "learning_rate": 0.0001621780152123598, + "lm_loss": 2.038, + "loss": 2.1829, + "mask_loss": 0.1337, + "step": 756, + "topk_loss": 0.0112 + }, + { + "epoch": 0.30087627233376596, + "grad_norm": 0.1552734375, + "learning_rate": 0.0001620781485265772, + "lm_loss": 1.9621, + "loss": 2.1093, + "mask_loss": 0.1368, + "step": 757, + "topk_loss": 0.0105 + }, + { + "epoch": 0.30127373108189515, + "grad_norm": 0.1806640625, + "learning_rate": 0.00016197818100735197, + "lm_loss": 1.9951, + "loss": 2.1406, + "mask_loss": 0.1345, + "step": 758, + "topk_loss": 0.0111 + }, + { + "epoch": 0.3016711898300243, + "grad_norm": 0.173828125, + "learning_rate": 0.00016187811281706115, + "lm_loss": 2.0192, + "loss": 2.1718, + "mask_loss": 0.1378, + "step": 759, + "topk_loss": 0.0148 + }, + { + "epoch": 0.3020686485781534, + "grad_norm": 0.2265625, + "learning_rate": 0.00016177794411824544, + "lm_loss": 2.0547, + "loss": 2.1978, + "mask_loss": 0.1334, + "step": 760, + "topk_loss": 0.0096 + }, + { + "epoch": 0.3024661073262826, + "grad_norm": 0.1396484375, + "learning_rate": 0.00016167767507360866, + "lm_loss": 1.9905, + "loss": 2.1363, + "mask_loss": 0.1345, + "step": 761, + "topk_loss": 0.0113 + }, + { + "epoch": 0.3028635660744117, + "grad_norm": 0.1650390625, + "learning_rate": 0.00016157730584601764, + "lm_loss": 2.0099, + "loss": 2.1553, + "mask_loss": 0.1334, + "step": 762, + "topk_loss": 0.012 + }, + { + "epoch": 0.30326102482254086, + "grad_norm": 0.162109375, + "learning_rate": 0.00016147683659850198, + "lm_loss": 1.9952, + "loss": 2.1452, + "mask_loss": 0.1368, + "step": 763, + "topk_loss": 0.0132 + }, + { + "epoch": 0.30365848357067005, + "grad_norm": 0.1337890625, + "learning_rate": 0.00016137626749425377, + "lm_loss": 1.9443, + "loss": 2.0912, + "mask_loss": 0.1364, + "step": 764, + "topk_loss": 0.0106 + }, + { + "epoch": 0.3040559423187992, + "grad_norm": 0.15625, + "learning_rate": 0.00016127559869662722, + "lm_loss": 2.0581, + "loss": 2.2016, + "mask_loss": 0.1316, + "step": 765, + "topk_loss": 0.0119 + }, + { + "epoch": 0.3044534010669283, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001611748303691385, + "lm_loss": 1.9773, + "loss": 2.1237, + "mask_loss": 0.136, + "step": 766, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3048508598150575, + "grad_norm": 0.14453125, + "learning_rate": 0.00016107396267546546, + "lm_loss": 1.9836, + "loss": 2.1356, + "mask_loss": 0.1385, + "step": 767, + "topk_loss": 0.0134 + }, + { + "epoch": 0.3052483185631866, + "grad_norm": 0.19140625, + "learning_rate": 0.00016097299577944735, + "lm_loss": 2.0708, + "loss": 2.2217, + "mask_loss": 0.1358, + "step": 768, + "topk_loss": 0.015 + }, + { + "epoch": 0.30564577731131576, + "grad_norm": 0.150390625, + "learning_rate": 0.00016087192984508451, + "lm_loss": 1.9767, + "loss": 2.124, + "mask_loss": 0.1366, + "step": 769, + "topk_loss": 0.0107 + }, + { + "epoch": 0.30604323605944495, + "grad_norm": 0.1669921875, + "learning_rate": 0.00016077076503653826, + "lm_loss": 2.0156, + "loss": 2.1636, + "mask_loss": 0.1353, + "step": 770, + "topk_loss": 0.0128 + }, + { + "epoch": 0.3064406948075741, + "grad_norm": 0.138671875, + "learning_rate": 0.00016066950151813033, + "lm_loss": 2.012, + "loss": 2.1566, + "mask_loss": 0.1334, + "step": 771, + "topk_loss": 0.0112 + }, + { + "epoch": 0.3068381535557032, + "grad_norm": 0.1630859375, + "learning_rate": 0.00016056813945434302, + "lm_loss": 2.0295, + "loss": 2.1749, + "mask_loss": 0.1353, + "step": 772, + "topk_loss": 0.0101 + }, + { + "epoch": 0.3072356123038324, + "grad_norm": 0.138671875, + "learning_rate": 0.0001604666790098185, + "lm_loss": 2.0299, + "loss": 2.1751, + "mask_loss": 0.1341, + "step": 773, + "topk_loss": 0.0111 + }, + { + "epoch": 0.3076330710519615, + "grad_norm": 0.255859375, + "learning_rate": 0.00016036512034935886, + "lm_loss": 1.9921, + "loss": 2.1371, + "mask_loss": 0.1334, + "step": 774, + "topk_loss": 0.0116 + }, + { + "epoch": 0.30803052980009066, + "grad_norm": 0.140625, + "learning_rate": 0.00016026346363792567, + "lm_loss": 1.967, + "loss": 2.1142, + "mask_loss": 0.136, + "step": 775, + "topk_loss": 0.0112 + }, + { + "epoch": 0.30842798854821984, + "grad_norm": 0.1748046875, + "learning_rate": 0.0001601617090406397, + "lm_loss": 2.0243, + "loss": 2.1723, + "mask_loss": 0.1356, + "step": 776, + "topk_loss": 0.0124 + }, + { + "epoch": 0.308825447296349, + "grad_norm": 0.1240234375, + "learning_rate": 0.00016005985672278093, + "lm_loss": 1.9735, + "loss": 2.1208, + "mask_loss": 0.1353, + "step": 777, + "topk_loss": 0.012 + }, + { + "epoch": 0.3092229060444781, + "grad_norm": 0.158203125, + "learning_rate": 0.0001599579068497878, + "lm_loss": 2.0466, + "loss": 2.1879, + "mask_loss": 0.1309, + "step": 778, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3096203647926073, + "grad_norm": 0.1884765625, + "learning_rate": 0.00015985585958725736, + "lm_loss": 2.0242, + "loss": 2.1745, + "mask_loss": 0.1364, + "step": 779, + "topk_loss": 0.0139 + }, + { + "epoch": 0.3100178235407364, + "grad_norm": 0.1455078125, + "learning_rate": 0.00015975371510094485, + "lm_loss": 1.9809, + "loss": 2.1286, + "mask_loss": 0.1363, + "step": 780, + "topk_loss": 0.0115 + }, + { + "epoch": 0.31041528228886556, + "grad_norm": 0.189453125, + "learning_rate": 0.00015965147355676343, + "lm_loss": 1.9497, + "loss": 2.0973, + "mask_loss": 0.1358, + "step": 781, + "topk_loss": 0.0118 + }, + { + "epoch": 0.3108127410369947, + "grad_norm": 0.1689453125, + "learning_rate": 0.00015954913512078385, + "lm_loss": 1.9924, + "loss": 2.1432, + "mask_loss": 0.1369, + "step": 782, + "topk_loss": 0.0139 + }, + { + "epoch": 0.3112101997851239, + "grad_norm": 0.1728515625, + "learning_rate": 0.00015944669995923426, + "lm_loss": 2.0464, + "loss": 2.1883, + "mask_loss": 0.131, + "step": 783, + "topk_loss": 0.0108 + }, + { + "epoch": 0.311607658533253, + "grad_norm": 0.166015625, + "learning_rate": 0.00015934416823849997, + "lm_loss": 1.9721, + "loss": 2.1281, + "mask_loss": 0.1398, + "step": 784, + "topk_loss": 0.0162 + }, + { + "epoch": 0.31200511728138214, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015924154012512308, + "lm_loss": 2.0275, + "loss": 2.1707, + "mask_loss": 0.1315, + "step": 785, + "topk_loss": 0.0117 + }, + { + "epoch": 0.3124025760295113, + "grad_norm": 0.1298828125, + "learning_rate": 0.00015913881578580227, + "lm_loss": 1.9875, + "loss": 2.1333, + "mask_loss": 0.135, + "step": 786, + "topk_loss": 0.0108 + }, + { + "epoch": 0.31280003477764046, + "grad_norm": 0.140625, + "learning_rate": 0.00015903599538739254, + "lm_loss": 1.9729, + "loss": 2.1233, + "mask_loss": 0.1376, + "step": 787, + "topk_loss": 0.0128 + }, + { + "epoch": 0.3131974935257696, + "grad_norm": 0.1435546875, + "learning_rate": 0.00015893307909690493, + "lm_loss": 1.976, + "loss": 2.1287, + "mask_loss": 0.1389, + "step": 788, + "topk_loss": 0.0137 + }, + { + "epoch": 0.3135949522738988, + "grad_norm": 0.1357421875, + "learning_rate": 0.00015883006708150623, + "lm_loss": 1.9939, + "loss": 2.1413, + "mask_loss": 0.1358, + "step": 789, + "topk_loss": 0.0116 + }, + { + "epoch": 0.3139924110220279, + "grad_norm": 0.13671875, + "learning_rate": 0.0001587269595085186, + "lm_loss": 2.0191, + "loss": 2.1641, + "mask_loss": 0.1336, + "step": 790, + "topk_loss": 0.0115 + }, + { + "epoch": 0.31438986977015704, + "grad_norm": 0.14453125, + "learning_rate": 0.00015862375654541964, + "lm_loss": 2.005, + "loss": 2.1523, + "mask_loss": 0.1346, + "step": 791, + "topk_loss": 0.0127 + }, + { + "epoch": 0.3147873285182862, + "grad_norm": 0.142578125, + "learning_rate": 0.0001585204583598417, + "lm_loss": 1.9805, + "loss": 2.1273, + "mask_loss": 0.1354, + "step": 792, + "topk_loss": 0.0114 + }, + { + "epoch": 0.31518478726641536, + "grad_norm": 0.1416015625, + "learning_rate": 0.00015841706511957184, + "lm_loss": 1.9668, + "loss": 2.1207, + "mask_loss": 0.1385, + "step": 793, + "topk_loss": 0.0154 + }, + { + "epoch": 0.3155822460145445, + "grad_norm": 0.134765625, + "learning_rate": 0.00015831357699255157, + "lm_loss": 2.0282, + "loss": 2.172, + "mask_loss": 0.1334, + "step": 794, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3159797047626737, + "grad_norm": 0.140625, + "learning_rate": 0.00015820999414687656, + "lm_loss": 1.9895, + "loss": 2.1362, + "mask_loss": 0.1339, + "step": 795, + "topk_loss": 0.0128 + }, + { + "epoch": 0.3163771635108028, + "grad_norm": 0.1416015625, + "learning_rate": 0.00015810631675079617, + "lm_loss": 1.9351, + "loss": 2.0815, + "mask_loss": 0.1344, + "step": 796, + "topk_loss": 0.012 + }, + { + "epoch": 0.31677462225893194, + "grad_norm": 0.16796875, + "learning_rate": 0.00015800254497271352, + "lm_loss": 1.9859, + "loss": 2.1291, + "mask_loss": 0.1328, + "step": 797, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3171720810070611, + "grad_norm": 0.1533203125, + "learning_rate": 0.0001578986789811849, + "lm_loss": 1.9865, + "loss": 2.1324, + "mask_loss": 0.1349, + "step": 798, + "topk_loss": 0.011 + }, + { + "epoch": 0.31756953975519026, + "grad_norm": 0.150390625, + "learning_rate": 0.00015779471894491966, + "lm_loss": 2.0869, + "loss": 2.2302, + "mask_loss": 0.1321, + "step": 799, + "topk_loss": 0.0112 + }, + { + "epoch": 0.3179669985033194, + "grad_norm": 0.171875, + "learning_rate": 0.00015769066503277997, + "lm_loss": 2.0015, + "loss": 2.1486, + "mask_loss": 0.1341, + "step": 800, + "topk_loss": 0.0129 + }, + { + "epoch": 0.3179669985033194, + "eval_lm_loss": 693.9171752929688, + "eval_loss": 694.0645141601562, + "eval_mask_hit_rate": 0.5143714547157288, + "eval_mask_loss": 0.13279682397842407, + "eval_mask_top_10_hit_rate": 0.9807169437408447, + "eval_mask_top_1_hit_rate": 0.9961609840393066, + "eval_mask_top_20_hit_rate": 0.9693107008934021, + "eval_mask_top_5_hit_rate": 0.9874145984649658, + "eval_runtime": 144.2716, + "eval_samples_per_second": 14.195, + "eval_steps_per_second": 7.098, + "eval_token_accuracy": 0.6030571460723877, + "eval_top_k_diff": -527.3522338867188, + "eval_topk_loss": 0.014561420306563377, + "step": 800 + }, + { + "epoch": 0.3183644572514486, + "grad_norm": 0.142578125, + "learning_rate": 0.0001575865174137805, + "lm_loss": 1.9426, + "loss": 2.0876, + "mask_loss": 0.1347, + "step": 801, + "topk_loss": 0.0103 + }, + { + "epoch": 0.3187619159995777, + "grad_norm": 0.146484375, + "learning_rate": 0.00015748227625708797, + "lm_loss": 1.9777, + "loss": 2.1199, + "mask_loss": 0.1313, + "step": 802, + "topk_loss": 0.0109 + }, + { + "epoch": 0.31915937474770684, + "grad_norm": 0.1689453125, + "learning_rate": 0.0001573779417320212, + "lm_loss": 2.0242, + "loss": 2.1683, + "mask_loss": 0.1337, + "step": 803, + "topk_loss": 0.0104 + }, + { + "epoch": 0.319556833495836, + "grad_norm": 0.1337890625, + "learning_rate": 0.00015727351400805052, + "lm_loss": 2.007, + "loss": 2.1524, + "mask_loss": 0.1331, + "step": 804, + "topk_loss": 0.0123 + }, + { + "epoch": 0.31995429224396515, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001571689932547978, + "lm_loss": 2.0379, + "loss": 2.1856, + "mask_loss": 0.1353, + "step": 805, + "topk_loss": 0.0125 + }, + { + "epoch": 0.3203517509920943, + "grad_norm": 0.181640625, + "learning_rate": 0.00015706437964203596, + "lm_loss": 2.0187, + "loss": 2.1612, + "mask_loss": 0.1315, + "step": 806, + "topk_loss": 0.0109 + }, + { + "epoch": 0.3207492097402235, + "grad_norm": 0.1455078125, + "learning_rate": 0.0001569596733396886, + "lm_loss": 2.0469, + "loss": 2.1899, + "mask_loss": 0.1325, + "step": 807, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3211466684883526, + "grad_norm": 0.1435546875, + "learning_rate": 0.00015685487451783017, + "lm_loss": 1.9933, + "loss": 2.1382, + "mask_loss": 0.1333, + "step": 808, + "topk_loss": 0.0116 + }, + { + "epoch": 0.32154412723648174, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001567499833466852, + "lm_loss": 2.0374, + "loss": 2.1831, + "mask_loss": 0.1335, + "step": 809, + "topk_loss": 0.0122 + }, + { + "epoch": 0.32194158598461087, + "grad_norm": 0.1484375, + "learning_rate": 0.00015664499999662815, + "lm_loss": 1.9767, + "loss": 2.1208, + "mask_loss": 0.1327, + "step": 810, + "topk_loss": 0.0114 + }, + { + "epoch": 0.32233904473274005, + "grad_norm": 0.193359375, + "learning_rate": 0.0001565399246381834, + "lm_loss": 1.9659, + "loss": 2.1118, + "mask_loss": 0.1334, + "step": 811, + "topk_loss": 0.0125 + }, + { + "epoch": 0.3227365034808692, + "grad_norm": 0.1875, + "learning_rate": 0.0001564347574420247, + "lm_loss": 1.8997, + "loss": 2.0483, + "mask_loss": 0.1366, + "step": 812, + "topk_loss": 0.0121 + }, + { + "epoch": 0.3231339622289983, + "grad_norm": 0.150390625, + "learning_rate": 0.00015632949857897498, + "lm_loss": 1.9574, + "loss": 2.1024, + "mask_loss": 0.1344, + "step": 813, + "topk_loss": 0.0107 + }, + { + "epoch": 0.3235314209771275, + "grad_norm": 0.158203125, + "learning_rate": 0.000156224148220006, + "lm_loss": 2.0218, + "loss": 2.168, + "mask_loss": 0.1347, + "step": 814, + "topk_loss": 0.0115 + }, + { + "epoch": 0.32392887972525664, + "grad_norm": 0.25390625, + "learning_rate": 0.00015611870653623825, + "lm_loss": 2.0542, + "loss": 2.201, + "mask_loss": 0.1344, + "step": 815, + "topk_loss": 0.0124 + }, + { + "epoch": 0.32432633847338577, + "grad_norm": 0.142578125, + "learning_rate": 0.00015601317369894044, + "lm_loss": 1.976, + "loss": 2.1171, + "mask_loss": 0.1323, + "step": 816, + "topk_loss": 0.0089 + }, + { + "epoch": 0.32472379722151495, + "grad_norm": 0.146484375, + "learning_rate": 0.00015590754987952944, + "lm_loss": 1.9774, + "loss": 2.1179, + "mask_loss": 0.1297, + "step": 817, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3251212559696441, + "grad_norm": 0.169921875, + "learning_rate": 0.00015580183524956982, + "lm_loss": 1.9778, + "loss": 2.1274, + "mask_loss": 0.1365, + "step": 818, + "topk_loss": 0.0132 + }, + { + "epoch": 0.3255187147177732, + "grad_norm": 0.1435546875, + "learning_rate": 0.0001556960299807737, + "lm_loss": 1.9972, + "loss": 2.1413, + "mask_loss": 0.1333, + "step": 819, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3259161734659024, + "grad_norm": 0.287109375, + "learning_rate": 0.00015559013424500047, + "lm_loss": 2.0236, + "loss": 2.1729, + "mask_loss": 0.1348, + "step": 820, + "topk_loss": 0.0145 + }, + { + "epoch": 0.32631363221403153, + "grad_norm": 0.15625, + "learning_rate": 0.00015548414821425638, + "lm_loss": 1.9912, + "loss": 2.1343, + "mask_loss": 0.1323, + "step": 821, + "topk_loss": 0.0109 + }, + { + "epoch": 0.32671109096216067, + "grad_norm": 0.2099609375, + "learning_rate": 0.00015537807206069434, + "lm_loss": 1.9572, + "loss": 2.1075, + "mask_loss": 0.1351, + "step": 822, + "topk_loss": 0.0152 + }, + { + "epoch": 0.32710854971028985, + "grad_norm": 0.28515625, + "learning_rate": 0.00015527190595661375, + "lm_loss": 2.0038, + "loss": 2.1511, + "mask_loss": 0.1346, + "step": 823, + "topk_loss": 0.0127 + }, + { + "epoch": 0.327506008458419, + "grad_norm": 0.1533203125, + "learning_rate": 0.00015516565007446, + "lm_loss": 1.9312, + "loss": 2.0774, + "mask_loss": 0.1342, + "step": 824, + "topk_loss": 0.012 + }, + { + "epoch": 0.3279034672065481, + "grad_norm": 0.25390625, + "learning_rate": 0.0001550593045868244, + "lm_loss": 2.0119, + "loss": 2.1579, + "mask_loss": 0.1327, + "step": 825, + "topk_loss": 0.0133 + }, + { + "epoch": 0.3283009259546773, + "grad_norm": 0.197265625, + "learning_rate": 0.00015495286966644373, + "lm_loss": 2.0199, + "loss": 2.1609, + "mask_loss": 0.131, + "step": 826, + "topk_loss": 0.01 + }, + { + "epoch": 0.32869838470280643, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001548463454862001, + "lm_loss": 1.955, + "loss": 2.1022, + "mask_loss": 0.1357, + "step": 827, + "topk_loss": 0.0115 + }, + { + "epoch": 0.32909584345093557, + "grad_norm": 0.169921875, + "learning_rate": 0.00015473973221912055, + "lm_loss": 1.9188, + "loss": 2.0655, + "mask_loss": 0.1343, + "step": 828, + "topk_loss": 0.0125 + }, + { + "epoch": 0.32949330219906475, + "grad_norm": 0.244140625, + "learning_rate": 0.0001546330300383769, + "lm_loss": 1.9589, + "loss": 2.1009, + "mask_loss": 0.1308, + "step": 829, + "topk_loss": 0.0112 + }, + { + "epoch": 0.3298907609471939, + "grad_norm": 0.265625, + "learning_rate": 0.00015452623911728523, + "lm_loss": 1.9411, + "loss": 2.0862, + "mask_loss": 0.1334, + "step": 830, + "topk_loss": 0.0117 + }, + { + "epoch": 0.330288219695323, + "grad_norm": 0.146484375, + "learning_rate": 0.00015441935962930598, + "lm_loss": 2.0214, + "loss": 2.1666, + "mask_loss": 0.1343, + "step": 831, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3306856784434522, + "grad_norm": 0.21484375, + "learning_rate": 0.00015431239174804328, + "lm_loss": 1.9706, + "loss": 2.113, + "mask_loss": 0.1324, + "step": 832, + "topk_loss": 0.0101 + }, + { + "epoch": 0.33108313719158133, + "grad_norm": 0.1826171875, + "learning_rate": 0.00015420533564724495, + "lm_loss": 1.995, + "loss": 2.137, + "mask_loss": 0.1317, + "step": 833, + "topk_loss": 0.0102 + }, + { + "epoch": 0.33148059593971047, + "grad_norm": 0.1416015625, + "learning_rate": 0.000154098191500802, + "lm_loss": 2.0297, + "loss": 2.1746, + "mask_loss": 0.1339, + "step": 834, + "topk_loss": 0.0111 + }, + { + "epoch": 0.33187805468783965, + "grad_norm": 0.158203125, + "learning_rate": 0.00015399095948274852, + "lm_loss": 1.9266, + "loss": 2.0701, + "mask_loss": 0.1332, + "step": 835, + "topk_loss": 0.0103 + }, + { + "epoch": 0.3322755134359688, + "grad_norm": 0.158203125, + "learning_rate": 0.00015388363976726133, + "lm_loss": 2.0069, + "loss": 2.1489, + "mask_loss": 0.1307, + "step": 836, + "topk_loss": 0.0113 + }, + { + "epoch": 0.3326729721840979, + "grad_norm": 0.1591796875, + "learning_rate": 0.00015377623252865968, + "lm_loss": 1.9808, + "loss": 2.1257, + "mask_loss": 0.1344, + "step": 837, + "topk_loss": 0.0105 + }, + { + "epoch": 0.33307043093222705, + "grad_norm": 0.16015625, + "learning_rate": 0.00015366873794140498, + "lm_loss": 1.9781, + "loss": 2.1324, + "mask_loss": 0.1383, + "step": 838, + "topk_loss": 0.016 + }, + { + "epoch": 0.33346788968035623, + "grad_norm": 0.146484375, + "learning_rate": 0.0001535611561801005, + "lm_loss": 1.9886, + "loss": 2.1363, + "mask_loss": 0.1348, + "step": 839, + "topk_loss": 0.0129 + }, + { + "epoch": 0.33386534842848536, + "grad_norm": 0.1337890625, + "learning_rate": 0.00015345348741949117, + "lm_loss": 1.9809, + "loss": 2.1258, + "mask_loss": 0.1341, + "step": 840, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3342628071766145, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001533457318344632, + "lm_loss": 1.939, + "loss": 2.0832, + "mask_loss": 0.1327, + "step": 841, + "topk_loss": 0.0116 + }, + { + "epoch": 0.3346602659247437, + "grad_norm": 0.146484375, + "learning_rate": 0.00015323788960004377, + "lm_loss": 1.999, + "loss": 2.1412, + "mask_loss": 0.1319, + "step": 842, + "topk_loss": 0.0103 + }, + { + "epoch": 0.3350577246728728, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015312996089140088, + "lm_loss": 1.943, + "loss": 2.0855, + "mask_loss": 0.1318, + "step": 843, + "topk_loss": 0.0107 + }, + { + "epoch": 0.33545518342100195, + "grad_norm": 0.138671875, + "learning_rate": 0.00015302194588384302, + "lm_loss": 1.9981, + "loss": 2.1457, + "mask_loss": 0.1353, + "step": 844, + "topk_loss": 0.0124 + }, + { + "epoch": 0.33585264216913113, + "grad_norm": 0.1328125, + "learning_rate": 0.00015291384475281877, + "lm_loss": 2.0021, + "loss": 2.1457, + "mask_loss": 0.1328, + "step": 845, + "topk_loss": 0.0107 + }, + { + "epoch": 0.33625010091726026, + "grad_norm": 0.1494140625, + "learning_rate": 0.00015280565767391657, + "lm_loss": 1.9705, + "loss": 2.1158, + "mask_loss": 0.1341, + "step": 846, + "topk_loss": 0.0112 + }, + { + "epoch": 0.3366475596653894, + "grad_norm": 0.13671875, + "learning_rate": 0.0001526973848228646, + "lm_loss": 1.9768, + "loss": 2.1185, + "mask_loss": 0.1309, + "step": 847, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3370450184135186, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001525890263755303, + "lm_loss": 1.9851, + "loss": 2.1264, + "mask_loss": 0.1305, + "step": 848, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3374424771616477, + "grad_norm": 0.14453125, + "learning_rate": 0.00015248058250792008, + "lm_loss": 1.9555, + "loss": 2.1018, + "mask_loss": 0.1332, + "step": 849, + "topk_loss": 0.0131 + }, + { + "epoch": 0.33783993590977685, + "grad_norm": 0.126953125, + "learning_rate": 0.00015237205339617917, + "lm_loss": 1.9254, + "loss": 2.0689, + "mask_loss": 0.1324, + "step": 850, + "topk_loss": 0.0111 + }, + { + "epoch": 0.33783993590977685, + "eval_lm_loss": 693.2371826171875, + "eval_loss": 693.3829345703125, + "eval_mask_hit_rate": 0.5174924731254578, + "eval_mask_loss": 0.13158242404460907, + "eval_mask_top_10_hit_rate": 0.9814284443855286, + "eval_mask_top_1_hit_rate": 0.996345043182373, + "eval_mask_top_20_hit_rate": 0.9703196883201599, + "eval_mask_top_5_hit_rate": 0.9878877997398376, + "eval_runtime": 144.2145, + "eval_samples_per_second": 14.201, + "eval_steps_per_second": 7.101, + "eval_token_accuracy": 0.6048117876052856, + "eval_top_k_diff": -530.045654296875, + "eval_topk_loss": 0.014150199480354786, + "step": 850 + }, + { + "epoch": 0.33823739465790603, + "grad_norm": 0.1767578125, + "learning_rate": 0.00015226343921659124, + "lm_loss": 1.9664, + "loss": 2.1133, + "mask_loss": 0.1343, + "step": 851, + "topk_loss": 0.0126 + }, + { + "epoch": 0.33863485340603516, + "grad_norm": 0.1630859375, + "learning_rate": 0.00015215474014557815, + "lm_loss": 2.0229, + "loss": 2.1646, + "mask_loss": 0.1322, + "step": 852, + "topk_loss": 0.0095 + }, + { + "epoch": 0.3390323121541643, + "grad_norm": 0.166015625, + "learning_rate": 0.00015204595635969964, + "lm_loss": 2.012, + "loss": 2.1594, + "mask_loss": 0.1336, + "step": 853, + "topk_loss": 0.0137 + }, + { + "epoch": 0.3394297709022935, + "grad_norm": 0.134765625, + "learning_rate": 0.00015193708803565303, + "lm_loss": 1.9471, + "loss": 2.0903, + "mask_loss": 0.133, + "step": 854, + "topk_loss": 0.0102 + }, + { + "epoch": 0.3398272296504226, + "grad_norm": 0.185546875, + "learning_rate": 0.000151828135350273, + "lm_loss": 2.0118, + "loss": 2.1573, + "mask_loss": 0.1326, + "step": 855, + "topk_loss": 0.0129 + }, + { + "epoch": 0.34022468839855174, + "grad_norm": 0.14453125, + "learning_rate": 0.00015171909848053119, + "lm_loss": 1.9994, + "loss": 2.1399, + "mask_loss": 0.1312, + "step": 856, + "topk_loss": 0.0092 + }, + { + "epoch": 0.34062214714668093, + "grad_norm": 0.130859375, + "learning_rate": 0.00015160997760353605, + "lm_loss": 2.0021, + "loss": 2.1448, + "mask_loss": 0.1322, + "step": 857, + "topk_loss": 0.0104 + }, + { + "epoch": 0.34101960589481006, + "grad_norm": 0.1474609375, + "learning_rate": 0.00015150077289653244, + "lm_loss": 1.9989, + "loss": 2.1424, + "mask_loss": 0.132, + "step": 858, + "topk_loss": 0.0115 + }, + { + "epoch": 0.3414170646429392, + "grad_norm": 0.13671875, + "learning_rate": 0.00015139148453690145, + "lm_loss": 1.9976, + "loss": 2.1409, + "mask_loss": 0.1329, + "step": 859, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3418145233910684, + "grad_norm": 0.134765625, + "learning_rate": 0.00015128211270215992, + "lm_loss": 1.9472, + "loss": 2.0914, + "mask_loss": 0.1332, + "step": 860, + "topk_loss": 0.011 + }, + { + "epoch": 0.3422119821391975, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001511726575699604, + "lm_loss": 1.9626, + "loss": 2.1048, + "mask_loss": 0.1309, + "step": 861, + "topk_loss": 0.0112 + }, + { + "epoch": 0.34260944088732664, + "grad_norm": 0.2001953125, + "learning_rate": 0.0001510631193180907, + "lm_loss": 2.0179, + "loss": 2.1569, + "mask_loss": 0.1293, + "step": 862, + "topk_loss": 0.0097 + }, + { + "epoch": 0.34300689963545583, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001509534981244736, + "lm_loss": 1.9051, + "loss": 2.0497, + "mask_loss": 0.1341, + "step": 863, + "topk_loss": 0.0106 + }, + { + "epoch": 0.34340435838358496, + "grad_norm": 0.1376953125, + "learning_rate": 0.0001508437941671667, + "lm_loss": 1.9047, + "loss": 2.0507, + "mask_loss": 0.1345, + "step": 864, + "topk_loss": 0.0115 + }, + { + "epoch": 0.3438018171317141, + "grad_norm": 0.1650390625, + "learning_rate": 0.00015073400762436197, + "lm_loss": 1.9328, + "loss": 2.0763, + "mask_loss": 0.1319, + "step": 865, + "topk_loss": 0.0116 + }, + { + "epoch": 0.3441992758798432, + "grad_norm": 0.1376953125, + "learning_rate": 0.0001506241386743854, + "lm_loss": 2.0323, + "loss": 2.1749, + "mask_loss": 0.1313, + "step": 866, + "topk_loss": 0.0112 + }, + { + "epoch": 0.3445967346279724, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001505141874956971, + "lm_loss": 1.9805, + "loss": 2.1269, + "mask_loss": 0.1344, + "step": 867, + "topk_loss": 0.0119 + }, + { + "epoch": 0.34499419337610154, + "grad_norm": 0.15234375, + "learning_rate": 0.00015040415426689055, + "lm_loss": 2.0136, + "loss": 2.1543, + "mask_loss": 0.1288, + "step": 868, + "topk_loss": 0.0119 + }, + { + "epoch": 0.3453916521242307, + "grad_norm": 0.138671875, + "learning_rate": 0.00015029403916669258, + "lm_loss": 1.9408, + "loss": 2.0839, + "mask_loss": 0.1327, + "step": 869, + "topk_loss": 0.0104 + }, + { + "epoch": 0.34578911087235986, + "grad_norm": 0.1474609375, + "learning_rate": 0.00015018384237396292, + "lm_loss": 1.98, + "loss": 2.1231, + "mask_loss": 0.1319, + "step": 870, + "topk_loss": 0.0111 + }, + { + "epoch": 0.346186569620489, + "grad_norm": 0.15234375, + "learning_rate": 0.0001500735640676941, + "lm_loss": 1.9363, + "loss": 2.0827, + "mask_loss": 0.1337, + "step": 871, + "topk_loss": 0.0127 + }, + { + "epoch": 0.3465840283686181, + "grad_norm": 0.138671875, + "learning_rate": 0.00014996320442701102, + "lm_loss": 2.0109, + "loss": 2.1533, + "mask_loss": 0.1324, + "step": 872, + "topk_loss": 0.01 + }, + { + "epoch": 0.3469814871167473, + "grad_norm": 0.134765625, + "learning_rate": 0.00014985276363117065, + "lm_loss": 1.9425, + "loss": 2.0854, + "mask_loss": 0.132, + "step": 873, + "topk_loss": 0.0109 + }, + { + "epoch": 0.34737894586487644, + "grad_norm": 0.1494140625, + "learning_rate": 0.00014974224185956186, + "lm_loss": 1.9894, + "loss": 2.1354, + "mask_loss": 0.1339, + "step": 874, + "topk_loss": 0.012 + }, + { + "epoch": 0.3477764046130056, + "grad_norm": 0.140625, + "learning_rate": 0.0001496316392917049, + "lm_loss": 1.9791, + "loss": 2.1316, + "mask_loss": 0.1382, + "step": 875, + "topk_loss": 0.0142 + }, + { + "epoch": 0.34817386336113476, + "grad_norm": 0.134765625, + "learning_rate": 0.00014952095610725139, + "lm_loss": 1.9584, + "loss": 2.1019, + "mask_loss": 0.1331, + "step": 876, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3485713221092639, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001494101924859839, + "lm_loss": 1.9915, + "loss": 2.1382, + "mask_loss": 0.133, + "step": 877, + "topk_loss": 0.0136 + }, + { + "epoch": 0.348968780857393, + "grad_norm": 0.1328125, + "learning_rate": 0.0001492993486078156, + "lm_loss": 1.9783, + "loss": 2.121, + "mask_loss": 0.1323, + "step": 878, + "topk_loss": 0.0103 + }, + { + "epoch": 0.3493662396055222, + "grad_norm": 0.126953125, + "learning_rate": 0.00014918842465279, + "lm_loss": 1.9671, + "loss": 2.1064, + "mask_loss": 0.1298, + "step": 879, + "topk_loss": 0.0096 + }, + { + "epoch": 0.34976369835365134, + "grad_norm": 0.1357421875, + "learning_rate": 0.00014907742080108073, + "lm_loss": 1.9153, + "loss": 2.0578, + "mask_loss": 0.1318, + "step": 880, + "topk_loss": 0.0107 + }, + { + "epoch": 0.3501611571017805, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001489663372329912, + "lm_loss": 1.9766, + "loss": 2.1174, + "mask_loss": 0.1315, + "step": 881, + "topk_loss": 0.0093 + }, + { + "epoch": 0.35055861584990966, + "grad_norm": 0.1484375, + "learning_rate": 0.00014885517412895424, + "lm_loss": 1.9713, + "loss": 2.1244, + "mask_loss": 0.1373, + "step": 882, + "topk_loss": 0.0158 + }, + { + "epoch": 0.3509560745980388, + "grad_norm": 0.1416015625, + "learning_rate": 0.00014874393166953192, + "lm_loss": 1.9796, + "loss": 2.1193, + "mask_loss": 0.1301, + "step": 883, + "topk_loss": 0.0097 + }, + { + "epoch": 0.3513535333461679, + "grad_norm": 0.205078125, + "learning_rate": 0.00014863261003541525, + "lm_loss": 1.954, + "loss": 2.0957, + "mask_loss": 0.1314, + "step": 884, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3517509920942971, + "grad_norm": 0.140625, + "learning_rate": 0.0001485212094074237, + "lm_loss": 1.9849, + "loss": 2.1253, + "mask_loss": 0.1303, + "step": 885, + "topk_loss": 0.0101 + }, + { + "epoch": 0.35214845084242624, + "grad_norm": 0.21484375, + "learning_rate": 0.00014840972996650525, + "lm_loss": 1.9263, + "loss": 2.0688, + "mask_loss": 0.1315, + "step": 886, + "topk_loss": 0.011 + }, + { + "epoch": 0.3525459095905554, + "grad_norm": 0.1845703125, + "learning_rate": 0.0001482981718937357, + "lm_loss": 1.8775, + "loss": 2.0324, + "mask_loss": 0.1371, + "step": 887, + "topk_loss": 0.0178 + }, + { + "epoch": 0.35294336833868456, + "grad_norm": 0.1259765625, + "learning_rate": 0.00014818653537031868, + "lm_loss": 1.9684, + "loss": 2.1109, + "mask_loss": 0.1323, + "step": 888, + "topk_loss": 0.0102 + }, + { + "epoch": 0.3533408270868137, + "grad_norm": 0.1689453125, + "learning_rate": 0.00014807482057758528, + "lm_loss": 1.9892, + "loss": 2.1299, + "mask_loss": 0.1304, + "step": 889, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3537382858349428, + "grad_norm": 0.13671875, + "learning_rate": 0.0001479630276969936, + "lm_loss": 1.9411, + "loss": 2.0891, + "mask_loss": 0.1357, + "step": 890, + "topk_loss": 0.0123 + }, + { + "epoch": 0.354135744583072, + "grad_norm": 0.1435546875, + "learning_rate": 0.00014785115691012864, + "lm_loss": 2.0491, + "loss": 2.1903, + "mask_loss": 0.1302, + "step": 891, + "topk_loss": 0.0111 + }, + { + "epoch": 0.35453320333120114, + "grad_norm": 0.12890625, + "learning_rate": 0.000147739208398702, + "lm_loss": 1.9524, + "loss": 2.0971, + "mask_loss": 0.1335, + "step": 892, + "topk_loss": 0.0113 + }, + { + "epoch": 0.3549306620793303, + "grad_norm": 0.15234375, + "learning_rate": 0.00014762718234455151, + "lm_loss": 1.9716, + "loss": 2.1133, + "mask_loss": 0.1307, + "step": 893, + "topk_loss": 0.011 + }, + { + "epoch": 0.3553281208274594, + "grad_norm": 0.1328125, + "learning_rate": 0.00014751507892964082, + "lm_loss": 1.9312, + "loss": 2.0766, + "mask_loss": 0.1334, + "step": 894, + "topk_loss": 0.012 + }, + { + "epoch": 0.3557255795755886, + "grad_norm": 0.1494140625, + "learning_rate": 0.00014740289833605939, + "lm_loss": 1.9993, + "loss": 2.1397, + "mask_loss": 0.1293, + "step": 895, + "topk_loss": 0.0111 + }, + { + "epoch": 0.3561230383237177, + "grad_norm": 0.1416015625, + "learning_rate": 0.00014729064074602198, + "lm_loss": 1.9107, + "loss": 2.0586, + "mask_loss": 0.1354, + "step": 896, + "topk_loss": 0.0125 + }, + { + "epoch": 0.35652049707184685, + "grad_norm": 0.1337890625, + "learning_rate": 0.00014717830634186844, + "lm_loss": 1.9405, + "loss": 2.0837, + "mask_loss": 0.131, + "step": 897, + "topk_loss": 0.0122 + }, + { + "epoch": 0.35691795581997604, + "grad_norm": 0.1416015625, + "learning_rate": 0.00014706589530606335, + "lm_loss": 2.0209, + "loss": 2.163, + "mask_loss": 0.1316, + "step": 898, + "topk_loss": 0.0106 + }, + { + "epoch": 0.35731541456810517, + "grad_norm": 0.162109375, + "learning_rate": 0.0001469534078211958, + "lm_loss": 2.0017, + "loss": 2.1479, + "mask_loss": 0.1329, + "step": 899, + "topk_loss": 0.0133 + }, + { + "epoch": 0.3577128733162343, + "grad_norm": 0.1787109375, + "learning_rate": 0.00014684084406997903, + "lm_loss": 2.0326, + "loss": 2.1736, + "mask_loss": 0.1308, + "step": 900, + "topk_loss": 0.0101 + }, + { + "epoch": 0.3577128733162343, + "eval_lm_loss": 692.6829833984375, + "eval_loss": 692.8275146484375, + "eval_mask_hit_rate": 0.5198476314544678, + "eval_mask_loss": 0.13041795790195465, + "eval_mask_top_10_hit_rate": 0.982144832611084, + "eval_mask_top_1_hit_rate": 0.9965412616729736, + "eval_mask_top_20_hit_rate": 0.9712841510772705, + "eval_mask_top_5_hit_rate": 0.9884326457977295, + "eval_runtime": 143.7303, + "eval_samples_per_second": 14.249, + "eval_steps_per_second": 7.124, + "eval_token_accuracy": 0.606438159942627, + "eval_top_k_diff": -526.1043701171875, + "eval_topk_loss": 0.014123985543847084, + "step": 900 + }, + { + "epoch": 0.3581103320643635, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001467282042352502, + "lm_loss": 2.0374, + "loss": 2.1807, + "mask_loss": 0.1308, + "step": 901, + "topk_loss": 0.0124 + }, + { + "epoch": 0.3585077908124926, + "grad_norm": 0.1396484375, + "learning_rate": 0.00014661548849996997, + "lm_loss": 1.9942, + "loss": 2.136, + "mask_loss": 0.1317, + "step": 902, + "topk_loss": 0.0101 + }, + { + "epoch": 0.35890524956062175, + "grad_norm": 0.1337890625, + "learning_rate": 0.00014650269704722237, + "lm_loss": 1.8926, + "loss": 2.0358, + "mask_loss": 0.1325, + "step": 903, + "topk_loss": 0.0107 + }, + { + "epoch": 0.35930270830875094, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014638983006021438, + "lm_loss": 2.0248, + "loss": 2.1703, + "mask_loss": 0.1331, + "step": 904, + "topk_loss": 0.0125 + }, + { + "epoch": 0.35970016705688007, + "grad_norm": 0.1455078125, + "learning_rate": 0.0001462768877222757, + "lm_loss": 1.9501, + "loss": 2.0948, + "mask_loss": 0.1327, + "step": 905, + "topk_loss": 0.012 + }, + { + "epoch": 0.3600976258050092, + "grad_norm": 0.171875, + "learning_rate": 0.00014616387021685836, + "lm_loss": 1.9497, + "loss": 2.0882, + "mask_loss": 0.1288, + "step": 906, + "topk_loss": 0.0098 + }, + { + "epoch": 0.3604950845531384, + "grad_norm": 0.12353515625, + "learning_rate": 0.00014605077772753656, + "lm_loss": 1.8988, + "loss": 2.0387, + "mask_loss": 0.1309, + "step": 907, + "topk_loss": 0.0089 + }, + { + "epoch": 0.3608925433012675, + "grad_norm": 0.1474609375, + "learning_rate": 0.00014593761043800622, + "lm_loss": 1.9902, + "loss": 2.1325, + "mask_loss": 0.1315, + "step": 908, + "topk_loss": 0.0108 + }, + { + "epoch": 0.36129000204939665, + "grad_norm": 0.212890625, + "learning_rate": 0.00014582436853208483, + "lm_loss": 1.9879, + "loss": 2.1295, + "mask_loss": 0.1306, + "step": 909, + "topk_loss": 0.011 + }, + { + "epoch": 0.36168746079752584, + "grad_norm": 0.14453125, + "learning_rate": 0.00014571105219371102, + "lm_loss": 1.9871, + "loss": 2.1328, + "mask_loss": 0.1344, + "step": 910, + "topk_loss": 0.0113 + }, + { + "epoch": 0.36208491954565497, + "grad_norm": 0.2041015625, + "learning_rate": 0.00014559766160694436, + "lm_loss": 1.9585, + "loss": 2.101, + "mask_loss": 0.1306, + "step": 911, + "topk_loss": 0.0119 + }, + { + "epoch": 0.3624823782937841, + "grad_norm": 0.1474609375, + "learning_rate": 0.00014548419695596505, + "lm_loss": 1.9858, + "loss": 2.127, + "mask_loss": 0.1307, + "step": 912, + "topk_loss": 0.0104 + }, + { + "epoch": 0.3628798370419133, + "grad_norm": 0.177734375, + "learning_rate": 0.0001453706584250735, + "lm_loss": 1.9632, + "loss": 2.106, + "mask_loss": 0.1315, + "step": 913, + "topk_loss": 0.0112 + }, + { + "epoch": 0.3632772957900424, + "grad_norm": 0.1279296875, + "learning_rate": 0.00014525704619869015, + "lm_loss": 1.9146, + "loss": 2.0587, + "mask_loss": 0.1334, + "step": 914, + "topk_loss": 0.0107 + }, + { + "epoch": 0.36367475453817155, + "grad_norm": 0.203125, + "learning_rate": 0.00014514336046135518, + "lm_loss": 1.9857, + "loss": 2.1289, + "mask_loss": 0.1318, + "step": 915, + "topk_loss": 0.0113 + }, + { + "epoch": 0.36407221328630074, + "grad_norm": 0.2021484375, + "learning_rate": 0.00014502960139772824, + "lm_loss": 2.0455, + "loss": 2.1842, + "mask_loss": 0.1287, + "step": 916, + "topk_loss": 0.0101 + }, + { + "epoch": 0.36446967203442987, + "grad_norm": 0.154296875, + "learning_rate": 0.00014491576919258792, + "lm_loss": 1.9393, + "loss": 2.082, + "mask_loss": 0.1333, + "step": 917, + "topk_loss": 0.0094 + }, + { + "epoch": 0.364867130782559, + "grad_norm": 0.1640625, + "learning_rate": 0.00014480186403083173, + "lm_loss": 2.0212, + "loss": 2.1674, + "mask_loss": 0.1345, + "step": 918, + "topk_loss": 0.0117 + }, + { + "epoch": 0.3652645895306882, + "grad_norm": 0.2431640625, + "learning_rate": 0.00014468788609747565, + "lm_loss": 1.9828, + "loss": 2.1276, + "mask_loss": 0.1322, + "step": 919, + "topk_loss": 0.0126 + }, + { + "epoch": 0.3656620482788173, + "grad_norm": 0.265625, + "learning_rate": 0.00014457383557765386, + "lm_loss": 1.9425, + "loss": 2.0874, + "mask_loss": 0.1335, + "step": 920, + "topk_loss": 0.0114 + }, + { + "epoch": 0.36605950702694645, + "grad_norm": 0.1796875, + "learning_rate": 0.00014445971265661842, + "lm_loss": 1.9202, + "loss": 2.064, + "mask_loss": 0.1333, + "step": 921, + "topk_loss": 0.0105 + }, + { + "epoch": 0.3664569657750756, + "grad_norm": 0.3046875, + "learning_rate": 0.00014434551751973907, + "lm_loss": 1.967, + "loss": 2.1158, + "mask_loss": 0.1341, + "step": 922, + "topk_loss": 0.0148 + }, + { + "epoch": 0.36685442452320477, + "grad_norm": 0.26171875, + "learning_rate": 0.00014423125035250276, + "lm_loss": 2.0018, + "loss": 2.1425, + "mask_loss": 0.1295, + "step": 923, + "topk_loss": 0.0112 + }, + { + "epoch": 0.3672518832713339, + "grad_norm": 0.224609375, + "learning_rate": 0.00014411691134051348, + "lm_loss": 1.9848, + "loss": 2.1317, + "mask_loss": 0.1317, + "step": 924, + "topk_loss": 0.0152 + }, + { + "epoch": 0.36764934201946303, + "grad_norm": 0.1513671875, + "learning_rate": 0.00014400250066949196, + "lm_loss": 1.924, + "loss": 2.066, + "mask_loss": 0.1314, + "step": 925, + "topk_loss": 0.0106 + }, + { + "epoch": 0.3680468007675922, + "grad_norm": 0.1630859375, + "learning_rate": 0.00014388801852527526, + "lm_loss": 1.9335, + "loss": 2.0793, + "mask_loss": 0.1345, + "step": 926, + "topk_loss": 0.0113 + }, + { + "epoch": 0.36844425951572135, + "grad_norm": 0.28125, + "learning_rate": 0.00014377346509381647, + "lm_loss": 2.0293, + "loss": 2.1739, + "mask_loss": 0.1308, + "step": 927, + "topk_loss": 0.0138 + }, + { + "epoch": 0.3688417182638505, + "grad_norm": 0.17578125, + "learning_rate": 0.00014365884056118466, + "lm_loss": 1.9228, + "loss": 2.064, + "mask_loss": 0.1312, + "step": 928, + "topk_loss": 0.0101 + }, + { + "epoch": 0.36923917701197967, + "grad_norm": 0.181640625, + "learning_rate": 0.00014354414511356427, + "lm_loss": 1.9399, + "loss": 2.0858, + "mask_loss": 0.1336, + "step": 929, + "topk_loss": 0.0123 + }, + { + "epoch": 0.3696366357601088, + "grad_norm": 0.173828125, + "learning_rate": 0.00014342937893725488, + "lm_loss": 1.9367, + "loss": 2.0863, + "mask_loss": 0.133, + "step": 930, + "topk_loss": 0.0166 + }, + { + "epoch": 0.37003409450823793, + "grad_norm": 0.2236328125, + "learning_rate": 0.00014331454221867108, + "lm_loss": 1.9506, + "loss": 2.0912, + "mask_loss": 0.1298, + "step": 931, + "topk_loss": 0.0109 + }, + { + "epoch": 0.3704315532563671, + "grad_norm": 0.2412109375, + "learning_rate": 0.0001431996351443419, + "lm_loss": 1.9137, + "loss": 2.0563, + "mask_loss": 0.1311, + "step": 932, + "topk_loss": 0.0115 + }, + { + "epoch": 0.37082901200449625, + "grad_norm": 0.158203125, + "learning_rate": 0.00014308465790091086, + "lm_loss": 2.0159, + "loss": 2.157, + "mask_loss": 0.1303, + "step": 933, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3712264707526254, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001429696106751352, + "lm_loss": 1.9211, + "loss": 2.0639, + "mask_loss": 0.1328, + "step": 934, + "topk_loss": 0.01 + }, + { + "epoch": 0.37162392950075457, + "grad_norm": 0.1552734375, + "learning_rate": 0.00014285449365388598, + "lm_loss": 1.9324, + "loss": 2.0745, + "mask_loss": 0.1312, + "step": 935, + "topk_loss": 0.0109 + }, + { + "epoch": 0.3720213882488837, + "grad_norm": 0.197265625, + "learning_rate": 0.00014273930702414766, + "lm_loss": 1.9688, + "loss": 2.1088, + "mask_loss": 0.13, + "step": 936, + "topk_loss": 0.0099 + }, + { + "epoch": 0.37241884699701283, + "grad_norm": 0.2099609375, + "learning_rate": 0.00014262405097301763, + "lm_loss": 1.9681, + "loss": 2.109, + "mask_loss": 0.1295, + "step": 937, + "topk_loss": 0.0114 + }, + { + "epoch": 0.372816305745142, + "grad_norm": 0.1865234375, + "learning_rate": 0.0001425087256877062, + "lm_loss": 1.916, + "loss": 2.0563, + "mask_loss": 0.1298, + "step": 938, + "topk_loss": 0.0105 + }, + { + "epoch": 0.37321376449327115, + "grad_norm": 0.1376953125, + "learning_rate": 0.00014239333135553596, + "lm_loss": 1.9135, + "loss": 2.0549, + "mask_loss": 0.1314, + "step": 939, + "topk_loss": 0.0099 + }, + { + "epoch": 0.3736112232414003, + "grad_norm": 0.1767578125, + "learning_rate": 0.00014227786816394184, + "lm_loss": 1.9723, + "loss": 2.1155, + "mask_loss": 0.1313, + "step": 940, + "topk_loss": 0.0118 + }, + { + "epoch": 0.37400868198952947, + "grad_norm": 0.17578125, + "learning_rate": 0.0001421623363004705, + "lm_loss": 1.9524, + "loss": 2.0944, + "mask_loss": 0.1313, + "step": 941, + "topk_loss": 0.0107 + }, + { + "epoch": 0.3744061407376586, + "grad_norm": 0.23828125, + "learning_rate": 0.00014204673595278016, + "lm_loss": 1.9616, + "loss": 2.1092, + "mask_loss": 0.1335, + "step": 942, + "topk_loss": 0.014 + }, + { + "epoch": 0.37480359948578773, + "grad_norm": 0.169921875, + "learning_rate": 0.00014193106730864025, + "lm_loss": 2.0416, + "loss": 2.1849, + "mask_loss": 0.1309, + "step": 943, + "topk_loss": 0.0125 + }, + { + "epoch": 0.3752010582339169, + "grad_norm": 0.13671875, + "learning_rate": 0.00014181533055593123, + "lm_loss": 1.9677, + "loss": 2.1085, + "mask_loss": 0.1299, + "step": 944, + "topk_loss": 0.0109 + }, + { + "epoch": 0.37559851698204605, + "grad_norm": 0.171875, + "learning_rate": 0.00014169952588264417, + "lm_loss": 1.9138, + "loss": 2.0575, + "mask_loss": 0.1322, + "step": 945, + "topk_loss": 0.0115 + }, + { + "epoch": 0.3759959757301752, + "grad_norm": 0.2353515625, + "learning_rate": 0.00014158365347688033, + "lm_loss": 1.9805, + "loss": 2.1201, + "mask_loss": 0.1288, + "step": 946, + "topk_loss": 0.0108 + }, + { + "epoch": 0.37639343447830437, + "grad_norm": 0.2890625, + "learning_rate": 0.00014146771352685112, + "lm_loss": 1.984, + "loss": 2.1205, + "mask_loss": 0.1266, + "step": 947, + "topk_loss": 0.0099 + }, + { + "epoch": 0.3767908932264335, + "grad_norm": 0.1376953125, + "learning_rate": 0.00014135170622087763, + "lm_loss": 1.9521, + "loss": 2.0922, + "mask_loss": 0.1306, + "step": 948, + "topk_loss": 0.0095 + }, + { + "epoch": 0.37718835197456263, + "grad_norm": 0.169921875, + "learning_rate": 0.00014123563174739037, + "lm_loss": 1.9175, + "loss": 2.0623, + "mask_loss": 0.1333, + "step": 949, + "topk_loss": 0.0114 + }, + { + "epoch": 0.3775858107226918, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001411194902949289, + "lm_loss": 1.9277, + "loss": 2.0684, + "mask_loss": 0.13, + "step": 950, + "topk_loss": 0.0107 + }, + { + "epoch": 0.3775858107226918, + "eval_lm_loss": 690.0902709960938, + "eval_loss": 690.23388671875, + "eval_mask_hit_rate": 0.5220222473144531, + "eval_mask_loss": 0.12960419058799744, + "eval_mask_top_10_hit_rate": 0.9827470183372498, + "eval_mask_top_1_hit_rate": 0.9967126846313477, + "eval_mask_top_20_hit_rate": 0.9721159934997559, + "eval_mask_top_5_hit_rate": 0.9888770580291748, + "eval_runtime": 144.1331, + "eval_samples_per_second": 14.209, + "eval_steps_per_second": 7.105, + "eval_token_accuracy": 0.607653021812439, + "eval_top_k_diff": -511.9417724609375, + "eval_topk_loss": 0.013996293768286705, + "step": 950 + }, + { + "epoch": 0.37798326947082095, + "grad_norm": 0.173828125, + "learning_rate": 0.0001410032820521416, + "lm_loss": 1.9364, + "loss": 2.0792, + "mask_loss": 0.131, + "step": 951, + "topk_loss": 0.0117 + }, + { + "epoch": 0.3783807282189501, + "grad_norm": 0.203125, + "learning_rate": 0.00014088700720778542, + "lm_loss": 1.9916, + "loss": 2.132, + "mask_loss": 0.1295, + "step": 952, + "topk_loss": 0.0109 + }, + { + "epoch": 0.3787781869670792, + "grad_norm": 0.140625, + "learning_rate": 0.0001407706659507253, + "lm_loss": 1.9614, + "loss": 2.1022, + "mask_loss": 0.131, + "step": 953, + "topk_loss": 0.0097 + }, + { + "epoch": 0.3791756457152084, + "grad_norm": 0.17578125, + "learning_rate": 0.00014065425846993424, + "lm_loss": 1.9445, + "loss": 2.0854, + "mask_loss": 0.1298, + "step": 954, + "topk_loss": 0.0112 + }, + { + "epoch": 0.37957310446333753, + "grad_norm": 0.1953125, + "learning_rate": 0.0001405377849544927, + "lm_loss": 1.9137, + "loss": 2.0556, + "mask_loss": 0.1321, + "step": 955, + "topk_loss": 0.0099 + }, + { + "epoch": 0.37997056321146666, + "grad_norm": 0.1884765625, + "learning_rate": 0.00014042124559358846, + "lm_loss": 1.9787, + "loss": 2.1205, + "mask_loss": 0.1304, + "step": 956, + "topk_loss": 0.0114 + }, + { + "epoch": 0.38036802195959585, + "grad_norm": 0.140625, + "learning_rate": 0.00014030464057651626, + "lm_loss": 1.9185, + "loss": 2.0577, + "mask_loss": 0.1299, + "step": 957, + "topk_loss": 0.0093 + }, + { + "epoch": 0.380765480707725, + "grad_norm": 0.1494140625, + "learning_rate": 0.00014018797009267736, + "lm_loss": 1.9063, + "loss": 2.0482, + "mask_loss": 0.1313, + "step": 958, + "topk_loss": 0.0106 + }, + { + "epoch": 0.3811629394558541, + "grad_norm": 0.1708984375, + "learning_rate": 0.00014007123433157953, + "lm_loss": 1.9527, + "loss": 2.09, + "mask_loss": 0.1279, + "step": 959, + "topk_loss": 0.0094 + }, + { + "epoch": 0.3815603982039833, + "grad_norm": 0.1953125, + "learning_rate": 0.00013995443348283645, + "lm_loss": 1.9635, + "loss": 2.1111, + "mask_loss": 0.1325, + "step": 960, + "topk_loss": 0.0151 + }, + { + "epoch": 0.38195785695211243, + "grad_norm": 0.16015625, + "learning_rate": 0.00013983756773616762, + "lm_loss": 1.9174, + "loss": 2.06, + "mask_loss": 0.1322, + "step": 961, + "topk_loss": 0.0105 + }, + { + "epoch": 0.38235531570024156, + "grad_norm": 0.1318359375, + "learning_rate": 0.0001397206372813978, + "lm_loss": 1.9535, + "loss": 2.0936, + "mask_loss": 0.1304, + "step": 962, + "topk_loss": 0.0098 + }, + { + "epoch": 0.38275277444837075, + "grad_norm": 0.1455078125, + "learning_rate": 0.000139603642308457, + "lm_loss": 1.9739, + "loss": 2.1229, + "mask_loss": 0.1342, + "step": 963, + "topk_loss": 0.0148 + }, + { + "epoch": 0.3831502331964999, + "grad_norm": 0.177734375, + "learning_rate": 0.00013948658300737998, + "lm_loss": 1.9771, + "loss": 2.1228, + "mask_loss": 0.1316, + "step": 964, + "topk_loss": 0.014 + }, + { + "epoch": 0.383547691944629, + "grad_norm": 0.1474609375, + "learning_rate": 0.00013936945956830602, + "lm_loss": 1.9432, + "loss": 2.0818, + "mask_loss": 0.1284, + "step": 965, + "topk_loss": 0.0101 + }, + { + "epoch": 0.3839451506927582, + "grad_norm": 0.16796875, + "learning_rate": 0.00013925227218147847, + "lm_loss": 1.9558, + "loss": 2.0974, + "mask_loss": 0.1299, + "step": 966, + "topk_loss": 0.0117 + }, + { + "epoch": 0.38434260944088733, + "grad_norm": 0.134765625, + "learning_rate": 0.00013913502103724468, + "lm_loss": 1.9696, + "loss": 2.1086, + "mask_loss": 0.1284, + "step": 967, + "topk_loss": 0.0106 + }, + { + "epoch": 0.38474006818901646, + "grad_norm": 0.18359375, + "learning_rate": 0.00013901770632605547, + "lm_loss": 2.0012, + "loss": 2.144, + "mask_loss": 0.1296, + "step": 968, + "topk_loss": 0.0132 + }, + { + "epoch": 0.38513752693714565, + "grad_norm": 0.126953125, + "learning_rate": 0.00013890032823846496, + "lm_loss": 1.8701, + "loss": 2.0099, + "mask_loss": 0.1298, + "step": 969, + "topk_loss": 0.01 + }, + { + "epoch": 0.3855349856852748, + "grad_norm": 0.1552734375, + "learning_rate": 0.00013878288696513022, + "lm_loss": 1.9742, + "loss": 2.1138, + "mask_loss": 0.1303, + "step": 970, + "topk_loss": 0.0092 + }, + { + "epoch": 0.3859324444334039, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001386653826968109, + "lm_loss": 1.8972, + "loss": 2.0379, + "mask_loss": 0.1309, + "step": 971, + "topk_loss": 0.0099 + }, + { + "epoch": 0.3863299031815331, + "grad_norm": 0.12890625, + "learning_rate": 0.00013854781562436906, + "lm_loss": 1.954, + "loss": 2.0933, + "mask_loss": 0.1298, + "step": 972, + "topk_loss": 0.0096 + }, + { + "epoch": 0.38672736192966223, + "grad_norm": 0.1455078125, + "learning_rate": 0.00013843018593876868, + "lm_loss": 1.9546, + "loss": 2.101, + "mask_loss": 0.1339, + "step": 973, + "topk_loss": 0.0126 + }, + { + "epoch": 0.38712482067779136, + "grad_norm": 0.19140625, + "learning_rate": 0.00013831249383107545, + "lm_loss": 1.9676, + "loss": 2.1118, + "mask_loss": 0.1325, + "step": 974, + "topk_loss": 0.0116 + }, + { + "epoch": 0.38752227942592055, + "grad_norm": 0.1494140625, + "learning_rate": 0.00013819473949245654, + "lm_loss": 1.9611, + "loss": 2.1034, + "mask_loss": 0.1309, + "step": 975, + "topk_loss": 0.0114 + }, + { + "epoch": 0.3879197381740497, + "grad_norm": 0.138671875, + "learning_rate": 0.0001380769231141801, + "lm_loss": 1.962, + "loss": 2.1035, + "mask_loss": 0.131, + "step": 976, + "topk_loss": 0.0106 + }, + { + "epoch": 0.3883171969221788, + "grad_norm": 0.1494140625, + "learning_rate": 0.00013795904488761516, + "lm_loss": 1.9095, + "loss": 2.0576, + "mask_loss": 0.1336, + "step": 977, + "topk_loss": 0.0145 + }, + { + "epoch": 0.388714655670308, + "grad_norm": 0.259765625, + "learning_rate": 0.00013784110500423104, + "lm_loss": 1.937, + "loss": 2.0796, + "mask_loss": 0.1311, + "step": 978, + "topk_loss": 0.0114 + }, + { + "epoch": 0.3891121144184371, + "grad_norm": 0.1396484375, + "learning_rate": 0.0001377231036555974, + "lm_loss": 2.0023, + "loss": 2.1457, + "mask_loss": 0.1311, + "step": 979, + "topk_loss": 0.0123 + }, + { + "epoch": 0.38950957316656626, + "grad_norm": 0.134765625, + "learning_rate": 0.0001376050410333836, + "lm_loss": 1.9727, + "loss": 2.1118, + "mask_loss": 0.1298, + "step": 980, + "topk_loss": 0.0092 + }, + { + "epoch": 0.3899070319146954, + "grad_norm": 0.2060546875, + "learning_rate": 0.00013748691732935864, + "lm_loss": 1.9375, + "loss": 2.0856, + "mask_loss": 0.1326, + "step": 981, + "topk_loss": 0.0155 + }, + { + "epoch": 0.3903044906628246, + "grad_norm": 0.216796875, + "learning_rate": 0.00013736873273539058, + "lm_loss": 1.9246, + "loss": 2.0686, + "mask_loss": 0.1329, + "step": 982, + "topk_loss": 0.0111 + }, + { + "epoch": 0.3907019494109537, + "grad_norm": 0.1572265625, + "learning_rate": 0.00013725048744344658, + "lm_loss": 1.9293, + "loss": 2.0731, + "mask_loss": 0.1322, + "step": 983, + "topk_loss": 0.0116 + }, + { + "epoch": 0.39109940815908284, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013713218164559222, + "lm_loss": 2.0246, + "loss": 2.1643, + "mask_loss": 0.1287, + "step": 984, + "topk_loss": 0.0109 + }, + { + "epoch": 0.391496866907212, + "grad_norm": 0.162109375, + "learning_rate": 0.00013701381553399145, + "lm_loss": 1.9579, + "loss": 2.0978, + "mask_loss": 0.1298, + "step": 985, + "topk_loss": 0.0101 + }, + { + "epoch": 0.39189432565534116, + "grad_norm": 0.1982421875, + "learning_rate": 0.00013689538930090618, + "lm_loss": 1.9602, + "loss": 2.1008, + "mask_loss": 0.1293, + "step": 986, + "topk_loss": 0.0113 + }, + { + "epoch": 0.3922917844034703, + "grad_norm": 0.1376953125, + "learning_rate": 0.00013677690313869593, + "lm_loss": 1.9438, + "loss": 2.0823, + "mask_loss": 0.1284, + "step": 987, + "topk_loss": 0.0102 + }, + { + "epoch": 0.3926892431515995, + "grad_norm": 0.1416015625, + "learning_rate": 0.0001366583572398176, + "lm_loss": 1.9037, + "loss": 2.0451, + "mask_loss": 0.1307, + "step": 988, + "topk_loss": 0.0107 + }, + { + "epoch": 0.3930867018997286, + "grad_norm": 0.138671875, + "learning_rate": 0.00013653975179682515, + "lm_loss": 2.012, + "loss": 2.15, + "mask_loss": 0.1281, + "step": 989, + "topk_loss": 0.0099 + }, + { + "epoch": 0.39348416064785774, + "grad_norm": 0.18359375, + "learning_rate": 0.00013642108700236916, + "lm_loss": 1.9912, + "loss": 2.1324, + "mask_loss": 0.1305, + "step": 990, + "topk_loss": 0.0107 + }, + { + "epoch": 0.3938816193959869, + "grad_norm": 0.173828125, + "learning_rate": 0.00013630236304919673, + "lm_loss": 1.9203, + "loss": 2.0621, + "mask_loss": 0.1303, + "step": 991, + "topk_loss": 0.0115 + }, + { + "epoch": 0.39427907814411606, + "grad_norm": 0.1279296875, + "learning_rate": 0.00013618358013015098, + "lm_loss": 1.9615, + "loss": 2.1036, + "mask_loss": 0.1313, + "step": 992, + "topk_loss": 0.0109 + }, + { + "epoch": 0.3946765368922452, + "grad_norm": 0.1357421875, + "learning_rate": 0.00013606473843817086, + "lm_loss": 1.9148, + "loss": 2.0565, + "mask_loss": 0.1302, + "step": 993, + "topk_loss": 0.0114 + }, + { + "epoch": 0.3950739956403744, + "grad_norm": 0.146484375, + "learning_rate": 0.0001359458381662907, + "lm_loss": 2.0316, + "loss": 2.1717, + "mask_loss": 0.1303, + "step": 994, + "topk_loss": 0.0098 + }, + { + "epoch": 0.3954714543885035, + "grad_norm": 0.158203125, + "learning_rate": 0.00013582687950764, + "lm_loss": 1.9824, + "loss": 2.1226, + "mask_loss": 0.1288, + "step": 995, + "topk_loss": 0.0115 + }, + { + "epoch": 0.39586891313663264, + "grad_norm": 0.21875, + "learning_rate": 0.0001357078626554432, + "lm_loss": 1.9552, + "loss": 2.0961, + "mask_loss": 0.1303, + "step": 996, + "topk_loss": 0.0106 + }, + { + "epoch": 0.3962663718847618, + "grad_norm": 0.1591796875, + "learning_rate": 0.00013558878780301918, + "lm_loss": 1.8611, + "loss": 2.0037, + "mask_loss": 0.1321, + "step": 997, + "topk_loss": 0.0104 + }, + { + "epoch": 0.39666383063289096, + "grad_norm": 0.2177734375, + "learning_rate": 0.000135469655143781, + "lm_loss": 1.9426, + "loss": 2.0828, + "mask_loss": 0.1294, + "step": 998, + "topk_loss": 0.0108 + }, + { + "epoch": 0.3970612893810201, + "grad_norm": 0.16015625, + "learning_rate": 0.0001353504648712357, + "lm_loss": 1.9499, + "loss": 2.0884, + "mask_loss": 0.1291, + "step": 999, + "topk_loss": 0.0095 + }, + { + "epoch": 0.3974587481291493, + "grad_norm": 0.142578125, + "learning_rate": 0.00013523121717898387, + "lm_loss": 1.9975, + "loss": 2.1404, + "mask_loss": 0.1318, + "step": 1000, + "topk_loss": 0.0111 + }, + { + "epoch": 0.3974587481291493, + "eval_lm_loss": 690.0011596679688, + "eval_loss": 690.1435546875, + "eval_mask_hit_rate": 0.5240002870559692, + "eval_mask_loss": 0.1288737952709198, + "eval_mask_top_10_hit_rate": 0.9831773638725281, + "eval_mask_top_1_hit_rate": 0.9968259334564209, + "eval_mask_top_20_hit_rate": 0.9727537631988525, + "eval_mask_top_5_hit_rate": 0.9891763925552368, + "eval_runtime": 144.1675, + "eval_samples_per_second": 14.206, + "eval_steps_per_second": 7.103, + "eval_token_accuracy": 0.6087426543235779, + "eval_top_k_diff": -517.8064575195312, + "eval_topk_loss": 0.013575425371527672, + "step": 1000 + }, + { + "epoch": 0.3978562068772784, + "grad_norm": 0.1318359375, + "learning_rate": 0.00013511191226071932, + "lm_loss": 1.9351, + "loss": 2.0778, + "mask_loss": 0.1313, + "step": 1001, + "topk_loss": 0.0114 + }, + { + "epoch": 0.39825366562540754, + "grad_norm": 0.2451171875, + "learning_rate": 0.00013499255031022885, + "lm_loss": 1.9526, + "loss": 2.0903, + "mask_loss": 0.1272, + "step": 1002, + "topk_loss": 0.0105 + }, + { + "epoch": 0.3986511243735367, + "grad_norm": 0.197265625, + "learning_rate": 0.0001348731315213919, + "lm_loss": 1.9861, + "loss": 2.126, + "mask_loss": 0.1294, + "step": 1003, + "topk_loss": 0.0105 + }, + { + "epoch": 0.39904858312166586, + "grad_norm": 0.14453125, + "learning_rate": 0.00013475365608818027, + "lm_loss": 1.927, + "loss": 2.0676, + "mask_loss": 0.1305, + "step": 1004, + "topk_loss": 0.0102 + }, + { + "epoch": 0.399446041869795, + "grad_norm": 0.140625, + "learning_rate": 0.00013463412420465767, + "lm_loss": 1.9248, + "loss": 2.0663, + "mask_loss": 0.1296, + "step": 1005, + "topk_loss": 0.0119 + }, + { + "epoch": 0.3998435006179242, + "grad_norm": 0.171875, + "learning_rate": 0.00013451453606497956, + "lm_loss": 1.9101, + "loss": 2.0537, + "mask_loss": 0.1307, + "step": 1006, + "topk_loss": 0.0129 + }, + { + "epoch": 0.4002409593660533, + "grad_norm": 0.1943359375, + "learning_rate": 0.00013439489186339282, + "lm_loss": 1.9559, + "loss": 2.0934, + "mask_loss": 0.1275, + "step": 1007, + "topk_loss": 0.0101 + }, + { + "epoch": 0.40063841811418244, + "grad_norm": 0.130859375, + "learning_rate": 0.00013427519179423528, + "lm_loss": 1.8883, + "loss": 2.0283, + "mask_loss": 0.1303, + "step": 1008, + "topk_loss": 0.0097 + }, + { + "epoch": 0.40103587686231157, + "grad_norm": 0.134765625, + "learning_rate": 0.00013415543605193567, + "lm_loss": 1.9815, + "loss": 2.1222, + "mask_loss": 0.13, + "step": 1009, + "topk_loss": 0.0107 + }, + { + "epoch": 0.40143333561044076, + "grad_norm": 0.1845703125, + "learning_rate": 0.00013403562483101298, + "lm_loss": 1.9476, + "loss": 2.0912, + "mask_loss": 0.1309, + "step": 1010, + "topk_loss": 0.0127 + }, + { + "epoch": 0.4018307943585699, + "grad_norm": 0.142578125, + "learning_rate": 0.00013391575832607643, + "lm_loss": 2.0251, + "loss": 2.1652, + "mask_loss": 0.1289, + "step": 1011, + "topk_loss": 0.0111 + }, + { + "epoch": 0.402228253106699, + "grad_norm": 0.1416015625, + "learning_rate": 0.000133795836731825, + "lm_loss": 1.9069, + "loss": 2.0469, + "mask_loss": 0.1302, + "step": 1012, + "topk_loss": 0.0098 + }, + { + "epoch": 0.4026257118548282, + "grad_norm": 0.1279296875, + "learning_rate": 0.00013367586024304714, + "lm_loss": 1.9828, + "loss": 2.1196, + "mask_loss": 0.1276, + "step": 1013, + "topk_loss": 0.0093 + }, + { + "epoch": 0.40302317060295734, + "grad_norm": 0.189453125, + "learning_rate": 0.0001335558290546205, + "lm_loss": 2.0127, + "loss": 2.1539, + "mask_loss": 0.1295, + "step": 1014, + "topk_loss": 0.0118 + }, + { + "epoch": 0.40342062935108647, + "grad_norm": 0.169921875, + "learning_rate": 0.00013343574336151153, + "lm_loss": 1.918, + "loss": 2.0595, + "mask_loss": 0.1293, + "step": 1015, + "topk_loss": 0.0123 + }, + { + "epoch": 0.40381808809921566, + "grad_norm": 0.1376953125, + "learning_rate": 0.00013331560335877525, + "lm_loss": 1.9773, + "loss": 2.1191, + "mask_loss": 0.1314, + "step": 1016, + "topk_loss": 0.0104 + }, + { + "epoch": 0.4042155468473448, + "grad_norm": 0.1328125, + "learning_rate": 0.0001331954092415549, + "lm_loss": 1.98, + "loss": 2.1209, + "mask_loss": 0.1302, + "step": 1017, + "topk_loss": 0.0106 + }, + { + "epoch": 0.4046130055954739, + "grad_norm": 0.1396484375, + "learning_rate": 0.00013307516120508161, + "lm_loss": 1.9573, + "loss": 2.0995, + "mask_loss": 0.1317, + "step": 1018, + "topk_loss": 0.0105 + }, + { + "epoch": 0.4050104643436031, + "grad_norm": 0.138671875, + "learning_rate": 0.00013295485944467405, + "lm_loss": 2.0041, + "loss": 2.1453, + "mask_loss": 0.1314, + "step": 1019, + "topk_loss": 0.0099 + }, + { + "epoch": 0.40540792309173224, + "grad_norm": 0.14453125, + "learning_rate": 0.0001328345041557382, + "lm_loss": 1.8941, + "loss": 2.0372, + "mask_loss": 0.1321, + "step": 1020, + "topk_loss": 0.011 + }, + { + "epoch": 0.40580538183986137, + "grad_norm": 0.12890625, + "learning_rate": 0.000132714095533767, + "lm_loss": 1.8778, + "loss": 2.0211, + "mask_loss": 0.1318, + "step": 1021, + "topk_loss": 0.0115 + }, + { + "epoch": 0.40620284058799055, + "grad_norm": 0.12353515625, + "learning_rate": 0.00013259363377433994, + "lm_loss": 1.962, + "loss": 2.0981, + "mask_loss": 0.1272, + "step": 1022, + "topk_loss": 0.0089 + }, + { + "epoch": 0.4066002993361197, + "grad_norm": 0.12353515625, + "learning_rate": 0.0001324731190731229, + "lm_loss": 1.9555, + "loss": 2.0984, + "mask_loss": 0.1309, + "step": 1023, + "topk_loss": 0.012 + }, + { + "epoch": 0.4069977580842488, + "grad_norm": 0.1484375, + "learning_rate": 0.00013235255162586773, + "lm_loss": 1.9699, + "loss": 2.1104, + "mask_loss": 0.1297, + "step": 1024, + "topk_loss": 0.0108 + }, + { + "epoch": 0.407395216832378, + "grad_norm": 0.1572265625, + "learning_rate": 0.000132231931628412, + "lm_loss": 1.8687, + "loss": 2.0083, + "mask_loss": 0.1294, + "step": 1025, + "topk_loss": 0.0103 + }, + { + "epoch": 0.40779267558050714, + "grad_norm": 0.2080078125, + "learning_rate": 0.0001321112592766785, + "lm_loss": 1.8635, + "loss": 2.0047, + "mask_loss": 0.1306, + "step": 1026, + "topk_loss": 0.0106 + }, + { + "epoch": 0.40819013432863627, + "grad_norm": 0.130859375, + "learning_rate": 0.00013199053476667518, + "lm_loss": 1.9358, + "loss": 2.0763, + "mask_loss": 0.1294, + "step": 1027, + "topk_loss": 0.0111 + }, + { + "epoch": 0.40858759307676545, + "grad_norm": 0.259765625, + "learning_rate": 0.0001318697582944947, + "lm_loss": 1.9778, + "loss": 2.12, + "mask_loss": 0.1299, + "step": 1028, + "topk_loss": 0.0122 + }, + { + "epoch": 0.4089850518248946, + "grad_norm": 0.2060546875, + "learning_rate": 0.00013174893005631414, + "lm_loss": 1.9636, + "loss": 2.1044, + "mask_loss": 0.1295, + "step": 1029, + "topk_loss": 0.0113 + }, + { + "epoch": 0.4093825105730237, + "grad_norm": 0.1259765625, + "learning_rate": 0.00013162805024839448, + "lm_loss": 1.9506, + "loss": 2.0864, + "mask_loss": 0.1263, + "step": 1030, + "topk_loss": 0.0094 + }, + { + "epoch": 0.4097799693211529, + "grad_norm": 0.1416015625, + "learning_rate": 0.00013150711906708077, + "lm_loss": 1.9486, + "loss": 2.0921, + "mask_loss": 0.1325, + "step": 1031, + "topk_loss": 0.0109 + }, + { + "epoch": 0.41017742806928204, + "grad_norm": 0.177734375, + "learning_rate": 0.00013138613670880123, + "lm_loss": 1.9334, + "loss": 2.0708, + "mask_loss": 0.1281, + "step": 1032, + "topk_loss": 0.0093 + }, + { + "epoch": 0.41057488681741117, + "grad_norm": 0.16015625, + "learning_rate": 0.0001312651033700674, + "lm_loss": 1.9784, + "loss": 2.118, + "mask_loss": 0.1295, + "step": 1033, + "topk_loss": 0.0101 + }, + { + "epoch": 0.41097234556554035, + "grad_norm": 0.2119140625, + "learning_rate": 0.0001311440192474735, + "lm_loss": 1.9494, + "loss": 2.0924, + "mask_loss": 0.1305, + "step": 1034, + "topk_loss": 0.0125 + }, + { + "epoch": 0.4113698043136695, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013102288453769632, + "lm_loss": 1.9732, + "loss": 2.1122, + "mask_loss": 0.1275, + "step": 1035, + "topk_loss": 0.0115 + }, + { + "epoch": 0.4117672630617986, + "grad_norm": 0.13671875, + "learning_rate": 0.00013090169943749476, + "lm_loss": 1.9036, + "loss": 2.0438, + "mask_loss": 0.1301, + "step": 1036, + "topk_loss": 0.0101 + }, + { + "epoch": 0.41216472180992775, + "grad_norm": 0.19140625, + "learning_rate": 0.0001307804641437096, + "lm_loss": 1.88, + "loss": 2.0215, + "mask_loss": 0.1311, + "step": 1037, + "topk_loss": 0.0104 + }, + { + "epoch": 0.41256218055805693, + "grad_norm": 0.318359375, + "learning_rate": 0.00013065917885326313, + "lm_loss": 1.9213, + "loss": 2.1418, + "mask_loss": 0.1812, + "step": 1038, + "topk_loss": 0.0393 + }, + { + "epoch": 0.41295963930618607, + "grad_norm": 0.2470703125, + "learning_rate": 0.00013053784376315888, + "lm_loss": 1.8941, + "loss": 2.0317, + "mask_loss": 0.1272, + "step": 1039, + "topk_loss": 0.0105 + }, + { + "epoch": 0.4133570980543152, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001304164590704813, + "lm_loss": 1.9242, + "loss": 2.0638, + "mask_loss": 0.1287, + "step": 1040, + "topk_loss": 0.0109 + }, + { + "epoch": 0.4137545568024444, + "grad_norm": 0.1748046875, + "learning_rate": 0.00013029502497239533, + "lm_loss": 1.9485, + "loss": 2.0907, + "mask_loss": 0.1297, + "step": 1041, + "topk_loss": 0.0125 + }, + { + "epoch": 0.4141520155505735, + "grad_norm": 0.1689453125, + "learning_rate": 0.00013017354166614613, + "lm_loss": 1.9211, + "loss": 2.061, + "mask_loss": 0.1294, + "step": 1042, + "topk_loss": 0.0106 + }, + { + "epoch": 0.41454947429870265, + "grad_norm": 0.2021484375, + "learning_rate": 0.0001300520093490589, + "lm_loss": 1.9634, + "loss": 2.1031, + "mask_loss": 0.1288, + "step": 1043, + "topk_loss": 0.0109 + }, + { + "epoch": 0.41494693304683183, + "grad_norm": 0.193359375, + "learning_rate": 0.0001299304282185384, + "lm_loss": 1.9653, + "loss": 2.1087, + "mask_loss": 0.1307, + "step": 1044, + "topk_loss": 0.0126 + }, + { + "epoch": 0.41534439179496097, + "grad_norm": 0.142578125, + "learning_rate": 0.0001298087984720687, + "lm_loss": 1.9501, + "loss": 2.0898, + "mask_loss": 0.1295, + "step": 1045, + "topk_loss": 0.0102 + }, + { + "epoch": 0.4157418505430901, + "grad_norm": 0.14453125, + "learning_rate": 0.00012968712030721278, + "lm_loss": 1.9013, + "loss": 2.0384, + "mask_loss": 0.1279, + "step": 1046, + "topk_loss": 0.0092 + }, + { + "epoch": 0.4161393092912193, + "grad_norm": 0.166015625, + "learning_rate": 0.00012956539392161229, + "lm_loss": 1.91, + "loss": 2.0526, + "mask_loss": 0.1311, + "step": 1047, + "topk_loss": 0.0115 + }, + { + "epoch": 0.4165367680393484, + "grad_norm": 0.1552734375, + "learning_rate": 0.00012944361951298722, + "lm_loss": 1.9418, + "loss": 2.08, + "mask_loss": 0.1285, + "step": 1048, + "topk_loss": 0.0097 + }, + { + "epoch": 0.41693422678747755, + "grad_norm": 0.1337890625, + "learning_rate": 0.0001293217972791356, + "lm_loss": 1.9176, + "loss": 2.0586, + "mask_loss": 0.1306, + "step": 1049, + "topk_loss": 0.0104 + }, + { + "epoch": 0.41733168553560673, + "grad_norm": 0.1220703125, + "learning_rate": 0.00012919992741793307, + "lm_loss": 1.9382, + "loss": 2.0761, + "mask_loss": 0.128, + "step": 1050, + "topk_loss": 0.0099 + }, + { + "epoch": 0.41733168553560673, + "eval_lm_loss": 689.869384765625, + "eval_loss": 690.0109252929688, + "eval_mask_hit_rate": 0.5258909463882446, + "eval_mask_loss": 0.12821093201637268, + "eval_mask_top_10_hit_rate": 0.9835715293884277, + "eval_mask_top_1_hit_rate": 0.9969048500061035, + "eval_mask_top_20_hit_rate": 0.9733020067214966, + "eval_mask_top_5_hit_rate": 0.9894579648971558, + "eval_runtime": 143.5444, + "eval_samples_per_second": 14.267, + "eval_steps_per_second": 7.134, + "eval_token_accuracy": 0.6096411943435669, + "eval_top_k_diff": -523.7191162109375, + "eval_topk_loss": 0.013318357057869434, + "step": 1050 + }, + { + "epoch": 0.41772914428373586, + "grad_norm": 0.1962890625, + "learning_rate": 0.0001290780101273326, + "lm_loss": 1.943, + "loss": 2.1011, + "mask_loss": 0.1379, + "step": 1051, + "topk_loss": 0.0202 + }, + { + "epoch": 0.418126603031865, + "grad_norm": 0.134765625, + "learning_rate": 0.00012895604560536435, + "lm_loss": 1.9621, + "loss": 2.0999, + "mask_loss": 0.1288, + "step": 1052, + "topk_loss": 0.009 + }, + { + "epoch": 0.4185240617799942, + "grad_norm": 0.1328125, + "learning_rate": 0.0001288340340501351, + "lm_loss": 1.9349, + "loss": 2.0802, + "mask_loss": 0.1337, + "step": 1053, + "topk_loss": 0.0115 + }, + { + "epoch": 0.4189215205281233, + "grad_norm": 0.173828125, + "learning_rate": 0.000128711975659828, + "lm_loss": 1.9396, + "loss": 2.077, + "mask_loss": 0.1274, + "step": 1054, + "topk_loss": 0.01 + }, + { + "epoch": 0.41931897927625245, + "grad_norm": 0.11962890625, + "learning_rate": 0.0001285898706327023, + "lm_loss": 1.92, + "loss": 2.0605, + "mask_loss": 0.1302, + "step": 1055, + "topk_loss": 0.0103 + }, + { + "epoch": 0.41971643802438163, + "grad_norm": 0.12890625, + "learning_rate": 0.00012846771916709304, + "lm_loss": 1.855, + "loss": 1.9959, + "mask_loss": 0.1316, + "step": 1056, + "topk_loss": 0.0093 + }, + { + "epoch": 0.42011389677251076, + "grad_norm": 0.134765625, + "learning_rate": 0.00012834552146141065, + "lm_loss": 1.9188, + "loss": 2.0576, + "mask_loss": 0.1292, + "step": 1057, + "topk_loss": 0.0095 + }, + { + "epoch": 0.4205113555206399, + "grad_norm": 0.1455078125, + "learning_rate": 0.00012822327771414067, + "lm_loss": 1.9327, + "loss": 2.0795, + "mask_loss": 0.1333, + "step": 1058, + "topk_loss": 0.0134 + }, + { + "epoch": 0.4209088142687691, + "grad_norm": 0.16796875, + "learning_rate": 0.00012810098812384346, + "lm_loss": 1.8952, + "loss": 2.0352, + "mask_loss": 0.129, + "step": 1059, + "topk_loss": 0.011 + }, + { + "epoch": 0.4213062730168982, + "grad_norm": 0.234375, + "learning_rate": 0.0001279786528891538, + "lm_loss": 1.8074, + "loss": 1.9475, + "mask_loss": 0.1296, + "step": 1060, + "topk_loss": 0.0105 + }, + { + "epoch": 0.42170373176502735, + "grad_norm": 0.1259765625, + "learning_rate": 0.0001278562722087806, + "lm_loss": 1.9386, + "loss": 2.0806, + "mask_loss": 0.1302, + "step": 1061, + "topk_loss": 0.0117 + }, + { + "epoch": 0.42210119051315653, + "grad_norm": 0.154296875, + "learning_rate": 0.00012773384628150667, + "lm_loss": 1.8896, + "loss": 2.0308, + "mask_loss": 0.13, + "step": 1062, + "topk_loss": 0.0112 + }, + { + "epoch": 0.42249864926128566, + "grad_norm": 0.12890625, + "learning_rate": 0.0001276113753061882, + "lm_loss": 1.9116, + "loss": 2.0539, + "mask_loss": 0.1319, + "step": 1063, + "topk_loss": 0.0104 + }, + { + "epoch": 0.4228961080094148, + "grad_norm": 0.1728515625, + "learning_rate": 0.00012748885948175466, + "lm_loss": 1.8769, + "loss": 2.0163, + "mask_loss": 0.1297, + "step": 1064, + "topk_loss": 0.0098 + }, + { + "epoch": 0.4232935667575439, + "grad_norm": 0.19921875, + "learning_rate": 0.0001273662990072083, + "lm_loss": 1.9988, + "loss": 2.1388, + "mask_loss": 0.1285, + "step": 1065, + "topk_loss": 0.0114 + }, + { + "epoch": 0.4236910255056731, + "grad_norm": 0.1796875, + "learning_rate": 0.0001272436940816239, + "lm_loss": 1.9298, + "loss": 2.0734, + "mask_loss": 0.1302, + "step": 1066, + "topk_loss": 0.0133 + }, + { + "epoch": 0.42408848425380224, + "grad_norm": 0.130859375, + "learning_rate": 0.00012712104490414844, + "lm_loss": 1.9186, + "loss": 2.0582, + "mask_loss": 0.129, + "step": 1067, + "topk_loss": 0.0106 + }, + { + "epoch": 0.4244859430019314, + "grad_norm": 0.2099609375, + "learning_rate": 0.00012699835167400084, + "lm_loss": 1.9561, + "loss": 2.0928, + "mask_loss": 0.128, + "step": 1068, + "topk_loss": 0.0088 + }, + { + "epoch": 0.42488340175006056, + "grad_norm": 0.22265625, + "learning_rate": 0.0001268756145904715, + "lm_loss": 1.9269, + "loss": 2.0654, + "mask_loss": 0.1281, + "step": 1069, + "topk_loss": 0.0104 + }, + { + "epoch": 0.4252808604981897, + "grad_norm": 0.1376953125, + "learning_rate": 0.00012675283385292212, + "lm_loss": 1.948, + "loss": 2.0851, + "mask_loss": 0.128, + "step": 1070, + "topk_loss": 0.0092 + }, + { + "epoch": 0.4256783192463188, + "grad_norm": 0.1376953125, + "learning_rate": 0.00012663000966078516, + "lm_loss": 1.9498, + "loss": 2.0969, + "mask_loss": 0.1344, + "step": 1071, + "topk_loss": 0.0128 + }, + { + "epoch": 0.426075777994448, + "grad_norm": 0.126953125, + "learning_rate": 0.00012650714221356388, + "lm_loss": 1.9578, + "loss": 2.0974, + "mask_loss": 0.1282, + "step": 1072, + "topk_loss": 0.0114 + }, + { + "epoch": 0.42647323674257714, + "grad_norm": 0.1328125, + "learning_rate": 0.00012638423171083165, + "lm_loss": 1.9249, + "loss": 2.0654, + "mask_loss": 0.1309, + "step": 1073, + "topk_loss": 0.0096 + }, + { + "epoch": 0.4268706954907063, + "grad_norm": 0.1806640625, + "learning_rate": 0.00012626127835223177, + "lm_loss": 1.9757, + "loss": 2.1239, + "mask_loss": 0.1333, + "step": 1074, + "topk_loss": 0.0149 + }, + { + "epoch": 0.42726815423883546, + "grad_norm": 0.224609375, + "learning_rate": 0.00012613828233747727, + "lm_loss": 2.0369, + "loss": 2.1774, + "mask_loss": 0.1285, + "step": 1075, + "topk_loss": 0.0119 + }, + { + "epoch": 0.4276656129869646, + "grad_norm": 0.1259765625, + "learning_rate": 0.00012601524386635036, + "lm_loss": 1.9857, + "loss": 2.1217, + "mask_loss": 0.1263, + "step": 1076, + "topk_loss": 0.0097 + }, + { + "epoch": 0.4280630717350937, + "grad_norm": 0.1298828125, + "learning_rate": 0.00012589216313870223, + "lm_loss": 1.8912, + "loss": 2.0309, + "mask_loss": 0.1286, + "step": 1077, + "topk_loss": 0.011 + }, + { + "epoch": 0.4284605304832229, + "grad_norm": 0.15625, + "learning_rate": 0.0001257690403544527, + "lm_loss": 1.8908, + "loss": 2.0333, + "mask_loss": 0.1294, + "step": 1078, + "topk_loss": 0.0131 + }, + { + "epoch": 0.42885798923135204, + "grad_norm": 0.1259765625, + "learning_rate": 0.00012564587571359, + "lm_loss": 1.9254, + "loss": 2.0659, + "mask_loss": 0.1293, + "step": 1079, + "topk_loss": 0.0112 + }, + { + "epoch": 0.4292554479794812, + "grad_norm": 0.2099609375, + "learning_rate": 0.00012552266941617018, + "lm_loss": 1.9031, + "loss": 2.0396, + "mask_loss": 0.1272, + "step": 1080, + "topk_loss": 0.0093 + }, + { + "epoch": 0.42965290672761036, + "grad_norm": 0.1279296875, + "learning_rate": 0.00012539942166231712, + "lm_loss": 2.0051, + "loss": 2.1465, + "mask_loss": 0.1295, + "step": 1081, + "topk_loss": 0.0119 + }, + { + "epoch": 0.4300503654757395, + "grad_norm": 0.1298828125, + "learning_rate": 0.00012527613265222187, + "lm_loss": 1.9319, + "loss": 2.0702, + "mask_loss": 0.1279, + "step": 1082, + "topk_loss": 0.0104 + }, + { + "epoch": 0.4304478242238686, + "grad_norm": 0.158203125, + "learning_rate": 0.00012515280258614266, + "lm_loss": 1.9023, + "loss": 2.0417, + "mask_loss": 0.1298, + "step": 1083, + "topk_loss": 0.0096 + }, + { + "epoch": 0.4308452829719978, + "grad_norm": 0.1357421875, + "learning_rate": 0.0001250294316644043, + "lm_loss": 1.9735, + "loss": 2.1158, + "mask_loss": 0.1309, + "step": 1084, + "topk_loss": 0.0115 + }, + { + "epoch": 0.43124274172012694, + "grad_norm": 0.16015625, + "learning_rate": 0.000124906020087398, + "lm_loss": 1.8906, + "loss": 2.0301, + "mask_loss": 0.1297, + "step": 1085, + "topk_loss": 0.0097 + }, + { + "epoch": 0.4316402004682561, + "grad_norm": 0.1396484375, + "learning_rate": 0.000124782568055581, + "lm_loss": 1.9388, + "loss": 2.0832, + "mask_loss": 0.1321, + "step": 1086, + "topk_loss": 0.0123 + }, + { + "epoch": 0.43203765921638526, + "grad_norm": 0.12890625, + "learning_rate": 0.00012465907576947622, + "lm_loss": 1.8991, + "loss": 2.0419, + "mask_loss": 0.1317, + "step": 1087, + "topk_loss": 0.0111 + }, + { + "epoch": 0.4324351179645144, + "grad_norm": 0.130859375, + "learning_rate": 0.000124535543429672, + "lm_loss": 1.937, + "loss": 2.0774, + "mask_loss": 0.1283, + "step": 1088, + "topk_loss": 0.0121 + }, + { + "epoch": 0.4328325767126435, + "grad_norm": 0.17578125, + "learning_rate": 0.0001244119712368218, + "lm_loss": 1.8791, + "loss": 2.016, + "mask_loss": 0.1273, + "step": 1089, + "topk_loss": 0.0096 + }, + { + "epoch": 0.4332300354607727, + "grad_norm": 0.125, + "learning_rate": 0.00012428835939164363, + "lm_loss": 1.9085, + "loss": 2.0468, + "mask_loss": 0.1281, + "step": 1090, + "topk_loss": 0.0102 + }, + { + "epoch": 0.43362749420890184, + "grad_norm": 0.1806640625, + "learning_rate": 0.00012416470809492011, + "lm_loss": 2.0234, + "loss": 2.1647, + "mask_loss": 0.1301, + "step": 1091, + "topk_loss": 0.0111 + }, + { + "epoch": 0.434024952957031, + "grad_norm": 0.134765625, + "learning_rate": 0.00012404101754749782, + "lm_loss": 1.9514, + "loss": 2.0922, + "mask_loss": 0.1295, + "step": 1092, + "topk_loss": 0.0112 + }, + { + "epoch": 0.4344224117051601, + "grad_norm": 0.173828125, + "learning_rate": 0.00012391728795028718, + "lm_loss": 1.9315, + "loss": 2.0693, + "mask_loss": 0.1275, + "step": 1093, + "topk_loss": 0.0102 + }, + { + "epoch": 0.4348198704532893, + "grad_norm": 0.13671875, + "learning_rate": 0.00012379351950426187, + "lm_loss": 1.9659, + "loss": 2.1042, + "mask_loss": 0.1273, + "step": 1094, + "topk_loss": 0.011 + }, + { + "epoch": 0.4352173292014184, + "grad_norm": 0.1298828125, + "learning_rate": 0.00012366971241045894, + "lm_loss": 1.95, + "loss": 2.089, + "mask_loss": 0.1298, + "step": 1095, + "topk_loss": 0.0092 + }, + { + "epoch": 0.43561478794954756, + "grad_norm": 0.12255859375, + "learning_rate": 0.00012354586686997792, + "lm_loss": 1.9592, + "loss": 2.0967, + "mask_loss": 0.1276, + "step": 1096, + "topk_loss": 0.0099 + }, + { + "epoch": 0.43601224669767674, + "grad_norm": 0.1904296875, + "learning_rate": 0.00012342198308398108, + "lm_loss": 1.9012, + "loss": 2.0601, + "mask_loss": 0.1373, + "step": 1097, + "topk_loss": 0.0216 + }, + { + "epoch": 0.4364097054458059, + "grad_norm": 0.2001953125, + "learning_rate": 0.00012329806125369253, + "lm_loss": 1.8855, + "loss": 2.0276, + "mask_loss": 0.1309, + "step": 1098, + "topk_loss": 0.0112 + }, + { + "epoch": 0.436807164193935, + "grad_norm": 0.185546875, + "learning_rate": 0.0001231741015803984, + "lm_loss": 1.9489, + "loss": 2.0897, + "mask_loss": 0.1292, + "step": 1099, + "topk_loss": 0.0117 + }, + { + "epoch": 0.4372046229420642, + "grad_norm": 0.1318359375, + "learning_rate": 0.00012305010426544614, + "lm_loss": 1.8913, + "loss": 2.0306, + "mask_loss": 0.1289, + "step": 1100, + "topk_loss": 0.0105 + }, + { + "epoch": 0.4372046229420642, + "eval_lm_loss": 689.3663330078125, + "eval_loss": 689.50732421875, + "eval_mask_hit_rate": 0.5272812843322754, + "eval_mask_loss": 0.12767279148101807, + "eval_mask_top_10_hit_rate": 0.9839221835136414, + "eval_mask_top_1_hit_rate": 0.9970452785491943, + "eval_mask_top_20_hit_rate": 0.9737967252731323, + "eval_mask_top_5_hit_rate": 0.9897100925445557, + "eval_runtime": 144.2783, + "eval_samples_per_second": 14.195, + "eval_steps_per_second": 7.097, + "eval_token_accuracy": 0.6103806495666504, + "eval_top_k_diff": -526.74169921875, + "eval_topk_loss": 0.013399062678217888, + "step": 1100 + }, + { + "epoch": 0.4376020816901933, + "grad_norm": 0.2060546875, + "learning_rate": 0.00012292606951024447, + "lm_loss": 1.9511, + "loss": 2.0901, + "mask_loss": 0.1288, + "step": 1101, + "topk_loss": 0.0102 + }, + { + "epoch": 0.43799954043832245, + "grad_norm": 0.1279296875, + "learning_rate": 0.00012280199751626278, + "lm_loss": 1.8895, + "loss": 2.0293, + "mask_loss": 0.13, + "step": 1102, + "topk_loss": 0.0098 + }, + { + "epoch": 0.43839699918645164, + "grad_norm": 0.1318359375, + "learning_rate": 0.00012267788848503106, + "lm_loss": 1.9403, + "loss": 2.0793, + "mask_loss": 0.1289, + "step": 1103, + "topk_loss": 0.0101 + }, + { + "epoch": 0.4387944579345808, + "grad_norm": 0.14453125, + "learning_rate": 0.00012255374261813944, + "lm_loss": 1.9576, + "loss": 2.0953, + "mask_loss": 0.1273, + "step": 1104, + "topk_loss": 0.0104 + }, + { + "epoch": 0.4391919166827099, + "grad_norm": 0.1552734375, + "learning_rate": 0.00012242956011723782, + "lm_loss": 1.9051, + "loss": 2.0509, + "mask_loss": 0.1335, + "step": 1105, + "topk_loss": 0.0124 + }, + { + "epoch": 0.4395893754308391, + "grad_norm": 0.12890625, + "learning_rate": 0.00012230534118403568, + "lm_loss": 1.9667, + "loss": 2.1039, + "mask_loss": 0.1273, + "step": 1106, + "topk_loss": 0.0098 + }, + { + "epoch": 0.4399868341789682, + "grad_norm": 0.1435546875, + "learning_rate": 0.00012218108602030163, + "lm_loss": 1.9088, + "loss": 2.0525, + "mask_loss": 0.1317, + "step": 1107, + "topk_loss": 0.012 + }, + { + "epoch": 0.44038429292709735, + "grad_norm": 0.1376953125, + "learning_rate": 0.00012205679482786317, + "lm_loss": 2.0079, + "loss": 2.1486, + "mask_loss": 0.1294, + "step": 1108, + "topk_loss": 0.0113 + }, + { + "epoch": 0.44078175167522654, + "grad_norm": 0.1513671875, + "learning_rate": 0.00012193246780860628, + "lm_loss": 1.973, + "loss": 2.1148, + "mask_loss": 0.1301, + "step": 1109, + "topk_loss": 0.0117 + }, + { + "epoch": 0.44117921042335567, + "grad_norm": 0.1513671875, + "learning_rate": 0.00012180810516447512, + "lm_loss": 2.0177, + "loss": 2.1566, + "mask_loss": 0.1278, + "step": 1110, + "topk_loss": 0.0111 + }, + { + "epoch": 0.4415766691714848, + "grad_norm": 0.13671875, + "learning_rate": 0.00012168370709747177, + "lm_loss": 1.9501, + "loss": 2.0943, + "mask_loss": 0.132, + "step": 1111, + "topk_loss": 0.0122 + }, + { + "epoch": 0.441974127919614, + "grad_norm": 0.140625, + "learning_rate": 0.00012155927380965582, + "lm_loss": 1.9107, + "loss": 2.0503, + "mask_loss": 0.1302, + "step": 1112, + "topk_loss": 0.0095 + }, + { + "epoch": 0.4423715866677431, + "grad_norm": 0.16015625, + "learning_rate": 0.0001214348055031441, + "lm_loss": 1.9949, + "loss": 2.1317, + "mask_loss": 0.127, + "step": 1113, + "topk_loss": 0.0097 + }, + { + "epoch": 0.44276904541587225, + "grad_norm": 0.11865234375, + "learning_rate": 0.00012131030238011025, + "lm_loss": 1.8971, + "loss": 2.0372, + "mask_loss": 0.1292, + "step": 1114, + "topk_loss": 0.0109 + }, + { + "epoch": 0.44316650416400144, + "grad_norm": 0.15625, + "learning_rate": 0.0001211857646427845, + "lm_loss": 1.9859, + "loss": 2.1258, + "mask_loss": 0.128, + "step": 1115, + "topk_loss": 0.0119 + }, + { + "epoch": 0.44356396291213057, + "grad_norm": 0.15234375, + "learning_rate": 0.00012106119249345336, + "lm_loss": 1.9206, + "loss": 2.0607, + "mask_loss": 0.1296, + "step": 1116, + "topk_loss": 0.0105 + }, + { + "epoch": 0.4439614216602597, + "grad_norm": 0.140625, + "learning_rate": 0.00012093658613445913, + "lm_loss": 1.9308, + "loss": 2.0717, + "mask_loss": 0.1296, + "step": 1117, + "topk_loss": 0.0113 + }, + { + "epoch": 0.4443588804083889, + "grad_norm": 0.1630859375, + "learning_rate": 0.00012081194576819974, + "lm_loss": 1.9135, + "loss": 2.0519, + "mask_loss": 0.1285, + "step": 1118, + "topk_loss": 0.0099 + }, + { + "epoch": 0.444756339156518, + "grad_norm": 0.15625, + "learning_rate": 0.00012068727159712838, + "lm_loss": 1.9549, + "loss": 2.0941, + "mask_loss": 0.1289, + "step": 1119, + "topk_loss": 0.0103 + }, + { + "epoch": 0.44515379790464715, + "grad_norm": 0.1640625, + "learning_rate": 0.00012056256382375308, + "lm_loss": 1.9009, + "loss": 2.0382, + "mask_loss": 0.1278, + "step": 1120, + "topk_loss": 0.0095 + }, + { + "epoch": 0.4455512566527763, + "grad_norm": 0.1474609375, + "learning_rate": 0.0001204378226506365, + "lm_loss": 1.9945, + "loss": 2.1307, + "mask_loss": 0.127, + "step": 1121, + "topk_loss": 0.0092 + }, + { + "epoch": 0.44594871540090547, + "grad_norm": 0.12255859375, + "learning_rate": 0.00012031304828039554, + "lm_loss": 1.9242, + "loss": 2.0615, + "mask_loss": 0.1277, + "step": 1122, + "topk_loss": 0.0096 + }, + { + "epoch": 0.4463461741490346, + "grad_norm": 0.138671875, + "learning_rate": 0.00012018824091570103, + "lm_loss": 1.9284, + "loss": 2.0676, + "mask_loss": 0.129, + "step": 1123, + "topk_loss": 0.0101 + }, + { + "epoch": 0.44674363289716373, + "grad_norm": 0.1787109375, + "learning_rate": 0.00012006340075927736, + "lm_loss": 1.9781, + "loss": 2.1182, + "mask_loss": 0.1283, + "step": 1124, + "topk_loss": 0.0118 + }, + { + "epoch": 0.4471410916452929, + "grad_norm": 0.138671875, + "learning_rate": 0.00011993852801390226, + "lm_loss": 1.9751, + "loss": 2.1202, + "mask_loss": 0.1317, + "step": 1125, + "topk_loss": 0.0134 + }, + { + "epoch": 0.44753855039342205, + "grad_norm": 0.12890625, + "learning_rate": 0.00011981362288240627, + "lm_loss": 1.8825, + "loss": 2.0205, + "mask_loss": 0.1295, + "step": 1126, + "topk_loss": 0.0085 + }, + { + "epoch": 0.4479360091415512, + "grad_norm": 0.134765625, + "learning_rate": 0.00011968868556767266, + "lm_loss": 1.9584, + "loss": 2.0997, + "mask_loss": 0.1308, + "step": 1127, + "topk_loss": 0.0105 + }, + { + "epoch": 0.44833346788968037, + "grad_norm": 0.1337890625, + "learning_rate": 0.00011956371627263687, + "lm_loss": 1.9445, + "loss": 2.0809, + "mask_loss": 0.1263, + "step": 1128, + "topk_loss": 0.0101 + }, + { + "epoch": 0.4487309266378095, + "grad_norm": 0.1376953125, + "learning_rate": 0.00011943871520028642, + "lm_loss": 1.9005, + "loss": 2.0399, + "mask_loss": 0.1296, + "step": 1129, + "topk_loss": 0.0098 + }, + { + "epoch": 0.44912838538593863, + "grad_norm": 0.138671875, + "learning_rate": 0.00011931368255366027, + "lm_loss": 1.9449, + "loss": 2.0849, + "mask_loss": 0.1299, + "step": 1130, + "topk_loss": 0.0101 + }, + { + "epoch": 0.4495258441340678, + "grad_norm": 0.134765625, + "learning_rate": 0.0001191886185358488, + "lm_loss": 1.9556, + "loss": 2.0947, + "mask_loss": 0.1281, + "step": 1131, + "topk_loss": 0.011 + }, + { + "epoch": 0.44992330288219695, + "grad_norm": 0.1376953125, + "learning_rate": 0.00011906352334999331, + "lm_loss": 1.926, + "loss": 2.0596, + "mask_loss": 0.1247, + "step": 1132, + "topk_loss": 0.0089 + }, + { + "epoch": 0.4503207616303261, + "grad_norm": 0.1396484375, + "learning_rate": 0.00011893839719928573, + "lm_loss": 1.8941, + "loss": 2.0384, + "mask_loss": 0.1325, + "step": 1133, + "topk_loss": 0.0118 + }, + { + "epoch": 0.45071822037845527, + "grad_norm": 0.12890625, + "learning_rate": 0.00011881324028696824, + "lm_loss": 1.9472, + "loss": 2.0868, + "mask_loss": 0.1292, + "step": 1134, + "topk_loss": 0.0103 + }, + { + "epoch": 0.4511156791265844, + "grad_norm": 0.15625, + "learning_rate": 0.00011868805281633304, + "lm_loss": 1.9391, + "loss": 2.0752, + "mask_loss": 0.1268, + "step": 1135, + "topk_loss": 0.0093 + }, + { + "epoch": 0.45151313787471353, + "grad_norm": 0.14453125, + "learning_rate": 0.00011856283499072196, + "lm_loss": 1.8981, + "loss": 2.0365, + "mask_loss": 0.1284, + "step": 1136, + "topk_loss": 0.0101 + }, + { + "epoch": 0.4519105966228427, + "grad_norm": 0.1865234375, + "learning_rate": 0.00011843758701352614, + "lm_loss": 1.8743, + "loss": 2.0243, + "mask_loss": 0.1346, + "step": 1137, + "topk_loss": 0.0154 + }, + { + "epoch": 0.45230805537097185, + "grad_norm": 0.130859375, + "learning_rate": 0.00011831230908818563, + "lm_loss": 1.9063, + "loss": 2.0504, + "mask_loss": 0.1299, + "step": 1138, + "topk_loss": 0.0143 + }, + { + "epoch": 0.452705514119101, + "grad_norm": 0.169921875, + "learning_rate": 0.00011818700141818921, + "lm_loss": 1.9056, + "loss": 2.0454, + "mask_loss": 0.1293, + "step": 1139, + "topk_loss": 0.0105 + }, + { + "epoch": 0.45310297286723017, + "grad_norm": 0.16796875, + "learning_rate": 0.00011806166420707392, + "lm_loss": 1.9614, + "loss": 2.0986, + "mask_loss": 0.1276, + "step": 1140, + "topk_loss": 0.0096 + }, + { + "epoch": 0.4535004316153593, + "grad_norm": 0.1357421875, + "learning_rate": 0.00011793629765842482, + "lm_loss": 1.9674, + "loss": 2.112, + "mask_loss": 0.1318, + "step": 1141, + "topk_loss": 0.0128 + }, + { + "epoch": 0.45389789036348843, + "grad_norm": 0.126953125, + "learning_rate": 0.00011781090197587459, + "lm_loss": 1.9271, + "loss": 2.0674, + "mask_loss": 0.1295, + "step": 1142, + "topk_loss": 0.0108 + }, + { + "epoch": 0.4542953491116176, + "grad_norm": 0.203125, + "learning_rate": 0.00011768547736310327, + "lm_loss": 1.8767, + "loss": 2.0153, + "mask_loss": 0.1282, + "step": 1143, + "topk_loss": 0.0104 + }, + { + "epoch": 0.45469280785974675, + "grad_norm": 0.1552734375, + "learning_rate": 0.00011756002402383783, + "lm_loss": 1.9228, + "loss": 2.0596, + "mask_loss": 0.1268, + "step": 1144, + "topk_loss": 0.01 + }, + { + "epoch": 0.4550902666078759, + "grad_norm": 0.13671875, + "learning_rate": 0.00011743454216185201, + "lm_loss": 1.9901, + "loss": 2.1275, + "mask_loss": 0.1275, + "step": 1145, + "topk_loss": 0.0099 + }, + { + "epoch": 0.45548772535600507, + "grad_norm": 0.16796875, + "learning_rate": 0.00011730903198096573, + "lm_loss": 1.9429, + "loss": 2.0844, + "mask_loss": 0.1297, + "step": 1146, + "topk_loss": 0.0118 + }, + { + "epoch": 0.4558851841041342, + "grad_norm": 0.1328125, + "learning_rate": 0.000117183493685045, + "lm_loss": 1.9209, + "loss": 2.0591, + "mask_loss": 0.1285, + "step": 1147, + "topk_loss": 0.0097 + }, + { + "epoch": 0.45628264285226333, + "grad_norm": 0.125, + "learning_rate": 0.00011705792747800153, + "lm_loss": 1.8713, + "loss": 2.0083, + "mask_loss": 0.1286, + "step": 1148, + "topk_loss": 0.0084 + }, + { + "epoch": 0.45668010160039246, + "grad_norm": 0.140625, + "learning_rate": 0.0001169323335637923, + "lm_loss": 1.9579, + "loss": 2.0947, + "mask_loss": 0.1269, + "step": 1149, + "topk_loss": 0.0098 + }, + { + "epoch": 0.45707756034852165, + "grad_norm": 0.14453125, + "learning_rate": 0.00011680671214641927, + "lm_loss": 1.9345, + "loss": 2.0771, + "mask_loss": 0.1301, + "step": 1150, + "topk_loss": 0.0124 + }, + { + "epoch": 0.45707756034852165, + "eval_lm_loss": 689.210693359375, + "eval_loss": 689.3509521484375, + "eval_mask_hit_rate": 0.5287200808525085, + "eval_mask_loss": 0.12717577815055847, + "eval_mask_top_10_hit_rate": 0.9842155575752258, + "eval_mask_top_1_hit_rate": 0.9971096515655518, + "eval_mask_top_20_hit_rate": 0.9742197394371033, + "eval_mask_top_5_hit_rate": 0.9899111390113831, + "eval_runtime": 144.4636, + "eval_samples_per_second": 14.177, + "eval_steps_per_second": 7.088, + "eval_token_accuracy": 0.6111284494400024, + "eval_top_k_diff": -525.6437377929688, + "eval_topk_loss": 0.013032329268753529, + "step": 1150 + }, + { + "epoch": 0.4574750190966508, + "grad_norm": 0.1396484375, + "learning_rate": 0.00011668106342992917, + "lm_loss": 1.939, + "loss": 2.0791, + "mask_loss": 0.1299, + "step": 1151, + "topk_loss": 0.0103 + }, + { + "epoch": 0.4578724778447799, + "grad_norm": 0.130859375, + "learning_rate": 0.000116555387618413, + "lm_loss": 1.9299, + "loss": 2.0699, + "mask_loss": 0.1301, + "step": 1152, + "topk_loss": 0.01 + }, + { + "epoch": 0.4582699365929091, + "grad_norm": 0.158203125, + "learning_rate": 0.00011642968491600581, + "lm_loss": 1.9045, + "loss": 2.0441, + "mask_loss": 0.128, + "step": 1153, + "topk_loss": 0.0115 + }, + { + "epoch": 0.45866739534103823, + "grad_norm": 0.1640625, + "learning_rate": 0.0001163039555268863, + "lm_loss": 1.8923, + "loss": 2.0362, + "mask_loss": 0.1296, + "step": 1154, + "topk_loss": 0.0142 + }, + { + "epoch": 0.45906485408916736, + "grad_norm": 0.1943359375, + "learning_rate": 0.0001161781996552765, + "lm_loss": 1.9283, + "loss": 2.0677, + "mask_loss": 0.1269, + "step": 1155, + "topk_loss": 0.0125 + }, + { + "epoch": 0.45946231283729655, + "grad_norm": 0.2236328125, + "learning_rate": 0.0001160524175054415, + "lm_loss": 2.0017, + "loss": 2.1393, + "mask_loss": 0.1267, + "step": 1156, + "topk_loss": 0.0109 + }, + { + "epoch": 0.4598597715854257, + "grad_norm": 0.14453125, + "learning_rate": 0.00011592660928168904, + "lm_loss": 1.9245, + "loss": 2.0614, + "mask_loss": 0.1265, + "step": 1157, + "topk_loss": 0.0105 + }, + { + "epoch": 0.4602572303335548, + "grad_norm": 0.2373046875, + "learning_rate": 0.00011580077518836927, + "lm_loss": 1.9163, + "loss": 2.0628, + "mask_loss": 0.1307, + "step": 1158, + "topk_loss": 0.0158 + }, + { + "epoch": 0.460654689081684, + "grad_norm": 0.2001953125, + "learning_rate": 0.00011567491542987427, + "lm_loss": 1.8958, + "loss": 2.0348, + "mask_loss": 0.128, + "step": 1159, + "topk_loss": 0.0111 + }, + { + "epoch": 0.46105214782981313, + "grad_norm": 0.11767578125, + "learning_rate": 0.0001155490302106379, + "lm_loss": 1.8862, + "loss": 2.0216, + "mask_loss": 0.126, + "step": 1160, + "topk_loss": 0.0094 + }, + { + "epoch": 0.46144960657794226, + "grad_norm": 0.1455078125, + "learning_rate": 0.00011542311973513534, + "lm_loss": 1.905, + "loss": 2.0475, + "mask_loss": 0.1318, + "step": 1161, + "topk_loss": 0.0107 + }, + { + "epoch": 0.46184706532607145, + "grad_norm": 0.1630859375, + "learning_rate": 0.00011529718420788269, + "lm_loss": 1.9248, + "loss": 2.0633, + "mask_loss": 0.1284, + "step": 1162, + "topk_loss": 0.0101 + }, + { + "epoch": 0.4622445240742006, + "grad_norm": 0.15234375, + "learning_rate": 0.00011517122383343692, + "lm_loss": 1.9848, + "loss": 2.1228, + "mask_loss": 0.1269, + "step": 1163, + "topk_loss": 0.0111 + }, + { + "epoch": 0.4626419828223297, + "grad_norm": 0.12255859375, + "learning_rate": 0.00011504523881639526, + "lm_loss": 1.9251, + "loss": 2.0624, + "mask_loss": 0.1272, + "step": 1164, + "topk_loss": 0.01 + }, + { + "epoch": 0.4630394415704589, + "grad_norm": 0.17578125, + "learning_rate": 0.00011491922936139499, + "lm_loss": 1.8857, + "loss": 2.0289, + "mask_loss": 0.1295, + "step": 1165, + "topk_loss": 0.0137 + }, + { + "epoch": 0.46343690031858803, + "grad_norm": 0.1552734375, + "learning_rate": 0.00011479319567311304, + "lm_loss": 1.9295, + "loss": 2.0796, + "mask_loss": 0.1347, + "step": 1166, + "topk_loss": 0.0154 + }, + { + "epoch": 0.46383435906671716, + "grad_norm": 0.138671875, + "learning_rate": 0.00011466713795626576, + "lm_loss": 1.9211, + "loss": 2.0586, + "mask_loss": 0.1266, + "step": 1167, + "topk_loss": 0.0109 + }, + { + "epoch": 0.46423181781484635, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001145410564156085, + "lm_loss": 1.9013, + "loss": 2.0452, + "mask_loss": 0.133, + "step": 1168, + "topk_loss": 0.0109 + }, + { + "epoch": 0.4646292765629755, + "grad_norm": 0.1494140625, + "learning_rate": 0.00011441495125593538, + "lm_loss": 1.9573, + "loss": 2.0961, + "mask_loss": 0.128, + "step": 1169, + "topk_loss": 0.0108 + }, + { + "epoch": 0.4650267353111046, + "grad_norm": 0.1943359375, + "learning_rate": 0.00011428882268207872, + "lm_loss": 1.9372, + "loss": 2.0773, + "mask_loss": 0.1288, + "step": 1170, + "topk_loss": 0.0114 + }, + { + "epoch": 0.4654241940592338, + "grad_norm": 0.15234375, + "learning_rate": 0.00011416267089890901, + "lm_loss": 1.8922, + "loss": 2.0304, + "mask_loss": 0.1266, + "step": 1171, + "topk_loss": 0.0116 + }, + { + "epoch": 0.46582165280736293, + "grad_norm": 0.1279296875, + "learning_rate": 0.00011403649611133444, + "lm_loss": 1.9008, + "loss": 2.0439, + "mask_loss": 0.1317, + "step": 1172, + "topk_loss": 0.0114 + }, + { + "epoch": 0.46621911155549206, + "grad_norm": 0.1806640625, + "learning_rate": 0.00011391029852430048, + "lm_loss": 1.9365, + "loss": 2.0713, + "mask_loss": 0.126, + "step": 1173, + "topk_loss": 0.0087 + }, + { + "epoch": 0.46661657030362125, + "grad_norm": 0.17578125, + "learning_rate": 0.0001137840783427897, + "lm_loss": 1.8781, + "loss": 2.0154, + "mask_loss": 0.1264, + "step": 1174, + "topk_loss": 0.0109 + }, + { + "epoch": 0.4670140290517504, + "grad_norm": 0.189453125, + "learning_rate": 0.00011365783577182132, + "lm_loss": 1.9147, + "loss": 2.0551, + "mask_loss": 0.1299, + "step": 1175, + "topk_loss": 0.0106 + }, + { + "epoch": 0.4674114877998795, + "grad_norm": 0.158203125, + "learning_rate": 0.000113531571016451, + "lm_loss": 1.9203, + "loss": 2.0579, + "mask_loss": 0.1271, + "step": 1176, + "topk_loss": 0.0106 + }, + { + "epoch": 0.4678089465480087, + "grad_norm": 0.1884765625, + "learning_rate": 0.0001134052842817704, + "lm_loss": 1.8985, + "loss": 2.0398, + "mask_loss": 0.1293, + "step": 1177, + "topk_loss": 0.012 + }, + { + "epoch": 0.46820640529613783, + "grad_norm": 0.16015625, + "learning_rate": 0.0001132789757729068, + "lm_loss": 1.9963, + "loss": 2.1341, + "mask_loss": 0.127, + "step": 1178, + "topk_loss": 0.0109 + }, + { + "epoch": 0.46860386404426696, + "grad_norm": 0.140625, + "learning_rate": 0.00011315264569502298, + "lm_loss": 1.9405, + "loss": 2.0809, + "mask_loss": 0.1279, + "step": 1179, + "topk_loss": 0.0126 + }, + { + "epoch": 0.4690013227923961, + "grad_norm": 0.1357421875, + "learning_rate": 0.00011302629425331666, + "lm_loss": 1.9246, + "loss": 2.0608, + "mask_loss": 0.1264, + "step": 1180, + "topk_loss": 0.0098 + }, + { + "epoch": 0.4693987815405253, + "grad_norm": 0.138671875, + "learning_rate": 0.00011289992165302035, + "lm_loss": 1.9812, + "loss": 2.1207, + "mask_loss": 0.1285, + "step": 1181, + "topk_loss": 0.011 + }, + { + "epoch": 0.4697962402886544, + "grad_norm": 0.12353515625, + "learning_rate": 0.00011277352809940081, + "lm_loss": 1.921, + "loss": 2.0608, + "mask_loss": 0.1283, + "step": 1182, + "topk_loss": 0.0115 + }, + { + "epoch": 0.47019369903678354, + "grad_norm": 0.2734375, + "learning_rate": 0.00011264711379775892, + "lm_loss": 1.8834, + "loss": 2.0426, + "mask_loss": 0.1373, + "step": 1183, + "topk_loss": 0.0219 + }, + { + "epoch": 0.47059115778491273, + "grad_norm": 0.203125, + "learning_rate": 0.00011252067895342923, + "lm_loss": 1.9036, + "loss": 2.0438, + "mask_loss": 0.1287, + "step": 1184, + "topk_loss": 0.0114 + }, + { + "epoch": 0.47098861653304186, + "grad_norm": 0.13671875, + "learning_rate": 0.00011239422377177973, + "lm_loss": 1.8994, + "loss": 2.0381, + "mask_loss": 0.1285, + "step": 1185, + "topk_loss": 0.0101 + }, + { + "epoch": 0.471386075281171, + "grad_norm": 0.126953125, + "learning_rate": 0.00011226774845821129, + "lm_loss": 1.8618, + "loss": 2.0016, + "mask_loss": 0.129, + "step": 1186, + "topk_loss": 0.0108 + }, + { + "epoch": 0.4717835340293002, + "grad_norm": 0.1298828125, + "learning_rate": 0.0001121412532181576, + "lm_loss": 1.9478, + "loss": 2.0871, + "mask_loss": 0.1287, + "step": 1187, + "topk_loss": 0.0106 + }, + { + "epoch": 0.4721809927774293, + "grad_norm": 0.16015625, + "learning_rate": 0.00011201473825708471, + "lm_loss": 1.7894, + "loss": 1.9291, + "mask_loss": 0.1295, + "step": 1188, + "topk_loss": 0.0102 + }, + { + "epoch": 0.47257845152555844, + "grad_norm": 0.18359375, + "learning_rate": 0.00011188820378049065, + "lm_loss": 1.9324, + "loss": 2.0723, + "mask_loss": 0.1277, + "step": 1189, + "topk_loss": 0.0121 + }, + { + "epoch": 0.4729759102736876, + "grad_norm": 0.1357421875, + "learning_rate": 0.00011176164999390522, + "lm_loss": 1.991, + "loss": 2.1278, + "mask_loss": 0.1275, + "step": 1190, + "topk_loss": 0.0092 + }, + { + "epoch": 0.47337336902181676, + "grad_norm": 0.130859375, + "learning_rate": 0.0001116350771028895, + "lm_loss": 1.9111, + "loss": 2.0493, + "mask_loss": 0.1281, + "step": 1191, + "topk_loss": 0.0101 + }, + { + "epoch": 0.4737708277699459, + "grad_norm": 0.1396484375, + "learning_rate": 0.00011150848531303567, + "lm_loss": 1.8928, + "loss": 2.0307, + "mask_loss": 0.1284, + "step": 1192, + "topk_loss": 0.0096 + }, + { + "epoch": 0.4741682865180751, + "grad_norm": 0.125, + "learning_rate": 0.00011138187482996658, + "lm_loss": 1.9357, + "loss": 2.0759, + "mask_loss": 0.1292, + "step": 1193, + "topk_loss": 0.011 + }, + { + "epoch": 0.4745657452662042, + "grad_norm": 0.1416015625, + "learning_rate": 0.00011125524585933542, + "lm_loss": 1.9198, + "loss": 2.0555, + "mask_loss": 0.1259, + "step": 1194, + "topk_loss": 0.0098 + }, + { + "epoch": 0.47496320401433334, + "grad_norm": 0.173828125, + "learning_rate": 0.00011112859860682547, + "lm_loss": 1.8896, + "loss": 2.0298, + "mask_loss": 0.1293, + "step": 1195, + "topk_loss": 0.0109 + }, + { + "epoch": 0.4753606627624625, + "grad_norm": 0.1484375, + "learning_rate": 0.00011100193327814964, + "lm_loss": 1.9365, + "loss": 2.0732, + "mask_loss": 0.1269, + "step": 1196, + "topk_loss": 0.0098 + }, + { + "epoch": 0.47575812151059166, + "grad_norm": 0.1923828125, + "learning_rate": 0.00011087525007905031, + "lm_loss": 1.9849, + "loss": 2.1198, + "mask_loss": 0.1255, + "step": 1197, + "topk_loss": 0.0094 + }, + { + "epoch": 0.4761555802587208, + "grad_norm": 0.1611328125, + "learning_rate": 0.00011074854921529869, + "lm_loss": 1.8791, + "loss": 2.0175, + "mask_loss": 0.1283, + "step": 1198, + "topk_loss": 0.01 + }, + { + "epoch": 0.47655303900685, + "grad_norm": 0.1767578125, + "learning_rate": 0.00011062183089269487, + "lm_loss": 1.8634, + "loss": 2.0048, + "mask_loss": 0.1296, + "step": 1199, + "topk_loss": 0.0118 + }, + { + "epoch": 0.4769504977549791, + "grad_norm": 0.140625, + "learning_rate": 0.0001104950953170672, + "lm_loss": 1.9092, + "loss": 2.0484, + "mask_loss": 0.1275, + "step": 1200, + "topk_loss": 0.0117 + }, + { + "epoch": 0.4769504977549791, + "eval_lm_loss": 688.0737915039062, + "eval_loss": 688.2135009765625, + "eval_mask_hit_rate": 0.5299395322799683, + "eval_mask_loss": 0.12668579816818237, + "eval_mask_top_10_hit_rate": 0.9844464659690857, + "eval_mask_top_1_hit_rate": 0.9971840381622314, + "eval_mask_top_20_hit_rate": 0.9745547771453857, + "eval_mask_top_5_hit_rate": 0.9900786876678467, + "eval_runtime": 144.1965, + "eval_samples_per_second": 14.203, + "eval_steps_per_second": 7.101, + "eval_token_accuracy": 0.611724853515625, + "eval_top_k_diff": -523.8722534179688, + "eval_topk_loss": 0.013040510006248951, + "step": 1200 + }, + { + "epoch": 0.47734795650310824, + "grad_norm": 0.130859375, + "learning_rate": 0.00011036834269427214, + "lm_loss": 1.9617, + "loss": 2.101, + "mask_loss": 0.1289, + "step": 1201, + "topk_loss": 0.0104 + }, + { + "epoch": 0.4777454152512374, + "grad_norm": 0.138671875, + "learning_rate": 0.00011024157323019373, + "lm_loss": 1.9129, + "loss": 2.0465, + "mask_loss": 0.1241, + "step": 1202, + "topk_loss": 0.0095 + }, + { + "epoch": 0.47814287399936656, + "grad_norm": 0.150390625, + "learning_rate": 0.00011011478713074343, + "lm_loss": 1.9386, + "loss": 2.0754, + "mask_loss": 0.1265, + "step": 1203, + "topk_loss": 0.0103 + }, + { + "epoch": 0.4785403327474957, + "grad_norm": 0.138671875, + "learning_rate": 0.00010998798460185971, + "lm_loss": 1.9397, + "loss": 2.0819, + "mask_loss": 0.1297, + "step": 1204, + "topk_loss": 0.0126 + }, + { + "epoch": 0.4789377914956249, + "grad_norm": 0.1376953125, + "learning_rate": 0.00010986116584950774, + "lm_loss": 1.9978, + "loss": 2.1365, + "mask_loss": 0.1284, + "step": 1205, + "topk_loss": 0.0103 + }, + { + "epoch": 0.479335250243754, + "grad_norm": 0.1298828125, + "learning_rate": 0.00010973433107967902, + "lm_loss": 1.943, + "loss": 2.0842, + "mask_loss": 0.1298, + "step": 1206, + "topk_loss": 0.0113 + }, + { + "epoch": 0.47973270899188314, + "grad_norm": 0.1259765625, + "learning_rate": 0.00010960748049839103, + "lm_loss": 1.919, + "loss": 2.0543, + "mask_loss": 0.1259, + "step": 1207, + "topk_loss": 0.0094 + }, + { + "epoch": 0.48013016774001227, + "grad_norm": 0.12109375, + "learning_rate": 0.00010948061431168701, + "lm_loss": 1.9268, + "loss": 2.0629, + "mask_loss": 0.1261, + "step": 1208, + "topk_loss": 0.0099 + }, + { + "epoch": 0.48052762648814146, + "grad_norm": 0.2314453125, + "learning_rate": 0.00010935373272563556, + "lm_loss": 1.9396, + "loss": 2.0815, + "mask_loss": 0.1301, + "step": 1209, + "topk_loss": 0.0118 + }, + { + "epoch": 0.4809250852362706, + "grad_norm": 0.12451171875, + "learning_rate": 0.00010922683594633021, + "lm_loss": 1.8694, + "loss": 2.006, + "mask_loss": 0.1261, + "step": 1210, + "topk_loss": 0.0105 + }, + { + "epoch": 0.4813225439843997, + "grad_norm": 0.1572265625, + "learning_rate": 0.00010909992417988919, + "lm_loss": 1.8895, + "loss": 2.0313, + "mask_loss": 0.1302, + "step": 1211, + "topk_loss": 0.0116 + }, + { + "epoch": 0.4817200027325289, + "grad_norm": 0.130859375, + "learning_rate": 0.00010897299763245512, + "lm_loss": 1.9207, + "loss": 2.0579, + "mask_loss": 0.1274, + "step": 1212, + "topk_loss": 0.0097 + }, + { + "epoch": 0.48211746148065804, + "grad_norm": 0.123046875, + "learning_rate": 0.00010884605651019459, + "lm_loss": 1.9332, + "loss": 2.0717, + "mask_loss": 0.1286, + "step": 1213, + "topk_loss": 0.0099 + }, + { + "epoch": 0.48251492022878717, + "grad_norm": 0.1328125, + "learning_rate": 0.00010871910101929785, + "lm_loss": 1.9124, + "loss": 2.0518, + "mask_loss": 0.1289, + "step": 1214, + "topk_loss": 0.0106 + }, + { + "epoch": 0.48291237897691636, + "grad_norm": 0.1259765625, + "learning_rate": 0.00010859213136597853, + "lm_loss": 1.9437, + "loss": 2.0799, + "mask_loss": 0.1255, + "step": 1215, + "topk_loss": 0.0107 + }, + { + "epoch": 0.4833098377250455, + "grad_norm": 0.177734375, + "learning_rate": 0.00010846514775647325, + "lm_loss": 1.9137, + "loss": 2.053, + "mask_loss": 0.1276, + "step": 1216, + "topk_loss": 0.0117 + }, + { + "epoch": 0.4837072964731746, + "grad_norm": 0.1279296875, + "learning_rate": 0.00010833815039704132, + "lm_loss": 1.9353, + "loss": 2.0706, + "mask_loss": 0.1256, + "step": 1217, + "topk_loss": 0.0096 + }, + { + "epoch": 0.4841047552213038, + "grad_norm": 0.12451171875, + "learning_rate": 0.00010821113949396428, + "lm_loss": 1.9319, + "loss": 2.0705, + "mask_loss": 0.1275, + "step": 1218, + "topk_loss": 0.0111 + }, + { + "epoch": 0.48450221396943294, + "grad_norm": 0.1669921875, + "learning_rate": 0.0001080841152535458, + "lm_loss": 1.9325, + "loss": 2.0684, + "mask_loss": 0.1266, + "step": 1219, + "topk_loss": 0.0093 + }, + { + "epoch": 0.48489967271756207, + "grad_norm": 0.1640625, + "learning_rate": 0.00010795707788211118, + "lm_loss": 1.9437, + "loss": 2.0846, + "mask_loss": 0.1302, + "step": 1220, + "topk_loss": 0.0108 + }, + { + "epoch": 0.48529713146569126, + "grad_norm": 0.169921875, + "learning_rate": 0.00010783002758600702, + "lm_loss": 1.9576, + "loss": 2.1026, + "mask_loss": 0.1297, + "step": 1221, + "topk_loss": 0.0153 + }, + { + "epoch": 0.4856945902138204, + "grad_norm": 0.1171875, + "learning_rate": 0.00010770296457160088, + "lm_loss": 1.9115, + "loss": 2.0482, + "mask_loss": 0.1265, + "step": 1222, + "topk_loss": 0.0101 + }, + { + "epoch": 0.4860920489619495, + "grad_norm": 0.16796875, + "learning_rate": 0.00010757588904528106, + "lm_loss": 1.931, + "loss": 2.0681, + "mask_loss": 0.1263, + "step": 1223, + "topk_loss": 0.0107 + }, + { + "epoch": 0.4864895077100787, + "grad_norm": 0.140625, + "learning_rate": 0.00010744880121345613, + "lm_loss": 1.9292, + "loss": 2.064, + "mask_loss": 0.1255, + "step": 1224, + "topk_loss": 0.0093 + }, + { + "epoch": 0.48688696645820784, + "grad_norm": 0.1357421875, + "learning_rate": 0.00010732170128255468, + "lm_loss": 1.9025, + "loss": 2.0443, + "mask_loss": 0.1313, + "step": 1225, + "topk_loss": 0.0105 + }, + { + "epoch": 0.48728442520633697, + "grad_norm": 0.12451171875, + "learning_rate": 0.00010719458945902492, + "lm_loss": 1.9129, + "loss": 2.0516, + "mask_loss": 0.1286, + "step": 1226, + "topk_loss": 0.0101 + }, + { + "epoch": 0.48768188395446616, + "grad_norm": 0.1279296875, + "learning_rate": 0.00010706746594933438, + "lm_loss": 1.9273, + "loss": 2.0636, + "mask_loss": 0.1254, + "step": 1227, + "topk_loss": 0.0108 + }, + { + "epoch": 0.4880793427025953, + "grad_norm": 0.1552734375, + "learning_rate": 0.00010694033095996962, + "lm_loss": 1.946, + "loss": 2.0808, + "mask_loss": 0.1253, + "step": 1228, + "topk_loss": 0.0095 + }, + { + "epoch": 0.4884768014507244, + "grad_norm": 0.126953125, + "learning_rate": 0.00010681318469743582, + "lm_loss": 1.9932, + "loss": 2.13, + "mask_loss": 0.1268, + "step": 1229, + "topk_loss": 0.01 + }, + { + "epoch": 0.4888742601988536, + "grad_norm": 0.142578125, + "learning_rate": 0.00010668602736825641, + "lm_loss": 1.9252, + "loss": 2.0662, + "mask_loss": 0.1287, + "step": 1230, + "topk_loss": 0.0123 + }, + { + "epoch": 0.48927171894698274, + "grad_norm": 0.126953125, + "learning_rate": 0.00010655885917897286, + "lm_loss": 1.9524, + "loss": 2.0886, + "mask_loss": 0.1261, + "step": 1231, + "topk_loss": 0.0101 + }, + { + "epoch": 0.48966917769511187, + "grad_norm": 0.134765625, + "learning_rate": 0.0001064316803361443, + "lm_loss": 1.935, + "loss": 2.0723, + "mask_loss": 0.1281, + "step": 1232, + "topk_loss": 0.0092 + }, + { + "epoch": 0.49006663644324105, + "grad_norm": 0.1875, + "learning_rate": 0.00010630449104634712, + "lm_loss": 1.9682, + "loss": 2.1198, + "mask_loss": 0.1327, + "step": 1233, + "topk_loss": 0.0189 + }, + { + "epoch": 0.4904640951913702, + "grad_norm": 0.1943359375, + "learning_rate": 0.00010617729151617465, + "lm_loss": 1.9365, + "loss": 2.0733, + "mask_loss": 0.1257, + "step": 1234, + "topk_loss": 0.0111 + }, + { + "epoch": 0.4908615539394993, + "grad_norm": 0.15625, + "learning_rate": 0.00010605008195223694, + "lm_loss": 1.9279, + "loss": 2.065, + "mask_loss": 0.1257, + "step": 1235, + "topk_loss": 0.0114 + }, + { + "epoch": 0.49125901268762845, + "grad_norm": 0.115234375, + "learning_rate": 0.00010592286256116027, + "lm_loss": 1.9797, + "loss": 2.1129, + "mask_loss": 0.1242, + "step": 1236, + "topk_loss": 0.0089 + }, + { + "epoch": 0.49165647143575764, + "grad_norm": 0.177734375, + "learning_rate": 0.00010579563354958692, + "lm_loss": 1.9539, + "loss": 2.0972, + "mask_loss": 0.1304, + "step": 1237, + "topk_loss": 0.0129 + }, + { + "epoch": 0.49205393018388677, + "grad_norm": 0.1298828125, + "learning_rate": 0.00010566839512417479, + "lm_loss": 1.8372, + "loss": 1.975, + "mask_loss": 0.1287, + "step": 1238, + "topk_loss": 0.0091 + }, + { + "epoch": 0.4924513889320159, + "grad_norm": 0.12890625, + "learning_rate": 0.000105541147491597, + "lm_loss": 1.9374, + "loss": 2.0735, + "mask_loss": 0.1268, + "step": 1239, + "topk_loss": 0.0093 + }, + { + "epoch": 0.4928488476801451, + "grad_norm": 0.12255859375, + "learning_rate": 0.00010541389085854176, + "lm_loss": 1.897, + "loss": 2.0366, + "mask_loss": 0.1284, + "step": 1240, + "topk_loss": 0.0113 + }, + { + "epoch": 0.4932463064282742, + "grad_norm": 0.20703125, + "learning_rate": 0.0001052866254317118, + "lm_loss": 1.9831, + "loss": 2.1197, + "mask_loss": 0.1252, + "step": 1241, + "topk_loss": 0.0113 + }, + { + "epoch": 0.49364376517640335, + "grad_norm": 0.1552734375, + "learning_rate": 0.00010515935141782414, + "lm_loss": 1.8972, + "loss": 2.0329, + "mask_loss": 0.1258, + "step": 1242, + "topk_loss": 0.0099 + }, + { + "epoch": 0.49404122392453254, + "grad_norm": 0.12353515625, + "learning_rate": 0.0001050320690236098, + "lm_loss": 1.8784, + "loss": 2.012, + "mask_loss": 0.1247, + "step": 1243, + "topk_loss": 0.0089 + }, + { + "epoch": 0.49443868267266167, + "grad_norm": 0.1611328125, + "learning_rate": 0.00010490477845581337, + "lm_loss": 1.9397, + "loss": 2.0775, + "mask_loss": 0.127, + "step": 1244, + "topk_loss": 0.0108 + }, + { + "epoch": 0.4948361414207908, + "grad_norm": 0.185546875, + "learning_rate": 0.00010477747992119273, + "lm_loss": 1.8701, + "loss": 2.0037, + "mask_loss": 0.1244, + "step": 1245, + "topk_loss": 0.0092 + }, + { + "epoch": 0.49523360016892, + "grad_norm": 0.177734375, + "learning_rate": 0.00010465017362651868, + "lm_loss": 1.9222, + "loss": 2.0605, + "mask_loss": 0.1279, + "step": 1246, + "topk_loss": 0.0103 + }, + { + "epoch": 0.4956310589170491, + "grad_norm": 0.1826171875, + "learning_rate": 0.00010452285977857463, + "lm_loss": 1.9269, + "loss": 2.0687, + "mask_loss": 0.1297, + "step": 1247, + "topk_loss": 0.0122 + }, + { + "epoch": 0.49602851766517825, + "grad_norm": 0.1240234375, + "learning_rate": 0.0001043955385841563, + "lm_loss": 1.9156, + "loss": 2.0507, + "mask_loss": 0.1259, + "step": 1248, + "topk_loss": 0.0092 + }, + { + "epoch": 0.49642597641330743, + "grad_norm": 0.1748046875, + "learning_rate": 0.00010426821025007134, + "lm_loss": 1.962, + "loss": 2.0986, + "mask_loss": 0.126, + "step": 1249, + "topk_loss": 0.0106 + }, + { + "epoch": 0.49682343516143657, + "grad_norm": 0.197265625, + "learning_rate": 0.0001041408749831389, + "lm_loss": 1.927, + "loss": 2.0636, + "mask_loss": 0.1273, + "step": 1250, + "topk_loss": 0.0093 + }, + { + "epoch": 0.49682343516143657, + "eval_lm_loss": 689.285888671875, + "eval_loss": 689.4251708984375, + "eval_mask_hit_rate": 0.530980110168457, + "eval_mask_loss": 0.126395121216774, + "eval_mask_top_10_hit_rate": 0.9846255779266357, + "eval_mask_top_1_hit_rate": 0.997197151184082, + "eval_mask_top_20_hit_rate": 0.9748088717460632, + "eval_mask_top_5_hit_rate": 0.9901968240737915, + "eval_runtime": 144.3204, + "eval_samples_per_second": 14.191, + "eval_steps_per_second": 7.095, + "eval_token_accuracy": 0.6121336221694946, + "eval_top_k_diff": -530.7422485351562, + "eval_topk_loss": 0.012827267870306969, + "step": 1250 + }, + { + "epoch": 0.4972208939095657, + "grad_norm": 0.16796875, + "learning_rate": 0.0001040135329901895, + "lm_loss": 1.984, + "loss": 2.1308, + "mask_loss": 0.131, + "step": 1251, + "topk_loss": 0.0158 + }, + { + "epoch": 0.4976183526576949, + "grad_norm": 0.1220703125, + "learning_rate": 0.00010388618447806455, + "lm_loss": 1.903, + "loss": 2.0404, + "mask_loss": 0.1279, + "step": 1252, + "topk_loss": 0.0095 + }, + { + "epoch": 0.498015811405824, + "grad_norm": 0.1162109375, + "learning_rate": 0.00010375882965361605, + "lm_loss": 1.913, + "loss": 2.0495, + "mask_loss": 0.1271, + "step": 1253, + "topk_loss": 0.0094 + }, + { + "epoch": 0.49841327015395315, + "grad_norm": 0.134765625, + "learning_rate": 0.00010363146872370622, + "lm_loss": 1.9371, + "loss": 2.076, + "mask_loss": 0.1283, + "step": 1254, + "topk_loss": 0.0106 + }, + { + "epoch": 0.49881072890208233, + "grad_norm": 0.134765625, + "learning_rate": 0.00010350410189520723, + "lm_loss": 1.9335, + "loss": 2.072, + "mask_loss": 0.1289, + "step": 1255, + "topk_loss": 0.0096 + }, + { + "epoch": 0.49920818765021147, + "grad_norm": 0.1220703125, + "learning_rate": 0.00010337672937500085, + "lm_loss": 1.9149, + "loss": 2.0508, + "mask_loss": 0.1261, + "step": 1256, + "topk_loss": 0.0099 + }, + { + "epoch": 0.4996056463983406, + "grad_norm": 0.13671875, + "learning_rate": 0.00010324935136997806, + "lm_loss": 1.9897, + "loss": 2.1257, + "mask_loss": 0.126, + "step": 1257, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5000031051464697, + "grad_norm": 0.1220703125, + "learning_rate": 0.00010312196808703876, + "lm_loss": 1.9753, + "loss": 2.1177, + "mask_loss": 0.13, + "step": 1258, + "topk_loss": 0.0124 + }, + { + "epoch": 0.5004005638945989, + "grad_norm": 0.12451171875, + "learning_rate": 0.00010299457973309142, + "lm_loss": 1.9811, + "loss": 2.1195, + "mask_loss": 0.128, + "step": 1259, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5007980226427281, + "grad_norm": 0.1259765625, + "learning_rate": 0.00010286718651505275, + "lm_loss": 1.9495, + "loss": 2.0852, + "mask_loss": 0.1253, + "step": 1260, + "topk_loss": 0.0103 + }, + { + "epoch": 0.5011954813908572, + "grad_norm": 0.1259765625, + "learning_rate": 0.00010273978863984742, + "lm_loss": 1.9871, + "loss": 2.1249, + "mask_loss": 0.1265, + "step": 1261, + "topk_loss": 0.0113 + }, + { + "epoch": 0.5015929401389864, + "grad_norm": 0.1318359375, + "learning_rate": 0.00010261238631440748, + "lm_loss": 1.8924, + "loss": 2.0294, + "mask_loss": 0.1266, + "step": 1262, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5019903988871155, + "grad_norm": 0.11962890625, + "learning_rate": 0.00010248497974567244, + "lm_loss": 1.885, + "loss": 2.0211, + "mask_loss": 0.1253, + "step": 1263, + "topk_loss": 0.0107 + }, + { + "epoch": 0.5023878576352446, + "grad_norm": 0.1455078125, + "learning_rate": 0.00010235756914058856, + "lm_loss": 1.9563, + "loss": 2.0923, + "mask_loss": 0.126, + "step": 1264, + "topk_loss": 0.01 + }, + { + "epoch": 0.5027853163833738, + "grad_norm": 0.12109375, + "learning_rate": 0.00010223015470610871, + "lm_loss": 1.8781, + "loss": 2.0166, + "mask_loss": 0.1285, + "step": 1265, + "topk_loss": 0.01 + }, + { + "epoch": 0.503182775131503, + "grad_norm": 0.1220703125, + "learning_rate": 0.00010210273664919191, + "lm_loss": 1.893, + "loss": 2.03, + "mask_loss": 0.1268, + "step": 1266, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5035802338796321, + "grad_norm": 0.1640625, + "learning_rate": 0.00010197531517680319, + "lm_loss": 1.9048, + "loss": 2.0391, + "mask_loss": 0.1253, + "step": 1267, + "topk_loss": 0.009 + }, + { + "epoch": 0.5039776926277613, + "grad_norm": 0.15234375, + "learning_rate": 0.00010184789049591299, + "lm_loss": 1.9125, + "loss": 2.0491, + "mask_loss": 0.1248, + "step": 1268, + "topk_loss": 0.0118 + }, + { + "epoch": 0.5043751513758904, + "grad_norm": 0.12451171875, + "learning_rate": 0.0001017204628134971, + "lm_loss": 1.9724, + "loss": 2.1081, + "mask_loss": 0.1257, + "step": 1269, + "topk_loss": 0.01 + }, + { + "epoch": 0.5047726101240195, + "grad_norm": 0.1533203125, + "learning_rate": 0.00010159303233653604, + "lm_loss": 1.9138, + "loss": 2.0603, + "mask_loss": 0.1319, + "step": 1270, + "topk_loss": 0.0147 + }, + { + "epoch": 0.5051700688721487, + "grad_norm": 0.1240234375, + "learning_rate": 0.00010146559927201495, + "lm_loss": 1.9232, + "loss": 2.0605, + "mask_loss": 0.1272, + "step": 1271, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5055675276202779, + "grad_norm": 0.12060546875, + "learning_rate": 0.0001013381638269232, + "lm_loss": 1.9513, + "loss": 2.0924, + "mask_loss": 0.1277, + "step": 1272, + "topk_loss": 0.0134 + }, + { + "epoch": 0.505964986368407, + "grad_norm": 0.1259765625, + "learning_rate": 0.00010121072620825397, + "lm_loss": 1.9005, + "loss": 2.0375, + "mask_loss": 0.1265, + "step": 1273, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5063624451165362, + "grad_norm": 0.1416015625, + "learning_rate": 0.000101083286623004, + "lm_loss": 1.9435, + "loss": 2.0812, + "mask_loss": 0.129, + "step": 1274, + "topk_loss": 0.0087 + }, + { + "epoch": 0.5067599038646653, + "grad_norm": 0.12109375, + "learning_rate": 0.00010095584527817319, + "lm_loss": 1.8898, + "loss": 2.0306, + "mask_loss": 0.1292, + "step": 1275, + "topk_loss": 0.0116 + }, + { + "epoch": 0.5071573626127944, + "grad_norm": 0.130859375, + "learning_rate": 0.00010082840238076436, + "lm_loss": 1.8348, + "loss": 1.971, + "mask_loss": 0.1276, + "step": 1276, + "topk_loss": 0.0087 + }, + { + "epoch": 0.5075548213609236, + "grad_norm": 0.173828125, + "learning_rate": 0.00010070095813778281, + "lm_loss": 1.9219, + "loss": 2.0633, + "mask_loss": 0.1317, + "step": 1277, + "topk_loss": 0.0098 + }, + { + "epoch": 0.5079522801090527, + "grad_norm": 0.1298828125, + "learning_rate": 0.000100573512756236, + "lm_loss": 2.0069, + "loss": 2.1456, + "mask_loss": 0.1267, + "step": 1278, + "topk_loss": 0.012 + }, + { + "epoch": 0.5083497388571819, + "grad_norm": 0.11962890625, + "learning_rate": 0.0001004460664431333, + "lm_loss": 1.8486, + "loss": 1.9884, + "mask_loss": 0.1294, + "step": 1279, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5087471976053111, + "grad_norm": 0.12255859375, + "learning_rate": 0.00010031861940548555, + "lm_loss": 1.9759, + "loss": 2.1135, + "mask_loss": 0.1276, + "step": 1280, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5091446563534402, + "grad_norm": 0.12451171875, + "learning_rate": 0.00010019117185030478, + "lm_loss": 1.9389, + "loss": 2.0731, + "mask_loss": 0.1249, + "step": 1281, + "topk_loss": 0.0092 + }, + { + "epoch": 0.5095421151015693, + "grad_norm": 0.11962890625, + "learning_rate": 0.00010006372398460387, + "lm_loss": 1.9281, + "loss": 2.0706, + "mask_loss": 0.1299, + "step": 1282, + "topk_loss": 0.0126 + }, + { + "epoch": 0.5099395738496985, + "grad_norm": 0.1220703125, + "learning_rate": 9.993627601539617e-05, + "lm_loss": 1.9432, + "loss": 2.0786, + "mask_loss": 0.1259, + "step": 1283, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5103370325978276, + "grad_norm": 0.1328125, + "learning_rate": 9.980882814969524e-05, + "lm_loss": 1.9283, + "loss": 2.0659, + "mask_loss": 0.1277, + "step": 1284, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5107344913459568, + "grad_norm": 0.1162109375, + "learning_rate": 9.968138059451446e-05, + "lm_loss": 1.9759, + "loss": 2.1111, + "mask_loss": 0.1254, + "step": 1285, + "topk_loss": 0.0098 + }, + { + "epoch": 0.511131950094086, + "grad_norm": 0.11279296875, + "learning_rate": 9.955393355686671e-05, + "lm_loss": 1.8777, + "loss": 2.0146, + "mask_loss": 0.126, + "step": 1286, + "topk_loss": 0.0109 + }, + { + "epoch": 0.5115294088422151, + "grad_norm": 0.1591796875, + "learning_rate": 9.942648724376403e-05, + "lm_loss": 1.9045, + "loss": 2.0417, + "mask_loss": 0.1267, + "step": 1287, + "topk_loss": 0.0105 + }, + { + "epoch": 0.5119268675903442, + "grad_norm": 0.119140625, + "learning_rate": 9.929904186221722e-05, + "lm_loss": 1.9832, + "loss": 2.1178, + "mask_loss": 0.1253, + "step": 1288, + "topk_loss": 0.0093 + }, + { + "epoch": 0.5123243263384734, + "grad_norm": 0.125, + "learning_rate": 9.917159761923566e-05, + "lm_loss": 2.0023, + "loss": 2.139, + "mask_loss": 0.126, + "step": 1289, + "topk_loss": 0.0107 + }, + { + "epoch": 0.5127217850866025, + "grad_norm": 0.12451171875, + "learning_rate": 9.904415472182682e-05, + "lm_loss": 1.9215, + "loss": 2.0587, + "mask_loss": 0.1279, + "step": 1290, + "topk_loss": 0.0092 + }, + { + "epoch": 0.5131192438347317, + "grad_norm": 0.1220703125, + "learning_rate": 9.891671337699602e-05, + "lm_loss": 1.8964, + "loss": 2.0349, + "mask_loss": 0.1294, + "step": 1291, + "topk_loss": 0.0091 + }, + { + "epoch": 0.5135167025828609, + "grad_norm": 0.12451171875, + "learning_rate": 9.878927379174605e-05, + "lm_loss": 1.9059, + "loss": 2.0463, + "mask_loss": 0.1285, + "step": 1292, + "topk_loss": 0.0118 + }, + { + "epoch": 0.51391416133099, + "grad_norm": 0.1572265625, + "learning_rate": 9.866183617307682e-05, + "lm_loss": 1.9395, + "loss": 2.0755, + "mask_loss": 0.1266, + "step": 1293, + "topk_loss": 0.0095 + }, + { + "epoch": 0.5143116200791191, + "grad_norm": 0.140625, + "learning_rate": 9.853440072798507e-05, + "lm_loss": 1.9447, + "loss": 2.0812, + "mask_loss": 0.1273, + "step": 1294, + "topk_loss": 0.0092 + }, + { + "epoch": 0.5147090788272483, + "grad_norm": 0.1279296875, + "learning_rate": 9.840696766346401e-05, + "lm_loss": 1.9092, + "loss": 2.0484, + "mask_loss": 0.1283, + "step": 1295, + "topk_loss": 0.0109 + }, + { + "epoch": 0.5151065375753774, + "grad_norm": 0.1259765625, + "learning_rate": 9.827953718650295e-05, + "lm_loss": 1.8731, + "loss": 2.0116, + "mask_loss": 0.1287, + "step": 1296, + "topk_loss": 0.0098 + }, + { + "epoch": 0.5155039963235066, + "grad_norm": 0.123046875, + "learning_rate": 9.815210950408704e-05, + "lm_loss": 1.9524, + "loss": 2.0892, + "mask_loss": 0.1272, + "step": 1297, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5159014550716358, + "grad_norm": 0.125, + "learning_rate": 9.802468482319683e-05, + "lm_loss": 1.8773, + "loss": 2.0144, + "mask_loss": 0.127, + "step": 1298, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5162989138197649, + "grad_norm": 0.150390625, + "learning_rate": 9.78972633508081e-05, + "lm_loss": 1.9349, + "loss": 2.0831, + "mask_loss": 0.1314, + "step": 1299, + "topk_loss": 0.0167 + }, + { + "epoch": 0.516696372567894, + "grad_norm": 0.15234375, + "learning_rate": 9.776984529389131e-05, + "lm_loss": 1.9471, + "loss": 2.0846, + "mask_loss": 0.1275, + "step": 1300, + "topk_loss": 0.01 + }, + { + "epoch": 0.516696372567894, + "eval_lm_loss": 688.7545166015625, + "eval_loss": 688.8933715820312, + "eval_mask_hit_rate": 0.5318499803543091, + "eval_mask_loss": 0.12599800527095795, + "eval_mask_top_10_hit_rate": 0.98480224609375, + "eval_mask_top_1_hit_rate": 0.9972474575042725, + "eval_mask_top_20_hit_rate": 0.9750704765319824, + "eval_mask_top_5_hit_rate": 0.9903174638748169, + "eval_runtime": 144.2764, + "eval_samples_per_second": 14.195, + "eval_steps_per_second": 7.097, + "eval_token_accuracy": 0.6126667857170105, + "eval_top_k_diff": -528.0156860351562, + "eval_topk_loss": 0.012844683602452278, + "step": 1300 + }, + { + "epoch": 0.5170938313160232, + "grad_norm": 0.1171875, + "learning_rate": 9.764243085941145e-05, + "lm_loss": 1.8988, + "loss": 2.0387, + "mask_loss": 0.1302, + "step": 1301, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5174912900641523, + "grad_norm": 0.11669921875, + "learning_rate": 9.751502025432756e-05, + "lm_loss": 1.916, + "loss": 2.0535, + "mask_loss": 0.1283, + "step": 1302, + "topk_loss": 0.0091 + }, + { + "epoch": 0.5178887488122815, + "grad_norm": 0.11279296875, + "learning_rate": 9.738761368559256e-05, + "lm_loss": 1.859, + "loss": 1.9954, + "mask_loss": 0.127, + "step": 1303, + "topk_loss": 0.0095 + }, + { + "epoch": 0.5182862075604107, + "grad_norm": 0.11865234375, + "learning_rate": 9.726021136015265e-05, + "lm_loss": 1.9261, + "loss": 2.0612, + "mask_loss": 0.1262, + "step": 1304, + "topk_loss": 0.0088 + }, + { + "epoch": 0.5186836663085398, + "grad_norm": 0.15234375, + "learning_rate": 9.713281348494726e-05, + "lm_loss": 1.938, + "loss": 2.0769, + "mask_loss": 0.1278, + "step": 1305, + "topk_loss": 0.0111 + }, + { + "epoch": 0.5190811250566689, + "grad_norm": 0.115234375, + "learning_rate": 9.700542026690859e-05, + "lm_loss": 1.894, + "loss": 2.0337, + "mask_loss": 0.1293, + "step": 1306, + "topk_loss": 0.0104 + }, + { + "epoch": 0.519478583804798, + "grad_norm": 0.11669921875, + "learning_rate": 9.687803191296126e-05, + "lm_loss": 1.8523, + "loss": 1.9875, + "mask_loss": 0.1264, + "step": 1307, + "topk_loss": 0.0088 + }, + { + "epoch": 0.5198760425529272, + "grad_norm": 0.1875, + "learning_rate": 9.675064863002196e-05, + "lm_loss": 1.9044, + "loss": 2.0509, + "mask_loss": 0.1314, + "step": 1308, + "topk_loss": 0.0151 + }, + { + "epoch": 0.5202735013010563, + "grad_norm": 0.12158203125, + "learning_rate": 9.662327062499918e-05, + "lm_loss": 1.9658, + "loss": 2.1023, + "mask_loss": 0.1264, + "step": 1309, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5206709600491856, + "grad_norm": 0.1201171875, + "learning_rate": 9.64958981047928e-05, + "lm_loss": 1.9269, + "loss": 2.0627, + "mask_loss": 0.1263, + "step": 1310, + "topk_loss": 0.0095 + }, + { + "epoch": 0.5210684187973147, + "grad_norm": 0.134765625, + "learning_rate": 9.636853127629383e-05, + "lm_loss": 1.9253, + "loss": 2.0637, + "mask_loss": 0.1285, + "step": 1311, + "topk_loss": 0.01 + }, + { + "epoch": 0.5214658775454438, + "grad_norm": 0.12890625, + "learning_rate": 9.6241170346384e-05, + "lm_loss": 1.9359, + "loss": 2.0784, + "mask_loss": 0.13, + "step": 1312, + "topk_loss": 0.0124 + }, + { + "epoch": 0.521863336293573, + "grad_norm": 0.11865234375, + "learning_rate": 9.611381552193548e-05, + "lm_loss": 1.8707, + "loss": 2.0083, + "mask_loss": 0.1269, + "step": 1313, + "topk_loss": 0.0107 + }, + { + "epoch": 0.5222607950417021, + "grad_norm": 0.1552734375, + "learning_rate": 9.598646700981051e-05, + "lm_loss": 1.9509, + "loss": 2.1019, + "mask_loss": 0.1345, + "step": 1314, + "topk_loss": 0.0164 + }, + { + "epoch": 0.5226582537898312, + "grad_norm": 0.138671875, + "learning_rate": 9.585912501686111e-05, + "lm_loss": 1.8712, + "loss": 2.0107, + "mask_loss": 0.1292, + "step": 1315, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5230557125379605, + "grad_norm": 0.1552734375, + "learning_rate": 9.57317897499287e-05, + "lm_loss": 1.9163, + "loss": 2.0533, + "mask_loss": 0.1266, + "step": 1316, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5234531712860896, + "grad_norm": 0.1162109375, + "learning_rate": 9.56044614158437e-05, + "lm_loss": 1.9554, + "loss": 2.0914, + "mask_loss": 0.1266, + "step": 1317, + "topk_loss": 0.0094 + }, + { + "epoch": 0.5238506300342187, + "grad_norm": 0.1220703125, + "learning_rate": 9.547714022142537e-05, + "lm_loss": 1.9215, + "loss": 2.0598, + "mask_loss": 0.1283, + "step": 1318, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5242480887823479, + "grad_norm": 0.12451171875, + "learning_rate": 9.534982637348137e-05, + "lm_loss": 1.8686, + "loss": 2.0076, + "mask_loss": 0.1275, + "step": 1319, + "topk_loss": 0.0115 + }, + { + "epoch": 0.524645547530477, + "grad_norm": 0.1201171875, + "learning_rate": 9.522252007880732e-05, + "lm_loss": 1.8746, + "loss": 2.0121, + "mask_loss": 0.1273, + "step": 1320, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5250430062786061, + "grad_norm": 0.1328125, + "learning_rate": 9.509522154418667e-05, + "lm_loss": 1.9392, + "loss": 2.0784, + "mask_loss": 0.1276, + "step": 1321, + "topk_loss": 0.0116 + }, + { + "epoch": 0.5254404650267354, + "grad_norm": 0.12255859375, + "learning_rate": 9.496793097639022e-05, + "lm_loss": 1.9044, + "loss": 2.0384, + "mask_loss": 0.1251, + "step": 1322, + "topk_loss": 0.0089 + }, + { + "epoch": 0.5258379237748645, + "grad_norm": 0.12353515625, + "learning_rate": 9.484064858217587e-05, + "lm_loss": 1.8868, + "loss": 2.0242, + "mask_loss": 0.1276, + "step": 1323, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5262353825229936, + "grad_norm": 0.2001953125, + "learning_rate": 9.471337456828822e-05, + "lm_loss": 1.8827, + "loss": 2.0208, + "mask_loss": 0.1269, + "step": 1324, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5266328412711228, + "grad_norm": 0.12060546875, + "learning_rate": 9.458610914145826e-05, + "lm_loss": 1.8432, + "loss": 1.9847, + "mask_loss": 0.1299, + "step": 1325, + "topk_loss": 0.0115 + }, + { + "epoch": 0.5270303000192519, + "grad_norm": 0.11865234375, + "learning_rate": 9.4458852508403e-05, + "lm_loss": 1.9125, + "loss": 2.0457, + "mask_loss": 0.1245, + "step": 1326, + "topk_loss": 0.0087 + }, + { + "epoch": 0.527427758767381, + "grad_norm": 0.1552734375, + "learning_rate": 9.433160487582526e-05, + "lm_loss": 1.8939, + "loss": 2.0335, + "mask_loss": 0.1291, + "step": 1327, + "topk_loss": 0.0105 + }, + { + "epoch": 0.5278252175155103, + "grad_norm": 0.1474609375, + "learning_rate": 9.420436645041311e-05, + "lm_loss": 1.9032, + "loss": 2.0403, + "mask_loss": 0.1259, + "step": 1328, + "topk_loss": 0.0111 + }, + { + "epoch": 0.5282226762636394, + "grad_norm": 0.12890625, + "learning_rate": 9.407713743883976e-05, + "lm_loss": 1.9396, + "loss": 2.0754, + "mask_loss": 0.1262, + "step": 1329, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5286201350117685, + "grad_norm": 0.12109375, + "learning_rate": 9.394991804776309e-05, + "lm_loss": 1.9457, + "loss": 2.0857, + "mask_loss": 0.1294, + "step": 1330, + "topk_loss": 0.0106 + }, + { + "epoch": 0.5290175937598977, + "grad_norm": 0.1689453125, + "learning_rate": 9.382270848382537e-05, + "lm_loss": 1.9432, + "loss": 2.0787, + "mask_loss": 0.1254, + "step": 1331, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5294150525080268, + "grad_norm": 0.1904296875, + "learning_rate": 9.369550895365291e-05, + "lm_loss": 1.8668, + "loss": 2.004, + "mask_loss": 0.1281, + "step": 1332, + "topk_loss": 0.0091 + }, + { + "epoch": 0.5298125112561559, + "grad_norm": 0.181640625, + "learning_rate": 9.356831966385571e-05, + "lm_loss": 1.9468, + "loss": 2.0846, + "mask_loss": 0.1272, + "step": 1333, + "topk_loss": 0.0106 + }, + { + "epoch": 0.530209970004285, + "grad_norm": 0.1337890625, + "learning_rate": 9.344114082102712e-05, + "lm_loss": 1.9696, + "loss": 2.1073, + "mask_loss": 0.1265, + "step": 1334, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5306074287524143, + "grad_norm": 0.21484375, + "learning_rate": 9.331397263174364e-05, + "lm_loss": 1.8924, + "loss": 2.0298, + "mask_loss": 0.1271, + "step": 1335, + "topk_loss": 0.0103 + }, + { + "epoch": 0.5310048875005434, + "grad_norm": 0.27734375, + "learning_rate": 9.318681530256423e-05, + "lm_loss": 1.9154, + "loss": 2.0501, + "mask_loss": 0.125, + "step": 1336, + "topk_loss": 0.0098 + }, + { + "epoch": 0.5314023462486726, + "grad_norm": 0.1337890625, + "learning_rate": 9.30596690400304e-05, + "lm_loss": 1.9739, + "loss": 2.1086, + "mask_loss": 0.1259, + "step": 1337, + "topk_loss": 0.0089 + }, + { + "epoch": 0.5317998049968017, + "grad_norm": 0.12451171875, + "learning_rate": 9.293253405066563e-05, + "lm_loss": 1.9695, + "loss": 2.1053, + "mask_loss": 0.127, + "step": 1338, + "topk_loss": 0.0088 + }, + { + "epoch": 0.5321972637449308, + "grad_norm": 0.1826171875, + "learning_rate": 9.28054105409751e-05, + "lm_loss": 1.9149, + "loss": 2.0536, + "mask_loss": 0.1274, + "step": 1339, + "topk_loss": 0.0113 + }, + { + "epoch": 0.53259472249306, + "grad_norm": 0.154296875, + "learning_rate": 9.267829871744536e-05, + "lm_loss": 1.8955, + "loss": 2.0342, + "mask_loss": 0.1282, + "step": 1340, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5329921812411892, + "grad_norm": 0.1787109375, + "learning_rate": 9.25511987865439e-05, + "lm_loss": 1.9207, + "loss": 2.056, + "mask_loss": 0.1264, + "step": 1341, + "topk_loss": 0.0089 + }, + { + "epoch": 0.5333896399893183, + "grad_norm": 0.1259765625, + "learning_rate": 9.242411095471897e-05, + "lm_loss": 1.9638, + "loss": 2.1008, + "mask_loss": 0.1277, + "step": 1342, + "topk_loss": 0.0093 + }, + { + "epoch": 0.5337870987374475, + "grad_norm": 0.1220703125, + "learning_rate": 9.229703542839917e-05, + "lm_loss": 1.9143, + "loss": 2.0539, + "mask_loss": 0.1284, + "step": 1343, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5341845574855766, + "grad_norm": 0.166015625, + "learning_rate": 9.216997241399303e-05, + "lm_loss": 1.8686, + "loss": 2.0033, + "mask_loss": 0.125, + "step": 1344, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5345820162337057, + "grad_norm": 0.1376953125, + "learning_rate": 9.204292211788884e-05, + "lm_loss": 1.8998, + "loss": 2.0334, + "mask_loss": 0.1255, + "step": 1345, + "topk_loss": 0.0081 + }, + { + "epoch": 0.5349794749818348, + "grad_norm": 0.12890625, + "learning_rate": 9.19158847464542e-05, + "lm_loss": 1.8785, + "loss": 2.0162, + "mask_loss": 0.1277, + "step": 1346, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5353769337299641, + "grad_norm": 0.150390625, + "learning_rate": 9.178886050603574e-05, + "lm_loss": 1.9795, + "loss": 2.1147, + "mask_loss": 0.1253, + "step": 1347, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5357743924780932, + "grad_norm": 0.12158203125, + "learning_rate": 9.166184960295872e-05, + "lm_loss": 1.9062, + "loss": 2.0437, + "mask_loss": 0.1265, + "step": 1348, + "topk_loss": 0.011 + }, + { + "epoch": 0.5361718512262224, + "grad_norm": 0.1435546875, + "learning_rate": 9.153485224352675e-05, + "lm_loss": 1.8742, + "loss": 2.0098, + "mask_loss": 0.1269, + "step": 1349, + "topk_loss": 0.0087 + }, + { + "epoch": 0.5365693099743515, + "grad_norm": 0.12255859375, + "learning_rate": 9.140786863402147e-05, + "lm_loss": 1.8814, + "loss": 2.017, + "mask_loss": 0.126, + "step": 1350, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5365693099743515, + "eval_lm_loss": 688.3827514648438, + "eval_loss": 688.5211791992188, + "eval_mask_hit_rate": 0.5326009392738342, + "eval_mask_loss": 0.12567125260829926, + "eval_mask_top_10_hit_rate": 0.9849717020988464, + "eval_mask_top_1_hit_rate": 0.9972808361053467, + "eval_mask_top_20_hit_rate": 0.9753085374832153, + "eval_mask_top_5_hit_rate": 0.990444004535675, + "eval_runtime": 143.7784, + "eval_samples_per_second": 14.244, + "eval_steps_per_second": 7.122, + "eval_token_accuracy": 0.6129680275917053, + "eval_top_k_diff": -529.1483764648438, + "eval_topk_loss": 0.012769084423780441, + "step": 1350 + }, + { + "epoch": 0.5369667687224806, + "grad_norm": 0.1943359375, + "learning_rate": 9.12808989807022e-05, + "lm_loss": 1.9485, + "loss": 2.0909, + "mask_loss": 0.1276, + "step": 1351, + "topk_loss": 0.0148 + }, + { + "epoch": 0.5373642274706097, + "grad_norm": 0.11083984375, + "learning_rate": 9.115394348980546e-05, + "lm_loss": 1.8064, + "loss": 1.9452, + "mask_loss": 0.1287, + "step": 1352, + "topk_loss": 0.0101 + }, + { + "epoch": 0.537761686218739, + "grad_norm": 0.11865234375, + "learning_rate": 9.102700236754492e-05, + "lm_loss": 1.9177, + "loss": 2.0579, + "mask_loss": 0.1282, + "step": 1353, + "topk_loss": 0.0119 + }, + { + "epoch": 0.5381591449668681, + "grad_norm": 0.1484375, + "learning_rate": 9.090007582011082e-05, + "lm_loss": 1.8761, + "loss": 2.0117, + "mask_loss": 0.1252, + "step": 1354, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5385566037149973, + "grad_norm": 0.1298828125, + "learning_rate": 9.077316405366981e-05, + "lm_loss": 1.9223, + "loss": 2.0624, + "mask_loss": 0.1284, + "step": 1355, + "topk_loss": 0.0116 + }, + { + "epoch": 0.5389540624631264, + "grad_norm": 0.150390625, + "learning_rate": 9.064626727436445e-05, + "lm_loss": 1.9325, + "loss": 2.0709, + "mask_loss": 0.128, + "step": 1356, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5393515212112555, + "grad_norm": 0.11865234375, + "learning_rate": 9.051938568831298e-05, + "lm_loss": 1.9355, + "loss": 2.0713, + "mask_loss": 0.1257, + "step": 1357, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5397489799593846, + "grad_norm": 0.1220703125, + "learning_rate": 9.039251950160899e-05, + "lm_loss": 1.8959, + "loss": 2.0302, + "mask_loss": 0.1255, + "step": 1358, + "topk_loss": 0.0088 + }, + { + "epoch": 0.5401464387075139, + "grad_norm": 0.1220703125, + "learning_rate": 9.026566892032105e-05, + "lm_loss": 1.9949, + "loss": 2.1293, + "mask_loss": 0.1238, + "step": 1359, + "topk_loss": 0.0106 + }, + { + "epoch": 0.540543897455643, + "grad_norm": 0.16796875, + "learning_rate": 9.01388341504923e-05, + "lm_loss": 1.884, + "loss": 2.0216, + "mask_loss": 0.1278, + "step": 1360, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5409413562037722, + "grad_norm": 0.125, + "learning_rate": 9.001201539814031e-05, + "lm_loss": 1.865, + "loss": 2.0017, + "mask_loss": 0.1267, + "step": 1361, + "topk_loss": 0.01 + }, + { + "epoch": 0.5413388149519013, + "grad_norm": 0.12158203125, + "learning_rate": 8.98852128692566e-05, + "lm_loss": 1.9142, + "loss": 2.0569, + "mask_loss": 0.1315, + "step": 1362, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5417362737000304, + "grad_norm": 0.1318359375, + "learning_rate": 8.975842676980629e-05, + "lm_loss": 1.9219, + "loss": 2.0595, + "mask_loss": 0.1277, + "step": 1363, + "topk_loss": 0.01 + }, + { + "epoch": 0.5421337324481595, + "grad_norm": 0.1552734375, + "learning_rate": 8.963165730572787e-05, + "lm_loss": 1.9269, + "loss": 2.0639, + "mask_loss": 0.1272, + "step": 1364, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5425311911962887, + "grad_norm": 0.166015625, + "learning_rate": 8.950490468293279e-05, + "lm_loss": 1.9045, + "loss": 2.0434, + "mask_loss": 0.1278, + "step": 1365, + "topk_loss": 0.0111 + }, + { + "epoch": 0.5429286499444179, + "grad_norm": 0.1279296875, + "learning_rate": 8.937816910730513e-05, + "lm_loss": 1.9077, + "loss": 2.0504, + "mask_loss": 0.1304, + "step": 1366, + "topk_loss": 0.0123 + }, + { + "epoch": 0.543326108692547, + "grad_norm": 0.1220703125, + "learning_rate": 8.925145078470135e-05, + "lm_loss": 1.8535, + "loss": 1.9929, + "mask_loss": 0.1286, + "step": 1367, + "topk_loss": 0.0107 + }, + { + "epoch": 0.5437235674406762, + "grad_norm": 0.1455078125, + "learning_rate": 8.912474992094974e-05, + "lm_loss": 1.9703, + "loss": 2.1097, + "mask_loss": 0.1262, + "step": 1368, + "topk_loss": 0.0132 + }, + { + "epoch": 0.5441210261888053, + "grad_norm": 0.197265625, + "learning_rate": 8.899806672185037e-05, + "lm_loss": 1.9328, + "loss": 2.0695, + "mask_loss": 0.1276, + "step": 1369, + "topk_loss": 0.0091 + }, + { + "epoch": 0.5445184849369344, + "grad_norm": 0.1376953125, + "learning_rate": 8.887140139317454e-05, + "lm_loss": 1.9, + "loss": 2.039, + "mask_loss": 0.1276, + "step": 1370, + "topk_loss": 0.0114 + }, + { + "epoch": 0.5449159436850636, + "grad_norm": 0.1240234375, + "learning_rate": 8.87447541406646e-05, + "lm_loss": 1.9248, + "loss": 2.0604, + "mask_loss": 0.1252, + "step": 1371, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5453134024331928, + "grad_norm": 0.12255859375, + "learning_rate": 8.861812517003345e-05, + "lm_loss": 1.9264, + "loss": 2.0684, + "mask_loss": 0.1294, + "step": 1372, + "topk_loss": 0.0126 + }, + { + "epoch": 0.545710861181322, + "grad_norm": 0.12890625, + "learning_rate": 8.849151468696434e-05, + "lm_loss": 1.89, + "loss": 2.0247, + "mask_loss": 0.1253, + "step": 1373, + "topk_loss": 0.0095 + }, + { + "epoch": 0.5461083199294511, + "grad_norm": 0.1357421875, + "learning_rate": 8.836492289711051e-05, + "lm_loss": 1.9543, + "loss": 2.0929, + "mask_loss": 0.1274, + "step": 1374, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5465057786775802, + "grad_norm": 0.1484375, + "learning_rate": 8.823835000609482e-05, + "lm_loss": 1.9542, + "loss": 2.0914, + "mask_loss": 0.1274, + "step": 1375, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5469032374257093, + "grad_norm": 0.12890625, + "learning_rate": 8.811179621950936e-05, + "lm_loss": 1.9566, + "loss": 2.0955, + "mask_loss": 0.1284, + "step": 1376, + "topk_loss": 0.0105 + }, + { + "epoch": 0.5473006961738385, + "grad_norm": 0.1259765625, + "learning_rate": 8.798526174291531e-05, + "lm_loss": 1.9385, + "loss": 2.0803, + "mask_loss": 0.1292, + "step": 1377, + "topk_loss": 0.0126 + }, + { + "epoch": 0.5476981549219677, + "grad_norm": 0.1181640625, + "learning_rate": 8.785874678184242e-05, + "lm_loss": 1.8737, + "loss": 2.0062, + "mask_loss": 0.1237, + "step": 1378, + "topk_loss": 0.0088 + }, + { + "epoch": 0.5480956136700968, + "grad_norm": 0.177734375, + "learning_rate": 8.773225154178873e-05, + "lm_loss": 1.92, + "loss": 2.058, + "mask_loss": 0.127, + "step": 1379, + "topk_loss": 0.011 + }, + { + "epoch": 0.548493072418226, + "grad_norm": 0.138671875, + "learning_rate": 8.76057762282203e-05, + "lm_loss": 1.9239, + "loss": 2.0607, + "mask_loss": 0.1255, + "step": 1380, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5488905311663551, + "grad_norm": 0.12451171875, + "learning_rate": 8.747932104657076e-05, + "lm_loss": 1.9414, + "loss": 2.0752, + "mask_loss": 0.1244, + "step": 1381, + "topk_loss": 0.0095 + }, + { + "epoch": 0.5492879899144842, + "grad_norm": 0.12353515625, + "learning_rate": 8.73528862022411e-05, + "lm_loss": 1.9191, + "loss": 2.0573, + "mask_loss": 0.128, + "step": 1382, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5496854486626134, + "grad_norm": 0.166015625, + "learning_rate": 8.722647190059924e-05, + "lm_loss": 1.8907, + "loss": 2.0265, + "mask_loss": 0.1266, + "step": 1383, + "topk_loss": 0.0091 + }, + { + "epoch": 0.5500829074107426, + "grad_norm": 0.1435546875, + "learning_rate": 8.710007834697969e-05, + "lm_loss": 1.8894, + "loss": 2.0261, + "mask_loss": 0.1269, + "step": 1384, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5504803661588717, + "grad_norm": 0.130859375, + "learning_rate": 8.697370574668335e-05, + "lm_loss": 1.8602, + "loss": 1.9982, + "mask_loss": 0.1286, + "step": 1385, + "topk_loss": 0.0094 + }, + { + "epoch": 0.5508778249070009, + "grad_norm": 0.11474609375, + "learning_rate": 8.684735430497704e-05, + "lm_loss": 1.9372, + "loss": 2.0721, + "mask_loss": 0.1264, + "step": 1386, + "topk_loss": 0.0085 + }, + { + "epoch": 0.55127528365513, + "grad_norm": 0.119140625, + "learning_rate": 8.672102422709323e-05, + "lm_loss": 1.9999, + "loss": 2.1386, + "mask_loss": 0.1274, + "step": 1387, + "topk_loss": 0.0114 + }, + { + "epoch": 0.5516727424032591, + "grad_norm": 0.171875, + "learning_rate": 8.659471571822964e-05, + "lm_loss": 1.9412, + "loss": 2.078, + "mask_loss": 0.1265, + "step": 1388, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5520702011513883, + "grad_norm": 0.1533203125, + "learning_rate": 8.6468428983549e-05, + "lm_loss": 1.8598, + "loss": 1.9957, + "mask_loss": 0.1269, + "step": 1389, + "topk_loss": 0.009 + }, + { + "epoch": 0.5524676598995174, + "grad_norm": 0.11962890625, + "learning_rate": 8.634216422817867e-05, + "lm_loss": 1.8608, + "loss": 1.9996, + "mask_loss": 0.1274, + "step": 1390, + "topk_loss": 0.0113 + }, + { + "epoch": 0.5528651186476466, + "grad_norm": 0.1123046875, + "learning_rate": 8.621592165721034e-05, + "lm_loss": 1.9219, + "loss": 2.0558, + "mask_loss": 0.1253, + "step": 1391, + "topk_loss": 0.0087 + }, + { + "epoch": 0.5532625773957758, + "grad_norm": 0.12353515625, + "learning_rate": 8.608970147569954e-05, + "lm_loss": 1.9326, + "loss": 2.0663, + "mask_loss": 0.1248, + "step": 1392, + "topk_loss": 0.0089 + }, + { + "epoch": 0.5536600361439049, + "grad_norm": 0.142578125, + "learning_rate": 8.596350388866558e-05, + "lm_loss": 1.916, + "loss": 2.0553, + "mask_loss": 0.1274, + "step": 1393, + "topk_loss": 0.0119 + }, + { + "epoch": 0.554057494892034, + "grad_norm": 0.134765625, + "learning_rate": 8.5837329101091e-05, + "lm_loss": 1.8629, + "loss": 2.0017, + "mask_loss": 0.1272, + "step": 1394, + "topk_loss": 0.0117 + }, + { + "epoch": 0.5544549536401632, + "grad_norm": 0.140625, + "learning_rate": 8.57111773179213e-05, + "lm_loss": 1.8755, + "loss": 2.0101, + "mask_loss": 0.1261, + "step": 1395, + "topk_loss": 0.0085 + }, + { + "epoch": 0.5548524123882923, + "grad_norm": 0.126953125, + "learning_rate": 8.558504874406464e-05, + "lm_loss": 1.9013, + "loss": 2.0408, + "mask_loss": 0.1271, + "step": 1396, + "topk_loss": 0.0124 + }, + { + "epoch": 0.5552498711364215, + "grad_norm": 0.11865234375, + "learning_rate": 8.545894358439148e-05, + "lm_loss": 1.896, + "loss": 2.0308, + "mask_loss": 0.1265, + "step": 1397, + "topk_loss": 0.0083 + }, + { + "epoch": 0.5556473298845507, + "grad_norm": 0.1162109375, + "learning_rate": 8.533286204373424e-05, + "lm_loss": 1.9127, + "loss": 2.052, + "mask_loss": 0.1289, + "step": 1398, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5560447886326798, + "grad_norm": 0.1435546875, + "learning_rate": 8.520680432688702e-05, + "lm_loss": 1.9136, + "loss": 2.0554, + "mask_loss": 0.1288, + "step": 1399, + "topk_loss": 0.013 + }, + { + "epoch": 0.5564422473808089, + "grad_norm": 0.16015625, + "learning_rate": 8.508077063860506e-05, + "lm_loss": 1.9322, + "loss": 2.0681, + "mask_loss": 0.1249, + "step": 1400, + "topk_loss": 0.0111 + }, + { + "epoch": 0.5564422473808089, + "eval_lm_loss": 687.5111083984375, + "eval_loss": 687.6494140625, + "eval_mask_hit_rate": 0.5331037044525146, + "eval_mask_loss": 0.12541639804840088, + "eval_mask_top_10_hit_rate": 0.9850775599479675, + "eval_mask_top_1_hit_rate": 0.997307538986206, + "eval_mask_top_20_hit_rate": 0.9754698276519775, + "eval_mask_top_5_hit_rate": 0.9905154705047607, + "eval_runtime": 144.4147, + "eval_samples_per_second": 14.181, + "eval_steps_per_second": 7.091, + "eval_token_accuracy": 0.6132440567016602, + "eval_top_k_diff": -523.2880249023438, + "eval_topk_loss": 0.012859683483839035, + "step": 1400 + }, + { + "epoch": 0.5568397061289381, + "grad_norm": 0.1123046875, + "learning_rate": 8.495476118360477e-05, + "lm_loss": 1.912, + "loss": 2.0477, + "mask_loss": 0.1263, + "step": 1401, + "topk_loss": 0.0094 + }, + { + "epoch": 0.5572371648770672, + "grad_norm": 0.123046875, + "learning_rate": 8.48287761665631e-05, + "lm_loss": 1.9322, + "loss": 2.0712, + "mask_loss": 0.1292, + "step": 1402, + "topk_loss": 0.0098 + }, + { + "epoch": 0.5576346236251964, + "grad_norm": 0.119140625, + "learning_rate": 8.470281579211733e-05, + "lm_loss": 1.9025, + "loss": 2.0405, + "mask_loss": 0.1283, + "step": 1403, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5580320823733256, + "grad_norm": 0.12060546875, + "learning_rate": 8.45768802648647e-05, + "lm_loss": 1.8724, + "loss": 2.0123, + "mask_loss": 0.1289, + "step": 1404, + "topk_loss": 0.011 + }, + { + "epoch": 0.5584295411214547, + "grad_norm": 0.12890625, + "learning_rate": 8.44509697893621e-05, + "lm_loss": 1.8755, + "loss": 2.0127, + "mask_loss": 0.1258, + "step": 1405, + "topk_loss": 0.0114 + }, + { + "epoch": 0.5588269998695838, + "grad_norm": 0.1455078125, + "learning_rate": 8.432508457012571e-05, + "lm_loss": 1.8924, + "loss": 2.0313, + "mask_loss": 0.1269, + "step": 1406, + "topk_loss": 0.012 + }, + { + "epoch": 0.559224458617713, + "grad_norm": 0.1162109375, + "learning_rate": 8.419922481163075e-05, + "lm_loss": 1.9442, + "loss": 2.0822, + "mask_loss": 0.1283, + "step": 1407, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5596219173658421, + "grad_norm": 0.12890625, + "learning_rate": 8.407339071831097e-05, + "lm_loss": 1.9364, + "loss": 2.0706, + "mask_loss": 0.1246, + "step": 1408, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5600193761139713, + "grad_norm": 0.11962890625, + "learning_rate": 8.394758249455853e-05, + "lm_loss": 1.8795, + "loss": 2.0169, + "mask_loss": 0.1279, + "step": 1409, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5604168348621005, + "grad_norm": 0.1171875, + "learning_rate": 8.382180034472353e-05, + "lm_loss": 1.8734, + "loss": 2.0089, + "mask_loss": 0.1263, + "step": 1410, + "topk_loss": 0.0092 + }, + { + "epoch": 0.5608142936102296, + "grad_norm": 0.12060546875, + "learning_rate": 8.369604447311373e-05, + "lm_loss": 1.9064, + "loss": 2.0414, + "mask_loss": 0.1263, + "step": 1411, + "topk_loss": 0.0086 + }, + { + "epoch": 0.5612117523583587, + "grad_norm": 0.1201171875, + "learning_rate": 8.35703150839942e-05, + "lm_loss": 1.8829, + "loss": 2.0187, + "mask_loss": 0.1255, + "step": 1412, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5616092111064879, + "grad_norm": 0.1611328125, + "learning_rate": 8.344461238158699e-05, + "lm_loss": 1.9118, + "loss": 2.0582, + "mask_loss": 0.131, + "step": 1413, + "topk_loss": 0.0154 + }, + { + "epoch": 0.562006669854617, + "grad_norm": 0.1259765625, + "learning_rate": 8.331893657007082e-05, + "lm_loss": 1.8842, + "loss": 2.0185, + "mask_loss": 0.1253, + "step": 1414, + "topk_loss": 0.0091 + }, + { + "epoch": 0.5624041286027462, + "grad_norm": 0.11279296875, + "learning_rate": 8.319328785358078e-05, + "lm_loss": 1.9075, + "loss": 2.0437, + "mask_loss": 0.1263, + "step": 1415, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5628015873508754, + "grad_norm": 0.1435546875, + "learning_rate": 8.306766643620774e-05, + "lm_loss": 1.903, + "loss": 2.0387, + "mask_loss": 0.1261, + "step": 1416, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5631990460990045, + "grad_norm": 0.1259765625, + "learning_rate": 8.29420725219985e-05, + "lm_loss": 1.8671, + "loss": 2.0027, + "mask_loss": 0.1268, + "step": 1417, + "topk_loss": 0.0088 + }, + { + "epoch": 0.5635965048471336, + "grad_norm": 0.140625, + "learning_rate": 8.281650631495501e-05, + "lm_loss": 1.8995, + "loss": 2.0343, + "mask_loss": 0.1248, + "step": 1418, + "topk_loss": 0.01 + }, + { + "epoch": 0.5639939635952628, + "grad_norm": 0.146484375, + "learning_rate": 8.26909680190343e-05, + "lm_loss": 1.9331, + "loss": 2.071, + "mask_loss": 0.1264, + "step": 1419, + "topk_loss": 0.0115 + }, + { + "epoch": 0.5643914223433919, + "grad_norm": 0.1923828125, + "learning_rate": 8.256545783814802e-05, + "lm_loss": 1.853, + "loss": 1.9906, + "mask_loss": 0.1275, + "step": 1420, + "topk_loss": 0.0102 + }, + { + "epoch": 0.564788881091521, + "grad_norm": 0.123046875, + "learning_rate": 8.243997597616217e-05, + "lm_loss": 1.9117, + "loss": 2.0458, + "mask_loss": 0.1249, + "step": 1421, + "topk_loss": 0.0091 + }, + { + "epoch": 0.5651863398396503, + "grad_norm": 0.12890625, + "learning_rate": 8.231452263689674e-05, + "lm_loss": 1.9307, + "loss": 2.0671, + "mask_loss": 0.1261, + "step": 1422, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5655837985877794, + "grad_norm": 0.140625, + "learning_rate": 8.218909802412542e-05, + "lm_loss": 1.9443, + "loss": 2.0841, + "mask_loss": 0.1291, + "step": 1423, + "topk_loss": 0.0107 + }, + { + "epoch": 0.5659812573359085, + "grad_norm": 0.12451171875, + "learning_rate": 8.20637023415752e-05, + "lm_loss": 1.8607, + "loss": 1.9997, + "mask_loss": 0.1284, + "step": 1424, + "topk_loss": 0.0106 + }, + { + "epoch": 0.5663787160840377, + "grad_norm": 0.1376953125, + "learning_rate": 8.19383357929261e-05, + "lm_loss": 1.9362, + "loss": 2.0714, + "mask_loss": 0.1248, + "step": 1425, + "topk_loss": 0.0103 + }, + { + "epoch": 0.5667761748321668, + "grad_norm": 0.119140625, + "learning_rate": 8.181299858181082e-05, + "lm_loss": 1.8738, + "loss": 2.0099, + "mask_loss": 0.1261, + "step": 1426, + "topk_loss": 0.01 + }, + { + "epoch": 0.5671736335802959, + "grad_norm": 0.12890625, + "learning_rate": 8.168769091181438e-05, + "lm_loss": 1.9215, + "loss": 2.0634, + "mask_loss": 0.1293, + "step": 1427, + "topk_loss": 0.0126 + }, + { + "epoch": 0.5675710923284252, + "grad_norm": 0.11767578125, + "learning_rate": 8.156241298647388e-05, + "lm_loss": 1.8747, + "loss": 2.014, + "mask_loss": 0.1293, + "step": 1428, + "topk_loss": 0.01 + }, + { + "epoch": 0.5679685510765543, + "grad_norm": 0.1337890625, + "learning_rate": 8.143716500927804e-05, + "lm_loss": 1.9372, + "loss": 2.0732, + "mask_loss": 0.1263, + "step": 1429, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5683660098246834, + "grad_norm": 0.13671875, + "learning_rate": 8.131194718366696e-05, + "lm_loss": 1.9328, + "loss": 2.0675, + "mask_loss": 0.1254, + "step": 1430, + "topk_loss": 0.0093 + }, + { + "epoch": 0.5687634685728126, + "grad_norm": 0.12109375, + "learning_rate": 8.11867597130318e-05, + "lm_loss": 1.9498, + "loss": 2.0898, + "mask_loss": 0.1283, + "step": 1431, + "topk_loss": 0.0117 + }, + { + "epoch": 0.5691609273209417, + "grad_norm": 0.173828125, + "learning_rate": 8.106160280071431e-05, + "lm_loss": 1.9142, + "loss": 2.0487, + "mask_loss": 0.126, + "step": 1432, + "topk_loss": 0.0086 + }, + { + "epoch": 0.5695583860690708, + "grad_norm": 0.166015625, + "learning_rate": 8.093647665000672e-05, + "lm_loss": 1.9168, + "loss": 2.0526, + "mask_loss": 0.1262, + "step": 1433, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5699558448172001, + "grad_norm": 0.1318359375, + "learning_rate": 8.081138146415121e-05, + "lm_loss": 1.9281, + "loss": 2.0671, + "mask_loss": 0.1288, + "step": 1434, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5703533035653292, + "grad_norm": 0.126953125, + "learning_rate": 8.068631744633976e-05, + "lm_loss": 1.951, + "loss": 2.0878, + "mask_loss": 0.1256, + "step": 1435, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5707507623134583, + "grad_norm": 0.12060546875, + "learning_rate": 8.056128479971361e-05, + "lm_loss": 1.977, + "loss": 2.1121, + "mask_loss": 0.125, + "step": 1436, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5711482210615875, + "grad_norm": 0.1416015625, + "learning_rate": 8.043628372736311e-05, + "lm_loss": 1.9565, + "loss": 2.0922, + "mask_loss": 0.1252, + "step": 1437, + "topk_loss": 0.0106 + }, + { + "epoch": 0.5715456798097166, + "grad_norm": 0.17578125, + "learning_rate": 8.031131443232734e-05, + "lm_loss": 1.9105, + "loss": 2.0467, + "mask_loss": 0.1267, + "step": 1438, + "topk_loss": 0.0095 + }, + { + "epoch": 0.5719431385578457, + "grad_norm": 0.162109375, + "learning_rate": 8.018637711759377e-05, + "lm_loss": 1.8987, + "loss": 2.0331, + "mask_loss": 0.1252, + "step": 1439, + "topk_loss": 0.0093 + }, + { + "epoch": 0.572340597305975, + "grad_norm": 0.1357421875, + "learning_rate": 8.006147198609778e-05, + "lm_loss": 1.9341, + "loss": 2.0673, + "mask_loss": 0.1235, + "step": 1440, + "topk_loss": 0.0097 + }, + { + "epoch": 0.5727380560541041, + "grad_norm": 0.123046875, + "learning_rate": 7.993659924072265e-05, + "lm_loss": 1.9626, + "loss": 2.1014, + "mask_loss": 0.1275, + "step": 1441, + "topk_loss": 0.0113 + }, + { + "epoch": 0.5731355148022332, + "grad_norm": 0.11865234375, + "learning_rate": 7.9811759084299e-05, + "lm_loss": 1.8526, + "loss": 1.9898, + "mask_loss": 0.1268, + "step": 1442, + "topk_loss": 0.0104 + }, + { + "epoch": 0.5735329735503624, + "grad_norm": 0.123046875, + "learning_rate": 7.968695171960449e-05, + "lm_loss": 1.8952, + "loss": 2.0323, + "mask_loss": 0.1271, + "step": 1443, + "topk_loss": 0.01 + }, + { + "epoch": 0.5739304322984915, + "grad_norm": 0.11962890625, + "learning_rate": 7.956217734936353e-05, + "lm_loss": 1.9943, + "loss": 2.1317, + "mask_loss": 0.1265, + "step": 1444, + "topk_loss": 0.0109 + }, + { + "epoch": 0.5743278910466206, + "grad_norm": 0.11865234375, + "learning_rate": 7.943743617624695e-05, + "lm_loss": 1.9151, + "loss": 2.0512, + "mask_loss": 0.1258, + "step": 1445, + "topk_loss": 0.0103 + }, + { + "epoch": 0.5747253497947499, + "grad_norm": 0.123046875, + "learning_rate": 7.931272840287165e-05, + "lm_loss": 1.9121, + "loss": 2.048, + "mask_loss": 0.1262, + "step": 1446, + "topk_loss": 0.0097 + }, + { + "epoch": 0.575122808542879, + "grad_norm": 0.15234375, + "learning_rate": 7.918805423180029e-05, + "lm_loss": 1.9777, + "loss": 2.1201, + "mask_loss": 0.1269, + "step": 1447, + "topk_loss": 0.0155 + }, + { + "epoch": 0.5755202672910081, + "grad_norm": 0.1318359375, + "learning_rate": 7.90634138655409e-05, + "lm_loss": 1.9178, + "loss": 2.0617, + "mask_loss": 0.1302, + "step": 1448, + "topk_loss": 0.0137 + }, + { + "epoch": 0.5759177260391373, + "grad_norm": 0.1279296875, + "learning_rate": 7.893880750654668e-05, + "lm_loss": 1.906, + "loss": 2.0468, + "mask_loss": 0.1286, + "step": 1449, + "topk_loss": 0.0123 + }, + { + "epoch": 0.5763151847872664, + "grad_norm": 0.1318359375, + "learning_rate": 7.881423535721551e-05, + "lm_loss": 1.9239, + "loss": 2.0578, + "mask_loss": 0.1246, + "step": 1450, + "topk_loss": 0.0092 + }, + { + "epoch": 0.5763151847872664, + "eval_lm_loss": 687.342041015625, + "eval_loss": 687.4800415039062, + "eval_mask_hit_rate": 0.5336586236953735, + "eval_mask_loss": 0.12521842122077942, + "eval_mask_top_10_hit_rate": 0.9852107763290405, + "eval_mask_top_1_hit_rate": 0.9973337650299072, + "eval_mask_top_20_hit_rate": 0.9756519198417664, + "eval_mask_top_5_hit_rate": 0.9906177520751953, + "eval_runtime": 144.3265, + "eval_samples_per_second": 14.19, + "eval_steps_per_second": 7.095, + "eval_token_accuracy": 0.6135073304176331, + "eval_top_k_diff": -522.5317993164062, + "eval_topk_loss": 0.012827243655920029, + "step": 1450 + }, + { + "epoch": 0.5767126435353955, + "grad_norm": 0.337890625, + "learning_rate": 7.868969761988978e-05, + "lm_loss": 1.8985, + "loss": 2.0948, + "mask_loss": 0.1625, + "step": 1451, + "topk_loss": 0.0338 + }, + { + "epoch": 0.5771101022835247, + "grad_norm": 0.19921875, + "learning_rate": 7.856519449685591e-05, + "lm_loss": 1.9234, + "loss": 2.0611, + "mask_loss": 0.1265, + "step": 1452, + "topk_loss": 0.0112 + }, + { + "epoch": 0.5775075610316539, + "grad_norm": 0.123046875, + "learning_rate": 7.844072619034417e-05, + "lm_loss": 1.9635, + "loss": 2.1034, + "mask_loss": 0.1275, + "step": 1453, + "topk_loss": 0.0124 + }, + { + "epoch": 0.577905019779783, + "grad_norm": 0.125, + "learning_rate": 7.831629290252823e-05, + "lm_loss": 1.9108, + "loss": 2.0478, + "mask_loss": 0.1269, + "step": 1454, + "topk_loss": 0.0101 + }, + { + "epoch": 0.5783024785279122, + "grad_norm": 0.1318359375, + "learning_rate": 7.819189483552493e-05, + "lm_loss": 1.9113, + "loss": 2.0471, + "mask_loss": 0.1258, + "step": 1455, + "topk_loss": 0.01 + }, + { + "epoch": 0.5786999372760413, + "grad_norm": 0.123046875, + "learning_rate": 7.806753219139377e-05, + "lm_loss": 1.8646, + "loss": 1.9995, + "mask_loss": 0.1265, + "step": 1456, + "topk_loss": 0.0084 + }, + { + "epoch": 0.5790973960241704, + "grad_norm": 0.111328125, + "learning_rate": 7.794320517213687e-05, + "lm_loss": 1.9129, + "loss": 2.0482, + "mask_loss": 0.1259, + "step": 1457, + "topk_loss": 0.0094 + }, + { + "epoch": 0.5794948547722996, + "grad_norm": 0.123046875, + "learning_rate": 7.781891397969838e-05, + "lm_loss": 1.8934, + "loss": 2.0337, + "mask_loss": 0.127, + "step": 1458, + "topk_loss": 0.0133 + }, + { + "epoch": 0.5798923135204288, + "grad_norm": 0.11669921875, + "learning_rate": 7.769465881596434e-05, + "lm_loss": 1.9682, + "loss": 2.1037, + "mask_loss": 0.1254, + "step": 1459, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5802897722685579, + "grad_norm": 0.11474609375, + "learning_rate": 7.75704398827622e-05, + "lm_loss": 1.876, + "loss": 2.0089, + "mask_loss": 0.1245, + "step": 1460, + "topk_loss": 0.0083 + }, + { + "epoch": 0.5806872310166871, + "grad_norm": 0.306640625, + "learning_rate": 7.744625738186059e-05, + "lm_loss": 1.9677, + "loss": 2.1455, + "mask_loss": 0.1494, + "step": 1461, + "topk_loss": 0.0285 + }, + { + "epoch": 0.5810846897648162, + "grad_norm": 0.12255859375, + "learning_rate": 7.732211151496895e-05, + "lm_loss": 1.9315, + "loss": 2.066, + "mask_loss": 0.1256, + "step": 1462, + "topk_loss": 0.009 + }, + { + "epoch": 0.5814821485129453, + "grad_norm": 0.125, + "learning_rate": 7.719800248373726e-05, + "lm_loss": 1.9285, + "loss": 2.0643, + "mask_loss": 0.1263, + "step": 1463, + "topk_loss": 0.0095 + }, + { + "epoch": 0.5818796072610745, + "grad_norm": 0.12451171875, + "learning_rate": 7.707393048975558e-05, + "lm_loss": 1.8748, + "loss": 2.0142, + "mask_loss": 0.1282, + "step": 1464, + "topk_loss": 0.0113 + }, + { + "epoch": 0.5822770660092037, + "grad_norm": 0.134765625, + "learning_rate": 7.694989573455388e-05, + "lm_loss": 1.8607, + "loss": 1.9976, + "mask_loss": 0.1269, + "step": 1465, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5826745247573328, + "grad_norm": 0.1435546875, + "learning_rate": 7.682589841960164e-05, + "lm_loss": 1.8428, + "loss": 1.9787, + "mask_loss": 0.1261, + "step": 1466, + "topk_loss": 0.0098 + }, + { + "epoch": 0.583071983505462, + "grad_norm": 0.1796875, + "learning_rate": 7.670193874630749e-05, + "lm_loss": 1.9415, + "loss": 2.0803, + "mask_loss": 0.1276, + "step": 1467, + "topk_loss": 0.0113 + }, + { + "epoch": 0.5834694422535911, + "grad_norm": 0.1328125, + "learning_rate": 7.657801691601896e-05, + "lm_loss": 2.0485, + "loss": 2.181, + "mask_loss": 0.1232, + "step": 1468, + "topk_loss": 0.0093 + }, + { + "epoch": 0.5838669010017202, + "grad_norm": 0.126953125, + "learning_rate": 7.645413313002207e-05, + "lm_loss": 1.84, + "loss": 1.9775, + "mask_loss": 0.1289, + "step": 1469, + "topk_loss": 0.0086 + }, + { + "epoch": 0.5842643597498494, + "grad_norm": 0.130859375, + "learning_rate": 7.633028758954109e-05, + "lm_loss": 1.8724, + "loss": 2.0092, + "mask_loss": 0.1262, + "step": 1470, + "topk_loss": 0.0106 + }, + { + "epoch": 0.5846618184979786, + "grad_norm": 0.15234375, + "learning_rate": 7.620648049573815e-05, + "lm_loss": 1.8695, + "loss": 2.0039, + "mask_loss": 0.1254, + "step": 1471, + "topk_loss": 0.009 + }, + { + "epoch": 0.5850592772461077, + "grad_norm": 0.130859375, + "learning_rate": 7.608271204971287e-05, + "lm_loss": 1.9123, + "loss": 2.0483, + "mask_loss": 0.1257, + "step": 1472, + "topk_loss": 0.0103 + }, + { + "epoch": 0.5854567359942369, + "grad_norm": 0.1455078125, + "learning_rate": 7.59589824525022e-05, + "lm_loss": 1.8812, + "loss": 2.0175, + "mask_loss": 0.1274, + "step": 1473, + "topk_loss": 0.0089 + }, + { + "epoch": 0.585854194742366, + "grad_norm": 0.1298828125, + "learning_rate": 7.583529190507992e-05, + "lm_loss": 1.9804, + "loss": 2.1196, + "mask_loss": 0.1275, + "step": 1474, + "topk_loss": 0.0118 + }, + { + "epoch": 0.5862516534904951, + "grad_norm": 0.1142578125, + "learning_rate": 7.571164060835641e-05, + "lm_loss": 1.9199, + "loss": 2.0533, + "mask_loss": 0.1244, + "step": 1475, + "topk_loss": 0.009 + }, + { + "epoch": 0.5866491122386243, + "grad_norm": 0.1201171875, + "learning_rate": 7.558802876317825e-05, + "lm_loss": 1.9074, + "loss": 2.0467, + "mask_loss": 0.1278, + "step": 1476, + "topk_loss": 0.0115 + }, + { + "epoch": 0.5870465709867534, + "grad_norm": 0.1357421875, + "learning_rate": 7.546445657032801e-05, + "lm_loss": 1.9905, + "loss": 2.1273, + "mask_loss": 0.1257, + "step": 1477, + "topk_loss": 0.011 + }, + { + "epoch": 0.5874440297348826, + "grad_norm": 0.11962890625, + "learning_rate": 7.534092423052381e-05, + "lm_loss": 1.9266, + "loss": 2.0633, + "mask_loss": 0.1269, + "step": 1478, + "topk_loss": 0.0098 + }, + { + "epoch": 0.5878414884830118, + "grad_norm": 0.1259765625, + "learning_rate": 7.521743194441904e-05, + "lm_loss": 1.9371, + "loss": 2.0692, + "mask_loss": 0.1234, + "step": 1479, + "topk_loss": 0.0087 + }, + { + "epoch": 0.5882389472311409, + "grad_norm": 0.125, + "learning_rate": 7.509397991260202e-05, + "lm_loss": 1.9036, + "loss": 2.0376, + "mask_loss": 0.1244, + "step": 1480, + "topk_loss": 0.0096 + }, + { + "epoch": 0.58863640597927, + "grad_norm": 0.171875, + "learning_rate": 7.497056833559573e-05, + "lm_loss": 1.9135, + "loss": 2.0566, + "mask_loss": 0.1294, + "step": 1481, + "topk_loss": 0.0137 + }, + { + "epoch": 0.5890338647273992, + "grad_norm": 0.17578125, + "learning_rate": 7.484719741385735e-05, + "lm_loss": 1.8347, + "loss": 1.9715, + "mask_loss": 0.1274, + "step": 1482, + "topk_loss": 0.0094 + }, + { + "epoch": 0.5894313234755283, + "grad_norm": 0.12158203125, + "learning_rate": 7.472386734777814e-05, + "lm_loss": 1.8663, + "loss": 2.002, + "mask_loss": 0.1258, + "step": 1483, + "topk_loss": 0.01 + }, + { + "epoch": 0.5898287822236575, + "grad_norm": 0.1220703125, + "learning_rate": 7.460057833768292e-05, + "lm_loss": 1.9451, + "loss": 2.0813, + "mask_loss": 0.1264, + "step": 1484, + "topk_loss": 0.0098 + }, + { + "epoch": 0.5902262409717867, + "grad_norm": 0.1142578125, + "learning_rate": 7.447733058382981e-05, + "lm_loss": 1.9263, + "loss": 2.0637, + "mask_loss": 0.1265, + "step": 1485, + "topk_loss": 0.0109 + }, + { + "epoch": 0.5906236997199158, + "grad_norm": 0.1357421875, + "learning_rate": 7.435412428641001e-05, + "lm_loss": 1.9, + "loss": 2.0355, + "mask_loss": 0.1259, + "step": 1486, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5910211584680449, + "grad_norm": 0.126953125, + "learning_rate": 7.423095964554731e-05, + "lm_loss": 1.8908, + "loss": 2.0295, + "mask_loss": 0.1278, + "step": 1487, + "topk_loss": 0.011 + }, + { + "epoch": 0.5914186172161741, + "grad_norm": 0.1416015625, + "learning_rate": 7.41078368612978e-05, + "lm_loss": 1.9473, + "loss": 2.0858, + "mask_loss": 0.1276, + "step": 1488, + "topk_loss": 0.0108 + }, + { + "epoch": 0.5918160759643032, + "grad_norm": 0.1259765625, + "learning_rate": 7.398475613364969e-05, + "lm_loss": 1.9053, + "loss": 2.0394, + "mask_loss": 0.1249, + "step": 1489, + "topk_loss": 0.0093 + }, + { + "epoch": 0.5922135347124324, + "grad_norm": 0.1787109375, + "learning_rate": 7.386171766252274e-05, + "lm_loss": 1.9353, + "loss": 2.0831, + "mask_loss": 0.1312, + "step": 1490, + "topk_loss": 0.0165 + }, + { + "epoch": 0.5926109934605616, + "grad_norm": 0.146484375, + "learning_rate": 7.373872164776824e-05, + "lm_loss": 1.8609, + "loss": 2.0012, + "mask_loss": 0.1276, + "step": 1491, + "topk_loss": 0.0126 + }, + { + "epoch": 0.5930084522086907, + "grad_norm": 0.5, + "learning_rate": 7.361576828916839e-05, + "lm_loss": 1.8879, + "loss": 2.0956, + "mask_loss": 0.1665, + "step": 1492, + "topk_loss": 0.0412 + }, + { + "epoch": 0.5934059109568198, + "grad_norm": 0.1298828125, + "learning_rate": 7.349285778643614e-05, + "lm_loss": 1.908, + "loss": 2.045, + "mask_loss": 0.1267, + "step": 1493, + "topk_loss": 0.0102 + }, + { + "epoch": 0.593803369704949, + "grad_norm": 0.1572265625, + "learning_rate": 7.336999033921486e-05, + "lm_loss": 1.9472, + "loss": 2.0836, + "mask_loss": 0.1257, + "step": 1494, + "topk_loss": 0.0107 + }, + { + "epoch": 0.5942008284530781, + "grad_norm": 0.1943359375, + "learning_rate": 7.324716614707793e-05, + "lm_loss": 1.8706, + "loss": 2.0106, + "mask_loss": 0.1298, + "step": 1495, + "topk_loss": 0.0102 + }, + { + "epoch": 0.5945982872012073, + "grad_norm": 0.1396484375, + "learning_rate": 7.312438540952852e-05, + "lm_loss": 1.8157, + "loss": 1.9582, + "mask_loss": 0.1301, + "step": 1496, + "topk_loss": 0.0124 + }, + { + "epoch": 0.5949957459493365, + "grad_norm": 0.1484375, + "learning_rate": 7.300164832599917e-05, + "lm_loss": 1.8506, + "loss": 1.9867, + "mask_loss": 0.1268, + "step": 1497, + "topk_loss": 0.0092 + }, + { + "epoch": 0.5953932046974656, + "grad_norm": 0.158203125, + "learning_rate": 7.287895509585156e-05, + "lm_loss": 1.8594, + "loss": 1.9964, + "mask_loss": 0.1277, + "step": 1498, + "topk_loss": 0.0093 + }, + { + "epoch": 0.5957906634455947, + "grad_norm": 0.123046875, + "learning_rate": 7.275630591837613e-05, + "lm_loss": 1.9108, + "loss": 2.0471, + "mask_loss": 0.1255, + "step": 1499, + "topk_loss": 0.0107 + }, + { + "epoch": 0.5961881221937239, + "grad_norm": 0.1533203125, + "learning_rate": 7.263370099279172e-05, + "lm_loss": 1.9664, + "loss": 2.1044, + "mask_loss": 0.1264, + "step": 1500, + "topk_loss": 0.0115 + }, + { + "epoch": 0.5961881221937239, + "eval_lm_loss": 688.2054443359375, + "eval_loss": 688.3433837890625, + "eval_mask_hit_rate": 0.5340866446495056, + "eval_mask_loss": 0.12516987323760986, + "eval_mask_top_10_hit_rate": 0.9852725267410278, + "eval_mask_top_1_hit_rate": 0.9973559379577637, + "eval_mask_top_20_hit_rate": 0.9757484197616577, + "eval_mask_top_5_hit_rate": 0.9906572103500366, + "eval_runtime": 143.6792, + "eval_samples_per_second": 14.254, + "eval_steps_per_second": 7.127, + "eval_token_accuracy": 0.6136693358421326, + "eval_top_k_diff": -533.968017578125, + "eval_topk_loss": 0.012739567086100578, + "step": 1500 + }, + { + "epoch": 0.596585580941853, + "grad_norm": 0.162109375, + "learning_rate": 7.251114051824534e-05, + "lm_loss": 1.8898, + "loss": 2.0319, + "mask_loss": 0.1297, + "step": 1501, + "topk_loss": 0.0124 + }, + { + "epoch": 0.5969830396899822, + "grad_norm": 0.203125, + "learning_rate": 7.238862469381177e-05, + "lm_loss": 1.9536, + "loss": 2.0893, + "mask_loss": 0.126, + "step": 1502, + "topk_loss": 0.0096 + }, + { + "epoch": 0.5973804984381114, + "grad_norm": 0.1376953125, + "learning_rate": 7.226615371849337e-05, + "lm_loss": 1.9464, + "loss": 2.0878, + "mask_loss": 0.1285, + "step": 1503, + "topk_loss": 0.013 + }, + { + "epoch": 0.5977779571862405, + "grad_norm": 0.158203125, + "learning_rate": 7.214372779121942e-05, + "lm_loss": 1.9762, + "loss": 2.1121, + "mask_loss": 0.126, + "step": 1504, + "topk_loss": 0.0099 + }, + { + "epoch": 0.5981754159343696, + "grad_norm": 0.126953125, + "learning_rate": 7.202134711084624e-05, + "lm_loss": 1.882, + "loss": 2.0148, + "mask_loss": 0.1242, + "step": 1505, + "topk_loss": 0.0087 + }, + { + "epoch": 0.5985728746824988, + "grad_norm": 0.1220703125, + "learning_rate": 7.189901187615658e-05, + "lm_loss": 1.8998, + "loss": 2.0348, + "mask_loss": 0.1255, + "step": 1506, + "topk_loss": 0.0094 + }, + { + "epoch": 0.5989703334306279, + "grad_norm": 0.126953125, + "learning_rate": 7.177672228585935e-05, + "lm_loss": 1.8359, + "loss": 1.9746, + "mask_loss": 0.1276, + "step": 1507, + "topk_loss": 0.0111 + }, + { + "epoch": 0.599367792178757, + "grad_norm": 0.1689453125, + "learning_rate": 7.165447853858937e-05, + "lm_loss": 1.9679, + "loss": 2.1039, + "mask_loss": 0.1256, + "step": 1508, + "topk_loss": 0.0103 + }, + { + "epoch": 0.5997652509268863, + "grad_norm": 0.2041015625, + "learning_rate": 7.153228083290698e-05, + "lm_loss": 1.8872, + "loss": 2.0229, + "mask_loss": 0.1256, + "step": 1509, + "topk_loss": 0.01 + }, + { + "epoch": 0.6001627096750154, + "grad_norm": 0.140625, + "learning_rate": 7.141012936729771e-05, + "lm_loss": 1.8764, + "loss": 2.0122, + "mask_loss": 0.126, + "step": 1510, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6005601684231445, + "grad_norm": 0.1162109375, + "learning_rate": 7.128802434017205e-05, + "lm_loss": 1.8735, + "loss": 2.0111, + "mask_loss": 0.1272, + "step": 1511, + "topk_loss": 0.0104 + }, + { + "epoch": 0.6009576271712737, + "grad_norm": 0.146484375, + "learning_rate": 7.116596594986494e-05, + "lm_loss": 1.917, + "loss": 2.0519, + "mask_loss": 0.1257, + "step": 1512, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6013550859194028, + "grad_norm": 0.1435546875, + "learning_rate": 7.104395439463567e-05, + "lm_loss": 1.9171, + "loss": 2.0525, + "mask_loss": 0.1263, + "step": 1513, + "topk_loss": 0.0091 + }, + { + "epoch": 0.6017525446675319, + "grad_norm": 0.1337890625, + "learning_rate": 7.092198987266742e-05, + "lm_loss": 1.9347, + "loss": 2.0748, + "mask_loss": 0.1287, + "step": 1514, + "topk_loss": 0.0114 + }, + { + "epoch": 0.6021500034156612, + "grad_norm": 0.1171875, + "learning_rate": 7.080007258206698e-05, + "lm_loss": 1.9195, + "loss": 2.0541, + "mask_loss": 0.1258, + "step": 1515, + "topk_loss": 0.0088 + }, + { + "epoch": 0.6025474621637903, + "grad_norm": 0.189453125, + "learning_rate": 7.067820272086443e-05, + "lm_loss": 1.854, + "loss": 1.9898, + "mask_loss": 0.1266, + "step": 1516, + "topk_loss": 0.0091 + }, + { + "epoch": 0.6029449209119194, + "grad_norm": 0.154296875, + "learning_rate": 7.055638048701278e-05, + "lm_loss": 1.8665, + "loss": 2.0022, + "mask_loss": 0.126, + "step": 1517, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6033423796600486, + "grad_norm": 0.2255859375, + "learning_rate": 7.043460607838772e-05, + "lm_loss": 1.8717, + "loss": 2.0117, + "mask_loss": 0.1279, + "step": 1518, + "topk_loss": 0.0121 + }, + { + "epoch": 0.6037398384081777, + "grad_norm": 0.15625, + "learning_rate": 7.031287969278728e-05, + "lm_loss": 1.9536, + "loss": 2.093, + "mask_loss": 0.1274, + "step": 1519, + "topk_loss": 0.012 + }, + { + "epoch": 0.6041372971563068, + "grad_norm": 0.134765625, + "learning_rate": 7.019120152793135e-05, + "lm_loss": 1.9424, + "loss": 2.0781, + "mask_loss": 0.1255, + "step": 1520, + "topk_loss": 0.0103 + }, + { + "epoch": 0.6045347559044361, + "grad_norm": 0.1181640625, + "learning_rate": 7.006957178146162e-05, + "lm_loss": 1.9302, + "loss": 2.0649, + "mask_loss": 0.1254, + "step": 1521, + "topk_loss": 0.0093 + }, + { + "epoch": 0.6049322146525652, + "grad_norm": 0.1533203125, + "learning_rate": 6.994799065094113e-05, + "lm_loss": 1.9078, + "loss": 2.0483, + "mask_loss": 0.1288, + "step": 1522, + "topk_loss": 0.0117 + }, + { + "epoch": 0.6053296734006943, + "grad_norm": 0.1533203125, + "learning_rate": 6.982645833385391e-05, + "lm_loss": 1.9466, + "loss": 2.0862, + "mask_loss": 0.1265, + "step": 1523, + "topk_loss": 0.0132 + }, + { + "epoch": 0.6057271321488235, + "grad_norm": 0.14453125, + "learning_rate": 6.970497502760471e-05, + "lm_loss": 1.8732, + "loss": 2.0105, + "mask_loss": 0.1271, + "step": 1524, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6061245908969526, + "grad_norm": 0.158203125, + "learning_rate": 6.95835409295187e-05, + "lm_loss": 1.8879, + "loss": 2.0242, + "mask_loss": 0.1254, + "step": 1525, + "topk_loss": 0.0109 + }, + { + "epoch": 0.6065220496450817, + "grad_norm": 0.150390625, + "learning_rate": 6.94621562368411e-05, + "lm_loss": 1.9548, + "loss": 2.0923, + "mask_loss": 0.1263, + "step": 1526, + "topk_loss": 0.0112 + }, + { + "epoch": 0.606919508393211, + "grad_norm": 0.1162109375, + "learning_rate": 6.934082114673688e-05, + "lm_loss": 1.8601, + "loss": 1.9986, + "mask_loss": 0.1276, + "step": 1527, + "topk_loss": 0.0109 + }, + { + "epoch": 0.6073169671413401, + "grad_norm": 0.1171875, + "learning_rate": 6.921953585629043e-05, + "lm_loss": 1.8455, + "loss": 1.9823, + "mask_loss": 0.1275, + "step": 1528, + "topk_loss": 0.0093 + }, + { + "epoch": 0.6077144258894692, + "grad_norm": 0.12255859375, + "learning_rate": 6.909830056250527e-05, + "lm_loss": 1.93, + "loss": 2.0652, + "mask_loss": 0.1246, + "step": 1529, + "topk_loss": 0.0106 + }, + { + "epoch": 0.6081118846375984, + "grad_norm": 0.126953125, + "learning_rate": 6.89771154623037e-05, + "lm_loss": 1.9597, + "loss": 2.0993, + "mask_loss": 0.1271, + "step": 1530, + "topk_loss": 0.0124 + }, + { + "epoch": 0.6085093433857275, + "grad_norm": 0.15234375, + "learning_rate": 6.88559807525265e-05, + "lm_loss": 1.8741, + "loss": 2.0109, + "mask_loss": 0.1274, + "step": 1531, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6089068021338566, + "grad_norm": 0.1328125, + "learning_rate": 6.873489662993261e-05, + "lm_loss": 1.9553, + "loss": 2.0951, + "mask_loss": 0.1274, + "step": 1532, + "topk_loss": 0.0124 + }, + { + "epoch": 0.6093042608819857, + "grad_norm": 0.11962890625, + "learning_rate": 6.861386329119877e-05, + "lm_loss": 1.8683, + "loss": 2.0041, + "mask_loss": 0.1267, + "step": 1533, + "topk_loss": 0.0091 + }, + { + "epoch": 0.609701719630115, + "grad_norm": 0.1259765625, + "learning_rate": 6.849288093291924e-05, + "lm_loss": 1.9182, + "loss": 2.0571, + "mask_loss": 0.1285, + "step": 1534, + "topk_loss": 0.0103 + }, + { + "epoch": 0.6100991783782441, + "grad_norm": 0.1416015625, + "learning_rate": 6.837194975160554e-05, + "lm_loss": 1.9849, + "loss": 2.1179, + "mask_loss": 0.1241, + "step": 1535, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6104966371263733, + "grad_norm": 0.12109375, + "learning_rate": 6.825106994368593e-05, + "lm_loss": 1.8937, + "loss": 2.0317, + "mask_loss": 0.1267, + "step": 1536, + "topk_loss": 0.0112 + }, + { + "epoch": 0.6108940958745024, + "grad_norm": 0.1181640625, + "learning_rate": 6.813024170550531e-05, + "lm_loss": 1.9574, + "loss": 2.0939, + "mask_loss": 0.1254, + "step": 1537, + "topk_loss": 0.0111 + }, + { + "epoch": 0.6112915546226315, + "grad_norm": 0.11767578125, + "learning_rate": 6.800946523332484e-05, + "lm_loss": 1.936, + "loss": 2.073, + "mask_loss": 0.1268, + "step": 1538, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6116890133707606, + "grad_norm": 0.1435546875, + "learning_rate": 6.788874072332152e-05, + "lm_loss": 1.9401, + "loss": 2.0826, + "mask_loss": 0.1284, + "step": 1539, + "topk_loss": 0.0141 + }, + { + "epoch": 0.6120864721188899, + "grad_norm": 0.1220703125, + "learning_rate": 6.776806837158802e-05, + "lm_loss": 1.8665, + "loss": 2.0022, + "mask_loss": 0.1264, + "step": 1540, + "topk_loss": 0.0093 + }, + { + "epoch": 0.612483930867019, + "grad_norm": 0.1142578125, + "learning_rate": 6.764744837413225e-05, + "lm_loss": 1.848, + "loss": 1.9839, + "mask_loss": 0.1263, + "step": 1541, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6128813896151482, + "grad_norm": 0.12353515625, + "learning_rate": 6.75268809268771e-05, + "lm_loss": 1.9316, + "loss": 2.0693, + "mask_loss": 0.1266, + "step": 1542, + "topk_loss": 0.0111 + }, + { + "epoch": 0.6132788483632773, + "grad_norm": 0.1171875, + "learning_rate": 6.74063662256601e-05, + "lm_loss": 1.9418, + "loss": 2.0779, + "mask_loss": 0.1252, + "step": 1543, + "topk_loss": 0.011 + }, + { + "epoch": 0.6136763071114064, + "grad_norm": 0.12353515625, + "learning_rate": 6.728590446623305e-05, + "lm_loss": 1.9308, + "loss": 2.0664, + "mask_loss": 0.1256, + "step": 1544, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6140737658595355, + "grad_norm": 0.119140625, + "learning_rate": 6.716549584426182e-05, + "lm_loss": 1.8845, + "loss": 2.0173, + "mask_loss": 0.1246, + "step": 1545, + "topk_loss": 0.0082 + }, + { + "epoch": 0.6144712246076648, + "grad_norm": 0.123046875, + "learning_rate": 6.704514055532597e-05, + "lm_loss": 1.9182, + "loss": 2.0568, + "mask_loss": 0.1265, + "step": 1546, + "topk_loss": 0.012 + }, + { + "epoch": 0.6148686833557939, + "grad_norm": 0.11962890625, + "learning_rate": 6.692483879491841e-05, + "lm_loss": 1.9522, + "loss": 2.091, + "mask_loss": 0.1285, + "step": 1547, + "topk_loss": 0.0102 + }, + { + "epoch": 0.615266142103923, + "grad_norm": 0.12353515625, + "learning_rate": 6.68045907584451e-05, + "lm_loss": 1.9256, + "loss": 2.0618, + "mask_loss": 0.127, + "step": 1548, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6156636008520522, + "grad_norm": 0.126953125, + "learning_rate": 6.668439664122475e-05, + "lm_loss": 1.9176, + "loss": 2.0529, + "mask_loss": 0.1261, + "step": 1549, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6160610596001813, + "grad_norm": 0.1591796875, + "learning_rate": 6.656425663848848e-05, + "lm_loss": 1.8485, + "loss": 1.9852, + "mask_loss": 0.1272, + "step": 1550, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6160610596001813, + "eval_lm_loss": 687.9506225585938, + "eval_loss": 688.0882568359375, + "eval_mask_hit_rate": 0.534498929977417, + "eval_mask_loss": 0.12500986456871033, + "eval_mask_top_10_hit_rate": 0.9853236675262451, + "eval_mask_top_1_hit_rate": 0.9973630905151367, + "eval_mask_top_20_hit_rate": 0.975848376750946, + "eval_mask_top_5_hit_rate": 0.9906688928604126, + "eval_runtime": 144.4709, + "eval_samples_per_second": 14.176, + "eval_steps_per_second": 7.088, + "eval_token_accuracy": 0.6139100193977356, + "eval_top_k_diff": -530.8353271484375, + "eval_topk_loss": 0.012646865099668503, + "step": 1550 + }, + { + "epoch": 0.6164585183483104, + "grad_norm": 0.11474609375, + "learning_rate": 6.644417094537956e-05, + "lm_loss": 1.9334, + "loss": 2.072, + "mask_loss": 0.1269, + "step": 1551, + "topk_loss": 0.0117 + }, + { + "epoch": 0.6168559770964397, + "grad_norm": 0.11328125, + "learning_rate": 6.63241397569529e-05, + "lm_loss": 1.8446, + "loss": 1.9813, + "mask_loss": 0.1269, + "step": 1552, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6172534358445688, + "grad_norm": 0.11767578125, + "learning_rate": 6.620416326817504e-05, + "lm_loss": 1.9193, + "loss": 2.052, + "mask_loss": 0.1234, + "step": 1553, + "topk_loss": 0.0094 + }, + { + "epoch": 0.617650894592698, + "grad_norm": 0.11767578125, + "learning_rate": 6.60842416739236e-05, + "lm_loss": 1.8713, + "loss": 2.0097, + "mask_loss": 0.1282, + "step": 1554, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6180483533408271, + "grad_norm": 0.1455078125, + "learning_rate": 6.596437516898703e-05, + "lm_loss": 1.9221, + "loss": 2.0639, + "mask_loss": 0.1295, + "step": 1555, + "topk_loss": 0.0122 + }, + { + "epoch": 0.6184458120889562, + "grad_norm": 0.11474609375, + "learning_rate": 6.584456394806434e-05, + "lm_loss": 1.921, + "loss": 2.0569, + "mask_loss": 0.1265, + "step": 1556, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6188432708370853, + "grad_norm": 0.11328125, + "learning_rate": 6.572480820576469e-05, + "lm_loss": 1.8918, + "loss": 2.0312, + "mask_loss": 0.128, + "step": 1557, + "topk_loss": 0.0114 + }, + { + "epoch": 0.6192407295852146, + "grad_norm": 0.1162109375, + "learning_rate": 6.560510813660719e-05, + "lm_loss": 1.8826, + "loss": 2.0195, + "mask_loss": 0.1273, + "step": 1558, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6196381883333437, + "grad_norm": 0.1689453125, + "learning_rate": 6.548546393502045e-05, + "lm_loss": 1.9157, + "loss": 2.0527, + "mask_loss": 0.1269, + "step": 1559, + "topk_loss": 0.01 + }, + { + "epoch": 0.6200356470814729, + "grad_norm": 0.130859375, + "learning_rate": 6.536587579534236e-05, + "lm_loss": 1.8873, + "loss": 2.0228, + "mask_loss": 0.1252, + "step": 1560, + "topk_loss": 0.0103 + }, + { + "epoch": 0.620433105829602, + "grad_norm": 0.11962890625, + "learning_rate": 6.524634391181975e-05, + "lm_loss": 1.8931, + "loss": 2.0282, + "mask_loss": 0.1253, + "step": 1561, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6208305645777311, + "grad_norm": 0.12255859375, + "learning_rate": 6.51268684786081e-05, + "lm_loss": 1.9159, + "loss": 2.0564, + "mask_loss": 0.1285, + "step": 1562, + "topk_loss": 0.012 + }, + { + "epoch": 0.6212280233258602, + "grad_norm": 0.1328125, + "learning_rate": 6.500744968977116e-05, + "lm_loss": 1.9496, + "loss": 2.0841, + "mask_loss": 0.1245, + "step": 1563, + "topk_loss": 0.01 + }, + { + "epoch": 0.6216254820739894, + "grad_norm": 0.12890625, + "learning_rate": 6.48880877392807e-05, + "lm_loss": 1.8963, + "loss": 2.0317, + "mask_loss": 0.1262, + "step": 1564, + "topk_loss": 0.0093 + }, + { + "epoch": 0.6220229408221186, + "grad_norm": 0.123046875, + "learning_rate": 6.476878282101614e-05, + "lm_loss": 1.9197, + "loss": 2.0538, + "mask_loss": 0.1241, + "step": 1565, + "topk_loss": 0.01 + }, + { + "epoch": 0.6224203995702478, + "grad_norm": 0.1201171875, + "learning_rate": 6.46495351287643e-05, + "lm_loss": 1.9183, + "loss": 2.0556, + "mask_loss": 0.1268, + "step": 1566, + "topk_loss": 0.0105 + }, + { + "epoch": 0.6228178583183769, + "grad_norm": 0.126953125, + "learning_rate": 6.453034485621904e-05, + "lm_loss": 1.9571, + "loss": 2.0942, + "mask_loss": 0.1272, + "step": 1567, + "topk_loss": 0.0099 + }, + { + "epoch": 0.623215317066506, + "grad_norm": 0.1357421875, + "learning_rate": 6.441121219698087e-05, + "lm_loss": 1.9259, + "loss": 2.0597, + "mask_loss": 0.125, + "step": 1568, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6236127758146351, + "grad_norm": 0.11376953125, + "learning_rate": 6.429213734455683e-05, + "lm_loss": 1.9268, + "loss": 2.0627, + "mask_loss": 0.126, + "step": 1569, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6240102345627643, + "grad_norm": 0.1259765625, + "learning_rate": 6.417312049236004e-05, + "lm_loss": 1.9038, + "loss": 2.0373, + "mask_loss": 0.124, + "step": 1570, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6244076933108935, + "grad_norm": 0.1220703125, + "learning_rate": 6.405416183370936e-05, + "lm_loss": 1.8937, + "loss": 2.0307, + "mask_loss": 0.1275, + "step": 1571, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6248051520590227, + "grad_norm": 0.1435546875, + "learning_rate": 6.393526156182918e-05, + "lm_loss": 1.8852, + "loss": 2.0238, + "mask_loss": 0.1267, + "step": 1572, + "topk_loss": 0.012 + }, + { + "epoch": 0.6252026108071518, + "grad_norm": 0.1328125, + "learning_rate": 6.381641986984901e-05, + "lm_loss": 1.9827, + "loss": 2.1183, + "mask_loss": 0.1256, + "step": 1573, + "topk_loss": 0.0101 + }, + { + "epoch": 0.6256000695552809, + "grad_norm": 0.115234375, + "learning_rate": 6.369763695080327e-05, + "lm_loss": 1.9191, + "loss": 2.0537, + "mask_loss": 0.1248, + "step": 1574, + "topk_loss": 0.0098 + }, + { + "epoch": 0.62599752830341, + "grad_norm": 0.1220703125, + "learning_rate": 6.357891299763086e-05, + "lm_loss": 1.8599, + "loss": 1.9967, + "mask_loss": 0.1272, + "step": 1575, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6263949870515392, + "grad_norm": 0.1630859375, + "learning_rate": 6.346024820317488e-05, + "lm_loss": 1.894, + "loss": 2.0287, + "mask_loss": 0.1251, + "step": 1576, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6267924457996684, + "grad_norm": 0.1416015625, + "learning_rate": 6.334164276018242e-05, + "lm_loss": 1.8849, + "loss": 2.0216, + "mask_loss": 0.1267, + "step": 1577, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6271899045477975, + "grad_norm": 0.126953125, + "learning_rate": 6.32230968613041e-05, + "lm_loss": 1.8525, + "loss": 1.9909, + "mask_loss": 0.1278, + "step": 1578, + "topk_loss": 0.0106 + }, + { + "epoch": 0.6275873632959267, + "grad_norm": 0.1650390625, + "learning_rate": 6.310461069909384e-05, + "lm_loss": 1.8428, + "loss": 1.9792, + "mask_loss": 0.1258, + "step": 1579, + "topk_loss": 0.0106 + }, + { + "epoch": 0.6279848220440558, + "grad_norm": 0.1845703125, + "learning_rate": 6.298618446600856e-05, + "lm_loss": 1.8653, + "loss": 1.999, + "mask_loss": 0.1242, + "step": 1580, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6283822807921849, + "grad_norm": 0.1376953125, + "learning_rate": 6.286781835440778e-05, + "lm_loss": 1.9132, + "loss": 2.0546, + "mask_loss": 0.1285, + "step": 1581, + "topk_loss": 0.013 + }, + { + "epoch": 0.6287797395403141, + "grad_norm": 0.11962890625, + "learning_rate": 6.274951255655344e-05, + "lm_loss": 1.9113, + "loss": 2.0469, + "mask_loss": 0.1258, + "step": 1582, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6291771982884433, + "grad_norm": 0.1142578125, + "learning_rate": 6.263126726460945e-05, + "lm_loss": 1.8686, + "loss": 2.0055, + "mask_loss": 0.1281, + "step": 1583, + "topk_loss": 0.0088 + }, + { + "epoch": 0.6295746570365724, + "grad_norm": 0.1376953125, + "learning_rate": 6.251308267064143e-05, + "lm_loss": 1.943, + "loss": 2.087, + "mask_loss": 0.1282, + "step": 1584, + "topk_loss": 0.0157 + }, + { + "epoch": 0.6299721157847016, + "grad_norm": 0.11767578125, + "learning_rate": 6.239495896661643e-05, + "lm_loss": 1.8985, + "loss": 2.0348, + "mask_loss": 0.127, + "step": 1585, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6303695745328307, + "grad_norm": 0.1279296875, + "learning_rate": 6.227689634440263e-05, + "lm_loss": 1.893, + "loss": 2.0332, + "mask_loss": 0.1281, + "step": 1586, + "topk_loss": 0.012 + }, + { + "epoch": 0.6307670332809598, + "grad_norm": 0.1416015625, + "learning_rate": 6.215889499576898e-05, + "lm_loss": 1.8923, + "loss": 2.0254, + "mask_loss": 0.124, + "step": 1587, + "topk_loss": 0.0091 + }, + { + "epoch": 0.631164492029089, + "grad_norm": 0.1455078125, + "learning_rate": 6.204095511238487e-05, + "lm_loss": 1.8733, + "loss": 2.0108, + "mask_loss": 0.1275, + "step": 1588, + "topk_loss": 0.01 + }, + { + "epoch": 0.6315619507772181, + "grad_norm": 0.12109375, + "learning_rate": 6.192307688581989e-05, + "lm_loss": 1.9041, + "loss": 2.0397, + "mask_loss": 0.1248, + "step": 1589, + "topk_loss": 0.0109 + }, + { + "epoch": 0.6319594095253473, + "grad_norm": 0.119140625, + "learning_rate": 6.180526050754346e-05, + "lm_loss": 1.9108, + "loss": 2.045, + "mask_loss": 0.1254, + "step": 1590, + "topk_loss": 0.0088 + }, + { + "epoch": 0.6323568682734765, + "grad_norm": 0.138671875, + "learning_rate": 6.168750616892459e-05, + "lm_loss": 1.9126, + "loss": 2.048, + "mask_loss": 0.1255, + "step": 1591, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6327543270216056, + "grad_norm": 0.1259765625, + "learning_rate": 6.156981406123137e-05, + "lm_loss": 1.9135, + "loss": 2.0517, + "mask_loss": 0.1274, + "step": 1592, + "topk_loss": 0.0108 + }, + { + "epoch": 0.6331517857697347, + "grad_norm": 0.12890625, + "learning_rate": 6.145218437563097e-05, + "lm_loss": 1.9114, + "loss": 2.046, + "mask_loss": 0.1257, + "step": 1593, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6335492445178639, + "grad_norm": 0.130859375, + "learning_rate": 6.133461730318911e-05, + "lm_loss": 1.9025, + "loss": 2.0381, + "mask_loss": 0.1265, + "step": 1594, + "topk_loss": 0.0091 + }, + { + "epoch": 0.633946703265993, + "grad_norm": 0.138671875, + "learning_rate": 6.12171130348698e-05, + "lm_loss": 1.9223, + "loss": 2.0587, + "mask_loss": 0.1267, + "step": 1595, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6343441620141222, + "grad_norm": 0.1474609375, + "learning_rate": 6.109967176153506e-05, + "lm_loss": 1.8486, + "loss": 1.9804, + "mask_loss": 0.1234, + "step": 1596, + "topk_loss": 0.0084 + }, + { + "epoch": 0.6347416207622514, + "grad_norm": 0.11376953125, + "learning_rate": 6.0982293673944544e-05, + "lm_loss": 1.9579, + "loss": 2.0944, + "mask_loss": 0.1267, + "step": 1597, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6351390795103805, + "grad_norm": 0.11572265625, + "learning_rate": 6.0864978962755335e-05, + "lm_loss": 1.8544, + "loss": 1.9925, + "mask_loss": 0.1269, + "step": 1598, + "topk_loss": 0.0112 + }, + { + "epoch": 0.6355365382585096, + "grad_norm": 0.11474609375, + "learning_rate": 6.074772781852158e-05, + "lm_loss": 1.9056, + "loss": 2.0393, + "mask_loss": 0.1248, + "step": 1599, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6359339970066388, + "grad_norm": 0.1376953125, + "learning_rate": 6.0630540431694026e-05, + "lm_loss": 1.8817, + "loss": 2.0173, + "mask_loss": 0.1259, + "step": 1600, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6359339970066388, + "eval_lm_loss": 686.8424682617188, + "eval_loss": 686.9800415039062, + "eval_mask_hit_rate": 0.534833550453186, + "eval_mask_loss": 0.12482748180627823, + "eval_mask_top_10_hit_rate": 0.9854109287261963, + "eval_mask_top_1_hit_rate": 0.9973936080932617, + "eval_mask_top_20_hit_rate": 0.9759554862976074, + "eval_mask_top_5_hit_rate": 0.9907490015029907, + "eval_runtime": 144.1701, + "eval_samples_per_second": 14.205, + "eval_steps_per_second": 7.103, + "eval_token_accuracy": 0.6140152215957642, + "eval_top_k_diff": -523.09375, + "eval_topk_loss": 0.01275416649878025, + "step": 1600 + }, + { + "epoch": 0.6363314557547679, + "grad_norm": 0.123046875, + "learning_rate": 6.051341699262003e-05, + "lm_loss": 1.8735, + "loss": 2.0167, + "mask_loss": 0.1314, + "step": 1601, + "topk_loss": 0.0119 + }, + { + "epoch": 0.6367289145028971, + "grad_norm": 0.146484375, + "learning_rate": 6.039635769154301e-05, + "lm_loss": 1.8738, + "loss": 2.0069, + "mask_loss": 0.1247, + "step": 1602, + "topk_loss": 0.0085 + }, + { + "epoch": 0.6371263732510263, + "grad_norm": 0.1806640625, + "learning_rate": 6.027936271860223e-05, + "lm_loss": 1.9179, + "loss": 2.0517, + "mask_loss": 0.1249, + "step": 1603, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6375238319991554, + "grad_norm": 0.115234375, + "learning_rate": 6.016243226383241e-05, + "lm_loss": 1.9301, + "loss": 2.0631, + "mask_loss": 0.1237, + "step": 1604, + "topk_loss": 0.0093 + }, + { + "epoch": 0.6379212907472845, + "grad_norm": 0.111328125, + "learning_rate": 6.004556651716354e-05, + "lm_loss": 1.8686, + "loss": 2.0035, + "mask_loss": 0.1266, + "step": 1605, + "topk_loss": 0.0083 + }, + { + "epoch": 0.6383187494954137, + "grad_norm": 0.138671875, + "learning_rate": 5.992876566842047e-05, + "lm_loss": 1.9002, + "loss": 2.0353, + "mask_loss": 0.1248, + "step": 1606, + "topk_loss": 0.0103 + }, + { + "epoch": 0.6387162082435428, + "grad_norm": 0.11328125, + "learning_rate": 5.981202990732267e-05, + "lm_loss": 1.9001, + "loss": 2.0344, + "mask_loss": 0.1255, + "step": 1607, + "topk_loss": 0.0087 + }, + { + "epoch": 0.639113666991672, + "grad_norm": 0.1318359375, + "learning_rate": 5.969535942348379e-05, + "lm_loss": 1.943, + "loss": 2.0812, + "mask_loss": 0.1265, + "step": 1608, + "topk_loss": 0.0116 + }, + { + "epoch": 0.6395111257398012, + "grad_norm": 0.1171875, + "learning_rate": 5.957875440641155e-05, + "lm_loss": 1.9564, + "loss": 2.0918, + "mask_loss": 0.1257, + "step": 1609, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6399085844879303, + "grad_norm": 0.13671875, + "learning_rate": 5.946221504550732e-05, + "lm_loss": 1.8903, + "loss": 2.0278, + "mask_loss": 0.1279, + "step": 1610, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6403060432360594, + "grad_norm": 0.1484375, + "learning_rate": 5.934574153006579e-05, + "lm_loss": 1.9521, + "loss": 2.0879, + "mask_loss": 0.1263, + "step": 1611, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6407035019841886, + "grad_norm": 0.1162109375, + "learning_rate": 5.922933404927473e-05, + "lm_loss": 1.9203, + "loss": 2.0542, + "mask_loss": 0.1239, + "step": 1612, + "topk_loss": 0.01 + }, + { + "epoch": 0.6411009607323177, + "grad_norm": 0.11865234375, + "learning_rate": 5.911299279221463e-05, + "lm_loss": 1.8034, + "loss": 1.9406, + "mask_loss": 0.128, + "step": 1613, + "topk_loss": 0.0092 + }, + { + "epoch": 0.641498419480447, + "grad_norm": 0.1181640625, + "learning_rate": 5.899671794785839e-05, + "lm_loss": 1.9372, + "loss": 2.0712, + "mask_loss": 0.1243, + "step": 1614, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6418958782285761, + "grad_norm": 0.14453125, + "learning_rate": 5.888050970507114e-05, + "lm_loss": 1.8709, + "loss": 2.0097, + "mask_loss": 0.1276, + "step": 1615, + "topk_loss": 0.0111 + }, + { + "epoch": 0.6422933369767052, + "grad_norm": 0.1162109375, + "learning_rate": 5.876436825260967e-05, + "lm_loss": 1.9412, + "loss": 2.0764, + "mask_loss": 0.1249, + "step": 1616, + "topk_loss": 0.0103 + }, + { + "epoch": 0.6426907957248343, + "grad_norm": 0.1162109375, + "learning_rate": 5.86482937791224e-05, + "lm_loss": 1.8946, + "loss": 2.0269, + "mask_loss": 0.1251, + "step": 1617, + "topk_loss": 0.0072 + }, + { + "epoch": 0.6430882544729635, + "grad_norm": 0.1376953125, + "learning_rate": 5.85322864731489e-05, + "lm_loss": 1.9194, + "loss": 2.0567, + "mask_loss": 0.1281, + "step": 1618, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6434857132210926, + "grad_norm": 0.12353515625, + "learning_rate": 5.841634652311969e-05, + "lm_loss": 1.8678, + "loss": 2.0049, + "mask_loss": 0.1277, + "step": 1619, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6438831719692217, + "grad_norm": 0.1123046875, + "learning_rate": 5.830047411735588e-05, + "lm_loss": 1.9259, + "loss": 2.0574, + "mask_loss": 0.1223, + "step": 1620, + "topk_loss": 0.0093 + }, + { + "epoch": 0.644280630717351, + "grad_norm": 0.1201171875, + "learning_rate": 5.818466944406877e-05, + "lm_loss": 1.8894, + "loss": 2.0248, + "mask_loss": 0.1255, + "step": 1621, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6446780894654801, + "grad_norm": 0.11376953125, + "learning_rate": 5.8068932691359753e-05, + "lm_loss": 1.9152, + "loss": 2.0512, + "mask_loss": 0.1264, + "step": 1622, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6450755482136092, + "grad_norm": 0.12158203125, + "learning_rate": 5.795326404721988e-05, + "lm_loss": 1.9404, + "loss": 2.0807, + "mask_loss": 0.1271, + "step": 1623, + "topk_loss": 0.0132 + }, + { + "epoch": 0.6454730069617384, + "grad_norm": 0.12109375, + "learning_rate": 5.783766369952952e-05, + "lm_loss": 1.8673, + "loss": 1.9987, + "mask_loss": 0.1229, + "step": 1624, + "topk_loss": 0.0086 + }, + { + "epoch": 0.6458704657098675, + "grad_norm": 0.11279296875, + "learning_rate": 5.772213183605817e-05, + "lm_loss": 1.9092, + "loss": 2.0442, + "mask_loss": 0.1246, + "step": 1625, + "topk_loss": 0.0104 + }, + { + "epoch": 0.6462679244579966, + "grad_norm": 0.11669921875, + "learning_rate": 5.760666864446403e-05, + "lm_loss": 1.8945, + "loss": 2.0302, + "mask_loss": 0.1266, + "step": 1626, + "topk_loss": 0.009 + }, + { + "epoch": 0.6466653832061259, + "grad_norm": 0.130859375, + "learning_rate": 5.7491274312293816e-05, + "lm_loss": 1.8766, + "loss": 2.0137, + "mask_loss": 0.1274, + "step": 1627, + "topk_loss": 0.0097 + }, + { + "epoch": 0.647062841954255, + "grad_norm": 0.11572265625, + "learning_rate": 5.7375949026982365e-05, + "lm_loss": 1.9554, + "loss": 2.092, + "mask_loss": 0.1258, + "step": 1628, + "topk_loss": 0.0108 + }, + { + "epoch": 0.6474603007023841, + "grad_norm": 0.1298828125, + "learning_rate": 5.726069297585235e-05, + "lm_loss": 1.8913, + "loss": 2.0263, + "mask_loss": 0.1246, + "step": 1629, + "topk_loss": 0.0103 + }, + { + "epoch": 0.6478577594505133, + "grad_norm": 0.2333984375, + "learning_rate": 5.714550634611401e-05, + "lm_loss": 1.8887, + "loss": 2.0472, + "mask_loss": 0.1386, + "step": 1630, + "topk_loss": 0.0199 + }, + { + "epoch": 0.6482552181986424, + "grad_norm": 0.11669921875, + "learning_rate": 5.703038932486484e-05, + "lm_loss": 1.8684, + "loss": 2.0043, + "mask_loss": 0.1265, + "step": 1631, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6486526769467715, + "grad_norm": 0.169921875, + "learning_rate": 5.691534209908919e-05, + "lm_loss": 1.904, + "loss": 2.0369, + "mask_loss": 0.1234, + "step": 1632, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6490501356949008, + "grad_norm": 0.1767578125, + "learning_rate": 5.680036485565811e-05, + "lm_loss": 1.9141, + "loss": 2.048, + "mask_loss": 0.1243, + "step": 1633, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6494475944430299, + "grad_norm": 0.1357421875, + "learning_rate": 5.668545778132897e-05, + "lm_loss": 1.8989, + "loss": 2.0371, + "mask_loss": 0.1274, + "step": 1634, + "topk_loss": 0.0108 + }, + { + "epoch": 0.649845053191159, + "grad_norm": 0.1796875, + "learning_rate": 5.6570621062745146e-05, + "lm_loss": 1.9017, + "loss": 2.0359, + "mask_loss": 0.1239, + "step": 1635, + "topk_loss": 0.0103 + }, + { + "epoch": 0.6502425119392882, + "grad_norm": 0.12158203125, + "learning_rate": 5.6455854886435765e-05, + "lm_loss": 1.869, + "loss": 2.0046, + "mask_loss": 0.1262, + "step": 1636, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6506399706874173, + "grad_norm": 0.1259765625, + "learning_rate": 5.634115943881535e-05, + "lm_loss": 1.9075, + "loss": 2.0437, + "mask_loss": 0.1252, + "step": 1637, + "topk_loss": 0.011 + }, + { + "epoch": 0.6510374294355464, + "grad_norm": 0.15625, + "learning_rate": 5.622653490618353e-05, + "lm_loss": 1.982, + "loss": 2.1171, + "mask_loss": 0.1254, + "step": 1638, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6514348881836757, + "grad_norm": 0.1279296875, + "learning_rate": 5.611198147472481e-05, + "lm_loss": 1.9015, + "loss": 2.04, + "mask_loss": 0.1291, + "step": 1639, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6518323469318048, + "grad_norm": 0.1533203125, + "learning_rate": 5.5997499330508066e-05, + "lm_loss": 1.8138, + "loss": 1.9527, + "mask_loss": 0.1291, + "step": 1640, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6522298056799339, + "grad_norm": 0.1376953125, + "learning_rate": 5.5883088659486525e-05, + "lm_loss": 1.8897, + "loss": 2.0261, + "mask_loss": 0.1262, + "step": 1641, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6526272644280631, + "grad_norm": 0.11376953125, + "learning_rate": 5.576874964749727e-05, + "lm_loss": 1.8951, + "loss": 2.0294, + "mask_loss": 0.1247, + "step": 1642, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6530247231761922, + "grad_norm": 0.12451171875, + "learning_rate": 5.5654482480260964e-05, + "lm_loss": 1.923, + "loss": 2.0618, + "mask_loss": 0.1273, + "step": 1643, + "topk_loss": 0.0115 + }, + { + "epoch": 0.6534221819243213, + "grad_norm": 0.1474609375, + "learning_rate": 5.5540287343381606e-05, + "lm_loss": 1.8869, + "loss": 2.0227, + "mask_loss": 0.1261, + "step": 1644, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6538196406724506, + "grad_norm": 0.1982421875, + "learning_rate": 5.542616442234618e-05, + "lm_loss": 1.8922, + "loss": 2.047, + "mask_loss": 0.135, + "step": 1645, + "topk_loss": 0.0198 + }, + { + "epoch": 0.6542170994205797, + "grad_norm": 0.1396484375, + "learning_rate": 5.531211390252438e-05, + "lm_loss": 1.9581, + "loss": 2.0917, + "mask_loss": 0.1247, + "step": 1646, + "topk_loss": 0.009 + }, + { + "epoch": 0.6546145581687088, + "grad_norm": 0.12451171875, + "learning_rate": 5.519813596916833e-05, + "lm_loss": 1.9184, + "loss": 2.0532, + "mask_loss": 0.1247, + "step": 1647, + "topk_loss": 0.0101 + }, + { + "epoch": 0.655012016916838, + "grad_norm": 0.125, + "learning_rate": 5.5084230807412126e-05, + "lm_loss": 1.9282, + "loss": 2.0674, + "mask_loss": 0.1273, + "step": 1648, + "topk_loss": 0.012 + }, + { + "epoch": 0.6554094756649671, + "grad_norm": 0.14453125, + "learning_rate": 5.497039860227181e-05, + "lm_loss": 1.8658, + "loss": 1.9997, + "mask_loss": 0.124, + "step": 1649, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6558069344130962, + "grad_norm": 0.11376953125, + "learning_rate": 5.485663953864484e-05, + "lm_loss": 1.9037, + "loss": 2.0386, + "mask_loss": 0.1262, + "step": 1650, + "topk_loss": 0.0087 + }, + { + "epoch": 0.6558069344130962, + "eval_lm_loss": 687.2271118164062, + "eval_loss": 687.3645629882812, + "eval_mask_hit_rate": 0.5350625514984131, + "eval_mask_loss": 0.12471777945756912, + "eval_mask_top_10_hit_rate": 0.9854594469070435, + "eval_mask_top_1_hit_rate": 0.9974088668823242, + "eval_mask_top_20_hit_rate": 0.9760203957557678, + "eval_mask_top_5_hit_rate": 0.9907782077789307, + "eval_runtime": 144.1065, + "eval_samples_per_second": 14.212, + "eval_steps_per_second": 7.106, + "eval_token_accuracy": 0.6141659617424011, + "eval_top_k_diff": -526.4982299804688, + "eval_topk_loss": 0.012702615931630135, + "step": 1650 + }, + { + "epoch": 0.6562043931612254, + "grad_norm": 0.111328125, + "learning_rate": 5.474295380130989e-05, + "lm_loss": 1.8784, + "loss": 2.013, + "mask_loss": 0.1255, + "step": 1651, + "topk_loss": 0.0091 + }, + { + "epoch": 0.6566018519093546, + "grad_norm": 0.1220703125, + "learning_rate": 5.462934157492656e-05, + "lm_loss": 1.9161, + "loss": 2.0519, + "mask_loss": 0.1261, + "step": 1652, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6569993106574837, + "grad_norm": 0.1484375, + "learning_rate": 5.4515803044034985e-05, + "lm_loss": 1.9306, + "loss": 2.0692, + "mask_loss": 0.1257, + "step": 1653, + "topk_loss": 0.0129 + }, + { + "epoch": 0.6573967694056129, + "grad_norm": 0.1162109375, + "learning_rate": 5.440233839305564e-05, + "lm_loss": 1.9124, + "loss": 2.045, + "mask_loss": 0.1233, + "step": 1654, + "topk_loss": 0.0093 + }, + { + "epoch": 0.657794228153742, + "grad_norm": 0.134765625, + "learning_rate": 5.428894780628899e-05, + "lm_loss": 1.8946, + "loss": 2.0295, + "mask_loss": 0.1243, + "step": 1655, + "topk_loss": 0.0105 + }, + { + "epoch": 0.6581916869018711, + "grad_norm": 0.10888671875, + "learning_rate": 5.417563146791519e-05, + "lm_loss": 1.9002, + "loss": 2.0332, + "mask_loss": 0.1238, + "step": 1656, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6585891456500003, + "grad_norm": 0.11376953125, + "learning_rate": 5.4062389561993786e-05, + "lm_loss": 1.9347, + "loss": 2.0702, + "mask_loss": 0.1256, + "step": 1657, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6589866043981295, + "grad_norm": 0.1103515625, + "learning_rate": 5.3949222272463464e-05, + "lm_loss": 1.9164, + "loss": 2.0519, + "mask_loss": 0.1258, + "step": 1658, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6593840631462586, + "grad_norm": 0.1416015625, + "learning_rate": 5.383612978314164e-05, + "lm_loss": 1.8827, + "loss": 2.0163, + "mask_loss": 0.1246, + "step": 1659, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6597815218943878, + "grad_norm": 0.11474609375, + "learning_rate": 5.372311227772431e-05, + "lm_loss": 1.9462, + "loss": 2.0799, + "mask_loss": 0.1236, + "step": 1660, + "topk_loss": 0.0101 + }, + { + "epoch": 0.6601789806425169, + "grad_norm": 0.1201171875, + "learning_rate": 5.3610169939785615e-05, + "lm_loss": 1.9548, + "loss": 2.0921, + "mask_loss": 0.126, + "step": 1661, + "topk_loss": 0.0113 + }, + { + "epoch": 0.660576439390646, + "grad_norm": 0.11767578125, + "learning_rate": 5.349730295277764e-05, + "lm_loss": 1.8905, + "loss": 2.025, + "mask_loss": 0.1253, + "step": 1662, + "topk_loss": 0.0093 + }, + { + "epoch": 0.6609738981387752, + "grad_norm": 0.115234375, + "learning_rate": 5.338451150003008e-05, + "lm_loss": 1.868, + "loss": 2.0035, + "mask_loss": 0.1255, + "step": 1663, + "topk_loss": 0.01 + }, + { + "epoch": 0.6613713568869044, + "grad_norm": 0.1474609375, + "learning_rate": 5.3271795764749856e-05, + "lm_loss": 1.9277, + "loss": 2.0632, + "mask_loss": 0.1252, + "step": 1664, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6617688156350335, + "grad_norm": 0.11962890625, + "learning_rate": 5.3159155930021e-05, + "lm_loss": 1.9127, + "loss": 2.0476, + "mask_loss": 0.1254, + "step": 1665, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6621662743831627, + "grad_norm": 0.166015625, + "learning_rate": 5.304659217880423e-05, + "lm_loss": 1.905, + "loss": 2.0432, + "mask_loss": 0.128, + "step": 1666, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6625637331312918, + "grad_norm": 0.123046875, + "learning_rate": 5.293410469393667e-05, + "lm_loss": 1.8786, + "loss": 2.0161, + "mask_loss": 0.1277, + "step": 1667, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6629611918794209, + "grad_norm": 0.11181640625, + "learning_rate": 5.282169365813158e-05, + "lm_loss": 1.9189, + "loss": 2.0558, + "mask_loss": 0.1259, + "step": 1668, + "topk_loss": 0.011 + }, + { + "epoch": 0.6633586506275501, + "grad_norm": 0.11474609375, + "learning_rate": 5.270935925397802e-05, + "lm_loss": 1.9022, + "loss": 2.0397, + "mask_loss": 0.1267, + "step": 1669, + "topk_loss": 0.0108 + }, + { + "epoch": 0.6637561093756793, + "grad_norm": 0.1513671875, + "learning_rate": 5.259710166394062e-05, + "lm_loss": 1.886, + "loss": 2.0255, + "mask_loss": 0.1273, + "step": 1670, + "topk_loss": 0.0122 + }, + { + "epoch": 0.6641535681238084, + "grad_norm": 0.115234375, + "learning_rate": 5.2484921070359226e-05, + "lm_loss": 1.8727, + "loss": 2.0109, + "mask_loss": 0.1276, + "step": 1671, + "topk_loss": 0.0107 + }, + { + "epoch": 0.6645510268719376, + "grad_norm": 0.146484375, + "learning_rate": 5.237281765544852e-05, + "lm_loss": 1.8136, + "loss": 1.9479, + "mask_loss": 0.1251, + "step": 1672, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6649484856200667, + "grad_norm": 0.109375, + "learning_rate": 5.2260791601298e-05, + "lm_loss": 1.8949, + "loss": 2.0347, + "mask_loss": 0.1284, + "step": 1673, + "topk_loss": 0.0115 + }, + { + "epoch": 0.6653459443681958, + "grad_norm": 0.1162109375, + "learning_rate": 5.214884308987136e-05, + "lm_loss": 1.8865, + "loss": 2.0207, + "mask_loss": 0.1248, + "step": 1674, + "topk_loss": 0.0094 + }, + { + "epoch": 0.665743403116325, + "grad_norm": 0.130859375, + "learning_rate": 5.2036972303006426e-05, + "lm_loss": 1.8873, + "loss": 2.024, + "mask_loss": 0.1266, + "step": 1675, + "topk_loss": 0.01 + }, + { + "epoch": 0.6661408618644541, + "grad_norm": 0.130859375, + "learning_rate": 5.192517942241474e-05, + "lm_loss": 1.9327, + "loss": 2.0659, + "mask_loss": 0.1239, + "step": 1676, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6665383206125833, + "grad_norm": 0.11279296875, + "learning_rate": 5.181346462968131e-05, + "lm_loss": 1.8702, + "loss": 2.0044, + "mask_loss": 0.125, + "step": 1677, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6669357793607125, + "grad_norm": 0.1123046875, + "learning_rate": 5.1701828106264305e-05, + "lm_loss": 1.9382, + "loss": 2.0733, + "mask_loss": 0.1237, + "step": 1678, + "topk_loss": 0.0114 + }, + { + "epoch": 0.6673332381088416, + "grad_norm": 0.1279296875, + "learning_rate": 5.159027003349479e-05, + "lm_loss": 1.9099, + "loss": 2.0431, + "mask_loss": 0.1241, + "step": 1679, + "topk_loss": 0.0091 + }, + { + "epoch": 0.6677306968569707, + "grad_norm": 0.1455078125, + "learning_rate": 5.147879059257632e-05, + "lm_loss": 1.9072, + "loss": 2.0402, + "mask_loss": 0.1238, + "step": 1680, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6681281556050999, + "grad_norm": 0.13671875, + "learning_rate": 5.13673899645848e-05, + "lm_loss": 1.8836, + "loss": 2.0254, + "mask_loss": 0.1291, + "step": 1681, + "topk_loss": 0.0128 + }, + { + "epoch": 0.668525614353229, + "grad_norm": 0.10986328125, + "learning_rate": 5.12560683304681e-05, + "lm_loss": 1.9045, + "loss": 2.0388, + "mask_loss": 0.1247, + "step": 1682, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6689230731013582, + "grad_norm": 0.1171875, + "learning_rate": 5.1144825871045796e-05, + "lm_loss": 1.8755, + "loss": 2.0099, + "mask_loss": 0.1265, + "step": 1683, + "topk_loss": 0.0079 + }, + { + "epoch": 0.6693205318494874, + "grad_norm": 0.15234375, + "learning_rate": 5.103366276700884e-05, + "lm_loss": 1.9191, + "loss": 2.0561, + "mask_loss": 0.1273, + "step": 1684, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6697179905976165, + "grad_norm": 0.12060546875, + "learning_rate": 5.092257919891929e-05, + "lm_loss": 1.9203, + "loss": 2.0551, + "mask_loss": 0.1249, + "step": 1685, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6701154493457456, + "grad_norm": 0.11376953125, + "learning_rate": 5.081157534721002e-05, + "lm_loss": 1.8822, + "loss": 2.0158, + "mask_loss": 0.124, + "step": 1686, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6705129080938748, + "grad_norm": 0.1552734375, + "learning_rate": 5.070065139218443e-05, + "lm_loss": 1.8402, + "loss": 1.9812, + "mask_loss": 0.1281, + "step": 1687, + "topk_loss": 0.0129 + }, + { + "epoch": 0.6709103668420039, + "grad_norm": 0.1103515625, + "learning_rate": 5.05898075140161e-05, + "lm_loss": 1.9395, + "loss": 2.0754, + "mask_loss": 0.1265, + "step": 1688, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6713078255901331, + "grad_norm": 0.1328125, + "learning_rate": 5.04790438927486e-05, + "lm_loss": 1.8916, + "loss": 2.0311, + "mask_loss": 0.1285, + "step": 1689, + "topk_loss": 0.011 + }, + { + "epoch": 0.6717052843382623, + "grad_norm": 0.11767578125, + "learning_rate": 5.036836070829512e-05, + "lm_loss": 1.8803, + "loss": 2.0128, + "mask_loss": 0.1232, + "step": 1690, + "topk_loss": 0.0093 + }, + { + "epoch": 0.6721027430863914, + "grad_norm": 0.1162109375, + "learning_rate": 5.025775814043816e-05, + "lm_loss": 1.883, + "loss": 2.0221, + "mask_loss": 0.1281, + "step": 1691, + "topk_loss": 0.011 + }, + { + "epoch": 0.6725002018345205, + "grad_norm": 0.12158203125, + "learning_rate": 5.014723636882932e-05, + "lm_loss": 1.9106, + "loss": 2.0437, + "mask_loss": 0.1237, + "step": 1692, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6728976605826497, + "grad_norm": 0.12890625, + "learning_rate": 5.003679557298896e-05, + "lm_loss": 1.8627, + "loss": 2.0023, + "mask_loss": 0.1283, + "step": 1693, + "topk_loss": 0.0113 + }, + { + "epoch": 0.6732951193307788, + "grad_norm": 0.125, + "learning_rate": 4.992643593230587e-05, + "lm_loss": 1.9231, + "loss": 2.0596, + "mask_loss": 0.1272, + "step": 1694, + "topk_loss": 0.0093 + }, + { + "epoch": 0.673692578078908, + "grad_norm": 0.1318359375, + "learning_rate": 4.98161576260371e-05, + "lm_loss": 1.9269, + "loss": 2.0619, + "mask_loss": 0.1244, + "step": 1695, + "topk_loss": 0.0106 + }, + { + "epoch": 0.6740900368270372, + "grad_norm": 0.1103515625, + "learning_rate": 4.9705960833307455e-05, + "lm_loss": 1.9213, + "loss": 2.0574, + "mask_loss": 0.1252, + "step": 1696, + "topk_loss": 0.0109 + }, + { + "epoch": 0.6744874955751663, + "grad_norm": 0.11962890625, + "learning_rate": 4.9595845733109455e-05, + "lm_loss": 1.9245, + "loss": 2.0593, + "mask_loss": 0.1258, + "step": 1697, + "topk_loss": 0.009 + }, + { + "epoch": 0.6748849543232954, + "grad_norm": 0.109375, + "learning_rate": 4.948581250430291e-05, + "lm_loss": 1.9287, + "loss": 2.0645, + "mask_loss": 0.1248, + "step": 1698, + "topk_loss": 0.011 + }, + { + "epoch": 0.6752824130714246, + "grad_norm": 0.1279296875, + "learning_rate": 4.9375861325614606e-05, + "lm_loss": 1.951, + "loss": 2.0861, + "mask_loss": 0.1255, + "step": 1699, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6756798718195537, + "grad_norm": 0.140625, + "learning_rate": 4.926599237563807e-05, + "lm_loss": 1.9685, + "loss": 2.1018, + "mask_loss": 0.1235, + "step": 1700, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6756798718195537, + "eval_lm_loss": 687.1553955078125, + "eval_loss": 687.2926025390625, + "eval_mask_hit_rate": 0.5353307723999023, + "eval_mask_loss": 0.1246270090341568, + "eval_mask_top_10_hit_rate": 0.985496997833252, + "eval_mask_top_1_hit_rate": 0.9974105358123779, + "eval_mask_top_20_hit_rate": 0.9760777950286865, + "eval_mask_top_5_hit_rate": 0.9908045530319214, + "eval_runtime": 144.3717, + "eval_samples_per_second": 14.186, + "eval_steps_per_second": 7.093, + "eval_token_accuracy": 0.6143229007720947, + "eval_top_k_diff": -527.6005859375, + "eval_topk_loss": 0.012607835233211517, + "step": 1700 + }, + { + "epoch": 0.6760773305676829, + "grad_norm": 0.12158203125, + "learning_rate": 4.915620583283329e-05, + "lm_loss": 1.8965, + "loss": 2.0348, + "mask_loss": 0.1282, + "step": 1701, + "topk_loss": 0.01 + }, + { + "epoch": 0.6764747893158121, + "grad_norm": 0.1494140625, + "learning_rate": 4.904650187552637e-05, + "lm_loss": 1.866, + "loss": 2.0022, + "mask_loss": 0.1255, + "step": 1702, + "topk_loss": 0.0106 + }, + { + "epoch": 0.6768722480639412, + "grad_norm": 0.11083984375, + "learning_rate": 4.893688068190932e-05, + "lm_loss": 1.8394, + "loss": 1.9773, + "mask_loss": 0.1282, + "step": 1703, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6772697068120703, + "grad_norm": 0.1337890625, + "learning_rate": 4.8827342430039624e-05, + "lm_loss": 1.8682, + "loss": 2.0079, + "mask_loss": 0.1286, + "step": 1704, + "topk_loss": 0.0111 + }, + { + "epoch": 0.6776671655601995, + "grad_norm": 0.1259765625, + "learning_rate": 4.87178872978401e-05, + "lm_loss": 1.8977, + "loss": 2.0312, + "mask_loss": 0.1247, + "step": 1705, + "topk_loss": 0.0088 + }, + { + "epoch": 0.6780646243083286, + "grad_norm": 0.138671875, + "learning_rate": 4.860851546309858e-05, + "lm_loss": 1.8222, + "loss": 1.96, + "mask_loss": 0.1259, + "step": 1706, + "topk_loss": 0.0119 + }, + { + "epoch": 0.6784620830564577, + "grad_norm": 0.138671875, + "learning_rate": 4.8499227103467574e-05, + "lm_loss": 1.8512, + "loss": 1.9842, + "mask_loss": 0.1239, + "step": 1707, + "topk_loss": 0.0091 + }, + { + "epoch": 0.678859541804587, + "grad_norm": 0.12353515625, + "learning_rate": 4.8390022396463965e-05, + "lm_loss": 1.8797, + "loss": 2.0186, + "mask_loss": 0.1273, + "step": 1708, + "topk_loss": 0.0116 + }, + { + "epoch": 0.6792570005527161, + "grad_norm": 0.1357421875, + "learning_rate": 4.828090151946882e-05, + "lm_loss": 1.9118, + "loss": 2.0482, + "mask_loss": 0.1246, + "step": 1709, + "topk_loss": 0.0118 + }, + { + "epoch": 0.6796544593008452, + "grad_norm": 0.130859375, + "learning_rate": 4.817186464972702e-05, + "lm_loss": 1.958, + "loss": 2.093, + "mask_loss": 0.1255, + "step": 1710, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6800519180489744, + "grad_norm": 0.1748046875, + "learning_rate": 4.8062911964347004e-05, + "lm_loss": 1.9112, + "loss": 2.0502, + "mask_loss": 0.1251, + "step": 1711, + "topk_loss": 0.0139 + }, + { + "epoch": 0.6804493767971035, + "grad_norm": 0.1494140625, + "learning_rate": 4.7954043640300394e-05, + "lm_loss": 1.8771, + "loss": 2.0308, + "mask_loss": 0.1357, + "step": 1712, + "topk_loss": 0.018 + }, + { + "epoch": 0.6808468355452326, + "grad_norm": 0.1513671875, + "learning_rate": 4.7845259854421875e-05, + "lm_loss": 1.9122, + "loss": 2.0563, + "mask_loss": 0.1302, + "step": 1713, + "topk_loss": 0.0139 + }, + { + "epoch": 0.6812442942933619, + "grad_norm": 0.11669921875, + "learning_rate": 4.773656078340879e-05, + "lm_loss": 1.9049, + "loss": 2.0427, + "mask_loss": 0.1274, + "step": 1714, + "topk_loss": 0.0104 + }, + { + "epoch": 0.681641753041491, + "grad_norm": 0.11962890625, + "learning_rate": 4.762794660382086e-05, + "lm_loss": 1.9272, + "loss": 2.0626, + "mask_loss": 0.1263, + "step": 1715, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6820392117896201, + "grad_norm": 0.1259765625, + "learning_rate": 4.751941749207995e-05, + "lm_loss": 1.9171, + "loss": 2.0481, + "mask_loss": 0.123, + "step": 1716, + "topk_loss": 0.0079 + }, + { + "epoch": 0.6824366705377493, + "grad_norm": 0.1513671875, + "learning_rate": 4.741097362446973e-05, + "lm_loss": 1.9016, + "loss": 2.0418, + "mask_loss": 0.1281, + "step": 1717, + "topk_loss": 0.0122 + }, + { + "epoch": 0.6828341292858784, + "grad_norm": 0.1416015625, + "learning_rate": 4.730261517713541e-05, + "lm_loss": 1.8347, + "loss": 1.9794, + "mask_loss": 0.1306, + "step": 1718, + "topk_loss": 0.0141 + }, + { + "epoch": 0.6832315880340075, + "grad_norm": 0.1533203125, + "learning_rate": 4.719434232608345e-05, + "lm_loss": 1.8892, + "loss": 2.024, + "mask_loss": 0.1244, + "step": 1719, + "topk_loss": 0.0104 + }, + { + "epoch": 0.6836290467821368, + "grad_norm": 0.1328125, + "learning_rate": 4.708615524718128e-05, + "lm_loss": 1.9153, + "loss": 2.0522, + "mask_loss": 0.1255, + "step": 1720, + "topk_loss": 0.0114 + }, + { + "epoch": 0.6840265055302659, + "grad_norm": 0.1259765625, + "learning_rate": 4.6978054116156987e-05, + "lm_loss": 1.9998, + "loss": 2.1383, + "mask_loss": 0.1255, + "step": 1721, + "topk_loss": 0.013 + }, + { + "epoch": 0.684423964278395, + "grad_norm": 0.1123046875, + "learning_rate": 4.687003910859911e-05, + "lm_loss": 1.9427, + "loss": 2.0796, + "mask_loss": 0.127, + "step": 1722, + "topk_loss": 0.0099 + }, + { + "epoch": 0.6848214230265242, + "grad_norm": 0.1240234375, + "learning_rate": 4.676211039995623e-05, + "lm_loss": 1.893, + "loss": 2.027, + "mask_loss": 0.1244, + "step": 1723, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6852188817746533, + "grad_norm": 0.13671875, + "learning_rate": 4.6654268165536805e-05, + "lm_loss": 1.8517, + "loss": 1.9876, + "mask_loss": 0.1266, + "step": 1724, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6856163405227824, + "grad_norm": 0.126953125, + "learning_rate": 4.6546512580508804e-05, + "lm_loss": 1.9012, + "loss": 2.0353, + "mask_loss": 0.1243, + "step": 1725, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6860137992709117, + "grad_norm": 0.12109375, + "learning_rate": 4.643884381989947e-05, + "lm_loss": 1.8417, + "loss": 1.9781, + "mask_loss": 0.1267, + "step": 1726, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6864112580190408, + "grad_norm": 0.1240234375, + "learning_rate": 4.633126205859504e-05, + "lm_loss": 1.9173, + "loss": 2.0508, + "mask_loss": 0.124, + "step": 1727, + "topk_loss": 0.0095 + }, + { + "epoch": 0.6868087167671699, + "grad_norm": 0.115234375, + "learning_rate": 4.6223767471340326e-05, + "lm_loss": 1.8859, + "loss": 2.0179, + "mask_loss": 0.1228, + "step": 1728, + "topk_loss": 0.0092 + }, + { + "epoch": 0.6872061755152991, + "grad_norm": 0.11279296875, + "learning_rate": 4.6116360232738675e-05, + "lm_loss": 1.9024, + "loss": 2.0398, + "mask_loss": 0.1267, + "step": 1729, + "topk_loss": 0.0107 + }, + { + "epoch": 0.6876036342634282, + "grad_norm": 0.1142578125, + "learning_rate": 4.600904051725148e-05, + "lm_loss": 1.9572, + "loss": 2.0923, + "mask_loss": 0.1261, + "step": 1730, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6880010930115573, + "grad_norm": 0.21875, + "learning_rate": 4.5901808499198004e-05, + "lm_loss": 1.8948, + "loss": 2.0469, + "mask_loss": 0.1327, + "step": 1731, + "topk_loss": 0.0194 + }, + { + "epoch": 0.6883985517596865, + "grad_norm": 0.11572265625, + "learning_rate": 4.5794664352755055e-05, + "lm_loss": 1.9081, + "loss": 2.0427, + "mask_loss": 0.1249, + "step": 1732, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6887960105078157, + "grad_norm": 0.1259765625, + "learning_rate": 4.5687608251956714e-05, + "lm_loss": 1.8729, + "loss": 2.0116, + "mask_loss": 0.1266, + "step": 1733, + "topk_loss": 0.0122 + }, + { + "epoch": 0.6891934692559448, + "grad_norm": 0.1494140625, + "learning_rate": 4.5580640370694027e-05, + "lm_loss": 1.9346, + "loss": 2.0687, + "mask_loss": 0.1247, + "step": 1734, + "topk_loss": 0.0094 + }, + { + "epoch": 0.689590928004074, + "grad_norm": 0.11474609375, + "learning_rate": 4.54737608827148e-05, + "lm_loss": 1.8316, + "loss": 1.9689, + "mask_loss": 0.1269, + "step": 1735, + "topk_loss": 0.0104 + }, + { + "epoch": 0.6899883867522031, + "grad_norm": 0.11767578125, + "learning_rate": 4.5366969961623166e-05, + "lm_loss": 1.9245, + "loss": 2.0613, + "mask_loss": 0.1263, + "step": 1736, + "topk_loss": 0.0105 + }, + { + "epoch": 0.6903858455003322, + "grad_norm": 0.11328125, + "learning_rate": 4.526026778087947e-05, + "lm_loss": 1.8769, + "loss": 2.0139, + "mask_loss": 0.1277, + "step": 1737, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6907833042484613, + "grad_norm": 0.1279296875, + "learning_rate": 4.515365451379993e-05, + "lm_loss": 1.9314, + "loss": 2.0678, + "mask_loss": 0.1262, + "step": 1738, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6911807629965906, + "grad_norm": 0.1123046875, + "learning_rate": 4.504713033355629e-05, + "lm_loss": 1.9246, + "loss": 2.0592, + "mask_loss": 0.1249, + "step": 1739, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6915782217447197, + "grad_norm": 0.1376953125, + "learning_rate": 4.4940695413175626e-05, + "lm_loss": 1.9354, + "loss": 2.0684, + "mask_loss": 0.1242, + "step": 1740, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6919756804928489, + "grad_norm": 0.1103515625, + "learning_rate": 4.483434992554001e-05, + "lm_loss": 1.8529, + "loss": 1.9866, + "mask_loss": 0.1249, + "step": 1741, + "topk_loss": 0.0088 + }, + { + "epoch": 0.692373139240978, + "grad_norm": 0.12451171875, + "learning_rate": 4.472809404338627e-05, + "lm_loss": 1.9255, + "loss": 2.0644, + "mask_loss": 0.1275, + "step": 1742, + "topk_loss": 0.0113 + }, + { + "epoch": 0.6927705979891071, + "grad_norm": 0.12158203125, + "learning_rate": 4.4621927939305695e-05, + "lm_loss": 1.8837, + "loss": 2.0211, + "mask_loss": 0.1272, + "step": 1743, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6931680567372362, + "grad_norm": 0.11376953125, + "learning_rate": 4.451585178574368e-05, + "lm_loss": 1.8989, + "loss": 2.032, + "mask_loss": 0.1237, + "step": 1744, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6935655154853655, + "grad_norm": 0.1083984375, + "learning_rate": 4.440986575499956e-05, + "lm_loss": 1.883, + "loss": 2.0159, + "mask_loss": 0.1233, + "step": 1745, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6939629742334946, + "grad_norm": 0.1875, + "learning_rate": 4.430397001922631e-05, + "lm_loss": 1.8949, + "loss": 2.0337, + "mask_loss": 0.126, + "step": 1746, + "topk_loss": 0.0127 + }, + { + "epoch": 0.6943604329816238, + "grad_norm": 0.1796875, + "learning_rate": 4.4198164750430217e-05, + "lm_loss": 1.9357, + "loss": 2.0683, + "mask_loss": 0.1232, + "step": 1747, + "topk_loss": 0.0094 + }, + { + "epoch": 0.6947578917297529, + "grad_norm": 0.1279296875, + "learning_rate": 4.40924501204706e-05, + "lm_loss": 1.8786, + "loss": 2.0171, + "mask_loss": 0.1264, + "step": 1748, + "topk_loss": 0.0121 + }, + { + "epoch": 0.695155350477882, + "grad_norm": 0.1328125, + "learning_rate": 4.398682630105958e-05, + "lm_loss": 1.9416, + "loss": 2.0772, + "mask_loss": 0.1267, + "step": 1749, + "topk_loss": 0.009 + }, + { + "epoch": 0.6955528092260111, + "grad_norm": 0.1279296875, + "learning_rate": 4.388129346376178e-05, + "lm_loss": 1.9031, + "loss": 2.0414, + "mask_loss": 0.1276, + "step": 1750, + "topk_loss": 0.0107 + }, + { + "epoch": 0.6955528092260111, + "eval_lm_loss": 687.1192626953125, + "eval_loss": 687.2565307617188, + "eval_mask_hit_rate": 0.5354921817779541, + "eval_mask_loss": 0.12459547817707062, + "eval_mask_top_10_hit_rate": 0.9855087995529175, + "eval_mask_top_1_hit_rate": 0.9974174499511719, + "eval_mask_top_20_hit_rate": 0.9761062264442444, + "eval_mask_top_5_hit_rate": 0.990810751914978, + "eval_runtime": 144.4662, + "eval_samples_per_second": 14.176, + "eval_steps_per_second": 7.088, + "eval_token_accuracy": 0.6143444180488586, + "eval_top_k_diff": -527.2138671875, + "eval_topk_loss": 0.012662166729569435, + "step": 1750 + }, + { + "epoch": 0.6959502679741404, + "grad_norm": 0.1201171875, + "learning_rate": 4.377585177999404e-05, + "lm_loss": 1.9144, + "loss": 2.051, + "mask_loss": 0.1258, + "step": 1751, + "topk_loss": 0.0108 + }, + { + "epoch": 0.6963477267222695, + "grad_norm": 0.10986328125, + "learning_rate": 4.367050142102507e-05, + "lm_loss": 1.9316, + "loss": 2.0626, + "mask_loss": 0.1224, + "step": 1752, + "topk_loss": 0.0086 + }, + { + "epoch": 0.6967451854703987, + "grad_norm": 0.1142578125, + "learning_rate": 4.3565242557975326e-05, + "lm_loss": 1.9602, + "loss": 2.0933, + "mask_loss": 0.1231, + "step": 1753, + "topk_loss": 0.01 + }, + { + "epoch": 0.6971426442185278, + "grad_norm": 0.1220703125, + "learning_rate": 4.3460075361816635e-05, + "lm_loss": 1.9415, + "loss": 2.0767, + "mask_loss": 0.124, + "step": 1754, + "topk_loss": 0.0112 + }, + { + "epoch": 0.6975401029666569, + "grad_norm": 0.11865234375, + "learning_rate": 4.335500000337189e-05, + "lm_loss": 1.9125, + "loss": 2.0484, + "mask_loss": 0.1261, + "step": 1755, + "topk_loss": 0.0098 + }, + { + "epoch": 0.697937561714786, + "grad_norm": 0.1103515625, + "learning_rate": 4.3250016653314864e-05, + "lm_loss": 1.8968, + "loss": 2.0319, + "mask_loss": 0.1254, + "step": 1756, + "topk_loss": 0.0097 + }, + { + "epoch": 0.6983350204629153, + "grad_norm": 0.177734375, + "learning_rate": 4.314512548216985e-05, + "lm_loss": 1.8136, + "loss": 1.9476, + "mask_loss": 0.1251, + "step": 1757, + "topk_loss": 0.0089 + }, + { + "epoch": 0.6987324792110444, + "grad_norm": 0.1201171875, + "learning_rate": 4.304032666031139e-05, + "lm_loss": 1.9155, + "loss": 2.0515, + "mask_loss": 0.1257, + "step": 1758, + "topk_loss": 0.0102 + }, + { + "epoch": 0.6991299379591736, + "grad_norm": 0.1279296875, + "learning_rate": 4.2935620357964076e-05, + "lm_loss": 1.8805, + "loss": 2.0169, + "mask_loss": 0.1266, + "step": 1759, + "topk_loss": 0.0098 + }, + { + "epoch": 0.6995273967073027, + "grad_norm": 0.130859375, + "learning_rate": 4.283100674520219e-05, + "lm_loss": 1.903, + "loss": 2.0378, + "mask_loss": 0.1252, + "step": 1760, + "topk_loss": 0.0096 + }, + { + "epoch": 0.6999248554554318, + "grad_norm": 0.1171875, + "learning_rate": 4.272648599194948e-05, + "lm_loss": 1.9202, + "loss": 2.0537, + "mask_loss": 0.1248, + "step": 1761, + "topk_loss": 0.0087 + }, + { + "epoch": 0.700322314203561, + "grad_norm": 0.130859375, + "learning_rate": 4.262205826797883e-05, + "lm_loss": 1.9085, + "loss": 2.0459, + "mask_loss": 0.1252, + "step": 1762, + "topk_loss": 0.0122 + }, + { + "epoch": 0.7007197729516901, + "grad_norm": 0.115234375, + "learning_rate": 4.251772374291203e-05, + "lm_loss": 1.8532, + "loss": 1.9879, + "mask_loss": 0.1254, + "step": 1763, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7011172316998193, + "grad_norm": 0.12158203125, + "learning_rate": 4.24134825862195e-05, + "lm_loss": 1.9942, + "loss": 2.1308, + "mask_loss": 0.1265, + "step": 1764, + "topk_loss": 0.0101 + }, + { + "epoch": 0.7015146904479485, + "grad_norm": 0.12158203125, + "learning_rate": 4.2309334967219995e-05, + "lm_loss": 1.8729, + "loss": 2.0136, + "mask_loss": 0.1285, + "step": 1765, + "topk_loss": 0.0122 + }, + { + "epoch": 0.7019121491960776, + "grad_norm": 0.1279296875, + "learning_rate": 4.2205281055080325e-05, + "lm_loss": 1.9088, + "loss": 2.043, + "mask_loss": 0.1245, + "step": 1766, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7023096079442067, + "grad_norm": 0.11279296875, + "learning_rate": 4.210132101881516e-05, + "lm_loss": 1.869, + "loss": 2.0075, + "mask_loss": 0.1285, + "step": 1767, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7027070666923358, + "grad_norm": 0.11279296875, + "learning_rate": 4.1997455027286525e-05, + "lm_loss": 1.8832, + "loss": 2.0195, + "mask_loss": 0.1252, + "step": 1768, + "topk_loss": 0.011 + }, + { + "epoch": 0.703104525440465, + "grad_norm": 0.11279296875, + "learning_rate": 4.189368324920385e-05, + "lm_loss": 1.8982, + "loss": 2.0335, + "mask_loss": 0.1255, + "step": 1769, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7035019841885942, + "grad_norm": 0.1123046875, + "learning_rate": 4.179000585312347e-05, + "lm_loss": 1.8529, + "loss": 1.9909, + "mask_loss": 0.1276, + "step": 1770, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7038994429367234, + "grad_norm": 0.1318359375, + "learning_rate": 4.1686423007448426e-05, + "lm_loss": 1.9087, + "loss": 2.0481, + "mask_loss": 0.1284, + "step": 1771, + "topk_loss": 0.011 + }, + { + "epoch": 0.7042969016848525, + "grad_norm": 0.1181640625, + "learning_rate": 4.158293488042818e-05, + "lm_loss": 1.8858, + "loss": 2.0233, + "mask_loss": 0.1268, + "step": 1772, + "topk_loss": 0.0107 + }, + { + "epoch": 0.7046943604329816, + "grad_norm": 0.11279296875, + "learning_rate": 4.147954164015832e-05, + "lm_loss": 1.9356, + "loss": 2.0718, + "mask_loss": 0.1262, + "step": 1773, + "topk_loss": 0.01 + }, + { + "epoch": 0.7050918191811107, + "grad_norm": 0.11474609375, + "learning_rate": 4.1376243454580366e-05, + "lm_loss": 1.8602, + "loss": 1.9965, + "mask_loss": 0.126, + "step": 1774, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7054892779292399, + "grad_norm": 0.11083984375, + "learning_rate": 4.127304049148142e-05, + "lm_loss": 1.9244, + "loss": 2.0568, + "mask_loss": 0.1232, + "step": 1775, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7058867366773691, + "grad_norm": 0.12451171875, + "learning_rate": 4.116993291849381e-05, + "lm_loss": 1.9718, + "loss": 2.106, + "mask_loss": 0.125, + "step": 1776, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7062841954254983, + "grad_norm": 0.1181640625, + "learning_rate": 4.1066920903095076e-05, + "lm_loss": 1.9779, + "loss": 2.1117, + "mask_loss": 0.1247, + "step": 1777, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7066816541736274, + "grad_norm": 0.1181640625, + "learning_rate": 4.0964004612607465e-05, + "lm_loss": 1.9201, + "loss": 2.0557, + "mask_loss": 0.1251, + "step": 1778, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7070791129217565, + "grad_norm": 0.11767578125, + "learning_rate": 4.086118421419774e-05, + "lm_loss": 1.929, + "loss": 2.0673, + "mask_loss": 0.1269, + "step": 1779, + "topk_loss": 0.0113 + }, + { + "epoch": 0.7074765716698856, + "grad_norm": 0.109375, + "learning_rate": 4.0758459874876954e-05, + "lm_loss": 1.9014, + "loss": 2.0363, + "mask_loss": 0.1256, + "step": 1780, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7078740304180148, + "grad_norm": 0.1279296875, + "learning_rate": 4.065583176150005e-05, + "lm_loss": 1.936, + "loss": 2.0691, + "mask_loss": 0.1232, + "step": 1781, + "topk_loss": 0.0099 + }, + { + "epoch": 0.708271489166144, + "grad_norm": 0.11572265625, + "learning_rate": 4.0553300040765755e-05, + "lm_loss": 1.9247, + "loss": 2.059, + "mask_loss": 0.1245, + "step": 1782, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7086689479142732, + "grad_norm": 0.11181640625, + "learning_rate": 4.04508648792162e-05, + "lm_loss": 1.8667, + "loss": 2.0013, + "mask_loss": 0.125, + "step": 1783, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7090664066624023, + "grad_norm": 0.1328125, + "learning_rate": 4.034852644323661e-05, + "lm_loss": 1.8723, + "loss": 2.0095, + "mask_loss": 0.1266, + "step": 1784, + "topk_loss": 0.0106 + }, + { + "epoch": 0.7094638654105314, + "grad_norm": 0.126953125, + "learning_rate": 4.024628489905517e-05, + "lm_loss": 1.8956, + "loss": 2.0314, + "mask_loss": 0.1262, + "step": 1785, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7098613241586605, + "grad_norm": 0.1171875, + "learning_rate": 4.014414041274267e-05, + "lm_loss": 1.9571, + "loss": 2.0909, + "mask_loss": 0.1242, + "step": 1786, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7102587829067897, + "grad_norm": 0.11865234375, + "learning_rate": 4.004209315021225e-05, + "lm_loss": 1.8871, + "loss": 2.0244, + "mask_loss": 0.1273, + "step": 1787, + "topk_loss": 0.01 + }, + { + "epoch": 0.7106562416549188, + "grad_norm": 0.111328125, + "learning_rate": 3.994014327721912e-05, + "lm_loss": 1.9109, + "loss": 2.0447, + "mask_loss": 0.1248, + "step": 1788, + "topk_loss": 0.009 + }, + { + "epoch": 0.711053700403048, + "grad_norm": 0.1162109375, + "learning_rate": 3.9838290959360304e-05, + "lm_loss": 1.8853, + "loss": 2.0224, + "mask_loss": 0.1262, + "step": 1789, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7114511591511772, + "grad_norm": 0.126953125, + "learning_rate": 3.973653636207437e-05, + "lm_loss": 1.8864, + "loss": 2.0219, + "mask_loss": 0.1268, + "step": 1790, + "topk_loss": 0.0087 + }, + { + "epoch": 0.7118486178993063, + "grad_norm": 0.11083984375, + "learning_rate": 3.9634879650641153e-05, + "lm_loss": 1.8685, + "loss": 2.0048, + "mask_loss": 0.1272, + "step": 1791, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7122460766474354, + "grad_norm": 0.1513671875, + "learning_rate": 3.953332099018151e-05, + "lm_loss": 1.8939, + "loss": 2.0353, + "mask_loss": 0.1287, + "step": 1792, + "topk_loss": 0.0128 + }, + { + "epoch": 0.7126435353955646, + "grad_norm": 0.126953125, + "learning_rate": 3.943186054565699e-05, + "lm_loss": 1.9455, + "loss": 2.0885, + "mask_loss": 0.1282, + "step": 1793, + "topk_loss": 0.0147 + }, + { + "epoch": 0.7130409941436937, + "grad_norm": 0.1162109375, + "learning_rate": 3.933049848186967e-05, + "lm_loss": 1.9081, + "loss": 2.0451, + "mask_loss": 0.126, + "step": 1794, + "topk_loss": 0.011 + }, + { + "epoch": 0.713438452891823, + "grad_norm": 0.14453125, + "learning_rate": 3.9229234963461766e-05, + "lm_loss": 1.8904, + "loss": 2.0273, + "mask_loss": 0.1267, + "step": 1795, + "topk_loss": 0.0102 + }, + { + "epoch": 0.7138359116399521, + "grad_norm": 0.1376953125, + "learning_rate": 3.9128070154915496e-05, + "lm_loss": 1.9129, + "loss": 2.0478, + "mask_loss": 0.124, + "step": 1796, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7142333703880812, + "grad_norm": 0.13671875, + "learning_rate": 3.902700422055266e-05, + "lm_loss": 1.8739, + "loss": 2.0057, + "mask_loss": 0.1231, + "step": 1797, + "topk_loss": 0.0088 + }, + { + "epoch": 0.7146308291362103, + "grad_norm": 0.24609375, + "learning_rate": 3.892603732453455e-05, + "lm_loss": 1.9116, + "loss": 2.0648, + "mask_loss": 0.1329, + "step": 1798, + "topk_loss": 0.0204 + }, + { + "epoch": 0.7150282878843395, + "grad_norm": 0.11474609375, + "learning_rate": 3.882516963086154e-05, + "lm_loss": 1.911, + "loss": 2.0459, + "mask_loss": 0.1254, + "step": 1799, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7154257466324686, + "grad_norm": 0.12890625, + "learning_rate": 3.872440130337281e-05, + "lm_loss": 1.8986, + "loss": 2.0375, + "mask_loss": 0.1264, + "step": 1800, + "topk_loss": 0.0125 + }, + { + "epoch": 0.7154257466324686, + "eval_lm_loss": 687.646728515625, + "eval_loss": 687.7838745117188, + "eval_mask_hit_rate": 0.5356104969978333, + "eval_mask_loss": 0.12455646693706512, + "eval_mask_top_10_hit_rate": 0.9855278730392456, + "eval_mask_top_1_hit_rate": 0.9974215030670166, + "eval_mask_top_20_hit_rate": 0.9761320948600769, + "eval_mask_top_5_hit_rate": 0.9908208847045898, + "eval_runtime": 144.5288, + "eval_samples_per_second": 14.17, + "eval_steps_per_second": 7.085, + "eval_token_accuracy": 0.6144205331802368, + "eval_top_k_diff": -531.4647216796875, + "eval_topk_loss": 0.012550951912999153, + "step": 1800 + }, + { + "epoch": 0.7158232053805978, + "grad_norm": 0.1103515625, + "learning_rate": 3.862373250574626e-05, + "lm_loss": 1.8972, + "loss": 2.0315, + "mask_loss": 0.1249, + "step": 1801, + "topk_loss": 0.0094 + }, + { + "epoch": 0.716220664128727, + "grad_norm": 0.12255859375, + "learning_rate": 3.852316340149803e-05, + "lm_loss": 1.9338, + "loss": 2.0749, + "mask_loss": 0.1287, + "step": 1802, + "topk_loss": 0.0125 + }, + { + "epoch": 0.7166181228768561, + "grad_norm": 0.1201171875, + "learning_rate": 3.842269415398239e-05, + "lm_loss": 1.9396, + "loss": 2.0795, + "mask_loss": 0.1284, + "step": 1803, + "topk_loss": 0.0115 + }, + { + "epoch": 0.7170155816249852, + "grad_norm": 0.1240234375, + "learning_rate": 3.832232492639137e-05, + "lm_loss": 1.9292, + "loss": 2.0664, + "mask_loss": 0.1268, + "step": 1804, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7174130403731144, + "grad_norm": 0.11376953125, + "learning_rate": 3.822205588175457e-05, + "lm_loss": 1.8633, + "loss": 2.0003, + "mask_loss": 0.1259, + "step": 1805, + "topk_loss": 0.0111 + }, + { + "epoch": 0.7178104991212435, + "grad_norm": 0.134765625, + "learning_rate": 3.8121887182938845e-05, + "lm_loss": 1.8854, + "loss": 2.0247, + "mask_loss": 0.1274, + "step": 1806, + "topk_loss": 0.0119 + }, + { + "epoch": 0.7182079578693727, + "grad_norm": 0.1162109375, + "learning_rate": 3.802181899264809e-05, + "lm_loss": 1.887, + "loss": 2.0228, + "mask_loss": 0.1258, + "step": 1807, + "topk_loss": 0.01 + }, + { + "epoch": 0.7186054166175019, + "grad_norm": 0.11767578125, + "learning_rate": 3.7921851473422834e-05, + "lm_loss": 1.9839, + "loss": 2.1216, + "mask_loss": 0.1265, + "step": 1808, + "topk_loss": 0.0112 + }, + { + "epoch": 0.719002875365631, + "grad_norm": 0.162109375, + "learning_rate": 3.782198478764021e-05, + "lm_loss": 1.8774, + "loss": 2.0124, + "mask_loss": 0.1252, + "step": 1809, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7194003341137601, + "grad_norm": 0.11376953125, + "learning_rate": 3.772221909751353e-05, + "lm_loss": 1.8666, + "loss": 2.0032, + "mask_loss": 0.127, + "step": 1810, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7197977928618893, + "grad_norm": 0.166015625, + "learning_rate": 3.762255456509206e-05, + "lm_loss": 1.8814, + "loss": 2.0285, + "mask_loss": 0.1326, + "step": 1811, + "topk_loss": 0.0146 + }, + { + "epoch": 0.7201952516100184, + "grad_norm": 0.125, + "learning_rate": 3.752299135226074e-05, + "lm_loss": 1.9509, + "loss": 2.0859, + "mask_loss": 0.1252, + "step": 1812, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7205927103581476, + "grad_norm": 0.1240234375, + "learning_rate": 3.742352962073995e-05, + "lm_loss": 1.9578, + "loss": 2.097, + "mask_loss": 0.127, + "step": 1813, + "topk_loss": 0.0121 + }, + { + "epoch": 0.7209901691062768, + "grad_norm": 0.115234375, + "learning_rate": 3.732416953208522e-05, + "lm_loss": 1.8663, + "loss": 2.0015, + "mask_loss": 0.1259, + "step": 1814, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7213876278544059, + "grad_norm": 0.11767578125, + "learning_rate": 3.722491124768702e-05, + "lm_loss": 1.9073, + "loss": 2.0426, + "mask_loss": 0.1268, + "step": 1815, + "topk_loss": 0.0084 + }, + { + "epoch": 0.721785086602535, + "grad_norm": 0.12353515625, + "learning_rate": 3.71257549287704e-05, + "lm_loss": 1.9045, + "loss": 2.0425, + "mask_loss": 0.1269, + "step": 1816, + "topk_loss": 0.0111 + }, + { + "epoch": 0.7221825453506642, + "grad_norm": 0.11181640625, + "learning_rate": 3.70267007363948e-05, + "lm_loss": 1.8537, + "loss": 1.9912, + "mask_loss": 0.1267, + "step": 1817, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7225800040987933, + "grad_norm": 0.11279296875, + "learning_rate": 3.6927748831453836e-05, + "lm_loss": 1.831, + "loss": 1.9661, + "mask_loss": 0.125, + "step": 1818, + "topk_loss": 0.0101 + }, + { + "epoch": 0.7229774628469224, + "grad_norm": 0.119140625, + "learning_rate": 3.682889937467493e-05, + "lm_loss": 1.8503, + "loss": 1.9856, + "mask_loss": 0.1254, + "step": 1819, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7233749215950517, + "grad_norm": 0.1923828125, + "learning_rate": 3.673015252661909e-05, + "lm_loss": 1.859, + "loss": 1.993, + "mask_loss": 0.1248, + "step": 1820, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7237723803431808, + "grad_norm": 0.134765625, + "learning_rate": 3.6631508447680675e-05, + "lm_loss": 1.8928, + "loss": 2.0283, + "mask_loss": 0.1249, + "step": 1821, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7241698390913099, + "grad_norm": 0.1123046875, + "learning_rate": 3.653296729808712e-05, + "lm_loss": 1.9698, + "loss": 2.1035, + "mask_loss": 0.1236, + "step": 1822, + "topk_loss": 0.0101 + }, + { + "epoch": 0.7245672978394391, + "grad_norm": 0.1103515625, + "learning_rate": 3.643452923789866e-05, + "lm_loss": 1.8551, + "loss": 1.9914, + "mask_loss": 0.1265, + "step": 1823, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7249647565875682, + "grad_norm": 0.1201171875, + "learning_rate": 3.633619442700811e-05, + "lm_loss": 1.9069, + "loss": 2.0422, + "mask_loss": 0.1253, + "step": 1824, + "topk_loss": 0.01 + }, + { + "epoch": 0.7253622153356973, + "grad_norm": 0.11279296875, + "learning_rate": 3.623796302514051e-05, + "lm_loss": 1.9117, + "loss": 2.0472, + "mask_loss": 0.1257, + "step": 1825, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7257596740838266, + "grad_norm": 0.130859375, + "learning_rate": 3.613983519185301e-05, + "lm_loss": 1.9091, + "loss": 2.0477, + "mask_loss": 0.1268, + "step": 1826, + "topk_loss": 0.0118 + }, + { + "epoch": 0.7261571328319557, + "grad_norm": 0.1376953125, + "learning_rate": 3.604181108653449e-05, + "lm_loss": 1.8876, + "loss": 2.0223, + "mask_loss": 0.1254, + "step": 1827, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7265545915800848, + "grad_norm": 0.11572265625, + "learning_rate": 3.594389086840537e-05, + "lm_loss": 1.791, + "loss": 1.9253, + "mask_loss": 0.1257, + "step": 1828, + "topk_loss": 0.0086 + }, + { + "epoch": 0.726952050328214, + "grad_norm": 0.1162109375, + "learning_rate": 3.58460746965173e-05, + "lm_loss": 1.9217, + "loss": 2.0565, + "mask_loss": 0.1252, + "step": 1829, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7273495090763431, + "grad_norm": 0.12109375, + "learning_rate": 3.574836272975293e-05, + "lm_loss": 1.8529, + "loss": 1.9888, + "mask_loss": 0.1268, + "step": 1830, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7277469678244722, + "grad_norm": 0.11376953125, + "learning_rate": 3.5650755126825706e-05, + "lm_loss": 1.8983, + "loss": 2.0354, + "mask_loss": 0.1279, + "step": 1831, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7281444265726015, + "grad_norm": 0.11572265625, + "learning_rate": 3.555325204627944e-05, + "lm_loss": 1.9628, + "loss": 2.1015, + "mask_loss": 0.1284, + "step": 1832, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7285418853207306, + "grad_norm": 0.1103515625, + "learning_rate": 3.545585364648828e-05, + "lm_loss": 1.8873, + "loss": 2.0248, + "mask_loss": 0.1269, + "step": 1833, + "topk_loss": 0.0106 + }, + { + "epoch": 0.7289393440688597, + "grad_norm": 0.130859375, + "learning_rate": 3.5358560085656276e-05, + "lm_loss": 1.9359, + "loss": 2.0735, + "mask_loss": 0.1275, + "step": 1834, + "topk_loss": 0.0101 + }, + { + "epoch": 0.7293368028169889, + "grad_norm": 0.150390625, + "learning_rate": 3.5261371521817244e-05, + "lm_loss": 1.8569, + "loss": 1.9914, + "mask_loss": 0.1251, + "step": 1835, + "topk_loss": 0.0094 + }, + { + "epoch": 0.729734261565118, + "grad_norm": 0.11376953125, + "learning_rate": 3.516428811283439e-05, + "lm_loss": 1.8744, + "loss": 2.007, + "mask_loss": 0.1238, + "step": 1836, + "topk_loss": 0.0088 + }, + { + "epoch": 0.7301317203132471, + "grad_norm": 0.1298828125, + "learning_rate": 3.506731001640017e-05, + "lm_loss": 1.9186, + "loss": 2.0535, + "mask_loss": 0.1244, + "step": 1837, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7305291790613764, + "grad_norm": 0.1083984375, + "learning_rate": 3.497043739003594e-05, + "lm_loss": 1.861, + "loss": 1.9987, + "mask_loss": 0.1279, + "step": 1838, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7309266378095055, + "grad_norm": 0.1171875, + "learning_rate": 3.487367039109182e-05, + "lm_loss": 1.9308, + "loss": 2.0705, + "mask_loss": 0.1279, + "step": 1839, + "topk_loss": 0.0118 + }, + { + "epoch": 0.7313240965576346, + "grad_norm": 0.11865234375, + "learning_rate": 3.47770091767462e-05, + "lm_loss": 1.8843, + "loss": 2.0196, + "mask_loss": 0.1245, + "step": 1840, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7317215553057638, + "grad_norm": 0.1201171875, + "learning_rate": 3.4680453904005805e-05, + "lm_loss": 1.9344, + "loss": 2.0687, + "mask_loss": 0.1252, + "step": 1841, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7321190140538929, + "grad_norm": 0.154296875, + "learning_rate": 3.4584004729705213e-05, + "lm_loss": 1.9301, + "loss": 2.071, + "mask_loss": 0.1279, + "step": 1842, + "topk_loss": 0.013 + }, + { + "epoch": 0.732516472802022, + "grad_norm": 0.1416015625, + "learning_rate": 3.4487661810506656e-05, + "lm_loss": 1.9054, + "loss": 2.0481, + "mask_loss": 0.129, + "step": 1843, + "topk_loss": 0.0137 + }, + { + "epoch": 0.7329139315501512, + "grad_norm": 0.11572265625, + "learning_rate": 3.439142530289981e-05, + "lm_loss": 1.8682, + "loss": 2.0045, + "mask_loss": 0.1258, + "step": 1844, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7333113902982804, + "grad_norm": 0.11376953125, + "learning_rate": 3.4295295363201476e-05, + "lm_loss": 1.9143, + "loss": 2.0493, + "mask_loss": 0.125, + "step": 1845, + "topk_loss": 0.01 + }, + { + "epoch": 0.7337088490464095, + "grad_norm": 0.1337890625, + "learning_rate": 3.419927214755538e-05, + "lm_loss": 1.8642, + "loss": 2.0035, + "mask_loss": 0.1295, + "step": 1846, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7341063077945387, + "grad_norm": 0.1181640625, + "learning_rate": 3.4103355811931915e-05, + "lm_loss": 1.9921, + "loss": 2.1293, + "mask_loss": 0.1262, + "step": 1847, + "topk_loss": 0.0111 + }, + { + "epoch": 0.7345037665426678, + "grad_norm": 0.1396484375, + "learning_rate": 3.400754651212776e-05, + "lm_loss": 1.8553, + "loss": 1.9919, + "mask_loss": 0.1265, + "step": 1848, + "topk_loss": 0.0101 + }, + { + "epoch": 0.7349012252907969, + "grad_norm": 0.13671875, + "learning_rate": 3.391184440376588e-05, + "lm_loss": 2.0014, + "loss": 2.1335, + "mask_loss": 0.1232, + "step": 1849, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7352986840389261, + "grad_norm": 0.138671875, + "learning_rate": 3.381624964229504e-05, + "lm_loss": 2.0318, + "loss": 2.1653, + "mask_loss": 0.1232, + "step": 1850, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7352986840389261, + "eval_lm_loss": 687.4080810546875, + "eval_loss": 687.5452270507812, + "eval_mask_hit_rate": 0.5357130765914917, + "eval_mask_loss": 0.12451068311929703, + "eval_mask_top_10_hit_rate": 0.9855594038963318, + "eval_mask_top_1_hit_rate": 0.9974288940429688, + "eval_mask_top_20_hit_rate": 0.9761731624603271, + "eval_mask_top_5_hit_rate": 0.9908478260040283, + "eval_runtime": 144.0268, + "eval_samples_per_second": 14.22, + "eval_steps_per_second": 7.11, + "eval_token_accuracy": 0.6144834756851196, + "eval_top_k_diff": -528.8475341796875, + "eval_topk_loss": 0.012559626251459122, + "step": 1850 + }, + { + "epoch": 0.7356961427870553, + "grad_norm": 0.12890625, + "learning_rate": 3.3720762382989654e-05, + "lm_loss": 1.9008, + "loss": 2.0361, + "mask_loss": 0.1252, + "step": 1851, + "topk_loss": 0.0101 + }, + { + "epoch": 0.7360936015351844, + "grad_norm": 0.140625, + "learning_rate": 3.3625382780949574e-05, + "lm_loss": 1.8123, + "loss": 1.9497, + "mask_loss": 0.1269, + "step": 1852, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7364910602833136, + "grad_norm": 0.11572265625, + "learning_rate": 3.3530110991099706e-05, + "lm_loss": 1.8366, + "loss": 1.9694, + "mask_loss": 0.1253, + "step": 1853, + "topk_loss": 0.0075 + }, + { + "epoch": 0.7368885190314427, + "grad_norm": 0.1259765625, + "learning_rate": 3.343494716818989e-05, + "lm_loss": 1.9022, + "loss": 2.0385, + "mask_loss": 0.1269, + "step": 1854, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7372859777795718, + "grad_norm": 0.12451171875, + "learning_rate": 3.333989146679458e-05, + "lm_loss": 1.943, + "loss": 2.0801, + "mask_loss": 0.127, + "step": 1855, + "topk_loss": 0.0101 + }, + { + "epoch": 0.737683436527701, + "grad_norm": 0.11962890625, + "learning_rate": 3.324494404131261e-05, + "lm_loss": 1.918, + "loss": 2.0573, + "mask_loss": 0.1274, + "step": 1856, + "topk_loss": 0.0119 + }, + { + "epoch": 0.7380808952758302, + "grad_norm": 0.11865234375, + "learning_rate": 3.315010504596692e-05, + "lm_loss": 1.9292, + "loss": 2.0654, + "mask_loss": 0.1267, + "step": 1857, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7384783540239593, + "grad_norm": 0.13671875, + "learning_rate": 3.305537463480437e-05, + "lm_loss": 1.9136, + "loss": 2.0482, + "mask_loss": 0.1258, + "step": 1858, + "topk_loss": 0.0088 + }, + { + "epoch": 0.7388758127720885, + "grad_norm": 0.138671875, + "learning_rate": 3.296075296169542e-05, + "lm_loss": 1.9337, + "loss": 2.0683, + "mask_loss": 0.1246, + "step": 1859, + "topk_loss": 0.01 + }, + { + "epoch": 0.7392732715202176, + "grad_norm": 0.12451171875, + "learning_rate": 3.286624018033389e-05, + "lm_loss": 1.8758, + "loss": 2.0088, + "mask_loss": 0.1239, + "step": 1860, + "topk_loss": 0.009 + }, + { + "epoch": 0.7396707302683467, + "grad_norm": 0.11474609375, + "learning_rate": 3.277183644423677e-05, + "lm_loss": 1.8901, + "loss": 2.0252, + "mask_loss": 0.1258, + "step": 1861, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7400681890164759, + "grad_norm": 0.11767578125, + "learning_rate": 3.267754190674389e-05, + "lm_loss": 1.9397, + "loss": 2.079, + "mask_loss": 0.1285, + "step": 1862, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7404656477646051, + "grad_norm": 0.1181640625, + "learning_rate": 3.258335672101778e-05, + "lm_loss": 1.9947, + "loss": 2.1311, + "mask_loss": 0.1259, + "step": 1863, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7408631065127342, + "grad_norm": 0.1376953125, + "learning_rate": 3.248928104004321e-05, + "lm_loss": 1.9536, + "loss": 2.0884, + "mask_loss": 0.1249, + "step": 1864, + "topk_loss": 0.01 + }, + { + "epoch": 0.7412605652608634, + "grad_norm": 0.111328125, + "learning_rate": 3.2395315016627195e-05, + "lm_loss": 1.8897, + "loss": 2.0264, + "mask_loss": 0.1269, + "step": 1865, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7416580240089925, + "grad_norm": 0.1162109375, + "learning_rate": 3.230145880339861e-05, + "lm_loss": 1.8932, + "loss": 2.029, + "mask_loss": 0.1258, + "step": 1866, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7420554827571216, + "grad_norm": 0.11376953125, + "learning_rate": 3.220771255280797e-05, + "lm_loss": 1.9349, + "loss": 2.0697, + "mask_loss": 0.1252, + "step": 1867, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7424529415052508, + "grad_norm": 0.125, + "learning_rate": 3.211407641712716e-05, + "lm_loss": 1.8691, + "loss": 2.0071, + "mask_loss": 0.1282, + "step": 1868, + "topk_loss": 0.0098 + }, + { + "epoch": 0.74285040025338, + "grad_norm": 0.11767578125, + "learning_rate": 3.202055054844921e-05, + "lm_loss": 1.9568, + "loss": 2.0909, + "mask_loss": 0.1245, + "step": 1869, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7432478590015091, + "grad_norm": 0.1533203125, + "learning_rate": 3.1927135098688056e-05, + "lm_loss": 1.8673, + "loss": 2.0069, + "mask_loss": 0.128, + "step": 1870, + "topk_loss": 0.0116 + }, + { + "epoch": 0.7436453177496383, + "grad_norm": 0.1201171875, + "learning_rate": 3.1833830219578284e-05, + "lm_loss": 1.8598, + "loss": 1.9951, + "mask_loss": 0.1265, + "step": 1871, + "topk_loss": 0.0088 + }, + { + "epoch": 0.7440427764977674, + "grad_norm": 0.1328125, + "learning_rate": 3.174063606267483e-05, + "lm_loss": 1.893, + "loss": 2.0251, + "mask_loss": 0.1226, + "step": 1872, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7444402352458965, + "grad_norm": 0.11279296875, + "learning_rate": 3.164755277935284e-05, + "lm_loss": 1.8708, + "loss": 2.0078, + "mask_loss": 0.1278, + "step": 1873, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7448376939940257, + "grad_norm": 0.1513671875, + "learning_rate": 3.155458052080735e-05, + "lm_loss": 1.9627, + "loss": 2.1058, + "mask_loss": 0.1296, + "step": 1874, + "topk_loss": 0.0135 + }, + { + "epoch": 0.7452351527421548, + "grad_norm": 0.11083984375, + "learning_rate": 3.146171943805307e-05, + "lm_loss": 1.8988, + "loss": 2.0342, + "mask_loss": 0.1266, + "step": 1875, + "topk_loss": 0.0088 + }, + { + "epoch": 0.745632611490284, + "grad_norm": 0.1259765625, + "learning_rate": 3.13689696819241e-05, + "lm_loss": 1.9575, + "loss": 2.099, + "mask_loss": 0.1284, + "step": 1876, + "topk_loss": 0.0131 + }, + { + "epoch": 0.7460300702384132, + "grad_norm": 0.1171875, + "learning_rate": 3.1276331403073735e-05, + "lm_loss": 1.9339, + "loss": 2.0669, + "mask_loss": 0.1239, + "step": 1877, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7464275289865423, + "grad_norm": 0.1298828125, + "learning_rate": 3.118380475197419e-05, + "lm_loss": 1.9466, + "loss": 2.081, + "mask_loss": 0.1251, + "step": 1878, + "topk_loss": 0.0094 + }, + { + "epoch": 0.7468249877346714, + "grad_norm": 0.1240234375, + "learning_rate": 3.109138987891639e-05, + "lm_loss": 1.8772, + "loss": 2.0131, + "mask_loss": 0.1256, + "step": 1879, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7472224464828006, + "grad_norm": 0.11279296875, + "learning_rate": 3.0999086934009625e-05, + "lm_loss": 1.8922, + "loss": 2.0279, + "mask_loss": 0.1268, + "step": 1880, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7476199052309297, + "grad_norm": 0.11279296875, + "learning_rate": 3.090689606718146e-05, + "lm_loss": 1.9026, + "loss": 2.0382, + "mask_loss": 0.1263, + "step": 1881, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7480173639790589, + "grad_norm": 0.12109375, + "learning_rate": 3.081481742817736e-05, + "lm_loss": 1.8625, + "loss": 1.9959, + "mask_loss": 0.1244, + "step": 1882, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7484148227271881, + "grad_norm": 0.138671875, + "learning_rate": 3.072285116656053e-05, + "lm_loss": 1.8993, + "loss": 2.0387, + "mask_loss": 0.1282, + "step": 1883, + "topk_loss": 0.0112 + }, + { + "epoch": 0.7488122814753172, + "grad_norm": 0.11181640625, + "learning_rate": 3.0630997431711636e-05, + "lm_loss": 1.9531, + "loss": 2.0886, + "mask_loss": 0.1254, + "step": 1884, + "topk_loss": 0.0101 + }, + { + "epoch": 0.7492097402234463, + "grad_norm": 0.12109375, + "learning_rate": 3.053925637282856e-05, + "lm_loss": 1.9536, + "loss": 2.0873, + "mask_loss": 0.124, + "step": 1885, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7496071989715755, + "grad_norm": 0.11181640625, + "learning_rate": 3.0447628138926156e-05, + "lm_loss": 1.9276, + "loss": 2.0615, + "mask_loss": 0.125, + "step": 1886, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7500046577197046, + "grad_norm": 0.126953125, + "learning_rate": 3.035611287883603e-05, + "lm_loss": 1.8919, + "loss": 2.0299, + "mask_loss": 0.1256, + "step": 1887, + "topk_loss": 0.0125 + }, + { + "epoch": 0.7504021164678338, + "grad_norm": 0.109375, + "learning_rate": 3.0264710741206283e-05, + "lm_loss": 1.8436, + "loss": 1.9778, + "mask_loss": 0.1251, + "step": 1888, + "topk_loss": 0.0091 + }, + { + "epoch": 0.750799575215963, + "grad_norm": 0.1494140625, + "learning_rate": 3.0173421874501262e-05, + "lm_loss": 1.9214, + "loss": 2.065, + "mask_loss": 0.1287, + "step": 1889, + "topk_loss": 0.0149 + }, + { + "epoch": 0.7511970339640921, + "grad_norm": 0.111328125, + "learning_rate": 3.0082246427001347e-05, + "lm_loss": 1.8264, + "loss": 1.9638, + "mask_loss": 0.1273, + "step": 1890, + "topk_loss": 0.01 + }, + { + "epoch": 0.7515944927122212, + "grad_norm": 0.10693359375, + "learning_rate": 2.9991184546802663e-05, + "lm_loss": 1.8666, + "loss": 2.0018, + "mask_loss": 0.1258, + "step": 1891, + "topk_loss": 0.0094 + }, + { + "epoch": 0.7519919514603504, + "grad_norm": 0.1162109375, + "learning_rate": 2.9900236381816893e-05, + "lm_loss": 1.9626, + "loss": 2.0946, + "mask_loss": 0.1234, + "step": 1892, + "topk_loss": 0.0085 + }, + { + "epoch": 0.7523894102084795, + "grad_norm": 0.115234375, + "learning_rate": 2.980940207977101e-05, + "lm_loss": 1.9205, + "loss": 2.0548, + "mask_loss": 0.1246, + "step": 1893, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7527868689566087, + "grad_norm": 0.123046875, + "learning_rate": 2.9718681788207016e-05, + "lm_loss": 1.8521, + "loss": 1.9935, + "mask_loss": 0.1283, + "step": 1894, + "topk_loss": 0.013 + }, + { + "epoch": 0.7531843277047379, + "grad_norm": 0.173828125, + "learning_rate": 2.962807565448179e-05, + "lm_loss": 1.8949, + "loss": 2.0285, + "mask_loss": 0.1249, + "step": 1895, + "topk_loss": 0.0088 + }, + { + "epoch": 0.753581786452867, + "grad_norm": 0.10986328125, + "learning_rate": 2.9537583825766667e-05, + "lm_loss": 1.8918, + "loss": 2.0257, + "mask_loss": 0.1246, + "step": 1896, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7539792452009961, + "grad_norm": 0.1181640625, + "learning_rate": 2.9447206449047427e-05, + "lm_loss": 1.9062, + "loss": 2.0406, + "mask_loss": 0.1241, + "step": 1897, + "topk_loss": 0.0104 + }, + { + "epoch": 0.7543767039491253, + "grad_norm": 0.1142578125, + "learning_rate": 2.9356943671123904e-05, + "lm_loss": 1.8916, + "loss": 2.0297, + "mask_loss": 0.1287, + "step": 1898, + "topk_loss": 0.0094 + }, + { + "epoch": 0.7547741626972544, + "grad_norm": 0.126953125, + "learning_rate": 2.926679563860978e-05, + "lm_loss": 1.9112, + "loss": 2.0523, + "mask_loss": 0.1278, + "step": 1899, + "topk_loss": 0.0133 + }, + { + "epoch": 0.7551716214453836, + "grad_norm": 0.1240234375, + "learning_rate": 2.9176762497932375e-05, + "lm_loss": 1.9086, + "loss": 2.0441, + "mask_loss": 0.1252, + "step": 1900, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7551716214453836, + "eval_lm_loss": 687.020751953125, + "eval_loss": 687.1578369140625, + "eval_mask_hit_rate": 0.5357862710952759, + "eval_mask_loss": 0.12445982545614243, + "eval_mask_top_10_hit_rate": 0.9855801463127136, + "eval_mask_top_1_hit_rate": 0.9974265098571777, + "eval_mask_top_20_hit_rate": 0.9762018918991089, + "eval_mask_top_5_hit_rate": 0.9908568859100342, + "eval_runtime": 144.5163, + "eval_samples_per_second": 14.171, + "eval_steps_per_second": 7.086, + "eval_token_accuracy": 0.6144884824752808, + "eval_top_k_diff": -527.6876220703125, + "eval_topk_loss": 0.012589771300554276, + "step": 1900 + }, + { + "epoch": 0.7555690801935128, + "grad_norm": 0.1376953125, + "learning_rate": 2.9086844395332392e-05, + "lm_loss": 1.8859, + "loss": 2.0197, + "mask_loss": 0.1241, + "step": 1901, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7559665389416419, + "grad_norm": 0.12353515625, + "learning_rate": 2.899704147686366e-05, + "lm_loss": 1.9076, + "loss": 2.0417, + "mask_loss": 0.1253, + "step": 1902, + "topk_loss": 0.0089 + }, + { + "epoch": 0.756363997689771, + "grad_norm": 0.126953125, + "learning_rate": 2.890735388839295e-05, + "lm_loss": 1.9016, + "loss": 2.038, + "mask_loss": 0.1258, + "step": 1903, + "topk_loss": 0.0106 + }, + { + "epoch": 0.7567614564379002, + "grad_norm": 0.1318359375, + "learning_rate": 2.8817781775599618e-05, + "lm_loss": 1.9314, + "loss": 2.0643, + "mask_loss": 0.1244, + "step": 1904, + "topk_loss": 0.0086 + }, + { + "epoch": 0.7571589151860293, + "grad_norm": 0.11474609375, + "learning_rate": 2.8728325283975553e-05, + "lm_loss": 1.8348, + "loss": 1.9699, + "mask_loss": 0.1253, + "step": 1905, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7575563739341584, + "grad_norm": 0.1328125, + "learning_rate": 2.8638984558824777e-05, + "lm_loss": 1.9063, + "loss": 2.0438, + "mask_loss": 0.1278, + "step": 1906, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7579538326822877, + "grad_norm": 0.11181640625, + "learning_rate": 2.8549759745263314e-05, + "lm_loss": 1.9223, + "loss": 2.0585, + "mask_loss": 0.1249, + "step": 1907, + "topk_loss": 0.0112 + }, + { + "epoch": 0.7583512914304168, + "grad_norm": 0.1162109375, + "learning_rate": 2.8460650988218886e-05, + "lm_loss": 1.9308, + "loss": 2.0659, + "mask_loss": 0.1254, + "step": 1908, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7587487501785459, + "grad_norm": 0.11767578125, + "learning_rate": 2.8371658432430716e-05, + "lm_loss": 1.9609, + "loss": 2.0927, + "mask_loss": 0.1235, + "step": 1909, + "topk_loss": 0.0083 + }, + { + "epoch": 0.7591462089266751, + "grad_norm": 0.126953125, + "learning_rate": 2.8282782222449267e-05, + "lm_loss": 1.9922, + "loss": 2.1295, + "mask_loss": 0.1249, + "step": 1910, + "topk_loss": 0.0125 + }, + { + "epoch": 0.7595436676748042, + "grad_norm": 0.1142578125, + "learning_rate": 2.8194022502636075e-05, + "lm_loss": 1.8613, + "loss": 1.9952, + "mask_loss": 0.1254, + "step": 1911, + "topk_loss": 0.0086 + }, + { + "epoch": 0.7599411264229333, + "grad_norm": 0.130859375, + "learning_rate": 2.8105379417163357e-05, + "lm_loss": 1.8442, + "loss": 1.9784, + "mask_loss": 0.125, + "step": 1912, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7603385851710626, + "grad_norm": 0.115234375, + "learning_rate": 2.801685311001396e-05, + "lm_loss": 1.832, + "loss": 1.9689, + "mask_loss": 0.1266, + "step": 1913, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7607360439191917, + "grad_norm": 0.119140625, + "learning_rate": 2.7928443724981045e-05, + "lm_loss": 1.8346, + "loss": 1.9715, + "mask_loss": 0.1263, + "step": 1914, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7611335026673208, + "grad_norm": 0.109375, + "learning_rate": 2.7840151405667837e-05, + "lm_loss": 1.9061, + "loss": 2.0427, + "mask_loss": 0.1274, + "step": 1915, + "topk_loss": 0.0092 + }, + { + "epoch": 0.76153096141545, + "grad_norm": 0.130859375, + "learning_rate": 2.7751976295487402e-05, + "lm_loss": 1.9256, + "loss": 2.062, + "mask_loss": 0.1253, + "step": 1916, + "topk_loss": 0.0112 + }, + { + "epoch": 0.7619284201635791, + "grad_norm": 0.12158203125, + "learning_rate": 2.766391853766247e-05, + "lm_loss": 1.8934, + "loss": 2.028, + "mask_loss": 0.1253, + "step": 1917, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7623258789117082, + "grad_norm": 0.1220703125, + "learning_rate": 2.757597827522509e-05, + "lm_loss": 1.9198, + "loss": 2.0545, + "mask_loss": 0.1249, + "step": 1918, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7627233376598375, + "grad_norm": 0.10888671875, + "learning_rate": 2.7488155651016556e-05, + "lm_loss": 1.9409, + "loss": 2.0746, + "mask_loss": 0.1248, + "step": 1919, + "topk_loss": 0.009 + }, + { + "epoch": 0.7631207964079666, + "grad_norm": 0.1171875, + "learning_rate": 2.7400450807686938e-05, + "lm_loss": 1.9052, + "loss": 2.0384, + "mask_loss": 0.1241, + "step": 1920, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7635182551560957, + "grad_norm": 0.10791015625, + "learning_rate": 2.731286388769514e-05, + "lm_loss": 1.872, + "loss": 2.0074, + "mask_loss": 0.1257, + "step": 1921, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7639157139042249, + "grad_norm": 0.1142578125, + "learning_rate": 2.722539503330843e-05, + "lm_loss": 1.9372, + "loss": 2.0737, + "mask_loss": 0.1271, + "step": 1922, + "topk_loss": 0.0094 + }, + { + "epoch": 0.764313172652354, + "grad_norm": 0.1796875, + "learning_rate": 2.7138044386602358e-05, + "lm_loss": 1.9238, + "loss": 2.0597, + "mask_loss": 0.1267, + "step": 1923, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7647106314004831, + "grad_norm": 0.1220703125, + "learning_rate": 2.705081208946043e-05, + "lm_loss": 1.8969, + "loss": 2.0332, + "mask_loss": 0.1248, + "step": 1924, + "topk_loss": 0.0115 + }, + { + "epoch": 0.7651080901486124, + "grad_norm": 0.1103515625, + "learning_rate": 2.6963698283573958e-05, + "lm_loss": 1.9095, + "loss": 2.0487, + "mask_loss": 0.1293, + "step": 1925, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7655055488967415, + "grad_norm": 0.1142578125, + "learning_rate": 2.6876703110441747e-05, + "lm_loss": 1.8824, + "loss": 2.017, + "mask_loss": 0.1241, + "step": 1926, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7659030076448706, + "grad_norm": 0.11279296875, + "learning_rate": 2.6789826711369924e-05, + "lm_loss": 1.8939, + "loss": 2.0288, + "mask_loss": 0.1253, + "step": 1927, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7663004663929998, + "grad_norm": 0.11328125, + "learning_rate": 2.670306922747171e-05, + "lm_loss": 1.8682, + "loss": 2.0054, + "mask_loss": 0.1277, + "step": 1928, + "topk_loss": 0.0096 + }, + { + "epoch": 0.7666979251411289, + "grad_norm": 0.1103515625, + "learning_rate": 2.6616430799667136e-05, + "lm_loss": 1.9249, + "loss": 2.0587, + "mask_loss": 0.125, + "step": 1929, + "topk_loss": 0.0088 + }, + { + "epoch": 0.767095383889258, + "grad_norm": 0.126953125, + "learning_rate": 2.6529911568682876e-05, + "lm_loss": 1.8887, + "loss": 2.0276, + "mask_loss": 0.1284, + "step": 1930, + "topk_loss": 0.0104 + }, + { + "epoch": 0.7674928426373872, + "grad_norm": 0.11865234375, + "learning_rate": 2.644351167505199e-05, + "lm_loss": 1.9812, + "loss": 2.1151, + "mask_loss": 0.1255, + "step": 1931, + "topk_loss": 0.0084 + }, + { + "epoch": 0.7678903013855164, + "grad_norm": 0.1279296875, + "learning_rate": 2.635723125911368e-05, + "lm_loss": 1.9342, + "loss": 2.0737, + "mask_loss": 0.1276, + "step": 1932, + "topk_loss": 0.0118 + }, + { + "epoch": 0.7682877601336455, + "grad_norm": 0.12255859375, + "learning_rate": 2.6271070461013116e-05, + "lm_loss": 1.8474, + "loss": 1.9836, + "mask_loss": 0.126, + "step": 1933, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7686852188817747, + "grad_norm": 0.1396484375, + "learning_rate": 2.6185029420701136e-05, + "lm_loss": 1.921, + "loss": 2.0654, + "mask_loss": 0.1302, + "step": 1934, + "topk_loss": 0.0142 + }, + { + "epoch": 0.7690826776299038, + "grad_norm": 0.11279296875, + "learning_rate": 2.6099108277934103e-05, + "lm_loss": 1.8728, + "loss": 2.0099, + "mask_loss": 0.1264, + "step": 1935, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7694801363780329, + "grad_norm": 0.11767578125, + "learning_rate": 2.6013307172273548e-05, + "lm_loss": 1.8983, + "loss": 2.0338, + "mask_loss": 0.1256, + "step": 1936, + "topk_loss": 0.0098 + }, + { + "epoch": 0.769877595126162, + "grad_norm": 0.11376953125, + "learning_rate": 2.59276262430861e-05, + "lm_loss": 1.8407, + "loss": 1.9774, + "mask_loss": 0.1257, + "step": 1937, + "topk_loss": 0.0111 + }, + { + "epoch": 0.7702750538742913, + "grad_norm": 0.1240234375, + "learning_rate": 2.5842065629543166e-05, + "lm_loss": 1.8745, + "loss": 2.0144, + "mask_loss": 0.127, + "step": 1938, + "topk_loss": 0.0129 + }, + { + "epoch": 0.7706725126224204, + "grad_norm": 0.12353515625, + "learning_rate": 2.575662547062071e-05, + "lm_loss": 1.8346, + "loss": 1.9679, + "mask_loss": 0.126, + "step": 1939, + "topk_loss": 0.0073 + }, + { + "epoch": 0.7710699713705496, + "grad_norm": 0.1123046875, + "learning_rate": 2.5671305905099075e-05, + "lm_loss": 1.8815, + "loss": 2.0147, + "mask_loss": 0.1244, + "step": 1940, + "topk_loss": 0.0088 + }, + { + "epoch": 0.7714674301186787, + "grad_norm": 0.1201171875, + "learning_rate": 2.558610707156268e-05, + "lm_loss": 1.8496, + "loss": 1.9857, + "mask_loss": 0.1262, + "step": 1941, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7718648888668078, + "grad_norm": 0.173828125, + "learning_rate": 2.5501029108399866e-05, + "lm_loss": 1.8575, + "loss": 1.9934, + "mask_loss": 0.1258, + "step": 1942, + "topk_loss": 0.01 + }, + { + "epoch": 0.772262347614937, + "grad_norm": 0.12158203125, + "learning_rate": 2.5416072153802683e-05, + "lm_loss": 1.8578, + "loss": 1.9934, + "mask_loss": 0.1256, + "step": 1943, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7726598063630662, + "grad_norm": 0.12451171875, + "learning_rate": 2.5331236345766517e-05, + "lm_loss": 1.9165, + "loss": 2.0485, + "mask_loss": 0.1232, + "step": 1944, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7730572651111953, + "grad_norm": 0.11083984375, + "learning_rate": 2.5246521822090064e-05, + "lm_loss": 1.8737, + "loss": 2.0102, + "mask_loss": 0.1256, + "step": 1945, + "topk_loss": 0.0109 + }, + { + "epoch": 0.7734547238593245, + "grad_norm": 0.11376953125, + "learning_rate": 2.5161928720374993e-05, + "lm_loss": 1.9129, + "loss": 2.0471, + "mask_loss": 0.1249, + "step": 1946, + "topk_loss": 0.0092 + }, + { + "epoch": 0.7738521826074536, + "grad_norm": 0.12890625, + "learning_rate": 2.5077457178025777e-05, + "lm_loss": 1.8962, + "loss": 2.0343, + "mask_loss": 0.1265, + "step": 1947, + "topk_loss": 0.0116 + }, + { + "epoch": 0.7742496413555827, + "grad_norm": 0.1171875, + "learning_rate": 2.4993107332249387e-05, + "lm_loss": 1.9555, + "loss": 2.0913, + "mask_loss": 0.1255, + "step": 1948, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7746471001037118, + "grad_norm": 0.1171875, + "learning_rate": 2.4908879320055167e-05, + "lm_loss": 1.86, + "loss": 1.9944, + "mask_loss": 0.1253, + "step": 1949, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7750445588518411, + "grad_norm": 0.1220703125, + "learning_rate": 2.4824773278254544e-05, + "lm_loss": 1.9012, + "loss": 2.0344, + "mask_loss": 0.1239, + "step": 1950, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7750445588518411, + "eval_lm_loss": 686.8341064453125, + "eval_loss": 686.97119140625, + "eval_mask_hit_rate": 0.5358555316925049, + "eval_mask_loss": 0.12442468106746674, + "eval_mask_top_10_hit_rate": 0.9855944514274597, + "eval_mask_top_1_hit_rate": 0.9974312782287598, + "eval_mask_top_20_hit_rate": 0.9762206077575684, + "eval_mask_top_5_hit_rate": 0.9908714294433594, + "eval_runtime": 144.2029, + "eval_samples_per_second": 14.202, + "eval_steps_per_second": 7.101, + "eval_token_accuracy": 0.6145423650741577, + "eval_top_k_diff": -526.3795166015625, + "eval_topk_loss": 0.012643979862332344, + "step": 1950 + }, + { + "epoch": 0.7754420175999702, + "grad_norm": 0.1318359375, + "learning_rate": 2.4740789343460857e-05, + "lm_loss": 1.916, + "loss": 2.054, + "mask_loss": 0.1272, + "step": 1951, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7758394763480994, + "grad_norm": 0.1591796875, + "learning_rate": 2.4656927652089034e-05, + "lm_loss": 1.9277, + "loss": 2.0645, + "mask_loss": 0.1259, + "step": 1952, + "topk_loss": 0.0108 + }, + { + "epoch": 0.7762369350962285, + "grad_norm": 0.11083984375, + "learning_rate": 2.457318834035551e-05, + "lm_loss": 1.8682, + "loss": 2.0037, + "mask_loss": 0.1262, + "step": 1953, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7766343938443576, + "grad_norm": 0.1103515625, + "learning_rate": 2.4489571544277945e-05, + "lm_loss": 1.8163, + "loss": 1.9471, + "mask_loss": 0.1226, + "step": 1954, + "topk_loss": 0.0081 + }, + { + "epoch": 0.7770318525924867, + "grad_norm": 0.1328125, + "learning_rate": 2.4406077399674963e-05, + "lm_loss": 1.8933, + "loss": 2.027, + "mask_loss": 0.1231, + "step": 1955, + "topk_loss": 0.0106 + }, + { + "epoch": 0.777429311340616, + "grad_norm": 0.1396484375, + "learning_rate": 2.4322706042165967e-05, + "lm_loss": 1.9016, + "loss": 2.0375, + "mask_loss": 0.1262, + "step": 1956, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7778267700887451, + "grad_norm": 0.138671875, + "learning_rate": 2.4239457607170946e-05, + "lm_loss": 1.8388, + "loss": 1.9749, + "mask_loss": 0.1264, + "step": 1957, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7782242288368743, + "grad_norm": 0.11376953125, + "learning_rate": 2.4156332229910182e-05, + "lm_loss": 1.8868, + "loss": 2.0214, + "mask_loss": 0.1259, + "step": 1958, + "topk_loss": 0.0087 + }, + { + "epoch": 0.7786216875850034, + "grad_norm": 0.11083984375, + "learning_rate": 2.4073330045404118e-05, + "lm_loss": 1.939, + "loss": 2.0748, + "mask_loss": 0.1247, + "step": 1959, + "topk_loss": 0.0111 + }, + { + "epoch": 0.7790191463331325, + "grad_norm": 0.13671875, + "learning_rate": 2.3990451188473073e-05, + "lm_loss": 1.9089, + "loss": 2.0474, + "mask_loss": 0.1275, + "step": 1960, + "topk_loss": 0.011 + }, + { + "epoch": 0.7794166050812616, + "grad_norm": 0.1259765625, + "learning_rate": 2.390769579373705e-05, + "lm_loss": 1.8258, + "loss": 1.9621, + "mask_loss": 0.1266, + "step": 1961, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7798140638293908, + "grad_norm": 0.1083984375, + "learning_rate": 2.3825063995615505e-05, + "lm_loss": 1.8292, + "loss": 1.9649, + "mask_loss": 0.1264, + "step": 1962, + "topk_loss": 0.0092 + }, + { + "epoch": 0.78021152257752, + "grad_norm": 0.1416015625, + "learning_rate": 2.3742555928327137e-05, + "lm_loss": 1.8748, + "loss": 2.012, + "mask_loss": 0.1279, + "step": 1963, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7806089813256492, + "grad_norm": 0.11376953125, + "learning_rate": 2.36601717258897e-05, + "lm_loss": 1.9319, + "loss": 2.0692, + "mask_loss": 0.1264, + "step": 1964, + "topk_loss": 0.0109 + }, + { + "epoch": 0.7810064400737783, + "grad_norm": 0.1181640625, + "learning_rate": 2.35779115221197e-05, + "lm_loss": 1.9059, + "loss": 2.0435, + "mask_loss": 0.1269, + "step": 1965, + "topk_loss": 0.0107 + }, + { + "epoch": 0.7814038988219074, + "grad_norm": 0.11279296875, + "learning_rate": 2.3495775450632283e-05, + "lm_loss": 1.9232, + "loss": 2.0584, + "mask_loss": 0.1263, + "step": 1966, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7818013575700365, + "grad_norm": 0.1259765625, + "learning_rate": 2.341376364484097e-05, + "lm_loss": 1.9051, + "loss": 2.0416, + "mask_loss": 0.1266, + "step": 1967, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7821988163181657, + "grad_norm": 0.126953125, + "learning_rate": 2.333187623795734e-05, + "lm_loss": 1.9181, + "loss": 2.0577, + "mask_loss": 0.1278, + "step": 1968, + "topk_loss": 0.0118 + }, + { + "epoch": 0.7825962750662949, + "grad_norm": 0.11572265625, + "learning_rate": 2.325011336299103e-05, + "lm_loss": 1.9011, + "loss": 2.0339, + "mask_loss": 0.1235, + "step": 1969, + "topk_loss": 0.0094 + }, + { + "epoch": 0.782993733814424, + "grad_norm": 0.11376953125, + "learning_rate": 2.3168475152749346e-05, + "lm_loss": 1.8592, + "loss": 1.9931, + "mask_loss": 0.1244, + "step": 1970, + "topk_loss": 0.0095 + }, + { + "epoch": 0.7833911925625532, + "grad_norm": 0.1201171875, + "learning_rate": 2.308696173983711e-05, + "lm_loss": 1.9293, + "loss": 2.0637, + "mask_loss": 0.1257, + "step": 1971, + "topk_loss": 0.0087 + }, + { + "epoch": 0.7837886513106823, + "grad_norm": 0.1083984375, + "learning_rate": 2.3005573256656443e-05, + "lm_loss": 1.8757, + "loss": 2.0112, + "mask_loss": 0.1256, + "step": 1972, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7841861100588114, + "grad_norm": 0.1240234375, + "learning_rate": 2.292430983540652e-05, + "lm_loss": 1.927, + "loss": 2.067, + "mask_loss": 0.1275, + "step": 1973, + "topk_loss": 0.0124 + }, + { + "epoch": 0.7845835688069406, + "grad_norm": 0.1162109375, + "learning_rate": 2.2843171608083414e-05, + "lm_loss": 1.8984, + "loss": 2.0312, + "mask_loss": 0.1242, + "step": 1974, + "topk_loss": 0.0085 + }, + { + "epoch": 0.7849810275550698, + "grad_norm": 0.13671875, + "learning_rate": 2.276215870647983e-05, + "lm_loss": 1.9559, + "loss": 2.0891, + "mask_loss": 0.1237, + "step": 1975, + "topk_loss": 0.0095 + }, + { + "epoch": 0.785378486303199, + "grad_norm": 0.1318359375, + "learning_rate": 2.2681271262184856e-05, + "lm_loss": 1.8774, + "loss": 2.0193, + "mask_loss": 0.1289, + "step": 1976, + "topk_loss": 0.013 + }, + { + "epoch": 0.7857759450513281, + "grad_norm": 0.1123046875, + "learning_rate": 2.260050940658388e-05, + "lm_loss": 1.9079, + "loss": 2.0424, + "mask_loss": 0.1248, + "step": 1977, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7861734037994572, + "grad_norm": 0.11962890625, + "learning_rate": 2.251987327085825e-05, + "lm_loss": 1.9246, + "loss": 2.0578, + "mask_loss": 0.123, + "step": 1978, + "topk_loss": 0.0102 + }, + { + "epoch": 0.7865708625475863, + "grad_norm": 0.1630859375, + "learning_rate": 2.2439362985985124e-05, + "lm_loss": 1.8482, + "loss": 1.9903, + "mask_loss": 0.1281, + "step": 1979, + "topk_loss": 0.014 + }, + { + "epoch": 0.7869683212957155, + "grad_norm": 0.11376953125, + "learning_rate": 2.235897868273723e-05, + "lm_loss": 1.8265, + "loss": 1.9625, + "mask_loss": 0.1257, + "step": 1980, + "topk_loss": 0.0103 + }, + { + "epoch": 0.7873657800438447, + "grad_norm": 0.12890625, + "learning_rate": 2.2278720491682682e-05, + "lm_loss": 1.9383, + "loss": 2.0739, + "mask_loss": 0.1263, + "step": 1981, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7877632387919739, + "grad_norm": 0.12451171875, + "learning_rate": 2.2198588543184728e-05, + "lm_loss": 1.8872, + "loss": 2.0209, + "mask_loss": 0.1233, + "step": 1982, + "topk_loss": 0.0105 + }, + { + "epoch": 0.788160697540103, + "grad_norm": 0.134765625, + "learning_rate": 2.2118582967401604e-05, + "lm_loss": 1.9392, + "loss": 2.0782, + "mask_loss": 0.1266, + "step": 1983, + "topk_loss": 0.0124 + }, + { + "epoch": 0.7885581562882321, + "grad_norm": 0.11474609375, + "learning_rate": 2.2038703894286182e-05, + "lm_loss": 1.8916, + "loss": 2.0264, + "mask_loss": 0.126, + "step": 1984, + "topk_loss": 0.0088 + }, + { + "epoch": 0.7889556150363612, + "grad_norm": 0.119140625, + "learning_rate": 2.1958951453585964e-05, + "lm_loss": 1.9144, + "loss": 2.0487, + "mask_loss": 0.1246, + "step": 1985, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7893530737844904, + "grad_norm": 0.1142578125, + "learning_rate": 2.187932577484271e-05, + "lm_loss": 1.8908, + "loss": 2.0243, + "mask_loss": 0.1248, + "step": 1986, + "topk_loss": 0.0087 + }, + { + "epoch": 0.7897505325326195, + "grad_norm": 0.1083984375, + "learning_rate": 2.179982698739228e-05, + "lm_loss": 1.8755, + "loss": 2.0111, + "mask_loss": 0.1267, + "step": 1987, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7901479912807488, + "grad_norm": 0.1162109375, + "learning_rate": 2.1720455220364444e-05, + "lm_loss": 1.919, + "loss": 2.0554, + "mask_loss": 0.1264, + "step": 1988, + "topk_loss": 0.01 + }, + { + "epoch": 0.7905454500288779, + "grad_norm": 0.109375, + "learning_rate": 2.1641210602682637e-05, + "lm_loss": 1.9149, + "loss": 2.0494, + "mask_loss": 0.1248, + "step": 1989, + "topk_loss": 0.0097 + }, + { + "epoch": 0.790942908777007, + "grad_norm": 0.11083984375, + "learning_rate": 2.1562093263063777e-05, + "lm_loss": 1.8652, + "loss": 2.0003, + "mask_loss": 0.1255, + "step": 1990, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7913403675251361, + "grad_norm": 0.1181640625, + "learning_rate": 2.148310333001804e-05, + "lm_loss": 1.9265, + "loss": 2.0623, + "mask_loss": 0.1266, + "step": 1991, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7917378262732653, + "grad_norm": 0.18359375, + "learning_rate": 2.140424093184864e-05, + "lm_loss": 1.8628, + "loss": 1.9956, + "mask_loss": 0.1235, + "step": 1992, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7921352850213944, + "grad_norm": 0.126953125, + "learning_rate": 2.132550619665168e-05, + "lm_loss": 1.825, + "loss": 1.9607, + "mask_loss": 0.1272, + "step": 1993, + "topk_loss": 0.0085 + }, + { + "epoch": 0.7925327437695237, + "grad_norm": 0.11328125, + "learning_rate": 2.1246899252315843e-05, + "lm_loss": 1.8818, + "loss": 2.0198, + "mask_loss": 0.1276, + "step": 1994, + "topk_loss": 0.0104 + }, + { + "epoch": 0.7929302025176528, + "grad_norm": 0.12109375, + "learning_rate": 2.116842022652228e-05, + "lm_loss": 1.9266, + "loss": 2.0608, + "mask_loss": 0.1245, + "step": 1995, + "topk_loss": 0.0097 + }, + { + "epoch": 0.7933276612657819, + "grad_norm": 0.11328125, + "learning_rate": 2.109006924674436e-05, + "lm_loss": 1.9416, + "loss": 2.0784, + "mask_loss": 0.1262, + "step": 1996, + "topk_loss": 0.0106 + }, + { + "epoch": 0.793725120013911, + "grad_norm": 0.1787109375, + "learning_rate": 2.101184644024745e-05, + "lm_loss": 1.8627, + "loss": 2.0, + "mask_loss": 0.1266, + "step": 1997, + "topk_loss": 0.0107 + }, + { + "epoch": 0.7941225787620402, + "grad_norm": 0.107421875, + "learning_rate": 2.0933751934088743e-05, + "lm_loss": 1.9036, + "loss": 2.038, + "mask_loss": 0.1254, + "step": 1998, + "topk_loss": 0.0091 + }, + { + "epoch": 0.7945200375101693, + "grad_norm": 0.115234375, + "learning_rate": 2.085578585511705e-05, + "lm_loss": 1.9016, + "loss": 2.0373, + "mask_loss": 0.1251, + "step": 1999, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7949174962582986, + "grad_norm": 0.11474609375, + "learning_rate": 2.0777948329972497e-05, + "lm_loss": 1.9414, + "loss": 2.0777, + "mask_loss": 0.1254, + "step": 2000, + "topk_loss": 0.0109 + }, + { + "epoch": 0.7949174962582986, + "eval_lm_loss": 687.2774658203125, + "eval_loss": 687.4144287109375, + "eval_mask_hit_rate": 0.5359091758728027, + "eval_mask_loss": 0.12442217767238617, + "eval_mask_top_10_hit_rate": 0.9855918884277344, + "eval_mask_top_1_hit_rate": 0.9974300861358643, + "eval_mask_top_20_hit_rate": 0.9762183427810669, + "eval_mask_top_5_hit_rate": 0.9908632636070251, + "eval_runtime": 143.846, + "eval_samples_per_second": 14.237, + "eval_steps_per_second": 7.119, + "eval_token_accuracy": 0.6145159006118774, + "eval_top_k_diff": -529.1758422851562, + "eval_topk_loss": 0.012563161551952362, + "step": 2000 + }, + { + "epoch": 0.7953149550064277, + "grad_norm": 0.12109375, + "learning_rate": 2.0700239485086505e-05, + "lm_loss": 1.8677, + "loss": 2.0028, + "mask_loss": 0.1262, + "step": 2001, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7957124137545568, + "grad_norm": 0.11865234375, + "learning_rate": 2.06226594466814e-05, + "lm_loss": 1.9036, + "loss": 2.0411, + "mask_loss": 0.1271, + "step": 2002, + "topk_loss": 0.0104 + }, + { + "epoch": 0.796109872502686, + "grad_norm": 0.10546875, + "learning_rate": 2.054520834077036e-05, + "lm_loss": 1.8778, + "loss": 2.015, + "mask_loss": 0.1279, + "step": 2003, + "topk_loss": 0.0093 + }, + { + "epoch": 0.7965073312508151, + "grad_norm": 0.10595703125, + "learning_rate": 2.046788629315707e-05, + "lm_loss": 1.8541, + "loss": 1.9886, + "mask_loss": 0.1247, + "step": 2004, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7969047899989442, + "grad_norm": 0.1298828125, + "learning_rate": 2.0390693429435627e-05, + "lm_loss": 1.9962, + "loss": 2.1304, + "mask_loss": 0.1244, + "step": 2005, + "topk_loss": 0.0098 + }, + { + "epoch": 0.7973022487470734, + "grad_norm": 0.1376953125, + "learning_rate": 2.031362987499027e-05, + "lm_loss": 1.9665, + "loss": 2.0999, + "mask_loss": 0.1246, + "step": 2006, + "topk_loss": 0.0089 + }, + { + "epoch": 0.7976997074952026, + "grad_norm": 0.11572265625, + "learning_rate": 2.023669575499526e-05, + "lm_loss": 1.9415, + "loss": 2.0783, + "mask_loss": 0.1253, + "step": 2007, + "topk_loss": 0.0115 + }, + { + "epoch": 0.7980971662433317, + "grad_norm": 0.1220703125, + "learning_rate": 2.0159891194414504e-05, + "lm_loss": 1.9272, + "loss": 2.0644, + "mask_loss": 0.1267, + "step": 2008, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7984946249914608, + "grad_norm": 0.1171875, + "learning_rate": 2.0083216318001564e-05, + "lm_loss": 1.9089, + "loss": 2.0452, + "mask_loss": 0.1251, + "step": 2009, + "topk_loss": 0.0112 + }, + { + "epoch": 0.79889208373959, + "grad_norm": 0.1123046875, + "learning_rate": 2.0006671250299337e-05, + "lm_loss": 1.8893, + "loss": 2.0264, + "mask_loss": 0.1266, + "step": 2010, + "topk_loss": 0.0105 + }, + { + "epoch": 0.7992895424877191, + "grad_norm": 0.125, + "learning_rate": 1.9930256115639832e-05, + "lm_loss": 1.7778, + "loss": 1.916, + "mask_loss": 0.1284, + "step": 2011, + "topk_loss": 0.0099 + }, + { + "epoch": 0.7996870012358483, + "grad_norm": 0.11328125, + "learning_rate": 1.985397103814407e-05, + "lm_loss": 1.8974, + "loss": 2.0344, + "mask_loss": 0.1263, + "step": 2012, + "topk_loss": 0.0107 + }, + { + "epoch": 0.8000844599839775, + "grad_norm": 0.1337890625, + "learning_rate": 1.977781614172176e-05, + "lm_loss": 1.8861, + "loss": 2.0212, + "mask_loss": 0.1253, + "step": 2013, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8004819187321066, + "grad_norm": 0.11669921875, + "learning_rate": 1.9701791550071202e-05, + "lm_loss": 1.8094, + "loss": 1.9439, + "mask_loss": 0.1254, + "step": 2014, + "topk_loss": 0.009 + }, + { + "epoch": 0.8008793774802357, + "grad_norm": 0.11865234375, + "learning_rate": 1.9625897386679038e-05, + "lm_loss": 1.8877, + "loss": 2.0223, + "mask_loss": 0.1257, + "step": 2015, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8012768362283649, + "grad_norm": 0.1103515625, + "learning_rate": 1.9550133774820002e-05, + "lm_loss": 1.9153, + "loss": 2.0523, + "mask_loss": 0.1264, + "step": 2016, + "topk_loss": 0.0106 + }, + { + "epoch": 0.801674294976494, + "grad_norm": 0.1357421875, + "learning_rate": 1.9474500837556842e-05, + "lm_loss": 1.9657, + "loss": 2.0985, + "mask_loss": 0.1237, + "step": 2017, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8020717537246231, + "grad_norm": 0.1171875, + "learning_rate": 1.9398998697740002e-05, + "lm_loss": 1.8875, + "loss": 2.0217, + "mask_loss": 0.1255, + "step": 2018, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8024692124727524, + "grad_norm": 0.1103515625, + "learning_rate": 1.9323627478007522e-05, + "lm_loss": 1.9256, + "loss": 2.0609, + "mask_loss": 0.1257, + "step": 2019, + "topk_loss": 0.0096 + }, + { + "epoch": 0.8028666712208815, + "grad_norm": 0.1123046875, + "learning_rate": 1.924838730078474e-05, + "lm_loss": 1.8953, + "loss": 2.0295, + "mask_loss": 0.125, + "step": 2020, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8032641299690106, + "grad_norm": 0.11962890625, + "learning_rate": 1.917327828828417e-05, + "lm_loss": 1.9119, + "loss": 2.049, + "mask_loss": 0.1267, + "step": 2021, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8036615887171398, + "grad_norm": 0.11865234375, + "learning_rate": 1.9098300562505266e-05, + "lm_loss": 1.9271, + "loss": 2.0639, + "mask_loss": 0.1258, + "step": 2022, + "topk_loss": 0.0109 + }, + { + "epoch": 0.8040590474652689, + "grad_norm": 0.123046875, + "learning_rate": 1.902345424523423e-05, + "lm_loss": 1.9097, + "loss": 2.048, + "mask_loss": 0.1268, + "step": 2023, + "topk_loss": 0.0115 + }, + { + "epoch": 0.804456506213398, + "grad_norm": 0.111328125, + "learning_rate": 1.894873945804383e-05, + "lm_loss": 1.9408, + "loss": 2.0737, + "mask_loss": 0.1237, + "step": 2024, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8048539649615273, + "grad_norm": 0.10791015625, + "learning_rate": 1.887415632229318e-05, + "lm_loss": 1.8618, + "loss": 1.9936, + "mask_loss": 0.123, + "step": 2025, + "topk_loss": 0.0088 + }, + { + "epoch": 0.8052514237096564, + "grad_norm": 0.1435546875, + "learning_rate": 1.879970495912755e-05, + "lm_loss": 1.9102, + "loss": 2.05, + "mask_loss": 0.1281, + "step": 2026, + "topk_loss": 0.0117 + }, + { + "epoch": 0.8056488824577855, + "grad_norm": 0.11474609375, + "learning_rate": 1.8725385489478176e-05, + "lm_loss": 1.8907, + "loss": 2.028, + "mask_loss": 0.1261, + "step": 2027, + "topk_loss": 0.0112 + }, + { + "epoch": 0.8060463412059147, + "grad_norm": 0.1455078125, + "learning_rate": 1.8651198034062058e-05, + "lm_loss": 1.9389, + "loss": 2.0798, + "mask_loss": 0.1271, + "step": 2028, + "topk_loss": 0.0138 + }, + { + "epoch": 0.8064437999540438, + "grad_norm": 0.12890625, + "learning_rate": 1.857714271338178e-05, + "lm_loss": 1.9314, + "loss": 2.0663, + "mask_loss": 0.1254, + "step": 2029, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8068412587021729, + "grad_norm": 0.11279296875, + "learning_rate": 1.850321964772528e-05, + "lm_loss": 1.9748, + "loss": 2.1091, + "mask_loss": 0.1245, + "step": 2030, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8072387174503022, + "grad_norm": 0.1171875, + "learning_rate": 1.8429428957165696e-05, + "lm_loss": 1.8888, + "loss": 2.0244, + "mask_loss": 0.1255, + "step": 2031, + "topk_loss": 0.0102 + }, + { + "epoch": 0.8076361761984313, + "grad_norm": 0.11865234375, + "learning_rate": 1.8355770761561098e-05, + "lm_loss": 1.9131, + "loss": 2.0502, + "mask_loss": 0.1255, + "step": 2032, + "topk_loss": 0.0116 + }, + { + "epoch": 0.8080336349465604, + "grad_norm": 0.11376953125, + "learning_rate": 1.8282245180554413e-05, + "lm_loss": 1.934, + "loss": 2.0718, + "mask_loss": 0.1279, + "step": 2033, + "topk_loss": 0.01 + }, + { + "epoch": 0.8084310936946896, + "grad_norm": 0.1162109375, + "learning_rate": 1.820885233357311e-05, + "lm_loss": 1.9349, + "loss": 2.0669, + "mask_loss": 0.1228, + "step": 2034, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8088285524428187, + "grad_norm": 0.11669921875, + "learning_rate": 1.8135592339829098e-05, + "lm_loss": 1.9668, + "loss": 2.1008, + "mask_loss": 0.1246, + "step": 2035, + "topk_loss": 0.0094 + }, + { + "epoch": 0.8092260111909478, + "grad_norm": 0.11865234375, + "learning_rate": 1.8062465318318454e-05, + "lm_loss": 1.9336, + "loss": 2.0664, + "mask_loss": 0.1231, + "step": 2036, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8096234699390771, + "grad_norm": 0.1162109375, + "learning_rate": 1.798947138782131e-05, + "lm_loss": 1.8986, + "loss": 2.0331, + "mask_loss": 0.1257, + "step": 2037, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8100209286872062, + "grad_norm": 0.109375, + "learning_rate": 1.791661066690159e-05, + "lm_loss": 1.8422, + "loss": 1.9779, + "mask_loss": 0.1263, + "step": 2038, + "topk_loss": 0.0094 + }, + { + "epoch": 0.8104183874353353, + "grad_norm": 0.1201171875, + "learning_rate": 1.784388327390687e-05, + "lm_loss": 1.8936, + "loss": 2.0325, + "mask_loss": 0.1274, + "step": 2039, + "topk_loss": 0.0116 + }, + { + "epoch": 0.8108158461834645, + "grad_norm": 0.115234375, + "learning_rate": 1.7771289326968098e-05, + "lm_loss": 1.9652, + "loss": 2.096, + "mask_loss": 0.1214, + "step": 2040, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8112133049315936, + "grad_norm": 0.1162109375, + "learning_rate": 1.7698828943999545e-05, + "lm_loss": 1.8781, + "loss": 2.0122, + "mask_loss": 0.1251, + "step": 2041, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8116107636797227, + "grad_norm": 0.11376953125, + "learning_rate": 1.7626502242698484e-05, + "lm_loss": 1.8796, + "loss": 2.0157, + "mask_loss": 0.1256, + "step": 2042, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8120082224278519, + "grad_norm": 0.115234375, + "learning_rate": 1.7554309340545084e-05, + "lm_loss": 1.8795, + "loss": 2.0135, + "mask_loss": 0.1251, + "step": 2043, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8124056811759811, + "grad_norm": 0.1181640625, + "learning_rate": 1.7482250354802156e-05, + "lm_loss": 1.896, + "loss": 2.0344, + "mask_loss": 0.1283, + "step": 2044, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8128031399241102, + "grad_norm": 0.1298828125, + "learning_rate": 1.7410325402515003e-05, + "lm_loss": 1.859, + "loss": 1.9968, + "mask_loss": 0.128, + "step": 2045, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8132005986722394, + "grad_norm": 0.11865234375, + "learning_rate": 1.7338534600511224e-05, + "lm_loss": 1.9085, + "loss": 2.0445, + "mask_loss": 0.1256, + "step": 2046, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8135980574203685, + "grad_norm": 0.1083984375, + "learning_rate": 1.7266878065400527e-05, + "lm_loss": 1.9167, + "loss": 2.0494, + "mask_loss": 0.1244, + "step": 2047, + "topk_loss": 0.0083 + }, + { + "epoch": 0.8139955161684976, + "grad_norm": 0.11376953125, + "learning_rate": 1.719535591357446e-05, + "lm_loss": 1.8987, + "loss": 2.0348, + "mask_loss": 0.1256, + "step": 2048, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8143929749166268, + "grad_norm": 0.1123046875, + "learning_rate": 1.712396826120639e-05, + "lm_loss": 1.8859, + "loss": 2.0243, + "mask_loss": 0.1277, + "step": 2049, + "topk_loss": 0.0106 + }, + { + "epoch": 0.814790433664756, + "grad_norm": 0.11279296875, + "learning_rate": 1.7052715224251147e-05, + "lm_loss": 1.9431, + "loss": 2.0784, + "mask_loss": 0.1251, + "step": 2050, + "topk_loss": 0.0102 + }, + { + "epoch": 0.814790433664756, + "eval_lm_loss": 687.176025390625, + "eval_loss": 687.31298828125, + "eval_mask_hit_rate": 0.5359328389167786, + "eval_mask_loss": 0.12440869212150574, + "eval_mask_top_10_hit_rate": 0.9855953454971313, + "eval_mask_top_1_hit_rate": 0.9974322319030762, + "eval_mask_top_20_hit_rate": 0.9762266874313354, + "eval_mask_top_5_hit_rate": 0.9908630847930908, + "eval_runtime": 144.3186, + "eval_samples_per_second": 14.191, + "eval_steps_per_second": 7.095, + "eval_token_accuracy": 0.6145559549331665, + "eval_top_k_diff": -528.7608642578125, + "eval_topk_loss": 0.0125643415376544, + "step": 2050 + }, + { + "epoch": 0.8151878924128851, + "grad_norm": 0.11865234375, + "learning_rate": 1.6981596918444953e-05, + "lm_loss": 1.9253, + "loss": 2.0592, + "mask_loss": 0.1247, + "step": 2051, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8155853511610143, + "grad_norm": 0.12109375, + "learning_rate": 1.6910613459305146e-05, + "lm_loss": 1.9252, + "loss": 2.0608, + "mask_loss": 0.1256, + "step": 2052, + "topk_loss": 0.01 + }, + { + "epoch": 0.8159828099091434, + "grad_norm": 0.130859375, + "learning_rate": 1.6839764962130057e-05, + "lm_loss": 1.7789, + "loss": 1.9157, + "mask_loss": 0.1272, + "step": 2053, + "topk_loss": 0.0096 + }, + { + "epoch": 0.8163802686572725, + "grad_norm": 0.11865234375, + "learning_rate": 1.6769051541998803e-05, + "lm_loss": 1.8885, + "loss": 2.0259, + "mask_loss": 0.1276, + "step": 2054, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8167777274054017, + "grad_norm": 0.11279296875, + "learning_rate": 1.669847331377109e-05, + "lm_loss": 1.9007, + "loss": 2.0387, + "mask_loss": 0.1283, + "step": 2055, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8171751861535309, + "grad_norm": 0.1171875, + "learning_rate": 1.6628030392087e-05, + "lm_loss": 1.9148, + "loss": 2.0535, + "mask_loss": 0.1284, + "step": 2056, + "topk_loss": 0.0103 + }, + { + "epoch": 0.81757264490166, + "grad_norm": 0.1181640625, + "learning_rate": 1.6557722891366878e-05, + "lm_loss": 1.9268, + "loss": 2.0637, + "mask_loss": 0.1273, + "step": 2057, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8179701036497892, + "grad_norm": 0.1220703125, + "learning_rate": 1.6487550925811092e-05, + "lm_loss": 1.8928, + "loss": 2.0282, + "mask_loss": 0.1265, + "step": 2058, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8183675623979183, + "grad_norm": 0.1201171875, + "learning_rate": 1.6417514609399865e-05, + "lm_loss": 1.958, + "loss": 2.0927, + "mask_loss": 0.1248, + "step": 2059, + "topk_loss": 0.0099 + }, + { + "epoch": 0.8187650211460474, + "grad_norm": 0.12890625, + "learning_rate": 1.6347614055893055e-05, + "lm_loss": 1.8695, + "loss": 2.0049, + "mask_loss": 0.1256, + "step": 2060, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8191624798941766, + "grad_norm": 0.12255859375, + "learning_rate": 1.6277849378830057e-05, + "lm_loss": 1.8947, + "loss": 2.0258, + "mask_loss": 0.123, + "step": 2061, + "topk_loss": 0.0081 + }, + { + "epoch": 0.8195599386423058, + "grad_norm": 0.1083984375, + "learning_rate": 1.620822069152952e-05, + "lm_loss": 1.9621, + "loss": 2.0934, + "mask_loss": 0.1222, + "step": 2062, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8199573973904349, + "grad_norm": 0.11376953125, + "learning_rate": 1.613872810708921e-05, + "lm_loss": 1.8747, + "loss": 2.0133, + "mask_loss": 0.1287, + "step": 2063, + "topk_loss": 0.0099 + }, + { + "epoch": 0.8203548561385641, + "grad_norm": 0.1416015625, + "learning_rate": 1.606937173838582e-05, + "lm_loss": 1.7899, + "loss": 1.927, + "mask_loss": 0.1278, + "step": 2064, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8207523148866932, + "grad_norm": 0.10888671875, + "learning_rate": 1.6000151698074816e-05, + "lm_loss": 1.9232, + "loss": 2.0579, + "mask_loss": 0.125, + "step": 2065, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8211497736348223, + "grad_norm": 0.125, + "learning_rate": 1.5931068098590186e-05, + "lm_loss": 1.8724, + "loss": 2.007, + "mask_loss": 0.1244, + "step": 2066, + "topk_loss": 0.0103 + }, + { + "epoch": 0.8215472323829515, + "grad_norm": 0.12353515625, + "learning_rate": 1.586212105214432e-05, + "lm_loss": 1.9352, + "loss": 2.0713, + "mask_loss": 0.1256, + "step": 2067, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8219446911310807, + "grad_norm": 0.1103515625, + "learning_rate": 1.5793310670727814e-05, + "lm_loss": 1.8989, + "loss": 2.0332, + "mask_loss": 0.124, + "step": 2068, + "topk_loss": 0.0103 + }, + { + "epoch": 0.8223421498792098, + "grad_norm": 0.12109375, + "learning_rate": 1.5724637066109248e-05, + "lm_loss": 1.9171, + "loss": 2.0565, + "mask_loss": 0.1268, + "step": 2069, + "topk_loss": 0.0126 + }, + { + "epoch": 0.822739608627339, + "grad_norm": 0.10302734375, + "learning_rate": 1.5656100349835057e-05, + "lm_loss": 1.9178, + "loss": 2.0523, + "mask_loss": 0.1252, + "step": 2070, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8231370673754681, + "grad_norm": 0.1123046875, + "learning_rate": 1.5587700633229363e-05, + "lm_loss": 1.8394, + "loss": 1.9755, + "mask_loss": 0.1272, + "step": 2071, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8235345261235972, + "grad_norm": 0.10595703125, + "learning_rate": 1.5519438027393662e-05, + "lm_loss": 1.8679, + "loss": 1.998, + "mask_loss": 0.1218, + "step": 2072, + "topk_loss": 0.0083 + }, + { + "epoch": 0.8239319848717264, + "grad_norm": 0.1279296875, + "learning_rate": 1.5451312643206827e-05, + "lm_loss": 1.9522, + "loss": 2.088, + "mask_loss": 0.1257, + "step": 2073, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8243294436198555, + "grad_norm": 0.1123046875, + "learning_rate": 1.538332459132482e-05, + "lm_loss": 1.8928, + "loss": 2.0295, + "mask_loss": 0.1271, + "step": 2074, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8247269023679847, + "grad_norm": 0.12158203125, + "learning_rate": 1.531547398218053e-05, + "lm_loss": 1.8445, + "loss": 1.9775, + "mask_loss": 0.1244, + "step": 2075, + "topk_loss": 0.0085 + }, + { + "epoch": 0.8251243611161139, + "grad_norm": 0.111328125, + "learning_rate": 1.5247760925983601e-05, + "lm_loss": 1.9218, + "loss": 2.0537, + "mask_loss": 0.1243, + "step": 2076, + "topk_loss": 0.0077 + }, + { + "epoch": 0.825521819864243, + "grad_norm": 0.11083984375, + "learning_rate": 1.5180185532720237e-05, + "lm_loss": 1.9045, + "loss": 2.0409, + "mask_loss": 0.126, + "step": 2077, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8259192786123721, + "grad_norm": 0.11962890625, + "learning_rate": 1.5112747912153057e-05, + "lm_loss": 1.8387, + "loss": 1.9758, + "mask_loss": 0.1274, + "step": 2078, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8263167373605013, + "grad_norm": 0.11376953125, + "learning_rate": 1.5045448173820908e-05, + "lm_loss": 1.8736, + "loss": 2.0055, + "mask_loss": 0.1237, + "step": 2079, + "topk_loss": 0.0081 + }, + { + "epoch": 0.8267141961086304, + "grad_norm": 0.11376953125, + "learning_rate": 1.4978286427038601e-05, + "lm_loss": 1.971, + "loss": 2.1062, + "mask_loss": 0.1252, + "step": 2080, + "topk_loss": 0.01 + }, + { + "epoch": 0.8271116548567596, + "grad_norm": 0.11767578125, + "learning_rate": 1.4911262780896884e-05, + "lm_loss": 1.8702, + "loss": 2.0083, + "mask_loss": 0.128, + "step": 2081, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8275091136048888, + "grad_norm": 0.1298828125, + "learning_rate": 1.4844377344262172e-05, + "lm_loss": 1.895, + "loss": 2.0298, + "mask_loss": 0.1256, + "step": 2082, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8279065723530179, + "grad_norm": 0.11181640625, + "learning_rate": 1.4777630225776374e-05, + "lm_loss": 1.8548, + "loss": 1.9897, + "mask_loss": 0.1253, + "step": 2083, + "topk_loss": 0.0096 + }, + { + "epoch": 0.828304031101147, + "grad_norm": 0.123046875, + "learning_rate": 1.4711021533856728e-05, + "lm_loss": 1.9464, + "loss": 2.0808, + "mask_loss": 0.1257, + "step": 2084, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8287014898492762, + "grad_norm": 0.173828125, + "learning_rate": 1.4644551376695636e-05, + "lm_loss": 1.8577, + "loss": 1.9984, + "mask_loss": 0.1267, + "step": 2085, + "topk_loss": 0.014 + }, + { + "epoch": 0.8290989485974053, + "grad_norm": 0.1083984375, + "learning_rate": 1.4578219862260478e-05, + "lm_loss": 1.8929, + "loss": 2.0281, + "mask_loss": 0.1265, + "step": 2086, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8294964073455345, + "grad_norm": 0.11572265625, + "learning_rate": 1.4512027098293445e-05, + "lm_loss": 1.8918, + "loss": 2.0291, + "mask_loss": 0.126, + "step": 2087, + "topk_loss": 0.0113 + }, + { + "epoch": 0.8298938660936637, + "grad_norm": 0.107421875, + "learning_rate": 1.4445973192311312e-05, + "lm_loss": 1.8956, + "loss": 2.03, + "mask_loss": 0.1252, + "step": 2088, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8302913248417928, + "grad_norm": 0.11181640625, + "learning_rate": 1.4380058251605343e-05, + "lm_loss": 1.873, + "loss": 2.01, + "mask_loss": 0.1255, + "step": 2089, + "topk_loss": 0.0115 + }, + { + "epoch": 0.8306887835899219, + "grad_norm": 0.126953125, + "learning_rate": 1.4314282383241096e-05, + "lm_loss": 1.9036, + "loss": 2.0418, + "mask_loss": 0.1276, + "step": 2090, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8310862423380511, + "grad_norm": 0.1513671875, + "learning_rate": 1.4248645694058193e-05, + "lm_loss": 1.8892, + "loss": 2.0246, + "mask_loss": 0.1255, + "step": 2091, + "topk_loss": 0.0099 + }, + { + "epoch": 0.8314837010861802, + "grad_norm": 0.1083984375, + "learning_rate": 1.4183148290670223e-05, + "lm_loss": 1.9051, + "loss": 2.0387, + "mask_loss": 0.1245, + "step": 2092, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8318811598343094, + "grad_norm": 0.1298828125, + "learning_rate": 1.4117790279464526e-05, + "lm_loss": 1.8981, + "loss": 2.0332, + "mask_loss": 0.1252, + "step": 2093, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8322786185824386, + "grad_norm": 0.126953125, + "learning_rate": 1.4052571766601996e-05, + "lm_loss": 1.8772, + "loss": 2.0117, + "mask_loss": 0.1258, + "step": 2094, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8326760773305677, + "grad_norm": 0.125, + "learning_rate": 1.3987492858016994e-05, + "lm_loss": 1.9037, + "loss": 2.0373, + "mask_loss": 0.1247, + "step": 2095, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8330735360786968, + "grad_norm": 0.1318359375, + "learning_rate": 1.392255365941707e-05, + "lm_loss": 1.9019, + "loss": 2.0371, + "mask_loss": 0.125, + "step": 2096, + "topk_loss": 0.0102 + }, + { + "epoch": 0.833470994826826, + "grad_norm": 0.11083984375, + "learning_rate": 1.3857754276282875e-05, + "lm_loss": 1.813, + "loss": 1.9519, + "mask_loss": 0.1284, + "step": 2097, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8338684535749551, + "grad_norm": 0.12890625, + "learning_rate": 1.3793094813867947e-05, + "lm_loss": 1.9538, + "loss": 2.0897, + "mask_loss": 0.1252, + "step": 2098, + "topk_loss": 0.0108 + }, + { + "epoch": 0.8342659123230842, + "grad_norm": 0.11572265625, + "learning_rate": 1.372857537719855e-05, + "lm_loss": 1.9157, + "loss": 2.0535, + "mask_loss": 0.1253, + "step": 2099, + "topk_loss": 0.0126 + }, + { + "epoch": 0.8346633710712135, + "grad_norm": 0.1376953125, + "learning_rate": 1.3664196071073521e-05, + "lm_loss": 1.8927, + "loss": 2.0387, + "mask_loss": 0.1296, + "step": 2100, + "topk_loss": 0.0164 + }, + { + "epoch": 0.8346633710712135, + "eval_lm_loss": 687.4456787109375, + "eval_loss": 687.58251953125, + "eval_mask_hit_rate": 0.5359505414962769, + "eval_mask_loss": 0.12440785765647888, + "eval_mask_top_10_hit_rate": 0.9855968952178955, + "eval_mask_top_1_hit_rate": 0.9974315166473389, + "eval_mask_top_20_hit_rate": 0.9762290120124817, + "eval_mask_top_5_hit_rate": 0.9908639192581177, + "eval_runtime": 144.4261, + "eval_samples_per_second": 14.18, + "eval_steps_per_second": 7.09, + "eval_token_accuracy": 0.6145721673965454, + "eval_top_k_diff": -530.545166015625, + "eval_topk_loss": 0.012532277032732964, + "step": 2100 + }, + { + "epoch": 0.8350608298193426, + "grad_norm": 0.1259765625, + "learning_rate": 1.3599957000064057e-05, + "lm_loss": 1.9435, + "loss": 2.0832, + "mask_loss": 0.1273, + "step": 2101, + "topk_loss": 0.0125 + }, + { + "epoch": 0.8354582885674717, + "grad_norm": 0.111328125, + "learning_rate": 1.353585826851358e-05, + "lm_loss": 1.9445, + "loss": 2.0803, + "mask_loss": 0.1255, + "step": 2102, + "topk_loss": 0.0103 + }, + { + "epoch": 0.8358557473156009, + "grad_norm": 0.10986328125, + "learning_rate": 1.3471899980537594e-05, + "lm_loss": 1.8861, + "loss": 2.0213, + "mask_loss": 0.126, + "step": 2103, + "topk_loss": 0.0092 + }, + { + "epoch": 0.83625320606373, + "grad_norm": 0.1103515625, + "learning_rate": 1.3408082240023412e-05, + "lm_loss": 1.8881, + "loss": 2.0225, + "mask_loss": 0.1237, + "step": 2104, + "topk_loss": 0.0107 + }, + { + "epoch": 0.8366506648118591, + "grad_norm": 0.10791015625, + "learning_rate": 1.334440515063009e-05, + "lm_loss": 1.9355, + "loss": 2.0751, + "mask_loss": 0.1289, + "step": 2105, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8370481235599884, + "grad_norm": 0.11083984375, + "learning_rate": 1.3280868815788249e-05, + "lm_loss": 1.8827, + "loss": 2.018, + "mask_loss": 0.126, + "step": 2106, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8374455823081175, + "grad_norm": 0.1083984375, + "learning_rate": 1.3217473338699859e-05, + "lm_loss": 1.9252, + "loss": 2.0583, + "mask_loss": 0.1246, + "step": 2107, + "topk_loss": 0.0085 + }, + { + "epoch": 0.8378430410562466, + "grad_norm": 0.10888671875, + "learning_rate": 1.3154218822338094e-05, + "lm_loss": 1.8615, + "loss": 1.9949, + "mask_loss": 0.1244, + "step": 2108, + "topk_loss": 0.009 + }, + { + "epoch": 0.8382404998043758, + "grad_norm": 0.10888671875, + "learning_rate": 1.3091105369447165e-05, + "lm_loss": 1.9219, + "loss": 2.0596, + "mask_loss": 0.1279, + "step": 2109, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8386379585525049, + "grad_norm": 0.11279296875, + "learning_rate": 1.3028133082542172e-05, + "lm_loss": 1.9221, + "loss": 2.0598, + "mask_loss": 0.1286, + "step": 2110, + "topk_loss": 0.0091 + }, + { + "epoch": 0.839035417300634, + "grad_norm": 0.1083984375, + "learning_rate": 1.2965302063908902e-05, + "lm_loss": 1.9019, + "loss": 2.035, + "mask_loss": 0.1242, + "step": 2111, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8394328760487633, + "grad_norm": 0.1416015625, + "learning_rate": 1.2902612415603665e-05, + "lm_loss": 1.9063, + "loss": 2.0412, + "mask_loss": 0.1248, + "step": 2112, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8398303347968924, + "grad_norm": 0.126953125, + "learning_rate": 1.2840064239453176e-05, + "lm_loss": 1.898, + "loss": 2.0321, + "mask_loss": 0.1235, + "step": 2113, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8402277935450215, + "grad_norm": 0.1318359375, + "learning_rate": 1.277765763705434e-05, + "lm_loss": 1.9033, + "loss": 2.0409, + "mask_loss": 0.1264, + "step": 2114, + "topk_loss": 0.0112 + }, + { + "epoch": 0.8406252522931507, + "grad_norm": 0.1083984375, + "learning_rate": 1.2715392709774099e-05, + "lm_loss": 1.9219, + "loss": 2.0598, + "mask_loss": 0.1271, + "step": 2115, + "topk_loss": 0.0109 + }, + { + "epoch": 0.8410227110412798, + "grad_norm": 0.1943359375, + "learning_rate": 1.2653269558749292e-05, + "lm_loss": 1.8536, + "loss": 1.9936, + "mask_loss": 0.128, + "step": 2116, + "topk_loss": 0.012 + }, + { + "epoch": 0.8414201697894089, + "grad_norm": 0.1337890625, + "learning_rate": 1.259128828488646e-05, + "lm_loss": 1.8852, + "loss": 2.0253, + "mask_loss": 0.1286, + "step": 2117, + "topk_loss": 0.0115 + }, + { + "epoch": 0.8418176285375382, + "grad_norm": 0.125, + "learning_rate": 1.252944898886168e-05, + "lm_loss": 1.9582, + "loss": 2.0964, + "mask_loss": 0.1274, + "step": 2118, + "topk_loss": 0.0108 + }, + { + "epoch": 0.8422150872856673, + "grad_norm": 0.11083984375, + "learning_rate": 1.2467751771120462e-05, + "lm_loss": 1.9098, + "loss": 2.0448, + "mask_loss": 0.1252, + "step": 2119, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8426125460337964, + "grad_norm": 0.10693359375, + "learning_rate": 1.2406196731877462e-05, + "lm_loss": 1.8932, + "loss": 2.0318, + "mask_loss": 0.1289, + "step": 2120, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8430100047819256, + "grad_norm": 0.142578125, + "learning_rate": 1.2344783971116436e-05, + "lm_loss": 1.8694, + "loss": 2.0124, + "mask_loss": 0.1287, + "step": 2121, + "topk_loss": 0.0143 + }, + { + "epoch": 0.8434074635300547, + "grad_norm": 0.111328125, + "learning_rate": 1.2283513588590067e-05, + "lm_loss": 1.8623, + "loss": 2.0009, + "mask_loss": 0.1279, + "step": 2122, + "topk_loss": 0.0107 + }, + { + "epoch": 0.8438049222781838, + "grad_norm": 0.1298828125, + "learning_rate": 1.2222385683819714e-05, + "lm_loss": 2.0004, + "loss": 2.1375, + "mask_loss": 0.1256, + "step": 2123, + "topk_loss": 0.0114 + }, + { + "epoch": 0.8442023810263131, + "grad_norm": 0.111328125, + "learning_rate": 1.2161400356095375e-05, + "lm_loss": 1.9151, + "loss": 2.0496, + "mask_loss": 0.1242, + "step": 2124, + "topk_loss": 0.0103 + }, + { + "epoch": 0.8445998397744422, + "grad_norm": 0.140625, + "learning_rate": 1.2100557704475401e-05, + "lm_loss": 1.8272, + "loss": 1.9643, + "mask_loss": 0.1275, + "step": 2125, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8449972985225713, + "grad_norm": 0.11083984375, + "learning_rate": 1.2039857827786416e-05, + "lm_loss": 1.9475, + "loss": 2.0824, + "mask_loss": 0.1259, + "step": 2126, + "topk_loss": 0.009 + }, + { + "epoch": 0.8453947572707005, + "grad_norm": 0.11669921875, + "learning_rate": 1.1979300824623163e-05, + "lm_loss": 1.9046, + "loss": 2.0357, + "mask_loss": 0.1225, + "step": 2127, + "topk_loss": 0.0086 + }, + { + "epoch": 0.8457922160188296, + "grad_norm": 0.1181640625, + "learning_rate": 1.191888679334826e-05, + "lm_loss": 1.9204, + "loss": 2.0547, + "mask_loss": 0.1249, + "step": 2128, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8461896747669587, + "grad_norm": 0.10595703125, + "learning_rate": 1.1858615832092156e-05, + "lm_loss": 1.8909, + "loss": 2.0276, + "mask_loss": 0.1274, + "step": 2129, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8465871335150879, + "grad_norm": 0.1298828125, + "learning_rate": 1.1798488038752853e-05, + "lm_loss": 1.9399, + "loss": 2.0785, + "mask_loss": 0.1252, + "step": 2130, + "topk_loss": 0.0134 + }, + { + "epoch": 0.8469845922632171, + "grad_norm": 0.130859375, + "learning_rate": 1.1738503510995857e-05, + "lm_loss": 1.9269, + "loss": 2.0617, + "mask_loss": 0.1252, + "step": 2131, + "topk_loss": 0.0096 + }, + { + "epoch": 0.8473820510113462, + "grad_norm": 0.11376953125, + "learning_rate": 1.1678662346253933e-05, + "lm_loss": 1.9157, + "loss": 2.0515, + "mask_loss": 0.1258, + "step": 2132, + "topk_loss": 0.01 + }, + { + "epoch": 0.8477795097594754, + "grad_norm": 0.115234375, + "learning_rate": 1.1618964641727004e-05, + "lm_loss": 1.8925, + "loss": 2.0287, + "mask_loss": 0.1263, + "step": 2133, + "topk_loss": 0.0099 + }, + { + "epoch": 0.8481769685076045, + "grad_norm": 0.10986328125, + "learning_rate": 1.1559410494381951e-05, + "lm_loss": 1.9129, + "loss": 2.0473, + "mask_loss": 0.1256, + "step": 2134, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8485744272557336, + "grad_norm": 0.1455078125, + "learning_rate": 1.1500000000952516e-05, + "lm_loss": 1.9484, + "loss": 2.0885, + "mask_loss": 0.1276, + "step": 2135, + "topk_loss": 0.0126 + }, + { + "epoch": 0.8489718860038628, + "grad_norm": 0.11328125, + "learning_rate": 1.1440733257939018e-05, + "lm_loss": 1.923, + "loss": 2.0559, + "mask_loss": 0.1241, + "step": 2136, + "topk_loss": 0.0087 + }, + { + "epoch": 0.849369344751992, + "grad_norm": 0.130859375, + "learning_rate": 1.1381610361608374e-05, + "lm_loss": 1.8942, + "loss": 2.0304, + "mask_loss": 0.1254, + "step": 2137, + "topk_loss": 0.0109 + }, + { + "epoch": 0.8497668035001211, + "grad_norm": 0.11962890625, + "learning_rate": 1.1322631407993811e-05, + "lm_loss": 1.9726, + "loss": 2.1108, + "mask_loss": 0.1256, + "step": 2138, + "topk_loss": 0.0125 + }, + { + "epoch": 0.8501642622482503, + "grad_norm": 0.11474609375, + "learning_rate": 1.1263796492894751e-05, + "lm_loss": 1.9186, + "loss": 2.053, + "mask_loss": 0.1252, + "step": 2139, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8505617209963794, + "grad_norm": 0.10888671875, + "learning_rate": 1.1205105711876651e-05, + "lm_loss": 1.8365, + "loss": 1.9712, + "mask_loss": 0.1248, + "step": 2140, + "topk_loss": 0.0099 + }, + { + "epoch": 0.8509591797445085, + "grad_norm": 0.12060546875, + "learning_rate": 1.1146559160270875e-05, + "lm_loss": 1.8962, + "loss": 2.0291, + "mask_loss": 0.1239, + "step": 2141, + "topk_loss": 0.009 + }, + { + "epoch": 0.8513566384926377, + "grad_norm": 0.11279296875, + "learning_rate": 1.1088156933174487e-05, + "lm_loss": 1.8732, + "loss": 2.0136, + "mask_loss": 0.1299, + "step": 2142, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8517540972407669, + "grad_norm": 0.1103515625, + "learning_rate": 1.102989912545015e-05, + "lm_loss": 1.9038, + "loss": 2.0408, + "mask_loss": 0.127, + "step": 2143, + "topk_loss": 0.0099 + }, + { + "epoch": 0.852151555988896, + "grad_norm": 0.11181640625, + "learning_rate": 1.0971785831725901e-05, + "lm_loss": 1.8961, + "loss": 2.0278, + "mask_loss": 0.1231, + "step": 2144, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8525490147370252, + "grad_norm": 0.111328125, + "learning_rate": 1.0913817146395088e-05, + "lm_loss": 1.9004, + "loss": 2.0378, + "mask_loss": 0.1271, + "step": 2145, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8529464734851543, + "grad_norm": 0.125, + "learning_rate": 1.0855993163616174e-05, + "lm_loss": 1.9038, + "loss": 2.0462, + "mask_loss": 0.1294, + "step": 2146, + "topk_loss": 0.0129 + }, + { + "epoch": 0.8533439322332834, + "grad_norm": 0.11669921875, + "learning_rate": 1.0798313977312557e-05, + "lm_loss": 1.9402, + "loss": 2.0766, + "mask_loss": 0.1274, + "step": 2147, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8537413909814126, + "grad_norm": 0.11083984375, + "learning_rate": 1.0740779681172453e-05, + "lm_loss": 1.9036, + "loss": 2.0357, + "mask_loss": 0.1231, + "step": 2148, + "topk_loss": 0.009 + }, + { + "epoch": 0.8541388497295418, + "grad_norm": 0.1611328125, + "learning_rate": 1.0683390368648726e-05, + "lm_loss": 1.9036, + "loss": 2.0408, + "mask_loss": 0.1265, + "step": 2149, + "topk_loss": 0.0107 + }, + { + "epoch": 0.8545363084776709, + "grad_norm": 0.11328125, + "learning_rate": 1.0626146132958759e-05, + "lm_loss": 1.8856, + "loss": 2.0193, + "mask_loss": 0.1244, + "step": 2150, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8545363084776709, + "eval_lm_loss": 687.17578125, + "eval_loss": 687.3126831054688, + "eval_mask_hit_rate": 0.5359647274017334, + "eval_mask_loss": 0.12439900636672974, + "eval_mask_top_10_hit_rate": 0.9855988025665283, + "eval_mask_top_1_hit_rate": 0.9974305629730225, + "eval_mask_top_20_hit_rate": 0.9762325286865234, + "eval_mask_top_5_hit_rate": 0.9908649921417236, + "eval_runtime": 143.8341, + "eval_samples_per_second": 14.239, + "eval_steps_per_second": 7.119, + "eval_token_accuracy": 0.6145526170730591, + "eval_top_k_diff": -529.2546997070312, + "eval_topk_loss": 0.012550394050776958, + "step": 2150 + }, + { + "epoch": 0.8549337672258001, + "grad_norm": 0.13671875, + "learning_rate": 1.0569047067084293e-05, + "lm_loss": 1.8935, + "loss": 2.035, + "mask_loss": 0.128, + "step": 2151, + "topk_loss": 0.0136 + }, + { + "epoch": 0.8553312259739292, + "grad_norm": 0.111328125, + "learning_rate": 1.0512093263771206e-05, + "lm_loss": 1.8717, + "loss": 2.0083, + "mask_loss": 0.1273, + "step": 2152, + "topk_loss": 0.0094 + }, + { + "epoch": 0.8557286847220583, + "grad_norm": 0.1181640625, + "learning_rate": 1.045528481552951e-05, + "lm_loss": 1.8821, + "loss": 2.0174, + "mask_loss": 0.126, + "step": 2153, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8561261434701875, + "grad_norm": 0.107421875, + "learning_rate": 1.039862181463307e-05, + "lm_loss": 1.9685, + "loss": 2.1001, + "mask_loss": 0.1233, + "step": 2154, + "topk_loss": 0.0084 + }, + { + "epoch": 0.8565236022183167, + "grad_norm": 0.1064453125, + "learning_rate": 1.034210435311952e-05, + "lm_loss": 1.8213, + "loss": 1.9588, + "mask_loss": 0.1274, + "step": 2155, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8569210609664458, + "grad_norm": 0.12451171875, + "learning_rate": 1.0285732522790092e-05, + "lm_loss": 1.9379, + "loss": 2.0686, + "mask_loss": 0.1221, + "step": 2156, + "topk_loss": 0.0086 + }, + { + "epoch": 0.857318519714575, + "grad_norm": 0.11328125, + "learning_rate": 1.0229506415209444e-05, + "lm_loss": 1.8874, + "loss": 2.0255, + "mask_loss": 0.1271, + "step": 2157, + "topk_loss": 0.0111 + }, + { + "epoch": 0.8577159784627041, + "grad_norm": 0.154296875, + "learning_rate": 1.0173426121705576e-05, + "lm_loss": 1.8887, + "loss": 2.0259, + "mask_loss": 0.1263, + "step": 2158, + "topk_loss": 0.0109 + }, + { + "epoch": 0.8581134372108332, + "grad_norm": 0.11865234375, + "learning_rate": 1.0117491733369611e-05, + "lm_loss": 1.8654, + "loss": 2.0052, + "mask_loss": 0.1283, + "step": 2159, + "topk_loss": 0.0115 + }, + { + "epoch": 0.8585108959589624, + "grad_norm": 0.107421875, + "learning_rate": 1.0061703341055706e-05, + "lm_loss": 1.8766, + "loss": 2.0102, + "mask_loss": 0.1251, + "step": 2160, + "topk_loss": 0.0085 + }, + { + "epoch": 0.8589083547070915, + "grad_norm": 0.12890625, + "learning_rate": 1.0006061035380843e-05, + "lm_loss": 1.7927, + "loss": 1.937, + "mask_loss": 0.1311, + "step": 2161, + "topk_loss": 0.0132 + }, + { + "epoch": 0.8593058134552207, + "grad_norm": 0.11279296875, + "learning_rate": 9.950564906724757e-06, + "lm_loss": 1.8876, + "loss": 2.0232, + "mask_loss": 0.1269, + "step": 2162, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8597032722033499, + "grad_norm": 0.12890625, + "learning_rate": 9.89521504522971e-06, + "lm_loss": 1.9252, + "loss": 2.0581, + "mask_loss": 0.1238, + "step": 2163, + "topk_loss": 0.0091 + }, + { + "epoch": 0.860100730951479, + "grad_norm": 0.11181640625, + "learning_rate": 9.840011540800409e-06, + "lm_loss": 1.8699, + "loss": 2.0043, + "mask_loss": 0.1248, + "step": 2164, + "topk_loss": 0.0096 + }, + { + "epoch": 0.8604981896996081, + "grad_norm": 0.1318359375, + "learning_rate": 9.784954483103803e-06, + "lm_loss": 1.901, + "loss": 2.0414, + "mask_loss": 0.1277, + "step": 2165, + "topk_loss": 0.0127 + }, + { + "epoch": 0.8608956484477372, + "grad_norm": 0.11279296875, + "learning_rate": 9.730043961569013e-06, + "lm_loss": 1.9087, + "loss": 2.0432, + "mask_loss": 0.1244, + "step": 2166, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8612931071958664, + "grad_norm": 0.111328125, + "learning_rate": 9.675280065387116e-06, + "lm_loss": 1.9338, + "loss": 2.0717, + "mask_loss": 0.1273, + "step": 2167, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8616905659439956, + "grad_norm": 0.130859375, + "learning_rate": 9.620662883511e-06, + "lm_loss": 1.8762, + "loss": 2.0116, + "mask_loss": 0.1256, + "step": 2168, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8620880246921248, + "grad_norm": 0.111328125, + "learning_rate": 9.56619250465528e-06, + "lm_loss": 1.9275, + "loss": 2.0628, + "mask_loss": 0.1256, + "step": 2169, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8624854834402539, + "grad_norm": 0.1396484375, + "learning_rate": 9.511869017296116e-06, + "lm_loss": 1.9117, + "loss": 2.0558, + "mask_loss": 0.1293, + "step": 2170, + "topk_loss": 0.0148 + }, + { + "epoch": 0.862882942188383, + "grad_norm": 0.1240234375, + "learning_rate": 9.457692509671069e-06, + "lm_loss": 1.9475, + "loss": 2.087, + "mask_loss": 0.1279, + "step": 2171, + "topk_loss": 0.0115 + }, + { + "epoch": 0.8632804009365121, + "grad_norm": 0.1201171875, + "learning_rate": 9.403663069778945e-06, + "lm_loss": 1.8539, + "loss": 1.9927, + "mask_loss": 0.1277, + "step": 2172, + "topk_loss": 0.011 + }, + { + "epoch": 0.8636778596846413, + "grad_norm": 0.1376953125, + "learning_rate": 9.349780785379703e-06, + "lm_loss": 1.9392, + "loss": 2.0739, + "mask_loss": 0.1252, + "step": 2173, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8640753184327705, + "grad_norm": 0.11083984375, + "learning_rate": 9.29604574399423e-06, + "lm_loss": 1.8761, + "loss": 2.0089, + "mask_loss": 0.1241, + "step": 2174, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8644727771808997, + "grad_norm": 0.203125, + "learning_rate": 9.242458032904311e-06, + "lm_loss": 1.9129, + "loss": 2.0492, + "mask_loss": 0.1249, + "step": 2175, + "topk_loss": 0.0114 + }, + { + "epoch": 0.8648702359290288, + "grad_norm": 0.111328125, + "learning_rate": 9.189017739152328e-06, + "lm_loss": 1.9449, + "loss": 2.0813, + "mask_loss": 0.1248, + "step": 2176, + "topk_loss": 0.0115 + }, + { + "epoch": 0.8652676946771579, + "grad_norm": 0.11767578125, + "learning_rate": 9.135724949541314e-06, + "lm_loss": 1.9331, + "loss": 2.0664, + "mask_loss": 0.1247, + "step": 2177, + "topk_loss": 0.0086 + }, + { + "epoch": 0.865665153425287, + "grad_norm": 0.1171875, + "learning_rate": 9.082579750634646e-06, + "lm_loss": 1.93, + "loss": 2.0705, + "mask_loss": 0.1283, + "step": 2178, + "topk_loss": 0.0123 + }, + { + "epoch": 0.8660626121734162, + "grad_norm": 0.1435546875, + "learning_rate": 9.029582228755996e-06, + "lm_loss": 1.9148, + "loss": 2.0521, + "mask_loss": 0.127, + "step": 2179, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8664600709215454, + "grad_norm": 0.1259765625, + "learning_rate": 8.976732469989157e-06, + "lm_loss": 1.8945, + "loss": 2.0325, + "mask_loss": 0.1264, + "step": 2180, + "topk_loss": 0.0115 + }, + { + "epoch": 0.8668575296696746, + "grad_norm": 0.11767578125, + "learning_rate": 8.924030560177921e-06, + "lm_loss": 1.828, + "loss": 1.9669, + "mask_loss": 0.1283, + "step": 2181, + "topk_loss": 0.0106 + }, + { + "epoch": 0.8672549884178037, + "grad_norm": 0.11572265625, + "learning_rate": 8.871476584925909e-06, + "lm_loss": 1.8724, + "loss": 2.0077, + "mask_loss": 0.1255, + "step": 2182, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8676524471659328, + "grad_norm": 0.11328125, + "learning_rate": 8.819070629596482e-06, + "lm_loss": 1.8596, + "loss": 1.9933, + "mask_loss": 0.1251, + "step": 2183, + "topk_loss": 0.0086 + }, + { + "epoch": 0.868049905914062, + "grad_norm": 0.1171875, + "learning_rate": 8.766812779312528e-06, + "lm_loss": 1.9529, + "loss": 2.0883, + "mask_loss": 0.1261, + "step": 2184, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8684473646621911, + "grad_norm": 0.1435546875, + "learning_rate": 8.714703118956402e-06, + "lm_loss": 1.8389, + "loss": 1.976, + "mask_loss": 0.1273, + "step": 2185, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8688448234103202, + "grad_norm": 0.111328125, + "learning_rate": 8.662741733169743e-06, + "lm_loss": 1.9521, + "loss": 2.0861, + "mask_loss": 0.1248, + "step": 2186, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8692422821584495, + "grad_norm": 0.224609375, + "learning_rate": 8.61092870635336e-06, + "lm_loss": 1.9036, + "loss": 2.0395, + "mask_loss": 0.1252, + "step": 2187, + "topk_loss": 0.0107 + }, + { + "epoch": 0.8696397409065786, + "grad_norm": 0.140625, + "learning_rate": 8.559264122667087e-06, + "lm_loss": 1.9506, + "loss": 2.0836, + "mask_loss": 0.1239, + "step": 2188, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8700371996547077, + "grad_norm": 0.115234375, + "learning_rate": 8.507748066029597e-06, + "lm_loss": 1.9317, + "loss": 2.0702, + "mask_loss": 0.1274, + "step": 2189, + "topk_loss": 0.0111 + }, + { + "epoch": 0.8704346584028368, + "grad_norm": 0.10986328125, + "learning_rate": 8.456380620118365e-06, + "lm_loss": 1.9114, + "loss": 2.0483, + "mask_loss": 0.1265, + "step": 2190, + "topk_loss": 0.0104 + }, + { + "epoch": 0.870832117150966, + "grad_norm": 0.1552734375, + "learning_rate": 8.405161868369448e-06, + "lm_loss": 1.9055, + "loss": 2.0511, + "mask_loss": 0.1299, + "step": 2191, + "topk_loss": 0.0157 + }, + { + "epoch": 0.8712295758990951, + "grad_norm": 0.11865234375, + "learning_rate": 8.354091893977401e-06, + "lm_loss": 1.912, + "loss": 2.0495, + "mask_loss": 0.1259, + "step": 2192, + "topk_loss": 0.0116 + }, + { + "epoch": 0.8716270346472244, + "grad_norm": 0.115234375, + "learning_rate": 8.303170779895086e-06, + "lm_loss": 1.8498, + "loss": 1.9863, + "mask_loss": 0.1258, + "step": 2193, + "topk_loss": 0.0108 + }, + { + "epoch": 0.8720244933953535, + "grad_norm": 0.119140625, + "learning_rate": 8.2523986088336e-06, + "lm_loss": 1.8843, + "loss": 2.0206, + "mask_loss": 0.1265, + "step": 2194, + "topk_loss": 0.0099 + }, + { + "epoch": 0.8724219521434826, + "grad_norm": 0.11328125, + "learning_rate": 8.201775463262107e-06, + "lm_loss": 1.8533, + "loss": 1.9879, + "mask_loss": 0.1252, + "step": 2195, + "topk_loss": 0.0094 + }, + { + "epoch": 0.8728194108916117, + "grad_norm": 0.1162109375, + "learning_rate": 8.151301425407699e-06, + "lm_loss": 1.8667, + "loss": 2.0021, + "mask_loss": 0.1261, + "step": 2196, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8732168696397409, + "grad_norm": 0.10498046875, + "learning_rate": 8.100976577255281e-06, + "lm_loss": 1.8093, + "loss": 1.9459, + "mask_loss": 0.1273, + "step": 2197, + "topk_loss": 0.0094 + }, + { + "epoch": 0.87361432838787, + "grad_norm": 0.11572265625, + "learning_rate": 8.050801000547426e-06, + "lm_loss": 1.9094, + "loss": 2.05, + "mask_loss": 0.1277, + "step": 2198, + "topk_loss": 0.0128 + }, + { + "epoch": 0.8740117871359993, + "grad_norm": 0.1064453125, + "learning_rate": 8.00077477678427e-06, + "lm_loss": 1.8488, + "loss": 1.9849, + "mask_loss": 0.126, + "step": 2199, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8744092458841284, + "grad_norm": 0.1279296875, + "learning_rate": 7.950897987223304e-06, + "lm_loss": 1.8628, + "loss": 1.999, + "mask_loss": 0.1271, + "step": 2200, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8744092458841284, + "eval_lm_loss": 687.38037109375, + "eval_loss": 687.517333984375, + "eval_mask_hit_rate": 0.5359687805175781, + "eval_mask_loss": 0.12440390139818192, + "eval_mask_top_10_hit_rate": 0.9855976104736328, + "eval_mask_top_1_hit_rate": 0.997429609298706, + "eval_mask_top_20_hit_rate": 0.9762320518493652, + "eval_mask_top_5_hit_rate": 0.990863561630249, + "eval_runtime": 144.9294, + "eval_samples_per_second": 14.131, + "eval_steps_per_second": 7.066, + "eval_token_accuracy": 0.614575982093811, + "eval_top_k_diff": -530.4586181640625, + "eval_topk_loss": 0.012535331770777702, + "step": 2200 + }, + { + "epoch": 0.8748067046322575, + "grad_norm": 0.1083984375, + "learning_rate": 7.901170712879325e-06, + "lm_loss": 1.9053, + "loss": 2.0407, + "mask_loss": 0.1261, + "step": 2201, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8752041633803866, + "grad_norm": 0.1181640625, + "learning_rate": 7.851593034524262e-06, + "lm_loss": 1.9185, + "loss": 2.0577, + "mask_loss": 0.1274, + "step": 2202, + "topk_loss": 0.0118 + }, + { + "epoch": 0.8756016221285158, + "grad_norm": 0.1162109375, + "learning_rate": 7.802165032687092e-06, + "lm_loss": 1.8895, + "loss": 2.025, + "mask_loss": 0.1259, + "step": 2203, + "topk_loss": 0.0096 + }, + { + "epoch": 0.8759990808766449, + "grad_norm": 0.1279296875, + "learning_rate": 7.752886787653624e-06, + "lm_loss": 1.8695, + "loss": 2.0027, + "mask_loss": 0.1239, + "step": 2204, + "topk_loss": 0.0094 + }, + { + "epoch": 0.8763965396247742, + "grad_norm": 0.11181640625, + "learning_rate": 7.703758379466441e-06, + "lm_loss": 1.969, + "loss": 2.1055, + "mask_loss": 0.1254, + "step": 2205, + "topk_loss": 0.0111 + }, + { + "epoch": 0.8767939983729033, + "grad_norm": 0.115234375, + "learning_rate": 7.654779887924734e-06, + "lm_loss": 1.9584, + "loss": 2.0925, + "mask_loss": 0.1249, + "step": 2206, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8771914571210324, + "grad_norm": 0.12158203125, + "learning_rate": 7.605951392584221e-06, + "lm_loss": 1.9129, + "loss": 2.049, + "mask_loss": 0.1256, + "step": 2207, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8775889158691615, + "grad_norm": 0.1162109375, + "learning_rate": 7.557272972756923e-06, + "lm_loss": 1.9447, + "loss": 2.0829, + "mask_loss": 0.126, + "step": 2208, + "topk_loss": 0.0122 + }, + { + "epoch": 0.8779863746172907, + "grad_norm": 0.12451171875, + "learning_rate": 7.508744707511117e-06, + "lm_loss": 1.8635, + "loss": 1.9989, + "mask_loss": 0.1262, + "step": 2209, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8783838333654198, + "grad_norm": 0.10791015625, + "learning_rate": 7.460366675671215e-06, + "lm_loss": 1.9124, + "loss": 2.0458, + "mask_loss": 0.1239, + "step": 2210, + "topk_loss": 0.0094 + }, + { + "epoch": 0.878781292113549, + "grad_norm": 0.142578125, + "learning_rate": 7.412138955817571e-06, + "lm_loss": 1.9115, + "loss": 2.0482, + "mask_loss": 0.1263, + "step": 2211, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8791787508616782, + "grad_norm": 0.1728515625, + "learning_rate": 7.3640616262864e-06, + "lm_loss": 1.9172, + "loss": 2.0554, + "mask_loss": 0.1261, + "step": 2212, + "topk_loss": 0.0121 + }, + { + "epoch": 0.8795762096098073, + "grad_norm": 0.1298828125, + "learning_rate": 7.316134765169635e-06, + "lm_loss": 1.9638, + "loss": 2.0982, + "mask_loss": 0.1238, + "step": 2213, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8799736683579364, + "grad_norm": 0.11376953125, + "learning_rate": 7.268358450314794e-06, + "lm_loss": 1.8659, + "loss": 2.0009, + "mask_loss": 0.1261, + "step": 2214, + "topk_loss": 0.0088 + }, + { + "epoch": 0.8803711271060656, + "grad_norm": 0.134765625, + "learning_rate": 7.220732759324911e-06, + "lm_loss": 1.9214, + "loss": 2.0551, + "mask_loss": 0.1236, + "step": 2215, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8807685858541947, + "grad_norm": 0.134765625, + "learning_rate": 7.173257769558262e-06, + "lm_loss": 1.849, + "loss": 1.9857, + "mask_loss": 0.1275, + "step": 2216, + "topk_loss": 0.0092 + }, + { + "epoch": 0.8811660446023238, + "grad_norm": 0.126953125, + "learning_rate": 7.125933558128451e-06, + "lm_loss": 1.8932, + "loss": 2.0279, + "mask_loss": 0.1239, + "step": 2217, + "topk_loss": 0.0108 + }, + { + "epoch": 0.8815635033504531, + "grad_norm": 0.1259765625, + "learning_rate": 7.078760201904089e-06, + "lm_loss": 1.9196, + "loss": 2.0543, + "mask_loss": 0.126, + "step": 2218, + "topk_loss": 0.0087 + }, + { + "epoch": 0.8819609620985822, + "grad_norm": 0.1162109375, + "learning_rate": 7.031737777508818e-06, + "lm_loss": 1.9288, + "loss": 2.062, + "mask_loss": 0.1236, + "step": 2219, + "topk_loss": 0.0097 + }, + { + "epoch": 0.8823584208467113, + "grad_norm": 0.11376953125, + "learning_rate": 6.984866361321063e-06, + "lm_loss": 1.9207, + "loss": 2.0537, + "mask_loss": 0.1242, + "step": 2220, + "topk_loss": 0.0088 + }, + { + "epoch": 0.8827558795948405, + "grad_norm": 0.11181640625, + "learning_rate": 6.938146029474013e-06, + "lm_loss": 1.8107, + "loss": 1.9457, + "mask_loss": 0.1258, + "step": 2221, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8831533383429696, + "grad_norm": 0.125, + "learning_rate": 6.891576857855431e-06, + "lm_loss": 1.8534, + "loss": 1.9932, + "mask_loss": 0.1276, + "step": 2222, + "topk_loss": 0.0122 + }, + { + "epoch": 0.8835507970910987, + "grad_norm": 0.11376953125, + "learning_rate": 6.845158922107553e-06, + "lm_loss": 1.8817, + "loss": 2.0154, + "mask_loss": 0.1241, + "step": 2223, + "topk_loss": 0.0096 + }, + { + "epoch": 0.883948255839228, + "grad_norm": 0.1259765625, + "learning_rate": 6.798892297626946e-06, + "lm_loss": 1.9011, + "loss": 2.0375, + "mask_loss": 0.126, + "step": 2224, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8843457145873571, + "grad_norm": 0.1318359375, + "learning_rate": 6.75277705956443e-06, + "lm_loss": 1.9335, + "loss": 2.0707, + "mask_loss": 0.1265, + "step": 2225, + "topk_loss": 0.0107 + }, + { + "epoch": 0.8847431733354862, + "grad_norm": 0.130859375, + "learning_rate": 6.70681328282492e-06, + "lm_loss": 1.8614, + "loss": 2.004, + "mask_loss": 0.1301, + "step": 2226, + "topk_loss": 0.0125 + }, + { + "epoch": 0.8851406320836154, + "grad_norm": 0.126953125, + "learning_rate": 6.661001042067294e-06, + "lm_loss": 1.8986, + "loss": 2.0315, + "mask_loss": 0.1234, + "step": 2227, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8855380908317445, + "grad_norm": 0.1162109375, + "learning_rate": 6.615340411704318e-06, + "lm_loss": 1.8793, + "loss": 2.0171, + "mask_loss": 0.1272, + "step": 2228, + "topk_loss": 0.0107 + }, + { + "epoch": 0.8859355495798736, + "grad_norm": 0.12890625, + "learning_rate": 6.569831465902488e-06, + "lm_loss": 1.9577, + "loss": 2.0932, + "mask_loss": 0.1242, + "step": 2229, + "topk_loss": 0.0113 + }, + { + "epoch": 0.8863330083280029, + "grad_norm": 0.134765625, + "learning_rate": 6.524474278581905e-06, + "lm_loss": 1.9069, + "loss": 2.0411, + "mask_loss": 0.1243, + "step": 2230, + "topk_loss": 0.0098 + }, + { + "epoch": 0.886730467076132, + "grad_norm": 0.1494140625, + "learning_rate": 6.479268923416182e-06, + "lm_loss": 1.8512, + "loss": 1.9893, + "mask_loss": 0.1276, + "step": 2231, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8871279258242611, + "grad_norm": 0.1181640625, + "learning_rate": 6.4342154738323054e-06, + "lm_loss": 1.9095, + "loss": 2.0453, + "mask_loss": 0.1255, + "step": 2232, + "topk_loss": 0.0103 + }, + { + "epoch": 0.8875253845723903, + "grad_norm": 0.1279296875, + "learning_rate": 6.389314003010538e-06, + "lm_loss": 1.8588, + "loss": 1.9968, + "mask_loss": 0.1277, + "step": 2233, + "topk_loss": 0.0103 + }, + { + "epoch": 0.8879228433205194, + "grad_norm": 0.11279296875, + "learning_rate": 6.344564583884271e-06, + "lm_loss": 1.9211, + "loss": 2.0571, + "mask_loss": 0.1265, + "step": 2234, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8883203020686485, + "grad_norm": 0.115234375, + "learning_rate": 6.299967289139896e-06, + "lm_loss": 1.8947, + "loss": 2.0331, + "mask_loss": 0.1279, + "step": 2235, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8887177608167778, + "grad_norm": 0.11865234375, + "learning_rate": 6.255522191216756e-06, + "lm_loss": 1.9274, + "loss": 2.0611, + "mask_loss": 0.1243, + "step": 2236, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8891152195649069, + "grad_norm": 0.11376953125, + "learning_rate": 6.211229362306947e-06, + "lm_loss": 1.8775, + "loss": 2.0134, + "mask_loss": 0.1267, + "step": 2237, + "topk_loss": 0.0091 + }, + { + "epoch": 0.889512678313036, + "grad_norm": 0.12353515625, + "learning_rate": 6.167088874355231e-06, + "lm_loss": 1.9621, + "loss": 2.097, + "mask_loss": 0.1244, + "step": 2238, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8899101370611652, + "grad_norm": 0.1552734375, + "learning_rate": 6.123100799058978e-06, + "lm_loss": 1.9197, + "loss": 2.0546, + "mask_loss": 0.1254, + "step": 2239, + "topk_loss": 0.0094 + }, + { + "epoch": 0.8903075958092943, + "grad_norm": 0.11328125, + "learning_rate": 6.079265207867901e-06, + "lm_loss": 1.973, + "loss": 2.1072, + "mask_loss": 0.1248, + "step": 2240, + "topk_loss": 0.0094 + }, + { + "epoch": 0.8907050545574234, + "grad_norm": 0.12255859375, + "learning_rate": 6.0355821719841e-06, + "lm_loss": 1.9375, + "loss": 2.0719, + "mask_loss": 0.1249, + "step": 2241, + "topk_loss": 0.0095 + }, + { + "epoch": 0.8911025133055526, + "grad_norm": 0.11328125, + "learning_rate": 5.992051762361883e-06, + "lm_loss": 1.8657, + "loss": 2.0021, + "mask_loss": 0.1263, + "step": 2242, + "topk_loss": 0.0101 + }, + { + "epoch": 0.8914999720536818, + "grad_norm": 0.10986328125, + "learning_rate": 5.948674049707603e-06, + "lm_loss": 1.8879, + "loss": 2.0215, + "mask_loss": 0.1245, + "step": 2243, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8918974308018109, + "grad_norm": 0.1103515625, + "learning_rate": 5.905449104479632e-06, + "lm_loss": 1.9105, + "loss": 2.0441, + "mask_loss": 0.1245, + "step": 2244, + "topk_loss": 0.0091 + }, + { + "epoch": 0.8922948895499401, + "grad_norm": 0.11181640625, + "learning_rate": 5.862376996888175e-06, + "lm_loss": 1.9115, + "loss": 2.0469, + "mask_loss": 0.125, + "step": 2245, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8926923482980692, + "grad_norm": 0.1220703125, + "learning_rate": 5.819457796895189e-06, + "lm_loss": 1.919, + "loss": 2.0514, + "mask_loss": 0.1235, + "step": 2246, + "topk_loss": 0.009 + }, + { + "epoch": 0.8930898070461983, + "grad_norm": 0.11474609375, + "learning_rate": 5.776691574214277e-06, + "lm_loss": 1.893, + "loss": 2.0296, + "mask_loss": 0.1263, + "step": 2247, + "topk_loss": 0.0102 + }, + { + "epoch": 0.8934872657943275, + "grad_norm": 0.10986328125, + "learning_rate": 5.734078398310538e-06, + "lm_loss": 1.8622, + "loss": 1.9989, + "mask_loss": 0.1263, + "step": 2248, + "topk_loss": 0.0105 + }, + { + "epoch": 0.8938847245424567, + "grad_norm": 0.1171875, + "learning_rate": 5.691618338400484e-06, + "lm_loss": 1.9444, + "loss": 2.0784, + "mask_loss": 0.1236, + "step": 2249, + "topk_loss": 0.0104 + }, + { + "epoch": 0.8942821832905858, + "grad_norm": 0.11279296875, + "learning_rate": 5.6493114634519455e-06, + "lm_loss": 1.8951, + "loss": 2.0316, + "mask_loss": 0.1268, + "step": 2250, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8942821832905858, + "eval_lm_loss": 687.3623046875, + "eval_loss": 687.499267578125, + "eval_mask_hit_rate": 0.5359718799591064, + "eval_mask_loss": 0.12440043687820435, + "eval_mask_top_10_hit_rate": 0.985600471496582, + "eval_mask_top_1_hit_rate": 0.9974288940429688, + "eval_mask_top_20_hit_rate": 0.9762358665466309, + "eval_mask_top_5_hit_rate": 0.9908660054206848, + "eval_runtime": 144.2344, + "eval_samples_per_second": 14.199, + "eval_steps_per_second": 7.1, + "eval_token_accuracy": 0.6145902872085571, + "eval_top_k_diff": -529.9525146484375, + "eval_topk_loss": 0.012543203309178352, + "step": 2250 + }, + { + "epoch": 0.894679642038715, + "grad_norm": 0.1396484375, + "learning_rate": 5.607157842183896e-06, + "lm_loss": 1.8832, + "loss": 2.0298, + "mask_loss": 0.1305, + "step": 2251, + "topk_loss": 0.0161 + }, + { + "epoch": 0.8950771007868441, + "grad_norm": 0.125, + "learning_rate": 5.565157543066402e-06, + "lm_loss": 1.9263, + "loss": 2.0671, + "mask_loss": 0.1276, + "step": 2252, + "topk_loss": 0.0133 + }, + { + "epoch": 0.8954745595349732, + "grad_norm": 0.1474609375, + "learning_rate": 5.5233106343205e-06, + "lm_loss": 1.9293, + "loss": 2.0669, + "mask_loss": 0.1268, + "step": 2253, + "topk_loss": 0.0108 + }, + { + "epoch": 0.8958720182831024, + "grad_norm": 0.111328125, + "learning_rate": 5.481617183918053e-06, + "lm_loss": 1.9071, + "loss": 2.0405, + "mask_loss": 0.125, + "step": 2254, + "topk_loss": 0.0084 + }, + { + "epoch": 0.8962694770312316, + "grad_norm": 0.10888671875, + "learning_rate": 5.4400772595816774e-06, + "lm_loss": 1.8233, + "loss": 1.9602, + "mask_loss": 0.1265, + "step": 2255, + "topk_loss": 0.0103 + }, + { + "epoch": 0.8966669357793607, + "grad_norm": 0.115234375, + "learning_rate": 5.398690928784578e-06, + "lm_loss": 1.8864, + "loss": 2.0203, + "mask_loss": 0.1243, + "step": 2256, + "topk_loss": 0.0096 + }, + { + "epoch": 0.8970643945274899, + "grad_norm": 0.11279296875, + "learning_rate": 5.357458258750547e-06, + "lm_loss": 1.8947, + "loss": 2.0277, + "mask_loss": 0.1239, + "step": 2257, + "topk_loss": 0.0091 + }, + { + "epoch": 0.897461853275619, + "grad_norm": 0.11767578125, + "learning_rate": 5.316379316453713e-06, + "lm_loss": 1.9164, + "loss": 2.0509, + "mask_loss": 0.1248, + "step": 2258, + "topk_loss": 0.0098 + }, + { + "epoch": 0.8978593120237481, + "grad_norm": 0.119140625, + "learning_rate": 5.275454168618577e-06, + "lm_loss": 1.8878, + "loss": 2.0233, + "mask_loss": 0.1262, + "step": 2259, + "topk_loss": 0.0093 + }, + { + "epoch": 0.8982567707718773, + "grad_norm": 0.12060546875, + "learning_rate": 5.2346828817197655e-06, + "lm_loss": 1.9441, + "loss": 2.083, + "mask_loss": 0.1273, + "step": 2260, + "topk_loss": 0.0117 + }, + { + "epoch": 0.8986542295200065, + "grad_norm": 0.119140625, + "learning_rate": 5.194065521982028e-06, + "lm_loss": 1.8663, + "loss": 2.0009, + "mask_loss": 0.1246, + "step": 2261, + "topk_loss": 0.0099 + }, + { + "epoch": 0.8990516882681356, + "grad_norm": 0.11376953125, + "learning_rate": 5.153602155380089e-06, + "lm_loss": 1.9223, + "loss": 2.0597, + "mask_loss": 0.1272, + "step": 2262, + "topk_loss": 0.0102 + }, + { + "epoch": 0.8994491470162648, + "grad_norm": 0.109375, + "learning_rate": 5.113292847638518e-06, + "lm_loss": 1.8611, + "loss": 1.9966, + "mask_loss": 0.1266, + "step": 2263, + "topk_loss": 0.0089 + }, + { + "epoch": 0.8998466057643939, + "grad_norm": 0.10888671875, + "learning_rate": 5.073137664231675e-06, + "lm_loss": 1.9406, + "loss": 2.0761, + "mask_loss": 0.1255, + "step": 2264, + "topk_loss": 0.01 + }, + { + "epoch": 0.900244064512523, + "grad_norm": 0.10693359375, + "learning_rate": 5.033136670383554e-06, + "lm_loss": 1.9223, + "loss": 2.0605, + "mask_loss": 0.1285, + "step": 2265, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9006415232606522, + "grad_norm": 0.11376953125, + "learning_rate": 4.993289931067713e-06, + "lm_loss": 1.921, + "loss": 2.0529, + "mask_loss": 0.123, + "step": 2266, + "topk_loss": 0.0089 + }, + { + "epoch": 0.9010389820087814, + "grad_norm": 0.109375, + "learning_rate": 4.953597511007158e-06, + "lm_loss": 1.9313, + "loss": 2.063, + "mask_loss": 0.1236, + "step": 2267, + "topk_loss": 0.0081 + }, + { + "epoch": 0.9014364407569105, + "grad_norm": 0.216796875, + "learning_rate": 4.914059474674216e-06, + "lm_loss": 1.9744, + "loss": 2.1198, + "mask_loss": 0.1299, + "step": 2268, + "topk_loss": 0.0155 + }, + { + "epoch": 0.9018338995050397, + "grad_norm": 0.115234375, + "learning_rate": 4.874675886290459e-06, + "lm_loss": 1.8485, + "loss": 1.984, + "mask_loss": 0.1253, + "step": 2269, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9022313582531688, + "grad_norm": 0.11181640625, + "learning_rate": 4.835446809826604e-06, + "lm_loss": 1.8822, + "loss": 2.0156, + "mask_loss": 0.1252, + "step": 2270, + "topk_loss": 0.0082 + }, + { + "epoch": 0.9026288170012979, + "grad_norm": 0.12060546875, + "learning_rate": 4.796372309002372e-06, + "lm_loss": 1.9152, + "loss": 2.0512, + "mask_loss": 0.1263, + "step": 2271, + "topk_loss": 0.0098 + }, + { + "epoch": 0.9030262757494271, + "grad_norm": 0.10595703125, + "learning_rate": 4.757452447286415e-06, + "lm_loss": 1.8863, + "loss": 2.0192, + "mask_loss": 0.1239, + "step": 2272, + "topk_loss": 0.009 + }, + { + "epoch": 0.9034237344975562, + "grad_norm": 0.154296875, + "learning_rate": 4.718687287896195e-06, + "lm_loss": 1.9199, + "loss": 2.0688, + "mask_loss": 0.1321, + "step": 2273, + "topk_loss": 0.0168 + }, + { + "epoch": 0.9038211932456854, + "grad_norm": 0.111328125, + "learning_rate": 4.680076893797914e-06, + "lm_loss": 1.8638, + "loss": 1.9998, + "mask_loss": 0.1263, + "step": 2274, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9042186519938146, + "grad_norm": 0.10693359375, + "learning_rate": 4.641621327706369e-06, + "lm_loss": 1.938, + "loss": 2.0717, + "mask_loss": 0.1243, + "step": 2275, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9046161107419437, + "grad_norm": 0.1474609375, + "learning_rate": 4.603320652084886e-06, + "lm_loss": 1.8922, + "loss": 2.0352, + "mask_loss": 0.1287, + "step": 2276, + "topk_loss": 0.0143 + }, + { + "epoch": 0.9050135694900728, + "grad_norm": 0.111328125, + "learning_rate": 4.565174929145188e-06, + "lm_loss": 1.8313, + "loss": 1.966, + "mask_loss": 0.1254, + "step": 2277, + "topk_loss": 0.0093 + }, + { + "epoch": 0.905411028238202, + "grad_norm": 0.12255859375, + "learning_rate": 4.527184220847325e-06, + "lm_loss": 1.8758, + "loss": 2.0109, + "mask_loss": 0.126, + "step": 2278, + "topk_loss": 0.0091 + }, + { + "epoch": 0.9058084869863311, + "grad_norm": 0.109375, + "learning_rate": 4.489348588899556e-06, + "lm_loss": 1.8638, + "loss": 1.9987, + "mask_loss": 0.1257, + "step": 2279, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9062059457344603, + "grad_norm": 0.11669921875, + "learning_rate": 4.451668094758199e-06, + "lm_loss": 1.9263, + "loss": 2.0598, + "mask_loss": 0.1241, + "step": 2280, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9066034044825895, + "grad_norm": 0.1455078125, + "learning_rate": 4.414142799627663e-06, + "lm_loss": 1.8887, + "loss": 2.0264, + "mask_loss": 0.1277, + "step": 2281, + "topk_loss": 0.01 + }, + { + "epoch": 0.9070008632307186, + "grad_norm": 0.1103515625, + "learning_rate": 4.3767727644602015e-06, + "lm_loss": 1.9005, + "loss": 2.039, + "mask_loss": 0.1272, + "step": 2282, + "topk_loss": 0.0112 + }, + { + "epoch": 0.9073983219788477, + "grad_norm": 0.115234375, + "learning_rate": 4.339558049955927e-06, + "lm_loss": 1.8799, + "loss": 2.0178, + "mask_loss": 0.1259, + "step": 2283, + "topk_loss": 0.012 + }, + { + "epoch": 0.9077957807269769, + "grad_norm": 0.1103515625, + "learning_rate": 4.3024987165626305e-06, + "lm_loss": 1.956, + "loss": 2.0933, + "mask_loss": 0.1261, + "step": 2284, + "topk_loss": 0.0113 + }, + { + "epoch": 0.908193239475106, + "grad_norm": 0.1103515625, + "learning_rate": 4.265594824475738e-06, + "lm_loss": 1.8609, + "loss": 1.9953, + "mask_loss": 0.1254, + "step": 2285, + "topk_loss": 0.009 + }, + { + "epoch": 0.9085906982232352, + "grad_norm": 0.138671875, + "learning_rate": 4.22884643363819e-06, + "lm_loss": 1.8658, + "loss": 2.0039, + "mask_loss": 0.1276, + "step": 2286, + "topk_loss": 0.0105 + }, + { + "epoch": 0.9089881569713644, + "grad_norm": 0.1201171875, + "learning_rate": 4.192253603740337e-06, + "lm_loss": 1.9286, + "loss": 2.065, + "mask_loss": 0.1256, + "step": 2287, + "topk_loss": 0.0108 + }, + { + "epoch": 0.9093856157194935, + "grad_norm": 0.11669921875, + "learning_rate": 4.155816394219858e-06, + "lm_loss": 1.9051, + "loss": 2.0402, + "mask_loss": 0.1246, + "step": 2288, + "topk_loss": 0.0105 + }, + { + "epoch": 0.9097830744676226, + "grad_norm": 0.1142578125, + "learning_rate": 4.119534864261643e-06, + "lm_loss": 1.9425, + "loss": 2.0796, + "mask_loss": 0.128, + "step": 2289, + "topk_loss": 0.009 + }, + { + "epoch": 0.9101805332157518, + "grad_norm": 0.1328125, + "learning_rate": 4.0834090727977505e-06, + "lm_loss": 1.9699, + "loss": 2.1058, + "mask_loss": 0.1257, + "step": 2290, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9105779919638809, + "grad_norm": 0.12109375, + "learning_rate": 4.04743907850722e-06, + "lm_loss": 1.8334, + "loss": 1.9683, + "mask_loss": 0.1262, + "step": 2291, + "topk_loss": 0.0087 + }, + { + "epoch": 0.9109754507120101, + "grad_norm": 0.11279296875, + "learning_rate": 4.011624939816094e-06, + "lm_loss": 1.9264, + "loss": 2.0581, + "mask_loss": 0.1231, + "step": 2292, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9113729094601393, + "grad_norm": 0.1279296875, + "learning_rate": 3.975966714897195e-06, + "lm_loss": 1.9287, + "loss": 2.0642, + "mask_loss": 0.1264, + "step": 2293, + "topk_loss": 0.0091 + }, + { + "epoch": 0.9117703682082684, + "grad_norm": 0.10498046875, + "learning_rate": 3.940464461670135e-06, + "lm_loss": 1.9425, + "loss": 2.0769, + "mask_loss": 0.1253, + "step": 2294, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9121678269563975, + "grad_norm": 0.1171875, + "learning_rate": 3.9051182378011755e-06, + "lm_loss": 1.9558, + "loss": 2.0917, + "mask_loss": 0.1264, + "step": 2295, + "topk_loss": 0.0095 + }, + { + "epoch": 0.9125652857045267, + "grad_norm": 0.125, + "learning_rate": 3.8699281007031245e-06, + "lm_loss": 1.8112, + "loss": 1.9456, + "mask_loss": 0.126, + "step": 2296, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9129627444526558, + "grad_norm": 0.166015625, + "learning_rate": 3.834894107535269e-06, + "lm_loss": 1.956, + "loss": 2.0907, + "mask_loss": 0.1249, + "step": 2297, + "topk_loss": 0.0098 + }, + { + "epoch": 0.9133602032007849, + "grad_norm": 0.14453125, + "learning_rate": 3.8000163152032697e-06, + "lm_loss": 1.8895, + "loss": 2.0251, + "mask_loss": 0.1246, + "step": 2298, + "topk_loss": 0.011 + }, + { + "epoch": 0.9137576619489142, + "grad_norm": 0.1240234375, + "learning_rate": 3.7652947803590855e-06, + "lm_loss": 1.8623, + "loss": 1.9993, + "mask_loss": 0.1275, + "step": 2299, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9141551206970433, + "grad_norm": 0.10888671875, + "learning_rate": 3.7307295594008472e-06, + "lm_loss": 1.9201, + "loss": 2.0575, + "mask_loss": 0.127, + "step": 2300, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9141551206970433, + "eval_lm_loss": 687.35888671875, + "eval_loss": 687.495849609375, + "eval_mask_hit_rate": 0.5359721183776855, + "eval_mask_loss": 0.12440076470375061, + "eval_mask_top_10_hit_rate": 0.9856007695198059, + "eval_mask_top_1_hit_rate": 0.9974291324615479, + "eval_mask_top_20_hit_rate": 0.9762364029884338, + "eval_mask_top_5_hit_rate": 0.9908666610717773, + "eval_runtime": 143.7299, + "eval_samples_per_second": 14.249, + "eval_steps_per_second": 7.124, + "eval_token_accuracy": 0.614601731300354, + "eval_top_k_diff": -530.08642578125, + "eval_topk_loss": 0.012541667558252811, + "step": 2300 + }, + { + "epoch": 0.9145525794451724, + "grad_norm": 0.12109375, + "learning_rate": 3.696320708472778e-06, + "lm_loss": 1.917, + "loss": 2.0521, + "mask_loss": 0.125, + "step": 2301, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9149500381933016, + "grad_norm": 0.130859375, + "learning_rate": 3.6620682834651366e-06, + "lm_loss": 1.9343, + "loss": 2.0674, + "mask_loss": 0.124, + "step": 2302, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9153474969414307, + "grad_norm": 0.11181640625, + "learning_rate": 3.627972340014085e-06, + "lm_loss": 1.8732, + "loss": 2.0106, + "mask_loss": 0.1276, + "step": 2303, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9157449556895598, + "grad_norm": 0.126953125, + "learning_rate": 3.594032933501601e-06, + "lm_loss": 1.9487, + "loss": 2.0853, + "mask_loss": 0.1256, + "step": 2304, + "topk_loss": 0.0109 + }, + { + "epoch": 0.9161424144376891, + "grad_norm": 0.119140625, + "learning_rate": 3.5602501190554193e-06, + "lm_loss": 1.9035, + "loss": 2.0389, + "mask_loss": 0.1253, + "step": 2305, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9165398731858182, + "grad_norm": 0.1123046875, + "learning_rate": 3.526623951548913e-06, + "lm_loss": 1.8585, + "loss": 1.9931, + "mask_loss": 0.1261, + "step": 2306, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9169373319339473, + "grad_norm": 0.1611328125, + "learning_rate": 3.4931544856010133e-06, + "lm_loss": 1.975, + "loss": 2.1077, + "mask_loss": 0.1238, + "step": 2307, + "topk_loss": 0.0088 + }, + { + "epoch": 0.9173347906820765, + "grad_norm": 0.1298828125, + "learning_rate": 3.4598417755761225e-06, + "lm_loss": 1.89, + "loss": 2.0273, + "mask_loss": 0.1263, + "step": 2308, + "topk_loss": 0.011 + }, + { + "epoch": 0.9177322494302056, + "grad_norm": 0.1376953125, + "learning_rate": 3.4266858755840346e-06, + "lm_loss": 1.8809, + "loss": 2.0172, + "mask_loss": 0.1261, + "step": 2309, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9181297081783347, + "grad_norm": 0.10888671875, + "learning_rate": 3.393686839479815e-06, + "lm_loss": 1.9265, + "loss": 2.0603, + "mask_loss": 0.1251, + "step": 2310, + "topk_loss": 0.0088 + }, + { + "epoch": 0.918527166926464, + "grad_norm": 0.12255859375, + "learning_rate": 3.360844720863765e-06, + "lm_loss": 1.9214, + "loss": 2.0567, + "mask_loss": 0.125, + "step": 2311, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9189246256745931, + "grad_norm": 0.15234375, + "learning_rate": 3.3281595730812575e-06, + "lm_loss": 1.9702, + "loss": 2.1047, + "mask_loss": 0.1229, + "step": 2312, + "topk_loss": 0.0116 + }, + { + "epoch": 0.9193220844227222, + "grad_norm": 0.12158203125, + "learning_rate": 3.295631449222758e-06, + "lm_loss": 1.9301, + "loss": 2.0679, + "mask_loss": 0.1269, + "step": 2313, + "topk_loss": 0.0109 + }, + { + "epoch": 0.9197195431708514, + "grad_norm": 0.10791015625, + "learning_rate": 3.2632604021236358e-06, + "lm_loss": 1.8932, + "loss": 2.029, + "mask_loss": 0.1258, + "step": 2314, + "topk_loss": 0.01 + }, + { + "epoch": 0.9201170019189805, + "grad_norm": 0.13671875, + "learning_rate": 3.2310464843641307e-06, + "lm_loss": 1.8941, + "loss": 2.0359, + "mask_loss": 0.1295, + "step": 2315, + "topk_loss": 0.0124 + }, + { + "epoch": 0.9205144606671096, + "grad_norm": 0.140625, + "learning_rate": 3.1989897482692767e-06, + "lm_loss": 1.8964, + "loss": 2.0293, + "mask_loss": 0.1244, + "step": 2316, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9209119194152389, + "grad_norm": 0.1298828125, + "learning_rate": 3.1670902459087658e-06, + "lm_loss": 1.9011, + "loss": 2.0372, + "mask_loss": 0.1267, + "step": 2317, + "topk_loss": 0.0094 + }, + { + "epoch": 0.921309378163368, + "grad_norm": 0.1201171875, + "learning_rate": 3.1353480290969183e-06, + "lm_loss": 1.9406, + "loss": 2.0769, + "mask_loss": 0.1265, + "step": 2318, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9217068369114971, + "grad_norm": 0.1376953125, + "learning_rate": 3.103763149392569e-06, + "lm_loss": 1.859, + "loss": 1.995, + "mask_loss": 0.1266, + "step": 2319, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9221042956596263, + "grad_norm": 0.216796875, + "learning_rate": 3.0723356580989904e-06, + "lm_loss": 1.8902, + "loss": 2.0444, + "mask_loss": 0.1333, + "step": 2320, + "topk_loss": 0.0209 + }, + { + "epoch": 0.9225017544077554, + "grad_norm": 0.1162109375, + "learning_rate": 3.041065606263804e-06, + "lm_loss": 1.8954, + "loss": 2.0335, + "mask_loss": 0.1276, + "step": 2321, + "topk_loss": 0.0105 + }, + { + "epoch": 0.9228992131558845, + "grad_norm": 0.2197265625, + "learning_rate": 3.0099530446789036e-06, + "lm_loss": 1.8985, + "loss": 2.0366, + "mask_loss": 0.1275, + "step": 2322, + "topk_loss": 0.0106 + }, + { + "epoch": 0.9232966719040138, + "grad_norm": 0.12158203125, + "learning_rate": 2.978998023880386e-06, + "lm_loss": 1.8974, + "loss": 2.0356, + "mask_loss": 0.1276, + "step": 2323, + "topk_loss": 0.0107 + }, + { + "epoch": 0.9236941306521429, + "grad_norm": 0.1103515625, + "learning_rate": 2.9482005941484423e-06, + "lm_loss": 1.883, + "loss": 2.0155, + "mask_loss": 0.1234, + "step": 2324, + "topk_loss": 0.0091 + }, + { + "epoch": 0.924091589400272, + "grad_norm": 0.11474609375, + "learning_rate": 2.9175608055072913e-06, + "lm_loss": 1.9774, + "loss": 2.1142, + "mask_loss": 0.1252, + "step": 2325, + "topk_loss": 0.0116 + }, + { + "epoch": 0.9244890481484012, + "grad_norm": 0.109375, + "learning_rate": 2.8870787077250994e-06, + "lm_loss": 1.92, + "loss": 2.0525, + "mask_loss": 0.1234, + "step": 2326, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9248865068965303, + "grad_norm": 0.11669921875, + "learning_rate": 2.856754350313873e-06, + "lm_loss": 1.896, + "loss": 2.032, + "mask_loss": 0.126, + "step": 2327, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9252839656446594, + "grad_norm": 0.1328125, + "learning_rate": 2.826587782529444e-06, + "lm_loss": 1.9167, + "loss": 2.0525, + "mask_loss": 0.1252, + "step": 2328, + "topk_loss": 0.0106 + }, + { + "epoch": 0.9256814243927886, + "grad_norm": 0.109375, + "learning_rate": 2.7965790533713064e-06, + "lm_loss": 1.9363, + "loss": 2.068, + "mask_loss": 0.1231, + "step": 2329, + "topk_loss": 0.0086 + }, + { + "epoch": 0.9260788831409178, + "grad_norm": 0.1142578125, + "learning_rate": 2.7667282115826033e-06, + "lm_loss": 1.9306, + "loss": 2.0698, + "mask_loss": 0.1275, + "step": 2330, + "topk_loss": 0.0117 + }, + { + "epoch": 0.9264763418890469, + "grad_norm": 0.12451171875, + "learning_rate": 2.737035305650004e-06, + "lm_loss": 1.8215, + "loss": 1.9577, + "mask_loss": 0.1264, + "step": 2331, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9268738006371761, + "grad_norm": 0.1484375, + "learning_rate": 2.707500383803663e-06, + "lm_loss": 1.8949, + "loss": 2.0284, + "mask_loss": 0.1248, + "step": 2332, + "topk_loss": 0.0087 + }, + { + "epoch": 0.9272712593853052, + "grad_norm": 0.12353515625, + "learning_rate": 2.678123494017093e-06, + "lm_loss": 1.9689, + "loss": 2.1045, + "mask_loss": 0.1259, + "step": 2333, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9276687181334343, + "grad_norm": 0.169921875, + "learning_rate": 2.6489046840071475e-06, + "lm_loss": 1.8665, + "loss": 2.0033, + "mask_loss": 0.1255, + "step": 2334, + "topk_loss": 0.0113 + }, + { + "epoch": 0.9280661768815635, + "grad_norm": 0.109375, + "learning_rate": 2.619844001233884e-06, + "lm_loss": 1.9569, + "loss": 2.0919, + "mask_loss": 0.1246, + "step": 2335, + "topk_loss": 0.0104 + }, + { + "epoch": 0.9284636356296927, + "grad_norm": 0.115234375, + "learning_rate": 2.590941492900534e-06, + "lm_loss": 1.8497, + "loss": 1.9884, + "mask_loss": 0.1285, + "step": 2336, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9288610943778218, + "grad_norm": 0.1318359375, + "learning_rate": 2.562197205953376e-06, + "lm_loss": 1.9076, + "loss": 2.0424, + "mask_loss": 0.1242, + "step": 2337, + "topk_loss": 0.0106 + }, + { + "epoch": 0.929258553125951, + "grad_norm": 0.115234375, + "learning_rate": 2.5336111870817414e-06, + "lm_loss": 1.8494, + "loss": 1.9835, + "mask_loss": 0.1246, + "step": 2338, + "topk_loss": 0.0095 + }, + { + "epoch": 0.9296560118740801, + "grad_norm": 0.15234375, + "learning_rate": 2.5051834827178432e-06, + "lm_loss": 1.9085, + "loss": 2.0447, + "mask_loss": 0.1255, + "step": 2339, + "topk_loss": 0.0107 + }, + { + "epoch": 0.9300534706222092, + "grad_norm": 0.1162109375, + "learning_rate": 2.4769141390367567e-06, + "lm_loss": 1.8833, + "loss": 2.0206, + "mask_loss": 0.1276, + "step": 2340, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9304509293703384, + "grad_norm": 0.115234375, + "learning_rate": 2.4488032019563402e-06, + "lm_loss": 1.9008, + "loss": 2.0353, + "mask_loss": 0.125, + "step": 2341, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9308483881184676, + "grad_norm": 0.1279296875, + "learning_rate": 2.4208507171371353e-06, + "lm_loss": 1.8745, + "loss": 2.01, + "mask_loss": 0.1263, + "step": 2342, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9312458468665967, + "grad_norm": 0.115234375, + "learning_rate": 2.3930567299823457e-06, + "lm_loss": 1.9383, + "loss": 2.0736, + "mask_loss": 0.1251, + "step": 2343, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9316433056147259, + "grad_norm": 0.11865234375, + "learning_rate": 2.36542128563767e-06, + "lm_loss": 1.9577, + "loss": 2.0925, + "mask_loss": 0.1252, + "step": 2344, + "topk_loss": 0.0096 + }, + { + "epoch": 0.932040764362855, + "grad_norm": 0.1064453125, + "learning_rate": 2.3379444289913342e-06, + "lm_loss": 1.9382, + "loss": 2.0704, + "mask_loss": 0.1237, + "step": 2345, + "topk_loss": 0.0086 + }, + { + "epoch": 0.9324382231109841, + "grad_norm": 0.1279296875, + "learning_rate": 2.31062620467396e-06, + "lm_loss": 1.903, + "loss": 2.0397, + "mask_loss": 0.1272, + "step": 2346, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9328356818591133, + "grad_norm": 0.11865234375, + "learning_rate": 2.2834666570584862e-06, + "lm_loss": 1.9193, + "loss": 2.0559, + "mask_loss": 0.127, + "step": 2347, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9332331406072425, + "grad_norm": 0.162109375, + "learning_rate": 2.256465830260135e-06, + "lm_loss": 1.9004, + "loss": 2.0394, + "mask_loss": 0.1266, + "step": 2348, + "topk_loss": 0.0124 + }, + { + "epoch": 0.9336305993553716, + "grad_norm": 0.11328125, + "learning_rate": 2.229623768136313e-06, + "lm_loss": 1.8747, + "loss": 2.0122, + "mask_loss": 0.1273, + "step": 2349, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9340280581035008, + "grad_norm": 0.1259765625, + "learning_rate": 2.2029405142865224e-06, + "lm_loss": 1.8982, + "loss": 2.0336, + "mask_loss": 0.1252, + "step": 2350, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9340280581035008, + "eval_lm_loss": 687.3653564453125, + "eval_loss": 687.5023193359375, + "eval_mask_hit_rate": 0.5359728336334229, + "eval_mask_loss": 0.12440057098865509, + "eval_mask_top_10_hit_rate": 0.9856017231941223, + "eval_mask_top_1_hit_rate": 0.997429370880127, + "eval_mask_top_20_hit_rate": 0.9762368202209473, + "eval_mask_top_5_hit_rate": 0.9908671975135803, + "eval_runtime": 144.3473, + "eval_samples_per_second": 14.188, + "eval_steps_per_second": 7.094, + "eval_token_accuracy": 0.6146026849746704, + "eval_top_k_diff": -530.0986328125, + "eval_topk_loss": 0.012541640549898148, + "step": 2350 + }, + { + "epoch": 0.9344255168516299, + "grad_norm": 0.1123046875, + "learning_rate": 2.1764161120523484e-06, + "lm_loss": 1.8715, + "loss": 2.0062, + "mask_loss": 0.1254, + "step": 2351, + "topk_loss": 0.0093 + }, + { + "epoch": 0.934822975599759, + "grad_norm": 0.11865234375, + "learning_rate": 2.1500506045173287e-06, + "lm_loss": 1.8771, + "loss": 2.0135, + "mask_loss": 0.126, + "step": 2352, + "topk_loss": 0.0104 + }, + { + "epoch": 0.9352204343478882, + "grad_norm": 0.12158203125, + "learning_rate": 2.123844034506928e-06, + "lm_loss": 1.8795, + "loss": 2.0134, + "mask_loss": 0.1244, + "step": 2353, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9356178930960174, + "grad_norm": 0.10888671875, + "learning_rate": 2.097796444588418e-06, + "lm_loss": 1.9088, + "loss": 2.0436, + "mask_loss": 0.1248, + "step": 2354, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9360153518441465, + "grad_norm": 0.1171875, + "learning_rate": 2.0719078770708777e-06, + "lm_loss": 1.8938, + "loss": 2.0272, + "mask_loss": 0.1244, + "step": 2355, + "topk_loss": 0.009 + }, + { + "epoch": 0.9364128105922757, + "grad_norm": 0.1259765625, + "learning_rate": 2.046178374005059e-06, + "lm_loss": 1.9042, + "loss": 2.0408, + "mask_loss": 0.1266, + "step": 2356, + "topk_loss": 0.01 + }, + { + "epoch": 0.9368102693404048, + "grad_norm": 0.11376953125, + "learning_rate": 2.020607977183353e-06, + "lm_loss": 1.8682, + "loss": 2.0092, + "mask_loss": 0.1288, + "step": 2357, + "topk_loss": 0.0122 + }, + { + "epoch": 0.9372077280885339, + "grad_norm": 0.1513671875, + "learning_rate": 1.9951967281397257e-06, + "lm_loss": 1.8533, + "loss": 1.9906, + "mask_loss": 0.1277, + "step": 2358, + "topk_loss": 0.0096 + }, + { + "epoch": 0.937605186836663, + "grad_norm": 0.10986328125, + "learning_rate": 1.969944668149637e-06, + "lm_loss": 1.8471, + "loss": 1.9827, + "mask_loss": 0.1269, + "step": 2359, + "topk_loss": 0.0088 + }, + { + "epoch": 0.9380026455847922, + "grad_norm": 0.11572265625, + "learning_rate": 1.9448518382299553e-06, + "lm_loss": 1.8907, + "loss": 2.0257, + "mask_loss": 0.1254, + "step": 2360, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9384001043329214, + "grad_norm": 0.1279296875, + "learning_rate": 1.9199182791389214e-06, + "lm_loss": 1.9252, + "loss": 2.0615, + "mask_loss": 0.126, + "step": 2361, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9387975630810506, + "grad_norm": 0.126953125, + "learning_rate": 1.8951440313760837e-06, + "lm_loss": 1.8604, + "loss": 1.9968, + "mask_loss": 0.1262, + "step": 2362, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9391950218291797, + "grad_norm": 0.10986328125, + "learning_rate": 1.8705291351822307e-06, + "lm_loss": 1.9192, + "loss": 2.0565, + "mask_loss": 0.1267, + "step": 2363, + "topk_loss": 0.0107 + }, + { + "epoch": 0.9395924805773088, + "grad_norm": 0.10888671875, + "learning_rate": 1.84607363053928e-06, + "lm_loss": 1.9336, + "loss": 2.0659, + "mask_loss": 0.1234, + "step": 2364, + "topk_loss": 0.0089 + }, + { + "epoch": 0.939989939325438, + "grad_norm": 0.1240234375, + "learning_rate": 1.8217775571702677e-06, + "lm_loss": 1.9271, + "loss": 2.0636, + "mask_loss": 0.1259, + "step": 2365, + "topk_loss": 0.0106 + }, + { + "epoch": 0.9403873980735671, + "grad_norm": 0.109375, + "learning_rate": 1.7976409545392924e-06, + "lm_loss": 1.9143, + "loss": 2.0514, + "mask_loss": 0.1263, + "step": 2366, + "topk_loss": 0.0108 + }, + { + "epoch": 0.9407848568216963, + "grad_norm": 0.1162109375, + "learning_rate": 1.7736638618513934e-06, + "lm_loss": 1.8904, + "loss": 2.0244, + "mask_loss": 0.1258, + "step": 2367, + "topk_loss": 0.0082 + }, + { + "epoch": 0.9411823155698255, + "grad_norm": 0.126953125, + "learning_rate": 1.7498463180525172e-06, + "lm_loss": 1.9554, + "loss": 2.0874, + "mask_loss": 0.1224, + "step": 2368, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9415797743179546, + "grad_norm": 0.109375, + "learning_rate": 1.7261883618294616e-06, + "lm_loss": 1.8866, + "loss": 2.0236, + "mask_loss": 0.1267, + "step": 2369, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9419772330660837, + "grad_norm": 0.115234375, + "learning_rate": 1.7026900316098215e-06, + "lm_loss": 1.89, + "loss": 2.0293, + "mask_loss": 0.1291, + "step": 2370, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9423746918142129, + "grad_norm": 0.13671875, + "learning_rate": 1.6793513655618986e-06, + "lm_loss": 1.9824, + "loss": 2.1146, + "mask_loss": 0.122, + "step": 2371, + "topk_loss": 0.0103 + }, + { + "epoch": 0.942772150562342, + "grad_norm": 0.1083984375, + "learning_rate": 1.6561724015946356e-06, + "lm_loss": 1.8736, + "loss": 2.0121, + "mask_loss": 0.1274, + "step": 2372, + "topk_loss": 0.011 + }, + { + "epoch": 0.9431696093104712, + "grad_norm": 0.1337890625, + "learning_rate": 1.6331531773576048e-06, + "lm_loss": 1.9437, + "loss": 2.0775, + "mask_loss": 0.1243, + "step": 2373, + "topk_loss": 0.0095 + }, + { + "epoch": 0.9435670680586004, + "grad_norm": 0.11376953125, + "learning_rate": 1.6102937302408972e-06, + "lm_loss": 1.8901, + "loss": 2.0256, + "mask_loss": 0.1254, + "step": 2374, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9439645268067295, + "grad_norm": 0.11767578125, + "learning_rate": 1.587594097375078e-06, + "lm_loss": 1.925, + "loss": 2.0596, + "mask_loss": 0.1254, + "step": 2375, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9443619855548586, + "grad_norm": 0.10888671875, + "learning_rate": 1.5650543156311205e-06, + "lm_loss": 1.9367, + "loss": 2.0719, + "mask_loss": 0.1247, + "step": 2376, + "topk_loss": 0.0105 + }, + { + "epoch": 0.9447594443029877, + "grad_norm": 0.1552734375, + "learning_rate": 1.5426744216203493e-06, + "lm_loss": 1.899, + "loss": 2.0373, + "mask_loss": 0.1273, + "step": 2377, + "topk_loss": 0.011 + }, + { + "epoch": 0.9451569030511169, + "grad_norm": 0.142578125, + "learning_rate": 1.5204544516944198e-06, + "lm_loss": 1.8555, + "loss": 1.9928, + "mask_loss": 0.1273, + "step": 2378, + "topk_loss": 0.01 + }, + { + "epoch": 0.9455543617992461, + "grad_norm": 0.11376953125, + "learning_rate": 1.4983944419451613e-06, + "lm_loss": 1.9303, + "loss": 2.0642, + "mask_loss": 0.1238, + "step": 2379, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9459518205473753, + "grad_norm": 0.1591796875, + "learning_rate": 1.4764944282046445e-06, + "lm_loss": 1.8519, + "loss": 1.9877, + "mask_loss": 0.1254, + "step": 2380, + "topk_loss": 0.0105 + }, + { + "epoch": 0.9463492792955044, + "grad_norm": 0.11474609375, + "learning_rate": 1.4547544460450035e-06, + "lm_loss": 1.8738, + "loss": 2.01, + "mask_loss": 0.1255, + "step": 2381, + "topk_loss": 0.0108 + }, + { + "epoch": 0.9467467380436335, + "grad_norm": 0.115234375, + "learning_rate": 1.4331745307784805e-06, + "lm_loss": 1.8965, + "loss": 2.0315, + "mask_loss": 0.1252, + "step": 2382, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9471441967917626, + "grad_norm": 0.1103515625, + "learning_rate": 1.411754717457292e-06, + "lm_loss": 1.8521, + "loss": 1.9849, + "mask_loss": 0.1239, + "step": 2383, + "topk_loss": 0.0089 + }, + { + "epoch": 0.9475416555398918, + "grad_norm": 0.1103515625, + "learning_rate": 1.3904950408735962e-06, + "lm_loss": 1.8487, + "loss": 1.9845, + "mask_loss": 0.1262, + "step": 2384, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9479391142880209, + "grad_norm": 0.11181640625, + "learning_rate": 1.36939553555947e-06, + "lm_loss": 1.9358, + "loss": 2.0704, + "mask_loss": 0.1248, + "step": 2385, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9483365730361502, + "grad_norm": 0.1103515625, + "learning_rate": 1.3484562357867992e-06, + "lm_loss": 1.8867, + "loss": 2.021, + "mask_loss": 0.1248, + "step": 2386, + "topk_loss": 0.0095 + }, + { + "epoch": 0.9487340317842793, + "grad_norm": 0.44140625, + "learning_rate": 1.3276771755672545e-06, + "lm_loss": 1.9017, + "loss": 2.1053, + "mask_loss": 0.1622, + "step": 2387, + "topk_loss": 0.0414 + }, + { + "epoch": 0.9491314905324084, + "grad_norm": 0.11181640625, + "learning_rate": 1.307058388652238e-06, + "lm_loss": 1.8856, + "loss": 2.0201, + "mask_loss": 0.1256, + "step": 2388, + "topk_loss": 0.009 + }, + { + "epoch": 0.9495289492805375, + "grad_norm": 0.119140625, + "learning_rate": 1.286599908532815e-06, + "lm_loss": 1.8799, + "loss": 2.014, + "mask_loss": 0.1247, + "step": 2389, + "topk_loss": 0.0093 + }, + { + "epoch": 0.9499264080286667, + "grad_norm": 0.1494140625, + "learning_rate": 1.2663017684396593e-06, + "lm_loss": 1.8059, + "loss": 1.9435, + "mask_loss": 0.1279, + "step": 2390, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9503238667767958, + "grad_norm": 0.1181640625, + "learning_rate": 1.2461640013430087e-06, + "lm_loss": 1.9366, + "loss": 2.0719, + "mask_loss": 0.1246, + "step": 2391, + "topk_loss": 0.0107 + }, + { + "epoch": 0.950721325524925, + "grad_norm": 0.12353515625, + "learning_rate": 1.22618663995262e-06, + "lm_loss": 1.8743, + "loss": 2.0077, + "mask_loss": 0.1249, + "step": 2392, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9511187842730542, + "grad_norm": 0.2412109375, + "learning_rate": 1.20636971671767e-06, + "lm_loss": 1.888, + "loss": 2.0355, + "mask_loss": 0.1307, + "step": 2393, + "topk_loss": 0.0168 + }, + { + "epoch": 0.9515162430211833, + "grad_norm": 0.1171875, + "learning_rate": 1.1867132638267664e-06, + "lm_loss": 1.8891, + "loss": 2.0248, + "mask_loss": 0.1261, + "step": 2394, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9519137017693124, + "grad_norm": 0.1435546875, + "learning_rate": 1.167217313207858e-06, + "lm_loss": 1.8765, + "loss": 2.0168, + "mask_loss": 0.1266, + "step": 2395, + "topk_loss": 0.0137 + }, + { + "epoch": 0.9523111605174416, + "grad_norm": 0.1337890625, + "learning_rate": 1.1478818965281911e-06, + "lm_loss": 1.9205, + "loss": 2.0549, + "mask_loss": 0.1258, + "step": 2396, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9527086192655707, + "grad_norm": 0.1162109375, + "learning_rate": 1.1287070451942438e-06, + "lm_loss": 1.9174, + "loss": 2.0505, + "mask_loss": 0.1241, + "step": 2397, + "topk_loss": 0.009 + }, + { + "epoch": 0.9531060780137, + "grad_norm": 0.10791015625, + "learning_rate": 1.109692790351713e-06, + "lm_loss": 1.9071, + "loss": 2.0398, + "mask_loss": 0.1243, + "step": 2398, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9535035367618291, + "grad_norm": 0.11962890625, + "learning_rate": 1.0908391628854041e-06, + "lm_loss": 1.8462, + "loss": 1.9832, + "mask_loss": 0.1274, + "step": 2399, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9539009955099582, + "grad_norm": 0.11865234375, + "learning_rate": 1.0721461934192545e-06, + "lm_loss": 1.9195, + "loss": 2.0519, + "mask_loss": 0.1236, + "step": 2400, + "topk_loss": 0.0087 + }, + { + "epoch": 0.9539009955099582, + "eval_lm_loss": 687.3690795898438, + "eval_loss": 687.5060424804688, + "eval_mask_hit_rate": 0.5359731912612915, + "eval_mask_loss": 0.12440052628517151, + "eval_mask_top_10_hit_rate": 0.9856013059616089, + "eval_mask_top_1_hit_rate": 0.997429609298706, + "eval_mask_top_20_hit_rate": 0.9762364625930786, + "eval_mask_top_5_hit_rate": 0.990867018699646, + "eval_runtime": 144.1892, + "eval_samples_per_second": 14.204, + "eval_steps_per_second": 7.102, + "eval_token_accuracy": 0.6146031618118286, + "eval_top_k_diff": -530.1141357421875, + "eval_topk_loss": 0.012541300617158413, + "step": 2400 + }, + { + "epoch": 0.9542984542580873, + "grad_norm": 0.11572265625, + "learning_rate": 1.0536139123162093e-06, + "lm_loss": 1.8699, + "loss": 2.0035, + "mask_loss": 0.1253, + "step": 2401, + "topk_loss": 0.0083 + }, + { + "epoch": 0.9546959130062165, + "grad_norm": 0.1162109375, + "learning_rate": 1.035242349678245e-06, + "lm_loss": 1.8848, + "loss": 2.0193, + "mask_loss": 0.1261, + "step": 2402, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9550933717543456, + "grad_norm": 0.1318359375, + "learning_rate": 1.0170315353462466e-06, + "lm_loss": 1.9585, + "loss": 2.0925, + "mask_loss": 0.1241, + "step": 2403, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9554908305024749, + "grad_norm": 0.12109375, + "learning_rate": 9.9898149890002e-07, + "lm_loss": 1.9355, + "loss": 2.0712, + "mask_loss": 0.1262, + "step": 2404, + "topk_loss": 0.0096 + }, + { + "epoch": 0.955888289250604, + "grad_norm": 0.1259765625, + "learning_rate": 9.810922696582014e-07, + "lm_loss": 1.9173, + "loss": 2.0542, + "mask_loss": 0.1258, + "step": 2405, + "topk_loss": 0.011 + }, + { + "epoch": 0.9562857479987331, + "grad_norm": 0.11474609375, + "learning_rate": 9.633638766782582e-07, + "lm_loss": 1.9092, + "loss": 2.044, + "mask_loss": 0.1253, + "step": 2406, + "topk_loss": 0.0095 + }, + { + "epoch": 0.9566832067468622, + "grad_norm": 0.11572265625, + "learning_rate": 9.457963487563781e-07, + "lm_loss": 1.8532, + "loss": 1.9889, + "mask_loss": 0.1254, + "step": 2407, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9570806654949914, + "grad_norm": 0.1162109375, + "learning_rate": 9.283897144274689e-07, + "lm_loss": 1.8861, + "loss": 2.0216, + "mask_loss": 0.1258, + "step": 2408, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9574781242431205, + "grad_norm": 0.126953125, + "learning_rate": 9.11144001965103e-07, + "lm_loss": 1.881, + "loss": 2.0185, + "mask_loss": 0.1271, + "step": 2409, + "topk_loss": 0.0104 + }, + { + "epoch": 0.9578755829912498, + "grad_norm": 0.1240234375, + "learning_rate": 8.940592393814728e-07, + "lm_loss": 1.8996, + "loss": 2.0353, + "mask_loss": 0.1259, + "step": 2410, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9582730417393789, + "grad_norm": 0.1162109375, + "learning_rate": 8.771354544273247e-07, + "lm_loss": 1.8675, + "loss": 2.0009, + "mask_loss": 0.1237, + "step": 2411, + "topk_loss": 0.0096 + }, + { + "epoch": 0.958670500487508, + "grad_norm": 0.15625, + "learning_rate": 8.603726745919361e-07, + "lm_loss": 1.8671, + "loss": 2.0062, + "mask_loss": 0.129, + "step": 2412, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9590679592356371, + "grad_norm": 0.11669921875, + "learning_rate": 8.437709271030603e-07, + "lm_loss": 1.9812, + "loss": 2.1134, + "mask_loss": 0.1229, + "step": 2413, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9594654179837663, + "grad_norm": 0.11083984375, + "learning_rate": 8.273302389269044e-07, + "lm_loss": 1.9474, + "loss": 2.0823, + "mask_loss": 0.1258, + "step": 2414, + "topk_loss": 0.0091 + }, + { + "epoch": 0.9598628767318954, + "grad_norm": 0.10595703125, + "learning_rate": 8.110506367680515e-07, + "lm_loss": 1.845, + "loss": 1.979, + "mask_loss": 0.1244, + "step": 2415, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9602603354800245, + "grad_norm": 0.11669921875, + "learning_rate": 7.94932147069416e-07, + "lm_loss": 1.8422, + "loss": 1.9828, + "mask_loss": 0.1291, + "step": 2416, + "topk_loss": 0.0115 + }, + { + "epoch": 0.9606577942281538, + "grad_norm": 0.11376953125, + "learning_rate": 7.789747960122551e-07, + "lm_loss": 1.874, + "loss": 2.0101, + "mask_loss": 0.1265, + "step": 2417, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9610552529762829, + "grad_norm": 0.11328125, + "learning_rate": 7.631786095160687e-07, + "lm_loss": 1.8978, + "loss": 2.0364, + "mask_loss": 0.128, + "step": 2418, + "topk_loss": 0.0107 + }, + { + "epoch": 0.961452711724412, + "grad_norm": 0.1103515625, + "learning_rate": 7.47543613238566e-07, + "lm_loss": 1.8937, + "loss": 2.0272, + "mask_loss": 0.1238, + "step": 2419, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9618501704725412, + "grad_norm": 0.140625, + "learning_rate": 7.320698325756658e-07, + "lm_loss": 1.8951, + "loss": 2.0322, + "mask_loss": 0.1261, + "step": 2420, + "topk_loss": 0.0109 + }, + { + "epoch": 0.9622476292206703, + "grad_norm": 0.11181640625, + "learning_rate": 7.167572926613853e-07, + "lm_loss": 1.9415, + "loss": 2.0742, + "mask_loss": 0.1237, + "step": 2421, + "topk_loss": 0.009 + }, + { + "epoch": 0.9626450879687994, + "grad_norm": 0.130859375, + "learning_rate": 7.01606018367873e-07, + "lm_loss": 1.9043, + "loss": 2.0436, + "mask_loss": 0.1273, + "step": 2422, + "topk_loss": 0.0121 + }, + { + "epoch": 0.9630425467169287, + "grad_norm": 0.1171875, + "learning_rate": 6.866160343053318e-07, + "lm_loss": 1.8718, + "loss": 2.0095, + "mask_loss": 0.1265, + "step": 2423, + "topk_loss": 0.0112 + }, + { + "epoch": 0.9634400054650578, + "grad_norm": 0.11669921875, + "learning_rate": 6.71787364821952e-07, + "lm_loss": 1.9277, + "loss": 2.0597, + "mask_loss": 0.1235, + "step": 2424, + "topk_loss": 0.0085 + }, + { + "epoch": 0.963837464213187, + "grad_norm": 0.1279296875, + "learning_rate": 6.571200340039218e-07, + "lm_loss": 1.8771, + "loss": 2.0154, + "mask_loss": 0.1264, + "step": 2425, + "topk_loss": 0.0119 + }, + { + "epoch": 0.9642349229613161, + "grad_norm": 0.1201171875, + "learning_rate": 6.426140656753621e-07, + "lm_loss": 1.892, + "loss": 2.0291, + "mask_loss": 0.1251, + "step": 2426, + "topk_loss": 0.0119 + }, + { + "epoch": 0.9646323817094452, + "grad_norm": 0.115234375, + "learning_rate": 6.282694833983138e-07, + "lm_loss": 1.8803, + "loss": 2.0187, + "mask_loss": 0.127, + "step": 2427, + "topk_loss": 0.0114 + }, + { + "epoch": 0.9650298404575743, + "grad_norm": 0.1220703125, + "learning_rate": 6.140863104726391e-07, + "lm_loss": 1.9055, + "loss": 2.045, + "mask_loss": 0.1271, + "step": 2428, + "topk_loss": 0.0124 + }, + { + "epoch": 0.9654272992057036, + "grad_norm": 0.11865234375, + "learning_rate": 6.000645699360541e-07, + "lm_loss": 1.9126, + "loss": 2.0477, + "mask_loss": 0.125, + "step": 2429, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9658247579538327, + "grad_norm": 0.109375, + "learning_rate": 5.862042845640403e-07, + "lm_loss": 1.9581, + "loss": 2.0898, + "mask_loss": 0.1228, + "step": 2430, + "topk_loss": 0.0089 + }, + { + "epoch": 0.9662222167019618, + "grad_norm": 0.185546875, + "learning_rate": 5.725054768698557e-07, + "lm_loss": 1.9026, + "loss": 2.0387, + "mask_loss": 0.1244, + "step": 2431, + "topk_loss": 0.0117 + }, + { + "epoch": 0.966619675450091, + "grad_norm": 0.1142578125, + "learning_rate": 5.589681691044346e-07, + "lm_loss": 1.9114, + "loss": 2.0518, + "mask_loss": 0.1274, + "step": 2432, + "topk_loss": 0.0131 + }, + { + "epoch": 0.9670171341982201, + "grad_norm": 0.11767578125, + "learning_rate": 5.455923832564214e-07, + "lm_loss": 1.8213, + "loss": 1.9585, + "mask_loss": 0.1263, + "step": 2433, + "topk_loss": 0.0109 + }, + { + "epoch": 0.9674145929463492, + "grad_norm": 0.1220703125, + "learning_rate": 5.323781410520812e-07, + "lm_loss": 1.92, + "loss": 2.0552, + "mask_loss": 0.1256, + "step": 2434, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9678120516944785, + "grad_norm": 0.11181640625, + "learning_rate": 5.19325463955278e-07, + "lm_loss": 1.9386, + "loss": 2.0739, + "mask_loss": 0.1254, + "step": 2435, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9682095104426076, + "grad_norm": 0.11572265625, + "learning_rate": 5.064343731674637e-07, + "lm_loss": 1.8839, + "loss": 2.0184, + "mask_loss": 0.1259, + "step": 2436, + "topk_loss": 0.0086 + }, + { + "epoch": 0.9686069691907367, + "grad_norm": 0.1279296875, + "learning_rate": 4.93704889627622e-07, + "lm_loss": 1.9107, + "loss": 2.0489, + "mask_loss": 0.1263, + "step": 2437, + "topk_loss": 0.0119 + }, + { + "epoch": 0.9690044279388659, + "grad_norm": 0.1279296875, + "learning_rate": 4.811370340122134e-07, + "lm_loss": 1.8769, + "loss": 2.0172, + "mask_loss": 0.1284, + "step": 2438, + "topk_loss": 0.0119 + }, + { + "epoch": 0.969401886686995, + "grad_norm": 0.1298828125, + "learning_rate": 4.6873082673521975e-07, + "lm_loss": 1.8932, + "loss": 2.0289, + "mask_loss": 0.125, + "step": 2439, + "topk_loss": 0.0107 + }, + { + "epoch": 0.9697993454351241, + "grad_norm": 0.11328125, + "learning_rate": 4.564862879479881e-07, + "lm_loss": 1.8464, + "loss": 1.9827, + "mask_loss": 0.1256, + "step": 2440, + "topk_loss": 0.0108 + }, + { + "epoch": 0.9701968041832533, + "grad_norm": 0.146484375, + "learning_rate": 4.4440343753933135e-07, + "lm_loss": 1.8917, + "loss": 2.0308, + "mask_loss": 0.1269, + "step": 2441, + "topk_loss": 0.0122 + }, + { + "epoch": 0.9705942629313825, + "grad_norm": 0.1162109375, + "learning_rate": 4.324822951353946e-07, + "lm_loss": 1.8782, + "loss": 2.0162, + "mask_loss": 0.1272, + "step": 2442, + "topk_loss": 0.0108 + }, + { + "epoch": 0.9709917216795116, + "grad_norm": 0.11572265625, + "learning_rate": 4.2072288009966654e-07, + "lm_loss": 1.8861, + "loss": 2.0231, + "mask_loss": 0.1266, + "step": 2443, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9713891804276408, + "grad_norm": 0.25390625, + "learning_rate": 4.091252115329569e-07, + "lm_loss": 1.8934, + "loss": 2.0477, + "mask_loss": 0.1333, + "step": 2444, + "topk_loss": 0.0211 + }, + { + "epoch": 0.9717866391757699, + "grad_norm": 0.1142578125, + "learning_rate": 3.976893082733413e-07, + "lm_loss": 1.8902, + "loss": 2.0255, + "mask_loss": 0.1259, + "step": 2445, + "topk_loss": 0.0094 + }, + { + "epoch": 0.972184097923899, + "grad_norm": 0.1513671875, + "learning_rate": 3.864151888961387e-07, + "lm_loss": 1.9002, + "loss": 2.0379, + "mask_loss": 0.1258, + "step": 2446, + "topk_loss": 0.0119 + }, + { + "epoch": 0.9725815566720282, + "grad_norm": 0.10498046875, + "learning_rate": 3.7530287171387843e-07, + "lm_loss": 1.8868, + "loss": 2.024, + "mask_loss": 0.1269, + "step": 2447, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9729790154201574, + "grad_norm": 0.12890625, + "learning_rate": 3.6435237477627784e-07, + "lm_loss": 1.8634, + "loss": 1.998, + "mask_loss": 0.1254, + "step": 2448, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9733764741682865, + "grad_norm": 0.1259765625, + "learning_rate": 3.5356371587021987e-07, + "lm_loss": 1.9244, + "loss": 2.0558, + "mask_loss": 0.1229, + "step": 2449, + "topk_loss": 0.0085 + }, + { + "epoch": 0.9737739329164157, + "grad_norm": 0.109375, + "learning_rate": 3.429369125197091e-07, + "lm_loss": 1.9097, + "loss": 2.044, + "mask_loss": 0.1255, + "step": 2450, + "topk_loss": 0.0089 + }, + { + "epoch": 0.9737739329164157, + "eval_lm_loss": 687.3634033203125, + "eval_loss": 687.5003662109375, + "eval_mask_hit_rate": 0.535973310470581, + "eval_mask_loss": 0.12440047413110733, + "eval_mask_top_10_hit_rate": 0.985601544380188, + "eval_mask_top_1_hit_rate": 0.997429609298706, + "eval_mask_top_20_hit_rate": 0.9762364625930786, + "eval_mask_top_5_hit_rate": 0.9908676147460938, + "eval_runtime": 143.7229, + "eval_samples_per_second": 14.25, + "eval_steps_per_second": 7.125, + "eval_token_accuracy": 0.6146076917648315, + "eval_top_k_diff": -530.109619140625, + "eval_topk_loss": 0.012541374191641808, + "step": 2450 + }, + { + "epoch": 0.9741713916645448, + "grad_norm": 0.146484375, + "learning_rate": 3.3247198198583793e-07, + "lm_loss": 1.9452, + "loss": 2.0893, + "mask_loss": 0.1291, + "step": 2451, + "topk_loss": 0.0149 + }, + { + "epoch": 0.9745688504126739, + "grad_norm": 0.12255859375, + "learning_rate": 3.221689412667872e-07, + "lm_loss": 1.8813, + "loss": 2.0189, + "mask_loss": 0.1272, + "step": 2452, + "topk_loss": 0.0104 + }, + { + "epoch": 0.9749663091608031, + "grad_norm": 0.1181640625, + "learning_rate": 3.1202780709775894e-07, + "lm_loss": 1.8595, + "loss": 1.9945, + "mask_loss": 0.1261, + "step": 2453, + "topk_loss": 0.0089 + }, + { + "epoch": 0.9753637679089323, + "grad_norm": 0.1611328125, + "learning_rate": 3.020485959509989e-07, + "lm_loss": 1.8525, + "loss": 1.9929, + "mask_loss": 0.1275, + "step": 2454, + "topk_loss": 0.0129 + }, + { + "epoch": 0.9757612266570614, + "grad_norm": 0.11669921875, + "learning_rate": 2.9223132403570773e-07, + "lm_loss": 1.9263, + "loss": 2.0603, + "mask_loss": 0.1246, + "step": 2455, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9761586854051906, + "grad_norm": 0.1240234375, + "learning_rate": 2.825760072980632e-07, + "lm_loss": 1.9789, + "loss": 2.1153, + "mask_loss": 0.1253, + "step": 2456, + "topk_loss": 0.0112 + }, + { + "epoch": 0.9765561441533197, + "grad_norm": 0.11572265625, + "learning_rate": 2.7308266142119785e-07, + "lm_loss": 1.9255, + "loss": 2.0597, + "mask_loss": 0.1245, + "step": 2457, + "topk_loss": 0.0098 + }, + { + "epoch": 0.9769536029014488, + "grad_norm": 0.126953125, + "learning_rate": 2.637513018251325e-07, + "lm_loss": 1.9166, + "loss": 2.0541, + "mask_loss": 0.1268, + "step": 2458, + "topk_loss": 0.0107 + }, + { + "epoch": 0.977351061649578, + "grad_norm": 0.14453125, + "learning_rate": 2.545819436667651e-07, + "lm_loss": 1.8749, + "loss": 2.0101, + "mask_loss": 0.1256, + "step": 2459, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9777485203977072, + "grad_norm": 0.1171875, + "learning_rate": 2.45574601839893e-07, + "lm_loss": 1.921, + "loss": 2.0566, + "mask_loss": 0.1262, + "step": 2460, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9781459791458363, + "grad_norm": 0.12890625, + "learning_rate": 2.3672929097512396e-07, + "lm_loss": 1.859, + "loss": 1.9979, + "mask_loss": 0.1273, + "step": 2461, + "topk_loss": 0.0116 + }, + { + "epoch": 0.9785434378939655, + "grad_norm": 0.1123046875, + "learning_rate": 2.280460254398764e-07, + "lm_loss": 1.8034, + "loss": 1.9411, + "mask_loss": 0.1275, + "step": 2462, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9789408966420946, + "grad_norm": 0.12109375, + "learning_rate": 2.19524819338357e-07, + "lm_loss": 1.8752, + "loss": 2.0141, + "mask_loss": 0.1273, + "step": 2463, + "topk_loss": 0.0116 + }, + { + "epoch": 0.9793383553902237, + "grad_norm": 0.11376953125, + "learning_rate": 2.1116568651156076e-07, + "lm_loss": 1.884, + "loss": 2.0205, + "mask_loss": 0.1264, + "step": 2464, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9797358141383529, + "grad_norm": 0.10498046875, + "learning_rate": 2.0296864053721555e-07, + "lm_loss": 1.8769, + "loss": 2.0129, + "mask_loss": 0.1257, + "step": 2465, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9801332728864821, + "grad_norm": 0.11474609375, + "learning_rate": 1.9493369472977086e-07, + "lm_loss": 1.9309, + "loss": 2.0648, + "mask_loss": 0.1241, + "step": 2466, + "topk_loss": 0.0098 + }, + { + "epoch": 0.9805307316346112, + "grad_norm": 0.1103515625, + "learning_rate": 1.8706086214036467e-07, + "lm_loss": 1.9119, + "loss": 2.0473, + "mask_loss": 0.1258, + "step": 2467, + "topk_loss": 0.0097 + }, + { + "epoch": 0.9809281903827404, + "grad_norm": 0.1171875, + "learning_rate": 1.7935015555683444e-07, + "lm_loss": 1.9403, + "loss": 2.0755, + "mask_loss": 0.1246, + "step": 2468, + "topk_loss": 0.0106 + }, + { + "epoch": 0.9813256491308695, + "grad_norm": 0.1123046875, + "learning_rate": 1.718015875036727e-07, + "lm_loss": 1.9041, + "loss": 2.0384, + "mask_loss": 0.1253, + "step": 2469, + "topk_loss": 0.009 + }, + { + "epoch": 0.9817231078789986, + "grad_norm": 0.11474609375, + "learning_rate": 1.6441517024200492e-07, + "lm_loss": 1.8488, + "loss": 1.9795, + "mask_loss": 0.1221, + "step": 2470, + "topk_loss": 0.0086 + }, + { + "epoch": 0.9821205666271278, + "grad_norm": 0.1298828125, + "learning_rate": 1.5719091576957835e-07, + "lm_loss": 1.9321, + "loss": 2.0716, + "mask_loss": 0.1265, + "step": 2471, + "topk_loss": 0.013 + }, + { + "epoch": 0.9825180253752569, + "grad_norm": 0.119140625, + "learning_rate": 1.5012883582073976e-07, + "lm_loss": 1.8822, + "loss": 2.0182, + "mask_loss": 0.1261, + "step": 2472, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9829154841233861, + "grad_norm": 0.119140625, + "learning_rate": 1.4322894186640236e-07, + "lm_loss": 1.9432, + "loss": 2.0766, + "mask_loss": 0.1222, + "step": 2473, + "topk_loss": 0.0112 + }, + { + "epoch": 0.9833129428715153, + "grad_norm": 0.109375, + "learning_rate": 1.3649124511406764e-07, + "lm_loss": 1.9177, + "loss": 2.0534, + "mask_loss": 0.1258, + "step": 2474, + "topk_loss": 0.01 + }, + { + "epoch": 0.9837104016196444, + "grad_norm": 0.1328125, + "learning_rate": 1.299157565077702e-07, + "lm_loss": 1.8753, + "loss": 2.0122, + "mask_loss": 0.1263, + "step": 2475, + "topk_loss": 0.0106 + }, + { + "epoch": 0.9841078603677735, + "grad_norm": 0.1513671875, + "learning_rate": 1.2350248672804433e-07, + "lm_loss": 1.8758, + "loss": 2.0142, + "mask_loss": 0.1289, + "step": 2476, + "topk_loss": 0.0095 + }, + { + "epoch": 0.9845053191159027, + "grad_norm": 0.12255859375, + "learning_rate": 1.1725144619197937e-07, + "lm_loss": 1.9304, + "loss": 2.0647, + "mask_loss": 0.1253, + "step": 2477, + "topk_loss": 0.009 + }, + { + "epoch": 0.9849027778640318, + "grad_norm": 0.11279296875, + "learning_rate": 1.1116264505310891e-07, + "lm_loss": 1.8625, + "loss": 1.9944, + "mask_loss": 0.1231, + "step": 2478, + "topk_loss": 0.0088 + }, + { + "epoch": 0.985300236612161, + "grad_norm": 0.11328125, + "learning_rate": 1.0523609320147732e-07, + "lm_loss": 1.8549, + "loss": 1.9947, + "mask_loss": 0.1293, + "step": 2479, + "topk_loss": 0.0106 + }, + { + "epoch": 0.9856976953602902, + "grad_norm": 0.10546875, + "learning_rate": 9.947180026357305e-08, + "lm_loss": 1.858, + "loss": 1.9912, + "mask_loss": 0.1249, + "step": 2480, + "topk_loss": 0.0084 + }, + { + "epoch": 0.9860951541084193, + "grad_norm": 0.1103515625, + "learning_rate": 9.386977560232879e-08, + "lm_loss": 1.8755, + "loss": 2.0128, + "mask_loss": 0.1277, + "step": 2481, + "topk_loss": 0.0096 + }, + { + "epoch": 0.9864926128565484, + "grad_norm": 0.109375, + "learning_rate": 8.843002831709912e-08, + "lm_loss": 1.8747, + "loss": 2.0086, + "mask_loss": 0.1244, + "step": 2482, + "topk_loss": 0.0095 + }, + { + "epoch": 0.9868900716046776, + "grad_norm": 0.12353515625, + "learning_rate": 8.31525672436606e-08, + "lm_loss": 1.9447, + "loss": 2.0818, + "mask_loss": 0.1265, + "step": 2483, + "topk_loss": 0.0106 + }, + { + "epoch": 0.9872875303528067, + "grad_norm": 0.12060546875, + "learning_rate": 7.803740095417844e-08, + "lm_loss": 1.8844, + "loss": 2.0219, + "mask_loss": 0.1271, + "step": 2484, + "topk_loss": 0.0104 + }, + { + "epoch": 0.9876849891009359, + "grad_norm": 0.1181640625, + "learning_rate": 7.308453775721758e-08, + "lm_loss": 1.953, + "loss": 2.0876, + "mask_loss": 0.1245, + "step": 2485, + "topk_loss": 0.01 + }, + { + "epoch": 0.9880824478490651, + "grad_norm": 0.107421875, + "learning_rate": 6.829398569770939e-08, + "lm_loss": 1.8718, + "loss": 2.0061, + "mask_loss": 0.125, + "step": 2486, + "topk_loss": 0.0093 + }, + { + "epoch": 0.9884799065971942, + "grad_norm": 0.10498046875, + "learning_rate": 6.366575255694062e-08, + "lm_loss": 1.8023, + "loss": 1.9392, + "mask_loss": 0.127, + "step": 2487, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9888773653453233, + "grad_norm": 0.11669921875, + "learning_rate": 5.919984585253113e-08, + "lm_loss": 1.8762, + "loss": 2.0138, + "mask_loss": 0.1271, + "step": 2488, + "topk_loss": 0.0105 + }, + { + "epoch": 0.9892748240934525, + "grad_norm": 0.11181640625, + "learning_rate": 5.4896272838445004e-08, + "lm_loss": 1.9008, + "loss": 2.0385, + "mask_loss": 0.1282, + "step": 2489, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9896722828415816, + "grad_norm": 0.10986328125, + "learning_rate": 5.075504050499058e-08, + "lm_loss": 1.9343, + "loss": 2.0691, + "mask_loss": 0.1255, + "step": 2490, + "topk_loss": 0.0093 + }, + { + "epoch": 0.9900697415897108, + "grad_norm": 0.11474609375, + "learning_rate": 4.677615557874271e-08, + "lm_loss": 1.8501, + "loss": 1.9853, + "mask_loss": 0.1257, + "step": 2491, + "topk_loss": 0.0094 + }, + { + "epoch": 0.99046720033784, + "grad_norm": 0.1123046875, + "learning_rate": 4.295962452262048e-08, + "lm_loss": 1.8687, + "loss": 2.0074, + "mask_loss": 0.1276, + "step": 2492, + "topk_loss": 0.0111 + }, + { + "epoch": 0.9908646590859691, + "grad_norm": 0.10986328125, + "learning_rate": 3.93054535357873e-08, + "lm_loss": 1.913, + "loss": 2.0455, + "mask_loss": 0.1233, + "step": 2493, + "topk_loss": 0.0092 + }, + { + "epoch": 0.9912621178340982, + "grad_norm": 0.1162109375, + "learning_rate": 3.5813648553717494e-08, + "lm_loss": 1.8775, + "loss": 2.0153, + "mask_loss": 0.1268, + "step": 2494, + "topk_loss": 0.011 + }, + { + "epoch": 0.9916595765822274, + "grad_norm": 0.11474609375, + "learning_rate": 3.2484215248140824e-08, + "lm_loss": 1.9635, + "loss": 2.1013, + "mask_loss": 0.1252, + "step": 2495, + "topk_loss": 0.0126 + }, + { + "epoch": 0.9920570353303565, + "grad_norm": 0.1171875, + "learning_rate": 2.9317159027064666e-08, + "lm_loss": 1.8992, + "loss": 2.0349, + "mask_loss": 0.1254, + "step": 2496, + "topk_loss": 0.0103 + }, + { + "epoch": 0.9924544940784856, + "grad_norm": 0.1123046875, + "learning_rate": 2.6312485034718504e-08, + "lm_loss": 1.9095, + "loss": 2.0447, + "mask_loss": 0.1253, + "step": 2497, + "topk_loss": 0.0099 + }, + { + "epoch": 0.9928519528266149, + "grad_norm": 0.11279296875, + "learning_rate": 2.347019815158724e-08, + "lm_loss": 1.8812, + "loss": 2.0148, + "mask_loss": 0.1252, + "step": 2498, + "topk_loss": 0.0085 + }, + { + "epoch": 0.993249411574744, + "grad_norm": 0.125, + "learning_rate": 2.0790302994411204e-08, + "lm_loss": 1.816, + "loss": 1.957, + "mask_loss": 0.1293, + "step": 2499, + "topk_loss": 0.0118 + }, + { + "epoch": 0.9936468703228731, + "grad_norm": 0.12353515625, + "learning_rate": 1.827280391611952e-08, + "lm_loss": 1.8924, + "loss": 2.0265, + "mask_loss": 0.1254, + "step": 2500, + "topk_loss": 0.0087 + }, + { + "epoch": 0.9936468703228731, + "eval_lm_loss": 687.3634033203125, + "eval_loss": 687.5003662109375, + "eval_mask_hit_rate": 0.535973310470581, + "eval_mask_loss": 0.12440047413110733, + "eval_mask_top_10_hit_rate": 0.9856016039848328, + "eval_mask_top_1_hit_rate": 0.997429609298706, + "eval_mask_top_20_hit_rate": 0.9762364625930786, + "eval_mask_top_5_hit_rate": 0.9908676147460938, + "eval_runtime": 144.2467, + "eval_samples_per_second": 14.198, + "eval_steps_per_second": 7.099, + "eval_token_accuracy": 0.6146076917648315, + "eval_top_k_diff": -530.1094970703125, + "eval_topk_loss": 0.012541375122964382, + "step": 2500 + }, + { + "epoch": 0.9940443290710023, + "grad_norm": 0.109375, + "learning_rate": 1.591770500589673e-08, + "lm_loss": 1.8376, + "loss": 1.9718, + "mask_loss": 0.1253, + "step": 2501, + "topk_loss": 0.0089 + }, + { + "epoch": 0.9944417878191314, + "grad_norm": 0.1162109375, + "learning_rate": 1.3725010089116198e-08, + "lm_loss": 1.8811, + "loss": 2.0195, + "mask_loss": 0.1277, + "step": 2502, + "topk_loss": 0.0107 + }, + { + "epoch": 0.9948392465672605, + "grad_norm": 0.125, + "learning_rate": 1.1694722727384477e-08, + "lm_loss": 1.8401, + "loss": 1.9748, + "mask_loss": 0.126, + "step": 2503, + "topk_loss": 0.0087 + }, + { + "epoch": 0.9952367053153898, + "grad_norm": 0.1376953125, + "learning_rate": 9.82684621847474e-09, + "lm_loss": 1.9323, + "loss": 2.0686, + "mask_loss": 0.1253, + "step": 2504, + "topk_loss": 0.011 + }, + { + "epoch": 0.9956341640635189, + "grad_norm": 0.12890625, + "learning_rate": 8.121383596393362e-09, + "lm_loss": 1.8994, + "loss": 2.0352, + "mask_loss": 0.1258, + "step": 2505, + "topk_loss": 0.0101 + }, + { + "epoch": 0.996031622811648, + "grad_norm": 0.10791015625, + "learning_rate": 6.578337631313325e-09, + "lm_loss": 1.8886, + "loss": 2.0258, + "mask_loss": 0.1274, + "step": 2506, + "topk_loss": 0.0098 + }, + { + "epoch": 0.9964290815597772, + "grad_norm": 0.1142578125, + "learning_rate": 5.197710829596414e-09, + "lm_loss": 1.8999, + "loss": 2.0346, + "mask_loss": 0.1258, + "step": 2507, + "topk_loss": 0.0088 + }, + { + "epoch": 0.9968265403079063, + "grad_norm": 0.11328125, + "learning_rate": 3.9795054337932184e-09, + "lm_loss": 1.8878, + "loss": 2.0225, + "mask_loss": 0.1253, + "step": 2508, + "topk_loss": 0.0094 + }, + { + "epoch": 0.9972239990560354, + "grad_norm": 0.1103515625, + "learning_rate": 2.9237234226431322e-09, + "lm_loss": 1.9018, + "loss": 2.0338, + "mask_loss": 0.1234, + "step": 2509, + "topk_loss": 0.0086 + }, + { + "epoch": 0.9976214578041647, + "grad_norm": 0.11181640625, + "learning_rate": 2.0303665110410484e-09, + "lm_loss": 1.8768, + "loss": 2.0138, + "mask_loss": 0.1267, + "step": 2510, + "topk_loss": 0.0102 + }, + { + "epoch": 0.9980189165522938, + "grad_norm": 0.1318359375, + "learning_rate": 1.2994361500706652e-09, + "lm_loss": 1.8978, + "loss": 2.0396, + "mask_loss": 0.1287, + "step": 2511, + "topk_loss": 0.0131 + }, + { + "epoch": 0.9984163753004229, + "grad_norm": 0.10693359375, + "learning_rate": 7.309335269822804e-10, + "lm_loss": 1.8741, + "loss": 2.0061, + "mask_loss": 0.1239, + "step": 2512, + "topk_loss": 0.008 + }, + { + "epoch": 0.9988138340485521, + "grad_norm": 0.11474609375, + "learning_rate": 3.2485956518168994e-10, + "lm_loss": 1.9016, + "loss": 2.0378, + "mask_loss": 0.1248, + "step": 2513, + "topk_loss": 0.0114 + }, + { + "epoch": 0.9992112927966812, + "grad_norm": 0.115234375, + "learning_rate": 8.121492427459742e-11, + "lm_loss": 1.9225, + "loss": 2.0591, + "mask_loss": 0.1265, + "step": 2514, + "topk_loss": 0.0101 + }, + { + "epoch": 0.9996087515448103, + "grad_norm": 0.111328125, + "learning_rate": 0.0, + "lm_loss": 1.9003, + "loss": 2.0373, + "mask_loss": 0.1264, + "step": 2515, + "topk_loss": 0.0106 + } + ], + "logging_steps": 1, + "max_steps": 2515, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.5540471456590725e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}