diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7631 @@ +{ + "best_metric": 3.5567543506622314, + "best_model_checkpoint": "/home/hpcpudu1/rds/hpc-work/data/pretrain-mds/led_pretrain/ver2/gen_model/Centrum_pretrain_base_batch_16_21-7-22.1/checkpoint-96500", + "epoch": 9.269546276127357, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 3.0000000000000004e-09, + "loss": 5.1238, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 2.97e-07, + "loss": 4.8301, + "step": 100 + }, + { + "epoch": 0.02, + "learning_rate": 5.970000000000001e-07, + "loss": 4.4653, + "step": 200 + }, + { + "epoch": 0.03, + "learning_rate": 8.97e-07, + "loss": 4.2831, + "step": 300 + }, + { + "epoch": 0.04, + "learning_rate": 1.197e-06, + "loss": 4.2033, + "step": 400 + }, + { + "epoch": 0.05, + "learning_rate": 1.497e-06, + "loss": 4.1628, + "step": 500 + }, + { + "epoch": 0.05, + "eval_loss": 4.073188304901123, + "eval_runtime": 23.8328, + "eval_samples_per_second": 105.821, + "eval_steps_per_second": 6.63, + "step": 500 + }, + { + "epoch": 0.06, + "learning_rate": 1.7970000000000001e-06, + "loss": 4.1047, + "step": 600 + }, + { + "epoch": 0.06, + "learning_rate": 2.097e-06, + "loss": 4.1061, + "step": 700 + }, + { + "epoch": 0.07, + "learning_rate": 2.397e-06, + "loss": 4.0666, + "step": 800 + }, + { + "epoch": 0.08, + "learning_rate": 2.6969999999999998e-06, + "loss": 4.05, + "step": 900 + }, + { + "epoch": 0.09, + "learning_rate": 2.9970000000000003e-06, + "loss": 4.0278, + "step": 1000 + }, + { + "epoch": 0.09, + "eval_loss": 3.9799606800079346, + "eval_runtime": 23.808, + "eval_samples_per_second": 105.931, + "eval_steps_per_second": 6.636, + "step": 1000 + }, + { + "epoch": 0.1, + "learning_rate": 3.297e-06, + "loss": 4.0257, + "step": 1100 + }, + { + "epoch": 0.11, + "learning_rate": 3.5970000000000005e-06, + "loss": 4.0151, + "step": 1200 + }, + { + "epoch": 0.12, + "learning_rate": 3.897e-06, + "loss": 3.9934, + "step": 1300 + }, + { + "epoch": 0.13, + "learning_rate": 4.197e-06, + "loss": 4.0018, + "step": 1400 + }, + { + "epoch": 0.14, + "learning_rate": 4.497e-06, + "loss": 4.0008, + "step": 1500 + }, + { + "epoch": 0.14, + "eval_loss": 3.9282920360565186, + "eval_runtime": 23.8216, + "eval_samples_per_second": 105.87, + "eval_steps_per_second": 6.633, + "step": 1500 + }, + { + "epoch": 0.15, + "learning_rate": 4.797e-06, + "loss": 3.98, + "step": 1600 + }, + { + "epoch": 0.16, + "learning_rate": 5.097e-06, + "loss": 3.9486, + "step": 1700 + }, + { + "epoch": 0.17, + "learning_rate": 5.397e-06, + "loss": 3.9664, + "step": 1800 + }, + { + "epoch": 0.18, + "learning_rate": 5.697000000000001e-06, + "loss": 3.9469, + "step": 1900 + }, + { + "epoch": 0.19, + "learning_rate": 5.997e-06, + "loss": 3.9564, + "step": 2000 + }, + { + "epoch": 0.19, + "eval_loss": 3.8940863609313965, + "eval_runtime": 23.785, + "eval_samples_per_second": 106.033, + "eval_steps_per_second": 6.643, + "step": 2000 + }, + { + "epoch": 0.19, + "learning_rate": 6.297e-06, + "loss": 3.9533, + "step": 2100 + }, + { + "epoch": 0.2, + "learning_rate": 6.5970000000000005e-06, + "loss": 3.9263, + "step": 2200 + }, + { + "epoch": 0.21, + "learning_rate": 6.897e-06, + "loss": 3.9351, + "step": 2300 + }, + { + "epoch": 0.22, + "learning_rate": 7.197e-06, + "loss": 3.9339, + "step": 2400 + }, + { + "epoch": 0.23, + "learning_rate": 7.497e-06, + "loss": 3.9193, + "step": 2500 + }, + { + "epoch": 0.23, + "eval_loss": 3.878038167953491, + "eval_runtime": 23.806, + "eval_samples_per_second": 105.94, + "eval_steps_per_second": 6.637, + "step": 2500 + }, + { + "epoch": 0.24, + "learning_rate": 7.797e-06, + "loss": 3.9323, + "step": 2600 + }, + { + "epoch": 0.25, + "learning_rate": 8.096999999999999e-06, + "loss": 3.9102, + "step": 2700 + }, + { + "epoch": 0.26, + "learning_rate": 8.397e-06, + "loss": 3.891, + "step": 2800 + }, + { + "epoch": 0.27, + "learning_rate": 8.694e-06, + "loss": 3.9072, + "step": 2900 + }, + { + "epoch": 0.28, + "learning_rate": 8.994e-06, + "loss": 3.9185, + "step": 3000 + }, + { + "epoch": 0.28, + "eval_loss": 3.8500914573669434, + "eval_runtime": 23.7916, + "eval_samples_per_second": 106.004, + "eval_steps_per_second": 6.641, + "step": 3000 + }, + { + "epoch": 0.29, + "learning_rate": 9.294000000000001e-06, + "loss": 3.8927, + "step": 3100 + }, + { + "epoch": 0.3, + "learning_rate": 9.594e-06, + "loss": 3.8977, + "step": 3200 + }, + { + "epoch": 0.31, + "learning_rate": 9.894e-06, + "loss": 3.8906, + "step": 3300 + }, + { + "epoch": 0.32, + "learning_rate": 1.0194e-05, + "loss": 3.8869, + "step": 3400 + }, + { + "epoch": 0.32, + "learning_rate": 1.0494e-05, + "loss": 3.8881, + "step": 3500 + }, + { + "epoch": 0.32, + "eval_loss": 3.833404779434204, + "eval_runtime": 23.8004, + "eval_samples_per_second": 105.965, + "eval_steps_per_second": 6.639, + "step": 3500 + }, + { + "epoch": 0.33, + "learning_rate": 1.0794e-05, + "loss": 3.8722, + "step": 3600 + }, + { + "epoch": 0.34, + "learning_rate": 1.1094e-05, + "loss": 3.8633, + "step": 3700 + }, + { + "epoch": 0.35, + "learning_rate": 1.1394000000000001e-05, + "loss": 3.8719, + "step": 3800 + }, + { + "epoch": 0.36, + "learning_rate": 1.1694e-05, + "loss": 3.8499, + "step": 3900 + }, + { + "epoch": 0.37, + "learning_rate": 1.1994e-05, + "loss": 3.8869, + "step": 4000 + }, + { + "epoch": 0.37, + "eval_loss": 3.8210906982421875, + "eval_runtime": 23.7934, + "eval_samples_per_second": 105.996, + "eval_steps_per_second": 6.64, + "step": 4000 + }, + { + "epoch": 0.38, + "learning_rate": 1.2294e-05, + "loss": 3.8583, + "step": 4100 + }, + { + "epoch": 0.39, + "learning_rate": 1.2594e-05, + "loss": 3.8594, + "step": 4200 + }, + { + "epoch": 0.4, + "learning_rate": 1.2894e-05, + "loss": 3.8824, + "step": 4300 + }, + { + "epoch": 0.41, + "learning_rate": 1.3194000000000001e-05, + "loss": 3.8901, + "step": 4400 + }, + { + "epoch": 0.42, + "learning_rate": 1.3494e-05, + "loss": 3.876, + "step": 4500 + }, + { + "epoch": 0.42, + "eval_loss": 3.8056981563568115, + "eval_runtime": 23.8047, + "eval_samples_per_second": 105.946, + "eval_steps_per_second": 6.637, + "step": 4500 + }, + { + "epoch": 0.43, + "learning_rate": 1.3794e-05, + "loss": 3.8473, + "step": 4600 + }, + { + "epoch": 0.44, + "learning_rate": 1.4094000000000001e-05, + "loss": 3.8718, + "step": 4700 + }, + { + "epoch": 0.44, + "learning_rate": 1.4394e-05, + "loss": 3.8671, + "step": 4800 + }, + { + "epoch": 0.45, + "learning_rate": 1.4694e-05, + "loss": 3.8396, + "step": 4900 + }, + { + "epoch": 0.46, + "learning_rate": 1.4994e-05, + "loss": 3.8552, + "step": 5000 + }, + { + "epoch": 0.46, + "eval_loss": 3.795402765274048, + "eval_runtime": 23.8098, + "eval_samples_per_second": 105.923, + "eval_steps_per_second": 6.636, + "step": 5000 + }, + { + "epoch": 0.47, + "learning_rate": 1.5294000000000003e-05, + "loss": 3.838, + "step": 5100 + }, + { + "epoch": 0.48, + "learning_rate": 1.5594e-05, + "loss": 3.8231, + "step": 5200 + }, + { + "epoch": 0.49, + "learning_rate": 1.5894e-05, + "loss": 3.8139, + "step": 5300 + }, + { + "epoch": 0.5, + "learning_rate": 1.6193999999999998e-05, + "loss": 3.8249, + "step": 5400 + }, + { + "epoch": 0.51, + "learning_rate": 1.6493999999999998e-05, + "loss": 3.8198, + "step": 5500 + }, + { + "epoch": 0.51, + "eval_loss": 3.7860653400421143, + "eval_runtime": 23.7811, + "eval_samples_per_second": 106.051, + "eval_steps_per_second": 6.644, + "step": 5500 + }, + { + "epoch": 0.52, + "learning_rate": 1.6794e-05, + "loss": 3.8332, + "step": 5600 + }, + { + "epoch": 0.53, + "learning_rate": 1.7094e-05, + "loss": 3.833, + "step": 5700 + }, + { + "epoch": 0.54, + "learning_rate": 1.7394e-05, + "loss": 3.8316, + "step": 5800 + }, + { + "epoch": 0.55, + "learning_rate": 1.7694e-05, + "loss": 3.8073, + "step": 5900 + }, + { + "epoch": 0.56, + "learning_rate": 1.7994e-05, + "loss": 3.8016, + "step": 6000 + }, + { + "epoch": 0.56, + "eval_loss": 3.7749924659729004, + "eval_runtime": 23.805, + "eval_samples_per_second": 105.944, + "eval_steps_per_second": 6.637, + "step": 6000 + }, + { + "epoch": 0.57, + "learning_rate": 1.8294e-05, + "loss": 3.8169, + "step": 6100 + }, + { + "epoch": 0.57, + "learning_rate": 1.8594000000000002e-05, + "loss": 3.8168, + "step": 6200 + }, + { + "epoch": 0.58, + "learning_rate": 1.8894000000000002e-05, + "loss": 3.8178, + "step": 6300 + }, + { + "epoch": 0.59, + "learning_rate": 1.9194000000000003e-05, + "loss": 3.7942, + "step": 6400 + }, + { + "epoch": 0.6, + "learning_rate": 1.9494000000000003e-05, + "loss": 3.8033, + "step": 6500 + }, + { + "epoch": 0.6, + "eval_loss": 3.765125036239624, + "eval_runtime": 23.776, + "eval_samples_per_second": 106.074, + "eval_steps_per_second": 6.645, + "step": 6500 + }, + { + "epoch": 0.61, + "learning_rate": 1.9794e-05, + "loss": 3.7867, + "step": 6600 + }, + { + "epoch": 0.62, + "learning_rate": 2.0093999999999998e-05, + "loss": 3.7787, + "step": 6700 + }, + { + "epoch": 0.63, + "learning_rate": 2.0393999999999998e-05, + "loss": 3.8017, + "step": 6800 + }, + { + "epoch": 0.64, + "learning_rate": 2.0694e-05, + "loss": 3.7995, + "step": 6900 + }, + { + "epoch": 0.65, + "learning_rate": 2.0994e-05, + "loss": 3.7927, + "step": 7000 + }, + { + "epoch": 0.65, + "eval_loss": 3.7527832984924316, + "eval_runtime": 23.795, + "eval_samples_per_second": 105.988, + "eval_steps_per_second": 6.64, + "step": 7000 + }, + { + "epoch": 0.66, + "learning_rate": 2.1294e-05, + "loss": 3.788, + "step": 7100 + }, + { + "epoch": 0.67, + "learning_rate": 2.1594e-05, + "loss": 3.8054, + "step": 7200 + }, + { + "epoch": 0.68, + "learning_rate": 2.1894e-05, + "loss": 3.8069, + "step": 7300 + }, + { + "epoch": 0.69, + "learning_rate": 2.2194e-05, + "loss": 3.7761, + "step": 7400 + }, + { + "epoch": 0.7, + "learning_rate": 2.2494000000000002e-05, + "loss": 3.7978, + "step": 7500 + }, + { + "epoch": 0.7, + "eval_loss": 3.742873191833496, + "eval_runtime": 23.8023, + "eval_samples_per_second": 105.956, + "eval_steps_per_second": 6.638, + "step": 7500 + }, + { + "epoch": 0.7, + "learning_rate": 2.2794000000000002e-05, + "loss": 3.7738, + "step": 7600 + }, + { + "epoch": 0.71, + "learning_rate": 2.3094000000000003e-05, + "loss": 3.7788, + "step": 7700 + }, + { + "epoch": 0.72, + "learning_rate": 2.3394000000000003e-05, + "loss": 3.7823, + "step": 7800 + }, + { + "epoch": 0.73, + "learning_rate": 2.3694e-05, + "loss": 3.7879, + "step": 7900 + }, + { + "epoch": 0.74, + "learning_rate": 2.3993999999999998e-05, + "loss": 3.7727, + "step": 8000 + }, + { + "epoch": 0.74, + "eval_loss": 3.7367067337036133, + "eval_runtime": 23.7892, + "eval_samples_per_second": 106.014, + "eval_steps_per_second": 6.642, + "step": 8000 + }, + { + "epoch": 0.75, + "learning_rate": 2.4293999999999998e-05, + "loss": 3.7747, + "step": 8100 + }, + { + "epoch": 0.76, + "learning_rate": 2.4594e-05, + "loss": 3.7716, + "step": 8200 + }, + { + "epoch": 0.77, + "learning_rate": 2.4894e-05, + "loss": 3.765, + "step": 8300 + }, + { + "epoch": 0.78, + "learning_rate": 2.5191e-05, + "loss": 3.7819, + "step": 8400 + }, + { + "epoch": 0.79, + "learning_rate": 2.5491e-05, + "loss": 3.7634, + "step": 8500 + }, + { + "epoch": 0.79, + "eval_loss": 3.7275290489196777, + "eval_runtime": 23.7785, + "eval_samples_per_second": 106.062, + "eval_steps_per_second": 6.645, + "step": 8500 + }, + { + "epoch": 0.8, + "learning_rate": 2.5791e-05, + "loss": 3.7757, + "step": 8600 + }, + { + "epoch": 0.81, + "learning_rate": 2.6091e-05, + "loss": 3.7559, + "step": 8700 + }, + { + "epoch": 0.82, + "learning_rate": 2.6391000000000002e-05, + "loss": 3.7709, + "step": 8800 + }, + { + "epoch": 0.82, + "learning_rate": 2.6688e-05, + "loss": 3.7626, + "step": 8900 + }, + { + "epoch": 0.83, + "learning_rate": 2.6985e-05, + "loss": 3.7395, + "step": 9000 + }, + { + "epoch": 0.83, + "eval_loss": 3.715789318084717, + "eval_runtime": 23.7885, + "eval_samples_per_second": 106.018, + "eval_steps_per_second": 6.642, + "step": 9000 + }, + { + "epoch": 0.84, + "learning_rate": 2.7285e-05, + "loss": 3.7773, + "step": 9100 + }, + { + "epoch": 0.85, + "learning_rate": 2.7585e-05, + "loss": 3.7273, + "step": 9200 + }, + { + "epoch": 0.86, + "learning_rate": 2.7885e-05, + "loss": 3.7598, + "step": 9300 + }, + { + "epoch": 0.87, + "learning_rate": 2.8185000000000002e-05, + "loss": 3.7531, + "step": 9400 + }, + { + "epoch": 0.88, + "learning_rate": 2.8485000000000003e-05, + "loss": 3.7432, + "step": 9500 + }, + { + "epoch": 0.88, + "eval_loss": 3.706634283065796, + "eval_runtime": 23.7692, + "eval_samples_per_second": 106.103, + "eval_steps_per_second": 6.647, + "step": 9500 + }, + { + "epoch": 0.89, + "learning_rate": 2.8785e-05, + "loss": 3.7297, + "step": 9600 + }, + { + "epoch": 0.9, + "learning_rate": 2.9085e-05, + "loss": 3.7366, + "step": 9700 + }, + { + "epoch": 0.91, + "learning_rate": 2.9385e-05, + "loss": 3.7179, + "step": 9800 + }, + { + "epoch": 0.92, + "learning_rate": 2.9685e-05, + "loss": 3.7166, + "step": 9900 + }, + { + "epoch": 0.93, + "learning_rate": 2.9985000000000002e-05, + "loss": 3.7623, + "step": 10000 + }, + { + "epoch": 0.93, + "eval_loss": 3.7039339542388916, + "eval_runtime": 23.7947, + "eval_samples_per_second": 105.99, + "eval_steps_per_second": 6.64, + "step": 10000 + }, + { + "epoch": 0.94, + "learning_rate": 2.9968333333333332e-05, + "loss": 3.7351, + "step": 10100 + }, + { + "epoch": 0.95, + "learning_rate": 2.9935e-05, + "loss": 3.7235, + "step": 10200 + }, + { + "epoch": 0.95, + "learning_rate": 2.9901666666666665e-05, + "loss": 3.7207, + "step": 10300 + }, + { + "epoch": 0.96, + "learning_rate": 2.9868333333333333e-05, + "loss": 3.7299, + "step": 10400 + }, + { + "epoch": 0.97, + "learning_rate": 2.9835e-05, + "loss": 3.7182, + "step": 10500 + }, + { + "epoch": 0.97, + "eval_loss": 3.6904149055480957, + "eval_runtime": 23.8525, + "eval_samples_per_second": 105.733, + "eval_steps_per_second": 6.624, + "step": 10500 + }, + { + "epoch": 0.98, + "learning_rate": 2.9801666666666666e-05, + "loss": 3.7165, + "step": 10600 + }, + { + "epoch": 0.99, + "learning_rate": 2.9768333333333334e-05, + "loss": 3.7112, + "step": 10700 + }, + { + "epoch": 1.0, + "learning_rate": 2.9735e-05, + "loss": 3.7438, + "step": 10800 + }, + { + "epoch": 1.01, + "learning_rate": 2.9701666666666667e-05, + "loss": 3.6832, + "step": 10900 + }, + { + "epoch": 1.02, + "learning_rate": 2.9668333333333332e-05, + "loss": 3.7146, + "step": 11000 + }, + { + "epoch": 1.02, + "eval_loss": 3.6881186962127686, + "eval_runtime": 23.7931, + "eval_samples_per_second": 105.997, + "eval_steps_per_second": 6.641, + "step": 11000 + }, + { + "epoch": 1.03, + "learning_rate": 2.9635e-05, + "loss": 3.6748, + "step": 11100 + }, + { + "epoch": 1.04, + "learning_rate": 2.9601666666666665e-05, + "loss": 3.6982, + "step": 11200 + }, + { + "epoch": 1.05, + "learning_rate": 2.9568333333333333e-05, + "loss": 3.7015, + "step": 11300 + }, + { + "epoch": 1.06, + "learning_rate": 2.9535e-05, + "loss": 3.6715, + "step": 11400 + }, + { + "epoch": 1.07, + "learning_rate": 2.9501666666666666e-05, + "loss": 3.681, + "step": 11500 + }, + { + "epoch": 1.07, + "eval_loss": 3.679689407348633, + "eval_runtime": 23.7803, + "eval_samples_per_second": 106.054, + "eval_steps_per_second": 6.644, + "step": 11500 + }, + { + "epoch": 1.08, + "learning_rate": 2.9468333333333334e-05, + "loss": 3.6938, + "step": 11600 + }, + { + "epoch": 1.08, + "learning_rate": 2.9435e-05, + "loss": 3.6871, + "step": 11700 + }, + { + "epoch": 1.09, + "learning_rate": 2.9401666666666667e-05, + "loss": 3.6833, + "step": 11800 + }, + { + "epoch": 1.1, + "learning_rate": 2.936833333333333e-05, + "loss": 3.6986, + "step": 11900 + }, + { + "epoch": 1.11, + "learning_rate": 2.9335e-05, + "loss": 3.6745, + "step": 12000 + }, + { + "epoch": 1.11, + "eval_loss": 3.674957275390625, + "eval_runtime": 23.7927, + "eval_samples_per_second": 105.999, + "eval_steps_per_second": 6.641, + "step": 12000 + }, + { + "epoch": 1.12, + "learning_rate": 2.9301666666666668e-05, + "loss": 3.666, + "step": 12100 + }, + { + "epoch": 1.13, + "learning_rate": 2.9268333333333332e-05, + "loss": 3.6803, + "step": 12200 + }, + { + "epoch": 1.14, + "learning_rate": 2.9235e-05, + "loss": 3.6671, + "step": 12300 + }, + { + "epoch": 1.15, + "learning_rate": 2.9201666666666665e-05, + "loss": 3.6783, + "step": 12400 + }, + { + "epoch": 1.16, + "learning_rate": 2.9168333333333333e-05, + "loss": 3.6794, + "step": 12500 + }, + { + "epoch": 1.16, + "eval_loss": 3.6748008728027344, + "eval_runtime": 23.8003, + "eval_samples_per_second": 105.965, + "eval_steps_per_second": 6.639, + "step": 12500 + }, + { + "epoch": 1.17, + "learning_rate": 2.9134999999999998e-05, + "loss": 3.69, + "step": 12600 + }, + { + "epoch": 1.18, + "learning_rate": 2.9101666666666666e-05, + "loss": 3.6674, + "step": 12700 + }, + { + "epoch": 1.19, + "learning_rate": 2.9068333333333334e-05, + "loss": 3.647, + "step": 12800 + }, + { + "epoch": 1.2, + "learning_rate": 2.9035e-05, + "loss": 3.6818, + "step": 12900 + }, + { + "epoch": 1.21, + "learning_rate": 2.9001666666666667e-05, + "loss": 3.6802, + "step": 13000 + }, + { + "epoch": 1.21, + "eval_loss": 3.6695618629455566, + "eval_runtime": 23.7884, + "eval_samples_per_second": 106.018, + "eval_steps_per_second": 6.642, + "step": 13000 + }, + { + "epoch": 1.21, + "learning_rate": 2.896866666666667e-05, + "loss": 3.6639, + "step": 13100 + }, + { + "epoch": 1.22, + "learning_rate": 2.8935333333333334e-05, + "loss": 3.6644, + "step": 13200 + }, + { + "epoch": 1.23, + "learning_rate": 2.8902000000000002e-05, + "loss": 3.6877, + "step": 13300 + }, + { + "epoch": 1.24, + "learning_rate": 2.8868666666666667e-05, + "loss": 3.6686, + "step": 13400 + }, + { + "epoch": 1.25, + "learning_rate": 2.8835333333333335e-05, + "loss": 3.665, + "step": 13500 + }, + { + "epoch": 1.25, + "eval_loss": 3.6609461307525635, + "eval_runtime": 23.7865, + "eval_samples_per_second": 106.027, + "eval_steps_per_second": 6.642, + "step": 13500 + }, + { + "epoch": 1.26, + "learning_rate": 2.8802e-05, + "loss": 3.6411, + "step": 13600 + }, + { + "epoch": 1.27, + "learning_rate": 2.8768666666666668e-05, + "loss": 3.6634, + "step": 13700 + }, + { + "epoch": 1.28, + "learning_rate": 2.8735333333333336e-05, + "loss": 3.6865, + "step": 13800 + }, + { + "epoch": 1.29, + "learning_rate": 2.8702e-05, + "loss": 3.6675, + "step": 13900 + }, + { + "epoch": 1.3, + "learning_rate": 2.866866666666667e-05, + "loss": 3.6516, + "step": 14000 + }, + { + "epoch": 1.3, + "eval_loss": 3.663334608078003, + "eval_runtime": 23.7962, + "eval_samples_per_second": 105.983, + "eval_steps_per_second": 6.64, + "step": 14000 + }, + { + "epoch": 1.31, + "learning_rate": 2.8635333333333333e-05, + "loss": 3.6612, + "step": 14100 + }, + { + "epoch": 1.32, + "learning_rate": 2.8602e-05, + "loss": 3.68, + "step": 14200 + }, + { + "epoch": 1.33, + "learning_rate": 2.8568666666666666e-05, + "loss": 3.6674, + "step": 14300 + }, + { + "epoch": 1.33, + "learning_rate": 2.8535333333333334e-05, + "loss": 3.6875, + "step": 14400 + }, + { + "epoch": 1.34, + "learning_rate": 2.8502e-05, + "loss": 3.6577, + "step": 14500 + }, + { + "epoch": 1.34, + "eval_loss": 3.657306432723999, + "eval_runtime": 23.7877, + "eval_samples_per_second": 106.021, + "eval_steps_per_second": 6.642, + "step": 14500 + }, + { + "epoch": 1.35, + "learning_rate": 2.8468666666666667e-05, + "loss": 3.6649, + "step": 14600 + }, + { + "epoch": 1.36, + "learning_rate": 2.8435333333333335e-05, + "loss": 3.6539, + "step": 14700 + }, + { + "epoch": 1.37, + "learning_rate": 2.8402e-05, + "loss": 3.6726, + "step": 14800 + }, + { + "epoch": 1.38, + "learning_rate": 2.8368666666666668e-05, + "loss": 3.6542, + "step": 14900 + }, + { + "epoch": 1.39, + "learning_rate": 2.8335333333333333e-05, + "loss": 3.6409, + "step": 15000 + }, + { + "epoch": 1.39, + "eval_loss": 3.6518640518188477, + "eval_runtime": 23.7734, + "eval_samples_per_second": 106.085, + "eval_steps_per_second": 6.646, + "step": 15000 + }, + { + "epoch": 1.4, + "learning_rate": 2.8302e-05, + "loss": 3.6629, + "step": 15100 + }, + { + "epoch": 1.41, + "learning_rate": 2.8268666666666666e-05, + "loss": 3.6467, + "step": 15200 + }, + { + "epoch": 1.42, + "learning_rate": 2.8235333333333334e-05, + "loss": 3.6499, + "step": 15300 + }, + { + "epoch": 1.43, + "learning_rate": 2.8202000000000002e-05, + "loss": 3.6623, + "step": 15400 + }, + { + "epoch": 1.44, + "learning_rate": 2.8168666666666667e-05, + "loss": 3.6691, + "step": 15500 + }, + { + "epoch": 1.44, + "eval_loss": 3.6490323543548584, + "eval_runtime": 23.8031, + "eval_samples_per_second": 105.953, + "eval_steps_per_second": 6.638, + "step": 15500 + }, + { + "epoch": 1.45, + "learning_rate": 2.813566666666667e-05, + "loss": 3.6585, + "step": 15600 + }, + { + "epoch": 1.46, + "learning_rate": 2.8102333333333337e-05, + "loss": 3.6582, + "step": 15700 + }, + { + "epoch": 1.46, + "learning_rate": 2.8069000000000002e-05, + "loss": 3.648, + "step": 15800 + }, + { + "epoch": 1.47, + "learning_rate": 2.803566666666667e-05, + "loss": 3.6484, + "step": 15900 + }, + { + "epoch": 1.48, + "learning_rate": 2.8002333333333335e-05, + "loss": 3.6521, + "step": 16000 + }, + { + "epoch": 1.48, + "eval_loss": 3.647477865219116, + "eval_runtime": 23.7875, + "eval_samples_per_second": 106.022, + "eval_steps_per_second": 6.642, + "step": 16000 + }, + { + "epoch": 1.49, + "learning_rate": 2.7969000000000003e-05, + "loss": 3.6635, + "step": 16100 + }, + { + "epoch": 1.5, + "learning_rate": 2.7935666666666667e-05, + "loss": 3.6423, + "step": 16200 + }, + { + "epoch": 1.51, + "learning_rate": 2.7902333333333336e-05, + "loss": 3.6403, + "step": 16300 + }, + { + "epoch": 1.52, + "learning_rate": 2.7869000000000004e-05, + "loss": 3.6515, + "step": 16400 + }, + { + "epoch": 1.53, + "learning_rate": 2.783566666666667e-05, + "loss": 3.6435, + "step": 16500 + }, + { + "epoch": 1.53, + "eval_loss": 3.6465208530426025, + "eval_runtime": 23.7862, + "eval_samples_per_second": 106.028, + "eval_steps_per_second": 6.643, + "step": 16500 + }, + { + "epoch": 1.54, + "learning_rate": 2.7802333333333337e-05, + "loss": 3.667, + "step": 16600 + }, + { + "epoch": 1.55, + "learning_rate": 2.7769e-05, + "loss": 3.6313, + "step": 16700 + }, + { + "epoch": 1.56, + "learning_rate": 2.773566666666667e-05, + "loss": 3.6229, + "step": 16800 + }, + { + "epoch": 1.57, + "learning_rate": 2.7702333333333334e-05, + "loss": 3.6347, + "step": 16900 + }, + { + "epoch": 1.58, + "learning_rate": 2.7669000000000002e-05, + "loss": 3.6466, + "step": 17000 + }, + { + "epoch": 1.58, + "eval_loss": 3.6391589641571045, + "eval_runtime": 23.7988, + "eval_samples_per_second": 105.972, + "eval_steps_per_second": 6.639, + "step": 17000 + }, + { + "epoch": 1.59, + "learning_rate": 2.7635666666666667e-05, + "loss": 3.6402, + "step": 17100 + }, + { + "epoch": 1.59, + "learning_rate": 2.7602333333333335e-05, + "loss": 3.6356, + "step": 17200 + }, + { + "epoch": 1.6, + "learning_rate": 2.7569000000000003e-05, + "loss": 3.6462, + "step": 17300 + }, + { + "epoch": 1.61, + "learning_rate": 2.7535666666666668e-05, + "loss": 3.6613, + "step": 17400 + }, + { + "epoch": 1.62, + "learning_rate": 2.7502333333333336e-05, + "loss": 3.644, + "step": 17500 + }, + { + "epoch": 1.62, + "eval_loss": 3.641881227493286, + "eval_runtime": 23.9969, + "eval_samples_per_second": 105.097, + "eval_steps_per_second": 6.584, + "step": 17500 + }, + { + "epoch": 1.63, + "learning_rate": 2.7469e-05, + "loss": 3.641, + "step": 17600 + }, + { + "epoch": 1.64, + "learning_rate": 2.743566666666667e-05, + "loss": 3.6439, + "step": 17700 + }, + { + "epoch": 1.65, + "learning_rate": 2.7402333333333334e-05, + "loss": 3.642, + "step": 17800 + }, + { + "epoch": 1.66, + "learning_rate": 2.7369000000000002e-05, + "loss": 3.6524, + "step": 17900 + }, + { + "epoch": 1.67, + "learning_rate": 2.733566666666667e-05, + "loss": 3.6347, + "step": 18000 + }, + { + "epoch": 1.67, + "eval_loss": 3.6347177028656006, + "eval_runtime": 23.7827, + "eval_samples_per_second": 106.044, + "eval_steps_per_second": 6.643, + "step": 18000 + }, + { + "epoch": 1.68, + "learning_rate": 2.7302333333333335e-05, + "loss": 3.6091, + "step": 18100 + }, + { + "epoch": 1.69, + "learning_rate": 2.7269000000000003e-05, + "loss": 3.6392, + "step": 18200 + }, + { + "epoch": 1.7, + "learning_rate": 2.7235666666666667e-05, + "loss": 3.6385, + "step": 18300 + }, + { + "epoch": 1.71, + "learning_rate": 2.7202333333333336e-05, + "loss": 3.6259, + "step": 18400 + }, + { + "epoch": 1.71, + "learning_rate": 2.7169e-05, + "loss": 3.6205, + "step": 18500 + }, + { + "epoch": 1.71, + "eval_loss": 3.6328201293945312, + "eval_runtime": 23.7927, + "eval_samples_per_second": 105.999, + "eval_steps_per_second": 6.641, + "step": 18500 + }, + { + "epoch": 1.72, + "learning_rate": 2.713566666666667e-05, + "loss": 3.6171, + "step": 18600 + }, + { + "epoch": 1.73, + "learning_rate": 2.7102333333333336e-05, + "loss": 3.62, + "step": 18700 + }, + { + "epoch": 1.74, + "learning_rate": 2.7069e-05, + "loss": 3.6423, + "step": 18800 + }, + { + "epoch": 1.75, + "learning_rate": 2.703566666666667e-05, + "loss": 3.6225, + "step": 18900 + }, + { + "epoch": 1.76, + "learning_rate": 2.7002333333333334e-05, + "loss": 3.6451, + "step": 19000 + }, + { + "epoch": 1.76, + "eval_loss": 3.630978584289551, + "eval_runtime": 23.7828, + "eval_samples_per_second": 106.043, + "eval_steps_per_second": 6.643, + "step": 19000 + }, + { + "epoch": 1.77, + "learning_rate": 2.6969000000000002e-05, + "loss": 3.6364, + "step": 19100 + }, + { + "epoch": 1.78, + "learning_rate": 2.6936e-05, + "loss": 3.6305, + "step": 19200 + }, + { + "epoch": 1.79, + "learning_rate": 2.6902666666666666e-05, + "loss": 3.6389, + "step": 19300 + }, + { + "epoch": 1.8, + "learning_rate": 2.6869333333333334e-05, + "loss": 3.6359, + "step": 19400 + }, + { + "epoch": 1.81, + "learning_rate": 2.6836e-05, + "loss": 3.6327, + "step": 19500 + }, + { + "epoch": 1.81, + "eval_loss": 3.6284360885620117, + "eval_runtime": 23.7713, + "eval_samples_per_second": 106.094, + "eval_steps_per_second": 6.647, + "step": 19500 + }, + { + "epoch": 1.82, + "learning_rate": 2.6802666666666667e-05, + "loss": 3.6089, + "step": 19600 + }, + { + "epoch": 1.83, + "learning_rate": 2.6769333333333335e-05, + "loss": 3.6254, + "step": 19700 + }, + { + "epoch": 1.84, + "learning_rate": 2.6736e-05, + "loss": 3.6378, + "step": 19800 + }, + { + "epoch": 1.84, + "learning_rate": 2.6702666666666668e-05, + "loss": 3.6374, + "step": 19900 + }, + { + "epoch": 1.85, + "learning_rate": 2.6669333333333332e-05, + "loss": 3.6166, + "step": 20000 + }, + { + "epoch": 1.85, + "eval_loss": 3.6267168521881104, + "eval_runtime": 23.7269, + "eval_samples_per_second": 106.293, + "eval_steps_per_second": 6.659, + "step": 20000 + }, + { + "epoch": 1.86, + "learning_rate": 2.6636e-05, + "loss": 3.6233, + "step": 20100 + }, + { + "epoch": 1.87, + "learning_rate": 2.6602666666666665e-05, + "loss": 3.6276, + "step": 20200 + }, + { + "epoch": 1.88, + "learning_rate": 2.6569333333333333e-05, + "loss": 3.64, + "step": 20300 + }, + { + "epoch": 1.89, + "learning_rate": 2.6535999999999998e-05, + "loss": 3.6328, + "step": 20400 + }, + { + "epoch": 1.9, + "learning_rate": 2.6502666666666666e-05, + "loss": 3.622, + "step": 20500 + }, + { + "epoch": 1.9, + "eval_loss": 3.621175765991211, + "eval_runtime": 23.806, + "eval_samples_per_second": 105.94, + "eval_steps_per_second": 6.637, + "step": 20500 + }, + { + "epoch": 1.91, + "learning_rate": 2.6469333333333334e-05, + "loss": 3.6092, + "step": 20600 + }, + { + "epoch": 1.92, + "learning_rate": 2.6436e-05, + "loss": 3.6236, + "step": 20700 + }, + { + "epoch": 1.93, + "learning_rate": 2.6402666666666667e-05, + "loss": 3.6293, + "step": 20800 + }, + { + "epoch": 1.94, + "learning_rate": 2.6369333333333332e-05, + "loss": 3.6257, + "step": 20900 + }, + { + "epoch": 1.95, + "learning_rate": 2.6336e-05, + "loss": 3.6164, + "step": 21000 + }, + { + "epoch": 1.95, + "eval_loss": 3.619929075241089, + "eval_runtime": 23.9488, + "eval_samples_per_second": 105.308, + "eval_steps_per_second": 6.597, + "step": 21000 + }, + { + "epoch": 1.96, + "learning_rate": 2.6303333333333333e-05, + "loss": 3.6132, + "step": 21100 + }, + { + "epoch": 1.97, + "learning_rate": 2.627e-05, + "loss": 3.6215, + "step": 21200 + }, + { + "epoch": 1.97, + "learning_rate": 2.6236666666666666e-05, + "loss": 3.6209, + "step": 21300 + }, + { + "epoch": 1.98, + "learning_rate": 2.6203333333333334e-05, + "loss": 3.6149, + "step": 21400 + }, + { + "epoch": 1.99, + "learning_rate": 2.6170333333333336e-05, + "loss": 3.6178, + "step": 21500 + }, + { + "epoch": 1.99, + "eval_loss": 3.6201303005218506, + "eval_runtime": 23.776, + "eval_samples_per_second": 106.074, + "eval_steps_per_second": 6.645, + "step": 21500 + }, + { + "epoch": 2.0, + "learning_rate": 2.6137e-05, + "loss": 3.6313, + "step": 21600 + }, + { + "epoch": 2.01, + "learning_rate": 2.610366666666667e-05, + "loss": 3.5876, + "step": 21700 + }, + { + "epoch": 2.02, + "learning_rate": 2.6070333333333333e-05, + "loss": 3.573, + "step": 21800 + }, + { + "epoch": 2.03, + "learning_rate": 2.6037e-05, + "loss": 3.5804, + "step": 21900 + }, + { + "epoch": 2.04, + "learning_rate": 2.600366666666667e-05, + "loss": 3.5892, + "step": 22000 + }, + { + "epoch": 2.04, + "eval_loss": 3.620147466659546, + "eval_runtime": 23.7922, + "eval_samples_per_second": 106.001, + "eval_steps_per_second": 6.641, + "step": 22000 + }, + { + "epoch": 2.05, + "learning_rate": 2.5970333333333334e-05, + "loss": 3.5861, + "step": 22100 + }, + { + "epoch": 2.06, + "learning_rate": 2.5937000000000003e-05, + "loss": 3.5823, + "step": 22200 + }, + { + "epoch": 2.07, + "learning_rate": 2.5903666666666667e-05, + "loss": 3.5576, + "step": 22300 + }, + { + "epoch": 2.08, + "learning_rate": 2.5870333333333335e-05, + "loss": 3.5834, + "step": 22400 + }, + { + "epoch": 2.09, + "learning_rate": 2.5837e-05, + "loss": 3.5855, + "step": 22500 + }, + { + "epoch": 2.09, + "eval_loss": 3.6220951080322266, + "eval_runtime": 23.7972, + "eval_samples_per_second": 105.979, + "eval_steps_per_second": 6.639, + "step": 22500 + }, + { + "epoch": 2.09, + "learning_rate": 2.5803666666666668e-05, + "loss": 3.5831, + "step": 22600 + }, + { + "epoch": 2.1, + "learning_rate": 2.5770333333333333e-05, + "loss": 3.5785, + "step": 22700 + }, + { + "epoch": 2.11, + "learning_rate": 2.5737e-05, + "loss": 3.5825, + "step": 22800 + }, + { + "epoch": 2.12, + "learning_rate": 2.570366666666667e-05, + "loss": 3.5766, + "step": 22900 + }, + { + "epoch": 2.13, + "learning_rate": 2.5670333333333334e-05, + "loss": 3.5658, + "step": 23000 + }, + { + "epoch": 2.13, + "eval_loss": 3.6193323135375977, + "eval_runtime": 23.7837, + "eval_samples_per_second": 106.039, + "eval_steps_per_second": 6.643, + "step": 23000 + }, + { + "epoch": 2.14, + "learning_rate": 2.5637000000000002e-05, + "loss": 3.5795, + "step": 23100 + }, + { + "epoch": 2.15, + "learning_rate": 2.5603666666666667e-05, + "loss": 3.5644, + "step": 23200 + }, + { + "epoch": 2.16, + "learning_rate": 2.5570333333333335e-05, + "loss": 3.5725, + "step": 23300 + }, + { + "epoch": 2.17, + "learning_rate": 2.5537e-05, + "loss": 3.5833, + "step": 23400 + }, + { + "epoch": 2.18, + "learning_rate": 2.5503666666666668e-05, + "loss": 3.5916, + "step": 23500 + }, + { + "epoch": 2.18, + "eval_loss": 3.6143603324890137, + "eval_runtime": 23.7839, + "eval_samples_per_second": 106.038, + "eval_steps_per_second": 6.643, + "step": 23500 + }, + { + "epoch": 2.19, + "learning_rate": 2.5470333333333336e-05, + "loss": 3.5665, + "step": 23600 + }, + { + "epoch": 2.2, + "learning_rate": 2.5437e-05, + "loss": 3.577, + "step": 23700 + }, + { + "epoch": 2.21, + "learning_rate": 2.540366666666667e-05, + "loss": 3.5749, + "step": 23800 + }, + { + "epoch": 2.22, + "learning_rate": 2.5370333333333333e-05, + "loss": 3.5777, + "step": 23900 + }, + { + "epoch": 2.22, + "learning_rate": 2.5337e-05, + "loss": 3.5767, + "step": 24000 + }, + { + "epoch": 2.22, + "eval_loss": 3.6100852489471436, + "eval_runtime": 23.7895, + "eval_samples_per_second": 106.013, + "eval_steps_per_second": 6.642, + "step": 24000 + }, + { + "epoch": 2.23, + "learning_rate": 2.5303666666666666e-05, + "loss": 3.557, + "step": 24100 + }, + { + "epoch": 2.24, + "learning_rate": 2.5270333333333334e-05, + "loss": 3.5769, + "step": 24200 + }, + { + "epoch": 2.25, + "learning_rate": 2.5237000000000002e-05, + "loss": 3.5622, + "step": 24300 + }, + { + "epoch": 2.26, + "learning_rate": 2.5203666666666667e-05, + "loss": 3.571, + "step": 24400 + }, + { + "epoch": 2.27, + "learning_rate": 2.5170333333333335e-05, + "loss": 3.5809, + "step": 24500 + }, + { + "epoch": 2.27, + "eval_loss": 3.611521005630493, + "eval_runtime": 23.7847, + "eval_samples_per_second": 106.035, + "eval_steps_per_second": 6.643, + "step": 24500 + }, + { + "epoch": 2.28, + "learning_rate": 2.5137e-05, + "loss": 3.583, + "step": 24600 + }, + { + "epoch": 2.29, + "learning_rate": 2.5103666666666668e-05, + "loss": 3.5867, + "step": 24700 + }, + { + "epoch": 2.3, + "learning_rate": 2.5070333333333333e-05, + "loss": 3.5639, + "step": 24800 + }, + { + "epoch": 2.31, + "learning_rate": 2.5037e-05, + "loss": 3.5715, + "step": 24900 + }, + { + "epoch": 2.32, + "learning_rate": 2.5003666666666666e-05, + "loss": 3.5561, + "step": 25000 + }, + { + "epoch": 2.32, + "eval_loss": 3.611006021499634, + "eval_runtime": 23.7869, + "eval_samples_per_second": 106.025, + "eval_steps_per_second": 6.642, + "step": 25000 + }, + { + "epoch": 2.33, + "learning_rate": 2.4970333333333334e-05, + "loss": 3.5542, + "step": 25100 + }, + { + "epoch": 2.34, + "learning_rate": 2.4937000000000002e-05, + "loss": 3.589, + "step": 25200 + }, + { + "epoch": 2.35, + "learning_rate": 2.4903666666666667e-05, + "loss": 3.5546, + "step": 25300 + }, + { + "epoch": 2.35, + "learning_rate": 2.4870333333333335e-05, + "loss": 3.547, + "step": 25400 + }, + { + "epoch": 2.36, + "learning_rate": 2.4837e-05, + "loss": 3.5831, + "step": 25500 + }, + { + "epoch": 2.36, + "eval_loss": 3.6079983711242676, + "eval_runtime": 23.8882, + "eval_samples_per_second": 105.575, + "eval_steps_per_second": 6.614, + "step": 25500 + }, + { + "epoch": 2.37, + "learning_rate": 2.4803666666666668e-05, + "loss": 3.5712, + "step": 25600 + }, + { + "epoch": 2.38, + "learning_rate": 2.4770333333333332e-05, + "loss": 3.5836, + "step": 25700 + }, + { + "epoch": 2.39, + "learning_rate": 2.4737e-05, + "loss": 3.5661, + "step": 25800 + }, + { + "epoch": 2.4, + "learning_rate": 2.470366666666667e-05, + "loss": 3.5508, + "step": 25900 + }, + { + "epoch": 2.41, + "learning_rate": 2.4670666666666667e-05, + "loss": 3.5551, + "step": 26000 + }, + { + "epoch": 2.41, + "eval_loss": 3.612078905105591, + "eval_runtime": 23.7677, + "eval_samples_per_second": 106.11, + "eval_steps_per_second": 6.648, + "step": 26000 + }, + { + "epoch": 2.42, + "learning_rate": 2.4637333333333336e-05, + "loss": 3.564, + "step": 26100 + }, + { + "epoch": 2.43, + "learning_rate": 2.4604000000000004e-05, + "loss": 3.5561, + "step": 26200 + }, + { + "epoch": 2.44, + "learning_rate": 2.457066666666667e-05, + "loss": 3.5639, + "step": 26300 + }, + { + "epoch": 2.45, + "learning_rate": 2.4537333333333336e-05, + "loss": 3.5583, + "step": 26400 + }, + { + "epoch": 2.46, + "learning_rate": 2.4504e-05, + "loss": 3.5588, + "step": 26500 + }, + { + "epoch": 2.46, + "eval_loss": 3.6071577072143555, + "eval_runtime": 23.8054, + "eval_samples_per_second": 105.942, + "eval_steps_per_second": 6.637, + "step": 26500 + }, + { + "epoch": 2.47, + "learning_rate": 2.447066666666667e-05, + "loss": 3.5733, + "step": 26600 + }, + { + "epoch": 2.47, + "learning_rate": 2.4437333333333334e-05, + "loss": 3.5882, + "step": 26700 + }, + { + "epoch": 2.48, + "learning_rate": 2.4404000000000002e-05, + "loss": 3.5682, + "step": 26800 + }, + { + "epoch": 2.49, + "learning_rate": 2.437066666666667e-05, + "loss": 3.5627, + "step": 26900 + }, + { + "epoch": 2.5, + "learning_rate": 2.4337333333333335e-05, + "loss": 3.5645, + "step": 27000 + }, + { + "epoch": 2.5, + "eval_loss": 3.6056134700775146, + "eval_runtime": 23.7971, + "eval_samples_per_second": 105.979, + "eval_steps_per_second": 6.639, + "step": 27000 + }, + { + "epoch": 2.51, + "learning_rate": 2.4304000000000003e-05, + "loss": 3.5442, + "step": 27100 + }, + { + "epoch": 2.52, + "learning_rate": 2.4270666666666668e-05, + "loss": 3.5718, + "step": 27200 + }, + { + "epoch": 2.53, + "learning_rate": 2.4237333333333336e-05, + "loss": 3.5869, + "step": 27300 + }, + { + "epoch": 2.54, + "learning_rate": 2.4204e-05, + "loss": 3.5762, + "step": 27400 + }, + { + "epoch": 2.55, + "learning_rate": 2.417066666666667e-05, + "loss": 3.5804, + "step": 27500 + }, + { + "epoch": 2.55, + "eval_loss": 3.6037800312042236, + "eval_runtime": 23.7846, + "eval_samples_per_second": 106.035, + "eval_steps_per_second": 6.643, + "step": 27500 + }, + { + "epoch": 2.56, + "learning_rate": 2.4137333333333334e-05, + "loss": 3.567, + "step": 27600 + }, + { + "epoch": 2.57, + "learning_rate": 2.4104e-05, + "loss": 3.5534, + "step": 27700 + }, + { + "epoch": 2.58, + "learning_rate": 2.407066666666667e-05, + "loss": 3.568, + "step": 27800 + }, + { + "epoch": 2.59, + "learning_rate": 2.4037333333333334e-05, + "loss": 3.5745, + "step": 27900 + }, + { + "epoch": 2.6, + "learning_rate": 2.4004000000000003e-05, + "loss": 3.5712, + "step": 28000 + }, + { + "epoch": 2.6, + "eval_loss": 3.605182647705078, + "eval_runtime": 23.7889, + "eval_samples_per_second": 106.016, + "eval_steps_per_second": 6.642, + "step": 28000 + }, + { + "epoch": 2.6, + "learning_rate": 2.3970666666666667e-05, + "loss": 3.5636, + "step": 28100 + }, + { + "epoch": 2.61, + "learning_rate": 2.3937333333333335e-05, + "loss": 3.5593, + "step": 28200 + }, + { + "epoch": 2.62, + "learning_rate": 2.3904e-05, + "loss": 3.5347, + "step": 28300 + }, + { + "epoch": 2.63, + "learning_rate": 2.3870666666666668e-05, + "loss": 3.5813, + "step": 28400 + }, + { + "epoch": 2.64, + "learning_rate": 2.3837333333333336e-05, + "loss": 3.5494, + "step": 28500 + }, + { + "epoch": 2.64, + "eval_loss": 3.601443290710449, + "eval_runtime": 23.783, + "eval_samples_per_second": 106.042, + "eval_steps_per_second": 6.643, + "step": 28500 + }, + { + "epoch": 2.65, + "learning_rate": 2.3804e-05, + "loss": 3.5589, + "step": 28600 + }, + { + "epoch": 2.66, + "learning_rate": 2.377066666666667e-05, + "loss": 3.583, + "step": 28700 + }, + { + "epoch": 2.67, + "learning_rate": 2.3737666666666668e-05, + "loss": 3.5648, + "step": 28800 + }, + { + "epoch": 2.68, + "learning_rate": 2.3704333333333333e-05, + "loss": 3.549, + "step": 28900 + }, + { + "epoch": 2.69, + "learning_rate": 2.3671e-05, + "loss": 3.582, + "step": 29000 + }, + { + "epoch": 2.69, + "eval_loss": 3.599480152130127, + "eval_runtime": 23.7091, + "eval_samples_per_second": 106.373, + "eval_steps_per_second": 6.664, + "step": 29000 + }, + { + "epoch": 2.7, + "learning_rate": 2.3637666666666666e-05, + "loss": 3.5619, + "step": 29100 + }, + { + "epoch": 2.71, + "learning_rate": 2.3604333333333334e-05, + "loss": 3.5682, + "step": 29200 + }, + { + "epoch": 2.72, + "learning_rate": 2.3571e-05, + "loss": 3.5896, + "step": 29300 + }, + { + "epoch": 2.73, + "learning_rate": 2.3537666666666667e-05, + "loss": 3.552, + "step": 29400 + }, + { + "epoch": 2.73, + "learning_rate": 2.3504333333333335e-05, + "loss": 3.5487, + "step": 29500 + }, + { + "epoch": 2.73, + "eval_loss": 3.6051015853881836, + "eval_runtime": 23.8293, + "eval_samples_per_second": 105.836, + "eval_steps_per_second": 6.63, + "step": 29500 + }, + { + "epoch": 2.74, + "learning_rate": 2.3471e-05, + "loss": 3.5452, + "step": 29600 + }, + { + "epoch": 2.75, + "learning_rate": 2.3437666666666668e-05, + "loss": 3.5542, + "step": 29700 + }, + { + "epoch": 2.76, + "learning_rate": 2.3404333333333332e-05, + "loss": 3.5642, + "step": 29800 + }, + { + "epoch": 2.77, + "learning_rate": 2.3371e-05, + "loss": 3.5566, + "step": 29900 + }, + { + "epoch": 2.78, + "learning_rate": 2.3337666666666665e-05, + "loss": 3.5709, + "step": 30000 + }, + { + "epoch": 2.78, + "eval_loss": 3.5953872203826904, + "eval_runtime": 23.8304, + "eval_samples_per_second": 105.831, + "eval_steps_per_second": 6.63, + "step": 30000 + }, + { + "epoch": 2.79, + "learning_rate": 2.3304333333333333e-05, + "loss": 3.5628, + "step": 30100 + }, + { + "epoch": 2.8, + "learning_rate": 2.3270999999999998e-05, + "loss": 3.5695, + "step": 30200 + }, + { + "epoch": 2.81, + "learning_rate": 2.3237666666666666e-05, + "loss": 3.5499, + "step": 30300 + }, + { + "epoch": 2.82, + "learning_rate": 2.3204333333333334e-05, + "loss": 3.5622, + "step": 30400 + }, + { + "epoch": 2.83, + "learning_rate": 2.3171e-05, + "loss": 3.5546, + "step": 30500 + }, + { + "epoch": 2.83, + "eval_loss": 3.5941429138183594, + "eval_runtime": 23.8009, + "eval_samples_per_second": 105.963, + "eval_steps_per_second": 6.638, + "step": 30500 + }, + { + "epoch": 2.84, + "learning_rate": 2.3137666666666667e-05, + "loss": 3.5545, + "step": 30600 + }, + { + "epoch": 2.85, + "learning_rate": 2.3104333333333332e-05, + "loss": 3.5677, + "step": 30700 + }, + { + "epoch": 2.85, + "learning_rate": 2.3071e-05, + "loss": 3.5301, + "step": 30800 + }, + { + "epoch": 2.86, + "learning_rate": 2.3037666666666665e-05, + "loss": 3.5521, + "step": 30900 + }, + { + "epoch": 2.87, + "learning_rate": 2.3004333333333333e-05, + "loss": 3.5525, + "step": 31000 + }, + { + "epoch": 2.87, + "eval_loss": 3.5951895713806152, + "eval_runtime": 23.8131, + "eval_samples_per_second": 105.908, + "eval_steps_per_second": 6.635, + "step": 31000 + }, + { + "epoch": 2.88, + "learning_rate": 2.2971e-05, + "loss": 3.5602, + "step": 31100 + }, + { + "epoch": 2.89, + "learning_rate": 2.2937666666666666e-05, + "loss": 3.5652, + "step": 31200 + }, + { + "epoch": 2.9, + "learning_rate": 2.2904333333333334e-05, + "loss": 3.5511, + "step": 31300 + }, + { + "epoch": 2.91, + "learning_rate": 2.2871333333333336e-05, + "loss": 3.5592, + "step": 31400 + }, + { + "epoch": 2.92, + "learning_rate": 2.2838e-05, + "loss": 3.5603, + "step": 31500 + }, + { + "epoch": 2.92, + "eval_loss": 3.597166061401367, + "eval_runtime": 23.7699, + "eval_samples_per_second": 106.1, + "eval_steps_per_second": 6.647, + "step": 31500 + }, + { + "epoch": 2.93, + "learning_rate": 2.280466666666667e-05, + "loss": 3.5604, + "step": 31600 + }, + { + "epoch": 2.94, + "learning_rate": 2.2771333333333333e-05, + "loss": 3.5652, + "step": 31700 + }, + { + "epoch": 2.95, + "learning_rate": 2.2738e-05, + "loss": 3.5589, + "step": 31800 + }, + { + "epoch": 2.96, + "learning_rate": 2.2704666666666666e-05, + "loss": 3.5469, + "step": 31900 + }, + { + "epoch": 2.97, + "learning_rate": 2.2671666666666665e-05, + "loss": 3.5572, + "step": 32000 + }, + { + "epoch": 2.97, + "eval_loss": 3.5946662425994873, + "eval_runtime": 23.7852, + "eval_samples_per_second": 106.032, + "eval_steps_per_second": 6.643, + "step": 32000 + }, + { + "epoch": 2.98, + "learning_rate": 2.2638333333333333e-05, + "loss": 3.5633, + "step": 32100 + }, + { + "epoch": 2.98, + "learning_rate": 2.2604999999999998e-05, + "loss": 3.5561, + "step": 32200 + }, + { + "epoch": 2.99, + "learning_rate": 2.2571666666666666e-05, + "loss": 3.5372, + "step": 32300 + }, + { + "epoch": 3.0, + "learning_rate": 2.2538333333333334e-05, + "loss": 3.565, + "step": 32400 + }, + { + "epoch": 3.01, + "learning_rate": 2.2505e-05, + "loss": 3.5106, + "step": 32500 + }, + { + "epoch": 3.01, + "eval_loss": 3.5952203273773193, + "eval_runtime": 23.7897, + "eval_samples_per_second": 106.012, + "eval_steps_per_second": 6.642, + "step": 32500 + }, + { + "epoch": 3.02, + "learning_rate": 2.2471666666666667e-05, + "loss": 3.5149, + "step": 32600 + }, + { + "epoch": 3.03, + "learning_rate": 2.243866666666667e-05, + "loss": 3.5158, + "step": 32700 + }, + { + "epoch": 3.04, + "learning_rate": 2.2405333333333334e-05, + "loss": 3.5136, + "step": 32800 + }, + { + "epoch": 3.05, + "learning_rate": 2.2372000000000002e-05, + "loss": 3.5401, + "step": 32900 + }, + { + "epoch": 3.06, + "learning_rate": 2.2338666666666667e-05, + "loss": 3.5142, + "step": 33000 + }, + { + "epoch": 3.06, + "eval_loss": 3.5936639308929443, + "eval_runtime": 23.7767, + "eval_samples_per_second": 106.07, + "eval_steps_per_second": 6.645, + "step": 33000 + }, + { + "epoch": 3.07, + "learning_rate": 2.2305333333333335e-05, + "loss": 3.5154, + "step": 33100 + }, + { + "epoch": 3.08, + "learning_rate": 2.2272e-05, + "loss": 3.5303, + "step": 33200 + }, + { + "epoch": 3.09, + "learning_rate": 2.2238666666666668e-05, + "loss": 3.5124, + "step": 33300 + }, + { + "epoch": 3.1, + "learning_rate": 2.2205333333333336e-05, + "loss": 3.5242, + "step": 33400 + }, + { + "epoch": 3.11, + "learning_rate": 2.2172e-05, + "loss": 3.506, + "step": 33500 + }, + { + "epoch": 3.11, + "eval_loss": 3.5964934825897217, + "eval_runtime": 23.7863, + "eval_samples_per_second": 106.028, + "eval_steps_per_second": 6.642, + "step": 33500 + }, + { + "epoch": 3.11, + "learning_rate": 2.213866666666667e-05, + "loss": 3.5077, + "step": 33600 + }, + { + "epoch": 3.12, + "learning_rate": 2.2105333333333333e-05, + "loss": 3.501, + "step": 33700 + }, + { + "epoch": 3.13, + "learning_rate": 2.2072e-05, + "loss": 3.512, + "step": 33800 + }, + { + "epoch": 3.14, + "learning_rate": 2.2038666666666666e-05, + "loss": 3.5373, + "step": 33900 + }, + { + "epoch": 3.15, + "learning_rate": 2.2005333333333334e-05, + "loss": 3.515, + "step": 34000 + }, + { + "epoch": 3.15, + "eval_loss": 3.5931718349456787, + "eval_runtime": 23.7854, + "eval_samples_per_second": 106.031, + "eval_steps_per_second": 6.643, + "step": 34000 + }, + { + "epoch": 3.16, + "learning_rate": 2.1972000000000002e-05, + "loss": 3.5225, + "step": 34100 + }, + { + "epoch": 3.17, + "learning_rate": 2.1938666666666667e-05, + "loss": 3.4866, + "step": 34200 + }, + { + "epoch": 3.18, + "learning_rate": 2.1905333333333335e-05, + "loss": 3.526, + "step": 34300 + }, + { + "epoch": 3.19, + "learning_rate": 2.1872e-05, + "loss": 3.5036, + "step": 34400 + }, + { + "epoch": 3.2, + "learning_rate": 2.1838666666666668e-05, + "loss": 3.5247, + "step": 34500 + }, + { + "epoch": 3.2, + "eval_loss": 3.5951099395751953, + "eval_runtime": 23.7873, + "eval_samples_per_second": 106.023, + "eval_steps_per_second": 6.642, + "step": 34500 + }, + { + "epoch": 3.21, + "learning_rate": 2.1805333333333333e-05, + "loss": 3.528, + "step": 34600 + }, + { + "epoch": 3.22, + "learning_rate": 2.1772e-05, + "loss": 3.5123, + "step": 34700 + }, + { + "epoch": 3.23, + "learning_rate": 2.1738666666666666e-05, + "loss": 3.5277, + "step": 34800 + }, + { + "epoch": 3.24, + "learning_rate": 2.1705333333333334e-05, + "loss": 3.522, + "step": 34900 + }, + { + "epoch": 3.24, + "learning_rate": 2.1672000000000002e-05, + "loss": 3.5384, + "step": 35000 + }, + { + "epoch": 3.24, + "eval_loss": 3.591693878173828, + "eval_runtime": 23.7823, + "eval_samples_per_second": 106.045, + "eval_steps_per_second": 6.644, + "step": 35000 + }, + { + "epoch": 3.25, + "learning_rate": 2.1638666666666667e-05, + "loss": 3.526, + "step": 35100 + }, + { + "epoch": 3.26, + "learning_rate": 2.1605333333333335e-05, + "loss": 3.5137, + "step": 35200 + }, + { + "epoch": 3.27, + "learning_rate": 2.1572e-05, + "loss": 3.525, + "step": 35300 + }, + { + "epoch": 3.28, + "learning_rate": 2.1538666666666668e-05, + "loss": 3.519, + "step": 35400 + }, + { + "epoch": 3.29, + "learning_rate": 2.1505666666666666e-05, + "loss": 3.5165, + "step": 35500 + }, + { + "epoch": 3.29, + "eval_loss": 3.5887227058410645, + "eval_runtime": 23.8133, + "eval_samples_per_second": 105.907, + "eval_steps_per_second": 6.635, + "step": 35500 + }, + { + "epoch": 3.3, + "learning_rate": 2.147233333333333e-05, + "loss": 3.5157, + "step": 35600 + }, + { + "epoch": 3.31, + "learning_rate": 2.1439e-05, + "loss": 3.5363, + "step": 35700 + }, + { + "epoch": 3.32, + "learning_rate": 2.1405666666666664e-05, + "loss": 3.5146, + "step": 35800 + }, + { + "epoch": 3.33, + "learning_rate": 2.1372333333333332e-05, + "loss": 3.5313, + "step": 35900 + }, + { + "epoch": 3.34, + "learning_rate": 2.1339e-05, + "loss": 3.5187, + "step": 36000 + }, + { + "epoch": 3.34, + "eval_loss": 3.5865957736968994, + "eval_runtime": 23.7698, + "eval_samples_per_second": 106.101, + "eval_steps_per_second": 6.647, + "step": 36000 + }, + { + "epoch": 3.35, + "learning_rate": 2.1305666666666668e-05, + "loss": 3.5181, + "step": 36100 + }, + { + "epoch": 3.36, + "learning_rate": 2.1272333333333336e-05, + "loss": 3.5087, + "step": 36200 + }, + { + "epoch": 3.36, + "learning_rate": 2.1239e-05, + "loss": 3.5211, + "step": 36300 + }, + { + "epoch": 3.37, + "learning_rate": 2.120566666666667e-05, + "loss": 3.5419, + "step": 36400 + }, + { + "epoch": 3.38, + "learning_rate": 2.1172333333333334e-05, + "loss": 3.5097, + "step": 36500 + }, + { + "epoch": 3.38, + "eval_loss": 3.5894858837127686, + "eval_runtime": 23.8034, + "eval_samples_per_second": 105.951, + "eval_steps_per_second": 6.638, + "step": 36500 + }, + { + "epoch": 3.39, + "learning_rate": 2.1139000000000002e-05, + "loss": 3.525, + "step": 36600 + }, + { + "epoch": 3.4, + "learning_rate": 2.110566666666667e-05, + "loss": 3.5083, + "step": 36700 + }, + { + "epoch": 3.41, + "learning_rate": 2.1072333333333335e-05, + "loss": 3.4998, + "step": 36800 + }, + { + "epoch": 3.42, + "learning_rate": 2.1039000000000003e-05, + "loss": 3.5201, + "step": 36900 + }, + { + "epoch": 3.43, + "learning_rate": 2.1005666666666668e-05, + "loss": 3.5136, + "step": 37000 + }, + { + "epoch": 3.43, + "eval_loss": 3.587806224822998, + "eval_runtime": 23.7779, + "eval_samples_per_second": 106.065, + "eval_steps_per_second": 6.645, + "step": 37000 + }, + { + "epoch": 3.44, + "learning_rate": 2.0972333333333336e-05, + "loss": 3.5331, + "step": 37100 + }, + { + "epoch": 3.45, + "learning_rate": 2.0939e-05, + "loss": 3.5299, + "step": 37200 + }, + { + "epoch": 3.46, + "learning_rate": 2.090566666666667e-05, + "loss": 3.5274, + "step": 37300 + }, + { + "epoch": 3.47, + "learning_rate": 2.0872333333333337e-05, + "loss": 3.5212, + "step": 37400 + }, + { + "epoch": 3.48, + "learning_rate": 2.0839e-05, + "loss": 3.5095, + "step": 37500 + }, + { + "epoch": 3.48, + "eval_loss": 3.5838775634765625, + "eval_runtime": 23.7902, + "eval_samples_per_second": 106.01, + "eval_steps_per_second": 6.641, + "step": 37500 + }, + { + "epoch": 3.49, + "learning_rate": 2.080566666666667e-05, + "loss": 3.5125, + "step": 37600 + }, + { + "epoch": 3.49, + "learning_rate": 2.0772333333333334e-05, + "loss": 3.4908, + "step": 37700 + }, + { + "epoch": 3.5, + "learning_rate": 2.0739000000000003e-05, + "loss": 3.5018, + "step": 37800 + }, + { + "epoch": 3.51, + "learning_rate": 2.0705666666666667e-05, + "loss": 3.5034, + "step": 37900 + }, + { + "epoch": 3.52, + "learning_rate": 2.0672333333333335e-05, + "loss": 3.5226, + "step": 38000 + }, + { + "epoch": 3.52, + "eval_loss": 3.5859289169311523, + "eval_runtime": 23.7148, + "eval_samples_per_second": 106.347, + "eval_steps_per_second": 6.663, + "step": 38000 + }, + { + "epoch": 3.53, + "learning_rate": 2.0639e-05, + "loss": 3.5247, + "step": 38100 + }, + { + "epoch": 3.54, + "learning_rate": 2.0605666666666668e-05, + "loss": 3.5342, + "step": 38200 + }, + { + "epoch": 3.55, + "learning_rate": 2.0572333333333336e-05, + "loss": 3.5144, + "step": 38300 + }, + { + "epoch": 3.56, + "learning_rate": 2.0539e-05, + "loss": 3.5106, + "step": 38400 + }, + { + "epoch": 3.57, + "learning_rate": 2.050566666666667e-05, + "loss": 3.5277, + "step": 38500 + }, + { + "epoch": 3.57, + "eval_loss": 3.5827441215515137, + "eval_runtime": 23.8131, + "eval_samples_per_second": 105.908, + "eval_steps_per_second": 6.635, + "step": 38500 + }, + { + "epoch": 3.58, + "learning_rate": 2.0472333333333334e-05, + "loss": 3.4914, + "step": 38600 + }, + { + "epoch": 3.59, + "learning_rate": 2.0439000000000002e-05, + "loss": 3.5394, + "step": 38700 + }, + { + "epoch": 3.6, + "learning_rate": 2.0405666666666667e-05, + "loss": 3.5314, + "step": 38800 + }, + { + "epoch": 3.61, + "learning_rate": 2.0372333333333335e-05, + "loss": 3.5096, + "step": 38900 + }, + { + "epoch": 3.62, + "learning_rate": 2.0339000000000003e-05, + "loss": 3.4959, + "step": 39000 + }, + { + "epoch": 3.62, + "eval_loss": 3.5846378803253174, + "eval_runtime": 23.7913, + "eval_samples_per_second": 106.005, + "eval_steps_per_second": 6.641, + "step": 39000 + }, + { + "epoch": 3.62, + "learning_rate": 2.0305666666666668e-05, + "loss": 3.51, + "step": 39100 + }, + { + "epoch": 3.63, + "learning_rate": 2.0272333333333336e-05, + "loss": 3.507, + "step": 39200 + }, + { + "epoch": 3.64, + "learning_rate": 2.0239e-05, + "loss": 3.5048, + "step": 39300 + }, + { + "epoch": 3.65, + "learning_rate": 2.020566666666667e-05, + "loss": 3.525, + "step": 39400 + }, + { + "epoch": 3.66, + "learning_rate": 2.0172333333333333e-05, + "loss": 3.5003, + "step": 39500 + }, + { + "epoch": 3.66, + "eval_loss": 3.5823278427124023, + "eval_runtime": 23.8021, + "eval_samples_per_second": 105.957, + "eval_steps_per_second": 6.638, + "step": 39500 + }, + { + "epoch": 3.67, + "learning_rate": 2.0139e-05, + "loss": 3.4986, + "step": 39600 + }, + { + "epoch": 3.68, + "learning_rate": 2.0105666666666666e-05, + "loss": 3.516, + "step": 39700 + }, + { + "epoch": 3.69, + "learning_rate": 2.0072333333333334e-05, + "loss": 3.5114, + "step": 39800 + }, + { + "epoch": 3.7, + "learning_rate": 2.0039000000000002e-05, + "loss": 3.5071, + "step": 39900 + }, + { + "epoch": 3.71, + "learning_rate": 2.0005666666666667e-05, + "loss": 3.5095, + "step": 40000 + }, + { + "epoch": 3.71, + "eval_loss": 3.581997871398926, + "eval_runtime": 23.772, + "eval_samples_per_second": 106.091, + "eval_steps_per_second": 6.646, + "step": 40000 + }, + { + "epoch": 3.72, + "learning_rate": 1.9972333333333335e-05, + "loss": 3.5524, + "step": 40100 + }, + { + "epoch": 3.73, + "learning_rate": 1.9939e-05, + "loss": 3.5158, + "step": 40200 + }, + { + "epoch": 3.74, + "learning_rate": 1.9905666666666668e-05, + "loss": 3.4829, + "step": 40300 + }, + { + "epoch": 3.74, + "learning_rate": 1.9872333333333333e-05, + "loss": 3.498, + "step": 40400 + }, + { + "epoch": 3.75, + "learning_rate": 1.9839e-05, + "loss": 3.4814, + "step": 40500 + }, + { + "epoch": 3.75, + "eval_loss": 3.5854456424713135, + "eval_runtime": 23.7663, + "eval_samples_per_second": 106.117, + "eval_steps_per_second": 6.648, + "step": 40500 + }, + { + "epoch": 3.76, + "learning_rate": 1.980566666666667e-05, + "loss": 3.5146, + "step": 40600 + }, + { + "epoch": 3.77, + "learning_rate": 1.9772333333333334e-05, + "loss": 3.5109, + "step": 40700 + }, + { + "epoch": 3.78, + "learning_rate": 1.9739000000000002e-05, + "loss": 3.5271, + "step": 40800 + }, + { + "epoch": 3.79, + "learning_rate": 1.9705666666666667e-05, + "loss": 3.5158, + "step": 40900 + }, + { + "epoch": 3.8, + "learning_rate": 1.9672333333333335e-05, + "loss": 3.5173, + "step": 41000 + }, + { + "epoch": 3.8, + "eval_loss": 3.5796046257019043, + "eval_runtime": 23.7829, + "eval_samples_per_second": 106.043, + "eval_steps_per_second": 6.643, + "step": 41000 + }, + { + "epoch": 3.81, + "learning_rate": 1.9639e-05, + "loss": 3.5034, + "step": 41100 + }, + { + "epoch": 3.82, + "learning_rate": 1.9605666666666668e-05, + "loss": 3.5118, + "step": 41200 + }, + { + "epoch": 3.83, + "learning_rate": 1.9572333333333336e-05, + "loss": 3.5265, + "step": 41300 + }, + { + "epoch": 3.84, + "learning_rate": 1.9539e-05, + "loss": 3.513, + "step": 41400 + }, + { + "epoch": 3.85, + "learning_rate": 1.950566666666667e-05, + "loss": 3.4968, + "step": 41500 + }, + { + "epoch": 3.85, + "eval_loss": 3.5810136795043945, + "eval_runtime": 23.9055, + "eval_samples_per_second": 105.499, + "eval_steps_per_second": 6.609, + "step": 41500 + }, + { + "epoch": 3.86, + "learning_rate": 1.9472333333333333e-05, + "loss": 3.5123, + "step": 41600 + }, + { + "epoch": 3.87, + "learning_rate": 1.9439e-05, + "loss": 3.495, + "step": 41700 + }, + { + "epoch": 3.87, + "learning_rate": 1.9405666666666666e-05, + "loss": 3.5034, + "step": 41800 + }, + { + "epoch": 3.88, + "learning_rate": 1.9372333333333334e-05, + "loss": 3.5135, + "step": 41900 + }, + { + "epoch": 3.89, + "learning_rate": 1.9339e-05, + "loss": 3.5183, + "step": 42000 + }, + { + "epoch": 3.89, + "eval_loss": 3.578335762023926, + "eval_runtime": 23.7695, + "eval_samples_per_second": 106.102, + "eval_steps_per_second": 6.647, + "step": 42000 + }, + { + "epoch": 3.9, + "learning_rate": 1.9305666666666667e-05, + "loss": 3.4925, + "step": 42100 + }, + { + "epoch": 3.91, + "learning_rate": 1.9272333333333335e-05, + "loss": 3.5166, + "step": 42200 + }, + { + "epoch": 3.92, + "learning_rate": 1.9239e-05, + "loss": 3.4917, + "step": 42300 + }, + { + "epoch": 3.93, + "learning_rate": 1.9205666666666668e-05, + "loss": 3.4986, + "step": 42400 + }, + { + "epoch": 3.94, + "learning_rate": 1.9172333333333333e-05, + "loss": 3.512, + "step": 42500 + }, + { + "epoch": 3.94, + "eval_loss": 3.5784130096435547, + "eval_runtime": 23.7738, + "eval_samples_per_second": 106.083, + "eval_steps_per_second": 6.646, + "step": 42500 + }, + { + "epoch": 3.95, + "learning_rate": 1.9139e-05, + "loss": 3.5175, + "step": 42600 + }, + { + "epoch": 3.96, + "learning_rate": 1.9105666666666666e-05, + "loss": 3.5299, + "step": 42700 + }, + { + "epoch": 3.97, + "learning_rate": 1.9072333333333334e-05, + "loss": 3.4949, + "step": 42800 + }, + { + "epoch": 3.98, + "learning_rate": 1.9039000000000002e-05, + "loss": 3.5017, + "step": 42900 + }, + { + "epoch": 3.99, + "learning_rate": 1.9005666666666667e-05, + "loss": 3.5069, + "step": 43000 + }, + { + "epoch": 3.99, + "eval_loss": 3.577517032623291, + "eval_runtime": 23.8132, + "eval_samples_per_second": 105.908, + "eval_steps_per_second": 6.635, + "step": 43000 + }, + { + "epoch": 4.0, + "learning_rate": 1.8972333333333335e-05, + "loss": 3.5017, + "step": 43100 + }, + { + "epoch": 4.0, + "learning_rate": 1.8939e-05, + "loss": 3.517, + "step": 43200 + }, + { + "epoch": 4.01, + "learning_rate": 1.8905666666666668e-05, + "loss": 3.4586, + "step": 43300 + }, + { + "epoch": 4.02, + "learning_rate": 1.8872333333333332e-05, + "loss": 3.4568, + "step": 43400 + }, + { + "epoch": 4.03, + "learning_rate": 1.8839e-05, + "loss": 3.5014, + "step": 43500 + }, + { + "epoch": 4.03, + "eval_loss": 3.581881046295166, + "eval_runtime": 23.7711, + "eval_samples_per_second": 106.095, + "eval_steps_per_second": 6.647, + "step": 43500 + }, + { + "epoch": 4.04, + "learning_rate": 1.880566666666667e-05, + "loss": 3.4791, + "step": 43600 + }, + { + "epoch": 4.05, + "learning_rate": 1.8772333333333333e-05, + "loss": 3.4744, + "step": 43700 + }, + { + "epoch": 4.06, + "learning_rate": 1.8739e-05, + "loss": 3.4972, + "step": 43800 + }, + { + "epoch": 4.07, + "learning_rate": 1.8705666666666666e-05, + "loss": 3.4811, + "step": 43900 + }, + { + "epoch": 4.08, + "learning_rate": 1.8672333333333334e-05, + "loss": 3.4787, + "step": 44000 + }, + { + "epoch": 4.08, + "eval_loss": 3.5835728645324707, + "eval_runtime": 23.7828, + "eval_samples_per_second": 106.043, + "eval_steps_per_second": 6.643, + "step": 44000 + }, + { + "epoch": 4.09, + "learning_rate": 1.8639e-05, + "loss": 3.4686, + "step": 44100 + }, + { + "epoch": 4.1, + "learning_rate": 1.8605666666666667e-05, + "loss": 3.4751, + "step": 44200 + }, + { + "epoch": 4.11, + "learning_rate": 1.8572333333333332e-05, + "loss": 3.4829, + "step": 44300 + }, + { + "epoch": 4.12, + "learning_rate": 1.8539e-05, + "loss": 3.4669, + "step": 44400 + }, + { + "epoch": 4.12, + "learning_rate": 1.8505666666666668e-05, + "loss": 3.4625, + "step": 44500 + }, + { + "epoch": 4.12, + "eval_loss": 3.5787713527679443, + "eval_runtime": 23.7907, + "eval_samples_per_second": 106.008, + "eval_steps_per_second": 6.641, + "step": 44500 + }, + { + "epoch": 4.13, + "learning_rate": 1.8472333333333333e-05, + "loss": 3.4731, + "step": 44600 + }, + { + "epoch": 4.14, + "learning_rate": 1.8439e-05, + "loss": 3.4799, + "step": 44700 + }, + { + "epoch": 4.15, + "learning_rate": 1.8405666666666666e-05, + "loss": 3.5001, + "step": 44800 + }, + { + "epoch": 4.16, + "learning_rate": 1.8372333333333334e-05, + "loss": 3.4744, + "step": 44900 + }, + { + "epoch": 4.17, + "learning_rate": 1.8339e-05, + "loss": 3.4902, + "step": 45000 + }, + { + "epoch": 4.17, + "eval_loss": 3.5783867835998535, + "eval_runtime": 23.7793, + "eval_samples_per_second": 106.059, + "eval_steps_per_second": 6.644, + "step": 45000 + }, + { + "epoch": 4.18, + "learning_rate": 1.8305666666666667e-05, + "loss": 3.4946, + "step": 45100 + }, + { + "epoch": 4.19, + "learning_rate": 1.8272333333333335e-05, + "loss": 3.4695, + "step": 45200 + }, + { + "epoch": 4.2, + "learning_rate": 1.8239e-05, + "loss": 3.4579, + "step": 45300 + }, + { + "epoch": 4.21, + "learning_rate": 1.8205666666666667e-05, + "loss": 3.4617, + "step": 45400 + }, + { + "epoch": 4.22, + "learning_rate": 1.8172333333333332e-05, + "loss": 3.4927, + "step": 45500 + }, + { + "epoch": 4.22, + "eval_loss": 3.577256202697754, + "eval_runtime": 23.8012, + "eval_samples_per_second": 105.961, + "eval_steps_per_second": 6.638, + "step": 45500 + }, + { + "epoch": 4.23, + "learning_rate": 1.8139e-05, + "loss": 3.4659, + "step": 45600 + }, + { + "epoch": 4.24, + "learning_rate": 1.8105666666666665e-05, + "loss": 3.4958, + "step": 45700 + }, + { + "epoch": 4.25, + "learning_rate": 1.8072333333333333e-05, + "loss": 3.4759, + "step": 45800 + }, + { + "epoch": 4.25, + "learning_rate": 1.8038999999999998e-05, + "loss": 3.4523, + "step": 45900 + }, + { + "epoch": 4.26, + "learning_rate": 1.8005666666666666e-05, + "loss": 3.4813, + "step": 46000 + }, + { + "epoch": 4.26, + "eval_loss": 3.576850652694702, + "eval_runtime": 23.7844, + "eval_samples_per_second": 106.036, + "eval_steps_per_second": 6.643, + "step": 46000 + }, + { + "epoch": 4.27, + "learning_rate": 1.7972333333333334e-05, + "loss": 3.4841, + "step": 46100 + }, + { + "epoch": 4.28, + "learning_rate": 1.7939e-05, + "loss": 3.472, + "step": 46200 + }, + { + "epoch": 4.29, + "learning_rate": 1.7905666666666667e-05, + "loss": 3.4769, + "step": 46300 + }, + { + "epoch": 4.3, + "learning_rate": 1.787233333333333e-05, + "loss": 3.4783, + "step": 46400 + }, + { + "epoch": 4.31, + "learning_rate": 1.7839e-05, + "loss": 3.4637, + "step": 46500 + }, + { + "epoch": 4.31, + "eval_loss": 3.576143264770508, + "eval_runtime": 23.8068, + "eval_samples_per_second": 105.936, + "eval_steps_per_second": 6.637, + "step": 46500 + }, + { + "epoch": 4.32, + "learning_rate": 1.7805666666666665e-05, + "loss": 3.4812, + "step": 46600 + }, + { + "epoch": 4.33, + "learning_rate": 1.7772333333333333e-05, + "loss": 3.476, + "step": 46700 + }, + { + "epoch": 4.34, + "learning_rate": 1.7739e-05, + "loss": 3.4689, + "step": 46800 + }, + { + "epoch": 4.35, + "learning_rate": 1.7705666666666665e-05, + "loss": 3.4719, + "step": 46900 + }, + { + "epoch": 4.36, + "learning_rate": 1.7672333333333334e-05, + "loss": 3.4731, + "step": 47000 + }, + { + "epoch": 4.36, + "eval_loss": 3.577054262161255, + "eval_runtime": 23.719, + "eval_samples_per_second": 106.328, + "eval_steps_per_second": 6.661, + "step": 47000 + }, + { + "epoch": 4.37, + "learning_rate": 1.7639e-05, + "loss": 3.4923, + "step": 47100 + }, + { + "epoch": 4.38, + "learning_rate": 1.7605666666666666e-05, + "loss": 3.4679, + "step": 47200 + }, + { + "epoch": 4.38, + "learning_rate": 1.757233333333333e-05, + "loss": 3.4742, + "step": 47300 + }, + { + "epoch": 4.39, + "learning_rate": 1.7539e-05, + "loss": 3.4628, + "step": 47400 + }, + { + "epoch": 4.4, + "learning_rate": 1.7505666666666667e-05, + "loss": 3.4856, + "step": 47500 + }, + { + "epoch": 4.4, + "eval_loss": 3.578598976135254, + "eval_runtime": 23.7952, + "eval_samples_per_second": 105.988, + "eval_steps_per_second": 6.64, + "step": 47500 + }, + { + "epoch": 4.41, + "learning_rate": 1.7472333333333332e-05, + "loss": 3.488, + "step": 47600 + }, + { + "epoch": 4.42, + "learning_rate": 1.7439e-05, + "loss": 3.4711, + "step": 47700 + }, + { + "epoch": 4.43, + "learning_rate": 1.7405666666666665e-05, + "loss": 3.4814, + "step": 47800 + }, + { + "epoch": 4.44, + "learning_rate": 1.7372333333333333e-05, + "loss": 3.4768, + "step": 47900 + }, + { + "epoch": 4.45, + "learning_rate": 1.7338999999999998e-05, + "loss": 3.4579, + "step": 48000 + }, + { + "epoch": 4.45, + "eval_loss": 3.5790109634399414, + "eval_runtime": 23.779, + "eval_samples_per_second": 106.06, + "eval_steps_per_second": 6.645, + "step": 48000 + }, + { + "epoch": 4.46, + "learning_rate": 1.7305666666666666e-05, + "loss": 3.4752, + "step": 48100 + }, + { + "epoch": 4.47, + "learning_rate": 1.727233333333333e-05, + "loss": 3.4836, + "step": 48200 + }, + { + "epoch": 4.48, + "learning_rate": 1.7239333333333333e-05, + "loss": 3.4835, + "step": 48300 + }, + { + "epoch": 4.49, + "learning_rate": 1.7206e-05, + "loss": 3.4798, + "step": 48400 + }, + { + "epoch": 4.5, + "learning_rate": 1.717266666666667e-05, + "loss": 3.5032, + "step": 48500 + }, + { + "epoch": 4.5, + "eval_loss": 3.573821544647217, + "eval_runtime": 23.7995, + "eval_samples_per_second": 105.969, + "eval_steps_per_second": 6.639, + "step": 48500 + }, + { + "epoch": 4.5, + "learning_rate": 1.7139333333333334e-05, + "loss": 3.4885, + "step": 48600 + }, + { + "epoch": 4.51, + "learning_rate": 1.7106000000000002e-05, + "loss": 3.4805, + "step": 48700 + }, + { + "epoch": 4.52, + "learning_rate": 1.7072666666666667e-05, + "loss": 3.4588, + "step": 48800 + }, + { + "epoch": 4.53, + "learning_rate": 1.7039333333333335e-05, + "loss": 3.4742, + "step": 48900 + }, + { + "epoch": 4.54, + "learning_rate": 1.7006e-05, + "loss": 3.4826, + "step": 49000 + }, + { + "epoch": 4.54, + "eval_loss": 3.574939012527466, + "eval_runtime": 23.7807, + "eval_samples_per_second": 106.052, + "eval_steps_per_second": 6.644, + "step": 49000 + }, + { + "epoch": 4.55, + "learning_rate": 1.6972666666666668e-05, + "loss": 3.4866, + "step": 49100 + }, + { + "epoch": 4.56, + "learning_rate": 1.6939333333333332e-05, + "loss": 3.4699, + "step": 49200 + }, + { + "epoch": 4.57, + "learning_rate": 1.6906e-05, + "loss": 3.4643, + "step": 49300 + }, + { + "epoch": 4.58, + "learning_rate": 1.687266666666667e-05, + "loss": 3.4771, + "step": 49400 + }, + { + "epoch": 4.59, + "learning_rate": 1.6839333333333333e-05, + "loss": 3.4709, + "step": 49500 + }, + { + "epoch": 4.59, + "eval_loss": 3.574620246887207, + "eval_runtime": 23.7928, + "eval_samples_per_second": 105.999, + "eval_steps_per_second": 6.641, + "step": 49500 + }, + { + "epoch": 4.6, + "learning_rate": 1.6806e-05, + "loss": 3.4724, + "step": 49600 + }, + { + "epoch": 4.61, + "learning_rate": 1.6772666666666666e-05, + "loss": 3.4675, + "step": 49700 + }, + { + "epoch": 4.62, + "learning_rate": 1.6739333333333334e-05, + "loss": 3.4958, + "step": 49800 + }, + { + "epoch": 4.63, + "learning_rate": 1.6706e-05, + "loss": 3.4738, + "step": 49900 + }, + { + "epoch": 4.63, + "learning_rate": 1.6672666666666667e-05, + "loss": 3.4916, + "step": 50000 + }, + { + "epoch": 4.63, + "eval_loss": 3.574481964111328, + "eval_runtime": 23.7895, + "eval_samples_per_second": 106.013, + "eval_steps_per_second": 6.642, + "step": 50000 + }, + { + "epoch": 4.64, + "learning_rate": 1.6639333333333335e-05, + "loss": 3.4786, + "step": 50100 + }, + { + "epoch": 4.65, + "learning_rate": 1.6606e-05, + "loss": 3.4727, + "step": 50200 + }, + { + "epoch": 4.66, + "learning_rate": 1.6572666666666668e-05, + "loss": 3.4872, + "step": 50300 + }, + { + "epoch": 4.67, + "learning_rate": 1.6539333333333333e-05, + "loss": 3.4945, + "step": 50400 + }, + { + "epoch": 4.68, + "learning_rate": 1.6506e-05, + "loss": 3.4715, + "step": 50500 + }, + { + "epoch": 4.68, + "eval_loss": 3.5705904960632324, + "eval_runtime": 24.2293, + "eval_samples_per_second": 104.089, + "eval_steps_per_second": 6.521, + "step": 50500 + }, + { + "epoch": 4.69, + "learning_rate": 1.6472666666666666e-05, + "loss": 3.4569, + "step": 50600 + }, + { + "epoch": 4.7, + "learning_rate": 1.6439333333333334e-05, + "loss": 3.4647, + "step": 50700 + }, + { + "epoch": 4.71, + "learning_rate": 1.6406333333333336e-05, + "loss": 3.4686, + "step": 50800 + }, + { + "epoch": 4.72, + "learning_rate": 1.6373e-05, + "loss": 3.4876, + "step": 50900 + }, + { + "epoch": 4.73, + "learning_rate": 1.633966666666667e-05, + "loss": 3.4926, + "step": 51000 + }, + { + "epoch": 4.73, + "eval_loss": 3.572871685028076, + "eval_runtime": 23.7903, + "eval_samples_per_second": 106.009, + "eval_steps_per_second": 6.641, + "step": 51000 + }, + { + "epoch": 4.74, + "learning_rate": 1.6306333333333337e-05, + "loss": 3.4759, + "step": 51100 + }, + { + "epoch": 4.75, + "learning_rate": 1.6273e-05, + "loss": 3.4418, + "step": 51200 + }, + { + "epoch": 4.76, + "learning_rate": 1.623966666666667e-05, + "loss": 3.4926, + "step": 51300 + }, + { + "epoch": 4.76, + "learning_rate": 1.6206333333333334e-05, + "loss": 3.4671, + "step": 51400 + }, + { + "epoch": 4.77, + "learning_rate": 1.6173000000000003e-05, + "loss": 3.4974, + "step": 51500 + }, + { + "epoch": 4.77, + "eval_loss": 3.5724916458129883, + "eval_runtime": 23.8031, + "eval_samples_per_second": 105.952, + "eval_steps_per_second": 6.638, + "step": 51500 + }, + { + "epoch": 4.78, + "learning_rate": 1.6139666666666667e-05, + "loss": 3.4833, + "step": 51600 + }, + { + "epoch": 4.79, + "learning_rate": 1.6106333333333335e-05, + "loss": 3.4603, + "step": 51700 + }, + { + "epoch": 4.8, + "learning_rate": 1.6073e-05, + "loss": 3.4809, + "step": 51800 + }, + { + "epoch": 4.81, + "learning_rate": 1.6039666666666668e-05, + "loss": 3.4697, + "step": 51900 + }, + { + "epoch": 4.82, + "learning_rate": 1.6006333333333336e-05, + "loss": 3.4796, + "step": 52000 + }, + { + "epoch": 4.82, + "eval_loss": 3.568263292312622, + "eval_runtime": 23.7971, + "eval_samples_per_second": 105.979, + "eval_steps_per_second": 6.639, + "step": 52000 + }, + { + "epoch": 4.83, + "learning_rate": 1.5973e-05, + "loss": 3.4968, + "step": 52100 + }, + { + "epoch": 4.84, + "learning_rate": 1.593966666666667e-05, + "loss": 3.469, + "step": 52200 + }, + { + "epoch": 4.85, + "learning_rate": 1.5906333333333334e-05, + "loss": 3.4832, + "step": 52300 + }, + { + "epoch": 4.86, + "learning_rate": 1.5873000000000002e-05, + "loss": 3.4819, + "step": 52400 + }, + { + "epoch": 4.87, + "learning_rate": 1.5839666666666667e-05, + "loss": 3.4817, + "step": 52500 + }, + { + "epoch": 4.87, + "eval_loss": 3.5706946849823, + "eval_runtime": 23.8071, + "eval_samples_per_second": 105.935, + "eval_steps_per_second": 6.637, + "step": 52500 + }, + { + "epoch": 4.88, + "learning_rate": 1.5806333333333335e-05, + "loss": 3.4747, + "step": 52600 + }, + { + "epoch": 4.89, + "learning_rate": 1.5773000000000003e-05, + "loss": 3.4643, + "step": 52700 + }, + { + "epoch": 4.89, + "learning_rate": 1.5739666666666668e-05, + "loss": 3.4446, + "step": 52800 + }, + { + "epoch": 4.9, + "learning_rate": 1.5706333333333336e-05, + "loss": 3.4721, + "step": 52900 + }, + { + "epoch": 4.91, + "learning_rate": 1.5673e-05, + "loss": 3.4683, + "step": 53000 + }, + { + "epoch": 4.91, + "eval_loss": 3.5720579624176025, + "eval_runtime": 23.787, + "eval_samples_per_second": 106.024, + "eval_steps_per_second": 6.642, + "step": 53000 + }, + { + "epoch": 4.92, + "learning_rate": 1.563966666666667e-05, + "loss": 3.4983, + "step": 53100 + }, + { + "epoch": 4.93, + "learning_rate": 1.5606333333333333e-05, + "loss": 3.4543, + "step": 53200 + }, + { + "epoch": 4.94, + "learning_rate": 1.5573e-05, + "loss": 3.4886, + "step": 53300 + }, + { + "epoch": 4.95, + "learning_rate": 1.553966666666667e-05, + "loss": 3.4598, + "step": 53400 + }, + { + "epoch": 4.96, + "learning_rate": 1.5506333333333334e-05, + "loss": 3.4986, + "step": 53500 + }, + { + "epoch": 4.96, + "eval_loss": 3.5689127445220947, + "eval_runtime": 23.9712, + "eval_samples_per_second": 105.21, + "eval_steps_per_second": 6.591, + "step": 53500 + }, + { + "epoch": 4.97, + "learning_rate": 1.5473000000000002e-05, + "loss": 3.4927, + "step": 53600 + }, + { + "epoch": 4.98, + "learning_rate": 1.5439666666666667e-05, + "loss": 3.4767, + "step": 53700 + }, + { + "epoch": 4.99, + "learning_rate": 1.5406333333333335e-05, + "loss": 3.4679, + "step": 53800 + }, + { + "epoch": 5.0, + "learning_rate": 1.5373333333333334e-05, + "loss": 3.4892, + "step": 53900 + }, + { + "epoch": 5.01, + "learning_rate": 1.534e-05, + "loss": 3.4763, + "step": 54000 + }, + { + "epoch": 5.01, + "eval_loss": 3.571553945541382, + "eval_runtime": 23.7751, + "eval_samples_per_second": 106.077, + "eval_steps_per_second": 6.646, + "step": 54000 + }, + { + "epoch": 5.01, + "learning_rate": 1.5306666666666667e-05, + "loss": 3.448, + "step": 54100 + }, + { + "epoch": 5.02, + "learning_rate": 1.527333333333333e-05, + "loss": 3.4476, + "step": 54200 + }, + { + "epoch": 5.03, + "learning_rate": 1.524e-05, + "loss": 3.452, + "step": 54300 + }, + { + "epoch": 5.04, + "learning_rate": 1.5206666666666668e-05, + "loss": 3.4499, + "step": 54400 + }, + { + "epoch": 5.05, + "learning_rate": 1.5173333333333334e-05, + "loss": 3.4668, + "step": 54500 + }, + { + "epoch": 5.05, + "eval_loss": 3.5700042247772217, + "eval_runtime": 23.7777, + "eval_samples_per_second": 106.066, + "eval_steps_per_second": 6.645, + "step": 54500 + }, + { + "epoch": 5.06, + "learning_rate": 1.5140000000000002e-05, + "loss": 3.446, + "step": 54600 + }, + { + "epoch": 5.07, + "learning_rate": 1.5106666666666667e-05, + "loss": 3.449, + "step": 54700 + }, + { + "epoch": 5.08, + "learning_rate": 1.5073333333333335e-05, + "loss": 3.4855, + "step": 54800 + }, + { + "epoch": 5.09, + "learning_rate": 1.504e-05, + "loss": 3.4352, + "step": 54900 + }, + { + "epoch": 5.1, + "learning_rate": 1.5006666666666668e-05, + "loss": 3.4274, + "step": 55000 + }, + { + "epoch": 5.1, + "eval_loss": 3.5723605155944824, + "eval_runtime": 23.7887, + "eval_samples_per_second": 106.017, + "eval_steps_per_second": 6.642, + "step": 55000 + }, + { + "epoch": 5.11, + "learning_rate": 1.4973333333333335e-05, + "loss": 3.4536, + "step": 55100 + }, + { + "epoch": 5.12, + "learning_rate": 1.4940000000000001e-05, + "loss": 3.4443, + "step": 55200 + }, + { + "epoch": 5.13, + "learning_rate": 1.4906666666666667e-05, + "loss": 3.4531, + "step": 55300 + }, + { + "epoch": 5.14, + "learning_rate": 1.4873333333333334e-05, + "loss": 3.4414, + "step": 55400 + }, + { + "epoch": 5.14, + "learning_rate": 1.484e-05, + "loss": 3.4499, + "step": 55500 + }, + { + "epoch": 5.14, + "eval_loss": 3.571748971939087, + "eval_runtime": 23.7939, + "eval_samples_per_second": 105.994, + "eval_steps_per_second": 6.64, + "step": 55500 + }, + { + "epoch": 5.15, + "learning_rate": 1.4806666666666668e-05, + "loss": 3.4533, + "step": 55600 + }, + { + "epoch": 5.16, + "learning_rate": 1.4773333333333335e-05, + "loss": 3.4528, + "step": 55700 + }, + { + "epoch": 5.17, + "learning_rate": 1.4740000000000001e-05, + "loss": 3.4617, + "step": 55800 + }, + { + "epoch": 5.18, + "learning_rate": 1.4706666666666668e-05, + "loss": 3.454, + "step": 55900 + }, + { + "epoch": 5.19, + "learning_rate": 1.4673333333333334e-05, + "loss": 3.4507, + "step": 56000 + }, + { + "epoch": 5.19, + "eval_loss": 3.5706114768981934, + "eval_runtime": 23.7225, + "eval_samples_per_second": 106.313, + "eval_steps_per_second": 6.66, + "step": 56000 + }, + { + "epoch": 5.2, + "learning_rate": 1.4640333333333335e-05, + "loss": 3.4436, + "step": 56100 + }, + { + "epoch": 5.21, + "learning_rate": 1.4607000000000001e-05, + "loss": 3.4584, + "step": 56200 + }, + { + "epoch": 5.22, + "learning_rate": 1.4573666666666667e-05, + "loss": 3.4517, + "step": 56300 + }, + { + "epoch": 5.23, + "learning_rate": 1.4540333333333334e-05, + "loss": 3.4581, + "step": 56400 + }, + { + "epoch": 5.24, + "learning_rate": 1.4507e-05, + "loss": 3.4343, + "step": 56500 + }, + { + "epoch": 5.24, + "eval_loss": 3.5696890354156494, + "eval_runtime": 23.8115, + "eval_samples_per_second": 105.915, + "eval_steps_per_second": 6.635, + "step": 56500 + }, + { + "epoch": 5.25, + "learning_rate": 1.4473666666666668e-05, + "loss": 3.4306, + "step": 56600 + }, + { + "epoch": 5.26, + "learning_rate": 1.4440333333333335e-05, + "loss": 3.4402, + "step": 56700 + }, + { + "epoch": 5.27, + "learning_rate": 1.4407000000000001e-05, + "loss": 3.453, + "step": 56800 + }, + { + "epoch": 5.27, + "learning_rate": 1.4373666666666668e-05, + "loss": 3.434, + "step": 56900 + }, + { + "epoch": 5.28, + "learning_rate": 1.4340333333333334e-05, + "loss": 3.4151, + "step": 57000 + }, + { + "epoch": 5.28, + "eval_loss": 3.5709781646728516, + "eval_runtime": 23.7861, + "eval_samples_per_second": 106.028, + "eval_steps_per_second": 6.643, + "step": 57000 + }, + { + "epoch": 5.29, + "learning_rate": 1.4307e-05, + "loss": 3.4446, + "step": 57100 + }, + { + "epoch": 5.3, + "learning_rate": 1.4273666666666667e-05, + "loss": 3.4401, + "step": 57200 + }, + { + "epoch": 5.31, + "learning_rate": 1.4240333333333333e-05, + "loss": 3.4597, + "step": 57300 + }, + { + "epoch": 5.32, + "learning_rate": 1.4207000000000001e-05, + "loss": 3.4369, + "step": 57400 + }, + { + "epoch": 5.33, + "learning_rate": 1.4173666666666668e-05, + "loss": 3.4469, + "step": 57500 + }, + { + "epoch": 5.33, + "eval_loss": 3.571150779724121, + "eval_runtime": 23.8246, + "eval_samples_per_second": 105.857, + "eval_steps_per_second": 6.632, + "step": 57500 + }, + { + "epoch": 5.34, + "learning_rate": 1.4140333333333334e-05, + "loss": 3.4317, + "step": 57600 + }, + { + "epoch": 5.35, + "learning_rate": 1.4107e-05, + "loss": 3.4437, + "step": 57700 + }, + { + "epoch": 5.36, + "learning_rate": 1.4073666666666667e-05, + "loss": 3.4488, + "step": 57800 + }, + { + "epoch": 5.37, + "learning_rate": 1.4040333333333333e-05, + "loss": 3.4404, + "step": 57900 + }, + { + "epoch": 5.38, + "learning_rate": 1.4007e-05, + "loss": 3.458, + "step": 58000 + }, + { + "epoch": 5.38, + "eval_loss": 3.5692391395568848, + "eval_runtime": 23.7837, + "eval_samples_per_second": 106.039, + "eval_steps_per_second": 6.643, + "step": 58000 + }, + { + "epoch": 5.39, + "learning_rate": 1.3973666666666666e-05, + "loss": 3.4656, + "step": 58100 + }, + { + "epoch": 5.39, + "learning_rate": 1.3940333333333334e-05, + "loss": 3.4639, + "step": 58200 + }, + { + "epoch": 5.4, + "learning_rate": 1.3907000000000001e-05, + "loss": 3.4502, + "step": 58300 + }, + { + "epoch": 5.41, + "learning_rate": 1.3873666666666667e-05, + "loss": 3.4752, + "step": 58400 + }, + { + "epoch": 5.42, + "learning_rate": 1.3840333333333334e-05, + "loss": 3.4559, + "step": 58500 + }, + { + "epoch": 5.42, + "eval_loss": 3.567958116531372, + "eval_runtime": 23.7984, + "eval_samples_per_second": 105.974, + "eval_steps_per_second": 6.639, + "step": 58500 + }, + { + "epoch": 5.43, + "learning_rate": 1.3807e-05, + "loss": 3.4518, + "step": 58600 + }, + { + "epoch": 5.44, + "learning_rate": 1.3774e-05, + "loss": 3.418, + "step": 58700 + }, + { + "epoch": 5.45, + "learning_rate": 1.3740666666666667e-05, + "loss": 3.438, + "step": 58800 + }, + { + "epoch": 5.46, + "learning_rate": 1.3707333333333333e-05, + "loss": 3.4315, + "step": 58900 + }, + { + "epoch": 5.47, + "learning_rate": 1.3674e-05, + "loss": 3.4354, + "step": 59000 + }, + { + "epoch": 5.47, + "eval_loss": 3.56830096244812, + "eval_runtime": 23.7688, + "eval_samples_per_second": 106.106, + "eval_steps_per_second": 6.647, + "step": 59000 + }, + { + "epoch": 5.48, + "learning_rate": 1.3640666666666666e-05, + "loss": 3.4574, + "step": 59100 + }, + { + "epoch": 5.49, + "learning_rate": 1.3607333333333334e-05, + "loss": 3.4327, + "step": 59200 + }, + { + "epoch": 5.5, + "learning_rate": 1.3574e-05, + "loss": 3.4542, + "step": 59300 + }, + { + "epoch": 5.51, + "learning_rate": 1.3540666666666667e-05, + "loss": 3.4424, + "step": 59400 + }, + { + "epoch": 5.52, + "learning_rate": 1.3507333333333334e-05, + "loss": 3.4479, + "step": 59500 + }, + { + "epoch": 5.52, + "eval_loss": 3.570251703262329, + "eval_runtime": 23.7862, + "eval_samples_per_second": 106.028, + "eval_steps_per_second": 6.643, + "step": 59500 + }, + { + "epoch": 5.52, + "learning_rate": 1.3474e-05, + "loss": 3.4637, + "step": 59600 + }, + { + "epoch": 5.53, + "learning_rate": 1.3440666666666667e-05, + "loss": 3.4584, + "step": 59700 + }, + { + "epoch": 5.54, + "learning_rate": 1.3407333333333333e-05, + "loss": 3.4347, + "step": 59800 + }, + { + "epoch": 5.55, + "learning_rate": 1.3374e-05, + "loss": 3.4526, + "step": 59900 + }, + { + "epoch": 5.56, + "learning_rate": 1.3340666666666667e-05, + "loss": 3.4627, + "step": 60000 + }, + { + "epoch": 5.56, + "eval_loss": 3.5677947998046875, + "eval_runtime": 23.7816, + "eval_samples_per_second": 106.049, + "eval_steps_per_second": 6.644, + "step": 60000 + }, + { + "epoch": 5.57, + "learning_rate": 1.3307333333333334e-05, + "loss": 3.4423, + "step": 60100 + }, + { + "epoch": 5.58, + "learning_rate": 1.3274e-05, + "loss": 3.4469, + "step": 60200 + }, + { + "epoch": 5.59, + "learning_rate": 1.3240666666666667e-05, + "loss": 3.4423, + "step": 60300 + }, + { + "epoch": 5.6, + "learning_rate": 1.3207333333333333e-05, + "loss": 3.4322, + "step": 60400 + }, + { + "epoch": 5.61, + "learning_rate": 1.3174e-05, + "loss": 3.4478, + "step": 60500 + }, + { + "epoch": 5.61, + "eval_loss": 3.565863847732544, + "eval_runtime": 23.8989, + "eval_samples_per_second": 105.528, + "eval_steps_per_second": 6.611, + "step": 60500 + }, + { + "epoch": 5.62, + "learning_rate": 1.3140666666666666e-05, + "loss": 3.4294, + "step": 60600 + }, + { + "epoch": 5.63, + "learning_rate": 1.3107333333333332e-05, + "loss": 3.4547, + "step": 60700 + }, + { + "epoch": 5.64, + "learning_rate": 1.3074e-05, + "loss": 3.4605, + "step": 60800 + }, + { + "epoch": 5.65, + "learning_rate": 1.3040666666666667e-05, + "loss": 3.4517, + "step": 60900 + }, + { + "epoch": 5.65, + "learning_rate": 1.3007333333333333e-05, + "loss": 3.4645, + "step": 61000 + }, + { + "epoch": 5.65, + "eval_loss": 3.567469596862793, + "eval_runtime": 23.816, + "eval_samples_per_second": 105.895, + "eval_steps_per_second": 6.634, + "step": 61000 + }, + { + "epoch": 5.66, + "learning_rate": 1.2974e-05, + "loss": 3.4338, + "step": 61100 + }, + { + "epoch": 5.67, + "learning_rate": 1.2940666666666666e-05, + "loss": 3.4675, + "step": 61200 + }, + { + "epoch": 5.68, + "learning_rate": 1.2907333333333333e-05, + "loss": 3.4542, + "step": 61300 + }, + { + "epoch": 5.69, + "learning_rate": 1.2873999999999999e-05, + "loss": 3.46, + "step": 61400 + }, + { + "epoch": 5.7, + "learning_rate": 1.2840666666666667e-05, + "loss": 3.4658, + "step": 61500 + }, + { + "epoch": 5.7, + "eval_loss": 3.5665736198425293, + "eval_runtime": 23.7752, + "eval_samples_per_second": 106.077, + "eval_steps_per_second": 6.646, + "step": 61500 + }, + { + "epoch": 5.71, + "learning_rate": 1.2807333333333334e-05, + "loss": 3.4398, + "step": 61600 + }, + { + "epoch": 5.72, + "learning_rate": 1.2774e-05, + "loss": 3.4504, + "step": 61700 + }, + { + "epoch": 5.73, + "learning_rate": 1.2740666666666666e-05, + "loss": 3.4197, + "step": 61800 + }, + { + "epoch": 5.74, + "learning_rate": 1.2707333333333333e-05, + "loss": 3.4719, + "step": 61900 + }, + { + "epoch": 5.75, + "learning_rate": 1.2674e-05, + "loss": 3.4657, + "step": 62000 + }, + { + "epoch": 5.75, + "eval_loss": 3.5658366680145264, + "eval_runtime": 23.7952, + "eval_samples_per_second": 105.988, + "eval_steps_per_second": 6.64, + "step": 62000 + }, + { + "epoch": 5.76, + "learning_rate": 1.2641e-05, + "loss": 3.4558, + "step": 62100 + }, + { + "epoch": 5.77, + "learning_rate": 1.2607666666666666e-05, + "loss": 3.4448, + "step": 62200 + }, + { + "epoch": 5.77, + "learning_rate": 1.2574333333333333e-05, + "loss": 3.4367, + "step": 62300 + }, + { + "epoch": 5.78, + "learning_rate": 1.2540999999999999e-05, + "loss": 3.4363, + "step": 62400 + }, + { + "epoch": 5.79, + "learning_rate": 1.2507666666666667e-05, + "loss": 3.4618, + "step": 62500 + }, + { + "epoch": 5.79, + "eval_loss": 3.565267324447632, + "eval_runtime": 23.7952, + "eval_samples_per_second": 105.988, + "eval_steps_per_second": 6.64, + "step": 62500 + }, + { + "epoch": 5.8, + "learning_rate": 1.2474666666666666e-05, + "loss": 3.4519, + "step": 62600 + }, + { + "epoch": 5.81, + "learning_rate": 1.2441333333333332e-05, + "loss": 3.4173, + "step": 62700 + }, + { + "epoch": 5.82, + "learning_rate": 1.2408e-05, + "loss": 3.4429, + "step": 62800 + }, + { + "epoch": 5.83, + "learning_rate": 1.2374666666666667e-05, + "loss": 3.4334, + "step": 62900 + }, + { + "epoch": 5.84, + "learning_rate": 1.2341333333333333e-05, + "loss": 3.4541, + "step": 63000 + }, + { + "epoch": 5.84, + "eval_loss": 3.565286874771118, + "eval_runtime": 23.7849, + "eval_samples_per_second": 106.034, + "eval_steps_per_second": 6.643, + "step": 63000 + }, + { + "epoch": 5.85, + "learning_rate": 1.2308e-05, + "loss": 3.4434, + "step": 63100 + }, + { + "epoch": 5.86, + "learning_rate": 1.2274666666666666e-05, + "loss": 3.4478, + "step": 63200 + }, + { + "epoch": 5.87, + "learning_rate": 1.2241333333333333e-05, + "loss": 3.4517, + "step": 63300 + }, + { + "epoch": 5.88, + "learning_rate": 1.2207999999999999e-05, + "loss": 3.462, + "step": 63400 + }, + { + "epoch": 5.89, + "learning_rate": 1.2174666666666665e-05, + "loss": 3.4552, + "step": 63500 + }, + { + "epoch": 5.89, + "eval_loss": 3.5648128986358643, + "eval_runtime": 23.8, + "eval_samples_per_second": 105.966, + "eval_steps_per_second": 6.639, + "step": 63500 + }, + { + "epoch": 5.9, + "learning_rate": 1.2141333333333334e-05, + "loss": 3.4486, + "step": 63600 + }, + { + "epoch": 5.9, + "learning_rate": 1.2108e-05, + "loss": 3.4296, + "step": 63700 + }, + { + "epoch": 5.91, + "learning_rate": 1.2074666666666666e-05, + "loss": 3.4678, + "step": 63800 + }, + { + "epoch": 5.92, + "learning_rate": 1.2041333333333334e-05, + "loss": 3.4566, + "step": 63900 + }, + { + "epoch": 5.93, + "learning_rate": 1.2008000000000001e-05, + "loss": 3.4679, + "step": 64000 + }, + { + "epoch": 5.93, + "eval_loss": 3.5647895336151123, + "eval_runtime": 23.7762, + "eval_samples_per_second": 106.073, + "eval_steps_per_second": 6.645, + "step": 64000 + }, + { + "epoch": 5.94, + "learning_rate": 1.1974666666666667e-05, + "loss": 3.4522, + "step": 64100 + }, + { + "epoch": 5.95, + "learning_rate": 1.1941333333333334e-05, + "loss": 3.4432, + "step": 64200 + }, + { + "epoch": 5.96, + "learning_rate": 1.1908000000000002e-05, + "loss": 3.4428, + "step": 64300 + }, + { + "epoch": 5.97, + "learning_rate": 1.1874666666666668e-05, + "loss": 3.4462, + "step": 64400 + }, + { + "epoch": 5.98, + "learning_rate": 1.1841333333333335e-05, + "loss": 3.4423, + "step": 64500 + }, + { + "epoch": 5.98, + "eval_loss": 3.5651543140411377, + "eval_runtime": 23.7835, + "eval_samples_per_second": 106.04, + "eval_steps_per_second": 6.643, + "step": 64500 + }, + { + "epoch": 5.99, + "learning_rate": 1.1808000000000001e-05, + "loss": 3.4277, + "step": 64600 + }, + { + "epoch": 6.0, + "learning_rate": 1.1774666666666668e-05, + "loss": 3.4397, + "step": 64700 + }, + { + "epoch": 6.01, + "learning_rate": 1.1741333333333334e-05, + "loss": 3.4286, + "step": 64800 + }, + { + "epoch": 6.02, + "learning_rate": 1.1708e-05, + "loss": 3.4134, + "step": 64900 + }, + { + "epoch": 6.03, + "learning_rate": 1.1674666666666667e-05, + "loss": 3.3893, + "step": 65000 + }, + { + "epoch": 6.03, + "eval_loss": 3.564636707305908, + "eval_runtime": 23.8888, + "eval_samples_per_second": 105.572, + "eval_steps_per_second": 6.614, + "step": 65000 + }, + { + "epoch": 6.03, + "learning_rate": 1.1641333333333335e-05, + "loss": 3.4141, + "step": 65100 + }, + { + "epoch": 6.04, + "learning_rate": 1.1608000000000001e-05, + "loss": 3.4166, + "step": 65200 + }, + { + "epoch": 6.05, + "learning_rate": 1.1574666666666668e-05, + "loss": 3.4112, + "step": 65300 + }, + { + "epoch": 6.06, + "learning_rate": 1.1541333333333334e-05, + "loss": 3.4282, + "step": 65400 + }, + { + "epoch": 6.07, + "learning_rate": 1.1508e-05, + "loss": 3.4239, + "step": 65500 + }, + { + "epoch": 6.07, + "eval_loss": 3.566786527633667, + "eval_runtime": 23.7837, + "eval_samples_per_second": 106.039, + "eval_steps_per_second": 6.643, + "step": 65500 + }, + { + "epoch": 6.08, + "learning_rate": 1.1474666666666667e-05, + "loss": 3.4181, + "step": 65600 + }, + { + "epoch": 6.09, + "learning_rate": 1.1441333333333333e-05, + "loss": 3.4323, + "step": 65700 + }, + { + "epoch": 6.1, + "learning_rate": 1.1408e-05, + "loss": 3.4269, + "step": 65800 + }, + { + "epoch": 6.11, + "learning_rate": 1.1374666666666668e-05, + "loss": 3.4389, + "step": 65900 + }, + { + "epoch": 6.12, + "learning_rate": 1.1341333333333334e-05, + "loss": 3.4329, + "step": 66000 + }, + { + "epoch": 6.12, + "eval_loss": 3.563938856124878, + "eval_runtime": 23.781, + "eval_samples_per_second": 106.051, + "eval_steps_per_second": 6.644, + "step": 66000 + }, + { + "epoch": 6.13, + "learning_rate": 1.1308e-05, + "loss": 3.4201, + "step": 66100 + }, + { + "epoch": 6.14, + "learning_rate": 1.1274666666666667e-05, + "loss": 3.4139, + "step": 66200 + }, + { + "epoch": 6.15, + "learning_rate": 1.1241333333333334e-05, + "loss": 3.4178, + "step": 66300 + }, + { + "epoch": 6.15, + "learning_rate": 1.1208333333333334e-05, + "loss": 3.4203, + "step": 66400 + }, + { + "epoch": 6.16, + "learning_rate": 1.1175e-05, + "loss": 3.4151, + "step": 66500 + }, + { + "epoch": 6.16, + "eval_loss": 3.564937114715576, + "eval_runtime": 23.7834, + "eval_samples_per_second": 106.04, + "eval_steps_per_second": 6.643, + "step": 66500 + }, + { + "epoch": 6.17, + "learning_rate": 1.1141666666666667e-05, + "loss": 3.4345, + "step": 66600 + }, + { + "epoch": 6.18, + "learning_rate": 1.1108333333333333e-05, + "loss": 3.4681, + "step": 66700 + }, + { + "epoch": 6.19, + "learning_rate": 1.1075e-05, + "loss": 3.4163, + "step": 66800 + }, + { + "epoch": 6.2, + "learning_rate": 1.1041666666666668e-05, + "loss": 3.422, + "step": 66900 + }, + { + "epoch": 6.21, + "learning_rate": 1.1008333333333334e-05, + "loss": 3.4181, + "step": 67000 + }, + { + "epoch": 6.21, + "eval_loss": 3.568237066268921, + "eval_runtime": 23.8051, + "eval_samples_per_second": 105.944, + "eval_steps_per_second": 6.637, + "step": 67000 + }, + { + "epoch": 6.22, + "learning_rate": 1.0975e-05, + "loss": 3.4151, + "step": 67100 + }, + { + "epoch": 6.23, + "learning_rate": 1.0941666666666667e-05, + "loss": 3.4258, + "step": 67200 + }, + { + "epoch": 6.24, + "learning_rate": 1.0908333333333334e-05, + "loss": 3.411, + "step": 67300 + }, + { + "epoch": 6.25, + "learning_rate": 1.0875e-05, + "loss": 3.4326, + "step": 67400 + }, + { + "epoch": 6.26, + "learning_rate": 1.0841666666666666e-05, + "loss": 3.4314, + "step": 67500 + }, + { + "epoch": 6.26, + "eval_loss": 3.566948413848877, + "eval_runtime": 23.7688, + "eval_samples_per_second": 106.105, + "eval_steps_per_second": 6.647, + "step": 67500 + }, + { + "epoch": 6.27, + "learning_rate": 1.0808333333333333e-05, + "loss": 3.3988, + "step": 67600 + }, + { + "epoch": 6.28, + "learning_rate": 1.0775000000000001e-05, + "loss": 3.4227, + "step": 67700 + }, + { + "epoch": 6.28, + "learning_rate": 1.0741666666666667e-05, + "loss": 3.4358, + "step": 67800 + }, + { + "epoch": 6.29, + "learning_rate": 1.0708333333333334e-05, + "loss": 3.4499, + "step": 67900 + }, + { + "epoch": 6.3, + "learning_rate": 1.0675e-05, + "loss": 3.4245, + "step": 68000 + }, + { + "epoch": 6.3, + "eval_loss": 3.562889575958252, + "eval_runtime": 23.7724, + "eval_samples_per_second": 106.089, + "eval_steps_per_second": 6.646, + "step": 68000 + }, + { + "epoch": 6.31, + "learning_rate": 1.0641666666666667e-05, + "loss": 3.4124, + "step": 68100 + }, + { + "epoch": 6.32, + "learning_rate": 1.0608333333333333e-05, + "loss": 3.4106, + "step": 68200 + }, + { + "epoch": 6.33, + "learning_rate": 1.0575e-05, + "loss": 3.4115, + "step": 68300 + }, + { + "epoch": 6.34, + "learning_rate": 1.0541666666666666e-05, + "loss": 3.4309, + "step": 68400 + }, + { + "epoch": 6.35, + "learning_rate": 1.0508333333333334e-05, + "loss": 3.421, + "step": 68500 + }, + { + "epoch": 6.35, + "eval_loss": 3.566340446472168, + "eval_runtime": 23.7726, + "eval_samples_per_second": 106.089, + "eval_steps_per_second": 6.646, + "step": 68500 + }, + { + "epoch": 6.36, + "learning_rate": 1.0475e-05, + "loss": 3.4107, + "step": 68600 + }, + { + "epoch": 6.37, + "learning_rate": 1.0441666666666667e-05, + "loss": 3.4152, + "step": 68700 + }, + { + "epoch": 6.38, + "learning_rate": 1.0408333333333333e-05, + "loss": 3.4019, + "step": 68800 + }, + { + "epoch": 6.39, + "learning_rate": 1.0375e-05, + "loss": 3.3954, + "step": 68900 + }, + { + "epoch": 6.4, + "learning_rate": 1.0341666666666666e-05, + "loss": 3.4329, + "step": 69000 + }, + { + "epoch": 6.4, + "eval_loss": 3.565971851348877, + "eval_runtime": 23.7798, + "eval_samples_per_second": 106.056, + "eval_steps_per_second": 6.644, + "step": 69000 + }, + { + "epoch": 6.41, + "learning_rate": 1.0308333333333333e-05, + "loss": 3.419, + "step": 69100 + }, + { + "epoch": 6.41, + "learning_rate": 1.0275e-05, + "loss": 3.4272, + "step": 69200 + }, + { + "epoch": 6.42, + "learning_rate": 1.0241666666666667e-05, + "loss": 3.4056, + "step": 69300 + }, + { + "epoch": 6.43, + "learning_rate": 1.0208333333333334e-05, + "loss": 3.4373, + "step": 69400 + }, + { + "epoch": 6.44, + "learning_rate": 1.0175333333333334e-05, + "loss": 3.4122, + "step": 69500 + }, + { + "epoch": 6.44, + "eval_loss": 3.5650551319122314, + "eval_runtime": 23.7808, + "eval_samples_per_second": 106.052, + "eval_steps_per_second": 6.644, + "step": 69500 + }, + { + "epoch": 6.45, + "learning_rate": 1.0142e-05, + "loss": 3.4152, + "step": 69600 + }, + { + "epoch": 6.46, + "learning_rate": 1.0109000000000001e-05, + "loss": 3.463, + "step": 69700 + }, + { + "epoch": 6.47, + "learning_rate": 1.0075666666666667e-05, + "loss": 3.4214, + "step": 69800 + }, + { + "epoch": 6.48, + "learning_rate": 1.0042333333333334e-05, + "loss": 3.4423, + "step": 69900 + }, + { + "epoch": 6.49, + "learning_rate": 1.0009e-05, + "loss": 3.4362, + "step": 70000 + }, + { + "epoch": 6.49, + "eval_loss": 3.5628156661987305, + "eval_runtime": 23.761, + "eval_samples_per_second": 106.14, + "eval_steps_per_second": 6.65, + "step": 70000 + }, + { + "epoch": 6.5, + "learning_rate": 9.975666666666667e-06, + "loss": 3.4393, + "step": 70100 + }, + { + "epoch": 6.51, + "learning_rate": 9.942333333333333e-06, + "loss": 3.4448, + "step": 70200 + }, + { + "epoch": 6.52, + "learning_rate": 9.909e-06, + "loss": 3.4162, + "step": 70300 + }, + { + "epoch": 6.53, + "learning_rate": 9.875666666666666e-06, + "loss": 3.4291, + "step": 70400 + }, + { + "epoch": 6.54, + "learning_rate": 9.842333333333334e-06, + "loss": 3.4497, + "step": 70500 + }, + { + "epoch": 6.54, + "eval_loss": 3.5647528171539307, + "eval_runtime": 23.7333, + "eval_samples_per_second": 106.264, + "eval_steps_per_second": 6.657, + "step": 70500 + }, + { + "epoch": 6.54, + "learning_rate": 9.809e-06, + "loss": 3.4127, + "step": 70600 + }, + { + "epoch": 6.55, + "learning_rate": 9.775666666666667e-06, + "loss": 3.4347, + "step": 70700 + }, + { + "epoch": 6.56, + "learning_rate": 9.742333333333333e-06, + "loss": 3.4317, + "step": 70800 + }, + { + "epoch": 6.57, + "learning_rate": 9.709e-06, + "loss": 3.4201, + "step": 70900 + }, + { + "epoch": 6.58, + "learning_rate": 9.675666666666666e-06, + "loss": 3.431, + "step": 71000 + }, + { + "epoch": 6.58, + "eval_loss": 3.562624216079712, + "eval_runtime": 23.7932, + "eval_samples_per_second": 105.997, + "eval_steps_per_second": 6.641, + "step": 71000 + }, + { + "epoch": 6.59, + "learning_rate": 9.642333333333333e-06, + "loss": 3.4172, + "step": 71100 + }, + { + "epoch": 6.6, + "learning_rate": 9.608999999999999e-06, + "loss": 3.4387, + "step": 71200 + }, + { + "epoch": 6.61, + "learning_rate": 9.575666666666667e-06, + "loss": 3.4288, + "step": 71300 + }, + { + "epoch": 6.62, + "learning_rate": 9.542333333333334e-06, + "loss": 3.4287, + "step": 71400 + }, + { + "epoch": 6.63, + "learning_rate": 9.509e-06, + "loss": 3.432, + "step": 71500 + }, + { + "epoch": 6.63, + "eval_loss": 3.5648353099823, + "eval_runtime": 23.9564, + "eval_samples_per_second": 105.274, + "eval_steps_per_second": 6.595, + "step": 71500 + }, + { + "epoch": 6.64, + "learning_rate": 9.475666666666666e-06, + "loss": 3.4299, + "step": 71600 + }, + { + "epoch": 6.65, + "learning_rate": 9.442333333333333e-06, + "loss": 3.4381, + "step": 71700 + }, + { + "epoch": 6.66, + "learning_rate": 9.409e-06, + "loss": 3.4313, + "step": 71800 + }, + { + "epoch": 6.66, + "learning_rate": 9.375666666666666e-06, + "loss": 3.4254, + "step": 71900 + }, + { + "epoch": 6.67, + "learning_rate": 9.342333333333334e-06, + "loss": 3.4208, + "step": 72000 + }, + { + "epoch": 6.67, + "eval_loss": 3.563504934310913, + "eval_runtime": 23.7942, + "eval_samples_per_second": 105.992, + "eval_steps_per_second": 6.64, + "step": 72000 + }, + { + "epoch": 6.68, + "learning_rate": 9.309e-06, + "loss": 3.4348, + "step": 72100 + }, + { + "epoch": 6.69, + "learning_rate": 9.275666666666667e-06, + "loss": 3.4201, + "step": 72200 + }, + { + "epoch": 6.7, + "learning_rate": 9.242333333333333e-06, + "loss": 3.3956, + "step": 72300 + }, + { + "epoch": 6.71, + "learning_rate": 9.209e-06, + "loss": 3.4262, + "step": 72400 + }, + { + "epoch": 6.72, + "learning_rate": 9.175666666666666e-06, + "loss": 3.4526, + "step": 72500 + }, + { + "epoch": 6.72, + "eval_loss": 3.564483404159546, + "eval_runtime": 23.8074, + "eval_samples_per_second": 105.934, + "eval_steps_per_second": 6.637, + "step": 72500 + }, + { + "epoch": 6.73, + "learning_rate": 9.142333333333332e-06, + "loss": 3.4344, + "step": 72600 + }, + { + "epoch": 6.74, + "learning_rate": 9.109e-06, + "loss": 3.4501, + "step": 72700 + }, + { + "epoch": 6.75, + "learning_rate": 9.075666666666668e-06, + "loss": 3.405, + "step": 72800 + }, + { + "epoch": 6.76, + "learning_rate": 9.042333333333335e-06, + "loss": 3.4416, + "step": 72900 + }, + { + "epoch": 6.77, + "learning_rate": 9.009000000000001e-06, + "loss": 3.4139, + "step": 73000 + }, + { + "epoch": 6.77, + "eval_loss": 3.5620737075805664, + "eval_runtime": 23.7869, + "eval_samples_per_second": 106.025, + "eval_steps_per_second": 6.642, + "step": 73000 + }, + { + "epoch": 6.78, + "learning_rate": 8.975666666666668e-06, + "loss": 3.4286, + "step": 73100 + }, + { + "epoch": 6.79, + "learning_rate": 8.942333333333334e-06, + "loss": 3.428, + "step": 73200 + }, + { + "epoch": 6.79, + "learning_rate": 8.909e-06, + "loss": 3.4446, + "step": 73300 + }, + { + "epoch": 6.8, + "learning_rate": 8.875666666666667e-06, + "loss": 3.415, + "step": 73400 + }, + { + "epoch": 6.81, + "learning_rate": 8.842333333333333e-06, + "loss": 3.4212, + "step": 73500 + }, + { + "epoch": 6.81, + "eval_loss": 3.562941789627075, + "eval_runtime": 23.7809, + "eval_samples_per_second": 106.051, + "eval_steps_per_second": 6.644, + "step": 73500 + }, + { + "epoch": 6.82, + "learning_rate": 8.809000000000002e-06, + "loss": 3.4272, + "step": 73600 + }, + { + "epoch": 6.83, + "learning_rate": 8.775666666666668e-06, + "loss": 3.4448, + "step": 73700 + }, + { + "epoch": 6.84, + "learning_rate": 8.742333333333334e-06, + "loss": 3.4102, + "step": 73800 + }, + { + "epoch": 6.85, + "learning_rate": 8.709e-06, + "loss": 3.4299, + "step": 73900 + }, + { + "epoch": 6.86, + "learning_rate": 8.675666666666667e-06, + "loss": 3.4352, + "step": 74000 + }, + { + "epoch": 6.86, + "eval_loss": 3.5597243309020996, + "eval_runtime": 23.7901, + "eval_samples_per_second": 106.01, + "eval_steps_per_second": 6.641, + "step": 74000 + }, + { + "epoch": 6.87, + "learning_rate": 8.642333333333334e-06, + "loss": 3.4144, + "step": 74100 + }, + { + "epoch": 6.88, + "learning_rate": 8.609e-06, + "loss": 3.4328, + "step": 74200 + }, + { + "epoch": 6.89, + "learning_rate": 8.575666666666666e-06, + "loss": 3.4214, + "step": 74300 + }, + { + "epoch": 6.9, + "learning_rate": 8.542333333333335e-06, + "loss": 3.4055, + "step": 74400 + }, + { + "epoch": 6.91, + "learning_rate": 8.509000000000001e-06, + "loss": 3.4242, + "step": 74500 + }, + { + "epoch": 6.91, + "eval_loss": 3.559704303741455, + "eval_runtime": 23.8077, + "eval_samples_per_second": 105.932, + "eval_steps_per_second": 6.637, + "step": 74500 + }, + { + "epoch": 6.92, + "learning_rate": 8.475666666666667e-06, + "loss": 3.4363, + "step": 74600 + }, + { + "epoch": 6.92, + "learning_rate": 8.442333333333334e-06, + "loss": 3.4417, + "step": 74700 + }, + { + "epoch": 6.93, + "learning_rate": 8.409e-06, + "loss": 3.4616, + "step": 74800 + }, + { + "epoch": 6.94, + "learning_rate": 8.375666666666667e-06, + "loss": 3.4322, + "step": 74900 + }, + { + "epoch": 6.95, + "learning_rate": 8.342333333333333e-06, + "loss": 3.429, + "step": 75000 + }, + { + "epoch": 6.95, + "eval_loss": 3.561878204345703, + "eval_runtime": 23.7887, + "eval_samples_per_second": 106.017, + "eval_steps_per_second": 6.642, + "step": 75000 + }, + { + "epoch": 6.96, + "learning_rate": 8.309000000000001e-06, + "loss": 3.4187, + "step": 75100 + }, + { + "epoch": 6.97, + "learning_rate": 8.276e-06, + "loss": 3.419, + "step": 75200 + }, + { + "epoch": 6.98, + "learning_rate": 8.242666666666666e-06, + "loss": 3.4123, + "step": 75300 + }, + { + "epoch": 6.99, + "learning_rate": 8.209333333333335e-06, + "loss": 3.4338, + "step": 75400 + }, + { + "epoch": 7.0, + "learning_rate": 8.176000000000001e-06, + "loss": 3.4133, + "step": 75500 + }, + { + "epoch": 7.0, + "eval_loss": 3.5591769218444824, + "eval_runtime": 23.7782, + "eval_samples_per_second": 106.064, + "eval_steps_per_second": 6.645, + "step": 75500 + }, + { + "epoch": 7.01, + "learning_rate": 8.142666666666667e-06, + "loss": 3.4184, + "step": 75600 + }, + { + "epoch": 7.02, + "learning_rate": 8.109333333333334e-06, + "loss": 3.3904, + "step": 75700 + }, + { + "epoch": 7.03, + "learning_rate": 8.076e-06, + "loss": 3.3813, + "step": 75800 + }, + { + "epoch": 7.04, + "learning_rate": 8.042666666666667e-06, + "loss": 3.4422, + "step": 75900 + }, + { + "epoch": 7.04, + "learning_rate": 8.009333333333333e-06, + "loss": 3.4086, + "step": 76000 + }, + { + "epoch": 7.04, + "eval_loss": 3.562080144882202, + "eval_runtime": 23.7794, + "eval_samples_per_second": 106.058, + "eval_steps_per_second": 6.644, + "step": 76000 + }, + { + "epoch": 7.05, + "learning_rate": 7.976e-06, + "loss": 3.3967, + "step": 76100 + }, + { + "epoch": 7.06, + "learning_rate": 7.942666666666668e-06, + "loss": 3.3924, + "step": 76200 + }, + { + "epoch": 7.07, + "learning_rate": 7.909333333333334e-06, + "loss": 3.4045, + "step": 76300 + }, + { + "epoch": 7.08, + "learning_rate": 7.876e-06, + "loss": 3.4003, + "step": 76400 + }, + { + "epoch": 7.09, + "learning_rate": 7.842666666666667e-06, + "loss": 3.4056, + "step": 76500 + }, + { + "epoch": 7.09, + "eval_loss": 3.5603981018066406, + "eval_runtime": 23.7637, + "eval_samples_per_second": 106.128, + "eval_steps_per_second": 6.649, + "step": 76500 + }, + { + "epoch": 7.1, + "learning_rate": 7.809333333333333e-06, + "loss": 3.4325, + "step": 76600 + }, + { + "epoch": 7.11, + "learning_rate": 7.776e-06, + "loss": 3.4493, + "step": 76700 + }, + { + "epoch": 7.12, + "learning_rate": 7.742666666666666e-06, + "loss": 3.4245, + "step": 76800 + }, + { + "epoch": 7.13, + "learning_rate": 7.709333333333334e-06, + "loss": 3.3955, + "step": 76900 + }, + { + "epoch": 7.14, + "learning_rate": 7.676e-06, + "loss": 3.4158, + "step": 77000 + }, + { + "epoch": 7.14, + "eval_loss": 3.56292986869812, + "eval_runtime": 23.8347, + "eval_samples_per_second": 105.812, + "eval_steps_per_second": 6.629, + "step": 77000 + }, + { + "epoch": 7.15, + "learning_rate": 7.642666666666667e-06, + "loss": 3.3797, + "step": 77100 + }, + { + "epoch": 7.16, + "learning_rate": 7.6093333333333335e-06, + "loss": 3.4144, + "step": 77200 + }, + { + "epoch": 7.17, + "learning_rate": 7.576e-06, + "loss": 3.4127, + "step": 77300 + }, + { + "epoch": 7.17, + "learning_rate": 7.542666666666666e-06, + "loss": 3.4166, + "step": 77400 + }, + { + "epoch": 7.18, + "learning_rate": 7.509333333333333e-06, + "loss": 3.4153, + "step": 77500 + }, + { + "epoch": 7.18, + "eval_loss": 3.5608999729156494, + "eval_runtime": 23.7878, + "eval_samples_per_second": 106.021, + "eval_steps_per_second": 6.642, + "step": 77500 + }, + { + "epoch": 7.19, + "learning_rate": 7.476e-06, + "loss": 3.4062, + "step": 77600 + }, + { + "epoch": 7.2, + "learning_rate": 7.4426666666666665e-06, + "loss": 3.4288, + "step": 77700 + }, + { + "epoch": 7.21, + "learning_rate": 7.409333333333333e-06, + "loss": 3.4145, + "step": 77800 + }, + { + "epoch": 7.22, + "learning_rate": 7.376e-06, + "loss": 3.4103, + "step": 77900 + }, + { + "epoch": 7.23, + "learning_rate": 7.342666666666667e-06, + "loss": 3.4155, + "step": 78000 + }, + { + "epoch": 7.23, + "eval_loss": 3.5620954036712646, + "eval_runtime": 23.793, + "eval_samples_per_second": 105.997, + "eval_steps_per_second": 6.641, + "step": 78000 + }, + { + "epoch": 7.24, + "learning_rate": 7.309333333333333e-06, + "loss": 3.4204, + "step": 78100 + }, + { + "epoch": 7.25, + "learning_rate": 7.2759999999999995e-06, + "loss": 3.3973, + "step": 78200 + }, + { + "epoch": 7.26, + "learning_rate": 7.2426666666666676e-06, + "loss": 3.4022, + "step": 78300 + }, + { + "epoch": 7.27, + "learning_rate": 7.209333333333334e-06, + "loss": 3.3983, + "step": 78400 + }, + { + "epoch": 7.28, + "learning_rate": 7.176e-06, + "loss": 3.4117, + "step": 78500 + }, + { + "epoch": 7.28, + "eval_loss": 3.5625743865966797, + "eval_runtime": 23.7846, + "eval_samples_per_second": 106.035, + "eval_steps_per_second": 6.643, + "step": 78500 + }, + { + "epoch": 7.29, + "learning_rate": 7.142666666666667e-06, + "loss": 3.4003, + "step": 78600 + }, + { + "epoch": 7.3, + "learning_rate": 7.109333333333334e-06, + "loss": 3.4077, + "step": 78700 + }, + { + "epoch": 7.3, + "learning_rate": 7.0760000000000005e-06, + "loss": 3.4098, + "step": 78800 + }, + { + "epoch": 7.31, + "learning_rate": 7.042666666666667e-06, + "loss": 3.3968, + "step": 78900 + }, + { + "epoch": 7.32, + "learning_rate": 7.009333333333333e-06, + "loss": 3.407, + "step": 79000 + }, + { + "epoch": 7.32, + "eval_loss": 3.563791275024414, + "eval_runtime": 23.7857, + "eval_samples_per_second": 106.03, + "eval_steps_per_second": 6.643, + "step": 79000 + }, + { + "epoch": 7.33, + "learning_rate": 6.976000000000001e-06, + "loss": 3.4117, + "step": 79100 + }, + { + "epoch": 7.34, + "learning_rate": 6.942666666666667e-06, + "loss": 3.4233, + "step": 79200 + }, + { + "epoch": 7.35, + "learning_rate": 6.9093333333333335e-06, + "loss": 3.3917, + "step": 79300 + }, + { + "epoch": 7.36, + "learning_rate": 6.876e-06, + "loss": 3.3981, + "step": 79400 + }, + { + "epoch": 7.37, + "learning_rate": 6.843e-06, + "loss": 3.3977, + "step": 79500 + }, + { + "epoch": 7.37, + "eval_loss": 3.5603787899017334, + "eval_runtime": 23.7147, + "eval_samples_per_second": 106.348, + "eval_steps_per_second": 6.663, + "step": 79500 + }, + { + "epoch": 7.38, + "learning_rate": 6.81e-06, + "loss": 3.418, + "step": 79600 + }, + { + "epoch": 7.39, + "learning_rate": 6.7766666666666664e-06, + "loss": 3.4228, + "step": 79700 + }, + { + "epoch": 7.4, + "learning_rate": 6.743333333333334e-06, + "loss": 3.4012, + "step": 79800 + }, + { + "epoch": 7.41, + "learning_rate": 6.710000000000001e-06, + "loss": 3.4078, + "step": 79900 + }, + { + "epoch": 7.42, + "learning_rate": 6.676666666666667e-06, + "loss": 3.4134, + "step": 80000 + }, + { + "epoch": 7.42, + "eval_loss": 3.5610873699188232, + "eval_runtime": 23.7819, + "eval_samples_per_second": 106.047, + "eval_steps_per_second": 6.644, + "step": 80000 + }, + { + "epoch": 7.42, + "learning_rate": 6.643333333333334e-06, + "loss": 3.4111, + "step": 80100 + }, + { + "epoch": 7.43, + "learning_rate": 6.61e-06, + "loss": 3.4028, + "step": 80200 + }, + { + "epoch": 7.44, + "learning_rate": 6.5766666666666675e-06, + "loss": 3.4303, + "step": 80300 + }, + { + "epoch": 7.45, + "learning_rate": 6.543333333333334e-06, + "loss": 3.3938, + "step": 80400 + }, + { + "epoch": 7.46, + "learning_rate": 6.51e-06, + "loss": 3.4403, + "step": 80500 + }, + { + "epoch": 7.46, + "eval_loss": 3.563021659851074, + "eval_runtime": 23.7781, + "eval_samples_per_second": 106.064, + "eval_steps_per_second": 6.645, + "step": 80500 + }, + { + "epoch": 7.47, + "learning_rate": 6.476666666666667e-06, + "loss": 3.409, + "step": 80600 + }, + { + "epoch": 7.48, + "learning_rate": 6.443333333333334e-06, + "loss": 3.4077, + "step": 80700 + }, + { + "epoch": 7.49, + "learning_rate": 6.4100000000000005e-06, + "loss": 3.4048, + "step": 80800 + }, + { + "epoch": 7.5, + "learning_rate": 6.376666666666667e-06, + "loss": 3.3906, + "step": 80900 + }, + { + "epoch": 7.51, + "learning_rate": 6.343666666666667e-06, + "loss": 3.4002, + "step": 81000 + }, + { + "epoch": 7.51, + "eval_loss": 3.5600671768188477, + "eval_runtime": 23.7876, + "eval_samples_per_second": 106.022, + "eval_steps_per_second": 6.642, + "step": 81000 + }, + { + "epoch": 7.52, + "learning_rate": 6.310333333333334e-06, + "loss": 3.4011, + "step": 81100 + }, + { + "epoch": 7.53, + "learning_rate": 6.277e-06, + "loss": 3.4164, + "step": 81200 + }, + { + "epoch": 7.54, + "learning_rate": 6.243666666666667e-06, + "loss": 3.4116, + "step": 81300 + }, + { + "epoch": 7.55, + "learning_rate": 6.210333333333334e-06, + "loss": 3.3992, + "step": 81400 + }, + { + "epoch": 7.55, + "learning_rate": 6.177e-06, + "loss": 3.4147, + "step": 81500 + }, + { + "epoch": 7.55, + "eval_loss": 3.557727098464966, + "eval_runtime": 23.7591, + "eval_samples_per_second": 106.149, + "eval_steps_per_second": 6.65, + "step": 81500 + }, + { + "epoch": 7.56, + "learning_rate": 6.143666666666667e-06, + "loss": 3.4273, + "step": 81600 + }, + { + "epoch": 7.57, + "learning_rate": 6.110333333333334e-06, + "loss": 3.4086, + "step": 81700 + }, + { + "epoch": 7.58, + "learning_rate": 6.0770000000000004e-06, + "loss": 3.4102, + "step": 81800 + }, + { + "epoch": 7.59, + "learning_rate": 6.043666666666667e-06, + "loss": 3.4214, + "step": 81900 + }, + { + "epoch": 7.6, + "learning_rate": 6.010333333333333e-06, + "loss": 3.4068, + "step": 82000 + }, + { + "epoch": 7.6, + "eval_loss": 3.5587644577026367, + "eval_runtime": 23.8126, + "eval_samples_per_second": 105.91, + "eval_steps_per_second": 6.635, + "step": 82000 + }, + { + "epoch": 7.61, + "learning_rate": 5.9770000000000005e-06, + "loss": 3.422, + "step": 82100 + }, + { + "epoch": 7.62, + "learning_rate": 5.943666666666667e-06, + "loss": 3.4128, + "step": 82200 + }, + { + "epoch": 7.63, + "learning_rate": 5.910333333333333e-06, + "loss": 3.3978, + "step": 82300 + }, + { + "epoch": 7.64, + "learning_rate": 5.877e-06, + "loss": 3.4095, + "step": 82400 + }, + { + "epoch": 7.65, + "learning_rate": 5.843666666666667e-06, + "loss": 3.4165, + "step": 82500 + }, + { + "epoch": 7.65, + "eval_loss": 3.5612709522247314, + "eval_runtime": 23.7739, + "eval_samples_per_second": 106.083, + "eval_steps_per_second": 6.646, + "step": 82500 + }, + { + "epoch": 7.66, + "learning_rate": 5.8103333333333335e-06, + "loss": 3.3894, + "step": 82600 + }, + { + "epoch": 7.67, + "learning_rate": 5.777e-06, + "loss": 3.3967, + "step": 82700 + }, + { + "epoch": 7.68, + "learning_rate": 5.743666666666666e-06, + "loss": 3.4344, + "step": 82800 + }, + { + "epoch": 7.68, + "learning_rate": 5.710333333333334e-06, + "loss": 3.4359, + "step": 82900 + }, + { + "epoch": 7.69, + "learning_rate": 5.677e-06, + "loss": 3.409, + "step": 83000 + }, + { + "epoch": 7.69, + "eval_loss": 3.55955171585083, + "eval_runtime": 23.7881, + "eval_samples_per_second": 106.019, + "eval_steps_per_second": 6.642, + "step": 83000 + }, + { + "epoch": 7.7, + "learning_rate": 5.6436666666666664e-06, + "loss": 3.403, + "step": 83100 + }, + { + "epoch": 7.71, + "learning_rate": 5.610333333333333e-06, + "loss": 3.3973, + "step": 83200 + }, + { + "epoch": 7.72, + "learning_rate": 5.577e-06, + "loss": 3.4116, + "step": 83300 + }, + { + "epoch": 7.73, + "learning_rate": 5.5436666666666666e-06, + "loss": 3.4164, + "step": 83400 + }, + { + "epoch": 7.74, + "learning_rate": 5.510333333333333e-06, + "loss": 3.4213, + "step": 83500 + }, + { + "epoch": 7.74, + "eval_loss": 3.558309555053711, + "eval_runtime": 23.7902, + "eval_samples_per_second": 106.01, + "eval_steps_per_second": 6.641, + "step": 83500 + }, + { + "epoch": 7.75, + "learning_rate": 5.476999999999999e-06, + "loss": 3.4144, + "step": 83600 + }, + { + "epoch": 7.76, + "learning_rate": 5.443666666666667e-06, + "loss": 3.4265, + "step": 83700 + }, + { + "epoch": 7.77, + "learning_rate": 5.410333333333334e-06, + "loss": 3.4128, + "step": 83800 + }, + { + "epoch": 7.78, + "learning_rate": 5.377e-06, + "loss": 3.4057, + "step": 83900 + }, + { + "epoch": 7.79, + "learning_rate": 5.343666666666667e-06, + "loss": 3.403, + "step": 84000 + }, + { + "epoch": 7.79, + "eval_loss": 3.5601158142089844, + "eval_runtime": 23.7948, + "eval_samples_per_second": 105.989, + "eval_steps_per_second": 6.64, + "step": 84000 + }, + { + "epoch": 7.8, + "learning_rate": 5.310333333333334e-06, + "loss": 3.4139, + "step": 84100 + }, + { + "epoch": 7.8, + "learning_rate": 5.2770000000000005e-06, + "loss": 3.4101, + "step": 84200 + }, + { + "epoch": 7.81, + "learning_rate": 5.243666666666667e-06, + "loss": 3.3735, + "step": 84300 + }, + { + "epoch": 7.82, + "learning_rate": 5.210333333333333e-06, + "loss": 3.4077, + "step": 84400 + }, + { + "epoch": 7.83, + "learning_rate": 5.177000000000001e-06, + "loss": 3.3819, + "step": 84500 + }, + { + "epoch": 7.83, + "eval_loss": 3.5579638481140137, + "eval_runtime": 23.8048, + "eval_samples_per_second": 105.945, + "eval_steps_per_second": 6.637, + "step": 84500 + }, + { + "epoch": 7.84, + "learning_rate": 5.143666666666667e-06, + "loss": 3.4202, + "step": 84600 + }, + { + "epoch": 7.85, + "learning_rate": 5.110333333333333e-06, + "loss": 3.4047, + "step": 84700 + }, + { + "epoch": 7.86, + "learning_rate": 5.077000000000001e-06, + "loss": 3.4063, + "step": 84800 + }, + { + "epoch": 7.87, + "learning_rate": 5.043666666666667e-06, + "loss": 3.4038, + "step": 84900 + }, + { + "epoch": 7.88, + "learning_rate": 5.0103333333333335e-06, + "loss": 3.4182, + "step": 85000 + }, + { + "epoch": 7.88, + "eval_loss": 3.5569992065429688, + "eval_runtime": 23.7728, + "eval_samples_per_second": 106.088, + "eval_steps_per_second": 6.646, + "step": 85000 + }, + { + "epoch": 7.89, + "learning_rate": 4.977e-06, + "loss": 3.3997, + "step": 85100 + }, + { + "epoch": 7.9, + "learning_rate": 4.943666666666667e-06, + "loss": 3.381, + "step": 85200 + }, + { + "epoch": 7.91, + "learning_rate": 4.910333333333334e-06, + "loss": 3.4129, + "step": 85300 + }, + { + "epoch": 7.92, + "learning_rate": 4.877e-06, + "loss": 3.4063, + "step": 85400 + }, + { + "epoch": 7.93, + "learning_rate": 4.8436666666666665e-06, + "loss": 3.4099, + "step": 85500 + }, + { + "epoch": 7.93, + "eval_loss": 3.556957483291626, + "eval_runtime": 23.7791, + "eval_samples_per_second": 106.06, + "eval_steps_per_second": 6.644, + "step": 85500 + }, + { + "epoch": 7.93, + "learning_rate": 4.810333333333334e-06, + "loss": 3.4177, + "step": 85600 + }, + { + "epoch": 7.94, + "learning_rate": 4.777e-06, + "loss": 3.3911, + "step": 85700 + }, + { + "epoch": 7.95, + "learning_rate": 4.743666666666667e-06, + "loss": 3.4014, + "step": 85800 + }, + { + "epoch": 7.96, + "learning_rate": 4.710333333333333e-06, + "loss": 3.4199, + "step": 85900 + }, + { + "epoch": 7.97, + "learning_rate": 4.677e-06, + "loss": 3.3845, + "step": 86000 + }, + { + "epoch": 7.97, + "eval_loss": 3.558166980743408, + "eval_runtime": 23.7635, + "eval_samples_per_second": 106.129, + "eval_steps_per_second": 6.649, + "step": 86000 + }, + { + "epoch": 7.98, + "learning_rate": 4.643666666666667e-06, + "loss": 3.3794, + "step": 86100 + }, + { + "epoch": 7.99, + "learning_rate": 4.610333333333333e-06, + "loss": 3.4015, + "step": 86200 + }, + { + "epoch": 8.0, + "learning_rate": 4.5769999999999995e-06, + "loss": 3.3849, + "step": 86300 + }, + { + "epoch": 8.01, + "learning_rate": 4.543666666666667e-06, + "loss": 3.4057, + "step": 86400 + }, + { + "epoch": 8.02, + "learning_rate": 4.510333333333333e-06, + "loss": 3.411, + "step": 86500 + }, + { + "epoch": 8.02, + "eval_loss": 3.5609631538391113, + "eval_runtime": 23.7818, + "eval_samples_per_second": 106.047, + "eval_steps_per_second": 6.644, + "step": 86500 + }, + { + "epoch": 8.03, + "learning_rate": 4.477e-06, + "loss": 3.3687, + "step": 86600 + }, + { + "epoch": 8.04, + "learning_rate": 4.443666666666666e-06, + "loss": 3.4203, + "step": 86700 + }, + { + "epoch": 8.05, + "learning_rate": 4.410333333333333e-06, + "loss": 3.3975, + "step": 86800 + }, + { + "epoch": 8.06, + "learning_rate": 4.377e-06, + "loss": 3.3786, + "step": 86900 + }, + { + "epoch": 8.06, + "learning_rate": 4.343666666666667e-06, + "loss": 3.3952, + "step": 87000 + }, + { + "epoch": 8.06, + "eval_loss": 3.558793544769287, + "eval_runtime": 23.7833, + "eval_samples_per_second": 106.041, + "eval_steps_per_second": 6.643, + "step": 87000 + }, + { + "epoch": 8.07, + "learning_rate": 4.3103333333333335e-06, + "loss": 3.4223, + "step": 87100 + }, + { + "epoch": 8.08, + "learning_rate": 4.277000000000001e-06, + "loss": 3.3969, + "step": 87200 + }, + { + "epoch": 8.09, + "learning_rate": 4.243666666666667e-06, + "loss": 3.4057, + "step": 87300 + }, + { + "epoch": 8.1, + "learning_rate": 4.2103333333333336e-06, + "loss": 3.3814, + "step": 87400 + }, + { + "epoch": 8.11, + "learning_rate": 4.177e-06, + "loss": 3.4211, + "step": 87500 + }, + { + "epoch": 8.11, + "eval_loss": 3.558820962905884, + "eval_runtime": 23.7795, + "eval_samples_per_second": 106.058, + "eval_steps_per_second": 6.644, + "step": 87500 + }, + { + "epoch": 8.12, + "learning_rate": 4.143666666666667e-06, + "loss": 3.3888, + "step": 87600 + }, + { + "epoch": 8.13, + "learning_rate": 4.110333333333334e-06, + "loss": 3.3956, + "step": 87700 + }, + { + "epoch": 8.14, + "learning_rate": 4.077e-06, + "loss": 3.3754, + "step": 87800 + }, + { + "epoch": 8.15, + "learning_rate": 4.043666666666667e-06, + "loss": 3.3996, + "step": 87900 + }, + { + "epoch": 8.16, + "learning_rate": 4.010333333333334e-06, + "loss": 3.4171, + "step": 88000 + }, + { + "epoch": 8.16, + "eval_loss": 3.5569565296173096, + "eval_runtime": 23.7758, + "eval_samples_per_second": 106.074, + "eval_steps_per_second": 6.645, + "step": 88000 + }, + { + "epoch": 8.17, + "learning_rate": 3.977e-06, + "loss": 3.4081, + "step": 88100 + }, + { + "epoch": 8.18, + "learning_rate": 3.943666666666667e-06, + "loss": 3.3594, + "step": 88200 + }, + { + "epoch": 8.19, + "learning_rate": 3.910333333333334e-06, + "loss": 3.4095, + "step": 88300 + }, + { + "epoch": 8.19, + "learning_rate": 3.877e-06, + "loss": 3.4008, + "step": 88400 + }, + { + "epoch": 8.2, + "learning_rate": 3.843666666666667e-06, + "loss": 3.3825, + "step": 88500 + }, + { + "epoch": 8.2, + "eval_loss": 3.56074595451355, + "eval_runtime": 23.7071, + "eval_samples_per_second": 106.382, + "eval_steps_per_second": 6.665, + "step": 88500 + }, + { + "epoch": 8.21, + "learning_rate": 3.810333333333333e-06, + "loss": 3.3902, + "step": 88600 + }, + { + "epoch": 8.22, + "learning_rate": 3.7770000000000004e-06, + "loss": 3.3889, + "step": 88700 + }, + { + "epoch": 8.23, + "learning_rate": 3.743666666666667e-06, + "loss": 3.3736, + "step": 88800 + }, + { + "epoch": 8.24, + "learning_rate": 3.7103333333333333e-06, + "loss": 3.3951, + "step": 88900 + }, + { + "epoch": 8.25, + "learning_rate": 3.677e-06, + "loss": 3.3807, + "step": 89000 + }, + { + "epoch": 8.25, + "eval_loss": 3.5578765869140625, + "eval_runtime": 23.792, + "eval_samples_per_second": 106.002, + "eval_steps_per_second": 6.641, + "step": 89000 + }, + { + "epoch": 8.26, + "learning_rate": 3.644e-06, + "loss": 3.3969, + "step": 89100 + }, + { + "epoch": 8.27, + "learning_rate": 3.6106666666666666e-06, + "loss": 3.3845, + "step": 89200 + }, + { + "epoch": 8.28, + "learning_rate": 3.5773333333333334e-06, + "loss": 3.3756, + "step": 89300 + }, + { + "epoch": 8.29, + "learning_rate": 3.5440000000000003e-06, + "loss": 3.3856, + "step": 89400 + }, + { + "epoch": 8.3, + "learning_rate": 3.5106666666666667e-06, + "loss": 3.3842, + "step": 89500 + }, + { + "epoch": 8.3, + "eval_loss": 3.558271646499634, + "eval_runtime": 23.788, + "eval_samples_per_second": 106.02, + "eval_steps_per_second": 6.642, + "step": 89500 + }, + { + "epoch": 8.31, + "learning_rate": 3.4773333333333336e-06, + "loss": 3.3895, + "step": 89600 + }, + { + "epoch": 8.31, + "learning_rate": 3.444e-06, + "loss": 3.4004, + "step": 89700 + }, + { + "epoch": 8.32, + "learning_rate": 3.410666666666667e-06, + "loss": 3.3814, + "step": 89800 + }, + { + "epoch": 8.33, + "learning_rate": 3.3773333333333332e-06, + "loss": 3.3887, + "step": 89900 + }, + { + "epoch": 8.34, + "learning_rate": 3.344e-06, + "loss": 3.3809, + "step": 90000 + }, + { + "epoch": 8.34, + "eval_loss": 3.5595808029174805, + "eval_runtime": 23.7777, + "eval_samples_per_second": 106.066, + "eval_steps_per_second": 6.645, + "step": 90000 + }, + { + "epoch": 8.35, + "learning_rate": 3.3106666666666665e-06, + "loss": 3.3975, + "step": 90100 + }, + { + "epoch": 8.36, + "learning_rate": 3.2773333333333338e-06, + "loss": 3.4037, + "step": 90200 + }, + { + "epoch": 8.37, + "learning_rate": 3.244e-06, + "loss": 3.4035, + "step": 90300 + }, + { + "epoch": 8.38, + "learning_rate": 3.210666666666667e-06, + "loss": 3.4036, + "step": 90400 + }, + { + "epoch": 8.39, + "learning_rate": 3.1773333333333335e-06, + "loss": 3.4033, + "step": 90500 + }, + { + "epoch": 8.39, + "eval_loss": 3.55897855758667, + "eval_runtime": 23.7836, + "eval_samples_per_second": 106.039, + "eval_steps_per_second": 6.643, + "step": 90500 + }, + { + "epoch": 8.4, + "learning_rate": 3.1440000000000003e-06, + "loss": 3.4154, + "step": 90600 + }, + { + "epoch": 8.41, + "learning_rate": 3.1106666666666667e-06, + "loss": 3.3953, + "step": 90700 + }, + { + "epoch": 8.42, + "learning_rate": 3.0773333333333336e-06, + "loss": 3.3951, + "step": 90800 + }, + { + "epoch": 8.43, + "learning_rate": 3.044e-06, + "loss": 3.3892, + "step": 90900 + }, + { + "epoch": 8.44, + "learning_rate": 3.010666666666667e-06, + "loss": 3.4156, + "step": 91000 + }, + { + "epoch": 8.44, + "eval_loss": 3.5576674938201904, + "eval_runtime": 23.7762, + "eval_samples_per_second": 106.072, + "eval_steps_per_second": 6.645, + "step": 91000 + }, + { + "epoch": 8.44, + "learning_rate": 2.9773333333333333e-06, + "loss": 3.4082, + "step": 91100 + }, + { + "epoch": 8.45, + "learning_rate": 2.9443333333333337e-06, + "loss": 3.3943, + "step": 91200 + }, + { + "epoch": 8.46, + "learning_rate": 2.911e-06, + "loss": 3.3896, + "step": 91300 + }, + { + "epoch": 8.47, + "learning_rate": 2.877666666666667e-06, + "loss": 3.415, + "step": 91400 + }, + { + "epoch": 8.48, + "learning_rate": 2.8443333333333334e-06, + "loss": 3.3927, + "step": 91500 + }, + { + "epoch": 8.48, + "eval_loss": 3.558485269546509, + "eval_runtime": 23.7779, + "eval_samples_per_second": 106.065, + "eval_steps_per_second": 6.645, + "step": 91500 + }, + { + "epoch": 8.49, + "learning_rate": 2.8110000000000003e-06, + "loss": 3.3887, + "step": 91600 + }, + { + "epoch": 8.5, + "learning_rate": 2.7776666666666667e-06, + "loss": 3.3867, + "step": 91700 + }, + { + "epoch": 8.51, + "learning_rate": 2.7443333333333335e-06, + "loss": 3.3735, + "step": 91800 + }, + { + "epoch": 8.52, + "learning_rate": 2.711e-06, + "loss": 3.3878, + "step": 91900 + }, + { + "epoch": 8.53, + "learning_rate": 2.677666666666667e-06, + "loss": 3.4041, + "step": 92000 + }, + { + "epoch": 8.53, + "eval_loss": 3.5595757961273193, + "eval_runtime": 23.7777, + "eval_samples_per_second": 106.066, + "eval_steps_per_second": 6.645, + "step": 92000 + }, + { + "epoch": 8.54, + "learning_rate": 2.6443333333333332e-06, + "loss": 3.4155, + "step": 92100 + }, + { + "epoch": 8.55, + "learning_rate": 2.611e-06, + "loss": 3.379, + "step": 92200 + }, + { + "epoch": 8.56, + "learning_rate": 2.5776666666666665e-06, + "loss": 3.3913, + "step": 92300 + }, + { + "epoch": 8.57, + "learning_rate": 2.5443333333333333e-06, + "loss": 3.3749, + "step": 92400 + }, + { + "epoch": 8.57, + "learning_rate": 2.5109999999999998e-06, + "loss": 3.4006, + "step": 92500 + }, + { + "epoch": 8.57, + "eval_loss": 3.5600292682647705, + "eval_runtime": 23.781, + "eval_samples_per_second": 106.051, + "eval_steps_per_second": 6.644, + "step": 92500 + }, + { + "epoch": 8.58, + "learning_rate": 2.477666666666667e-06, + "loss": 3.4026, + "step": 92600 + }, + { + "epoch": 8.59, + "learning_rate": 2.4443333333333334e-06, + "loss": 3.4121, + "step": 92700 + }, + { + "epoch": 8.6, + "learning_rate": 2.4110000000000003e-06, + "loss": 3.3956, + "step": 92800 + }, + { + "epoch": 8.61, + "learning_rate": 2.3776666666666667e-06, + "loss": 3.4069, + "step": 92900 + }, + { + "epoch": 8.62, + "learning_rate": 2.3446666666666668e-06, + "loss": 3.4007, + "step": 93000 + }, + { + "epoch": 8.62, + "eval_loss": 3.5577518939971924, + "eval_runtime": 23.7878, + "eval_samples_per_second": 106.021, + "eval_steps_per_second": 6.642, + "step": 93000 + }, + { + "epoch": 8.63, + "learning_rate": 2.311333333333333e-06, + "loss": 3.3887, + "step": 93100 + }, + { + "epoch": 8.64, + "learning_rate": 2.278e-06, + "loss": 3.3724, + "step": 93200 + }, + { + "epoch": 8.65, + "learning_rate": 2.2446666666666665e-06, + "loss": 3.3821, + "step": 93300 + }, + { + "epoch": 8.66, + "learning_rate": 2.2113333333333337e-06, + "loss": 3.4197, + "step": 93400 + }, + { + "epoch": 8.67, + "learning_rate": 2.178e-06, + "loss": 3.4047, + "step": 93500 + }, + { + "epoch": 8.67, + "eval_loss": 3.557192087173462, + "eval_runtime": 23.7805, + "eval_samples_per_second": 106.053, + "eval_steps_per_second": 6.644, + "step": 93500 + }, + { + "epoch": 8.68, + "learning_rate": 2.144666666666667e-06, + "loss": 3.4009, + "step": 93600 + }, + { + "epoch": 8.69, + "learning_rate": 2.1113333333333334e-06, + "loss": 3.3817, + "step": 93700 + }, + { + "epoch": 8.69, + "learning_rate": 2.0780000000000003e-06, + "loss": 3.371, + "step": 93800 + }, + { + "epoch": 8.7, + "learning_rate": 2.0446666666666667e-06, + "loss": 3.3711, + "step": 93900 + }, + { + "epoch": 8.71, + "learning_rate": 2.0113333333333335e-06, + "loss": 3.3904, + "step": 94000 + }, + { + "epoch": 8.71, + "eval_loss": 3.557133913040161, + "eval_runtime": 23.7814, + "eval_samples_per_second": 106.049, + "eval_steps_per_second": 6.644, + "step": 94000 + }, + { + "epoch": 8.72, + "learning_rate": 1.978e-06, + "loss": 3.4108, + "step": 94100 + }, + { + "epoch": 8.73, + "learning_rate": 1.944666666666667e-06, + "loss": 3.4221, + "step": 94200 + }, + { + "epoch": 8.74, + "learning_rate": 1.9113333333333332e-06, + "loss": 3.3815, + "step": 94300 + }, + { + "epoch": 8.75, + "learning_rate": 1.878e-06, + "loss": 3.3923, + "step": 94400 + }, + { + "epoch": 8.76, + "learning_rate": 1.8446666666666667e-06, + "loss": 3.3888, + "step": 94500 + }, + { + "epoch": 8.76, + "eval_loss": 3.558104991912842, + "eval_runtime": 23.7899, + "eval_samples_per_second": 106.011, + "eval_steps_per_second": 6.641, + "step": 94500 + }, + { + "epoch": 8.77, + "learning_rate": 1.8113333333333335e-06, + "loss": 3.4093, + "step": 94600 + }, + { + "epoch": 8.78, + "learning_rate": 1.7780000000000002e-06, + "loss": 3.4228, + "step": 94700 + }, + { + "epoch": 8.79, + "learning_rate": 1.7446666666666668e-06, + "loss": 3.4168, + "step": 94800 + }, + { + "epoch": 8.8, + "learning_rate": 1.7113333333333334e-06, + "loss": 3.3995, + "step": 94900 + }, + { + "epoch": 8.81, + "learning_rate": 1.678e-06, + "loss": 3.3876, + "step": 95000 + }, + { + "epoch": 8.81, + "eval_loss": 3.55720591545105, + "eval_runtime": 23.7866, + "eval_samples_per_second": 106.026, + "eval_steps_per_second": 6.642, + "step": 95000 + }, + { + "epoch": 8.82, + "learning_rate": 1.6446666666666667e-06, + "loss": 3.3805, + "step": 95100 + }, + { + "epoch": 8.82, + "learning_rate": 1.6113333333333333e-06, + "loss": 3.3955, + "step": 95200 + }, + { + "epoch": 8.83, + "learning_rate": 1.578e-06, + "loss": 3.391, + "step": 95300 + }, + { + "epoch": 8.84, + "learning_rate": 1.5446666666666668e-06, + "loss": 3.401, + "step": 95400 + }, + { + "epoch": 8.85, + "learning_rate": 1.5113333333333334e-06, + "loss": 3.3872, + "step": 95500 + }, + { + "epoch": 8.85, + "eval_loss": 3.5574591159820557, + "eval_runtime": 23.7708, + "eval_samples_per_second": 106.096, + "eval_steps_per_second": 6.647, + "step": 95500 + }, + { + "epoch": 8.86, + "learning_rate": 1.478e-06, + "loss": 3.4022, + "step": 95600 + }, + { + "epoch": 8.87, + "learning_rate": 1.4446666666666667e-06, + "loss": 3.407, + "step": 95700 + }, + { + "epoch": 8.88, + "learning_rate": 1.4113333333333333e-06, + "loss": 3.3884, + "step": 95800 + }, + { + "epoch": 8.89, + "learning_rate": 1.378e-06, + "loss": 3.3934, + "step": 95900 + }, + { + "epoch": 8.9, + "learning_rate": 1.3446666666666666e-06, + "loss": 3.3753, + "step": 96000 + }, + { + "epoch": 8.9, + "eval_loss": 3.5576891899108887, + "eval_runtime": 23.8007, + "eval_samples_per_second": 105.963, + "eval_steps_per_second": 6.638, + "step": 96000 + }, + { + "epoch": 8.91, + "learning_rate": 1.3113333333333332e-06, + "loss": 3.39, + "step": 96100 + }, + { + "epoch": 8.92, + "learning_rate": 1.278e-06, + "loss": 3.418, + "step": 96200 + }, + { + "epoch": 8.93, + "learning_rate": 1.2446666666666667e-06, + "loss": 3.3898, + "step": 96300 + }, + { + "epoch": 8.94, + "learning_rate": 1.2113333333333334e-06, + "loss": 3.3955, + "step": 96400 + }, + { + "epoch": 8.95, + "learning_rate": 1.178e-06, + "loss": 3.3961, + "step": 96500 + }, + { + "epoch": 8.95, + "eval_loss": 3.5567543506622314, + "eval_runtime": 23.775, + "eval_samples_per_second": 106.078, + "eval_steps_per_second": 6.646, + "step": 96500 + }, + { + "epoch": 8.95, + "learning_rate": 1.1446666666666666e-06, + "loss": 3.3927, + "step": 96600 + }, + { + "epoch": 8.96, + "learning_rate": 1.1113333333333333e-06, + "loss": 3.3778, + "step": 96700 + }, + { + "epoch": 8.97, + "learning_rate": 1.0779999999999999e-06, + "loss": 3.4238, + "step": 96800 + }, + { + "epoch": 8.98, + "learning_rate": 1.0446666666666665e-06, + "loss": 3.3893, + "step": 96900 + }, + { + "epoch": 8.99, + "learning_rate": 1.0113333333333334e-06, + "loss": 3.4131, + "step": 97000 + }, + { + "epoch": 8.99, + "eval_loss": 3.55790376663208, + "eval_runtime": 23.774, + "eval_samples_per_second": 106.082, + "eval_steps_per_second": 6.646, + "step": 97000 + }, + { + "epoch": 9.0, + "learning_rate": 9.783333333333334e-07, + "loss": 3.4256, + "step": 97100 + }, + { + "epoch": 9.01, + "learning_rate": 9.450000000000001e-07, + "loss": 3.3537, + "step": 97200 + }, + { + "epoch": 9.02, + "learning_rate": 9.116666666666667e-07, + "loss": 3.3696, + "step": 97300 + }, + { + "epoch": 9.03, + "learning_rate": 8.783333333333333e-07, + "loss": 3.3778, + "step": 97400 + }, + { + "epoch": 9.04, + "learning_rate": 8.453333333333334e-07, + "loss": 3.3647, + "step": 97500 + }, + { + "epoch": 9.04, + "eval_loss": 3.557291269302368, + "eval_runtime": 23.7854, + "eval_samples_per_second": 106.032, + "eval_steps_per_second": 6.643, + "step": 97500 + }, + { + "epoch": 9.05, + "learning_rate": 8.12e-07, + "loss": 3.3852, + "step": 97600 + }, + { + "epoch": 9.06, + "learning_rate": 7.786666666666667e-07, + "loss": 3.4056, + "step": 97700 + }, + { + "epoch": 9.07, + "learning_rate": 7.453333333333333e-07, + "loss": 3.3955, + "step": 97800 + }, + { + "epoch": 9.07, + "learning_rate": 7.12e-07, + "loss": 3.4157, + "step": 97900 + }, + { + "epoch": 9.08, + "learning_rate": 6.786666666666667e-07, + "loss": 3.3792, + "step": 98000 + }, + { + "epoch": 9.08, + "eval_loss": 3.557647466659546, + "eval_runtime": 23.7799, + "eval_samples_per_second": 106.056, + "eval_steps_per_second": 6.644, + "step": 98000 + }, + { + "epoch": 9.09, + "learning_rate": 6.453333333333334e-07, + "loss": 3.3977, + "step": 98100 + }, + { + "epoch": 9.1, + "learning_rate": 6.12e-07, + "loss": 3.3818, + "step": 98200 + }, + { + "epoch": 9.11, + "learning_rate": 5.786666666666668e-07, + "loss": 3.3923, + "step": 98300 + }, + { + "epoch": 9.12, + "learning_rate": 5.453333333333334e-07, + "loss": 3.3855, + "step": 98400 + }, + { + "epoch": 9.13, + "learning_rate": 5.123333333333334e-07, + "loss": 3.3755, + "step": 98500 + }, + { + "epoch": 9.13, + "eval_loss": 3.55747389793396, + "eval_runtime": 23.7811, + "eval_samples_per_second": 106.051, + "eval_steps_per_second": 6.644, + "step": 98500 + }, + { + "epoch": 9.14, + "learning_rate": 4.79e-07, + "loss": 3.378, + "step": 98600 + }, + { + "epoch": 9.15, + "learning_rate": 4.456666666666667e-07, + "loss": 3.3893, + "step": 98700 + }, + { + "epoch": 9.16, + "learning_rate": 4.1233333333333336e-07, + "loss": 3.387, + "step": 98800 + }, + { + "epoch": 9.17, + "learning_rate": 3.79e-07, + "loss": 3.3539, + "step": 98900 + }, + { + "epoch": 9.18, + "learning_rate": 3.456666666666667e-07, + "loss": 3.3981, + "step": 99000 + }, + { + "epoch": 9.18, + "eval_loss": 3.557342290878296, + "eval_runtime": 23.7655, + "eval_samples_per_second": 106.12, + "eval_steps_per_second": 6.648, + "step": 99000 + }, + { + "epoch": 9.19, + "learning_rate": 3.1266666666666663e-07, + "loss": 3.3796, + "step": 99100 + }, + { + "epoch": 9.2, + "learning_rate": 2.793333333333333e-07, + "loss": 3.3579, + "step": 99200 + }, + { + "epoch": 9.2, + "learning_rate": 2.46e-07, + "loss": 3.3742, + "step": 99300 + }, + { + "epoch": 9.21, + "learning_rate": 2.1266666666666667e-07, + "loss": 3.3831, + "step": 99400 + }, + { + "epoch": 9.22, + "learning_rate": 1.7933333333333335e-07, + "loss": 3.3914, + "step": 99500 + }, + { + "epoch": 9.22, + "eval_loss": 3.5572941303253174, + "eval_runtime": 23.7886, + "eval_samples_per_second": 106.017, + "eval_steps_per_second": 6.642, + "step": 99500 + }, + { + "epoch": 9.23, + "learning_rate": 1.46e-07, + "loss": 3.4, + "step": 99600 + }, + { + "epoch": 9.24, + "learning_rate": 1.1266666666666667e-07, + "loss": 3.3892, + "step": 99700 + }, + { + "epoch": 9.25, + "learning_rate": 7.933333333333334e-08, + "loss": 3.3746, + "step": 99800 + }, + { + "epoch": 9.26, + "learning_rate": 4.6e-08, + "loss": 3.3924, + "step": 99900 + }, + { + "epoch": 9.27, + "learning_rate": 1.2666666666666668e-08, + "loss": 3.4136, + "step": 100000 + }, + { + "epoch": 9.27, + "eval_loss": 3.557504177093506, + "eval_runtime": 23.7953, + "eval_samples_per_second": 105.987, + "eval_steps_per_second": 6.64, + "step": 100000 + }, + { + "epoch": 9.27, + "step": 100000, + "total_flos": 2.2851507373589135e+18, + "train_loss": 3.524176220359802, + "train_runtime": 67760.9425, + "train_samples_per_second": 23.612, + "train_steps_per_second": 1.476 + } + ], + "max_steps": 100000, + "num_train_epochs": 10, + "total_flos": 2.2851507373589135e+18, + "trial_name": null, + "trial_params": null +}