{ "best_metric": null, "best_model_checkpoint": null, "epoch": 400.0, "global_step": 15200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.26, "eval_loss": 1.7080078125, "eval_runtime": 3.7763, "eval_samples_per_second": 68.321, "eval_steps_per_second": 8.739, "step": 200 }, { "epoch": 10.53, "eval_loss": 1.7001953125, "eval_runtime": 3.7858, "eval_samples_per_second": 68.149, "eval_steps_per_second": 8.717, "step": 400 }, { "epoch": 13.16, "learning_rate": 5e-05, "loss": 1.671, "step": 500 }, { "epoch": 15.79, "eval_loss": 1.7138671875, "eval_runtime": 3.7865, "eval_samples_per_second": 68.137, "eval_steps_per_second": 8.715, "step": 600 }, { "epoch": 21.05, "eval_loss": 1.744140625, "eval_runtime": 3.7879, "eval_samples_per_second": 68.112, "eval_steps_per_second": 8.712, "step": 800 }, { "epoch": 26.32, "learning_rate": 5e-05, "loss": 1.4438, "step": 1000 }, { "epoch": 26.32, "eval_loss": 1.794921875, "eval_runtime": 3.7869, "eval_samples_per_second": 68.13, "eval_steps_per_second": 8.714, "step": 1000 }, { "epoch": 31.58, "eval_loss": 1.84375, "eval_runtime": 3.7879, "eval_samples_per_second": 68.112, "eval_steps_per_second": 8.712, "step": 1200 }, { "epoch": 36.84, "eval_loss": 1.896484375, "eval_runtime": 3.7905, "eval_samples_per_second": 68.066, "eval_steps_per_second": 8.706, "step": 1400 }, { "epoch": 39.47, "learning_rate": 5e-05, "loss": 1.2806, "step": 1500 }, { "epoch": 42.11, "eval_loss": 1.9619140625, "eval_runtime": 3.7916, "eval_samples_per_second": 68.044, "eval_steps_per_second": 8.703, "step": 1600 }, { "epoch": 47.37, "eval_loss": 2.01953125, "eval_runtime": 3.7897, "eval_samples_per_second": 68.08, "eval_steps_per_second": 8.708, "step": 1800 }, { "epoch": 52.63, "learning_rate": 5e-05, "loss": 1.1433, "step": 2000 }, { "epoch": 52.63, "eval_loss": 2.068359375, "eval_runtime": 3.7891, "eval_samples_per_second": 68.091, "eval_steps_per_second": 8.709, "step": 2000 }, { "epoch": 57.89, "eval_loss": 2.1171875, "eval_runtime": 3.7902, "eval_samples_per_second": 68.069, "eval_steps_per_second": 8.707, "step": 2200 }, { "epoch": 63.16, "eval_loss": 2.1953125, "eval_runtime": 3.7898, "eval_samples_per_second": 68.077, "eval_steps_per_second": 8.708, "step": 2400 }, { "epoch": 65.79, "learning_rate": 5e-05, "loss": 1.027, "step": 2500 }, { "epoch": 68.42, "eval_loss": 2.25, "eval_runtime": 3.7881, "eval_samples_per_second": 68.108, "eval_steps_per_second": 8.711, "step": 2600 }, { "epoch": 73.68, "eval_loss": 2.291015625, "eval_runtime": 3.7876, "eval_samples_per_second": 68.118, "eval_steps_per_second": 8.713, "step": 2800 }, { "epoch": 78.95, "learning_rate": 5e-05, "loss": 0.9216, "step": 3000 }, { "epoch": 78.95, "eval_loss": 2.34765625, "eval_runtime": 3.7885, "eval_samples_per_second": 68.101, "eval_steps_per_second": 8.711, "step": 3000 }, { "epoch": 84.21, "eval_loss": 2.423828125, "eval_runtime": 3.7907, "eval_samples_per_second": 68.062, "eval_steps_per_second": 8.706, "step": 3200 }, { "epoch": 89.47, "eval_loss": 2.482421875, "eval_runtime": 3.7903, "eval_samples_per_second": 68.068, "eval_steps_per_second": 8.706, "step": 3400 }, { "epoch": 92.11, "learning_rate": 5e-05, "loss": 0.8209, "step": 3500 }, { "epoch": 94.74, "eval_loss": 2.529296875, "eval_runtime": 3.7863, "eval_samples_per_second": 68.14, "eval_steps_per_second": 8.716, "step": 3600 }, { "epoch": 100.0, "eval_loss": 2.5859375, "eval_runtime": 3.785, "eval_samples_per_second": 68.164, "eval_steps_per_second": 8.719, "step": 3800 }, { "epoch": 105.26, "learning_rate": 5e-05, "loss": 0.7231, "step": 4000 }, { "epoch": 105.26, "eval_loss": 2.6640625, "eval_runtime": 3.7856, "eval_samples_per_second": 68.153, "eval_steps_per_second": 8.717, "step": 4000 }, { "epoch": 110.53, "eval_loss": 2.703125, "eval_runtime": 3.7862, "eval_samples_per_second": 68.142, "eval_steps_per_second": 8.716, "step": 4200 }, { "epoch": 115.79, "eval_loss": 2.78515625, "eval_runtime": 3.7894, "eval_samples_per_second": 68.084, "eval_steps_per_second": 8.708, "step": 4400 }, { "epoch": 118.42, "learning_rate": 5e-05, "loss": 0.6281, "step": 4500 }, { "epoch": 121.05, "eval_loss": 2.84375, "eval_runtime": 3.7883, "eval_samples_per_second": 68.105, "eval_steps_per_second": 8.711, "step": 4600 }, { "epoch": 126.32, "eval_loss": 2.921875, "eval_runtime": 3.79, "eval_samples_per_second": 68.074, "eval_steps_per_second": 8.707, "step": 4800 }, { "epoch": 131.58, "learning_rate": 5e-05, "loss": 0.5384, "step": 5000 }, { "epoch": 131.58, "eval_loss": 2.994140625, "eval_runtime": 3.7895, "eval_samples_per_second": 68.082, "eval_steps_per_second": 8.708, "step": 5000 }, { "epoch": 136.84, "eval_loss": 3.048828125, "eval_runtime": 3.7912, "eval_samples_per_second": 68.053, "eval_steps_per_second": 8.704, "step": 5200 }, { "epoch": 142.11, "eval_loss": 3.107421875, "eval_runtime": 3.7872, "eval_samples_per_second": 68.123, "eval_steps_per_second": 8.713, "step": 5400 }, { "epoch": 144.74, "learning_rate": 5e-05, "loss": 0.4574, "step": 5500 }, { "epoch": 147.37, "eval_loss": 3.169921875, "eval_runtime": 3.7886, "eval_samples_per_second": 68.1, "eval_steps_per_second": 8.71, "step": 5600 }, { "epoch": 152.63, "eval_loss": 3.2265625, "eval_runtime": 3.7924, "eval_samples_per_second": 68.03, "eval_steps_per_second": 8.702, "step": 5800 }, { "epoch": 157.89, "learning_rate": 5e-05, "loss": 0.3848, "step": 6000 }, { "epoch": 157.89, "eval_loss": 3.291015625, "eval_runtime": 3.7859, "eval_samples_per_second": 68.148, "eval_steps_per_second": 8.717, "step": 6000 }, { "epoch": 163.16, "eval_loss": 3.376953125, "eval_runtime": 3.7886, "eval_samples_per_second": 68.099, "eval_steps_per_second": 8.71, "step": 6200 }, { "epoch": 168.42, "eval_loss": 3.408203125, "eval_runtime": 3.7885, "eval_samples_per_second": 68.1, "eval_steps_per_second": 8.71, "step": 6400 }, { "epoch": 171.05, "learning_rate": 5e-05, "loss": 0.3224, "step": 6500 }, { "epoch": 173.68, "eval_loss": 3.4765625, "eval_runtime": 3.7922, "eval_samples_per_second": 68.034, "eval_steps_per_second": 8.702, "step": 6600 }, { "epoch": 178.95, "eval_loss": 3.529296875, "eval_runtime": 3.7898, "eval_samples_per_second": 68.077, "eval_steps_per_second": 8.708, "step": 6800 }, { "epoch": 184.21, "learning_rate": 5e-05, "loss": 0.2697, "step": 7000 }, { "epoch": 184.21, "eval_loss": 3.591796875, "eval_runtime": 3.7854, "eval_samples_per_second": 68.157, "eval_steps_per_second": 8.718, "step": 7000 }, { "epoch": 189.47, "eval_loss": 3.634765625, "eval_runtime": 3.7918, "eval_samples_per_second": 68.041, "eval_steps_per_second": 8.703, "step": 7200 }, { "epoch": 194.74, "eval_loss": 3.68359375, "eval_runtime": 3.7891, "eval_samples_per_second": 68.09, "eval_steps_per_second": 8.709, "step": 7400 }, { "epoch": 197.37, "learning_rate": 5e-05, "loss": 0.2258, "step": 7500 }, { "epoch": 200.0, "eval_loss": 3.7265625, "eval_runtime": 3.7895, "eval_samples_per_second": 68.083, "eval_steps_per_second": 8.708, "step": 7600 }, { "epoch": 205.26, "eval_loss": 3.79296875, "eval_runtime": 3.7901, "eval_samples_per_second": 68.073, "eval_steps_per_second": 8.707, "step": 7800 }, { "epoch": 210.53, "learning_rate": 5e-05, "loss": 0.1893, "step": 8000 }, { "epoch": 210.53, "eval_loss": 3.828125, "eval_runtime": 3.7891, "eval_samples_per_second": 68.09, "eval_steps_per_second": 8.709, "step": 8000 }, { "epoch": 215.79, "eval_loss": 3.880859375, "eval_runtime": 3.7907, "eval_samples_per_second": 68.062, "eval_steps_per_second": 8.706, "step": 8200 }, { "epoch": 221.05, "eval_loss": 3.923828125, "eval_runtime": 3.7895, "eval_samples_per_second": 68.082, "eval_steps_per_second": 8.708, "step": 8400 }, { "epoch": 223.68, "learning_rate": 5e-05, "loss": 0.1602, "step": 8500 }, { "epoch": 226.32, "eval_loss": 3.974609375, "eval_runtime": 3.7894, "eval_samples_per_second": 68.084, "eval_steps_per_second": 8.708, "step": 8600 }, { "epoch": 231.58, "eval_loss": 4.00390625, "eval_runtime": 3.7923, "eval_samples_per_second": 68.032, "eval_steps_per_second": 8.702, "step": 8800 }, { "epoch": 236.84, "learning_rate": 5e-05, "loss": 0.137, "step": 9000 }, { "epoch": 236.84, "eval_loss": 4.046875, "eval_runtime": 3.7922, "eval_samples_per_second": 68.034, "eval_steps_per_second": 8.702, "step": 9000 }, { "epoch": 242.11, "eval_loss": 4.07421875, "eval_runtime": 3.7901, "eval_samples_per_second": 68.072, "eval_steps_per_second": 8.707, "step": 9200 }, { "epoch": 247.37, "eval_loss": 4.12109375, "eval_runtime": 3.7896, "eval_samples_per_second": 68.08, "eval_steps_per_second": 8.708, "step": 9400 }, { "epoch": 250.0, "learning_rate": 5e-05, "loss": 0.1179, "step": 9500 }, { "epoch": 252.63, "eval_loss": 4.15625, "eval_runtime": 3.7912, "eval_samples_per_second": 68.053, "eval_steps_per_second": 8.704, "step": 9600 }, { "epoch": 257.89, "eval_loss": 4.203125, "eval_runtime": 3.7923, "eval_samples_per_second": 68.032, "eval_steps_per_second": 8.702, "step": 9800 }, { "epoch": 263.16, "learning_rate": 5e-05, "loss": 0.1024, "step": 10000 }, { "epoch": 263.16, "eval_loss": 4.234375, "eval_runtime": 3.7852, "eval_samples_per_second": 68.159, "eval_steps_per_second": 8.718, "step": 10000 }, { "epoch": 268.42, "eval_loss": 4.2734375, "eval_runtime": 3.7869, "eval_samples_per_second": 68.129, "eval_steps_per_second": 8.714, "step": 10200 }, { "epoch": 273.68, "eval_loss": 4.3046875, "eval_runtime": 3.7892, "eval_samples_per_second": 68.088, "eval_steps_per_second": 8.709, "step": 10400 }, { "epoch": 276.32, "learning_rate": 5e-05, "loss": 0.0901, "step": 10500 }, { "epoch": 278.95, "eval_loss": 4.3125, "eval_runtime": 3.7869, "eval_samples_per_second": 68.129, "eval_steps_per_second": 8.714, "step": 10600 }, { "epoch": 284.21, "eval_loss": 4.375, "eval_runtime": 3.7872, "eval_samples_per_second": 68.125, "eval_steps_per_second": 8.714, "step": 10800 }, { "epoch": 289.47, "learning_rate": 5e-05, "loss": 0.0796, "step": 11000 }, { "epoch": 289.47, "eval_loss": 4.390625, "eval_runtime": 3.7843, "eval_samples_per_second": 68.177, "eval_steps_per_second": 8.72, "step": 11000 }, { "epoch": 294.74, "eval_loss": 4.4375, "eval_runtime": 3.7881, "eval_samples_per_second": 68.107, "eval_steps_per_second": 8.711, "step": 11200 }, { "epoch": 300.0, "eval_loss": 4.453125, "eval_runtime": 3.7869, "eval_samples_per_second": 68.129, "eval_steps_per_second": 8.714, "step": 11400 }, { "epoch": 302.63, "learning_rate": 5e-05, "loss": 0.0706, "step": 11500 }, { "epoch": 305.26, "eval_loss": 4.5078125, "eval_runtime": 3.7854, "eval_samples_per_second": 68.156, "eval_steps_per_second": 8.718, "step": 11600 }, { "epoch": 310.53, "eval_loss": 4.515625, "eval_runtime": 3.787, "eval_samples_per_second": 68.128, "eval_steps_per_second": 8.714, "step": 11800 }, { "epoch": 315.79, "learning_rate": 5e-05, "loss": 0.0631, "step": 12000 }, { "epoch": 315.79, "eval_loss": 4.53515625, "eval_runtime": 3.7837, "eval_samples_per_second": 68.187, "eval_steps_per_second": 8.722, "step": 12000 }, { "epoch": 321.05, "eval_loss": 4.5859375, "eval_runtime": 3.7869, "eval_samples_per_second": 68.13, "eval_steps_per_second": 8.714, "step": 12200 }, { "epoch": 326.32, "eval_loss": 4.609375, "eval_runtime": 3.788, "eval_samples_per_second": 68.11, "eval_steps_per_second": 8.712, "step": 12400 }, { "epoch": 328.95, "learning_rate": 5e-05, "loss": 0.0573, "step": 12500 }, { "epoch": 331.58, "eval_loss": 4.63671875, "eval_runtime": 3.7891, "eval_samples_per_second": 68.09, "eval_steps_per_second": 8.709, "step": 12600 }, { "epoch": 336.84, "eval_loss": 4.63671875, "eval_runtime": 3.7855, "eval_samples_per_second": 68.154, "eval_steps_per_second": 8.717, "step": 12800 }, { "epoch": 342.11, "learning_rate": 5e-05, "loss": 0.0521, "step": 13000 }, { "epoch": 342.11, "eval_loss": 4.6640625, "eval_runtime": 3.7838, "eval_samples_per_second": 68.185, "eval_steps_per_second": 8.721, "step": 13000 }, { "epoch": 347.37, "eval_loss": 4.70703125, "eval_runtime": 3.7834, "eval_samples_per_second": 68.192, "eval_steps_per_second": 8.722, "step": 13200 }, { "epoch": 352.63, "eval_loss": 4.69921875, "eval_runtime": 3.789, "eval_samples_per_second": 68.092, "eval_steps_per_second": 8.709, "step": 13400 }, { "epoch": 355.26, "learning_rate": 5e-05, "loss": 0.0475, "step": 13500 }, { "epoch": 357.89, "eval_loss": 4.75390625, "eval_runtime": 3.7901, "eval_samples_per_second": 68.073, "eval_steps_per_second": 8.707, "step": 13600 }, { "epoch": 363.16, "eval_loss": 4.765625, "eval_runtime": 3.7877, "eval_samples_per_second": 68.116, "eval_steps_per_second": 8.712, "step": 13800 }, { "epoch": 368.42, "learning_rate": 5e-05, "loss": 0.0437, "step": 14000 }, { "epoch": 368.42, "eval_loss": 4.80078125, "eval_runtime": 3.7858, "eval_samples_per_second": 68.15, "eval_steps_per_second": 8.717, "step": 14000 }, { "epoch": 373.68, "eval_loss": 4.83203125, "eval_runtime": 3.7888, "eval_samples_per_second": 68.095, "eval_steps_per_second": 8.71, "step": 14200 }, { "epoch": 378.95, "eval_loss": 4.8515625, "eval_runtime": 3.7901, "eval_samples_per_second": 68.073, "eval_steps_per_second": 8.707, "step": 14400 }, { "epoch": 381.58, "learning_rate": 5e-05, "loss": 0.0399, "step": 14500 }, { "epoch": 384.21, "eval_loss": 4.86328125, "eval_runtime": 3.7938, "eval_samples_per_second": 68.006, "eval_steps_per_second": 8.698, "step": 14600 }, { "epoch": 389.47, "eval_loss": 4.89453125, "eval_runtime": 3.7887, "eval_samples_per_second": 68.098, "eval_steps_per_second": 8.71, "step": 14800 }, { "epoch": 394.74, "learning_rate": 5e-05, "loss": 0.0367, "step": 15000 }, { "epoch": 394.74, "eval_loss": 4.90625, "eval_runtime": 3.7864, "eval_samples_per_second": 68.138, "eval_steps_per_second": 8.715, "step": 15000 }, { "epoch": 400.0, "eval_loss": 4.94140625, "eval_runtime": 3.791, "eval_samples_per_second": 68.057, "eval_steps_per_second": 8.705, "step": 15200 }, { "epoch": 400.0, "step": 15200, "total_flos": 1.2604727427386573e+17, "train_loss": 0.4328666927939967, "train_runtime": 22235.2418, "train_samples_per_second": 10.848, "train_steps_per_second": 0.684 } ], "max_steps": 15200, "num_train_epochs": 400, "total_flos": 1.2604727427386573e+17, "trial_name": null, "trial_params": null }