{ "best_metric": 1.3889119625091553, "best_model_checkpoint": "/home/co-ou1/rds/hpc-work/models/longt5_xl_govreport_4096/checkpoint-272", "epoch": 4.985108820160367, "eval_steps": 500, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 0.001, "loss": 2.6414, "step": 2 }, { "epoch": 0.06, "learning_rate": 0.001, "loss": 2.4747, "step": 4 }, { "epoch": 0.09, "learning_rate": 0.001, "loss": 1.8708, "step": 6 }, { "epoch": 0.12, "learning_rate": 0.001, "loss": 2.4373, "step": 8 }, { "epoch": 0.15, "learning_rate": 0.001, "loss": 2.2961, "step": 10 }, { "epoch": 0.18, "learning_rate": 0.001, "loss": 2.1959, "step": 12 }, { "epoch": 0.21, "learning_rate": 0.001, "loss": 1.9882, "step": 14 }, { "epoch": 0.23, "learning_rate": 0.001, "loss": 2.0164, "step": 16 }, { "epoch": 0.26, "learning_rate": 0.001, "loss": 1.8772, "step": 18 }, { "epoch": 0.29, "learning_rate": 0.001, "loss": 1.8516, "step": 20 }, { "epoch": 0.32, "learning_rate": 0.001, "loss": 1.8587, "step": 22 }, { "epoch": 0.35, "learning_rate": 0.001, "loss": 1.729, "step": 24 }, { "epoch": 0.38, "learning_rate": 0.001, "loss": 1.7265, "step": 26 }, { "epoch": 0.41, "learning_rate": 0.001, "loss": 1.7689, "step": 28 }, { "epoch": 0.44, "learning_rate": 0.001, "loss": 1.7899, "step": 30 }, { "epoch": 0.47, "learning_rate": 0.001, "loss": 1.6755, "step": 32 }, { "epoch": 0.5, "learning_rate": 0.001, "loss": 1.6701, "step": 34 }, { "epoch": 0.53, "learning_rate": 0.001, "loss": 1.6626, "step": 36 }, { "epoch": 0.56, "learning_rate": 0.001, "loss": 1.708, "step": 38 }, { "epoch": 0.59, "learning_rate": 0.001, "loss": 1.7019, "step": 40 }, { "epoch": 0.62, "learning_rate": 0.001, "loss": 1.6705, "step": 42 }, { "epoch": 0.65, "learning_rate": 0.001, "loss": 1.6039, "step": 44 }, { "epoch": 0.67, "learning_rate": 0.001, "loss": 1.6041, "step": 46 }, { "epoch": 0.7, "learning_rate": 0.001, "loss": 1.5829, "step": 48 }, { "epoch": 0.73, "learning_rate": 0.001, "loss": 1.6283, "step": 50 }, { "epoch": 0.76, "learning_rate": 0.001, "loss": 1.7403, "step": 52 }, { "epoch": 0.79, "learning_rate": 0.001, "loss": 1.6786, "step": 54 }, { "epoch": 0.82, "learning_rate": 0.001, "loss": 1.6499, "step": 56 }, { "epoch": 0.85, "learning_rate": 0.001, "loss": 1.5641, "step": 58 }, { "epoch": 0.88, "learning_rate": 0.001, "loss": 1.5824, "step": 60 }, { "epoch": 0.91, "learning_rate": 0.001, "loss": 1.6462, "step": 62 }, { "epoch": 0.94, "learning_rate": 0.001, "loss": 1.5787, "step": 64 }, { "epoch": 0.97, "learning_rate": 0.001, "loss": 1.6053, "step": 66 }, { "epoch": 1.0, "learning_rate": 0.001, "loss": 1.6043, "step": 68 }, { "epoch": 1.0, "eval_loss": 1.4757200479507446, "eval_runtime": 333.5276, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.729, "step": 68 }, { "epoch": 1.03, "learning_rate": 0.001, "loss": 1.549, "step": 70 }, { "epoch": 1.06, "learning_rate": 0.001, "loss": 1.4379, "step": 72 }, { "epoch": 1.08, "learning_rate": 0.001, "loss": 1.5224, "step": 74 }, { "epoch": 1.11, "learning_rate": 0.001, "loss": 1.5106, "step": 76 }, { "epoch": 1.14, "learning_rate": 0.001, "loss": 1.4786, "step": 78 }, { "epoch": 1.17, "learning_rate": 0.001, "loss": 1.5101, "step": 80 }, { "epoch": 1.2, "learning_rate": 0.001, "loss": 1.4835, "step": 82 }, { "epoch": 1.23, "learning_rate": 0.001, "loss": 1.4749, "step": 84 }, { "epoch": 1.26, "learning_rate": 0.001, "loss": 1.488, "step": 86 }, { "epoch": 1.29, "learning_rate": 0.001, "loss": 1.4604, "step": 88 }, { "epoch": 1.32, "learning_rate": 0.001, "loss": 1.5358, "step": 90 }, { "epoch": 1.35, "learning_rate": 0.001, "loss": 1.5396, "step": 92 }, { "epoch": 1.38, "learning_rate": 0.001, "loss": 1.5071, "step": 94 }, { "epoch": 1.41, "learning_rate": 0.001, "loss": 1.5219, "step": 96 }, { "epoch": 1.44, "learning_rate": 0.001, "loss": 1.4589, "step": 98 }, { "epoch": 1.47, "learning_rate": 0.001, "loss": 1.4392, "step": 100 }, { "epoch": 1.5, "learning_rate": 0.001, "loss": 1.485, "step": 102 }, { "epoch": 1.52, "learning_rate": 0.001, "loss": 1.4617, "step": 104 }, { "epoch": 1.55, "learning_rate": 0.001, "loss": 1.4659, "step": 106 }, { "epoch": 1.58, "learning_rate": 0.001, "loss": 1.4908, "step": 108 }, { "epoch": 1.61, "learning_rate": 0.001, "loss": 1.4929, "step": 110 }, { "epoch": 1.64, "learning_rate": 0.001, "loss": 1.4538, "step": 112 }, { "epoch": 1.67, "learning_rate": 0.001, "loss": 1.4815, "step": 114 }, { "epoch": 1.7, "learning_rate": 0.001, "loss": 1.4525, "step": 116 }, { "epoch": 1.73, "learning_rate": 0.001, "loss": 1.4664, "step": 118 }, { "epoch": 1.76, "learning_rate": 0.001, "loss": 1.4498, "step": 120 }, { "epoch": 1.79, "learning_rate": 0.001, "loss": 1.5247, "step": 122 }, { "epoch": 1.82, "learning_rate": 0.001, "loss": 1.5033, "step": 124 }, { "epoch": 1.85, "learning_rate": 0.001, "loss": 1.4434, "step": 126 }, { "epoch": 1.88, "learning_rate": 0.001, "loss": 1.4806, "step": 128 }, { "epoch": 1.91, "learning_rate": 0.001, "loss": 1.4551, "step": 130 }, { "epoch": 1.94, "learning_rate": 0.001, "loss": 1.4359, "step": 132 }, { "epoch": 1.96, "learning_rate": 0.001, "loss": 1.4424, "step": 134 }, { "epoch": 1.99, "learning_rate": 0.001, "loss": 1.4471, "step": 136 }, { "epoch": 1.99, "eval_loss": 1.401979923248291, "eval_runtime": 333.5413, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.729, "step": 136 }, { "epoch": 2.02, "learning_rate": 0.001, "loss": 1.3704, "step": 138 }, { "epoch": 2.05, "learning_rate": 0.001, "loss": 1.3246, "step": 140 }, { "epoch": 2.08, "learning_rate": 0.001, "loss": 1.3714, "step": 142 }, { "epoch": 2.11, "learning_rate": 0.001, "loss": 1.3134, "step": 144 }, { "epoch": 2.14, "learning_rate": 0.001, "loss": 1.2998, "step": 146 }, { "epoch": 2.17, "learning_rate": 0.001, "loss": 1.3643, "step": 148 }, { "epoch": 2.2, "learning_rate": 0.001, "loss": 1.306, "step": 150 }, { "epoch": 2.23, "learning_rate": 0.001, "loss": 1.3152, "step": 152 }, { "epoch": 2.26, "learning_rate": 0.001, "loss": 1.3086, "step": 154 }, { "epoch": 2.29, "learning_rate": 0.001, "loss": 1.3033, "step": 156 }, { "epoch": 2.32, "learning_rate": 0.001, "loss": 1.343, "step": 158 }, { "epoch": 2.35, "learning_rate": 0.001, "loss": 1.3399, "step": 160 }, { "epoch": 2.38, "learning_rate": 0.001, "loss": 1.3134, "step": 162 }, { "epoch": 2.4, "learning_rate": 0.001, "loss": 1.3472, "step": 164 }, { "epoch": 2.43, "learning_rate": 0.001, "loss": 1.3261, "step": 166 }, { "epoch": 2.46, "learning_rate": 0.001, "loss": 1.3162, "step": 168 }, { "epoch": 2.49, "learning_rate": 0.001, "loss": 1.3194, "step": 170 }, { "epoch": 2.52, "learning_rate": 0.001, "loss": 1.3236, "step": 172 }, { "epoch": 2.55, "learning_rate": 0.001, "loss": 1.3033, "step": 174 }, { "epoch": 2.58, "learning_rate": 0.001, "loss": 1.3212, "step": 176 }, { "epoch": 2.61, "learning_rate": 0.001, "loss": 1.3433, "step": 178 }, { "epoch": 2.64, "learning_rate": 0.001, "loss": 1.34, "step": 180 }, { "epoch": 2.67, "learning_rate": 0.001, "loss": 1.3306, "step": 182 }, { "epoch": 2.7, "learning_rate": 0.001, "loss": 1.3377, "step": 184 }, { "epoch": 2.73, "learning_rate": 0.001, "loss": 1.3048, "step": 186 }, { "epoch": 2.76, "learning_rate": 0.001, "loss": 1.3186, "step": 188 }, { "epoch": 2.79, "learning_rate": 0.001, "loss": 1.3405, "step": 190 }, { "epoch": 2.82, "learning_rate": 0.001, "loss": 1.3266, "step": 192 }, { "epoch": 2.84, "learning_rate": 0.001, "loss": 1.3182, "step": 194 }, { "epoch": 2.87, "learning_rate": 0.001, "loss": 1.3755, "step": 196 }, { "epoch": 2.9, "learning_rate": 0.001, "loss": 1.3422, "step": 198 }, { "epoch": 2.93, "learning_rate": 0.001, "loss": 1.3247, "step": 200 }, { "epoch": 2.96, "learning_rate": 0.001, "loss": 1.312, "step": 202 }, { "epoch": 2.99, "learning_rate": 0.001, "loss": 1.315, "step": 204 }, { "epoch": 2.99, "eval_loss": 1.3935041427612305, "eval_runtime": 333.777, "eval_samples_per_second": 2.912, "eval_steps_per_second": 0.728, "step": 204 }, { "epoch": 3.02, "learning_rate": 0.001, "loss": 1.2264, "step": 206 }, { "epoch": 3.05, "learning_rate": 0.001, "loss": 1.1606, "step": 208 }, { "epoch": 3.08, "learning_rate": 0.001, "loss": 1.1569, "step": 210 }, { "epoch": 3.11, "learning_rate": 0.001, "loss": 1.1434, "step": 212 }, { "epoch": 3.14, "learning_rate": 0.001, "loss": 1.1736, "step": 214 }, { "epoch": 3.17, "learning_rate": 0.001, "loss": 1.17, "step": 216 }, { "epoch": 3.2, "learning_rate": 0.001, "loss": 1.1802, "step": 218 }, { "epoch": 3.23, "learning_rate": 0.001, "loss": 1.1856, "step": 220 }, { "epoch": 3.25, "learning_rate": 0.001, "loss": 1.1918, "step": 222 }, { "epoch": 3.28, "learning_rate": 0.001, "loss": 1.1751, "step": 224 }, { "epoch": 3.31, "learning_rate": 0.001, "loss": 1.1896, "step": 226 }, { "epoch": 3.34, "learning_rate": 0.001, "loss": 1.1837, "step": 228 }, { "epoch": 3.37, "learning_rate": 0.001, "loss": 1.2053, "step": 230 }, { "epoch": 3.4, "learning_rate": 0.001, "loss": 1.2031, "step": 232 }, { "epoch": 3.43, "learning_rate": 0.001, "loss": 1.1886, "step": 234 }, { "epoch": 3.46, "learning_rate": 0.001, "loss": 1.1769, "step": 236 }, { "epoch": 3.49, "learning_rate": 0.001, "loss": 1.1951, "step": 238 }, { "epoch": 3.52, "learning_rate": 0.001, "loss": 1.1943, "step": 240 }, { "epoch": 3.55, "learning_rate": 0.001, "loss": 1.1816, "step": 242 }, { "epoch": 3.58, "learning_rate": 0.001, "loss": 1.1763, "step": 244 }, { "epoch": 3.61, "learning_rate": 0.001, "loss": 1.2028, "step": 246 }, { "epoch": 3.64, "learning_rate": 0.001, "loss": 1.2199, "step": 248 }, { "epoch": 3.67, "learning_rate": 0.001, "loss": 1.1948, "step": 250 }, { "epoch": 3.69, "learning_rate": 0.001, "loss": 1.1859, "step": 252 }, { "epoch": 3.72, "learning_rate": 0.001, "loss": 1.1864, "step": 254 }, { "epoch": 3.75, "learning_rate": 0.001, "loss": 1.2012, "step": 256 }, { "epoch": 3.78, "learning_rate": 0.001, "loss": 1.2177, "step": 258 }, { "epoch": 3.81, "learning_rate": 0.001, "loss": 1.2156, "step": 260 }, { "epoch": 3.84, "learning_rate": 0.001, "loss": 1.204, "step": 262 }, { "epoch": 3.87, "learning_rate": 0.001, "loss": 1.2053, "step": 264 }, { "epoch": 3.9, "learning_rate": 0.001, "loss": 1.2138, "step": 266 }, { "epoch": 3.93, "learning_rate": 0.001, "loss": 1.1992, "step": 268 }, { "epoch": 3.96, "learning_rate": 0.001, "loss": 1.1903, "step": 270 }, { "epoch": 3.99, "learning_rate": 0.001, "loss": 1.2098, "step": 272 }, { "epoch": 3.99, "eval_loss": 1.3889119625091553, "eval_runtime": 333.5624, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.728, "step": 272 }, { "epoch": 4.02, "learning_rate": 0.001, "loss": 1.1135, "step": 274 }, { "epoch": 4.05, "learning_rate": 0.001, "loss": 1.0318, "step": 276 }, { "epoch": 4.08, "learning_rate": 0.001, "loss": 1.068, "step": 278 }, { "epoch": 4.11, "learning_rate": 0.001, "loss": 1.0386, "step": 280 }, { "epoch": 4.13, "learning_rate": 0.001, "loss": 1.0727, "step": 282 }, { "epoch": 4.16, "learning_rate": 0.001, "loss": 1.0592, "step": 284 }, { "epoch": 4.19, "learning_rate": 0.001, "loss": 1.0348, "step": 286 }, { "epoch": 4.22, "learning_rate": 0.001, "loss": 1.045, "step": 288 }, { "epoch": 4.25, "learning_rate": 0.001, "loss": 1.0535, "step": 290 }, { "epoch": 4.28, "learning_rate": 0.001, "loss": 1.0628, "step": 292 }, { "epoch": 4.31, "learning_rate": 0.001, "loss": 1.0699, "step": 294 }, { "epoch": 4.34, "learning_rate": 0.001, "loss": 1.0482, "step": 296 }, { "epoch": 4.37, "learning_rate": 0.001, "loss": 1.0703, "step": 298 }, { "epoch": 4.4, "learning_rate": 0.001, "loss": 1.0498, "step": 300 }, { "epoch": 4.43, "learning_rate": 0.001, "loss": 1.0775, "step": 302 }, { "epoch": 4.46, "learning_rate": 0.001, "loss": 1.0609, "step": 304 }, { "epoch": 4.49, "learning_rate": 0.001, "loss": 1.0985, "step": 306 }, { "epoch": 4.52, "learning_rate": 0.001, "loss": 1.081, "step": 308 }, { "epoch": 4.55, "learning_rate": 0.001, "loss": 1.0942, "step": 310 }, { "epoch": 4.57, "learning_rate": 0.001, "loss": 1.0789, "step": 312 }, { "epoch": 4.6, "learning_rate": 0.001, "loss": 1.0897, "step": 314 }, { "epoch": 4.63, "learning_rate": 0.001, "loss": 1.0869, "step": 316 }, { "epoch": 4.66, "learning_rate": 0.001, "loss": 1.1108, "step": 318 }, { "epoch": 4.69, "learning_rate": 0.001, "loss": 1.0687, "step": 320 }, { "epoch": 4.72, "learning_rate": 0.001, "loss": 1.1109, "step": 322 }, { "epoch": 4.75, "learning_rate": 0.001, "loss": 1.0838, "step": 324 }, { "epoch": 4.78, "learning_rate": 0.001, "loss": 1.0932, "step": 326 }, { "epoch": 4.81, "learning_rate": 0.001, "loss": 1.0923, "step": 328 }, { "epoch": 4.84, "learning_rate": 0.001, "loss": 1.0662, "step": 330 }, { "epoch": 4.87, "learning_rate": 0.001, "loss": 1.0727, "step": 332 }, { "epoch": 4.9, "learning_rate": 0.001, "loss": 1.0809, "step": 334 }, { "epoch": 4.93, "learning_rate": 0.001, "loss": 1.1087, "step": 336 }, { "epoch": 4.96, "learning_rate": 0.001, "loss": 1.0912, "step": 338 }, { "epoch": 4.99, "learning_rate": 0.001, "loss": 1.0888, "step": 340 }, { "epoch": 4.99, "eval_loss": 1.4388501644134521, "eval_runtime": 333.5483, "eval_samples_per_second": 2.914, "eval_steps_per_second": 0.729, "step": 340 }, { "epoch": 4.99, "step": 340, "total_flos": 5.95419516573578e+18, "train_loss": 1.3779389661901138, "train_runtime": 112099.6799, "train_samples_per_second": 0.779, "train_steps_per_second": 0.003 } ], "logging_steps": 2, "max_steps": 340, "num_train_epochs": 5, "save_steps": 500, "total_flos": 5.95419516573578e+18, "trial_name": null, "trial_params": null }