longt5_xl_sfd_bp_20 / trainer_state.json
learn3r's picture
End of training
297a3c4
{
"best_metric": 1.503156304359436,
"best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/summarization/longt5_xl_sfd_bp_20/checkpoint-57",
"epoch": 19.47826086956522,
"eval_steps": 500,
"global_step": 280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14,
"learning_rate": 0.001,
"loss": 2.9672,
"step": 2
},
{
"epoch": 0.28,
"learning_rate": 0.001,
"loss": 3.0162,
"step": 4
},
{
"epoch": 0.42,
"learning_rate": 0.001,
"loss": 3.1689,
"step": 6
},
{
"epoch": 0.56,
"learning_rate": 0.001,
"loss": 2.902,
"step": 8
},
{
"epoch": 0.7,
"learning_rate": 0.001,
"loss": 2.4891,
"step": 10
},
{
"epoch": 0.83,
"learning_rate": 0.001,
"loss": 2.8498,
"step": 12
},
{
"epoch": 0.97,
"learning_rate": 0.001,
"loss": 2.3973,
"step": 14
},
{
"epoch": 0.97,
"eval_gen_len": 511.0,
"eval_loss": 1.9073989391326904,
"eval_rouge1": 10.6164,
"eval_rouge2": 2.4585,
"eval_rougeL": 10.4856,
"eval_rougeLsum": 9.8193,
"eval_runtime": 1794.269,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 14
},
{
"epoch": 1.11,
"learning_rate": 0.001,
"loss": 2.1921,
"step": 16
},
{
"epoch": 1.25,
"learning_rate": 0.001,
"loss": 2.0091,
"step": 18
},
{
"epoch": 1.39,
"learning_rate": 0.001,
"loss": 1.8884,
"step": 20
},
{
"epoch": 1.53,
"learning_rate": 0.001,
"loss": 1.7955,
"step": 22
},
{
"epoch": 1.67,
"learning_rate": 0.001,
"loss": 1.7023,
"step": 24
},
{
"epoch": 1.81,
"learning_rate": 0.001,
"loss": 1.8178,
"step": 26
},
{
"epoch": 1.95,
"learning_rate": 0.001,
"loss": 1.9188,
"step": 28
},
{
"epoch": 1.95,
"eval_gen_len": 511.0,
"eval_loss": 1.7082059383392334,
"eval_rouge1": 17.4258,
"eval_rouge2": 4.2128,
"eval_rougeL": 16.5213,
"eval_rougeLsum": 15.8377,
"eval_runtime": 1795.1498,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 28
},
{
"epoch": 2.09,
"learning_rate": 0.001,
"loss": 1.6461,
"step": 30
},
{
"epoch": 2.23,
"learning_rate": 0.001,
"loss": 1.552,
"step": 32
},
{
"epoch": 2.37,
"learning_rate": 0.001,
"loss": 1.4914,
"step": 34
},
{
"epoch": 2.5,
"learning_rate": 0.001,
"loss": 1.457,
"step": 36
},
{
"epoch": 2.64,
"learning_rate": 0.001,
"loss": 1.4499,
"step": 38
},
{
"epoch": 2.78,
"learning_rate": 0.001,
"loss": 1.4868,
"step": 40
},
{
"epoch": 2.92,
"learning_rate": 0.001,
"loss": 1.4297,
"step": 42
},
{
"epoch": 2.99,
"eval_gen_len": 506.77448071216617,
"eval_loss": 1.5072847604751587,
"eval_rouge1": 18.6504,
"eval_rouge2": 5.4242,
"eval_rougeL": 17.2648,
"eval_rougeLsum": 17.0203,
"eval_runtime": 1796.521,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 43
},
{
"epoch": 3.06,
"learning_rate": 0.001,
"loss": 1.3757,
"step": 44
},
{
"epoch": 3.2,
"learning_rate": 0.001,
"loss": 1.2701,
"step": 46
},
{
"epoch": 3.34,
"learning_rate": 0.001,
"loss": 1.2826,
"step": 48
},
{
"epoch": 3.48,
"learning_rate": 0.001,
"loss": 1.2945,
"step": 50
},
{
"epoch": 3.62,
"learning_rate": 0.001,
"loss": 1.2963,
"step": 52
},
{
"epoch": 3.76,
"learning_rate": 0.001,
"loss": 1.2933,
"step": 54
},
{
"epoch": 3.9,
"learning_rate": 0.001,
"loss": 1.2759,
"step": 56
},
{
"epoch": 3.97,
"eval_gen_len": 497.8783382789318,
"eval_loss": 1.503156304359436,
"eval_rouge1": 22.11,
"eval_rouge2": 7.544,
"eval_rougeL": 19.7035,
"eval_rougeLsum": 20.2813,
"eval_runtime": 1797.0423,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 57
},
{
"epoch": 4.03,
"learning_rate": 0.001,
"loss": 1.2446,
"step": 58
},
{
"epoch": 4.17,
"learning_rate": 0.001,
"loss": 1.0992,
"step": 60
},
{
"epoch": 4.31,
"learning_rate": 0.001,
"loss": 1.0771,
"step": 62
},
{
"epoch": 4.45,
"learning_rate": 0.001,
"loss": 1.1254,
"step": 64
},
{
"epoch": 4.59,
"learning_rate": 0.001,
"loss": 1.1644,
"step": 66
},
{
"epoch": 4.73,
"learning_rate": 0.001,
"loss": 1.1485,
"step": 68
},
{
"epoch": 4.87,
"learning_rate": 0.001,
"loss": 1.1421,
"step": 70
},
{
"epoch": 4.94,
"eval_gen_len": 503.60237388724033,
"eval_loss": 1.5462485551834106,
"eval_rouge1": 20.6049,
"eval_rouge2": 6.7146,
"eval_rougeL": 18.5084,
"eval_rougeLsum": 19.0876,
"eval_runtime": 1793.6239,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 71
},
{
"epoch": 5.01,
"learning_rate": 0.001,
"loss": 1.1233,
"step": 72
},
{
"epoch": 5.15,
"learning_rate": 0.001,
"loss": 0.8919,
"step": 74
},
{
"epoch": 5.29,
"learning_rate": 0.001,
"loss": 0.9349,
"step": 76
},
{
"epoch": 5.43,
"learning_rate": 0.001,
"loss": 0.9363,
"step": 78
},
{
"epoch": 5.57,
"learning_rate": 0.001,
"loss": 0.9203,
"step": 80
},
{
"epoch": 5.7,
"learning_rate": 0.001,
"loss": 0.9429,
"step": 82
},
{
"epoch": 5.84,
"learning_rate": 0.001,
"loss": 0.9495,
"step": 84
},
{
"epoch": 5.98,
"learning_rate": 0.001,
"loss": 0.9605,
"step": 86
},
{
"epoch": 5.98,
"eval_gen_len": 510.27299703264094,
"eval_loss": 1.6233196258544922,
"eval_rouge1": 22.6777,
"eval_rouge2": 7.9362,
"eval_rougeL": 18.7936,
"eval_rougeLsum": 21.41,
"eval_runtime": 1793.8682,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 86
},
{
"epoch": 6.12,
"learning_rate": 0.001,
"loss": 0.7747,
"step": 88
},
{
"epoch": 6.26,
"learning_rate": 0.001,
"loss": 0.7664,
"step": 90
},
{
"epoch": 6.4,
"learning_rate": 0.001,
"loss": 0.7998,
"step": 92
},
{
"epoch": 6.54,
"learning_rate": 0.001,
"loss": 0.7715,
"step": 94
},
{
"epoch": 6.68,
"learning_rate": 0.001,
"loss": 0.8038,
"step": 96
},
{
"epoch": 6.82,
"learning_rate": 0.001,
"loss": 0.8059,
"step": 98
},
{
"epoch": 6.96,
"learning_rate": 0.001,
"loss": 0.8082,
"step": 100
},
{
"epoch": 6.96,
"eval_gen_len": 511.0,
"eval_loss": 1.7575491666793823,
"eval_rouge1": 26.5338,
"eval_rouge2": 9.9474,
"eval_rougeL": 20.3789,
"eval_rougeLsum": 25.0767,
"eval_runtime": 1794.6298,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 100
},
{
"epoch": 7.1,
"learning_rate": 0.001,
"loss": 0.6708,
"step": 102
},
{
"epoch": 7.23,
"learning_rate": 0.001,
"loss": 0.6186,
"step": 104
},
{
"epoch": 7.37,
"learning_rate": 0.001,
"loss": 0.6101,
"step": 106
},
{
"epoch": 7.51,
"learning_rate": 0.001,
"loss": 0.6328,
"step": 108
},
{
"epoch": 7.65,
"learning_rate": 0.001,
"loss": 0.6529,
"step": 110
},
{
"epoch": 7.79,
"learning_rate": 0.001,
"loss": 0.6312,
"step": 112
},
{
"epoch": 7.93,
"learning_rate": 0.001,
"loss": 0.664,
"step": 114
},
{
"epoch": 8.0,
"eval_gen_len": 329.7151335311573,
"eval_loss": 1.7701919078826904,
"eval_rouge1": 35.1918,
"eval_rouge2": 13.7223,
"eval_rougeL": 26.1763,
"eval_rougeLsum": 33.3997,
"eval_runtime": 1795.1755,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 115
},
{
"epoch": 8.07,
"learning_rate": 0.001,
"loss": 0.6177,
"step": 116
},
{
"epoch": 8.21,
"learning_rate": 0.001,
"loss": 0.5241,
"step": 118
},
{
"epoch": 8.35,
"learning_rate": 0.001,
"loss": 0.5173,
"step": 120
},
{
"epoch": 8.49,
"learning_rate": 0.001,
"loss": 0.5241,
"step": 122
},
{
"epoch": 8.63,
"learning_rate": 0.001,
"loss": 0.5546,
"step": 124
},
{
"epoch": 8.77,
"learning_rate": 0.001,
"loss": 0.5401,
"step": 126
},
{
"epoch": 8.9,
"learning_rate": 0.001,
"loss": 0.5471,
"step": 128
},
{
"epoch": 8.97,
"eval_gen_len": 506.82789317507417,
"eval_loss": 1.938284993171692,
"eval_rouge1": 27.0414,
"eval_rouge2": 10.4166,
"eval_rougeL": 20.1803,
"eval_rougeLsum": 25.6283,
"eval_runtime": 1797.9806,
"eval_samples_per_second": 0.187,
"eval_steps_per_second": 0.024,
"step": 129
},
{
"epoch": 9.04,
"learning_rate": 0.001,
"loss": 0.526,
"step": 130
},
{
"epoch": 9.18,
"learning_rate": 0.001,
"loss": 0.409,
"step": 132
},
{
"epoch": 9.32,
"learning_rate": 0.001,
"loss": 0.4163,
"step": 134
},
{
"epoch": 9.46,
"learning_rate": 0.001,
"loss": 0.4304,
"step": 136
},
{
"epoch": 9.6,
"learning_rate": 0.001,
"loss": 0.4512,
"step": 138
},
{
"epoch": 9.74,
"learning_rate": 0.001,
"loss": 0.4396,
"step": 140
},
{
"epoch": 9.88,
"learning_rate": 0.001,
"loss": 0.4349,
"step": 142
},
{
"epoch": 9.95,
"eval_gen_len": 454.7032640949555,
"eval_loss": 1.9607620239257812,
"eval_rouge1": 29.5613,
"eval_rouge2": 11.7633,
"eval_rougeL": 22.7176,
"eval_rougeLsum": 27.9563,
"eval_runtime": 1794.1152,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 143
},
{
"epoch": 10.02,
"learning_rate": 0.001,
"loss": 0.4077,
"step": 144
},
{
"epoch": 10.16,
"learning_rate": 0.001,
"loss": 0.3439,
"step": 146
},
{
"epoch": 10.3,
"learning_rate": 0.001,
"loss": 0.3503,
"step": 148
},
{
"epoch": 10.43,
"learning_rate": 0.001,
"loss": 0.3572,
"step": 150
},
{
"epoch": 10.57,
"learning_rate": 0.001,
"loss": 0.3643,
"step": 152
},
{
"epoch": 10.71,
"learning_rate": 0.001,
"loss": 0.3516,
"step": 154
},
{
"epoch": 10.85,
"learning_rate": 0.001,
"loss": 0.377,
"step": 156
},
{
"epoch": 10.99,
"learning_rate": 0.001,
"loss": 0.4338,
"step": 158
},
{
"epoch": 10.99,
"eval_gen_len": 493.3234421364985,
"eval_loss": 2.1197292804718018,
"eval_rouge1": 31.2004,
"eval_rouge2": 12.8569,
"eval_rougeL": 22.1282,
"eval_rougeLsum": 29.8827,
"eval_runtime": 1793.5921,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 158
},
{
"epoch": 11.13,
"learning_rate": 0.001,
"loss": 0.2715,
"step": 160
},
{
"epoch": 11.27,
"learning_rate": 0.001,
"loss": 0.2391,
"step": 162
},
{
"epoch": 11.41,
"learning_rate": 0.001,
"loss": 0.2958,
"step": 164
},
{
"epoch": 11.55,
"learning_rate": 0.001,
"loss": 0.3101,
"step": 166
},
{
"epoch": 11.69,
"learning_rate": 0.001,
"loss": 0.3417,
"step": 168
},
{
"epoch": 11.83,
"learning_rate": 0.001,
"loss": 0.3292,
"step": 170
},
{
"epoch": 11.97,
"learning_rate": 0.001,
"loss": 0.2887,
"step": 172
},
{
"epoch": 11.97,
"eval_gen_len": 381.35905044510383,
"eval_loss": 2.1204545497894287,
"eval_rouge1": 34.9566,
"eval_rouge2": 13.8574,
"eval_rougeL": 25.1764,
"eval_rougeLsum": 33.2914,
"eval_runtime": 1792.804,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 172
},
{
"epoch": 12.1,
"learning_rate": 0.001,
"loss": 0.2532,
"step": 174
},
{
"epoch": 12.24,
"learning_rate": 0.001,
"loss": 0.2565,
"step": 176
},
{
"epoch": 12.38,
"learning_rate": 0.001,
"loss": 0.2791,
"step": 178
},
{
"epoch": 12.52,
"learning_rate": 0.001,
"loss": 0.2803,
"step": 180
},
{
"epoch": 12.66,
"learning_rate": 0.001,
"loss": 0.3015,
"step": 182
},
{
"epoch": 12.8,
"learning_rate": 0.001,
"loss": 0.2764,
"step": 184
},
{
"epoch": 12.94,
"learning_rate": 0.001,
"loss": 0.2753,
"step": 186
},
{
"epoch": 12.94,
"eval_gen_len": 338.7240356083086,
"eval_loss": 2.429886817932129,
"eval_rouge1": 36.3877,
"eval_rouge2": 13.8584,
"eval_rougeL": 25.7829,
"eval_rougeLsum": 34.8601,
"eval_runtime": 1789.0093,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 186
},
{
"epoch": 13.08,
"learning_rate": 0.001,
"loss": 0.2563,
"step": 188
},
{
"epoch": 13.22,
"learning_rate": 0.001,
"loss": 0.2024,
"step": 190
},
{
"epoch": 13.36,
"learning_rate": 0.001,
"loss": 0.2252,
"step": 192
},
{
"epoch": 13.5,
"learning_rate": 0.001,
"loss": 0.2487,
"step": 194
},
{
"epoch": 13.63,
"learning_rate": 0.001,
"loss": 0.2086,
"step": 196
},
{
"epoch": 13.77,
"learning_rate": 0.001,
"loss": 0.2181,
"step": 198
},
{
"epoch": 13.91,
"learning_rate": 0.001,
"loss": 0.2114,
"step": 200
},
{
"epoch": 13.98,
"eval_gen_len": 302.48367952522256,
"eval_loss": 2.5798637866973877,
"eval_rouge1": 39.7535,
"eval_rouge2": 16.1209,
"eval_rougeL": 27.8512,
"eval_rougeLsum": 37.8553,
"eval_runtime": 1773.2158,
"eval_samples_per_second": 0.19,
"eval_steps_per_second": 0.024,
"step": 201
},
{
"epoch": 14.05,
"learning_rate": 0.001,
"loss": 0.1828,
"step": 202
},
{
"epoch": 14.19,
"learning_rate": 0.001,
"loss": 0.2025,
"step": 204
},
{
"epoch": 14.33,
"learning_rate": 0.001,
"loss": 0.1991,
"step": 206
},
{
"epoch": 14.47,
"learning_rate": 0.001,
"loss": 0.1844,
"step": 208
},
{
"epoch": 14.61,
"learning_rate": 0.001,
"loss": 0.1934,
"step": 210
},
{
"epoch": 14.75,
"learning_rate": 0.001,
"loss": 0.2,
"step": 212
},
{
"epoch": 14.89,
"learning_rate": 0.001,
"loss": 0.1805,
"step": 214
},
{
"epoch": 14.96,
"eval_gen_len": 442.9258160237389,
"eval_loss": 2.612277030944824,
"eval_rouge1": 33.3254,
"eval_rouge2": 13.0868,
"eval_rougeL": 23.3214,
"eval_rougeLsum": 31.7901,
"eval_runtime": 1794.4617,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 215
},
{
"epoch": 15.03,
"learning_rate": 0.001,
"loss": 0.1739,
"step": 216
},
{
"epoch": 15.17,
"learning_rate": 0.001,
"loss": 0.1504,
"step": 218
},
{
"epoch": 15.3,
"learning_rate": 0.001,
"loss": 0.1431,
"step": 220
},
{
"epoch": 15.44,
"learning_rate": 0.001,
"loss": 0.152,
"step": 222
},
{
"epoch": 15.58,
"learning_rate": 0.001,
"loss": 0.142,
"step": 224
},
{
"epoch": 15.72,
"learning_rate": 0.001,
"loss": 0.145,
"step": 226
},
{
"epoch": 15.86,
"learning_rate": 0.001,
"loss": 0.1476,
"step": 228
},
{
"epoch": 16.0,
"learning_rate": 0.001,
"loss": 0.1543,
"step": 230
},
{
"epoch": 16.0,
"eval_gen_len": 463.080118694362,
"eval_loss": 2.563481330871582,
"eval_rouge1": 31.7816,
"eval_rouge2": 13.1085,
"eval_rougeL": 22.9117,
"eval_rougeLsum": 30.2286,
"eval_runtime": 1793.3728,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 230
},
{
"epoch": 16.14,
"learning_rate": 0.001,
"loss": 0.1245,
"step": 232
},
{
"epoch": 16.28,
"learning_rate": 0.001,
"loss": 0.1204,
"step": 234
},
{
"epoch": 16.42,
"learning_rate": 0.001,
"loss": 0.2117,
"step": 236
},
{
"epoch": 16.56,
"learning_rate": 0.001,
"loss": 0.6894,
"step": 238
},
{
"epoch": 16.7,
"learning_rate": 0.001,
"loss": 0.35,
"step": 240
},
{
"epoch": 16.83,
"learning_rate": 0.001,
"loss": 0.8395,
"step": 242
},
{
"epoch": 16.97,
"learning_rate": 0.001,
"loss": 0.5166,
"step": 244
},
{
"epoch": 16.97,
"eval_gen_len": 511.0,
"eval_loss": 2.513441562652588,
"eval_rouge1": 30.3969,
"eval_rouge2": 12.1295,
"eval_rougeL": 21.6616,
"eval_rougeLsum": 28.7606,
"eval_runtime": 1793.8473,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 244
},
{
"epoch": 17.11,
"learning_rate": 0.001,
"loss": 0.202,
"step": 246
},
{
"epoch": 17.25,
"learning_rate": 0.001,
"loss": 0.1247,
"step": 248
},
{
"epoch": 17.39,
"learning_rate": 0.001,
"loss": 0.1368,
"step": 250
},
{
"epoch": 17.53,
"learning_rate": 0.001,
"loss": 0.1096,
"step": 252
},
{
"epoch": 17.67,
"learning_rate": 0.001,
"loss": 0.1066,
"step": 254
},
{
"epoch": 17.81,
"learning_rate": 0.001,
"loss": 0.1078,
"step": 256
},
{
"epoch": 17.95,
"learning_rate": 0.001,
"loss": 0.1117,
"step": 258
},
{
"epoch": 17.95,
"eval_gen_len": 431.11572700296733,
"eval_loss": 2.8108906745910645,
"eval_rouge1": 35.336,
"eval_rouge2": 14.9492,
"eval_rougeL": 24.1938,
"eval_rougeLsum": 33.822,
"eval_runtime": 1792.5735,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 258
},
{
"epoch": 18.09,
"learning_rate": 0.001,
"loss": 0.0934,
"step": 260
},
{
"epoch": 18.23,
"learning_rate": 0.001,
"loss": 0.0793,
"step": 262
},
{
"epoch": 18.37,
"learning_rate": 0.001,
"loss": 0.0887,
"step": 264
},
{
"epoch": 18.5,
"learning_rate": 0.001,
"loss": 0.103,
"step": 266
},
{
"epoch": 18.64,
"learning_rate": 0.001,
"loss": 0.0847,
"step": 268
},
{
"epoch": 18.78,
"learning_rate": 0.001,
"loss": 0.0869,
"step": 270
},
{
"epoch": 18.92,
"learning_rate": 0.001,
"loss": 0.0895,
"step": 272
},
{
"epoch": 18.99,
"eval_gen_len": 240.13649851632047,
"eval_loss": 2.7576751708984375,
"eval_rouge1": 41.0982,
"eval_rouge2": 16.3935,
"eval_rougeL": 28.1073,
"eval_rougeLsum": 39.1641,
"eval_runtime": 1702.7984,
"eval_samples_per_second": 0.198,
"eval_steps_per_second": 0.025,
"step": 273
},
{
"epoch": 19.06,
"learning_rate": 0.001,
"loss": 0.0884,
"step": 274
},
{
"epoch": 19.2,
"learning_rate": 0.001,
"loss": 0.0838,
"step": 276
},
{
"epoch": 19.34,
"learning_rate": 0.001,
"loss": 0.0731,
"step": 278
},
{
"epoch": 19.48,
"learning_rate": 0.001,
"loss": 0.0779,
"step": 280
},
{
"epoch": 19.48,
"eval_gen_len": 488.513353115727,
"eval_loss": 2.8927440643310547,
"eval_rouge1": 32.7788,
"eval_rouge2": 13.9352,
"eval_rougeL": 22.5175,
"eval_rougeLsum": 31.548,
"eval_runtime": 1795.5446,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 280
},
{
"epoch": 19.48,
"step": 280,
"total_flos": 4.895224157149471e+18,
"train_loss": 0.722327525381531,
"train_runtime": 104579.6075,
"train_samples_per_second": 0.702,
"train_steps_per_second": 0.003
}
],
"logging_steps": 2,
"max_steps": 280,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 4.895224157149471e+18,
"trial_name": null,
"trial_params": null
}