longt5_xl_sfd_20 / trainer_state.json
{
"best_metric": 2.2994935512542725,
"best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/longt5_xl_sfd_20/checkpoint-28",
"epoch": 19.47826086956522,
"eval_steps": 500,
"global_step": 280,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14,
"grad_norm": 8.068708419799805,
"learning_rate": 0.001,
"loss": 3.274,
"step": 2
},
{
"epoch": 0.28,
"grad_norm": 1.4994572401046753,
"learning_rate": 0.001,
"loss": 3.2963,
"step": 4
},
{
"epoch": 0.42,
"grad_norm": 1.0570803880691528,
"learning_rate": 0.001,
"loss": 3.3164,
"step": 6
},
{
"epoch": 0.56,
"grad_norm": 1.2446849346160889,
"learning_rate": 0.001,
"loss": 3.0866,
"step": 8
},
{
"epoch": 0.7,
"grad_norm": 0.721084713935852,
"learning_rate": 0.001,
"loss": 2.8976,
"step": 10
},
{
"epoch": 0.83,
"grad_norm": 1.2132383584976196,
"learning_rate": 0.001,
"loss": 2.8298,
"step": 12
},
{
"epoch": 0.97,
"grad_norm": 0.4689762592315674,
"learning_rate": 0.001,
"loss": 2.9377,
"step": 14
},
{
"epoch": 0.97,
"eval_loss": 2.7965147495269775,
"eval_runtime": 81.4763,
"eval_samples_per_second": 4.148,
"eval_steps_per_second": 0.528,
"step": 14
},
{
"epoch": 1.11,
"grad_norm": 0.42892181873321533,
"learning_rate": 0.001,
"loss": 2.741,
"step": 16
},
{
"epoch": 1.25,
"grad_norm": 0.4487678110599518,
"learning_rate": 0.001,
"loss": 2.4441,
"step": 18
},
{
"epoch": 1.39,
"grad_norm": 0.4653552770614624,
"learning_rate": 0.001,
"loss": 2.432,
"step": 20
},
{
"epoch": 1.53,
"grad_norm": 0.35275548696517944,
"learning_rate": 0.001,
"loss": 2.4016,
"step": 22
},
{
"epoch": 1.67,
"grad_norm": 0.43277695775032043,
"learning_rate": 0.001,
"loss": 2.391,
"step": 24
},
{
"epoch": 1.81,
"grad_norm": 0.3408297300338745,
"learning_rate": 0.001,
"loss": 2.3911,
"step": 26
},
{
"epoch": 1.95,
"grad_norm": 0.3205319344997406,
"learning_rate": 0.001,
"loss": 2.3247,
"step": 28
},
{
"epoch": 1.95,
"eval_loss": 2.2994935512542725,
"eval_runtime": 81.4693,
"eval_samples_per_second": 4.149,
"eval_steps_per_second": 0.528,
"step": 28
},
{
"epoch": 2.09,
"grad_norm": 0.4033512771129608,
"learning_rate": 0.001,
"loss": 2.0701,
"step": 30
},
{
"epoch": 2.23,
"grad_norm": 0.36825311183929443,
"learning_rate": 0.001,
"loss": 2.0968,
"step": 32
},
{
"epoch": 2.37,
"grad_norm": 0.5080482363700867,
"learning_rate": 0.001,
"loss": 2.0681,
"step": 34
},
{
"epoch": 2.5,
"grad_norm": 0.4196927845478058,
"learning_rate": 0.001,
"loss": 2.0914,
"step": 36
},
{
"epoch": 2.64,
"grad_norm": 0.3230506479740143,
"learning_rate": 0.001,
"loss": 2.0317,
"step": 38
},
{
"epoch": 2.78,
"grad_norm": 0.2733004689216614,
"learning_rate": 0.001,
"loss": 1.9723,
"step": 40
},
{
"epoch": 2.92,
"grad_norm": 0.2709517776966095,
"learning_rate": 0.001,
"loss": 1.9943,
"step": 42
},
{
"epoch": 2.99,
"eval_loss": 2.3308048248291016,
"eval_runtime": 81.5083,
"eval_samples_per_second": 4.147,
"eval_steps_per_second": 0.528,
"step": 43
},
{
"epoch": 3.06,
"grad_norm": 0.3230663537979126,
"learning_rate": 0.001,
"loss": 1.9093,
"step": 44
},
{
"epoch": 3.2,
"grad_norm": 0.3976946175098419,
"learning_rate": 0.001,
"loss": 1.7682,
"step": 46
},
{
"epoch": 3.34,
"grad_norm": 0.42008209228515625,
"learning_rate": 0.001,
"loss": 1.7119,
"step": 48
},
{
"epoch": 3.48,
"grad_norm": 0.31828513741493225,
"learning_rate": 0.001,
"loss": 1.7283,
"step": 50
},
{
"epoch": 3.62,
"grad_norm": 0.2448839396238327,
"learning_rate": 0.001,
"loss": 1.6905,
"step": 52
},
{
"epoch": 3.76,
"grad_norm": 0.25552132725715637,
"learning_rate": 0.001,
"loss": 1.6645,
"step": 54
},
{
"epoch": 3.9,
"grad_norm": 15.679224014282227,
"learning_rate": 0.001,
"loss": 1.7056,
"step": 56
},
{
"epoch": 3.97,
"eval_loss": 2.3368992805480957,
"eval_runtime": 81.4742,
"eval_samples_per_second": 4.149,
"eval_steps_per_second": 0.528,
"step": 57
},
{
"epoch": 4.03,
"grad_norm": 0.29547178745269775,
"learning_rate": 0.001,
"loss": 1.564,
"step": 58
},
{
"epoch": 4.17,
"grad_norm": 0.31610924005508423,
"learning_rate": 0.001,
"loss": 1.3607,
"step": 60
},
{
"epoch": 4.31,
"grad_norm": 0.32351407408714294,
"learning_rate": 0.001,
"loss": 1.4158,
"step": 62
},
{
"epoch": 4.45,
"grad_norm": 0.5101042985916138,
"learning_rate": 0.001,
"loss": 1.4694,
"step": 64
},
{
"epoch": 4.59,
"grad_norm": 0.41575145721435547,
"learning_rate": 0.001,
"loss": 1.4755,
"step": 66
},
{
"epoch": 4.73,
"grad_norm": 0.3269899785518646,
"learning_rate": 0.001,
"loss": 1.4268,
"step": 68
},
{
"epoch": 4.87,
"grad_norm": 0.4077276587486267,
"learning_rate": 0.001,
"loss": 1.4471,
"step": 70
},
{
"epoch": 4.94,
"eval_loss": 2.553175926208496,
"eval_runtime": 81.5149,
"eval_samples_per_second": 4.146,
"eval_steps_per_second": 0.528,
"step": 71
},
{
"epoch": 5.01,
"grad_norm": 0.37493908405303955,
"learning_rate": 0.001,
"loss": 1.4436,
"step": 72
},
{
"epoch": 5.15,
"grad_norm": 0.8398223519325256,
"learning_rate": 0.001,
"loss": 1.1776,
"step": 74
},
{
"epoch": 5.29,
"grad_norm": 0.621316134929657,
"learning_rate": 0.001,
"loss": 1.192,
"step": 76
},
{
"epoch": 5.43,
"grad_norm": 0.5988876819610596,
"learning_rate": 0.001,
"loss": 1.1561,
"step": 78
},
{
"epoch": 5.57,
"grad_norm": 0.561390221118927,
"learning_rate": 0.001,
"loss": 1.2129,
"step": 80
},
{
"epoch": 5.7,
"grad_norm": 0.32573097944259644,
"learning_rate": 0.001,
"loss": 1.19,
"step": 82
},
{
"epoch": 5.84,
"grad_norm": 0.3272527754306793,
"learning_rate": 0.001,
"loss": 1.1933,
"step": 84
},
{
"epoch": 5.98,
"grad_norm": 0.36107558012008667,
"learning_rate": 0.001,
"loss": 1.1932,
"step": 86
},
{
"epoch": 5.98,
"eval_loss": 2.696089744567871,
"eval_runtime": 81.5294,
"eval_samples_per_second": 4.146,
"eval_steps_per_second": 0.527,
"step": 86
},
{
"epoch": 6.12,
"grad_norm": 0.4167131781578064,
"learning_rate": 0.001,
"loss": 0.9285,
"step": 88
},
{
"epoch": 6.26,
"grad_norm": 0.38736867904663086,
"learning_rate": 0.001,
"loss": 0.9568,
"step": 90
},
{
"epoch": 6.4,
"grad_norm": 0.3212537169456482,
"learning_rate": 0.001,
"loss": 0.9538,
"step": 92
},
{
"epoch": 6.54,
"grad_norm": 0.2966512143611908,
"learning_rate": 0.001,
"loss": 0.9133,
"step": 94
},
{
"epoch": 6.68,
"grad_norm": 0.3149372935295105,
"learning_rate": 0.001,
"loss": 0.9374,
"step": 96
},
{
"epoch": 6.82,
"grad_norm": 0.3140605092048645,
"learning_rate": 0.001,
"loss": 0.9585,
"step": 98
},
{
"epoch": 6.96,
"grad_norm": 0.33559679985046387,
"learning_rate": 0.001,
"loss": 0.9199,
"step": 100
},
{
"epoch": 6.96,
"eval_loss": 2.645321846008301,
"eval_runtime": 81.5044,
"eval_samples_per_second": 4.147,
"eval_steps_per_second": 0.528,
"step": 100
},
{
"epoch": 7.1,
"grad_norm": 0.3616858720779419,
"learning_rate": 0.001,
"loss": 0.7517,
"step": 102
},
{
"epoch": 7.23,
"grad_norm": 0.4970415234565735,
"learning_rate": 0.001,
"loss": 0.7378,
"step": 104
},
{
"epoch": 7.37,
"grad_norm": 0.6654688119888306,
"learning_rate": 0.001,
"loss": 0.7864,
"step": 106
},
{
"epoch": 7.51,
"grad_norm": 0.51229327917099,
"learning_rate": 0.001,
"loss": 0.762,
"step": 108
},
{
"epoch": 7.65,
"grad_norm": 0.4524416923522949,
"learning_rate": 0.001,
"loss": 0.7342,
"step": 110
},
{
"epoch": 7.79,
"grad_norm": 0.48206427693367004,
"learning_rate": 0.001,
"loss": 0.7706,
"step": 112
},
{
"epoch": 7.93,
"grad_norm": 0.4534417688846588,
"learning_rate": 0.001,
"loss": 0.7571,
"step": 114
},
{
"epoch": 8.0,
"eval_loss": 3.0977730751037598,
"eval_runtime": 81.5778,
"eval_samples_per_second": 4.143,
"eval_steps_per_second": 0.527,
"step": 115
},
{
"epoch": 8.07,
"grad_norm": 0.306815505027771,
"learning_rate": 0.001,
"loss": 0.6809,
"step": 116
},
{
"epoch": 8.21,
"grad_norm": 0.34183812141418457,
"learning_rate": 0.001,
"loss": 0.5853,
"step": 118
},
{
"epoch": 8.35,
"grad_norm": 0.3781261444091797,
"learning_rate": 0.001,
"loss": 0.5819,
"step": 120
},
{
"epoch": 8.49,
"grad_norm": 0.36344149708747864,
"learning_rate": 0.001,
"loss": 0.6059,
"step": 122
},
{
"epoch": 8.63,
"grad_norm": 0.38990476727485657,
"learning_rate": 0.001,
"loss": 0.5929,
"step": 124
},
{
"epoch": 8.77,
"grad_norm": 0.34000781178474426,
"learning_rate": 0.001,
"loss": 0.5887,
"step": 126
},
{
"epoch": 8.9,
"grad_norm": 0.32895970344543457,
"learning_rate": 0.001,
"loss": 0.6287,
"step": 128
},
{
"epoch": 8.97,
"eval_loss": 3.145782709121704,
"eval_runtime": 81.5735,
"eval_samples_per_second": 4.144,
"eval_steps_per_second": 0.527,
"step": 129
},
{
"epoch": 9.04,
"grad_norm": 0.36275872588157654,
"learning_rate": 0.001,
"loss": 0.5983,
"step": 130
},
{
"epoch": 9.18,
"grad_norm": 0.3596336245536804,
"learning_rate": 0.001,
"loss": 0.4615,
"step": 132
},
{
"epoch": 9.32,
"grad_norm": 0.37557095289230347,
"learning_rate": 0.001,
"loss": 0.4756,
"step": 134
},
{
"epoch": 9.46,
"grad_norm": 0.39249515533447266,
"learning_rate": 0.001,
"loss": 0.4546,
"step": 136
},
{
"epoch": 9.6,
"grad_norm": 0.3760348856449127,
"learning_rate": 0.001,
"loss": 0.4792,
"step": 138
},
{
"epoch": 9.74,
"grad_norm": 0.3137217164039612,
"learning_rate": 0.001,
"loss": 0.4674,
"step": 140
},
{
"epoch": 9.88,
"grad_norm": 0.40549594163894653,
"learning_rate": 0.001,
"loss": 0.4939,
"step": 142
},
{
"epoch": 9.95,
"eval_loss": 3.5685999393463135,
"eval_runtime": 81.5958,
"eval_samples_per_second": 4.142,
"eval_steps_per_second": 0.527,
"step": 143
},
{
"epoch": 10.02,
"grad_norm": 0.4173819422721863,
"learning_rate": 0.001,
"loss": 0.5055,
"step": 144
},
{
"epoch": 10.16,
"grad_norm": 0.280066579580307,
"learning_rate": 0.001,
"loss": 0.3353,
"step": 146
},
{
"epoch": 10.3,
"grad_norm": 0.30166783928871155,
"learning_rate": 0.001,
"loss": 0.351,
"step": 148
},
{
"epoch": 10.43,
"grad_norm": 0.28606531023979187,
"learning_rate": 0.001,
"loss": 0.3834,
"step": 150
},
{
"epoch": 10.57,
"grad_norm": 0.2835221588611603,
"learning_rate": 0.001,
"loss": 0.3718,
"step": 152
},
{
"epoch": 10.71,
"grad_norm": 0.3148328959941864,
"learning_rate": 0.001,
"loss": 0.3692,
"step": 154
},
{
"epoch": 10.85,
"grad_norm": 0.3502219021320343,
"learning_rate": 0.001,
"loss": 0.38,
"step": 156
},
{
"epoch": 10.99,
"grad_norm": 0.3344653844833374,
"learning_rate": 0.001,
"loss": 0.376,
"step": 158
},
{
"epoch": 10.99,
"eval_loss": 3.425977945327759,
"eval_runtime": 81.532,
"eval_samples_per_second": 4.146,
"eval_steps_per_second": 0.527,
"step": 158
},
{
"epoch": 11.13,
"grad_norm": 0.32332998514175415,
"learning_rate": 0.001,
"loss": 0.2827,
"step": 160
},
{
"epoch": 11.27,
"grad_norm": 0.35432103276252747,
"learning_rate": 0.001,
"loss": 0.2966,
"step": 162
},
{
"epoch": 11.41,
"grad_norm": 0.29032111167907715,
"learning_rate": 0.001,
"loss": 0.2954,
"step": 164
},
{
"epoch": 11.55,
"grad_norm": 0.3170696198940277,
"learning_rate": 0.001,
"loss": 0.2738,
"step": 166
},
{
"epoch": 11.69,
"grad_norm": 0.3339516520500183,
"learning_rate": 0.001,
"loss": 0.2786,
"step": 168
},
{
"epoch": 11.83,
"grad_norm": 0.3187398910522461,
"learning_rate": 0.001,
"loss": 0.315,
"step": 170
},
{
"epoch": 11.97,
"grad_norm": 0.2842791974544525,
"learning_rate": 0.001,
"loss": 0.313,
"step": 172
},
{
"epoch": 11.97,
"eval_loss": 3.9301607608795166,
"eval_runtime": 81.5908,
"eval_samples_per_second": 4.143,
"eval_steps_per_second": 0.527,
"step": 172
},
{
"epoch": 12.1,
"grad_norm": 0.2522130012512207,
"learning_rate": 0.001,
"loss": 0.2504,
"step": 174
},
{
"epoch": 12.24,
"grad_norm": 0.23560765385627747,
"learning_rate": 0.001,
"loss": 0.212,
"step": 176
},
{
"epoch": 12.38,
"grad_norm": 0.24140460789203644,
"learning_rate": 0.001,
"loss": 0.2156,
"step": 178
},
{
"epoch": 12.52,
"grad_norm": 0.2790488302707672,
"learning_rate": 0.001,
"loss": 0.2474,
"step": 180
},
{
"epoch": 12.66,
"grad_norm": 0.2879179120063782,
"learning_rate": 0.001,
"loss": 0.2486,
"step": 182
},
{
"epoch": 12.8,
"grad_norm": 0.3126004934310913,
"learning_rate": 0.001,
"loss": 0.2499,
"step": 184
},
{
"epoch": 12.94,
"grad_norm": 0.3011338412761688,
"learning_rate": 0.001,
"loss": 0.2562,
"step": 186
},
{
"epoch": 12.94,
"eval_loss": 3.743312120437622,
"eval_runtime": 81.5885,
"eval_samples_per_second": 4.143,
"eval_steps_per_second": 0.527,
"step": 186
},
{
"epoch": 13.08,
"grad_norm": 0.24417123198509216,
"learning_rate": 0.001,
"loss": 0.2166,
"step": 188
},
{
"epoch": 13.22,
"grad_norm": 0.21955759823322296,
"learning_rate": 0.001,
"loss": 0.1767,
"step": 190
},
{
"epoch": 13.36,
"grad_norm": 0.20537225902080536,
"learning_rate": 0.001,
"loss": 0.1715,
"step": 192
},
{
"epoch": 13.5,
"grad_norm": 0.21406413614749908,
"learning_rate": 0.001,
"loss": 0.1857,
"step": 194
},
{
"epoch": 13.63,
"grad_norm": 0.21677067875862122,
"learning_rate": 0.001,
"loss": 0.1881,
"step": 196
},
{
"epoch": 13.77,
"grad_norm": 0.2592070996761322,
"learning_rate": 0.001,
"loss": 0.2022,
"step": 198
},
{
"epoch": 13.91,
"grad_norm": 0.23913638293743134,
"learning_rate": 0.001,
"loss": 0.2051,
"step": 200
},
{
"epoch": 13.98,
"eval_loss": 3.911346197128296,
"eval_runtime": 81.5425,
"eval_samples_per_second": 4.145,
"eval_steps_per_second": 0.527,
"step": 201
},
{
"epoch": 14.05,
"grad_norm": 0.19888806343078613,
"learning_rate": 0.001,
"loss": 0.1774,
"step": 202
},
{
"epoch": 14.19,
"grad_norm": 0.17841410636901855,
"learning_rate": 0.001,
"loss": 0.1409,
"step": 204
},
{
"epoch": 14.33,
"grad_norm": 0.22502601146697998,
"learning_rate": 0.001,
"loss": 0.1432,
"step": 206
},
{
"epoch": 14.47,
"grad_norm": 0.21947847306728363,
"learning_rate": 0.001,
"loss": 0.1487,
"step": 208
},
{
"epoch": 14.61,
"grad_norm": 0.20319664478302002,
"learning_rate": 0.001,
"loss": 0.1753,
"step": 210
},
{
"epoch": 14.75,
"grad_norm": 0.20484566688537598,
"learning_rate": 0.001,
"loss": 0.1627,
"step": 212
},
{
"epoch": 14.89,
"grad_norm": 0.24411869049072266,
"learning_rate": 0.001,
"loss": 0.1802,
"step": 214
},
{
"epoch": 14.96,
"eval_loss": 4.0449538230896,
"eval_runtime": 81.5583,
"eval_samples_per_second": 4.144,
"eval_steps_per_second": 0.527,
"step": 215
},
{
"epoch": 15.03,
"grad_norm": 0.23610645532608032,
"learning_rate": 0.001,
"loss": 0.1881,
"step": 216
},
{
"epoch": 15.17,
"grad_norm": 0.17829175293445587,
"learning_rate": 0.001,
"loss": 0.123,
"step": 218
},
{
"epoch": 15.3,
"grad_norm": 0.178519606590271,
"learning_rate": 0.001,
"loss": 0.1166,
"step": 220
},
{
"epoch": 15.44,
"grad_norm": 0.19595706462860107,
"learning_rate": 0.001,
"loss": 0.135,
"step": 222
},
{
"epoch": 15.58,
"grad_norm": 0.20790521800518036,
"learning_rate": 0.001,
"loss": 0.1494,
"step": 224
},
{
"epoch": 15.72,
"grad_norm": 0.1832074671983719,
"learning_rate": 0.001,
"loss": 0.1488,
"step": 226
},
{
"epoch": 15.86,
"grad_norm": 0.17795896530151367,
"learning_rate": 0.001,
"loss": 0.1448,
"step": 228
},
{
"epoch": 16.0,
"grad_norm": 0.20039702951908112,
"learning_rate": 0.001,
"loss": 0.1378,
"step": 230
},
{
"epoch": 16.0,
"eval_loss": 3.939739227294922,
"eval_runtime": 81.6032,
"eval_samples_per_second": 4.142,
"eval_steps_per_second": 0.527,
"step": 230
},
{
"epoch": 16.14,
"grad_norm": 0.19622142612934113,
"learning_rate": 0.001,
"loss": 0.3001,
"step": 232
},
{
"epoch": 16.28,
"grad_norm": 19.05455207824707,
"learning_rate": 0.001,
"loss": 0.2708,
"step": 234
},
{
"epoch": 16.42,
"grad_norm": 29.798582077026367,
"learning_rate": 0.001,
"loss": 0.2154,
"step": 236
},
{
"epoch": 16.56,
"grad_norm": 8.835821151733398,
"learning_rate": 0.001,
"loss": 0.1348,
"step": 238
},
{
"epoch": 16.7,
"grad_norm": 0.3760863244533539,
"learning_rate": 0.001,
"loss": 0.6235,
"step": 240
},
{
"epoch": 16.83,
"grad_norm": 0.3473583459854126,
"learning_rate": 0.001,
"loss": 0.1445,
"step": 242
},
{
"epoch": 16.97,
"grad_norm": 0.4041793942451477,
"learning_rate": 0.001,
"loss": 0.1546,
"step": 244
},
{
"epoch": 16.97,
"eval_loss": 4.307888984680176,
"eval_runtime": 81.6566,
"eval_samples_per_second": 4.139,
"eval_steps_per_second": 0.527,
"step": 244
},
{
"epoch": 17.11,
"grad_norm": 0.2586219906806946,
"learning_rate": 0.001,
"loss": 0.1188,
"step": 246
},
{
"epoch": 17.25,
"grad_norm": 0.4334220886230469,
"learning_rate": 0.001,
"loss": 0.1041,
"step": 248
},
{
"epoch": 17.39,
"grad_norm": 17.520734786987305,
"learning_rate": 0.001,
"loss": 0.1108,
"step": 250
},
{
"epoch": 17.53,
"grad_norm": 0.5943770408630371,
"learning_rate": 0.001,
"loss": 0.1146,
"step": 252
},
{
"epoch": 17.67,
"grad_norm": 0.4325353503227234,
"learning_rate": 0.001,
"loss": 0.1325,
"step": 254
},
{
"epoch": 17.81,
"grad_norm": 0.41412413120269775,
"learning_rate": 0.001,
"loss": 0.1491,
"step": 256
},
{
"epoch": 17.95,
"grad_norm": 0.19986829161643982,
"learning_rate": 0.001,
"loss": 0.1375,
"step": 258
},
{
"epoch": 17.95,
"eval_loss": 4.552526950836182,
"eval_runtime": 81.6054,
"eval_samples_per_second": 4.142,
"eval_steps_per_second": 0.527,
"step": 258
},
{
"epoch": 18.09,
"grad_norm": 0.7999384999275208,
"learning_rate": 0.001,
"loss": 0.1155,
"step": 260
},
{
"epoch": 18.23,
"grad_norm": 0.17563021183013916,
"learning_rate": 0.001,
"loss": 0.1006,
"step": 262
},
{
"epoch": 18.37,
"grad_norm": 0.17661228775978088,
"learning_rate": 0.001,
"loss": 0.1062,
"step": 264
},
{
"epoch": 18.5,
"grad_norm": 0.17768113315105438,
"learning_rate": 0.001,
"loss": 0.1059,
"step": 266
},
{
"epoch": 18.64,
"grad_norm": 0.15412819385528564,
"learning_rate": 0.001,
"loss": 0.0981,
"step": 268
},
{
"epoch": 18.78,
"grad_norm": 0.1754271388053894,
"learning_rate": 0.001,
"loss": 0.0988,
"step": 270
},
{
"epoch": 18.92,
"grad_norm": 0.15736614167690277,
"learning_rate": 0.001,
"loss": 0.1005,
"step": 272
},
{
"epoch": 18.99,
"eval_loss": 4.900540828704834,
"eval_runtime": 81.5789,
"eval_samples_per_second": 4.143,
"eval_steps_per_second": 0.527,
"step": 273
},
{
"epoch": 19.06,
"grad_norm": 0.1531495302915573,
"learning_rate": 0.001,
"loss": 0.0844,
"step": 274
},
{
"epoch": 19.2,
"grad_norm": 0.15237411856651306,
"learning_rate": 0.001,
"loss": 0.0752,
"step": 276
},
{
"epoch": 19.34,
"grad_norm": 0.1433786153793335,
"learning_rate": 0.001,
"loss": 0.0782,
"step": 278
},
{
"epoch": 19.48,
"grad_norm": 0.1296713650226593,
"learning_rate": 0.001,
"loss": 0.0808,
"step": 280
},
{
"epoch": 19.48,
"eval_loss": 4.81671667098999,
"eval_runtime": 81.4692,
"eval_samples_per_second": 4.149,
"eval_steps_per_second": 0.528,
"step": 280
},
{
"epoch": 19.48,
"step": 280,
"total_flos": 4.895208054457934e+18,
"train_loss": 0.8494854368801628,
"train_runtime": 68771.7044,
"train_samples_per_second": 1.068,
"train_steps_per_second": 0.004
}
],
"logging_steps": 2,
"max_steps": 280,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"total_flos": 4.895208054457934e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
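
Below is a minimal sketch of how one might inspect this trainer state, assuming the JSON above is saved locally as `trainer_state.json` (file name and the use of matplotlib are assumptions, not part of the original file). It splits `log_history` into training-loss and eval-loss entries and plots both against epoch, with the recorded `best_metric` shown as a reference line.

```python
# Sketch only: assumes the JSON is stored as "trainer_state.json" and
# that matplotlib is available in the environment.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Training log entries carry "loss"; evaluation entries carry "eval_loss".
train_pts = [(e["epoch"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_pts = [(e["epoch"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

plt.plot(*zip(*train_pts), label="train loss")
plt.plot(*zip(*eval_pts), marker="o", label="eval loss")
plt.axhline(state["best_metric"], linestyle="--", label="best eval loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curves.png")
```

Plotted this way, the log shows training loss falling steadily while eval loss bottoms out at step 28 (the checkpoint recorded in `best_model_checkpoint`) and rises afterwards.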