{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 431,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04640371229698376,
"grad_norm": 302.0,
"learning_rate": 1.5384615384615387e-06,
"loss": 2.5055,
"step": 5
},
{
"epoch": 0.09280742459396751,
"grad_norm": 58.75,
"learning_rate": 3.0769230769230774e-06,
"loss": 2.3084,
"step": 10
},
{
"epoch": 0.13921113689095127,
"grad_norm": 192.0,
"learning_rate": 4.615384615384616e-06,
"loss": 1.8971,
"step": 15
},
{
"epoch": 0.18561484918793503,
"grad_norm": 7.375,
"learning_rate": 6.153846153846155e-06,
"loss": 1.4568,
"step": 20
},
{
"epoch": 0.23201856148491878,
"grad_norm": 5.75,
"learning_rate": 7.692307692307694e-06,
"loss": 1.1801,
"step": 25
},
{
"epoch": 0.27842227378190254,
"grad_norm": 4.8125,
"learning_rate": 9.230769230769232e-06,
"loss": 1.0086,
"step": 30
},
{
"epoch": 0.3248259860788863,
"grad_norm": 3.984375,
"learning_rate": 1.076923076923077e-05,
"loss": 0.9797,
"step": 35
},
{
"epoch": 0.37122969837587005,
"grad_norm": 4.375,
"learning_rate": 1.230769230769231e-05,
"loss": 0.939,
"step": 40
},
{
"epoch": 0.4176334106728538,
"grad_norm": 3.6875,
"learning_rate": 1.3846153846153847e-05,
"loss": 0.896,
"step": 45
},
{
"epoch": 0.46403712296983757,
"grad_norm": 3.671875,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.8722,
"step": 50
},
{
"epoch": 0.5104408352668214,
"grad_norm": 3.8125,
"learning_rate": 1.6923076923076924e-05,
"loss": 0.9006,
"step": 55
},
{
"epoch": 0.5568445475638051,
"grad_norm": 3.5625,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.8545,
"step": 60
},
{
"epoch": 0.6032482598607889,
"grad_norm": 3.546875,
"learning_rate": 2e-05,
"loss": 0.8253,
"step": 65
},
{
"epoch": 0.6496519721577726,
"grad_norm": 3.359375,
"learning_rate": 1.9996294632312766e-05,
"loss": 0.8318,
"step": 70
},
{
"epoch": 0.6960556844547564,
"grad_norm": 3.625,
"learning_rate": 1.9985181275201e-05,
"loss": 0.8287,
"step": 75
},
{
"epoch": 0.7424593967517401,
"grad_norm": 3.09375,
"learning_rate": 1.9966668164479567e-05,
"loss": 0.8215,
"step": 80
},
{
"epoch": 0.7888631090487239,
"grad_norm": 3.3125,
"learning_rate": 1.9940769019724926e-05,
"loss": 0.8739,
"step": 85
},
{
"epoch": 0.8352668213457076,
"grad_norm": 3.015625,
"learning_rate": 1.9907503034107893e-05,
"loss": 0.8378,
"step": 90
},
{
"epoch": 0.8816705336426914,
"grad_norm": 3.0625,
"learning_rate": 1.9866894860170104e-05,
"loss": 0.8728,
"step": 95
},
{
"epoch": 0.9280742459396751,
"grad_norm": 3.015625,
"learning_rate": 1.9818974591554668e-05,
"loss": 0.8036,
"step": 100
},
{
"epoch": 0.974477958236659,
"grad_norm": 3.25,
"learning_rate": 1.9763777740704572e-05,
"loss": 0.8385,
"step": 105
},
{
"epoch": 1.0208816705336428,
"grad_norm": 2.53125,
"learning_rate": 1.970134521254532e-05,
"loss": 0.757,
"step": 110
},
{
"epoch": 1.0672853828306264,
"grad_norm": 3.125,
"learning_rate": 1.9631723274171412e-05,
"loss": 0.6218,
"step": 115
},
{
"epoch": 1.1136890951276102,
"grad_norm": 3.015625,
"learning_rate": 1.9554963520559003e-05,
"loss": 0.6392,
"step": 120
},
{
"epoch": 1.160092807424594,
"grad_norm": 3.34375,
"learning_rate": 1.9471122836330236e-05,
"loss": 0.6406,
"step": 125
},
{
"epoch": 1.2064965197215778,
"grad_norm": 3.59375,
"learning_rate": 1.9380263353597553e-05,
"loss": 0.6187,
"step": 130
},
{
"epoch": 1.2529002320185616,
"grad_norm": 3.265625,
"learning_rate": 1.9282452405919235e-05,
"loss": 0.6496,
"step": 135
},
{
"epoch": 1.2993039443155452,
"grad_norm": 3.34375,
"learning_rate": 1.9177762478400276e-05,
"loss": 0.6379,
"step": 140
},
{
"epoch": 1.345707656612529,
"grad_norm": 2.859375,
"learning_rate": 1.9066271153975602e-05,
"loss": 0.649,
"step": 145
},
{
"epoch": 1.3921113689095128,
"grad_norm": 3.359375,
"learning_rate": 1.8948061055915395e-05,
"loss": 0.6498,
"step": 150
},
{
"epoch": 1.4385150812064964,
"grad_norm": 2.84375,
"learning_rate": 1.882321978659519e-05,
"loss": 0.6101,
"step": 155
},
{
"epoch": 1.4849187935034802,
"grad_norm": 3.171875,
"learning_rate": 1.869183986257606e-05,
"loss": 0.655,
"step": 160
},
{
"epoch": 1.531322505800464,
"grad_norm": 3.09375,
"learning_rate": 1.8554018646043045e-05,
"loss": 0.6264,
"step": 165
},
{
"epoch": 1.5777262180974478,
"grad_norm": 3.21875,
"learning_rate": 1.840985827265262e-05,
"loss": 0.6092,
"step": 170
},
{
"epoch": 1.6241299303944317,
"grad_norm": 3.328125,
"learning_rate": 1.825946557584265e-05,
"loss": 0.6433,
"step": 175
},
{
"epoch": 1.6705336426914155,
"grad_norm": 3.25,
"learning_rate": 1.810295200766097e-05,
"loss": 0.6223,
"step": 180
},
{
"epoch": 1.716937354988399,
"grad_norm": 3.21875,
"learning_rate": 1.794043355617121e-05,
"loss": 0.6193,
"step": 185
},
{
"epoch": 1.7633410672853829,
"grad_norm": 3.25,
"learning_rate": 1.7772030659497112e-05,
"loss": 0.6316,
"step": 190
},
{
"epoch": 1.8097447795823665,
"grad_norm": 2.984375,
"learning_rate": 1.7597868116569036e-05,
"loss": 0.6421,
"step": 195
},
{
"epoch": 1.8561484918793503,
"grad_norm": 3.203125,
"learning_rate": 1.7418074994638752e-05,
"loss": 0.6177,
"step": 200
},
{
"epoch": 1.902552204176334,
"grad_norm": 3.46875,
"learning_rate": 1.7232784533631148e-05,
"loss": 0.6237,
"step": 205
},
{
"epoch": 1.948955916473318,
"grad_norm": 3.015625,
"learning_rate": 1.7042134047403613e-05,
"loss": 0.647,
"step": 210
},
{
"epoch": 1.9953596287703017,
"grad_norm": 3.4375,
"learning_rate": 1.684626482198639e-05,
"loss": 0.6445,
"step": 215
},
{
"epoch": 2.0417633410672855,
"grad_norm": 3.828125,
"learning_rate": 1.6645322010879242e-05,
"loss": 0.4345,
"step": 220
},
{
"epoch": 2.0881670533642693,
"grad_norm": 4.03125,
"learning_rate": 1.6439454527482014e-05,
"loss": 0.3731,
"step": 225
},
{
"epoch": 2.1345707656612527,
"grad_norm": 3.953125,
"learning_rate": 1.6228814934738873e-05,
"loss": 0.3653,
"step": 230
},
{
"epoch": 2.1809744779582365,
"grad_norm": 4.09375,
"learning_rate": 1.6013559332077945e-05,
"loss": 0.3391,
"step": 235
},
{
"epoch": 2.2273781902552203,
"grad_norm": 5.3125,
"learning_rate": 1.5793847239730148e-05,
"loss": 0.3467,
"step": 240
},
{
"epoch": 2.273781902552204,
"grad_norm": 4.1875,
"learning_rate": 1.5569841480512972e-05,
"loss": 0.3468,
"step": 245
},
{
"epoch": 2.320185614849188,
"grad_norm": 3.46875,
"learning_rate": 1.534170805916681e-05,
"loss": 0.3488,
"step": 250
},
{
"epoch": 2.3665893271461718,
"grad_norm": 4.96875,
"learning_rate": 1.510961603933324e-05,
"loss": 0.361,
"step": 255
},
{
"epoch": 2.4129930394431556,
"grad_norm": 3.875,
"learning_rate": 1.4873737418266398e-05,
"loss": 0.3524,
"step": 260
},
{
"epoch": 2.4593967517401394,
"grad_norm": 4.0,
"learning_rate": 1.4634246999370415e-05,
"loss": 0.3388,
"step": 265
},
{
"epoch": 2.505800464037123,
"grad_norm": 4.34375,
"learning_rate": 1.4391322262657206e-05,
"loss": 0.3513,
"step": 270
},
{
"epoch": 2.5522041763341066,
"grad_norm": 4.28125,
"learning_rate": 1.4145143233220741e-05,
"loss": 0.3354,
"step": 275
},
{
"epoch": 2.5986078886310904,
"grad_norm": 4.34375,
"learning_rate": 1.3895892347825205e-05,
"loss": 0.3506,
"step": 280
},
{
"epoch": 2.645011600928074,
"grad_norm": 4.5,
"learning_rate": 1.3643754319705956e-05,
"loss": 0.3502,
"step": 285
},
{
"epoch": 2.691415313225058,
"grad_norm": 4.28125,
"learning_rate": 1.3388916001683412e-05,
"loss": 0.3574,
"step": 290
},
{
"epoch": 2.737819025522042,
"grad_norm": 4.0,
"learning_rate": 1.3131566247691387e-05,
"loss": 0.323,
"step": 295
},
{
"epoch": 2.7842227378190256,
"grad_norm": 4.0625,
"learning_rate": 1.2871895772822442e-05,
"loss": 0.3474,
"step": 300
},
{
"epoch": 2.8306264501160094,
"grad_norm": 4.15625,
"learning_rate": 1.261009701199395e-05,
"loss": 0.3494,
"step": 305
},
{
"epoch": 2.877030162412993,
"grad_norm": 3.546875,
"learning_rate": 1.2346363977339698e-05,
"loss": 0.3633,
"step": 310
},
{
"epoch": 2.9234338747099766,
"grad_norm": 4.34375,
"learning_rate": 1.208089211443262e-05,
"loss": 0.3365,
"step": 315
},
{
"epoch": 2.9698375870069604,
"grad_norm": 3.734375,
"learning_rate": 1.1813878157445253e-05,
"loss": 0.3391,
"step": 320
},
{
"epoch": 3.0162412993039442,
"grad_norm": 3.46875,
"learning_rate": 1.1545519983355255e-05,
"loss": 0.3049,
"step": 325
},
{
"epoch": 3.062645011600928,
"grad_norm": 3.984375,
"learning_rate": 1.1276016465303989e-05,
"loss": 0.1773,
"step": 330
},
{
"epoch": 3.109048723897912,
"grad_norm": 6.84375,
"learning_rate": 1.1005567325216946e-05,
"loss": 0.1798,
"step": 335
},
{
"epoch": 3.1554524361948957,
"grad_norm": 3.890625,
"learning_rate": 1.0734372985795062e-05,
"loss": 0.1666,
"step": 340
},
{
"epoch": 3.2018561484918795,
"grad_norm": 3.671875,
"learning_rate": 1.0462634421986786e-05,
"loss": 0.1737,
"step": 345
},
{
"epoch": 3.2482598607888633,
"grad_norm": 4.15625,
"learning_rate": 1.0190553012050868e-05,
"loss": 0.1723,
"step": 350
},
{
"epoch": 3.2946635730858467,
"grad_norm": 3.921875,
"learning_rate": 9.918330388320235e-06,
"loss": 0.165,
"step": 355
},
{
"epoch": 3.3410672853828305,
"grad_norm": 4.28125,
"learning_rate": 9.646168287777633e-06,
"loss": 0.1683,
"step": 360
},
{
"epoch": 3.3874709976798143,
"grad_norm": 3.90625,
"learning_rate": 9.374268402553665e-06,
"loss": 0.158,
"step": 365
},
{
"epoch": 3.433874709976798,
"grad_norm": 3.71875,
"learning_rate": 9.102832230458115e-06,
"loss": 0.1679,
"step": 370
},
{
"epoch": 3.480278422273782,
"grad_norm": 3.953125,
"learning_rate": 8.83206092565522e-06,
"loss": 0.1609,
"step": 375
},
{
"epoch": 3.5266821345707657,
"grad_norm": 4.46875,
"learning_rate": 8.562155149593673e-06,
"loss": 0.1726,
"step": 380
},
{
"epoch": 3.5730858468677495,
"grad_norm": 4.125,
"learning_rate": 8.293314922301715e-06,
"loss": 0.1623,
"step": 385
},
{
"epoch": 3.619489559164733,
"grad_norm": 3.71875,
"learning_rate": 8.025739474157595e-06,
"loss": 0.1669,
"step": 390
},
{
"epoch": 3.6658932714617167,
"grad_norm": 4.625,
"learning_rate": 7.759627098245207e-06,
"loss": 0.1683,
"step": 395
},
{
"epoch": 3.7122969837587005,
"grad_norm": 4.375,
"learning_rate": 7.49517500340432e-06,
"loss": 0.1676,
"step": 400
},
{
"epoch": 3.7587006960556844,
"grad_norm": 3.65625,
"learning_rate": 7.232579168084344e-06,
"loss": 0.1519,
"step": 405
},
{
"epoch": 3.805104408352668,
"grad_norm": 4.46875,
"learning_rate": 6.972034195109885e-06,
"loss": 0.1675,
"step": 410
},
{
"epoch": 3.851508120649652,
"grad_norm": 3.5625,
"learning_rate": 6.713733167465723e-06,
"loss": 0.1699,
"step": 415
},
{
"epoch": 3.897911832946636,
"grad_norm": 4.59375,
"learning_rate": 6.4578675052081395e-06,
"loss": 0.1589,
"step": 420
},
{
"epoch": 3.9443155452436196,
"grad_norm": 4.09375,
"learning_rate": 6.204626823608584e-06,
"loss": 0.1738,
"step": 425
},
{
"epoch": 3.9907192575406034,
"grad_norm": 3.78125,
"learning_rate": 5.954198792634782e-06,
"loss": 0.1613,
"step": 430
}
],
"logging_steps": 5,
"max_steps": 642,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 999999,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.382695759314125e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}