|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 431, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04640371229698376, |
|
"grad_norm": 302.0, |
|
"learning_rate": 1.5384615384615387e-06, |
|
"loss": 2.5055, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09280742459396751, |
|
"grad_norm": 58.75, |
|
"learning_rate": 3.0769230769230774e-06, |
|
"loss": 2.3084, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13921113689095127, |
|
"grad_norm": 192.0, |
|
"learning_rate": 4.615384615384616e-06, |
|
"loss": 1.8971, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.18561484918793503, |
|
"grad_norm": 7.375, |
|
"learning_rate": 6.153846153846155e-06, |
|
"loss": 1.4568, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.23201856148491878, |
|
"grad_norm": 5.75, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 1.1801, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.27842227378190254, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 9.230769230769232e-06, |
|
"loss": 1.0086, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3248259860788863, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.076923076923077e-05, |
|
"loss": 0.9797, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.37122969837587005, |
|
"grad_norm": 4.375, |
|
"learning_rate": 1.230769230769231e-05, |
|
"loss": 0.939, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.4176334106728538, |
|
"grad_norm": 3.6875, |
|
"learning_rate": 1.3846153846153847e-05, |
|
"loss": 0.896, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.46403712296983757, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.8722, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.5104408352668214, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 1.6923076923076924e-05, |
|
"loss": 0.9006, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5568445475638051, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 1.8461538461538465e-05, |
|
"loss": 0.8545, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.6032482598607889, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 2e-05, |
|
"loss": 0.8253, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6496519721577726, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.9996294632312766e-05, |
|
"loss": 0.8318, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6960556844547564, |
|
"grad_norm": 3.625, |
|
"learning_rate": 1.9985181275201e-05, |
|
"loss": 0.8287, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7424593967517401, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1.9966668164479567e-05, |
|
"loss": 0.8215, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7888631090487239, |
|
"grad_norm": 3.3125, |
|
"learning_rate": 1.9940769019724926e-05, |
|
"loss": 0.8739, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8352668213457076, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.9907503034107893e-05, |
|
"loss": 0.8378, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8816705336426914, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.9866894860170104e-05, |
|
"loss": 0.8728, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9280742459396751, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.9818974591554668e-05, |
|
"loss": 0.8036, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.974477958236659, |
|
"grad_norm": 3.25, |
|
"learning_rate": 1.9763777740704572e-05, |
|
"loss": 0.8385, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 1.0208816705336428, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 1.970134521254532e-05, |
|
"loss": 0.757, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0672853828306264, |
|
"grad_norm": 3.125, |
|
"learning_rate": 1.9631723274171412e-05, |
|
"loss": 0.6218, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.1136890951276102, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.9554963520559003e-05, |
|
"loss": 0.6392, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.160092807424594, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.9471122836330236e-05, |
|
"loss": 0.6406, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.2064965197215778, |
|
"grad_norm": 3.59375, |
|
"learning_rate": 1.9380263353597553e-05, |
|
"loss": 0.6187, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2529002320185616, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 1.9282452405919235e-05, |
|
"loss": 0.6496, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2993039443155452, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 1.9177762478400276e-05, |
|
"loss": 0.6379, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.345707656612529, |
|
"grad_norm": 2.859375, |
|
"learning_rate": 1.9066271153975602e-05, |
|
"loss": 0.649, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3921113689095128, |
|
"grad_norm": 3.359375, |
|
"learning_rate": 1.8948061055915395e-05, |
|
"loss": 0.6498, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.4385150812064964, |
|
"grad_norm": 2.84375, |
|
"learning_rate": 1.882321978659519e-05, |
|
"loss": 0.6101, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4849187935034802, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 1.869183986257606e-05, |
|
"loss": 0.655, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.531322505800464, |
|
"grad_norm": 3.09375, |
|
"learning_rate": 1.8554018646043045e-05, |
|
"loss": 0.6264, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.5777262180974478, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 1.840985827265262e-05, |
|
"loss": 0.6092, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.6241299303944317, |
|
"grad_norm": 3.328125, |
|
"learning_rate": 1.825946557584265e-05, |
|
"loss": 0.6433, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6705336426914155, |
|
"grad_norm": 3.25, |
|
"learning_rate": 1.810295200766097e-05, |
|
"loss": 0.6223, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.716937354988399, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 1.794043355617121e-05, |
|
"loss": 0.6193, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7633410672853829, |
|
"grad_norm": 3.25, |
|
"learning_rate": 1.7772030659497112e-05, |
|
"loss": 0.6316, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.8097447795823665, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 1.7597868116569036e-05, |
|
"loss": 0.6421, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8561484918793503, |
|
"grad_norm": 3.203125, |
|
"learning_rate": 1.7418074994638752e-05, |
|
"loss": 0.6177, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.902552204176334, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.7232784533631148e-05, |
|
"loss": 0.6237, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.948955916473318, |
|
"grad_norm": 3.015625, |
|
"learning_rate": 1.7042134047403613e-05, |
|
"loss": 0.647, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.9953596287703017, |
|
"grad_norm": 3.4375, |
|
"learning_rate": 1.684626482198639e-05, |
|
"loss": 0.6445, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 2.0417633410672855, |
|
"grad_norm": 3.828125, |
|
"learning_rate": 1.6645322010879242e-05, |
|
"loss": 0.4345, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.0881670533642693, |
|
"grad_norm": 4.03125, |
|
"learning_rate": 1.6439454527482014e-05, |
|
"loss": 0.3731, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.1345707656612527, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 1.6228814934738873e-05, |
|
"loss": 0.3653, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.1809744779582365, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 1.6013559332077945e-05, |
|
"loss": 0.3391, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.2273781902552203, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 1.5793847239730148e-05, |
|
"loss": 0.3467, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.273781902552204, |
|
"grad_norm": 4.1875, |
|
"learning_rate": 1.5569841480512972e-05, |
|
"loss": 0.3468, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.320185614849188, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.534170805916681e-05, |
|
"loss": 0.3488, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.3665893271461718, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.510961603933324e-05, |
|
"loss": 0.361, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.4129930394431556, |
|
"grad_norm": 3.875, |
|
"learning_rate": 1.4873737418266398e-05, |
|
"loss": 0.3524, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.4593967517401394, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.4634246999370415e-05, |
|
"loss": 0.3388, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.505800464037123, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.4391322262657206e-05, |
|
"loss": 0.3513, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.5522041763341066, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.4145143233220741e-05, |
|
"loss": 0.3354, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.5986078886310904, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.3895892347825205e-05, |
|
"loss": 0.3506, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.645011600928074, |
|
"grad_norm": 4.5, |
|
"learning_rate": 1.3643754319705956e-05, |
|
"loss": 0.3502, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.691415313225058, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 1.3388916001683412e-05, |
|
"loss": 0.3574, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.737819025522042, |
|
"grad_norm": 4.0, |
|
"learning_rate": 1.3131566247691387e-05, |
|
"loss": 0.323, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.7842227378190256, |
|
"grad_norm": 4.0625, |
|
"learning_rate": 1.2871895772822442e-05, |
|
"loss": 0.3474, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.8306264501160094, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.261009701199395e-05, |
|
"loss": 0.3494, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.877030162412993, |
|
"grad_norm": 3.546875, |
|
"learning_rate": 1.2346363977339698e-05, |
|
"loss": 0.3633, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.9234338747099766, |
|
"grad_norm": 4.34375, |
|
"learning_rate": 1.208089211443262e-05, |
|
"loss": 0.3365, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.9698375870069604, |
|
"grad_norm": 3.734375, |
|
"learning_rate": 1.1813878157445253e-05, |
|
"loss": 0.3391, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 3.0162412993039442, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 1.1545519983355255e-05, |
|
"loss": 0.3049, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 3.062645011600928, |
|
"grad_norm": 3.984375, |
|
"learning_rate": 1.1276016465303989e-05, |
|
"loss": 0.1773, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.109048723897912, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 1.1005567325216946e-05, |
|
"loss": 0.1798, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.1554524361948957, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 1.0734372985795062e-05, |
|
"loss": 0.1666, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.2018561484918795, |
|
"grad_norm": 3.671875, |
|
"learning_rate": 1.0462634421986786e-05, |
|
"loss": 0.1737, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.2482598607888633, |
|
"grad_norm": 4.15625, |
|
"learning_rate": 1.0190553012050868e-05, |
|
"loss": 0.1723, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.2946635730858467, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 9.918330388320235e-06, |
|
"loss": 0.165, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.3410672853828305, |
|
"grad_norm": 4.28125, |
|
"learning_rate": 9.646168287777633e-06, |
|
"loss": 0.1683, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.3874709976798143, |
|
"grad_norm": 3.90625, |
|
"learning_rate": 9.374268402553665e-06, |
|
"loss": 0.158, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.433874709976798, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 9.102832230458115e-06, |
|
"loss": 0.1679, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.480278422273782, |
|
"grad_norm": 3.953125, |
|
"learning_rate": 8.83206092565522e-06, |
|
"loss": 0.1609, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.5266821345707657, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 8.562155149593673e-06, |
|
"loss": 0.1726, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.5730858468677495, |
|
"grad_norm": 4.125, |
|
"learning_rate": 8.293314922301715e-06, |
|
"loss": 0.1623, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.619489559164733, |
|
"grad_norm": 3.71875, |
|
"learning_rate": 8.025739474157595e-06, |
|
"loss": 0.1669, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.6658932714617167, |
|
"grad_norm": 4.625, |
|
"learning_rate": 7.759627098245207e-06, |
|
"loss": 0.1683, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.7122969837587005, |
|
"grad_norm": 4.375, |
|
"learning_rate": 7.49517500340432e-06, |
|
"loss": 0.1676, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.7587006960556844, |
|
"grad_norm": 3.65625, |
|
"learning_rate": 7.232579168084344e-06, |
|
"loss": 0.1519, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.805104408352668, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 6.972034195109885e-06, |
|
"loss": 0.1675, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.851508120649652, |
|
"grad_norm": 3.5625, |
|
"learning_rate": 6.713733167465723e-06, |
|
"loss": 0.1699, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.897911832946636, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 6.4578675052081395e-06, |
|
"loss": 0.1589, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.9443155452436196, |
|
"grad_norm": 4.09375, |
|
"learning_rate": 6.204626823608584e-06, |
|
"loss": 0.1738, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.9907192575406034, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 5.954198792634782e-06, |
|
"loss": 0.1613, |
|
"step": 430 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 642, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 999999, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.382695759314125e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|