|
{ |
|
"best_metric": 0.25531044602394104, |
|
"best_model_checkpoint": "./convnext-tiny-new-1e-4/checkpoint-10990", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 10990, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 24.848655700683594, |
|
"learning_rate": 9.99795725199423e-05, |
|
"loss": 2.511, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 21.852506637573242, |
|
"learning_rate": 9.991830677104683e-05, |
|
"loss": 1.4511, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 14.456836700439453, |
|
"learning_rate": 9.981625281350813e-05, |
|
"loss": 1.1871, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 25.593276977539062, |
|
"learning_rate": 9.967349403553353e-05, |
|
"loss": 1.0097, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 21.799734115600586, |
|
"learning_rate": 9.949014708520663e-05, |
|
"loss": 0.9728, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 22.431013107299805, |
|
"learning_rate": 9.926636177517427e-05, |
|
"loss": 0.9024, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 28.28536033630371, |
|
"learning_rate": 9.900232096023477e-05, |
|
"loss": 0.8912, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 27.407716751098633, |
|
"learning_rate": 9.869824038792741e-05, |
|
"loss": 0.865, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 13.469520568847656, |
|
"learning_rate": 9.835436852224525e-05, |
|
"loss": 0.7768, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 17.95415496826172, |
|
"learning_rate": 9.797098634061542e-05, |
|
"loss": 0.8432, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.862027833001988, |
|
"eval_loss": 0.4732176661491394, |
|
"eval_runtime": 70.5844, |
|
"eval_samples_per_second": 35.631, |
|
"eval_steps_per_second": 2.238, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 28.058094024658203, |
|
"learning_rate": 9.754840710431274e-05, |
|
"loss": 0.827, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 10.800704002380371, |
|
"learning_rate": 9.708697610249406e-05, |
|
"loss": 0.733, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 17.161815643310547, |
|
"learning_rate": 9.658707037006294e-05, |
|
"loss": 0.7117, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 15.4861478805542, |
|
"learning_rate": 9.604909837959455e-05, |
|
"loss": 0.6609, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 20.579856872558594, |
|
"learning_rate": 9.547349970757317e-05, |
|
"loss": 0.6396, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 24.0612735748291, |
|
"learning_rate": 9.486074467521456e-05, |
|
"loss": 0.6321, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 23.076522827148438, |
|
"learning_rate": 9.421133396416686e-05, |
|
"loss": 0.7177, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 24.146862030029297, |
|
"learning_rate": 9.352579820740405e-05, |
|
"loss": 0.7033, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 13.88412857055664, |
|
"learning_rate": 9.280469755564613e-05, |
|
"loss": 0.637, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 18.419206619262695, |
|
"learning_rate": 9.204862121966044e-05, |
|
"loss": 0.6056, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 22.241830825805664, |
|
"learning_rate": 9.125818698881798e-05, |
|
"loss": 0.6603, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9009940357852882, |
|
"eval_loss": 0.35555583238601685, |
|
"eval_runtime": 70.2432, |
|
"eval_samples_per_second": 35.804, |
|
"eval_steps_per_second": 2.249, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 28.679054260253906, |
|
"learning_rate": 9.043404072629829e-05, |
|
"loss": 0.6342, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 25.620079040527344, |
|
"learning_rate": 8.957685584135502e-05, |
|
"loss": 0.5285, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 17.019142150878906, |
|
"learning_rate": 8.86873327390739e-05, |
|
"loss": 0.5882, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 15.56048583984375, |
|
"learning_rate": 8.776619824807224e-05, |
|
"loss": 0.545, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 10.068634033203125, |
|
"learning_rate": 8.681420502660786e-05, |
|
"loss": 0.5944, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 15.93220043182373, |
|
"learning_rate": 8.583213094758261e-05, |
|
"loss": 0.5378, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 16.840496063232422, |
|
"learning_rate": 8.482077846294308e-05, |
|
"loss": 0.5597, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 23.36423683166504, |
|
"learning_rate": 8.378097394799773e-05, |
|
"loss": 0.5476, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 14.018542289733887, |
|
"learning_rate": 8.271356702618626e-05, |
|
"loss": 0.5429, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 22.620033264160156, |
|
"learning_rate": 8.161942987485303e-05, |
|
"loss": 0.5361, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 29.02153778076172, |
|
"learning_rate": 8.049945651259163e-05, |
|
"loss": 0.487, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.882703777335984, |
|
"eval_loss": 0.4106706380844116, |
|
"eval_runtime": 69.7812, |
|
"eval_samples_per_second": 36.041, |
|
"eval_steps_per_second": 2.264, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 9.186837196350098, |
|
"learning_rate": 7.935456206874292e-05, |
|
"loss": 0.5448, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 10.694817543029785, |
|
"learning_rate": 7.818568203564374e-05, |
|
"loss": 0.4207, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 9.862343788146973, |
|
"learning_rate": 7.699377150423672e-05, |
|
"loss": 0.475, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 17.465476989746094, |
|
"learning_rate": 7.577980438366628e-05, |
|
"loss": 0.48, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 19.368242263793945, |
|
"learning_rate": 7.454477260549828e-05, |
|
"loss": 0.4126, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 24.577472686767578, |
|
"learning_rate": 7.32896853132135e-05, |
|
"loss": 0.4992, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 7.475000381469727, |
|
"learning_rate": 7.201556803763725e-05, |
|
"loss": 0.4931, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 13.887510299682617, |
|
"learning_rate": 7.07234618589791e-05, |
|
"loss": 0.4909, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 20.115652084350586, |
|
"learning_rate": 6.94144225561669e-05, |
|
"loss": 0.4814, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 14.20095443725586, |
|
"learning_rate": 6.808951974417078e-05, |
|
"loss": 0.457, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 29.256624221801758, |
|
"learning_rate": 6.674983600002155e-05, |
|
"loss": 0.4369, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9172962226640159, |
|
"eval_loss": 0.3118807375431061, |
|
"eval_runtime": 70.2783, |
|
"eval_samples_per_second": 35.786, |
|
"eval_steps_per_second": 2.248, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 10.440736770629883, |
|
"learning_rate": 6.539646597823791e-05, |
|
"loss": 0.4036, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 14.221583366394043, |
|
"learning_rate": 6.403051551638508e-05, |
|
"loss": 0.3902, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 18.483789443969727, |
|
"learning_rate": 6.265310073149584e-05, |
|
"loss": 0.4096, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 27.6778564453125, |
|
"learning_rate": 6.126534710809216e-05, |
|
"loss": 0.3778, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 18.118946075439453, |
|
"learning_rate": 5.9868388578552734e-05, |
|
"loss": 0.356, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 27.71598243713379, |
|
"learning_rate": 5.8463366596577706e-05, |
|
"loss": 0.3544, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 10.941313743591309, |
|
"learning_rate": 5.705142920450777e-05, |
|
"loss": 0.4005, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 20.2288818359375, |
|
"learning_rate": 5.5633730095259695e-05, |
|
"loss": 0.4007, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 13.51459789276123, |
|
"learning_rate": 5.421142766964474e-05, |
|
"loss": 0.411, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 10.38302993774414, |
|
"learning_rate": 5.278568408984037e-05, |
|
"loss": 0.4178, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 7.628846168518066, |
|
"learning_rate": 5.135766432978829e-05, |
|
"loss": 0.3791, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9196819085487078, |
|
"eval_loss": 0.3082761764526367, |
|
"eval_runtime": 71.5388, |
|
"eval_samples_per_second": 35.156, |
|
"eval_steps_per_second": 2.209, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 15.286186218261719, |
|
"learning_rate": 4.9928535223295344e-05, |
|
"loss": 0.3914, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 19.06636619567871, |
|
"learning_rate": 4.849946451061443e-05, |
|
"loss": 0.3268, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 16.880006790161133, |
|
"learning_rate": 4.707161988428495e-05, |
|
"loss": 0.3184, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 10.349203109741211, |
|
"learning_rate": 4.564616803501205e-05, |
|
"loss": 0.3417, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 17.491863250732422, |
|
"learning_rate": 4.4224273698364735e-05, |
|
"loss": 0.3218, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 9.212013244628906, |
|
"learning_rate": 4.2807098703071255e-05, |
|
"loss": 0.3143, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 19.431245803833008, |
|
"learning_rate": 4.1395801021689746e-05, |
|
"loss": 0.3262, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 16.24408531188965, |
|
"learning_rate": 3.999153382442995e-05, |
|
"loss": 0.2885, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 23.65041732788086, |
|
"learning_rate": 3.859544453689853e-05, |
|
"loss": 0.3555, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 10.8707275390625, |
|
"learning_rate": 3.7208673902538706e-05, |
|
"loss": 0.3404, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 18.250511169433594, |
|
"learning_rate": 3.583235505052955e-05, |
|
"loss": 0.3342, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9296222664015904, |
|
"eval_loss": 0.28032296895980835, |
|
"eval_runtime": 79.2042, |
|
"eval_samples_per_second": 31.753, |
|
"eval_steps_per_second": 1.995, |
|
"step": 6594 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 6.232498645782471, |
|
"learning_rate": 3.446761256990723e-05, |
|
"loss": 0.3301, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 19.711963653564453, |
|
"learning_rate": 3.311556159066397e-05, |
|
"loss": 0.2638, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 11.230116844177246, |
|
"learning_rate": 3.177730687257639e-05, |
|
"loss": 0.312, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 9.200843811035156, |
|
"learning_rate": 3.0453941902507177e-05, |
|
"loss": 0.3046, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 18.336387634277344, |
|
"learning_rate": 2.914654800091768e-05, |
|
"loss": 0.2505, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 4.870161533355713, |
|
"learning_rate": 2.7856193438321986e-05, |
|
"loss": 0.3136, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 8.88219165802002, |
|
"learning_rate": 2.6583932562403957e-05, |
|
"loss": 0.2656, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 12.868867874145508, |
|
"learning_rate": 2.5330804936510373e-05, |
|
"loss": 0.2714, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 26.35186767578125, |
|
"learning_rate": 2.409783449022475e-05, |
|
"loss": 0.2675, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 13.712077140808105, |
|
"learning_rate": 2.2886028682715217e-05, |
|
"loss": 0.2664, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 0.8319320678710938, |
|
"learning_rate": 2.169637767954048e-05, |
|
"loss": 0.2803, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9339960238568589, |
|
"eval_loss": 0.26732343435287476, |
|
"eval_runtime": 70.0882, |
|
"eval_samples_per_second": 35.883, |
|
"eval_steps_per_second": 2.254, |
|
"step": 7693 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 15.552393913269043, |
|
"learning_rate": 2.052985354358622e-05, |
|
"loss": 0.2109, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 15.977888107299805, |
|
"learning_rate": 1.9387409440793386e-05, |
|
"loss": 0.2315, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 12.987915992736816, |
|
"learning_rate": 1.82699788613271e-05, |
|
"loss": 0.2421, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 7.147055625915527, |
|
"learning_rate": 1.7178474856822456e-05, |
|
"loss": 0.2454, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 25.91172981262207, |
|
"learning_rate": 1.611378929433083e-05, |
|
"loss": 0.2504, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 12.569241523742676, |
|
"learning_rate": 1.5076792127576073e-05, |
|
"loss": 0.2338, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 19.739892959594727, |
|
"learning_rate": 1.4068330686115943e-05, |
|
"loss": 0.2161, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 9.218408584594727, |
|
"learning_rate": 1.308922898298977e-05, |
|
"loss": 0.2329, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 2.0763704776763916, |
|
"learning_rate": 1.2140287041418203e-05, |
|
"loss": 0.2085, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 11.455533981323242, |
|
"learning_rate": 1.1222280241104716e-05, |
|
"loss": 0.2561, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 3.408818006515503, |
|
"learning_rate": 1.0335958684673574e-05, |
|
"loss": 0.2118, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.9320079522862823, |
|
"eval_loss": 0.2637202739715576, |
|
"eval_runtime": 74.3067, |
|
"eval_samples_per_second": 33.846, |
|
"eval_steps_per_second": 2.126, |
|
"step": 8792 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 10.710466384887695, |
|
"learning_rate": 9.482046584761495e-06, |
|
"loss": 0.2045, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 14.385254859924316, |
|
"learning_rate": 8.661241672264192e-06, |
|
"loss": 0.2176, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 10.243017196655273, |
|
"learning_rate": 7.874214626220899e-06, |
|
"loss": 0.2099, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 6.018984317779541, |
|
"learning_rate": 7.1216085258031414e-06, |
|
"loss": 0.1686, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 1.289069652557373, |
|
"learning_rate": 6.404038324855222e-06, |
|
"loss": 0.2214, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 2.6374850273132324, |
|
"learning_rate": 5.7220903494159316e-06, |
|
"loss": 0.1675, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 2.8265738487243652, |
|
"learning_rate": 5.076321818632018e-06, |
|
"loss": 0.2129, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 11.41331958770752, |
|
"learning_rate": 4.467260389454864e-06, |
|
"loss": 0.2097, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 22.140159606933594, |
|
"learning_rate": 3.895403725492402e-06, |
|
"loss": 0.2099, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 7.952567100524902, |
|
"learning_rate": 3.3612190903686005e-06, |
|
"loss": 0.2438, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 19.06670379638672, |
|
"learning_rate": 2.86514296592269e-06, |
|
"loss": 0.1706, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.9347912524850894, |
|
"eval_loss": 0.25745028257369995, |
|
"eval_runtime": 74.1249, |
|
"eval_samples_per_second": 33.929, |
|
"eval_steps_per_second": 2.132, |
|
"step": 9891 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 16.285367965698242, |
|
"learning_rate": 2.407580695560252e-06, |
|
"loss": 0.1872, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 5.482550621032715, |
|
"learning_rate": 1.9889061530473986e-06, |
|
"loss": 0.1863, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 8.077670097351074, |
|
"learning_rate": 1.6094614370188499e-06, |
|
"loss": 0.1759, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 14.164857864379883, |
|
"learning_rate": 1.269556591449389e-06, |
|
"loss": 0.1768, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 16.639026641845703, |
|
"learning_rate": 9.694693523171927e-07, |
|
"loss": 0.2211, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 13.088668823242188, |
|
"learning_rate": 7.094449206659748e-07, |
|
"loss": 0.1891, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 7.857825756072998, |
|
"learning_rate": 4.896957622514298e-07, |
|
"loss": 0.2362, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 16.95376968383789, |
|
"learning_rate": 3.104014339355921e-07, |
|
"loss": 0.2201, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 11.055394172668457, |
|
"learning_rate": 1.7170843697111304e-07, |
|
"loss": 0.2139, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 8.469569206237793, |
|
"learning_rate": 7.37300972951771e-08, |
|
"loss": 0.2026, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 18.598886489868164, |
|
"learning_rate": 1.654647293098388e-08, |
|
"loss": 0.1995, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9375745526838967, |
|
"eval_loss": 0.25531044602394104, |
|
"eval_runtime": 73.7798, |
|
"eval_samples_per_second": 34.088, |
|
"eval_steps_per_second": 2.142, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 10990, |
|
"total_flos": 1.301428412334932e+19, |
|
"train_loss": 0.4412916103637251, |
|
"train_runtime": 10918.1842, |
|
"train_samples_per_second": 16.102, |
|
"train_steps_per_second": 1.007 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10990, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 1.301428412334932e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|