|
{ |
|
"best_metric": 0.5352830290794373, |
|
"best_model_checkpoint": "/media/mldrive/kcardenas/limb_classification_person_crop_seq/t2_8heads_1layers_2.5e-4lr/checkpoint-9050", |
|
"epoch": 25.0, |
|
"eval_steps": 500, |
|
"global_step": 9050, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.06906077348066299, |
|
"grad_norm": 546493.6875, |
|
"learning_rate": 2.5e-05, |
|
"loss": 1.3426, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.13812154696132597, |
|
"grad_norm": 1021909.8125, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0862, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.20718232044198895, |
|
"grad_norm": 331840.375, |
|
"learning_rate": 7.5e-05, |
|
"loss": 0.9961, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.27624309392265195, |
|
"grad_norm": 301711.5, |
|
"learning_rate": 0.0001, |
|
"loss": 0.9317, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3453038674033149, |
|
"grad_norm": 585924.875, |
|
"learning_rate": 0.000125, |
|
"loss": 1.0058, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4143646408839779, |
|
"grad_norm": 690722.75, |
|
"learning_rate": 0.00015, |
|
"loss": 0.9653, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.48342541436464087, |
|
"grad_norm": 464025.3125, |
|
"learning_rate": 0.000175, |
|
"loss": 1.0392, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5524861878453039, |
|
"grad_norm": 518616.53125, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9737, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6215469613259669, |
|
"grad_norm": 350802.71875, |
|
"learning_rate": 0.00022500000000000002, |
|
"loss": 0.9236, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.6906077348066298, |
|
"grad_norm": 973885.875, |
|
"learning_rate": 0.00025, |
|
"loss": 0.9832, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.7596685082872928, |
|
"grad_norm": 618290.5625, |
|
"learning_rate": 0.0002492897727272727, |
|
"loss": 1.0118, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.8287292817679558, |
|
"grad_norm": 326306.40625, |
|
"learning_rate": 0.00024857954545454543, |
|
"loss": 0.9599, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8977900552486188, |
|
"grad_norm": 371112.5625, |
|
"learning_rate": 0.0002478693181818182, |
|
"loss": 0.9315, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.9668508287292817, |
|
"grad_norm": 495571.71875, |
|
"learning_rate": 0.0002471590909090909, |
|
"loss": 0.8925, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7448680351906158, |
|
"eval_loss": 0.790420651435852, |
|
"eval_runtime": 42.055, |
|
"eval_samples_per_second": 24.325, |
|
"eval_steps_per_second": 1.522, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.0359116022099448, |
|
"grad_norm": 452897.09375, |
|
"learning_rate": 0.0002464488636363637, |
|
"loss": 0.8909, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.1049723756906078, |
|
"grad_norm": 355295.59375, |
|
"learning_rate": 0.0002457386363636364, |
|
"loss": 0.8213, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.1740331491712708, |
|
"grad_norm": 495158.0625, |
|
"learning_rate": 0.0002450284090909091, |
|
"loss": 0.9398, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.2430939226519337, |
|
"grad_norm": 370363.21875, |
|
"learning_rate": 0.0002443181818181818, |
|
"loss": 0.8851, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.3121546961325967, |
|
"grad_norm": 392437.6875, |
|
"learning_rate": 0.00024360795454545457, |
|
"loss": 0.8499, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.3812154696132597, |
|
"grad_norm": 439837.0625, |
|
"learning_rate": 0.00024289772727272726, |
|
"loss": 0.8963, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.4502762430939227, |
|
"grad_norm": 198029.296875, |
|
"learning_rate": 0.0002421875, |
|
"loss": 0.8455, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.5193370165745856, |
|
"grad_norm": 313499.625, |
|
"learning_rate": 0.00024147727272727274, |
|
"loss": 0.8612, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.5883977900552486, |
|
"grad_norm": 564825.8125, |
|
"learning_rate": 0.00024076704545454545, |
|
"loss": 0.9247, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.6574585635359116, |
|
"grad_norm": 376266.0, |
|
"learning_rate": 0.0002400568181818182, |
|
"loss": 0.8356, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.7265193370165746, |
|
"grad_norm": 337162.90625, |
|
"learning_rate": 0.00023934659090909093, |
|
"loss": 0.8917, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.7955801104972375, |
|
"grad_norm": 503179.65625, |
|
"learning_rate": 0.00023863636363636364, |
|
"loss": 0.8588, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.8646408839779005, |
|
"grad_norm": 256478.4375, |
|
"learning_rate": 0.00023792613636363635, |
|
"loss": 0.8356, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 1.9337016574585635, |
|
"grad_norm": 521165.65625, |
|
"learning_rate": 0.0002372159090909091, |
|
"loss": 0.858, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.7174975562072337, |
|
"eval_loss": 0.8537685871124268, |
|
"eval_runtime": 42.2046, |
|
"eval_samples_per_second": 24.239, |
|
"eval_steps_per_second": 1.516, |
|
"step": 724 |
|
}, |
|
{ |
|
"epoch": 2.0027624309392267, |
|
"grad_norm": 281006.9375, |
|
"learning_rate": 0.00023650568181818183, |
|
"loss": 0.879, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.0718232044198897, |
|
"grad_norm": 555881.125, |
|
"learning_rate": 0.00023579545454545457, |
|
"loss": 0.8786, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.1408839779005526, |
|
"grad_norm": 483952.3125, |
|
"learning_rate": 0.00023508522727272728, |
|
"loss": 0.7862, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.2099447513812156, |
|
"grad_norm": 216426.828125, |
|
"learning_rate": 0.000234375, |
|
"loss": 0.8047, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.2790055248618786, |
|
"grad_norm": 197438.46875, |
|
"learning_rate": 0.00023366477272727273, |
|
"loss": 0.8293, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.3480662983425415, |
|
"grad_norm": 494899.25, |
|
"learning_rate": 0.00023295454545454544, |
|
"loss": 0.8145, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.4171270718232045, |
|
"grad_norm": 270262.375, |
|
"learning_rate": 0.00023224431818181818, |
|
"loss": 0.8891, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.4861878453038675, |
|
"grad_norm": 405765.28125, |
|
"learning_rate": 0.00023153409090909092, |
|
"loss": 0.8854, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.5552486187845305, |
|
"grad_norm": 205706.828125, |
|
"learning_rate": 0.00023082386363636366, |
|
"loss": 0.7497, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 2.6243093922651934, |
|
"grad_norm": 376715.0625, |
|
"learning_rate": 0.00023011363636363637, |
|
"loss": 0.8717, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 2.6933701657458564, |
|
"grad_norm": 380497.1875, |
|
"learning_rate": 0.00022940340909090908, |
|
"loss": 0.8441, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 2.7624309392265194, |
|
"grad_norm": 426279.59375, |
|
"learning_rate": 0.00022869318181818182, |
|
"loss": 0.9413, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.8314917127071824, |
|
"grad_norm": 289505.75, |
|
"learning_rate": 0.00022798295454545456, |
|
"loss": 0.7789, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 2.9005524861878453, |
|
"grad_norm": 355856.84375, |
|
"learning_rate": 0.00022727272727272727, |
|
"loss": 0.8151, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.9696132596685083, |
|
"grad_norm": 264853.96875, |
|
"learning_rate": 0.0002265625, |
|
"loss": 0.8091, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7018572825024438, |
|
"eval_loss": 0.9682632684707642, |
|
"eval_runtime": 42.1565, |
|
"eval_samples_per_second": 24.267, |
|
"eval_steps_per_second": 1.518, |
|
"step": 1086 |
|
}, |
|
{ |
|
"epoch": 3.0386740331491713, |
|
"grad_norm": 379952.03125, |
|
"learning_rate": 0.00022585227272727275, |
|
"loss": 0.9232, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.1077348066298343, |
|
"grad_norm": 296822.28125, |
|
"learning_rate": 0.00022514204545454544, |
|
"loss": 0.7854, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 3.1767955801104972, |
|
"grad_norm": 199548.140625, |
|
"learning_rate": 0.00022443181818181817, |
|
"loss": 0.777, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.24585635359116, |
|
"grad_norm": 341872.78125, |
|
"learning_rate": 0.0002237215909090909, |
|
"loss": 0.8094, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 3.314917127071823, |
|
"grad_norm": 743207.6875, |
|
"learning_rate": 0.00022301136363636365, |
|
"loss": 0.8712, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 3.383977900552486, |
|
"grad_norm": 460510.03125, |
|
"learning_rate": 0.00022230113636363636, |
|
"loss": 0.8482, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 3.453038674033149, |
|
"grad_norm": 312884.0625, |
|
"learning_rate": 0.0002215909090909091, |
|
"loss": 0.7549, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 3.522099447513812, |
|
"grad_norm": 169966.15625, |
|
"learning_rate": 0.00022088068181818181, |
|
"loss": 0.7376, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 3.591160220994475, |
|
"grad_norm": 409671.96875, |
|
"learning_rate": 0.00022017045454545455, |
|
"loss": 0.8223, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 3.660220994475138, |
|
"grad_norm": 196511.53125, |
|
"learning_rate": 0.00021946022727272727, |
|
"loss": 0.8005, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 3.729281767955801, |
|
"grad_norm": 235688.984375, |
|
"learning_rate": 0.00021875, |
|
"loss": 0.7482, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 3.798342541436464, |
|
"grad_norm": 321537.4375, |
|
"learning_rate": 0.00021803977272727274, |
|
"loss": 0.7621, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 3.867403314917127, |
|
"grad_norm": 578654.1875, |
|
"learning_rate": 0.00021732954545454546, |
|
"loss": 0.8879, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.93646408839779, |
|
"grad_norm": 213948.453125, |
|
"learning_rate": 0.0002166193181818182, |
|
"loss": 0.7739, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7956989247311828, |
|
"eval_loss": 0.740921139717102, |
|
"eval_runtime": 42.4931, |
|
"eval_samples_per_second": 24.075, |
|
"eval_steps_per_second": 1.506, |
|
"step": 1448 |
|
}, |
|
{ |
|
"epoch": 4.005524861878453, |
|
"grad_norm": 290470.71875, |
|
"learning_rate": 0.0002159090909090909, |
|
"loss": 0.7391, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.074585635359116, |
|
"grad_norm": 210350.8125, |
|
"learning_rate": 0.00021519886363636365, |
|
"loss": 0.7517, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 4.143646408839779, |
|
"grad_norm": 432410.59375, |
|
"learning_rate": 0.00021448863636363636, |
|
"loss": 0.8225, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 4.212707182320442, |
|
"grad_norm": 346385.28125, |
|
"learning_rate": 0.0002137784090909091, |
|
"loss": 0.8012, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 4.281767955801105, |
|
"grad_norm": 377627.84375, |
|
"learning_rate": 0.00021306818181818183, |
|
"loss": 0.7995, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 4.350828729281768, |
|
"grad_norm": 266334.875, |
|
"learning_rate": 0.00021235795454545457, |
|
"loss": 0.7491, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 4.419889502762431, |
|
"grad_norm": 399612.75, |
|
"learning_rate": 0.00021164772727272726, |
|
"loss": 0.8616, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 4.488950276243094, |
|
"grad_norm": 201637.8125, |
|
"learning_rate": 0.0002109375, |
|
"loss": 0.7697, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 4.558011049723757, |
|
"grad_norm": 583351.3125, |
|
"learning_rate": 0.00021022727272727274, |
|
"loss": 0.7404, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 4.62707182320442, |
|
"grad_norm": 423896.03125, |
|
"learning_rate": 0.00020951704545454545, |
|
"loss": 0.7651, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 4.696132596685083, |
|
"grad_norm": 166796.078125, |
|
"learning_rate": 0.0002088068181818182, |
|
"loss": 0.7299, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 4.765193370165746, |
|
"grad_norm": 186467.21875, |
|
"learning_rate": 0.00020809659090909093, |
|
"loss": 0.7226, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 4.834254143646409, |
|
"grad_norm": 292986.375, |
|
"learning_rate": 0.00020738636363636364, |
|
"loss": 0.7822, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 4.903314917127072, |
|
"grad_norm": 232852.234375, |
|
"learning_rate": 0.00020667613636363635, |
|
"loss": 0.8355, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 4.972375690607735, |
|
"grad_norm": 240884.328125, |
|
"learning_rate": 0.0002059659090909091, |
|
"loss": 0.7614, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.8035190615835777, |
|
"eval_loss": 0.7167670130729675, |
|
"eval_runtime": 42.3724, |
|
"eval_samples_per_second": 24.143, |
|
"eval_steps_per_second": 1.51, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 5.041436464088398, |
|
"grad_norm": 405361.84375, |
|
"learning_rate": 0.00020525568181818183, |
|
"loss": 0.8033, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 5.110497237569061, |
|
"grad_norm": 380140.96875, |
|
"learning_rate": 0.00020454545454545457, |
|
"loss": 0.8198, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 5.179558011049724, |
|
"grad_norm": 418996.78125, |
|
"learning_rate": 0.00020383522727272728, |
|
"loss": 0.761, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 5.248618784530387, |
|
"grad_norm": 281704.5, |
|
"learning_rate": 0.00020312500000000002, |
|
"loss": 0.7772, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 5.31767955801105, |
|
"grad_norm": 282670.625, |
|
"learning_rate": 0.00020241477272727273, |
|
"loss": 0.6764, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 5.386740331491713, |
|
"grad_norm": 476126.5, |
|
"learning_rate": 0.00020170454545454544, |
|
"loss": 0.8089, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 5.455801104972376, |
|
"grad_norm": 450313.03125, |
|
"learning_rate": 0.00020099431818181818, |
|
"loss": 0.786, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 5.524861878453039, |
|
"grad_norm": 653649.3125, |
|
"learning_rate": 0.00020028409090909092, |
|
"loss": 0.7067, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 5.593922651933702, |
|
"grad_norm": 285791.28125, |
|
"learning_rate": 0.00019957386363636366, |
|
"loss": 0.7857, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 5.662983425414365, |
|
"grad_norm": 256986.796875, |
|
"learning_rate": 0.00019886363636363637, |
|
"loss": 0.7325, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 5.732044198895028, |
|
"grad_norm": 454397.375, |
|
"learning_rate": 0.00019815340909090908, |
|
"loss": 0.7978, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 5.801104972375691, |
|
"grad_norm": 252106.578125, |
|
"learning_rate": 0.00019744318181818182, |
|
"loss": 0.7344, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 5.870165745856354, |
|
"grad_norm": 418520.96875, |
|
"learning_rate": 0.00019673295454545456, |
|
"loss": 0.7462, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 5.939226519337017, |
|
"grad_norm": 454961.09375, |
|
"learning_rate": 0.00019602272727272727, |
|
"loss": 0.8067, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.8005865102639296, |
|
"eval_loss": 0.7569773197174072, |
|
"eval_runtime": 42.2742, |
|
"eval_samples_per_second": 24.199, |
|
"eval_steps_per_second": 1.514, |
|
"step": 2172 |
|
}, |
|
{ |
|
"epoch": 6.00828729281768, |
|
"grad_norm": 372438.28125, |
|
"learning_rate": 0.0001953125, |
|
"loss": 0.7101, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 6.077348066298343, |
|
"grad_norm": 144956.078125, |
|
"learning_rate": 0.00019460227272727275, |
|
"loss": 0.7648, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 6.1464088397790055, |
|
"grad_norm": 241067.53125, |
|
"learning_rate": 0.00019389204545454543, |
|
"loss": 0.7331, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 6.2154696132596685, |
|
"grad_norm": 199682.640625, |
|
"learning_rate": 0.00019318181818181817, |
|
"loss": 0.7246, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 6.2845303867403315, |
|
"grad_norm": 226706.40625, |
|
"learning_rate": 0.0001924715909090909, |
|
"loss": 0.7489, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 6.3535911602209945, |
|
"grad_norm": 252510.46875, |
|
"learning_rate": 0.00019176136363636365, |
|
"loss": 0.7618, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 6.422651933701657, |
|
"grad_norm": 591994.1875, |
|
"learning_rate": 0.00019105113636363636, |
|
"loss": 0.7302, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 6.49171270718232, |
|
"grad_norm": 231069.296875, |
|
"learning_rate": 0.0001903409090909091, |
|
"loss": 0.7773, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 6.560773480662983, |
|
"grad_norm": 228927.859375, |
|
"learning_rate": 0.00018963068181818181, |
|
"loss": 0.7117, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 6.629834254143646, |
|
"grad_norm": 238666.609375, |
|
"learning_rate": 0.00018892045454545455, |
|
"loss": 0.7709, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 6.698895027624309, |
|
"grad_norm": 165791.421875, |
|
"learning_rate": 0.00018821022727272726, |
|
"loss": 0.7352, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 6.767955801104972, |
|
"grad_norm": 239828.046875, |
|
"learning_rate": 0.0001875, |
|
"loss": 0.7527, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 6.837016574585635, |
|
"grad_norm": 156010.734375, |
|
"learning_rate": 0.00018678977272727274, |
|
"loss": 0.6658, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 6.906077348066298, |
|
"grad_norm": 347614.625, |
|
"learning_rate": 0.00018607954545454545, |
|
"loss": 0.7481, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 6.975138121546961, |
|
"grad_norm": 238463.296875, |
|
"learning_rate": 0.0001853693181818182, |
|
"loss": 0.749, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7634408602150538, |
|
"eval_loss": 0.8141921758651733, |
|
"eval_runtime": 42.3662, |
|
"eval_samples_per_second": 24.147, |
|
"eval_steps_per_second": 1.511, |
|
"step": 2534 |
|
}, |
|
{ |
|
"epoch": 7.044198895027624, |
|
"grad_norm": 294229.125, |
|
"learning_rate": 0.0001846590909090909, |
|
"loss": 0.6796, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 7.113259668508287, |
|
"grad_norm": 202620.265625, |
|
"learning_rate": 0.00018394886363636364, |
|
"loss": 0.6318, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 7.18232044198895, |
|
"grad_norm": 318938.96875, |
|
"learning_rate": 0.00018323863636363636, |
|
"loss": 0.7313, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 7.251381215469613, |
|
"grad_norm": 549962.75, |
|
"learning_rate": 0.0001825284090909091, |
|
"loss": 0.7158, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 7.320441988950276, |
|
"grad_norm": 456374.625, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.6857, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 7.389502762430939, |
|
"grad_norm": 448997.71875, |
|
"learning_rate": 0.00018110795454545457, |
|
"loss": 0.7868, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 7.458563535911602, |
|
"grad_norm": 553743.1875, |
|
"learning_rate": 0.00018039772727272726, |
|
"loss": 0.7701, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 7.527624309392265, |
|
"grad_norm": 164408.890625, |
|
"learning_rate": 0.0001796875, |
|
"loss": 0.7025, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 7.596685082872928, |
|
"grad_norm": 390214.28125, |
|
"learning_rate": 0.00017897727272727274, |
|
"loss": 0.7381, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 7.665745856353591, |
|
"grad_norm": 340693.0, |
|
"learning_rate": 0.00017826704545454545, |
|
"loss": 0.7677, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 7.734806629834254, |
|
"grad_norm": 210816.5625, |
|
"learning_rate": 0.0001775568181818182, |
|
"loss": 0.7318, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 7.803867403314917, |
|
"grad_norm": 256154.375, |
|
"learning_rate": 0.00017684659090909093, |
|
"loss": 0.7017, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 7.87292817679558, |
|
"grad_norm": 410985.0, |
|
"learning_rate": 0.00017613636363636364, |
|
"loss": 0.6981, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 7.941988950276243, |
|
"grad_norm": 328773.6875, |
|
"learning_rate": 0.00017542613636363635, |
|
"loss": 0.7555, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.8318670576735093, |
|
"eval_loss": 0.6704066395759583, |
|
"eval_runtime": 42.485, |
|
"eval_samples_per_second": 24.079, |
|
"eval_steps_per_second": 1.506, |
|
"step": 2896 |
|
}, |
|
{ |
|
"epoch": 8.011049723756907, |
|
"grad_norm": 214139.25, |
|
"learning_rate": 0.0001747159090909091, |
|
"loss": 0.7334, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 8.08011049723757, |
|
"grad_norm": 297389.03125, |
|
"learning_rate": 0.00017400568181818183, |
|
"loss": 0.7099, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 8.149171270718233, |
|
"grad_norm": 232340.171875, |
|
"learning_rate": 0.00017329545454545457, |
|
"loss": 0.6811, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 8.218232044198896, |
|
"grad_norm": 321969.25, |
|
"learning_rate": 0.00017258522727272728, |
|
"loss": 0.7802, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 8.287292817679559, |
|
"grad_norm": 367906.875, |
|
"learning_rate": 0.000171875, |
|
"loss": 0.6819, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 8.356353591160222, |
|
"grad_norm": 235728.671875, |
|
"learning_rate": 0.00017116477272727273, |
|
"loss": 0.6979, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 8.425414364640885, |
|
"grad_norm": 404114.96875, |
|
"learning_rate": 0.00017045454545454544, |
|
"loss": 0.7285, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 8.494475138121548, |
|
"grad_norm": 251749.625, |
|
"learning_rate": 0.00016974431818181818, |
|
"loss": 0.7039, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 8.56353591160221, |
|
"grad_norm": 400764.6875, |
|
"learning_rate": 0.00016903409090909092, |
|
"loss": 0.6598, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 8.632596685082873, |
|
"grad_norm": 417687.65625, |
|
"learning_rate": 0.00016832386363636366, |
|
"loss": 0.682, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 8.701657458563536, |
|
"grad_norm": 217835.28125, |
|
"learning_rate": 0.00016761363636363637, |
|
"loss": 0.7315, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 8.7707182320442, |
|
"grad_norm": 414153.71875, |
|
"learning_rate": 0.00016690340909090908, |
|
"loss": 0.7044, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 8.839779005524862, |
|
"grad_norm": 318105.0, |
|
"learning_rate": 0.00016619318181818182, |
|
"loss": 0.6636, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 8.908839779005525, |
|
"grad_norm": 259801.65625, |
|
"learning_rate": 0.00016548295454545456, |
|
"loss": 0.7535, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 8.977900552486188, |
|
"grad_norm": 345877.375, |
|
"learning_rate": 0.00016477272727272727, |
|
"loss": 0.6637, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.823069403714565, |
|
"eval_loss": 0.681505560874939, |
|
"eval_runtime": 42.4292, |
|
"eval_samples_per_second": 24.111, |
|
"eval_steps_per_second": 1.508, |
|
"step": 3258 |
|
}, |
|
{ |
|
"epoch": 9.046961325966851, |
|
"grad_norm": 440922.0, |
|
"learning_rate": 0.0001640625, |
|
"loss": 0.6217, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 9.116022099447514, |
|
"grad_norm": 172642.34375, |
|
"learning_rate": 0.00016335227272727275, |
|
"loss": 0.6543, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 9.185082872928177, |
|
"grad_norm": 305327.90625, |
|
"learning_rate": 0.00016264204545454543, |
|
"loss": 0.6961, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 9.25414364640884, |
|
"grad_norm": 265570.0625, |
|
"learning_rate": 0.00016193181818181817, |
|
"loss": 0.6666, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 9.323204419889503, |
|
"grad_norm": 287697.3125, |
|
"learning_rate": 0.0001612215909090909, |
|
"loss": 0.691, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 9.392265193370166, |
|
"grad_norm": 336017.90625, |
|
"learning_rate": 0.00016051136363636365, |
|
"loss": 0.6883, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 9.46132596685083, |
|
"grad_norm": 230630.921875, |
|
"learning_rate": 0.00015980113636363636, |
|
"loss": 0.7177, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 9.530386740331492, |
|
"grad_norm": 322946.15625, |
|
"learning_rate": 0.0001590909090909091, |
|
"loss": 0.6431, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 9.599447513812155, |
|
"grad_norm": 577997.375, |
|
"learning_rate": 0.0001583806818181818, |
|
"loss": 0.6677, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 9.668508287292818, |
|
"grad_norm": 191955.53125, |
|
"learning_rate": 0.00015767045454545455, |
|
"loss": 0.7274, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 9.737569060773481, |
|
"grad_norm": 276779.125, |
|
"learning_rate": 0.00015696022727272726, |
|
"loss": 0.6441, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 9.806629834254144, |
|
"grad_norm": 576859.0, |
|
"learning_rate": 0.00015625, |
|
"loss": 0.6688, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 9.875690607734807, |
|
"grad_norm": 494593.40625, |
|
"learning_rate": 0.00015553977272727274, |
|
"loss": 0.785, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 9.94475138121547, |
|
"grad_norm": 309429.8125, |
|
"learning_rate": 0.00015482954545454545, |
|
"loss": 0.6502, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.8543499511241447, |
|
"eval_loss": 0.6418806910514832, |
|
"eval_runtime": 42.5916, |
|
"eval_samples_per_second": 24.019, |
|
"eval_steps_per_second": 1.503, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 10.013812154696133, |
|
"grad_norm": 345612.75, |
|
"learning_rate": 0.0001541193181818182, |
|
"loss": 0.7162, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 10.082872928176796, |
|
"grad_norm": 304771.84375, |
|
"learning_rate": 0.0001534090909090909, |
|
"loss": 0.6017, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 10.151933701657459, |
|
"grad_norm": 235138.234375, |
|
"learning_rate": 0.00015269886363636364, |
|
"loss": 0.6122, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 10.220994475138122, |
|
"grad_norm": 170193.25, |
|
"learning_rate": 0.00015198863636363636, |
|
"loss": 0.6785, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 10.290055248618785, |
|
"grad_norm": 195562.453125, |
|
"learning_rate": 0.0001512784090909091, |
|
"loss": 0.6081, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 10.359116022099448, |
|
"grad_norm": 411620.40625, |
|
"learning_rate": 0.00015056818181818183, |
|
"loss": 0.657, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 10.42817679558011, |
|
"grad_norm": 255837.109375, |
|
"learning_rate": 0.00014985795454545457, |
|
"loss": 0.6439, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 10.497237569060774, |
|
"grad_norm": 274584.53125, |
|
"learning_rate": 0.00014914772727272726, |
|
"loss": 0.7706, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 10.566298342541437, |
|
"grad_norm": 200431.109375, |
|
"learning_rate": 0.0001484375, |
|
"loss": 0.7256, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 10.6353591160221, |
|
"grad_norm": 293831.5625, |
|
"learning_rate": 0.00014772727272727274, |
|
"loss": 0.6941, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 10.704419889502763, |
|
"grad_norm": 343135.21875, |
|
"learning_rate": 0.00014701704545454545, |
|
"loss": 0.6216, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 10.773480662983426, |
|
"grad_norm": 268533.625, |
|
"learning_rate": 0.00014630681818181819, |
|
"loss": 0.6195, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 10.842541436464089, |
|
"grad_norm": 184908.40625, |
|
"learning_rate": 0.00014559659090909093, |
|
"loss": 0.7079, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 10.911602209944752, |
|
"grad_norm": 222204.625, |
|
"learning_rate": 0.00014488636363636364, |
|
"loss": 0.6653, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 10.980662983425415, |
|
"grad_norm": 226897.578125, |
|
"learning_rate": 0.00014417613636363635, |
|
"loss": 0.6432, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.8504398826979472, |
|
"eval_loss": 0.651556134223938, |
|
"eval_runtime": 41.7934, |
|
"eval_samples_per_second": 24.478, |
|
"eval_steps_per_second": 1.531, |
|
"step": 3982 |
|
}, |
|
{ |
|
"epoch": 11.049723756906078, |
|
"grad_norm": 495195.5, |
|
"learning_rate": 0.0001434659090909091, |
|
"loss": 0.6866, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 11.11878453038674, |
|
"grad_norm": 341327.34375, |
|
"learning_rate": 0.00014275568181818183, |
|
"loss": 0.6468, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 11.187845303867404, |
|
"grad_norm": 361602.59375, |
|
"learning_rate": 0.00014204545454545457, |
|
"loss": 0.6404, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 11.256906077348066, |
|
"grad_norm": 341543.6875, |
|
"learning_rate": 0.00014133522727272728, |
|
"loss": 0.6526, |
|
"step": 4075 |
|
}, |
|
{ |
|
"epoch": 11.32596685082873, |
|
"grad_norm": 158669.609375, |
|
"learning_rate": 0.00014062500000000002, |
|
"loss": 0.6858, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 11.395027624309392, |
|
"grad_norm": 440959.625, |
|
"learning_rate": 0.00013991477272727273, |
|
"loss": 0.7111, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 11.464088397790055, |
|
"grad_norm": 284844.53125, |
|
"learning_rate": 0.00013920454545454544, |
|
"loss": 0.6824, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 11.533149171270718, |
|
"grad_norm": 172543.8125, |
|
"learning_rate": 0.00013849431818181818, |
|
"loss": 0.645, |
|
"step": 4175 |
|
}, |
|
{ |
|
"epoch": 11.602209944751381, |
|
"grad_norm": 336567.6875, |
|
"learning_rate": 0.00013778409090909092, |
|
"loss": 0.5683, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 11.671270718232044, |
|
"grad_norm": 207626.125, |
|
"learning_rate": 0.00013707386363636366, |
|
"loss": 0.5553, |
|
"step": 4225 |
|
}, |
|
{ |
|
"epoch": 11.740331491712707, |
|
"grad_norm": 273981.8125, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 0.6666, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 11.80939226519337, |
|
"grad_norm": 248155.4375, |
|
"learning_rate": 0.00013565340909090908, |
|
"loss": 0.65, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 11.878453038674033, |
|
"grad_norm": 210834.28125, |
|
"learning_rate": 0.00013494318181818182, |
|
"loss": 0.6554, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 11.947513812154696, |
|
"grad_norm": 264953.0625, |
|
"learning_rate": 0.00013423295454545456, |
|
"loss": 0.6431, |
|
"step": 4325 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.863147605083089, |
|
"eval_loss": 0.61189866065979, |
|
"eval_runtime": 42.1136, |
|
"eval_samples_per_second": 24.291, |
|
"eval_steps_per_second": 1.52, |
|
"step": 4344 |
|
}, |
|
{ |
|
"epoch": 12.01657458563536, |
|
"grad_norm": 302859.9375, |
|
"learning_rate": 0.00013352272727272727, |
|
"loss": 0.6218, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 12.085635359116022, |
|
"grad_norm": 366192.0625, |
|
"learning_rate": 0.0001328125, |
|
"loss": 0.6251, |
|
"step": 4375 |
|
}, |
|
{ |
|
"epoch": 12.154696132596685, |
|
"grad_norm": 209801.171875, |
|
"learning_rate": 0.00013210227272727275, |
|
"loss": 0.6082, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 12.223756906077348, |
|
"grad_norm": 364777.40625, |
|
"learning_rate": 0.00013139204545454543, |
|
"loss": 0.5764, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 12.292817679558011, |
|
"grad_norm": 574424.125, |
|
"learning_rate": 0.00013068181818181817, |
|
"loss": 0.6261, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 12.361878453038674, |
|
"grad_norm": 468495.6875, |
|
"learning_rate": 0.0001299715909090909, |
|
"loss": 0.6322, |
|
"step": 4475 |
|
}, |
|
{ |
|
"epoch": 12.430939226519337, |
|
"grad_norm": 161215.09375, |
|
"learning_rate": 0.00012926136363636365, |
|
"loss": 0.6585, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 12.5, |
|
"grad_norm": 318936.53125, |
|
"learning_rate": 0.00012855113636363636, |
|
"loss": 0.6615, |
|
"step": 4525 |
|
}, |
|
{ |
|
"epoch": 12.569060773480663, |
|
"grad_norm": 339025.34375, |
|
"learning_rate": 0.0001278409090909091, |
|
"loss": 0.5965, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 12.638121546961326, |
|
"grad_norm": 194275.75, |
|
"learning_rate": 0.0001271306818181818, |
|
"loss": 0.63, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 12.707182320441989, |
|
"grad_norm": 230095.640625, |
|
"learning_rate": 0.00012642045454545455, |
|
"loss": 0.5895, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 12.776243093922652, |
|
"grad_norm": 516592.5, |
|
"learning_rate": 0.00012571022727272726, |
|
"loss": 0.6961, |
|
"step": 4625 |
|
}, |
|
{ |
|
"epoch": 12.845303867403315, |
|
"grad_norm": 301114.0625, |
|
"learning_rate": 0.000125, |
|
"loss": 0.6588, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 12.914364640883978, |
|
"grad_norm": 376623.71875, |
|
"learning_rate": 0.00012428977272727271, |
|
"loss": 0.5894, |
|
"step": 4675 |
|
}, |
|
{ |
|
"epoch": 12.98342541436464, |
|
"grad_norm": 211816.203125, |
|
"learning_rate": 0.00012357954545454545, |
|
"loss": 0.6504, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.863147605083089, |
|
"eval_loss": 0.5947189331054688, |
|
"eval_runtime": 42.4848, |
|
"eval_samples_per_second": 24.079, |
|
"eval_steps_per_second": 1.506, |
|
"step": 4706 |
|
}, |
|
{ |
|
"epoch": 13.052486187845304, |
|
"grad_norm": 472150.125, |
|
"learning_rate": 0.0001228693181818182, |
|
"loss": 0.6, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 13.121546961325967, |
|
"grad_norm": 361846.28125, |
|
"learning_rate": 0.0001221590909090909, |
|
"loss": 0.5998, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 13.19060773480663, |
|
"grad_norm": 425699.9375, |
|
"learning_rate": 0.00012144886363636363, |
|
"loss": 0.6615, |
|
"step": 4775 |
|
}, |
|
{ |
|
"epoch": 13.259668508287293, |
|
"grad_norm": 467732.625, |
|
"learning_rate": 0.00012073863636363637, |
|
"loss": 0.6446, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 13.328729281767956, |
|
"grad_norm": 338163.0625, |
|
"learning_rate": 0.0001200284090909091, |
|
"loss": 0.6463, |
|
"step": 4825 |
|
}, |
|
{ |
|
"epoch": 13.397790055248619, |
|
"grad_norm": 263720.1875, |
|
"learning_rate": 0.00011931818181818182, |
|
"loss": 0.6007, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 13.466850828729282, |
|
"grad_norm": 399349.40625, |
|
"learning_rate": 0.00011860795454545454, |
|
"loss": 0.6425, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 13.535911602209945, |
|
"grad_norm": 253915.4375, |
|
"learning_rate": 0.00011789772727272728, |
|
"loss": 0.6258, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 13.604972375690608, |
|
"grad_norm": 358377.46875, |
|
"learning_rate": 0.0001171875, |
|
"loss": 0.632, |
|
"step": 4925 |
|
}, |
|
{ |
|
"epoch": 13.67403314917127, |
|
"grad_norm": 341063.78125, |
|
"learning_rate": 0.00011647727272727272, |
|
"loss": 0.6586, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 13.743093922651934, |
|
"grad_norm": 637828.4375, |
|
"learning_rate": 0.00011576704545454546, |
|
"loss": 0.6253, |
|
"step": 4975 |
|
}, |
|
{ |
|
"epoch": 13.812154696132596, |
|
"grad_norm": 435619.34375, |
|
"learning_rate": 0.00011505681818181819, |
|
"loss": 0.6197, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 13.88121546961326, |
|
"grad_norm": 298846.5, |
|
"learning_rate": 0.00011434659090909091, |
|
"loss": 0.5851, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 13.950276243093922, |
|
"grad_norm": 243213.671875, |
|
"learning_rate": 0.00011363636363636364, |
|
"loss": 0.5558, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.8602150537634409, |
|
"eval_loss": 0.5987916588783264, |
|
"eval_runtime": 42.7136, |
|
"eval_samples_per_second": 23.95, |
|
"eval_steps_per_second": 1.498, |
|
"step": 5068 |
|
}, |
|
{ |
|
"epoch": 14.019337016574585, |
|
"grad_norm": 294237.46875, |
|
"learning_rate": 0.00011292613636363638, |
|
"loss": 0.5404, |
|
"step": 5075 |
|
}, |
|
{ |
|
"epoch": 14.088397790055248, |
|
"grad_norm": 316393.09375, |
|
"learning_rate": 0.00011221590909090909, |
|
"loss": 0.5825, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 14.157458563535911, |
|
"grad_norm": 557254.6875, |
|
"learning_rate": 0.00011150568181818183, |
|
"loss": 0.5979, |
|
"step": 5125 |
|
}, |
|
{ |
|
"epoch": 14.226519337016574, |
|
"grad_norm": 465808.96875, |
|
"learning_rate": 0.00011079545454545455, |
|
"loss": 0.6134, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 14.295580110497237, |
|
"grad_norm": 289574.84375, |
|
"learning_rate": 0.00011008522727272728, |
|
"loss": 0.5898, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 14.3646408839779, |
|
"grad_norm": 181121.796875, |
|
"learning_rate": 0.000109375, |
|
"loss": 0.668, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 14.433701657458563, |
|
"grad_norm": 227056.890625, |
|
"learning_rate": 0.00010866477272727273, |
|
"loss": 0.5981, |
|
"step": 5225 |
|
}, |
|
{ |
|
"epoch": 14.502762430939226, |
|
"grad_norm": 309664.21875, |
|
"learning_rate": 0.00010795454545454545, |
|
"loss": 0.5788, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 14.57182320441989, |
|
"grad_norm": 265956.34375, |
|
"learning_rate": 0.00010724431818181818, |
|
"loss": 0.6353, |
|
"step": 5275 |
|
}, |
|
{ |
|
"epoch": 14.640883977900552, |
|
"grad_norm": 453111.28125, |
|
"learning_rate": 0.00010653409090909092, |
|
"loss": 0.6019, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 14.709944751381215, |
|
"grad_norm": 278395.90625, |
|
"learning_rate": 0.00010582386363636363, |
|
"loss": 0.6407, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 14.779005524861878, |
|
"grad_norm": 327899.15625, |
|
"learning_rate": 0.00010511363636363637, |
|
"loss": 0.5937, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 14.848066298342541, |
|
"grad_norm": 325341.34375, |
|
"learning_rate": 0.0001044034090909091, |
|
"loss": 0.5691, |
|
"step": 5375 |
|
}, |
|
{ |
|
"epoch": 14.917127071823204, |
|
"grad_norm": 462184.03125, |
|
"learning_rate": 0.00010369318181818182, |
|
"loss": 0.572, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 14.986187845303867, |
|
"grad_norm": 304309.71875, |
|
"learning_rate": 0.00010298295454545454, |
|
"loss": 0.53, |
|
"step": 5425 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.8699902248289345, |
|
"eval_loss": 0.5832861065864563, |
|
"eval_runtime": 42.6054, |
|
"eval_samples_per_second": 24.011, |
|
"eval_steps_per_second": 1.502, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 15.05524861878453, |
|
"grad_norm": 716567.1875, |
|
"learning_rate": 0.00010227272727272728, |
|
"loss": 0.6312, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 15.124309392265193, |
|
"grad_norm": 259661.140625, |
|
"learning_rate": 0.00010156250000000001, |
|
"loss": 0.619, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 15.193370165745856, |
|
"grad_norm": 255588.5, |
|
"learning_rate": 0.00010085227272727272, |
|
"loss": 0.5662, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 15.262430939226519, |
|
"grad_norm": 232239.296875, |
|
"learning_rate": 0.00010014204545454546, |
|
"loss": 0.5648, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 15.331491712707182, |
|
"grad_norm": 222320.40625, |
|
"learning_rate": 9.943181818181819e-05, |
|
"loss": 0.5947, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 15.400552486187845, |
|
"grad_norm": 336205.4375, |
|
"learning_rate": 9.872159090909091e-05, |
|
"loss": 0.5688, |
|
"step": 5575 |
|
}, |
|
{ |
|
"epoch": 15.469613259668508, |
|
"grad_norm": 257701.59375, |
|
"learning_rate": 9.801136363636364e-05, |
|
"loss": 0.5786, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 15.53867403314917, |
|
"grad_norm": 263016.90625, |
|
"learning_rate": 9.730113636363637e-05, |
|
"loss": 0.6234, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 15.607734806629834, |
|
"grad_norm": 393092.21875, |
|
"learning_rate": 9.659090909090909e-05, |
|
"loss": 0.6204, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 15.676795580110497, |
|
"grad_norm": 290087.03125, |
|
"learning_rate": 9.588068181818183e-05, |
|
"loss": 0.5572, |
|
"step": 5675 |
|
}, |
|
{ |
|
"epoch": 15.74585635359116, |
|
"grad_norm": 342712.75, |
|
"learning_rate": 9.517045454545455e-05, |
|
"loss": 0.5205, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 15.814917127071823, |
|
"grad_norm": 285120.03125, |
|
"learning_rate": 9.446022727272728e-05, |
|
"loss": 0.6048, |
|
"step": 5725 |
|
}, |
|
{ |
|
"epoch": 15.883977900552486, |
|
"grad_norm": 387243.15625, |
|
"learning_rate": 9.375e-05, |
|
"loss": 0.5143, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 15.953038674033149, |
|
"grad_norm": 515102.03125, |
|
"learning_rate": 9.303977272727273e-05, |
|
"loss": 0.6418, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.8709677419354839, |
|
"eval_loss": 0.6134275197982788, |
|
"eval_runtime": 42.5239, |
|
"eval_samples_per_second": 24.057, |
|
"eval_steps_per_second": 1.505, |
|
"step": 5792 |
|
}, |
|
{ |
|
"epoch": 16.022099447513813, |
|
"grad_norm": 508912.78125, |
|
"learning_rate": 9.232954545454545e-05, |
|
"loss": 0.5368, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 16.091160220994475, |
|
"grad_norm": 220953.15625, |
|
"learning_rate": 9.161931818181818e-05, |
|
"loss": 0.568, |
|
"step": 5825 |
|
}, |
|
{ |
|
"epoch": 16.16022099447514, |
|
"grad_norm": 396922.8125, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 0.565, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 16.2292817679558, |
|
"grad_norm": 312722.0625, |
|
"learning_rate": 9.019886363636363e-05, |
|
"loss": 0.6043, |
|
"step": 5875 |
|
}, |
|
{ |
|
"epoch": 16.298342541436465, |
|
"grad_norm": 284121.65625, |
|
"learning_rate": 8.948863636363637e-05, |
|
"loss": 0.5914, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 16.367403314917127, |
|
"grad_norm": 418331.65625, |
|
"learning_rate": 8.87784090909091e-05, |
|
"loss": 0.5655, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 16.43646408839779, |
|
"grad_norm": 418501.40625, |
|
"learning_rate": 8.806818181818182e-05, |
|
"loss": 0.5379, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 16.505524861878452, |
|
"grad_norm": 374238.375, |
|
"learning_rate": 8.735795454545454e-05, |
|
"loss": 0.5807, |
|
"step": 5975 |
|
}, |
|
{ |
|
"epoch": 16.574585635359117, |
|
"grad_norm": 282150.65625, |
|
"learning_rate": 8.664772727272728e-05, |
|
"loss": 0.5928, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 16.64364640883978, |
|
"grad_norm": 508603.71875, |
|
"learning_rate": 8.59375e-05, |
|
"loss": 0.5235, |
|
"step": 6025 |
|
}, |
|
{ |
|
"epoch": 16.712707182320443, |
|
"grad_norm": 69492.65625, |
|
"learning_rate": 8.522727272727272e-05, |
|
"loss": 0.5403, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 16.781767955801104, |
|
"grad_norm": 455852.34375, |
|
"learning_rate": 8.451704545454546e-05, |
|
"loss": 0.5742, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 16.85082872928177, |
|
"grad_norm": 240280.28125, |
|
"learning_rate": 8.380681818181818e-05, |
|
"loss": 0.609, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 16.91988950276243, |
|
"grad_norm": 231682.703125, |
|
"learning_rate": 8.309659090909091e-05, |
|
"loss": 0.5812, |
|
"step": 6125 |
|
}, |
|
{ |
|
"epoch": 16.988950276243095, |
|
"grad_norm": 355277.6875, |
|
"learning_rate": 8.238636363636364e-05, |
|
"loss": 0.5703, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.8699902248289345, |
|
"eval_loss": 0.6182804107666016, |
|
"eval_runtime": 42.0719, |
|
"eval_samples_per_second": 24.316, |
|
"eval_steps_per_second": 1.521, |
|
"step": 6154 |
|
}, |
|
{ |
|
"epoch": 17.058011049723756, |
|
"grad_norm": 236222.328125, |
|
"learning_rate": 8.167613636363637e-05, |
|
"loss": 0.5618, |
|
"step": 6175 |
|
}, |
|
{ |
|
"epoch": 17.12707182320442, |
|
"grad_norm": 447430.8125, |
|
"learning_rate": 8.096590909090909e-05, |
|
"loss": 0.5555, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 17.196132596685082, |
|
"grad_norm": 351022.90625, |
|
"learning_rate": 8.025568181818183e-05, |
|
"loss": 0.5803, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 17.265193370165747, |
|
"grad_norm": 324263.125, |
|
"learning_rate": 7.954545454545455e-05, |
|
"loss": 0.5047, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 17.334254143646408, |
|
"grad_norm": 222075.109375, |
|
"learning_rate": 7.883522727272728e-05, |
|
"loss": 0.5297, |
|
"step": 6275 |
|
}, |
|
{ |
|
"epoch": 17.403314917127073, |
|
"grad_norm": 415280.03125, |
|
"learning_rate": 7.8125e-05, |
|
"loss": 0.5556, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 17.472375690607734, |
|
"grad_norm": 340595.46875, |
|
"learning_rate": 7.741477272727273e-05, |
|
"loss": 0.5727, |
|
"step": 6325 |
|
}, |
|
{ |
|
"epoch": 17.5414364640884, |
|
"grad_norm": 466527.96875, |
|
"learning_rate": 7.670454545454545e-05, |
|
"loss": 0.615, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 17.61049723756906, |
|
"grad_norm": 171126.625, |
|
"learning_rate": 7.599431818181818e-05, |
|
"loss": 0.5942, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 17.679558011049725, |
|
"grad_norm": 350894.5, |
|
"learning_rate": 7.528409090909092e-05, |
|
"loss": 0.5792, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 17.748618784530386, |
|
"grad_norm": 416593.5, |
|
"learning_rate": 7.457386363636363e-05, |
|
"loss": 0.5789, |
|
"step": 6425 |
|
}, |
|
{ |
|
"epoch": 17.81767955801105, |
|
"grad_norm": 264111.65625, |
|
"learning_rate": 7.386363636363637e-05, |
|
"loss": 0.5188, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 17.886740331491712, |
|
"grad_norm": 127494.890625, |
|
"learning_rate": 7.315340909090909e-05, |
|
"loss": 0.625, |
|
"step": 6475 |
|
}, |
|
{ |
|
"epoch": 17.955801104972377, |
|
"grad_norm": 240379.359375, |
|
"learning_rate": 7.244318181818182e-05, |
|
"loss": 0.5244, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.8680351906158358, |
|
"eval_loss": 0.5691604018211365, |
|
"eval_runtime": 42.1833, |
|
"eval_samples_per_second": 24.251, |
|
"eval_steps_per_second": 1.517, |
|
"step": 6516 |
|
}, |
|
{ |
|
"epoch": 18.024861878453038, |
|
"grad_norm": 216740.765625, |
|
"learning_rate": 7.173295454545454e-05, |
|
"loss": 0.5197, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 18.093922651933703, |
|
"grad_norm": 231694.28125, |
|
"learning_rate": 7.102272727272728e-05, |
|
"loss": 0.5297, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 18.162983425414364, |
|
"grad_norm": 265140.96875, |
|
"learning_rate": 7.031250000000001e-05, |
|
"loss": 0.5488, |
|
"step": 6575 |
|
}, |
|
{ |
|
"epoch": 18.23204419889503, |
|
"grad_norm": 261013.03125, |
|
"learning_rate": 6.960227272727272e-05, |
|
"loss": 0.5665, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 18.30110497237569, |
|
"grad_norm": 211400.59375, |
|
"learning_rate": 6.889204545454546e-05, |
|
"loss": 0.633, |
|
"step": 6625 |
|
}, |
|
{ |
|
"epoch": 18.370165745856355, |
|
"grad_norm": 89337.546875, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 0.5285, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 18.439226519337016, |
|
"grad_norm": 124479.828125, |
|
"learning_rate": 6.747159090909091e-05, |
|
"loss": 0.5081, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 18.50828729281768, |
|
"grad_norm": 184627.265625, |
|
"learning_rate": 6.676136363636364e-05, |
|
"loss": 0.5996, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 18.57734806629834, |
|
"grad_norm": 200278.921875, |
|
"learning_rate": 6.605113636363637e-05, |
|
"loss": 0.5558, |
|
"step": 6725 |
|
}, |
|
{ |
|
"epoch": 18.646408839779006, |
|
"grad_norm": 263707.4375, |
|
"learning_rate": 6.534090909090909e-05, |
|
"loss": 0.5293, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 18.715469613259668, |
|
"grad_norm": 256204.46875, |
|
"learning_rate": 6.463068181818183e-05, |
|
"loss": 0.5377, |
|
"step": 6775 |
|
}, |
|
{ |
|
"epoch": 18.784530386740332, |
|
"grad_norm": 270741.84375, |
|
"learning_rate": 6.392045454545455e-05, |
|
"loss": 0.5416, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 18.853591160220994, |
|
"grad_norm": 318270.90625, |
|
"learning_rate": 6.321022727272728e-05, |
|
"loss": 0.5273, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 18.92265193370166, |
|
"grad_norm": 286275.125, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.5097, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 18.99171270718232, |
|
"grad_norm": 341063.40625, |
|
"learning_rate": 6.178977272727273e-05, |
|
"loss": 0.5311, |
|
"step": 6875 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.8660801564027371, |
|
"eval_loss": 0.5639104843139648, |
|
"eval_runtime": 42.2429, |
|
"eval_samples_per_second": 24.217, |
|
"eval_steps_per_second": 1.515, |
|
"step": 6878 |
|
}, |
|
{ |
|
"epoch": 19.060773480662984, |
|
"grad_norm": 287663.375, |
|
"learning_rate": 6.107954545454545e-05, |
|
"loss": 0.494, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 19.129834254143645, |
|
"grad_norm": 288540.75, |
|
"learning_rate": 6.0369318181818184e-05, |
|
"loss": 0.582, |
|
"step": 6925 |
|
}, |
|
{ |
|
"epoch": 19.19889502762431, |
|
"grad_norm": 312512.46875, |
|
"learning_rate": 5.965909090909091e-05, |
|
"loss": 0.4917, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 19.26795580110497, |
|
"grad_norm": 391702.5625, |
|
"learning_rate": 5.894886363636364e-05, |
|
"loss": 0.6078, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 19.337016574585636, |
|
"grad_norm": 152936.984375, |
|
"learning_rate": 5.823863636363636e-05, |
|
"loss": 0.4911, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 19.406077348066297, |
|
"grad_norm": 363362.21875, |
|
"learning_rate": 5.752840909090909e-05, |
|
"loss": 0.6013, |
|
"step": 7025 |
|
}, |
|
{ |
|
"epoch": 19.475138121546962, |
|
"grad_norm": 228958.9375, |
|
"learning_rate": 5.681818181818182e-05, |
|
"loss": 0.5734, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 19.544198895027623, |
|
"grad_norm": 263691.90625, |
|
"learning_rate": 5.6107954545454544e-05, |
|
"loss": 0.5037, |
|
"step": 7075 |
|
}, |
|
{ |
|
"epoch": 19.613259668508288, |
|
"grad_norm": 237882.953125, |
|
"learning_rate": 5.5397727272727276e-05, |
|
"loss": 0.5405, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 19.68232044198895, |
|
"grad_norm": 143271.046875, |
|
"learning_rate": 5.46875e-05, |
|
"loss": 0.5455, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 19.751381215469614, |
|
"grad_norm": 449154.46875, |
|
"learning_rate": 5.3977272727272727e-05, |
|
"loss": 0.5721, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 19.820441988950275, |
|
"grad_norm": 323360.8125, |
|
"learning_rate": 5.326704545454546e-05, |
|
"loss": 0.4785, |
|
"step": 7175 |
|
}, |
|
{ |
|
"epoch": 19.88950276243094, |
|
"grad_norm": 285545.21875, |
|
"learning_rate": 5.2556818181818184e-05, |
|
"loss": 0.5242, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 19.9585635359116, |
|
"grad_norm": 445151.09375, |
|
"learning_rate": 5.184659090909091e-05, |
|
"loss": 0.5504, |
|
"step": 7225 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.8836754643206256, |
|
"eval_loss": 0.5518479943275452, |
|
"eval_runtime": 42.2715, |
|
"eval_samples_per_second": 24.201, |
|
"eval_steps_per_second": 1.514, |
|
"step": 7240 |
|
}, |
|
{ |
|
"epoch": 20.027624309392266, |
|
"grad_norm": 142386.203125, |
|
"learning_rate": 5.113636363636364e-05, |
|
"loss": 0.5008, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 20.096685082872927, |
|
"grad_norm": 275875.53125, |
|
"learning_rate": 5.042613636363636e-05, |
|
"loss": 0.514, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 20.165745856353592, |
|
"grad_norm": 307304.6875, |
|
"learning_rate": 4.971590909090909e-05, |
|
"loss": 0.5605, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 20.234806629834253, |
|
"grad_norm": 232638.171875, |
|
"learning_rate": 4.900568181818182e-05, |
|
"loss": 0.5112, |
|
"step": 7325 |
|
}, |
|
{ |
|
"epoch": 20.303867403314918, |
|
"grad_norm": 322207.4375, |
|
"learning_rate": 4.829545454545454e-05, |
|
"loss": 0.5471, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 20.37292817679558, |
|
"grad_norm": 238292.0625, |
|
"learning_rate": 4.7585227272727276e-05, |
|
"loss": 0.5348, |
|
"step": 7375 |
|
}, |
|
{ |
|
"epoch": 20.441988950276244, |
|
"grad_norm": 198509.34375, |
|
"learning_rate": 4.6875e-05, |
|
"loss": 0.5274, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 20.511049723756905, |
|
"grad_norm": 245971.59375, |
|
"learning_rate": 4.6164772727272726e-05, |
|
"loss": 0.5053, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 20.58011049723757, |
|
"grad_norm": 292159.65625, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 0.5368, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 20.64917127071823, |
|
"grad_norm": 270163.8125, |
|
"learning_rate": 4.4744318181818184e-05, |
|
"loss": 0.5346, |
|
"step": 7475 |
|
}, |
|
{ |
|
"epoch": 20.718232044198896, |
|
"grad_norm": 202049.21875, |
|
"learning_rate": 4.403409090909091e-05, |
|
"loss": 0.4808, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 20.787292817679557, |
|
"grad_norm": 253560.09375, |
|
"learning_rate": 4.332386363636364e-05, |
|
"loss": 0.5081, |
|
"step": 7525 |
|
}, |
|
{ |
|
"epoch": 20.85635359116022, |
|
"grad_norm": 138984.609375, |
|
"learning_rate": 4.261363636363636e-05, |
|
"loss": 0.5197, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 20.925414364640883, |
|
"grad_norm": 390483.5, |
|
"learning_rate": 4.190340909090909e-05, |
|
"loss": 0.5846, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 20.994475138121548, |
|
"grad_norm": 261808.59375, |
|
"learning_rate": 4.119318181818182e-05, |
|
"loss": 0.4922, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.8670576735092864, |
|
"eval_loss": 0.5776973962783813, |
|
"eval_runtime": 42.5772, |
|
"eval_samples_per_second": 24.027, |
|
"eval_steps_per_second": 1.503, |
|
"step": 7602 |
|
}, |
|
{ |
|
"epoch": 21.06353591160221, |
|
"grad_norm": 270493.0625, |
|
"learning_rate": 4.048295454545454e-05, |
|
"loss": 0.5107, |
|
"step": 7625 |
|
}, |
|
{ |
|
"epoch": 21.132596685082873, |
|
"grad_norm": 265968.53125, |
|
"learning_rate": 3.9772727272727275e-05, |
|
"loss": 0.5877, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 21.201657458563535, |
|
"grad_norm": 473493.84375, |
|
"learning_rate": 3.90625e-05, |
|
"loss": 0.4783, |
|
"step": 7675 |
|
}, |
|
{ |
|
"epoch": 21.2707182320442, |
|
"grad_norm": 332053.03125, |
|
"learning_rate": 3.8352272727272726e-05, |
|
"loss": 0.5025, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 21.33977900552486, |
|
"grad_norm": 234659.0625, |
|
"learning_rate": 3.764204545454546e-05, |
|
"loss": 0.5128, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 21.408839779005525, |
|
"grad_norm": 267205.5, |
|
"learning_rate": 3.6931818181818184e-05, |
|
"loss": 0.457, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 21.477900552486187, |
|
"grad_norm": 400299.75, |
|
"learning_rate": 3.622159090909091e-05, |
|
"loss": 0.5191, |
|
"step": 7775 |
|
}, |
|
{ |
|
"epoch": 21.54696132596685, |
|
"grad_norm": 237402.875, |
|
"learning_rate": 3.551136363636364e-05, |
|
"loss": 0.4861, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 21.616022099447513, |
|
"grad_norm": 447478.4375, |
|
"learning_rate": 3.480113636363636e-05, |
|
"loss": 0.5129, |
|
"step": 7825 |
|
}, |
|
{ |
|
"epoch": 21.685082872928177, |
|
"grad_norm": 130432.0546875, |
|
"learning_rate": 3.409090909090909e-05, |
|
"loss": 0.5732, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 21.75414364640884, |
|
"grad_norm": 257269.390625, |
|
"learning_rate": 3.338068181818182e-05, |
|
"loss": 0.5339, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 21.823204419889503, |
|
"grad_norm": 336605.34375, |
|
"learning_rate": 3.267045454545454e-05, |
|
"loss": 0.4991, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 21.892265193370164, |
|
"grad_norm": 263841.71875, |
|
"learning_rate": 3.1960227272727275e-05, |
|
"loss": 0.5089, |
|
"step": 7925 |
|
}, |
|
{ |
|
"epoch": 21.96132596685083, |
|
"grad_norm": 121368.7421875, |
|
"learning_rate": 3.125e-05, |
|
"loss": 0.4801, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.8778103616813294, |
|
"eval_loss": 0.5548605918884277, |
|
"eval_runtime": 42.3535, |
|
"eval_samples_per_second": 24.154, |
|
"eval_steps_per_second": 1.511, |
|
"step": 7964 |
|
}, |
|
{ |
|
"epoch": 22.03038674033149, |
|
"grad_norm": 180251.40625, |
|
"learning_rate": 3.0539772727272726e-05, |
|
"loss": 0.5442, |
|
"step": 7975 |
|
}, |
|
{ |
|
"epoch": 22.099447513812155, |
|
"grad_norm": 85616.40625, |
|
"learning_rate": 2.9829545454545455e-05, |
|
"loss": 0.4761, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 22.168508287292816, |
|
"grad_norm": 171296.453125, |
|
"learning_rate": 2.911931818181818e-05, |
|
"loss": 0.4682, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 22.23756906077348, |
|
"grad_norm": 342381.15625, |
|
"learning_rate": 2.840909090909091e-05, |
|
"loss": 0.5581, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 22.306629834254142, |
|
"grad_norm": 151855.984375, |
|
"learning_rate": 2.7698863636363638e-05, |
|
"loss": 0.4941, |
|
"step": 8075 |
|
}, |
|
{ |
|
"epoch": 22.375690607734807, |
|
"grad_norm": 219308.734375, |
|
"learning_rate": 2.6988636363636363e-05, |
|
"loss": 0.5053, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 22.444751381215468, |
|
"grad_norm": 469136.375, |
|
"learning_rate": 2.6278409090909092e-05, |
|
"loss": 0.5903, |
|
"step": 8125 |
|
}, |
|
{ |
|
"epoch": 22.513812154696133, |
|
"grad_norm": 180401.59375, |
|
"learning_rate": 2.556818181818182e-05, |
|
"loss": 0.5119, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 22.582872928176794, |
|
"grad_norm": 351697.78125, |
|
"learning_rate": 2.4857954545454546e-05, |
|
"loss": 0.5023, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 22.65193370165746, |
|
"grad_norm": 413491.84375, |
|
"learning_rate": 2.414772727272727e-05, |
|
"loss": 0.5567, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 22.72099447513812, |
|
"grad_norm": 268774.71875, |
|
"learning_rate": 2.34375e-05, |
|
"loss": 0.5543, |
|
"step": 8225 |
|
}, |
|
{ |
|
"epoch": 22.790055248618785, |
|
"grad_norm": 186057.15625, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 0.4627, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 22.859116022099446, |
|
"grad_norm": 199888.609375, |
|
"learning_rate": 2.2017045454545455e-05, |
|
"loss": 0.4735, |
|
"step": 8275 |
|
}, |
|
{ |
|
"epoch": 22.92817679558011, |
|
"grad_norm": 278445.65625, |
|
"learning_rate": 2.130681818181818e-05, |
|
"loss": 0.4419, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 22.997237569060772, |
|
"grad_norm": 395816.59375, |
|
"learning_rate": 2.059659090909091e-05, |
|
"loss": 0.5085, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.8768328445747801, |
|
"eval_loss": 0.5502294301986694, |
|
"eval_runtime": 42.1647, |
|
"eval_samples_per_second": 24.262, |
|
"eval_steps_per_second": 1.518, |
|
"step": 8326 |
|
}, |
|
{ |
|
"epoch": 23.066298342541437, |
|
"grad_norm": 170681.640625, |
|
"learning_rate": 1.9886363636363638e-05, |
|
"loss": 0.461, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 23.135359116022098, |
|
"grad_norm": 197108.734375, |
|
"learning_rate": 1.9176136363636363e-05, |
|
"loss": 0.5135, |
|
"step": 8375 |
|
}, |
|
{ |
|
"epoch": 23.204419889502763, |
|
"grad_norm": 202833.734375, |
|
"learning_rate": 1.8465909090909092e-05, |
|
"loss": 0.5294, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 23.273480662983424, |
|
"grad_norm": 250280.265625, |
|
"learning_rate": 1.775568181818182e-05, |
|
"loss": 0.4441, |
|
"step": 8425 |
|
}, |
|
{ |
|
"epoch": 23.34254143646409, |
|
"grad_norm": 691687.625, |
|
"learning_rate": 1.7045454545454546e-05, |
|
"loss": 0.4639, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 23.41160220994475, |
|
"grad_norm": 281234.75, |
|
"learning_rate": 1.633522727272727e-05, |
|
"loss": 0.4794, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 23.480662983425415, |
|
"grad_norm": 201831.9375, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 0.5207, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 23.549723756906076, |
|
"grad_norm": 249279.59375, |
|
"learning_rate": 1.4914772727272727e-05, |
|
"loss": 0.5208, |
|
"step": 8525 |
|
}, |
|
{ |
|
"epoch": 23.61878453038674, |
|
"grad_norm": 160730.84375, |
|
"learning_rate": 1.4204545454545455e-05, |
|
"loss": 0.566, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 23.6878453038674, |
|
"grad_norm": 229437.453125, |
|
"learning_rate": 1.3494318181818182e-05, |
|
"loss": 0.4588, |
|
"step": 8575 |
|
}, |
|
{ |
|
"epoch": 23.756906077348066, |
|
"grad_norm": 302624.1875, |
|
"learning_rate": 1.278409090909091e-05, |
|
"loss": 0.5412, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 23.825966850828728, |
|
"grad_norm": 190862.921875, |
|
"learning_rate": 1.2073863636363636e-05, |
|
"loss": 0.4805, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 23.895027624309392, |
|
"grad_norm": 187550.84375, |
|
"learning_rate": 1.1363636363636365e-05, |
|
"loss": 0.4483, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 23.964088397790054, |
|
"grad_norm": 282162.96875, |
|
"learning_rate": 1.065340909090909e-05, |
|
"loss": 0.5002, |
|
"step": 8675 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.8856304985337243, |
|
"eval_loss": 0.5444589853286743, |
|
"eval_runtime": 42.3026, |
|
"eval_samples_per_second": 24.183, |
|
"eval_steps_per_second": 1.513, |
|
"step": 8688 |
|
}, |
|
{ |
|
"epoch": 24.03314917127072, |
|
"grad_norm": 562951.625, |
|
"learning_rate": 9.943181818181819e-06, |
|
"loss": 0.4826, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 24.10220994475138, |
|
"grad_norm": 169346.78125, |
|
"learning_rate": 9.232954545454546e-06, |
|
"loss": 0.518, |
|
"step": 8725 |
|
}, |
|
{ |
|
"epoch": 24.171270718232044, |
|
"grad_norm": 364698.15625, |
|
"learning_rate": 8.522727272727273e-06, |
|
"loss": 0.5192, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 24.240331491712706, |
|
"grad_norm": 342001.78125, |
|
"learning_rate": 7.8125e-06, |
|
"loss": 0.4712, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 24.30939226519337, |
|
"grad_norm": 399564.5, |
|
"learning_rate": 7.102272727272727e-06, |
|
"loss": 0.5009, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 24.37845303867403, |
|
"grad_norm": 162135.25, |
|
"learning_rate": 6.392045454545455e-06, |
|
"loss": 0.5196, |
|
"step": 8825 |
|
}, |
|
{ |
|
"epoch": 24.447513812154696, |
|
"grad_norm": 297179.53125, |
|
"learning_rate": 5.681818181818182e-06, |
|
"loss": 0.4387, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 24.516574585635357, |
|
"grad_norm": 41471.75390625, |
|
"learning_rate": 4.9715909090909094e-06, |
|
"loss": 0.4538, |
|
"step": 8875 |
|
}, |
|
{ |
|
"epoch": 24.585635359116022, |
|
"grad_norm": 206171.1875, |
|
"learning_rate": 4.2613636363636365e-06, |
|
"loss": 0.4808, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 24.654696132596683, |
|
"grad_norm": 143367.078125, |
|
"learning_rate": 3.5511363636363636e-06, |
|
"loss": 0.5113, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 24.723756906077348, |
|
"grad_norm": 388039.125, |
|
"learning_rate": 2.840909090909091e-06, |
|
"loss": 0.4725, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 24.792817679558013, |
|
"grad_norm": 162006.796875, |
|
"learning_rate": 2.1306818181818183e-06, |
|
"loss": 0.4681, |
|
"step": 8975 |
|
}, |
|
{ |
|
"epoch": 24.861878453038674, |
|
"grad_norm": 211019.484375, |
|
"learning_rate": 1.4204545454545456e-06, |
|
"loss": 0.4651, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 24.930939226519335, |
|
"grad_norm": 242317.265625, |
|
"learning_rate": 7.102272727272728e-07, |
|
"loss": 0.52, |
|
"step": 9025 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"grad_norm": 90494.8828125, |
|
"learning_rate": 0.0, |
|
"loss": 0.4404, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.884652981427175, |
|
"eval_loss": 0.5352830290794373, |
|
"eval_runtime": 42.51, |
|
"eval_samples_per_second": 24.065, |
|
"eval_steps_per_second": 1.506, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"step": 9050, |
|
"total_flos": 0.0, |
|
"train_loss": 0.6549073659780934, |
|
"train_runtime": 10456.9852, |
|
"train_samples_per_second": 13.845, |
|
"train_steps_per_second": 0.865 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 9050, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|