|
{ |
|
"best_metric": 0.2772659659385681, |
|
"best_model_checkpoint": "./convnext-base-8e-5/checkpoint-8792", |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 10990, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 11.6831636428833, |
|
"learning_rate": 7.998365801595384e-05, |
|
"loss": 1.9321, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 23.91045379638672, |
|
"learning_rate": 7.993464541683746e-05, |
|
"loss": 0.9556, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 16.425535202026367, |
|
"learning_rate": 7.98530022508065e-05, |
|
"loss": 0.7791, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 18.70790672302246, |
|
"learning_rate": 7.973879522842682e-05, |
|
"loss": 0.6424, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 20.97853660583496, |
|
"learning_rate": 7.959211766816531e-05, |
|
"loss": 0.6717, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 19.02173614501953, |
|
"learning_rate": 7.941308942013942e-05, |
|
"loss": 0.5696, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 18.764028549194336, |
|
"learning_rate": 7.920185676818782e-05, |
|
"loss": 0.5981, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 18.06161880493164, |
|
"learning_rate": 7.895859231034193e-05, |
|
"loss": 0.5988, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 13.58993148803711, |
|
"learning_rate": 7.86834948177962e-05, |
|
"loss": 0.5275, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 10.574773788452148, |
|
"learning_rate": 7.837678907249235e-05, |
|
"loss": 0.5312, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.856858846918489, |
|
"eval_loss": 0.47219011187553406, |
|
"eval_runtime": 105.0222, |
|
"eval_samples_per_second": 23.947, |
|
"eval_steps_per_second": 1.504, |
|
"step": 1099 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 15.577301979064941, |
|
"learning_rate": 7.80387256834502e-05, |
|
"loss": 0.5937, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 7.9820380210876465, |
|
"learning_rate": 7.766958088199526e-05, |
|
"loss": 0.4501, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 11.32746410369873, |
|
"learning_rate": 7.726965629605035e-05, |
|
"loss": 0.4311, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 17.756572723388672, |
|
"learning_rate": 7.683927870367564e-05, |
|
"loss": 0.4342, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 18.274375915527344, |
|
"learning_rate": 7.637879976605853e-05, |
|
"loss": 0.4238, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 20.55085563659668, |
|
"learning_rate": 7.588859574017165e-05, |
|
"loss": 0.3663, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 11.306856155395508, |
|
"learning_rate": 7.53690671713335e-05, |
|
"loss": 0.4696, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 17.713024139404297, |
|
"learning_rate": 7.482063856592323e-05, |
|
"loss": 0.4407, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 8.358209609985352, |
|
"learning_rate": 7.42437580445169e-05, |
|
"loss": 0.43, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 15.047048568725586, |
|
"learning_rate": 7.363889697572835e-05, |
|
"loss": 0.372, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 6.985563278198242, |
|
"learning_rate": 7.300654959105439e-05, |
|
"loss": 0.4064, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9025844930417495, |
|
"eval_loss": 0.36510175466537476, |
|
"eval_runtime": 106.7789, |
|
"eval_samples_per_second": 23.553, |
|
"eval_steps_per_second": 1.48, |
|
"step": 2198 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 14.219414710998535, |
|
"learning_rate": 7.234723258103863e-05, |
|
"loss": 0.4234, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 8.225436210632324, |
|
"learning_rate": 7.1661484673084e-05, |
|
"loss": 0.3427, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 17.93203353881836, |
|
"learning_rate": 7.094986619125911e-05, |
|
"loss": 0.3326, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 6.593932151794434, |
|
"learning_rate": 7.02129585984578e-05, |
|
"loss": 0.3063, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 16.711748123168945, |
|
"learning_rate": 6.945136402128628e-05, |
|
"loss": 0.3645, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 4.706127643585205, |
|
"learning_rate": 6.86657047580661e-05, |
|
"loss": 0.3467, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 2.0358073711395264, |
|
"learning_rate": 6.785662277035447e-05, |
|
"loss": 0.3398, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 9.968092918395996, |
|
"learning_rate": 6.702477915839819e-05, |
|
"loss": 0.3555, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 8.269189834594727, |
|
"learning_rate": 6.617085362094902e-05, |
|
"loss": 0.3263, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 10.920413970947266, |
|
"learning_rate": 6.529554389988243e-05, |
|
"loss": 0.3033, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 11.657285690307617, |
|
"learning_rate": 6.43995652100733e-05, |
|
"loss": 0.318, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9145129224652088, |
|
"eval_loss": 0.32599496841430664, |
|
"eval_runtime": 105.5258, |
|
"eval_samples_per_second": 23.833, |
|
"eval_steps_per_second": 1.497, |
|
"step": 3297 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 7.324591636657715, |
|
"learning_rate": 6.348364965499434e-05, |
|
"loss": 0.3295, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.09, |
|
"grad_norm": 1.3288770914077759, |
|
"learning_rate": 6.2548545628515e-05, |
|
"loss": 0.2353, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.18, |
|
"grad_norm": 0.16654238104820251, |
|
"learning_rate": 6.159501720338938e-05, |
|
"loss": 0.2515, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 11.06922721862793, |
|
"learning_rate": 6.062384350693302e-05, |
|
"loss": 0.2783, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 10.742453575134277, |
|
"learning_rate": 5.9635818084398626e-05, |
|
"loss": 0.2667, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 6.648169994354248, |
|
"learning_rate": 5.86317482505708e-05, |
|
"loss": 0.2368, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 3.55, |
|
"grad_norm": 11.184340476989746, |
|
"learning_rate": 5.7612454430109804e-05, |
|
"loss": 0.2933, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 7.161631107330322, |
|
"learning_rate": 5.657876948718328e-05, |
|
"loss": 0.2561, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.73, |
|
"grad_norm": 0.8756735920906067, |
|
"learning_rate": 5.5531538044933525e-05, |
|
"loss": 0.2809, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 3.82, |
|
"grad_norm": 0.12508328258991241, |
|
"learning_rate": 5.447161579533662e-05, |
|
"loss": 0.2495, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.91, |
|
"grad_norm": 3.783909320831299, |
|
"learning_rate": 5.339986880001724e-05, |
|
"loss": 0.2725, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9196819085487078, |
|
"eval_loss": 0.3164690434932709, |
|
"eval_runtime": 105.5164, |
|
"eval_samples_per_second": 23.835, |
|
"eval_steps_per_second": 1.497, |
|
"step": 4396 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 13.933430671691895, |
|
"learning_rate": 5.2317172782590326e-05, |
|
"loss": 0.2394, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.09, |
|
"grad_norm": 6.490962982177734, |
|
"learning_rate": 5.122441241310807e-05, |
|
"loss": 0.2062, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 4.284941673278809, |
|
"learning_rate": 5.012248058519667e-05, |
|
"loss": 0.2109, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 9.453885078430176, |
|
"learning_rate": 4.9012277686473734e-05, |
|
"loss": 0.2091, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 4.37, |
|
"grad_norm": 0.10080689936876297, |
|
"learning_rate": 4.789471086284219e-05, |
|
"loss": 0.2065, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 4.46, |
|
"grad_norm": 9.822230339050293, |
|
"learning_rate": 4.6770693277262165e-05, |
|
"loss": 0.2135, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 4.55, |
|
"grad_norm": 7.230046272277832, |
|
"learning_rate": 4.5641143363606216e-05, |
|
"loss": 0.1851, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 13.58958911895752, |
|
"learning_rate": 4.450698407620776e-05, |
|
"loss": 0.1805, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 7.531309127807617, |
|
"learning_rate": 4.33691421357158e-05, |
|
"loss": 0.2118, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 4.82, |
|
"grad_norm": 17.00364875793457, |
|
"learning_rate": 4.22285472718723e-05, |
|
"loss": 0.1952, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 4.91, |
|
"grad_norm": 10.512598037719727, |
|
"learning_rate": 4.108613146383063e-05, |
|
"loss": 0.212, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9256461232604374, |
|
"eval_loss": 0.3330000936985016, |
|
"eval_runtime": 104.2603, |
|
"eval_samples_per_second": 24.122, |
|
"eval_steps_per_second": 1.515, |
|
"step": 5495 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 4.856630325317383, |
|
"learning_rate": 3.994282817863628e-05, |
|
"loss": 0.1993, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 5.001902103424072, |
|
"learning_rate": 3.879957160849155e-05, |
|
"loss": 0.1622, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 5.19, |
|
"grad_norm": 8.48959732055664, |
|
"learning_rate": 3.7657295907427964e-05, |
|
"loss": 0.1709, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 5.28, |
|
"grad_norm": 1.4750434160232544, |
|
"learning_rate": 3.651693442800964e-05, |
|
"loss": 0.1536, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 5.37, |
|
"grad_norm": 0.030608195811510086, |
|
"learning_rate": 3.537941895869179e-05, |
|
"loss": 0.1609, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 5.46, |
|
"grad_norm": 0.20574839413166046, |
|
"learning_rate": 3.4245678962457005e-05, |
|
"loss": 0.1318, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 5.55, |
|
"grad_norm": 0.6550854444503784, |
|
"learning_rate": 3.31166408173518e-05, |
|
"loss": 0.1667, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 5.64, |
|
"grad_norm": 14.868585586547852, |
|
"learning_rate": 3.199322705954396e-05, |
|
"loss": 0.1402, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 5.73, |
|
"grad_norm": 5.7412896156311035, |
|
"learning_rate": 3.087635562951882e-05, |
|
"loss": 0.1862, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.82, |
|
"grad_norm": 11.104312896728516, |
|
"learning_rate": 2.9766939122030964e-05, |
|
"loss": 0.1592, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 5.91, |
|
"grad_norm": 12.107190132141113, |
|
"learning_rate": 2.866588404042364e-05, |
|
"loss": 0.1711, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.927634194831014, |
|
"eval_loss": 0.3272738456726074, |
|
"eval_runtime": 105.6481, |
|
"eval_samples_per_second": 23.805, |
|
"eval_steps_per_second": 1.496, |
|
"step": 6594 |
|
}, |
|
{ |
|
"epoch": 6.01, |
|
"grad_norm": 8.225330352783203, |
|
"learning_rate": 2.757409005592578e-05, |
|
"loss": 0.1678, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 6.1, |
|
"grad_norm": 20.111202239990234, |
|
"learning_rate": 2.6492449272531175e-05, |
|
"loss": 0.1286, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 6.19, |
|
"grad_norm": 23.527673721313477, |
|
"learning_rate": 2.5421845498061117e-05, |
|
"loss": 0.1245, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 6.28, |
|
"grad_norm": 15.480986595153809, |
|
"learning_rate": 2.4363153522005743e-05, |
|
"loss": 0.1185, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 6.37, |
|
"grad_norm": 0.05464939400553703, |
|
"learning_rate": 2.3317238400734145e-05, |
|
"loss": 0.125, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 6.46, |
|
"grad_norm": 0.09599953889846802, |
|
"learning_rate": 2.228495475065759e-05, |
|
"loss": 0.132, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 6.55, |
|
"grad_norm": 21.158004760742188, |
|
"learning_rate": 2.1267146049923167e-05, |
|
"loss": 0.1252, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 6.64, |
|
"grad_norm": 17.28738784790039, |
|
"learning_rate": 2.02646439492083e-05, |
|
"loss": 0.1289, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 6.73, |
|
"grad_norm": 13.451830863952637, |
|
"learning_rate": 1.9278267592179803e-05, |
|
"loss": 0.1473, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 12.016372680664062, |
|
"learning_rate": 1.8308822946172174e-05, |
|
"loss": 0.1245, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 6.92, |
|
"grad_norm": 3.5636794567108154, |
|
"learning_rate": 1.7357102143632386e-05, |
|
"loss": 0.1552, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9304174950298211, |
|
"eval_loss": 0.29928237199783325, |
|
"eval_runtime": 105.3329, |
|
"eval_samples_per_second": 23.877, |
|
"eval_steps_per_second": 1.5, |
|
"step": 7693 |
|
}, |
|
{ |
|
"epoch": 7.01, |
|
"grad_norm": 9.268345832824707, |
|
"learning_rate": 1.6423882834868976e-05, |
|
"loss": 0.0984, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 8.35660457611084, |
|
"learning_rate": 1.550992755263471e-05, |
|
"loss": 0.0894, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 7.19, |
|
"grad_norm": 2.3893661499023438, |
|
"learning_rate": 1.4615983089061678e-05, |
|
"loss": 0.1349, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 7.28, |
|
"grad_norm": 0.8107450604438782, |
|
"learning_rate": 1.3742779885457966e-05, |
|
"loss": 0.0983, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 3.668052911758423, |
|
"learning_rate": 1.2891031435464663e-05, |
|
"loss": 0.1119, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 7.46, |
|
"grad_norm": 1.1919211149215698, |
|
"learning_rate": 1.206143370206086e-05, |
|
"loss": 0.0952, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 7.55, |
|
"grad_norm": 8.72845458984375, |
|
"learning_rate": 1.1254664548892755e-05, |
|
"loss": 0.0973, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 7.64, |
|
"grad_norm": 0.031977806240320206, |
|
"learning_rate": 1.0471383186391817e-05, |
|
"loss": 0.1219, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 7.73, |
|
"grad_norm": 0.5276290774345398, |
|
"learning_rate": 9.712229633134562e-06, |
|
"loss": 0.118, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 7.83, |
|
"grad_norm": 0.260431170463562, |
|
"learning_rate": 8.977824192883772e-06, |
|
"loss": 0.1148, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 7.92, |
|
"grad_norm": 6.11373233795166, |
|
"learning_rate": 8.26876694773886e-06, |
|
"loss": 0.1039, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.941948310139165, |
|
"eval_loss": 0.2772659659385681, |
|
"eval_runtime": 105.3277, |
|
"eval_samples_per_second": 23.878, |
|
"eval_steps_per_second": 1.5, |
|
"step": 8792 |
|
}, |
|
{ |
|
"epoch": 8.01, |
|
"grad_norm": 0.6443412899971008, |
|
"learning_rate": 7.5856372678091964e-06, |
|
"loss": 0.0876, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 8.1, |
|
"grad_norm": 0.9504626989364624, |
|
"learning_rate": 6.928993337811354e-06, |
|
"loss": 0.1, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 8.19, |
|
"grad_norm": 1.0361099243164062, |
|
"learning_rate": 6.29937170097672e-06, |
|
"loss": 0.0938, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 8.28, |
|
"grad_norm": 10.742104530334473, |
|
"learning_rate": 5.697286820642514e-06, |
|
"loss": 0.087, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 8.37, |
|
"grad_norm": 0.1779290735721588, |
|
"learning_rate": 5.123230659884178e-06, |
|
"loss": 0.0974, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 8.46, |
|
"grad_norm": 0.006353577133268118, |
|
"learning_rate": 4.577672279532746e-06, |
|
"loss": 0.0764, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 8.55, |
|
"grad_norm": 7.467300891876221, |
|
"learning_rate": 4.061057454905615e-06, |
|
"loss": 0.0875, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 8.64, |
|
"grad_norm": 2.8993914127349854, |
|
"learning_rate": 3.573808311563891e-06, |
|
"loss": 0.0817, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 8.74, |
|
"grad_norm": 0.09099097549915314, |
|
"learning_rate": 3.116322980393922e-06, |
|
"loss": 0.1057, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 8.83, |
|
"grad_norm": 8.745915412902832, |
|
"learning_rate": 2.68897527229488e-06, |
|
"loss": 0.1134, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 8.92, |
|
"grad_norm": 20.60851287841797, |
|
"learning_rate": 2.2921143727381525e-06, |
|
"loss": 0.0996, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.945924453280318, |
|
"eval_loss": 0.27863016724586487, |
|
"eval_runtime": 105.7885, |
|
"eval_samples_per_second": 23.774, |
|
"eval_steps_per_second": 1.494, |
|
"step": 9891 |
|
}, |
|
{ |
|
"epoch": 9.01, |
|
"grad_norm": 2.4663915634155273, |
|
"learning_rate": 1.926064556448202e-06, |
|
"loss": 0.0928, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 9.1, |
|
"grad_norm": 0.007756831590086222, |
|
"learning_rate": 1.5911249224379189e-06, |
|
"loss": 0.0681, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.19, |
|
"grad_norm": 0.06630658358335495, |
|
"learning_rate": 1.28756914961508e-06, |
|
"loss": 0.0806, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 9.28, |
|
"grad_norm": 0.05963192135095596, |
|
"learning_rate": 1.0156452731595112e-06, |
|
"loss": 0.0906, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 9.37, |
|
"grad_norm": 0.010499324649572372, |
|
"learning_rate": 7.755754818537542e-07, |
|
"loss": 0.0677, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 1.1624623537063599, |
|
"learning_rate": 5.675559365327798e-07, |
|
"loss": 0.0816, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 9.55, |
|
"grad_norm": 0.02799438126385212, |
|
"learning_rate": 3.917566098011438e-07, |
|
"loss": 0.1033, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 9.65, |
|
"grad_norm": 10.219303131103516, |
|
"learning_rate": 2.4832114714847367e-07, |
|
"loss": 0.0825, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 5.508051872253418, |
|
"learning_rate": 1.3736674957689045e-07, |
|
"loss": 0.117, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 9.83, |
|
"grad_norm": 0.7247039675712585, |
|
"learning_rate": 5.898407783614169e-08, |
|
"loss": 0.0994, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 9.92, |
|
"grad_norm": 0.1574389934539795, |
|
"learning_rate": 1.3237178344787106e-08, |
|
"loss": 0.0765, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9475149105367793, |
|
"eval_loss": 0.27801209688186646, |
|
"eval_runtime": 105.1016, |
|
"eval_samples_per_second": 23.929, |
|
"eval_steps_per_second": 1.503, |
|
"step": 10990 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 10990, |
|
"total_flos": 4.09349935387607e+19, |
|
"train_loss": 0.2560113200065328, |
|
"train_runtime": 17058.8463, |
|
"train_samples_per_second": 10.306, |
|
"train_steps_per_second": 0.644 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 10990, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 4.09349935387607e+19, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|