|
{ |
|
"best_metric": 0.45421910285949707, |
|
"best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/bd_ortho_ign/bd_ortho-DinoVdeau-large-2024_11_27-batch-size64_freeze_probs/checkpoint-11492", |
|
"epoch": 62.0, |
|
"eval_steps": 500, |
|
"global_step": 13702, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.0, |
|
"eval_explained_variance": 0.5492395758628845, |
|
"eval_kl_divergence": 0.06964559853076935, |
|
"eval_loss": 0.46336060762405396, |
|
"eval_mae": 0.07600608468055725, |
|
"eval_rmse": 0.10175278037786484, |
|
"eval_runtime": 26.595, |
|
"eval_samples_per_second": 176.65, |
|
"eval_steps_per_second": 2.782, |
|
"learning_rate": 0.001, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_explained_variance": 0.6113448739051819, |
|
"eval_kl_divergence": 0.0038063330575823784, |
|
"eval_loss": 0.45933997631073, |
|
"eval_mae": 0.07159148901700974, |
|
"eval_rmse": 0.09520163387060165, |
|
"eval_runtime": 25.5426, |
|
"eval_samples_per_second": 183.928, |
|
"eval_steps_per_second": 2.897, |
|
"learning_rate": 0.001, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.262443438914027, |
|
"grad_norm": 0.16188210248947144, |
|
"learning_rate": 0.001, |
|
"loss": 0.5185, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_explained_variance": 0.6245184540748596, |
|
"eval_kl_divergence": 0.05826142057776451, |
|
"eval_loss": 0.457367479801178, |
|
"eval_mae": 0.0670078918337822, |
|
"eval_rmse": 0.0917908325791359, |
|
"eval_runtime": 25.6126, |
|
"eval_samples_per_second": 183.425, |
|
"eval_steps_per_second": 2.889, |
|
"learning_rate": 0.001, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_explained_variance": 0.6129782795906067, |
|
"eval_kl_divergence": -0.06495417654514313, |
|
"eval_loss": 0.459468811750412, |
|
"eval_mae": 0.07134346663951874, |
|
"eval_rmse": 0.09552835673093796, |
|
"eval_runtime": 25.6003, |
|
"eval_samples_per_second": 183.514, |
|
"eval_steps_per_second": 2.891, |
|
"learning_rate": 0.001, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 4.524886877828054, |
|
"grad_norm": 0.09988280385732651, |
|
"learning_rate": 0.001, |
|
"loss": 0.4806, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_explained_variance": 0.6206489205360413, |
|
"eval_kl_divergence": -0.08347146958112717, |
|
"eval_loss": 0.45927393436431885, |
|
"eval_mae": 0.07016489654779434, |
|
"eval_rmse": 0.0953657403588295, |
|
"eval_runtime": 25.74, |
|
"eval_samples_per_second": 182.518, |
|
"eval_steps_per_second": 2.875, |
|
"learning_rate": 0.001, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_explained_variance": 0.6041414737701416, |
|
"eval_kl_divergence": -0.07046143710613251, |
|
"eval_loss": 0.46080395579338074, |
|
"eval_mae": 0.07277411222457886, |
|
"eval_rmse": 0.09773259609937668, |
|
"eval_runtime": 25.4681, |
|
"eval_samples_per_second": 184.466, |
|
"eval_steps_per_second": 2.906, |
|
"learning_rate": 0.001, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 6.787330316742081, |
|
"grad_norm": 0.08271574974060059, |
|
"learning_rate": 0.001, |
|
"loss": 0.4786, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_explained_variance": 0.628325879573822, |
|
"eval_kl_divergence": -0.004442690871655941, |
|
"eval_loss": 0.4581476151943207, |
|
"eval_mae": 0.06827609241008759, |
|
"eval_rmse": 0.09274852275848389, |
|
"eval_runtime": 26.0251, |
|
"eval_samples_per_second": 180.518, |
|
"eval_steps_per_second": 2.843, |
|
"learning_rate": 0.001, |
|
"step": 1547 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_explained_variance": 0.6276748776435852, |
|
"eval_kl_divergence": 0.07988782227039337, |
|
"eval_loss": 0.4573117196559906, |
|
"eval_mae": 0.06800529360771179, |
|
"eval_rmse": 0.09162522107362747, |
|
"eval_runtime": 25.7197, |
|
"eval_samples_per_second": 182.662, |
|
"eval_steps_per_second": 2.877, |
|
"learning_rate": 0.001, |
|
"step": 1768 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_explained_variance": 0.6196129322052002, |
|
"eval_kl_divergence": 0.02327939122915268, |
|
"eval_loss": 0.45939013361930847, |
|
"eval_mae": 0.07057134807109833, |
|
"eval_rmse": 0.09471722692251205, |
|
"eval_runtime": 25.8299, |
|
"eval_samples_per_second": 181.883, |
|
"eval_steps_per_second": 2.865, |
|
"learning_rate": 0.001, |
|
"step": 1989 |
|
}, |
|
{ |
|
"epoch": 9.049773755656108, |
|
"grad_norm": 0.05649600923061371, |
|
"learning_rate": 0.001, |
|
"loss": 0.4776, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_explained_variance": 0.6293186545372009, |
|
"eval_kl_divergence": 0.0885055735707283, |
|
"eval_loss": 0.45772281289100647, |
|
"eval_mae": 0.06745484471321106, |
|
"eval_rmse": 0.09179002046585083, |
|
"eval_runtime": 25.5273, |
|
"eval_samples_per_second": 184.039, |
|
"eval_steps_per_second": 2.899, |
|
"learning_rate": 0.001, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_explained_variance": 0.6422439813613892, |
|
"eval_kl_divergence": 0.1296330839395523, |
|
"eval_loss": 0.45641985535621643, |
|
"eval_mae": 0.06617596000432968, |
|
"eval_rmse": 0.08975591510534286, |
|
"eval_runtime": 25.7282, |
|
"eval_samples_per_second": 182.601, |
|
"eval_steps_per_second": 2.876, |
|
"learning_rate": 0.001, |
|
"step": 2431 |
|
}, |
|
{ |
|
"epoch": 11.312217194570136, |
|
"grad_norm": 0.04163961857557297, |
|
"learning_rate": 0.001, |
|
"loss": 0.4772, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_explained_variance": 0.6385617256164551, |
|
"eval_kl_divergence": -0.006057058461010456, |
|
"eval_loss": 0.45718902349472046, |
|
"eval_mae": 0.06766870617866516, |
|
"eval_rmse": 0.09130751341581345, |
|
"eval_runtime": 25.6849, |
|
"eval_samples_per_second": 182.909, |
|
"eval_steps_per_second": 2.881, |
|
"learning_rate": 0.001, |
|
"step": 2652 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_explained_variance": 0.6186209321022034, |
|
"eval_kl_divergence": -0.20600058138370514, |
|
"eval_loss": 0.4622880220413208, |
|
"eval_mae": 0.07468675822019577, |
|
"eval_rmse": 0.10024455189704895, |
|
"eval_runtime": 25.9645, |
|
"eval_samples_per_second": 180.939, |
|
"eval_steps_per_second": 2.85, |
|
"learning_rate": 0.001, |
|
"step": 2873 |
|
}, |
|
{ |
|
"epoch": 13.574660633484163, |
|
"grad_norm": 0.0532899908721447, |
|
"learning_rate": 0.001, |
|
"loss": 0.4769, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_explained_variance": 0.6346250176429749, |
|
"eval_kl_divergence": -0.0371401272714138, |
|
"eval_loss": 0.45775285363197327, |
|
"eval_mae": 0.06778896600008011, |
|
"eval_rmse": 0.092497818171978, |
|
"eval_runtime": 25.7017, |
|
"eval_samples_per_second": 182.79, |
|
"eval_steps_per_second": 2.879, |
|
"learning_rate": 0.001, |
|
"step": 3094 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_explained_variance": 0.6340083479881287, |
|
"eval_kl_divergence": 0.04575105383992195, |
|
"eval_loss": 0.4575214684009552, |
|
"eval_mae": 0.0666513592004776, |
|
"eval_rmse": 0.0916559174656868, |
|
"eval_runtime": 26.025, |
|
"eval_samples_per_second": 180.519, |
|
"eval_steps_per_second": 2.843, |
|
"learning_rate": 0.001, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 15.83710407239819, |
|
"grad_norm": 0.0473792664706707, |
|
"learning_rate": 0.001, |
|
"loss": 0.4766, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_explained_variance": 0.6277230381965637, |
|
"eval_kl_divergence": 0.01510859839618206, |
|
"eval_loss": 0.4578736424446106, |
|
"eval_mae": 0.06800080835819244, |
|
"eval_rmse": 0.09264300018548965, |
|
"eval_runtime": 25.6671, |
|
"eval_samples_per_second": 183.036, |
|
"eval_steps_per_second": 2.883, |
|
"learning_rate": 0.001, |
|
"step": 3536 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_explained_variance": 0.6246375441551208, |
|
"eval_kl_divergence": -0.06794208288192749, |
|
"eval_loss": 0.4592094421386719, |
|
"eval_mae": 0.07020581513643265, |
|
"eval_rmse": 0.09485668689012527, |
|
"eval_runtime": 25.9387, |
|
"eval_samples_per_second": 181.119, |
|
"eval_steps_per_second": 2.853, |
|
"learning_rate": 0.001, |
|
"step": 3757 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_explained_variance": 0.6493042707443237, |
|
"eval_kl_divergence": 0.04208216443657875, |
|
"eval_loss": 0.45573291182518005, |
|
"eval_mae": 0.06506813317537308, |
|
"eval_rmse": 0.08873652666807175, |
|
"eval_runtime": 25.6229, |
|
"eval_samples_per_second": 183.352, |
|
"eval_steps_per_second": 2.888, |
|
"learning_rate": 0.0001, |
|
"step": 3978 |
|
}, |
|
{ |
|
"epoch": 18.099547511312217, |
|
"grad_norm": 0.048517756164073944, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4758, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_explained_variance": 0.6507542729377747, |
|
"eval_kl_divergence": 0.04677804559469223, |
|
"eval_loss": 0.4555513262748718, |
|
"eval_mae": 0.06473750621080399, |
|
"eval_rmse": 0.08847790211439133, |
|
"eval_runtime": 25.7638, |
|
"eval_samples_per_second": 182.349, |
|
"eval_steps_per_second": 2.872, |
|
"learning_rate": 0.0001, |
|
"step": 4199 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_explained_variance": 0.6518434882164001, |
|
"eval_kl_divergence": 0.0404924675822258, |
|
"eval_loss": 0.45553284883499146, |
|
"eval_mae": 0.06476090103387833, |
|
"eval_rmse": 0.08838176727294922, |
|
"eval_runtime": 25.6331, |
|
"eval_samples_per_second": 183.279, |
|
"eval_steps_per_second": 2.887, |
|
"learning_rate": 0.0001, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 20.361990950226243, |
|
"grad_norm": 0.04679996892809868, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4741, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_explained_variance": 0.6532743573188782, |
|
"eval_kl_divergence": 0.047539714723825455, |
|
"eval_loss": 0.4555487334728241, |
|
"eval_mae": 0.06497333198785782, |
|
"eval_rmse": 0.08836204558610916, |
|
"eval_runtime": 25.803, |
|
"eval_samples_per_second": 182.072, |
|
"eval_steps_per_second": 2.868, |
|
"learning_rate": 0.0001, |
|
"step": 4641 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_explained_variance": 0.6534684300422668, |
|
"eval_kl_divergence": 0.0570099912583828, |
|
"eval_loss": 0.45551028847694397, |
|
"eval_mae": 0.06458985060453415, |
|
"eval_rmse": 0.08831282705068588, |
|
"eval_runtime": 25.9625, |
|
"eval_samples_per_second": 180.953, |
|
"eval_steps_per_second": 2.85, |
|
"learning_rate": 0.0001, |
|
"step": 4862 |
|
}, |
|
{ |
|
"epoch": 22.624434389140273, |
|
"grad_norm": 0.05471302196383476, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4738, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_explained_variance": 0.6569964289665222, |
|
"eval_kl_divergence": 0.08867427706718445, |
|
"eval_loss": 0.45505577325820923, |
|
"eval_mae": 0.0640987753868103, |
|
"eval_rmse": 0.08740502595901489, |
|
"eval_runtime": 25.8915, |
|
"eval_samples_per_second": 181.45, |
|
"eval_steps_per_second": 2.858, |
|
"learning_rate": 0.0001, |
|
"step": 5083 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_explained_variance": 0.6552526354789734, |
|
"eval_kl_divergence": 0.055539198219776154, |
|
"eval_loss": 0.4552234709262848, |
|
"eval_mae": 0.06417837738990784, |
|
"eval_rmse": 0.08780523389577866, |
|
"eval_runtime": 27.2231, |
|
"eval_samples_per_second": 172.574, |
|
"eval_steps_per_second": 2.718, |
|
"learning_rate": 0.0001, |
|
"step": 5304 |
|
}, |
|
{ |
|
"epoch": 24.8868778280543, |
|
"grad_norm": 0.0545237734913826, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4736, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_explained_variance": 0.6582456231117249, |
|
"eval_kl_divergence": 0.023763582110404968, |
|
"eval_loss": 0.45521080493927, |
|
"eval_mae": 0.06447087973356247, |
|
"eval_rmse": 0.08778873831033707, |
|
"eval_runtime": 25.7982, |
|
"eval_samples_per_second": 182.106, |
|
"eval_steps_per_second": 2.868, |
|
"learning_rate": 0.0001, |
|
"step": 5525 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_explained_variance": 0.6571853756904602, |
|
"eval_kl_divergence": 0.040941931307315826, |
|
"eval_loss": 0.4557025730609894, |
|
"eval_mae": 0.06462270766496658, |
|
"eval_rmse": 0.08846313506364822, |
|
"eval_runtime": 25.5822, |
|
"eval_samples_per_second": 183.643, |
|
"eval_steps_per_second": 2.893, |
|
"learning_rate": 0.0001, |
|
"step": 5746 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_explained_variance": 0.6576172709465027, |
|
"eval_kl_divergence": 0.05476689711213112, |
|
"eval_loss": 0.4550967216491699, |
|
"eval_mae": 0.06391049176454544, |
|
"eval_rmse": 0.08758416771888733, |
|
"eval_runtime": 26.0908, |
|
"eval_samples_per_second": 180.064, |
|
"eval_steps_per_second": 2.836, |
|
"learning_rate": 0.0001, |
|
"step": 5967 |
|
}, |
|
{ |
|
"epoch": 27.149321266968325, |
|
"grad_norm": 0.05160004645586014, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4731, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_explained_variance": 0.658767580986023, |
|
"eval_kl_divergence": 0.027325255796313286, |
|
"eval_loss": 0.45512688159942627, |
|
"eval_mae": 0.0641704872250557, |
|
"eval_rmse": 0.08764084428548813, |
|
"eval_runtime": 25.6818, |
|
"eval_samples_per_second": 182.931, |
|
"eval_steps_per_second": 2.881, |
|
"learning_rate": 0.0001, |
|
"step": 6188 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_explained_variance": 0.6617770195007324, |
|
"eval_kl_divergence": 0.0744185745716095, |
|
"eval_loss": 0.45477041602134705, |
|
"eval_mae": 0.0634256973862648, |
|
"eval_rmse": 0.08693012595176697, |
|
"eval_runtime": 25.726, |
|
"eval_samples_per_second": 182.617, |
|
"eval_steps_per_second": 2.876, |
|
"learning_rate": 0.0001, |
|
"step": 6409 |
|
}, |
|
{ |
|
"epoch": 29.41176470588235, |
|
"grad_norm": 0.07741276919841766, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4727, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_explained_variance": 0.6594749093055725, |
|
"eval_kl_divergence": 0.049223385751247406, |
|
"eval_loss": 0.4549327790737152, |
|
"eval_mae": 0.06360659003257751, |
|
"eval_rmse": 0.0873405933380127, |
|
"eval_runtime": 25.4772, |
|
"eval_samples_per_second": 184.4, |
|
"eval_steps_per_second": 2.905, |
|
"learning_rate": 0.0001, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_explained_variance": 0.6613443493843079, |
|
"eval_kl_divergence": 0.06878047436475754, |
|
"eval_loss": 0.4547973871231079, |
|
"eval_mae": 0.06322694569826126, |
|
"eval_rmse": 0.08694975823163986, |
|
"eval_runtime": 25.8257, |
|
"eval_samples_per_second": 181.912, |
|
"eval_steps_per_second": 2.865, |
|
"learning_rate": 0.0001, |
|
"step": 6851 |
|
}, |
|
{ |
|
"epoch": 31.67420814479638, |
|
"grad_norm": 0.055884115397930145, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4732, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_explained_variance": 0.6602151393890381, |
|
"eval_kl_divergence": 0.027085499837994576, |
|
"eval_loss": 0.454988956451416, |
|
"eval_mae": 0.063857302069664, |
|
"eval_rmse": 0.08743549138307571, |
|
"eval_runtime": 25.6292, |
|
"eval_samples_per_second": 183.307, |
|
"eval_steps_per_second": 2.887, |
|
"learning_rate": 0.0001, |
|
"step": 7072 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_explained_variance": 0.6580324172973633, |
|
"eval_kl_divergence": -0.017361771315336227, |
|
"eval_loss": 0.455375999212265, |
|
"eval_mae": 0.0646858736872673, |
|
"eval_rmse": 0.08816961199045181, |
|
"eval_runtime": 25.8246, |
|
"eval_samples_per_second": 181.919, |
|
"eval_steps_per_second": 2.865, |
|
"learning_rate": 0.0001, |
|
"step": 7293 |
|
}, |
|
{ |
|
"epoch": 33.93665158371041, |
|
"grad_norm": 0.08047891408205032, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4725, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_explained_variance": 0.6616186499595642, |
|
"eval_kl_divergence": 0.10939505696296692, |
|
"eval_loss": 0.45461305975914, |
|
"eval_mae": 0.0628495141863823, |
|
"eval_rmse": 0.08664888888597488, |
|
"eval_runtime": 25.7346, |
|
"eval_samples_per_second": 182.556, |
|
"eval_steps_per_second": 2.876, |
|
"learning_rate": 0.0001, |
|
"step": 7514 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_explained_variance": 0.6582692265510559, |
|
"eval_kl_divergence": 0.05707371234893799, |
|
"eval_loss": 0.45498156547546387, |
|
"eval_mae": 0.06386271119117737, |
|
"eval_rmse": 0.08741921186447144, |
|
"eval_runtime": 25.7857, |
|
"eval_samples_per_second": 182.194, |
|
"eval_steps_per_second": 2.87, |
|
"learning_rate": 0.0001, |
|
"step": 7735 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_explained_variance": 0.6615896224975586, |
|
"eval_kl_divergence": 0.14533284306526184, |
|
"eval_loss": 0.4548388123512268, |
|
"eval_mae": 0.0629100501537323, |
|
"eval_rmse": 0.08686337620019913, |
|
"eval_runtime": 29.7733, |
|
"eval_samples_per_second": 157.793, |
|
"eval_steps_per_second": 2.485, |
|
"learning_rate": 0.0001, |
|
"step": 7956 |
|
}, |
|
{ |
|
"epoch": 36.199095022624434, |
|
"grad_norm": 0.07811417430639267, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4727, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_explained_variance": 0.6586756110191345, |
|
"eval_kl_divergence": -0.015241213142871857, |
|
"eval_loss": 0.45526784658432007, |
|
"eval_mae": 0.06451455503702164, |
|
"eval_rmse": 0.08806425333023071, |
|
"eval_runtime": 25.6924, |
|
"eval_samples_per_second": 182.855, |
|
"eval_steps_per_second": 2.88, |
|
"learning_rate": 0.0001, |
|
"step": 8177 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_explained_variance": 0.6612560153007507, |
|
"eval_kl_divergence": 0.049000147730112076, |
|
"eval_loss": 0.45479556918144226, |
|
"eval_mae": 0.06361590325832367, |
|
"eval_rmse": 0.08704841136932373, |
|
"eval_runtime": 26.1103, |
|
"eval_samples_per_second": 179.929, |
|
"eval_steps_per_second": 2.834, |
|
"learning_rate": 0.0001, |
|
"step": 8398 |
|
}, |
|
{ |
|
"epoch": 38.46153846153846, |
|
"grad_norm": 0.062047556042671204, |
|
"learning_rate": 0.0001, |
|
"loss": 0.4727, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_explained_variance": 0.6610231995582581, |
|
"eval_kl_divergence": 0.07255241274833679, |
|
"eval_loss": 0.454780250787735, |
|
"eval_mae": 0.06311424821615219, |
|
"eval_rmse": 0.08698847889900208, |
|
"eval_runtime": 25.6403, |
|
"eval_samples_per_second": 183.227, |
|
"eval_steps_per_second": 2.886, |
|
"learning_rate": 0.0001, |
|
"step": 8619 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_explained_variance": 0.6605435013771057, |
|
"eval_kl_divergence": 0.06372024863958359, |
|
"eval_loss": 0.45476558804512024, |
|
"eval_mae": 0.06323693692684174, |
|
"eval_rmse": 0.08702895045280457, |
|
"eval_runtime": 26.038, |
|
"eval_samples_per_second": 180.429, |
|
"eval_steps_per_second": 2.842, |
|
"learning_rate": 0.0001, |
|
"step": 8840 |
|
}, |
|
{ |
|
"epoch": 40.723981900452486, |
|
"grad_norm": 0.08612842857837677, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4721, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_explained_variance": 0.6628013253211975, |
|
"eval_kl_divergence": 0.039023660123348236, |
|
"eval_loss": 0.45470812916755676, |
|
"eval_mae": 0.0634213536977768, |
|
"eval_rmse": 0.08692529052495956, |
|
"eval_runtime": 25.9883, |
|
"eval_samples_per_second": 180.774, |
|
"eval_steps_per_second": 2.847, |
|
"learning_rate": 1e-05, |
|
"step": 9061 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_explained_variance": 0.6656690239906311, |
|
"eval_kl_divergence": 0.11149828135967255, |
|
"eval_loss": 0.4543863534927368, |
|
"eval_mae": 0.06281669437885284, |
|
"eval_rmse": 0.08619723469018936, |
|
"eval_runtime": 26.3115, |
|
"eval_samples_per_second": 178.553, |
|
"eval_steps_per_second": 2.812, |
|
"learning_rate": 1e-05, |
|
"step": 9282 |
|
}, |
|
{ |
|
"epoch": 42.98642533936652, |
|
"grad_norm": 0.06828662008047104, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4721, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_explained_variance": 0.6645870804786682, |
|
"eval_kl_divergence": 0.05330301821231842, |
|
"eval_loss": 0.4545557498931885, |
|
"eval_mae": 0.06320130825042725, |
|
"eval_rmse": 0.0865868553519249, |
|
"eval_runtime": 25.8985, |
|
"eval_samples_per_second": 181.4, |
|
"eval_steps_per_second": 2.857, |
|
"learning_rate": 1e-05, |
|
"step": 9503 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_explained_variance": 0.6648023128509521, |
|
"eval_kl_divergence": 0.13496889173984528, |
|
"eval_loss": 0.45448434352874756, |
|
"eval_mae": 0.06253467500209808, |
|
"eval_rmse": 0.08635282516479492, |
|
"eval_runtime": 26.0508, |
|
"eval_samples_per_second": 180.34, |
|
"eval_steps_per_second": 2.841, |
|
"learning_rate": 1e-05, |
|
"step": 9724 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_explained_variance": 0.6624875068664551, |
|
"eval_kl_divergence": 0.004431928042322397, |
|
"eval_loss": 0.4550137519836426, |
|
"eval_mae": 0.06418145447969437, |
|
"eval_rmse": 0.0874209776520729, |
|
"eval_runtime": 25.8495, |
|
"eval_samples_per_second": 181.744, |
|
"eval_steps_per_second": 2.863, |
|
"learning_rate": 1e-05, |
|
"step": 9945 |
|
}, |
|
{ |
|
"epoch": 45.248868778280546, |
|
"grad_norm": 0.07514863461256027, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4716, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_explained_variance": 0.6642169952392578, |
|
"eval_kl_divergence": 0.03887256979942322, |
|
"eval_loss": 0.4545902609825134, |
|
"eval_mae": 0.06316760927438736, |
|
"eval_rmse": 0.08669499307870865, |
|
"eval_runtime": 25.9222, |
|
"eval_samples_per_second": 181.235, |
|
"eval_steps_per_second": 2.855, |
|
"learning_rate": 1e-05, |
|
"step": 10166 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_explained_variance": 0.6651113629341125, |
|
"eval_kl_divergence": 0.037030890583992004, |
|
"eval_loss": 0.4544997215270996, |
|
"eval_mae": 0.06298934668302536, |
|
"eval_rmse": 0.0865601971745491, |
|
"eval_runtime": 25.9565, |
|
"eval_samples_per_second": 180.995, |
|
"eval_steps_per_second": 2.851, |
|
"learning_rate": 1e-05, |
|
"step": 10387 |
|
}, |
|
{ |
|
"epoch": 47.51131221719457, |
|
"grad_norm": 0.057216282933950424, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4722, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_explained_variance": 0.6645199060440063, |
|
"eval_kl_divergence": 0.019425788894295692, |
|
"eval_loss": 0.4546374976634979, |
|
"eval_mae": 0.06339576095342636, |
|
"eval_rmse": 0.08680880069732666, |
|
"eval_runtime": 25.7117, |
|
"eval_samples_per_second": 182.718, |
|
"eval_steps_per_second": 2.878, |
|
"learning_rate": 1e-05, |
|
"step": 10608 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_explained_variance": 0.6666774153709412, |
|
"eval_kl_divergence": 0.0667150691151619, |
|
"eval_loss": 0.45436596870422363, |
|
"eval_mae": 0.06269881874322891, |
|
"eval_rmse": 0.08620164543390274, |
|
"eval_runtime": 27.6905, |
|
"eval_samples_per_second": 169.661, |
|
"eval_steps_per_second": 2.672, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 10829 |
|
}, |
|
{ |
|
"epoch": 49.7737556561086, |
|
"grad_norm": 0.07466714084148407, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.4717, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_explained_variance": 0.6650940179824829, |
|
"eval_kl_divergence": 0.05483337119221687, |
|
"eval_loss": 0.45450592041015625, |
|
"eval_mae": 0.06310971826314926, |
|
"eval_rmse": 0.08650273084640503, |
|
"eval_runtime": 27.7128, |
|
"eval_samples_per_second": 169.524, |
|
"eval_steps_per_second": 2.67, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_explained_variance": 0.6651105284690857, |
|
"eval_kl_divergence": 0.04277108237147331, |
|
"eval_loss": 0.4544804096221924, |
|
"eval_mae": 0.06292647123336792, |
|
"eval_rmse": 0.08647629618644714, |
|
"eval_runtime": 26.6553, |
|
"eval_samples_per_second": 176.25, |
|
"eval_steps_per_second": 2.776, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 11271 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_explained_variance": 0.667234480381012, |
|
"eval_kl_divergence": 0.12364839017391205, |
|
"eval_loss": 0.45421910285949707, |
|
"eval_mae": 0.06233237311244011, |
|
"eval_rmse": 0.08589440584182739, |
|
"eval_runtime": 25.8544, |
|
"eval_samples_per_second": 181.71, |
|
"eval_steps_per_second": 2.862, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 11492 |
|
}, |
|
{ |
|
"epoch": 52.036199095022624, |
|
"grad_norm": 0.08442794531583786, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.4718, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_explained_variance": 0.6671742796897888, |
|
"eval_kl_divergence": 0.08869530260562897, |
|
"eval_loss": 0.4542272686958313, |
|
"eval_mae": 0.06253313273191452, |
|
"eval_rmse": 0.08594661206007004, |
|
"eval_runtime": 25.9744, |
|
"eval_samples_per_second": 180.871, |
|
"eval_steps_per_second": 2.849, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 11713 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_explained_variance": 0.6653165221214294, |
|
"eval_kl_divergence": 0.09171402454376221, |
|
"eval_loss": 0.4543103575706482, |
|
"eval_mae": 0.0623968206346035, |
|
"eval_rmse": 0.08615261316299438, |
|
"eval_runtime": 26.0699, |
|
"eval_samples_per_second": 180.208, |
|
"eval_steps_per_second": 2.839, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 11934 |
|
}, |
|
{ |
|
"epoch": 54.29864253393665, |
|
"grad_norm": 0.08775485306978226, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.4716, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_explained_variance": 0.6649713516235352, |
|
"eval_kl_divergence": 0.07737051695585251, |
|
"eval_loss": 0.45456644892692566, |
|
"eval_mae": 0.06305743753910065, |
|
"eval_rmse": 0.0865490511059761, |
|
"eval_runtime": 26.0104, |
|
"eval_samples_per_second": 180.62, |
|
"eval_steps_per_second": 2.845, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 12155 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_explained_variance": 0.6649186611175537, |
|
"eval_kl_divergence": 0.04731013998389244, |
|
"eval_loss": 0.45458319783210754, |
|
"eval_mae": 0.06328658014535904, |
|
"eval_rmse": 0.08663744479417801, |
|
"eval_runtime": 25.8104, |
|
"eval_samples_per_second": 182.019, |
|
"eval_steps_per_second": 2.867, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 12376 |
|
}, |
|
{ |
|
"epoch": 56.56108597285068, |
|
"grad_norm": 0.0692247599363327, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.4717, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_explained_variance": 0.6657507419586182, |
|
"eval_kl_divergence": -0.004581684246659279, |
|
"eval_loss": 0.4548773169517517, |
|
"eval_mae": 0.0639243796467781, |
|
"eval_rmse": 0.0871059000492096, |
|
"eval_runtime": 25.4962, |
|
"eval_samples_per_second": 184.262, |
|
"eval_steps_per_second": 2.902, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 12597 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_explained_variance": 0.6655800342559814, |
|
"eval_kl_divergence": 0.0553017221391201, |
|
"eval_loss": 0.45440155267715454, |
|
"eval_mae": 0.06271661818027496, |
|
"eval_rmse": 0.08635643124580383, |
|
"eval_runtime": 26.1057, |
|
"eval_samples_per_second": 179.961, |
|
"eval_steps_per_second": 2.835, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"step": 12818 |
|
}, |
|
{ |
|
"epoch": 58.8235294117647, |
|
"grad_norm": 0.07922232896089554, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"loss": 0.4716, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_explained_variance": 0.6654148101806641, |
|
"eval_kl_divergence": 0.03675610199570656, |
|
"eval_loss": 0.45448538661003113, |
|
"eval_mae": 0.06308572739362717, |
|
"eval_rmse": 0.08650225400924683, |
|
"eval_runtime": 25.8122, |
|
"eval_samples_per_second": 182.007, |
|
"eval_steps_per_second": 2.867, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"step": 13039 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_explained_variance": 0.6660366058349609, |
|
"eval_kl_divergence": 0.047148581594228745, |
|
"eval_loss": 0.4544091522693634, |
|
"eval_mae": 0.06294982880353928, |
|
"eval_rmse": 0.08633282780647278, |
|
"eval_runtime": 26.4937, |
|
"eval_samples_per_second": 177.325, |
|
"eval_steps_per_second": 2.793, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"step": 13260 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_explained_variance": 0.6669723987579346, |
|
"eval_kl_divergence": 0.09280110895633698, |
|
"eval_loss": 0.4542348086833954, |
|
"eval_mae": 0.062441930174827576, |
|
"eval_rmse": 0.08595842123031616, |
|
"eval_runtime": 26.0483, |
|
"eval_samples_per_second": 180.357, |
|
"eval_steps_per_second": 2.841, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"step": 13481 |
|
}, |
|
{ |
|
"epoch": 61.085972850678736, |
|
"grad_norm": 0.07845129072666168, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"loss": 0.4718, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_explained_variance": 0.6661055088043213, |
|
"eval_kl_divergence": 0.028626998886466026, |
|
"eval_loss": 0.4545469284057617, |
|
"eval_mae": 0.06315190345048904, |
|
"eval_rmse": 0.0865735188126564, |
|
"eval_runtime": 25.8503, |
|
"eval_samples_per_second": 181.739, |
|
"eval_steps_per_second": 2.863, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"step": 13702 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"learning_rate": 1.0000000000000002e-07, |
|
"step": 13702, |
|
"total_flos": 9.42369297866869e+19, |
|
"train_loss": 0.4754439868851833, |
|
"train_runtime": 8961.4221, |
|
"train_samples_per_second": 235.894, |
|
"train_steps_per_second": 3.699 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 33150, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 150, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 10, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.42369297866869e+19, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|