{ "best_metric": 0.45421910285949707, "best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/bd_ortho_ign/bd_ortho-DinoVdeau-large-2024_11_27-batch-size64_freeze_probs/checkpoint-11492", "epoch": 62.0, "eval_steps": 500, "global_step": 13702, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_explained_variance": 0.5492395758628845, "eval_kl_divergence": 0.06964559853076935, "eval_loss": 0.46336060762405396, "eval_mae": 0.07600608468055725, "eval_rmse": 0.10175278037786484, "eval_runtime": 26.595, "eval_samples_per_second": 176.65, "eval_steps_per_second": 2.782, "learning_rate": 0.001, "step": 221 }, { "epoch": 2.0, "eval_explained_variance": 0.6113448739051819, "eval_kl_divergence": 0.0038063330575823784, "eval_loss": 0.45933997631073, "eval_mae": 0.07159148901700974, "eval_rmse": 0.09520163387060165, "eval_runtime": 25.5426, "eval_samples_per_second": 183.928, "eval_steps_per_second": 2.897, "learning_rate": 0.001, "step": 442 }, { "epoch": 2.262443438914027, "grad_norm": 0.16188210248947144, "learning_rate": 0.001, "loss": 0.5185, "step": 500 }, { "epoch": 3.0, "eval_explained_variance": 0.6245184540748596, "eval_kl_divergence": 0.05826142057776451, "eval_loss": 0.457367479801178, "eval_mae": 0.0670078918337822, "eval_rmse": 0.0917908325791359, "eval_runtime": 25.6126, "eval_samples_per_second": 183.425, "eval_steps_per_second": 2.889, "learning_rate": 0.001, "step": 663 }, { "epoch": 4.0, "eval_explained_variance": 0.6129782795906067, "eval_kl_divergence": -0.06495417654514313, "eval_loss": 0.459468811750412, "eval_mae": 0.07134346663951874, "eval_rmse": 0.09552835673093796, "eval_runtime": 25.6003, "eval_samples_per_second": 183.514, "eval_steps_per_second": 2.891, "learning_rate": 0.001, "step": 884 }, { "epoch": 4.524886877828054, "grad_norm": 0.09988280385732651, "learning_rate": 0.001, "loss": 0.4806, "step": 1000 }, { "epoch": 5.0, "eval_explained_variance": 0.6206489205360413, "eval_kl_divergence": -0.08347146958112717, "eval_loss": 0.45927393436431885, "eval_mae": 0.07016489654779434, "eval_rmse": 0.0953657403588295, "eval_runtime": 25.74, "eval_samples_per_second": 182.518, "eval_steps_per_second": 2.875, "learning_rate": 0.001, "step": 1105 }, { "epoch": 6.0, "eval_explained_variance": 0.6041414737701416, "eval_kl_divergence": -0.07046143710613251, "eval_loss": 0.46080395579338074, "eval_mae": 0.07277411222457886, "eval_rmse": 0.09773259609937668, "eval_runtime": 25.4681, "eval_samples_per_second": 184.466, "eval_steps_per_second": 2.906, "learning_rate": 0.001, "step": 1326 }, { "epoch": 6.787330316742081, "grad_norm": 0.08271574974060059, "learning_rate": 0.001, "loss": 0.4786, "step": 1500 }, { "epoch": 7.0, "eval_explained_variance": 0.628325879573822, "eval_kl_divergence": -0.004442690871655941, "eval_loss": 0.4581476151943207, "eval_mae": 0.06827609241008759, "eval_rmse": 0.09274852275848389, "eval_runtime": 26.0251, "eval_samples_per_second": 180.518, "eval_steps_per_second": 2.843, "learning_rate": 0.001, "step": 1547 }, { "epoch": 8.0, "eval_explained_variance": 0.6276748776435852, "eval_kl_divergence": 0.07988782227039337, "eval_loss": 0.4573117196559906, "eval_mae": 0.06800529360771179, "eval_rmse": 0.09162522107362747, "eval_runtime": 25.7197, "eval_samples_per_second": 182.662, "eval_steps_per_second": 2.877, "learning_rate": 0.001, "step": 1768 }, { "epoch": 9.0, "eval_explained_variance": 0.6196129322052002, "eval_kl_divergence": 0.02327939122915268, "eval_loss": 0.45939013361930847, "eval_mae": 0.07057134807109833, "eval_rmse": 0.09471722692251205, "eval_runtime": 25.8299, "eval_samples_per_second": 181.883, "eval_steps_per_second": 2.865, "learning_rate": 0.001, "step": 1989 }, { "epoch": 9.049773755656108, "grad_norm": 0.05649600923061371, "learning_rate": 0.001, "loss": 0.4776, "step": 2000 }, { "epoch": 10.0, "eval_explained_variance": 0.6293186545372009, "eval_kl_divergence": 0.0885055735707283, "eval_loss": 0.45772281289100647, "eval_mae": 0.06745484471321106, "eval_rmse": 0.09179002046585083, "eval_runtime": 25.5273, "eval_samples_per_second": 184.039, "eval_steps_per_second": 2.899, "learning_rate": 0.001, "step": 2210 }, { "epoch": 11.0, "eval_explained_variance": 0.6422439813613892, "eval_kl_divergence": 0.1296330839395523, "eval_loss": 0.45641985535621643, "eval_mae": 0.06617596000432968, "eval_rmse": 0.08975591510534286, "eval_runtime": 25.7282, "eval_samples_per_second": 182.601, "eval_steps_per_second": 2.876, "learning_rate": 0.001, "step": 2431 }, { "epoch": 11.312217194570136, "grad_norm": 0.04163961857557297, "learning_rate": 0.001, "loss": 0.4772, "step": 2500 }, { "epoch": 12.0, "eval_explained_variance": 0.6385617256164551, "eval_kl_divergence": -0.006057058461010456, "eval_loss": 0.45718902349472046, "eval_mae": 0.06766870617866516, "eval_rmse": 0.09130751341581345, "eval_runtime": 25.6849, "eval_samples_per_second": 182.909, "eval_steps_per_second": 2.881, "learning_rate": 0.001, "step": 2652 }, { "epoch": 13.0, "eval_explained_variance": 0.6186209321022034, "eval_kl_divergence": -0.20600058138370514, "eval_loss": 0.4622880220413208, "eval_mae": 0.07468675822019577, "eval_rmse": 0.10024455189704895, "eval_runtime": 25.9645, "eval_samples_per_second": 180.939, "eval_steps_per_second": 2.85, "learning_rate": 0.001, "step": 2873 }, { "epoch": 13.574660633484163, "grad_norm": 0.0532899908721447, "learning_rate": 0.001, "loss": 0.4769, "step": 3000 }, { "epoch": 14.0, "eval_explained_variance": 0.6346250176429749, "eval_kl_divergence": -0.0371401272714138, "eval_loss": 0.45775285363197327, "eval_mae": 0.06778896600008011, "eval_rmse": 0.092497818171978, "eval_runtime": 25.7017, "eval_samples_per_second": 182.79, "eval_steps_per_second": 2.879, "learning_rate": 0.001, "step": 3094 }, { "epoch": 15.0, "eval_explained_variance": 0.6340083479881287, "eval_kl_divergence": 0.04575105383992195, "eval_loss": 0.4575214684009552, "eval_mae": 0.0666513592004776, "eval_rmse": 0.0916559174656868, "eval_runtime": 26.025, "eval_samples_per_second": 180.519, "eval_steps_per_second": 2.843, "learning_rate": 0.001, "step": 3315 }, { "epoch": 15.83710407239819, "grad_norm": 0.0473792664706707, "learning_rate": 0.001, "loss": 0.4766, "step": 3500 }, { "epoch": 16.0, "eval_explained_variance": 0.6277230381965637, "eval_kl_divergence": 0.01510859839618206, "eval_loss": 0.4578736424446106, "eval_mae": 0.06800080835819244, "eval_rmse": 0.09264300018548965, "eval_runtime": 25.6671, "eval_samples_per_second": 183.036, "eval_steps_per_second": 2.883, "learning_rate": 0.001, "step": 3536 }, { "epoch": 17.0, "eval_explained_variance": 0.6246375441551208, "eval_kl_divergence": -0.06794208288192749, "eval_loss": 0.4592094421386719, "eval_mae": 0.07020581513643265, "eval_rmse": 0.09485668689012527, "eval_runtime": 25.9387, "eval_samples_per_second": 181.119, "eval_steps_per_second": 2.853, "learning_rate": 0.001, "step": 3757 }, { "epoch": 18.0, "eval_explained_variance": 0.6493042707443237, "eval_kl_divergence": 0.04208216443657875, "eval_loss": 0.45573291182518005, "eval_mae": 0.06506813317537308, "eval_rmse": 0.08873652666807175, "eval_runtime": 25.6229, "eval_samples_per_second": 183.352, "eval_steps_per_second": 2.888, "learning_rate": 0.0001, "step": 3978 }, { "epoch": 18.099547511312217, "grad_norm": 0.048517756164073944, "learning_rate": 0.0001, "loss": 0.4758, "step": 4000 }, { "epoch": 19.0, "eval_explained_variance": 0.6507542729377747, "eval_kl_divergence": 0.04677804559469223, "eval_loss": 0.4555513262748718, "eval_mae": 0.06473750621080399, "eval_rmse": 0.08847790211439133, "eval_runtime": 25.7638, "eval_samples_per_second": 182.349, "eval_steps_per_second": 2.872, "learning_rate": 0.0001, "step": 4199 }, { "epoch": 20.0, "eval_explained_variance": 0.6518434882164001, "eval_kl_divergence": 0.0404924675822258, "eval_loss": 0.45553284883499146, "eval_mae": 0.06476090103387833, "eval_rmse": 0.08838176727294922, "eval_runtime": 25.6331, "eval_samples_per_second": 183.279, "eval_steps_per_second": 2.887, "learning_rate": 0.0001, "step": 4420 }, { "epoch": 20.361990950226243, "grad_norm": 0.04679996892809868, "learning_rate": 0.0001, "loss": 0.4741, "step": 4500 }, { "epoch": 21.0, "eval_explained_variance": 0.6532743573188782, "eval_kl_divergence": 0.047539714723825455, "eval_loss": 0.4555487334728241, "eval_mae": 0.06497333198785782, "eval_rmse": 0.08836204558610916, "eval_runtime": 25.803, "eval_samples_per_second": 182.072, "eval_steps_per_second": 2.868, "learning_rate": 0.0001, "step": 4641 }, { "epoch": 22.0, "eval_explained_variance": 0.6534684300422668, "eval_kl_divergence": 0.0570099912583828, "eval_loss": 0.45551028847694397, "eval_mae": 0.06458985060453415, "eval_rmse": 0.08831282705068588, "eval_runtime": 25.9625, "eval_samples_per_second": 180.953, "eval_steps_per_second": 2.85, "learning_rate": 0.0001, "step": 4862 }, { "epoch": 22.624434389140273, "grad_norm": 0.05471302196383476, "learning_rate": 0.0001, "loss": 0.4738, "step": 5000 }, { "epoch": 23.0, "eval_explained_variance": 0.6569964289665222, "eval_kl_divergence": 0.08867427706718445, "eval_loss": 0.45505577325820923, "eval_mae": 0.0640987753868103, "eval_rmse": 0.08740502595901489, "eval_runtime": 25.8915, "eval_samples_per_second": 181.45, "eval_steps_per_second": 2.858, "learning_rate": 0.0001, "step": 5083 }, { "epoch": 24.0, "eval_explained_variance": 0.6552526354789734, "eval_kl_divergence": 0.055539198219776154, "eval_loss": 0.4552234709262848, "eval_mae": 0.06417837738990784, "eval_rmse": 0.08780523389577866, "eval_runtime": 27.2231, "eval_samples_per_second": 172.574, "eval_steps_per_second": 2.718, "learning_rate": 0.0001, "step": 5304 }, { "epoch": 24.8868778280543, "grad_norm": 0.0545237734913826, "learning_rate": 0.0001, "loss": 0.4736, "step": 5500 }, { "epoch": 25.0, "eval_explained_variance": 0.6582456231117249, "eval_kl_divergence": 0.023763582110404968, "eval_loss": 0.45521080493927, "eval_mae": 0.06447087973356247, "eval_rmse": 0.08778873831033707, "eval_runtime": 25.7982, "eval_samples_per_second": 182.106, "eval_steps_per_second": 2.868, "learning_rate": 0.0001, "step": 5525 }, { "epoch": 26.0, "eval_explained_variance": 0.6571853756904602, "eval_kl_divergence": 0.040941931307315826, "eval_loss": 0.4557025730609894, "eval_mae": 0.06462270766496658, "eval_rmse": 0.08846313506364822, "eval_runtime": 25.5822, "eval_samples_per_second": 183.643, "eval_steps_per_second": 2.893, "learning_rate": 0.0001, "step": 5746 }, { "epoch": 27.0, "eval_explained_variance": 0.6576172709465027, "eval_kl_divergence": 0.05476689711213112, "eval_loss": 0.4550967216491699, "eval_mae": 0.06391049176454544, "eval_rmse": 0.08758416771888733, "eval_runtime": 26.0908, "eval_samples_per_second": 180.064, "eval_steps_per_second": 2.836, "learning_rate": 0.0001, "step": 5967 }, { "epoch": 27.149321266968325, "grad_norm": 0.05160004645586014, "learning_rate": 0.0001, "loss": 0.4731, "step": 6000 }, { "epoch": 28.0, "eval_explained_variance": 0.658767580986023, "eval_kl_divergence": 0.027325255796313286, "eval_loss": 0.45512688159942627, "eval_mae": 0.0641704872250557, "eval_rmse": 0.08764084428548813, "eval_runtime": 25.6818, "eval_samples_per_second": 182.931, "eval_steps_per_second": 2.881, "learning_rate": 0.0001, "step": 6188 }, { "epoch": 29.0, "eval_explained_variance": 0.6617770195007324, "eval_kl_divergence": 0.0744185745716095, "eval_loss": 0.45477041602134705, "eval_mae": 0.0634256973862648, "eval_rmse": 0.08693012595176697, "eval_runtime": 25.726, "eval_samples_per_second": 182.617, "eval_steps_per_second": 2.876, "learning_rate": 0.0001, "step": 6409 }, { "epoch": 29.41176470588235, "grad_norm": 0.07741276919841766, "learning_rate": 0.0001, "loss": 0.4727, "step": 6500 }, { "epoch": 30.0, "eval_explained_variance": 0.6594749093055725, "eval_kl_divergence": 0.049223385751247406, "eval_loss": 0.4549327790737152, "eval_mae": 0.06360659003257751, "eval_rmse": 0.0873405933380127, "eval_runtime": 25.4772, "eval_samples_per_second": 184.4, "eval_steps_per_second": 2.905, "learning_rate": 0.0001, "step": 6630 }, { "epoch": 31.0, "eval_explained_variance": 0.6613443493843079, "eval_kl_divergence": 0.06878047436475754, "eval_loss": 0.4547973871231079, "eval_mae": 0.06322694569826126, "eval_rmse": 0.08694975823163986, "eval_runtime": 25.8257, "eval_samples_per_second": 181.912, "eval_steps_per_second": 2.865, "learning_rate": 0.0001, "step": 6851 }, { "epoch": 31.67420814479638, "grad_norm": 0.055884115397930145, "learning_rate": 0.0001, "loss": 0.4732, "step": 7000 }, { "epoch": 32.0, "eval_explained_variance": 0.6602151393890381, "eval_kl_divergence": 0.027085499837994576, "eval_loss": 0.454988956451416, "eval_mae": 0.063857302069664, "eval_rmse": 0.08743549138307571, "eval_runtime": 25.6292, "eval_samples_per_second": 183.307, "eval_steps_per_second": 2.887, "learning_rate": 0.0001, "step": 7072 }, { "epoch": 33.0, "eval_explained_variance": 0.6580324172973633, "eval_kl_divergence": -0.017361771315336227, "eval_loss": 0.455375999212265, "eval_mae": 0.0646858736872673, "eval_rmse": 0.08816961199045181, "eval_runtime": 25.8246, "eval_samples_per_second": 181.919, "eval_steps_per_second": 2.865, "learning_rate": 0.0001, "step": 7293 }, { "epoch": 33.93665158371041, "grad_norm": 0.08047891408205032, "learning_rate": 0.0001, "loss": 0.4725, "step": 7500 }, { "epoch": 34.0, "eval_explained_variance": 0.6616186499595642, "eval_kl_divergence": 0.10939505696296692, "eval_loss": 0.45461305975914, "eval_mae": 0.0628495141863823, "eval_rmse": 0.08664888888597488, "eval_runtime": 25.7346, "eval_samples_per_second": 182.556, "eval_steps_per_second": 2.876, "learning_rate": 0.0001, "step": 7514 }, { "epoch": 35.0, "eval_explained_variance": 0.6582692265510559, "eval_kl_divergence": 0.05707371234893799, "eval_loss": 0.45498156547546387, "eval_mae": 0.06386271119117737, "eval_rmse": 0.08741921186447144, "eval_runtime": 25.7857, "eval_samples_per_second": 182.194, "eval_steps_per_second": 2.87, "learning_rate": 0.0001, "step": 7735 }, { "epoch": 36.0, "eval_explained_variance": 0.6615896224975586, "eval_kl_divergence": 0.14533284306526184, "eval_loss": 0.4548388123512268, "eval_mae": 0.0629100501537323, "eval_rmse": 0.08686337620019913, "eval_runtime": 29.7733, "eval_samples_per_second": 157.793, "eval_steps_per_second": 2.485, "learning_rate": 0.0001, "step": 7956 }, { "epoch": 36.199095022624434, "grad_norm": 0.07811417430639267, "learning_rate": 0.0001, "loss": 0.4727, "step": 8000 }, { "epoch": 37.0, "eval_explained_variance": 0.6586756110191345, "eval_kl_divergence": -0.015241213142871857, "eval_loss": 0.45526784658432007, "eval_mae": 0.06451455503702164, "eval_rmse": 0.08806425333023071, "eval_runtime": 25.6924, "eval_samples_per_second": 182.855, "eval_steps_per_second": 2.88, "learning_rate": 0.0001, "step": 8177 }, { "epoch": 38.0, "eval_explained_variance": 0.6612560153007507, "eval_kl_divergence": 0.049000147730112076, "eval_loss": 0.45479556918144226, "eval_mae": 0.06361590325832367, "eval_rmse": 0.08704841136932373, "eval_runtime": 26.1103, "eval_samples_per_second": 179.929, "eval_steps_per_second": 2.834, "learning_rate": 0.0001, "step": 8398 }, { "epoch": 38.46153846153846, "grad_norm": 0.062047556042671204, "learning_rate": 0.0001, "loss": 0.4727, "step": 8500 }, { "epoch": 39.0, "eval_explained_variance": 0.6610231995582581, "eval_kl_divergence": 0.07255241274833679, "eval_loss": 0.454780250787735, "eval_mae": 0.06311424821615219, "eval_rmse": 0.08698847889900208, "eval_runtime": 25.6403, "eval_samples_per_second": 183.227, "eval_steps_per_second": 2.886, "learning_rate": 0.0001, "step": 8619 }, { "epoch": 40.0, "eval_explained_variance": 0.6605435013771057, "eval_kl_divergence": 0.06372024863958359, "eval_loss": 0.45476558804512024, "eval_mae": 0.06323693692684174, "eval_rmse": 0.08702895045280457, "eval_runtime": 26.038, "eval_samples_per_second": 180.429, "eval_steps_per_second": 2.842, "learning_rate": 0.0001, "step": 8840 }, { "epoch": 40.723981900452486, "grad_norm": 0.08612842857837677, "learning_rate": 1e-05, "loss": 0.4721, "step": 9000 }, { "epoch": 41.0, "eval_explained_variance": 0.6628013253211975, "eval_kl_divergence": 0.039023660123348236, "eval_loss": 0.45470812916755676, "eval_mae": 0.0634213536977768, "eval_rmse": 0.08692529052495956, "eval_runtime": 25.9883, "eval_samples_per_second": 180.774, "eval_steps_per_second": 2.847, "learning_rate": 1e-05, "step": 9061 }, { "epoch": 42.0, "eval_explained_variance": 0.6656690239906311, "eval_kl_divergence": 0.11149828135967255, "eval_loss": 0.4543863534927368, "eval_mae": 0.06281669437885284, "eval_rmse": 0.08619723469018936, "eval_runtime": 26.3115, "eval_samples_per_second": 178.553, "eval_steps_per_second": 2.812, "learning_rate": 1e-05, "step": 9282 }, { "epoch": 42.98642533936652, "grad_norm": 0.06828662008047104, "learning_rate": 1e-05, "loss": 0.4721, "step": 9500 }, { "epoch": 43.0, "eval_explained_variance": 0.6645870804786682, "eval_kl_divergence": 0.05330301821231842, "eval_loss": 0.4545557498931885, "eval_mae": 0.06320130825042725, "eval_rmse": 0.0865868553519249, "eval_runtime": 25.8985, "eval_samples_per_second": 181.4, "eval_steps_per_second": 2.857, "learning_rate": 1e-05, "step": 9503 }, { "epoch": 44.0, "eval_explained_variance": 0.6648023128509521, "eval_kl_divergence": 0.13496889173984528, "eval_loss": 0.45448434352874756, "eval_mae": 0.06253467500209808, "eval_rmse": 0.08635282516479492, "eval_runtime": 26.0508, "eval_samples_per_second": 180.34, "eval_steps_per_second": 2.841, "learning_rate": 1e-05, "step": 9724 }, { "epoch": 45.0, "eval_explained_variance": 0.6624875068664551, "eval_kl_divergence": 0.004431928042322397, "eval_loss": 0.4550137519836426, "eval_mae": 0.06418145447969437, "eval_rmse": 0.0874209776520729, "eval_runtime": 25.8495, "eval_samples_per_second": 181.744, "eval_steps_per_second": 2.863, "learning_rate": 1e-05, "step": 9945 }, { "epoch": 45.248868778280546, "grad_norm": 0.07514863461256027, "learning_rate": 1e-05, "loss": 0.4716, "step": 10000 }, { "epoch": 46.0, "eval_explained_variance": 0.6642169952392578, "eval_kl_divergence": 0.03887256979942322, "eval_loss": 0.4545902609825134, "eval_mae": 0.06316760927438736, "eval_rmse": 0.08669499307870865, "eval_runtime": 25.9222, "eval_samples_per_second": 181.235, "eval_steps_per_second": 2.855, "learning_rate": 1e-05, "step": 10166 }, { "epoch": 47.0, "eval_explained_variance": 0.6651113629341125, "eval_kl_divergence": 0.037030890583992004, "eval_loss": 0.4544997215270996, "eval_mae": 0.06298934668302536, "eval_rmse": 0.0865601971745491, "eval_runtime": 25.9565, "eval_samples_per_second": 180.995, "eval_steps_per_second": 2.851, "learning_rate": 1e-05, "step": 10387 }, { "epoch": 47.51131221719457, "grad_norm": 0.057216282933950424, "learning_rate": 1e-05, "loss": 0.4722, "step": 10500 }, { "epoch": 48.0, "eval_explained_variance": 0.6645199060440063, "eval_kl_divergence": 0.019425788894295692, "eval_loss": 0.4546374976634979, "eval_mae": 0.06339576095342636, "eval_rmse": 0.08680880069732666, "eval_runtime": 25.7117, "eval_samples_per_second": 182.718, "eval_steps_per_second": 2.878, "learning_rate": 1e-05, "step": 10608 }, { "epoch": 49.0, "eval_explained_variance": 0.6666774153709412, "eval_kl_divergence": 0.0667150691151619, "eval_loss": 0.45436596870422363, "eval_mae": 0.06269881874322891, "eval_rmse": 0.08620164543390274, "eval_runtime": 27.6905, "eval_samples_per_second": 169.661, "eval_steps_per_second": 2.672, "learning_rate": 1.0000000000000002e-06, "step": 10829 }, { "epoch": 49.7737556561086, "grad_norm": 0.07466714084148407, "learning_rate": 1.0000000000000002e-06, "loss": 0.4717, "step": 11000 }, { "epoch": 50.0, "eval_explained_variance": 0.6650940179824829, "eval_kl_divergence": 0.05483337119221687, "eval_loss": 0.45450592041015625, "eval_mae": 0.06310971826314926, "eval_rmse": 0.08650273084640503, "eval_runtime": 27.7128, "eval_samples_per_second": 169.524, "eval_steps_per_second": 2.67, "learning_rate": 1.0000000000000002e-06, "step": 11050 }, { "epoch": 51.0, "eval_explained_variance": 0.6651105284690857, "eval_kl_divergence": 0.04277108237147331, "eval_loss": 0.4544804096221924, "eval_mae": 0.06292647123336792, "eval_rmse": 0.08647629618644714, "eval_runtime": 26.6553, "eval_samples_per_second": 176.25, "eval_steps_per_second": 2.776, "learning_rate": 1.0000000000000002e-06, "step": 11271 }, { "epoch": 52.0, "eval_explained_variance": 0.667234480381012, "eval_kl_divergence": 0.12364839017391205, "eval_loss": 0.45421910285949707, "eval_mae": 0.06233237311244011, "eval_rmse": 0.08589440584182739, "eval_runtime": 25.8544, "eval_samples_per_second": 181.71, "eval_steps_per_second": 2.862, "learning_rate": 1.0000000000000002e-06, "step": 11492 }, { "epoch": 52.036199095022624, "grad_norm": 0.08442794531583786, "learning_rate": 1.0000000000000002e-06, "loss": 0.4718, "step": 11500 }, { "epoch": 53.0, "eval_explained_variance": 0.6671742796897888, "eval_kl_divergence": 0.08869530260562897, "eval_loss": 0.4542272686958313, "eval_mae": 0.06253313273191452, "eval_rmse": 0.08594661206007004, "eval_runtime": 25.9744, "eval_samples_per_second": 180.871, "eval_steps_per_second": 2.849, "learning_rate": 1.0000000000000002e-06, "step": 11713 }, { "epoch": 54.0, "eval_explained_variance": 0.6653165221214294, "eval_kl_divergence": 0.09171402454376221, "eval_loss": 0.4543103575706482, "eval_mae": 0.0623968206346035, "eval_rmse": 0.08615261316299438, "eval_runtime": 26.0699, "eval_samples_per_second": 180.208, "eval_steps_per_second": 2.839, "learning_rate": 1.0000000000000002e-06, "step": 11934 }, { "epoch": 54.29864253393665, "grad_norm": 0.08775485306978226, "learning_rate": 1.0000000000000002e-06, "loss": 0.4716, "step": 12000 }, { "epoch": 55.0, "eval_explained_variance": 0.6649713516235352, "eval_kl_divergence": 0.07737051695585251, "eval_loss": 0.45456644892692566, "eval_mae": 0.06305743753910065, "eval_rmse": 0.0865490511059761, "eval_runtime": 26.0104, "eval_samples_per_second": 180.62, "eval_steps_per_second": 2.845, "learning_rate": 1.0000000000000002e-06, "step": 12155 }, { "epoch": 56.0, "eval_explained_variance": 0.6649186611175537, "eval_kl_divergence": 0.04731013998389244, "eval_loss": 0.45458319783210754, "eval_mae": 0.06328658014535904, "eval_rmse": 0.08663744479417801, "eval_runtime": 25.8104, "eval_samples_per_second": 182.019, "eval_steps_per_second": 2.867, "learning_rate": 1.0000000000000002e-06, "step": 12376 }, { "epoch": 56.56108597285068, "grad_norm": 0.0692247599363327, "learning_rate": 1.0000000000000002e-06, "loss": 0.4717, "step": 12500 }, { "epoch": 57.0, "eval_explained_variance": 0.6657507419586182, "eval_kl_divergence": -0.004581684246659279, "eval_loss": 0.4548773169517517, "eval_mae": 0.0639243796467781, "eval_rmse": 0.0871059000492096, "eval_runtime": 25.4962, "eval_samples_per_second": 184.262, "eval_steps_per_second": 2.902, "learning_rate": 1.0000000000000002e-06, "step": 12597 }, { "epoch": 58.0, "eval_explained_variance": 0.6655800342559814, "eval_kl_divergence": 0.0553017221391201, "eval_loss": 0.45440155267715454, "eval_mae": 0.06271661818027496, "eval_rmse": 0.08635643124580383, "eval_runtime": 26.1057, "eval_samples_per_second": 179.961, "eval_steps_per_second": 2.835, "learning_rate": 1.0000000000000002e-06, "step": 12818 }, { "epoch": 58.8235294117647, "grad_norm": 0.07922232896089554, "learning_rate": 1.0000000000000002e-07, "loss": 0.4716, "step": 13000 }, { "epoch": 59.0, "eval_explained_variance": 0.6654148101806641, "eval_kl_divergence": 0.03675610199570656, "eval_loss": 0.45448538661003113, "eval_mae": 0.06308572739362717, "eval_rmse": 0.08650225400924683, "eval_runtime": 25.8122, "eval_samples_per_second": 182.007, "eval_steps_per_second": 2.867, "learning_rate": 1.0000000000000002e-07, "step": 13039 }, { "epoch": 60.0, "eval_explained_variance": 0.6660366058349609, "eval_kl_divergence": 0.047148581594228745, "eval_loss": 0.4544091522693634, "eval_mae": 0.06294982880353928, "eval_rmse": 0.08633282780647278, "eval_runtime": 26.4937, "eval_samples_per_second": 177.325, "eval_steps_per_second": 2.793, "learning_rate": 1.0000000000000002e-07, "step": 13260 }, { "epoch": 61.0, "eval_explained_variance": 0.6669723987579346, "eval_kl_divergence": 0.09280110895633698, "eval_loss": 0.4542348086833954, "eval_mae": 0.062441930174827576, "eval_rmse": 0.08595842123031616, "eval_runtime": 26.0483, "eval_samples_per_second": 180.357, "eval_steps_per_second": 2.841, "learning_rate": 1.0000000000000002e-07, "step": 13481 }, { "epoch": 61.085972850678736, "grad_norm": 0.07845129072666168, "learning_rate": 1.0000000000000002e-07, "loss": 0.4718, "step": 13500 }, { "epoch": 62.0, "eval_explained_variance": 0.6661055088043213, "eval_kl_divergence": 0.028626998886466026, "eval_loss": 0.4545469284057617, "eval_mae": 0.06315190345048904, "eval_rmse": 0.0865735188126564, "eval_runtime": 25.8503, "eval_samples_per_second": 181.739, "eval_steps_per_second": 2.863, "learning_rate": 1.0000000000000002e-07, "step": 13702 }, { "epoch": 62.0, "learning_rate": 1.0000000000000002e-07, "step": 13702, "total_flos": 9.42369297866869e+19, "train_loss": 0.4754439868851833, "train_runtime": 8961.4221, "train_samples_per_second": 235.894, "train_steps_per_second": 3.699 } ], "logging_steps": 500, "max_steps": 33150, "num_input_tokens_seen": 0, "num_train_epochs": 150, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.42369297866869e+19, "train_batch_size": 64, "trial_name": null, "trial_params": null }