lombardata's picture
Evaluation on the test set completed on 2024_11_27.
5726e4e verified
{
"best_metric": 0.45421910285949707,
"best_model_checkpoint": "/home/datawork-iot-nos/Seatizen/models/multilabel/bd_ortho_ign/bd_ortho-DinoVdeau-large-2024_11_27-batch-size64_freeze_probs/checkpoint-11492",
"epoch": 62.0,
"eval_steps": 500,
"global_step": 13702,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"eval_explained_variance": 0.5492395758628845,
"eval_kl_divergence": 0.06964559853076935,
"eval_loss": 0.46336060762405396,
"eval_mae": 0.07600608468055725,
"eval_rmse": 0.10175278037786484,
"eval_runtime": 26.595,
"eval_samples_per_second": 176.65,
"eval_steps_per_second": 2.782,
"learning_rate": 0.001,
"step": 221
},
{
"epoch": 2.0,
"eval_explained_variance": 0.6113448739051819,
"eval_kl_divergence": 0.0038063330575823784,
"eval_loss": 0.45933997631073,
"eval_mae": 0.07159148901700974,
"eval_rmse": 0.09520163387060165,
"eval_runtime": 25.5426,
"eval_samples_per_second": 183.928,
"eval_steps_per_second": 2.897,
"learning_rate": 0.001,
"step": 442
},
{
"epoch": 2.262443438914027,
"grad_norm": 0.16188210248947144,
"learning_rate": 0.001,
"loss": 0.5185,
"step": 500
},
{
"epoch": 3.0,
"eval_explained_variance": 0.6245184540748596,
"eval_kl_divergence": 0.05826142057776451,
"eval_loss": 0.457367479801178,
"eval_mae": 0.0670078918337822,
"eval_rmse": 0.0917908325791359,
"eval_runtime": 25.6126,
"eval_samples_per_second": 183.425,
"eval_steps_per_second": 2.889,
"learning_rate": 0.001,
"step": 663
},
{
"epoch": 4.0,
"eval_explained_variance": 0.6129782795906067,
"eval_kl_divergence": -0.06495417654514313,
"eval_loss": 0.459468811750412,
"eval_mae": 0.07134346663951874,
"eval_rmse": 0.09552835673093796,
"eval_runtime": 25.6003,
"eval_samples_per_second": 183.514,
"eval_steps_per_second": 2.891,
"learning_rate": 0.001,
"step": 884
},
{
"epoch": 4.524886877828054,
"grad_norm": 0.09988280385732651,
"learning_rate": 0.001,
"loss": 0.4806,
"step": 1000
},
{
"epoch": 5.0,
"eval_explained_variance": 0.6206489205360413,
"eval_kl_divergence": -0.08347146958112717,
"eval_loss": 0.45927393436431885,
"eval_mae": 0.07016489654779434,
"eval_rmse": 0.0953657403588295,
"eval_runtime": 25.74,
"eval_samples_per_second": 182.518,
"eval_steps_per_second": 2.875,
"learning_rate": 0.001,
"step": 1105
},
{
"epoch": 6.0,
"eval_explained_variance": 0.6041414737701416,
"eval_kl_divergence": -0.07046143710613251,
"eval_loss": 0.46080395579338074,
"eval_mae": 0.07277411222457886,
"eval_rmse": 0.09773259609937668,
"eval_runtime": 25.4681,
"eval_samples_per_second": 184.466,
"eval_steps_per_second": 2.906,
"learning_rate": 0.001,
"step": 1326
},
{
"epoch": 6.787330316742081,
"grad_norm": 0.08271574974060059,
"learning_rate": 0.001,
"loss": 0.4786,
"step": 1500
},
{
"epoch": 7.0,
"eval_explained_variance": 0.628325879573822,
"eval_kl_divergence": -0.004442690871655941,
"eval_loss": 0.4581476151943207,
"eval_mae": 0.06827609241008759,
"eval_rmse": 0.09274852275848389,
"eval_runtime": 26.0251,
"eval_samples_per_second": 180.518,
"eval_steps_per_second": 2.843,
"learning_rate": 0.001,
"step": 1547
},
{
"epoch": 8.0,
"eval_explained_variance": 0.6276748776435852,
"eval_kl_divergence": 0.07988782227039337,
"eval_loss": 0.4573117196559906,
"eval_mae": 0.06800529360771179,
"eval_rmse": 0.09162522107362747,
"eval_runtime": 25.7197,
"eval_samples_per_second": 182.662,
"eval_steps_per_second": 2.877,
"learning_rate": 0.001,
"step": 1768
},
{
"epoch": 9.0,
"eval_explained_variance": 0.6196129322052002,
"eval_kl_divergence": 0.02327939122915268,
"eval_loss": 0.45939013361930847,
"eval_mae": 0.07057134807109833,
"eval_rmse": 0.09471722692251205,
"eval_runtime": 25.8299,
"eval_samples_per_second": 181.883,
"eval_steps_per_second": 2.865,
"learning_rate": 0.001,
"step": 1989
},
{
"epoch": 9.049773755656108,
"grad_norm": 0.05649600923061371,
"learning_rate": 0.001,
"loss": 0.4776,
"step": 2000
},
{
"epoch": 10.0,
"eval_explained_variance": 0.6293186545372009,
"eval_kl_divergence": 0.0885055735707283,
"eval_loss": 0.45772281289100647,
"eval_mae": 0.06745484471321106,
"eval_rmse": 0.09179002046585083,
"eval_runtime": 25.5273,
"eval_samples_per_second": 184.039,
"eval_steps_per_second": 2.899,
"learning_rate": 0.001,
"step": 2210
},
{
"epoch": 11.0,
"eval_explained_variance": 0.6422439813613892,
"eval_kl_divergence": 0.1296330839395523,
"eval_loss": 0.45641985535621643,
"eval_mae": 0.06617596000432968,
"eval_rmse": 0.08975591510534286,
"eval_runtime": 25.7282,
"eval_samples_per_second": 182.601,
"eval_steps_per_second": 2.876,
"learning_rate": 0.001,
"step": 2431
},
{
"epoch": 11.312217194570136,
"grad_norm": 0.04163961857557297,
"learning_rate": 0.001,
"loss": 0.4772,
"step": 2500
},
{
"epoch": 12.0,
"eval_explained_variance": 0.6385617256164551,
"eval_kl_divergence": -0.006057058461010456,
"eval_loss": 0.45718902349472046,
"eval_mae": 0.06766870617866516,
"eval_rmse": 0.09130751341581345,
"eval_runtime": 25.6849,
"eval_samples_per_second": 182.909,
"eval_steps_per_second": 2.881,
"learning_rate": 0.001,
"step": 2652
},
{
"epoch": 13.0,
"eval_explained_variance": 0.6186209321022034,
"eval_kl_divergence": -0.20600058138370514,
"eval_loss": 0.4622880220413208,
"eval_mae": 0.07468675822019577,
"eval_rmse": 0.10024455189704895,
"eval_runtime": 25.9645,
"eval_samples_per_second": 180.939,
"eval_steps_per_second": 2.85,
"learning_rate": 0.001,
"step": 2873
},
{
"epoch": 13.574660633484163,
"grad_norm": 0.0532899908721447,
"learning_rate": 0.001,
"loss": 0.4769,
"step": 3000
},
{
"epoch": 14.0,
"eval_explained_variance": 0.6346250176429749,
"eval_kl_divergence": -0.0371401272714138,
"eval_loss": 0.45775285363197327,
"eval_mae": 0.06778896600008011,
"eval_rmse": 0.092497818171978,
"eval_runtime": 25.7017,
"eval_samples_per_second": 182.79,
"eval_steps_per_second": 2.879,
"learning_rate": 0.001,
"step": 3094
},
{
"epoch": 15.0,
"eval_explained_variance": 0.6340083479881287,
"eval_kl_divergence": 0.04575105383992195,
"eval_loss": 0.4575214684009552,
"eval_mae": 0.0666513592004776,
"eval_rmse": 0.0916559174656868,
"eval_runtime": 26.025,
"eval_samples_per_second": 180.519,
"eval_steps_per_second": 2.843,
"learning_rate": 0.001,
"step": 3315
},
{
"epoch": 15.83710407239819,
"grad_norm": 0.0473792664706707,
"learning_rate": 0.001,
"loss": 0.4766,
"step": 3500
},
{
"epoch": 16.0,
"eval_explained_variance": 0.6277230381965637,
"eval_kl_divergence": 0.01510859839618206,
"eval_loss": 0.4578736424446106,
"eval_mae": 0.06800080835819244,
"eval_rmse": 0.09264300018548965,
"eval_runtime": 25.6671,
"eval_samples_per_second": 183.036,
"eval_steps_per_second": 2.883,
"learning_rate": 0.001,
"step": 3536
},
{
"epoch": 17.0,
"eval_explained_variance": 0.6246375441551208,
"eval_kl_divergence": -0.06794208288192749,
"eval_loss": 0.4592094421386719,
"eval_mae": 0.07020581513643265,
"eval_rmse": 0.09485668689012527,
"eval_runtime": 25.9387,
"eval_samples_per_second": 181.119,
"eval_steps_per_second": 2.853,
"learning_rate": 0.001,
"step": 3757
},
{
"epoch": 18.0,
"eval_explained_variance": 0.6493042707443237,
"eval_kl_divergence": 0.04208216443657875,
"eval_loss": 0.45573291182518005,
"eval_mae": 0.06506813317537308,
"eval_rmse": 0.08873652666807175,
"eval_runtime": 25.6229,
"eval_samples_per_second": 183.352,
"eval_steps_per_second": 2.888,
"learning_rate": 0.0001,
"step": 3978
},
{
"epoch": 18.099547511312217,
"grad_norm": 0.048517756164073944,
"learning_rate": 0.0001,
"loss": 0.4758,
"step": 4000
},
{
"epoch": 19.0,
"eval_explained_variance": 0.6507542729377747,
"eval_kl_divergence": 0.04677804559469223,
"eval_loss": 0.4555513262748718,
"eval_mae": 0.06473750621080399,
"eval_rmse": 0.08847790211439133,
"eval_runtime": 25.7638,
"eval_samples_per_second": 182.349,
"eval_steps_per_second": 2.872,
"learning_rate": 0.0001,
"step": 4199
},
{
"epoch": 20.0,
"eval_explained_variance": 0.6518434882164001,
"eval_kl_divergence": 0.0404924675822258,
"eval_loss": 0.45553284883499146,
"eval_mae": 0.06476090103387833,
"eval_rmse": 0.08838176727294922,
"eval_runtime": 25.6331,
"eval_samples_per_second": 183.279,
"eval_steps_per_second": 2.887,
"learning_rate": 0.0001,
"step": 4420
},
{
"epoch": 20.361990950226243,
"grad_norm": 0.04679996892809868,
"learning_rate": 0.0001,
"loss": 0.4741,
"step": 4500
},
{
"epoch": 21.0,
"eval_explained_variance": 0.6532743573188782,
"eval_kl_divergence": 0.047539714723825455,
"eval_loss": 0.4555487334728241,
"eval_mae": 0.06497333198785782,
"eval_rmse": 0.08836204558610916,
"eval_runtime": 25.803,
"eval_samples_per_second": 182.072,
"eval_steps_per_second": 2.868,
"learning_rate": 0.0001,
"step": 4641
},
{
"epoch": 22.0,
"eval_explained_variance": 0.6534684300422668,
"eval_kl_divergence": 0.0570099912583828,
"eval_loss": 0.45551028847694397,
"eval_mae": 0.06458985060453415,
"eval_rmse": 0.08831282705068588,
"eval_runtime": 25.9625,
"eval_samples_per_second": 180.953,
"eval_steps_per_second": 2.85,
"learning_rate": 0.0001,
"step": 4862
},
{
"epoch": 22.624434389140273,
"grad_norm": 0.05471302196383476,
"learning_rate": 0.0001,
"loss": 0.4738,
"step": 5000
},
{
"epoch": 23.0,
"eval_explained_variance": 0.6569964289665222,
"eval_kl_divergence": 0.08867427706718445,
"eval_loss": 0.45505577325820923,
"eval_mae": 0.0640987753868103,
"eval_rmse": 0.08740502595901489,
"eval_runtime": 25.8915,
"eval_samples_per_second": 181.45,
"eval_steps_per_second": 2.858,
"learning_rate": 0.0001,
"step": 5083
},
{
"epoch": 24.0,
"eval_explained_variance": 0.6552526354789734,
"eval_kl_divergence": 0.055539198219776154,
"eval_loss": 0.4552234709262848,
"eval_mae": 0.06417837738990784,
"eval_rmse": 0.08780523389577866,
"eval_runtime": 27.2231,
"eval_samples_per_second": 172.574,
"eval_steps_per_second": 2.718,
"learning_rate": 0.0001,
"step": 5304
},
{
"epoch": 24.8868778280543,
"grad_norm": 0.0545237734913826,
"learning_rate": 0.0001,
"loss": 0.4736,
"step": 5500
},
{
"epoch": 25.0,
"eval_explained_variance": 0.6582456231117249,
"eval_kl_divergence": 0.023763582110404968,
"eval_loss": 0.45521080493927,
"eval_mae": 0.06447087973356247,
"eval_rmse": 0.08778873831033707,
"eval_runtime": 25.7982,
"eval_samples_per_second": 182.106,
"eval_steps_per_second": 2.868,
"learning_rate": 0.0001,
"step": 5525
},
{
"epoch": 26.0,
"eval_explained_variance": 0.6571853756904602,
"eval_kl_divergence": 0.040941931307315826,
"eval_loss": 0.4557025730609894,
"eval_mae": 0.06462270766496658,
"eval_rmse": 0.08846313506364822,
"eval_runtime": 25.5822,
"eval_samples_per_second": 183.643,
"eval_steps_per_second": 2.893,
"learning_rate": 0.0001,
"step": 5746
},
{
"epoch": 27.0,
"eval_explained_variance": 0.6576172709465027,
"eval_kl_divergence": 0.05476689711213112,
"eval_loss": 0.4550967216491699,
"eval_mae": 0.06391049176454544,
"eval_rmse": 0.08758416771888733,
"eval_runtime": 26.0908,
"eval_samples_per_second": 180.064,
"eval_steps_per_second": 2.836,
"learning_rate": 0.0001,
"step": 5967
},
{
"epoch": 27.149321266968325,
"grad_norm": 0.05160004645586014,
"learning_rate": 0.0001,
"loss": 0.4731,
"step": 6000
},
{
"epoch": 28.0,
"eval_explained_variance": 0.658767580986023,
"eval_kl_divergence": 0.027325255796313286,
"eval_loss": 0.45512688159942627,
"eval_mae": 0.0641704872250557,
"eval_rmse": 0.08764084428548813,
"eval_runtime": 25.6818,
"eval_samples_per_second": 182.931,
"eval_steps_per_second": 2.881,
"learning_rate": 0.0001,
"step": 6188
},
{
"epoch": 29.0,
"eval_explained_variance": 0.6617770195007324,
"eval_kl_divergence": 0.0744185745716095,
"eval_loss": 0.45477041602134705,
"eval_mae": 0.0634256973862648,
"eval_rmse": 0.08693012595176697,
"eval_runtime": 25.726,
"eval_samples_per_second": 182.617,
"eval_steps_per_second": 2.876,
"learning_rate": 0.0001,
"step": 6409
},
{
"epoch": 29.41176470588235,
"grad_norm": 0.07741276919841766,
"learning_rate": 0.0001,
"loss": 0.4727,
"step": 6500
},
{
"epoch": 30.0,
"eval_explained_variance": 0.6594749093055725,
"eval_kl_divergence": 0.049223385751247406,
"eval_loss": 0.4549327790737152,
"eval_mae": 0.06360659003257751,
"eval_rmse": 0.0873405933380127,
"eval_runtime": 25.4772,
"eval_samples_per_second": 184.4,
"eval_steps_per_second": 2.905,
"learning_rate": 0.0001,
"step": 6630
},
{
"epoch": 31.0,
"eval_explained_variance": 0.6613443493843079,
"eval_kl_divergence": 0.06878047436475754,
"eval_loss": 0.4547973871231079,
"eval_mae": 0.06322694569826126,
"eval_rmse": 0.08694975823163986,
"eval_runtime": 25.8257,
"eval_samples_per_second": 181.912,
"eval_steps_per_second": 2.865,
"learning_rate": 0.0001,
"step": 6851
},
{
"epoch": 31.67420814479638,
"grad_norm": 0.055884115397930145,
"learning_rate": 0.0001,
"loss": 0.4732,
"step": 7000
},
{
"epoch": 32.0,
"eval_explained_variance": 0.6602151393890381,
"eval_kl_divergence": 0.027085499837994576,
"eval_loss": 0.454988956451416,
"eval_mae": 0.063857302069664,
"eval_rmse": 0.08743549138307571,
"eval_runtime": 25.6292,
"eval_samples_per_second": 183.307,
"eval_steps_per_second": 2.887,
"learning_rate": 0.0001,
"step": 7072
},
{
"epoch": 33.0,
"eval_explained_variance": 0.6580324172973633,
"eval_kl_divergence": -0.017361771315336227,
"eval_loss": 0.455375999212265,
"eval_mae": 0.0646858736872673,
"eval_rmse": 0.08816961199045181,
"eval_runtime": 25.8246,
"eval_samples_per_second": 181.919,
"eval_steps_per_second": 2.865,
"learning_rate": 0.0001,
"step": 7293
},
{
"epoch": 33.93665158371041,
"grad_norm": 0.08047891408205032,
"learning_rate": 0.0001,
"loss": 0.4725,
"step": 7500
},
{
"epoch": 34.0,
"eval_explained_variance": 0.6616186499595642,
"eval_kl_divergence": 0.10939505696296692,
"eval_loss": 0.45461305975914,
"eval_mae": 0.0628495141863823,
"eval_rmse": 0.08664888888597488,
"eval_runtime": 25.7346,
"eval_samples_per_second": 182.556,
"eval_steps_per_second": 2.876,
"learning_rate": 0.0001,
"step": 7514
},
{
"epoch": 35.0,
"eval_explained_variance": 0.6582692265510559,
"eval_kl_divergence": 0.05707371234893799,
"eval_loss": 0.45498156547546387,
"eval_mae": 0.06386271119117737,
"eval_rmse": 0.08741921186447144,
"eval_runtime": 25.7857,
"eval_samples_per_second": 182.194,
"eval_steps_per_second": 2.87,
"learning_rate": 0.0001,
"step": 7735
},
{
"epoch": 36.0,
"eval_explained_variance": 0.6615896224975586,
"eval_kl_divergence": 0.14533284306526184,
"eval_loss": 0.4548388123512268,
"eval_mae": 0.0629100501537323,
"eval_rmse": 0.08686337620019913,
"eval_runtime": 29.7733,
"eval_samples_per_second": 157.793,
"eval_steps_per_second": 2.485,
"learning_rate": 0.0001,
"step": 7956
},
{
"epoch": 36.199095022624434,
"grad_norm": 0.07811417430639267,
"learning_rate": 0.0001,
"loss": 0.4727,
"step": 8000
},
{
"epoch": 37.0,
"eval_explained_variance": 0.6586756110191345,
"eval_kl_divergence": -0.015241213142871857,
"eval_loss": 0.45526784658432007,
"eval_mae": 0.06451455503702164,
"eval_rmse": 0.08806425333023071,
"eval_runtime": 25.6924,
"eval_samples_per_second": 182.855,
"eval_steps_per_second": 2.88,
"learning_rate": 0.0001,
"step": 8177
},
{
"epoch": 38.0,
"eval_explained_variance": 0.6612560153007507,
"eval_kl_divergence": 0.049000147730112076,
"eval_loss": 0.45479556918144226,
"eval_mae": 0.06361590325832367,
"eval_rmse": 0.08704841136932373,
"eval_runtime": 26.1103,
"eval_samples_per_second": 179.929,
"eval_steps_per_second": 2.834,
"learning_rate": 0.0001,
"step": 8398
},
{
"epoch": 38.46153846153846,
"grad_norm": 0.062047556042671204,
"learning_rate": 0.0001,
"loss": 0.4727,
"step": 8500
},
{
"epoch": 39.0,
"eval_explained_variance": 0.6610231995582581,
"eval_kl_divergence": 0.07255241274833679,
"eval_loss": 0.454780250787735,
"eval_mae": 0.06311424821615219,
"eval_rmse": 0.08698847889900208,
"eval_runtime": 25.6403,
"eval_samples_per_second": 183.227,
"eval_steps_per_second": 2.886,
"learning_rate": 0.0001,
"step": 8619
},
{
"epoch": 40.0,
"eval_explained_variance": 0.6605435013771057,
"eval_kl_divergence": 0.06372024863958359,
"eval_loss": 0.45476558804512024,
"eval_mae": 0.06323693692684174,
"eval_rmse": 0.08702895045280457,
"eval_runtime": 26.038,
"eval_samples_per_second": 180.429,
"eval_steps_per_second": 2.842,
"learning_rate": 0.0001,
"step": 8840
},
{
"epoch": 40.723981900452486,
"grad_norm": 0.08612842857837677,
"learning_rate": 1e-05,
"loss": 0.4721,
"step": 9000
},
{
"epoch": 41.0,
"eval_explained_variance": 0.6628013253211975,
"eval_kl_divergence": 0.039023660123348236,
"eval_loss": 0.45470812916755676,
"eval_mae": 0.0634213536977768,
"eval_rmse": 0.08692529052495956,
"eval_runtime": 25.9883,
"eval_samples_per_second": 180.774,
"eval_steps_per_second": 2.847,
"learning_rate": 1e-05,
"step": 9061
},
{
"epoch": 42.0,
"eval_explained_variance": 0.6656690239906311,
"eval_kl_divergence": 0.11149828135967255,
"eval_loss": 0.4543863534927368,
"eval_mae": 0.06281669437885284,
"eval_rmse": 0.08619723469018936,
"eval_runtime": 26.3115,
"eval_samples_per_second": 178.553,
"eval_steps_per_second": 2.812,
"learning_rate": 1e-05,
"step": 9282
},
{
"epoch": 42.98642533936652,
"grad_norm": 0.06828662008047104,
"learning_rate": 1e-05,
"loss": 0.4721,
"step": 9500
},
{
"epoch": 43.0,
"eval_explained_variance": 0.6645870804786682,
"eval_kl_divergence": 0.05330301821231842,
"eval_loss": 0.4545557498931885,
"eval_mae": 0.06320130825042725,
"eval_rmse": 0.0865868553519249,
"eval_runtime": 25.8985,
"eval_samples_per_second": 181.4,
"eval_steps_per_second": 2.857,
"learning_rate": 1e-05,
"step": 9503
},
{
"epoch": 44.0,
"eval_explained_variance": 0.6648023128509521,
"eval_kl_divergence": 0.13496889173984528,
"eval_loss": 0.45448434352874756,
"eval_mae": 0.06253467500209808,
"eval_rmse": 0.08635282516479492,
"eval_runtime": 26.0508,
"eval_samples_per_second": 180.34,
"eval_steps_per_second": 2.841,
"learning_rate": 1e-05,
"step": 9724
},
{
"epoch": 45.0,
"eval_explained_variance": 0.6624875068664551,
"eval_kl_divergence": 0.004431928042322397,
"eval_loss": 0.4550137519836426,
"eval_mae": 0.06418145447969437,
"eval_rmse": 0.0874209776520729,
"eval_runtime": 25.8495,
"eval_samples_per_second": 181.744,
"eval_steps_per_second": 2.863,
"learning_rate": 1e-05,
"step": 9945
},
{
"epoch": 45.248868778280546,
"grad_norm": 0.07514863461256027,
"learning_rate": 1e-05,
"loss": 0.4716,
"step": 10000
},
{
"epoch": 46.0,
"eval_explained_variance": 0.6642169952392578,
"eval_kl_divergence": 0.03887256979942322,
"eval_loss": 0.4545902609825134,
"eval_mae": 0.06316760927438736,
"eval_rmse": 0.08669499307870865,
"eval_runtime": 25.9222,
"eval_samples_per_second": 181.235,
"eval_steps_per_second": 2.855,
"learning_rate": 1e-05,
"step": 10166
},
{
"epoch": 47.0,
"eval_explained_variance": 0.6651113629341125,
"eval_kl_divergence": 0.037030890583992004,
"eval_loss": 0.4544997215270996,
"eval_mae": 0.06298934668302536,
"eval_rmse": 0.0865601971745491,
"eval_runtime": 25.9565,
"eval_samples_per_second": 180.995,
"eval_steps_per_second": 2.851,
"learning_rate": 1e-05,
"step": 10387
},
{
"epoch": 47.51131221719457,
"grad_norm": 0.057216282933950424,
"learning_rate": 1e-05,
"loss": 0.4722,
"step": 10500
},
{
"epoch": 48.0,
"eval_explained_variance": 0.6645199060440063,
"eval_kl_divergence": 0.019425788894295692,
"eval_loss": 0.4546374976634979,
"eval_mae": 0.06339576095342636,
"eval_rmse": 0.08680880069732666,
"eval_runtime": 25.7117,
"eval_samples_per_second": 182.718,
"eval_steps_per_second": 2.878,
"learning_rate": 1e-05,
"step": 10608
},
{
"epoch": 49.0,
"eval_explained_variance": 0.6666774153709412,
"eval_kl_divergence": 0.0667150691151619,
"eval_loss": 0.45436596870422363,
"eval_mae": 0.06269881874322891,
"eval_rmse": 0.08620164543390274,
"eval_runtime": 27.6905,
"eval_samples_per_second": 169.661,
"eval_steps_per_second": 2.672,
"learning_rate": 1.0000000000000002e-06,
"step": 10829
},
{
"epoch": 49.7737556561086,
"grad_norm": 0.07466714084148407,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4717,
"step": 11000
},
{
"epoch": 50.0,
"eval_explained_variance": 0.6650940179824829,
"eval_kl_divergence": 0.05483337119221687,
"eval_loss": 0.45450592041015625,
"eval_mae": 0.06310971826314926,
"eval_rmse": 0.08650273084640503,
"eval_runtime": 27.7128,
"eval_samples_per_second": 169.524,
"eval_steps_per_second": 2.67,
"learning_rate": 1.0000000000000002e-06,
"step": 11050
},
{
"epoch": 51.0,
"eval_explained_variance": 0.6651105284690857,
"eval_kl_divergence": 0.04277108237147331,
"eval_loss": 0.4544804096221924,
"eval_mae": 0.06292647123336792,
"eval_rmse": 0.08647629618644714,
"eval_runtime": 26.6553,
"eval_samples_per_second": 176.25,
"eval_steps_per_second": 2.776,
"learning_rate": 1.0000000000000002e-06,
"step": 11271
},
{
"epoch": 52.0,
"eval_explained_variance": 0.667234480381012,
"eval_kl_divergence": 0.12364839017391205,
"eval_loss": 0.45421910285949707,
"eval_mae": 0.06233237311244011,
"eval_rmse": 0.08589440584182739,
"eval_runtime": 25.8544,
"eval_samples_per_second": 181.71,
"eval_steps_per_second": 2.862,
"learning_rate": 1.0000000000000002e-06,
"step": 11492
},
{
"epoch": 52.036199095022624,
"grad_norm": 0.08442794531583786,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4718,
"step": 11500
},
{
"epoch": 53.0,
"eval_explained_variance": 0.6671742796897888,
"eval_kl_divergence": 0.08869530260562897,
"eval_loss": 0.4542272686958313,
"eval_mae": 0.06253313273191452,
"eval_rmse": 0.08594661206007004,
"eval_runtime": 25.9744,
"eval_samples_per_second": 180.871,
"eval_steps_per_second": 2.849,
"learning_rate": 1.0000000000000002e-06,
"step": 11713
},
{
"epoch": 54.0,
"eval_explained_variance": 0.6653165221214294,
"eval_kl_divergence": 0.09171402454376221,
"eval_loss": 0.4543103575706482,
"eval_mae": 0.0623968206346035,
"eval_rmse": 0.08615261316299438,
"eval_runtime": 26.0699,
"eval_samples_per_second": 180.208,
"eval_steps_per_second": 2.839,
"learning_rate": 1.0000000000000002e-06,
"step": 11934
},
{
"epoch": 54.29864253393665,
"grad_norm": 0.08775485306978226,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4716,
"step": 12000
},
{
"epoch": 55.0,
"eval_explained_variance": 0.6649713516235352,
"eval_kl_divergence": 0.07737051695585251,
"eval_loss": 0.45456644892692566,
"eval_mae": 0.06305743753910065,
"eval_rmse": 0.0865490511059761,
"eval_runtime": 26.0104,
"eval_samples_per_second": 180.62,
"eval_steps_per_second": 2.845,
"learning_rate": 1.0000000000000002e-06,
"step": 12155
},
{
"epoch": 56.0,
"eval_explained_variance": 0.6649186611175537,
"eval_kl_divergence": 0.04731013998389244,
"eval_loss": 0.45458319783210754,
"eval_mae": 0.06328658014535904,
"eval_rmse": 0.08663744479417801,
"eval_runtime": 25.8104,
"eval_samples_per_second": 182.019,
"eval_steps_per_second": 2.867,
"learning_rate": 1.0000000000000002e-06,
"step": 12376
},
{
"epoch": 56.56108597285068,
"grad_norm": 0.0692247599363327,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.4717,
"step": 12500
},
{
"epoch": 57.0,
"eval_explained_variance": 0.6657507419586182,
"eval_kl_divergence": -0.004581684246659279,
"eval_loss": 0.4548773169517517,
"eval_mae": 0.0639243796467781,
"eval_rmse": 0.0871059000492096,
"eval_runtime": 25.4962,
"eval_samples_per_second": 184.262,
"eval_steps_per_second": 2.902,
"learning_rate": 1.0000000000000002e-06,
"step": 12597
},
{
"epoch": 58.0,
"eval_explained_variance": 0.6655800342559814,
"eval_kl_divergence": 0.0553017221391201,
"eval_loss": 0.45440155267715454,
"eval_mae": 0.06271661818027496,
"eval_rmse": 0.08635643124580383,
"eval_runtime": 26.1057,
"eval_samples_per_second": 179.961,
"eval_steps_per_second": 2.835,
"learning_rate": 1.0000000000000002e-06,
"step": 12818
},
{
"epoch": 58.8235294117647,
"grad_norm": 0.07922232896089554,
"learning_rate": 1.0000000000000002e-07,
"loss": 0.4716,
"step": 13000
},
{
"epoch": 59.0,
"eval_explained_variance": 0.6654148101806641,
"eval_kl_divergence": 0.03675610199570656,
"eval_loss": 0.45448538661003113,
"eval_mae": 0.06308572739362717,
"eval_rmse": 0.08650225400924683,
"eval_runtime": 25.8122,
"eval_samples_per_second": 182.007,
"eval_steps_per_second": 2.867,
"learning_rate": 1.0000000000000002e-07,
"step": 13039
},
{
"epoch": 60.0,
"eval_explained_variance": 0.6660366058349609,
"eval_kl_divergence": 0.047148581594228745,
"eval_loss": 0.4544091522693634,
"eval_mae": 0.06294982880353928,
"eval_rmse": 0.08633282780647278,
"eval_runtime": 26.4937,
"eval_samples_per_second": 177.325,
"eval_steps_per_second": 2.793,
"learning_rate": 1.0000000000000002e-07,
"step": 13260
},
{
"epoch": 61.0,
"eval_explained_variance": 0.6669723987579346,
"eval_kl_divergence": 0.09280110895633698,
"eval_loss": 0.4542348086833954,
"eval_mae": 0.062441930174827576,
"eval_rmse": 0.08595842123031616,
"eval_runtime": 26.0483,
"eval_samples_per_second": 180.357,
"eval_steps_per_second": 2.841,
"learning_rate": 1.0000000000000002e-07,
"step": 13481
},
{
"epoch": 61.085972850678736,
"grad_norm": 0.07845129072666168,
"learning_rate": 1.0000000000000002e-07,
"loss": 0.4718,
"step": 13500
},
{
"epoch": 62.0,
"eval_explained_variance": 0.6661055088043213,
"eval_kl_divergence": 0.028626998886466026,
"eval_loss": 0.4545469284057617,
"eval_mae": 0.06315190345048904,
"eval_rmse": 0.0865735188126564,
"eval_runtime": 25.8503,
"eval_samples_per_second": 181.739,
"eval_steps_per_second": 2.863,
"learning_rate": 1.0000000000000002e-07,
"step": 13702
},
{
"epoch": 62.0,
"learning_rate": 1.0000000000000002e-07,
"step": 13702,
"total_flos": 9.42369297866869e+19,
"train_loss": 0.4754439868851833,
"train_runtime": 8961.4221,
"train_samples_per_second": 235.894,
"train_steps_per_second": 3.699
}
],
"logging_steps": 500,
"max_steps": 33150,
"num_input_tokens_seen": 0,
"num_train_epochs": 150,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 10,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.42369297866869e+19,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}