Training in progress, step 75, checkpoint (commit 26f325a)
{
"best_metric": 0.04871680587530136,
"best_model_checkpoint": "miner_id_24/checkpoint-75",
"epoch": 0.026428516369162325,
"eval_steps": 25,
"global_step": 75,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00035238021825549767,
"grad_norm": 0.18368496000766754,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.07,
"step": 1
},
{
"epoch": 0.00035238021825549767,
"eval_loss": 0.3029957115650177,
"eval_runtime": 5.7325,
"eval_samples_per_second": 8.722,
"eval_steps_per_second": 2.268,
"step": 1
},
{
"epoch": 0.0007047604365109953,
"grad_norm": 0.40931159257888794,
"learning_rate": 6.666666666666667e-05,
"loss": 0.0927,
"step": 2
},
{
"epoch": 0.001057140654766493,
"grad_norm": 0.46541017293930054,
"learning_rate": 0.0001,
"loss": 0.101,
"step": 3
},
{
"epoch": 0.0014095208730219907,
"grad_norm": 0.3590395152568817,
"learning_rate": 9.99571699711836e-05,
"loss": 0.0586,
"step": 4
},
{
"epoch": 0.0017619010912774884,
"grad_norm": 0.9103890061378479,
"learning_rate": 9.982876141412856e-05,
"loss": 0.063,
"step": 5
},
{
"epoch": 0.002114281309532986,
"grad_norm": 0.22858065366744995,
"learning_rate": 9.961501876182148e-05,
"loss": 0.0388,
"step": 6
},
{
"epoch": 0.0024666615277884837,
"grad_norm": 0.5428525805473328,
"learning_rate": 9.931634888554937e-05,
"loss": 0.0452,
"step": 7
},
{
"epoch": 0.0028190417460439814,
"grad_norm": 0.1322905421257019,
"learning_rate": 9.893332032039701e-05,
"loss": 0.0359,
"step": 8
},
{
"epoch": 0.003171421964299479,
"grad_norm": 0.2533002495765686,
"learning_rate": 9.846666218300807e-05,
"loss": 0.0429,
"step": 9
},
{
"epoch": 0.0035238021825549767,
"grad_norm": 0.15599848330020905,
"learning_rate": 9.791726278367022e-05,
"loss": 0.0426,
"step": 10
},
{
"epoch": 0.0038761824008104744,
"grad_norm": 0.13487255573272705,
"learning_rate": 9.728616793536588e-05,
"loss": 0.0465,
"step": 11
},
{
"epoch": 0.004228562619065972,
"grad_norm": 0.14608468115329742,
"learning_rate": 9.657457896300791e-05,
"loss": 0.0574,
"step": 12
},
{
"epoch": 0.00458094283732147,
"grad_norm": 0.37514936923980713,
"learning_rate": 9.578385041664925e-05,
"loss": 0.0393,
"step": 13
},
{
"epoch": 0.004933323055576967,
"grad_norm": 0.09332629293203354,
"learning_rate": 9.491548749301997e-05,
"loss": 0.0262,
"step": 14
},
{
"epoch": 0.0052857032738324655,
"grad_norm": 0.04472605511546135,
"learning_rate": 9.397114317029975e-05,
"loss": 0.0165,
"step": 15
},
{
"epoch": 0.005638083492087963,
"grad_norm": 0.04193398728966713,
"learning_rate": 9.295261506157986e-05,
"loss": 0.0166,
"step": 16
},
{
"epoch": 0.005990463710343461,
"grad_norm": 0.06309419125318527,
"learning_rate": 9.186184199300464e-05,
"loss": 0.0194,
"step": 17
},
{
"epoch": 0.006342843928598958,
"grad_norm": 0.049311138689517975,
"learning_rate": 9.070090031310558e-05,
"loss": 0.022,
"step": 18
},
{
"epoch": 0.006695224146854456,
"grad_norm": 0.033052124083042145,
"learning_rate": 8.947199994035401e-05,
"loss": 0.0277,
"step": 19
},
{
"epoch": 0.007047604365109953,
"grad_norm": 0.047948189079761505,
"learning_rate": 8.817748015645558e-05,
"loss": 0.0308,
"step": 20
},
{
"epoch": 0.0073999845833654515,
"grad_norm": 0.09758961200714111,
"learning_rate": 8.681980515339464e-05,
"loss": 0.0334,
"step": 21
},
{
"epoch": 0.007752364801620949,
"grad_norm": 0.08111658692359924,
"learning_rate": 8.540155934270471e-05,
"loss": 0.0375,
"step": 22
},
{
"epoch": 0.008104745019876446,
"grad_norm": 0.10492856800556183,
"learning_rate": 8.392544243589427e-05,
"loss": 0.0447,
"step": 23
},
{
"epoch": 0.008457125238131944,
"grad_norm": 0.27334481477737427,
"learning_rate": 8.239426430539243e-05,
"loss": 0.0558,
"step": 24
},
{
"epoch": 0.008809505456387442,
"grad_norm": 0.09733742475509644,
"learning_rate": 8.081093963579707e-05,
"loss": 0.0646,
"step": 25
},
{
"epoch": 0.008809505456387442,
"eval_loss": 0.05434239283204079,
"eval_runtime": 5.835,
"eval_samples_per_second": 8.569,
"eval_steps_per_second": 2.228,
"step": 25
},
{
"epoch": 0.00916188567464294,
"grad_norm": 0.1138065904378891,
"learning_rate": 7.917848237560709e-05,
"loss": 0.0288,
"step": 26
},
{
"epoch": 0.009514265892898437,
"grad_norm": 0.13079240918159485,
"learning_rate": 7.75e-05,
"loss": 0.0152,
"step": 27
},
{
"epoch": 0.009866646111153935,
"grad_norm": 0.16374854743480682,
"learning_rate": 7.577868759557654e-05,
"loss": 0.0212,
"step": 28
},
{
"epoch": 0.010219026329409433,
"grad_norm": 0.09367111325263977,
"learning_rate": 7.401782177833148e-05,
"loss": 0.0222,
"step": 29
},
{
"epoch": 0.010571406547664931,
"grad_norm": 0.027064498513936996,
"learning_rate": 7.222075445642904e-05,
"loss": 0.0222,
"step": 30
},
{
"epoch": 0.010923786765920427,
"grad_norm": 0.12557287514209747,
"learning_rate": 7.03909064496551e-05,
"loss": 0.0284,
"step": 31
},
{
"epoch": 0.011276166984175925,
"grad_norm": 0.14048165082931519,
"learning_rate": 6.853176097769229e-05,
"loss": 0.0319,
"step": 32
},
{
"epoch": 0.011628547202431424,
"grad_norm": 0.1092550978064537,
"learning_rate": 6.664685702961344e-05,
"loss": 0.0327,
"step": 33
},
{
"epoch": 0.011980927420686922,
"grad_norm": 0.05221077799797058,
"learning_rate": 6.473978262721463e-05,
"loss": 0.037,
"step": 34
},
{
"epoch": 0.012333307638942418,
"grad_norm": 0.04570182412862778,
"learning_rate": 6.281416799501188e-05,
"loss": 0.0403,
"step": 35
},
{
"epoch": 0.012685687857197916,
"grad_norm": 0.12411309778690338,
"learning_rate": 6.087367864990233e-05,
"loss": 0.0467,
"step": 36
},
{
"epoch": 0.013038068075453414,
"grad_norm": 0.09028882533311844,
"learning_rate": 5.8922008423644624e-05,
"loss": 0.0558,
"step": 37
},
{
"epoch": 0.013390448293708912,
"grad_norm": 0.20290955901145935,
"learning_rate": 5.696287243144013e-05,
"loss": 0.0216,
"step": 38
},
{
"epoch": 0.01374282851196441,
"grad_norm": 0.040300991386175156,
"learning_rate": 5.500000000000001e-05,
"loss": 0.0202,
"step": 39
},
{
"epoch": 0.014095208730219907,
"grad_norm": 0.014306614175438881,
"learning_rate": 5.303712756855988e-05,
"loss": 0.0137,
"step": 40
},
{
"epoch": 0.014447588948475405,
"grad_norm": 0.02834320440888405,
"learning_rate": 5.107799157635538e-05,
"loss": 0.0183,
"step": 41
},
{
"epoch": 0.014799969166730903,
"grad_norm": 0.01949421875178814,
"learning_rate": 4.912632135009769e-05,
"loss": 0.0196,
"step": 42
},
{
"epoch": 0.015152349384986401,
"grad_norm": 0.03610558062791824,
"learning_rate": 4.718583200498814e-05,
"loss": 0.024,
"step": 43
},
{
"epoch": 0.015504729603241897,
"grad_norm": 0.039888154715299606,
"learning_rate": 4.526021737278538e-05,
"loss": 0.0262,
"step": 44
},
{
"epoch": 0.015857109821497396,
"grad_norm": 0.0371965691447258,
"learning_rate": 4.3353142970386564e-05,
"loss": 0.0282,
"step": 45
},
{
"epoch": 0.016209490039752892,
"grad_norm": 0.038961343467235565,
"learning_rate": 4.146823902230772e-05,
"loss": 0.0355,
"step": 46
},
{
"epoch": 0.016561870258008392,
"grad_norm": 0.045494209975004196,
"learning_rate": 3.960909355034491e-05,
"loss": 0.0383,
"step": 47
},
{
"epoch": 0.016914250476263888,
"grad_norm": 0.05365985259413719,
"learning_rate": 3.777924554357096e-05,
"loss": 0.044,
"step": 48
},
{
"epoch": 0.017266630694519388,
"grad_norm": 0.0467500165104866,
"learning_rate": 3.598217822166854e-05,
"loss": 0.0511,
"step": 49
},
{
"epoch": 0.017619010912774884,
"grad_norm": 0.07677899301052094,
"learning_rate": 3.422131240442349e-05,
"loss": 0.0697,
"step": 50
},
{
"epoch": 0.017619010912774884,
"eval_loss": 0.05016423761844635,
"eval_runtime": 5.5589,
"eval_samples_per_second": 8.995,
"eval_steps_per_second": 2.339,
"step": 50
},
{
"epoch": 0.01797139113103038,
"grad_norm": 0.05385514348745346,
"learning_rate": 3.250000000000001e-05,
"loss": 0.0233,
"step": 51
},
{
"epoch": 0.01832377134928588,
"grad_norm": 0.05782073363661766,
"learning_rate": 3.082151762439293e-05,
"loss": 0.0122,
"step": 52
},
{
"epoch": 0.018676151567541377,
"grad_norm": 0.0645056962966919,
"learning_rate": 2.9189060364202943e-05,
"loss": 0.0168,
"step": 53
},
{
"epoch": 0.019028531785796873,
"grad_norm": 0.07855939865112305,
"learning_rate": 2.760573569460757e-05,
"loss": 0.0207,
"step": 54
},
{
"epoch": 0.019380912004052373,
"grad_norm": 0.07800403237342834,
"learning_rate": 2.6074557564105727e-05,
"loss": 0.0227,
"step": 55
},
{
"epoch": 0.01973329222230787,
"grad_norm": 0.060954831540584564,
"learning_rate": 2.459844065729529e-05,
"loss": 0.0237,
"step": 56
},
{
"epoch": 0.02008567244056337,
"grad_norm": 0.04431851580739021,
"learning_rate": 2.3180194846605367e-05,
"loss": 0.0264,
"step": 57
},
{
"epoch": 0.020438052658818866,
"grad_norm": 0.033269140869379044,
"learning_rate": 2.1822519843544424e-05,
"loss": 0.0316,
"step": 58
},
{
"epoch": 0.020790432877074362,
"grad_norm": 0.01977214589715004,
"learning_rate": 2.0528000059645997e-05,
"loss": 0.0318,
"step": 59
},
{
"epoch": 0.021142813095329862,
"grad_norm": 0.03904677554965019,
"learning_rate": 1.9299099686894423e-05,
"loss": 0.0402,
"step": 60
},
{
"epoch": 0.02149519331358536,
"grad_norm": 0.10086186975240707,
"learning_rate": 1.8138158006995364e-05,
"loss": 0.0493,
"step": 61
},
{
"epoch": 0.021847573531840855,
"grad_norm": 0.1489732712507248,
"learning_rate": 1.7047384938420154e-05,
"loss": 0.0598,
"step": 62
},
{
"epoch": 0.022199953750096355,
"grad_norm": 0.06647814810276031,
"learning_rate": 1.602885682970026e-05,
"loss": 0.0266,
"step": 63
},
{
"epoch": 0.02255233396835185,
"grad_norm": 0.033716265112161636,
"learning_rate": 1.5084512506980026e-05,
"loss": 0.0177,
"step": 64
},
{
"epoch": 0.02290471418660735,
"grad_norm": 0.01775234565138817,
"learning_rate": 1.4216149583350754e-05,
"loss": 0.0125,
"step": 65
},
{
"epoch": 0.023257094404862847,
"grad_norm": 0.013323824852705002,
"learning_rate": 1.3425421036992098e-05,
"loss": 0.0178,
"step": 66
},
{
"epoch": 0.023609474623118343,
"grad_norm": 0.02086944319307804,
"learning_rate": 1.2713832064634126e-05,
"loss": 0.0187,
"step": 67
},
{
"epoch": 0.023961854841373843,
"grad_norm": 0.019485145807266235,
"learning_rate": 1.2082737216329794e-05,
"loss": 0.0246,
"step": 68
},
{
"epoch": 0.02431423505962934,
"grad_norm": 0.00897766649723053,
"learning_rate": 1.1533337816991932e-05,
"loss": 0.0277,
"step": 69
},
{
"epoch": 0.024666615277884836,
"grad_norm": 0.014582326635718346,
"learning_rate": 1.1066679679603e-05,
"loss": 0.0299,
"step": 70
},
{
"epoch": 0.025018995496140336,
"grad_norm": 0.029548194259405136,
"learning_rate": 1.0683651114450641e-05,
"loss": 0.0331,
"step": 71
},
{
"epoch": 0.025371375714395832,
"grad_norm": 0.03517298772931099,
"learning_rate": 1.0384981238178534e-05,
"loss": 0.0361,
"step": 72
},
{
"epoch": 0.025723755932651332,
"grad_norm": 0.026599425822496414,
"learning_rate": 1.017123858587145e-05,
"loss": 0.0432,
"step": 73
},
{
"epoch": 0.02607613615090683,
"grad_norm": 0.022014496847987175,
"learning_rate": 1.00428300288164e-05,
"loss": 0.0471,
"step": 74
},
{
"epoch": 0.026428516369162325,
"grad_norm": 0.07925083488225937,
"learning_rate": 1e-05,
"loss": 0.0657,
"step": 75
},
{
"epoch": 0.026428516369162325,
"eval_loss": 0.04871680587530136,
"eval_runtime": 5.8192,
"eval_samples_per_second": 8.592,
"eval_steps_per_second": 2.234,
"step": 75
}
],
"logging_steps": 1,
"max_steps": 75,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 25,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.559191436625183e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
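
The JSON above is the standard `trainer_state.json` that the Hugging Face `transformers` Trainer writes alongside each checkpoint. A minimal sketch of reading the loss curves back out of it; the path below is inferred from the `best_model_checkpoint` field and the usual checkpoint layout, so it may differ in your setup:

```python
import json

# Path inferred from "best_model_checkpoint" above plus the conventional file name;
# adjust to wherever the checkpoint actually lives.
STATE_PATH = "miner_id_24/checkpoint-75/trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Each log_history entry is either a training log (has "loss") or an eval log (has "eval_loss").
train_logs = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_logs = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"best eval_loss: {state['best_metric']:.5f} at {state['best_model_checkpoint']}")
print("train loss by step:", train_logs[:5], "...")
print("eval loss by step: ", eval_logs)
```

With this state file, the snippet reports the best eval_loss of 0.04872 at checkpoint-75 and the three eval points logged at steps 25, 50, and 75.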