{
"best_metric": 0.6968957781791687,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 3.005658852061439,
"eval_steps": 25,
"global_step": 116,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.025869037995149554,
"grad_norm": 348.736083984375,
"learning_rate": 2.9999999999999997e-05,
"loss": 274.9102,
"step": 1
},
{
"epoch": 0.025869037995149554,
"eval_loss": 8.283905982971191,
"eval_runtime": 0.6332,
"eval_samples_per_second": 78.958,
"eval_steps_per_second": 3.158,
"step": 1
},
{
"epoch": 0.05173807599029911,
"grad_norm": 348.5337829589844,
"learning_rate": 5.9999999999999995e-05,
"loss": 269.3359,
"step": 2
},
{
"epoch": 0.07760711398544867,
"grad_norm": 336.2130126953125,
"learning_rate": 8.999999999999999e-05,
"loss": 265.1719,
"step": 3
},
{
"epoch": 0.10347615198059822,
"grad_norm": 333.7207946777344,
"learning_rate": 0.00011999999999999999,
"loss": 249.0039,
"step": 4
},
{
"epoch": 0.1293451899757478,
"grad_norm": 298.2082824707031,
"learning_rate": 0.00015,
"loss": 206.7969,
"step": 5
},
{
"epoch": 0.15521422797089734,
"grad_norm": 250.3636474609375,
"learning_rate": 0.00017999999999999998,
"loss": 161.4229,
"step": 6
},
{
"epoch": 0.18108326596604687,
"grad_norm": 226.34564208984375,
"learning_rate": 0.00020999999999999998,
"loss": 120.4238,
"step": 7
},
{
"epoch": 0.20695230396119643,
"grad_norm": 308.28143310546875,
"learning_rate": 0.00023999999999999998,
"loss": 84.0972,
"step": 8
},
{
"epoch": 0.232821341956346,
"grad_norm": 549.4783325195312,
"learning_rate": 0.00027,
"loss": 52.3253,
"step": 9
},
{
"epoch": 0.2586903799514956,
"grad_norm": 572.8822021484375,
"learning_rate": 0.0003,
"loss": 46.6969,
"step": 10
},
{
"epoch": 0.28455941794664513,
"grad_norm": 67.11377716064453,
"learning_rate": 0.00029993412547631913,
"loss": 24.6701,
"step": 11
},
{
"epoch": 0.3104284559417947,
"grad_norm": 113.78333282470703,
"learning_rate": 0.0002997365597646482,
"loss": 26.2216,
"step": 12
},
{
"epoch": 0.33629749393694425,
"grad_norm": 129.92559814453125,
"learning_rate": 0.0002994074763922825,
"loss": 24.956,
"step": 13
},
{
"epoch": 0.36216653193209375,
"grad_norm": 104.27074432373047,
"learning_rate": 0.0002989471644020275,
"loss": 24.2584,
"step": 14
},
{
"epoch": 0.3880355699272433,
"grad_norm": 119.985595703125,
"learning_rate": 0.00029835602809832456,
"loss": 25.2181,
"step": 15
},
{
"epoch": 0.41390460792239286,
"grad_norm": 48.832435607910156,
"learning_rate": 0.0002976345866921395,
"loss": 22.767,
"step": 16
},
{
"epoch": 0.4397736459175424,
"grad_norm": 81.3912353515625,
"learning_rate": 0.0002967834738449256,
"loss": 23.8924,
"step": 17
},
{
"epoch": 0.465642683912692,
"grad_norm": 54.908119201660156,
"learning_rate": 0.0002958034371120616,
"loss": 24.4696,
"step": 18
},
{
"epoch": 0.49151172190784154,
"grad_norm": 51.236083984375,
"learning_rate": 0.00029469533728625376,
"loss": 23.9197,
"step": 19
},
{
"epoch": 0.5173807599029911,
"grad_norm": 47.18868637084961,
"learning_rate": 0.00029346014764147836,
"loss": 23.3075,
"step": 20
},
{
"epoch": 0.5432497978981407,
"grad_norm": 19.380447387695312,
"learning_rate": 0.0002920989530781287,
"loss": 22.372,
"step": 21
},
{
"epoch": 0.5691188358932903,
"grad_norm": 51.02498245239258,
"learning_rate": 0.00029061294917011814,
"loss": 23.7828,
"step": 22
},
{
"epoch": 0.5949878738884398,
"grad_norm": 61.17385482788086,
"learning_rate": 0.000289003441114775,
"loss": 25.444,
"step": 23
},
{
"epoch": 0.6208569118835894,
"grad_norm": 34.5933723449707,
"learning_rate": 0.0002872718425864527,
"loss": 23.0492,
"step": 24
},
{
"epoch": 0.6467259498787389,
"grad_norm": 19.800092697143555,
"learning_rate": 0.0002854196744948615,
"loss": 22.5537,
"step": 25
},
{
"epoch": 0.6467259498787389,
"eval_loss": 0.741091787815094,
"eval_runtime": 0.6281,
"eval_samples_per_second": 79.606,
"eval_steps_per_second": 3.184,
"step": 25
},
{
"epoch": 0.6725949878738885,
"grad_norm": 28.665796279907227,
"learning_rate": 0.0002834485636492121,
"loss": 23.0459,
"step": 26
},
{
"epoch": 0.698464025869038,
"grad_norm": 54.957481384277344,
"learning_rate": 0.0002813602413293455,
"loss": 25.5733,
"step": 27
},
{
"epoch": 0.7243330638641875,
"grad_norm": 28.139915466308594,
"learning_rate": 0.0002791565417651033,
"loss": 23.6223,
"step": 28
},
{
"epoch": 0.7502021018593371,
"grad_norm": 29.63477897644043,
"learning_rate": 0.0002768394005252739,
"loss": 23.6755,
"step": 29
},
{
"epoch": 0.7760711398544866,
"grad_norm": 24.49903678894043,
"learning_rate": 0.00027441085281753024,
"loss": 22.999,
"step": 30
},
{
"epoch": 0.8019401778496362,
"grad_norm": 16.341554641723633,
"learning_rate": 0.0002718730317008522,
"loss": 22.6584,
"step": 31
},
{
"epoch": 0.8278092158447857,
"grad_norm": 19.589542388916016,
"learning_rate": 0.000269228166212003,
"loss": 23.1001,
"step": 32
},
{
"epoch": 0.8536782538399353,
"grad_norm": 20.623701095581055,
"learning_rate": 0.00026647857940770634,
"loss": 23.1374,
"step": 33
},
{
"epoch": 0.8795472918350848,
"grad_norm": 24.688899993896484,
"learning_rate": 0.000263626686324243,
"loss": 23.4097,
"step": 34
},
{
"epoch": 0.9054163298302345,
"grad_norm": 22.096830368041992,
"learning_rate": 0.0002606749918562591,
"loss": 23.1316,
"step": 35
},
{
"epoch": 0.931285367825384,
"grad_norm": 27.559904098510742,
"learning_rate": 0.00025762608855664965,
"loss": 23.2516,
"step": 36
},
{
"epoch": 0.9571544058205336,
"grad_norm": 15.820242881774902,
"learning_rate": 0.00025448265435944954,
"loss": 22.5069,
"step": 37
},
{
"epoch": 0.9830234438156831,
"grad_norm": 18.462657928466797,
"learning_rate": 0.0002512474502277316,
"loss": 22.9305,
"step": 38
},
{
"epoch": 1.0105092966855296,
"grad_norm": 16.62592887878418,
"learning_rate": 0.0002479233177285782,
"loss": 22.217,
"step": 39
},
{
"epoch": 1.036378334680679,
"grad_norm": 19.09794044494629,
"learning_rate": 0.0002445131765372567,
"loss": 22.9307,
"step": 40
},
{
"epoch": 1.0622473726758286,
"grad_norm": 17.65207290649414,
"learning_rate": 0.000241020021872789,
"loss": 22.5269,
"step": 41
},
{
"epoch": 1.0881164106709782,
"grad_norm": 23.124086380004883,
"learning_rate": 0.00023744692186717078,
"loss": 22.7486,
"step": 42
},
{
"epoch": 1.1139854486661278,
"grad_norm": 3.4357526302337646,
"learning_rate": 0.00023379701487054785,
"loss": 22.0516,
"step": 43
},
{
"epoch": 1.1398544866612772,
"grad_norm": 30.857542037963867,
"learning_rate": 0.00023007350669471862,
"loss": 23.7532,
"step": 44
},
{
"epoch": 1.1657235246564268,
"grad_norm": 40.20075988769531,
"learning_rate": 0.00022627966779738306,
"loss": 24.4793,
"step": 45
},
{
"epoch": 1.1915925626515764,
"grad_norm": 24.608131408691406,
"learning_rate": 0.00022241883040961173,
"loss": 22.423,
"step": 46
},
{
"epoch": 1.217461600646726,
"grad_norm": 11.946351051330566,
"learning_rate": 0.00021849438560905693,
"loss": 22.8885,
"step": 47
},
{
"epoch": 1.2433306386418754,
"grad_norm": 11.997475624084473,
"learning_rate": 0.00021450978034147806,
"loss": 22.3267,
"step": 48
},
{
"epoch": 1.269199676637025,
"grad_norm": 24.218584060668945,
"learning_rate": 0.00021046851439319585,
"loss": 22.9163,
"step": 49
},
{
"epoch": 1.2950687146321747,
"grad_norm": 21.68750762939453,
"learning_rate": 0.0002063741373171357,
"loss": 22.702,
"step": 50
},
{
"epoch": 1.2950687146321747,
"eval_loss": 0.7224195599555969,
"eval_runtime": 0.6276,
"eval_samples_per_second": 79.667,
"eval_steps_per_second": 3.187,
"step": 50
},
{
"epoch": 1.3209377526273243,
"grad_norm": 15.50403118133545,
"learning_rate": 0.0002022302453151598,
"loss": 22.3089,
"step": 51
},
{
"epoch": 1.3468067906224737,
"grad_norm": 24.226417541503906,
"learning_rate": 0.0001980404780794256,
"loss": 23.1566,
"step": 52
},
{
"epoch": 1.3726758286176233,
"grad_norm": 20.176647186279297,
"learning_rate": 0.00019380851559554636,
"loss": 22.929,
"step": 53
},
{
"epoch": 1.3985448666127729,
"grad_norm": 15.459396362304688,
"learning_rate": 0.00018953807491036011,
"loss": 22.6978,
"step": 54
},
{
"epoch": 1.4244139046079223,
"grad_norm": 14.14704418182373,
"learning_rate": 0.00018523290686714756,
"loss": 22.7141,
"step": 55
},
{
"epoch": 1.450282942603072,
"grad_norm": 7.990035533905029,
"learning_rate": 0.00018089679281116472,
"loss": 23.1633,
"step": 56
},
{
"epoch": 1.4761519805982215,
"grad_norm": 3.211017608642578,
"learning_rate": 0.00017653354126838592,
"loss": 22.3353,
"step": 57
},
{
"epoch": 1.502021018593371,
"grad_norm": 18.740083694458008,
"learning_rate": 0.00017214698460037218,
"loss": 23.5309,
"step": 58
},
{
"epoch": 1.5278900565885207,
"grad_norm": 16.11014175415039,
"learning_rate": 0.00016774097563820485,
"loss": 22.8019,
"step": 59
},
{
"epoch": 1.5537590945836701,
"grad_norm": 26.232606887817383,
"learning_rate": 0.00016331938429844022,
"loss": 23.5109,
"step": 60
},
{
"epoch": 1.5796281325788197,
"grad_norm": 16.256412506103516,
"learning_rate": 0.00015888609418405713,
"loss": 22.8009,
"step": 61
},
{
"epoch": 1.6054971705739693,
"grad_norm": 11.629958152770996,
"learning_rate": 0.00015444499917338395,
"loss": 22.3203,
"step": 62
},
{
"epoch": 1.6313662085691187,
"grad_norm": 11.147138595581055,
"learning_rate": 0.00015,
"loss": 22.4757,
"step": 63
},
{
"epoch": 1.6572352465642683,
"grad_norm": 5.99025297164917,
"learning_rate": 0.00014555500082661602,
"loss": 22.2444,
"step": 64
},
{
"epoch": 1.683104284559418,
"grad_norm": 11.468669891357422,
"learning_rate": 0.00014111390581594284,
"loss": 22.2462,
"step": 65
},
{
"epoch": 1.7089733225545674,
"grad_norm": 14.979022979736328,
"learning_rate": 0.00013668061570155978,
"loss": 21.7589,
"step": 66
},
{
"epoch": 1.7348423605497172,
"grad_norm": 12.94080924987793,
"learning_rate": 0.00013225902436179513,
"loss": 22.4269,
"step": 67
},
{
"epoch": 1.7607113985448666,
"grad_norm": 11.411182403564453,
"learning_rate": 0.00012785301539962782,
"loss": 21.7354,
"step": 68
},
{
"epoch": 1.7865804365400162,
"grad_norm": 27.090801239013672,
"learning_rate": 0.00012346645873161408,
"loss": 23.5318,
"step": 69
},
{
"epoch": 1.8124494745351658,
"grad_norm": 17.46219825744629,
"learning_rate": 0.00011910320718883525,
"loss": 22.8003,
"step": 70
},
{
"epoch": 1.8383185125303152,
"grad_norm": 17.276792526245117,
"learning_rate": 0.00011476709313285244,
"loss": 22.7198,
"step": 71
},
{
"epoch": 1.8641875505254648,
"grad_norm": 13.101729393005371,
"learning_rate": 0.00011046192508963989,
"loss": 22.2413,
"step": 72
},
{
"epoch": 1.8900565885206144,
"grad_norm": 10.330924987792969,
"learning_rate": 0.00010619148440445364,
"loss": 21.9412,
"step": 73
},
{
"epoch": 1.9159256265157638,
"grad_norm": 16.028894424438477,
"learning_rate": 0.00010195952192057438,
"loss": 22.5098,
"step": 74
},
{
"epoch": 1.9417946645109136,
"grad_norm": 8.1192626953125,
"learning_rate": 9.776975468484019e-05,
"loss": 22.1182,
"step": 75
},
{
"epoch": 1.9417946645109136,
"eval_loss": 0.7175214886665344,
"eval_runtime": 0.6276,
"eval_samples_per_second": 79.669,
"eval_steps_per_second": 3.187,
"step": 75
},
{
"epoch": 1.967663702506063,
"grad_norm": 11.423409461975098,
"learning_rate": 9.36258626828643e-05,
"loss": 22.3389,
"step": 76
},
{
"epoch": 1.9935327405012127,
"grad_norm": 12.934334754943848,
"learning_rate": 8.953148560680418e-05,
"loss": 22.7501,
"step": 77
},
{
"epoch": 2.021018593371059,
"grad_norm": 22.10219383239746,
"learning_rate": 8.549021965852197e-05,
"loss": 23.1807,
"step": 78
},
{
"epoch": 2.0468876313662085,
"grad_norm": 15.90378475189209,
"learning_rate": 8.150561439094303e-05,
"loss": 22.5372,
"step": 79
},
{
"epoch": 2.072756669361358,
"grad_norm": 10.656487464904785,
"learning_rate": 7.758116959038828e-05,
"loss": 22.1827,
"step": 80
},
{
"epoch": 2.0986257073565078,
"grad_norm": 22.766876220703125,
"learning_rate": 7.372033220261696e-05,
"loss": 22.6163,
"step": 81
},
{
"epoch": 2.124494745351657,
"grad_norm": 11.259724617004395,
"learning_rate": 6.992649330528145e-05,
"loss": 22.0147,
"step": 82
},
{
"epoch": 2.1503637833468066,
"grad_norm": 12.66515827178955,
"learning_rate": 6.620298512945214e-05,
"loss": 21.9512,
"step": 83
},
{
"epoch": 2.1762328213419564,
"grad_norm": 5.229973793029785,
"learning_rate": 6.255307813282921e-05,
"loss": 22.17,
"step": 84
},
{
"epoch": 2.202101859337106,
"grad_norm": 6.952908992767334,
"learning_rate": 5.897997812721103e-05,
"loss": 22.418,
"step": 85
},
{
"epoch": 2.2279708973322556,
"grad_norm": 9.438949584960938,
"learning_rate": 5.5486823462743344e-05,
"loss": 22.334,
"step": 86
},
{
"epoch": 2.253839935327405,
"grad_norm": 13.546004295349121,
"learning_rate": 5.2076682271421774e-05,
"loss": 22.3634,
"step": 87
},
{
"epoch": 2.2797089733225544,
"grad_norm": 14.096308708190918,
"learning_rate": 4.8752549772268444e-05,
"loss": 22.6631,
"step": 88
},
{
"epoch": 2.3055780113177042,
"grad_norm": 18.847871780395508,
"learning_rate": 4.551734564055049e-05,
"loss": 22.0801,
"step": 89
},
{
"epoch": 2.3314470493128536,
"grad_norm": 7.903066635131836,
"learning_rate": 4.2373911443350286e-05,
"loss": 22.043,
"step": 90
},
{
"epoch": 2.3573160873080035,
"grad_norm": 16.976978302001953,
"learning_rate": 3.932500814374089e-05,
"loss": 22.2002,
"step": 91
},
{
"epoch": 2.383185125303153,
"grad_norm": 11.1248140335083,
"learning_rate": 3.637331367575698e-05,
"loss": 22.1329,
"step": 92
},
{
"epoch": 2.4090541632983022,
"grad_norm": 5.761756896972656,
"learning_rate": 3.352142059229365e-05,
"loss": 22.0856,
"step": 93
},
{
"epoch": 2.434923201293452,
"grad_norm": 12.847921371459961,
"learning_rate": 3.077183378799699e-05,
"loss": 22.0646,
"step": 94
},
{
"epoch": 2.4607922392886015,
"grad_norm": 9.289769172668457,
"learning_rate": 2.81269682991478e-05,
"loss": 21.8848,
"step": 95
},
{
"epoch": 2.486661277283751,
"grad_norm": 13.644316673278809,
"learning_rate": 2.5589147182469732e-05,
"loss": 23.1436,
"step": 96
},
{
"epoch": 2.5125303152789007,
"grad_norm": 16.434682846069336,
"learning_rate": 2.316059947472607e-05,
"loss": 22.212,
"step": 97
},
{
"epoch": 2.53839935327405,
"grad_norm": 6.969300270080566,
"learning_rate": 2.0843458234896666e-05,
"loss": 22.2793,
"step": 98
},
{
"epoch": 2.5642683912691995,
"grad_norm": 21.42749786376953,
"learning_rate": 1.8639758670654486e-05,
"loss": 22.3692,
"step": 99
},
{
"epoch": 2.5901374292643493,
"grad_norm": 13.674956321716309,
"learning_rate": 1.6551436350787918e-05,
"loss": 22.2481,
"step": 100
},
{
"epoch": 2.5901374292643493,
"eval_loss": 0.6968957781791687,
"eval_runtime": 0.6272,
"eval_samples_per_second": 79.714,
"eval_steps_per_second": 3.189,
"step": 100
},
{
"epoch": 2.6160064672594987,
"grad_norm": 18.59197235107422,
"learning_rate": 1.4580325505138468e-05,
"loss": 22.2291,
"step": 101
},
{
"epoch": 2.6418755052546485,
"grad_norm": 8.891879081726074,
"learning_rate": 1.272815741354723e-05,
"loss": 22.3545,
"step": 102
},
{
"epoch": 2.667744543249798,
"grad_norm": 8.696002006530762,
"learning_rate": 1.0996558885224993e-05,
"loss": 22.1393,
"step": 103
},
{
"epoch": 2.6936135812449473,
"grad_norm": 6.413660049438477,
"learning_rate": 9.387050829881865e-06,
"loss": 22.9287,
"step": 104
},
{
"epoch": 2.719482619240097,
"grad_norm": 13.523515701293945,
"learning_rate": 7.90104692187129e-06,
"loss": 22.8497,
"step": 105
},
{
"epoch": 2.7453516572352465,
"grad_norm": 8.049901008605957,
"learning_rate": 6.539852358521636e-06,
"loss": 22.0333,
"step": 106
},
{
"epoch": 2.7712206952303964,
"grad_norm": 8.899979591369629,
"learning_rate": 5.304662713746205e-06,
"loss": 22.1953,
"step": 107
},
{
"epoch": 2.7970897332255458,
"grad_norm": 9.008318901062012,
"learning_rate": 4.1965628879383875e-06,
"loss": 22.1504,
"step": 108
},
{
"epoch": 2.822958771220695,
"grad_norm": 13.858719825744629,
"learning_rate": 3.2165261550743946e-06,
"loss": 22.0938,
"step": 109
},
{
"epoch": 2.8488278092158446,
"grad_norm": 6.062250137329102,
"learning_rate": 2.3654133078604753e-06,
"loss": 22.1504,
"step": 110
},
{
"epoch": 2.8746968472109944,
"grad_norm": 5.169662952423096,
"learning_rate": 1.643971901675395e-06,
"loss": 22.1182,
"step": 111
},
{
"epoch": 2.900565885206144,
"grad_norm": 9.194791793823242,
"learning_rate": 1.0528355979724624e-06,
"loss": 22.0225,
"step": 112
},
{
"epoch": 2.9264349232012936,
"grad_norm": 5.832217693328857,
"learning_rate": 5.925236077174655e-07,
"loss": 22.2256,
"step": 113
},
{
"epoch": 2.952303961196443,
"grad_norm": 3.4554474353790283,
"learning_rate": 2.634402353517973e-07,
"loss": 22.0733,
"step": 114
},
{
"epoch": 2.9781729991915924,
"grad_norm": 10.36470890045166,
"learning_rate": 6.587452368084779e-08,
"loss": 22.0811,
"step": 115
},
{
"epoch": 3.005658852061439,
"grad_norm": 10.728325843811035,
"learning_rate": 0.0,
"loss": 21.647,
"step": 116
}
],
"logging_steps": 1,
"max_steps": 116,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1836675900991078e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}