{
"best_metric": 0.29094862937927246,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 0.7761966364812419,
"eval_steps": 25,
"global_step": 150,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00517464424320828,
"grad_norm": 137.61221313476562,
"learning_rate": 5.555555555555556e-06,
"loss": 45.3923,
"step": 1
},
{
"epoch": 0.00517464424320828,
"eval_loss": 0.879862904548645,
"eval_runtime": 2.4641,
"eval_samples_per_second": 20.291,
"eval_steps_per_second": 5.276,
"step": 1
},
{
"epoch": 0.01034928848641656,
"grad_norm": 145.64358520507812,
"learning_rate": 1.1111111111111112e-05,
"loss": 47.4644,
"step": 2
},
{
"epoch": 0.015523932729624839,
"grad_norm": 143.3395233154297,
"learning_rate": 1.6666666666666667e-05,
"loss": 46.5953,
"step": 3
},
{
"epoch": 0.02069857697283312,
"grad_norm": 131.87200927734375,
"learning_rate": 2.2222222222222223e-05,
"loss": 46.5328,
"step": 4
},
{
"epoch": 0.0258732212160414,
"grad_norm": 114.46612548828125,
"learning_rate": 2.777777777777778e-05,
"loss": 44.6468,
"step": 5
},
{
"epoch": 0.031047865459249677,
"grad_norm": 86.16083526611328,
"learning_rate": 3.3333333333333335e-05,
"loss": 40.958,
"step": 6
},
{
"epoch": 0.03622250970245795,
"grad_norm": 135.6624755859375,
"learning_rate": 3.888888888888889e-05,
"loss": 39.1718,
"step": 7
},
{
"epoch": 0.04139715394566624,
"grad_norm": 88.87834167480469,
"learning_rate": 4.4444444444444447e-05,
"loss": 35.5996,
"step": 8
},
{
"epoch": 0.04657179818887452,
"grad_norm": 134.31365966796875,
"learning_rate": 5e-05,
"loss": 44.2182,
"step": 9
},
{
"epoch": 0.0517464424320828,
"grad_norm": 112.15583038330078,
"learning_rate": 5.555555555555556e-05,
"loss": 40.9168,
"step": 10
},
{
"epoch": 0.056921086675291076,
"grad_norm": 116.23684692382812,
"learning_rate": 6.111111111111112e-05,
"loss": 40.278,
"step": 11
},
{
"epoch": 0.062095730918499355,
"grad_norm": 91.58599853515625,
"learning_rate": 6.666666666666667e-05,
"loss": 39.5289,
"step": 12
},
{
"epoch": 0.06727037516170763,
"grad_norm": 84.65013122558594,
"learning_rate": 7.222222222222222e-05,
"loss": 34.301,
"step": 13
},
{
"epoch": 0.0724450194049159,
"grad_norm": 63.37664031982422,
"learning_rate": 7.777777777777778e-05,
"loss": 30.2999,
"step": 14
},
{
"epoch": 0.07761966364812418,
"grad_norm": 39.683807373046875,
"learning_rate": 8.333333333333334e-05,
"loss": 29.2968,
"step": 15
},
{
"epoch": 0.08279430789133248,
"grad_norm": 31.897132873535156,
"learning_rate": 8.888888888888889e-05,
"loss": 28.7925,
"step": 16
},
{
"epoch": 0.08796895213454076,
"grad_norm": 34.73341751098633,
"learning_rate": 9.444444444444444e-05,
"loss": 28.2201,
"step": 17
},
{
"epoch": 0.09314359637774904,
"grad_norm": 33.116825103759766,
"learning_rate": 0.0001,
"loss": 27.4134,
"step": 18
},
{
"epoch": 0.09831824062095731,
"grad_norm": 29.32647132873535,
"learning_rate": 9.999884773765534e-05,
"loss": 27.2748,
"step": 19
},
{
"epoch": 0.1034928848641656,
"grad_norm": 34.262603759765625,
"learning_rate": 9.999539100963065e-05,
"loss": 26.7995,
"step": 20
},
{
"epoch": 0.10866752910737387,
"grad_norm": 34.93130111694336,
"learning_rate": 9.998962999295068e-05,
"loss": 30.0288,
"step": 21
},
{
"epoch": 0.11384217335058215,
"grad_norm": 45.51692581176758,
"learning_rate": 9.998156498264669e-05,
"loss": 32.0179,
"step": 22
},
{
"epoch": 0.11901681759379043,
"grad_norm": 53.52436828613281,
"learning_rate": 9.997119639174122e-05,
"loss": 31.3302,
"step": 23
},
{
"epoch": 0.12419146183699871,
"grad_norm": 53.14868927001953,
"learning_rate": 9.995852475122702e-05,
"loss": 31.3536,
"step": 24
},
{
"epoch": 0.129366106080207,
"grad_norm": 55.426658630371094,
"learning_rate": 9.994355071003984e-05,
"loss": 30.1863,
"step": 25
},
{
"epoch": 0.129366106080207,
"eval_loss": 0.4283739924430847,
"eval_runtime": 2.4658,
"eval_samples_per_second": 20.277,
"eval_steps_per_second": 5.272,
"step": 25
},
{
"epoch": 0.13454075032341525,
"grad_norm": 59.29961395263672,
"learning_rate": 9.992627503502517e-05,
"loss": 25.8217,
"step": 26
},
{
"epoch": 0.13971539456662355,
"grad_norm": 32.18626403808594,
"learning_rate": 9.990669861089904e-05,
"loss": 24.9423,
"step": 27
},
{
"epoch": 0.1448900388098318,
"grad_norm": 42.07598876953125,
"learning_rate": 9.988482244020256e-05,
"loss": 24.4099,
"step": 28
},
{
"epoch": 0.1500646830530401,
"grad_norm": 32.522953033447266,
"learning_rate": 9.986064764325079e-05,
"loss": 23.9485,
"step": 29
},
{
"epoch": 0.15523932729624837,
"grad_norm": 32.758941650390625,
"learning_rate": 9.983417545807521e-05,
"loss": 23.9998,
"step": 30
},
{
"epoch": 0.16041397153945666,
"grad_norm": 30.58071517944336,
"learning_rate": 9.980540724036031e-05,
"loss": 23.4842,
"step": 31
},
{
"epoch": 0.16558861578266496,
"grad_norm": 29.740413665771484,
"learning_rate": 9.977434446337431e-05,
"loss": 23.3358,
"step": 32
},
{
"epoch": 0.17076326002587322,
"grad_norm": 31.934289932250977,
"learning_rate": 9.974098871789359e-05,
"loss": 24.1982,
"step": 33
},
{
"epoch": 0.1759379042690815,
"grad_norm": 52.297630310058594,
"learning_rate": 9.970534171212117e-05,
"loss": 29.5956,
"step": 34
},
{
"epoch": 0.18111254851228978,
"grad_norm": 44.05955123901367,
"learning_rate": 9.966740527159945e-05,
"loss": 28.9523,
"step": 35
},
{
"epoch": 0.18628719275549807,
"grad_norm": 48.28145217895508,
"learning_rate": 9.962718133911648e-05,
"loss": 27.3425,
"step": 36
},
{
"epoch": 0.19146183699870634,
"grad_norm": 50.967891693115234,
"learning_rate": 9.958467197460662e-05,
"loss": 28.0913,
"step": 37
},
{
"epoch": 0.19663648124191463,
"grad_norm": 48.36716842651367,
"learning_rate": 9.953987935504497e-05,
"loss": 24.9412,
"step": 38
},
{
"epoch": 0.2018111254851229,
"grad_norm": 34.47467803955078,
"learning_rate": 9.949280577433593e-05,
"loss": 22.663,
"step": 39
},
{
"epoch": 0.2069857697283312,
"grad_norm": 24.475858688354492,
"learning_rate": 9.944345364319571e-05,
"loss": 22.2558,
"step": 40
},
{
"epoch": 0.21216041397153945,
"grad_norm": 24.5921630859375,
"learning_rate": 9.939182548902883e-05,
"loss": 23.2109,
"step": 41
},
{
"epoch": 0.21733505821474774,
"grad_norm": 22.380992889404297,
"learning_rate": 9.933792395579877e-05,
"loss": 22.2931,
"step": 42
},
{
"epoch": 0.222509702457956,
"grad_norm": 27.86241340637207,
"learning_rate": 9.928175180389254e-05,
"loss": 21.9427,
"step": 43
},
{
"epoch": 0.2276843467011643,
"grad_norm": 25.076570510864258,
"learning_rate": 9.922331190997922e-05,
"loss": 21.5344,
"step": 44
},
{
"epoch": 0.23285899094437257,
"grad_norm": 26.751014709472656,
"learning_rate": 9.916260726686278e-05,
"loss": 21.4259,
"step": 45
},
{
"epoch": 0.23803363518758086,
"grad_norm": 34.77674865722656,
"learning_rate": 9.909964098332879e-05,
"loss": 24.7821,
"step": 46
},
{
"epoch": 0.24320827943078913,
"grad_norm": 45.179378509521484,
"learning_rate": 9.903441628398511e-05,
"loss": 25.4269,
"step": 47
},
{
"epoch": 0.24838292367399742,
"grad_norm": 49.48162078857422,
"learning_rate": 9.896693650909686e-05,
"loss": 26.9395,
"step": 48
},
{
"epoch": 0.2535575679172057,
"grad_norm": 51.22860336303711,
"learning_rate": 9.889720511441532e-05,
"loss": 24.3481,
"step": 49
},
{
"epoch": 0.258732212160414,
"grad_norm": 45.01136779785156,
"learning_rate": 9.882522567100093e-05,
"loss": 25.6337,
"step": 50
},
{
"epoch": 0.258732212160414,
"eval_loss": 0.3623690903186798,
"eval_runtime": 2.4658,
"eval_samples_per_second": 20.278,
"eval_steps_per_second": 5.272,
"step": 50
},
{
"epoch": 0.26390685640362227,
"grad_norm": 34.13395690917969,
"learning_rate": 9.875100186504046e-05,
"loss": 22.0404,
"step": 51
},
{
"epoch": 0.2690815006468305,
"grad_norm": 25.99323272705078,
"learning_rate": 9.867453749765821e-05,
"loss": 21.5824,
"step": 52
},
{
"epoch": 0.2742561448900388,
"grad_norm": 25.306379318237305,
"learning_rate": 9.859583648472133e-05,
"loss": 21.0068,
"step": 53
},
{
"epoch": 0.2794307891332471,
"grad_norm": 24.23813247680664,
"learning_rate": 9.851490285663937e-05,
"loss": 20.8086,
"step": 54
},
{
"epoch": 0.2846054333764554,
"grad_norm": 26.623321533203125,
"learning_rate": 9.84317407581577e-05,
"loss": 20.7912,
"step": 55
},
{
"epoch": 0.2897800776196636,
"grad_norm": 24.135229110717773,
"learning_rate": 9.834635444814545e-05,
"loss": 21.1925,
"step": 56
},
{
"epoch": 0.2949547218628719,
"grad_norm": 23.138614654541016,
"learning_rate": 9.825874829937722e-05,
"loss": 20.337,
"step": 57
},
{
"epoch": 0.3001293661060802,
"grad_norm": 25.857755661010742,
"learning_rate": 9.816892679830937e-05,
"loss": 20.4409,
"step": 58
},
{
"epoch": 0.3053040103492885,
"grad_norm": 51.40500259399414,
"learning_rate": 9.807689454485e-05,
"loss": 27.1275,
"step": 59
},
{
"epoch": 0.31047865459249674,
"grad_norm": 46.01802444458008,
"learning_rate": 9.798265625212358e-05,
"loss": 24.8525,
"step": 60
},
{
"epoch": 0.31565329883570503,
"grad_norm": 44.5273323059082,
"learning_rate": 9.788621674622949e-05,
"loss": 25.6639,
"step": 61
},
{
"epoch": 0.3208279430789133,
"grad_norm": 55.26118469238281,
"learning_rate": 9.778758096599488e-05,
"loss": 24.8307,
"step": 62
},
{
"epoch": 0.3260025873221216,
"grad_norm": 39.436317443847656,
"learning_rate": 9.76867539627218e-05,
"loss": 23.664,
"step": 63
},
{
"epoch": 0.3311772315653299,
"grad_norm": 27.185758590698242,
"learning_rate": 9.758374089992841e-05,
"loss": 20.8368,
"step": 64
},
{
"epoch": 0.33635187580853815,
"grad_norm": 24.435218811035156,
"learning_rate": 9.747854705308464e-05,
"loss": 20.0758,
"step": 65
},
{
"epoch": 0.34152652005174644,
"grad_norm": 24.6842041015625,
"learning_rate": 9.737117780934197e-05,
"loss": 20.2967,
"step": 66
},
{
"epoch": 0.34670116429495473,
"grad_norm": 24.836183547973633,
"learning_rate": 9.726163866725763e-05,
"loss": 19.7435,
"step": 67
},
{
"epoch": 0.351875808538163,
"grad_norm": 23.984460830688477,
"learning_rate": 9.714993523651283e-05,
"loss": 19.9251,
"step": 68
},
{
"epoch": 0.35705045278137126,
"grad_norm": 24.694488525390625,
"learning_rate": 9.703607323762569e-05,
"loss": 19.8217,
"step": 69
},
{
"epoch": 0.36222509702457956,
"grad_norm": 24.874622344970703,
"learning_rate": 9.692005850165816e-05,
"loss": 20.1879,
"step": 70
},
{
"epoch": 0.36739974126778785,
"grad_norm": 27.704814910888672,
"learning_rate": 9.680189696991742e-05,
"loss": 21.443,
"step": 71
},
{
"epoch": 0.37257438551099614,
"grad_norm": 47.33317565917969,
"learning_rate": 9.668159469365163e-05,
"loss": 24.7285,
"step": 72
},
{
"epoch": 0.3777490297542044,
"grad_norm": 39.55653762817383,
"learning_rate": 9.655915783374005e-05,
"loss": 23.3597,
"step": 73
},
{
"epoch": 0.3829236739974127,
"grad_norm": 41.66143798828125,
"learning_rate": 9.643459266037744e-05,
"loss": 22.6492,
"step": 74
},
{
"epoch": 0.38809831824062097,
"grad_norm": 61.841583251953125,
"learning_rate": 9.630790555275313e-05,
"loss": 23.7358,
"step": 75
},
{
"epoch": 0.38809831824062097,
"eval_loss": 0.3401211202144623,
"eval_runtime": 2.4648,
"eval_samples_per_second": 20.286,
"eval_steps_per_second": 5.274,
"step": 75
},
{
"epoch": 0.39327296248382926,
"grad_norm": 47.54359817504883,
"learning_rate": 9.617910299872416e-05,
"loss": 20.79,
"step": 76
},
{
"epoch": 0.3984476067270375,
"grad_norm": 28.126712799072266,
"learning_rate": 9.604819159448309e-05,
"loss": 20.1379,
"step": 77
},
{
"epoch": 0.4036222509702458,
"grad_norm": 29.2869930267334,
"learning_rate": 9.591517804422023e-05,
"loss": 19.5183,
"step": 78
},
{
"epoch": 0.4087968952134541,
"grad_norm": 30.21017074584961,
"learning_rate": 9.578006915978022e-05,
"loss": 19.2306,
"step": 79
},
{
"epoch": 0.4139715394566624,
"grad_norm": 23.017995834350586,
"learning_rate": 9.564287186031333e-05,
"loss": 19.4107,
"step": 80
},
{
"epoch": 0.4191461836998706,
"grad_norm": 24.11803436279297,
"learning_rate": 9.550359317192096e-05,
"loss": 19.2999,
"step": 81
},
{
"epoch": 0.4243208279430789,
"grad_norm": 25.78137969970703,
"learning_rate": 9.536224022729591e-05,
"loss": 19.0366,
"step": 82
},
{
"epoch": 0.4294954721862872,
"grad_norm": 23.27781867980957,
"learning_rate": 9.521882026535708e-05,
"loss": 18.6447,
"step": 83
},
{
"epoch": 0.4346701164294955,
"grad_norm": 43.739017486572266,
"learning_rate": 9.50733406308788e-05,
"loss": 23.4598,
"step": 84
},
{
"epoch": 0.4398447606727037,
"grad_norm": 39.98222351074219,
"learning_rate": 9.492580877411456e-05,
"loss": 22.9503,
"step": 85
},
{
"epoch": 0.445019404915912,
"grad_norm": 39.081974029541016,
"learning_rate": 9.477623225041565e-05,
"loss": 23.2451,
"step": 86
},
{
"epoch": 0.4501940491591203,
"grad_norm": 45.064517974853516,
"learning_rate": 9.462461871984411e-05,
"loss": 22.8303,
"step": 87
},
{
"epoch": 0.4553686934023286,
"grad_norm": 35.90961456298828,
"learning_rate": 9.447097594678046e-05,
"loss": 20.3165,
"step": 88
},
{
"epoch": 0.46054333764553684,
"grad_norm": 29.820219039916992,
"learning_rate": 9.431531179952613e-05,
"loss": 19.7648,
"step": 89
},
{
"epoch": 0.46571798188874514,
"grad_norm": 24.659387588500977,
"learning_rate": 9.415763424990047e-05,
"loss": 19.644,
"step": 90
},
{
"epoch": 0.47089262613195343,
"grad_norm": 23.654937744140625,
"learning_rate": 9.39979513728325e-05,
"loss": 18.7199,
"step": 91
},
{
"epoch": 0.4760672703751617,
"grad_norm": 25.759111404418945,
"learning_rate": 9.383627134594741e-05,
"loss": 19.023,
"step": 92
},
{
"epoch": 0.48124191461837,
"grad_norm": 26.290327072143555,
"learning_rate": 9.367260244914768e-05,
"loss": 19.474,
"step": 93
},
{
"epoch": 0.48641655886157825,
"grad_norm": 22.56080436706543,
"learning_rate": 9.350695306418922e-05,
"loss": 18.6015,
"step": 94
},
{
"epoch": 0.49159120310478654,
"grad_norm": 24.438447952270508,
"learning_rate": 9.333933167425194e-05,
"loss": 18.6668,
"step": 95
},
{
"epoch": 0.49676584734799484,
"grad_norm": 26.50576400756836,
"learning_rate": 9.316974686350542e-05,
"loss": 20.2677,
"step": 96
},
{
"epoch": 0.5019404915912031,
"grad_norm": 34.5866584777832,
"learning_rate": 9.299820731666933e-05,
"loss": 22.8894,
"step": 97
},
{
"epoch": 0.5071151358344114,
"grad_norm": 37.455406188964844,
"learning_rate": 9.282472181856854e-05,
"loss": 22.4006,
"step": 98
},
{
"epoch": 0.5122897800776197,
"grad_norm": 42.7640380859375,
"learning_rate": 9.264929925368338e-05,
"loss": 22.2345,
"step": 99
},
{
"epoch": 0.517464424320828,
"grad_norm": 38.7821044921875,
"learning_rate": 9.247194860569454e-05,
"loss": 20.2589,
"step": 100
},
{
"epoch": 0.517464424320828,
"eval_loss": 0.30999085307121277,
"eval_runtime": 2.4658,
"eval_samples_per_second": 20.278,
"eval_steps_per_second": 5.272,
"step": 100
},
{
"epoch": 0.5226390685640362,
"grad_norm": 36.86570739746094,
"learning_rate": 9.229267895702307e-05,
"loss": 19.0464,
"step": 101
},
{
"epoch": 0.5278137128072445,
"grad_norm": 25.674314498901367,
"learning_rate": 9.211149948836523e-05,
"loss": 18.7845,
"step": 102
},
{
"epoch": 0.5329883570504528,
"grad_norm": 26.45655632019043,
"learning_rate": 9.192841947822232e-05,
"loss": 18.6392,
"step": 103
},
{
"epoch": 0.538163001293661,
"grad_norm": 21.524782180786133,
"learning_rate": 9.17434483024255e-05,
"loss": 17.6237,
"step": 104
},
{
"epoch": 0.5433376455368694,
"grad_norm": 22.33011817932129,
"learning_rate": 9.155659543365574e-05,
"loss": 18.1102,
"step": 105
},
{
"epoch": 0.5485122897800776,
"grad_norm": 25.29583168029785,
"learning_rate": 9.136787044095856e-05,
"loss": 17.8555,
"step": 106
},
{
"epoch": 0.553686934023286,
"grad_norm": 23.13666343688965,
"learning_rate": 9.117728298925407e-05,
"loss": 17.7725,
"step": 107
},
{
"epoch": 0.5588615782664942,
"grad_norm": 29.024078369140625,
"learning_rate": 9.0984842838842e-05,
"loss": 19.2546,
"step": 108
},
{
"epoch": 0.5640362225097024,
"grad_norm": 50.17615509033203,
"learning_rate": 9.079055984490186e-05,
"loss": 23.3918,
"step": 109
},
{
"epoch": 0.5692108667529108,
"grad_norm": 35.480079650878906,
"learning_rate": 9.059444395698823e-05,
"loss": 22.019,
"step": 110
},
{
"epoch": 0.574385510996119,
"grad_norm": 38.16156005859375,
"learning_rate": 9.039650521852124e-05,
"loss": 20.7054,
"step": 111
},
{
"epoch": 0.5795601552393272,
"grad_norm": 46.26918411254883,
"learning_rate": 9.019675376627223e-05,
"loss": 22.4658,
"step": 112
},
{
"epoch": 0.5847347994825356,
"grad_norm": 39.86201095581055,
"learning_rate": 8.99951998298446e-05,
"loss": 19.7684,
"step": 113
},
{
"epoch": 0.5899094437257438,
"grad_norm": 31.37214469909668,
"learning_rate": 8.979185373114996e-05,
"loss": 18.5229,
"step": 114
},
{
"epoch": 0.5950840879689522,
"grad_norm": 23.724746704101562,
"learning_rate": 8.958672588387953e-05,
"loss": 18.7853,
"step": 115
},
{
"epoch": 0.6002587322121604,
"grad_norm": 20.492137908935547,
"learning_rate": 8.937982679297084e-05,
"loss": 17.696,
"step": 116
},
{
"epoch": 0.6054333764553687,
"grad_norm": 21.25743293762207,
"learning_rate": 8.917116705406973e-05,
"loss": 18.5685,
"step": 117
},
{
"epoch": 0.610608020698577,
"grad_norm": 20.935260772705078,
"learning_rate": 8.89607573529878e-05,
"loss": 17.7144,
"step": 118
},
{
"epoch": 0.6157826649417852,
"grad_norm": 22.177967071533203,
"learning_rate": 8.8748608465155e-05,
"loss": 17.7657,
"step": 119
},
{
"epoch": 0.6209573091849935,
"grad_norm": 23.44055938720703,
"learning_rate": 8.853473125506803e-05,
"loss": 17.5972,
"step": 120
},
{
"epoch": 0.6261319534282018,
"grad_norm": 28.071083068847656,
"learning_rate": 8.831913667573379e-05,
"loss": 20.4785,
"step": 121
},
{
"epoch": 0.6313065976714101,
"grad_norm": 41.40524673461914,
"learning_rate": 8.810183576810856e-05,
"loss": 22.6656,
"step": 122
},
{
"epoch": 0.6364812419146184,
"grad_norm": 35.51282501220703,
"learning_rate": 8.788283966053244e-05,
"loss": 20.7936,
"step": 123
},
{
"epoch": 0.6416558861578266,
"grad_norm": 36.20588302612305,
"learning_rate": 8.766215956815959e-05,
"loss": 20.6696,
"step": 124
},
{
"epoch": 0.6468305304010349,
"grad_norm": 45.860923767089844,
"learning_rate": 8.743980679238385e-05,
"loss": 20.0276,
"step": 125
},
{
"epoch": 0.6468305304010349,
"eval_loss": 0.3056069612503052,
"eval_runtime": 2.4651,
"eval_samples_per_second": 20.283,
"eval_steps_per_second": 5.274,
"step": 125
},
{
"epoch": 0.6520051746442432,
"grad_norm": 29.0579891204834,
"learning_rate": 8.721579272025989e-05,
"loss": 18.0009,
"step": 126
},
{
"epoch": 0.6571798188874515,
"grad_norm": 24.840801239013672,
"learning_rate": 8.699012882392018e-05,
"loss": 18.2828,
"step": 127
},
{
"epoch": 0.6623544631306598,
"grad_norm": 21.309526443481445,
"learning_rate": 8.676282665998736e-05,
"loss": 17.479,
"step": 128
},
{
"epoch": 0.6675291073738681,
"grad_norm": 19.316205978393555,
"learning_rate": 8.653389786898255e-05,
"loss": 17.4983,
"step": 129
},
{
"epoch": 0.6727037516170763,
"grad_norm": 19.56000328063965,
"learning_rate": 8.630335417472909e-05,
"loss": 17.2595,
"step": 130
},
{
"epoch": 0.6778783958602846,
"grad_norm": 21.257116317749023,
"learning_rate": 8.607120738375219e-05,
"loss": 17.2479,
"step": 131
},
{
"epoch": 0.6830530401034929,
"grad_norm": 22.04469108581543,
"learning_rate": 8.583746938467436e-05,
"loss": 17.3352,
"step": 132
},
{
"epoch": 0.6882276843467011,
"grad_norm": 22.032026290893555,
"learning_rate": 8.560215214760647e-05,
"loss": 17.1702,
"step": 133
},
{
"epoch": 0.6934023285899095,
"grad_norm": 35.43321228027344,
"learning_rate": 8.53652677235348e-05,
"loss": 21.7454,
"step": 134
},
{
"epoch": 0.6985769728331177,
"grad_norm": 35.82448196411133,
"learning_rate": 8.512682824370386e-05,
"loss": 21.3486,
"step": 135
},
{
"epoch": 0.703751617076326,
"grad_norm": 33.74680709838867,
"learning_rate": 8.48868459189952e-05,
"loss": 19.9599,
"step": 136
},
{
"epoch": 0.7089262613195343,
"grad_norm": 34.64817428588867,
"learning_rate": 8.464533303930195e-05,
"loss": 20.4086,
"step": 137
},
{
"epoch": 0.7141009055627425,
"grad_norm": 30.78453826904297,
"learning_rate": 8.440230197289955e-05,
"loss": 19.2218,
"step": 138
},
{
"epoch": 0.7192755498059509,
"grad_norm": 21.971385955810547,
"learning_rate": 8.415776516581229e-05,
"loss": 17.5228,
"step": 139
},
{
"epoch": 0.7244501940491591,
"grad_norm": 25.183547973632812,
"learning_rate": 8.391173514117591e-05,
"loss": 17.9851,
"step": 140
},
{
"epoch": 0.7296248382923674,
"grad_norm": 21.053089141845703,
"learning_rate": 8.366422449859635e-05,
"loss": 17.2393,
"step": 141
},
{
"epoch": 0.7347994825355757,
"grad_norm": 20.95396614074707,
"learning_rate": 8.34152459135044e-05,
"loss": 16.9388,
"step": 142
},
{
"epoch": 0.7399741267787839,
"grad_norm": 20.432722091674805,
"learning_rate": 8.316481213650668e-05,
"loss": 16.9324,
"step": 143
},
{
"epoch": 0.7451487710219923,
"grad_norm": 21.420108795166016,
"learning_rate": 8.291293599273253e-05,
"loss": 17.3103,
"step": 144
},
{
"epoch": 0.7503234152652005,
"grad_norm": 22.44611167907715,
"learning_rate": 8.265963038117736e-05,
"loss": 16.4267,
"step": 145
},
{
"epoch": 0.7554980595084088,
"grad_norm": 29.85533905029297,
"learning_rate": 8.240490827404196e-05,
"loss": 19.9061,
"step": 146
},
{
"epoch": 0.7606727037516171,
"grad_norm": 36.1021614074707,
"learning_rate": 8.21487827160682e-05,
"loss": 21.6027,
"step": 147
},
{
"epoch": 0.7658473479948253,
"grad_norm": 35.79337692260742,
"learning_rate": 8.189126682387103e-05,
"loss": 21.6038,
"step": 148
},
{
"epoch": 0.7710219922380336,
"grad_norm": 39.69446563720703,
"learning_rate": 8.163237378526669e-05,
"loss": 21.0512,
"step": 149
},
{
"epoch": 0.7761966364812419,
"grad_norm": 37.87336730957031,
"learning_rate": 8.137211685859739e-05,
"loss": 20.3793,
"step": 150
},
{
"epoch": 0.7761966364812419,
"eval_loss": 0.29094862937927246,
"eval_runtime": 2.466,
"eval_samples_per_second": 20.276,
"eval_steps_per_second": 5.272,
"step": 150
}
],
"logging_steps": 1,
"max_steps": 457,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.500900642665267e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}