{
"best_metric": 0.970888078212738,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 2.0,
"eval_steps": 25,
"global_step": 172,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011627906976744186,
"grad_norm": 2.9615676403045654,
"learning_rate": 2.9999999999999997e-05,
"loss": 3.7306,
"step": 1
},
{
"epoch": 0.011627906976744186,
"eval_loss": 1.230737328529358,
"eval_runtime": 7.9791,
"eval_samples_per_second": 6.266,
"eval_steps_per_second": 0.877,
"step": 1
},
{
"epoch": 0.023255813953488372,
"grad_norm": 2.557929754257202,
"learning_rate": 5.9999999999999995e-05,
"loss": 3.564,
"step": 2
},
{
"epoch": 0.03488372093023256,
"grad_norm": 2.1619441509246826,
"learning_rate": 8.999999999999999e-05,
"loss": 3.744,
"step": 3
},
{
"epoch": 0.046511627906976744,
"grad_norm": 1.1338863372802734,
"learning_rate": 0.00011999999999999999,
"loss": 5.1335,
"step": 4
},
{
"epoch": 0.05813953488372093,
"grad_norm": 1.0265060663223267,
"learning_rate": 0.00015,
"loss": 4.7372,
"step": 5
},
{
"epoch": 0.06976744186046512,
"grad_norm": 1.4262174367904663,
"learning_rate": 0.00017999999999999998,
"loss": 5.1326,
"step": 6
},
{
"epoch": 0.08139534883720931,
"grad_norm": 1.4468605518341064,
"learning_rate": 0.00020999999999999998,
"loss": 5.1196,
"step": 7
},
{
"epoch": 0.09302325581395349,
"grad_norm": 1.4061644077301025,
"learning_rate": 0.00023999999999999998,
"loss": 5.5418,
"step": 8
},
{
"epoch": 0.10465116279069768,
"grad_norm": 1.197460651397705,
"learning_rate": 0.00027,
"loss": 4.9912,
"step": 9
},
{
"epoch": 0.11627906976744186,
"grad_norm": 1.332681655883789,
"learning_rate": 0.0003,
"loss": 5.1568,
"step": 10
},
{
"epoch": 0.12790697674418605,
"grad_norm": 1.0973302125930786,
"learning_rate": 0.00029997179556727515,
"loss": 5.2029,
"step": 11
},
{
"epoch": 0.13953488372093023,
"grad_norm": 1.9083119630813599,
"learning_rate": 0.0002998871928756345,
"loss": 4.6768,
"step": 12
},
{
"epoch": 0.1511627906976744,
"grad_norm": 1.5310242176055908,
"learning_rate": 0.00029974622374069024,
"loss": 4.4159,
"step": 13
},
{
"epoch": 0.16279069767441862,
"grad_norm": 0.9514783620834351,
"learning_rate": 0.0002995489411751688,
"loss": 4.3375,
"step": 14
},
{
"epoch": 0.1744186046511628,
"grad_norm": 1.3517793416976929,
"learning_rate": 0.0002992954193689748,
"loss": 4.7541,
"step": 15
},
{
"epoch": 0.18604651162790697,
"grad_norm": 1.600083827972412,
"learning_rate": 0.00029898575366129145,
"loss": 5.5065,
"step": 16
},
{
"epoch": 0.19767441860465115,
"grad_norm": 0.972209095954895,
"learning_rate": 0.00029862006050472675,
"loss": 4.8114,
"step": 17
},
{
"epoch": 0.20930232558139536,
"grad_norm": 0.8742607831954956,
"learning_rate": 0.0002981984774215213,
"loss": 4.5777,
"step": 18
},
{
"epoch": 0.22093023255813954,
"grad_norm": 1.0211178064346313,
"learning_rate": 0.0002977211629518312,
"loss": 5.0349,
"step": 19
},
{
"epoch": 0.23255813953488372,
"grad_norm": 1.19241464138031,
"learning_rate": 0.00029718829659410766,
"loss": 5.1001,
"step": 20
},
{
"epoch": 0.2441860465116279,
"grad_norm": 2.0463316440582275,
"learning_rate": 0.00029660007873759533,
"loss": 5.5808,
"step": 21
},
{
"epoch": 0.2558139534883721,
"grad_norm": 5.027365207672119,
"learning_rate": 0.00029595673058697357,
"loss": 3.2185,
"step": 22
},
{
"epoch": 0.26744186046511625,
"grad_norm": 1.7920422554016113,
"learning_rate": 0.00029525849407917087,
"loss": 3.5772,
"step": 23
},
{
"epoch": 0.27906976744186046,
"grad_norm": 2.2887630462646484,
"learning_rate": 0.000294505631792382,
"loss": 2.6486,
"step": 24
},
{
"epoch": 0.29069767441860467,
"grad_norm": 1.6574441194534302,
"learning_rate": 0.00029369842684732334,
"loss": 3.5684,
"step": 25
},
{
"epoch": 0.29069767441860467,
"eval_loss": 1.0823593139648438,
"eval_runtime": 7.799,
"eval_samples_per_second": 6.411,
"eval_steps_per_second": 0.898,
"step": 25
},
{
"epoch": 0.3023255813953488,
"grad_norm": 0.8784985542297363,
"learning_rate": 0.00029283718280076227,
"loss": 4.3739,
"step": 26
},
{
"epoch": 0.313953488372093,
"grad_norm": 1.098872423171997,
"learning_rate": 0.00029192222353136254,
"loss": 4.6181,
"step": 27
},
{
"epoch": 0.32558139534883723,
"grad_norm": 0.9574615359306335,
"learning_rate": 0.0002909538931178862,
"loss": 4.7032,
"step": 28
},
{
"epoch": 0.3372093023255814,
"grad_norm": 1.1205692291259766,
"learning_rate": 0.0002899325557098001,
"loss": 4.9195,
"step": 29
},
{
"epoch": 0.3488372093023256,
"grad_norm": 0.9449394941329956,
"learning_rate": 0.00028885859539033357,
"loss": 4.2665,
"step": 30
},
{
"epoch": 0.36046511627906974,
"grad_norm": 1.0518043041229248,
"learning_rate": 0.0002877324160320411,
"loss": 4.6985,
"step": 31
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.9868097901344299,
"learning_rate": 0.000286554441144922,
"loss": 4.8123,
"step": 32
},
{
"epoch": 0.38372093023255816,
"grad_norm": 0.8500082492828369,
"learning_rate": 0.00028532511371715566,
"loss": 4.5633,
"step": 33
},
{
"epoch": 0.3953488372093023,
"grad_norm": 0.9240705370903015,
"learning_rate": 0.0002840448960485118,
"loss": 4.4635,
"step": 34
},
{
"epoch": 0.4069767441860465,
"grad_norm": 0.8523270487785339,
"learning_rate": 0.00028271426957649865,
"loss": 4.1489,
"step": 35
},
{
"epoch": 0.4186046511627907,
"grad_norm": 0.9723194241523743,
"learning_rate": 0.00028133373469531363,
"loss": 4.0247,
"step": 36
},
{
"epoch": 0.43023255813953487,
"grad_norm": 1.9848984479904175,
"learning_rate": 0.0002799038105676658,
"loss": 4.588,
"step": 37
},
{
"epoch": 0.4418604651162791,
"grad_norm": 0.8572053909301758,
"learning_rate": 0.00027842503492953995,
"loss": 4.6031,
"step": 38
},
{
"epoch": 0.45348837209302323,
"grad_norm": 0.9035416841506958,
"learning_rate": 0.0002768979638879761,
"loss": 4.8538,
"step": 39
},
{
"epoch": 0.46511627906976744,
"grad_norm": 1.053252935409546,
"learning_rate": 0.00027532317171194046,
"loss": 4.9722,
"step": 40
},
{
"epoch": 0.47674418604651164,
"grad_norm": 1.071892261505127,
"learning_rate": 0.000273701250616366,
"loss": 5.0359,
"step": 41
},
{
"epoch": 0.4883720930232558,
"grad_norm": 2.722141981124878,
"learning_rate": 0.0002720328105394451,
"loss": 6.2443,
"step": 42
},
{
"epoch": 0.5,
"grad_norm": 2.1856935024261475,
"learning_rate": 0.00027031847891325657,
"loss": 3.1716,
"step": 43
},
{
"epoch": 0.5116279069767442,
"grad_norm": 1.6351940631866455,
"learning_rate": 0.00026855890042781387,
"loss": 2.7951,
"step": 44
},
{
"epoch": 0.5232558139534884,
"grad_norm": 1.7642279863357544,
"learning_rate": 0.000266754736788624,
"loss": 3.6811,
"step": 45
},
{
"epoch": 0.5348837209302325,
"grad_norm": 1.665910243988037,
"learning_rate": 0.00026490666646784665,
"loss": 4.2234,
"step": 46
},
{
"epoch": 0.5465116279069767,
"grad_norm": 1.4838505983352661,
"learning_rate": 0.00026301538444914907,
"loss": 4.4947,
"step": 47
},
{
"epoch": 0.5581395348837209,
"grad_norm": 1.082304835319519,
"learning_rate": 0.00026108160196635066,
"loss": 4.6096,
"step": 48
},
{
"epoch": 0.5697674418604651,
"grad_norm": 0.9804915189743042,
"learning_rate": 0.0002591060462359573,
"loss": 4.3917,
"step": 49
},
{
"epoch": 0.5813953488372093,
"grad_norm": 1.103212594985962,
"learning_rate": 0.00025708946018368484,
"loss": 4.2729,
"step": 50
},
{
"epoch": 0.5813953488372093,
"eval_loss": 1.018844723701477,
"eval_runtime": 8.0091,
"eval_samples_per_second": 6.243,
"eval_steps_per_second": 0.874,
"step": 50
},
{
"epoch": 0.5930232558139535,
"grad_norm": 0.9773282408714294,
"learning_rate": 0.00025503260216507527,
"loss": 4.8289,
"step": 51
},
{
"epoch": 0.6046511627906976,
"grad_norm": 0.9617090225219727,
"learning_rate": 0.00025293624568031,
"loss": 4.4117,
"step": 52
},
{
"epoch": 0.6162790697674418,
"grad_norm": 2.6547250747680664,
"learning_rate": 0.00025080117908332834,
"loss": 4.1853,
"step": 53
},
{
"epoch": 0.627906976744186,
"grad_norm": 0.9078519940376282,
"learning_rate": 0.00024862820528535954,
"loss": 4.1129,
"step": 54
},
{
"epoch": 0.6395348837209303,
"grad_norm": 0.9886384606361389,
"learning_rate": 0.0002464181414529809,
"loss": 3.9609,
"step": 55
},
{
"epoch": 0.6511627906976745,
"grad_norm": 0.9917184710502625,
"learning_rate": 0.0002441718187008148,
"loss": 3.9052,
"step": 56
},
{
"epoch": 0.6627906976744186,
"grad_norm": 0.8498942255973816,
"learning_rate": 0.0002418900817789804,
"loss": 4.3063,
"step": 57
},
{
"epoch": 0.6744186046511628,
"grad_norm": 3.341484308242798,
"learning_rate": 0.00023957378875541792,
"loss": 5.0247,
"step": 58
},
{
"epoch": 0.686046511627907,
"grad_norm": 0.8067652583122253,
"learning_rate": 0.00023722381069320398,
"loss": 4.6544,
"step": 59
},
{
"epoch": 0.6976744186046512,
"grad_norm": 0.8337781429290771,
"learning_rate": 0.00023484103132298079,
"loss": 4.5483,
"step": 60
},
{
"epoch": 0.7093023255813954,
"grad_norm": 0.9353996515274048,
"learning_rate": 0.0002324263467106209,
"loss": 4.5697,
"step": 61
},
{
"epoch": 0.7209302325581395,
"grad_norm": 1.1387176513671875,
"learning_rate": 0.0002299806649202537,
"loss": 4.8516,
"step": 62
},
{
"epoch": 0.7325581395348837,
"grad_norm": 2.2056901454925537,
"learning_rate": 0.00022750490567277943,
"loss": 5.6958,
"step": 63
},
{
"epoch": 0.7441860465116279,
"grad_norm": 1.9957636594772339,
"learning_rate": 0.000225,
"loss": 2.8987,
"step": 64
},
{
"epoch": 0.7558139534883721,
"grad_norm": 2.1281590461730957,
"learning_rate": 0.00022246688989449576,
"loss": 3.1358,
"step": 65
},
{
"epoch": 0.7674418604651163,
"grad_norm": 1.0050021409988403,
"learning_rate": 0.00021990652795538082,
"loss": 3.127,
"step": 66
},
{
"epoch": 0.7790697674418605,
"grad_norm": 0.9873821139335632,
"learning_rate": 0.00021731987703006933,
"loss": 4.0994,
"step": 67
},
{
"epoch": 0.7906976744186046,
"grad_norm": 1.039175271987915,
"learning_rate": 0.00021470790985218802,
"loss": 4.3436,
"step": 68
},
{
"epoch": 0.8023255813953488,
"grad_norm": 0.9042471647262573,
"learning_rate": 0.00021207160867577087,
"loss": 4.0852,
"step": 69
},
{
"epoch": 0.813953488372093,
"grad_norm": 0.8749189972877502,
"learning_rate": 0.0002094119649058735,
"loss": 4.5285,
"step": 70
},
{
"epoch": 0.8255813953488372,
"grad_norm": 0.9549452662467957,
"learning_rate": 0.00020672997872574637,
"loss": 4.3908,
"step": 71
},
{
"epoch": 0.8372093023255814,
"grad_norm": 1.4416093826293945,
"learning_rate": 0.00020402665872070654,
"loss": 4.1939,
"step": 72
},
{
"epoch": 0.8488372093023255,
"grad_norm": 0.9574504494667053,
"learning_rate": 0.00020130302149885031,
"loss": 4.4245,
"step": 73
},
{
"epoch": 0.8604651162790697,
"grad_norm": 0.8751915097236633,
"learning_rate": 0.00019856009130874816,
"loss": 4.207,
"step": 74
},
{
"epoch": 0.872093023255814,
"grad_norm": 0.8442956209182739,
"learning_rate": 0.00019579889965426698,
"loss": 3.7247,
"step": 75
},
{
"epoch": 0.872093023255814,
"eval_loss": 1.0017675161361694,
"eval_runtime": 7.7953,
"eval_samples_per_second": 6.414,
"eval_steps_per_second": 0.898,
"step": 75
},
{
"epoch": 0.8837209302325582,
"grad_norm": 0.7961885929107666,
"learning_rate": 0.00019302048490666353,
"loss": 3.855,
"step": 76
},
{
"epoch": 0.8953488372093024,
"grad_norm": 0.8288469314575195,
"learning_rate": 0.0001902258919140956,
"loss": 4.3016,
"step": 77
},
{
"epoch": 0.9069767441860465,
"grad_norm": 0.9873271584510803,
"learning_rate": 0.00018741617160869718,
"loss": 4.0653,
"step": 78
},
{
"epoch": 0.9186046511627907,
"grad_norm": 0.8603689074516296,
"learning_rate": 0.00018459238061136602,
"loss": 4.3487,
"step": 79
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.8512229323387146,
"learning_rate": 0.00018175558083441162,
"loss": 4.331,
"step": 80
},
{
"epoch": 0.9418604651162791,
"grad_norm": 0.9621251821517944,
"learning_rate": 0.00017890683908221346,
"loss": 4.4871,
"step": 81
},
{
"epoch": 0.9534883720930233,
"grad_norm": 1.079795479774475,
"learning_rate": 0.00017604722665003956,
"loss": 4.9358,
"step": 82
},
{
"epoch": 0.9651162790697675,
"grad_norm": 1.1401747465133667,
"learning_rate": 0.00017317781892117607,
"loss": 5.0393,
"step": 83
},
{
"epoch": 0.9767441860465116,
"grad_norm": 2.458475112915039,
"learning_rate": 0.00017029969496251966,
"loss": 5.3823,
"step": 84
},
{
"epoch": 0.9883720930232558,
"grad_norm": 1.0386871099472046,
"learning_rate": 0.00016741393711878453,
"loss": 3.8578,
"step": 85
},
{
"epoch": 1.0,
"grad_norm": 1.7885408401489258,
"learning_rate": 0.00016452163060547687,
"loss": 4.6428,
"step": 86
},
{
"epoch": 1.0116279069767442,
"grad_norm": 1.0790536403656006,
"learning_rate": 0.00016162386310078963,
"loss": 2.3767,
"step": 87
},
{
"epoch": 1.0232558139534884,
"grad_norm": 0.8506271839141846,
"learning_rate": 0.00015872172433657134,
"loss": 2.6958,
"step": 88
},
{
"epoch": 1.0348837209302326,
"grad_norm": 0.7386451959609985,
"learning_rate": 0.0001558163056885225,
"loss": 3.0156,
"step": 89
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.901319682598114,
"learning_rate": 0.00015290869976577364,
"loss": 3.9778,
"step": 90
},
{
"epoch": 1.058139534883721,
"grad_norm": 0.9542390704154968,
"learning_rate": 0.00015,
"loss": 3.7751,
"step": 91
},
{
"epoch": 1.069767441860465,
"grad_norm": 0.9419402480125427,
"learning_rate": 0.00014709130023422633,
"loss": 4.0309,
"step": 92
},
{
"epoch": 1.0813953488372092,
"grad_norm": 0.8811500668525696,
"learning_rate": 0.00014418369431147746,
"loss": 4.2052,
"step": 93
},
{
"epoch": 1.0930232558139534,
"grad_norm": 0.9012645483016968,
"learning_rate": 0.00014127827566342863,
"loss": 3.7495,
"step": 94
},
{
"epoch": 1.1046511627906976,
"grad_norm": 0.9330968260765076,
"learning_rate": 0.00013837613689921037,
"loss": 4.3093,
"step": 95
},
{
"epoch": 1.1162790697674418,
"grad_norm": 0.9980186820030212,
"learning_rate": 0.00013547836939452313,
"loss": 4.2077,
"step": 96
},
{
"epoch": 1.127906976744186,
"grad_norm": 0.8667058944702148,
"learning_rate": 0.00013258606288121542,
"loss": 3.4672,
"step": 97
},
{
"epoch": 1.1395348837209303,
"grad_norm": 1.185315489768982,
"learning_rate": 0.00012970030503748036,
"loss": 3.9651,
"step": 98
},
{
"epoch": 1.1511627906976745,
"grad_norm": 0.883290708065033,
"learning_rate": 0.00012682218107882393,
"loss": 3.8345,
"step": 99
},
{
"epoch": 1.1627906976744187,
"grad_norm": 0.7864513993263245,
"learning_rate": 0.00012395277334996044,
"loss": 3.7654,
"step": 100
},
{
"epoch": 1.1627906976744187,
"eval_loss": 0.991995632648468,
"eval_runtime": 8.0088,
"eval_samples_per_second": 6.243,
"eval_steps_per_second": 0.874,
"step": 100
},
{
"epoch": 1.1744186046511629,
"grad_norm": 0.8056565523147583,
"learning_rate": 0.0001210931609177865,
"loss": 3.8797,
"step": 101
},
{
"epoch": 1.1860465116279069,
"grad_norm": 1.6551860570907593,
"learning_rate": 0.00011824441916558842,
"loss": 4.1864,
"step": 102
},
{
"epoch": 1.197674418604651,
"grad_norm": 0.8464885354042053,
"learning_rate": 0.00011540761938863397,
"loss": 4.1092,
"step": 103
},
{
"epoch": 1.2093023255813953,
"grad_norm": 0.9470563530921936,
"learning_rate": 0.00011258382839130281,
"loss": 4.2251,
"step": 104
},
{
"epoch": 1.2209302325581395,
"grad_norm": 1.0200443267822266,
"learning_rate": 0.00010977410808590436,
"loss": 4.2479,
"step": 105
},
{
"epoch": 1.2325581395348837,
"grad_norm": 1.1114517450332642,
"learning_rate": 0.0001069795150933365,
"loss": 4.3392,
"step": 106
},
{
"epoch": 1.244186046511628,
"grad_norm": 2.389307975769043,
"learning_rate": 0.00010420110034573304,
"loss": 4.9816,
"step": 107
},
{
"epoch": 1.255813953488372,
"grad_norm": 1.2697150707244873,
"learning_rate": 0.00010143990869125184,
"loss": 2.6297,
"step": 108
},
{
"epoch": 1.2674418604651163,
"grad_norm": 0.9846199154853821,
"learning_rate": 9.869697850114969e-05,
"loss": 2.7252,
"step": 109
},
{
"epoch": 1.2790697674418605,
"grad_norm": 1.0952304601669312,
"learning_rate": 9.597334127929346e-05,
"loss": 2.7719,
"step": 110
},
{
"epoch": 1.2906976744186047,
"grad_norm": 0.9392086267471313,
"learning_rate": 9.327002127425363e-05,
"loss": 3.5918,
"step": 111
},
{
"epoch": 1.302325581395349,
"grad_norm": 0.9461237192153931,
"learning_rate": 9.058803509412646e-05,
"loss": 3.874,
"step": 112
},
{
"epoch": 1.3139534883720931,
"grad_norm": 0.9763650894165039,
"learning_rate": 8.792839132422913e-05,
"loss": 4.1091,
"step": 113
},
{
"epoch": 1.3255813953488373,
"grad_norm": 0.9497533440589905,
"learning_rate": 8.529209014781201e-05,
"loss": 3.8025,
"step": 114
},
{
"epoch": 1.3372093023255813,
"grad_norm": 1.0650036334991455,
"learning_rate": 8.268012296993067e-05,
"loss": 4.2753,
"step": 115
},
{
"epoch": 1.3488372093023255,
"grad_norm": 1.0653228759765625,
"learning_rate": 8.009347204461921e-05,
"loss": 4.0739,
"step": 116
},
{
"epoch": 1.3604651162790697,
"grad_norm": 1.1015644073486328,
"learning_rate": 7.753311010550421e-05,
"loss": 4.076,
"step": 117
},
{
"epoch": 1.372093023255814,
"grad_norm": 1.0751556158065796,
"learning_rate": 7.500000000000002e-05,
"loss": 4.0752,
"step": 118
},
{
"epoch": 1.3837209302325582,
"grad_norm": 0.9765347838401794,
"learning_rate": 7.249509432722056e-05,
"loss": 3.8782,
"step": 119
},
{
"epoch": 1.3953488372093024,
"grad_norm": 1.0646424293518066,
"learning_rate": 7.001933507974633e-05,
"loss": 3.7195,
"step": 120
},
{
"epoch": 1.4069767441860466,
"grad_norm": 0.9464161992073059,
"learning_rate": 6.75736532893791e-05,
"loss": 3.7267,
"step": 121
},
{
"epoch": 1.4186046511627908,
"grad_norm": 1.1027458906173706,
"learning_rate": 6.515896867701923e-05,
"loss": 4.2282,
"step": 122
},
{
"epoch": 1.4302325581395348,
"grad_norm": 0.9523140788078308,
"learning_rate": 6.277618930679598e-05,
"loss": 4.1755,
"step": 123
},
{
"epoch": 1.441860465116279,
"grad_norm": 0.9763263463973999,
"learning_rate": 6.04262112445821e-05,
"loss": 4.1322,
"step": 124
},
{
"epoch": 1.4534883720930232,
"grad_norm": 0.9657738208770752,
"learning_rate": 5.8109918221019566e-05,
"loss": 4.2455,
"step": 125
},
{
"epoch": 1.4534883720930232,
"eval_loss": 0.9675049781799316,
"eval_runtime": 8.0071,
"eval_samples_per_second": 6.244,
"eval_steps_per_second": 0.874,
"step": 125
},
{
"epoch": 1.4651162790697674,
"grad_norm": 1.0695807933807373,
"learning_rate": 5.582818129918524e-05,
"loss": 4.2064,
"step": 126
},
{
"epoch": 1.4767441860465116,
"grad_norm": 1.2192351818084717,
"learning_rate": 5.358185854701909e-05,
"loss": 4.4307,
"step": 127
},
{
"epoch": 1.4883720930232558,
"grad_norm": 2.163886308670044,
"learning_rate": 5.137179471464047e-05,
"loss": 4.5119,
"step": 128
},
{
"epoch": 1.5,
"grad_norm": 1.2242028713226318,
"learning_rate": 4.9198820916671634e-05,
"loss": 2.6797,
"step": 129
},
{
"epoch": 1.5116279069767442,
"grad_norm": 1.0933971405029297,
"learning_rate": 4.706375431968997e-05,
"loss": 2.4876,
"step": 130
},
{
"epoch": 1.5232558139534884,
"grad_norm": 1.007104754447937,
"learning_rate": 4.4967397834924724e-05,
"loss": 3.1256,
"step": 131
},
{
"epoch": 1.5348837209302326,
"grad_norm": 0.9222277402877808,
"learning_rate": 4.2910539816315164e-05,
"loss": 3.4608,
"step": 132
},
{
"epoch": 1.5465116279069768,
"grad_norm": 1.0027642250061035,
"learning_rate": 4.089395376404269e-05,
"loss": 3.692,
"step": 133
},
{
"epoch": 1.558139534883721,
"grad_norm": 1.139125943183899,
"learning_rate": 3.891839803364934e-05,
"loss": 4.2357,
"step": 134
},
{
"epoch": 1.5697674418604652,
"grad_norm": 1.028533697128296,
"learning_rate": 3.698461555085089e-05,
"loss": 4.1098,
"step": 135
},
{
"epoch": 1.5813953488372094,
"grad_norm": 1.124190092086792,
"learning_rate": 3.509333353215331e-05,
"loss": 4.0042,
"step": 136
},
{
"epoch": 1.5930232558139537,
"grad_norm": 1.042450189590454,
"learning_rate": 3.324526321137599e-05,
"loss": 3.7289,
"step": 137
},
{
"epoch": 1.6046511627906976,
"grad_norm": 1.0798102617263794,
"learning_rate": 3.144109957218612e-05,
"loss": 3.8191,
"step": 138
},
{
"epoch": 1.6162790697674418,
"grad_norm": 1.1347808837890625,
"learning_rate": 2.9681521086743422e-05,
"loss": 3.9489,
"step": 139
},
{
"epoch": 1.627906976744186,
"grad_norm": 1.0719928741455078,
"learning_rate": 2.7967189460554872e-05,
"loss": 4.3926,
"step": 140
},
{
"epoch": 1.6395348837209303,
"grad_norm": 0.9701864719390869,
"learning_rate": 2.629874938363398e-05,
"loss": 3.552,
"step": 141
},
{
"epoch": 1.6511627906976745,
"grad_norm": 0.8670133948326111,
"learning_rate": 2.4676828288059558e-05,
"loss": 3.6456,
"step": 142
},
{
"epoch": 1.6627906976744184,
"grad_norm": 0.9276437759399414,
"learning_rate": 2.3102036112023836e-05,
"loss": 3.3793,
"step": 143
},
{
"epoch": 1.6744186046511627,
"grad_norm": 1.150744915008545,
"learning_rate": 2.1574965070460043e-05,
"loss": 3.6836,
"step": 144
},
{
"epoch": 1.6860465116279069,
"grad_norm": 0.9892753958702087,
"learning_rate": 2.009618943233419e-05,
"loss": 4.1351,
"step": 145
},
{
"epoch": 1.697674418604651,
"grad_norm": 0.9877732396125793,
"learning_rate": 1.8666265304686383e-05,
"loss": 4.105,
"step": 146
},
{
"epoch": 1.7093023255813953,
"grad_norm": 1.0883934497833252,
"learning_rate": 1.7285730423501327e-05,
"loss": 4.1906,
"step": 147
},
{
"epoch": 1.7209302325581395,
"grad_norm": 1.1771783828735352,
"learning_rate": 1.5955103951488173e-05,
"loss": 4.368,
"step": 148
},
{
"epoch": 1.7325581395348837,
"grad_norm": 2.0767550468444824,
"learning_rate": 1.467488628284434e-05,
"loss": 5.0929,
"step": 149
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.9449374079704285,
"learning_rate": 1.3445558855078014e-05,
"loss": 2.7604,
"step": 150
},
{
"epoch": 1.744186046511628,
"eval_loss": 0.970888078212738,
"eval_runtime": 8.0092,
"eval_samples_per_second": 6.243,
"eval_steps_per_second": 0.874,
"step": 150
},
{
"epoch": 1.755813953488372,
"grad_norm": 1.1184589862823486,
"learning_rate": 1.2267583967958916e-05,
"loss": 2.6164,
"step": 151
},
{
"epoch": 1.7674418604651163,
"grad_norm": 0.9397222995758057,
"learning_rate": 1.1141404609666449e-05,
"loss": 3.2114,
"step": 152
},
{
"epoch": 1.7790697674418605,
"grad_norm": 0.8842024803161621,
"learning_rate": 1.0067444290199917e-05,
"loss": 3.6752,
"step": 153
},
{
"epoch": 1.7906976744186047,
"grad_norm": 0.9894306659698486,
"learning_rate": 9.046106882113751e-06,
"loss": 3.8772,
"step": 154
},
{
"epoch": 1.802325581395349,
"grad_norm": 0.9347633123397827,
"learning_rate": 8.07777646863746e-06,
"loss": 3.9717,
"step": 155
},
{
"epoch": 1.8139534883720931,
"grad_norm": 1.0060522556304932,
"learning_rate": 7.1628171992377025e-06,
"loss": 4.3449,
"step": 156
},
{
"epoch": 1.8255813953488373,
"grad_norm": 0.9990627765655518,
"learning_rate": 6.301573152676664e-06,
"loss": 4.1012,
"step": 157
},
{
"epoch": 1.8372093023255816,
"grad_norm": 1.0378891229629517,
"learning_rate": 5.494368207617949e-06,
"loss": 4.0339,
"step": 158
},
{
"epoch": 1.8488372093023255,
"grad_norm": 1.0531872510910034,
"learning_rate": 4.741505920829131e-06,
"loss": 4.4799,
"step": 159
},
{
"epoch": 1.8604651162790697,
"grad_norm": 0.9710641503334045,
"learning_rate": 4.043269413026429e-06,
"loss": 3.6334,
"step": 160
},
{
"epoch": 1.872093023255814,
"grad_norm": 0.9414262771606445,
"learning_rate": 3.3999212624046646e-06,
"loss": 3.8207,
"step": 161
},
{
"epoch": 1.8837209302325582,
"grad_norm": 0.8727117776870728,
"learning_rate": 2.811703405892296e-06,
"loss": 3.6237,
"step": 162
},
{
"epoch": 1.8953488372093024,
"grad_norm": 0.8367646932601929,
"learning_rate": 2.2788370481687965e-06,
"loss": 3.5522,
"step": 163
},
{
"epoch": 1.9069767441860463,
"grad_norm": 0.9638428688049316,
"learning_rate": 1.801522578478648e-06,
"loss": 3.7745,
"step": 164
},
{
"epoch": 1.9186046511627906,
"grad_norm": 1.643754005432129,
"learning_rate": 1.3799394952732024e-06,
"loss": 4.3574,
"step": 165
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.9748329520225525,
"learning_rate": 1.0142463387085464e-06,
"loss": 4.2242,
"step": 166
},
{
"epoch": 1.941860465116279,
"grad_norm": 0.938869297504425,
"learning_rate": 7.045806310251257e-07,
"loss": 4.0438,
"step": 167
},
{
"epoch": 1.9534883720930232,
"grad_norm": 1.0137250423431396,
"learning_rate": 4.510588248311964e-07,
"loss": 4.342,
"step": 168
},
{
"epoch": 1.9651162790697674,
"grad_norm": 1.2118498086929321,
"learning_rate": 2.5377625930977363e-07,
"loss": 4.5787,
"step": 169
},
{
"epoch": 1.9767441860465116,
"grad_norm": 2.177273750305176,
"learning_rate": 1.1280712436549378e-07,
"loss": 4.6907,
"step": 170
},
{
"epoch": 1.9883720930232558,
"grad_norm": 0.9952888488769531,
"learning_rate": 2.8204432724798775e-08,
"loss": 3.898,
"step": 171
},
{
"epoch": 2.0,
"grad_norm": 2.449486017227173,
"learning_rate": 0.0,
"loss": 5.0474,
"step": 172
}
],
"logging_steps": 1,
"max_steps": 172,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7.485613329088512e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}