{
"best_metric": 11.921277046203613,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 2.008298755186722,
"eval_steps": 25,
"global_step": 121,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016597510373443983,
"grad_norm": 0.008619977161288261,
"learning_rate": 2.9999999999999997e-05,
"loss": 11.9327,
"step": 1
},
{
"epoch": 0.016597510373443983,
"eval_loss": 11.933255195617676,
"eval_runtime": 0.0685,
"eval_samples_per_second": 729.868,
"eval_steps_per_second": 29.195,
"step": 1
},
{
"epoch": 0.03319502074688797,
"grad_norm": 0.007508904207497835,
"learning_rate": 5.9999999999999995e-05,
"loss": 11.9321,
"step": 2
},
{
"epoch": 0.04979253112033195,
"grad_norm": 0.007870020344853401,
"learning_rate": 8.999999999999999e-05,
"loss": 11.9325,
"step": 3
},
{
"epoch": 0.06639004149377593,
"grad_norm": 0.008674144744873047,
"learning_rate": 0.00011999999999999999,
"loss": 11.9323,
"step": 4
},
{
"epoch": 0.08298755186721991,
"grad_norm": 0.00782778114080429,
"learning_rate": 0.00015,
"loss": 11.9326,
"step": 5
},
{
"epoch": 0.0995850622406639,
"grad_norm": 0.007341116201132536,
"learning_rate": 0.00017999999999999998,
"loss": 11.9326,
"step": 6
},
{
"epoch": 0.11618257261410789,
"grad_norm": 0.00769679993391037,
"learning_rate": 0.00020999999999999998,
"loss": 11.9325,
"step": 7
},
{
"epoch": 0.13278008298755187,
"grad_norm": 0.008552188985049725,
"learning_rate": 0.00023999999999999998,
"loss": 11.9325,
"step": 8
},
{
"epoch": 0.14937759336099585,
"grad_norm": 0.0071903131902217865,
"learning_rate": 0.00027,
"loss": 11.9329,
"step": 9
},
{
"epoch": 0.16597510373443983,
"grad_norm": 0.006793375127017498,
"learning_rate": 0.0003,
"loss": 11.9332,
"step": 10
},
{
"epoch": 0.1825726141078838,
"grad_norm": 0.006766727194190025,
"learning_rate": 0.00029993992606774825,
"loss": 11.9326,
"step": 11
},
{
"epoch": 0.1991701244813278,
"grad_norm": 0.006954336538910866,
"learning_rate": 0.00029975975238935744,
"loss": 11.9319,
"step": 12
},
{
"epoch": 0.2157676348547718,
"grad_norm": 0.008698610588908195,
"learning_rate": 0.00029945962328137895,
"loss": 11.9319,
"step": 13
},
{
"epoch": 0.23236514522821577,
"grad_norm": 0.008457913063466549,
"learning_rate": 0.0002990397791429554,
"loss": 11.9322,
"step": 14
},
{
"epoch": 0.24896265560165975,
"grad_norm": 0.008619188331067562,
"learning_rate": 0.0002985005562632645,
"loss": 11.9318,
"step": 15
},
{
"epoch": 0.26556016597510373,
"grad_norm": 0.009035887196660042,
"learning_rate": 0.00029784238655215626,
"loss": 11.9322,
"step": 16
},
{
"epoch": 0.2821576763485477,
"grad_norm": 0.010372682474553585,
"learning_rate": 0.000297065797194199,
"loss": 11.9315,
"step": 17
},
{
"epoch": 0.2987551867219917,
"grad_norm": 0.009721986949443817,
"learning_rate": 0.00029617141022641243,
"loss": 11.932,
"step": 18
},
{
"epoch": 0.3153526970954357,
"grad_norm": 0.011710022576153278,
"learning_rate": 0.00029515994204002484,
"loss": 11.9316,
"step": 19
},
{
"epoch": 0.33195020746887965,
"grad_norm": 0.011379418894648552,
"learning_rate": 0.00029403220280665337,
"loss": 11.932,
"step": 20
},
{
"epoch": 0.34854771784232363,
"grad_norm": 0.012264162302017212,
"learning_rate": 0.0002927890958293689,
"loss": 11.9319,
"step": 21
},
{
"epoch": 0.3651452282157676,
"grad_norm": 0.012727621011435986,
"learning_rate": 0.0002914316168191626,
"loss": 11.9315,
"step": 22
},
{
"epoch": 0.3817427385892116,
"grad_norm": 0.013410990126430988,
"learning_rate": 0.0002899608530973956,
"loss": 11.9308,
"step": 23
},
{
"epoch": 0.3983402489626556,
"grad_norm": 0.014416859485208988,
"learning_rate": 0.00028837798272487026,
"loss": 11.932,
"step": 24
},
{
"epoch": 0.4149377593360996,
"grad_norm": 0.01126072183251381,
"learning_rate": 0.00028668427355822034,
"loss": 11.9328,
"step": 25
},
{
"epoch": 0.4149377593360996,
"eval_loss": 11.931723594665527,
"eval_runtime": 0.0673,
"eval_samples_per_second": 743.046,
"eval_steps_per_second": 29.722,
"step": 25
},
{
"epoch": 0.4315352697095436,
"grad_norm": 0.019695591181516647,
"learning_rate": 0.0002848810822343755,
"loss": 11.931,
"step": 26
},
{
"epoch": 0.44813278008298757,
"grad_norm": 0.01740470714867115,
"learning_rate": 0.00028296985308391476,
"loss": 11.9307,
"step": 27
},
{
"epoch": 0.46473029045643155,
"grad_norm": 0.021143503487110138,
"learning_rate": 0.0002809521169741782,
"loss": 11.9306,
"step": 28
},
{
"epoch": 0.48132780082987553,
"grad_norm": 0.023337863385677338,
"learning_rate": 0.0002788294900830639,
"loss": 11.9307,
"step": 29
},
{
"epoch": 0.4979253112033195,
"grad_norm": 0.026668556034564972,
"learning_rate": 0.00027660367260449255,
"loss": 11.9305,
"step": 30
},
{
"epoch": 0.5145228215767634,
"grad_norm": 0.02838689088821411,
"learning_rate": 0.0002742764473865763,
"loss": 11.9296,
"step": 31
},
{
"epoch": 0.5311203319502075,
"grad_norm": 0.027914777398109436,
"learning_rate": 0.00027184967850358286,
"loss": 11.9302,
"step": 32
},
{
"epoch": 0.5477178423236515,
"grad_norm": 0.028979269787669182,
"learning_rate": 0.0002693253097628385,
"loss": 11.9297,
"step": 33
},
{
"epoch": 0.5643153526970954,
"grad_norm": 0.03429955616593361,
"learning_rate": 0.00026670536314776593,
"loss": 11.9294,
"step": 34
},
{
"epoch": 0.5809128630705395,
"grad_norm": 0.026081860065460205,
"learning_rate": 0.00026399193719830457,
"loss": 11.9296,
"step": 35
},
{
"epoch": 0.5975103734439834,
"grad_norm": 0.026742985472083092,
"learning_rate": 0.00026118720533001,
"loss": 11.9289,
"step": 36
},
{
"epoch": 0.6141078838174274,
"grad_norm": 0.03144499287009239,
"learning_rate": 0.0002582934140931786,
"loss": 11.929,
"step": 37
},
{
"epoch": 0.6307053941908713,
"grad_norm": 0.04750339314341545,
"learning_rate": 0.0002553128813733934,
"loss": 11.9278,
"step": 38
},
{
"epoch": 0.6473029045643154,
"grad_norm": 0.044398147612810135,
"learning_rate": 0.0002522479945349299,
"loss": 11.9275,
"step": 39
},
{
"epoch": 0.6639004149377593,
"grad_norm": 0.04385744780302048,
"learning_rate": 0.00024910120850851216,
"loss": 11.9278,
"step": 40
},
{
"epoch": 0.6804979253112033,
"grad_norm": 0.05074305832386017,
"learning_rate": 0.00024587504382494774,
"loss": 11.9262,
"step": 41
},
{
"epoch": 0.6970954356846473,
"grad_norm": 0.047362346202135086,
"learning_rate": 0.00024257208459621828,
"loss": 11.9268,
"step": 42
},
{
"epoch": 0.7136929460580913,
"grad_norm": 0.04583980143070221,
"learning_rate": 0.00023919497644564298,
"loss": 11.9258,
"step": 43
},
{
"epoch": 0.7302904564315352,
"grad_norm": 0.049639176577329636,
"learning_rate": 0.0002357464243887718,
"loss": 11.9255,
"step": 44
},
{
"epoch": 0.7468879668049793,
"grad_norm": 0.04268966615200043,
"learning_rate": 0.00023222919066670647,
"loss": 11.9256,
"step": 45
},
{
"epoch": 0.7634854771784232,
"grad_norm": 0.04423046112060547,
"learning_rate": 0.00022864609253358474,
"loss": 11.9251,
"step": 46
},
{
"epoch": 0.7800829875518672,
"grad_norm": 0.03876250982284546,
"learning_rate": 0.000225,
"loss": 11.9259,
"step": 47
},
{
"epoch": 0.7966804979253111,
"grad_norm": 0.031338173896074295,
"learning_rate": 0.00022129383353416347,
"loss": 11.9259,
"step": 48
},
{
"epoch": 0.8132780082987552,
"grad_norm": 0.031210312619805336,
"learning_rate": 0.00021753056172265096,
"loss": 11.9261,
"step": 49
},
{
"epoch": 0.8298755186721992,
"grad_norm": 0.03140793740749359,
"learning_rate": 0.00021371319889260717,
"loss": 11.926,
"step": 50
},
{
"epoch": 0.8298755186721992,
"eval_loss": 11.924413681030273,
"eval_runtime": 0.0674,
"eval_samples_per_second": 742.066,
"eval_steps_per_second": 29.683,
"step": 50
},
{
"epoch": 0.8464730290456431,
"grad_norm": 0.043788015842437744,
"learning_rate": 0.00020984480269731242,
"loss": 11.9232,
"step": 51
},
{
"epoch": 0.8630705394190872,
"grad_norm": 0.03660878911614418,
"learning_rate": 0.0002059284716670463,
"loss": 11.9244,
"step": 52
},
{
"epoch": 0.8796680497925311,
"grad_norm": 0.03714355081319809,
"learning_rate": 0.00020196734272720854,
"loss": 11.923,
"step": 53
},
{
"epoch": 0.8962655601659751,
"grad_norm": 0.03057694435119629,
"learning_rate": 0.00019796458868568678,
"loss": 11.922,
"step": 54
},
{
"epoch": 0.9128630705394191,
"grad_norm": 0.02990574575960636,
"learning_rate": 0.00019392341569148252,
"loss": 11.9218,
"step": 55
},
{
"epoch": 0.9294605809128631,
"grad_norm": 0.025974757969379425,
"learning_rate": 0.00018984706066663143,
"loss": 11.9227,
"step": 56
},
{
"epoch": 0.946058091286307,
"grad_norm": 0.02820601500570774,
"learning_rate": 0.00018573878871347473,
"loss": 11.9225,
"step": 57
},
{
"epoch": 0.9626556016597511,
"grad_norm": 0.028666473925113678,
"learning_rate": 0.00018160189049935892,
"loss": 11.9233,
"step": 58
},
{
"epoch": 0.979253112033195,
"grad_norm": 0.02322000451385975,
"learning_rate": 0.00017743967962085798,
"loss": 11.923,
"step": 59
},
{
"epoch": 0.995850622406639,
"grad_norm": 0.028861528262495995,
"learning_rate": 0.00017325548994962965,
"loss": 11.924,
"step": 60
},
{
"epoch": 1.012448132780083,
"grad_norm": 0.04327556490898132,
"learning_rate": 0.0001690526729620318,
"loss": 21.3788,
"step": 61
},
{
"epoch": 1.0290456431535269,
"grad_norm": 0.02456553652882576,
"learning_rate": 0.00016483459505463747,
"loss": 11.6539,
"step": 62
},
{
"epoch": 1.045643153526971,
"grad_norm": 0.02364276722073555,
"learning_rate": 0.00016060463484779918,
"loss": 11.83,
"step": 63
},
{
"epoch": 1.062240663900415,
"grad_norm": 0.02070525474846363,
"learning_rate": 0.00015636618047942222,
"loss": 11.9144,
"step": 64
},
{
"epoch": 1.0788381742738589,
"grad_norm": 0.0210164375603199,
"learning_rate": 0.00015212262689111433,
"loss": 11.9249,
"step": 65
},
{
"epoch": 1.095435684647303,
"grad_norm": 0.02003309689462185,
"learning_rate": 0.0001478773731088857,
"loss": 11.9061,
"step": 66
},
{
"epoch": 1.112033195020747,
"grad_norm": 0.01854197308421135,
"learning_rate": 0.00014363381952057778,
"loss": 11.938,
"step": 67
},
{
"epoch": 1.1286307053941909,
"grad_norm": 0.022063063457608223,
"learning_rate": 0.0001393953651522008,
"loss": 11.9017,
"step": 68
},
{
"epoch": 1.1452282157676348,
"grad_norm": 0.017270218580961227,
"learning_rate": 0.00013516540494536253,
"loss": 11.978,
"step": 69
},
{
"epoch": 1.161825726141079,
"grad_norm": 0.01767859421670437,
"learning_rate": 0.00013094732703796818,
"loss": 11.9076,
"step": 70
},
{
"epoch": 1.1784232365145229,
"grad_norm": 0.01846623234450817,
"learning_rate": 0.0001267445100503703,
"loss": 11.9928,
"step": 71
},
{
"epoch": 1.1950207468879668,
"grad_norm": 0.02277253195643425,
"learning_rate": 0.000122560320379142,
"loss": 11.9957,
"step": 72
},
{
"epoch": 1.2116182572614107,
"grad_norm": 0.017460819333791733,
"learning_rate": 0.00011839810950064109,
"loss": 9.1042,
"step": 73
},
{
"epoch": 1.2282157676348548,
"grad_norm": 0.027226807549595833,
"learning_rate": 0.00011426121128652526,
"loss": 14.9039,
"step": 74
},
{
"epoch": 1.2448132780082988,
"grad_norm": 0.02005860209465027,
"learning_rate": 0.00011015293933336857,
"loss": 11.642,
"step": 75
},
{
"epoch": 1.2448132780082988,
"eval_loss": 11.92165756225586,
"eval_runtime": 0.0678,
"eval_samples_per_second": 737.673,
"eval_steps_per_second": 29.507,
"step": 75
},
{
"epoch": 1.2614107883817427,
"grad_norm": 0.01866528019309044,
"learning_rate": 0.00010607658430851744,
"loss": 11.8735,
"step": 76
},
{
"epoch": 1.2780082987551866,
"grad_norm": 0.014573541469871998,
"learning_rate": 0.0001020354113143132,
"loss": 11.8891,
"step": 77
},
{
"epoch": 1.2946058091286308,
"grad_norm": 0.014832578599452972,
"learning_rate": 9.803265727279149e-05,
"loss": 11.9601,
"step": 78
},
{
"epoch": 1.3112033195020747,
"grad_norm": 0.010414165444672108,
"learning_rate": 9.407152833295372e-05,
"loss": 11.9166,
"step": 79
},
{
"epoch": 1.3278008298755186,
"grad_norm": 0.014295806176960468,
"learning_rate": 9.015519730268754e-05,
"loss": 11.883,
"step": 80
},
{
"epoch": 1.3443983402489628,
"grad_norm": 0.011363668367266655,
"learning_rate": 8.62868011073928e-05,
"loss": 11.9823,
"step": 81
},
{
"epoch": 1.3609958506224067,
"grad_norm": 0.012058326043188572,
"learning_rate": 8.246943827734897e-05,
"loss": 11.9326,
"step": 82
},
{
"epoch": 1.3775933609958506,
"grad_norm": 0.013995883986353874,
"learning_rate": 7.870616646583648e-05,
"loss": 11.9321,
"step": 83
},
{
"epoch": 1.3941908713692945,
"grad_norm": 0.017102347686886787,
"learning_rate": 7.500000000000002e-05,
"loss": 11.9893,
"step": 84
},
{
"epoch": 1.4107883817427385,
"grad_norm": 0.019754430279135704,
"learning_rate": 7.135390746641526e-05,
"loss": 12.3581,
"step": 85
},
{
"epoch": 1.4273858921161826,
"grad_norm": 0.014634879305958748,
"learning_rate": 6.777080933329354e-05,
"loss": 11.5957,
"step": 86
},
{
"epoch": 1.4439834024896265,
"grad_norm": 0.01592666283249855,
"learning_rate": 6.425357561122819e-05,
"loss": 11.8704,
"step": 87
},
{
"epoch": 1.4605809128630705,
"grad_norm": 0.01606922596693039,
"learning_rate": 6.080502355435701e-05,
"loss": 11.7446,
"step": 88
},
{
"epoch": 1.4771784232365146,
"grad_norm": 0.017506958916783333,
"learning_rate": 5.742791540378175e-05,
"loss": 11.9198,
"step": 89
},
{
"epoch": 1.4937759336099585,
"grad_norm": 0.014228662475943565,
"learning_rate": 5.4124956175052295e-05,
"loss": 11.9523,
"step": 90
},
{
"epoch": 1.5103734439834025,
"grad_norm": 0.01120895054191351,
"learning_rate": 5.089879149148781e-05,
"loss": 11.8631,
"step": 91
},
{
"epoch": 1.5269709543568464,
"grad_norm": 0.01069362461566925,
"learning_rate": 4.7752005465070094e-05,
"loss": 11.9319,
"step": 92
},
{
"epoch": 1.5435684647302903,
"grad_norm": 0.011584184132516384,
"learning_rate": 4.468711862660662e-05,
"loss": 11.9276,
"step": 93
},
{
"epoch": 1.5601659751037344,
"grad_norm": 0.012647481635212898,
"learning_rate": 4.1706585906821334e-05,
"loss": 11.9491,
"step": 94
},
{
"epoch": 1.5767634854771784,
"grad_norm": 0.012272904627025127,
"learning_rate": 3.881279466999001e-05,
"loss": 11.9568,
"step": 95
},
{
"epoch": 1.5933609958506225,
"grad_norm": 0.016778666526079178,
"learning_rate": 3.600806280169541e-05,
"loss": 11.9439,
"step": 96
},
{
"epoch": 1.6099585062240664,
"grad_norm": 0.01830066554248333,
"learning_rate": 3.3294636852234105e-05,
"loss": 12.0261,
"step": 97
},
{
"epoch": 1.6265560165975104,
"grad_norm": 0.0076575614511966705,
"learning_rate": 3.067469023716154e-05,
"loss": 9.0989,
"step": 98
},
{
"epoch": 1.6431535269709543,
"grad_norm": 0.020825980231165886,
"learning_rate": 2.8150321496417135e-05,
"loss": 14.9122,
"step": 99
},
{
"epoch": 1.6597510373443982,
"grad_norm": 0.013375605456531048,
"learning_rate": 2.5723552613423687e-05,
"loss": 11.6177,
"step": 100
},
{
"epoch": 1.6597510373443982,
"eval_loss": 11.921277046203613,
"eval_runtime": 0.0676,
"eval_samples_per_second": 739.82,
"eval_steps_per_second": 29.593,
"step": 100
},
{
"epoch": 1.6763485477178424,
"grad_norm": 0.01629016175866127,
"learning_rate": 2.3396327395507448e-05,
"loss": 11.9071,
"step": 101
},
{
"epoch": 1.6929460580912863,
"grad_norm": 0.015467053279280663,
"learning_rate": 2.117050991693609e-05,
"loss": 11.8865,
"step": 102
},
{
"epoch": 1.7095435684647304,
"grad_norm": 0.01504999864846468,
"learning_rate": 1.9047883025821774e-05,
"loss": 11.9293,
"step": 103
},
{
"epoch": 1.7261410788381744,
"grad_norm": 0.01039854995906353,
"learning_rate": 1.7030146916085185e-05,
"loss": 11.8954,
"step": 104
},
{
"epoch": 1.7427385892116183,
"grad_norm": 0.011301021091639996,
"learning_rate": 1.5118917765624467e-05,
"loss": 11.9575,
"step": 105
},
{
"epoch": 1.7593360995850622,
"grad_norm": 0.009601314552128315,
"learning_rate": 1.3315726441779629e-05,
"loss": 11.9122,
"step": 106
},
{
"epoch": 1.7759336099585061,
"grad_norm": 0.011440463364124298,
"learning_rate": 1.1622017275129708e-05,
"loss": 11.9329,
"step": 107
},
{
"epoch": 1.79253112033195,
"grad_norm": 0.013274903409183025,
"learning_rate": 1.00391469026044e-05,
"loss": 11.9641,
"step": 108
},
{
"epoch": 1.8091286307053942,
"grad_norm": 0.013196711428463459,
"learning_rate": 8.568383180837368e-06,
"loss": 12.013,
"step": 109
},
{
"epoch": 1.8257261410788381,
"grad_norm": 0.022849783301353455,
"learning_rate": 7.210904170631021e-06,
"loss": 12.3974,
"step": 110
},
{
"epoch": 1.8423236514522823,
"grad_norm": 0.015286913141608238,
"learning_rate": 5.967797193346574e-06,
"loss": 11.7548,
"step": 111
},
{
"epoch": 1.8589211618257262,
"grad_norm": 0.012541989795863628,
"learning_rate": 4.840057959975169e-06,
"loss": 11.5806,
"step": 112
},
{
"epoch": 1.8755186721991701,
"grad_norm": 0.013375689275562763,
"learning_rate": 3.828589773587515e-06,
"loss": 11.8641,
"step": 113
},
{
"epoch": 1.892116182572614,
"grad_norm": 0.01497753243893385,
"learning_rate": 2.934202805800989e-06,
"loss": 11.8974,
"step": 114
},
{
"epoch": 1.908713692946058,
"grad_norm": 0.010004348121583462,
"learning_rate": 2.1576134478437313e-06,
"loss": 11.8974,
"step": 115
},
{
"epoch": 1.9253112033195021,
"grad_norm": 0.009814193472266197,
"learning_rate": 1.4994437367354339e-06,
"loss": 11.9167,
"step": 116
},
{
"epoch": 1.941908713692946,
"grad_norm": 0.008285530842840672,
"learning_rate": 9.602208570445636e-07,
"loss": 12.0253,
"step": 117
},
{
"epoch": 1.9585062240663902,
"grad_norm": 0.01668418012559414,
"learning_rate": 5.403767186210218e-07,
"loss": 11.8426,
"step": 118
},
{
"epoch": 1.9751037344398341,
"grad_norm": 0.014265616424381733,
"learning_rate": 2.402476106425466e-07,
"loss": 12.0079,
"step": 119
},
{
"epoch": 1.991701244813278,
"grad_norm": 0.016387728974223137,
"learning_rate": 6.007393225176404e-08,
"loss": 12.1605,
"step": 120
},
{
"epoch": 2.008298755186722,
"grad_norm": 0.04243115335702896,
"learning_rate": 0.0,
"loss": 20.6424,
"step": 121
}
],
"logging_steps": 1,
"max_steps": 121,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 774412984320.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}