{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.009663691408816602,
"eval_steps": 500,
"global_step": 239,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.043385526701507e-05,
"grad_norm": 45.212158203125,
"learning_rate": 1.0000000000000002e-06,
"loss": 8.4776,
"step": 1
},
{
"epoch": 8.086771053403014e-05,
"grad_norm": 42.83491897583008,
"learning_rate": 2.0000000000000003e-06,
"loss": 7.4022,
"step": 2
},
{
"epoch": 0.00012130156580104521,
"grad_norm": 47.69302749633789,
"learning_rate": 3e-06,
"loss": 8.5953,
"step": 3
},
{
"epoch": 0.00016173542106806028,
"grad_norm": 47.8973503112793,
"learning_rate": 4.000000000000001e-06,
"loss": 7.7783,
"step": 4
},
{
"epoch": 0.00020216927633507535,
"grad_norm": 43.4521598815918,
"learning_rate": 5e-06,
"loss": 7.9174,
"step": 5
},
{
"epoch": 0.00024260313160209043,
"grad_norm": 46.271541595458984,
"learning_rate": 6e-06,
"loss": 8.6758,
"step": 6
},
{
"epoch": 0.0002830369868691055,
"grad_norm": 45.16845703125,
"learning_rate": 7.000000000000001e-06,
"loss": 7.8099,
"step": 7
},
{
"epoch": 0.00032347084213612057,
"grad_norm": 43.50260543823242,
"learning_rate": 8.000000000000001e-06,
"loss": 7.1801,
"step": 8
},
{
"epoch": 0.00036390469740313564,
"grad_norm": 41.92473602294922,
"learning_rate": 9e-06,
"loss": 7.1574,
"step": 9
},
{
"epoch": 0.0004043385526701507,
"grad_norm": 43.03952407836914,
"learning_rate": 1e-05,
"loss": 7.1354,
"step": 10
},
{
"epoch": 0.0004447724079371658,
"grad_norm": 41.086021423339844,
"learning_rate": 1.1000000000000001e-05,
"loss": 7.3757,
"step": 11
},
{
"epoch": 0.00048520626320418085,
"grad_norm": 43.03165817260742,
"learning_rate": 1.2e-05,
"loss": 7.2673,
"step": 12
},
{
"epoch": 0.0005256401184711959,
"grad_norm": 41.32279586791992,
"learning_rate": 1.3000000000000001e-05,
"loss": 7.0322,
"step": 13
},
{
"epoch": 0.000566073973738211,
"grad_norm": 40.678192138671875,
"learning_rate": 1.4000000000000001e-05,
"loss": 6.9458,
"step": 14
},
{
"epoch": 0.0006065078290052261,
"grad_norm": 42.551509857177734,
"learning_rate": 1.5e-05,
"loss": 7.4505,
"step": 15
},
{
"epoch": 0.0006469416842722411,
"grad_norm": 39.79718017578125,
"learning_rate": 1.6000000000000003e-05,
"loss": 6.314,
"step": 16
},
{
"epoch": 0.0006873755395392562,
"grad_norm": 39.78065490722656,
"learning_rate": 1.7000000000000003e-05,
"loss": 6.0012,
"step": 17
},
{
"epoch": 0.0007278093948062713,
"grad_norm": 34.97587966918945,
"learning_rate": 1.8e-05,
"loss": 5.7898,
"step": 18
},
{
"epoch": 0.0007682432500732863,
"grad_norm": 34.85056686401367,
"learning_rate": 1.9e-05,
"loss": 5.3237,
"step": 19
},
{
"epoch": 0.0008086771053403014,
"grad_norm": 31.200321197509766,
"learning_rate": 2e-05,
"loss": 4.8291,
"step": 20
},
{
"epoch": 0.0008491109606073165,
"grad_norm": 34.75196075439453,
"learning_rate": 2.1e-05,
"loss": 5.7917,
"step": 21
},
{
"epoch": 0.0008895448158743316,
"grad_norm": 25.683300018310547,
"learning_rate": 2.2000000000000003e-05,
"loss": 3.6869,
"step": 22
},
{
"epoch": 0.0009299786711413466,
"grad_norm": 23.978288650512695,
"learning_rate": 2.3000000000000003e-05,
"loss": 3.6214,
"step": 23
},
{
"epoch": 0.0009704125264083617,
"grad_norm": 24.98045539855957,
"learning_rate": 2.4e-05,
"loss": 3.8051,
"step": 24
},
{
"epoch": 0.0010108463816753768,
"grad_norm": 25.768600463867188,
"learning_rate": 2.5e-05,
"loss": 3.8138,
"step": 25
},
{
"epoch": 0.0010512802369423918,
"grad_norm": 28.268779754638672,
"learning_rate": 2.6000000000000002e-05,
"loss": 3.6824,
"step": 26
},
{
"epoch": 0.001091714092209407,
"grad_norm": 24.55759620666504,
"learning_rate": 2.7000000000000002e-05,
"loss": 2.6137,
"step": 27
},
{
"epoch": 0.001132147947476422,
"grad_norm": 32.37775421142578,
"learning_rate": 2.8000000000000003e-05,
"loss": 3.0377,
"step": 28
},
{
"epoch": 0.001172581802743437,
"grad_norm": 24.953506469726562,
"learning_rate": 2.9e-05,
"loss": 2.3354,
"step": 29
},
{
"epoch": 0.0012130156580104521,
"grad_norm": 22.368303298950195,
"learning_rate": 3e-05,
"loss": 1.6599,
"step": 30
},
{
"epoch": 0.0012534495132774672,
"grad_norm": 26.778047561645508,
"learning_rate": 3.1e-05,
"loss": 1.6387,
"step": 31
},
{
"epoch": 0.0012938833685444823,
"grad_norm": 13.506389617919922,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9049,
"step": 32
},
{
"epoch": 0.0013343172238114973,
"grad_norm": 18.525230407714844,
"learning_rate": 3.3e-05,
"loss": 0.8052,
"step": 33
},
{
"epoch": 0.0013747510790785124,
"grad_norm": 25.12320899963379,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.8044,
"step": 34
},
{
"epoch": 0.0014151849343455275,
"grad_norm": 16.060945510864258,
"learning_rate": 3.5e-05,
"loss": 0.6799,
"step": 35
},
{
"epoch": 0.0014556187896125426,
"grad_norm": 18.008813858032227,
"learning_rate": 3.6e-05,
"loss": 0.4561,
"step": 36
},
{
"epoch": 0.0014960526448795576,
"grad_norm": 21.687620162963867,
"learning_rate": 3.7e-05,
"loss": 0.613,
"step": 37
},
{
"epoch": 0.0015364865001465727,
"grad_norm": 14.03872013092041,
"learning_rate": 3.8e-05,
"loss": 0.5134,
"step": 38
},
{
"epoch": 0.0015769203554135878,
"grad_norm": 8.90583610534668,
"learning_rate": 3.9000000000000006e-05,
"loss": 0.2898,
"step": 39
},
{
"epoch": 0.0016173542106806028,
"grad_norm": 15.97493839263916,
"learning_rate": 4e-05,
"loss": 0.4115,
"step": 40
},
{
"epoch": 0.001657788065947618,
"grad_norm": 6.131041526794434,
"learning_rate": 4.1e-05,
"loss": 0.3075,
"step": 41
},
{
"epoch": 0.001698221921214633,
"grad_norm": 21.00753402709961,
"learning_rate": 4.2e-05,
"loss": 0.5586,
"step": 42
},
{
"epoch": 0.001738655776481648,
"grad_norm": 23.8162899017334,
"learning_rate": 4.3e-05,
"loss": 0.6964,
"step": 43
},
{
"epoch": 0.0017790896317486631,
"grad_norm": 22.47564125061035,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.543,
"step": 44
},
{
"epoch": 0.0018195234870156782,
"grad_norm": 20.058208465576172,
"learning_rate": 4.5e-05,
"loss": 0.5529,
"step": 45
},
{
"epoch": 0.0018599573422826933,
"grad_norm": 9.034168243408203,
"learning_rate": 4.600000000000001e-05,
"loss": 0.4264,
"step": 46
},
{
"epoch": 0.0019003911975497083,
"grad_norm": 13.754554748535156,
"learning_rate": 4.7e-05,
"loss": 0.4332,
"step": 47
},
{
"epoch": 0.0019408250528167234,
"grad_norm": 16.2254638671875,
"learning_rate": 4.8e-05,
"loss": 0.5085,
"step": 48
},
{
"epoch": 0.0019812589080837385,
"grad_norm": 13.377281188964844,
"learning_rate": 4.9e-05,
"loss": 0.4047,
"step": 49
},
{
"epoch": 0.0020216927633507535,
"grad_norm": 16.529783248901367,
"learning_rate": 5e-05,
"loss": 0.602,
"step": 50
},
{
"epoch": 0.0020621266186177686,
"grad_norm": 16.30471420288086,
"learning_rate": 5.1000000000000006e-05,
"loss": 0.4734,
"step": 51
},
{
"epoch": 0.0021025604738847837,
"grad_norm": 9.81867790222168,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.4394,
"step": 52
},
{
"epoch": 0.0021429943291517988,
"grad_norm": 8.821556091308594,
"learning_rate": 5.300000000000001e-05,
"loss": 0.3627,
"step": 53
},
{
"epoch": 0.002183428184418814,
"grad_norm": 7.72442626953125,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.3547,
"step": 54
},
{
"epoch": 0.002223862039685829,
"grad_norm": 9.638863563537598,
"learning_rate": 5.500000000000001e-05,
"loss": 0.3275,
"step": 55
},
{
"epoch": 0.002264295894952844,
"grad_norm": 6.1317458152771,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.2867,
"step": 56
},
{
"epoch": 0.002304729750219859,
"grad_norm": 11.842965126037598,
"learning_rate": 5.6999999999999996e-05,
"loss": 0.3486,
"step": 57
},
{
"epoch": 0.002345163605486874,
"grad_norm": 3.987241506576538,
"learning_rate": 5.8e-05,
"loss": 0.2699,
"step": 58
},
{
"epoch": 0.002385597460753889,
"grad_norm": 6.591022968292236,
"learning_rate": 5.9e-05,
"loss": 0.3184,
"step": 59
},
{
"epoch": 0.0024260313160209043,
"grad_norm": 7.872280120849609,
"learning_rate": 6e-05,
"loss": 0.3346,
"step": 60
},
{
"epoch": 0.0024664651712879193,
"grad_norm": 2.6104869842529297,
"learning_rate": 6.1e-05,
"loss": 0.3243,
"step": 61
},
{
"epoch": 0.0025068990265549344,
"grad_norm": 3.023655652999878,
"learning_rate": 6.2e-05,
"loss": 0.3306,
"step": 62
},
{
"epoch": 0.0025473328818219495,
"grad_norm": 6.13469123840332,
"learning_rate": 6.3e-05,
"loss": 0.344,
"step": 63
},
{
"epoch": 0.0025877667370889645,
"grad_norm": 6.2675957679748535,
"learning_rate": 6.400000000000001e-05,
"loss": 0.3637,
"step": 64
},
{
"epoch": 0.0026282005923559796,
"grad_norm": 15.284539222717285,
"learning_rate": 6.500000000000001e-05,
"loss": 0.4253,
"step": 65
},
{
"epoch": 0.0026686344476229947,
"grad_norm": 13.781516075134277,
"learning_rate": 6.6e-05,
"loss": 0.3658,
"step": 66
},
{
"epoch": 0.0027090683028900098,
"grad_norm": 3.6815264225006104,
"learning_rate": 6.7e-05,
"loss": 0.3152,
"step": 67
},
{
"epoch": 0.002749502158157025,
"grad_norm": 5.936532497406006,
"learning_rate": 6.800000000000001e-05,
"loss": 0.312,
"step": 68
},
{
"epoch": 0.00278993601342404,
"grad_norm": 5.848452568054199,
"learning_rate": 6.9e-05,
"loss": 0.2422,
"step": 69
},
{
"epoch": 0.002830369868691055,
"grad_norm": 19.137374877929688,
"learning_rate": 7e-05,
"loss": 0.4338,
"step": 70
},
{
"epoch": 0.00287080372395807,
"grad_norm": 10.636536598205566,
"learning_rate": 7.1e-05,
"loss": 0.3493,
"step": 71
},
{
"epoch": 0.002911237579225085,
"grad_norm": 4.964332580566406,
"learning_rate": 7.2e-05,
"loss": 0.2712,
"step": 72
},
{
"epoch": 0.0029516714344921,
"grad_norm": 8.327373504638672,
"learning_rate": 7.3e-05,
"loss": 0.2796,
"step": 73
},
{
"epoch": 0.0029921052897591153,
"grad_norm": 8.643411636352539,
"learning_rate": 7.4e-05,
"loss": 0.3479,
"step": 74
},
{
"epoch": 0.0030325391450261303,
"grad_norm": 9.094339370727539,
"learning_rate": 7.500000000000001e-05,
"loss": 0.3282,
"step": 75
},
{
"epoch": 0.0030729730002931454,
"grad_norm": 16.117694854736328,
"learning_rate": 7.6e-05,
"loss": 0.4187,
"step": 76
},
{
"epoch": 0.0031134068555601605,
"grad_norm": 21.272748947143555,
"learning_rate": 7.7e-05,
"loss": 0.5197,
"step": 77
},
{
"epoch": 0.0031538407108271755,
"grad_norm": 5.69344425201416,
"learning_rate": 7.800000000000001e-05,
"loss": 0.2646,
"step": 78
},
{
"epoch": 0.0031942745660941906,
"grad_norm": 7.0776824951171875,
"learning_rate": 7.900000000000001e-05,
"loss": 0.282,
"step": 79
},
{
"epoch": 0.0032347084213612057,
"grad_norm": 5.962209701538086,
"learning_rate": 8e-05,
"loss": 0.3024,
"step": 80
},
{
"epoch": 0.0032751422766282207,
"grad_norm": 6.475072860717773,
"learning_rate": 8.1e-05,
"loss": 0.3527,
"step": 81
},
{
"epoch": 0.003315576131895236,
"grad_norm": 10.585362434387207,
"learning_rate": 8.2e-05,
"loss": 0.3635,
"step": 82
},
{
"epoch": 0.003356009987162251,
"grad_norm": 13.020295143127441,
"learning_rate": 8.3e-05,
"loss": 0.296,
"step": 83
},
{
"epoch": 0.003396443842429266,
"grad_norm": 12.620179176330566,
"learning_rate": 8.4e-05,
"loss": 0.3679,
"step": 84
},
{
"epoch": 0.003436877697696281,
"grad_norm": 14.295244216918945,
"learning_rate": 8.5e-05,
"loss": 0.3293,
"step": 85
},
{
"epoch": 0.003477311552963296,
"grad_norm": 11.91524887084961,
"learning_rate": 8.6e-05,
"loss": 0.3314,
"step": 86
},
{
"epoch": 0.003517745408230311,
"grad_norm": 4.24912166595459,
"learning_rate": 8.7e-05,
"loss": 0.3097,
"step": 87
},
{
"epoch": 0.0035581792634973262,
"grad_norm": 2.8676578998565674,
"learning_rate": 8.800000000000001e-05,
"loss": 0.2086,
"step": 88
},
{
"epoch": 0.0035986131187643413,
"grad_norm": 13.0736665725708,
"learning_rate": 8.900000000000001e-05,
"loss": 0.3789,
"step": 89
},
{
"epoch": 0.0036390469740313564,
"grad_norm": 15.212523460388184,
"learning_rate": 9e-05,
"loss": 0.3977,
"step": 90
},
{
"epoch": 0.0036794808292983715,
"grad_norm": 4.857946395874023,
"learning_rate": 9.1e-05,
"loss": 0.2782,
"step": 91
},
{
"epoch": 0.0037199146845653865,
"grad_norm": 13.703444480895996,
"learning_rate": 9.200000000000001e-05,
"loss": 0.4441,
"step": 92
},
{
"epoch": 0.0037603485398324016,
"grad_norm": 7.481781482696533,
"learning_rate": 9.300000000000001e-05,
"loss": 0.4544,
"step": 93
},
{
"epoch": 0.0038007823950994167,
"grad_norm": 10.419188499450684,
"learning_rate": 9.4e-05,
"loss": 0.2466,
"step": 94
},
{
"epoch": 0.0038412162503664317,
"grad_norm": 6.384120941162109,
"learning_rate": 9.5e-05,
"loss": 0.3243,
"step": 95
},
{
"epoch": 0.003881650105633447,
"grad_norm": 5.624557971954346,
"learning_rate": 9.6e-05,
"loss": 0.2636,
"step": 96
},
{
"epoch": 0.003922083960900462,
"grad_norm": 6.27712869644165,
"learning_rate": 9.7e-05,
"loss": 0.3291,
"step": 97
},
{
"epoch": 0.003962517816167477,
"grad_norm": 8.306694030761719,
"learning_rate": 9.8e-05,
"loss": 0.3152,
"step": 98
},
{
"epoch": 0.0040029516714344925,
"grad_norm": 10.752472877502441,
"learning_rate": 9.900000000000001e-05,
"loss": 0.3415,
"step": 99
},
{
"epoch": 0.004043385526701507,
"grad_norm": 10.531058311462402,
"learning_rate": 0.0001,
"loss": 0.4284,
"step": 100
},
{
"epoch": 0.004083819381968523,
"grad_norm": 16.106300354003906,
"learning_rate": 9.99999995932986e-05,
"loss": 0.6921,
"step": 101
},
{
"epoch": 0.004124253237235537,
"grad_norm": 11.43233585357666,
"learning_rate": 9.999999837319442e-05,
"loss": 0.3084,
"step": 102
},
{
"epoch": 0.004164687092502553,
"grad_norm": 5.925229072570801,
"learning_rate": 9.999999633968746e-05,
"loss": 0.2721,
"step": 103
},
{
"epoch": 0.004205120947769567,
"grad_norm": 13.037198066711426,
"learning_rate": 9.999999349277778e-05,
"loss": 0.3153,
"step": 104
},
{
"epoch": 0.004245554803036583,
"grad_norm": 11.120277404785156,
"learning_rate": 9.999998983246538e-05,
"loss": 0.3173,
"step": 105
},
{
"epoch": 0.0042859886583035975,
"grad_norm": 17.156024932861328,
"learning_rate": 9.999998535875038e-05,
"loss": 0.582,
"step": 106
},
{
"epoch": 0.004326422513570613,
"grad_norm": 1.7565780878067017,
"learning_rate": 9.999998007163281e-05,
"loss": 0.1944,
"step": 107
},
{
"epoch": 0.004366856368837628,
"grad_norm": 4.962274074554443,
"learning_rate": 9.999997397111278e-05,
"loss": 0.2287,
"step": 108
},
{
"epoch": 0.004407290224104643,
"grad_norm": 5.148132801055908,
"learning_rate": 9.999996705719036e-05,
"loss": 0.2062,
"step": 109
},
{
"epoch": 0.004447724079371658,
"grad_norm": 7.428779602050781,
"learning_rate": 9.999995932986568e-05,
"loss": 0.2594,
"step": 110
},
{
"epoch": 0.004488157934638673,
"grad_norm": 7.802266597747803,
"learning_rate": 9.999995078913888e-05,
"loss": 0.2838,
"step": 111
},
{
"epoch": 0.004528591789905688,
"grad_norm": 9.690343856811523,
"learning_rate": 9.999994143501008e-05,
"loss": 0.2114,
"step": 112
},
{
"epoch": 0.0045690256451727035,
"grad_norm": 8.961145401000977,
"learning_rate": 9.999993126747943e-05,
"loss": 0.1836,
"step": 113
},
{
"epoch": 0.004609459500439718,
"grad_norm": 2.7630367279052734,
"learning_rate": 9.999992028654711e-05,
"loss": 0.1005,
"step": 114
},
{
"epoch": 0.004649893355706734,
"grad_norm": 20.46099090576172,
"learning_rate": 9.999990849221329e-05,
"loss": 0.4513,
"step": 115
},
{
"epoch": 0.004690327210973748,
"grad_norm": 11.76425838470459,
"learning_rate": 9.999989588447816e-05,
"loss": 0.348,
"step": 116
},
{
"epoch": 0.004730761066240764,
"grad_norm": 13.530948638916016,
"learning_rate": 9.999988246334193e-05,
"loss": 0.2852,
"step": 117
},
{
"epoch": 0.004771194921507778,
"grad_norm": 4.60286808013916,
"learning_rate": 9.999986822880483e-05,
"loss": 0.1511,
"step": 118
},
{
"epoch": 0.004811628776774794,
"grad_norm": 8.1397705078125,
"learning_rate": 9.999985318086706e-05,
"loss": 0.3053,
"step": 119
},
{
"epoch": 0.0048520626320418085,
"grad_norm": 9.727378845214844,
"learning_rate": 9.999983731952889e-05,
"loss": 0.3261,
"step": 120
},
{
"epoch": 0.004892496487308824,
"grad_norm": 6.355672359466553,
"learning_rate": 9.999982064479057e-05,
"loss": 0.1726,
"step": 121
},
{
"epoch": 0.004932930342575839,
"grad_norm": 8.556386947631836,
"learning_rate": 9.999980315665237e-05,
"loss": 0.21,
"step": 122
},
{
"epoch": 0.004973364197842854,
"grad_norm": 9.588984489440918,
"learning_rate": 9.999978485511459e-05,
"loss": 0.3401,
"step": 123
},
{
"epoch": 0.005013798053109869,
"grad_norm": 6.82341194152832,
"learning_rate": 9.999976574017749e-05,
"loss": 0.2865,
"step": 124
},
{
"epoch": 0.005054231908376884,
"grad_norm": 13.025908470153809,
"learning_rate": 9.999974581184142e-05,
"loss": 0.3679,
"step": 125
},
{
"epoch": 0.005094665763643899,
"grad_norm": 7.907348155975342,
"learning_rate": 9.999972507010669e-05,
"loss": 0.2739,
"step": 126
},
{
"epoch": 0.0051350996189109144,
"grad_norm": 5.3686676025390625,
"learning_rate": 9.999970351497363e-05,
"loss": 0.1398,
"step": 127
},
{
"epoch": 0.005175533474177929,
"grad_norm": 6.662126064300537,
"learning_rate": 9.99996811464426e-05,
"loss": 0.2436,
"step": 128
},
{
"epoch": 0.005215967329444945,
"grad_norm": 7.145336627960205,
"learning_rate": 9.999965796451397e-05,
"loss": 0.174,
"step": 129
},
{
"epoch": 0.005256401184711959,
"grad_norm": 7.168648719787598,
"learning_rate": 9.99996339691881e-05,
"loss": 0.2184,
"step": 130
},
{
"epoch": 0.005296835039978975,
"grad_norm": 10.925111770629883,
"learning_rate": 9.99996091604654e-05,
"loss": 0.3104,
"step": 131
},
{
"epoch": 0.005337268895245989,
"grad_norm": 7.4000396728515625,
"learning_rate": 9.999958353834624e-05,
"loss": 0.1764,
"step": 132
},
{
"epoch": 0.005377702750513005,
"grad_norm": 14.293291091918945,
"learning_rate": 9.999955710283109e-05,
"loss": 0.3948,
"step": 133
},
{
"epoch": 0.0054181366057800195,
"grad_norm": 17.54743003845215,
"learning_rate": 9.999952985392033e-05,
"loss": 0.4679,
"step": 134
},
{
"epoch": 0.005458570461047035,
"grad_norm": 10.179829597473145,
"learning_rate": 9.999950179161442e-05,
"loss": 0.247,
"step": 135
},
{
"epoch": 0.00549900431631405,
"grad_norm": 8.208870887756348,
"learning_rate": 9.999947291591383e-05,
"loss": 0.3418,
"step": 136
},
{
"epoch": 0.005539438171581065,
"grad_norm": 7.8983917236328125,
"learning_rate": 9.9999443226819e-05,
"loss": 0.3445,
"step": 137
},
{
"epoch": 0.00557987202684808,
"grad_norm": 14.267950057983398,
"learning_rate": 9.999941272433046e-05,
"loss": 0.3628,
"step": 138
},
{
"epoch": 0.005620305882115095,
"grad_norm": 11.430856704711914,
"learning_rate": 9.999938140844866e-05,
"loss": 0.278,
"step": 139
},
{
"epoch": 0.00566073973738211,
"grad_norm": 8.389185905456543,
"learning_rate": 9.999934927917414e-05,
"loss": 0.3661,
"step": 140
},
{
"epoch": 0.0057011735926491254,
"grad_norm": 8.984382629394531,
"learning_rate": 9.999931633650739e-05,
"loss": 0.3152,
"step": 141
},
{
"epoch": 0.00574160744791614,
"grad_norm": 6.51492166519165,
"learning_rate": 9.999928258044899e-05,
"loss": 0.2281,
"step": 142
},
{
"epoch": 0.005782041303183156,
"grad_norm": 7.01376485824585,
"learning_rate": 9.999924801099946e-05,
"loss": 0.3193,
"step": 143
},
{
"epoch": 0.00582247515845017,
"grad_norm": 7.640584468841553,
"learning_rate": 9.999921262815936e-05,
"loss": 0.2331,
"step": 144
},
{
"epoch": 0.005862909013717186,
"grad_norm": 4.045457363128662,
"learning_rate": 9.999917643192928e-05,
"loss": 0.1184,
"step": 145
},
{
"epoch": 0.0059033428689842,
"grad_norm": 13.00910758972168,
"learning_rate": 9.999913942230979e-05,
"loss": 0.3623,
"step": 146
},
{
"epoch": 0.005943776724251216,
"grad_norm": 7.396110534667969,
"learning_rate": 9.999910159930151e-05,
"loss": 0.2281,
"step": 147
},
{
"epoch": 0.0059842105795182305,
"grad_norm": 3.814600944519043,
"learning_rate": 9.999906296290506e-05,
"loss": 0.1162,
"step": 148
},
{
"epoch": 0.006024644434785246,
"grad_norm": 10.155074119567871,
"learning_rate": 9.999902351312105e-05,
"loss": 0.2232,
"step": 149
},
{
"epoch": 0.006065078290052261,
"grad_norm": 7.059305667877197,
"learning_rate": 9.999898324995013e-05,
"loss": 0.1808,
"step": 150
},
{
"epoch": 0.006105512145319276,
"grad_norm": 12.093023300170898,
"learning_rate": 9.999894217339296e-05,
"loss": 0.3253,
"step": 151
},
{
"epoch": 0.006145946000586291,
"grad_norm": 13.166694641113281,
"learning_rate": 9.999890028345019e-05,
"loss": 0.3509,
"step": 152
},
{
"epoch": 0.006186379855853306,
"grad_norm": 5.3184733390808105,
"learning_rate": 9.999885758012253e-05,
"loss": 0.1875,
"step": 153
},
{
"epoch": 0.006226813711120321,
"grad_norm": 7.991506099700928,
"learning_rate": 9.999881406341065e-05,
"loss": 0.186,
"step": 154
},
{
"epoch": 0.006267247566387336,
"grad_norm": 10.271965026855469,
"learning_rate": 9.999876973331528e-05,
"loss": 0.3393,
"step": 155
},
{
"epoch": 0.006307681421654351,
"grad_norm": 8.440420150756836,
"learning_rate": 9.99987245898371e-05,
"loss": 0.3662,
"step": 156
},
{
"epoch": 0.006348115276921367,
"grad_norm": 9.392427444458008,
"learning_rate": 9.99986786329769e-05,
"loss": 0.2827,
"step": 157
},
{
"epoch": 0.006388549132188381,
"grad_norm": 11.753120422363281,
"learning_rate": 9.999863186273539e-05,
"loss": 0.3695,
"step": 158
},
{
"epoch": 0.006428982987455397,
"grad_norm": 7.335087776184082,
"learning_rate": 9.999858427911335e-05,
"loss": 0.2826,
"step": 159
},
{
"epoch": 0.006469416842722411,
"grad_norm": 8.617646217346191,
"learning_rate": 9.999853588211154e-05,
"loss": 0.2268,
"step": 160
},
{
"epoch": 0.006509850697989427,
"grad_norm": 6.348892688751221,
"learning_rate": 9.999848667173075e-05,
"loss": 0.2811,
"step": 161
},
{
"epoch": 0.0065502845532564415,
"grad_norm": 9.774231910705566,
"learning_rate": 9.999843664797178e-05,
"loss": 0.4024,
"step": 162
},
{
"epoch": 0.006590718408523457,
"grad_norm": 11.390604019165039,
"learning_rate": 9.999838581083546e-05,
"loss": 0.2387,
"step": 163
},
{
"epoch": 0.006631152263790472,
"grad_norm": 4.1529388427734375,
"learning_rate": 9.99983341603226e-05,
"loss": 0.2513,
"step": 164
},
{
"epoch": 0.006671586119057487,
"grad_norm": 9.902484893798828,
"learning_rate": 9.999828169643404e-05,
"loss": 0.2148,
"step": 165
},
{
"epoch": 0.006712019974324502,
"grad_norm": 13.555265426635742,
"learning_rate": 9.999822841917064e-05,
"loss": 0.3568,
"step": 166
},
{
"epoch": 0.006752453829591517,
"grad_norm": 10.916128158569336,
"learning_rate": 9.999817432853326e-05,
"loss": 0.2892,
"step": 167
},
{
"epoch": 0.006792887684858532,
"grad_norm": 5.216245651245117,
"learning_rate": 9.999811942452279e-05,
"loss": 0.2052,
"step": 168
},
{
"epoch": 0.006833321540125547,
"grad_norm": 11.540072441101074,
"learning_rate": 9.999806370714011e-05,
"loss": 0.3585,
"step": 169
},
{
"epoch": 0.006873755395392562,
"grad_norm": 5.5519304275512695,
"learning_rate": 9.999800717638614e-05,
"loss": 0.2453,
"step": 170
},
{
"epoch": 0.006914189250659578,
"grad_norm": 7.624457359313965,
"learning_rate": 9.999794983226179e-05,
"loss": 0.2453,
"step": 171
},
{
"epoch": 0.006954623105926592,
"grad_norm": 3.1242964267730713,
"learning_rate": 9.999789167476801e-05,
"loss": 0.2162,
"step": 172
},
{
"epoch": 0.006995056961193608,
"grad_norm": 5.320684432983398,
"learning_rate": 9.999783270390572e-05,
"loss": 0.2053,
"step": 173
},
{
"epoch": 0.007035490816460622,
"grad_norm": 9.082324028015137,
"learning_rate": 9.999777291967589e-05,
"loss": 0.3074,
"step": 174
},
{
"epoch": 0.007075924671727638,
"grad_norm": 9.537432670593262,
"learning_rate": 9.999771232207951e-05,
"loss": 0.2791,
"step": 175
},
{
"epoch": 0.0071163585269946525,
"grad_norm": 9.438758850097656,
"learning_rate": 9.999765091111754e-05,
"loss": 0.2213,
"step": 176
},
{
"epoch": 0.007156792382261668,
"grad_norm": 6.272062301635742,
"learning_rate": 9.999758868679099e-05,
"loss": 0.2219,
"step": 177
},
{
"epoch": 0.007197226237528683,
"grad_norm": 3.2677524089813232,
"learning_rate": 9.999752564910086e-05,
"loss": 0.2241,
"step": 178
},
{
"epoch": 0.007237660092795698,
"grad_norm": 3.407979726791382,
"learning_rate": 9.99974617980482e-05,
"loss": 0.1709,
"step": 179
},
{
"epoch": 0.007278093948062713,
"grad_norm": 7.650908946990967,
"learning_rate": 9.999739713363404e-05,
"loss": 0.189,
"step": 180
},
{
"epoch": 0.007318527803329728,
"grad_norm": 5.595089912414551,
"learning_rate": 9.999733165585943e-05,
"loss": 0.1611,
"step": 181
},
{
"epoch": 0.007358961658596743,
"grad_norm": 5.560061931610107,
"learning_rate": 9.999726536472542e-05,
"loss": 0.1824,
"step": 182
},
{
"epoch": 0.007399395513863758,
"grad_norm": 13.8944091796875,
"learning_rate": 9.99971982602331e-05,
"loss": 0.3737,
"step": 183
},
{
"epoch": 0.007439829369130773,
"grad_norm": 5.863430976867676,
"learning_rate": 9.999713034238359e-05,
"loss": 0.2017,
"step": 184
},
{
"epoch": 0.0074802632243977886,
"grad_norm": 4.334754467010498,
"learning_rate": 9.999706161117795e-05,
"loss": 0.0885,
"step": 185
},
{
"epoch": 0.007520697079664803,
"grad_norm": 5.766237735748291,
"learning_rate": 9.99969920666173e-05,
"loss": 0.254,
"step": 186
},
{
"epoch": 0.007561130934931819,
"grad_norm": 4.142415523529053,
"learning_rate": 9.99969217087028e-05,
"loss": 0.1412,
"step": 187
},
{
"epoch": 0.007601564790198833,
"grad_norm": 7.84074068069458,
"learning_rate": 9.999685053743559e-05,
"loss": 0.1959,
"step": 188
},
{
"epoch": 0.007641998645465849,
"grad_norm": 8.681429862976074,
"learning_rate": 9.999677855281682e-05,
"loss": 0.1584,
"step": 189
},
{
"epoch": 0.0076824325007328635,
"grad_norm": 9.750258445739746,
"learning_rate": 9.999670575484765e-05,
"loss": 0.2074,
"step": 190
},
{
"epoch": 0.007722866355999879,
"grad_norm": 7.412321090698242,
"learning_rate": 9.999663214352929e-05,
"loss": 0.1696,
"step": 191
},
{
"epoch": 0.007763300211266894,
"grad_norm": 9.03699016571045,
"learning_rate": 9.999655771886291e-05,
"loss": 0.1942,
"step": 192
},
{
"epoch": 0.007803734066533909,
"grad_norm": 7.925232887268066,
"learning_rate": 9.999648248084974e-05,
"loss": 0.1793,
"step": 193
},
{
"epoch": 0.007844167921800925,
"grad_norm": 7.363532066345215,
"learning_rate": 9.9996406429491e-05,
"loss": 0.1718,
"step": 194
},
{
"epoch": 0.00788460177706794,
"grad_norm": 9.17047119140625,
"learning_rate": 9.999632956478793e-05,
"loss": 0.404,
"step": 195
},
{
"epoch": 0.007925035632334954,
"grad_norm": 8.83364486694336,
"learning_rate": 9.999625188674175e-05,
"loss": 0.2276,
"step": 196
},
{
"epoch": 0.007965469487601969,
"grad_norm": 9.548094749450684,
"learning_rate": 9.999617339535378e-05,
"loss": 0.1875,
"step": 197
},
{
"epoch": 0.008005903342868985,
"grad_norm": 6.08480167388916,
"learning_rate": 9.999609409062525e-05,
"loss": 0.145,
"step": 198
},
{
"epoch": 0.008046337198136,
"grad_norm": 15.142061233520508,
"learning_rate": 9.999601397255747e-05,
"loss": 0.4395,
"step": 199
},
{
"epoch": 0.008086771053403014,
"grad_norm": 15.888526916503906,
"learning_rate": 9.999593304115174e-05,
"loss": 0.5784,
"step": 200
},
{
"epoch": 0.008127204908670029,
"grad_norm": 6.388537883758545,
"learning_rate": 9.999585129640936e-05,
"loss": 0.2958,
"step": 201
},
{
"epoch": 0.008167638763937045,
"grad_norm": 5.720010757446289,
"learning_rate": 9.999576873833169e-05,
"loss": 0.1192,
"step": 202
},
{
"epoch": 0.00820807261920406,
"grad_norm": 7.905060291290283,
"learning_rate": 9.999568536692006e-05,
"loss": 0.3184,
"step": 203
},
{
"epoch": 0.008248506474471074,
"grad_norm": 3.085916519165039,
"learning_rate": 9.999560118217583e-05,
"loss": 0.1954,
"step": 204
},
{
"epoch": 0.008288940329738089,
"grad_norm": 8.547829627990723,
"learning_rate": 9.999551618410034e-05,
"loss": 0.2605,
"step": 205
},
{
"epoch": 0.008329374185005105,
"grad_norm": 7.506508827209473,
"learning_rate": 9.999543037269504e-05,
"loss": 0.3028,
"step": 206
},
{
"epoch": 0.00836980804027212,
"grad_norm": 4.869304656982422,
"learning_rate": 9.999534374796124e-05,
"loss": 0.2271,
"step": 207
},
{
"epoch": 0.008410241895539135,
"grad_norm": 7.360259056091309,
"learning_rate": 9.999525630990041e-05,
"loss": 0.2761,
"step": 208
},
{
"epoch": 0.00845067575080615,
"grad_norm": 6.078726768493652,
"learning_rate": 9.999516805851397e-05,
"loss": 0.2623,
"step": 209
},
{
"epoch": 0.008491109606073166,
"grad_norm": 2.553845167160034,
"learning_rate": 9.999507899380331e-05,
"loss": 0.1659,
"step": 210
},
{
"epoch": 0.00853154346134018,
"grad_norm": 3.2435362339019775,
"learning_rate": 9.999498911576993e-05,
"loss": 0.1498,
"step": 211
},
{
"epoch": 0.008571977316607195,
"grad_norm": 6.379277229309082,
"learning_rate": 9.999489842441527e-05,
"loss": 0.2309,
"step": 212
},
{
"epoch": 0.00861241117187421,
"grad_norm": 4.347065448760986,
"learning_rate": 9.99948069197408e-05,
"loss": 0.2128,
"step": 213
},
{
"epoch": 0.008652845027141226,
"grad_norm": 9.05762767791748,
"learning_rate": 9.999471460174803e-05,
"loss": 0.1779,
"step": 214
},
{
"epoch": 0.00869327888240824,
"grad_norm": 4.830628871917725,
"learning_rate": 9.999462147043843e-05,
"loss": 0.1574,
"step": 215
},
{
"epoch": 0.008733712737675255,
"grad_norm": 3.8125157356262207,
"learning_rate": 9.999452752581355e-05,
"loss": 0.1504,
"step": 216
},
{
"epoch": 0.00877414659294227,
"grad_norm": 2.8179168701171875,
"learning_rate": 9.999443276787489e-05,
"loss": 0.1777,
"step": 217
},
{
"epoch": 0.008814580448209286,
"grad_norm": 5.864883899688721,
"learning_rate": 9.9994337196624e-05,
"loss": 0.3495,
"step": 218
},
{
"epoch": 0.008855014303476301,
"grad_norm": 6.230106353759766,
"learning_rate": 9.999424081206245e-05,
"loss": 0.1318,
"step": 219
},
{
"epoch": 0.008895448158743316,
"grad_norm": 6.827365875244141,
"learning_rate": 9.999414361419178e-05,
"loss": 0.1966,
"step": 220
},
{
"epoch": 0.00893588201401033,
"grad_norm": 12.011444091796875,
"learning_rate": 9.99940456030136e-05,
"loss": 0.2844,
"step": 221
},
{
"epoch": 0.008976315869277347,
"grad_norm": 7.509864330291748,
"learning_rate": 9.999394677852948e-05,
"loss": 0.3334,
"step": 222
},
{
"epoch": 0.009016749724544361,
"grad_norm": 10.87213134765625,
"learning_rate": 9.999384714074105e-05,
"loss": 0.2559,
"step": 223
},
{
"epoch": 0.009057183579811376,
"grad_norm": 7.5143327713012695,
"learning_rate": 9.99937466896499e-05,
"loss": 0.1708,
"step": 224
},
{
"epoch": 0.00909761743507839,
"grad_norm": 17.510313034057617,
"learning_rate": 9.99936454252577e-05,
"loss": 0.4556,
"step": 225
},
{
"epoch": 0.009138051290345407,
"grad_norm": 6.468897342681885,
"learning_rate": 9.999354334756608e-05,
"loss": 0.2092,
"step": 226
},
{
"epoch": 0.009178485145612422,
"grad_norm": 6.171923637390137,
"learning_rate": 9.99934404565767e-05,
"loss": 0.2199,
"step": 227
},
{
"epoch": 0.009218919000879436,
"grad_norm": 8.079005241394043,
"learning_rate": 9.999333675229123e-05,
"loss": 0.2471,
"step": 228
},
{
"epoch": 0.00925935285614645,
"grad_norm": 11.559858322143555,
"learning_rate": 9.999323223471136e-05,
"loss": 0.3881,
"step": 229
},
{
"epoch": 0.009299786711413467,
"grad_norm": 16.177331924438477,
"learning_rate": 9.999312690383881e-05,
"loss": 0.4198,
"step": 230
},
{
"epoch": 0.009340220566680482,
"grad_norm": 16.129283905029297,
"learning_rate": 9.999302075967526e-05,
"loss": 0.3592,
"step": 231
},
{
"epoch": 0.009380654421947496,
"grad_norm": 13.0033597946167,
"learning_rate": 9.999291380222246e-05,
"loss": 0.3194,
"step": 232
},
{
"epoch": 0.009421088277214511,
"grad_norm": 5.525213718414307,
"learning_rate": 9.999280603148215e-05,
"loss": 0.1632,
"step": 233
},
{
"epoch": 0.009461522132481527,
"grad_norm": 7.511592388153076,
"learning_rate": 9.999269744745606e-05,
"loss": 0.2767,
"step": 234
},
{
"epoch": 0.009501955987748542,
"grad_norm": 4.852881908416748,
"learning_rate": 9.999258805014599e-05,
"loss": 0.1132,
"step": 235
},
{
"epoch": 0.009542389843015557,
"grad_norm": 5.765609264373779,
"learning_rate": 9.999247783955369e-05,
"loss": 0.1631,
"step": 236
},
{
"epoch": 0.009582823698282571,
"grad_norm": 9.90166187286377,
"learning_rate": 9.999236681568097e-05,
"loss": 0.2498,
"step": 237
},
{
"epoch": 0.009623257553549588,
"grad_norm": 7.612752914428711,
"learning_rate": 9.999225497852962e-05,
"loss": 0.2766,
"step": 238
},
{
"epoch": 0.009663691408816602,
"grad_norm": 11.317419052124023,
"learning_rate": 9.99921423281015e-05,
"loss": 0.2912,
"step": 239
}
],
"logging_steps": 1,
"max_steps": 24731,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 239,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4782353377329152e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}