KoMoAlpaca-0522-6epoch / trainer_state.json
mingming2000's picture
Upload checkpoint-17500
eb28dd8 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.0,
"eval_steps": 500,
"global_step": 7500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008,
"grad_norm": 1.6428182125091553,
"learning_rate": 0.00029983999999999995,
"loss": 4.0499,
"step": 10
},
{
"epoch": 0.016,
"grad_norm": 1.1695531606674194,
"learning_rate": 0.00029968,
"loss": 2.6823,
"step": 20
},
{
"epoch": 0.024,
"grad_norm": 0.982557475566864,
"learning_rate": 0.00029951999999999995,
"loss": 2.3495,
"step": 30
},
{
"epoch": 0.032,
"grad_norm": 1.119385004043579,
"learning_rate": 0.00029936,
"loss": 2.187,
"step": 40
},
{
"epoch": 0.04,
"grad_norm": 1.1943817138671875,
"learning_rate": 0.00029919999999999995,
"loss": 2.1278,
"step": 50
},
{
"epoch": 0.048,
"grad_norm": 1.0324301719665527,
"learning_rate": 0.00029904,
"loss": 2.1567,
"step": 60
},
{
"epoch": 0.056,
"grad_norm": 1.0339545011520386,
"learning_rate": 0.00029887999999999996,
"loss": 2.1682,
"step": 70
},
{
"epoch": 0.064,
"grad_norm": 1.1292812824249268,
"learning_rate": 0.00029872,
"loss": 2.0833,
"step": 80
},
{
"epoch": 0.072,
"grad_norm": 1.112321376800537,
"learning_rate": 0.00029855999999999996,
"loss": 2.0453,
"step": 90
},
{
"epoch": 0.08,
"grad_norm": 1.2117633819580078,
"learning_rate": 0.0002984,
"loss": 2.1188,
"step": 100
},
{
"epoch": 0.088,
"grad_norm": 1.0593370199203491,
"learning_rate": 0.00029823999999999996,
"loss": 2.1201,
"step": 110
},
{
"epoch": 0.096,
"grad_norm": 1.1461642980575562,
"learning_rate": 0.00029808,
"loss": 2.035,
"step": 120
},
{
"epoch": 0.104,
"grad_norm": 1.2336146831512451,
"learning_rate": 0.00029791999999999997,
"loss": 2.0329,
"step": 130
},
{
"epoch": 0.112,
"grad_norm": 1.0999081134796143,
"learning_rate": 0.00029776,
"loss": 2.029,
"step": 140
},
{
"epoch": 0.12,
"grad_norm": 1.109130620956421,
"learning_rate": 0.00029759999999999997,
"loss": 2.0032,
"step": 150
},
{
"epoch": 0.128,
"grad_norm": 1.150937557220459,
"learning_rate": 0.00029744,
"loss": 2.039,
"step": 160
},
{
"epoch": 0.136,
"grad_norm": 1.1265838146209717,
"learning_rate": 0.00029727999999999997,
"loss": 2.0358,
"step": 170
},
{
"epoch": 0.144,
"grad_norm": 1.1429523229599,
"learning_rate": 0.00029711999999999995,
"loss": 2.0357,
"step": 180
},
{
"epoch": 0.152,
"grad_norm": 1.0551432371139526,
"learning_rate": 0.00029696,
"loss": 2.0233,
"step": 190
},
{
"epoch": 0.16,
"grad_norm": 1.1221256256103516,
"learning_rate": 0.00029679999999999995,
"loss": 2.0512,
"step": 200
},
{
"epoch": 0.168,
"grad_norm": 1.0235646963119507,
"learning_rate": 0.00029664,
"loss": 2.0874,
"step": 210
},
{
"epoch": 0.176,
"grad_norm": 1.0271421670913696,
"learning_rate": 0.00029647999999999995,
"loss": 2.007,
"step": 220
},
{
"epoch": 0.184,
"grad_norm": 1.1792947053909302,
"learning_rate": 0.00029632,
"loss": 1.9954,
"step": 230
},
{
"epoch": 0.192,
"grad_norm": 1.1998450756072998,
"learning_rate": 0.00029615999999999996,
"loss": 1.9629,
"step": 240
},
{
"epoch": 0.2,
"grad_norm": 1.0941493511199951,
"learning_rate": 0.000296,
"loss": 1.9895,
"step": 250
},
{
"epoch": 0.208,
"grad_norm": 1.1195231676101685,
"learning_rate": 0.00029583999999999996,
"loss": 1.9704,
"step": 260
},
{
"epoch": 0.216,
"grad_norm": 1.0294626951217651,
"learning_rate": 0.00029568,
"loss": 1.9912,
"step": 270
},
{
"epoch": 0.224,
"grad_norm": 1.0843749046325684,
"learning_rate": 0.00029551999999999996,
"loss": 1.9365,
"step": 280
},
{
"epoch": 0.232,
"grad_norm": 0.8985214233398438,
"learning_rate": 0.00029536,
"loss": 2.0002,
"step": 290
},
{
"epoch": 0.24,
"grad_norm": 1.0384533405303955,
"learning_rate": 0.00029519999999999997,
"loss": 1.94,
"step": 300
},
{
"epoch": 0.248,
"grad_norm": 1.1195266246795654,
"learning_rate": 0.00029504,
"loss": 2.0072,
"step": 310
},
{
"epoch": 0.256,
"grad_norm": 1.0751473903656006,
"learning_rate": 0.00029487999999999997,
"loss": 1.9446,
"step": 320
},
{
"epoch": 0.264,
"grad_norm": 1.0846151113510132,
"learning_rate": 0.00029472,
"loss": 1.9619,
"step": 330
},
{
"epoch": 0.272,
"grad_norm": 1.0839966535568237,
"learning_rate": 0.00029455999999999997,
"loss": 1.9454,
"step": 340
},
{
"epoch": 0.28,
"grad_norm": 1.0731072425842285,
"learning_rate": 0.00029439999999999995,
"loss": 1.9626,
"step": 350
},
{
"epoch": 0.288,
"grad_norm": 1.0523524284362793,
"learning_rate": 0.00029424,
"loss": 1.913,
"step": 360
},
{
"epoch": 0.296,
"grad_norm": 1.0012118816375732,
"learning_rate": 0.00029407999999999995,
"loss": 1.9395,
"step": 370
},
{
"epoch": 0.304,
"grad_norm": 0.9734252691268921,
"learning_rate": 0.00029392,
"loss": 2.0065,
"step": 380
},
{
"epoch": 0.312,
"grad_norm": 1.127196192741394,
"learning_rate": 0.00029375999999999995,
"loss": 1.8512,
"step": 390
},
{
"epoch": 0.32,
"grad_norm": 1.2507715225219727,
"learning_rate": 0.0002936,
"loss": 1.9136,
"step": 400
},
{
"epoch": 0.328,
"grad_norm": 1.0916541814804077,
"learning_rate": 0.00029343999999999996,
"loss": 1.8957,
"step": 410
},
{
"epoch": 0.336,
"grad_norm": 1.1081781387329102,
"learning_rate": 0.00029328,
"loss": 1.9262,
"step": 420
},
{
"epoch": 0.344,
"grad_norm": 1.1098934412002563,
"learning_rate": 0.00029311999999999996,
"loss": 1.9213,
"step": 430
},
{
"epoch": 0.352,
"grad_norm": 1.0184811353683472,
"learning_rate": 0.00029296,
"loss": 1.9374,
"step": 440
},
{
"epoch": 0.36,
"grad_norm": 1.1124446392059326,
"learning_rate": 0.00029279999999999996,
"loss": 1.9237,
"step": 450
},
{
"epoch": 0.368,
"grad_norm": 1.1229047775268555,
"learning_rate": 0.00029264,
"loss": 1.8897,
"step": 460
},
{
"epoch": 0.376,
"grad_norm": 1.0087217092514038,
"learning_rate": 0.00029247999999999996,
"loss": 1.9317,
"step": 470
},
{
"epoch": 0.384,
"grad_norm": 1.0527478456497192,
"learning_rate": 0.00029232,
"loss": 1.9571,
"step": 480
},
{
"epoch": 0.392,
"grad_norm": 0.9762263894081116,
"learning_rate": 0.00029215999999999997,
"loss": 1.911,
"step": 490
},
{
"epoch": 0.4,
"grad_norm": 1.0288947820663452,
"learning_rate": 0.000292,
"loss": 1.8763,
"step": 500
},
{
"epoch": 0.408,
"grad_norm": 1.0375839471817017,
"learning_rate": 0.00029183999999999997,
"loss": 1.9924,
"step": 510
},
{
"epoch": 0.416,
"grad_norm": 1.005863904953003,
"learning_rate": 0.00029167999999999994,
"loss": 1.8497,
"step": 520
},
{
"epoch": 0.424,
"grad_norm": 0.9753358960151672,
"learning_rate": 0.00029152,
"loss": 1.9155,
"step": 530
},
{
"epoch": 0.432,
"grad_norm": 1.0157995223999023,
"learning_rate": 0.00029135999999999995,
"loss": 1.9108,
"step": 540
},
{
"epoch": 0.44,
"grad_norm": 1.1655962467193604,
"learning_rate": 0.0002912,
"loss": 1.8594,
"step": 550
},
{
"epoch": 0.448,
"grad_norm": 1.0194449424743652,
"learning_rate": 0.00029103999999999995,
"loss": 1.8832,
"step": 560
},
{
"epoch": 0.456,
"grad_norm": 1.0156056880950928,
"learning_rate": 0.00029088,
"loss": 1.9253,
"step": 570
},
{
"epoch": 0.464,
"grad_norm": 1.031867265701294,
"learning_rate": 0.00029071999999999995,
"loss": 1.8896,
"step": 580
},
{
"epoch": 0.472,
"grad_norm": 0.9771973490715027,
"learning_rate": 0.00029056,
"loss": 1.8817,
"step": 590
},
{
"epoch": 0.48,
"grad_norm": 1.0212839841842651,
"learning_rate": 0.00029039999999999996,
"loss": 1.9077,
"step": 600
},
{
"epoch": 0.488,
"grad_norm": 1.09153413772583,
"learning_rate": 0.00029024,
"loss": 1.8725,
"step": 610
},
{
"epoch": 0.496,
"grad_norm": 1.043017029762268,
"learning_rate": 0.00029007999999999996,
"loss": 1.8432,
"step": 620
},
{
"epoch": 0.504,
"grad_norm": 0.9705913066864014,
"learning_rate": 0.00028992,
"loss": 1.8996,
"step": 630
},
{
"epoch": 0.512,
"grad_norm": 0.9535217881202698,
"learning_rate": 0.00028975999999999996,
"loss": 1.9339,
"step": 640
},
{
"epoch": 0.52,
"grad_norm": 1.1274858713150024,
"learning_rate": 0.0002896,
"loss": 1.8275,
"step": 650
},
{
"epoch": 0.528,
"grad_norm": 1.1044244766235352,
"learning_rate": 0.00028943999999999997,
"loss": 1.894,
"step": 660
},
{
"epoch": 0.536,
"grad_norm": 1.0410267114639282,
"learning_rate": 0.00028928,
"loss": 1.9064,
"step": 670
},
{
"epoch": 0.544,
"grad_norm": 1.118211269378662,
"learning_rate": 0.00028911999999999997,
"loss": 1.8881,
"step": 680
},
{
"epoch": 0.552,
"grad_norm": 1.0527877807617188,
"learning_rate": 0.00028895999999999994,
"loss": 1.8371,
"step": 690
},
{
"epoch": 0.56,
"grad_norm": 1.0014268159866333,
"learning_rate": 0.00028879999999999997,
"loss": 1.9004,
"step": 700
},
{
"epoch": 0.568,
"grad_norm": 1.0764245986938477,
"learning_rate": 0.00028863999999999995,
"loss": 1.9347,
"step": 710
},
{
"epoch": 0.576,
"grad_norm": 1.0075087547302246,
"learning_rate": 0.00028848,
"loss": 1.8226,
"step": 720
},
{
"epoch": 0.584,
"grad_norm": 1.0563082695007324,
"learning_rate": 0.00028831999999999995,
"loss": 1.8147,
"step": 730
},
{
"epoch": 0.592,
"grad_norm": 1.1010650396347046,
"learning_rate": 0.00028816,
"loss": 1.9306,
"step": 740
},
{
"epoch": 0.6,
"grad_norm": 0.9899283647537231,
"learning_rate": 0.00028799999999999995,
"loss": 1.885,
"step": 750
},
{
"epoch": 0.608,
"grad_norm": 1.0245839357376099,
"learning_rate": 0.00028784,
"loss": 1.8532,
"step": 760
},
{
"epoch": 0.616,
"grad_norm": 1.056541085243225,
"learning_rate": 0.00028767999999999996,
"loss": 1.8861,
"step": 770
},
{
"epoch": 0.624,
"grad_norm": 0.9766470193862915,
"learning_rate": 0.00028752,
"loss": 1.8241,
"step": 780
},
{
"epoch": 0.632,
"grad_norm": 1.10284423828125,
"learning_rate": 0.00028735999999999996,
"loss": 1.7675,
"step": 790
},
{
"epoch": 0.64,
"grad_norm": 1.080234408378601,
"learning_rate": 0.0002872,
"loss": 1.8204,
"step": 800
},
{
"epoch": 0.648,
"grad_norm": 1.0814071893692017,
"learning_rate": 0.00028703999999999996,
"loss": 1.8619,
"step": 810
},
{
"epoch": 0.656,
"grad_norm": 0.9824687838554382,
"learning_rate": 0.00028688,
"loss": 1.9017,
"step": 820
},
{
"epoch": 0.664,
"grad_norm": 1.0177820920944214,
"learning_rate": 0.00028671999999999997,
"loss": 1.8842,
"step": 830
},
{
"epoch": 0.672,
"grad_norm": 0.9703278541564941,
"learning_rate": 0.00028656,
"loss": 1.916,
"step": 840
},
{
"epoch": 0.68,
"grad_norm": 1.0800108909606934,
"learning_rate": 0.00028639999999999997,
"loss": 1.8274,
"step": 850
},
{
"epoch": 0.688,
"grad_norm": 1.0110689401626587,
"learning_rate": 0.00028624,
"loss": 1.8077,
"step": 860
},
{
"epoch": 0.696,
"grad_norm": 1.091354250907898,
"learning_rate": 0.00028607999999999997,
"loss": 1.8971,
"step": 870
},
{
"epoch": 0.704,
"grad_norm": 1.0147050619125366,
"learning_rate": 0.00028591999999999995,
"loss": 1.8365,
"step": 880
},
{
"epoch": 0.712,
"grad_norm": 1.0930813550949097,
"learning_rate": 0.00028576,
"loss": 1.7962,
"step": 890
},
{
"epoch": 0.72,
"grad_norm": 1.0309563875198364,
"learning_rate": 0.00028559999999999995,
"loss": 1.808,
"step": 900
},
{
"epoch": 0.728,
"grad_norm": 1.0878843069076538,
"learning_rate": 0.00028544,
"loss": 1.8481,
"step": 910
},
{
"epoch": 0.736,
"grad_norm": 1.039565086364746,
"learning_rate": 0.00028527999999999995,
"loss": 1.8475,
"step": 920
},
{
"epoch": 0.744,
"grad_norm": 0.9955683350563049,
"learning_rate": 0.00028512,
"loss": 1.8577,
"step": 930
},
{
"epoch": 0.752,
"grad_norm": 0.9792163372039795,
"learning_rate": 0.00028495999999999996,
"loss": 1.8577,
"step": 940
},
{
"epoch": 0.76,
"grad_norm": 1.0933603048324585,
"learning_rate": 0.0002848,
"loss": 1.8941,
"step": 950
},
{
"epoch": 0.768,
"grad_norm": 1.0719082355499268,
"learning_rate": 0.00028463999999999996,
"loss": 1.8739,
"step": 960
},
{
"epoch": 0.776,
"grad_norm": 1.039011836051941,
"learning_rate": 0.00028448,
"loss": 1.8526,
"step": 970
},
{
"epoch": 0.784,
"grad_norm": 1.1158881187438965,
"learning_rate": 0.00028431999999999996,
"loss": 1.8001,
"step": 980
},
{
"epoch": 0.792,
"grad_norm": 0.9756163954734802,
"learning_rate": 0.00028416,
"loss": 1.8211,
"step": 990
},
{
"epoch": 0.8,
"grad_norm": 1.0662978887557983,
"learning_rate": 0.00028399999999999996,
"loss": 1.8549,
"step": 1000
},
{
"epoch": 0.808,
"grad_norm": 1.060304880142212,
"learning_rate": 0.00028384,
"loss": 1.8516,
"step": 1010
},
{
"epoch": 0.816,
"grad_norm": 1.0433423519134521,
"learning_rate": 0.00028367999999999997,
"loss": 1.8146,
"step": 1020
},
{
"epoch": 0.824,
"grad_norm": 1.0191080570220947,
"learning_rate": 0.00028352,
"loss": 1.8172,
"step": 1030
},
{
"epoch": 0.832,
"grad_norm": 1.0157259702682495,
"learning_rate": 0.00028335999999999997,
"loss": 1.8096,
"step": 1040
},
{
"epoch": 0.84,
"grad_norm": 1.0125967264175415,
"learning_rate": 0.00028319999999999994,
"loss": 1.7954,
"step": 1050
},
{
"epoch": 0.848,
"grad_norm": 1.0847101211547852,
"learning_rate": 0.00028304,
"loss": 1.7961,
"step": 1060
},
{
"epoch": 0.856,
"grad_norm": 0.9798891544342041,
"learning_rate": 0.00028287999999999995,
"loss": 1.8246,
"step": 1070
},
{
"epoch": 0.864,
"grad_norm": 0.9857827425003052,
"learning_rate": 0.00028272,
"loss": 1.8989,
"step": 1080
},
{
"epoch": 0.872,
"grad_norm": 0.9614414572715759,
"learning_rate": 0.00028255999999999995,
"loss": 1.9081,
"step": 1090
},
{
"epoch": 0.88,
"grad_norm": 0.9770805835723877,
"learning_rate": 0.0002824,
"loss": 1.8396,
"step": 1100
},
{
"epoch": 0.888,
"grad_norm": 1.0100719928741455,
"learning_rate": 0.00028223999999999995,
"loss": 1.8233,
"step": 1110
},
{
"epoch": 0.896,
"grad_norm": 0.9945518970489502,
"learning_rate": 0.00028208,
"loss": 1.8163,
"step": 1120
},
{
"epoch": 0.904,
"grad_norm": 1.0281423330307007,
"learning_rate": 0.00028191999999999996,
"loss": 1.8317,
"step": 1130
},
{
"epoch": 0.912,
"grad_norm": 1.0575731992721558,
"learning_rate": 0.00028176,
"loss": 1.8673,
"step": 1140
},
{
"epoch": 0.92,
"grad_norm": 1.1658177375793457,
"learning_rate": 0.00028159999999999996,
"loss": 1.827,
"step": 1150
},
{
"epoch": 0.928,
"grad_norm": 1.0432631969451904,
"learning_rate": 0.00028144,
"loss": 1.8164,
"step": 1160
},
{
"epoch": 0.936,
"grad_norm": 1.0257468223571777,
"learning_rate": 0.00028127999999999996,
"loss": 1.8025,
"step": 1170
},
{
"epoch": 0.944,
"grad_norm": 1.1194055080413818,
"learning_rate": 0.00028112,
"loss": 1.8514,
"step": 1180
},
{
"epoch": 0.952,
"grad_norm": 1.0339341163635254,
"learning_rate": 0.00028095999999999997,
"loss": 1.7735,
"step": 1190
},
{
"epoch": 0.96,
"grad_norm": 0.9593726396560669,
"learning_rate": 0.0002808,
"loss": 1.8003,
"step": 1200
},
{
"epoch": 0.968,
"grad_norm": 0.9705820083618164,
"learning_rate": 0.00028063999999999997,
"loss": 1.7788,
"step": 1210
},
{
"epoch": 0.976,
"grad_norm": 1.0924532413482666,
"learning_rate": 0.00028047999999999994,
"loss": 1.7911,
"step": 1220
},
{
"epoch": 0.984,
"grad_norm": 1.0870336294174194,
"learning_rate": 0.00028031999999999997,
"loss": 1.777,
"step": 1230
},
{
"epoch": 0.992,
"grad_norm": 1.0212570428848267,
"learning_rate": 0.00028015999999999995,
"loss": 1.8329,
"step": 1240
},
{
"epoch": 1.0,
"grad_norm": 0.9898034334182739,
"learning_rate": 0.00028,
"loss": 1.803,
"step": 1250
},
{
"epoch": 1.008,
"grad_norm": 1.1636098623275757,
"learning_rate": 0.00027983999999999995,
"loss": 1.7512,
"step": 1260
},
{
"epoch": 1.016,
"grad_norm": 1.122517704963684,
"learning_rate": 0.00027968,
"loss": 1.7579,
"step": 1270
},
{
"epoch": 1.024,
"grad_norm": 1.1521267890930176,
"learning_rate": 0.00027951999999999995,
"loss": 1.6773,
"step": 1280
},
{
"epoch": 1.032,
"grad_norm": 1.164711833000183,
"learning_rate": 0.00027936,
"loss": 1.6539,
"step": 1290
},
{
"epoch": 1.04,
"grad_norm": 1.0965043306350708,
"learning_rate": 0.00027919999999999996,
"loss": 1.7126,
"step": 1300
},
{
"epoch": 1.048,
"grad_norm": 1.2235987186431885,
"learning_rate": 0.00027904,
"loss": 1.6967,
"step": 1310
},
{
"epoch": 1.056,
"grad_norm": 1.1083018779754639,
"learning_rate": 0.00027887999999999996,
"loss": 1.7282,
"step": 1320
},
{
"epoch": 1.064,
"grad_norm": 1.1210997104644775,
"learning_rate": 0.00027872,
"loss": 1.7371,
"step": 1330
},
{
"epoch": 1.072,
"grad_norm": 1.1816761493682861,
"learning_rate": 0.00027855999999999996,
"loss": 1.7099,
"step": 1340
},
{
"epoch": 1.08,
"grad_norm": 1.1083471775054932,
"learning_rate": 0.0002784,
"loss": 1.7029,
"step": 1350
},
{
"epoch": 1.088,
"grad_norm": 1.1974619626998901,
"learning_rate": 0.00027823999999999997,
"loss": 1.676,
"step": 1360
},
{
"epoch": 1.096,
"grad_norm": 1.1856564283370972,
"learning_rate": 0.00027808,
"loss": 1.7675,
"step": 1370
},
{
"epoch": 1.104,
"grad_norm": 1.1293108463287354,
"learning_rate": 0.00027791999999999997,
"loss": 1.6936,
"step": 1380
},
{
"epoch": 1.112,
"grad_norm": 1.1792447566986084,
"learning_rate": 0.00027775999999999994,
"loss": 1.7194,
"step": 1390
},
{
"epoch": 1.12,
"grad_norm": 1.1036359071731567,
"learning_rate": 0.00027759999999999997,
"loss": 1.6564,
"step": 1400
},
{
"epoch": 1.1280000000000001,
"grad_norm": 1.2215582132339478,
"learning_rate": 0.00027743999999999995,
"loss": 1.7064,
"step": 1410
},
{
"epoch": 1.1360000000000001,
"grad_norm": 1.1735379695892334,
"learning_rate": 0.00027728,
"loss": 1.7184,
"step": 1420
},
{
"epoch": 1.144,
"grad_norm": 1.1964507102966309,
"learning_rate": 0.00027711999999999995,
"loss": 1.7329,
"step": 1430
},
{
"epoch": 1.152,
"grad_norm": 1.138510823249817,
"learning_rate": 0.00027696,
"loss": 1.7096,
"step": 1440
},
{
"epoch": 1.16,
"grad_norm": 1.1308197975158691,
"learning_rate": 0.00027679999999999995,
"loss": 1.7457,
"step": 1450
},
{
"epoch": 1.168,
"grad_norm": 1.1567286252975464,
"learning_rate": 0.00027664,
"loss": 1.6813,
"step": 1460
},
{
"epoch": 1.176,
"grad_norm": 1.1560039520263672,
"learning_rate": 0.00027647999999999995,
"loss": 1.699,
"step": 1470
},
{
"epoch": 1.184,
"grad_norm": 1.230444073677063,
"learning_rate": 0.00027632,
"loss": 1.7782,
"step": 1480
},
{
"epoch": 1.192,
"grad_norm": 1.2430510520935059,
"learning_rate": 0.00027615999999999996,
"loss": 1.716,
"step": 1490
},
{
"epoch": 1.2,
"grad_norm": 1.1405155658721924,
"learning_rate": 0.000276,
"loss": 1.7072,
"step": 1500
},
{
"epoch": 1.208,
"grad_norm": 1.1308519840240479,
"learning_rate": 0.00027583999999999996,
"loss": 1.6952,
"step": 1510
},
{
"epoch": 1.216,
"grad_norm": 1.2301914691925049,
"learning_rate": 0.00027568,
"loss": 1.721,
"step": 1520
},
{
"epoch": 1.224,
"grad_norm": 1.2387229204177856,
"learning_rate": 0.00027551999999999996,
"loss": 1.6866,
"step": 1530
},
{
"epoch": 1.232,
"grad_norm": 1.070438027381897,
"learning_rate": 0.00027536,
"loss": 1.6882,
"step": 1540
},
{
"epoch": 1.24,
"grad_norm": 1.1818335056304932,
"learning_rate": 0.00027519999999999997,
"loss": 1.7479,
"step": 1550
},
{
"epoch": 1.248,
"grad_norm": 1.1129992008209229,
"learning_rate": 0.00027503999999999994,
"loss": 1.7563,
"step": 1560
},
{
"epoch": 1.256,
"grad_norm": 1.2298282384872437,
"learning_rate": 0.00027487999999999997,
"loss": 1.684,
"step": 1570
},
{
"epoch": 1.264,
"grad_norm": 1.2662142515182495,
"learning_rate": 0.00027471999999999994,
"loss": 1.7052,
"step": 1580
},
{
"epoch": 1.272,
"grad_norm": 1.2866853475570679,
"learning_rate": 0.00027456,
"loss": 1.7403,
"step": 1590
},
{
"epoch": 1.28,
"grad_norm": 1.225150227546692,
"learning_rate": 0.00027439999999999995,
"loss": 1.7141,
"step": 1600
},
{
"epoch": 1.288,
"grad_norm": 1.2254585027694702,
"learning_rate": 0.00027424,
"loss": 1.7558,
"step": 1610
},
{
"epoch": 1.296,
"grad_norm": 1.228905439376831,
"learning_rate": 0.00027407999999999995,
"loss": 1.7081,
"step": 1620
},
{
"epoch": 1.304,
"grad_norm": 1.190305471420288,
"learning_rate": 0.00027392,
"loss": 1.7173,
"step": 1630
},
{
"epoch": 1.312,
"grad_norm": 1.1080456972122192,
"learning_rate": 0.00027375999999999995,
"loss": 1.6771,
"step": 1640
},
{
"epoch": 1.32,
"grad_norm": 1.1341513395309448,
"learning_rate": 0.0002736,
"loss": 1.6865,
"step": 1650
},
{
"epoch": 1.328,
"grad_norm": 1.1582372188568115,
"learning_rate": 0.00027343999999999996,
"loss": 1.7083,
"step": 1660
},
{
"epoch": 1.336,
"grad_norm": 1.2780879735946655,
"learning_rate": 0.00027328,
"loss": 1.7348,
"step": 1670
},
{
"epoch": 1.3439999999999999,
"grad_norm": 1.1118934154510498,
"learning_rate": 0.00027311999999999996,
"loss": 1.763,
"step": 1680
},
{
"epoch": 1.3519999999999999,
"grad_norm": 1.2540453672409058,
"learning_rate": 0.00027296,
"loss": 1.7273,
"step": 1690
},
{
"epoch": 1.3599999999999999,
"grad_norm": 1.1426582336425781,
"learning_rate": 0.00027279999999999996,
"loss": 1.7098,
"step": 1700
},
{
"epoch": 1.3679999999999999,
"grad_norm": 1.1964046955108643,
"learning_rate": 0.00027264,
"loss": 1.709,
"step": 1710
},
{
"epoch": 1.376,
"grad_norm": 1.14896559715271,
"learning_rate": 0.00027247999999999997,
"loss": 1.7313,
"step": 1720
},
{
"epoch": 1.384,
"grad_norm": 1.202528953552246,
"learning_rate": 0.00027231999999999994,
"loss": 1.744,
"step": 1730
},
{
"epoch": 1.392,
"grad_norm": 1.1821858882904053,
"learning_rate": 0.00027215999999999997,
"loss": 1.6926,
"step": 1740
},
{
"epoch": 1.4,
"grad_norm": 1.2063863277435303,
"learning_rate": 0.00027199999999999994,
"loss": 1.6386,
"step": 1750
},
{
"epoch": 1.408,
"grad_norm": 1.2269303798675537,
"learning_rate": 0.00027183999999999997,
"loss": 1.7395,
"step": 1760
},
{
"epoch": 1.416,
"grad_norm": 1.1873764991760254,
"learning_rate": 0.00027167999999999995,
"loss": 1.7408,
"step": 1770
},
{
"epoch": 1.424,
"grad_norm": 1.1701534986495972,
"learning_rate": 0.00027152,
"loss": 1.6902,
"step": 1780
},
{
"epoch": 1.432,
"grad_norm": 1.2059394121170044,
"learning_rate": 0.00027135999999999995,
"loss": 1.7189,
"step": 1790
},
{
"epoch": 1.44,
"grad_norm": 1.2177969217300415,
"learning_rate": 0.0002712,
"loss": 1.6688,
"step": 1800
},
{
"epoch": 1.448,
"grad_norm": 1.1420925855636597,
"learning_rate": 0.00027103999999999995,
"loss": 1.7027,
"step": 1810
},
{
"epoch": 1.456,
"grad_norm": 1.1630126237869263,
"learning_rate": 0.00027088,
"loss": 1.7268,
"step": 1820
},
{
"epoch": 1.464,
"grad_norm": 1.1708976030349731,
"learning_rate": 0.00027071999999999996,
"loss": 1.6667,
"step": 1830
},
{
"epoch": 1.472,
"grad_norm": 1.1763298511505127,
"learning_rate": 0.00027056,
"loss": 1.7667,
"step": 1840
},
{
"epoch": 1.48,
"grad_norm": 1.1959589719772339,
"learning_rate": 0.00027039999999999996,
"loss": 1.682,
"step": 1850
},
{
"epoch": 1.488,
"grad_norm": 1.187795639038086,
"learning_rate": 0.00027024,
"loss": 1.7078,
"step": 1860
},
{
"epoch": 1.496,
"grad_norm": 1.1146178245544434,
"learning_rate": 0.00027007999999999996,
"loss": 1.6864,
"step": 1870
},
{
"epoch": 1.504,
"grad_norm": 1.1661298274993896,
"learning_rate": 0.00026992,
"loss": 1.7206,
"step": 1880
},
{
"epoch": 1.512,
"grad_norm": 1.1348265409469604,
"learning_rate": 0.00026975999999999997,
"loss": 1.6971,
"step": 1890
},
{
"epoch": 1.52,
"grad_norm": 1.2029168605804443,
"learning_rate": 0.00026959999999999994,
"loss": 1.7435,
"step": 1900
},
{
"epoch": 1.528,
"grad_norm": 1.2038522958755493,
"learning_rate": 0.00026943999999999997,
"loss": 1.6938,
"step": 1910
},
{
"epoch": 1.536,
"grad_norm": 1.1772645711898804,
"learning_rate": 0.00026927999999999994,
"loss": 1.6974,
"step": 1920
},
{
"epoch": 1.544,
"grad_norm": 1.2052574157714844,
"learning_rate": 0.00026911999999999997,
"loss": 1.6798,
"step": 1930
},
{
"epoch": 1.552,
"grad_norm": 1.22791588306427,
"learning_rate": 0.00026895999999999995,
"loss": 1.7074,
"step": 1940
},
{
"epoch": 1.56,
"grad_norm": 1.0809330940246582,
"learning_rate": 0.0002688,
"loss": 1.7332,
"step": 1950
},
{
"epoch": 1.568,
"grad_norm": 1.2375030517578125,
"learning_rate": 0.00026863999999999995,
"loss": 1.7188,
"step": 1960
},
{
"epoch": 1.576,
"grad_norm": 1.1218806505203247,
"learning_rate": 0.00026848,
"loss": 1.7222,
"step": 1970
},
{
"epoch": 1.584,
"grad_norm": 1.1987130641937256,
"learning_rate": 0.00026831999999999995,
"loss": 1.6705,
"step": 1980
},
{
"epoch": 1.592,
"grad_norm": 1.1293755769729614,
"learning_rate": 0.00026816,
"loss": 1.7911,
"step": 1990
},
{
"epoch": 1.6,
"grad_norm": 1.1469671726226807,
"learning_rate": 0.00026799999999999995,
"loss": 1.7355,
"step": 2000
},
{
"epoch": 1.608,
"grad_norm": 1.2343659400939941,
"learning_rate": 0.00026784,
"loss": 1.6655,
"step": 2010
},
{
"epoch": 1.616,
"grad_norm": 1.1669197082519531,
"learning_rate": 0.00026767999999999996,
"loss": 1.7655,
"step": 2020
},
{
"epoch": 1.624,
"grad_norm": 1.1948648691177368,
"learning_rate": 0.00026752,
"loss": 1.5903,
"step": 2030
},
{
"epoch": 1.6320000000000001,
"grad_norm": 1.210276484489441,
"learning_rate": 0.00026735999999999996,
"loss": 1.6919,
"step": 2040
},
{
"epoch": 1.6400000000000001,
"grad_norm": 1.1474298238754272,
"learning_rate": 0.0002672,
"loss": 1.7315,
"step": 2050
},
{
"epoch": 1.6480000000000001,
"grad_norm": 1.1558197736740112,
"learning_rate": 0.00026703999999999996,
"loss": 1.7004,
"step": 2060
},
{
"epoch": 1.6560000000000001,
"grad_norm": 1.2014431953430176,
"learning_rate": 0.00026687999999999994,
"loss": 1.7262,
"step": 2070
},
{
"epoch": 1.6640000000000001,
"grad_norm": 1.1946237087249756,
"learning_rate": 0.00026671999999999997,
"loss": 1.7575,
"step": 2080
},
{
"epoch": 1.6720000000000002,
"grad_norm": 1.096993088722229,
"learning_rate": 0.00026655999999999994,
"loss": 1.7049,
"step": 2090
},
{
"epoch": 1.6800000000000002,
"grad_norm": 1.136132001876831,
"learning_rate": 0.00026639999999999997,
"loss": 1.7116,
"step": 2100
},
{
"epoch": 1.688,
"grad_norm": 1.1487154960632324,
"learning_rate": 0.00026623999999999994,
"loss": 1.6711,
"step": 2110
},
{
"epoch": 1.696,
"grad_norm": 1.2251691818237305,
"learning_rate": 0.00026608,
"loss": 1.7647,
"step": 2120
},
{
"epoch": 1.704,
"grad_norm": 1.1736303567886353,
"learning_rate": 0.00026591999999999995,
"loss": 1.7074,
"step": 2130
},
{
"epoch": 1.712,
"grad_norm": 1.1187472343444824,
"learning_rate": 0.00026576,
"loss": 1.6904,
"step": 2140
},
{
"epoch": 1.72,
"grad_norm": 1.2309964895248413,
"learning_rate": 0.00026559999999999995,
"loss": 1.6816,
"step": 2150
},
{
"epoch": 1.728,
"grad_norm": 1.182122826576233,
"learning_rate": 0.00026544,
"loss": 1.7505,
"step": 2160
},
{
"epoch": 1.736,
"grad_norm": 1.1426887512207031,
"learning_rate": 0.00026527999999999995,
"loss": 1.7415,
"step": 2170
},
{
"epoch": 1.744,
"grad_norm": 1.1243617534637451,
"learning_rate": 0.00026512,
"loss": 1.7427,
"step": 2180
},
{
"epoch": 1.752,
"grad_norm": 1.1814117431640625,
"learning_rate": 0.00026495999999999996,
"loss": 1.6717,
"step": 2190
},
{
"epoch": 1.76,
"grad_norm": 1.1558399200439453,
"learning_rate": 0.0002648,
"loss": 1.7521,
"step": 2200
},
{
"epoch": 1.768,
"grad_norm": 1.1759192943572998,
"learning_rate": 0.00026463999999999996,
"loss": 1.6922,
"step": 2210
},
{
"epoch": 1.776,
"grad_norm": 1.213027834892273,
"learning_rate": 0.00026448,
"loss": 1.6918,
"step": 2220
},
{
"epoch": 1.784,
"grad_norm": 1.1476492881774902,
"learning_rate": 0.00026431999999999996,
"loss": 1.7319,
"step": 2230
},
{
"epoch": 1.792,
"grad_norm": 1.171706199645996,
"learning_rate": 0.00026415999999999994,
"loss": 1.7126,
"step": 2240
},
{
"epoch": 1.8,
"grad_norm": 1.2222481966018677,
"learning_rate": 0.00026399999999999997,
"loss": 1.666,
"step": 2250
},
{
"epoch": 1.808,
"grad_norm": 1.1074283123016357,
"learning_rate": 0.00026383999999999994,
"loss": 1.7136,
"step": 2260
},
{
"epoch": 1.8159999999999998,
"grad_norm": 1.0644099712371826,
"learning_rate": 0.00026367999999999997,
"loss": 1.7343,
"step": 2270
},
{
"epoch": 1.8239999999999998,
"grad_norm": 1.1833316087722778,
"learning_rate": 0.00026351999999999994,
"loss": 1.6886,
"step": 2280
},
{
"epoch": 1.8319999999999999,
"grad_norm": 1.2397806644439697,
"learning_rate": 0.00026335999999999997,
"loss": 1.7136,
"step": 2290
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.6187876462936401,
"learning_rate": 0.00026319999999999995,
"loss": 1.5873,
"step": 2300
},
{
"epoch": 1.8479999999999999,
"grad_norm": 1.141440749168396,
"learning_rate": 0.00026304,
"loss": 1.6937,
"step": 2310
},
{
"epoch": 1.8559999999999999,
"grad_norm": 1.1803792715072632,
"learning_rate": 0.00026287999999999995,
"loss": 1.8136,
"step": 2320
},
{
"epoch": 1.8639999999999999,
"grad_norm": 1.1979511976242065,
"learning_rate": 0.00026272,
"loss": 1.7426,
"step": 2330
},
{
"epoch": 1.8719999999999999,
"grad_norm": 1.1708755493164062,
"learning_rate": 0.00026255999999999995,
"loss": 1.7532,
"step": 2340
},
{
"epoch": 1.88,
"grad_norm": 1.0788543224334717,
"learning_rate": 0.0002624,
"loss": 1.7435,
"step": 2350
},
{
"epoch": 1.888,
"grad_norm": 1.1670109033584595,
"learning_rate": 0.00026223999999999996,
"loss": 1.7338,
"step": 2360
},
{
"epoch": 1.896,
"grad_norm": 1.1337978839874268,
"learning_rate": 0.00026208,
"loss": 1.7508,
"step": 2370
},
{
"epoch": 1.904,
"grad_norm": 1.131404995918274,
"learning_rate": 0.00026191999999999996,
"loss": 1.7321,
"step": 2380
},
{
"epoch": 1.912,
"grad_norm": 1.1655117273330688,
"learning_rate": 0.00026176,
"loss": 1.7629,
"step": 2390
},
{
"epoch": 1.92,
"grad_norm": 1.1582902669906616,
"learning_rate": 0.00026159999999999996,
"loss": 1.7083,
"step": 2400
},
{
"epoch": 1.928,
"grad_norm": 1.181884765625,
"learning_rate": 0.00026143999999999994,
"loss": 1.6752,
"step": 2410
},
{
"epoch": 1.936,
"grad_norm": 1.1487571001052856,
"learning_rate": 0.00026127999999999996,
"loss": 1.6785,
"step": 2420
},
{
"epoch": 1.944,
"grad_norm": 1.2264763116836548,
"learning_rate": 0.00026111999999999994,
"loss": 1.7237,
"step": 2430
},
{
"epoch": 1.952,
"grad_norm": 1.285232424736023,
"learning_rate": 0.00026095999999999997,
"loss": 1.7148,
"step": 2440
},
{
"epoch": 1.96,
"grad_norm": 1.2243884801864624,
"learning_rate": 0.00026079999999999994,
"loss": 1.7305,
"step": 2450
},
{
"epoch": 1.968,
"grad_norm": 1.2969841957092285,
"learning_rate": 0.00026063999999999997,
"loss": 1.7271,
"step": 2460
},
{
"epoch": 1.976,
"grad_norm": 1.1378511190414429,
"learning_rate": 0.00026047999999999995,
"loss": 1.7425,
"step": 2470
},
{
"epoch": 1.984,
"grad_norm": 1.1501156091690063,
"learning_rate": 0.00026032,
"loss": 1.712,
"step": 2480
},
{
"epoch": 1.992,
"grad_norm": 1.2089420557022095,
"learning_rate": 0.00026015999999999995,
"loss": 1.7164,
"step": 2490
},
{
"epoch": 2.0,
"grad_norm": 1.1784271001815796,
"learning_rate": 0.00026,
"loss": 1.736,
"step": 2500
},
{
"epoch": 2.008,
"grad_norm": 1.1402006149291992,
"learning_rate": 0.00025983999999999995,
"loss": 1.5834,
"step": 2510
},
{
"epoch": 2.016,
"grad_norm": 1.1883610486984253,
"learning_rate": 0.00025968,
"loss": 1.5888,
"step": 2520
},
{
"epoch": 2.024,
"grad_norm": 1.3511167764663696,
"learning_rate": 0.00025951999999999995,
"loss": 1.5551,
"step": 2530
},
{
"epoch": 2.032,
"grad_norm": 1.2824231386184692,
"learning_rate": 0.00025936,
"loss": 1.5119,
"step": 2540
},
{
"epoch": 2.04,
"grad_norm": 1.3076261281967163,
"learning_rate": 0.00025919999999999996,
"loss": 1.5044,
"step": 2550
},
{
"epoch": 2.048,
"grad_norm": 1.3731348514556885,
"learning_rate": 0.00025904,
"loss": 1.5585,
"step": 2560
},
{
"epoch": 2.056,
"grad_norm": 1.3174951076507568,
"learning_rate": 0.00025887999999999996,
"loss": 1.5591,
"step": 2570
},
{
"epoch": 2.064,
"grad_norm": 1.3365912437438965,
"learning_rate": 0.00025872,
"loss": 1.5357,
"step": 2580
},
{
"epoch": 2.072,
"grad_norm": 1.3091166019439697,
"learning_rate": 0.00025855999999999996,
"loss": 1.5232,
"step": 2590
},
{
"epoch": 2.08,
"grad_norm": 1.372147560119629,
"learning_rate": 0.00025839999999999994,
"loss": 1.5427,
"step": 2600
},
{
"epoch": 2.088,
"grad_norm": 1.412030577659607,
"learning_rate": 0.00025823999999999997,
"loss": 1.5851,
"step": 2610
},
{
"epoch": 2.096,
"grad_norm": 1.3613168001174927,
"learning_rate": 0.00025807999999999994,
"loss": 1.5283,
"step": 2620
},
{
"epoch": 2.104,
"grad_norm": 1.3381197452545166,
"learning_rate": 0.00025791999999999997,
"loss": 1.5812,
"step": 2630
},
{
"epoch": 2.112,
"grad_norm": 1.3254282474517822,
"learning_rate": 0.00025775999999999994,
"loss": 1.5016,
"step": 2640
},
{
"epoch": 2.12,
"grad_norm": 1.410579800605774,
"learning_rate": 0.0002576,
"loss": 1.5876,
"step": 2650
},
{
"epoch": 2.128,
"grad_norm": 1.3652007579803467,
"learning_rate": 0.00025743999999999995,
"loss": 1.6116,
"step": 2660
},
{
"epoch": 2.136,
"grad_norm": 1.4137206077575684,
"learning_rate": 0.00025728,
"loss": 1.5736,
"step": 2670
},
{
"epoch": 2.144,
"grad_norm": 1.3824632167816162,
"learning_rate": 0.00025711999999999995,
"loss": 1.5523,
"step": 2680
},
{
"epoch": 2.152,
"grad_norm": 1.4239449501037598,
"learning_rate": 0.00025696,
"loss": 1.5221,
"step": 2690
},
{
"epoch": 2.16,
"grad_norm": 1.4395557641983032,
"learning_rate": 0.00025679999999999995,
"loss": 1.5161,
"step": 2700
},
{
"epoch": 2.168,
"grad_norm": 1.5318158864974976,
"learning_rate": 0.00025664,
"loss": 1.5784,
"step": 2710
},
{
"epoch": 2.176,
"grad_norm": 1.3760408163070679,
"learning_rate": 0.00025647999999999996,
"loss": 1.567,
"step": 2720
},
{
"epoch": 2.184,
"grad_norm": 1.4429463148117065,
"learning_rate": 0.00025632,
"loss": 1.5653,
"step": 2730
},
{
"epoch": 2.192,
"grad_norm": 1.424181342124939,
"learning_rate": 0.00025615999999999996,
"loss": 1.5378,
"step": 2740
},
{
"epoch": 2.2,
"grad_norm": 1.4517723321914673,
"learning_rate": 0.000256,
"loss": 1.5738,
"step": 2750
},
{
"epoch": 2.208,
"grad_norm": 1.455818772315979,
"learning_rate": 0.00025583999999999996,
"loss": 1.5117,
"step": 2760
},
{
"epoch": 2.216,
"grad_norm": 1.3915988206863403,
"learning_rate": 0.00025567999999999994,
"loss": 1.5682,
"step": 2770
},
{
"epoch": 2.224,
"grad_norm": 1.3807563781738281,
"learning_rate": 0.00025551999999999997,
"loss": 1.5618,
"step": 2780
},
{
"epoch": 2.232,
"grad_norm": 1.3413660526275635,
"learning_rate": 0.00025535999999999994,
"loss": 1.5724,
"step": 2790
},
{
"epoch": 2.24,
"grad_norm": 1.3377097845077515,
"learning_rate": 0.00025519999999999997,
"loss": 1.5662,
"step": 2800
},
{
"epoch": 2.248,
"grad_norm": 1.3897491693496704,
"learning_rate": 0.00025503999999999994,
"loss": 1.5431,
"step": 2810
},
{
"epoch": 2.2560000000000002,
"grad_norm": 1.4780755043029785,
"learning_rate": 0.00025487999999999997,
"loss": 1.5657,
"step": 2820
},
{
"epoch": 2.2640000000000002,
"grad_norm": 1.3818844556808472,
"learning_rate": 0.00025471999999999995,
"loss": 1.5399,
"step": 2830
},
{
"epoch": 2.2720000000000002,
"grad_norm": 1.4447002410888672,
"learning_rate": 0.00025456,
"loss": 1.5729,
"step": 2840
},
{
"epoch": 2.2800000000000002,
"grad_norm": 1.381330132484436,
"learning_rate": 0.00025439999999999995,
"loss": 1.52,
"step": 2850
},
{
"epoch": 2.288,
"grad_norm": 1.418094277381897,
"learning_rate": 0.00025424,
"loss": 1.5954,
"step": 2860
},
{
"epoch": 2.296,
"grad_norm": 1.329988718032837,
"learning_rate": 0.00025407999999999995,
"loss": 1.5852,
"step": 2870
},
{
"epoch": 2.304,
"grad_norm": 1.3431826829910278,
"learning_rate": 0.00025392,
"loss": 1.5482,
"step": 2880
},
{
"epoch": 2.312,
"grad_norm": 1.4532684087753296,
"learning_rate": 0.00025375999999999996,
"loss": 1.4967,
"step": 2890
},
{
"epoch": 2.32,
"grad_norm": 1.3491160869598389,
"learning_rate": 0.0002536,
"loss": 1.5145,
"step": 2900
},
{
"epoch": 2.328,
"grad_norm": 1.3651959896087646,
"learning_rate": 0.00025343999999999996,
"loss": 1.5726,
"step": 2910
},
{
"epoch": 2.336,
"grad_norm": 1.4137355089187622,
"learning_rate": 0.00025328,
"loss": 1.5646,
"step": 2920
},
{
"epoch": 2.344,
"grad_norm": 1.4950937032699585,
"learning_rate": 0.00025311999999999996,
"loss": 1.5653,
"step": 2930
},
{
"epoch": 2.352,
"grad_norm": 1.3360849618911743,
"learning_rate": 0.00025295999999999994,
"loss": 1.5669,
"step": 2940
},
{
"epoch": 2.36,
"grad_norm": 1.4283881187438965,
"learning_rate": 0.00025279999999999996,
"loss": 1.5729,
"step": 2950
},
{
"epoch": 2.368,
"grad_norm": 1.40790855884552,
"learning_rate": 0.00025263999999999994,
"loss": 1.54,
"step": 2960
},
{
"epoch": 2.376,
"grad_norm": 1.5222750902175903,
"learning_rate": 0.00025247999999999997,
"loss": 1.6079,
"step": 2970
},
{
"epoch": 2.384,
"grad_norm": 1.424391746520996,
"learning_rate": 0.00025231999999999994,
"loss": 1.5835,
"step": 2980
},
{
"epoch": 2.392,
"grad_norm": 1.4419969320297241,
"learning_rate": 0.00025215999999999997,
"loss": 1.6546,
"step": 2990
},
{
"epoch": 2.4,
"grad_norm": 1.3506951332092285,
"learning_rate": 0.00025199999999999995,
"loss": 1.5856,
"step": 3000
},
{
"epoch": 2.408,
"grad_norm": 1.3341702222824097,
"learning_rate": 0.00025184,
"loss": 1.5933,
"step": 3010
},
{
"epoch": 2.416,
"grad_norm": 1.3723673820495605,
"learning_rate": 0.00025167999999999995,
"loss": 1.602,
"step": 3020
},
{
"epoch": 2.424,
"grad_norm": 1.4064242839813232,
"learning_rate": 0.00025152,
"loss": 1.5438,
"step": 3030
},
{
"epoch": 2.432,
"grad_norm": 1.3700838088989258,
"learning_rate": 0.00025135999999999995,
"loss": 1.6322,
"step": 3040
},
{
"epoch": 2.44,
"grad_norm": 1.4045076370239258,
"learning_rate": 0.0002512,
"loss": 1.5728,
"step": 3050
},
{
"epoch": 2.448,
"grad_norm": 1.4885849952697754,
"learning_rate": 0.00025103999999999995,
"loss": 1.6177,
"step": 3060
},
{
"epoch": 2.456,
"grad_norm": 1.4054323434829712,
"learning_rate": 0.00025088,
"loss": 1.5579,
"step": 3070
},
{
"epoch": 2.464,
"grad_norm": 1.4171288013458252,
"learning_rate": 0.00025071999999999996,
"loss": 1.6058,
"step": 3080
},
{
"epoch": 2.472,
"grad_norm": 1.3950269222259521,
"learning_rate": 0.00025056,
"loss": 1.5906,
"step": 3090
},
{
"epoch": 2.48,
"grad_norm": 1.3375904560089111,
"learning_rate": 0.00025039999999999996,
"loss": 1.5984,
"step": 3100
},
{
"epoch": 2.488,
"grad_norm": 1.3980008363723755,
"learning_rate": 0.00025024,
"loss": 1.5724,
"step": 3110
},
{
"epoch": 2.496,
"grad_norm": 1.3917794227600098,
"learning_rate": 0.00025007999999999996,
"loss": 1.6085,
"step": 3120
},
{
"epoch": 2.504,
"grad_norm": 1.3524688482284546,
"learning_rate": 0.00024991999999999994,
"loss": 1.5734,
"step": 3130
},
{
"epoch": 2.512,
"grad_norm": 1.4597851037979126,
"learning_rate": 0.00024975999999999997,
"loss": 1.6001,
"step": 3140
},
{
"epoch": 2.52,
"grad_norm": 1.4018633365631104,
"learning_rate": 0.00024959999999999994,
"loss": 1.5761,
"step": 3150
},
{
"epoch": 2.528,
"grad_norm": 1.414162278175354,
"learning_rate": 0.00024943999999999997,
"loss": 1.5799,
"step": 3160
},
{
"epoch": 2.536,
"grad_norm": 1.3470393419265747,
"learning_rate": 0.00024927999999999994,
"loss": 1.5905,
"step": 3170
},
{
"epoch": 2.544,
"grad_norm": 1.350521445274353,
"learning_rate": 0.00024912,
"loss": 1.605,
"step": 3180
},
{
"epoch": 2.552,
"grad_norm": 1.4013463258743286,
"learning_rate": 0.00024895999999999995,
"loss": 1.5653,
"step": 3190
},
{
"epoch": 2.56,
"grad_norm": 1.3292449712753296,
"learning_rate": 0.0002488,
"loss": 1.5761,
"step": 3200
},
{
"epoch": 2.568,
"grad_norm": 1.2734830379486084,
"learning_rate": 0.00024863999999999995,
"loss": 1.6476,
"step": 3210
},
{
"epoch": 2.576,
"grad_norm": 1.4279605150222778,
"learning_rate": 0.00024848,
"loss": 1.5671,
"step": 3220
},
{
"epoch": 2.584,
"grad_norm": 1.4233906269073486,
"learning_rate": 0.00024831999999999995,
"loss": 1.5653,
"step": 3230
},
{
"epoch": 2.592,
"grad_norm": 1.393425703048706,
"learning_rate": 0.00024816,
"loss": 1.5605,
"step": 3240
},
{
"epoch": 2.6,
"grad_norm": 1.4003586769104004,
"learning_rate": 0.00024799999999999996,
"loss": 1.5775,
"step": 3250
},
{
"epoch": 2.608,
"grad_norm": 1.3909311294555664,
"learning_rate": 0.00024784,
"loss": 1.599,
"step": 3260
},
{
"epoch": 2.616,
"grad_norm": 1.3618372678756714,
"learning_rate": 0.00024767999999999996,
"loss": 1.5981,
"step": 3270
},
{
"epoch": 2.624,
"grad_norm": 1.3769896030426025,
"learning_rate": 0.00024752,
"loss": 1.6391,
"step": 3280
},
{
"epoch": 2.632,
"grad_norm": 1.2977598905563354,
"learning_rate": 0.00024735999999999996,
"loss": 1.6043,
"step": 3290
},
{
"epoch": 2.64,
"grad_norm": 1.3508882522583008,
"learning_rate": 0.0002472,
"loss": 1.5432,
"step": 3300
},
{
"epoch": 2.648,
"grad_norm": 1.411916732788086,
"learning_rate": 0.00024703999999999997,
"loss": 1.5889,
"step": 3310
},
{
"epoch": 2.656,
"grad_norm": 1.3107311725616455,
"learning_rate": 0.00024687999999999994,
"loss": 1.5618,
"step": 3320
},
{
"epoch": 2.664,
"grad_norm": 1.383974552154541,
"learning_rate": 0.00024671999999999997,
"loss": 1.5893,
"step": 3330
},
{
"epoch": 2.672,
"grad_norm": 1.3793646097183228,
"learning_rate": 0.00024655999999999994,
"loss": 1.5901,
"step": 3340
},
{
"epoch": 2.68,
"grad_norm": 1.405423879623413,
"learning_rate": 0.00024639999999999997,
"loss": 1.638,
"step": 3350
},
{
"epoch": 2.6879999999999997,
"grad_norm": 1.3815405368804932,
"learning_rate": 0.00024623999999999995,
"loss": 1.6037,
"step": 3360
},
{
"epoch": 2.6959999999999997,
"grad_norm": 1.297813057899475,
"learning_rate": 0.00024608,
"loss": 1.5632,
"step": 3370
},
{
"epoch": 2.7039999999999997,
"grad_norm": 1.3591309785842896,
"learning_rate": 0.00024591999999999995,
"loss": 1.5892,
"step": 3380
},
{
"epoch": 2.7119999999999997,
"grad_norm": 1.4379678964614868,
"learning_rate": 0.00024576,
"loss": 1.6029,
"step": 3390
},
{
"epoch": 2.7199999999999998,
"grad_norm": 1.4956458806991577,
"learning_rate": 0.00024559999999999995,
"loss": 1.5974,
"step": 3400
},
{
"epoch": 2.7279999999999998,
"grad_norm": 1.4072085618972778,
"learning_rate": 0.00024544,
"loss": 1.6431,
"step": 3410
},
{
"epoch": 2.7359999999999998,
"grad_norm": 1.28607177734375,
"learning_rate": 0.00024527999999999996,
"loss": 1.5868,
"step": 3420
},
{
"epoch": 2.7439999999999998,
"grad_norm": 1.5061297416687012,
"learning_rate": 0.00024512,
"loss": 1.613,
"step": 3430
},
{
"epoch": 2.752,
"grad_norm": 1.4274139404296875,
"learning_rate": 0.00024495999999999996,
"loss": 1.5507,
"step": 3440
},
{
"epoch": 2.76,
"grad_norm": 1.4335947036743164,
"learning_rate": 0.0002448,
"loss": 1.5557,
"step": 3450
},
{
"epoch": 2.768,
"grad_norm": 1.3052548170089722,
"learning_rate": 0.00024463999999999996,
"loss": 1.5779,
"step": 3460
},
{
"epoch": 2.776,
"grad_norm": 1.2695350646972656,
"learning_rate": 0.00024448,
"loss": 1.5587,
"step": 3470
},
{
"epoch": 2.784,
"grad_norm": 1.4060364961624146,
"learning_rate": 0.00024431999999999996,
"loss": 1.6129,
"step": 3480
},
{
"epoch": 2.792,
"grad_norm": 1.4803110361099243,
"learning_rate": 0.00024416,
"loss": 1.6236,
"step": 3490
},
{
"epoch": 2.8,
"grad_norm": 1.353215217590332,
"learning_rate": 0.000244,
"loss": 1.6323,
"step": 3500
},
{
"epoch": 2.808,
"grad_norm": 1.3456429243087769,
"learning_rate": 0.00024383999999999997,
"loss": 1.6635,
"step": 3510
},
{
"epoch": 2.816,
"grad_norm": 1.4098529815673828,
"learning_rate": 0.00024368,
"loss": 1.5523,
"step": 3520
},
{
"epoch": 2.824,
"grad_norm": 1.5074928998947144,
"learning_rate": 0.00024351999999999997,
"loss": 1.5736,
"step": 3530
},
{
"epoch": 2.832,
"grad_norm": 1.4895234107971191,
"learning_rate": 0.00024336,
"loss": 1.6004,
"step": 3540
},
{
"epoch": 2.84,
"grad_norm": 1.370694875717163,
"learning_rate": 0.00024319999999999998,
"loss": 1.5812,
"step": 3550
},
{
"epoch": 2.848,
"grad_norm": 1.3541662693023682,
"learning_rate": 0.00024303999999999998,
"loss": 1.6059,
"step": 3560
},
{
"epoch": 2.856,
"grad_norm": 1.3392258882522583,
"learning_rate": 0.00024287999999999998,
"loss": 1.5564,
"step": 3570
},
{
"epoch": 2.864,
"grad_norm": 1.4230504035949707,
"learning_rate": 0.00024271999999999998,
"loss": 1.6174,
"step": 3580
},
{
"epoch": 2.872,
"grad_norm": 1.3360211849212646,
"learning_rate": 0.00024255999999999998,
"loss": 1.5281,
"step": 3590
},
{
"epoch": 2.88,
"grad_norm": 1.4017730951309204,
"learning_rate": 0.00024239999999999998,
"loss": 1.5762,
"step": 3600
},
{
"epoch": 2.888,
"grad_norm": 1.4613922834396362,
"learning_rate": 0.00024223999999999998,
"loss": 1.6045,
"step": 3610
},
{
"epoch": 2.896,
"grad_norm": 1.458549976348877,
"learning_rate": 0.00024207999999999996,
"loss": 1.5711,
"step": 3620
},
{
"epoch": 2.904,
"grad_norm": 1.4020884037017822,
"learning_rate": 0.00024192,
"loss": 1.5384,
"step": 3630
},
{
"epoch": 2.912,
"grad_norm": 1.306881308555603,
"learning_rate": 0.00024175999999999996,
"loss": 1.6462,
"step": 3640
},
{
"epoch": 2.92,
"grad_norm": 1.417031168937683,
"learning_rate": 0.0002416,
"loss": 1.6108,
"step": 3650
},
{
"epoch": 2.928,
"grad_norm": 1.3262524604797363,
"learning_rate": 0.00024143999999999997,
"loss": 1.6286,
"step": 3660
},
{
"epoch": 2.936,
"grad_norm": 1.4688752889633179,
"learning_rate": 0.00024128,
"loss": 1.6187,
"step": 3670
},
{
"epoch": 2.944,
"grad_norm": 1.413116216659546,
"learning_rate": 0.00024111999999999997,
"loss": 1.5804,
"step": 3680
},
{
"epoch": 2.952,
"grad_norm": 1.3572710752487183,
"learning_rate": 0.00024096,
"loss": 1.5947,
"step": 3690
},
{
"epoch": 2.96,
"grad_norm": 1.4315195083618164,
"learning_rate": 0.00024079999999999997,
"loss": 1.5673,
"step": 3700
},
{
"epoch": 2.968,
"grad_norm": 1.455693244934082,
"learning_rate": 0.00024064,
"loss": 1.6311,
"step": 3710
},
{
"epoch": 2.976,
"grad_norm": 1.4192560911178589,
"learning_rate": 0.00024047999999999997,
"loss": 1.604,
"step": 3720
},
{
"epoch": 2.984,
"grad_norm": 1.4426895380020142,
"learning_rate": 0.00024032,
"loss": 1.6163,
"step": 3730
},
{
"epoch": 2.992,
"grad_norm": 1.3536198139190674,
"learning_rate": 0.00024015999999999998,
"loss": 1.6009,
"step": 3740
},
{
"epoch": 3.0,
"grad_norm": 1.4080064296722412,
"learning_rate": 0.00023999999999999998,
"loss": 1.5628,
"step": 3750
},
{
"epoch": 3.008,
"grad_norm": 1.457725167274475,
"learning_rate": 0.00023983999999999998,
"loss": 1.3987,
"step": 3760
},
{
"epoch": 3.016,
"grad_norm": 1.5731124877929688,
"learning_rate": 0.00023967999999999998,
"loss": 1.3801,
"step": 3770
},
{
"epoch": 3.024,
"grad_norm": 1.6412163972854614,
"learning_rate": 0.00023951999999999998,
"loss": 1.3839,
"step": 3780
},
{
"epoch": 3.032,
"grad_norm": 1.596416711807251,
"learning_rate": 0.00023935999999999996,
"loss": 1.3565,
"step": 3790
},
{
"epoch": 3.04,
"grad_norm": 1.645661473274231,
"learning_rate": 0.0002392,
"loss": 1.3738,
"step": 3800
},
{
"epoch": 3.048,
"grad_norm": 1.6392892599105835,
"learning_rate": 0.00023903999999999996,
"loss": 1.4528,
"step": 3810
},
{
"epoch": 3.056,
"grad_norm": 1.6278529167175293,
"learning_rate": 0.00023888,
"loss": 1.3921,
"step": 3820
},
{
"epoch": 3.064,
"grad_norm": 1.5314973592758179,
"learning_rate": 0.00023871999999999996,
"loss": 1.3978,
"step": 3830
},
{
"epoch": 3.072,
"grad_norm": 1.648292899131775,
"learning_rate": 0.00023856,
"loss": 1.3825,
"step": 3840
},
{
"epoch": 3.08,
"grad_norm": 1.5989805459976196,
"learning_rate": 0.00023839999999999997,
"loss": 1.4229,
"step": 3850
},
{
"epoch": 3.088,
"grad_norm": 1.581239104270935,
"learning_rate": 0.00023824,
"loss": 1.4038,
"step": 3860
},
{
"epoch": 3.096,
"grad_norm": 1.6047204732894897,
"learning_rate": 0.00023807999999999997,
"loss": 1.3652,
"step": 3870
},
{
"epoch": 3.104,
"grad_norm": 1.6853421926498413,
"learning_rate": 0.00023792,
"loss": 1.4256,
"step": 3880
},
{
"epoch": 3.112,
"grad_norm": 1.531514048576355,
"learning_rate": 0.00023775999999999997,
"loss": 1.4061,
"step": 3890
},
{
"epoch": 3.12,
"grad_norm": 1.6761980056762695,
"learning_rate": 0.0002376,
"loss": 1.4205,
"step": 3900
},
{
"epoch": 3.128,
"grad_norm": 1.6190063953399658,
"learning_rate": 0.00023743999999999998,
"loss": 1.434,
"step": 3910
},
{
"epoch": 3.136,
"grad_norm": 1.6470814943313599,
"learning_rate": 0.00023728,
"loss": 1.42,
"step": 3920
},
{
"epoch": 3.144,
"grad_norm": 1.5572431087493896,
"learning_rate": 0.00023711999999999998,
"loss": 1.418,
"step": 3930
},
{
"epoch": 3.152,
"grad_norm": 1.6244536638259888,
"learning_rate": 0.00023695999999999998,
"loss": 1.4415,
"step": 3940
},
{
"epoch": 3.16,
"grad_norm": 1.7610841989517212,
"learning_rate": 0.00023679999999999998,
"loss": 1.4249,
"step": 3950
},
{
"epoch": 3.168,
"grad_norm": 1.6935431957244873,
"learning_rate": 0.00023663999999999996,
"loss": 1.4373,
"step": 3960
},
{
"epoch": 3.176,
"grad_norm": 1.6628581285476685,
"learning_rate": 0.00023647999999999999,
"loss": 1.4168,
"step": 3970
},
{
"epoch": 3.184,
"grad_norm": 1.6654530763626099,
"learning_rate": 0.00023631999999999996,
"loss": 1.4028,
"step": 3980
},
{
"epoch": 3.192,
"grad_norm": 1.6582281589508057,
"learning_rate": 0.00023616,
"loss": 1.4062,
"step": 3990
},
{
"epoch": 3.2,
"grad_norm": 1.7206676006317139,
"learning_rate": 0.00023599999999999996,
"loss": 1.3972,
"step": 4000
},
{
"epoch": 3.208,
"grad_norm": 1.6492377519607544,
"learning_rate": 0.000235856,
"loss": 1.4392,
"step": 4010
},
{
"epoch": 3.216,
"grad_norm": 1.8048959970474243,
"learning_rate": 0.00023569599999999997,
"loss": 1.4523,
"step": 4020
},
{
"epoch": 3.224,
"grad_norm": 1.644860029220581,
"learning_rate": 0.000235536,
"loss": 1.4168,
"step": 4030
},
{
"epoch": 3.232,
"grad_norm": 1.666577696800232,
"learning_rate": 0.00023537599999999998,
"loss": 1.5143,
"step": 4040
},
{
"epoch": 3.24,
"grad_norm": 1.6702250242233276,
"learning_rate": 0.000235216,
"loss": 1.4573,
"step": 4050
},
{
"epoch": 3.248,
"grad_norm": 1.6784839630126953,
"learning_rate": 0.00023505599999999998,
"loss": 1.4388,
"step": 4060
},
{
"epoch": 3.2560000000000002,
"grad_norm": 1.6922202110290527,
"learning_rate": 0.000234896,
"loss": 1.4481,
"step": 4070
},
{
"epoch": 3.2640000000000002,
"grad_norm": 1.5728070735931396,
"learning_rate": 0.00023473599999999998,
"loss": 1.4358,
"step": 4080
},
{
"epoch": 3.2720000000000002,
"grad_norm": 1.6510705947875977,
"learning_rate": 0.00023457599999999996,
"loss": 1.4615,
"step": 4090
},
{
"epoch": 3.2800000000000002,
"grad_norm": 1.655216097831726,
"learning_rate": 0.00023441599999999999,
"loss": 1.4087,
"step": 4100
},
{
"epoch": 3.288,
"grad_norm": 1.6214429140090942,
"learning_rate": 0.00023425599999999996,
"loss": 1.3582,
"step": 4110
},
{
"epoch": 3.296,
"grad_norm": 1.5415211915969849,
"learning_rate": 0.000234096,
"loss": 1.438,
"step": 4120
},
{
"epoch": 3.304,
"grad_norm": 1.6406790018081665,
"learning_rate": 0.00023393599999999996,
"loss": 1.4781,
"step": 4130
},
{
"epoch": 3.312,
"grad_norm": 1.7424193620681763,
"learning_rate": 0.000233776,
"loss": 1.447,
"step": 4140
},
{
"epoch": 3.32,
"grad_norm": 1.5324851274490356,
"learning_rate": 0.00023361599999999997,
"loss": 1.4415,
"step": 4150
},
{
"epoch": 3.328,
"grad_norm": 1.6945812702178955,
"learning_rate": 0.000233456,
"loss": 1.4761,
"step": 4160
},
{
"epoch": 3.336,
"grad_norm": 1.7372822761535645,
"learning_rate": 0.00023329599999999997,
"loss": 1.4504,
"step": 4170
},
{
"epoch": 3.344,
"grad_norm": 1.6869308948516846,
"learning_rate": 0.000233136,
"loss": 1.4623,
"step": 4180
},
{
"epoch": 3.352,
"grad_norm": 1.7480005025863647,
"learning_rate": 0.00023299199999999998,
"loss": 1.4184,
"step": 4190
},
{
"epoch": 3.36,
"grad_norm": 1.570494532585144,
"learning_rate": 0.00023283199999999996,
"loss": 1.3832,
"step": 4200
},
{
"epoch": 3.368,
"grad_norm": 1.8143585920333862,
"learning_rate": 0.00023267199999999998,
"loss": 1.4252,
"step": 4210
},
{
"epoch": 3.376,
"grad_norm": 1.571781039237976,
"learning_rate": 0.00023251199999999996,
"loss": 1.4336,
"step": 4220
},
{
"epoch": 3.384,
"grad_norm": 1.6962851285934448,
"learning_rate": 0.000232352,
"loss": 1.4297,
"step": 4230
},
{
"epoch": 3.392,
"grad_norm": 1.6035798788070679,
"learning_rate": 0.00023219199999999996,
"loss": 1.451,
"step": 4240
},
{
"epoch": 3.4,
"grad_norm": 1.6665290594100952,
"learning_rate": 0.000232032,
"loss": 1.4281,
"step": 4250
},
{
"epoch": 3.408,
"grad_norm": 1.8126115798950195,
"learning_rate": 0.00023187199999999996,
"loss": 1.4711,
"step": 4260
},
{
"epoch": 3.416,
"grad_norm": 1.6531234979629517,
"learning_rate": 0.000231712,
"loss": 1.4257,
"step": 4270
},
{
"epoch": 3.424,
"grad_norm": 1.7809317111968994,
"learning_rate": 0.00023155199999999997,
"loss": 1.4205,
"step": 4280
},
{
"epoch": 3.432,
"grad_norm": 1.8113545179367065,
"learning_rate": 0.000231392,
"loss": 1.4269,
"step": 4290
},
{
"epoch": 3.44,
"grad_norm": 1.6733276844024658,
"learning_rate": 0.00023123199999999997,
"loss": 1.4178,
"step": 4300
},
{
"epoch": 3.448,
"grad_norm": 1.6381465196609497,
"learning_rate": 0.000231072,
"loss": 1.4926,
"step": 4310
},
{
"epoch": 3.456,
"grad_norm": 1.6401630640029907,
"learning_rate": 0.00023091199999999997,
"loss": 1.4817,
"step": 4320
},
{
"epoch": 3.464,
"grad_norm": 1.628445029258728,
"learning_rate": 0.000230752,
"loss": 1.4935,
"step": 4330
},
{
"epoch": 3.472,
"grad_norm": 1.632957935333252,
"learning_rate": 0.00023059199999999998,
"loss": 1.4787,
"step": 4340
},
{
"epoch": 3.48,
"grad_norm": 1.7037655115127563,
"learning_rate": 0.000230432,
"loss": 1.4547,
"step": 4350
},
{
"epoch": 3.488,
"grad_norm": 1.7122141122817993,
"learning_rate": 0.00023027199999999998,
"loss": 1.4314,
"step": 4360
},
{
"epoch": 3.496,
"grad_norm": 1.620144248008728,
"learning_rate": 0.000230112,
"loss": 1.4198,
"step": 4370
},
{
"epoch": 3.504,
"grad_norm": 1.604781150817871,
"learning_rate": 0.00022995199999999998,
"loss": 1.4648,
"step": 4380
},
{
"epoch": 3.512,
"grad_norm": 1.6798850297927856,
"learning_rate": 0.00022979199999999996,
"loss": 1.4556,
"step": 4390
},
{
"epoch": 3.52,
"grad_norm": 1.7711225748062134,
"learning_rate": 0.00022963199999999999,
"loss": 1.4741,
"step": 4400
},
{
"epoch": 3.528,
"grad_norm": 1.7546377182006836,
"learning_rate": 0.00022947199999999996,
"loss": 1.4186,
"step": 4410
},
{
"epoch": 3.536,
"grad_norm": 1.5374271869659424,
"learning_rate": 0.000229312,
"loss": 1.4726,
"step": 4420
},
{
"epoch": 3.544,
"grad_norm": 1.691049575805664,
"learning_rate": 0.00022915199999999996,
"loss": 1.4302,
"step": 4430
},
{
"epoch": 3.552,
"grad_norm": 1.8386030197143555,
"learning_rate": 0.000228992,
"loss": 1.4855,
"step": 4440
},
{
"epoch": 3.56,
"grad_norm": 1.659847617149353,
"learning_rate": 0.00022883199999999997,
"loss": 1.4818,
"step": 4450
},
{
"epoch": 3.568,
"grad_norm": 1.693474292755127,
"learning_rate": 0.000228672,
"loss": 1.4226,
"step": 4460
},
{
"epoch": 3.576,
"grad_norm": 1.6777136325836182,
"learning_rate": 0.00022851199999999997,
"loss": 1.4819,
"step": 4470
},
{
"epoch": 3.584,
"grad_norm": 1.6764864921569824,
"learning_rate": 0.000228352,
"loss": 1.4802,
"step": 4480
},
{
"epoch": 3.592,
"grad_norm": 1.790226697921753,
"learning_rate": 0.00022819199999999997,
"loss": 1.5031,
"step": 4490
},
{
"epoch": 3.6,
"grad_norm": 1.5997536182403564,
"learning_rate": 0.000228032,
"loss": 1.4584,
"step": 4500
},
{
"epoch": 3.608,
"grad_norm": 1.6292929649353027,
"learning_rate": 0.00022787199999999998,
"loss": 1.4389,
"step": 4510
},
{
"epoch": 3.616,
"grad_norm": 1.6309911012649536,
"learning_rate": 0.000227712,
"loss": 1.4545,
"step": 4520
},
{
"epoch": 3.624,
"grad_norm": 1.6208481788635254,
"learning_rate": 0.00022755199999999998,
"loss": 1.4653,
"step": 4530
},
{
"epoch": 3.632,
"grad_norm": 1.75088369846344,
"learning_rate": 0.000227392,
"loss": 1.4934,
"step": 4540
},
{
"epoch": 3.64,
"grad_norm": 1.8166102170944214,
"learning_rate": 0.00022723199999999998,
"loss": 1.4889,
"step": 4550
},
{
"epoch": 3.648,
"grad_norm": 1.5575486421585083,
"learning_rate": 0.00022707199999999996,
"loss": 1.4948,
"step": 4560
},
{
"epoch": 3.656,
"grad_norm": 1.6632091999053955,
"learning_rate": 0.00022691199999999999,
"loss": 1.4832,
"step": 4570
},
{
"epoch": 3.664,
"grad_norm": 1.583553433418274,
"learning_rate": 0.00022675199999999996,
"loss": 1.492,
"step": 4580
},
{
"epoch": 3.672,
"grad_norm": 1.668994426727295,
"learning_rate": 0.000226592,
"loss": 1.4978,
"step": 4590
},
{
"epoch": 3.68,
"grad_norm": 1.515479326248169,
"learning_rate": 0.00022643199999999996,
"loss": 1.4916,
"step": 4600
},
{
"epoch": 3.6879999999999997,
"grad_norm": 1.652949333190918,
"learning_rate": 0.000226272,
"loss": 1.4584,
"step": 4610
},
{
"epoch": 3.6959999999999997,
"grad_norm": 1.6760021448135376,
"learning_rate": 0.00022611199999999997,
"loss": 1.5198,
"step": 4620
},
{
"epoch": 3.7039999999999997,
"grad_norm": 1.5702306032180786,
"learning_rate": 0.000225952,
"loss": 1.4703,
"step": 4630
},
{
"epoch": 3.7119999999999997,
"grad_norm": 1.4395867586135864,
"learning_rate": 0.00022579199999999997,
"loss": 1.5072,
"step": 4640
},
{
"epoch": 3.7199999999999998,
"grad_norm": 1.5167125463485718,
"learning_rate": 0.000225632,
"loss": 1.473,
"step": 4650
},
{
"epoch": 3.7279999999999998,
"grad_norm": 1.6731159687042236,
"learning_rate": 0.00022547199999999997,
"loss": 1.4886,
"step": 4660
},
{
"epoch": 3.7359999999999998,
"grad_norm": 1.6189254522323608,
"learning_rate": 0.000225312,
"loss": 1.4699,
"step": 4670
},
{
"epoch": 3.7439999999999998,
"grad_norm": 1.5746572017669678,
"learning_rate": 0.00022515199999999997,
"loss": 1.5175,
"step": 4680
},
{
"epoch": 3.752,
"grad_norm": 1.7097058296203613,
"learning_rate": 0.000224992,
"loss": 1.4978,
"step": 4690
},
{
"epoch": 3.76,
"grad_norm": 1.6119595766067505,
"learning_rate": 0.00022483199999999998,
"loss": 1.4492,
"step": 4700
},
{
"epoch": 3.768,
"grad_norm": 1.736672043800354,
"learning_rate": 0.000224672,
"loss": 1.4344,
"step": 4710
},
{
"epoch": 3.776,
"grad_norm": 1.7588441371917725,
"learning_rate": 0.00022451199999999998,
"loss": 1.4563,
"step": 4720
},
{
"epoch": 3.784,
"grad_norm": 1.6483169794082642,
"learning_rate": 0.00022435199999999996,
"loss": 1.525,
"step": 4730
},
{
"epoch": 3.792,
"grad_norm": 1.5439528226852417,
"learning_rate": 0.00022419199999999998,
"loss": 1.4263,
"step": 4740
},
{
"epoch": 3.8,
"grad_norm": 1.5422582626342773,
"learning_rate": 0.00022403199999999996,
"loss": 1.4792,
"step": 4750
},
{
"epoch": 3.808,
"grad_norm": 1.580538272857666,
"learning_rate": 0.000223872,
"loss": 1.5005,
"step": 4760
},
{
"epoch": 3.816,
"grad_norm": 1.5790603160858154,
"learning_rate": 0.00022371199999999996,
"loss": 1.4742,
"step": 4770
},
{
"epoch": 3.824,
"grad_norm": 1.597711443901062,
"learning_rate": 0.000223552,
"loss": 1.4576,
"step": 4780
},
{
"epoch": 3.832,
"grad_norm": 1.7034629583358765,
"learning_rate": 0.00022339199999999996,
"loss": 1.4753,
"step": 4790
},
{
"epoch": 3.84,
"grad_norm": 1.6988534927368164,
"learning_rate": 0.000223232,
"loss": 1.4657,
"step": 4800
},
{
"epoch": 3.848,
"grad_norm": 1.5512536764144897,
"learning_rate": 0.00022307199999999997,
"loss": 1.5351,
"step": 4810
},
{
"epoch": 3.856,
"grad_norm": 1.6742076873779297,
"learning_rate": 0.000222912,
"loss": 1.5108,
"step": 4820
},
{
"epoch": 3.864,
"grad_norm": 1.6922365427017212,
"learning_rate": 0.00022275199999999997,
"loss": 1.4979,
"step": 4830
},
{
"epoch": 3.872,
"grad_norm": 1.6407732963562012,
"learning_rate": 0.000222592,
"loss": 1.535,
"step": 4840
},
{
"epoch": 3.88,
"grad_norm": 1.7077395915985107,
"learning_rate": 0.00022243199999999997,
"loss": 1.4981,
"step": 4850
},
{
"epoch": 3.888,
"grad_norm": 1.7150638103485107,
"learning_rate": 0.000222272,
"loss": 1.5042,
"step": 4860
},
{
"epoch": 3.896,
"grad_norm": 1.5963282585144043,
"learning_rate": 0.00022211199999999998,
"loss": 1.4823,
"step": 4870
},
{
"epoch": 3.904,
"grad_norm": 1.5717283487319946,
"learning_rate": 0.000221952,
"loss": 1.4598,
"step": 4880
},
{
"epoch": 3.912,
"grad_norm": 1.6642472743988037,
"learning_rate": 0.00022179199999999998,
"loss": 1.5055,
"step": 4890
},
{
"epoch": 3.92,
"grad_norm": 1.6903674602508545,
"learning_rate": 0.00022163199999999995,
"loss": 1.4801,
"step": 4900
},
{
"epoch": 3.928,
"grad_norm": 1.5324851274490356,
"learning_rate": 0.00022147199999999998,
"loss": 1.4673,
"step": 4910
},
{
"epoch": 3.936,
"grad_norm": 1.6329302787780762,
"learning_rate": 0.00022131199999999996,
"loss": 1.5142,
"step": 4920
},
{
"epoch": 3.944,
"grad_norm": 1.74014413356781,
"learning_rate": 0.00022115199999999999,
"loss": 1.5035,
"step": 4930
},
{
"epoch": 3.952,
"grad_norm": 1.7019540071487427,
"learning_rate": 0.00022099199999999996,
"loss": 1.4721,
"step": 4940
},
{
"epoch": 3.96,
"grad_norm": 1.8085479736328125,
"learning_rate": 0.000220832,
"loss": 1.5223,
"step": 4950
},
{
"epoch": 3.968,
"grad_norm": 1.5533138513565063,
"learning_rate": 0.00022067199999999996,
"loss": 1.5005,
"step": 4960
},
{
"epoch": 3.976,
"grad_norm": 1.5848819017410278,
"learning_rate": 0.000220512,
"loss": 1.4882,
"step": 4970
},
{
"epoch": 3.984,
"grad_norm": 1.7058250904083252,
"learning_rate": 0.00022035199999999997,
"loss": 1.5222,
"step": 4980
},
{
"epoch": 3.992,
"grad_norm": 1.6337260007858276,
"learning_rate": 0.000220192,
"loss": 1.5153,
"step": 4990
},
{
"epoch": 4.0,
"grad_norm": 1.6282905340194702,
"learning_rate": 0.00022003199999999997,
"loss": 1.5281,
"step": 5000
},
{
"epoch": 4.008,
"grad_norm": 1.752145528793335,
"learning_rate": 0.000219872,
"loss": 1.2938,
"step": 5010
},
{
"epoch": 4.016,
"grad_norm": 1.7552149295806885,
"learning_rate": 0.00021971199999999997,
"loss": 1.2279,
"step": 5020
},
{
"epoch": 4.024,
"grad_norm": 1.7727724313735962,
"learning_rate": 0.000219552,
"loss": 1.2764,
"step": 5030
},
{
"epoch": 4.032,
"grad_norm": 1.9101881980895996,
"learning_rate": 0.00021939199999999998,
"loss": 1.2379,
"step": 5040
},
{
"epoch": 4.04,
"grad_norm": 1.8460677862167358,
"learning_rate": 0.000219232,
"loss": 1.2328,
"step": 5050
},
{
"epoch": 4.048,
"grad_norm": 1.869718313217163,
"learning_rate": 0.00021907199999999998,
"loss": 1.3119,
"step": 5060
},
{
"epoch": 4.056,
"grad_norm": 1.789228916168213,
"learning_rate": 0.00021891199999999995,
"loss": 1.3397,
"step": 5070
},
{
"epoch": 4.064,
"grad_norm": 1.9012354612350464,
"learning_rate": 0.00021875199999999998,
"loss": 1.2709,
"step": 5080
},
{
"epoch": 4.072,
"grad_norm": 1.9376635551452637,
"learning_rate": 0.00021859199999999996,
"loss": 1.3102,
"step": 5090
},
{
"epoch": 4.08,
"grad_norm": 2.017179489135742,
"learning_rate": 0.00021843199999999998,
"loss": 1.2612,
"step": 5100
},
{
"epoch": 4.088,
"grad_norm": 1.9067007303237915,
"learning_rate": 0.00021827199999999996,
"loss": 1.2686,
"step": 5110
},
{
"epoch": 4.096,
"grad_norm": 1.8910828828811646,
"learning_rate": 0.000218112,
"loss": 1.3517,
"step": 5120
},
{
"epoch": 4.104,
"grad_norm": 2.0437941551208496,
"learning_rate": 0.00021795199999999996,
"loss": 1.3021,
"step": 5130
},
{
"epoch": 4.112,
"grad_norm": 1.896239995956421,
"learning_rate": 0.000217792,
"loss": 1.2559,
"step": 5140
},
{
"epoch": 4.12,
"grad_norm": 1.965014100074768,
"learning_rate": 0.00021763199999999997,
"loss": 1.2925,
"step": 5150
},
{
"epoch": 4.128,
"grad_norm": 1.9773045778274536,
"learning_rate": 0.000217472,
"loss": 1.2994,
"step": 5160
},
{
"epoch": 4.136,
"grad_norm": 1.7610217332839966,
"learning_rate": 0.00021731199999999997,
"loss": 1.2904,
"step": 5170
},
{
"epoch": 4.144,
"grad_norm": 2.0166215896606445,
"learning_rate": 0.000217152,
"loss": 1.3171,
"step": 5180
},
{
"epoch": 4.152,
"grad_norm": 1.8862032890319824,
"learning_rate": 0.00021699199999999997,
"loss": 1.3267,
"step": 5190
},
{
"epoch": 4.16,
"grad_norm": 1.7716232538223267,
"learning_rate": 0.000216832,
"loss": 1.3342,
"step": 5200
},
{
"epoch": 4.168,
"grad_norm": 1.8332161903381348,
"learning_rate": 0.00021667199999999997,
"loss": 1.2916,
"step": 5210
},
{
"epoch": 4.176,
"grad_norm": 1.9238322973251343,
"learning_rate": 0.000216512,
"loss": 1.3271,
"step": 5220
},
{
"epoch": 4.184,
"grad_norm": 1.7780416011810303,
"learning_rate": 0.00021635199999999998,
"loss": 1.321,
"step": 5230
},
{
"epoch": 4.192,
"grad_norm": 1.985548973083496,
"learning_rate": 0.00021619199999999995,
"loss": 1.3342,
"step": 5240
},
{
"epoch": 4.2,
"grad_norm": 1.9339617490768433,
"learning_rate": 0.00021603199999999998,
"loss": 1.3496,
"step": 5250
},
{
"epoch": 4.208,
"grad_norm": 1.7527296543121338,
"learning_rate": 0.00021587199999999996,
"loss": 1.2671,
"step": 5260
},
{
"epoch": 4.216,
"grad_norm": 1.8272658586502075,
"learning_rate": 0.00021571199999999998,
"loss": 1.2471,
"step": 5270
},
{
"epoch": 4.224,
"grad_norm": 1.87795090675354,
"learning_rate": 0.00021555199999999996,
"loss": 1.2631,
"step": 5280
},
{
"epoch": 4.232,
"grad_norm": 1.9426238536834717,
"learning_rate": 0.000215392,
"loss": 1.2649,
"step": 5290
},
{
"epoch": 4.24,
"grad_norm": 1.819056510925293,
"learning_rate": 0.00021523199999999996,
"loss": 1.3185,
"step": 5300
},
{
"epoch": 4.248,
"grad_norm": 1.9573816061019897,
"learning_rate": 0.000215072,
"loss": 1.3122,
"step": 5310
},
{
"epoch": 4.256,
"grad_norm": 1.919756531715393,
"learning_rate": 0.00021491199999999996,
"loss": 1.329,
"step": 5320
},
{
"epoch": 4.264,
"grad_norm": 1.9141929149627686,
"learning_rate": 0.000214752,
"loss": 1.2475,
"step": 5330
},
{
"epoch": 4.272,
"grad_norm": 2.0552964210510254,
"learning_rate": 0.00021459199999999997,
"loss": 1.3471,
"step": 5340
},
{
"epoch": 4.28,
"grad_norm": 1.90670645236969,
"learning_rate": 0.000214432,
"loss": 1.3266,
"step": 5350
},
{
"epoch": 4.288,
"grad_norm": 2.149916410446167,
"learning_rate": 0.00021427199999999997,
"loss": 1.3484,
"step": 5360
},
{
"epoch": 4.296,
"grad_norm": 1.805012822151184,
"learning_rate": 0.000214112,
"loss": 1.3079,
"step": 5370
},
{
"epoch": 4.304,
"grad_norm": 1.9589773416519165,
"learning_rate": 0.00021395199999999997,
"loss": 1.36,
"step": 5380
},
{
"epoch": 4.312,
"grad_norm": 2.037567138671875,
"learning_rate": 0.000213792,
"loss": 1.2982,
"step": 5390
},
{
"epoch": 4.32,
"grad_norm": 1.8047535419464111,
"learning_rate": 0.00021363199999999998,
"loss": 1.3386,
"step": 5400
},
{
"epoch": 4.328,
"grad_norm": 1.9072496891021729,
"learning_rate": 0.00021347199999999998,
"loss": 1.2602,
"step": 5410
},
{
"epoch": 4.336,
"grad_norm": 1.9491392374038696,
"learning_rate": 0.00021331199999999998,
"loss": 1.3797,
"step": 5420
},
{
"epoch": 4.344,
"grad_norm": 2.073835611343384,
"learning_rate": 0.00021315199999999995,
"loss": 1.3107,
"step": 5430
},
{
"epoch": 4.352,
"grad_norm": 1.936270833015442,
"learning_rate": 0.00021299199999999998,
"loss": 1.3318,
"step": 5440
},
{
"epoch": 4.36,
"grad_norm": 1.9866790771484375,
"learning_rate": 0.00021283199999999996,
"loss": 1.273,
"step": 5450
},
{
"epoch": 4.368,
"grad_norm": 2.0993947982788086,
"learning_rate": 0.00021267199999999999,
"loss": 1.3157,
"step": 5460
},
{
"epoch": 4.376,
"grad_norm": 1.937992811203003,
"learning_rate": 0.00021251199999999996,
"loss": 1.2858,
"step": 5470
},
{
"epoch": 4.384,
"grad_norm": 1.7649872303009033,
"learning_rate": 0.000212352,
"loss": 1.3118,
"step": 5480
},
{
"epoch": 4.392,
"grad_norm": 1.8896372318267822,
"learning_rate": 0.00021219199999999996,
"loss": 1.3051,
"step": 5490
},
{
"epoch": 4.4,
"grad_norm": 1.9377533197402954,
"learning_rate": 0.000212032,
"loss": 1.3561,
"step": 5500
},
{
"epoch": 4.408,
"grad_norm": 2.079291820526123,
"learning_rate": 0.00021187199999999997,
"loss": 1.3083,
"step": 5510
},
{
"epoch": 4.416,
"grad_norm": 2.0022287368774414,
"learning_rate": 0.000211712,
"loss": 1.3287,
"step": 5520
},
{
"epoch": 4.424,
"grad_norm": 1.80183744430542,
"learning_rate": 0.00021155199999999997,
"loss": 1.3599,
"step": 5530
},
{
"epoch": 4.432,
"grad_norm": 1.9421368837356567,
"learning_rate": 0.000211392,
"loss": 1.3661,
"step": 5540
},
{
"epoch": 4.44,
"grad_norm": 1.9392564296722412,
"learning_rate": 0.00021123199999999997,
"loss": 1.3463,
"step": 5550
},
{
"epoch": 4.448,
"grad_norm": 2.102717638015747,
"learning_rate": 0.000211072,
"loss": 1.3544,
"step": 5560
},
{
"epoch": 4.456,
"grad_norm": 1.9294030666351318,
"learning_rate": 0.00021091199999999998,
"loss": 1.3765,
"step": 5570
},
{
"epoch": 4.464,
"grad_norm": 1.8542896509170532,
"learning_rate": 0.00021075199999999998,
"loss": 1.3624,
"step": 5580
},
{
"epoch": 4.4719999999999995,
"grad_norm": 2.159574031829834,
"learning_rate": 0.00021059199999999998,
"loss": 1.3452,
"step": 5590
},
{
"epoch": 4.48,
"grad_norm": 2.136308193206787,
"learning_rate": 0.00021043199999999998,
"loss": 1.3514,
"step": 5600
},
{
"epoch": 4.4879999999999995,
"grad_norm": 1.959116816520691,
"learning_rate": 0.00021027199999999998,
"loss": 1.3332,
"step": 5610
},
{
"epoch": 4.496,
"grad_norm": 1.9541338682174683,
"learning_rate": 0.00021011199999999996,
"loss": 1.3396,
"step": 5620
},
{
"epoch": 4.504,
"grad_norm": 1.9139293432235718,
"learning_rate": 0.00020995199999999998,
"loss": 1.3306,
"step": 5630
},
{
"epoch": 4.5120000000000005,
"grad_norm": 2.0729434490203857,
"learning_rate": 0.00020979199999999996,
"loss": 1.3385,
"step": 5640
},
{
"epoch": 4.52,
"grad_norm": 1.9547297954559326,
"learning_rate": 0.000209632,
"loss": 1.3378,
"step": 5650
},
{
"epoch": 4.5280000000000005,
"grad_norm": 2.0007593631744385,
"learning_rate": 0.00020947199999999996,
"loss": 1.3744,
"step": 5660
},
{
"epoch": 4.536,
"grad_norm": 1.841583251953125,
"learning_rate": 0.000209312,
"loss": 1.3461,
"step": 5670
},
{
"epoch": 4.5440000000000005,
"grad_norm": 1.950011968612671,
"learning_rate": 0.00020915199999999997,
"loss": 1.3898,
"step": 5680
},
{
"epoch": 4.552,
"grad_norm": 1.9242889881134033,
"learning_rate": 0.000208992,
"loss": 1.375,
"step": 5690
},
{
"epoch": 4.5600000000000005,
"grad_norm": 2.023679733276367,
"learning_rate": 0.00020883199999999997,
"loss": 1.3547,
"step": 5700
},
{
"epoch": 4.568,
"grad_norm": 1.96961510181427,
"learning_rate": 0.000208672,
"loss": 1.33,
"step": 5710
},
{
"epoch": 4.576,
"grad_norm": 1.9337737560272217,
"learning_rate": 0.00020851199999999997,
"loss": 1.338,
"step": 5720
},
{
"epoch": 4.584,
"grad_norm": 1.9906611442565918,
"learning_rate": 0.000208352,
"loss": 1.3979,
"step": 5730
},
{
"epoch": 4.592,
"grad_norm": 1.819471001625061,
"learning_rate": 0.00020819199999999997,
"loss": 1.375,
"step": 5740
},
{
"epoch": 4.6,
"grad_norm": 1.9368617534637451,
"learning_rate": 0.00020803199999999998,
"loss": 1.3637,
"step": 5750
},
{
"epoch": 4.608,
"grad_norm": 1.9653687477111816,
"learning_rate": 0.00020787199999999998,
"loss": 1.3554,
"step": 5760
},
{
"epoch": 4.616,
"grad_norm": 1.9763808250427246,
"learning_rate": 0.00020771199999999998,
"loss": 1.3437,
"step": 5770
},
{
"epoch": 4.624,
"grad_norm": 1.8649840354919434,
"learning_rate": 0.00020755199999999998,
"loss": 1.3624,
"step": 5780
},
{
"epoch": 4.632,
"grad_norm": 1.828291893005371,
"learning_rate": 0.00020739199999999998,
"loss": 1.378,
"step": 5790
},
{
"epoch": 4.64,
"grad_norm": 1.8722482919692993,
"learning_rate": 0.00020723199999999998,
"loss": 1.3548,
"step": 5800
},
{
"epoch": 4.648,
"grad_norm": 2.2012381553649902,
"learning_rate": 0.00020707199999999996,
"loss": 1.3698,
"step": 5810
},
{
"epoch": 4.656,
"grad_norm": 1.9233702421188354,
"learning_rate": 0.000206912,
"loss": 1.3798,
"step": 5820
},
{
"epoch": 4.664,
"grad_norm": 1.9627357721328735,
"learning_rate": 0.00020675199999999996,
"loss": 1.424,
"step": 5830
},
{
"epoch": 4.672,
"grad_norm": 1.8615745306015015,
"learning_rate": 0.000206592,
"loss": 1.3909,
"step": 5840
},
{
"epoch": 4.68,
"grad_norm": 1.9583446979522705,
"learning_rate": 0.00020643199999999996,
"loss": 1.3946,
"step": 5850
},
{
"epoch": 4.688,
"grad_norm": 1.9457666873931885,
"learning_rate": 0.000206272,
"loss": 1.372,
"step": 5860
},
{
"epoch": 4.696,
"grad_norm": 1.8619425296783447,
"learning_rate": 0.00020611199999999997,
"loss": 1.3787,
"step": 5870
},
{
"epoch": 4.704,
"grad_norm": 1.9508628845214844,
"learning_rate": 0.000205952,
"loss": 1.371,
"step": 5880
},
{
"epoch": 4.712,
"grad_norm": 1.7727349996566772,
"learning_rate": 0.00020579199999999997,
"loss": 1.3631,
"step": 5890
},
{
"epoch": 4.72,
"grad_norm": 1.7935019731521606,
"learning_rate": 0.000205632,
"loss": 1.3514,
"step": 5900
},
{
"epoch": 4.728,
"grad_norm": 1.9109313488006592,
"learning_rate": 0.00020547199999999997,
"loss": 1.327,
"step": 5910
},
{
"epoch": 4.736,
"grad_norm": 1.9290359020233154,
"learning_rate": 0.00020531199999999997,
"loss": 1.3574,
"step": 5920
},
{
"epoch": 4.744,
"grad_norm": 2.15079665184021,
"learning_rate": 0.00020515199999999998,
"loss": 1.3939,
"step": 5930
},
{
"epoch": 4.752,
"grad_norm": 2.0457019805908203,
"learning_rate": 0.00020499199999999998,
"loss": 1.3558,
"step": 5940
},
{
"epoch": 4.76,
"grad_norm": 1.9548970460891724,
"learning_rate": 0.00020483199999999998,
"loss": 1.362,
"step": 5950
},
{
"epoch": 4.768,
"grad_norm": 1.791396141052246,
"learning_rate": 0.00020467199999999998,
"loss": 1.33,
"step": 5960
},
{
"epoch": 4.776,
"grad_norm": 1.8451635837554932,
"learning_rate": 0.00020451199999999998,
"loss": 1.3507,
"step": 5970
},
{
"epoch": 4.784,
"grad_norm": 1.7999178171157837,
"learning_rate": 0.00020435199999999998,
"loss": 1.4174,
"step": 5980
},
{
"epoch": 4.792,
"grad_norm": 1.8192893266677856,
"learning_rate": 0.00020419199999999999,
"loss": 1.3481,
"step": 5990
},
{
"epoch": 4.8,
"grad_norm": 1.9753166437149048,
"learning_rate": 0.00020403199999999996,
"loss": 1.4223,
"step": 6000
},
{
"epoch": 4.808,
"grad_norm": 1.8800415992736816,
"learning_rate": 0.000203872,
"loss": 1.3661,
"step": 6010
},
{
"epoch": 4.816,
"grad_norm": 1.8040504455566406,
"learning_rate": 0.00020371199999999996,
"loss": 1.3519,
"step": 6020
},
{
"epoch": 4.824,
"grad_norm": 1.9058725833892822,
"learning_rate": 0.000203552,
"loss": 1.3973,
"step": 6030
},
{
"epoch": 4.832,
"grad_norm": 1.7217756509780884,
"learning_rate": 0.00020339199999999997,
"loss": 1.403,
"step": 6040
},
{
"epoch": 4.84,
"grad_norm": 1.8864495754241943,
"learning_rate": 0.000203232,
"loss": 1.3468,
"step": 6050
},
{
"epoch": 4.848,
"grad_norm": 2.006610870361328,
"learning_rate": 0.00020307199999999997,
"loss": 1.3972,
"step": 6060
},
{
"epoch": 4.856,
"grad_norm": 1.9524073600769043,
"learning_rate": 0.000202912,
"loss": 1.4012,
"step": 6070
},
{
"epoch": 4.864,
"grad_norm": 1.9322147369384766,
"learning_rate": 0.00020275199999999997,
"loss": 1.3928,
"step": 6080
},
{
"epoch": 4.872,
"grad_norm": 1.929335594177246,
"learning_rate": 0.000202592,
"loss": 1.3799,
"step": 6090
},
{
"epoch": 4.88,
"grad_norm": 1.8158811330795288,
"learning_rate": 0.00020243199999999998,
"loss": 1.3967,
"step": 6100
},
{
"epoch": 4.888,
"grad_norm": 1.9702143669128418,
"learning_rate": 0.00020227199999999998,
"loss": 1.3714,
"step": 6110
},
{
"epoch": 4.896,
"grad_norm": 1.6967090368270874,
"learning_rate": 0.00020211199999999998,
"loss": 1.3553,
"step": 6120
},
{
"epoch": 4.904,
"grad_norm": 1.7388558387756348,
"learning_rate": 0.00020195199999999998,
"loss": 1.4053,
"step": 6130
},
{
"epoch": 4.912,
"grad_norm": 1.9453833103179932,
"learning_rate": 0.00020179199999999998,
"loss": 1.3512,
"step": 6140
},
{
"epoch": 4.92,
"grad_norm": 1.8605188131332397,
"learning_rate": 0.00020163199999999998,
"loss": 1.3376,
"step": 6150
},
{
"epoch": 4.928,
"grad_norm": 1.9881434440612793,
"learning_rate": 0.00020147199999999998,
"loss": 1.3877,
"step": 6160
},
{
"epoch": 4.936,
"grad_norm": 1.8932327032089233,
"learning_rate": 0.00020131199999999999,
"loss": 1.331,
"step": 6170
},
{
"epoch": 4.944,
"grad_norm": 1.8074854612350464,
"learning_rate": 0.000201152,
"loss": 1.4211,
"step": 6180
},
{
"epoch": 4.952,
"grad_norm": 1.9307423830032349,
"learning_rate": 0.00020099199999999996,
"loss": 1.3498,
"step": 6190
},
{
"epoch": 4.96,
"grad_norm": 1.949623942375183,
"learning_rate": 0.000200832,
"loss": 1.3897,
"step": 6200
},
{
"epoch": 4.968,
"grad_norm": 1.7373038530349731,
"learning_rate": 0.00020067199999999997,
"loss": 1.3696,
"step": 6210
},
{
"epoch": 4.976,
"grad_norm": 1.9628345966339111,
"learning_rate": 0.000200512,
"loss": 1.3667,
"step": 6220
},
{
"epoch": 4.984,
"grad_norm": 1.9516173601150513,
"learning_rate": 0.00020035199999999997,
"loss": 1.4143,
"step": 6230
},
{
"epoch": 4.992,
"grad_norm": 1.7527846097946167,
"learning_rate": 0.000200192,
"loss": 1.3599,
"step": 6240
},
{
"epoch": 5.0,
"grad_norm": 1.9414066076278687,
"learning_rate": 0.00020003199999999997,
"loss": 1.3942,
"step": 6250
},
{
"epoch": 5.008,
"grad_norm": 1.802935242652893,
"learning_rate": 0.000199872,
"loss": 1.2225,
"step": 6260
},
{
"epoch": 5.016,
"grad_norm": 2.1949446201324463,
"learning_rate": 0.00019971199999999997,
"loss": 1.168,
"step": 6270
},
{
"epoch": 5.024,
"grad_norm": 2.1667227745056152,
"learning_rate": 0.00019955199999999998,
"loss": 1.2283,
"step": 6280
},
{
"epoch": 5.032,
"grad_norm": 2.0180180072784424,
"learning_rate": 0.00019939199999999998,
"loss": 1.1925,
"step": 6290
},
{
"epoch": 5.04,
"grad_norm": 2.257992744445801,
"learning_rate": 0.00019923199999999998,
"loss": 1.1695,
"step": 6300
},
{
"epoch": 5.048,
"grad_norm": 2.023444890975952,
"learning_rate": 0.00019907199999999998,
"loss": 1.1454,
"step": 6310
},
{
"epoch": 5.056,
"grad_norm": 2.1307425498962402,
"learning_rate": 0.00019891199999999998,
"loss": 1.1645,
"step": 6320
},
{
"epoch": 5.064,
"grad_norm": 2.018718957901001,
"learning_rate": 0.00019875199999999998,
"loss": 1.1451,
"step": 6330
},
{
"epoch": 5.072,
"grad_norm": 2.158968448638916,
"learning_rate": 0.00019859199999999999,
"loss": 1.1819,
"step": 6340
},
{
"epoch": 5.08,
"grad_norm": 2.125598907470703,
"learning_rate": 0.000198432,
"loss": 1.1819,
"step": 6350
},
{
"epoch": 5.088,
"grad_norm": 2.2982337474823,
"learning_rate": 0.000198272,
"loss": 1.1838,
"step": 6360
},
{
"epoch": 5.096,
"grad_norm": 2.3263471126556396,
"learning_rate": 0.000198112,
"loss": 1.176,
"step": 6370
},
{
"epoch": 5.104,
"grad_norm": 2.0729761123657227,
"learning_rate": 0.00019795199999999996,
"loss": 1.1237,
"step": 6380
},
{
"epoch": 5.112,
"grad_norm": 2.302323579788208,
"learning_rate": 0.000197792,
"loss": 1.1658,
"step": 6390
},
{
"epoch": 5.12,
"grad_norm": 2.1555356979370117,
"learning_rate": 0.00019763199999999997,
"loss": 1.1943,
"step": 6400
},
{
"epoch": 5.128,
"grad_norm": 2.104564666748047,
"learning_rate": 0.000197472,
"loss": 1.1776,
"step": 6410
},
{
"epoch": 5.136,
"grad_norm": 2.101271390914917,
"learning_rate": 0.00019731199999999997,
"loss": 1.2107,
"step": 6420
},
{
"epoch": 5.144,
"grad_norm": 2.1387553215026855,
"learning_rate": 0.000197152,
"loss": 1.1662,
"step": 6430
},
{
"epoch": 5.152,
"grad_norm": 1.9566245079040527,
"learning_rate": 0.00019699199999999997,
"loss": 1.1862,
"step": 6440
},
{
"epoch": 5.16,
"grad_norm": 2.1503751277923584,
"learning_rate": 0.00019683199999999997,
"loss": 1.1739,
"step": 6450
},
{
"epoch": 5.168,
"grad_norm": 2.0225651264190674,
"learning_rate": 0.00019668799999999998,
"loss": 1.1878,
"step": 6460
},
{
"epoch": 5.176,
"grad_norm": 2.179147481918335,
"learning_rate": 0.00019652799999999999,
"loss": 1.1973,
"step": 6470
},
{
"epoch": 5.184,
"grad_norm": 2.376354932785034,
"learning_rate": 0.000196368,
"loss": 1.1756,
"step": 6480
},
{
"epoch": 5.192,
"grad_norm": 2.143554449081421,
"learning_rate": 0.000196208,
"loss": 1.2341,
"step": 6490
},
{
"epoch": 5.2,
"grad_norm": 2.128620147705078,
"learning_rate": 0.000196048,
"loss": 1.1808,
"step": 6500
},
{
"epoch": 5.208,
"grad_norm": 2.025129556655884,
"learning_rate": 0.00019588799999999996,
"loss": 1.1889,
"step": 6510
},
{
"epoch": 5.216,
"grad_norm": 2.1475353240966797,
"learning_rate": 0.000195728,
"loss": 1.2154,
"step": 6520
},
{
"epoch": 5.224,
"grad_norm": 2.032588005065918,
"learning_rate": 0.00019556799999999997,
"loss": 1.2046,
"step": 6530
},
{
"epoch": 5.232,
"grad_norm": 2.2672226428985596,
"learning_rate": 0.000195408,
"loss": 1.1553,
"step": 6540
},
{
"epoch": 5.24,
"grad_norm": 2.2911875247955322,
"learning_rate": 0.00019524799999999997,
"loss": 1.2179,
"step": 6550
},
{
"epoch": 5.248,
"grad_norm": 2.0162782669067383,
"learning_rate": 0.00019508799999999997,
"loss": 1.2399,
"step": 6560
},
{
"epoch": 5.256,
"grad_norm": 2.193554639816284,
"learning_rate": 0.00019492799999999997,
"loss": 1.1745,
"step": 6570
},
{
"epoch": 5.264,
"grad_norm": 2.104660749435425,
"learning_rate": 0.00019476799999999998,
"loss": 1.2107,
"step": 6580
},
{
"epoch": 5.272,
"grad_norm": 2.141188621520996,
"learning_rate": 0.00019460799999999998,
"loss": 1.2222,
"step": 6590
},
{
"epoch": 5.28,
"grad_norm": 2.184913158416748,
"learning_rate": 0.00019444799999999998,
"loss": 1.2103,
"step": 6600
},
{
"epoch": 5.288,
"grad_norm": 2.3275797367095947,
"learning_rate": 0.00019428799999999998,
"loss": 1.2263,
"step": 6610
},
{
"epoch": 5.296,
"grad_norm": 2.2514960765838623,
"learning_rate": 0.00019412799999999998,
"loss": 1.2263,
"step": 6620
},
{
"epoch": 5.304,
"grad_norm": 2.335054874420166,
"learning_rate": 0.00019396799999999998,
"loss": 1.2105,
"step": 6630
},
{
"epoch": 5.312,
"grad_norm": 2.0840258598327637,
"learning_rate": 0.00019380799999999998,
"loss": 1.2322,
"step": 6640
},
{
"epoch": 5.32,
"grad_norm": 2.2909815311431885,
"learning_rate": 0.00019364799999999999,
"loss": 1.2027,
"step": 6650
},
{
"epoch": 5.328,
"grad_norm": 2.076932668685913,
"learning_rate": 0.000193488,
"loss": 1.2039,
"step": 6660
},
{
"epoch": 5.336,
"grad_norm": 2.017833948135376,
"learning_rate": 0.000193328,
"loss": 1.1752,
"step": 6670
},
{
"epoch": 5.344,
"grad_norm": 2.242431879043579,
"learning_rate": 0.000193168,
"loss": 1.2351,
"step": 6680
},
{
"epoch": 5.352,
"grad_norm": 2.0976057052612305,
"learning_rate": 0.000193008,
"loss": 1.2151,
"step": 6690
},
{
"epoch": 5.36,
"grad_norm": 2.2112200260162354,
"learning_rate": 0.00019284799999999997,
"loss": 1.2196,
"step": 6700
},
{
"epoch": 5.368,
"grad_norm": 2.1883575916290283,
"learning_rate": 0.000192688,
"loss": 1.2368,
"step": 6710
},
{
"epoch": 5.376,
"grad_norm": 2.3068554401397705,
"learning_rate": 0.00019252799999999997,
"loss": 1.2621,
"step": 6720
},
{
"epoch": 5.384,
"grad_norm": 2.039863109588623,
"learning_rate": 0.00019236799999999997,
"loss": 1.2687,
"step": 6730
},
{
"epoch": 5.392,
"grad_norm": 2.26802396774292,
"learning_rate": 0.00019220799999999997,
"loss": 1.1788,
"step": 6740
},
{
"epoch": 5.4,
"grad_norm": 2.010828733444214,
"learning_rate": 0.00019204799999999997,
"loss": 1.2232,
"step": 6750
},
{
"epoch": 5.408,
"grad_norm": 2.1727616786956787,
"learning_rate": 0.00019188799999999998,
"loss": 1.2625,
"step": 6760
},
{
"epoch": 5.416,
"grad_norm": 2.030134439468384,
"learning_rate": 0.00019172799999999998,
"loss": 1.2391,
"step": 6770
},
{
"epoch": 5.424,
"grad_norm": 2.2361104488372803,
"learning_rate": 0.00019156799999999998,
"loss": 1.2329,
"step": 6780
},
{
"epoch": 5.432,
"grad_norm": 2.1066739559173584,
"learning_rate": 0.00019140799999999998,
"loss": 1.2456,
"step": 6790
},
{
"epoch": 5.44,
"grad_norm": 2.1428840160369873,
"learning_rate": 0.00019124799999999998,
"loss": 1.1813,
"step": 6800
},
{
"epoch": 5.448,
"grad_norm": 2.3433635234832764,
"learning_rate": 0.00019108799999999998,
"loss": 1.2672,
"step": 6810
},
{
"epoch": 5.456,
"grad_norm": 2.185671091079712,
"learning_rate": 0.00019092799999999999,
"loss": 1.2162,
"step": 6820
},
{
"epoch": 5.464,
"grad_norm": 2.205509662628174,
"learning_rate": 0.000190768,
"loss": 1.2383,
"step": 6830
},
{
"epoch": 5.4719999999999995,
"grad_norm": 2.428114891052246,
"learning_rate": 0.000190608,
"loss": 1.3059,
"step": 6840
},
{
"epoch": 5.48,
"grad_norm": 2.135251998901367,
"learning_rate": 0.000190448,
"loss": 1.2053,
"step": 6850
},
{
"epoch": 5.4879999999999995,
"grad_norm": 2.074209213256836,
"learning_rate": 0.000190288,
"loss": 1.194,
"step": 6860
},
{
"epoch": 5.496,
"grad_norm": 2.0454697608947754,
"learning_rate": 0.000190128,
"loss": 1.232,
"step": 6870
},
{
"epoch": 5.504,
"grad_norm": 1.9665228128433228,
"learning_rate": 0.000189968,
"loss": 1.2077,
"step": 6880
},
{
"epoch": 5.5120000000000005,
"grad_norm": 2.0836398601531982,
"learning_rate": 0.00018980799999999997,
"loss": 1.186,
"step": 6890
},
{
"epoch": 5.52,
"grad_norm": 2.0634419918060303,
"learning_rate": 0.000189648,
"loss": 1.2539,
"step": 6900
},
{
"epoch": 5.5280000000000005,
"grad_norm": 2.2017769813537598,
"learning_rate": 0.00018948799999999997,
"loss": 1.2484,
"step": 6910
},
{
"epoch": 5.536,
"grad_norm": 2.193028450012207,
"learning_rate": 0.00018932799999999997,
"loss": 1.2916,
"step": 6920
},
{
"epoch": 5.5440000000000005,
"grad_norm": 2.163944721221924,
"learning_rate": 0.00018916799999999998,
"loss": 1.2706,
"step": 6930
},
{
"epoch": 5.552,
"grad_norm": 2.214864730834961,
"learning_rate": 0.00018900799999999998,
"loss": 1.2626,
"step": 6940
},
{
"epoch": 5.5600000000000005,
"grad_norm": 2.167754888534546,
"learning_rate": 0.00018884799999999998,
"loss": 1.2654,
"step": 6950
},
{
"epoch": 5.568,
"grad_norm": 2.114359140396118,
"learning_rate": 0.00018868799999999998,
"loss": 1.2345,
"step": 6960
},
{
"epoch": 5.576,
"grad_norm": 2.2773566246032715,
"learning_rate": 0.00018852799999999998,
"loss": 1.2244,
"step": 6970
},
{
"epoch": 5.584,
"grad_norm": 2.1949045658111572,
"learning_rate": 0.00018836799999999998,
"loss": 1.2508,
"step": 6980
},
{
"epoch": 5.592,
"grad_norm": 2.0954575538635254,
"learning_rate": 0.00018820799999999998,
"loss": 1.2387,
"step": 6990
},
{
"epoch": 5.6,
"grad_norm": 2.1742050647735596,
"learning_rate": 0.00018804799999999999,
"loss": 1.2407,
"step": 7000
},
{
"epoch": 5.608,
"grad_norm": 2.1627070903778076,
"learning_rate": 0.000187888,
"loss": 1.1706,
"step": 7010
},
{
"epoch": 5.616,
"grad_norm": 2.1110544204711914,
"learning_rate": 0.000187728,
"loss": 1.2538,
"step": 7020
},
{
"epoch": 5.624,
"grad_norm": 2.255958318710327,
"learning_rate": 0.000187568,
"loss": 1.209,
"step": 7030
},
{
"epoch": 5.632,
"grad_norm": 2.2075769901275635,
"learning_rate": 0.000187408,
"loss": 1.2942,
"step": 7040
},
{
"epoch": 5.64,
"grad_norm": 1.964128851890564,
"learning_rate": 0.000187248,
"loss": 1.2519,
"step": 7050
},
{
"epoch": 5.648,
"grad_norm": 2.2681636810302734,
"learning_rate": 0.000187088,
"loss": 1.2993,
"step": 7060
},
{
"epoch": 5.656,
"grad_norm": 2.313188076019287,
"learning_rate": 0.000186928,
"loss": 1.2438,
"step": 7070
},
{
"epoch": 5.664,
"grad_norm": 2.369359254837036,
"learning_rate": 0.00018676799999999997,
"loss": 1.2157,
"step": 7080
},
{
"epoch": 5.672,
"grad_norm": 2.2245047092437744,
"learning_rate": 0.00018660799999999997,
"loss": 1.2857,
"step": 7090
},
{
"epoch": 5.68,
"grad_norm": 2.058401107788086,
"learning_rate": 0.00018644799999999997,
"loss": 1.2045,
"step": 7100
},
{
"epoch": 5.688,
"grad_norm": 2.2531964778900146,
"learning_rate": 0.00018628799999999998,
"loss": 1.2493,
"step": 7110
},
{
"epoch": 5.696,
"grad_norm": 2.2315497398376465,
"learning_rate": 0.00018612799999999998,
"loss": 1.235,
"step": 7120
},
{
"epoch": 5.704,
"grad_norm": 2.018808603286743,
"learning_rate": 0.00018596799999999998,
"loss": 1.2016,
"step": 7130
},
{
"epoch": 5.712,
"grad_norm": 2.0911753177642822,
"learning_rate": 0.00018580799999999998,
"loss": 1.2624,
"step": 7140
},
{
"epoch": 5.72,
"grad_norm": 2.147120475769043,
"learning_rate": 0.00018564799999999998,
"loss": 1.2376,
"step": 7150
},
{
"epoch": 5.728,
"grad_norm": 2.1546943187713623,
"learning_rate": 0.00018548799999999998,
"loss": 1.2286,
"step": 7160
},
{
"epoch": 5.736,
"grad_norm": 2.0924603939056396,
"learning_rate": 0.00018532799999999998,
"loss": 1.2964,
"step": 7170
},
{
"epoch": 5.744,
"grad_norm": 2.337070941925049,
"learning_rate": 0.00018516799999999999,
"loss": 1.2699,
"step": 7180
},
{
"epoch": 5.752,
"grad_norm": 2.4989166259765625,
"learning_rate": 0.000185008,
"loss": 1.2631,
"step": 7190
},
{
"epoch": 5.76,
"grad_norm": 2.3049070835113525,
"learning_rate": 0.000184848,
"loss": 1.2879,
"step": 7200
},
{
"epoch": 5.768,
"grad_norm": 2.328397274017334,
"learning_rate": 0.000184688,
"loss": 1.2624,
"step": 7210
},
{
"epoch": 5.776,
"grad_norm": 2.147589921951294,
"learning_rate": 0.000184528,
"loss": 1.2825,
"step": 7220
},
{
"epoch": 5.784,
"grad_norm": 2.348174571990967,
"learning_rate": 0.000184368,
"loss": 1.2751,
"step": 7230
},
{
"epoch": 5.792,
"grad_norm": 2.270873785018921,
"learning_rate": 0.000184208,
"loss": 1.2714,
"step": 7240
},
{
"epoch": 5.8,
"grad_norm": 2.289658308029175,
"learning_rate": 0.000184048,
"loss": 1.2412,
"step": 7250
},
{
"epoch": 5.808,
"grad_norm": 2.3569588661193848,
"learning_rate": 0.00018388799999999997,
"loss": 1.2714,
"step": 7260
},
{
"epoch": 5.816,
"grad_norm": 2.372729539871216,
"learning_rate": 0.00018372799999999997,
"loss": 1.2824,
"step": 7270
},
{
"epoch": 5.824,
"grad_norm": 2.3369643688201904,
"learning_rate": 0.00018356799999999997,
"loss": 1.2897,
"step": 7280
},
{
"epoch": 5.832,
"grad_norm": 2.149664878845215,
"learning_rate": 0.00018340799999999998,
"loss": 1.2411,
"step": 7290
},
{
"epoch": 5.84,
"grad_norm": 2.1661763191223145,
"learning_rate": 0.00018324799999999998,
"loss": 1.2484,
"step": 7300
},
{
"epoch": 5.848,
"grad_norm": 2.2296934127807617,
"learning_rate": 0.00018308799999999998,
"loss": 1.2713,
"step": 7310
},
{
"epoch": 5.856,
"grad_norm": 2.0819859504699707,
"learning_rate": 0.00018292799999999998,
"loss": 1.2636,
"step": 7320
},
{
"epoch": 5.864,
"grad_norm": 2.158386468887329,
"learning_rate": 0.00018276799999999998,
"loss": 1.2886,
"step": 7330
},
{
"epoch": 5.872,
"grad_norm": 2.1622161865234375,
"learning_rate": 0.00018260799999999998,
"loss": 1.2663,
"step": 7340
},
{
"epoch": 5.88,
"grad_norm": 2.1625213623046875,
"learning_rate": 0.00018244799999999999,
"loss": 1.3031,
"step": 7350
},
{
"epoch": 5.888,
"grad_norm": 2.1951282024383545,
"learning_rate": 0.000182288,
"loss": 1.2569,
"step": 7360
},
{
"epoch": 5.896,
"grad_norm": 2.2481329441070557,
"learning_rate": 0.000182128,
"loss": 1.2376,
"step": 7370
},
{
"epoch": 5.904,
"grad_norm": 2.11740779876709,
"learning_rate": 0.000181968,
"loss": 1.2642,
"step": 7380
},
{
"epoch": 5.912,
"grad_norm": 2.3954527378082275,
"learning_rate": 0.000181808,
"loss": 1.3029,
"step": 7390
},
{
"epoch": 5.92,
"grad_norm": 2.222752571105957,
"learning_rate": 0.000181648,
"loss": 1.3028,
"step": 7400
},
{
"epoch": 5.928,
"grad_norm": 2.15301513671875,
"learning_rate": 0.000181488,
"loss": 1.3011,
"step": 7410
},
{
"epoch": 5.936,
"grad_norm": 2.27708101272583,
"learning_rate": 0.000181328,
"loss": 1.2727,
"step": 7420
},
{
"epoch": 5.944,
"grad_norm": 2.1490461826324463,
"learning_rate": 0.00018116799999999997,
"loss": 1.2968,
"step": 7430
},
{
"epoch": 5.952,
"grad_norm": 2.247800588607788,
"learning_rate": 0.000181008,
"loss": 1.3003,
"step": 7440
},
{
"epoch": 5.96,
"grad_norm": 2.2584476470947266,
"learning_rate": 0.00018084799999999997,
"loss": 1.2659,
"step": 7450
},
{
"epoch": 5.968,
"grad_norm": 2.1247005462646484,
"learning_rate": 0.00018068799999999997,
"loss": 1.26,
"step": 7460
},
{
"epoch": 5.976,
"grad_norm": 2.2989518642425537,
"learning_rate": 0.00018052799999999998,
"loss": 1.28,
"step": 7470
},
{
"epoch": 5.984,
"grad_norm": 2.3190391063690186,
"learning_rate": 0.00018036799999999998,
"loss": 1.2763,
"step": 7480
},
{
"epoch": 5.992,
"grad_norm": 2.170459032058716,
"learning_rate": 0.00018020799999999998,
"loss": 1.2263,
"step": 7490
},
{
"epoch": 6.0,
"grad_norm": 2.1551315784454346,
"learning_rate": 0.00018004799999999998,
"loss": 1.3013,
"step": 7500
}
],
"logging_steps": 10,
"max_steps": 18750,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 2500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.2603608444698624e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}