{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50,
"global_step": 11436,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.24680212140083313,
"learning_rate": 4.979447262550289e-05,
"loss": 2.1164,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 0.1798873394727707,
"learning_rate": 4.957582648242085e-05,
"loss": 2.108,
"step": 100
},
{
"epoch": 0.01,
"grad_norm": 0.12875229120254517,
"learning_rate": 4.9357180339338815e-05,
"loss": 2.1164,
"step": 150
},
{
"epoch": 0.02,
"grad_norm": 0.13966438174247742,
"learning_rate": 4.913853419625678e-05,
"loss": 1.975,
"step": 200
},
{
"epoch": 0.02,
"grad_norm": 0.14950478076934814,
"learning_rate": 4.891988805317474e-05,
"loss": 2.0598,
"step": 250
},
{
"epoch": 0.03,
"grad_norm": 0.11292492598295212,
"learning_rate": 4.8701241910092706e-05,
"loss": 2.0375,
"step": 300
},
{
"epoch": 0.03,
"grad_norm": 0.15274354815483093,
"learning_rate": 4.848259576701068e-05,
"loss": 2.0463,
"step": 350
},
{
"epoch": 0.03,
"grad_norm": 0.13233616948127747,
"learning_rate": 4.8263949623928634e-05,
"loss": 2.0382,
"step": 400
},
{
"epoch": 0.04,
"grad_norm": 0.11243890225887299,
"learning_rate": 4.80453034808466e-05,
"loss": 2.0239,
"step": 450
},
{
"epoch": 0.04,
"grad_norm": 0.12158068269491196,
"learning_rate": 4.782665733776456e-05,
"loss": 2.0376,
"step": 500
},
{
"epoch": 0.05,
"grad_norm": 0.14348222315311432,
"learning_rate": 4.7608011194682526e-05,
"loss": 2.0658,
"step": 550
},
{
"epoch": 0.05,
"grad_norm": 0.1698862612247467,
"learning_rate": 4.738936505160049e-05,
"loss": 2.0811,
"step": 600
},
{
"epoch": 0.06,
"grad_norm": 0.18587234616279602,
"learning_rate": 4.717071890851846e-05,
"loss": 2.0649,
"step": 650
},
{
"epoch": 0.06,
"grad_norm": 0.1404772698879242,
"learning_rate": 4.6952072765436424e-05,
"loss": 2.0424,
"step": 700
},
{
"epoch": 0.07,
"grad_norm": 0.13483217358589172,
"learning_rate": 4.673342662235438e-05,
"loss": 2.0892,
"step": 750
},
{
"epoch": 0.07,
"grad_norm": 0.12589296698570251,
"learning_rate": 4.6514780479272345e-05,
"loss": 1.9907,
"step": 800
},
{
"epoch": 0.07,
"grad_norm": 0.11405760794878006,
"learning_rate": 4.629613433619031e-05,
"loss": 2.0512,
"step": 850
},
{
"epoch": 0.08,
"grad_norm": 0.14347495138645172,
"learning_rate": 4.607748819310827e-05,
"loss": 2.0381,
"step": 900
},
{
"epoch": 0.08,
"grad_norm": 0.14272941648960114,
"learning_rate": 4.5858842050026244e-05,
"loss": 2.0089,
"step": 950
},
{
"epoch": 0.09,
"grad_norm": 0.11842367798089981,
"learning_rate": 4.564019590694421e-05,
"loss": 2.0551,
"step": 1000
},
{
"epoch": 0.09,
"grad_norm": 0.14809106290340424,
"learning_rate": 4.542154976386217e-05,
"loss": 2.0635,
"step": 1050
},
{
"epoch": 0.1,
"grad_norm": 0.17582620680332184,
"learning_rate": 4.520290362078013e-05,
"loss": 2.0037,
"step": 1100
},
{
"epoch": 0.1,
"grad_norm": 0.15331187844276428,
"learning_rate": 4.498425747769809e-05,
"loss": 2.0176,
"step": 1150
},
{
"epoch": 0.1,
"grad_norm": 0.1289476752281189,
"learning_rate": 4.476561133461606e-05,
"loss": 1.9955,
"step": 1200
},
{
"epoch": 0.11,
"grad_norm": 0.12641307711601257,
"learning_rate": 4.454696519153402e-05,
"loss": 2.0665,
"step": 1250
},
{
"epoch": 0.11,
"grad_norm": 0.1526792198419571,
"learning_rate": 4.432831904845199e-05,
"loss": 2.0381,
"step": 1300
},
{
"epoch": 0.12,
"grad_norm": 0.15751318633556366,
"learning_rate": 4.4109672905369955e-05,
"loss": 2.1066,
"step": 1350
},
{
"epoch": 0.12,
"grad_norm": 0.16713933646678925,
"learning_rate": 4.389102676228792e-05,
"loss": 2.0773,
"step": 1400
},
{
"epoch": 0.13,
"grad_norm": 0.1296485811471939,
"learning_rate": 4.3672380619205876e-05,
"loss": 2.0354,
"step": 1450
},
{
"epoch": 0.13,
"grad_norm": 0.1575527936220169,
"learning_rate": 4.345373447612384e-05,
"loss": 2.057,
"step": 1500
},
{
"epoch": 0.14,
"grad_norm": 0.14888359606266022,
"learning_rate": 4.3235088333041804e-05,
"loss": 2.0769,
"step": 1550
},
{
"epoch": 0.14,
"grad_norm": 0.13250517845153809,
"learning_rate": 4.3016442189959775e-05,
"loss": 2.0548,
"step": 1600
},
{
"epoch": 0.14,
"grad_norm": 0.11338368058204651,
"learning_rate": 4.279779604687774e-05,
"loss": 2.0733,
"step": 1650
},
{
"epoch": 0.15,
"grad_norm": 0.13314321637153625,
"learning_rate": 4.25791499037957e-05,
"loss": 2.0927,
"step": 1700
},
{
"epoch": 0.15,
"grad_norm": 0.13528186082839966,
"learning_rate": 4.2360503760713666e-05,
"loss": 2.0358,
"step": 1750
},
{
"epoch": 0.16,
"grad_norm": 0.123367078602314,
"learning_rate": 4.2141857617631624e-05,
"loss": 2.122,
"step": 1800
},
{
"epoch": 0.16,
"grad_norm": 0.1488184630870819,
"learning_rate": 4.192321147454959e-05,
"loss": 2.0561,
"step": 1850
},
{
"epoch": 0.17,
"grad_norm": 0.16916592419147491,
"learning_rate": 4.170456533146755e-05,
"loss": 2.0353,
"step": 1900
},
{
"epoch": 0.17,
"grad_norm": 0.14417660236358643,
"learning_rate": 4.148591918838552e-05,
"loss": 1.9982,
"step": 1950
},
{
"epoch": 0.17,
"grad_norm": 0.12554995715618134,
"learning_rate": 4.1267273045303486e-05,
"loss": 2.1305,
"step": 2000
},
{
"epoch": 0.18,
"grad_norm": 0.1197732612490654,
"learning_rate": 4.104862690222145e-05,
"loss": 2.0228,
"step": 2050
},
{
"epoch": 0.18,
"grad_norm": 0.11664281785488129,
"learning_rate": 4.0829980759139414e-05,
"loss": 2.0536,
"step": 2100
},
{
"epoch": 0.19,
"grad_norm": 0.1292441338300705,
"learning_rate": 4.061133461605737e-05,
"loss": 2.072,
"step": 2150
},
{
"epoch": 0.19,
"grad_norm": 0.1317589282989502,
"learning_rate": 4.0392688472975335e-05,
"loss": 2.0793,
"step": 2200
},
{
"epoch": 0.2,
"grad_norm": 0.14351141452789307,
"learning_rate": 4.0174042329893305e-05,
"loss": 2.0527,
"step": 2250
},
{
"epoch": 0.2,
"grad_norm": 0.1438673734664917,
"learning_rate": 3.995539618681127e-05,
"loss": 2.0586,
"step": 2300
},
{
"epoch": 0.21,
"grad_norm": 0.13835753500461578,
"learning_rate": 3.973675004372923e-05,
"loss": 2.0294,
"step": 2350
},
{
"epoch": 0.21,
"grad_norm": 0.110976442694664,
"learning_rate": 3.95181039006472e-05,
"loss": 2.0917,
"step": 2400
},
{
"epoch": 0.21,
"grad_norm": 0.11097148805856705,
"learning_rate": 3.9299457757565154e-05,
"loss": 2.0185,
"step": 2450
},
{
"epoch": 0.22,
"grad_norm": 0.173320472240448,
"learning_rate": 3.908081161448312e-05,
"loss": 2.0346,
"step": 2500
},
{
"epoch": 0.22,
"grad_norm": 0.10903707146644592,
"learning_rate": 3.886216547140108e-05,
"loss": 2.0342,
"step": 2550
},
{
"epoch": 0.23,
"grad_norm": 0.1535567194223404,
"learning_rate": 3.864351932831905e-05,
"loss": 2.0104,
"step": 2600
},
{
"epoch": 0.23,
"grad_norm": 0.13500377535820007,
"learning_rate": 3.842487318523702e-05,
"loss": 2.0813,
"step": 2650
},
{
"epoch": 0.24,
"grad_norm": 0.1488364040851593,
"learning_rate": 3.820622704215498e-05,
"loss": 2.0034,
"step": 2700
},
{
"epoch": 0.24,
"grad_norm": 0.14241208136081696,
"learning_rate": 3.7987580899072945e-05,
"loss": 2.0316,
"step": 2750
},
{
"epoch": 0.24,
"grad_norm": 0.12958261370658875,
"learning_rate": 3.77689347559909e-05,
"loss": 2.0108,
"step": 2800
},
{
"epoch": 0.25,
"grad_norm": 0.12320134788751602,
"learning_rate": 3.7550288612908866e-05,
"loss": 1.9801,
"step": 2850
},
{
"epoch": 0.25,
"grad_norm": 0.12050608545541763,
"learning_rate": 3.7331642469826836e-05,
"loss": 2.0882,
"step": 2900
},
{
"epoch": 0.26,
"grad_norm": 0.1593894511461258,
"learning_rate": 3.71129963267448e-05,
"loss": 2.0175,
"step": 2950
},
{
"epoch": 0.26,
"grad_norm": 0.13536041975021362,
"learning_rate": 3.6894350183662764e-05,
"loss": 2.0235,
"step": 3000
},
{
"epoch": 0.27,
"grad_norm": 0.16139154136180878,
"learning_rate": 3.667570404058073e-05,
"loss": 2.0379,
"step": 3050
},
{
"epoch": 0.27,
"grad_norm": 0.14355123043060303,
"learning_rate": 3.645705789749869e-05,
"loss": 2.0232,
"step": 3100
},
{
"epoch": 0.28,
"grad_norm": 0.1250680685043335,
"learning_rate": 3.623841175441665e-05,
"loss": 2.0238,
"step": 3150
},
{
"epoch": 0.28,
"grad_norm": 0.16705290973186493,
"learning_rate": 3.601976561133461e-05,
"loss": 2.0823,
"step": 3200
},
{
"epoch": 0.28,
"grad_norm": 0.1477033495903015,
"learning_rate": 3.5801119468252584e-05,
"loss": 2.0459,
"step": 3250
},
{
"epoch": 0.29,
"grad_norm": 0.15941354632377625,
"learning_rate": 3.558247332517055e-05,
"loss": 2.0443,
"step": 3300
},
{
"epoch": 0.29,
"grad_norm": 0.1715698093175888,
"learning_rate": 3.536382718208851e-05,
"loss": 2.0891,
"step": 3350
},
{
"epoch": 0.3,
"grad_norm": 0.29363158345222473,
"learning_rate": 3.5145181039006475e-05,
"loss": 2.0438,
"step": 3400
},
{
"epoch": 0.3,
"grad_norm": 0.1486337035894394,
"learning_rate": 3.492653489592444e-05,
"loss": 2.0515,
"step": 3450
},
{
"epoch": 0.31,
"grad_norm": 0.12089661508798599,
"learning_rate": 3.4707888752842396e-05,
"loss": 2.0669,
"step": 3500
},
{
"epoch": 0.31,
"grad_norm": 0.3512625992298126,
"learning_rate": 3.448924260976037e-05,
"loss": 2.0875,
"step": 3550
},
{
"epoch": 0.31,
"grad_norm": 0.15912899374961853,
"learning_rate": 3.427059646667833e-05,
"loss": 2.0583,
"step": 3600
},
{
"epoch": 0.32,
"grad_norm": 0.14142446219921112,
"learning_rate": 3.4051950323596295e-05,
"loss": 2.0666,
"step": 3650
},
{
"epoch": 0.32,
"grad_norm": 0.13084927201271057,
"learning_rate": 3.383330418051426e-05,
"loss": 2.1078,
"step": 3700
},
{
"epoch": 0.33,
"grad_norm": 0.1620977371931076,
"learning_rate": 3.361465803743222e-05,
"loss": 2.0184,
"step": 3750
},
{
"epoch": 0.33,
"grad_norm": 0.18495036661624908,
"learning_rate": 3.3396011894350187e-05,
"loss": 2.0117,
"step": 3800
},
{
"epoch": 0.34,
"grad_norm": 0.14189182221889496,
"learning_rate": 3.3177365751268144e-05,
"loss": 2.0641,
"step": 3850
},
{
"epoch": 0.34,
"grad_norm": 0.16044092178344727,
"learning_rate": 3.2958719608186114e-05,
"loss": 2.0496,
"step": 3900
},
{
"epoch": 0.35,
"grad_norm": 0.12184160947799683,
"learning_rate": 3.274007346510408e-05,
"loss": 2.0343,
"step": 3950
},
{
"epoch": 0.35,
"grad_norm": 0.10508805513381958,
"learning_rate": 3.252142732202204e-05,
"loss": 1.9982,
"step": 4000
},
{
"epoch": 0.35,
"grad_norm": 0.14695701003074646,
"learning_rate": 3.2302781178940006e-05,
"loss": 2.0163,
"step": 4050
},
{
"epoch": 0.36,
"grad_norm": 0.2915743291378021,
"learning_rate": 3.208413503585797e-05,
"loss": 2.0559,
"step": 4100
},
{
"epoch": 0.36,
"grad_norm": 0.16513386368751526,
"learning_rate": 3.1865488892775934e-05,
"loss": 2.0771,
"step": 4150
},
{
"epoch": 0.37,
"grad_norm": 0.11002013087272644,
"learning_rate": 3.16468427496939e-05,
"loss": 2.0516,
"step": 4200
},
{
"epoch": 0.37,
"grad_norm": 0.1383182853460312,
"learning_rate": 3.142819660661186e-05,
"loss": 2.0783,
"step": 4250
},
{
"epoch": 0.38,
"grad_norm": 0.12230634689331055,
"learning_rate": 3.1209550463529826e-05,
"loss": 2.0521,
"step": 4300
},
{
"epoch": 0.38,
"grad_norm": 0.15650221705436707,
"learning_rate": 3.099090432044779e-05,
"loss": 2.0358,
"step": 4350
},
{
"epoch": 0.38,
"grad_norm": 0.16002853214740753,
"learning_rate": 3.0772258177365753e-05,
"loss": 2.0217,
"step": 4400
},
{
"epoch": 0.39,
"grad_norm": 0.13085784018039703,
"learning_rate": 3.055361203428372e-05,
"loss": 2.0019,
"step": 4450
},
{
"epoch": 0.39,
"grad_norm": 0.11252173781394958,
"learning_rate": 3.0334965891201685e-05,
"loss": 2.0142,
"step": 4500
},
{
"epoch": 0.4,
"grad_norm": 0.13878273963928223,
"learning_rate": 3.0116319748119642e-05,
"loss": 2.0507,
"step": 4550
},
{
"epoch": 0.4,
"grad_norm": 0.13026192784309387,
"learning_rate": 2.9897673605037606e-05,
"loss": 2.0927,
"step": 4600
},
{
"epoch": 0.41,
"grad_norm": 0.12136498093605042,
"learning_rate": 2.9679027461955573e-05,
"loss": 2.0314,
"step": 4650
},
{
"epoch": 0.41,
"grad_norm": 0.1311144381761551,
"learning_rate": 2.9460381318873537e-05,
"loss": 2.0167,
"step": 4700
},
{
"epoch": 0.42,
"grad_norm": 0.13698582351207733,
"learning_rate": 2.92417351757915e-05,
"loss": 2.0603,
"step": 4750
},
{
"epoch": 0.42,
"grad_norm": 0.13138704001903534,
"learning_rate": 2.9023089032709465e-05,
"loss": 2.0659,
"step": 4800
},
{
"epoch": 0.42,
"grad_norm": 0.13684587180614471,
"learning_rate": 2.8804442889627432e-05,
"loss": 2.0426,
"step": 4850
},
{
"epoch": 0.43,
"grad_norm": 0.15447795391082764,
"learning_rate": 2.858579674654539e-05,
"loss": 2.0724,
"step": 4900
},
{
"epoch": 0.43,
"grad_norm": 0.2186385840177536,
"learning_rate": 2.8367150603463356e-05,
"loss": 2.0357,
"step": 4950
},
{
"epoch": 0.44,
"grad_norm": 0.12155057489871979,
"learning_rate": 2.814850446038132e-05,
"loss": 2.062,
"step": 5000
},
{
"epoch": 0.44,
"grad_norm": 0.28663524985313416,
"learning_rate": 2.7929858317299284e-05,
"loss": 1.9945,
"step": 5050
},
{
"epoch": 0.45,
"grad_norm": 0.16327697038650513,
"learning_rate": 2.7711212174217248e-05,
"loss": 2.0262,
"step": 5100
},
{
"epoch": 0.45,
"grad_norm": 0.11811967194080353,
"learning_rate": 2.7492566031135215e-05,
"loss": 1.9902,
"step": 5150
},
{
"epoch": 0.45,
"grad_norm": 0.14936485886573792,
"learning_rate": 2.727391988805318e-05,
"loss": 1.9835,
"step": 5200
},
{
"epoch": 0.46,
"grad_norm": 0.12473815679550171,
"learning_rate": 2.7055273744971136e-05,
"loss": 2.0101,
"step": 5250
},
{
"epoch": 0.46,
"grad_norm": 0.12709933519363403,
"learning_rate": 2.6836627601889104e-05,
"loss": 2.0394,
"step": 5300
},
{
"epoch": 0.47,
"grad_norm": 0.2092856615781784,
"learning_rate": 2.6617981458807068e-05,
"loss": 2.0482,
"step": 5350
},
{
"epoch": 0.47,
"grad_norm": 0.10988406836986542,
"learning_rate": 2.639933531572503e-05,
"loss": 2.0043,
"step": 5400
},
{
"epoch": 0.48,
"grad_norm": 0.11755118519067764,
"learning_rate": 2.6180689172642995e-05,
"loss": 2.04,
"step": 5450
},
{
"epoch": 0.48,
"grad_norm": 0.12184648215770721,
"learning_rate": 2.5962043029560963e-05,
"loss": 2.0741,
"step": 5500
},
{
"epoch": 0.49,
"grad_norm": 0.19833241403102875,
"learning_rate": 2.574339688647892e-05,
"loss": 2.0551,
"step": 5550
},
{
"epoch": 0.49,
"grad_norm": 0.1213446855545044,
"learning_rate": 2.5524750743396887e-05,
"loss": 2.0349,
"step": 5600
},
{
"epoch": 0.49,
"grad_norm": 0.10714009404182434,
"learning_rate": 2.530610460031485e-05,
"loss": 1.9495,
"step": 5650
},
{
"epoch": 0.5,
"grad_norm": 0.11714328825473785,
"learning_rate": 2.5087458457232815e-05,
"loss": 2.0051,
"step": 5700
},
{
"epoch": 0.5,
"grad_norm": 0.14028282463550568,
"learning_rate": 2.486881231415078e-05,
"loss": 2.0059,
"step": 5750
},
{
"epoch": 0.51,
"grad_norm": 0.11705906689167023,
"learning_rate": 2.4650166171068746e-05,
"loss": 2.0374,
"step": 5800
},
{
"epoch": 0.51,
"grad_norm": 0.15596671402454376,
"learning_rate": 2.4431520027986707e-05,
"loss": 2.0238,
"step": 5850
},
{
"epoch": 0.52,
"grad_norm": 0.13032393157482147,
"learning_rate": 2.421287388490467e-05,
"loss": 2.0565,
"step": 5900
},
{
"epoch": 0.52,
"grad_norm": 0.14617374539375305,
"learning_rate": 2.3994227741822638e-05,
"loss": 2.0515,
"step": 5950
},
{
"epoch": 0.52,
"grad_norm": 0.156155064702034,
"learning_rate": 2.37755815987406e-05,
"loss": 2.0327,
"step": 6000
},
{
"epoch": 0.53,
"grad_norm": 0.11759106069803238,
"learning_rate": 2.3556935455658562e-05,
"loss": 2.0721,
"step": 6050
},
{
"epoch": 0.53,
"grad_norm": 0.1545974314212799,
"learning_rate": 2.3338289312576526e-05,
"loss": 1.9866,
"step": 6100
},
{
"epoch": 0.54,
"grad_norm": 0.13251113891601562,
"learning_rate": 2.3119643169494493e-05,
"loss": 2.0147,
"step": 6150
},
{
"epoch": 0.54,
"grad_norm": 0.1533696949481964,
"learning_rate": 2.2900997026412454e-05,
"loss": 2.0267,
"step": 6200
},
{
"epoch": 0.55,
"grad_norm": 0.12238262593746185,
"learning_rate": 2.2682350883330418e-05,
"loss": 2.0202,
"step": 6250
},
{
"epoch": 0.55,
"grad_norm": 0.13370424509048462,
"learning_rate": 2.2463704740248385e-05,
"loss": 2.0542,
"step": 6300
},
{
"epoch": 0.56,
"grad_norm": 0.135158509016037,
"learning_rate": 2.2245058597166346e-05,
"loss": 2.0228,
"step": 6350
},
{
"epoch": 0.56,
"grad_norm": 0.1633879691362381,
"learning_rate": 2.202641245408431e-05,
"loss": 2.014,
"step": 6400
},
{
"epoch": 0.56,
"grad_norm": 0.14239120483398438,
"learning_rate": 2.1807766311002277e-05,
"loss": 2.1116,
"step": 6450
},
{
"epoch": 0.57,
"grad_norm": 0.15084555745124817,
"learning_rate": 2.1589120167920237e-05,
"loss": 2.0803,
"step": 6500
},
{
"epoch": 0.57,
"grad_norm": 0.12641365826129913,
"learning_rate": 2.13704740248382e-05,
"loss": 2.0501,
"step": 6550
},
{
"epoch": 0.58,
"grad_norm": 0.18917310237884521,
"learning_rate": 2.115182788175617e-05,
"loss": 2.0069,
"step": 6600
},
{
"epoch": 0.58,
"grad_norm": 0.1685107797384262,
"learning_rate": 2.0933181738674133e-05,
"loss": 2.0559,
"step": 6650
},
{
"epoch": 0.59,
"grad_norm": 0.1830686628818512,
"learning_rate": 2.0714535595592093e-05,
"loss": 2.0366,
"step": 6700
},
{
"epoch": 0.59,
"grad_norm": 0.11661963164806366,
"learning_rate": 2.0495889452510057e-05,
"loss": 2.0451,
"step": 6750
},
{
"epoch": 0.59,
"grad_norm": 0.15775232017040253,
"learning_rate": 2.0277243309428024e-05,
"loss": 2.0785,
"step": 6800
},
{
"epoch": 0.6,
"grad_norm": 0.1487419754266739,
"learning_rate": 2.0058597166345985e-05,
"loss": 2.0231,
"step": 6850
},
{
"epoch": 0.6,
"grad_norm": 0.11676029115915298,
"learning_rate": 1.983995102326395e-05,
"loss": 2.0444,
"step": 6900
},
{
"epoch": 0.61,
"grad_norm": 0.16297248005867004,
"learning_rate": 1.9621304880181916e-05,
"loss": 2.0288,
"step": 6950
},
{
"epoch": 0.61,
"grad_norm": 0.10936591029167175,
"learning_rate": 1.940265873709988e-05,
"loss": 2.0973,
"step": 7000
},
{
"epoch": 0.62,
"grad_norm": 0.13290850818157196,
"learning_rate": 1.918401259401784e-05,
"loss": 2.0536,
"step": 7050
},
{
"epoch": 0.62,
"grad_norm": 0.1514565795660019,
"learning_rate": 1.8965366450935808e-05,
"loss": 1.9953,
"step": 7100
},
{
"epoch": 0.63,
"grad_norm": 0.14755938947200775,
"learning_rate": 1.874672030785377e-05,
"loss": 2.0123,
"step": 7150
},
{
"epoch": 0.63,
"grad_norm": 0.14362554252147675,
"learning_rate": 1.8528074164771732e-05,
"loss": 2.0726,
"step": 7200
},
{
"epoch": 0.63,
"grad_norm": 0.13935257494449615,
"learning_rate": 1.83094280216897e-05,
"loss": 2.0586,
"step": 7250
},
{
"epoch": 0.64,
"grad_norm": 0.17782963812351227,
"learning_rate": 1.8090781878607663e-05,
"loss": 2.0021,
"step": 7300
},
{
"epoch": 0.64,
"grad_norm": 0.11529221385717392,
"learning_rate": 1.7872135735525627e-05,
"loss": 2.0405,
"step": 7350
},
{
"epoch": 0.65,
"grad_norm": 0.20195411145687103,
"learning_rate": 1.765348959244359e-05,
"loss": 2.0568,
"step": 7400
},
{
"epoch": 0.65,
"grad_norm": 0.1562044322490692,
"learning_rate": 1.7434843449361555e-05,
"loss": 2.0669,
"step": 7450
},
{
"epoch": 0.66,
"grad_norm": 0.1276342272758484,
"learning_rate": 1.721619730627952e-05,
"loss": 2.0323,
"step": 7500
},
{
"epoch": 0.66,
"grad_norm": 0.12369738519191742,
"learning_rate": 1.699755116319748e-05,
"loss": 2.0451,
"step": 7550
},
{
"epoch": 0.66,
"grad_norm": 0.12338880449533463,
"learning_rate": 1.6778905020115447e-05,
"loss": 2.0672,
"step": 7600
},
{
"epoch": 0.67,
"grad_norm": 0.1343850940465927,
"learning_rate": 1.656025887703341e-05,
"loss": 2.0319,
"step": 7650
},
{
"epoch": 0.67,
"grad_norm": 0.11484856903553009,
"learning_rate": 1.6341612733951375e-05,
"loss": 2.0128,
"step": 7700
},
{
"epoch": 0.68,
"grad_norm": 0.1755642592906952,
"learning_rate": 1.612296659086934e-05,
"loss": 2.0056,
"step": 7750
},
{
"epoch": 0.68,
"grad_norm": 0.1479136049747467,
"learning_rate": 1.5904320447787302e-05,
"loss": 2.0359,
"step": 7800
},
{
"epoch": 0.69,
"grad_norm": 0.14136551320552826,
"learning_rate": 1.5685674304705266e-05,
"loss": 2.0306,
"step": 7850
},
{
"epoch": 0.69,
"grad_norm": 0.24016603827476501,
"learning_rate": 1.546702816162323e-05,
"loss": 2.0095,
"step": 7900
},
{
"epoch": 0.7,
"grad_norm": 0.1085515171289444,
"learning_rate": 1.5248382018541194e-05,
"loss": 2.0747,
"step": 7950
},
{
"epoch": 0.7,
"grad_norm": 0.10604669898748398,
"learning_rate": 1.5029735875459158e-05,
"loss": 1.9993,
"step": 8000
},
{
"epoch": 0.7,
"grad_norm": 0.12853656709194183,
"learning_rate": 1.481108973237712e-05,
"loss": 2.08,
"step": 8050
},
{
"epoch": 0.71,
"grad_norm": 0.13124075531959534,
"learning_rate": 1.4592443589295086e-05,
"loss": 2.0017,
"step": 8100
},
{
"epoch": 0.71,
"grad_norm": 0.12069742381572723,
"learning_rate": 1.437379744621305e-05,
"loss": 2.0255,
"step": 8150
},
{
"epoch": 0.72,
"grad_norm": 0.17367246747016907,
"learning_rate": 1.4155151303131015e-05,
"loss": 1.977,
"step": 8200
},
{
"epoch": 0.72,
"grad_norm": 0.17460305988788605,
"learning_rate": 1.3936505160048976e-05,
"loss": 2.0414,
"step": 8250
},
{
"epoch": 0.73,
"grad_norm": 0.11902675032615662,
"learning_rate": 1.3717859016966941e-05,
"loss": 2.0401,
"step": 8300
},
{
"epoch": 0.73,
"grad_norm": 0.1274511069059372,
"learning_rate": 1.3499212873884905e-05,
"loss": 2.0513,
"step": 8350
},
{
"epoch": 0.73,
"grad_norm": 0.12478269636631012,
"learning_rate": 1.3280566730802868e-05,
"loss": 2.0096,
"step": 8400
},
{
"epoch": 0.74,
"grad_norm": 0.12884369492530823,
"learning_rate": 1.3061920587720833e-05,
"loss": 2.0188,
"step": 8450
},
{
"epoch": 0.74,
"grad_norm": 0.13595916330814362,
"learning_rate": 1.2843274444638797e-05,
"loss": 2.0259,
"step": 8500
},
{
"epoch": 0.75,
"grad_norm": 0.17813335359096527,
"learning_rate": 1.2624628301556763e-05,
"loss": 2.0578,
"step": 8550
},
{
"epoch": 0.75,
"grad_norm": 0.1450045257806778,
"learning_rate": 1.2405982158474727e-05,
"loss": 2.0138,
"step": 8600
},
{
"epoch": 0.76,
"grad_norm": 0.1711457371711731,
"learning_rate": 1.2187336015392689e-05,
"loss": 1.9998,
"step": 8650
},
{
"epoch": 0.76,
"grad_norm": 0.13330256938934326,
"learning_rate": 1.1968689872310653e-05,
"loss": 1.9919,
"step": 8700
},
{
"epoch": 0.77,
"grad_norm": 0.1473345309495926,
"learning_rate": 1.1750043729228617e-05,
"loss": 2.071,
"step": 8750
},
{
"epoch": 0.77,
"grad_norm": 0.16961540281772614,
"learning_rate": 1.153139758614658e-05,
"loss": 2.0439,
"step": 8800
},
{
"epoch": 0.77,
"grad_norm": 0.14884643256664276,
"learning_rate": 1.1312751443064546e-05,
"loss": 2.066,
"step": 8850
},
{
"epoch": 0.78,
"grad_norm": 0.17238876223564148,
"learning_rate": 1.1094105299982508e-05,
"loss": 1.9937,
"step": 8900
},
{
"epoch": 0.78,
"grad_norm": 0.18265317380428314,
"learning_rate": 1.0875459156900472e-05,
"loss": 2.0032,
"step": 8950
},
{
"epoch": 0.79,
"grad_norm": 0.14806517958641052,
"learning_rate": 1.0656813013818436e-05,
"loss": 2.0888,
"step": 9000
},
{
"epoch": 0.79,
"grad_norm": 0.12588676810264587,
"learning_rate": 1.04381668707364e-05,
"loss": 2.0247,
"step": 9050
},
{
"epoch": 0.8,
"grad_norm": 0.1755397915840149,
"learning_rate": 1.0219520727654366e-05,
"loss": 2.0018,
"step": 9100
},
{
"epoch": 0.8,
"grad_norm": 0.15915009379386902,
"learning_rate": 1.0000874584572328e-05,
"loss": 2.0219,
"step": 9150
},
{
"epoch": 0.8,
"grad_norm": 0.15032340586185455,
"learning_rate": 9.782228441490293e-06,
"loss": 2.0401,
"step": 9200
},
{
"epoch": 0.81,
"grad_norm": 0.1431685835123062,
"learning_rate": 9.563582298408257e-06,
"loss": 2.1078,
"step": 9250
},
{
"epoch": 0.81,
"grad_norm": 0.16817353665828705,
"learning_rate": 9.34493615532622e-06,
"loss": 2.0233,
"step": 9300
},
{
"epoch": 0.82,
"grad_norm": 0.11254491657018661,
"learning_rate": 9.126290012244185e-06,
"loss": 2.0385,
"step": 9350
},
{
"epoch": 0.82,
"grad_norm": 0.1725805401802063,
"learning_rate": 8.907643869162147e-06,
"loss": 2.0767,
"step": 9400
},
{
"epoch": 0.83,
"grad_norm": 0.15062490105628967,
"learning_rate": 8.688997726080113e-06,
"loss": 2.0677,
"step": 9450
},
{
"epoch": 0.83,
"grad_norm": 0.09490929543972015,
"learning_rate": 8.470351582998077e-06,
"loss": 2.059,
"step": 9500
},
{
"epoch": 0.84,
"grad_norm": 0.13435165584087372,
"learning_rate": 8.25170543991604e-06,
"loss": 2.0475,
"step": 9550
},
{
"epoch": 0.84,
"grad_norm": 0.17339274287223816,
"learning_rate": 8.033059296834005e-06,
"loss": 2.0018,
"step": 9600
},
{
"epoch": 0.84,
"grad_norm": 0.18383900821208954,
"learning_rate": 7.814413153751967e-06,
"loss": 2.0207,
"step": 9650
},
{
"epoch": 0.85,
"grad_norm": 0.15471112728118896,
"learning_rate": 7.5957670106699325e-06,
"loss": 2.1015,
"step": 9700
},
{
"epoch": 0.85,
"grad_norm": 0.13521017134189606,
"learning_rate": 7.3771208675878956e-06,
"loss": 2.0512,
"step": 9750
},
{
"epoch": 0.86,
"grad_norm": 0.15841689705848694,
"learning_rate": 7.15847472450586e-06,
"loss": 2.0085,
"step": 9800
},
{
"epoch": 0.86,
"grad_norm": 0.16433627903461456,
"learning_rate": 6.939828581423824e-06,
"loss": 2.0714,
"step": 9850
},
{
"epoch": 0.87,
"grad_norm": 0.23176471889019012,
"learning_rate": 6.721182438341787e-06,
"loss": 2.0268,
"step": 9900
},
{
"epoch": 0.87,
"grad_norm": 0.16546279191970825,
"learning_rate": 6.502536295259752e-06,
"loss": 2.0405,
"step": 9950
},
{
"epoch": 0.87,
"grad_norm": 0.1396203637123108,
"learning_rate": 6.283890152177715e-06,
"loss": 2.0846,
"step": 10000
},
{
"epoch": 0.88,
"grad_norm": 0.13313227891921997,
"learning_rate": 6.06524400909568e-06,
"loss": 2.0844,
"step": 10050
},
{
"epoch": 0.88,
"grad_norm": 0.14923414587974548,
"learning_rate": 5.846597866013644e-06,
"loss": 2.0501,
"step": 10100
},
{
"epoch": 0.89,
"grad_norm": 0.12807855010032654,
"learning_rate": 5.627951722931608e-06,
"loss": 2.0571,
"step": 10150
},
{
"epoch": 0.89,
"grad_norm": 0.15558409690856934,
"learning_rate": 5.4093055798495716e-06,
"loss": 2.0353,
"step": 10200
},
{
"epoch": 0.9,
"grad_norm": 0.1312248706817627,
"learning_rate": 5.1906594367675355e-06,
"loss": 2.0441,
"step": 10250
},
{
"epoch": 0.9,
"grad_norm": 0.15154685080051422,
"learning_rate": 4.9720132936855e-06,
"loss": 2.0675,
"step": 10300
},
{
"epoch": 0.91,
"grad_norm": 0.1447318196296692,
"learning_rate": 4.753367150603463e-06,
"loss": 1.9998,
"step": 10350
},
{
"epoch": 0.91,
"grad_norm": 0.1512334793806076,
"learning_rate": 4.534721007521427e-06,
"loss": 2.0112,
"step": 10400
},
{
"epoch": 0.91,
"grad_norm": 0.13242949545383453,
"learning_rate": 4.316074864439391e-06,
"loss": 2.0191,
"step": 10450
},
{
"epoch": 0.92,
"grad_norm": 0.13372650742530823,
"learning_rate": 4.097428721357356e-06,
"loss": 2.0326,
"step": 10500
},
{
"epoch": 0.92,
"grad_norm": 0.1071472018957138,
"learning_rate": 3.87878257827532e-06,
"loss": 2.0598,
"step": 10550
},
{
"epoch": 0.93,
"grad_norm": 0.14204634726047516,
"learning_rate": 3.6601364351932836e-06,
"loss": 2.0241,
"step": 10600
},
{
"epoch": 0.93,
"grad_norm": 0.11200432479381561,
"learning_rate": 3.441490292111247e-06,
"loss": 1.9981,
"step": 10650
},
{
"epoch": 0.94,
"grad_norm": 0.17016327381134033,
"learning_rate": 3.222844149029211e-06,
"loss": 1.9943,
"step": 10700
},
{
"epoch": 0.94,
"grad_norm": 0.15685972571372986,
"learning_rate": 3.004198005947175e-06,
"loss": 2.0516,
"step": 10750
},
{
"epoch": 0.94,
"grad_norm": 0.1390787661075592,
"learning_rate": 2.7855518628651393e-06,
"loss": 2.0148,
"step": 10800
},
{
"epoch": 0.95,
"grad_norm": 0.1561822146177292,
"learning_rate": 2.566905719783103e-06,
"loss": 1.9953,
"step": 10850
},
{
"epoch": 0.95,
"grad_norm": 0.1486501395702362,
"learning_rate": 2.348259576701067e-06,
"loss": 2.0293,
"step": 10900
},
{
"epoch": 0.96,
"grad_norm": 0.12501879036426544,
"learning_rate": 2.129613433619031e-06,
"loss": 2.0506,
"step": 10950
},
{
"epoch": 0.96,
"grad_norm": 0.12802496552467346,
"learning_rate": 1.9109672905369953e-06,
"loss": 1.9748,
"step": 11000
},
{
"epoch": 0.97,
"grad_norm": 0.11280578374862671,
"learning_rate": 1.6923211474549588e-06,
"loss": 2.0521,
"step": 11050
},
{
"epoch": 0.97,
"grad_norm": 0.13323938846588135,
"learning_rate": 1.473675004372923e-06,
"loss": 2.0329,
"step": 11100
},
{
"epoch": 0.97,
"grad_norm": 0.1350185126066208,
"learning_rate": 1.2550288612908868e-06,
"loss": 2.0562,
"step": 11150
},
{
"epoch": 0.98,
"grad_norm": 0.14205139875411987,
"learning_rate": 1.036382718208851e-06,
"loss": 2.0528,
"step": 11200
},
{
"epoch": 0.98,
"grad_norm": 0.19515833258628845,
"learning_rate": 8.177365751268147e-07,
"loss": 1.9489,
"step": 11250
},
{
"epoch": 0.99,
"grad_norm": 0.15463581681251526,
"learning_rate": 5.990904320447787e-07,
"loss": 2.0119,
"step": 11300
},
{
"epoch": 0.99,
"grad_norm": 0.13750217854976654,
"learning_rate": 3.8044428896274276e-07,
"loss": 2.0552,
"step": 11350
},
{
"epoch": 1.0,
"grad_norm": 0.1413380652666092,
"learning_rate": 1.6179814588070666e-07,
"loss": 2.0558,
"step": 11400
}
],
"logging_steps": 50,
"max_steps": 11436,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 2.4504387448594235e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}