{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.980132450331126,
  "eval_steps": 500,
  "global_step": 225,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.013245033112582781,
      "grad_norm": 0.8096176088552035,
      "learning_rate": 8.695652173913044e-06,
      "loss": 1.2541,
      "step": 1
    },
    {
      "epoch": 0.026490066225165563,
      "grad_norm": 0.8050822017472643,
      "learning_rate": 1.739130434782609e-05,
      "loss": 1.227,
      "step": 2
    },
    {
      "epoch": 0.039735099337748346,
      "grad_norm": 0.7944772711887119,
      "learning_rate": 2.608695652173913e-05,
      "loss": 1.2415,
      "step": 3
    },
    {
      "epoch": 0.052980132450331126,
      "grad_norm": 0.7598134845438774,
      "learning_rate": 3.478260869565218e-05,
      "loss": 1.1949,
      "step": 4
    },
    {
      "epoch": 0.06622516556291391,
      "grad_norm": 0.7683127560022982,
      "learning_rate": 4.347826086956522e-05,
      "loss": 1.2093,
      "step": 5
    },
    {
      "epoch": 0.07947019867549669,
      "grad_norm": 0.5344525931760804,
      "learning_rate": 5.217391304347826e-05,
      "loss": 1.1036,
      "step": 6
    },
    {
      "epoch": 0.09271523178807947,
      "grad_norm": 0.4587044664340658,
      "learning_rate": 6.086956521739131e-05,
      "loss": 1.0166,
      "step": 7
    },
    {
      "epoch": 0.10596026490066225,
      "grad_norm": 0.4868625164917359,
      "learning_rate": 6.956521739130436e-05,
      "loss": 0.955,
      "step": 8
    },
    {
      "epoch": 0.11920529801324503,
      "grad_norm": 0.5418471125188639,
      "learning_rate": 7.82608695652174e-05,
      "loss": 0.8997,
      "step": 9
    },
    {
      "epoch": 0.13245033112582782,
      "grad_norm": 0.5223349521251892,
      "learning_rate": 8.695652173913044e-05,
      "loss": 0.8113,
      "step": 10
    },
    {
      "epoch": 0.1456953642384106,
      "grad_norm": 0.4786982568033246,
      "learning_rate": 9.565217391304348e-05,
      "loss": 0.7325,
      "step": 11
    },
    {
      "epoch": 0.15894039735099338,
      "grad_norm": 0.46957216029807536,
      "learning_rate": 0.00010434782608695653,
      "loss": 0.6606,
      "step": 12
    },
    {
      "epoch": 0.17218543046357615,
      "grad_norm": 0.38029367288689914,
      "learning_rate": 0.00011304347826086956,
      "loss": 0.5808,
      "step": 13
    },
    {
      "epoch": 0.18543046357615894,
      "grad_norm": 0.24720582418095602,
      "learning_rate": 0.00012173913043478263,
      "loss": 0.5613,
      "step": 14
    },
    {
      "epoch": 0.1986754966887417,
      "grad_norm": 0.23099067802861695,
      "learning_rate": 0.00013043478260869567,
      "loss": 0.5391,
      "step": 15
    },
    {
      "epoch": 0.2119205298013245,
      "grad_norm": 0.20957820248410008,
      "learning_rate": 0.0001391304347826087,
      "loss": 0.539,
      "step": 16
    },
    {
      "epoch": 0.2251655629139073,
      "grad_norm": 0.21711931182463448,
      "learning_rate": 0.00014782608695652173,
      "loss": 0.5268,
      "step": 17
    },
    {
      "epoch": 0.23841059602649006,
      "grad_norm": 0.1951790595421549,
      "learning_rate": 0.0001565217391304348,
      "loss": 0.4963,
      "step": 18
    },
    {
      "epoch": 0.25165562913907286,
      "grad_norm": 0.1826409685431601,
      "learning_rate": 0.00016521739130434784,
      "loss": 0.4952,
      "step": 19
    },
    {
      "epoch": 0.26490066225165565,
      "grad_norm": 0.14373385619543355,
      "learning_rate": 0.00017391304347826088,
      "loss": 0.4837,
      "step": 20
    },
    {
      "epoch": 0.2781456953642384,
      "grad_norm": 0.12173908533781636,
      "learning_rate": 0.00018260869565217392,
      "loss": 0.4634,
      "step": 21
    },
    {
      "epoch": 0.2913907284768212,
      "grad_norm": 0.12297735060498352,
      "learning_rate": 0.00019130434782608697,
      "loss": 0.4573,
      "step": 22
    },
    {
      "epoch": 0.304635761589404,
      "grad_norm": 0.10994270746188307,
      "learning_rate": 0.0002,
      "loss": 0.4683,
      "step": 23
    },
    {
      "epoch": 0.31788079470198677,
      "grad_norm": 0.11351044281096902,
      "learning_rate": 0.00019998790632601496,
      "loss": 0.4322,
      "step": 24
    },
    {
      "epoch": 0.33112582781456956,
      "grad_norm": 0.11243087776192183,
      "learning_rate": 0.00019995162822919883,
      "loss": 0.4516,
      "step": 25
    },
    {
      "epoch": 0.3443708609271523,
      "grad_norm": 0.11510175208476785,
      "learning_rate": 0.00019989117448426108,
      "loss": 0.4499,
      "step": 26
    },
    {
      "epoch": 0.3576158940397351,
      "grad_norm": 0.11693433753737806,
      "learning_rate": 0.00019980655971335945,
      "loss": 0.4542,
      "step": 27
    },
    {
      "epoch": 0.3708609271523179,
      "grad_norm": 0.11467246423231502,
      "learning_rate": 0.00019969780438256293,
      "loss": 0.4337,
      "step": 28
    },
    {
      "epoch": 0.3841059602649007,
      "grad_norm": 0.11115653137915112,
      "learning_rate": 0.0001995649347969019,
      "loss": 0.4263,
      "step": 29
    },
    {
      "epoch": 0.3973509933774834,
      "grad_norm": 0.11024786542483019,
      "learning_rate": 0.00019940798309400526,
      "loss": 0.4342,
      "step": 30
    },
    {
      "epoch": 0.4105960264900662,
      "grad_norm": 0.10312580553142063,
      "learning_rate": 0.00019922698723632767,
      "loss": 0.4267,
      "step": 31
    },
    {
      "epoch": 0.423841059602649,
      "grad_norm": 0.11074151337400631,
      "learning_rate": 0.00019902199100196697,
      "loss": 0.4286,
      "step": 32
    },
    {
      "epoch": 0.4370860927152318,
      "grad_norm": 0.09029943151079976,
      "learning_rate": 0.0001987930439740757,
      "loss": 0.4152,
      "step": 33
    },
    {
      "epoch": 0.4503311258278146,
      "grad_norm": 0.09101826700354056,
      "learning_rate": 0.00019854020152886814,
      "loss": 0.4313,
      "step": 34
    },
    {
      "epoch": 0.46357615894039733,
      "grad_norm": 0.0914630983642065,
      "learning_rate": 0.00019826352482222638,
      "loss": 0.4117,
      "step": 35
    },
    {
      "epoch": 0.4768211920529801,
      "grad_norm": 0.09219697877770537,
      "learning_rate": 0.00019796308077490817,
      "loss": 0.4175,
      "step": 36
    },
    {
      "epoch": 0.4900662251655629,
      "grad_norm": 0.08852002864296264,
      "learning_rate": 0.00019763894205636072,
      "loss": 0.4041,
      "step": 37
    },
    {
      "epoch": 0.5033112582781457,
      "grad_norm": 0.08580676378486166,
      "learning_rate": 0.00019729118706714375,
      "loss": 0.404,
      "step": 38
    },
    {
      "epoch": 0.5165562913907285,
      "grad_norm": 0.08598698501328113,
      "learning_rate": 0.00019691989991996663,
      "loss": 0.4087,
      "step": 39
    },
    {
      "epoch": 0.5298013245033113,
      "grad_norm": 0.08961053716539952,
      "learning_rate": 0.00019652517041934356,
      "loss": 0.4014,
      "step": 40
    },
    {
      "epoch": 0.543046357615894,
      "grad_norm": 0.08443482401797175,
      "learning_rate": 0.00019610709403987246,
      "loss": 0.4137,
      "step": 41
    },
    {
      "epoch": 0.5562913907284768,
      "grad_norm": 0.08466021640310874,
      "learning_rate": 0.00019566577190314197,
      "loss": 0.4071,
      "step": 42
    },
    {
      "epoch": 0.5695364238410596,
      "grad_norm": 0.08784527020927076,
      "learning_rate": 0.00019520131075327298,
      "loss": 0.4061,
      "step": 43
    },
    {
      "epoch": 0.5827814569536424,
      "grad_norm": 0.08325332082087357,
      "learning_rate": 0.00019471382293110003,
      "loss": 0.3957,
      "step": 44
    },
    {
      "epoch": 0.5960264900662252,
      "grad_norm": 0.08614805595781429,
      "learning_rate": 0.0001942034263469989,
      "loss": 0.4053,
      "step": 45
    },
    {
      "epoch": 0.609271523178808,
      "grad_norm": 0.07902174863469037,
      "learning_rate": 0.00019367024445236754,
      "loss": 0.3987,
      "step": 46
    },
    {
      "epoch": 0.6225165562913907,
      "grad_norm": 0.08133695710941313,
      "learning_rate": 0.00019311440620976597,
      "loss": 0.3942,
      "step": 47
    },
    {
      "epoch": 0.6357615894039735,
      "grad_norm": 0.08276360028919133,
      "learning_rate": 0.00019253604606172417,
      "loss": 0.3951,
      "step": 48
    },
    {
      "epoch": 0.6490066225165563,
      "grad_norm": 0.08194802489692825,
      "learning_rate": 0.00019193530389822363,
      "loss": 0.3917,
      "step": 49
    },
    {
      "epoch": 0.6622516556291391,
      "grad_norm": 0.08159974959706186,
      "learning_rate": 0.00019131232502286188,
      "loss": 0.3934,
      "step": 50
    },
    {
      "epoch": 0.6754966887417219,
      "grad_norm": 0.08170998905157066,
      "learning_rate": 0.00019066726011770726,
      "loss": 0.3851,
      "step": 51
    },
    {
      "epoch": 0.6887417218543046,
      "grad_norm": 0.08020907094953274,
      "learning_rate": 0.00019000026520685302,
      "loss": 0.3893,
      "step": 52
    },
    {
      "epoch": 0.7019867549668874,
      "grad_norm": 0.08034981466771474,
      "learning_rate": 0.00018931150161867916,
      "loss": 0.381,
      "step": 53
    },
    {
      "epoch": 0.7152317880794702,
      "grad_norm": 0.08444845993593682,
      "learning_rate": 0.00018860113594683148,
      "loss": 0.3915,
      "step": 54
    },
    {
      "epoch": 0.7284768211920529,
      "grad_norm": 0.08015215412606266,
      "learning_rate": 0.00018786934000992688,
      "loss": 0.3833,
      "step": 55
    },
    {
      "epoch": 0.7417218543046358,
      "grad_norm": 0.08464858931007045,
      "learning_rate": 0.00018711629080999504,
      "loss": 0.3826,
      "step": 56
    },
    {
      "epoch": 0.7549668874172185,
      "grad_norm": 0.08291520407405459,
      "learning_rate": 0.00018634217048966637,
      "loss": 0.3738,
      "step": 57
    },
    {
      "epoch": 0.7682119205298014,
      "grad_norm": 0.08660040487398858,
      "learning_rate": 0.0001855471662881164,
      "loss": 0.3856,
      "step": 58
    },
    {
      "epoch": 0.7814569536423841,
      "grad_norm": 0.0857196214995308,
      "learning_rate": 0.00018473147049577774,
      "loss": 0.3779,
      "step": 59
    },
    {
      "epoch": 0.7947019867549668,
      "grad_norm": 0.07987880371713715,
      "learning_rate": 0.00018389528040783012,
      "loss": 0.3766,
      "step": 60
    },
    {
      "epoch": 0.8079470198675497,
      "grad_norm": 0.08369440099668185,
      "learning_rate": 0.00018303879827647975,
      "loss": 0.3835,
      "step": 61
    },
    {
      "epoch": 0.8211920529801324,
      "grad_norm": 0.08373532556639413,
      "learning_rate": 0.00018216223126204007,
      "loss": 0.3745,
      "step": 62
    },
    {
      "epoch": 0.8344370860927153,
      "grad_norm": 0.08073536197157054,
      "learning_rate": 0.00018126579138282503,
      "loss": 0.3687,
      "step": 63
    },
    {
      "epoch": 0.847682119205298,
      "grad_norm": 0.08284465509601228,
      "learning_rate": 0.00018034969546386757,
      "loss": 0.3787,
      "step": 64
    },
    {
      "epoch": 0.8609271523178808,
      "grad_norm": 0.0842934427371451,
      "learning_rate": 0.00017941416508447536,
      "loss": 0.3873,
      "step": 65
    },
    {
      "epoch": 0.8741721854304636,
      "grad_norm": 0.08355593713327628,
      "learning_rate": 0.0001784594265246366,
      "loss": 0.3778,
      "step": 66
    },
    {
      "epoch": 0.8874172185430463,
      "grad_norm": 0.08950539941436171,
      "learning_rate": 0.000177485710710289,
      "loss": 0.3727,
      "step": 67
    },
    {
      "epoch": 0.9006622516556292,
      "grad_norm": 0.08710263548451828,
      "learning_rate": 0.00017649325315746478,
      "loss": 0.3808,
      "step": 68
    },
    {
      "epoch": 0.9139072847682119,
      "grad_norm": 0.0887614198652171,
      "learning_rate": 0.00017548229391532572,
      "loss": 0.3789,
      "step": 69
    },
    {
      "epoch": 0.9271523178807947,
      "grad_norm": 0.08666661250569707,
      "learning_rate": 0.0001744530775081015,
      "loss": 0.3732,
      "step": 70
    },
    {
      "epoch": 0.9403973509933775,
      "grad_norm": 0.0849525268450149,
      "learning_rate": 0.00017340585287594604,
      "loss": 0.3712,
      "step": 71
    },
    {
      "epoch": 0.9536423841059603,
      "grad_norm": 0.08625788315304235,
      "learning_rate": 0.00017234087331472497,
      "loss": 0.3597,
      "step": 72
    },
    {
      "epoch": 0.9668874172185431,
      "grad_norm": 0.07851130512605926,
      "learning_rate": 0.00017125839641475072,
      "loss": 0.3639,
      "step": 73
    },
    {
      "epoch": 0.9801324503311258,
      "grad_norm": 0.08964240238751611,
      "learning_rate": 0.00017015868399847768,
      "loss": 0.3844,
      "step": 74
    },
    {
      "epoch": 0.9933774834437086,
      "grad_norm": 0.08516340365396252,
      "learning_rate": 0.0001690420020571747,
      "loss": 0.372,
      "step": 75
    },
    {
      "epoch": 0.9933774834437086,
      "eval_loss": 0.3703567683696747,
      "eval_runtime": 46.123,
      "eval_samples_per_second": 21.941,
      "eval_steps_per_second": 0.694,
      "step": 75
    },
    {
      "epoch": 1.0066225165562914,
      "grad_norm": 0.07944382362889917,
      "learning_rate": 0.0001679086206865886,
      "loss": 0.3697,
      "step": 76
    },
    {
      "epoch": 1.0198675496688743,
      "grad_norm": 0.08265930361903498,
      "learning_rate": 0.00016675881402161536,
      "loss": 0.3551,
      "step": 77
    },
    {
      "epoch": 1.033112582781457,
      "grad_norm": 0.08703614399996357,
      "learning_rate": 0.000165592860169994,
      "loss": 0.3442,
      "step": 78
    },
    {
      "epoch": 1.0463576158940397,
      "grad_norm": 0.08916319509375828,
      "learning_rate": 0.0001644110411450398,
      "loss": 0.365,
      "step": 79
    },
    {
      "epoch": 1.0596026490066226,
      "grad_norm": 0.08703848127871557,
      "learning_rate": 0.00016321364279743266,
      "loss": 0.3611,
      "step": 80
    },
    {
      "epoch": 1.0728476821192052,
      "grad_norm": 0.09052558000694078,
      "learning_rate": 0.00016200095474607753,
      "loss": 0.3615,
      "step": 81
    },
    {
      "epoch": 1.086092715231788,
      "grad_norm": 0.08918100371610707,
      "learning_rate": 0.0001607732703080532,
      "loss": 0.342,
      "step": 82
    },
    {
      "epoch": 1.099337748344371,
      "grad_norm": 0.08576575268439565,
      "learning_rate": 0.0001595308864276666,
      "loss": 0.3598,
      "step": 83
    },
    {
      "epoch": 1.1125827814569536,
      "grad_norm": 0.08585017464402006,
      "learning_rate": 0.0001582741036046301,
      "loss": 0.3504,
      "step": 84
    },
    {
      "epoch": 1.1258278145695364,
      "grad_norm": 0.08593452414859805,
      "learning_rate": 0.00015700322582137827,
      "loss": 0.3432,
      "step": 85
    },
    {
      "epoch": 1.1390728476821192,
      "grad_norm": 0.08731970510720415,
      "learning_rate": 0.00015571856046954285,
      "loss": 0.3457,
      "step": 86
    },
    {
      "epoch": 1.152317880794702,
      "grad_norm": 0.0921843418842424,
      "learning_rate": 0.00015442041827560274,
      "loss": 0.3507,
      "step": 87
    },
    {
      "epoch": 1.1655629139072847,
      "grad_norm": 0.09651961400159455,
      "learning_rate": 0.00015310911322572753,
      "loss": 0.3596,
      "step": 88
    },
    {
      "epoch": 1.1788079470198676,
      "grad_norm": 0.08524005048376013,
      "learning_rate": 0.00015178496248983254,
      "loss": 0.3554,
      "step": 89
    },
    {
      "epoch": 1.1920529801324504,
      "grad_norm": 0.08859594152270273,
      "learning_rate": 0.000150448286344864,
      "loss": 0.3551,
      "step": 90
    },
    {
      "epoch": 1.205298013245033,
      "grad_norm": 0.0924808469627539,
      "learning_rate": 0.00014909940809733222,
      "loss": 0.3525,
      "step": 91
    },
    {
      "epoch": 1.218543046357616,
      "grad_norm": 0.08644059805052462,
      "learning_rate": 0.00014773865400511272,
      "loss": 0.3503,
      "step": 92
    },
    {
      "epoch": 1.2317880794701987,
      "grad_norm": 0.09131894341880005,
      "learning_rate": 0.00014636635319853275,
      "loss": 0.3571,
      "step": 93
    },
    {
      "epoch": 1.2450331125827814,
      "grad_norm": 0.08393682045402433,
      "learning_rate": 0.0001449828376007636,
      "loss": 0.3476,
      "step": 94
    },
    {
      "epoch": 1.2582781456953642,
      "grad_norm": 0.08696313045637266,
      "learning_rate": 0.00014358844184753712,
      "loss": 0.3594,
      "step": 95
    },
    {
      "epoch": 1.271523178807947,
      "grad_norm": 0.09458041630505085,
      "learning_rate": 0.00014218350320620624,
      "loss": 0.3626,
      "step": 96
    },
    {
      "epoch": 1.2847682119205297,
      "grad_norm": 0.08823303635376296,
      "learning_rate": 0.00014076836149416887,
      "loss": 0.3499,
      "step": 97
    },
    {
      "epoch": 1.2980132450331126,
      "grad_norm": 0.09294675372857181,
      "learning_rate": 0.00013934335899667527,
      "loss": 0.3539,
      "step": 98
    },
    {
      "epoch": 1.3112582781456954,
      "grad_norm": 0.08824268036877034,
      "learning_rate": 0.00013790884038403795,
      "loss": 0.3514,
      "step": 99
    },
    {
      "epoch": 1.3245033112582782,
      "grad_norm": 0.08535480262896947,
      "learning_rate": 0.00013646515262826552,
      "loss": 0.345,
      "step": 100
    },
    {
      "epoch": 1.3377483443708609,
      "grad_norm": 0.08847562725166169,
      "learning_rate": 0.00013501264491913906,
      "loss": 0.3616,
      "step": 101
    },
    {
      "epoch": 1.3509933774834437,
      "grad_norm": 0.08859058434854095,
      "learning_rate": 0.0001335516685797525,
      "loss": 0.3562,
      "step": 102
    },
    {
      "epoch": 1.3642384105960264,
      "grad_norm": 0.08715025975746184,
      "learning_rate": 0.00013208257698153677,
      "loss": 0.3455,
      "step": 103
    },
    {
      "epoch": 1.3774834437086092,
      "grad_norm": 0.0853594568437305,
      "learning_rate": 0.00013060572545878875,
      "loss": 0.346,
      "step": 104
    },
    {
      "epoch": 1.390728476821192,
      "grad_norm": 0.08722491192064814,
      "learning_rate": 0.00012912147122272523,
      "loss": 0.3555,
      "step": 105
    },
    {
      "epoch": 1.403973509933775,
      "grad_norm": 0.0871433664730764,
      "learning_rate": 0.00012763017327508305,
      "loss": 0.3556,
      "step": 106
    },
    {
      "epoch": 1.4172185430463577,
      "grad_norm": 0.08803547541904783,
      "learning_rate": 0.00012613219232128608,
      "loss": 0.3534,
      "step": 107
    },
    {
      "epoch": 1.4304635761589404,
      "grad_norm": 0.09122226233927531,
      "learning_rate": 0.00012462789068320017,
      "loss": 0.3569,
      "step": 108
    },
    {
      "epoch": 1.4437086092715232,
      "grad_norm": 0.09822341257641279,
      "learning_rate": 0.000123117632211497,
      "loss": 0.3633,
      "step": 109
    },
    {
      "epoch": 1.4569536423841059,
      "grad_norm": 0.09270090775666746,
      "learning_rate": 0.00012160178219764837,
      "loss": 0.3453,
      "step": 110
    },
    {
      "epoch": 1.4701986754966887,
      "grad_norm": 0.08925565696630358,
      "learning_rate": 0.00012008070728557186,
      "loss": 0.3508,
      "step": 111
    },
    {
      "epoch": 1.4834437086092715,
      "grad_norm": 0.09170653617303556,
      "learning_rate": 0.00011855477538294935,
      "loss": 0.3534,
      "step": 112
    },
    {
      "epoch": 1.4966887417218544,
      "grad_norm": 0.08583635619816832,
      "learning_rate": 0.00011702435557223987,
      "loss": 0.3463,
      "step": 113
    },
    {
      "epoch": 1.5099337748344372,
      "grad_norm": 0.08058809711878263,
      "learning_rate": 0.00011548981802140848,
      "loss": 0.3477,
      "step": 114
    },
    {
      "epoch": 1.5231788079470199,
      "grad_norm": 0.09093533643868798,
      "learning_rate": 0.00011395153389439233,
      "loss": 0.3512,
      "step": 115
    },
    {
      "epoch": 1.5364238410596025,
      "grad_norm": 0.09171376470501859,
      "learning_rate": 0.00011240987526132594,
      "loss": 0.3544,
      "step": 116
    },
    {
      "epoch": 1.5496688741721854,
      "grad_norm": 0.08586078909940174,
      "learning_rate": 0.00011086521500854745,
      "loss": 0.3694,
      "step": 117
    },
    {
      "epoch": 1.5629139072847682,
      "grad_norm": 0.08632019045566638,
      "learning_rate": 0.00010931792674840718,
      "loss": 0.3453,
      "step": 118
    },
    {
      "epoch": 1.576158940397351,
      "grad_norm": 0.09269094674353331,
      "learning_rate": 0.00010776838472890065,
      "loss": 0.3587,
      "step": 119
    },
    {
      "epoch": 1.589403973509934,
      "grad_norm": 0.08779002368050795,
      "learning_rate": 0.00010621696374314807,
      "loss": 0.3478,
      "step": 120
    },
    {
      "epoch": 1.6026490066225165,
      "grad_norm": 0.08586261022719192,
      "learning_rate": 0.00010466403903874176,
      "loss": 0.341,
      "step": 121
    },
    {
      "epoch": 1.6158940397350994,
      "grad_norm": 0.08611577193250892,
      "learning_rate": 0.0001031099862269837,
      "loss": 0.3558,
      "step": 122
    },
    {
      "epoch": 1.629139072847682,
      "grad_norm": 0.09316621499512412,
      "learning_rate": 0.0001015551811920351,
      "loss": 0.3541,
      "step": 123
    },
    {
      "epoch": 1.6423841059602649,
      "grad_norm": 0.08404147766450029,
      "learning_rate": 0.0001,
      "loss": 0.3489,
      "step": 124
    },
    {
      "epoch": 1.6556291390728477,
      "grad_norm": 0.08524287111150772,
      "learning_rate": 9.844481880796491e-05,
      "loss": 0.3541,
      "step": 125
    },
    {
      "epoch": 1.6688741721854305,
      "grad_norm": 0.08369196863657465,
      "learning_rate": 9.689001377301633e-05,
      "loss": 0.3421,
      "step": 126
    },
    {
      "epoch": 1.6821192052980134,
      "grad_norm": 0.08831018354579961,
      "learning_rate": 9.533596096125825e-05,
      "loss": 0.3484,
      "step": 127
    },
    {
      "epoch": 1.695364238410596,
      "grad_norm": 0.08931583825994703,
      "learning_rate": 9.378303625685195e-05,
      "loss": 0.3418,
      "step": 128
    },
    {
      "epoch": 1.7086092715231787,
      "grad_norm": 0.0920976409870365,
      "learning_rate": 9.223161527109937e-05,
      "loss": 0.3477,
      "step": 129
    },
    {
      "epoch": 1.7218543046357615,
      "grad_norm": 0.0866166191323527,
      "learning_rate": 9.068207325159284e-05,
      "loss": 0.3422,
      "step": 130
    },
    {
      "epoch": 1.7350993377483444,
      "grad_norm": 0.08394672431065998,
      "learning_rate": 8.913478499145254e-05,
      "loss": 0.337,
      "step": 131
    },
    {
      "epoch": 1.7483443708609272,
      "grad_norm": 0.08368403453651165,
      "learning_rate": 8.759012473867407e-05,
      "loss": 0.3487,
      "step": 132
    },
    {
      "epoch": 1.76158940397351,
      "grad_norm": 0.08503534775674756,
      "learning_rate": 8.604846610560771e-05,
      "loss": 0.3463,
      "step": 133
    },
    {
      "epoch": 1.7748344370860927,
      "grad_norm": 0.08495442186575057,
      "learning_rate": 8.451018197859153e-05,
      "loss": 0.3506,
      "step": 134
    },
    {
      "epoch": 1.7880794701986755,
      "grad_norm": 0.08766338307723749,
      "learning_rate": 8.297564442776014e-05,
      "loss": 0.3423,
      "step": 135
    },
    {
      "epoch": 1.8013245033112582,
      "grad_norm": 0.08162961612606438,
      "learning_rate": 8.144522461705067e-05,
      "loss": 0.3316,
      "step": 136
    },
    {
      "epoch": 1.814569536423841,
      "grad_norm": 0.08852249330426205,
      "learning_rate": 7.991929271442817e-05,
      "loss": 0.3483,
      "step": 137
    },
    {
      "epoch": 1.8278145695364238,
      "grad_norm": 0.08788889130608463,
      "learning_rate": 7.839821780235168e-05,
      "loss": 0.3554,
      "step": 138
    },
    {
      "epoch": 1.8410596026490067,
      "grad_norm": 0.08567621661342421,
      "learning_rate": 7.688236778850306e-05,
      "loss": 0.3333,
      "step": 139
    },
    {
      "epoch": 1.8543046357615895,
      "grad_norm": 0.09025227183243908,
      "learning_rate": 7.537210931679987e-05,
      "loss": 0.3461,
      "step": 140
    },
    {
      "epoch": 1.8675496688741722,
      "grad_norm": 0.0887176743957205,
      "learning_rate": 7.386780767871397e-05,
      "loss": 0.3459,
      "step": 141
    },
    {
      "epoch": 1.8807947019867548,
      "grad_norm": 0.08665996940712498,
      "learning_rate": 7.236982672491698e-05,
      "loss": 0.3539,
      "step": 142
    },
    {
      "epoch": 1.8940397350993377,
      "grad_norm": 0.08608862013105582,
      "learning_rate": 7.087852877727481e-05,
      "loss": 0.3418,
      "step": 143
    },
    {
      "epoch": 1.9072847682119205,
      "grad_norm": 0.08420947731369693,
      "learning_rate": 6.939427454121128e-05,
      "loss": 0.3385,
      "step": 144
    },
    {
      "epoch": 1.9205298013245033,
      "grad_norm": 0.08687771570570416,
      "learning_rate": 6.791742301846326e-05,
      "loss": 0.3484,
      "step": 145
    },
    {
      "epoch": 1.9337748344370862,
      "grad_norm": 0.09001811775951214,
      "learning_rate": 6.644833142024751e-05,
      "loss": 0.3482,
      "step": 146
    },
    {
      "epoch": 1.9470198675496688,
      "grad_norm": 0.08461468347282106,
      "learning_rate": 6.498735508086093e-05,
      "loss": 0.3384,
      "step": 147
    },
    {
      "epoch": 1.9602649006622517,
      "grad_norm": 0.08353611993941902,
      "learning_rate": 6.35348473717345e-05,
      "loss": 0.343,
      "step": 148
    },
    {
      "epoch": 1.9735099337748343,
      "grad_norm": 0.0834738694275141,
      "learning_rate": 6.209115961596208e-05,
      "loss": 0.3431,
      "step": 149
    },
    {
      "epoch": 1.9867549668874172,
      "grad_norm": 0.08599845820919347,
      "learning_rate": 6.065664100332478e-05,
      "loss": 0.3381,
      "step": 150
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.08781968968497832,
      "learning_rate": 5.923163850583113e-05,
      "loss": 0.3361,
      "step": 151
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.35156726837158203,
      "eval_runtime": 38.8035,
      "eval_samples_per_second": 26.08,
      "eval_steps_per_second": 0.825,
      "step": 151
    },
    {
      "epoch": 2.013245033112583,
      "grad_norm": 0.08189131042429836,
      "learning_rate": 5.781649679379378e-05,
      "loss": 0.3168,
      "step": 152
    },
    {
      "epoch": 2.0264900662251657,
      "grad_norm": 0.08590965338671859,
      "learning_rate": 5.6411558152462894e-05,
      "loss": 0.3327,
      "step": 153
    },
    {
      "epoch": 2.0397350993377485,
      "grad_norm": 0.08632653329140866,
      "learning_rate": 5.501716239923642e-05,
      "loss": 0.331,
      "step": 154
    },
    {
      "epoch": 2.052980132450331,
      "grad_norm": 0.08516842826462703,
      "learning_rate": 5.363364680146725e-05,
      "loss": 0.3306,
      "step": 155
    },
    {
      "epoch": 2.066225165562914,
      "grad_norm": 0.08496401039658237,
      "learning_rate": 5.226134599488728e-05,
      "loss": 0.3248,
      "step": 156
    },
    {
      "epoch": 2.0794701986754967,
      "grad_norm": 0.08826525390483432,
      "learning_rate": 5.090059190266779e-05,
      "loss": 0.3308,
      "step": 157
    },
    {
      "epoch": 2.0927152317880795,
      "grad_norm": 0.08487280637626197,
      "learning_rate": 4.955171365513603e-05,
      "loss": 0.3211,
      "step": 158
    },
    {
      "epoch": 2.1059602649006623,
      "grad_norm": 0.09382764910639449,
      "learning_rate": 4.821503751016746e-05,
      "loss": 0.3354,
      "step": 159
    },
    {
      "epoch": 2.119205298013245,
      "grad_norm": 0.08732672940741114,
      "learning_rate": 4.689088677427249e-05,
      "loss": 0.3315,
      "step": 160
    },
    {
      "epoch": 2.1324503311258276,
      "grad_norm": 0.09541697755263766,
      "learning_rate": 4.5579581724397255e-05,
      "loss": 0.3373,
      "step": 161
    },
    {
      "epoch": 2.1456953642384105,
      "grad_norm": 0.08867554361971618,
      "learning_rate": 4.428143953045717e-05,
      "loss": 0.3383,
      "step": 162
    },
    {
      "epoch": 2.1589403973509933,
      "grad_norm": 0.09288456090060858,
      "learning_rate": 4.2996774178621736e-05,
      "loss": 0.331,
      "step": 163
    },
    {
      "epoch": 2.172185430463576,
      "grad_norm": 0.08808813047917079,
      "learning_rate": 4.172589639536991e-05,
      "loss": 0.3223,
      "step": 164
    },
    {
      "epoch": 2.185430463576159,
      "grad_norm": 0.09275105554751231,
      "learning_rate": 4.046911357233343e-05,
      "loss": 0.3301,
      "step": 165
    },
    {
      "epoch": 2.198675496688742,
      "grad_norm": 0.09353735027294084,
      "learning_rate": 3.922672969194686e-05,
      "loss": 0.3295,
      "step": 166
    },
    {
      "epoch": 2.2119205298013247,
      "grad_norm": 0.09234588799290942,
      "learning_rate": 3.79990452539225e-05,
      "loss": 0.3214,
      "step": 167
    },
    {
      "epoch": 2.225165562913907,
      "grad_norm": 0.09179773375765557,
      "learning_rate": 3.678635720256737e-05,
      "loss": 0.3241,
      "step": 168
    },
    {
      "epoch": 2.23841059602649,
      "grad_norm": 0.08971692725792768,
      "learning_rate": 3.558895885496023e-05,
      "loss": 0.3175,
      "step": 169
    },
    {
      "epoch": 2.251655629139073,
      "grad_norm": 0.08939100980866099,
      "learning_rate": 3.440713983000601e-05,
      "loss": 0.3252,
      "step": 170
    },
    {
      "epoch": 2.2649006622516556,
      "grad_norm": 0.09306831321980909,
      "learning_rate": 3.324118597838464e-05,
      "loss": 0.3225,
      "step": 171
    },
    {
      "epoch": 2.2781456953642385,
      "grad_norm": 0.09091774211009096,
      "learning_rate": 3.209137931341143e-05,
      "loss": 0.3215,
      "step": 172
    },
    {
      "epoch": 2.2913907284768213,
      "grad_norm": 0.08998835153295978,
      "learning_rate": 3.0957997942825336e-05,
      "loss": 0.3332,
      "step": 173
    },
    {
      "epoch": 2.304635761589404,
      "grad_norm": 0.08999871518726542,
      "learning_rate": 2.9841316001522347e-05,
      "loss": 0.3265,
      "step": 174
    },
    {
      "epoch": 2.3178807947019866,
      "grad_norm": 0.08874688997641272,
      "learning_rate": 2.874160358524931e-05,
      "loss": 0.328,
      "step": 175
    },
    {
      "epoch": 2.3311258278145695,
      "grad_norm": 0.08979245895359222,
      "learning_rate": 2.7659126685275027e-05,
      "loss": 0.3288,
      "step": 176
    },
    {
      "epoch": 2.3443708609271523,
      "grad_norm": 0.09322170086883196,
      "learning_rate": 2.659414712405398e-05,
      "loss": 0.3264,
      "step": 177
    },
    {
      "epoch": 2.357615894039735,
      "grad_norm": 0.0873785964065595,
      "learning_rate": 2.5546922491898495e-05,
      "loss": 0.3283,
      "step": 178
    },
    {
      "epoch": 2.370860927152318,
      "grad_norm": 0.09137697607964013,
      "learning_rate": 2.451770608467432e-05,
      "loss": 0.3265,
      "step": 179
    },
    {
      "epoch": 2.384105960264901,
      "grad_norm": 0.08934971281847022,
      "learning_rate": 2.3506746842535242e-05,
      "loss": 0.3197,
      "step": 180
    },
    {
      "epoch": 2.3973509933774833,
      "grad_norm": 0.09226380851297578,
      "learning_rate": 2.251428928971102e-05,
      "loss": 0.3303,
      "step": 181
    },
    {
      "epoch": 2.410596026490066,
      "grad_norm": 0.08813038828978075,
      "learning_rate": 2.1540573475363402e-05,
      "loss": 0.3147,
      "step": 182
    },
    {
      "epoch": 2.423841059602649,
      "grad_norm": 0.09148478249319783,
      "learning_rate": 2.058583491552465e-05,
      "loss": 0.3304,
      "step": 183
    },
    {
      "epoch": 2.437086092715232,
      "grad_norm": 0.08970976007155415,
      "learning_rate": 1.9650304536132426e-05,
      "loss": 0.3142,
      "step": 184
    },
    {
      "epoch": 2.4503311258278146,
      "grad_norm": 0.0914061480835884,
      "learning_rate": 1.8734208617174988e-05,
      "loss": 0.3332,
      "step": 185
    },
    {
      "epoch": 2.4635761589403975,
      "grad_norm": 0.09223482668849642,
      "learning_rate": 1.783776873795994e-05,
      "loss": 0.3235,
      "step": 186
    },
    {
      "epoch": 2.47682119205298,
      "grad_norm": 0.09218058790384615,
      "learning_rate": 1.696120172352025e-05,
      "loss": 0.3281,
      "step": 187
    },
    {
      "epoch": 2.4900662251655628,
      "grad_norm": 0.09120288324314661,
      "learning_rate": 1.6104719592169902e-05,
      "loss": 0.323,
      "step": 188
    },
    {
      "epoch": 2.5033112582781456,
      "grad_norm": 0.09425838170079778,
      "learning_rate": 1.526852950422226e-05,
      "loss": 0.3214,
      "step": 189
    },
    {
      "epoch": 2.5165562913907285,
      "grad_norm": 0.09259911612664488,
      "learning_rate": 1.4452833711883628e-05,
      "loss": 0.3172,
      "step": 190
    },
    {
      "epoch": 2.5298013245033113,
      "grad_norm": 0.08967866399346999,
      "learning_rate": 1.3657829510333654e-05,
      "loss": 0.314,
      "step": 191
    },
    {
      "epoch": 2.543046357615894,
      "grad_norm": 0.09263981141490185,
      "learning_rate": 1.2883709190004955e-05,
      "loss": 0.3306,
      "step": 192
    },
    {
      "epoch": 2.556291390728477,
      "grad_norm": 0.0924041757651034,
      "learning_rate": 1.2130659990073146e-05,
      "loss": 0.3238,
      "step": 193
    },
    {
      "epoch": 2.5695364238410594,
      "grad_norm": 0.08680414784000516,
      "learning_rate": 1.1398864053168534e-05,
      "loss": 0.3172,
      "step": 194
    },
    {
      "epoch": 2.5827814569536423,
      "grad_norm": 0.08927214818010673,
      "learning_rate": 1.0688498381320855e-05,
      "loss": 0.3148,
      "step": 195
    },
    {
      "epoch": 2.596026490066225,
      "grad_norm": 0.09039528377033235,
      "learning_rate": 9.999734793146998e-06,
      "loss": 0.3212,
      "step": 196
    },
    {
      "epoch": 2.609271523178808,
      "grad_norm": 0.08907654916187858,
      "learning_rate": 9.332739882292752e-06,
      "loss": 0.3124,
      "step": 197
    },
    {
      "epoch": 2.622516556291391,
      "grad_norm": 0.09035973348094353,
      "learning_rate": 8.687674977138116e-06,
      "loss": 0.3246,
      "step": 198
    },
    {
      "epoch": 2.6357615894039736,
      "grad_norm": 0.08737713823497803,
      "learning_rate": 8.064696101776358e-06,
      "loss": 0.3143,
      "step": 199
    },
    {
      "epoch": 2.6490066225165565,
      "grad_norm": 0.08814135175802748,
      "learning_rate": 7.463953938275858e-06,
      "loss": 0.3094,
      "step": 200
    },
    {
      "epoch": 2.662251655629139,
      "grad_norm": 0.08889240634697596,
      "learning_rate": 6.8855937902340576e-06,
      "loss": 0.3214,
      "step": 201
    },
    {
      "epoch": 2.6754966887417218,
      "grad_norm": 0.09012485234682949,
      "learning_rate": 6.329755547632499e-06,
      "loss": 0.3169,
      "step": 202
    },
    {
      "epoch": 2.6887417218543046,
      "grad_norm": 0.09076602960863962,
      "learning_rate": 5.7965736530010916e-06,
      "loss": 0.3218,
      "step": 203
    },
    {
      "epoch": 2.7019867549668874,
      "grad_norm": 0.09128692637997875,
      "learning_rate": 5.286177068899989e-06,
      "loss": 0.3224,
      "step": 204
    },
    {
      "epoch": 2.7152317880794703,
      "grad_norm": 0.08980696390068593,
      "learning_rate": 4.798689246727006e-06,
      "loss": 0.3255,
      "step": 205
    },
    {
      "epoch": 2.7284768211920527,
      "grad_norm": 0.08721555286082,
      "learning_rate": 4.3342280968580285e-06,
      "loss": 0.3056,
      "step": 206
    },
    {
      "epoch": 2.741721854304636,
      "grad_norm": 0.09013962844918878,
      "learning_rate": 3.892905960127546e-06,
      "loss": 0.3198,
      "step": 207
    },
    {
      "epoch": 2.7549668874172184,
      "grad_norm": 0.09102568370124482,
      "learning_rate": 3.4748295806564356e-06,
      "loss": 0.3192,
      "step": 208
    },
    {
      "epoch": 2.7682119205298013,
      "grad_norm": 0.09384836363080047,
      "learning_rate": 3.0801000800333877e-06,
      "loss": 0.3269,
      "step": 209
    },
    {
      "epoch": 2.781456953642384,
      "grad_norm": 0.09126268422899254,
      "learning_rate": 2.708812932856253e-06,
      "loss": 0.3302,
      "step": 210
    },
    {
      "epoch": 2.794701986754967,
      "grad_norm": 0.08781813338797502,
      "learning_rate": 2.3610579436393e-06,
      "loss": 0.3272,
      "step": 211
    },
    {
      "epoch": 2.80794701986755,
      "grad_norm": 0.09110065248669541,
      "learning_rate": 2.036919225091827e-06,
      "loss": 0.3206,
      "step": 212
    },
    {
      "epoch": 2.821192052980132,
      "grad_norm": 0.09086421544518553,
      "learning_rate": 1.7364751777736332e-06,
      "loss": 0.3245,
      "step": 213
    },
    {
      "epoch": 2.8344370860927155,
      "grad_norm": 0.08855581117736014,
      "learning_rate": 1.459798471131868e-06,
      "loss": 0.3118,
      "step": 214
    },
    {
      "epoch": 2.847682119205298,
      "grad_norm": 0.08936995804191887,
      "learning_rate": 1.2069560259243328e-06,
      "loss": 0.3215,
      "step": 215
    },
    {
      "epoch": 2.8609271523178808,
      "grad_norm": 0.0921595910113618,
      "learning_rate": 9.780089980330642e-07,
      "loss": 0.3174,
      "step": 216
    },
    {
      "epoch": 2.8741721854304636,
      "grad_norm": 0.08711718437070236,
      "learning_rate": 7.730127636723539e-07,
      "loss": 0.3177,
      "step": 217
    },
    {
      "epoch": 2.8874172185430464,
      "grad_norm": 0.09131775721484407,
      "learning_rate": 5.920169059947411e-07,
      "loss": 0.3232,
      "step": 218
    },
    {
      "epoch": 2.9006622516556293,
      "grad_norm": 0.08947994407470564,
      "learning_rate": 4.3506520309813947e-07,
      "loss": 0.3204,
      "step": 219
    },
    {
      "epoch": 2.9139072847682117,
      "grad_norm": 0.08743216843583222,
      "learning_rate": 3.0219561743707326e-07,
      "loss": 0.3231,
      "step": 220
    },
    {
      "epoch": 2.9271523178807946,
      "grad_norm": 0.09204563273581286,
      "learning_rate": 1.9344028664056713e-07,
      "loss": 0.3206,
      "step": 221
    },
    {
      "epoch": 2.9403973509933774,
      "grad_norm": 0.08928755161531188,
      "learning_rate": 1.0882551573891953e-07,
      "loss": 0.3258,
      "step": 222
    },
    {
      "epoch": 2.9536423841059603,
      "grad_norm": 0.09055680073868443,
      "learning_rate": 4.837177080119215e-08,
      "loss": 0.3207,
      "step": 223
    },
    {
      "epoch": 2.966887417218543,
      "grad_norm": 0.0882029082304654,
      "learning_rate": 1.209367398504746e-08,
      "loss": 0.314,
      "step": 224
    },
    {
      "epoch": 2.980132450331126,
      "grad_norm": 0.09307741342290024,
      "learning_rate": 0.0,
      "loss": 0.3346,
      "step": 225
    },
    {
      "epoch": 2.980132450331126,
      "eval_loss": 0.3478808104991913,
      "eval_runtime": 37.4367,
      "eval_samples_per_second": 27.032,
      "eval_steps_per_second": 0.855,
      "step": 225
    },
    {
      "epoch": 2.980132450331126,
      "step": 225,
      "total_flos": 1.002324572158034e+17,
      "train_loss": 0.3962253777186076,
      "train_runtime": 3220.2895,
      "train_samples_per_second": 8.951,
      "train_steps_per_second": 0.07
    }
  ],
  "logging_steps": 1,
  "max_steps": 225,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.002324572158034e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}