{ "best_metric": null, "best_model_checkpoint": null, "epoch": 24.0, "eval_steps": 500, "global_step": 827208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09, "learning_rate": 0.00013054830287206266, "loss": 18.218, "step": 3000 }, { "epoch": 0.17, "learning_rate": 0.0002610966057441253, "loss": 4.1185, "step": 6000 }, { "epoch": 0.26, "learning_rate": 0.0002990742124651621, "loss": 3.2662, "step": 9000 }, { "epoch": 0.35, "learning_rate": 0.00029775542680299976, "loss": 3.0539, "step": 12000 }, { "epoch": 0.44, "learning_rate": 0.0002964366411408375, "loss": 2.9401, "step": 15000 }, { "epoch": 0.52, "learning_rate": 0.0002951178554786752, "loss": 2.8804, "step": 18000 }, { "epoch": 0.61, "learning_rate": 0.00029379906981651295, "loss": 2.8248, "step": 21000 }, { "epoch": 0.7, "learning_rate": 0.0002924802841543507, "loss": 2.7855, "step": 24000 }, { "epoch": 0.78, "learning_rate": 0.0002911614984921884, "loss": 2.7762, "step": 27000 }, { "epoch": 0.87, "learning_rate": 0.0002898427128300261, "loss": 2.7404, "step": 30000 }, { "epoch": 0.96, "learning_rate": 0.0002885239271678638, "loss": 2.7029, "step": 33000 }, { "epoch": 1.04, "learning_rate": 0.00028720514150570154, "loss": 2.6612, "step": 36000 }, { "epoch": 1.13, "learning_rate": 0.0002858863558435392, "loss": 2.6141, "step": 39000 }, { "epoch": 1.22, "learning_rate": 0.000284567570181377, "loss": 2.5986, "step": 42000 }, { "epoch": 1.31, "learning_rate": 0.00028324878451921467, "loss": 2.5744, "step": 45000 }, { "epoch": 1.39, "learning_rate": 0.0002819299988570524, "loss": 2.5785, "step": 48000 }, { "epoch": 1.48, "learning_rate": 0.00028061121319489013, "loss": 2.5643, "step": 51000 }, { "epoch": 1.57, "learning_rate": 0.0002792924275327278, "loss": 2.569, "step": 54000 }, { "epoch": 1.65, "learning_rate": 0.00027797364187056553, "loss": 2.5455, "step": 57000 }, { "epoch": 1.74, "learning_rate": 0.00027665485620840326, "loss": 2.5417, "step": 60000 }, { "epoch": 1.83, "learning_rate": 0.000275336070546241, "loss": 2.5095, "step": 63000 }, { "epoch": 1.91, "learning_rate": 0.0002740172848840787, "loss": 2.5052, "step": 66000 }, { "epoch": 2.0, "learning_rate": 0.00027269849922191645, "loss": 2.5086, "step": 69000 }, { "epoch": 2.09, "learning_rate": 0.0002713797135597541, "loss": 2.4391, "step": 72000 }, { "epoch": 2.18, "learning_rate": 0.00027006092789759185, "loss": 2.4283, "step": 75000 }, { "epoch": 2.26, "learning_rate": 0.0002687421422354296, "loss": 2.4079, "step": 78000 }, { "epoch": 2.35, "learning_rate": 0.0002674233565732673, "loss": 2.4065, "step": 81000 }, { "epoch": 2.44, "learning_rate": 0.00026610457091110504, "loss": 2.3893, "step": 84000 }, { "epoch": 2.52, "learning_rate": 0.00026478578524894277, "loss": 2.4166, "step": 87000 }, { "epoch": 2.61, "learning_rate": 0.00026346699958678044, "loss": 2.3907, "step": 90000 }, { "epoch": 2.7, "learning_rate": 0.00026214821392461817, "loss": 2.3829, "step": 93000 }, { "epoch": 2.79, "learning_rate": 0.0002608294282624559, "loss": 2.3767, "step": 96000 }, { "epoch": 2.87, "learning_rate": 0.00025951064260029363, "loss": 2.3518, "step": 99000 }, { "epoch": 2.96, "learning_rate": 0.00025819185693813136, "loss": 2.3734, "step": 102000 }, { "epoch": 3.05, "learning_rate": 0.0002568730712759691, "loss": 2.3126, "step": 105000 }, { "epoch": 3.13, "learning_rate": 0.00025555428561380676, "loss": 2.2802, "step": 108000 }, { "epoch": 3.22, "learning_rate": 0.0002542354999516445, "loss": 2.2739, "step": 111000 }, { "epoch": 3.31, "learning_rate": 0.0002529167142894822, "loss": 2.2588, "step": 114000 }, { "epoch": 3.39, "learning_rate": 0.00025159792862731995, "loss": 2.2573, "step": 117000 }, { "epoch": 3.48, "learning_rate": 0.0002502791429651577, "loss": 2.2541, "step": 120000 }, { "epoch": 3.57, "learning_rate": 0.0002489603573029954, "loss": 2.2843, "step": 123000 }, { "epoch": 3.66, "learning_rate": 0.0002476415716408331, "loss": 2.2548, "step": 126000 }, { "epoch": 3.74, "learning_rate": 0.0002463227859786708, "loss": 2.252, "step": 129000 }, { "epoch": 3.83, "learning_rate": 0.00024500400031650854, "loss": 2.2677, "step": 132000 }, { "epoch": 3.92, "learning_rate": 0.00024368521465434624, "loss": 2.2497, "step": 135000 }, { "epoch": 4.0, "learning_rate": 0.00024236642899218397, "loss": 2.2434, "step": 138000 }, { "epoch": 4.09, "learning_rate": 0.0002410476433300217, "loss": 2.1625, "step": 141000 }, { "epoch": 4.18, "learning_rate": 0.0002397288576678594, "loss": 2.1795, "step": 144000 }, { "epoch": 4.26, "learning_rate": 0.00023841007200569713, "loss": 2.1666, "step": 147000 }, { "epoch": 4.35, "learning_rate": 0.00023709128634353483, "loss": 2.1749, "step": 150000 }, { "epoch": 4.44, "learning_rate": 0.00023577250068137256, "loss": 2.1931, "step": 153000 }, { "epoch": 4.53, "learning_rate": 0.0002344537150192103, "loss": 2.1797, "step": 156000 }, { "epoch": 4.61, "learning_rate": 0.000233134929357048, "loss": 2.1573, "step": 159000 }, { "epoch": 4.7, "learning_rate": 0.00023181614369488572, "loss": 2.1811, "step": 162000 }, { "epoch": 4.79, "learning_rate": 0.00023049735803272345, "loss": 2.1539, "step": 165000 }, { "epoch": 4.87, "learning_rate": 0.00022917857237056115, "loss": 2.1657, "step": 168000 }, { "epoch": 4.96, "learning_rate": 0.00022785978670839888, "loss": 2.1662, "step": 171000 }, { "epoch": 5.05, "learning_rate": 0.0002265410010462366, "loss": 2.1237, "step": 174000 }, { "epoch": 5.14, "learning_rate": 0.0002252222153840743, "loss": 2.0812, "step": 177000 }, { "epoch": 5.22, "learning_rate": 0.00022390342972191204, "loss": 2.0917, "step": 180000 }, { "epoch": 5.31, "learning_rate": 0.00022258464405974977, "loss": 2.09, "step": 183000 }, { "epoch": 5.4, "learning_rate": 0.00022126585839758747, "loss": 2.0931, "step": 186000 }, { "epoch": 5.48, "learning_rate": 0.0002199470727354252, "loss": 2.0907, "step": 189000 }, { "epoch": 5.57, "learning_rate": 0.00021862828707326293, "loss": 2.0855, "step": 192000 }, { "epoch": 5.66, "learning_rate": 0.00021730950141110063, "loss": 2.0946, "step": 195000 }, { "epoch": 5.74, "learning_rate": 0.00021599071574893836, "loss": 2.0904, "step": 198000 }, { "epoch": 5.83, "learning_rate": 0.0002146719300867761, "loss": 2.0788, "step": 201000 }, { "epoch": 5.92, "learning_rate": 0.0002133531444246138, "loss": 2.0771, "step": 204000 }, { "epoch": 6.01, "learning_rate": 0.00021203435876245152, "loss": 2.0746, "step": 207000 }, { "epoch": 6.09, "learning_rate": 0.00021071557310028925, "loss": 2.0091, "step": 210000 }, { "epoch": 6.18, "learning_rate": 0.00020939678743812695, "loss": 2.0144, "step": 213000 }, { "epoch": 6.27, "learning_rate": 0.00020807800177596468, "loss": 2.0083, "step": 216000 }, { "epoch": 6.35, "learning_rate": 0.0002067592161138024, "loss": 2.0337, "step": 219000 }, { "epoch": 6.44, "learning_rate": 0.0002054404304516401, "loss": 2.0169, "step": 222000 }, { "epoch": 6.53, "learning_rate": 0.00020412164478947784, "loss": 2.0264, "step": 225000 }, { "epoch": 6.62, "learning_rate": 0.00020280285912731557, "loss": 2.0089, "step": 228000 }, { "epoch": 6.7, "learning_rate": 0.00020148407346515327, "loss": 2.0148, "step": 231000 }, { "epoch": 6.79, "learning_rate": 0.000200165287802991, "loss": 2.0224, "step": 234000 }, { "epoch": 6.88, "learning_rate": 0.00019884650214082873, "loss": 2.0242, "step": 237000 }, { "epoch": 6.96, "learning_rate": 0.00019752771647866643, "loss": 2.0142, "step": 240000 }, { "epoch": 7.05, "learning_rate": 0.00019620893081650416, "loss": 1.9622, "step": 243000 }, { "epoch": 7.14, "learning_rate": 0.0001948901451543419, "loss": 1.9643, "step": 246000 }, { "epoch": 7.22, "learning_rate": 0.0001935713594921796, "loss": 1.9589, "step": 249000 }, { "epoch": 7.31, "learning_rate": 0.00019225257383001732, "loss": 1.9411, "step": 252000 }, { "epoch": 7.4, "learning_rate": 0.000190933788167855, "loss": 1.956, "step": 255000 }, { "epoch": 7.49, "learning_rate": 0.00018961500250569275, "loss": 1.9596, "step": 258000 }, { "epoch": 7.57, "learning_rate": 0.00018829621684353048, "loss": 1.9373, "step": 261000 }, { "epoch": 7.66, "learning_rate": 0.00018697743118136815, "loss": 1.9532, "step": 264000 }, { "epoch": 7.75, "learning_rate": 0.00018565864551920588, "loss": 1.9669, "step": 267000 }, { "epoch": 7.83, "learning_rate": 0.00018433985985704364, "loss": 1.957, "step": 270000 }, { "epoch": 7.92, "learning_rate": 0.0001830210741948813, "loss": 1.977, "step": 273000 }, { "epoch": 8.01, "learning_rate": 0.00018170228853271904, "loss": 1.9712, "step": 276000 }, { "epoch": 8.09, "learning_rate": 0.0001803835028705568, "loss": 1.8922, "step": 279000 }, { "epoch": 8.18, "learning_rate": 0.00017906471720839447, "loss": 1.9054, "step": 282000 }, { "epoch": 8.27, "learning_rate": 0.0001777459315462322, "loss": 1.8847, "step": 285000 }, { "epoch": 8.36, "learning_rate": 0.00017642714588406993, "loss": 1.896, "step": 288000 }, { "epoch": 8.44, "learning_rate": 0.00017510836022190763, "loss": 1.9139, "step": 291000 }, { "epoch": 8.53, "learning_rate": 0.00017378957455974536, "loss": 1.9086, "step": 294000 }, { "epoch": 8.62, "learning_rate": 0.0001724707888975831, "loss": 1.9158, "step": 297000 }, { "epoch": 8.7, "learning_rate": 0.0001711520032354208, "loss": 1.908, "step": 300000 }, { "epoch": 8.79, "learning_rate": 0.00016983321757325852, "loss": 1.9034, "step": 303000 }, { "epoch": 8.88, "learning_rate": 0.00016851443191109625, "loss": 1.9022, "step": 306000 }, { "epoch": 8.97, "learning_rate": 0.00016719564624893395, "loss": 1.9045, "step": 309000 }, { "epoch": 9.05, "learning_rate": 0.00016587686058677168, "loss": 1.8607, "step": 312000 }, { "epoch": 9.14, "learning_rate": 0.0001645580749246094, "loss": 1.8439, "step": 315000 }, { "epoch": 9.23, "learning_rate": 0.0001632392892624471, "loss": 1.8252, "step": 318000 }, { "epoch": 9.31, "learning_rate": 0.00016192050360028484, "loss": 1.844, "step": 321000 }, { "epoch": 9.4, "learning_rate": 0.00016060171793812257, "loss": 1.839, "step": 324000 }, { "epoch": 9.49, "learning_rate": 0.00015928293227596027, "loss": 1.8442, "step": 327000 }, { "epoch": 9.57, "learning_rate": 0.000157964146613798, "loss": 1.8378, "step": 330000 }, { "epoch": 9.66, "learning_rate": 0.00015664536095163573, "loss": 1.8436, "step": 333000 }, { "epoch": 9.75, "learning_rate": 0.00015532657528947343, "loss": 1.8399, "step": 336000 }, { "epoch": 9.84, "learning_rate": 0.00015400778962731116, "loss": 1.8357, "step": 339000 }, { "epoch": 9.92, "learning_rate": 0.0001526890039651489, "loss": 1.8406, "step": 342000 }, { "epoch": 10.01, "learning_rate": 0.0001513702183029866, "loss": 1.8385, "step": 345000 }, { "epoch": 10.1, "learning_rate": 0.00015005143264082432, "loss": 1.7886, "step": 348000 }, { "epoch": 10.18, "learning_rate": 0.00014873264697866202, "loss": 1.7909, "step": 351000 }, { "epoch": 10.27, "learning_rate": 0.00014741386131649975, "loss": 1.794, "step": 354000 }, { "epoch": 10.36, "learning_rate": 0.00014609507565433748, "loss": 1.7712, "step": 357000 }, { "epoch": 10.44, "learning_rate": 0.00014477628999217518, "loss": 1.7875, "step": 360000 }, { "epoch": 10.53, "learning_rate": 0.0001434575043300129, "loss": 1.786, "step": 363000 }, { "epoch": 10.62, "learning_rate": 0.00014213871866785064, "loss": 1.7925, "step": 366000 }, { "epoch": 10.71, "learning_rate": 0.00014081993300568834, "loss": 1.7875, "step": 369000 }, { "epoch": 10.79, "learning_rate": 0.00013950114734352607, "loss": 1.7752, "step": 372000 }, { "epoch": 10.88, "learning_rate": 0.0001381823616813638, "loss": 1.791, "step": 375000 }, { "epoch": 10.97, "learning_rate": 0.0001368635760192015, "loss": 1.7857, "step": 378000 }, { "epoch": 11.05, "learning_rate": 0.00013554479035703923, "loss": 1.7407, "step": 381000 }, { "epoch": 11.14, "learning_rate": 0.00013422600469487693, "loss": 1.7115, "step": 384000 }, { "epoch": 11.23, "learning_rate": 0.00013290721903271466, "loss": 1.7294, "step": 387000 }, { "epoch": 11.32, "learning_rate": 0.0001315884333705524, "loss": 1.7205, "step": 390000 }, { "epoch": 11.4, "learning_rate": 0.0001302696477083901, "loss": 1.7281, "step": 393000 }, { "epoch": 11.49, "learning_rate": 0.00012895086204622782, "loss": 1.7418, "step": 396000 }, { "epoch": 11.58, "learning_rate": 0.00012763207638406555, "loss": 1.7316, "step": 399000 }, { "epoch": 11.66, "learning_rate": 0.00012631329072190325, "loss": 1.7348, "step": 402000 }, { "epoch": 11.75, "learning_rate": 0.00012499450505974098, "loss": 1.7392, "step": 405000 }, { "epoch": 11.84, "learning_rate": 0.0001236757193975787, "loss": 1.7341, "step": 408000 }, { "epoch": 11.92, "learning_rate": 0.0001223569337354164, "loss": 1.7347, "step": 411000 }, { "epoch": 12.01, "learning_rate": 0.00012103814807325414, "loss": 1.7196, "step": 414000 }, { "epoch": 12.1, "learning_rate": 0.00011971936241109186, "loss": 1.6614, "step": 417000 }, { "epoch": 12.19, "learning_rate": 0.00011840057674892957, "loss": 1.6671, "step": 420000 }, { "epoch": 12.27, "learning_rate": 0.0001170817910867673, "loss": 1.6665, "step": 423000 }, { "epoch": 12.36, "learning_rate": 0.00011576300542460502, "loss": 1.6775, "step": 426000 }, { "epoch": 12.45, "learning_rate": 0.00011444421976244273, "loss": 1.6646, "step": 429000 }, { "epoch": 12.53, "learning_rate": 0.00011312543410028046, "loss": 1.6779, "step": 432000 }, { "epoch": 12.62, "learning_rate": 0.00011180664843811818, "loss": 1.6802, "step": 435000 }, { "epoch": 12.71, "learning_rate": 0.00011048786277595589, "loss": 1.6759, "step": 438000 }, { "epoch": 12.79, "learning_rate": 0.0001091690771137936, "loss": 1.6729, "step": 441000 }, { "epoch": 12.88, "learning_rate": 0.00010785029145163134, "loss": 1.6769, "step": 444000 }, { "epoch": 12.97, "learning_rate": 0.00010653150578946905, "loss": 1.6721, "step": 447000 }, { "epoch": 13.06, "learning_rate": 0.00010521272012730677, "loss": 1.6302, "step": 450000 }, { "epoch": 13.14, "learning_rate": 0.0001038939344651445, "loss": 1.6112, "step": 453000 }, { "epoch": 13.23, "learning_rate": 0.00010257514880298221, "loss": 1.5991, "step": 456000 }, { "epoch": 13.32, "learning_rate": 0.00010125636314081991, "loss": 1.6221, "step": 459000 }, { "epoch": 13.4, "learning_rate": 9.993757747865765e-05, "loss": 1.6168, "step": 462000 }, { "epoch": 13.49, "learning_rate": 9.861879181649536e-05, "loss": 1.6166, "step": 465000 }, { "epoch": 13.58, "learning_rate": 9.730000615433307e-05, "loss": 1.6234, "step": 468000 }, { "epoch": 13.67, "learning_rate": 9.59812204921708e-05, "loss": 1.6245, "step": 471000 }, { "epoch": 13.75, "learning_rate": 9.466243483000852e-05, "loss": 1.618, "step": 474000 }, { "epoch": 13.84, "learning_rate": 9.334364916784623e-05, "loss": 1.6341, "step": 477000 }, { "epoch": 13.93, "learning_rate": 9.202486350568396e-05, "loss": 1.6353, "step": 480000 }, { "epoch": 14.01, "learning_rate": 9.070607784352168e-05, "loss": 1.6104, "step": 483000 }, { "epoch": 14.1, "learning_rate": 8.938729218135939e-05, "loss": 1.562, "step": 486000 }, { "epoch": 14.19, "learning_rate": 8.806850651919711e-05, "loss": 1.5654, "step": 489000 }, { "epoch": 14.27, "learning_rate": 8.674972085703484e-05, "loss": 1.564, "step": 492000 }, { "epoch": 14.36, "learning_rate": 8.543093519487255e-05, "loss": 1.5606, "step": 495000 }, { "epoch": 14.45, "learning_rate": 8.411214953271027e-05, "loss": 1.5764, "step": 498000 }, { "epoch": 14.54, "learning_rate": 8.2793363870548e-05, "loss": 1.5718, "step": 501000 }, { "epoch": 14.62, "learning_rate": 8.147457820838571e-05, "loss": 1.5768, "step": 504000 }, { "epoch": 14.71, "learning_rate": 8.015579254622343e-05, "loss": 1.5686, "step": 507000 }, { "epoch": 14.8, "learning_rate": 7.883700688406116e-05, "loss": 1.5753, "step": 510000 }, { "epoch": 14.88, "learning_rate": 7.751822122189887e-05, "loss": 1.5588, "step": 513000 }, { "epoch": 14.97, "learning_rate": 7.619943555973659e-05, "loss": 1.5588, "step": 516000 }, { "epoch": 15.06, "learning_rate": 7.48806498975743e-05, "loss": 1.5413, "step": 519000 }, { "epoch": 15.14, "learning_rate": 7.356186423541203e-05, "loss": 1.5102, "step": 522000 }, { "epoch": 15.23, "learning_rate": 7.224307857324975e-05, "loss": 1.5183, "step": 525000 }, { "epoch": 15.32, "learning_rate": 7.092429291108746e-05, "loss": 1.5267, "step": 528000 }, { "epoch": 15.41, "learning_rate": 6.960550724892519e-05, "loss": 1.5185, "step": 531000 }, { "epoch": 15.49, "learning_rate": 6.828672158676289e-05, "loss": 1.5195, "step": 534000 }, { "epoch": 15.58, "learning_rate": 6.696793592460062e-05, "loss": 1.5154, "step": 537000 }, { "epoch": 15.67, "learning_rate": 6.564915026243834e-05, "loss": 1.5312, "step": 540000 }, { "epoch": 15.75, "learning_rate": 6.433036460027605e-05, "loss": 1.5191, "step": 543000 }, { "epoch": 15.84, "learning_rate": 6.301157893811378e-05, "loss": 1.5209, "step": 546000 }, { "epoch": 15.93, "learning_rate": 6.16927932759515e-05, "loss": 1.5145, "step": 549000 }, { "epoch": 16.02, "learning_rate": 6.037400761378922e-05, "loss": 1.5147, "step": 552000 }, { "epoch": 16.1, "learning_rate": 5.905522195162694e-05, "loss": 1.47, "step": 555000 }, { "epoch": 16.19, "learning_rate": 5.773643628946465e-05, "loss": 1.4766, "step": 558000 }, { "epoch": 16.28, "learning_rate": 5.641765062730237e-05, "loss": 1.4688, "step": 561000 }, { "epoch": 16.36, "learning_rate": 5.5098864965140094e-05, "loss": 1.4727, "step": 564000 }, { "epoch": 16.45, "learning_rate": 5.378007930297781e-05, "loss": 1.4673, "step": 567000 }, { "epoch": 16.54, "learning_rate": 5.246129364081553e-05, "loss": 1.4728, "step": 570000 }, { "epoch": 16.62, "learning_rate": 5.1142507978653254e-05, "loss": 1.4751, "step": 573000 }, { "epoch": 16.71, "learning_rate": 4.982372231649097e-05, "loss": 1.4652, "step": 576000 }, { "epoch": 16.8, "learning_rate": 4.850493665432869e-05, "loss": 1.4783, "step": 579000 }, { "epoch": 16.89, "learning_rate": 4.718615099216641e-05, "loss": 1.4678, "step": 582000 }, { "epoch": 16.97, "learning_rate": 4.586736533000413e-05, "loss": 1.4717, "step": 585000 }, { "epoch": 17.06, "learning_rate": 4.454857966784185e-05, "loss": 1.4422, "step": 588000 }, { "epoch": 17.15, "learning_rate": 4.322979400567956e-05, "loss": 1.4209, "step": 591000 }, { "epoch": 17.23, "learning_rate": 4.191100834351728e-05, "loss": 1.4398, "step": 594000 }, { "epoch": 17.32, "learning_rate": 4.0592222681355005e-05, "loss": 1.4331, "step": 597000 }, { "epoch": 17.41, "learning_rate": 3.927343701919272e-05, "loss": 1.4253, "step": 600000 }, { "epoch": 17.49, "learning_rate": 3.795465135703044e-05, "loss": 1.4247, "step": 603000 }, { "epoch": 17.58, "learning_rate": 3.6635865694868164e-05, "loss": 1.432, "step": 606000 }, { "epoch": 17.67, "learning_rate": 3.531708003270588e-05, "loss": 1.4275, "step": 609000 }, { "epoch": 17.76, "learning_rate": 3.39982943705436e-05, "loss": 1.4305, "step": 612000 }, { "epoch": 17.84, "learning_rate": 3.267950870838132e-05, "loss": 1.4237, "step": 615000 }, { "epoch": 17.93, "learning_rate": 3.136072304621904e-05, "loss": 1.4339, "step": 618000 }, { "epoch": 18.02, "learning_rate": 3.0041937384056755e-05, "loss": 1.4083, "step": 621000 }, { "epoch": 18.1, "learning_rate": 2.8723151721894477e-05, "loss": 1.383, "step": 624000 }, { "epoch": 18.19, "learning_rate": 2.7404366059732196e-05, "loss": 1.3949, "step": 627000 }, { "epoch": 18.28, "learning_rate": 2.6085580397569915e-05, "loss": 1.3857, "step": 630000 }, { "epoch": 18.37, "learning_rate": 2.4766794735407634e-05, "loss": 1.4049, "step": 633000 }, { "epoch": 18.45, "learning_rate": 2.3448009073245356e-05, "loss": 1.3972, "step": 636000 }, { "epoch": 18.54, "learning_rate": 2.2129223411083075e-05, "loss": 1.3983, "step": 639000 }, { "epoch": 18.63, "learning_rate": 2.081043774892079e-05, "loss": 1.3907, "step": 642000 }, { "epoch": 18.71, "learning_rate": 1.949165208675851e-05, "loss": 1.3888, "step": 645000 }, { "epoch": 18.8, "learning_rate": 1.817286642459623e-05, "loss": 1.3762, "step": 648000 }, { "epoch": 18.89, "learning_rate": 1.685408076243395e-05, "loss": 1.3926, "step": 651000 }, { "epoch": 18.97, "learning_rate": 1.553529510027167e-05, "loss": 1.3874, "step": 654000 }, { "epoch": 19.06, "learning_rate": 1.4216509438109387e-05, "loss": 1.375, "step": 657000 }, { "epoch": 19.15, "learning_rate": 1.2897723775947106e-05, "loss": 1.3655, "step": 660000 }, { "epoch": 19.24, "learning_rate": 1.1578938113784827e-05, "loss": 1.3625, "step": 663000 }, { "epoch": 19.32, "learning_rate": 1.0260152451622544e-05, "loss": 1.3705, "step": 666000 }, { "epoch": 19.41, "learning_rate": 8.941366789460264e-06, "loss": 1.3672, "step": 669000 }, { "epoch": 19.5, "learning_rate": 7.622581127297983e-06, "loss": 1.3545, "step": 672000 }, { "epoch": 19.58, "learning_rate": 6.303795465135703e-06, "loss": 1.3611, "step": 675000 }, { "epoch": 19.67, "learning_rate": 4.985009802973422e-06, "loss": 1.3574, "step": 678000 }, { "epoch": 19.76, "learning_rate": 3.666224140811141e-06, "loss": 1.3673, "step": 681000 }, { "epoch": 19.85, "learning_rate": 2.34743847864886e-06, "loss": 1.3629, "step": 684000 }, { "epoch": 19.93, "learning_rate": 1.028652816486579e-06, "loss": 1.3694, "step": 687000 }, { "epoch": 20.0, "step": 689340, "total_flos": 1.193067231326117e+21, "train_loss": 1.952947931640795, "train_runtime": 385301.7422, "train_samples_per_second": 28.625, "train_steps_per_second": 1.789 }, { "epoch": 20.0, "step": 689340, "total_flos": 1.193067231326117e+21, "train_loss": 0.0, "train_runtime": 10.1094, "train_samples_per_second": 272749.588, "train_steps_per_second": 17047.066 }, { "epoch": 20.02, "learning_rate": 1.027864919821501e-06, "loss": 1.3347, "step": 690000 }, { "epoch": 20.11, "learning_rate": 1.0242835713438735e-06, "loss": 1.3627, "step": 693000 }, { "epoch": 20.19, "learning_rate": 1.0207022228662458e-06, "loss": 1.3556, "step": 696000 }, { "epoch": 20.28, "learning_rate": 1.0171208743886182e-06, "loss": 1.3565, "step": 699000 }, { "epoch": 20.37, "learning_rate": 1.0135395259109907e-06, "loss": 1.3619, "step": 702000 }, { "epoch": 20.45, "learning_rate": 1.009958177433363e-06, "loss": 1.3536, "step": 705000 }, { "epoch": 20.54, "learning_rate": 1.0063768289557357e-06, "loss": 1.3481, "step": 708000 }, { "epoch": 20.63, "learning_rate": 1.002795480478108e-06, "loss": 1.348, "step": 711000 }, { "epoch": 20.72, "learning_rate": 9.992141320004804e-07, "loss": 1.3539, "step": 714000 }, { "epoch": 20.8, "learning_rate": 9.956327835228528e-07, "loss": 1.365, "step": 717000 }, { "epoch": 20.89, "learning_rate": 9.920514350452253e-07, "loss": 1.3384, "step": 720000 }, { "epoch": 20.98, "learning_rate": 9.884700865675977e-07, "loss": 1.3558, "step": 723000 }, { "epoch": 21.06, "learning_rate": 9.848887380899703e-07, "loss": 1.3388, "step": 726000 }, { "epoch": 21.15, "learning_rate": 9.813073896123426e-07, "loss": 1.3527, "step": 729000 }, { "epoch": 21.24, "learning_rate": 9.77726041134715e-07, "loss": 1.3458, "step": 732000 }, { "epoch": 21.32, "learning_rate": 9.741446926570875e-07, "loss": 1.3496, "step": 735000 }, { "epoch": 21.41, "learning_rate": 9.7056334417946e-07, "loss": 1.3294, "step": 738000 }, { "epoch": 21.5, "learning_rate": 9.669819957018323e-07, "loss": 1.3606, "step": 741000 }, { "epoch": 21.59, "learning_rate": 9.634006472242046e-07, "loss": 1.3573, "step": 744000 }, { "epoch": 21.67, "learning_rate": 9.598192987465772e-07, "loss": 1.3445, "step": 747000 }, { "epoch": 21.76, "learning_rate": 9.562379502689496e-07, "loss": 1.3407, "step": 750000 }, { "epoch": 21.85, "learning_rate": 9.52656601791322e-07, "loss": 1.3646, "step": 753000 }, { "epoch": 21.93, "learning_rate": 9.490752533136945e-07, "loss": 1.3589, "step": 756000 }, { "epoch": 22.02, "learning_rate": 9.454939048360669e-07, "loss": 1.3504, "step": 759000 }, { "epoch": 22.11, "learning_rate": 9.419125563584393e-07, "loss": 1.3435, "step": 762000 }, { "epoch": 22.2, "learning_rate": 9.383312078808117e-07, "loss": 1.3433, "step": 765000 }, { "epoch": 22.28, "learning_rate": 9.347498594031841e-07, "loss": 1.35, "step": 768000 }, { "epoch": 22.37, "learning_rate": 9.311685109255566e-07, "loss": 1.3595, "step": 771000 }, { "epoch": 22.46, "learning_rate": 9.275871624479291e-07, "loss": 1.3518, "step": 774000 }, { "epoch": 22.54, "learning_rate": 9.240058139703014e-07, "loss": 1.3451, "step": 777000 }, { "epoch": 22.63, "learning_rate": 9.204244654926739e-07, "loss": 1.3378, "step": 780000 }, { "epoch": 22.72, "learning_rate": 9.168431170150462e-07, "loss": 1.3374, "step": 783000 }, { "epoch": 22.8, "learning_rate": 9.132617685374188e-07, "loss": 1.3521, "step": 786000 }, { "epoch": 22.89, "learning_rate": 9.096804200597912e-07, "loss": 1.3519, "step": 789000 }, { "epoch": 22.98, "learning_rate": 9.060990715821636e-07, "loss": 1.3381, "step": 792000 }, { "epoch": 23.07, "learning_rate": 9.02517723104536e-07, "loss": 1.3294, "step": 795000 }, { "epoch": 23.15, "learning_rate": 8.989363746269084e-07, "loss": 1.3473, "step": 798000 }, { "epoch": 23.24, "learning_rate": 8.953550261492809e-07, "loss": 1.3453, "step": 801000 }, { "epoch": 23.33, "learning_rate": 8.917736776716533e-07, "loss": 1.3416, "step": 804000 }, { "epoch": 23.41, "learning_rate": 8.881923291940258e-07, "loss": 1.3383, "step": 807000 }, { "epoch": 23.5, "learning_rate": 8.846109807163981e-07, "loss": 1.3419, "step": 810000 }, { "epoch": 23.59, "learning_rate": 8.810296322387707e-07, "loss": 1.3607, "step": 813000 }, { "epoch": 23.67, "learning_rate": 8.77448283761143e-07, "loss": 1.3438, "step": 816000 }, { "epoch": 23.76, "learning_rate": 8.738669352835155e-07, "loss": 1.3464, "step": 819000 }, { "epoch": 23.85, "learning_rate": 8.702855868058879e-07, "loss": 1.346, "step": 822000 }, { "epoch": 23.94, "learning_rate": 8.667042383282602e-07, "loss": 1.3413, "step": 825000 } ], "logging_steps": 3000, "max_steps": 861675, "num_train_epochs": 25, "save_steps": 500, "total_flos": 1.4316827713775893e+21, "trial_name": null, "trial_params": null }