diff --git "a/checkpoint-1727400/trainer_state.json" "b/checkpoint-1727400/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1727400/trainer_state.json" @@ -0,0 +1,5215 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 39.99994211019902, + "eval_steps": 500, + "global_step": 1727400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.05, + "learning_rate": 4.6310589688175354e-05, + "loss": 4.5055, + "step": 2000 + }, + { + "epoch": 0.09, + "learning_rate": 9.262117937635071e-05, + "loss": 1.4851, + "step": 4000 + }, + { + "epoch": 0.14, + "learning_rate": 0.00013893176906452608, + "loss": 1.1808, + "step": 6000 + }, + { + "epoch": 0.19, + "learning_rate": 0.00018524235875270141, + "loss": 1.132, + "step": 8000 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002315529484408768, + "loss": 1.1349, + "step": 10000 + }, + { + "epoch": 0.28, + "learning_rate": 0.00027786353812905216, + "loss": 1.1422, + "step": 12000 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002997558073716234, + "loss": 1.1513, + "step": 14000 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002992880054015534, + "loss": 1.142, + "step": 16000 + }, + { + "epoch": 0.42, + "learning_rate": 0.00029882020343148334, + "loss": 1.1303, + "step": 18000 + }, + { + "epoch": 0.46, + "learning_rate": 0.0002983524014614133, + "loss": 1.1215, + "step": 20000 + }, + { + "epoch": 0.51, + "learning_rate": 0.0002978845994913433, + "loss": 1.109, + "step": 22000 + }, + { + "epoch": 0.56, + "learning_rate": 0.00029741679752127325, + "loss": 1.1034, + "step": 24000 + }, + { + "epoch": 0.6, + "learning_rate": 0.0002969489955512032, + "loss": 1.0998, + "step": 26000 + }, + { + "epoch": 0.65, + "learning_rate": 0.00029648119358113324, + "loss": 1.0979, + "step": 28000 + }, + { + "epoch": 0.69, + "learning_rate": 0.00029601339161106316, + "loss": 1.083, + "step": 30000 + }, + { + "epoch": 0.74, + "learning_rate": 0.0002955455896409932, + "loss": 1.0741, + "step": 32000 + }, + { + "epoch": 0.79, + "learning_rate": 0.0002950777876709231, + "loss": 1.0741, + "step": 34000 + }, + { + "epoch": 0.83, + "learning_rate": 0.0002946099857008531, + "loss": 1.0707, + "step": 36000 + }, + { + "epoch": 0.88, + "learning_rate": 0.00029414218373078303, + "loss": 1.0689, + "step": 38000 + }, + { + "epoch": 0.93, + "learning_rate": 0.00029367438176071306, + "loss": 1.0536, + "step": 40000 + }, + { + "epoch": 0.97, + "learning_rate": 0.00029320657979064297, + "loss": 1.0556, + "step": 42000 + }, + { + "epoch": 1.02, + "learning_rate": 0.000292738777820573, + "loss": 1.0358, + "step": 44000 + }, + { + "epoch": 1.07, + "learning_rate": 0.0002922709758505029, + "loss": 1.018, + "step": 46000 + }, + { + "epoch": 1.11, + "learning_rate": 0.00029180317388043293, + "loss": 1.0162, + "step": 48000 + }, + { + "epoch": 1.16, + "learning_rate": 0.00029133537191036285, + "loss": 1.017, + "step": 50000 + }, + { + "epoch": 1.2, + "learning_rate": 0.00029086756994029287, + "loss": 1.0145, + "step": 52000 + }, + { + "epoch": 1.25, + "learning_rate": 0.0002903997679702228, + "loss": 1.0103, + "step": 54000 + }, + { + "epoch": 1.3, + "learning_rate": 0.0002899319660001528, + "loss": 1.0046, + "step": 56000 + }, + { + "epoch": 1.34, + "learning_rate": 0.0002894641640300827, + "loss": 1.01, + "step": 58000 + }, + { + "epoch": 1.39, + "learning_rate": 0.00028899636206001274, + "loss": 1.0146, + "step": 60000 + }, + { + "epoch": 1.44, + "learning_rate": 0.00028852856008994266, + "loss": 1.0122, + "step": 62000 + }, + { + "epoch": 1.48, + "learning_rate": 0.0002880607581198727, + "loss": 0.9961, + "step": 64000 + }, + { + "epoch": 1.53, + "learning_rate": 0.00028759295614980265, + "loss": 0.9914, + "step": 66000 + }, + { + "epoch": 1.57, + "learning_rate": 0.0002871251541797326, + "loss": 0.9936, + "step": 68000 + }, + { + "epoch": 1.62, + "learning_rate": 0.0002866573522096626, + "loss": 0.9998, + "step": 70000 + }, + { + "epoch": 1.67, + "learning_rate": 0.00028618955023959256, + "loss": 0.9833, + "step": 72000 + }, + { + "epoch": 1.71, + "learning_rate": 0.0002857217482695225, + "loss": 0.9901, + "step": 74000 + }, + { + "epoch": 1.76, + "learning_rate": 0.0002852539462994525, + "loss": 0.9905, + "step": 76000 + }, + { + "epoch": 1.81, + "learning_rate": 0.00028478614432938246, + "loss": 0.9845, + "step": 78000 + }, + { + "epoch": 1.85, + "learning_rate": 0.00028431834235931243, + "loss": 0.9886, + "step": 80000 + }, + { + "epoch": 1.9, + "learning_rate": 0.0002838505403892424, + "loss": 0.9848, + "step": 82000 + }, + { + "epoch": 1.95, + "learning_rate": 0.00028338273841917237, + "loss": 0.9827, + "step": 84000 + }, + { + "epoch": 1.99, + "learning_rate": 0.00028291493644910234, + "loss": 0.9764, + "step": 86000 + }, + { + "epoch": 2.04, + "learning_rate": 0.0002824471344790323, + "loss": 0.9546, + "step": 88000 + }, + { + "epoch": 2.08, + "learning_rate": 0.0002819793325089623, + "loss": 0.9388, + "step": 90000 + }, + { + "epoch": 2.13, + "learning_rate": 0.00028151153053889225, + "loss": 0.9432, + "step": 92000 + }, + { + "epoch": 2.18, + "learning_rate": 0.0002810437285688222, + "loss": 0.9591, + "step": 94000 + }, + { + "epoch": 2.22, + "learning_rate": 0.0002805759265987522, + "loss": 0.9431, + "step": 96000 + }, + { + "epoch": 2.27, + "learning_rate": 0.00028010812462868215, + "loss": 0.9388, + "step": 98000 + }, + { + "epoch": 2.32, + "learning_rate": 0.0002796403226586121, + "loss": 0.9421, + "step": 100000 + }, + { + "epoch": 2.36, + "learning_rate": 0.0002791725206885421, + "loss": 0.9452, + "step": 102000 + }, + { + "epoch": 2.41, + "learning_rate": 0.0002787047187184721, + "loss": 0.9535, + "step": 104000 + }, + { + "epoch": 2.45, + "learning_rate": 0.00027823691674840203, + "loss": 0.9373, + "step": 106000 + }, + { + "epoch": 2.5, + "learning_rate": 0.00027776911477833205, + "loss": 0.9345, + "step": 108000 + }, + { + "epoch": 2.55, + "learning_rate": 0.00027730131280826197, + "loss": 0.9478, + "step": 110000 + }, + { + "epoch": 2.59, + "learning_rate": 0.000276833510838192, + "loss": 0.9445, + "step": 112000 + }, + { + "epoch": 2.64, + "learning_rate": 0.0002763657088681219, + "loss": 0.9297, + "step": 114000 + }, + { + "epoch": 2.69, + "learning_rate": 0.0002758979068980519, + "loss": 0.9294, + "step": 116000 + }, + { + "epoch": 2.73, + "learning_rate": 0.00027543010492798184, + "loss": 0.9273, + "step": 118000 + }, + { + "epoch": 2.78, + "learning_rate": 0.00027496230295791186, + "loss": 0.9242, + "step": 120000 + }, + { + "epoch": 2.83, + "learning_rate": 0.0002744945009878418, + "loss": 0.9228, + "step": 122000 + }, + { + "epoch": 2.87, + "learning_rate": 0.0002740266990177718, + "loss": 0.9316, + "step": 124000 + }, + { + "epoch": 2.92, + "learning_rate": 0.0002735588970477017, + "loss": 0.9448, + "step": 126000 + }, + { + "epoch": 2.96, + "learning_rate": 0.00027309109507763174, + "loss": 0.9253, + "step": 128000 + }, + { + "epoch": 3.01, + "learning_rate": 0.00027262329310756165, + "loss": 0.9197, + "step": 130000 + }, + { + "epoch": 3.06, + "learning_rate": 0.0002721554911374917, + "loss": 0.9025, + "step": 132000 + }, + { + "epoch": 3.1, + "learning_rate": 0.0002716876891674216, + "loss": 0.8983, + "step": 134000 + }, + { + "epoch": 3.15, + "learning_rate": 0.0002712198871973516, + "loss": 0.9013, + "step": 136000 + }, + { + "epoch": 3.2, + "learning_rate": 0.0002707520852272816, + "loss": 0.9019, + "step": 138000 + }, + { + "epoch": 3.24, + "learning_rate": 0.00027028428325721155, + "loss": 0.9016, + "step": 140000 + }, + { + "epoch": 3.29, + "learning_rate": 0.0002698164812871415, + "loss": 0.8979, + "step": 142000 + }, + { + "epoch": 3.33, + "learning_rate": 0.0002693486793170715, + "loss": 0.8961, + "step": 144000 + }, + { + "epoch": 3.38, + "learning_rate": 0.00026888087734700146, + "loss": 0.9007, + "step": 146000 + }, + { + "epoch": 3.43, + "learning_rate": 0.00026841307537693143, + "loss": 0.8951, + "step": 148000 + }, + { + "epoch": 3.47, + "learning_rate": 0.0002679452734068614, + "loss": 0.8926, + "step": 150000 + }, + { + "epoch": 3.52, + "learning_rate": 0.00026747747143679137, + "loss": 0.8924, + "step": 152000 + }, + { + "epoch": 3.57, + "learning_rate": 0.00026700966946672134, + "loss": 0.9044, + "step": 154000 + }, + { + "epoch": 3.61, + "learning_rate": 0.0002665418674966513, + "loss": 0.8952, + "step": 156000 + }, + { + "epoch": 3.66, + "learning_rate": 0.00026607406552658127, + "loss": 0.9001, + "step": 158000 + }, + { + "epoch": 3.7, + "learning_rate": 0.00026560626355651124, + "loss": 0.8898, + "step": 160000 + }, + { + "epoch": 3.75, + "learning_rate": 0.0002651384615864412, + "loss": 0.895, + "step": 162000 + }, + { + "epoch": 3.8, + "learning_rate": 0.0002646706596163712, + "loss": 0.9015, + "step": 164000 + }, + { + "epoch": 3.84, + "learning_rate": 0.00026420285764630115, + "loss": 0.892, + "step": 166000 + }, + { + "epoch": 3.89, + "learning_rate": 0.0002637350556762311, + "loss": 0.8903, + "step": 168000 + }, + { + "epoch": 3.94, + "learning_rate": 0.0002632672537061611, + "loss": 0.8916, + "step": 170000 + }, + { + "epoch": 3.98, + "learning_rate": 0.00026279945173609105, + "loss": 0.8941, + "step": 172000 + }, + { + "epoch": 4.03, + "learning_rate": 0.000262331649766021, + "loss": 0.8771, + "step": 174000 + }, + { + "epoch": 4.08, + "learning_rate": 0.000261863847795951, + "loss": 0.8716, + "step": 176000 + }, + { + "epoch": 4.12, + "learning_rate": 0.00026139604582588096, + "loss": 0.8632, + "step": 178000 + }, + { + "epoch": 4.17, + "learning_rate": 0.00026092824385581093, + "loss": 0.8573, + "step": 180000 + }, + { + "epoch": 4.21, + "learning_rate": 0.0002604604418857409, + "loss": 0.8642, + "step": 182000 + }, + { + "epoch": 4.26, + "learning_rate": 0.00025999263991567087, + "loss": 0.8642, + "step": 184000 + }, + { + "epoch": 4.31, + "learning_rate": 0.00025952483794560084, + "loss": 0.8617, + "step": 186000 + }, + { + "epoch": 4.35, + "learning_rate": 0.0002590570359755308, + "loss": 0.8574, + "step": 188000 + }, + { + "epoch": 4.4, + "learning_rate": 0.0002585892340054608, + "loss": 0.8612, + "step": 190000 + }, + { + "epoch": 4.45, + "learning_rate": 0.00025812143203539074, + "loss": 0.8706, + "step": 192000 + }, + { + "epoch": 4.49, + "learning_rate": 0.0002576536300653207, + "loss": 0.8605, + "step": 194000 + }, + { + "epoch": 4.54, + "learning_rate": 0.0002571858280952507, + "loss": 0.8703, + "step": 196000 + }, + { + "epoch": 4.58, + "learning_rate": 0.00025671802612518065, + "loss": 0.8691, + "step": 198000 + }, + { + "epoch": 4.63, + "learning_rate": 0.0002562502241551106, + "loss": 0.873, + "step": 200000 + }, + { + "epoch": 4.68, + "learning_rate": 0.0002557824221850406, + "loss": 0.857, + "step": 202000 + }, + { + "epoch": 4.72, + "learning_rate": 0.00025531462021497056, + "loss": 0.8686, + "step": 204000 + }, + { + "epoch": 4.77, + "learning_rate": 0.0002548468182449005, + "loss": 0.868, + "step": 206000 + }, + { + "epoch": 4.82, + "learning_rate": 0.0002543790162748305, + "loss": 0.8756, + "step": 208000 + }, + { + "epoch": 4.86, + "learning_rate": 0.0002539112143047605, + "loss": 0.8656, + "step": 210000 + }, + { + "epoch": 4.91, + "learning_rate": 0.00025344341233469043, + "loss": 0.8725, + "step": 212000 + }, + { + "epoch": 4.96, + "learning_rate": 0.00025297561036462045, + "loss": 0.8675, + "step": 214000 + }, + { + "epoch": 5.0, + "learning_rate": 0.00025250780839455037, + "loss": 0.8644, + "step": 216000 + }, + { + "epoch": 5.05, + "learning_rate": 0.0002520400064244804, + "loss": 0.8397, + "step": 218000 + }, + { + "epoch": 5.09, + "learning_rate": 0.0002515722044544103, + "loss": 0.8374, + "step": 220000 + }, + { + "epoch": 5.14, + "learning_rate": 0.00025110440248434033, + "loss": 0.8364, + "step": 222000 + }, + { + "epoch": 5.19, + "learning_rate": 0.00025063660051427025, + "loss": 0.8454, + "step": 224000 + }, + { + "epoch": 5.23, + "learning_rate": 0.00025016879854420027, + "loss": 0.8367, + "step": 226000 + }, + { + "epoch": 5.28, + "learning_rate": 0.0002497009965741302, + "loss": 0.8275, + "step": 228000 + }, + { + "epoch": 5.33, + "learning_rate": 0.0002492331946040602, + "loss": 0.8398, + "step": 230000 + }, + { + "epoch": 5.37, + "learning_rate": 0.0002487653926339901, + "loss": 0.8403, + "step": 232000 + }, + { + "epoch": 5.42, + "learning_rate": 0.00024829759066392014, + "loss": 0.8409, + "step": 234000 + }, + { + "epoch": 5.46, + "learning_rate": 0.00024782978869385006, + "loss": 0.8366, + "step": 236000 + }, + { + "epoch": 5.51, + "learning_rate": 0.0002473619867237801, + "loss": 0.8381, + "step": 238000 + }, + { + "epoch": 5.56, + "learning_rate": 0.00024689418475371, + "loss": 0.842, + "step": 240000 + }, + { + "epoch": 5.6, + "learning_rate": 0.00024642638278364, + "loss": 0.843, + "step": 242000 + }, + { + "epoch": 5.65, + "learning_rate": 0.00024595858081357, + "loss": 0.8489, + "step": 244000 + }, + { + "epoch": 5.7, + "learning_rate": 0.00024549077884349996, + "loss": 0.8313, + "step": 246000 + }, + { + "epoch": 5.74, + "learning_rate": 0.0002450229768734299, + "loss": 0.8468, + "step": 248000 + }, + { + "epoch": 5.79, + "learning_rate": 0.0002445551749033599, + "loss": 0.8446, + "step": 250000 + }, + { + "epoch": 5.84, + "learning_rate": 0.00024408737293328986, + "loss": 0.8329, + "step": 252000 + }, + { + "epoch": 5.88, + "learning_rate": 0.00024361957096321983, + "loss": 0.8402, + "step": 254000 + }, + { + "epoch": 5.93, + "learning_rate": 0.0002431517689931498, + "loss": 0.8502, + "step": 256000 + }, + { + "epoch": 5.97, + "learning_rate": 0.00024268396702307977, + "loss": 0.8437, + "step": 258000 + }, + { + "epoch": 6.02, + "learning_rate": 0.00024221616505300974, + "loss": 0.8341, + "step": 260000 + }, + { + "epoch": 6.07, + "learning_rate": 0.0002417483630829397, + "loss": 0.8152, + "step": 262000 + }, + { + "epoch": 6.11, + "learning_rate": 0.00024128056111286968, + "loss": 0.8189, + "step": 264000 + }, + { + "epoch": 6.16, + "learning_rate": 0.00024081275914279965, + "loss": 0.8193, + "step": 266000 + }, + { + "epoch": 6.21, + "learning_rate": 0.00024034495717272961, + "loss": 0.825, + "step": 268000 + }, + { + "epoch": 6.25, + "learning_rate": 0.00023987715520265958, + "loss": 0.818, + "step": 270000 + }, + { + "epoch": 6.3, + "learning_rate": 0.00023940935323258955, + "loss": 0.8204, + "step": 272000 + }, + { + "epoch": 6.34, + "learning_rate": 0.00023894155126251952, + "loss": 0.823, + "step": 274000 + }, + { + "epoch": 6.39, + "learning_rate": 0.0002384737492924495, + "loss": 0.8179, + "step": 276000 + }, + { + "epoch": 6.44, + "learning_rate": 0.00023800594732237946, + "loss": 0.8152, + "step": 278000 + }, + { + "epoch": 6.48, + "learning_rate": 0.00023753814535230943, + "loss": 0.8178, + "step": 280000 + }, + { + "epoch": 6.53, + "learning_rate": 0.00023707034338223942, + "loss": 0.8212, + "step": 282000 + }, + { + "epoch": 6.58, + "learning_rate": 0.00023660254141216937, + "loss": 0.8161, + "step": 284000 + }, + { + "epoch": 6.62, + "learning_rate": 0.00023613473944209936, + "loss": 0.8162, + "step": 286000 + }, + { + "epoch": 6.67, + "learning_rate": 0.0002356669374720293, + "loss": 0.828, + "step": 288000 + }, + { + "epoch": 6.72, + "learning_rate": 0.0002351991355019593, + "loss": 0.823, + "step": 290000 + }, + { + "epoch": 6.76, + "learning_rate": 0.00023473133353188927, + "loss": 0.8156, + "step": 292000 + }, + { + "epoch": 6.81, + "learning_rate": 0.00023426353156181924, + "loss": 0.8176, + "step": 294000 + }, + { + "epoch": 6.85, + "learning_rate": 0.0002337957295917492, + "loss": 0.8226, + "step": 296000 + }, + { + "epoch": 6.9, + "learning_rate": 0.00023332792762167917, + "loss": 0.8189, + "step": 298000 + }, + { + "epoch": 6.95, + "learning_rate": 0.00023286012565160914, + "loss": 0.8082, + "step": 300000 + }, + { + "epoch": 6.99, + "learning_rate": 0.0002323923236815391, + "loss": 0.8257, + "step": 302000 + }, + { + "epoch": 7.04, + "learning_rate": 0.00023192452171146908, + "loss": 0.7925, + "step": 304000 + }, + { + "epoch": 7.09, + "learning_rate": 0.00023145671974139905, + "loss": 0.7902, + "step": 306000 + }, + { + "epoch": 7.13, + "learning_rate": 0.00023098891777132902, + "loss": 0.7994, + "step": 308000 + }, + { + "epoch": 7.18, + "learning_rate": 0.000230521115801259, + "loss": 0.8029, + "step": 310000 + }, + { + "epoch": 7.22, + "learning_rate": 0.00023005331383118896, + "loss": 0.7983, + "step": 312000 + }, + { + "epoch": 7.27, + "learning_rate": 0.00022958551186111893, + "loss": 0.797, + "step": 314000 + }, + { + "epoch": 7.32, + "learning_rate": 0.00022911770989104892, + "loss": 0.8009, + "step": 316000 + }, + { + "epoch": 7.36, + "learning_rate": 0.00022864990792097886, + "loss": 0.7979, + "step": 318000 + }, + { + "epoch": 7.41, + "learning_rate": 0.00022818210595090886, + "loss": 0.8055, + "step": 320000 + }, + { + "epoch": 7.46, + "learning_rate": 0.0002277143039808388, + "loss": 0.7971, + "step": 322000 + }, + { + "epoch": 7.5, + "learning_rate": 0.0002272465020107688, + "loss": 0.8066, + "step": 324000 + }, + { + "epoch": 7.55, + "learning_rate": 0.00022677870004069874, + "loss": 0.7975, + "step": 326000 + }, + { + "epoch": 7.6, + "learning_rate": 0.00022631089807062873, + "loss": 0.7912, + "step": 328000 + }, + { + "epoch": 7.64, + "learning_rate": 0.00022584309610055868, + "loss": 0.7988, + "step": 330000 + }, + { + "epoch": 7.69, + "learning_rate": 0.00022537529413048867, + "loss": 0.7999, + "step": 332000 + }, + { + "epoch": 7.73, + "learning_rate": 0.00022490749216041861, + "loss": 0.8019, + "step": 334000 + }, + { + "epoch": 7.78, + "learning_rate": 0.0002244396901903486, + "loss": 0.8108, + "step": 336000 + }, + { + "epoch": 7.83, + "learning_rate": 0.00022397188822027855, + "loss": 0.8075, + "step": 338000 + }, + { + "epoch": 7.87, + "learning_rate": 0.00022350408625020855, + "loss": 0.7995, + "step": 340000 + }, + { + "epoch": 7.92, + "learning_rate": 0.0002230362842801385, + "loss": 0.802, + "step": 342000 + }, + { + "epoch": 7.97, + "learning_rate": 0.00022256848231006848, + "loss": 0.7998, + "step": 344000 + }, + { + "epoch": 8.01, + "learning_rate": 0.00022210068033999843, + "loss": 0.8023, + "step": 346000 + }, + { + "epoch": 8.06, + "learning_rate": 0.00022163287836992842, + "loss": 0.7748, + "step": 348000 + }, + { + "epoch": 8.1, + "learning_rate": 0.00022116507639985836, + "loss": 0.7839, + "step": 350000 + }, + { + "epoch": 8.15, + "learning_rate": 0.00022069727442978836, + "loss": 0.7806, + "step": 352000 + }, + { + "epoch": 8.2, + "learning_rate": 0.00022022947245971836, + "loss": 0.7775, + "step": 354000 + }, + { + "epoch": 8.24, + "learning_rate": 0.0002197616704896483, + "loss": 0.7734, + "step": 356000 + }, + { + "epoch": 8.29, + "learning_rate": 0.0002192938685195783, + "loss": 0.7728, + "step": 358000 + }, + { + "epoch": 8.34, + "learning_rate": 0.00021882606654950824, + "loss": 0.7879, + "step": 360000 + }, + { + "epoch": 8.38, + "learning_rate": 0.00021835826457943823, + "loss": 0.7891, + "step": 362000 + }, + { + "epoch": 8.43, + "learning_rate": 0.00021789046260936817, + "loss": 0.7922, + "step": 364000 + }, + { + "epoch": 8.48, + "learning_rate": 0.00021742266063929817, + "loss": 0.7837, + "step": 366000 + }, + { + "epoch": 8.52, + "learning_rate": 0.0002169548586692281, + "loss": 0.7838, + "step": 368000 + }, + { + "epoch": 8.57, + "learning_rate": 0.0002164870566991581, + "loss": 0.7797, + "step": 370000 + }, + { + "epoch": 8.61, + "learning_rate": 0.00021601925472908805, + "loss": 0.7818, + "step": 372000 + }, + { + "epoch": 8.66, + "learning_rate": 0.00021555145275901804, + "loss": 0.7838, + "step": 374000 + }, + { + "epoch": 8.71, + "learning_rate": 0.000215083650788948, + "loss": 0.7828, + "step": 376000 + }, + { + "epoch": 8.75, + "learning_rate": 0.00021461584881887798, + "loss": 0.7822, + "step": 378000 + }, + { + "epoch": 8.8, + "learning_rate": 0.00021414804684880792, + "loss": 0.7952, + "step": 380000 + }, + { + "epoch": 8.85, + "learning_rate": 0.00021368024487873792, + "loss": 0.7888, + "step": 382000 + }, + { + "epoch": 8.89, + "learning_rate": 0.00021321244290866786, + "loss": 0.7813, + "step": 384000 + }, + { + "epoch": 8.94, + "learning_rate": 0.00021274464093859786, + "loss": 0.7784, + "step": 386000 + }, + { + "epoch": 8.98, + "learning_rate": 0.00021227683896852783, + "loss": 0.776, + "step": 388000 + }, + { + "epoch": 9.03, + "learning_rate": 0.0002118090369984578, + "loss": 0.7686, + "step": 390000 + }, + { + "epoch": 9.08, + "learning_rate": 0.0002113412350283878, + "loss": 0.7576, + "step": 392000 + }, + { + "epoch": 9.12, + "learning_rate": 0.00021087343305831773, + "loss": 0.7611, + "step": 394000 + }, + { + "epoch": 9.17, + "learning_rate": 0.00021040563108824773, + "loss": 0.7698, + "step": 396000 + }, + { + "epoch": 9.22, + "learning_rate": 0.00020993782911817767, + "loss": 0.7632, + "step": 398000 + }, + { + "epoch": 9.26, + "learning_rate": 0.00020947002714810767, + "loss": 0.7725, + "step": 400000 + }, + { + "epoch": 9.31, + "learning_rate": 0.0002090022251780376, + "loss": 0.7706, + "step": 402000 + }, + { + "epoch": 9.35, + "learning_rate": 0.0002085344232079676, + "loss": 0.7709, + "step": 404000 + }, + { + "epoch": 9.4, + "learning_rate": 0.00020806662123789755, + "loss": 0.7651, + "step": 406000 + }, + { + "epoch": 9.45, + "learning_rate": 0.00020759881926782754, + "loss": 0.7657, + "step": 408000 + }, + { + "epoch": 9.49, + "learning_rate": 0.00020713101729775748, + "loss": 0.7589, + "step": 410000 + }, + { + "epoch": 9.54, + "learning_rate": 0.00020666321532768748, + "loss": 0.7683, + "step": 412000 + }, + { + "epoch": 9.59, + "learning_rate": 0.00020619541335761742, + "loss": 0.7684, + "step": 414000 + }, + { + "epoch": 9.63, + "learning_rate": 0.00020572761138754742, + "loss": 0.7756, + "step": 416000 + }, + { + "epoch": 9.68, + "learning_rate": 0.00020525980941747736, + "loss": 0.7653, + "step": 418000 + }, + { + "epoch": 9.73, + "learning_rate": 0.00020479200744740736, + "loss": 0.7718, + "step": 420000 + }, + { + "epoch": 9.77, + "learning_rate": 0.0002043242054773373, + "loss": 0.7676, + "step": 422000 + }, + { + "epoch": 9.82, + "learning_rate": 0.0002038564035072673, + "loss": 0.772, + "step": 424000 + }, + { + "epoch": 9.86, + "learning_rate": 0.00020338860153719726, + "loss": 0.766, + "step": 426000 + }, + { + "epoch": 9.91, + "learning_rate": 0.00020292079956712723, + "loss": 0.7739, + "step": 428000 + }, + { + "epoch": 9.96, + "learning_rate": 0.0002024529975970572, + "loss": 0.7743, + "step": 430000 + }, + { + "epoch": 10.0, + "learning_rate": 0.00020198519562698717, + "loss": 0.7719, + "step": 432000 + }, + { + "epoch": 10.05, + "learning_rate": 0.00020151739365691714, + "loss": 0.7443, + "step": 434000 + }, + { + "epoch": 10.1, + "learning_rate": 0.0002010495916868471, + "loss": 0.7573, + "step": 436000 + }, + { + "epoch": 10.14, + "learning_rate": 0.00020058178971677708, + "loss": 0.7546, + "step": 438000 + }, + { + "epoch": 10.19, + "learning_rate": 0.00020011398774670704, + "loss": 0.7516, + "step": 440000 + }, + { + "epoch": 10.23, + "learning_rate": 0.000199646185776637, + "loss": 0.7444, + "step": 442000 + }, + { + "epoch": 10.28, + "learning_rate": 0.00019917838380656698, + "loss": 0.7656, + "step": 444000 + }, + { + "epoch": 10.33, + "learning_rate": 0.00019871058183649695, + "loss": 0.7452, + "step": 446000 + }, + { + "epoch": 10.37, + "learning_rate": 0.00019824277986642692, + "loss": 0.7555, + "step": 448000 + }, + { + "epoch": 10.42, + "learning_rate": 0.0001977749778963569, + "loss": 0.7486, + "step": 450000 + }, + { + "epoch": 10.47, + "learning_rate": 0.00019730717592628686, + "loss": 0.7509, + "step": 452000 + }, + { + "epoch": 10.51, + "learning_rate": 0.00019683937395621683, + "loss": 0.7484, + "step": 454000 + }, + { + "epoch": 10.56, + "learning_rate": 0.0001963715719861468, + "loss": 0.7554, + "step": 456000 + }, + { + "epoch": 10.61, + "learning_rate": 0.00019590377001607676, + "loss": 0.7557, + "step": 458000 + }, + { + "epoch": 10.65, + "learning_rate": 0.00019543596804600676, + "loss": 0.7603, + "step": 460000 + }, + { + "epoch": 10.7, + "learning_rate": 0.00019496816607593673, + "loss": 0.7577, + "step": 462000 + }, + { + "epoch": 10.74, + "learning_rate": 0.0001945003641058667, + "loss": 0.7641, + "step": 464000 + }, + { + "epoch": 10.79, + "learning_rate": 0.00019403256213579667, + "loss": 0.7648, + "step": 466000 + }, + { + "epoch": 10.84, + "learning_rate": 0.00019356476016572664, + "loss": 0.755, + "step": 468000 + }, + { + "epoch": 10.88, + "learning_rate": 0.0001930969581956566, + "loss": 0.7445, + "step": 470000 + }, + { + "epoch": 10.93, + "learning_rate": 0.00019262915622558657, + "loss": 0.7614, + "step": 472000 + }, + { + "epoch": 10.98, + "learning_rate": 0.00019216135425551654, + "loss": 0.7526, + "step": 474000 + }, + { + "epoch": 11.02, + "learning_rate": 0.0001916935522854465, + "loss": 0.7493, + "step": 476000 + }, + { + "epoch": 11.07, + "learning_rate": 0.00019122575031537648, + "loss": 0.7299, + "step": 478000 + }, + { + "epoch": 11.11, + "learning_rate": 0.00019075794834530645, + "loss": 0.7379, + "step": 480000 + }, + { + "epoch": 11.16, + "learning_rate": 0.00019029014637523642, + "loss": 0.7365, + "step": 482000 + }, + { + "epoch": 11.21, + "learning_rate": 0.00018982234440516639, + "loss": 0.7402, + "step": 484000 + }, + { + "epoch": 11.25, + "learning_rate": 0.00018935454243509636, + "loss": 0.7409, + "step": 486000 + }, + { + "epoch": 11.3, + "learning_rate": 0.00018888674046502632, + "loss": 0.7294, + "step": 488000 + }, + { + "epoch": 11.35, + "learning_rate": 0.0001884189384949563, + "loss": 0.7467, + "step": 490000 + }, + { + "epoch": 11.39, + "learning_rate": 0.00018795113652488626, + "loss": 0.7357, + "step": 492000 + }, + { + "epoch": 11.44, + "learning_rate": 0.00018748333455481623, + "loss": 0.744, + "step": 494000 + }, + { + "epoch": 11.49, + "learning_rate": 0.0001870155325847462, + "loss": 0.741, + "step": 496000 + }, + { + "epoch": 11.53, + "learning_rate": 0.0001865477306146762, + "loss": 0.7404, + "step": 498000 + }, + { + "epoch": 11.58, + "learning_rate": 0.00018607992864460614, + "loss": 0.749, + "step": 500000 + }, + { + "epoch": 11.62, + "learning_rate": 0.00018561212667453613, + "loss": 0.7388, + "step": 502000 + }, + { + "epoch": 11.67, + "learning_rate": 0.00018514432470446607, + "loss": 0.742, + "step": 504000 + }, + { + "epoch": 11.72, + "learning_rate": 0.00018467652273439607, + "loss": 0.7481, + "step": 506000 + }, + { + "epoch": 11.76, + "learning_rate": 0.000184208720764326, + "loss": 0.7553, + "step": 508000 + }, + { + "epoch": 11.81, + "learning_rate": 0.000183740918794256, + "loss": 0.7457, + "step": 510000 + }, + { + "epoch": 11.86, + "learning_rate": 0.00018327311682418595, + "loss": 0.7447, + "step": 512000 + }, + { + "epoch": 11.9, + "learning_rate": 0.00018280531485411595, + "loss": 0.752, + "step": 514000 + }, + { + "epoch": 11.95, + "learning_rate": 0.0001823375128840459, + "loss": 0.7419, + "step": 516000 + }, + { + "epoch": 11.99, + "learning_rate": 0.00018186971091397588, + "loss": 0.7412, + "step": 518000 + }, + { + "epoch": 12.04, + "learning_rate": 0.00018140190894390583, + "loss": 0.7262, + "step": 520000 + }, + { + "epoch": 12.09, + "learning_rate": 0.00018093410697383582, + "loss": 0.7299, + "step": 522000 + }, + { + "epoch": 12.13, + "learning_rate": 0.00018046630500376576, + "loss": 0.7279, + "step": 524000 + }, + { + "epoch": 12.18, + "learning_rate": 0.00017999850303369576, + "loss": 0.7308, + "step": 526000 + }, + { + "epoch": 12.23, + "learning_rate": 0.0001795307010636257, + "loss": 0.7305, + "step": 528000 + }, + { + "epoch": 12.27, + "learning_rate": 0.0001790628990935557, + "loss": 0.7348, + "step": 530000 + }, + { + "epoch": 12.32, + "learning_rate": 0.0001785950971234857, + "loss": 0.7312, + "step": 532000 + }, + { + "epoch": 12.37, + "learning_rate": 0.00017812729515341563, + "loss": 0.7275, + "step": 534000 + }, + { + "epoch": 12.41, + "learning_rate": 0.00017765949318334563, + "loss": 0.7291, + "step": 536000 + }, + { + "epoch": 12.46, + "learning_rate": 0.00017719169121327557, + "loss": 0.7265, + "step": 538000 + }, + { + "epoch": 12.5, + "learning_rate": 0.00017672388924320557, + "loss": 0.7224, + "step": 540000 + }, + { + "epoch": 12.55, + "learning_rate": 0.0001762560872731355, + "loss": 0.7232, + "step": 542000 + }, + { + "epoch": 12.6, + "learning_rate": 0.0001757882853030655, + "loss": 0.7272, + "step": 544000 + }, + { + "epoch": 12.64, + "learning_rate": 0.00017532048333299545, + "loss": 0.7305, + "step": 546000 + }, + { + "epoch": 12.69, + "learning_rate": 0.00017485268136292544, + "loss": 0.7375, + "step": 548000 + }, + { + "epoch": 12.74, + "learning_rate": 0.00017438487939285539, + "loss": 0.7377, + "step": 550000 + }, + { + "epoch": 12.78, + "learning_rate": 0.00017391707742278538, + "loss": 0.7278, + "step": 552000 + }, + { + "epoch": 12.83, + "learning_rate": 0.00017344927545271532, + "loss": 0.7369, + "step": 554000 + }, + { + "epoch": 12.87, + "learning_rate": 0.00017298147348264532, + "loss": 0.7366, + "step": 556000 + }, + { + "epoch": 12.92, + "learning_rate": 0.00017251367151257526, + "loss": 0.736, + "step": 558000 + }, + { + "epoch": 12.97, + "learning_rate": 0.00017204586954250526, + "loss": 0.737, + "step": 560000 + }, + { + "epoch": 13.01, + "learning_rate": 0.0001715780675724352, + "loss": 0.7301, + "step": 562000 + }, + { + "epoch": 13.06, + "learning_rate": 0.0001711102656023652, + "loss": 0.7003, + "step": 564000 + }, + { + "epoch": 13.11, + "learning_rate": 0.00017064246363229514, + "loss": 0.7132, + "step": 566000 + }, + { + "epoch": 13.15, + "learning_rate": 0.00017017466166222513, + "loss": 0.7178, + "step": 568000 + }, + { + "epoch": 13.2, + "learning_rate": 0.00016970685969215513, + "loss": 0.7187, + "step": 570000 + }, + { + "epoch": 13.25, + "learning_rate": 0.00016923905772208507, + "loss": 0.7239, + "step": 572000 + }, + { + "epoch": 13.29, + "learning_rate": 0.00016877125575201507, + "loss": 0.7271, + "step": 574000 + }, + { + "epoch": 13.34, + "learning_rate": 0.000168303453781945, + "loss": 0.7208, + "step": 576000 + }, + { + "epoch": 13.38, + "learning_rate": 0.000167835651811875, + "loss": 0.7199, + "step": 578000 + }, + { + "epoch": 13.43, + "learning_rate": 0.00016736784984180495, + "loss": 0.7094, + "step": 580000 + }, + { + "epoch": 13.48, + "learning_rate": 0.00016690004787173494, + "loss": 0.7114, + "step": 582000 + }, + { + "epoch": 13.52, + "learning_rate": 0.00016643224590166488, + "loss": 0.7196, + "step": 584000 + }, + { + "epoch": 13.57, + "learning_rate": 0.00016596444393159488, + "loss": 0.7222, + "step": 586000 + }, + { + "epoch": 13.62, + "learning_rate": 0.00016549664196152482, + "loss": 0.7345, + "step": 588000 + }, + { + "epoch": 13.66, + "learning_rate": 0.00016502883999145482, + "loss": 0.7208, + "step": 590000 + }, + { + "epoch": 13.71, + "learning_rate": 0.00016456103802138476, + "loss": 0.7298, + "step": 592000 + }, + { + "epoch": 13.75, + "learning_rate": 0.00016409323605131475, + "loss": 0.7324, + "step": 594000 + }, + { + "epoch": 13.8, + "learning_rate": 0.0001636254340812447, + "loss": 0.7243, + "step": 596000 + }, + { + "epoch": 13.85, + "learning_rate": 0.0001631576321111747, + "loss": 0.7215, + "step": 598000 + }, + { + "epoch": 13.89, + "learning_rate": 0.00016268983014110463, + "loss": 0.7246, + "step": 600000 + }, + { + "epoch": 13.94, + "learning_rate": 0.00016222202817103463, + "loss": 0.7219, + "step": 602000 + }, + { + "epoch": 13.99, + "learning_rate": 0.0001617542262009646, + "loss": 0.7248, + "step": 604000 + }, + { + "epoch": 14.03, + "learning_rate": 0.00016128642423089457, + "loss": 0.7139, + "step": 606000 + }, + { + "epoch": 14.08, + "learning_rate": 0.00016081862226082454, + "loss": 0.7026, + "step": 608000 + }, + { + "epoch": 14.13, + "learning_rate": 0.0001603508202907545, + "loss": 0.7107, + "step": 610000 + }, + { + "epoch": 14.17, + "learning_rate": 0.00015988301832068447, + "loss": 0.7037, + "step": 612000 + }, + { + "epoch": 14.22, + "learning_rate": 0.00015941521635061444, + "loss": 0.7145, + "step": 614000 + }, + { + "epoch": 14.26, + "learning_rate": 0.0001589474143805444, + "loss": 0.7181, + "step": 616000 + }, + { + "epoch": 14.31, + "learning_rate": 0.00015847961241047438, + "loss": 0.7026, + "step": 618000 + }, + { + "epoch": 14.36, + "learning_rate": 0.00015801181044040435, + "loss": 0.7142, + "step": 620000 + }, + { + "epoch": 14.4, + "learning_rate": 0.00015754400847033432, + "loss": 0.7087, + "step": 622000 + }, + { + "epoch": 14.45, + "learning_rate": 0.0001570762065002643, + "loss": 0.7109, + "step": 624000 + }, + { + "epoch": 14.5, + "learning_rate": 0.00015660840453019426, + "loss": 0.7031, + "step": 626000 + }, + { + "epoch": 14.54, + "learning_rate": 0.00015614060256012425, + "loss": 0.7101, + "step": 628000 + }, + { + "epoch": 14.59, + "learning_rate": 0.0001556728005900542, + "loss": 0.7152, + "step": 630000 + }, + { + "epoch": 14.63, + "learning_rate": 0.0001552049986199842, + "loss": 0.7147, + "step": 632000 + }, + { + "epoch": 14.68, + "learning_rate": 0.00015473719664991413, + "loss": 0.7144, + "step": 634000 + }, + { + "epoch": 14.73, + "learning_rate": 0.00015426939467984413, + "loss": 0.7113, + "step": 636000 + }, + { + "epoch": 14.77, + "learning_rate": 0.00015380159270977407, + "loss": 0.7071, + "step": 638000 + }, + { + "epoch": 14.82, + "learning_rate": 0.00015333379073970407, + "loss": 0.7118, + "step": 640000 + }, + { + "epoch": 14.87, + "learning_rate": 0.00015286598876963403, + "loss": 0.7098, + "step": 642000 + }, + { + "epoch": 14.91, + "learning_rate": 0.000152398186799564, + "loss": 0.706, + "step": 644000 + }, + { + "epoch": 14.96, + "learning_rate": 0.00015193038482949397, + "loss": 0.709, + "step": 646000 + }, + { + "epoch": 15.01, + "learning_rate": 0.00015146258285942394, + "loss": 0.7087, + "step": 648000 + }, + { + "epoch": 15.05, + "learning_rate": 0.0001509947808893539, + "loss": 0.6983, + "step": 650000 + }, + { + "epoch": 15.1, + "learning_rate": 0.00015052697891928388, + "loss": 0.693, + "step": 652000 + }, + { + "epoch": 15.14, + "learning_rate": 0.00015005917694921385, + "loss": 0.6953, + "step": 654000 + }, + { + "epoch": 15.19, + "learning_rate": 0.00014959137497914382, + "loss": 0.6994, + "step": 656000 + }, + { + "epoch": 15.24, + "learning_rate": 0.00014912357300907379, + "loss": 0.6975, + "step": 658000 + }, + { + "epoch": 15.28, + "learning_rate": 0.00014865577103900375, + "loss": 0.7047, + "step": 660000 + }, + { + "epoch": 15.33, + "learning_rate": 0.00014818796906893372, + "loss": 0.6975, + "step": 662000 + }, + { + "epoch": 15.38, + "learning_rate": 0.0001477201670988637, + "loss": 0.704, + "step": 664000 + }, + { + "epoch": 15.42, + "learning_rate": 0.00014725236512879366, + "loss": 0.7042, + "step": 666000 + }, + { + "epoch": 15.47, + "learning_rate": 0.00014678456315872363, + "loss": 0.6917, + "step": 668000 + }, + { + "epoch": 15.51, + "learning_rate": 0.0001463167611886536, + "loss": 0.6914, + "step": 670000 + }, + { + "epoch": 15.56, + "learning_rate": 0.00014584895921858357, + "loss": 0.7018, + "step": 672000 + }, + { + "epoch": 15.61, + "learning_rate": 0.00014538115724851354, + "loss": 0.7016, + "step": 674000 + }, + { + "epoch": 15.65, + "learning_rate": 0.0001449133552784435, + "loss": 0.7078, + "step": 676000 + }, + { + "epoch": 15.7, + "learning_rate": 0.00014444555330837347, + "loss": 0.6932, + "step": 678000 + }, + { + "epoch": 15.75, + "learning_rate": 0.00014397775133830344, + "loss": 0.6964, + "step": 680000 + }, + { + "epoch": 15.79, + "learning_rate": 0.0001435099493682334, + "loss": 0.6997, + "step": 682000 + }, + { + "epoch": 15.84, + "learning_rate": 0.00014304214739816338, + "loss": 0.7065, + "step": 684000 + }, + { + "epoch": 15.88, + "learning_rate": 0.00014257434542809335, + "loss": 0.7047, + "step": 686000 + }, + { + "epoch": 15.93, + "learning_rate": 0.00014210654345802332, + "loss": 0.7154, + "step": 688000 + }, + { + "epoch": 15.98, + "learning_rate": 0.0001416387414879533, + "loss": 0.6993, + "step": 690000 + }, + { + "epoch": 16.02, + "learning_rate": 0.00014117093951788326, + "loss": 0.6969, + "step": 692000 + }, + { + "epoch": 16.07, + "learning_rate": 0.00014070313754781325, + "loss": 0.689, + "step": 694000 + }, + { + "epoch": 16.12, + "learning_rate": 0.00014023533557774322, + "loss": 0.6888, + "step": 696000 + }, + { + "epoch": 16.16, + "learning_rate": 0.0001397675336076732, + "loss": 0.6818, + "step": 698000 + }, + { + "epoch": 16.21, + "learning_rate": 0.00013929973163760316, + "loss": 0.693, + "step": 700000 + }, + { + "epoch": 16.26, + "learning_rate": 0.00013883192966753313, + "loss": 0.6909, + "step": 702000 + }, + { + "epoch": 16.3, + "learning_rate": 0.0001383641276974631, + "loss": 0.6873, + "step": 704000 + }, + { + "epoch": 16.35, + "learning_rate": 0.00013789632572739307, + "loss": 0.6906, + "step": 706000 + }, + { + "epoch": 16.39, + "learning_rate": 0.00013742852375732303, + "loss": 0.6866, + "step": 708000 + }, + { + "epoch": 16.44, + "learning_rate": 0.000136960721787253, + "loss": 0.701, + "step": 710000 + }, + { + "epoch": 16.49, + "learning_rate": 0.00013649291981718297, + "loss": 0.6937, + "step": 712000 + }, + { + "epoch": 16.53, + "learning_rate": 0.00013602511784711294, + "loss": 0.6907, + "step": 714000 + }, + { + "epoch": 16.58, + "learning_rate": 0.0001355573158770429, + "loss": 0.6897, + "step": 716000 + }, + { + "epoch": 16.63, + "learning_rate": 0.00013508951390697288, + "loss": 0.6952, + "step": 718000 + }, + { + "epoch": 16.67, + "learning_rate": 0.00013462171193690285, + "loss": 0.6865, + "step": 720000 + }, + { + "epoch": 16.72, + "learning_rate": 0.00013415390996683282, + "loss": 0.6935, + "step": 722000 + }, + { + "epoch": 16.76, + "learning_rate": 0.00013368610799676278, + "loss": 0.6919, + "step": 724000 + }, + { + "epoch": 16.81, + "learning_rate": 0.00013321830602669275, + "loss": 0.6904, + "step": 726000 + }, + { + "epoch": 16.86, + "learning_rate": 0.00013275050405662272, + "loss": 0.6964, + "step": 728000 + }, + { + "epoch": 16.9, + "learning_rate": 0.00013228270208655272, + "loss": 0.6943, + "step": 730000 + }, + { + "epoch": 16.95, + "learning_rate": 0.0001318149001164827, + "loss": 0.6949, + "step": 732000 + }, + { + "epoch": 17.0, + "learning_rate": 0.00013134709814641266, + "loss": 0.6943, + "step": 734000 + }, + { + "epoch": 17.04, + "learning_rate": 0.00013087929617634263, + "loss": 0.6851, + "step": 736000 + }, + { + "epoch": 17.09, + "learning_rate": 0.0001304114942062726, + "loss": 0.6802, + "step": 738000 + }, + { + "epoch": 17.14, + "learning_rate": 0.00012994369223620256, + "loss": 0.6801, + "step": 740000 + }, + { + "epoch": 17.18, + "learning_rate": 0.00012947589026613253, + "loss": 0.6756, + "step": 742000 + }, + { + "epoch": 17.23, + "learning_rate": 0.0001290080882960625, + "loss": 0.6824, + "step": 744000 + }, + { + "epoch": 17.27, + "learning_rate": 0.00012854028632599247, + "loss": 0.6894, + "step": 746000 + }, + { + "epoch": 17.32, + "learning_rate": 0.00012807248435592244, + "loss": 0.682, + "step": 748000 + }, + { + "epoch": 17.37, + "learning_rate": 0.0001276046823858524, + "loss": 0.6814, + "step": 750000 + }, + { + "epoch": 17.41, + "learning_rate": 0.00012713688041578238, + "loss": 0.6737, + "step": 752000 + }, + { + "epoch": 17.46, + "learning_rate": 0.00012666907844571234, + "loss": 0.6874, + "step": 754000 + }, + { + "epoch": 17.51, + "learning_rate": 0.00012620127647564231, + "loss": 0.6842, + "step": 756000 + }, + { + "epoch": 17.55, + "learning_rate": 0.00012573347450557228, + "loss": 0.6871, + "step": 758000 + }, + { + "epoch": 17.6, + "learning_rate": 0.00012526567253550225, + "loss": 0.6833, + "step": 760000 + }, + { + "epoch": 17.64, + "learning_rate": 0.00012479787056543222, + "loss": 0.6818, + "step": 762000 + }, + { + "epoch": 17.69, + "learning_rate": 0.0001243300685953622, + "loss": 0.6824, + "step": 764000 + }, + { + "epoch": 17.74, + "learning_rate": 0.00012386226662529219, + "loss": 0.684, + "step": 766000 + }, + { + "epoch": 17.78, + "learning_rate": 0.00012339446465522215, + "loss": 0.6822, + "step": 768000 + }, + { + "epoch": 17.83, + "learning_rate": 0.00012292666268515212, + "loss": 0.68, + "step": 770000 + }, + { + "epoch": 17.88, + "learning_rate": 0.0001224588607150821, + "loss": 0.6842, + "step": 772000 + }, + { + "epoch": 17.92, + "learning_rate": 0.00012199105874501206, + "loss": 0.6827, + "step": 774000 + }, + { + "epoch": 17.97, + "learning_rate": 0.00012152325677494203, + "loss": 0.6901, + "step": 776000 + }, + { + "epoch": 18.02, + "learning_rate": 0.000121055454804872, + "loss": 0.6781, + "step": 778000 + }, + { + "epoch": 18.06, + "learning_rate": 0.00012058765283480197, + "loss": 0.6768, + "step": 780000 + }, + { + "epoch": 18.11, + "learning_rate": 0.00012011985086473194, + "loss": 0.6704, + "step": 782000 + }, + { + "epoch": 18.15, + "learning_rate": 0.0001196520488946619, + "loss": 0.6767, + "step": 784000 + }, + { + "epoch": 18.2, + "learning_rate": 0.00011918424692459187, + "loss": 0.6696, + "step": 786000 + }, + { + "epoch": 18.25, + "learning_rate": 0.00011871644495452184, + "loss": 0.6717, + "step": 788000 + }, + { + "epoch": 18.29, + "learning_rate": 0.00011824864298445181, + "loss": 0.6666, + "step": 790000 + }, + { + "epoch": 18.34, + "learning_rate": 0.00011778084101438178, + "loss": 0.6681, + "step": 792000 + }, + { + "epoch": 18.39, + "learning_rate": 0.00011731303904431175, + "loss": 0.6688, + "step": 794000 + }, + { + "epoch": 18.43, + "learning_rate": 0.00011684523707424172, + "loss": 0.6809, + "step": 796000 + }, + { + "epoch": 18.48, + "learning_rate": 0.00011637743510417169, + "loss": 0.6704, + "step": 798000 + }, + { + "epoch": 18.52, + "learning_rate": 0.00011590963313410166, + "loss": 0.6732, + "step": 800000 + }, + { + "epoch": 18.57, + "learning_rate": 0.00011544183116403164, + "loss": 0.6688, + "step": 802000 + }, + { + "epoch": 18.62, + "learning_rate": 0.00011497402919396161, + "loss": 0.6767, + "step": 804000 + }, + { + "epoch": 18.66, + "learning_rate": 0.00011450622722389158, + "loss": 0.6721, + "step": 806000 + }, + { + "epoch": 18.71, + "learning_rate": 0.00011403842525382154, + "loss": 0.6716, + "step": 808000 + }, + { + "epoch": 18.76, + "learning_rate": 0.00011357062328375151, + "loss": 0.673, + "step": 810000 + }, + { + "epoch": 18.8, + "learning_rate": 0.00011310282131368148, + "loss": 0.6717, + "step": 812000 + }, + { + "epoch": 18.85, + "learning_rate": 0.00011263501934361145, + "loss": 0.6607, + "step": 814000 + }, + { + "epoch": 18.9, + "learning_rate": 0.00011216721737354142, + "loss": 0.6732, + "step": 816000 + }, + { + "epoch": 18.94, + "learning_rate": 0.00011169941540347139, + "loss": 0.6715, + "step": 818000 + }, + { + "epoch": 18.99, + "learning_rate": 0.00011123161343340136, + "loss": 0.678, + "step": 820000 + }, + { + "epoch": 19.03, + "learning_rate": 0.00011076381146333133, + "loss": 0.6618, + "step": 822000 + }, + { + "epoch": 19.08, + "learning_rate": 0.0001102960094932613, + "loss": 0.6589, + "step": 824000 + }, + { + "epoch": 19.13, + "learning_rate": 0.00010982820752319126, + "loss": 0.6624, + "step": 826000 + }, + { + "epoch": 19.17, + "learning_rate": 0.00010936040555312123, + "loss": 0.6618, + "step": 828000 + }, + { + "epoch": 19.22, + "learning_rate": 0.0001088926035830512, + "loss": 0.6666, + "step": 830000 + }, + { + "epoch": 19.27, + "learning_rate": 0.00010842480161298117, + "loss": 0.6645, + "step": 832000 + }, + { + "epoch": 19.31, + "learning_rate": 0.00010795699964291114, + "loss": 0.6667, + "step": 834000 + }, + { + "epoch": 19.36, + "learning_rate": 0.00010748919767284111, + "loss": 0.6649, + "step": 836000 + }, + { + "epoch": 19.4, + "learning_rate": 0.0001070213957027711, + "loss": 0.659, + "step": 838000 + }, + { + "epoch": 19.45, + "learning_rate": 0.00010655359373270107, + "loss": 0.6611, + "step": 840000 + }, + { + "epoch": 19.5, + "learning_rate": 0.00010608579176263104, + "loss": 0.6565, + "step": 842000 + }, + { + "epoch": 19.54, + "learning_rate": 0.00010561798979256101, + "loss": 0.6631, + "step": 844000 + }, + { + "epoch": 19.59, + "learning_rate": 0.00010515018782249098, + "loss": 0.6593, + "step": 846000 + }, + { + "epoch": 19.64, + "learning_rate": 0.00010468238585242095, + "loss": 0.6654, + "step": 848000 + }, + { + "epoch": 19.68, + "learning_rate": 0.00010421458388235092, + "loss": 0.6621, + "step": 850000 + }, + { + "epoch": 19.73, + "learning_rate": 0.00010374678191228089, + "loss": 0.661, + "step": 852000 + }, + { + "epoch": 19.78, + "learning_rate": 0.00010327897994221086, + "loss": 0.6515, + "step": 854000 + }, + { + "epoch": 19.82, + "learning_rate": 0.00010281117797214082, + "loss": 0.6614, + "step": 856000 + }, + { + "epoch": 19.87, + "learning_rate": 0.0001023433760020708, + "loss": 0.6616, + "step": 858000 + }, + { + "epoch": 19.91, + "learning_rate": 0.00010187557403200076, + "loss": 0.6598, + "step": 860000 + }, + { + "epoch": 19.96, + "learning_rate": 0.00010140777206193073, + "loss": 0.6616, + "step": 862000 + }, + { + "epoch": 20.01, + "learning_rate": 0.0001009399700918607, + "loss": 0.6679, + "step": 864000 + }, + { + "epoch": 20.05, + "learning_rate": 0.00010047216812179067, + "loss": 0.6518, + "step": 866000 + }, + { + "epoch": 20.1, + "learning_rate": 0.00010000436615172064, + "loss": 0.6463, + "step": 868000 + }, + { + "epoch": 20.15, + "learning_rate": 9.95365641816506e-05, + "loss": 0.6529, + "step": 870000 + }, + { + "epoch": 20.19, + "learning_rate": 9.906876221158058e-05, + "loss": 0.6463, + "step": 872000 + }, + { + "epoch": 20.24, + "learning_rate": 9.860096024151056e-05, + "loss": 0.6545, + "step": 874000 + }, + { + "epoch": 20.28, + "learning_rate": 9.813315827144053e-05, + "loss": 0.6531, + "step": 876000 + }, + { + "epoch": 20.33, + "learning_rate": 9.76653563013705e-05, + "loss": 0.6442, + "step": 878000 + }, + { + "epoch": 20.38, + "learning_rate": 9.719755433130046e-05, + "loss": 0.65, + "step": 880000 + }, + { + "epoch": 20.42, + "learning_rate": 9.672975236123043e-05, + "loss": 0.6518, + "step": 882000 + }, + { + "epoch": 20.47, + "learning_rate": 9.62619503911604e-05, + "loss": 0.6546, + "step": 884000 + }, + { + "epoch": 20.52, + "learning_rate": 9.579414842109037e-05, + "loss": 0.6494, + "step": 886000 + }, + { + "epoch": 20.56, + "learning_rate": 9.532634645102035e-05, + "loss": 0.654, + "step": 888000 + }, + { + "epoch": 20.61, + "learning_rate": 9.485854448095032e-05, + "loss": 0.6536, + "step": 890000 + }, + { + "epoch": 20.66, + "learning_rate": 9.439074251088029e-05, + "loss": 0.6547, + "step": 892000 + }, + { + "epoch": 20.7, + "learning_rate": 9.392294054081026e-05, + "loss": 0.6421, + "step": 894000 + }, + { + "epoch": 20.75, + "learning_rate": 9.345513857074023e-05, + "loss": 0.6506, + "step": 896000 + }, + { + "epoch": 20.79, + "learning_rate": 9.29873366006702e-05, + "loss": 0.6551, + "step": 898000 + }, + { + "epoch": 20.84, + "learning_rate": 9.251953463060017e-05, + "loss": 0.6542, + "step": 900000 + }, + { + "epoch": 20.89, + "learning_rate": 9.205173266053014e-05, + "loss": 0.6398, + "step": 902000 + }, + { + "epoch": 20.93, + "learning_rate": 9.15839306904601e-05, + "loss": 0.653, + "step": 904000 + }, + { + "epoch": 20.98, + "learning_rate": 9.111612872039007e-05, + "loss": 0.6476, + "step": 906000 + }, + { + "epoch": 21.03, + "learning_rate": 9.064832675032004e-05, + "loss": 0.6378, + "step": 908000 + }, + { + "epoch": 21.07, + "learning_rate": 9.018052478025002e-05, + "loss": 0.6413, + "step": 910000 + }, + { + "epoch": 21.12, + "learning_rate": 8.971272281017999e-05, + "loss": 0.6368, + "step": 912000 + }, + { + "epoch": 21.16, + "learning_rate": 8.924492084010996e-05, + "loss": 0.6366, + "step": 914000 + }, + { + "epoch": 21.21, + "learning_rate": 8.877711887003993e-05, + "loss": 0.6455, + "step": 916000 + }, + { + "epoch": 21.26, + "learning_rate": 8.83093168999699e-05, + "loss": 0.6448, + "step": 918000 + }, + { + "epoch": 21.3, + "learning_rate": 8.784151492989987e-05, + "loss": 0.6371, + "step": 920000 + }, + { + "epoch": 21.35, + "learning_rate": 8.737371295982984e-05, + "loss": 0.6457, + "step": 922000 + }, + { + "epoch": 21.4, + "learning_rate": 8.69059109897598e-05, + "loss": 0.6399, + "step": 924000 + }, + { + "epoch": 21.44, + "learning_rate": 8.643810901968978e-05, + "loss": 0.6389, + "step": 926000 + }, + { + "epoch": 21.49, + "learning_rate": 8.597030704961974e-05, + "loss": 0.6444, + "step": 928000 + }, + { + "epoch": 21.54, + "learning_rate": 8.550250507954971e-05, + "loss": 0.6346, + "step": 930000 + }, + { + "epoch": 21.58, + "learning_rate": 8.503470310947968e-05, + "loss": 0.6394, + "step": 932000 + }, + { + "epoch": 21.63, + "learning_rate": 8.456690113940965e-05, + "loss": 0.6397, + "step": 934000 + }, + { + "epoch": 21.67, + "learning_rate": 8.409909916933962e-05, + "loss": 0.6411, + "step": 936000 + }, + { + "epoch": 21.72, + "learning_rate": 8.363129719926959e-05, + "loss": 0.6383, + "step": 938000 + }, + { + "epoch": 21.77, + "learning_rate": 8.316349522919956e-05, + "loss": 0.6416, + "step": 940000 + }, + { + "epoch": 21.81, + "learning_rate": 8.269569325912953e-05, + "loss": 0.635, + "step": 942000 + }, + { + "epoch": 21.86, + "learning_rate": 8.22278912890595e-05, + "loss": 0.6371, + "step": 944000 + }, + { + "epoch": 21.91, + "learning_rate": 8.176008931898949e-05, + "loss": 0.6412, + "step": 946000 + }, + { + "epoch": 21.95, + "learning_rate": 8.129228734891946e-05, + "loss": 0.6414, + "step": 948000 + }, + { + "epoch": 22.0, + "learning_rate": 8.082448537884943e-05, + "loss": 0.6285, + "step": 950000 + }, + { + "epoch": 22.04, + "learning_rate": 8.03566834087794e-05, + "loss": 0.6285, + "step": 952000 + }, + { + "epoch": 22.09, + "learning_rate": 7.988888143870937e-05, + "loss": 0.6268, + "step": 954000 + }, + { + "epoch": 22.14, + "learning_rate": 7.942107946863934e-05, + "loss": 0.6251, + "step": 956000 + }, + { + "epoch": 22.18, + "learning_rate": 7.89532774985693e-05, + "loss": 0.6306, + "step": 958000 + }, + { + "epoch": 22.23, + "learning_rate": 7.848547552849927e-05, + "loss": 0.6283, + "step": 960000 + }, + { + "epoch": 22.28, + "learning_rate": 7.801767355842924e-05, + "loss": 0.6264, + "step": 962000 + }, + { + "epoch": 22.32, + "learning_rate": 7.754987158835921e-05, + "loss": 0.6279, + "step": 964000 + }, + { + "epoch": 22.37, + "learning_rate": 7.708206961828918e-05, + "loss": 0.6272, + "step": 966000 + }, + { + "epoch": 22.41, + "learning_rate": 7.661426764821915e-05, + "loss": 0.6355, + "step": 968000 + }, + { + "epoch": 22.46, + "learning_rate": 7.614646567814912e-05, + "loss": 0.6349, + "step": 970000 + }, + { + "epoch": 22.51, + "learning_rate": 7.567866370807909e-05, + "loss": 0.6281, + "step": 972000 + }, + { + "epoch": 22.55, + "learning_rate": 7.521086173800905e-05, + "loss": 0.6269, + "step": 974000 + }, + { + "epoch": 22.6, + "learning_rate": 7.474305976793904e-05, + "loss": 0.6221, + "step": 976000 + }, + { + "epoch": 22.65, + "learning_rate": 7.4275257797869e-05, + "loss": 0.6295, + "step": 978000 + }, + { + "epoch": 22.69, + "learning_rate": 7.380745582779897e-05, + "loss": 0.6265, + "step": 980000 + }, + { + "epoch": 22.74, + "learning_rate": 7.333965385772894e-05, + "loss": 0.6203, + "step": 982000 + }, + { + "epoch": 22.79, + "learning_rate": 7.287185188765891e-05, + "loss": 0.6306, + "step": 984000 + }, + { + "epoch": 22.83, + "learning_rate": 7.240404991758888e-05, + "loss": 0.6319, + "step": 986000 + }, + { + "epoch": 22.88, + "learning_rate": 7.193624794751885e-05, + "loss": 0.6211, + "step": 988000 + }, + { + "epoch": 22.92, + "learning_rate": 7.146844597744882e-05, + "loss": 0.6244, + "step": 990000 + }, + { + "epoch": 22.97, + "learning_rate": 7.100064400737879e-05, + "loss": 0.6262, + "step": 992000 + }, + { + "epoch": 23.02, + "learning_rate": 7.053284203730876e-05, + "loss": 0.6166, + "step": 994000 + }, + { + "epoch": 23.06, + "learning_rate": 7.006504006723873e-05, + "loss": 0.6166, + "step": 996000 + }, + { + "epoch": 23.11, + "learning_rate": 6.95972380971687e-05, + "loss": 0.6175, + "step": 998000 + }, + { + "epoch": 23.16, + "learning_rate": 6.912943612709866e-05, + "loss": 0.6151, + "step": 1000000 + }, + { + "epoch": 23.2, + "learning_rate": 6.866163415702863e-05, + "loss": 0.6153, + "step": 1002000 + }, + { + "epoch": 23.25, + "learning_rate": 6.81938321869586e-05, + "loss": 0.6212, + "step": 1004000 + }, + { + "epoch": 23.29, + "learning_rate": 6.772603021688858e-05, + "loss": 0.6161, + "step": 1006000 + }, + { + "epoch": 23.34, + "learning_rate": 6.725822824681855e-05, + "loss": 0.6158, + "step": 1008000 + }, + { + "epoch": 23.39, + "learning_rate": 6.679042627674852e-05, + "loss": 0.6089, + "step": 1010000 + }, + { + "epoch": 23.43, + "learning_rate": 6.632262430667849e-05, + "loss": 0.6166, + "step": 1012000 + }, + { + "epoch": 23.48, + "learning_rate": 6.585482233660846e-05, + "loss": 0.6134, + "step": 1014000 + }, + { + "epoch": 23.53, + "learning_rate": 6.538702036653843e-05, + "loss": 0.6171, + "step": 1016000 + }, + { + "epoch": 23.57, + "learning_rate": 6.49192183964684e-05, + "loss": 0.6122, + "step": 1018000 + }, + { + "epoch": 23.62, + "learning_rate": 6.445141642639837e-05, + "loss": 0.6176, + "step": 1020000 + }, + { + "epoch": 23.67, + "learning_rate": 6.398361445632833e-05, + "loss": 0.6146, + "step": 1022000 + }, + { + "epoch": 23.71, + "learning_rate": 6.35158124862583e-05, + "loss": 0.6069, + "step": 1024000 + }, + { + "epoch": 23.76, + "learning_rate": 6.304801051618829e-05, + "loss": 0.6169, + "step": 1026000 + }, + { + "epoch": 23.8, + "learning_rate": 6.258020854611825e-05, + "loss": 0.6222, + "step": 1028000 + }, + { + "epoch": 23.85, + "learning_rate": 6.211240657604822e-05, + "loss": 0.6152, + "step": 1030000 + }, + { + "epoch": 23.9, + "learning_rate": 6.164460460597819e-05, + "loss": 0.6181, + "step": 1032000 + }, + { + "epoch": 23.94, + "learning_rate": 6.117680263590816e-05, + "loss": 0.6123, + "step": 1034000 + }, + { + "epoch": 23.99, + "learning_rate": 6.070900066583813e-05, + "loss": 0.619, + "step": 1036000 + }, + { + "epoch": 24.04, + "learning_rate": 6.02411986957681e-05, + "loss": 0.6099, + "step": 1038000 + }, + { + "epoch": 24.08, + "learning_rate": 5.977339672569807e-05, + "loss": 0.6098, + "step": 1040000 + }, + { + "epoch": 24.13, + "learning_rate": 5.930559475562804e-05, + "loss": 0.5965, + "step": 1042000 + }, + { + "epoch": 24.17, + "learning_rate": 5.883779278555802e-05, + "loss": 0.6059, + "step": 1044000 + }, + { + "epoch": 24.22, + "learning_rate": 5.836999081548799e-05, + "loss": 0.6021, + "step": 1046000 + }, + { + "epoch": 24.27, + "learning_rate": 5.790218884541796e-05, + "loss": 0.6093, + "step": 1048000 + }, + { + "epoch": 24.31, + "learning_rate": 5.7434386875347926e-05, + "loss": 0.6031, + "step": 1050000 + }, + { + "epoch": 24.36, + "learning_rate": 5.6966584905277895e-05, + "loss": 0.6053, + "step": 1052000 + }, + { + "epoch": 24.41, + "learning_rate": 5.6498782935207863e-05, + "loss": 0.6036, + "step": 1054000 + }, + { + "epoch": 24.45, + "learning_rate": 5.603098096513783e-05, + "loss": 0.6011, + "step": 1056000 + }, + { + "epoch": 24.5, + "learning_rate": 5.55631789950678e-05, + "loss": 0.6035, + "step": 1058000 + }, + { + "epoch": 24.55, + "learning_rate": 5.509537702499777e-05, + "loss": 0.6066, + "step": 1060000 + }, + { + "epoch": 24.59, + "learning_rate": 5.4627575054927746e-05, + "loss": 0.6061, + "step": 1062000 + }, + { + "epoch": 24.64, + "learning_rate": 5.4159773084857714e-05, + "loss": 0.6027, + "step": 1064000 + }, + { + "epoch": 24.68, + "learning_rate": 5.369197111478768e-05, + "loss": 0.6, + "step": 1066000 + }, + { + "epoch": 24.73, + "learning_rate": 5.322416914471765e-05, + "loss": 0.6062, + "step": 1068000 + }, + { + "epoch": 24.78, + "learning_rate": 5.275636717464762e-05, + "loss": 0.6003, + "step": 1070000 + }, + { + "epoch": 24.82, + "learning_rate": 5.228856520457759e-05, + "loss": 0.5988, + "step": 1072000 + }, + { + "epoch": 24.87, + "learning_rate": 5.182076323450756e-05, + "loss": 0.6096, + "step": 1074000 + }, + { + "epoch": 24.92, + "learning_rate": 5.135296126443753e-05, + "loss": 0.5988, + "step": 1076000 + }, + { + "epoch": 24.96, + "learning_rate": 5.08851592943675e-05, + "loss": 0.6086, + "step": 1078000 + }, + { + "epoch": 25.01, + "learning_rate": 5.041735732429748e-05, + "loss": 0.5942, + "step": 1080000 + }, + { + "epoch": 25.05, + "learning_rate": 4.994955535422745e-05, + "loss": 0.5954, + "step": 1082000 + }, + { + "epoch": 25.1, + "learning_rate": 4.9481753384157417e-05, + "loss": 0.5948, + "step": 1084000 + }, + { + "epoch": 25.15, + "learning_rate": 4.9013951414087385e-05, + "loss": 0.5946, + "step": 1086000 + }, + { + "epoch": 25.19, + "learning_rate": 4.8546149444017354e-05, + "loss": 0.5938, + "step": 1088000 + }, + { + "epoch": 25.24, + "learning_rate": 4.807834747394732e-05, + "loss": 0.5961, + "step": 1090000 + }, + { + "epoch": 25.29, + "learning_rate": 4.761054550387729e-05, + "loss": 0.5947, + "step": 1092000 + }, + { + "epoch": 25.33, + "learning_rate": 4.714274353380726e-05, + "loss": 0.6019, + "step": 1094000 + }, + { + "epoch": 25.38, + "learning_rate": 4.667494156373723e-05, + "loss": 0.5927, + "step": 1096000 + }, + { + "epoch": 25.43, + "learning_rate": 4.6207139593667205e-05, + "loss": 0.5921, + "step": 1098000 + }, + { + "epoch": 25.47, + "learning_rate": 4.5739337623597174e-05, + "loss": 0.5954, + "step": 1100000 + }, + { + "epoch": 25.52, + "learning_rate": 4.527153565352715e-05, + "loss": 0.5926, + "step": 1102000 + }, + { + "epoch": 25.56, + "learning_rate": 4.480373368345712e-05, + "loss": 0.5963, + "step": 1104000 + }, + { + "epoch": 25.61, + "learning_rate": 4.433593171338709e-05, + "loss": 0.5902, + "step": 1106000 + }, + { + "epoch": 25.66, + "learning_rate": 4.3868129743317056e-05, + "loss": 0.5952, + "step": 1108000 + }, + { + "epoch": 25.7, + "learning_rate": 4.3400327773247025e-05, + "loss": 0.5878, + "step": 1110000 + }, + { + "epoch": 25.75, + "learning_rate": 4.2932525803176994e-05, + "loss": 0.5926, + "step": 1112000 + }, + { + "epoch": 25.8, + "learning_rate": 4.246472383310696e-05, + "loss": 0.5854, + "step": 1114000 + }, + { + "epoch": 25.84, + "learning_rate": 4.199692186303694e-05, + "loss": 0.5916, + "step": 1116000 + }, + { + "epoch": 25.89, + "learning_rate": 4.152911989296691e-05, + "loss": 0.5869, + "step": 1118000 + }, + { + "epoch": 25.93, + "learning_rate": 4.1061317922896876e-05, + "loss": 0.5913, + "step": 1120000 + }, + { + "epoch": 25.98, + "learning_rate": 4.0593515952826845e-05, + "loss": 0.5822, + "step": 1122000 + }, + { + "epoch": 26.03, + "learning_rate": 4.0125713982756814e-05, + "loss": 0.5831, + "step": 1124000 + }, + { + "epoch": 26.07, + "learning_rate": 3.965791201268678e-05, + "loss": 0.5847, + "step": 1126000 + }, + { + "epoch": 26.12, + "learning_rate": 3.919011004261675e-05, + "loss": 0.5828, + "step": 1128000 + }, + { + "epoch": 26.17, + "learning_rate": 3.872230807254672e-05, + "loss": 0.5825, + "step": 1130000 + }, + { + "epoch": 26.21, + "learning_rate": 3.825450610247669e-05, + "loss": 0.5848, + "step": 1132000 + }, + { + "epoch": 26.26, + "learning_rate": 3.778670413240667e-05, + "loss": 0.5866, + "step": 1134000 + }, + { + "epoch": 26.31, + "learning_rate": 3.7318902162336634e-05, + "loss": 0.5832, + "step": 1136000 + }, + { + "epoch": 26.35, + "learning_rate": 3.685110019226661e-05, + "loss": 0.58, + "step": 1138000 + }, + { + "epoch": 26.4, + "learning_rate": 3.638329822219658e-05, + "loss": 0.5767, + "step": 1140000 + }, + { + "epoch": 26.44, + "learning_rate": 3.591549625212655e-05, + "loss": 0.5792, + "step": 1142000 + }, + { + "epoch": 26.49, + "learning_rate": 3.5447694282056516e-05, + "loss": 0.5764, + "step": 1144000 + }, + { + "epoch": 26.54, + "learning_rate": 3.4979892311986485e-05, + "loss": 0.5794, + "step": 1146000 + }, + { + "epoch": 26.58, + "learning_rate": 3.451209034191646e-05, + "loss": 0.5738, + "step": 1148000 + }, + { + "epoch": 26.63, + "learning_rate": 3.404428837184643e-05, + "loss": 0.5822, + "step": 1150000 + }, + { + "epoch": 26.68, + "learning_rate": 3.35764864017764e-05, + "loss": 0.5734, + "step": 1152000 + }, + { + "epoch": 26.72, + "learning_rate": 3.310868443170637e-05, + "loss": 0.5794, + "step": 1154000 + }, + { + "epoch": 26.77, + "learning_rate": 3.2640882461636336e-05, + "loss": 0.5853, + "step": 1156000 + }, + { + "epoch": 26.81, + "learning_rate": 3.2173080491566305e-05, + "loss": 0.5842, + "step": 1158000 + }, + { + "epoch": 26.86, + "learning_rate": 3.170527852149628e-05, + "loss": 0.5847, + "step": 1160000 + }, + { + "epoch": 26.91, + "learning_rate": 3.123747655142625e-05, + "loss": 0.5786, + "step": 1162000 + }, + { + "epoch": 26.95, + "learning_rate": 3.076967458135622e-05, + "loss": 0.5818, + "step": 1164000 + }, + { + "epoch": 27.0, + "learning_rate": 3.030187261128619e-05, + "loss": 0.5722, + "step": 1166000 + }, + { + "epoch": 27.05, + "learning_rate": 2.983407064121616e-05, + "loss": 0.5726, + "step": 1168000 + }, + { + "epoch": 27.09, + "learning_rate": 2.936626867114613e-05, + "loss": 0.5745, + "step": 1170000 + }, + { + "epoch": 27.14, + "learning_rate": 2.8898466701076097e-05, + "loss": 0.5655, + "step": 1172000 + }, + { + "epoch": 27.19, + "learning_rate": 2.843066473100607e-05, + "loss": 0.5747, + "step": 1174000 + }, + { + "epoch": 27.23, + "learning_rate": 2.7962862760936038e-05, + "loss": 0.5734, + "step": 1176000 + }, + { + "epoch": 27.28, + "learning_rate": 2.7495060790866007e-05, + "loss": 0.5752, + "step": 1178000 + }, + { + "epoch": 27.32, + "learning_rate": 2.7027258820795976e-05, + "loss": 0.5784, + "step": 1180000 + }, + { + "epoch": 27.37, + "learning_rate": 2.6559456850725945e-05, + "loss": 0.5667, + "step": 1182000 + }, + { + "epoch": 27.42, + "learning_rate": 2.609165488065592e-05, + "loss": 0.5748, + "step": 1184000 + }, + { + "epoch": 27.46, + "learning_rate": 2.562385291058589e-05, + "loss": 0.5762, + "step": 1186000 + }, + { + "epoch": 27.51, + "learning_rate": 2.5156050940515858e-05, + "loss": 0.5783, + "step": 1188000 + }, + { + "epoch": 27.56, + "learning_rate": 2.4688248970445827e-05, + "loss": 0.5668, + "step": 1190000 + }, + { + "epoch": 27.6, + "learning_rate": 2.42204470003758e-05, + "loss": 0.5671, + "step": 1192000 + }, + { + "epoch": 27.65, + "learning_rate": 2.3752645030305768e-05, + "loss": 0.5688, + "step": 1194000 + }, + { + "epoch": 27.69, + "learning_rate": 2.328484306023574e-05, + "loss": 0.5643, + "step": 1196000 + }, + { + "epoch": 27.74, + "learning_rate": 2.281704109016571e-05, + "loss": 0.5688, + "step": 1198000 + }, + { + "epoch": 27.79, + "learning_rate": 2.2349239120095678e-05, + "loss": 0.5651, + "step": 1200000 + }, + { + "epoch": 27.83, + "learning_rate": 2.188143715002565e-05, + "loss": 0.5705, + "step": 1202000 + }, + { + "epoch": 27.88, + "learning_rate": 2.141363517995562e-05, + "loss": 0.5684, + "step": 1204000 + }, + { + "epoch": 27.93, + "learning_rate": 2.0945833209885588e-05, + "loss": 0.567, + "step": 1206000 + }, + { + "epoch": 27.97, + "learning_rate": 2.0478031239815557e-05, + "loss": 0.5711, + "step": 1208000 + }, + { + "epoch": 28.02, + "learning_rate": 2.0010229269745533e-05, + "loss": 0.5684, + "step": 1210000 + }, + { + "epoch": 28.06, + "learning_rate": 1.95424272996755e-05, + "loss": 0.5604, + "step": 1212000 + }, + { + "epoch": 28.11, + "learning_rate": 1.907462532960547e-05, + "loss": 0.5615, + "step": 1214000 + }, + { + "epoch": 28.16, + "learning_rate": 1.860682335953544e-05, + "loss": 0.5679, + "step": 1216000 + }, + { + "epoch": 28.2, + "learning_rate": 1.813902138946541e-05, + "loss": 0.5644, + "step": 1218000 + }, + { + "epoch": 28.25, + "learning_rate": 1.767121941939538e-05, + "loss": 0.5663, + "step": 1220000 + }, + { + "epoch": 28.3, + "learning_rate": 1.720341744932535e-05, + "loss": 0.5584, + "step": 1222000 + }, + { + "epoch": 28.34, + "learning_rate": 1.6735615479255318e-05, + "loss": 0.558, + "step": 1224000 + }, + { + "epoch": 28.39, + "learning_rate": 1.626781350918529e-05, + "loss": 0.5575, + "step": 1226000 + }, + { + "epoch": 28.44, + "learning_rate": 1.580001153911526e-05, + "loss": 0.5728, + "step": 1228000 + }, + { + "epoch": 28.48, + "learning_rate": 1.533220956904523e-05, + "loss": 0.5653, + "step": 1230000 + }, + { + "epoch": 28.53, + "learning_rate": 1.48644075989752e-05, + "loss": 0.5603, + "step": 1232000 + }, + { + "epoch": 28.57, + "learning_rate": 1.4396605628905172e-05, + "loss": 0.5613, + "step": 1234000 + }, + { + "epoch": 28.62, + "learning_rate": 1.3928803658835141e-05, + "loss": 0.5563, + "step": 1236000 + }, + { + "epoch": 28.67, + "learning_rate": 1.346100168876511e-05, + "loss": 0.5705, + "step": 1238000 + }, + { + "epoch": 28.71, + "learning_rate": 1.299319971869508e-05, + "loss": 0.5568, + "step": 1240000 + }, + { + "epoch": 28.76, + "learning_rate": 1.252539774862505e-05, + "loss": 0.5517, + "step": 1242000 + }, + { + "epoch": 28.81, + "learning_rate": 1.2057595778555022e-05, + "loss": 0.5647, + "step": 1244000 + }, + { + "epoch": 28.85, + "learning_rate": 1.158979380848499e-05, + "loss": 0.5551, + "step": 1246000 + }, + { + "epoch": 28.9, + "learning_rate": 1.1121991838414961e-05, + "loss": 0.5598, + "step": 1248000 + }, + { + "epoch": 28.94, + "learning_rate": 1.0654189868344932e-05, + "loss": 0.562, + "step": 1250000 + }, + { + "epoch": 28.99, + "learning_rate": 1.0186387898274902e-05, + "loss": 0.5563, + "step": 1252000 + }, + { + "epoch": 29.04, + "learning_rate": 9.718585928204871e-06, + "loss": 0.5606, + "step": 1254000 + }, + { + "epoch": 29.08, + "learning_rate": 9.250783958134842e-06, + "loss": 0.5509, + "step": 1256000 + }, + { + "epoch": 29.13, + "learning_rate": 8.782981988064812e-06, + "loss": 0.551, + "step": 1258000 + }, + { + "epoch": 29.18, + "learning_rate": 8.315180017994783e-06, + "loss": 0.5548, + "step": 1260000 + }, + { + "epoch": 29.22, + "learning_rate": 7.847378047924752e-06, + "loss": 0.5562, + "step": 1262000 + }, + { + "epoch": 29.27, + "learning_rate": 7.3795760778547214e-06, + "loss": 0.5563, + "step": 1264000 + }, + { + "epoch": 29.32, + "learning_rate": 6.911774107784692e-06, + "loss": 0.5551, + "step": 1266000 + }, + { + "epoch": 29.36, + "learning_rate": 6.443972137714662e-06, + "loss": 0.555, + "step": 1268000 + }, + { + "epoch": 29.41, + "learning_rate": 5.976170167644632e-06, + "loss": 0.554, + "step": 1270000 + }, + { + "epoch": 29.45, + "learning_rate": 5.508368197574602e-06, + "loss": 0.5522, + "step": 1272000 + }, + { + "epoch": 29.5, + "learning_rate": 5.0405662275045725e-06, + "loss": 0.5522, + "step": 1274000 + }, + { + "epoch": 29.55, + "learning_rate": 4.572764257434542e-06, + "loss": 0.5601, + "step": 1276000 + }, + { + "epoch": 29.59, + "learning_rate": 4.104962287364513e-06, + "loss": 0.5578, + "step": 1278000 + }, + { + "epoch": 29.64, + "learning_rate": 3.6371603172944825e-06, + "loss": 0.5602, + "step": 1280000 + }, + { + "epoch": 29.69, + "learning_rate": 3.1693583472244526e-06, + "loss": 0.5517, + "step": 1282000 + }, + { + "epoch": 29.73, + "learning_rate": 2.701556377154423e-06, + "loss": 0.5538, + "step": 1284000 + }, + { + "epoch": 29.78, + "learning_rate": 2.233754407084393e-06, + "loss": 0.549, + "step": 1286000 + }, + { + "epoch": 29.82, + "learning_rate": 1.765952437014363e-06, + "loss": 0.5595, + "step": 1288000 + }, + { + "epoch": 29.87, + "learning_rate": 1.2981504669443331e-06, + "loss": 0.5546, + "step": 1290000 + }, + { + "epoch": 29.92, + "learning_rate": 8.303484968743031e-07, + "loss": 0.5547, + "step": 1292000 + }, + { + "epoch": 29.96, + "learning_rate": 3.6254652680427316e-07, + "loss": 0.5494, + "step": 1294000 + }, + { + "epoch": 30.0, + "step": 1295550, + "total_flos": 2.6480449905256835e+21, + "train_loss": 0.7350748663054241, + "train_runtime": 658563.1153, + "train_samples_per_second": 31.476, + "train_steps_per_second": 1.967 + }, + { + "epoch": 30.01, + "learning_rate": 3.624385885736958e-07, + "loss": 0.5685, + "step": 1296000 + }, + { + "epoch": 30.06, + "learning_rate": 3.619588631044631e-07, + "loss": 0.5522, + "step": 1298000 + }, + { + "epoch": 30.1, + "learning_rate": 3.6147913763523037e-07, + "loss": 0.5498, + "step": 1300000 + }, + { + "epoch": 30.15, + "learning_rate": 3.609994121659976e-07, + "loss": 0.5494, + "step": 1302000 + }, + { + "epoch": 30.2, + "learning_rate": 3.6051968669676483e-07, + "loss": 0.5514, + "step": 1304000 + }, + { + "epoch": 30.24, + "learning_rate": 3.600399612275321e-07, + "loss": 0.5472, + "step": 1306000 + }, + { + "epoch": 30.29, + "learning_rate": 3.595602357582994e-07, + "loss": 0.5596, + "step": 1308000 + }, + { + "epoch": 30.33, + "learning_rate": 3.590805102890667e-07, + "loss": 0.5525, + "step": 1310000 + }, + { + "epoch": 30.38, + "learning_rate": 3.586007848198339e-07, + "loss": 0.5601, + "step": 1312000 + }, + { + "epoch": 30.43, + "learning_rate": 3.581210593506012e-07, + "loss": 0.5526, + "step": 1314000 + }, + { + "epoch": 30.47, + "learning_rate": 3.5764133388136847e-07, + "loss": 0.551, + "step": 1316000 + }, + { + "epoch": 30.52, + "learning_rate": 3.5716160841213575e-07, + "loss": 0.5513, + "step": 1318000 + }, + { + "epoch": 30.57, + "learning_rate": 3.56681882942903e-07, + "loss": 0.5456, + "step": 1320000 + }, + { + "epoch": 30.61, + "learning_rate": 3.5620215747367026e-07, + "loss": 0.5563, + "step": 1322000 + }, + { + "epoch": 30.66, + "learning_rate": 3.5572243200443755e-07, + "loss": 0.5567, + "step": 1324000 + }, + { + "epoch": 30.71, + "learning_rate": 3.5524270653520483e-07, + "loss": 0.5573, + "step": 1326000 + }, + { + "epoch": 30.75, + "learning_rate": 3.5476298106597206e-07, + "loss": 0.5575, + "step": 1328000 + }, + { + "epoch": 30.8, + "learning_rate": 3.542832555967393e-07, + "loss": 0.5599, + "step": 1330000 + }, + { + "epoch": 30.84, + "learning_rate": 3.5380353012750657e-07, + "loss": 0.554, + "step": 1332000 + }, + { + "epoch": 30.89, + "learning_rate": 3.5332380465827385e-07, + "loss": 0.5507, + "step": 1334000 + }, + { + "epoch": 30.94, + "learning_rate": 3.5284407918904113e-07, + "loss": 0.5522, + "step": 1336000 + }, + { + "epoch": 30.98, + "learning_rate": 3.5236435371980836e-07, + "loss": 0.5439, + "step": 1338000 + }, + { + "epoch": 31.03, + "learning_rate": 3.5188462825057564e-07, + "loss": 0.554, + "step": 1340000 + }, + { + "epoch": 31.08, + "learning_rate": 3.514049027813429e-07, + "loss": 0.5477, + "step": 1342000 + }, + { + "epoch": 31.12, + "learning_rate": 3.509251773121102e-07, + "loss": 0.5565, + "step": 1344000 + }, + { + "epoch": 31.17, + "learning_rate": 3.5044545184287744e-07, + "loss": 0.5482, + "step": 1346000 + }, + { + "epoch": 31.21, + "learning_rate": 3.499657263736447e-07, + "loss": 0.5506, + "step": 1348000 + }, + { + "epoch": 31.26, + "learning_rate": 3.49486000904412e-07, + "loss": 0.5471, + "step": 1350000 + }, + { + "epoch": 31.31, + "learning_rate": 3.490062754351793e-07, + "loss": 0.5529, + "step": 1352000 + }, + { + "epoch": 31.35, + "learning_rate": 3.485265499659465e-07, + "loss": 0.5547, + "step": 1354000 + }, + { + "epoch": 31.4, + "learning_rate": 3.480468244967138e-07, + "loss": 0.5522, + "step": 1356000 + }, + { + "epoch": 31.45, + "learning_rate": 3.475670990274811e-07, + "loss": 0.5473, + "step": 1358000 + }, + { + "epoch": 31.49, + "learning_rate": 3.470873735582483e-07, + "loss": 0.5599, + "step": 1360000 + }, + { + "epoch": 31.54, + "learning_rate": 3.466076480890156e-07, + "loss": 0.5573, + "step": 1362000 + }, + { + "epoch": 31.59, + "learning_rate": 3.461279226197828e-07, + "loss": 0.553, + "step": 1364000 + }, + { + "epoch": 31.63, + "learning_rate": 3.456481971505501e-07, + "loss": 0.5506, + "step": 1366000 + }, + { + "epoch": 31.68, + "learning_rate": 3.451684716813174e-07, + "loss": 0.5537, + "step": 1368000 + }, + { + "epoch": 31.72, + "learning_rate": 3.4468874621208466e-07, + "loss": 0.547, + "step": 1370000 + }, + { + "epoch": 31.77, + "learning_rate": 3.442090207428519e-07, + "loss": 0.5579, + "step": 1372000 + }, + { + "epoch": 31.82, + "learning_rate": 3.437292952736192e-07, + "loss": 0.5446, + "step": 1374000 + }, + { + "epoch": 31.86, + "learning_rate": 3.4324956980438646e-07, + "loss": 0.5491, + "step": 1376000 + }, + { + "epoch": 31.91, + "learning_rate": 3.4276984433515374e-07, + "loss": 0.5489, + "step": 1378000 + }, + { + "epoch": 31.96, + "learning_rate": 3.4229011886592097e-07, + "loss": 0.553, + "step": 1380000 + }, + { + "epoch": 32.0, + "learning_rate": 3.4181039339668825e-07, + "loss": 0.549, + "step": 1382000 + }, + { + "epoch": 32.05, + "learning_rate": 3.4133066792745553e-07, + "loss": 0.5498, + "step": 1384000 + }, + { + "epoch": 32.09, + "learning_rate": 3.408509424582228e-07, + "loss": 0.5619, + "step": 1386000 + }, + { + "epoch": 32.14, + "learning_rate": 3.4037121698899004e-07, + "loss": 0.5461, + "step": 1388000 + }, + { + "epoch": 32.19, + "learning_rate": 3.398914915197573e-07, + "loss": 0.5501, + "step": 1390000 + }, + { + "epoch": 32.23, + "learning_rate": 3.3941176605052456e-07, + "loss": 0.5583, + "step": 1392000 + }, + { + "epoch": 32.28, + "learning_rate": 3.3893204058129184e-07, + "loss": 0.5597, + "step": 1394000 + }, + { + "epoch": 32.33, + "learning_rate": 3.384523151120591e-07, + "loss": 0.5482, + "step": 1396000 + }, + { + "epoch": 32.37, + "learning_rate": 3.3797258964282635e-07, + "loss": 0.5573, + "step": 1398000 + }, + { + "epoch": 32.42, + "learning_rate": 3.3749286417359363e-07, + "loss": 0.546, + "step": 1400000 + }, + { + "epoch": 32.46, + "learning_rate": 3.370131387043609e-07, + "loss": 0.5481, + "step": 1402000 + }, + { + "epoch": 32.51, + "learning_rate": 3.365334132351282e-07, + "loss": 0.5572, + "step": 1404000 + }, + { + "epoch": 32.56, + "learning_rate": 3.360536877658954e-07, + "loss": 0.5479, + "step": 1406000 + }, + { + "epoch": 32.6, + "learning_rate": 3.355739622966627e-07, + "loss": 0.5456, + "step": 1408000 + }, + { + "epoch": 32.65, + "learning_rate": 3.3509423682743e-07, + "loss": 0.5498, + "step": 1410000 + }, + { + "epoch": 32.7, + "learning_rate": 3.3461451135819727e-07, + "loss": 0.5519, + "step": 1412000 + }, + { + "epoch": 32.74, + "learning_rate": 3.341347858889645e-07, + "loss": 0.5533, + "step": 1414000 + }, + { + "epoch": 32.79, + "learning_rate": 3.336550604197318e-07, + "loss": 0.5455, + "step": 1416000 + }, + { + "epoch": 32.84, + "learning_rate": 3.33175334950499e-07, + "loss": 0.5535, + "step": 1418000 + }, + { + "epoch": 32.88, + "learning_rate": 3.326956094812663e-07, + "loss": 0.5512, + "step": 1420000 + }, + { + "epoch": 32.93, + "learning_rate": 3.322158840120336e-07, + "loss": 0.5496, + "step": 1422000 + }, + { + "epoch": 32.97, + "learning_rate": 3.317361585428008e-07, + "loss": 0.5505, + "step": 1424000 + }, + { + "epoch": 33.02, + "learning_rate": 3.312564330735681e-07, + "loss": 0.5561, + "step": 1426000 + }, + { + "epoch": 33.07, + "learning_rate": 3.3077670760433537e-07, + "loss": 0.5512, + "step": 1428000 + }, + { + "epoch": 33.11, + "learning_rate": 3.3029698213510265e-07, + "loss": 0.5533, + "step": 1430000 + }, + { + "epoch": 33.16, + "learning_rate": 3.298172566658699e-07, + "loss": 0.5554, + "step": 1432000 + }, + { + "epoch": 33.21, + "learning_rate": 3.2933753119663716e-07, + "loss": 0.5536, + "step": 1434000 + }, + { + "epoch": 33.25, + "learning_rate": 3.2885780572740445e-07, + "loss": 0.5553, + "step": 1436000 + }, + { + "epoch": 33.3, + "learning_rate": 3.2837808025817173e-07, + "loss": 0.5451, + "step": 1438000 + }, + { + "epoch": 33.34, + "learning_rate": 3.2789835478893896e-07, + "loss": 0.5468, + "step": 1440000 + }, + { + "epoch": 33.39, + "learning_rate": 3.2741862931970624e-07, + "loss": 0.5532, + "step": 1442000 + }, + { + "epoch": 33.44, + "learning_rate": 3.269389038504735e-07, + "loss": 0.5451, + "step": 1444000 + }, + { + "epoch": 33.48, + "learning_rate": 3.2645917838124075e-07, + "loss": 0.5482, + "step": 1446000 + }, + { + "epoch": 33.53, + "learning_rate": 3.2597945291200803e-07, + "loss": 0.553, + "step": 1448000 + }, + { + "epoch": 33.58, + "learning_rate": 3.2549972744277526e-07, + "loss": 0.5483, + "step": 1450000 + }, + { + "epoch": 33.62, + "learning_rate": 3.2502000197354254e-07, + "loss": 0.552, + "step": 1452000 + }, + { + "epoch": 33.67, + "learning_rate": 3.245402765043098e-07, + "loss": 0.5494, + "step": 1454000 + }, + { + "epoch": 33.72, + "learning_rate": 3.240605510350771e-07, + "loss": 0.5583, + "step": 1456000 + }, + { + "epoch": 33.76, + "learning_rate": 3.2358082556584434e-07, + "loss": 0.5506, + "step": 1458000 + }, + { + "epoch": 33.81, + "learning_rate": 3.231011000966116e-07, + "loss": 0.5524, + "step": 1460000 + }, + { + "epoch": 33.85, + "learning_rate": 3.226213746273789e-07, + "loss": 0.5529, + "step": 1462000 + }, + { + "epoch": 33.9, + "learning_rate": 3.221416491581462e-07, + "loss": 0.5563, + "step": 1464000 + }, + { + "epoch": 33.95, + "learning_rate": 3.216619236889134e-07, + "loss": 0.5493, + "step": 1466000 + }, + { + "epoch": 33.99, + "learning_rate": 3.211821982196807e-07, + "loss": 0.5478, + "step": 1468000 + }, + { + "epoch": 34.04, + "learning_rate": 3.20702472750448e-07, + "loss": 0.546, + "step": 1470000 + }, + { + "epoch": 34.09, + "learning_rate": 3.2022274728121526e-07, + "loss": 0.5556, + "step": 1472000 + }, + { + "epoch": 34.13, + "learning_rate": 3.197430218119825e-07, + "loss": 0.5511, + "step": 1474000 + }, + { + "epoch": 34.18, + "learning_rate": 3.192632963427497e-07, + "loss": 0.5467, + "step": 1476000 + }, + { + "epoch": 34.22, + "learning_rate": 3.18783570873517e-07, + "loss": 0.5534, + "step": 1478000 + }, + { + "epoch": 34.27, + "learning_rate": 3.183038454042843e-07, + "loss": 0.5558, + "step": 1480000 + }, + { + "epoch": 34.32, + "learning_rate": 3.1782411993505156e-07, + "loss": 0.5532, + "step": 1482000 + }, + { + "epoch": 34.36, + "learning_rate": 3.173443944658188e-07, + "loss": 0.5468, + "step": 1484000 + }, + { + "epoch": 34.41, + "learning_rate": 3.168646689965861e-07, + "loss": 0.5532, + "step": 1486000 + }, + { + "epoch": 34.46, + "learning_rate": 3.1638494352735336e-07, + "loss": 0.549, + "step": 1488000 + }, + { + "epoch": 34.5, + "learning_rate": 3.1590521805812064e-07, + "loss": 0.5499, + "step": 1490000 + }, + { + "epoch": 34.55, + "learning_rate": 3.1542549258888787e-07, + "loss": 0.5535, + "step": 1492000 + }, + { + "epoch": 34.6, + "learning_rate": 3.1494576711965515e-07, + "loss": 0.5503, + "step": 1494000 + }, + { + "epoch": 34.64, + "learning_rate": 3.1446604165042243e-07, + "loss": 0.5468, + "step": 1496000 + }, + { + "epoch": 34.69, + "learning_rate": 3.139863161811897e-07, + "loss": 0.5515, + "step": 1498000 + }, + { + "epoch": 34.73, + "learning_rate": 3.1350659071195695e-07, + "loss": 0.5507, + "step": 1500000 + }, + { + "epoch": 34.78, + "learning_rate": 3.1302686524272423e-07, + "loss": 0.5539, + "step": 1502000 + }, + { + "epoch": 34.83, + "learning_rate": 3.1254713977349146e-07, + "loss": 0.5451, + "step": 1504000 + }, + { + "epoch": 34.87, + "learning_rate": 3.1206741430425874e-07, + "loss": 0.5502, + "step": 1506000 + }, + { + "epoch": 34.92, + "learning_rate": 3.11587688835026e-07, + "loss": 0.5534, + "step": 1508000 + }, + { + "epoch": 34.97, + "learning_rate": 3.1110796336579325e-07, + "loss": 0.5511, + "step": 1510000 + }, + { + "epoch": 35.0, + "step": 1511475, + "total_flos": 3.089388519897025e+21, + "train_loss": 0.07881541758525794, + "train_runtime": 103159.2271, + "train_samples_per_second": 234.43, + "train_steps_per_second": 14.652 + }, + { + "epoch": 35.01, + "learning_rate": 3.6243633986055876e-07, + "loss": 0.5431, + "step": 1512000 + }, + { + "epoch": 35.06, + "learning_rate": 3.620165800749801e-07, + "loss": 0.5573, + "step": 1514000 + }, + { + "epoch": 35.1, + "learning_rate": 3.6159682028940147e-07, + "loss": 0.5447, + "step": 1516000 + }, + { + "epoch": 35.15, + "learning_rate": 3.611770605038228e-07, + "loss": 0.5497, + "step": 1518000 + }, + { + "epoch": 35.2, + "learning_rate": 3.607573007182442e-07, + "loss": 0.5369, + "step": 1520000 + }, + { + "epoch": 35.24, + "learning_rate": 3.603375409326656e-07, + "loss": 0.5425, + "step": 1522000 + }, + { + "epoch": 35.29, + "learning_rate": 3.599177811470869e-07, + "loss": 0.5472, + "step": 1524000 + }, + { + "epoch": 35.34, + "learning_rate": 3.594980213615083e-07, + "loss": 0.5528, + "step": 1526000 + }, + { + "epoch": 35.38, + "learning_rate": 3.590782615759296e-07, + "loss": 0.5469, + "step": 1528000 + }, + { + "epoch": 35.43, + "learning_rate": 3.58658501790351e-07, + "loss": 0.548, + "step": 1530000 + }, + { + "epoch": 35.48, + "learning_rate": 3.5823874200477234e-07, + "loss": 0.5515, + "step": 1532000 + }, + { + "epoch": 35.52, + "learning_rate": 3.5781898221919373e-07, + "loss": 0.5562, + "step": 1534000 + }, + { + "epoch": 35.57, + "learning_rate": 3.5739922243361506e-07, + "loss": 0.5544, + "step": 1536000 + }, + { + "epoch": 35.61, + "learning_rate": 3.569794626480364e-07, + "loss": 0.5488, + "step": 1538000 + }, + { + "epoch": 35.66, + "learning_rate": 3.565597028624578e-07, + "loss": 0.5498, + "step": 1540000 + }, + { + "epoch": 35.71, + "learning_rate": 3.561399430768791e-07, + "loss": 0.5545, + "step": 1542000 + }, + { + "epoch": 35.75, + "learning_rate": 3.557201832913005e-07, + "loss": 0.5547, + "step": 1544000 + }, + { + "epoch": 35.8, + "learning_rate": 3.553004235057218e-07, + "loss": 0.5455, + "step": 1546000 + }, + { + "epoch": 35.85, + "learning_rate": 3.548806637201432e-07, + "loss": 0.558, + "step": 1548000 + }, + { + "epoch": 35.89, + "learning_rate": 3.544609039345646e-07, + "loss": 0.551, + "step": 1550000 + }, + { + "epoch": 35.94, + "learning_rate": 3.5404114414898593e-07, + "loss": 0.5445, + "step": 1552000 + }, + { + "epoch": 35.98, + "learning_rate": 3.536213843634073e-07, + "loss": 0.5491, + "step": 1554000 + }, + { + "epoch": 36.03, + "learning_rate": 3.5320162457782865e-07, + "loss": 0.5408, + "step": 1556000 + }, + { + "epoch": 36.08, + "learning_rate": 3.5278186479225003e-07, + "loss": 0.5503, + "step": 1558000 + }, + { + "epoch": 36.12, + "learning_rate": 3.5236210500667136e-07, + "loss": 0.5569, + "step": 1560000 + }, + { + "epoch": 36.17, + "learning_rate": 3.519423452210927e-07, + "loss": 0.5472, + "step": 1562000 + }, + { + "epoch": 36.22, + "learning_rate": 3.5152258543551403e-07, + "loss": 0.5464, + "step": 1564000 + }, + { + "epoch": 36.26, + "learning_rate": 3.511028256499354e-07, + "loss": 0.5448, + "step": 1566000 + }, + { + "epoch": 36.31, + "learning_rate": 3.506830658643568e-07, + "loss": 0.5461, + "step": 1568000 + }, + { + "epoch": 36.36, + "learning_rate": 3.5026330607877813e-07, + "loss": 0.5453, + "step": 1570000 + }, + { + "epoch": 36.4, + "learning_rate": 3.498435462931995e-07, + "loss": 0.5527, + "step": 1572000 + }, + { + "epoch": 36.45, + "learning_rate": 3.4942378650762085e-07, + "loss": 0.549, + "step": 1574000 + }, + { + "epoch": 36.49, + "learning_rate": 3.4900402672204223e-07, + "loss": 0.5472, + "step": 1576000 + }, + { + "epoch": 36.54, + "learning_rate": 3.4858426693646356e-07, + "loss": 0.5522, + "step": 1578000 + }, + { + "epoch": 36.59, + "learning_rate": 3.4816450715088495e-07, + "loss": 0.5518, + "step": 1580000 + }, + { + "epoch": 36.63, + "learning_rate": 3.4774474736530633e-07, + "loss": 0.5552, + "step": 1582000 + }, + { + "epoch": 36.68, + "learning_rate": 3.4732498757972767e-07, + "loss": 0.55, + "step": 1584000 + }, + { + "epoch": 36.73, + "learning_rate": 3.46905227794149e-07, + "loss": 0.5487, + "step": 1586000 + }, + { + "epoch": 36.77, + "learning_rate": 3.464854680085704e-07, + "loss": 0.5579, + "step": 1588000 + }, + { + "epoch": 36.82, + "learning_rate": 3.460657082229917e-07, + "loss": 0.543, + "step": 1590000 + }, + { + "epoch": 36.86, + "learning_rate": 3.4564594843741305e-07, + "loss": 0.5513, + "step": 1592000 + }, + { + "epoch": 36.91, + "learning_rate": 3.4522618865183443e-07, + "loss": 0.5476, + "step": 1594000 + }, + { + "epoch": 36.96, + "learning_rate": 3.448064288662558e-07, + "loss": 0.5523, + "step": 1596000 + }, + { + "epoch": 37.0, + "learning_rate": 3.4438666908067715e-07, + "loss": 0.5498, + "step": 1598000 + }, + { + "epoch": 37.05, + "learning_rate": 3.4396690929509854e-07, + "loss": 0.5556, + "step": 1600000 + }, + { + "epoch": 37.1, + "learning_rate": 3.4354714950951987e-07, + "loss": 0.5466, + "step": 1602000 + }, + { + "epoch": 37.14, + "learning_rate": 3.4312738972394125e-07, + "loss": 0.5505, + "step": 1604000 + }, + { + "epoch": 37.19, + "learning_rate": 3.427076299383626e-07, + "loss": 0.5453, + "step": 1606000 + }, + { + "epoch": 37.24, + "learning_rate": 3.4228787015278397e-07, + "loss": 0.5552, + "step": 1608000 + }, + { + "epoch": 37.28, + "learning_rate": 3.418681103672053e-07, + "loss": 0.5526, + "step": 1610000 + }, + { + "epoch": 37.33, + "learning_rate": 3.414483505816267e-07, + "loss": 0.5522, + "step": 1612000 + }, + { + "epoch": 37.37, + "learning_rate": 3.41028590796048e-07, + "loss": 0.5523, + "step": 1614000 + }, + { + "epoch": 37.42, + "learning_rate": 3.4060883101046935e-07, + "loss": 0.5473, + "step": 1616000 + }, + { + "epoch": 37.47, + "learning_rate": 3.4018907122489074e-07, + "loss": 0.5514, + "step": 1618000 + }, + { + "epoch": 37.51, + "learning_rate": 3.3976931143931207e-07, + "loss": 0.5513, + "step": 1620000 + }, + { + "epoch": 37.56, + "learning_rate": 3.3934955165373345e-07, + "loss": 0.5409, + "step": 1622000 + }, + { + "epoch": 37.61, + "learning_rate": 3.389297918681548e-07, + "loss": 0.5521, + "step": 1624000 + }, + { + "epoch": 37.65, + "learning_rate": 3.3851003208257617e-07, + "loss": 0.5508, + "step": 1626000 + }, + { + "epoch": 37.7, + "learning_rate": 3.3809027229699756e-07, + "loss": 0.5425, + "step": 1628000 + }, + { + "epoch": 37.74, + "learning_rate": 3.376705125114189e-07, + "loss": 0.5453, + "step": 1630000 + }, + { + "epoch": 37.79, + "learning_rate": 3.372507527258403e-07, + "loss": 0.5409, + "step": 1632000 + }, + { + "epoch": 37.84, + "learning_rate": 3.368309929402616e-07, + "loss": 0.5393, + "step": 1634000 + }, + { + "epoch": 37.88, + "learning_rate": 3.36411233154683e-07, + "loss": 0.5566, + "step": 1636000 + }, + { + "epoch": 37.93, + "learning_rate": 3.359914733691043e-07, + "loss": 0.5505, + "step": 1638000 + }, + { + "epoch": 37.98, + "learning_rate": 3.3557171358352566e-07, + "loss": 0.552, + "step": 1640000 + }, + { + "epoch": 38.02, + "learning_rate": 3.35151953797947e-07, + "loss": 0.5508, + "step": 1642000 + }, + { + "epoch": 38.07, + "learning_rate": 3.3473219401236837e-07, + "loss": 0.5447, + "step": 1644000 + }, + { + "epoch": 38.12, + "learning_rate": 3.3431243422678976e-07, + "loss": 0.5525, + "step": 1646000 + }, + { + "epoch": 38.16, + "learning_rate": 3.338926744412111e-07, + "loss": 0.5497, + "step": 1648000 + }, + { + "epoch": 38.21, + "learning_rate": 3.334729146556325e-07, + "loss": 0.553, + "step": 1650000 + }, + { + "epoch": 38.25, + "learning_rate": 3.330531548700538e-07, + "loss": 0.5471, + "step": 1652000 + }, + { + "epoch": 38.3, + "learning_rate": 3.326333950844752e-07, + "loss": 0.5522, + "step": 1654000 + }, + { + "epoch": 38.35, + "learning_rate": 3.322136352988965e-07, + "loss": 0.5514, + "step": 1656000 + }, + { + "epoch": 38.39, + "learning_rate": 3.317938755133179e-07, + "loss": 0.5477, + "step": 1658000 + }, + { + "epoch": 38.44, + "learning_rate": 3.313741157277393e-07, + "loss": 0.5403, + "step": 1660000 + }, + { + "epoch": 38.49, + "learning_rate": 3.3095435594216063e-07, + "loss": 0.5448, + "step": 1662000 + }, + { + "epoch": 38.53, + "learning_rate": 3.3053459615658196e-07, + "loss": 0.5504, + "step": 1664000 + }, + { + "epoch": 38.58, + "learning_rate": 3.301148363710033e-07, + "loss": 0.5531, + "step": 1666000 + }, + { + "epoch": 38.62, + "learning_rate": 3.296950765854247e-07, + "loss": 0.5531, + "step": 1668000 + }, + { + "epoch": 38.67, + "learning_rate": 3.29275316799846e-07, + "loss": 0.5436, + "step": 1670000 + }, + { + "epoch": 38.72, + "learning_rate": 3.288555570142674e-07, + "loss": 0.5443, + "step": 1672000 + }, + { + "epoch": 38.76, + "learning_rate": 3.284357972286888e-07, + "loss": 0.5496, + "step": 1674000 + }, + { + "epoch": 38.81, + "learning_rate": 3.280160374431101e-07, + "loss": 0.5475, + "step": 1676000 + }, + { + "epoch": 38.86, + "learning_rate": 3.275962776575315e-07, + "loss": 0.5528, + "step": 1678000 + }, + { + "epoch": 38.9, + "learning_rate": 3.2717651787195283e-07, + "loss": 0.5414, + "step": 1680000 + }, + { + "epoch": 38.95, + "learning_rate": 3.267567580863742e-07, + "loss": 0.549, + "step": 1682000 + }, + { + "epoch": 38.99, + "learning_rate": 3.2633699830079555e-07, + "loss": 0.5523, + "step": 1684000 + }, + { + "epoch": 39.04, + "learning_rate": 3.2591723851521693e-07, + "loss": 0.5477, + "step": 1686000 + }, + { + "epoch": 39.09, + "learning_rate": 3.2549747872963826e-07, + "loss": 0.5512, + "step": 1688000 + }, + { + "epoch": 39.13, + "learning_rate": 3.250777189440596e-07, + "loss": 0.5459, + "step": 1690000 + }, + { + "epoch": 39.18, + "learning_rate": 3.24657959158481e-07, + "loss": 0.5448, + "step": 1692000 + }, + { + "epoch": 39.23, + "learning_rate": 3.242381993729023e-07, + "loss": 0.5425, + "step": 1694000 + }, + { + "epoch": 39.27, + "learning_rate": 3.238184395873237e-07, + "loss": 0.549, + "step": 1696000 + }, + { + "epoch": 39.32, + "learning_rate": 3.2339867980174503e-07, + "loss": 0.554, + "step": 1698000 + }, + { + "epoch": 39.37, + "learning_rate": 3.229789200161664e-07, + "loss": 0.5514, + "step": 1700000 + }, + { + "epoch": 39.41, + "learning_rate": 3.2255916023058775e-07, + "loss": 0.5526, + "step": 1702000 + }, + { + "epoch": 39.46, + "learning_rate": 3.2213940044500913e-07, + "loss": 0.5433, + "step": 1704000 + }, + { + "epoch": 39.5, + "learning_rate": 3.217196406594305e-07, + "loss": 0.5414, + "step": 1706000 + }, + { + "epoch": 39.55, + "learning_rate": 3.2129988087385185e-07, + "loss": 0.5546, + "step": 1708000 + }, + { + "epoch": 39.6, + "learning_rate": 3.2088012108827324e-07, + "loss": 0.5445, + "step": 1710000 + }, + { + "epoch": 39.64, + "learning_rate": 3.2046036130269457e-07, + "loss": 0.5568, + "step": 1712000 + }, + { + "epoch": 39.69, + "learning_rate": 3.200406015171159e-07, + "loss": 0.5403, + "step": 1714000 + }, + { + "epoch": 39.74, + "learning_rate": 3.196208417315373e-07, + "loss": 0.5463, + "step": 1716000 + }, + { + "epoch": 39.78, + "learning_rate": 3.192010819459586e-07, + "loss": 0.5515, + "step": 1718000 + }, + { + "epoch": 39.83, + "learning_rate": 3.1878132216038e-07, + "loss": 0.5464, + "step": 1720000 + }, + { + "epoch": 39.87, + "learning_rate": 3.1836156237480133e-07, + "loss": 0.546, + "step": 1722000 + }, + { + "epoch": 39.92, + "learning_rate": 3.179418025892227e-07, + "loss": 0.551, + "step": 1724000 + }, + { + "epoch": 39.97, + "learning_rate": 3.1752204280364405e-07, + "loss": 0.5533, + "step": 1726000 + } + ], + "logging_steps": 2000, + "max_steps": 1727400, + "num_train_epochs": 40, + "save_steps": 500, + "total_flos": 3.530727795360958e+21, + "trial_name": null, + "trial_params": null +}